summaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
authorTimothy Pearson <tpearson@raptorengineering.com>2017-08-23 14:45:25 -0500
committerTimothy Pearson <tpearson@raptorengineering.com>2017-08-23 14:45:25 -0500
commitfcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch)
tree22962a4387943edc841c72a4e636a068c66d58fd /Documentation
downloadast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip
ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz
Initial import of modified Linux 2.6.28 tree
Original upstream URL: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX352
-rw-r--r--Documentation/ABI/README77
-rw-r--r--Documentation/ABI/obsolete/dv13949
-rw-r--r--Documentation/ABI/obsolete/o2cb11
-rw-r--r--Documentation/ABI/removed/devfs12
-rw-r--r--Documentation/ABI/removed/raw1394_legacy_isochronous16
-rw-r--r--Documentation/ABI/stable/o2cb10
-rw-r--r--Documentation/ABI/stable/syscalls10
-rw-r--r--Documentation/ABI/stable/sysfs-class-ubi212
-rw-r--r--Documentation/ABI/stable/sysfs-driver-usb-usbtmc62
-rw-r--r--Documentation/ABI/stable/sysfs-module30
-rw-r--r--Documentation/ABI/testing/debugfs-pktcdvd19
-rw-r--r--Documentation/ABI/testing/procfs-diskstats22
-rw-r--r--Documentation/ABI/testing/sysfs-block62
-rw-r--r--Documentation/ABI/testing/sysfs-bus-css35
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci11
-rw-r--r--Documentation/ABI/testing/sysfs-bus-umc28
-rw-r--r--Documentation/ABI/testing/sysfs-bus-usb146
-rw-r--r--Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg43
-rw-r--r--Documentation/ABI/testing/sysfs-c2port88
-rw-r--r--Documentation/ABI/testing/sysfs-class16
-rw-r--r--Documentation/ABI/testing/sysfs-class-bdi50
-rw-r--r--Documentation/ABI/testing/sysfs-class-pktcdvd72
-rw-r--r--Documentation/ABI/testing/sysfs-class-regulator328
-rw-r--r--Documentation/ABI/testing/sysfs-class-usb_host25
-rw-r--r--Documentation/ABI/testing/sysfs-class-uwb_rc144
-rw-r--r--Documentation/ABI/testing/sysfs-dev20
-rw-r--r--Documentation/ABI/testing/sysfs-devices25
-rw-r--r--Documentation/ABI/testing/sysfs-devices-memory24
-rw-r--r--Documentation/ABI/testing/sysfs-firmware-acpi146
-rw-r--r--Documentation/ABI/testing/sysfs-firmware-memmap71
-rw-r--r--Documentation/ABI/testing/sysfs-firmware-sgi_uv27
-rw-r--r--Documentation/ABI/testing/sysfs-gpio26
-rw-r--r--Documentation/ABI/testing/sysfs-ibft23
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm6
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-hugepages15
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-uids14
-rw-r--r--Documentation/ABI/testing/sysfs-ocfs289
-rw-r--r--Documentation/ABI/testing/sysfs-power103
-rw-r--r--Documentation/ABI/testing/sysfs-profiling13
-rw-r--r--Documentation/ABI/testing/sysfs-wusb_cbaf100
-rw-r--r--Documentation/BUG-HUNTING246
-rw-r--r--Documentation/Changes396
-rw-r--r--Documentation/CodingStyle816
-rw-r--r--Documentation/DMA-API.txt612
-rw-r--r--Documentation/DMA-ISA-LPC.txt151
-rw-r--r--Documentation/DMA-attributes.txt33
-rw-r--r--Documentation/DMA-mapping.txt766
-rw-r--r--Documentation/DocBook/.gitignore6
-rw-r--r--Documentation/DocBook/Makefile240
-rw-r--r--Documentation/DocBook/debugobjects.tmpl391
-rw-r--r--Documentation/DocBook/deviceiobook.tmpl323
-rw-r--r--Documentation/DocBook/filesystems.tmpl421
-rw-r--r--Documentation/DocBook/gadget.tmpl793
-rw-r--r--Documentation/DocBook/genericirq.tmpl474
-rw-r--r--Documentation/DocBook/kernel-api.tmpl707
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl1327
-rw-r--r--Documentation/DocBook/kernel-locking.tmpl2143
-rw-r--r--Documentation/DocBook/kgdb.tmpl459
-rw-r--r--Documentation/DocBook/libata.tmpl1632
-rw-r--r--Documentation/DocBook/librs.tmpl289
-rw-r--r--Documentation/DocBook/lsm.tmpl265
-rw-r--r--Documentation/DocBook/mac80211.tmpl331
-rw-r--r--Documentation/DocBook/mcabook.tmpl107
-rw-r--r--Documentation/DocBook/mtdnand.tmpl1318
-rw-r--r--Documentation/DocBook/networking.tmpl106
-rw-r--r--Documentation/DocBook/procfs-guide.tmpl626
-rw-r--r--Documentation/DocBook/procfs_example.c210
-rw-r--r--Documentation/DocBook/rapidio.tmpl159
-rw-r--r--Documentation/DocBook/s390-drivers.tmpl161
-rw-r--r--Documentation/DocBook/scsi.tmpl409
-rw-r--r--Documentation/DocBook/sh.tmpl105
-rw-r--r--Documentation/DocBook/stylesheet.xsl8
-rw-r--r--Documentation/DocBook/uio-howto.tmpl620
-rw-r--r--Documentation/DocBook/usb.tmpl980
-rw-r--r--Documentation/DocBook/wanbook.tmpl99
-rw-r--r--Documentation/DocBook/writing_usb_driver.tmpl412
-rw-r--r--Documentation/DocBook/z8530book.tmpl371
-rw-r--r--Documentation/HOWTO651
-rw-r--r--Documentation/IO-mapping.txt208
-rw-r--r--Documentation/IPMI.txt665
-rw-r--r--Documentation/IRQ-affinity.txt56
-rw-r--r--Documentation/IRQ.txt22
-rw-r--r--Documentation/Intel-IOMMU.txt115
-rw-r--r--Documentation/Makefile3
-rw-r--r--Documentation/ManagementStyle276
-rw-r--r--Documentation/PCI/00-INDEX14
-rw-r--r--Documentation/PCI/MSI-HOWTO.txt509
-rw-r--r--Documentation/PCI/PCIEBUS-HOWTO.txt217
-rw-r--r--Documentation/PCI/pci-error-recovery.txt396
-rw-r--r--Documentation/PCI/pci.txt650
-rw-r--r--Documentation/PCI/pcieaer-howto.txt248
-rw-r--r--Documentation/RCU/00-INDEX22
-rw-r--r--Documentation/RCU/NMI-RCU.txt115
-rw-r--r--Documentation/RCU/RTFP.txt745
-rw-r--r--Documentation/RCU/UP.txt119
-rw-r--r--Documentation/RCU/arrayRCU.txt141
-rw-r--r--Documentation/RCU/checklist.txt300
-rw-r--r--Documentation/RCU/listRCU.txt315
-rw-r--r--Documentation/RCU/rcu.txt138
-rw-r--r--Documentation/RCU/rcuref.txt66
-rw-r--r--Documentation/RCU/torture.txt180
-rw-r--r--Documentation/RCU/whatisRCU.txt936
-rw-r--r--Documentation/SAK.txt88
-rw-r--r--Documentation/SELinux.txt27
-rw-r--r--Documentation/SM501.txt71
-rw-r--r--Documentation/SecurityBugs38
-rw-r--r--Documentation/Smack.txt493
-rw-r--r--Documentation/SubmitChecklist90
-rw-r--r--Documentation/SubmittingDrivers162
-rw-r--r--Documentation/SubmittingPatches680
-rw-r--r--Documentation/VGA-softcursor.txt39
-rw-r--r--Documentation/accounting/.gitignore1
-rw-r--r--Documentation/accounting/Makefile10
-rw-r--r--Documentation/accounting/cgroupstats.txt27
-rw-r--r--Documentation/accounting/delay-accounting.txt117
-rw-r--r--Documentation/accounting/getdelays.c501
-rw-r--r--Documentation/accounting/taskstats-struct.txt180
-rw-r--r--Documentation/accounting/taskstats.txt181
-rw-r--r--Documentation/acpi/debug.txt148
-rw-r--r--Documentation/acpi/dsdt-override.txt7
-rw-r--r--Documentation/acpi/method-tracing.txt26
-rw-r--r--Documentation/aoe/aoe.txt123
-rw-r--r--Documentation/aoe/autoload.sh17
-rw-r--r--Documentation/aoe/mkdevs.sh41
-rw-r--r--Documentation/aoe/mkshelf.sh28
-rw-r--r--Documentation/aoe/status.sh27
-rw-r--r--Documentation/aoe/todo.txt14
-rw-r--r--Documentation/aoe/udev-install.sh33
-rw-r--r--Documentation/aoe/udev.txt26
-rw-r--r--Documentation/applying-patches.txt454
-rw-r--r--Documentation/arm/00-INDEX32
-rw-r--r--Documentation/arm/Booting141
-rw-r--r--Documentation/arm/IXP200069
-rw-r--r--Documentation/arm/IXP4xx174
-rw-r--r--Documentation/arm/Interrupts167
-rw-r--r--Documentation/arm/Netwinder78
-rw-r--r--Documentation/arm/Porting135
-rw-r--r--Documentation/arm/README197
-rw-r--r--Documentation/arm/SA1100/ADSBitsy43
-rw-r--r--Documentation/arm/SA1100/Assabet301
-rw-r--r--Documentation/arm/SA1100/Brutus66
-rw-r--r--Documentation/arm/SA1100/CERF29
-rw-r--r--Documentation/arm/SA1100/FreeBird21
-rw-r--r--Documentation/arm/SA1100/GraphicsClient98
-rw-r--r--Documentation/arm/SA1100/GraphicsMaster53
-rw-r--r--Documentation/arm/SA1100/HUW_WEBPANEL17
-rw-r--r--Documentation/arm/SA1100/Itsy39
-rw-r--r--Documentation/arm/SA1100/LART14
-rw-r--r--Documentation/arm/SA1100/PLEB11
-rw-r--r--Documentation/arm/SA1100/Pangolin23
-rw-r--r--Documentation/arm/SA1100/Tifon7
-rw-r--r--Documentation/arm/SA1100/Victor16
-rw-r--r--Documentation/arm/SA1100/Yopy2
-rw-r--r--Documentation/arm/SA1100/empeg2
-rw-r--r--Documentation/arm/SA1100/nanoEngine11
-rw-r--r--Documentation/arm/SA1100/serial_UART47
-rw-r--r--Documentation/arm/Samsung-S3C24XX/DMA.txt46
-rw-r--r--Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt58
-rw-r--r--Documentation/arm/Samsung-S3C24XX/GPIO.txt137
-rw-r--r--Documentation/arm/Samsung-S3C24XX/H1940.txt40
-rw-r--r--Documentation/arm/Samsung-S3C24XX/NAND.txt30
-rw-r--r--Documentation/arm/Samsung-S3C24XX/Overview.txt302
-rw-r--r--Documentation/arm/Samsung-S3C24XX/S3C2412.txt120
-rw-r--r--Documentation/arm/Samsung-S3C24XX/S3C2413.txt21
-rw-r--r--Documentation/arm/Samsung-S3C24XX/SMDK2440.txt56
-rw-r--r--Documentation/arm/Samsung-S3C24XX/Suspend.txt137
-rw-r--r--Documentation/arm/Samsung-S3C24XX/USB-Host.txt93
-rw-r--r--Documentation/arm/Setup129
-rw-r--r--Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen61
-rw-r--r--Documentation/arm/Sharp-LH/CompactFlash32
-rw-r--r--Documentation/arm/Sharp-LH/IOBarrier45
-rw-r--r--Documentation/arm/Sharp-LH/KEV7A4008
-rw-r--r--Documentation/arm/Sharp-LH/LCDPanels59
-rw-r--r--Documentation/arm/Sharp-LH/LPD7A40015
-rw-r--r--Documentation/arm/Sharp-LH/LPD7A40X16
-rw-r--r--Documentation/arm/Sharp-LH/SDRAM51
-rw-r--r--Documentation/arm/Sharp-LH/VectoredInterruptController80
-rw-r--r--Documentation/arm/VFP/release-notes.txt55
-rw-r--r--Documentation/arm/mem_alignment58
-rw-r--r--Documentation/arm/memory.txt74
-rw-r--r--Documentation/arm/nwfpe/NOTES29
-rw-r--r--Documentation/arm/nwfpe/README70
-rw-r--r--Documentation/arm/nwfpe/README.FPE156
-rw-r--r--Documentation/arm/nwfpe/TODO67
-rw-r--r--Documentation/atomic_ops.txt547
-rw-r--r--Documentation/auxdisplay/.gitignore1
-rw-r--r--Documentation/auxdisplay/Makefile10
-rw-r--r--Documentation/auxdisplay/cfag12864b105
-rw-r--r--Documentation/auxdisplay/cfag12864b-example.c282
-rw-r--r--Documentation/auxdisplay/ks010855
-rw-r--r--Documentation/basic_profiling.txt56
-rw-r--r--Documentation/binfmt_misc.txt116
-rw-r--r--Documentation/blackfin/00-INDEX11
-rw-r--r--Documentation/blackfin/Filesystems169
-rw-r--r--Documentation/blackfin/cache-lock.txt48
-rw-r--r--Documentation/blackfin/cachefeatures.txt65
-rw-r--r--Documentation/block/00-INDEX20
-rw-r--r--Documentation/block/as-iosched.txt172
-rw-r--r--Documentation/block/barrier.txt261
-rw-r--r--Documentation/block/biodoc.txt1215
-rw-r--r--Documentation/block/capability.txt15
-rw-r--r--Documentation/block/data-integrity.txt327
-rw-r--r--Documentation/block/deadline-iosched.txt75
-rw-r--r--Documentation/block/ioprio.txt183
-rw-r--r--Documentation/block/request.txt88
-rw-r--r--Documentation/block/stat.txt82
-rw-r--r--Documentation/block/switching-sched.txt43
-rw-r--r--Documentation/blockdev/00-INDEX16
-rw-r--r--Documentation/blockdev/README.DAC960756
-rw-r--r--Documentation/blockdev/cciss.txt171
-rw-r--r--Documentation/blockdev/cpqarray.txt93
-rw-r--r--Documentation/blockdev/floppy.txt245
-rw-r--r--Documentation/blockdev/nbd.txt47
-rw-r--r--Documentation/blockdev/paride.txt417
-rw-r--r--Documentation/blockdev/ramdisk.txt165
-rw-r--r--Documentation/braille-console.txt34
-rw-r--r--Documentation/bt8xxgpio.txt67
-rw-r--r--Documentation/c2port.txt90
-rw-r--r--Documentation/cachetlb.txt379
-rw-r--r--Documentation/cdrom/00-INDEX11
-rw-r--r--Documentation/cdrom/Makefile21
-rw-r--r--Documentation/cdrom/cdrom-standard.tex1022
-rw-r--r--Documentation/cdrom/ide-cd573
-rw-r--r--Documentation/cdrom/packet-writing.txt132
-rw-r--r--Documentation/cgroups/cgroups.txt548
-rw-r--r--Documentation/cgroups/freezer-subsystem.txt102
-rw-r--r--Documentation/connector/.gitignore1
-rw-r--r--Documentation/connector/Makefile11
-rw-r--r--Documentation/connector/cn_test.c193
-rw-r--r--Documentation/connector/connector.txt178
-rw-r--r--Documentation/connector/ucon.c206
-rw-r--r--Documentation/console/console.txt144
-rw-r--r--Documentation/controllers/devices.txt52
-rw-r--r--Documentation/controllers/memory.txt284
-rw-r--r--Documentation/controllers/resource_counter.txt181
-rw-r--r--Documentation/cpu-freq/amd-powernow.txt38
-rw-r--r--Documentation/cpu-freq/core.txt98
-rw-r--r--Documentation/cpu-freq/cpu-drivers.txt216
-rw-r--r--Documentation/cpu-freq/cpufreq-nforce2.txt19
-rw-r--r--Documentation/cpu-freq/cpufreq-stats.txt128
-rw-r--r--Documentation/cpu-freq/governors.txt213
-rw-r--r--Documentation/cpu-freq/index.txt54
-rw-r--r--Documentation/cpu-freq/user-guide.txt215
-rw-r--r--Documentation/cpu-hotplug.txt389
-rw-r--r--Documentation/cpu-load.txt113
-rw-r--r--Documentation/cpuidle/core.txt23
-rw-r--r--Documentation/cpuidle/driver.txt31
-rw-r--r--Documentation/cpuidle/governor.txt29
-rw-r--r--Documentation/cpuidle/sysfs.txt79
-rw-r--r--Documentation/cpusets.txt808
-rw-r--r--Documentation/cputopology.txt33
-rw-r--r--Documentation/cris/README195
-rw-r--r--Documentation/crypto/api-intro.txt248
-rw-r--r--Documentation/crypto/async-tx-api.txt219
-rw-r--r--Documentation/crypto/descore-readme.txt352
-rw-r--r--Documentation/dcdbas.txt91
-rw-r--r--Documentation/debugging-modules.txt22
-rw-r--r--Documentation/debugging-via-ohci1394.txt184
-rw-r--r--Documentation/dell_rbu.txt100
-rw-r--r--Documentation/development-process/1.Intro274
-rw-r--r--Documentation/development-process/2.Process459
-rw-r--r--Documentation/development-process/3.Early-stage195
-rw-r--r--Documentation/development-process/4.Coding384
-rw-r--r--Documentation/development-process/5.Posting278
-rw-r--r--Documentation/development-process/6.Followthrough202
-rw-r--r--Documentation/development-process/7.AdvancedTopics173
-rw-r--r--Documentation/development-process/8.Conclusion74
-rw-r--r--Documentation/device-mapper/delay.txt26
-rw-r--r--Documentation/device-mapper/dm-crypt.txt52
-rw-r--r--Documentation/device-mapper/dm-io.txt75
-rw-r--r--Documentation/device-mapper/dm-uevent.txt97
-rw-r--r--Documentation/device-mapper/kcopyd.txt47
-rw-r--r--Documentation/device-mapper/linear.txt61
-rw-r--r--Documentation/device-mapper/snapshot.txt74
-rw-r--r--Documentation/device-mapper/striped.txt58
-rw-r--r--Documentation/device-mapper/zero.txt37
-rw-r--r--Documentation/devices.txt3335
-rw-r--r--Documentation/dontdiff201
-rw-r--r--Documentation/driver-model/binding.txt102
-rw-r--r--Documentation/driver-model/bus.txt160
-rw-r--r--Documentation/driver-model/class.txt162
-rw-r--r--Documentation/driver-model/device.txt162
-rw-r--r--Documentation/driver-model/devres.txt268
-rw-r--r--Documentation/driver-model/driver.txt230
-rw-r--r--Documentation/driver-model/interface.txt129
-rw-r--r--Documentation/driver-model/overview.txt107
-rw-r--r--Documentation/driver-model/platform.txt171
-rw-r--r--Documentation/driver-model/porting.txt445
-rw-r--r--Documentation/dvb/README.dvb-usb232
-rw-r--r--Documentation/dvb/README.flexcop205
-rw-r--r--Documentation/dvb/avermedia.txt301
-rw-r--r--Documentation/dvb/bt8xx.txt98
-rw-r--r--Documentation/dvb/cards.txt122
-rw-r--r--Documentation/dvb/ci.txt212
-rw-r--r--Documentation/dvb/contributors.txt96
-rw-r--r--Documentation/dvb/faq.txt159
-rw-r--r--Documentation/dvb/get_dvb_firmware509
-rw-r--r--Documentation/dvb/opera-firmware.txt27
-rw-r--r--Documentation/dvb/readme.txt62
-rw-r--r--Documentation/dvb/ttusb-dec.txt45
-rw-r--r--Documentation/dvb/udev.txt46
-rw-r--r--Documentation/early-userspace/README152
-rw-r--r--Documentation/early-userspace/buffer-format.txt112
-rw-r--r--Documentation/edac.txt720
-rw-r--r--Documentation/eisa.txt203
-rw-r--r--Documentation/email-clients.txt241
-rw-r--r--Documentation/exception.txt292
-rw-r--r--Documentation/fault-injection/fault-injection.txt237
-rw-r--r--Documentation/fb/00-INDEX53
-rw-r--r--Documentation/fb/arkfb.txt68
-rw-r--r--Documentation/fb/aty128fb.txt72
-rw-r--r--Documentation/fb/cirrusfb.txt97
-rw-r--r--Documentation/fb/cmap_xfbdev.txt53
-rw-r--r--Documentation/fb/cyblafb/bugs13
-rw-r--r--Documentation/fb/cyblafb/credits7
-rw-r--r--Documentation/fb/cyblafb/documentation17
-rw-r--r--Documentation/fb/cyblafb/fb.modes154
-rw-r--r--Documentation/fb/cyblafb/performance79
-rw-r--r--Documentation/fb/cyblafb/todo31
-rw-r--r--Documentation/fb/cyblafb/usage217
-rw-r--r--Documentation/fb/cyblafb/whatsnew29
-rw-r--r--Documentation/fb/cyblafb/whycyblafb85
-rw-r--r--Documentation/fb/deferred_io.txt75
-rw-r--r--Documentation/fb/fbcon.txt324
-rw-r--r--Documentation/fb/framebuffer.txt345
-rw-r--r--Documentation/fb/gxfb.txt52
-rw-r--r--Documentation/fb/imacfb.txt31
-rw-r--r--Documentation/fb/intel810.txt278
-rw-r--r--Documentation/fb/intelfb.txt149
-rw-r--r--Documentation/fb/internals.txt82
-rw-r--r--Documentation/fb/lxfb.txt52
-rw-r--r--Documentation/fb/matroxfb.txt415
-rw-r--r--Documentation/fb/metronomefb.txt36
-rw-r--r--Documentation/fb/modedb.txt136
-rw-r--r--Documentation/fb/pvr2fb.txt65
-rw-r--r--Documentation/fb/pxafb.txt54
-rw-r--r--Documentation/fb/s3fb.txt82
-rw-r--r--Documentation/fb/sa1100fb.txt39
-rw-r--r--Documentation/fb/sh7760fb.txt131
-rw-r--r--Documentation/fb/sisfb.txt158
-rw-r--r--Documentation/fb/sstfb.txt174
-rw-r--r--Documentation/fb/tgafb.txt69
-rw-r--r--Documentation/fb/tridentfb.txt70
-rw-r--r--Documentation/fb/uvesafb.txt188
-rw-r--r--Documentation/fb/vesafb.txt181
-rw-r--r--Documentation/fb/viafb.modes870
-rw-r--r--Documentation/fb/viafb.txt214
-rw-r--r--Documentation/fb/vt8623fb.txt64
-rw-r--r--Documentation/feature-removal-schedule.txt345
-rw-r--r--Documentation/filesystems/00-INDEX118
-rw-r--r--Documentation/filesystems/9p.txt144
-rw-r--r--Documentation/filesystems/Exporting147
-rw-r--r--Documentation/filesystems/Locking537
-rw-r--r--Documentation/filesystems/adfs.txt57
-rw-r--r--Documentation/filesystems/affs.txt219
-rw-r--r--Documentation/filesystems/afs.txt249
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt393
-rw-r--r--Documentation/filesystems/automount-support.txt118
-rw-r--r--Documentation/filesystems/befs.txt117
-rw-r--r--Documentation/filesystems/bfs.txt57
-rw-r--r--Documentation/filesystems/cifs.txt51
-rw-r--r--Documentation/filesystems/coda.txt1673
-rw-r--r--Documentation/filesystems/configfs/Makefile3
-rw-r--r--Documentation/filesystems/configfs/configfs.txt484
-rw-r--r--Documentation/filesystems/configfs/configfs_example_explicit.c485
-rw-r--r--Documentation/filesystems/configfs/configfs_example_macros.c448
-rw-r--r--Documentation/filesystems/cramfs.txt76
-rw-r--r--Documentation/filesystems/dentry-locking.txt173
-rw-r--r--Documentation/filesystems/directory-locking114
-rw-r--r--Documentation/filesystems/dlmfs.txt130
-rw-r--r--Documentation/filesystems/dnotify.txt99
-rw-r--r--Documentation/filesystems/ecryptfs.txt77
-rw-r--r--Documentation/filesystems/ext2.txt382
-rw-r--r--Documentation/filesystems/ext3.txt202
-rw-r--r--Documentation/filesystems/ext4.txt302
-rw-r--r--Documentation/filesystems/fiemap.txt228
-rw-r--r--Documentation/filesystems/files.txt123
-rw-r--r--Documentation/filesystems/fuse.txt423
-rw-r--r--Documentation/filesystems/gfs2-glocks.txt114
-rw-r--r--Documentation/filesystems/gfs2.txt43
-rw-r--r--Documentation/filesystems/hfs.txt83
-rw-r--r--Documentation/filesystems/hfsplus.txt59
-rw-r--r--Documentation/filesystems/hpfs.txt296
-rw-r--r--Documentation/filesystems/inotify.txt269
-rw-r--r--Documentation/filesystems/isofs.txt43
-rw-r--r--Documentation/filesystems/jfs.txt41
-rw-r--r--Documentation/filesystems/locks.txt67
-rw-r--r--Documentation/filesystems/mandatory-locking.txt171
-rw-r--r--Documentation/filesystems/ncpfs.txt12
-rw-r--r--Documentation/filesystems/nfs-rdma.txt271
-rw-r--r--Documentation/filesystems/nfsroot.txt270
-rw-r--r--Documentation/filesystems/ntfs.txt716
-rw-r--r--Documentation/filesystems/ocfs2.txt81
-rw-r--r--Documentation/filesystems/omfs.txt106
-rw-r--r--Documentation/filesystems/porting275
-rw-r--r--Documentation/filesystems/proc.txt2513
-rw-r--r--Documentation/filesystems/quota.txt65
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt355
-rw-r--r--Documentation/filesystems/relay.txt494
-rw-r--r--Documentation/filesystems/romfs.txt187
-rw-r--r--Documentation/filesystems/rpc-cache.txt202
-rw-r--r--Documentation/filesystems/seq_file.txt294
-rw-r--r--Documentation/filesystems/sharedsubtree.txt1061
-rw-r--r--Documentation/filesystems/smbfs.txt8
-rw-r--r--Documentation/filesystems/spufs.txt521
-rw-r--r--Documentation/filesystems/sysfs-pci.txt107
-rw-r--r--Documentation/filesystems/sysfs.txt357
-rw-r--r--Documentation/filesystems/sysv-fs.txt197
-rw-r--r--Documentation/filesystems/tmpfs.txt136
-rw-r--r--Documentation/filesystems/ubifs.txt173
-rw-r--r--Documentation/filesystems/udf.txt80
-rw-r--r--Documentation/filesystems/ufs.txt60
-rw-r--r--Documentation/filesystems/unionfs/00-INDEX10
-rw-r--r--Documentation/filesystems/unionfs/concepts.txt287
-rw-r--r--Documentation/filesystems/unionfs/issues.txt28
-rw-r--r--Documentation/filesystems/unionfs/rename.txt31
-rw-r--r--Documentation/filesystems/unionfs/usage.txt134
-rw-r--r--Documentation/filesystems/vfat.txt289
-rw-r--r--Documentation/filesystems/vfs.txt1000
-rw-r--r--Documentation/filesystems/xfs.txt261
-rw-r--r--Documentation/filesystems/xip.txt68
-rw-r--r--Documentation/firmware_class/README107
-rw-r--r--Documentation/firmware_class/hotplug-script16
-rw-r--r--Documentation/frv/README.txt51
-rw-r--r--Documentation/frv/atomic-ops.txt134
-rw-r--r--Documentation/frv/booting.txt181
-rw-r--r--Documentation/frv/clock.txt65
-rw-r--r--Documentation/frv/configuring.txt125
-rw-r--r--Documentation/frv/features.txt310
-rw-r--r--Documentation/frv/gdbinit102
-rw-r--r--Documentation/frv/gdbstub.txt130
-rw-r--r--Documentation/frv/kernel-ABI.txt262
-rw-r--r--Documentation/frv/mmu-layout.txt306
-rw-r--r--Documentation/ftrace.txt1339
-rw-r--r--Documentation/gpio.txt570
-rw-r--r--Documentation/highuid.txt77
-rw-r--r--Documentation/hw_random.txt90
-rw-r--r--Documentation/hwmon/abituguru92
-rw-r--r--Documentation/hwmon/abituguru-datasheet312
-rw-r--r--Documentation/hwmon/abituguru365
-rw-r--r--Documentation/hwmon/adm1021111
-rw-r--r--Documentation/hwmon/adm102551
-rw-r--r--Documentation/hwmon/adm102693
-rw-r--r--Documentation/hwmon/adm103135
-rw-r--r--Documentation/hwmon/adm9240177
-rw-r--r--Documentation/hwmon/ads782836
-rw-r--r--Documentation/hwmon/adt746267
-rw-r--r--Documentation/hwmon/adt747076
-rw-r--r--Documentation/hwmon/adt747372
-rw-r--r--Documentation/hwmon/asb10072
-rw-r--r--Documentation/hwmon/coretemp38
-rw-r--r--Documentation/hwmon/dme1737295
-rw-r--r--Documentation/hwmon/ds1621108
-rw-r--r--Documentation/hwmon/f71805f167
-rw-r--r--Documentation/hwmon/fscher169
-rw-r--r--Documentation/hwmon/gl518sm74
-rw-r--r--Documentation/hwmon/ibmaem38
-rw-r--r--Documentation/hwmon/it87157
-rw-r--r--Documentation/hwmon/k8temp55
-rw-r--r--Documentation/hwmon/lis3lv02d49
-rw-r--r--Documentation/hwmon/lm6357
-rw-r--r--Documentation/hwmon/lm7031
-rw-r--r--Documentation/hwmon/lm7565
-rw-r--r--Documentation/hwmon/lm7722
-rw-r--r--Documentation/hwmon/lm7867
-rw-r--r--Documentation/hwmon/lm8056
-rw-r--r--Documentation/hwmon/lm8385
-rw-r--r--Documentation/hwmon/lm85208
-rw-r--r--Documentation/hwmon/lm8777
-rw-r--r--Documentation/hwmon/lm90189
-rw-r--r--Documentation/hwmon/lm9237
-rw-r--r--Documentation/hwmon/lm93302
-rw-r--r--Documentation/hwmon/max161929
-rw-r--r--Documentation/hwmon/max665053
-rw-r--r--Documentation/hwmon/pc87360184
-rw-r--r--Documentation/hwmon/pc8742738
-rw-r--r--Documentation/hwmon/pmbus214
-rw-r--r--Documentation/hwmon/pmbus-core283
-rw-r--r--Documentation/hwmon/sis5595106
-rw-r--r--Documentation/hwmon/smsc47b397163
-rw-r--r--Documentation/hwmon/smsc47m166
-rw-r--r--Documentation/hwmon/smsc47m192103
-rw-r--r--Documentation/hwmon/sysfs-interface505
-rw-r--r--Documentation/hwmon/thmc5074
-rw-r--r--Documentation/hwmon/userspace-tools40
-rw-r--r--Documentation/hwmon/via686a78
-rw-r--r--Documentation/hwmon/vt1211206
-rw-r--r--Documentation/hwmon/w83627ehf126
-rw-r--r--Documentation/hwmon/w83627hf72
-rw-r--r--Documentation/hwmon/w83781d453
-rw-r--r--Documentation/hwmon/w83791d161
-rw-r--r--Documentation/hwmon/w83792d174
-rw-r--r--Documentation/hwmon/w83793106
-rw-r--r--Documentation/hwmon/w83l785ts40
-rw-r--r--Documentation/hwmon/w83l786ng54
-rw-r--r--Documentation/i2c/busses/i2c-ali153542
-rw-r--r--Documentation/i2c/busses/i2c-ali156327
-rw-r--r--Documentation/i2c/busses/i2c-ali15x3112
-rw-r--r--Documentation/i2c/busses/i2c-amd75625
-rw-r--r--Documentation/i2c/busses/i2c-amd811141
-rw-r--r--Documentation/i2c/busses/i2c-i801134
-rw-r--r--Documentation/i2c/busses/i2c-nforce246
-rw-r--r--Documentation/i2c/busses/i2c-ocores51
-rw-r--r--Documentation/i2c/busses/i2c-parport174
-rw-r--r--Documentation/i2c/busses/i2c-parport-light11
-rw-r--r--Documentation/i2c/busses/i2c-pca-isa23
-rw-r--r--Documentation/i2c/busses/i2c-piix498
-rw-r--r--Documentation/i2c/busses/i2c-sis559559
-rw-r--r--Documentation/i2c/busses/i2c-sis63049
-rw-r--r--Documentation/i2c/busses/i2c-sis96x73
-rw-r--r--Documentation/i2c/busses/i2c-taos-evm46
-rw-r--r--Documentation/i2c/busses/i2c-via34
-rw-r--r--Documentation/i2c/busses/i2c-viapro65
-rw-r--r--Documentation/i2c/busses/i2c-voodoo362
-rw-r--r--Documentation/i2c/busses/scx200_acb32
-rw-r--r--Documentation/i2c/chips/eeprom96
-rw-r--r--Documentation/i2c/chips/max6875108
-rw-r--r--Documentation/i2c/chips/pca953958
-rw-r--r--Documentation/i2c/chips/pcf857465
-rw-r--r--Documentation/i2c/chips/pcf857569
-rw-r--r--Documentation/i2c/chips/pcf859190
-rw-r--r--Documentation/i2c/dev-interface214
-rw-r--r--Documentation/i2c/fault-codes127
-rw-r--r--Documentation/i2c/functionality145
-rw-r--r--Documentation/i2c/i2c-protocol76
-rw-r--r--Documentation/i2c/i2c-stub47
-rw-r--r--Documentation/i2c/smbus-protocol227
-rw-r--r--Documentation/i2c/summary47
-rw-r--r--Documentation/i2c/ten-bit-addresses22
-rw-r--r--Documentation/i2c/upgrading-clients281
-rw-r--r--Documentation/i2c/writing-clients389
-rw-r--r--Documentation/i2o/README63
-rw-r--r--Documentation/i2o/ioctl394
-rw-r--r--Documentation/ia64/.gitignore1
-rw-r--r--Documentation/ia64/IRQ-redir.txt69
-rw-r--r--Documentation/ia64/Makefile8
-rw-r--r--Documentation/ia64/README43
-rw-r--r--Documentation/ia64/aliasing-test.c262
-rw-r--r--Documentation/ia64/aliasing.txt223
-rw-r--r--Documentation/ia64/efirtc.txt128
-rw-r--r--Documentation/ia64/err_inject.txt1068
-rw-r--r--Documentation/ia64/fsys.txt286
-rw-r--r--Documentation/ia64/kvm.txt83
-rw-r--r--Documentation/ia64/mca.txt194
-rw-r--r--Documentation/ia64/paravirt_ops.txt137
-rw-r--r--Documentation/ia64/serial.txt151
-rw-r--r--Documentation/ia64/xen.txt183
-rw-r--r--Documentation/ics932s40131
-rw-r--r--Documentation/ide/00-INDEX12
-rw-r--r--Documentation/ide/ChangeLog.ide-cd.1994-2004268
-rw-r--r--Documentation/ide/ChangeLog.ide-floppy.1996-200263
-rw-r--r--Documentation/ide/ChangeLog.ide-tape.1995-2002257
-rw-r--r--Documentation/ide/ide-tape.txt65
-rw-r--r--Documentation/ide/ide.txt254
-rw-r--r--Documentation/ide/warm-plug-howto.txt13
-rw-r--r--Documentation/infiniband/core_locking.txt114
-rw-r--r--Documentation/infiniband/ipoib.txt57
-rw-r--r--Documentation/infiniband/sysfs.txt66
-rw-r--r--Documentation/infiniband/user_mad.txt148
-rw-r--r--Documentation/infiniband/user_verbs.txt69
-rw-r--r--Documentation/initrd.txt366
-rw-r--r--Documentation/input/amijoy.txt184
-rw-r--r--Documentation/input/appletouch.txt85
-rw-r--r--Documentation/input/atarikbd.txt709
-rw-r--r--Documentation/input/cd32.txt19
-rw-r--r--Documentation/input/cs461x.txt45
-rw-r--r--Documentation/input/elantech.txt405
-rw-r--r--Documentation/input/ff.txt219
-rw-r--r--Documentation/input/gameport-programming.txt187
-rw-r--r--Documentation/input/iforce-protocol.txt258
-rw-r--r--Documentation/input/input-programming.txt302
-rw-r--r--Documentation/input/input.txt290
-rw-r--r--Documentation/input/interactive.fig42
-rw-r--r--Documentation/input/joystick-api.txt314
-rw-r--r--Documentation/input/joystick-parport.txt542
-rw-r--r--Documentation/input/joystick.txt586
-rw-r--r--Documentation/input/notifier.txt52
-rw-r--r--Documentation/input/shape.fig65
-rw-r--r--Documentation/input/xpad.txt183
-rw-r--r--Documentation/input/yealink.txt216
-rw-r--r--Documentation/io-mapping.txt82
-rw-r--r--Documentation/io_ordering.txt47
-rw-r--r--Documentation/ioctl/00-INDEX10
-rw-r--r--Documentation/ioctl/cdrom.txt966
-rw-r--r--Documentation/ioctl/hdio.txt1071
-rw-r--r--Documentation/ioctl/ioctl-decoding.txt24
-rw-r--r--Documentation/ioctl/ioctl-number.txt201
-rw-r--r--Documentation/iostats.txt163
-rw-r--r--Documentation/irqflags-tracing.txt57
-rw-r--r--Documentation/isapnp.txt14
-rw-r--r--Documentation/isdn/00-INDEX43
-rw-r--r--Documentation/isdn/CREDITS70
-rw-r--r--Documentation/isdn/HiSax.cert96
-rw-r--r--Documentation/isdn/INTERFACE759
-rw-r--r--Documentation/isdn/INTERFACE.fax163
-rw-r--r--Documentation/isdn/README599
-rw-r--r--Documentation/isdn/README.FAQ26
-rw-r--r--Documentation/isdn/README.HiSax659
-rw-r--r--Documentation/isdn/README.act2000104
-rw-r--r--Documentation/isdn/README.audio138
-rw-r--r--Documentation/isdn/README.avmb1187
-rw-r--r--Documentation/isdn/README.concap259
-rw-r--r--Documentation/isdn/README.diversion127
-rw-r--r--Documentation/isdn/README.fax45
-rw-r--r--Documentation/isdn/README.gigaset318
-rw-r--r--Documentation/isdn/README.hfc-pci41
-rw-r--r--Documentation/isdn/README.hysdn195
-rw-r--r--Documentation/isdn/README.icn148
-rw-r--r--Documentation/isdn/README.mISDN6
-rw-r--r--Documentation/isdn/README.pcbit40
-rw-r--r--Documentation/isdn/README.sc281
-rw-r--r--Documentation/isdn/README.syncppp58
-rw-r--r--Documentation/isdn/README.x25184
-rw-r--r--Documentation/isdn/syncPPP.FAQ224
-rw-r--r--Documentation/ja_JP/HOWTO687
-rw-r--r--Documentation/ja_JP/SubmitChecklist111
-rw-r--r--Documentation/ja_JP/SubmittingPatches556
-rw-r--r--Documentation/ja_JP/stable_api_nonsense.txt263
-rw-r--r--Documentation/ja_JP/stable_kernel_rules.txt79
-rw-r--r--Documentation/java.txt396
-rw-r--r--Documentation/kbuild/00-INDEX8
-rw-r--r--Documentation/kbuild/kconfig-language.txt379
-rw-r--r--Documentation/kbuild/makefiles.txt1228
-rw-r--r--Documentation/kbuild/modules.txt552
-rw-r--r--Documentation/kdump/gdbmacros.txt201
-rw-r--r--Documentation/kdump/kdump.txt444
-rw-r--r--Documentation/kernel-doc-nano-HOWTO.txt307
-rw-r--r--Documentation/kernel-docs.txt761
-rw-r--r--Documentation/kernel-parameters.txt2385
-rw-r--r--Documentation/keys-request-key.txt201
-rw-r--r--Documentation/keys.txt1233
-rw-r--r--Documentation/ko_KR/HOWTO625
-rw-r--r--Documentation/ko_KR/stable_api_nonsense.txt195
-rw-r--r--Documentation/kobject.txt390
-rw-r--r--Documentation/kprobes.txt506
-rw-r--r--Documentation/kref.txt216
-rw-r--r--Documentation/laptops/00-INDEX12
-rw-r--r--Documentation/laptops/acer-wmi.txt180
-rw-r--r--Documentation/laptops/disk-shock-protection.txt149
-rw-r--r--Documentation/laptops/laptop-mode.txt950
-rw-r--r--Documentation/laptops/sony-laptop.txt116
-rw-r--r--Documentation/laptops/sonypi.txt152
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt1488
-rw-r--r--Documentation/ldm.txt109
-rw-r--r--Documentation/leds-class.txt94
-rw-r--r--Documentation/lguest/Makefile8
-rw-r--r--Documentation/lguest/extract58
-rw-r--r--Documentation/lguest/lguest.c2088
-rw-r--r--Documentation/lguest/lguest.txt120
-rw-r--r--Documentation/local_ops.txt186
-rw-r--r--Documentation/lockdep-design.txt219
-rw-r--r--Documentation/lockstat.txt120
-rw-r--r--Documentation/logo.gifbin0 -> 16335 bytes
-rw-r--r--Documentation/logo.txt13
-rw-r--r--Documentation/m68k/00-INDEX5
-rw-r--r--Documentation/m68k/README.buddha210
-rw-r--r--Documentation/m68k/kernel-options.txt872
-rw-r--r--Documentation/magic-number.txt172
-rw-r--r--Documentation/make/headers_install.txt46
-rw-r--r--Documentation/markers.txt85
-rw-r--r--Documentation/mca.txt313
-rw-r--r--Documentation/md.txt486
-rw-r--r--Documentation/memory-barriers.txt2156
-rw-r--r--Documentation/memory-hotplug.txt374
-rw-r--r--Documentation/memory.txt60
-rw-r--r--Documentation/mips/00-INDEX4
-rw-r--r--Documentation/mips/AU1xxx_IDE.README126
-rw-r--r--Documentation/mn10300/ABI.txt149
-rw-r--r--Documentation/mn10300/compartmentalisation.txt60
-rw-r--r--Documentation/mono.txt66
-rw-r--r--Documentation/mtd/nand_ecc.txt714
-rw-r--r--Documentation/mutex-design.txt138
-rw-r--r--Documentation/namespaces/compatibility-list.txt39
-rw-r--r--Documentation/netlabel/00-INDEX10
-rw-r--r--Documentation/netlabel/cipso_ipv4.txt48
-rw-r--r--Documentation/netlabel/draft-ietf-cipso-ipsecurity-01.txt791
-rw-r--r--Documentation/netlabel/introduction.txt46
-rw-r--r--Documentation/netlabel/lsm_interface.txt47
-rw-r--r--Documentation/networking/.gitignore1
-rw-r--r--Documentation/networking/00-INDEX110
-rw-r--r--Documentation/networking/3c359.txt58
-rw-r--r--Documentation/networking/3c505.txt45
-rw-r--r--Documentation/networking/3c509.txt210
-rw-r--r--Documentation/networking/6pack.txt175
-rw-r--r--Documentation/networking/DLINK.txt203
-rw-r--r--Documentation/networking/LICENSE.qla3xxx46
-rw-r--r--Documentation/networking/LICENSE.qlge46
-rw-r--r--Documentation/networking/Makefile8
-rw-r--r--Documentation/networking/PLIP.txt215
-rw-r--r--Documentation/networking/README.ipw2100294
-rw-r--r--Documentation/networking/README.ipw2200472
-rw-r--r--Documentation/networking/README.sb1000207
-rw-r--r--Documentation/networking/alias.txt53
-rw-r--r--Documentation/networking/arcnet-hardware.txt3133
-rw-r--r--Documentation/networking/arcnet.txt555
-rw-r--r--Documentation/networking/atm.txt8
-rw-r--r--Documentation/networking/ax25.txt10
-rw-r--r--Documentation/networking/baycom.txt158
-rw-r--r--Documentation/networking/bonding.txt2346
-rw-r--r--Documentation/networking/bridge.txt8
-rw-r--r--Documentation/networking/can.txt665
-rw-r--r--Documentation/networking/cops.txt63
-rw-r--r--Documentation/networking/cs89x0.txt703
-rw-r--r--Documentation/networking/cxacru.txt84
-rw-r--r--Documentation/networking/cxgb.txt352
-rw-r--r--Documentation/networking/dccp.txt156
-rw-r--r--Documentation/networking/de4x5.txt178
-rw-r--r--Documentation/networking/decnet.txt232
-rw-r--r--Documentation/networking/depca.txt92
-rw-r--r--Documentation/networking/dl2k.txt281
-rw-r--r--Documentation/networking/dm9000.txt167
-rw-r--r--Documentation/networking/dmfe.txt65
-rw-r--r--Documentation/networking/driver.txt94
-rw-r--r--Documentation/networking/e100.txt206
-rw-r--r--Documentation/networking/e1000.txt642
-rw-r--r--Documentation/networking/eql.txt528
-rw-r--r--Documentation/networking/ewrk3.txt46
-rw-r--r--Documentation/networking/fib_trie.txt145
-rw-r--r--Documentation/networking/filter.txt42
-rw-r--r--Documentation/networking/fore200e.txt66
-rw-r--r--Documentation/networking/framerelay.txt39
-rw-r--r--Documentation/networking/gen_stats.txt117
-rw-r--r--Documentation/networking/generic-hdlc.txt132
-rw-r--r--Documentation/networking/generic_netlink.txt3
-rw-r--r--Documentation/networking/gianfar.txt72
-rw-r--r--Documentation/networking/ifenslave.c1103
-rw-r--r--Documentation/networking/ip-sysctl.txt1268
-rw-r--r--Documentation/networking/ip_dynaddr.txt29
-rw-r--r--Documentation/networking/ipddp.txt78
-rw-r--r--Documentation/networking/iphase.txt158
-rw-r--r--Documentation/networking/ipvs-sysctl.txt143
-rw-r--r--Documentation/networking/irda.txt10
-rw-r--r--Documentation/networking/ixgb.txt433
-rw-r--r--Documentation/networking/l2tp.txt169
-rw-r--r--Documentation/networking/lapb-module.txt263
-rw-r--r--Documentation/networking/ltpc.txt131
-rw-r--r--Documentation/networking/mac80211-injection.txt79
-rw-r--r--Documentation/networking/mac80211_hwsim/README67
-rw-r--r--Documentation/networking/mac80211_hwsim/hostapd.conf11
-rw-r--r--Documentation/networking/mac80211_hwsim/wpa_supplicant.conf10
-rw-r--r--Documentation/networking/multicast.txt63
-rw-r--r--Documentation/networking/multiqueue.txt79
-rw-r--r--Documentation/networking/netconsole.txt156
-rw-r--r--Documentation/networking/netdevices.txt108
-rw-r--r--Documentation/networking/netif-msg.txt79
-rw-r--r--Documentation/networking/olympic.txt79
-rw-r--r--Documentation/networking/operstates.txt161
-rw-r--r--Documentation/networking/packet_mmap.txt399
-rw-r--r--Documentation/networking/phonet.txt175
-rw-r--r--Documentation/networking/phy.txt329
-rw-r--r--Documentation/networking/pktgen.txt248
-rw-r--r--Documentation/networking/policy-routing.txt150
-rw-r--r--Documentation/networking/ppp_generic.txt432
-rw-r--r--Documentation/networking/proc_net_tcp.txt48
-rw-r--r--Documentation/networking/radiotap-headers.txt152
-rw-r--r--Documentation/networking/ray_cs.txt150
-rw-r--r--Documentation/networking/regulatory.txt194
-rw-r--r--Documentation/networking/rxrpc.txt866
-rw-r--r--Documentation/networking/s2io.txt150
-rw-r--r--Documentation/networking/sctp.txt38
-rw-r--r--Documentation/networking/secid.txt14
-rw-r--r--Documentation/networking/skfp.txt220
-rw-r--r--Documentation/networking/smc9.txt42
-rw-r--r--Documentation/networking/smctr.txt66
-rw-r--r--Documentation/networking/spider_net.txt204
-rw-r--r--Documentation/networking/tc-actions-env-rules.txt30
-rw-r--r--Documentation/networking/tcp.txt106
-rw-r--r--Documentation/networking/tlan.txt117
-rw-r--r--Documentation/networking/tms380tr.txt147
-rw-r--r--Documentation/networking/tproxy.txt85
-rw-r--r--Documentation/networking/tuntap.txt150
-rw-r--r--Documentation/networking/udplite.txt281
-rw-r--r--Documentation/networking/vortex.txt448
-rw-r--r--Documentation/networking/wavelan.txt73
-rw-r--r--Documentation/networking/x25-iface.txt123
-rw-r--r--Documentation/networking/x25.txt44
-rw-r--r--Documentation/networking/xfrm_proc.txt74
-rw-r--r--Documentation/networking/xfrm_sync.txt169
-rw-r--r--Documentation/networking/xfrm_sysctl.txt4
-rw-r--r--Documentation/networking/z8530drv.txt657
-rw-r--r--Documentation/nmi_watchdog.txt78
-rw-r--r--Documentation/nommu-mmap.txt244
-rw-r--r--Documentation/numastat.txt22
-rw-r--r--Documentation/oops-tracing.txt264
-rw-r--r--Documentation/parisc/00-INDEX6
-rw-r--r--Documentation/parisc/debugging39
-rw-r--r--Documentation/parisc/registers121
-rw-r--r--Documentation/parport-lowlevel.txt1471
-rw-r--r--Documentation/parport.txt268
-rw-r--r--Documentation/pcmcia/.gitignore1
-rw-r--r--Documentation/pcmcia/Makefile10
-rw-r--r--Documentation/pcmcia/crc32hash.c32
-rw-r--r--Documentation/pcmcia/devicetable.txt33
-rw-r--r--Documentation/pcmcia/driver-changes.txt90
-rw-r--r--Documentation/pcmcia/driver.txt30
-rw-r--r--Documentation/pi-futex.txt121
-rw-r--r--Documentation/pnp.txt252
-rw-r--r--Documentation/power/00-INDEX40
-rw-r--r--Documentation/power/apm-acpi.txt32
-rw-r--r--Documentation/power/basic-pm-debugging.txt204
-rw-r--r--Documentation/power/devices.txt512
-rw-r--r--Documentation/power/drivers-testing.txt46
-rw-r--r--Documentation/power/freezing-of-tasks.txt178
-rw-r--r--Documentation/power/interface.txt75
-rw-r--r--Documentation/power/notifiers.txt58
-rw-r--r--Documentation/power/pci.txt299
-rw-r--r--Documentation/power/pm_qos_interface.txt64
-rw-r--r--Documentation/power/power_supply_class.txt173
-rw-r--r--Documentation/power/regulator/consumer.txt182
-rw-r--r--Documentation/power/regulator/machine.txt93
-rw-r--r--Documentation/power/regulator/overview.txt171
-rw-r--r--Documentation/power/regulator/regulator.txt30
-rw-r--r--Documentation/power/s2ram.txt74
-rw-r--r--Documentation/power/states.txt80
-rw-r--r--Documentation/power/swsusp-and-swap-files.txt60
-rw-r--r--Documentation/power/swsusp-dmcrypt.txt138
-rw-r--r--Documentation/power/swsusp.txt407
-rw-r--r--Documentation/power/tricks.txt27
-rw-r--r--Documentation/power/userland-swsusp.txt165
-rw-r--r--Documentation/power/video.txt185
-rw-r--r--Documentation/power/video_extension.txt37
-rw-r--r--Documentation/powerpc/00-INDEX27
-rw-r--r--Documentation/powerpc/booting-without-of.txt2703
-rw-r--r--Documentation/powerpc/bootwrapper.txt141
-rw-r--r--Documentation/powerpc/cpu_features.txt56
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/83xx-512x-pci.txt40
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt40
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/board.txt29
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt67
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/brg.txt21
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/i2c.txt41
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/pic.txt18
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/usb.txt15
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt38
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/network.txt45
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt58
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt24
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/par_io.txt51
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/pincfg.txt60
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/ucc.txt70
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/usb.txt37
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/serial.txt32
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/diu.txt18
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/dma.txt136
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/gtm.txt31
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/guts.txt25
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/i2c.txt32
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/lbc.txt35
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/mcu-mpc8349emitx.txt17
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/msi-pic.txt36
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/pmc.txt63
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/sata.txt29
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/sec.txt68
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/spi.txt24
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/ssi.txt61
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/tsec.txt62
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/upm-nand.txt28
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/usb.txt59
-rw-r--r--Documentation/powerpc/dts-bindings/gpio/led.txt15
-rw-r--r--Documentation/powerpc/eeh-pci-error-recovery.txt334
-rw-r--r--Documentation/powerpc/hvcs.txt567
-rw-r--r--Documentation/powerpc/kvm_440.txt41
-rw-r--r--Documentation/powerpc/mpc52xx-device-tree-bindings.txt277
-rw-r--r--Documentation/powerpc/mpc52xx.txt39
-rw-r--r--Documentation/powerpc/phyp-assisted-dump.txt127
-rw-r--r--Documentation/powerpc/qe_firmware.txt295
-rw-r--r--Documentation/powerpc/sound.txt81
-rw-r--r--Documentation/powerpc/zImage_layout.txt47
-rw-r--r--Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c96
-rw-r--r--Documentation/prctl/disable-tsc-on-off-stress-test.c95
-rw-r--r--Documentation/prctl/disable-tsc-test.c94
-rw-r--r--Documentation/preempt-locking.txt135
-rw-r--r--Documentation/printk-formats.txt35
-rw-r--r--Documentation/prio_tree.txt107
-rw-r--r--Documentation/rbtree.txt192
-rw-r--r--Documentation/rfkill.txt569
-rw-r--r--Documentation/robust-futex-ABI.txt182
-rw-r--r--Documentation/robust-futexes.txt218
-rw-r--r--Documentation/rt-mutex-design.txt781
-rw-r--r--Documentation/rt-mutex.txt79
-rw-r--r--Documentation/rtc.txt450
-rw-r--r--Documentation/s390/00-INDEX26
-rw-r--r--Documentation/s390/3270.ChangeLog44
-rw-r--r--Documentation/s390/3270.txt272
-rw-r--r--Documentation/s390/CommonIO117
-rw-r--r--Documentation/s390/DASD73
-rw-r--r--Documentation/s390/Debugging390.txt2535
-rw-r--r--Documentation/s390/TAPE122
-rw-r--r--Documentation/s390/cds.txt472
-rw-r--r--Documentation/s390/config3270.sh76
-rw-r--r--Documentation/s390/driver-model.txt287
-rw-r--r--Documentation/s390/kvm.txt125
-rw-r--r--Documentation/s390/monreader.txt197
-rw-r--r--Documentation/s390/s390dbf.txt649
-rw-r--r--Documentation/s390/zfcpdump.txt87
-rw-r--r--Documentation/scheduler/00-INDEX16
-rw-r--r--Documentation/scheduler/sched-arch.txt89
-rw-r--r--Documentation/scheduler/sched-coding.txt126
-rw-r--r--Documentation/scheduler/sched-design-CFS.txt275
-rw-r--r--Documentation/scheduler/sched-domains.txt67
-rw-r--r--Documentation/scheduler/sched-nice-design.txt108
-rw-r--r--Documentation/scheduler/sched-rt-group.txt177
-rw-r--r--Documentation/scheduler/sched-stats.txt156
-rw-r--r--Documentation/scsi/00-INDEX94
-rw-r--r--Documentation/scsi/53c700.txt154
-rw-r--r--Documentation/scsi/BusLogic.txt566
-rw-r--r--Documentation/scsi/ChangeLog.1992-19972023
-rw-r--r--Documentation/scsi/ChangeLog.arcmsr118
-rw-r--r--Documentation/scsi/ChangeLog.ips122
-rw-r--r--Documentation/scsi/ChangeLog.lpfc1865
-rw-r--r--Documentation/scsi/ChangeLog.megaraid614
-rw-r--r--Documentation/scsi/ChangeLog.megaraid_sas325
-rw-r--r--Documentation/scsi/ChangeLog.ncr53c8xx495
-rw-r--r--Documentation/scsi/ChangeLog.sym53c8xx593
-rw-r--r--Documentation/scsi/ChangeLog.sym53c8xx_2144
-rw-r--r--Documentation/scsi/FlashPoint.txt163
-rw-r--r--Documentation/scsi/LICENSE.FlashPoint60
-rw-r--r--Documentation/scsi/LICENSE.qla2xxx45
-rw-r--r--Documentation/scsi/Mylex.txt5
-rw-r--r--Documentation/scsi/NinjaSCSI.txt130
-rw-r--r--Documentation/scsi/aacraid.txt157
-rw-r--r--Documentation/scsi/advansys.txt243
-rw-r--r--Documentation/scsi/aha152x.txt183
-rw-r--r--Documentation/scsi/aic79xx.txt497
-rw-r--r--Documentation/scsi/aic7xxx.txt394
-rw-r--r--Documentation/scsi/aic7xxx_old.txt511
-rw-r--r--Documentation/scsi/arcmsr_spec.txt574
-rw-r--r--Documentation/scsi/dc395x.txt102
-rw-r--r--Documentation/scsi/dpti.txt83
-rw-r--r--Documentation/scsi/dtc3x80.txt43
-rw-r--r--Documentation/scsi/g_NCR5380.txt63
-rw-r--r--Documentation/scsi/hptiop.txt104
-rw-r--r--Documentation/scsi/ibmmca.txt1402
-rw-r--r--Documentation/scsi/in2000.txt202
-rw-r--r--Documentation/scsi/libsas.txt484
-rw-r--r--Documentation/scsi/link_power_management_policy.txt19
-rw-r--r--Documentation/scsi/lpfc.txt83
-rw-r--r--Documentation/scsi/megaraid.txt70
-rw-r--r--Documentation/scsi/ncr53c8xx.txt1849
-rw-r--r--Documentation/scsi/osst.txt218
-rw-r--r--Documentation/scsi/ppa.txt14
-rw-r--r--Documentation/scsi/qlogicfas.txt78
-rw-r--r--Documentation/scsi/scsi-changer.txt180
-rw-r--r--Documentation/scsi/scsi-generic.txt101
-rw-r--r--Documentation/scsi/scsi.txt44
-rw-r--r--Documentation/scsi/scsi_eh.txt479
-rw-r--r--Documentation/scsi/scsi_fc_transport.txt486
-rw-r--r--Documentation/scsi/scsi_mid_low_api.txt1421
-rw-r--r--Documentation/scsi/st.txt509
-rw-r--r--Documentation/scsi/sym53c500_cs.txt23
-rw-r--r--Documentation/scsi/sym53c8xx_2.txt1048
-rw-r--r--Documentation/scsi/tmscsim.txt449
-rw-r--r--Documentation/serial-console.txt109
-rw-r--r--Documentation/serial/00-INDEX24
-rw-r--r--Documentation/serial/README.cycladesZ8
-rw-r--r--Documentation/serial/computone.txt522
-rw-r--r--Documentation/serial/digiepca.txt98
-rw-r--r--Documentation/serial/driver397
-rw-r--r--Documentation/serial/hayes-esp.txt154
-rw-r--r--Documentation/serial/moxa-smartio523
-rw-r--r--Documentation/serial/riscom8.txt36
-rw-r--r--Documentation/serial/rocket.txt189
-rw-r--r--Documentation/serial/specialix.txt383
-rw-r--r--Documentation/serial/stallion.txt392
-rw-r--r--Documentation/serial/sx.txt294
-rw-r--r--Documentation/serial/tty.txt292
-rw-r--r--Documentation/sgi-ioc4.txt45
-rw-r--r--Documentation/sgi-visws.txt13
-rw-r--r--Documentation/sh/clk.txt32
-rw-r--r--Documentation/sh/kgdb.txt179
-rw-r--r--Documentation/sh/new-machine.txt278
-rw-r--r--Documentation/sh/register-banks.txt33
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt2417
-rw-r--r--Documentation/sound/alsa/Audigy-mixer.txt345
-rw-r--r--Documentation/sound/alsa/Audiophile-Usb.txt442
-rw-r--r--Documentation/sound/alsa/Bt87x.txt78
-rw-r--r--Documentation/sound/alsa/CMIPCI.txt254
-rw-r--r--Documentation/sound/alsa/ControlNames.txt84
-rw-r--r--Documentation/sound/alsa/DocBook/alsa-driver-api.tmpl100
-rw-r--r--Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl6210
-rw-r--r--Documentation/sound/alsa/Joystick.txt86
-rw-r--r--Documentation/sound/alsa/MIXART.txt100
-rw-r--r--Documentation/sound/alsa/OSS-Emulation.txt305
-rw-r--r--Documentation/sound/alsa/Procfile.txt207
-rw-r--r--Documentation/sound/alsa/SB-Live-mixer.txt356
-rw-r--r--Documentation/sound/alsa/VIA82xx-mixer.txt8
-rw-r--r--Documentation/sound/alsa/emu10k1-jack.txt74
-rw-r--r--Documentation/sound/alsa/hda_codec.txt322
-rw-r--r--Documentation/sound/alsa/hdspm.txt362
-rw-r--r--Documentation/sound/alsa/powersave.txt41
-rw-r--r--Documentation/sound/alsa/seq_oss.html409
-rw-r--r--Documentation/sound/alsa/serial-u16550.txt88
-rw-r--r--Documentation/sound/alsa/soc/DAI.txt56
-rw-r--r--Documentation/sound/alsa/soc/clocking.txt51
-rw-r--r--Documentation/sound/alsa/soc/codec.txt198
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt290
-rw-r--r--Documentation/sound/alsa/soc/machine.txt113
-rw-r--r--Documentation/sound/alsa/soc/overview.txt86
-rw-r--r--Documentation/sound/alsa/soc/platform.txt58
-rw-r--r--Documentation/sound/alsa/soc/pops_clicks.txt52
-rw-r--r--Documentation/sound/oss/ALS66
-rw-r--r--Documentation/sound/oss/AudioExcelDSP16101
-rw-r--r--Documentation/sound/oss/CMI8330153
-rw-r--r--Documentation/sound/oss/CS423223
-rw-r--r--Documentation/sound/oss/ESS34
-rw-r--r--Documentation/sound/oss/ESS186855
-rw-r--r--Documentation/sound/oss/Introduction459
-rw-r--r--Documentation/sound/oss/MultiSound1137
-rw-r--r--Documentation/sound/oss/OPL36
-rw-r--r--Documentation/sound/oss/Opti222
-rw-r--r--Documentation/sound/oss/PAS16163
-rw-r--r--Documentation/sound/oss/PSS41
-rw-r--r--Documentation/sound/oss/PSS-updates88
-rw-r--r--Documentation/sound/oss/README.OSS1456
-rw-r--r--Documentation/sound/oss/README.modules106
-rw-r--r--Documentation/sound/oss/README.ymfsb107
-rw-r--r--Documentation/sound/oss/SoundPro105
-rw-r--r--Documentation/sound/oss/Soundblaster53
-rw-r--r--Documentation/sound/oss/Tropez+26
-rw-r--r--Documentation/sound/oss/VIBRA1680
-rw-r--r--Documentation/sound/oss/WaveArtist170
-rw-r--r--Documentation/sound/oss/btaudio92
-rw-r--r--Documentation/sound/oss/mwave185
-rw-r--r--Documentation/sound/oss/ultrasound30
-rw-r--r--Documentation/sound/oss/vwsnd293
-rw-r--r--Documentation/sparc/README-2.546
-rw-r--r--Documentation/sparse.txt82
-rw-r--r--Documentation/spi/.gitignore2
-rw-r--r--Documentation/spi/Makefile12
-rw-r--r--Documentation/spi/butterfly68
-rw-r--r--Documentation/spi/pxa2xx246
-rw-r--r--Documentation/spi/spi-lm70llp69
-rw-r--r--Documentation/spi/spi-summary558
-rw-r--r--Documentation/spi/spidev143
-rw-r--r--Documentation/spi/spidev_fdx.c158
-rw-r--r--Documentation/spi/spidev_test.c202
-rw-r--r--Documentation/spinlocks.txt236
-rw-r--r--Documentation/stable_api_nonsense.txt190
-rw-r--r--Documentation/stable_kernel_rules.txt60
-rw-r--r--Documentation/svga.txt276
-rw-r--r--Documentation/sysctl/00-INDEX16
-rw-r--r--Documentation/sysctl/README75
-rw-r--r--Documentation/sysctl/abi.txt54
-rw-r--r--Documentation/sysctl/ctl_unnumbered.txt22
-rw-r--r--Documentation/sysctl/fs.txt180
-rw-r--r--Documentation/sysctl/kernel.txt383
-rw-r--r--Documentation/sysctl/sunrpc.txt20
-rw-r--r--Documentation/sysctl/vm.txt349
-rw-r--r--Documentation/sysfs-rules.txt163
-rw-r--r--Documentation/sysrq.txt225
-rw-r--r--Documentation/telephony/00-INDEX4
-rw-r--r--Documentation/telephony/ixj.txt399
-rw-r--r--Documentation/thermal/sysfs-api.txt266
-rw-r--r--Documentation/timers/00-INDEX10
-rw-r--r--Documentation/timers/highres.txt249
-rw-r--r--Documentation/timers/hpet.txt299
-rw-r--r--Documentation/timers/hrtimers.txt178
-rw-r--r--Documentation/timers/timer_stats.txt73
-rw-r--r--Documentation/tracepoints.txt101
-rw-r--r--Documentation/tracers/mmiotrace.txt165
-rw-r--r--Documentation/uml/UserModeLinux-HOWTO.txt4636
-rw-r--r--Documentation/unaligned-memory-access.txt252
-rw-r--r--Documentation/unicode.txt175
-rw-r--r--Documentation/unshare.txt295
-rw-r--r--Documentation/usb/CREDITS175
-rw-r--r--Documentation/usb/URB.txt240
-rw-r--r--Documentation/usb/WUSB-Design-overview.txt448
-rw-r--r--Documentation/usb/acm.txt128
-rw-r--r--Documentation/usb/anchors.txt79
-rw-r--r--Documentation/usb/authorization.txt92
-rw-r--r--Documentation/usb/callbacks.txt132
-rw-r--r--Documentation/usb/dma.txt138
-rw-r--r--Documentation/usb/ehci.txt212
-rw-r--r--Documentation/usb/error-codes.txt165
-rw-r--r--Documentation/usb/gadget_printer.txt510
-rw-r--r--Documentation/usb/gadget_serial.txt349
-rw-r--r--Documentation/usb/hiddev.txt205
-rw-r--r--Documentation/usb/hotplug.txt148
-rw-r--r--Documentation/usb/iuu_phoenix.txt84
-rw-r--r--Documentation/usb/linux.inf200
-rw-r--r--Documentation/usb/misc_usbsevseg.txt46
-rw-r--r--Documentation/usb/mtouchusb.txt76
-rw-r--r--Documentation/usb/ohci.txt32
-rw-r--r--Documentation/usb/persist.txt162
-rw-r--r--Documentation/usb/power-management.txt530
-rw-r--r--Documentation/usb/proc_usb_info.txt377
-rw-r--r--Documentation/usb/rio.txt138
-rw-r--r--Documentation/usb/usb-help.txt16
-rw-r--r--Documentation/usb/usb-serial.txt472
-rw-r--r--Documentation/usb/usbmon.txt362
-rw-r--r--Documentation/usb/wusb-cbaf139
-rw-r--r--Documentation/video-output.txt34
-rw-r--r--Documentation/video4linux/.gitignore1
-rw-r--r--Documentation/video4linux/API.html16
-rw-r--r--Documentation/video4linux/CARDLIST.au08286
-rw-r--r--Documentation/video4linux/CARDLIST.bttv153
-rw-r--r--Documentation/video4linux/CARDLIST.cx2388513
-rw-r--r--Documentation/video4linux/CARDLIST.cx8876
-rw-r--r--Documentation/video4linux/CARDLIST.em28xx59
-rw-r--r--Documentation/video4linux/CARDLIST.ivtv24
-rw-r--r--Documentation/video4linux/CARDLIST.saa7134153
-rw-r--r--Documentation/video4linux/CARDLIST.tuner78
-rw-r--r--Documentation/video4linux/CARDLIST.usbvision65
-rw-r--r--Documentation/video4linux/CQcam.txt215
-rw-r--r--Documentation/video4linux/Makefile8
-rw-r--r--Documentation/video4linux/README.cpia191
-rw-r--r--Documentation/video4linux/README.cpia2130
-rw-r--r--Documentation/video4linux/README.cx8869
-rw-r--r--Documentation/video4linux/README.ir72
-rw-r--r--Documentation/video4linux/README.ivtv187
-rw-r--r--Documentation/video4linux/README.pvrusb2212
-rw-r--r--Documentation/video4linux/README.saa713482
-rw-r--r--Documentation/video4linux/Zoran580
-rw-r--r--Documentation/video4linux/bttv/CONTRIBUTORS25
-rw-r--r--Documentation/video4linux/bttv/Cards964
-rw-r--r--Documentation/video4linux/bttv/ICs37
-rw-r--r--Documentation/video4linux/bttv/Insmod-options182
-rw-r--r--Documentation/video4linux/bttv/MAKEDEV28
-rw-r--r--Documentation/video4linux/bttv/Modprobe.conf11
-rw-r--r--Documentation/video4linux/bttv/Modules.conf14
-rw-r--r--Documentation/video4linux/bttv/PROBLEMS62
-rw-r--r--Documentation/video4linux/bttv/README90
-rw-r--r--Documentation/video4linux/bttv/README.WINVIEW33
-rw-r--r--Documentation/video4linux/bttv/README.freeze74
-rw-r--r--Documentation/video4linux/bttv/README.quirks83
-rw-r--r--Documentation/video4linux/bttv/Sound-FAQ148
-rw-r--r--Documentation/video4linux/bttv/Specs3
-rw-r--r--Documentation/video4linux/bttv/THANKS24
-rw-r--r--Documentation/video4linux/bttv/Tuners115
-rw-r--r--Documentation/video4linux/cafe_ccic54
-rw-r--r--Documentation/video4linux/cpia2_overview.txt38
-rw-r--r--Documentation/video4linux/cx18.txt30
-rw-r--r--Documentation/video4linux/cx2341x/README.hm12116
-rw-r--r--Documentation/video4linux/cx2341x/README.vbi45
-rw-r--r--Documentation/video4linux/cx2341x/fw-calling.txt69
-rw-r--r--Documentation/video4linux/cx2341x/fw-decoder-api.txt297
-rw-r--r--Documentation/video4linux/cx2341x/fw-decoder-regs.txt817
-rw-r--r--Documentation/video4linux/cx2341x/fw-dma.txt96
-rw-r--r--Documentation/video4linux/cx2341x/fw-encoder-api.txt709
-rw-r--r--Documentation/video4linux/cx2341x/fw-memory.txt139
-rw-r--r--Documentation/video4linux/cx2341x/fw-osd-api.txt350
-rw-r--r--Documentation/video4linux/cx2341x/fw-upload.txt49
-rw-r--r--Documentation/video4linux/cx88/hauppauge-wintv-cx88-ir.txt54
-rw-r--r--Documentation/video4linux/et61x251.txt315
-rw-r--r--Documentation/video4linux/extract_xc3028.pl926
-rw-r--r--Documentation/video4linux/gspca.txt274
-rw-r--r--Documentation/video4linux/hauppauge-wintv-cx88-ir.txt54
-rw-r--r--Documentation/video4linux/ibmcam.txt324
-rw-r--r--Documentation/video4linux/lifeview.txt42
-rw-r--r--Documentation/video4linux/m5602.txt12
-rw-r--r--Documentation/video4linux/meye.txt129
-rw-r--r--Documentation/video4linux/not-in-cx2388x-datasheet.txt41
-rw-r--r--Documentation/video4linux/ov511.txt288
-rw-r--r--Documentation/video4linux/radiotrack.txt147
-rw-r--r--Documentation/video4linux/se401.txt54
-rw-r--r--Documentation/video4linux/si470x.txt118
-rw-r--r--Documentation/video4linux/sn9c102.txt592
-rw-r--r--Documentation/video4linux/soc-camera.txt120
-rw-r--r--Documentation/video4linux/stv680.txt53
-rw-r--r--Documentation/video4linux/v4lgrab.c192
-rw-r--r--Documentation/video4linux/w9966.txt33
-rw-r--r--Documentation/video4linux/w9968cf.txt458
-rw-r--r--Documentation/video4linux/zc0301.txt270
-rw-r--r--Documentation/video4linux/zr364xx.txt67
-rw-r--r--Documentation/vm/.gitignore1
-rw-r--r--Documentation/vm/00-INDEX20
-rw-r--r--Documentation/vm/Makefile8
-rw-r--r--Documentation/vm/balance93
-rw-r--r--Documentation/vm/hugetlbpage.txt339
-rw-r--r--Documentation/vm/locking130
-rw-r--r--Documentation/vm/numa41
-rw-r--r--Documentation/vm/numa_memory_policy.txt452
-rw-r--r--Documentation/vm/overcommit-accounting73
-rw-r--r--Documentation/vm/page_migration148
-rw-r--r--Documentation/vm/pagemap.txt77
-rw-r--r--Documentation/vm/slabinfo.c1364
-rw-r--r--Documentation/vm/slub.txt269
-rw-r--r--Documentation/vm/unevictable-lru.txt615
-rw-r--r--Documentation/volatile-considered-harmful.txt119
-rw-r--r--Documentation/voyager.txt95
-rw-r--r--Documentation/w1/00-INDEX10
-rw-r--r--Documentation/w1/masters/00-INDEX8
-rw-r--r--Documentation/w1/masters/ds248231
-rw-r--r--Documentation/w1/masters/ds249070
-rw-r--r--Documentation/w1/masters/omap-hdq46
-rw-r--r--Documentation/w1/masters/w1-gpio33
-rw-r--r--Documentation/w1/slaves/00-INDEX4
-rw-r--r--Documentation/w1/slaves/w1_therm41
-rw-r--r--Documentation/w1/w1.generic113
-rw-r--r--Documentation/w1/w1.netlink98
-rw-r--r--Documentation/watchdog/00-INDEX10
-rw-r--r--Documentation/watchdog/pcwd-watchdog.txt66
-rw-r--r--Documentation/watchdog/src/.gitignore2
-rw-r--r--Documentation/watchdog/src/Makefile8
-rw-r--r--Documentation/watchdog/src/watchdog-simple.c27
-rw-r--r--Documentation/watchdog/src/watchdog-test.c68
-rw-r--r--Documentation/watchdog/watchdog-api.txt238
-rw-r--r--Documentation/watchdog/wdt.txt43
-rw-r--r--Documentation/x86/00-INDEX4
-rw-r--r--Documentation/x86/boot.txt900
-rw-r--r--Documentation/x86/i386/IO-APIC.txt119
-rw-r--r--Documentation/x86/mtrr.txt305
-rw-r--r--Documentation/x86/pat.txt136
-rw-r--r--Documentation/x86/usb-legacy-support.txt44
-rw-r--r--Documentation/x86/x86_64/00-INDEX16
-rw-r--r--Documentation/x86/x86_64/boot-options.txt310
-rw-r--r--Documentation/x86/x86_64/cpu-hotplug-spec21
-rw-r--r--Documentation/x86/x86_64/fake-numa-for-cpusets66
-rw-r--r--Documentation/x86/x86_64/kernel-stacks99
-rw-r--r--Documentation/x86/x86_64/machinecheck77
-rw-r--r--Documentation/x86/x86_64/mm.txt28
-rw-r--r--Documentation/x86/x86_64/uefi.txt42
-rw-r--r--Documentation/x86/zero-page.txt31
-rw-r--r--Documentation/zh_CN/CodingStyle701
-rw-r--r--Documentation/zh_CN/HOWTO538
-rw-r--r--Documentation/zh_CN/SubmittingDrivers168
-rw-r--r--Documentation/zh_CN/SubmittingPatches416
-rw-r--r--Documentation/zh_CN/oops-tracing.txt212
-rw-r--r--Documentation/zh_CN/sparse.txt100
-rw-r--r--Documentation/zh_CN/stable_api_nonsense.txt157
-rw-r--r--Documentation/zh_CN/stable_kernel_rules.txt66
-rw-r--r--Documentation/zh_CN/volatile-considered-harmful.txt113
-rw-r--r--Documentation/zorro.txt102
1224 files changed, 269135 insertions, 0 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
new file mode 100644
index 0000000..2a39aeb
--- /dev/null
+++ b/Documentation/00-INDEX
@@ -0,0 +1,352 @@
+
+This is a brief list of all the files in ./linux/Documentation and what
+they contain. If you add a documentation file, please list it here in
+alphabetical order as well, or risk being hunted down like a rabid dog.
+Please try and keep the descriptions small enough to fit on one line.
+ Thanks -- Paul G.
+
+Following translations are available on the WWW:
+
+ - Japanese, maintained by the JF Project (JF@linux.or.jp), at
+ http://www.linux.or.jp/JF/
+
+00-INDEX
+ - this file.
+ABI/
+ - info on kernel <-> userspace ABI and relative interface stability.
+
+BUG-HUNTING
+ - brute force method of doing binary search of patches to find bug.
+Changes
+ - list of changes that break older software packages.
+CodingStyle
+ - how the boss likes the C code in the kernel to look.
+development-process/
+ - An extended tutorial on how to work with the kernel development
+ process.
+DMA-API.txt
+ - DMA API, pci_ API & extensions for non-consistent memory machines.
+DMA-ISA-LPC.txt
+ - How to do DMA with ISA (and LPC) devices.
+DocBook/
+ - directory with DocBook templates etc. for kernel documentation.
+HOWTO
+ - the process and procedures of how to do Linux kernel development.
+IO-mapping.txt
+ - how to access I/O mapped memory from within device drivers.
+IPMI.txt
+ - info on Linux Intelligent Platform Management Interface (IPMI) Driver.
+IRQ-affinity.txt
+ - how to select which CPU(s) handle which interrupt events on SMP.
+IRQ.txt
+ - description of what an IRQ is.
+ManagementStyle
+ - how to (attempt to) manage kernel hackers.
+RCU/
+ - directory with info on RCU (read-copy update).
+SAK.txt
+ - info on Secure Attention Keys.
+SM501.txt
+ - Silicon Motion SM501 multimedia companion chip
+SecurityBugs
+ - procedure for reporting security bugs found in the kernel.
+SubmitChecklist
+ - Linux kernel patch submission checklist.
+SubmittingDrivers
+ - procedure to get a new driver source included into the kernel tree.
+SubmittingPatches
+ - procedure to get a source patch included into the kernel tree.
+VGA-softcursor.txt
+ - how to change your VGA cursor from a blinking underscore.
+accounting/
+ - documentation on accounting and taskstats.
+acpi/
+ - info on ACPI-specific hooks in the kernel.
+aoe/
+ - description of AoE (ATA over Ethernet) along with config examples.
+applying-patches.txt
+ - description of various trees and how to apply their patches.
+arm/
+ - directory with info about Linux on the ARM architecture.
+atomic_ops.txt
+ - semantics and behavior of atomic and bitmask operations.
+auxdisplay/
+ - misc. LCD driver documentation (cfag12864b, ks0108).
+basic_profiling.txt
+ - basic instructions for those who wants to profile Linux kernel.
+binfmt_misc.txt
+ - info on the kernel support for extra binary formats.
+blackfin/
+ - directory with documentation for the Blackfin arch.
+block/
+ - info on the Block I/O (BIO) layer.
+blockdev/
+ - info on block devices & drivers
+cachetlb.txt
+ - describes the cache/TLB flushing interfaces Linux uses.
+cdrom/
+ - directory with information on the CD-ROM drivers that Linux has.
+connector/
+ - docs on the netlink based userspace<->kernel space communication mod.
+console/
+ - documentation on Linux console drivers.
+cpu-freq/
+ - info on CPU frequency and voltage scaling.
+cpu-hotplug.txt
+ - document describing CPU hotplug support in the Linux kernel.
+cpu-load.txt
+ - document describing how CPU load statistics are collected.
+cpuidle/
+ - info on CPU_IDLE, CPU idle state management subsystem.
+cpusets.txt
+ - documents the cpusets feature; assign CPUs and Mem to a set of tasks.
+cputopology.txt
+ - documentation on how CPU topology info is exported via sysfs.
+cris/
+ - directory with info about Linux on CRIS architecture.
+crypto/
+ - directory with info on the Crypto API.
+dcdbas.txt
+ - information on the Dell Systems Management Base Driver.
+debugging-modules.txt
+ - some notes on debugging modules after Linux 2.6.3.
+dell_rbu.txt
+ - document demonstrating the use of the Dell Remote BIOS Update driver.
+device-mapper/
+ - directory with info on Device Mapper.
+devices.txt
+ - plain ASCII listing of all the nodes in /dev/ with major minor #'s.
+dontdiff
+ - file containing a list of files that should never be diff'ed.
+driver-model/
+ - directory with info about Linux driver model.
+dvb/
+ - info on Linux Digital Video Broadcast (DVB) subsystem.
+early-userspace/
+ - info about initramfs, klibc, and userspace early during boot.
+edac.txt
+ - information on EDAC - Error Detection And Correction
+eisa.txt
+ - info on EISA bus support.
+exception.txt
+ - how Linux v2.2 handles exceptions without verify_area etc.
+fault-injection/
+ - dir with docs about the fault injection capabilities infrastructure.
+fb/
+ - directory with info on the frame buffer graphics abstraction layer.
+feature-removal-schedule.txt
+ - list of files and features that are going to be removed.
+filesystems/
+ - info on the vfs and the various filesystems that Linux supports.
+firmware_class/
+ - request_firmware() hotplug interface info.
+frv/
+ - Fujitsu FR-V Linux documentation.
+gpio.txt
+ - overview of GPIO (General Purpose Input/Output) access conventions.
+highuid.txt
+ - notes on the change from 16 bit to 32 bit user/group IDs.
+timers/
+ - info on the timer related topics
+hw_random.txt
+ - info on Linux support for random number generator in i8xx chipsets.
+hwmon/
+ - directory with docs on various hardware monitoring drivers.
+i2c/
+ - directory with info about the I2C bus/protocol (2 wire, kHz speed).
+i2o/
+ - directory with info about the Linux I2O subsystem.
+x86/i386/
+ - directory with info about Linux on Intel 32 bit architecture.
+ia64/
+ - directory with info about Linux on Intel 64 bit architecture.
+infiniband/
+ - directory with documents concerning Linux InfiniBand support.
+initrd.txt
+ - how to use the RAM disk as an initial/temporary root filesystem.
+input/
+ - info on Linux input device support.
+io_ordering.txt
+ - info on ordering I/O writes to memory-mapped addresses.
+ioctl/
+ - directory with documents describing various IOCTL calls.
+iostats.txt
+ - info on I/O statistics Linux kernel provides.
+irqflags-tracing.txt
+ - how to use the irq-flags tracing feature.
+isapnp.txt
+ - info on Linux ISA Plug & Play support.
+isdn/
+ - directory with info on the Linux ISDN support, and supported cards.
+java.txt
+ - info on the in-kernel binary support for Java(tm).
+kbuild/
+ - directory with info about the kernel build process.
+kdump/
+ - directory with mini HowTo on getting the crash dump code to work.
+kernel-doc-nano-HOWTO.txt
+ - mini HowTo on generation and location of kernel documentation files.
+kernel-docs.txt
+ - listing of various WWW + books that document kernel internals.
+kernel-parameters.txt
+ - summary listing of command line / boot prompt args for the kernel.
+keys-request-key.txt
+ - description of the kernel key request service.
+keys.txt
+ - description of the kernel key retention service.
+kobject.txt
+ - info of the kobject infrastructure of the Linux kernel.
+kprobes.txt
+ - documents the kernel probes debugging feature.
+kref.txt
+ - docs on adding reference counters (krefs) to kernel objects.
+laptops/
+ - directory with laptop related info and laptop driver documentation.
+ldm.txt
+ - a brief description of LDM (Windows Dynamic Disks).
+leds-class.txt
+ - documents LED handling under Linux.
+local_ops.txt
+ - semantics and behavior of local atomic operations.
+lockdep-design.txt
+ - documentation on the runtime locking correctness validator.
+logo.gif
+ - full colour GIF image of Linux logo (penguin - Tux).
+logo.txt
+ - info on creator of above logo & site to get additional images from.
+m68k/
+ - directory with info about Linux on Motorola 68k architecture.
+magic-number.txt
+ - list of magic numbers used to mark/protect kernel data structures.
+mca.txt
+ - info on supporting Micro Channel Architecture (e.g. PS/2) systems.
+md.txt
+ - info on boot arguments for the multiple devices driver.
+memory-barriers.txt
+ - info on Linux kernel memory barriers.
+memory-hotplug.txt
+ - Hotpluggable memory support, how to use and current status.
+memory.txt
+ - info on typical Linux memory problems.
+mips/
+ - directory with info about Linux on MIPS architecture.
+mono.txt
+ - how to execute Mono-based .NET binaries with the help of BINFMT_MISC.
+mutex-design.txt
+ - info on the generic mutex subsystem.
+namespaces/
+ - directory with various information about namespaces
+netlabel/
+ - directory with information on the NetLabel subsystem.
+networking/
+ - directory with info on various aspects of networking with Linux.
+nmi_watchdog.txt
+ - info on NMI watchdog for SMP systems.
+nommu-mmap.txt
+ - documentation about no-mmu memory mapping support.
+numastat.txt
+ - info on how to read Numa policy hit/miss statistics in sysfs.
+oops-tracing.txt
+ - how to decode those nasty internal kernel error dump messages.
+parisc/
+ - directory with info on using Linux on PA-RISC architecture.
+parport.txt
+ - how to use the parallel-port driver.
+parport-lowlevel.txt
+ - description and usage of the low level parallel port functions.
+pcmcia/
+ - info on the Linux PCMCIA driver.
+pi-futex.txt
+ - documentation on lightweight PI-futexes.
+pnp.txt
+ - Linux Plug and Play documentation.
+power/
+ - directory with info on Linux PCI power management.
+powerpc/
+ - directory with info on using Linux with the PowerPC.
+preempt-locking.txt
+ - info on locking under a preemptive kernel.
+printk-formats.txt
+ - how to get printk format specifiers right
+prio_tree.txt
+ - info on radix-priority-search-tree use for indexing vmas.
+rbtree.txt
+ - info on what red-black trees are and what they are for.
+robust-futex-ABI.txt
+ - documentation of the robust futex ABI.
+robust-futexes.txt
+ - a description of what robust futexes are.
+rt-mutex-design.txt
+ - description of the RealTime mutex implementation design.
+rt-mutex.txt
+ - desc. of RT-mutex subsystem with PI (Priority Inheritance) support.
+rtc.txt
+ - notes on how to use the Real Time Clock (aka CMOS clock) driver.
+s390/
+ - directory with info on using Linux on the IBM S390.
+scheduler/
+ - directory with info on the scheduler.
+scsi/
+ - directory with info on Linux scsi support.
+serial/
+ - directory with info on the low level serial API.
+serial-console.txt
+ - how to set up Linux with a serial line console as the default.
+sgi-ioc4.txt
+ - description of the SGI IOC4 PCI (multi function) device.
+sgi-visws.txt
+ - short blurb on the SGI Visual Workstations.
+sh/
+ - directory with info on porting Linux to a new architecture.
+sound/
+ - directory with info on sound card support.
+sparc/
+ - directory with info on using Linux on Sparc architecture.
+sparse.txt
+ - info on how to obtain and use the sparse tool for typechecking.
+spi/
+ - overview of Linux kernel Serial Peripheral Interface (SPI) support.
+spinlocks.txt
+ - info on using spinlocks to provide exclusive access in kernel.
+stable_api_nonsense.txt
+ - info on why the kernel does not have a stable in-kernel api or abi.
+stable_kernel_rules.txt
+ - rules and procedures for the -stable kernel releases.
+svga.txt
+ - short guide on selecting video modes at boot via VGA BIOS.
+sysfs-rules.txt
+ - How not to use sysfs.
+sysctl/
+ - directory with info on the /proc/sys/* files.
+sysrq.txt
+ - info on the magic SysRq key.
+telephony/
+ - directory with info on telephony (e.g. voice over IP) support.
+time_interpolators.txt
+ - info on time interpolators.
+uml/
+ - directory with information about User Mode Linux.
+unicode.txt
+ - info on the Unicode character/font mapping used in Linux.
+unshare.txt
+ - description of the Linux unshare system call.
+usb/
+ - directory with info regarding the Universal Serial Bus.
+video-output.txt
+ - sysfs class driver interface to enable/disable a video output device.
+video4linux/
+ - directory with info regarding video/TV/radio cards and linux.
+vm/
+ - directory with info on the Linux vm code.
+volatile-considered-harmful.txt
+ - Why the "volatile" type class should not be used
+voyager.txt
+ - guide to running Linux on the Voyager architecture.
+w1/
+ - directory with documents regarding the 1-wire (w1) subsystem.
+watchdog/
+ - how to auto-reboot Linux if it has "fallen and can't get up". ;-)
+x86/x86_64/
+ - directory with info on Linux support for AMD x86-64 (Hammer) machines.
+zorro.txt
+ - info on writing drivers for Zorro bus devices found on Amigas.
diff --git a/Documentation/ABI/README b/Documentation/ABI/README
new file mode 100644
index 0000000..9feaf16
--- /dev/null
+++ b/Documentation/ABI/README
@@ -0,0 +1,77 @@
+This directory attempts to document the ABI between the Linux kernel and
+userspace, and the relative stability of these interfaces. Due to the
+everchanging nature of Linux, and the differing maturity levels, these
+interfaces should be used by userspace programs in different ways.
+
+We have four different levels of ABI stability, as shown by the four
+different subdirectories in this location. Interfaces may change levels
+of stability according to the rules described below.
+
+The different levels of stability are:
+
+ stable/
+ This directory documents the interfaces that the developer has
+ defined to be stable. Userspace programs are free to use these
+ interfaces with no restrictions, and backward compatibility for
+ them will be guaranteed for at least 2 years. Most interfaces
+ (like syscalls) are expected to never change and always be
+ available.
+
+ testing/
+ This directory documents interfaces that are felt to be stable,
+ as the main development of this interface has been completed.
+ The interface can be changed to add new features, but the
+ current interface will not break by doing this, unless grave
+ errors or security problems are found in them. Userspace
+ programs can start to rely on these interfaces, but they must be
+ aware of changes that can occur before these interfaces move to
+ be marked stable. Programs that use these interfaces are
+ strongly encouraged to add their name to the description of
+ these interfaces, so that the kernel developers can easily
+ notify them if any changes occur (see the description of the
+ layout of the files below for details on how to do this.)
+
+ obsolete/
+ This directory documents interfaces that are still remaining in
+ the kernel, but are marked to be removed at some later point in
+ time. The description of the interface will document the reason
+ why it is obsolete and when it can be expected to be removed.
+ The file Documentation/feature-removal-schedule.txt may describe
+ some of these interfaces, giving a schedule for when they will
+ be removed.
+
+ removed/
+ This directory contains a list of the old interfaces that have
+ been removed from the kernel.
+
+Every file in these directories will contain the following information:
+
+What: Short description of the interface
+Date: Date created
+KernelVersion: Kernel version this feature first showed up in.
+Contact: Primary contact for this interface (may be a mailing list)
+Description: Long description of the interface and how to use it.
+Users: All users of this interface who wish to be notified when
+ it changes. This is very important for interfaces in
+ the "testing" stage, so that kernel developers can work
+ with userspace developers to ensure that things do not
+ break in ways that are unacceptable. It is also
+ important to get feedback for these interfaces to make
+ sure they are working in a proper way and do not need to
+ be changed further.
+
+
+How things move between levels:
+
+Interfaces in stable may move to obsolete, as long as the proper
+notification is given.
+
+Interfaces may be removed from obsolete and the kernel as long as the
+documented amount of time has gone by.
+
+Interfaces in the testing state can move to the stable state when the
+developers feel they are finished. They cannot be removed from the
+kernel tree without going through the obsolete state first.
+
+It's up to the developer to place their interfaces in the category they
+wish for it to start out in.
diff --git a/Documentation/ABI/obsolete/dv1394 b/Documentation/ABI/obsolete/dv1394
new file mode 100644
index 0000000..2ee3686
--- /dev/null
+++ b/Documentation/ABI/obsolete/dv1394
@@ -0,0 +1,9 @@
+What: dv1394 (a.k.a. "OHCI-DV I/O support" for FireWire)
+Contact: linux1394-devel@lists.sourceforge.net
+Description:
+ New application development should use raw1394 + userspace libraries
+ instead, notably libiec61883 which is functionally equivalent.
+
+Users:
+ ffmpeg/libavformat (used by a variety of media players)
+ dvgrab v1.x (replaced by dvgrab2 on top of raw1394 and resp. libraries)
diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/obsolete/o2cb
new file mode 100644
index 0000000..9c49d8e
--- /dev/null
+++ b/Documentation/ABI/obsolete/o2cb
@@ -0,0 +1,11 @@
+What: /sys/o2cb symlink
+Date: Dec 2005
+KernelVersion: 2.6.16
+Contact: ocfs2-devel@oss.oracle.com
+Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
+ be removed when new versions of ocfs2-tools which know to look
+ in /sys/fs/o2cb are sufficiently prevalent. Don't code new
+ software to look here, it should try /sys/fs/o2cb instead.
+ See Documentation/ABI/stable/o2cb for more information on usage.
+Users: ocfs2-tools. It's sufficient to mail proposed changes to
+ ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/removed/devfs b/Documentation/ABI/removed/devfs
new file mode 100644
index 0000000..8ffd28b
--- /dev/null
+++ b/Documentation/ABI/removed/devfs
@@ -0,0 +1,12 @@
+What: devfs
+Date: July 2005 (scheduled), finally removed in kernel v2.6.18
+Contact: Greg Kroah-Hartman <gregkh@suse.de>
+Description:
+ devfs has been unmaintained for a number of years, has unfixable
+ races, contains a naming policy within the kernel that is
+ against the LSB, and can be replaced by using udev.
+ The files fs/devfs/*, include/linux/devfs_fs*.h were removed,
+ along with the assorted devfs function calls throughout the
+ kernel tree.
+
+Users:
diff --git a/Documentation/ABI/removed/raw1394_legacy_isochronous b/Documentation/ABI/removed/raw1394_legacy_isochronous
new file mode 100644
index 0000000..1b62962
--- /dev/null
+++ b/Documentation/ABI/removed/raw1394_legacy_isochronous
@@ -0,0 +1,16 @@
+What: legacy isochronous ABI of raw1394 (1st generation iso ABI)
+Date: June 2007 (scheduled), removed in kernel v2.6.23
+Contact: linux1394-devel@lists.sourceforge.net
+Description:
+ The two request types RAW1394_REQ_ISO_SEND, RAW1394_REQ_ISO_LISTEN have
+ been deprecated for quite some time. They are very inefficient as they
+ come with high interrupt load and several layers of callbacks for each
+ packet. Because of these deficiencies, the video1394 and dv1394 drivers
+ and the 3rd-generation isochronous ABI in raw1394 (rawiso) were created.
+
+Users:
+ libraw1394 users via the long deprecated API raw1394_iso_write,
+ raw1394_start_iso_write, raw1394_start_iso_rcv, raw1394_stop_iso_rcv
+
+ libdc1394, which optionally uses these old libraw1394 calls
+ alternatively to the more efficient video1394 ABI
diff --git a/Documentation/ABI/stable/o2cb b/Documentation/ABI/stable/o2cb
new file mode 100644
index 0000000..5eb1545
--- /dev/null
+++ b/Documentation/ABI/stable/o2cb
@@ -0,0 +1,10 @@
+What: /sys/fs/o2cb/ (was /sys/o2cb)
+Date: Dec 2005
+KernelVersion: 2.6.16
+Contact: ocfs2-devel@oss.oracle.com
+Description: Ocfs2-tools looks at 'interface-revision' for versioning
+ information. Each logmask/ file controls a set of debug prints
+ and can be written into with the strings "allow", "deny", or
+ "off". Reading the file returns the current state.
+Users: ocfs2-tools. It's sufficient to mail proposed changes to
+ ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/stable/syscalls b/Documentation/ABI/stable/syscalls
new file mode 100644
index 0000000..c3ae3e7
--- /dev/null
+++ b/Documentation/ABI/stable/syscalls
@@ -0,0 +1,10 @@
+What: The kernel syscall interface
+Description:
+ This interface matches much of the POSIX interface and is based
+ on it and other Unix based interfaces. It will only be added to
+ over time, and not have things removed from it.
+
+ Note that this interface is different for every architecture
+ that Linux supports. Please see the architecture-specific
+ documentation for details on the syscall numbers that are to be
+ mapped to each syscall.
diff --git a/Documentation/ABI/stable/sysfs-class-ubi b/Documentation/ABI/stable/sysfs-class-ubi
new file mode 100644
index 0000000..18d471d
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-class-ubi
@@ -0,0 +1,212 @@
+What: /sys/class/ubi/
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ The ubi/ class sub-directory belongs to the UBI subsystem and
+ provides general UBI information, per-UBI device information
+ and per-UBI volume information.
+
+What: /sys/class/ubi/version
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ This file contains version of the latest supported UBI on-media
+ format. Currently it is 1, and there is no plan to change this.
+ However, if in the future UBI needs on-flash format changes
+ which cannot be done in a compatible manner, a new format
+ version will be added. So this is a mechanism for possible
+ future backward-compatible (but forward-incompatible)
+ improvements.
+
+What: /sys/class/ubiX/
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ The /sys/class/ubi0, /sys/class/ubi1, etc directories describe
+ UBI devices (UBI device 0, 1, etc). They contain general UBI
+ device information and per UBI volume information (each UBI
+ device may have many UBI volumes)
+
+What: /sys/class/ubi/ubiX/avail_eraseblocks
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Amount of available logical eraseblock. For example, one may
+ create a new UBI volume which has this amount of logical
+ eraseblocks.
+
+What: /sys/class/ubi/ubiX/bad_peb_count
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Count of bad physical eraseblocks on the underlying MTD device.
+
+What: /sys/class/ubi/ubiX/bgt_enabled
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Contains ASCII "0\n" if the UBI background thread is disabled,
+ and ASCII "1\n" if it is enabled.
+
+What: /sys/class/ubi/ubiX/dev
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Major and minor numbers of the character device corresponding
+ to this UBI device (in <major>:<minor> format).
+
+What: /sys/class/ubi/ubiX/eraseblock_size
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Maximum logical eraseblock size this UBI device may provide. UBI
+ volumes may have smaller logical eraseblock size because of their
+ alignment.
+
+What: /sys/class/ubi/ubiX/max_ec
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Maximum physical eraseblock erase counter value.
+
+What: /sys/class/ubi/ubiX/max_vol_count
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Maximum number of volumes which this UBI device may have.
+
+What: /sys/class/ubi/ubiX/min_io_size
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Minimum input/output unit size. All the I/O may only be done
+ in fractions of the contained number.
+
+What: /sys/class/ubi/ubiX/mtd_num
+Date: January 2008
+KernelVersion: 2.6.25
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Number of the underlying MTD device.
+
+What: /sys/class/ubi/ubiX/reserved_for_bad
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Number of physical eraseblocks reserved for bad block handling.
+
+What: /sys/class/ubi/ubiX/total_eraseblocks
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Total number of good (not marked as bad) physical eraseblocks on
+ the underlying MTD device.
+
+What: /sys/class/ubi/ubiX/volumes_count
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Count of volumes on this UBI device.
+
+What: /sys/class/ubi/ubiX/ubiX_Y/
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ The /sys/class/ubi/ubiX/ubiX_0/, /sys/class/ubi/ubiX/ubiX_1/,
+ etc directories describe UBI volumes on UBI device X (volumes
+ 0, 1, etc).
+
+What: /sys/class/ubi/ubiX/ubiX_Y/alignment
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Volume alignment - the value the logical eraseblock size of
+ this volume has to be aligned on. For example, 2048 means that
+ logical eraseblock size is multiple of 2048. In other words,
+ volume logical eraseblock size is UBI device logical eraseblock
+ size aligned to the alignment value.
+
+What: /sys/class/ubi/ubiX/ubiX_Y/corrupted
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Contains ASCII "0\n" if the UBI volume is OK, and ASCII "1\n"
+ if it is corrupted (e.g., due to an interrupted volume update).
+
+What: /sys/class/ubi/ubiX/ubiX_Y/data_bytes
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ The amount of data this volume contains. This value makes sense
+ only for static volumes, and for dynamic volume it equivalent
+ to the total volume size in bytes.
+
+What: /sys/class/ubi/ubiX/ubiX_Y/dev
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Major and minor numbers of the character device corresponding
+ to this UBI volume (in <major>:<minor> format).
+
+What: /sys/class/ubi/ubiX/ubiX_Y/name
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Volume name.
+
+What: /sys/class/ubi/ubiX/ubiX_Y/reserved_ebs
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Count of physical eraseblock reserved for this volume.
+ Equivalent to the volume size in logical eraseblocks.
+
+What: /sys/class/ubi/ubiX/ubiX_Y/type
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Volume type. Contains ASCII "dynamic\n" for dynamic volumes and
+ "static\n" for static volumes.
+
+What: /sys/class/ubi/ubiX/ubiX_Y/upd_marker
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Contains ASCII "0\n" if the update marker is not set for this
+ volume, and "1\n" if it is set. The update marker is set when
+ volume update starts, and cleaned when it ends. So the presence
+ of the update marker indicates that the volume is being updated
+ at the moment of the update was interrupted. The later may be
+ checked using the "corrupted" sysfs file.
+
+What: /sys/class/ubi/ubiX/ubiX_Y/usable_eb_size
+Date: July 2006
+KernelVersion: 2.6.22
+Contact: Artem Bityutskiy <dedekind@infradead.org>
+Description:
+ Logical eraseblock size of this volume. Equivalent to logical
+ eraseblock size of the device aligned on the volume alignment
+ value.
diff --git a/Documentation/ABI/stable/sysfs-driver-usb-usbtmc b/Documentation/ABI/stable/sysfs-driver-usb-usbtmc
new file mode 100644
index 0000000..9a75fb2
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-driver-usb-usbtmc
@@ -0,0 +1,62 @@
+What: /sys/bus/usb/drivers/usbtmc/devices/*/interface_capabilities
+What: /sys/bus/usb/drivers/usbtmc/devices/*/device_capabilities
+Date: August 2008
+Contact: Greg Kroah-Hartman <gregkh@suse.de>
+Description:
+ These files show the various USB TMC capabilities as described
+ by the device itself. The full description of the bitfields
+ can be found in the USB TMC documents from the USB-IF entitled
+ "Universal Serial Bus Test and Measurement Class Specification
+ (USBTMC) Revision 1.0" section 4.2.1.8.
+
+ The files are read only.
+
+
+What: /sys/bus/usb/drivers/usbtmc/devices/*/usb488_interface_capabilities
+What: /sys/bus/usb/drivers/usbtmc/devices/*/usb488_device_capabilities
+Date: August 2008
+Contact: Greg Kroah-Hartman <gregkh@suse.de>
+Description:
+ These files show the various USB TMC capabilities as described
+ by the device itself. The full description of the bitfields
+ can be found in the USB TMC documents from the USB-IF entitled
+ "Universal Serial Bus Test and Measurement Class, Subclass
+ USB488 Specification (USBTMC-USB488) Revision 1.0" section
+ 4.2.2.
+
+ The files are read only.
+
+
+What: /sys/bus/usb/drivers/usbtmc/devices/*/TermChar
+Date: August 2008
+Contact: Greg Kroah-Hartman <gregkh@suse.de>
+Description:
+ This file is the TermChar value to be sent to the USB TMC
+ device as described by the document, "Universal Serial Bus Test
+ and Measurement Class Specification
+ (USBTMC) Revision 1.0" as published by the USB-IF.
+
+ Note that the TermCharEnabled file determines if this value is
+ sent to the device or not.
+
+
+What: /sys/bus/usb/drivers/usbtmc/devices/*/TermCharEnabled
+Date: August 2008
+Contact: Greg Kroah-Hartman <gregkh@suse.de>
+Description:
+ This file determines if the TermChar is to be sent to the
+ device on every transaction or not. For more details about
+ this, please see the document, "Universal Serial Bus Test and
+ Measurement Class Specification (USBTMC) Revision 1.0" as
+ published by the USB-IF.
+
+
+What: /sys/bus/usb/drivers/usbtmc/devices/*/auto_abort
+Date: August 2008
+Contact: Greg Kroah-Hartman <gregkh@suse.de>
+Description:
+ This file determines if the the transaction of the USB TMC
+ device is to be automatically aborted if there is any error.
+ For more details about this, please see the document,
+ "Universal Serial Bus Test and Measurement Class Specification
+ (USBTMC) Revision 1.0" as published by the USB-IF.
diff --git a/Documentation/ABI/stable/sysfs-module b/Documentation/ABI/stable/sysfs-module
new file mode 100644
index 0000000..75be431
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-module
@@ -0,0 +1,30 @@
+What: /sys/module
+Description:
+ The /sys/module tree consists of the following structure:
+
+ /sys/module/MODULENAME
+ The name of the module that is in the kernel. This
+ module name will show up either if the module is built
+ directly into the kernel, or if it is loaded as a
+ dyanmic module.
+
+ /sys/module/MODULENAME/parameters
+ This directory contains individual files that are each
+ individual parameters of the module that are able to be
+ changed at runtime. See the individual module
+ documentation as to the contents of these parameters and
+ what they accomplish.
+
+ Note: The individual parameter names and values are not
+ considered stable, only the fact that they will be
+ placed in this location within sysfs. See the
+ individual driver documentation for details as to the
+ stability of the different parameters.
+
+ /sys/module/MODULENAME/refcnt
+ If the module is able to be unloaded from the kernel, this file
+ will contain the current reference count of the module.
+
+ Note: If the module is built into the kernel, or if the
+ CONFIG_MODULE_UNLOAD kernel configuration value is not enabled,
+ this file will not be present.
diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd
new file mode 100644
index 0000000..bf9c16b
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-pktcdvd
@@ -0,0 +1,19 @@
+What: /debug/pktcdvd/pktcdvd[0-7]
+Date: Oct. 2006
+KernelVersion: 2.6.20
+Contact: Thomas Maier <balagi@justmail.de>
+Description:
+
+debugfs interface
+-----------------
+
+The pktcdvd module (packet writing driver) creates
+these files in debugfs:
+
+/debug/pktcdvd/pktcdvd[0-7]/
+ info (0444) Lots of driver statistics and infos.
+
+Example:
+-------
+
+cat /debug/pktcdvd/pktcdvd0/info
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
new file mode 100644
index 0000000..9923390
--- /dev/null
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -0,0 +1,22 @@
+What: /proc/diskstats
+Date: February 2008
+Contact: Jerome Marchand <jmarchan@redhat.com>
+Description:
+ The /proc/diskstats file displays the I/O statistics
+ of block devices. Each line contains the following 14
+ fields:
+ 1 - major number
+ 2 - minor mumber
+ 3 - device name
+ 4 - reads completed succesfully
+ 5 - reads merged
+ 6 - sectors read
+ 7 - time spent reading (ms)
+ 8 - writes completed
+ 9 - writes merged
+ 10 - sectors written
+ 11 - time spent writing (ms)
+ 12 - I/Os currently in progress
+ 13 - time spent doing I/Os (ms)
+ 14 - weighted time spent doing I/Os (ms)
+ For more details refer to Documentation/iostats.txt
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
new file mode 100644
index 0000000..44f52a4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-block
@@ -0,0 +1,62 @@
+What: /sys/block/<disk>/stat
+Date: February 2008
+Contact: Jerome Marchand <jmarchan@redhat.com>
+Description:
+ The /sys/block/<disk>/stat files displays the I/O
+ statistics of disk <disk>. They contain 11 fields:
+ 1 - reads completed succesfully
+ 2 - reads merged
+ 3 - sectors read
+ 4 - time spent reading (ms)
+ 5 - writes completed
+ 6 - writes merged
+ 7 - sectors written
+ 8 - time spent writing (ms)
+ 9 - I/Os currently in progress
+ 10 - time spent doing I/Os (ms)
+ 11 - weighted time spent doing I/Os (ms)
+ For more details refer Documentation/iostats.txt
+
+
+What: /sys/block/<disk>/<part>/stat
+Date: February 2008
+Contact: Jerome Marchand <jmarchan@redhat.com>
+Description:
+ The /sys/block/<disk>/<part>/stat files display the
+ I/O statistics of partition <part>. The format is the
+ same as the above-written /sys/block/<disk>/stat
+ format.
+
+
+What: /sys/block/<disk>/integrity/format
+Date: June 2008
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Metadata format for integrity capable block device.
+ E.g. T10-DIF-TYPE1-CRC.
+
+
+What: /sys/block/<disk>/integrity/read_verify
+Date: June 2008
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Indicates whether the block layer should verify the
+ integrity of read requests serviced by devices that
+ support sending integrity metadata.
+
+
+What: /sys/block/<disk>/integrity/tag_size
+Date: June 2008
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Number of bytes of integrity tag space available per
+ 512 bytes of data.
+
+
+What: /sys/block/<disk>/integrity/write_generate
+Date: June 2008
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Indicates whether the block layer should automatically
+ generate checksums for write requests bound for
+ devices that support receiving integrity metadata.
diff --git a/Documentation/ABI/testing/sysfs-bus-css b/Documentation/ABI/testing/sysfs-bus-css
new file mode 100644
index 0000000..b585ec2
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-css
@@ -0,0 +1,35 @@
+What: /sys/bus/css/devices/.../type
+Date: March 2008
+Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
+ linux-s390@vger.kernel.org
+Description: Contains the subchannel type, as reported by the hardware.
+ This attribute is present for all subchannel types.
+
+What: /sys/bus/css/devices/.../modalias
+Date: March 2008
+Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
+ linux-s390@vger.kernel.org
+Description: Contains the module alias as reported with uevents.
+ It is of the format css:t<type> and present for all
+ subchannel types.
+
+What: /sys/bus/css/drivers/io_subchannel/.../chpids
+Date: December 2002
+Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
+ linux-s390@vger.kernel.org
+Description: Contains the ids of the channel paths used by this
+ subchannel, as reported by the channel subsystem
+ during subchannel recognition.
+ Note: This is an I/O-subchannel specific attribute.
+Users: s390-tools, HAL
+
+What: /sys/bus/css/drivers/io_subchannel/.../pimpampom
+Date: December 2002
+Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
+ linux-s390@vger.kernel.org
+Description: Contains the PIM/PAM/POM values, as reported by the
+ channel subsystem when last queried by the common I/O
+ layer (this implies that this attribute is not neccessarily
+ in sync with the values current in the channel subsystem).
+ Note: This is an I/O-subchannel specific attribute.
+Users: s390-tools, HAL
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
new file mode 100644
index 0000000..ceddcff
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -0,0 +1,11 @@
+What: /sys/bus/pci/devices/.../vpd
+Date: February 2008
+Contact: Ben Hutchings <bhutchings@solarflare.com>
+Description:
+ A file named vpd in a device directory will be a
+ binary file containing the Vital Product Data for the
+ device. It should follow the VPD format defined in
+ PCI Specification 2.1 or 2.2, but users should consider
+ that some devices may have malformatted data. If the
+ underlying VPD has a writable section then the
+ corresponding section of this file will be writable.
diff --git a/Documentation/ABI/testing/sysfs-bus-umc b/Documentation/ABI/testing/sysfs-bus-umc
new file mode 100644
index 0000000..948fec4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-umc
@@ -0,0 +1,28 @@
+What: /sys/bus/umc/
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ The Wireless Host Controller Interface (WHCI)
+ specification describes a PCI-based device with
+ multiple capabilities; the UWB Multi-interface
+ Controller (UMC).
+
+ The umc bus presents each of the individual
+ capabilties as a device.
+
+What: /sys/bus/umc/devices/.../capability_id
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ The ID of this capability, with 0 being the radio
+ controller capability.
+
+What: /sys/bus/umc/devices/.../version
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ The specification version this capability's hardware
+ interface complies with.
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
new file mode 100644
index 0000000..7772928
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -0,0 +1,146 @@
+What: /sys/bus/usb/devices/.../power/autosuspend
+Date: March 2007
+KernelVersion: 2.6.21
+Contact: Alan Stern <stern@rowland.harvard.edu>
+Description:
+ Each USB device directory will contain a file named
+ power/autosuspend. This file holds the time (in seconds)
+ the device must be idle before it will be autosuspended.
+ 0 means the device will be autosuspended as soon as
+ possible. Negative values will prevent the device from
+ being autosuspended at all, and writing a negative value
+ will resume the device if it is already suspended.
+
+ The autosuspend delay for newly-created devices is set to
+ the value of the usbcore.autosuspend module parameter.
+
+What: /sys/bus/usb/devices/.../power/level
+Date: March 2007
+KernelVersion: 2.6.21
+Contact: Alan Stern <stern@rowland.harvard.edu>
+Description:
+ Each USB device directory will contain a file named
+ power/level. This file holds a power-level setting for
+ the device, one of "on", "auto", or "suspend".
+
+ "on" means that the device is not allowed to autosuspend,
+ although normal suspends for system sleep will still
+ be honored. "auto" means the device will autosuspend
+ and autoresume in the usual manner, according to the
+ capabilities of its driver. "suspend" means the device
+ is forced into a suspended state and it will not autoresume
+ in response to I/O requests. However remote-wakeup requests
+ from the device may still be enabled (the remote-wakeup
+ setting is controlled separately by the power/wakeup
+ attribute).
+
+ During normal use, devices should be left in the "auto"
+ level. The other levels are meant for administrative uses.
+ If you want to suspend a device immediately but leave it
+ free to wake up in response to I/O requests, you should
+ write "0" to power/autosuspend.
+
+What: /sys/bus/usb/devices/.../power/persist
+Date: May 2007
+KernelVersion: 2.6.23
+Contact: Alan Stern <stern@rowland.harvard.edu>
+Description:
+ If CONFIG_USB_PERSIST is set, then each USB device directory
+ will contain a file named power/persist. The file holds a
+ boolean value (0 or 1) indicating whether or not the
+ "USB-Persist" facility is enabled for the device. Since the
+ facility is inherently dangerous, it is disabled by default
+ for all devices except hubs. For more information, see
+ Documentation/usb/persist.txt.
+
+What: /sys/bus/usb/device/.../power/connected_duration
+Date: January 2008
+KernelVersion: 2.6.25
+Contact: Sarah Sharp <sarah.a.sharp@intel.com>
+Description:
+ If CONFIG_PM and CONFIG_USB_SUSPEND are enabled, then this file
+ is present. When read, it returns the total time (in msec)
+ that the USB device has been connected to the machine. This
+ file is read-only.
+Users:
+ PowerTOP <power@bughost.org>
+ http://www.lesswatts.org/projects/powertop/
+
+What: /sys/bus/usb/device/.../power/active_duration
+Date: January 2008
+KernelVersion: 2.6.25
+Contact: Sarah Sharp <sarah.a.sharp@intel.com>
+Description:
+ If CONFIG_PM and CONFIG_USB_SUSPEND are enabled, then this file
+ is present. When read, it returns the total time (in msec)
+ that the USB device has been active, i.e. not in a suspended
+ state. This file is read-only.
+
+ Tools can use this file and the connected_duration file to
+ compute the percentage of time that a device has been active.
+ For example,
+ echo $((100 * `cat active_duration` / `cat connected_duration`))
+ will give an integer percentage. Note that this does not
+ account for counter wrap.
+Users:
+ PowerTOP <power@bughost.org>
+ http://www.lesswatts.org/projects/powertop/
+
+What: /sys/bus/usb/device/<busnum>-<devnum>...:<config num>-<interface num>/supports_autosuspend
+Date: January 2008
+KernelVersion: 2.6.27
+Contact: Sarah Sharp <sarah.a.sharp@intel.com>
+Description:
+ When read, this file returns 1 if the interface driver
+ for this interface supports autosuspend. It also
+ returns 1 if no driver has claimed this interface, as an
+ unclaimed interface will not stop the device from being
+ autosuspended if all other interface drivers are idle.
+ The file returns 0 if autosuspend support has not been
+ added to the driver.
+Users:
+ USB PM tool
+ git://git.moblin.org/users/sarah/usb-pm-tool/
+
+What: /sys/bus/usb/device/.../authorized
+Date: July 2008
+KernelVersion: 2.6.26
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ Authorized devices are available for use by device
+ drivers, non-authorized one are not. By default, wired
+ USB devices are authorized.
+
+ Certified Wireless USB devices are not authorized
+ initially and should be (by writing 1) after the
+ device has been authenticated.
+
+What: /sys/bus/usb/device/.../wusb_cdid
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ For Certified Wireless USB devices only.
+
+ A devices's CDID, as 16 space-separated hex octets.
+
+What: /sys/bus/usb/device/.../wusb_ck
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ For Certified Wireless USB devices only.
+
+ Write the device's connection key (CK) to start the
+ authentication of the device. The CK is 16
+ space-separated hex octets.
+
+What: /sys/bus/usb/device/.../wusb_disconnect
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ For Certified Wireless USB devices only.
+
+ Write a 1 to force the device to disconnect
+ (equivalent to unplugging a wired USB device).
diff --git a/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg b/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg
new file mode 100644
index 0000000..cb830df
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg
@@ -0,0 +1,43 @@
+Where: /sys/bus/usb/.../powered
+Date: August 2008
+Kernel Version: 2.6.26
+Contact: Harrison Metzger <harrisonmetz@gmail.com>
+Description: Controls whether the device's display will powered.
+ A value of 0 is off and a non-zero value is on.
+
+Where: /sys/bus/usb/.../mode_msb
+Where: /sys/bus/usb/.../mode_lsb
+Date: August 2008
+Kernel Version: 2.6.26
+Contact: Harrison Metzger <harrisonmetz@gmail.com>
+Description: Controls the devices display mode.
+ For a 6 character display the values are
+ MSB 0x06; LSB 0x3F, and
+ for an 8 character display the values are
+ MSB 0x08; LSB 0xFF.
+
+Where: /sys/bus/usb/.../textmode
+Date: August 2008
+Kernel Version: 2.6.26
+Contact: Harrison Metzger <harrisonmetz@gmail.com>
+Description: Controls the way the device interprets its text buffer.
+ raw: each character controls its segment manually
+ hex: each character is between 0-15
+ ascii: each character is between '0'-'9' and 'A'-'F'.
+
+Where: /sys/bus/usb/.../text
+Date: August 2008
+Kernel Version: 2.6.26
+Contact: Harrison Metzger <harrisonmetz@gmail.com>
+Description: The text (or data) for the device to display
+
+Where: /sys/bus/usb/.../decimals
+Date: August 2008
+Kernel Version: 2.6.26
+Contact: Harrison Metzger <harrisonmetz@gmail.com>
+Description: Controls the decimal places on the device.
+ To set the nth decimal place, give this field
+ the value of 10 ** n. Assume this field has
+ the value k and has 1 or more decimal places set,
+ to set the mth place (where m is not already set),
+ change this fields value to k + 10 ** m. \ No newline at end of file
diff --git a/Documentation/ABI/testing/sysfs-c2port b/Documentation/ABI/testing/sysfs-c2port
new file mode 100644
index 0000000..716cffc
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-c2port
@@ -0,0 +1,88 @@
+What: /sys/class/c2port/
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/ directory will contain files and
+ directories that will provide a unified interface to
+ the C2 port interface.
+
+What: /sys/class/c2port/c2portX
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/ directory is related to X-th
+ C2 port into the system. Each directory will contain files to
+ manage and control its C2 port.
+
+What: /sys/class/c2port/c2portX/access
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/access file enable the access
+ to the C2 port from the system. No commands can be sent
+ till this entry is set to 0.
+
+What: /sys/class/c2port/c2portX/dev_id
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/dev_id file show the device ID
+ of the connected micro.
+
+What: /sys/class/c2port/c2portX/flash_access
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/flash_access file enable the
+ access to the on-board flash of the connected micro.
+ No commands can be sent till this entry is set to 0.
+
+What: /sys/class/c2port/c2portX/flash_block_size
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/flash_block_size file show
+ the on-board flash block size of the connected micro.
+
+What: /sys/class/c2port/c2portX/flash_blocks_num
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/flash_blocks_num file show
+ the on-board flash blocks number of the connected micro.
+
+What: /sys/class/c2port/c2portX/flash_data
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/flash_data file export
+ the content of the on-board flash of the connected micro.
+
+What: /sys/class/c2port/c2portX/flash_erase
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/flash_erase file execute
+ the "erase" command on the on-board flash of the connected
+ micro.
+
+What: /sys/class/c2port/c2portX/flash_erase
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/flash_erase file show the
+ on-board flash size of the connected micro.
+
+What: /sys/class/c2port/c2portX/reset
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/reset file execute a "reset"
+ command on the connected micro.
+
+What: /sys/class/c2port/c2portX/rev_id
+Date: October 2008
+Contact: Rodolfo Giometti <giometti@linux.it>
+Description:
+ The /sys/class/c2port/c2portX/rev_id file show the revision ID
+ of the connected micro.
diff --git a/Documentation/ABI/testing/sysfs-class b/Documentation/ABI/testing/sysfs-class
new file mode 100644
index 0000000..4b0cb89
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class
@@ -0,0 +1,16 @@
+What: /sys/class/
+Date: Febuary 2006
+Contact: Greg Kroah-Hartman <gregkh@suse.de>
+Description:
+ The /sys/class directory will consist of a group of
+ subdirectories describing individual classes of devices
+ in the kernel. The individual directories will consist
+ of either subdirectories, or symlinks to other
+ directories.
+
+ All programs that use this directory tree must be able
+ to handle both subdirectories or symlinks in order to
+ work properly.
+
+Users:
+ udev <linux-hotplug-devel@lists.sourceforge.net>
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi
new file mode 100644
index 0000000..5f50097
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-bdi
@@ -0,0 +1,50 @@
+What: /sys/class/bdi/<bdi>/
+Date: January 2008
+Contact: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Description:
+
+Provide a place in sysfs for the backing_dev_info object. This allows
+setting and retrieving various BDI specific variables.
+
+The <bdi> identifier can be either of the following:
+
+MAJOR:MINOR
+
+ Device number for block devices, or value of st_dev on
+ non-block filesystems which provide their own BDI, such as NFS
+ and FUSE.
+
+MAJOR:MINOR-fuseblk
+
+ Value of st_dev on fuseblk filesystems.
+
+default
+
+ The default backing dev, used for non-block device backed
+ filesystems which do not provide their own BDI.
+
+Files under /sys/class/bdi/<bdi>/
+---------------------------------
+
+read_ahead_kb (read-write)
+
+ Size of the read-ahead window in kilobytes
+
+min_ratio (read-write)
+
+ Under normal circumstances each device is given a part of the
+ total write-back cache that relates to its current average
+ writeout speed in relation to the other devices.
+
+ The 'min_ratio' parameter allows assigning a minimum
+ percentage of the write-back cache to a particular device.
+ For example, this is useful for providing a minimum QoS.
+
+max_ratio (read-write)
+
+ Allows limiting a particular device to use not more than the
+ given percentage of the write-back cache. This is useful in
+ situations where we want to avoid one device taking all or
+ most of the write-back cache. For example in case of an NFS
+ mount that is prone to get stuck, or a FUSE mount which cannot
+ be trusted to play fair.
diff --git a/Documentation/ABI/testing/sysfs-class-pktcdvd b/Documentation/ABI/testing/sysfs-class-pktcdvd
new file mode 100644
index 0000000..b1c3f02
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-pktcdvd
@@ -0,0 +1,72 @@
+What: /sys/class/pktcdvd/
+Date: Oct. 2006
+KernelVersion: 2.6.20
+Contact: Thomas Maier <balagi@justmail.de>
+Description:
+
+sysfs interface
+---------------
+
+The pktcdvd module (packet writing driver) creates
+these files in the sysfs:
+(<devid> is in format major:minor )
+
+/sys/class/pktcdvd/
+ add (0200) Write a block device id (major:minor)
+ to create a new pktcdvd device and map
+ it to the block device.
+
+ remove (0200) Write the pktcdvd device id (major:minor)
+ to it to remove the pktcdvd device.
+
+ device_map (0444) Shows the device mapping in format:
+ pktcdvd[0-7] <pktdevid> <blkdevid>
+
+/sys/class/pktcdvd/pktcdvd[0-7]/
+ dev (0444) Device id
+ uevent (0200) To send an uevent.
+
+/sys/class/pktcdvd/pktcdvd[0-7]/stat/
+ packets_started (0444) Number of started packets.
+ packets_finished (0444) Number of finished packets.
+
+ kb_written (0444) kBytes written.
+ kb_read (0444) kBytes read.
+ kb_read_gather (0444) kBytes read to fill write packets.
+
+ reset (0200) Write any value to it to reset
+ pktcdvd device statistic values, like
+ bytes read/written.
+
+/sys/class/pktcdvd/pktcdvd[0-7]/write_queue/
+ size (0444) Contains the size of the bio write
+ queue.
+
+ congestion_off (0644) If bio write queue size is below
+ this mark, accept new bio requests
+ from the block layer.
+
+ congestion_on (0644) If bio write queue size is higher
+ as this mark, do no longer accept
+ bio write requests from the block
+ layer and wait till the pktcdvd
+ device has processed enough bio's
+ so that bio write queue size is
+ below congestion off mark.
+ A value of <= 0 disables congestion
+ control.
+
+
+Example:
+--------
+To use the pktcdvd sysfs interface directly, you can do:
+
+# create a new pktcdvd device mapped to /dev/hdc
+echo "22:0" >/sys/class/pktcdvd/add
+cat /sys/class/pktcdvd/device_map
+# assuming device pktcdvd0 was created, look at stat's
+cat /sys/class/pktcdvd/pktcdvd0/stat/kb_written
+# print the device id of the mapped block device
+fgrep pktcdvd0 /sys/class/pktcdvd/device_map
+# remove device, using pktcdvd0 device id 253:0
+echo "253:0" >/sys/class/pktcdvd/remove
diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator
new file mode 100644
index 0000000..3731f6f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-regulator
@@ -0,0 +1,328 @@
+What: /sys/class/regulator/.../state
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ state. This holds the regulator output state.
+
+ This will be one of the following strings:
+
+ 'enabled'
+ 'disabled'
+ 'unknown'
+
+ 'enabled' means the regulator output is ON and is supplying
+ power to the system.
+
+ 'disabled' means the regulator output is OFF and is not
+ supplying power to the system..
+
+ 'unknown' means software cannot determine the state.
+
+ NOTE: this field can be used in conjunction with microvolts
+ and microamps to determine regulator output levels.
+
+
+What: /sys/class/regulator/.../type
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ type. This holds the regulator type.
+
+ This will be one of the following strings:
+
+ 'voltage'
+ 'current'
+ 'unknown'
+
+ 'voltage' means the regulator output voltage can be controlled
+ by software.
+
+ 'current' means the regulator output current limit can be
+ controlled by software.
+
+ 'unknown' means software cannot control either voltage or
+ current limit.
+
+
+What: /sys/class/regulator/.../microvolts
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ microvolts. This holds the regulator output voltage setting
+ measured in microvolts (i.e. E-6 Volts).
+
+ NOTE: This value should not be used to determine the regulator
+ output voltage level as this value is the same regardless of
+ whether the regulator is enabled or disabled.
+
+
+What: /sys/class/regulator/.../microamps
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ microamps. This holds the regulator output current limit
+ setting measured in microamps (i.e. E-6 Amps).
+
+ NOTE: This value should not be used to determine the regulator
+ output current level as this value is the same regardless of
+ whether the regulator is enabled or disabled.
+
+
+What: /sys/class/regulator/.../opmode
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ opmode. This holds the regulator operating mode setting.
+
+ The opmode value can be one of the following strings:
+
+ 'fast'
+ 'normal'
+ 'idle'
+ 'standby'
+ 'unknown'
+
+ The modes are described in include/linux/regulator/regulator.h
+
+ NOTE: This value should not be used to determine the regulator
+ output operating mode as this value is the same regardless of
+ whether the regulator is enabled or disabled.
+
+
+What: /sys/class/regulator/.../min_microvolts
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ min_microvolts. This holds the minimum safe working regulator
+ output voltage setting for this domain measured in microvolts.
+
+ NOTE: this will return the string 'constraint not defined' if
+ the power domain has no min microvolts constraint defined by
+ platform code.
+
+
+What: /sys/class/regulator/.../max_microvolts
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ max_microvolts. This holds the maximum safe working regulator
+ output voltage setting for this domain measured in microvolts.
+
+ NOTE: this will return the string 'constraint not defined' if
+ the power domain has no max microvolts constraint defined by
+ platform code.
+
+
+What: /sys/class/regulator/.../min_microamps
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ min_microamps. This holds the minimum safe working regulator
+ output current limit setting for this domain measured in
+ microamps.
+
+ NOTE: this will return the string 'constraint not defined' if
+ the power domain has no min microamps constraint defined by
+ platform code.
+
+
+What: /sys/class/regulator/.../max_microamps
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ max_microamps. This holds the maximum safe working regulator
+ output current limit setting for this domain measured in
+ microamps.
+
+ NOTE: this will return the string 'constraint not defined' if
+ the power domain has no max microamps constraint defined by
+ platform code.
+
+
+What: /sys/class/regulator/.../name
+Date: October 2008
+KernelVersion: 2.6.28
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ name. This holds a string identifying the regulator for
+ display purposes.
+
+ NOTE: this will be empty if no suitable name is provided
+ by platform or regulator drivers.
+
+
+What: /sys/class/regulator/.../num_users
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ num_users. This holds the number of consumer devices that
+ have called regulator_enable() on this regulator.
+
+
+What: /sys/class/regulator/.../requested_microamps
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ requested_microamps. This holds the total requested load
+ current in microamps for this regulator from all its consumer
+ devices.
+
+
+What: /sys/class/regulator/.../parent
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Some regulator directories will contain a link called parent.
+ This points to the parent or supply regulator if one exists.
+
+What: /sys/class/regulator/.../suspend_mem_microvolts
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_mem_microvolts. This holds the regulator output
+ voltage setting for this domain measured in microvolts when
+ the system is suspended to memory.
+
+ NOTE: this will return the string 'not defined' if
+ the power domain has no suspend to memory voltage defined by
+ platform code.
+
+What: /sys/class/regulator/.../suspend_disk_microvolts
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_disk_microvolts. This holds the regulator output
+ voltage setting for this domain measured in microvolts when
+ the system is suspended to disk.
+
+ NOTE: this will return the string 'not defined' if
+ the power domain has no suspend to disk voltage defined by
+ platform code.
+
+What: /sys/class/regulator/.../suspend_standby_microvolts
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_standby_microvolts. This holds the regulator output
+ voltage setting for this domain measured in microvolts when
+ the system is suspended to standby.
+
+ NOTE: this will return the string 'not defined' if
+ the power domain has no suspend to standby voltage defined by
+ platform code.
+
+What: /sys/class/regulator/.../suspend_mem_mode
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_mem_mode. This holds the regulator operating mode
+ setting for this domain when the system is suspended to
+ memory.
+
+ NOTE: this will return the string 'not defined' if
+ the power domain has no suspend to memory mode defined by
+ platform code.
+
+What: /sys/class/regulator/.../suspend_disk_mode
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_disk_mode. This holds the regulator operating mode
+ setting for this domain when the system is suspended to disk.
+
+ NOTE: this will return the string 'not defined' if
+ the power domain has no suspend to disk mode defined by
+ platform code.
+
+What: /sys/class/regulator/.../suspend_standby_mode
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_standby_mode. This holds the regulator operating mode
+ setting for this domain when the system is suspended to
+ standby.
+
+ NOTE: this will return the string 'not defined' if
+ the power domain has no suspend to standby mode defined by
+ platform code.
+
+What: /sys/class/regulator/.../suspend_mem_state
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_mem_state. This holds the regulator operating state
+ when suspended to memory.
+
+ This will be one of the following strings:
+
+ 'enabled'
+ 'disabled'
+ 'not defined'
+
+What: /sys/class/regulator/.../suspend_disk_state
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_disk_state. This holds the regulator operating state
+ when suspended to disk.
+
+ This will be one of the following strings:
+
+ 'enabled'
+ 'disabled'
+ 'not defined'
+
+What: /sys/class/regulator/.../suspend_standby_state
+Date: May 2008
+KernelVersion: 2.6.26
+Contact: Liam Girdwood <lrg@slimlogic.co.uk>
+Description:
+ Each regulator directory will contain a field called
+ suspend_standby_state. This holds the regulator operating
+ state when suspended to standby.
+
+ This will be one of the following strings:
+
+ 'enabled'
+ 'disabled'
+ 'not defined'
diff --git a/Documentation/ABI/testing/sysfs-class-usb_host b/Documentation/ABI/testing/sysfs-class-usb_host
new file mode 100644
index 0000000..46b66ad
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-usb_host
@@ -0,0 +1,25 @@
+What: /sys/class/usb_host/usb_hostN/wusb_chid
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ Write the CHID (16 space-separated hex octets) for this host controller.
+ This starts the host controller, allowing it to accept connection from
+ WUSB devices.
+
+ Set an all zero CHID to stop the host controller.
+
+What: /sys/class/usb_host/usb_hostN/wusb_trust_timeout
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ Devices that haven't sent a WUSB packet to the host
+ within 'wusb_trust_timeout' ms are considered to have
+ disconnected and are removed. The default value of
+ 4000 ms is the value required by the WUSB
+ specification.
+
+ Since this relates to security (specifically, the
+ lifetime of PTKs and GTKs) it should not be changed
+ from the default.
diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc b/Documentation/ABI/testing/sysfs-class-uwb_rc
new file mode 100644
index 0000000..a0d18db
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-uwb_rc
@@ -0,0 +1,144 @@
+What: /sys/class/uwb_rc
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ Interfaces for WiMedia Ultra Wideband Common Radio
+ Platform (UWB) radio controllers.
+
+ Familiarity with the ECMA-368 'High Rate Ultra
+ Wideband MAC and PHY Specification' is assumed.
+
+What: /sys/class/uwb_rc/beacon_timeout_ms
+Date: July 2008
+KernelVersion: 2.6.27
+Description:
+ If no beacons are received from a device for at least
+ this time, the device will be considered to have gone
+ and it will be removed. The default is 3 superframes
+ (~197 ms) as required by the specification.
+
+What: /sys/class/uwb_rc/uwbN/
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ An individual UWB radio controller.
+
+What: /sys/class/uwb_rc/uwbN/beacon
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ Write:
+
+ <channel> [<bpst offset>]
+
+ to start beaconing on a specific channel, or stop
+ beaconing if <channel> is -1. Valid channels depends
+ on the radio controller's supported band groups.
+
+ <bpst offset> may be used to try and join a specific
+ beacon group if more than one was found during a scan.
+
+What: /sys/class/uwb_rc/uwbN/scan
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ Write:
+
+ <channel> <type> [<bpst offset>]
+
+ to start (or stop) scanning on a channel. <type> is one of:
+ 0 - scan
+ 1 - scan outside BP
+ 2 - scan while inactive
+ 3 - scanning disabled
+ 4 - scan (with start time of <bpst offset>)
+
+What: /sys/class/uwb_rc/uwbN/mac_address
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ The EUI-48, in colon-separated hex octets, for this
+ radio controller. A write will change the radio
+ controller's EUI-48 but only do so while the device is
+ not beaconing or scanning.
+
+What: /sys/class/uwb_rc/uwbN/wusbhc
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ A symlink to the device (if any) of the WUSB Host
+ Controller PAL using this radio controller.
+
+What: /sys/class/uwb_rc/uwbN/<EUI-48>/
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ A neighbour UWB device that has either been detected
+ as part of a scan or is a member of the radio
+ controllers beacon group.
+
+What: /sys/class/uwb_rc/uwbN/<EUI-48>/BPST
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ The time (using the radio controllers internal 1 ms
+ interval superframe timer) of the last beacon from
+ this device was received.
+
+What: /sys/class/uwb_rc/uwbN/<EUI-48>/DevAddr
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ The current DevAddr of this device in colon separated
+ hex octets.
+
+What: /sys/class/uwb_rc/uwbN/<EUI-48>/EUI_48
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+
+ The EUI-48 of this device in colon separated hex
+ octets.
+
+What: /sys/class/uwb_rc/uwbN/<EUI-48>/BPST
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+
+What: /sys/class/uwb_rc/uwbN/<EUI-48>/IEs
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ The latest IEs included in this device's beacon, in
+ space separated hex octets with one IE per line.
+
+What: /sys/class/uwb_rc/uwbN/<EUI-48>/LQE
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ Link Quality Estimate - the Signal to Noise Ratio
+ (SNR) of all packets received from this device in dB.
+ This gives an estimate on a suitable PHY rate. Refer
+ to [ECMA-368] section 13.3 for more details.
+
+What: /sys/class/uwb_rc/uwbN/<EUI-48>/RSSI
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: linux-usb@vger.kernel.org
+Description:
+ Received Signal Strength Indication - the strength of
+ the received signal in dB. LQE is a more useful
+ measure of the radio link quality.
diff --git a/Documentation/ABI/testing/sysfs-dev b/Documentation/ABI/testing/sysfs-dev
new file mode 100644
index 0000000..a9f2b8b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-dev
@@ -0,0 +1,20 @@
+What: /sys/dev
+Date: April 2008
+KernelVersion: 2.6.26
+Contact: Dan Williams <dan.j.williams@intel.com>
+Description: The /sys/dev tree provides a method to look up the sysfs
+ path for a device using the information returned from
+ stat(2). There are two directories, 'block' and 'char',
+ beneath /sys/dev containing symbolic links with names of
+ the form "<major>:<minor>". These links point to the
+ corresponding sysfs path for the given device.
+
+ Example:
+ $ readlink /sys/dev/block/8:32
+ ../../block/sdc
+
+ Entries in /sys/dev/char and /sys/dev/block will be
+ dynamically created and destroyed as devices enter and
+ leave the system.
+
+Users: mdadm <linux-raid@vger.kernel.org>
diff --git a/Documentation/ABI/testing/sysfs-devices b/Documentation/ABI/testing/sysfs-devices
new file mode 100644
index 0000000..6a25671
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices
@@ -0,0 +1,25 @@
+What: /sys/devices
+Date: February 2006
+Contact: Greg Kroah-Hartman <gregkh@suse.de>
+Description:
+ The /sys/devices tree contains a snapshot of the
+ internal state of the kernel device tree. Devices will
+ be added and removed dynamically as the machine runs,
+ and between different kernel versions, the layout of the
+ devices within this tree will change.
+
+ Please do not rely on the format of this tree because of
+ this. If a program wishes to find different things in
+ the tree, please use the /sys/class structure and rely
+ on the symlinks there to point to the proper location
+ within the /sys/devices tree of the individual devices.
+ Or rely on the uevent messages to notify programs of
+ devices being added and removed from this tree to find
+ the location of those devices.
+
+ Note that sometimes not all devices along the directory
+ chain will have emitted uevent messages, so userspace
+ programs must be able to handle such occurrences.
+
+Users:
+ udev <linux-hotplug-devel@lists.sourceforge.net>
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
new file mode 100644
index 0000000..7a16fe1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -0,0 +1,24 @@
+What: /sys/devices/system/memory
+Date: June 2008
+Contact: Badari Pulavarty <pbadari@us.ibm.com>
+Description:
+ The /sys/devices/system/memory contains a snapshot of the
+ internal state of the kernel memory blocks. Files could be
+ added or removed dynamically to represent hot-add/remove
+ operations.
+
+Users: hotplug memory add/remove tools
+ https://w3.opensource.ibm.com/projects/powerpc-utils/
+
+What: /sys/devices/system/memory/memoryX/removable
+Date: June 2008
+Contact: Badari Pulavarty <pbadari@us.ibm.com>
+Description:
+ The file /sys/devices/system/memory/memoryX/removable
+ indicates whether this memory block is removable or not.
+ This is useful for a user-level agent to determine
+ identify removable sections of the memory before attempting
+ potentially expensive hot-remove memory operation
+
+Users: hotplug memory remove tools
+ https://w3.opensource.ibm.com/projects/powerpc-utils/
diff --git a/Documentation/ABI/testing/sysfs-firmware-acpi b/Documentation/ABI/testing/sysfs-firmware-acpi
new file mode 100644
index 0000000..e8ffc70
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-acpi
@@ -0,0 +1,146 @@
+What: /sys/firmware/acpi/interrupts/
+Date: February 2008
+Contact: Len Brown <lenb@kernel.org>
+Description:
+ All ACPI interrupts are handled via a single IRQ,
+ the System Control Interrupt (SCI), which appears
+ as "acpi" in /proc/interrupts.
+
+ However, one of the main functions of ACPI is to make
+ the platform understand random hardware without
+ special driver support. So while the SCI handles a few
+ well known (fixed feature) interrupts sources, such
+ as the power button, it can also handle a variable
+ number of a "General Purpose Events" (GPE).
+
+ A GPE vectors to a specified handler in AML, which
+ can do a anything the BIOS writer wants from
+ OS context. GPE 0x12, for example, would vector
+ to a level or edge handler called _L12 or _E12.
+ The handler may do its business and return.
+ Or the handler may send send a Notify event
+ to a Linux device driver registered on an ACPI device,
+ such as a battery, or a processor.
+
+ To figure out where all the SCI's are coming from,
+ /sys/firmware/acpi/interrupts contains a file listing
+ every possible source, and the count of how many
+ times it has triggered.
+
+ $ cd /sys/firmware/acpi/interrupts
+ $ grep . *
+ error: 0
+ ff_gbl_lock: 0 enable
+ ff_pmtimer: 0 invalid
+ ff_pwr_btn: 0 enable
+ ff_rt_clk: 2 disable
+ ff_slp_btn: 0 invalid
+ gpe00: 0 invalid
+ gpe01: 0 enable
+ gpe02: 108 enable
+ gpe03: 0 invalid
+ gpe04: 0 invalid
+ gpe05: 0 invalid
+ gpe06: 0 enable
+ gpe07: 0 enable
+ gpe08: 0 invalid
+ gpe09: 0 invalid
+ gpe0A: 0 invalid
+ gpe0B: 0 invalid
+ gpe0C: 0 invalid
+ gpe0D: 0 invalid
+ gpe0E: 0 invalid
+ gpe0F: 0 invalid
+ gpe10: 0 invalid
+ gpe11: 0 invalid
+ gpe12: 0 invalid
+ gpe13: 0 invalid
+ gpe14: 0 invalid
+ gpe15: 0 invalid
+ gpe16: 0 invalid
+ gpe17: 1084 enable
+ gpe18: 0 enable
+ gpe19: 0 invalid
+ gpe1A: 0 invalid
+ gpe1B: 0 invalid
+ gpe1C: 0 invalid
+ gpe1D: 0 invalid
+ gpe1E: 0 invalid
+ gpe1F: 0 invalid
+ gpe_all: 1192
+ sci: 1194
+
+ sci - The total number of times the ACPI SCI
+ has claimed an interrupt.
+
+ gpe_all - count of SCI caused by GPEs.
+
+ gpeXX - count for individual GPE source
+
+ ff_gbl_lock - Global Lock
+
+ ff_pmtimer - PM Timer
+
+ ff_pwr_btn - Power Button
+
+ ff_rt_clk - Real Time Clock
+
+ ff_slp_btn - Sleep Button
+
+ error - an interrupt that can't be accounted for above.
+
+ invalid: it's either a GPE or a Fixed Event that
+ doesn't have an event handler.
+
+ disable: the GPE/Fixed Event is valid but disabled.
+
+ enable: the GPE/Fixed Event is valid and enabled.
+
+ Root has permission to clear any of these counters. Eg.
+ # echo 0 > gpe11
+
+ All counters can be cleared by clearing the total "sci":
+ # echo 0 > sci
+
+ None of these counters has an effect on the function
+ of the system, they are simply statistics.
+
+ Besides this, user can also write specific strings to these files
+ to enable/disable/clear ACPI interrupts in user space, which can be
+ used to debug some ACPI interrupt storm issues.
+
+ Note that only writting to VALID GPE/Fixed Event is allowed,
+ i.e. user can only change the status of runtime GPE and
+ Fixed Event with event handler installed.
+
+ Let's take power button fixed event for example, please kill acpid
+ and other user space applications so that the machine won't shutdown
+ when pressing the power button.
+ # cat ff_pwr_btn
+ 0 enabled
+ # press the power button for 3 times;
+ # cat ff_pwr_btn
+ 3 enabled
+ # echo disable > ff_pwr_btn
+ # cat ff_pwr_btn
+ 3 disabled
+ # press the power button for 3 times;
+ # cat ff_pwr_btn
+ 3 disabled
+ # echo enable > ff_pwr_btn
+ # cat ff_pwr_btn
+ 4 enabled
+ /*
+ * this is because the status bit is set even if the enable bit is cleared,
+ * and it triggers an ACPI fixed event when the enable bit is set again
+ */
+ # press the power button for 3 times;
+ # cat ff_pwr_btn
+ 7 enabled
+ # echo disable > ff_pwr_btn
+ # press the power button for 3 times;
+ # echo clear > ff_pwr_btn /* clear the status bit */
+ # echo disable > ff_pwr_btn
+ # cat ff_pwr_btn
+ 7 enabled
+
diff --git a/Documentation/ABI/testing/sysfs-firmware-memmap b/Documentation/ABI/testing/sysfs-firmware-memmap
new file mode 100644
index 0000000..0d99ee6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-memmap
@@ -0,0 +1,71 @@
+What: /sys/firmware/memmap/
+Date: June 2008
+Contact: Bernhard Walle <bwalle@suse.de>
+Description:
+ On all platforms, the firmware provides a memory map which the
+ kernel reads. The resources from that memory map are registered
+ in the kernel resource tree and exposed to userspace via
+ /proc/iomem (together with other resources).
+
+ However, on most architectures that firmware-provided memory
+ map is modified afterwards by the kernel itself, either because
+ the kernel merges that memory map with other information or
+ just because the user overwrites that memory map via command
+ line.
+
+ kexec needs the raw firmware-provided memory map to setup the
+ parameter segment of the kernel that should be booted with
+ kexec. Also, the raw memory map is useful for debugging. For
+ that reason, /sys/firmware/memmap is an interface that provides
+ the raw memory map to userspace.
+
+ The structure is as follows: Under /sys/firmware/memmap there
+ are subdirectories with the number of the entry as their name:
+
+ /sys/firmware/memmap/0
+ /sys/firmware/memmap/1
+ /sys/firmware/memmap/2
+ /sys/firmware/memmap/3
+ ...
+
+ The maximum depends on the number of memory map entries provided
+ by the firmware. The order is just the order that the firmware
+ provides.
+
+ Each directory contains three files:
+
+ start : The start address (as hexadecimal number with the
+ '0x' prefix).
+ end : The end address, inclusive (regardless whether the
+ firmware provides inclusive or exclusive ranges).
+ type : Type of the entry as string. See below for a list of
+ valid types.
+
+ So, for example:
+
+ /sys/firmware/memmap/0/start
+ /sys/firmware/memmap/0/end
+ /sys/firmware/memmap/0/type
+ /sys/firmware/memmap/1/start
+ ...
+
+ Currently following types exist:
+
+ - System RAM
+ - ACPI Tables
+ - ACPI Non-volatile Storage
+ - reserved
+
+ Following shell snippet can be used to display that memory
+ map in a human-readable format:
+
+ -------------------- 8< ----------------------------------------
+ #!/bin/bash
+ cd /sys/firmware/memmap
+ for dir in * ; do
+ start=$(cat $dir/start)
+ end=$(cat $dir/end)
+ type=$(cat $dir/type)
+ printf "%016x-%016x (%s)\n" $start $[ $end +1] "$type"
+ done
+ -------------------- >8 ----------------------------------------
diff --git a/Documentation/ABI/testing/sysfs-firmware-sgi_uv b/Documentation/ABI/testing/sysfs-firmware-sgi_uv
new file mode 100644
index 0000000..4573fd4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-sgi_uv
@@ -0,0 +1,27 @@
+What: /sys/firmware/sgi_uv/
+Date: August 2008
+Contact: Russ Anderson <rja@sgi.com>
+Description:
+ The /sys/firmware/sgi_uv directory contains information
+ about the SGI UV platform.
+
+ Under that directory are a number of files:
+
+ partition_id
+ coherence_id
+
+ The partition_id entry contains the partition id.
+ SGI UV systems can be partitioned into multiple physical
+ machines, which each partition running a unique copy
+ of the operating system. Each partition will have a unique
+ partition id. To display the partition id, use the command:
+
+ cat /sys/firmware/sgi_uv/partition_id
+
+ The coherence_id entry contains the coherence id.
+ A partitioned SGI UV system can have one or more coherence
+ domain. The coherence id indicates which coherence domain
+ this partition is in. To display the coherence id, use the
+ command:
+
+ cat /sys/firmware/sgi_uv/coherence_id
diff --git a/Documentation/ABI/testing/sysfs-gpio b/Documentation/ABI/testing/sysfs-gpio
new file mode 100644
index 0000000..8aab809
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-gpio
@@ -0,0 +1,26 @@
+What: /sys/class/gpio/
+Date: July 2008
+KernelVersion: 2.6.27
+Contact: David Brownell <dbrownell@users.sourceforge.net>
+Description:
+
+ As a Kconfig option, individual GPIO signals may be accessed from
+ userspace. GPIOs are only made available to userspace by an explicit
+ "export" operation. If a given GPIO is not claimed for use by
+ kernel code, it may be exported by userspace (and unexported later).
+ Kernel code may export it for complete or partial access.
+
+ GPIOs are identified as they are inside the kernel, using integers in
+ the range 0..INT_MAX. See Documentation/gpio.txt for more information.
+
+ /sys/class/gpio
+ /export ... asks the kernel to export a GPIO to userspace
+ /unexport ... to return a GPIO to the kernel
+ /gpioN ... for each exported GPIO #N
+ /value ... always readable, writes fail for input GPIOs
+ /direction ... r/w as: in, out (default low); write: high, low
+ /gpiochipN ... for each gpiochip; #N is its first GPIO
+ /base ... (r/o) same as N
+ /label ... (r/o) descriptive, not necessarily unique
+ /ngpio ... (r/o) number of GPIOs; numbered N to N + (ngpio - 1)
+
diff --git a/Documentation/ABI/testing/sysfs-ibft b/Documentation/ABI/testing/sysfs-ibft
new file mode 100644
index 0000000..c2b7d11
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-ibft
@@ -0,0 +1,23 @@
+What: /sys/firmware/ibft/initiator
+Date: November 2007
+Contact: Konrad Rzeszutek <ketuzsezr@darnok.org>
+Description: The /sys/firmware/ibft/initiator directory will contain
+ files that expose the iSCSI Boot Firmware Table initiator data.
+ Usually this contains the Initiator name.
+
+What: /sys/firmware/ibft/targetX
+Date: November 2007
+Contact: Konrad Rzeszutek <ketuzsezr@darnok.org>
+Description: The /sys/firmware/ibft/targetX directory will contain
+ files that expose the iSCSI Boot Firmware Table target data.
+ Usually this contains the target's IP address, boot LUN,
+ target name, and what NIC it is associated with. It can also
+ contain the CHAP name (and password), the reverse CHAP
+ name (and password)
+
+What: /sys/firmware/ibft/ethernetX
+Date: November 2007
+Contact: Konrad Rzeszutek <ketuzsezr@darnok.org>
+Description: The /sys/firmware/ibft/ethernetX directory will contain
+ files that expose the iSCSI Boot Firmware Table NIC data.
+ This can this can the IP address, MAC, and gateway of the NIC.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm b/Documentation/ABI/testing/sysfs-kernel-mm
new file mode 100644
index 0000000..190d523
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm
@@ -0,0 +1,6 @@
+What: /sys/kernel/mm
+Date: July 2008
+Contact: Nishanth Aravamudan <nacc@us.ibm.com>, VM maintainers
+Description:
+ /sys/kernel/mm/ should contain any and all VM
+ related information in /sys/kernel/.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
new file mode 100644
index 0000000..e21c005
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
@@ -0,0 +1,15 @@
+What: /sys/kernel/mm/hugepages/
+Date: June 2008
+Contact: Nishanth Aravamudan <nacc@us.ibm.com>, hugetlb maintainers
+Description:
+ /sys/kernel/mm/hugepages/ contains a number of subdirectories
+ of the form hugepages-<size>kB, where <size> is the page size
+ of the hugepages supported by the kernel/CPU combination.
+
+ Under these directories are a number of files:
+ nr_hugepages
+ nr_overcommit_hugepages
+ free_hugepages
+ surplus_hugepages
+ resv_hugepages
+ See Documentation/vm/hugetlbpage.txt for details.
diff --git a/Documentation/ABI/testing/sysfs-kernel-uids b/Documentation/ABI/testing/sysfs-kernel-uids
new file mode 100644
index 0000000..28f1469
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-uids
@@ -0,0 +1,14 @@
+What: /sys/kernel/uids/<uid>/cpu_shares
+Date: December 2007
+Contact: Dhaval Giani <dhaval@linux.vnet.ibm.com>
+ Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+Description:
+ The /sys/kernel/uids/<uid>/cpu_shares tunable is used
+ to set the cpu bandwidth a user is allowed. This is a
+ propotional value. What that means is that if there
+ are two users logged in, each with an equal number of
+ shares, then they will get equal CPU bandwidth. Another
+ example would be, if User A has shares = 1024 and user
+ B has shares = 2048, User B will get twice the CPU
+ bandwidth user A will. For more details refer
+ Documentation/scheduler/sched-design-CFS.txt
diff --git a/Documentation/ABI/testing/sysfs-ocfs2 b/Documentation/ABI/testing/sysfs-ocfs2
new file mode 100644
index 0000000..b7cc516
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-ocfs2
@@ -0,0 +1,89 @@
+What: /sys/fs/ocfs2/
+Date: April 2008
+Contact: ocfs2-devel@oss.oracle.com
+Description:
+ The /sys/fs/ocfs2 directory contains knobs used by the
+ ocfs2-tools to interact with the filesystem.
+
+What: /sys/fs/ocfs2/max_locking_protocol
+Date: April 2008
+Contact: ocfs2-devel@oss.oracle.com
+Description:
+ The /sys/fs/ocfs2/max_locking_protocol file displays version
+ of ocfs2 locking supported by the filesystem. This version
+ covers how ocfs2 uses distributed locking between cluster
+ nodes.
+
+ The protocol version has a major and minor number. Two
+ cluster nodes can interoperate if they have an identical
+ major number and an overlapping minor number - thus,
+ a node with version 1.10 can interoperate with a node
+ sporting version 1.8, as long as both use the 1.8 protocol.
+
+ Reading from this file returns a single line, the major
+ number and minor number joined by a period, eg "1.10".
+
+ This file is read-only. The value is compiled into the
+ driver.
+
+What: /sys/fs/ocfs2/loaded_cluster_plugins
+Date: April 2008
+Contact: ocfs2-devel@oss.oracle.com
+Description:
+ The /sys/fs/ocfs2/loaded_cluster_plugins file describes
+ the available plugins to support ocfs2 cluster operation.
+ A cluster plugin is required to use ocfs2 in a cluster.
+ There are currently two available plugins:
+
+ * 'o2cb' - The classic o2cb cluster stack that ocfs2 has
+ used since its inception.
+ * 'user' - A plugin supporting userspace cluster software
+ in conjunction with fs/dlm.
+
+ Reading from this file returns the names of all loaded
+ plugins, one per line.
+
+ This file is read-only. Its contents may change as
+ plugins are loaded or removed.
+
+What: /sys/fs/ocfs2/active_cluster_plugin
+Date: April 2008
+Contact: ocfs2-devel@oss.oracle.com
+Description:
+ The /sys/fs/ocfs2/active_cluster_plugin displays which
+ cluster plugin is currently in use by the filesystem.
+ The active plugin will appear in the loaded_cluster_plugins
+ file as well. Only one plugin can be used at a time.
+
+ Reading from this file returns the name of the active plugin
+ on a single line.
+
+ This file is read-only. Which plugin is active depends on
+ the cluster stack in use. The contents may change
+ when all filesystems are unmounted and the cluster stack
+ is changed.
+
+What: /sys/fs/ocfs2/cluster_stack
+Date: April 2008
+Contact: ocfs2-devel@oss.oracle.com
+Description:
+ The /sys/fs/ocfs2/cluster_stack file contains the name
+ of current ocfs2 cluster stack. This value is set by
+ userspace tools when bringing the cluster stack online.
+
+ Cluster stack names are 4 characters in length.
+
+ When the 'o2cb' cluster stack is used, the 'o2cb' cluster
+ plugin is active. All other cluster stacks use the 'user'
+ cluster plugin.
+
+ Reading from this file returns the name of the current
+ cluster stack on a single line.
+
+ Writing a new stack name to this file changes the current
+ cluster stack unless there are mounted ocfs2 filesystems.
+ If there are mounted filesystems, attempts to change the
+ stack return an error.
+
+Users:
+ ocfs2-tools <ocfs2-tools-devel@oss.oracle.com>
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
new file mode 100644
index 0000000..dcff4d0
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-power
@@ -0,0 +1,103 @@
+What: /sys/power/
+Date: August 2006
+Contact: Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+ The /sys/power directory will contain files that will
+ provide a unified interface to the power management
+ subsystem.
+
+What: /sys/power/state
+Date: August 2006
+Contact: Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+ The /sys/power/state file controls the system power state.
+ Reading from this file returns what states are supported,
+ which is hard-coded to 'standby' (Power-On Suspend), 'mem'
+ (Suspend-to-RAM), and 'disk' (Suspend-to-Disk).
+
+ Writing to this file one of these strings causes the system to
+ transition into that state. Please see the file
+ Documentation/power/states.txt for a description of each of
+ these states.
+
+What: /sys/power/disk
+Date: September 2006
+Contact: Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+ The /sys/power/disk file controls the operating mode of the
+ suspend-to-disk mechanism. Reading from this file returns
+ the name of the method by which the system will be put to
+ sleep on the next suspend. There are four methods supported:
+ 'firmware' - means that the memory image will be saved to disk
+ by some firmware, in which case we also assume that the
+ firmware will handle the system suspend.
+ 'platform' - the memory image will be saved by the kernel and
+ the system will be put to sleep by the platform driver (e.g.
+ ACPI or other PM registers).
+ 'shutdown' - the memory image will be saved by the kernel and
+ the system will be powered off.
+ 'reboot' - the memory image will be saved by the kernel and
+ the system will be rebooted.
+
+ Additionally, /sys/power/disk can be used to turn on one of the
+ two testing modes of the suspend-to-disk mechanism: 'testproc'
+ or 'test'. If the suspend-to-disk mechanism is in the
+ 'testproc' mode, writing 'disk' to /sys/power/state will cause
+ the kernel to disable nonboot CPUs and freeze tasks, wait for 5
+ seconds, unfreeze tasks and enable nonboot CPUs. If it is in
+ the 'test' mode, writing 'disk' to /sys/power/state will cause
+ the kernel to disable nonboot CPUs and freeze tasks, shrink
+ memory, suspend devices, wait for 5 seconds, resume devices,
+ unfreeze tasks and enable nonboot CPUs. Then, we are able to
+ look in the log messages and work out, for example, which code
+ is being slow and which device drivers are misbehaving.
+
+ The suspend-to-disk method may be chosen by writing to this
+ file one of the accepted strings:
+
+ 'firmware'
+ 'platform'
+ 'shutdown'
+ 'reboot'
+ 'testproc'
+ 'test'
+
+ It will only change to 'firmware' or 'platform' if the system
+ supports that.
+
+What: /sys/power/image_size
+Date: August 2006
+Contact: Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+ The /sys/power/image_size file controls the size of the image
+ created by the suspend-to-disk mechanism. It can be written a
+ string representing a non-negative integer that will be used
+ as an upper limit of the image size, in bytes. The kernel's
+ suspend-to-disk code will do its best to ensure the image size
+ will not exceed this number. However, if it turns out to be
+ impossible, the kernel will try to suspend anyway using the
+ smallest image possible. In particular, if "0" is written to
+ this file, the suspend image will be as small as possible.
+
+ Reading from this file will display the current image size
+ limit, which is set to 500 MB by default.
+
+What: /sys/power/pm_trace
+Date: August 2006
+Contact: Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+ The /sys/power/pm_trace file controls the code which saves the
+ last PM event point in the RTC across reboots, so that you can
+ debug a machine that just hangs during suspend (or more
+ commonly, during resume). Namely, the RTC is only used to save
+ the last PM event point if this file contains '1'. Initially
+ it contains '0' which may be changed to '1' by writing a
+ string representing a nonzero integer into it.
+
+ To use this debugging feature you should attempt to suspend
+ the machine, then reboot it and run
+
+ dmesg -s 1000000 | grep 'hash matches'
+
+ CAUTION: Using it will cause your machine's real-time (CMOS)
+ clock to be set to a random invalid time after a resume.
diff --git a/Documentation/ABI/testing/sysfs-profiling b/Documentation/ABI/testing/sysfs-profiling
new file mode 100644
index 0000000..b02d8b8
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-profiling
@@ -0,0 +1,13 @@
+What: /sys/kernel/profile
+Date: September 2008
+Contact: Dave Hansen <dave@linux.vnet.ibm.com>
+Description:
+ /sys/kernel/profile is the runtime equivalent
+ of the boot-time profile= option.
+
+ You can get the same effect running:
+
+ echo 2 > /sys/kernel/profile
+
+ as you would by issuing profile=2 on the boot
+ command line.
diff --git a/Documentation/ABI/testing/sysfs-wusb_cbaf b/Documentation/ABI/testing/sysfs-wusb_cbaf
new file mode 100644
index 0000000..a99c5f8
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-wusb_cbaf
@@ -0,0 +1,100 @@
+What: /sys/bus/usb/drivers/wusb_cbaf/.../wusb_*
+Date: August 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ Various files for managing Cable Based Association of
+ (wireless) USB devices.
+
+ The sequence of operations should be:
+
+ 1. Device is plugged in.
+
+ 2. The connection manager (CM) sees a device with CBA capability.
+ (the wusb_chid etc. files in /sys/devices/blah/OURDEVICE).
+
+ 3. The CM writes the host name, supported band groups,
+ and the CHID (host ID) into the wusb_host_name,
+ wusb_host_band_groups and wusb_chid files. These
+ get sent to the device and the CDID (if any) for
+ this host is requested.
+
+ 4. The CM can verify that the device's supported band
+ groups (wusb_device_band_groups) are compatible
+ with the host.
+
+ 5. The CM reads the wusb_cdid file.
+
+ 6. The CM looks it up its database.
+
+ - If it has a matching CHID,CDID entry, the device
+ has been authorized before and nothing further
+ needs to be done.
+
+ - If the CDID is zero (or the CM doesn't find a
+ matching CDID in its database), the device is
+ assumed to be not known. The CM may associate
+ the host with device by: writing a randomly
+ generated CDID to wusb_cdid and then a random CK
+ to wusb_ck (this uploads the new CC to the
+ device).
+
+ CMD may choose to prompt the user before
+ associating with a new device.
+
+ 7. Device is unplugged.
+
+ References:
+ [WUSB-AM] Association Models Supplement to the
+ Certified Wireless Universal Serial Bus
+ Specification, version 1.0.
+
+What: /sys/bus/usb/drivers/wusb_cbaf/.../wusb_chid
+Date: August 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ The CHID of the host formatted as 16 space-separated
+ hex octets.
+
+ Writes fetches device's supported band groups and the
+ the CDID for any existing association with this host.
+
+What: /sys/bus/usb/drivers/wusb_cbaf/.../wusb_host_name
+Date: August 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ A friendly name for the host as a UTF-8 encoded string.
+
+What: /sys/bus/usb/drivers/wusb_cbaf/.../wusb_host_band_groups
+Date: August 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ The band groups supported by the host, in the format
+ defined in [WUSB-AM].
+
+What: /sys/bus/usb/drivers/wusb_cbaf/.../wusb_device_band_groups
+Date: August 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ The band groups supported by the device, in the format
+ defined in [WUSB-AM].
+
+What: /sys/bus/usb/drivers/wusb_cbaf/.../wusb_cdid
+Date: August 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ The device's CDID formatted as 16 space-separated hex
+ octets.
+
+What: /sys/bus/usb/drivers/wusb_cbaf/.../wusb_ck
+Date: August 2008
+KernelVersion: 2.6.27
+Contact: David Vrabel <david.vrabel@csr.com>
+Description:
+ Write 16 space-separated random, hex octets to
+ associate with the device.
diff --git a/Documentation/BUG-HUNTING b/Documentation/BUG-HUNTING
new file mode 100644
index 0000000..65022a8
--- /dev/null
+++ b/Documentation/BUG-HUNTING
@@ -0,0 +1,246 @@
+Table of contents
+=================
+
+Last updated: 20 December 2005
+
+Contents
+========
+
+- Introduction
+- Devices not appearing
+- Finding patch that caused a bug
+-- Finding using git-bisect
+-- Finding it the old way
+- Fixing the bug
+
+Introduction
+============
+
+Always try the latest kernel from kernel.org and build from source. If you are
+not confident in doing that please report the bug to your distribution vendor
+instead of to a kernel developer.
+
+Finding bugs is not always easy. Have a go though. If you can't find it don't
+give up. Report as much as you have found to the relevant maintainer. See
+MAINTAINERS for who that is for the subsystem you have worked on.
+
+Before you submit a bug report read REPORTING-BUGS.
+
+Devices not appearing
+=====================
+
+Often this is caused by udev. Check that first before blaming it on the
+kernel.
+
+Finding patch that caused a bug
+===============================
+
+
+
+Finding using git-bisect
+------------------------
+
+Using the provided tools with git makes finding bugs easy provided the bug is
+reproducible.
+
+Steps to do it:
+- start using git for the kernel source
+- read the man page for git-bisect
+- have fun
+
+Finding it the old way
+----------------------
+
+[Sat Mar 2 10:32:33 PST 1996 KERNEL_BUG-HOWTO lm@sgi.com (Larry McVoy)]
+
+This is how to track down a bug if you know nothing about kernel hacking.
+It's a brute force approach but it works pretty well.
+
+You need:
+
+ . A reproducible bug - it has to happen predictably (sorry)
+ . All the kernel tar files from a revision that worked to the
+ revision that doesn't
+
+You will then do:
+
+ . Rebuild a revision that you believe works, install, and verify that.
+ . Do a binary search over the kernels to figure out which one
+ introduced the bug. I.e., suppose 1.3.28 didn't have the bug, but
+ you know that 1.3.69 does. Pick a kernel in the middle and build
+ that, like 1.3.50. Build & test; if it works, pick the mid point
+ between .50 and .69, else the mid point between .28 and .50.
+ . You'll narrow it down to the kernel that introduced the bug. You
+ can probably do better than this but it gets tricky.
+
+ . Narrow it down to a subdirectory
+
+ - Copy kernel that works into "test". Let's say that 3.62 works,
+ but 3.63 doesn't. So you diff -r those two kernels and come
+ up with a list of directories that changed. For each of those
+ directories:
+
+ Copy the non-working directory next to the working directory
+ as "dir.63".
+ One directory at time, try moving the working directory to
+ "dir.62" and mv dir.63 dir"time, try
+
+ mv dir dir.62
+ mv dir.63 dir
+ find dir -name '*.[oa]' -print | xargs rm -f
+
+ And then rebuild and retest. Assuming that all related
+ changes were contained in the sub directory, this should
+ isolate the change to a directory.
+
+ Problems: changes in header files may have occurred; I've
+ found in my case that they were self explanatory - you may
+ or may not want to give up when that happens.
+
+ . Narrow it down to a file
+
+ - You can apply the same technique to each file in the directory,
+ hoping that the changes in that file are self contained.
+
+ . Narrow it down to a routine
+
+ - You can take the old file and the new file and manually create
+ a merged file that has
+
+ #ifdef VER62
+ routine()
+ {
+ ...
+ }
+ #else
+ routine()
+ {
+ ...
+ }
+ #endif
+
+ And then walk through that file, one routine at a time and
+ prefix it with
+
+ #define VER62
+ /* both routines here */
+ #undef VER62
+
+ Then recompile, retest, move the ifdefs until you find the one
+ that makes the difference.
+
+Finally, you take all the info that you have, kernel revisions, bug
+description, the extent to which you have narrowed it down, and pass
+that off to whomever you believe is the maintainer of that section.
+A post to linux.dev.kernel isn't such a bad idea if you've done some
+work to narrow it down.
+
+If you get it down to a routine, you'll probably get a fix in 24 hours.
+
+My apologies to Linus and the other kernel hackers for describing this
+brute force approach, it's hardly what a kernel hacker would do. However,
+it does work and it lets non-hackers help fix bugs. And it is cool
+because Linux snapshots will let you do this - something that you can't
+do with vendor supplied releases.
+
+Fixing the bug
+==============
+
+Nobody is going to tell you how to fix bugs. Seriously. You need to work it
+out. But below are some hints on how to use the tools.
+
+To debug a kernel, use objdump and look for the hex offset from the crash
+output to find the valid line of code/assembler. Without debug symbols, you
+will see the assembler code for the routine shown, but if your kernel has
+debug symbols the C code will also be available. (Debug symbols can be enabled
+in the kernel hacking menu of the menu configuration.) For example:
+
+ objdump -r -S -l --disassemble net/dccp/ipv4.o
+
+NB.: you need to be at the top level of the kernel tree for this to pick up
+your C files.
+
+If you don't have access to the code you can also debug on some crash dumps
+e.g. crash dump output as shown by Dave Miller.
+
+> EIP is at ip_queue_xmit+0x14/0x4c0
+> ...
+> Code: 44 24 04 e8 6f 05 00 00 e9 e8 fe ff ff 8d 76 00 8d bc 27 00 00
+> 00 00 55 57 56 53 81 ec bc 00 00 00 8b ac 24 d0 00 00 00 8b 5d 08
+> <8b> 83 3c 01 00 00 89 44 24 14 8b 45 28 85 c0 89 44 24 18 0f 85
+>
+> Put the bytes into a "foo.s" file like this:
+>
+> .text
+> .globl foo
+> foo:
+> .byte .... /* bytes from Code: part of OOPS dump */
+>
+> Compile it with "gcc -c -o foo.o foo.s" then look at the output of
+> "objdump --disassemble foo.o".
+>
+> Output:
+>
+> ip_queue_xmit:
+> push %ebp
+> push %edi
+> push %esi
+> push %ebx
+> sub $0xbc, %esp
+> mov 0xd0(%esp), %ebp ! %ebp = arg0 (skb)
+> mov 0x8(%ebp), %ebx ! %ebx = skb->sk
+> mov 0x13c(%ebx), %eax ! %eax = inet_sk(sk)->opt
+
+In addition, you can use GDB to figure out the exact file and line
+number of the OOPS from the vmlinux file. If you have
+CONFIG_DEBUG_INFO enabled, you can simply copy the EIP value from the
+OOPS:
+
+ EIP: 0060:[<c021e50e>] Not tainted VLI
+
+And use GDB to translate that to human-readable form:
+
+ gdb vmlinux
+ (gdb) l *0xc021e50e
+
+If you don't have CONFIG_DEBUG_INFO enabled, you use the function
+offset from the OOPS:
+
+ EIP is at vt_ioctl+0xda8/0x1482
+
+And recompile the kernel with CONFIG_DEBUG_INFO enabled:
+
+ make vmlinux
+ gdb vmlinux
+ (gdb) p vt_ioctl
+ (gdb) l *(0x<address of vt_ioctl> + 0xda8)
+or, as one command
+ (gdb) l *(vt_ioctl + 0xda8)
+
+If you have a call trace, such as :-
+>Call Trace:
+> [<ffffffff8802c8e9>] :jbd:log_wait_commit+0xa3/0xf5
+> [<ffffffff810482d9>] autoremove_wake_function+0x0/0x2e
+> [<ffffffff8802770b>] :jbd:journal_stop+0x1be/0x1ee
+> ...
+this shows the problem in the :jbd: module. You can load that module in gdb
+and list the relevant code.
+ gdb fs/jbd/jbd.ko
+ (gdb) p log_wait_commit
+ (gdb) l *(0x<address> + 0xa3)
+or
+ (gdb) l *(log_wait_commit + 0xa3)
+
+
+Another very useful option of the Kernel Hacking section in menuconfig is
+Debug memory allocations. This will help you see whether data has been
+initialised and not set before use etc. To see the values that get assigned
+with this look at mm/slab.c and search for POISON_INUSE. When using this an
+Oops will often show the poisoned data instead of zero which is the default.
+
+Once you have worked out a fix please submit it upstream. After all open
+source is about sharing what you do and don't you want to be recognised for
+your genius?
+
+Please do read Documentation/SubmittingPatches though to help your code get
+accepted.
diff --git a/Documentation/Changes b/Documentation/Changes
new file mode 100644
index 0000000..cb2b141
--- /dev/null
+++ b/Documentation/Changes
@@ -0,0 +1,396 @@
+Intro
+=====
+
+This document is designed to provide a list of the minimum levels of
+software necessary to run the 2.6 kernels, as well as provide brief
+instructions regarding any other "Gotchas" users may encounter when
+trying life on the Bleeding Edge. If upgrading from a pre-2.4.x
+kernel, please consult the Changes file included with 2.4.x kernels for
+additional information; most of that information will not be repeated
+here. Basically, this document assumes that your system is already
+functional and running at least 2.4.x kernels.
+
+This document is originally based on my "Changes" file for 2.0.x kernels
+and therefore owes credit to the same people as that file (Jared Mauch,
+Axel Boldt, Alessandro Sigala, and countless other users all over the
+'net).
+
+Current Minimal Requirements
+============================
+
+Upgrade to at *least* these software revisions before thinking you've
+encountered a bug! If you're unsure what version you're currently
+running, the suggested command should tell you.
+
+Again, keep in mind that this list assumes you are already
+functionally running a Linux 2.4 kernel. Also, not all tools are
+necessary on all systems; obviously, if you don't have any ISDN
+hardware, for example, you probably needn't concern yourself with
+isdn4k-utils.
+
+o Gnu C 3.2 # gcc --version
+o Gnu make 3.79.1 # make --version
+o binutils 2.12 # ld -v
+o util-linux 2.10o # fdformat --version
+o module-init-tools 0.9.10 # depmod -V
+o e2fsprogs 1.29 # tune2fs
+o jfsutils 1.1.3 # fsck.jfs -V
+o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
+o xfsprogs 2.6.0 # xfs_db -V
+o pcmciautils 004 # pccardctl -V
+o quota-tools 3.09 # quota -V
+o PPP 2.4.0 # pppd --version
+o isdn4k-utils 3.1pre1 # isdnctrl 2>&1|grep version
+o nfs-utils 1.0.5 # showmount --version
+o procps 3.2.0 # ps --version
+o oprofile 0.9 # oprofiled --version
+o udev 081 # udevinfo -V
+o grub 0.93 # grub --version
+
+Kernel compilation
+==================
+
+GCC
+---
+
+The gcc version requirements may vary depending on the type of CPU in your
+computer.
+
+Make
+----
+
+You will need Gnu make 3.79.1 or later to build the kernel.
+
+Binutils
+--------
+
+Linux on IA-32 has recently switched from using as86 to using gas for
+assembling the 16-bit boot code, removing the need for as86 to compile
+your kernel. This change does, however, mean that you need a recent
+release of binutils.
+
+System utilities
+================
+
+Architectural changes
+---------------------
+
+DevFS has been obsoleted in favour of udev
+(http://www.kernel.org/pub/linux/utils/kernel/hotplug/)
+
+32-bit UID support is now in place. Have fun!
+
+Linux documentation for functions is transitioning to inline
+documentation via specially-formatted comments near their
+definitions in the source. These comments can be combined with the
+SGML templates in the Documentation/DocBook directory to make DocBook
+files, which can then be converted by DocBook stylesheets to PostScript,
+HTML, PDF files, and several other formats. In order to convert from
+DocBook format to a format of your choice, you'll need to install Jade as
+well as the desired DocBook stylesheets.
+
+Util-linux
+----------
+
+New versions of util-linux provide *fdisk support for larger disks,
+support new options to mount, recognize more supported partition
+types, have a fdformat which works with 2.4 kernels, and similar goodies.
+You'll probably want to upgrade.
+
+Ksymoops
+--------
+
+If the unthinkable happens and your kernel oopses, you may need the
+ksymoops tool to decode it, but in most cases you don't.
+In the 2.6 kernel it is generally preferred to build the kernel with
+CONFIG_KALLSYMS so that it produces readable dumps that can be used as-is
+(this also produces better output than ksymoops).
+If for some reason your kernel is not build with CONFIG_KALLSYMS and
+you have no way to rebuild and reproduce the Oops with that option, then
+you can still decode that Oops with ksymoops.
+
+Module-Init-Tools
+-----------------
+
+A new module loader is now in the kernel that requires module-init-tools
+to use. It is backward compatible with the 2.4.x series kernels.
+
+Mkinitrd
+--------
+
+These changes to the /lib/modules file tree layout also require that
+mkinitrd be upgraded.
+
+E2fsprogs
+---------
+
+The latest version of e2fsprogs fixes several bugs in fsck and
+debugfs. Obviously, it's a good idea to upgrade.
+
+JFSutils
+--------
+
+The jfsutils package contains the utilities for the file system.
+The following utilities are available:
+o fsck.jfs - initiate replay of the transaction log, and check
+ and repair a JFS formatted partition.
+o mkfs.jfs - create a JFS formatted partition.
+o other file system utilities are also available in this package.
+
+Reiserfsprogs
+-------------
+
+The reiserfsprogs package should be used for reiserfs-3.6.x
+(Linux kernels 2.4.x). It is a combined package and contains working
+versions of mkreiserfs, resize_reiserfs, debugreiserfs and
+reiserfsck. These utils work on both i386 and alpha platforms.
+
+Xfsprogs
+--------
+
+The latest version of xfsprogs contains mkfs.xfs, xfs_db, and the
+xfs_repair utilities, among others, for the XFS filesystem. It is
+architecture independent and any version from 2.0.0 onward should
+work correctly with this version of the XFS kernel code (2.6.0 or
+later is recommended, due to some significant improvements).
+
+PCMCIAutils
+-----------
+
+PCMCIAutils replaces pcmcia-cs (see below). It properly sets up
+PCMCIA sockets at system startup and loads the appropriate modules
+for 16-bit PCMCIA devices if the kernel is modularized and the hotplug
+subsystem is used.
+
+Pcmcia-cs
+---------
+
+PCMCIA (PC Card) support is now partially implemented in the main
+kernel source. The "pcmciautils" package (see above) replaces pcmcia-cs
+for newest kernels.
+
+Quota-tools
+-----------
+
+Support for 32 bit uid's and gid's is required if you want to use
+the newer version 2 quota format. Quota-tools version 3.07 and
+newer has this support. Use the recommended version or newer
+from the table above.
+
+Intel IA32 microcode
+--------------------
+
+A driver has been added to allow updating of Intel IA32 microcode,
+accessible as a normal (misc) character device. If you are not using
+udev you may need to:
+
+mkdir /dev/cpu
+mknod /dev/cpu/microcode c 10 184
+chmod 0644 /dev/cpu/microcode
+
+as root before you can use this. You'll probably also want to
+get the user-space microcode_ctl utility to use with this.
+
+Powertweak
+----------
+
+If you are running v0.1.17 or earlier, you should upgrade to
+version v0.99.0 or higher. Running old versions may cause problems
+with programs using shared memory.
+
+udev
+----
+udev is a userspace application for populating /dev dynamically with
+only entries for devices actually present. udev replaces the basic
+functionality of devfs, while allowing persistent device naming for
+devices.
+
+FUSE
+----
+
+Needs libfuse 2.4.0 or later. Absolute minimum is 2.3.0 but mount
+options 'direct_io' and 'kernel_cache' won't work.
+
+Networking
+==========
+
+General changes
+---------------
+
+If you have advanced network configuration needs, you should probably
+consider using the network tools from ip-route2.
+
+Packet Filter / NAT
+-------------------
+The packet filtering and NAT code uses the same tools like the previous 2.4.x
+kernel series (iptables). It still includes backwards-compatibility modules
+for 2.2.x-style ipchains and 2.0.x-style ipfwadm.
+
+PPP
+---
+
+The PPP driver has been restructured to support multilink and to
+enable it to operate over diverse media layers. If you use PPP,
+upgrade pppd to at least 2.4.0.
+
+If you are not using udev, you must have the device file /dev/ppp
+which can be made by:
+
+mknod /dev/ppp c 108 0
+
+as root.
+
+Isdn4k-utils
+------------
+
+Due to changes in the length of the phone number field, isdn4k-utils
+needs to be recompiled or (preferably) upgraded.
+
+NFS-utils
+---------
+
+In 2.4 and earlier kernels, the nfs server needed to know about any
+client that expected to be able to access files via NFS. This
+information would be given to the kernel by "mountd" when the client
+mounted the filesystem, or by "exportfs" at system startup. exportfs
+would take information about active clients from /var/lib/nfs/rmtab.
+
+This approach is quite fragile as it depends on rmtab being correct
+which is not always easy, particularly when trying to implement
+fail-over. Even when the system is working well, rmtab suffers from
+getting lots of old entries that never get removed.
+
+With 2.6 we have the option of having the kernel tell mountd when it
+gets a request from an unknown host, and mountd can give appropriate
+export information to the kernel. This removes the dependency on
+rmtab and means that the kernel only needs to know about currently
+active clients.
+
+To enable this new functionality, you need to:
+
+ mount -t nfsd nfsd /proc/fs/nfsd
+
+before running exportfs or mountd. It is recommended that all NFS
+services be protected from the internet-at-large by a firewall where
+that is possible.
+
+Getting updated software
+========================
+
+Kernel compilation
+******************
+
+gcc
+---
+o <ftp://ftp.gnu.org/gnu/gcc/>
+
+Make
+----
+o <ftp://ftp.gnu.org/gnu/make/>
+
+Binutils
+--------
+o <ftp://ftp.kernel.org/pub/linux/devel/binutils/>
+
+System utilities
+****************
+
+Util-linux
+----------
+o <ftp://ftp.kernel.org/pub/linux/utils/util-linux/>
+
+Ksymoops
+--------
+o <ftp://ftp.kernel.org/pub/linux/utils/kernel/ksymoops/v2.4/>
+
+Module-Init-Tools
+-----------------
+o <ftp://ftp.kernel.org/pub/linux/kernel/people/rusty/modules/>
+
+Mkinitrd
+--------
+o <ftp://rawhide.redhat.com/pub/rawhide/SRPMS/SRPMS/>
+
+E2fsprogs
+---------
+o <http://prdownloads.sourceforge.net/e2fsprogs/e2fsprogs-1.29.tar.gz>
+
+JFSutils
+--------
+o <http://jfs.sourceforge.net/>
+
+Reiserfsprogs
+-------------
+o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
+
+Xfsprogs
+--------
+o <ftp://oss.sgi.com/projects/xfs/download/>
+
+Pcmciautils
+-----------
+o <ftp://ftp.kernel.org/pub/linux/utils/kernel/pcmcia/>
+
+Pcmcia-cs
+---------
+o <http://pcmcia-cs.sourceforge.net/>
+
+Quota-tools
+----------
+o <http://sourceforge.net/projects/linuxquota/>
+
+DocBook Stylesheets
+-------------------
+o <http://nwalsh.com/docbook/dsssl/>
+
+XMLTO XSLT Frontend
+-------------------
+o <http://cyberelk.net/tim/xmlto/>
+
+Intel P6 microcode
+------------------
+o <http://www.urbanmyth.org/microcode/>
+
+Powertweak
+----------
+o <http://powertweak.sourceforge.net/>
+
+udev
+----
+o <http://www.kernel.org/pub/linux/utils/kernel/hotplug/udev.html>
+
+FUSE
+----
+o <http://sourceforge.net/projects/fuse>
+
+Networking
+**********
+
+PPP
+---
+o <ftp://ftp.samba.org/pub/ppp/ppp-2.4.0.tar.gz>
+
+Isdn4k-utils
+------------
+o <ftp://ftp.isdn4linux.de/pub/isdn4linux/utils/isdn4k-utils.v3.1pre1.tar.gz>
+
+NFS-utils
+---------
+o <http://sourceforge.net/project/showfiles.php?group_id=14>
+
+Iptables
+--------
+o <http://www.iptables.org/downloads.html>
+
+Ip-route2
+---------
+o <ftp://ftp.tux.org/pub/net/ip-routing/iproute2-2.2.4-now-ss991023.tar.gz>
+
+OProfile
+--------
+o <http://oprofile.sf.net/download/>
+
+NFS-Utils
+---------
+o <http://nfs.sourceforge.net/>
+
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
new file mode 100644
index 0000000..1875e50
--- /dev/null
+++ b/Documentation/CodingStyle
@@ -0,0 +1,816 @@
+
+ Linux kernel coding style
+
+This is a short document describing the preferred coding style for the
+linux kernel. Coding style is very personal, and I won't _force_ my
+views on anybody, but this is what goes for anything that I have to be
+able to maintain, and I'd prefer it for most other things too. Please
+at least consider the points made here.
+
+First off, I'd suggest printing out a copy of the GNU coding standards,
+and NOT read it. Burn them, it's a great symbolic gesture.
+
+Anyway, here goes:
+
+
+ Chapter 1: Indentation
+
+Tabs are 8 characters, and thus indentations are also 8 characters.
+There are heretic movements that try to make indentations 4 (or even 2!)
+characters deep, and that is akin to trying to define the value of PI to
+be 3.
+
+Rationale: The whole idea behind indentation is to clearly define where
+a block of control starts and ends. Especially when you've been looking
+at your screen for 20 straight hours, you'll find it a lot easier to see
+how the indentation works if you have large indentations.
+
+Now, some people will claim that having 8-character indentations makes
+the code move too far to the right, and makes it hard to read on a
+80-character terminal screen. The answer to that is that if you need
+more than 3 levels of indentation, you're screwed anyway, and should fix
+your program.
+
+In short, 8-char indents make things easier to read, and have the added
+benefit of warning you when you're nesting your functions too deep.
+Heed that warning.
+
+The preferred way to ease multiple indentation levels in a switch statement is
+to align the "switch" and its subordinate "case" labels in the same column
+instead of "double-indenting" the "case" labels. E.g.:
+
+ switch (suffix) {
+ case 'G':
+ case 'g':
+ mem <<= 30;
+ break;
+ case 'M':
+ case 'm':
+ mem <<= 20;
+ break;
+ case 'K':
+ case 'k':
+ mem <<= 10;
+ /* fall through */
+ default:
+ break;
+ }
+
+
+Don't put multiple statements on a single line unless you have
+something to hide:
+
+ if (condition) do_this;
+ do_something_everytime;
+
+Don't put multiple assignments on a single line either. Kernel coding style
+is super simple. Avoid tricky expressions.
+
+Outside of comments, documentation and except in Kconfig, spaces are never
+used for indentation, and the above example is deliberately broken.
+
+Get a decent editor and don't leave whitespace at the end of lines.
+
+
+ Chapter 2: Breaking long lines and strings
+
+Coding style is all about readability and maintainability using commonly
+available tools.
+
+The limit on the length of lines is 80 columns and this is a strongly
+preferred limit.
+
+Statements longer than 80 columns will be broken into sensible chunks.
+Descendants are always substantially shorter than the parent and are placed
+substantially to the right. The same applies to function headers with a long
+argument list. Long strings are as well broken into shorter strings. The
+only exception to this is where exceeding 80 columns significantly increases
+readability and does not hide information.
+
+void fun(int a, int b, int c)
+{
+ if (condition)
+ printk(KERN_WARNING "Warning this is a long printk with "
+ "3 parameters a: %u b: %u "
+ "c: %u \n", a, b, c);
+ else
+ next_statement;
+}
+
+ Chapter 3: Placing Braces and Spaces
+
+The other issue that always comes up in C styling is the placement of
+braces. Unlike the indent size, there are few technical reasons to
+choose one placement strategy over the other, but the preferred way, as
+shown to us by the prophets Kernighan and Ritchie, is to put the opening
+brace last on the line, and put the closing brace first, thusly:
+
+ if (x is true) {
+ we do y
+ }
+
+This applies to all non-function statement blocks (if, switch, for,
+while, do). E.g.:
+
+ switch (action) {
+ case KOBJ_ADD:
+ return "add";
+ case KOBJ_REMOVE:
+ return "remove";
+ case KOBJ_CHANGE:
+ return "change";
+ default:
+ return NULL;
+ }
+
+However, there is one special case, namely functions: they have the
+opening brace at the beginning of the next line, thus:
+
+ int function(int x)
+ {
+ body of function
+ }
+
+Heretic people all over the world have claimed that this inconsistency
+is ... well ... inconsistent, but all right-thinking people know that
+(a) K&R are _right_ and (b) K&R are right. Besides, functions are
+special anyway (you can't nest them in C).
+
+Note that the closing brace is empty on a line of its own, _except_ in
+the cases where it is followed by a continuation of the same statement,
+ie a "while" in a do-statement or an "else" in an if-statement, like
+this:
+
+ do {
+ body of do-loop
+ } while (condition);
+
+and
+
+ if (x == y) {
+ ..
+ } else if (x > y) {
+ ...
+ } else {
+ ....
+ }
+
+Rationale: K&R.
+
+Also, note that this brace-placement also minimizes the number of empty
+(or almost empty) lines, without any loss of readability. Thus, as the
+supply of new-lines on your screen is not a renewable resource (think
+25-line terminal screens here), you have more empty lines to put
+comments on.
+
+Do not unnecessarily use braces where a single statement will do.
+
+if (condition)
+ action();
+
+This does not apply if one branch of a conditional statement is a single
+statement. Use braces in both branches.
+
+if (condition) {
+ do_this();
+ do_that();
+} else {
+ otherwise();
+}
+
+ 3.1: Spaces
+
+Linux kernel style for use of spaces depends (mostly) on
+function-versus-keyword usage. Use a space after (most) keywords. The
+notable exceptions are sizeof, typeof, alignof, and __attribute__, which look
+somewhat like functions (and are usually used with parentheses in Linux,
+although they are not required in the language, as in: "sizeof info" after
+"struct fileinfo info;" is declared).
+
+So use a space after these keywords:
+ if, switch, case, for, do, while
+but not with sizeof, typeof, alignof, or __attribute__. E.g.,
+ s = sizeof(struct file);
+
+Do not add spaces around (inside) parenthesized expressions. This example is
+*bad*:
+
+ s = sizeof( struct file );
+
+When declaring pointer data or a function that returns a pointer type, the
+preferred use of '*' is adjacent to the data name or function name and not
+adjacent to the type name. Examples:
+
+ char *linux_banner;
+ unsigned long long memparse(char *ptr, char **retptr);
+ char *match_strdup(substring_t *s);
+
+Use one space around (on each side of) most binary and ternary operators,
+such as any of these:
+
+ = + - < > * / % | & ^ <= >= == != ? :
+
+but no space after unary operators:
+ & * + - ~ ! sizeof typeof alignof __attribute__ defined
+
+no space before the postfix increment & decrement unary operators:
+ ++ --
+
+no space after the prefix increment & decrement unary operators:
+ ++ --
+
+and no space around the '.' and "->" structure member operators.
+
+Do not leave trailing whitespace at the ends of lines. Some editors with
+"smart" indentation will insert whitespace at the beginning of new lines as
+appropriate, so you can start typing the next line of code right away.
+However, some such editors do not remove the whitespace if you end up not
+putting a line of code there, such as if you leave a blank line. As a result,
+you end up with lines containing trailing whitespace.
+
+Git will warn you about patches that introduce trailing whitespace, and can
+optionally strip the trailing whitespace for you; however, if applying a series
+of patches, this may make later patches in the series fail by changing their
+context lines.
+
+
+ Chapter 4: Naming
+
+C is a Spartan language, and so should your naming be. Unlike Modula-2
+and Pascal programmers, C programmers do not use cute names like
+ThisVariableIsATemporaryCounter. A C programmer would call that
+variable "tmp", which is much easier to write, and not the least more
+difficult to understand.
+
+HOWEVER, while mixed-case names are frowned upon, descriptive names for
+global variables are a must. To call a global function "foo" is a
+shooting offense.
+
+GLOBAL variables (to be used only if you _really_ need them) need to
+have descriptive names, as do global functions. If you have a function
+that counts the number of active users, you should call that
+"count_active_users()" or similar, you should _not_ call it "cntusr()".
+
+Encoding the type of a function into the name (so-called Hungarian
+notation) is brain damaged - the compiler knows the types anyway and can
+check those, and it only confuses the programmer. No wonder MicroSoft
+makes buggy programs.
+
+LOCAL variable names should be short, and to the point. If you have
+some random integer loop counter, it should probably be called "i".
+Calling it "loop_counter" is non-productive, if there is no chance of it
+being mis-understood. Similarly, "tmp" can be just about any type of
+variable that is used to hold a temporary value.
+
+If you are afraid to mix up your local variable names, you have another
+problem, which is called the function-growth-hormone-imbalance syndrome.
+See chapter 6 (Functions).
+
+
+ Chapter 5: Typedefs
+
+Please don't use things like "vps_t".
+
+It's a _mistake_ to use typedef for structures and pointers. When you see a
+
+ vps_t a;
+
+in the source, what does it mean?
+
+In contrast, if it says
+
+ struct virtual_container *a;
+
+you can actually tell what "a" is.
+
+Lots of people think that typedefs "help readability". Not so. They are
+useful only for:
+
+ (a) totally opaque objects (where the typedef is actively used to _hide_
+ what the object is).
+
+ Example: "pte_t" etc. opaque objects that you can only access using
+ the proper accessor functions.
+
+ NOTE! Opaqueness and "accessor functions" are not good in themselves.
+ The reason we have them for things like pte_t etc. is that there
+ really is absolutely _zero_ portably accessible information there.
+
+ (b) Clear integer types, where the abstraction _helps_ avoid confusion
+ whether it is "int" or "long".
+
+ u8/u16/u32 are perfectly fine typedefs, although they fit into
+ category (d) better than here.
+
+ NOTE! Again - there needs to be a _reason_ for this. If something is
+ "unsigned long", then there's no reason to do
+
+ typedef unsigned long myflags_t;
+
+ but if there is a clear reason for why it under certain circumstances
+ might be an "unsigned int" and under other configurations might be
+ "unsigned long", then by all means go ahead and use a typedef.
+
+ (c) when you use sparse to literally create a _new_ type for
+ type-checking.
+
+ (d) New types which are identical to standard C99 types, in certain
+ exceptional circumstances.
+
+ Although it would only take a short amount of time for the eyes and
+ brain to become accustomed to the standard types like 'uint32_t',
+ some people object to their use anyway.
+
+ Therefore, the Linux-specific 'u8/u16/u32/u64' types and their
+ signed equivalents which are identical to standard types are
+ permitted -- although they are not mandatory in new code of your
+ own.
+
+ When editing existing code which already uses one or the other set
+ of types, you should conform to the existing choices in that code.
+
+ (e) Types safe for use in userspace.
+
+ In certain structures which are visible to userspace, we cannot
+ require C99 types and cannot use the 'u32' form above. Thus, we
+ use __u32 and similar types in all structures which are shared
+ with userspace.
+
+Maybe there are other cases too, but the rule should basically be to NEVER
+EVER use a typedef unless you can clearly match one of those rules.
+
+In general, a pointer, or a struct that has elements that can reasonably
+be directly accessed should _never_ be a typedef.
+
+
+ Chapter 6: Functions
+
+Functions should be short and sweet, and do just one thing. They should
+fit on one or two screenfuls of text (the ISO/ANSI screen size is 80x24,
+as we all know), and do one thing and do that well.
+
+The maximum length of a function is inversely proportional to the
+complexity and indentation level of that function. So, if you have a
+conceptually simple function that is just one long (but simple)
+case-statement, where you have to do lots of small things for a lot of
+different cases, it's OK to have a longer function.
+
+However, if you have a complex function, and you suspect that a
+less-than-gifted first-year high-school student might not even
+understand what the function is all about, you should adhere to the
+maximum limits all the more closely. Use helper functions with
+descriptive names (you can ask the compiler to in-line them if you think
+it's performance-critical, and it will probably do a better job of it
+than you would have done).
+
+Another measure of the function is the number of local variables. They
+shouldn't exceed 5-10, or you're doing something wrong. Re-think the
+function, and split it into smaller pieces. A human brain can
+generally easily keep track of about 7 different things, anything more
+and it gets confused. You know you're brilliant, but maybe you'd like
+to understand what you did 2 weeks from now.
+
+In source files, separate functions with one blank line. If the function is
+exported, the EXPORT* macro for it should follow immediately after the closing
+function brace line. E.g.:
+
+int system_is_up(void)
+{
+ return system_state == SYSTEM_RUNNING;
+}
+EXPORT_SYMBOL(system_is_up);
+
+In function prototypes, include parameter names with their data types.
+Although this is not required by the C language, it is preferred in Linux
+because it is a simple way to add valuable information for the reader.
+
+
+ Chapter 7: Centralized exiting of functions
+
+Albeit deprecated by some people, the equivalent of the goto statement is
+used frequently by compilers in form of the unconditional jump instruction.
+
+The goto statement comes in handy when a function exits from multiple
+locations and some common work such as cleanup has to be done.
+
+The rationale is:
+
+- unconditional statements are easier to understand and follow
+- nesting is reduced
+- errors by not updating individual exit points when making
+ modifications are prevented
+- saves the compiler work to optimize redundant code away ;)
+
+int fun(int a)
+{
+ int result = 0;
+ char *buffer = kmalloc(SIZE);
+
+ if (buffer == NULL)
+ return -ENOMEM;
+
+ if (condition1) {
+ while (loop1) {
+ ...
+ }
+ result = 1;
+ goto out;
+ }
+ ...
+out:
+ kfree(buffer);
+ return result;
+}
+
+ Chapter 8: Commenting
+
+Comments are good, but there is also a danger of over-commenting. NEVER
+try to explain HOW your code works in a comment: it's much better to
+write the code so that the _working_ is obvious, and it's a waste of
+time to explain badly written code.
+
+Generally, you want your comments to tell WHAT your code does, not HOW.
+Also, try to avoid putting comments inside a function body: if the
+function is so complex that you need to separately comment parts of it,
+you should probably go back to chapter 6 for a while. You can make
+small comments to note or warn about something particularly clever (or
+ugly), but try to avoid excess. Instead, put the comments at the head
+of the function, telling people what it does, and possibly WHY it does
+it.
+
+When commenting the kernel API functions, please use the kernel-doc format.
+See the files Documentation/kernel-doc-nano-HOWTO.txt and scripts/kernel-doc
+for details.
+
+Linux style for comments is the C89 "/* ... */" style.
+Don't use C99-style "// ..." comments.
+
+The preferred style for long (multi-line) comments is:
+
+ /*
+ * This is the preferred style for multi-line
+ * comments in the Linux kernel source code.
+ * Please use it consistently.
+ *
+ * Description: A column of asterisks on the left side,
+ * with beginning and ending almost-blank lines.
+ */
+
+It's also important to comment data, whether they are basic types or derived
+types. To this end, use just one data declaration per line (no commas for
+multiple data declarations). This leaves you room for a small comment on each
+item, explaining its use.
+
+
+ Chapter 9: You've made a mess of it
+
+That's OK, we all do. You've probably been told by your long-time Unix
+user helper that "GNU emacs" automatically formats the C sources for
+you, and you've noticed that yes, it does do that, but the defaults it
+uses are less than desirable (in fact, they are worse than random
+typing - an infinite number of monkeys typing into GNU emacs would never
+make a good program).
+
+So, you can either get rid of GNU emacs, or change it to use saner
+values. To do the latter, you can stick the following in your .emacs file:
+
+(defun c-lineup-arglist-tabs-only (ignored)
+ "Line up argument lists by tabs, not spaces"
+ (let* ((anchor (c-langelem-pos c-syntactic-element))
+ (column (c-langelem-2nd-pos c-syntactic-element))
+ (offset (- (1+ column) anchor))
+ (steps (floor offset c-basic-offset)))
+ (* (max steps 1)
+ c-basic-offset)))
+
+(add-hook 'c-mode-hook
+ (lambda ()
+ (let ((filename (buffer-file-name)))
+ ;; Enable kernel mode for the appropriate files
+ (when (and filename
+ (string-match "~/src/linux-trees" filename))
+ (setq indent-tabs-mode t)
+ (c-set-style "linux")
+ (c-set-offset 'arglist-cont-nonempty
+ '(c-lineup-gcc-asm-reg
+ c-lineup-arglist-tabs-only))))))
+
+This will make emacs go better with the kernel coding style for C
+files below ~/src/linux-trees.
+
+But even if you fail in getting emacs to do sane formatting, not
+everything is lost: use "indent".
+
+Now, again, GNU indent has the same brain-dead settings that GNU emacs
+has, which is why you need to give it a few command line options.
+However, that's not too bad, because even the makers of GNU indent
+recognize the authority of K&R (the GNU people aren't evil, they are
+just severely misguided in this matter), so you just give indent the
+options "-kr -i8" (stands for "K&R, 8 character indents"), or use
+"scripts/Lindent", which indents in the latest style.
+
+"indent" has a lot of options, and especially when it comes to comment
+re-formatting you may want to take a look at the man page. But
+remember: "indent" is not a fix for bad programming.
+
+
+ Chapter 10: Kconfig configuration files
+
+For all of the Kconfig* configuration files throughout the source tree,
+the indentation is somewhat different. Lines under a "config" definition
+are indented with one tab, while help text is indented an additional two
+spaces. Example:
+
+config AUDIT
+ bool "Auditing support"
+ depends on NET
+ help
+ Enable auditing infrastructure that can be used with another
+ kernel subsystem, such as SELinux (which requires this for
+ logging of avc messages output). Does not do system-call
+ auditing without CONFIG_AUDITSYSCALL.
+
+Features that might still be considered unstable should be defined as
+dependent on "EXPERIMENTAL":
+
+config SLUB
+ depends on EXPERIMENTAL && !ARCH_USES_SLAB_PAGE_STRUCT
+ bool "SLUB (Unqueued Allocator)"
+ ...
+
+while seriously dangerous features (such as write support for certain
+filesystems) should advertise this prominently in their prompt string:
+
+config ADFS_FS_RW
+ bool "ADFS write support (DANGEROUS)"
+ depends on ADFS_FS
+ ...
+
+For full documentation on the configuration files, see the file
+Documentation/kbuild/kconfig-language.txt.
+
+
+ Chapter 11: Data structures
+
+Data structures that have visibility outside the single-threaded
+environment they are created and destroyed in should always have
+reference counts. In the kernel, garbage collection doesn't exist (and
+outside the kernel garbage collection is slow and inefficient), which
+means that you absolutely _have_ to reference count all your uses.
+
+Reference counting means that you can avoid locking, and allows multiple
+users to have access to the data structure in parallel - and not having
+to worry about the structure suddenly going away from under them just
+because they slept or did something else for a while.
+
+Note that locking is _not_ a replacement for reference counting.
+Locking is used to keep data structures coherent, while reference
+counting is a memory management technique. Usually both are needed, and
+they are not to be confused with each other.
+
+Many data structures can indeed have two levels of reference counting,
+when there are users of different "classes". The subclass count counts
+the number of subclass users, and decrements the global count just once
+when the subclass count goes to zero.
+
+Examples of this kind of "multi-level-reference-counting" can be found in
+memory management ("struct mm_struct": mm_users and mm_count), and in
+filesystem code ("struct super_block": s_count and s_active).
+
+Remember: if another thread can find your data structure, and you don't
+have a reference count on it, you almost certainly have a bug.
+
+
+ Chapter 12: Macros, Enums and RTL
+
+Names of macros defining constants and labels in enums are capitalized.
+
+#define CONSTANT 0x12345
+
+Enums are preferred when defining several related constants.
+
+CAPITALIZED macro names are appreciated but macros resembling functions
+may be named in lower case.
+
+Generally, inline functions are preferable to macros resembling functions.
+
+Macros with multiple statements should be enclosed in a do - while block:
+
+#define macrofun(a, b, c) \
+ do { \
+ if (a == 5) \
+ do_this(b, c); \
+ } while (0)
+
+Things to avoid when using macros:
+
+1) macros that affect control flow:
+
+#define FOO(x) \
+ do { \
+ if (blah(x) < 0) \
+ return -EBUGGERED; \
+ } while(0)
+
+is a _very_ bad idea. It looks like a function call but exits the "calling"
+function; don't break the internal parsers of those who will read the code.
+
+2) macros that depend on having a local variable with a magic name:
+
+#define FOO(val) bar(index, val)
+
+might look like a good thing, but it's confusing as hell when one reads the
+code and it's prone to breakage from seemingly innocent changes.
+
+3) macros with arguments that are used as l-values: FOO(x) = y; will
+bite you if somebody e.g. turns FOO into an inline function.
+
+4) forgetting about precedence: macros defining constants using expressions
+must enclose the expression in parentheses. Beware of similar issues with
+macros using parameters.
+
+#define CONSTANT 0x4000
+#define CONSTEXP (CONSTANT | 3)
+
+The cpp manual deals with macros exhaustively. The gcc internals manual also
+covers RTL which is used frequently with assembly language in the kernel.
+
+
+ Chapter 13: Printing kernel messages
+
+Kernel developers like to be seen as literate. Do mind the spelling
+of kernel messages to make a good impression. Do not use crippled
+words like "dont"; use "do not" or "don't" instead. Make the messages
+concise, clear, and unambiguous.
+
+Kernel messages do not have to be terminated with a period.
+
+Printing numbers in parentheses (%d) adds no value and should be avoided.
+
+There are a number of driver model diagnostic macros in <linux/device.h>
+which you should use to make sure messages are matched to the right device
+and driver, and are tagged with the right level: dev_err(), dev_warn(),
+dev_info(), and so forth. For messages that aren't associated with a
+particular device, <linux/kernel.h> defines pr_debug() and pr_info().
+
+Coming up with good debugging messages can be quite a challenge; and once
+you have them, they can be a huge help for remote troubleshooting. Such
+messages should be compiled out when the DEBUG symbol is not defined (that
+is, by default they are not included). When you use dev_dbg() or pr_debug(),
+that's automatic. Many subsystems have Kconfig options to turn on -DDEBUG.
+A related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to the
+ones already enabled by DEBUG.
+
+
+ Chapter 14: Allocating memory
+
+The kernel provides the following general purpose memory allocators:
+kmalloc(), kzalloc(), kcalloc(), and vmalloc(). Please refer to the API
+documentation for further information about them.
+
+The preferred form for passing a size of a struct is the following:
+
+ p = kmalloc(sizeof(*p), ...);
+
+The alternative form where struct name is spelled out hurts readability and
+introduces an opportunity for a bug when the pointer variable type is changed
+but the corresponding sizeof that is passed to a memory allocator is not.
+
+Casting the return value which is a void pointer is redundant. The conversion
+from void pointer to any other pointer type is guaranteed by the C programming
+language.
+
+
+ Chapter 15: The inline disease
+
+There appears to be a common misperception that gcc has a magic "make me
+faster" speedup option called "inline". While the use of inlines can be
+appropriate (for example as a means of replacing macros, see Chapter 12), it
+very often is not. Abundant use of the inline keyword leads to a much bigger
+kernel, which in turn slows the system as a whole down, due to a bigger
+icache footprint for the CPU and simply because there is less memory
+available for the pagecache. Just think about it; a pagecache miss causes a
+disk seek, which easily takes 5 miliseconds. There are a LOT of cpu cycles
+that can go into these 5 miliseconds.
+
+A reasonable rule of thumb is to not put inline at functions that have more
+than 3 lines of code in them. An exception to this rule are the cases where
+a parameter is known to be a compiletime constant, and as a result of this
+constantness you *know* the compiler will be able to optimize most of your
+function away at compile time. For a good example of this later case, see
+the kmalloc() inline function.
+
+Often people argue that adding inline to functions that are static and used
+only once is always a win since there is no space tradeoff. While this is
+technically correct, gcc is capable of inlining these automatically without
+help, and the maintenance issue of removing the inline when a second user
+appears outweighs the potential value of the hint that tells gcc to do
+something it would have done anyway.
+
+
+ Chapter 16: Function return values and names
+
+Functions can return values of many different kinds, and one of the
+most common is a value indicating whether the function succeeded or
+failed. Such a value can be represented as an error-code integer
+(-Exxx = failure, 0 = success) or a "succeeded" boolean (0 = failure,
+non-zero = success).
+
+Mixing up these two sorts of representations is a fertile source of
+difficult-to-find bugs. If the C language included a strong distinction
+between integers and booleans then the compiler would find these mistakes
+for us... but it doesn't. To help prevent such bugs, always follow this
+convention:
+
+ If the name of a function is an action or an imperative command,
+ the function should return an error-code integer. If the name
+ is a predicate, the function should return a "succeeded" boolean.
+
+For example, "add work" is a command, and the add_work() function returns 0
+for success or -EBUSY for failure. In the same way, "PCI device present" is
+a predicate, and the pci_dev_present() function returns 1 if it succeeds in
+finding a matching device or 0 if it doesn't.
+
+All EXPORTed functions must respect this convention, and so should all
+public functions. Private (static) functions need not, but it is
+recommended that they do.
+
+Functions whose return value is the actual result of a computation, rather
+than an indication of whether the computation succeeded, are not subject to
+this rule. Generally they indicate failure by returning some out-of-range
+result. Typical examples would be functions that return pointers; they use
+NULL or the ERR_PTR mechanism to report failure.
+
+
+ Chapter 17: Don't re-invent the kernel macros
+
+The header file include/linux/kernel.h contains a number of macros that
+you should use, rather than explicitly coding some variant of them yourself.
+For example, if you need to calculate the length of an array, take advantage
+of the macro
+
+ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+Similarly, if you need to calculate the size of some structure member, use
+
+ #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
+
+There are also min() and max() macros that do strict type checking if you
+need them. Feel free to peruse that header file to see what else is already
+defined that you shouldn't reproduce in your code.
+
+
+ Chapter 18: Editor modelines and other cruft
+
+Some editors can interpret configuration information embedded in source files,
+indicated with special markers. For example, emacs interprets lines marked
+like this:
+
+-*- mode: c -*-
+
+Or like this:
+
+/*
+Local Variables:
+compile-command: "gcc -DMAGIC_DEBUG_FLAG foo.c"
+End:
+*/
+
+Vim interprets markers that look like this:
+
+/* vim:set sw=8 noet */
+
+Do not include any of these in source files. People have their own personal
+editor configurations, and your source files should not override them. This
+includes markers for indentation and mode configuration. People may use their
+own custom mode, or may have some other magic method for making indentation
+work correctly.
+
+
+
+ Appendix I: References
+
+The C Programming Language, Second Edition
+by Brian W. Kernighan and Dennis M. Ritchie.
+Prentice Hall, Inc., 1988.
+ISBN 0-13-110362-8 (paperback), 0-13-110370-9 (hardback).
+URL: http://cm.bell-labs.com/cm/cs/cbook/
+
+The Practice of Programming
+by Brian W. Kernighan and Rob Pike.
+Addison-Wesley, Inc., 1999.
+ISBN 0-201-61586-X.
+URL: http://cm.bell-labs.com/cm/cs/tpop/
+
+GNU manuals - where in compliance with K&R and this text - for cpp, gcc,
+gcc internals and indent, all available from http://www.gnu.org/manual/
+
+WG14 is the international standardization working group for the programming
+language C, URL: http://www.open-std.org/JTC1/SC22/WG14/
+
+Kernel CodingStyle, by greg@kroah.com at OLS 2002:
+http://www.kroah.com/linux/talks/ols_2002_kernel_codingstyle_talk/html/
+
+--
+Last updated on 2007-July-13.
+
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
new file mode 100644
index 0000000..b462bb1
--- /dev/null
+++ b/Documentation/DMA-API.txt
@@ -0,0 +1,612 @@
+ Dynamic DMA mapping using the generic device
+ ============================================
+
+ James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
+
+This document describes the DMA API. For a more gentle introduction
+phrased in terms of the pci_ equivalents (and actual examples) see
+DMA-mapping.txt
+
+This API is split into two pieces. Part I describes the API and the
+corresponding pci_ API. Part II describes the extensions to the API
+for supporting non-consistent memory machines. Unless you know that
+your driver absolutely has to support non-consistent platforms (this
+is usually only legacy platforms) you should only use the API
+described in part I.
+
+Part I - pci_ and dma_ Equivalent API
+-------------------------------------
+
+To get the pci_ API, you must #include <linux/pci.h>
+To get the dma_ API, you must #include <linux/dma-mapping.h>
+
+
+Part Ia - Using large dma-coherent buffers
+------------------------------------------
+
+void *
+dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flag)
+void *
+pci_alloc_consistent(struct pci_dev *dev, size_t size,
+ dma_addr_t *dma_handle)
+
+Consistent memory is memory for which a write by either the device or
+the processor can immediately be read by the processor or device
+without having to worry about caching effects. (You may however need
+to make sure to flush the processor's write buffers before telling
+devices to read that memory.)
+
+This routine allocates a region of <size> bytes of consistent memory.
+It also returns a <dma_handle> which may be cast to an unsigned
+integer the same width as the bus and used as the physical address
+base of the region.
+
+Returns: a pointer to the allocated region (in the processor's virtual
+address space) or NULL if the allocation failed.
+
+Note: consistent memory can be expensive on some platforms, and the
+minimum allocation length may be as big as a page, so you should
+consolidate your requests for consistent memory as much as possible.
+The simplest way to do that is to use the dma_pool calls (see below).
+
+The flag parameter (dma_alloc_coherent only) allows the caller to
+specify the GFP_ flags (see kmalloc) for the allocation (the
+implementation may choose to ignore flags that affect the location of
+the returned memory, like GFP_DMA). For pci_alloc_consistent, you
+must assume GFP_ATOMIC behaviour.
+
+void
+dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle)
+void
+pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle)
+
+Free the region of consistent memory you previously allocated. dev,
+size and dma_handle must all be the same as those passed into the
+consistent allocate. cpu_addr must be the virtual address returned by
+the consistent allocate.
+
+Note that unlike their sibling allocation calls, these routines
+may only be called with IRQs enabled.
+
+
+Part Ib - Using small dma-coherent buffers
+------------------------------------------
+
+To get this part of the dma_ API, you must #include <linux/dmapool.h>
+
+Many drivers need lots of small dma-coherent memory regions for DMA
+descriptors or I/O buffers. Rather than allocating in units of a page
+or more using dma_alloc_coherent(), you can use DMA pools. These work
+much like a struct kmem_cache, except that they use the dma-coherent allocator,
+not __get_free_pages(). Also, they understand common hardware constraints
+for alignment, like queue heads needing to be aligned on N-byte boundaries.
+
+
+ struct dma_pool *
+ dma_pool_create(const char *name, struct device *dev,
+ size_t size, size_t align, size_t alloc);
+
+ struct pci_pool *
+ pci_pool_create(const char *name, struct pci_device *dev,
+ size_t size, size_t align, size_t alloc);
+
+The pool create() routines initialize a pool of dma-coherent buffers
+for use with a given device. It must be called in a context which
+can sleep.
+
+The "name" is for diagnostics (like a struct kmem_cache name); dev and size
+are like what you'd pass to dma_alloc_coherent(). The device's hardware
+alignment requirement for this type of data is "align" (which is expressed
+in bytes, and must be a power of two). If your device has no boundary
+crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
+from this pool must not cross 4KByte boundaries.
+
+
+ void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
+ dma_addr_t *dma_handle);
+
+ void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags,
+ dma_addr_t *dma_handle);
+
+This allocates memory from the pool; the returned memory will meet the size
+and alignment requirements specified at creation time. Pass GFP_ATOMIC to
+prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks),
+pass GFP_KERNEL to allow blocking. Like dma_alloc_coherent(), this returns
+two values: an address usable by the cpu, and the dma address usable by the
+pool's device.
+
+
+ void dma_pool_free(struct dma_pool *pool, void *vaddr,
+ dma_addr_t addr);
+
+ void pci_pool_free(struct pci_pool *pool, void *vaddr,
+ dma_addr_t addr);
+
+This puts memory back into the pool. The pool is what was passed to
+the pool allocation routine; the cpu (vaddr) and dma addresses are what
+were returned when that routine allocated the memory being freed.
+
+
+ void dma_pool_destroy(struct dma_pool *pool);
+
+ void pci_pool_destroy(struct pci_pool *pool);
+
+The pool destroy() routines free the resources of the pool. They must be
+called in a context which can sleep. Make sure you've freed all allocated
+memory back to the pool before you destroy it.
+
+
+Part Ic - DMA addressing limitations
+------------------------------------
+
+int
+dma_supported(struct device *dev, u64 mask)
+int
+pci_dma_supported(struct pci_dev *hwdev, u64 mask)
+
+Checks to see if the device can support DMA to the memory described by
+mask.
+
+Returns: 1 if it can and 0 if it can't.
+
+Notes: This routine merely tests to see if the mask is possible. It
+won't change the current mask settings. It is more intended as an
+internal API for use by the platform than an external API for use by
+driver writers.
+
+int
+dma_set_mask(struct device *dev, u64 mask)
+int
+pci_set_dma_mask(struct pci_device *dev, u64 mask)
+
+Checks to see if the mask is possible and updates the device
+parameters if it is.
+
+Returns: 0 if successful and a negative error if not.
+
+u64
+dma_get_required_mask(struct device *dev)
+
+After setting the mask with dma_set_mask(), this API returns the
+actual mask (within that already set) that the platform actually
+requires to operate efficiently. Usually this means the returned mask
+is the minimum required to cover all of memory. Examining the
+required mask gives drivers with variable descriptor sizes the
+opportunity to use smaller descriptors as necessary.
+
+Requesting the required mask does not alter the current mask. If you
+wish to take advantage of it, you should issue another dma_set_mask()
+call to lower the mask again.
+
+
+Part Id - Streaming DMA mappings
+--------------------------------
+
+dma_addr_t
+dma_map_single(struct device *dev, void *cpu_addr, size_t size,
+ enum dma_data_direction direction)
+dma_addr_t
+pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size,
+ int direction)
+
+Maps a piece of processor virtual memory so it can be accessed by the
+device and returns the physical handle of the memory.
+
+The direction for both api's may be converted freely by casting.
+However the dma_ API uses a strongly typed enumerator for its
+direction:
+
+DMA_NONE = PCI_DMA_NONE no direction (used for
+ debugging)
+DMA_TO_DEVICE = PCI_DMA_TODEVICE data is going from the
+ memory to the device
+DMA_FROM_DEVICE = PCI_DMA_FROMDEVICE data is coming from
+ the device to the
+ memory
+DMA_BIDIRECTIONAL = PCI_DMA_BIDIRECTIONAL direction isn't known
+
+Notes: Not all memory regions in a machine can be mapped by this
+API. Further, regions that appear to be physically contiguous in
+kernel virtual space may not be contiguous as physical memory. Since
+this API does not provide any scatter/gather capability, it will fail
+if the user tries to map a non-physically contiguous piece of memory.
+For this reason, it is recommended that memory mapped by this API be
+obtained only from sources which guarantee it to be physically contiguous
+(like kmalloc).
+
+Further, the physical address of the memory must be within the
+dma_mask of the device (the dma_mask represents a bit mask of the
+addressable region for the device. I.e., if the physical address of
+the memory anded with the dma_mask is still equal to the physical
+address, then the device can perform DMA to the memory). In order to
+ensure that the memory allocated by kmalloc is within the dma_mask,
+the driver may specify various platform-dependent flags to restrict
+the physical memory range of the allocation (e.g. on x86, GFP_DMA
+guarantees to be within the first 16Mb of available physical memory,
+as required by ISA devices).
+
+Note also that the above constraints on physical contiguity and
+dma_mask may not apply if the platform has an IOMMU (a device which
+supplies a physical to virtual mapping between the I/O memory bus and
+the device). However, to be portable, device driver writers may *not*
+assume that such an IOMMU exists.
+
+Warnings: Memory coherency operates at a granularity called the cache
+line width. In order for memory mapped by this API to operate
+correctly, the mapped region must begin exactly on a cache line
+boundary and end exactly on one (to prevent two separately mapped
+regions from sharing a single cache line). Since the cache line size
+may not be known at compile time, the API will not enforce this
+requirement. Therefore, it is recommended that driver writers who
+don't take special care to determine the cache line size at run time
+only map virtual regions that begin and end on page boundaries (which
+are guaranteed also to be cache line boundaries).
+
+DMA_TO_DEVICE synchronisation must be done after the last modification
+of the memory region by the software and before it is handed off to
+the driver. Once this primitive is used, memory covered by this
+primitive should be treated as read-only by the device. If the device
+may write to it at any point, it should be DMA_BIDIRECTIONAL (see
+below).
+
+DMA_FROM_DEVICE synchronisation must be done before the driver
+accesses data that may be changed by the device. This memory should
+be treated as read-only by the driver. If the driver needs to write
+to it at any point, it should be DMA_BIDIRECTIONAL (see below).
+
+DMA_BIDIRECTIONAL requires special handling: it means that the driver
+isn't sure if the memory was modified before being handed off to the
+device and also isn't sure if the device will also modify it. Thus,
+you must always sync bidirectional memory twice: once before the
+memory is handed off to the device (to make sure all memory changes
+are flushed from the processor) and once before the data may be
+accessed after being used by the device (to make sure any processor
+cache lines are updated with data that the device may have changed).
+
+void
+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+ enum dma_data_direction direction)
+void
+pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
+ size_t size, int direction)
+
+Unmaps the region previously mapped. All the parameters passed in
+must be identical to those passed in (and returned) by the mapping
+API.
+
+dma_addr_t
+dma_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction)
+dma_addr_t
+pci_map_page(struct pci_dev *hwdev, struct page *page,
+ unsigned long offset, size_t size, int direction)
+void
+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+ enum dma_data_direction direction)
+void
+pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
+ size_t size, int direction)
+
+API for mapping and unmapping for pages. All the notes and warnings
+for the other mapping APIs apply here. Also, although the <offset>
+and <size> parameters are provided to do partial page mapping, it is
+recommended that you never use these unless you really know what the
+cache width is.
+
+int
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+
+int
+pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr)
+
+In some circumstances dma_map_single and dma_map_page will fail to create
+a mapping. A driver can check for these errors by testing the returned
+dma address with dma_mapping_error(). A non-zero return value means the mapping
+could not be created and the driver should take appropriate action (e.g.
+reduce current DMA mapping usage or delay and try again later).
+
+ int
+ dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction direction)
+ int
+ pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+
+Returns: the number of physical segments mapped (this may be shorter
+than <nents> passed in if some elements of the scatter/gather list are
+physically or virtually adjacent and an IOMMU maps them with a single
+entry).
+
+Please note that the sg cannot be mapped again if it has been mapped once.
+The mapping process is allowed to destroy information in the sg.
+
+As with the other mapping interfaces, dma_map_sg can fail. When it
+does, 0 is returned and a driver must take appropriate action. It is
+critical that the driver do something, in the case of a block driver
+aborting the request or even oopsing is better than doing nothing and
+corrupting the filesystem.
+
+With scatterlists, you use the resulting mapping like this:
+
+ int i, count = dma_map_sg(dev, sglist, nents, direction);
+ struct scatterlist *sg;
+
+ for_each_sg(sglist, sg, count, i) {
+ hw_address[i] = sg_dma_address(sg);
+ hw_len[i] = sg_dma_len(sg);
+ }
+
+where nents is the number of entries in the sglist.
+
+The implementation is free to merge several consecutive sglist entries
+into one (e.g. with an IOMMU, or if several pages just happen to be
+physically contiguous) and returns the actual number of sg entries it
+mapped them to. On failure 0, is returned.
+
+Then you should loop count times (note: this can be less than nents times)
+and use sg_dma_address() and sg_dma_len() macros where you previously
+accessed sg->address and sg->length as shown above.
+
+ void
+ dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+ int nhwentries, enum dma_data_direction direction)
+ void
+ pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+
+Unmap the previously mapped scatter/gather list. All the parameters
+must be the same as those and passed in to the scatter/gather mapping
+API.
+
+Note: <nents> must be the number you passed in, *not* the number of
+physical entries returned.
+
+void
+dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size,
+ enum dma_data_direction direction)
+void
+pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle,
+ size_t size, int direction)
+void
+dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems,
+ enum dma_data_direction direction)
+void
+pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+ int nelems, int direction)
+
+Synchronise a single contiguous or scatter/gather mapping. All the
+parameters must be the same as those passed into the single mapping
+API.
+
+Notes: You must do this:
+
+- Before reading values that have been written by DMA from the device
+ (use the DMA_FROM_DEVICE direction)
+- After writing values that will be written to the device using DMA
+ (use the DMA_TO_DEVICE) direction
+- before *and* after handing memory to the device if the memory is
+ DMA_BIDIRECTIONAL
+
+See also dma_map_single().
+
+dma_addr_t
+dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+
+void
+dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+
+int
+dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+
+void
+dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+
+The four functions above are just like the counterpart functions
+without the _attrs suffixes, except that they pass an optional
+struct dma_attrs*.
+
+struct dma_attrs encapsulates a set of "dma attributes". For the
+definition of struct dma_attrs see linux/dma-attrs.h.
+
+The interpretation of dma attributes is architecture-specific, and
+each attribute should be documented in Documentation/DMA-attributes.txt.
+
+If struct dma_attrs* is NULL, the semantics of each of these
+functions is identical to those of the corresponding function
+without the _attrs suffix. As a result dma_map_single_attrs()
+can generally replace dma_map_single(), etc.
+
+As an example of the use of the *_attrs functions, here's how
+you could pass an attribute DMA_ATTR_FOO when mapping memory
+for DMA:
+
+#include <linux/dma-attrs.h>
+/* DMA_ATTR_FOO should be defined in linux/dma-attrs.h and
+ * documented in Documentation/DMA-attributes.txt */
+...
+
+ DEFINE_DMA_ATTRS(attrs);
+ dma_set_attr(DMA_ATTR_FOO, &attrs);
+ ....
+ n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, &attr);
+ ....
+
+Architectures that care about DMA_ATTR_FOO would check for its
+presence in their implementations of the mapping and unmapping
+routines, e.g.:
+
+void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ ....
+ int foo = dma_get_attr(DMA_ATTR_FOO, attrs);
+ ....
+ if (foo)
+ /* twizzle the frobnozzle */
+ ....
+
+
+Part II - Advanced dma_ usage
+-----------------------------
+
+Warning: These pieces of the DMA API have no PCI equivalent. They
+should also not be used in the majority of cases, since they cater for
+unlikely corner cases that don't belong in usual drivers.
+
+If you don't understand how cache line coherency works between a
+processor and an I/O device, you should not be using this part of the
+API at all.
+
+void *
+dma_alloc_noncoherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flag)
+
+Identical to dma_alloc_coherent() except that the platform will
+choose to return either consistent or non-consistent memory as it sees
+fit. By using this API, you are guaranteeing to the platform that you
+have all the correct and necessary sync points for this memory in the
+driver should it choose to return non-consistent memory.
+
+Note: where the platform can return consistent memory, it will
+guarantee that the sync points become nops.
+
+Warning: Handling non-consistent memory is a real pain. You should
+only ever use this API if you positively know your driver will be
+required to work on one of the rare (usually non-PCI) architectures
+that simply cannot make consistent memory.
+
+void
+dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle)
+
+Free memory allocated by the nonconsistent API. All parameters must
+be identical to those passed in (and returned by
+dma_alloc_noncoherent()).
+
+int
+dma_is_consistent(struct device *dev, dma_addr_t dma_handle)
+
+Returns true if the device dev is performing consistent DMA on the memory
+area pointed to by the dma_handle.
+
+int
+dma_get_cache_alignment(void)
+
+Returns the processor cache alignment. This is the absolute minimum
+alignment *and* width that you must observe when either mapping
+memory or doing partial flushes.
+
+Notes: This API may return a number *larger* than the actual cache
+line, but it will guarantee that one or more cache lines fit exactly
+into the width returned by this call. It will also always be a power
+of two for easy alignment.
+
+void
+dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction)
+
+Does a partial sync, starting at offset and continuing for size. You
+must be careful to observe the cache alignment and width when doing
+anything like this. You must also be extra careful about accessing
+memory you intend to sync partially.
+
+void
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+ enum dma_data_direction direction)
+
+Do a partial sync of memory that was allocated by
+dma_alloc_noncoherent(), starting at virtual address vaddr and
+continuing on for size. Again, you *must* observe the cache line
+boundaries when doing this.
+
+int
+dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+ dma_addr_t device_addr, size_t size, int
+ flags)
+
+Declare region of memory to be handed out by dma_alloc_coherent when
+it's asked for coherent memory for this device.
+
+bus_addr is the physical address to which the memory is currently
+assigned in the bus responding region (this will be used by the
+platform to perform the mapping).
+
+device_addr is the physical address the device needs to be programmed
+with actually to address this memory (this will be handed out as the
+dma_addr_t in dma_alloc_coherent()).
+
+size is the size of the area (must be multiples of PAGE_SIZE).
+
+flags can be or'd together and are:
+
+DMA_MEMORY_MAP - request that the memory returned from
+dma_alloc_coherent() be directly writable.
+
+DMA_MEMORY_IO - request that the memory returned from
+dma_alloc_coherent() be addressable using read/write/memcpy_toio etc.
+
+One or both of these flags must be present.
+
+DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
+dma_alloc_coherent of any child devices of this one (for memory residing
+on a bridge).
+
+DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
+Do not allow dma_alloc_coherent() to fall back to system memory when
+it's out of memory in the declared region.
+
+The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and
+must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO
+if only DMA_MEMORY_MAP were passed in) for success or zero for
+failure.
+
+Note, for DMA_MEMORY_IO returns, all subsequent memory returned by
+dma_alloc_coherent() may no longer be accessed directly, but instead
+must be accessed using the correct bus functions. If your driver
+isn't prepared to handle this contingency, it should not specify
+DMA_MEMORY_IO in the input flags.
+
+As a simplification for the platforms, only *one* such region of
+memory may be declared per device.
+
+For reasons of efficiency, most platforms choose to track the declared
+region only at the granularity of a page. For smaller allocations,
+you should use the dma_pool() API.
+
+void
+dma_release_declared_memory(struct device *dev)
+
+Remove the memory region previously declared from the system. This
+API performs *no* in-use checking for this region and will return
+unconditionally having removed all the required structures. It is the
+driver's job to ensure that no parts of this memory region are
+currently in use.
+
+void *
+dma_mark_declared_memory_occupied(struct device *dev,
+ dma_addr_t device_addr, size_t size)
+
+This is used to occupy specific regions of the declared space
+(dma_alloc_coherent() will hand out the first free region it finds).
+
+device_addr is the *device* address of the region requested.
+
+size is the size (and should be a page-sized multiple).
+
+The return value will be either a pointer to the processor virtual
+address of the memory, or an error (via PTR_ERR()) if any part of the
+region is occupied.
diff --git a/Documentation/DMA-ISA-LPC.txt b/Documentation/DMA-ISA-LPC.txt
new file mode 100644
index 0000000..e767805
--- /dev/null
+++ b/Documentation/DMA-ISA-LPC.txt
@@ -0,0 +1,151 @@
+ DMA with ISA and LPC devices
+ ============================
+
+ Pierre Ossman <drzeus@drzeus.cx>
+
+This document describes how to do DMA transfers using the old ISA DMA
+controller. Even though ISA is more or less dead today the LPC bus
+uses the same DMA system so it will be around for quite some time.
+
+Part I - Headers and dependencies
+---------------------------------
+
+To do ISA style DMA you need to include two headers:
+
+#include <linux/dma-mapping.h>
+#include <asm/dma.h>
+
+The first is the generic DMA API used to convert virtual addresses to
+physical addresses (see Documentation/DMA-API.txt for details).
+
+The second contains the routines specific to ISA DMA transfers. Since
+this is not present on all platforms make sure you construct your
+Kconfig to be dependent on ISA_DMA_API (not ISA) so that nobody tries
+to build your driver on unsupported platforms.
+
+Part II - Buffer allocation
+---------------------------
+
+The ISA DMA controller has some very strict requirements on which
+memory it can access so extra care must be taken when allocating
+buffers.
+
+(You usually need a special buffer for DMA transfers instead of
+transferring directly to and from your normal data structures.)
+
+The DMA-able address space is the lowest 16 MB of _physical_ memory.
+Also the transfer block may not cross page boundaries (which are 64
+or 128 KiB depending on which channel you use).
+
+In order to allocate a piece of memory that satisfies all these
+requirements you pass the flag GFP_DMA to kmalloc.
+
+Unfortunately the memory available for ISA DMA is scarce so unless you
+allocate the memory during boot-up it's a good idea to also pass
+__GFP_REPEAT and __GFP_NOWARN to make the allocater try a bit harder.
+
+(This scarcity also means that you should allocate the buffer as
+early as possible and not release it until the driver is unloaded.)
+
+Part III - Address translation
+------------------------------
+
+To translate the virtual address to a physical use the normal DMA
+API. Do _not_ use isa_virt_to_phys() even though it does the same
+thing. The reason for this is that the function isa_virt_to_phys()
+will require a Kconfig dependency to ISA, not just ISA_DMA_API which
+is really all you need. Remember that even though the DMA controller
+has its origins in ISA it is used elsewhere.
+
+Note: x86_64 had a broken DMA API when it came to ISA but has since
+been fixed. If your arch has problems then fix the DMA API instead of
+reverting to the ISA functions.
+
+Part IV - Channels
+------------------
+
+A normal ISA DMA controller has 8 channels. The lower four are for
+8-bit transfers and the upper four are for 16-bit transfers.
+
+(Actually the DMA controller is really two separate controllers where
+channel 4 is used to give DMA access for the second controller (0-3).
+This means that of the four 16-bits channels only three are usable.)
+
+You allocate these in a similar fashion as all basic resources:
+
+extern int request_dma(unsigned int dmanr, const char * device_id);
+extern void free_dma(unsigned int dmanr);
+
+The ability to use 16-bit or 8-bit transfers is _not_ up to you as a
+driver author but depends on what the hardware supports. Check your
+specs or test different channels.
+
+Part V - Transfer data
+----------------------
+
+Now for the good stuff, the actual DMA transfer. :)
+
+Before you use any ISA DMA routines you need to claim the DMA lock
+using claim_dma_lock(). The reason is that some DMA operations are
+not atomic so only one driver may fiddle with the registers at a
+time.
+
+The first time you use the DMA controller you should call
+clear_dma_ff(). This clears an internal register in the DMA
+controller that is used for the non-atomic operations. As long as you
+(and everyone else) uses the locking functions then you only need to
+reset this once.
+
+Next, you tell the controller in which direction you intend to do the
+transfer using set_dma_mode(). Currently you have the options
+DMA_MODE_READ and DMA_MODE_WRITE.
+
+Set the address from where the transfer should start (this needs to
+be 16-bit aligned for 16-bit transfers) and how many bytes to
+transfer. Note that it's _bytes_. The DMA routines will do all the
+required translation to values that the DMA controller understands.
+
+The final step is enabling the DMA channel and releasing the DMA
+lock.
+
+Once the DMA transfer is finished (or timed out) you should disable
+the channel again. You should also check get_dma_residue() to make
+sure that all data has been transferred.
+
+Example:
+
+int flags, residue;
+
+flags = claim_dma_lock();
+
+clear_dma_ff();
+
+set_dma_mode(channel, DMA_MODE_WRITE);
+set_dma_addr(channel, phys_addr);
+set_dma_count(channel, num_bytes);
+
+dma_enable(channel);
+
+release_dma_lock(flags);
+
+while (!device_done());
+
+flags = claim_dma_lock();
+
+dma_disable(channel);
+
+residue = dma_get_residue(channel);
+if (residue != 0)
+ printk(KERN_ERR "driver: Incomplete DMA transfer!"
+ " %d bytes left!\n", residue);
+
+release_dma_lock(flags);
+
+Part VI - Suspend/resume
+------------------------
+
+It is the driver's responsibility to make sure that the machine isn't
+suspended while a DMA transfer is in progress. Also, all DMA settings
+are lost when the system suspends so if your driver relies on the DMA
+controller being in a certain state then you have to restore these
+registers upon resume.
diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt
new file mode 100644
index 0000000..b768cc0
--- /dev/null
+++ b/Documentation/DMA-attributes.txt
@@ -0,0 +1,33 @@
+ DMA attributes
+ ==============
+
+This document describes the semantics of the DMA attributes that are
+defined in linux/dma-attrs.h.
+
+DMA_ATTR_WRITE_BARRIER
+----------------------
+
+DMA_ATTR_WRITE_BARRIER is a (write) barrier attribute for DMA. DMA
+to a memory region with the DMA_ATTR_WRITE_BARRIER attribute forces
+all pending DMA writes to complete, and thus provides a mechanism to
+strictly order DMA from a device across all intervening busses and
+bridges. This barrier is not specific to a particular type of
+interconnect, it applies to the system as a whole, and so its
+implementation must account for the idiosyncracies of the system all
+the way from the DMA device to memory.
+
+As an example of a situation where DMA_ATTR_WRITE_BARRIER would be
+useful, suppose that a device does a DMA write to indicate that data is
+ready and available in memory. The DMA of the "completion indication"
+could race with data DMA. Mapping the memory used for completion
+indications with DMA_ATTR_WRITE_BARRIER would prevent the race.
+
+DMA_ATTR_WEAK_ORDERING
+----------------------
+
+DMA_ATTR_WEAK_ORDERING specifies that reads and writes to the mapping
+may be weakly ordered, that is that reads and writes may pass each other.
+
+Since it is optional for platforms to implement DMA_ATTR_WEAK_ORDERING,
+those that do not will simply ignore the attribute and exhibit default
+behavior.
diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt
new file mode 100644
index 0000000..c74fec8
--- /dev/null
+++ b/Documentation/DMA-mapping.txt
@@ -0,0 +1,766 @@
+ Dynamic DMA mapping
+ ===================
+
+ David S. Miller <davem@redhat.com>
+ Richard Henderson <rth@cygnus.com>
+ Jakub Jelinek <jakub@redhat.com>
+
+This document describes the DMA mapping system in terms of the pci_
+API. For a similar API that works for generic devices, see
+DMA-API.txt.
+
+Most of the 64bit platforms have special hardware that translates bus
+addresses (DMA addresses) into physical addresses. This is similar to
+how page tables and/or a TLB translates virtual addresses to physical
+addresses on a CPU. This is needed so that e.g. PCI devices can
+access with a Single Address Cycle (32bit DMA address) any page in the
+64bit physical address space. Previously in Linux those 64bit
+platforms had to set artificial limits on the maximum RAM size in the
+system, so that the virt_to_bus() static scheme works (the DMA address
+translation tables were simply filled on bootup to map each bus
+address to the physical page __pa(bus_to_virt())).
+
+So that Linux can use the dynamic DMA mapping, it needs some help from the
+drivers, namely it has to take into account that DMA addresses should be
+mapped only for the time they are actually used and unmapped after the DMA
+transfer.
+
+The following API will work of course even on platforms where no such
+hardware exists, see e.g. include/asm-i386/pci.h for how it is implemented on
+top of the virt_to_bus interface.
+
+First of all, you should make sure
+
+#include <linux/pci.h>
+
+is in your driver. This file will obtain for you the definition of the
+dma_addr_t (which can hold any valid DMA address for the platform)
+type which should be used everywhere you hold a DMA (bus) address
+returned from the DMA mapping functions.
+
+ What memory is DMA'able?
+
+The first piece of information you must know is what kernel memory can
+be used with the DMA mapping facilities. There has been an unwritten
+set of rules regarding this, and this text is an attempt to finally
+write them down.
+
+If you acquired your memory via the page allocator
+(i.e. __get_free_page*()) or the generic memory allocators
+(i.e. kmalloc() or kmem_cache_alloc()) then you may DMA to/from
+that memory using the addresses returned from those routines.
+
+This means specifically that you may _not_ use the memory/addresses
+returned from vmalloc() for DMA. It is possible to DMA to the
+_underlying_ memory mapped into a vmalloc() area, but this requires
+walking page tables to get the physical addresses, and then
+translating each of those pages back to a kernel address using
+something like __va(). [ EDIT: Update this when we integrate
+Gerd Knorr's generic code which does this. ]
+
+This rule also means that you may use neither kernel image addresses
+(items in data/text/bss segments), nor module image addresses, nor
+stack addresses for DMA. These could all be mapped somewhere entirely
+different than the rest of physical memory. Even if those classes of
+memory could physically work with DMA, you'd need to ensure the I/O
+buffers were cacheline-aligned. Without that, you'd see cacheline
+sharing problems (data corruption) on CPUs with DMA-incoherent caches.
+(The CPU could write to one word, DMA would write to a different one
+in the same cache line, and one of them could be overwritten.)
+
+Also, this means that you cannot take the return of a kmap()
+call and DMA to/from that. This is similar to vmalloc().
+
+What about block I/O and networking buffers? The block I/O and
+networking subsystems make sure that the buffers they use are valid
+for you to DMA from/to.
+
+ DMA addressing limitations
+
+Does your device have any DMA addressing limitations? For example, is
+your device only capable of driving the low order 24-bits of address
+on the PCI bus for SAC DMA transfers? If so, you need to inform the
+PCI layer of this fact.
+
+By default, the kernel assumes that your device can address the full
+32-bits in a SAC cycle. For a 64-bit DAC capable device, this needs
+to be increased. And for a device with limitations, as discussed in
+the previous paragraph, it needs to be decreased.
+
+pci_alloc_consistent() by default will return 32-bit DMA addresses.
+PCI-X specification requires PCI-X devices to support 64-bit
+addressing (DAC) for all transactions. And at least one platform (SGI
+SN2) requires 64-bit consistent allocations to operate correctly when
+the IO bus is in PCI-X mode. Therefore, like with pci_set_dma_mask(),
+it's good practice to call pci_set_consistent_dma_mask() to set the
+appropriate mask even if your device only supports 32-bit DMA
+(default) and especially if it's a PCI-X device.
+
+For correct operation, you must interrogate the PCI layer in your
+device probe routine to see if the PCI controller on the machine can
+properly support the DMA addressing limitation your device has. It is
+good style to do this even if your device holds the default setting,
+because this shows that you did think about these issues wrt. your
+device.
+
+The query is performed via a call to pci_set_dma_mask():
+
+ int pci_set_dma_mask(struct pci_dev *pdev, u64 device_mask);
+
+The query for consistent allocations is performed via a call to
+pci_set_consistent_dma_mask():
+
+ int pci_set_consistent_dma_mask(struct pci_dev *pdev, u64 device_mask);
+
+Here, pdev is a pointer to the PCI device struct of your device, and
+device_mask is a bit mask describing which bits of a PCI address your
+device supports. It returns zero if your card can perform DMA
+properly on the machine given the address mask you provided.
+
+If it returns non-zero, your device cannot perform DMA properly on
+this platform, and attempting to do so will result in undefined
+behavior. You must either use a different mask, or not use DMA.
+
+This means that in the failure case, you have three options:
+
+1) Use another DMA mask, if possible (see below).
+2) Use some non-DMA mode for data transfer, if possible.
+3) Ignore this device and do not initialize it.
+
+It is recommended that your driver print a kernel KERN_WARNING message
+when you end up performing either #2 or #3. In this manner, if a user
+of your driver reports that performance is bad or that the device is not
+even detected, you can ask them for the kernel messages to find out
+exactly why.
+
+The standard 32-bit addressing PCI device would do something like
+this:
+
+ if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) {
+ printk(KERN_WARNING
+ "mydev: No suitable DMA available.\n");
+ goto ignore_this_device;
+ }
+
+Another common scenario is a 64-bit capable device. The approach
+here is to try for 64-bit DAC addressing, but back down to a
+32-bit mask should that fail. The PCI platform code may fail the
+64-bit mask not because the platform is not capable of 64-bit
+addressing. Rather, it may fail in this case simply because
+32-bit SAC addressing is done more efficiently than DAC addressing.
+Sparc64 is one platform which behaves in this way.
+
+Here is how you would handle a 64-bit capable device which can drive
+all 64-bits when accessing streaming DMA:
+
+ int using_dac;
+
+ if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) {
+ using_dac = 1;
+ } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) {
+ using_dac = 0;
+ } else {
+ printk(KERN_WARNING
+ "mydev: No suitable DMA available.\n");
+ goto ignore_this_device;
+ }
+
+If a card is capable of using 64-bit consistent allocations as well,
+the case would look like this:
+
+ int using_dac, consistent_using_dac;
+
+ if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) {
+ using_dac = 1;
+ consistent_using_dac = 1;
+ pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
+ } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) {
+ using_dac = 0;
+ consistent_using_dac = 0;
+ pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
+ } else {
+ printk(KERN_WARNING
+ "mydev: No suitable DMA available.\n");
+ goto ignore_this_device;
+ }
+
+pci_set_consistent_dma_mask() will always be able to set the same or a
+smaller mask as pci_set_dma_mask(). However for the rare case that a
+device driver only uses consistent allocations, one would have to
+check the return value from pci_set_consistent_dma_mask().
+
+Finally, if your device can only drive the low 24-bits of
+address during PCI bus mastering you might do something like:
+
+ if (pci_set_dma_mask(pdev, DMA_24BIT_MASK)) {
+ printk(KERN_WARNING
+ "mydev: 24-bit DMA addressing not available.\n");
+ goto ignore_this_device;
+ }
+
+When pci_set_dma_mask() is successful, and returns zero, the PCI layer
+saves away this mask you have provided. The PCI layer will use this
+information later when you make DMA mappings.
+
+There is a case which we are aware of at this time, which is worth
+mentioning in this documentation. If your device supports multiple
+functions (for example a sound card provides playback and record
+functions) and the various different functions have _different_
+DMA addressing limitations, you may wish to probe each mask and
+only provide the functionality which the machine can handle. It
+is important that the last call to pci_set_dma_mask() be for the
+most specific mask.
+
+Here is pseudo-code showing how this might be done:
+
+ #define PLAYBACK_ADDRESS_BITS DMA_32BIT_MASK
+ #define RECORD_ADDRESS_BITS 0x00ffffff
+
+ struct my_sound_card *card;
+ struct pci_dev *pdev;
+
+ ...
+ if (!pci_set_dma_mask(pdev, PLAYBACK_ADDRESS_BITS)) {
+ card->playback_enabled = 1;
+ } else {
+ card->playback_enabled = 0;
+ printk(KERN_WARN "%s: Playback disabled due to DMA limitations.\n",
+ card->name);
+ }
+ if (!pci_set_dma_mask(pdev, RECORD_ADDRESS_BITS)) {
+ card->record_enabled = 1;
+ } else {
+ card->record_enabled = 0;
+ printk(KERN_WARN "%s: Record disabled due to DMA limitations.\n",
+ card->name);
+ }
+
+A sound card was used as an example here because this genre of PCI
+devices seems to be littered with ISA chips given a PCI front end,
+and thus retaining the 16MB DMA addressing limitations of ISA.
+
+ Types of DMA mappings
+
+There are two types of DMA mappings:
+
+- Consistent DMA mappings which are usually mapped at driver
+ initialization, unmapped at the end and for which the hardware should
+ guarantee that the device and the CPU can access the data
+ in parallel and will see updates made by each other without any
+ explicit software flushing.
+
+ Think of "consistent" as "synchronous" or "coherent".
+
+ The current default is to return consistent memory in the low 32
+ bits of the PCI bus space. However, for future compatibility you
+ should set the consistent mask even if this default is fine for your
+ driver.
+
+ Good examples of what to use consistent mappings for are:
+
+ - Network card DMA ring descriptors.
+ - SCSI adapter mailbox command data structures.
+ - Device firmware microcode executed out of
+ main memory.
+
+ The invariant these examples all require is that any CPU store
+ to memory is immediately visible to the device, and vice
+ versa. Consistent mappings guarantee this.
+
+ IMPORTANT: Consistent DMA memory does not preclude the usage of
+ proper memory barriers. The CPU may reorder stores to
+ consistent memory just as it may normal memory. Example:
+ if it is important for the device to see the first word
+ of a descriptor updated before the second, you must do
+ something like:
+
+ desc->word0 = address;
+ wmb();
+ desc->word1 = DESC_VALID;
+
+ in order to get correct behavior on all platforms.
+
+ Also, on some platforms your driver may need to flush CPU write
+ buffers in much the same way as it needs to flush write buffers
+ found in PCI bridges (such as by reading a register's value
+ after writing it).
+
+- Streaming DMA mappings which are usually mapped for one DMA transfer,
+ unmapped right after it (unless you use pci_dma_sync_* below) and for which
+ hardware can optimize for sequential accesses.
+
+ This of "streaming" as "asynchronous" or "outside the coherency
+ domain".
+
+ Good examples of what to use streaming mappings for are:
+
+ - Networking buffers transmitted/received by a device.
+ - Filesystem buffers written/read by a SCSI device.
+
+ The interfaces for using this type of mapping were designed in
+ such a way that an implementation can make whatever performance
+ optimizations the hardware allows. To this end, when using
+ such mappings you must be explicit about what you want to happen.
+
+Neither type of DMA mapping has alignment restrictions that come
+from PCI, although some devices may have such restrictions.
+Also, systems with caches that aren't DMA-coherent will work better
+when the underlying buffers don't share cache lines with other data.
+
+
+ Using Consistent DMA mappings.
+
+To allocate and map large (PAGE_SIZE or so) consistent DMA regions,
+you should do:
+
+ dma_addr_t dma_handle;
+
+ cpu_addr = pci_alloc_consistent(pdev, size, &dma_handle);
+
+where pdev is a struct pci_dev *. This may be called in interrupt context.
+You should use dma_alloc_coherent (see DMA-API.txt) for buses
+where devices don't have struct pci_dev (like ISA, EISA).
+
+This argument is needed because the DMA translations may be bus
+specific (and often is private to the bus which the device is attached
+to).
+
+Size is the length of the region you want to allocate, in bytes.
+
+This routine will allocate RAM for that region, so it acts similarly to
+__get_free_pages (but takes size instead of a page order). If your
+driver needs regions sized smaller than a page, you may prefer using
+the pci_pool interface, described below.
+
+The consistent DMA mapping interfaces, for non-NULL pdev, will by
+default return a DMA address which is SAC (Single Address Cycle)
+addressable. Even if the device indicates (via PCI dma mask) that it
+may address the upper 32-bits and thus perform DAC cycles, consistent
+allocation will only return > 32-bit PCI addresses for DMA if the
+consistent dma mask has been explicitly changed via
+pci_set_consistent_dma_mask(). This is true of the pci_pool interface
+as well.
+
+pci_alloc_consistent returns two values: the virtual address which you
+can use to access it from the CPU and dma_handle which you pass to the
+card.
+
+The cpu return address and the DMA bus master address are both
+guaranteed to be aligned to the smallest PAGE_SIZE order which
+is greater than or equal to the requested size. This invariant
+exists (for example) to guarantee that if you allocate a chunk
+which is smaller than or equal to 64 kilobytes, the extent of the
+buffer you receive will not cross a 64K boundary.
+
+To unmap and free such a DMA region, you call:
+
+ pci_free_consistent(pdev, size, cpu_addr, dma_handle);
+
+where pdev, size are the same as in the above call and cpu_addr and
+dma_handle are the values pci_alloc_consistent returned to you.
+This function may not be called in interrupt context.
+
+If your driver needs lots of smaller memory regions, you can write
+custom code to subdivide pages returned by pci_alloc_consistent,
+or you can use the pci_pool API to do that. A pci_pool is like
+a kmem_cache, but it uses pci_alloc_consistent not __get_free_pages.
+Also, it understands common hardware constraints for alignment,
+like queue heads needing to be aligned on N byte boundaries.
+
+Create a pci_pool like this:
+
+ struct pci_pool *pool;
+
+ pool = pci_pool_create(name, pdev, size, align, alloc);
+
+The "name" is for diagnostics (like a kmem_cache name); pdev and size
+are as above. The device's hardware alignment requirement for this
+type of data is "align" (which is expressed in bytes, and must be a
+power of two). If your device has no boundary crossing restrictions,
+pass 0 for alloc; passing 4096 says memory allocated from this pool
+must not cross 4KByte boundaries (but at that time it may be better to
+go for pci_alloc_consistent directly instead).
+
+Allocate memory from a pci pool like this:
+
+ cpu_addr = pci_pool_alloc(pool, flags, &dma_handle);
+
+flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor
+holding SMP locks), SLAB_ATOMIC otherwise. Like pci_alloc_consistent,
+this returns two values, cpu_addr and dma_handle.
+
+Free memory that was allocated from a pci_pool like this:
+
+ pci_pool_free(pool, cpu_addr, dma_handle);
+
+where pool is what you passed to pci_pool_alloc, and cpu_addr and
+dma_handle are the values pci_pool_alloc returned. This function
+may be called in interrupt context.
+
+Destroy a pci_pool by calling:
+
+ pci_pool_destroy(pool);
+
+Make sure you've called pci_pool_free for all memory allocated
+from a pool before you destroy the pool. This function may not
+be called in interrupt context.
+
+ DMA Direction
+
+The interfaces described in subsequent portions of this document
+take a DMA direction argument, which is an integer and takes on
+one of the following values:
+
+ PCI_DMA_BIDIRECTIONAL
+ PCI_DMA_TODEVICE
+ PCI_DMA_FROMDEVICE
+ PCI_DMA_NONE
+
+One should provide the exact DMA direction if you know it.
+
+PCI_DMA_TODEVICE means "from main memory to the PCI device"
+PCI_DMA_FROMDEVICE means "from the PCI device to main memory"
+It is the direction in which the data moves during the DMA
+transfer.
+
+You are _strongly_ encouraged to specify this as precisely
+as you possibly can.
+
+If you absolutely cannot know the direction of the DMA transfer,
+specify PCI_DMA_BIDIRECTIONAL. It means that the DMA can go in
+either direction. The platform guarantees that you may legally
+specify this, and that it will work, but this may be at the
+cost of performance for example.
+
+The value PCI_DMA_NONE is to be used for debugging. One can
+hold this in a data structure before you come to know the
+precise direction, and this will help catch cases where your
+direction tracking logic has failed to set things up properly.
+
+Another advantage of specifying this value precisely (outside of
+potential platform-specific optimizations of such) is for debugging.
+Some platforms actually have a write permission boolean which DMA
+mappings can be marked with, much like page protections in the user
+program address space. Such platforms can and do report errors in the
+kernel logs when the PCI controller hardware detects violation of the
+permission setting.
+
+Only streaming mappings specify a direction, consistent mappings
+implicitly have a direction attribute setting of
+PCI_DMA_BIDIRECTIONAL.
+
+The SCSI subsystem tells you the direction to use in the
+'sc_data_direction' member of the SCSI command your driver is
+working on.
+
+For Networking drivers, it's a rather simple affair. For transmit
+packets, map/unmap them with the PCI_DMA_TODEVICE direction
+specifier. For receive packets, just the opposite, map/unmap them
+with the PCI_DMA_FROMDEVICE direction specifier.
+
+ Using Streaming DMA mappings
+
+The streaming DMA mapping routines can be called from interrupt
+context. There are two versions of each map/unmap, one which will
+map/unmap a single memory region, and one which will map/unmap a
+scatterlist.
+
+To map a single region, you do:
+
+ struct pci_dev *pdev = mydev->pdev;
+ dma_addr_t dma_handle;
+ void *addr = buffer->ptr;
+ size_t size = buffer->len;
+
+ dma_handle = pci_map_single(pdev, addr, size, direction);
+
+and to unmap it:
+
+ pci_unmap_single(pdev, dma_handle, size, direction);
+
+You should call pci_unmap_single when the DMA activity is finished, e.g.
+from the interrupt which told you that the DMA transfer is done.
+
+Using cpu pointers like this for single mappings has a disadvantage,
+you cannot reference HIGHMEM memory in this way. Thus, there is a
+map/unmap interface pair akin to pci_{map,unmap}_single. These
+interfaces deal with page/offset pairs instead of cpu pointers.
+Specifically:
+
+ struct pci_dev *pdev = mydev->pdev;
+ dma_addr_t dma_handle;
+ struct page *page = buffer->page;
+ unsigned long offset = buffer->offset;
+ size_t size = buffer->len;
+
+ dma_handle = pci_map_page(pdev, page, offset, size, direction);
+
+ ...
+
+ pci_unmap_page(pdev, dma_handle, size, direction);
+
+Here, "offset" means byte offset within the given page.
+
+With scatterlists, you map a region gathered from several regions by:
+
+ int i, count = pci_map_sg(pdev, sglist, nents, direction);
+ struct scatterlist *sg;
+
+ for_each_sg(sglist, sg, count, i) {
+ hw_address[i] = sg_dma_address(sg);
+ hw_len[i] = sg_dma_len(sg);
+ }
+
+where nents is the number of entries in the sglist.
+
+The implementation is free to merge several consecutive sglist entries
+into one (e.g. if DMA mapping is done with PAGE_SIZE granularity, any
+consecutive sglist entries can be merged into one provided the first one
+ends and the second one starts on a page boundary - in fact this is a huge
+advantage for cards which either cannot do scatter-gather or have very
+limited number of scatter-gather entries) and returns the actual number
+of sg entries it mapped them to. On failure 0 is returned.
+
+Then you should loop count times (note: this can be less than nents times)
+and use sg_dma_address() and sg_dma_len() macros where you previously
+accessed sg->address and sg->length as shown above.
+
+To unmap a scatterlist, just call:
+
+ pci_unmap_sg(pdev, sglist, nents, direction);
+
+Again, make sure DMA activity has already finished.
+
+PLEASE NOTE: The 'nents' argument to the pci_unmap_sg call must be
+ the _same_ one you passed into the pci_map_sg call,
+ it should _NOT_ be the 'count' value _returned_ from the
+ pci_map_sg call.
+
+Every pci_map_{single,sg} call should have its pci_unmap_{single,sg}
+counterpart, because the bus address space is a shared resource (although
+in some ports the mapping is per each BUS so less devices contend for the
+same bus address space) and you could render the machine unusable by eating
+all bus addresses.
+
+If you need to use the same streaming DMA region multiple times and touch
+the data in between the DMA transfers, the buffer needs to be synced
+properly in order for the cpu and device to see the most uptodate and
+correct copy of the DMA buffer.
+
+So, firstly, just map it with pci_map_{single,sg}, and after each DMA
+transfer call either:
+
+ pci_dma_sync_single_for_cpu(pdev, dma_handle, size, direction);
+
+or:
+
+ pci_dma_sync_sg_for_cpu(pdev, sglist, nents, direction);
+
+as appropriate.
+
+Then, if you wish to let the device get at the DMA area again,
+finish accessing the data with the cpu, and then before actually
+giving the buffer to the hardware call either:
+
+ pci_dma_sync_single_for_device(pdev, dma_handle, size, direction);
+
+or:
+
+ pci_dma_sync_sg_for_device(dev, sglist, nents, direction);
+
+as appropriate.
+
+After the last DMA transfer call one of the DMA unmap routines
+pci_unmap_{single,sg}. If you don't touch the data from the first pci_map_*
+call till pci_unmap_*, then you don't have to call the pci_dma_sync_*
+routines at all.
+
+Here is pseudo code which shows a situation in which you would need
+to use the pci_dma_sync_*() interfaces.
+
+ my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
+ {
+ dma_addr_t mapping;
+
+ mapping = pci_map_single(cp->pdev, buffer, len, PCI_DMA_FROMDEVICE);
+
+ cp->rx_buf = buffer;
+ cp->rx_len = len;
+ cp->rx_dma = mapping;
+
+ give_rx_buf_to_card(cp);
+ }
+
+ ...
+
+ my_card_interrupt_handler(int irq, void *devid, struct pt_regs *regs)
+ {
+ struct my_card *cp = devid;
+
+ ...
+ if (read_card_status(cp) == RX_BUF_TRANSFERRED) {
+ struct my_card_header *hp;
+
+ /* Examine the header to see if we wish
+ * to accept the data. But synchronize
+ * the DMA transfer with the CPU first
+ * so that we see updated contents.
+ */
+ pci_dma_sync_single_for_cpu(cp->pdev, cp->rx_dma,
+ cp->rx_len,
+ PCI_DMA_FROMDEVICE);
+
+ /* Now it is safe to examine the buffer. */
+ hp = (struct my_card_header *) cp->rx_buf;
+ if (header_is_ok(hp)) {
+ pci_unmap_single(cp->pdev, cp->rx_dma, cp->rx_len,
+ PCI_DMA_FROMDEVICE);
+ pass_to_upper_layers(cp->rx_buf);
+ make_and_setup_new_rx_buf(cp);
+ } else {
+ /* Just sync the buffer and give it back
+ * to the card.
+ */
+ pci_dma_sync_single_for_device(cp->pdev,
+ cp->rx_dma,
+ cp->rx_len,
+ PCI_DMA_FROMDEVICE);
+ give_rx_buf_to_card(cp);
+ }
+ }
+ }
+
+Drivers converted fully to this interface should not use virt_to_bus any
+longer, nor should they use bus_to_virt. Some drivers have to be changed a
+little bit, because there is no longer an equivalent to bus_to_virt in the
+dynamic DMA mapping scheme - you have to always store the DMA addresses
+returned by the pci_alloc_consistent, pci_pool_alloc, and pci_map_single
+calls (pci_map_sg stores them in the scatterlist itself if the platform
+supports dynamic DMA mapping in hardware) in your driver structures and/or
+in the card registers.
+
+All PCI drivers should be using these interfaces with no exceptions.
+It is planned to completely remove virt_to_bus() and bus_to_virt() as
+they are entirely deprecated. Some ports already do not provide these
+as it is impossible to correctly support them.
+
+ Optimizing Unmap State Space Consumption
+
+On many platforms, pci_unmap_{single,page}() is simply a nop.
+Therefore, keeping track of the mapping address and length is a waste
+of space. Instead of filling your drivers up with ifdefs and the like
+to "work around" this (which would defeat the whole purpose of a
+portable API) the following facilities are provided.
+
+Actually, instead of describing the macros one by one, we'll
+transform some example code.
+
+1) Use DECLARE_PCI_UNMAP_{ADDR,LEN} in state saving structures.
+ Example, before:
+
+ struct ring_state {
+ struct sk_buff *skb;
+ dma_addr_t mapping;
+ __u32 len;
+ };
+
+ after:
+
+ struct ring_state {
+ struct sk_buff *skb;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+ DECLARE_PCI_UNMAP_LEN(len)
+ };
+
+ NOTE: DO NOT put a semicolon at the end of the DECLARE_*()
+ macro.
+
+2) Use pci_unmap_{addr,len}_set to set these values.
+ Example, before:
+
+ ringp->mapping = FOO;
+ ringp->len = BAR;
+
+ after:
+
+ pci_unmap_addr_set(ringp, mapping, FOO);
+ pci_unmap_len_set(ringp, len, BAR);
+
+3) Use pci_unmap_{addr,len} to access these values.
+ Example, before:
+
+ pci_unmap_single(pdev, ringp->mapping, ringp->len,
+ PCI_DMA_FROMDEVICE);
+
+ after:
+
+ pci_unmap_single(pdev,
+ pci_unmap_addr(ringp, mapping),
+ pci_unmap_len(ringp, len),
+ PCI_DMA_FROMDEVICE);
+
+It really should be self-explanatory. We treat the ADDR and LEN
+separately, because it is possible for an implementation to only
+need the address in order to perform the unmap operation.
+
+ Platform Issues
+
+If you are just writing drivers for Linux and do not maintain
+an architecture port for the kernel, you can safely skip down
+to "Closing".
+
+1) Struct scatterlist requirements.
+
+ Struct scatterlist must contain, at a minimum, the following
+ members:
+
+ struct page *page;
+ unsigned int offset;
+ unsigned int length;
+
+ The base address is specified by a "page+offset" pair.
+
+ Previous versions of struct scatterlist contained a "void *address"
+ field that was sometimes used instead of page+offset. As of Linux
+ 2.5., page+offset is always used, and the "address" field has been
+ deleted.
+
+2) More to come...
+
+ Handling Errors
+
+DMA address space is limited on some architectures and an allocation
+failure can be determined by:
+
+- checking if pci_alloc_consistent returns NULL or pci_map_sg returns 0
+
+- checking the returned dma_addr_t of pci_map_single and pci_map_page
+ by using pci_dma_mapping_error():
+
+ dma_addr_t dma_handle;
+
+ dma_handle = pci_map_single(pdev, addr, size, direction);
+ if (pci_dma_mapping_error(pdev, dma_handle)) {
+ /*
+ * reduce current DMA mapping usage,
+ * delay and try again later or
+ * reset driver.
+ */
+ }
+
+ Closing
+
+This document, and the API itself, would not be in it's current
+form without the feedback and suggestions from numerous individuals.
+We would like to specifically mention, in no particular order, the
+following people:
+
+ Russell King <rmk@arm.linux.org.uk>
+ Leo Dagum <dagum@barrel.engr.sgi.com>
+ Ralf Baechle <ralf@oss.sgi.com>
+ Grant Grundler <grundler@cup.hp.com>
+ Jay Estabrook <Jay.Estabrook@compaq.com>
+ Thomas Sailer <sailer@ife.ee.ethz.ch>
+ Andrea Arcangeli <andrea@suse.de>
+ Jens Axboe <jens.axboe@oracle.com>
+ David Mosberger-Tang <davidm@hpl.hp.com>
diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore
new file mode 100644
index 0000000..c102c02
--- /dev/null
+++ b/Documentation/DocBook/.gitignore
@@ -0,0 +1,6 @@
+*.xml
+*.ps
+*.pdf
+*.html
+*.9.gz
+*.9
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
new file mode 100644
index 0000000..9b1f6ca
--- /dev/null
+++ b/Documentation/DocBook/Makefile
@@ -0,0 +1,240 @@
+###
+# This makefile is used to generate the kernel documentation,
+# primarily based on in-line comments in various source files.
+# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how
+# to document the SRC - and how to read it.
+# To add a new book the only step required is to add the book to the
+# list of DOCBOOKS.
+
+DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml \
+ kernel-hacking.xml kernel-locking.xml deviceiobook.xml \
+ procfs-guide.xml writing_usb_driver.xml networking.xml \
+ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \
+ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
+ genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
+ mac80211.xml debugobjects.xml sh.xml
+
+###
+# The build process is as follows (targets):
+# (xmldocs) [by docproc]
+# file.tmpl --> file.xml +--> file.ps (psdocs) [by db2ps or xmlto]
+# +--> file.pdf (pdfdocs) [by db2pdf or xmlto]
+# +--> DIR=file (htmldocs) [by xmlto]
+# +--> man/ (mandocs) [by xmlto]
+
+
+# for PDF and PS output you can choose between xmlto and docbook-utils tools
+PDF_METHOD = $(prefer-db2x)
+PS_METHOD = $(prefer-db2x)
+
+
+###
+# The targets that may be used.
+PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs
+
+BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
+xmldocs: $(BOOKS)
+sgmldocs: xmldocs
+
+PS := $(patsubst %.xml, %.ps, $(BOOKS))
+psdocs: $(PS)
+
+PDF := $(patsubst %.xml, %.pdf, $(BOOKS))
+pdfdocs: $(PDF)
+
+HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS)))
+htmldocs: $(HTML)
+ $(call build_main_index)
+
+MAN := $(patsubst %.xml, %.9, $(BOOKS))
+mandocs: $(MAN)
+
+installmandocs: mandocs
+ mkdir -p /usr/local/man/man9/
+ install Documentation/DocBook/man/*.9.gz /usr/local/man/man9/
+
+###
+#External programs used
+KERNELDOC = $(srctree)/scripts/kernel-doc
+DOCPROC = $(objtree)/scripts/basic/docproc
+
+XMLTOFLAGS = -m $(srctree)/Documentation/DocBook/stylesheet.xsl
+#XMLTOFLAGS += --skip-validation
+
+###
+# DOCPROC is used for two purposes:
+# 1) To generate a dependency list for a .tmpl file
+# 2) To preprocess a .tmpl file and call kernel-doc with
+# appropriate parameters.
+# The following rules are used to generate the .xml documentation
+# required to generate the final targets. (ps, pdf, html).
+quiet_cmd_docproc = DOCPROC $@
+ cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@
+define rule_docproc
+ set -e; \
+ $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \
+ $(cmd_$(1)); \
+ ( \
+ echo 'cmd_$@ := $(cmd_$(1))'; \
+ echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \
+ ) > $(dir $@).$(notdir $@).cmd
+endef
+
+%.xml: %.tmpl FORCE
+ $(call if_changed_rule,docproc)
+
+###
+#Read in all saved dependency files
+cmd_files := $(wildcard $(foreach f,$(BOOKS),$(dir $(f)).$(notdir $(f)).cmd))
+
+ifneq ($(cmd_files),)
+ include $(cmd_files)
+endif
+
+###
+# Changes in kernel-doc force a rebuild of all documentation
+$(BOOKS): $(KERNELDOC)
+
+###
+# procfs guide uses a .c file as example code.
+# This requires an explicit dependency
+C-procfs-example = procfs_example.xml
+C-procfs-example2 = $(addprefix $(obj)/,$(C-procfs-example))
+$(obj)/procfs-guide.xml: $(C-procfs-example2)
+
+# List of programs to build
+##oops, this is a kernel module::hostprogs-y := procfs_example
+obj-m += procfs_example.o
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \
+ exit 1
+db2xtemplate = db2TYPE -o $(dir $@) $<
+xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $<
+
+# determine which methods are available
+ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found)
+ use-db2x = db2x
+ prefer-db2x = db2x
+else
+ use-db2x = notfound
+ prefer-db2x = $(use-xmlto)
+endif
+ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found)
+ use-xmlto = xmlto
+ prefer-xmlto = xmlto
+else
+ use-xmlto = notfound
+ prefer-xmlto = $(use-db2x)
+endif
+
+# the commands, generated from the chosen template
+quiet_cmd_db2ps = PS $@
+ cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template))
+%.ps : %.xml
+ $(call cmd,db2ps)
+
+quiet_cmd_db2pdf = PDF $@
+ cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template))
+%.pdf : %.xml
+ $(call cmd,db2pdf)
+
+
+main_idx = Documentation/DocBook/index.html
+build_main_index = rm -rf $(main_idx) && \
+ echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \
+ echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \
+ cat $(HTML) >> $(main_idx)
+
+quiet_cmd_db2html = HTML $@
+ cmd_db2html = xmlto xhtml $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \
+ echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \
+ $(patsubst %.html,%,$(notdir $@))</a><p>' > $@
+
+%.html: %.xml
+ @(which xmlto > /dev/null 2>&1) || \
+ (echo "*** You need to install xmlto ***"; \
+ exit 1)
+ @rm -rf $@ $(patsubst %.html,%,$@)
+ $(call cmd,db2html)
+ @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \
+ cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi
+
+quiet_cmd_db2man = MAN $@
+ cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man $< ; gzip -f $(obj)/man/*.9; fi
+%.9 : %.xml
+ @(which xmlto > /dev/null 2>&1) || \
+ (echo "*** You need to install xmlto ***"; \
+ exit 1)
+ $(Q)mkdir -p $(obj)/man
+ $(call cmd,db2man)
+ @touch $@
+
+###
+# Rules to generate postscripts and PNG images from .fig format files
+quiet_cmd_fig2eps = FIG2EPS $@
+ cmd_fig2eps = fig2dev -Leps $< $@
+
+%.eps: %.fig
+ @(which fig2dev > /dev/null 2>&1) || \
+ (echo "*** You need to install transfig ***"; \
+ exit 1)
+ $(call cmd,fig2eps)
+
+quiet_cmd_fig2png = FIG2PNG $@
+ cmd_fig2png = fig2dev -Lpng $< $@
+
+%.png: %.fig
+ @(which fig2dev > /dev/null 2>&1) || \
+ (echo "*** You need to install transfig ***"; \
+ exit 1)
+ $(call cmd,fig2png)
+
+###
+# Rule to convert a .c file to inline XML documentation
+ gen_xml = :
+ quiet_gen_xml = echo ' GEN $@'
+silent_gen_xml = :
+%.xml: %.c
+ @$($(quiet)gen_xml)
+ @( \
+ echo "<programlisting>"; \
+ expand --tabs=8 < $< | \
+ sed -e "s/&/\\&amp;/g" \
+ -e "s/</\\&lt;/g" \
+ -e "s/>/\\&gt;/g"; \
+ echo "</programlisting>") > $@
+
+###
+# Help targets as used by the top-level makefile
+dochelp:
+ @echo ' Linux kernel internal documentation in different formats:'
+ @echo ' htmldocs - HTML'
+ @echo ' installmandocs - install man pages generated by mandocs'
+ @echo ' mandocs - man pages'
+ @echo ' pdfdocs - PDF'
+ @echo ' psdocs - Postscript'
+ @echo ' xmldocs - XML DocBook'
+
+###
+# Temporary files left by various tools
+clean-files := $(DOCBOOKS) \
+ $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.aux, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.tex, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.log, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.out, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.ps, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.html, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.9, $(DOCBOOKS)) \
+ $(C-procfs-example)
+
+clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
+
+# Declare the contents of the .PHONY variable as phony. We keep that
+# information in a variable se we can use it in if_changed and friends.
+
+.PHONY: $(PHONY)
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl
new file mode 100644
index 0000000..7f5f218
--- /dev/null
+++ b/Documentation/DocBook/debugobjects.tmpl
@@ -0,0 +1,391 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="debug-objects-guide">
+ <bookinfo>
+ <title>Debug objects life time</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Thomas</firstname>
+ <surname>Gleixner</surname>
+ <affiliation>
+ <address>
+ <email>tglx@linutronix.de</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2008</year>
+ <holder>Thomas Gleixner</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ debugobjects is a generic infrastructure to track the life time
+ of kernel objects and validate the operations on those.
+ </para>
+ <para>
+ debugobjects is useful to check for the following error patterns:
+ <itemizedlist>
+ <listitem><para>Activation of uninitialized objects</para></listitem>
+ <listitem><para>Initialization of active objects</para></listitem>
+ <listitem><para>Usage of freed/destroyed objects</para></listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ debugobjects is not changing the data structure of the real
+ object so it can be compiled in with a minimal runtime impact
+ and enabled on demand with a kernel command line option.
+ </para>
+ </chapter>
+
+ <chapter id="howto">
+ <title>Howto use debugobjects</title>
+ <para>
+ A kernel subsystem needs to provide a data structure which
+ describes the object type and add calls into the debug code at
+ appropriate places. The data structure to describe the object
+ type needs at minimum the name of the object type. Optional
+ functions can and should be provided to fixup detected problems
+ so the kernel can continue to work and the debug information can
+ be retrieved from a live system instead of hard core debugging
+ with serial consoles and stack trace transcripts from the
+ monitor.
+ </para>
+ <para>
+ The debug calls provided by debugobjects are:
+ <itemizedlist>
+ <listitem><para>debug_object_init</para></listitem>
+ <listitem><para>debug_object_init_on_stack</para></listitem>
+ <listitem><para>debug_object_activate</para></listitem>
+ <listitem><para>debug_object_deactivate</para></listitem>
+ <listitem><para>debug_object_destroy</para></listitem>
+ <listitem><para>debug_object_free</para></listitem>
+ </itemizedlist>
+ Each of these functions takes the address of the real object and
+ a pointer to the object type specific debug description
+ structure.
+ </para>
+ <para>
+ Each detected error is reported in the statistics and a limited
+ number of errors are printk'ed including a full stack trace.
+ </para>
+ <para>
+ The statistics are available via debugfs/debug_objects/stats.
+ They provide information about the number of warnings and the
+ number of successful fixups along with information about the
+ usage of the internal tracking objects and the state of the
+ internal tracking objects pool.
+ </para>
+ </chapter>
+ <chapter id="debugfunctions">
+ <title>Debug functions</title>
+ <sect1 id="prototypes">
+ <title>Debug object function reference</title>
+!Elib/debugobjects.c
+ </sect1>
+ <sect1 id="debug_object_init">
+ <title>debug_object_init</title>
+ <para>
+ This function is called whenever the initialization function
+ of a real object is called.
+ </para>
+ <para>
+ When the real object is already tracked by debugobjects it is
+ checked, whether the object can be initialized. Initializing
+ is not allowed for active and destroyed objects. When
+ debugobjects detects an error, then it calls the fixup_init
+ function of the object type description structure if provided
+ by the caller. The fixup function can correct the problem
+ before the real initialization of the object happens. E.g. it
+ can deactivate an active object in order to prevent damage to
+ the subsystem.
+ </para>
+ <para>
+ When the real object is not yet tracked by debugobjects,
+ debugobjects allocates a tracker object for the real object
+ and sets the tracker object state to ODEBUG_STATE_INIT. It
+ verifies that the object is not on the callers stack. If it is
+ on the callers stack then a limited number of warnings
+ including a full stack trace is printk'ed. The calling code
+ must use debug_object_init_on_stack() and remove the object
+ before leaving the function which allocated it. See next
+ section.
+ </para>
+ </sect1>
+
+ <sect1 id="debug_object_init_on_stack">
+ <title>debug_object_init_on_stack</title>
+ <para>
+ This function is called whenever the initialization function
+ of a real object which resides on the stack is called.
+ </para>
+ <para>
+ When the real object is already tracked by debugobjects it is
+ checked, whether the object can be initialized. Initializing
+ is not allowed for active and destroyed objects. When
+ debugobjects detects an error, then it calls the fixup_init
+ function of the object type description structure if provided
+ by the caller. The fixup function can correct the problem
+ before the real initialization of the object happens. E.g. it
+ can deactivate an active object in order to prevent damage to
+ the subsystem.
+ </para>
+ <para>
+ When the real object is not yet tracked by debugobjects
+ debugobjects allocates a tracker object for the real object
+ and sets the tracker object state to ODEBUG_STATE_INIT. It
+ verifies that the object is on the callers stack.
+ </para>
+ <para>
+ An object which is on the stack must be removed from the
+ tracker by calling debug_object_free() before the function
+ which allocates the object returns. Otherwise we keep track of
+ stale objects.
+ </para>
+ </sect1>
+
+ <sect1 id="debug_object_activate">
+ <title>debug_object_activate</title>
+ <para>
+ This function is called whenever the activation function of a
+ real object is called.
+ </para>
+ <para>
+ When the real object is already tracked by debugobjects it is
+ checked, whether the object can be activated. Activating is
+ not allowed for active and destroyed objects. When
+ debugobjects detects an error, then it calls the
+ fixup_activate function of the object type description
+ structure if provided by the caller. The fixup function can
+ correct the problem before the real activation of the object
+ happens. E.g. it can deactivate an active object in order to
+ prevent damage to the subsystem.
+ </para>
+ <para>
+ When the real object is not yet tracked by debugobjects then
+ the fixup_activate function is called if available. This is
+ necessary to allow the legitimate activation of statically
+ allocated and initialized objects. The fixup function checks
+ whether the object is valid and calls the debug_objects_init()
+ function to initialize the tracking of this object.
+ </para>
+ <para>
+ When the activation is legitimate, then the state of the
+ associated tracker object is set to ODEBUG_STATE_ACTIVE.
+ </para>
+ </sect1>
+
+ <sect1 id="debug_object_deactivate">
+ <title>debug_object_deactivate</title>
+ <para>
+ This function is called whenever the deactivation function of
+ a real object is called.
+ </para>
+ <para>
+ When the real object is tracked by debugobjects it is checked,
+ whether the object can be deactivated. Deactivating is not
+ allowed for untracked or destroyed objects.
+ </para>
+ <para>
+ When the deactivation is legitimate, then the state of the
+ associated tracker object is set to ODEBUG_STATE_INACTIVE.
+ </para>
+ </sect1>
+
+ <sect1 id="debug_object_destroy">
+ <title>debug_object_destroy</title>
+ <para>
+ This function is called to mark an object destroyed. This is
+ useful to prevent the usage of invalid objects, which are
+ still available in memory: either statically allocated objects
+ or objects which are freed later.
+ </para>
+ <para>
+ When the real object is tracked by debugobjects it is checked,
+ whether the object can be destroyed. Destruction is not
+ allowed for active and destroyed objects. When debugobjects
+ detects an error, then it calls the fixup_destroy function of
+ the object type description structure if provided by the
+ caller. The fixup function can correct the problem before the
+ real destruction of the object happens. E.g. it can deactivate
+ an active object in order to prevent damage to the subsystem.
+ </para>
+ <para>
+ When the destruction is legitimate, then the state of the
+ associated tracker object is set to ODEBUG_STATE_DESTROYED.
+ </para>
+ </sect1>
+
+ <sect1 id="debug_object_free">
+ <title>debug_object_free</title>
+ <para>
+ This function is called before an object is freed.
+ </para>
+ <para>
+ When the real object is tracked by debugobjects it is checked,
+ whether the object can be freed. Free is not allowed for
+ active objects. When debugobjects detects an error, then it
+ calls the fixup_free function of the object type description
+ structure if provided by the caller. The fixup function can
+ correct the problem before the real free of the object
+ happens. E.g. it can deactivate an active object in order to
+ prevent damage to the subsystem.
+ </para>
+ <para>
+ Note that debug_object_free removes the object from the
+ tracker. Later usage of the object is detected by the other
+ debug checks.
+ </para>
+ </sect1>
+ </chapter>
+ <chapter id="fixupfunctions">
+ <title>Fixup functions</title>
+ <sect1 id="debug_obj_descr">
+ <title>Debug object type description structure</title>
+!Iinclude/linux/debugobjects.h
+ </sect1>
+ <sect1 id="fixup_init">
+ <title>fixup_init</title>
+ <para>
+ This function is called from the debug code whenever a problem
+ in debug_object_init is detected. The function takes the
+ address of the object and the state which is currently
+ recorded in the tracker.
+ </para>
+ <para>
+ Called from debug_object_init when the object state is:
+ <itemizedlist>
+ <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ The function returns 1 when the fixup was successful,
+ otherwise 0. The return value is used to update the
+ statistics.
+ </para>
+ <para>
+ Note, that the function needs to call the debug_object_init()
+ function again, after the damage has been repaired in order to
+ keep the state consistent.
+ </para>
+ </sect1>
+
+ <sect1 id="fixup_activate">
+ <title>fixup_activate</title>
+ <para>
+ This function is called from the debug code whenever a problem
+ in debug_object_activate is detected.
+ </para>
+ <para>
+ Called from debug_object_activate when the object state is:
+ <itemizedlist>
+ <listitem><para>ODEBUG_STATE_NOTAVAILABLE</para></listitem>
+ <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ The function returns 1 when the fixup was successful,
+ otherwise 0. The return value is used to update the
+ statistics.
+ </para>
+ <para>
+ Note that the function needs to call the debug_object_activate()
+ function again after the damage has been repaired in order to
+ keep the state consistent.
+ </para>
+ <para>
+ The activation of statically initialized objects is a special
+ case. When debug_object_activate() has no tracked object for
+ this object address then fixup_activate() is called with
+ object state ODEBUG_STATE_NOTAVAILABLE. The fixup function
+ needs to check whether this is a legitimate case of a
+ statically initialized object or not. In case it is it calls
+ debug_object_init() and debug_object_activate() to make the
+ object known to the tracker and marked active. In this case
+ the function should return 0 because this is not a real fixup.
+ </para>
+ </sect1>
+
+ <sect1 id="fixup_destroy">
+ <title>fixup_destroy</title>
+ <para>
+ This function is called from the debug code whenever a problem
+ in debug_object_destroy is detected.
+ </para>
+ <para>
+ Called from debug_object_destroy when the object state is:
+ <itemizedlist>
+ <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ The function returns 1 when the fixup was successful,
+ otherwise 0. The return value is used to update the
+ statistics.
+ </para>
+ </sect1>
+ <sect1 id="fixup_free">
+ <title>fixup_free</title>
+ <para>
+ This function is called from the debug code whenever a problem
+ in debug_object_free is detected. Further it can be called
+ from the debug checks in kfree/vfree, when an active object is
+ detected from the debug_check_no_obj_freed() sanity checks.
+ </para>
+ <para>
+ Called from debug_object_free() or debug_check_no_obj_freed()
+ when the object state is:
+ <itemizedlist>
+ <listitem><para>ODEBUG_STATE_ACTIVE</para></listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ The function returns 1 when the fixup was successful,
+ otherwise 0. The return value is used to update the
+ statistics.
+ </para>
+ </sect1>
+ </chapter>
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None (knock on wood).
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl
new file mode 100644
index 0000000..3ed8812
--- /dev/null
+++ b/Documentation/DocBook/deviceiobook.tmpl
@@ -0,0 +1,323 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="DoingIO">
+ <bookinfo>
+ <title>Bus-Independent Device Accesses</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Matthew</firstname>
+ <surname>Wilcox</surname>
+ <affiliation>
+ <address>
+ <email>matthew@wil.cx</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@lxorguk.ukuu.org.uk</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2001</year>
+ <holder>Matthew Wilcox</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ Linux provides an API which abstracts performing IO across all busses
+ and devices, allowing device drivers to be written independently of
+ bus type.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None.
+ </para>
+ </chapter>
+
+ <chapter id="mmio">
+ <title>Memory Mapped IO</title>
+ <sect1 id="getting_access_to_the_device">
+ <title>Getting Access to the Device</title>
+ <para>
+ The most widely supported form of IO is memory mapped IO.
+ That is, a part of the CPU's address space is interpreted
+ not as accesses to memory, but as accesses to a device. Some
+ architectures define devices to be at a fixed address, but most
+ have some method of discovering devices. The PCI bus walk is a
+ good example of such a scheme. This document does not cover how
+ to receive such an address, but assumes you are starting with one.
+ Physical addresses are of type unsigned long.
+ </para>
+
+ <para>
+ This address should not be used directly. Instead, to get an
+ address suitable for passing to the accessor functions described
+ below, you should call <function>ioremap</function>.
+ An address suitable for accessing the device will be returned to you.
+ </para>
+
+ <para>
+ After you've finished using the device (say, in your module's
+ exit routine), call <function>iounmap</function> in order to return
+ the address space to the kernel. Most architectures allocate new
+ address space each time you call <function>ioremap</function>, and
+ they can run out unless you call <function>iounmap</function>.
+ </para>
+ </sect1>
+
+ <sect1 id="accessing_the_device">
+ <title>Accessing the device</title>
+ <para>
+ The part of the interface most used by drivers is reading and
+ writing memory-mapped registers on the device. Linux provides
+ interfaces to read and write 8-bit, 16-bit, 32-bit and 64-bit
+ quantities. Due to a historical accident, these are named byte,
+ word, long and quad accesses. Both read and write accesses are
+ supported; there is no prefetch support at this time.
+ </para>
+
+ <para>
+ The functions are named <function>readb</function>,
+ <function>readw</function>, <function>readl</function>,
+ <function>readq</function>, <function>readb_relaxed</function>,
+ <function>readw_relaxed</function>, <function>readl_relaxed</function>,
+ <function>readq_relaxed</function>, <function>writeb</function>,
+ <function>writew</function>, <function>writel</function> and
+ <function>writeq</function>.
+ </para>
+
+ <para>
+ Some devices (such as framebuffers) would like to use larger
+ transfers than 8 bytes at a time. For these devices, the
+ <function>memcpy_toio</function>, <function>memcpy_fromio</function>
+ and <function>memset_io</function> functions are provided.
+ Do not use memset or memcpy on IO addresses; they
+ are not guaranteed to copy data in order.
+ </para>
+
+ <para>
+ The read and write functions are defined to be ordered. That is the
+ compiler is not permitted to reorder the I/O sequence. When the
+ ordering can be compiler optimised, you can use <function>
+ __readb</function> and friends to indicate the relaxed ordering. Use
+ this with care.
+ </para>
+
+ <para>
+ While the basic functions are defined to be synchronous with respect
+ to each other and ordered with respect to each other the busses the
+ devices sit on may themselves have asynchronicity. In particular many
+ authors are burned by the fact that PCI bus writes are posted
+ asynchronously. A driver author must issue a read from the same
+ device to ensure that writes have occurred in the specific cases the
+ author cares. This kind of property cannot be hidden from driver
+ writers in the API. In some cases, the read used to flush the device
+ may be expected to fail (if the card is resetting, for example). In
+ that case, the read should be done from config space, which is
+ guaranteed to soft-fail if the card doesn't respond.
+ </para>
+
+ <para>
+ The following is an example of flushing a write to a device when
+ the driver would like to ensure the write's effects are visible prior
+ to continuing execution.
+ </para>
+
+<programlisting>
+static inline void
+qla1280_disable_intrs(struct scsi_qla_host *ha)
+{
+ struct device_reg *reg;
+
+ reg = ha->iobase;
+ /* disable risc and host interrupts */
+ WRT_REG_WORD(&amp;reg->ictrl, 0);
+ /*
+ * The following read will ensure that the above write
+ * has been received by the device before we return from this
+ * function.
+ */
+ RD_REG_WORD(&amp;reg->ictrl);
+ ha->flags.ints_enabled = 0;
+}
+</programlisting>
+
+ <para>
+ In addition to write posting, on some large multiprocessing systems
+ (e.g. SGI Challenge, Origin and Altix machines) posted writes won't
+ be strongly ordered coming from different CPUs. Thus it's important
+ to properly protect parts of your driver that do memory-mapped writes
+ with locks and use the <function>mmiowb</function> to make sure they
+ arrive in the order intended. Issuing a regular <function>readX
+ </function> will also ensure write ordering, but should only be used
+ when the driver has to be sure that the write has actually arrived
+ at the device (not that it's simply ordered with respect to other
+ writes), since a full <function>readX</function> is a relatively
+ expensive operation.
+ </para>
+
+ <para>
+ Generally, one should use <function>mmiowb</function> prior to
+ releasing a spinlock that protects regions using <function>writeb
+ </function> or similar functions that aren't surrounded by <function>
+ readb</function> calls, which will ensure ordering and flushing. The
+ following pseudocode illustrates what might occur if write ordering
+ isn't guaranteed via <function>mmiowb</function> or one of the
+ <function>readX</function> functions.
+ </para>
+
+<programlisting>
+CPU A: spin_lock_irqsave(&amp;dev_lock, flags)
+CPU A: ...
+CPU A: writel(newval, ring_ptr);
+CPU A: spin_unlock_irqrestore(&amp;dev_lock, flags)
+ ...
+CPU B: spin_lock_irqsave(&amp;dev_lock, flags)
+CPU B: writel(newval2, ring_ptr);
+CPU B: ...
+CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
+</programlisting>
+
+ <para>
+ In the case above, newval2 could be written to ring_ptr before
+ newval. Fixing it is easy though:
+ </para>
+
+<programlisting>
+CPU A: spin_lock_irqsave(&amp;dev_lock, flags)
+CPU A: ...
+CPU A: writel(newval, ring_ptr);
+CPU A: mmiowb(); /* ensure no other writes beat us to the device */
+CPU A: spin_unlock_irqrestore(&amp;dev_lock, flags)
+ ...
+CPU B: spin_lock_irqsave(&amp;dev_lock, flags)
+CPU B: writel(newval2, ring_ptr);
+CPU B: ...
+CPU B: mmiowb();
+CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
+</programlisting>
+
+ <para>
+ See tg3.c for a real world example of how to use <function>mmiowb
+ </function>
+ </para>
+
+ <para>
+ PCI ordering rules also guarantee that PIO read responses arrive
+ after any outstanding DMA writes from that bus, since for some devices
+ the result of a <function>readb</function> call may signal to the
+ driver that a DMA transaction is complete. In many cases, however,
+ the driver may want to indicate that the next
+ <function>readb</function> call has no relation to any previous DMA
+ writes performed by the device. The driver can use
+ <function>readb_relaxed</function> for these cases, although only
+ some platforms will honor the relaxed semantics. Using the relaxed
+ read functions will provide significant performance benefits on
+ platforms that support it. The qla2xxx driver provides examples
+ of how to use <function>readX_relaxed</function>. In many cases,
+ a majority of the driver's <function>readX</function> calls can
+ safely be converted to <function>readX_relaxed</function> calls, since
+ only a few will indicate or depend on DMA completion.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="port_space_accesses">
+ <title>Port Space Accesses</title>
+ <sect1 id="port_space_explained">
+ <title>Port Space Explained</title>
+
+ <para>
+ Another form of IO commonly supported is Port Space. This is a
+ range of addresses separate to the normal memory address space.
+ Access to these addresses is generally not as fast as accesses
+ to the memory mapped addresses, and it also has a potentially
+ smaller address space.
+ </para>
+
+ <para>
+ Unlike memory mapped IO, no preparation is required
+ to access port space.
+ </para>
+
+ </sect1>
+ <sect1 id="accessing_port_space">
+ <title>Accessing Port Space</title>
+ <para>
+ Accesses to this space are provided through a set of functions
+ which allow 8-bit, 16-bit and 32-bit accesses; also
+ known as byte, word and long. These functions are
+ <function>inb</function>, <function>inw</function>,
+ <function>inl</function>, <function>outb</function>,
+ <function>outw</function> and <function>outl</function>.
+ </para>
+
+ <para>
+ Some variants are provided for these functions. Some devices
+ require that accesses to their ports are slowed down. This
+ functionality is provided by appending a <function>_p</function>
+ to the end of the function. There are also equivalents to memcpy.
+ The <function>ins</function> and <function>outs</function>
+ functions copy bytes, words or longs to the given port.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Iarch/x86/include/asm/io_32.h
+!Elib/iomap.c
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
new file mode 100644
index 0000000..5e87ad5
--- /dev/null
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -0,0 +1,421 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Linux-filesystems-API">
+ <bookinfo>
+ <title>Linux Filesystems API</title>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="vfs">
+ <title>The Linux VFS</title>
+ <sect1 id="the_filesystem_types"><title>The Filesystem types</title>
+!Iinclude/linux/fs.h
+ </sect1>
+ <sect1 id="the_directory_cache"><title>The Directory Cache</title>
+!Efs/dcache.c
+!Iinclude/linux/dcache.h
+ </sect1>
+ <sect1 id="inode_handling"><title>Inode Handling</title>
+!Efs/inode.c
+!Efs/bad_inode.c
+ </sect1>
+ <sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title>
+!Efs/super.c
+ </sect1>
+ <sect1 id="file_locks"><title>File Locks</title>
+!Efs/locks.c
+!Ifs/locks.c
+ </sect1>
+ <sect1 id="other_functions"><title>Other Functions</title>
+!Efs/mpage.c
+!Efs/namei.c
+!Efs/buffer.c
+!Efs/bio.c
+!Efs/seq_file.c
+!Efs/filesystems.c
+!Efs/fs-writeback.c
+!Efs/block_dev.c
+ </sect1>
+ </chapter>
+
+ <chapter id="proc">
+ <title>The proc filesystem</title>
+
+ <sect1 id="sysctl_interface"><title>sysctl interface</title>
+!Ekernel/sysctl.c
+ </sect1>
+
+ <sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title>
+!Ifs/proc/base.c
+ </sect1>
+ </chapter>
+
+ <chapter id="sysfs">
+ <title>The Filesystem for Exporting Kernel Objects</title>
+!Efs/sysfs/file.c
+!Efs/sysfs/symlink.c
+!Efs/sysfs/bin.c
+ </chapter>
+
+ <chapter id="debugfs">
+ <title>The debugfs filesystem</title>
+
+ <sect1 id="debugfs_interface"><title>debugfs interface</title>
+!Efs/debugfs/inode.c
+!Efs/debugfs/file.c
+ </sect1>
+ </chapter>
+
+ <chapter id="LinuxJDBAPI">
+ <chapterinfo>
+ <title>The Linux Journalling API</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Roger</firstname>
+ <surname>Gammans</surname>
+ <affiliation>
+ <address>
+ <email>rgammans@computer-surgery.co.uk</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <authorgroup>
+ <author>
+ <firstname>Stephen</firstname>
+ <surname>Tweedie</surname>
+ <affiliation>
+ <address>
+ <email>sct@redhat.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2002</year>
+ <holder>Roger Gammans</holder>
+ </copyright>
+ </chapterinfo>
+
+ <title>The Linux Journalling API</title>
+
+ <sect1 id="journaling_overview">
+ <title>Overview</title>
+ <sect2 id="journaling_details">
+ <title>Details</title>
+<para>
+The journalling layer is easy to use. You need to
+first of all create a journal_t data structure. There are
+two calls to do this dependent on how you decide to allocate the physical
+media on which the journal resides. The journal_init_inode() call
+is for journals stored in filesystem inodes, or the journal_init_dev()
+call can be use for journal stored on a raw device (in a continuous range
+of blocks). A journal_t is a typedef for a struct pointer, so when
+you are finally finished make sure you call journal_destroy() on it
+to free up any used kernel memory.
+</para>
+
+<para>
+Once you have got your journal_t object you need to 'mount' or load the journal
+file, unless of course you haven't initialised it yet - in which case you
+need to call journal_create().
+</para>
+
+<para>
+Most of the time however your journal file will already have been created, but
+before you load it you must call journal_wipe() to empty the journal file.
+Hang on, you say , what if the filesystem wasn't cleanly umount()'d . Well, it is the
+job of the client file system to detect this and skip the call to journal_wipe().
+</para>
+
+<para>
+In either case the next call should be to journal_load() which prepares the
+journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery()
+for you if it detects any outstanding transactions in the journal and similarly
+journal_load() will call journal_recover() if necessary.
+I would advise reading fs/ext3/super.c for examples on this stage.
+[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly
+complicate the API. Or isn't a good idea for the journal layer to hide
+dirty mounts from the client fs]
+</para>
+
+<para>
+Now you can go ahead and start modifying the underlying
+filesystem. Almost.
+</para>
+
+<para>
+
+You still need to actually journal your filesystem changes, this
+is done by wrapping them into transactions. Additionally you
+also need to wrap the modification of each of the buffers
+with calls to the journal layer, so it knows what the modifications
+you are actually making are. To do this use journal_start() which
+returns a transaction handle.
+</para>
+
+<para>
+journal_start()
+and its counterpart journal_stop(), which indicates the end of a transaction
+are nestable calls, so you can reenter a transaction if necessary,
+but remember you must call journal_stop() the same number of times as
+journal_start() before the transaction is completed (or more accurately
+leaves the update phase). Ext3/VFS makes use of this feature to simplify
+quota support.
+</para>
+
+<para>
+Inside each transaction you need to wrap the modifications to the
+individual buffers (blocks). Before you start to modify a buffer you
+need to call journal_get_{create,write,undo}_access() as appropriate,
+this allows the journalling layer to copy the unmodified data if it
+needs to. After all the buffer may be part of a previously uncommitted
+transaction.
+At this point you are at last ready to modify a buffer, and once
+you are have done so you need to call journal_dirty_{meta,}data().
+Or if you've asked for access to a buffer you now know is now longer
+required to be pushed back on the device you can call journal_forget()
+in much the same way as you might have used bforget() in the past.
+</para>
+
+<para>
+A journal_flush() may be called at any time to commit and checkpoint
+all your transactions.
+</para>
+
+<para>
+Then at umount time , in your put_super() (2.4) or write_super() (2.5)
+you can then call journal_destroy() to clean up your in-core journal object.
+</para>
+
+<para>
+Unfortunately there a couple of ways the journal layer can cause a deadlock.
+The first thing to note is that each task can only have
+a single outstanding transaction at any one time, remember nothing
+commits until the outermost journal_stop(). This means
+you must complete the transaction at the end of each file/inode/address
+etc. operation you perform, so that the journalling system isn't re-entered
+on another journal. Since transactions can't be nested/batched
+across differing journals, and another filesystem other than
+yours (say ext3) may be modified in a later syscall.
+</para>
+
+<para>
+The second case to bear in mind is that journal_start() can
+block if there isn't enough space in the journal for your transaction
+(based on the passed nblocks param) - when it blocks it merely(!) needs to
+wait for transactions to complete and be committed from other tasks,
+so essentially we are waiting for journal_stop(). So to avoid
+deadlocks you must treat journal_start/stop() as if they
+were semaphores and include them in your semaphore ordering rules to prevent
+deadlocks. Note that journal_extend() has similar blocking behaviour to
+journal_start() so you can deadlock here just as easily as on journal_start().
+</para>
+
+<para>
+Try to reserve the right number of blocks the first time. ;-). This will
+be the maximum number of blocks you are going to touch in this transaction.
+I advise having a look at at least ext3_jbd.h to see the basis on which
+ext3 uses to make these decisions.
+</para>
+
+<para>
+Another wriggle to watch out for is your on-disk block allocation strategy.
+why? Because, if you undo a delete, you need to ensure you haven't reused any
+of the freed blocks in a later transaction. One simple way of doing this
+is make sure any blocks you allocate only have checkpointed transactions
+listed against them. Ext3 does this in ext3_test_allocatable().
+</para>
+
+<para>
+Lock is also providing through journal_{un,}lock_updates(),
+ext3 uses this when it wants a window with a clean and stable fs for a moment.
+eg.
+</para>
+
+<programlisting>
+
+ journal_lock_updates() //stop new stuff happening..
+ journal_flush() // checkpoint everything.
+ ..do stuff on stable fs
+ journal_unlock_updates() // carry on with filesystem use.
+</programlisting>
+
+<para>
+The opportunities for abuse and DOS attacks with this should be obvious,
+if you allow unprivileged userspace to trigger codepaths containing these
+calls.
+</para>
+
+<para>
+A new feature of jbd since 2.5.25 is commit callbacks with the new
+journal_callback_set() function you can now ask the journalling layer
+to call you back when the transaction is finally committed to disk, so that
+you can do some of your own management. The key to this is the journal_callback
+struct, this maintains the internal callback information but you can
+extend it like this:-
+</para>
+<programlisting>
+ struct myfs_callback_s {
+ //Data structure element required by jbd..
+ struct journal_callback for_jbd;
+ // Stuff for myfs allocated together.
+ myfs_inode* i_commited;
+
+ }
+</programlisting>
+
+<para>
+this would be useful if you needed to know when data was committed to a
+particular inode.
+</para>
+
+ </sect2>
+
+ <sect2 id="jbd_summary">
+ <title>Summary</title>
+<para>
+Using the journal is a matter of wrapping the different context changes,
+being each mount, each modification (transaction) and each changed buffer
+to tell the journalling layer about them.
+</para>
+
+<para>
+Here is a some pseudo code to give you an idea of how it works, as
+an example.
+</para>
+
+<programlisting>
+ journal_t* my_jnrl = journal_create();
+ journal_init_{dev,inode}(jnrl,...)
+ if (clean) journal_wipe();
+ journal_load();
+
+ foreach(transaction) { /*transactions must be
+ completed before
+ a syscall returns to
+ userspace*/
+
+ handle_t * xct=journal_start(my_jnrl);
+ foreach(bh) {
+ journal_get_{create,write,undo}_access(xact,bh);
+ if ( myfs_modify(bh) ) { /* returns true
+ if makes changes */
+ journal_dirty_{meta,}data(xact,bh);
+ } else {
+ journal_forget(bh);
+ }
+ }
+ journal_stop(xct);
+ }
+ journal_destroy(my_jrnl);
+</programlisting>
+ </sect2>
+
+ </sect1>
+
+ <sect1 id="data_types">
+ <title>Data Types</title>
+ <para>
+ The journalling layer uses typedefs to 'hide' the concrete definitions
+ of the structures used. As a client of the JBD layer you can
+ just rely on the using the pointer as a magic cookie of some sort.
+
+ Obviously the hiding is not enforced as this is 'C'.
+ </para>
+ <sect2 id="structures"><title>Structures</title>
+!Iinclude/linux/jbd.h
+ </sect2>
+ </sect1>
+
+ <sect1 id="functions">
+ <title>Functions</title>
+ <para>
+ The functions here are split into two groups those that
+ affect a journal as a whole, and those which are used to
+ manage transactions
+ </para>
+ <sect2 id="journal_level"><title>Journal Level</title>
+!Efs/jbd/journal.c
+!Ifs/jbd/recovery.c
+ </sect2>
+ <sect2 id="transaction_level"><title>Transasction Level</title>
+!Efs/jbd/transaction.c
+ </sect2>
+ </sect1>
+ <sect1 id="see_also">
+ <title>See also</title>
+ <para>
+ <citation>
+ <ulink url="ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz">
+ Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie
+ </ulink>
+ </citation>
+ </para>
+ <para>
+ <citation>
+ <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html">
+ Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie
+ </ulink>
+ </citation>
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="splice">
+ <title>splice API</title>
+ <para>
+ splice is a method for moving blocks of data around inside the
+ kernel, without continually transferring them between the kernel
+ and user space.
+ </para>
+!Ffs/splice.c
+ </chapter>
+
+ <chapter id="pipes">
+ <title>pipes API</title>
+ <para>
+ Pipe interfaces are all for in-kernel (builtin image) use.
+ They are not exported for use by modules.
+ </para>
+!Iinclude/linux/pipe_fs_i.h
+!Ffs/pipe.c
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/gadget.tmpl b/Documentation/DocBook/gadget.tmpl
new file mode 100644
index 0000000..6ef2f00
--- /dev/null
+++ b/Documentation/DocBook/gadget.tmpl
@@ -0,0 +1,793 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="USB-Gadget-API">
+ <bookinfo>
+ <title>USB Gadget API for Linux</title>
+ <date>20 August 2004</date>
+ <edition>20 August 2004</edition>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ <copyright>
+ <year>2003-2004</year>
+ <holder>David Brownell</holder>
+ </copyright>
+
+ <author>
+ <firstname>David</firstname>
+ <surname>Brownell</surname>
+ <affiliation>
+ <address><email>dbrownell@users.sourceforge.net</email></address>
+ </affiliation>
+ </author>
+ </bookinfo>
+
+<toc></toc>
+
+<chapter id="intro"><title>Introduction</title>
+
+<para>This document presents a Linux-USB "Gadget"
+kernel mode
+API, for use within peripherals and other USB devices
+that embed Linux.
+It provides an overview of the API structure,
+and shows how that fits into a system development project.
+This is the first such API released on Linux to address
+a number of important problems, including: </para>
+
+<itemizedlist>
+ <listitem><para>Supports USB 2.0, for high speed devices which
+ can stream data at several dozen megabytes per second.
+ </para></listitem>
+ <listitem><para>Handles devices with dozens of endpoints just as
+ well as ones with just two fixed-function ones. Gadget drivers
+ can be written so they're easy to port to new hardware.
+ </para></listitem>
+ <listitem><para>Flexible enough to expose more complex USB device
+ capabilities such as multiple configurations, multiple interfaces,
+ composite devices,
+ and alternate interface settings.
+ </para></listitem>
+ <listitem><para>USB "On-The-Go" (OTG) support, in conjunction
+ with updates to the Linux-USB host side.
+ </para></listitem>
+ <listitem><para>Sharing data structures and API models with the
+ Linux-USB host side API. This helps the OTG support, and
+ looks forward to more-symmetric frameworks (where the same
+ I/O model is used by both host and device side drivers).
+ </para></listitem>
+ <listitem><para>Minimalist, so it's easier to support new device
+ controller hardware. I/O processing doesn't imply large
+ demands for memory or CPU resources.
+ </para></listitem>
+</itemizedlist>
+
+
+<para>Most Linux developers will not be able to use this API, since they
+have USB "host" hardware in a PC, workstation, or server.
+Linux users with embedded systems are more likely to
+have USB peripheral hardware.
+To distinguish drivers running inside such hardware from the
+more familiar Linux "USB device drivers",
+which are host side proxies for the real USB devices,
+a different term is used:
+the drivers inside the peripherals are "USB gadget drivers".
+In USB protocol interactions, the device driver is the master
+(or "client driver")
+and the gadget driver is the slave (or "function driver").
+</para>
+
+<para>The gadget API resembles the host side Linux-USB API in that both
+use queues of request objects to package I/O buffers, and those requests
+may be submitted or canceled.
+They share common definitions for the standard USB
+<emphasis>Chapter 9</emphasis> messages, structures, and constants.
+Also, both APIs bind and unbind drivers to devices.
+The APIs differ in detail, since the host side's current
+URB framework exposes a number of implementation details
+and assumptions that are inappropriate for a gadget API.
+While the model for control transfers and configuration
+management is necessarily different (one side is a hardware-neutral master,
+the other is a hardware-aware slave), the endpoint I/0 API used here
+should also be usable for an overhead-reduced host side API.
+</para>
+
+</chapter>
+
+<chapter id="structure"><title>Structure of Gadget Drivers</title>
+
+<para>A system running inside a USB peripheral
+normally has at least three layers inside the kernel to handle
+USB protocol processing, and may have additional layers in
+user space code.
+The "gadget" API is used by the middle layer to interact
+with the lowest level (which directly handles hardware).
+</para>
+
+<para>In Linux, from the bottom up, these layers are:
+</para>
+
+<variablelist>
+
+ <varlistentry>
+ <term><emphasis>USB Controller Driver</emphasis></term>
+
+ <listitem>
+ <para>This is the lowest software level.
+ It is the only layer that talks to hardware,
+ through registers, fifos, dma, irqs, and the like.
+ The <filename>&lt;linux/usb/gadget.h&gt;</filename> API abstracts
+ the peripheral controller endpoint hardware.
+ That hardware is exposed through endpoint objects, which accept
+ streams of IN/OUT buffers, and through callbacks that interact
+ with gadget drivers.
+ Since normal USB devices only have one upstream
+ port, they only have one of these drivers.
+ The controller driver can support any number of different
+ gadget drivers, but only one of them can be used at a time.
+ </para>
+
+ <para>Examples of such controller hardware include
+ the PCI-based NetChip 2280 USB 2.0 high speed controller,
+ the SA-11x0 or PXA-25x UDC (found within many PDAs),
+ and a variety of other products.
+ </para>
+
+ </listitem></varlistentry>
+
+ <varlistentry>
+ <term><emphasis>Gadget Driver</emphasis></term>
+
+ <listitem>
+ <para>The lower boundary of this driver implements hardware-neutral
+ USB functions, using calls to the controller driver.
+ Because such hardware varies widely in capabilities and restrictions,
+ and is used in embedded environments where space is at a premium,
+ the gadget driver is often configured at compile time
+ to work with endpoints supported by one particular controller.
+ Gadget drivers may be portable to several different controllers,
+ using conditional compilation.
+ (Recent kernels substantially simplify the work involved in
+ supporting new hardware, by <emphasis>autoconfiguring</emphasis>
+ endpoints automatically for many bulk-oriented drivers.)
+ Gadget driver responsibilities include:
+ </para>
+ <itemizedlist>
+ <listitem><para>handling setup requests (ep0 protocol responses)
+ possibly including class-specific functionality
+ </para></listitem>
+ <listitem><para>returning configuration and string descriptors
+ </para></listitem>
+ <listitem><para>(re)setting configurations and interface
+ altsettings, including enabling and configuring endpoints
+ </para></listitem>
+ <listitem><para>handling life cycle events, such as managing
+ bindings to hardware,
+ USB suspend/resume, remote wakeup,
+ and disconnection from the USB host.
+ </para></listitem>
+ <listitem><para>managing IN and OUT transfers on all currently
+ enabled endpoints
+ </para></listitem>
+ </itemizedlist>
+
+ <para>
+ Such drivers may be modules of proprietary code, although
+ that approach is discouraged in the Linux community.
+ </para>
+ </listitem></varlistentry>
+
+ <varlistentry>
+ <term><emphasis>Upper Level</emphasis></term>
+
+ <listitem>
+ <para>Most gadget drivers have an upper boundary that connects
+ to some Linux driver or framework in Linux.
+ Through that boundary flows the data which the gadget driver
+ produces and/or consumes through protocol transfers over USB.
+ Examples include:
+ </para>
+ <itemizedlist>
+ <listitem><para>user mode code, using generic (gadgetfs)
+ or application specific files in
+ <filename>/dev</filename>
+ </para></listitem>
+ <listitem><para>networking subsystem (for network gadgets,
+ like the CDC Ethernet Model gadget driver)
+ </para></listitem>
+ <listitem><para>data capture drivers, perhaps video4Linux or
+ a scanner driver; or test and measurement hardware.
+ </para></listitem>
+ <listitem><para>input subsystem (for HID gadgets)
+ </para></listitem>
+ <listitem><para>sound subsystem (for audio gadgets)
+ </para></listitem>
+ <listitem><para>file system (for PTP gadgets)
+ </para></listitem>
+ <listitem><para>block i/o subsystem (for usb-storage gadgets)
+ </para></listitem>
+ <listitem><para>... and more </para></listitem>
+ </itemizedlist>
+ </listitem></varlistentry>
+
+ <varlistentry>
+ <term><emphasis>Additional Layers</emphasis></term>
+
+ <listitem>
+ <para>Other layers may exist.
+ These could include kernel layers, such as network protocol stacks,
+ as well as user mode applications building on standard POSIX
+ system call APIs such as
+ <emphasis>open()</emphasis>, <emphasis>close()</emphasis>,
+ <emphasis>read()</emphasis> and <emphasis>write()</emphasis>.
+ On newer systems, POSIX Async I/O calls may be an option.
+ Such user mode code will not necessarily be subject to
+ the GNU General Public License (GPL).
+ </para>
+ </listitem></varlistentry>
+
+
+</variablelist>
+
+<para>OTG-capable systems will also need to include a standard Linux-USB
+host side stack,
+with <emphasis>usbcore</emphasis>,
+one or more <emphasis>Host Controller Drivers</emphasis> (HCDs),
+<emphasis>USB Device Drivers</emphasis> to support
+the OTG "Targeted Peripheral List",
+and so forth.
+There will also be an <emphasis>OTG Controller Driver</emphasis>,
+which is visible to gadget and device driver developers only indirectly.
+That helps the host and device side USB controllers implement the
+two new OTG protocols (HNP and SRP).
+Roles switch (host to peripheral, or vice versa) using HNP
+during USB suspend processing, and SRP can be viewed as a
+more battery-friendly kind of device wakeup protocol.
+</para>
+
+<para>Over time, reusable utilities are evolving to help make some
+gadget driver tasks simpler.
+For example, building configuration descriptors from vectors of
+descriptors for the configurations interfaces and endpoints is
+now automated, and many drivers now use autoconfiguration to
+choose hardware endpoints and initialize their descriptors.
+
+A potential example of particular interest
+is code implementing standard USB-IF protocols for
+HID, networking, storage, or audio classes.
+Some developers are interested in KDB or KGDB hooks, to let
+target hardware be remotely debugged.
+Most such USB protocol code doesn't need to be hardware-specific,
+any more than network protocols like X11, HTTP, or NFS are.
+Such gadget-side interface drivers should eventually be combined,
+to implement composite devices.
+</para>
+
+</chapter>
+
+
+<chapter id="api"><title>Kernel Mode Gadget API</title>
+
+<para>Gadget drivers declare themselves through a
+<emphasis>struct usb_gadget_driver</emphasis>, which is responsible for
+most parts of enumeration for a <emphasis>struct usb_gadget</emphasis>.
+The response to a set_configuration usually involves
+enabling one or more of the <emphasis>struct usb_ep</emphasis> objects
+exposed by the gadget, and submitting one or more
+<emphasis>struct usb_request</emphasis> buffers to transfer data.
+Understand those four data types, and their operations, and
+you will understand how this API works.
+</para>
+
+<note><title>Incomplete Data Type Descriptions</title>
+
+<para>This documentation was prepared using the standard Linux
+kernel <filename>docproc</filename> tool, which turns text
+and in-code comments into SGML DocBook and then into usable
+formats such as HTML or PDF.
+Other than the "Chapter 9" data types, most of the significant
+data types and functions are described here.
+</para>
+
+<para>However, docproc does not understand all the C constructs
+that are used, so some relevant information is likely omitted from
+what you are reading.
+One example of such information is endpoint autoconfiguration.
+You'll have to read the header file, and use example source
+code (such as that for "Gadget Zero"), to fully understand the API.
+</para>
+
+<para>The part of the API implementing some basic
+driver capabilities is specific to the version of the
+Linux kernel that's in use.
+The 2.6 kernel includes a <emphasis>driver model</emphasis>
+framework that has no analogue on earlier kernels;
+so those parts of the gadget API are not fully portable.
+(They are implemented on 2.4 kernels, but in a different way.)
+The driver model state is another part of this API that is
+ignored by the kerneldoc tools.
+</para>
+</note>
+
+<para>The core API does not expose
+every possible hardware feature, only the most widely available ones.
+There are significant hardware features, such as device-to-device DMA
+(without temporary storage in a memory buffer)
+that would be added using hardware-specific APIs.
+</para>
+
+<para>This API allows drivers to use conditional compilation to handle
+endpoint capabilities of different hardware, but doesn't require that.
+Hardware tends to have arbitrary restrictions, relating to
+transfer types, addressing, packet sizes, buffering, and availability.
+As a rule, such differences only matter for "endpoint zero" logic
+that handles device configuration and management.
+The API supports limited run-time
+detection of capabilities, through naming conventions for endpoints.
+Many drivers will be able to at least partially autoconfigure
+themselves.
+In particular, driver init sections will often have endpoint
+autoconfiguration logic that scans the hardware's list of endpoints
+to find ones matching the driver requirements
+(relying on those conventions), to eliminate some of the most
+common reasons for conditional compilation.
+</para>
+
+<para>Like the Linux-USB host side API, this API exposes
+the "chunky" nature of USB messages: I/O requests are in terms
+of one or more "packets", and packet boundaries are visible to drivers.
+Compared to RS-232 serial protocols, USB resembles
+synchronous protocols like HDLC
+(N bytes per frame, multipoint addressing, host as the primary
+station and devices as secondary stations)
+more than asynchronous ones
+(tty style: 8 data bits per frame, no parity, one stop bit).
+So for example the controller drivers won't buffer
+two single byte writes into a single two-byte USB IN packet,
+although gadget drivers may do so when they implement
+protocols where packet boundaries (and "short packets")
+are not significant.
+</para>
+
+<sect1 id="lifecycle"><title>Driver Life Cycle</title>
+
+<para>Gadget drivers make endpoint I/O requests to hardware without
+needing to know many details of the hardware, but driver
+setup/configuration code needs to handle some differences.
+Use the API like this:
+</para>
+
+<orderedlist numeration='arabic'>
+
+<listitem><para>Register a driver for the particular device side
+usb controller hardware,
+such as the net2280 on PCI (USB 2.0),
+sa11x0 or pxa25x as found in Linux PDAs,
+and so on.
+At this point the device is logically in the USB ch9 initial state
+("attached"), drawing no power and not usable
+(since it does not yet support enumeration).
+Any host should not see the device, since it's not
+activated the data line pullup used by the host to
+detect a device, even if VBUS power is available.
+</para></listitem>
+
+<listitem><para>Register a gadget driver that implements some higher level
+device function. That will then bind() to a usb_gadget, which
+activates the data line pullup sometime after detecting VBUS.
+</para></listitem>
+
+<listitem><para>The hardware driver can now start enumerating.
+The steps it handles are to accept USB power and set_address requests.
+Other steps are handled by the gadget driver.
+If the gadget driver module is unloaded before the host starts to
+enumerate, steps before step 7 are skipped.
+</para></listitem>
+
+<listitem><para>The gadget driver's setup() call returns usb descriptors,
+based both on what the bus interface hardware provides and on the
+functionality being implemented.
+That can involve alternate settings or configurations,
+unless the hardware prevents such operation.
+For OTG devices, each configuration descriptor includes
+an OTG descriptor.
+</para></listitem>
+
+<listitem><para>The gadget driver handles the last step of enumeration,
+when the USB host issues a set_configuration call.
+It enables all endpoints used in that configuration,
+with all interfaces in their default settings.
+That involves using a list of the hardware's endpoints, enabling each
+endpoint according to its descriptor.
+It may also involve using <function>usb_gadget_vbus_draw</function>
+to let more power be drawn from VBUS, as allowed by that configuration.
+For OTG devices, setting a configuration may also involve reporting
+HNP capabilities through a user interface.
+</para></listitem>
+
+<listitem><para>Do real work and perform data transfers, possibly involving
+changes to interface settings or switching to new configurations, until the
+device is disconnect()ed from the host.
+Queue any number of transfer requests to each endpoint.
+It may be suspended and resumed several times before being disconnected.
+On disconnect, the drivers go back to step 3 (above).
+</para></listitem>
+
+<listitem><para>When the gadget driver module is being unloaded,
+the driver unbind() callback is issued. That lets the controller
+driver be unloaded.
+</para></listitem>
+
+</orderedlist>
+
+<para>Drivers will normally be arranged so that just loading the
+gadget driver module (or statically linking it into a Linux kernel)
+allows the peripheral device to be enumerated, but some drivers
+will defer enumeration until some higher level component (like
+a user mode daemon) enables it.
+Note that at this lowest level there are no policies about how
+ep0 configuration logic is implemented,
+except that it should obey USB specifications.
+Such issues are in the domain of gadget drivers,
+including knowing about implementation constraints
+imposed by some USB controllers
+or understanding that composite devices might happen to
+be built by integrating reusable components.
+</para>
+
+<para>Note that the lifecycle above can be slightly different
+for OTG devices.
+Other than providing an additional OTG descriptor in each
+configuration, only the HNP-related differences are particularly
+visible to driver code.
+They involve reporting requirements during the SET_CONFIGURATION
+request, and the option to invoke HNP during some suspend callbacks.
+Also, SRP changes the semantics of
+<function>usb_gadget_wakeup</function>
+slightly.
+</para>
+
+</sect1>
+
+<sect1 id="ch9"><title>USB 2.0 Chapter 9 Types and Constants</title>
+
+<para>Gadget drivers
+rely on common USB structures and constants
+defined in the
+<filename>&lt;linux/usb/ch9.h&gt;</filename>
+header file, which is standard in Linux 2.6 kernels.
+These are the same types and constants used by host
+side drivers (and usbcore).
+</para>
+
+!Iinclude/linux/usb/ch9.h
+</sect1>
+
+<sect1 id="core"><title>Core Objects and Methods</title>
+
+<para>These are declared in
+<filename>&lt;linux/usb/gadget.h&gt;</filename>,
+and are used by gadget drivers to interact with
+USB peripheral controller drivers.
+</para>
+
+ <!-- yeech, this is ugly in nsgmls PDF output.
+
+ the PDF bookmark and refentry output nesting is wrong,
+ and the member/argument documentation indents ugly.
+
+ plus something (docproc?) adds whitespace before the
+ descriptive paragraph text, so it can't line up right
+ unless the explanations are trivial.
+ -->
+
+!Iinclude/linux/usb/gadget.h
+</sect1>
+
+<sect1 id="utils"><title>Optional Utilities</title>
+
+<para>The core API is sufficient for writing a USB Gadget Driver,
+but some optional utilities are provided to simplify common tasks.
+These utilities include endpoint autoconfiguration.
+</para>
+
+!Edrivers/usb/gadget/usbstring.c
+!Edrivers/usb/gadget/config.c
+<!-- !Edrivers/usb/gadget/epautoconf.c -->
+</sect1>
+
+<sect1 id="composite"><title>Composite Device Framework</title>
+
+<para>The core API is sufficient for writing drivers for composite
+USB devices (with more than one function in a given configuration),
+and also multi-configuration devices (also more than one function,
+but not necessarily sharing a given configuration).
+There is however an optional framework which makes it easier to
+reuse and combine functions.
+</para>
+
+<para>Devices using this framework provide a <emphasis>struct
+usb_composite_driver</emphasis>, which in turn provides one or
+more <emphasis>struct usb_configuration</emphasis> instances.
+Each such configuration includes at least one
+<emphasis>struct usb_function</emphasis>, which packages a user
+visible role such as "network link" or "mass storage device".
+Management functions may also exist, such as "Device Firmware
+Upgrade".
+</para>
+
+!Iinclude/linux/usb/composite.h
+!Edrivers/usb/gadget/composite.c
+
+</sect1>
+
+<sect1 id="functions"><title>Composite Device Functions</title>
+
+<para>At this writing, a few of the current gadget drivers have
+been converted to this framework.
+Near-term plans include converting all of them, except for "gadgetfs".
+</para>
+
+!Edrivers/usb/gadget/f_acm.c
+!Edrivers/usb/gadget/f_ecm.c
+!Edrivers/usb/gadget/f_subset.c
+!Edrivers/usb/gadget/f_obex.c
+!Edrivers/usb/gadget/f_serial.c
+
+</sect1>
+
+
+</chapter>
+
+<chapter id="controllers"><title>Peripheral Controller Drivers</title>
+
+<para>The first hardware supporting this API was the NetChip 2280
+controller, which supports USB 2.0 high speed and is based on PCI.
+This is the <filename>net2280</filename> driver module.
+The driver supports Linux kernel versions 2.4 and 2.6;
+contact NetChip Technologies for development boards and product
+information.
+</para>
+
+<para>Other hardware working in the "gadget" framework includes:
+Intel's PXA 25x and IXP42x series processors
+(<filename>pxa2xx_udc</filename>),
+Toshiba TC86c001 "Goku-S" (<filename>goku_udc</filename>),
+Renesas SH7705/7727 (<filename>sh_udc</filename>),
+MediaQ 11xx (<filename>mq11xx_udc</filename>),
+Hynix HMS30C7202 (<filename>h7202_udc</filename>),
+National 9303/4 (<filename>n9604_udc</filename>),
+Texas Instruments OMAP (<filename>omap_udc</filename>),
+Sharp LH7A40x (<filename>lh7a40x_udc</filename>),
+and more.
+Most of those are full speed controllers.
+</para>
+
+<para>At this writing, there are people at work on drivers in
+this framework for several other USB device controllers,
+with plans to make many of them be widely available.
+</para>
+
+<!-- !Edrivers/usb/gadget/net2280.c -->
+
+<para>A partial USB simulator,
+the <filename>dummy_hcd</filename> driver, is available.
+It can act like a net2280, a pxa25x, or an sa11x0 in terms
+of available endpoints and device speeds; and it simulates
+control, bulk, and to some extent interrupt transfers.
+That lets you develop some parts of a gadget driver on a normal PC,
+without any special hardware, and perhaps with the assistance
+of tools such as GDB running with User Mode Linux.
+At least one person has expressed interest in adapting that
+approach, hooking it up to a simulator for a microcontroller.
+Such simulators can help debug subsystems where the runtime hardware
+is unfriendly to software development, or is not yet available.
+</para>
+
+<para>Support for other controllers is expected to be developed
+and contributed
+over time, as this driver framework evolves.
+</para>
+
+</chapter>
+
+<chapter id="gadget"><title>Gadget Drivers</title>
+
+<para>In addition to <emphasis>Gadget Zero</emphasis>
+(used primarily for testing and development with drivers
+for usb controller hardware), other gadget drivers exist.
+</para>
+
+<para>There's an <emphasis>ethernet</emphasis> gadget
+driver, which implements one of the most useful
+<emphasis>Communications Device Class</emphasis> (CDC) models.
+One of the standards for cable modem interoperability even
+specifies the use of this ethernet model as one of two
+mandatory options.
+Gadgets using this code look to a USB host as if they're
+an Ethernet adapter.
+It provides access to a network where the gadget's CPU is one host,
+which could easily be bridging, routing, or firewalling
+access to other networks.
+Since some hardware can't fully implement the CDC Ethernet
+requirements, this driver also implements a "good parts only"
+subset of CDC Ethernet.
+(That subset doesn't advertise itself as CDC Ethernet,
+to avoid creating problems.)
+</para>
+
+<para>Support for Microsoft's <emphasis>RNDIS</emphasis>
+protocol has been contributed by Pengutronix and Auerswald GmbH.
+This is like CDC Ethernet, but it runs on more slightly USB hardware
+(but less than the CDC subset).
+However, its main claim to fame is being able to connect directly to
+recent versions of Windows, using drivers that Microsoft bundles
+and supports, making it much simpler to network with Windows.
+</para>
+
+<para>There is also support for user mode gadget drivers,
+using <emphasis>gadgetfs</emphasis>.
+This provides a <emphasis>User Mode API</emphasis> that presents
+each endpoint as a single file descriptor. I/O is done using
+normal <emphasis>read()</emphasis> and <emphasis>read()</emphasis> calls.
+Familiar tools like GDB and pthreads can be used to
+develop and debug user mode drivers, so that once a robust
+controller driver is available many applications for it
+won't require new kernel mode software.
+Linux 2.6 <emphasis>Async I/O (AIO)</emphasis>
+support is available, so that user mode software
+can stream data with only slightly more overhead
+than a kernel driver.
+</para>
+
+<para>There's a USB Mass Storage class driver, which provides
+a different solution for interoperability with systems such
+as MS-Windows and MacOS.
+That <emphasis>File-backed Storage</emphasis> driver uses a
+file or block device as backing store for a drive,
+like the <filename>loop</filename> driver.
+The USB host uses the BBB, CB, or CBI versions of the mass
+storage class specification, using transparent SCSI commands
+to access the data from the backing store.
+</para>
+
+<para>There's a "serial line" driver, useful for TTY style
+operation over USB.
+The latest version of that driver supports CDC ACM style
+operation, like a USB modem, and so on most hardware it can
+interoperate easily with MS-Windows.
+One interesting use of that driver is in boot firmware (like a BIOS),
+which can sometimes use that model with very small systems without
+real serial lines.
+</para>
+
+<para>Support for other kinds of gadget is expected to
+be developed and contributed
+over time, as this driver framework evolves.
+</para>
+
+</chapter>
+
+<chapter id="otg"><title>USB On-The-GO (OTG)</title>
+
+<para>USB OTG support on Linux 2.6 was initially developed
+by Texas Instruments for
+<ulink url="http://www.omap.com">OMAP</ulink> 16xx and 17xx
+series processors.
+Other OTG systems should work in similar ways, but the
+hardware level details could be very different.
+</para>
+
+<para>Systems need specialized hardware support to implement OTG,
+notably including a special <emphasis>Mini-AB</emphasis> jack
+and associated transciever to support <emphasis>Dual-Role</emphasis>
+operation:
+they can act either as a host, using the standard
+Linux-USB host side driver stack,
+or as a peripheral, using this "gadget" framework.
+To do that, the system software relies on small additions
+to those programming interfaces,
+and on a new internal component (here called an "OTG Controller")
+affecting which driver stack connects to the OTG port.
+In each role, the system can re-use the existing pool of
+hardware-neutral drivers, layered on top of the controller
+driver interfaces (<emphasis>usb_bus</emphasis> or
+<emphasis>usb_gadget</emphasis>).
+Such drivers need at most minor changes, and most of the calls
+added to support OTG can also benefit non-OTG products.
+</para>
+
+<itemizedlist>
+ <listitem><para>Gadget drivers test the <emphasis>is_otg</emphasis>
+ flag, and use it to determine whether or not to include
+ an OTG descriptor in each of their configurations.
+ </para></listitem>
+ <listitem><para>Gadget drivers may need changes to support the
+ two new OTG protocols, exposed in new gadget attributes
+ such as <emphasis>b_hnp_enable</emphasis> flag.
+ HNP support should be reported through a user interface
+ (two LEDs could suffice), and is triggered in some cases
+ when the host suspends the peripheral.
+ SRP support can be user-initiated just like remote wakeup,
+ probably by pressing the same button.
+ </para></listitem>
+ <listitem><para>On the host side, USB device drivers need
+ to be taught to trigger HNP at appropriate moments, using
+ <function>usb_suspend_device()</function>.
+ That also conserves battery power, which is useful even
+ for non-OTG configurations.
+ </para></listitem>
+ <listitem><para>Also on the host side, a driver must support the
+ OTG "Targeted Peripheral List". That's just a whitelist,
+ used to reject peripherals not supported with a given
+ Linux OTG host.
+ <emphasis>This whitelist is product-specific;
+ each product must modify <filename>otg_whitelist.h</filename>
+ to match its interoperability specification.
+ </emphasis>
+ </para>
+ <para>Non-OTG Linux hosts, like PCs and workstations,
+ normally have some solution for adding drivers, so that
+ peripherals that aren't recognized can eventually be supported.
+ That approach is unreasonable for consumer products that may
+ never have their firmware upgraded, and where it's usually
+ unrealistic to expect traditional PC/workstation/server kinds
+ of support model to work.
+ For example, it's often impractical to change device firmware
+ once the product has been distributed, so driver bugs can't
+ normally be fixed if they're found after shipment.
+ </para></listitem>
+</itemizedlist>
+
+<para>
+Additional changes are needed below those hardware-neutral
+<emphasis>usb_bus</emphasis> and <emphasis>usb_gadget</emphasis>
+driver interfaces; those aren't discussed here in any detail.
+Those affect the hardware-specific code for each USB Host or Peripheral
+controller, and how the HCD initializes (since OTG can be active only
+on a single port).
+They also involve what may be called an <emphasis>OTG Controller
+Driver</emphasis>, managing the OTG transceiver and the OTG state
+machine logic as well as much of the root hub behavior for the
+OTG port.
+The OTG controller driver needs to activate and deactivate USB
+controllers depending on the relevant device role.
+Some related changes were needed inside usbcore, so that it
+can identify OTG-capable devices and respond appropriately
+to HNP or SRP protocols.
+</para>
+
+</chapter>
+
+</book>
+<!--
+ vim:syntax=sgml:sw=4
+-->
diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl
new file mode 100644
index 0000000..3a882d9
--- /dev/null
+++ b/Documentation/DocBook/genericirq.tmpl
@@ -0,0 +1,474 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Generic-IRQ-Guide">
+ <bookinfo>
+ <title>Linux generic IRQ handling</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Thomas</firstname>
+ <surname>Gleixner</surname>
+ <affiliation>
+ <address>
+ <email>tglx@linutronix.de</email>
+ </address>
+ </affiliation>
+ </author>
+ <author>
+ <firstname>Ingo</firstname>
+ <surname>Molnar</surname>
+ <affiliation>
+ <address>
+ <email>mingo@elte.hu</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2005-2006</year>
+ <holder>Thomas Gleixner</holder>
+ </copyright>
+ <copyright>
+ <year>2005-2006</year>
+ <holder>Ingo Molnar</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The generic interrupt handling layer is designed to provide a
+ complete abstraction of interrupt handling for device drivers.
+ It is able to handle all the different types of interrupt controller
+ hardware. Device drivers use generic API functions to request, enable,
+ disable and free interrupts. The drivers do not have to know anything
+ about interrupt hardware details, so they can be used on different
+ platforms without code changes.
+ </para>
+ <para>
+ This documentation is provided to developers who want to implement
+ an interrupt subsystem based for their architecture, with the help
+ of the generic IRQ handling layer.
+ </para>
+ </chapter>
+
+ <chapter id="rationale">
+ <title>Rationale</title>
+ <para>
+ The original implementation of interrupt handling in Linux is using
+ the __do_IRQ() super-handler, which is able to deal with every
+ type of interrupt logic.
+ </para>
+ <para>
+ Originally, Russell King identified different types of handlers to
+ build a quite universal set for the ARM interrupt handler
+ implementation in Linux 2.5/2.6. He distinguished between:
+ <itemizedlist>
+ <listitem><para>Level type</para></listitem>
+ <listitem><para>Edge type</para></listitem>
+ <listitem><para>Simple type</para></listitem>
+ </itemizedlist>
+ In the SMP world of the __do_IRQ() super-handler another type
+ was identified:
+ <itemizedlist>
+ <listitem><para>Per CPU type</para></listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ This split implementation of highlevel IRQ handlers allows us to
+ optimize the flow of the interrupt handling for each specific
+ interrupt type. This reduces complexity in that particular codepath
+ and allows the optimized handling of a given type.
+ </para>
+ <para>
+ The original general IRQ implementation used hw_interrupt_type
+ structures and their ->ack(), ->end() [etc.] callbacks to
+ differentiate the flow control in the super-handler. This leads to
+ a mix of flow logic and lowlevel hardware logic, and it also leads
+ to unnecessary code duplication: for example in i386, there is a
+ ioapic_level_irq and a ioapic_edge_irq irq-type which share many
+ of the lowlevel details but have different flow handling.
+ </para>
+ <para>
+ A more natural abstraction is the clean separation of the
+ 'irq flow' and the 'chip details'.
+ </para>
+ <para>
+ Analysing a couple of architecture's IRQ subsystem implementations
+ reveals that most of them can use a generic set of 'irq flow'
+ methods and only need to add the chip level specific code.
+ The separation is also valuable for (sub)architectures
+ which need specific quirks in the irq flow itself but not in the
+ chip-details - and thus provides a more transparent IRQ subsystem
+ design.
+ </para>
+ <para>
+ Each interrupt descriptor is assigned its own highlevel flow
+ handler, which is normally one of the generic
+ implementations. (This highlevel flow handler implementation also
+ makes it simple to provide demultiplexing handlers which can be
+ found in embedded platforms on various architectures.)
+ </para>
+ <para>
+ The separation makes the generic interrupt handling layer more
+ flexible and extensible. For example, an (sub)architecture can
+ use a generic irq-flow implementation for 'level type' interrupts
+ and add a (sub)architecture specific 'edge type' implementation.
+ </para>
+ <para>
+ To make the transition to the new model easier and prevent the
+ breakage of existing implementations, the __do_IRQ() super-handler
+ is still available. This leads to a kind of duality for the time
+ being. Over time the new model should be used in more and more
+ architectures, as it enables smaller and cleaner IRQ subsystems.
+ </para>
+ </chapter>
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None (knock on wood).
+ </para>
+ </chapter>
+
+ <chapter id="Abstraction">
+ <title>Abstraction layers</title>
+ <para>
+ There are three main levels of abstraction in the interrupt code:
+ <orderedlist>
+ <listitem><para>Highlevel driver API</para></listitem>
+ <listitem><para>Highlevel IRQ flow handlers</para></listitem>
+ <listitem><para>Chiplevel hardware encapsulation</para></listitem>
+ </orderedlist>
+ </para>
+ <sect1 id="Interrupt_control_flow">
+ <title>Interrupt control flow</title>
+ <para>
+ Each interrupt is described by an interrupt descriptor structure
+ irq_desc. The interrupt is referenced by an 'unsigned int' numeric
+ value which selects the corresponding interrupt decription structure
+ in the descriptor structures array.
+ The descriptor structure contains status information and pointers
+ to the interrupt flow method and the interrupt chip structure
+ which are assigned to this interrupt.
+ </para>
+ <para>
+ Whenever an interrupt triggers, the lowlevel arch code calls into
+ the generic interrupt code by calling desc->handle_irq().
+ This highlevel IRQ handling function only uses desc->chip primitives
+ referenced by the assigned chip descriptor structure.
+ </para>
+ </sect1>
+ <sect1 id="Highlevel_Driver_API">
+ <title>Highlevel Driver API</title>
+ <para>
+ The highlevel Driver API consists of following functions:
+ <itemizedlist>
+ <listitem><para>request_irq()</para></listitem>
+ <listitem><para>free_irq()</para></listitem>
+ <listitem><para>disable_irq()</para></listitem>
+ <listitem><para>enable_irq()</para></listitem>
+ <listitem><para>disable_irq_nosync() (SMP only)</para></listitem>
+ <listitem><para>synchronize_irq() (SMP only)</para></listitem>
+ <listitem><para>set_irq_type()</para></listitem>
+ <listitem><para>set_irq_wake()</para></listitem>
+ <listitem><para>set_irq_data()</para></listitem>
+ <listitem><para>set_irq_chip()</para></listitem>
+ <listitem><para>set_irq_chip_data()</para></listitem>
+ </itemizedlist>
+ See the autogenerated function documentation for details.
+ </para>
+ </sect1>
+ <sect1 id="Highlevel_IRQ_flow_handlers">
+ <title>Highlevel IRQ flow handlers</title>
+ <para>
+ The generic layer provides a set of pre-defined irq-flow methods:
+ <itemizedlist>
+ <listitem><para>handle_level_irq</para></listitem>
+ <listitem><para>handle_edge_irq</para></listitem>
+ <listitem><para>handle_simple_irq</para></listitem>
+ <listitem><para>handle_percpu_irq</para></listitem>
+ </itemizedlist>
+ The interrupt flow handlers (either predefined or architecture
+ specific) are assigned to specific interrupts by the architecture
+ either during bootup or during device initialization.
+ </para>
+ <sect2 id="Default_flow_implementations">
+ <title>Default flow implementations</title>
+ <sect3 id="Helper_functions">
+ <title>Helper functions</title>
+ <para>
+ The helper functions call the chip primitives and
+ are used by the default flow implementations.
+ The following helper functions are implemented (simplified excerpt):
+ <programlisting>
+default_enable(irq)
+{
+ desc->chip->unmask(irq);
+}
+
+default_disable(irq)
+{
+ if (!delay_disable(irq))
+ desc->chip->mask(irq);
+}
+
+default_ack(irq)
+{
+ chip->ack(irq);
+}
+
+default_mask_ack(irq)
+{
+ if (chip->mask_ack) {
+ chip->mask_ack(irq);
+ } else {
+ chip->mask(irq);
+ chip->ack(irq);
+ }
+}
+
+noop(irq)
+{
+}
+
+ </programlisting>
+ </para>
+ </sect3>
+ </sect2>
+ <sect2 id="Default_flow_handler_implementations">
+ <title>Default flow handler implementations</title>
+ <sect3 id="Default_Level_IRQ_flow_handler">
+ <title>Default Level IRQ flow handler</title>
+ <para>
+ handle_level_irq provides a generic implementation
+ for level-triggered interrupts.
+ </para>
+ <para>
+ The following control flow is implemented (simplified excerpt):
+ <programlisting>
+desc->chip->start();
+handle_IRQ_event(desc->action);
+desc->chip->end();
+ </programlisting>
+ </para>
+ </sect3>
+ <sect3 id="Default_Edge_IRQ_flow_handler">
+ <title>Default Edge IRQ flow handler</title>
+ <para>
+ handle_edge_irq provides a generic implementation
+ for edge-triggered interrupts.
+ </para>
+ <para>
+ The following control flow is implemented (simplified excerpt):
+ <programlisting>
+if (desc->status &amp; running) {
+ desc->chip->hold();
+ desc->status |= pending | masked;
+ return;
+}
+desc->chip->start();
+desc->status |= running;
+do {
+ if (desc->status &amp; masked)
+ desc->chip->enable();
+ desc->status &amp;= ~pending;
+ handle_IRQ_event(desc->action);
+} while (status &amp; pending);
+desc->status &amp;= ~running;
+desc->chip->end();
+ </programlisting>
+ </para>
+ </sect3>
+ <sect3 id="Default_simple_IRQ_flow_handler">
+ <title>Default simple IRQ flow handler</title>
+ <para>
+ handle_simple_irq provides a generic implementation
+ for simple interrupts.
+ </para>
+ <para>
+ Note: The simple flow handler does not call any
+ handler/chip primitives.
+ </para>
+ <para>
+ The following control flow is implemented (simplified excerpt):
+ <programlisting>
+handle_IRQ_event(desc->action);
+ </programlisting>
+ </para>
+ </sect3>
+ <sect3 id="Default_per_CPU_flow_handler">
+ <title>Default per CPU flow handler</title>
+ <para>
+ handle_percpu_irq provides a generic implementation
+ for per CPU interrupts.
+ </para>
+ <para>
+ Per CPU interrupts are only available on SMP and
+ the handler provides a simplified version without
+ locking.
+ </para>
+ <para>
+ The following control flow is implemented (simplified excerpt):
+ <programlisting>
+desc->chip->start();
+handle_IRQ_event(desc->action);
+desc->chip->end();
+ </programlisting>
+ </para>
+ </sect3>
+ </sect2>
+ <sect2 id="Quirks_and_optimizations">
+ <title>Quirks and optimizations</title>
+ <para>
+ The generic functions are intended for 'clean' architectures and chips,
+ which have no platform-specific IRQ handling quirks. If an architecture
+ needs to implement quirks on the 'flow' level then it can do so by
+ overriding the highlevel irq-flow handler.
+ </para>
+ </sect2>
+ <sect2 id="Delayed_interrupt_disable">
+ <title>Delayed interrupt disable</title>
+ <para>
+ This per interrupt selectable feature, which was introduced by Russell
+ King in the ARM interrupt implementation, does not mask an interrupt
+ at the hardware level when disable_irq() is called. The interrupt is
+ kept enabled and is masked in the flow handler when an interrupt event
+ happens. This prevents losing edge interrupts on hardware which does
+ not store an edge interrupt event while the interrupt is disabled at
+ the hardware level. When an interrupt arrives while the IRQ_DISABLED
+ flag is set, then the interrupt is masked at the hardware level and
+ the IRQ_PENDING bit is set. When the interrupt is re-enabled by
+ enable_irq() the pending bit is checked and if it is set, the
+ interrupt is resent either via hardware or by a software resend
+ mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when
+ you want to use the delayed interrupt disable feature and your
+ hardware is not capable of retriggering an interrupt.)
+ The delayed interrupt disable can be runtime enabled, per interrupt,
+ by setting the IRQ_DELAYED_DISABLE flag in the irq_desc status field.
+ </para>
+ </sect2>
+ </sect1>
+ <sect1 id="Chiplevel_hardware_encapsulation">
+ <title>Chiplevel hardware encapsulation</title>
+ <para>
+ The chip level hardware descriptor structure irq_chip
+ contains all the direct chip relevant functions, which
+ can be utilized by the irq flow implementations.
+ <itemizedlist>
+ <listitem><para>ack()</para></listitem>
+ <listitem><para>mask_ack() - Optional, recommended for performance</para></listitem>
+ <listitem><para>mask()</para></listitem>
+ <listitem><para>unmask()</para></listitem>
+ <listitem><para>retrigger() - Optional</para></listitem>
+ <listitem><para>set_type() - Optional</para></listitem>
+ <listitem><para>set_wake() - Optional</para></listitem>
+ </itemizedlist>
+ These primitives are strictly intended to mean what they say: ack means
+ ACK, masking means masking of an IRQ line, etc. It is up to the flow
+ handler(s) to use these basic units of lowlevel functionality.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="doirq">
+ <title>__do_IRQ entry point</title>
+ <para>
+ The original implementation __do_IRQ() is an alternative entry
+ point for all types of interrupts.
+ </para>
+ <para>
+ This handler turned out to be not suitable for all
+ interrupt hardware and was therefore reimplemented with split
+ functionality for egde/level/simple/percpu interrupts. This is not
+ only a functional optimization. It also shortens code paths for
+ interrupts.
+ </para>
+ <para>
+ To make use of the split implementation, replace the call to
+ __do_IRQ by a call to desc->chip->handle_irq() and associate
+ the appropriate handler function to desc->chip->handle_irq().
+ In most cases the generic handler implementations should
+ be sufficient.
+ </para>
+ </chapter>
+
+ <chapter id="locking">
+ <title>Locking on SMP</title>
+ <para>
+ The locking of chip registers is up to the architecture that
+ defines the chip primitives. There is a chip->lock field that can be used
+ for serialization, but the generic layer does not touch it. The per-irq
+ structure is protected via desc->lock, by the generic layer.
+ </para>
+ </chapter>
+ <chapter id="structs">
+ <title>Structures</title>
+ <para>
+ This chapter contains the autogenerated documentation of the structures which are
+ used in the generic IRQ layer.
+ </para>
+!Iinclude/linux/irq.h
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+ <para>
+ This chapter contains the autogenerated documentation of the kernel API functions
+ which are exported.
+ </para>
+!Ekernel/irq/manage.c
+!Ekernel/irq/chip.c
+ </chapter>
+
+ <chapter id="intfunctions">
+ <title>Internal Functions Provided</title>
+ <para>
+ This chapter contains the autogenerated documentation of the internal functions.
+ </para>
+!Ikernel/irq/handle.c
+!Ikernel/irq/chip.c
+ </chapter>
+
+ <chapter id="credits">
+ <title>Credits</title>
+ <para>
+ The following people have contributed to this document:
+ <orderedlist>
+ <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
+ <listitem><para>Ingo Molnar<email>mingo@elte.hu</email></para></listitem>
+ </orderedlist>
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
new file mode 100644
index 0000000..5818ff7
--- /dev/null
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -0,0 +1,707 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LinuxKernelAPI">
+ <bookinfo>
+ <title>The Linux Kernel API</title>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="Basics">
+ <title>Driver Basics</title>
+ <sect1><title>Driver Entry and Exit points</title>
+!Iinclude/linux/init.h
+ </sect1>
+
+ <sect1><title>Atomic and pointer manipulation</title>
+!Iarch/x86/include/asm/atomic_32.h
+!Iarch/x86/include/asm/unaligned.h
+ </sect1>
+
+ <sect1><title>Delaying, scheduling, and timer routines</title>
+!Iinclude/linux/sched.h
+!Ekernel/sched.c
+!Ekernel/timer.c
+ </sect1>
+ <sect1><title>High-resolution timers</title>
+!Iinclude/linux/ktime.h
+!Iinclude/linux/hrtimer.h
+!Ekernel/hrtimer.c
+ </sect1>
+ <sect1><title>Workqueues and Kevents</title>
+!Ekernel/workqueue.c
+ </sect1>
+ <sect1><title>Internal Functions</title>
+!Ikernel/exit.c
+!Ikernel/signal.c
+!Iinclude/linux/kthread.h
+!Ekernel/kthread.c
+ </sect1>
+
+ <sect1><title>Kernel objects manipulation</title>
+<!--
+X!Iinclude/linux/kobject.h
+-->
+!Elib/kobject.c
+ </sect1>
+
+ <sect1><title>Kernel utility functions</title>
+!Iinclude/linux/kernel.h
+!Ekernel/printk.c
+!Ekernel/panic.c
+!Ekernel/sys.c
+!Ekernel/rcupdate.c
+ </sect1>
+
+ <sect1><title>Device Resource Management</title>
+!Edrivers/base/devres.c
+ </sect1>
+
+ </chapter>
+
+ <chapter id="adt">
+ <title>Data Types</title>
+ <sect1><title>Doubly Linked Lists</title>
+!Iinclude/linux/list.h
+ </sect1>
+ </chapter>
+
+ <chapter id="libc">
+ <title>Basic C Library Functions</title>
+
+ <para>
+ When writing drivers, you cannot in general use routines which are
+ from the C Library. Some of the functions have been found generally
+ useful and they are listed below. The behaviour of these functions
+ may vary slightly from those defined by ANSI, and these deviations
+ are noted in the text.
+ </para>
+
+ <sect1><title>String Conversions</title>
+!Ilib/vsprintf.c
+!Elib/vsprintf.c
+ </sect1>
+ <sect1><title>String Manipulation</title>
+<!-- All functions are exported at now
+X!Ilib/string.c
+ -->
+!Elib/string.c
+ </sect1>
+ <sect1><title>Bit Operations</title>
+!Iarch/x86/include/asm/bitops.h
+ </sect1>
+ </chapter>
+
+ <chapter id="kernel-lib">
+ <title>Basic Kernel Library Functions</title>
+
+ <para>
+ The Linux kernel provides more basic utility functions.
+ </para>
+
+ <sect1><title>Bitmap Operations</title>
+!Elib/bitmap.c
+!Ilib/bitmap.c
+ </sect1>
+
+ <sect1><title>Command-line Parsing</title>
+!Elib/cmdline.c
+ </sect1>
+
+ <sect1 id="crc"><title>CRC Functions</title>
+!Elib/crc7.c
+!Elib/crc16.c
+!Elib/crc-itu-t.c
+!Elib/crc32.c
+!Elib/crc-ccitt.c
+ </sect1>
+ </chapter>
+
+ <chapter id="mm">
+ <title>Memory Management in Linux</title>
+ <sect1><title>The Slab Cache</title>
+!Iinclude/linux/slab.h
+!Emm/slab.c
+ </sect1>
+ <sect1><title>User Space Memory Access</title>
+!Iarch/x86/include/asm/uaccess_32.h
+!Earch/x86/lib/usercopy_32.c
+ </sect1>
+ <sect1><title>More Memory Management Functions</title>
+!Emm/readahead.c
+!Emm/filemap.c
+!Emm/memory.c
+!Emm/vmalloc.c
+!Imm/page_alloc.c
+!Emm/mempool.c
+!Emm/dmapool.c
+!Emm/page-writeback.c
+!Emm/truncate.c
+ </sect1>
+ </chapter>
+
+
+ <chapter id="ipc">
+ <title>Kernel IPC facilities</title>
+
+ <sect1><title>IPC utilities</title>
+!Iipc/util.c
+ </sect1>
+ </chapter>
+
+ <chapter id="kfifo">
+ <title>FIFO Buffer</title>
+ <sect1><title>kfifo interface</title>
+!Iinclude/linux/kfifo.h
+!Ekernel/kfifo.c
+ </sect1>
+ </chapter>
+
+ <chapter id="relayfs">
+ <title>relay interface support</title>
+
+ <para>
+ Relay interface support
+ is designed to provide an efficient mechanism for tools and
+ facilities to relay large amounts of data from kernel space to
+ user space.
+ </para>
+
+ <sect1><title>relay interface</title>
+!Ekernel/relay.c
+!Ikernel/relay.c
+ </sect1>
+ </chapter>
+
+ <chapter id="modload">
+ <title>Module Support</title>
+ <sect1><title>Module Loading</title>
+!Ekernel/kmod.c
+ </sect1>
+ <sect1><title>Inter Module support</title>
+ <para>
+ Refer to the file kernel/module.c for more information.
+ </para>
+<!-- FIXME: Removed for now since no structured comments in source
+X!Ekernel/module.c
+-->
+ </sect1>
+ </chapter>
+
+ <chapter id="hardware">
+ <title>Hardware Interfaces</title>
+ <sect1><title>Interrupt Handling</title>
+!Ekernel/irq/manage.c
+ </sect1>
+
+ <sect1><title>DMA Channels</title>
+!Ekernel/dma.c
+ </sect1>
+
+ <sect1><title>Resources Management</title>
+!Ikernel/resource.c
+!Ekernel/resource.c
+ </sect1>
+
+ <sect1><title>MTRR Handling</title>
+!Earch/x86/kernel/cpu/mtrr/main.c
+ </sect1>
+
+ <sect1><title>PCI Support Library</title>
+!Edrivers/pci/pci.c
+!Edrivers/pci/pci-driver.c
+!Edrivers/pci/remove.c
+!Edrivers/pci/pci-acpi.c
+!Edrivers/pci/search.c
+!Edrivers/pci/msi.c
+!Edrivers/pci/bus.c
+<!-- FIXME: Removed for now since no structured comments in source
+X!Edrivers/pci/hotplug.c
+-->
+!Edrivers/pci/probe.c
+!Edrivers/pci/rom.c
+ </sect1>
+ <sect1><title>PCI Hotplug Support Library</title>
+!Edrivers/pci/hotplug/pci_hotplug_core.c
+ </sect1>
+ <sect1><title>MCA Architecture</title>
+ <sect2><title>MCA Device Functions</title>
+ <para>
+ Refer to the file arch/x86/kernel/mca_32.c for more information.
+ </para>
+<!-- FIXME: Removed for now since no structured comments in source
+X!Earch/x86/kernel/mca_32.c
+-->
+ </sect2>
+ <sect2><title>MCA Bus DMA</title>
+!Iarch/x86/include/asm/mca_dma.h
+ </sect2>
+ </sect1>
+ </chapter>
+
+ <chapter id="firmware">
+ <title>Firmware Interfaces</title>
+ <sect1><title>DMI Interfaces</title>
+!Edrivers/firmware/dmi_scan.c
+ </sect1>
+ <sect1><title>EDD Interfaces</title>
+!Idrivers/firmware/edd.c
+ </sect1>
+ </chapter>
+
+ <chapter id="security">
+ <title>Security Framework</title>
+!Isecurity/security.c
+!Esecurity/inode.c
+ </chapter>
+
+ <chapter id="audit">
+ <title>Audit Interfaces</title>
+!Ekernel/audit.c
+!Ikernel/auditsc.c
+!Ikernel/auditfilter.c
+ </chapter>
+
+ <chapter id="accounting">
+ <title>Accounting Framework</title>
+!Ikernel/acct.c
+ </chapter>
+
+ <chapter id="devdrivers">
+ <title>Device drivers infrastructure</title>
+ <sect1><title>Device Drivers Base</title>
+<!--
+X!Iinclude/linux/device.h
+-->
+!Edrivers/base/driver.c
+!Edrivers/base/core.c
+!Edrivers/base/class.c
+!Edrivers/base/firmware_class.c
+!Edrivers/base/transport_class.c
+<!-- Cannot be included, because
+ attribute_container_add_class_device_adapter
+ and attribute_container_classdev_to_container
+ exceed allowed 44 characters maximum
+X!Edrivers/base/attribute_container.c
+-->
+!Edrivers/base/sys.c
+<!--
+X!Edrivers/base/interface.c
+-->
+!Edrivers/base/platform.c
+!Edrivers/base/bus.c
+ </sect1>
+ <sect1><title>Device Drivers Power Management</title>
+!Edrivers/base/power/main.c
+ </sect1>
+ <sect1><title>Device Drivers ACPI Support</title>
+<!-- Internal functions only
+X!Edrivers/acpi/sleep/main.c
+X!Edrivers/acpi/sleep/wakeup.c
+X!Edrivers/acpi/motherboard.c
+X!Edrivers/acpi/bus.c
+-->
+!Edrivers/acpi/scan.c
+!Idrivers/acpi/scan.c
+<!-- No correct structured comments
+X!Edrivers/acpi/pci_bind.c
+-->
+ </sect1>
+ <sect1><title>Device drivers PnP support</title>
+!Idrivers/pnp/core.c
+<!-- No correct structured comments
+X!Edrivers/pnp/system.c
+ -->
+!Edrivers/pnp/card.c
+!Idrivers/pnp/driver.c
+!Edrivers/pnp/manager.c
+!Edrivers/pnp/support.c
+ </sect1>
+ <sect1><title>Userspace IO devices</title>
+!Edrivers/uio/uio.c
+!Iinclude/linux/uio_driver.h
+ </sect1>
+ </chapter>
+
+ <chapter id="blkdev">
+ <title>Block Devices</title>
+!Eblock/blk-core.c
+!Iblock/blk-core.c
+!Eblock/blk-map.c
+!Iblock/blk-sysfs.c
+!Eblock/blk-settings.c
+!Eblock/blk-exec.c
+!Eblock/blk-barrier.c
+!Eblock/blk-tag.c
+!Iblock/blk-tag.c
+!Eblock/blk-integrity.c
+!Iblock/blktrace.c
+!Iblock/genhd.c
+!Eblock/genhd.c
+ </chapter>
+
+ <chapter id="chrdev">
+ <title>Char devices</title>
+!Efs/char_dev.c
+ </chapter>
+
+ <chapter id="miscdev">
+ <title>Miscellaneous Devices</title>
+!Edrivers/char/misc.c
+ </chapter>
+
+ <chapter id="parportdev">
+ <title>Parallel Port Devices</title>
+!Iinclude/linux/parport.h
+!Edrivers/parport/ieee1284.c
+!Edrivers/parport/share.c
+!Idrivers/parport/daisy.c
+ </chapter>
+
+ <chapter id="message_devices">
+ <title>Message-based devices</title>
+ <sect1><title>Fusion message devices</title>
+!Edrivers/message/fusion/mptbase.c
+!Idrivers/message/fusion/mptbase.c
+!Edrivers/message/fusion/mptscsih.c
+!Idrivers/message/fusion/mptscsih.c
+!Idrivers/message/fusion/mptctl.c
+!Idrivers/message/fusion/mptspi.c
+!Idrivers/message/fusion/mptfc.c
+!Idrivers/message/fusion/mptlan.c
+ </sect1>
+ <sect1><title>I2O message devices</title>
+!Iinclude/linux/i2o.h
+!Idrivers/message/i2o/core.h
+!Edrivers/message/i2o/iop.c
+!Idrivers/message/i2o/iop.c
+!Idrivers/message/i2o/config-osm.c
+!Edrivers/message/i2o/exec-osm.c
+!Idrivers/message/i2o/exec-osm.c
+!Idrivers/message/i2o/bus-osm.c
+!Edrivers/message/i2o/device.c
+!Idrivers/message/i2o/device.c
+!Idrivers/message/i2o/driver.c
+!Idrivers/message/i2o/pci.c
+!Idrivers/message/i2o/i2o_block.c
+!Idrivers/message/i2o/i2o_scsi.c
+!Idrivers/message/i2o/i2o_proc.c
+ </sect1>
+ </chapter>
+
+ <chapter id="snddev">
+ <title>Sound Devices</title>
+!Iinclude/sound/core.h
+!Esound/sound_core.c
+!Iinclude/sound/pcm.h
+!Esound/core/pcm.c
+!Esound/core/device.c
+!Esound/core/info.c
+!Esound/core/rawmidi.c
+!Esound/core/sound.c
+!Esound/core/memory.c
+!Esound/core/pcm_memory.c
+!Esound/core/init.c
+!Esound/core/isadma.c
+!Esound/core/control.c
+!Esound/core/pcm_lib.c
+!Esound/core/hwdep.c
+!Esound/core/pcm_native.c
+!Esound/core/memalloc.c
+<!-- FIXME: Removed for now since no structured comments in source
+X!Isound/sound_firmware.c
+-->
+ </chapter>
+
+ <chapter id="uart16x50">
+ <title>16x50 UART Driver</title>
+!Iinclude/linux/serial_core.h
+!Edrivers/serial/serial_core.c
+!Edrivers/serial/8250.c
+ </chapter>
+
+ <chapter id="fbdev">
+ <title>Frame Buffer Library</title>
+
+ <para>
+ The frame buffer drivers depend heavily on four data structures.
+ These structures are declared in include/linux/fb.h. They are
+ fb_info, fb_var_screeninfo, fb_fix_screeninfo and fb_monospecs.
+ The last three can be made available to and from userland.
+ </para>
+
+ <para>
+ fb_info defines the current state of a particular video card.
+ Inside fb_info, there exists a fb_ops structure which is a
+ collection of needed functions to make fbdev and fbcon work.
+ fb_info is only visible to the kernel.
+ </para>
+
+ <para>
+ fb_var_screeninfo is used to describe the features of a video card
+ that are user defined. With fb_var_screeninfo, things such as
+ depth and the resolution may be defined.
+ </para>
+
+ <para>
+ The next structure is fb_fix_screeninfo. This defines the
+ properties of a card that are created when a mode is set and can't
+ be changed otherwise. A good example of this is the start of the
+ frame buffer memory. This "locks" the address of the frame buffer
+ memory, so that it cannot be changed or moved.
+ </para>
+
+ <para>
+ The last structure is fb_monospecs. In the old API, there was
+ little importance for fb_monospecs. This allowed for forbidden things
+ such as setting a mode of 800x600 on a fix frequency monitor. With
+ the new API, fb_monospecs prevents such things, and if used
+ correctly, can prevent a monitor from being cooked. fb_monospecs
+ will not be useful until kernels 2.5.x.
+ </para>
+
+ <sect1><title>Frame Buffer Memory</title>
+!Edrivers/video/fbmem.c
+ </sect1>
+<!--
+ <sect1><title>Frame Buffer Console</title>
+X!Edrivers/video/console/fbcon.c
+ </sect1>
+-->
+ <sect1><title>Frame Buffer Colormap</title>
+!Edrivers/video/fbcmap.c
+ </sect1>
+<!-- FIXME:
+ drivers/video/fbgen.c has no docs, which stuffs up the sgml. Comment
+ out until somebody adds docs. KAO
+ <sect1><title>Frame Buffer Generic Functions</title>
+X!Idrivers/video/fbgen.c
+ </sect1>
+KAO -->
+ <sect1><title>Frame Buffer Video Mode Database</title>
+!Idrivers/video/modedb.c
+!Edrivers/video/modedb.c
+ </sect1>
+ <sect1><title>Frame Buffer Macintosh Video Mode Database</title>
+!Edrivers/video/macmodes.c
+ </sect1>
+ <sect1><title>Frame Buffer Fonts</title>
+ <para>
+ Refer to the file drivers/video/console/fonts.c for more information.
+ </para>
+<!-- FIXME: Removed for now since no structured comments in source
+X!Idrivers/video/console/fonts.c
+-->
+ </sect1>
+ </chapter>
+
+ <chapter id="input_subsystem">
+ <title>Input Subsystem</title>
+!Iinclude/linux/input.h
+!Edrivers/input/input.c
+!Edrivers/input/ff-core.c
+!Edrivers/input/ff-memless.c
+ </chapter>
+
+ <chapter id="spi">
+ <title>Serial Peripheral Interface (SPI)</title>
+ <para>
+ SPI is the "Serial Peripheral Interface", widely used with
+ embedded systems because it is a simple and efficient
+ interface: basically a multiplexed shift register.
+ Its three signal wires hold a clock (SCK, often in the range
+ of 1-20 MHz), a "Master Out, Slave In" (MOSI) data line, and
+ a "Master In, Slave Out" (MISO) data line.
+ SPI is a full duplex protocol; for each bit shifted out the
+ MOSI line (one per clock) another is shifted in on the MISO line.
+ Those bits are assembled into words of various sizes on the
+ way to and from system memory.
+ An additional chipselect line is usually active-low (nCS);
+ four signals are normally used for each peripheral, plus
+ sometimes an interrupt.
+ </para>
+ <para>
+ The SPI bus facilities listed here provide a generalized
+ interface to declare SPI busses and devices, manage them
+ according to the standard Linux driver model, and perform
+ input/output operations.
+ At this time, only "master" side interfaces are supported,
+ where Linux talks to SPI peripherals and does not implement
+ such a peripheral itself.
+ (Interfaces to support implementing SPI slaves would
+ necessarily look different.)
+ </para>
+ <para>
+ The programming interface is structured around two kinds of driver,
+ and two kinds of device.
+ A "Controller Driver" abstracts the controller hardware, which may
+ be as simple as a set of GPIO pins or as complex as a pair of FIFOs
+ connected to dual DMA engines on the other side of the SPI shift
+ register (maximizing throughput). Such drivers bridge between
+ whatever bus they sit on (often the platform bus) and SPI, and
+ expose the SPI side of their device as a
+ <structname>struct spi_master</structname>.
+ SPI devices are children of that master, represented as a
+ <structname>struct spi_device</structname> and manufactured from
+ <structname>struct spi_board_info</structname> descriptors which
+ are usually provided by board-specific initialization code.
+ A <structname>struct spi_driver</structname> is called a
+ "Protocol Driver", and is bound to a spi_device using normal
+ driver model calls.
+ </para>
+ <para>
+ The I/O model is a set of queued messages. Protocol drivers
+ submit one or more <structname>struct spi_message</structname>
+ objects, which are processed and completed asynchronously.
+ (There are synchronous wrappers, however.) Messages are
+ built from one or more <structname>struct spi_transfer</structname>
+ objects, each of which wraps a full duplex SPI transfer.
+ A variety of protocol tweaking options are needed, because
+ different chips adopt very different policies for how they
+ use the bits transferred with SPI.
+ </para>
+!Iinclude/linux/spi/spi.h
+!Fdrivers/spi/spi.c spi_register_board_info
+!Edrivers/spi/spi.c
+ </chapter>
+
+ <chapter id="i2c">
+ <title>I<superscript>2</superscript>C and SMBus Subsystem</title>
+
+ <para>
+ I<superscript>2</superscript>C (or without fancy typography, "I2C")
+ is an acronym for the "Inter-IC" bus, a simple bus protocol which is
+ widely used where low data rate communications suffice.
+ Since it's also a licensed trademark, some vendors use another
+ name (such as "Two-Wire Interface", TWI) for the same bus.
+ I2C only needs two signals (SCL for clock, SDA for data), conserving
+ board real estate and minimizing signal quality issues.
+ Most I2C devices use seven bit addresses, and bus speeds of up
+ to 400 kHz; there's a high speed extension (3.4 MHz) that's not yet
+ found wide use.
+ I2C is a multi-master bus; open drain signaling is used to
+ arbitrate between masters, as well as to handshake and to
+ synchronize clocks from slower clients.
+ </para>
+
+ <para>
+ The Linux I2C programming interfaces support only the master
+ side of bus interactions, not the slave side.
+ The programming interface is structured around two kinds of driver,
+ and two kinds of device.
+ An I2C "Adapter Driver" abstracts the controller hardware; it binds
+ to a physical device (perhaps a PCI device or platform_device) and
+ exposes a <structname>struct i2c_adapter</structname> representing
+ each I2C bus segment it manages.
+ On each I2C bus segment will be I2C devices represented by a
+ <structname>struct i2c_client</structname>. Those devices will
+ be bound to a <structname>struct i2c_driver</structname>,
+ which should follow the standard Linux driver model.
+ (At this writing, a legacy model is more widely used.)
+ There are functions to perform various I2C protocol operations; at
+ this writing all such functions are usable only from task context.
+ </para>
+
+ <para>
+ The System Management Bus (SMBus) is a sibling protocol. Most SMBus
+ systems are also I2C conformant. The electrical constraints are
+ tighter for SMBus, and it standardizes particular protocol messages
+ and idioms. Controllers that support I2C can also support most
+ SMBus operations, but SMBus controllers don't support all the protocol
+ options that an I2C controller will.
+ There are functions to perform various SMBus protocol operations,
+ either using I2C primitives or by issuing SMBus commands to
+ i2c_adapter devices which don't support those I2C operations.
+ </para>
+
+!Iinclude/linux/i2c.h
+!Fdrivers/i2c/i2c-boardinfo.c i2c_register_board_info
+!Edrivers/i2c/i2c-core.c
+ </chapter>
+
+ <chapter id="clk">
+ <title>Clock Framework</title>
+
+ <para>
+ The clock framework defines programming interfaces to support
+ software management of the system clock tree.
+ This framework is widely used with System-On-Chip (SOC) platforms
+ to support power management and various devices which may need
+ custom clock rates.
+ Note that these "clocks" don't relate to timekeeping or real
+ time clocks (RTCs), each of which have separate frameworks.
+ These <structname>struct clk</structname> instances may be used
+ to manage for example a 96 MHz signal that is used to shift bits
+ into and out of peripherals or busses, or otherwise trigger
+ synchronous state machine transitions in system hardware.
+ </para>
+
+ <para>
+ Power management is supported by explicit software clock gating:
+ unused clocks are disabled, so the system doesn't waste power
+ changing the state of transistors that aren't in active use.
+ On some systems this may be backed by hardware clock gating,
+ where clocks are gated without being disabled in software.
+ Sections of chips that are powered but not clocked may be able
+ to retain their last state.
+ This low power state is often called a <emphasis>retention
+ mode</emphasis>.
+ This mode still incurs leakage currents, especially with finer
+ circuit geometries, but for CMOS circuits power is mostly used
+ by clocked state changes.
+ </para>
+
+ <para>
+ Power-aware drivers only enable their clocks when the device
+ they manage is in active use. Also, system sleep states often
+ differ according to which clock domains are active: while a
+ "standby" state may allow wakeup from several active domains, a
+ "mem" (suspend-to-RAM) state may require a more wholesale shutdown
+ of clocks derived from higher speed PLLs and oscillators, limiting
+ the number of possible wakeup event sources. A driver's suspend
+ method may need to be aware of system-specific clock constraints
+ on the target sleep state.
+ </para>
+
+ <para>
+ Some platforms support programmable clock generators. These
+ can be used by external chips of various kinds, such as other
+ CPUs, multimedia codecs, and devices with strict requirements
+ for interface clocking.
+ </para>
+
+!Iinclude/linux/clk.h
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
new file mode 100644
index 0000000..a50d6cd
--- /dev/null
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -0,0 +1,1327 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="lk-hacking-guide">
+ <bookinfo>
+ <title>Unreliable Guide To Hacking The Linux Kernel</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Rusty</firstname>
+ <surname>Russell</surname>
+ <affiliation>
+ <address>
+ <email>rusty@rustcorp.com.au</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2005</year>
+ <holder>Rusty Russell</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+
+ <releaseinfo>
+ This is the first release of this document as part of the kernel tarball.
+ </releaseinfo>
+
+ </bookinfo>
+
+ <toc></toc>
+
+ <chapter id="introduction">
+ <title>Introduction</title>
+ <para>
+ Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux
+ Kernel Hacking. This document describes the common routines and
+ general requirements for kernel code: its goal is to serve as a
+ primer for Linux kernel development for experienced C
+ programmers. I avoid implementation details: that's what the
+ code is for, and I ignore whole tracts of useful routines.
+ </para>
+ <para>
+ Before you read this, please understand that I never wanted to
+ write this document, being grossly under-qualified, but I always
+ wanted to read it, and this was the only way. I hope it will
+ grow into a compendium of best practice, common starting points
+ and random information.
+ </para>
+ </chapter>
+
+ <chapter id="basic-players">
+ <title>The Players</title>
+
+ <para>
+ At any time each of the CPUs in a system can be:
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ not associated with any process, serving a hardware interrupt;
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ not associated with any process, serving a softirq or tasklet;
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ running in kernel space, associated with a process (user context);
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ running a process in user space.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ There is an ordering between these. The bottom two can preempt
+ each other, but above that is a strict hierarchy: each can only be
+ preempted by the ones above it. For example, while a softirq is
+ running on a CPU, no other softirq will preempt it, but a hardware
+ interrupt can. However, any other CPUs in the system execute
+ independently.
+ </para>
+
+ <para>
+ We'll see a number of ways that the user context can block
+ interrupts, to become truly non-preemptable.
+ </para>
+
+ <sect1 id="basics-usercontext">
+ <title>User Context</title>
+
+ <para>
+ User context is when you are coming in from a system call or other
+ trap: like userspace, you can be preempted by more important tasks
+ and by interrupts. You can sleep, by calling
+ <function>schedule()</function>.
+ </para>
+
+ <note>
+ <para>
+ You are always in user context on module load and unload,
+ and on operations on the block device layer.
+ </para>
+ </note>
+
+ <para>
+ In user context, the <varname>current</varname> pointer (indicating
+ the task we are currently executing) is valid, and
+ <function>in_interrupt()</function>
+ (<filename>include/linux/interrupt.h</filename>) is <returnvalue>false
+ </returnvalue>.
+ </para>
+
+ <caution>
+ <para>
+ Beware that if you have preemption or softirqs disabled
+ (see below), <function>in_interrupt()</function> will return a
+ false positive.
+ </para>
+ </caution>
+ </sect1>
+
+ <sect1 id="basics-hardirqs">
+ <title>Hardware Interrupts (Hard IRQs)</title>
+
+ <para>
+ Timer ticks, <hardware>network cards</hardware> and
+ <hardware>keyboard</hardware> are examples of real
+ hardware which produce interrupts at any time. The kernel runs
+ interrupt handlers, which services the hardware. The kernel
+ guarantees that this handler is never re-entered: if the same
+ interrupt arrives, it is queued (or dropped). Because it
+ disables interrupts, this handler has to be fast: frequently it
+ simply acknowledges the interrupt, marks a 'software interrupt'
+ for execution and exits.
+ </para>
+
+ <para>
+ You can tell you are in a hardware interrupt, because
+ <function>in_irq()</function> returns <returnvalue>true</returnvalue>.
+ </para>
+ <caution>
+ <para>
+ Beware that this will return a false positive if interrupts are disabled
+ (see below).
+ </para>
+ </caution>
+ </sect1>
+
+ <sect1 id="basics-softirqs">
+ <title>Software Interrupt Context: Softirqs and Tasklets</title>
+
+ <para>
+ Whenever a system call is about to return to userspace, or a
+ hardware interrupt handler exits, any 'software interrupts'
+ which are marked pending (usually by hardware interrupts) are
+ run (<filename>kernel/softirq.c</filename>).
+ </para>
+
+ <para>
+ Much of the real interrupt handling work is done here. Early in
+ the transition to <acronym>SMP</acronym>, there were only 'bottom
+ halves' (BHs), which didn't take advantage of multiple CPUs. Shortly
+ after we switched from wind-up computers made of match-sticks and snot,
+ we abandoned this limitation and switched to 'softirqs'.
+ </para>
+
+ <para>
+ <filename class="headerfile">include/linux/interrupt.h</filename> lists the
+ different softirqs. A very important softirq is the
+ timer softirq (<filename
+ class="headerfile">include/linux/timer.h</filename>): you can
+ register to have it call functions for you in a given length of
+ time.
+ </para>
+
+ <para>
+ Softirqs are often a pain to deal with, since the same softirq
+ will run simultaneously on more than one CPU. For this reason,
+ tasklets (<filename
+ class="headerfile">include/linux/interrupt.h</filename>) are more
+ often used: they are dynamically-registrable (meaning you can have
+ as many as you want), and they also guarantee that any tasklet
+ will only run on one CPU at any time, although different tasklets
+ can run simultaneously.
+ </para>
+ <caution>
+ <para>
+ The name 'tasklet' is misleading: they have nothing to do with 'tasks',
+ and probably more to do with some bad vodka Alexey Kuznetsov had at the
+ time.
+ </para>
+ </caution>
+
+ <para>
+ You can tell you are in a softirq (or tasklet)
+ using the <function>in_softirq()</function> macro
+ (<filename class="headerfile">include/linux/interrupt.h</filename>).
+ </para>
+ <caution>
+ <para>
+ Beware that this will return a false positive if a bh lock (see below)
+ is held.
+ </para>
+ </caution>
+ </sect1>
+ </chapter>
+
+ <chapter id="basic-rules">
+ <title>Some Basic Rules</title>
+
+ <variablelist>
+ <varlistentry>
+ <term>No memory protection</term>
+ <listitem>
+ <para>
+ If you corrupt memory, whether in user context or
+ interrupt context, the whole machine will crash. Are you
+ sure you can't do what you want in userspace?
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>No floating point or <acronym>MMX</acronym></term>
+ <listitem>
+ <para>
+ The <acronym>FPU</acronym> context is not saved; even in user
+ context the <acronym>FPU</acronym> state probably won't
+ correspond with the current process: you would mess with some
+ user process' <acronym>FPU</acronym> state. If you really want
+ to do this, you would have to explicitly save/restore the full
+ <acronym>FPU</acronym> state (and avoid context switches). It
+ is generally a bad idea; use fixed point arithmetic first.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>A rigid stack limit</term>
+ <listitem>
+ <para>
+ Depending on configuration options the kernel stack is about 3K to 6K for most 32-bit architectures: it's
+ about 14K on most 64-bit archs, and often shared with interrupts
+ so you can't use it all. Avoid deep recursion and huge local
+ arrays on the stack (allocate them dynamically instead).
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>The Linux kernel is portable</term>
+ <listitem>
+ <para>
+ Let's keep it that way. Your code should be 64-bit clean,
+ and endian-independent. You should also minimize CPU
+ specific stuff, e.g. inline assembly should be cleanly
+ encapsulated and minimized to ease porting. Generally it
+ should be restricted to the architecture-dependent part of
+ the kernel tree.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </chapter>
+
+ <chapter id="ioctls">
+ <title>ioctls: Not writing a new system call</title>
+
+ <para>
+ A system call generally looks like this
+ </para>
+
+ <programlisting>
+asmlinkage long sys_mycall(int arg)
+{
+ return 0;
+}
+ </programlisting>
+
+ <para>
+ First, in most cases you don't want to create a new system call.
+ You create a character device and implement an appropriate ioctl
+ for it. This is much more flexible than system calls, doesn't have
+ to be entered in every architecture's
+ <filename class="headerfile">include/asm/unistd.h</filename> and
+ <filename>arch/kernel/entry.S</filename> file, and is much more
+ likely to be accepted by Linus.
+ </para>
+
+ <para>
+ If all your routine does is read or write some parameter, consider
+ implementing a <function>sysfs</function> interface instead.
+ </para>
+
+ <para>
+ Inside the ioctl you're in user context to a process. When a
+ error occurs you return a negated errno (see
+ <filename class="headerfile">include/linux/errno.h</filename>),
+ otherwise you return <returnvalue>0</returnvalue>.
+ </para>
+
+ <para>
+ After you slept you should check if a signal occurred: the
+ Unix/Linux way of handling signals is to temporarily exit the
+ system call with the <constant>-ERESTARTSYS</constant> error. The
+ system call entry code will switch back to user context, process
+ the signal handler and then your system call will be restarted
+ (unless the user disabled that). So you should be prepared to
+ process the restart, e.g. if you're in the middle of manipulating
+ some data structure.
+ </para>
+
+ <programlisting>
+if (signal_pending())
+ return -ERESTARTSYS;
+ </programlisting>
+
+ <para>
+ If you're doing longer computations: first think userspace. If you
+ <emphasis>really</emphasis> want to do it in kernel you should
+ regularly check if you need to give up the CPU (remember there is
+ cooperative multitasking per CPU). Idiom:
+ </para>
+
+ <programlisting>
+cond_resched(); /* Will sleep */
+ </programlisting>
+
+ <para>
+ A short note on interface design: the UNIX system call motto is
+ "Provide mechanism not policy".
+ </para>
+ </chapter>
+
+ <chapter id="deadlock-recipes">
+ <title>Recipes for Deadlock</title>
+
+ <para>
+ You cannot call any routines which may sleep, unless:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ You are in user context.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ You do not own any spinlocks.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ You have interrupts enabled (actually, Andi Kleen says
+ that the scheduling code will enable them for you, but
+ that's probably not what you wanted).
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ Note that some functions may sleep implicitly: common ones are
+ the user space access functions (*_user) and memory allocation
+ functions without <symbol>GFP_ATOMIC</symbol>.
+ </para>
+
+ <para>
+ You should always compile your kernel
+ <symbol>CONFIG_DEBUG_SPINLOCK_SLEEP</symbol> on, and it will warn
+ you if you break these rules. If you <emphasis>do</emphasis> break
+ the rules, you will eventually lock up your box.
+ </para>
+
+ <para>
+ Really.
+ </para>
+ </chapter>
+
+ <chapter id="common-routines">
+ <title>Common Routines</title>
+
+ <sect1 id="routines-printk">
+ <title>
+ <function>printk()</function>
+ <filename class="headerfile">include/linux/kernel.h</filename>
+ </title>
+
+ <para>
+ <function>printk()</function> feeds kernel messages to the
+ console, dmesg, and the syslog daemon. It is useful for debugging
+ and reporting errors, and can be used inside interrupt context,
+ but use with caution: a machine which has its console flooded with
+ printk messages is unusable. It uses a format string mostly
+ compatible with ANSI C printf, and C string concatenation to give
+ it a first "priority" argument:
+ </para>
+
+ <programlisting>
+printk(KERN_INFO "i = %u\n", i);
+ </programlisting>
+
+ <para>
+ See <filename class="headerfile">include/linux/kernel.h</filename>;
+ for other KERN_ values; these are interpreted by syslog as the
+ level. Special case: for printing an IP address use
+ </para>
+
+ <programlisting>
+__u32 ipaddress;
+printk(KERN_INFO "my ip: %d.%d.%d.%d\n", NIPQUAD(ipaddress));
+ </programlisting>
+
+ <para>
+ <function>printk()</function> internally uses a 1K buffer and does
+ not catch overruns. Make sure that will be enough.
+ </para>
+
+ <note>
+ <para>
+ You will know when you are a real kernel hacker
+ when you start typoing printf as printk in your user programs :)
+ </para>
+ </note>
+
+ <!--- From the Lions book reader department -->
+
+ <note>
+ <para>
+ Another sidenote: the original Unix Version 6 sources had a
+ comment on top of its printf function: "Printf should not be
+ used for chit-chat". You should follow that advice.
+ </para>
+ </note>
+ </sect1>
+
+ <sect1 id="routines-copy">
+ <title>
+ <function>copy_[to/from]_user()</function>
+ /
+ <function>get_user()</function>
+ /
+ <function>put_user()</function>
+ <filename class="headerfile">include/asm/uaccess.h</filename>
+ </title>
+
+ <para>
+ <emphasis>[SLEEPS]</emphasis>
+ </para>
+
+ <para>
+ <function>put_user()</function> and <function>get_user()</function>
+ are used to get and put single values (such as an int, char, or
+ long) from and to userspace. A pointer into userspace should
+ never be simply dereferenced: data should be copied using these
+ routines. Both return <constant>-EFAULT</constant> or 0.
+ </para>
+ <para>
+ <function>copy_to_user()</function> and
+ <function>copy_from_user()</function> are more general: they copy
+ an arbitrary amount of data to and from userspace.
+ <caution>
+ <para>
+ Unlike <function>put_user()</function> and
+ <function>get_user()</function>, they return the amount of
+ uncopied data (ie. <returnvalue>0</returnvalue> still means
+ success).
+ </para>
+ </caution>
+ [Yes, this moronic interface makes me cringe. The flamewar comes up every year or so. --RR.]
+ </para>
+ <para>
+ The functions may sleep implicitly. This should never be called
+ outside user context (it makes no sense), with interrupts
+ disabled, or a spinlock held.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-kmalloc">
+ <title><function>kmalloc()</function>/<function>kfree()</function>
+ <filename class="headerfile">include/linux/slab.h</filename></title>
+
+ <para>
+ <emphasis>[MAY SLEEP: SEE BELOW]</emphasis>
+ </para>
+
+ <para>
+ These routines are used to dynamically request pointer-aligned
+ chunks of memory, like malloc and free do in userspace, but
+ <function>kmalloc()</function> takes an extra flag word.
+ Important values:
+ </para>
+
+ <variablelist>
+ <varlistentry>
+ <term>
+ <constant>
+ GFP_KERNEL
+ </constant>
+ </term>
+ <listitem>
+ <para>
+ May sleep and swap to free memory. Only allowed in user
+ context, but is the most reliable way to allocate memory.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>
+ <constant>
+ GFP_ATOMIC
+ </constant>
+ </term>
+ <listitem>
+ <para>
+ Don't sleep. Less reliable than <constant>GFP_KERNEL</constant>,
+ but may be called from interrupt context. You should
+ <emphasis>really</emphasis> have a good out-of-memory
+ error-handling strategy.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>
+ <constant>
+ GFP_DMA
+ </constant>
+ </term>
+ <listitem>
+ <para>
+ Allocate ISA DMA lower than 16MB. If you don't know what that
+ is you don't need it. Very unreliable.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+
+ <para>
+ If you see a <errorname>sleeping function called from invalid
+ context</errorname> warning message, then maybe you called a
+ sleeping allocation function from interrupt context without
+ <constant>GFP_ATOMIC</constant>. You should really fix that.
+ Run, don't walk.
+ </para>
+
+ <para>
+ If you are allocating at least <constant>PAGE_SIZE</constant>
+ (<filename class="headerfile">include/asm/page.h</filename>) bytes,
+ consider using <function>__get_free_pages()</function>
+
+ (<filename class="headerfile">include/linux/mm.h</filename>). It
+ takes an order argument (0 for page sized, 1 for double page, 2
+ for four pages etc.) and the same memory priority flag word as
+ above.
+ </para>
+
+ <para>
+ If you are allocating more than a page worth of bytes you can use
+ <function>vmalloc()</function>. It'll allocate virtual memory in
+ the kernel map. This block is not contiguous in physical memory,
+ but the <acronym>MMU</acronym> makes it look like it is for you
+ (so it'll only look contiguous to the CPUs, not to external device
+ drivers). If you really need large physically contiguous memory
+ for some weird device, you have a problem: it is poorly supported
+ in Linux because after some time memory fragmentation in a running
+ kernel makes it hard. The best way is to allocate the block early
+ in the boot process via the <function>alloc_bootmem()</function>
+ routine.
+ </para>
+
+ <para>
+ Before inventing your own cache of often-used objects consider
+ using a slab cache in
+ <filename class="headerfile">include/linux/slab.h</filename>
+ </para>
+ </sect1>
+
+ <sect1 id="routines-current">
+ <title><function>current</function>
+ <filename class="headerfile">include/asm/current.h</filename></title>
+
+ <para>
+ This global variable (really a macro) contains a pointer to
+ the current task structure, so is only valid in user context.
+ For example, when a process makes a system call, this will
+ point to the task structure of the calling process. It is
+ <emphasis>not NULL</emphasis> in interrupt context.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-udelay">
+ <title><function>mdelay()</function>/<function>udelay()</function>
+ <filename class="headerfile">include/asm/delay.h</filename>
+ <filename class="headerfile">include/linux/delay.h</filename>
+ </title>
+
+ <para>
+ The <function>udelay()</function> and <function>ndelay()</function> functions can be used for small pauses.
+ Do not use large values with them as you risk
+ overflow - the helper function <function>mdelay()</function> is useful
+ here, or consider <function>msleep()</function>.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-endian">
+ <title><function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function>
+ <filename class="headerfile">include/asm/byteorder.h</filename>
+ </title>
+
+ <para>
+ The <function>cpu_to_be32()</function> family (where the "32" can
+ be replaced by 64 or 16, and the "be" can be replaced by "le") are
+ the general way to do endian conversions in the kernel: they
+ return the converted value. All variations supply the reverse as
+ well: <function>be32_to_cpu()</function>, etc.
+ </para>
+
+ <para>
+ There are two major variations of these functions: the pointer
+ variation, such as <function>cpu_to_be32p()</function>, which take
+ a pointer to the given type, and return the converted value. The
+ other variation is the "in-situ" family, such as
+ <function>cpu_to_be32s()</function>, which convert value referred
+ to by the pointer, and return void.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-local-irqs">
+ <title><function>local_irq_save()</function>/<function>local_irq_restore()</function>
+ <filename class="headerfile">include/asm/system.h</filename>
+ </title>
+
+ <para>
+ These routines disable hard interrupts on the local CPU, and
+ restore them. They are reentrant; saving the previous state in
+ their one <varname>unsigned long flags</varname> argument. If you
+ know that interrupts are enabled, you can simply use
+ <function>local_irq_disable()</function> and
+ <function>local_irq_enable()</function>.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-softirqs">
+ <title><function>local_bh_disable()</function>/<function>local_bh_enable()</function>
+ <filename class="headerfile">include/linux/interrupt.h</filename></title>
+
+ <para>
+ These routines disable soft interrupts on the local CPU, and
+ restore them. They are reentrant; if soft interrupts were
+ disabled before, they will still be disabled after this pair
+ of functions has been called. They prevent softirqs and tasklets
+ from running on the current CPU.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-processorids">
+ <title><function>smp_processor_id</function>()
+ <filename class="headerfile">include/asm/smp.h</filename></title>
+
+ <para>
+ <function>get_cpu()</function> disables preemption (so you won't
+ suddenly get moved to another CPU) and returns the current
+ processor number, between 0 and <symbol>NR_CPUS</symbol>. Note
+ that the CPU numbers are not necessarily continuous. You return
+ it again with <function>put_cpu()</function> when you are done.
+ </para>
+ <para>
+ If you know you cannot be preempted by another task (ie. you are
+ in interrupt context, or have preemption disabled) you can use
+ smp_processor_id().
+ </para>
+ </sect1>
+
+ <sect1 id="routines-init">
+ <title><type>__init</type>/<type>__exit</type>/<type>__initdata</type>
+ <filename class="headerfile">include/linux/init.h</filename></title>
+
+ <para>
+ After boot, the kernel frees up a special section; functions
+ marked with <type>__init</type> and data structures marked with
+ <type>__initdata</type> are dropped after boot is complete: similarly
+ modules discard this memory after initialization. <type>__exit</type>
+ is used to declare a function which is only required on exit: the
+ function will be dropped if this file is not compiled as a module.
+ See the header file for use. Note that it makes no sense for a function
+ marked with <type>__init</type> to be exported to modules with
+ <function>EXPORT_SYMBOL()</function> - this will break.
+ </para>
+
+ </sect1>
+
+ <sect1 id="routines-init-again">
+ <title><function>__initcall()</function>/<function>module_init()</function>
+ <filename class="headerfile">include/linux/init.h</filename></title>
+ <para>
+ Many parts of the kernel are well served as a module
+ (dynamically-loadable parts of the kernel). Using the
+ <function>module_init()</function> and
+ <function>module_exit()</function> macros it is easy to write code
+ without #ifdefs which can operate both as a module or built into
+ the kernel.
+ </para>
+
+ <para>
+ The <function>module_init()</function> macro defines which
+ function is to be called at module insertion time (if the file is
+ compiled as a module), or at boot time: if the file is not
+ compiled as a module the <function>module_init()</function> macro
+ becomes equivalent to <function>__initcall()</function>, which
+ through linker magic ensures that the function is called on boot.
+ </para>
+
+ <para>
+ The function can return a negative error number to cause
+ module loading to fail (unfortunately, this has no effect if
+ the module is compiled into the kernel). This function is
+ called in user context with interrupts enabled, so it can sleep.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-moduleexit">
+ <title> <function>module_exit()</function>
+ <filename class="headerfile">include/linux/init.h</filename> </title>
+
+ <para>
+ This macro defines the function to be called at module removal
+ time (or never, in the case of the file compiled into the
+ kernel). It will only be called if the module usage count has
+ reached zero. This function can also sleep, but cannot fail:
+ everything must be cleaned up by the time it returns.
+ </para>
+
+ <para>
+ Note that this macro is optional: if it is not present, your
+ module will not be removable (except for 'rmmod -f').
+ </para>
+ </sect1>
+
+ <sect1 id="routines-module-use-counters">
+ <title> <function>try_module_get()</function>/<function>module_put()</function>
+ <filename class="headerfile">include/linux/module.h</filename></title>
+
+ <para>
+ These manipulate the module usage count, to protect against
+ removal (a module also can't be removed if another module uses one
+ of its exported symbols: see below). Before calling into module
+ code, you should call <function>try_module_get()</function> on
+ that module: if it fails, then the module is being removed and you
+ should act as if it wasn't there. Otherwise, you can safely enter
+ the module, and call <function>module_put()</function> when you're
+ finished.
+ </para>
+
+ <para>
+ Most registerable structures have an
+ <structfield>owner</structfield> field, such as in the
+ <structname>file_operations</structname> structure. Set this field
+ to the macro <symbol>THIS_MODULE</symbol>.
+ </para>
+ </sect1>
+
+ <!-- add info on new-style module refcounting here -->
+ </chapter>
+
+ <chapter id="queues">
+ <title>Wait Queues
+ <filename class="headerfile">include/linux/wait.h</filename>
+ </title>
+ <para>
+ <emphasis>[SLEEPS]</emphasis>
+ </para>
+
+ <para>
+ A wait queue is used to wait for someone to wake you up when a
+ certain condition is true. They must be used carefully to ensure
+ there is no race condition. You declare a
+ <type>wait_queue_head_t</type>, and then processes which want to
+ wait for that condition declare a <type>wait_queue_t</type>
+ referring to themselves, and place that in the queue.
+ </para>
+
+ <sect1 id="queue-declaring">
+ <title>Declaring</title>
+
+ <para>
+ You declare a <type>wait_queue_head_t</type> using the
+ <function>DECLARE_WAIT_QUEUE_HEAD()</function> macro, or using the
+ <function>init_waitqueue_head()</function> routine in your
+ initialization code.
+ </para>
+ </sect1>
+
+ <sect1 id="queue-waitqueue">
+ <title>Queuing</title>
+
+ <para>
+ Placing yourself in the waitqueue is fairly complex, because you
+ must put yourself in the queue before checking the condition.
+ There is a macro to do this:
+ <function>wait_event_interruptible()</function>
+
+ <filename class="headerfile">include/linux/wait.h</filename> The
+ first argument is the wait queue head, and the second is an
+ expression which is evaluated; the macro returns
+ <returnvalue>0</returnvalue> when this expression is true, or
+ <returnvalue>-ERESTARTSYS</returnvalue> if a signal is received.
+ The <function>wait_event()</function> version ignores signals.
+ </para>
+ <para>
+ Do not use the <function>sleep_on()</function> function family -
+ it is very easy to accidentally introduce races; almost certainly
+ one of the <function>wait_event()</function> family will do, or a
+ loop around <function>schedule_timeout()</function>. If you choose
+ to loop around <function>schedule_timeout()</function> remember
+ you must set the task state (with
+ <function>set_current_state()</function>) on each iteration to avoid
+ busy-looping.
+ </para>
+
+ </sect1>
+
+ <sect1 id="queue-waking">
+ <title>Waking Up Queued Tasks</title>
+
+ <para>
+ Call <function>wake_up()</function>
+
+ <filename class="headerfile">include/linux/wait.h</filename>;,
+ which will wake up every process in the queue. The exception is
+ if one has <constant>TASK_EXCLUSIVE</constant> set, in which case
+ the remainder of the queue will not be woken. There are other variants
+ of this basic function available in the same header.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="atomic-ops">
+ <title>Atomic Operations</title>
+
+ <para>
+ Certain operations are guaranteed atomic on all platforms. The
+ first class of operations work on <type>atomic_t</type>
+
+ <filename class="headerfile">include/asm/atomic.h</filename>; this
+ contains a signed integer (at least 32 bits long), and you must use
+ these functions to manipulate or read atomic_t variables.
+ <function>atomic_read()</function> and
+ <function>atomic_set()</function> get and set the counter,
+ <function>atomic_add()</function>,
+ <function>atomic_sub()</function>,
+ <function>atomic_inc()</function>,
+ <function>atomic_dec()</function>, and
+ <function>atomic_dec_and_test()</function> (returns
+ <returnvalue>true</returnvalue> if it was decremented to zero).
+ </para>
+
+ <para>
+ Yes. It returns <returnvalue>true</returnvalue> (i.e. != 0) if the
+ atomic variable is zero.
+ </para>
+
+ <para>
+ Note that these functions are slower than normal arithmetic, and
+ so should not be used unnecessarily.
+ </para>
+
+ <para>
+ The second class of atomic operations is atomic bit operations on an
+ <type>unsigned long</type>, defined in
+
+ <filename class="headerfile">include/linux/bitops.h</filename>. These
+ operations generally take a pointer to the bit pattern, and a bit
+ number: 0 is the least significant bit.
+ <function>set_bit()</function>, <function>clear_bit()</function>
+ and <function>change_bit()</function> set, clear, and flip the
+ given bit. <function>test_and_set_bit()</function>,
+ <function>test_and_clear_bit()</function> and
+ <function>test_and_change_bit()</function> do the same thing,
+ except return true if the bit was previously set; these are
+ particularly useful for atomically setting flags.
+ </para>
+
+ <para>
+ It is possible to call these operations with bit indices greater
+ than BITS_PER_LONG. The resulting behavior is strange on big-endian
+ platforms though so it is a good idea not to do this.
+ </para>
+ </chapter>
+
+ <chapter id="symbols">
+ <title>Symbols</title>
+
+ <para>
+ Within the kernel proper, the normal linking rules apply
+ (ie. unless a symbol is declared to be file scope with the
+ <type>static</type> keyword, it can be used anywhere in the
+ kernel). However, for modules, a special exported symbol table is
+ kept which limits the entry points to the kernel proper. Modules
+ can also export symbols.
+ </para>
+
+ <sect1 id="sym-exportsymbols">
+ <title><function>EXPORT_SYMBOL()</function>
+ <filename class="headerfile">include/linux/module.h</filename></title>
+
+ <para>
+ This is the classic method of exporting a symbol: dynamically
+ loaded modules will be able to use the symbol as normal.
+ </para>
+ </sect1>
+
+ <sect1 id="sym-exportsymbols-gpl">
+ <title><function>EXPORT_SYMBOL_GPL()</function>
+ <filename class="headerfile">include/linux/module.h</filename></title>
+
+ <para>
+ Similar to <function>EXPORT_SYMBOL()</function> except that the
+ symbols exported by <function>EXPORT_SYMBOL_GPL()</function> can
+ only be seen by modules with a
+ <function>MODULE_LICENSE()</function> that specifies a GPL
+ compatible license. It implies that the function is considered
+ an internal implementation issue, and not really an interface.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="conventions">
+ <title>Routines and Conventions</title>
+
+ <sect1 id="conventions-doublelinkedlist">
+ <title>Double-linked lists
+ <filename class="headerfile">include/linux/list.h</filename></title>
+
+ <para>
+ There used to be three sets of linked-list routines in the kernel
+ headers, but this one is the winner. If you don't have some
+ particular pressing need for a single list, it's a good choice.
+ </para>
+
+ <para>
+ In particular, <function>list_for_each_entry</function> is useful.
+ </para>
+ </sect1>
+
+ <sect1 id="convention-returns">
+ <title>Return Conventions</title>
+
+ <para>
+ For code called in user context, it's very common to defy C
+ convention, and return <returnvalue>0</returnvalue> for success,
+ and a negative error number
+ (eg. <returnvalue>-EFAULT</returnvalue>) for failure. This can be
+ unintuitive at first, but it's fairly widespread in the kernel.
+ </para>
+
+ <para>
+ Using <function>ERR_PTR()</function>
+
+ <filename class="headerfile">include/linux/err.h</filename>; to
+ encode a negative error number into a pointer, and
+ <function>IS_ERR()</function> and <function>PTR_ERR()</function>
+ to get it back out again: avoids a separate pointer parameter for
+ the error number. Icky, but in a good way.
+ </para>
+ </sect1>
+
+ <sect1 id="conventions-borkedcompile">
+ <title>Breaking Compilation</title>
+
+ <para>
+ Linus and the other developers sometimes change function or
+ structure names in development kernels; this is not done just to
+ keep everyone on their toes: it reflects a fundamental change
+ (eg. can no longer be called with interrupts on, or does extra
+ checks, or doesn't do checks which were caught before). Usually
+ this is accompanied by a fairly complete note to the linux-kernel
+ mailing list; search the archive. Simply doing a global replace
+ on the file usually makes things <emphasis>worse</emphasis>.
+ </para>
+ </sect1>
+
+ <sect1 id="conventions-initialising">
+ <title>Initializing structure members</title>
+
+ <para>
+ The preferred method of initializing structures is to use
+ designated initialisers, as defined by ISO C99, eg:
+ </para>
+ <programlisting>
+static struct block_device_operations opt_fops = {
+ .open = opt_open,
+ .release = opt_release,
+ .ioctl = opt_ioctl,
+ .check_media_change = opt_media_change,
+};
+ </programlisting>
+ <para>
+ This makes it easy to grep for, and makes it clear which
+ structure fields are set. You should do this because it looks
+ cool.
+ </para>
+ </sect1>
+
+ <sect1 id="conventions-gnu-extns">
+ <title>GNU Extensions</title>
+
+ <para>
+ GNU Extensions are explicitly allowed in the Linux kernel.
+ Note that some of the more complex ones are not very well
+ supported, due to lack of general use, but the following are
+ considered standard (see the GCC info page section "C
+ Extensions" for more details - Yes, really the info page, the
+ man page is only a short summary of the stuff in info).
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ Inline functions
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Statement expressions (ie. the ({ and }) constructs).
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Declaring attributes of a function / variable / type
+ (__attribute__)
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ typeof
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Zero length arrays
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Macro varargs
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Arithmetic on void pointers
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Non-Constant initializers
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Assembler Instructions (not outside arch/ and include/asm/)
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Function names as strings (__func__).
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ __builtin_constant_p()
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ Be wary when using long long in the kernel, the code gcc generates for
+ it is horrible and worse: division and multiplication does not work
+ on i386 because the GCC runtime functions for it are missing from
+ the kernel environment.
+ </para>
+
+ <!-- FIXME: add a note about ANSI aliasing cleanness -->
+ </sect1>
+
+ <sect1 id="conventions-cplusplus">
+ <title>C++</title>
+
+ <para>
+ Using C++ in the kernel is usually a bad idea, because the
+ kernel does not provide the necessary runtime environment
+ and the include files are not tested for it. It is still
+ possible, but not recommended. If you really want to do
+ this, forget about exceptions at least.
+ </para>
+ </sect1>
+
+ <sect1 id="conventions-ifdef">
+ <title>&num;if</title>
+
+ <para>
+ It is generally considered cleaner to use macros in header files
+ (or at the top of .c files) to abstract away functions rather than
+ using `#if' pre-processor statements throughout the source code.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="submitting">
+ <title>Putting Your Stuff in the Kernel</title>
+
+ <para>
+ In order to get your stuff into shape for official inclusion, or
+ even to make a neat patch, there's administrative work to be
+ done:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ Figure out whose pond you've been pissing in. Look at the top of
+ the source files, inside the <filename>MAINTAINERS</filename>
+ file, and last of all in the <filename>CREDITS</filename> file.
+ You should coordinate with this person to make sure you're not
+ duplicating effort, or trying something that's already been
+ rejected.
+ </para>
+
+ <para>
+ Make sure you put your name and EMail address at the top of
+ any files you create or mangle significantly. This is the
+ first place people will look when they find a bug, or when
+ <emphasis>they</emphasis> want to make a change.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Usually you want a configuration option for your kernel hack.
+ Edit <filename>Kconfig</filename> in the appropriate directory.
+ The Config language is simple to use by cut and paste, and there's
+ complete documentation in
+ <filename>Documentation/kbuild/kconfig-language.txt</filename>.
+ </para>
+
+ <para>
+ You may well want to make your CONFIG option only visible if
+ <symbol>CONFIG_EXPERIMENTAL</symbol> is enabled: this serves as a
+ warning to users. There many other fancy things you can do: see
+ the various <filename>Kconfig</filename> files for ideas.
+ </para>
+
+ <para>
+ In your description of the option, make sure you address both the
+ expert user and the user who knows nothing about your feature. Mention
+ incompatibilities and issues here. <emphasis> Definitely
+ </emphasis> end your description with <quote> if in doubt, say N
+ </quote> (or, occasionally, `Y'); this is for people who have no
+ idea what you are talking about.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Edit the <filename>Makefile</filename>: the CONFIG variables are
+ exported here so you can usually just add a "obj-$(CONFIG_xxx) +=
+ xxx.o" line. The syntax is documented in
+ <filename>Documentation/kbuild/makefiles.txt</filename>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Put yourself in <filename>CREDITS</filename> if you've done
+ something noteworthy, usually beyond a single file (your name
+ should be at the top of the source files anyway).
+ <filename>MAINTAINERS</filename> means you want to be consulted
+ when changes are made to a subsystem, and hear about bugs; it
+ implies a more-than-passing commitment to some part of the code.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Finally, don't forget to read <filename>Documentation/SubmittingPatches</filename>
+ and possibly <filename>Documentation/SubmittingDrivers</filename>.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </chapter>
+
+ <chapter id="cantrips">
+ <title>Kernel Cantrips</title>
+
+ <para>
+ Some favorites from browsing the source. Feel free to add to this
+ list.
+ </para>
+
+ <para>
+ <filename>arch/x86/include/asm/delay.h:</filename>
+ </para>
+ <programlisting>
+#define ndelay(n) (__builtin_constant_p(n) ? \
+ ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
+ __ndelay(n))
+ </programlisting>
+
+ <para>
+ <filename>include/linux/fs.h</filename>:
+ </para>
+ <programlisting>
+/*
+ * Kernel pointers have redundant information, so we can use a
+ * scheme where we can return either an error code or a dentry
+ * pointer with the same return value.
+ *
+ * This should be a per-architecture thing, to allow different
+ * error and pointer decisions.
+ */
+ #define ERR_PTR(err) ((void *)((long)(err)))
+ #define PTR_ERR(ptr) ((long)(ptr))
+ #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000))
+</programlisting>
+
+ <para>
+ <filename>arch/x86/include/asm/uaccess_32.h:</filename>
+ </para>
+
+ <programlisting>
+#define copy_to_user(to,from,n) \
+ (__builtin_constant_p(n) ? \
+ __constant_copy_to_user((to),(from),(n)) : \
+ __generic_copy_to_user((to),(from),(n)))
+ </programlisting>
+
+ <para>
+ <filename>arch/sparc/kernel/head.S:</filename>
+ </para>
+
+ <programlisting>
+/*
+ * Sun people can't spell worth damn. "compatability" indeed.
+ * At least we *know* we can't spell, and use a spell-checker.
+ */
+
+/* Uh, actually Linus it is I who cannot spell. Too much murky
+ * Sparc assembly will do this to ya.
+ */
+C_LABEL(cputypvar):
+ .asciz "compatability"
+
+/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */
+ .align 4
+C_LABEL(cputypvar_sun4m):
+ .asciz "compatible"
+ </programlisting>
+
+ <para>
+ <filename>arch/sparc/lib/checksum.S:</filename>
+ </para>
+
+ <programlisting>
+ /* Sun, you just can't beat me, you just can't. Stop trying,
+ * give up. I'm serious, I am going to kick the living shit
+ * out of you, game over, lights out.
+ */
+ </programlisting>
+ </chapter>
+
+ <chapter id="credits">
+ <title>Thanks</title>
+
+ <para>
+ Thanks to Andi Kleen for the idea, answering my questions, fixing
+ my mistakes, filling content, etc. Philipp Rumpf for more spelling
+ and clarity fixes, and some excellent non-obvious points. Werner
+ Almesberger for giving me a great summary of
+ <function>disable_irq()</function>, and Jes Sorensen and Andrea
+ Arcangeli added caveats. Michael Elizabeth Chastain for checking
+ and adding to the Configure section. <!-- Rusty insisted on this
+ bit; I didn't do it! --> Telsa Gwynne for teaching me DocBook.
+ </para>
+ </chapter>
+</book>
+
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
new file mode 100644
index 0000000..084f6ad
--- /dev/null
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -0,0 +1,2143 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LKLockingGuide">
+ <bookinfo>
+ <title>Unreliable Guide To Locking</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Rusty</firstname>
+ <surname>Russell</surname>
+ <affiliation>
+ <address>
+ <email>rusty@rustcorp.com.au</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2003</year>
+ <holder>Rusty Russell</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ Welcome, to Rusty's Remarkably Unreliable Guide to Kernel
+ Locking issues. This document describes the locking systems in
+ the Linux Kernel in 2.6.
+ </para>
+ <para>
+ With the wide availability of HyperThreading, and <firstterm
+ linkend="gloss-preemption">preemption </firstterm> in the Linux
+ Kernel, everyone hacking on the kernel needs to know the
+ fundamentals of concurrency and locking for
+ <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>.
+ </para>
+ </chapter>
+
+ <chapter id="races">
+ <title>The Problem With Concurrency</title>
+ <para>
+ (Skip this if you know what a Race Condition is).
+ </para>
+ <para>
+ In a normal program, you can increment a counter like so:
+ </para>
+ <programlisting>
+ very_important_count++;
+ </programlisting>
+
+ <para>
+ This is what they would expect to happen:
+ </para>
+
+ <table>
+ <title>Expected Results</title>
+
+ <tgroup cols="2" align="left">
+
+ <thead>
+ <row>
+ <entry>Instance 1</entry>
+ <entry>Instance 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>read very_important_count (5)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry>add 1 (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry>write very_important_count (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>read very_important_count (6)</entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>add 1 (7)</entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>write very_important_count (7)</entry>
+ </row>
+ </tbody>
+
+ </tgroup>
+ </table>
+
+ <para>
+ This is what might happen:
+ </para>
+
+ <table>
+ <title>Possible Results</title>
+
+ <tgroup cols="2" align="left">
+ <thead>
+ <row>
+ <entry>Instance 1</entry>
+ <entry>Instance 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>read very_important_count (5)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>read very_important_count (5)</entry>
+ </row>
+ <row>
+ <entry>add 1 (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>add 1 (6)</entry>
+ </row>
+ <row>
+ <entry>write very_important_count (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>write very_important_count (6)</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <sect1 id="race-condition">
+ <title>Race Conditions and Critical Regions</title>
+ <para>
+ This overlap, where the result depends on the
+ relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>.
+ The piece of code containing the concurrency issue is called a
+ <firstterm>critical region</firstterm>. And especially since Linux starting running
+ on SMP machines, they became one of the major issues in kernel
+ design and implementation.
+ </para>
+ <para>
+ Preemption can have the same effect, even if there is only one
+ CPU: by preempting one task during the critical region, we have
+ exactly the same race condition. In this case the thread which
+ preempts might run the critical region itself.
+ </para>
+ <para>
+ The solution is to recognize when these simultaneous accesses
+ occur, and use locks to make sure that only one instance can
+ enter the critical region at any time. There are many
+ friendly primitives in the Linux kernel to help you do this.
+ And then there are the unfriendly primitives, but I'll pretend
+ they don't exist.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="locks">
+ <title>Locking in the Linux Kernel</title>
+
+ <para>
+ If I could give you one piece of advice: never sleep with anyone
+ crazier than yourself. But if I had to give you advice on
+ locking: <emphasis>keep it simple</emphasis>.
+ </para>
+
+ <para>
+ Be reluctant to introduce new locks.
+ </para>
+
+ <para>
+ Strangely enough, this last one is the exact reverse of my advice when
+ you <emphasis>have</emphasis> slept with someone crazier than yourself.
+ And you should think about getting a big dog.
+ </para>
+
+ <sect1 id="lock-intro">
+ <title>Two Main Types of Kernel Locks: Spinlocks and Mutexes</title>
+
+ <para>
+ There are two main types of kernel locks. The fundamental type
+ is the spinlock
+ (<filename class="headerfile">include/asm/spinlock.h</filename>),
+ which is a very simple single-holder lock: if you can't get the
+ spinlock, you keep trying (spinning) until you can. Spinlocks are
+ very small and fast, and can be used anywhere.
+ </para>
+ <para>
+ The second type is a mutex
+ (<filename class="headerfile">include/linux/mutex.h</filename>): it
+ is like a spinlock, but you may block holding a mutex.
+ If you can't lock a mutex, your task will suspend itself, and be woken
+ up when the mutex is released. This means the CPU can do something
+ else while you are waiting. There are many cases when you simply
+ can't sleep (see <xref linkend="sleeping-things"/>), and so have to
+ use a spinlock instead.
+ </para>
+ <para>
+ Neither type of lock is recursive: see
+ <xref linkend="deadlock"/>.
+ </para>
+ </sect1>
+
+ <sect1 id="uniprocessor">
+ <title>Locks and Uniprocessor Kernels</title>
+
+ <para>
+ For kernels compiled without <symbol>CONFIG_SMP</symbol>, and
+ without <symbol>CONFIG_PREEMPT</symbol> spinlocks do not exist at
+ all. This is an excellent design decision: when no-one else can
+ run at the same time, there is no reason to have a lock.
+ </para>
+
+ <para>
+ If the kernel is compiled without <symbol>CONFIG_SMP</symbol>,
+ but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks
+ simply disable preemption, which is sufficient to prevent any
+ races. For most purposes, we can think of preemption as
+ equivalent to SMP, and not worry about it separately.
+ </para>
+
+ <para>
+ You should always test your locking code with <symbol>CONFIG_SMP</symbol>
+ and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it
+ will still catch some kinds of locking bugs.
+ </para>
+
+ <para>
+ Mutexes still exist, because they are required for
+ synchronization between <firstterm linkend="gloss-usercontext">user
+ contexts</firstterm>, as we will see below.
+ </para>
+ </sect1>
+
+ <sect1 id="usercontextlocking">
+ <title>Locking Only In User Context</title>
+
+ <para>
+ If you have a data structure which is only ever accessed from
+ user context, then you can use a simple mutex
+ (<filename>include/linux/mutex.h</filename>) to protect it. This
+ is the most trivial case: you initialize the mutex. Then you can
+ call <function>mutex_lock_interruptible()</function> to grab the mutex,
+ and <function>mutex_unlock()</function> to release it. There is also a
+ <function>mutex_lock()</function>, which should be avoided, because it
+ will not return if a signal is received.
+ </para>
+
+ <para>
+ Example: <filename>net/netfilter/nf_sockopt.c</filename> allows
+ registration of new <function>setsockopt()</function> and
+ <function>getsockopt()</function> calls, with
+ <function>nf_register_sockopt()</function>. Registration and
+ de-registration are only done on module load and unload (and boot
+ time, where there is no concurrency), and the list of registrations
+ is only consulted for an unknown <function>setsockopt()</function>
+ or <function>getsockopt()</function> system call. The
+ <varname>nf_sockopt_mutex</varname> is perfect to protect this,
+ especially since the setsockopt and getsockopt calls may well
+ sleep.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-bh">
+ <title>Locking Between User Context and Softirqs</title>
+
+ <para>
+ If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares
+ data with user context, you have two problems. Firstly, the current
+ user context can be interrupted by a softirq, and secondly, the
+ critical region could be entered from another CPU. This is where
+ <function>spin_lock_bh()</function>
+ (<filename class="headerfile">include/linux/spinlock.h</filename>) is
+ used. It disables softirqs on that CPU, then grabs the lock.
+ <function>spin_unlock_bh()</function> does the reverse. (The
+ '_bh' suffix is a historical reference to "Bottom Halves", the
+ old name for software interrupts. It should really be
+ called spin_lock_softirq()' in a perfect world).
+ </para>
+
+ <para>
+ Note that you can also use <function>spin_lock_irq()</function>
+ or <function>spin_lock_irqsave()</function> here, which stop
+ hardware interrupts as well: see <xref linkend="hardirq-context"/>.
+ </para>
+
+ <para>
+ This works perfectly for <firstterm linkend="gloss-up"><acronym>UP
+ </acronym></firstterm> as well: the spin lock vanishes, and this macro
+ simply becomes <function>local_bh_disable()</function>
+ (<filename class="headerfile">include/linux/interrupt.h</filename>), which
+ protects you from the softirq being run.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-tasklet">
+ <title>Locking Between User Context and Tasklets</title>
+
+ <para>
+ This is exactly the same as above, because <firstterm
+ linkend="gloss-tasklet">tasklets</firstterm> are actually run
+ from a softirq.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-timers">
+ <title>Locking Between User Context and Timers</title>
+
+ <para>
+ This, too, is exactly the same as above, because <firstterm
+ linkend="gloss-timers">timers</firstterm> are actually run from
+ a softirq. From a locking point of view, tasklets and timers
+ are identical.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-tasklets">
+ <title>Locking Between Tasklets/Timers</title>
+
+ <para>
+ Sometimes a tasklet or timer might want to share data with
+ another tasklet or timer.
+ </para>
+
+ <sect2 id="lock-tasklets-same">
+ <title>The Same Tasklet/Timer</title>
+ <para>
+ Since a tasklet is never run on two CPUs at once, you don't
+ need to worry about your tasklet being reentrant (running
+ twice at once), even on SMP.
+ </para>
+ </sect2>
+
+ <sect2 id="lock-tasklets-different">
+ <title>Different Tasklets/Timers</title>
+ <para>
+ If another tasklet/timer wants
+ to share data with your tasklet or timer , you will both need to use
+ <function>spin_lock()</function> and
+ <function>spin_unlock()</function> calls.
+ <function>spin_lock_bh()</function> is
+ unnecessary here, as you are already in a tasklet, and
+ none will be run on the same CPU.
+ </para>
+ </sect2>
+ </sect1>
+
+ <sect1 id="lock-softirqs">
+ <title>Locking Between Softirqs</title>
+
+ <para>
+ Often a softirq might
+ want to share data with itself or a tasklet/timer.
+ </para>
+
+ <sect2 id="lock-softirqs-same">
+ <title>The Same Softirq</title>
+
+ <para>
+ The same softirq can run on the other CPUs: you can use a
+ per-CPU array (see <xref linkend="per-cpu"/>) for better
+ performance. If you're going so far as to use a softirq,
+ you probably care about scalable performance enough
+ to justify the extra complexity.
+ </para>
+
+ <para>
+ You'll need to use <function>spin_lock()</function> and
+ <function>spin_unlock()</function> for shared data.
+ </para>
+ </sect2>
+
+ <sect2 id="lock-softirqs-different">
+ <title>Different Softirqs</title>
+
+ <para>
+ You'll need to use <function>spin_lock()</function> and
+ <function>spin_unlock()</function> for shared data, whether it
+ be a timer, tasklet, different softirq or the same or another
+ softirq: any of them could be running on a different CPU.
+ </para>
+ </sect2>
+ </sect1>
+ </chapter>
+
+ <chapter id="hardirq-context">
+ <title>Hard IRQ Context</title>
+
+ <para>
+ Hardware interrupts usually communicate with a
+ tasklet or softirq. Frequently this involves putting work in a
+ queue, which the softirq will take out.
+ </para>
+
+ <sect1 id="hardirq-softirq">
+ <title>Locking Between Hard IRQ and Softirqs/Tasklets</title>
+
+ <para>
+ If a hardware irq handler shares data with a softirq, you have
+ two concerns. Firstly, the softirq processing can be
+ interrupted by a hardware interrupt, and secondly, the
+ critical region could be entered by a hardware interrupt on
+ another CPU. This is where <function>spin_lock_irq()</function> is
+ used. It is defined to disable interrupts on that cpu, then grab
+ the lock. <function>spin_unlock_irq()</function> does the reverse.
+ </para>
+
+ <para>
+ The irq handler does not to use
+ <function>spin_lock_irq()</function>, because the softirq cannot
+ run while the irq handler is running: it can use
+ <function>spin_lock()</function>, which is slightly faster. The
+ only exception would be if a different hardware irq handler uses
+ the same lock: <function>spin_lock_irq()</function> will stop
+ that from interrupting us.
+ </para>
+
+ <para>
+ This works perfectly for UP as well: the spin lock vanishes,
+ and this macro simply becomes <function>local_irq_disable()</function>
+ (<filename class="headerfile">include/asm/smp.h</filename>), which
+ protects you from the softirq/tasklet/BH being run.
+ </para>
+
+ <para>
+ <function>spin_lock_irqsave()</function>
+ (<filename>include/linux/spinlock.h</filename>) is a variant
+ which saves whether interrupts were on or off in a flags word,
+ which is passed to <function>spin_unlock_irqrestore()</function>. This
+ means that the same code can be used inside an hard irq handler (where
+ interrupts are already off) and in softirqs (where the irq
+ disabling is required).
+ </para>
+
+ <para>
+ Note that softirqs (and hence tasklets and timers) are run on
+ return from hardware interrupts, so
+ <function>spin_lock_irq()</function> also stops these. In that
+ sense, <function>spin_lock_irqsave()</function> is the most
+ general and powerful locking function.
+ </para>
+
+ </sect1>
+ <sect1 id="hardirq-hardirq">
+ <title>Locking Between Two Hard IRQ Handlers</title>
+ <para>
+ It is rare to have to share data between two IRQ handlers, but
+ if you do, <function>spin_lock_irqsave()</function> should be
+ used: it is architecture-specific whether all interrupts are
+ disabled inside irq handlers themselves.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="cheatsheet">
+ <title>Cheat Sheet For Locking</title>
+ <para>
+ Pete Zaitcev gives the following summary:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ If you are in a process context (any syscall) and want to
+ lock other process out, use a mutex. You can take a mutex
+ and sleep (<function>copy_from_user*(</function> or
+ <function>kmalloc(x,GFP_KERNEL)</function>).
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Otherwise (== data can be touched in an interrupt), use
+ <function>spin_lock_irqsave()</function> and
+ <function>spin_unlock_irqrestore()</function>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Avoid holding spinlock for more than 5 lines of code and
+ across any function call (except accessors like
+ <function>readb</function>).
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <sect1 id="minimum-lock-reqirements">
+ <title>Table of Minimum Requirements</title>
+
+ <para> The following table lists the <emphasis>minimum</emphasis>
+ locking requirements between various contexts. In some cases,
+ the same context can only be running on one CPU at a time, so
+ no locking is required for that context (eg. a particular
+ thread can only run on one CPU at a time, but if it needs
+ shares data with another thread, locking is required).
+ </para>
+ <para>
+ Remember the advice above: you can always use
+ <function>spin_lock_irqsave()</function>, which is a superset
+ of all other spinlock primitives.
+ </para>
+
+ <table>
+<title>Table of Locking Requirements</title>
+<tgroup cols="11">
+<tbody>
+
+<row>
+<entry></entry>
+<entry>IRQ Handler A</entry>
+<entry>IRQ Handler B</entry>
+<entry>Softirq A</entry>
+<entry>Softirq B</entry>
+<entry>Tasklet A</entry>
+<entry>Tasklet B</entry>
+<entry>Timer A</entry>
+<entry>Timer B</entry>
+<entry>User Context A</entry>
+<entry>User Context B</entry>
+</row>
+
+<row>
+<entry>IRQ Handler A</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>IRQ Handler B</entry>
+<entry>SLIS</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Softirq A</entry>
+<entry>SLI</entry>
+<entry>SLI</entry>
+<entry>SL</entry>
+</row>
+
+<row>
+<entry>Softirq B</entry>
+<entry>SLI</entry>
+<entry>SLI</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+</row>
+
+<row>
+<entry>Tasklet A</entry>
+<entry>SLI</entry>
+<entry>SLI</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Tasklet B</entry>
+<entry>SLI</entry>
+<entry>SLI</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Timer A</entry>
+<entry>SLI</entry>
+<entry>SLI</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Timer B</entry>
+<entry>SLI</entry>
+<entry>SLI</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>SL</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>User Context A</entry>
+<entry>SLI</entry>
+<entry>SLI</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>User Context B</entry>
+<entry>SLI</entry>
+<entry>SLI</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>SLBH</entry>
+<entry>MLI</entry>
+<entry>None</entry>
+</row>
+
+</tbody>
+</tgroup>
+</table>
+
+ <table>
+<title>Legend for Locking Requirements Table</title>
+<tgroup cols="2">
+<tbody>
+
+<row>
+<entry>SLIS</entry>
+<entry>spin_lock_irqsave</entry>
+</row>
+<row>
+<entry>SLI</entry>
+<entry>spin_lock_irq</entry>
+</row>
+<row>
+<entry>SL</entry>
+<entry>spin_lock</entry>
+</row>
+<row>
+<entry>SLBH</entry>
+<entry>spin_lock_bh</entry>
+</row>
+<row>
+<entry>MLI</entry>
+<entry>mutex_lock_interruptible</entry>
+</row>
+
+</tbody>
+</tgroup>
+</table>
+
+</sect1>
+</chapter>
+
+<chapter id="trylock-functions">
+ <title>The trylock Functions</title>
+ <para>
+ There are functions that try to acquire a lock only once and immediately
+ return a value telling about success or failure to acquire the lock.
+ They can be used if you need no access to the data protected with the lock
+ when some other thread is holding the lock. You should acquire the lock
+ later if you then need access to the data protected with the lock.
+ </para>
+
+ <para>
+ <function>spin_trylock()</function> does not spin but returns non-zero if
+ it acquires the spinlock on the first try or 0 if not. This function can
+ be used in all contexts like <function>spin_lock</function>: you must have
+ disabled the contexts that might interrupt you and acquire the spin lock.
+ </para>
+
+ <para>
+ <function>mutex_trylock()</function> does not suspend your task
+ but returns non-zero if it could lock the mutex on the first try
+ or 0 if not. This function cannot be safely used in hardware or software
+ interrupt contexts despite not sleeping.
+ </para>
+</chapter>
+
+ <chapter id="Examples">
+ <title>Common Examples</title>
+ <para>
+Let's step through a simple example: a cache of number to name
+mappings. The cache keeps a count of how often each of the objects is
+used, and when it gets full, throws out the least used one.
+
+ </para>
+
+ <sect1 id="examples-usercontext">
+ <title>All In User Context</title>
+ <para>
+For our first example, we assume that all operations are in user
+context (ie. from system calls), so we can sleep. This means we can
+use a mutex to protect the cache and all the objects within
+it. Here's the code:
+ </para>
+
+ <programlisting>
+#include &lt;linux/list.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/string.h&gt;
+#include &lt;linux/mutex.h&gt;
+#include &lt;asm/errno.h&gt;
+
+struct object
+{
+ struct list_head list;
+ int id;
+ char name[32];
+ int popularity;
+};
+
+/* Protects the cache, cache_num, and the objects within it */
+static DEFINE_MUTEX(cache_lock);
+static LIST_HEAD(cache);
+static unsigned int cache_num = 0;
+#define MAX_CACHE_SIZE 10
+
+/* Must be holding cache_lock */
+static struct object *__cache_find(int id)
+{
+ struct object *i;
+
+ list_for_each_entry(i, &amp;cache, list)
+ if (i-&gt;id == id) {
+ i-&gt;popularity++;
+ return i;
+ }
+ return NULL;
+}
+
+/* Must be holding cache_lock */
+static void __cache_delete(struct object *obj)
+{
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
+ kfree(obj);
+ cache_num--;
+}
+
+/* Must be holding cache_lock */
+static void __cache_add(struct object *obj)
+{
+ list_add(&amp;obj-&gt;list, &amp;cache);
+ if (++cache_num > MAX_CACHE_SIZE) {
+ struct object *i, *outcast = NULL;
+ list_for_each_entry(i, &amp;cache, list) {
+ if (!outcast || i-&gt;popularity &lt; outcast-&gt;popularity)
+ outcast = i;
+ }
+ __cache_delete(outcast);
+ }
+}
+
+int cache_add(int id, const char *name)
+{
+ struct object *obj;
+
+ if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+
+ mutex_lock(&amp;cache_lock);
+ __cache_add(obj);
+ mutex_unlock(&amp;cache_lock);
+ return 0;
+}
+
+void cache_delete(int id)
+{
+ mutex_lock(&amp;cache_lock);
+ __cache_delete(__cache_find(id));
+ mutex_unlock(&amp;cache_lock);
+}
+
+int cache_find(int id, char *name)
+{
+ struct object *obj;
+ int ret = -ENOENT;
+
+ mutex_lock(&amp;cache_lock);
+ obj = __cache_find(id);
+ if (obj) {
+ ret = 0;
+ strcpy(name, obj-&gt;name);
+ }
+ mutex_unlock(&amp;cache_lock);
+ return ret;
+}
+</programlisting>
+
+ <para>
+Note that we always make sure we have the cache_lock when we add,
+delete, or look up the cache: both the cache infrastructure itself and
+the contents of the objects are protected by the lock. In this case
+it's easy, since we copy the data for the user, and never let them
+access the objects directly.
+ </para>
+ <para>
+There is a slight (and common) optimization here: in
+<function>cache_add</function> we set up the fields of the object
+before grabbing the lock. This is safe, as no-one else can access it
+until we put it in cache.
+ </para>
+ </sect1>
+
+ <sect1 id="examples-interrupt">
+ <title>Accessing From Interrupt Context</title>
+ <para>
+Now consider the case where <function>cache_find</function> can be
+called from interrupt context: either a hardware interrupt or a
+softirq. An example would be a timer which deletes object from the
+cache.
+ </para>
+ <para>
+The change is shown below, in standard patch format: the
+<symbol>-</symbol> are lines which are taken away, and the
+<symbol>+</symbol> are lines which are added.
+ </para>
+<programlisting>
+--- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100
++++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100
+@@ -12,7 +12,7 @@
+ int popularity;
+ };
+
+-static DEFINE_MUTEX(cache_lock);
++static DEFINE_SPINLOCK(cache_lock);
+ static LIST_HEAD(cache);
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+@@ -55,6 +55,7 @@
+ int cache_add(int id, const char *name)
+ {
+ struct object *obj;
++ unsigned long flags;
+
+ if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+@@ -63,30 +64,33 @@
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+
+- mutex_lock(&amp;cache_lock);
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+- mutex_unlock(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return 0;
+ }
+
+ void cache_delete(int id)
+ {
+- mutex_lock(&amp;cache_lock);
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_delete(__cache_find(id));
+- mutex_unlock(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ }
+
+ int cache_find(int id, char *name)
+ {
+ struct object *obj;
+ int ret = -ENOENT;
++ unsigned long flags;
+
+- mutex_lock(&amp;cache_lock);
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+ if (obj) {
+ ret = 0;
+ strcpy(name, obj-&gt;name);
+ }
+- mutex_unlock(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return ret;
+ }
+</programlisting>
+
+ <para>
+Note that the <function>spin_lock_irqsave</function> will turn off
+interrupts if they are on, otherwise does nothing (if we are already
+in an interrupt handler), hence these functions are safe to call from
+any context.
+ </para>
+ <para>
+Unfortunately, <function>cache_add</function> calls
+<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol>
+flag, which is only legal in user context. I have assumed that
+<function>cache_add</function> is still only called in user context,
+otherwise this should become a parameter to
+<function>cache_add</function>.
+ </para>
+ </sect1>
+ <sect1 id="examples-refcnt">
+ <title>Exposing Objects Outside This File</title>
+ <para>
+If our objects contained more information, it might not be sufficient
+to copy the information in and out: other parts of the code might want
+to keep pointers to these objects, for example, rather than looking up
+the id every time. This produces two problems.
+ </para>
+ <para>
+The first problem is that we use the <symbol>cache_lock</symbol> to
+protect objects: we'd need to make this non-static so the rest of the
+code can use it. This makes locking trickier, as it is no longer all
+in one place.
+ </para>
+ <para>
+The second problem is the lifetime problem: if another structure keeps
+a pointer to an object, it presumably expects that pointer to remain
+valid. Unfortunately, this is only guaranteed while you hold the
+lock, otherwise someone might call <function>cache_delete</function>
+and even worse, add another object, re-using the same address.
+ </para>
+ <para>
+As there is only one lock, you can't hold it forever: no-one else would
+get any work done.
+ </para>
+ <para>
+The solution to this problem is to use a reference count: everyone who
+has a pointer to the object increases it when they first get the
+object, and drops the reference count when they're finished with it.
+Whoever drops it to zero knows it is unused, and can actually delete it.
+ </para>
+ <para>
+Here is the code:
+ </para>
+
+<programlisting>
+--- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100
++++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100
+@@ -7,6 +7,7 @@
+ struct object
+ {
+ struct list_head list;
++ unsigned int refcnt;
+ int id;
+ char name[32];
+ int popularity;
+@@ -17,6 +18,35 @@
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+
++static void __object_put(struct object *obj)
++{
++ if (--obj-&gt;refcnt == 0)
++ kfree(obj);
++}
++
++static void __object_get(struct object *obj)
++{
++ obj-&gt;refcnt++;
++}
++
++void object_put(struct object *obj)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
++ __object_put(obj);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
++}
++
++void object_get(struct object *obj)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
++ __object_get(obj);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
++}
++
+ /* Must be holding cache_lock */
+ static struct object *__cache_find(int id)
+ {
+@@ -35,6 +65,7 @@
+ {
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
++ __object_put(obj);
+ cache_num--;
+ }
+
+@@ -63,6 +94,7 @@
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
++ obj-&gt;refcnt = 1; /* The cache holds a reference */
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+@@ -79,18 +111,15 @@
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ }
+
+-int cache_find(int id, char *name)
++struct object *cache_find(int id)
+ {
+ struct object *obj;
+- int ret = -ENOENT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+- if (obj) {
+- ret = 0;
+- strcpy(name, obj-&gt;name);
+- }
++ if (obj)
++ __object_get(obj);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+- return ret;
++ return obj;
+ }
+</programlisting>
+
+<para>
+We encapsulate the reference counting in the standard 'get' and 'put'
+functions. Now we can return the object itself from
+<function>cache_find</function> which has the advantage that the user
+can now sleep holding the object (eg. to
+<function>copy_to_user</function> to name to userspace).
+</para>
+<para>
+The other point to note is that I said a reference should be held for
+every pointer to the object: thus the reference count is 1 when first
+inserted into the cache. In some versions the framework does not hold
+a reference count, but they are more complicated.
+</para>
+
+ <sect2 id="examples-refcnt-atomic">
+ <title>Using Atomic Operations For The Reference Count</title>
+<para>
+In practice, <type>atomic_t</type> would usually be used for
+<structfield>refcnt</structfield>. There are a number of atomic
+operations defined in
+
+<filename class="headerfile">include/asm/atomic.h</filename>: these are
+guaranteed to be seen atomically from all CPUs in the system, so no
+lock is required. In this case, it is simpler than using spinlocks,
+although for anything non-trivial using spinlocks is clearer. The
+<function>atomic_inc</function> and
+<function>atomic_dec_and_test</function> are used instead of the
+standard increment and decrement operators, and the lock is no longer
+used to protect the reference count itself.
+</para>
+
+<programlisting>
+--- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100
++++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100
+@@ -7,7 +7,7 @@
+ struct object
+ {
+ struct list_head list;
+- unsigned int refcnt;
++ atomic_t refcnt;
+ int id;
+ char name[32];
+ int popularity;
+@@ -18,33 +18,15 @@
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+
+-static void __object_put(struct object *obj)
+-{
+- if (--obj-&gt;refcnt == 0)
+- kfree(obj);
+-}
+-
+-static void __object_get(struct object *obj)
+-{
+- obj-&gt;refcnt++;
+-}
+-
+ void object_put(struct object *obj)
+ {
+- unsigned long flags;
+-
+- spin_lock_irqsave(&amp;cache_lock, flags);
+- __object_put(obj);
+- spin_unlock_irqrestore(&amp;cache_lock, flags);
++ if (atomic_dec_and_test(&amp;obj-&gt;refcnt))
++ kfree(obj);
+ }
+
+ void object_get(struct object *obj)
+ {
+- unsigned long flags;
+-
+- spin_lock_irqsave(&amp;cache_lock, flags);
+- __object_get(obj);
+- spin_unlock_irqrestore(&amp;cache_lock, flags);
++ atomic_inc(&amp;obj-&gt;refcnt);
+ }
+
+ /* Must be holding cache_lock */
+@@ -65,7 +47,7 @@
+ {
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
+- __object_put(obj);
++ object_put(obj);
+ cache_num--;
+ }
+
+@@ -94,7 +76,7 @@
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+- obj-&gt;refcnt = 1; /* The cache holds a reference */
++ atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+@@ -119,7 +101,7 @@
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+ if (obj)
+- __object_get(obj);
++ object_get(obj);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return obj;
+ }
+</programlisting>
+</sect2>
+</sect1>
+
+ <sect1 id="examples-lock-per-obj">
+ <title>Protecting The Objects Themselves</title>
+ <para>
+In these examples, we assumed that the objects (except the reference
+counts) never changed once they are created. If we wanted to allow
+the name to change, there are three possibilities:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+You can make <symbol>cache_lock</symbol> non-static, and tell people
+to grab that lock before changing the name in any object.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+You can provide a <function>cache_obj_rename</function> which grabs
+this lock and changes the name for the caller, and tell everyone to
+use that function.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+You can make the <symbol>cache_lock</symbol> protect only the cache
+itself, and use another lock to protect the name.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+Theoretically, you can make the locks as fine-grained as one lock for
+every field, for every object. In practice, the most common variants
+are:
+</para>
+ <itemizedlist>
+ <listitem>
+ <para>
+One lock which protects the infrastructure (the <symbol>cache</symbol>
+list in this example) and all the objects. This is what we have done
+so far.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+One lock which protects the infrastructure (including the list
+pointers inside the objects), and one lock inside the object which
+protects the rest of that object.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+Multiple locks to protect the infrastructure (eg. one lock per hash
+chain), possibly with a separate per-object lock.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+<para>
+Here is the "lock-per-object" implementation:
+</para>
+<programlisting>
+--- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100
++++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
+@@ -6,11 +6,17 @@
+
+ struct object
+ {
++ /* These two protected by cache_lock. */
+ struct list_head list;
++ int popularity;
++
+ atomic_t refcnt;
++
++ /* Doesn't change once created. */
+ int id;
++
++ spinlock_t lock; /* Protects the name */
+ char name[32];
+- int popularity;
+ };
+
+ static DEFINE_SPINLOCK(cache_lock);
+@@ -77,6 +84,7 @@
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+ atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
++ spin_lock_init(&amp;obj-&gt;lock);
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+</programlisting>
+
+<para>
+Note that I decide that the <structfield>popularity</structfield>
+count should be protected by the <symbol>cache_lock</symbol> rather
+than the per-object lock: this is because it (like the
+<structname>struct list_head</structname> inside the object) is
+logically part of the infrastructure. This way, I don't need to grab
+the lock of every object in <function>__cache_add</function> when
+seeking the least popular.
+</para>
+
+<para>
+I also decided that the <structfield>id</structfield> member is
+unchangeable, so I don't need to grab each object lock in
+<function>__cache_find()</function> to examine the
+<structfield>id</structfield>: the object lock is only used by a
+caller who wants to read or write the <structfield>name</structfield>
+field.
+</para>
+
+<para>
+Note also that I added a comment describing what data was protected by
+which locks. This is extremely important, as it describes the runtime
+behavior of the code, and can be hard to gain from just reading. And
+as Alan Cox says, <quote>Lock data, not code</quote>.
+</para>
+</sect1>
+</chapter>
+
+ <chapter id="common-problems">
+ <title>Common Problems</title>
+ <sect1 id="deadlock">
+ <title>Deadlock: Simple and Advanced</title>
+
+ <para>
+ There is a coding bug where a piece of code tries to grab a
+ spinlock twice: it will spin forever, waiting for the lock to
+ be released (spinlocks, rwlocks and mutexes are not
+ recursive in Linux). This is trivial to diagnose: not a
+ stay-up-five-nights-talk-to-fluffy-code-bunnies kind of
+ problem.
+ </para>
+
+ <para>
+ For a slightly more complex case, imagine you have a region
+ shared by a softirq and user context. If you use a
+ <function>spin_lock()</function> call to protect it, it is
+ possible that the user context will be interrupted by the softirq
+ while it holds the lock, and the softirq will then spin
+ forever trying to get the same lock.
+ </para>
+
+ <para>
+ Both of these are called deadlock, and as shown above, it can
+ occur even with a single CPU (although not on UP compiles,
+ since spinlocks vanish on kernel compiles with
+ <symbol>CONFIG_SMP</symbol>=n. You'll still get data corruption
+ in the second example).
+ </para>
+
+ <para>
+ This complete lockup is easy to diagnose: on SMP boxes the
+ watchdog timer or compiling with <symbol>DEBUG_SPINLOCK</symbol> set
+ (<filename>include/linux/spinlock.h</filename>) will show this up
+ immediately when it happens.
+ </para>
+
+ <para>
+ A more complex problem is the so-called 'deadly embrace',
+ involving two or more locks. Say you have a hash table: each
+ entry in the table is a spinlock, and a chain of hashed
+ objects. Inside a softirq handler, you sometimes want to
+ alter an object from one place in the hash to another: you
+ grab the spinlock of the old hash chain and the spinlock of
+ the new hash chain, and delete the object from the old one,
+ and insert it in the new one.
+ </para>
+
+ <para>
+ There are two problems here. First, if your code ever
+ tries to move the object to the same chain, it will deadlock
+ with itself as it tries to lock it twice. Secondly, if the
+ same softirq on another CPU is trying to move another object
+ in the reverse direction, the following could happen:
+ </para>
+
+ <table>
+ <title>Consequences</title>
+
+ <tgroup cols="2" align="left">
+
+ <thead>
+ <row>
+ <entry>CPU 1</entry>
+ <entry>CPU 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>Grab lock A -&gt; OK</entry>
+ <entry>Grab lock B -&gt; OK</entry>
+ </row>
+ <row>
+ <entry>Grab lock B -&gt; spin</entry>
+ <entry>Grab lock A -&gt; spin</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <para>
+ The two CPUs will spin forever, waiting for the other to give up
+ their lock. It will look, smell, and feel like a crash.
+ </para>
+ </sect1>
+
+ <sect1 id="techs-deadlock-prevent">
+ <title>Preventing Deadlock</title>
+
+ <para>
+ Textbooks will tell you that if you always lock in the same
+ order, you will never get this kind of deadlock. Practice
+ will tell you that this approach doesn't scale: when I
+ create a new lock, I don't understand enough of the kernel
+ to figure out where in the 5000 lock hierarchy it will fit.
+ </para>
+
+ <para>
+ The best locks are encapsulated: they never get exposed in
+ headers, and are never held around calls to non-trivial
+ functions outside the same file. You can read through this
+ code and see that it will never deadlock, because it never
+ tries to grab another lock while it has that one. People
+ using your code don't even need to know you are using a
+ lock.
+ </para>
+
+ <para>
+ A classic problem here is when you provide callbacks or
+ hooks: if you call these with the lock held, you risk simple
+ deadlock, or a deadly embrace (who knows what the callback
+ will do?). Remember, the other programmers are out to get
+ you, so don't do this.
+ </para>
+
+ <sect2 id="techs-deadlock-overprevent">
+ <title>Overzealous Prevention Of Deadlocks</title>
+
+ <para>
+ Deadlocks are problematic, but not as bad as data
+ corruption. Code which grabs a read lock, searches a list,
+ fails to find what it wants, drops the read lock, grabs a
+ write lock and inserts the object has a race condition.
+ </para>
+
+ <para>
+ If you don't see why, please stay the fuck away from my code.
+ </para>
+ </sect2>
+ </sect1>
+
+ <sect1 id="racing-timers">
+ <title>Racing Timers: A Kernel Pastime</title>
+
+ <para>
+ Timers can produce their own special problems with races.
+ Consider a collection of objects (list, hash, etc) where each
+ object has a timer which is due to destroy it.
+ </para>
+
+ <para>
+ If you want to destroy the entire collection (say on module
+ removal), you might do the following:
+ </para>
+
+ <programlisting>
+ /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE
+ HUNGARIAN NOTATION */
+ spin_lock_bh(&amp;list_lock);
+
+ while (list) {
+ struct foo *next = list-&gt;next;
+ del_timer(&amp;list-&gt;timer);
+ kfree(list);
+ list = next;
+ }
+
+ spin_unlock_bh(&amp;list_lock);
+ </programlisting>
+
+ <para>
+ Sooner or later, this will crash on SMP, because a timer can
+ have just gone off before the <function>spin_lock_bh()</function>,
+ and it will only get the lock after we
+ <function>spin_unlock_bh()</function>, and then try to free
+ the element (which has already been freed!).
+ </para>
+
+ <para>
+ This can be avoided by checking the result of
+ <function>del_timer()</function>: if it returns
+ <returnvalue>1</returnvalue>, the timer has been deleted.
+ If <returnvalue>0</returnvalue>, it means (in this
+ case) that it is currently running, so we can do:
+ </para>
+
+ <programlisting>
+ retry:
+ spin_lock_bh(&amp;list_lock);
+
+ while (list) {
+ struct foo *next = list-&gt;next;
+ if (!del_timer(&amp;list-&gt;timer)) {
+ /* Give timer a chance to delete this */
+ spin_unlock_bh(&amp;list_lock);
+ goto retry;
+ }
+ kfree(list);
+ list = next;
+ }
+
+ spin_unlock_bh(&amp;list_lock);
+ </programlisting>
+
+ <para>
+ Another common problem is deleting timers which restart
+ themselves (by calling <function>add_timer()</function> at the end
+ of their timer function). Because this is a fairly common case
+ which is prone to races, you should use <function>del_timer_sync()</function>
+ (<filename class="headerfile">include/linux/timer.h</filename>)
+ to handle this case. It returns the number of times the timer
+ had to be deleted before we finally stopped it from adding itself back
+ in.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="Efficiency">
+ <title>Locking Speed</title>
+
+ <para>
+There are three main things to worry about when considering speed of
+some code which does locking. First is concurrency: how many things
+are going to be waiting while someone else is holding a lock. Second
+is the time taken to actually acquire and release an uncontended lock.
+Third is using fewer, or smarter locks. I'm assuming that the lock is
+used fairly often: otherwise, you wouldn't be concerned about
+efficiency.
+</para>
+ <para>
+Concurrency depends on how long the lock is usually held: you should
+hold the lock for as long as needed, but no longer. In the cache
+example, we always create the object without the lock held, and then
+grab the lock only when we are ready to insert it in the list.
+</para>
+ <para>
+Acquisition times depend on how much damage the lock operations do to
+the pipeline (pipeline stalls) and how likely it is that this CPU was
+the last one to grab the lock (ie. is the lock cache-hot for this
+CPU): on a machine with more CPUs, this likelihood drops fast.
+Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns,
+an atomic increment takes about 58ns, a lock which is cache-hot on
+this CPU takes 160ns, and a cacheline transfer from another CPU takes
+an additional 170 to 360ns. (These figures from Paul McKenney's
+<ulink url="http://www.linuxjournal.com/article.php?sid=6993"> Linux
+Journal RCU article</ulink>).
+</para>
+ <para>
+These two aims conflict: holding a lock for a short time might be done
+by splitting locks into parts (such as in our final per-object-lock
+example), but this increases the number of lock acquisitions, and the
+results are often slower than having a single lock. This is another
+reason to advocate locking simplicity.
+</para>
+ <para>
+The third concern is addressed below: there are some methods to reduce
+the amount of locking which needs to be done.
+</para>
+
+ <sect1 id="efficiency-rwlocks">
+ <title>Read/Write Lock Variants</title>
+
+ <para>
+ Both spinlocks and mutexes have read/write variants:
+ <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>.
+ These divide users into two classes: the readers and the writers. If
+ you are only reading the data, you can get a read lock, but to write to
+ the data you need the write lock. Many people can hold a read lock,
+ but a writer must be sole holder.
+ </para>
+
+ <para>
+ If your code divides neatly along reader/writer lines (as our
+ cache code does), and the lock is held by readers for
+ significant lengths of time, using these locks can help. They
+ are slightly slower than the normal locks though, so in practice
+ <type>rwlock_t</type> is not usually worthwhile.
+ </para>
+ </sect1>
+
+ <sect1 id="efficiency-read-copy-update">
+ <title>Avoiding Locks: Read Copy Update</title>
+
+ <para>
+ There is a special method of read/write locking called Read Copy
+ Update. Using RCU, the readers can avoid taking a lock
+ altogether: as we expect our cache to be read more often than
+ updated (otherwise the cache is a waste of time), it is a
+ candidate for this optimization.
+ </para>
+
+ <para>
+ How do we get rid of read locks? Getting rid of read locks
+ means that writers may be changing the list underneath the
+ readers. That is actually quite simple: we can read a linked
+ list while an element is being added if the writer adds the
+ element very carefully. For example, adding
+ <symbol>new</symbol> to a single linked list called
+ <symbol>list</symbol>:
+ </para>
+
+ <programlisting>
+ new-&gt;next = list-&gt;next;
+ wmb();
+ list-&gt;next = new;
+ </programlisting>
+
+ <para>
+ The <function>wmb()</function> is a write memory barrier. It
+ ensures that the first operation (setting the new element's
+ <symbol>next</symbol> pointer) is complete and will be seen by
+ all CPUs, before the second operation is (putting the new
+ element into the list). This is important, since modern
+ compilers and modern CPUs can both reorder instructions unless
+ told otherwise: we want a reader to either not see the new
+ element at all, or see the new element with the
+ <symbol>next</symbol> pointer correctly pointing at the rest of
+ the list.
+ </para>
+ <para>
+ Fortunately, there is a function to do this for standard
+ <structname>struct list_head</structname> lists:
+ <function>list_add_rcu()</function>
+ (<filename>include/linux/list.h</filename>).
+ </para>
+ <para>
+ Removing an element from the list is even simpler: we replace
+ the pointer to the old element with a pointer to its successor,
+ and readers will either see it, or skip over it.
+ </para>
+ <programlisting>
+ list-&gt;next = old-&gt;next;
+ </programlisting>
+ <para>
+ There is <function>list_del_rcu()</function>
+ (<filename>include/linux/list.h</filename>) which does this (the
+ normal version poisons the old object, which we don't want).
+ </para>
+ <para>
+ The reader must also be careful: some CPUs can look through the
+ <symbol>next</symbol> pointer to start reading the contents of
+ the next element early, but don't realize that the pre-fetched
+ contents is wrong when the <symbol>next</symbol> pointer changes
+ underneath them. Once again, there is a
+ <function>list_for_each_entry_rcu()</function>
+ (<filename>include/linux/list.h</filename>) to help you. Of
+ course, writers can just use
+ <function>list_for_each_entry()</function>, since there cannot
+ be two simultaneous writers.
+ </para>
+ <para>
+ Our final dilemma is this: when can we actually destroy the
+ removed element? Remember, a reader might be stepping through
+ this element in the list right now: if we free this element and
+ the <symbol>next</symbol> pointer changes, the reader will jump
+ off into garbage and crash. We need to wait until we know that
+ all the readers who were traversing the list when we deleted the
+ element are finished. We use <function>call_rcu()</function> to
+ register a callback which will actually destroy the object once
+ the readers are finished.
+ </para>
+ <para>
+ But how does Read Copy Update know when the readers are
+ finished? The method is this: firstly, the readers always
+ traverse the list inside
+ <function>rcu_read_lock()</function>/<function>rcu_read_unlock()</function>
+ pairs: these simply disable preemption so the reader won't go to
+ sleep while reading the list.
+ </para>
+ <para>
+ RCU then waits until every other CPU has slept at least once:
+ since readers cannot sleep, we know that any readers which were
+ traversing the list during the deletion are finished, and the
+ callback is triggered. The real Read Copy Update code is a
+ little more optimized than this, but this is the fundamental
+ idea.
+ </para>
+
+<programlisting>
+--- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
++++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100
+@@ -1,15 +1,18 @@
+ #include &lt;linux/list.h&gt;
+ #include &lt;linux/slab.h&gt;
+ #include &lt;linux/string.h&gt;
++#include &lt;linux/rcupdate.h&gt;
+ #include &lt;linux/mutex.h&gt;
+ #include &lt;asm/errno.h&gt;
+
+ struct object
+ {
+- /* These two protected by cache_lock. */
++ /* This is protected by RCU */
+ struct list_head list;
+ int popularity;
+
++ struct rcu_head rcu;
++
+ atomic_t refcnt;
+
+ /* Doesn't change once created. */
+@@ -40,7 +43,7 @@
+ {
+ struct object *i;
+
+- list_for_each_entry(i, &amp;cache, list) {
++ list_for_each_entry_rcu(i, &amp;cache, list) {
+ if (i-&gt;id == id) {
+ i-&gt;popularity++;
+ return i;
+@@ -49,19 +52,25 @@
+ return NULL;
+ }
+
++/* Final discard done once we know no readers are looking. */
++static void cache_delete_rcu(void *arg)
++{
++ object_put(arg);
++}
++
+ /* Must be holding cache_lock */
+ static void __cache_delete(struct object *obj)
+ {
+ BUG_ON(!obj);
+- list_del(&amp;obj-&gt;list);
+- object_put(obj);
++ list_del_rcu(&amp;obj-&gt;list);
+ cache_num--;
++ call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu, obj);
+ }
+
+ /* Must be holding cache_lock */
+ static void __cache_add(struct object *obj)
+ {
+- list_add(&amp;obj-&gt;list, &amp;cache);
++ list_add_rcu(&amp;obj-&gt;list, &amp;cache);
+ if (++cache_num > MAX_CACHE_SIZE) {
+ struct object *i, *outcast = NULL;
+ list_for_each_entry(i, &amp;cache, list) {
+@@ -85,6 +94,7 @@
+ obj-&gt;popularity = 0;
+ atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
+ spin_lock_init(&amp;obj-&gt;lock);
++ INIT_RCU_HEAD(&amp;obj-&gt;rcu);
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+@@ -104,12 +114,11 @@
+ struct object *cache_find(int id)
+ {
+ struct object *obj;
+- unsigned long flags;
+
+- spin_lock_irqsave(&amp;cache_lock, flags);
++ rcu_read_lock();
+ obj = __cache_find(id);
+ if (obj)
+ object_get(obj);
+- spin_unlock_irqrestore(&amp;cache_lock, flags);
++ rcu_read_unlock();
+ return obj;
+ }
+</programlisting>
+
+<para>
+Note that the reader will alter the
+<structfield>popularity</structfield> member in
+<function>__cache_find()</function>, and now it doesn't hold a lock.
+One solution would be to make it an <type>atomic_t</type>, but for
+this usage, we don't really care about races: an approximate result is
+good enough, so I didn't change it.
+</para>
+
+<para>
+The result is that <function>cache_find()</function> requires no
+synchronization with any other functions, so is almost as fast on SMP
+as it would be on UP.
+</para>
+
+<para>
+There is a furthur optimization possible here: remember our original
+cache code, where there were no reference counts and the caller simply
+held the lock whenever using the object? This is still possible: if
+you hold the lock, noone can delete the object, so you don't need to
+get and put the reference count.
+</para>
+
+<para>
+Now, because the 'read lock' in RCU is simply disabling preemption, a
+caller which always has preemption disabled between calling
+<function>cache_find()</function> and
+<function>object_put()</function> does not need to actually get and
+put the reference count: we could expose
+<function>__cache_find()</function> by making it non-static, and
+such callers could simply call that.
+</para>
+<para>
+The benefit here is that the reference count is not written to: the
+object is not altered in any way, which is much faster on SMP
+machines due to caching.
+</para>
+ </sect1>
+
+ <sect1 id="per-cpu">
+ <title>Per-CPU Data</title>
+
+ <para>
+ Another technique for avoiding locking which is used fairly
+ widely is to duplicate information for each CPU. For example,
+ if you wanted to keep a count of a common condition, you could
+ use a spin lock and a single counter. Nice and simple.
+ </para>
+
+ <para>
+ If that was too slow (it's usually not, but if you've got a
+ really big machine to test on and can show that it is), you
+ could instead use a counter for each CPU, then none of them need
+ an exclusive lock. See <function>DEFINE_PER_CPU()</function>,
+ <function>get_cpu_var()</function> and
+ <function>put_cpu_var()</function>
+ (<filename class="headerfile">include/linux/percpu.h</filename>).
+ </para>
+
+ <para>
+ Of particular use for simple per-cpu counters is the
+ <type>local_t</type> type, and the
+ <function>cpu_local_inc()</function> and related functions,
+ which are more efficient than simple code on some architectures
+ (<filename class="headerfile">include/asm/local.h</filename>).
+ </para>
+
+ <para>
+ Note that there is no simple, reliable way of getting an exact
+ value of such a counter, without introducing more locks. This
+ is not a problem for some uses.
+ </para>
+ </sect1>
+
+ <sect1 id="mostly-hardirq">
+ <title>Data Which Mostly Used By An IRQ Handler</title>
+
+ <para>
+ If data is always accessed from within the same IRQ handler, you
+ don't need a lock at all: the kernel already guarantees that the
+ irq handler will not run simultaneously on multiple CPUs.
+ </para>
+ <para>
+ Manfred Spraul points out that you can still do this, even if
+ the data is very occasionally accessed in user context or
+ softirqs/tasklets. The irq handler doesn't use a lock, and
+ all other accesses are done as so:
+ </para>
+
+<programlisting>
+ spin_lock(&amp;lock);
+ disable_irq(irq);
+ ...
+ enable_irq(irq);
+ spin_unlock(&amp;lock);
+</programlisting>
+ <para>
+ The <function>disable_irq()</function> prevents the irq handler
+ from running (and waits for it to finish if it's currently
+ running on other CPUs). The spinlock prevents any other
+ accesses happening at the same time. Naturally, this is slower
+ than just a <function>spin_lock_irq()</function> call, so it
+ only makes sense if this type of access happens extremely
+ rarely.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="sleeping-things">
+ <title>What Functions Are Safe To Call From Interrupts?</title>
+
+ <para>
+ Many functions in the kernel sleep (ie. call schedule())
+ directly or indirectly: you can never call them while holding a
+ spinlock, or with preemption disabled. This also means you need
+ to be in user context: calling them from an interrupt is illegal.
+ </para>
+
+ <sect1 id="sleeping">
+ <title>Some Functions Which Sleep</title>
+
+ <para>
+ The most common ones are listed below, but you usually have to
+ read the code to find out if other calls are safe. If everyone
+ else who calls it can sleep, you probably need to be able to
+ sleep, too. In particular, registration and deregistration
+ functions usually expect to be called from user context, and can
+ sleep.
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ Accesses to
+ <firstterm linkend="gloss-userspace">userspace</firstterm>:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ <function>copy_from_user()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>copy_to_user()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>get_user()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>put_user()</function>
+ </para>
+ </listitem>
+ </itemizedlist>
+ </listitem>
+
+ <listitem>
+ <para>
+ <function>kmalloc(GFP_KERNEL)</function>
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <function>mutex_lock_interruptible()</function> and
+ <function>mutex_lock()</function>
+ </para>
+ <para>
+ There is a <function>mutex_trylock()</function> which can be
+ used inside interrupt context, as it will not sleep.
+ <function>mutex_unlock()</function> will also never sleep.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1 id="dont-sleep">
+ <title>Some Functions Which Don't Sleep</title>
+
+ <para>
+ Some functions are safe to call from any context, or holding
+ almost any lock.
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ <function>printk()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>kfree()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>add_timer()</function> and <function>del_timer()</function>
+ </para>
+ </listitem>
+ </itemizedlist>
+ </sect1>
+ </chapter>
+
+ <chapter id="references">
+ <title>Further reading</title>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ <filename>Documentation/spinlocks.txt</filename>:
+ Linus Torvalds' spinlocking tutorial in the kernel sources.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Unix Systems for Modern Architectures: Symmetric
+ Multiprocessing and Caching for Kernel Programmers:
+ </para>
+
+ <para>
+ Curt Schimmel's very good introduction to kernel level
+ locking (not written for Linux, but nearly everything
+ applies). The book is expensive, but really worth every
+ penny to understand SMP locking. [ISBN: 0201633388]
+ </para>
+ </listitem>
+ </itemizedlist>
+ </chapter>
+
+ <chapter id="thanks">
+ <title>Thanks</title>
+
+ <para>
+ Thanks to Telsa Gwynne for DocBooking, neatening and adding
+ style.
+ </para>
+
+ <para>
+ Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul
+ Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim
+ Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney,
+ John Ashby for proofreading, correcting, flaming, commenting.
+ </para>
+
+ <para>
+ Thanks to the cabal for having no influence on this document.
+ </para>
+ </chapter>
+
+ <glossary id="glossary">
+ <title>Glossary</title>
+
+ <glossentry id="gloss-preemption">
+ <glossterm>preemption</glossterm>
+ <glossdef>
+ <para>
+ Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is
+ unset, processes in user context inside the kernel would not
+ preempt each other (ie. you had that CPU until you gave it up,
+ except for interrupts). With the addition of
+ <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when
+ in user context, higher priority tasks can "cut in": spinlocks
+ were changed to disable preemption, even on UP.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-bh">
+ <glossterm>bh</glossterm>
+ <glossdef>
+ <para>
+ Bottom Half: for historical reasons, functions with
+ '_bh' in them often now refer to any software interrupt, e.g.
+ <function>spin_lock_bh()</function> blocks any software interrupt
+ on the current CPU. Bottom halves are deprecated, and will
+ eventually be replaced by tasklets. Only one bottom half will be
+ running at any time.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-hwinterrupt">
+ <glossterm>Hardware Interrupt / Hardware IRQ</glossterm>
+ <glossdef>
+ <para>
+ Hardware interrupt request. <function>in_irq()</function> returns
+ <returnvalue>true</returnvalue> in a hardware interrupt handler.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-interruptcontext">
+ <glossterm>Interrupt Context</glossterm>
+ <glossdef>
+ <para>
+ Not user context: processing a hardware irq or software irq.
+ Indicated by the <function>in_interrupt()</function> macro
+ returning <returnvalue>true</returnvalue>.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-smp">
+ <glossterm><acronym>SMP</acronym></glossterm>
+ <glossdef>
+ <para>
+ Symmetric Multi-Processor: kernels compiled for multiple-CPU
+ machines. (CONFIG_SMP=y).
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-softirq">
+ <glossterm>Software Interrupt / softirq</glossterm>
+ <glossdef>
+ <para>
+ Software interrupt handler. <function>in_irq()</function> returns
+ <returnvalue>false</returnvalue>; <function>in_softirq()</function>
+ returns <returnvalue>true</returnvalue>. Tasklets and softirqs
+ both fall into the category of 'software interrupts'.
+ </para>
+ <para>
+ Strictly speaking a softirq is one of up to 32 enumerated software
+ interrupts which can run on multiple CPUs at once.
+ Sometimes used to refer to tasklets as
+ well (ie. all software interrupts).
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-tasklet">
+ <glossterm>tasklet</glossterm>
+ <glossdef>
+ <para>
+ A dynamically-registrable software interrupt,
+ which is guaranteed to only run on one CPU at a time.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-timers">
+ <glossterm>timer</glossterm>
+ <glossdef>
+ <para>
+ A dynamically-registrable software interrupt, which is run at
+ (or close to) a given time. When running, it is just like a
+ tasklet (in fact, they are called from the TIMER_SOFTIRQ).
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-up">
+ <glossterm><acronym>UP</acronym></glossterm>
+ <glossdef>
+ <para>
+ Uni-Processor: Non-SMP. (CONFIG_SMP=n).
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-usercontext">
+ <glossterm>User Context</glossterm>
+ <glossdef>
+ <para>
+ The kernel executing on behalf of a particular process (ie. a
+ system call or trap) or kernel thread. You can tell which
+ process with the <symbol>current</symbol> macro.) Not to
+ be confused with userspace. Can be interrupted by software or
+ hardware interrupts.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-userspace">
+ <glossterm>Userspace</glossterm>
+ <glossdef>
+ <para>
+ A process executing its own code outside the kernel.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ </glossary>
+</book>
+
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
new file mode 100644
index 0000000..372dec2
--- /dev/null
+++ b/Documentation/DocBook/kgdb.tmpl
@@ -0,0 +1,459 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="kgdbOnLinux">
+ <bookinfo>
+ <title>Using kgdb and the kgdb Internals</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Jason</firstname>
+ <surname>Wessel</surname>
+ <affiliation>
+ <address>
+ <email>jason.wessel@windriver.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <authorgroup>
+ <author>
+ <firstname>Tom</firstname>
+ <surname>Rini</surname>
+ <affiliation>
+ <address>
+ <email>trini@kernel.crashing.org</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <authorgroup>
+ <author>
+ <firstname>Amit S.</firstname>
+ <surname>Kale</surname>
+ <affiliation>
+ <address>
+ <email>amitkale@linsyssoft.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2008</year>
+ <holder>Wind River Systems, Inc.</holder>
+ </copyright>
+ <copyright>
+ <year>2004-2005</year>
+ <holder>MontaVista Software, Inc.</holder>
+ </copyright>
+ <copyright>
+ <year>2004</year>
+ <holder>Amit S. Kale</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This file is licensed under the terms of the GNU General Public License
+ version 2. This program is licensed "as is" without any warranty of any
+ kind, whether express or implied.
+ </para>
+
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+ <chapter id="Introduction">
+ <title>Introduction</title>
+ <para>
+ kgdb is a source level debugger for linux kernel. It is used along
+ with gdb to debug a linux kernel. The expectation is that gdb can
+ be used to "break in" to the kernel to inspect memory, variables
+ and look through call stack information similar to what an
+ application developer would use gdb for. It is possible to place
+ breakpoints in kernel code and perform some limited execution
+ stepping.
+ </para>
+ <para>
+ Two machines are required for using kgdb. One of these machines is a
+ development machine and the other is a test machine. The kernel
+ to be debugged runs on the test machine. The development machine
+ runs an instance of gdb against the vmlinux file which contains
+ the symbols (not boot image such as bzImage, zImage, uImage...).
+ In gdb the developer specifies the connection parameters and
+ connects to kgdb. The type of connection a developer makes with
+ gdb depends on the availability of kgdb I/O modules compiled as
+ builtin's or kernel modules in the test machine's kernel.
+ </para>
+ </chapter>
+ <chapter id="CompilingAKernel">
+ <title>Compiling a kernel</title>
+ <para>
+ To enable <symbol>CONFIG_KGDB</symbol> you should first turn on
+ "Prompt for development and/or incomplete code/drivers"
+ (CONFIG_EXPERIMENTAL) in "General setup", then under the
+ "Kernel debugging" select "KGDB: kernel debugging with remote gdb".
+ </para>
+ <para>
+ It is advised, but not required that you turn on the
+ CONFIG_FRAME_POINTER kernel option. This option inserts code to
+ into the compiled executable which saves the frame information in
+ registers or on the stack at different points which will allow a
+ debugger such as gdb to more accurately construct stack back traces
+ while debugging the kernel.
+ </para>
+ <para>
+ If the architecture that you are using supports the kernel option
+ CONFIG_DEBUG_RODATA, you should consider turning it off. This
+ option will prevent the use of software breakpoints because it
+ marks certain regions of the kernel's memory space as read-only.
+ If kgdb supports it for the architecture you are using, you can
+ use hardware breakpoints if you desire to run with the
+ CONFIG_DEBUG_RODATA option turned on, else you need to turn off
+ this option.
+ </para>
+ <para>
+ Next you should choose one of more I/O drivers to interconnect debugging
+ host and debugged target. Early boot debugging requires a KGDB
+ I/O driver that supports early debugging and the driver must be
+ built into the kernel directly. Kgdb I/O driver configuration
+ takes place via kernel or module parameters, see following
+ chapter.
+ </para>
+ <para>
+ The kgdb test compile options are described in the kgdb test suite chapter.
+ </para>
+
+ </chapter>
+ <chapter id="EnableKGDB">
+ <title>Enable kgdb for debugging</title>
+ <para>
+ In order to use kgdb you must activate it by passing configuration
+ information to one of the kgdb I/O drivers. If you do not pass any
+ configuration information kgdb will not do anything at all. Kgdb
+ will only actively hook up to the kernel trap hooks if a kgdb I/O
+ driver is loaded and configured. If you unconfigure a kgdb I/O
+ driver, kgdb will unregister all the kernel hook points.
+ </para>
+ <para>
+ All drivers can be reconfigured at run time, if
+ <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol>
+ are enabled, by echo'ing a new config string to
+ <constant>/sys/module/&lt;driver&gt;/parameter/&lt;option&gt;</constant>.
+ The driver can be unconfigured by passing an empty string. You cannot
+ change the configuration while the debugger is attached. Make sure
+ to detach the debugger with the <constant>detach</constant> command
+ prior to trying unconfigure a kgdb I/O driver.
+ </para>
+ <sect1 id="kgdbwait">
+ <title>Kernel parameter: kgdbwait</title>
+ <para>
+ The Kernel command line option <constant>kgdbwait</constant> makes
+ kgdb wait for a debugger connection during booting of a kernel. You
+ can only use this option you compiled a kgdb I/O driver into the
+ kernel and you specified the I/O driver configuration as a kernel
+ command line option. The kgdbwait parameter should always follow the
+ configuration parameter for the kgdb I/O driver in the kernel
+ command line else the I/O driver will not be configured prior to
+ asking the kernel to use it to wait.
+ </para>
+ <para>
+ The kernel will stop and wait as early as the I/O driver and
+ architecture will allow when you use this option. If you build the
+ kgdb I/O driver as a kernel module kgdbwait will not do anything.
+ </para>
+ </sect1>
+ <sect1 id="kgdboc">
+ <title>Kernel parameter: kgdboc</title>
+ <para>
+ The kgdboc driver was originally an abbreviation meant to stand for
+ "kgdb over console". Kgdboc is designed to work with a single
+ serial port. It was meant to cover the circumstance
+ where you wanted to use a serial console as your primary console as
+ well as using it to perform kernel debugging. Of course you can
+ also use kgdboc without assigning a console to the same port.
+ </para>
+ <sect2 id="UsingKgdboc">
+ <title>Using kgdboc</title>
+ <para>
+ You can configure kgdboc via sysfs or a module or kernel boot line
+ parameter depending on if you build with CONFIG_KGDBOC as a module
+ or built-in.
+ <orderedlist>
+ <listitem><para>From the module load or build-in</para>
+ <para><constant>kgdboc=&lt;tty-device&gt;,[baud]</constant></para>
+ <para>
+ The example here would be if your console port was typically ttyS0, you would use something like <constant>kgdboc=ttyS0,115200</constant> or on the ARM Versatile AB you would likely use <constant>kgdboc=ttyAMA0,115200</constant>
+ </para>
+ </listitem>
+ <listitem><para>From sysfs</para>
+ <para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para>
+ </listitem>
+ </orderedlist>
+ </para>
+ <para>
+ NOTE: Kgdboc does not support interrupting the target via the
+ gdb remote protocol. You must manually send a sysrq-g unless you
+ have a proxy that splits console output to a terminal problem and
+ has a separate port for the debugger to connect to that sends the
+ sysrq-g for you.
+ </para>
+ <para>When using kgdboc with no debugger proxy, you can end up
+ connecting the debugger for one of two entry points. If an
+ exception occurs after you have loaded kgdboc a message should print
+ on the console stating it is waiting for the debugger. In case you
+ disconnect your terminal program and then connect the debugger in
+ its place. If you want to interrupt the target system and forcibly
+ enter a debug session you have to issue a Sysrq sequence and then
+ type the letter <constant>g</constant>. Then you disconnect the
+ terminal session and connect gdb. Your options if you don't like
+ this are to hack gdb to send the sysrq-g for you as well as on the
+ initial connect, or to use a debugger proxy that allows an
+ unmodified gdb to do the debugging.
+ </para>
+ </sect2>
+ </sect1>
+ <sect1 id="kgdbcon">
+ <title>Kernel parameter: kgdbcon</title>
+ <para>
+ Kgdb supports using the gdb serial protocol to send console messages
+ to the debugger when the debugger is connected and running. There
+ are two ways to activate this feature.
+ <orderedlist>
+ <listitem><para>Activate with the kernel command line option:</para>
+ <para><constant>kgdbcon</constant></para>
+ </listitem>
+ <listitem><para>Use sysfs before configuring an io driver</para>
+ <para>
+ <constant>echo 1 &gt; /sys/module/kgdb/parameters/kgdb_use_con</constant>
+ </para>
+ <para>
+ NOTE: If you do this after you configure the kgdb I/O driver, the
+ setting will not take effect until the next point the I/O is
+ reconfigured.
+ </para>
+ </listitem>
+ </orderedlist>
+ </para>
+ <para>
+ IMPORTANT NOTE: Using this option with kgdb over the console
+ (kgdboc) is not supported.
+ </para>
+ </sect1>
+ </chapter>
+ <chapter id="ConnectingGDB">
+ <title>Connecting gdb</title>
+ <para>
+ If you are using kgdboc, you need to have used kgdbwait as a boot
+ argument, issued a sysrq-g, or the system you are going to debug
+ has already taken an exception and is waiting for the debugger to
+ attach before you can connect gdb.
+ </para>
+ <para>
+ If you are not using different kgdb I/O driver other than kgdboc,
+ you should be able to connect and the target will automatically
+ respond.
+ </para>
+ <para>
+ Example (using a serial port):
+ </para>
+ <programlisting>
+ % gdb ./vmlinux
+ (gdb) set remotebaud 115200
+ (gdb) target remote /dev/ttyS0
+ </programlisting>
+ <para>
+ Example (kgdb to a terminal server on tcp port 2012):
+ </para>
+ <programlisting>
+ % gdb ./vmlinux
+ (gdb) target remote 192.168.2.2:2012
+ </programlisting>
+ <para>
+ Once connected, you can debug a kernel the way you would debug an
+ application program.
+ </para>
+ <para>
+ If you are having problems connecting or something is going
+ seriously wrong while debugging, it will most often be the case
+ that you want to enable gdb to be verbose about its target
+ communications. You do this prior to issuing the <constant>target
+ remote</constant> command by typing in: <constant>set remote debug 1</constant>
+ </para>
+ </chapter>
+ <chapter id="KGDBTestSuite">
+ <title>kgdb Test Suite</title>
+ <para>
+ When kgdb is enabled in the kernel config you can also elect to
+ enable the config parameter KGDB_TESTS. Turning this on will
+ enable a special kgdb I/O module which is designed to test the
+ kgdb internal functions.
+ </para>
+ <para>
+ The kgdb tests are mainly intended for developers to test the kgdb
+ internals as well as a tool for developing a new kgdb architecture
+ specific implementation. These tests are not really for end users
+ of the Linux kernel. The primary source of documentation would be
+ to look in the drivers/misc/kgdbts.c file.
+ </para>
+ <para>
+ The kgdb test suite can also be configured at compile time to run
+ the core set of tests by setting the kernel config parameter
+ KGDB_TESTS_ON_BOOT. This particular option is aimed at automated
+ regression testing and does not require modifying the kernel boot
+ config arguments. If this is turned on, the kgdb test suite can
+ be disabled by specifying "kgdbts=" as a kernel boot argument.
+ </para>
+ </chapter>
+ <chapter id="CommonBackEndReq">
+ <title>KGDB Internals</title>
+ <sect1 id="kgdbArchitecture">
+ <title>Architecture Specifics</title>
+ <para>
+ Kgdb is organized into three basic components:
+ <orderedlist>
+ <listitem><para>kgdb core</para>
+ <para>
+ The kgdb core is found in kernel/kgdb.c. It contains:
+ <itemizedlist>
+ <listitem><para>All the logic to implement the gdb serial protocol</para></listitem>
+ <listitem><para>A generic OS exception handler which includes sync'ing the processors into a stopped state on an multi cpu system.</para></listitem>
+ <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem>
+ <listitem><para>The API to make calls to the arch specific kgdb implementation</para></listitem>
+ <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem>
+ <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem>
+ </itemizedlist>
+ </para>
+ </listitem>
+ <listitem><para>kgdb arch specific implementation</para>
+ <para>
+ This implementation is generally found in arch/*/kernel/kgdb.c.
+ As an example, arch/x86/kernel/kgdb.c contains the specifics to
+ implement HW breakpoint as well as the initialization to
+ dynamically register and unregister for the trap handlers on
+ this architecture. The arch specific portion implements:
+ <itemizedlist>
+ <listitem><para>contains an arch specific trap catcher which
+ invokes kgdb_handle_exception() to start kgdb about doing its
+ work</para></listitem>
+ <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem>
+ <listitem><para>Registration and unregistration of architecture specific trap hooks</para></listitem>
+ <listitem><para>Any special exception handling and cleanup</para></listitem>
+ <listitem><para>NMI exception handling and cleanup</para></listitem>
+ <listitem><para>(optional)HW breakpoints</para></listitem>
+ </itemizedlist>
+ </para>
+ </listitem>
+ <listitem><para>kgdb I/O driver</para>
+ <para>
+ Each kgdb I/O driver has to provide an implemenation for the following:
+ <itemizedlist>
+ <listitem><para>configuration via builtin or module</para></listitem>
+ <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem>
+ <listitem><para>read and write character interface</para></listitem>
+ <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem>
+ <listitem><para>(optional) Early debug methodology</para></listitem>
+ </itemizedlist>
+ Any given kgdb I/O driver has to operate very closely with the
+ hardware and must do it in such a way that does not enable
+ interrupts or change other parts of the system context without
+ completely restoring them. The kgdb core will repeatedly "poll"
+ a kgdb I/O driver for characters when it needs input. The I/O
+ driver is expected to return immediately if there is no data
+ available. Doing so allows for the future possibility to touch
+ watch dog hardware in such a way as to have a target system not
+ reset when these are enabled.
+ </para>
+ </listitem>
+ </orderedlist>
+ </para>
+ <para>
+ If you are intent on adding kgdb architecture specific support
+ for a new architecture, the architecture should define
+ <constant>HAVE_ARCH_KGDB</constant> in the architecture specific
+ Kconfig file. This will enable kgdb for the architecture, and
+ at that point you must create an architecture specific kgdb
+ implementation.
+ </para>
+ <para>
+ There are a few flags which must be set on every architecture in
+ their &lt;asm/kgdb.h&gt; file. These are:
+ <itemizedlist>
+ <listitem>
+ <para>
+ NUMREGBYTES: The size in bytes of all of the registers, so
+ that we can ensure they will all fit into a packet.
+ </para>
+ <para>
+ BUFMAX: The size in bytes of the buffer GDB will read into.
+ This must be larger than NUMREGBYTES.
+ </para>
+ <para>
+ CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call
+ flush_cache_range or flush_icache_range. On some architectures,
+ these functions may not be safe to call on SMP since we keep other
+ CPUs in a holding pattern.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ There are also the following functions for the common backend,
+ found in kernel/kgdb.c, that must be supplied by the
+ architecture-specific backend unless marked as (optional), in
+ which case a default function maybe used if the architecture
+ does not need to provide a specific implementation.
+ </para>
+!Iinclude/linux/kgdb.h
+ </sect1>
+ <sect1 id="kgdbocDesign">
+ <title>kgdboc internals</title>
+ <para>
+ The kgdboc driver is actually a very thin driver that relies on the
+ underlying low level to the hardware driver having "polling hooks"
+ which the to which the tty driver is attached. In the initial
+ implementation of kgdboc it the serial_core was changed to expose a
+ low level uart hook for doing polled mode reading and writing of a
+ single character while in an atomic context. When kgdb makes an I/O
+ request to the debugger, kgdboc invokes a call back in the serial
+ core which in turn uses the call back in the uart driver. It is
+ certainly possible to extend kgdboc to work with non-uart based
+ consoles in the future.
+ </para>
+ <para>
+ When using kgdboc with a uart, the uart driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting>
+#ifdef CONFIG_CONSOLE_POLL
+ .poll_get_char = serial8250_get_poll_char,
+ .poll_put_char = serial8250_put_poll_char,
+#endif
+ </programlisting>
+ Any implementation specifics around creating a polling driver use the
+ <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above.
+ Keep in mind that polling hooks have to be implemented in such a way
+ that they can be called from an atomic context and have to restore
+ the state of the uart chip on return such that the system can return
+ to normal when the debugger detaches. You need to be very careful
+ with any kind of lock you consider, because failing here is most
+ going to mean pressing the reset button.
+ </para>
+ </sect1>
+ </chapter>
+ <chapter id="credits">
+ <title>Credits</title>
+ <para>
+ The following people have contributed to this document:
+ <orderedlist>
+ <listitem><para>Amit Kale<email>amitkale@linsyssoft.com</email></para></listitem>
+ <listitem><para>Tom Rini<email>trini@kernel.crashing.org</email></para></listitem>
+ </orderedlist>
+ In March 2008 this document was completely rewritten by:
+ <itemizedlist>
+ <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
+ </itemizedlist>
+ </para>
+ </chapter>
+</book>
+
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
new file mode 100644
index 0000000..ba99757
--- /dev/null
+++ b/Documentation/DocBook/libata.tmpl
@@ -0,0 +1,1632 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="libataDevGuide">
+ <bookinfo>
+ <title>libATA Developer's Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Jeff</firstname>
+ <surname>Garzik</surname>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2003-2006</year>
+ <holder>Jeff Garzik</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ The contents of this file are subject to the Open
+ Software License version 1.1 that can be found at
+ <ulink url="http://www.opensource.org/licenses/osl-1.1.txt">http://www.opensource.org/licenses/osl-1.1.txt</ulink> and is included herein
+ by reference.
+ </para>
+
+ <para>
+ Alternatively, the contents of this file may be used under the terms
+ of the GNU General Public License version 2 (the "GPL") as distributed
+ in the kernel source COPYING file, in which case the provisions of
+ the GPL are applicable instead of the above. If you wish to allow
+ the use of your version of this file only under the terms of the
+ GPL and not to allow others to use your version of this file under
+ the OSL, indicate your decision by deleting the provisions above and
+ replace them with the notice and other provisions required by the GPL.
+ If you do not delete the provisions above, a recipient may use your
+ version of this file under either the OSL or the GPL.
+ </para>
+
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="libataIntroduction">
+ <title>Introduction</title>
+ <para>
+ libATA is a library used inside the Linux kernel to support ATA host
+ controllers and devices. libATA provides an ATA driver API, class
+ transports for ATA and ATAPI devices, and SCSI&lt;-&gt;ATA translation
+ for ATA devices according to the T10 SAT specification.
+ </para>
+ <para>
+ This Guide documents the libATA driver API, library functions, library
+ internals, and a couple sample ATA low-level drivers.
+ </para>
+ </chapter>
+
+ <chapter id="libataDriverApi">
+ <title>libata Driver API</title>
+ <para>
+ struct ata_port_operations is defined for every low-level libata
+ hardware driver, and it controls how the low-level driver
+ interfaces with the ATA and SCSI layers.
+ </para>
+ <para>
+ FIS-based drivers will hook into the system with ->qc_prep() and
+ ->qc_issue() high-level hooks. Hardware which behaves in a manner
+ similar to PCI IDE hardware may utilize several generic helpers,
+ defining at a bare minimum the bus I/O addresses of the ATA shadow
+ register blocks.
+ </para>
+ <sect1>
+ <title>struct ata_port_operations</title>
+
+ <sect2><title>Disable ATA port</title>
+ <programlisting>
+void (*port_disable) (struct ata_port *);
+ </programlisting>
+
+ <para>
+ Called from ata_bus_probe() and ata_bus_reset() error paths,
+ as well as when unregistering from the SCSI module (rmmod, hot
+ unplug).
+ This function should do whatever needs to be done to take the
+ port out of use. In most cases, ata_port_disable() can be used
+ as this hook.
+ </para>
+ <para>
+ Called from ata_bus_probe() on a failed probe.
+ Called from ata_bus_reset() on a failed bus reset.
+ Called from ata_scsi_release().
+ </para>
+
+ </sect2>
+
+ <sect2><title>Post-IDENTIFY device configuration</title>
+ <programlisting>
+void (*dev_config) (struct ata_port *, struct ata_device *);
+ </programlisting>
+
+ <para>
+ Called after IDENTIFY [PACKET] DEVICE is issued to each device
+ found. Typically used to apply device-specific fixups prior to
+ issue of SET FEATURES - XFER MODE, and prior to operation.
+ </para>
+ <para>
+ Called by ata_device_add() after ata_dev_identify() determines
+ a device is present.
+ </para>
+ <para>
+ This entry may be specified as NULL in ata_port_operations.
+ </para>
+
+ </sect2>
+
+ <sect2><title>Set PIO/DMA mode</title>
+ <programlisting>
+void (*set_piomode) (struct ata_port *, struct ata_device *);
+void (*set_dmamode) (struct ata_port *, struct ata_device *);
+void (*post_set_mode) (struct ata_port *);
+unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int);
+ </programlisting>
+
+ <para>
+ Hooks called prior to the issue of SET FEATURES - XFER MODE
+ command. The optional ->mode_filter() hook is called when libata
+ has built a mask of the possible modes. This is passed to the
+ ->mode_filter() function which should return a mask of valid modes
+ after filtering those unsuitable due to hardware limits. It is not
+ valid to use this interface to add modes.
+ </para>
+ <para>
+ dev->pio_mode and dev->dma_mode are guaranteed to be valid when
+ ->set_piomode() and when ->set_dmamode() is called. The timings for
+ any other drive sharing the cable will also be valid at this point.
+ That is the library records the decisions for the modes of each
+ drive on a channel before it attempts to set any of them.
+ </para>
+ <para>
+ ->post_set_mode() is
+ called unconditionally, after the SET FEATURES - XFER MODE
+ command completes successfully.
+ </para>
+
+ <para>
+ ->set_piomode() is always called (if present), but
+ ->set_dma_mode() is only called if DMA is possible.
+ </para>
+
+ </sect2>
+
+ <sect2><title>Taskfile read/write</title>
+ <programlisting>
+void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
+void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
+ </programlisting>
+
+ <para>
+ ->tf_load() is called to load the given taskfile into hardware
+ registers / DMA buffers. ->tf_read() is called to read the
+ hardware registers / DMA buffers, to obtain the current set of
+ taskfile register values.
+ Most drivers for taskfile-based hardware (PIO or MMIO) use
+ ata_tf_load() and ata_tf_read() for these hooks.
+ </para>
+
+ </sect2>
+
+ <sect2><title>PIO data read/write</title>
+ <programlisting>
+void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
+ </programlisting>
+
+ <para>
+All bmdma-style drivers must implement this hook. This is the low-level
+operation that actually copies the data bytes during a PIO data
+transfer.
+Typically the driver
+will choose one of ata_pio_data_xfer_noirq(), ata_pio_data_xfer(), or
+ata_mmio_data_xfer().
+ </para>
+
+ </sect2>
+
+ <sect2><title>ATA command execute</title>
+ <programlisting>
+void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
+ </programlisting>
+
+ <para>
+ causes an ATA command, previously loaded with
+ ->tf_load(), to be initiated in hardware.
+ Most drivers for taskfile-based hardware use ata_exec_command()
+ for this hook.
+ </para>
+
+ </sect2>
+
+ <sect2><title>Per-cmd ATAPI DMA capabilities filter</title>
+ <programlisting>
+int (*check_atapi_dma) (struct ata_queued_cmd *qc);
+ </programlisting>
+
+ <para>
+Allow low-level driver to filter ATA PACKET commands, returning a status
+indicating whether or not it is OK to use DMA for the supplied PACKET
+command.
+ </para>
+ <para>
+ This hook may be specified as NULL, in which case libata will
+ assume that atapi dma can be supported.
+ </para>
+
+ </sect2>
+
+ <sect2><title>Read specific ATA shadow registers</title>
+ <programlisting>
+u8 (*check_status)(struct ata_port *ap);
+u8 (*check_altstatus)(struct ata_port *ap);
+ </programlisting>
+
+ <para>
+ Reads the Status/AltStatus ATA shadow register from
+ hardware. On some hardware, reading the Status register has
+ the side effect of clearing the interrupt condition.
+ Most drivers for taskfile-based hardware use
+ ata_check_status() for this hook.
+ </para>
+ <para>
+ Note that because this is called from ata_device_add(), at
+ least a dummy function that clears device interrupts must be
+ provided for all drivers, even if the controller doesn't
+ actually have a taskfile status register.
+ </para>
+
+ </sect2>
+
+ <sect2><title>Select ATA device on bus</title>
+ <programlisting>
+void (*dev_select)(struct ata_port *ap, unsigned int device);
+ </programlisting>
+
+ <para>
+ Issues the low-level hardware command(s) that causes one of N
+ hardware devices to be considered 'selected' (active and
+ available for use) on the ATA bus. This generally has no
+ meaning on FIS-based devices.
+ </para>
+ <para>
+ Most drivers for taskfile-based hardware use
+ ata_std_dev_select() for this hook. Controllers which do not
+ support second drives on a port (such as SATA contollers) will
+ use ata_noop_dev_select().
+ </para>
+
+ </sect2>
+
+ <sect2><title>Private tuning method</title>
+ <programlisting>
+void (*set_mode) (struct ata_port *ap);
+ </programlisting>
+
+ <para>
+ By default libata performs drive and controller tuning in
+ accordance with the ATA timing rules and also applies blacklists
+ and cable limits. Some controllers need special handling and have
+ custom tuning rules, typically raid controllers that use ATA
+ commands but do not actually do drive timing.
+ </para>
+
+ <warning>
+ <para>
+ This hook should not be used to replace the standard controller
+ tuning logic when a controller has quirks. Replacing the default
+ tuning logic in that case would bypass handling for drive and
+ bridge quirks that may be important to data reliability. If a
+ controller needs to filter the mode selection it should use the
+ mode_filter hook instead.
+ </para>
+ </warning>
+
+ </sect2>
+
+ <sect2><title>Control PCI IDE BMDMA engine</title>
+ <programlisting>
+void (*bmdma_setup) (struct ata_queued_cmd *qc);
+void (*bmdma_start) (struct ata_queued_cmd *qc);
+void (*bmdma_stop) (struct ata_port *ap);
+u8 (*bmdma_status) (struct ata_port *ap);
+ </programlisting>
+
+ <para>
+When setting up an IDE BMDMA transaction, these hooks arm
+(->bmdma_setup), fire (->bmdma_start), and halt (->bmdma_stop)
+the hardware's DMA engine. ->bmdma_status is used to read the standard
+PCI IDE DMA Status register.
+ </para>
+
+ <para>
+These hooks are typically either no-ops, or simply not implemented, in
+FIS-based drivers.
+ </para>
+ <para>
+Most legacy IDE drivers use ata_bmdma_setup() for the bmdma_setup()
+hook. ata_bmdma_setup() will write the pointer to the PRD table to
+the IDE PRD Table Address register, enable DMA in the DMA Command
+register, and call exec_command() to begin the transfer.
+ </para>
+ <para>
+Most legacy IDE drivers use ata_bmdma_start() for the bmdma_start()
+hook. ata_bmdma_start() will write the ATA_DMA_START flag to the DMA
+Command register.
+ </para>
+ <para>
+Many legacy IDE drivers use ata_bmdma_stop() for the bmdma_stop()
+hook. ata_bmdma_stop() clears the ATA_DMA_START flag in the DMA
+command register.
+ </para>
+ <para>
+Many legacy IDE drivers use ata_bmdma_status() as the bmdma_status() hook.
+ </para>
+
+ </sect2>
+
+ <sect2><title>High-level taskfile hooks</title>
+ <programlisting>
+void (*qc_prep) (struct ata_queued_cmd *qc);
+int (*qc_issue) (struct ata_queued_cmd *qc);
+ </programlisting>
+
+ <para>
+ Higher-level hooks, these two hooks can potentially supercede
+ several of the above taskfile/DMA engine hooks. ->qc_prep is
+ called after the buffers have been DMA-mapped, and is typically
+ used to populate the hardware's DMA scatter-gather table.
+ Most drivers use the standard ata_qc_prep() helper function, but
+ more advanced drivers roll their own.
+ </para>
+ <para>
+ ->qc_issue is used to make a command active, once the hardware
+ and S/G tables have been prepared. IDE BMDMA drivers use the
+ helper function ata_qc_issue_prot() for taskfile protocol-based
+ dispatch. More advanced drivers implement their own ->qc_issue.
+ </para>
+ <para>
+ ata_qc_issue_prot() calls ->tf_load(), ->bmdma_setup(), and
+ ->bmdma_start() as necessary to initiate a transfer.
+ </para>
+
+ </sect2>
+
+ <sect2><title>Exception and probe handling (EH)</title>
+ <programlisting>
+void (*eng_timeout) (struct ata_port *ap);
+void (*phy_reset) (struct ata_port *ap);
+ </programlisting>
+
+ <para>
+Deprecated. Use ->error_handler() instead.
+ </para>
+
+ <programlisting>
+void (*freeze) (struct ata_port *ap);
+void (*thaw) (struct ata_port *ap);
+ </programlisting>
+
+ <para>
+ata_port_freeze() is called when HSM violations or some other
+condition disrupts normal operation of the port. A frozen port
+is not allowed to perform any operation until the port is
+thawed, which usually follows a successful reset.
+ </para>
+
+ <para>
+The optional ->freeze() callback can be used for freezing the port
+hardware-wise (e.g. mask interrupt and stop DMA engine). If a
+port cannot be frozen hardware-wise, the interrupt handler
+must ack and clear interrupts unconditionally while the port
+is frozen.
+ </para>
+ <para>
+The optional ->thaw() callback is called to perform the opposite of ->freeze():
+prepare the port for normal operation once again. Unmask interrupts,
+start DMA engine, etc.
+ </para>
+
+ <programlisting>
+void (*error_handler) (struct ata_port *ap);
+ </programlisting>
+
+ <para>
+->error_handler() is a driver's hook into probe, hotplug, and recovery
+and other exceptional conditions. The primary responsibility of an
+implementation is to call ata_do_eh() or ata_bmdma_drive_eh() with a set
+of EH hooks as arguments:
+ </para>
+
+ <para>
+'prereset' hook (may be NULL) is called during an EH reset, before any other actions
+are taken.
+ </para>
+
+ <para>
+'postreset' hook (may be NULL) is called after the EH reset is performed. Based on
+existing conditions, severity of the problem, and hardware capabilities,
+ </para>
+
+ <para>
+Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be
+called to perform the low-level EH reset.
+ </para>
+
+ <programlisting>
+void (*post_internal_cmd) (struct ata_queued_cmd *qc);
+ </programlisting>
+
+ <para>
+Perform any hardware-specific actions necessary to finish processing
+after executing a probe-time or EH-time command via ata_exec_internal().
+ </para>
+
+ </sect2>
+
+ <sect2><title>Hardware interrupt handling</title>
+ <programlisting>
+irqreturn_t (*irq_handler)(int, void *, struct pt_regs *);
+void (*irq_clear) (struct ata_port *);
+ </programlisting>
+
+ <para>
+ ->irq_handler is the interrupt handling routine registered with
+ the system, by libata. ->irq_clear is called during probe just
+ before the interrupt handler is registered, to be sure hardware
+ is quiet.
+ </para>
+ <para>
+ The second argument, dev_instance, should be cast to a pointer
+ to struct ata_host_set.
+ </para>
+ <para>
+ Most legacy IDE drivers use ata_interrupt() for the
+ irq_handler hook, which scans all ports in the host_set,
+ determines which queued command was active (if any), and calls
+ ata_host_intr(ap,qc).
+ </para>
+ <para>
+ Most legacy IDE drivers use ata_bmdma_irq_clear() for the
+ irq_clear() hook, which simply clears the interrupt and error
+ flags in the DMA status register.
+ </para>
+
+ </sect2>
+
+ <sect2><title>SATA phy read/write</title>
+ <programlisting>
+int (*scr_read) (struct ata_port *ap, unsigned int sc_reg,
+ u32 *val);
+int (*scr_write) (struct ata_port *ap, unsigned int sc_reg,
+ u32 val);
+ </programlisting>
+
+ <para>
+ Read and write standard SATA phy registers. Currently only used
+ if ->phy_reset hook called the sata_phy_reset() helper function.
+ sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE.
+ </para>
+
+ </sect2>
+
+ <sect2><title>Init and shutdown</title>
+ <programlisting>
+int (*port_start) (struct ata_port *ap);
+void (*port_stop) (struct ata_port *ap);
+void (*host_stop) (struct ata_host_set *host_set);
+ </programlisting>
+
+ <para>
+ ->port_start() is called just after the data structures for each
+ port are initialized. Typically this is used to alloc per-port
+ DMA buffers / tables / rings, enable DMA engines, and similar
+ tasks. Some drivers also use this entry point as a chance to
+ allocate driver-private memory for ap->private_data.
+ </para>
+ <para>
+ Many drivers use ata_port_start() as this hook or call
+ it from their own port_start() hooks. ata_port_start()
+ allocates space for a legacy IDE PRD table and returns.
+ </para>
+ <para>
+ ->port_stop() is called after ->host_stop(). It's sole function
+ is to release DMA/memory resources, now that they are no longer
+ actively being used. Many drivers also free driver-private
+ data from port at this time.
+ </para>
+ <para>
+ Many drivers use ata_port_stop() as this hook, which frees the
+ PRD table.
+ </para>
+ <para>
+ ->host_stop() is called after all ->port_stop() calls
+have completed. The hook must finalize hardware shutdown, release DMA
+and other resources, etc.
+ This hook may be specified as NULL, in which case it is not called.
+ </para>
+
+ </sect2>
+
+ </sect1>
+ </chapter>
+
+ <chapter id="libataEH">
+ <title>Error handling</title>
+
+ <para>
+ This chapter describes how errors are handled under libata.
+ Readers are advised to read SCSI EH
+ (Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first.
+ </para>
+
+ <sect1><title>Origins of commands</title>
+ <para>
+ In libata, a command is represented with struct ata_queued_cmd
+ or qc. qc's are preallocated during port initialization and
+ repetitively used for command executions. Currently only one
+ qc is allocated per port but yet-to-be-merged NCQ branch
+ allocates one for each tag and maps each qc to NCQ tag 1-to-1.
+ </para>
+ <para>
+ libata commands can originate from two sources - libata itself
+ and SCSI midlayer. libata internal commands are used for
+ initialization and error handling. All normal blk requests
+ and commands for SCSI emulation are passed as SCSI commands
+ through queuecommand callback of SCSI host template.
+ </para>
+ </sect1>
+
+ <sect1><title>How commands are issued</title>
+
+ <variablelist>
+
+ <varlistentry><term>Internal commands</term>
+ <listitem>
+ <para>
+ First, qc is allocated and initialized using
+ ata_qc_new_init(). Although ata_qc_new_init() doesn't
+ implement any wait or retry mechanism when qc is not
+ available, internal commands are currently issued only during
+ initialization and error recovery, so no other command is
+ active and allocation is guaranteed to succeed.
+ </para>
+ <para>
+ Once allocated qc's taskfile is initialized for the command to
+ be executed. qc currently has two mechanisms to notify
+ completion. One is via qc->complete_fn() callback and the
+ other is completion qc->waiting. qc->complete_fn() callback
+ is the asynchronous path used by normal SCSI translated
+ commands and qc->waiting is the synchronous (issuer sleeps in
+ process context) path used by internal commands.
+ </para>
+ <para>
+ Once initialization is complete, host_set lock is acquired
+ and the qc is issued.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>SCSI commands</term>
+ <listitem>
+ <para>
+ All libata drivers use ata_scsi_queuecmd() as
+ hostt->queuecommand callback. scmds can either be simulated
+ or translated. No qc is involved in processing a simulated
+ scmd. The result is computed right away and the scmd is
+ completed.
+ </para>
+ <para>
+ For a translated scmd, ata_qc_new_init() is invoked to
+ allocate a qc and the scmd is translated into the qc. SCSI
+ midlayer's completion notification function pointer is stored
+ into qc->scsidone.
+ </para>
+ <para>
+ qc->complete_fn() callback is used for completion
+ notification. ATA commands use ata_scsi_qc_complete() while
+ ATAPI commands use atapi_qc_complete(). Both functions end up
+ calling qc->scsidone to notify upper layer when the qc is
+ finished. After translation is completed, the qc is issued
+ with ata_qc_issue().
+ </para>
+ <para>
+ Note that SCSI midlayer invokes hostt->queuecommand while
+ holding host_set lock, so all above occur while holding
+ host_set lock.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+ </sect1>
+
+ <sect1><title>How commands are processed</title>
+ <para>
+ Depending on which protocol and which controller are used,
+ commands are processed differently. For the purpose of
+ discussion, a controller which uses taskfile interface and all
+ standard callbacks is assumed.
+ </para>
+ <para>
+ Currently 6 ATA command protocols are used. They can be
+ sorted into the following four categories according to how
+ they are processed.
+ </para>
+
+ <variablelist>
+ <varlistentry><term>ATA NO DATA or DMA</term>
+ <listitem>
+ <para>
+ ATA_PROT_NODATA and ATA_PROT_DMA fall into this category.
+ These types of commands don't require any software
+ intervention once issued. Device will raise interrupt on
+ completion.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>ATA PIO</term>
+ <listitem>
+ <para>
+ ATA_PROT_PIO is in this category. libata currently
+ implements PIO with polling. ATA_NIEN bit is set to turn
+ off interrupt and pio_task on ata_wq performs polling and
+ IO.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>ATAPI NODATA or DMA</term>
+ <listitem>
+ <para>
+ ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this
+ category. packet_task is used to poll BSY bit after
+ issuing PACKET command. Once BSY is turned off by the
+ device, packet_task transfers CDB and hands off processing
+ to interrupt handler.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>ATAPI PIO</term>
+ <listitem>
+ <para>
+ ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set
+ and, as in ATAPI NODATA or DMA, packet_task submits cdb.
+ However, after submitting cdb, further processing (data
+ transfer) is handed off to pio_task.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </sect1>
+
+ <sect1><title>How commands are completed</title>
+ <para>
+ Once issued, all qc's are either completed with
+ ata_qc_complete() or time out. For commands which are handled
+ by interrupts, ata_host_intr() invokes ata_qc_complete(), and,
+ for PIO tasks, pio_task invokes ata_qc_complete(). In error
+ cases, packet_task may also complete commands.
+ </para>
+ <para>
+ ata_qc_complete() does the following.
+ </para>
+
+ <orderedlist>
+
+ <listitem>
+ <para>
+ DMA memory is unmapped.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ ATA_QCFLAG_ACTIVE is clared from qc->flags.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ qc->complete_fn() callback is invoked. If the return value of
+ the callback is not zero. Completion is short circuited and
+ ata_qc_complete() returns.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ __ata_qc_complete() is called, which does
+ <orderedlist>
+
+ <listitem>
+ <para>
+ qc->flags is cleared to zero.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ ap->active_tag and qc->tag are poisoned.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ qc->waiting is claread &amp; completed (in that order).
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ qc is deallocated by clearing appropriate bit in ap->qactive.
+ </para>
+ </listitem>
+
+ </orderedlist>
+ </para>
+ </listitem>
+
+ </orderedlist>
+
+ <para>
+ So, it basically notifies upper layer and deallocates qc. One
+ exception is short-circuit path in #3 which is used by
+ atapi_qc_complete().
+ </para>
+ <para>
+ For all non-ATAPI commands, whether it fails or not, almost
+ the same code path is taken and very little error handling
+ takes place. A qc is completed with success status if it
+ succeeded, with failed status otherwise.
+ </para>
+ <para>
+ However, failed ATAPI commands require more handling as
+ REQUEST SENSE is needed to acquire sense data. If an ATAPI
+ command fails, ata_qc_complete() is invoked with error status,
+ which in turn invokes atapi_qc_complete() via
+ qc->complete_fn() callback.
+ </para>
+ <para>
+ This makes atapi_qc_complete() set scmd->result to
+ SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As
+ the sense data is empty but scmd->result is CHECK CONDITION,
+ SCSI midlayer will invoke EH for the scmd, and returning 1
+ makes ata_qc_complete() to return without deallocating the qc.
+ This leads us to ata_scsi_error() with partially completed qc.
+ </para>
+
+ </sect1>
+
+ <sect1><title>ata_scsi_error()</title>
+ <para>
+ ata_scsi_error() is the current transportt->eh_strategy_handler()
+ for libata. As discussed above, this will be entered in two
+ cases - timeout and ATAPI error completion. This function
+ calls low level libata driver's eng_timeout() callback, the
+ standard callback for which is ata_eng_timeout(). It checks
+ if a qc is active and calls ata_qc_timeout() on the qc if so.
+ Actual error handling occurs in ata_qc_timeout().
+ </para>
+ <para>
+ If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and
+ completes the qc. Note that as we're currently in EH, we
+ cannot call scsi_done. As described in SCSI EH doc, a
+ recovered scmd should be either retried with
+ scsi_queue_insert() or finished with scsi_finish_command().
+ Here, we override qc->scsidone with scsi_finish_command() and
+ calls ata_qc_complete().
+ </para>
+ <para>
+ If EH is invoked due to a failed ATAPI qc, the qc here is
+ completed but not deallocated. The purpose of this
+ half-completion is to use the qc as place holder to make EH
+ code reach this place. This is a bit hackish, but it works.
+ </para>
+ <para>
+ Once control reaches here, the qc is deallocated by invoking
+ __ata_qc_complete() explicitly. Then, internal qc for REQUEST
+ SENSE is issued. Once sense data is acquired, scmd is
+ finished by directly invoking scsi_finish_command() on the
+ scmd. Note that as we already have completed and deallocated
+ the qc which was associated with the scmd, we don't need
+ to/cannot call ata_qc_complete() again.
+ </para>
+
+ </sect1>
+
+ <sect1><title>Problems with the current EH</title>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Error representation is too crude. Currently any and all
+ error conditions are represented with ATA STATUS and ERROR
+ registers. Errors which aren't ATA device errors are treated
+ as ATA device errors by setting ATA_ERR bit. Better error
+ descriptor which can properly represent ATA and other
+ errors/exceptions is needed.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ When handling timeouts, no action is taken to make device
+ forget about the timed out command and ready for new commands.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ EH handling via ata_scsi_error() is not properly protected
+ from usual command processing. On EH entrance, the device is
+ not in quiescent state. Timed out commands may succeed or
+ fail any time. pio_task and atapi_task may still be running.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Too weak error recovery. Devices / controllers causing HSM
+ mismatch errors and other errors quite often require reset to
+ return to known state. Also, advanced error handling is
+ necessary to support features like NCQ and hotplug.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ ATA errors are directly handled in the interrupt handler and
+ PIO errors in pio_task. This is problematic for advanced
+ error handling for the following reasons.
+ </para>
+ <para>
+ First, advanced error handling often requires context and
+ internal qc execution.
+ </para>
+ <para>
+ Second, even a simple failure (say, CRC error) needs
+ information gathering and could trigger complex error handling
+ (say, resetting &amp; reconfiguring). Having multiple code
+ paths to gather information, enter EH and trigger actions
+ makes life painful.
+ </para>
+ <para>
+ Third, scattered EH code makes implementing low level drivers
+ difficult. Low level drivers override libata callbacks. If
+ EH is scattered over several places, each affected callbacks
+ should perform its part of error handling. This can be error
+ prone and painful.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+ </sect1>
+ </chapter>
+
+ <chapter id="libataExt">
+ <title>libata Library</title>
+!Edrivers/ata/libata-core.c
+ </chapter>
+
+ <chapter id="libataInt">
+ <title>libata Core Internals</title>
+!Idrivers/ata/libata-core.c
+ </chapter>
+
+ <chapter id="libataScsiInt">
+ <title>libata SCSI translation/emulation</title>
+!Edrivers/ata/libata-scsi.c
+!Idrivers/ata/libata-scsi.c
+ </chapter>
+
+ <chapter id="ataExceptions">
+ <title>ATA errors and exceptions</title>
+
+ <para>
+ This chapter tries to identify what error/exception conditions exist
+ for ATA/ATAPI devices and describe how they should be handled in
+ implementation-neutral way.
+ </para>
+
+ <para>
+ The term 'error' is used to describe conditions where either an
+ explicit error condition is reported from device or a command has
+ timed out.
+ </para>
+
+ <para>
+ The term 'exception' is either used to describe exceptional
+ conditions which are not errors (say, power or hotplug events), or
+ to describe both errors and non-error exceptional conditions. Where
+ explicit distinction between error and exception is necessary, the
+ term 'non-error exception' is used.
+ </para>
+
+ <sect1 id="excat">
+ <title>Exception categories</title>
+ <para>
+ Exceptions are described primarily with respect to legacy
+ taskfile + bus master IDE interface. If a controller provides
+ other better mechanism for error reporting, mapping those into
+ categories described below shouldn't be difficult.
+ </para>
+
+ <para>
+ In the following sections, two recovery actions - reset and
+ reconfiguring transport - are mentioned. These are described
+ further in <xref linkend="exrec"/>.
+ </para>
+
+ <sect2 id="excatHSMviolation">
+ <title>HSM violation</title>
+ <para>
+ This error is indicated when STATUS value doesn't match HSM
+ requirement during issuing or excution any ATA/ATAPI command.
+ </para>
+
+ <itemizedlist>
+ <title>Examples</title>
+
+ <listitem>
+ <para>
+ ATA_STATUS doesn't contain !BSY &amp;&amp; DRDY &amp;&amp; !DRQ while trying
+ to issue a command.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ !BSY &amp;&amp; !DRQ during PIO data transfer.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ DRQ on command completion.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ !BSY &amp;&amp; ERR after CDB tranfer starts but before the
+ last byte of CDB is transferred. ATA/ATAPI standard states
+ that &quot;The device shall not terminate the PACKET command
+ with an error before the last byte of the command packet has
+ been written&quot; in the error outputs description of PACKET
+ command and the state diagram doesn't include such
+ transitions.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ In these cases, HSM is violated and not much information
+ regarding the error can be acquired from STATUS or ERROR
+ register. IOW, this error can be anything - driver bug,
+ faulty device, controller and/or cable.
+ </para>
+
+ <para>
+ As HSM is violated, reset is necessary to restore known state.
+ Reconfiguring transport for lower speed might be helpful too
+ as transmission errors sometimes cause this kind of errors.
+ </para>
+ </sect2>
+
+ <sect2 id="excatDevErr">
+ <title>ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)</title>
+
+ <para>
+ These are errors detected and reported by ATA/ATAPI devices
+ indicating device problems. For this type of errors, STATUS
+ and ERROR register values are valid and describe error
+ condition. Note that some of ATA bus errors are detected by
+ ATA/ATAPI devices and reported using the same mechanism as
+ device errors. Those cases are described later in this
+ section.
+ </para>
+
+ <para>
+ For ATA commands, this type of errors are indicated by !BSY
+ &amp;&amp; ERR during command execution and on completion.
+ </para>
+
+ <para>For ATAPI commands,</para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ !BSY &amp;&amp; ERR &amp;&amp; ABRT right after issuing PACKET
+ indicates that PACKET command is not supported and falls in
+ this category.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ !BSY &amp;&amp; ERR(==CHK) &amp;&amp; !ABRT after the last
+ byte of CDB is transferred indicates CHECK CONDITION and
+ doesn't fall in this category.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ !BSY &amp;&amp; ERR(==CHK) &amp;&amp; ABRT after the last byte
+ of CDB is transferred *probably* indicates CHECK CONDITION and
+ doesn't fall in this category.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ Of errors detected as above, the followings are not ATA/ATAPI
+ device errors but ATA bus errors and should be handled
+ according to <xref linkend="excatATAbusErr"/>.
+ </para>
+
+ <variablelist>
+
+ <varlistentry>
+ <term>CRC error during data transfer</term>
+ <listitem>
+ <para>
+ This is indicated by ICRC bit in the ERROR register and
+ means that corruption occurred during data transfer. Upto
+ ATA/ATAPI-7, the standard specifies that this bit is only
+ applicable to UDMA transfers but ATA/ATAPI-8 draft revision
+ 1f says that the bit may be applicable to multiword DMA and
+ PIO.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>ABRT error during data transfer or on completion</term>
+ <listitem>
+ <para>
+ Upto ATA/ATAPI-7, the standard specifies that ABRT could be
+ set on ICRC errors and on cases where a device is not able
+ to complete a command. Combined with the fact that MWDMA
+ and PIO transfer errors aren't allowed to use ICRC bit upto
+ ATA/ATAPI-7, it seems to imply that ABRT bit alone could
+ indicate tranfer errors.
+ </para>
+ <para>
+ However, ATA/ATAPI-8 draft revision 1f removes the part
+ that ICRC errors can turn on ABRT. So, this is kind of
+ gray area. Some heuristics are needed here.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+
+ <para>
+ ATA/ATAPI device errors can be further categorized as follows.
+ </para>
+
+ <variablelist>
+
+ <varlistentry>
+ <term>Media errors</term>
+ <listitem>
+ <para>
+ This is indicated by UNC bit in the ERROR register. ATA
+ devices reports UNC error only after certain number of
+ retries cannot recover the data, so there's nothing much
+ else to do other than notifying upper layer.
+ </para>
+ <para>
+ READ and WRITE commands report CHS or LBA of the first
+ failed sector but ATA/ATAPI standard specifies that the
+ amount of transferred data on error completion is
+ indeterminate, so we cannot assume that sectors preceding
+ the failed sector have been transferred and thus cannot
+ complete those sectors successfully as SCSI does.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>Media changed / media change requested error</term>
+ <listitem>
+ <para>
+ &lt;&lt;TODO: fill here&gt;&gt;
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>Address error</term>
+ <listitem>
+ <para>
+ This is indicated by IDNF bit in the ERROR register.
+ Report to upper layer.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>Other errors</term>
+ <listitem>
+ <para>
+ This can be invalid command or parameter indicated by ABRT
+ ERROR bit or some other error condition. Note that ABRT
+ bit can indicate a lot of things including ICRC and Address
+ errors. Heuristics needed.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+
+ <para>
+ Depending on commands, not all STATUS/ERROR bits are
+ applicable. These non-applicable bits are marked with
+ &quot;na&quot; in the output descriptions but upto ATA/ATAPI-7
+ no definition of &quot;na&quot; can be found. However,
+ ATA/ATAPI-8 draft revision 1f describes &quot;N/A&quot; as
+ follows.
+ </para>
+
+ <blockquote>
+ <variablelist>
+ <varlistentry><term>3.2.3.3a N/A</term>
+ <listitem>
+ <para>
+ A keyword the indicates a field has no defined value in
+ this standard and should not be checked by the host or
+ device. N/A fields should be cleared to zero.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </blockquote>
+
+ <para>
+ So, it seems reasonable to assume that &quot;na&quot; bits are
+ cleared to zero by devices and thus need no explicit masking.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatATAPIcc">
+ <title>ATAPI device CHECK CONDITION</title>
+
+ <para>
+ ATAPI device CHECK CONDITION error is indicated by set CHK bit
+ (ERR bit) in the STATUS register after the last byte of CDB is
+ transferred for a PACKET command. For this kind of errors,
+ sense data should be acquired to gather information regarding
+ the errors. REQUEST SENSE packet command should be used to
+ acquire sense data.
+ </para>
+
+ <para>
+ Once sense data is acquired, this type of errors can be
+ handled similary to other SCSI errors. Note that sense data
+ may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR
+ &amp;&amp; ASC/ASCQ 47h/00h SCSI PARITY ERROR). In such
+ cases, the error should be considered as an ATA bus error and
+ handled according to <xref linkend="excatATAbusErr"/>.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatNCQerr">
+ <title>ATA device error (NCQ)</title>
+
+ <para>
+ NCQ command error is indicated by cleared BSY and set ERR bit
+ during NCQ command phase (one or more NCQ commands
+ outstanding). Although STATUS and ERROR registers will
+ contain valid values describing the error, READ LOG EXT is
+ required to clear the error condition, determine which command
+ has failed and acquire more information.
+ </para>
+
+ <para>
+ READ LOG EXT Log Page 10h reports which tag has failed and
+ taskfile register values describing the error. With this
+ information the failed command can be handled as a normal ATA
+ command error as in <xref linkend="excatDevErr"/> and all
+ other in-flight commands must be retried. Note that this
+ retry should not be counted - it's likely that commands
+ retried this way would have completed normally if it were not
+ for the failed command.
+ </para>
+
+ <para>
+ Note that ATA bus errors can be reported as ATA device NCQ
+ errors. This should be handled as described in <xref
+ linkend="excatATAbusErr"/>.
+ </para>
+
+ <para>
+ If READ LOG EXT Log Page 10h fails or reports NQ, we're
+ thoroughly screwed. This condition should be treated
+ according to <xref linkend="excatHSMviolation"/>.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatATAbusErr">
+ <title>ATA bus error</title>
+
+ <para>
+ ATA bus error means that data corruption occurred during
+ transmission over ATA bus (SATA or PATA). This type of errors
+ can be indicated by
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ ICRC or ABRT error as described in <xref linkend="excatDevErr"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Controller-specific error completion with error information
+ indicating transmission error.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ On some controllers, command timeout. In this case, there may
+ be a mechanism to determine that the timeout is due to
+ transmission error.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Unknown/random errors, timeouts and all sorts of weirdities.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ As described above, transmission errors can cause wide variety
+ of symptoms ranging from device ICRC error to random device
+ lockup, and, for many cases, there is no way to tell if an
+ error condition is due to transmission error or not;
+ therefore, it's necessary to employ some kind of heuristic
+ when dealing with errors and timeouts. For example,
+ encountering repetitive ABRT errors for known supported
+ command is likely to indicate ATA bus error.
+ </para>
+
+ <para>
+ Once it's determined that ATA bus errors have possibly
+ occurred, lowering ATA bus transmission speed is one of
+ actions which may alleviate the problem. See <xref
+ linkend="exrecReconf"/> for more information.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatPCIbusErr">
+ <title>PCI bus error</title>
+
+ <para>
+ Data corruption or other failures during transmission over PCI
+ (or other system bus). For standard BMDMA, this is indicated
+ by Error bit in the BMDMA Status register. This type of
+ errors must be logged as it indicates something is very wrong
+ with the system. Resetting host controller is recommended.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatLateCompletion">
+ <title>Late completion</title>
+
+ <para>
+ This occurs when timeout occurs and the timeout handler finds
+ out that the timed out command has completed successfully or
+ with error. This is usually caused by lost interrupts. This
+ type of errors must be logged. Resetting host controller is
+ recommended.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatUnknown">
+ <title>Unknown error (timeout)</title>
+
+ <para>
+ This is when timeout occurs and the command is still
+ processing or the host and device are in unknown state. When
+ this occurs, HSM could be in any valid or invalid state. To
+ bring the device to known state and make it forget about the
+ timed out command, resetting is necessary. The timed out
+ command may be retried.
+ </para>
+
+ <para>
+ Timeouts can also be caused by transmission errors. Refer to
+ <xref linkend="excatATAbusErr"/> for more details.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatHoplugPM">
+ <title>Hotplug and power management exceptions</title>
+
+ <para>
+ &lt;&lt;TODO: fill here&gt;&gt;
+ </para>
+
+ </sect2>
+
+ </sect1>
+
+ <sect1 id="exrec">
+ <title>EH recovery actions</title>
+
+ <para>
+ This section discusses several important recovery actions.
+ </para>
+
+ <sect2 id="exrecClr">
+ <title>Clearing error condition</title>
+
+ <para>
+ Many controllers require its error registers to be cleared by
+ error handler. Different controllers may have different
+ requirements.
+ </para>
+
+ <para>
+ For SATA, it's strongly recommended to clear at least SError
+ register during error handling.
+ </para>
+ </sect2>
+
+ <sect2 id="exrecRst">
+ <title>Reset</title>
+
+ <para>
+ During EH, resetting is necessary in the following cases.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ HSM is in unknown or invalid state
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ HBA is in unknown or invalid state
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ EH needs to make HBA/device forget about in-flight commands
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ HBA/device behaves weirdly
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ Resetting during EH might be a good idea regardless of error
+ condition to improve EH robustness. Whether to reset both or
+ either one of HBA and device depends on situation but the
+ following scheme is recommended.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ When it's known that HBA is in ready state but ATA/ATAPI
+ device is in unknown state, reset only device.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If HBA is in unknown state, reset both HBA and device.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ HBA resetting is implementation specific. For a controller
+ complying to taskfile/BMDMA PCI IDE, stopping active DMA
+ transaction may be sufficient iff BMDMA state is the only HBA
+ context. But even mostly taskfile/BMDMA PCI IDE complying
+ controllers may have implementation specific requirements and
+ mechanism to reset themselves. This must be addressed by
+ specific drivers.
+ </para>
+
+ <para>
+ OTOH, ATA/ATAPI standard describes in detail ways to reset
+ ATA/ATAPI devices.
+ </para>
+
+ <variablelist>
+
+ <varlistentry><term>PATA hardware reset</term>
+ <listitem>
+ <para>
+ This is hardware initiated device reset signalled with
+ asserted PATA RESET- signal. There is no standard way to
+ initiate hardware reset from software although some
+ hardware provides registers that allow driver to directly
+ tweak the RESET- signal.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>Software reset</term>
+ <listitem>
+ <para>
+ This is achieved by turning CONTROL SRST bit on for at
+ least 5us. Both PATA and SATA support it but, in case of
+ SATA, this may require controller-specific support as the
+ second Register FIS to clear SRST should be transmitted
+ while BSY bit is still set. Note that on PATA, this resets
+ both master and slave devices on a channel.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>EXECUTE DEVICE DIAGNOSTIC command</term>
+ <listitem>
+ <para>
+ Although ATA/ATAPI standard doesn't describe exactly, EDD
+ implies some level of resetting, possibly similar level
+ with software reset. Host-side EDD protocol can be handled
+ with normal command processing and most SATA controllers
+ should be able to handle EDD's just like other commands.
+ As in software reset, EDD affects both devices on a PATA
+ bus.
+ </para>
+ <para>
+ Although EDD does reset devices, this doesn't suit error
+ handling as EDD cannot be issued while BSY is set and it's
+ unclear how it will act when device is in unknown/weird
+ state.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>ATAPI DEVICE RESET command</term>
+ <listitem>
+ <para>
+ This is very similar to software reset except that reset
+ can be restricted to the selected device without affecting
+ the other device sharing the cable.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>SATA phy reset</term>
+ <listitem>
+ <para>
+ This is the preferred way of resetting a SATA device. In
+ effect, it's identical to PATA hardware reset. Note that
+ this can be done with the standard SCR Control register.
+ As such, it's usually easier to implement than software
+ reset.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+
+ <para>
+ One more thing to consider when resetting devices is that
+ resetting clears certain configuration parameters and they
+ need to be set to their previous or newly adjusted values
+ after reset.
+ </para>
+
+ <para>
+ Parameters affected are.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ CHS set up with INITIALIZE DEVICE PARAMETERS (seldomly used)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Parameters set with SET FEATURES including transfer mode setting
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Block count set with SET MULTIPLE MODE
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Other parameters (SET MAX, MEDIA LOCK...)
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ ATA/ATAPI standard specifies that some parameters must be
+ maintained across hardware or software reset, but doesn't
+ strictly specify all of them. Always reconfiguring needed
+ parameters after reset is required for robustness. Note that
+ this also applies when resuming from deep sleep (power-off).
+ </para>
+
+ <para>
+ Also, ATA/ATAPI standard requires that IDENTIFY DEVICE /
+ IDENTIFY PACKET DEVICE is issued after any configuration
+ parameter is updated or a hardware reset and the result used
+ for further operation. OS driver is required to implement
+ revalidation mechanism to support this.
+ </para>
+
+ </sect2>
+
+ <sect2 id="exrecReconf">
+ <title>Reconfigure transport</title>
+
+ <para>
+ For both PATA and SATA, a lot of corners are cut for cheap
+ connectors, cables or controllers and it's quite common to see
+ high transmission error rate. This can be mitigated by
+ lowering transmission speed.
+ </para>
+
+ <para>
+ The following is a possible scheme Jeff Garzik suggested.
+ </para>
+
+ <blockquote>
+ <para>
+ If more than $N (3?) transmission errors happen in 15 minutes,
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ if SATA, decrease SATA PHY speed. if speed cannot be decreased,
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ decrease UDMA xfer speed. if at UDMA0, switch to PIO4,
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ decrease PIO xfer speed. if at PIO3, complain, but continue
+ </para>
+ </listitem>
+ </itemizedlist>
+ </blockquote>
+
+ </sect2>
+
+ </sect1>
+
+ </chapter>
+
+ <chapter id="PiixInt">
+ <title>ata_piix Internals</title>
+!Idrivers/ata/ata_piix.c
+ </chapter>
+
+ <chapter id="SILInt">
+ <title>sata_sil Internals</title>
+!Idrivers/ata/sata_sil.c
+ </chapter>
+
+ <chapter id="libataThanks">
+ <title>Thanks</title>
+ <para>
+ The bulk of the ATA knowledge comes thanks to long conversations with
+ Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA
+ and SCSI specifications.
+ </para>
+ <para>
+ Thanks to Alan Cox for pointing out similarities
+ between SATA and SCSI, and in general for motivation to hack on
+ libata.
+ </para>
+ <para>
+ libata's device detection
+ method, ata_pio_devchk, and in general all the early probing was
+ based on extensive study of Hale Landis's probe/reset code in his
+ ATADRVR driver (www.ata-atapi.com).
+ </para>
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl
new file mode 100644
index 0000000..94f2136
--- /dev/null
+++ b/Documentation/DocBook/librs.tmpl
@@ -0,0 +1,289 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Reed-Solomon-Library-Guide">
+ <bookinfo>
+ <title>Reed-Solomon Library Programming Interface</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Thomas</firstname>
+ <surname>Gleixner</surname>
+ <affiliation>
+ <address>
+ <email>tglx@linutronix.de</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2004</year>
+ <holder>Thomas Gleixner</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The generic Reed-Solomon Library provides encoding, decoding
+ and error correction functions.
+ </para>
+ <para>
+ Reed-Solomon codes are used in communication and storage
+ applications to ensure data integrity.
+ </para>
+ <para>
+ This documentation is provided for developers who want to utilize
+ the functions provided by the library.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None.
+ </para>
+ </chapter>
+
+ <chapter id="usage">
+ <title>Usage</title>
+ <para>
+ This chapter provides examples of how to use the library.
+ </para>
+ <sect1>
+ <title>Initializing</title>
+ <para>
+ The init function init_rs returns a pointer to an
+ rs decoder structure, which holds the necessary
+ information for encoding, decoding and error correction
+ with the given polynomial. It either uses an existing
+ matching decoder or creates a new one. On creation all
+ the lookup tables for fast en/decoding are created.
+ The function may take a while, so make sure not to
+ call it in critical code paths.
+ </para>
+ <programlisting>
+/* the Reed Solomon control structure */
+static struct rs_control *rs_decoder;
+
+/* Symbolsize is 10 (bits)
+ * Primitive polynomial is x^10+x^3+1
+ * first consecutive root is 0
+ * primitive element to generate roots = 1
+ * generator polynomial degree (number of roots) = 6
+ */
+rs_decoder = init_rs (10, 0x409, 0, 1, 6);
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Encoding</title>
+ <para>
+ The encoder calculates the Reed-Solomon code over
+ the given data length and stores the result in
+ the parity buffer. Note that the parity buffer must
+ be initialized before calling the encoder.
+ </para>
+ <para>
+ The expanded data can be inverted on the fly by
+ providing a non-zero inversion mask. The expanded data is
+ XOR'ed with the mask. This is used e.g. for FLASH
+ ECC, where the all 0xFF is inverted to an all 0x00.
+ The Reed-Solomon code for all 0x00 is all 0x00. The
+ code is inverted before storing to FLASH so it is 0xFF
+ too. This prevents that reading from an erased FLASH
+ results in ECC errors.
+ </para>
+ <para>
+ The databytes are expanded to the given symbol size
+ on the fly. There is no support for encoding continuous
+ bitstreams with a symbol size != 8 at the moment. If
+ it is necessary it should be not a big deal to implement
+ such functionality.
+ </para>
+ <programlisting>
+/* Parity buffer. Size = number of roots */
+uint16_t par[6];
+/* Initialize the parity buffer */
+memset(par, 0, sizeof(par));
+/* Encode 512 byte in data8. Store parity in buffer par */
+encode_rs8 (rs_decoder, data8, 512, par, 0);
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Decoding</title>
+ <para>
+ The decoder calculates the syndrome over
+ the given data length and the received parity symbols
+ and corrects errors in the data.
+ </para>
+ <para>
+ If a syndrome is available from a hardware decoder
+ then the syndrome calculation is skipped.
+ </para>
+ <para>
+ The correction of the data buffer can be suppressed
+ by providing a correction pattern buffer and an error
+ location buffer to the decoder. The decoder stores the
+ calculated error location and the correction bitmask
+ in the given buffers. This is useful for hardware
+ decoders which use a weird bit ordering scheme.
+ </para>
+ <para>
+ The databytes are expanded to the given symbol size
+ on the fly. There is no support for decoding continuous
+ bitstreams with a symbolsize != 8 at the moment. If
+ it is necessary it should be not a big deal to implement
+ such functionality.
+ </para>
+
+ <sect2>
+ <title>
+ Decoding with syndrome calculation, direct data correction
+ </title>
+ <programlisting>
+/* Parity buffer. Size = number of roots */
+uint16_t par[6];
+uint8_t data[512];
+int numerr;
+/* Receive data */
+.....
+/* Receive parity */
+.....
+/* Decode 512 byte in data8.*/
+numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
+ </programlisting>
+ </sect2>
+
+ <sect2>
+ <title>
+ Decoding with syndrome given by hardware decoder, direct data correction
+ </title>
+ <programlisting>
+/* Parity buffer. Size = number of roots */
+uint16_t par[6], syn[6];
+uint8_t data[512];
+int numerr;
+/* Receive data */
+.....
+/* Receive parity */
+.....
+/* Get syndrome from hardware decoder */
+.....
+/* Decode 512 byte in data8.*/
+numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
+ </programlisting>
+ </sect2>
+
+ <sect2>
+ <title>
+ Decoding with syndrome given by hardware decoder, no direct data correction.
+ </title>
+ <para>
+ Note: It's not necessary to give data and received parity to the decoder.
+ </para>
+ <programlisting>
+/* Parity buffer. Size = number of roots */
+uint16_t par[6], syn[6], corr[8];
+uint8_t data[512];
+int numerr, errpos[8];
+/* Receive data */
+.....
+/* Receive parity */
+.....
+/* Get syndrome from hardware decoder */
+.....
+/* Decode 512 byte in data8.*/
+numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
+for (i = 0; i &lt; numerr; i++) {
+ do_error_correction_in_your_buffer(errpos[i], corr[i]);
+}
+ </programlisting>
+ </sect2>
+ </sect1>
+ <sect1>
+ <title>Cleanup</title>
+ <para>
+ The function free_rs frees the allocated resources,
+ if the caller is the last user of the decoder.
+ </para>
+ <programlisting>
+/* Release resources */
+free_rs(rs_decoder);
+ </programlisting>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="structs">
+ <title>Structures</title>
+ <para>
+ This chapter contains the autogenerated documentation of the structures which are
+ used in the Reed-Solomon Library and are relevant for a developer.
+ </para>
+!Iinclude/linux/rslib.h
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+ <para>
+ This chapter contains the autogenerated documentation of the Reed-Solomon functions
+ which are exported.
+ </para>
+!Elib/reed_solomon/reed_solomon.c
+ </chapter>
+
+ <chapter id="credits">
+ <title>Credits</title>
+ <para>
+ The library code for encoding and decoding was written by Phil Karn.
+ </para>
+ <programlisting>
+ Copyright 2002, Phil Karn, KA9Q
+ May be used under the terms of the GNU General Public License (GPL)
+ </programlisting>
+ <para>
+ The wrapper functions and interfaces are written by Thomas Gleixner.
+ </para>
+ <para>
+ Many users have provided bugfixes, improvements and helping hands for testing.
+ Thanks a lot.
+ </para>
+ <para>
+ The following people have contributed to this document:
+ </para>
+ <para>
+ Thomas Gleixner<email>tglx@linutronix.de</email>
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl
new file mode 100644
index 0000000..fe7664c
--- /dev/null
+++ b/Documentation/DocBook/lsm.tmpl
@@ -0,0 +1,265 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<article class="whitepaper" id="LinuxSecurityModule" lang="en">
+ <articleinfo>
+ <title>Linux Security Modules: General Security Hooks for Linux</title>
+ <authorgroup>
+ <author>
+ <firstname>Stephen</firstname>
+ <surname>Smalley</surname>
+ <affiliation>
+ <orgname>NAI Labs</orgname>
+ <address><email>ssmalley@nai.com</email></address>
+ </affiliation>
+ </author>
+ <author>
+ <firstname>Timothy</firstname>
+ <surname>Fraser</surname>
+ <affiliation>
+ <orgname>NAI Labs</orgname>
+ <address><email>tfraser@nai.com</email></address>
+ </affiliation>
+ </author>
+ <author>
+ <firstname>Chris</firstname>
+ <surname>Vance</surname>
+ <affiliation>
+ <orgname>NAI Labs</orgname>
+ <address><email>cvance@nai.com</email></address>
+ </affiliation>
+ </author>
+ </authorgroup>
+ </articleinfo>
+
+<sect1 id="Introduction"><title>Introduction</title>
+
+<para>
+In March 2001, the National Security Agency (NSA) gave a presentation
+about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel
+Summit. SELinux is an implementation of flexible and fine-grained
+nondiscretionary access controls in the Linux kernel, originally
+implemented as its own particular kernel patch. Several other
+security projects (e.g. RSBAC, Medusa) have also developed flexible
+access control architectures for the Linux kernel, and various
+projects have developed particular access control models for Linux
+(e.g. LIDS, DTE, SubDomain). Each project has developed and
+maintained its own kernel patch to support its security needs.
+</para>
+
+<para>
+In response to the NSA presentation, Linus Torvalds made a set of
+remarks that described a security framework he would be willing to
+consider for inclusion in the mainstream Linux kernel. He described a
+general framework that would provide a set of security hooks to
+control operations on kernel objects and a set of opaque security
+fields in kernel data structures for maintaining security attributes.
+This framework could then be used by loadable kernel modules to
+implement any desired model of security. Linus also suggested the
+possibility of migrating the Linux capabilities code into such a
+module.
+</para>
+
+<para>
+The Linux Security Modules (LSM) project was started by WireX to
+develop such a framework. LSM is a joint development effort by
+several security projects, including Immunix, SELinux, SGI and Janus,
+and several individuals, including Greg Kroah-Hartman and James
+Morris, to develop a Linux kernel patch that implements this
+framework. The patch is currently tracking the 2.4 series and is
+targeted for integration into the 2.5 development series. This
+technical report provides an overview of the framework and the example
+capabilities security module provided by the LSM kernel patch.
+</para>
+
+</sect1>
+
+<sect1 id="framework"><title>LSM Framework</title>
+
+<para>
+The LSM kernel patch provides a general kernel framework to support
+security modules. In particular, the LSM framework is primarily
+focused on supporting access control modules, although future
+development is likely to address other security needs such as
+auditing. By itself, the framework does not provide any additional
+security; it merely provides the infrastructure to support security
+modules. The LSM kernel patch also moves most of the capabilities
+logic into an optional security module, with the system defaulting
+to the traditional superuser logic. This capabilities module
+is discussed further in <xref linkend="cap"/>.
+</para>
+
+<para>
+The LSM kernel patch adds security fields to kernel data structures
+and inserts calls to hook functions at critical points in the kernel
+code to manage the security fields and to perform access control. It
+also adds functions for registering and unregistering security
+modules, and adds a general <function>security</function> system call
+to support new system calls for security-aware applications.
+</para>
+
+<para>
+The LSM security fields are simply <type>void*</type> pointers. For
+process and program execution security information, security fields
+were added to <structname>struct task_struct</structname> and
+<structname>struct linux_binprm</structname>. For filesystem security
+information, a security field was added to
+<structname>struct super_block</structname>. For pipe, file, and socket
+security information, security fields were added to
+<structname>struct inode</structname> and
+<structname>struct file</structname>. For packet and network device security
+information, security fields were added to
+<structname>struct sk_buff</structname> and
+<structname>struct net_device</structname>. For System V IPC security
+information, security fields were added to
+<structname>struct kern_ipc_perm</structname> and
+<structname>struct msg_msg</structname>; additionally, the definitions
+for <structname>struct msg_msg</structname>, <structname>struct
+msg_queue</structname>, and <structname>struct
+shmid_kernel</structname> were moved to header files
+(<filename>include/linux/msg.h</filename> and
+<filename>include/linux/shm.h</filename> as appropriate) to allow
+the security modules to use these definitions.
+</para>
+
+<para>
+Each LSM hook is a function pointer in a global table,
+security_ops. This table is a
+<structname>security_operations</structname> structure as defined by
+<filename>include/linux/security.h</filename>. Detailed documentation
+for each hook is included in this header file. At present, this
+structure consists of a collection of substructures that group related
+hooks based on the kernel object (e.g. task, inode, file, sk_buff,
+etc) as well as some top-level hook function pointers for system
+operations. This structure is likely to be flattened in the future
+for performance. The placement of the hook calls in the kernel code
+is described by the "called:" lines in the per-hook documentation in
+the header file. The hook calls can also be easily found in the
+kernel code by looking for the string "security_ops->".
+
+</para>
+
+<para>
+Linus mentioned per-process security hooks in his original remarks as a
+possible alternative to global security hooks. However, if LSM were
+to start from the perspective of per-process hooks, then the base
+framework would have to deal with how to handle operations that
+involve multiple processes (e.g. kill), since each process might have
+its own hook for controlling the operation. This would require a
+general mechanism for composing hooks in the base framework.
+Additionally, LSM would still need global hooks for operations that
+have no process context (e.g. network input operations).
+Consequently, LSM provides global security hooks, but a security
+module is free to implement per-process hooks (where that makes sense)
+by storing a security_ops table in each process' security field and
+then invoking these per-process hooks from the global hooks.
+The problem of composition is thus deferred to the module.
+</para>
+
+<para>
+The global security_ops table is initialized to a set of hook
+functions provided by a dummy security module that provides
+traditional superuser logic. A <function>register_security</function>
+function (in <filename>security/security.c</filename>) is provided to
+allow a security module to set security_ops to refer to its own hook
+functions, and an <function>unregister_security</function> function is
+provided to revert security_ops to the dummy module hooks. This
+mechanism is used to set the primary security module, which is
+responsible for making the final decision for each hook.
+</para>
+
+<para>
+LSM also provides a simple mechanism for stacking additional security
+modules with the primary security module. It defines
+<function>register_security</function> and
+<function>unregister_security</function> hooks in the
+<structname>security_operations</structname> structure and provides
+<function>mod_reg_security</function> and
+<function>mod_unreg_security</function> functions that invoke these
+hooks after performing some sanity checking. A security module can
+call these functions in order to stack with other modules. However,
+the actual details of how this stacking is handled are deferred to the
+module, which can implement these hooks in any way it wishes
+(including always returning an error if it does not wish to support
+stacking). In this manner, LSM again defers the problem of
+composition to the module.
+</para>
+
+<para>
+Although the LSM hooks are organized into substructures based on
+kernel object, all of the hooks can be viewed as falling into two
+major categories: hooks that are used to manage the security fields
+and hooks that are used to perform access control. Examples of the
+first category of hooks include the
+<function>alloc_security</function> and
+<function>free_security</function> hooks defined for each kernel data
+structure that has a security field. These hooks are used to allocate
+and free security structures for kernel objects. The first category
+of hooks also includes hooks that set information in the security
+field after allocation, such as the <function>post_lookup</function>
+hook in <structname>struct inode_security_ops</structname>. This hook
+is used to set security information for inodes after successful lookup
+operations. An example of the second category of hooks is the
+<function>permission</function> hook in
+<structname>struct inode_security_ops</structname>. This hook checks
+permission when accessing an inode.
+</para>
+
+</sect1>
+
+<sect1 id="cap"><title>LSM Capabilities Module</title>
+
+<para>
+The LSM kernel patch moves most of the existing POSIX.1e capabilities
+logic into an optional security module stored in the file
+<filename>security/capability.c</filename>. This change allows
+users who do not want to use capabilities to omit this code entirely
+from their kernel, instead using the dummy module for traditional
+superuser logic or any other module that they desire. This change
+also allows the developers of the capabilities logic to maintain and
+enhance their code more freely, without needing to integrate patches
+back into the base kernel.
+</para>
+
+<para>
+In addition to moving the capabilities logic, the LSM kernel patch
+could move the capability-related fields from the kernel data
+structures into the new security fields managed by the security
+modules. However, at present, the LSM kernel patch leaves the
+capability fields in the kernel data structures. In his original
+remarks, Linus suggested that this might be preferable so that other
+security modules can be easily stacked with the capabilities module
+without needing to chain multiple security structures on the security field.
+It also avoids imposing extra overhead on the capabilities module
+to manage the security fields. However, the LSM framework could
+certainly support such a move if it is determined to be desirable,
+with only a few additional changes described below.
+</para>
+
+<para>
+At present, the capabilities logic for computing process capabilities
+on <function>execve</function> and <function>set*uid</function>,
+checking capabilities for a particular process, saving and checking
+capabilities for netlink messages, and handling the
+<function>capget</function> and <function>capset</function> system
+calls have been moved into the capabilities module. There are still a
+few locations in the base kernel where capability-related fields are
+directly examined or modified, but the current version of the LSM
+patch does allow a security module to completely replace the
+assignment and testing of capabilities. These few locations would
+need to be changed if the capability-related fields were moved into
+the security field. The following is a list of known locations that
+still perform such direct examination or modification of
+capability-related fields:
+<itemizedlist>
+<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem>
+<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem>
+<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem>
+<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem>
+</itemizedlist>
+</para>
+
+</sect1>
+
+</article>
diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl
new file mode 100644
index 0000000..77c3c20
--- /dev/null
+++ b/Documentation/DocBook/mac80211.tmpl
@@ -0,0 +1,331 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="mac80211-developers-guide">
+ <bookinfo>
+ <title>The mac80211 subsystem for kernel developers</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Johannes</firstname>
+ <surname>Berg</surname>
+ <affiliation>
+ <address><email>johannes@sipsolutions.net</email></address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2007</year>
+ <year>2008</year>
+ <holder>Johannes Berg</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This documentation is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this documentation; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+
+ <abstract>
+!Pinclude/net/mac80211.h Introduction
+!Pinclude/net/mac80211.h Warning
+ </abstract>
+ </bookinfo>
+
+ <toc></toc>
+
+<!--
+Generally, this document shall be ordered by increasing complexity.
+It is important to note that readers should be able to read only
+the first few sections to get a working driver and only advanced
+usage should require reading the full document.
+-->
+
+ <part>
+ <title>The basic mac80211 driver interface</title>
+ <partintro>
+ <para>
+ You should read and understand the information contained
+ within this part of the book while implementing a driver.
+ In some chapters, advanced usage is noted, that may be
+ skipped at first.
+ </para>
+ <para>
+ This part of the book only covers station and monitor mode
+ functionality, additional information required to implement
+ the other modes is covered in the second part of the book.
+ </para>
+ </partintro>
+
+ <chapter id="basics">
+ <title>Basic hardware handling</title>
+ <para>TBD</para>
+ <para>
+ This chapter shall contain information on getting a hw
+ struct allocated and registered with mac80211.
+ </para>
+ <para>
+ Since it is required to allocate rates/modes before registering
+ a hw struct, this chapter shall also contain information on setting
+ up the rate/mode structs.
+ </para>
+ <para>
+ Additionally, some discussion about the callbacks and
+ the general programming model should be in here, including
+ the definition of ieee80211_ops which will be referred to
+ a lot.
+ </para>
+ <para>
+ Finally, a discussion of hardware capabilities should be done
+ with references to other parts of the book.
+ </para>
+<!-- intentionally multiple !F lines to get proper order -->
+!Finclude/net/mac80211.h ieee80211_hw
+!Finclude/net/mac80211.h ieee80211_hw_flags
+!Finclude/net/mac80211.h SET_IEEE80211_DEV
+!Finclude/net/mac80211.h SET_IEEE80211_PERM_ADDR
+!Finclude/net/mac80211.h ieee80211_ops
+!Finclude/net/mac80211.h ieee80211_alloc_hw
+!Finclude/net/mac80211.h ieee80211_register_hw
+!Finclude/net/mac80211.h ieee80211_get_tx_led_name
+!Finclude/net/mac80211.h ieee80211_get_rx_led_name
+!Finclude/net/mac80211.h ieee80211_get_assoc_led_name
+!Finclude/net/mac80211.h ieee80211_get_radio_led_name
+!Finclude/net/mac80211.h ieee80211_unregister_hw
+!Finclude/net/mac80211.h ieee80211_free_hw
+ </chapter>
+
+ <chapter id="phy-handling">
+ <title>PHY configuration</title>
+ <para>TBD</para>
+ <para>
+ This chapter should describe PHY handling including
+ start/stop callbacks and the various structures used.
+ </para>
+!Finclude/net/mac80211.h ieee80211_conf
+!Finclude/net/mac80211.h ieee80211_conf_flags
+ </chapter>
+
+ <chapter id="iface-handling">
+ <title>Virtual interfaces</title>
+ <para>TBD</para>
+ <para>
+ This chapter should describe virtual interface basics
+ that are relevant to the driver (VLANs, MGMT etc are not.)
+ It should explain the use of the add_iface/remove_iface
+ callbacks as well as the interface configuration callbacks.
+ </para>
+ <para>Things related to AP mode should be discussed there.</para>
+ <para>
+ Things related to supporting multiple interfaces should be
+ in the appropriate chapter, a BIG FAT note should be here about
+ this though and the recommendation to allow only a single
+ interface in STA mode at first!
+ </para>
+!Finclude/net/mac80211.h ieee80211_if_init_conf
+!Finclude/net/mac80211.h ieee80211_if_conf
+ </chapter>
+
+ <chapter id="rx-tx">
+ <title>Receive and transmit processing</title>
+ <sect1>
+ <title>what should be here</title>
+ <para>TBD</para>
+ <para>
+ This should describe the receive and transmit
+ paths in mac80211/the drivers as well as
+ transmit status handling.
+ </para>
+ </sect1>
+ <sect1>
+ <title>Frame format</title>
+!Pinclude/net/mac80211.h Frame format
+ </sect1>
+ <sect1>
+ <title>Alignment issues</title>
+ <para>TBD</para>
+ </sect1>
+ <sect1>
+ <title>Calling into mac80211 from interrupts</title>
+!Pinclude/net/mac80211.h Calling mac80211 from interrupts
+ </sect1>
+ <sect1>
+ <title>functions/definitions</title>
+!Finclude/net/mac80211.h ieee80211_rx_status
+!Finclude/net/mac80211.h mac80211_rx_flags
+!Finclude/net/mac80211.h ieee80211_tx_info
+!Finclude/net/mac80211.h ieee80211_rx
+!Finclude/net/mac80211.h ieee80211_rx_irqsafe
+!Finclude/net/mac80211.h ieee80211_tx_status
+!Finclude/net/mac80211.h ieee80211_tx_status_irqsafe
+!Finclude/net/mac80211.h ieee80211_rts_get
+!Finclude/net/mac80211.h ieee80211_rts_duration
+!Finclude/net/mac80211.h ieee80211_ctstoself_get
+!Finclude/net/mac80211.h ieee80211_ctstoself_duration
+!Finclude/net/mac80211.h ieee80211_generic_frame_duration
+!Finclude/net/mac80211.h ieee80211_get_hdrlen_from_skb
+!Finclude/net/mac80211.h ieee80211_hdrlen
+!Finclude/net/mac80211.h ieee80211_wake_queue
+!Finclude/net/mac80211.h ieee80211_stop_queue
+!Finclude/net/mac80211.h ieee80211_wake_queues
+!Finclude/net/mac80211.h ieee80211_stop_queues
+ </sect1>
+ </chapter>
+
+ <chapter id="filters">
+ <title>Frame filtering</title>
+!Pinclude/net/mac80211.h Frame filtering
+!Finclude/net/mac80211.h ieee80211_filter_flags
+ </chapter>
+ </part>
+
+ <part id="advanced">
+ <title>Advanced driver interface</title>
+ <partintro>
+ <para>
+ Information contained within this part of the book is
+ of interest only for advanced interaction of mac80211
+ with drivers to exploit more hardware capabilities and
+ improve performance.
+ </para>
+ </partintro>
+
+ <chapter id="hardware-crypto-offload">
+ <title>Hardware crypto acceleration</title>
+!Pinclude/net/mac80211.h Hardware crypto acceleration
+<!-- intentionally multiple !F lines to get proper order -->
+!Finclude/net/mac80211.h set_key_cmd
+!Finclude/net/mac80211.h ieee80211_key_conf
+!Finclude/net/mac80211.h ieee80211_key_alg
+!Finclude/net/mac80211.h ieee80211_key_flags
+ </chapter>
+
+ <chapter id="qos">
+ <title>Multiple queues and QoS support</title>
+ <para>TBD</para>
+!Finclude/net/mac80211.h ieee80211_tx_queue_params
+!Finclude/net/mac80211.h ieee80211_tx_queue_stats
+ </chapter>
+
+ <chapter id="AP">
+ <title>Access point mode support</title>
+ <para>TBD</para>
+ <para>Some parts of the if_conf should be discussed here instead</para>
+ <para>
+ Insert notes about VLAN interfaces with hw crypto here or
+ in the hw crypto chapter.
+ </para>
+!Finclude/net/mac80211.h ieee80211_get_buffered_bc
+!Finclude/net/mac80211.h ieee80211_beacon_get
+ </chapter>
+
+ <chapter id="multi-iface">
+ <title>Supporting multiple virtual interfaces</title>
+ <para>TBD</para>
+ <para>
+ Note: WDS with identical MAC address should almost always be OK
+ </para>
+ <para>
+ Insert notes about having multiple virtual interfaces with
+ different MAC addresses here, note which configurations are
+ supported by mac80211, add notes about supporting hw crypto
+ with it.
+ </para>
+ </chapter>
+
+ <chapter id="hardware-scan-offload">
+ <title>Hardware scan offload</title>
+ <para>TBD</para>
+!Finclude/net/mac80211.h ieee80211_scan_completed
+ </chapter>
+ </part>
+
+ <part id="rate-control">
+ <title>Rate control interface</title>
+ <partintro>
+ <para>TBD</para>
+ <para>
+ This part of the book describes the rate control algorithm
+ interface and how it relates to mac80211 and drivers.
+ </para>
+ </partintro>
+ <chapter id="dummy">
+ <title>dummy chapter</title>
+ <para>TBD</para>
+ </chapter>
+ </part>
+
+ <part id="internal">
+ <title>Internals</title>
+ <partintro>
+ <para>TBD</para>
+ <para>
+ This part of the book describes mac80211 internals.
+ </para>
+ </partintro>
+
+ <chapter id="key-handling">
+ <title>Key handling</title>
+ <sect1>
+ <title>Key handling basics</title>
+!Pnet/mac80211/key.c Key handling basics
+ </sect1>
+ <sect1>
+ <title>MORE TBD</title>
+ <para>TBD</para>
+ </sect1>
+ </chapter>
+
+ <chapter id="rx-processing">
+ <title>Receive processing</title>
+ <para>TBD</para>
+ </chapter>
+
+ <chapter id="tx-processing">
+ <title>Transmit processing</title>
+ <para>TBD</para>
+ </chapter>
+
+ <chapter id="sta-info">
+ <title>Station info handling</title>
+ <sect1>
+ <title>Programming information</title>
+!Fnet/mac80211/sta_info.h sta_info
+!Fnet/mac80211/sta_info.h ieee80211_sta_info_flags
+ </sect1>
+ <sect1>
+ <title>STA information lifetime rules</title>
+!Pnet/mac80211/sta_info.c STA information lifetime rules
+ </sect1>
+ </chapter>
+
+ <chapter id="synchronisation">
+ <title>Synchronisation</title>
+ <para>TBD</para>
+ <para>Locking, lots of RCU</para>
+ </chapter>
+ </part>
+</book>
diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl
new file mode 100644
index 0000000..467ccac
--- /dev/null
+++ b/Documentation/DocBook/mcabook.tmpl
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="MCAGuide">
+ <bookinfo>
+ <title>MCA Driver Programming Interface</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@lxorguk.ukuu.org.uk</email>
+ </address>
+ </affiliation>
+ </author>
+ <author>
+ <firstname>David</firstname>
+ <surname>Weinehall</surname>
+ </author>
+ <author>
+ <firstname>Chris</firstname>
+ <surname>Beauregard</surname>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2000</year>
+ <holder>Alan Cox</holder>
+ <holder>David Weinehall</holder>
+ <holder>Chris Beauregard</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The MCA bus functions provide a generalised interface to find MCA
+ bus cards, to claim them for a driver, and to read and manipulate POS
+ registers without being aware of the motherboard internals or
+ certain deep magic specific to onboard devices.
+ </para>
+ <para>
+ The basic interface to the MCA bus devices is the slot. Each slot
+ is numbered and virtual slot numbers are assigned to the internal
+ devices. Using a pci_dev as other busses do does not really make
+ sense in the MCA context as the MCA bus resources require card
+ specific interpretation.
+ </para>
+ <para>
+ Finally the MCA bus functions provide a parallel set of DMA
+ functions mimicing the ISA bus DMA functions as closely as possible,
+ although also supporting the additional DMA functionality on the
+ MCA bus controllers.
+ </para>
+ </chapter>
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None.
+ </para>
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Edrivers/mca/mca-legacy.c
+ </chapter>
+
+ <chapter id="dmafunctions">
+ <title>DMA Functions Provided</title>
+!Iarch/x86/include/asm/mca_dma.h
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
new file mode 100644
index 0000000..8e14585
--- /dev/null
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -0,0 +1,1318 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="MTD-NAND-Guide">
+ <bookinfo>
+ <title>MTD NAND Driver Programming Interface</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Thomas</firstname>
+ <surname>Gleixner</surname>
+ <affiliation>
+ <address>
+ <email>tglx@linutronix.de</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2004</year>
+ <holder>Thomas Gleixner</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The generic NAND driver supports almost all NAND and AG-AND based
+ chips and connects them to the Memory Technology Devices (MTD)
+ subsystem of the Linux Kernel.
+ </para>
+ <para>
+ This documentation is provided for developers who want to implement
+ board drivers or filesystem drivers suitable for NAND devices.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None.
+ </para>
+ </chapter>
+
+ <chapter id="dochints">
+ <title>Documentation hints</title>
+ <para>
+ The function and structure docs are autogenerated. Each function and
+ struct member has a short description which is marked with an [XXX] identifier.
+ The following chapters explain the meaning of those identifiers.
+ </para>
+ <sect1 id="Function_identifiers_XXX">
+ <title>Function identifiers [XXX]</title>
+ <para>
+ The functions are marked with [XXX] identifiers in the short
+ comment. The identifiers explain the usage and scope of the
+ functions. Following identifiers are used:
+ </para>
+ <itemizedlist>
+ <listitem><para>
+ [MTD Interface]</para><para>
+ These functions provide the interface to the MTD kernel API.
+ They are not replacable and provide functionality
+ which is complete hardware independent.
+ </para></listitem>
+ <listitem><para>
+ [NAND Interface]</para><para>
+ These functions are exported and provide the interface to the NAND kernel API.
+ </para></listitem>
+ <listitem><para>
+ [GENERIC]</para><para>
+ Generic functions are not replacable and provide functionality
+ which is complete hardware independent.
+ </para></listitem>
+ <listitem><para>
+ [DEFAULT]</para><para>
+ Default functions provide hardware related functionality which is suitable
+ for most of the implementations. These functions can be replaced by the
+ board driver if neccecary. Those functions are called via pointers in the
+ NAND chip description structure. The board driver can set the functions which
+ should be replaced by board dependent functions before calling nand_scan().
+ If the function pointer is NULL on entry to nand_scan() then the pointer
+ is set to the default function which is suitable for the detected chip type.
+ </para></listitem>
+ </itemizedlist>
+ </sect1>
+ <sect1 id="Struct_member_identifiers_XXX">
+ <title>Struct member identifiers [XXX]</title>
+ <para>
+ The struct members are marked with [XXX] identifiers in the
+ comment. The identifiers explain the usage and scope of the
+ members. Following identifiers are used:
+ </para>
+ <itemizedlist>
+ <listitem><para>
+ [INTERN]</para><para>
+ These members are for NAND driver internal use only and must not be
+ modified. Most of these values are calculated from the chip geometry
+ information which is evaluated during nand_scan().
+ </para></listitem>
+ <listitem><para>
+ [REPLACEABLE]</para><para>
+ Replaceable members hold hardware related functions which can be
+ provided by the board driver. The board driver can set the functions which
+ should be replaced by board dependent functions before calling nand_scan().
+ If the function pointer is NULL on entry to nand_scan() then the pointer
+ is set to the default function which is suitable for the detected chip type.
+ </para></listitem>
+ <listitem><para>
+ [BOARDSPECIFIC]</para><para>
+ Board specific members hold hardware related information which must
+ be provided by the board driver. The board driver must set the function
+ pointers and datafields before calling nand_scan().
+ </para></listitem>
+ <listitem><para>
+ [OPTIONAL]</para><para>
+ Optional members can hold information relevant for the board driver. The
+ generic NAND driver code does not use this information.
+ </para></listitem>
+ </itemizedlist>
+ </sect1>
+ </chapter>
+
+ <chapter id="basicboarddriver">
+ <title>Basic board driver</title>
+ <para>
+ For most boards it will be sufficient to provide just the
+ basic functions and fill out some really board dependent
+ members in the nand chip description structure.
+ </para>
+ <sect1 id="Basic_defines">
+ <title>Basic defines</title>
+ <para>
+ At least you have to provide a mtd structure and
+ a storage for the ioremap'ed chip address.
+ You can allocate the mtd structure using kmalloc
+ or you can allocate it statically.
+ In case of static allocation you have to allocate
+ a nand_chip structure too.
+ </para>
+ <para>
+ Kmalloc based example
+ </para>
+ <programlisting>
+static struct mtd_info *board_mtd;
+static unsigned long baseaddr;
+ </programlisting>
+ <para>
+ Static example
+ </para>
+ <programlisting>
+static struct mtd_info board_mtd;
+static struct nand_chip board_chip;
+static unsigned long baseaddr;
+ </programlisting>
+ </sect1>
+ <sect1 id="Partition_defines">
+ <title>Partition defines</title>
+ <para>
+ If you want to divide your device into partitions, then
+ enable the configuration switch CONFIG_MTD_PARTITIONS and define
+ a partitioning scheme suitable to your board.
+ </para>
+ <programlisting>
+#define NUM_PARTITIONS 2
+static struct mtd_partition partition_info[] = {
+ { .name = "Flash partition 1",
+ .offset = 0,
+ .size = 8 * 1024 * 1024 },
+ { .name = "Flash partition 2",
+ .offset = MTDPART_OFS_NEXT,
+ .size = MTDPART_SIZ_FULL },
+};
+ </programlisting>
+ </sect1>
+ <sect1 id="Hardware_control_functions">
+ <title>Hardware control function</title>
+ <para>
+ The hardware control function provides access to the
+ control pins of the NAND chip(s).
+ The access can be done by GPIO pins or by address lines.
+ If you use address lines, make sure that the timing
+ requirements are met.
+ </para>
+ <para>
+ <emphasis>GPIO based example</emphasis>
+ </para>
+ <programlisting>
+static void board_hwcontrol(struct mtd_info *mtd, int cmd)
+{
+ switch(cmd){
+ case NAND_CTL_SETCLE: /* Set CLE pin high */ break;
+ case NAND_CTL_CLRCLE: /* Set CLE pin low */ break;
+ case NAND_CTL_SETALE: /* Set ALE pin high */ break;
+ case NAND_CTL_CLRALE: /* Set ALE pin low */ break;
+ case NAND_CTL_SETNCE: /* Set nCE pin low */ break;
+ case NAND_CTL_CLRNCE: /* Set nCE pin high */ break;
+ }
+}
+ </programlisting>
+ <para>
+ <emphasis>Address lines based example.</emphasis> It's assumed that the
+ nCE pin is driven by a chip select decoder.
+ </para>
+ <programlisting>
+static void board_hwcontrol(struct mtd_info *mtd, int cmd)
+{
+ struct nand_chip *this = (struct nand_chip *) mtd->priv;
+ switch(cmd){
+ case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break;
+ case NAND_CTL_CLRCLE: this->IO_ADDR_W &amp;= ~CLE_ADRR_BIT; break;
+ case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break;
+ case NAND_CTL_CLRALE: this->IO_ADDR_W &amp;= ~ALE_ADRR_BIT; break;
+ }
+}
+ </programlisting>
+ </sect1>
+ <sect1 id="Device_ready_function">
+ <title>Device ready function</title>
+ <para>
+ If the hardware interface has the ready busy pin of the NAND chip connected to a
+ GPIO or other accesible I/O pin, this function is used to read back the state of the
+ pin. The function has no arguments and should return 0, if the device is busy (R/B pin
+ is low) and 1, if the device is ready (R/B pin is high).
+ If the hardware interface does not give access to the ready busy pin, then
+ the function must not be defined and the function pointer this->dev_ready is set to NULL.
+ </para>
+ </sect1>
+ <sect1 id="Init_function">
+ <title>Init function</title>
+ <para>
+ The init function allocates memory and sets up all the board
+ specific parameters and function pointers. When everything
+ is set up nand_scan() is called. This function tries to
+ detect and identify then chip. If a chip is found all the
+ internal data fields are initialized accordingly.
+ The structure(s) have to be zeroed out first and then filled with the neccecary
+ information about the device.
+ </para>
+ <programlisting>
+int __init board_init (void)
+{
+ struct nand_chip *this;
+ int err = 0;
+
+ /* Allocate memory for MTD device structure and private data */
+ board_mtd = kzalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL);
+ if (!board_mtd) {
+ printk ("Unable to allocate NAND MTD device structure.\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* map physical address */
+ baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
+ if(!baseaddr){
+ printk("Ioremap to access NAND chip failed\n");
+ err = -EIO;
+ goto out_mtd;
+ }
+
+ /* Get pointer to private data */
+ this = (struct nand_chip *) ();
+ /* Link the private data with the MTD structure */
+ board_mtd->priv = this;
+
+ /* Set address of NAND IO lines */
+ this->IO_ADDR_R = baseaddr;
+ this->IO_ADDR_W = baseaddr;
+ /* Reference hardware control function */
+ this->hwcontrol = board_hwcontrol;
+ /* Set command delay time, see datasheet for correct value */
+ this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY;
+ /* Assign the device ready function, if available */
+ this->dev_ready = board_dev_ready;
+ this->eccmode = NAND_ECC_SOFT;
+
+ /* Scan to find existence of the device */
+ if (nand_scan (board_mtd, 1)) {
+ err = -ENXIO;
+ goto out_ior;
+ }
+
+ add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS);
+ goto out;
+
+out_ior:
+ iounmap((void *)baseaddr);
+out_mtd:
+ kfree (board_mtd);
+out:
+ return err;
+}
+module_init(board_init);
+ </programlisting>
+ </sect1>
+ <sect1 id="Exit_function">
+ <title>Exit function</title>
+ <para>
+ The exit function is only neccecary if the driver is
+ compiled as a module. It releases all resources which
+ are held by the chip driver and unregisters the partitions
+ in the MTD layer.
+ </para>
+ <programlisting>
+#ifdef MODULE
+static void __exit board_cleanup (void)
+{
+ /* Release resources, unregister device */
+ nand_release (board_mtd);
+
+ /* unmap physical address */
+ iounmap((void *)baseaddr);
+
+ /* Free the MTD device structure */
+ kfree (board_mtd);
+}
+module_exit(board_cleanup);
+#endif
+ </programlisting>
+ </sect1>
+ </chapter>
+
+ <chapter id="boarddriversadvanced">
+ <title>Advanced board driver functions</title>
+ <para>
+ This chapter describes the advanced functionality of the NAND
+ driver. For a list of functions which can be overridden by the board
+ driver see the documentation of the nand_chip structure.
+ </para>
+ <sect1 id="Multiple_chip_control">
+ <title>Multiple chip control</title>
+ <para>
+ The nand driver can control chip arrays. Therefor the
+ board driver must provide an own select_chip function. This
+ function must (de)select the requested chip.
+ The function pointer in the nand_chip structure must
+ be set before calling nand_scan(). The maxchip parameter
+ of nand_scan() defines the maximum number of chips to
+ scan for. Make sure that the select_chip function can
+ handle the requested number of chips.
+ </para>
+ <para>
+ The nand driver concatenates the chips to one virtual
+ chip and provides this virtual chip to the MTD layer.
+ </para>
+ <para>
+ <emphasis>Note: The driver can only handle linear chip arrays
+ of equally sized chips. There is no support for
+ parallel arrays which extend the buswidth.</emphasis>
+ </para>
+ <para>
+ <emphasis>GPIO based example</emphasis>
+ </para>
+ <programlisting>
+static void board_select_chip (struct mtd_info *mtd, int chip)
+{
+ /* Deselect all chips, set all nCE pins high */
+ GPIO(BOARD_NAND_NCE) |= 0xff;
+ if (chip >= 0)
+ GPIO(BOARD_NAND_NCE) &amp;= ~ (1 &lt;&lt; chip);
+}
+ </programlisting>
+ <para>
+ <emphasis>Address lines based example.</emphasis>
+ Its assumed that the nCE pins are connected to an
+ address decoder.
+ </para>
+ <programlisting>
+static void board_select_chip (struct mtd_info *mtd, int chip)
+{
+ struct nand_chip *this = (struct nand_chip *) mtd->priv;
+
+ /* Deselect all chips */
+ this->IO_ADDR_R &amp;= ~BOARD_NAND_ADDR_MASK;
+ this->IO_ADDR_W &amp;= ~BOARD_NAND_ADDR_MASK;
+ switch (chip) {
+ case 0:
+ this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0;
+ this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0;
+ break;
+ ....
+ case n:
+ this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn;
+ this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn;
+ break;
+ }
+}
+ </programlisting>
+ </sect1>
+ <sect1 id="Hardware_ECC_support">
+ <title>Hardware ECC support</title>
+ <sect2 id="Functions_and_constants">
+ <title>Functions and constants</title>
+ <para>
+ The nand driver supports three different types of
+ hardware ECC.
+ <itemizedlist>
+ <listitem><para>NAND_ECC_HW3_256</para><para>
+ Hardware ECC generator providing 3 bytes ECC per
+ 256 byte.
+ </para> </listitem>
+ <listitem><para>NAND_ECC_HW3_512</para><para>
+ Hardware ECC generator providing 3 bytes ECC per
+ 512 byte.
+ </para> </listitem>
+ <listitem><para>NAND_ECC_HW6_512</para><para>
+ Hardware ECC generator providing 6 bytes ECC per
+ 512 byte.
+ </para> </listitem>
+ <listitem><para>NAND_ECC_HW8_512</para><para>
+ Hardware ECC generator providing 6 bytes ECC per
+ 512 byte.
+ </para> </listitem>
+ </itemizedlist>
+ If your hardware generator has a different functionality
+ add it at the appropriate place in nand_base.c
+ </para>
+ <para>
+ The board driver must provide following functions:
+ <itemizedlist>
+ <listitem><para>enable_hwecc</para><para>
+ This function is called before reading / writing to
+ the chip. Reset or initialize the hardware generator
+ in this function. The function is called with an
+ argument which let you distinguish between read
+ and write operations.
+ </para> </listitem>
+ <listitem><para>calculate_ecc</para><para>
+ This function is called after read / write from / to
+ the chip. Transfer the ECC from the hardware to
+ the buffer. If the option NAND_HWECC_SYNDROME is set
+ then the function is only called on write. See below.
+ </para> </listitem>
+ <listitem><para>correct_data</para><para>
+ In case of an ECC error this function is called for
+ error detection and correction. Return 1 respectively 2
+ in case the error can be corrected. If the error is
+ not correctable return -1. If your hardware generator
+ matches the default algorithm of the nand_ecc software
+ generator then use the correction function provided
+ by nand_ecc instead of implementing duplicated code.
+ </para> </listitem>
+ </itemizedlist>
+ </para>
+ </sect2>
+ <sect2 id="Hardware_ECC_with_syndrome_calculation">
+ <title>Hardware ECC with syndrome calculation</title>
+ <para>
+ Many hardware ECC implementations provide Reed-Solomon
+ codes and calculate an error syndrome on read. The syndrome
+ must be converted to a standard Reed-Solomon syndrome
+ before calling the error correction code in the generic
+ Reed-Solomon library.
+ </para>
+ <para>
+ The ECC bytes must be placed immidiately after the data
+ bytes in order to make the syndrome generator work. This
+ is contrary to the usual layout used by software ECC. The
+ seperation of data and out of band area is not longer
+ possible. The nand driver code handles this layout and
+ the remaining free bytes in the oob area are managed by
+ the autoplacement code. Provide a matching oob-layout
+ in this case. See rts_from4.c and diskonchip.c for
+ implementation reference. In those cases we must also
+ use bad block tables on FLASH, because the ECC layout is
+ interferring with the bad block marker positions.
+ See bad block table support for details.
+ </para>
+ </sect2>
+ </sect1>
+ <sect1 id="Bad_Block_table_support">
+ <title>Bad block table support</title>
+ <para>
+ Most NAND chips mark the bad blocks at a defined
+ position in the spare area. Those blocks must
+ not be erased under any circumstances as the bad
+ block information would be lost.
+ It is possible to check the bad block mark each
+ time when the blocks are accessed by reading the
+ spare area of the first page in the block. This
+ is time consuming so a bad block table is used.
+ </para>
+ <para>
+ The nand driver supports various types of bad block
+ tables.
+ <itemizedlist>
+ <listitem><para>Per device</para><para>
+ The bad block table contains all bad block information
+ of the device which can consist of multiple chips.
+ </para> </listitem>
+ <listitem><para>Per chip</para><para>
+ A bad block table is used per chip and contains the
+ bad block information for this particular chip.
+ </para> </listitem>
+ <listitem><para>Fixed offset</para><para>
+ The bad block table is located at a fixed offset
+ in the chip (device). This applies to various
+ DiskOnChip devices.
+ </para> </listitem>
+ <listitem><para>Automatic placed</para><para>
+ The bad block table is automatically placed and
+ detected either at the end or at the beginning
+ of a chip (device)
+ </para> </listitem>
+ <listitem><para>Mirrored tables</para><para>
+ The bad block table is mirrored on the chip (device) to
+ allow updates of the bad block table without data loss.
+ </para> </listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ nand_scan() calls the function nand_default_bbt().
+ nand_default_bbt() selects appropriate default
+ bad block table desriptors depending on the chip information
+ which was retrieved by nand_scan().
+ </para>
+ <para>
+ The standard policy is scanning the device for bad
+ blocks and build a ram based bad block table which
+ allows faster access than always checking the
+ bad block information on the flash chip itself.
+ </para>
+ <sect2 id="Flash_based_tables">
+ <title>Flash based tables</title>
+ <para>
+ It may be desired or neccecary to keep a bad block table in FLASH.
+ For AG-AND chips this is mandatory, as they have no factory marked
+ bad blocks. They have factory marked good blocks. The marker pattern
+ is erased when the block is erased to be reused. So in case of
+ powerloss before writing the pattern back to the chip this block
+ would be lost and added to the bad blocks. Therefor we scan the
+ chip(s) when we detect them the first time for good blocks and
+ store this information in a bad block table before erasing any
+ of the blocks.
+ </para>
+ <para>
+ The blocks in which the tables are stored are procteted against
+ accidental access by marking them bad in the memory bad block
+ table. The bad block table managment functions are allowed
+ to circumvernt this protection.
+ </para>
+ <para>
+ The simplest way to activate the FLASH based bad block table support
+ is to set the option NAND_USE_FLASH_BBT in the option field of
+ the nand chip structure before calling nand_scan(). For AG-AND
+ chips is this done by default.
+ This activates the default FLASH based bad block table functionality
+ of the NAND driver. The default bad block table options are
+ <itemizedlist>
+ <listitem><para>Store bad block table per chip</para></listitem>
+ <listitem><para>Use 2 bits per block</para></listitem>
+ <listitem><para>Automatic placement at the end of the chip</para></listitem>
+ <listitem><para>Use mirrored tables with version numbers</para></listitem>
+ <listitem><para>Reserve 4 blocks at the end of the chip</para></listitem>
+ </itemizedlist>
+ </para>
+ </sect2>
+ <sect2 id="User_defined_tables">
+ <title>User defined tables</title>
+ <para>
+ User defined tables are created by filling out a
+ nand_bbt_descr structure and storing the pointer in the
+ nand_chip structure member bbt_td before calling nand_scan().
+ If a mirror table is neccecary a second structure must be
+ created and a pointer to this structure must be stored
+ in bbt_md inside the nand_chip structure. If the bbt_md
+ member is set to NULL then only the main table is used
+ and no scan for the mirrored table is performed.
+ </para>
+ <para>
+ The most important field in the nand_bbt_descr structure
+ is the options field. The options define most of the
+ table properties. Use the predefined constants from
+ nand.h to define the options.
+ <itemizedlist>
+ <listitem><para>Number of bits per block</para>
+ <para>The supported number of bits is 1, 2, 4, 8.</para></listitem>
+ <listitem><para>Table per chip</para>
+ <para>Setting the constant NAND_BBT_PERCHIP selects that
+ a bad block table is managed for each chip in a chip array.
+ If this option is not set then a per device bad block table
+ is used.</para></listitem>
+ <listitem><para>Table location is absolute</para>
+ <para>Use the option constant NAND_BBT_ABSPAGE and
+ define the absolute page number where the bad block
+ table starts in the field pages. If you have selected bad block
+ tables per chip and you have a multi chip array then the start page
+ must be given for each chip in the chip array. Note: there is no scan
+ for a table ident pattern performed, so the fields
+ pattern, veroffs, offs, len can be left uninitialized</para></listitem>
+ <listitem><para>Table location is automatically detected</para>
+ <para>The table can either be located in the first or the last good
+ blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place
+ the bad block table at the end of the chip (device). The
+ bad block tables are marked and identified by a pattern which
+ is stored in the spare area of the first page in the block which
+ holds the bad block table. Store a pointer to the pattern
+ in the pattern field. Further the length of the pattern has to be
+ stored in len and the offset in the spare area must be given
+ in the offs member of the nand_bbt_descr stucture. For mirrored
+ bad block tables different patterns are mandatory.</para></listitem>
+ <listitem><para>Table creation</para>
+ <para>Set the option NAND_BBT_CREATE to enable the table creation
+ if no table can be found during the scan. Usually this is done only
+ once if a new chip is found. </para></listitem>
+ <listitem><para>Table write support</para>
+ <para>Set the option NAND_BBT_WRITE to enable the table write support.
+ This allows the update of the bad block table(s) in case a block has
+ to be marked bad due to wear. The MTD interface function block_markbad
+ is calling the update function of the bad block table. If the write
+ support is enabled then the table is updated on FLASH.</para>
+ <para>
+ Note: Write support should only be enabled for mirrored tables with
+ version control.
+ </para></listitem>
+ <listitem><para>Table version control</para>
+ <para>Set the option NAND_BBT_VERSION to enable the table version control.
+ It's highly recommended to enable this for mirrored tables with write
+ support. It makes sure that the risk of loosing the bad block
+ table information is reduced to the loss of the information about the
+ one worn out block which should be marked bad. The version is stored in
+ 4 consecutive bytes in the spare area of the device. The position of
+ the version number is defined by the member veroffs in the bad block table
+ descriptor.</para></listitem>
+ <listitem><para>Save block contents on write</para>
+ <para>
+ In case that the block which holds the bad block table does contain
+ other useful information, set the option NAND_BBT_SAVECONTENT. When
+ the bad block table is written then the whole block is read the bad
+ block table is updated and the block is erased and everything is
+ written back. If this option is not set only the bad block table
+ is written and everything else in the block is ignored and erased.
+ </para></listitem>
+ <listitem><para>Number of reserved blocks</para>
+ <para>
+ For automatic placement some blocks must be reserved for
+ bad block table storage. The number of reserved blocks is defined
+ in the maxblocks member of the babd block table description structure.
+ Reserving 4 blocks for mirrored tables should be a reasonable number.
+ This also limits the number of blocks which are scanned for the bad
+ block table ident pattern.
+ </para></listitem>
+ </itemizedlist>
+ </para>
+ </sect2>
+ </sect1>
+ <sect1 id="Spare_area_placement">
+ <title>Spare area (auto)placement</title>
+ <para>
+ The nand driver implements different possibilities for
+ placement of filesystem data in the spare area,
+ <itemizedlist>
+ <listitem><para>Placement defined by fs driver</para></listitem>
+ <listitem><para>Automatic placement</para></listitem>
+ </itemizedlist>
+ The default placement function is automatic placement. The
+ nand driver has built in default placement schemes for the
+ various chiptypes. If due to hardware ECC functionality the
+ default placement does not fit then the board driver can
+ provide a own placement scheme.
+ </para>
+ <para>
+ File system drivers can provide a own placement scheme which
+ is used instead of the default placement scheme.
+ </para>
+ <para>
+ Placement schemes are defined by a nand_oobinfo structure
+ <programlisting>
+struct nand_oobinfo {
+ int useecc;
+ int eccbytes;
+ int eccpos[24];
+ int oobfree[8][2];
+};
+ </programlisting>
+ <itemizedlist>
+ <listitem><para>useecc</para><para>
+ The useecc member controls the ecc and placement function. The header
+ file include/mtd/mtd-abi.h contains constants to select ecc and
+ placement. MTD_NANDECC_OFF switches off the ecc complete. This is
+ not recommended and available for testing and diagnosis only.
+ MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE
+ selects automatic placement.
+ </para></listitem>
+ <listitem><para>eccbytes</para><para>
+ The eccbytes member defines the number of ecc bytes per page.
+ </para></listitem>
+ <listitem><para>eccpos</para><para>
+ The eccpos array holds the byte offsets in the spare area where
+ the ecc codes are placed.
+ </para></listitem>
+ <listitem><para>oobfree</para><para>
+ The oobfree array defines the areas in the spare area which can be
+ used for automatic placement. The information is given in the format
+ {offset, size}. offset defines the start of the usable area, size the
+ length in bytes. More than one area can be defined. The list is terminated
+ by an {0, 0} entry.
+ </para></listitem>
+ </itemizedlist>
+ </para>
+ <sect2 id="Placement_defined_by_fs_driver">
+ <title>Placement defined by fs driver</title>
+ <para>
+ The calling function provides a pointer to a nand_oobinfo
+ structure which defines the ecc placement. For writes the
+ caller must provide a spare area buffer along with the
+ data buffer. The spare area buffer size is (number of pages) *
+ (size of spare area). For reads the buffer size is
+ (number of pages) * ((size of spare area) + (number of ecc
+ steps per page) * sizeof (int)). The driver stores the
+ result of the ecc check for each tuple in the spare buffer.
+ The storage sequence is
+ </para>
+ <para>
+ &lt;spare data page 0&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
+ </para>
+ <para>
+ ...
+ </para>
+ <para>
+ &lt;spare data page n&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
+ </para>
+ <para>
+ This is a legacy mode used by YAFFS1.
+ </para>
+ <para>
+ If the spare area buffer is NULL then only the ECC placement is
+ done according to the given scheme in the nand_oobinfo structure.
+ </para>
+ </sect2>
+ <sect2 id="Automatic_placement">
+ <title>Automatic placement</title>
+ <para>
+ Automatic placement uses the built in defaults to place the
+ ecc bytes in the spare area. If filesystem data have to be stored /
+ read into the spare area then the calling function must provide a
+ buffer. The buffer size per page is determined by the oobfree array in
+ the nand_oobinfo structure.
+ </para>
+ <para>
+ If the spare area buffer is NULL then only the ECC placement is
+ done according to the default builtin scheme.
+ </para>
+ </sect2>
+ <sect2 id="User_space_placement_selection">
+ <title>User space placement selection</title>
+ <para>
+ All non ecc functions like mtd->read and mtd->write use an internal
+ structure, which can be set by an ioctl. This structure is preset
+ to the autoplacement default.
+ <programlisting>
+ ioctl (fd, MEMSETOOBSEL, oobsel);
+ </programlisting>
+ oobsel is a pointer to a user supplied structure of type
+ nand_oobconfig. The contents of this structure must match the
+ criteria of the filesystem, which will be used. See an example in utils/nandwrite.c.
+ </para>
+ </sect2>
+ </sect1>
+ <sect1 id="Spare_area_autoplacement_default">
+ <title>Spare area autoplacement default schemes</title>
+ <sect2 id="pagesize_256">
+ <title>256 byte pagesize</title>
+<informaltable><tgroup cols="3"><tbody>
+<row>
+<entry>Offset</entry>
+<entry>Content</entry>
+<entry>Comment</entry>
+</row>
+<row>
+<entry>0x00</entry>
+<entry>ECC byte 0</entry>
+<entry>Error correction code byte 0</entry>
+</row>
+<row>
+<entry>0x01</entry>
+<entry>ECC byte 1</entry>
+<entry>Error correction code byte 1</entry>
+</row>
+<row>
+<entry>0x02</entry>
+<entry>ECC byte 2</entry>
+<entry>Error correction code byte 2</entry>
+</row>
+<row>
+<entry>0x03</entry>
+<entry>Autoplace 0</entry>
+<entry></entry>
+</row>
+<row>
+<entry>0x04</entry>
+<entry>Autoplace 1</entry>
+<entry></entry>
+</row>
+<row>
+<entry>0x05</entry>
+<entry>Bad block marker</entry>
+<entry>If any bit in this byte is zero, then this block is bad.
+This applies only to the first page in a block. In the remaining
+pages this byte is reserved</entry>
+</row>
+<row>
+<entry>0x06</entry>
+<entry>Autoplace 2</entry>
+<entry></entry>
+</row>
+<row>
+<entry>0x07</entry>
+<entry>Autoplace 3</entry>
+<entry></entry>
+</row>
+</tbody></tgroup></informaltable>
+ </sect2>
+ <sect2 id="pagesize_512">
+ <title>512 byte pagesize</title>
+<informaltable><tgroup cols="3"><tbody>
+<row>
+<entry>Offset</entry>
+<entry>Content</entry>
+<entry>Comment</entry>
+</row>
+<row>
+<entry>0x00</entry>
+<entry>ECC byte 0</entry>
+<entry>Error correction code byte 0 of the lower 256 Byte data in
+this page</entry>
+</row>
+<row>
+<entry>0x01</entry>
+<entry>ECC byte 1</entry>
+<entry>Error correction code byte 1 of the lower 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x02</entry>
+<entry>ECC byte 2</entry>
+<entry>Error correction code byte 2 of the lower 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x03</entry>
+<entry>ECC byte 3</entry>
+<entry>Error correction code byte 0 of the upper 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x04</entry>
+<entry>reserved</entry>
+<entry>reserved</entry>
+</row>
+<row>
+<entry>0x05</entry>
+<entry>Bad block marker</entry>
+<entry>If any bit in this byte is zero, then this block is bad.
+This applies only to the first page in a block. In the remaining
+pages this byte is reserved</entry>
+</row>
+<row>
+<entry>0x06</entry>
+<entry>ECC byte 4</entry>
+<entry>Error correction code byte 1 of the upper 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x07</entry>
+<entry>ECC byte 5</entry>
+<entry>Error correction code byte 2 of the upper 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x08 - 0x0F</entry>
+<entry>Autoplace 0 - 7</entry>
+<entry></entry>
+</row>
+</tbody></tgroup></informaltable>
+ </sect2>
+ <sect2 id="pagesize_2048">
+ <title>2048 byte pagesize</title>
+<informaltable><tgroup cols="3"><tbody>
+<row>
+<entry>Offset</entry>
+<entry>Content</entry>
+<entry>Comment</entry>
+</row>
+<row>
+<entry>0x00</entry>
+<entry>Bad block marker</entry>
+<entry>If any bit in this byte is zero, then this block is bad.
+This applies only to the first page in a block. In the remaining
+pages this byte is reserved</entry>
+</row>
+<row>
+<entry>0x01</entry>
+<entry>Reserved</entry>
+<entry>Reserved</entry>
+</row>
+<row>
+<entry>0x02-0x27</entry>
+<entry>Autoplace 0 - 37</entry>
+<entry></entry>
+</row>
+<row>
+<entry>0x28</entry>
+<entry>ECC byte 0</entry>
+<entry>Error correction code byte 0 of the first 256 Byte data in
+this page</entry>
+</row>
+<row>
+<entry>0x29</entry>
+<entry>ECC byte 1</entry>
+<entry>Error correction code byte 1 of the first 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2A</entry>
+<entry>ECC byte 2</entry>
+<entry>Error correction code byte 2 of the first 256 Bytes data in
+this page</entry>
+</row>
+<row>
+<entry>0x2B</entry>
+<entry>ECC byte 3</entry>
+<entry>Error correction code byte 0 of the second 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2C</entry>
+<entry>ECC byte 4</entry>
+<entry>Error correction code byte 1 of the second 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2D</entry>
+<entry>ECC byte 5</entry>
+<entry>Error correction code byte 2 of the second 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2E</entry>
+<entry>ECC byte 6</entry>
+<entry>Error correction code byte 0 of the third 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2F</entry>
+<entry>ECC byte 7</entry>
+<entry>Error correction code byte 1 of the third 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x30</entry>
+<entry>ECC byte 8</entry>
+<entry>Error correction code byte 2 of the third 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x31</entry>
+<entry>ECC byte 9</entry>
+<entry>Error correction code byte 0 of the fourth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x32</entry>
+<entry>ECC byte 10</entry>
+<entry>Error correction code byte 1 of the fourth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x33</entry>
+<entry>ECC byte 11</entry>
+<entry>Error correction code byte 2 of the fourth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x34</entry>
+<entry>ECC byte 12</entry>
+<entry>Error correction code byte 0 of the fifth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x35</entry>
+<entry>ECC byte 13</entry>
+<entry>Error correction code byte 1 of the fifth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x36</entry>
+<entry>ECC byte 14</entry>
+<entry>Error correction code byte 2 of the fifth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x37</entry>
+<entry>ECC byte 15</entry>
+<entry>Error correction code byte 0 of the sixt 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x38</entry>
+<entry>ECC byte 16</entry>
+<entry>Error correction code byte 1 of the sixt 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x39</entry>
+<entry>ECC byte 17</entry>
+<entry>Error correction code byte 2 of the sixt 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x3A</entry>
+<entry>ECC byte 18</entry>
+<entry>Error correction code byte 0 of the seventh 256 Bytes of
+data in this page</entry>
+</row>
+<row>
+<entry>0x3B</entry>
+<entry>ECC byte 19</entry>
+<entry>Error correction code byte 1 of the seventh 256 Bytes of
+data in this page</entry>
+</row>
+<row>
+<entry>0x3C</entry>
+<entry>ECC byte 20</entry>
+<entry>Error correction code byte 2 of the seventh 256 Bytes of
+data in this page</entry>
+</row>
+<row>
+<entry>0x3D</entry>
+<entry>ECC byte 21</entry>
+<entry>Error correction code byte 0 of the eigth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x3E</entry>
+<entry>ECC byte 22</entry>
+<entry>Error correction code byte 1 of the eigth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x3F</entry>
+<entry>ECC byte 23</entry>
+<entry>Error correction code byte 2 of the eigth 256 Bytes of data
+in this page</entry>
+</row>
+</tbody></tgroup></informaltable>
+ </sect2>
+ </sect1>
+ </chapter>
+
+ <chapter id="filesystems">
+ <title>Filesystem support</title>
+ <para>
+ The NAND driver provides all neccecary functions for a
+ filesystem via the MTD interface.
+ </para>
+ <para>
+ Filesystems must be aware of the NAND pecularities and
+ restrictions. One major restrictions of NAND Flash is, that you cannot
+ write as often as you want to a page. The consecutive writes to a page,
+ before erasing it again, are restricted to 1-3 writes, depending on the
+ manufacturers specifications. This applies similar to the spare area.
+ </para>
+ <para>
+ Therefor NAND aware filesystems must either write in page size chunks
+ or hold a writebuffer to collect smaller writes until they sum up to
+ pagesize. Available NAND aware filesystems: JFFS2, YAFFS.
+ </para>
+ <para>
+ The spare area usage to store filesystem data is controlled by
+ the spare area placement functionality which is described in one
+ of the earlier chapters.
+ </para>
+ </chapter>
+ <chapter id="tools">
+ <title>Tools</title>
+ <para>
+ The MTD project provides a couple of helpful tools to handle NAND Flash.
+ <itemizedlist>
+ <listitem><para>flasherase, flasheraseall: Erase and format FLASH partitions</para></listitem>
+ <listitem><para>nandwrite: write filesystem images to NAND FLASH</para></listitem>
+ <listitem><para>nanddump: dump the contents of a NAND FLASH partitions</para></listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ These tools are aware of the NAND restrictions. Please use those tools
+ instead of complaining about errors which are caused by non NAND aware
+ access methods.
+ </para>
+ </chapter>
+
+ <chapter id="defines">
+ <title>Constants</title>
+ <para>
+ This chapter describes the constants which might be relevant for a driver developer.
+ </para>
+ <sect1 id="Chip_option_constants">
+ <title>Chip option constants</title>
+ <sect2 id="Constants_for_chip_id_table">
+ <title>Constants for chip id table</title>
+ <para>
+ These constants are defined in nand.h. They are ored together to describe
+ the chip functionality.
+ <programlisting>
+/* Chip can not auto increment pages */
+#define NAND_NO_AUTOINCR 0x00000001
+/* Buswitdh is 16 bit */
+#define NAND_BUSWIDTH_16 0x00000002
+/* Device supports partial programming without padding */
+#define NAND_NO_PADDING 0x00000004
+/* Chip has cache program function */
+#define NAND_CACHEPRG 0x00000008
+/* Chip has copy back function */
+#define NAND_COPYBACK 0x00000010
+/* AND Chip which has 4 banks and a confusing page / block
+ * assignment. See Renesas datasheet for further information */
+#define NAND_IS_AND 0x00000020
+/* Chip has a array of 4 pages which can be read without
+ * additional ready /busy waits */
+#define NAND_4PAGE_ARRAY 0x00000040
+ </programlisting>
+ </para>
+ </sect2>
+ <sect2 id="Constants_for_runtime_options">
+ <title>Constants for runtime options</title>
+ <para>
+ These constants are defined in nand.h. They are ored together to describe
+ the functionality.
+ <programlisting>
+/* Use a flash based bad block table. This option is parsed by the
+ * default bad block table function (nand_default_bbt). */
+#define NAND_USE_FLASH_BBT 0x00010000
+/* The hw ecc generator provides a syndrome instead a ecc value on read
+ * This can only work if we have the ecc bytes directly behind the
+ * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */
+#define NAND_HWECC_SYNDROME 0x00020000
+ </programlisting>
+ </para>
+ </sect2>
+ </sect1>
+
+ <sect1 id="EEC_selection_constants">
+ <title>ECC selection constants</title>
+ <para>
+ Use these constants to select the ECC algorithm.
+ <programlisting>
+/* No ECC. Usage is not recommended ! */
+#define NAND_ECC_NONE 0
+/* Software ECC 3 byte ECC per 256 Byte data */
+#define NAND_ECC_SOFT 1
+/* Hardware ECC 3 byte ECC per 256 Byte data */
+#define NAND_ECC_HW3_256 2
+/* Hardware ECC 3 byte ECC per 512 Byte data */
+#define NAND_ECC_HW3_512 3
+/* Hardware ECC 6 byte ECC per 512 Byte data */
+#define NAND_ECC_HW6_512 4
+/* Hardware ECC 6 byte ECC per 512 Byte data */
+#define NAND_ECC_HW8_512 6
+ </programlisting>
+ </para>
+ </sect1>
+
+ <sect1 id="Hardware_control_related_constants">
+ <title>Hardware control related constants</title>
+ <para>
+ These constants describe the requested hardware access function when
+ the boardspecific hardware control function is called
+ <programlisting>
+/* Select the chip by setting nCE to low */
+#define NAND_CTL_SETNCE 1
+/* Deselect the chip by setting nCE to high */
+#define NAND_CTL_CLRNCE 2
+/* Select the command latch by setting CLE to high */
+#define NAND_CTL_SETCLE 3
+/* Deselect the command latch by setting CLE to low */
+#define NAND_CTL_CLRCLE 4
+/* Select the address latch by setting ALE to high */
+#define NAND_CTL_SETALE 5
+/* Deselect the address latch by setting ALE to low */
+#define NAND_CTL_CLRALE 6
+/* Set write protection by setting WP to high. Not used! */
+#define NAND_CTL_SETWP 7
+/* Clear write protection by setting WP to low. Not used! */
+#define NAND_CTL_CLRWP 8
+ </programlisting>
+ </para>
+ </sect1>
+
+ <sect1 id="Bad_block_table_constants">
+ <title>Bad block table related constants</title>
+ <para>
+ These constants describe the options used for bad block
+ table descriptors.
+ <programlisting>
+/* Options for the bad block table descriptors */
+
+/* The number of bits used per block in the bbt on the device */
+#define NAND_BBT_NRBITS_MSK 0x0000000F
+#define NAND_BBT_1BIT 0x00000001
+#define NAND_BBT_2BIT 0x00000002
+#define NAND_BBT_4BIT 0x00000004
+#define NAND_BBT_8BIT 0x00000008
+/* The bad block table is in the last good block of the device */
+#define NAND_BBT_LASTBLOCK 0x00000010
+/* The bbt is at the given page, else we must scan for the bbt */
+#define NAND_BBT_ABSPAGE 0x00000020
+/* The bbt is at the given page, else we must scan for the bbt */
+#define NAND_BBT_SEARCH 0x00000040
+/* bbt is stored per chip on multichip devices */
+#define NAND_BBT_PERCHIP 0x00000080
+/* bbt has a version counter at offset veroffs */
+#define NAND_BBT_VERSION 0x00000100
+/* Create a bbt if none axists */
+#define NAND_BBT_CREATE 0x00000200
+/* Search good / bad pattern through all pages of a block */
+#define NAND_BBT_SCANALLPAGES 0x00000400
+/* Scan block empty during good / bad block scan */
+#define NAND_BBT_SCANEMPTY 0x00000800
+/* Write bbt if neccecary */
+#define NAND_BBT_WRITE 0x00001000
+/* Read and write back block contents when writing bbt */
+#define NAND_BBT_SAVECONTENT 0x00002000
+ </programlisting>
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="structs">
+ <title>Structures</title>
+ <para>
+ This chapter contains the autogenerated documentation of the structures which are
+ used in the NAND driver and might be relevant for a driver developer. Each
+ struct member has a short description which is marked with an [XXX] identifier.
+ See the chapter "Documentation hints" for an explanation.
+ </para>
+!Iinclude/linux/mtd/nand.h
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+ <para>
+ This chapter contains the autogenerated documentation of the NAND kernel API functions
+ which are exported. Each function has a short description which is marked with an [XXX] identifier.
+ See the chapter "Documentation hints" for an explanation.
+ </para>
+!Edrivers/mtd/nand/nand_base.c
+!Edrivers/mtd/nand/nand_bbt.c
+!Edrivers/mtd/nand/nand_ecc.c
+ </chapter>
+
+ <chapter id="intfunctions">
+ <title>Internal Functions Provided</title>
+ <para>
+ This chapter contains the autogenerated documentation of the NAND driver internal functions.
+ Each function has a short description which is marked with an [XXX] identifier.
+ See the chapter "Documentation hints" for an explanation.
+ The functions marked with [DEFAULT] might be relevant for a board driver developer.
+ </para>
+!Idrivers/mtd/nand/nand_base.c
+!Idrivers/mtd/nand/nand_bbt.c
+<!-- No internal functions for kernel-doc:
+X!Idrivers/mtd/nand/nand_ecc.c
+-->
+ </chapter>
+
+ <chapter id="credits">
+ <title>Credits</title>
+ <para>
+ The following people have contributed to the NAND driver:
+ <orderedlist>
+ <listitem><para>Steven J. Hill<email>sjhill@realitydiluted.com</email></para></listitem>
+ <listitem><para>David Woodhouse<email>dwmw2@infradead.org</email></para></listitem>
+ <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
+ </orderedlist>
+ A lot of users have provided bugfixes, improvements and helping hands for testing.
+ Thanks a lot.
+ </para>
+ <para>
+ The following people have contributed to this document:
+ <orderedlist>
+ <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
+ </orderedlist>
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl
new file mode 100644
index 0000000..f24f9e8
--- /dev/null
+++ b/Documentation/DocBook/networking.tmpl
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LinuxNetworking">
+ <bookinfo>
+ <title>Linux Networking and Network Devices APIs</title>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="netcore">
+ <title>Linux Networking</title>
+ <sect1><title>Networking Base Types</title>
+!Iinclude/linux/net.h
+ </sect1>
+ <sect1><title>Socket Buffer Functions</title>
+!Iinclude/linux/skbuff.h
+!Iinclude/net/sock.h
+!Enet/socket.c
+!Enet/core/skbuff.c
+!Enet/core/sock.c
+!Enet/core/datagram.c
+!Enet/core/stream.c
+ </sect1>
+ <sect1><title>Socket Filter</title>
+!Enet/core/filter.c
+ </sect1>
+ <sect1><title>Generic Network Statistics</title>
+!Iinclude/linux/gen_stats.h
+!Enet/core/gen_stats.c
+!Enet/core/gen_estimator.c
+ </sect1>
+ <sect1><title>SUN RPC subsystem</title>
+<!-- The !D functionality is not perfect, garbage has to be protected by comments
+!Dnet/sunrpc/sunrpc_syms.c
+-->
+!Enet/sunrpc/xdr.c
+!Enet/sunrpc/svc_xprt.c
+!Enet/sunrpc/xprt.c
+!Enet/sunrpc/sched.c
+!Enet/sunrpc/socklib.c
+!Enet/sunrpc/stats.c
+!Enet/sunrpc/rpc_pipe.c
+!Enet/sunrpc/rpcb_clnt.c
+!Enet/sunrpc/clnt.c
+ </sect1>
+ </chapter>
+
+ <chapter id="netdev">
+ <title>Network device support</title>
+ <sect1><title>Driver Support</title>
+!Enet/core/dev.c
+!Enet/ethernet/eth.c
+!Enet/sched/sch_generic.c
+!Iinclude/linux/etherdevice.h
+!Iinclude/linux/netdevice.h
+ </sect1>
+ <sect1><title>PHY Support</title>
+!Edrivers/net/phy/phy.c
+!Idrivers/net/phy/phy.c
+!Edrivers/net/phy/phy_device.c
+!Idrivers/net/phy/phy_device.c
+!Edrivers/net/phy/mdio_bus.c
+!Idrivers/net/phy/mdio_bus.c
+ </sect1>
+<!-- FIXME: Removed for now since no structured comments in source
+ <sect1><title>Wireless</title>
+X!Enet/core/wireless.c
+ </sect1>
+-->
+ <sect1><title>Synchronous PPP</title>
+!Edrivers/net/wan/syncppp.c
+ </sect1>
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/procfs-guide.tmpl b/Documentation/DocBook/procfs-guide.tmpl
new file mode 100644
index 0000000..9eba4b7
--- /dev/null
+++ b/Documentation/DocBook/procfs-guide.tmpl
@@ -0,0 +1,626 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
+<!ENTITY procfsexample SYSTEM "procfs_example.xml">
+]>
+
+<book id="LKProcfsGuide">
+ <bookinfo>
+ <title>Linux Kernel Procfs Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Erik</firstname>
+ <othername>(J.A.K.)</othername>
+ <surname>Mouw</surname>
+ <affiliation>
+ <address>
+ <email>mouw@nl.linux.org</email>
+ </address>
+ </affiliation>
+ </author>
+ <othercredit>
+ <contrib>
+ This software and documentation were written while working on the
+ LART computing board
+ (<ulink url="http://www.lartmaker.nl/">http://www.lartmaker.nl/</ulink>),
+ which was sponsored by the Delt University of Technology projects
+ Mobile Multi-media Communications and Ubiquitous Communications.
+ </contrib>
+ </othercredit>
+ </authorgroup>
+
+ <revhistory>
+ <revision>
+ <revnumber>1.0</revnumber>
+ <date>May 30, 2001</date>
+ <revremark>Initial revision posted to linux-kernel</revremark>
+ </revision>
+ <revision>
+ <revnumber>1.1</revnumber>
+ <date>June 3, 2001</date>
+ <revremark>Revised after comments from linux-kernel</revremark>
+ </revision>
+ </revhistory>
+
+ <copyright>
+ <year>2001</year>
+ <holder>Erik Mouw</holder>
+ </copyright>
+
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute it
+ and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This documentation is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ PURPOSE. See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+
+
+
+ <toc>
+ </toc>
+
+
+
+
+ <preface id="Preface">
+ <title>Preface</title>
+
+ <para>
+ This guide describes the use of the procfs file system from
+ within the Linux kernel. The idea to write this guide came up on
+ the #kernelnewbies IRC channel (see <ulink
+ url="http://www.kernelnewbies.org/">http://www.kernelnewbies.org/</ulink>),
+ when Jeff Garzik explained the use of procfs and forwarded me a
+ message Alexander Viro wrote to the linux-kernel mailing list. I
+ agreed to write it up nicely, so here it is.
+ </para>
+
+ <para>
+ I'd like to thank Jeff Garzik
+ <email>jgarzik@pobox.com</email> and Alexander Viro
+ <email>viro@parcelfarce.linux.theplanet.co.uk</email> for their input,
+ Tim Waugh <email>twaugh@redhat.com</email> for his <ulink
+ url="http://people.redhat.com/twaugh/docbook/selfdocbook/">Selfdocbook</ulink>,
+ and Marc Joosen <email>marcj@historia.et.tudelft.nl</email> for
+ proofreading.
+ </para>
+
+ <para>
+ Erik
+ </para>
+ </preface>
+
+
+
+
+ <chapter id="intro">
+ <title>Introduction</title>
+
+ <para>
+ The <filename class="directory">/proc</filename> file system
+ (procfs) is a special file system in the linux kernel. It's a
+ virtual file system: it is not associated with a block device
+ but exists only in memory. The files in the procfs are there to
+ allow userland programs access to certain information from the
+ kernel (like process information in <filename
+ class="directory">/proc/[0-9]+/</filename>), but also for debug
+ purposes (like <filename>/proc/ksyms</filename>).
+ </para>
+
+ <para>
+ This guide describes the use of the procfs file system from
+ within the Linux kernel. It starts by introducing all relevant
+ functions to manage the files within the file system. After that
+ it shows how to communicate with userland, and some tips and
+ tricks will be pointed out. Finally a complete example will be
+ shown.
+ </para>
+
+ <para>
+ Note that the files in <filename
+ class="directory">/proc/sys</filename> are sysctl files: they
+ don't belong to procfs and are governed by a completely
+ different API described in the Kernel API book.
+ </para>
+ </chapter>
+
+
+
+
+ <chapter id="managing">
+ <title>Managing procfs entries</title>
+
+ <para>
+ This chapter describes the functions that various kernel
+ components use to populate the procfs with files, symlinks,
+ device nodes, and directories.
+ </para>
+
+ <para>
+ A minor note before we start: if you want to use any of the
+ procfs functions, be sure to include the correct header file!
+ This should be one of the first lines in your code:
+ </para>
+
+ <programlisting>
+#include &lt;linux/proc_fs.h&gt;
+ </programlisting>
+
+
+
+
+ <sect1 id="regularfile">
+ <title>Creating a regular file</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>struct proc_dir_entry* <function>create_proc_entry</function></funcdef>
+ <paramdef>const char* <parameter>name</parameter></paramdef>
+ <paramdef>mode_t <parameter>mode</parameter></paramdef>
+ <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ This function creates a regular file with the name
+ <parameter>name</parameter>, file mode
+ <parameter>mode</parameter> in the directory
+ <parameter>parent</parameter>. To create a file in the root of
+ the procfs, use <constant>NULL</constant> as
+ <parameter>parent</parameter> parameter. When successful, the
+ function will return a pointer to the freshly created
+ <structname>struct proc_dir_entry</structname>; otherwise it
+ will return <constant>NULL</constant>. <xref
+ linkend="userland"/> describes how to do something useful with
+ regular files.
+ </para>
+
+ <para>
+ Note that it is specifically supported that you can pass a
+ path that spans multiple directories. For example
+ <function>create_proc_entry</function>(<parameter>"drivers/via0/info"</parameter>)
+ will create the <filename class="directory">via0</filename>
+ directory if necessary, with standard
+ <constant>0755</constant> permissions.
+ </para>
+
+ <para>
+ If you only want to be able to read the file, the function
+ <function>create_proc_read_entry</function> described in <xref
+ linkend="convenience"/> may be used to create and initialise
+ the procfs entry in one single call.
+ </para>
+ </sect1>
+
+
+
+
+ <sect1 id="Creating_a_symlink">
+ <title>Creating a symlink</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>struct proc_dir_entry*
+ <function>proc_symlink</function></funcdef> <paramdef>const
+ char* <parameter>name</parameter></paramdef>
+ <paramdef>struct proc_dir_entry*
+ <parameter>parent</parameter></paramdef> <paramdef>const
+ char* <parameter>dest</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ This creates a symlink in the procfs directory
+ <parameter>parent</parameter> that points from
+ <parameter>name</parameter> to
+ <parameter>dest</parameter>. This translates in userland to
+ <literal>ln -s</literal> <parameter>dest</parameter>
+ <parameter>name</parameter>.
+ </para>
+ </sect1>
+
+ <sect1 id="Creating_a_directory">
+ <title>Creating a directory</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>struct proc_dir_entry* <function>proc_mkdir</function></funcdef>
+ <paramdef>const char* <parameter>name</parameter></paramdef>
+ <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ Create a directory <parameter>name</parameter> in the procfs
+ directory <parameter>parent</parameter>.
+ </para>
+ </sect1>
+
+
+
+
+ <sect1 id="Removing_an_entry">
+ <title>Removing an entry</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>void <function>remove_proc_entry</function></funcdef>
+ <paramdef>const char* <parameter>name</parameter></paramdef>
+ <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ Removes the entry <parameter>name</parameter> in the directory
+ <parameter>parent</parameter> from the procfs. Entries are
+ removed by their <emphasis>name</emphasis>, not by the
+ <structname>struct proc_dir_entry</structname> returned by the
+ various create functions. Note that this function doesn't
+ recursively remove entries.
+ </para>
+
+ <para>
+ Be sure to free the <structfield>data</structfield> entry from
+ the <structname>struct proc_dir_entry</structname> before
+ <function>remove_proc_entry</function> is called (that is: if
+ there was some <structfield>data</structfield> allocated, of
+ course). See <xref linkend="usingdata"/> for more information
+ on using the <structfield>data</structfield> entry.
+ </para>
+ </sect1>
+ </chapter>
+
+
+
+
+ <chapter id="userland">
+ <title>Communicating with userland</title>
+
+ <para>
+ Instead of reading (or writing) information directly from
+ kernel memory, procfs works with <emphasis>call back
+ functions</emphasis> for files: functions that are called when
+ a specific file is being read or written. Such functions have
+ to be initialised after the procfs file is created by setting
+ the <structfield>read_proc</structfield> and/or
+ <structfield>write_proc</structfield> fields in the
+ <structname>struct proc_dir_entry*</structname> that the
+ function <function>create_proc_entry</function> returned:
+ </para>
+
+ <programlisting>
+struct proc_dir_entry* entry;
+
+entry->read_proc = read_proc_foo;
+entry->write_proc = write_proc_foo;
+ </programlisting>
+
+ <para>
+ If you only want to use a the
+ <structfield>read_proc</structfield>, the function
+ <function>create_proc_read_entry</function> described in <xref
+ linkend="convenience"/> may be used to create and initialise the
+ procfs entry in one single call.
+ </para>
+
+
+
+ <sect1 id="Reading_data">
+ <title>Reading data</title>
+
+ <para>
+ The read function is a call back function that allows userland
+ processes to read data from the kernel. The read function
+ should have the following format:
+ </para>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>int <function>read_func</function></funcdef>
+ <paramdef>char* <parameter>buffer</parameter></paramdef>
+ <paramdef>char** <parameter>start</parameter></paramdef>
+ <paramdef>off_t <parameter>off</parameter></paramdef>
+ <paramdef>int <parameter>count</parameter></paramdef>
+ <paramdef>int* <parameter>peof</parameter></paramdef>
+ <paramdef>void* <parameter>data</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ The read function should write its information into the
+ <parameter>buffer</parameter>, which will be exactly
+ <literal>PAGE_SIZE</literal> bytes long.
+ </para>
+
+ <para>
+ The parameter
+ <parameter>peof</parameter> should be used to signal that the
+ end of the file has been reached by writing
+ <literal>1</literal> to the memory location
+ <parameter>peof</parameter> points to.
+ </para>
+
+ <para>
+ The <parameter>data</parameter>
+ parameter can be used to create a single call back function for
+ several files, see <xref linkend="usingdata"/>.
+ </para>
+
+ <para>
+ The rest of the parameters and the return value are described
+ by a comment in <filename>fs/proc/generic.c</filename> as follows:
+ </para>
+
+ <blockquote>
+ <para>
+ You have three ways to return data:
+ </para>
+ <orderedlist>
+ <listitem>
+ <para>
+ Leave <literal>*start = NULL</literal>. (This is the default.)
+ Put the data of the requested offset at that
+ offset within the buffer. Return the number (<literal>n</literal>)
+ of bytes there are from the beginning of the
+ buffer up to the last byte of data. If the
+ number of supplied bytes (<literal>= n - offset</literal>) is
+ greater than zero and you didn't signal eof
+ and the reader is prepared to take more data
+ you will be called again with the requested
+ offset advanced by the number of bytes
+ absorbed. This interface is useful for files
+ no larger than the buffer.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Set <literal>*start</literal> to an unsigned long value less than
+ the buffer address but greater than zero.
+ Put the data of the requested offset at the
+ beginning of the buffer. Return the number of
+ bytes of data placed there. If this number is
+ greater than zero and you didn't signal eof
+ and the reader is prepared to take more data
+ you will be called again with the requested
+ offset advanced by <literal>*start</literal>. This interface is
+ useful when you have a large file consisting
+ of a series of blocks which you want to count
+ and return as wholes.
+ (Hack by Paul.Russell@rustcorp.com.au)
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Set <literal>*start</literal> to an address within the buffer.
+ Put the data of the requested offset at <literal>*start</literal>.
+ Return the number of bytes of data placed there.
+ If this number is greater than zero and you
+ didn't signal eof and the reader is prepared to
+ take more data you will be called again with the
+ requested offset advanced by the number of bytes
+ absorbed.
+ </para>
+ </listitem>
+ </orderedlist>
+ </blockquote>
+
+ <para>
+ <xref linkend="example"/> shows how to use a read call back
+ function.
+ </para>
+ </sect1>
+
+
+
+
+ <sect1 id="Writing_data">
+ <title>Writing data</title>
+
+ <para>
+ The write call back function allows a userland process to write
+ data to the kernel, so it has some kind of control over the
+ kernel. The write function should have the following format:
+ </para>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>int <function>write_func</function></funcdef>
+ <paramdef>struct file* <parameter>file</parameter></paramdef>
+ <paramdef>const char* <parameter>buffer</parameter></paramdef>
+ <paramdef>unsigned long <parameter>count</parameter></paramdef>
+ <paramdef>void* <parameter>data</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ The write function should read <parameter>count</parameter>
+ bytes at maximum from the <parameter>buffer</parameter>. Note
+ that the <parameter>buffer</parameter> doesn't live in the
+ kernel's memory space, so it should first be copied to kernel
+ space with <function>copy_from_user</function>. The
+ <parameter>file</parameter> parameter is usually
+ ignored. <xref linkend="usingdata"/> shows how to use the
+ <parameter>data</parameter> parameter.
+ </para>
+
+ <para>
+ Again, <xref linkend="example"/> shows how to use this call back
+ function.
+ </para>
+ </sect1>
+
+
+
+
+ <sect1 id="usingdata">
+ <title>A single call back for many files</title>
+
+ <para>
+ When a large number of almost identical files is used, it's
+ quite inconvenient to use a separate call back function for
+ each file. A better approach is to have a single call back
+ function that distinguishes between the files by using the
+ <structfield>data</structfield> field in <structname>struct
+ proc_dir_entry</structname>. First of all, the
+ <structfield>data</structfield> field has to be initialised:
+ </para>
+
+ <programlisting>
+struct proc_dir_entry* entry;
+struct my_file_data *file_data;
+
+file_data = kmalloc(sizeof(struct my_file_data), GFP_KERNEL);
+entry->data = file_data;
+ </programlisting>
+
+ <para>
+ The <structfield>data</structfield> field is a <type>void
+ *</type>, so it can be initialised with anything.
+ </para>
+
+ <para>
+ Now that the <structfield>data</structfield> field is set, the
+ <function>read_proc</function> and
+ <function>write_proc</function> can use it to distinguish
+ between files because they get it passed into their
+ <parameter>data</parameter> parameter:
+ </para>
+
+ <programlisting>
+int foo_read_func(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len;
+
+ if(data == file_data) {
+ /* special case for this file */
+ } else {
+ /* normal processing */
+ }
+
+ return len;
+}
+ </programlisting>
+
+ <para>
+ Be sure to free the <structfield>data</structfield> data field
+ when removing the procfs entry.
+ </para>
+ </sect1>
+ </chapter>
+
+
+
+
+ <chapter id="tips">
+ <title>Tips and tricks</title>
+
+
+
+
+ <sect1 id="convenience">
+ <title>Convenience functions</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>struct proc_dir_entry* <function>create_proc_read_entry</function></funcdef>
+ <paramdef>const char* <parameter>name</parameter></paramdef>
+ <paramdef>mode_t <parameter>mode</parameter></paramdef>
+ <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
+ <paramdef>read_proc_t* <parameter>read_proc</parameter></paramdef>
+ <paramdef>void* <parameter>data</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ This function creates a regular file in exactly the same way
+ as <function>create_proc_entry</function> from <xref
+ linkend="regularfile"/> does, but also allows to set the read
+ function <parameter>read_proc</parameter> in one call. This
+ function can set the <parameter>data</parameter> as well, like
+ explained in <xref linkend="usingdata"/>.
+ </para>
+ </sect1>
+
+
+
+ <sect1 id="Modules">
+ <title>Modules</title>
+
+ <para>
+ If procfs is being used from within a module, be sure to set
+ the <structfield>owner</structfield> field in the
+ <structname>struct proc_dir_entry</structname> to
+ <constant>THIS_MODULE</constant>.
+ </para>
+
+ <programlisting>
+struct proc_dir_entry* entry;
+
+entry->owner = THIS_MODULE;
+ </programlisting>
+ </sect1>
+
+
+
+
+ <sect1 id="Mode_and_ownership">
+ <title>Mode and ownership</title>
+
+ <para>
+ Sometimes it is useful to change the mode and/or ownership of
+ a procfs entry. Here is an example that shows how to achieve
+ that:
+ </para>
+
+ <programlisting>
+struct proc_dir_entry* entry;
+
+entry->mode = S_IWUSR |S_IRUSR | S_IRGRP | S_IROTH;
+entry->uid = 0;
+entry->gid = 100;
+ </programlisting>
+
+ </sect1>
+ </chapter>
+
+
+
+
+ <chapter id="example">
+ <title>Example</title>
+
+ <!-- be careful with the example code: it shouldn't be wider than
+ approx. 60 columns, or otherwise it won't fit properly on a page
+ -->
+
+&procfsexample;
+
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c
new file mode 100644
index 0000000..8c6396e
--- /dev/null
+++ b/Documentation/DocBook/procfs_example.c
@@ -0,0 +1,210 @@
+/*
+ * procfs_example.c: an example proc interface
+ *
+ * Copyright (C) 2001, Erik Mouw (mouw@nl.linux.org)
+ *
+ * This file accompanies the procfs-guide in the Linux kernel
+ * source. Its main use is to demonstrate the concepts and
+ * functions described in the guide.
+ *
+ * This software has been developed while working on the LART
+ * computing board (http://www.lartmaker.nl), which was sponsored
+ * by the Delt University of Technology projects Mobile Multi-media
+ * Communications and Ubiquitous Communications.
+ *
+ * This program is free software; you can redistribute
+ * it and/or modify it under the terms of the GNU General
+ * Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place,
+ * Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/jiffies.h>
+#include <asm/uaccess.h>
+
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "procfs_example"
+
+#define FOOBAR_LEN 8
+
+struct fb_data_t {
+ char name[FOOBAR_LEN + 1];
+ char value[FOOBAR_LEN + 1];
+};
+
+
+static struct proc_dir_entry *example_dir, *foo_file,
+ *bar_file, *jiffies_file, *symlink;
+
+
+struct fb_data_t foo_data, bar_data;
+
+
+static int proc_read_jiffies(char *page, char **start,
+ off_t off, int count,
+ int *eof, void *data)
+{
+ int len;
+
+ len = sprintf(page, "jiffies = %ld\n",
+ jiffies);
+
+ return len;
+}
+
+
+static int proc_read_foobar(char *page, char **start,
+ off_t off, int count,
+ int *eof, void *data)
+{
+ int len;
+ struct fb_data_t *fb_data = (struct fb_data_t *)data;
+
+ /* DON'T DO THAT - buffer overruns are bad */
+ len = sprintf(page, "%s = '%s'\n",
+ fb_data->name, fb_data->value);
+
+ return len;
+}
+
+
+static int proc_write_foobar(struct file *file,
+ const char *buffer,
+ unsigned long count,
+ void *data)
+{
+ int len;
+ struct fb_data_t *fb_data = (struct fb_data_t *)data;
+
+ if(count > FOOBAR_LEN)
+ len = FOOBAR_LEN;
+ else
+ len = count;
+
+ if(copy_from_user(fb_data->value, buffer, len))
+ return -EFAULT;
+
+ fb_data->value[len] = '\0';
+
+ return len;
+}
+
+
+static int __init init_procfs_example(void)
+{
+ int rv = 0;
+
+ /* create directory */
+ example_dir = proc_mkdir(MODULE_NAME, NULL);
+ if(example_dir == NULL) {
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ example_dir->owner = THIS_MODULE;
+
+ /* create jiffies using convenience function */
+ jiffies_file = create_proc_read_entry("jiffies",
+ 0444, example_dir,
+ proc_read_jiffies,
+ NULL);
+ if(jiffies_file == NULL) {
+ rv = -ENOMEM;
+ goto no_jiffies;
+ }
+
+ jiffies_file->owner = THIS_MODULE;
+
+ /* create foo and bar files using same callback
+ * functions
+ */
+ foo_file = create_proc_entry("foo", 0644, example_dir);
+ if(foo_file == NULL) {
+ rv = -ENOMEM;
+ goto no_foo;
+ }
+
+ strcpy(foo_data.name, "foo");
+ strcpy(foo_data.value, "foo");
+ foo_file->data = &foo_data;
+ foo_file->read_proc = proc_read_foobar;
+ foo_file->write_proc = proc_write_foobar;
+ foo_file->owner = THIS_MODULE;
+
+ bar_file = create_proc_entry("bar", 0644, example_dir);
+ if(bar_file == NULL) {
+ rv = -ENOMEM;
+ goto no_bar;
+ }
+
+ strcpy(bar_data.name, "bar");
+ strcpy(bar_data.value, "bar");
+ bar_file->data = &bar_data;
+ bar_file->read_proc = proc_read_foobar;
+ bar_file->write_proc = proc_write_foobar;
+ bar_file->owner = THIS_MODULE;
+
+ /* create symlink */
+ symlink = proc_symlink("jiffies_too", example_dir,
+ "jiffies");
+ if(symlink == NULL) {
+ rv = -ENOMEM;
+ goto no_symlink;
+ }
+
+ symlink->owner = THIS_MODULE;
+
+ /* everything OK */
+ printk(KERN_INFO "%s %s initialised\n",
+ MODULE_NAME, MODULE_VERS);
+ return 0;
+
+no_symlink:
+ remove_proc_entry("bar", example_dir);
+no_bar:
+ remove_proc_entry("foo", example_dir);
+no_foo:
+ remove_proc_entry("jiffies", example_dir);
+no_jiffies:
+ remove_proc_entry(MODULE_NAME, NULL);
+out:
+ return rv;
+}
+
+
+static void __exit cleanup_procfs_example(void)
+{
+ remove_proc_entry("jiffies_too", example_dir);
+ remove_proc_entry("bar", example_dir);
+ remove_proc_entry("foo", example_dir);
+ remove_proc_entry("jiffies", example_dir);
+ remove_proc_entry(MODULE_NAME, NULL);
+
+ printk(KERN_INFO "%s %s removed\n",
+ MODULE_NAME, MODULE_VERS);
+}
+
+
+module_init(init_procfs_example);
+module_exit(cleanup_procfs_example);
+
+MODULE_AUTHOR("Erik Mouw");
+MODULE_DESCRIPTION("procfs examples");
+MODULE_LICENSE("GPL");
diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl
new file mode 100644
index 0000000..54eb26b
--- /dev/null
+++ b/Documentation/DocBook/rapidio.tmpl
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
+ <!ENTITY rapidio SYSTEM "rapidio.xml">
+ ]>
+
+<book id="RapidIO-Guide">
+ <bookinfo>
+ <title>RapidIO Subsystem Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Matt</firstname>
+ <surname>Porter</surname>
+ <affiliation>
+ <address>
+ <email>mporter@kernel.crashing.org</email>
+ <email>mporter@mvista.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2005</year>
+ <holder>MontaVista Software, Inc.</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ RapidIO is a high speed switched fabric interconnect with
+ features aimed at the embedded market. RapidIO provides
+ support for memory-mapped I/O as well as message-based
+ transactions over the switched fabric network. RapidIO has
+ a standardized discovery mechanism not unlike the PCI bus
+ standard that allows simple detection of devices in a
+ network.
+ </para>
+ <para>
+ This documentation is provided for developers intending
+ to support RapidIO on new architectures, write new drivers,
+ or to understand the subsystem internals.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs and Limitations</title>
+
+ <sect1 id="known_bugs">
+ <title>Bugs</title>
+ <para>None. ;)</para>
+ </sect1>
+ <sect1 id="Limitations">
+ <title>Limitations</title>
+ <para>
+ <orderedlist>
+ <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem>
+ <listitem><para>Multiple host enumeration is not supported</para></listitem>
+ </orderedlist>
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="drivers">
+ <title>RapidIO driver interface</title>
+ <para>
+ Drivers are provided a set of calls in order
+ to interface with the subsystem to gather info
+ on devices, request/map memory region resources,
+ and manage mailboxes/doorbells.
+ </para>
+ <sect1 id="Functions">
+ <title>Functions</title>
+!Iinclude/linux/rio_drv.h
+!Edrivers/rapidio/rio-driver.c
+!Edrivers/rapidio/rio.c
+ </sect1>
+ </chapter>
+
+ <chapter id="internals">
+ <title>Internals</title>
+
+ <para>
+ This chapter contains the autogenerated documentation of the RapidIO
+ subsystem.
+ </para>
+
+ <sect1 id="Structures"><title>Structures</title>
+!Iinclude/linux/rio.h
+ </sect1>
+ <sect1 id="Enumeration_and_Discovery"><title>Enumeration and Discovery</title>
+!Idrivers/rapidio/rio-scan.c
+ </sect1>
+ <sect1 id="Driver_functionality"><title>Driver functionality</title>
+!Idrivers/rapidio/rio.c
+!Idrivers/rapidio/rio-access.c
+ </sect1>
+ <sect1 id="Device_model_support"><title>Device model support</title>
+!Idrivers/rapidio/rio-driver.c
+ </sect1>
+ <sect1 id="Sysfs_support"><title>Sysfs support</title>
+!Idrivers/rapidio/rio-sysfs.c
+ </sect1>
+ <sect1 id="PPC32_support"><title>PPC32 support</title>
+!Earch/powerpc/sysdev/fsl_rio.c
+!Iarch/powerpc/sysdev/fsl_rio.c
+ </sect1>
+ </chapter>
+
+ <chapter id="credits">
+ <title>Credits</title>
+ <para>
+ The following people have contributed to the RapidIO
+ subsystem directly or indirectly:
+ <orderedlist>
+ <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
+ <listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem>
+ <listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem>
+ </orderedlist>
+ </para>
+ <para>
+ The following people have contributed to this document:
+ <orderedlist>
+ <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
+ </orderedlist>
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl
new file mode 100644
index 0000000..95bfc12
--- /dev/null
+++ b/Documentation/DocBook/s390-drivers.tmpl
@@ -0,0 +1,161 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="s390drivers">
+ <bookinfo>
+ <title>Writing s390 channel device drivers</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Cornelia</firstname>
+ <surname>Huck</surname>
+ <affiliation>
+ <address>
+ <email>cornelia.huck@de.ibm.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2007</year>
+ <holder>IBM Corp.</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ This document describes the interfaces available for device drivers that
+ drive s390 based channel attached I/O devices. This includes interfaces for
+ interaction with the hardware and interfaces for interacting with the
+ common driver core. Those interfaces are provided by the s390 common I/O
+ layer.
+ </para>
+ <para>
+ The document assumes a familarity with the technical terms associated
+ with the s390 channel I/O architecture. For a description of this
+ architecture, please refer to the "z/Architecture: Principles of
+ Operation", IBM publication no. SA22-7832.
+ </para>
+ <para>
+ While most I/O devices on a s390 system are typically driven through the
+ channel I/O mechanism described here, there are various other methods
+ (like the diag interface). These are out of the scope of this document.
+ </para>
+ <para>
+ Some additional information can also be found in the kernel source
+ under Documentation/s390/driver-model.txt.
+ </para>
+ </chapter>
+ <chapter id="ccw">
+ <title>The ccw bus</title>
+ <para>
+ The ccw bus typically contains the majority of devices available to
+ a s390 system. Named after the channel command word (ccw), the basic
+ command structure used to address its devices, the ccw bus contains
+ so-called channel attached devices. They are addressed via I/O
+ subchannels, visible on the css bus. A device driver for
+ channel-attached devices, however, will never interact with the
+ subchannel directly, but only via the I/O device on the ccw bus,
+ the ccw device.
+ </para>
+ <sect1 id="channelIO">
+ <title>I/O functions for channel-attached devices</title>
+ <para>
+ Some hardware structures have been translated into C structures for use
+ by the common I/O layer and device drivers. For more information on
+ the hardware structures represented here, please consult the Principles
+ of Operation.
+ </para>
+!Iarch/s390/include/asm/cio.h
+ </sect1>
+ <sect1 id="ccwdev">
+ <title>ccw devices</title>
+ <para>
+ Devices that want to initiate channel I/O need to attach to the ccw bus.
+ Interaction with the driver core is done via the common I/O layer, which
+ provides the abstractions of ccw devices and ccw device drivers.
+ </para>
+ <para>
+ The functions that initiate or terminate channel I/O all act upon a
+ ccw device structure. Device drivers must not bypass those functions
+ or strange side effects may happen.
+ </para>
+!Iarch/s390/include/asm/ccwdev.h
+!Edrivers/s390/cio/device.c
+!Edrivers/s390/cio/device_ops.c
+ </sect1>
+ <sect1 id="cmf">
+ <title>The channel-measurement facility</title>
+ <para>
+ The channel-measurement facility provides a means to collect
+ measurement data which is made available by the channel subsystem
+ for each channel attached device.
+ </para>
+!Iarch/s390/include/asm/cmb.h
+!Edrivers/s390/cio/cmf.c
+ </sect1>
+ </chapter>
+
+ <chapter id="ccwgroup">
+ <title>The ccwgroup bus</title>
+ <para>
+ The ccwgroup bus only contains artificial devices, created by the user.
+ Many networking devices (e.g. qeth) are in fact composed of several
+ ccw devices (like read, write and data channel for qeth). The
+ ccwgroup bus provides a mechanism to create a meta-device which
+ contains those ccw devices as slave devices and can be associated
+ with the netdevice.
+ </para>
+ <sect1 id="ccwgroupdevices">
+ <title>ccw group devices</title>
+!Iarch/s390/include/asm/ccwgroup.h
+!Edrivers/s390/cio/ccwgroup.c
+ </sect1>
+ </chapter>
+
+ <chapter id="genericinterfaces">
+ <title>Generic interfaces</title>
+ <para>
+ Some interfaces are available to other drivers that do not necessarily
+ have anything to do with the busses described above, but still are
+ indirectly using basic infrastructure in the common I/O layer.
+ One example is the support for adapter interrupts.
+ </para>
+!Edrivers/s390/cio/airq.c
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl
new file mode 100644
index 0000000..10a150a
--- /dev/null
+++ b/Documentation/DocBook/scsi.tmpl
@@ -0,0 +1,409 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="scsimid">
+ <bookinfo>
+ <title>SCSI Interfaces Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>James</firstname>
+ <surname>Bottomley</surname>
+ <affiliation>
+ <address>
+ <email>James.Bottomley@hansenpartnership.com</email>
+ </address>
+ </affiliation>
+ </author>
+
+ <author>
+ <firstname>Rob</firstname>
+ <surname>Landley</surname>
+ <affiliation>
+ <address>
+ <email>rob@landley.net</email>
+ </address>
+ </affiliation>
+ </author>
+
+ </authorgroup>
+
+ <copyright>
+ <year>2007</year>
+ <holder>Linux Foundation</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <sect1 id="protocol_vs_bus">
+ <title>Protocol vs bus</title>
+ <para>
+ Once upon a time, the Small Computer Systems Interface defined both
+ a parallel I/O bus and a data protocol to connect a wide variety of
+ peripherals (disk drives, tape drives, modems, printers, scanners,
+ optical drives, test equipment, and medical devices) to a host
+ computer.
+ </para>
+ <para>
+ Although the old parallel (fast/wide/ultra) SCSI bus has largely
+ fallen out of use, the SCSI command set is more widely used than ever
+ to communicate with devices over a number of different busses.
+ </para>
+ <para>
+ The <ulink url='http://www.t10.org/scsi-3.htm'>SCSI protocol</ulink>
+ is a big-endian peer-to-peer packet based protocol. SCSI commands
+ are 6, 10, 12, or 16 bytes long, often followed by an associated data
+ payload.
+ </para>
+ <para>
+ SCSI commands can be transported over just about any kind of bus, and
+ are the default protocol for storage devices attached to USB, SATA,
+ SAS, Fibre Channel, FireWire, and ATAPI devices. SCSI packets are
+ also commonly exchanged over Infiniband,
+ <ulink url='http://i2o.shadowconnect.com/faq.php'>I20</ulink>, TCP/IP
+ (<ulink url='http://en.wikipedia.org/wiki/ISCSI'>iSCSI</ulink>), even
+ <ulink url='http://cyberelk.net/tim/parport/parscsi.html'>Parallel
+ ports</ulink>.
+ </para>
+ </sect1>
+ <sect1 id="subsystem_design">
+ <title>Design of the Linux SCSI subsystem</title>
+ <para>
+ The SCSI subsystem uses a three layer design, with upper, mid, and low
+ layers. Every operation involving the SCSI subsystem (such as reading
+ a sector from a disk) uses one driver at each of the 3 levels: one
+ upper layer driver, one lower layer driver, and the SCSI midlayer.
+ </para>
+ <para>
+ The SCSI upper layer provides the interface between userspace and the
+ kernel, in the form of block and char device nodes for I/O and
+ ioctl(). The SCSI lower layer contains drivers for specific hardware
+ devices.
+ </para>
+ <para>
+ In between is the SCSI mid-layer, analogous to a network routing
+ layer such as the IPv4 stack. The SCSI mid-layer routes a packet
+ based data protocol between the upper layer's /dev nodes and the
+ corresponding devices in the lower layer. It manages command queues,
+ provides error handling and power management functions, and responds
+ to ioctl() requests.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="upper_layer">
+ <title>SCSI upper layer</title>
+ <para>
+ The upper layer supports the user-kernel interface by providing
+ device nodes.
+ </para>
+ <sect1 id="sd">
+ <title>sd (SCSI Disk)</title>
+ <para>sd (sd_mod.o)</para>
+<!-- !Idrivers/scsi/sd.c -->
+ </sect1>
+ <sect1 id="sr">
+ <title>sr (SCSI CD-ROM)</title>
+ <para>sr (sr_mod.o)</para>
+ </sect1>
+ <sect1 id="st">
+ <title>st (SCSI Tape)</title>
+ <para>st (st.o)</para>
+ </sect1>
+ <sect1 id="sg">
+ <title>sg (SCSI Generic)</title>
+ <para>sg (sg.o)</para>
+ </sect1>
+ <sect1 id="ch">
+ <title>ch (SCSI Media Changer)</title>
+ <para>ch (ch.c)</para>
+ </sect1>
+ </chapter>
+
+ <chapter id="mid_layer">
+ <title>SCSI mid layer</title>
+
+ <sect1 id="midlayer_implementation">
+ <title>SCSI midlayer implementation</title>
+ <sect2 id="scsi_device.h">
+ <title>include/scsi/scsi_device.h</title>
+ <para>
+ </para>
+!Iinclude/scsi/scsi_device.h
+ </sect2>
+
+ <sect2 id="scsi.c">
+ <title>drivers/scsi/scsi.c</title>
+ <para>Main file for the SCSI midlayer.</para>
+!Edrivers/scsi/scsi.c
+ </sect2>
+ <sect2 id="scsicam.c">
+ <title>drivers/scsi/scsicam.c</title>
+ <para>
+ <ulink url='http://www.t10.org/ftp/t10/drafts/cam/cam-r12b.pdf'>SCSI
+ Common Access Method</ulink> support functions, for use with
+ HDIO_GETGEO, etc.
+ </para>
+!Edrivers/scsi/scsicam.c
+ </sect2>
+ <sect2 id="scsi_error.c">
+ <title>drivers/scsi/scsi_error.c</title>
+ <para>Common SCSI error/timeout handling routines.</para>
+!Edrivers/scsi/scsi_error.c
+ </sect2>
+ <sect2 id="scsi_devinfo.c">
+ <title>drivers/scsi/scsi_devinfo.c</title>
+ <para>
+ Manage scsi_dev_info_list, which tracks blacklisted and whitelisted
+ devices.
+ </para>
+!Idrivers/scsi/scsi_devinfo.c
+ </sect2>
+ <sect2 id="scsi_ioctl.c">
+ <title>drivers/scsi/scsi_ioctl.c</title>
+ <para>
+ Handle ioctl() calls for SCSI devices.
+ </para>
+!Edrivers/scsi/scsi_ioctl.c
+ </sect2>
+ <sect2 id="scsi_lib.c">
+ <title>drivers/scsi/scsi_lib.c</title>
+ <para>
+ SCSI queuing library.
+ </para>
+!Edrivers/scsi/scsi_lib.c
+ </sect2>
+ <sect2 id="scsi_lib_dma.c">
+ <title>drivers/scsi/scsi_lib_dma.c</title>
+ <para>
+ SCSI library functions depending on DMA
+ (map and unmap scatter-gather lists).
+ </para>
+!Edrivers/scsi/scsi_lib_dma.c
+ </sect2>
+ <sect2 id="scsi_module.c">
+ <title>drivers/scsi/scsi_module.c</title>
+ <para>
+ The file drivers/scsi/scsi_module.c contains legacy support for
+ old-style host templates. It should never be used by any new driver.
+ </para>
+ </sect2>
+ <sect2 id="scsi_proc.c">
+ <title>drivers/scsi/scsi_proc.c</title>
+ <para>
+ The functions in this file provide an interface between
+ the PROC file system and the SCSI device drivers
+ It is mainly used for debugging, statistics and to pass
+ information directly to the lowlevel driver.
+
+ I.E. plumbing to manage /proc/scsi/*
+ </para>
+!Idrivers/scsi/scsi_proc.c
+ </sect2>
+ <sect2 id="scsi_netlink.c">
+ <title>drivers/scsi/scsi_netlink.c</title>
+ <para>
+ Infrastructure to provide async events from transports to userspace
+ via netlink, using a single NETLINK_SCSITRANSPORT protocol for all
+ transports.
+
+ See <ulink url='http://marc.info/?l=linux-scsi&amp;m=115507374832500&amp;w=2'>the
+ original patch submission</ulink> for more details.
+ </para>
+!Idrivers/scsi/scsi_netlink.c
+ </sect2>
+ <sect2 id="scsi_scan.c">
+ <title>drivers/scsi/scsi_scan.c</title>
+ <para>
+ Scan a host to determine which (if any) devices are attached.
+
+ The general scanning/probing algorithm is as follows, exceptions are
+ made to it depending on device specific flags, compilation options,
+ and global variable (boot or module load time) settings.
+
+ A specific LUN is scanned via an INQUIRY command; if the LUN has a
+ device attached, a scsi_device is allocated and setup for it.
+
+ For every id of every channel on the given host, start by scanning
+ LUN 0. Skip hosts that don't respond at all to a scan of LUN 0.
+ Otherwise, if LUN 0 has a device attached, allocate and setup a
+ scsi_device for it. If target is SCSI-3 or up, issue a REPORT LUN,
+ and scan all of the LUNs returned by the REPORT LUN; else,
+ sequentially scan LUNs up until some maximum is reached, or a LUN is
+ seen that cannot have a device attached to it.
+ </para>
+!Idrivers/scsi/scsi_scan.c
+ </sect2>
+ <sect2 id="scsi_sysctl.c">
+ <title>drivers/scsi/scsi_sysctl.c</title>
+ <para>
+ Set up the sysctl entry: "/dev/scsi/logging_level"
+ (DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level.
+ </para>
+ </sect2>
+ <sect2 id="scsi_sysfs.c">
+ <title>drivers/scsi/scsi_sysfs.c</title>
+ <para>
+ SCSI sysfs interface routines.
+ </para>
+!Edrivers/scsi/scsi_sysfs.c
+ </sect2>
+ <sect2 id="hosts.c">
+ <title>drivers/scsi/hosts.c</title>
+ <para>
+ mid to lowlevel SCSI driver interface
+ </para>
+!Edrivers/scsi/hosts.c
+ </sect2>
+ <sect2 id="constants.c">
+ <title>drivers/scsi/constants.c</title>
+ <para>
+ mid to lowlevel SCSI driver interface
+ </para>
+!Edrivers/scsi/constants.c
+ </sect2>
+ </sect1>
+
+ <sect1 id="Transport_classes">
+ <title>Transport classes</title>
+ <para>
+ Transport classes are service libraries for drivers in the SCSI
+ lower layer, which expose transport attributes in sysfs.
+ </para>
+ <sect2 id="Fibre_Channel_transport">
+ <title>Fibre Channel transport</title>
+ <para>
+ The file drivers/scsi/scsi_transport_fc.c defines transport attributes
+ for Fibre Channel.
+ </para>
+!Edrivers/scsi/scsi_transport_fc.c
+ </sect2>
+ <sect2 id="iSCSI_transport">
+ <title>iSCSI transport class</title>
+ <para>
+ The file drivers/scsi/scsi_transport_iscsi.c defines transport
+ attributes for the iSCSI class, which sends SCSI packets over TCP/IP
+ connections.
+ </para>
+!Edrivers/scsi/scsi_transport_iscsi.c
+ </sect2>
+ <sect2 id="SAS_transport">
+ <title>Serial Attached SCSI (SAS) transport class</title>
+ <para>
+ The file drivers/scsi/scsi_transport_sas.c defines transport
+ attributes for Serial Attached SCSI, a variant of SATA aimed at
+ large high-end systems.
+ </para>
+ <para>
+ The SAS transport class contains common code to deal with SAS HBAs,
+ an aproximated representation of SAS topologies in the driver model,
+ and various sysfs attributes to expose these topologies and managment
+ interfaces to userspace.
+ </para>
+ <para>
+ In addition to the basic SCSI core objects this transport class
+ introduces two additional intermediate objects: The SAS PHY
+ as represented by struct sas_phy defines an "outgoing" PHY on
+ a SAS HBA or Expander, and the SAS remote PHY represented by
+ struct sas_rphy defines an "incoming" PHY on a SAS Expander or
+ end device. Note that this is purely a software concept, the
+ underlying hardware for a PHY and a remote PHY is the exactly
+ the same.
+ </para>
+ <para>
+ There is no concept of a SAS port in this code, users can see
+ what PHYs form a wide port based on the port_identifier attribute,
+ which is the same for all PHYs in a port.
+ </para>
+!Edrivers/scsi/scsi_transport_sas.c
+ </sect2>
+ <sect2 id="SATA_transport">
+ <title>SATA transport class</title>
+ <para>
+ The SATA transport is handled by libata, which has its own book of
+ documentation in this directory.
+ </para>
+ </sect2>
+ <sect2 id="SPI_transport">
+ <title>Parallel SCSI (SPI) transport class</title>
+ <para>
+ The file drivers/scsi/scsi_transport_spi.c defines transport
+ attributes for traditional (fast/wide/ultra) SCSI busses.
+ </para>
+!Edrivers/scsi/scsi_transport_spi.c
+ </sect2>
+ <sect2 id="SRP_transport">
+ <title>SCSI RDMA (SRP) transport class</title>
+ <para>
+ The file drivers/scsi/scsi_transport_srp.c defines transport
+ attributes for SCSI over Remote Direct Memory Access.
+ </para>
+!Edrivers/scsi/scsi_transport_srp.c
+ </sect2>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="lower_layer">
+ <title>SCSI lower layer</title>
+ <sect1 id="hba_drivers">
+ <title>Host Bus Adapter transport types</title>
+ <para>
+ Many modern device controllers use the SCSI command set as a protocol to
+ communicate with their devices through many different types of physical
+ connections.
+ </para>
+ <para>
+ In SCSI language a bus capable of carrying SCSI commands is
+ called a "transport", and a controller connecting to such a bus is
+ called a "host bus adapter" (HBA).
+ </para>
+ <sect2 id="scsi_debug.c">
+ <title>Debug transport</title>
+ <para>
+ The file drivers/scsi/scsi_debug.c simulates a host adapter with a
+ variable number of disks (or disk like devices) attached, sharing a
+ common amount of RAM. Does a lot of checking to make sure that we are
+ not getting blocks mixed up, and panics the kernel if anything out of
+ the ordinary is seen.
+ </para>
+ <para>
+ To be more realistic, the simulated devices have the transport
+ attributes of SAS disks.
+ </para>
+ <para>
+ For documentation see
+ <ulink url='http://www.torque.net/sg/sdebug26.html'>http://www.torque.net/sg/sdebug26.html</ulink>
+ </para>
+<!-- !Edrivers/scsi/scsi_debug.c -->
+ </sect2>
+ <sect2 id="todo">
+ <title>todo</title>
+ <para>Parallel (fast/wide/ultra) SCSI, USB, SATA,
+ SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband,
+ I20, iSCSI, Parallel ports, netlink...
+ </para>
+ </sect2>
+ </sect1>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl
new file mode 100644
index 0000000..0c3dc4c
--- /dev/null
+++ b/Documentation/DocBook/sh.tmpl
@@ -0,0 +1,105 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="sh-drivers">
+ <bookinfo>
+ <title>SuperH Interfaces Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Paul</firstname>
+ <surname>Mundt</surname>
+ <affiliation>
+ <address>
+ <email>lethal@linux-sh.org</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2008</year>
+ <holder>Paul Mundt</holder>
+ </copyright>
+ <copyright>
+ <year>2008</year>
+ <holder>Renesas Technology Corp.</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="mm">
+ <title>Memory Management</title>
+ <sect1 id="sh4">
+ <title>SH-4</title>
+ <sect2 id="sq">
+ <title>Store Queue API</title>
+!Earch/sh/kernel/cpu/sh4/sq.c
+ </sect2>
+ </sect1>
+ <sect1 id="sh5">
+ <title>SH-5</title>
+ <sect2 id="tlb">
+ <title>TLB Interfaces</title>
+!Iarch/sh/mm/tlb-sh5.c
+!Iarch/sh/include/asm/tlb_64.h
+ </sect2>
+ </sect1>
+ </chapter>
+ <chapter id="clk">
+ <title>Clock Framework Extensions</title>
+!Iarch/sh/include/asm/clock.h
+ </chapter>
+ <chapter id="mach">
+ <title>Machine Specific Interfaces</title>
+ <sect1 id="dreamcast">
+ <title>mach-dreamcast</title>
+!Iarch/sh/boards/mach-dreamcast/rtc.c
+ </sect1>
+ <sect1 id="x3proto">
+ <title>mach-x3proto</title>
+!Earch/sh/boards/mach-x3proto/ilsel.c
+ </sect1>
+ </chapter>
+ <chapter id="busses">
+ <title>Busses</title>
+ <sect1 id="superhyway">
+ <title>SuperHyway</title>
+!Edrivers/sh/superhyway/superhyway.c
+ </sect1>
+
+ <sect1 id="maple">
+ <title>Maple</title>
+!Edrivers/sh/maple/maple.c
+ </sect1>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/stylesheet.xsl b/Documentation/DocBook/stylesheet.xsl
new file mode 100644
index 0000000..974e17c
--- /dev/null
+++ b/Documentation/DocBook/stylesheet.xsl
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<stylesheet xmlns="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<param name="chunk.quietly">1</param>
+<param name="funcsynopsis.style">ansi</param>
+<param name="funcsynopsis.tabular.threshold">80</param>
+<!-- <param name="paper.type">A4</param> -->
+<param name="generate.section.toc.level">2</param>
+</stylesheet>
diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl
new file mode 100644
index 0000000..df87d1b
--- /dev/null
+++ b/Documentation/DocBook/uio-howto.tmpl
@@ -0,0 +1,620 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
+"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd" []>
+
+<book id="index">
+<bookinfo>
+<title>The Userspace I/O HOWTO</title>
+
+<author>
+ <firstname>Hans-Jürgen</firstname>
+ <surname>Koch</surname>
+ <authorblurb><para>Linux developer, Linutronix</para></authorblurb>
+ <affiliation>
+ <orgname>
+ <ulink url="http://www.linutronix.de">Linutronix</ulink>
+ </orgname>
+
+ <address>
+ <email>hjk@linutronix.de</email>
+ </address>
+ </affiliation>
+</author>
+
+<copyright>
+ <year>2006-2008</year>
+ <holder>Hans-Jürgen Koch.</holder>
+</copyright>
+
+<legalnotice>
+<para>
+This documentation is Free Software licensed under the terms of the
+GPL version 2.
+</para>
+</legalnotice>
+
+<pubdate>2006-12-11</pubdate>
+
+<abstract>
+ <para>This HOWTO describes concept and usage of Linux kernel's
+ Userspace I/O system.</para>
+</abstract>
+
+<revhistory>
+ <revision>
+ <revnumber>0.5</revnumber>
+ <date>2008-05-22</date>
+ <authorinitials>hjk</authorinitials>
+ <revremark>Added description of write() function.</revremark>
+ </revision>
+ <revision>
+ <revnumber>0.4</revnumber>
+ <date>2007-11-26</date>
+ <authorinitials>hjk</authorinitials>
+ <revremark>Removed section about uio_dummy.</revremark>
+ </revision>
+ <revision>
+ <revnumber>0.3</revnumber>
+ <date>2007-04-29</date>
+ <authorinitials>hjk</authorinitials>
+ <revremark>Added section about userspace drivers.</revremark>
+ </revision>
+ <revision>
+ <revnumber>0.2</revnumber>
+ <date>2007-02-13</date>
+ <authorinitials>hjk</authorinitials>
+ <revremark>Update after multiple mappings were added.</revremark>
+ </revision>
+ <revision>
+ <revnumber>0.1</revnumber>
+ <date>2006-12-11</date>
+ <authorinitials>hjk</authorinitials>
+ <revremark>First draft.</revremark>
+ </revision>
+</revhistory>
+</bookinfo>
+
+<chapter id="aboutthisdoc">
+<?dbhtml filename="aboutthis.html"?>
+<title>About this document</title>
+
+<sect1 id="translations">
+<?dbhtml filename="translations.html"?>
+<title>Translations</title>
+
+<para>If you know of any translations for this document, or you are
+interested in translating it, please email me
+<email>hjk@linutronix.de</email>.
+</para>
+</sect1>
+
+<sect1 id="preface">
+<title>Preface</title>
+ <para>
+ For many types of devices, creating a Linux kernel driver is
+ overkill. All that is really needed is some way to handle an
+ interrupt and provide access to the memory space of the
+ device. The logic of controlling the device does not
+ necessarily have to be within the kernel, as the device does
+ not need to take advantage of any of other resources that the
+ kernel provides. One such common class of devices that are
+ like this are for industrial I/O cards.
+ </para>
+ <para>
+ To address this situation, the userspace I/O system (UIO) was
+ designed. For typical industrial I/O cards, only a very small
+ kernel module is needed. The main part of the driver will run in
+ user space. This simplifies development and reduces the risk of
+ serious bugs within a kernel module.
+ </para>
+ <para>
+ Please note that UIO is not an universal driver interface. Devices
+ that are already handled well by other kernel subsystems (like
+ networking or serial or USB) are no candidates for an UIO driver.
+ Hardware that is ideally suited for an UIO driver fulfills all of
+ the following:
+ </para>
+<itemizedlist>
+<listitem>
+ <para>The device has memory that can be mapped. The device can be
+ controlled completely by writing to this memory.</para>
+</listitem>
+<listitem>
+ <para>The device usually generates interrupts.</para>
+</listitem>
+<listitem>
+ <para>The device does not fit into one of the standard kernel
+ subsystems.</para>
+</listitem>
+</itemizedlist>
+</sect1>
+
+<sect1 id="thanks">
+<title>Acknowledgments</title>
+ <para>I'd like to thank Thomas Gleixner and Benedikt Spranger of
+ Linutronix, who have not only written most of the UIO code, but also
+ helped greatly writing this HOWTO by giving me all kinds of background
+ information.</para>
+</sect1>
+
+<sect1 id="feedback">
+<title>Feedback</title>
+ <para>Find something wrong with this document? (Or perhaps something
+ right?) I would love to hear from you. Please email me at
+ <email>hjk@linutronix.de</email>.</para>
+</sect1>
+</chapter>
+
+<chapter id="about">
+<?dbhtml filename="about.html"?>
+<title>About UIO</title>
+
+<para>If you use UIO for your card's driver, here's what you get:</para>
+
+<itemizedlist>
+<listitem>
+ <para>only one small kernel module to write and maintain.</para>
+</listitem>
+<listitem>
+ <para>develop the main part of your driver in user space,
+ with all the tools and libraries you're used to.</para>
+</listitem>
+<listitem>
+ <para>bugs in your driver won't crash the kernel.</para>
+</listitem>
+<listitem>
+ <para>updates of your driver can take place without recompiling
+ the kernel.</para>
+</listitem>
+</itemizedlist>
+
+<sect1 id="how_uio_works">
+<title>How UIO works</title>
+ <para>
+ Each UIO device is accessed through a device file and several
+ sysfs attribute files. The device file will be called
+ <filename>/dev/uio0</filename> for the first device, and
+ <filename>/dev/uio1</filename>, <filename>/dev/uio2</filename>
+ and so on for subsequent devices.
+ </para>
+
+ <para><filename>/dev/uioX</filename> is used to access the
+ address space of the card. Just use
+ <function>mmap()</function> to access registers or RAM
+ locations of your card.
+ </para>
+
+ <para>
+ Interrupts are handled by reading from
+ <filename>/dev/uioX</filename>. A blocking
+ <function>read()</function> from
+ <filename>/dev/uioX</filename> will return as soon as an
+ interrupt occurs. You can also use
+ <function>select()</function> on
+ <filename>/dev/uioX</filename> to wait for an interrupt. The
+ integer value read from <filename>/dev/uioX</filename>
+ represents the total interrupt count. You can use this number
+ to figure out if you missed some interrupts.
+ </para>
+ <para>
+ For some hardware that has more than one interrupt source internally,
+ but not separate IRQ mask and status registers, there might be
+ situations where userspace cannot determine what the interrupt source
+ was if the kernel handler disables them by writing to the chip's IRQ
+ register. In such a case, the kernel has to disable the IRQ completely
+ to leave the chip's register untouched. Now the userspace part can
+ determine the cause of the interrupt, but it cannot re-enable
+ interrupts. Another cornercase is chips where re-enabling interrupts
+ is a read-modify-write operation to a combined IRQ status/acknowledge
+ register. This would be racy if a new interrupt occurred
+ simultaneously.
+ </para>
+ <para>
+ To address these problems, UIO also implements a write() function. It
+ is normally not used and can be ignored for hardware that has only a
+ single interrupt source or has separate IRQ mask and status registers.
+ If you need it, however, a write to <filename>/dev/uioX</filename>
+ will call the <function>irqcontrol()</function> function implemented
+ by the driver. You have to write a 32-bit value that is usually either
+ 0 or 1 to disable or enable interrupts. If a driver does not implement
+ <function>irqcontrol()</function>, <function>write()</function> will
+ return with <varname>-ENOSYS</varname>.
+ </para>
+
+ <para>
+ To handle interrupts properly, your custom kernel module can
+ provide its own interrupt handler. It will automatically be
+ called by the built-in handler.
+ </para>
+
+ <para>
+ For cards that don't generate interrupts but need to be
+ polled, there is the possibility to set up a timer that
+ triggers the interrupt handler at configurable time intervals.
+ This interrupt simulation is done by calling
+ <function>uio_event_notify()</function>
+ from the timer's event handler.
+ </para>
+
+ <para>
+ Each driver provides attributes that are used to read or write
+ variables. These attributes are accessible through sysfs
+ files. A custom kernel driver module can add its own
+ attributes to the device owned by the uio driver, but not added
+ to the UIO device itself at this time. This might change in the
+ future if it would be found to be useful.
+ </para>
+
+ <para>
+ The following standard attributes are provided by the UIO
+ framework:
+ </para>
+<itemizedlist>
+<listitem>
+ <para>
+ <filename>name</filename>: The name of your device. It is
+ recommended to use the name of your kernel module for this.
+ </para>
+</listitem>
+<listitem>
+ <para>
+ <filename>version</filename>: A version string defined by your
+ driver. This allows the user space part of your driver to deal
+ with different versions of the kernel module.
+ </para>
+</listitem>
+<listitem>
+ <para>
+ <filename>event</filename>: The total number of interrupts
+ handled by the driver since the last time the device node was
+ read.
+ </para>
+</listitem>
+</itemizedlist>
+<para>
+ These attributes appear under the
+ <filename>/sys/class/uio/uioX</filename> directory. Please
+ note that this directory might be a symlink, and not a real
+ directory. Any userspace code that accesses it must be able
+ to handle this.
+</para>
+<para>
+ Each UIO device can make one or more memory regions available for
+ memory mapping. This is necessary because some industrial I/O cards
+ require access to more than one PCI memory region in a driver.
+</para>
+<para>
+ Each mapping has its own directory in sysfs, the first mapping
+ appears as <filename>/sys/class/uio/uioX/maps/map0/</filename>.
+ Subsequent mappings create directories <filename>map1/</filename>,
+ <filename>map2/</filename>, and so on. These directories will only
+ appear if the size of the mapping is not 0.
+</para>
+<para>
+ Each <filename>mapX/</filename> directory contains two read-only files
+ that show start address and size of the memory:
+</para>
+<itemizedlist>
+<listitem>
+ <para>
+ <filename>addr</filename>: The address of memory that can be mapped.
+ </para>
+</listitem>
+<listitem>
+ <para>
+ <filename>size</filename>: The size, in bytes, of the memory
+ pointed to by addr.
+ </para>
+</listitem>
+</itemizedlist>
+
+<para>
+ From userspace, the different mappings are distinguished by adjusting
+ the <varname>offset</varname> parameter of the
+ <function>mmap()</function> call. To map the memory of mapping N, you
+ have to use N times the page size as your offset:
+</para>
+<programlisting format="linespecific">
+offset = N * getpagesize();
+</programlisting>
+
+</sect1>
+</chapter>
+
+<chapter id="custom_kernel_module" xreflabel="Writing your own kernel module">
+<?dbhtml filename="custom_kernel_module.html"?>
+<title>Writing your own kernel module</title>
+ <para>
+ Please have a look at <filename>uio_cif.c</filename> as an
+ example. The following paragraphs explain the different
+ sections of this file.
+ </para>
+
+<sect1 id="uio_info">
+<title>struct uio_info</title>
+ <para>
+ This structure tells the framework the details of your driver,
+ Some of the members are required, others are optional.
+ </para>
+
+<itemizedlist>
+<listitem><para>
+<varname>char *name</varname>: Required. The name of your driver as
+it will appear in sysfs. I recommend using the name of your module for this.
+</para></listitem>
+
+<listitem><para>
+<varname>char *version</varname>: Required. This string appears in
+<filename>/sys/class/uio/uioX/version</filename>.
+</para></listitem>
+
+<listitem><para>
+<varname>struct uio_mem mem[ MAX_UIO_MAPS ]</varname>: Required if you
+have memory that can be mapped with <function>mmap()</function>. For each
+mapping you need to fill one of the <varname>uio_mem</varname> structures.
+See the description below for details.
+</para></listitem>
+
+<listitem><para>
+<varname>long irq</varname>: Required. If your hardware generates an
+interrupt, it's your modules task to determine the irq number during
+initialization. If you don't have a hardware generated interrupt but
+want to trigger the interrupt handler in some other way, set
+<varname>irq</varname> to <varname>UIO_IRQ_CUSTOM</varname>.
+If you had no interrupt at all, you could set
+<varname>irq</varname> to <varname>UIO_IRQ_NONE</varname>, though this
+rarely makes sense.
+</para></listitem>
+
+<listitem><para>
+<varname>unsigned long irq_flags</varname>: Required if you've set
+<varname>irq</varname> to a hardware interrupt number. The flags given
+here will be used in the call to <function>request_irq()</function>.
+</para></listitem>
+
+<listitem><para>
+<varname>int (*mmap)(struct uio_info *info, struct vm_area_struct
+*vma)</varname>: Optional. If you need a special
+<function>mmap()</function> function, you can set it here. If this
+pointer is not NULL, your <function>mmap()</function> will be called
+instead of the built-in one.
+</para></listitem>
+
+<listitem><para>
+<varname>int (*open)(struct uio_info *info, struct inode *inode)
+</varname>: Optional. You might want to have your own
+<function>open()</function>, e.g. to enable interrupts only when your
+device is actually used.
+</para></listitem>
+
+<listitem><para>
+<varname>int (*release)(struct uio_info *info, struct inode *inode)
+</varname>: Optional. If you define your own
+<function>open()</function>, you will probably also want a custom
+<function>release()</function> function.
+</para></listitem>
+
+<listitem><para>
+<varname>int (*irqcontrol)(struct uio_info *info, s32 irq_on)
+</varname>: Optional. If you need to be able to enable or disable
+interrupts from userspace by writing to <filename>/dev/uioX</filename>,
+you can implement this function. The parameter <varname>irq_on</varname>
+will be 0 to disable interrupts and 1 to enable them.
+</para></listitem>
+</itemizedlist>
+
+<para>
+Usually, your device will have one or more memory regions that can be mapped
+to user space. For each region, you have to set up a
+<varname>struct uio_mem</varname> in the <varname>mem[]</varname> array.
+Here's a description of the fields of <varname>struct uio_mem</varname>:
+</para>
+
+<itemizedlist>
+<listitem><para>
+<varname>int memtype</varname>: Required if the mapping is used. Set this to
+<varname>UIO_MEM_PHYS</varname> if you you have physical memory on your
+card to be mapped. Use <varname>UIO_MEM_LOGICAL</varname> for logical
+memory (e.g. allocated with <function>kmalloc()</function>). There's also
+<varname>UIO_MEM_VIRTUAL</varname> for virtual memory.
+</para></listitem>
+
+<listitem><para>
+<varname>unsigned long addr</varname>: Required if the mapping is used.
+Fill in the address of your memory block. This address is the one that
+appears in sysfs.
+</para></listitem>
+
+<listitem><para>
+<varname>unsigned long size</varname>: Fill in the size of the
+memory block that <varname>addr</varname> points to. If <varname>size</varname>
+is zero, the mapping is considered unused. Note that you
+<emphasis>must</emphasis> initialize <varname>size</varname> with zero for
+all unused mappings.
+</para></listitem>
+
+<listitem><para>
+<varname>void *internal_addr</varname>: If you have to access this memory
+region from within your kernel module, you will want to map it internally by
+using something like <function>ioremap()</function>. Addresses
+returned by this function cannot be mapped to user space, so you must not
+store it in <varname>addr</varname>. Use <varname>internal_addr</varname>
+instead to remember such an address.
+</para></listitem>
+</itemizedlist>
+
+<para>
+Please do not touch the <varname>kobj</varname> element of
+<varname>struct uio_mem</varname>! It is used by the UIO framework
+to set up sysfs files for this mapping. Simply leave it alone.
+</para>
+</sect1>
+
+<sect1 id="adding_irq_handler">
+<title>Adding an interrupt handler</title>
+ <para>
+ What you need to do in your interrupt handler depends on your
+ hardware and on how you want to handle it. You should try to
+ keep the amount of code in your kernel interrupt handler low.
+ If your hardware requires no action that you
+ <emphasis>have</emphasis> to perform after each interrupt,
+ then your handler can be empty.</para> <para>If, on the other
+ hand, your hardware <emphasis>needs</emphasis> some action to
+ be performed after each interrupt, then you
+ <emphasis>must</emphasis> do it in your kernel module. Note
+ that you cannot rely on the userspace part of your driver. Your
+ userspace program can terminate at any time, possibly leaving
+ your hardware in a state where proper interrupt handling is
+ still required.
+ </para>
+
+ <para>
+ There might also be applications where you want to read data
+ from your hardware at each interrupt and buffer it in a piece
+ of kernel memory you've allocated for that purpose. With this
+ technique you could avoid loss of data if your userspace
+ program misses an interrupt.
+ </para>
+
+ <para>
+ A note on shared interrupts: Your driver should support
+ interrupt sharing whenever this is possible. It is possible if
+ and only if your driver can detect whether your hardware has
+ triggered the interrupt or not. This is usually done by looking
+ at an interrupt status register. If your driver sees that the
+ IRQ bit is actually set, it will perform its actions, and the
+ handler returns IRQ_HANDLED. If the driver detects that it was
+ not your hardware that caused the interrupt, it will do nothing
+ and return IRQ_NONE, allowing the kernel to call the next
+ possible interrupt handler.
+ </para>
+
+ <para>
+ If you decide not to support shared interrupts, your card
+ won't work in computers with no free interrupts. As this
+ frequently happens on the PC platform, you can save yourself a
+ lot of trouble by supporting interrupt sharing.
+ </para>
+</sect1>
+
+</chapter>
+
+<chapter id="userspace_driver" xreflabel="Writing a driver in user space">
+<?dbhtml filename="userspace_driver.html"?>
+<title>Writing a driver in userspace</title>
+ <para>
+ Once you have a working kernel module for your hardware, you can
+ write the userspace part of your driver. You don't need any special
+ libraries, your driver can be written in any reasonable language,
+ you can use floating point numbers and so on. In short, you can
+ use all the tools and libraries you'd normally use for writing a
+ userspace application.
+ </para>
+
+<sect1 id="getting_uio_information">
+<title>Getting information about your UIO device</title>
+ <para>
+ Information about all UIO devices is available in sysfs. The
+ first thing you should do in your driver is check
+ <varname>name</varname> and <varname>version</varname> to
+ make sure your talking to the right device and that its kernel
+ driver has the version you expect.
+ </para>
+ <para>
+ You should also make sure that the memory mapping you need
+ exists and has the size you expect.
+ </para>
+ <para>
+ There is a tool called <varname>lsuio</varname> that lists
+ UIO devices and their attributes. It is available here:
+ </para>
+ <para>
+ <ulink url="http://www.osadl.org/projects/downloads/UIO/user/">
+ http://www.osadl.org/projects/downloads/UIO/user/</ulink>
+ </para>
+ <para>
+ With <varname>lsuio</varname> you can quickly check if your
+ kernel module is loaded and which attributes it exports.
+ Have a look at the manpage for details.
+ </para>
+ <para>
+ The source code of <varname>lsuio</varname> can serve as an
+ example for getting information about an UIO device.
+ The file <filename>uio_helper.c</filename> contains a lot of
+ functions you could use in your userspace driver code.
+ </para>
+</sect1>
+
+<sect1 id="mmap_device_memory">
+<title>mmap() device memory</title>
+ <para>
+ After you made sure you've got the right device with the
+ memory mappings you need, all you have to do is to call
+ <function>mmap()</function> to map the device's memory
+ to userspace.
+ </para>
+ <para>
+ The parameter <varname>offset</varname> of the
+ <function>mmap()</function> call has a special meaning
+ for UIO devices: It is used to select which mapping of
+ your device you want to map. To map the memory of
+ mapping N, you have to use N times the page size as
+ your offset:
+ </para>
+<programlisting format="linespecific">
+ offset = N * getpagesize();
+</programlisting>
+ <para>
+ N starts from zero, so if you've got only one memory
+ range to map, set <varname>offset = 0</varname>.
+ A drawback of this technique is that memory is always
+ mapped beginning with its start address.
+ </para>
+</sect1>
+
+<sect1 id="wait_for_interrupts">
+<title>Waiting for interrupts</title>
+ <para>
+ After you successfully mapped your devices memory, you
+ can access it like an ordinary array. Usually, you will
+ perform some initialization. After that, your hardware
+ starts working and will generate an interrupt as soon
+ as it's finished, has some data available, or needs your
+ attention because an error occured.
+ </para>
+ <para>
+ <filename>/dev/uioX</filename> is a read-only file. A
+ <function>read()</function> will always block until an
+ interrupt occurs. There is only one legal value for the
+ <varname>count</varname> parameter of
+ <function>read()</function>, and that is the size of a
+ signed 32 bit integer (4). Any other value for
+ <varname>count</varname> causes <function>read()</function>
+ to fail. The signed 32 bit integer read is the interrupt
+ count of your device. If the value is one more than the value
+ you read the last time, everything is OK. If the difference
+ is greater than one, you missed interrupts.
+ </para>
+ <para>
+ You can also use <function>select()</function> on
+ <filename>/dev/uioX</filename>.
+ </para>
+</sect1>
+
+</chapter>
+
+<appendix id="app1">
+<title>Further information</title>
+<itemizedlist>
+ <listitem><para>
+ <ulink url="http://www.osadl.org">
+ OSADL homepage.</ulink>
+ </para></listitem>
+ <listitem><para>
+ <ulink url="http://www.linutronix.de">
+ Linutronix homepage.</ulink>
+ </para></listitem>
+</itemizedlist>
+</appendix>
+
+</book>
diff --git a/Documentation/DocBook/usb.tmpl b/Documentation/DocBook/usb.tmpl
new file mode 100644
index 0000000..af29360
--- /dev/null
+++ b/Documentation/DocBook/usb.tmpl
@@ -0,0 +1,980 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Linux-USB-API">
+ <bookinfo>
+ <title>The Linux-USB Host Side API</title>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+<chapter id="intro">
+ <title>Introduction to USB on Linux</title>
+
+ <para>A Universal Serial Bus (USB) is used to connect a host,
+ such as a PC or workstation, to a number of peripheral
+ devices. USB uses a tree structure, with the host as the
+ root (the system's master), hubs as interior nodes, and
+ peripherals as leaves (and slaves).
+ Modern PCs support several such trees of USB devices, usually
+ one USB 2.0 tree (480 Mbit/sec each) with
+ a few USB 1.1 trees (12 Mbit/sec each) that are used when you
+ connect a USB 1.1 device directly to the machine's "root hub".
+ </para>
+
+ <para>That master/slave asymmetry was designed-in for a number of
+ reasons, one being ease of use. It is not physically possible to
+ assemble (legal) USB cables incorrectly: all upstream "to the host"
+ connectors are the rectangular type (matching the sockets on
+ root hubs), and all downstream connectors are the squarish type
+ (or they are built into the peripheral).
+ Also, the host software doesn't need to deal with distributed
+ auto-configuration since the pre-designated master node manages all that.
+ And finally, at the electrical level, bus protocol overhead is reduced by
+ eliminating arbitration and moving scheduling into the host software.
+ </para>
+
+ <para>USB 1.0 was announced in January 1996 and was revised
+ as USB 1.1 (with improvements in hub specification and
+ support for interrupt-out transfers) in September 1998.
+ USB 2.0 was released in April 2000, adding high-speed
+ transfers and transaction-translating hubs (used for USB 1.1
+ and 1.0 backward compatibility).
+ </para>
+
+ <para>Kernel developers added USB support to Linux early in the 2.2 kernel
+ series, shortly before 2.3 development forked. Updates from 2.3 were
+ regularly folded back into 2.2 releases, which improved reliability and
+ brought <filename>/sbin/hotplug</filename> support as well more drivers.
+ Such improvements were continued in the 2.5 kernel series, where they added
+ USB 2.0 support, improved performance, and made the host controller drivers
+ (HCDs) more consistent. They also simplified the API (to make bugs less
+ likely) and added internal "kerneldoc" documentation.
+ </para>
+
+ <para>Linux can run inside USB devices as well as on
+ the hosts that control the devices.
+ But USB device drivers running inside those peripherals
+ don't do the same things as the ones running inside hosts,
+ so they've been given a different name:
+ <emphasis>gadget drivers</emphasis>.
+ This document does not cover gadget drivers.
+ </para>
+
+ </chapter>
+
+<chapter id="host">
+ <title>USB Host-Side API Model</title>
+
+ <para>Host-side drivers for USB devices talk to the "usbcore" APIs.
+ There are two. One is intended for
+ <emphasis>general-purpose</emphasis> drivers (exposed through
+ driver frameworks), and the other is for drivers that are
+ <emphasis>part of the core</emphasis>.
+ Such core drivers include the <emphasis>hub</emphasis> driver
+ (which manages trees of USB devices) and several different kinds
+ of <emphasis>host controller drivers</emphasis>,
+ which control individual busses.
+ </para>
+
+ <para>The device model seen by USB drivers is relatively complex.
+ </para>
+
+ <itemizedlist>
+
+ <listitem><para>USB supports four kinds of data transfers
+ (control, bulk, interrupt, and isochronous). Two of them (control
+ and bulk) use bandwidth as it's available,
+ while the other two (interrupt and isochronous)
+ are scheduled to provide guaranteed bandwidth.
+ </para></listitem>
+
+ <listitem><para>The device description model includes one or more
+ "configurations" per device, only one of which is active at a time.
+ Devices that are capable of high-speed operation must also support
+ full-speed configurations, along with a way to ask about the
+ "other speed" configurations which might be used.
+ </para></listitem>
+
+ <listitem><para>Configurations have one or more "interfaces", each
+ of which may have "alternate settings". Interfaces may be
+ standardized by USB "Class" specifications, or may be specific to
+ a vendor or device.</para>
+
+ <para>USB device drivers actually bind to interfaces, not devices.
+ Think of them as "interface drivers", though you
+ may not see many devices where the distinction is important.
+ <emphasis>Most USB devices are simple, with only one configuration,
+ one interface, and one alternate setting.</emphasis>
+ </para></listitem>
+
+ <listitem><para>Interfaces have one or more "endpoints", each of
+ which supports one type and direction of data transfer such as
+ "bulk out" or "interrupt in". The entire configuration may have
+ up to sixteen endpoints in each direction, allocated as needed
+ among all the interfaces.
+ </para></listitem>
+
+ <listitem><para>Data transfer on USB is packetized; each endpoint
+ has a maximum packet size.
+ Drivers must often be aware of conventions such as flagging the end
+ of bulk transfers using "short" (including zero length) packets.
+ </para></listitem>
+
+ <listitem><para>The Linux USB API supports synchronous calls for
+ control and bulk messages.
+ It also supports asynchnous calls for all kinds of data transfer,
+ using request structures called "URBs" (USB Request Blocks).
+ </para></listitem>
+
+ </itemizedlist>
+
+ <para>Accordingly, the USB Core API exposed to device drivers
+ covers quite a lot of territory. You'll probably need to consult
+ the USB 2.0 specification, available online from www.usb.org at
+ no cost, as well as class or device specifications.
+ </para>
+
+ <para>The only host-side drivers that actually touch hardware
+ (reading/writing registers, handling IRQs, and so on) are the HCDs.
+ In theory, all HCDs provide the same functionality through the same
+ API. In practice, that's becoming more true on the 2.5 kernels,
+ but there are still differences that crop up especially with
+ fault handling. Different controllers don't necessarily report
+ the same aspects of failures, and recovery from faults (including
+ software-induced ones like unlinking an URB) isn't yet fully
+ consistent.
+ Device driver authors should make a point of doing disconnect
+ testing (while the device is active) with each different host
+ controller driver, to make sure drivers don't have bugs of
+ their own as well as to make sure they aren't relying on some
+ HCD-specific behavior.
+ (You will need external USB 1.1 and/or
+ USB 2.0 hubs to perform all those tests.)
+ </para>
+
+ </chapter>
+
+<chapter id="types"><title>USB-Standard Types</title>
+
+ <para>In <filename>&lt;linux/usb/ch9.h&gt;</filename> you will find
+ the USB data types defined in chapter 9 of the USB specification.
+ These data types are used throughout USB, and in APIs including
+ this host side API, gadget APIs, and usbfs.
+ </para>
+
+!Iinclude/linux/usb/ch9.h
+
+ </chapter>
+
+<chapter id="hostside"><title>Host-Side Data Types and Macros</title>
+
+ <para>The host side API exposes several layers to drivers, some of
+ which are more necessary than others.
+ These support lifecycle models for host side drivers
+ and devices, and support passing buffers through usbcore to
+ some HCD that performs the I/O for the device driver.
+ </para>
+
+
+!Iinclude/linux/usb.h
+
+ </chapter>
+
+ <chapter id="usbcore"><title>USB Core APIs</title>
+
+ <para>There are two basic I/O models in the USB API.
+ The most elemental one is asynchronous: drivers submit requests
+ in the form of an URB, and the URB's completion callback
+ handle the next step.
+ All USB transfer types support that model, although there
+ are special cases for control URBs (which always have setup
+ and status stages, but may not have a data stage) and
+ isochronous URBs (which allow large packets and include
+ per-packet fault reports).
+ Built on top of that is synchronous API support, where a
+ driver calls a routine that allocates one or more URBs,
+ submits them, and waits until they complete.
+ There are synchronous wrappers for single-buffer control
+ and bulk transfers (which are awkward to use in some
+ driver disconnect scenarios), and for scatterlist based
+ streaming i/o (bulk or interrupt).
+ </para>
+
+ <para>USB drivers need to provide buffers that can be
+ used for DMA, although they don't necessarily need to
+ provide the DMA mapping themselves.
+ There are APIs to use used when allocating DMA buffers,
+ which can prevent use of bounce buffers on some systems.
+ In some cases, drivers may be able to rely on 64bit DMA
+ to eliminate another kind of bounce buffer.
+ </para>
+
+!Edrivers/usb/core/urb.c
+!Edrivers/usb/core/message.c
+!Edrivers/usb/core/file.c
+!Edrivers/usb/core/driver.c
+!Edrivers/usb/core/usb.c
+!Edrivers/usb/core/hub.c
+ </chapter>
+
+ <chapter id="hcd"><title>Host Controller APIs</title>
+
+ <para>These APIs are only for use by host controller drivers,
+ most of which implement standard register interfaces such as
+ EHCI, OHCI, or UHCI.
+ UHCI was one of the first interfaces, designed by Intel and
+ also used by VIA; it doesn't do much in hardware.
+ OHCI was designed later, to have the hardware do more work
+ (bigger transfers, tracking protocol state, and so on).
+ EHCI was designed with USB 2.0; its design has features that
+ resemble OHCI (hardware does much more work) as well as
+ UHCI (some parts of ISO support, TD list processing).
+ </para>
+
+ <para>There are host controllers other than the "big three",
+ although most PCI based controllers (and a few non-PCI based
+ ones) use one of those interfaces.
+ Not all host controllers use DMA; some use PIO, and there
+ is also a simulator.
+ </para>
+
+ <para>The same basic APIs are available to drivers for all
+ those controllers.
+ For historical reasons they are in two layers:
+ <structname>struct usb_bus</structname> is a rather thin
+ layer that became available in the 2.2 kernels, while
+ <structname>struct usb_hcd</structname> is a more featureful
+ layer (available in later 2.4 kernels and in 2.5) that
+ lets HCDs share common code, to shrink driver size
+ and significantly reduce hcd-specific behaviors.
+ </para>
+
+!Edrivers/usb/core/hcd.c
+!Edrivers/usb/core/hcd-pci.c
+!Idrivers/usb/core/buffer.c
+ </chapter>
+
+ <chapter id="usbfs">
+ <title>The USB Filesystem (usbfs)</title>
+
+ <para>This chapter presents the Linux <emphasis>usbfs</emphasis>.
+ You may prefer to avoid writing new kernel code for your
+ USB driver; that's the problem that usbfs set out to solve.
+ User mode device drivers are usually packaged as applications
+ or libraries, and may use usbfs through some programming library
+ that wraps it. Such libraries include
+ <ulink url="http://libusb.sourceforge.net">libusb</ulink>
+ for C/C++, and
+ <ulink url="http://jUSB.sourceforge.net">jUSB</ulink> for Java.
+ </para>
+
+ <note><title>Unfinished</title>
+ <para>This particular documentation is incomplete,
+ especially with respect to the asynchronous mode.
+ As of kernel 2.5.66 the code and this (new) documentation
+ need to be cross-reviewed.
+ </para>
+ </note>
+
+ <para>Configure usbfs into Linux kernels by enabling the
+ <emphasis>USB filesystem</emphasis> option (CONFIG_USB_DEVICEFS),
+ and you get basic support for user mode USB device drivers.
+ Until relatively recently it was often (confusingly) called
+ <emphasis>usbdevfs</emphasis> although it wasn't solving what
+ <emphasis>devfs</emphasis> was.
+ Every USB device will appear in usbfs, regardless of whether or
+ not it has a kernel driver.
+ </para>
+
+ <sect1 id="usbfs-files">
+ <title>What files are in "usbfs"?</title>
+
+ <para>Conventionally mounted at
+ <filename>/proc/bus/usb</filename>, usbfs
+ features include:
+ <itemizedlist>
+ <listitem><para><filename>/proc/bus/usb/devices</filename>
+ ... a text file
+ showing each of the USB devices on known to the kernel,
+ and their configuration descriptors.
+ You can also poll() this to learn about new devices.
+ </para></listitem>
+ <listitem><para><filename>/proc/bus/usb/BBB/DDD</filename>
+ ... magic files
+ exposing the each device's configuration descriptors, and
+ supporting a series of ioctls for making device requests,
+ including I/O to devices. (Purely for access by programs.)
+ </para></listitem>
+ </itemizedlist>
+ </para>
+
+ <para> Each bus is given a number (BBB) based on when it was
+ enumerated; within each bus, each device is given a similar
+ number (DDD).
+ Those BBB/DDD paths are not "stable" identifiers;
+ expect them to change even if you always leave the devices
+ plugged in to the same hub port.
+ <emphasis>Don't even think of saving these in application
+ configuration files.</emphasis>
+ Stable identifiers are available, for user mode applications
+ that want to use them. HID and networking devices expose
+ these stable IDs, so that for example you can be sure that
+ you told the right UPS to power down its second server.
+ "usbfs" doesn't (yet) expose those IDs.
+ </para>
+
+ </sect1>
+
+ <sect1 id="usbfs-fstab">
+ <title>Mounting and Access Control</title>
+
+ <para>There are a number of mount options for usbfs, which will
+ be of most interest to you if you need to override the default
+ access control policy.
+ That policy is that only root may read or write device files
+ (<filename>/proc/bus/BBB/DDD</filename>) although anyone may read
+ the <filename>devices</filename>
+ or <filename>drivers</filename> files.
+ I/O requests to the device also need the CAP_SYS_RAWIO capability,
+ </para>
+
+ <para>The significance of that is that by default, all user mode
+ device drivers need super-user privileges.
+ You can change modes or ownership in a driver setup
+ when the device hotplugs, or maye just start the
+ driver right then, as a privileged server (or some activity
+ within one).
+ That's the most secure approach for multi-user systems,
+ but for single user systems ("trusted" by that user)
+ it's more convenient just to grant everyone all access
+ (using the <emphasis>devmode=0666</emphasis> option)
+ so the driver can start whenever it's needed.
+ </para>
+
+ <para>The mount options for usbfs, usable in /etc/fstab or
+ in command line invocations of <emphasis>mount</emphasis>, are:
+
+ <variablelist>
+ <varlistentry>
+ <term><emphasis>busgid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the GID used for the
+ /proc/bus/usb/BBB
+ directories. (Default: 0)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>busmode</emphasis>=MMM</term>
+ <listitem><para>Controls the file mode used for the
+ /proc/bus/usb/BBB
+ directories. (Default: 0555)
+ </para></listitem></varlistentry>
+ <varlistentry><term><emphasis>busuid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the UID used for the
+ /proc/bus/usb/BBB
+ directories. (Default: 0)</para></listitem></varlistentry>
+
+ <varlistentry><term><emphasis>devgid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the GID used for the
+ /proc/bus/usb/BBB/DDD
+ files. (Default: 0)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>devmode</emphasis>=MMM</term>
+ <listitem><para>Controls the file mode used for the
+ /proc/bus/usb/BBB/DDD
+ files. (Default: 0644)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>devuid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the UID used for the
+ /proc/bus/usb/BBB/DDD
+ files. (Default: 0)</para></listitem></varlistentry>
+
+ <varlistentry><term><emphasis>listgid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the GID used for the
+ /proc/bus/usb/devices and drivers files.
+ (Default: 0)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>listmode</emphasis>=MMM</term>
+ <listitem><para>Controls the file mode used for the
+ /proc/bus/usb/devices and drivers files.
+ (Default: 0444)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>listuid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the UID used for the
+ /proc/bus/usb/devices and drivers files.
+ (Default: 0)</para></listitem></varlistentry>
+ </variablelist>
+
+ </para>
+
+ <para>Note that many Linux distributions hard-wire the mount options
+ for usbfs in their init scripts, such as
+ <filename>/etc/rc.d/rc.sysinit</filename>,
+ rather than making it easy to set this per-system
+ policy in <filename>/etc/fstab</filename>.
+ </para>
+
+ </sect1>
+
+ <sect1 id="usbfs-devices">
+ <title>/proc/bus/usb/devices</title>
+
+ <para>This file is handy for status viewing tools in user
+ mode, which can scan the text format and ignore most of it.
+ More detailed device status (including class and vendor
+ status) is available from device-specific files.
+ For information about the current format of this file,
+ see the
+ <filename>Documentation/usb/proc_usb_info.txt</filename>
+ file in your Linux kernel sources.
+ </para>
+
+ <para>This file, in combination with the poll() system call, can
+ also be used to detect when devices are added or removed:
+<programlisting>int fd;
+struct pollfd pfd;
+
+fd = open("/proc/bus/usb/devices", O_RDONLY);
+pfd = { fd, POLLIN, 0 };
+for (;;) {
+ /* The first time through, this call will return immediately. */
+ poll(&amp;pfd, 1, -1);
+
+ /* To see what's changed, compare the file's previous and current
+ contents or scan the filesystem. (Scanning is more precise.) */
+}</programlisting>
+ Note that this behavior is intended to be used for informational
+ and debug purposes. It would be more appropriate to use programs
+ such as udev or HAL to initialize a device or start a user-mode
+ helper program, for instance.
+ </para>
+ </sect1>
+
+ <sect1 id="usbfs-bbbddd">
+ <title>/proc/bus/usb/BBB/DDD</title>
+
+ <para>Use these files in one of these basic ways:
+ </para>
+
+ <para><emphasis>They can be read,</emphasis>
+ producing first the device descriptor
+ (18 bytes) and then the descriptors for the current configuration.
+ See the USB 2.0 spec for details about those binary data formats.
+ You'll need to convert most multibyte values from little endian
+ format to your native host byte order, although a few of the
+ fields in the device descriptor (both of the BCD-encoded fields,
+ and the vendor and product IDs) will be byteswapped for you.
+ Note that configuration descriptors include descriptors for
+ interfaces, altsettings, endpoints, and maybe additional
+ class descriptors.
+ </para>
+
+ <para><emphasis>Perform USB operations</emphasis> using
+ <emphasis>ioctl()</emphasis> requests to make endpoint I/O
+ requests (synchronously or asynchronously) or manage
+ the device.
+ These requests need the CAP_SYS_RAWIO capability,
+ as well as filesystem access permissions.
+ Only one ioctl request can be made on one of these
+ device files at a time.
+ This means that if you are synchronously reading an endpoint
+ from one thread, you won't be able to write to a different
+ endpoint from another thread until the read completes.
+ This works for <emphasis>half duplex</emphasis> protocols,
+ but otherwise you'd use asynchronous i/o requests.
+ </para>
+
+ </sect1>
+
+
+ <sect1 id="usbfs-lifecycle">
+ <title>Life Cycle of User Mode Drivers</title>
+
+ <para>Such a driver first needs to find a device file
+ for a device it knows how to handle.
+ Maybe it was told about it because a
+ <filename>/sbin/hotplug</filename> event handling agent
+ chose that driver to handle the new device.
+ Or maybe it's an application that scans all the
+ /proc/bus/usb device files, and ignores most devices.
+ In either case, it should <function>read()</function> all
+ the descriptors from the device file,
+ and check them against what it knows how to handle.
+ It might just reject everything except a particular
+ vendor and product ID, or need a more complex policy.
+ </para>
+
+ <para>Never assume there will only be one such device
+ on the system at a time!
+ If your code can't handle more than one device at
+ a time, at least detect when there's more than one, and
+ have your users choose which device to use.
+ </para>
+
+ <para>Once your user mode driver knows what device to use,
+ it interacts with it in either of two styles.
+ The simple style is to make only control requests; some
+ devices don't need more complex interactions than those.
+ (An example might be software using vendor-specific control
+ requests for some initialization or configuration tasks,
+ with a kernel driver for the rest.)
+ </para>
+
+ <para>More likely, you need a more complex style driver:
+ one using non-control endpoints, reading or writing data
+ and claiming exclusive use of an interface.
+ <emphasis>Bulk</emphasis> transfers are easiest to use,
+ but only their sibling <emphasis>interrupt</emphasis> transfers
+ work with low speed devices.
+ Both interrupt and <emphasis>isochronous</emphasis> transfers
+ offer service guarantees because their bandwidth is reserved.
+ Such "periodic" transfers are awkward to use through usbfs,
+ unless you're using the asynchronous calls. However, interrupt
+ transfers can also be used in a synchronous "one shot" style.
+ </para>
+
+ <para>Your user-mode driver should never need to worry
+ about cleaning up request state when the device is
+ disconnected, although it should close its open file
+ descriptors as soon as it starts seeing the ENODEV
+ errors.
+ </para>
+
+ </sect1>
+
+ <sect1 id="usbfs-ioctl"><title>The ioctl() Requests</title>
+
+ <para>To use these ioctls, you need to include the following
+ headers in your userspace program:
+<programlisting>#include &lt;linux/usb.h&gt;
+#include &lt;linux/usbdevice_fs.h&gt;
+#include &lt;asm/byteorder.h&gt;</programlisting>
+ The standard USB device model requests, from "Chapter 9" of
+ the USB 2.0 specification, are automatically included from
+ the <filename>&lt;linux/usb/ch9.h&gt;</filename> header.
+ </para>
+
+ <para>Unless noted otherwise, the ioctl requests
+ described here will
+ update the modification time on the usbfs file to which
+ they are applied (unless they fail).
+ A return of zero indicates success; otherwise, a
+ standard USB error code is returned. (These are
+ documented in
+ <filename>Documentation/usb/error-codes.txt</filename>
+ in your kernel sources.)
+ </para>
+
+ <para>Each of these files multiplexes access to several
+ I/O streams, one per endpoint.
+ Each device has one control endpoint (endpoint zero)
+ which supports a limited RPC style RPC access.
+ Devices are configured
+ by khubd (in the kernel) setting a device-wide
+ <emphasis>configuration</emphasis> that affects things
+ like power consumption and basic functionality.
+ The endpoints are part of USB <emphasis>interfaces</emphasis>,
+ which may have <emphasis>altsettings</emphasis>
+ affecting things like which endpoints are available.
+ Many devices only have a single configuration and interface,
+ so drivers for them will ignore configurations and altsettings.
+ </para>
+
+
+ <sect2 id="usbfs-mgmt">
+ <title>Management/Status Requests</title>
+
+ <para>A number of usbfs requests don't deal very directly
+ with device I/O.
+ They mostly relate to device management and status.
+ These are all synchronous requests.
+ </para>
+
+ <variablelist>
+
+ <varlistentry><term>USBDEVFS_CLAIMINTERFACE</term>
+ <listitem><para>This is used to force usbfs to
+ claim a specific interface,
+ which has not previously been claimed by usbfs or any other
+ kernel driver.
+ The ioctl parameter is an integer holding the number of
+ the interface (bInterfaceNumber from descriptor).
+ </para><para>
+ Note that if your driver doesn't claim an interface
+ before trying to use one of its endpoints, and no
+ other driver has bound to it, then the interface is
+ automatically claimed by usbfs.
+ </para><para>
+ This claim will be released by a RELEASEINTERFACE ioctl,
+ or by closing the file descriptor.
+ File modification time is not updated by this request.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_CONNECTINFO</term>
+ <listitem><para>Says whether the device is lowspeed.
+ The ioctl parameter points to a structure like this:
+<programlisting>struct usbdevfs_connectinfo {
+ unsigned int devnum;
+ unsigned char slow;
+}; </programlisting>
+ File modification time is not updated by this request.
+ </para><para>
+ <emphasis>You can't tell whether a "not slow"
+ device is connected at high speed (480 MBit/sec)
+ or just full speed (12 MBit/sec).</emphasis>
+ You should know the devnum value already,
+ it's the DDD value of the device file name.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_GETDRIVER</term>
+ <listitem><para>Returns the name of the kernel driver
+ bound to a given interface (a string). Parameter
+ is a pointer to this structure, which is modified:
+<programlisting>struct usbdevfs_getdriver {
+ unsigned int interface;
+ char driver[USBDEVFS_MAXDRIVERNAME + 1];
+};</programlisting>
+ File modification time is not updated by this request.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_IOCTL</term>
+ <listitem><para>Passes a request from userspace through
+ to a kernel driver that has an ioctl entry in the
+ <emphasis>struct usb_driver</emphasis> it registered.
+<programlisting>struct usbdevfs_ioctl {
+ int ifno;
+ int ioctl_code;
+ void *data;
+};
+
+/* user mode call looks like this.
+ * 'request' becomes the driver->ioctl() 'code' parameter.
+ * the size of 'param' is encoded in 'request', and that data
+ * is copied to or from the driver->ioctl() 'buf' parameter.
+ */
+static int
+usbdev_ioctl (int fd, int ifno, unsigned request, void *param)
+{
+ struct usbdevfs_ioctl wrapper;
+
+ wrapper.ifno = ifno;
+ wrapper.ioctl_code = request;
+ wrapper.data = param;
+
+ return ioctl (fd, USBDEVFS_IOCTL, &amp;wrapper);
+} </programlisting>
+ File modification time is not updated by this request.
+ </para><para>
+ This request lets kernel drivers talk to user mode code
+ through filesystem operations even when they don't create
+ a charactor or block special device.
+ It's also been used to do things like ask devices what
+ device special file should be used.
+ Two pre-defined ioctls are used
+ to disconnect and reconnect kernel drivers, so
+ that user mode code can completely manage binding
+ and configuration of devices.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_RELEASEINTERFACE</term>
+ <listitem><para>This is used to release the claim usbfs
+ made on interface, either implicitly or because of a
+ USBDEVFS_CLAIMINTERFACE call, before the file
+ descriptor is closed.
+ The ioctl parameter is an integer holding the number of
+ the interface (bInterfaceNumber from descriptor);
+ File modification time is not updated by this request.
+ </para><warning><para>
+ <emphasis>No security check is made to ensure
+ that the task which made the claim is the one
+ which is releasing it.
+ This means that user mode driver may interfere
+ other ones. </emphasis>
+ </para></warning></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_RESETEP</term>
+ <listitem><para>Resets the data toggle value for an endpoint
+ (bulk or interrupt) to DATA0.
+ The ioctl parameter is an integer endpoint number
+ (1 to 15, as identified in the endpoint descriptor),
+ with USB_DIR_IN added if the device's endpoint sends
+ data to the host.
+ </para><warning><para>
+ <emphasis>Avoid using this request.
+ It should probably be removed.</emphasis>
+ Using it typically means the device and driver will lose
+ toggle synchronization. If you really lost synchronization,
+ you likely need to completely handshake with the device,
+ using a request like CLEAR_HALT
+ or SET_INTERFACE.
+ </para></warning></listitem></varlistentry>
+
+ </variablelist>
+
+ </sect2>
+
+ <sect2 id="usbfs-sync">
+ <title>Synchronous I/O Support</title>
+
+ <para>Synchronous requests involve the kernel blocking
+ until the user mode request completes, either by
+ finishing successfully or by reporting an error.
+ In most cases this is the simplest way to use usbfs,
+ although as noted above it does prevent performing I/O
+ to more than one endpoint at a time.
+ </para>
+
+ <variablelist>
+
+ <varlistentry><term>USBDEVFS_BULK</term>
+ <listitem><para>Issues a bulk read or write request to the
+ device.
+ The ioctl parameter is a pointer to this structure:
+<programlisting>struct usbdevfs_bulktransfer {
+ unsigned int ep;
+ unsigned int len;
+ unsigned int timeout; /* in milliseconds */
+ void *data;
+};</programlisting>
+ </para><para>The "ep" value identifies a
+ bulk endpoint number (1 to 15, as identified in an endpoint
+ descriptor),
+ masked with USB_DIR_IN when referring to an endpoint which
+ sends data to the host from the device.
+ The length of the data buffer is identified by "len";
+ Recent kernels support requests up to about 128KBytes.
+ <emphasis>FIXME say how read length is returned,
+ and how short reads are handled.</emphasis>.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_CLEAR_HALT</term>
+ <listitem><para>Clears endpoint halt (stall) and
+ resets the endpoint toggle. This is only
+ meaningful for bulk or interrupt endpoints.
+ The ioctl parameter is an integer endpoint number
+ (1 to 15, as identified in an endpoint descriptor),
+ masked with USB_DIR_IN when referring to an endpoint which
+ sends data to the host from the device.
+ </para><para>
+ Use this on bulk or interrupt endpoints which have
+ stalled, returning <emphasis>-EPIPE</emphasis> status
+ to a data transfer request.
+ Do not issue the control request directly, since
+ that could invalidate the host's record of the
+ data toggle.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_CONTROL</term>
+ <listitem><para>Issues a control request to the device.
+ The ioctl parameter points to a structure like this:
+<programlisting>struct usbdevfs_ctrltransfer {
+ __u8 bRequestType;
+ __u8 bRequest;
+ __u16 wValue;
+ __u16 wIndex;
+ __u16 wLength;
+ __u32 timeout; /* in milliseconds */
+ void *data;
+};</programlisting>
+ </para><para>
+ The first eight bytes of this structure are the contents
+ of the SETUP packet to be sent to the device; see the
+ USB 2.0 specification for details.
+ The bRequestType value is composed by combining a
+ USB_TYPE_* value, a USB_DIR_* value, and a
+ USB_RECIP_* value (from
+ <emphasis>&lt;linux/usb.h&gt;</emphasis>).
+ If wLength is nonzero, it describes the length of the data
+ buffer, which is either written to the device
+ (USB_DIR_OUT) or read from the device (USB_DIR_IN).
+ </para><para>
+ At this writing, you can't transfer more than 4 KBytes
+ of data to or from a device; usbfs has a limit, and
+ some host controller drivers have a limit.
+ (That's not usually a problem.)
+ <emphasis>Also</emphasis> there's no way to say it's
+ not OK to get a short read back from the device.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_RESET</term>
+ <listitem><para>Does a USB level device reset.
+ The ioctl parameter is ignored.
+ After the reset, this rebinds all device interfaces.
+ File modification time is not updated by this request.
+ </para><warning><para>
+ <emphasis>Avoid using this call</emphasis>
+ until some usbcore bugs get fixed,
+ since it does not fully synchronize device, interface,
+ and driver (not just usbfs) state.
+ </para></warning></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_SETINTERFACE</term>
+ <listitem><para>Sets the alternate setting for an
+ interface. The ioctl parameter is a pointer to a
+ structure like this:
+<programlisting>struct usbdevfs_setinterface {
+ unsigned int interface;
+ unsigned int altsetting;
+}; </programlisting>
+ File modification time is not updated by this request.
+ </para><para>
+ Those struct members are from some interface descriptor
+ applying to the current configuration.
+ The interface number is the bInterfaceNumber value, and
+ the altsetting number is the bAlternateSetting value.
+ (This resets each endpoint in the interface.)
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_SETCONFIGURATION</term>
+ <listitem><para>Issues the
+ <function>usb_set_configuration</function> call
+ for the device.
+ The parameter is an integer holding the number of
+ a configuration (bConfigurationValue from descriptor).
+ File modification time is not updated by this request.
+ </para><warning><para>
+ <emphasis>Avoid using this call</emphasis>
+ until some usbcore bugs get fixed,
+ since it does not fully synchronize device, interface,
+ and driver (not just usbfs) state.
+ </para></warning></listitem></varlistentry>
+
+ </variablelist>
+ </sect2>
+
+ <sect2 id="usbfs-async">
+ <title>Asynchronous I/O Support</title>
+
+ <para>As mentioned above, there are situations where it may be
+ important to initiate concurrent operations from user mode code.
+ This is particularly important for periodic transfers
+ (interrupt and isochronous), but it can be used for other
+ kinds of USB requests too.
+ In such cases, the asynchronous requests described here
+ are essential. Rather than submitting one request and having
+ the kernel block until it completes, the blocking is separate.
+ </para>
+
+ <para>These requests are packaged into a structure that
+ resembles the URB used by kernel device drivers.
+ (No POSIX Async I/O support here, sorry.)
+ It identifies the endpoint type (USBDEVFS_URB_TYPE_*),
+ endpoint (number, masked with USB_DIR_IN as appropriate),
+ buffer and length, and a user "context" value serving to
+ uniquely identify each request.
+ (It's usually a pointer to per-request data.)
+ Flags can modify requests (not as many as supported for
+ kernel drivers).
+ </para>
+
+ <para>Each request can specify a realtime signal number
+ (between SIGRTMIN and SIGRTMAX, inclusive) to request a
+ signal be sent when the request completes.
+ </para>
+
+ <para>When usbfs returns these urbs, the status value
+ is updated, and the buffer may have been modified.
+ Except for isochronous transfers, the actual_length is
+ updated to say how many bytes were transferred; if the
+ USBDEVFS_URB_DISABLE_SPD flag is set
+ ("short packets are not OK"), if fewer bytes were read
+ than were requested then you get an error report.
+ </para>
+
+<programlisting>struct usbdevfs_iso_packet_desc {
+ unsigned int length;
+ unsigned int actual_length;
+ unsigned int status;
+};
+
+struct usbdevfs_urb {
+ unsigned char type;
+ unsigned char endpoint;
+ int status;
+ unsigned int flags;
+ void *buffer;
+ int buffer_length;
+ int actual_length;
+ int start_frame;
+ int number_of_packets;
+ int error_count;
+ unsigned int signr;
+ void *usercontext;
+ struct usbdevfs_iso_packet_desc iso_frame_desc[];
+};</programlisting>
+
+ <para> For these asynchronous requests, the file modification
+ time reflects when the request was initiated.
+ This contrasts with their use with the synchronous requests,
+ where it reflects when requests complete.
+ </para>
+
+ <variablelist>
+
+ <varlistentry><term>USBDEVFS_DISCARDURB</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ File modification time is not updated by this request.
+ </para><para>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_DISCSIGNAL</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ File modification time is not updated by this request.
+ </para><para>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_REAPURB</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ File modification time is not updated by this request.
+ </para><para>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_REAPURBNDELAY</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ File modification time is not updated by this request.
+ </para><para>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_SUBMITURB</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ </para><para>
+ </para></listitem></varlistentry>
+
+ </variablelist>
+ </sect2>
+
+ </sect1>
+
+ </chapter>
+
+</book>
+<!-- vim:syntax=sgml:sw=4
+-->
diff --git a/Documentation/DocBook/wanbook.tmpl b/Documentation/DocBook/wanbook.tmpl
new file mode 100644
index 0000000..8c93db1
--- /dev/null
+++ b/Documentation/DocBook/wanbook.tmpl
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="WANGuide">
+ <bookinfo>
+ <title>Synchronous PPP and Cisco HDLC Programming Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@lxorguk.ukuu.org.uk</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2000</year>
+ <holder>Alan Cox</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The syncppp drivers in Linux provide a fairly complete
+ implementation of Cisco HDLC and a minimal implementation of
+ PPP. The longer term goal is to switch the PPP layer to the
+ generic PPP interface that is new in Linux 2.3.x. The API should
+ remain unchanged when this is done, but support will then be
+ available for IPX, compression and other PPP features
+ </para>
+ </chapter>
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ <variablelist>
+ <varlistentry><term>PPP is minimal</term>
+ <listitem>
+ <para>
+ The current PPP implementation is very basic, although sufficient
+ for most wan usages.
+ </para>
+ </listitem></varlistentry>
+
+ <varlistentry><term>Cisco HDLC Quirks</term>
+ <listitem>
+ <para>
+ Currently we do not end all packets with the correct Cisco multicast
+ or unicast flags. Nothing appears to mind too much but this should
+ be corrected.
+ </para>
+ </listitem></varlistentry>
+ </variablelist>
+
+ </para>
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Edrivers/net/wan/syncppp.c
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl
new file mode 100644
index 0000000..eeff19c
--- /dev/null
+++ b/Documentation/DocBook/writing_usb_driver.tmpl
@@ -0,0 +1,412 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="USBDeviceDriver">
+ <bookinfo>
+ <title>Writing USB Device Drivers</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Greg</firstname>
+ <surname>Kroah-Hartman</surname>
+ <affiliation>
+ <address>
+ <email>greg@kroah.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2001-2002</year>
+ <holder>Greg Kroah-Hartman</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+
+ <para>
+ This documentation is based on an article published in
+ Linux Journal Magazine, October 2001, Issue 90.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The Linux USB subsystem has grown from supporting only two different
+ types of devices in the 2.2.7 kernel (mice and keyboards), to over 20
+ different types of devices in the 2.4 kernel. Linux currently supports
+ almost all USB class devices (standard types of devices like keyboards,
+ mice, modems, printers and speakers) and an ever-growing number of
+ vendor-specific devices (such as USB to serial converters, digital
+ cameras, Ethernet devices and MP3 players). For a full list of the
+ different USB devices currently supported, see Resources.
+ </para>
+ <para>
+ The remaining kinds of USB devices that do not have support on Linux are
+ almost all vendor-specific devices. Each vendor decides to implement a
+ custom protocol to talk to their device, so a custom driver usually needs
+ to be created. Some vendors are open with their USB protocols and help
+ with the creation of Linux drivers, while others do not publish them, and
+ developers are forced to reverse-engineer. See Resources for some links
+ to handy reverse-engineering tools.
+ </para>
+ <para>
+ Because each different protocol causes a new driver to be created, I have
+ written a generic USB driver skeleton, modeled after the pci-skeleton.c
+ file in the kernel source tree upon which many PCI network drivers have
+ been based. This USB skeleton can be found at drivers/usb/usb-skeleton.c
+ in the kernel source tree. In this article I will walk through the basics
+ of the skeleton driver, explaining the different pieces and what needs to
+ be done to customize it to your specific device.
+ </para>
+ </chapter>
+
+ <chapter id="basics">
+ <title>Linux USB Basics</title>
+ <para>
+ If you are going to write a Linux USB driver, please become familiar with
+ the USB protocol specification. It can be found, along with many other
+ useful documents, at the USB home page (see Resources). An excellent
+ introduction to the Linux USB subsystem can be found at the USB Working
+ Devices List (see Resources). It explains how the Linux USB subsystem is
+ structured and introduces the reader to the concept of USB urbs
+ (USB Request Blocks), which are essential to USB drivers.
+ </para>
+ <para>
+ The first thing a Linux USB driver needs to do is register itself with
+ the Linux USB subsystem, giving it some information about which devices
+ the driver supports and which functions to call when a device supported
+ by the driver is inserted or removed from the system. All of this
+ information is passed to the USB subsystem in the usb_driver structure.
+ The skeleton driver declares a usb_driver as:
+ </para>
+ <programlisting>
+static struct usb_driver skel_driver = {
+ .name = "skeleton",
+ .probe = skel_probe,
+ .disconnect = skel_disconnect,
+ .fops = &amp;skel_fops,
+ .minor = USB_SKEL_MINOR_BASE,
+ .id_table = skel_table,
+};
+ </programlisting>
+ <para>
+ The variable name is a string that describes the driver. It is used in
+ informational messages printed to the system log. The probe and
+ disconnect function pointers are called when a device that matches the
+ information provided in the id_table variable is either seen or removed.
+ </para>
+ <para>
+ The fops and minor variables are optional. Most USB drivers hook into
+ another kernel subsystem, such as the SCSI, network or TTY subsystem.
+ These types of drivers register themselves with the other kernel
+ subsystem, and any user-space interactions are provided through that
+ interface. But for drivers that do not have a matching kernel subsystem,
+ such as MP3 players or scanners, a method of interacting with user space
+ is needed. The USB subsystem provides a way to register a minor device
+ number and a set of file_operations function pointers that enable this
+ user-space interaction. The skeleton driver needs this kind of interface,
+ so it provides a minor starting number and a pointer to its
+ file_operations functions.
+ </para>
+ <para>
+ The USB driver is then registered with a call to usb_register, usually in
+ the driver's init function, as shown here:
+ </para>
+ <programlisting>
+static int __init usb_skel_init(void)
+{
+ int result;
+
+ /* register this driver with the USB subsystem */
+ result = usb_register(&amp;skel_driver);
+ if (result &lt; 0) {
+ err(&quot;usb_register failed for the &quot;__FILE__ &quot;driver.&quot;
+ &quot;Error number %d&quot;, result);
+ return -1;
+ }
+
+ return 0;
+}
+module_init(usb_skel_init);
+ </programlisting>
+ <para>
+ When the driver is unloaded from the system, it needs to deregister
+ itself with the USB subsystem. This is done with the usb_deregister
+ function:
+ </para>
+ <programlisting>
+static void __exit usb_skel_exit(void)
+{
+ /* deregister this driver with the USB subsystem */
+ usb_deregister(&amp;skel_driver);
+}
+module_exit(usb_skel_exit);
+ </programlisting>
+ <para>
+ To enable the linux-hotplug system to load the driver automatically when
+ the device is plugged in, you need to create a MODULE_DEVICE_TABLE. The
+ following code tells the hotplug scripts that this module supports a
+ single device with a specific vendor and product ID:
+ </para>
+ <programlisting>
+/* table of devices that work with this driver */
+static struct usb_device_id skel_table [] = {
+ { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) },
+ { } /* Terminating entry */
+};
+MODULE_DEVICE_TABLE (usb, skel_table);
+ </programlisting>
+ <para>
+ There are other macros that can be used in describing a usb_device_id for
+ drivers that support a whole class of USB drivers. See usb.h for more
+ information on this.
+ </para>
+ </chapter>
+
+ <chapter id="device">
+ <title>Device operation</title>
+ <para>
+ When a device is plugged into the USB bus that matches the device ID
+ pattern that your driver registered with the USB core, the probe function
+ is called. The usb_device structure, interface number and the interface ID
+ are passed to the function:
+ </para>
+ <programlisting>
+static int skel_probe(struct usb_interface *interface,
+ const struct usb_device_id *id)
+ </programlisting>
+ <para>
+ The driver now needs to verify that this device is actually one that it
+ can accept. If so, it returns 0.
+ If not, or if any error occurs during initialization, an errorcode
+ (such as <literal>-ENOMEM</literal> or <literal>-ENODEV</literal>)
+ is returned from the probe function.
+ </para>
+ <para>
+ In the skeleton driver, we determine what end points are marked as bulk-in
+ and bulk-out. We create buffers to hold the data that will be sent and
+ received from the device, and a USB urb to write data to the device is
+ initialized.
+ </para>
+ <para>
+ Conversely, when the device is removed from the USB bus, the disconnect
+ function is called with the device pointer. The driver needs to clean any
+ private data that has been allocated at this time and to shut down any
+ pending urbs that are in the USB system.
+ </para>
+ <para>
+ Now that the device is plugged into the system and the driver is bound to
+ the device, any of the functions in the file_operations structure that
+ were passed to the USB subsystem will be called from a user program trying
+ to talk to the device. The first function called will be open, as the
+ program tries to open the device for I/O. We increment our private usage
+ count and save a pointer to our internal structure in the file
+ structure. This is done so that future calls to file operations will
+ enable the driver to determine which device the user is addressing. All
+ of this is done with the following code:
+ </para>
+ <programlisting>
+/* increment our usage count for the module */
+++skel->open_count;
+
+/* save our object in the file's private structure */
+file->private_data = dev;
+ </programlisting>
+ <para>
+ After the open function is called, the read and write functions are called
+ to receive and send data to the device. In the skel_write function, we
+ receive a pointer to some data that the user wants to send to the device
+ and the size of the data. The function determines how much data it can
+ send to the device based on the size of the write urb it has created (this
+ size depends on the size of the bulk out end point that the device has).
+ Then it copies the data from user space to kernel space, points the urb to
+ the data and submits the urb to the USB subsystem. This can be seen in
+ the following code:
+ </para>
+ <programlisting>
+/* we can only write as much as 1 urb will hold */
+bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count;
+
+/* copy the data from user space into our urb */
+copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written);
+
+/* set up our urb */
+usb_fill_bulk_urb(skel->write_urb,
+ skel->dev,
+ usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr),
+ skel->write_urb->transfer_buffer,
+ bytes_written,
+ skel_write_bulk_callback,
+ skel);
+
+/* send the data out the bulk port */
+result = usb_submit_urb(skel->write_urb);
+if (result) {
+ err(&quot;Failed submitting write urb, error %d&quot;, result);
+}
+ </programlisting>
+ <para>
+ When the write urb is filled up with the proper information using the
+ usb_fill_bulk_urb function, we point the urb's completion callback to call our
+ own skel_write_bulk_callback function. This function is called when the
+ urb is finished by the USB subsystem. The callback function is called in
+ interrupt context, so caution must be taken not to do very much processing
+ at that time. Our implementation of skel_write_bulk_callback merely
+ reports if the urb was completed successfully or not and then returns.
+ </para>
+ <para>
+ The read function works a bit differently from the write function in that
+ we do not use an urb to transfer data from the device to the driver.
+ Instead we call the usb_bulk_msg function, which can be used to send or
+ receive data from a device without having to create urbs and handle
+ urb completion callback functions. We call the usb_bulk_msg function,
+ giving it a buffer into which to place any data received from the device
+ and a timeout value. If the timeout period expires without receiving any
+ data from the device, the function will fail and return an error message.
+ This can be shown with the following code:
+ </para>
+ <programlisting>
+/* do an immediate bulk read to get data from the device */
+retval = usb_bulk_msg (skel->dev,
+ usb_rcvbulkpipe (skel->dev,
+ skel->bulk_in_endpointAddr),
+ skel->bulk_in_buffer,
+ skel->bulk_in_size,
+ &amp;count, HZ*10);
+/* if the read was successful, copy the data to user space */
+if (!retval) {
+ if (copy_to_user (buffer, skel->bulk_in_buffer, count))
+ retval = -EFAULT;
+ else
+ retval = count;
+}
+ </programlisting>
+ <para>
+ The usb_bulk_msg function can be very useful for doing single reads or
+ writes to a device; however, if you need to read or write constantly to a
+ device, it is recommended to set up your own urbs and submit them to the
+ USB subsystem.
+ </para>
+ <para>
+ When the user program releases the file handle that it has been using to
+ talk to the device, the release function in the driver is called. In this
+ function we decrement our private usage count and wait for possible
+ pending writes:
+ </para>
+ <programlisting>
+/* decrement our usage count for the device */
+--skel->open_count;
+ </programlisting>
+ <para>
+ One of the more difficult problems that USB drivers must be able to handle
+ smoothly is the fact that the USB device may be removed from the system at
+ any point in time, even if a program is currently talking to it. It needs
+ to be able to shut down any current reads and writes and notify the
+ user-space programs that the device is no longer there. The following
+ code (function <function>skel_delete</function>)
+ is an example of how to do this: </para>
+ <programlisting>
+static inline void skel_delete (struct usb_skel *dev)
+{
+ kfree (dev->bulk_in_buffer);
+ if (dev->bulk_out_buffer != NULL)
+ usb_buffer_free (dev->udev, dev->bulk_out_size,
+ dev->bulk_out_buffer,
+ dev->write_urb->transfer_dma);
+ usb_free_urb (dev->write_urb);
+ kfree (dev);
+}
+ </programlisting>
+ <para>
+ If a program currently has an open handle to the device, we reset the flag
+ <literal>device_present</literal>. For
+ every read, write, release and other functions that expect a device to be
+ present, the driver first checks this flag to see if the device is
+ still present. If not, it releases that the device has disappeared, and a
+ -ENODEV error is returned to the user-space program. When the release
+ function is eventually called, it determines if there is no device
+ and if not, it does the cleanup that the skel_disconnect
+ function normally does if there are no open files on the device (see
+ Listing 5).
+ </para>
+ </chapter>
+
+ <chapter id="iso">
+ <title>Isochronous Data</title>
+ <para>
+ This usb-skeleton driver does not have any examples of interrupt or
+ isochronous data being sent to or from the device. Interrupt data is sent
+ almost exactly as bulk data is, with a few minor exceptions. Isochronous
+ data works differently with continuous streams of data being sent to or
+ from the device. The audio and video camera drivers are very good examples
+ of drivers that handle isochronous data and will be useful if you also
+ need to do this.
+ </para>
+ </chapter>
+
+ <chapter id="Conclusion">
+ <title>Conclusion</title>
+ <para>
+ Writing Linux USB device drivers is not a difficult task as the
+ usb-skeleton driver shows. This driver, combined with the other current
+ USB drivers, should provide enough examples to help a beginning author
+ create a working driver in a minimal amount of time. The linux-usb-devel
+ mailing list archives also contain a lot of helpful information.
+ </para>
+ </chapter>
+
+ <chapter id="resources">
+ <title>Resources</title>
+ <para>
+ The Linux USB Project: <ulink url="http://www.linux-usb.org">http://www.linux-usb.org/</ulink>
+ </para>
+ <para>
+ Linux Hotplug Project: <ulink url="http://linux-hotplug.sourceforge.net">http://linux-hotplug.sourceforge.net/</ulink>
+ </para>
+ <para>
+ Linux USB Working Devices List: <ulink url="http://www.qbik.ch/usb/devices">http://www.qbik.ch/usb/devices/</ulink>
+ </para>
+ <para>
+ linux-usb-devel Mailing List Archives: <ulink url="http://marc.theaimsgroup.com/?l=linux-usb-devel">http://marc.theaimsgroup.com/?l=linux-usb-devel</ulink>
+ </para>
+ <para>
+ Programming Guide for Linux USB Device Drivers: <ulink url="http://usb.cs.tum.edu/usbdoc">http://usb.cs.tum.edu/usbdoc</ulink>
+ </para>
+ <para>
+ USB Home Page: <ulink url="http://www.usb.org">http://www.usb.org</ulink>
+ </para>
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl
new file mode 100644
index 0000000..6f3883b
--- /dev/null
+++ b/Documentation/DocBook/z8530book.tmpl
@@ -0,0 +1,371 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Z85230Guide">
+ <bookinfo>
+ <title>Z8530 Programming Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@lxorguk.ukuu.org.uk</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2000</year>
+ <holder>Alan Cox</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The Z85x30 family synchronous/asynchronous controller chips are
+ used on a large number of cheap network interface cards. The
+ kernel provides a core interface layer that is designed to make
+ it easy to provide WAN services using this chip.
+ </para>
+ <para>
+ The current driver only support synchronous operation. Merging the
+ asynchronous driver support into this code to allow any Z85x30
+ device to be used as both a tty interface and as a synchronous
+ controller is a project for Linux post the 2.4 release
+ </para>
+ </chapter>
+
+ <chapter id="Driver_Modes">
+ <title>Driver Modes</title>
+ <para>
+ The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices
+ in three different modes. Each mode can be applied to an individual
+ channel on the chip (each chip has two channels).
+ </para>
+ <para>
+ The PIO synchronous mode supports the most common Z8530 wiring. Here
+ the chip is interface to the I/O and interrupt facilities of the
+ host machine but not to the DMA subsystem. When running PIO the
+ Z8530 has extremely tight timing requirements. Doing high speeds,
+ even with a Z85230 will be tricky. Typically you should expect to
+ achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230.
+ </para>
+ <para>
+ The DMA mode supports the chip when it is configured to use dual DMA
+ channels on an ISA bus. The better cards tend to support this mode
+ of operation for a single channel. With DMA running the Z85230 tops
+ out when it starts to hit ISA DMA constraints at about 512Kbits. It
+ is worth noting here that many PC machines hang or crash when the
+ chip is driven fast enough to hold the ISA bus solid.
+ </para>
+ <para>
+ Transmit DMA mode uses a single DMA channel. The DMA channel is used
+ for transmission as the transmit FIFO is smaller than the receive
+ FIFO. it gives better performance than pure PIO mode but is nowhere
+ near as ideal as pure DMA mode.
+ </para>
+ </chapter>
+
+ <chapter id="Using_the_Z85230_driver">
+ <title>Using the Z85230 driver</title>
+ <para>
+ The Z85230 driver provides the back end interface to your board. To
+ configure a Z8530 interface you need to detect the board and to
+ identify its ports and interrupt resources. It is also your problem
+ to verify the resources are available.
+ </para>
+ <para>
+ Having identified the chip you need to fill in a struct z8530_dev,
+ which describes each chip. This object must exist until you finally
+ shutdown the board. Firstly zero the active field. This ensures
+ nothing goes off without you intending it. The irq field should
+ be set to the interrupt number of the chip. (Each chip has a single
+ interrupt source rather than each channel). You are responsible
+ for allocating the interrupt line. The interrupt handler should be
+ set to <function>z8530_interrupt</function>. The device id should
+ be set to the z8530_dev structure pointer. Whether the interrupt can
+ be shared or not is board dependent, and up to you to initialise.
+ </para>
+ <para>
+ The structure holds two channel structures.
+ Initialise chanA.ctrlio and chanA.dataio with the address of the
+ control and data ports. You can or this with Z8530_PORT_SLEEP to
+ indicate your interface needs the 5uS delay for chip settling done
+ in software. The PORT_SLEEP option is architecture specific. Other
+ flags may become available on future platforms, eg for MMIO.
+ Initialise the chanA.irqs to &amp;z8530_nop to start the chip up
+ as disabled and discarding interrupt events. This ensures that
+ stray interrupts will be mopped up and not hang the bus. Set
+ chanA.dev to point to the device structure itself. The
+ private and name field you may use as you wish. The private field
+ is unused by the Z85230 layer. The name is used for error reporting
+ and it may thus make sense to make it match the network name.
+ </para>
+ <para>
+ Repeat the same operation with the B channel if your chip has
+ both channels wired to something useful. This isn't always the
+ case. If it is not wired then the I/O values do not matter, but
+ you must initialise chanB.dev.
+ </para>
+ <para>
+ If your board has DMA facilities then initialise the txdma and
+ rxdma fields for the relevant channels. You must also allocate the
+ ISA DMA channels and do any necessary board level initialisation
+ to configure them. The low level driver will do the Z8530 and
+ DMA controller programming but not board specific magic.
+ </para>
+ <para>
+ Having initialised the device you can then call
+ <function>z8530_init</function>. This will probe the chip and
+ reset it into a known state. An identification sequence is then
+ run to identify the chip type. If the checks fail to pass the
+ function returns a non zero error code. Typically this indicates
+ that the port given is not valid. After this call the
+ type field of the z8530_dev structure is initialised to either
+ Z8530, Z85C30 or Z85230 according to the chip found.
+ </para>
+ <para>
+ Once you have called z8530_init you can also make use of the utility
+ function <function>z8530_describe</function>. This provides a
+ consistent reporting format for the Z8530 devices, and allows all
+ the drivers to provide consistent reporting.
+ </para>
+ </chapter>
+
+ <chapter id="Attaching_Network_Interfaces">
+ <title>Attaching Network Interfaces</title>
+ <para>
+ If you wish to use the network interface facilities of the driver,
+ then you need to attach a network device to each channel that is
+ present and in use. In addition to use the generic HDLC
+ you need to follow some additional plumbing rules. They may seem
+ complex but a look at the example hostess_sv11 driver should
+ reassure you.
+ </para>
+ <para>
+ The network device used for each channel should be pointed to by
+ the netdevice field of each channel. The hdlc-&gt; priv field of the
+ network device points to your private data - you will need to be
+ able to find your private data from this.
+ </para>
+ <para>
+ The way most drivers approach this particular problem is to
+ create a structure holding the Z8530 device definition and
+ put that into the private field of the network device. The
+ network device fields of the channels then point back to the
+ network devices.
+ </para>
+ <para>
+ If you wish to use the generic HDLC then you need to register
+ the HDLC device.
+ </para>
+ <para>
+ Before you register your network device you will also need to
+ provide suitable handlers for most of the network device callbacks.
+ See the network device documentation for more details on this.
+ </para>
+ </chapter>
+
+ <chapter id="Configuring_And_Activating_The_Port">
+ <title>Configuring And Activating The Port</title>
+ <para>
+ The Z85230 driver provides helper functions and tables to load the
+ port registers on the Z8530 chips. When programming the register
+ settings for a channel be aware that the documentation recommends
+ initialisation orders. Strange things happen when these are not
+ followed.
+ </para>
+ <para>
+ <function>z8530_channel_load</function> takes an array of
+ pairs of initialisation values in an array of u8 type. The first
+ value is the Z8530 register number. Add 16 to indicate the alternate
+ register bank on the later chips. The array is terminated by a 255.
+ </para>
+ <para>
+ The driver provides a pair of public tables. The
+ z8530_hdlc_kilostream table is for the UK 'Kilostream' service and
+ also happens to cover most other end host configurations. The
+ z8530_hdlc_kilostream_85230 table is the same configuration using
+ the enhancements of the 85230 chip. The configuration loaded is
+ standard NRZ encoded synchronous data with HDLC bitstuffing. All
+ of the timing is taken from the other end of the link.
+ </para>
+ <para>
+ When writing your own tables be aware that the driver internally
+ tracks register values. It may need to reload values. You should
+ therefore be sure to set registers 1-7, 9-11, 14 and 15 in all
+ configurations. Where the register settings depend on DMA selection
+ the driver will update the bits itself when you open or close.
+ Loading a new table with the interface open is not recommended.
+ </para>
+ <para>
+ There are three standard configurations supported by the core
+ code. In PIO mode the interface is programmed up to use
+ interrupt driven PIO. This places high demands on the host processor
+ to avoid latency. The driver is written to take account of latency
+ issues but it cannot avoid latencies caused by other drivers,
+ notably IDE in PIO mode. Because the drivers allocate buffers you
+ must also prevent MTU changes while the port is open.
+ </para>
+ <para>
+ Once the port is open it will call the rx_function of each channel
+ whenever a completed packet arrived. This is invoked from
+ interrupt context and passes you the channel and a network
+ buffer (struct sk_buff) holding the data. The data includes
+ the CRC bytes so most users will want to trim the last two
+ bytes before processing the data. This function is very timing
+ critical. When you wish to simply discard data the support
+ code provides the function <function>z8530_null_rx</function>
+ to discard the data.
+ </para>
+ <para>
+ To active PIO mode sending and receiving the <function>
+ z8530_sync_open</function> is called. This expects to be passed
+ the network device and the channel. Typically this is called from
+ your network device open callback. On a failure a non zero error
+ status is returned. The <function>z8530_sync_close</function>
+ function shuts down a PIO channel. This must be done before the
+ channel is opened again and before the driver shuts down
+ and unloads.
+ </para>
+ <para>
+ The ideal mode of operation is dual channel DMA mode. Here the
+ kernel driver will configure the board for DMA in both directions.
+ The driver also handles ISA DMA issues such as controller
+ programming and the memory range limit for you. This mode is
+ activated by calling the <function>z8530_sync_dma_open</function>
+ function. On failure a non zero error value is returned.
+ Once this mode is activated it can be shut down by calling the
+ <function>z8530_sync_dma_close</function>. You must call the close
+ function matching the open mode you used.
+ </para>
+ <para>
+ The final supported mode uses a single DMA channel to drive the
+ transmit side. As the Z85C30 has a larger FIFO on the receive
+ channel this tends to increase the maximum speed a little.
+ This is activated by calling the <function>z8530_sync_txdma_open
+ </function>. This returns a non zero error code on failure. The
+ <function>z8530_sync_txdma_close</function> function closes down
+ the Z8530 interface from this mode.
+ </para>
+ </chapter>
+
+ <chapter id="Network_Layer_Functions">
+ <title>Network Layer Functions</title>
+ <para>
+ The Z8530 layer provides functions to queue packets for
+ transmission. The driver internally buffers the frame currently
+ being transmitted and one further frame (in order to keep back
+ to back transmission running). Any further buffering is up to
+ the caller.
+ </para>
+ <para>
+ The function <function>z8530_queue_xmit</function> takes a network
+ buffer in sk_buff format and queues it for transmission. The
+ caller must provide the entire packet with the exception of the
+ bitstuffing and CRC. This is normally done by the caller via
+ the generic HDLC interface layer. It returns 0 if the buffer has been
+ queued and non zero values for queue full. If the function accepts
+ the buffer it becomes property of the Z8530 layer and the caller
+ should not free it.
+ </para>
+ <para>
+ The function <function>z8530_get_stats</function> returns a pointer
+ to an internally maintained per interface statistics block. This
+ provides most of the interface code needed to implement the network
+ layer get_stats callback.
+ </para>
+ </chapter>
+
+ <chapter id="Porting_The_Z8530_Driver">
+ <title>Porting The Z8530 Driver</title>
+ <para>
+ The Z8530 driver is written to be portable. In DMA mode it makes
+ assumptions about the use of ISA DMA. These are probably warranted
+ in most cases as the Z85230 in particular was designed to glue to PC
+ type machines. The PIO mode makes no real assumptions.
+ </para>
+ <para>
+ Should you need to retarget the Z8530 driver to another architecture
+ the only code that should need changing are the port I/O functions.
+ At the moment these assume PC I/O port accesses. This may not be
+ appropriate for all platforms. Replacing
+ <function>z8530_read_port</function> and <function>z8530_write_port
+ </function> is intended to be all that is required to port this
+ driver layer.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ <variablelist>
+ <varlistentry><term>Interrupt Locking</term>
+ <listitem>
+ <para>
+ The locking in the driver is done via the global cli/sti lock. This
+ makes for relatively poor SMP performance. Switching this to use a
+ per device spin lock would probably materially improve performance.
+ </para>
+ </listitem></varlistentry>
+
+ <varlistentry><term>Occasional Failures</term>
+ <listitem>
+ <para>
+ We have reports of occasional failures when run for very long
+ periods of time and the driver starts to receive junk frames. At
+ the moment the cause of this is not clear.
+ </para>
+ </listitem></varlistentry>
+ </variablelist>
+
+ </para>
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Edrivers/net/wan/z85230.c
+ </chapter>
+
+ <chapter id="intfunctions">
+ <title>Internal Functions</title>
+!Idrivers/net/wan/z85230.c
+ </chapter>
+
+</book>
diff --git a/Documentation/HOWTO b/Documentation/HOWTO
new file mode 100644
index 0000000..8495fc9
--- /dev/null
+++ b/Documentation/HOWTO
@@ -0,0 +1,651 @@
+HOWTO do Linux kernel development
+---------------------------------
+
+This is the be-all, end-all document on this topic. It contains
+instructions on how to become a Linux kernel developer and how to learn
+to work with the Linux kernel development community. It tries to not
+contain anything related to the technical aspects of kernel programming,
+but will help point you in the right direction for that.
+
+If anything in this document becomes out of date, please send in patches
+to the maintainer of this file, who is listed at the bottom of the
+document.
+
+
+Introduction
+------------
+
+So, you want to learn how to become a Linux kernel developer? Or you
+have been told by your manager, "Go write a Linux driver for this
+device." This document's goal is to teach you everything you need to
+know to achieve this by describing the process you need to go through,
+and hints on how to work with the community. It will also try to
+explain some of the reasons why the community works like it does.
+
+The kernel is written mostly in C, with some architecture-dependent
+parts written in assembly. A good understanding of C is required for
+kernel development. Assembly (any architecture) is not required unless
+you plan to do low-level development for that architecture. Though they
+are not a good substitute for a solid C education and/or years of
+experience, the following books are good for, if anything, reference:
+ - "The C Programming Language" by Kernighan and Ritchie [Prentice Hall]
+ - "Practical C Programming" by Steve Oualline [O'Reilly]
+ - "C: A Reference Manual" by Harbison and Steele [Prentice Hall]
+
+The kernel is written using GNU C and the GNU toolchain. While it
+adheres to the ISO C89 standard, it uses a number of extensions that are
+not featured in the standard. The kernel is a freestanding C
+environment, with no reliance on the standard C library, so some
+portions of the C standard are not supported. Arbitrary long long
+divisions and floating point are not allowed. It can sometimes be
+difficult to understand the assumptions the kernel has on the toolchain
+and the extensions that it uses, and unfortunately there is no
+definitive reference for them. Please check the gcc info pages (`info
+gcc`) for some information on them.
+
+Please remember that you are trying to learn how to work with the
+existing development community. It is a diverse group of people, with
+high standards for coding, style and procedure. These standards have
+been created over time based on what they have found to work best for
+such a large and geographically dispersed team. Try to learn as much as
+possible about these standards ahead of time, as they are well
+documented; do not expect people to adapt to you or your company's way
+of doing things.
+
+
+Legal Issues
+------------
+
+The Linux kernel source code is released under the GPL. Please see the
+file, COPYING, in the main directory of the source tree, for details on
+the license. If you have further questions about the license, please
+contact a lawyer, and do not ask on the Linux kernel mailing list. The
+people on the mailing lists are not lawyers, and you should not rely on
+their statements on legal matters.
+
+For common questions and answers about the GPL, please see:
+ http://www.gnu.org/licenses/gpl-faq.html
+
+
+Documentation
+------------
+
+The Linux kernel source tree has a large range of documents that are
+invaluable for learning how to interact with the kernel community. When
+new features are added to the kernel, it is recommended that new
+documentation files are also added which explain how to use the feature.
+When a kernel change causes the interface that the kernel exposes to
+userspace to change, it is recommended that you send the information or
+a patch to the manual pages explaining the change to the manual pages
+maintainer at mtk.manpages@gmail.com, and CC the list
+linux-api@vger.kernel.org.
+
+Here is a list of files that are in the kernel source tree that are
+required reading:
+ README
+ This file gives a short background on the Linux kernel and describes
+ what is necessary to do to configure and build the kernel. People
+ who are new to the kernel should start here.
+
+ Documentation/Changes
+ This file gives a list of the minimum levels of various software
+ packages that are necessary to build and run the kernel
+ successfully.
+
+ Documentation/CodingStyle
+ This describes the Linux kernel coding style, and some of the
+ rationale behind it. All new code is expected to follow the
+ guidelines in this document. Most maintainers will only accept
+ patches if these rules are followed, and many people will only
+ review code if it is in the proper style.
+
+ Documentation/SubmittingPatches
+ Documentation/SubmittingDrivers
+ These files describe in explicit detail how to successfully create
+ and send a patch, including (but not limited to):
+ - Email contents
+ - Email format
+ - Who to send it to
+ Following these rules will not guarantee success (as all patches are
+ subject to scrutiny for content and style), but not following them
+ will almost always prevent it.
+
+ Other excellent descriptions of how to create patches properly are:
+ "The Perfect Patch"
+ http://userweb.kernel.org/~akpm/stuff/tpp.txt
+ "Linux kernel patch submission format"
+ http://linux.yyz.us/patch-format.html
+
+ Documentation/stable_api_nonsense.txt
+ This file describes the rationale behind the conscious decision to
+ not have a stable API within the kernel, including things like:
+ - Subsystem shim-layers (for compatibility?)
+ - Driver portability between Operating Systems.
+ - Mitigating rapid change within the kernel source tree (or
+ preventing rapid change)
+ This document is crucial for understanding the Linux development
+ philosophy and is very important for people moving to Linux from
+ development on other Operating Systems.
+
+ Documentation/SecurityBugs
+ If you feel you have found a security problem in the Linux kernel,
+ please follow the steps in this document to help notify the kernel
+ developers, and help solve the issue.
+
+ Documentation/ManagementStyle
+ This document describes how Linux kernel maintainers operate and the
+ shared ethos behind their methodologies. This is important reading
+ for anyone new to kernel development (or anyone simply curious about
+ it), as it resolves a lot of common misconceptions and confusion
+ about the unique behavior of kernel maintainers.
+
+ Documentation/stable_kernel_rules.txt
+ This file describes the rules on how the stable kernel releases
+ happen, and what to do if you want to get a change into one of these
+ releases.
+
+ Documentation/kernel-docs.txt
+ A list of external documentation that pertains to kernel
+ development. Please consult this list if you do not find what you
+ are looking for within the in-kernel documentation.
+
+ Documentation/applying-patches.txt
+ A good introduction describing exactly what a patch is and how to
+ apply it to the different development branches of the kernel.
+
+The kernel also has a large number of documents that can be
+automatically generated from the source code itself. This includes a
+full description of the in-kernel API, and rules on how to handle
+locking properly. The documents will be created in the
+Documentation/DocBook/ directory and can be generated as PDF,
+Postscript, HTML, and man pages by running:
+ make pdfdocs
+ make psdocs
+ make htmldocs
+ make mandocs
+respectively from the main kernel source directory.
+
+
+Becoming A Kernel Developer
+---------------------------
+
+If you do not know anything about Linux kernel development, you should
+look at the Linux KernelNewbies project:
+ http://kernelnewbies.org
+It consists of a helpful mailing list where you can ask almost any type
+of basic kernel development question (make sure to search the archives
+first, before asking something that has already been answered in the
+past.) It also has an IRC channel that you can use to ask questions in
+real-time, and a lot of helpful documentation that is useful for
+learning about Linux kernel development.
+
+The website has basic information about code organization, subsystems,
+and current projects (both in-tree and out-of-tree). It also describes
+some basic logistical information, like how to compile a kernel and
+apply a patch.
+
+If you do not know where you want to start, but you want to look for
+some task to start doing to join into the kernel development community,
+go to the Linux Kernel Janitor's project:
+ http://janitor.kernelnewbies.org/
+It is a great place to start. It describes a list of relatively simple
+problems that need to be cleaned up and fixed within the Linux kernel
+source tree. Working with the developers in charge of this project, you
+will learn the basics of getting your patch into the Linux kernel tree,
+and possibly be pointed in the direction of what to go work on next, if
+you do not already have an idea.
+
+If you already have a chunk of code that you want to put into the kernel
+tree, but need some help getting it in the proper form, the
+kernel-mentors project was created to help you out with this. It is a
+mailing list, and can be found at:
+ http://selenic.com/mailman/listinfo/kernel-mentors
+
+Before making any actual modifications to the Linux kernel code, it is
+imperative to understand how the code in question works. For this
+purpose, nothing is better than reading through it directly (most tricky
+bits are commented well), perhaps even with the help of specialized
+tools. One such tool that is particularly recommended is the Linux
+Cross-Reference project, which is able to present source code in a
+self-referential, indexed webpage format. An excellent up-to-date
+repository of the kernel code may be found at:
+ http://users.sosdg.org/~qiyong/lxr/
+
+
+The development process
+-----------------------
+
+Linux kernel development process currently consists of a few different
+main kernel "branches" and lots of different subsystem-specific kernel
+branches. These different branches are:
+ - main 2.6.x kernel tree
+ - 2.6.x.y -stable kernel tree
+ - 2.6.x -git kernel patches
+ - 2.6.x -mm kernel patches
+ - subsystem specific kernel trees and patches
+
+2.6.x kernel tree
+-----------------
+2.6.x kernels are maintained by Linus Torvalds, and can be found on
+kernel.org in the pub/linux/kernel/v2.6/ directory. Its development
+process is as follows:
+ - As soon as a new kernel is released a two weeks window is open,
+ during this period of time maintainers can submit big diffs to
+ Linus, usually the patches that have already been included in the
+ -mm kernel for a few weeks. The preferred way to submit big changes
+ is using git (the kernel's source management tool, more information
+ can be found at http://git.or.cz/) but plain patches are also just
+ fine.
+ - After two weeks a -rc1 kernel is released it is now possible to push
+ only patches that do not include new features that could affect the
+ stability of the whole kernel. Please note that a whole new driver
+ (or filesystem) might be accepted after -rc1 because there is no
+ risk of causing regressions with such a change as long as the change
+ is self-contained and does not affect areas outside of the code that
+ is being added. git can be used to send patches to Linus after -rc1
+ is released, but the patches need to also be sent to a public
+ mailing list for review.
+ - A new -rc is released whenever Linus deems the current git tree to
+ be in a reasonably sane state adequate for testing. The goal is to
+ release a new -rc kernel every week.
+ - Process continues until the kernel is considered "ready", the
+ process should last around 6 weeks.
+ - Known regressions in each release are periodically posted to the
+ linux-kernel mailing list. The goal is to reduce the length of
+ that list to zero before declaring the kernel to be "ready," but, in
+ the real world, a small number of regressions often remain at
+ release time.
+
+It is worth mentioning what Andrew Morton wrote on the linux-kernel
+mailing list about kernel releases:
+ "Nobody knows when a kernel will be released, because it's
+ released according to perceived bug status, not according to a
+ preconceived timeline."
+
+2.6.x.y -stable kernel tree
+---------------------------
+Kernels with 4-part versions are -stable kernels. They contain
+relatively small and critical fixes for security problems or significant
+regressions discovered in a given 2.6.x kernel.
+
+This is the recommended branch for users who want the most recent stable
+kernel and are not interested in helping test development/experimental
+versions.
+
+If no 2.6.x.y kernel is available, then the highest numbered 2.6.x
+kernel is the current stable kernel.
+
+2.6.x.y are maintained by the "stable" team <stable@kernel.org>, and are
+released as needs dictate. The normal release period is approximately
+two weeks, but it can be longer if there are no pressing problems. A
+security-related problem, instead, can cause a release to happen almost
+instantly.
+
+The file Documentation/stable_kernel_rules.txt in the kernel tree
+documents what kinds of changes are acceptable for the -stable tree, and
+how the release process works.
+
+2.6.x -git patches
+------------------
+These are daily snapshots of Linus' kernel tree which are managed in a
+git repository (hence the name.) These patches are usually released
+daily and represent the current state of Linus' tree. They are more
+experimental than -rc kernels since they are generated automatically
+without even a cursory glance to see if they are sane.
+
+2.6.x -mm kernel patches
+------------------------
+These are experimental kernel patches released by Andrew Morton. Andrew
+takes all of the different subsystem kernel trees and patches and mushes
+them together, along with a lot of patches that have been plucked from
+the linux-kernel mailing list. This tree serves as a proving ground for
+new features and patches. Once a patch has proved its worth in -mm for
+a while Andrew or the subsystem maintainer pushes it on to Linus for
+inclusion in mainline.
+
+It is heavily encouraged that all new patches get tested in the -mm tree
+before they are sent to Linus for inclusion in the main kernel tree. Code
+which does not make an appearance in -mm before the opening of the merge
+window will prove hard to merge into the mainline.
+
+These kernels are not appropriate for use on systems that are supposed
+to be stable and they are more risky to run than any of the other
+branches.
+
+If you wish to help out with the kernel development process, please test
+and use these kernel releases and provide feedback to the linux-kernel
+mailing list if you have any problems, and if everything works properly.
+
+In addition to all the other experimental patches, these kernels usually
+also contain any changes in the mainline -git kernels available at the
+time of release.
+
+The -mm kernels are not released on a fixed schedule, but usually a few
+-mm kernels are released in between each -rc kernel (1 to 3 is common).
+
+Subsystem Specific kernel trees and patches
+-------------------------------------------
+A number of the different kernel subsystem developers expose their
+development trees so that others can see what is happening in the
+different areas of the kernel. These trees are pulled into the -mm
+kernel releases as described above.
+
+Here is a list of some of the different kernel trees available:
+ git trees:
+ - Kbuild development tree, Sam Ravnborg <sam@ravnborg.org>
+ git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
+
+ - ACPI development tree, Len Brown <len.brown@intel.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
+
+ - Block development tree, Jens Axboe <jens.axboe@oracle.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
+
+ - DRM development tree, Dave Airlie <airlied@linux.ie>
+ git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
+
+ - ia64 development tree, Tony Luck <tony.luck@intel.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
+
+ - infiniband, Roland Dreier <rolandd@cisco.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
+
+ - libata, Jeff Garzik <jgarzik@pobox.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
+
+ - network drivers, Jeff Garzik <jgarzik@pobox.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
+
+ - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net>
+ git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
+
+ - SCSI, James Bottomley <James.Bottomley@hansenpartnership.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
+
+ - x86, Ingo Molnar <mingo@elte.hu>
+ git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git
+
+ quilt trees:
+ - USB, Driver Core, and I2C, Greg Kroah-Hartman <gregkh@suse.de>
+ kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+
+ Other kernel trees can be found listed at http://git.kernel.org/ and in
+ the MAINTAINERS file.
+
+Bug Reporting
+-------------
+
+bugzilla.kernel.org is where the Linux kernel developers track kernel
+bugs. Users are encouraged to report all bugs that they find in this
+tool. For details on how to use the kernel bugzilla, please see:
+ http://bugzilla.kernel.org/page.cgi?id=faq.html
+
+The file REPORTING-BUGS in the main kernel source directory has a good
+template for how to report a possible kernel bug, and details what kind
+of information is needed by the kernel developers to help track down the
+problem.
+
+
+Managing bug reports
+--------------------
+
+One of the best ways to put into practice your hacking skills is by fixing
+bugs reported by other people. Not only you will help to make the kernel
+more stable, you'll learn to fix real world problems and you will improve
+your skills, and other developers will be aware of your presence. Fixing
+bugs is one of the best ways to get merits among other developers, because
+not many people like wasting time fixing other people's bugs.
+
+To work in the already reported bug reports, go to http://bugzilla.kernel.org.
+If you want to be advised of the future bug reports, you can subscribe to the
+bugme-new mailing list (only new bug reports are mailed here) or to the
+bugme-janitor mailing list (every change in the bugzilla is mailed here)
+
+ http://lists.linux-foundation.org/mailman/listinfo/bugme-new
+ http://lists.linux-foundation.org/mailman/listinfo/bugme-janitors
+
+
+
+Mailing lists
+-------------
+
+As some of the above documents describe, the majority of the core kernel
+developers participate on the Linux Kernel Mailing list. Details on how
+to subscribe and unsubscribe from the list can be found at:
+ http://vger.kernel.org/vger-lists.html#linux-kernel
+There are archives of the mailing list on the web in many different
+places. Use a search engine to find these archives. For example:
+ http://dir.gmane.org/gmane.linux.kernel
+It is highly recommended that you search the archives about the topic
+you want to bring up, before you post it to the list. A lot of things
+already discussed in detail are only recorded at the mailing list
+archives.
+
+Most of the individual kernel subsystems also have their own separate
+mailing list where they do their development efforts. See the
+MAINTAINERS file for a list of what these lists are for the different
+groups.
+
+Many of the lists are hosted on kernel.org. Information on them can be
+found at:
+ http://vger.kernel.org/vger-lists.html
+
+Please remember to follow good behavioral habits when using the lists.
+Though a bit cheesy, the following URL has some simple guidelines for
+interacting with the list (or any list):
+ http://www.albion.com/netiquette/
+
+If multiple people respond to your mail, the CC: list of recipients may
+get pretty large. Don't remove anybody from the CC: list without a good
+reason, or don't reply only to the list address. Get used to receiving the
+mail twice, one from the sender and the one from the list, and don't try
+to tune that by adding fancy mail-headers, people will not like it.
+
+Remember to keep the context and the attribution of your replies intact,
+keep the "John Kernelhacker wrote ...:" lines at the top of your reply, and
+add your statements between the individual quoted sections instead of
+writing at the top of the mail.
+
+If you add patches to your mail, make sure they are plain readable text
+as stated in Documentation/SubmittingPatches. Kernel developers don't
+want to deal with attachments or compressed patches; they may want
+to comment on individual lines of your patch, which works only that way.
+Make sure you use a mail program that does not mangle spaces and tab
+characters. A good first test is to send the mail to yourself and try
+to apply your own patch by yourself. If that doesn't work, get your
+mail program fixed or change it until it works.
+
+Above all, please remember to show respect to other subscribers.
+
+
+Working with the community
+--------------------------
+
+The goal of the kernel community is to provide the best possible kernel
+there is. When you submit a patch for acceptance, it will be reviewed
+on its technical merits and those alone. So, what should you be
+expecting?
+ - criticism
+ - comments
+ - requests for change
+ - requests for justification
+ - silence
+
+Remember, this is part of getting your patch into the kernel. You have
+to be able to take criticism and comments about your patches, evaluate
+them at a technical level and either rework your patches or provide
+clear and concise reasoning as to why those changes should not be made.
+If there are no responses to your posting, wait a few days and try
+again, sometimes things get lost in the huge volume.
+
+What should you not do?
+ - expect your patch to be accepted without question
+ - become defensive
+ - ignore comments
+ - resubmit the patch without making any of the requested changes
+
+In a community that is looking for the best technical solution possible,
+there will always be differing opinions on how beneficial a patch is.
+You have to be cooperative, and willing to adapt your idea to fit within
+the kernel. Or at least be willing to prove your idea is worth it.
+Remember, being wrong is acceptable as long as you are willing to work
+toward a solution that is right.
+
+It is normal that the answers to your first patch might simply be a list
+of a dozen things you should correct. This does _not_ imply that your
+patch will not be accepted, and it is _not_ meant against you
+personally. Simply correct all issues raised against your patch and
+resend it.
+
+
+Differences between the kernel community and corporate structures
+-----------------------------------------------------------------
+
+The kernel community works differently than most traditional corporate
+development environments. Here are a list of things that you can try to
+do to try to avoid problems:
+ Good things to say regarding your proposed changes:
+ - "This solves multiple problems."
+ - "This deletes 2000 lines of code."
+ - "Here is a patch that explains what I am trying to describe."
+ - "I tested it on 5 different architectures..."
+ - "Here is a series of small patches that..."
+ - "This increases performance on typical machines..."
+
+ Bad things you should avoid saying:
+ - "We did it this way in AIX/ptx/Solaris, so therefore it must be
+ good..."
+ - "I've being doing this for 20 years, so..."
+ - "This is required for my company to make money"
+ - "This is for our Enterprise product line."
+ - "Here is my 1000 page design document that describes my idea"
+ - "I've been working on this for 6 months..."
+ - "Here's a 5000 line patch that..."
+ - "I rewrote all of the current mess, and here it is..."
+ - "I have a deadline, and this patch needs to be applied now."
+
+Another way the kernel community is different than most traditional
+software engineering work environments is the faceless nature of
+interaction. One benefit of using email and irc as the primary forms of
+communication is the lack of discrimination based on gender or race.
+The Linux kernel work environment is accepting of women and minorities
+because all you are is an email address. The international aspect also
+helps to level the playing field because you can't guess gender based on
+a person's name. A man may be named Andrea and a woman may be named Pat.
+Most women who have worked in the Linux kernel and have expressed an
+opinion have had positive experiences.
+
+The language barrier can cause problems for some people who are not
+comfortable with English. A good grasp of the language can be needed in
+order to get ideas across properly on mailing lists, so it is
+recommended that you check your emails to make sure they make sense in
+English before sending them.
+
+
+Break up your changes
+---------------------
+
+The Linux kernel community does not gladly accept large chunks of code
+dropped on it all at once. The changes need to be properly introduced,
+discussed, and broken up into tiny, individual portions. This is almost
+the exact opposite of what companies are used to doing. Your proposal
+should also be introduced very early in the development process, so that
+you can receive feedback on what you are doing. It also lets the
+community feel that you are working with them, and not simply using them
+as a dumping ground for your feature. However, don't send 50 emails at
+one time to a mailing list, your patch series should be smaller than
+that almost all of the time.
+
+The reasons for breaking things up are the following:
+
+1) Small patches increase the likelihood that your patches will be
+ applied, since they don't take much time or effort to verify for
+ correctness. A 5 line patch can be applied by a maintainer with
+ barely a second glance. However, a 500 line patch may take hours to
+ review for correctness (the time it takes is exponentially
+ proportional to the size of the patch, or something).
+
+ Small patches also make it very easy to debug when something goes
+ wrong. It's much easier to back out patches one by one than it is
+ to dissect a very large patch after it's been applied (and broken
+ something).
+
+2) It's important not only to send small patches, but also to rewrite
+ and simplify (or simply re-order) patches before submitting them.
+
+Here is an analogy from kernel developer Al Viro:
+ "Think of a teacher grading homework from a math student. The
+ teacher does not want to see the student's trials and errors
+ before they came up with the solution. They want to see the
+ cleanest, most elegant answer. A good student knows this, and
+ would never submit her intermediate work before the final
+ solution."
+
+ The same is true of kernel development. The maintainers and
+ reviewers do not want to see the thought process behind the
+ solution to the problem one is solving. They want to see a
+ simple and elegant solution."
+
+It may be challenging to keep the balance between presenting an elegant
+solution and working together with the community and discussing your
+unfinished work. Therefore it is good to get early in the process to
+get feedback to improve your work, but also keep your changes in small
+chunks that they may get already accepted, even when your whole task is
+not ready for inclusion now.
+
+Also realize that it is not acceptable to send patches for inclusion
+that are unfinished and will be "fixed up later."
+
+
+Justify your change
+-------------------
+
+Along with breaking up your patches, it is very important for you to let
+the Linux community know why they should add this change. New features
+must be justified as being needed and useful.
+
+
+Document your change
+--------------------
+
+When sending in your patches, pay special attention to what you say in
+the text in your email. This information will become the ChangeLog
+information for the patch, and will be preserved for everyone to see for
+all time. It should describe the patch completely, containing:
+ - why the change is necessary
+ - the overall design approach in the patch
+ - implementation details
+ - testing results
+
+For more details on what this should all look like, please see the
+ChangeLog section of the document:
+ "The Perfect Patch"
+ http://userweb.kernel.org/~akpm/stuff/tpp.txt
+
+
+
+
+All of these things are sometimes very hard to do. It can take years to
+perfect these practices (if at all). It's a continuous process of
+improvement that requires a lot of patience and determination. But
+don't give up, it's possible. Many have done it before, and each had to
+start exactly where you are now.
+
+
+
+
+----------
+Thanks to Paolo Ciarrocchi who allowed the "Development Process"
+(http://linux.tar.bz/articles/2.6-development_process) section
+to be based on text he had written, and to Randy Dunlap and Gerrit
+Huizenga for some of the list of things you should and should not say.
+Also thanks to Pat Mochel, Hanna Linder, Randy Dunlap, Kay Sievers,
+Vojtech Pavlik, Jan Kara, Josh Boyer, Kees Cook, Andrew Morton, Andi
+Kleen, Vadim Lobanov, Jesper Juhl, Adrian Bunk, Keri Harris, Frans Pop,
+David A. Wheeler, Junio Hamano, Michael Kerrisk, and Alex Shepard for
+their review, comments, and contributions. Without their help, this
+document would not have been possible.
+
+
+
+Maintainer: Greg Kroah-Hartman <greg@kroah.com>
diff --git a/Documentation/IO-mapping.txt b/Documentation/IO-mapping.txt
new file mode 100644
index 0000000..86edb61
--- /dev/null
+++ b/Documentation/IO-mapping.txt
@@ -0,0 +1,208 @@
+[ NOTE: The virt_to_bus() and bus_to_virt() functions have been
+ superseded by the functionality provided by the PCI DMA
+ interface (see Documentation/DMA-mapping.txt). They continue
+ to be documented below for historical purposes, but new code
+ must not use them. --davidm 00/12/12 ]
+
+[ This is a mail message in response to a query on IO mapping, thus the
+ strange format for a "document" ]
+
+The AHA-1542 is a bus-master device, and your patch makes the driver give the
+controller the physical address of the buffers, which is correct on x86
+(because all bus master devices see the physical memory mappings directly).
+
+However, on many setups, there are actually _three_ different ways of looking
+at memory addresses, and in this case we actually want the third, the
+so-called "bus address".
+
+Essentially, the three ways of addressing memory are (this is "real memory",
+that is, normal RAM--see later about other details):
+
+ - CPU untranslated. This is the "physical" address. Physical address
+ 0 is what the CPU sees when it drives zeroes on the memory bus.
+
+ - CPU translated address. This is the "virtual" address, and is
+ completely internal to the CPU itself with the CPU doing the appropriate
+ translations into "CPU untranslated".
+
+ - bus address. This is the address of memory as seen by OTHER devices,
+ not the CPU. Now, in theory there could be many different bus
+ addresses, with each device seeing memory in some device-specific way, but
+ happily most hardware designers aren't actually actively trying to make
+ things any more complex than necessary, so you can assume that all
+ external hardware sees the memory the same way.
+
+Now, on normal PCs the bus address is exactly the same as the physical
+address, and things are very simple indeed. However, they are that simple
+because the memory and the devices share the same address space, and that is
+not generally necessarily true on other PCI/ISA setups.
+
+Now, just as an example, on the PReP (PowerPC Reference Platform), the
+CPU sees a memory map something like this (this is from memory):
+
+ 0-2 GB "real memory"
+ 2 GB-3 GB "system IO" (inb/out and similar accesses on x86)
+ 3 GB-4 GB "IO memory" (shared memory over the IO bus)
+
+Now, that looks simple enough. However, when you look at the same thing from
+the viewpoint of the devices, you have the reverse, and the physical memory
+address 0 actually shows up as address 2 GB for any IO master.
+
+So when the CPU wants any bus master to write to physical memory 0, it
+has to give the master address 0x80000000 as the memory address.
+
+So, for example, depending on how the kernel is actually mapped on the
+PPC, you can end up with a setup like this:
+
+ physical address: 0
+ virtual address: 0xC0000000
+ bus address: 0x80000000
+
+where all the addresses actually point to the same thing. It's just seen
+through different translations..
+
+Similarly, on the Alpha, the normal translation is
+
+ physical address: 0
+ virtual address: 0xfffffc0000000000
+ bus address: 0x40000000
+
+(but there are also Alphas where the physical address and the bus address
+are the same).
+
+Anyway, the way to look up all these translations, you do
+
+ #include <asm/io.h>
+
+ phys_addr = virt_to_phys(virt_addr);
+ virt_addr = phys_to_virt(phys_addr);
+ bus_addr = virt_to_bus(virt_addr);
+ virt_addr = bus_to_virt(bus_addr);
+
+Now, when do you need these?
+
+You want the _virtual_ address when you are actually going to access that
+pointer from the kernel. So you can have something like this:
+
+ /*
+ * this is the hardware "mailbox" we use to communicate with
+ * the controller. The controller sees this directly.
+ */
+ struct mailbox {
+ __u32 status;
+ __u32 bufstart;
+ __u32 buflen;
+ ..
+ } mbox;
+
+ unsigned char * retbuffer;
+
+ /* get the address from the controller */
+ retbuffer = bus_to_virt(mbox.bufstart);
+ switch (retbuffer[0]) {
+ case STATUS_OK:
+ ...
+
+on the other hand, you want the bus address when you have a buffer that
+you want to give to the controller:
+
+ /* ask the controller to read the sense status into "sense_buffer" */
+ mbox.bufstart = virt_to_bus(&sense_buffer);
+ mbox.buflen = sizeof(sense_buffer);
+ mbox.status = 0;
+ notify_controller(&mbox);
+
+And you generally _never_ want to use the physical address, because you can't
+use that from the CPU (the CPU only uses translated virtual addresses), and
+you can't use it from the bus master.
+
+So why do we care about the physical address at all? We do need the physical
+address in some cases, it's just not very often in normal code. The physical
+address is needed if you use memory mappings, for example, because the
+"remap_pfn_range()" mm function wants the physical address of the memory to
+be remapped as measured in units of pages, a.k.a. the pfn (the memory
+management layer doesn't know about devices outside the CPU, so it
+shouldn't need to know about "bus addresses" etc).
+
+NOTE NOTE NOTE! The above is only one part of the whole equation. The above
+only talks about "real memory", that is, CPU memory (RAM).
+
+There is a completely different type of memory too, and that's the "shared
+memory" on the PCI or ISA bus. That's generally not RAM (although in the case
+of a video graphics card it can be normal DRAM that is just used for a frame
+buffer), but can be things like a packet buffer in a network card etc.
+
+This memory is called "PCI memory" or "shared memory" or "IO memory" or
+whatever, and there is only one way to access it: the readb/writeb and
+related functions. You should never take the address of such memory, because
+there is really nothing you can do with such an address: it's not
+conceptually in the same memory space as "real memory" at all, so you cannot
+just dereference a pointer. (Sadly, on x86 it _is_ in the same memory space,
+so on x86 it actually works to just deference a pointer, but it's not
+portable).
+
+For such memory, you can do things like
+
+ - reading:
+ /*
+ * read first 32 bits from ISA memory at 0xC0000, aka
+ * C000:0000 in DOS terms
+ */
+ unsigned int signature = isa_readl(0xC0000);
+
+ - remapping and writing:
+ /*
+ * remap framebuffer PCI memory area at 0xFC000000,
+ * size 1MB, so that we can access it: We can directly
+ * access only the 640k-1MB area, so anything else
+ * has to be remapped.
+ */
+ char * baseptr = ioremap(0xFC000000, 1024*1024);
+
+ /* write a 'A' to the offset 10 of the area */
+ writeb('A',baseptr+10);
+
+ /* unmap when we unload the driver */
+ iounmap(baseptr);
+
+ - copying and clearing:
+ /* get the 6-byte Ethernet address at ISA address E000:0040 */
+ memcpy_fromio(kernel_buffer, 0xE0040, 6);
+ /* write a packet to the driver */
+ memcpy_toio(0xE1000, skb->data, skb->len);
+ /* clear the frame buffer */
+ memset_io(0xA0000, 0, 0x10000);
+
+OK, that just about covers the basics of accessing IO portably. Questions?
+Comments? You may think that all the above is overly complex, but one day you
+might find yourself with a 500 MHz Alpha in front of you, and then you'll be
+happy that your driver works ;)
+
+Note that kernel versions 2.0.x (and earlier) mistakenly called the
+ioremap() function "vremap()". ioremap() is the proper name, but I
+didn't think straight when I wrote it originally. People who have to
+support both can do something like:
+
+ /* support old naming silliness */
+ #if LINUX_VERSION_CODE < 0x020100
+ #define ioremap vremap
+ #define iounmap vfree
+ #endif
+
+at the top of their source files, and then they can use the right names
+even on 2.0.x systems.
+
+And the above sounds worse than it really is. Most real drivers really
+don't do all that complex things (or rather: the complexity is not so
+much in the actual IO accesses as in error handling and timeouts etc).
+It's generally not hard to fix drivers, and in many cases the code
+actually looks better afterwards:
+
+ unsigned long signature = *(unsigned int *) 0xC0000;
+ vs
+ unsigned long signature = readl(0xC0000);
+
+I think the second version actually is more readable, no?
+
+ Linus
+
diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
new file mode 100644
index 0000000..bc38283
--- /dev/null
+++ b/Documentation/IPMI.txt
@@ -0,0 +1,665 @@
+
+ The Linux IPMI Driver
+ ---------------------
+ Corey Minyard
+ <minyard@mvista.com>
+ <minyard@acm.org>
+
+The Intelligent Platform Management Interface, or IPMI, is a
+standard for controlling intelligent devices that monitor a system.
+It provides for dynamic discovery of sensors in the system and the
+ability to monitor the sensors and be informed when the sensor's
+values change or go outside certain boundaries. It also has a
+standardized database for field-replaceable units (FRUs) and a watchdog
+timer.
+
+To use this, you need an interface to an IPMI controller in your
+system (called a Baseboard Management Controller, or BMC) and
+management software that can use the IPMI system.
+
+This document describes how to use the IPMI driver for Linux. If you
+are not familiar with IPMI itself, see the web site at
+http://www.intel.com/design/servers/ipmi/index.htm. IPMI is a big
+subject and I can't cover it all here!
+
+Configuration
+-------------
+
+The Linux IPMI driver is modular, which means you have to pick several
+things to have it work right depending on your hardware. Most of
+these are available in the 'Character Devices' menu then the IPMI
+menu.
+
+No matter what, you must pick 'IPMI top-level message handler' to use
+IPMI. What you do beyond that depends on your needs and hardware.
+
+The message handler does not provide any user-level interfaces.
+Kernel code (like the watchdog) can still use it. If you need access
+from userland, you need to select 'Device interface for IPMI' if you
+want access through a device driver.
+
+The driver interface depends on your hardware. If your system
+properly provides the SMBIOS info for IPMI, the driver will detect it
+and just work. If you have a board with a standard interface (These
+will generally be either "KCS", "SMIC", or "BT", consult your hardware
+manual), choose the 'IPMI SI handler' option. A driver also exists
+for direct I2C access to the IPMI management controller. Some boards
+support this, but it is unknown if it will work on every board. For
+this, choose 'IPMI SMBus handler', but be ready to try to do some
+figuring to see if it will work on your system if the SMBIOS/APCI
+information is wrong or not present. It is fairly safe to have both
+these enabled and let the drivers auto-detect what is present.
+
+You should generally enable ACPI on your system, as systems with IPMI
+can have ACPI tables describing them.
+
+If you have a standard interface and the board manufacturer has done
+their job correctly, the IPMI controller should be automatically
+detected (via ACPI or SMBIOS tables) and should just work. Sadly,
+many boards do not have this information. The driver attempts
+standard defaults, but they may not work. If you fall into this
+situation, you need to read the section below named 'The SI Driver' or
+"The SMBus Driver" on how to hand-configure your system.
+
+IPMI defines a standard watchdog timer. You can enable this with the
+'IPMI Watchdog Timer' config option. If you compile the driver into
+the kernel, then via a kernel command-line option you can have the
+watchdog timer start as soon as it initializes. It also have a lot
+of other options, see the 'Watchdog' section below for more details.
+Note that you can also have the watchdog continue to run if it is
+closed (by default it is disabled on close). Go into the 'Watchdog
+Cards' menu, enable 'Watchdog Timer Support', and enable the option
+'Disable watchdog shutdown on close'.
+
+IPMI systems can often be powered off using IPMI commands. Select
+'IPMI Poweroff' to do this. The driver will auto-detect if the system
+can be powered off by IPMI. It is safe to enable this even if your
+system doesn't support this option. This works on ATCA systems, the
+Radisys CPI1 card, and any IPMI system that supports standard chassis
+management commands.
+
+If you want the driver to put an event into the event log on a panic,
+enable the 'Generate a panic event to all BMCs on a panic' option. If
+you want the whole panic string put into the event log using OEM
+events, enable the 'Generate OEM events containing the panic string'
+option.
+
+Basic Design
+------------
+
+The Linux IPMI driver is designed to be very modular and flexible, you
+only need to take the pieces you need and you can use it in many
+different ways. Because of that, it's broken into many chunks of
+code. These chunks (by module name) are:
+
+ipmi_msghandler - This is the central piece of software for the IPMI
+system. It handles all messages, message timing, and responses. The
+IPMI users tie into this, and the IPMI physical interfaces (called
+System Management Interfaces, or SMIs) also tie in here. This
+provides the kernelland interface for IPMI, but does not provide an
+interface for use by application processes.
+
+ipmi_devintf - This provides a userland IOCTL interface for the IPMI
+driver, each open file for this device ties in to the message handler
+as an IPMI user.
+
+ipmi_si - A driver for various system interfaces. This supports KCS,
+SMIC, and BT interfaces. Unless you have an SMBus interface or your
+own custom interface, you probably need to use this.
+
+ipmi_smb - A driver for accessing BMCs on the SMBus. It uses the
+I2C kernel driver's SMBus interfaces to send and receive IPMI messages
+over the SMBus.
+
+ipmi_watchdog - IPMI requires systems to have a very capable watchdog
+timer. This driver implements the standard Linux watchdog timer
+interface on top of the IPMI message handler.
+
+ipmi_poweroff - Some systems support the ability to be turned off via
+IPMI commands.
+
+These are all individually selectable via configuration options.
+
+Note that the KCS-only interface has been removed. The af_ipmi driver
+is no longer supported and has been removed because it was impossible
+to do 32 bit emulation on 64-bit kernels with it.
+
+Much documentation for the interface is in the include files. The
+IPMI include files are:
+
+net/af_ipmi.h - Contains the socket interface.
+
+linux/ipmi.h - Contains the user interface and IOCTL interface for IPMI.
+
+linux/ipmi_smi.h - Contains the interface for system management interfaces
+(things that interface to IPMI controllers) to use.
+
+linux/ipmi_msgdefs.h - General definitions for base IPMI messaging.
+
+
+Addressing
+----------
+
+The IPMI addressing works much like IP addresses, you have an overlay
+to handle the different address types. The overlay is:
+
+ struct ipmi_addr
+ {
+ int addr_type;
+ short channel;
+ char data[IPMI_MAX_ADDR_SIZE];
+ };
+
+The addr_type determines what the address really is. The driver
+currently understands two different types of addresses.
+
+"System Interface" addresses are defined as:
+
+ struct ipmi_system_interface_addr
+ {
+ int addr_type;
+ short channel;
+ };
+
+and the type is IPMI_SYSTEM_INTERFACE_ADDR_TYPE. This is used for talking
+straight to the BMC on the current card. The channel must be
+IPMI_BMC_CHANNEL.
+
+Messages that are destined to go out on the IPMB bus use the
+IPMI_IPMB_ADDR_TYPE address type. The format is
+
+ struct ipmi_ipmb_addr
+ {
+ int addr_type;
+ short channel;
+ unsigned char slave_addr;
+ unsigned char lun;
+ };
+
+The "channel" here is generally zero, but some devices support more
+than one channel, it corresponds to the channel as defined in the IPMI
+spec.
+
+
+Messages
+--------
+
+Messages are defined as:
+
+struct ipmi_msg
+{
+ unsigned char netfn;
+ unsigned char lun;
+ unsigned char cmd;
+ unsigned char *data;
+ int data_len;
+};
+
+The driver takes care of adding/stripping the header information. The
+data portion is just the data to be send (do NOT put addressing info
+here) or the response. Note that the completion code of a response is
+the first item in "data", it is not stripped out because that is how
+all the messages are defined in the spec (and thus makes counting the
+offsets a little easier :-).
+
+When using the IOCTL interface from userland, you must provide a block
+of data for "data", fill it, and set data_len to the length of the
+block of data, even when receiving messages. Otherwise the driver
+will have no place to put the message.
+
+Messages coming up from the message handler in kernelland will come in
+as:
+
+ struct ipmi_recv_msg
+ {
+ struct list_head link;
+
+ /* The type of message as defined in the "Receive Types"
+ defines above. */
+ int recv_type;
+
+ ipmi_user_t *user;
+ struct ipmi_addr addr;
+ long msgid;
+ struct ipmi_msg msg;
+
+ /* Call this when done with the message. It will presumably free
+ the message and do any other necessary cleanup. */
+ void (*done)(struct ipmi_recv_msg *msg);
+
+ /* Place-holder for the data, don't make any assumptions about
+ the size or existence of this, since it may change. */
+ unsigned char msg_data[IPMI_MAX_MSG_LENGTH];
+ };
+
+You should look at the receive type and handle the message
+appropriately.
+
+
+The Upper Layer Interface (Message Handler)
+-------------------------------------------
+
+The upper layer of the interface provides the users with a consistent
+view of the IPMI interfaces. It allows multiple SMI interfaces to be
+addressed (because some boards actually have multiple BMCs on them)
+and the user should not have to care what type of SMI is below them.
+
+
+Creating the User
+
+To user the message handler, you must first create a user using
+ipmi_create_user. The interface number specifies which SMI you want
+to connect to, and you must supply callback functions to be called
+when data comes in. The callback function can run at interrupt level,
+so be careful using the callbacks. This also allows to you pass in a
+piece of data, the handler_data, that will be passed back to you on
+all calls.
+
+Once you are done, call ipmi_destroy_user() to get rid of the user.
+
+From userland, opening the device automatically creates a user, and
+closing the device automatically destroys the user.
+
+
+Messaging
+
+To send a message from kernel-land, the ipmi_request() call does
+pretty much all message handling. Most of the parameter are
+self-explanatory. However, it takes a "msgid" parameter. This is NOT
+the sequence number of messages. It is simply a long value that is
+passed back when the response for the message is returned. You may
+use it for anything you like.
+
+Responses come back in the function pointed to by the ipmi_recv_hndl
+field of the "handler" that you passed in to ipmi_create_user().
+Remember again, these may be running at interrupt level. Remember to
+look at the receive type, too.
+
+From userland, you fill out an ipmi_req_t structure and use the
+IPMICTL_SEND_COMMAND ioctl. For incoming stuff, you can use select()
+or poll() to wait for messages to come in. However, you cannot use
+read() to get them, you must call the IPMICTL_RECEIVE_MSG with the
+ipmi_recv_t structure to actually get the message. Remember that you
+must supply a pointer to a block of data in the msg.data field, and
+you must fill in the msg.data_len field with the size of the data.
+This gives the receiver a place to actually put the message.
+
+If the message cannot fit into the data you provide, you will get an
+EMSGSIZE error and the driver will leave the data in the receive
+queue. If you want to get it and have it truncate the message, us
+the IPMICTL_RECEIVE_MSG_TRUNC ioctl.
+
+When you send a command (which is defined by the lowest-order bit of
+the netfn per the IPMI spec) on the IPMB bus, the driver will
+automatically assign the sequence number to the command and save the
+command. If the response is not receive in the IPMI-specified 5
+seconds, it will generate a response automatically saying the command
+timed out. If an unsolicited response comes in (if it was after 5
+seconds, for instance), that response will be ignored.
+
+In kernelland, after you receive a message and are done with it, you
+MUST call ipmi_free_recv_msg() on it, or you will leak messages. Note
+that you should NEVER mess with the "done" field of a message, that is
+required to properly clean up the message.
+
+Note that when sending, there is an ipmi_request_supply_msgs() call
+that lets you supply the smi and receive message. This is useful for
+pieces of code that need to work even if the system is out of buffers
+(the watchdog timer uses this, for instance). You supply your own
+buffer and own free routines. This is not recommended for normal use,
+though, since it is tricky to manage your own buffers.
+
+
+Events and Incoming Commands
+
+The driver takes care of polling for IPMI events and receiving
+commands (commands are messages that are not responses, they are
+commands that other things on the IPMB bus have sent you). To receive
+these, you must register for them, they will not automatically be sent
+to you.
+
+To receive events, you must call ipmi_set_gets_events() and set the
+"val" to non-zero. Any events that have been received by the driver
+since startup will immediately be delivered to the first user that
+registers for events. After that, if multiple users are registered
+for events, they will all receive all events that come in.
+
+For receiving commands, you have to individually register commands you
+want to receive. Call ipmi_register_for_cmd() and supply the netfn
+and command name for each command you want to receive. You also
+specify a bitmask of the channels you want to receive the command from
+(or use IPMI_CHAN_ALL for all channels if you don't care). Only one
+user may be registered for each netfn/cmd/channel, but different users
+may register for different commands, or the same command if the
+channel bitmasks do not overlap.
+
+From userland, equivalent IOCTLs are provided to do these functions.
+
+
+The Lower Layer (SMI) Interface
+-------------------------------
+
+As mentioned before, multiple SMI interfaces may be registered to the
+message handler, each of these is assigned an interface number when
+they register with the message handler. They are generally assigned
+in the order they register, although if an SMI unregisters and then
+another one registers, all bets are off.
+
+The ipmi_smi.h defines the interface for management interfaces, see
+that for more details.
+
+
+The SI Driver
+-------------
+
+The SI driver allows up to 4 KCS or SMIC interfaces to be configured
+in the system. By default, scan the ACPI tables for interfaces, and
+if it doesn't find any the driver will attempt to register one KCS
+interface at the spec-specified I/O port 0xca2 without interrupts.
+You can change this at module load time (for a module) with:
+
+ modprobe ipmi_si.o type=<type1>,<type2>....
+ ports=<port1>,<port2>... addrs=<addr1>,<addr2>...
+ irqs=<irq1>,<irq2>... trydefaults=[0|1]
+ regspacings=<sp1>,<sp2>,... regsizes=<size1>,<size2>,...
+ regshifts=<shift1>,<shift2>,...
+ slave_addrs=<addr1>,<addr2>,...
+ force_kipmid=<enable1>,<enable2>,...
+ unload_when_empty=[0|1]
+
+Each of these except si_trydefaults is a list, the first item for the
+first interface, second item for the second interface, etc.
+
+The si_type may be either "kcs", "smic", or "bt". If you leave it blank, it
+defaults to "kcs".
+
+If you specify si_addrs as non-zero for an interface, the driver will
+use the memory address given as the address of the device. This
+overrides si_ports.
+
+If you specify si_ports as non-zero for an interface, the driver will
+use the I/O port given as the device address.
+
+If you specify si_irqs as non-zero for an interface, the driver will
+attempt to use the given interrupt for the device.
+
+si_trydefaults sets whether the standard IPMI interface at 0xca2 and
+any interfaces specified by ACPE are tried. By default, the driver
+tries it, set this value to zero to turn this off.
+
+The next three parameters have to do with register layout. The
+registers used by the interfaces may not appear at successive
+locations and they may not be in 8-bit registers. These parameters
+allow the layout of the data in the registers to be more precisely
+specified.
+
+The regspacings parameter give the number of bytes between successive
+register start addresses. For instance, if the regspacing is set to 4
+and the start address is 0xca2, then the address for the second
+register would be 0xca6. This defaults to 1.
+
+The regsizes parameter gives the size of a register, in bytes. The
+data used by IPMI is 8-bits wide, but it may be inside a larger
+register. This parameter allows the read and write type to specified.
+It may be 1, 2, 4, or 8. The default is 1.
+
+Since the register size may be larger than 32 bits, the IPMI data may not
+be in the lower 8 bits. The regshifts parameter give the amount to shift
+the data to get to the actual IPMI data.
+
+The slave_addrs specifies the IPMI address of the local BMC. This is
+usually 0x20 and the driver defaults to that, but in case it's not, it
+can be specified when the driver starts up.
+
+The force_ipmid parameter forcefully enables (if set to 1) or disables
+(if set to 0) the kernel IPMI daemon. Normally this is auto-detected
+by the driver, but systems with broken interrupts might need an enable,
+or users that don't want the daemon (don't need the performance, don't
+want the CPU hit) can disable it.
+
+If unload_when_empty is set to 1, the driver will be unloaded if it
+doesn't find any interfaces or all the interfaces fail to work. The
+default is one. Setting to 0 is useful with the hotmod, but is
+obviously only useful for modules.
+
+When compiled into the kernel, the parameters can be specified on the
+kernel command line as:
+
+ ipmi_si.type=<type1>,<type2>...
+ ipmi_si.ports=<port1>,<port2>... ipmi_si.addrs=<addr1>,<addr2>...
+ ipmi_si.irqs=<irq1>,<irq2>... ipmi_si.trydefaults=[0|1]
+ ipmi_si.regspacings=<sp1>,<sp2>,...
+ ipmi_si.regsizes=<size1>,<size2>,...
+ ipmi_si.regshifts=<shift1>,<shift2>,...
+ ipmi_si.slave_addrs=<addr1>,<addr2>,...
+ ipmi_si.force_kipmid=<enable1>,<enable2>,...
+
+It works the same as the module parameters of the same names.
+
+By default, the driver will attempt to detect any device specified by
+ACPI, and if none of those then a KCS device at the spec-specified
+0xca2. If you want to turn this off, set the "trydefaults" option to
+false.
+
+If your IPMI interface does not support interrupts and is a KCS or
+SMIC interface, the IPMI driver will start a kernel thread for the
+interface to help speed things up. This is a low-priority kernel
+thread that constantly polls the IPMI driver while an IPMI operation
+is in progress. The force_kipmid module parameter will all the user to
+force this thread on or off. If you force it off and don't have
+interrupts, the driver will run VERY slowly. Don't blame me,
+these interfaces suck.
+
+The driver supports a hot add and remove of interfaces. This way,
+interfaces can be added or removed after the kernel is up and running.
+This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
+write-only parameter. You write a string to this interface. The string
+has the format:
+ <op1>[:op2[:op3...]]
+The "op"s are:
+ add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
+You can specify more than one interface on the line. The "opt"s are:
+ rsp=<regspacing>
+ rsi=<regsize>
+ rsh=<regshift>
+ irq=<irq>
+ ipmb=<ipmb slave addr>
+and these have the same meanings as discussed above. Note that you
+can also use this on the kernel command line for a more compact format
+for specifying an interface. Note that when removing an interface,
+only the first three parameters (si type, address type, and address)
+are used for the comparison. Any options are ignored for removing.
+
+The SMBus Driver
+----------------
+
+The SMBus driver allows up to 4 SMBus devices to be configured in the
+system. By default, the driver will register any SMBus interfaces it finds
+in the I2C address range of 0x20 to 0x4f on any adapter. You can change this
+at module load time (for a module) with:
+
+ modprobe ipmi_smb.o
+ addr=<adapter1>,<i2caddr1>[,<adapter2>,<i2caddr2>[,...]]
+ dbg=<flags1>,<flags2>...
+ [defaultprobe=1] [dbg_probe=1]
+
+The addresses are specified in pairs, the first is the adapter ID and the
+second is the I2C address on that adapter.
+
+The debug flags are bit flags for each BMC found, they are:
+IPMI messages: 1, driver state: 2, timing: 4, I2C probe: 8
+
+Setting smb_defaultprobe to zero disabled the default probing of SMBus
+interfaces at address range 0x20 to 0x4f. This means that only the
+BMCs specified on the smb_addr line will be detected.
+
+Setting smb_dbg_probe to 1 will enable debugging of the probing and
+detection process for BMCs on the SMBusses.
+
+Discovering the IPMI compliant BMC on the SMBus can cause devices
+on the I2C bus to fail. The SMBus driver writes a "Get Device ID" IPMI
+message as a block write to the I2C bus and waits for a response.
+This action can be detrimental to some I2C devices. It is highly recommended
+that the known I2c address be given to the SMBus driver in the smb_addr
+parameter. The default address range will not be used when a smb_addr
+parameter is provided.
+
+When compiled into the kernel, the addresses can be specified on the
+kernel command line as:
+
+ ipmb_smb.addr=<adapter1>,<i2caddr1>[,<adapter2>,<i2caddr2>[,...]]
+ ipmi_smb.dbg=<flags1>,<flags2>...
+ ipmi_smb.defaultprobe=0 ipmi_smb.dbg_probe=1
+
+These are the same options as on the module command line.
+
+Note that you might need some I2C changes if CONFIG_IPMI_PANIC_EVENT
+is enabled along with this, so the I2C driver knows to run to
+completion during sending a panic event.
+
+
+Other Pieces
+------------
+
+Watchdog
+--------
+
+A watchdog timer is provided that implements the Linux-standard
+watchdog timer interface. It has three module parameters that can be
+used to control it:
+
+ modprobe ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
+ preaction=<preaction type> preop=<preop type> start_now=x
+ nowayout=x ifnum_to_use=n
+
+ifnum_to_use specifies which interface the watchdog timer should use.
+The default is -1, which means to pick the first one registered.
+
+The timeout is the number of seconds to the action, and the pretimeout
+is the amount of seconds before the reset that the pre-timeout panic will
+occur (if pretimeout is zero, then pretimeout will not be enabled). Note
+that the pretimeout is the time before the final timeout. So if the
+timeout is 50 seconds and the pretimeout is 10 seconds, then the pretimeout
+will occur in 40 second (10 seconds before the timeout).
+
+The action may be "reset", "power_cycle", or "power_off", and
+specifies what to do when the timer times out, and defaults to
+"reset".
+
+The preaction may be "pre_smi" for an indication through the SMI
+interface, "pre_int" for an indication through the SMI with an
+interrupts, and "pre_nmi" for a NMI on a preaction. This is how
+the driver is informed of the pretimeout.
+
+The preop may be set to "preop_none" for no operation on a pretimeout,
+"preop_panic" to set the preoperation to panic, or "preop_give_data"
+to provide data to read from the watchdog device when the pretimeout
+occurs. A "pre_nmi" setting CANNOT be used with "preop_give_data"
+because you can't do data operations from an NMI.
+
+When preop is set to "preop_give_data", one byte comes ready to read
+on the device when the pretimeout occurs. Select and fasync work on
+the device, as well.
+
+If start_now is set to 1, the watchdog timer will start running as
+soon as the driver is loaded.
+
+If nowayout is set to 1, the watchdog timer will not stop when the
+watchdog device is closed. The default value of nowayout is true
+if the CONFIG_WATCHDOG_NOWAYOUT option is enabled, or false if not.
+
+When compiled into the kernel, the kernel command line is available
+for configuring the watchdog:
+
+ ipmi_watchdog.timeout=<t> ipmi_watchdog.pretimeout=<t>
+ ipmi_watchdog.action=<action type>
+ ipmi_watchdog.preaction=<preaction type>
+ ipmi_watchdog.preop=<preop type>
+ ipmi_watchdog.start_now=x
+ ipmi_watchdog.nowayout=x
+
+The options are the same as the module parameter options.
+
+The watchdog will panic and start a 120 second reset timeout if it
+gets a pre-action. During a panic or a reboot, the watchdog will
+start a 120 timer if it is running to make sure the reboot occurs.
+
+Note that if you use the NMI preaction for the watchdog, you MUST NOT
+use the nmi watchdog. There is no reasonable way to tell if an NMI
+comes from the IPMI controller, so it must assume that if it gets an
+otherwise unhandled NMI, it must be from IPMI and it will panic
+immediately.
+
+Once you open the watchdog timer, you must write a 'V' character to the
+device to close it, or the timer will not stop. This is a new semantic
+for the driver, but makes it consistent with the rest of the watchdog
+drivers in Linux.
+
+
+Panic Timeouts
+--------------
+
+The OpenIPMI driver supports the ability to put semi-custom and custom
+events in the system event log if a panic occurs. if you enable the
+'Generate a panic event to all BMCs on a panic' option, you will get
+one event on a panic in a standard IPMI event format. If you enable
+the 'Generate OEM events containing the panic string' option, you will
+also get a bunch of OEM events holding the panic string.
+
+
+The field settings of the events are:
+* Generator ID: 0x21 (kernel)
+* EvM Rev: 0x03 (this event is formatting in IPMI 1.0 format)
+* Sensor Type: 0x20 (OS critical stop sensor)
+* Sensor #: The first byte of the panic string (0 if no panic string)
+* Event Dir | Event Type: 0x6f (Assertion, sensor-specific event info)
+* Event Data 1: 0xa1 (Runtime stop in OEM bytes 2 and 3)
+* Event data 2: second byte of panic string
+* Event data 3: third byte of panic string
+See the IPMI spec for the details of the event layout. This event is
+always sent to the local management controller. It will handle routing
+the message to the right place
+
+Other OEM events have the following format:
+Record ID (bytes 0-1): Set by the SEL.
+Record type (byte 2): 0xf0 (OEM non-timestamped)
+byte 3: The slave address of the card saving the panic
+byte 4: A sequence number (starting at zero)
+The rest of the bytes (11 bytes) are the panic string. If the panic string
+is longer than 11 bytes, multiple messages will be sent with increasing
+sequence numbers.
+
+Because you cannot send OEM events using the standard interface, this
+function will attempt to find an SEL and add the events there. It
+will first query the capabilities of the local management controller.
+If it has an SEL, then they will be stored in the SEL of the local
+management controller. If not, and the local management controller is
+an event generator, the event receiver from the local management
+controller will be queried and the events sent to the SEL on that
+device. Otherwise, the events go nowhere since there is nowhere to
+send them.
+
+
+Poweroff
+--------
+
+If the poweroff capability is selected, the IPMI driver will install
+a shutdown function into the standard poweroff function pointer. This
+is in the ipmi_poweroff module. When the system requests a powerdown,
+it will send the proper IPMI commands to do this. This is supported on
+several platforms.
+
+There is a module parameter named "poweroff_powercycle" that may
+either be zero (do a power down) or non-zero (do a power cycle, power
+the system off, then power it on in a few seconds). Setting
+ipmi_poweroff.poweroff_control=x will do the same thing on the kernel
+command line. The parameter is also available via the proc filesystem
+in /proc/sys/dev/ipmi/poweroff_powercycle. Note that if the system
+does not support power cycling, it will always do the power off.
+
+The "ifnum_to_use" parameter specifies which interface the poweroff
+code should use. The default is -1, which means to pick the first one
+registered.
+
+Note that if you have ACPI enabled, the system will prefer using ACPI to
+power off.
diff --git a/Documentation/IRQ-affinity.txt b/Documentation/IRQ-affinity.txt
new file mode 100644
index 0000000..b4a615b
--- /dev/null
+++ b/Documentation/IRQ-affinity.txt
@@ -0,0 +1,56 @@
+ChangeLog:
+ Started by Ingo Molnar <mingo@redhat.com>
+ Update by Max Krasnyansky <maxk@qualcomm.com>
+
+SMP IRQ affinity
+
+/proc/irq/IRQ#/smp_affinity specifies which target CPUs are permitted
+for a given IRQ source. It's a bitmask of allowed CPUs. It's not allowed
+to turn off all CPUs, and if an IRQ controller does not support IRQ
+affinity then the value will not change from the default 0xffffffff.
+
+/proc/irq/default_smp_affinity specifies default affinity mask that applies
+to all non-active IRQs. Once IRQ is allocated/activated its affinity bitmask
+will be set to the default mask. It can then be changed as described above.
+Default mask is 0xffffffff.
+
+Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting
+it to CPU4-7 (this is an 8-CPU SMP box):
+
+[root@moon 44]# cd /proc/irq/44
+[root@moon 44]# cat smp_affinity
+ffffffff
+
+[root@moon 44]# echo 0f > smp_affinity
+[root@moon 44]# cat smp_affinity
+0000000f
+[root@moon 44]# ping -f h
+PING hell (195.4.7.3): 56 data bytes
+...
+--- hell ping statistics ---
+6029 packets transmitted, 6027 packets received, 0% packet loss
+round-trip min/avg/max = 0.1/0.1/0.4 ms
+[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
+ CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
+ 44: 1068 1785 1785 1783 0 0 0 0 IO-APIC-level eth1
+
+As can be seen from the line above IRQ44 was delivered only to the first four
+processors (0-3).
+Now lets restrict that IRQ to CPU(4-7).
+
+[root@moon 44]# echo f0 > smp_affinity
+[root@moon 44]# cat smp_affinity
+000000f0
+[root@moon 44]# ping -f h
+PING hell (195.4.7.3): 56 data bytes
+..
+--- hell ping statistics ---
+2779 packets transmitted, 2777 packets received, 0% packet loss
+round-trip min/avg/max = 0.1/0.5/585.4 ms
+[root@moon 44]# cat /proc/interrupts | 'CPU\|44:'
+ CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
+ 44: 1068 1785 1785 1783 1784 1069 1070 1069 IO-APIC-level eth1
+
+This time around IRQ44 was delivered only to the last four processors.
+i.e counters for the CPU0-3 did not change.
+
diff --git a/Documentation/IRQ.txt b/Documentation/IRQ.txt
new file mode 100644
index 0000000..1011e71
--- /dev/null
+++ b/Documentation/IRQ.txt
@@ -0,0 +1,22 @@
+What is an IRQ?
+
+An IRQ is an interrupt request from a device.
+Currently they can come in over a pin, or over a packet.
+Several devices may be connected to the same pin thus
+sharing an IRQ.
+
+An IRQ number is a kernel identifier used to talk about a hardware
+interrupt source. Typically this is an index into the global irq_desc
+array, but except for what linux/interrupt.h implements the details
+are architecture specific.
+
+An IRQ number is an enumeration of the possible interrupt sources on a
+machine. Typically what is enumerated is the number of input pins on
+all of the interrupt controller in the system. In the case of ISA
+what is enumerated are the 16 input pins on the two i8259 interrupt
+controllers.
+
+Architectures can assign additional meaning to the IRQ numbers, and
+are encouraged to in the case where there is any manual configuration
+of the hardware involved. The ISA IRQs are a classic example of
+assigning this kind of additional meaning.
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt
new file mode 100644
index 0000000..21bc416
--- /dev/null
+++ b/Documentation/Intel-IOMMU.txt
@@ -0,0 +1,115 @@
+Linux IOMMU Support
+===================
+
+The architecture spec can be obtained from the below location.
+
+http://www.intel.com/technology/virtualization/
+
+This guide gives a quick cheat sheet for some basic understanding.
+
+Some Keywords
+
+DMAR - DMA remapping
+DRHD - DMA Engine Reporting Structure
+RMRR - Reserved memory Region Reporting Structure
+ZLR - Zero length reads from PCI devices
+IOVA - IO Virtual address.
+
+Basic stuff
+-----------
+
+ACPI enumerates and lists the different DMA engines in the platform, and
+device scope relationships between PCI devices and which DMA engine controls
+them.
+
+What is RMRR?
+-------------
+
+There are some devices the BIOS controls, for e.g USB devices to perform
+PS2 emulation. The regions of memory used for these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence BIOS uses RMRR to specify these regions along with
+devices that need to access these regions. OS is expected to setup
+unity mappings for these regions for these devices to access these regions.
+
+How is IOVA generated?
+---------------------
+
+Well behaved drivers call pci_map_*() calls before sending command to device
+that needs to perform DMA. Once DMA is completed and mapping is no longer
+required, device performs a pci_unmap_*() calls to unmap the region.
+
+The Intel IOMMU driver allocates a virtual address per domain. Each PCIE
+device has its own domain (hence protection). Devices under p2p bridges
+share the virtual address with all devices under the p2p bridge due to
+transaction id aliasing for p2p bridges.
+
+IOVA generation is pretty generic. We used the same technique as vmalloc()
+but these are not global address spaces, but separate for each domain.
+Different DMA engines may support different number of domains.
+
+We also allocate guard pages with each mapping, so we can attempt to catch
+any overflow that might happen.
+
+
+Graphics Problems?
+------------------
+If you encounter issues with graphics devices, you can try adding
+option intel_iommu=igfx_off to turn off the integrated graphics engine.
+
+If it happens to be a PCI device included in the INCLUDE_ALL Engine,
+then try enabling CONFIG_DMAR_GFX_WA to setup a 1-1 map. We hear
+graphics drivers may be in process of using DMA api's in the near
+future and at that time this option can be yanked out.
+
+Some exceptions to IOVA
+-----------------------
+Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
+The same is true for peer to peer transactions. Hence we reserve the
+address from PCI MMIO ranges so they are not allocated for IOVA addresses.
+
+
+Fault reporting
+---------------
+When errors are reported, the DMA engine signals via an interrupt. The fault
+reason and device that caused it with fault reason is printed on console.
+
+See below for sample.
+
+
+Boot Message Sample
+-------------------
+
+Something like this gets printed indicating presence of DMAR tables
+in ACPI.
+
+ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
+
+When DMAR is being processed and initialized by ACPI, prints DMAR locations
+and any RMRR's processed.
+
+ACPI DMAR:Host address width 36
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
+ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
+ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
+ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
+
+When DMAR is enabled for use, you will notice..
+
+PCI-DMA: Using DMAR IOMMU
+
+Fault reporting
+---------------
+
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+
+TBD
+----
+
+- For compatibility testing, could use unity map domain for all devices, just
+ provide a 1-1 for all useful memory under a single domain for all devices.
+- API for paravirt ops for abstracting functionality for VMM folks.
diff --git a/Documentation/Makefile b/Documentation/Makefile
new file mode 100644
index 0000000..94b9457
--- /dev/null
+++ b/Documentation/Makefile
@@ -0,0 +1,3 @@
+obj-m := DocBook/ accounting/ auxdisplay/ connector/ \
+ filesystems/configfs/ ia64/ networking/ \
+ pcmcia/ spi/ video4linux/ vm/ watchdog/src/
diff --git a/Documentation/ManagementStyle b/Documentation/ManagementStyle
new file mode 100644
index 0000000..a5f0ea5
--- /dev/null
+++ b/Documentation/ManagementStyle
@@ -0,0 +1,276 @@
+
+ Linux kernel management style
+
+This is a short document describing the preferred (or made up, depending
+on who you ask) management style for the linux kernel. It's meant to
+mirror the CodingStyle document to some degree, and mainly written to
+avoid answering (*) the same (or similar) questions over and over again.
+
+Management style is very personal and much harder to quantify than
+simple coding style rules, so this document may or may not have anything
+to do with reality. It started as a lark, but that doesn't mean that it
+might not actually be true. You'll have to decide for yourself.
+
+Btw, when talking about "kernel manager", it's all about the technical
+lead persons, not the people who do traditional management inside
+companies. If you sign purchase orders or you have any clue about the
+budget of your group, you're almost certainly not a kernel manager.
+These suggestions may or may not apply to you.
+
+First off, I'd suggest buying "Seven Habits of Highly Effective
+People", and NOT read it. Burn it, it's a great symbolic gesture.
+
+(*) This document does so not so much by answering the question, but by
+making it painfully obvious to the questioner that we don't have a clue
+to what the answer is.
+
+Anyway, here goes:
+
+
+ Chapter 1: Decisions
+
+Everybody thinks managers make decisions, and that decision-making is
+important. The bigger and more painful the decision, the bigger the
+manager must be to make it. That's very deep and obvious, but it's not
+actually true.
+
+The name of the game is to _avoid_ having to make a decision. In
+particular, if somebody tells you "choose (a) or (b), we really need you
+to decide on this", you're in trouble as a manager. The people you
+manage had better know the details better than you, so if they come to
+you for a technical decision, you're screwed. You're clearly not
+competent to make that decision for them.
+
+(Corollary:if the people you manage don't know the details better than
+you, you're also screwed, although for a totally different reason.
+Namely that you are in the wrong job, and that _they_ should be managing
+your brilliance instead).
+
+So the name of the game is to _avoid_ decisions, at least the big and
+painful ones. Making small and non-consequential decisions is fine, and
+makes you look like you know what you're doing, so what a kernel manager
+needs to do is to turn the big and painful ones into small things where
+nobody really cares.
+
+It helps to realize that the key difference between a big decision and a
+small one is whether you can fix your decision afterwards. Any decision
+can be made small by just always making sure that if you were wrong (and
+you _will_ be wrong), you can always undo the damage later by
+backtracking. Suddenly, you get to be doubly managerial for making
+_two_ inconsequential decisions - the wrong one _and_ the right one.
+
+And people will even see that as true leadership (*cough* bullshit
+*cough*).
+
+Thus the key to avoiding big decisions becomes to just avoiding to do
+things that can't be undone. Don't get ushered into a corner from which
+you cannot escape. A cornered rat may be dangerous - a cornered manager
+is just pitiful.
+
+It turns out that since nobody would be stupid enough to ever really let
+a kernel manager have huge fiscal responsibility _anyway_, it's usually
+fairly easy to backtrack. Since you're not going to be able to waste
+huge amounts of money that you might not be able to repay, the only
+thing you can backtrack on is a technical decision, and there
+back-tracking is very easy: just tell everybody that you were an
+incompetent nincompoop, say you're sorry, and undo all the worthless
+work you had people work on for the last year. Suddenly the decision
+you made a year ago wasn't a big decision after all, since it could be
+easily undone.
+
+It turns out that some people have trouble with this approach, for two
+reasons:
+ - admitting you were an idiot is harder than it looks. We all like to
+ maintain appearances, and coming out in public to say that you were
+ wrong is sometimes very hard indeed.
+ - having somebody tell you that what you worked on for the last year
+ wasn't worthwhile after all can be hard on the poor lowly engineers
+ too, and while the actual _work_ was easy enough to undo by just
+ deleting it, you may have irrevocably lost the trust of that
+ engineer. And remember: "irrevocable" was what we tried to avoid in
+ the first place, and your decision ended up being a big one after
+ all.
+
+Happily, both of these reasons can be mitigated effectively by just
+admitting up-front that you don't have a friggin' clue, and telling
+people ahead of the fact that your decision is purely preliminary, and
+might be the wrong thing. You should always reserve the right to change
+your mind, and make people very _aware_ of that. And it's much easier
+to admit that you are stupid when you haven't _yet_ done the really
+stupid thing.
+
+Then, when it really does turn out to be stupid, people just roll their
+eyes and say "Oops, he did it again".
+
+This preemptive admission of incompetence might also make the people who
+actually do the work also think twice about whether it's worth doing or
+not. After all, if _they_ aren't certain whether it's a good idea, you
+sure as hell shouldn't encourage them by promising them that what they
+work on will be included. Make them at least think twice before they
+embark on a big endeavor.
+
+Remember: they'd better know more about the details than you do, and
+they usually already think they have the answer to everything. The best
+thing you can do as a manager is not to instill confidence, but rather a
+healthy dose of critical thinking on what they do.
+
+Btw, another way to avoid a decision is to plaintively just whine "can't
+we just do both?" and look pitiful. Trust me, it works. If it's not
+clear which approach is better, they'll eventually figure it out. The
+answer may end up being that both teams get so frustrated by the
+situation that they just give up.
+
+That may sound like a failure, but it's usually a sign that there was
+something wrong with both projects, and the reason the people involved
+couldn't decide was that they were both wrong. You end up coming up
+smelling like roses, and you avoided yet another decision that you could
+have screwed up on.
+
+
+ Chapter 2: People
+
+Most people are idiots, and being a manager means you'll have to deal
+with it, and perhaps more importantly, that _they_ have to deal with
+_you_.
+
+It turns out that while it's easy to undo technical mistakes, it's not
+as easy to undo personality disorders. You just have to live with
+theirs - and yours.
+
+However, in order to prepare yourself as a kernel manager, it's best to
+remember not to burn any bridges, bomb any innocent villagers, or
+alienate too many kernel developers. It turns out that alienating people
+is fairly easy, and un-alienating them is hard. Thus "alienating"
+immediately falls under the heading of "not reversible", and becomes a
+no-no according to Chapter 1.
+
+There's just a few simple rules here:
+ (1) don't call people d*ckheads (at least not in public)
+ (2) learn how to apologize when you forgot rule (1)
+
+The problem with #1 is that it's very easy to do, since you can say
+"you're a d*ckhead" in millions of different ways (*), sometimes without
+even realizing it, and almost always with a white-hot conviction that
+you are right.
+
+And the more convinced you are that you are right (and let's face it,
+you can call just about _anybody_ a d*ckhead, and you often _will_ be
+right), the harder it ends up being to apologize afterwards.
+
+To solve this problem, you really only have two options:
+ - get really good at apologies
+ - spread the "love" out so evenly that nobody really ends up feeling
+ like they get unfairly targeted. Make it inventive enough, and they
+ might even be amused.
+
+The option of being unfailingly polite really doesn't exist. Nobody will
+trust somebody who is so clearly hiding his true character.
+
+(*) Paul Simon sang "Fifty Ways to Leave Your Lover", because quite
+frankly, "A Million Ways to Tell a Developer He Is a D*ckhead" doesn't
+scan nearly as well. But I'm sure he thought about it.
+
+
+ Chapter 3: People II - the Good Kind
+
+While it turns out that most people are idiots, the corollary to that is
+sadly that you are one too, and that while we can all bask in the secure
+knowledge that we're better than the average person (let's face it,
+nobody ever believes that they're average or below-average), we should
+also admit that we're not the sharpest knife around, and there will be
+other people that are less of an idiot that you are.
+
+Some people react badly to smart people. Others take advantage of them.
+
+Make sure that you, as a kernel maintainer, are in the second group.
+Suck up to them, because they are the people who will make your job
+easier. In particular, they'll be able to make your decisions for you,
+which is what the game is all about.
+
+So when you find somebody smarter than you are, just coast along. Your
+management responsibilities largely become ones of saying "Sounds like a
+good idea - go wild", or "That sounds good, but what about xxx?". The
+second version in particular is a great way to either learn something
+new about "xxx" or seem _extra_ managerial by pointing out something the
+smarter person hadn't thought about. In either case, you win.
+
+One thing to look out for is to realize that greatness in one area does
+not necessarily translate to other areas. So you might prod people in
+specific directions, but let's face it, they might be good at what they
+do, and suck at everything else. The good news is that people tend to
+naturally gravitate back to what they are good at, so it's not like you
+are doing something irreversible when you _do_ prod them in some
+direction, just don't push too hard.
+
+
+ Chapter 4: Placing blame
+
+Things will go wrong, and people want somebody to blame. Tag, you're it.
+
+It's not actually that hard to accept the blame, especially if people
+kind of realize that it wasn't _all_ your fault. Which brings us to the
+best way of taking the blame: do it for another guy. You'll feel good
+for taking the fall, he'll feel good about not getting blamed, and the
+guy who lost his whole 36GB porn-collection because of your incompetence
+will grudgingly admit that you at least didn't try to weasel out of it.
+
+Then make the developer who really screwed up (if you can find him) know
+_in_private_ that he screwed up. Not just so he can avoid it in the
+future, but so that he knows he owes you one. And, perhaps even more
+importantly, he's also likely the person who can fix it. Because, let's
+face it, it sure ain't you.
+
+Taking the blame is also why you get to be manager in the first place.
+It's part of what makes people trust you, and allow you the potential
+glory, because you're the one who gets to say "I screwed up". And if
+you've followed the previous rules, you'll be pretty good at saying that
+by now.
+
+
+ Chapter 5: Things to avoid
+
+There's one thing people hate even more than being called "d*ckhead",
+and that is being called a "d*ckhead" in a sanctimonious voice. The
+first you can apologize for, the second one you won't really get the
+chance. They likely will no longer be listening even if you otherwise
+do a good job.
+
+We all think we're better than anybody else, which means that when
+somebody else puts on airs, it _really_ rubs us the wrong way. You may
+be morally and intellectually superior to everybody around you, but
+don't try to make it too obvious unless you really _intend_ to irritate
+somebody (*).
+
+Similarly, don't be too polite or subtle about things. Politeness easily
+ends up going overboard and hiding the problem, and as they say, "On the
+internet, nobody can hear you being subtle". Use a big blunt object to
+hammer the point in, because you can't really depend on people getting
+your point otherwise.
+
+Some humor can help pad both the bluntness and the moralizing. Going
+overboard to the point of being ridiculous can drive a point home
+without making it painful to the recipient, who just thinks you're being
+silly. It can thus help get through the personal mental block we all
+have about criticism.
+
+(*) Hint: internet newsgroups that are not directly related to your work
+are great ways to take out your frustrations at other people. Write
+insulting posts with a sneer just to get into a good flame every once in
+a while, and you'll feel cleansed. Just don't crap too close to home.
+
+
+ Chapter 6: Why me?
+
+Since your main responsibility seems to be to take the blame for other
+peoples mistakes, and make it painfully obvious to everybody else that
+you're incompetent, the obvious question becomes one of why do it in the
+first place?
+
+First off, while you may or may not get screaming teenage girls (or
+boys, let's not be judgmental or sexist here) knocking on your dressing
+room door, you _will_ get an immense feeling of personal accomplishment
+for being "in charge". Never mind the fact that you're really leading
+by trying to keep up with everybody else and running after them as fast
+as you can. Everybody will still think you're the person in charge.
+
+It's a great job if you can hack it.
diff --git a/Documentation/PCI/00-INDEX b/Documentation/PCI/00-INDEX
new file mode 100644
index 0000000..812b17f
--- /dev/null
+++ b/Documentation/PCI/00-INDEX
@@ -0,0 +1,14 @@
+00-INDEX
+ - this file
+MSI-HOWTO.txt
+ - the Message Signaled Interrupts (MSI) Driver Guide HOWTO and FAQ.
+PCI-DMA-mapping.txt
+ - info for PCI drivers using DMA portably across all platforms
+PCIEBUS-HOWTO.txt
+ - a guide describing the PCI Express Port Bus driver
+pci-error-recovery.txt
+ - info on PCI error recovery
+pci.txt
+ - info on the PCI subsystem for device driver authors
+pcieaer-howto.txt
+ - the PCI Express Advanced Error Reporting Driver Guide HOWTO
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
new file mode 100644
index 0000000..256defd
--- /dev/null
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -0,0 +1,509 @@
+ The MSI Driver Guide HOWTO
+ Tom L Nguyen tom.l.nguyen@intel.com
+ 10/03/2003
+ Revised Feb 12, 2004 by Martine Silbermann
+ email: Martine.Silbermann@hp.com
+ Revised Jun 25, 2004 by Tom L Nguyen
+
+1. About this guide
+
+This guide describes the basics of Message Signaled Interrupts (MSI),
+the advantages of using MSI over traditional interrupt mechanisms,
+and how to enable your driver to use MSI or MSI-X. Also included is
+a Frequently Asked Questions (FAQ) section.
+
+1.1 Terminology
+
+PCI devices can be single-function or multi-function. In either case,
+when this text talks about enabling or disabling MSI on a "device
+function," it is referring to one specific PCI device and function and
+not to all functions on a PCI device (unless the PCI device has only
+one function).
+
+2. Copyright 2003 Intel Corporation
+
+3. What is MSI/MSI-X?
+
+Message Signaled Interrupt (MSI), as described in the PCI Local Bus
+Specification Revision 2.3 or later, is an optional feature, and a
+required feature for PCI Express devices. MSI enables a device function
+to request service by sending an Inbound Memory Write on its PCI bus to
+the FSB as a Message Signal Interrupt transaction. Because MSI is
+generated in the form of a Memory Write, all transaction conditions,
+such as a Retry, Master-Abort, Target-Abort or normal completion, are
+supported.
+
+A PCI device that supports MSI must also support pin IRQ assertion
+interrupt mechanism to provide backward compatibility for systems that
+do not support MSI. In systems which support MSI, the bus driver is
+responsible for initializing the message address and message data of
+the device function's MSI/MSI-X capability structure during device
+initial configuration.
+
+An MSI capable device function indicates MSI support by implementing
+the MSI/MSI-X capability structure in its PCI capability list. The
+device function may implement both the MSI capability structure and
+the MSI-X capability structure; however, the bus driver should not
+enable both.
+
+The MSI capability structure contains Message Control register,
+Message Address register and Message Data register. These registers
+provide the bus driver control over MSI. The Message Control register
+indicates the MSI capability supported by the device. The Message
+Address register specifies the target address and the Message Data
+register specifies the characteristics of the message. To request
+service, the device function writes the content of the Message Data
+register to the target address. The device and its software driver
+are prohibited from writing to these registers.
+
+The MSI-X capability structure is an optional extension to MSI. It
+uses an independent and separate capability structure. There are
+some key advantages to implementing the MSI-X capability structure
+over the MSI capability structure as described below.
+
+ - Support a larger maximum number of vectors per function.
+
+ - Provide the ability for system software to configure
+ each vector with an independent message address and message
+ data, specified by a table that resides in Memory Space.
+
+ - MSI and MSI-X both support per-vector masking. Per-vector
+ masking is an optional extension of MSI but a required
+ feature for MSI-X. Per-vector masking provides the kernel the
+ ability to mask/unmask a single MSI while running its
+ interrupt service routine. If per-vector masking is
+ not supported, then the device driver should provide the
+ hardware/software synchronization to ensure that the device
+ generates MSI when the driver wants it to do so.
+
+4. Why use MSI?
+
+As a benefit to the simplification of board design, MSI allows board
+designers to remove out-of-band interrupt routing. MSI is another
+step towards a legacy-free environment.
+
+Due to increasing pressure on chipset and processor packages to
+reduce pin count, the need for interrupt pins is expected to
+diminish over time. Devices, due to pin constraints, may implement
+messages to increase performance.
+
+PCI Express endpoints uses INTx emulation (in-band messages) instead
+of IRQ pin assertion. Using INTx emulation requires interrupt
+sharing among devices connected to the same node (PCI bridge) while
+MSI is unique (non-shared) and does not require BIOS configuration
+support. As a result, the PCI Express technology requires MSI
+support for better interrupt performance.
+
+Using MSI enables the device functions to support two or more
+vectors, which can be configured to target different CPUs to
+increase scalability.
+
+5. Configuring a driver to use MSI/MSI-X
+
+By default, the kernel will not enable MSI/MSI-X on all devices that
+support this capability. The CONFIG_PCI_MSI kernel option
+must be selected to enable MSI/MSI-X support.
+
+5.1 Including MSI/MSI-X support into the kernel
+
+To allow MSI/MSI-X capable device drivers to selectively enable
+MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described
+below), the VECTOR based scheme needs to be enabled by setting
+CONFIG_PCI_MSI during kernel config.
+
+Since the target of the inbound message is the local APIC, providing
+CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI.
+
+5.2 Configuring for MSI support
+
+Due to the non-contiguous fashion in vector assignment of the
+existing Linux kernel, this version does not support multiple
+messages regardless of a device function is capable of supporting
+more than one vector. To enable MSI on a device function's MSI
+capability structure requires a device driver to call the function
+pci_enable_msi() explicitly.
+
+5.2.1 API pci_enable_msi
+
+int pci_enable_msi(struct pci_dev *dev)
+
+With this new API, a device driver that wants to have MSI
+enabled on its device function must call this API to enable MSI.
+A successful call will initialize the MSI capability structure
+with ONE vector, regardless of whether a device function is
+capable of supporting multiple messages. This vector replaces the
+pre-assigned dev->irq with a new MSI vector. To avoid a conflict
+of the new assigned vector with existing pre-assigned vector requires
+a device driver to call this API before calling request_irq().
+
+5.2.2 API pci_disable_msi
+
+void pci_disable_msi(struct pci_dev *dev)
+
+This API should always be used to undo the effect of pci_enable_msi()
+when a device driver is unloading. This API restores dev->irq with
+the pre-assigned IOAPIC vector and switches a device's interrupt
+mode to PCI pin-irq assertion/INTx emulation mode.
+
+Note that a device driver should always call free_irq() on the MSI vector
+that it has done request_irq() on before calling this API. Failure to do
+so results in a BUG_ON() and a device will be left with MSI enabled and
+leaks its vector.
+
+5.2.3 MSI mode vs. legacy mode diagram
+
+The below diagram shows the events which switch the interrupt
+mode on the MSI-capable device function between MSI mode and
+PIN-IRQ assertion mode.
+
+ ------------ pci_enable_msi ------------------------
+ | | <=============== | |
+ | MSI MODE | | PIN-IRQ ASSERTION MODE |
+ | | ===============> | |
+ ------------ pci_disable_msi ------------------------
+
+
+Figure 1. MSI Mode vs. Legacy Mode
+
+In Figure 1, a device operates by default in legacy mode. Legacy
+in this context means PCI pin-irq assertion or PCI-Express INTx
+emulation. A successful MSI request (using pci_enable_msi()) switches
+a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector
+stored in dev->irq will be saved by the PCI subsystem and a new
+assigned MSI vector will replace dev->irq.
+
+To return back to its default mode, a device driver should always call
+pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a
+device driver should always call free_irq() on the MSI vector it has
+done request_irq() on before calling pci_disable_msi(). Failure to do
+so results in a BUG_ON() and a device will be left with MSI enabled and
+leaks its vector. Otherwise, the PCI subsystem restores a device's
+dev->irq with a pre-assigned IOAPIC vector and marks the released
+MSI vector as unused.
+
+Once being marked as unused, there is no guarantee that the PCI
+subsystem will reserve this MSI vector for a device. Depending on
+the availability of current PCI vector resources and the number of
+MSI/MSI-X requests from other drivers, this MSI may be re-assigned.
+
+For the case where the PCI subsystem re-assigns this MSI vector to
+another driver, a request to switch back to MSI mode may result
+in being assigned a different MSI vector or a failure if no more
+vectors are available.
+
+5.3 Configuring for MSI-X support
+
+Due to the ability of the system software to configure each vector of
+the MSI-X capability structure with an independent message address
+and message data, the non-contiguous fashion in vector assignment of
+the existing Linux kernel has no impact on supporting multiple
+messages on an MSI-X capable device functions. To enable MSI-X on
+a device function's MSI-X capability structure requires its device
+driver to call the function pci_enable_msix() explicitly.
+
+The function pci_enable_msix(), once invoked, enables either
+all or nothing, depending on the current availability of PCI vector
+resources. If the PCI vector resources are available for the number
+of vectors requested by a device driver, this function will configure
+the MSI-X table of the MSI-X capability structure of a device with
+requested messages. To emphasize this reason, for example, a device
+may be capable for supporting the maximum of 32 vectors while its
+software driver usually may request 4 vectors. It is recommended
+that the device driver should call this function once during the
+initialization phase of the device driver.
+
+Unlike the function pci_enable_msi(), the function pci_enable_msix()
+does not replace the pre-assigned IOAPIC dev->irq with a new MSI
+vector because the PCI subsystem writes the 1:1 vector-to-entry mapping
+into the field vector of each element contained in a second argument.
+Note that the pre-assigned IOAPIC dev->irq is valid only if the device
+operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at
+using dev->irq by the device driver to request for interrupt service
+may result in unpredictable behavior.
+
+For each MSI-X vector granted, a device driver is responsible for calling
+other functions like request_irq(), enable_irq(), etc. to enable
+this vector with its corresponding interrupt service handler. It is
+a device driver's choice to assign all vectors with the same
+interrupt service handler or each vector with a unique interrupt
+service handler.
+
+5.3.1 Handling MMIO address space of MSI-X Table
+
+The PCI 3.0 specification has implementation notes that MMIO address
+space for a device's MSI-X structure should be isolated so that the
+software system can set different pages for controlling accesses to the
+MSI-X structure. The implementation of MSI support requires the PCI
+subsystem, not a device driver, to maintain full control of the MSI-X
+table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X
+table/MSI-X PBA. A device driver should not access the MMIO address
+space of the MSI-X table/MSI-X PBA.
+
+5.3.2 API pci_enable_msix
+
+int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
+
+This API enables a device driver to request the PCI subsystem
+to enable MSI-X messages on its hardware device. Depending on
+the availability of PCI vectors resources, the PCI subsystem enables
+either all or none of the requested vectors.
+
+Argument 'dev' points to the device (pci_dev) structure.
+
+Argument 'entries' is a pointer to an array of msix_entry structs.
+The number of entries is indicated in argument 'nvec'.
+struct msix_entry is defined in /driver/pci/msi.h:
+
+struct msix_entry {
+ u16 vector; /* kernel uses to write alloc vector */
+ u16 entry; /* driver uses to specify entry */
+};
+
+A device driver is responsible for initializing the field 'entry' of
+each element with a unique entry supported by MSI-X table. Otherwise,
+-EINVAL will be returned as a result. A successful return of zero
+indicates the PCI subsystem completed initializing each of the requested
+entries of the MSI-X table with message address and message data.
+Last but not least, the PCI subsystem will write the 1:1
+vector-to-entry mapping into the field 'vector' of each element. A
+device driver is responsible for keeping track of allocated MSI-X
+vectors in its internal data structure.
+
+A return of zero indicates that the number of MSI-X vectors was
+successfully allocated. A return of greater than zero indicates
+MSI-X vector shortage. Or a return of less than zero indicates
+a failure. This failure may be a result of duplicate entries
+specified in second argument, or a result of no available vector,
+or a result of failing to initialize MSI-X table entries.
+
+5.3.3 API pci_disable_msix
+
+void pci_disable_msix(struct pci_dev *dev)
+
+This API should always be used to undo the effect of pci_enable_msix()
+when a device driver is unloading. Note that a device driver should
+always call free_irq() on all MSI-X vectors it has done request_irq()
+on before calling this API. Failure to do so results in a BUG_ON() and
+a device will be left with MSI-X enabled and leaks its vectors.
+
+5.3.4 MSI-X mode vs. legacy mode diagram
+
+The below diagram shows the events which switch the interrupt
+mode on the MSI-X capable device function between MSI-X mode and
+PIN-IRQ assertion mode (legacy).
+
+ ------------ pci_enable_msix(,,n) ------------------------
+ | | <=============== | |
+ | MSI-X MODE | | PIN-IRQ ASSERTION MODE |
+ | | ===============> | |
+ ------------ pci_disable_msix ------------------------
+
+Figure 2. MSI-X Mode vs. Legacy Mode
+
+In Figure 2, a device operates by default in legacy mode. A
+successful MSI-X request (using pci_enable_msix()) switches a
+device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector
+stored in dev->irq will be saved by the PCI subsystem; however,
+unlike MSI mode, the PCI subsystem will not replace dev->irq with
+assigned MSI-X vector because the PCI subsystem already writes the 1:1
+vector-to-entry mapping into the field 'vector' of each element
+specified in second argument.
+
+To return back to its default mode, a device driver should always call
+pci_disable_msix() to undo the effect of pci_enable_msix(). Note that
+a device driver should always call free_irq() on all MSI-X vectors it
+has done request_irq() on before calling pci_disable_msix(). Failure
+to do so results in a BUG_ON() and a device will be left with MSI-X
+enabled and leaks its vectors. Otherwise, the PCI subsystem switches a
+device function's interrupt mode from MSI-X mode to legacy mode and
+marks all allocated MSI-X vectors as unused.
+
+Once being marked as unused, there is no guarantee that the PCI
+subsystem will reserve these MSI-X vectors for a device. Depending on
+the availability of current PCI vector resources and the number of
+MSI/MSI-X requests from other drivers, these MSI-X vectors may be
+re-assigned.
+
+For the case where the PCI subsystem re-assigned these MSI-X vectors
+to other drivers, a request to switch back to MSI-X mode may result
+being assigned with another set of MSI-X vectors or a failure if no
+more vectors are available.
+
+5.4 Handling function implementing both MSI and MSI-X capabilities
+
+For the case where a function implements both MSI and MSI-X
+capabilities, the PCI subsystem enables a device to run either in MSI
+mode or MSI-X mode but not both. A device driver determines whether it
+wants MSI or MSI-X enabled on its hardware device. Once a device
+driver requests for MSI, for example, it is prohibited from requesting
+MSI-X; in other words, a device driver is not permitted to ping-pong
+between MSI mod MSI-X mode during a run-time.
+
+5.5 Hardware requirements for MSI/MSI-X support
+
+MSI/MSI-X support requires support from both system hardware and
+individual hardware device functions.
+
+5.5.1 Required x86 hardware support
+
+Since the target of MSI address is the local APIC CPU, enabling
+MSI/MSI-X support in the Linux kernel is dependent on whether existing
+system hardware supports local APIC. Users should verify that their
+system supports local APIC operation by testing that it runs when
+CONFIG_X86_LOCAL_APIC=y.
+
+In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set;
+however, in UP environment, users must manually set
+CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting
+CONFIG_PCI_MSI enables the VECTOR based scheme and the option for
+MSI-capable device drivers to selectively enable MSI/MSI-X.
+
+Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X
+vector is allocated new during runtime and MSI/MSI-X support does not
+depend on BIOS support. This key independency enables MSI/MSI-X
+support on future IOxAPIC free platforms.
+
+5.5.2 Device hardware support
+
+The hardware device function supports MSI by indicating the
+MSI/MSI-X capability structure on its PCI capability list. By
+default, this capability structure will not be initialized by
+the kernel to enable MSI during the system boot. In other words,
+the device function is running on its default pin assertion mode.
+Note that in many cases the hardware supporting MSI have bugs,
+which may result in system hangs. The software driver of specific
+MSI-capable hardware is responsible for deciding whether to call
+pci_enable_msi or not. A return of zero indicates the kernel
+successfully initialized the MSI/MSI-X capability structure of the
+device function. The device function is now running on MSI/MSI-X mode.
+
+5.6 How to tell whether MSI/MSI-X is enabled on device function
+
+At the driver level, a return of zero from the function call of
+pci_enable_msi()/pci_enable_msix() indicates to a device driver that
+its device function is initialized successfully and ready to run in
+MSI/MSI-X mode.
+
+At the user level, users can use the command 'cat /proc/interrupts'
+to display the vectors allocated for devices and their interrupt
+MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is
+enabled on a SCSI Adaptec 39320D Ultra320 controller.
+
+ CPU0 CPU1
+ 0: 324639 0 IO-APIC-edge timer
+ 1: 1186 0 IO-APIC-edge i8042
+ 2: 0 0 XT-PIC cascade
+ 12: 2797 0 IO-APIC-edge i8042
+ 14: 6543 0 IO-APIC-edge ide0
+ 15: 1 0 IO-APIC-edge ide1
+169: 0 0 IO-APIC-level uhci-hcd
+185: 0 0 IO-APIC-level uhci-hcd
+193: 138 10 PCI-MSI aic79xx
+201: 30 0 PCI-MSI aic79xx
+225: 30 0 IO-APIC-level aic7xxx
+233: 30 0 IO-APIC-level aic7xxx
+NMI: 0 0
+LOC: 324553 325068
+ERR: 0
+MIS: 0
+
+6. MSI quirks
+
+Several PCI chipsets or devices are known to not support MSI.
+The PCI stack provides 3 possible levels of MSI disabling:
+* on a single device
+* on all devices behind a specific bridge
+* globally
+
+6.1. Disabling MSI on a single device
+
+Under some circumstances it might be required to disable MSI on a
+single device. This may be achieved by either not calling pci_enable_msi()
+or all, or setting the pci_dev->no_msi flag before (most of the time
+in a quirk).
+
+6.2. Disabling MSI below a bridge
+
+The vast majority of MSI quirks are required by PCI bridges not
+being able to route MSI between busses. In this case, MSI have to be
+disabled on all devices behind this bridge. It is achieves by setting
+the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge
+subordinate bus. There is no need to set the same flag on bridges that
+are below the broken bridge. When pci_enable_msi() is called to enable
+MSI on a device, pci_msi_supported() takes care of checking the NO_MSI
+flag in all parent busses of the device.
+
+Some bridges actually support dynamic MSI support enabling/disabling
+by changing some bits in their PCI configuration space (especially
+the Hypertransport chipsets such as the nVidia nForce and Serverworks
+HT2000). It may then be required to update the NO_MSI flag on the
+corresponding devices in the sysfs hierarchy. To enable MSI support
+on device "0000:00:0e", do:
+
+ echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus
+
+To disable MSI support, echo 0 instead of 1. Note that it should be
+used with caution since changing this value might break interrupts.
+
+6.3. Disabling MSI globally
+
+Some extreme cases may require to disable MSI globally on the system.
+For now, the only known case is a Serverworks PCI-X chipsets (MSI are
+not supported on several busses that are not all connected to the
+chipset in the Linux PCI hierarchy). In the vast majority of other
+cases, disabling only behind a specific bridge is enough.
+
+For debugging purpose, the user may also pass pci=nomsi on the kernel
+command-line to explicitly disable MSI globally. But, once the appro-
+priate quirks are added to the kernel, this option should not be
+required anymore.
+
+6.4. Finding why MSI cannot be enabled on a device
+
+Assuming that MSI are not enabled on a device, you should look at
+dmesg to find messages that quirks may output when disabling MSI
+on some devices, some bridges or even globally.
+Then, lspci -t gives the list of bridges above a device. Reading
+/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI
+are enabled (1) or disabled (0). In 0 is found in a single bridge
+msi_bus file above the device, MSI cannot be enabled.
+
+7. FAQ
+
+Q1. Are there any limitations on using the MSI?
+
+A1. If the PCI device supports MSI and conforms to the
+specification and the platform supports the APIC local bus,
+then using MSI should work.
+
+Q2. Will it work on all the Pentium processors (P3, P4, Xeon,
+AMD processors)? In P3 IPI's are transmitted on the APIC local
+bus and in P4 and Xeon they are transmitted on the system
+bus. Are there any implications with this?
+
+A2. MSI support enables a PCI device sending an inbound
+memory write (0xfeexxxxx as target address) on its PCI bus
+directly to the FSB. Since the message address has a
+redirection hint bit cleared, it should work.
+
+Q3. The target address 0xfeexxxxx will be translated by the
+Host Bridge into an interrupt message. Are there any
+limitations on the chipsets such as Intel 8xx, Intel e7xxx,
+or VIA?
+
+A3. If these chipsets support an inbound memory write with
+target address set as 0xfeexxxxx, as conformed to PCI
+specification 2.3 or latest, then it should work.
+
+Q4. From the driver point of view, if the MSI is lost because
+of errors occurring during inbound memory write, then it may
+wait forever. Is there a mechanism for it to recover?
+
+A4. Since the target of the transaction is an inbound memory
+write, all transaction termination conditions (Retry,
+Master-Abort, Target-Abort, or normal completion) are
+supported. A device sending an MSI must abide by all the PCI
+rules and conditions regarding that inbound memory write. So,
+if a retry is signaled it must retry, etc... We believe that
+the recommendation for Abort is also a retry (refer to PCI
+specification 2.3 or latest).
diff --git a/Documentation/PCI/PCIEBUS-HOWTO.txt b/Documentation/PCI/PCIEBUS-HOWTO.txt
new file mode 100644
index 0000000..9a07e38
--- /dev/null
+++ b/Documentation/PCI/PCIEBUS-HOWTO.txt
@@ -0,0 +1,217 @@
+ The PCI Express Port Bus Driver Guide HOWTO
+ Tom L Nguyen tom.l.nguyen@intel.com
+ 11/03/2004
+
+1. About this guide
+
+This guide describes the basics of the PCI Express Port Bus driver
+and provides information on how to enable the service drivers to
+register/unregister with the PCI Express Port Bus Driver.
+
+2. Copyright 2004 Intel Corporation
+
+3. What is the PCI Express Port Bus Driver
+
+A PCI Express Port is a logical PCI-PCI Bridge structure. There
+are two types of PCI Express Port: the Root Port and the Switch
+Port. The Root Port originates a PCI Express link from a PCI Express
+Root Complex and the Switch Port connects PCI Express links to
+internal logical PCI buses. The Switch Port, which has its secondary
+bus representing the switch's internal routing logic, is called the
+switch's Upstream Port. The switch's Downstream Port is bridging from
+switch's internal routing bus to a bus representing the downstream
+PCI Express link from the PCI Express Switch.
+
+A PCI Express Port can provide up to four distinct functions,
+referred to in this document as services, depending on its port type.
+PCI Express Port's services include native hotplug support (HP),
+power management event support (PME), advanced error reporting
+support (AER), and virtual channel support (VC). These services may
+be handled by a single complex driver or be individually distributed
+and handled by corresponding service drivers.
+
+4. Why use the PCI Express Port Bus Driver?
+
+In existing Linux kernels, the Linux Device Driver Model allows a
+physical device to be handled by only a single driver. The PCI
+Express Port is a PCI-PCI Bridge device with multiple distinct
+services. To maintain a clean and simple solution each service
+may have its own software service driver. In this case several
+service drivers will compete for a single PCI-PCI Bridge device.
+For example, if the PCI Express Root Port native hotplug service
+driver is loaded first, it claims a PCI-PCI Bridge Root Port. The
+kernel therefore does not load other service drivers for that Root
+Port. In other words, it is impossible to have multiple service
+drivers load and run on a PCI-PCI Bridge device simultaneously
+using the current driver model.
+
+To enable multiple service drivers running simultaneously requires
+having a PCI Express Port Bus driver, which manages all populated
+PCI Express Ports and distributes all provided service requests
+to the corresponding service drivers as required. Some key
+advantages of using the PCI Express Port Bus driver are listed below:
+
+ - Allow multiple service drivers to run simultaneously on
+ a PCI-PCI Bridge Port device.
+
+ - Allow service drivers implemented in an independent
+ staged approach.
+
+ - Allow one service driver to run on multiple PCI-PCI Bridge
+ Port devices.
+
+ - Manage and distribute resources of a PCI-PCI Bridge Port
+ device to requested service drivers.
+
+5. Configuring the PCI Express Port Bus Driver vs. Service Drivers
+
+5.1 Including the PCI Express Port Bus Driver Support into the Kernel
+
+Including the PCI Express Port Bus driver depends on whether the PCI
+Express support is included in the kernel config. The kernel will
+automatically include the PCI Express Port Bus driver as a kernel
+driver when the PCI Express support is enabled in the kernel.
+
+5.2 Enabling Service Driver Support
+
+PCI device drivers are implemented based on Linux Device Driver Model.
+All service drivers are PCI device drivers. As discussed above, it is
+impossible to load any service driver once the kernel has loaded the
+PCI Express Port Bus Driver. To meet the PCI Express Port Bus Driver
+Model requires some minimal changes on existing service drivers that
+imposes no impact on the functionality of existing service drivers.
+
+A service driver is required to use the two APIs shown below to
+register its service with the PCI Express Port Bus driver (see
+section 5.2.1 & 5.2.2). It is important that a service driver
+initializes the pcie_port_service_driver data structure, included in
+header file /include/linux/pcieport_if.h, before calling these APIs.
+Failure to do so will result an identity mismatch, which prevents
+the PCI Express Port Bus driver from loading a service driver.
+
+5.2.1 pcie_port_service_register
+
+int pcie_port_service_register(struct pcie_port_service_driver *new)
+
+This API replaces the Linux Driver Model's pci_module_init API. A
+service driver should always calls pcie_port_service_register at
+module init. Note that after service driver being loaded, calls
+such as pci_enable_device(dev) and pci_set_master(dev) are no longer
+necessary since these calls are executed by the PCI Port Bus driver.
+
+5.2.2 pcie_port_service_unregister
+
+void pcie_port_service_unregister(struct pcie_port_service_driver *new)
+
+pcie_port_service_unregister replaces the Linux Driver Model's
+pci_unregister_driver. It's always called by service driver when a
+module exits.
+
+5.2.3 Sample Code
+
+Below is sample service driver code to initialize the port service
+driver data structure.
+
+static struct pcie_port_service_id service_id[] = { {
+ .vendor = PCI_ANY_ID,
+ .device = PCI_ANY_ID,
+ .port_type = PCIE_RC_PORT,
+ .service_type = PCIE_PORT_SERVICE_AER,
+ }, { /* end: all zeroes */ }
+};
+
+static struct pcie_port_service_driver root_aerdrv = {
+ .name = (char *)device_name,
+ .id_table = &service_id[0],
+
+ .probe = aerdrv_load,
+ .remove = aerdrv_unload,
+
+ .suspend = aerdrv_suspend,
+ .resume = aerdrv_resume,
+};
+
+Below is a sample code for registering/unregistering a service
+driver.
+
+static int __init aerdrv_service_init(void)
+{
+ int retval = 0;
+
+ retval = pcie_port_service_register(&root_aerdrv);
+ if (!retval) {
+ /*
+ * FIX ME
+ */
+ }
+ return retval;
+}
+
+static void __exit aerdrv_service_exit(void)
+{
+ pcie_port_service_unregister(&root_aerdrv);
+}
+
+module_init(aerdrv_service_init);
+module_exit(aerdrv_service_exit);
+
+6. Possible Resource Conflicts
+
+Since all service drivers of a PCI-PCI Bridge Port device are
+allowed to run simultaneously, below lists a few of possible resource
+conflicts with proposed solutions.
+
+6.1 MSI Vector Resource
+
+The MSI capability structure enables a device software driver to call
+pci_enable_msi to request MSI based interrupts. Once MSI interrupts
+are enabled on a device, it stays in this mode until a device driver
+calls pci_disable_msi to disable MSI interrupts and revert back to
+INTx emulation mode. Since service drivers of the same PCI-PCI Bridge
+port share the same physical device, if an individual service driver
+calls pci_enable_msi/pci_disable_msi it may result unpredictable
+behavior. For example, two service drivers run simultaneously on the
+same physical Root Port. Both service drivers call pci_enable_msi to
+request MSI based interrupts. A service driver may not know whether
+any other service drivers have run on this Root Port. If either one
+of them calls pci_disable_msi, it puts the other service driver
+in a wrong interrupt mode.
+
+To avoid this situation all service drivers are not permitted to
+switch interrupt mode on its device. The PCI Express Port Bus driver
+is responsible for determining the interrupt mode and this should be
+transparent to service drivers. Service drivers need to know only
+the vector IRQ assigned to the field irq of struct pcie_device, which
+is passed in when the PCI Express Port Bus driver probes each service
+driver. Service drivers should use (struct pcie_device*)dev->irq to
+call request_irq/free_irq. In addition, the interrupt mode is stored
+in the field interrupt_mode of struct pcie_device.
+
+6.2 MSI-X Vector Resources
+
+Similar to the MSI a device driver for an MSI-X capable device can
+call pci_enable_msix to request MSI-X interrupts. All service drivers
+are not permitted to switch interrupt mode on its device. The PCI
+Express Port Bus driver is responsible for determining the interrupt
+mode and this should be transparent to service drivers. Any attempt
+by service driver to call pci_enable_msix/pci_disable_msix may
+result unpredictable behavior. Service drivers should use
+(struct pcie_device*)dev->irq and call request_irq/free_irq.
+
+6.3 PCI Memory/IO Mapped Regions
+
+Service drivers for PCI Express Power Management (PME), Advanced
+Error Reporting (AER), Hot-Plug (HP) and Virtual Channel (VC) access
+PCI configuration space on the PCI Express port. In all cases the
+registers accessed are independent of each other. This patch assumes
+that all service drivers will be well behaved and not overwrite
+other service driver's configuration settings.
+
+6.4 PCI Config Registers
+
+Each service driver runs its PCI config operations on its own
+capability structure except the PCI Express capability structure, in
+which Root Control register and Device Control register are shared
+between PME and AER. This patch assumes that all service drivers
+will be well behaved and not overwrite other service driver's
+configuration settings.
diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt
new file mode 100644
index 0000000..6650af4
--- /dev/null
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -0,0 +1,396 @@
+
+ PCI Error Recovery
+ ------------------
+ February 2, 2006
+
+ Current document maintainer:
+ Linas Vepstas <linas@austin.ibm.com>
+
+
+Many PCI bus controllers are able to detect a variety of hardware
+PCI errors on the bus, such as parity errors on the data and address
+busses, as well as SERR and PERR errors. Some of the more advanced
+chipsets are able to deal with these errors; these include PCI-E chipsets,
+and the PCI-host bridges found on IBM Power4 and Power5-based pSeries
+boxes. A typical action taken is to disconnect the affected device,
+halting all I/O to it. The goal of a disconnection is to avoid system
+corruption; for example, to halt system memory corruption due to DMA's
+to "wild" addresses. Typically, a reconnection mechanism is also
+offered, so that the affected PCI device(s) are reset and put back
+into working condition. The reset phase requires coordination
+between the affected device drivers and the PCI controller chip.
+This document describes a generic API for notifying device drivers
+of a bus disconnection, and then performing error recovery.
+This API is currently implemented in the 2.6.16 and later kernels.
+
+Reporting and recovery is performed in several steps. First, when
+a PCI hardware error has resulted in a bus disconnect, that event
+is reported as soon as possible to all affected device drivers,
+including multiple instances of a device driver on multi-function
+cards. This allows device drivers to avoid deadlocking in spinloops,
+waiting for some i/o-space register to change, when it never will.
+It also gives the drivers a chance to defer incoming I/O as
+needed.
+
+Next, recovery is performed in several stages. Most of the complexity
+is forced by the need to handle multi-function devices, that is,
+devices that have multiple device drivers associated with them.
+In the first stage, each driver is allowed to indicate what type
+of reset it desires, the choices being a simple re-enabling of I/O
+or requesting a hard reset (a full electrical #RST of the PCI card).
+If any driver requests a full reset, that is what will be done.
+
+After a full reset and/or a re-enabling of I/O, all drivers are
+again notified, so that they may then perform any device setup/config
+that may be required. After these have all completed, a final
+"resume normal operations" event is sent out.
+
+The biggest reason for choosing a kernel-based implementation rather
+than a user-space implementation was the need to deal with bus
+disconnects of PCI devices attached to storage media, and, in particular,
+disconnects from devices holding the root file system. If the root
+file system is disconnected, a user-space mechanism would have to go
+through a large number of contortions to complete recovery. Almost all
+of the current Linux file systems are not tolerant of disconnection
+from/reconnection to their underlying block device. By contrast,
+bus errors are easy to manage in the device driver. Indeed, most
+device drivers already handle very similar recovery procedures;
+for example, the SCSI-generic layer already provides significant
+mechanisms for dealing with SCSI bus errors and SCSI bus resets.
+
+
+Detailed Design
+---------------
+Design and implementation details below, based on a chain of
+public email discussions with Ben Herrenschmidt, circa 5 April 2005.
+
+The error recovery API support is exposed to the driver in the form of
+a structure of function pointers pointed to by a new field in struct
+pci_driver. A driver that fails to provide the structure is "non-aware",
+and the actual recovery steps taken are platform dependent. The
+arch/powerpc implementation will simulate a PCI hotplug remove/add.
+
+This structure has the form:
+struct pci_error_handlers
+{
+ int (*error_detected)(struct pci_dev *dev, enum pci_channel_state);
+ int (*mmio_enabled)(struct pci_dev *dev);
+ int (*link_reset)(struct pci_dev *dev);
+ int (*slot_reset)(struct pci_dev *dev);
+ void (*resume)(struct pci_dev *dev);
+};
+
+The possible channel states are:
+enum pci_channel_state {
+ pci_channel_io_normal, /* I/O channel is in normal state */
+ pci_channel_io_frozen, /* I/O to channel is blocked */
+ pci_channel_io_perm_failure, /* PCI card is dead */
+};
+
+Possible return values are:
+enum pci_ers_result {
+ PCI_ERS_RESULT_NONE, /* no result/none/not supported in device driver */
+ PCI_ERS_RESULT_CAN_RECOVER, /* Device driver can recover without slot reset */
+ PCI_ERS_RESULT_NEED_RESET, /* Device driver wants slot to be reset. */
+ PCI_ERS_RESULT_DISCONNECT, /* Device has completely failed, is unrecoverable */
+ PCI_ERS_RESULT_RECOVERED, /* Device driver is fully recovered and operational */
+};
+
+A driver does not have to implement all of these callbacks; however,
+if it implements any, it must implement error_detected(). If a callback
+is not implemented, the corresponding feature is considered unsupported.
+For example, if mmio_enabled() and resume() aren't there, then it
+is assumed that the driver is not doing any direct recovery and requires
+a reset. If link_reset() is not implemented, the card is assumed as
+not care about link resets. Typically a driver will want to know about
+a slot_reset().
+
+The actual steps taken by a platform to recover from a PCI error
+event will be platform-dependent, but will follow the general
+sequence described below.
+
+STEP 0: Error Event
+-------------------
+PCI bus error is detect by the PCI hardware. On powerpc, the slot
+is isolated, in that all I/O is blocked: all reads return 0xffffffff,
+all writes are ignored.
+
+
+STEP 1: Notification
+--------------------
+Platform calls the error_detected() callback on every instance of
+every driver affected by the error.
+
+At this point, the device might not be accessible anymore, depending on
+the platform (the slot will be isolated on powerpc). The driver may
+already have "noticed" the error because of a failing I/O, but this
+is the proper "synchronization point", that is, it gives the driver
+a chance to cleanup, waiting for pending stuff (timers, whatever, etc...)
+to complete; it can take semaphores, schedule, etc... everything but
+touch the device. Within this function and after it returns, the driver
+shouldn't do any new IOs. Called in task context. This is sort of a
+"quiesce" point. See note about interrupts at the end of this doc.
+
+All drivers participating in this system must implement this call.
+The driver must return one of the following result codes:
+ - PCI_ERS_RESULT_CAN_RECOVER:
+ Driver returns this if it thinks it might be able to recover
+ the HW by just banging IOs or if it wants to be given
+ a chance to extract some diagnostic information (see
+ mmio_enable, below).
+ - PCI_ERS_RESULT_NEED_RESET:
+ Driver returns this if it can't recover without a hard
+ slot reset.
+ - PCI_ERS_RESULT_DISCONNECT:
+ Driver returns this if it doesn't want to recover at all.
+
+The next step taken will depend on the result codes returned by the
+drivers.
+
+If all drivers on the segment/slot return PCI_ERS_RESULT_CAN_RECOVER,
+then the platform should re-enable IOs on the slot (or do nothing in
+particular, if the platform doesn't isolate slots), and recovery
+proceeds to STEP 2 (MMIO Enable).
+
+If any driver requested a slot reset (by returning PCI_ERS_RESULT_NEED_RESET),
+then recovery proceeds to STEP 4 (Slot Reset).
+
+If the platform is unable to recover the slot, the next step
+is STEP 6 (Permanent Failure).
+
+>>> The current powerpc implementation assumes that a device driver will
+>>> *not* schedule or semaphore in this routine; the current powerpc
+>>> implementation uses one kernel thread to notify all devices;
+>>> thus, if one device sleeps/schedules, all devices are affected.
+>>> Doing better requires complex multi-threaded logic in the error
+>>> recovery implementation (e.g. waiting for all notification threads
+>>> to "join" before proceeding with recovery.) This seems excessively
+>>> complex and not worth implementing.
+
+>>> The current powerpc implementation doesn't much care if the device
+>>> attempts I/O at this point, or not. I/O's will fail, returning
+>>> a value of 0xff on read, and writes will be dropped. If the device
+>>> driver attempts more than 10K I/O's to a frozen adapter, it will
+>>> assume that the device driver has gone into an infinite loop, and
+>>> it will panic the kernel. There doesn't seem to be any other
+>>> way of stopping a device driver that insists on spinning on I/O.
+
+STEP 2: MMIO Enabled
+-------------------
+The platform re-enables MMIO to the device (but typically not the
+DMA), and then calls the mmio_enabled() callback on all affected
+device drivers.
+
+This is the "early recovery" call. IOs are allowed again, but DMA is
+not (hrm... to be discussed, I prefer not), with some restrictions. This
+is NOT a callback for the driver to start operations again, only to
+peek/poke at the device, extract diagnostic information, if any, and
+eventually do things like trigger a device local reset or some such,
+but not restart operations. This is callback is made if all drivers on
+a segment agree that they can try to recover and if no automatic link reset
+was performed by the HW. If the platform can't just re-enable IOs without
+a slot reset or a link reset, it wont call this callback, and instead
+will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset)
+
+>>> The following is proposed; no platform implements this yet:
+>>> Proposal: All I/O's should be done _synchronously_ from within
+>>> this callback, errors triggered by them will be returned via
+>>> the normal pci_check_whatever() API, no new error_detected()
+>>> callback will be issued due to an error happening here. However,
+>>> such an error might cause IOs to be re-blocked for the whole
+>>> segment, and thus invalidate the recovery that other devices
+>>> on the same segment might have done, forcing the whole segment
+>>> into one of the next states, that is, link reset or slot reset.
+
+The driver should return one of the following result codes:
+ - PCI_ERS_RESULT_RECOVERED
+ Driver returns this if it thinks the device is fully
+ functional and thinks it is ready to start
+ normal driver operations again. There is no
+ guarantee that the driver will actually be
+ allowed to proceed, as another driver on the
+ same segment might have failed and thus triggered a
+ slot reset on platforms that support it.
+
+ - PCI_ERS_RESULT_NEED_RESET
+ Driver returns this if it thinks the device is not
+ recoverable in it's current state and it needs a slot
+ reset to proceed.
+
+ - PCI_ERS_RESULT_DISCONNECT
+ Same as above. Total failure, no recovery even after
+ reset driver dead. (To be defined more precisely)
+
+The next step taken depends on the results returned by the drivers.
+If all drivers returned PCI_ERS_RESULT_RECOVERED, then the platform
+proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
+
+If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
+proceeds to STEP 4 (Slot Reset)
+
+>>> The current powerpc implementation does not implement this callback.
+
+
+STEP 3: Link Reset
+------------------
+The platform resets the link, and then calls the link_reset() callback
+on all affected device drivers. This is a PCI-Express specific state
+and is done whenever a non-fatal error has been detected that can be
+"solved" by resetting the link. This call informs the driver of the
+reset and the driver should check to see if the device appears to be
+in working condition.
+
+The driver is not supposed to restart normal driver I/O operations
+at this point. It should limit itself to "probing" the device to
+check it's recoverability status. If all is right, then the platform
+will call resume() once all drivers have ack'd link_reset().
+
+ Result codes:
+ (identical to STEP 3 (MMIO Enabled)
+
+The platform then proceeds to either STEP 4 (Slot Reset) or STEP 5
+(Resume Operations).
+
+>>> The current powerpc implementation does not implement this callback.
+
+
+STEP 4: Slot Reset
+------------------
+The platform performs a soft or hard reset of the device, and then
+calls the slot_reset() callback.
+
+A soft reset consists of asserting the adapter #RST line and then
+restoring the PCI BAR's and PCI configuration header to a state
+that is equivalent to what it would be after a fresh system
+power-on followed by power-on BIOS/system firmware initialization.
+If the platform supports PCI hotplug, then the reset might be
+performed by toggling the slot electrical power off/on.
+
+It is important for the platform to restore the PCI config space
+to the "fresh poweron" state, rather than the "last state". After
+a slot reset, the device driver will almost always use its standard
+device initialization routines, and an unusual config space setup
+may result in hung devices, kernel panics, or silent data corruption.
+
+This call gives drivers the chance to re-initialize the hardware
+(re-download firmware, etc.). At this point, the driver may assume
+that he card is in a fresh state and is fully functional. In
+particular, interrupt generation should work normally.
+
+Drivers should not yet restart normal I/O processing operations
+at this point. If all device drivers report success on this
+callback, the platform will call resume() to complete the sequence,
+and let the driver restart normal I/O processing.
+
+A driver can still return a critical failure for this function if
+it can't get the device operational after reset. If the platform
+previously tried a soft reset, it might now try a hard reset (power
+cycle) and then call slot_reset() again. It the device still can't
+be recovered, there is nothing more that can be done; the platform
+will typically report a "permanent failure" in such a case. The
+device will be considered "dead" in this case.
+
+Drivers for multi-function cards will need to coordinate among
+themselves as to which driver instance will perform any "one-shot"
+or global device initialization. For example, the Symbios sym53cxx2
+driver performs device init only from PCI function 0:
+
++ if (PCI_FUNC(pdev->devfn) == 0)
++ sym_reset_scsi_bus(np, 0);
+
+ Result codes:
+ - PCI_ERS_RESULT_DISCONNECT
+ Same as above.
+
+Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent
+Failure).
+
+>>> The current powerpc implementation does not currently try a
+>>> power-cycle reset if the driver returned PCI_ERS_RESULT_DISCONNECT.
+>>> However, it probably should.
+
+
+STEP 5: Resume Operations
+-------------------------
+The platform will call the resume() callback on all affected device
+drivers if all drivers on the segment have returned
+PCI_ERS_RESULT_RECOVERED from one of the 3 previous callbacks.
+The goal of this callback is to tell the driver to restart activity,
+that everything is back and running. This callback does not return
+a result code.
+
+At this point, if a new error happens, the platform will restart
+a new error recovery sequence.
+
+STEP 6: Permanent Failure
+-------------------------
+A "permanent failure" has occurred, and the platform cannot recover
+the device. The platform will call error_detected() with a
+pci_channel_state value of pci_channel_io_perm_failure.
+
+The device driver should, at this point, assume the worst. It should
+cancel all pending I/O, refuse all new I/O, returning -EIO to
+higher layers. The device driver should then clean up all of its
+memory and remove itself from kernel operations, much as it would
+during system shutdown.
+
+The platform will typically notify the system operator of the
+permanent failure in some way. If the device is hotplug-capable,
+the operator will probably want to remove and replace the device.
+Note, however, not all failures are truly "permanent". Some are
+caused by over-heating, some by a poorly seated card. Many
+PCI error events are caused by software bugs, e.g. DMA's to
+wild addresses or bogus split transactions due to programming
+errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
+for additional detail on real-life experience of the causes of
+software errors.
+
+
+Conclusion; General Remarks
+---------------------------
+The way those callbacks are called is platform policy. A platform with
+no slot reset capability may want to just "ignore" drivers that can't
+recover (disconnect them) and try to let other cards on the same segment
+recover. Keep in mind that in most real life cases, though, there will
+be only one driver per segment.
+
+Now, a note about interrupts. If you get an interrupt and your
+device is dead or has been isolated, there is a problem :)
+The current policy is to turn this into a platform policy.
+That is, the recovery API only requires that:
+
+ - There is no guarantee that interrupt delivery can proceed from any
+device on the segment starting from the error detection and until the
+resume callback is sent, at which point interrupts are expected to be
+fully operational.
+
+ - There is no guarantee that interrupt delivery is stopped, that is,
+a driver that gets an interrupt after detecting an error, or that detects
+an error within the interrupt handler such that it prevents proper
+ack'ing of the interrupt (and thus removal of the source) should just
+return IRQ_NOTHANDLED. It's up to the platform to deal with that
+condition, typically by masking the IRQ source during the duration of
+the error handling. It is expected that the platform "knows" which
+interrupts are routed to error-management capable slots and can deal
+with temporarily disabling that IRQ number during error processing (this
+isn't terribly complex). That means some IRQ latency for other devices
+sharing the interrupt, but there is simply no other way. High end
+platforms aren't supposed to share interrupts between many devices
+anyway :)
+
+>>> Implementation details for the powerpc platform are discussed in
+>>> the file Documentation/powerpc/eeh-pci-error-recovery.txt
+
+>>> As of this writing, there are six device drivers with patches
+>>> implementing error recovery. Not all of these patches are in
+>>> mainline yet. These may be used as "examples":
+>>>
+>>> drivers/scsi/ipr.c
+>>> drivers/scsi/sym53cxx_2
+>>> drivers/next/e100.c
+>>> drivers/net/e1000
+>>> drivers/net/ixgb
+>>> drivers/net/s2io.c
+
+The End
+-------
diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt
new file mode 100644
index 0000000..fd4907a
--- /dev/null
+++ b/Documentation/PCI/pci.txt
@@ -0,0 +1,650 @@
+
+ How To Write Linux PCI Drivers
+
+ by Martin Mares <mj@ucw.cz> on 07-Feb-2000
+ updated by Grant Grundler <grundler@parisc-linux.org> on 23-Dec-2006
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The world of PCI is vast and full of (mostly unpleasant) surprises.
+Since each CPU architecture implements different chip-sets and PCI devices
+have different requirements (erm, "features"), the result is the PCI support
+in the Linux kernel is not as trivial as one would wish. This short paper
+tries to introduce all potential driver authors to Linux APIs for
+PCI device drivers.
+
+A more complete resource is the third edition of "Linux Device Drivers"
+by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
+LDD3 is available for free (under Creative Commons License) from:
+
+ http://lwn.net/Kernel/LDD3/
+
+However, keep in mind that all documents are subject to "bit rot".
+Refer to the source code if things are not working as described here.
+
+Please send questions/comments/patches about Linux PCI API to the
+"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
+
+
+
+0. Structure of PCI drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+PCI drivers "discover" PCI devices in a system via pci_register_driver().
+Actually, it's the other way around. When the PCI generic code discovers
+a new device, the driver with a matching "description" will be notified.
+Details on this below.
+
+pci_register_driver() leaves most of the probing for devices to
+the PCI layer and supports online insertion/removal of devices [thus
+supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
+pci_register_driver() call requires passing in a table of function
+pointers and thus dictates the high level structure of a driver.
+
+Once the driver knows about a PCI device and takes ownership, the
+driver generally needs to perform the following initialization:
+
+ Enable the device
+ Request MMIO/IOP resources
+ Set the DMA mask size (for both coherent and streaming DMA)
+ Allocate and initialize shared control data (pci_allocate_coherent())
+ Access device configuration space (if needed)
+ Register IRQ handler (request_irq())
+ Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+ Enable DMA/processing engines
+
+When done using the device, and perhaps the module needs to be unloaded,
+the driver needs to take the follow steps:
+ Disable the device from generating IRQs
+ Release the IRQ (free_irq())
+ Stop all DMA activity
+ Release DMA buffers (both streaming and coherent)
+ Unregister from other subsystems (e.g. scsi or netdev)
+ Release MMIO/IOP resources
+ Disable the device
+
+Most of these topics are covered in the following sections.
+For the rest look at LDD3 or <linux/pci.h> .
+
+If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
+the PCI functions described below are defined as inline functions either
+completely empty or just returning an appropriate error codes to avoid
+lots of ifdefs in the drivers.
+
+
+
+1. pci_register_driver() call
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PCI device drivers call pci_register_driver() during their
+initialization with a pointer to a structure describing the driver
+(struct pci_driver):
+
+ field name Description
+ ---------- ------------------------------------------------------
+ id_table Pointer to table of device ID's the driver is
+ interested in. Most drivers should export this
+ table using MODULE_DEVICE_TABLE(pci,...).
+
+ probe This probing function gets called (during execution
+ of pci_register_driver() for already existing
+ devices or later if a new device gets inserted) for
+ all PCI devices which match the ID table and are not
+ "owned" by the other drivers yet. This function gets
+ passed a "struct pci_dev *" for each device whose
+ entry in the ID table matches the device. The probe
+ function returns zero when the driver chooses to
+ take "ownership" of the device or an error code
+ (negative number) otherwise.
+ The probe function always gets called from process
+ context, so it can sleep.
+
+ remove The remove() function gets called whenever a device
+ being handled by this driver is removed (either during
+ deregistration of the driver or when it's manually
+ pulled out of a hot-pluggable slot).
+ The remove function always gets called from process
+ context, so it can sleep.
+
+ suspend Put device into low power state.
+ suspend_late Put device into low power state.
+
+ resume_early Wake device from low power state.
+ resume Wake device from low power state.
+
+ (Please see Documentation/power/pci.txt for descriptions
+ of PCI Power Management and the related functions.)
+
+ shutdown Hook into reboot_notifier_list (kernel/sys.c).
+ Intended to stop any idling DMA operations.
+ Useful for enabling wake-on-lan (NIC) or changing
+ the power state of a device before reboot.
+ e.g. drivers/net/e100.c.
+
+ err_handler See Documentation/PCI/pci-error-recovery.txt
+
+
+The ID table is an array of struct pci_device_id entries ending with an
+all-zero entry; use of the macro DEFINE_PCI_DEVICE_TABLE is the preferred
+method of declaring the table. Each entry consists of:
+
+ vendor,device Vendor and device ID to match (or PCI_ANY_ID)
+
+ subvendor, Subsystem vendor and device ID to match (or PCI_ANY_ID)
+ subdevice,
+
+ class Device class, subclass, and "interface" to match.
+ See Appendix D of the PCI Local Bus Spec or
+ include/linux/pci_ids.h for a full list of classes.
+ Most drivers do not need to specify class/class_mask
+ as vendor/device is normally sufficient.
+
+ class_mask limit which sub-fields of the class field are compared.
+ See drivers/scsi/sym53c8xx_2/ for example of usage.
+
+ driver_data Data private to the driver.
+ Most drivers don't need to use driver_data field.
+ Best practice is to use driver_data as an index
+ into a static list of equivalent device types,
+ instead of using it as a pointer.
+
+
+Most drivers only need PCI_DEVICE() or PCI_DEVICE_CLASS() to set up
+a pci_device_id table.
+
+New PCI IDs may be added to a device driver pci_ids table at runtime
+as shown below:
+
+echo "vendor device subvendor subdevice class class_mask driver_data" > \
+/sys/bus/pci/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The vendor and device fields are mandatory, the others are optional. Users
+need pass only as many optional fields as necessary:
+ o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
+ o class and classmask fields default to 0
+ o driver_data defaults to 0UL.
+
+Note that driver_data must match the value used by any of the pci_device_id
+entries defined in the driver. This makes the driver_data field mandatory
+if all the pci_device_id entries have a non-zero driver_data value.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCI devices listed in its (newly updated) pci_ids list.
+
+When the driver exits, it just calls pci_unregister_driver() and the PCI layer
+automatically calls the remove hook for all devices handled by the driver.
+
+
+1.1 "Attributes" for driver functions/data
+
+Please mark the initialization and cleanup functions where appropriate
+(the corresponding macros are defined in <linux/init.h>):
+
+ __init Initialization code. Thrown away after the driver
+ initializes.
+ __exit Exit code. Ignored for non-modular drivers.
+
+
+ __devinit Device initialization code.
+ Identical to __init if the kernel is not compiled
+ with CONFIG_HOTPLUG, normal function otherwise.
+ __devexit The same for __exit.
+
+Tips on when/where to use the above attributes:
+ o The module_init()/module_exit() functions (and all
+ initialization functions called _only_ from these)
+ should be marked __init/__exit.
+
+ o Do not mark the struct pci_driver.
+
+ o The ID table array should be marked __devinitconst; this is done
+ automatically if the table is declared with DEFINE_PCI_DEVICE_TABLE().
+
+ o The probe() and remove() functions should be marked __devinit
+ and __devexit respectively. All initialization functions
+ exclusively called by the probe() routine, can be marked __devinit.
+ Ditto for remove() and __devexit.
+
+ o If mydriver_remove() is marked with __devexit(), then all address
+ references to mydriver_remove must use __devexit_p(mydriver_remove)
+ (in the struct pci_driver declaration for example).
+ __devexit_p() will generate the function name _or_ NULL if the
+ function will be discarded. For an example, see drivers/net/tg3.c.
+
+ o Do NOT mark a function if you are not sure which mark to use.
+ Better to not mark the function than mark the function wrong.
+
+
+
+2. How to find PCI devices manually
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+PCI drivers should have a really good reason for not using the
+pci_register_driver() interface to search for PCI devices.
+The main reason PCI devices are controlled by multiple drivers
+is because one PCI device implements several different HW services.
+E.g. combined serial/parallel port/floppy controller.
+
+A manual search may be performed using the following constructs:
+
+Searching by vendor and device ID:
+
+ struct pci_dev *dev = NULL;
+ while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
+ configure_device(dev);
+
+Searching by class ID (iterate in a similar way):
+
+ pci_get_class(CLASS_ID, dev)
+
+Searching by both vendor/device and subsystem vendor/device ID:
+
+ pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
+
+You can use the constant PCI_ANY_ID as a wildcard replacement for
+VENDOR_ID or DEVICE_ID. This allows searching for any device from a
+specific vendor, for example.
+
+These functions are hotplug-safe. They increment the reference count on
+the pci_dev that they return. You must eventually (possibly at module unload)
+decrement the reference count on these devices by calling pci_dev_put().
+
+
+
+3. Device Initialization Steps
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As noted in the introduction, most PCI drivers need the following steps
+for device initialization:
+
+ Enable the device
+ Request MMIO/IOP resources
+ Set the DMA mask size (for both coherent and streaming DMA)
+ Allocate and initialize shared control data (pci_allocate_coherent())
+ Access device configuration space (if needed)
+ Register IRQ handler (request_irq())
+ Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+ Enable DMA/processing engines.
+
+The driver can access PCI config space registers at any time.
+(Well, almost. When running BIST, config space can go away...but
+that will just result in a PCI Bus Master Abort and config reads
+will return garbage).
+
+
+3.1 Enable the PCI device
+~~~~~~~~~~~~~~~~~~~~~~~~~
+Before touching any device registers, the driver needs to enable
+the PCI device by calling pci_enable_device(). This will:
+ o wake up the device if it was in suspended state,
+ o allocate I/O and memory regions of the device (if BIOS did not),
+ o allocate an IRQ (if BIOS did not).
+
+NOTE: pci_enable_device() can fail! Check the return value.
+
+[ OS BUG: we don't check resource allocations before enabling those
+ resources. The sequence would make more sense if we called
+ pci_request_resources() before calling pci_enable_device().
+ Currently, the device drivers can't detect the bug when when two
+ devices have been allocated the same range. This is not a common
+ problem and unlikely to get fixed soon.
+
+ This has been discussed before but not changed as of 2.6.19:
+ http://lkml.org/lkml/2006/3/2/194
+]
+
+pci_set_master() will enable DMA by setting the bus master bit
+in the PCI_COMMAND register. It also fixes the latency timer value if
+it's set to something bogus by the BIOS.
+
+If the PCI device can use the PCI Memory-Write-Invalidate transaction,
+call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval
+and also ensures that the cache line size register is set correctly.
+Check the return value of pci_set_mwi() as not all architectures
+or chip-sets may support Memory-Write-Invalidate. Alternatively,
+if Mem-Wr-Inval would be nice to have but is not required, call
+pci_try_set_mwi() to have the system do its best effort at enabling
+Mem-Wr-Inval.
+
+
+3.2 Request MMIO/IOP resources
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Memory (MMIO), and I/O port addresses should NOT be read directly
+from the PCI device config space. Use the values in the pci_dev structure
+as the PCI "bus address" might have been remapped to a "host physical"
+address by the arch/chip-set specific kernel support.
+
+See Documentation/IO-mapping.txt for how to access device registers
+or device memory.
+
+The device driver needs to call pci_request_region() to verify
+no other device is already using the same address resource.
+Conversely, drivers should call pci_release_region() AFTER
+calling pci_disable_device().
+The idea is to prevent two devices colliding on the same address range.
+
+[ See OS BUG comment above. Currently (2.6.19), The driver can only
+ determine MMIO and IO Port resource availability _after_ calling
+ pci_enable_device(). ]
+
+Generic flavors of pci_request_region() are request_mem_region()
+(for MMIO ranges) and request_region() (for IO Port ranges).
+Use these for address resources that are not described by "normal" PCI
+BARs.
+
+Also see pci_request_selected_regions() below.
+
+
+3.3 Set the DMA mask size
+~~~~~~~~~~~~~~~~~~~~~~~~~
+[ If anything below doesn't make sense, please refer to
+ Documentation/DMA-API.txt. This section is just a reminder that
+ drivers need to indicate DMA capabilities of the device and is not
+ an authoritative source for DMA interfaces. ]
+
+While all drivers should explicitly indicate the DMA capability
+(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
+32-bit bus master capability for streaming data need the driver
+to "register" this capability by calling pci_set_dma_mask() with
+appropriate parameters. In general this allows more efficient DMA
+on systems where System RAM exists above 4G _physical_ address.
+
+Drivers for all PCI-X and PCIe compliant devices must call
+pci_set_dma_mask() as they are 64-bit DMA devices.
+
+Similarly, drivers must also "register" this capability if the device
+can directly address "consistent memory" in System RAM above 4G physical
+address by calling pci_set_consistent_dma_mask().
+Again, this includes drivers for all PCI-X and PCIe compliant devices.
+Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
+64-bit DMA capable for payload ("streaming") data but not control
+("consistent") data.
+
+
+3.4 Setup shared control data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
+memory. See Documentation/DMA-API.txt for a full description of
+the DMA APIs. This section is just a reminder that it needs to be done
+before enabling DMA on the device.
+
+
+3.5 Initialize device registers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Some drivers will need specific "capability" fields programmed
+or other "vendor specific" register initialized or reset.
+E.g. clearing pending interrupts.
+
+
+3.6 Register IRQ handler
+~~~~~~~~~~~~~~~~~~~~~~~~
+While calling request_irq() is the last step described here,
+this is often just another intermediate step to initialize a device.
+This step can often be deferred until the device is opened for use.
+
+All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
+and use the devid to map IRQs to devices (remember that all PCI IRQ lines
+can be shared).
+
+request_irq() will associate an interrupt handler and device handle
+with an interrupt number. Historically interrupt numbers represent
+IRQ lines which run from the PCI device to the Interrupt controller.
+With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
+
+request_irq() also enables the interrupt. Make sure the device is
+quiesced and does not have any interrupts pending before registering
+the interrupt handler.
+
+MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
+which deliver interrupts to the CPU via a DMA write to a Local APIC.
+The fundamental difference between MSI and MSI-X is how multiple
+"vectors" get allocated. MSI requires contiguous blocks of vectors
+while MSI-X can allocate several individual ones.
+
+MSI capability can be enabled by calling pci_enable_msi() or
+pci_enable_msix() before calling request_irq(). This causes
+the PCI support to program CPU vector data into the PCI device
+capability registers.
+
+If your PCI device supports both, try to enable MSI-X first.
+Only one can be enabled at a time. Many architectures, chip-sets,
+or BIOSes do NOT support MSI or MSI-X and the call to pci_enable_msi/msix
+will fail. This is important to note since many drivers have
+two (or more) interrupt handlers: one for MSI/MSI-X and another for IRQs.
+They choose which handler to register with request_irq() based on the
+return value from pci_enable_msi/msix().
+
+There are (at least) two really good reasons for using MSI:
+1) MSI is an exclusive interrupt vector by definition.
+ This means the interrupt handler doesn't have to verify
+ its device caused the interrupt.
+
+2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
+ to be visible to the host CPU(s) when the MSI is delivered. This
+ is important for both data coherency and avoiding stale control data.
+ This guarantee allows the driver to omit MMIO reads to flush
+ the DMA stream.
+
+See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
+of MSI/MSI-X usage.
+
+
+
+4. PCI device shutdown
+~~~~~~~~~~~~~~~~~~~~~~~
+
+When a PCI device driver is being unloaded, most of the following
+steps need to be performed:
+
+ Disable the device from generating IRQs
+ Release the IRQ (free_irq())
+ Stop all DMA activity
+ Release DMA buffers (both streaming and consistent)
+ Unregister from other subsystems (e.g. scsi or netdev)
+ Disable device from responding to MMIO/IO Port addresses
+ Release MMIO/IO Port resource(s)
+
+
+4.1 Stop IRQs on the device
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+How to do this is chip/device specific. If it's not done, it opens
+the possibility of a "screaming interrupt" if (and only if)
+the IRQ is shared with another device.
+
+When the shared IRQ handler is "unhooked", the remaining devices
+using the same IRQ line will still need the IRQ enabled. Thus if the
+"unhooked" device asserts IRQ line, the system will respond assuming
+it was one of the remaining devices asserted the IRQ line. Since none
+of the other devices will handle the IRQ, the system will "hang" until
+it decides the IRQ isn't going to get handled and masks the IRQ (100,000
+iterations later). Once the shared IRQ is masked, the remaining devices
+will stop functioning properly. Not a nice situation.
+
+This is another reason to use MSI or MSI-X if it's available.
+MSI and MSI-X are defined to be exclusive interrupts and thus
+are not susceptible to the "screaming interrupt" problem.
+
+
+4.2 Release the IRQ
+~~~~~~~~~~~~~~~~~~~
+Once the device is quiesced (no more IRQs), one can call free_irq().
+This function will return control once any pending IRQs are handled,
+"unhook" the drivers IRQ handler from that IRQ, and finally release
+the IRQ if no one else is using it.
+
+
+4.3 Stop all DMA activity
+~~~~~~~~~~~~~~~~~~~~~~~~~
+It's extremely important to stop all DMA operations BEFORE attempting
+to deallocate DMA control data. Failure to do so can result in memory
+corruption, hangs, and on some chip-sets a hard crash.
+
+Stopping DMA after stopping the IRQs can avoid races where the
+IRQ handler might restart DMA engines.
+
+While this step sounds obvious and trivial, several "mature" drivers
+didn't get this step right in the past.
+
+
+4.4 Release DMA buffers
+~~~~~~~~~~~~~~~~~~~~~~~
+Once DMA is stopped, clean up streaming DMA first.
+I.e. unmap data buffers and return buffers to "upstream"
+owners if there is one.
+
+Then clean up "consistent" buffers which contain the control data.
+
+See Documentation/DMA-API.txt for details on unmapping interfaces.
+
+
+4.5 Unregister from other subsystems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Most low level PCI device drivers support some other subsystem
+like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
+driver isn't losing resources from that other subsystem.
+If this happens, typically the symptom is an Oops (panic) when
+the subsystem attempts to call into a driver that has been unloaded.
+
+
+4.6 Disable Device from responding to MMIO/IO Port addresses
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+io_unmap() MMIO or IO Port resources and then call pci_disable_device().
+This is the symmetric opposite of pci_enable_device().
+Do not access device registers after calling pci_disable_device().
+
+
+4.7 Release MMIO/IO Port Resource(s)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Call pci_release_region() to mark the MMIO or IO Port range as available.
+Failure to do so usually results in the inability to reload the driver.
+
+
+
+5. How to access PCI config space
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can use pci_(read|write)_config_(byte|word|dword) to access the config
+space of a device represented by struct pci_dev *. All these functions return 0
+when successful or an error code (PCIBIOS_...) which can be translated to a text
+string by pcibios_strerror. Most drivers expect that accesses to valid PCI
+devices don't fail.
+
+If you don't have a struct pci_dev available, you can call
+pci_bus_(read|write)_config_(byte|word|dword) to access a given device
+and function on that bus.
+
+If you access fields in the standard portion of the config header, please
+use symbolic names of locations and bits declared in <linux/pci.h>.
+
+If you need to access Extended PCI Capability registers, just call
+pci_find_capability() for the particular capability and it will find the
+corresponding register block for you.
+
+
+
+6. Other interesting functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+pci_find_slot() Find pci_dev corresponding to given bus and
+ slot numbers.
+pci_set_power_state() Set PCI Power Management state (0=D0 ... 3=D3)
+pci_find_capability() Find specified capability in device's capability
+ list.
+pci_resource_start() Returns bus start address for a given PCI region
+pci_resource_end() Returns bus end address for a given PCI region
+pci_resource_len() Returns the byte length of a PCI region
+pci_set_drvdata() Set private driver data pointer for a pci_dev
+pci_get_drvdata() Return private driver data pointer for a pci_dev
+pci_set_mwi() Enable Memory-Write-Invalidate transactions.
+pci_clear_mwi() Disable Memory-Write-Invalidate transactions.
+
+
+
+7. Miscellaneous hints
+~~~~~~~~~~~~~~~~~~~~~~
+
+When displaying PCI device names to the user (for example when a driver wants
+to tell the user what card has it found), please use pci_name(pci_dev).
+
+Always refer to the PCI devices by a pointer to the pci_dev structure.
+All PCI layer functions use this identification and it's the only
+reasonable one. Don't use bus/slot/function numbers except for very
+special purposes -- on systems with multiple primary buses their semantics
+can be pretty complex.
+
+Don't try to turn on Fast Back to Back writes in your driver. All devices
+on the bus need to be capable of doing it, so this is something which needs
+to be handled by platform and generic code, not individual drivers.
+
+
+
+8. Vendor and device identifications
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+One is not not required to add new device ids to include/linux/pci_ids.h.
+Please add PCI_VENDOR_ID_xxx for vendors and a hex constant for device ids.
+
+PCI_VENDOR_ID_xxx constants are re-used. The device ids are arbitrary
+hex numbers (vendor controlled) and normally used only in a single
+location, the pci_device_id table.
+
+Please DO submit new vendor/device ids to pciids.sourceforge.net project.
+
+
+
+9. Obsolete functions
+~~~~~~~~~~~~~~~~~~~~~
+
+There are several functions which you might come across when trying to
+port an old driver to the new PCI interface. They are no longer present
+in the kernel as they aren't compatible with hotplug or PCI domains or
+having sane locking.
+
+pci_find_device() Superseded by pci_get_device()
+pci_find_subsys() Superseded by pci_get_subsys()
+pci_find_slot() Superseded by pci_get_slot()
+
+
+The alternative is the traditional PCI device driver that walks PCI
+device lists. This is still possible but discouraged.
+
+
+
+10. MMIO Space and "Write Posting"
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Converting a driver from using I/O Port space to using MMIO space
+often requires some additional changes. Specifically, "write posting"
+needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
+already do this. I/O Port space guarantees write transactions reach the PCI
+device before the CPU can continue. Writes to MMIO space allow the CPU
+to continue before the transaction reaches the PCI device. HW weenies
+call this "Write Posting" because the write completion is "posted" to
+the CPU before the transaction has reached its destination.
+
+Thus, timing sensitive code should add readl() where the CPU is
+expected to wait before doing other work. The classic "bit banging"
+sequence works fine for I/O Port space:
+
+ for (i = 8; --i; val >>= 1) {
+ outb(val & 1, ioport_reg); /* write bit */
+ udelay(10);
+ }
+
+The same sequence for MMIO space should be:
+
+ for (i = 8; --i; val >>= 1) {
+ writeb(val & 1, mmio_reg); /* write bit */
+ readb(safe_mmio_reg); /* flush posted write */
+ udelay(10);
+ }
+
+It is important that "safe_mmio_reg" not have any side effects that
+interferes with the correct operation of the device.
+
+Another case to watch out for is when resetting a PCI device. Use PCI
+Configuration space reads to flush the writel(). This will gracefully
+handle the PCI master abort on all platforms if the PCI device is
+expected to not respond to a readl(). Most x86 platforms will allow
+MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
+(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail").
+
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
new file mode 100644
index 0000000..ddeb14b
--- /dev/null
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -0,0 +1,248 @@
+ The PCI Express Advanced Error Reporting Driver Guide HOWTO
+ T. Long Nguyen <tom.l.nguyen@intel.com>
+ Yanmin Zhang <yanmin.zhang@intel.com>
+ 07/29/2006
+
+
+1. Overview
+
+1.1 About this guide
+
+This guide describes the basics of the PCI Express Advanced Error
+Reporting (AER) driver and provides information on how to use it, as
+well as how to enable the drivers of endpoint devices to conform with
+PCI Express AER driver.
+
+1.2 Copyright Intel Corporation 2006.
+
+1.3 What is the PCI Express AER Driver?
+
+PCI Express error signaling can occur on the PCI Express link itself
+or on behalf of transactions initiated on the link. PCI Express
+defines two error reporting paradigms: the baseline capability and
+the Advanced Error Reporting capability. The baseline capability is
+required of all PCI Express components providing a minimum defined
+set of error reporting requirements. Advanced Error Reporting
+capability is implemented with a PCI Express advanced error reporting
+extended capability structure providing more robust error reporting.
+
+The PCI Express AER driver provides the infrastructure to support PCI
+Express Advanced Error Reporting capability. The PCI Express AER
+driver provides three basic functions:
+
+- Gathers the comprehensive error information if errors occurred.
+- Reports error to the users.
+- Performs error recovery actions.
+
+AER driver only attaches root ports which support PCI-Express AER
+capability.
+
+
+2. User Guide
+
+2.1 Include the PCI Express AER Root Driver into the Linux Kernel
+
+The PCI Express AER Root driver is a Root Port service driver attached
+to the PCI Express Port Bus driver. If a user wants to use it, the driver
+has to be compiled. Option CONFIG_PCIEAER supports this capability. It
+depends on CONFIG_PCIEPORTBUS, so pls. set CONFIG_PCIEPORTBUS=y and
+CONFIG_PCIEAER = y.
+
+2.2 Load PCI Express AER Root Driver
+There is a case where a system has AER support in BIOS. Enabling the AER
+Root driver and having AER support in BIOS may result unpredictable
+behavior. To avoid this conflict, a successful load of the AER Root driver
+requires ACPI _OSC support in the BIOS to allow the AER Root driver to
+request for native control of AER. See the PCI FW 3.0 Specification for
+details regarding OSC usage. Currently, lots of firmwares don't provide
+_OSC support while they use PCI Express. To support such firmwares,
+forceload, a parameter of type bool, could enable AER to continue to
+be initiated although firmwares have no _OSC support. To enable the
+walkaround, pls. add aerdriver.forceload=y to kernel boot parameter line
+when booting kernel. Note that forceload=n by default.
+
+2.3 AER error output
+When a PCI-E AER error is captured, an error message will be outputed to
+console. If it's a correctable error, it is outputed as a warning.
+Otherwise, it is printed as an error. So users could choose different
+log level to filter out correctable error messages.
+
+Below shows an example.
++------ PCI-Express Device Error -----+
+Error Severity : Uncorrected (Fatal)
+PCIE Bus Error type : Transaction Layer
+Unsupported Request : First
+Requester ID : 0500
+VendorID=8086h, DeviceID=0329h, Bus=05h, Device=00h, Function=00h
+TLB Header:
+04000001 00200a03 05010000 00050100
+
+In the example, 'Requester ID' means the ID of the device who sends
+the error message to root port. Pls. refer to pci express specs for
+other fields.
+
+
+3. Developer Guide
+
+To enable AER aware support requires a software driver to configure
+the AER capability structure within its device and to provide callbacks.
+
+To support AER better, developers need understand how AER does work
+firstly.
+
+PCI Express errors are classified into two types: correctable errors
+and uncorrectable errors. This classification is based on the impacts
+of those errors, which may result in degraded performance or function
+failure.
+
+Correctable errors pose no impacts on the functionality of the
+interface. The PCI Express protocol can recover without any software
+intervention or any loss of data. These errors are detected and
+corrected by hardware. Unlike correctable errors, uncorrectable
+errors impact functionality of the interface. Uncorrectable errors
+can cause a particular transaction or a particular PCI Express link
+to be unreliable. Depending on those error conditions, uncorrectable
+errors are further classified into non-fatal errors and fatal errors.
+Non-fatal errors cause the particular transaction to be unreliable,
+but the PCI Express link itself is fully functional. Fatal errors, on
+the other hand, cause the link to be unreliable.
+
+When AER is enabled, a PCI Express device will automatically send an
+error message to the PCIE root port above it when the device captures
+an error. The Root Port, upon receiving an error reporting message,
+internally processes and logs the error message in its PCI Express
+capability structure. Error information being logged includes storing
+the error reporting agent's requestor ID into the Error Source
+Identification Registers and setting the error bits of the Root Error
+Status Register accordingly. If AER error reporting is enabled in Root
+Error Command Register, the Root Port generates an interrupt if an
+error is detected.
+
+Note that the errors as described above are related to the PCI Express
+hierarchy and links. These errors do not include any device specific
+errors because device specific errors will still get sent directly to
+the device driver.
+
+3.1 Configure the AER capability structure
+
+AER aware drivers of PCI Express component need change the device
+control registers to enable AER. They also could change AER registers,
+including mask and severity registers. Helper function
+pci_enable_pcie_error_reporting could be used to enable AER. See
+section 3.3.
+
+3.2. Provide callbacks
+
+3.2.1 callback reset_link to reset pci express link
+
+This callback is used to reset the pci express physical link when a
+fatal error happens. The root port aer service driver provides a
+default reset_link function, but different upstream ports might
+have different specifications to reset pci express link, so all
+upstream ports should provide their own reset_link functions.
+
+In struct pcie_port_service_driver, a new pointer, reset_link, is
+added.
+
+pci_ers_result_t (*reset_link) (struct pci_dev *dev);
+
+Section 3.2.2.2 provides more detailed info on when to call
+reset_link.
+
+3.2.2 PCI error-recovery callbacks
+
+The PCI Express AER Root driver uses error callbacks to coordinate
+with downstream device drivers associated with a hierarchy in question
+when performing error recovery actions.
+
+Data struct pci_driver has a pointer, err_handler, to point to
+pci_error_handlers who consists of a couple of callback function
+pointers. AER driver follows the rules defined in
+pci-error-recovery.txt except pci express specific parts (e.g.
+reset_link). Pls. refer to pci-error-recovery.txt for detailed
+definitions of the callbacks.
+
+Below sections specify when to call the error callback functions.
+
+3.2.2.1 Correctable errors
+
+Correctable errors pose no impacts on the functionality of
+the interface. The PCI Express protocol can recover without any
+software intervention or any loss of data. These errors do not
+require any recovery actions. The AER driver clears the device's
+correctable error status register accordingly and logs these errors.
+
+3.2.2.2 Non-correctable (non-fatal and fatal) errors
+
+If an error message indicates a non-fatal error, performing link reset
+at upstream is not required. The AER driver calls error_detected(dev,
+pci_channel_io_normal) to all drivers associated within a hierarchy in
+question. for example,
+EndPoint<==>DownstreamPort B<==>UpstreamPort A<==>RootPort.
+If Upstream port A captures an AER error, the hierarchy consists of
+Downstream port B and EndPoint.
+
+A driver may return PCI_ERS_RESULT_CAN_RECOVER,
+PCI_ERS_RESULT_DISCONNECT, or PCI_ERS_RESULT_NEED_RESET, depending on
+whether it can recover or the AER driver calls mmio_enabled as next.
+
+If an error message indicates a fatal error, kernel will broadcast
+error_detected(dev, pci_channel_io_frozen) to all drivers within
+a hierarchy in question. Then, performing link reset at upstream is
+necessary. As different kinds of devices might use different approaches
+to reset link, AER port service driver is required to provide the
+function to reset link. Firstly, kernel looks for if the upstream
+component has an aer driver. If it has, kernel uses the reset_link
+callback of the aer driver. If the upstream component has no aer driver
+and the port is downstream port, we will use the aer driver of the
+root port who reports the AER error. As for upstream ports,
+they should provide their own aer service drivers with reset_link
+function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
+reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
+to mmio_enabled.
+
+3.3 helper functions
+
+3.3.1 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
+pci_enable_pcie_error_reporting enables the device to send error
+messages to root port when an error is detected. Note that devices
+don't enable the error reporting by default, so device drivers need
+call this function to enable it.
+
+3.3.2 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+pci_disable_pcie_error_reporting disables the device to send error
+messages to root port when an error is detected.
+
+3.3.3 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
+pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable
+error status register.
+
+3.4 Frequent Asked Questions
+
+Q: What happens if a PCI Express device driver does not provide an
+error recovery handler (pci_driver->err_handler is equal to NULL)?
+
+A: The devices attached with the driver won't be recovered. If the
+error is fatal, kernel will print out warning messages. Please refer
+to section 3 for more information.
+
+Q: What happens if an upstream port service driver does not provide
+callback reset_link?
+
+A: Fatal error recovery will fail if the errors are reported by the
+upstream ports who are attached by the service driver.
+
+Q: How does this infrastructure deal with driver that is not PCI
+Express aware?
+
+A: This infrastructure calls the error callback functions of the
+driver when an error happens. But if the driver is not aware of
+PCI Express, the device might not report its own errors to root
+port.
+
+Q: What modifications will that driver need to make it compatible
+with the PCI Express AER Root driver?
+
+A: It could call the helper functions to enable AER in devices and
+cleanup uncorrectable status register. Pls. refer to section 3.3.
+
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
new file mode 100644
index 0000000..461481d
--- /dev/null
+++ b/Documentation/RCU/00-INDEX
@@ -0,0 +1,22 @@
+00-INDEX
+ - This file
+arrayRCU.txt
+ - Using RCU to Protect Read-Mostly Arrays
+checklist.txt
+ - Review Checklist for RCU Patches
+listRCU.txt
+ - Using RCU to Protect Read-Mostly Linked Lists
+NMI-RCU.txt
+ - Using RCU to Protect Dynamic NMI Handlers
+rcuref.txt
+ - Reference-count design for elements of lists/arrays protected by RCU
+rcu.txt
+ - RCU Concepts
+RTFP.txt
+ - List of RCU papers (bibliography) going back to 1980.
+torture.txt
+ - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
+UP.txt
+ - RCU on Uniprocessor Systems
+whatisRCU.txt
+ - What is RCU?
diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt
new file mode 100644
index 0000000..a6d32e6
--- /dev/null
+++ b/Documentation/RCU/NMI-RCU.txt
@@ -0,0 +1,115 @@
+Using RCU to Protect Dynamic NMI Handlers
+
+
+Although RCU is usually used to protect read-mostly data structures,
+it is possible to use RCU to provide dynamic non-maskable interrupt
+handlers, as well as dynamic irq handlers. This document describes
+how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer
+work in "arch/i386/oprofile/nmi_timer_int.c" and in
+"arch/i386/kernel/traps.c".
+
+The relevant pieces of code are listed below, each followed by a
+brief explanation.
+
+ static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
+ {
+ return 0;
+ }
+
+The dummy_nmi_callback() function is a "dummy" NMI handler that does
+nothing, but returns zero, thus saying that it did nothing, allowing
+the NMI handler to take the default machine-specific action.
+
+ static nmi_callback_t nmi_callback = dummy_nmi_callback;
+
+This nmi_callback variable is a global function pointer to the current
+NMI handler.
+
+ void do_nmi(struct pt_regs * regs, long error_code)
+ {
+ int cpu;
+
+ nmi_enter();
+
+ cpu = smp_processor_id();
+ ++nmi_count(cpu);
+
+ if (!rcu_dereference(nmi_callback)(regs, cpu))
+ default_do_nmi(regs);
+
+ nmi_exit();
+ }
+
+The do_nmi() function processes each NMI. It first disables preemption
+in the same way that a hardware irq would, then increments the per-CPU
+count of NMIs. It then invokes the NMI handler stored in the nmi_callback
+function pointer. If this handler returns zero, do_nmi() invokes the
+default_do_nmi() function to handle a machine-specific NMI. Finally,
+preemption is restored.
+
+Strictly speaking, rcu_dereference() is not needed, since this code runs
+only on i386, which does not need rcu_dereference() anyway. However,
+it is a good documentation aid, particularly for anyone attempting to
+do something similar on Alpha.
+
+Quick Quiz: Why might the rcu_dereference() be necessary on Alpha,
+ given that the code referenced by the pointer is read-only?
+
+
+Back to the discussion of NMI and RCU...
+
+ void set_nmi_callback(nmi_callback_t callback)
+ {
+ rcu_assign_pointer(nmi_callback, callback);
+ }
+
+The set_nmi_callback() function registers an NMI handler. Note that any
+data that is to be used by the callback must be initialized up -before-
+the call to set_nmi_callback(). On architectures that do not order
+writes, the rcu_assign_pointer() ensures that the NMI handler sees the
+initialized values.
+
+ void unset_nmi_callback(void)
+ {
+ rcu_assign_pointer(nmi_callback, dummy_nmi_callback);
+ }
+
+This function unregisters an NMI handler, restoring the original
+dummy_nmi_handler(). However, there may well be an NMI handler
+currently executing on some other CPU. We therefore cannot free
+up any data structures used by the old NMI handler until execution
+of it completes on all other CPUs.
+
+One way to accomplish this is via synchronize_sched(), perhaps as
+follows:
+
+ unset_nmi_callback();
+ synchronize_sched();
+ kfree(my_nmi_data);
+
+This works because synchronize_sched() blocks until all CPUs complete
+any preemption-disabled segments of code that they were executing.
+Since NMI handlers disable preemption, synchronize_sched() is guaranteed
+not to return until all ongoing NMI handlers exit. It is therefore safe
+to free up the handler's data as soon as synchronize_sched() returns.
+
+Important note: for this to work, the architecture in question must
+invoke irq_enter() and irq_exit() on NMI entry and exit, respectively.
+
+
+Answer to Quick Quiz
+
+ Why might the rcu_dereference() be necessary on Alpha, given
+ that the code referenced by the pointer is read-only?
+
+ Answer: The caller to set_nmi_callback() might well have
+ initialized some data that is to be used by the
+ new NMI handler. In this case, the rcu_dereference()
+ would be needed, because otherwise a CPU that received
+ an NMI just after the new handler was set might see
+ the pointer to the new NMI handler, but the old
+ pre-initialized version of the handler's data.
+
+ More important, the rcu_dereference() makes it clear
+ to someone reading the code that the pointer is being
+ protected by RCU.
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
new file mode 100644
index 0000000..9f711d2
--- /dev/null
+++ b/Documentation/RCU/RTFP.txt
@@ -0,0 +1,745 @@
+Read the F-ing Papers!
+
+
+This document describes RCU-related publications, and is followed by
+the corresponding bibtex entries. A number of the publications may
+be found at http://www.rdrop.com/users/paulmck/RCU/.
+
+The first thing resembling RCU was published in 1980, when Kung and Lehman
+[Kung80] recommended use of a garbage collector to defer destruction
+of nodes in a parallel binary search tree in order to simplify its
+implementation. This works well in environments that have garbage
+collectors, but most production garbage collectors incur significant
+overhead.
+
+In 1982, Manber and Ladner [Manber82,Manber84] recommended deferring
+destruction until all threads running at that time have terminated, again
+for a parallel binary search tree. This approach works well in systems
+with short-lived threads, such as the K42 research operating system.
+However, Linux has long-lived tasks, so more is needed.
+
+In 1986, Hennessy, Osisek, and Seigh [Hennessy89] introduced passive
+serialization, which is an RCU-like mechanism that relies on the presence
+of "quiescent states" in the VM/XA hypervisor that are guaranteed not
+to be referencing the data structure. However, this mechanism was not
+optimized for modern computer systems, which is not surprising given
+that these overheads were not so expensive in the mid-80s. Nonetheless,
+passive serialization appears to be the first deferred-destruction
+mechanism to be used in production. Furthermore, the relevant patent has
+lapsed, so this approach may be used in non-GPL software, if desired.
+(In contrast, use of RCU is permitted only in software licensed under
+GPL. Sorry!!!)
+
+In 1990, Pugh [Pugh90] noted that explicitly tracking which threads
+were reading a given data structure permitted deferred free to operate
+in the presence of non-terminating threads. However, this explicit
+tracking imposes significant read-side overhead, which is undesirable
+in read-mostly situations. This algorithm does take pains to avoid
+write-side contention and parallelize the other write-side overheads by
+providing a fine-grained locking design, however, it would be interesting
+to see how much of the performance advantage reported in 1990 remains
+in 2004.
+
+At about this same time, Adams [Adams91] described ``chaotic relaxation'',
+where the normal barriers between successive iterations of convergent
+numerical algorithms are relaxed, so that iteration $n$ might use
+data from iteration $n-1$ or even $n-2$. This introduces error,
+which typically slows convergence and thus increases the number of
+iterations required. However, this increase is sometimes more than made
+up for by a reduction in the number of expensive barrier operations,
+which are otherwise required to synchronize the threads at the end
+of each iteration. Unfortunately, chaotic relaxation requires highly
+structured data, such as the matrices used in scientific programs, and
+is thus inapplicable to most data structures in operating-system kernels.
+
+In 1992, Henry (now Alexia) Massalin completed a dissertation advising
+parallel programmers to defer processing when feasible to simplify
+synchronization. RCU makes extremely heavy use of this advice.
+
+In 1993, Jacobson [Jacobson93] verbally described what is perhaps the
+simplest deferred-free technique: simply waiting a fixed amount of time
+before freeing blocks awaiting deferred free. Jacobson did not describe
+any write-side changes he might have made in this work using SGI's Irix
+kernel. Aju John published a similar technique in 1995 [AjuJohn95].
+This works well if there is a well-defined upper bound on the length of
+time that reading threads can hold references, as there might well be in
+hard real-time systems. However, if this time is exceeded, perhaps due
+to preemption, excessive interrupts, or larger-than-anticipated load,
+memory corruption can ensue, with no reasonable means of diagnosis.
+Jacobson's technique is therefore inappropriate for use in production
+operating-system kernels, except when such kernels can provide hard
+real-time response guarantees for all operations.
+
+Also in 1995, Pu et al. [Pu95a] applied a technique similar to that of Pugh's
+read-side-tracking to permit replugging of algorithms within a commercial
+Unix operating system. However, this replugging permitted only a single
+reader at a time. The following year, this same group of researchers
+extended their technique to allow for multiple readers [Cowan96a].
+Their approach requires memory barriers (and thus pipeline stalls),
+but reduces memory latency, contention, and locking overheads.
+
+1995 also saw the first publication of DYNIX/ptx's RCU mechanism
+[Slingwine95], which was optimized for modern CPU architectures,
+and was successfully applied to a number of situations within the
+DYNIX/ptx kernel. The corresponding conference paper appeared in 1998
+[McKenney98].
+
+In 1999, the Tornado and K42 groups described their "generations"
+mechanism, which quite similar to RCU [Gamsa99]. These operating systems
+made pervasive use of RCU in place of "existence locks", which greatly
+simplifies locking hierarchies.
+
+2001 saw the first RCU presentation involving Linux [McKenney01a]
+at OLS. The resulting abundance of RCU patches was presented the
+following year [McKenney02a], and use of RCU in dcache was first
+described that same year [Linder02a].
+
+Also in 2002, Michael [Michael02b,Michael02a] presented "hazard-pointer"
+techniques that defer the destruction of data structures to simplify
+non-blocking synchronization (wait-free synchronization, lock-free
+synchronization, and obstruction-free synchronization are all examples of
+non-blocking synchronization). In particular, this technique eliminates
+locking, reduces contention, reduces memory latency for readers, and
+parallelizes pipeline stalls and memory latency for writers. However,
+these techniques still impose significant read-side overhead in the
+form of memory barriers. Researchers at Sun worked along similar lines
+in the same timeframe [HerlihyLM02]. These techniques can be thought
+of as inside-out reference counts, where the count is represented by the
+number of hazard pointers referencing a given data structure (rather than
+the more conventional counter field within the data structure itself).
+
+By the same token, RCU can be thought of as a "bulk reference count",
+where some form of reference counter covers all reference by a given CPU
+or thread during a set timeframe. This timeframe is related to, but
+not necessarily exactly the same as, an RCU grace period. In classic
+RCU, the reference counter is the per-CPU bit in the "bitmask" field,
+and each such bit covers all references that might have been made by
+the corresponding CPU during the prior grace period. Of course, RCU
+can be thought of in other terms as well.
+
+In 2003, the K42 group described how RCU could be used to create
+hot-pluggable implementations of operating-system functions [Appavoo03a].
+Later that year saw a paper describing an RCU implementation of System
+V IPC [Arcangeli03], and an introduction to RCU in Linux Journal
+[McKenney03a].
+
+2004 has seen a Linux-Journal article on use of RCU in dcache
+[McKenney04a], a performance comparison of locking to RCU on several
+different CPUs [McKenney04b], a dissertation describing use of RCU in a
+number of operating-system kernels [PaulEdwardMcKenneyPhD], a paper
+describing how to make RCU safe for soft-realtime applications [Sarma04c],
+and a paper describing SELinux performance with RCU [JamesMorris04b].
+
+2005 brought further adaptation of RCU to realtime use, permitting
+preemption of RCU realtime critical sections [PaulMcKenney05a,
+PaulMcKenney05b].
+
+2006 saw the first best-paper award for an RCU paper [ThomasEHart2006a],
+as well as further work on efficient implementations of preemptible
+RCU [PaulEMcKenney2006b], but priority-boosting of RCU read-side critical
+sections proved elusive. An RCU implementation permitting general
+blocking in read-side critical sections appeared [PaulEMcKenney2006c],
+Robert Olsson described an RCU-protected trie-hash combination
+[RobertOlsson2006a].
+
+2007 saw the journal version of the award-winning RCU paper from 2006
+[ThomasEHart2007a], as well as a paper demonstrating use of Promela
+and Spin to mechanically verify an optimization to Oleg Nesterov's
+QRCU [PaulEMcKenney2007QRCUspin], a design document describing
+preemptible RCU [PaulEMcKenney2007PreemptibleRCU], and the three-part
+LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally,
+PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI].
+
+Bibtex Entries
+
+@article{Kung80
+,author="H. T. Kung and Q. Lehman"
+,title="Concurrent Maintenance of Binary Search Trees"
+,Year="1980"
+,Month="September"
+,journal="ACM Transactions on Database Systems"
+,volume="5"
+,number="3"
+,pages="354-382"
+}
+
+@techreport{Manber82
+,author="Udi Manber and Richard E. Ladner"
+,title="Concurrency Control in a Dynamic Search Structure"
+,institution="Department of Computer Science, University of Washington"
+,address="Seattle, Washington"
+,year="1982"
+,number="82-01-01"
+,month="January"
+,pages="28"
+}
+
+@article{Manber84
+,author="Udi Manber and Richard E. Ladner"
+,title="Concurrency Control in a Dynamic Search Structure"
+,Year="1984"
+,Month="September"
+,journal="ACM Transactions on Database Systems"
+,volume="9"
+,number="3"
+,pages="439-455"
+}
+
+@techreport{Hennessy89
+,author="James P. Hennessy and Damian L. Osisek and Joseph W. {Seigh II}"
+,title="Passive Serialization in a Multitasking Environment"
+,institution="US Patent and Trademark Office"
+,address="Washington, DC"
+,year="1989"
+,number="US Patent 4,809,168 (lapsed)"
+,month="February"
+,pages="11"
+}
+
+@techreport{Pugh90
+,author="William Pugh"
+,title="Concurrent Maintenance of Skip Lists"
+,institution="Institute of Advanced Computer Science Studies, Department of Computer Science, University of Maryland"
+,address="College Park, Maryland"
+,year="1990"
+,number="CS-TR-2222.1"
+,month="June"
+}
+
+@Book{Adams91
+,Author="Gregory R. Adams"
+,title="Concurrent Programming, Principles, and Practices"
+,Publisher="Benjamin Cummins"
+,Year="1991"
+}
+
+@phdthesis{HMassalinPhD
+,author="H. Massalin"
+,title="Synthesis: An Efficient Implementation of Fundamental Operating
+System Services"
+,school="Columbia University"
+,address="New York, NY"
+,year="1992"
+,annotation="
+ Mondo optimizing compiler.
+ Wait-free stuff.
+ Good advice: defer work to avoid synchronization.
+"
+}
+
+@unpublished{Jacobson93
+,author="Van Jacobson"
+,title="Avoid Read-Side Locking Via Delayed Free"
+,year="1993"
+,month="September"
+,note="Verbal discussion"
+}
+
+@Conference{AjuJohn95
+,Author="Aju John"
+,Title="Dynamic vnodes -- Design and Implementation"
+,Booktitle="{USENIX Winter 1995}"
+,Publisher="USENIX Association"
+,Month="January"
+,Year="1995"
+,pages="11-23"
+,Address="New Orleans, LA"
+}
+
+@conference{Pu95a,
+Author = "Calton Pu and Tito Autrey and Andrew Black and Charles Consel and
+Crispin Cowan and Jon Inouye and Lakshmi Kethana and Jonathan Walpole and
+Ke Zhang",
+Title = "Optimistic Incremental Specialization: Streamlining a Commercial
+Operating System",
+Booktitle = "15\textsuperscript{th} ACM Symposium on
+Operating Systems Principles (SOSP'95)",
+address = "Copper Mountain, CO",
+month="December",
+year="1995",
+pages="314-321",
+annotation="
+ Uses a replugger, but with a flag to signal when people are
+ using the resource at hand. Only one reader at a time.
+"
+}
+
+@conference{Cowan96a,
+Author = "Crispin Cowan and Tito Autrey and Charles Krasic and
+Calton Pu and Jonathan Walpole",
+Title = "Fast Concurrent Dynamic Linking for an Adaptive Operating System",
+Booktitle = "International Conference on Configurable Distributed Systems
+(ICCDS'96)",
+address = "Annapolis, MD",
+month="May",
+year="1996",
+pages="108",
+isbn="0-8186-7395-8",
+annotation="
+ Uses a replugger, but with a counter to signal when people are
+ using the resource at hand. Allows multiple readers.
+"
+}
+
+@techreport{Slingwine95
+,author="John D. Slingwine and Paul E. McKenney"
+,title="Apparatus and Method for Achieving Reduced Overhead Mutual
+Exclusion and Maintaining Coherency in a Multiprocessor System
+Utilizing Execution History and Thread Monitoring"
+,institution="US Patent and Trademark Office"
+,address="Washington, DC"
+,year="1995"
+,number="US Patent 5,442,758 (contributed under GPL)"
+,month="August"
+}
+
+@techreport{Slingwine97
+,author="John D. Slingwine and Paul E. McKenney"
+,title="Method for maintaining data coherency using thread
+activity summaries in a multicomputer system"
+,institution="US Patent and Trademark Office"
+,address="Washington, DC"
+,year="1997"
+,number="US Patent 5,608,893 (contributed under GPL)"
+,month="March"
+}
+
+@techreport{Slingwine98
+,author="John D. Slingwine and Paul E. McKenney"
+,title="Apparatus and method for achieving reduced overhead
+mutual exclusion and maintaining coherency in a multiprocessor
+system utilizing execution history and thread monitoring"
+,institution="US Patent and Trademark Office"
+,address="Washington, DC"
+,year="1998"
+,number="US Patent 5,727,209 (contributed under GPL)"
+,month="March"
+}
+
+@Conference{McKenney98
+,Author="Paul E. McKenney and John D. Slingwine"
+,Title="Read-Copy Update: Using Execution History to Solve Concurrency
+Problems"
+,Booktitle="{Parallel and Distributed Computing and Systems}"
+,Month="October"
+,Year="1998"
+,pages="509-518"
+,Address="Las Vegas, NV"
+}
+
+@Conference{Gamsa99
+,Author="Ben Gamsa and Orran Krieger and Jonathan Appavoo and Michael Stumm"
+,Title="Tornado: Maximizing Locality and Concurrency in a Shared Memory
+Multiprocessor Operating System"
+,Booktitle="{Proceedings of the 3\textsuperscript{rd} Symposium on
+Operating System Design and Implementation}"
+,Month="February"
+,Year="1999"
+,pages="87-100"
+,Address="New Orleans, LA"
+}
+
+@techreport{Slingwine01
+,author="John D. Slingwine and Paul E. McKenney"
+,title="Apparatus and method for achieving reduced overhead
+mutual exclusion and maintaining coherency in a multiprocessor
+system utilizing execution history and thread monitoring"
+,institution="US Patent and Trademark Office"
+,address="Washington, DC"
+,year="2001"
+,number="US Patent 5,219,690 (contributed under GPL)"
+,month="April"
+}
+
+@Conference{McKenney01a
+,Author="Paul E. McKenney and Jonathan Appavoo and Andi Kleen and
+Orran Krieger and Rusty Russell and Dipankar Sarma and Maneesh Soni"
+,Title="Read-Copy Update"
+,Booktitle="{Ottawa Linux Symposium}"
+,Month="July"
+,Year="2001"
+,note="Available:
+\url{http://www.linuxsymposium.org/2001/abstracts/readcopy.php}
+\url{http://www.rdrop.com/users/paulmck/rclock/rclock_OLS.2001.05.01c.pdf}
+[Viewed June 23, 2004]"
+annotation="
+Described RCU, and presented some patches implementing and using it in
+the Linux kernel.
+"
+}
+
+@Conference{Linder02a
+,Author="Hanna Linder and Dipankar Sarma and Maneesh Soni"
+,Title="Scalability of the Directory Entry Cache"
+,Booktitle="{Ottawa Linux Symposium}"
+,Month="June"
+,Year="2002"
+,pages="289-300"
+}
+
+@Conference{McKenney02a
+,Author="Paul E. McKenney and Dipankar Sarma and
+Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
+,Title="Read-Copy Update"
+,Booktitle="{Ottawa Linux Symposium}"
+,Month="June"
+,Year="2002"
+,pages="338-367"
+,note="Available:
+\url{http://www.linux.org.uk/~ajh/ols2002_proceedings.pdf.gz}
+[Viewed June 23, 2004]"
+}
+
+@conference{Michael02a
+,author="Maged M. Michael"
+,title="Safe Memory Reclamation for Dynamic Lock-Free Objects Using Atomic
+Reads and Writes"
+,Year="2002"
+,Month="August"
+,booktitle="{Proceedings of the 21\textsuperscript{st} Annual ACM
+Symposium on Principles of Distributed Computing}"
+,pages="21-30"
+,annotation="
+ Each thread keeps an array of pointers to items that it is
+ currently referencing. Sort of an inside-out garbage collection
+ mechanism, but one that requires the accessing code to explicitly
+ state its needs. Also requires read-side memory barriers on
+ most architectures.
+"
+}
+
+@conference{Michael02b
+,author="Maged M. Michael"
+,title="High Performance Dynamic Lock-Free Hash Tables and List-Based Sets"
+,Year="2002"
+,Month="August"
+,booktitle="{Proceedings of the 14\textsuperscript{th} Annual ACM
+Symposium on Parallel
+Algorithms and Architecture}"
+,pages="73-82"
+,annotation="
+ Like the title says...
+"
+}
+
+@InProceedings{HerlihyLM02
+,author={Maurice Herlihy and Victor Luchangco and Mark Moir}
+,title="The Repeat Offender Problem: A Mechanism for Supporting Dynamic-Sized,
+Lock-Free Data Structures"
+,booktitle={Proceedings of 16\textsuperscript{th} International
+Symposium on Distributed Computing}
+,year=2002
+,month="October"
+,pages="339-353"
+}
+
+@article{Appavoo03a
+,author="J. Appavoo and K. Hui and C. A. N. Soules and R. W. Wisniewski and
+D. M. {Da Silva} and O. Krieger and M. A. Auslander and D. J. Edelsohn and
+B. Gamsa and G. R. Ganger and P. McKenney and M. Ostrowski and
+B. Rosenburg and M. Stumm and J. Xenidis"
+,title="Enabling Autonomic Behavior in Systems Software With Hot Swapping"
+,Year="2003"
+,Month="January"
+,journal="IBM Systems Journal"
+,volume="42"
+,number="1"
+,pages="60-76"
+}
+
+@Conference{Arcangeli03
+,Author="Andrea Arcangeli and Mingming Cao and Paul E. McKenney and
+Dipankar Sarma"
+,Title="Using Read-Copy Update Techniques for {System V IPC} in the
+{Linux} 2.5 Kernel"
+,Booktitle="Proceedings of the 2003 USENIX Annual Technical Conference
+(FREENIX Track)"
+,Publisher="USENIX Association"
+,year="2003"
+,month="June"
+,pages="297-310"
+}
+
+@article{McKenney03a
+,author="Paul E. McKenney"
+,title="Using {RCU} in the {Linux} 2.5 Kernel"
+,Year="2003"
+,Month="October"
+,journal="Linux Journal"
+,volume="1"
+,number="114"
+,pages="18-26"
+}
+
+@techreport{Friedberg03a
+,author="Stuart A. Friedberg"
+,title="Lock-Free Wild Card Search Data Structure and Method"
+,institution="US Patent and Trademark Office"
+,address="Washington, DC"
+,year="2003"
+,number="US Patent 6,662,184 (contributed under GPL)"
+,month="December"
+,pages="112"
+}
+
+@article{McKenney04a
+,author="Paul E. McKenney and Dipankar Sarma and Maneesh Soni"
+,title="Scaling dcache with {RCU}"
+,Year="2004"
+,Month="January"
+,journal="Linux Journal"
+,volume="1"
+,number="118"
+,pages="38-46"
+}
+
+@Conference{McKenney04b
+,Author="Paul E. McKenney"
+,Title="{RCU} vs. Locking Performance on Different {CPUs}"
+,Booktitle="{linux.conf.au}"
+,Month="January"
+,Year="2004"
+,Address="Adelaide, Australia"
+,note="Available:
+\url{http://www.linux.org.au/conf/2004/abstracts.html#90}
+\url{http://www.rdrop.com/users/paulmck/rclock/lockperf.2004.01.17a.pdf}
+[Viewed June 23, 2004]"
+}
+
+@phdthesis{PaulEdwardMcKenneyPhD
+,author="Paul E. McKenney"
+,title="Exploiting Deferred Destruction:
+An Analysis of Read-Copy-Update Techniques
+in Operating System Kernels"
+,school="OGI School of Science and Engineering at
+Oregon Health and Sciences University"
+,year="2004"
+,note="Available:
+\url{http://www.rdrop.com/users/paulmck/RCU/RCUdissertation.2004.07.14e1.pdf}
+[Viewed October 15, 2004]"
+}
+
+@Conference{Sarma04c
+,Author="Dipankar Sarma and Paul E. McKenney"
+,Title="Making RCU Safe for Deep Sub-Millisecond Response Realtime Applications"
+,Booktitle="Proceedings of the 2004 USENIX Annual Technical Conference
+(FREENIX Track)"
+,Publisher="USENIX Association"
+,year="2004"
+,month="June"
+,pages="182-191"
+}
+
+@unpublished{JamesMorris04b
+,Author="James Morris"
+,Title="Recent Developments in {SELinux} Kernel Performance"
+,month="December"
+,year="2004"
+,note="Available:
+\url{http://www.livejournal.com/users/james_morris/2153.html}
+[Viewed December 10, 2004]"
+}
+
+@unpublished{PaulMcKenney05a
+,Author="Paul E. McKenney"
+,Title="{[RFC]} {RCU} and {CONFIG\_PREEMPT\_RT} progress"
+,month="May"
+,year="2005"
+,note="Available:
+\url{http://lkml.org/lkml/2005/5/9/185}
+[Viewed May 13, 2005]"
+,annotation="
+ First publication of working lock-based deferred free patches
+ for the CONFIG_PREEMPT_RT environment.
+"
+}
+
+@conference{PaulMcKenney05b
+,Author="Paul E. McKenney and Dipankar Sarma"
+,Title="Towards Hard Realtime Response from the Linux Kernel on SMP Hardware"
+,Booktitle="linux.conf.au 2005"
+,month="April"
+,year="2005"
+,address="Canberra, Australia"
+,note="Available:
+\url{http://www.rdrop.com/users/paulmck/RCU/realtimeRCU.2005.04.23a.pdf}
+[Viewed May 13, 2005]"
+,annotation="
+ Realtime turns into making RCU yet more realtime friendly.
+"
+}
+
+@conference{ThomasEHart2006a
+,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown"
+,Title="Making Lockless Synchronization Fast: Performance Implications
+of Memory Reclamation"
+,Booktitle="20\textsuperscript{th} {IEEE} International Parallel and
+Distributed Processing Symposium"
+,month="April"
+,year="2006"
+,day="25-29"
+,address="Rhodes, Greece"
+,annotation="
+ Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free
+ reference counting.
+"
+}
+
+@Conference{PaulEMcKenney2006b
+,Author="Paul E. McKenney and Dipankar Sarma and Ingo Molnar and
+Suparna Bhattacharya"
+,Title="Extending RCU for Realtime and Embedded Workloads"
+,Booktitle="{Ottawa Linux Symposium}"
+,Month="July"
+,Year="2006"
+,pages="v2 123-138"
+,note="Available:
+\url{http://www.linuxsymposium.org/2006/view_abstract.php?content_key=184}
+\url{http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf}
+[Viewed January 1, 2007]"
+,annotation="
+ Described how to improve the -rt implementation of realtime RCU.
+"
+}
+
+@unpublished{PaulEMcKenney2006c
+,Author="Paul E. McKenney"
+,Title="Sleepable {RCU}"
+,month="October"
+,day="9"
+,year="2006"
+,note="Available:
+\url{http://lwn.net/Articles/202847/}
+Revised:
+\url{http://www.rdrop.com/users/paulmck/RCU/srcu.2007.01.14a.pdf}
+[Viewed August 21, 2006]"
+,annotation="
+ LWN article introducing SRCU.
+"
+}
+
+@unpublished{RobertOlsson2006a
+,Author="Robert Olsson and Stefan Nilsson"
+,Title="{TRASH}: A dynamic {LC}-trie and hash data structure"
+,month="August"
+,day="18"
+,year="2006"
+,note="Available:
+\url{http://www.nada.kth.se/~snilsson/public/papers/trash/trash.pdf}
+[Viewed February 24, 2007]"
+,annotation="
+ RCU-protected dynamic trie-hash combination.
+"
+}
+
+@unpublished{ThomasEHart2007a
+,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown and Jonathan Walpole"
+,Title="Performance of memory reclamation for lockless synchronization"
+,journal="J. Parallel Distrib. Comput."
+,year="2007"
+,note="To appear in J. Parallel Distrib. Comput.
+ \url{doi=10.1016/j.jpdc.2007.04.010}"
+,annotation={
+ Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free
+ reference counting. Journal version of ThomasEHart2006a.
+}
+}
+
+@unpublished{PaulEMcKenney2007QRCUspin
+,Author="Paul E. McKenney"
+,Title="Using Promela and Spin to verify parallel algorithms"
+,month="August"
+,day="1"
+,year="2007"
+,note="Available:
+\url{http://lwn.net/Articles/243851/}
+[Viewed September 8, 2007]"
+,annotation="
+ LWN article describing Promela and spin, and also using Oleg
+ Nesterov's QRCU as an example (with Paul McKenney's fastpath).
+"
+}
+
+@unpublished{PaulEMcKenney2007PreemptibleRCU
+,Author="Paul E. McKenney"
+,Title="The design of preemptible read-copy-update"
+,month="October"
+,day="8"
+,year="2007"
+,note="Available:
+\url{http://lwn.net/Articles/253651/}
+[Viewed October 25, 2007]"
+,annotation="
+ LWN article describing the design of preemptible RCU.
+"
+}
+
+########################################################################
+#
+# "What is RCU?" LWN series.
+#
+
+@unpublished{PaulEMcKenney2007WhatIsRCUFundamentally
+,Author="Paul E. McKenney and Jonathan Walpole"
+,Title="What is {RCU}, Fundamentally?"
+,month="December"
+,day="17"
+,year="2007"
+,note="Available:
+\url{http://lwn.net/Articles/262464/}
+[Viewed December 27, 2007]"
+,annotation="
+ Lays out the three basic components of RCU: (1) publish-subscribe,
+ (2) wait for pre-existing readers to complete, and (2) maintain
+ multiple versions.
+"
+}
+
+@unpublished{PaulEMcKenney2008WhatIsRCUUsage
+,Author="Paul E. McKenney"
+,Title="What is {RCU}? Part 2: Usage"
+,month="January"
+,day="4"
+,year="2008"
+,note="Available:
+\url{http://lwn.net/Articles/263130/}
+[Viewed January 4, 2008]"
+,annotation="
+ Lays out six uses of RCU:
+ 1. RCU is a Reader-Writer Lock Replacement
+ 2. RCU is a Restricted Reference-Counting Mechanism
+ 3. RCU is a Bulk Reference-Counting Mechanism
+ 4. RCU is a Poor Man's Garbage Collector
+ 5. RCU is a Way of Providing Existence Guarantees
+ 6. RCU is a Way of Waiting for Things to Finish
+"
+}
+
+@unpublished{PaulEMcKenney2008WhatIsRCUAPI
+,Author="Paul E. McKenney"
+,Title="{RCU} part 3: the {RCU} {API}"
+,month="January"
+,day="17"
+,year="2008"
+,note="Available:
+\url{http://lwn.net/Articles/264090/}
+[Viewed January 10, 2008]"
+,annotation="
+ Gives an overview of the Linux-kernel RCU API and a brief annotated RCU
+ bibliography.
+"
+}
+
+@article{DinakarGuniguntala2008IBMSysJ
+,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole"
+,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}"
+,Year="2008"
+,Month="April"
+,journal="IBM Systems Journal"
+,volume="47"
+,number="2"
+,pages="@@-@@"
+,annotation="
+ RCU, realtime RCU, sleepable RCU, performance.
+"
+}
diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt
new file mode 100644
index 0000000..aab4a9e
--- /dev/null
+++ b/Documentation/RCU/UP.txt
@@ -0,0 +1,119 @@
+RCU on Uniprocessor Systems
+
+
+A common misconception is that, on UP systems, the call_rcu() primitive
+may immediately invoke its function, and that the synchronize_rcu()
+primitive may return immediately. The basis of this misconception
+is that since there is only one CPU, it should not be necessary to
+wait for anything else to get done, since there are no other CPUs for
+anything else to be happening on. Although this approach will -sort- -of-
+work a surprising amount of the time, it is a very bad idea in general.
+This document presents three examples that demonstrate exactly how bad an
+idea this is.
+
+
+Example 1: softirq Suicide
+
+Suppose that an RCU-based algorithm scans a linked list containing
+elements A, B, and C in process context, and can delete elements from
+this same list in softirq context. Suppose that the process-context scan
+is referencing element B when it is interrupted by softirq processing,
+which deletes element B, and then invokes call_rcu() to free element B
+after a grace period.
+
+Now, if call_rcu() were to directly invoke its arguments, then upon return
+from softirq, the list scan would find itself referencing a newly freed
+element B. This situation can greatly decrease the life expectancy of
+your kernel.
+
+This same problem can occur if call_rcu() is invoked from a hardware
+interrupt handler.
+
+
+Example 2: Function-Call Fatality
+
+Of course, one could avert the suicide described in the preceding example
+by having call_rcu() directly invoke its arguments only if it was called
+from process context. However, this can fail in a similar manner.
+
+Suppose that an RCU-based algorithm again scans a linked list containing
+elements A, B, and C in process contexts, but that it invokes a function
+on each element as it is scanned. Suppose further that this function
+deletes element B from the list, then passes it to call_rcu() for deferred
+freeing. This may be a bit unconventional, but it is perfectly legal
+RCU usage, since call_rcu() must wait for a grace period to elapse.
+Therefore, in this case, allowing call_rcu() to immediately invoke
+its arguments would cause it to fail to make the fundamental guarantee
+underlying RCU, namely that call_rcu() defers invoking its arguments until
+all RCU read-side critical sections currently executing have completed.
+
+Quick Quiz #1: why is it -not- legal to invoke synchronize_rcu() in
+ this case?
+
+
+Example 3: Death by Deadlock
+
+Suppose that call_rcu() is invoked while holding a lock, and that the
+callback function must acquire this same lock. In this case, if
+call_rcu() were to directly invoke the callback, the result would
+be self-deadlock.
+
+In some cases, it would possible to restructure to code so that
+the call_rcu() is delayed until after the lock is released. However,
+there are cases where this can be quite ugly:
+
+1. If a number of items need to be passed to call_rcu() within
+ the same critical section, then the code would need to create
+ a list of them, then traverse the list once the lock was
+ released.
+
+2. In some cases, the lock will be held across some kernel API,
+ so that delaying the call_rcu() until the lock is released
+ requires that the data item be passed up via a common API.
+ It is far better to guarantee that callbacks are invoked
+ with no locks held than to have to modify such APIs to allow
+ arbitrary data items to be passed back up through them.
+
+If call_rcu() directly invokes the callback, painful locking restrictions
+or API changes would be required.
+
+Quick Quiz #2: What locking restriction must RCU callbacks respect?
+
+
+Summary
+
+Permitting call_rcu() to immediately invoke its arguments or permitting
+synchronize_rcu() to immediately return breaks RCU, even on a UP system.
+So do not do it! Even on a UP system, the RCU infrastructure -must-
+respect grace periods, and -must- invoke callbacks from a known environment
+in which no locks are held.
+
+
+Answer to Quick Quiz #1:
+ Why is it -not- legal to invoke synchronize_rcu() in this case?
+
+ Because the calling function is scanning an RCU-protected linked
+ list, and is therefore within an RCU read-side critical section.
+ Therefore, the called function has been invoked within an RCU
+ read-side critical section, and is not permitted to block.
+
+Answer to Quick Quiz #2:
+ What locking restriction must RCU callbacks respect?
+
+ Any lock that is acquired within an RCU callback must be
+ acquired elsewhere using an _irq variant of the spinlock
+ primitive. For example, if "mylock" is acquired by an
+ RCU callback, then a process-context acquisition of this
+ lock must use something like spin_lock_irqsave() to
+ acquire the lock.
+
+ If the process-context code were to simply use spin_lock(),
+ then, since RCU callbacks can be invoked from softirq context,
+ the callback might be called from a softirq that interrupted
+ the process-context critical section. This would result in
+ self-deadlock.
+
+ This restriction might seem gratuitous, since very few RCU
+ callbacks acquire locks directly. However, a great many RCU
+ callbacks do acquire locks -indirectly-, for example, via
+ the kfree() primitive.
diff --git a/Documentation/RCU/arrayRCU.txt b/Documentation/RCU/arrayRCU.txt
new file mode 100644
index 0000000..453ebe6
--- /dev/null
+++ b/Documentation/RCU/arrayRCU.txt
@@ -0,0 +1,141 @@
+Using RCU to Protect Read-Mostly Arrays
+
+
+Although RCU is more commonly used to protect linked lists, it can
+also be used to protect arrays. Three situations are as follows:
+
+1. Hash Tables
+
+2. Static Arrays
+
+3. Resizeable Arrays
+
+Each of these situations are discussed below.
+
+
+Situation 1: Hash Tables
+
+Hash tables are often implemented as an array, where each array entry
+has a linked-list hash chain. Each hash chain can be protected by RCU
+as described in the listRCU.txt document. This approach also applies
+to other array-of-list situations, such as radix trees.
+
+
+Situation 2: Static Arrays
+
+Static arrays, where the data (rather than a pointer to the data) is
+located in each array element, and where the array is never resized,
+have not been used with RCU. Rik van Riel recommends using seqlock in
+this situation, which would also have minimal read-side overhead as long
+as updates are rare.
+
+Quick Quiz: Why is it so important that updates be rare when
+ using seqlock?
+
+
+Situation 3: Resizeable Arrays
+
+Use of RCU for resizeable arrays is demonstrated by the grow_ary()
+function used by the System V IPC code. The array is used to map from
+semaphore, message-queue, and shared-memory IDs to the data structure
+that represents the corresponding IPC construct. The grow_ary()
+function does not acquire any locks; instead its caller must hold the
+ids->sem semaphore.
+
+The grow_ary() function, shown below, does some limit checks, allocates a
+new ipc_id_ary, copies the old to the new portion of the new, initializes
+the remainder of the new, updates the ids->entries pointer to point to
+the new array, and invokes ipc_rcu_putref() to free up the old array.
+Note that rcu_assign_pointer() is used to update the ids->entries pointer,
+which includes any memory barriers required on whatever architecture
+you are running on.
+
+ static int grow_ary(struct ipc_ids* ids, int newsize)
+ {
+ struct ipc_id_ary* new;
+ struct ipc_id_ary* old;
+ int i;
+ int size = ids->entries->size;
+
+ if(newsize > IPCMNI)
+ newsize = IPCMNI;
+ if(newsize <= size)
+ return newsize;
+
+ new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize +
+ sizeof(struct ipc_id_ary));
+ if(new == NULL)
+ return size;
+ new->size = newsize;
+ memcpy(new->p, ids->entries->p,
+ sizeof(struct kern_ipc_perm *)*size +
+ sizeof(struct ipc_id_ary));
+ for(i=size;i<newsize;i++) {
+ new->p[i] = NULL;
+ }
+ old = ids->entries;
+
+ /*
+ * Use rcu_assign_pointer() to make sure the memcpyed
+ * contents of the new array are visible before the new
+ * array becomes visible.
+ */
+ rcu_assign_pointer(ids->entries, new);
+
+ ipc_rcu_putref(old);
+ return newsize;
+ }
+
+The ipc_rcu_putref() function decrements the array's reference count
+and then, if the reference count has dropped to zero, uses call_rcu()
+to free the array after a grace period has elapsed.
+
+The array is traversed by the ipc_lock() function. This function
+indexes into the array under the protection of rcu_read_lock(),
+using rcu_dereference() to pick up the pointer to the array so
+that it may later safely be dereferenced -- memory barriers are
+required on the Alpha CPU. Since the size of the array is stored
+with the array itself, there can be no array-size mismatches, so
+a simple check suffices. The pointer to the structure corresponding
+to the desired IPC object is placed in "out", with NULL indicating
+a non-existent entry. After acquiring "out->lock", the "out->deleted"
+flag indicates whether the IPC object is in the process of being
+deleted, and, if not, the pointer is returned.
+
+ struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
+ {
+ struct kern_ipc_perm* out;
+ int lid = id % SEQ_MULTIPLIER;
+ struct ipc_id_ary* entries;
+
+ rcu_read_lock();
+ entries = rcu_dereference(ids->entries);
+ if(lid >= entries->size) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ out = entries->p[lid];
+ if(out == NULL) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ spin_lock(&out->lock);
+
+ /* ipc_rmid() may have already freed the ID while ipc_lock
+ * was spinning: here verify that the structure is still valid
+ */
+ if (out->deleted) {
+ spin_unlock(&out->lock);
+ rcu_read_unlock();
+ return NULL;
+ }
+ return out;
+ }
+
+
+Answer to Quick Quiz:
+
+ The reason that it is important that updates be rare when
+ using seqlock is that frequent updates can livelock readers.
+ One way to avoid this problem is to assign a seqlock for
+ each array entry rather than to the entire array.
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
new file mode 100644
index 0000000..6e25340
--- /dev/null
+++ b/Documentation/RCU/checklist.txt
@@ -0,0 +1,300 @@
+Review Checklist for RCU Patches
+
+
+This document contains a checklist for producing and reviewing patches
+that make use of RCU. Violating any of the rules listed below will
+result in the same sorts of problems that leaving out a locking primitive
+would cause. This list is based on experiences reviewing such patches
+over a rather long period of time, but improvements are always welcome!
+
+0. Is RCU being applied to a read-mostly situation? If the data
+ structure is updated more than about 10% of the time, then
+ you should strongly consider some other approach, unless
+ detailed performance measurements show that RCU is nonetheless
+ the right tool for the job.
+
+ Another exception is where performance is not an issue, and RCU
+ provides a simpler implementation. An example of this situation
+ is the dynamic NMI code in the Linux 2.6 kernel, at least on
+ architectures where NMIs are rare.
+
+ Yet another exception is where the low real-time latency of RCU's
+ read-side primitives is critically important.
+
+1. Does the update code have proper mutual exclusion?
+
+ RCU does allow -readers- to run (almost) naked, but -writers- must
+ still use some sort of mutual exclusion, such as:
+
+ a. locking,
+ b. atomic operations, or
+ c. restricting updates to a single task.
+
+ If you choose #b, be prepared to describe how you have handled
+ memory barriers on weakly ordered machines (pretty much all of
+ them -- even x86 allows reads to be reordered), and be prepared
+ to explain why this added complexity is worthwhile. If you
+ choose #c, be prepared to explain how this single task does not
+ become a major bottleneck on big multiprocessor machines (for
+ example, if the task is updating information relating to itself
+ that other tasks can read, there by definition can be no
+ bottleneck).
+
+2. Do the RCU read-side critical sections make proper use of
+ rcu_read_lock() and friends? These primitives are needed
+ to prevent grace periods from ending prematurely, which
+ could result in data being unceremoniously freed out from
+ under your read-side code, which can greatly increase the
+ actuarial risk of your kernel.
+
+ As a rough rule of thumb, any dereference of an RCU-protected
+ pointer must be covered by rcu_read_lock() or rcu_read_lock_bh()
+ or by the appropriate update-side lock.
+
+3. Does the update code tolerate concurrent accesses?
+
+ The whole point of RCU is to permit readers to run without
+ any locks or atomic operations. This means that readers will
+ be running while updates are in progress. There are a number
+ of ways to handle this concurrency, depending on the situation:
+
+ a. Use the RCU variants of the list and hlist update
+ primitives to add, remove, and replace elements on an
+ RCU-protected list. Alternatively, use the RCU-protected
+ trees that have been added to the Linux kernel.
+
+ This is almost always the best approach.
+
+ b. Proceed as in (a) above, but also maintain per-element
+ locks (that are acquired by both readers and writers)
+ that guard per-element state. Of course, fields that
+ the readers refrain from accessing can be guarded by the
+ update-side lock.
+
+ This works quite well, also.
+
+ c. Make updates appear atomic to readers. For example,
+ pointer updates to properly aligned fields will appear
+ atomic, as will individual atomic primitives. Operations
+ performed under a lock and sequences of multiple atomic
+ primitives will -not- appear to be atomic.
+
+ This can work, but is starting to get a bit tricky.
+
+ d. Carefully order the updates and the reads so that
+ readers see valid data at all phases of the update.
+ This is often more difficult than it sounds, especially
+ given modern CPUs' tendency to reorder memory references.
+ One must usually liberally sprinkle memory barriers
+ (smp_wmb(), smp_rmb(), smp_mb()) through the code,
+ making it difficult to understand and to test.
+
+ It is usually better to group the changing data into
+ a separate structure, so that the change may be made
+ to appear atomic by updating a pointer to reference
+ a new structure containing updated values.
+
+4. Weakly ordered CPUs pose special challenges. Almost all CPUs
+ are weakly ordered -- even i386 CPUs allow reads to be reordered.
+ RCU code must take all of the following measures to prevent
+ memory-corruption problems:
+
+ a. Readers must maintain proper ordering of their memory
+ accesses. The rcu_dereference() primitive ensures that
+ the CPU picks up the pointer before it picks up the data
+ that the pointer points to. This really is necessary
+ on Alpha CPUs. If you don't believe me, see:
+
+ http://www.openvms.compaq.com/wizard/wiz_2637.html
+
+ The rcu_dereference() primitive is also an excellent
+ documentation aid, letting the person reading the code
+ know exactly which pointers are protected by RCU.
+
+ The rcu_dereference() primitive is used by the various
+ "_rcu()" list-traversal primitives, such as the
+ list_for_each_entry_rcu(). Note that it is perfectly
+ legal (if redundant) for update-side code to use
+ rcu_dereference() and the "_rcu()" list-traversal
+ primitives. This is particularly useful in code
+ that is common to readers and updaters.
+
+ b. If the list macros are being used, the list_add_tail_rcu()
+ and list_add_rcu() primitives must be used in order
+ to prevent weakly ordered machines from misordering
+ structure initialization and pointer planting.
+ Similarly, if the hlist macros are being used, the
+ hlist_add_head_rcu() primitive is required.
+
+ c. If the list macros are being used, the list_del_rcu()
+ primitive must be used to keep list_del()'s pointer
+ poisoning from inflicting toxic effects on concurrent
+ readers. Similarly, if the hlist macros are being used,
+ the hlist_del_rcu() primitive is required.
+
+ The list_replace_rcu() primitive may be used to
+ replace an old structure with a new one in an
+ RCU-protected list.
+
+ d. Updates must ensure that initialization of a given
+ structure happens before pointers to that structure are
+ publicized. Use the rcu_assign_pointer() primitive
+ when publicizing a pointer to a structure that can
+ be traversed by an RCU read-side critical section.
+
+5. If call_rcu(), or a related primitive such as call_rcu_bh() or
+ call_rcu_sched(), is used, the callback function must be
+ written to be called from softirq context. In particular,
+ it cannot block.
+
+6. Since synchronize_rcu() can block, it cannot be called from
+ any sort of irq context. Ditto for synchronize_sched() and
+ synchronize_srcu().
+
+7. If the updater uses call_rcu(), then the corresponding readers
+ must use rcu_read_lock() and rcu_read_unlock(). If the updater
+ uses call_rcu_bh(), then the corresponding readers must use
+ rcu_read_lock_bh() and rcu_read_unlock_bh(). If the updater
+ uses call_rcu_sched(), then the corresponding readers must
+ disable preemption. Mixing things up will result in confusion
+ and broken kernels.
+
+ One exception to this rule: rcu_read_lock() and rcu_read_unlock()
+ may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
+ in cases where local bottom halves are already known to be
+ disabled, for example, in irq or softirq context. Commenting
+ such cases is a must, of course! And the jury is still out on
+ whether the increased speed is worth it.
+
+8. Although synchronize_rcu() is slower than is call_rcu(), it
+ usually results in simpler code. So, unless update performance
+ is critically important or the updaters cannot block,
+ synchronize_rcu() should be used in preference to call_rcu().
+
+ An especially important property of the synchronize_rcu()
+ primitive is that it automatically self-limits: if grace periods
+ are delayed for whatever reason, then the synchronize_rcu()
+ primitive will correspondingly delay updates. In contrast,
+ code using call_rcu() should explicitly limit update rate in
+ cases where grace periods are delayed, as failing to do so can
+ result in excessive realtime latencies or even OOM conditions.
+
+ Ways of gaining this self-limiting property when using call_rcu()
+ include:
+
+ a. Keeping a count of the number of data-structure elements
+ used by the RCU-protected data structure, including those
+ waiting for a grace period to elapse. Enforce a limit
+ on this number, stalling updates as needed to allow
+ previously deferred frees to complete.
+
+ Alternatively, limit only the number awaiting deferred
+ free rather than the total number of elements.
+
+ b. Limiting update rate. For example, if updates occur only
+ once per hour, then no explicit rate limiting is required,
+ unless your system is already badly broken. The dcache
+ subsystem takes this approach -- updates are guarded
+ by a global lock, limiting their rate.
+
+ c. Trusted update -- if updates can only be done manually by
+ superuser or some other trusted user, then it might not
+ be necessary to automatically limit them. The theory
+ here is that superuser already has lots of ways to crash
+ the machine.
+
+ d. Use call_rcu_bh() rather than call_rcu(), in order to take
+ advantage of call_rcu_bh()'s faster grace periods.
+
+ e. Periodically invoke synchronize_rcu(), permitting a limited
+ number of updates per grace period.
+
+9. All RCU list-traversal primitives, which include
+ rcu_dereference(), list_for_each_entry_rcu(),
+ list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
+ must be either within an RCU read-side critical section or
+ must be protected by appropriate update-side locks. RCU
+ read-side critical sections are delimited by rcu_read_lock()
+ and rcu_read_unlock(), or by similar primitives such as
+ rcu_read_lock_bh() and rcu_read_unlock_bh().
+
+ The reason that it is permissible to use RCU list-traversal
+ primitives when the update-side lock is held is that doing so
+ can be quite helpful in reducing code bloat when common code is
+ shared between readers and updaters.
+
+10. Conversely, if you are in an RCU read-side critical section,
+ and you don't hold the appropriate update-side lock, you -must-
+ use the "_rcu()" variants of the list macros. Failing to do so
+ will break Alpha and confuse people reading your code.
+
+11. Note that synchronize_rcu() -only- guarantees to wait until
+ all currently executing rcu_read_lock()-protected RCU read-side
+ critical sections complete. It does -not- necessarily guarantee
+ that all currently running interrupts, NMIs, preempt_disable()
+ code, or idle loops will complete. Therefore, if you do not have
+ rcu_read_lock()-protected read-side critical sections, do -not-
+ use synchronize_rcu().
+
+ If you want to wait for some of these other things, you might
+ instead need to use synchronize_irq() or synchronize_sched().
+
+12. Any lock acquired by an RCU callback must be acquired elsewhere
+ with irq disabled, e.g., via spin_lock_irqsave(). Failing to
+ disable irq on a given acquisition of that lock will result in
+ deadlock as soon as the RCU callback happens to interrupt that
+ acquisition's critical section.
+
+13. RCU callbacks can be and are executed in parallel. In many cases,
+ the callback code simply wrappers around kfree(), so that this
+ is not an issue (or, more accurately, to the extent that it is
+ an issue, the memory-allocator locking handles it). However,
+ if the callbacks do manipulate a shared data structure, they
+ must use whatever locking or other synchronization is required
+ to safely access and/or modify that data structure.
+
+ RCU callbacks are -usually- executed on the same CPU that executed
+ the corresponding call_rcu(), call_rcu_bh(), or call_rcu_sched(),
+ but are by -no- means guaranteed to be. For example, if a given
+ CPU goes offline while having an RCU callback pending, then that
+ RCU callback will execute on some surviving CPU. (If this was
+ not the case, a self-spawning RCU callback would prevent the
+ victim CPU from ever going offline.)
+
+14. SRCU (srcu_read_lock(), srcu_read_unlock(), and synchronize_srcu())
+ may only be invoked from process context. Unlike other forms of
+ RCU, it -is- permissible to block in an SRCU read-side critical
+ section (demarked by srcu_read_lock() and srcu_read_unlock()),
+ hence the "SRCU": "sleepable RCU". Please note that if you
+ don't need to sleep in read-side critical sections, you should
+ be using RCU rather than SRCU, because RCU is almost always
+ faster and easier to use than is SRCU.
+
+ Also unlike other forms of RCU, explicit initialization
+ and cleanup is required via init_srcu_struct() and
+ cleanup_srcu_struct(). These are passed a "struct srcu_struct"
+ that defines the scope of a given SRCU domain. Once initialized,
+ the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock()
+ and synchronize_srcu(). A given synchronize_srcu() waits only
+ for SRCU read-side critical sections governed by srcu_read_lock()
+ and srcu_read_unlock() calls that have been passd the same
+ srcu_struct. This property is what makes sleeping read-side
+ critical sections tolerable -- a given subsystem delays only
+ its own updates, not those of other subsystems using SRCU.
+ Therefore, SRCU is less prone to OOM the system than RCU would
+ be if RCU's read-side critical sections were permitted to
+ sleep.
+
+ The ability to sleep in read-side critical sections does not
+ come for free. First, corresponding srcu_read_lock() and
+ srcu_read_unlock() calls must be passed the same srcu_struct.
+ Second, grace-period-detection overhead is amortized only
+ over those updates sharing a given srcu_struct, rather than
+ being globally amortized as they are for other forms of RCU.
+ Therefore, SRCU should be used in preference to rw_semaphore
+ only in extremely read-intensive situations, or in situations
+ requiring SRCU's read-side deadlock immunity or low read-side
+ realtime latency.
+
+ Note that, rcu_assign_pointer() and rcu_dereference() relate to
+ SRCU just as they do to other forms of RCU.
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
new file mode 100644
index 0000000..1fd1753
--- /dev/null
+++ b/Documentation/RCU/listRCU.txt
@@ -0,0 +1,315 @@
+Using RCU to Protect Read-Mostly Linked Lists
+
+
+One of the best applications of RCU is to protect read-mostly linked lists
+("struct list_head" in list.h). One big advantage of this approach
+is that all of the required memory barriers are included for you in
+the list macros. This document describes several applications of RCU,
+with the best fits first.
+
+
+Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
+
+The best applications are cases where, if reader-writer locking were
+used, the read-side lock would be dropped before taking any action
+based on the results of the search. The most celebrated example is
+the routing table. Because the routing table is tracking the state of
+equipment outside of the computer, it will at times contain stale data.
+Therefore, once the route has been computed, there is no need to hold
+the routing table static during transmission of the packet. After all,
+you can hold the routing table static all you want, but that won't keep
+the external Internet from changing, and it is the state of the external
+Internet that really matters. In addition, routing entries are typically
+added or deleted, rather than being modified in place.
+
+A straightforward example of this use of RCU may be found in the
+system-call auditing support. For example, a reader-writer locked
+implementation of audit_filter_task() might be as follows:
+
+ static enum audit_state audit_filter_task(struct task_struct *tsk)
+ {
+ struct audit_entry *e;
+ enum audit_state state;
+
+ read_lock(&auditsc_lock);
+ /* Note: audit_netlink_sem held by caller. */
+ list_for_each_entry(e, &audit_tsklist, list) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+ read_unlock(&auditsc_lock);
+ return state;
+ }
+ }
+ read_unlock(&auditsc_lock);
+ return AUDIT_BUILD_CONTEXT;
+ }
+
+Here the list is searched under the lock, but the lock is dropped before
+the corresponding value is returned. By the time that this value is acted
+on, the list may well have been modified. This makes sense, since if
+you are turning auditing off, it is OK to audit a few extra system calls.
+
+This means that RCU can be easily applied to the read side, as follows:
+
+ static enum audit_state audit_filter_task(struct task_struct *tsk)
+ {
+ struct audit_entry *e;
+ enum audit_state state;
+
+ rcu_read_lock();
+ /* Note: audit_netlink_sem held by caller. */
+ list_for_each_entry_rcu(e, &audit_tsklist, list) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+ }
+
+The read_lock() and read_unlock() calls have become rcu_read_lock()
+and rcu_read_unlock(), respectively, and the list_for_each_entry() has
+become list_for_each_entry_rcu(). The _rcu() list-traversal primitives
+insert the read-side memory barriers that are required on DEC Alpha CPUs.
+
+The changes to the update side are also straightforward. A reader-writer
+lock might be used as follows for deletion and insertion:
+
+ static inline int audit_del_rule(struct audit_rule *rule,
+ struct list_head *list)
+ {
+ struct audit_entry *e;
+
+ write_lock(&auditsc_lock);
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(rule, &e->rule)) {
+ list_del(&e->list);
+ write_unlock(&auditsc_lock);
+ return 0;
+ }
+ }
+ write_unlock(&auditsc_lock);
+ return -EFAULT; /* No matching rule */
+ }
+
+ static inline int audit_add_rule(struct audit_entry *entry,
+ struct list_head *list)
+ {
+ write_lock(&auditsc_lock);
+ if (entry->rule.flags & AUDIT_PREPEND) {
+ entry->rule.flags &= ~AUDIT_PREPEND;
+ list_add(&entry->list, list);
+ } else {
+ list_add_tail(&entry->list, list);
+ }
+ write_unlock(&auditsc_lock);
+ return 0;
+ }
+
+Following are the RCU equivalents for these two functions:
+
+ static inline int audit_del_rule(struct audit_rule *rule,
+ struct list_head *list)
+ {
+ struct audit_entry *e;
+
+ /* Do not use the _rcu iterator here, since this is the only
+ * deletion routine. */
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(rule, &e->rule)) {
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule, e);
+ return 0;
+ }
+ }
+ return -EFAULT; /* No matching rule */
+ }
+
+ static inline int audit_add_rule(struct audit_entry *entry,
+ struct list_head *list)
+ {
+ if (entry->rule.flags & AUDIT_PREPEND) {
+ entry->rule.flags &= ~AUDIT_PREPEND;
+ list_add_rcu(&entry->list, list);
+ } else {
+ list_add_tail_rcu(&entry->list, list);
+ }
+ return 0;
+ }
+
+Normally, the write_lock() and write_unlock() would be replaced by
+a spin_lock() and a spin_unlock(), but in this case, all callers hold
+audit_netlink_sem, so no additional locking is required. The auditsc_lock
+can therefore be eliminated, since use of RCU eliminates the need for
+writers to exclude readers. Normally, the write_lock() calls would
+be converted into spin_lock() calls.
+
+The list_del(), list_add(), and list_add_tail() primitives have been
+replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
+The _rcu() list-manipulation primitives add memory barriers that are
+needed on weakly ordered CPUs (most of them!). The list_del_rcu()
+primitive omits the pointer poisoning debug-assist code that would
+otherwise cause concurrent readers to fail spectacularly.
+
+So, when readers can tolerate stale data and when entries are either added
+or deleted, without in-place modification, it is very easy to use RCU!
+
+
+Example 2: Handling In-Place Updates
+
+The system-call auditing code does not update auditing rules in place.
+However, if it did, reader-writer-locked code to do so might look as
+follows (presumably, the field_count is only permitted to decrease,
+otherwise, the added fields would need to be filled in):
+
+ static inline int audit_upd_rule(struct audit_rule *rule,
+ struct list_head *list,
+ __u32 newaction,
+ __u32 newfield_count)
+ {
+ struct audit_entry *e;
+ struct audit_newentry *ne;
+
+ write_lock(&auditsc_lock);
+ /* Note: audit_netlink_sem held by caller. */
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(rule, &e->rule)) {
+ e->rule.action = newaction;
+ e->rule.file_count = newfield_count;
+ write_unlock(&auditsc_lock);
+ return 0;
+ }
+ }
+ write_unlock(&auditsc_lock);
+ return -EFAULT; /* No matching rule */
+ }
+
+The RCU version creates a copy, updates the copy, then replaces the old
+entry with the newly updated entry. This sequence of actions, allowing
+concurrent reads while doing a copy to perform an update, is what gives
+RCU ("read-copy update") its name. The RCU code is as follows:
+
+ static inline int audit_upd_rule(struct audit_rule *rule,
+ struct list_head *list,
+ __u32 newaction,
+ __u32 newfield_count)
+ {
+ struct audit_entry *e;
+ struct audit_newentry *ne;
+
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(rule, &e->rule)) {
+ ne = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (ne == NULL)
+ return -ENOMEM;
+ audit_copy_rule(&ne->rule, &e->rule);
+ ne->rule.action = newaction;
+ ne->rule.file_count = newfield_count;
+ list_replace_rcu(e, ne);
+ call_rcu(&e->rcu, audit_free_rule, e);
+ return 0;
+ }
+ }
+ return -EFAULT; /* No matching rule */
+ }
+
+Again, this assumes that the caller holds audit_netlink_sem. Normally,
+the reader-writer lock would become a spinlock in this sort of code.
+
+
+Example 3: Eliminating Stale Data
+
+The auditing examples above tolerate stale data, as do most algorithms
+that are tracking external state. Because there is a delay from the
+time the external state changes before Linux becomes aware of the change,
+additional RCU-induced staleness is normally not a problem.
+
+However, there are many examples where stale data cannot be tolerated.
+One example in the Linux kernel is the System V IPC (see the ipc_lock()
+function in ipc/util.c). This code checks a "deleted" flag under a
+per-entry spinlock, and, if the "deleted" flag is set, pretends that the
+entry does not exist. For this to be helpful, the search function must
+return holding the per-entry spinlock, as ipc_lock() does in fact do.
+
+Quick Quiz: Why does the search function need to return holding the
+ per-entry lock for this deleted-flag technique to be helpful?
+
+If the system-call audit module were to ever need to reject stale data,
+one way to accomplish this would be to add a "deleted" flag and a "lock"
+spinlock to the audit_entry structure, and modify audit_filter_task()
+as follows:
+
+ static enum audit_state audit_filter_task(struct task_struct *tsk)
+ {
+ struct audit_entry *e;
+ enum audit_state state;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_tsklist, list) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+ spin_lock(&e->lock);
+ if (e->deleted) {
+ spin_unlock(&e->lock);
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+ }
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+ }
+
+Note that this example assumes that entries are only added and deleted.
+Additional mechanism is required to deal correctly with the
+update-in-place performed by audit_upd_rule(). For one thing,
+audit_upd_rule() would need additional memory barriers to ensure
+that the list_add_rcu() was really executed before the list_del_rcu().
+
+The audit_del_rule() function would need to set the "deleted"
+flag under the spinlock as follows:
+
+ static inline int audit_del_rule(struct audit_rule *rule,
+ struct list_head *list)
+ {
+ struct audit_entry *e;
+
+ /* Do not need to use the _rcu iterator here, since this
+ * is the only deletion routine. */
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(rule, &e->rule)) {
+ spin_lock(&e->lock);
+ list_del_rcu(&e->list);
+ e->deleted = 1;
+ spin_unlock(&e->lock);
+ call_rcu(&e->rcu, audit_free_rule, e);
+ return 0;
+ }
+ }
+ return -EFAULT; /* No matching rule */
+ }
+
+
+Summary
+
+Read-mostly list-based data structures that can tolerate stale data are
+the most amenable to use of RCU. The simplest case is where entries are
+either added or deleted from the data structure (or atomically modified
+in place), but non-atomic in-place modifications can be handled by making
+a copy, updating the copy, then replacing the original with the copy.
+If stale data cannot be tolerated, then a "deleted" flag may be used
+in conjunction with a per-entry spinlock in order to allow the search
+function to reject newly deleted data.
+
+
+Answer to Quick Quiz
+ Why does the search function need to return holding the per-entry
+ lock for this deleted-flag technique to be helpful?
+
+ If the search function drops the per-entry lock before returning,
+ then the caller will be processing stale data in any case. If it
+ is really OK to be processing stale data, then you don't need a
+ "deleted" flag. If processing stale data really is a problem,
+ then you need to hold the per-entry lock across all of the code
+ that uses the value that was returned.
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
new file mode 100644
index 0000000..95821a2
--- /dev/null
+++ b/Documentation/RCU/rcu.txt
@@ -0,0 +1,138 @@
+RCU Concepts
+
+
+The basic idea behind RCU (read-copy update) is to split destructive
+operations into two parts, one that prevents anyone from seeing the data
+item being destroyed, and one that actually carries out the destruction.
+A "grace period" must elapse between the two parts, and this grace period
+must be long enough that any readers accessing the item being deleted have
+since dropped their references. For example, an RCU-protected deletion
+from a linked list would first remove the item from the list, wait for
+a grace period to elapse, then free the element. See the listRCU.txt
+file for more information on using RCU with linked lists.
+
+
+Frequently Asked Questions
+
+o Why would anyone want to use RCU?
+
+ The advantage of RCU's two-part approach is that RCU readers need
+ not acquire any locks, perform any atomic instructions, write to
+ shared memory, or (on CPUs other than Alpha) execute any memory
+ barriers. The fact that these operations are quite expensive
+ on modern CPUs is what gives RCU its performance advantages
+ in read-mostly situations. The fact that RCU readers need not
+ acquire locks can also greatly simplify deadlock-avoidance code.
+
+o How can the updater tell when a grace period has completed
+ if the RCU readers give no indication when they are done?
+
+ Just as with spinlocks, RCU readers are not permitted to
+ block, switch to user-mode execution, or enter the idle loop.
+ Therefore, as soon as a CPU is seen passing through any of these
+ three states, we know that that CPU has exited any previous RCU
+ read-side critical sections. So, if we remove an item from a
+ linked list, and then wait until all CPUs have switched context,
+ executed in user mode, or executed in the idle loop, we can
+ safely free up that item.
+
+ Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
+ same effect, but require that the readers manipulate CPU-local
+ counters. These counters allow limited types of blocking
+ within RCU read-side critical sections. SRCU also uses
+ CPU-local counters, and permits general blocking within
+ RCU read-side critical sections. These two variants of
+ RCU detect grace periods by sampling these counters.
+
+o If I am running on a uniprocessor kernel, which can only do one
+ thing at a time, why should I wait for a grace period?
+
+ See the UP.txt file in this directory.
+
+o How can I see where RCU is currently used in the Linux kernel?
+
+ Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
+ "rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh",
+ "srcu_read_lock", "srcu_read_unlock", "synchronize_rcu",
+ "synchronize_net", "synchronize_srcu", and the other RCU
+ primitives. Or grab one of the cscope databases from:
+
+ http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
+
+o What guidelines should I follow when writing code that uses RCU?
+
+ See the checklist.txt file in this directory.
+
+o Why the name "RCU"?
+
+ "RCU" stands for "read-copy update". The file listRCU.txt has
+ more information on where this name came from, search for
+ "read-copy update" to find it.
+
+o I hear that RCU is patented? What is with that?
+
+ Yes, it is. There are several known patents related to RCU,
+ search for the string "Patent" in RTFP.txt to find them.
+ Of these, one was allowed to lapse by the assignee, and the
+ others have been contributed to the Linux kernel under GPL.
+
+o I hear that RCU needs work in order to support realtime kernels?
+
+ This work is largely completed. Realtime-friendly RCU can be
+ enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
+ However, work is in progress for enabling priority boosting of
+ preempted RCU read-side critical sections.This is needed if you
+ have CPU-bound realtime threads.
+
+o Where can I find more information on RCU?
+
+ See the RTFP.txt file in this directory.
+ Or point your browser at http://www.rdrop.com/users/paulmck/RCU/.
+
+o What are all these files in this directory?
+
+
+ NMI-RCU.txt
+
+ Describes how to use RCU to implement dynamic
+ NMI handlers, which can be revectored on the fly,
+ without rebooting.
+
+ RTFP.txt
+
+ List of RCU-related publications and web sites.
+
+ UP.txt
+
+ Discussion of RCU usage in UP kernels.
+
+ arrayRCU.txt
+
+ Describes how to use RCU to protect arrays, with
+ resizeable arrays whose elements reference other
+ data structures being of the most interest.
+
+ checklist.txt
+
+ Lists things to check for when inspecting code that
+ uses RCU.
+
+ listRCU.txt
+
+ Describes how to use RCU to protect linked lists.
+ This is the simplest and most common use of RCU
+ in the Linux kernel.
+
+ rcu.txt
+
+ You are reading it!
+
+ rcuref.txt
+
+ Describes how to combine use of reference counts
+ with RCU.
+
+ whatisRCU.txt
+
+ Overview of how the RCU implementation works. Along
+ the way, presents a conceptual view of RCU.
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
new file mode 100644
index 0000000..4202ad0
--- /dev/null
+++ b/Documentation/RCU/rcuref.txt
@@ -0,0 +1,66 @@
+Reference-count design for elements of lists/arrays protected by RCU.
+
+Reference counting on elements of lists which are protected by traditional
+reader/writer spinlocks or semaphores are straightforward:
+
+1. 2.
+add() search_and_reference()
+{ {
+ alloc_object read_lock(&list_lock);
+ ... search_for_element
+ atomic_set(&el->rc, 1); atomic_inc(&el->rc);
+ write_lock(&list_lock); ...
+ add_element read_unlock(&list_lock);
+ ... ...
+ write_unlock(&list_lock); }
+}
+
+3. 4.
+release_referenced() delete()
+{ {
+ ... write_lock(&list_lock);
+ atomic_dec(&el->rc, relfunc) ...
+ ... delete_element
+} write_unlock(&list_lock);
+ ...
+ if (atomic_dec_and_test(&el->rc))
+ kfree(el);
+ ...
+ }
+
+If this list/array is made lock free using RCU as in changing the
+write_lock() in add() and delete() to spin_lock() and changing read_lock()
+in search_and_reference() to rcu_read_lock(), the atomic_inc() in
+search_and_reference() could potentially hold reference to an element which
+has already been deleted from the list/array. Use atomic_inc_not_zero()
+in this scenario as follows:
+
+1. 2.
+add() search_and_reference()
+{ {
+ alloc_object rcu_read_lock();
+ ... search_for_element
+ atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) {
+ spin_lock(&list_lock); rcu_read_unlock();
+ return FAIL;
+ add_element }
+ ... ...
+ spin_unlock(&list_lock); rcu_read_unlock();
+} }
+3. 4.
+release_referenced() delete()
+{ {
+ ... spin_lock(&list_lock);
+ if (atomic_dec_and_test(&el->rc)) ...
+ call_rcu(&el->head, el_free); delete_element
+ ... spin_unlock(&list_lock);
+} ...
+ if (atomic_dec_and_test(&el->rc))
+ call_rcu(&el->head, el_free);
+ ...
+ }
+
+Sometimes, a reference to the element needs to be obtained in the
+update (write) stream. In such cases, atomic_inc_not_zero() might be
+overkill, since we hold the update-side spinlock. One might instead
+use atomic_inc() in such cases.
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
new file mode 100644
index 0000000..a342b6e
--- /dev/null
+++ b/Documentation/RCU/torture.txt
@@ -0,0 +1,180 @@
+RCU Torture Test Operation
+
+
+CONFIG_RCU_TORTURE_TEST
+
+The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
+implementations. It creates an rcutorture kernel module that can
+be loaded to run a torture test. The test periodically outputs
+status messages via printk(), which can be examined via the dmesg
+command (perhaps grepping for "torture"). The test is started
+when the module is loaded, and stops when the module is unloaded.
+
+CONFIG_RCU_TORTURE_TEST_RUNNABLE
+
+It is also possible to specify CONFIG_RCU_TORTURE_TEST=y, which will
+result in the tests being loaded into the base kernel. In this case,
+the CONFIG_RCU_TORTURE_TEST_RUNNABLE config option is used to specify
+whether the RCU torture tests are to be started immediately during
+boot or whether the /proc/sys/kernel/rcutorture_runnable file is used
+to enable them. This /proc file can be used to repeatedly pause and
+restart the tests, regardless of the initial state specified by the
+CONFIG_RCU_TORTURE_TEST_RUNNABLE config option.
+
+You will normally -not- want to start the RCU torture tests during boot
+(and thus the default is CONFIG_RCU_TORTURE_TEST_RUNNABLE=n), but doing
+this can sometimes be useful in finding boot-time bugs.
+
+
+MODULE PARAMETERS
+
+This module has the following parameters:
+
+irqreaders Says to invoke RCU readers from irq level. This is currently
+ done via timers. Defaults to "1" for variants of RCU that
+ permit this. (Or, more accurately, variants of RCU that do
+ -not- permit this know to ignore this variable.)
+
+nfakewriters This is the number of RCU fake writer threads to run. Fake
+ writer threads repeatedly use the synchronous "wait for
+ current readers" function of the interface selected by
+ torture_type, with a delay between calls to allow for various
+ different numbers of writers running in parallel.
+ nfakewriters defaults to 4, which provides enough parallelism
+ to trigger special cases caused by multiple writers, such as
+ the synchronize_srcu() early return optimization.
+
+nreaders This is the number of RCU reading threads supported.
+ The default is twice the number of CPUs. Why twice?
+ To properly exercise RCU implementations with preemptible
+ read-side critical sections.
+
+shuffle_interval
+ The number of seconds to keep the test threads affinitied
+ to a particular subset of the CPUs, defaults to 3 seconds.
+ Used in conjunction with test_no_idle_hz.
+
+stat_interval The number of seconds between output of torture
+ statistics (via printk()). Regardless of the interval,
+ statistics are printed when the module is unloaded.
+ Setting the interval to zero causes the statistics to
+ be printed -only- when the module is unloaded, and this
+ is the default.
+
+stutter The length of time to run the test before pausing for this
+ same period of time. Defaults to "stutter=5", so as
+ to run and pause for (roughly) five-second intervals.
+ Specifying "stutter=0" causes the test to run continuously
+ without pausing, which is the old default behavior.
+
+test_no_idle_hz Whether or not to test the ability of RCU to operate in
+ a kernel that disables the scheduling-clock interrupt to
+ idle CPUs. Boolean parameter, "1" to test, "0" otherwise.
+ Defaults to omitting this test.
+
+torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API,
+ "rcu_sync" for rcu_read_lock() with synchronous reclamation,
+ "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for
+ rcu_read_lock_bh() with synchronous reclamation, "srcu" for
+ the "srcu_read_lock()" API, and "sched" for the use of
+ preempt_disable() together with synchronize_sched().
+
+verbose Enable debug printk()s. Default is disabled.
+
+
+OUTPUT
+
+The statistics output is as follows:
+
+ rcu-torture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
+ rcu-torture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915
+ rcu-torture: Reader Pipe: 1466408 9747 0 0 0 0 0 0 0 0 0
+ rcu-torture: Reader Batch: 1464477 11678 0 0 0 0 0 0 0 0
+ rcu-torture: Free-Block Circulation: 1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
+ rcu-torture: --- End of test
+
+The command "dmesg | grep torture:" will extract this information on
+most systems. On more esoteric configurations, it may be necessary to
+use other commands to access the output of the printk()s used by
+the RCU torture test. The printk()s use KERN_ALERT, so they should
+be evident. ;-)
+
+The entries are as follows:
+
+o "rtc": The hexadecimal address of the structure currently visible
+ to readers.
+
+o "ver": The number of times since boot that the rcutw writer task
+ has changed the structure visible to readers.
+
+o "tfle": If non-zero, indicates that the "torture freelist"
+ containing structure to be placed into the "rtc" area is empty.
+ This condition is important, since it can fool you into thinking
+ that RCU is working when it is not. :-/
+
+o "rta": Number of structures allocated from the torture freelist.
+
+o "rtaf": Number of allocations from the torture freelist that have
+ failed due to the list being empty.
+
+o "rtf": Number of frees into the torture freelist.
+
+o "Reader Pipe": Histogram of "ages" of structures seen by readers.
+ If any entries past the first two are non-zero, RCU is broken.
+ And rcutorture prints the error flag string "!!!" to make sure
+ you notice. The age of a newly allocated structure is zero,
+ it becomes one when removed from reader visibility, and is
+ incremented once per grace period subsequently -- and is freed
+ after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods.
+
+ The output displayed above was taken from a correctly working
+ RCU. If you want to see what it looks like when broken, break
+ it yourself. ;-)
+
+o "Reader Batch": Another histogram of "ages" of structures seen
+ by readers, but in terms of counter flips (or batches) rather
+ than in terms of grace periods. The legal number of non-zero
+ entries is again two. The reason for this separate view is that
+ it is sometimes easier to get the third entry to show up in the
+ "Reader Batch" list than in the "Reader Pipe" list.
+
+o "Free-Block Circulation": Shows the number of torture structures
+ that have reached a given point in the pipeline. The first element
+ should closely correspond to the number of structures allocated,
+ the second to the number that have been removed from reader view,
+ and all but the last remaining to the corresponding number of
+ passes through a grace period. The last entry should be zero,
+ as it is only incremented if a torture structure's counter
+ somehow gets incremented farther than it should.
+
+Different implementations of RCU can provide implementation-specific
+additional information. For example, SRCU provides the following:
+
+ srcu-torture: rtc: f8cf46a8 ver: 355 tfle: 0 rta: 356 rtaf: 0 rtf: 346 rtmbe: 0
+ srcu-torture: Reader Pipe: 559738 939 0 0 0 0 0 0 0 0 0
+ srcu-torture: Reader Batch: 560434 243 0 0 0 0 0 0 0 0
+ srcu-torture: Free-Block Circulation: 355 354 353 352 351 350 349 348 347 346 0
+ srcu-torture: per-CPU(idx=1): 0(0,1) 1(0,1) 2(0,0) 3(0,1)
+
+The first four lines are similar to those for RCU. The last line shows
+the per-CPU counter state. The numbers in parentheses are the values
+of the "old" and "current" counters for the corresponding CPU. The
+"idx" value maps the "old" and "current" values to the underlying array,
+and is useful for debugging.
+
+
+USAGE
+
+The following script may be used to torture RCU:
+
+ #!/bin/sh
+
+ modprobe rcutorture
+ sleep 100
+ rmmod rcutorture
+ dmesg | grep torture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors. The "rmmod" command forces a "SUCCESS" or
+"FAILURE" indication to be printk()ed.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
new file mode 100644
index 0000000..9617082
--- /dev/null
+++ b/Documentation/RCU/whatisRCU.txt
@@ -0,0 +1,936 @@
+Please note that the "What is RCU?" LWN series is an excellent place
+to start learning about RCU:
+
+1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
+2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
+3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
+
+
+What is RCU?
+
+RCU is a synchronization mechanism that was added to the Linux kernel
+during the 2.5 development effort that is optimized for read-mostly
+situations. Although RCU is actually quite simple once you understand it,
+getting there can sometimes be a challenge. Part of the problem is that
+most of the past descriptions of RCU have been written with the mistaken
+assumption that there is "one true way" to describe RCU. Instead,
+the experience has been that different people must take different paths
+to arrive at an understanding of RCU. This document provides several
+different paths, as follows:
+
+1. RCU OVERVIEW
+2. WHAT IS RCU'S CORE API?
+3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
+4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
+5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
+6. ANALOGY WITH READER-WRITER LOCKING
+7. FULL LIST OF RCU APIs
+8. ANSWERS TO QUICK QUIZZES
+
+People who prefer starting with a conceptual overview should focus on
+Section 1, though most readers will profit by reading this section at
+some point. People who prefer to start with an API that they can then
+experiment with should focus on Section 2. People who prefer to start
+with example uses should focus on Sections 3 and 4. People who need to
+understand the RCU implementation should focus on Section 5, then dive
+into the kernel source code. People who reason best by analogy should
+focus on Section 6. Section 7 serves as an index to the docbook API
+documentation, and Section 8 is the traditional answer key.
+
+So, start with the section that makes the most sense to you and your
+preferred method of learning. If you need to know everything about
+everything, feel free to read the whole thing -- but if you are really
+that type of person, you have perused the source code and will therefore
+never need this document anyway. ;-)
+
+
+1. RCU OVERVIEW
+
+The basic idea behind RCU is to split updates into "removal" and
+"reclamation" phases. The removal phase removes references to data items
+within a data structure (possibly by replacing them with references to
+new versions of these data items), and can run concurrently with readers.
+The reason that it is safe to run the removal phase concurrently with
+readers is the semantics of modern CPUs guarantee that readers will see
+either the old or the new version of the data structure rather than a
+partially updated reference. The reclamation phase does the work of reclaiming
+(e.g., freeing) the data items removed from the data structure during the
+removal phase. Because reclaiming data items can disrupt any readers
+concurrently referencing those data items, the reclamation phase must
+not start until readers no longer hold references to those data items.
+
+Splitting the update into removal and reclamation phases permits the
+updater to perform the removal phase immediately, and to defer the
+reclamation phase until all readers active during the removal phase have
+completed, either by blocking until they finish or by registering a
+callback that is invoked after they finish. Only readers that are active
+during the removal phase need be considered, because any reader starting
+after the removal phase will be unable to gain a reference to the removed
+data items, and therefore cannot be disrupted by the reclamation phase.
+
+So the typical RCU update sequence goes something like the following:
+
+a. Remove pointers to a data structure, so that subsequent
+ readers cannot gain a reference to it.
+
+b. Wait for all previous readers to complete their RCU read-side
+ critical sections.
+
+c. At this point, there cannot be any readers who hold references
+ to the data structure, so it now may safely be reclaimed
+ (e.g., kfree()d).
+
+Step (b) above is the key idea underlying RCU's deferred destruction.
+The ability to wait until all readers are done allows RCU readers to
+use much lighter-weight synchronization, in some cases, absolutely no
+synchronization at all. In contrast, in more conventional lock-based
+schemes, readers must use heavy-weight synchronization in order to
+prevent an updater from deleting the data structure out from under them.
+This is because lock-based updaters typically update data items in place,
+and must therefore exclude readers. In contrast, RCU-based updaters
+typically take advantage of the fact that writes to single aligned
+pointers are atomic on modern CPUs, allowing atomic insertion, removal,
+and replacement of data items in a linked structure without disrupting
+readers. Concurrent RCU readers can then continue accessing the old
+versions, and can dispense with the atomic operations, memory barriers,
+and communications cache misses that are so expensive on present-day
+SMP computer systems, even in absence of lock contention.
+
+In the three-step procedure shown above, the updater is performing both
+the removal and the reclamation step, but it is often helpful for an
+entirely different thread to do the reclamation, as is in fact the case
+in the Linux kernel's directory-entry cache (dcache). Even if the same
+thread performs both the update step (step (a) above) and the reclamation
+step (step (c) above), it is often helpful to think of them separately.
+For example, RCU readers and updaters need not communicate at all,
+but RCU provides implicit low-overhead communication between readers
+and reclaimers, namely, in step (b) above.
+
+So how the heck can a reclaimer tell when a reader is done, given
+that readers are not doing any sort of synchronization operations???
+Read on to learn about how RCU's API makes this easy.
+
+
+2. WHAT IS RCU'S CORE API?
+
+The core RCU API is quite small:
+
+a. rcu_read_lock()
+b. rcu_read_unlock()
+c. synchronize_rcu() / call_rcu()
+d. rcu_assign_pointer()
+e. rcu_dereference()
+
+There are many other members of the RCU API, but the rest can be
+expressed in terms of these five, though most implementations instead
+express synchronize_rcu() in terms of the call_rcu() callback API.
+
+The five core RCU APIs are described below, the other 18 will be enumerated
+later. See the kernel docbook documentation for more info, or look directly
+at the function header comments.
+
+rcu_read_lock()
+
+ void rcu_read_lock(void);
+
+ Used by a reader to inform the reclaimer that the reader is
+ entering an RCU read-side critical section. It is illegal
+ to block while in an RCU read-side critical section, though
+ kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side
+ critical sections. Any RCU-protected data structure accessed
+ during an RCU read-side critical section is guaranteed to remain
+ unreclaimed for the full duration of that critical section.
+ Reference counts may be used in conjunction with RCU to maintain
+ longer-term references to data structures.
+
+rcu_read_unlock()
+
+ void rcu_read_unlock(void);
+
+ Used by a reader to inform the reclaimer that the reader is
+ exiting an RCU read-side critical section. Note that RCU
+ read-side critical sections may be nested and/or overlapping.
+
+synchronize_rcu()
+
+ void synchronize_rcu(void);
+
+ Marks the end of updater code and the beginning of reclaimer
+ code. It does this by blocking until all pre-existing RCU
+ read-side critical sections on all CPUs have completed.
+ Note that synchronize_rcu() will -not- necessarily wait for
+ any subsequent RCU read-side critical sections to complete.
+ For example, consider the following sequence of events:
+
+ CPU 0 CPU 1 CPU 2
+ ----------------- ------------------------- ---------------
+ 1. rcu_read_lock()
+ 2. enters synchronize_rcu()
+ 3. rcu_read_lock()
+ 4. rcu_read_unlock()
+ 5. exits synchronize_rcu()
+ 6. rcu_read_unlock()
+
+ To reiterate, synchronize_rcu() waits only for ongoing RCU
+ read-side critical sections to complete, not necessarily for
+ any that begin after synchronize_rcu() is invoked.
+
+ Of course, synchronize_rcu() does not necessarily return
+ -immediately- after the last pre-existing RCU read-side critical
+ section completes. For one thing, there might well be scheduling
+ delays. For another thing, many RCU implementations process
+ requests in batches in order to improve efficiencies, which can
+ further delay synchronize_rcu().
+
+ Since synchronize_rcu() is the API that must figure out when
+ readers are done, its implementation is key to RCU. For RCU
+ to be useful in all but the most read-intensive situations,
+ synchronize_rcu()'s overhead must also be quite small.
+
+ The call_rcu() API is a callback form of synchronize_rcu(),
+ and is described in more detail in a later section. Instead of
+ blocking, it registers a function and argument which are invoked
+ after all ongoing RCU read-side critical sections have completed.
+ This callback variant is particularly useful in situations where
+ it is illegal to block or where update-side performance is
+ critically important.
+
+ However, the call_rcu() API should not be used lightly, as use
+ of the synchronize_rcu() API generally results in simpler code.
+ In addition, the synchronize_rcu() API has the nice property
+ of automatically limiting update rate should grace periods
+ be delayed. This property results in system resilience in face
+ of denial-of-service attacks. Code using call_rcu() should limit
+ update rate in order to gain this same sort of resilience. See
+ checklist.txt for some approaches to limiting the update rate.
+
+rcu_assign_pointer()
+
+ typeof(p) rcu_assign_pointer(p, typeof(p) v);
+
+ Yes, rcu_assign_pointer() -is- implemented as a macro, though it
+ would be cool to be able to declare a function in this manner.
+ (Compiler experts will no doubt disagree.)
+
+ The updater uses this function to assign a new value to an
+ RCU-protected pointer, in order to safely communicate the change
+ in value from the updater to the reader. This function returns
+ the new value, and also executes any memory-barrier instructions
+ required for a given CPU architecture.
+
+ Perhaps just as important, it serves to document (1) which
+ pointers are protected by RCU and (2) the point at which a
+ given structure becomes accessible to other CPUs. That said,
+ rcu_assign_pointer() is most frequently used indirectly, via
+ the _rcu list-manipulation primitives such as list_add_rcu().
+
+rcu_dereference()
+
+ typeof(p) rcu_dereference(p);
+
+ Like rcu_assign_pointer(), rcu_dereference() must be implemented
+ as a macro.
+
+ The reader uses rcu_dereference() to fetch an RCU-protected
+ pointer, which returns a value that may then be safely
+ dereferenced. Note that rcu_deference() does not actually
+ dereference the pointer, instead, it protects the pointer for
+ later dereferencing. It also executes any needed memory-barrier
+ instructions for a given CPU architecture. Currently, only Alpha
+ needs memory barriers within rcu_dereference() -- on other CPUs,
+ it compiles to nothing, not even a compiler directive.
+
+ Common coding practice uses rcu_dereference() to copy an
+ RCU-protected pointer to a local variable, then dereferences
+ this local variable, for example as follows:
+
+ p = rcu_dereference(head.next);
+ return p->data;
+
+ However, in this case, one could just as easily combine these
+ into one statement:
+
+ return rcu_dereference(head.next)->data;
+
+ If you are going to be fetching multiple fields from the
+ RCU-protected structure, using the local variable is of
+ course preferred. Repeated rcu_dereference() calls look
+ ugly and incur unnecessary overhead on Alpha CPUs.
+
+ Note that the value returned by rcu_dereference() is valid
+ only within the enclosing RCU read-side critical section.
+ For example, the following is -not- legal:
+
+ rcu_read_lock();
+ p = rcu_dereference(head.next);
+ rcu_read_unlock();
+ x = p->address;
+ rcu_read_lock();
+ y = p->data;
+ rcu_read_unlock();
+
+ Holding a reference from one RCU read-side critical section
+ to another is just as illegal as holding a reference from
+ one lock-based critical section to another! Similarly,
+ using a reference outside of the critical section in which
+ it was acquired is just as illegal as doing so with normal
+ locking.
+
+ As with rcu_assign_pointer(), an important function of
+ rcu_dereference() is to document which pointers are protected by
+ RCU, in particular, flagging a pointer that is subject to changing
+ at any time, including immediately after the rcu_dereference().
+ And, again like rcu_assign_pointer(), rcu_dereference() is
+ typically used indirectly, via the _rcu list-manipulation
+ primitives, such as list_for_each_entry_rcu().
+
+The following diagram shows how each API communicates among the
+reader, updater, and reclaimer.
+
+
+ rcu_assign_pointer()
+ +--------+
+ +---------------------->| reader |---------+
+ | +--------+ |
+ | | |
+ | | | Protect:
+ | | | rcu_read_lock()
+ | | | rcu_read_unlock()
+ | rcu_dereference() | |
+ +---------+ | |
+ | updater |<---------------------+ |
+ +---------+ V
+ | +-----------+
+ +----------------------------------->| reclaimer |
+ +-----------+
+ Defer:
+ synchronize_rcu() & call_rcu()
+
+
+The RCU infrastructure observes the time sequence of rcu_read_lock(),
+rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in
+order to determine when (1) synchronize_rcu() invocations may return
+to their callers and (2) call_rcu() callbacks may be invoked. Efficient
+implementations of the RCU infrastructure make heavy use of batching in
+order to amortize their overhead over many uses of the corresponding APIs.
+
+There are no fewer than three RCU mechanisms in the Linux kernel; the
+diagram above shows the first one, which is by far the most commonly used.
+The rcu_dereference() and rcu_assign_pointer() primitives are used for
+all three mechanisms, but different defer and protect primitives are
+used as follows:
+
+ Defer Protect
+
+a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock()
+ call_rcu()
+
+b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh()
+
+c. synchronize_sched() preempt_disable() / preempt_enable()
+ local_irq_save() / local_irq_restore()
+ hardirq enter / hardirq exit
+ NMI enter / NMI exit
+
+These three mechanisms are used as follows:
+
+a. RCU applied to normal data structures.
+
+b. RCU applied to networking data structures that may be subjected
+ to remote denial-of-service attacks.
+
+c. RCU applied to scheduler and interrupt/NMI-handler tasks.
+
+Again, most uses will be of (a). The (b) and (c) cases are important
+for specialized uses, but are relatively uncommon.
+
+
+3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
+
+This section shows a simple use of the core RCU API to protect a
+global pointer to a dynamically allocated structure. More-typical
+uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
+
+ struct foo {
+ int a;
+ char b;
+ long c;
+ };
+ DEFINE_SPINLOCK(foo_mutex);
+
+ struct foo *gbl_foo;
+
+ /*
+ * Create a new struct foo that is the same as the one currently
+ * pointed to by gbl_foo, except that field "a" is replaced
+ * with "new_a". Points gbl_foo to the new structure, and
+ * frees up the old structure after a grace period.
+ *
+ * Uses rcu_assign_pointer() to ensure that concurrent readers
+ * see the initialized version of the new structure.
+ *
+ * Uses synchronize_rcu() to ensure that any readers that might
+ * have references to the old structure complete before freeing
+ * the old structure.
+ */
+ void foo_update_a(int new_a)
+ {
+ struct foo *new_fp;
+ struct foo *old_fp;
+
+ new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
+ spin_lock(&foo_mutex);
+ old_fp = gbl_foo;
+ *new_fp = *old_fp;
+ new_fp->a = new_a;
+ rcu_assign_pointer(gbl_foo, new_fp);
+ spin_unlock(&foo_mutex);
+ synchronize_rcu();
+ kfree(old_fp);
+ }
+
+ /*
+ * Return the value of field "a" of the current gbl_foo
+ * structure. Use rcu_read_lock() and rcu_read_unlock()
+ * to ensure that the structure does not get deleted out
+ * from under us, and use rcu_dereference() to ensure that
+ * we see the initialized version of the structure (important
+ * for DEC Alpha and for people reading the code).
+ */
+ int foo_get_a(void)
+ {
+ int retval;
+
+ rcu_read_lock();
+ retval = rcu_dereference(gbl_foo)->a;
+ rcu_read_unlock();
+ return retval;
+ }
+
+So, to sum up:
+
+o Use rcu_read_lock() and rcu_read_unlock() to guard RCU
+ read-side critical sections.
+
+o Within an RCU read-side critical section, use rcu_dereference()
+ to dereference RCU-protected pointers.
+
+o Use some solid scheme (such as locks or semaphores) to
+ keep concurrent updates from interfering with each other.
+
+o Use rcu_assign_pointer() to update an RCU-protected pointer.
+ This primitive protects concurrent readers from the updater,
+ -not- concurrent updates from each other! You therefore still
+ need to use locking (or something similar) to keep concurrent
+ rcu_assign_pointer() primitives from interfering with each other.
+
+o Use synchronize_rcu() -after- removing a data element from an
+ RCU-protected data structure, but -before- reclaiming/freeing
+ the data element, in order to wait for the completion of all
+ RCU read-side critical sections that might be referencing that
+ data item.
+
+See checklist.txt for additional rules to follow when using RCU.
+And again, more-typical uses of RCU may be found in listRCU.txt,
+arrayRCU.txt, and NMI-RCU.txt.
+
+
+4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
+
+In the example above, foo_update_a() blocks until a grace period elapses.
+This is quite simple, but in some cases one cannot afford to wait so
+long -- there might be other high-priority work to be done.
+
+In such cases, one uses call_rcu() rather than synchronize_rcu().
+The call_rcu() API is as follows:
+
+ void call_rcu(struct rcu_head * head,
+ void (*func)(struct rcu_head *head));
+
+This function invokes func(head) after a grace period has elapsed.
+This invocation might happen from either softirq or process context,
+so the function is not permitted to block. The foo struct needs to
+have an rcu_head structure added, perhaps as follows:
+
+ struct foo {
+ int a;
+ char b;
+ long c;
+ struct rcu_head rcu;
+ };
+
+The foo_update_a() function might then be written as follows:
+
+ /*
+ * Create a new struct foo that is the same as the one currently
+ * pointed to by gbl_foo, except that field "a" is replaced
+ * with "new_a". Points gbl_foo to the new structure, and
+ * frees up the old structure after a grace period.
+ *
+ * Uses rcu_assign_pointer() to ensure that concurrent readers
+ * see the initialized version of the new structure.
+ *
+ * Uses call_rcu() to ensure that any readers that might have
+ * references to the old structure complete before freeing the
+ * old structure.
+ */
+ void foo_update_a(int new_a)
+ {
+ struct foo *new_fp;
+ struct foo *old_fp;
+
+ new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
+ spin_lock(&foo_mutex);
+ old_fp = gbl_foo;
+ *new_fp = *old_fp;
+ new_fp->a = new_a;
+ rcu_assign_pointer(gbl_foo, new_fp);
+ spin_unlock(&foo_mutex);
+ call_rcu(&old_fp->rcu, foo_reclaim);
+ }
+
+The foo_reclaim() function might appear as follows:
+
+ void foo_reclaim(struct rcu_head *rp)
+ {
+ struct foo *fp = container_of(rp, struct foo, rcu);
+
+ kfree(fp);
+ }
+
+The container_of() primitive is a macro that, given a pointer into a
+struct, the type of the struct, and the pointed-to field within the
+struct, returns a pointer to the beginning of the struct.
+
+The use of call_rcu() permits the caller of foo_update_a() to
+immediately regain control, without needing to worry further about the
+old version of the newly updated element. It also clearly shows the
+RCU distinction between updater, namely foo_update_a(), and reclaimer,
+namely foo_reclaim().
+
+The summary of advice is the same as for the previous section, except
+that we are now using call_rcu() rather than synchronize_rcu():
+
+o Use call_rcu() -after- removing a data element from an
+ RCU-protected data structure in order to register a callback
+ function that will be invoked after the completion of all RCU
+ read-side critical sections that might be referencing that
+ data item.
+
+Again, see checklist.txt for additional rules governing the use of RCU.
+
+
+5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
+
+One of the nice things about RCU is that it has extremely simple "toy"
+implementations that are a good first step towards understanding the
+production-quality implementations in the Linux kernel. This section
+presents two such "toy" implementations of RCU, one that is implemented
+in terms of familiar locking primitives, and another that more closely
+resembles "classic" RCU. Both are way too simple for real-world use,
+lacking both functionality and performance. However, they are useful
+in getting a feel for how RCU works. See kernel/rcupdate.c for a
+production-quality implementation, and see:
+
+ http://www.rdrop.com/users/paulmck/RCU
+
+for papers describing the Linux kernel RCU implementation. The OLS'01
+and OLS'02 papers are a good introduction, and the dissertation provides
+more details on the current implementation as of early 2004.
+
+
+5A. "TOY" IMPLEMENTATION #1: LOCKING
+
+This section presents a "toy" RCU implementation that is based on
+familiar locking primitives. Its overhead makes it a non-starter for
+real-life use, as does its lack of scalability. It is also unsuitable
+for realtime use, since it allows scheduling latency to "bleed" from
+one read-side critical section to another.
+
+However, it is probably the easiest implementation to relate to, so is
+a good starting point.
+
+It is extremely simple:
+
+ static DEFINE_RWLOCK(rcu_gp_mutex);
+
+ void rcu_read_lock(void)
+ {
+ read_lock(&rcu_gp_mutex);
+ }
+
+ void rcu_read_unlock(void)
+ {
+ read_unlock(&rcu_gp_mutex);
+ }
+
+ void synchronize_rcu(void)
+ {
+ write_lock(&rcu_gp_mutex);
+ write_unlock(&rcu_gp_mutex);
+ }
+
+[You can ignore rcu_assign_pointer() and rcu_dereference() without
+missing much. But here they are anyway. And whatever you do, don't
+forget about them when submitting patches making use of RCU!]
+
+ #define rcu_assign_pointer(p, v) ({ \
+ smp_wmb(); \
+ (p) = (v); \
+ })
+
+ #define rcu_dereference(p) ({ \
+ typeof(p) _________p1 = p; \
+ smp_read_barrier_depends(); \
+ (_________p1); \
+ })
+
+
+The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
+and release a global reader-writer lock. The synchronize_rcu()
+primitive write-acquires this same lock, then immediately releases
+it. This means that once synchronize_rcu() exits, all RCU read-side
+critical sections that were in progress before synchronize_rcu() was
+called are guaranteed to have completed -- there is no way that
+synchronize_rcu() would have been able to write-acquire the lock
+otherwise.
+
+It is possible to nest rcu_read_lock(), since reader-writer locks may
+be recursively acquired. Note also that rcu_read_lock() is immune
+from deadlock (an important property of RCU). The reason for this is
+that the only thing that can block rcu_read_lock() is a synchronize_rcu().
+But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
+so there can be no deadlock cycle.
+
+Quick Quiz #1: Why is this argument naive? How could a deadlock
+ occur when using this algorithm in a real-world Linux
+ kernel? How could this deadlock be avoided?
+
+
+5B. "TOY" EXAMPLE #2: CLASSIC RCU
+
+This section presents a "toy" RCU implementation that is based on
+"classic RCU". It is also short on performance (but only for updates) and
+on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
+kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
+are the same as those shown in the preceding section, so they are omitted.
+
+ void rcu_read_lock(void) { }
+
+ void rcu_read_unlock(void) { }
+
+ void synchronize_rcu(void)
+ {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ run_on(cpu);
+ }
+
+Note that rcu_read_lock() and rcu_read_unlock() do absolutely nothing.
+This is the great strength of classic RCU in a non-preemptive kernel:
+read-side overhead is precisely zero, at least on non-Alpha CPUs.
+And there is absolutely no way that rcu_read_lock() can possibly
+participate in a deadlock cycle!
+
+The implementation of synchronize_rcu() simply schedules itself on each
+CPU in turn. The run_on() primitive can be implemented straightforwardly
+in terms of the sched_setaffinity() primitive. Of course, a somewhat less
+"toy" implementation would restore the affinity upon completion rather
+than just leaving all tasks running on the last CPU, but when I said
+"toy", I meant -toy-!
+
+So how the heck is this supposed to work???
+
+Remember that it is illegal to block while in an RCU read-side critical
+section. Therefore, if a given CPU executes a context switch, we know
+that it must have completed all preceding RCU read-side critical sections.
+Once -all- CPUs have executed a context switch, then -all- preceding
+RCU read-side critical sections will have completed.
+
+So, suppose that we remove a data item from its structure and then invoke
+synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed
+that there are no RCU read-side critical sections holding a reference
+to that data item, so we can safely reclaim it.
+
+Quick Quiz #2: Give an example where Classic RCU's read-side
+ overhead is -negative-.
+
+Quick Quiz #3: If it is illegal to block in an RCU read-side
+ critical section, what the heck do you do in
+ PREEMPT_RT, where normal spinlocks can block???
+
+
+6. ANALOGY WITH READER-WRITER LOCKING
+
+Although RCU can be used in many different ways, a very common use of
+RCU is analogous to reader-writer locking. The following unified
+diff shows how closely related RCU and reader-writer locking can be.
+
+ @@ -13,15 +14,15 @@
+ struct list_head *lp;
+ struct el *p;
+
+ - read_lock();
+ - list_for_each_entry(p, head, lp) {
+ + rcu_read_lock();
+ + list_for_each_entry_rcu(p, head, lp) {
+ if (p->key == key) {
+ *result = p->data;
+ - read_unlock();
+ + rcu_read_unlock();
+ return 1;
+ }
+ }
+ - read_unlock();
+ + rcu_read_unlock();
+ return 0;
+ }
+
+ @@ -29,15 +30,16 @@
+ {
+ struct el *p;
+
+ - write_lock(&listmutex);
+ + spin_lock(&listmutex);
+ list_for_each_entry(p, head, lp) {
+ if (p->key == key) {
+ - list_del(&p->list);
+ - write_unlock(&listmutex);
+ + list_del_rcu(&p->list);
+ + spin_unlock(&listmutex);
+ + synchronize_rcu();
+ kfree(p);
+ return 1;
+ }
+ }
+ - write_unlock(&listmutex);
+ + spin_unlock(&listmutex);
+ return 0;
+ }
+
+Or, for those who prefer a side-by-side listing:
+
+ 1 struct el { 1 struct el {
+ 2 struct list_head list; 2 struct list_head list;
+ 3 long key; 3 long key;
+ 4 spinlock_t mutex; 4 spinlock_t mutex;
+ 5 int data; 5 int data;
+ 6 /* Other data fields */ 6 /* Other data fields */
+ 7 }; 7 };
+ 8 spinlock_t listmutex; 8 spinlock_t listmutex;
+ 9 struct el head; 9 struct el head;
+
+ 1 int search(long key, int *result) 1 int search(long key, int *result)
+ 2 { 2 {
+ 3 struct list_head *lp; 3 struct list_head *lp;
+ 4 struct el *p; 4 struct el *p;
+ 5 5
+ 6 read_lock(); 6 rcu_read_lock();
+ 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
+ 8 if (p->key == key) { 8 if (p->key == key) {
+ 9 *result = p->data; 9 *result = p->data;
+10 read_unlock(); 10 rcu_read_unlock();
+11 return 1; 11 return 1;
+12 } 12 }
+13 } 13 }
+14 read_unlock(); 14 rcu_read_unlock();
+15 return 0; 15 return 0;
+16 } 16 }
+
+ 1 int delete(long key) 1 int delete(long key)
+ 2 { 2 {
+ 3 struct el *p; 3 struct el *p;
+ 4 4
+ 5 write_lock(&listmutex); 5 spin_lock(&listmutex);
+ 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
+ 7 if (p->key == key) { 7 if (p->key == key) {
+ 8 list_del(&p->list); 8 list_del_rcu(&p->list);
+ 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
+ 10 synchronize_rcu();
+10 kfree(p); 11 kfree(p);
+11 return 1; 12 return 1;
+12 } 13 }
+13 } 14 }
+14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
+15 return 0; 16 return 0;
+16 } 17 }
+
+Either way, the differences are quite small. Read-side locking moves
+to rcu_read_lock() and rcu_read_unlock, update-side locking moves from
+a reader-writer lock to a simple spinlock, and a synchronize_rcu()
+precedes the kfree().
+
+However, there is one potential catch: the read-side and update-side
+critical sections can now run concurrently. In many cases, this will
+not be a problem, but it is necessary to check carefully regardless.
+For example, if multiple independent list updates must be seen as
+a single atomic update, converting to RCU will require special care.
+
+Also, the presence of synchronize_rcu() means that the RCU version of
+delete() can now block. If this is a problem, there is a callback-based
+mechanism that never blocks, namely call_rcu(), that can be used in
+place of synchronize_rcu().
+
+
+7. FULL LIST OF RCU APIs
+
+The RCU APIs are documented in docbook-format header comments in the
+Linux-kernel source code, but it helps to have a full list of the
+APIs, since there does not appear to be a way to categorize them
+in docbook. Here is the list, by category.
+
+RCU pointer/list traversal:
+
+ rcu_dereference
+ list_for_each_entry_rcu
+ hlist_for_each_entry_rcu
+
+ list_for_each_continue_rcu (to be deprecated in favor of new
+ list_for_each_entry_continue_rcu)
+
+RCU pointer/list update:
+
+ rcu_assign_pointer
+ list_add_rcu
+ list_add_tail_rcu
+ list_del_rcu
+ list_replace_rcu
+ hlist_del_rcu
+ hlist_add_after_rcu
+ hlist_add_before_rcu
+ hlist_add_head_rcu
+ hlist_replace_rcu
+ list_splice_init_rcu()
+
+RCU: Critical sections Grace period Barrier
+
+ rcu_read_lock synchronize_net rcu_barrier
+ rcu_read_unlock synchronize_rcu
+ call_rcu
+
+
+bh: Critical sections Grace period Barrier
+
+ rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
+ rcu_read_unlock_bh
+
+
+sched: Critical sections Grace period Barrier
+
+ [preempt_disable] synchronize_sched rcu_barrier_sched
+ [and friends] call_rcu_sched
+
+
+SRCU: Critical sections Grace period Barrier
+
+ srcu_read_lock synchronize_srcu N/A
+ srcu_read_unlock
+
+
+See the comment headers in the source code (or the docbook generated
+from them) for more information.
+
+
+8. ANSWERS TO QUICK QUIZZES
+
+Quick Quiz #1: Why is this argument naive? How could a deadlock
+ occur when using this algorithm in a real-world Linux
+ kernel? [Referring to the lock-based "toy" RCU
+ algorithm.]
+
+Answer: Consider the following sequence of events:
+
+ 1. CPU 0 acquires some unrelated lock, call it
+ "problematic_lock", disabling irq via
+ spin_lock_irqsave().
+
+ 2. CPU 1 enters synchronize_rcu(), write-acquiring
+ rcu_gp_mutex.
+
+ 3. CPU 0 enters rcu_read_lock(), but must wait
+ because CPU 1 holds rcu_gp_mutex.
+
+ 4. CPU 1 is interrupted, and the irq handler
+ attempts to acquire problematic_lock.
+
+ The system is now deadlocked.
+
+ One way to avoid this deadlock is to use an approach like
+ that of CONFIG_PREEMPT_RT, where all normal spinlocks
+ become blocking locks, and all irq handlers execute in
+ the context of special tasks. In this case, in step 4
+ above, the irq handler would block, allowing CPU 1 to
+ release rcu_gp_mutex, avoiding the deadlock.
+
+ Even in the absence of deadlock, this RCU implementation
+ allows latency to "bleed" from readers to other
+ readers through synchronize_rcu(). To see this,
+ consider task A in an RCU read-side critical section
+ (thus read-holding rcu_gp_mutex), task B blocked
+ attempting to write-acquire rcu_gp_mutex, and
+ task C blocked in rcu_read_lock() attempting to
+ read_acquire rcu_gp_mutex. Task A's RCU read-side
+ latency is holding up task C, albeit indirectly via
+ task B.
+
+ Realtime RCU implementations therefore use a counter-based
+ approach where tasks in RCU read-side critical sections
+ cannot be blocked by tasks executing synchronize_rcu().
+
+Quick Quiz #2: Give an example where Classic RCU's read-side
+ overhead is -negative-.
+
+Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
+ kernel where a routing table is used by process-context
+ code, but can be updated by irq-context code (for example,
+ by an "ICMP REDIRECT" packet). The usual way of handling
+ this would be to have the process-context code disable
+ interrupts while searching the routing table. Use of
+ RCU allows such interrupt-disabling to be dispensed with.
+ Thus, without RCU, you pay the cost of disabling interrupts,
+ and with RCU you don't.
+
+ One can argue that the overhead of RCU in this
+ case is negative with respect to the single-CPU
+ interrupt-disabling approach. Others might argue that
+ the overhead of RCU is merely zero, and that replacing
+ the positive overhead of the interrupt-disabling scheme
+ with the zero-overhead RCU scheme does not constitute
+ negative overhead.
+
+ In real life, of course, things are more complex. But
+ even the theoretical possibility of negative overhead for
+ a synchronization primitive is a bit unexpected. ;-)
+
+Quick Quiz #3: If it is illegal to block in an RCU read-side
+ critical section, what the heck do you do in
+ PREEMPT_RT, where normal spinlocks can block???
+
+Answer: Just as PREEMPT_RT permits preemption of spinlock
+ critical sections, it permits preemption of RCU
+ read-side critical sections. It also permits
+ spinlocks blocking while in RCU read-side critical
+ sections.
+
+ Why the apparent inconsistency? Because it is it
+ possible to use priority boosting to keep the RCU
+ grace periods short if need be (for example, if running
+ short of memory). In contrast, if blocking waiting
+ for (say) network reception, there is no way to know
+ what should be boosted. Especially given that the
+ process we need to boost might well be a human being
+ who just went out for a pizza or something. And although
+ a computer-operated cattle prod might arouse serious
+ interest, it might also provoke serious objections.
+ Besides, how does the computer know what pizza parlor
+ the human being went to???
+
+
+ACKNOWLEDGEMENTS
+
+My thanks to the people who helped make this human-readable, including
+Jon Walpole, Josh Triplett, Serge Hallyn, Suzanne Wood, and Alan Stern.
+
+
+For more information, see http://www.rdrop.com/users/paulmck/RCU.
diff --git a/Documentation/SAK.txt b/Documentation/SAK.txt
new file mode 100644
index 0000000..74be146
--- /dev/null
+++ b/Documentation/SAK.txt
@@ -0,0 +1,88 @@
+Linux 2.4.2 Secure Attention Key (SAK) handling
+18 March 2001, Andrew Morton
+
+An operating system's Secure Attention Key is a security tool which is
+provided as protection against trojan password capturing programs. It
+is an undefeatable way of killing all programs which could be
+masquerading as login applications. Users need to be taught to enter
+this key sequence before they log in to the system.
+
+From the PC keyboard, Linux has two similar but different ways of
+providing SAK. One is the ALT-SYSRQ-K sequence. You shouldn't use
+this sequence. It is only available if the kernel was compiled with
+sysrq support.
+
+The proper way of generating a SAK is to define the key sequence using
+`loadkeys'. This will work whether or not sysrq support is compiled
+into the kernel.
+
+SAK works correctly when the keyboard is in raw mode. This means that
+once defined, SAK will kill a running X server. If the system is in
+run level 5, the X server will restart. This is what you want to
+happen.
+
+What key sequence should you use? Well, CTRL-ALT-DEL is used to reboot
+the machine. CTRL-ALT-BACKSPACE is magical to the X server. We'll
+choose CTRL-ALT-PAUSE.
+
+In your rc.sysinit (or rc.local) file, add the command
+
+ echo "control alt keycode 101 = SAK" | /bin/loadkeys
+
+And that's it! Only the superuser may reprogram the SAK key.
+
+
+NOTES
+=====
+
+1: Linux SAK is said to be not a "true SAK" as is required by
+ systems which implement C2 level security. This author does not
+ know why.
+
+
+2: On the PC keyboard, SAK kills all applications which have
+ /dev/console opened.
+
+ Unfortunately this includes a number of things which you don't
+ actually want killed. This is because these applications are
+ incorrectly holding /dev/console open. Be sure to complain to your
+ Linux distributor about this!
+
+ You can identify processes which will be killed by SAK with the
+ command
+
+ # ls -l /proc/[0-9]*/fd/* | grep console
+ l-wx------ 1 root root 64 Mar 18 00:46 /proc/579/fd/0 -> /dev/console
+
+ Then:
+
+ # ps aux|grep 579
+ root 579 0.0 0.1 1088 436 ? S 00:43 0:00 gpm -t ps/2
+
+ So `gpm' will be killed by SAK. This is a bug in gpm. It should
+ be closing standard input. You can work around this by finding the
+ initscript which launches gpm and changing it thusly:
+
+ Old:
+
+ daemon gpm
+
+ New:
+
+ daemon gpm < /dev/null
+
+ Vixie cron also seems to have this problem, and needs the same treatment.
+
+ Also, one prominent Linux distribution has the following three
+ lines in its rc.sysinit and rc scripts:
+
+ exec 3<&0
+ exec 4>&1
+ exec 5>&2
+
+ These commands cause *all* daemons which are launched by the
+ initscripts to have file descriptors 3, 4 and 5 attached to
+ /dev/console. So SAK kills them all. A workaround is to simply
+ delete these lines, but this may cause system management
+ applications to malfunction - test everything well.
+
diff --git a/Documentation/SELinux.txt b/Documentation/SELinux.txt
new file mode 100644
index 0000000..07eae00
--- /dev/null
+++ b/Documentation/SELinux.txt
@@ -0,0 +1,27 @@
+If you want to use SELinux, chances are you will want
+to use the distro-provided policies, or install the
+latest reference policy release from
+ http://oss.tresys.com/projects/refpolicy
+
+However, if you want to install a dummy policy for
+testing, you can do using 'mdp' provided under
+scripts/selinux. Note that this requires the selinux
+userspace to be installed - in particular you will
+need checkpolicy to compile a kernel, and setfiles and
+fixfiles to label the filesystem.
+
+ 1. Compile the kernel with selinux enabled.
+ 2. Type 'make' to compile mdp.
+ 3. Make sure that you are not running with
+ SELinux enabled and a real policy. If
+ you are, reboot with selinux disabled
+ before continuing.
+ 4. Run install_policy.sh:
+ cd scripts/selinux
+ sh install_policy.sh
+
+Step 4 will create a new dummy policy valid for your
+kernel, with a single selinux user, role, and type.
+It will compile the policy, will set your SELINUXTYPE to
+dummy in /etc/selinux/config, install the compiled policy
+as 'dummy', and relabel your filesystem.
diff --git a/Documentation/SM501.txt b/Documentation/SM501.txt
new file mode 100644
index 0000000..6fc6560
--- /dev/null
+++ b/Documentation/SM501.txt
@@ -0,0 +1,71 @@
+ SM501 Driver
+ ============
+
+Copyright 2006, 2007 Simtec Electronics
+
+The Silicon Motion SM501 multimedia companion chip is a multifunction device
+which may provide numerous interfaces including USB host controller USB gadget,
+Asyncronous Serial ports, Audio functions and a dual display video interface.
+The device may be connected by PCI or local bus with varying functions enabled.
+
+Core
+----
+
+The core driver in drivers/mfd provides common services for the
+drivers which manage the specific hardware blocks. These services
+include locking for common registers, clock control and resource
+management.
+
+The core registers drivers for both PCI and generic bus based
+chips via the platform device and driver system.
+
+On detection of a device, the core initialises the chip (which may
+be specified by the platform data) and then exports the selected
+peripheral set as platform devices for the specific drivers.
+
+The core re-uses the platform device system as the platform device
+system provides enough features to support the drivers without the
+need to create a new bus-type and the associated code to go with it.
+
+
+Resources
+---------
+
+Each peripheral has a view of the device which is implicitly narrowed to
+the specific set of resources that peripheral requires in order to
+function correctly.
+
+The centralised memory allocation allows the driver to ensure that the
+maximum possible resource allocation can be made to the video subsystem
+as this is by-far the most resource-sensitive of the on-chip functions.
+
+The primary issue with memory allocation is that of moving the video
+buffers once a display mode is chosen. Indeed when a video mode change
+occurs the memory footprint of the video subsystem changes.
+
+Since video memory is difficult to move without changing the display
+(unless sufficient contiguous memory can be provided for the old and new
+modes simultaneously) the video driver fully utilises the memory area
+given to it by aligning fb0 to the start of the area and fb1 to the end
+of it. Any memory left over in the middle is used for the acceleration
+functions, which are transient and thus their location is less critical
+as it can be moved.
+
+
+Configuration
+-------------
+
+The platform device driver uses a set of platform data to pass
+configurations through to the core and the subsidiary drivers
+so that there can be support for more than one system carrying
+an SM501 built into a single kernel image.
+
+The PCI driver assumes that the PCI card behaves as per the Silicon
+Motion reference design.
+
+There is an errata (AB-5) affecting the selection of the
+of the M1XCLK and M1CLK frequencies. These two clocks
+must be sourced from the same PLL, although they can then
+be divided down individually. If this is not set, then SM501 may
+lock and hang the whole system. The driver will refuse to
+attach if the PLL selection is different.
diff --git a/Documentation/SecurityBugs b/Documentation/SecurityBugs
new file mode 100644
index 0000000..26c3b36
--- /dev/null
+++ b/Documentation/SecurityBugs
@@ -0,0 +1,38 @@
+Linux kernel developers take security very seriously. As such, we'd
+like to know when a security bug is found so that it can be fixed and
+disclosed as quickly as possible. Please report security bugs to the
+Linux kernel security team.
+
+1) Contact
+
+The Linux kernel security team can be contacted by email at
+<security@kernel.org>. This is a private list of security officers
+who will help verify the bug report and develop and release a fix.
+It is possible that the security team will bring in extra help from
+area maintainers to understand and fix the security vulnerability.
+
+As it is with any bug, the more information provided the easier it
+will be to diagnose and fix. Please review the procedure outlined in
+REPORTING-BUGS if you are unclear about what information is helpful.
+Any exploit code is very helpful and will not be released without
+consent from the reporter unless it has already been made public.
+
+2) Disclosure
+
+The goal of the Linux kernel security team is to work with the
+bug submitter to bug resolution as well as disclosure. We prefer
+to fully disclose the bug as soon as possible. It is reasonable to
+delay disclosure when the bug or the fix is not yet fully understood,
+the solution is not well-tested or for vendor coordination. However, we
+expect these delays to be short, measurable in days, not weeks or months.
+A disclosure date is negotiated by the security team working with the
+bug submitter as well as vendors. However, the kernel security team
+holds the final say when setting a disclosure date. The timeframe for
+disclosure is from immediate (esp. if it's already publically known)
+to a few weeks. As a basic default policy, we expect report date to
+disclosure date to be on the order of 7 days.
+
+3) Non-disclosure agreements
+
+The Linux kernel security team is not a formal body and therefore unable
+to enter any non-disclosure agreements.
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
new file mode 100644
index 0000000..989c2fc
--- /dev/null
+++ b/Documentation/Smack.txt
@@ -0,0 +1,493 @@
+
+
+ "Good for you, you've decided to clean the elevator!"
+ - The Elevator, from Dark Star
+
+Smack is the the Simplified Mandatory Access Control Kernel.
+Smack is a kernel based implementation of mandatory access
+control that includes simplicity in its primary design goals.
+
+Smack is not the only Mandatory Access Control scheme
+available for Linux. Those new to Mandatory Access Control
+are encouraged to compare Smack with the other mechanisms
+available to determine which is best suited to the problem
+at hand.
+
+Smack consists of three major components:
+ - The kernel
+ - A start-up script and a few modified applications
+ - Configuration data
+
+The kernel component of Smack is implemented as a Linux
+Security Modules (LSM) module. It requires netlabel and
+works best with file systems that support extended attributes,
+although xattr support is not strictly required.
+It is safe to run a Smack kernel under a "vanilla" distribution.
+Smack kernels use the CIPSO IP option. Some network
+configurations are intolerant of IP options and can impede
+access to systems that use them as Smack does.
+
+The startup script etc-init.d-smack should be installed
+in /etc/init.d/smack and should be invoked early in the
+start-up process. On Fedora rc5.d/S02smack is recommended.
+This script ensures that certain devices have the correct
+Smack attributes and loads the Smack configuration if
+any is defined. This script invokes two programs that
+ensure configuration data is properly formatted. These
+programs are /usr/sbin/smackload and /usr/sin/smackcipso.
+The system will run just fine without these programs,
+but it will be difficult to set access rules properly.
+
+A version of "ls" that provides a "-M" option to display
+Smack labels on long listing is available.
+
+A hacked version of sshd that allows network logins by users
+with specific Smack labels is available. This version does
+not work for scp. You must set the /etc/ssh/sshd_config
+line:
+ UsePrivilegeSeparation no
+
+The format of /etc/smack/usr is:
+
+ username smack
+
+In keeping with the intent of Smack, configuration data is
+minimal and not strictly required. The most important
+configuration step is mounting the smackfs pseudo filesystem.
+
+Add this line to /etc/fstab:
+
+ smackfs /smack smackfs smackfsdef=* 0 0
+
+and create the /smack directory for mounting.
+
+Smack uses extended attributes (xattrs) to store file labels.
+The command to set a Smack label on a file is:
+
+ # attr -S -s SMACK64 -V "value" path
+
+NOTE: Smack labels are limited to 23 characters. The attr command
+ does not enforce this restriction and can be used to set
+ invalid Smack labels on files.
+
+If you don't do anything special all users will get the floor ("_")
+label when they log in. If you do want to log in via the hacked ssh
+at other labels use the attr command to set the smack value on the
+home directory and it's contents.
+
+You can add access rules in /etc/smack/accesses. They take the form:
+
+ subjectlabel objectlabel access
+
+access is a combination of the letters rwxa which specify the
+kind of access permitted a subject with subjectlabel on an
+object with objectlabel. If there is no rule no access is allowed.
+
+A process can see the smack label it is running with by
+reading /proc/self/attr/current. A privileged process can
+set the process smack by writing there.
+
+Look for additional programs on http://schaufler-ca.com
+
+From the Smack Whitepaper:
+
+The Simplified Mandatory Access Control Kernel
+
+Casey Schaufler
+casey@schaufler-ca.com
+
+Mandatory Access Control
+
+Computer systems employ a variety of schemes to constrain how information is
+shared among the people and services using the machine. Some of these schemes
+allow the program or user to decide what other programs or users are allowed
+access to pieces of data. These schemes are called discretionary access
+control mechanisms because the access control is specified at the discretion
+of the user. Other schemes do not leave the decision regarding what a user or
+program can access up to users or programs. These schemes are called mandatory
+access control mechanisms because you don't have a choice regarding the users
+or programs that have access to pieces of data.
+
+Bell & LaPadula
+
+From the middle of the 1980's until the turn of the century Mandatory Access
+Control (MAC) was very closely associated with the Bell & LaPadula security
+model, a mathematical description of the United States Department of Defense
+policy for marking paper documents. MAC in this form enjoyed a following
+within the Capital Beltway and Scandinavian supercomputer centers but was
+often sited as failing to address general needs.
+
+Domain Type Enforcement
+
+Around the turn of the century Domain Type Enforcement (DTE) became popular.
+This scheme organizes users, programs, and data into domains that are
+protected from each other. This scheme has been widely deployed as a component
+of popular Linux distributions. The administrative overhead required to
+maintain this scheme and the detailed understanding of the whole system
+necessary to provide a secure domain mapping leads to the scheme being
+disabled or used in limited ways in the majority of cases.
+
+Smack
+
+Smack is a Mandatory Access Control mechanism designed to provide useful MAC
+while avoiding the pitfalls of its predecessors. The limitations of Bell &
+LaPadula are addressed by providing a scheme whereby access can be controlled
+according to the requirements of the system and its purpose rather than those
+imposed by an arcane government policy. The complexity of Domain Type
+Enforcement and avoided by defining access controls in terms of the access
+modes already in use.
+
+Smack Terminology
+
+The jargon used to talk about Smack will be familiar to those who have dealt
+with other MAC systems and shouldn't be too difficult for the uninitiated to
+pick up. There are four terms that are used in a specific way and that are
+especially important:
+
+ Subject: A subject is an active entity on the computer system.
+ On Smack a subject is a task, which is in turn the basic unit
+ of execution.
+
+ Object: An object is a passive entity on the computer system.
+ On Smack files of all types, IPC, and tasks can be objects.
+
+ Access: Any attempt by a subject to put information into or get
+ information from an object is an access.
+
+ Label: Data that identifies the Mandatory Access Control
+ characteristics of a subject or an object.
+
+These definitions are consistent with the traditional use in the security
+community. There are also some terms from Linux that are likely to crop up:
+
+ Capability: A task that possesses a capability has permission to
+ violate an aspect of the system security policy, as identified by
+ the specific capability. A task that possesses one or more
+ capabilities is a privileged task, whereas a task with no
+ capabilities is an unprivileged task.
+
+ Privilege: A task that is allowed to violate the system security
+ policy is said to have privilege. As of this writing a task can
+ have privilege either by possessing capabilities or by having an
+ effective user of root.
+
+Smack Basics
+
+Smack is an extension to a Linux system. It enforces additional restrictions
+on what subjects can access which objects, based on the labels attached to
+each of the subject and the object.
+
+Labels
+
+Smack labels are ASCII character strings, one to twenty-three characters in
+length. Single character labels using special characters, that being anything
+other than a letter or digit, are reserved for use by the Smack development
+team. Smack labels are unstructured, case sensitive, and the only operation
+ever performed on them is comparison for equality. Smack labels cannot
+contain unprintable characters or the "/" (slash) character.
+
+There are some predefined labels:
+
+ _ Pronounced "floor", a single underscore character.
+ ^ Pronounced "hat", a single circumflex character.
+ * Pronounced "star", a single asterisk character.
+ ? Pronounced "huh", a single question mark character.
+
+Every task on a Smack system is assigned a label. System tasks, such as
+init(8) and systems daemons, are run with the floor ("_") label. User tasks
+are assigned labels according to the specification found in the
+/etc/smack/user configuration file.
+
+Access Rules
+
+Smack uses the traditional access modes of Linux. These modes are read,
+execute, write, and occasionally append. There are a few cases where the
+access mode may not be obvious. These include:
+
+ Signals: A signal is a write operation from the subject task to
+ the object task.
+ Internet Domain IPC: Transmission of a packet is considered a
+ write operation from the source task to the destination task.
+
+Smack restricts access based on the label attached to a subject and the label
+attached to the object it is trying to access. The rules enforced are, in
+order:
+
+ 1. Any access requested by a task labeled "*" is denied.
+ 2. A read or execute access requested by a task labeled "^"
+ is permitted.
+ 3. A read or execute access requested on an object labeled "_"
+ is permitted.
+ 4. Any access requested on an object labeled "*" is permitted.
+ 5. Any access requested by a task on an object with the same
+ label is permitted.
+ 6. Any access requested that is explicitly defined in the loaded
+ rule set is permitted.
+ 7. Any other access is denied.
+
+Smack Access Rules
+
+With the isolation provided by Smack access separation is simple. There are
+many interesting cases where limited access by subjects to objects with
+different labels is desired. One example is the familiar spy model of
+sensitivity, where a scientist working on a highly classified project would be
+able to read documents of lower classifications and anything she writes will
+be "born" highly classified. To accommodate such schemes Smack includes a
+mechanism for specifying rules allowing access between labels.
+
+Access Rule Format
+
+The format of an access rule is:
+
+ subject-label object-label access
+
+Where subject-label is the Smack label of the task, object-label is the Smack
+label of the thing being accessed, and access is a string specifying the sort
+of access allowed. The Smack labels are limited to 23 characters. The access
+specification is searched for letters that describe access modes:
+
+ a: indicates that append access should be granted.
+ r: indicates that read access should be granted.
+ w: indicates that write access should be granted.
+ x: indicates that execute access should be granted.
+
+Uppercase values for the specification letters are allowed as well.
+Access mode specifications can be in any order. Examples of acceptable rules
+are:
+
+ TopSecret Secret rx
+ Secret Unclass R
+ Manager Game x
+ User HR w
+ New Old rRrRr
+ Closed Off -
+
+Examples of unacceptable rules are:
+
+ Top Secret Secret rx
+ Ace Ace r
+ Odd spells waxbeans
+
+Spaces are not allowed in labels. Since a subject always has access to files
+with the same label specifying a rule for that case is pointless. Only
+valid letters (rwxaRWXA) and the dash ('-') character are allowed in
+access specifications. The dash is a placeholder, so "a-r" is the same
+as "ar". A lone dash is used to specify that no access should be allowed.
+
+Applying Access Rules
+
+The developers of Linux rarely define new sorts of things, usually importing
+schemes and concepts from other systems. Most often, the other systems are
+variants of Unix. Unix has many endearing properties, but consistency of
+access control models is not one of them. Smack strives to treat accesses as
+uniformly as is sensible while keeping with the spirit of the underlying
+mechanism.
+
+File system objects including files, directories, named pipes, symbolic links,
+and devices require access permissions that closely match those used by mode
+bit access. To open a file for reading read access is required on the file. To
+search a directory requires execute access. Creating a file with write access
+requires both read and write access on the containing directory. Deleting a
+file requires read and write access to the file and to the containing
+directory. It is possible that a user may be able to see that a file exists
+but not any of its attributes by the circumstance of having read access to the
+containing directory but not to the differently labeled file. This is an
+artifact of the file name being data in the directory, not a part of the file.
+
+IPC objects, message queues, semaphore sets, and memory segments exist in flat
+namespaces and access requests are only required to match the object in
+question.
+
+Process objects reflect tasks on the system and the Smack label used to access
+them is the same Smack label that the task would use for its own access
+attempts. Sending a signal via the kill() system call is a write operation
+from the signaler to the recipient. Debugging a process requires both reading
+and writing. Creating a new task is an internal operation that results in two
+tasks with identical Smack labels and requires no access checks.
+
+Sockets are data structures attached to processes and sending a packet from
+one process to another requires that the sender have write access to the
+receiver. The receiver is not required to have read access to the sender.
+
+Setting Access Rules
+
+The configuration file /etc/smack/accesses contains the rules to be set at
+system startup. The contents are written to the special file /smack/load.
+Rules can be written to /smack/load at any time and take effect immediately.
+For any pair of subject and object labels there can be only one rule, with the
+most recently specified overriding any earlier specification.
+
+The program smackload is provided to ensure data is formatted
+properly when written to /smack/load. This program reads lines
+of the form
+
+ subjectlabel objectlabel mode.
+
+Task Attribute
+
+The Smack label of a process can be read from /proc/<pid>/attr/current. A
+process can read its own Smack label from /proc/self/attr/current. A
+privileged process can change its own Smack label by writing to
+/proc/self/attr/current but not the label of another process.
+
+File Attribute
+
+The Smack label of a filesystem object is stored as an extended attribute
+named SMACK64 on the file. This attribute is in the security namespace. It can
+only be changed by a process with privilege.
+
+Privilege
+
+A process with CAP_MAC_OVERRIDE is privileged.
+
+Smack Networking
+
+As mentioned before, Smack enforces access control on network protocol
+transmissions. Every packet sent by a Smack process is tagged with its Smack
+label. This is done by adding a CIPSO tag to the header of the IP packet. Each
+packet received is expected to have a CIPSO tag that identifies the label and
+if it lacks such a tag the network ambient label is assumed. Before the packet
+is delivered a check is made to determine that a subject with the label on the
+packet has write access to the receiving process and if that is not the case
+the packet is dropped.
+
+CIPSO Configuration
+
+It is normally unnecessary to specify the CIPSO configuration. The default
+values used by the system handle all internal cases. Smack will compose CIPSO
+label values to match the Smack labels being used without administrative
+intervention. Unlabeled packets that come into the system will be given the
+ambient label.
+
+Smack requires configuration in the case where packets from a system that is
+not smack that speaks CIPSO may be encountered. Usually this will be a Trusted
+Solaris system, but there are other, less widely deployed systems out there.
+CIPSO provides 3 important values, a Domain Of Interpretation (DOI), a level,
+and a category set with each packet. The DOI is intended to identify a group
+of systems that use compatible labeling schemes, and the DOI specified on the
+smack system must match that of the remote system or packets will be
+discarded. The DOI is 3 by default. The value can be read from /smack/doi and
+can be changed by writing to /smack/doi.
+
+The label and category set are mapped to a Smack label as defined in
+/etc/smack/cipso.
+
+A Smack/CIPSO mapping has the form:
+
+ smack level [category [category]*]
+
+Smack does not expect the level or category sets to be related in any
+particular way and does not assume or assign accesses based on them. Some
+examples of mappings:
+
+ TopSecret 7
+ TS:A,B 7 1 2
+ SecBDE 5 2 4 6
+ RAFTERS 7 12 26
+
+The ":" and "," characters are permitted in a Smack label but have no special
+meaning.
+
+The mapping of Smack labels to CIPSO values is defined by writing to
+/smack/cipso. Again, the format of data written to this special file
+is highly restrictive, so the program smackcipso is provided to
+ensure the writes are done properly. This program takes mappings
+on the standard input and sends them to /smack/cipso properly.
+
+In addition to explicit mappings Smack supports direct CIPSO mappings. One
+CIPSO level is used to indicate that the category set passed in the packet is
+in fact an encoding of the Smack label. The level used is 250 by default. The
+value can be read from /smack/direct and changed by writing to /smack/direct.
+
+Socket Attributes
+
+There are two attributes that are associated with sockets. These attributes
+can only be set by privileged tasks, but any task can read them for their own
+sockets.
+
+ SMACK64IPIN: The Smack label of the task object. A privileged
+ program that will enforce policy may set this to the star label.
+
+ SMACK64IPOUT: The Smack label transmitted with outgoing packets.
+ A privileged program may set this to match the label of another
+ task with which it hopes to communicate.
+
+Writing Applications for Smack
+
+There are three sorts of applications that will run on a Smack system. How an
+application interacts with Smack will determine what it will have to do to
+work properly under Smack.
+
+Smack Ignorant Applications
+
+By far the majority of applications have no reason whatever to care about the
+unique properties of Smack. Since invoking a program has no impact on the
+Smack label associated with the process the only concern likely to arise is
+whether the process has execute access to the program.
+
+Smack Relevant Applications
+
+Some programs can be improved by teaching them about Smack, but do not make
+any security decisions themselves. The utility ls(1) is one example of such a
+program.
+
+Smack Enforcing Applications
+
+These are special programs that not only know about Smack, but participate in
+the enforcement of system policy. In most cases these are the programs that
+set up user sessions. There are also network services that provide information
+to processes running with various labels.
+
+File System Interfaces
+
+Smack maintains labels on file system objects using extended attributes. The
+Smack label of a file, directory, or other file system object can be obtained
+using getxattr(2).
+
+ len = getxattr("/", "security.SMACK64", value, sizeof (value));
+
+will put the Smack label of the root directory into value. A privileged
+process can set the Smack label of a file system object with setxattr(2).
+
+ len = strlen("Rubble");
+ rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0);
+
+will set the Smack label of /foo to "Rubble" if the program has appropriate
+privilege.
+
+Socket Interfaces
+
+The socket attributes can be read using fgetxattr(2).
+
+A privileged process can set the Smack label of outgoing packets with
+fsetxattr(2).
+
+ len = strlen("Rubble");
+ rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0);
+
+will set the Smack label "Rubble" on packets going out from the socket if the
+program has appropriate privilege.
+
+ rc = fsetxattr(fd, "security.SMACK64IPIN, "*", strlen("*"), 0);
+
+will set the Smack label "*" as the object label against which incoming
+packets will be checked if the program has appropriate privilege.
+
+Administration
+
+Smack supports some mount options:
+
+ smackfsdef=label: specifies the label to give files that lack
+ the Smack label extended attribute.
+
+ smackfsroot=label: specifies the label to assign the root of the
+ file system if it lacks the Smack extended attribute.
+
+ smackfshat=label: specifies a label that must have read access to
+ all labels set on the filesystem. Not yet enforced.
+
+ smackfsfloor=label: specifies a label to which all labels set on the
+ filesystem must have read access. Not yet enforced.
+
+These mount options apply to all file system types.
+
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
new file mode 100644
index 0000000..ac5e0b2
--- /dev/null
+++ b/Documentation/SubmitChecklist
@@ -0,0 +1,90 @@
+Linux Kernel patch submission checklist
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here are some basic things that developers should do if they want to see their
+kernel patch submissions accepted more quickly.
+
+These are all above and beyond the documentation that is provided in
+Documentation/SubmittingPatches and elsewhere regarding submitting Linux
+kernel patches.
+
+
+1: Builds cleanly with applicable or modified CONFIG options =y, =m, and
+ =n. No gcc warnings/errors, no linker warnings/errors.
+
+2: Passes allnoconfig, allmodconfig
+
+3: Builds on multiple CPU architectures by using local cross-compile tools
+ or something like PLM at OSDL.
+
+4: ppc64 is a good architecture for cross-compilation checking because it
+ tends to use `unsigned long' for 64-bit quantities.
+
+5: Check your patch for general style as detailed in
+ Documentation/CodingStyle. Check for trivial violations with the
+ patch style checker prior to submission (scripts/checkpatch.pl).
+ You should be able to justify all violations that remain in
+ your patch.
+
+6: Any new or modified CONFIG options don't muck up the config menu.
+
+7: All new Kconfig options have help text.
+
+8: Has been carefully reviewed with respect to relevant Kconfig
+ combinations. This is very hard to get right with testing -- brainpower
+ pays off here.
+
+9: Check cleanly with sparse.
+
+10: Use 'make checkstack' and 'make namespacecheck' and fix any problems
+ that they find. Note: checkstack does not point out problems explicitly,
+ but any one function that uses more than 512 bytes on the stack is a
+ candidate for change.
+
+11: Include kernel-doc to document global kernel APIs. (Not required for
+ static functions, but OK there also.) Use 'make htmldocs' or 'make
+ mandocs' to check the kernel-doc and fix any issues.
+
+12: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT,
+ CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES,
+ CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP all simultaneously
+ enabled.
+
+13: Has been build- and runtime tested with and without CONFIG_SMP and
+ CONFIG_PREEMPT.
+
+14: If the patch affects IO/Disk, etc: has been tested with and without
+ CONFIG_LBD.
+
+15: All codepaths have been exercised with all lockdep features enabled.
+
+16: All new /proc entries are documented under Documentation/
+
+17: All new kernel boot parameters are documented in
+ Documentation/kernel-parameters.txt.
+
+18: All new module parameters are documented with MODULE_PARM_DESC()
+
+19: All new userspace interfaces are documented in Documentation/ABI/.
+ See Documentation/ABI/README for more information.
+ Patches that change userspace interfaces should be CCed to
+ linux-api@vger.kernel.org.
+
+20: Check that it all passes `make headers_check'.
+
+21: Has been checked with injection of at least slab and page-allocation
+ failures. See Documentation/fault-injection/.
+
+ If the new code is substantial, addition of subsystem-specific fault
+ injection might be appropriate.
+
+22: Newly-added code has been compiled with `gcc -W' (use "make
+ EXTRA_CFLAGS=-W"). This will generate lots of noise, but is good for
+ finding bugs like "warning: comparison between signed and unsigned".
+
+23: Tested after it has been merged into the -mm patchset to make sure
+ that it still works with all of the other queued patches and various
+ changes in the VM, VFS, and other subsystems.
+
+24: All memory barriers {e.g., barrier(), rmb(), wmb()} need a comment in the
+ source code that explains the logic of what they are doing and why.
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
new file mode 100644
index 0000000..99e72a8
--- /dev/null
+++ b/Documentation/SubmittingDrivers
@@ -0,0 +1,162 @@
+Submitting Drivers For The Linux Kernel
+---------------------------------------
+
+This document is intended to explain how to submit device drivers to the
+various kernel trees. Note that if you are interested in video card drivers
+you should probably talk to XFree86 (http://www.xfree86.org/) and/or X.Org
+(http://x.org/) instead.
+
+Also read the Documentation/SubmittingPatches document.
+
+
+Allocating Device Numbers
+-------------------------
+
+Major and minor numbers for block and character devices are allocated
+by the Linux assigned name and number authority (currently this is
+Torben Mathiasen). The site is http://www.lanana.org/. This
+also deals with allocating numbers for devices that are not going to
+be submitted to the mainstream kernel.
+See Documentation/devices.txt for more information on this.
+
+If you don't use assigned numbers then when your device is submitted it will
+be given an assigned number even if that is different from values you may
+have shipped to customers before.
+
+Who To Submit Drivers To
+------------------------
+
+Linux 2.0:
+ No new drivers are accepted for this kernel tree.
+
+Linux 2.2:
+ No new drivers are accepted for this kernel tree.
+
+Linux 2.4:
+ If the code area has a general maintainer then please submit it to
+ the maintainer listed in MAINTAINERS in the kernel file. If the
+ maintainer does not respond or you cannot find the appropriate
+ maintainer then please contact Willy Tarreau <w@1wt.eu>.
+
+Linux 2.6:
+ The same rules apply as 2.4 except that you should follow linux-kernel
+ to track changes in API's. The final contact point for Linux 2.6
+ submissions is Andrew Morton.
+
+What Criteria Determine Acceptance
+----------------------------------
+
+Licensing: The code must be released to us under the
+ GNU General Public License. We don't insist on any kind
+ of exclusive GPL licensing, and if you wish the driver
+ to be useful to other communities such as BSD you may well
+ wish to release under multiple licenses.
+ See accepted licenses at include/linux/module.h
+
+Copyright: The copyright owner must agree to use of GPL.
+ It's best if the submitter and copyright owner
+ are the same person/entity. If not, the name of
+ the person/entity authorizing use of GPL should be
+ listed in case it's necessary to verify the will of
+ the copyright owner.
+
+Interfaces: If your driver uses existing interfaces and behaves like
+ other drivers in the same class it will be much more likely
+ to be accepted than if it invents gratuitous new ones.
+ If you need to implement a common API over Linux and NT
+ drivers do it in userspace.
+
+Code: Please use the Linux style of code formatting as documented
+ in Documentation/CodingStyle. If you have sections of code
+ that need to be in other formats, for example because they
+ are shared with a windows driver kit and you want to
+ maintain them just once separate them out nicely and note
+ this fact.
+
+Portability: Pointers are not always 32bits, not all computers are little
+ endian, people do not all have floating point and you
+ shouldn't use inline x86 assembler in your driver without
+ careful thought. Pure x86 drivers generally are not popular.
+ If you only have x86 hardware it is hard to test portability
+ but it is easy to make sure the code can easily be made
+ portable.
+
+Clarity: It helps if anyone can see how to fix the driver. It helps
+ you because you get patches not bug reports. If you submit a
+ driver that intentionally obfuscates how the hardware works
+ it will go in the bitbucket.
+
+PM support: Since Linux is used on many portable and desktop systems, your
+ driver is likely to be used on such a system and therefore it
+ should support basic power management by implementing, if
+ necessary, the .suspend and .resume methods used during the
+ system-wide suspend and resume transitions. You should verify
+ that your driver correctly handles the suspend and resume, but
+ if you are unable to ensure that, please at least define the
+ .suspend method returning the -ENOSYS ("Function not
+ implemented") error. You should also try to make sure that your
+ driver uses as little power as possible when it's not doing
+ anything. For the driver testing instructions see
+ Documentation/power/drivers-testing.txt and for a relatively
+ complete overview of the power management issues related to
+ drivers see Documentation/power/devices.txt .
+
+Control: In general if there is active maintainance of a driver by
+ the author then patches will be redirected to them unless
+ they are totally obvious and without need of checking.
+ If you want to be the contact and update point for the
+ driver it is a good idea to state this in the comments,
+ and include an entry in MAINTAINERS for your driver.
+
+What Criteria Do Not Determine Acceptance
+-----------------------------------------
+
+Vendor: Being the hardware vendor and maintaining the driver is
+ often a good thing. If there is a stable working driver from
+ other people already in the tree don't expect 'we are the
+ vendor' to get your driver chosen. Ideally work with the
+ existing driver author to build a single perfect driver.
+
+Author: It doesn't matter if a large Linux company wrote the driver,
+ or you did. Nobody has any special access to the kernel
+ tree. Anyone who tells you otherwise isn't telling the
+ whole story.
+
+
+Resources
+---------
+
+Linux kernel master tree:
+ ftp.??.kernel.org:/pub/linux/kernel/...
+ ?? == your country code, such as "us", "uk", "fr", etc.
+
+Linux kernel mailing list:
+ linux-kernel@vger.kernel.org
+ [mail majordomo@vger.kernel.org to subscribe]
+
+Linux Device Drivers, Third Edition (covers 2.6.10):
+ http://lwn.net/Kernel/LDD3/ (free version)
+
+LWN.net:
+ Weekly summary of kernel development activity - http://lwn.net/
+ 2.6 API changes:
+ http://lwn.net/Articles/2.6-kernel-api/
+ Porting drivers from prior kernels to 2.6:
+ http://lwn.net/Articles/driver-porting/
+
+KernelTrap:
+ Occasional Linux kernel articles and developer interviews
+ http://kerneltrap.org/
+
+KernelNewbies:
+ Documentation and assistance for new kernel programmers
+ http://kernelnewbies.org/
+
+Linux USB project:
+ http://www.linux-usb.org/
+
+How to NOT write kernel driver by Arjan van de Ven:
+ http://www.fenrus.org/how-to-not-write-a-device-driver-paper.pdf
+
+Kernel Janitor:
+ http://janitor.kernelnewbies.org/
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
new file mode 100644
index 0000000..f309d3c
--- /dev/null
+++ b/Documentation/SubmittingPatches
@@ -0,0 +1,680 @@
+
+ How to Get Your Change Into the Linux Kernel
+ or
+ Care And Operation Of Your Linus Torvalds
+
+
+
+For a person or company who wishes to submit a change to the Linux
+kernel, the process can sometimes be daunting if you're not familiar
+with "the system." This text is a collection of suggestions which
+can greatly increase the chances of your change being accepted.
+
+Read Documentation/SubmitChecklist for a list of items to check
+before submitting code. If you are submitting a driver, also read
+Documentation/SubmittingDrivers.
+
+
+
+--------------------------------------------
+SECTION 1 - CREATING AND SENDING YOUR CHANGE
+--------------------------------------------
+
+
+
+1) "diff -up"
+------------
+
+Use "diff -up" or "diff -uprN" to create patches.
+
+All changes to the Linux kernel occur in the form of patches, as
+generated by diff(1). When creating your patch, make sure to create it
+in "unified diff" format, as supplied by the '-u' argument to diff(1).
+Also, please use the '-p' argument which shows which C function each
+change is in - that makes the resultant diff a lot easier to read.
+Patches should be based in the root kernel source directory,
+not in any lower subdirectory.
+
+To create a patch for a single file, it is often sufficient to do:
+
+ SRCTREE= linux-2.6
+ MYFILE= drivers/net/mydriver.c
+
+ cd $SRCTREE
+ cp $MYFILE $MYFILE.orig
+ vi $MYFILE # make your change
+ cd ..
+ diff -up $SRCTREE/$MYFILE{.orig,} > /tmp/patch
+
+To create a patch for multiple files, you should unpack a "vanilla",
+or unmodified kernel source tree, and generate a diff against your
+own source tree. For example:
+
+ MYSRC= /devel/linux-2.6
+
+ tar xvfz linux-2.6.12.tar.gz
+ mv linux-2.6.12 linux-2.6.12-vanilla
+ diff -uprN -X linux-2.6.12-vanilla/Documentation/dontdiff \
+ linux-2.6.12-vanilla $MYSRC > /tmp/patch
+
+"dontdiff" is a list of files which are generated by the kernel during
+the build process, and should be ignored in any diff(1)-generated
+patch. The "dontdiff" file is included in the kernel tree in
+2.6.12 and later. For earlier kernel versions, you can get it
+from <http://www.xenotime.net/linux/doc/dontdiff>.
+
+Make sure your patch does not include any extra files which do not
+belong in a patch submission. Make sure to review your patch -after-
+generated it with diff(1), to ensure accuracy.
+
+If your changes produce a lot of deltas, you may want to look into
+splitting them into individual patches which modify things in
+logical stages. This will facilitate easier reviewing by other
+kernel developers, very important if you want your patch accepted.
+There are a number of scripts which can aid in this:
+
+Quilt:
+http://savannah.nongnu.org/projects/quilt
+
+Andrew Morton's patch scripts:
+http://userweb.kernel.org/~akpm/stuff/patch-scripts.tar.gz
+Instead of these scripts, quilt is the recommended patch management
+tool (see above).
+
+
+
+2) Describe your changes.
+
+Describe the technical detail of the change(s) your patch includes.
+
+Be as specific as possible. The WORST descriptions possible include
+things like "update driver X", "bug fix for driver X", or "this patch
+includes updates for subsystem X. Please apply."
+
+If your description starts to get long, that's a sign that you probably
+need to split up your patch. See #3, next.
+
+
+
+3) Separate your changes.
+
+Separate _logical changes_ into a single patch file.
+
+For example, if your changes include both bug fixes and performance
+enhancements for a single driver, separate those changes into two
+or more patches. If your changes include an API update, and a new
+driver which uses that new API, separate those into two patches.
+
+On the other hand, if you make a single change to numerous files,
+group those changes into a single patch. Thus a single logical change
+is contained within a single patch.
+
+If one patch depends on another patch in order for a change to be
+complete, that is OK. Simply note "this patch depends on patch X"
+in your patch description.
+
+If you cannot condense your patch set into a smaller set of patches,
+then only post say 15 or so at a time and wait for review and integration.
+
+
+
+4) Style check your changes.
+
+Check your patch for basic style violations, details of which can be
+found in Documentation/CodingStyle. Failure to do so simply wastes
+the reviewers time and will get your patch rejected, probably
+without even being read.
+
+At a minimum you should check your patches with the patch style
+checker prior to submission (scripts/checkpatch.pl). You should
+be able to justify all violations that remain in your patch.
+
+
+
+5) Select e-mail destination.
+
+Look through the MAINTAINERS file and the source code, and determine
+if your change applies to a specific subsystem of the kernel, with
+an assigned maintainer. If so, e-mail that person.
+
+If no maintainer is listed, or the maintainer does not respond, send
+your patch to the primary Linux kernel developer's mailing list,
+linux-kernel@vger.kernel.org. Most kernel developers monitor this
+e-mail list, and can comment on your changes.
+
+
+Do not send more than 15 patches at once to the vger mailing lists!!!
+
+
+Linus Torvalds is the final arbiter of all changes accepted into the
+Linux kernel. His e-mail address is <torvalds@linux-foundation.org>.
+He gets a lot of e-mail, so typically you should do your best to -avoid-
+sending him e-mail.
+
+Patches which are bug fixes, are "obvious" changes, or similarly
+require little discussion should be sent or CC'd to Linus. Patches
+which require discussion or do not have a clear advantage should
+usually be sent first to linux-kernel. Only after the patch is
+discussed should the patch then be submitted to Linus.
+
+
+
+6) Select your CC (e-mail carbon copy) list.
+
+Unless you have a reason NOT to do so, CC linux-kernel@vger.kernel.org.
+
+Other kernel developers besides Linus need to be aware of your change,
+so that they may comment on it and offer code review and suggestions.
+linux-kernel is the primary Linux kernel developer mailing list.
+Other mailing lists are available for specific subsystems, such as
+USB, framebuffer devices, the VFS, the SCSI subsystem, etc. See the
+MAINTAINERS file for a mailing list that relates specifically to
+your change.
+
+Majordomo lists of VGER.KERNEL.ORG at:
+ <http://vger.kernel.org/vger-lists.html>
+
+If changes affect userland-kernel interfaces, please send
+the MAN-PAGES maintainer (as listed in the MAINTAINERS file)
+a man-pages patch, or at least a notification of the change,
+so that some information makes its way into the manual pages.
+
+Even if the maintainer did not respond in step #4, make sure to ALWAYS
+copy the maintainer when you change their code.
+
+For small patches you may want to CC the Trivial Patch Monkey
+trivial@kernel.org managed by Jesper Juhl; which collects "trivial"
+patches. Trivial patches must qualify for one of the following rules:
+ Spelling fixes in documentation
+ Spelling fixes which could break grep(1)
+ Warning fixes (cluttering with useless warnings is bad)
+ Compilation fixes (only if they are actually correct)
+ Runtime fixes (only if they actually fix things)
+ Removing use of deprecated functions/macros (eg. check_region)
+ Contact detail and documentation fixes
+ Non-portable code replaced by portable code (even in arch-specific,
+ since people copy, as long as it's trivial)
+ Any fix by the author/maintainer of the file (ie. patch monkey
+ in re-transmission mode)
+URL: <http://www.kernel.org/pub/linux/kernel/people/juhl/trivial/>
+
+
+
+7) No MIME, no links, no compression, no attachments. Just plain text.
+
+Linus and other kernel developers need to be able to read and comment
+on the changes you are submitting. It is important for a kernel
+developer to be able to "quote" your changes, using standard e-mail
+tools, so that they may comment on specific portions of your code.
+
+For this reason, all patches should be submitting e-mail "inline".
+WARNING: Be wary of your editor's word-wrap corrupting your patch,
+if you choose to cut-n-paste your patch.
+
+Do not attach the patch as a MIME attachment, compressed or not.
+Many popular e-mail applications will not always transmit a MIME
+attachment as plain text, making it impossible to comment on your
+code. A MIME attachment also takes Linus a bit more time to process,
+decreasing the likelihood of your MIME-attached change being accepted.
+
+Exception: If your mailer is mangling patches then someone may ask
+you to re-send them using MIME.
+
+See Documentation/email-clients.txt for hints about configuring
+your e-mail client so that it sends your patches untouched.
+
+8) E-mail size.
+
+When sending patches to Linus, always follow step #7.
+
+Large changes are not appropriate for mailing lists, and some
+maintainers. If your patch, uncompressed, exceeds 40 kB in size,
+it is preferred that you store your patch on an Internet-accessible
+server, and provide instead a URL (link) pointing to your patch.
+
+
+
+9) Name your kernel version.
+
+It is important to note, either in the subject line or in the patch
+description, the kernel version to which this patch applies.
+
+If the patch does not apply cleanly to the latest kernel version,
+Linus will not apply it.
+
+
+
+10) Don't get discouraged. Re-submit.
+
+After you have submitted your change, be patient and wait. If Linus
+likes your change and applies it, it will appear in the next version
+of the kernel that he releases.
+
+However, if your change doesn't appear in the next version of the
+kernel, there could be any number of reasons. It's YOUR job to
+narrow down those reasons, correct what was wrong, and submit your
+updated change.
+
+It is quite common for Linus to "drop" your patch without comment.
+That's the nature of the system. If he drops your patch, it could be
+due to
+* Your patch did not apply cleanly to the latest kernel version.
+* Your patch was not sufficiently discussed on linux-kernel.
+* A style issue (see section 2).
+* An e-mail formatting issue (re-read this section).
+* A technical problem with your change.
+* He gets tons of e-mail, and yours got lost in the shuffle.
+* You are being annoying.
+
+When in doubt, solicit comments on linux-kernel mailing list.
+
+
+
+11) Include PATCH in the subject
+
+Due to high e-mail traffic to Linus, and to linux-kernel, it is common
+convention to prefix your subject line with [PATCH]. This lets Linus
+and other kernel developers more easily distinguish patches from other
+e-mail discussions.
+
+
+
+12) Sign your work
+
+To improve tracking of who did what, especially with patches that can
+percolate to their final resting place in the kernel through several
+layers of maintainers, we've introduced a "sign-off" procedure on
+patches that are being emailed around.
+
+The sign-off is a simple line at the end of the explanation for the
+patch, which certifies that you wrote it or otherwise have the right to
+pass it on as a open-source patch. The rules are pretty simple: if you
+can certify the below:
+
+ Developer's Certificate of Origin 1.1
+
+ By making a contribution to this project, I certify that:
+
+ (a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+ (b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+ (c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+ (d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
+
+then you just add a line saying
+
+ Signed-off-by: Random J Developer <random@developer.example.org>
+
+using your real name (sorry, no pseudonyms or anonymous contributions.)
+
+Some people also put extra tags at the end. They'll just be ignored for
+now, but you can do this to mark internal company procedures or just
+point out some special detail about the sign-off.
+
+If you are a subsystem or branch maintainer, sometimes you need to slightly
+modify patches you receive in order to merge them, because the code is not
+exactly the same in your tree and the submitters'. If you stick strictly to
+rule (c), you should ask the submitter to rediff, but this is a totally
+counter-productive waste of time and energy. Rule (b) allows you to adjust
+the code, but then it is very impolite to change one submitter's code and
+make him endorse your bugs. To solve this problem, it is recommended that
+you add a line between the last Signed-off-by header and yours, indicating
+the nature of your changes. While there is nothing mandatory about this, it
+seems like prepending the description with your mail and/or name, all
+enclosed in square brackets, is noticeable enough to make it obvious that
+you are responsible for last-minute changes. Example :
+
+ Signed-off-by: Random J Developer <random@developer.example.org>
+ [lucky@maintainer.example.org: struct foo moved from foo.c to foo.h]
+ Signed-off-by: Lucky K Maintainer <lucky@maintainer.example.org>
+
+This practise is particularly helpful if you maintain a stable branch and
+want at the same time to credit the author, track changes, merge the fix,
+and protect the submitter from complaints. Note that under no circumstances
+can you change the author's identity (the From header), as it is the one
+which appears in the changelog.
+
+Special note to back-porters: It seems to be a common and useful practise
+to insert an indication of the origin of a patch at the top of the commit
+message (just after the subject line) to facilitate tracking. For instance,
+here's what we see in 2.6-stable :
+
+ Date: Tue May 13 19:10:30 2008 +0000
+
+ SCSI: libiscsi regression in 2.6.25: fix nop timer handling
+
+ commit 4cf1043593db6a337f10e006c23c69e5fc93e722 upstream
+
+And here's what appears in 2.4 :
+
+ Date: Tue May 13 22:12:27 2008 +0200
+
+ wireless, airo: waitbusy() won't delay
+
+ [backport of 2.6 commit b7acbdfbd1f277c1eb23f344f899cfa4cd0bf36a]
+
+Whatever the format, this information provides a valuable help to people
+tracking your trees, and to people trying to trouble-shoot bugs in your
+tree.
+
+
+13) When to use Acked-by: and Cc:
+
+The Signed-off-by: tag indicates that the signer was involved in the
+development of the patch, or that he/she was in the patch's delivery path.
+
+If a person was not directly involved in the preparation or handling of a
+patch but wishes to signify and record their approval of it then they can
+arrange to have an Acked-by: line added to the patch's changelog.
+
+Acked-by: is often used by the maintainer of the affected code when that
+maintainer neither contributed to nor forwarded the patch.
+
+Acked-by: is not as formal as Signed-off-by:. It is a record that the acker
+has at least reviewed the patch and has indicated acceptance. Hence patch
+mergers will sometimes manually convert an acker's "yep, looks good to me"
+into an Acked-by:.
+
+Acked-by: does not necessarily indicate acknowledgement of the entire patch.
+For example, if a patch affects multiple subsystems and has an Acked-by: from
+one subsystem maintainer then this usually indicates acknowledgement of just
+the part which affects that maintainer's code. Judgement should be used here.
+When in doubt people should refer to the original discussion in the mailing
+list archives.
+
+If a person has had the opportunity to comment on a patch, but has not
+provided such comments, you may optionally add a "Cc:" tag to the patch.
+This is the only tag which might be added without an explicit action by the
+person it names. This tag documents that potentially interested parties
+have been included in the discussion
+
+
+14) Using Tested-by: and Reviewed-by:
+
+A Tested-by: tag indicates that the patch has been successfully tested (in
+some environment) by the person named. This tag informs maintainers that
+some testing has been performed, provides a means to locate testers for
+future patches, and ensures credit for the testers.
+
+Reviewed-by:, instead, indicates that the patch has been reviewed and found
+acceptable according to the Reviewer's Statement:
+
+ Reviewer's statement of oversight
+
+ By offering my Reviewed-by: tag, I state that:
+
+ (a) I have carried out a technical review of this patch to
+ evaluate its appropriateness and readiness for inclusion into
+ the mainline kernel.
+
+ (b) Any problems, concerns, or questions relating to the patch
+ have been communicated back to the submitter. I am satisfied
+ with the submitter's response to my comments.
+
+ (c) While there may be things that could be improved with this
+ submission, I believe that it is, at this time, (1) a
+ worthwhile modification to the kernel, and (2) free of known
+ issues which would argue against its inclusion.
+
+ (d) While I have reviewed the patch and believe it to be sound, I
+ do not (unless explicitly stated elsewhere) make any
+ warranties or guarantees that it will achieve its stated
+ purpose or function properly in any given situation.
+
+A Reviewed-by tag is a statement of opinion that the patch is an
+appropriate modification of the kernel without any remaining serious
+technical issues. Any interested reviewer (who has done the work) can
+offer a Reviewed-by tag for a patch. This tag serves to give credit to
+reviewers and to inform maintainers of the degree of review which has been
+done on the patch. Reviewed-by: tags, when supplied by reviewers known to
+understand the subject area and to perform thorough reviews, will normally
+increase the liklihood of your patch getting into the kernel.
+
+
+15) The canonical patch format
+
+The canonical patch subject line is:
+
+ Subject: [PATCH 001/123] subsystem: summary phrase
+
+The canonical patch message body contains the following:
+
+ - A "from" line specifying the patch author.
+
+ - An empty line.
+
+ - The body of the explanation, which will be copied to the
+ permanent changelog to describe this patch.
+
+ - The "Signed-off-by:" lines, described above, which will
+ also go in the changelog.
+
+ - A marker line containing simply "---".
+
+ - Any additional comments not suitable for the changelog.
+
+ - The actual patch (diff output).
+
+The Subject line format makes it very easy to sort the emails
+alphabetically by subject line - pretty much any email reader will
+support that - since because the sequence number is zero-padded,
+the numerical and alphabetic sort is the same.
+
+The "subsystem" in the email's Subject should identify which
+area or subsystem of the kernel is being patched.
+
+The "summary phrase" in the email's Subject should concisely
+describe the patch which that email contains. The "summary
+phrase" should not be a filename. Do not use the same "summary
+phrase" for every patch in a whole patch series (where a "patch
+series" is an ordered sequence of multiple, related patches).
+
+Bear in mind that the "summary phrase" of your email becomes
+a globally-unique identifier for that patch. It propagates
+all the way into the git changelog. The "summary phrase" may
+later be used in developer discussions which refer to the patch.
+People will want to google for the "summary phrase" to read
+discussion regarding that patch.
+
+A couple of example Subjects:
+
+ Subject: [patch 2/5] ext2: improve scalability of bitmap searching
+ Subject: [PATCHv2 001/207] x86: fix eflags tracking
+
+The "from" line must be the very first line in the message body,
+and has the form:
+
+ From: Original Author <author@example.com>
+
+The "from" line specifies who will be credited as the author of the
+patch in the permanent changelog. If the "from" line is missing,
+then the "From:" line from the email header will be used to determine
+the patch author in the changelog.
+
+The explanation body will be committed to the permanent source
+changelog, so should make sense to a competent reader who has long
+since forgotten the immediate details of the discussion that might
+have led to this patch.
+
+The "---" marker line serves the essential purpose of marking for patch
+handling tools where the changelog message ends.
+
+One good use for the additional comments after the "---" marker is for
+a diffstat, to show what files have changed, and the number of inserted
+and deleted lines per file. A diffstat is especially useful on bigger
+patches. Other comments relevant only to the moment or the maintainer,
+not suitable for the permanent changelog, should also go here.
+Use diffstat options "-p 1 -w 70" so that filenames are listed from the
+top of the kernel source tree and don't use too much horizontal space
+(easily fit in 80 columns, maybe with some indentation).
+
+See more details on the proper patch format in the following
+references.
+
+
+16) Sending "git pull" requests (from Linus emails)
+
+Please write the git repo address and branch name alone on the same line
+so that I can't even by mistake pull from the wrong branch, and so
+that a triple-click just selects the whole thing.
+
+So the proper format is something along the lines of:
+
+ "Please pull from
+
+ git://jdelvare.pck.nerim.net/jdelvare-2.6 i2c-for-linus
+
+ to get these changes:"
+
+so that I don't have to hunt-and-peck for the address and inevitably
+get it wrong (actually, I've only gotten it wrong a few times, and
+checking against the diffstat tells me when I get it wrong, but I'm
+just a lot more comfortable when I don't have to "look for" the right
+thing to pull, and double-check that I have the right branch-name).
+
+
+Please use "git diff -M --stat --summary" to generate the diffstat:
+the -M enables rename detection, and the summary enables a summary of
+new/deleted or renamed files.
+
+With rename detection, the statistics are rather different [...]
+because git will notice that a fair number of the changes are renames.
+
+-----------------------------------
+SECTION 2 - HINTS, TIPS, AND TRICKS
+-----------------------------------
+
+This section lists many of the common "rules" associated with code
+submitted to the kernel. There are always exceptions... but you must
+have a really good reason for doing so. You could probably call this
+section Linus Computer Science 101.
+
+
+
+1) Read Documentation/CodingStyle
+
+Nuff said. If your code deviates too much from this, it is likely
+to be rejected without further review, and without comment.
+
+One significant exception is when moving code from one file to
+another -- in this case you should not modify the moved code at all in
+the same patch which moves it. This clearly delineates the act of
+moving the code and your changes. This greatly aids review of the
+actual differences and allows tools to better track the history of
+the code itself.
+
+Check your patches with the patch style checker prior to submission
+(scripts/checkpatch.pl). The style checker should be viewed as
+a guide not as the final word. If your code looks better with
+a violation then its probably best left alone.
+
+The checker reports at three levels:
+ - ERROR: things that are very likely to be wrong
+ - WARNING: things requiring careful review
+ - CHECK: things requiring thought
+
+You should be able to justify all violations that remain in your
+patch.
+
+
+
+2) #ifdefs are ugly
+
+Code cluttered with ifdefs is difficult to read and maintain. Don't do
+it. Instead, put your ifdefs in a header, and conditionally define
+'static inline' functions, or macros, which are used in the code.
+Let the compiler optimize away the "no-op" case.
+
+Simple example, of poor code:
+
+ dev = alloc_etherdev (sizeof(struct funky_private));
+ if (!dev)
+ return -ENODEV;
+ #ifdef CONFIG_NET_FUNKINESS
+ init_funky_net(dev);
+ #endif
+
+Cleaned-up example:
+
+(in header)
+ #ifndef CONFIG_NET_FUNKINESS
+ static inline void init_funky_net (struct net_device *d) {}
+ #endif
+
+(in the code itself)
+ dev = alloc_etherdev (sizeof(struct funky_private));
+ if (!dev)
+ return -ENODEV;
+ init_funky_net(dev);
+
+
+
+3) 'static inline' is better than a macro
+
+Static inline functions are greatly preferred over macros.
+They provide type safety, have no length limitations, no formatting
+limitations, and under gcc they are as cheap as macros.
+
+Macros should only be used for cases where a static inline is clearly
+suboptimal [there are a few, isolated cases of this in fast paths],
+or where it is impossible to use a static inline function [such as
+string-izing].
+
+'static inline' is preferred over 'static __inline__', 'extern inline',
+and 'extern __inline__'.
+
+
+
+4) Don't over-design.
+
+Don't try to anticipate nebulous future cases which may or may not
+be useful: "Make it as simple as you can, and no simpler."
+
+
+
+----------------------
+SECTION 3 - REFERENCES
+----------------------
+
+Andrew Morton, "The perfect patch" (tpp).
+ <http://userweb.kernel.org/~akpm/stuff/tpp.txt>
+
+Jeff Garzik, "Linux kernel patch submission format".
+ <http://linux.yyz.us/patch-format.html>
+
+Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer".
+ <http://www.kroah.com/log/2005/03/31/>
+ <http://www.kroah.com/log/2005/07/08/>
+ <http://www.kroah.com/log/2005/10/19/>
+ <http://www.kroah.com/log/2006/01/11/>
+
+NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people!
+ <http://marc.theaimsgroup.com/?l=linux-kernel&m=112112749912944&w=2>
+
+Kernel Documentation/CodingStyle:
+ <http://users.sosdg.org/~qiyong/lxr/source/Documentation/CodingStyle>
+
+Linus Torvalds's mail on the canonical patch format:
+ <http://lkml.org/lkml/2005/4/7/183>
+
+Andi Kleen, "On submitting kernel patches"
+ Some strategies to get difficult or controversal changes in.
+ http://halobates.de/on-submitting-patches.pdf
+
+--
diff --git a/Documentation/VGA-softcursor.txt b/Documentation/VGA-softcursor.txt
new file mode 100644
index 0000000..70acfbf
--- /dev/null
+++ b/Documentation/VGA-softcursor.txt
@@ -0,0 +1,39 @@
+Software cursor for VGA by Pavel Machek <pavel@atrey.karlin.mff.cuni.cz>
+======================= and Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+
+ Linux now has some ability to manipulate cursor appearance. Normally, you
+can set the size of hardware cursor (and also work around some ugly bugs in
+those miserable Trident cards--see #define TRIDENT_GLITCH in drivers/video/
+vgacon.c). You can now play a few new tricks: you can make your cursor look
+like a non-blinking red block, make it inverse background of the character it's
+over or to highlight that character and still choose whether the original
+hardware cursor should remain visible or not. There may be other things I have
+never thought of.
+
+ The cursor appearance is controlled by a "<ESC>[?1;2;3c" escape sequence
+where 1, 2 and 3 are parameters described below. If you omit any of them,
+they will default to zeroes.
+
+ Parameter 1 specifies cursor size (0=default, 1=invisible, 2=underline, ...,
+8=full block) + 16 if you want the software cursor to be applied + 32 if you
+want to always change the background color + 64 if you dislike having the
+background the same as the foreground. Highlights are ignored for the last two
+flags.
+
+ The second parameter selects character attribute bits you want to change
+(by simply XORing them with the value of this parameter). On standard VGA,
+the high four bits specify background and the low four the foreground. In both
+groups, low three bits set color (as in normal color codes used by the console)
+and the most significant one turns on highlight (or sometimes blinking--it
+depends on the configuration of your VGA).
+
+ The third parameter consists of character attribute bits you want to set.
+Bit setting takes place before bit toggling, so you can simply clear a bit by
+including it in both the set mask and the toggle mask.
+
+Examples:
+=========
+
+To get normal blinking underline, use: echo -e '\033[?2c'
+To get blinking block, use: echo -e '\033[?6c'
+To get red non-blinking block, use: echo -e '\033[?17;0;64c'
diff --git a/Documentation/accounting/.gitignore b/Documentation/accounting/.gitignore
new file mode 100644
index 0000000..8648520
--- /dev/null
+++ b/Documentation/accounting/.gitignore
@@ -0,0 +1 @@
+getdelays
diff --git a/Documentation/accounting/Makefile b/Documentation/accounting/Makefile
new file mode 100644
index 0000000..31929eb
--- /dev/null
+++ b/Documentation/accounting/Makefile
@@ -0,0 +1,10 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := getdelays
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_getdelays.o += -I$(objtree)/usr/include
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
new file mode 100644
index 0000000..eda40fd
--- /dev/null
+++ b/Documentation/accounting/cgroupstats.txt
@@ -0,0 +1,27 @@
+Control Groupstats is inspired by the discussion at
+http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
+suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
+
+Per cgroup statistics infrastructure re-uses code from the taskstats
+interface. A new set of cgroup operations are registered with commands
+and attributes specific to cgroups. It should be very easy to
+extend per cgroup statistics, by adding members to the cgroupstats
+structure.
+
+The current model for cgroupstats is a pull, a push model (to post
+statistics on interesting events), should be very easy to add. Currently
+user space requests for statistics by passing the cgroup path.
+Statistics about the state of all the tasks in the cgroup is returned to
+user space.
+
+NOTE: We currently rely on delay accounting for extracting information
+about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
+information will not be available.
+
+To extract cgroup statistics a utility very similar to getdelays.c
+has been developed, the sample output of the utility is shown below
+
+~/balbir/cgroupstats # ./getdelays -C "/cgroup/a"
+sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
+~/balbir/cgroupstats # ./getdelays -C "/cgroup"
+sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt
new file mode 100644
index 0000000..8a12f07
--- /dev/null
+++ b/Documentation/accounting/delay-accounting.txt
@@ -0,0 +1,117 @@
+Delay accounting
+----------------
+
+Tasks encounter delays in execution when they wait
+for some kernel resource to become available e.g. a
+runnable task may wait for a free CPU to run on.
+
+The per-task delay accounting functionality measures
+the delays experienced by a task while
+
+a) waiting for a CPU (while being runnable)
+b) completion of synchronous block I/O initiated by the task
+c) swapping in pages
+d) memory reclaim
+
+and makes these statistics available to userspace through
+the taskstats interface.
+
+Such delays provide feedback for setting a task's cpu priority,
+io priority and rss limit values appropriately. Long delays for
+important tasks could be a trigger for raising its corresponding priority.
+
+The functionality, through its use of the taskstats interface, also provides
+delay statistics aggregated for all tasks (or threads) belonging to a
+thread group (corresponding to a traditional Unix process). This is a commonly
+needed aggregation that is more efficiently done by the kernel.
+
+Userspace utilities, particularly resource management applications, can also
+aggregate delay statistics into arbitrary groups. To enable this, delay
+statistics of a task are available both during its lifetime as well as on its
+exit, ensuring continuous and complete monitoring can be done.
+
+
+Interface
+---------
+
+Delay accounting uses the taskstats interface which is described
+in detail in a separate document in this directory. Taskstats returns a
+generic data structure to userspace corresponding to per-pid and per-tgid
+statistics. The delay accounting functionality populates specific fields of
+this structure. See
+ include/linux/taskstats.h
+for a description of the fields pertaining to delay accounting.
+It will generally be in the form of counters returning the cumulative
+delay seen for cpu, sync block I/O, swapin, memory reclaim etc.
+
+Taking the difference of two successive readings of a given
+counter (say cpu_delay_total) for a task will give the delay
+experienced by the task waiting for the corresponding resource
+in that interval.
+
+When a task exits, records containing the per-task statistics
+are sent to userspace without requiring a command. If it is the last exiting
+task of a thread group, the per-tgid statistics are also sent. More details
+are given in the taskstats interface description.
+
+The getdelays.c userspace utility in this directory allows simple commands to
+be run and the corresponding delay statistics to be displayed. It also serves
+as an example of using the taskstats interface.
+
+Usage
+-----
+
+Compile the kernel with
+ CONFIG_TASK_DELAY_ACCT=y
+ CONFIG_TASKSTATS=y
+
+Delay accounting is enabled by default at boot up.
+To disable, add
+ nodelayacct
+to the kernel boot options. The rest of the instructions
+below assume this has not been done.
+
+After the system has booted up, use a utility
+similar to getdelays.c to access the delays
+seen by a given task or a task group (tgid).
+The utility also allows a given command to be
+executed and the corresponding delays to be
+seen.
+
+General format of the getdelays command
+
+getdelays [-t tgid] [-p pid] [-c cmd...]
+
+
+Get delays, since system boot, for pid 10
+# ./getdelays -p 10
+(output similar to next case)
+
+Get sum of delays, since system boot, for all pids with tgid 5
+# ./getdelays -t 5
+
+
+CPU count real total virtual total delay total
+ 7876 92005750 100000000 24001500
+IO count delay total
+ 0 0
+SWAP count delay total
+ 0 0
+RECLAIM count delay total
+ 0 0
+
+Get delays seen in executing a given simple command
+# ./getdelays -c ls /
+
+bin data1 data3 data5 dev home media opt root srv sys usr
+boot data2 data4 data6 etc lib mnt proc sbin subdomain tmp var
+
+
+CPU count real total virtual total delay total
+ 6 4000250 4000000 0
+IO count delay total
+ 0 0
+SWAP count delay total
+ 0 0
+RECLAIM count delay total
+ 0 0
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
new file mode 100644
index 0000000..cc49400
--- /dev/null
+++ b/Documentation/accounting/getdelays.c
@@ -0,0 +1,501 @@
+/* getdelays.c
+ *
+ * Utility to get per-pid and per-tgid delay accounting statistics
+ * Also illustrates usage of the taskstats interface
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2005
+ * Copyright (C) Balbir Singh, IBM Corp. 2006
+ * Copyright (c) Jay Lan, SGI. 2006
+ *
+ * Compile with
+ * gcc -I/usr/src/linux/include getdelays.c -o getdelays
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <poll.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <signal.h>
+
+#include <linux/genetlink.h>
+#include <linux/taskstats.h>
+#include <linux/cgroupstats.h>
+
+/*
+ * Generic macros for dealing with netlink sockets. Might be duplicated
+ * elsewhere. It is recommended that commercial grade applications use
+ * libnl or libnetlink and use the interfaces provided by the library
+ */
+#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+#define NLA_DATA(na) ((void *)((char*)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len) (len - NLA_HDRLEN)
+
+#define err(code, fmt, arg...) \
+ do { \
+ fprintf(stderr, fmt, ##arg); \
+ exit(code); \
+ } while (0)
+
+int done;
+int rcvbufsz;
+char name[100];
+int dbg;
+int print_delays;
+int print_io_accounting;
+int print_task_context_switch_counts;
+__u64 stime, utime;
+
+#define PRINTF(fmt, arg...) { \
+ if (dbg) { \
+ printf(fmt, ##arg); \
+ } \
+ }
+
+/* Maximum size of response requested or message sent */
+#define MAX_MSG_SIZE 1024
+/* Maximum number of cpus expected to be specified in a cpumask */
+#define MAX_CPUS 32
+
+struct msgtemplate {
+ struct nlmsghdr n;
+ struct genlmsghdr g;
+ char buf[MAX_MSG_SIZE];
+};
+
+char cpumask[100+6*MAX_CPUS];
+
+static void usage(void)
+{
+ fprintf(stderr, "getdelays [-dilv] [-w logfile] [-r bufsize] "
+ "[-m cpumask] [-t tgid] [-p pid]\n");
+ fprintf(stderr, " -d: print delayacct stats\n");
+ fprintf(stderr, " -i: print IO accounting (works only with -p)\n");
+ fprintf(stderr, " -l: listen forever\n");
+ fprintf(stderr, " -v: debug on\n");
+ fprintf(stderr, " -C: container path\n");
+}
+
+/*
+ * Create a raw netlink socket and bind
+ */
+static int create_nl_socket(int protocol)
+{
+ int fd;
+ struct sockaddr_nl local;
+
+ fd = socket(AF_NETLINK, SOCK_RAW, protocol);
+ if (fd < 0)
+ return -1;
+
+ if (rcvbufsz)
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
+ &rcvbufsz, sizeof(rcvbufsz)) < 0) {
+ fprintf(stderr, "Unable to set socket rcv buf size "
+ "to %d\n",
+ rcvbufsz);
+ return -1;
+ }
+
+ memset(&local, 0, sizeof(local));
+ local.nl_family = AF_NETLINK;
+
+ if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
+ goto error;
+
+ return fd;
+error:
+ close(fd);
+ return -1;
+}
+
+
+int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+ __u8 genl_cmd, __u16 nla_type,
+ void *nla_data, int nla_len)
+{
+ struct nlattr *na;
+ struct sockaddr_nl nladdr;
+ int r, buflen;
+ char *buf;
+
+ struct msgtemplate msg;
+
+ msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+ msg.n.nlmsg_type = nlmsg_type;
+ msg.n.nlmsg_flags = NLM_F_REQUEST;
+ msg.n.nlmsg_seq = 0;
+ msg.n.nlmsg_pid = nlmsg_pid;
+ msg.g.cmd = genl_cmd;
+ msg.g.version = 0x1;
+ na = (struct nlattr *) GENLMSG_DATA(&msg);
+ na->nla_type = nla_type;
+ na->nla_len = nla_len + 1 + NLA_HDRLEN;
+ memcpy(NLA_DATA(na), nla_data, nla_len);
+ msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+ buf = (char *) &msg;
+ buflen = msg.n.nlmsg_len ;
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+ while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+ sizeof(nladdr))) < buflen) {
+ if (r > 0) {
+ buf += r;
+ buflen -= r;
+ } else if (errno != EAGAIN)
+ return -1;
+ }
+ return 0;
+}
+
+
+/*
+ * Probe the controller in genetlink to find the family id
+ * for the TASKSTATS family
+ */
+int get_family_id(int sd)
+{
+ struct {
+ struct nlmsghdr n;
+ struct genlmsghdr g;
+ char buf[256];
+ } ans;
+
+ int id = 0, rc;
+ struct nlattr *na;
+ int rep_len;
+
+ strcpy(name, TASKSTATS_GENL_NAME);
+ rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+ CTRL_ATTR_FAMILY_NAME, (void *)name,
+ strlen(TASKSTATS_GENL_NAME)+1);
+
+ rep_len = recv(sd, &ans, sizeof(ans), 0);
+ if (ans.n.nlmsg_type == NLMSG_ERROR ||
+ (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
+ return 0;
+
+ na = (struct nlattr *) GENLMSG_DATA(&ans);
+ na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+ if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
+ id = *(__u16 *) NLA_DATA(na);
+ }
+ return id;
+}
+
+void print_delayacct(struct taskstats *t)
+{
+ printf("\n\nCPU %15s%15s%15s%15s\n"
+ " %15llu%15llu%15llu%15llu\n"
+ "IO %15s%15s\n"
+ " %15llu%15llu\n"
+ "SWAP %15s%15s\n"
+ " %15llu%15llu\n"
+ "RECLAIM %12s%15s\n"
+ " %15llu%15llu\n",
+ "count", "real total", "virtual total", "delay total",
+ (unsigned long long)t->cpu_count,
+ (unsigned long long)t->cpu_run_real_total,
+ (unsigned long long)t->cpu_run_virtual_total,
+ (unsigned long long)t->cpu_delay_total,
+ "count", "delay total",
+ (unsigned long long)t->blkio_count,
+ (unsigned long long)t->blkio_delay_total,
+ "count", "delay total",
+ (unsigned long long)t->swapin_count,
+ (unsigned long long)t->swapin_delay_total,
+ "count", "delay total",
+ (unsigned long long)t->freepages_count,
+ (unsigned long long)t->freepages_delay_total);
+}
+
+void task_context_switch_counts(struct taskstats *t)
+{
+ printf("\n\nTask %15s%15s\n"
+ " %15llu%15llu\n",
+ "voluntary", "nonvoluntary",
+ (unsigned long long)t->nvcsw, (unsigned long long)t->nivcsw);
+}
+
+void print_cgroupstats(struct cgroupstats *c)
+{
+ printf("sleeping %llu, blocked %llu, running %llu, stopped %llu, "
+ "uninterruptible %llu\n", (unsigned long long)c->nr_sleeping,
+ (unsigned long long)c->nr_io_wait,
+ (unsigned long long)c->nr_running,
+ (unsigned long long)c->nr_stopped,
+ (unsigned long long)c->nr_uninterruptible);
+}
+
+
+void print_ioacct(struct taskstats *t)
+{
+ printf("%s: read=%llu, write=%llu, cancelled_write=%llu\n",
+ t->ac_comm,
+ (unsigned long long)t->read_bytes,
+ (unsigned long long)t->write_bytes,
+ (unsigned long long)t->cancelled_write_bytes);
+}
+
+int main(int argc, char *argv[])
+{
+ int c, rc, rep_len, aggr_len, len2, cmd_type;
+ __u16 id;
+ __u32 mypid;
+
+ struct nlattr *na;
+ int nl_sd = -1;
+ int len = 0;
+ pid_t tid = 0;
+ pid_t rtid = 0;
+
+ int fd = 0;
+ int count = 0;
+ int write_file = 0;
+ int maskset = 0;
+ char *logfile = NULL;
+ int loop = 0;
+ int containerset = 0;
+ char containerpath[1024];
+ int cfd = 0;
+
+ struct msgtemplate msg;
+
+ while (1) {
+ c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:");
+ if (c < 0)
+ break;
+
+ switch (c) {
+ case 'd':
+ printf("print delayacct stats ON\n");
+ print_delays = 1;
+ break;
+ case 'i':
+ printf("printing IO accounting\n");
+ print_io_accounting = 1;
+ break;
+ case 'q':
+ printf("printing task/process context switch rates\n");
+ print_task_context_switch_counts = 1;
+ break;
+ case 'C':
+ containerset = 1;
+ strncpy(containerpath, optarg, strlen(optarg) + 1);
+ break;
+ case 'w':
+ logfile = strdup(optarg);
+ printf("write to file %s\n", logfile);
+ write_file = 1;
+ break;
+ case 'r':
+ rcvbufsz = atoi(optarg);
+ printf("receive buf size %d\n", rcvbufsz);
+ if (rcvbufsz < 0)
+ err(1, "Invalid rcv buf size\n");
+ break;
+ case 'm':
+ strncpy(cpumask, optarg, sizeof(cpumask));
+ maskset = 1;
+ printf("cpumask %s maskset %d\n", cpumask, maskset);
+ break;
+ case 't':
+ tid = atoi(optarg);
+ if (!tid)
+ err(1, "Invalid tgid\n");
+ cmd_type = TASKSTATS_CMD_ATTR_TGID;
+ break;
+ case 'p':
+ tid = atoi(optarg);
+ if (!tid)
+ err(1, "Invalid pid\n");
+ cmd_type = TASKSTATS_CMD_ATTR_PID;
+ break;
+ case 'v':
+ printf("debug on\n");
+ dbg = 1;
+ break;
+ case 'l':
+ printf("listen forever\n");
+ loop = 1;
+ break;
+ default:
+ usage();
+ exit(-1);
+ }
+ }
+
+ if (write_file) {
+ fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+ if (fd == -1) {
+ perror("Cannot open output file\n");
+ exit(1);
+ }
+ }
+
+ if ((nl_sd = create_nl_socket(NETLINK_GENERIC)) < 0)
+ err(1, "error creating Netlink socket\n");
+
+
+ mypid = getpid();
+ id = get_family_id(nl_sd);
+ if (!id) {
+ fprintf(stderr, "Error getting family id, errno %d\n", errno);
+ goto err;
+ }
+ PRINTF("family id %d\n", id);
+
+ if (maskset) {
+ rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+ TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
+ &cpumask, strlen(cpumask) + 1);
+ PRINTF("Sent register cpumask, retval %d\n", rc);
+ if (rc < 0) {
+ fprintf(stderr, "error sending register cpumask\n");
+ goto err;
+ }
+ }
+
+ if (tid && containerset) {
+ fprintf(stderr, "Select either -t or -C, not both\n");
+ goto err;
+ }
+
+ if (tid) {
+ rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+ cmd_type, &tid, sizeof(__u32));
+ PRINTF("Sent pid/tgid, retval %d\n", rc);
+ if (rc < 0) {
+ fprintf(stderr, "error sending tid/tgid cmd\n");
+ goto done;
+ }
+ }
+
+ if (containerset) {
+ cfd = open(containerpath, O_RDONLY);
+ if (cfd < 0) {
+ perror("error opening container file");
+ goto err;
+ }
+ rc = send_cmd(nl_sd, id, mypid, CGROUPSTATS_CMD_GET,
+ CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32));
+ if (rc < 0) {
+ perror("error sending cgroupstats command");
+ goto err;
+ }
+ }
+
+ do {
+ int i;
+
+ rep_len = recv(nl_sd, &msg, sizeof(msg), 0);
+ PRINTF("received %d bytes\n", rep_len);
+
+ if (rep_len < 0) {
+ fprintf(stderr, "nonfatal reply error: errno %d\n",
+ errno);
+ continue;
+ }
+ if (msg.n.nlmsg_type == NLMSG_ERROR ||
+ !NLMSG_OK((&msg.n), rep_len)) {
+ struct nlmsgerr *err = NLMSG_DATA(&msg);
+ fprintf(stderr, "fatal reply error, errno %d\n",
+ err->error);
+ goto done;
+ }
+
+ PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n",
+ sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len);
+
+
+ rep_len = GENLMSG_PAYLOAD(&msg.n);
+
+ na = (struct nlattr *) GENLMSG_DATA(&msg);
+ len = 0;
+ i = 0;
+ while (len < rep_len) {
+ len += NLA_ALIGN(na->nla_len);
+ switch (na->nla_type) {
+ case TASKSTATS_TYPE_AGGR_TGID:
+ /* Fall through */
+ case TASKSTATS_TYPE_AGGR_PID:
+ aggr_len = NLA_PAYLOAD(na->nla_len);
+ len2 = 0;
+ /* For nested attributes, na follows */
+ na = (struct nlattr *) NLA_DATA(na);
+ done = 0;
+ while (len2 < aggr_len) {
+ switch (na->nla_type) {
+ case TASKSTATS_TYPE_PID:
+ rtid = *(int *) NLA_DATA(na);
+ if (print_delays)
+ printf("PID\t%d\n", rtid);
+ break;
+ case TASKSTATS_TYPE_TGID:
+ rtid = *(int *) NLA_DATA(na);
+ if (print_delays)
+ printf("TGID\t%d\n", rtid);
+ break;
+ case TASKSTATS_TYPE_STATS:
+ count++;
+ if (print_delays)
+ print_delayacct((struct taskstats *) NLA_DATA(na));
+ if (print_io_accounting)
+ print_ioacct((struct taskstats *) NLA_DATA(na));
+ if (print_task_context_switch_counts)
+ task_context_switch_counts((struct taskstats *) NLA_DATA(na));
+ if (fd) {
+ if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
+ err(1,"write error\n");
+ }
+ }
+ if (!loop)
+ goto done;
+ break;
+ default:
+ fprintf(stderr, "Unknown nested"
+ " nla_type %d\n",
+ na->nla_type);
+ break;
+ }
+ len2 += NLA_ALIGN(na->nla_len);
+ na = (struct nlattr *) ((char *) na + len2);
+ }
+ break;
+
+ case CGROUPSTATS_TYPE_CGROUP_STATS:
+ print_cgroupstats(NLA_DATA(na));
+ break;
+ default:
+ fprintf(stderr, "Unknown nla_type %d\n",
+ na->nla_type);
+ break;
+ }
+ na = (struct nlattr *) (GENLMSG_DATA(&msg) + len);
+ }
+ } while (loop);
+done:
+ if (maskset) {
+ rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+ TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
+ &cpumask, strlen(cpumask) + 1);
+ printf("Sent deregister mask, retval %d\n", rc);
+ if (rc < 0)
+ err(rc, "error sending deregister cpumask\n");
+ }
+err:
+ close(nl_sd);
+ if (fd)
+ close(fd);
+ if (cfd)
+ close(cfd);
+ return 0;
+}
diff --git a/Documentation/accounting/taskstats-struct.txt b/Documentation/accounting/taskstats-struct.txt
new file mode 100644
index 0000000..e7512c0
--- /dev/null
+++ b/Documentation/accounting/taskstats-struct.txt
@@ -0,0 +1,180 @@
+The struct taskstats
+--------------------
+
+This document contains an explanation of the struct taskstats fields.
+
+There are three different groups of fields in the struct taskstats:
+
+1) Common and basic accounting fields
+ If CONFIG_TASKSTATS is set, the taskstats interface is enabled and
+ the common fields and basic accounting fields are collected for
+ delivery at do_exit() of a task.
+2) Delay accounting fields
+ These fields are placed between
+ /* Delay accounting fields start */
+ and
+ /* Delay accounting fields end */
+ Their values are collected if CONFIG_TASK_DELAY_ACCT is set.
+3) Extended accounting fields
+ These fields are placed between
+ /* Extended accounting fields start */
+ and
+ /* Extended accounting fields end */
+ Their values are collected if CONFIG_TASK_XACCT is set.
+
+4) Per-task and per-thread context switch count statistics
+
+5) Time accounting for SMT machines
+
+6) Extended delay accounting fields for memory reclaim
+
+Future extension should add fields to the end of the taskstats struct, and
+should not change the relative position of each field within the struct.
+
+
+struct taskstats {
+
+1) Common and basic accounting fields:
+ /* The version number of this struct. This field is always set to
+ * TAKSTATS_VERSION, which is defined in <linux/taskstats.h>.
+ * Each time the struct is changed, the value should be incremented.
+ */
+ __u16 version;
+
+ /* The exit code of a task. */
+ __u32 ac_exitcode; /* Exit status */
+
+ /* The accounting flags of a task as defined in <linux/acct.h>
+ * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG.
+ */
+ __u8 ac_flag; /* Record flags */
+
+ /* The value of task_nice() of a task. */
+ __u8 ac_nice; /* task_nice */
+
+ /* The name of the command that started this task. */
+ char ac_comm[TS_COMM_LEN]; /* Command name */
+
+ /* The scheduling discipline as set in task->policy field. */
+ __u8 ac_sched; /* Scheduling discipline */
+
+ __u8 ac_pad[3];
+ __u32 ac_uid; /* User ID */
+ __u32 ac_gid; /* Group ID */
+ __u32 ac_pid; /* Process ID */
+ __u32 ac_ppid; /* Parent process ID */
+
+ /* The time when a task begins, in [secs] since 1970. */
+ __u32 ac_btime; /* Begin time [sec since 1970] */
+
+ /* The elapsed time of a task, in [usec]. */
+ __u64 ac_etime; /* Elapsed time [usec] */
+
+ /* The user CPU time of a task, in [usec]. */
+ __u64 ac_utime; /* User CPU time [usec] */
+
+ /* The system CPU time of a task, in [usec]. */
+ __u64 ac_stime; /* System CPU time [usec] */
+
+ /* The minor page fault count of a task, as set in task->min_flt. */
+ __u64 ac_minflt; /* Minor Page Fault Count */
+
+ /* The major page fault count of a task, as set in task->maj_flt. */
+ __u64 ac_majflt; /* Major Page Fault Count */
+
+
+2) Delay accounting fields:
+ /* Delay accounting fields start
+ *
+ * All values, until the comment "Delay accounting fields end" are
+ * available only if delay accounting is enabled, even though the last
+ * few fields are not delays
+ *
+ * xxx_count is the number of delay values recorded
+ * xxx_delay_total is the corresponding cumulative delay in nanoseconds
+ *
+ * xxx_delay_total wraps around to zero on overflow
+ * xxx_count incremented regardless of overflow
+ */
+
+ /* Delay waiting for cpu, while runnable
+ * count, delay_total NOT updated atomically
+ */
+ __u64 cpu_count;
+ __u64 cpu_delay_total;
+
+ /* Following four fields atomically updated using task->delays->lock */
+
+ /* Delay waiting for synchronous block I/O to complete
+ * does not account for delays in I/O submission
+ */
+ __u64 blkio_count;
+ __u64 blkio_delay_total;
+
+ /* Delay waiting for page fault I/O (swap in only) */
+ __u64 swapin_count;
+ __u64 swapin_delay_total;
+
+ /* cpu "wall-clock" running time
+ * On some architectures, value will adjust for cpu time stolen
+ * from the kernel in involuntary waits due to virtualization.
+ * Value is cumulative, in nanoseconds, without a corresponding count
+ * and wraps around to zero silently on overflow
+ */
+ __u64 cpu_run_real_total;
+
+ /* cpu "virtual" running time
+ * Uses time intervals seen by the kernel i.e. no adjustment
+ * for kernel's involuntary waits due to virtualization.
+ * Value is cumulative, in nanoseconds, without a corresponding count
+ * and wraps around to zero silently on overflow
+ */
+ __u64 cpu_run_virtual_total;
+ /* Delay accounting fields end */
+ /* version 1 ends here */
+
+
+3) Extended accounting fields
+ /* Extended accounting fields start */
+
+ /* Accumulated RSS usage in duration of a task, in MBytes-usecs.
+ * The current rss usage is added to this counter every time
+ * a tick is charged to a task's system time. So, at the end we
+ * will have memory usage multiplied by system time. Thus an
+ * average usage per system time unit can be calculated.
+ */
+ __u64 coremem; /* accumulated RSS usage in MB-usec */
+
+ /* Accumulated virtual memory usage in duration of a task.
+ * Same as acct_rss_mem1 above except that we keep track of VM usage.
+ */
+ __u64 virtmem; /* accumulated VM usage in MB-usec */
+
+ /* High watermark of RSS usage in duration of a task, in KBytes. */
+ __u64 hiwater_rss; /* High-watermark of RSS usage */
+
+ /* High watermark of VM usage in duration of a task, in KBytes. */
+ __u64 hiwater_vm; /* High-water virtual memory usage */
+
+ /* The following four fields are I/O statistics of a task. */
+ __u64 read_char; /* bytes read */
+ __u64 write_char; /* bytes written */
+ __u64 read_syscalls; /* read syscalls */
+ __u64 write_syscalls; /* write syscalls */
+
+ /* Extended accounting fields end */
+
+4) Per-task and per-thread statistics
+ __u64 nvcsw; /* Context voluntary switch counter */
+ __u64 nivcsw; /* Context involuntary switch counter */
+
+5) Time accounting for SMT machines
+ __u64 ac_utimescaled; /* utime scaled on frequency etc */
+ __u64 ac_stimescaled; /* stime scaled on frequency etc */
+ __u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
+
+6) Extended delay accounting fields for memory reclaim
+ /* Delay waiting for memory reclaim */
+ __u64 freepages_count;
+ __u64 freepages_delay_total;
+}
diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt
new file mode 100644
index 0000000..ff06b73
--- /dev/null
+++ b/Documentation/accounting/taskstats.txt
@@ -0,0 +1,181 @@
+Per-task statistics interface
+-----------------------------
+
+
+Taskstats is a netlink-based interface for sending per-task and
+per-process statistics from the kernel to userspace.
+
+Taskstats was designed for the following benefits:
+
+- efficiently provide statistics during lifetime of a task and on its exit
+- unified interface for multiple accounting subsystems
+- extensibility for use by future accounting patches
+
+Terminology
+-----------
+
+"pid", "tid" and "task" are used interchangeably and refer to the standard
+Linux task defined by struct task_struct. per-pid stats are the same as
+per-task stats.
+
+"tgid", "process" and "thread group" are used interchangeably and refer to the
+tasks that share an mm_struct i.e. the traditional Unix process. Despite the
+use of tgid, there is no special treatment for the task that is thread group
+leader - a process is deemed alive as long as it has any task belonging to it.
+
+Usage
+-----
+
+To get statistics during a task's lifetime, userspace opens a unicast netlink
+socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
+The response contains statistics for a task (if pid is specified) or the sum of
+statistics for all tasks of the process (if tgid is specified).
+
+To obtain statistics for tasks which are exiting, the userspace listener
+sends a register command and specifies a cpumask. Whenever a task exits on
+one of the cpus in the cpumask, its per-pid statistics are sent to the
+registered listener. Using cpumasks allows the data received by one listener
+to be limited and assists in flow control over the netlink interface and is
+explained in more detail below.
+
+If the exiting task is the last thread exiting its thread group,
+an additional record containing the per-tgid stats is also sent to userspace.
+The latter contains the sum of per-pid stats for all threads in the thread
+group, both past and present.
+
+getdelays.c is a simple utility demonstrating usage of the taskstats interface
+for reporting delay accounting statistics. Users can register cpumasks,
+send commands and process responses, listen for per-tid/tgid exit data,
+write the data received to a file and do basic flow control by increasing
+receive buffer sizes.
+
+Interface
+---------
+
+The user-kernel interface is encapsulated in include/linux/taskstats.h
+
+To avoid this documentation becoming obsolete as the interface evolves, only
+an outline of the current version is given. taskstats.h always overrides the
+description here.
+
+struct taskstats is the common accounting structure for both per-pid and
+per-tgid data. It is versioned and can be extended by each accounting subsystem
+that is added to the kernel. The fields and their semantics are defined in the
+taskstats.h file.
+
+The data exchanged between user and kernel space is a netlink message belonging
+to the NETLINK_GENERIC family and using the netlink attributes interface.
+The messages are in the format
+
+ +----------+- - -+-------------+-------------------+
+ | nlmsghdr | Pad | genlmsghdr | taskstats payload |
+ +----------+- - -+-------------+-------------------+
+
+
+The taskstats payload is one of the following three kinds:
+
+1. Commands: Sent from user to kernel. Commands to get data on
+a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
+containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
+the task/process for which userspace wants statistics.
+
+Commands to register/deregister interest in exit data from a set of cpus
+consist of one attribute, of type
+TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
+attribute payload. The cpumask is specified as an ascii string of
+comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
+the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
+in cpus before closing the listening socket, the kernel cleans up its interest
+set over time. However, for the sake of efficiency, an explicit deregistration
+is advisable.
+
+2. Response for a command: sent from the kernel in response to a userspace
+command. The payload is a series of three attributes of type:
+
+a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates
+a pid/tgid will be followed by some stats.
+
+b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats
+are being returned.
+
+c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The
+same structure is used for both per-pid and per-tgid stats.
+
+3. New message sent by kernel whenever a task exits. The payload consists of a
+ series of attributes of the following type:
+
+a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats
+b) TASKSTATS_TYPE_PID: contains exiting task's pid
+c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats
+d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats
+e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs
+f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process
+
+
+per-tgid stats
+--------------
+
+Taskstats provides per-process stats, in addition to per-task stats, since
+resource management is often done at a process granularity and aggregating task
+stats in userspace alone is inefficient and potentially inaccurate (due to lack
+of atomicity).
+
+However, maintaining per-process, in addition to per-task stats, within the
+kernel has space and time overheads. To address this, the taskstats code
+accumulates each exiting task's statistics into a process-wide data structure.
+When the last task of a process exits, the process level data accumulated also
+gets sent to userspace (along with the per-task data).
+
+When a user queries to get per-tgid data, the sum of all other live threads in
+the group is added up and added to the accumulated total for previously exited
+threads of the same thread group.
+
+Extending taskstats
+-------------------
+
+There are two ways to extend the taskstats interface to export more
+per-task/process stats as patches to collect them get added to the kernel
+in future:
+
+1. Adding more fields to the end of the existing struct taskstats. Backward
+ compatibility is ensured by the version number within the
+ structure. Userspace will use only the fields of the struct that correspond
+ to the version its using.
+
+2. Defining separate statistic structs and using the netlink attributes
+ interface to return them. Since userspace processes each netlink attribute
+ independently, it can always ignore attributes whose type it does not
+ understand (because it is using an older version of the interface).
+
+
+Choosing between 1. and 2. is a matter of trading off flexibility and
+overhead. If only a few fields need to be added, then 1. is the preferable
+path since the kernel and userspace don't need to incur the overhead of
+processing new netlink attributes. But if the new fields expand the existing
+struct too much, requiring disparate userspace accounting utilities to
+unnecessarily receive large structures whose fields are of no interest, then
+extending the attributes structure would be worthwhile.
+
+Flow control for taskstats
+--------------------------
+
+When the rate of task exits becomes large, a listener may not be able to keep
+up with the kernel's rate of sending per-tid/tgid exit data leading to data
+loss. This possibility gets compounded when the taskstats structure gets
+extended and the number of cpus grows large.
+
+To avoid losing statistics, userspace should do one or more of the following:
+
+- increase the receive buffer sizes for the netlink sockets opened by
+listeners to receive exit data.
+
+- create more listeners and reduce the number of cpus being listened to by
+each listener. In the extreme case, there could be one listener for each cpu.
+Users may also consider setting the cpu affinity of the listener to the subset
+of cpus to which it listens, especially if they are listening to just one cpu.
+
+Despite these measures, if the userspace receives ENOBUFS error messages
+indicated overflow of receive buffers, it should take measures to handle the
+loss of data.
+
+----
diff --git a/Documentation/acpi/debug.txt b/Documentation/acpi/debug.txt
new file mode 100644
index 0000000..65bf47c
--- /dev/null
+++ b/Documentation/acpi/debug.txt
@@ -0,0 +1,148 @@
+ ACPI Debug Output
+
+
+The ACPI CA, the Linux ACPI core, and some ACPI drivers can generate debug
+output. This document describes how to use this facility.
+
+Compile-time configuration
+--------------------------
+
+ACPI debug output is globally enabled by CONFIG_ACPI_DEBUG. If this config
+option is turned off, the debug messages are not even built into the
+kernel.
+
+Boot- and run-time configuration
+--------------------------------
+
+When CONFIG_ACPI_DEBUG=y, you can select the component and level of messages
+you're interested in. At boot-time, use the acpi.debug_layer and
+acpi.debug_level kernel command line options. After boot, you can use the
+debug_layer and debug_level files in /sys/module/acpi/parameters/ to control
+the debug messages.
+
+debug_layer (component)
+-----------------------
+
+The "debug_layer" is a mask that selects components of interest, e.g., a
+specific driver or part of the ACPI interpreter. To build the debug_layer
+bitmask, look for the "#define _COMPONENT" in an ACPI source file.
+
+You can set the debug_layer mask at boot-time using the acpi.debug_layer
+command line argument, and you can change it after boot by writing values
+to /sys/module/acpi/parameters/debug_layer.
+
+The possible components are defined in include/acpi/acoutput.h and
+include/acpi/acpi_drivers.h. Reading /sys/module/acpi/parameters/debug_layer
+shows the supported mask values, currently these:
+
+ ACPI_UTILITIES 0x00000001
+ ACPI_HARDWARE 0x00000002
+ ACPI_EVENTS 0x00000004
+ ACPI_TABLES 0x00000008
+ ACPI_NAMESPACE 0x00000010
+ ACPI_PARSER 0x00000020
+ ACPI_DISPATCHER 0x00000040
+ ACPI_EXECUTER 0x00000080
+ ACPI_RESOURCES 0x00000100
+ ACPI_CA_DEBUGGER 0x00000200
+ ACPI_OS_SERVICES 0x00000400
+ ACPI_CA_DISASSEMBLER 0x00000800
+ ACPI_COMPILER 0x00001000
+ ACPI_TOOLS 0x00002000
+ ACPI_BUS_COMPONENT 0x00010000
+ ACPI_AC_COMPONENT 0x00020000
+ ACPI_BATTERY_COMPONENT 0x00040000
+ ACPI_BUTTON_COMPONENT 0x00080000
+ ACPI_SBS_COMPONENT 0x00100000
+ ACPI_FAN_COMPONENT 0x00200000
+ ACPI_PCI_COMPONENT 0x00400000
+ ACPI_POWER_COMPONENT 0x00800000
+ ACPI_CONTAINER_COMPONENT 0x01000000
+ ACPI_SYSTEM_COMPONENT 0x02000000
+ ACPI_THERMAL_COMPONENT 0x04000000
+ ACPI_MEMORY_DEVICE_COMPONENT 0x08000000
+ ACPI_VIDEO_COMPONENT 0x10000000
+ ACPI_PROCESSOR_COMPONENT 0x20000000
+
+debug_level
+-----------
+
+The "debug_level" is a mask that selects different types of messages, e.g.,
+those related to initialization, method execution, informational messages, etc.
+To build debug_level, look at the level specified in an ACPI_DEBUG_PRINT()
+statement.
+
+The ACPI interpreter uses several different levels, but the Linux
+ACPI core and ACPI drivers generally only use ACPI_LV_INFO.
+
+You can set the debug_level mask at boot-time using the acpi.debug_level
+command line argument, and you can change it after boot by writing values
+to /sys/module/acpi/parameters/debug_level.
+
+The possible levels are defined in include/acpi/acoutput.h. Reading
+/sys/module/acpi/parameters/debug_level shows the supported mask values,
+currently these:
+
+ ACPI_LV_INIT 0x00000001
+ ACPI_LV_DEBUG_OBJECT 0x00000002
+ ACPI_LV_INFO 0x00000004
+ ACPI_LV_INIT_NAMES 0x00000020
+ ACPI_LV_PARSE 0x00000040
+ ACPI_LV_LOAD 0x00000080
+ ACPI_LV_DISPATCH 0x00000100
+ ACPI_LV_EXEC 0x00000200
+ ACPI_LV_NAMES 0x00000400
+ ACPI_LV_OPREGION 0x00000800
+ ACPI_LV_BFIELD 0x00001000
+ ACPI_LV_TABLES 0x00002000
+ ACPI_LV_VALUES 0x00004000
+ ACPI_LV_OBJECTS 0x00008000
+ ACPI_LV_RESOURCES 0x00010000
+ ACPI_LV_USER_REQUESTS 0x00020000
+ ACPI_LV_PACKAGE 0x00040000
+ ACPI_LV_ALLOCATIONS 0x00100000
+ ACPI_LV_FUNCTIONS 0x00200000
+ ACPI_LV_OPTIMIZATIONS 0x00400000
+ ACPI_LV_MUTEX 0x01000000
+ ACPI_LV_THREADS 0x02000000
+ ACPI_LV_IO 0x04000000
+ ACPI_LV_INTERRUPTS 0x08000000
+ ACPI_LV_AML_DISASSEMBLE 0x10000000
+ ACPI_LV_VERBOSE_INFO 0x20000000
+ ACPI_LV_FULL_TABLES 0x40000000
+ ACPI_LV_EVENTS 0x80000000
+
+Examples
+--------
+
+For example, drivers/acpi/bus.c contains this:
+
+ #define _COMPONENT ACPI_BUS_COMPONENT
+ ...
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device insertion detected\n"));
+
+To turn on this message, set the ACPI_BUS_COMPONENT bit in acpi.debug_layer
+and the ACPI_LV_INFO bit in acpi.debug_level. (The ACPI_DEBUG_PRINT
+statement uses ACPI_DB_INFO, which is macro based on the ACPI_LV_INFO
+definition.)
+
+Enable all AML "Debug" output (stores to the Debug object while interpreting
+AML) during boot:
+
+ acpi.debug_layer=0xffffffff acpi.debug_level=0x2
+
+Enable PCI and PCI interrupt routing debug messages:
+
+ acpi.debug_layer=0x400000 acpi.debug_level=0x4
+
+Enable all ACPI hardware-related messages:
+
+ acpi.debug_layer=0x2 acpi.debug_level=0xffffffff
+
+Enable all ACPI_DB_INFO messages after boot:
+
+ # echo 0x4 > /sys/module/acpi/parameters/debug_level
+
+Show all valid component values:
+
+ # cat /sys/module/acpi/parameters/debug_layer
diff --git a/Documentation/acpi/dsdt-override.txt b/Documentation/acpi/dsdt-override.txt
new file mode 100644
index 0000000..febbb1b
--- /dev/null
+++ b/Documentation/acpi/dsdt-override.txt
@@ -0,0 +1,7 @@
+Linux supports a method of overriding the BIOS DSDT:
+
+CONFIG_ACPI_CUSTOM_DSDT builds the image into the kernel.
+
+When to use this method is described in detail on the
+Linux/ACPI home page:
+http://www.lesswatts.org/projects/acpi/overridingDSDT.php
diff --git a/Documentation/acpi/method-tracing.txt b/Documentation/acpi/method-tracing.txt
new file mode 100644
index 0000000..f6efb1e
--- /dev/null
+++ b/Documentation/acpi/method-tracing.txt
@@ -0,0 +1,26 @@
+/sys/module/acpi/parameters/:
+
+trace_method_name
+ The AML method name that the user wants to trace
+
+trace_debug_layer
+ The temporary debug_layer used when tracing the method.
+ Using 0xffffffff by default if it is 0.
+
+trace_debug_level
+ The temporary debug_level used when tracing the method.
+ Using 0x00ffffff by default if it is 0.
+
+trace_state
+ The status of the tracing feature.
+
+ "enabled" means this feature is enabled
+ and the AML method is traced every time it's executed.
+
+ "1" means this feature is enabled and the AML method
+ will only be traced during the next execution.
+
+ "disabled" means this feature is disabled.
+ Users can enable/disable this debug tracing feature by
+ "echo string > /sys/module/acpi/parameters/trace_state".
+ "string" should be one of "enable", "disable" and "1".
diff --git a/Documentation/aoe/aoe.txt b/Documentation/aoe/aoe.txt
new file mode 100644
index 0000000..3a4dbe4
--- /dev/null
+++ b/Documentation/aoe/aoe.txt
@@ -0,0 +1,123 @@
+The EtherDrive (R) HOWTO for users of 2.6 kernels is found at ...
+
+ http://www.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html
+
+ It has many tips and hints!
+
+The aoetools are userland programs that are designed to work with this
+driver. The aoetools are on sourceforge.
+
+ http://aoetools.sourceforge.net/
+
+The scripts in this Documentation/aoe directory are intended to
+document the use of the driver and are not necessary if you install
+the aoetools.
+
+
+CREATING DEVICE NODES
+
+ Users of udev should find the block device nodes created
+ automatically, but to create all the necessary device nodes, use the
+ udev configuration rules provided in udev.txt (in this directory).
+
+ There is a udev-install.sh script that shows how to install these
+ rules on your system.
+
+ If you are not using udev, two scripts are provided in
+ Documentation/aoe as examples of static device node creation for
+ using the aoe driver.
+
+ rm -rf /dev/etherd
+ sh Documentation/aoe/mkdevs.sh /dev/etherd
+
+ ... or to make just one shelf's worth of block device nodes ...
+
+ sh Documentation/aoe/mkshelf.sh /dev/etherd 0
+
+ There is also an autoload script that shows how to edit
+ /etc/modprobe.conf to ensure that the aoe module is loaded when
+ necessary.
+
+USING DEVICE NODES
+
+ "cat /dev/etherd/err" blocks, waiting for error diagnostic output,
+ like any retransmitted packets.
+
+ "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to
+ limit ATA over Ethernet traffic to eth2 and eth4. AoE traffic from
+ untrusted networks should be ignored as a matter of security. See
+ also the aoe_iflist driver option described below.
+
+ "echo > /dev/etherd/discover" tells the driver to find out what AoE
+ devices are available.
+
+ These character devices may disappear and be replaced by sysfs
+ counterparts. Using the commands in aoetools insulates users from
+ these implementation details.
+
+ The block devices are named like this:
+
+ e{shelf}.{slot}
+ e{shelf}.{slot}p{part}
+
+ ... so that "e0.2" is the third blade from the left (slot 2) in the
+ first shelf (shelf address zero). That's the whole disk. The first
+ partition on that disk would be "e0.2p1".
+
+USING SYSFS
+
+ Each aoe block device in /sys/block has the extra attributes of
+ state, mac, and netif. The state attribute is "up" when the device
+ is ready for I/O and "down" if detected but unusable. The
+ "down,closewait" state shows that the device is still open and
+ cannot come up again until it has been closed.
+
+ The mac attribute is the ethernet address of the remote AoE device.
+ The netif attribute is the network interface on the localhost
+ through which we are communicating with the remote AoE device.
+
+ There is a script in this directory that formats this information
+ in a convenient way. Users with aoetools can use the aoe-stat
+ command.
+
+ root@makki root# sh Documentation/aoe/status.sh
+ e10.0 eth3 up
+ e10.1 eth3 up
+ e10.2 eth3 up
+ e10.3 eth3 up
+ e10.4 eth3 up
+ e10.5 eth3 up
+ e10.6 eth3 up
+ e10.7 eth3 up
+ e10.8 eth3 up
+ e10.9 eth3 up
+ e4.0 eth1 up
+ e4.1 eth1 up
+ e4.2 eth1 up
+ e4.3 eth1 up
+ e4.4 eth1 up
+ e4.5 eth1 up
+ e4.6 eth1 up
+ e4.7 eth1 up
+ e4.8 eth1 up
+ e4.9 eth1 up
+
+ Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver
+ option discussed below) instead of /dev/etherd/interfaces to limit
+ AoE traffic to the network interfaces in the given
+ whitespace-separated list. Unlike the old character device, the
+ sysfs entry can be read from as well as written to.
+
+ It's helpful to trigger discovery after setting the list of allowed
+ interfaces. The aoetools package provides an aoe-discover script
+ for this purpose. You can also directly use the
+ /dev/etherd/discover special file described above.
+
+DRIVER OPTIONS
+
+ There is a boot option for the built-in aoe driver and a
+ corresponding module parameter, aoe_iflist. Without this option,
+ all network interfaces may be used for ATA over Ethernet. Here is a
+ usage example for the module parameter.
+
+ modprobe aoe_iflist="eth1 eth3"
diff --git a/Documentation/aoe/autoload.sh b/Documentation/aoe/autoload.sh
new file mode 100644
index 0000000..78dad13
--- /dev/null
+++ b/Documentation/aoe/autoload.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# set aoe to autoload by installing the
+# aliases in /etc/modprobe.conf
+
+f=/etc/modprobe.conf
+
+if test ! -r $f || test ! -w $f; then
+ echo "cannot configure $f for module autoloading" 1>&2
+ exit 1
+fi
+
+grep major-152 $f >/dev/null
+if [ $? = 1 ]; then
+ echo alias block-major-152 aoe >> $f
+ echo alias char-major-152 aoe >> $f
+fi
+
diff --git a/Documentation/aoe/mkdevs.sh b/Documentation/aoe/mkdevs.sh
new file mode 100644
index 0000000..44c0ab7
--- /dev/null
+++ b/Documentation/aoe/mkdevs.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+n_shelves=${n_shelves:-10}
+n_partitions=${n_partitions:-16}
+
+if test "$#" != "1"; then
+ echo "Usage: sh `basename $0` {dir}" 1>&2
+ echo " n_partitions=16 sh `basename $0` {dir}" 1>&2
+ exit 1
+fi
+dir=$1
+
+MAJOR=152
+
+echo "Creating AoE devnode files in $dir ..."
+
+set -e
+
+mkdir -p $dir
+
+# (Status info is in sysfs. See status.sh.)
+# rm -f $dir/stat
+# mknod -m 0400 $dir/stat c $MAJOR 1
+rm -f $dir/err
+mknod -m 0400 $dir/err c $MAJOR 2
+rm -f $dir/discover
+mknod -m 0200 $dir/discover c $MAJOR 3
+rm -f $dir/interfaces
+mknod -m 0200 $dir/interfaces c $MAJOR 4
+rm -f $dir/revalidate
+mknod -m 0200 $dir/revalidate c $MAJOR 5
+rm -f $dir/flush
+mknod -m 0200 $dir/flush c $MAJOR 6
+
+export n_partitions
+mkshelf=`echo $0 | sed 's!mkdevs!mkshelf!'`
+i=0
+while test $i -lt $n_shelves; do
+ sh -xc "sh $mkshelf $dir $i"
+ i=`expr $i + 1`
+done
diff --git a/Documentation/aoe/mkshelf.sh b/Documentation/aoe/mkshelf.sh
new file mode 100644
index 0000000..3261581
--- /dev/null
+++ b/Documentation/aoe/mkshelf.sh
@@ -0,0 +1,28 @@
+#! /bin/sh
+
+if test "$#" != "2"; then
+ echo "Usage: sh `basename $0` {dir} {shelfaddress}" 1>&2
+ echo " n_partitions=16 sh `basename $0` {dir} {shelfaddress}" 1>&2
+ exit 1
+fi
+n_partitions=${n_partitions:-16}
+dir=$1
+shelf=$2
+nslots=16
+maxslot=`echo $nslots 1 - p | dc`
+MAJOR=152
+
+set -e
+
+minor=`echo $nslots \* $shelf \* $n_partitions | bc`
+endp=`echo $n_partitions - 1 | bc`
+for slot in `seq 0 $maxslot`; do
+ for part in `seq 0 $endp`; do
+ name=e$shelf.$slot
+ test "$part" != "0" && name=${name}p$part
+ rm -f $dir/$name
+ mknod -m 0660 $dir/$name b $MAJOR $minor
+
+ minor=`expr $minor + 1`
+ done
+done
diff --git a/Documentation/aoe/status.sh b/Documentation/aoe/status.sh
new file mode 100644
index 0000000..751f3be
--- /dev/null
+++ b/Documentation/aoe/status.sh
@@ -0,0 +1,27 @@
+#! /bin/sh
+# collate and present sysfs information about AoE storage
+
+set -e
+format="%8s\t%8s\t%8s\n"
+me=`basename $0`
+sysd=${sysfs_dir:-/sys}
+
+# printf "$format" device mac netif state
+
+# Suse 9.1 Pro doesn't put /sys in /etc/mtab
+#test -z "`mount | grep sysfs`" && {
+test ! -d "$sysd/block" && {
+ echo "$me Error: sysfs is not mounted" 1>&2
+ exit 1
+}
+
+for d in `ls -d $sysd/block/etherd* 2>/dev/null | grep -v p` end; do
+ # maybe ls comes up empty, so we use "end"
+ test $d = end && continue
+
+ dev=`echo "$d" | sed 's/.*!//'`
+ printf "$format" \
+ "$dev" \
+ "`cat \"$d/netif\"`" \
+ "`cat \"$d/state\"`"
+done | sort
diff --git a/Documentation/aoe/todo.txt b/Documentation/aoe/todo.txt
new file mode 100644
index 0000000..c09dfad
--- /dev/null
+++ b/Documentation/aoe/todo.txt
@@ -0,0 +1,14 @@
+There is a potential for deadlock when allocating a struct sk_buff for
+data that needs to be written out to aoe storage. If the data is
+being written from a dirty page in order to free that page, and if
+there are no other pages available, then deadlock may occur when a
+free page is needed for the sk_buff allocation. This situation has
+not been observed, but it would be nice to eliminate any potential for
+deadlock under memory pressure.
+
+Because ATA over Ethernet is not fragmented by the kernel's IP code,
+the destructor member of the struct sk_buff is available to the aoe
+driver. By using a mempool for allocating all but the first few
+sk_buffs, and by registering a destructor, we should be able to
+efficiently allocate sk_buffs without introducing any potential for
+deadlock.
diff --git a/Documentation/aoe/udev-install.sh b/Documentation/aoe/udev-install.sh
new file mode 100644
index 0000000..15e86f5
--- /dev/null
+++ b/Documentation/aoe/udev-install.sh
@@ -0,0 +1,33 @@
+# install the aoe-specific udev rules from udev.txt into
+# the system's udev configuration
+#
+
+me="`basename $0`"
+
+# find udev.conf, often /etc/udev/udev.conf
+# (or environment can specify where to find udev.conf)
+#
+if test -z "$conf"; then
+ if test -r /etc/udev/udev.conf; then
+ conf=/etc/udev/udev.conf
+ else
+ conf="`find /etc -type f -name udev.conf 2> /dev/null`"
+ if test -z "$conf" || test ! -r "$conf"; then
+ echo "$me Error: no udev.conf found" 1>&2
+ exit 1
+ fi
+ fi
+fi
+
+# find the directory where udev rules are stored, often
+# /etc/udev/rules.d
+#
+rules_d="`sed -n '/^udev_rules=/{ s!udev_rules=!!; s!\"!!g; p; }' $conf`"
+if test -z "$rules_d" ; then
+ rules_d=/etc/udev/rules.d
+fi
+if test ! -d "$rules_d"; then
+ echo "$me Error: cannot find udev rules directory" 1>&2
+ exit 1
+fi
+sh -xc "cp `dirname $0`/udev.txt $rules_d/60-aoe.rules"
diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt
new file mode 100644
index 0000000..8686e78
--- /dev/null
+++ b/Documentation/aoe/udev.txt
@@ -0,0 +1,26 @@
+# These rules tell udev what device nodes to create for aoe support.
+# They may be installed along the following lines. Check the section
+# 8 udev manpage to see whether your udev supports SUBSYSTEM, and
+# whether it uses one or two equal signs for SUBSYSTEM and KERNEL.
+#
+# ecashin@makki ~$ su
+# Password:
+# bash# find /etc -type f -name udev.conf
+# /etc/udev/udev.conf
+# bash# grep udev_rules= /etc/udev/udev.conf
+# udev_rules="/etc/udev/rules.d/"
+# bash# ls /etc/udev/rules.d/
+# 10-wacom.rules 50-udev.rules
+# bash# cp /path/to/linux-2.6.xx/Documentation/aoe/udev.txt \
+# /etc/udev/rules.d/60-aoe.rules
+#
+
+# aoe char devices
+SUBSYSTEM=="aoe", KERNEL=="discover", NAME="etherd/%k", GROUP="disk", MODE="0220"
+SUBSYSTEM=="aoe", KERNEL=="err", NAME="etherd/%k", GROUP="disk", MODE="0440"
+SUBSYSTEM=="aoe", KERNEL=="interfaces", NAME="etherd/%k", GROUP="disk", MODE="0220"
+SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="0220"
+SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220"
+
+# aoe block devices
+KERNEL=="etherd*", NAME="%k", GROUP="disk"
diff --git a/Documentation/applying-patches.txt b/Documentation/applying-patches.txt
new file mode 100644
index 0000000..a083ba3
--- /dev/null
+++ b/Documentation/applying-patches.txt
@@ -0,0 +1,454 @@
+
+ Applying Patches To The Linux Kernel
+ ------------------------------------
+
+ Original by: Jesper Juhl, August 2005
+ Last update: 2006-01-05
+
+
+A frequently asked question on the Linux Kernel Mailing List is how to apply
+a patch to the kernel or, more specifically, what base kernel a patch for
+one of the many trees/branches should be applied to. Hopefully this document
+will explain this to you.
+
+In addition to explaining how to apply and revert patches, a brief
+description of the different kernel trees (and examples of how to apply
+their specific patches) is also provided.
+
+
+What is a patch?
+---
+ A patch is a small text document containing a delta of changes between two
+different versions of a source tree. Patches are created with the `diff'
+program.
+To correctly apply a patch you need to know what base it was generated from
+and what new version the patch will change the source tree into. These
+should both be present in the patch file metadata or be possible to deduce
+from the filename.
+
+
+How do I apply or revert a patch?
+---
+ You apply a patch with the `patch' program. The patch program reads a diff
+(or patch) file and makes the changes to the source tree described in it.
+
+Patches for the Linux kernel are generated relative to the parent directory
+holding the kernel source dir.
+
+This means that paths to files inside the patch file contain the name of the
+kernel source directories it was generated against (or some other directory
+names like "a/" and "b/").
+Since this is unlikely to match the name of the kernel source dir on your
+local machine (but is often useful info to see what version an otherwise
+unlabeled patch was generated against) you should change into your kernel
+source directory and then strip the first element of the path from filenames
+in the patch file when applying it (the -p1 argument to `patch' does this).
+
+To revert a previously applied patch, use the -R argument to patch.
+So, if you applied a patch like this:
+ patch -p1 < ../patch-x.y.z
+
+You can revert (undo) it like this:
+ patch -R -p1 < ../patch-x.y.z
+
+
+How do I feed a patch/diff file to `patch'?
+---
+ This (as usual with Linux and other UNIX like operating systems) can be
+done in several different ways.
+In all the examples below I feed the file (in uncompressed form) to patch
+via stdin using the following syntax:
+ patch -p1 < path/to/patch-x.y.z
+
+If you just want to be able to follow the examples below and don't want to
+know of more than one way to use patch, then you can stop reading this
+section here.
+
+Patch can also get the name of the file to use via the -i argument, like
+this:
+ patch -p1 -i path/to/patch-x.y.z
+
+If your patch file is compressed with gzip or bzip2 and you don't want to
+uncompress it before applying it, then you can feed it to patch like this
+instead:
+ zcat path/to/patch-x.y.z.gz | patch -p1
+ bzcat path/to/patch-x.y.z.bz2 | patch -p1
+
+If you wish to uncompress the patch file by hand first before applying it
+(what I assume you've done in the examples below), then you simply run
+gunzip or bunzip2 on the file -- like this:
+ gunzip patch-x.y.z.gz
+ bunzip2 patch-x.y.z.bz2
+
+Which will leave you with a plain text patch-x.y.z file that you can feed to
+patch via stdin or the -i argument, as you prefer.
+
+A few other nice arguments for patch are -s which causes patch to be silent
+except for errors which is nice to prevent errors from scrolling out of the
+screen too fast, and --dry-run which causes patch to just print a listing of
+what would happen, but doesn't actually make any changes. Finally --verbose
+tells patch to print more information about the work being done.
+
+
+Common errors when patching
+---
+ When patch applies a patch file it attempts to verify the sanity of the
+file in different ways.
+Checking that the file looks like a valid patch file & checking the code
+around the bits being modified matches the context provided in the patch are
+just two of the basic sanity checks patch does.
+
+If patch encounters something that doesn't look quite right it has two
+options. It can either refuse to apply the changes and abort or it can try
+to find a way to make the patch apply with a few minor changes.
+
+One example of something that's not 'quite right' that patch will attempt to
+fix up is if all the context matches, the lines being changed match, but the
+line numbers are different. This can happen, for example, if the patch makes
+a change in the middle of the file but for some reasons a few lines have
+been added or removed near the beginning of the file. In that case
+everything looks good it has just moved up or down a bit, and patch will
+usually adjust the line numbers and apply the patch.
+
+Whenever patch applies a patch that it had to modify a bit to make it fit
+it'll tell you about it by saying the patch applied with 'fuzz'.
+You should be wary of such changes since even though patch probably got it
+right it doesn't /always/ get it right, and the result will sometimes be
+wrong.
+
+When patch encounters a change that it can't fix up with fuzz it rejects it
+outright and leaves a file with a .rej extension (a reject file). You can
+read this file to see exactly what change couldn't be applied, so you can
+go fix it up by hand if you wish.
+
+If you don't have any third-party patches applied to your kernel source, but
+only patches from kernel.org and you apply the patches in the correct order,
+and have made no modifications yourself to the source files, then you should
+never see a fuzz or reject message from patch. If you do see such messages
+anyway, then there's a high risk that either your local source tree or the
+patch file is corrupted in some way. In that case you should probably try
+re-downloading the patch and if things are still not OK then you'd be advised
+to start with a fresh tree downloaded in full from kernel.org.
+
+Let's look a bit more at some of the messages patch can produce.
+
+If patch stops and presents a "File to patch:" prompt, then patch could not
+find a file to be patched. Most likely you forgot to specify -p1 or you are
+in the wrong directory. Less often, you'll find patches that need to be
+applied with -p0 instead of -p1 (reading the patch file should reveal if
+this is the case -- if so, then this is an error by the person who created
+the patch but is not fatal).
+
+If you get "Hunk #2 succeeded at 1887 with fuzz 2 (offset 7 lines)." or a
+message similar to that, then it means that patch had to adjust the location
+of the change (in this example it needed to move 7 lines from where it
+expected to make the change to make it fit).
+The resulting file may or may not be OK, depending on the reason the file
+was different than expected.
+This often happens if you try to apply a patch that was generated against a
+different kernel version than the one you are trying to patch.
+
+If you get a message like "Hunk #3 FAILED at 2387.", then it means that the
+patch could not be applied correctly and the patch program was unable to
+fuzz its way through. This will generate a .rej file with the change that
+caused the patch to fail and also a .orig file showing you the original
+content that couldn't be changed.
+
+If you get "Reversed (or previously applied) patch detected! Assume -R? [n]"
+then patch detected that the change contained in the patch seems to have
+already been made.
+If you actually did apply this patch previously and you just re-applied it
+in error, then just say [n]o and abort this patch. If you applied this patch
+previously and actually intended to revert it, but forgot to specify -R,
+then you can say [y]es here to make patch revert it for you.
+This can also happen if the creator of the patch reversed the source and
+destination directories when creating the patch, and in that case reverting
+the patch will in fact apply it.
+
+A message similar to "patch: **** unexpected end of file in patch" or "patch
+unexpectedly ends in middle of line" means that patch could make no sense of
+the file you fed to it. Either your download is broken, you tried to feed
+patch a compressed patch file without uncompressing it first, or the patch
+file that you are using has been mangled by a mail client or mail transfer
+agent along the way somewhere, e.g., by splitting a long line into two lines.
+Often these warnings can easily be fixed by joining (concatenating) the
+two lines that had been split.
+
+As I already mentioned above, these errors should never happen if you apply
+a patch from kernel.org to the correct version of an unmodified source tree.
+So if you get these errors with kernel.org patches then you should probably
+assume that either your patch file or your tree is broken and I'd advise you
+to start over with a fresh download of a full kernel tree and the patch you
+wish to apply.
+
+
+Are there any alternatives to `patch'?
+---
+ Yes there are alternatives.
+
+ You can use the `interdiff' program (http://cyberelk.net/tim/patchutils/) to
+generate a patch representing the differences between two patches and then
+apply the result.
+This will let you move from something like 2.6.12.2 to 2.6.12.3 in a single
+step. The -z flag to interdiff will even let you feed it patches in gzip or
+bzip2 compressed form directly without the use of zcat or bzcat or manual
+decompression.
+
+Here's how you'd go from 2.6.12.2 to 2.6.12.3 in a single step:
+ interdiff -z ../patch-2.6.12.2.bz2 ../patch-2.6.12.3.gz | patch -p1
+
+Although interdiff may save you a step or two you are generally advised to
+do the additional steps since interdiff can get things wrong in some cases.
+
+ Another alternative is `ketchup', which is a python script for automatic
+downloading and applying of patches (http://www.selenic.com/ketchup/).
+
+ Other nice tools are diffstat, which shows a summary of changes made by a
+patch; lsdiff, which displays a short listing of affected files in a patch
+file, along with (optionally) the line numbers of the start of each patch;
+and grepdiff, which displays a list of the files modified by a patch where
+the patch contains a given regular expression.
+
+
+Where can I download the patches?
+---
+ The patches are available at http://kernel.org/
+Most recent patches are linked from the front page, but they also have
+specific homes.
+
+The 2.6.x.y (-stable) and 2.6.x patches live at
+ ftp://ftp.kernel.org/pub/linux/kernel/v2.6/
+
+The -rc patches live at
+ ftp://ftp.kernel.org/pub/linux/kernel/v2.6/testing/
+
+The -git patches live at
+ ftp://ftp.kernel.org/pub/linux/kernel/v2.6/snapshots/
+
+The -mm kernels live at
+ ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/
+
+In place of ftp.kernel.org you can use ftp.cc.kernel.org, where cc is a
+country code. This way you'll be downloading from a mirror site that's most
+likely geographically closer to you, resulting in faster downloads for you,
+less bandwidth used globally and less load on the main kernel.org servers --
+these are good things, so do use mirrors when possible.
+
+
+The 2.6.x kernels
+---
+ These are the base stable releases released by Linus. The highest numbered
+release is the most recent.
+
+If regressions or other serious flaws are found, then a -stable fix patch
+will be released (see below) on top of this base. Once a new 2.6.x base
+kernel is released, a patch is made available that is a delta between the
+previous 2.6.x kernel and the new one.
+
+To apply a patch moving from 2.6.11 to 2.6.12, you'd do the following (note
+that such patches do *NOT* apply on top of 2.6.x.y kernels but on top of the
+base 2.6.x kernel -- if you need to move from 2.6.x.y to 2.6.x+1 you need to
+first revert the 2.6.x.y patch).
+
+Here are some examples:
+
+# moving from 2.6.11 to 2.6.12
+$ cd ~/linux-2.6.11 # change to kernel source dir
+$ patch -p1 < ../patch-2.6.12 # apply the 2.6.12 patch
+$ cd ..
+$ mv linux-2.6.11 linux-2.6.12 # rename source dir
+
+# moving from 2.6.11.1 to 2.6.12
+$ cd ~/linux-2.6.11.1 # change to kernel source dir
+$ patch -p1 -R < ../patch-2.6.11.1 # revert the 2.6.11.1 patch
+ # source dir is now 2.6.11
+$ patch -p1 < ../patch-2.6.12 # apply new 2.6.12 patch
+$ cd ..
+$ mv linux-2.6.11.1 linux-2.6.12 # rename source dir
+
+
+The 2.6.x.y kernels
+---
+ Kernels with 4-digit versions are -stable kernels. They contain small(ish)
+critical fixes for security problems or significant regressions discovered
+in a given 2.6.x kernel.
+
+This is the recommended branch for users who want the most recent stable
+kernel and are not interested in helping test development/experimental
+versions.
+
+If no 2.6.x.y kernel is available, then the highest numbered 2.6.x kernel is
+the current stable kernel.
+
+ note: the -stable team usually do make incremental patches available as well
+ as patches against the latest mainline release, but I only cover the
+ non-incremental ones below. The incremental ones can be found at
+ ftp://ftp.kernel.org/pub/linux/kernel/v2.6/incr/
+
+These patches are not incremental, meaning that for example the 2.6.12.3
+patch does not apply on top of the 2.6.12.2 kernel source, but rather on top
+of the base 2.6.12 kernel source .
+So, in order to apply the 2.6.12.3 patch to your existing 2.6.12.2 kernel
+source you have to first back out the 2.6.12.2 patch (so you are left with a
+base 2.6.12 kernel source) and then apply the new 2.6.12.3 patch.
+
+Here's a small example:
+
+$ cd ~/linux-2.6.12.2 # change into the kernel source dir
+$ patch -p1 -R < ../patch-2.6.12.2 # revert the 2.6.12.2 patch
+$ patch -p1 < ../patch-2.6.12.3 # apply the new 2.6.12.3 patch
+$ cd ..
+$ mv linux-2.6.12.2 linux-2.6.12.3 # rename the kernel source dir
+
+
+The -rc kernels
+---
+ These are release-candidate kernels. These are development kernels released
+by Linus whenever he deems the current git (the kernel's source management
+tool) tree to be in a reasonably sane state adequate for testing.
+
+These kernels are not stable and you should expect occasional breakage if
+you intend to run them. This is however the most stable of the main
+development branches and is also what will eventually turn into the next
+stable kernel, so it is important that it be tested by as many people as
+possible.
+
+This is a good branch to run for people who want to help out testing
+development kernels but do not want to run some of the really experimental
+stuff (such people should see the sections about -git and -mm kernels below).
+
+The -rc patches are not incremental, they apply to a base 2.6.x kernel, just
+like the 2.6.x.y patches described above. The kernel version before the -rcN
+suffix denotes the version of the kernel that this -rc kernel will eventually
+turn into.
+So, 2.6.13-rc5 means that this is the fifth release candidate for the 2.6.13
+kernel and the patch should be applied on top of the 2.6.12 kernel source.
+
+Here are 3 examples of how to apply these patches:
+
+# first an example of moving from 2.6.12 to 2.6.13-rc3
+$ cd ~/linux-2.6.12 # change into the 2.6.12 source dir
+$ patch -p1 < ../patch-2.6.13-rc3 # apply the 2.6.13-rc3 patch
+$ cd ..
+$ mv linux-2.6.12 linux-2.6.13-rc3 # rename the source dir
+
+# now let's move from 2.6.13-rc3 to 2.6.13-rc5
+$ cd ~/linux-2.6.13-rc3 # change into the 2.6.13-rc3 dir
+$ patch -p1 -R < ../patch-2.6.13-rc3 # revert the 2.6.13-rc3 patch
+$ patch -p1 < ../patch-2.6.13-rc5 # apply the new 2.6.13-rc5 patch
+$ cd ..
+$ mv linux-2.6.13-rc3 linux-2.6.13-rc5 # rename the source dir
+
+# finally let's try and move from 2.6.12.3 to 2.6.13-rc5
+$ cd ~/linux-2.6.12.3 # change to the kernel source dir
+$ patch -p1 -R < ../patch-2.6.12.3 # revert the 2.6.12.3 patch
+$ patch -p1 < ../patch-2.6.13-rc5 # apply new 2.6.13-rc5 patch
+$ cd ..
+$ mv linux-2.6.12.3 linux-2.6.13-rc5 # rename the kernel source dir
+
+
+The -git kernels
+---
+ These are daily snapshots of Linus' kernel tree (managed in a git
+repository, hence the name).
+
+These patches are usually released daily and represent the current state of
+Linus's tree. They are more experimental than -rc kernels since they are
+generated automatically without even a cursory glance to see if they are
+sane.
+
+-git patches are not incremental and apply either to a base 2.6.x kernel or
+a base 2.6.x-rc kernel -- you can see which from their name.
+A patch named 2.6.12-git1 applies to the 2.6.12 kernel source and a patch
+named 2.6.13-rc3-git2 applies to the source of the 2.6.13-rc3 kernel.
+
+Here are some examples of how to apply these patches:
+
+# moving from 2.6.12 to 2.6.12-git1
+$ cd ~/linux-2.6.12 # change to the kernel source dir
+$ patch -p1 < ../patch-2.6.12-git1 # apply the 2.6.12-git1 patch
+$ cd ..
+$ mv linux-2.6.12 linux-2.6.12-git1 # rename the kernel source dir
+
+# moving from 2.6.12-git1 to 2.6.13-rc2-git3
+$ cd ~/linux-2.6.12-git1 # change to the kernel source dir
+$ patch -p1 -R < ../patch-2.6.12-git1 # revert the 2.6.12-git1 patch
+ # we now have a 2.6.12 kernel
+$ patch -p1 < ../patch-2.6.13-rc2 # apply the 2.6.13-rc2 patch
+ # the kernel is now 2.6.13-rc2
+$ patch -p1 < ../patch-2.6.13-rc2-git3 # apply the 2.6.13-rc2-git3 patch
+ # the kernel is now 2.6.13-rc2-git3
+$ cd ..
+$ mv linux-2.6.12-git1 linux-2.6.13-rc2-git3 # rename source dir
+
+
+The -mm kernels
+---
+ These are experimental kernels released by Andrew Morton.
+
+The -mm tree serves as a sort of proving ground for new features and other
+experimental patches.
+Once a patch has proved its worth in -mm for a while Andrew pushes it on to
+Linus for inclusion in mainline.
+
+Although it's encouraged that patches flow to Linus via the -mm tree, this
+is not always enforced.
+Subsystem maintainers (or individuals) sometimes push their patches directly
+to Linus, even though (or after) they have been merged and tested in -mm (or
+sometimes even without prior testing in -mm).
+
+You should generally strive to get your patches into mainline via -mm to
+ensure maximum testing.
+
+This branch is in constant flux and contains many experimental features, a
+lot of debugging patches not appropriate for mainline etc., and is the most
+experimental of the branches described in this document.
+
+These kernels are not appropriate for use on systems that are supposed to be
+stable and they are more risky to run than any of the other branches (make
+sure you have up-to-date backups -- that goes for any experimental kernel but
+even more so for -mm kernels).
+
+These kernels in addition to all the other experimental patches they contain
+usually also contain any changes in the mainline -git kernels available at
+the time of release.
+
+Testing of -mm kernels is greatly appreciated since the whole point of the
+tree is to weed out regressions, crashes, data corruption bugs, build
+breakage (and any other bug in general) before changes are merged into the
+more stable mainline Linus tree.
+But testers of -mm should be aware that breakage in this tree is more common
+than in any other tree.
+
+The -mm kernels are not released on a fixed schedule, but usually a few -mm
+kernels are released in between each -rc kernel (1 to 3 is common).
+The -mm kernels apply to either a base 2.6.x kernel (when no -rc kernels
+have been released yet) or to a Linus -rc kernel.
+
+Here are some examples of applying the -mm patches:
+
+# moving from 2.6.12 to 2.6.12-mm1
+$ cd ~/linux-2.6.12 # change to the 2.6.12 source dir
+$ patch -p1 < ../2.6.12-mm1 # apply the 2.6.12-mm1 patch
+$ cd ..
+$ mv linux-2.6.12 linux-2.6.12-mm1 # rename the source appropriately
+
+# moving from 2.6.12-mm1 to 2.6.13-rc3-mm3
+$ cd ~/linux-2.6.12-mm1
+$ patch -p1 -R < ../2.6.12-mm1 # revert the 2.6.12-mm1 patch
+ # we now have a 2.6.12 source
+$ patch -p1 < ../patch-2.6.13-rc3 # apply the 2.6.13-rc3 patch
+ # we now have a 2.6.13-rc3 source
+$ patch -p1 < ../2.6.13-rc3-mm3 # apply the 2.6.13-rc3-mm3 patch
+$ cd ..
+$ mv linux-2.6.12-mm1 linux-2.6.13-rc3-mm3 # rename the source dir
+
+
+This concludes this list of explanations of the various kernel trees.
+I hope you are now clear on how to apply the various patches and help testing
+the kernel.
+
+Thank you's to Randy Dunlap, Rolf Eike Beer, Linus Torvalds, Bodo Eggert,
+Johannes Stezenbach, Grant Coady, Pavel Machek and others that I may have
+forgotten for their reviews and contributions to this document.
+
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX
new file mode 100644
index 0000000..82e418d
--- /dev/null
+++ b/Documentation/arm/00-INDEX
@@ -0,0 +1,32 @@
+00-INDEX
+ - this file
+Booting
+ - requirements for booting
+Interrupts
+ - ARM Interrupt subsystem documentation
+IXP2000
+ - Release Notes for Linux on Intel's IXP2000 Network Processor
+Netwinder
+ - Netwinder specific documentation
+Porting
+ - Symbol definitions for porting Linux to a new ARM machine.
+Setup
+ - Kernel initialization parameters on ARM Linux
+README
+ - General ARM documentation
+SA1100/
+ - SA1100 documentation
+Samsung-S3C24XX
+ - S3C24XX ARM Linux Overview
+Sharp-LH
+ - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC)
+VFP/
+ - Release notes for Linux Kernel Vector Floating Point support code
+empeg/
+ - Ltd's Empeg MP3 Car Audio Player
+mem_alignment
+ - alignment abort handler documentation
+memory.txt
+ - description of the virtual memory layout
+nwfpe/
+ - NWFPE floating point emulator documentation
diff --git a/Documentation/arm/Booting b/Documentation/arm/Booting
new file mode 100644
index 0000000..7685029
--- /dev/null
+++ b/Documentation/arm/Booting
@@ -0,0 +1,141 @@
+ Booting ARM Linux
+ =================
+
+Author: Russell King
+Date : 18 May 2002
+
+The following documentation is relevant to 2.4.18-rmk6 and beyond.
+
+In order to boot ARM Linux, you require a boot loader, which is a small
+program that runs before the main kernel. The boot loader is expected
+to initialise various devices, and eventually call the Linux kernel,
+passing information to the kernel.
+
+Essentially, the boot loader should provide (as a minimum) the
+following:
+
+1. Setup and initialise the RAM.
+2. Initialise one serial port.
+3. Detect the machine type.
+4. Setup the kernel tagged list.
+5. Call the kernel image.
+
+
+1. Setup and initialise RAM
+---------------------------
+
+Existing boot loaders: MANDATORY
+New boot loaders: MANDATORY
+
+The boot loader is expected to find and initialise all RAM that the
+kernel will use for volatile data storage in the system. It performs
+this in a machine dependent manner. (It may use internal algorithms
+to automatically locate and size all RAM, or it may use knowledge of
+the RAM in the machine, or any other method the boot loader designer
+sees fit.)
+
+
+2. Initialise one serial port
+-----------------------------
+
+Existing boot loaders: OPTIONAL, RECOMMENDED
+New boot loaders: OPTIONAL, RECOMMENDED
+
+The boot loader should initialise and enable one serial port on the
+target. This allows the kernel serial driver to automatically detect
+which serial port it should use for the kernel console (generally
+used for debugging purposes, or communication with the target.)
+
+As an alternative, the boot loader can pass the relevant 'console='
+option to the kernel via the tagged lists specifying the port, and
+serial format options as described in
+
+ Documentation/kernel-parameters.txt.
+
+
+3. Detect the machine type
+--------------------------
+
+Existing boot loaders: OPTIONAL
+New boot loaders: MANDATORY
+
+The boot loader should detect the machine type its running on by some
+method. Whether this is a hard coded value or some algorithm that
+looks at the connected hardware is beyond the scope of this document.
+The boot loader must ultimately be able to provide a MACH_TYPE_xxx
+value to the kernel. (see linux/arch/arm/tools/mach-types).
+
+
+4. Setup the kernel tagged list
+-------------------------------
+
+Existing boot loaders: OPTIONAL, HIGHLY RECOMMENDED
+New boot loaders: MANDATORY
+
+The boot loader must create and initialise the kernel tagged list.
+A valid tagged list starts with ATAG_CORE and ends with ATAG_NONE.
+The ATAG_CORE tag may or may not be empty. An empty ATAG_CORE tag
+has the size field set to '2' (0x00000002). The ATAG_NONE must set
+the size field to zero.
+
+Any number of tags can be placed in the list. It is undefined
+whether a repeated tag appends to the information carried by the
+previous tag, or whether it replaces the information in its
+entirety; some tags behave as the former, others the latter.
+
+The boot loader must pass at a minimum the size and location of
+the system memory, and root filesystem location. Therefore, the
+minimum tagged list should look:
+
+ +-----------+
+base -> | ATAG_CORE | |
+ +-----------+ |
+ | ATAG_MEM | | increasing address
+ +-----------+ |
+ | ATAG_NONE | |
+ +-----------+ v
+
+The tagged list should be stored in system RAM.
+
+The tagged list must be placed in a region of memory where neither
+the kernel decompressor nor initrd 'bootp' program will overwrite
+it. The recommended placement is in the first 16KiB of RAM.
+
+5. Calling the kernel image
+---------------------------
+
+Existing boot loaders: MANDATORY
+New boot loaders: MANDATORY
+
+There are two options for calling the kernel zImage. If the zImage
+is stored in flash, and is linked correctly to be run from flash,
+then it is legal for the boot loader to call the zImage in flash
+directly.
+
+The zImage may also be placed in system RAM (at any location) and
+called there. Note that the kernel uses 16K of RAM below the image
+to store page tables. The recommended placement is 32KiB into RAM.
+
+In either case, the following conditions must be met:
+
+- Quiesce all DMA capable devices so that memory does not get
+ corrupted by bogus network packets or disk data. This will save
+ you many hours of debug.
+
+- CPU register settings
+ r0 = 0,
+ r1 = machine type number discovered in (3) above.
+ r2 = physical address of tagged list in system RAM.
+
+- CPU mode
+ All forms of interrupts must be disabled (IRQs and FIQs)
+ The CPU must be in SVC mode. (A special exception exists for Angel)
+
+- Caches, MMUs
+ The MMU must be off.
+ Instruction cache may be on or off.
+ Data cache must be off.
+
+- The boot loader is expected to call the kernel image by jumping
+ directly to the first instruction of the kernel image.
+
diff --git a/Documentation/arm/IXP2000 b/Documentation/arm/IXP2000
new file mode 100644
index 0000000..e0148b6
--- /dev/null
+++ b/Documentation/arm/IXP2000
@@ -0,0 +1,69 @@
+
+-------------------------------------------------------------------------
+Release Notes for Linux on Intel's IXP2000 Network Processor
+
+Maintained by Deepak Saxena <dsaxena@plexity.net>
+-------------------------------------------------------------------------
+
+1. Overview
+
+Intel's IXP2000 family of NPUs (IXP2400, IXP2800, IXP2850) is designed
+for high-performance network applications such high-availability
+telecom systems. In addition to an XScale core, it contains up to 8
+"MicroEngines" that run special code, several high-end networking
+interfaces (UTOPIA, SPI, etc), a PCI host bridge, one serial port,
+flash interface, and some other odds and ends. For more information, see:
+
+http://developer.intel.com/design/network/products/npfamily/ixp2xxx.htm
+
+2. Linux Support
+
+Linux currently supports the following features on the IXP2000 NPUs:
+
+- On-chip serial
+- PCI
+- Flash (MTD/JFFS2)
+- I2C through GPIO
+- Timers (watchdog, OS)
+
+That is about all we can support under Linux ATM b/c the core networking
+components of the chip are accessed via Intel's closed source SDK.
+Please contact Intel directly on issues with using those. There is
+also a mailing list run by some folks at Princeton University that might
+be of help: https://lists.cs.princeton.edu/mailman/listinfo/ixp2xxx
+
+WHATEVER YOU DO, DO NOT POST EMAIL TO THE LINUX-ARM OR LINUX-ARM-KERNEL
+MAILING LISTS REGARDING THE INTEL SDK.
+
+3. Supported Platforms
+
+- Intel IXDP2400 Reference Platform
+- Intel IXDP2800 Reference Platform
+- Intel IXDP2401 Reference Platform
+- Intel IXDP2801 Reference Platform
+- RadiSys ENP-2611
+
+4. Usage Notes
+
+- The IXP2000 platforms usually have rather complex PCI bus topologies
+ with large memory space requirements. In addition, b/c of the way the
+ Intel SDK is designed, devices are enumerated in a very specific
+ way. B/c of this this, we use "pci=firmware" option in the kernel
+ command line so that we do not re-enumerate the bus.
+
+- IXDP2x01 systems have variable clock tick rates that we cannot determine
+ via HW registers. The "ixdp2x01_clk=XXX" cmd line options allow you
+ to pass the clock rate to the board port.
+
+5. Thanks
+
+The IXP2000 work has been funded by Intel Corp. and MontaVista Software, Inc.
+
+The following people have contributed patches/comments/etc:
+
+Naeem F. Afzal
+Lennert Buytenhek
+Jeffrey Daly
+
+-------------------------------------------------------------------------
+Last Update: 8/09/2004
diff --git a/Documentation/arm/IXP4xx b/Documentation/arm/IXP4xx
new file mode 100644
index 0000000..72fbcc4
--- /dev/null
+++ b/Documentation/arm/IXP4xx
@@ -0,0 +1,174 @@
+
+-------------------------------------------------------------------------
+Release Notes for Linux on Intel's IXP4xx Network Processor
+
+Maintained by Deepak Saxena <dsaxena@plexity.net>
+-------------------------------------------------------------------------
+
+1. Overview
+
+Intel's IXP4xx network processor is a highly integrated SOC that
+is targeted for network applications, though it has become popular
+in industrial control and other areas due to low cost and power
+consumption. The IXP4xx family currently consists of several processors
+that support different network offload functions such as encryption,
+routing, firewalling, etc. The IXP46x family is an updated version which
+supports faster speeds, new memory and flash configurations, and more
+integration such as an on-chip I2C controller.
+
+For more information on the various versions of the CPU, see:
+
+ http://developer.intel.com/design/network/products/npfamily/ixp4xx.htm
+
+Intel also made the IXCP1100 CPU for sometime which is an IXP4xx
+stripped of much of the network intelligence.
+
+2. Linux Support
+
+Linux currently supports the following features on the IXP4xx chips:
+
+- Dual serial ports
+- PCI interface
+- Flash access (MTD/JFFS)
+- I2C through GPIO on IXP42x
+- GPIO for input/output/interrupts
+ See arch/arm/mach-ixp4xx/include/mach/platform.h for access functions.
+- Timers (watchdog, OS)
+
+The following components of the chips are not supported by Linux and
+require the use of Intel's propietary CSR softare:
+
+- USB device interface
+- Network interfaces (HSS, Utopia, NPEs, etc)
+- Network offload functionality
+
+If you need to use any of the above, you need to download Intel's
+software from:
+
+ http://developer.intel.com/design/network/products/npfamily/ixp425swr1.htm
+
+DO NOT POST QUESTIONS TO THE LINUX MAILING LISTS REGARDING THE PROPIETARY
+SOFTWARE.
+
+There are several websites that provide directions/pointers on using
+Intel's software:
+
+http://ixp4xx-osdg.sourceforge.net/
+ Open Source Developer's Guide for using uClinux and the Intel libraries
+
+http://gatewaymaker.sourceforge.net/
+ Simple one page summary of building a gateway using an IXP425 and Linux
+
+http://ixp425.sourceforge.net/
+ ATM device driver for IXP425 that relies on Intel's libraries
+
+3. Known Issues/Limitations
+
+3a. Limited inbound PCI window
+
+The IXP4xx family allows for up to 256MB of memory but the PCI interface
+can only expose 64MB of that memory to the PCI bus. This means that if
+you are running with > 64MB, all PCI buffers outside of the accessible
+range will be bounced using the routines in arch/arm/common/dmabounce.c.
+
+3b. Limited outbound PCI window
+
+IXP4xx provides two methods of accessing PCI memory space:
+
+1) A direct mapped window from 0x48000000 to 0x4bffffff (64MB).
+ To access PCI via this space, we simply ioremap() the BAR
+ into the kernel and we can use the standard read[bwl]/write[bwl]
+ macros. This is the preffered method due to speed but it
+ limits the system to just 64MB of PCI memory. This can be
+ problamatic if using video cards and other memory-heavy devices.
+
+2) If > 64MB of memory space is required, the IXP4xx can be
+ configured to use indirect registers to access PCI This allows
+ for up to 128MB (0x48000000 to 0x4fffffff) of memory on the bus.
+ The disadvantage of this is that every PCI access requires
+ three local register accesses plus a spinlock, but in some
+ cases the performance hit is acceptable. In addition, you cannot
+ mmap() PCI devices in this case due to the indirect nature
+ of the PCI window.
+
+By default, the direct method is used for performance reasons. If
+you need more PCI memory, enable the IXP4XX_INDIRECT_PCI config option.
+
+3c. GPIO as Interrupts
+
+Currently the code only handles level-sensitive GPIO interrupts
+
+4. Supported platforms
+
+ADI Engineering Coyote Gateway Reference Platform
+http://www.adiengineering.com/productsCoyote.html
+
+ The ADI Coyote platform is reference design for those building
+ small residential/office gateways. One NPE is connected to a 10/100
+ interface, one to 4-port 10/100 switch, and the third to and ADSL
+ interface. In addition, it also supports to POTs interfaces connected
+ via SLICs. Note that those are not supported by Linux ATM. Finally,
+ the platform has two mini-PCI slots used for 802.11[bga] cards.
+ Finally, there is an IDE port hanging off the expansion bus.
+
+Gateworks Avila Network Platform
+http://www.gateworks.com/avila_sbc.htm
+
+ The Avila platform is basically and IXDP425 with the 4 PCI slots
+ replaced with mini-PCI slots and a CF IDE interface hanging off
+ the expansion bus.
+
+Intel IXDP425 Development Platform
+http://developer.intel.com/design/network/products/npfamily/ixdp425.htm
+
+ This is Intel's standard reference platform for the IXDP425 and is
+ also known as the Richfield board. It contains 4 PCI slots, 16MB
+ of flash, two 10/100 ports and one ADSL port.
+
+Intel IXDP465 Development Platform
+http://developer.intel.com/design/network/products/npfamily/ixdp465.htm
+
+ This is basically an IXDP425 with an IXP465 and 32M of flash instead
+ of just 16.
+
+Intel IXDPG425 Development Platform
+
+ This is basically and ADI Coyote board with a NEC EHCI controller
+ added. One issue with this board is that the mini-PCI slots only
+ have the 3.3v line connected, so you can't use a PCI to mini-PCI
+ adapter with an E100 card. So to NFS root you need to use either
+ the CSR or a WiFi card and a ramdisk that BOOTPs and then does
+ a pivot_root to NFS.
+
+Motorola PrPMC1100 Processor Mezanine Card
+http://www.fountainsys.com/datasheet/PrPMC1100.pdf
+
+ The PrPMC1100 is based on the IXCP1100 and is meant to plug into
+ and IXP2400/2800 system to act as the system controller. It simply
+ contains a CPU and 16MB of flash on the board and needs to be
+ plugged into a carrier board to function. Currently Linux only
+ supports the Motorola PrPMC carrier board for this platform.
+ See https://mcg.motorola.com/us/ds/pdf/ds0144.pdf for info
+ on the carrier board.
+
+5. TODO LIST
+
+- Add support for Coyote IDE
+- Add support for edge-based GPIO interrupts
+- Add support for CF IDE on expansion bus
+
+6. Thanks
+
+The IXP4xx work has been funded by Intel Corp. and MontaVista Software, Inc.
+
+The following people have contributed patches/comments/etc:
+
+Lennerty Buytenhek
+Lutz Jaenicke
+Justin Mayfield
+Robert E. Ranslam
+[I know I've forgotten others, please email me to be added]
+
+-------------------------------------------------------------------------
+
+Last Update: 01/04/2005
diff --git a/Documentation/arm/Interrupts b/Documentation/arm/Interrupts
new file mode 100644
index 0000000..f09ab1b
--- /dev/null
+++ b/Documentation/arm/Interrupts
@@ -0,0 +1,167 @@
+2.5.2-rmk5
+----------
+
+This is the first kernel that contains a major shake up of some of the
+major architecture-specific subsystems.
+
+Firstly, it contains some pretty major changes to the way we handle the
+MMU TLB. Each MMU TLB variant is now handled completely separately -
+we have TLB v3, TLB v4 (without write buffer), TLB v4 (with write buffer),
+and finally TLB v4 (with write buffer, with I TLB invalidate entry).
+There is more assembly code inside each of these functions, mainly to
+allow more flexible TLB handling for the future.
+
+Secondly, the IRQ subsystem.
+
+The 2.5 kernels will be having major changes to the way IRQs are handled.
+Unfortunately, this means that machine types that touch the irq_desc[]
+array (basically all machine types) will break, and this means every
+machine type that we currently have.
+
+Lets take an example. On the Assabet with Neponset, we have:
+
+ GPIO25 IRR:2
+ SA1100 ------------> Neponset -----------> SA1111
+ IIR:1
+ -----------> USAR
+ IIR:0
+ -----------> SMC9196
+
+The way stuff currently works, all SA1111 interrupts are mutually
+exclusive of each other - if you're processing one interrupt from the
+SA1111 and another comes in, you have to wait for that interrupt to
+finish processing before you can service the new interrupt. Eg, an
+IDE PIO-based interrupt on the SA1111 excludes all other SA1111 and
+SMC9196 interrupts until it has finished transferring its multi-sector
+data, which can be a long time. Note also that since we loop in the
+SA1111 IRQ handler, SA1111 IRQs can hold off SMC9196 IRQs indefinitely.
+
+
+The new approach brings several new ideas...
+
+We introduce the concept of a "parent" and a "child". For example,
+to the Neponset handler, the "parent" is GPIO25, and the "children"d
+are SA1111, SMC9196 and USAR.
+
+We also bring the idea of an IRQ "chip" (mainly to reduce the size of
+the irqdesc array). This doesn't have to be a real "IC"; indeed the
+SA11x0 IRQs are handled by two separate "chip" structures, one for
+GPIO0-10, and another for all the rest. It is just a container for
+the various operations (maybe this'll change to a better name).
+This structure has the following operations:
+
+struct irqchip {
+ /*
+ * Acknowledge the IRQ.
+ * If this is a level-based IRQ, then it is expected to mask the IRQ
+ * as well.
+ */
+ void (*ack)(unsigned int irq);
+ /*
+ * Mask the IRQ in hardware.
+ */
+ void (*mask)(unsigned int irq);
+ /*
+ * Unmask the IRQ in hardware.
+ */
+ void (*unmask)(unsigned int irq);
+ /*
+ * Re-run the IRQ
+ */
+ void (*rerun)(unsigned int irq);
+ /*
+ * Set the type of the IRQ.
+ */
+ int (*type)(unsigned int irq, unsigned int, type);
+};
+
+ack - required. May be the same function as mask for IRQs
+ handled by do_level_IRQ.
+mask - required.
+unmask - required.
+rerun - optional. Not required if you're using do_level_IRQ for all
+ IRQs that use this 'irqchip'. Generally expected to re-trigger
+ the hardware IRQ if possible. If not, may call the handler
+ directly.
+type - optional. If you don't support changing the type of an IRQ,
+ it should be null so people can detect if they are unable to
+ set the IRQ type.
+
+For each IRQ, we keep the following information:
+
+ - "disable" depth (number of disable_irq()s without enable_irq()s)
+ - flags indicating what we can do with this IRQ (valid, probe,
+ noautounmask) as before
+ - status of the IRQ (probing, enable, etc)
+ - chip
+ - per-IRQ handler
+ - irqaction structure list
+
+The handler can be one of the 3 standard handlers - "level", "edge" and
+"simple", or your own specific handler if you need to do something special.
+
+The "level" handler is what we currently have - its pretty simple.
+"edge" knows about the brokenness of such IRQ implementations - that you
+need to leave the hardware IRQ enabled while processing it, and queueing
+further IRQ events should the IRQ happen again while processing. The
+"simple" handler is very basic, and does not perform any hardware
+manipulation, nor state tracking. This is useful for things like the
+SMC9196 and USAR above.
+
+So, what's changed?
+
+1. Machine implementations must not write to the irqdesc array.
+
+2. New functions to manipulate the irqdesc array. The first 4 are expected
+ to be useful only to machine specific code. The last is recommended to
+ only be used by machine specific code, but may be used in drivers if
+ absolutely necessary.
+
+ set_irq_chip(irq,chip)
+
+ Set the mask/unmask methods for handling this IRQ
+
+ set_irq_handler(irq,handler)
+
+ Set the handler for this IRQ (level, edge, simple)
+
+ set_irq_chained_handler(irq,handler)
+
+ Set a "chained" handler for this IRQ - automatically
+ enables this IRQ (eg, Neponset and SA1111 handlers).
+
+ set_irq_flags(irq,flags)
+
+ Set the valid/probe/noautoenable flags.
+
+ set_irq_type(irq,type)
+
+ Set active the IRQ edge(s)/level. This replaces the
+ SA1111 INTPOL manipulation, and the set_GPIO_IRQ_edge()
+ function. Type should be one of IRQ_TYPE_xxx defined in
+ <linux/irq.h>
+
+3. set_GPIO_IRQ_edge() is obsolete, and should be replaced by set_irq_type.
+
+4. Direct access to SA1111 INTPOL is deprecated. Use set_irq_type instead.
+
+5. A handler is expected to perform any necessary acknowledgement of the
+ parent IRQ via the correct chip specific function. For instance, if
+ the SA1111 is directly connected to a SA1110 GPIO, then you should
+ acknowledge the SA1110 IRQ each time you re-read the SA1111 IRQ status.
+
+6. For any child which doesn't have its own IRQ enable/disable controls
+ (eg, SMC9196), the handler must mask or acknowledge the parent IRQ
+ while the child handler is called, and the child handler should be the
+ "simple" handler (not "edge" nor "level"). After the handler completes,
+ the parent IRQ should be unmasked, and the status of all children must
+ be re-checked for pending events. (see the Neponset IRQ handler for
+ details).
+
+7. fixup_irq() is gone, as is arch/arm/mach-*/include/mach/irq.h
+
+Please note that this will not solve all problems - some of them are
+hardware based. Mixing level-based and edge-based IRQs on the same
+parent signal (eg neponset) is one such area where a software based
+solution can't provide the full answer to low IRQ latency.
+
diff --git a/Documentation/arm/Netwinder b/Documentation/arm/Netwinder
new file mode 100644
index 0000000..f1b457f
--- /dev/null
+++ b/Documentation/arm/Netwinder
@@ -0,0 +1,78 @@
+NetWinder specific documentation
+================================
+
+The NetWinder is a small low-power computer, primarily designed
+to run Linux. It is based around the StrongARM RISC processor,
+DC21285 PCI bridge, with PC-type hardware glued around it.
+
+Port usage
+==========
+
+Min - Max Description
+---------------------------
+0x0000 - 0x000f DMA1
+0x0020 - 0x0021 PIC1
+0x0060 - 0x006f Keyboard
+0x0070 - 0x007f RTC
+0x0080 - 0x0087 DMA1
+0x0088 - 0x008f DMA2
+0x00a0 - 0x00a3 PIC2
+0x00c0 - 0x00df DMA2
+0x0180 - 0x0187 IRDA
+0x01f0 - 0x01f6 ide0
+0x0201 Game port
+0x0203 RWA010 configuration read
+0x0220 - ? SoundBlaster
+0x0250 - ? WaveArtist
+0x0279 RWA010 configuration index
+0x02f8 - 0x02ff Serial ttyS1
+0x0300 - 0x031f Ether10
+0x0338 GPIO1
+0x033a GPIO2
+0x0370 - 0x0371 W83977F configuration registers
+0x0388 - ? AdLib
+0x03c0 - 0x03df VGA
+0x03f6 ide0
+0x03f8 - 0x03ff Serial ttyS0
+0x0400 - 0x0408 DC21143
+0x0480 - 0x0487 DMA1
+0x0488 - 0x048f DMA2
+0x0a79 RWA010 configuration write
+0xe800 - 0xe80f ide0/ide1 BM DMA
+
+
+Interrupt usage
+===============
+
+IRQ type Description
+---------------------------
+ 0 ISA 100Hz timer
+ 1 ISA Keyboard
+ 2 ISA cascade
+ 3 ISA Serial ttyS1
+ 4 ISA Serial ttyS0
+ 5 ISA PS/2 mouse
+ 6 ISA IRDA
+ 7 ISA Printer
+ 8 ISA RTC alarm
+ 9 ISA
+10 ISA GP10 (Orange reset button)
+11 ISA
+12 ISA WaveArtist
+13 ISA
+14 ISA hda1
+15 ISA
+
+DMA usage
+=========
+
+DMA type Description
+---------------------------
+ 0 ISA IRDA
+ 1 ISA
+ 2 ISA cascade
+ 3 ISA WaveArtist
+ 4 ISA
+ 5 ISA
+ 6 ISA
+ 7 ISA WaveArtist
diff --git a/Documentation/arm/Porting b/Documentation/arm/Porting
new file mode 100644
index 0000000..a492233
--- /dev/null
+++ b/Documentation/arm/Porting
@@ -0,0 +1,135 @@
+Taken from list archive at http://lists.arm.linux.org.uk/pipermail/linux-arm-kernel/2001-July/004064.html
+
+Initial definitions
+-------------------
+
+The following symbol definitions rely on you knowing the translation that
+__virt_to_phys() does for your machine. This macro converts the passed
+virtual address to a physical address. Normally, it is simply:
+
+ phys = virt - PAGE_OFFSET + PHYS_OFFSET
+
+
+Decompressor Symbols
+--------------------
+
+ZTEXTADDR
+ Start address of decompressor. There's no point in talking about
+ virtual or physical addresses here, since the MMU will be off at
+ the time when you call the decompressor code. You normally call
+ the kernel at this address to start it booting. This doesn't have
+ to be located in RAM, it can be in flash or other read-only or
+ read-write addressable medium.
+
+ZBSSADDR
+ Start address of zero-initialised work area for the decompressor.
+ This must be pointing at RAM. The decompressor will zero initialise
+ this for you. Again, the MMU will be off.
+
+ZRELADDR
+ This is the address where the decompressed kernel will be written,
+ and eventually executed. The following constraint must be valid:
+
+ __virt_to_phys(TEXTADDR) == ZRELADDR
+
+ The initial part of the kernel is carefully coded to be position
+ independent.
+
+INITRD_PHYS
+ Physical address to place the initial RAM disk. Only relevant if
+ you are using the bootpImage stuff (which only works on the old
+ struct param_struct).
+
+INITRD_VIRT
+ Virtual address of the initial RAM disk. The following constraint
+ must be valid:
+
+ __virt_to_phys(INITRD_VIRT) == INITRD_PHYS
+
+PARAMS_PHYS
+ Physical address of the struct param_struct or tag list, giving the
+ kernel various parameters about its execution environment.
+
+
+Kernel Symbols
+--------------
+
+PHYS_OFFSET
+ Physical start address of the first bank of RAM.
+
+PAGE_OFFSET
+ Virtual start address of the first bank of RAM. During the kernel
+ boot phase, virtual address PAGE_OFFSET will be mapped to physical
+ address PHYS_OFFSET, along with any other mappings you supply.
+ This should be the same value as TASK_SIZE.
+
+TASK_SIZE
+ The maximum size of a user process in bytes. Since user space
+ always starts at zero, this is the maximum address that a user
+ process can access+1. The user space stack grows down from this
+ address.
+
+ Any virtual address below TASK_SIZE is deemed to be user process
+ area, and therefore managed dynamically on a process by process
+ basis by the kernel. I'll call this the user segment.
+
+ Anything above TASK_SIZE is common to all processes. I'll call
+ this the kernel segment.
+
+ (In other words, you can't put IO mappings below TASK_SIZE, and
+ hence PAGE_OFFSET).
+
+TEXTADDR
+ Virtual start address of kernel, normally PAGE_OFFSET + 0x8000.
+ This is where the kernel image ends up. With the latest kernels,
+ it must be located at 32768 bytes into a 128MB region. Previous
+ kernels placed a restriction of 256MB here.
+
+DATAADDR
+ Virtual address for the kernel data segment. Must not be defined
+ when using the decompressor.
+
+VMALLOC_START
+VMALLOC_END
+ Virtual addresses bounding the vmalloc() area. There must not be
+ any static mappings in this area; vmalloc will overwrite them.
+ The addresses must also be in the kernel segment (see above).
+ Normally, the vmalloc() area starts VMALLOC_OFFSET bytes above the
+ last virtual RAM address (found using variable high_memory).
+
+VMALLOC_OFFSET
+ Offset normally incorporated into VMALLOC_START to provide a hole
+ between virtual RAM and the vmalloc area. We do this to allow
+ out of bounds memory accesses (eg, something writing off the end
+ of the mapped memory map) to be caught. Normally set to 8MB.
+
+Architecture Specific Macros
+----------------------------
+
+BOOT_MEM(pram,pio,vio)
+ `pram' specifies the physical start address of RAM. Must always
+ be present, and should be the same as PHYS_OFFSET.
+
+ `pio' is the physical address of an 8MB region containing IO for
+ use with the debugging macros in arch/arm/kernel/debug-armv.S.
+
+ `vio' is the virtual address of the 8MB debugging region.
+
+ It is expected that the debugging region will be re-initialised
+ by the architecture specific code later in the code (via the
+ MAPIO function).
+
+BOOT_PARAMS
+ Same as, and see PARAMS_PHYS.
+
+FIXUP(func)
+ Machine specific fixups, run before memory subsystems have been
+ initialised.
+
+MAPIO(func)
+ Machine specific function to map IO areas (including the debug
+ region above).
+
+INITIRQ(func)
+ Machine specific function to initialise interrupts.
+
diff --git a/Documentation/arm/README b/Documentation/arm/README
new file mode 100644
index 0000000..d98783f
--- /dev/null
+++ b/Documentation/arm/README
@@ -0,0 +1,197 @@
+ ARM Linux 2.6
+ =============
+
+ Please check <ftp://ftp.arm.linux.org.uk/pub/armlinux> for
+ updates.
+
+Compilation of kernel
+---------------------
+
+ In order to compile ARM Linux, you will need a compiler capable of
+ generating ARM ELF code with GNU extensions. GCC 3.3 is known to be
+ a good compiler. Fortunately, you needn't guess. The kernel will report
+ an error if your compiler is a recognized offender.
+
+ To build ARM Linux natively, you shouldn't have to alter the ARCH = line
+ in the top level Makefile. However, if you don't have the ARM Linux ELF
+ tools installed as default, then you should change the CROSS_COMPILE
+ line as detailed below.
+
+ If you wish to cross-compile, then alter the following lines in the top
+ level make file:
+
+ ARCH = <whatever>
+ with
+ ARCH = arm
+
+ and
+
+ CROSS_COMPILE=
+ to
+ CROSS_COMPILE=<your-path-to-your-compiler-without-gcc>
+ eg.
+ CROSS_COMPILE=arm-linux-
+
+ Do a 'make config', followed by 'make Image' to build the kernel
+ (arch/arm/boot/Image). A compressed image can be built by doing a
+ 'make zImage' instead of 'make Image'.
+
+
+Bug reports etc
+---------------
+
+ Please send patches to the patch system. For more information, see
+ http://www.arm.linux.org.uk/patches/info.html Always include some
+ explanation as to what the patch does and why it is needed.
+
+ Bug reports should be sent to linux-arm-kernel@lists.arm.linux.org.uk,
+ or submitted through the web form at
+ http://www.arm.linux.org.uk/forms/solution.shtml
+
+ When sending bug reports, please ensure that they contain all relevant
+ information, eg. the kernel messages that were printed before/during
+ the problem, what you were doing, etc.
+
+
+Include files
+-------------
+
+ Several new include directories have been created under include/asm-arm,
+ which are there to reduce the clutter in the top-level directory. These
+ directories, and their purpose is listed below:
+
+ arch-* machine/platform specific header files
+ hardware driver-internal ARM specific data structures/definitions
+ mach descriptions of generic ARM to specific machine interfaces
+ proc-* processor dependent header files (currently only two
+ categories)
+
+
+Machine/Platform support
+------------------------
+
+ The ARM tree contains support for a lot of different machine types. To
+ continue supporting these differences, it has become necessary to split
+ machine-specific parts by directory. For this, the machine category is
+ used to select which directories and files get included (we will use
+ $(MACHINE) to refer to the category)
+
+ To this end, we now have arch/arm/mach-$(MACHINE) directories which are
+ designed to house the non-driver files for a particular machine (eg, PCI,
+ memory management, architecture definitions etc). For all future
+ machines, there should be a corresponding arch/arm/mach-$(MACHINE)/include/mach
+ directory.
+
+
+Modules
+-------
+
+ Although modularisation is supported (and required for the FP emulator),
+ each module on an ARM2/ARM250/ARM3 machine when is loaded will take
+ memory up to the next 32k boundary due to the size of the pages.
+ Therefore, is modularisation on these machines really worth it?
+
+ However, ARM6 and up machines allow modules to take multiples of 4k, and
+ as such Acorn RiscPCs and other architectures using these processors can
+ make good use of modularisation.
+
+
+ADFS Image files
+----------------
+
+ You can access image files on your ADFS partitions by mounting the ADFS
+ partition, and then using the loopback device driver. You must have
+ losetup installed.
+
+ Please note that the PCEmulator DOS partitions have a partition table at
+ the start, and as such, you will have to give '-o offset' to losetup.
+
+
+Request to developers
+---------------------
+
+ When writing device drivers which include a separate assembler file, please
+ include it in with the C file, and not the arch/arm/lib directory. This
+ allows the driver to be compiled as a loadable module without requiring
+ half the code to be compiled into the kernel image.
+
+ In general, try to avoid using assembler unless it is really necessary. It
+ makes drivers far less easy to port to other hardware.
+
+
+ST506 hard drives
+-----------------
+
+ The ST506 hard drive controllers seem to be working fine (if a little
+ slowly). At the moment they will only work off the controllers on an
+ A4x0's motherboard, but for it to work off a Podule just requires
+ someone with a podule to add the addresses for the IRQ mask and the
+ HDC base to the source.
+
+ As of 31/3/96 it works with two drives (you should get the ADFS
+ *configure harddrive set to 2). I've got an internal 20MB and a great
+ big external 5.25" FH 64MB drive (who could ever want more :-) ).
+
+ I've just got 240K/s off it (a dd with bs=128k); thats about half of what
+ RiscOS gets; but it's a heck of a lot better than the 50K/s I was getting
+ last week :-)
+
+ Known bug: Drive data errors can cause a hang; including cases where
+ the controller has fixed the error using ECC. (Possibly ONLY
+ in that case...hmm).
+
+
+1772 Floppy
+-----------
+ This also seems to work OK, but hasn't been stressed much lately. It
+ hasn't got any code for disc change detection in there at the moment which
+ could be a bit of a problem! Suggestions on the correct way to do this
+ are welcome.
+
+
+CONFIG_MACH_ and CONFIG_ARCH_
+-----------------------------
+ A change was made in 2003 to the macro names for new machines.
+ Historically, CONFIG_ARCH_ was used for the bonafide architecture,
+ e.g. SA1100, as well as implementations of the architecture,
+ e.g. Assabet. It was decided to change the implementation macros
+ to read CONFIG_MACH_ for clarity. Moreover, a retroactive fixup has
+ not been made because it would complicate patching.
+
+ Previous registrations may be found online.
+
+ <http://www.arm.linux.org.uk/developer/machines/>
+
+Kernel entry (head.S)
+--------------------------
+ The initial entry into the kernel is via head.S, which uses machine
+ independent code. The machine is selected by the value of 'r1' on
+ entry, which must be kept unique.
+
+ Due to the large number of machines which the ARM port of Linux provides
+ for, we have a method to manage this which ensures that we don't end up
+ duplicating large amounts of code.
+
+ We group machine (or platform) support code into machine classes. A
+ class typically based around one or more system on a chip devices, and
+ acts as a natural container around the actual implementations. These
+ classes are given directories - arch/arm/mach-<class> and
+ arch/arm/mach-<class> - which contain the source files to/include/mach
+ support the machine class. This directories also contain any machine
+ specific supporting code.
+
+ For example, the SA1100 class is based upon the SA1100 and SA1110 SoC
+ devices, and contains the code to support the way the on-board and off-
+ board devices are used, or the device is setup, and provides that
+ machine specific "personality."
+
+ This fine-grained machine specific selection is controlled by the machine
+ type ID, which acts both as a run-time and a compile-time code selection
+ method.
+
+ You can register a new machine via the web site at:
+
+ <http://www.arm.linux.org.uk/developer/machines/>
+
+---
+Russell King (15/03/2004)
diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy
new file mode 100644
index 0000000..ab47c38
--- /dev/null
+++ b/Documentation/arm/SA1100/ADSBitsy
@@ -0,0 +1,43 @@
+ADS Bitsy Single Board Computer
+(It is different from Bitsy(iPAQ) of Compaq)
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The Linux support for this product has been provided by
+Woojung Huh <whuh@applieddata.net>
+
+Use 'make adsbitsy_config' before any 'make config'.
+This will set up defaults for ADS Bitsy support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0400000.
+
+Linux can be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+
+Supported peripherals:
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- SA1111 USB Master
+- SA1100 serial port
+- pcmcia, compact flash
+- touchscreen(ucb1200)
+- console on LCD screen
+- serial ports (ttyS[0-2])
+ - ttyS0 is default for serial console
+
+To do:
+- everything else! :-)
+
+Notes:
+
+- The flash on board is divided into 3 partitions.
+ You should be careful to use flash on board.
+ It's partition is different from GraphicsClient Plus and GraphicsMaster
+
+- 16bpp mode requires a different cable than what ships with the board.
+ Contact ADS or look through the manual to wire your own. Currently,
+ if you compile with 16bit mode support and switch into a lower bpp
+ mode, the timing is off so the image is corrupted. This will be
+ fixed soon.
+
+Any contribution can be sent to nico@cam.org and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/Assabet b/Documentation/arm/SA1100/Assabet
new file mode 100644
index 0000000..78bc1c1
--- /dev/null
+++ b/Documentation/arm/SA1100/Assabet
@@ -0,0 +1,301 @@
+The Intel Assabet (SA-1110 evaluation) board
+============================================
+
+Please see:
+http://developer.intel.com/design/strong/quicklist/eval-plat/sa-1110.htm
+http://developer.intel.com/design/strong/guides/278278.htm
+
+Also some notes from John G Dorsey <jd5q@andrew.cmu.edu>:
+http://www.cs.cmu.edu/~wearable/software/assabet.html
+
+
+Building the kernel
+-------------------
+
+To build the kernel with current defaults:
+
+ make assabet_config
+ make oldconfig
+ make zImage
+
+The resulting kernel image should be available in linux/arch/arm/boot/zImage.
+
+
+Installing a bootloader
+-----------------------
+
+A couple of bootloaders able to boot Linux on Assabet are available:
+
+BLOB (http://www.lartmaker.nl/lartware/blob/)
+
+ BLOB is a bootloader used within the LART project. Some contributed
+ patches were merged into BLOB to add support for Assabet.
+
+Compaq's Bootldr + John Dorsey's patch for Assabet support
+(http://www.handhelds.org/Compaq/bootldr.html)
+(http://www.wearablegroup.org/software/bootldr/)
+
+ Bootldr is the bootloader developed by Compaq for the iPAQ Pocket PC.
+ John Dorsey has produced add-on patches to add support for Assabet and
+ the JFFS filesystem.
+
+RedBoot (http://sources.redhat.com/redboot/)
+
+ RedBoot is a bootloader developed by Red Hat based on the eCos RTOS
+ hardware abstraction layer. It supports Assabet amongst many other
+ hardware platforms.
+
+RedBoot is currently the recommended choice since it's the only one to have
+networking support, and is the most actively maintained.
+
+Brief examples on how to boot Linux with RedBoot are shown below. But first
+you need to have RedBoot installed in your flash memory. A known to work
+precompiled RedBoot binary is available from the following location:
+
+ftp://ftp.netwinder.org/users/n/nico/
+ftp://ftp.arm.linux.org.uk/pub/linux/arm/people/nico/
+ftp://ftp.handhelds.org/pub/linux/arm/sa-1100-patches/
+
+Look for redboot-assabet*.tgz. Some installation infos are provided in
+redboot-assabet*.txt.
+
+
+Initial RedBoot configuration
+-----------------------------
+
+The commands used here are explained in The RedBoot User's Guide available
+on-line at http://sources.redhat.com/ecos/docs-latest/redboot/redboot.html.
+Please refer to it for explanations.
+
+If you have a CF network card (my Assabet kit contained a CF+ LP-E from
+Socket Communications Inc.), you should strongly consider using it for TFTP
+file transfers. You must insert it before RedBoot runs since it can't detect
+it dynamically.
+
+To initialize the flash directory:
+
+ fis init -f
+
+To initialize the non-volatile settings, like whether you want to use BOOTP or
+a static IP address, etc, use this command:
+
+ fconfig -i
+
+
+Writing a kernel image into flash
+---------------------------------
+
+First, the kernel image must be loaded into RAM. If you have the zImage file
+available on a TFTP server:
+
+ load zImage -r -b 0x100000
+
+If you rather want to use Y-Modem upload over the serial port:
+
+ load -m ymodem -r -b 0x100000
+
+To write it to flash:
+
+ fis create "Linux kernel" -b 0x100000 -l 0xc0000
+
+
+Booting the kernel
+------------------
+
+The kernel still requires a filesystem to boot. A ramdisk image can be loaded
+as follows:
+
+ load ramdisk_image.gz -r -b 0x800000
+
+Again, Y-Modem upload can be used instead of TFTP by replacing the file name
+by '-y ymodem'.
+
+Now the kernel can be retrieved from flash like this:
+
+ fis load "Linux kernel"
+
+or loaded as described previously. To boot the kernel:
+
+ exec -b 0x100000 -l 0xc0000
+
+The ramdisk image could be stored into flash as well, but there are better
+solutions for on-flash filesystems as mentioned below.
+
+
+Using JFFS2
+-----------
+
+Using JFFS2 (the Second Journalling Flash File System) is probably the most
+convenient way to store a writable filesystem into flash. JFFS2 is used in
+conjunction with the MTD layer which is responsible for low-level flash
+management. More information on the Linux MTD can be found on-line at:
+http://www.linux-mtd.infradead.org/. A JFFS howto with some infos about
+creating JFFS/JFFS2 images is available from the same site.
+
+For instance, a sample JFFS2 image can be retrieved from the same FTP sites
+mentioned below for the precompiled RedBoot image.
+
+To load this file:
+
+ load sample_img.jffs2 -r -b 0x100000
+
+The result should look like:
+
+RedBoot> load sample_img.jffs2 -r -b 0x100000
+Raw file loaded 0x00100000-0x00377424
+
+Now we must know the size of the unallocated flash:
+
+ fis free
+
+Result:
+
+RedBoot> fis free
+ 0x500E0000 .. 0x503C0000
+
+The values above may be different depending on the size of the filesystem and
+the type of flash. See their usage below as an example and take care of
+substituting yours appropriately.
+
+We must determine some values:
+
+size of unallocated flash: 0x503c0000 - 0x500e0000 = 0x2e0000
+size of the filesystem image: 0x00377424 - 0x00100000 = 0x277424
+
+We want to fit the filesystem image of course, but we also want to give it all
+the remaining flash space as well. To write it:
+
+ fis unlock -f 0x500E0000 -l 0x2e0000
+ fis erase -f 0x500E0000 -l 0x2e0000
+ fis write -b 0x100000 -l 0x277424 -f 0x500E0000
+ fis create "JFFS2" -n -f 0x500E0000 -l 0x2e0000
+
+Now the filesystem is associated to a MTD "partition" once Linux has discovered
+what they are in the boot process. From Redboot, the 'fis list' command
+displays them:
+
+RedBoot> fis list
+Name FLASH addr Mem addr Length Entry point
+RedBoot 0x50000000 0x50000000 0x00020000 0x00000000
+RedBoot config 0x503C0000 0x503C0000 0x00020000 0x00000000
+FIS directory 0x503E0000 0x503E0000 0x00020000 0x00000000
+Linux kernel 0x50020000 0x00100000 0x000C0000 0x00000000
+JFFS2 0x500E0000 0x500E0000 0x002E0000 0x00000000
+
+However Linux should display something like:
+
+SA1100 flash: probing 32-bit flash bus
+SA1100 flash: Found 2 x16 devices at 0x0 in 32-bit mode
+Using RedBoot partition definition
+Creating 5 MTD partitions on "SA1100 flash":
+0x00000000-0x00020000 : "RedBoot"
+0x00020000-0x000e0000 : "Linux kernel"
+0x000e0000-0x003c0000 : "JFFS2"
+0x003c0000-0x003e0000 : "RedBoot config"
+0x003e0000-0x00400000 : "FIS directory"
+
+What's important here is the position of the partition we are interested in,
+which is the third one. Within Linux, this correspond to /dev/mtdblock2.
+Therefore to boot Linux with the kernel and its root filesystem in flash, we
+need this RedBoot command:
+
+ fis load "Linux kernel"
+ exec -b 0x100000 -l 0xc0000 -c "root=/dev/mtdblock2"
+
+Of course other filesystems than JFFS might be used, like cramfs for example.
+You might want to boot with a root filesystem over NFS, etc. It is also
+possible, and sometimes more convenient, to flash a filesystem directly from
+within Linux while booted from a ramdisk or NFS. The Linux MTD repository has
+many tools to deal with flash memory as well, to erase it for example. JFFS2
+can then be mounted directly on a freshly erased partition and files can be
+copied over directly. Etc...
+
+
+RedBoot scripting
+-----------------
+
+All the commands above aren't so useful if they have to be typed in every
+time the Assabet is rebooted. Therefore it's possible to automatize the boot
+process using RedBoot's scripting capability.
+
+For example, I use this to boot Linux with both the kernel and the ramdisk
+images retrieved from a TFTP server on the network:
+
+RedBoot> fconfig
+Run script at boot: false true
+Boot script:
+Enter script, terminate with empty line
+>> load zImage -r -b 0x100000
+>> load ramdisk_ks.gz -r -b 0x800000
+>> exec -b 0x100000 -l 0xc0000
+>>
+Boot script timeout (1000ms resolution): 3
+Use BOOTP for network configuration: true
+GDB connection port: 9000
+Network debug at boot time: false
+Update RedBoot non-volatile configuration - are you sure (y/n)? y
+
+Then, rebooting the Assabet is just a matter of waiting for the login prompt.
+
+
+
+Nicolas Pitre
+nico@cam.org
+June 12, 2001
+
+
+Status of peripherals in -rmk tree (updated 14/10/2001)
+-------------------------------------------------------
+
+Assabet:
+ Serial ports:
+ Radio: TX, RX, CTS, DSR, DCD, RI
+ PM: Not tested.
+ COM: TX, RX, CTS, DSR, DCD, RTS, DTR, PM
+ PM: Not tested.
+ I2C: Implemented, not fully tested.
+ L3: Fully tested, pass.
+ PM: Not tested.
+
+ Video:
+ LCD: Fully tested. PM
+ (LCD doesn't like being blanked with
+ neponset connected)
+ Video out: Not fully
+
+ Audio:
+ UDA1341:
+ Playback: Fully tested, pass.
+ Record: Implemented, not tested.
+ PM: Not tested.
+
+ UCB1200:
+ Audio play: Implemented, not heavily tested.
+ Audio rec: Implemented, not heavily tested.
+ Telco audio play: Implemented, not heavily tested.
+ Telco audio rec: Implemented, not heavily tested.
+ POTS control: No
+ Touchscreen: Yes
+ PM: Not tested.
+
+ Other:
+ PCMCIA:
+ LPE: Fully tested, pass.
+ USB: No
+ IRDA:
+ SIR: Fully tested, pass.
+ FIR: Fully tested, pass.
+ PM: Not tested.
+
+Neponset:
+ Serial ports:
+ COM1,2: TX, RX, CTS, DSR, DCD, RTS, DTR
+ PM: Not tested.
+ USB: Implemented, not heavily tested.
+ PCMCIA: Implemented, not heavily tested.
+ PM: Not tested.
+ CF: Implemented, not heavily tested.
+ PM: Not tested.
+
+More stuff can be found in the -np (Nicolas Pitre's) tree.
+
diff --git a/Documentation/arm/SA1100/Brutus b/Documentation/arm/SA1100/Brutus
new file mode 100644
index 0000000..2254c8f
--- /dev/null
+++ b/Documentation/arm/SA1100/Brutus
@@ -0,0 +1,66 @@
+Brutus is an evaluation platform for the SA1100 manufactured by Intel.
+For more details, see:
+
+http://developer.intel.com/design/strong/applnots/sa1100lx/getstart.htm
+
+To compile for Brutus, you must issue the following commands:
+
+ make brutus_config
+ make config
+ [accept all the defaults]
+ make zImage
+
+The resulting kernel will end up in linux/arch/arm/boot/zImage. This file
+must be loaded at 0xc0008000 in Brutus's memory and execution started at
+0xc0008000 as well with the value of registers r0 = 0 and r1 = 16 upon
+entry.
+
+But prior to execute the kernel, a ramdisk image must also be loaded in
+memory. Use memory address 0xd8000000 for this. Note that the file
+containing the (compressed) ramdisk image must not exceed 4 MB.
+
+Typically, you'll need angelboot to load the kernel.
+The following angelboot.opt file should be used:
+
+----- begin angelboot.opt -----
+base 0xc0008000
+entry 0xc0008000
+r0 0x00000000
+r1 0x00000010
+device /dev/ttyS0
+options "9600 8N1"
+baud 115200
+otherfile ramdisk_img.gz
+otherbase 0xd8000000
+----- end angelboot.opt -----
+
+Then load the kernel and ramdisk with:
+
+ angelboot -f angelboot.opt zImage
+
+The first Brutus serial port (assumed to be linked to /dev/ttyS0 on your
+host PC) is used by angel to load the kernel and ramdisk image. The serial
+console is provided through the second Brutus serial port. To access it,
+you may use minicom configured with /dev/ttyS1, 9600 baud, 8N1, no flow
+control.
+
+Currently supported:
+ - RS232 serial ports
+ - audio output
+ - LCD screen
+ - keyboard
+
+The actual Brutus support may not be complete without extra patches.
+If such patches exist, they should be found from
+ftp.netwinder.org/users/n/nico.
+
+A full PCMCIA support is still missing, although it's possible to hack
+some drivers in order to drive already inserted cards at boot time with
+little modifications.
+
+Any contribution is welcome.
+
+Please send patches to nico@cam.org
+
+Have Fun !
+
diff --git a/Documentation/arm/SA1100/CERF b/Documentation/arm/SA1100/CERF
new file mode 100644
index 0000000..b3d8453
--- /dev/null
+++ b/Documentation/arm/SA1100/CERF
@@ -0,0 +1,29 @@
+*** The StrongARM version of the CerfBoard/Cube has been discontinued ***
+
+The Intrinsyc CerfBoard is a StrongARM 1110-based computer on a board
+that measures approximately 2" square. It includes an Ethernet
+controller, an RS232-compatible serial port, a USB function port, and
+one CompactFlash+ slot on the back. Pictures can be found at the
+Intrinsyc website, http://www.intrinsyc.com.
+
+This document describes the support in the Linux kernel for the
+Intrinsyc CerfBoard.
+
+Supported in this version:
+ - CompactFlash+ slot (select PCMCIA in General Setup and any options
+ that may be required)
+ - Onboard Crystal CS8900 Ethernet controller (Cerf CS8900A support in
+ Network Devices)
+ - Serial ports with a serial console (hardcoded to 38400 8N1)
+
+In order to get this kernel onto your Cerf, you need a server that runs
+both BOOTP and TFTP. Detailed instructions should have come with your
+evaluation kit on how to use the bootloader. This series of commands
+will suffice:
+
+ make ARCH=arm CROSS_COMPILE=arm-linux- cerfcube_defconfig
+ make ARCH=arm CROSS_COMPILE=arm-linux- zImage
+ make ARCH=arm CROSS_COMPILE=arm-linux- modules
+ cp arch/arm/boot/zImage <TFTP directory>
+
+support@intrinsyc.com
diff --git a/Documentation/arm/SA1100/FreeBird b/Documentation/arm/SA1100/FreeBird
new file mode 100644
index 0000000..eda28b3
--- /dev/null
+++ b/Documentation/arm/SA1100/FreeBird
@@ -0,0 +1,21 @@
+Freebird-1.1 is produced by Legned(C) ,Inc.
+(http://www.legend.com.cn)
+and software/linux mainatined by Coventive(C),Inc.
+(http://www.coventive.com)
+
+Based on the Nicolas's strongarm kernel tree.
+
+===============================================================
+Maintainer:
+
+Chester Kuo <chester@coventive.com>
+ <chester@linux.org.tw>
+
+Author :
+Tim wu <timwu@coventive.com>
+CIH <cih@coventive.com>
+Eric Peng <ericpeng@coventive.com>
+Jeff Lee <jeff_lee@coventive.com>
+Allen Cheng
+Tony Liu <tonyliu@coventive.com>
+
diff --git a/Documentation/arm/SA1100/GraphicsClient b/Documentation/arm/SA1100/GraphicsClient
new file mode 100644
index 0000000..8fa7e80
--- /dev/null
+++ b/Documentation/arm/SA1100/GraphicsClient
@@ -0,0 +1,98 @@
+ADS GraphicsClient Plus Single Board Computer
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The original Linux support for this product has been provided by
+Nicolas Pitre <nico@cam.org>. Continued development work by
+Woojung Huh <whuh@applieddata.net>
+
+It's currently possible to mount a root filesystem via NFS providing a
+complete Linux environment. Otherwise a ramdisk image may be used. The
+board supports MTD/JFFS, so you could also mount something on there.
+
+Use 'make graphicsclient_config' before any 'make config'. This will set up
+defaults for GraphicsClient Plus support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0200000.
+Also the following registers should have the specified values upon entry:
+
+ r0 = 0
+ r1 = 29 (this is the GraphicsClient architecture number)
+
+Linux can be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+Angel is not available for the GraphicsClient Plus AFAIK.
+
+There is a board known as just the GraphicsClient that ADS used to
+produce but has end of lifed. This code will not work on the older
+board with the ADS bootloader, but should still work with Angel,
+as outlined below. In any case, if you're planning on deploying
+something en masse, you should probably get the newer board.
+
+If using Angel on the older boards, here is a typical angel.opt option file
+if the kernel is loaded through the Angel Debug Monitor:
+
+----- begin angelboot.opt -----
+base 0xc0200000
+entry 0xc0200000
+r0 0x00000000
+r1 0x0000001d
+device /dev/ttyS1
+options "38400 8N1"
+baud 115200
+#otherfile ramdisk.gz
+#otherbase 0xc0800000
+exec minicom
+----- end angelboot.opt -----
+
+Then the kernel (and ramdisk if otherfile/otherbase lines above are
+uncommented) would be loaded with:
+
+ angelboot -f angelboot.opt zImage
+
+Here it is assumed that the board is connected to ttyS1 on your PC
+and that minicom is preconfigured with /dev/ttyS1, 38400 baud, 8N1, no flow
+control by default.
+
+If any other bootloader is used, ensure it accomplish the same, especially
+for r0/r1 register values before jumping into the kernel.
+
+
+Supported peripherals:
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- on-board SMC 92C96 ethernet NIC
+- SA1100 serial port
+- flash memory access (MTD/JFFS)
+- pcmcia
+- touchscreen(ucb1200)
+- ps/2 keyboard
+- console on LCD screen
+- serial ports (ttyS[0-2])
+ - ttyS0 is default for serial console
+- Smart I/O (ADC, keypad, digital inputs, etc)
+ See http://www.applieddata.com/developers/linux for IOCTL documentation
+ and example user space code. ps/2 keybd is multiplexed through this driver
+
+To do:
+- UCB1200 audio with new ucb_generic layer
+- everything else! :-)
+
+Notes:
+
+- The flash on board is divided into 3 partitions. mtd0 is where
+ the ADS boot ROM and zImage is stored. It's been marked as
+ read-only to keep you from blasting over the bootloader. :) mtd1 is
+ for the ramdisk.gz image. mtd2 is user flash space and can be
+ utilized for either JFFS or if you're feeling crazy, running ext2
+ on top of it. If you're not using the ADS bootloader, you're
+ welcome to blast over the mtd1 partition also.
+
+- 16bpp mode requires a different cable than what ships with the board.
+ Contact ADS or look through the manual to wire your own. Currently,
+ if you compile with 16bit mode support and switch into a lower bpp
+ mode, the timing is off so the image is corrupted. This will be
+ fixed soon.
+
+Any contribution can be sent to nico@cam.org and will be greatly welcome!
+
diff --git a/Documentation/arm/SA1100/GraphicsMaster b/Documentation/arm/SA1100/GraphicsMaster
new file mode 100644
index 0000000..dd28745
--- /dev/null
+++ b/Documentation/arm/SA1100/GraphicsMaster
@@ -0,0 +1,53 @@
+ADS GraphicsMaster Single Board Computer
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The original Linux support for this product has been provided by
+Nicolas Pitre <nico@cam.org>. Continued development work by
+Woojung Huh <whuh@applieddata.net>
+
+Use 'make graphicsmaster_config' before any 'make config'.
+This will set up defaults for GraphicsMaster support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0400000.
+
+Linux can be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+
+Supported peripherals:
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- SA1111 USB Master
+- on-board SMC 92C96 ethernet NIC
+- SA1100 serial port
+- flash memory access (MTD/JFFS)
+- pcmcia, compact flash
+- touchscreen(ucb1200)
+- ps/2 keyboard
+- console on LCD screen
+- serial ports (ttyS[0-2])
+ - ttyS0 is default for serial console
+- Smart I/O (ADC, keypad, digital inputs, etc)
+ See http://www.applieddata.com/developers/linux for IOCTL documentation
+ and example user space code. ps/2 keybd is multiplexed through this driver
+
+To do:
+- everything else! :-)
+
+Notes:
+
+- The flash on board is divided into 3 partitions. mtd0 is where
+ the zImage is stored. It's been marked as read-only to keep you
+ from blasting over the bootloader. :) mtd1 is
+ for the ramdisk.gz image. mtd2 is user flash space and can be
+ utilized for either JFFS or if you're feeling crazy, running ext2
+ on top of it. If you're not using the ADS bootloader, you're
+ welcome to blast over the mtd1 partition also.
+
+- 16bpp mode requires a different cable than what ships with the board.
+ Contact ADS or look through the manual to wire your own. Currently,
+ if you compile with 16bit mode support and switch into a lower bpp
+ mode, the timing is off so the image is corrupted. This will be
+ fixed soon.
+
+Any contribution can be sent to nico@cam.org and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/HUW_WEBPANEL b/Documentation/arm/SA1100/HUW_WEBPANEL
new file mode 100644
index 0000000..fd56b48
--- /dev/null
+++ b/Documentation/arm/SA1100/HUW_WEBPANEL
@@ -0,0 +1,17 @@
+The HUW_WEBPANEL is a product of the german company Hoeft & Wessel AG
+
+If you want more information, please visit
+http://www.hoeft-wessel.de
+
+To build the kernel:
+ make huw_webpanel_config
+ make oldconfig
+ [accept all defaults]
+ make zImage
+
+Mostly of the work is done by:
+Roman Jordan jor@hoeft-wessel.de
+Christoph Schulz schu@hoeft-wessel.de
+
+2000/12/18/
+
diff --git a/Documentation/arm/SA1100/Itsy b/Documentation/arm/SA1100/Itsy
new file mode 100644
index 0000000..3b59453
--- /dev/null
+++ b/Documentation/arm/SA1100/Itsy
@@ -0,0 +1,39 @@
+Itsy is a research project done by the Western Research Lab, and Systems
+Research Center in Palo Alto, CA. The Itsy project is one of several
+research projects at Compaq that are related to pocket computing.
+
+For more information, see:
+
+ http://www.research.digital.com/wrl/itsy/index.html
+
+Notes on initial 2.4 Itsy support (8/27/2000) :
+The port was done on an Itsy version 1.5 machine with a daughtercard with
+64 Meg of DRAM and 32 Meg of Flash. The initial work includes support for
+serial console (to see what you're doing). No other devices have been
+enabled.
+
+To build, do a "make menuconfig" (or xmenuconfig) and select Itsy support.
+Disable Flash and LCD support. and then do a make zImage.
+Finally, you will need to cd to arch/arm/boot/tools and execute a make there
+to build the params-itsy program used to boot the kernel.
+
+In order to install the port of 2.4 to the itsy, You will need to set the
+configuration parameters in the monitor as follows:
+Arg 1:0x08340000, Arg2: 0xC0000000, Arg3:18 (0x12), Arg4:0
+Make sure the start-routine address is set to 0x00060000.
+
+Next, flash the params-itsy program to 0x00060000 ("p 1 0x00060000" in the
+flash menu) Flash the kernel in arch/arm/boot/zImage into 0x08340000
+("p 1 0x00340000"). Finally flash an initial ramdisk into 0xC8000000
+("p 2 0x0") We used ramdisk-2-30.gz from the 0.11 version directory on
+handhelds.org.
+
+The serial connection we established was at:
+ 8-bit data, no parity, 1 stop bit(s), 115200.00 b/s. in the monitor, in the
+params-itsy program, and in the kernel itself. This can be changed, but
+not easily. The monitor parameters are easily changed, the params program
+setup is assembly outl's, and the kernel is a configuration item specific to
+the itsy. (i.e. grep for CONFIG_SA1100_ITSY and you'll find where it is.)
+
+
+This should get you a properly booting 2.4 kernel on the itsy.
diff --git a/Documentation/arm/SA1100/LART b/Documentation/arm/SA1100/LART
new file mode 100644
index 0000000..6d412b6
--- /dev/null
+++ b/Documentation/arm/SA1100/LART
@@ -0,0 +1,14 @@
+Linux Advanced Radio Terminal (LART)
+------------------------------------
+
+The LART is a small (7.5 x 10cm) SA-1100 board, designed for embedded
+applications. It has 32 MB DRAM, 4MB Flash ROM, double RS232 and all
+other StrongARM-gadgets. Almost all SA signals are directly accessible
+through a number of connectors. The powersupply accepts voltages
+between 3.5V and 16V and is overdimensioned to support a range of
+daughterboards. A quad Ethernet / IDE / PS2 / sound daughterboard
+is under development, with plenty of others in different stages of
+planning.
+
+The hardware designs for this board have been released under an open license;
+see the LART page at http://www.lartmaker.nl/ for more information.
diff --git a/Documentation/arm/SA1100/PLEB b/Documentation/arm/SA1100/PLEB
new file mode 100644
index 0000000..92cae06
--- /dev/null
+++ b/Documentation/arm/SA1100/PLEB
@@ -0,0 +1,11 @@
+The PLEB project was started as a student initiative at the School of
+Computer Science and Engineering, University of New South Wales to make a
+pocket computer capable of running the Linux Kernel.
+
+PLEB support has yet to be fully integrated.
+
+For more information, see:
+
+ http://www.cse.unsw.edu.au/~pleb/
+
+
diff --git a/Documentation/arm/SA1100/Pangolin b/Documentation/arm/SA1100/Pangolin
new file mode 100644
index 0000000..077a612
--- /dev/null
+++ b/Documentation/arm/SA1100/Pangolin
@@ -0,0 +1,23 @@
+Pangolin is a StrongARM 1110-based evaluation platform produced
+by Dialogue Technology (http://www.dialogue.com.tw/).
+It has EISA slots for ease of configuration with SDRAM/Flash
+memory card, USB/Serial/Audio card, Compact Flash card,
+PCMCIA/IDE card and TFT-LCD card.
+
+To compile for Pangolin, you must issue the following commands:
+
+ make pangolin_config
+ make oldconfig
+ make zImage
+
+Supported peripherals:
+- SA1110 serial port (UART1/UART2/UART3)
+- flash memory access
+- compact flash driver
+- UDA1341 sound driver
+- SA1100 LCD controller for 800x600 16bpp TFT-LCD
+- MQ-200 driver for 800x600 16bpp TFT-LCD
+- Penmount(touch panel) driver
+- PCMCIA driver
+- SMC91C94 LAN driver
+- IDE driver (experimental)
diff --git a/Documentation/arm/SA1100/Tifon b/Documentation/arm/SA1100/Tifon
new file mode 100644
index 0000000..dd1934d
--- /dev/null
+++ b/Documentation/arm/SA1100/Tifon
@@ -0,0 +1,7 @@
+Tifon
+-----
+
+More info has to come...
+
+Contact: Peter Danielsson <peter.danielsson@era-t.ericsson.se>
+
diff --git a/Documentation/arm/SA1100/Victor b/Documentation/arm/SA1100/Victor
new file mode 100644
index 0000000..01e81fc
--- /dev/null
+++ b/Documentation/arm/SA1100/Victor
@@ -0,0 +1,16 @@
+Victor is known as a "digital talking book player" manufactured by
+VisuAide, Inc. to be used by blind people.
+
+For more information related to Victor, see:
+
+ http://www.visuaide.com/victor
+
+Of course Victor is using Linux as its main operating system.
+The Victor implementation for Linux is maintained by Nicolas Pitre:
+
+ nico@visuaide.com
+ nico@cam.org
+
+For any comments, please feel free to contact me through the above
+addresses.
+
diff --git a/Documentation/arm/SA1100/Yopy b/Documentation/arm/SA1100/Yopy
new file mode 100644
index 0000000..e14f16d
--- /dev/null
+++ b/Documentation/arm/SA1100/Yopy
@@ -0,0 +1,2 @@
+See http://www.yopydeveloper.org for more.
+
diff --git a/Documentation/arm/SA1100/empeg b/Documentation/arm/SA1100/empeg
new file mode 100644
index 0000000..4ece484
--- /dev/null
+++ b/Documentation/arm/SA1100/empeg
@@ -0,0 +1,2 @@
+See ../empeg/README
+
diff --git a/Documentation/arm/SA1100/nanoEngine b/Documentation/arm/SA1100/nanoEngine
new file mode 100644
index 0000000..fc431cb
--- /dev/null
+++ b/Documentation/arm/SA1100/nanoEngine
@@ -0,0 +1,11 @@
+nanoEngine
+----------
+
+"nanoEngine" is a SA1110 based single board computer from
+Bright Star Engineering Inc. See www.brightstareng.com/arm
+for more info.
+(Ref: Stuart Adams <sja@brightstareng.com>)
+
+Also visit Larry Doolittle's "Linux for the nanoEngine" site:
+http://recycle.lbl.gov/~ldoolitt/bse/
+
diff --git a/Documentation/arm/SA1100/serial_UART b/Documentation/arm/SA1100/serial_UART
new file mode 100644
index 0000000..a63966f
--- /dev/null
+++ b/Documentation/arm/SA1100/serial_UART
@@ -0,0 +1,47 @@
+The SA1100 serial port had its major/minor numbers officially assigned:
+
+> Date: Sun, 24 Sep 2000 21:40:27 -0700
+> From: H. Peter Anvin <hpa@transmeta.com>
+> To: Nicolas Pitre <nico@CAM.ORG>
+> Cc: Device List Maintainer <device@lanana.org>
+> Subject: Re: device
+>
+> Okay. Note that device numbers 204 and 205 are used for "low density
+> serial devices", so you will have a range of minors on those majors (the
+> tty device layer handles this just fine, so you don't have to worry about
+> doing anything special.)
+>
+> So your assignments are:
+>
+> 204 char Low-density serial ports
+> 5 = /dev/ttySA0 SA1100 builtin serial port 0
+> 6 = /dev/ttySA1 SA1100 builtin serial port 1
+> 7 = /dev/ttySA2 SA1100 builtin serial port 2
+>
+> 205 char Low-density serial ports (alternate device)
+> 5 = /dev/cusa0 Callout device for ttySA0
+> 6 = /dev/cusa1 Callout device for ttySA1
+> 7 = /dev/cusa2 Callout device for ttySA2
+>
+
+You must create those inodes in /dev on the root filesystem used
+by your SA1100-based device:
+
+ mknod ttySA0 c 204 5
+ mknod ttySA1 c 204 6
+ mknod ttySA2 c 204 7
+ mknod cusa0 c 205 5
+ mknod cusa1 c 205 6
+ mknod cusa2 c 205 7
+
+In addition to the creation of the appropriate device nodes above, you
+must ensure your user space applications make use of the correct device
+name. The classic example is the content of the /etc/inittab file where
+you might have a getty process started on ttyS0. In this case:
+
+- replace occurrences of ttyS0 with ttySA0, ttyS1 with ttySA1, etc.
+
+- don't forget to add 'ttySA0', 'console', or the appropriate tty name
+ in /etc/securetty for root to be allowed to login as well.
+
+
diff --git a/Documentation/arm/Samsung-S3C24XX/DMA.txt b/Documentation/arm/Samsung-S3C24XX/DMA.txt
new file mode 100644
index 0000000..3ed8238
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/DMA.txt
@@ -0,0 +1,46 @@
+ S3C2410 DMA
+ ===========
+
+Introduction
+------------
+
+ The kernel provides an interface to manage DMA transfers
+ using the DMA channels in the CPU, so that the central
+ duty of managing channel mappings, and programming the
+ channel generators is in one place.
+
+
+DMA Channel Ordering
+--------------------
+
+ Many of the range do not have connections for the DMA
+ channels to all sources, which means that some devices
+ have a restricted number of channels that can be used.
+
+ To allow flexibility for each CPU type and board, the
+ DMA code can be given a DMA ordering structure which
+ allows the order of channel search to be specified, as
+ well as allowing the prohibition of certain claims.
+
+ struct s3c24xx_dma_order has a list of channels, and
+ each channel within has a slot for a list of DMA
+ channel numbers. The slots are searched in order for
+ the presence of a DMA channel number with DMA_CH_VALID
+ or-ed in.
+
+ If the order has the flag DMA_CH_NEVER set, then after
+ checking the channel list, the system will return no
+ found channel, thus denying the request.
+
+ A board support file can call s3c24xx_dma_order_set()
+ to register a complete ordering set. The routine will
+ copy the data, so the original can be discarded with
+ __initdata.
+
+
+Authour
+-------
+
+Ben Dooks,
+Copyright (c) 2007 Ben Dooks, Simtec Electronics
+Licensed under the GPL v2
diff --git a/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt b/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt
new file mode 100644
index 0000000..26422f0
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt
@@ -0,0 +1,58 @@
+ Simtec Electronics EB2410ITX (BAST)
+ ===================================
+
+ http://www.simtec.co.uk/products/EB2410ITX/
+
+Introduction
+------------
+
+ The EB2410ITX is a S3C2410 based development board with a variety of
+ peripherals and expansion connectors. This board is also known by
+ the shortened name of Bast.
+
+
+Configuration
+-------------
+
+ To set the default configuration, use `make bast_defconfig` which
+ supports the commonly used features of this board.
+
+
+Support
+-------
+
+ Official support information can be found on the Simtec Electronics
+ website, at the product page http://www.simtec.co.uk/products/EB2410ITX/
+
+ Useful links:
+
+ - Resources Page http://www.simtec.co.uk/products/EB2410ITX/resources.html
+
+ - Board FAQ at http://www.simtec.co.uk/products/EB2410ITX/faq.html
+
+ - Bootloader info http://www.simtec.co.uk/products/SWABLE/resources.html
+ and FAQ http://www.simtec.co.uk/products/SWABLE/faq.html
+
+
+MTD
+---
+
+ The NAND and NOR support has been merged from the linux-mtd project.
+ Any problems, see http://www.linux-mtd.infradead.org/ for more
+ information or up-to-date versions of linux-mtd.
+
+
+IDE
+---
+
+ Both onboard IDE ports are supported, however there is no support for
+ changing speed of devices, PIO Mode 4 capable drives should be used.
+
+
+Maintainers
+-----------
+
+ This board is maintained by Simtec Electronics.
+
+
+(c) 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/GPIO.txt b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
new file mode 100644
index 0000000..ea7ccfc
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
@@ -0,0 +1,137 @@
+ S3C2410 GPIO Control
+ ====================
+
+Introduction
+------------
+
+ The s3c2410 kernel provides an interface to configure and
+ manipulate the state of the GPIO pins, and find out other
+ information about them.
+
+ There are a number of conditions attached to the configuration
+ of the s3c2410 GPIO system, please read the Samsung provided
+ data-sheet/users manual to find out the complete list.
+
+
+GPIOLIB
+-------
+
+ With the event of the GPIOLIB in drivers/gpio, support for some
+ of the GPIO functions such as reading and writing a pin will
+ be removed in favour of this common access method.
+
+ Once all the extant drivers have been converted, the functions
+ listed below will be removed (they may be marked as __deprecated
+ in the near future).
+
+ - s3c2410_gpio_getpin
+ - s3c2410_gpio_setpin
+
+
+Headers
+-------
+
+ See arch/arm/mach-s3c2410/include/mach/regs-gpio.h for the list
+ of GPIO pins, and the configuration values for them. This
+ is included by using #include <mach/regs-gpio.h>
+
+ The GPIO management functions are defined in the hardware
+ header arch/arm/mach-s3c2410/include/mach/hardware.h which can be
+ included by #include <mach/hardware.h>
+
+ A useful amount of documentation can be found in the hardware
+ header on how the GPIO functions (and others) work.
+
+ Whilst a number of these functions do make some checks on what
+ is passed to them, for speed of use, they may not always ensure
+ that the user supplied data to them is correct.
+
+
+PIN Numbers
+-----------
+
+ Each pin has an unique number associated with it in regs-gpio.h,
+ eg S3C2410_GPA0 or S3C2410_GPF1. These defines are used to tell
+ the GPIO functions which pin is to be used.
+
+
+Configuring a pin
+-----------------
+
+ The following function allows the configuration of a given pin to
+ be changed.
+
+ void s3c2410_gpio_cfgpin(unsigned int pin, unsigned int function);
+
+ Eg:
+
+ s3c2410_gpio_cfgpin(S3C2410_GPA0, S3C2410_GPA0_ADDR0);
+ s3c2410_gpio_cfgpin(S3C2410_GPE8, S3C2410_GPE8_SDDAT1);
+
+ which would turn GPA0 into the lowest Address line A0, and set
+ GPE8 to be connected to the SDIO/MMC controller's SDDAT1 line.
+
+
+Reading the current configuration
+---------------------------------
+
+ The current configuration of a pin can be read by using:
+
+ s3c2410_gpio_getcfg(unsigned int pin);
+
+ The return value will be from the same set of values which can be
+ passed to s3c2410_gpio_cfgpin().
+
+
+Configuring a pull-up resistor
+------------------------------
+
+ A large proportion of the GPIO pins on the S3C2410 can have weak
+ pull-up resistors enabled. This can be configured by the following
+ function:
+
+ void s3c2410_gpio_pullup(unsigned int pin, unsigned int to);
+
+ Where the to value is zero to set the pull-up off, and 1 to enable
+ the specified pull-up. Any other values are currently undefined.
+
+
+Getting the state of a PIN
+--------------------------
+
+ The state of a pin can be read by using the function:
+
+ unsigned int s3c2410_gpio_getpin(unsigned int pin);
+
+ This will return either zero or non-zero. Do not count on this
+ function returning 1 if the pin is set.
+
+
+Setting the state of a PIN
+--------------------------
+
+ The value an pin is outputing can be modified by using the following:
+
+ void s3c2410_gpio_setpin(unsigned int pin, unsigned int to);
+
+ Which sets the given pin to the value. Use 0 to write 0, and 1 to
+ set the output to 1.
+
+
+Getting the IRQ number associated with a PIN
+--------------------------------------------
+
+ The following function can map the given pin number to an IRQ
+ number to pass to the IRQ system.
+
+ int s3c2410_gpio_getirq(unsigned int pin);
+
+ Note, not all pins have an IRQ.
+
+
+Authour
+-------
+
+
+Ben Dooks, 03 October 2004
+(c) 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/H1940.txt b/Documentation/arm/Samsung-S3C24XX/H1940.txt
new file mode 100644
index 0000000..f4a7b22
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/H1940.txt
@@ -0,0 +1,40 @@
+ HP IPAQ H1940
+ =============
+
+http://www.handhelds.org/projects/h1940.html
+
+Introduction
+------------
+
+ The HP H1940 is a S3C2410 based handheld device, with
+ bluetooth connectivity.
+
+
+Support
+-------
+
+ A variety of information is available
+
+ handhelds.org project page:
+
+ http://www.handhelds.org/projects/h1940.html
+
+ handhelds.org wiki page:
+
+ http://handhelds.org/moin/moin.cgi/HpIpaqH1940
+
+ Herbert Pötzl pages:
+
+ http://vserver.13thfloor.at/H1940/
+
+
+Maintainers
+-----------
+
+ This project is being maintained and developed by a variety
+ of people, including Ben Dooks, Arnaud Patard, and Herbert Pötzl.
+
+ Thanks to the many others who have also provided support.
+
+
+(c) 2005 Ben Dooks \ No newline at end of file
diff --git a/Documentation/arm/Samsung-S3C24XX/NAND.txt b/Documentation/arm/Samsung-S3C24XX/NAND.txt
new file mode 100644
index 0000000..bc478a3
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/NAND.txt
@@ -0,0 +1,30 @@
+ S3C24XX NAND Support
+ ====================
+
+Introduction
+------------
+
+Small Page NAND
+---------------
+
+The driver uses a 512 byte (1 page) ECC code for this setup. The
+ECC code is not directly compatible with the default kernel ECC
+code, so the driver enforces its own OOB layout and ECC parameters
+
+Large Page NAND
+---------------
+
+The driver is capable of handling NAND flash with a 2KiB page
+size, with support for hardware ECC generation and correction.
+
+Unlike the 512byte page mode, the driver generates ECC data for
+each 256 byte block in an 2KiB page. This means that more than
+one error in a page can be rectified. It also means that the
+OOB layout remains the default kernel layout for these flashes.
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2007 Simtec Electronics
+
diff --git a/Documentation/arm/Samsung-S3C24XX/Overview.txt b/Documentation/arm/Samsung-S3C24XX/Overview.txt
new file mode 100644
index 0000000..cff6227
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/Overview.txt
@@ -0,0 +1,302 @@
+ S3C24XX ARM Linux Overview
+ ==========================
+
+
+
+Introduction
+------------
+
+ The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported
+ by the 's3c2410' architecture of ARM Linux. Currently the S3C2410,
+ S3C2412, S3C2413, S3C2440, S3C2442 and S3C2443 devices are supported.
+
+ Support for the S3C2400 and S3C24A0 series are in progress.
+
+
+Configuration
+-------------
+
+ A generic S3C2410 configuration is provided, and can be used as the
+ default by `make s3c2410_defconfig`. This configuration has support
+ for all the machines, and the commonly used features on them.
+
+ Certain machines may have their own default configurations as well,
+ please check the machine specific documentation.
+
+
+Layout
+------
+
+ The core support files are located in the platform code contained in
+ arch/arm/plat-s3c24xx with headers in include/asm-arm/plat-s3c24xx.
+ This directory should be kept to items shared between the platform
+ code (arch/arm/plat-s3c24xx) and the arch/arm/mach-s3c24* code.
+
+ Each cpu has a directory with the support files for it, and the
+ machines that carry the device. For example S3C2410 is contained
+ in arch/arm/mach-s3c2410 and S3C2440 in arch/arm/mach-s3c2440
+
+ Register, kernel and platform data definitions are held in the
+ arch/arm/mach-s3c2410 directory./include/mach
+
+arch/arm/plat-s3c24xx:
+
+ Files in here are either common to all the s3c24xx family,
+ or are common to only some of them with names to indicate this
+ status. The files that are not common to all are generally named
+ with the initial cpu they support in the series to ensure a short
+ name without any possibility of confusion with newer devices.
+
+ As an example, initially s3c244x would cover s3c2440 and s3c2442, but
+ with the s3c2443 which does not share many of the same drivers in
+ this directory, the name becomes invalid. We stick to s3c2440-<x>
+ to indicate a driver that is s3c2440 and s3c2442 compatible.
+
+ This does mean that to find the status of any given SoC, a number
+ of directories may need to be searched.
+
+
+Machines
+--------
+
+ The currently supported machines are as follows:
+
+ Simtec Electronics EB2410ITX (BAST)
+
+ A general purpose development board, see EB2410ITX.txt for further
+ details
+
+ Simtec Electronics IM2440D20 (Osiris)
+
+ CPU Module from Simtec Electronics, with a S3C2440A CPU, nand flash
+ and a PCMCIA controller.
+
+ Samsung SMDK2410
+
+ Samsung's own development board, geared for PDA work.
+
+ Samsung/Aiji SMDK2412
+
+ The S3C2412 version of the SMDK2440.
+
+ Samsung/Aiji SMDK2413
+
+ The S3C2412 version of the SMDK2440.
+
+ Samsung/Meritech SMDK2440
+
+ The S3C2440 compatible version of the SMDK2440, which has the
+ option of an S3C2440 or S3C2442 CPU module.
+
+ Thorcom VR1000
+
+ Custom embedded board
+
+ HP IPAQ 1940
+
+ Handheld (IPAQ), available in several varieties
+
+ HP iPAQ rx3715
+
+ S3C2440 based IPAQ, with a number of variations depending on
+ features shipped.
+
+ Acer N30
+
+ A S3C2410 based PDA from Acer. There is a Wiki page at
+ http://handhelds.org/moin/moin.cgi/AcerN30Documentation .
+
+ AML M5900
+
+ American Microsystems' M5900
+
+ Nex Vision Nexcoder
+ Nex Vision Otom
+
+ Two machines by Nex Vision
+
+
+Adding New Machines
+-------------------
+
+ The architecture has been designed to support as many machines as can
+ be configured for it in one kernel build, and any future additions
+ should keep this in mind before altering items outside of their own
+ machine files.
+
+ Machine definitions should be kept in linux/arch/arm/mach-s3c2410,
+ and there are a number of examples that can be looked at.
+
+ Read the kernel patch submission policies as well as the
+ Documentation/arm directory before submitting patches. The
+ ARM kernel series is managed by Russell King, and has a patch system
+ located at http://www.arm.linux.org.uk/developer/patches/
+ as well as mailing lists that can be found from the same site.
+
+ As a courtesy, please notify <ben-linux@fluff.org> of any new
+ machines or other modifications.
+
+ Any large scale modifications, or new drivers should be discussed
+ on the ARM kernel mailing list (linux-arm-kernel) before being
+ attempted. See http://www.arm.linux.org.uk/mailinglists/ for the
+ mailing list information.
+
+
+I2C
+---
+
+ The hardware I2C core in the CPU is supported in single master
+ mode, and can be configured via platform data.
+
+
+RTC
+---
+
+ Support for the onboard RTC unit, including alarm function.
+
+ This has recently been upgraded to use the new RTC core,
+ and the module has been renamed to rtc-s3c to fit in with
+ the new rtc naming scheme.
+
+
+Watchdog
+--------
+
+ The onchip watchdog is available via the standard watchdog
+ interface.
+
+
+NAND
+----
+
+ The current kernels now have support for the s3c2410 NAND
+ controller. If there are any problems the latest linux-mtd
+ code can be found from http://www.linux-mtd.infradead.org/
+
+ For more information see Documentation/arm/Samsung-S3C24XX/NAND.txt
+
+
+SD/MMC
+------
+
+ The SD/MMC hardware pre S3C2443 is supported in the current
+ kernel, the driver is drivers/mmc/host/s3cmci.c and supports
+ 1 and 4 bit SD or MMC cards.
+
+ The SDIO behaviour of this driver has not been fully tested. There is no
+ current support for hardware SDIO interrupts.
+
+
+Serial
+------
+
+ The s3c2410 serial driver provides support for the internal
+ serial ports. These devices appear as /dev/ttySAC0 through 3.
+
+ To create device nodes for these, use the following commands
+
+ mknod ttySAC0 c 204 64
+ mknod ttySAC1 c 204 65
+ mknod ttySAC2 c 204 66
+
+
+GPIO
+----
+
+ The core contains support for manipulating the GPIO, see the
+ documentation in GPIO.txt in the same directory as this file.
+
+ Newer kernels carry GPIOLIB, and support is being moved towards
+ this with some of the older support in line to be removed.
+
+
+Clock Management
+----------------
+
+ The core provides the interface defined in the header file
+ include/asm-arm/hardware/clock.h, to allow control over the
+ various clock units
+
+
+Suspend to RAM
+--------------
+
+ For boards that provide support for suspend to RAM, the
+ system can be placed into low power suspend.
+
+ See Suspend.txt for more information.
+
+
+SPI
+---
+
+ SPI drivers are available for both the in-built hardware
+ (although there is no DMA support yet) and a generic
+ GPIO based solution.
+
+
+LEDs
+----
+
+ There is support for GPIO based LEDs via a platform driver
+ in the LED subsystem.
+
+
+Platform Data
+-------------
+
+ Whenever a device has platform specific data that is specified
+ on a per-machine basis, care should be taken to ensure the
+ following:
+
+ 1) that default data is not left in the device to confuse the
+ driver if a machine does not set it at startup
+
+ 2) the data should (if possible) be marked as __initdata,
+ to ensure that the data is thrown away if the machine is
+ not the one currently in use.
+
+ The best way of doing this is to make a function that
+ kmalloc()s an area of memory, and copies the __initdata
+ and then sets the relevant device's platform data. Making
+ the function `__init` takes care of ensuring it is discarded
+ with the rest of the initialisation code
+
+ static __init void s3c24xx_xxx_set_platdata(struct xxx_data *pd)
+ {
+ struct s3c2410_xxx_mach_info *npd;
+
+ npd = kmalloc(sizeof(struct s3c2410_xxx_mach_info), GFP_KERNEL);
+ if (npd) {
+ memcpy(npd, pd, sizeof(struct s3c2410_xxx_mach_info));
+ s3c_device_xxx.dev.platform_data = npd;
+ } else {
+ printk(KERN_ERR "no memory for xxx platform data\n");
+ }
+ }
+
+ Note, since the code is marked as __init, it should not be
+ exported outside arch/arm/mach-s3c2410/, or exported to
+ modules via EXPORT_SYMBOL() and related functions.
+
+
+Port Contributors
+-----------------
+
+ Ben Dooks (BJD)
+ Vincent Sanders
+ Herbert Potzl
+ Arnaud Patard (RTP)
+ Roc Wu
+ Klaus Fetscher
+ Dimitry Andric
+ Shannon Holland
+ Guillaume Gourat (NexVision)
+ Christer Weinigel (wingel) (Acer N30)
+ Lucas Correia Villa Real (S3C2400 port)
+
+
+Document Author
+---------------
+
+Ben Dooks, (c) 2004-2005,2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/S3C2412.txt b/Documentation/arm/Samsung-S3C24XX/S3C2412.txt
new file mode 100644
index 0000000..295d971
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/S3C2412.txt
@@ -0,0 +1,120 @@
+ S3C2412 ARM Linux Overview
+ ==========================
+
+Introduction
+------------
+
+ The S3C2412 is part of the S3C24XX range of ARM9 System-on-Chip CPUs
+ from Samsung. This part has an ARM926-EJS core, capable of running up
+ to 266MHz (see data-sheet for more information)
+
+
+Clock
+-----
+
+ The core clock code provides a set of clocks to the drivers, and allows
+ for source selection and a number of other features.
+
+
+Power
+-----
+
+ No support for suspend/resume to RAM in the current system.
+
+
+DMA
+---
+
+ No current support for DMA.
+
+
+GPIO
+----
+
+ There is support for setting the GPIO to input/output/special function
+ and reading or writing to them.
+
+
+UART
+----
+
+ The UART hardware is similar to the S3C2440, and is supported by the
+ s3c2410 driver in the drivers/serial directory.
+
+
+NAND
+----
+
+ The NAND hardware is similar to the S3C2440, and is supported by the
+ s3c2410 driver in the drivers/mtd/nand directory.
+
+
+USB Host
+--------
+
+ The USB hardware is similar to the S3C2410, with extended clock source
+ control. The OHCI portion is supported by the ohci-s3c2410 driver, and
+ the clock control selection is supported by the core clock code.
+
+
+USB Device
+----------
+
+ No current support in the kernel
+
+
+IRQs
+----
+
+ All the standard, and external interrupt sources are supported. The
+ extra sub-sources are not yet supported.
+
+
+RTC
+---
+
+ The RTC hardware is similar to the S3C2410, and is supported by the
+ s3c2410-rtc driver.
+
+
+Watchdog
+--------
+
+ The watchdog hardware is the same as the S3C2410, and is supported by
+ the s3c2410_wdt driver.
+
+
+MMC/SD/SDIO
+-----------
+
+ No current support for the MMC/SD/SDIO block.
+
+IIC
+---
+
+ The IIC hardware is the same as the S3C2410, and is supported by the
+ i2c-s3c24xx driver.
+
+
+IIS
+---
+
+ No current support for the IIS interface.
+
+
+SPI
+---
+
+ No current support for the SPI interfaces.
+
+
+ATA
+---
+
+ No current support for the on-board ATA block.
+
+
+Document Author
+---------------
+
+Ben Dooks, (c) 2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/S3C2413.txt b/Documentation/arm/Samsung-S3C24XX/S3C2413.txt
new file mode 100644
index 0000000..ab2a888
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/S3C2413.txt
@@ -0,0 +1,21 @@
+ S3C2413 ARM Linux Overview
+ ==========================
+
+Introduction
+------------
+
+ The S3C2413 is an extended version of the S3C2412, with an camera
+ interface and mobile DDR memory support. See the S3C2412 support
+ documentation for more information.
+
+
+Camera Interface
+---------------
+
+ This block is currently not supported.
+
+
+Document Author
+---------------
+
+Ben Dooks, (c) 2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt b/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt
new file mode 100644
index 0000000..32e1eae
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt
@@ -0,0 +1,56 @@
+ Samsung/Meritech SMDK2440
+ =========================
+
+Introduction
+------------
+
+ The SMDK2440 is a two part evaluation board for the Samsung S3C2440
+ processor. It includes support for LCD, SmartMedia, Audio, SD and
+ 10MBit Ethernet, and expansion headers for various signals, including
+ the camera and unused GPIO.
+
+
+Configuration
+-------------
+
+ To set the default configuration, use `make smdk2440_defconfig` which
+ will configure the common features of this board, or use
+ `make s3c2410_config` to include support for all s3c2410/s3c2440 machines
+
+
+Support
+-------
+
+ Ben Dooks' SMDK2440 site at http://www.fluff.org/ben/smdk2440/ which
+ includes linux based USB download tools.
+
+ Some of the h1940 patches that can be found from the H1940 project
+ site at http://www.handhelds.org/projects/h1940.html can also be
+ applied to this board.
+
+
+Peripherals
+-----------
+
+ There is no current support for any of the extra peripherals on the
+ base-board itself.
+
+
+MTD
+---
+
+ The NAND flash should be supported by the in kernel MTD NAND support,
+ NOR flash will be added later.
+
+
+Maintainers
+-----------
+
+ This board is being maintained by Ben Dooks, for more info, see
+ http://www.fluff.org/ben/smdk2440/
+
+ Many thanks to Dimitry Andric of TomTom for the loan of the SMDK2440,
+ and to Simtec Electronics for allowing me time to work on this.
+
+
+(c) 2004 Ben Dooks \ No newline at end of file
diff --git a/Documentation/arm/Samsung-S3C24XX/Suspend.txt b/Documentation/arm/Samsung-S3C24XX/Suspend.txt
new file mode 100644
index 0000000..0dab6e3
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/Suspend.txt
@@ -0,0 +1,137 @@
+ S3C24XX Suspend Support
+ =======================
+
+
+Introduction
+------------
+
+ The S3C24XX supports a low-power suspend mode, where the SDRAM is kept
+ in Self-Refresh mode, and all but the essential peripheral blocks are
+ powered down. For more information on how this works, please look
+ at the relevant CPU datasheet from Samsung.
+
+
+Requirements
+------------
+
+ 1) A bootloader that can support the necessary resume operation
+
+ 2) Support for at least 1 source for resume
+
+ 3) CONFIG_PM enabled in the kernel
+
+ 4) Any peripherals that are going to be powered down at the same
+ time require suspend/resume support.
+
+
+Resuming
+--------
+
+ The S3C2410 user manual defines the process of sending the CPU to
+ sleep and how it resumes. The default behaviour of the Linux code
+ is to set the GSTATUS3 register to the physical address of the
+ code to resume Linux operation.
+
+ GSTATUS4 is currently left alone by the sleep code, and is free to
+ use for any other purposes (for example, the EB2410ITX uses this to
+ save memory configuration in).
+
+
+Machine Support
+---------------
+
+ The machine specific functions must call the s3c2410_pm_init() function
+ to say that its bootloader is capable of resuming. This can be as
+ simple as adding the following to the machine's definition:
+
+ INITMACHINE(s3c2410_pm_init)
+
+ A board can do its own setup before calling s3c2410_pm_init, if it
+ needs to setup anything else for power management support.
+
+ There is currently no support for over-riding the default method of
+ saving the resume address, if your board requires it, then contact
+ the maintainer and discuss what is required.
+
+ Note, the original method of adding an late_initcall() is wrong,
+ and will end up initialising all compiled machines' pm init!
+
+ The following is an example of code used for testing wakeup from
+ an falling edge on IRQ_EINT0:
+
+
+static irqreturn_t button_irq(int irq, void *pw)
+{
+ return IRQ_HANDLED;
+}
+
+statuc void __init machine_init(void)
+{
+ ...
+
+ request_irq(IRQ_EINT0, button_irq, IRQF_TRIGGER_FALLING,
+ "button-irq-eint0", NULL);
+
+ enable_irq_wake(IRQ_EINT0);
+
+ s3c2410_pm_init();
+}
+
+
+Debugging
+---------
+
+ There are several important things to remember when using PM suspend:
+
+ 1) The uart drivers will disable the clocks to the UART blocks when
+ suspending, which means that use of printascii() or similar direct
+ access to the UARTs will cause the debug to stop.
+
+ 2) Whilst the pm code itself will attempt to re-enable the UART clocks,
+ care should be taken that any external clock sources that the UARTs
+ rely on are still enabled at that point.
+
+ 3) If any debugging is placed in the resume path, then it must have the
+ relevant clocks and peripherals setup before use (ie, bootloader).
+
+ For example, if you transmit a character from the UART, the baud
+ rate and uart controls must be setup beforehand.
+
+
+Configuration
+-------------
+
+ The S3C2410 specific configuration in `System Type` defines various
+ aspects of how the S3C2410 suspend and resume support is configured
+
+ `S3C2410 PM Suspend debug`
+
+ This option prints messages to the serial console before and after
+ the actual suspend, giving detailed information on what is
+ happening
+
+
+ `S3C2410 PM Suspend Memory CRC`
+
+ Allows the entire memory to be checksummed before and after the
+ suspend to see if there has been any corruption of the contents.
+
+ Note, the time to calculate the CRC is dependant on the CPU speed
+ and the size of memory. For an 64Mbyte RAM area on an 200MHz
+ S3C2410, this can take approximately 4 seconds to complete.
+
+ This support requires the CRC32 function to be enabled.
+
+
+ `S3C2410 PM Suspend CRC Chunksize (KiB)`
+
+ Defines the size of memory each CRC chunk covers. A smaller value
+ will mean that the CRC data block will take more memory, but will
+ identify any faults with better precision
+
+
+Document Author
+---------------
+
+Ben Dooks, (c) 2004 Simtec Electronics
+
diff --git a/Documentation/arm/Samsung-S3C24XX/USB-Host.txt b/Documentation/arm/Samsung-S3C24XX/USB-Host.txt
new file mode 100644
index 0000000..67671eb
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/USB-Host.txt
@@ -0,0 +1,93 @@
+ S3C24XX USB Host support
+ ========================
+
+
+
+Introduction
+------------
+
+ This document details the S3C2410/S3C2440 in-built OHCI USB host support.
+
+Configuration
+-------------
+
+ Enable at least the following kernel options:
+
+ menuconfig:
+
+ Device Drivers --->
+ USB support --->
+ <*> Support for Host-side USB
+ <*> OHCI HCD support
+
+
+ .config:
+ CONFIG_USB
+ CONFIG_USB_OHCI_HCD
+
+
+ Once these options are configured, the standard set of USB device
+ drivers can be configured and used.
+
+
+Board Support
+-------------
+
+ The driver attaches to a platform device, which will need to be
+ added by the board specific support file in linux/arch/arm/mach-s3c2410,
+ such as mach-bast.c or mach-smdk2410.c
+
+ The platform device's platform_data field is only needed if the
+ board implements extra power control or over-current monitoring.
+
+ The OHCI driver does not ensure the state of the S3C2410's MISCCTRL
+ register, so if both ports are to be used for the host, then it is
+ the board support file's responsibility to ensure that the second
+ port is configured to be connected to the OHCI core.
+
+
+Platform Data
+-------------
+
+ See arch/arm/mach-s3c2410/include/mach/usb-control.h for the
+ descriptions of the platform device data. An implementation
+ can be found in linux/arch/arm/mach-s3c2410/usb-simtec.c .
+
+ The `struct s3c2410_hcd_info` contains a pair of functions
+ that get called to enable over-current detection, and to
+ control the port power status.
+
+ The ports are numbered 0 and 1.
+
+ power_control:
+
+ Called to enable or disable the power on the port.
+
+ enable_oc:
+
+ Called to enable or disable the over-current monitoring.
+ This should claim or release the resources being used to
+ check the power condition on the port, such as an IRQ.
+
+ report_oc:
+
+ The OHCI driver fills this field in for the over-current code
+ to call when there is a change to the over-current state on
+ an port. The ports argument is a bitmask of 1 bit per port,
+ with bit X being 1 for an over-current on port X.
+
+ The function s3c2410_usb_report_oc() has been provided to
+ ensure this is called correctly.
+
+ port[x]:
+
+ This is struct describes each port, 0 or 1. The platform driver
+ should set the flags field of each port to S3C_HCDFLG_USED if
+ the port is enabled.
+
+
+
+Document Author
+---------------
+
+Ben Dooks, (c) 2005 Simtec Electronics
diff --git a/Documentation/arm/Setup b/Documentation/arm/Setup
new file mode 100644
index 0000000..0cb1e64
--- /dev/null
+++ b/Documentation/arm/Setup
@@ -0,0 +1,129 @@
+Kernel initialisation parameters on ARM Linux
+---------------------------------------------
+
+The following document describes the kernel initialisation parameter
+structure, otherwise known as 'struct param_struct' which is used
+for most ARM Linux architectures.
+
+This structure is used to pass initialisation parameters from the
+kernel loader to the Linux kernel proper, and may be short lived
+through the kernel initialisation process. As a general rule, it
+should not be referenced outside of arch/arm/kernel/setup.c:setup_arch().
+
+There are a lot of parameters listed in there, and they are described
+below:
+
+ page_size
+
+ This parameter must be set to the page size of the machine, and
+ will be checked by the kernel.
+
+ nr_pages
+
+ This is the total number of pages of memory in the system. If
+ the memory is banked, then this should contain the total number
+ of pages in the system.
+
+ If the system contains separate VRAM, this value should not
+ include this information.
+
+ ramdisk_size
+
+ This is now obsolete, and should not be used.
+
+ flags
+
+ Various kernel flags, including:
+ bit 0 - 1 = mount root read only
+ bit 1 - unused
+ bit 2 - 0 = load ramdisk
+ bit 3 - 0 = prompt for ramdisk
+
+ rootdev
+
+ major/minor number pair of device to mount as the root filesystem.
+
+ video_num_cols
+ video_num_rows
+
+ These two together describe the character size of the dummy console,
+ or VGA console character size. They should not be used for any other
+ purpose.
+
+ It's generally a good idea to set these to be either standard VGA, or
+ the equivalent character size of your fbcon display. This then allows
+ all the bootup messages to be displayed correctly.
+
+ video_x
+ video_y
+
+ This describes the character position of cursor on VGA console, and
+ is otherwise unused. (should not be used for other console types, and
+ should not be used for other purposes).
+
+ memc_control_reg
+
+ MEMC chip control register for Acorn Archimedes and Acorn A5000
+ based machines. May be used differently by different architectures.
+
+ sounddefault
+
+ Default sound setting on Acorn machines. May be used differently by
+ different architectures.
+
+ adfsdrives
+
+ Number of ADFS/MFM disks. May be used differently by different
+ architectures.
+
+ bytes_per_char_h
+ bytes_per_char_v
+
+ These are now obsolete, and should not be used.
+
+ pages_in_bank[4]
+
+ Number of pages in each bank of the systems memory (used for RiscPC).
+ This is intended to be used on systems where the physical memory
+ is non-contiguous from the processors point of view.
+
+ pages_in_vram
+
+ Number of pages in VRAM (used on Acorn RiscPC). This value may also
+ be used by loaders if the size of the video RAM can't be obtained
+ from the hardware.
+
+ initrd_start
+ initrd_size
+
+ This describes the kernel virtual start address and size of the
+ initial ramdisk.
+
+ rd_start
+
+ Start address in sectors of the ramdisk image on a floppy disk.
+
+ system_rev
+
+ system revision number.
+
+ system_serial_low
+ system_serial_high
+
+ system 64-bit serial number
+
+ mem_fclk_21285
+
+ The speed of the external oscillator to the 21285 (footbridge),
+ which control's the speed of the memory bus, timer & serial port.
+ Depending upon the speed of the cpu its value can be between
+ 0-66 MHz. If no params are passed or a value of zero is passed,
+ then a value of 50 Mhz is the default on 21285 architectures.
+
+ paths[8][128]
+
+ These are now obsolete, and should not be used.
+
+ commandline
+
+ Kernel command line parameters. Details can be found elsewhere.
diff --git a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
new file mode 100644
index 0000000..1e6a23f
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
@@ -0,0 +1,61 @@
+README on the ADC/Touchscreen Controller
+========================================
+
+The LH79524 and LH7A404 include a built-in Analog to Digital
+controller (ADC) that is used to process input from a touchscreen.
+The driver only implements a four-wire touch panel protocol.
+
+The touchscreen driver is maintenance free except for the pen-down or
+touch threshold. Some resistive displays and board combinations may
+require tuning of this threshold. The driver exposes some of it's
+internal state in the sys filesystem. If the kernel is configured
+with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a
+directory
+
+ /sys/devices/platform/adc-lh7.0
+
+containing these files.
+
+ -r--r--r-- 1 root root 4096 Jan 1 00:00 samples
+ -rw-r--r-- 1 root root 4096 Jan 1 00:00 threshold
+ -r--r--r-- 1 root root 4096 Jan 1 00:00 threshold_range
+
+The threshold is the current touch threshold. It defaults to 750 on
+most targets.
+
+ # cat threshold
+ 750
+
+The threshold_range contains the range of valid values for the
+threshold. Values outside of this range will be silently ignored.
+
+ # cat threshold_range
+ 0 1023
+
+To change the threshold, write a value to the threshold file.
+
+ # echo 500 > threshold
+ # cat threshold
+ 500
+
+The samples file contains the most recently sampled values from the
+ADC. There are 12. Below are typical of the last sampled values when
+the pen has been released. The first two and last two samples are for
+detecting whether or not the pen is down. The third through sixth are
+X coordinate samples. The seventh through tenth are Y coordinate
+samples.
+
+ # cat samples
+ 1023 1023 0 0 0 0 530 529 530 529 1023 1023
+
+To determine a reasonable threshold, press on the touch panel with an
+appropriate stylus and read the values from samples.
+
+ # cat samples
+ 1023 676 92 103 101 102 855 919 922 922 1023 679
+
+The first and eleventh samples are discarded. Thus, the important
+values are the second and twelfth which are used to determine if the
+pen is down. When both are below the threshold, the driver registers
+that the pen is down. When either is above the threshold, it
+registers then pen is up.
diff --git a/Documentation/arm/Sharp-LH/CompactFlash b/Documentation/arm/Sharp-LH/CompactFlash
new file mode 100644
index 0000000..8616d87
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/CompactFlash
@@ -0,0 +1,32 @@
+README on the Compact Flash for Card Engines
+============================================
+
+There are three challenges in supporting the CF interface of the Card
+Engines. First, every IO operation must be followed with IO to
+another memory region. Second, the slot is wired for one-to-one
+address mapping *and* it is wired for 16 bit access only. Second, the
+interrupt request line from the CF device isn't wired.
+
+The IOBARRIER issue is covered in README.IOBARRIER. This isn't an
+onerous problem. Enough said here.
+
+The addressing issue is solved in the
+arch/arm/mach-lh7a40x/ide-lpd7a40x.c file with some awkward
+work-arounds. We implement a special SELECT_DRIVE routine that is
+called before the IDE driver performs its own SELECT_DRIVE. Our code
+recognizes that the SELECT register cannot be modified without also
+writing a command. It send an IDLE_IMMEDIATE command on selecting a
+drive. The function also prevents drive select to the slave drive
+since there can be only one. The awkward part is that the IDE driver,
+even though we have a select procedure, also attempts to change the
+drive by writing directly the SELECT register. This attempt is
+explicitly blocked by the OUTB function--not pretty, but effective.
+
+The lack of interrupts is a more serious problem. Even though the CF
+card is fast when compared to a normal IDE device, we don't know that
+the CF is really flash. A user could use one of the very small hard
+drives being shipped with a CF interface. The IDE code includes a
+check for interfaces that lack an IRQ. In these cases, submitting a
+command to the IDE controller is followed by a call to poll for
+completion. If the device isn't immediately ready, it schedules a
+timer to poll again later.
diff --git a/Documentation/arm/Sharp-LH/IOBarrier b/Documentation/arm/Sharp-LH/IOBarrier
new file mode 100644
index 0000000..2e953e2
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/IOBarrier
@@ -0,0 +1,45 @@
+README on the IOBARRIER for CardEngine IO
+=========================================
+
+Due to an unfortunate oversight when the Card Engines were designed,
+the signals that control access to some peripherals, most notably the
+SMC91C9111 ethernet controller, are not properly handled.
+
+The symptom is that some back to back IO with the peripheral returns
+unreliable data. With the SMC chip, you'll see errors about the bank
+register being 'screwed'.
+
+The cause is that the AEN signal to the SMC chip does not transition
+for every memory access. It is driven through the CPLD from the CS7
+line of the CPU's static memory controller which is optimized to
+eliminate unnecessary transitions. Yet, the SMC requires a transition
+for every write access. The Sharp website has more information about
+the effect this power-conserving feature has on peripheral
+interfacing.
+
+The solution is to follow every write access to the SMC chip with an
+access to another memory region that will force the CPU to release the
+chip select line. It is important to guarantee that this access
+forces the CPU off-chip. We map a page of SDRAM as if it were an
+uncacheable IO device and read from it after every SMC IO write
+operation.
+
+ SMC IO
+ BARRIER IO
+
+Only this sequence is important. It does not matter that there is no
+BARRIER IO before the access to the SMC chip because the AEN latch
+only needs occurs after the SMC IO write cycle. The routines that
+implement this work-around make an additional concession which is to
+disable interrupts during the IO sequence. Other hardware devices
+(the LogicPD CPLD) have registers in the same physical memory
+region as the SMC chip. An interrupt might allow an access to one of
+those registers while SMC IO is being performed.
+
+You might be tempted to think that we have to access another device
+attached to the static memory controller, but the empirical evidence
+indicates that this is not so. Mapping 0x00000000 (flash) and
+0xc0000000 (SDRAM) appear to have the same effect. Using SDRAM seems
+to be faster. Choosing to access an undecoded memory region is not
+desirable as there is no way to know how that chip select will be used
+in the future.
diff --git a/Documentation/arm/Sharp-LH/KEV7A400 b/Documentation/arm/Sharp-LH/KEV7A400
new file mode 100644
index 0000000..be32b14
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/KEV7A400
@@ -0,0 +1,8 @@
+README on Implementing Linux for Sharp's KEV7a400
+=================================================
+
+This product has been discontinued by Sharp. For the time being, the
+partially implemented code remains in the kernel. At some point in
+the future, either the code will be finished or it will be removed
+completely. This depends primarily on how many of the development
+boards are in the field.
diff --git a/Documentation/arm/Sharp-LH/LCDPanels b/Documentation/arm/Sharp-LH/LCDPanels
new file mode 100644
index 0000000..fb1b21c
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/LCDPanels
@@ -0,0 +1,59 @@
+README on the LCD Panels
+========================
+
+Configuration options for several LCD panels, available from Logic PD,
+are included in the kernel source. This README will help you
+understand the configuration data and give you some guidance for
+adding support for other panels if you wish.
+
+
+lcd-panels.h
+------------
+
+There is no way, at present, to detect which panel is attached to the
+system at runtime. Thus the kernel configuration is static. The file
+arch/arm/mach-ld7a40x/lcd-panels.h (or similar) defines all of the
+panel specific parameters.
+
+It should be possible for this data to be shared among several device
+families. The current layout may be insufficiently general, but it is
+amenable to improvement.
+
+
+PIXEL_CLOCK
+-----------
+
+The panel data sheets will give a range of acceptable pixel clocks.
+The fundamental LCDCLK input frequency is divided down by a PCD
+constant in field '.tim2'. It may happen that it is impossible to set
+the pixel clock within this range. A clock which is too slow will
+tend to flicker. For the highest quality image, set the clock as high
+as possible.
+
+
+MARGINS
+-------
+
+These values may be difficult to glean from the panel data sheet. In
+the case of the Sharp panels, the upper margin is explicitly called
+out as a specific number of lines from the top of the frame. The
+other values may not matter as much as the panels tend to
+automatically center the image.
+
+
+Sync Sense
+----------
+
+The sense of the hsync and vsync pulses may be called out in the data
+sheet. On one panel, the sense of these pulses determine the height
+of the visible region on the panel. Most of the Sharp panels use
+negative sense sync pulses set by the TIM2_IHS and TIM2_IVS bits in
+'.tim2'.
+
+
+Pel Layout
+----------
+
+The Sharp color TFT panels are all configured for 16 bit direct color
+modes. The amba-lcd driver sets the pel mode to 565 for 5 bits of
+each red and blue and 6 bits of green.
diff --git a/Documentation/arm/Sharp-LH/LPD7A400 b/Documentation/arm/Sharp-LH/LPD7A400
new file mode 100644
index 0000000..3275b45
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/LPD7A400
@@ -0,0 +1,15 @@
+README on Implementing Linux for the Logic PD LPD7A400-10
+=========================================================
+
+- CPLD memory mapping
+
+ The board designers chose to use high address lines for controlling
+ access to the CPLD registers. It turns out to be a big waste
+ because we're using an MMU and must map IO space into virtual
+ memory. The result is that we have to make a mapping for every
+ register.
+
+- Serial Console
+
+ It may be OK not to use the serial console option if the user passes
+ the console device name to the kernel. This deserves some exploration.
diff --git a/Documentation/arm/Sharp-LH/LPD7A40X b/Documentation/arm/Sharp-LH/LPD7A40X
new file mode 100644
index 0000000..8c29a27
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/LPD7A40X
@@ -0,0 +1,16 @@
+README on Implementing Linux for the Logic PD LPD7A40X-10
+=========================================================
+
+- CPLD memory mapping
+
+ The board designers chose to use high address lines for controlling
+ access to the CPLD registers. It turns out to be a big waste
+ because we're using an MMU and must map IO space into virtual
+ memory. The result is that we have to make a mapping for every
+ register.
+
+- Serial Console
+
+ It may be OK not to use the serial console option if the user passes
+ the console device name to the kernel. This deserves some exploration.
+
diff --git a/Documentation/arm/Sharp-LH/SDRAM b/Documentation/arm/Sharp-LH/SDRAM
new file mode 100644
index 0000000..93ddc23
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/SDRAM
@@ -0,0 +1,51 @@
+README on the SDRAM Controller for the LH7a40X
+==============================================
+
+The standard configuration for the SDRAM controller generates a sparse
+memory array. The precise layout is determined by the SDRAM chips. A
+default kernel configuration assembles the discontiguous memory
+regions into separate memory nodes via the NUMA (Non-Uniform Memory
+Architecture) facilities. In this default configuration, the kernel
+is forgiving about the precise layout. As long as it is given an
+accurate picture of available memory by the bootloader the kernel will
+execute correctly.
+
+The SDRC supports a mode where some of the chip select lines are
+swapped in order to make SDRAM look like a synchronous ROM. Setting
+this bit means that the RAM will present as a contiguous array. Some
+programmers prefer this to the discontiguous layout. Be aware that
+may be a penalty for this feature where some some configurations of
+memory are significantly reduced; i.e. 64MiB of RAM appears as only 32
+MiB.
+
+There are a couple of configuration options to override the default
+behavior. When the SROMLL bit is set and memory appears as a
+contiguous array, there is no reason to support NUMA.
+CONFIG_LH7A40X_CONTIGMEM disables NUMA support. When physical memory
+is discontiguous, the memory tables are organized such that there are
+two banks per nodes with a small gap between them. This layout wastes
+some kernel memory for page tables representing non-existent memory.
+CONFIG_LH7A40X_ONE_BANK_PER_NODE optimizes the node tables such that
+there are no gaps. These options control the low level organization
+of the memory management tables in ways that may prevent the kernel
+from booting or may cause the kernel to allocated excessively large
+page tables. Be warned. Only change these options if you know what
+you are doing. The default behavior is a reasonable compromise that
+will suit all users.
+
+--
+
+A typical 32MiB system with the default configuration options will
+find physical memory managed as follows.
+
+ node 0: 0xc0000000 4MiB
+ 0xc1000000 4MiB
+ node 1: 0xc4000000 4MiB
+ 0xc5000000 4MiB
+ node 2: 0xc8000000 4MiB
+ 0xc9000000 4MiB
+ node 3: 0xcc000000 4MiB
+ 0xcd000000 4MiB
+
+Setting CONFIG_LH7A40X_ONE_BANK_PER_NODE will put each bank into a
+separate node.
diff --git a/Documentation/arm/Sharp-LH/VectoredInterruptController b/Documentation/arm/Sharp-LH/VectoredInterruptController
new file mode 100644
index 0000000..23047e9
--- /dev/null
+++ b/Documentation/arm/Sharp-LH/VectoredInterruptController
@@ -0,0 +1,80 @@
+README on the Vectored Interrupt Controller of the LH7A404
+==========================================================
+
+The 404 revision of the LH7A40X series comes with two vectored
+interrupts controllers. While the kernel does use some of the
+features of these devices, it is far from the purpose for which they
+were designed.
+
+When this README was written, the implementation of the VICs was in
+flux. It is possible that some details, especially with priorities,
+will change.
+
+The VIC support code is inspired by routines written by Sharp.
+
+
+Priority Control
+----------------
+
+The significant reason for using the VIC's vectoring is to control
+interrupt priorities. There are two tables in
+arch/arm/mach-lh7a40x/irq-lh7a404.c that look something like this.
+
+ static unsigned char irq_pri_vic1[] = { IRQ_GPIO3INTR, };
+ static unsigned char irq_pri_vic2[] = {
+ IRQ_T3UI, IRQ_GPIO7INTR,
+ IRQ_UART1INTR, IRQ_UART2INTR, IRQ_UART3INTR, };
+
+The initialization code reads these tables and inserts a vector
+address and enable for each indicated IRQ. Vectored interrupts have
+higher priority than non-vectored interrupts. So, on VIC1,
+IRQ_GPIO3INTR will be served before any other non-FIQ interrupt. Due
+to the way that the vectoring works, IRQ_T3UI is the next highest
+priority followed by the other vectored interrupts on VIC2. After
+that, the non-vectored interrupts are scanned in VIC1 then in VIC2.
+
+
+ISR
+---
+
+The interrupt service routine macro get_irqnr() in
+arch/arm/kernel/entry-armv.S scans the VICs for the next active
+interrupt. The vectoring makes this code somewhat larger than it was
+before using vectoring (refer to the LH7A400 implementation). In the
+case where an interrupt is vectored, the implementation will tend to
+be faster than the non-vectored version. However, the worst-case path
+is longer.
+
+It is worth noting that at present, there is no need to read
+VIC2_VECTADDR because the register appears to be shared between the
+controllers. The code is written such that if this changes, it ought
+to still work properly.
+
+
+Vector Addresses
+----------------
+
+The proper use of the vectoring hardware would jump to the ISR
+specified by the vectoring address. Linux isn't structured to take
+advantage of this feature, though it might be possible to change
+things to support it.
+
+In this implementation, the vectoring address is used to speed the
+search for the active IRQ. The address is coded such that the lowest
+6 bits store the IRQ number for vectored interrupts. These numbers
+correspond to the bits in the interrupt status registers. IRQ zero is
+the lowest interrupt bit in VIC1. IRQ 32 is the lowest interrupt bit
+in VIC2. Because zero is a valid IRQ number and because we cannot
+detect whether or not there is a valid vectoring address if that
+address is zero, the eigth bit (0x100) is set for vectored interrupts.
+The address for IRQ 0x18 (VIC2) is 0x118. Only the ninth bit is set
+for the default handler on VIC1 and only the tenth bit is set for the
+default handler on VIC2.
+
+In other words.
+
+ 0x000 - no active interrupt
+ 0x1ii - vectored interrupt 0xii
+ 0x2xx - unvectored interrupt on VIC1 (xx is don't care)
+ 0x4xx - unvectored interrupt on VIC2 (xx is don't care)
+
diff --git a/Documentation/arm/VFP/release-notes.txt b/Documentation/arm/VFP/release-notes.txt
new file mode 100644
index 0000000..28a2795
--- /dev/null
+++ b/Documentation/arm/VFP/release-notes.txt
@@ -0,0 +1,55 @@
+Release notes for Linux Kernel VFP support code
+-----------------------------------------------
+
+Date: 20 May 2004
+Author: Russell King
+
+This is the first release of the Linux Kernel VFP support code. It
+provides support for the exceptions bounced from VFP hardware found
+on ARM926EJ-S.
+
+This release has been validated against the SoftFloat-2b library by
+John R. Hauser using the TestFloat-2a test suite. Details of this
+library and test suite can be found at:
+
+ http://www.jhauser.us/arithmetic/SoftFloat.html
+
+The operations which have been tested with this package are:
+
+ - fdiv
+ - fsub
+ - fadd
+ - fmul
+ - fcmp
+ - fcmpe
+ - fcvtd
+ - fcvts
+ - fsito
+ - ftosi
+ - fsqrt
+
+All the above pass softfloat tests with the following exceptions:
+
+- fadd/fsub shows some differences in the handling of +0 / -0 results
+ when input operands differ in signs.
+- the handling of underflow exceptions is slightly different. If a
+ result underflows before rounding, but becomes a normalised number
+ after rounding, we do not signal an underflow exception.
+
+Other operations which have been tested by basic assembly-only tests
+are:
+
+ - fcpy
+ - fabs
+ - fneg
+ - ftoui
+ - ftosiz
+ - ftouiz
+
+The combination operations have not been tested:
+
+ - fmac
+ - fnmac
+ - fmsc
+ - fnmsc
+ - fnmul
diff --git a/Documentation/arm/mem_alignment b/Documentation/arm/mem_alignment
new file mode 100644
index 0000000..c7c7a11
--- /dev/null
+++ b/Documentation/arm/mem_alignment
@@ -0,0 +1,58 @@
+Too many problems poped up because of unnoticed misaligned memory access in
+kernel code lately. Therefore the alignment fixup is now unconditionally
+configured in for SA11x0 based targets. According to Alan Cox, this is a
+bad idea to configure it out, but Russell King has some good reasons for
+doing so on some f***ed up ARM architectures like the EBSA110. However
+this is not the case on many design I'm aware of, like all SA11x0 based
+ones.
+
+Of course this is a bad idea to rely on the alignment trap to perform
+unaligned memory access in general. If those access are predictable, you
+are better to use the macros provided by include/asm/unaligned.h. The
+alignment trap can fixup misaligned access for the exception cases, but at
+a high performance cost. It better be rare.
+
+Now for user space applications, it is possible to configure the alignment
+trap to SIGBUS any code performing unaligned access (good for debugging bad
+code), or even fixup the access by software like for kernel code. The later
+mode isn't recommended for performance reasons (just think about the
+floating point emulation that works about the same way). Fix your code
+instead!
+
+Please note that randomly changing the behaviour without good thought is
+real bad - it changes the behaviour of all unaligned instructions in user
+space, and might cause programs to fail unexpectedly.
+
+To change the alignment trap behavior, simply echo a number into
+/proc/cpu/alignment. The number is made up from various bits:
+
+bit behavior when set
+--- -----------------
+
+0 A user process performing an unaligned memory access
+ will cause the kernel to print a message indicating
+ process name, pid, pc, instruction, address, and the
+ fault code.
+
+1 The kernel will attempt to fix up the user process
+ performing the unaligned access. This is of course
+ slow (think about the floating point emulator) and
+ not recommended for production use.
+
+2 The kernel will send a SIGBUS signal to the user process
+ performing the unaligned access.
+
+Note that not all combinations are supported - only values 0 through 5.
+(6 and 7 don't make sense).
+
+For example, the following will turn on the warnings, but without
+fixing up or sending SIGBUS signals:
+
+ echo 1 > /proc/sys/debug/alignment
+
+You can also read the content of the same file to get statistical
+information on unaligned access occurrences plus the current mode of
+operation for user space code.
+
+
+Nicolas Pitre, Mar 13, 2001. Modified Russell King, Nov 30, 2001.
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
new file mode 100644
index 0000000..dc60455
--- /dev/null
+++ b/Documentation/arm/memory.txt
@@ -0,0 +1,74 @@
+ Kernel Memory Layout on ARM Linux
+
+ Russell King <rmk@arm.linux.org.uk>
+ November 17, 2005 (2.6.15)
+
+This document describes the virtual memory layout which the Linux
+kernel uses for ARM processors. It indicates which regions are
+free for platforms to use, and which are used by generic code.
+
+The ARM CPU is capable of addressing a maximum of 4GB virtual memory
+space, and this must be shared between user space processes, the
+kernel, and hardware devices.
+
+As the ARM architecture matures, it becomes necessary to reserve
+certain regions of VM space for use for new facilities; therefore
+this document may reserve more VM space over time.
+
+Start End Use
+--------------------------------------------------------------------------
+ffff8000 ffffffff copy_user_page / clear_user_page use.
+ For SA11xx and Xscale, this is used to
+ setup a minicache mapping.
+
+ffff1000 ffff7fff Reserved.
+ Platforms must not use this address range.
+
+ffff0000 ffff0fff CPU vector page.
+ The CPU vectors are mapped here if the
+ CPU supports vector relocation (control
+ register V bit.)
+
+ffc00000 fffeffff DMA memory mapping region. Memory returned
+ by the dma_alloc_xxx functions will be
+ dynamically mapped here.
+
+ff000000 ffbfffff Reserved for future expansion of DMA
+ mapping region.
+
+VMALLOC_END feffffff Free for platform use, recommended.
+ VMALLOC_END must be aligned to a 2MB
+ boundary.
+
+VMALLOC_START VMALLOC_END-1 vmalloc() / ioremap() space.
+ Memory returned by vmalloc/ioremap will
+ be dynamically placed in this region.
+ VMALLOC_START may be based upon the value
+ of the high_memory variable.
+
+PAGE_OFFSET high_memory-1 Kernel direct-mapped RAM region.
+ This maps the platforms RAM, and typically
+ maps all platform RAM in a 1:1 relationship.
+
+TASK_SIZE PAGE_OFFSET-1 Kernel module space
+ Kernel modules inserted via insmod are
+ placed here using dynamic mappings.
+
+00001000 TASK_SIZE-1 User space mappings
+ Per-thread mappings are placed here via
+ the mmap() system call.
+
+00000000 00000fff CPU vector page / null pointer trap
+ CPUs which do not support vector remapping
+ place their vector page here. NULL pointer
+ dereferences by both the kernel and user
+ space are also caught via this mapping.
+
+Please note that mappings which collide with the above areas may result
+in a non-bootable kernel, or may cause the kernel to (eventually) panic
+at run time.
+
+Since future CPUs may impact the kernel mapping layout, user programs
+must not access any memory which is not mapped inside their 0x0001000
+to TASK_SIZE address range. If they wish to access these areas, they
+must set up their own mappings using open() and mmap().
diff --git a/Documentation/arm/nwfpe/NOTES b/Documentation/arm/nwfpe/NOTES
new file mode 100644
index 0000000..40577b5
--- /dev/null
+++ b/Documentation/arm/nwfpe/NOTES
@@ -0,0 +1,29 @@
+There seems to be a problem with exp(double) and our emulator. I haven't
+been able to track it down yet. This does not occur with the emulator
+supplied by Russell King.
+
+I also found one oddity in the emulator. I don't think it is serious but
+will point it out. The ARM calling conventions require floating point
+registers f4-f7 to be preserved over a function call. The compiler quite
+often uses an stfe instruction to save f4 on the stack upon entry to a
+function, and an ldfe instruction to restore it before returning.
+
+I was looking at some code, that calculated a double result, stored it in f4
+then made a function call. Upon return from the function call the number in
+f4 had been converted to an extended value in the emulator.
+
+This is a side effect of the stfe instruction. The double in f4 had to be
+converted to extended, then stored. If an lfm/sfm combination had been used,
+then no conversion would occur. This has performance considerations. The
+result from the function call and f4 were used in a multiplication. If the
+emulator sees a multiply of a double and extended, it promotes the double to
+extended, then does the multiply in extended precision.
+
+This code will cause this problem:
+
+double x, y, z;
+z = log(x)/log(y);
+
+The result of log(x) (a double) will be calculated, returned in f0, then
+moved to f4 to preserve it over the log(y) call. The division will be done
+in extended precision, due to the stfe instruction used to save f4 in log(y).
diff --git a/Documentation/arm/nwfpe/README b/Documentation/arm/nwfpe/README
new file mode 100644
index 0000000..771871d
--- /dev/null
+++ b/Documentation/arm/nwfpe/README
@@ -0,0 +1,70 @@
+This directory contains the version 0.92 test release of the NetWinder
+Floating Point Emulator.
+
+The majority of the code was written by me, Scott Bambrough It is
+written in C, with a small number of routines in inline assembler
+where required. It was written quickly, with a goal of implementing a
+working version of all the floating point instructions the compiler
+emits as the first target. I have attempted to be as optimal as
+possible, but there remains much room for improvement.
+
+I have attempted to make the emulator as portable as possible. One of
+the problems is with leading underscores on kernel symbols. Elf
+kernels have no leading underscores, a.out compiled kernels do. I
+have attempted to use the C_SYMBOL_NAME macro wherever this may be
+important.
+
+Another choice I made was in the file structure. I have attempted to
+contain all operating system specific code in one module (fpmodule.*).
+All the other files contain emulator specific code. This should allow
+others to port the emulator to NetBSD for instance relatively easily.
+
+The floating point operations are based on SoftFloat Release 2, by
+John Hauser. SoftFloat is a software implementation of floating-point
+that conforms to the IEC/IEEE Standard for Binary Floating-point
+Arithmetic. As many as four formats are supported: single precision,
+double precision, extended double precision, and quadruple precision.
+All operations required by the standard are implemented, except for
+conversions to and from decimal. We use only the single precision,
+double precision and extended double precision formats. The port of
+SoftFloat to the ARM was done by Phil Blundell, based on an earlier
+port of SoftFloat version 1 by Neil Carson for NetBSD/arm32.
+
+The file README.FPE contains a description of what has been implemented
+so far in the emulator. The file TODO contains a information on what
+remains to be done, and other ideas for the emulator.
+
+Bug reports, comments, suggestions should be directed to me at
+<scottb@netwinder.org>. General reports of "this program doesn't
+work correctly when your emulator is installed" are useful for
+determining that bugs still exist; but are virtually useless when
+attempting to isolate the problem. Please report them, but don't
+expect quick action. Bugs still exist. The problem remains in isolating
+which instruction contains the bug. Small programs illustrating a specific
+problem are a godsend.
+
+Legal Notices
+-------------
+
+The NetWinder Floating Point Emulator is free software. Everything Rebel.com
+has written is provided under the GNU GPL. See the file COPYING for copying
+conditions. Excluded from the above is the SoftFloat code. John Hauser's
+legal notice for SoftFloat is included below.
+
+-------------------------------------------------------------------------------
+SoftFloat Legal Notice
+
+SoftFloat was written by John R. Hauser. This work was made possible in
+part by the International Computer Science Institute, located at Suite 600,
+1947 Center Street, Berkeley, California 94704. Funding was partially
+provided by the National Science Foundation under grant MIP-9311980. The
+original version of this code was written as part of a project to build
+a fixed-point vector processor in collaboration with the University of
+California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
+has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
+TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
+PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
+AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
+-------------------------------------------------------------------------------
diff --git a/Documentation/arm/nwfpe/README.FPE b/Documentation/arm/nwfpe/README.FPE
new file mode 100644
index 0000000..26f5d7b
--- /dev/null
+++ b/Documentation/arm/nwfpe/README.FPE
@@ -0,0 +1,156 @@
+The following describes the current state of the NetWinder's floating point
+emulator.
+
+In the following nomenclature is used to describe the floating point
+instructions. It follows the conventions in the ARM manual.
+
+<S|D|E> = <single|double|extended>, no default
+{P|M|Z} = {round to +infinity,round to -infinity,round to zero},
+ default = round to nearest
+
+Note: items enclosed in {} are optional.
+
+Floating Point Coprocessor Data Transfer Instructions (CPDT)
+------------------------------------------------------------
+
+LDF/STF - load and store floating
+
+<LDF|STF>{cond}<S|D|E> Fd, Rn
+<LDF|STF>{cond}<S|D|E> Fd, [Rn, #<expression>]{!}
+<LDF|STF>{cond}<S|D|E> Fd, [Rn], #<expression>
+
+These instructions are fully implemented.
+
+LFM/SFM - load and store multiple floating
+
+Form 1 syntax:
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn]
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn, #<expression>]{!}
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn], #<expression>
+
+Form 2 syntax:
+<LFM|SFM>{cond}<FD,EA> Fd, <count>, [Rn]{!}
+
+These instructions are fully implemented. They store/load three words
+for each floating point register into the memory location given in the
+instruction. The format in memory is unlikely to be compatible with
+other implementations, in particular the actual hardware. Specific
+mention of this is made in the ARM manuals.
+
+Floating Point Coprocessor Register Transfer Instructions (CPRT)
+----------------------------------------------------------------
+
+Conversions, read/write status/control register instructions
+
+FLT{cond}<S,D,E>{P,M,Z} Fn, Rd Convert integer to floating point
+FIX{cond}{P,M,Z} Rd, Fn Convert floating point to integer
+WFS{cond} Rd Write floating point status register
+RFS{cond} Rd Read floating point status register
+WFC{cond} Rd Write floating point control register
+RFC{cond} Rd Read floating point control register
+
+FLT/FIX are fully implemented.
+
+RFS/WFS are fully implemented.
+
+RFC/WFC are fully implemented. RFC/WFC are supervisor only instructions, and
+presently check the CPU mode, and do an invalid instruction trap if not called
+from supervisor mode.
+
+Compare instructions
+
+CMF{cond} Fn, Fm Compare floating
+CMFE{cond} Fn, Fm Compare floating with exception
+CNF{cond} Fn, Fm Compare negated floating
+CNFE{cond} Fn, Fm Compare negated floating with exception
+
+These are fully implemented.
+
+Floating Point Coprocessor Data Instructions (CPDT)
+---------------------------------------------------
+
+Dyadic operations:
+
+ADF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - add
+SUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - subtract
+RSF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse subtract
+MUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - multiply
+DVF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - divide
+RDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse divide
+
+These are fully implemented.
+
+FML{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast multiply
+FDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast divide
+FRD{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast reverse divide
+
+These are fully implemented as well. They use the same algorithm as the
+non-fast versions. Hence, in this implementation their performance is
+equivalent to the MUF/DVF/RDV instructions. This is acceptable according
+to the ARM manual. The manual notes these are defined only for single
+operands, on the actual FPA11 hardware they do not work for double or
+extended precision operands. The emulator currently does not check
+the requested permissions conditions, and performs the requested operation.
+
+RMF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - IEEE remainder
+
+This is fully implemented.
+
+Monadic operations:
+
+MVF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move
+MNF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move negated
+
+These are fully implemented.
+
+ABS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - absolute value
+SQT{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - square root
+RND{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - round
+
+These are fully implemented.
+
+URD{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - unnormalized round
+NRM{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - normalize
+
+These are implemented. URD is implemented using the same code as the RND
+instruction. Since URD cannot return a unnormalized number, NRM becomes
+a NOP.
+
+Library calls:
+
+POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
+RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
+POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
+
+LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
+LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e
+EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
+SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
+COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
+TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
+ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
+ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
+ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
+
+These are not implemented. They are not currently issued by the compiler,
+and are handled by routines in libc. These are not implemented by the FPA11
+hardware, but are handled by the floating point support code. They should
+be implemented in future versions.
+
+Signalling:
+
+Signals are implemented. However current ELF kernels produced by Rebel.com
+have a bug in them that prevents the module from generating a SIGFPE. This
+is caused by a failure to alias fp_current to the kernel variable
+current_set[0] correctly.
+
+The kernel provided with this distribution (vmlinux-nwfpe-0.93) contains
+a fix for this problem and also incorporates the current version of the
+emulator directly. It is possible to run with no floating point module
+loaded with this kernel. It is provided as a demonstration of the
+technology and for those who want to do floating point work that depends
+on signals. It is not strictly necessary to use the module.
+
+A module (either the one provided by Russell King, or the one in this
+distribution) can be loaded to replace the functionality of the emulator
+built into the kernel.
diff --git a/Documentation/arm/nwfpe/TODO b/Documentation/arm/nwfpe/TODO
new file mode 100644
index 0000000..8027061
--- /dev/null
+++ b/Documentation/arm/nwfpe/TODO
@@ -0,0 +1,67 @@
+TODO LIST
+---------
+
+POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
+RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
+POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
+
+LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
+LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e
+EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
+SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
+COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
+TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
+ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
+ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
+ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
+
+These are not implemented. They are not currently issued by the compiler,
+and are handled by routines in libc. These are not implemented by the FPA11
+hardware, but are handled by the floating point support code. They should
+be implemented in future versions.
+
+There are a couple of ways to approach the implementation of these. One
+method would be to use accurate table methods for these routines. I have
+a couple of papers by S. Gal from IBM's research labs in Haifa, Israel that
+seem to promise extreme accuracy (in the order of 99.8%) and reasonable speed.
+These methods are used in GLIBC for some of the transcendental functions.
+
+Another approach, which I know little about is CORDIC. This stands for
+Coordinate Rotation Digital Computer, and is a method of computing
+transcendental functions using mostly shifts and adds and a few
+multiplications and divisions. The ARM excels at shifts and adds,
+so such a method could be promising, but requires more research to
+determine if it is feasible.
+
+Rounding Methods
+
+The IEEE standard defines 4 rounding modes. Round to nearest is the
+default, but rounding to + or - infinity or round to zero are also allowed.
+Many architectures allow the rounding mode to be specified by modifying bits
+in a control register. Not so with the ARM FPA11 architecture. To change
+the rounding mode one must specify it with each instruction.
+
+This has made porting some benchmarks difficult. It is possible to
+introduce such a capability into the emulator. The FPCR contains
+bits describing the rounding mode. The emulator could be altered to
+examine a flag, which if set forced it to ignore the rounding mode in
+the instruction, and use the mode specified in the bits in the FPCR.
+
+This would require a method of getting/setting the flag, and the bits
+in the FPCR. This requires a kernel call in ArmLinux, as WFC/RFC are
+supervisor only instructions. If anyone has any ideas or comments I
+would like to hear them.
+
+[NOTE: pulled out from some docs on ARM floating point, specifically
+ for the Acorn FPE, but not limited to it:
+
+ The floating point control register (FPCR) may only be present in some
+ implementations: it is there to control the hardware in an implementation-
+ specific manner, for example to disable the floating point system. The user
+ mode of the ARM is not permitted to use this register (since the right is
+ reserved to alter it between implementations) and the WFC and RFC
+ instructions will trap if tried in user mode.
+
+ Hence, the answer is yes, you could do this, but then you will run a high
+ risk of becoming isolated if and when hardware FP emulation comes out
+ -- Russell].
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
new file mode 100644
index 0000000..4ef2450
--- /dev/null
+++ b/Documentation/atomic_ops.txt
@@ -0,0 +1,547 @@
+ Semantics and Behavior of Atomic and
+ Bitmask Operations
+
+ David S. Miller
+
+ This document is intended to serve as a guide to Linux port
+maintainers on how to implement atomic counter, bitops, and spinlock
+interfaces properly.
+
+ The atomic_t type should be defined as a signed integer.
+Also, it should be made opaque such that any kind of cast to a normal
+C integer type will fail. Something like the following should
+suffice:
+
+ typedef struct { volatile int counter; } atomic_t;
+
+Historically, counter has been declared volatile. This is now discouraged.
+See Documentation/volatile-considered-harmful.txt for the complete rationale.
+
+local_t is very similar to atomic_t. If the counter is per CPU and only
+updated by one CPU, local_t is probably more appropriate. Please see
+Documentation/local_ops.txt for the semantics of local_t.
+
+The first operations to implement for atomic_t's are the initializers and
+plain reads.
+
+ #define ATOMIC_INIT(i) { (i) }
+ #define atomic_set(v, i) ((v)->counter = (i))
+
+The first macro is used in definitions, such as:
+
+static atomic_t my_counter = ATOMIC_INIT(1);
+
+The initializer is atomic in that the return values of the atomic operations
+are guaranteed to be correct reflecting the initialized value if the
+initializer is used before runtime. If the initializer is used at runtime, a
+proper implicit or explicit read memory barrier is needed before reading the
+value with atomic_read from another thread.
+
+The second interface can be used at runtime, as in:
+
+ struct foo { atomic_t counter; };
+ ...
+
+ struct foo *k;
+
+ k = kmalloc(sizeof(*k), GFP_KERNEL);
+ if (!k)
+ return -ENOMEM;
+ atomic_set(&k->counter, 0);
+
+The setting is atomic in that the return values of the atomic operations by
+all threads are guaranteed to be correct reflecting either the value that has
+been set with this operation or set with another operation. A proper implicit
+or explicit memory barrier is needed before the value set with the operation
+is guaranteed to be readable with atomic_read from another thread.
+
+Next, we have:
+
+ #define atomic_read(v) ((v)->counter)
+
+which simply reads the counter value currently visible to the calling thread.
+The read is atomic in that the return value is guaranteed to be one of the
+values initialized or modified with the interface operations if a proper
+implicit or explicit memory barrier is used after possible runtime
+initialization by any other thread and the value is modified only with the
+interface operations. atomic_read does not guarantee that the runtime
+initialization by any other thread is visible yet, so the user of the
+interface must take care of that with a proper implicit or explicit memory
+barrier.
+
+*** WARNING: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS! ***
+
+Some architectures may choose to use the volatile keyword, barriers, or inline
+assembly to guarantee some degree of immediacy for atomic_read() and
+atomic_set(). This is not uniformly guaranteed, and may change in the future,
+so all users of atomic_t should treat atomic_read() and atomic_set() as simple
+C statements that may be reordered or optimized away entirely by the compiler
+or processor, and explicitly invoke the appropriate compiler and/or memory
+barrier for each use case. Failure to do so will result in code that may
+suddenly break when used with different architectures or compiler
+optimizations, or even changes in unrelated code which changes how the
+compiler optimizes the section accessing atomic_t variables.
+
+*** YOU HAVE BEEN WARNED! ***
+
+Now, we move onto the atomic operation interfaces typically implemented with
+the help of assembly code.
+
+ void atomic_add(int i, atomic_t *v);
+ void atomic_sub(int i, atomic_t *v);
+ void atomic_inc(atomic_t *v);
+ void atomic_dec(atomic_t *v);
+
+These four routines add and subtract integral values to/from the given
+atomic_t value. The first two routines pass explicit integers by
+which to make the adjustment, whereas the latter two use an implicit
+adjustment value of "1".
+
+One very important aspect of these two routines is that they DO NOT
+require any explicit memory barriers. They need only perform the
+atomic_t counter update in an SMP safe manner.
+
+Next, we have:
+
+ int atomic_inc_return(atomic_t *v);
+ int atomic_dec_return(atomic_t *v);
+
+These routines add 1 and subtract 1, respectively, from the given
+atomic_t and return the new counter value after the operation is
+performed.
+
+Unlike the above routines, it is required that explicit memory
+barriers are performed before and after the operation. It must be
+done such that all memory operations before and after the atomic
+operation calls are strongly ordered with respect to the atomic
+operation itself.
+
+For example, it should behave as if a smp_mb() call existed both
+before and after the atomic operation.
+
+If the atomic instructions used in an implementation provide explicit
+memory barrier semantics which satisfy the above requirements, that is
+fine as well.
+
+Let's move on:
+
+ int atomic_add_return(int i, atomic_t *v);
+ int atomic_sub_return(int i, atomic_t *v);
+
+These behave just like atomic_{inc,dec}_return() except that an
+explicit counter adjustment is given instead of the implicit "1".
+This means that like atomic_{inc,dec}_return(), the memory barrier
+semantics are required.
+
+Next:
+
+ int atomic_inc_and_test(atomic_t *v);
+ int atomic_dec_and_test(atomic_t *v);
+
+These two routines increment and decrement by 1, respectively, the
+given atomic counter. They return a boolean indicating whether the
+resulting counter value was zero or not.
+
+It requires explicit memory barrier semantics around the operation as
+above.
+
+ int atomic_sub_and_test(int i, atomic_t *v);
+
+This is identical to atomic_dec_and_test() except that an explicit
+decrement is given instead of the implicit "1". It requires explicit
+memory barrier semantics around the operation.
+
+ int atomic_add_negative(int i, atomic_t *v);
+
+The given increment is added to the given atomic counter value. A
+boolean is return which indicates whether the resulting counter value
+is negative. It requires explicit memory barrier semantics around the
+operation.
+
+Then:
+
+ int atomic_xchg(atomic_t *v, int new);
+
+This performs an atomic exchange operation on the atomic variable v, setting
+the given new value. It returns the old value that the atomic variable v had
+just before the operation.
+
+ int atomic_cmpxchg(atomic_t *v, int old, int new);
+
+This performs an atomic compare exchange operation on the atomic value v,
+with the given old and new values. Like all atomic_xxx operations,
+atomic_cmpxchg will only satisfy its atomicity semantics as long as all
+other accesses of *v are performed through atomic_xxx operations.
+
+atomic_cmpxchg requires explicit memory barriers around the operation.
+
+The semantics for atomic_cmpxchg are the same as those defined for 'cas'
+below.
+
+Finally:
+
+ int atomic_add_unless(atomic_t *v, int a, int u);
+
+If the atomic value v is not equal to u, this function adds a to v, and
+returns non zero. If v is equal to u then it returns zero. This is done as
+an atomic operation.
+
+atomic_add_unless requires explicit memory barriers around the operation
+unless it fails (returns 0).
+
+atomic_inc_not_zero, equivalent to atomic_add_unless(v, 1, 0)
+
+
+If a caller requires memory barrier semantics around an atomic_t
+operation which does not return a value, a set of interfaces are
+defined which accomplish this:
+
+ void smp_mb__before_atomic_dec(void);
+ void smp_mb__after_atomic_dec(void);
+ void smp_mb__before_atomic_inc(void);
+ void smp_mb__after_atomic_inc(void);
+
+For example, smp_mb__before_atomic_dec() can be used like so:
+
+ obj->dead = 1;
+ smp_mb__before_atomic_dec();
+ atomic_dec(&obj->ref_count);
+
+It makes sure that all memory operations preceding the atomic_dec()
+call are strongly ordered with respect to the atomic counter
+operation. In the above example, it guarantees that the assignment of
+"1" to obj->dead will be globally visible to other cpus before the
+atomic counter decrement.
+
+Without the explicit smp_mb__before_atomic_dec() call, the
+implementation could legally allow the atomic counter update visible
+to other cpus before the "obj->dead = 1;" assignment.
+
+The other three interfaces listed are used to provide explicit
+ordering with respect to memory operations after an atomic_dec() call
+(smp_mb__after_atomic_dec()) and around atomic_inc() calls
+(smp_mb__{before,after}_atomic_inc()).
+
+A missing memory barrier in the cases where they are required by the
+atomic_t implementation above can have disastrous results. Here is
+an example, which follows a pattern occurring frequently in the Linux
+kernel. It is the use of atomic counters to implement reference
+counting, and it works such that once the counter falls to zero it can
+be guaranteed that no other entity can be accessing the object:
+
+static void obj_list_add(struct obj *obj)
+{
+ obj->active = 1;
+ list_add(&obj->list);
+}
+
+static void obj_list_del(struct obj *obj)
+{
+ list_del(&obj->list);
+ obj->active = 0;
+}
+
+static void obj_destroy(struct obj *obj)
+{
+ BUG_ON(obj->active);
+ kfree(obj);
+}
+
+struct obj *obj_list_peek(struct list_head *head)
+{
+ if (!list_empty(head)) {
+ struct obj *obj;
+
+ obj = list_entry(head->next, struct obj, list);
+ atomic_inc(&obj->refcnt);
+ return obj;
+ }
+ return NULL;
+}
+
+void obj_poke(void)
+{
+ struct obj *obj;
+
+ spin_lock(&global_list_lock);
+ obj = obj_list_peek(&global_list);
+ spin_unlock(&global_list_lock);
+
+ if (obj) {
+ obj->ops->poke(obj);
+ if (atomic_dec_and_test(&obj->refcnt))
+ obj_destroy(obj);
+ }
+}
+
+void obj_timeout(struct obj *obj)
+{
+ spin_lock(&global_list_lock);
+ obj_list_del(obj);
+ spin_unlock(&global_list_lock);
+
+ if (atomic_dec_and_test(&obj->refcnt))
+ obj_destroy(obj);
+}
+
+(This is a simplification of the ARP queue management in the
+ generic neighbour discover code of the networking. Olaf Kirch
+ found a bug wrt. memory barriers in kfree_skb() that exposed
+ the atomic_t memory barrier requirements quite clearly.)
+
+Given the above scheme, it must be the case that the obj->active
+update done by the obj list deletion be visible to other processors
+before the atomic counter decrement is performed.
+
+Otherwise, the counter could fall to zero, yet obj->active would still
+be set, thus triggering the assertion in obj_destroy(). The error
+sequence looks like this:
+
+ cpu 0 cpu 1
+ obj_poke() obj_timeout()
+ obj = obj_list_peek();
+ ... gains ref to obj, refcnt=2
+ obj_list_del(obj);
+ obj->active = 0 ...
+ ... visibility delayed ...
+ atomic_dec_and_test()
+ ... refcnt drops to 1 ...
+ atomic_dec_and_test()
+ ... refcount drops to 0 ...
+ obj_destroy()
+ BUG() triggers since obj->active
+ still seen as one
+ obj->active update visibility occurs
+
+With the memory barrier semantics required of the atomic_t operations
+which return values, the above sequence of memory visibility can never
+happen. Specifically, in the above case the atomic_dec_and_test()
+counter decrement would not become globally visible until the
+obj->active update does.
+
+As a historical note, 32-bit Sparc used to only allow usage of
+24-bits of it's atomic_t type. This was because it used 8 bits
+as a spinlock for SMP safety. Sparc32 lacked a "compare and swap"
+type instruction. However, 32-bit Sparc has since been moved over
+to a "hash table of spinlocks" scheme, that allows the full 32-bit
+counter to be realized. Essentially, an array of spinlocks are
+indexed into based upon the address of the atomic_t being operated
+on, and that lock protects the atomic operation. Parisc uses the
+same scheme.
+
+Another note is that the atomic_t operations returning values are
+extremely slow on an old 386.
+
+We will now cover the atomic bitmask operations. You will find that
+their SMP and memory barrier semantics are similar in shape and scope
+to the atomic_t ops above.
+
+Native atomic bit operations are defined to operate on objects aligned
+to the size of an "unsigned long" C data type, and are least of that
+size. The endianness of the bits within each "unsigned long" are the
+native endianness of the cpu.
+
+ void set_bit(unsigned long nr, volatile unsigned long *addr);
+ void clear_bit(unsigned long nr, volatile unsigned long *addr);
+ void change_bit(unsigned long nr, volatile unsigned long *addr);
+
+These routines set, clear, and change, respectively, the bit number
+indicated by "nr" on the bit mask pointed to by "ADDR".
+
+They must execute atomically, yet there are no implicit memory barrier
+semantics required of these interfaces.
+
+ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr);
+ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr);
+ int test_and_change_bit(unsigned long nr, volatile unsigned long *addr);
+
+Like the above, except that these routines return a boolean which
+indicates whether the changed bit was set _BEFORE_ the atomic bit
+operation.
+
+WARNING! It is incredibly important that the value be a boolean,
+ie. "0" or "1". Do not try to be fancy and save a few instructions by
+declaring the above to return "long" and just returning something like
+"old_val & mask" because that will not work.
+
+For one thing, this return value gets truncated to int in many code
+paths using these interfaces, so on 64-bit if the bit is set in the
+upper 32-bits then testers will never see that.
+
+One great example of where this problem crops up are the thread_info
+flag operations. Routines such as test_and_set_ti_thread_flag() chop
+the return value into an int. There are other places where things
+like this occur as well.
+
+These routines, like the atomic_t counter operations returning values,
+require explicit memory barrier semantics around their execution. All
+memory operations before the atomic bit operation call must be made
+visible globally before the atomic bit operation is made visible.
+Likewise, the atomic bit operation must be visible globally before any
+subsequent memory operation is made visible. For example:
+
+ obj->dead = 1;
+ if (test_and_set_bit(0, &obj->flags))
+ /* ... */;
+ obj->killed = 1;
+
+The implementation of test_and_set_bit() must guarantee that
+"obj->dead = 1;" is visible to cpus before the atomic memory operation
+done by test_and_set_bit() becomes visible. Likewise, the atomic
+memory operation done by test_and_set_bit() must become visible before
+"obj->killed = 1;" is visible.
+
+Finally there is the basic operation:
+
+ int test_bit(unsigned long nr, __const__ volatile unsigned long *addr);
+
+Which returns a boolean indicating if bit "nr" is set in the bitmask
+pointed to by "addr".
+
+If explicit memory barriers are required around clear_bit() (which
+does not return a value, and thus does not need to provide memory
+barrier semantics), two interfaces are provided:
+
+ void smp_mb__before_clear_bit(void);
+ void smp_mb__after_clear_bit(void);
+
+They are used as follows, and are akin to their atomic_t operation
+brothers:
+
+ /* All memory operations before this call will
+ * be globally visible before the clear_bit().
+ */
+ smp_mb__before_clear_bit();
+ clear_bit( ... );
+
+ /* The clear_bit() will be visible before all
+ * subsequent memory operations.
+ */
+ smp_mb__after_clear_bit();
+
+There are two special bitops with lock barrier semantics (acquire/release,
+same as spinlocks). These operate in the same way as their non-_lock/unlock
+postfixed variants, except that they are to provide acquire/release semantics,
+respectively. This means they can be used for bit_spin_trylock and
+bit_spin_unlock type operations without specifying any more barriers.
+
+ int test_and_set_bit_lock(unsigned long nr, unsigned long *addr);
+ void clear_bit_unlock(unsigned long nr, unsigned long *addr);
+ void __clear_bit_unlock(unsigned long nr, unsigned long *addr);
+
+The __clear_bit_unlock version is non-atomic, however it still implements
+unlock barrier semantics. This can be useful if the lock itself is protecting
+the other bits in the word.
+
+Finally, there are non-atomic versions of the bitmask operations
+provided. They are used in contexts where some other higher-level SMP
+locking scheme is being used to protect the bitmask, and thus less
+expensive non-atomic operations may be used in the implementation.
+They have names similar to the above bitmask operation interfaces,
+except that two underscores are prefixed to the interface name.
+
+ void __set_bit(unsigned long nr, volatile unsigned long *addr);
+ void __clear_bit(unsigned long nr, volatile unsigned long *addr);
+ void __change_bit(unsigned long nr, volatile unsigned long *addr);
+ int __test_and_set_bit(unsigned long nr, volatile unsigned long *addr);
+ int __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr);
+ int __test_and_change_bit(unsigned long nr, volatile unsigned long *addr);
+
+These non-atomic variants also do not require any special memory
+barrier semantics.
+
+The routines xchg() and cmpxchg() need the same exact memory barriers
+as the atomic and bit operations returning values.
+
+Spinlocks and rwlocks have memory barrier expectations as well.
+The rule to follow is simple:
+
+1) When acquiring a lock, the implementation must make it globally
+ visible before any subsequent memory operation.
+
+2) When releasing a lock, the implementation must make it such that
+ all previous memory operations are globally visible before the
+ lock release.
+
+Which finally brings us to _atomic_dec_and_lock(). There is an
+architecture-neutral version implemented in lib/dec_and_lock.c,
+but most platforms will wish to optimize this in assembler.
+
+ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
+
+Atomically decrement the given counter, and if will drop to zero
+atomically acquire the given spinlock and perform the decrement
+of the counter to zero. If it does not drop to zero, do nothing
+with the spinlock.
+
+It is actually pretty simple to get the memory barrier correct.
+Simply satisfy the spinlock grab requirements, which is make
+sure the spinlock operation is globally visible before any
+subsequent memory operation.
+
+We can demonstrate this operation more clearly if we define
+an abstract atomic operation:
+
+ long cas(long *mem, long old, long new);
+
+"cas" stands for "compare and swap". It atomically:
+
+1) Compares "old" with the value currently at "mem".
+2) If they are equal, "new" is written to "mem".
+3) Regardless, the current value at "mem" is returned.
+
+As an example usage, here is what an atomic counter update
+might look like:
+
+void example_atomic_inc(long *counter)
+{
+ long old, new, ret;
+
+ while (1) {
+ old = *counter;
+ new = old + 1;
+
+ ret = cas(counter, old, new);
+ if (ret == old)
+ break;
+ }
+}
+
+Let's use cas() in order to build a pseudo-C atomic_dec_and_lock():
+
+int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+{
+ long old, new, ret;
+ int went_to_zero;
+
+ went_to_zero = 0;
+ while (1) {
+ old = atomic_read(atomic);
+ new = old - 1;
+ if (new == 0) {
+ went_to_zero = 1;
+ spin_lock(lock);
+ }
+ ret = cas(atomic, old, new);
+ if (ret == old)
+ break;
+ if (went_to_zero) {
+ spin_unlock(lock);
+ went_to_zero = 0;
+ }
+ }
+
+ return went_to_zero;
+}
+
+Now, as far as memory barriers go, as long as spin_lock()
+strictly orders all subsequent memory operations (including
+the cas()) with respect to itself, things will be fine.
+
+Said another way, _atomic_dec_and_lock() must guarantee that
+a counter dropping to zero is never made visible before the
+spinlock being acquired.
+
+Note that this also means that for the case where the counter
+is not dropping to zero, there are no memory ordering
+requirements.
diff --git a/Documentation/auxdisplay/.gitignore b/Documentation/auxdisplay/.gitignore
new file mode 100644
index 0000000..7af2228
--- /dev/null
+++ b/Documentation/auxdisplay/.gitignore
@@ -0,0 +1 @@
+cfag12864b-example
diff --git a/Documentation/auxdisplay/Makefile b/Documentation/auxdisplay/Makefile
new file mode 100644
index 0000000..51fe233
--- /dev/null
+++ b/Documentation/auxdisplay/Makefile
@@ -0,0 +1,10 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := cfag12864b-example
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_cfag12864b-example.o += -I$(objtree)/usr/include
diff --git a/Documentation/auxdisplay/cfag12864b b/Documentation/auxdisplay/cfag12864b
new file mode 100644
index 0000000..eb7be39
--- /dev/null
+++ b/Documentation/auxdisplay/cfag12864b
@@ -0,0 +1,105 @@
+ ===================================
+ cfag12864b LCD Driver Documentation
+ ===================================
+
+License: GPLv2
+Author & Maintainer: Miguel Ojeda Sandonis
+Date: 2006-10-27
+
+
+
+--------
+0. INDEX
+--------
+
+ 1. DRIVER INFORMATION
+ 2. DEVICE INFORMATION
+ 3. WIRING
+ 4. USERSPACE PROGRAMMING
+
+
+---------------------
+1. DRIVER INFORMATION
+---------------------
+
+This driver supports a cfag12864b LCD.
+
+
+---------------------
+2. DEVICE INFORMATION
+---------------------
+
+Manufacturer: Crystalfontz
+Device Name: Crystalfontz 12864b LCD Series
+Device Code: cfag12864b
+Webpage: http://www.crystalfontz.com
+Device Webpage: http://www.crystalfontz.com/products/12864b/
+Type: LCD (Liquid Crystal Display)
+Width: 128
+Height: 64
+Colors: 2 (B/N)
+Controller: ks0108
+Controllers: 2
+Pages: 8 each controller
+Addresses: 64 each page
+Data size: 1 byte each address
+Memory size: 2 * 8 * 64 * 1 = 1024 bytes = 1 Kbyte
+
+
+---------
+3. WIRING
+---------
+
+The cfag12864b LCD Series don't have official wiring.
+
+The common wiring is done to the parallel port as shown:
+
+Parallel Port cfag12864b
+
+ Name Pin# Pin# Name
+
+Strobe ( 1)------------------------------(17) Enable
+Data 0 ( 2)------------------------------( 4) Data 0
+Data 1 ( 3)------------------------------( 5) Data 1
+Data 2 ( 4)------------------------------( 6) Data 2
+Data 3 ( 5)------------------------------( 7) Data 3
+Data 4 ( 6)------------------------------( 8) Data 4
+Data 5 ( 7)------------------------------( 9) Data 5
+Data 6 ( 8)------------------------------(10) Data 6
+Data 7 ( 9)------------------------------(11) Data 7
+ (10) [+5v]---( 1) Vdd
+ (11) [GND]---( 2) Ground
+ (12) [+5v]---(14) Reset
+ (13) [GND]---(15) Read / Write
+ Line (14)------------------------------(13) Controller Select 1
+ (15)
+ Init (16)------------------------------(12) Controller Select 2
+Select (17)------------------------------(16) Data / Instruction
+Ground (18)---[GND] [+5v]---(19) LED +
+Ground (19)---[GND]
+Ground (20)---[GND] E A Values:
+Ground (21)---[GND] [GND]---[P1]---(18) Vee - R = Resistor = 22 ohm
+Ground (22)---[GND] | - P1 = Preset = 10 Kohm
+Ground (23)---[GND] ---- S ------( 3) V0 - P2 = Preset = 1 Kohm
+Ground (24)---[GND] | |
+Ground (25)---[GND] [GND]---[P2]---[R]---(20) LED -
+
+
+------------------------
+4. USERSPACE PROGRAMMING
+------------------------
+
+The cfag12864bfb describes a framebuffer device (/dev/fbX).
+
+It has a size of 1024 bytes = 1 Kbyte.
+Each bit represents one pixel. If the bit is high, the pixel will
+turn on. If the pixel is low, the pixel will turn off.
+
+You can use the framebuffer as a file: fopen, fwrite, fclose...
+Although the LCD won't get updated until the next refresh time arrives.
+
+Also, you can mmap the framebuffer: open & mmap, munmap & close...
+which is the best option for most uses.
+
+Check Documentation/auxdisplay/cfag12864b-example.c
+for a real working userspace complete program with usage examples.
diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c
new file mode 100644
index 0000000..2caeea5
--- /dev/null
+++ b/Documentation/auxdisplay/cfag12864b-example.c
@@ -0,0 +1,282 @@
+/*
+ * Filename: cfag12864b-example.c
+ * Version: 0.1.0
+ * Description: cfag12864b LCD userspace example program
+ * License: GPLv2
+ *
+ * Author: Copyright (C) Miguel Ojeda Sandonis
+ * Date: 2006-10-31
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+/*
+ * ------------------------
+ * start of cfag12864b code
+ * ------------------------
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#define CFAG12864B_WIDTH (128)
+#define CFAG12864B_HEIGHT (64)
+#define CFAG12864B_SIZE (128 * 64 / 8)
+#define CFAG12864B_BPB (8)
+#define CFAG12864B_ADDRESS(x, y) ((y) * CFAG12864B_WIDTH / \
+ CFAG12864B_BPB + (x) / CFAG12864B_BPB)
+#define CFAG12864B_BIT(n) (((unsigned char) 1) << (n))
+
+#undef CFAG12864B_DOCHECK
+#ifdef CFAG12864B_DOCHECK
+ #define CFAG12864B_CHECK(x, y) ((x) < CFAG12864B_WIDTH && \
+ (y) < CFAG12864B_HEIGHT)
+#else
+ #define CFAG12864B_CHECK(x, y) (1)
+#endif
+
+int cfag12864b_fd;
+unsigned char * cfag12864b_mem;
+unsigned char cfag12864b_buffer[CFAG12864B_SIZE];
+
+/*
+ * init a cfag12864b framebuffer device
+ *
+ * No error: return = 0
+ * Unable to open: return = -1
+ * Unable to mmap: return = -2
+ */
+int cfag12864b_init(char *path)
+{
+ cfag12864b_fd = open(path, O_RDWR);
+ if (cfag12864b_fd == -1)
+ return -1;
+
+ cfag12864b_mem = mmap(0, CFAG12864B_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED, cfag12864b_fd, 0);
+ if (cfag12864b_mem == MAP_FAILED) {
+ close(cfag12864b_fd);
+ return -2;
+ }
+
+ return 0;
+}
+
+/*
+ * exit a cfag12864b framebuffer device
+ */
+void cfag12864b_exit(void)
+{
+ munmap(cfag12864b_mem, CFAG12864B_SIZE);
+ close(cfag12864b_fd);
+}
+
+/*
+ * set (x, y) pixel
+ */
+void cfag12864b_set(unsigned char x, unsigned char y)
+{
+ if (CFAG12864B_CHECK(x, y))
+ cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] |=
+ CFAG12864B_BIT(x % CFAG12864B_BPB);
+}
+
+/*
+ * unset (x, y) pixel
+ */
+void cfag12864b_unset(unsigned char x, unsigned char y)
+{
+ if (CFAG12864B_CHECK(x, y))
+ cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &=
+ ~CFAG12864B_BIT(x % CFAG12864B_BPB);
+}
+
+/*
+ * is set (x, y) pixel?
+ *
+ * Pixel off: return = 0
+ * Pixel on: return = 1
+ */
+unsigned char cfag12864b_isset(unsigned char x, unsigned char y)
+{
+ if (CFAG12864B_CHECK(x, y))
+ if (cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &
+ CFAG12864B_BIT(x % CFAG12864B_BPB))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * not (x, y) pixel
+ */
+void cfag12864b_not(unsigned char x, unsigned char y)
+{
+ if (cfag12864b_isset(x, y))
+ cfag12864b_unset(x, y);
+ else
+ cfag12864b_set(x, y);
+}
+
+/*
+ * fill (set all pixels)
+ */
+void cfag12864b_fill(void)
+{
+ unsigned short i;
+
+ for (i = 0; i < CFAG12864B_SIZE; i++)
+ cfag12864b_buffer[i] = 0xFF;
+}
+
+/*
+ * clear (unset all pixels)
+ */
+void cfag12864b_clear(void)
+{
+ unsigned short i;
+
+ for (i = 0; i < CFAG12864B_SIZE; i++)
+ cfag12864b_buffer[i] = 0;
+}
+
+/*
+ * format a [128*64] matrix
+ *
+ * Pixel off: src[i] = 0
+ * Pixel on: src[i] > 0
+ */
+void cfag12864b_format(unsigned char * matrix)
+{
+ unsigned char i, j, n;
+
+ for (i = 0; i < CFAG12864B_HEIGHT; i++)
+ for (j = 0; j < CFAG12864B_WIDTH / CFAG12864B_BPB; j++) {
+ cfag12864b_buffer[i * CFAG12864B_WIDTH / CFAG12864B_BPB +
+ j] = 0;
+ for (n = 0; n < CFAG12864B_BPB; n++)
+ if (matrix[i * CFAG12864B_WIDTH +
+ j * CFAG12864B_BPB + n])
+ cfag12864b_buffer[i * CFAG12864B_WIDTH /
+ CFAG12864B_BPB + j] |=
+ CFAG12864B_BIT(n);
+ }
+}
+
+/*
+ * blit buffer to lcd
+ */
+void cfag12864b_blit(void)
+{
+ memcpy(cfag12864b_mem, cfag12864b_buffer, CFAG12864B_SIZE);
+}
+
+/*
+ * ----------------------
+ * end of cfag12864b code
+ * ----------------------
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#define EXAMPLES 6
+
+void example(unsigned char n)
+{
+ unsigned short i, j;
+ unsigned char matrix[CFAG12864B_WIDTH * CFAG12864B_HEIGHT];
+
+ if (n > EXAMPLES)
+ return;
+
+ printf("Example %i/%i - ", n, EXAMPLES);
+
+ switch (n) {
+ case 1:
+ printf("Draw points setting bits");
+ cfag12864b_clear();
+ for (i = 0; i < CFAG12864B_WIDTH; i += 2)
+ for (j = 0; j < CFAG12864B_HEIGHT; j += 2)
+ cfag12864b_set(i, j);
+ break;
+
+ case 2:
+ printf("Clear the LCD");
+ cfag12864b_clear();
+ break;
+
+ case 3:
+ printf("Draw rows formatting a [128*64] matrix");
+ memset(matrix, 0, CFAG12864B_WIDTH * CFAG12864B_HEIGHT);
+ for (i = 0; i < CFAG12864B_WIDTH; i++)
+ for (j = 0; j < CFAG12864B_HEIGHT; j += 2)
+ matrix[j * CFAG12864B_WIDTH + i] = 1;
+ cfag12864b_format(matrix);
+ break;
+
+ case 4:
+ printf("Fill the lcd");
+ cfag12864b_fill();
+ break;
+
+ case 5:
+ printf("Draw columns unsetting bits");
+ for (i = 0; i < CFAG12864B_WIDTH; i += 2)
+ for (j = 0; j < CFAG12864B_HEIGHT; j++)
+ cfag12864b_unset(i, j);
+ break;
+
+ case 6:
+ printf("Do negative not-ing all bits");
+ for (i = 0; i < CFAG12864B_WIDTH; i++)
+ for (j = 0; j < CFAG12864B_HEIGHT; j ++)
+ cfag12864b_not(i, j);
+ break;
+ }
+
+ puts(" - [Press Enter]");
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned char n;
+
+ if (argc != 2) {
+ printf(
+ "Sintax: %s fbdev\n"
+ "Usually: /dev/fb0, /dev/fb1...\n", argv[0]);
+ return -1;
+ }
+
+ if (cfag12864b_init(argv[1])) {
+ printf("Can't init %s fbdev\n", argv[1]);
+ return -2;
+ }
+
+ for (n = 1; n <= EXAMPLES; n++) {
+ example(n);
+ cfag12864b_blit();
+ while (getchar() != '\n');
+ }
+
+ cfag12864b_exit();
+
+ return 0;
+}
diff --git a/Documentation/auxdisplay/ks0108 b/Documentation/auxdisplay/ks0108
new file mode 100644
index 0000000..8ddda0c
--- /dev/null
+++ b/Documentation/auxdisplay/ks0108
@@ -0,0 +1,55 @@
+ ==========================================
+ ks0108 LCD Controller Driver Documentation
+ ==========================================
+
+License: GPLv2
+Author & Maintainer: Miguel Ojeda Sandonis
+Date: 2006-10-27
+
+
+
+--------
+0. INDEX
+--------
+
+ 1. DRIVER INFORMATION
+ 2. DEVICE INFORMATION
+ 3. WIRING
+
+
+---------------------
+1. DRIVER INFORMATION
+---------------------
+
+This driver supports the ks0108 LCD controller.
+
+
+---------------------
+2. DEVICE INFORMATION
+---------------------
+
+Manufacturer: Samsung
+Device Name: KS0108 LCD Controller
+Device Code: ks0108
+Webpage: -
+Device Webpage: -
+Type: LCD Controller (Liquid Crystal Display Controller)
+Width: 64
+Height: 64
+Colors: 2 (B/N)
+Pages: 8
+Addresses: 64 each page
+Data size: 1 byte each address
+Memory size: 8 * 64 * 1 = 512 bytes
+
+
+---------
+3. WIRING
+---------
+
+The driver supports data parallel port wiring.
+
+If you aren't building LCD related hardware, you should check
+your LCD specific wiring information in the same folder.
+
+For example, check Documentation/auxdisplay/cfag12864b.
diff --git a/Documentation/basic_profiling.txt b/Documentation/basic_profiling.txt
new file mode 100644
index 0000000..8764e9f
--- /dev/null
+++ b/Documentation/basic_profiling.txt
@@ -0,0 +1,56 @@
+These instructions are deliberately very basic. If you want something clever,
+go read the real docs ;-) Please don't add more stuff, but feel free to
+correct my mistakes ;-) (mbligh@aracnet.com)
+Thanks to John Levon, Dave Hansen, et al. for help writing this.
+
+<test> is the thing you're trying to measure.
+Make sure you have the correct System.map / vmlinux referenced!
+
+It is probably easiest to use "make install" for linux and hack
+/sbin/installkernel to copy vmlinux to /boot, in addition to vmlinuz,
+config, System.map, which are usually installed by default.
+
+Readprofile
+-----------
+A recent readprofile command is needed for 2.6, such as found in util-linux
+2.12a, which can be downloaded from:
+
+http://www.kernel.org/pub/linux/utils/util-linux/
+
+Most distributions will ship it already.
+
+Add "profile=2" to the kernel command line.
+
+clear readprofile -r
+ <test>
+dump output readprofile -m /boot/System.map > captured_profile
+
+Oprofile
+--------
+
+Get the source (see Changes for required version) from
+http://oprofile.sourceforge.net/ and add "idle=poll" to the kernel command
+line.
+
+Configure with CONFIG_PROFILING=y and CONFIG_OPROFILE=y & reboot on new kernel
+
+./configure --with-kernel-support
+make install
+
+For superior results, be sure to enable the local APIC. If opreport sees
+a 0Hz CPU, APIC was not on. Be aware that idle=poll may mean a performance
+penalty.
+
+One time setup:
+ opcontrol --setup --vmlinux=/boot/vmlinux
+
+clear opcontrol --reset
+start opcontrol --start
+ <test>
+stop opcontrol --stop
+dump output opreport > output_file
+
+To only report on the kernel, run opreport -l /boot/vmlinux > output_file
+
+A reset is needed to clear old statistics, which survive a reboot.
+
diff --git a/Documentation/binfmt_misc.txt b/Documentation/binfmt_misc.txt
new file mode 100644
index 0000000..f609ebf
--- /dev/null
+++ b/Documentation/binfmt_misc.txt
@@ -0,0 +1,116 @@
+ Kernel Support for miscellaneous (your favourite) Binary Formats v1.1
+ =====================================================================
+
+This Kernel feature allows you to invoke almost (for restrictions see below)
+every program by simply typing its name in the shell.
+This includes for example compiled Java(TM), Python or Emacs programs.
+
+To achieve this you must tell binfmt_misc which interpreter has to be invoked
+with which binary. Binfmt_misc recognises the binary-type by matching some bytes
+at the beginning of the file with a magic byte sequence (masking out specified
+bits) you have supplied. Binfmt_misc can also recognise a filename extension
+aka '.com' or '.exe'.
+
+First you must mount binfmt_misc:
+ mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc
+
+To actually register a new binary type, you have to set up a string looking like
+:name:type:offset:magic:mask:interpreter:flags (where you can choose the ':' upon
+your needs) and echo it to /proc/sys/fs/binfmt_misc/register.
+Here is what the fields mean:
+ - 'name' is an identifier string. A new /proc file will be created with this
+ name below /proc/sys/fs/binfmt_misc
+ - 'type' is the type of recognition. Give 'M' for magic and 'E' for extension.
+ - 'offset' is the offset of the magic/mask in the file, counted in bytes. This
+ defaults to 0 if you omit it (i.e. you write ':name:type::magic...')
+ - 'magic' is the byte sequence binfmt_misc is matching for. The magic string
+ may contain hex-encoded characters like \x0a or \xA4. In a shell environment
+ you will have to write \\x0a to prevent the shell from eating your \.
+ If you chose filename extension matching, this is the extension to be
+ recognised (without the '.', the \x0a specials are not allowed). Extension
+ matching is case sensitive!
+ - 'mask' is an (optional, defaults to all 0xff) mask. You can mask out some
+ bits from matching by supplying a string like magic and as long as magic.
+ The mask is anded with the byte sequence of the file.
+ - 'interpreter' is the program that should be invoked with the binary as first
+ argument (specify the full path)
+ - 'flags' is an optional field that controls several aspects of the invocation
+ of the interpreter. It is a string of capital letters, each controls a certain
+ aspect. The following flags are supported -
+ 'P' - preserve-argv[0]. Legacy behavior of binfmt_misc is to overwrite the
+ original argv[0] with the full path to the binary. When this flag is
+ included, binfmt_misc will add an argument to the argument vector for
+ this purpose, thus preserving the original argv[0].
+ 'O' - open-binary. Legacy behavior of binfmt_misc is to pass the full path
+ of the binary to the interpreter as an argument. When this flag is
+ included, binfmt_misc will open the file for reading and pass its
+ descriptor as an argument, instead of the full path, thus allowing
+ the interpreter to execute non-readable binaries. This feature should
+ be used with care - the interpreter has to be trusted not to emit
+ the contents of the non-readable binary.
+ 'C' - credentials. Currently, the behavior of binfmt_misc is to calculate
+ the credentials and security token of the new process according to
+ the interpreter. When this flag is included, these attributes are
+ calculated according to the binary. It also implies the 'O' flag.
+ This feature should be used with care as the interpreter
+ will run with root permissions when a setuid binary owned by root
+ is run with binfmt_misc.
+
+
+There are some restrictions:
+ - the whole register string may not exceed 255 characters
+ - the magic must reside in the first 128 bytes of the file, i.e.
+ offset+size(magic) has to be less than 128
+ - the interpreter string may not exceed 127 characters
+
+To use binfmt_misc you have to mount it first. You can mount it with
+"mount -t binfmt_misc none /proc/sys/fs/binfmt_misc" command, or you can add
+a line "none /proc/sys/fs/binfmt_misc binfmt_misc defaults 0 0" to your
+/etc/fstab so it auto mounts on boot.
+
+You may want to add the binary formats in one of your /etc/rc scripts during
+boot-up. Read the manual of your init program to figure out how to do this
+right.
+
+Think about the order of adding entries! Later added entries are matched first!
+
+
+A few examples (assumed you are in /proc/sys/fs/binfmt_misc):
+
+- enable support for em86 (like binfmt_em86, for Alpha AXP only):
+ echo ':i386:M::\x7fELF\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x03:\xff\xff\xff\xff\xff\xfe\xfe\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfb\xff\xff:/bin/em86:' > register
+ echo ':i486:M::\x7fELF\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x06:\xff\xff\xff\xff\xff\xfe\xfe\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfb\xff\xff:/bin/em86:' > register
+
+- enable support for packed DOS applications (pre-configured dosemu hdimages):
+ echo ':DEXE:M::\x0eDEX::/usr/bin/dosexec:' > register
+
+- enable support for Windows executables using wine:
+ echo ':DOSWin:M::MZ::/usr/local/bin/wine:' > register
+
+For java support see Documentation/java.txt
+
+
+You can enable/disable binfmt_misc or one binary type by echoing 0 (to disable)
+or 1 (to enable) to /proc/sys/fs/binfmt_misc/status or /proc/.../the_name.
+Catting the file tells you the current status of binfmt_misc/the entry.
+
+You can remove one entry or all entries by echoing -1 to /proc/.../the_name
+or /proc/sys/fs/binfmt_misc/status.
+
+
+HINTS:
+======
+
+If you want to pass special arguments to your interpreter, you can
+write a wrapper script for it. See Documentation/java.txt for an
+example.
+
+Your interpreter should NOT look in the PATH for the filename; the kernel
+passes it the full filename (or the file descriptor) to use. Using $PATH can
+cause unexpected behaviour and can be a security hazard.
+
+
+There is a web page about binfmt_misc at
+http://www.tat.physik.uni-tuebingen.de/~rguenth/linux/binfmt_misc.html
+
+Richard Günther <rguenth@tat.physik.uni-tuebingen.de>
diff --git a/Documentation/blackfin/00-INDEX b/Documentation/blackfin/00-INDEX
new file mode 100644
index 0000000..7cb3b35
--- /dev/null
+++ b/Documentation/blackfin/00-INDEX
@@ -0,0 +1,11 @@
+00-INDEX
+ - This file
+
+cache-lock.txt
+ - HOWTO for blackfin cache locking.
+
+cachefeatures.txt
+ - Supported cache features.
+
+Filesystems
+ - Requirements for mounting the root file system.
diff --git a/Documentation/blackfin/Filesystems b/Documentation/blackfin/Filesystems
new file mode 100644
index 0000000..51260a1
--- /dev/null
+++ b/Documentation/blackfin/Filesystems
@@ -0,0 +1,169 @@
+/*
+ * File: Documentation/blackfin/Filesystems
+ * Based on:
+ * Author:
+ *
+ * Created:
+ * Description: This file contains the simple DMA Implementation for Blackfin
+ *
+ * Rev: $Id: Filesystems 2384 2006-11-01 04:12:43Z magicyang $
+ *
+ * Modified:
+ * Copyright 2004-2006 Analog Devices Inc.
+ *
+ * Bugs: Enter bugs at http://blackfin.uclinux.org/
+ *
+ */
+
+ How to mount the root file system in uClinux/Blackfin
+ -----------------------------------------------------
+
+1 Mounting EXT3 File system.
+ ------------------------
+
+ Creating an EXT3 File system for uClinux/Blackfin:
+
+
+Please follow the steps to form the EXT3 File system and mount the same as root
+file system.
+
+a Make an ext3 file system as large as you want the final root file
+ system.
+
+ mkfs.ext3 /dev/ram0 <your-rootfs-size-in-1k-blocks>
+
+b Mount this Empty file system on a free directory as:
+
+ mount -t ext3 /dev/ram0 ./test
+ where ./test is the empty directory.
+
+c Copy your root fs directory that you have so carefully made over.
+
+ cp -af /tmp/my_final_rootfs_files/* ./test
+
+ (For ex: cp -af uClinux-dist/romfs/* ./test)
+
+d If you have done everything right till now you should be able to see
+ the required "root" dir's (that's etc, root, bin, lib, sbin...)
+
+e Now unmount the file system
+
+ umount ./test
+
+f Create the root file system image.
+
+ dd if=/dev/ram0 bs=1k count=<your-rootfs-size-in-1k-blocks> \
+ > ext3fs.img
+
+
+Now you have to tell the kernel that will be mounting this file system as
+rootfs.
+So do a make menuconfig under kernel and select the Ext3 journaling file system
+support under File system --> submenu.
+
+
+2. Mounting EXT2 File system.
+ -------------------------
+
+By default the ext2 file system image will be created if you invoke make from
+the top uClinux-dist directory.
+
+
+3. Mounting CRAMFS File System
+ ----------------------------
+
+To create a CRAMFS file system image execute the command
+
+ mkfs.cramfs ./test cramfs.img
+
+ where ./test is the target directory.
+
+
+4. Mounting ROMFS File System
+ --------------------------
+
+To create a ROMFS file system image execute the command
+
+ genromfs -v -V "ROMdisk" -f romfs.img -d ./test
+
+ where ./test is the target directory
+
+
+5. Mounting the JFFS2 Filesystem
+ -----------------------------
+
+To create a compressed JFFS filesystem (JFFS2), please execute the command
+
+ mkfs.jffs2 -d ./test -o jffs2.img
+
+ where ./test is the target directory.
+
+However, please make sure the following is in your kernel config.
+
+/*
+ * RAM/ROM/Flash chip drivers
+ */
+#define CONFIG_MTD_CFI 1
+#define CONFIG_MTD_ROM 1
+/*
+ * Mapping drivers for chip access
+ */
+#define CONFIG_MTD_COMPLEX_MAPPINGS 1
+#define CONFIG_MTD_BF533 1
+#undef CONFIG_MTD_UCLINUX
+
+Through the u-boot boot loader, use the jffs2.img in the corresponding
+partition made in linux-2.6.x/drivers/mtd/maps/bf533_flash.c.
+
+NOTE - Currently the Flash driver is available only for EZKIT. Watch out for a
+ STAMP driver soon.
+
+
+6. Mounting the NFS File system
+ -----------------------------
+
+ For mounting the NFS please do the following in the kernel config.
+
+ In Networking Support --> Networking options --> TCP/IP networking -->
+ IP: kernel level autoconfiguration
+
+ Enable BOOTP Support.
+
+ In Kernel hacking --> Compiled-in kernel boot parameter add the following
+
+ root=/dev/nfs rw ip=bootp
+
+ In File system --> Network File system, Enable
+
+ NFS file system support --> NFSv3 client support
+ Root File system on NFS
+
+ in uClibc menuconfig, do the following
+ In Networking Support
+ enable Remote Procedure Call (RPC) support
+ Full RPC Support
+
+ On the Host side, ensure that /etc/dhcpd.conf looks something like this
+
+ ddns-update-style ad-hoc;
+ allow bootp;
+ subnet 10.100.4.0 netmask 255.255.255.0 {
+ default-lease-time 122209600;
+ max-lease-time 31557600;
+ group {
+ host bf533 {
+ hardware ethernet 00:CF:52:49:C3:01;
+ fixed-address 10.100.4.50;
+ option root-path "/home/nfsmount";
+ }
+ }
+
+ ensure that /etc/exports looks something like this
+ /home/nfsmount *(rw,no_root_squash,no_all_squash)
+
+ run the following commands as root (may differ depending on your
+ distribution) :
+ - service nfs start
+ - service portmap start
+ - service dhcpd start
+ - /usr/sbin/exportfs
diff --git a/Documentation/blackfin/cache-lock.txt b/Documentation/blackfin/cache-lock.txt
new file mode 100644
index 0000000..88ba1e6
--- /dev/null
+++ b/Documentation/blackfin/cache-lock.txt
@@ -0,0 +1,48 @@
+/*
+ * File: Documentation/blackfin/cache-lock.txt
+ * Based on:
+ * Author:
+ *
+ * Created:
+ * Description: This file contains the simple DMA Implementation for Blackfin
+ *
+ * Rev: $Id: cache-lock.txt 2384 2006-11-01 04:12:43Z magicyang $
+ *
+ * Modified:
+ * Copyright 2004-2006 Analog Devices Inc.
+ *
+ * Bugs: Enter bugs at http://blackfin.uclinux.org/
+ *
+ */
+
+How to lock your code in cache in uClinux/blackfin
+--------------------------------------------------
+
+There are only a few steps required to lock your code into the cache.
+Currently you can lock the code by Way.
+
+Below are the interface provided for locking the cache.
+
+
+1. cache_grab_lock(int Ways);
+
+This function grab the lock for locking your code into the cache specified
+by Ways.
+
+
+2. cache_lock(int Ways);
+
+This function should be called after your critical code has been executed.
+Once the critical code exits, the code is now loaded into the cache. This
+function locks the code into the cache.
+
+
+So, the example sequence will be:
+
+ cache_grab_lock(WAY0_L); /* Grab the lock */
+
+ critical_code(); /* Execute the code of interest */
+
+ cache_lock(WAY0_L); /* Lock the cache */
+
+Where WAY0_L signifies WAY0 locking.
diff --git a/Documentation/blackfin/cachefeatures.txt b/Documentation/blackfin/cachefeatures.txt
new file mode 100644
index 0000000..0fbec23
--- /dev/null
+++ b/Documentation/blackfin/cachefeatures.txt
@@ -0,0 +1,65 @@
+/*
+ * File: Documentation/blackfin/cachefeatures.txt
+ * Based on:
+ * Author:
+ *
+ * Created:
+ * Description: This file contains the simple DMA Implementation for Blackfin
+ *
+ * Rev: $Id: cachefeatures.txt 2384 2006-11-01 04:12:43Z magicyang $
+ *
+ * Modified:
+ * Copyright 2004-2006 Analog Devices Inc.
+ *
+ * Bugs: Enter bugs at http://blackfin.uclinux.org/
+ *
+ */
+
+ - Instruction and Data cache initialization.
+ icache_init();
+ dcache_init();
+
+ - Instruction and Data cache Invalidation Routines, when flushing the
+ same is not required.
+ _icache_invalidate();
+ _dcache_invalidate();
+
+ Also, for invalidating the entire instruction and data cache, the below
+ routines are provided (another method for invalidation, refer page no 267 and 287 of
+ ADSP-BF533 Hardware Reference manual)
+
+ invalidate_entire_dcache();
+ invalidate_entire_icache();
+
+ -External Flushing of Instruction and data cache routines.
+
+ flush_instruction_cache();
+ flush_data_cache();
+
+ - Internal Flushing of Instruction and Data Cache.
+
+ icplb_flush();
+ dcplb_flush();
+
+ - Locking the cache.
+
+ cache_grab_lock();
+ cache_lock();
+
+ Please refer linux-2.6.x/Documentation/blackfin/cache-lock.txt for how to
+ lock the cache.
+
+ Locking the cache is optional feature.
+
+ - Miscellaneous cache functions.
+
+ flush_cache_all();
+ flush_cache_mm();
+ invalidate_dcache_range();
+ flush_dcache_range();
+ flush_dcache_page();
+ flush_cache_range();
+ flush_cache_page();
+ invalidate_dcache_range();
+ flush_page_to_ram();
+
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
new file mode 100644
index 0000000..961a051
--- /dev/null
+++ b/Documentation/block/00-INDEX
@@ -0,0 +1,20 @@
+00-INDEX
+ - This file
+as-iosched.txt
+ - Anticipatory IO scheduler
+barrier.txt
+ - I/O Barriers
+biodoc.txt
+ - Notes on the Generic Block Layer Rewrite in Linux 2.5
+capability.txt
+ - Generic Block Device Capability (/sys/block/<disk>/capability)
+deadline-iosched.txt
+ - Deadline IO scheduler tunables
+ioprio.txt
+ - Block io priorities (in CFQ scheduler)
+request.txt
+ - The members of struct request (in include/linux/blkdev.h)
+stat.txt
+ - Block layer statistics in /sys/block/<dev>/stat
+switching-sched.txt
+ - Switching I/O schedulers at runtime
diff --git a/Documentation/block/as-iosched.txt b/Documentation/block/as-iosched.txt
new file mode 100644
index 0000000..738b72b
--- /dev/null
+++ b/Documentation/block/as-iosched.txt
@@ -0,0 +1,172 @@
+Anticipatory IO scheduler
+-------------------------
+Nick Piggin <piggin@cyberone.com.au> 13 Sep 2003
+
+Attention! Database servers, especially those using "TCQ" disks should
+investigate performance with the 'deadline' IO scheduler. Any system with high
+disk performance requirements should do so, in fact.
+
+If you see unusual performance characteristics of your disk systems, or you
+see big performance regressions versus the deadline scheduler, please email
+me. Database users don't bother unless you're willing to test a lot of patches
+from me ;) its a known issue.
+
+Also, users with hardware RAID controllers, doing striping, may find
+highly variable performance results with using the as-iosched. The
+as-iosched anticipatory implementation is based on the notion that a disk
+device has only one physical seeking head. A striped RAID controller
+actually has a head for each physical device in the logical RAID device.
+
+However, setting the antic_expire (see tunable parameters below) produces
+very similar behavior to the deadline IO scheduler.
+
+Selecting IO schedulers
+-----------------------
+Refer to Documentation/block/switching-sched.txt for information on
+selecting an io scheduler on a per-device basis.
+
+Anticipatory IO scheduler Policies
+----------------------------------
+The as-iosched implementation implements several layers of policies
+to determine when an IO request is dispatched to the disk controller.
+Here are the policies outlined, in order of application.
+
+1. one-way Elevator algorithm.
+
+The elevator algorithm is similar to that used in deadline scheduler, with
+the addition that it allows limited backward movement of the elevator
+(i.e. seeks backwards). A seek backwards can occur when choosing between
+two IO requests where one is behind the elevator's current position, and
+the other is in front of the elevator's position. If the seek distance to
+the request in back of the elevator is less than half the seek distance to
+the request in front of the elevator, then the request in back can be chosen.
+Backward seeks are also limited to a maximum of MAXBACK (1024*1024) sectors.
+This favors forward movement of the elevator, while allowing opportunistic
+"short" backward seeks.
+
+2. FIFO expiration times for reads and for writes.
+
+This is again very similar to the deadline IO scheduler. The expiration
+times for requests on these lists is tunable using the parameters read_expire
+and write_expire discussed below. When a read or a write expires in this way,
+the IO scheduler will interrupt its current elevator sweep or read anticipation
+to service the expired request.
+
+3. Read and write request batching
+
+A batch is a collection of read requests or a collection of write
+requests. The as scheduler alternates dispatching read and write batches
+to the driver. In the case a read batch, the scheduler submits read
+requests to the driver as long as there are read requests to submit, and
+the read batch time limit has not been exceeded (read_batch_expire).
+The read batch time limit begins counting down only when there are
+competing write requests pending.
+
+In the case of a write batch, the scheduler submits write requests to
+the driver as long as there are write requests available, and the
+write batch time limit has not been exceeded (write_batch_expire).
+However, the length of write batches will be gradually shortened
+when read batches frequently exceed their time limit.
+
+When changing between batch types, the scheduler waits for all requests
+from the previous batch to complete before scheduling requests for the
+next batch.
+
+The read and write fifo expiration times described in policy 2 above
+are checked only when in scheduling IO of a batch for the corresponding
+(read/write) type. So for example, the read FIFO timeout values are
+tested only during read batches. Likewise, the write FIFO timeout
+values are tested only during write batches. For this reason,
+it is generally not recommended for the read batch time
+to be longer than the write expiration time, nor for the write batch
+time to exceed the read expiration time (see tunable parameters below).
+
+When the IO scheduler changes from a read to a write batch,
+it begins the elevator from the request that is on the head of the
+write expiration FIFO. Likewise, when changing from a write batch to
+a read batch, scheduler begins the elevator from the first entry
+on the read expiration FIFO.
+
+4. Read anticipation.
+
+Read anticipation occurs only when scheduling a read batch.
+This implementation of read anticipation allows only one read request
+to be dispatched to the disk controller at a time. In
+contrast, many write requests may be dispatched to the disk controller
+at a time during a write batch. It is this characteristic that can make
+the anticipatory scheduler perform anomalously with controllers supporting
+TCQ, or with hardware striped RAID devices. Setting the antic_expire
+queue parameter (see below) to zero disables this behavior, and the
+anticipatory scheduler behaves essentially like the deadline scheduler.
+
+When read anticipation is enabled (antic_expire is not zero), reads
+are dispatched to the disk controller one at a time.
+At the end of each read request, the IO scheduler examines its next
+candidate read request from its sorted read list. If that next request
+is from the same process as the request that just completed,
+or if the next request in the queue is "very close" to the
+just completed request, it is dispatched immediately. Otherwise,
+statistics (average think time, average seek distance) on the process
+that submitted the just completed request are examined. If it seems
+likely that that process will submit another request soon, and that
+request is likely to be near the just completed request, then the IO
+scheduler will stop dispatching more read requests for up to (antic_expire)
+milliseconds, hoping that process will submit a new request near the one
+that just completed. If such a request is made, then it is dispatched
+immediately. If the antic_expire wait time expires, then the IO scheduler
+will dispatch the next read request from the sorted read queue.
+
+To decide whether an anticipatory wait is worthwhile, the scheduler
+maintains statistics for each process that can be used to compute
+mean "think time" (the time between read requests), and mean seek
+distance for that process. One observation is that these statistics
+are associated with each process, but those statistics are not associated
+with a specific IO device. So for example, if a process is doing IO
+on several file systems on separate devices, the statistics will be
+a combination of IO behavior from all those devices.
+
+
+Tuning the anticipatory IO scheduler
+------------------------------------
+When using 'as', the anticipatory IO scheduler there are 5 parameters under
+/sys/block/*/queue/iosched/. All are units of milliseconds.
+
+The parameters are:
+* read_expire
+ Controls how long until a read request becomes "expired". It also controls the
+ interval between which expired requests are served, so set to 50, a request
+ might take anywhere < 100ms to be serviced _if_ it is the next on the
+ expired list. Obviously request expiration strategies won't make the disk
+ go faster. The result basically equates to the timeslice a single reader
+ gets in the presence of other IO. 100*((seek time / read_expire) + 1) is
+ very roughly the % streaming read efficiency your disk should get with
+ multiple readers.
+
+* read_batch_expire
+ Controls how much time a batch of reads is given before pending writes are
+ served. A higher value is more efficient. This might be set below read_expire
+ if writes are to be given higher priority than reads, but reads are to be
+ as efficient as possible when there are no writes. Generally though, it
+ should be some multiple of read_expire.
+
+* write_expire, and
+* write_batch_expire are equivalent to the above, for writes.
+
+* antic_expire
+ Controls the maximum amount of time we can anticipate a good read (one
+ with a short seek distance from the most recently completed request) before
+ giving up. Many other factors may cause anticipation to be stopped early,
+ or some processes will not be "anticipated" at all. Should be a bit higher
+ for big seek time devices though not a linear correspondence - most
+ processes have only a few ms thinktime.
+
+In addition to the tunables above there is a read-only file named est_time
+which, when read, will show:
+
+ - The probability of a task exiting without a cooperating task
+ submitting an anticipated IO.
+
+ - The current mean think time.
+
+ - The seek distance used to determine if an incoming IO is better.
+
diff --git a/Documentation/block/barrier.txt b/Documentation/block/barrier.txt
new file mode 100644
index 0000000..2c2f24f
--- /dev/null
+++ b/Documentation/block/barrier.txt
@@ -0,0 +1,261 @@
+I/O Barriers
+============
+Tejun Heo <htejun@gmail.com>, July 22 2005
+
+I/O barrier requests are used to guarantee ordering around the barrier
+requests. Unless you're crazy enough to use disk drives for
+implementing synchronization constructs (wow, sounds interesting...),
+the ordering is meaningful only for write requests for things like
+journal checkpoints. All requests queued before a barrier request
+must be finished (made it to the physical medium) before the barrier
+request is started, and all requests queued after the barrier request
+must be started only after the barrier request is finished (again,
+made it to the physical medium).
+
+In other words, I/O barrier requests have the following two properties.
+
+1. Request ordering
+
+Requests cannot pass the barrier request. Preceding requests are
+processed before the barrier and following requests after.
+
+Depending on what features a drive supports, this can be done in one
+of the following three ways.
+
+i. For devices which have queue depth greater than 1 (TCQ devices) and
+support ordered tags, block layer can just issue the barrier as an
+ordered request and the lower level driver, controller and drive
+itself are responsible for making sure that the ordering constraint is
+met. Most modern SCSI controllers/drives should support this.
+
+NOTE: SCSI ordered tag isn't currently used due to limitation in the
+ SCSI midlayer, see the following random notes section.
+
+ii. For devices which have queue depth greater than 1 but don't
+support ordered tags, block layer ensures that the requests preceding
+a barrier request finishes before issuing the barrier request. Also,
+it defers requests following the barrier until the barrier request is
+finished. Older SCSI controllers/drives and SATA drives fall in this
+category.
+
+iii. Devices which have queue depth of 1. This is a degenerate case
+of ii. Just keeping issue order suffices. Ancient SCSI
+controllers/drives and IDE drives are in this category.
+
+2. Forced flushing to physical medium
+
+Again, if you're not gonna do synchronization with disk drives (dang,
+it sounds even more appealing now!), the reason you use I/O barriers
+is mainly to protect filesystem integrity when power failure or some
+other events abruptly stop the drive from operating and possibly make
+the drive lose data in its cache. So, I/O barriers need to guarantee
+that requests actually get written to non-volatile medium in order.
+
+There are four cases,
+
+i. No write-back cache. Keeping requests ordered is enough.
+
+ii. Write-back cache but no flush operation. There's no way to
+guarantee physical-medium commit order. This kind of devices can't to
+I/O barriers.
+
+iii. Write-back cache and flush operation but no FUA (forced unit
+access). We need two cache flushes - before and after the barrier
+request.
+
+iv. Write-back cache, flush operation and FUA. We still need one
+flush to make sure requests preceding a barrier are written to medium,
+but post-barrier flush can be avoided by using FUA write on the
+barrier itself.
+
+
+How to support barrier requests in drivers
+------------------------------------------
+
+All barrier handling is done inside block layer proper. All low level
+drivers have to are implementing its prepare_flush_fn and using one
+the following two functions to indicate what barrier type it supports
+and how to prepare flush requests. Note that the term 'ordered' is
+used to indicate the whole sequence of performing barrier requests
+including draining and flushing.
+
+typedef void (prepare_flush_fn)(struct request_queue *q, struct request *rq);
+
+int blk_queue_ordered(struct request_queue *q, unsigned ordered,
+ prepare_flush_fn *prepare_flush_fn);
+
+@q : the queue in question
+@ordered : the ordered mode the driver/device supports
+@prepare_flush_fn : this function should prepare @rq such that it
+ flushes cache to physical medium when executed
+
+For example, SCSI disk driver's prepare_flush_fn looks like the
+following.
+
+static void sd_prepare_flush(struct request_queue *q, struct request *rq)
+{
+ memset(rq->cmd, 0, sizeof(rq->cmd));
+ rq->cmd_type = REQ_TYPE_BLOCK_PC;
+ rq->timeout = SD_TIMEOUT;
+ rq->cmd[0] = SYNCHRONIZE_CACHE;
+ rq->cmd_len = 10;
+}
+
+The following seven ordered modes are supported. The following table
+shows which mode should be used depending on what features a
+device/driver supports. In the leftmost column of table,
+QUEUE_ORDERED_ prefix is omitted from the mode names to save space.
+
+The table is followed by description of each mode. Note that in the
+descriptions of QUEUE_ORDERED_DRAIN*, '=>' is used whereas '->' is
+used for QUEUE_ORDERED_TAG* descriptions. '=>' indicates that the
+preceding step must be complete before proceeding to the next step.
+'->' indicates that the next step can start as soon as the previous
+step is issued.
+
+ write-back cache ordered tag flush FUA
+-----------------------------------------------------------------------
+NONE yes/no N/A no N/A
+DRAIN no no N/A N/A
+DRAIN_FLUSH yes no yes no
+DRAIN_FUA yes no yes yes
+TAG no yes N/A N/A
+TAG_FLUSH yes yes yes no
+TAG_FUA yes yes yes yes
+
+
+QUEUE_ORDERED_NONE
+ I/O barriers are not needed and/or supported.
+
+ Sequence: N/A
+
+QUEUE_ORDERED_DRAIN
+ Requests are ordered by draining the request queue and cache
+ flushing isn't needed.
+
+ Sequence: drain => barrier
+
+QUEUE_ORDERED_DRAIN_FLUSH
+ Requests are ordered by draining the request queue and both
+ pre-barrier and post-barrier cache flushings are needed.
+
+ Sequence: drain => preflush => barrier => postflush
+
+QUEUE_ORDERED_DRAIN_FUA
+ Requests are ordered by draining the request queue and
+ pre-barrier cache flushing is needed. By using FUA on barrier
+ request, post-barrier flushing can be skipped.
+
+ Sequence: drain => preflush => barrier
+
+QUEUE_ORDERED_TAG
+ Requests are ordered by ordered tag and cache flushing isn't
+ needed.
+
+ Sequence: barrier
+
+QUEUE_ORDERED_TAG_FLUSH
+ Requests are ordered by ordered tag and both pre-barrier and
+ post-barrier cache flushings are needed.
+
+ Sequence: preflush -> barrier -> postflush
+
+QUEUE_ORDERED_TAG_FUA
+ Requests are ordered by ordered tag and pre-barrier cache
+ flushing is needed. By using FUA on barrier request,
+ post-barrier flushing can be skipped.
+
+ Sequence: preflush -> barrier
+
+
+Random notes/caveats
+--------------------
+
+* SCSI layer currently can't use TAG ordering even if the drive,
+controller and driver support it. The problem is that SCSI midlayer
+request dispatch function is not atomic. It releases queue lock and
+switch to SCSI host lock during issue and it's possible and likely to
+happen in time that requests change their relative positions. Once
+this problem is solved, TAG ordering can be enabled.
+
+* Currently, no matter which ordered mode is used, there can be only
+one barrier request in progress. All I/O barriers are held off by
+block layer until the previous I/O barrier is complete. This doesn't
+make any difference for DRAIN ordered devices, but, for TAG ordered
+devices with very high command latency, passing multiple I/O barriers
+to low level *might* be helpful if they are very frequent. Well, this
+certainly is a non-issue. I'm writing this just to make clear that no
+two I/O barrier is ever passed to low-level driver.
+
+* Completion order. Requests in ordered sequence are issued in order
+but not required to finish in order. Barrier implementation can
+handle out-of-order completion of ordered sequence. IOW, the requests
+MUST be processed in order but the hardware/software completion paths
+are allowed to reorder completion notifications - eg. current SCSI
+midlayer doesn't preserve completion order during error handling.
+
+* Requeueing order. Low-level drivers are free to requeue any request
+after they removed it from the request queue with
+blkdev_dequeue_request(). As barrier sequence should be kept in order
+when requeued, generic elevator code takes care of putting requests in
+order around barrier. See blk_ordered_req_seq() and
+ELEVATOR_INSERT_REQUEUE handling in __elv_add_request() for details.
+
+Note that block drivers must not requeue preceding requests while
+completing latter requests in an ordered sequence. Currently, no
+error checking is done against this.
+
+* Error handling. Currently, block layer will report error to upper
+layer if any of requests in an ordered sequence fails. Unfortunately,
+this doesn't seem to be enough. Look at the following request flow.
+QUEUE_ORDERED_TAG_FLUSH is in use.
+
+ [0] [1] [2] [3] [pre] [barrier] [post] < [4] [5] [6] ... >
+ still in elevator
+
+Let's say request [2], [3] are write requests to update file system
+metadata (journal or whatever) and [barrier] is used to mark that
+those updates are valid. Consider the following sequence.
+
+ i. Requests [0] ~ [post] leaves the request queue and enters
+ low-level driver.
+ ii. After a while, unfortunately, something goes wrong and the
+ drive fails [2]. Note that any of [0], [1] and [3] could have
+ completed by this time, but [pre] couldn't have been finished
+ as the drive must process it in order and it failed before
+ processing that command.
+ iii. Error handling kicks in and determines that the error is
+ unrecoverable and fails [2], and resumes operation.
+ iv. [pre] [barrier] [post] gets processed.
+ v. *BOOM* power fails
+
+The problem here is that the barrier request is *supposed* to indicate
+that filesystem update requests [2] and [3] made it safely to the
+physical medium and, if the machine crashes after the barrier is
+written, filesystem recovery code can depend on that. Sadly, that
+isn't true in this case anymore. IOW, the success of a I/O barrier
+should also be dependent on success of some of the preceding requests,
+where only upper layer (filesystem) knows what 'some' is.
+
+This can be solved by implementing a way to tell the block layer which
+requests affect the success of the following barrier request and
+making lower lever drivers to resume operation on error only after
+block layer tells it to do so.
+
+As the probability of this happening is very low and the drive should
+be faulty, implementing the fix is probably an overkill. But, still,
+it's there.
+
+* In previous drafts of barrier implementation, there was fallback
+mechanism such that, if FUA or ordered TAG fails, less fancy ordered
+mode can be selected and the failed barrier request is retried
+automatically. The rationale for this feature was that as FUA is
+pretty new in ATA world and ordered tag was never used widely, there
+could be devices which report to support those features but choke when
+actually given such requests.
+
+ This was removed for two reasons 1. it's an overkill 2. it's
+impossible to implement properly when TAG ordering is used as low
+level drivers resume after an error automatically. If it's ever
+needed adding it back and modifying low level drivers accordingly
+shouldn't be difficult.
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
new file mode 100644
index 0000000..4dbb8be
--- /dev/null
+++ b/Documentation/block/biodoc.txt
@@ -0,0 +1,1215 @@
+ Notes on the Generic Block Layer Rewrite in Linux 2.5
+ =====================================================
+
+Notes Written on Jan 15, 2002:
+ Jens Axboe <jens.axboe@oracle.com>
+ Suparna Bhattacharya <suparna@in.ibm.com>
+
+Last Updated May 2, 2002
+September 2003: Updated I/O Scheduler portions
+ Nick Piggin <piggin@cyberone.com.au>
+
+Introduction:
+
+These are some notes describing some aspects of the 2.5 block layer in the
+context of the bio rewrite. The idea is to bring out some of the key
+changes and a glimpse of the rationale behind those changes.
+
+Please mail corrections & suggestions to suparna@in.ibm.com.
+
+Credits:
+---------
+
+2.5 bio rewrite:
+ Jens Axboe <jens.axboe@oracle.com>
+
+Many aspects of the generic block layer redesign were driven by and evolved
+over discussions, prior patches and the collective experience of several
+people. See sections 8 and 9 for a list of some related references.
+
+The following people helped with review comments and inputs for this
+document:
+ Christoph Hellwig <hch@infradead.org>
+ Arjan van de Ven <arjanv@redhat.com>
+ Randy Dunlap <rdunlap@xenotime.net>
+ Andre Hedrick <andre@linux-ide.org>
+
+The following people helped with fixes/contributions to the bio patches
+while it was still work-in-progress:
+ David S. Miller <davem@redhat.com>
+
+
+Description of Contents:
+------------------------
+
+1. Scope for tuning of logic to various needs
+ 1.1 Tuning based on device or low level driver capabilities
+ - Per-queue parameters
+ - Highmem I/O support
+ - I/O scheduler modularization
+ 1.2 Tuning based on high level requirements/capabilities
+ 1.2.1 I/O Barriers
+ 1.2.2 Request Priority/Latency
+ 1.3 Direct access/bypass to lower layers for diagnostics and special
+ device operations
+ 1.3.1 Pre-built commands
+2. New flexible and generic but minimalist i/o structure or descriptor
+ (instead of using buffer heads at the i/o layer)
+ 2.1 Requirements/Goals addressed
+ 2.2 The bio struct in detail (multi-page io unit)
+ 2.3 Changes in the request structure
+3. Using bios
+ 3.1 Setup/teardown (allocation, splitting)
+ 3.2 Generic bio helper routines
+ 3.2.1 Traversing segments and completion units in a request
+ 3.2.2 Setting up DMA scatterlists
+ 3.2.3 I/O completion
+ 3.2.4 Implications for drivers that do not interpret bios (don't handle
+ multiple segments)
+ 3.2.5 Request command tagging
+ 3.3 I/O submission
+4. The I/O scheduler
+5. Scalability related changes
+ 5.1 Granular locking: Removal of io_request_lock
+ 5.2 Prepare for transition to 64 bit sector_t
+6. Other Changes/Implications
+ 6.1 Partition re-mapping handled by the generic block layer
+7. A few tips on migration of older drivers
+8. A list of prior/related/impacted patches/ideas
+9. Other References/Discussion Threads
+
+---------------------------------------------------------------------------
+
+Bio Notes
+--------
+
+Let us discuss the changes in the context of how some overall goals for the
+block layer are addressed.
+
+1. Scope for tuning the generic logic to satisfy various requirements
+
+The block layer design supports adaptable abstractions to handle common
+processing with the ability to tune the logic to an appropriate extent
+depending on the nature of the device and the requirements of the caller.
+One of the objectives of the rewrite was to increase the degree of tunability
+and to enable higher level code to utilize underlying device/driver
+capabilities to the maximum extent for better i/o performance. This is
+important especially in the light of ever improving hardware capabilities
+and application/middleware software designed to take advantage of these
+capabilities.
+
+1.1 Tuning based on low level device / driver capabilities
+
+Sophisticated devices with large built-in caches, intelligent i/o scheduling
+optimizations, high memory DMA support, etc may find some of the
+generic processing an overhead, while for less capable devices the
+generic functionality is essential for performance or correctness reasons.
+Knowledge of some of the capabilities or parameters of the device should be
+used at the generic block layer to take the right decisions on
+behalf of the driver.
+
+How is this achieved ?
+
+Tuning at a per-queue level:
+
+i. Per-queue limits/values exported to the generic layer by the driver
+
+Various parameters that the generic i/o scheduler logic uses are set at
+a per-queue level (e.g maximum request size, maximum number of segments in
+a scatter-gather list, hardsect size)
+
+Some parameters that were earlier available as global arrays indexed by
+major/minor are now directly associated with the queue. Some of these may
+move into the block device structure in the future. Some characteristics
+have been incorporated into a queue flags field rather than separate fields
+in themselves. There are blk_queue_xxx functions to set the parameters,
+rather than update the fields directly
+
+Some new queue property settings:
+
+ blk_queue_bounce_limit(q, u64 dma_address)
+ Enable I/O to highmem pages, dma_address being the
+ limit. No highmem default.
+
+ blk_queue_max_sectors(q, max_sectors)
+ Sets two variables that limit the size of the request.
+
+ - The request queue's max_sectors, which is a soft size in
+ units of 512 byte sectors, and could be dynamically varied
+ by the core kernel.
+
+ - The request queue's max_hw_sectors, which is a hard limit
+ and reflects the maximum size request a driver can handle
+ in units of 512 byte sectors.
+
+ The default for both max_sectors and max_hw_sectors is
+ 255. The upper limit of max_sectors is 1024.
+
+ blk_queue_max_phys_segments(q, max_segments)
+ Maximum physical segments you can handle in a request. 128
+ default (driver limit). (See 3.2.2)
+
+ blk_queue_max_hw_segments(q, max_segments)
+ Maximum dma segments the hardware can handle in a request. 128
+ default (host adapter limit, after dma remapping).
+ (See 3.2.2)
+
+ blk_queue_max_segment_size(q, max_seg_size)
+ Maximum size of a clustered segment, 64kB default.
+
+ blk_queue_hardsect_size(q, hardsect_size)
+ Lowest possible sector size that the hardware can operate
+ on, 512 bytes default.
+
+New queue flags:
+
+ QUEUE_FLAG_CLUSTER (see 3.2.2)
+ QUEUE_FLAG_QUEUED (see 3.2.4)
+
+
+ii. High-mem i/o capabilities are now considered the default
+
+The generic bounce buffer logic, present in 2.4, where the block layer would
+by default copyin/out i/o requests on high-memory buffers to low-memory buffers
+assuming that the driver wouldn't be able to handle it directly, has been
+changed in 2.5. The bounce logic is now applied only for memory ranges
+for which the device cannot handle i/o. A driver can specify this by
+setting the queue bounce limit for the request queue for the device
+(blk_queue_bounce_limit()). This avoids the inefficiencies of the copyin/out
+where a device is capable of handling high memory i/o.
+
+In order to enable high-memory i/o where the device is capable of supporting
+it, the pci dma mapping routines and associated data structures have now been
+modified to accomplish a direct page -> bus translation, without requiring
+a virtual address mapping (unlike the earlier scheme of virtual address
+-> bus translation). So this works uniformly for high-memory pages (which
+do not have a corresponding kernel virtual address space mapping) and
+low-memory pages.
+
+Note: Please refer to DMA-mapping.txt for a discussion on PCI high mem DMA
+aspects and mapping of scatter gather lists, and support for 64 bit PCI.
+
+Special handling is required only for cases where i/o needs to happen on
+pages at physical memory addresses beyond what the device can support. In these
+cases, a bounce bio representing a buffer from the supported memory range
+is used for performing the i/o with copyin/copyout as needed depending on
+the type of the operation. For example, in case of a read operation, the
+data read has to be copied to the original buffer on i/o completion, so a
+callback routine is set up to do this, while for write, the data is copied
+from the original buffer to the bounce buffer prior to issuing the
+operation. Since an original buffer may be in a high memory area that's not
+mapped in kernel virtual addr, a kmap operation may be required for
+performing the copy, and special care may be needed in the completion path
+as it may not be in irq context. Special care is also required (by way of
+GFP flags) when allocating bounce buffers, to avoid certain highmem
+deadlock possibilities.
+
+It is also possible that a bounce buffer may be allocated from high-memory
+area that's not mapped in kernel virtual addr, but within the range that the
+device can use directly; so the bounce page may need to be kmapped during
+copy operations. [Note: This does not hold in the current implementation,
+though]
+
+There are some situations when pages from high memory may need to
+be kmapped, even if bounce buffers are not necessary. For example a device
+may need to abort DMA operations and revert to PIO for the transfer, in
+which case a virtual mapping of the page is required. For SCSI it is also
+done in some scenarios where the low level driver cannot be trusted to
+handle a single sg entry correctly. The driver is expected to perform the
+kmaps as needed on such occasions using the __bio_kmap_atomic and bio_kmap_irq
+routines as appropriate. A driver could also use the blk_queue_bounce()
+routine on its own to bounce highmem i/o to low memory for specific requests
+if so desired.
+
+iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
+
+As in 2.4, it is possible to plugin a brand new i/o scheduler for a particular
+queue or pick from (copy) existing generic schedulers and replace/override
+certain portions of it. The 2.5 rewrite provides improved modularization
+of the i/o scheduler. There are more pluggable callbacks, e.g for init,
+add request, extract request, which makes it possible to abstract specific
+i/o scheduling algorithm aspects and details outside of the generic loop.
+It also makes it possible to completely hide the implementation details of
+the i/o scheduler from block drivers.
+
+I/O scheduler wrappers are to be used instead of accessing the queue directly.
+See section 4. The I/O scheduler for details.
+
+1.2 Tuning Based on High level code capabilities
+
+i. Application capabilities for raw i/o
+
+This comes from some of the high-performance database/middleware
+requirements where an application prefers to make its own i/o scheduling
+decisions based on an understanding of the access patterns and i/o
+characteristics
+
+ii. High performance filesystems or other higher level kernel code's
+capabilities
+
+Kernel components like filesystems could also take their own i/o scheduling
+decisions for optimizing performance. Journalling filesystems may need
+some control over i/o ordering.
+
+What kind of support exists at the generic block layer for this ?
+
+The flags and rw fields in the bio structure can be used for some tuning
+from above e.g indicating that an i/o is just a readahead request, or for
+marking barrier requests (discussed next), or priority settings (currently
+unused). As far as user applications are concerned they would need an
+additional mechanism either via open flags or ioctls, or some other upper
+level mechanism to communicate such settings to block.
+
+1.2.1 I/O Barriers
+
+There is a way to enforce strict ordering for i/os through barriers.
+All requests before a barrier point must be serviced before the barrier
+request and any other requests arriving after the barrier will not be
+serviced until after the barrier has completed. This is useful for higher
+level control on write ordering, e.g flushing a log of committed updates
+to disk before the corresponding updates themselves.
+
+A flag in the bio structure, BIO_BARRIER is used to identify a barrier i/o.
+The generic i/o scheduler would make sure that it places the barrier request and
+all other requests coming after it after all the previous requests in the
+queue. Barriers may be implemented in different ways depending on the
+driver. For more details regarding I/O barriers, please read barrier.txt
+in this directory.
+
+1.2.2 Request Priority/Latency
+
+Todo/Under discussion:
+Arjan's proposed request priority scheme allows higher levels some broad
+ control (high/med/low) over the priority of an i/o request vs other pending
+ requests in the queue. For example it allows reads for bringing in an
+ executable page on demand to be given a higher priority over pending write
+ requests which haven't aged too much on the queue. Potentially this priority
+ could even be exposed to applications in some manner, providing higher level
+ tunability. Time based aging avoids starvation of lower priority
+ requests. Some bits in the bi_rw flags field in the bio structure are
+ intended to be used for this priority information.
+
+
+1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode)
+ (e.g Diagnostics, Systems Management)
+
+There are situations where high-level code needs to have direct access to
+the low level device capabilities or requires the ability to issue commands
+to the device bypassing some of the intermediate i/o layers.
+These could, for example, be special control commands issued through ioctl
+interfaces, or could be raw read/write commands that stress the drive's
+capabilities for certain kinds of fitness tests. Having direct interfaces at
+multiple levels without having to pass through upper layers makes
+it possible to perform bottom up validation of the i/o path, layer by
+layer, starting from the media.
+
+The normal i/o submission interfaces, e.g submit_bio, could be bypassed
+for specially crafted requests which such ioctl or diagnostics
+interfaces would typically use, and the elevator add_request routine
+can instead be used to directly insert such requests in the queue or preferably
+the blk_do_rq routine can be used to place the request on the queue and
+wait for completion. Alternatively, sometimes the caller might just
+invoke a lower level driver specific interface with the request as a
+parameter.
+
+If the request is a means for passing on special information associated with
+the command, then such information is associated with the request->special
+field (rather than misuse the request->buffer field which is meant for the
+request data buffer's virtual mapping).
+
+For passing request data, the caller must build up a bio descriptor
+representing the concerned memory buffer if the underlying driver interprets
+bio segments or uses the block layer end*request* functions for i/o
+completion. Alternatively one could directly use the request->buffer field to
+specify the virtual address of the buffer, if the driver expects buffer
+addresses passed in this way and ignores bio entries for the request type
+involved. In the latter case, the driver would modify and manage the
+request->buffer, request->sector and request->nr_sectors or
+request->current_nr_sectors fields itself rather than using the block layer
+end_request or end_that_request_first completion interfaces.
+(See 2.3 or Documentation/block/request.txt for a brief explanation of
+the request structure fields)
+
+[TBD: end_that_request_last should be usable even in this case;
+Perhaps an end_that_direct_request_first routine could be implemented to make
+handling direct requests easier for such drivers; Also for drivers that
+expect bios, a helper function could be provided for setting up a bio
+corresponding to a data buffer]
+
+<JENS: I dont understand the above, why is end_that_request_first() not
+usable? Or _last for that matter. I must be missing something>
+<SUP: What I meant here was that if the request doesn't have a bio, then
+ end_that_request_first doesn't modify nr_sectors or current_nr_sectors,
+ and hence can't be used for advancing request state settings on the
+ completion of partial transfers. The driver has to modify these fields
+ directly by hand.
+ This is because end_that_request_first only iterates over the bio list,
+ and always returns 0 if there are none associated with the request.
+ _last works OK in this case, and is not a problem, as I mentioned earlier
+>
+
+1.3.1 Pre-built Commands
+
+A request can be created with a pre-built custom command to be sent directly
+to the device. The cmd block in the request structure has room for filling
+in the command bytes. (i.e rq->cmd is now 16 bytes in size, and meant for
+command pre-building, and the type of the request is now indicated
+through rq->flags instead of via rq->cmd)
+
+The request structure flags can be set up to indicate the type of request
+in such cases (REQ_PC: direct packet command passed to driver, REQ_BLOCK_PC:
+packet command issued via blk_do_rq, REQ_SPECIAL: special request).
+
+It can help to pre-build device commands for requests in advance.
+Drivers can now specify a request prepare function (q->prep_rq_fn) that the
+block layer would invoke to pre-build device commands for a given request,
+or perform other preparatory processing for the request. This is routine is
+called by elv_next_request(), i.e. typically just before servicing a request.
+(The prepare function would not be called for requests that have REQ_DONTPREP
+enabled)
+
+Aside:
+ Pre-building could possibly even be done early, i.e before placing the
+ request on the queue, rather than construct the command on the fly in the
+ driver while servicing the request queue when it may affect latencies in
+ interrupt context or responsiveness in general. One way to add early
+ pre-building would be to do it whenever we fail to merge on a request.
+ Now REQ_NOMERGE is set in the request flags to skip this one in the future,
+ which means that it will not change before we feed it to the device. So
+ the pre-builder hook can be invoked there.
+
+
+2. Flexible and generic but minimalist i/o structure/descriptor.
+
+2.1 Reason for a new structure and requirements addressed
+
+Prior to 2.5, buffer heads were used as the unit of i/o at the generic block
+layer, and the low level request structure was associated with a chain of
+buffer heads for a contiguous i/o request. This led to certain inefficiencies
+when it came to large i/o requests and readv/writev style operations, as it
+forced such requests to be broken up into small chunks before being passed
+on to the generic block layer, only to be merged by the i/o scheduler
+when the underlying device was capable of handling the i/o in one shot.
+Also, using the buffer head as an i/o structure for i/os that didn't originate
+from the buffer cache unnecessarily added to the weight of the descriptors
+which were generated for each such chunk.
+
+The following were some of the goals and expectations considered in the
+redesign of the block i/o data structure in 2.5.
+
+i. Should be appropriate as a descriptor for both raw and buffered i/o -
+ avoid cache related fields which are irrelevant in the direct/page i/o path,
+ or filesystem block size alignment restrictions which may not be relevant
+ for raw i/o.
+ii. Ability to represent high-memory buffers (which do not have a virtual
+ address mapping in kernel address space).
+iii.Ability to represent large i/os w/o unnecessarily breaking them up (i.e
+ greater than PAGE_SIZE chunks in one shot)
+iv. At the same time, ability to retain independent identity of i/os from
+ different sources or i/o units requiring individual completion (e.g. for
+ latency reasons)
+v. Ability to represent an i/o involving multiple physical memory segments
+ (including non-page aligned page fragments, as specified via readv/writev)
+ without unnecessarily breaking it up, if the underlying device is capable of
+ handling it.
+vi. Preferably should be based on a memory descriptor structure that can be
+ passed around different types of subsystems or layers, maybe even
+ networking, without duplication or extra copies of data/descriptor fields
+ themselves in the process
+vii.Ability to handle the possibility of splits/merges as the structure passes
+ through layered drivers (lvm, md, evms), with minimal overhead.
+
+The solution was to define a new structure (bio) for the block layer,
+instead of using the buffer head structure (bh) directly, the idea being
+avoidance of some associated baggage and limitations. The bio structure
+is uniformly used for all i/o at the block layer ; it forms a part of the
+bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are
+mapped to bio structures.
+
+2.2 The bio struct
+
+The bio structure uses a vector representation pointing to an array of tuples
+of <page, offset, len> to describe the i/o buffer, and has various other
+fields describing i/o parameters and state that needs to be maintained for
+performing the i/o.
+
+Notice that this representation means that a bio has no virtual address
+mapping at all (unlike buffer heads).
+
+struct bio_vec {
+ struct page *bv_page;
+ unsigned short bv_len;
+ unsigned short bv_offset;
+};
+
+/*
+ * main unit of I/O for the block layer and lower layers (ie drivers)
+ */
+struct bio {
+ sector_t bi_sector;
+ struct bio *bi_next; /* request queue link */
+ struct block_device *bi_bdev; /* target device */
+ unsigned long bi_flags; /* status, command, etc */
+ unsigned long bi_rw; /* low bits: r/w, high: priority */
+
+ unsigned int bi_vcnt; /* how may bio_vec's */
+ unsigned int bi_idx; /* current index into bio_vec array */
+
+ unsigned int bi_size; /* total size in bytes */
+ unsigned short bi_phys_segments; /* segments after physaddr coalesce*/
+ unsigned short bi_hw_segments; /* segments after DMA remapping */
+ unsigned int bi_max; /* max bio_vecs we can hold
+ used as index into pool */
+ struct bio_vec *bi_io_vec; /* the actual vec list */
+ bio_end_io_t *bi_end_io; /* bi_end_io (bio) */
+ atomic_t bi_cnt; /* pin count: free when it hits zero */
+ void *bi_private;
+ bio_destructor_t *bi_destructor; /* bi_destructor (bio) */
+};
+
+With this multipage bio design:
+
+- Large i/os can be sent down in one go using a bio_vec list consisting
+ of an array of <page, offset, len> fragments (similar to the way fragments
+ are represented in the zero-copy network code)
+- Splitting of an i/o request across multiple devices (as in the case of
+ lvm or raid) is achieved by cloning the bio (where the clone points to
+ the same bi_io_vec array, but with the index and size accordingly modified)
+- A linked list of bios is used as before for unrelated merges (*) - this
+ avoids reallocs and makes independent completions easier to handle.
+- Code that traverses the req list can find all the segments of a bio
+ by using rq_for_each_segment. This handles the fact that a request
+ has multiple bios, each of which can have multiple segments.
+- Drivers which can't process a large bio in one shot can use the bi_idx
+ field to keep track of the next bio_vec entry to process.
+ (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE)
+ [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying
+ bi_offset an len fields]
+
+(*) unrelated merges -- a request ends up containing two or more bios that
+ didn't originate from the same place.
+
+bi_end_io() i/o callback gets called on i/o completion of the entire bio.
+
+At a lower level, drivers build a scatter gather list from the merged bios.
+The scatter gather list is in the form of an array of <page, offset, len>
+entries with their corresponding dma address mappings filled in at the
+appropriate time. As an optimization, contiguous physical pages can be
+covered by a single entry where <page> refers to the first page and <len>
+covers the range of pages (upto 16 contiguous pages could be covered this
+way). There is a helper routine (blk_rq_map_sg) which drivers can use to build
+the sg list.
+
+Note: Right now the only user of bios with more than one page is ll_rw_kio,
+which in turn means that only raw I/O uses it (direct i/o may not work
+right now). The intent however is to enable clustering of pages etc to
+become possible. The pagebuf abstraction layer from SGI also uses multi-page
+bios, but that is currently not included in the stock development kernels.
+The same is true of Andrew Morton's work-in-progress multipage bio writeout
+and readahead patches.
+
+2.3 Changes in the Request Structure
+
+The request structure is the structure that gets passed down to low level
+drivers. The block layer make_request function builds up a request structure,
+places it on the queue and invokes the drivers request_fn. The driver makes
+use of block layer helper routine elv_next_request to pull the next request
+off the queue. Control or diagnostic functions might bypass block and directly
+invoke underlying driver entry points passing in a specially constructed
+request structure.
+
+Only some relevant fields (mainly those which changed or may be referred
+to in some of the discussion here) are listed below, not necessarily in
+the order in which they occur in the structure (see include/linux/blkdev.h)
+Refer to Documentation/block/request.txt for details about all the request
+structure fields and a quick reference about the layers which are
+supposed to use or modify those fields.
+
+struct request {
+ struct list_head queuelist; /* Not meant to be directly accessed by
+ the driver.
+ Used by q->elv_next_request_fn
+ rq->queue is gone
+ */
+ .
+ .
+ unsigned char cmd[16]; /* prebuilt command data block */
+ unsigned long flags; /* also includes earlier rq->cmd settings */
+ .
+ .
+ sector_t sector; /* this field is now of type sector_t instead of int
+ preparation for 64 bit sectors */
+ .
+ .
+
+ /* Number of scatter-gather DMA addr+len pairs after
+ * physical address coalescing is performed.
+ */
+ unsigned short nr_phys_segments;
+
+ /* Number of scatter-gather addr+len pairs after
+ * physical and DMA remapping hardware coalescing is performed.
+ * This is the number of scatter-gather entries the driver
+ * will actually have to deal with after DMA mapping is done.
+ */
+ unsigned short nr_hw_segments;
+
+ /* Various sector counts */
+ unsigned long nr_sectors; /* no. of sectors left: driver modifiable */
+ unsigned long hard_nr_sectors; /* block internal copy of above */
+ unsigned int current_nr_sectors; /* no. of sectors left in the
+ current segment:driver modifiable */
+ unsigned long hard_cur_sectors; /* block internal copy of the above */
+ .
+ .
+ int tag; /* command tag associated with request */
+ void *special; /* same as before */
+ char *buffer; /* valid only for low memory buffers upto
+ current_nr_sectors */
+ .
+ .
+ struct bio *bio, *biotail; /* bio list instead of bh */
+ struct request_list *rl;
+}
+
+See the rq_flag_bits definitions for an explanation of the various flags
+available. Some bits are used by the block layer or i/o scheduler.
+
+The behaviour of the various sector counts are almost the same as before,
+except that since we have multi-segment bios, current_nr_sectors refers
+to the numbers of sectors in the current segment being processed which could
+be one of the many segments in the current bio (i.e i/o completion unit).
+The nr_sectors value refers to the total number of sectors in the whole
+request that remain to be transferred (no change). The purpose of the
+hard_xxx values is for block to remember these counts every time it hands
+over the request to the driver. These values are updated by block on
+end_that_request_first, i.e. every time the driver completes a part of the
+transfer and invokes block end*request helpers to mark this. The
+driver should not modify these values. The block layer sets up the
+nr_sectors and current_nr_sectors fields (based on the corresponding
+hard_xxx values and the number of bytes transferred) and updates it on
+every transfer that invokes end_that_request_first. It does the same for the
+buffer, bio, bio->bi_idx fields too.
+
+The buffer field is just a virtual address mapping of the current segment
+of the i/o buffer in cases where the buffer resides in low-memory. For high
+memory i/o, this field is not valid and must not be used by drivers.
+
+Code that sets up its own request structures and passes them down to
+a driver needs to be careful about interoperation with the block layer helper
+functions which the driver uses. (Section 1.3)
+
+3. Using bios
+
+3.1 Setup/Teardown
+
+There are routines for managing the allocation, and reference counting, and
+freeing of bios (bio_alloc, bio_get, bio_put).
+
+This makes use of Ingo Molnar's mempool implementation, which enables
+subsystems like bio to maintain their own reserve memory pools for guaranteed
+deadlock-free allocations during extreme VM load. For example, the VM
+subsystem makes use of the block layer to writeout dirty pages in order to be
+able to free up memory space, a case which needs careful handling. The
+allocation logic draws from the preallocated emergency reserve in situations
+where it cannot allocate through normal means. If the pool is empty and it
+can wait, then it would trigger action that would help free up memory or
+replenish the pool (without deadlocking) and wait for availability in the pool.
+If it is in IRQ context, and hence not in a position to do this, allocation
+could fail if the pool is empty. In general mempool always first tries to
+perform allocation without having to wait, even if it means digging into the
+pool as long it is not less that 50% full.
+
+On a free, memory is released to the pool or directly freed depending on
+the current availability in the pool. The mempool interface lets the
+subsystem specify the routines to be used for normal alloc and free. In the
+case of bio, these routines make use of the standard slab allocator.
+
+The caller of bio_alloc is expected to taken certain steps to avoid
+deadlocks, e.g. avoid trying to allocate more memory from the pool while
+already holding memory obtained from the pool.
+[TBD: This is a potential issue, though a rare possibility
+ in the bounce bio allocation that happens in the current code, since
+ it ends up allocating a second bio from the same pool while
+ holding the original bio ]
+
+Memory allocated from the pool should be released back within a limited
+amount of time (in the case of bio, that would be after the i/o is completed).
+This ensures that if part of the pool has been used up, some work (in this
+case i/o) must already be in progress and memory would be available when it
+is over. If allocating from multiple pools in the same code path, the order
+or hierarchy of allocation needs to be consistent, just the way one deals
+with multiple locks.
+
+The bio_alloc routine also needs to allocate the bio_vec_list (bvec_alloc())
+for a non-clone bio. There are the 6 pools setup for different size biovecs,
+so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
+given size from these slabs.
+
+The bi_destructor() routine takes into account the possibility of the bio
+having originated from a different source (see later discussions on
+n/w to block transfers and kvec_cb)
+
+The bio_get() routine may be used to hold an extra reference on a bio prior
+to i/o submission, if the bio fields are likely to be accessed after the
+i/o is issued (since the bio may otherwise get freed in case i/o completion
+happens in the meantime).
+
+The bio_clone() routine may be used to duplicate a bio, where the clone
+shares the bio_vec_list with the original bio (i.e. both point to the
+same bio_vec_list). This would typically be used for splitting i/o requests
+in lvm or md.
+
+3.2 Generic bio helper Routines
+
+3.2.1 Traversing segments and completion units in a request
+
+The macro rq_for_each_segment() should be used for traversing the bios
+in the request list (drivers should avoid directly trying to do it
+themselves). Using these helpers should also make it easier to cope
+with block changes in the future.
+
+ struct req_iterator iter;
+ rq_for_each_segment(bio_vec, rq, iter)
+ /* bio_vec is now current segment */
+
+I/O completion callbacks are per-bio rather than per-segment, so drivers
+that traverse bio chains on completion need to keep that in mind. Drivers
+which don't make a distinction between segments and completion units would
+need to be reorganized to support multi-segment bios.
+
+3.2.2 Setting up DMA scatterlists
+
+The blk_rq_map_sg() helper routine would be used for setting up scatter
+gather lists from a request, so a driver need not do it on its own.
+
+ nr_segments = blk_rq_map_sg(q, rq, scatterlist);
+
+The helper routine provides a level of abstraction which makes it easier
+to modify the internals of request to scatterlist conversion down the line
+without breaking drivers. The blk_rq_map_sg routine takes care of several
+things like collapsing physically contiguous segments (if QUEUE_FLAG_CLUSTER
+is set) and correct segment accounting to avoid exceeding the limits which
+the i/o hardware can handle, based on various queue properties.
+
+- Prevents a clustered segment from crossing a 4GB mem boundary
+- Avoids building segments that would exceed the number of physical
+ memory segments that the driver can handle (phys_segments) and the
+ number that the underlying hardware can handle at once, accounting for
+ DMA remapping (hw_segments) (i.e. IOMMU aware limits).
+
+Routines which the low level driver can use to set up the segment limits:
+
+blk_queue_max_hw_segments() : Sets an upper limit of the maximum number of
+hw data segments in a request (i.e. the maximum number of address/length
+pairs the host adapter can actually hand to the device at once)
+
+blk_queue_max_phys_segments() : Sets an upper limit on the maximum number
+of physical data segments in a request (i.e. the largest sized scatter list
+a driver could handle)
+
+3.2.3 I/O completion
+
+The existing generic block layer helper routines end_request,
+end_that_request_first and end_that_request_last can be used for i/o
+completion (and setting things up so the rest of the i/o or the next
+request can be kicked of) as before. With the introduction of multi-page
+bio support, end_that_request_first requires an additional argument indicating
+the number of sectors completed.
+
+3.2.4 Implications for drivers that do not interpret bios (don't handle
+ multiple segments)
+
+Drivers that do not interpret bios e.g those which do not handle multiple
+segments and do not support i/o into high memory addresses (require bounce
+buffers) and expect only virtually mapped buffers, can access the rq->buffer
+field. As before the driver should use current_nr_sectors to determine the
+size of remaining data in the current segment (that is the maximum it can
+transfer in one go unless it interprets segments), and rely on the block layer
+end_request, or end_that_request_first/last to take care of all accounting
+and transparent mapping of the next bio segment when a segment boundary
+is crossed on completion of a transfer. (The end*request* functions should
+be used if only if the request has come down from block/bio path, not for
+direct access requests which only specify rq->buffer without a valid rq->bio)
+
+3.2.5 Generic request command tagging
+
+3.2.5.1 Tag helpers
+
+Block now offers some simple generic functionality to help support command
+queueing (typically known as tagged command queueing), ie manage more than
+one outstanding command on a queue at any given time.
+
+ blk_queue_init_tags(struct request_queue *q, int depth)
+
+ Initialize internal command tagging structures for a maximum
+ depth of 'depth'.
+
+ blk_queue_free_tags((struct request_queue *q)
+
+ Teardown tag info associated with the queue. This will be done
+ automatically by block if blk_queue_cleanup() is called on a queue
+ that is using tagging.
+
+The above are initialization and exit management, the main helpers during
+normal operations are:
+
+ blk_queue_start_tag(struct request_queue *q, struct request *rq)
+
+ Start tagged operation for this request. A free tag number between
+ 0 and 'depth' is assigned to the request (rq->tag holds this number),
+ and 'rq' is added to the internal tag management. If the maximum depth
+ for this queue is already achieved (or if the tag wasn't started for
+ some other reason), 1 is returned. Otherwise 0 is returned.
+
+ blk_queue_end_tag(struct request_queue *q, struct request *rq)
+
+ End tagged operation on this request. 'rq' is removed from the internal
+ book keeping structures.
+
+To minimize struct request and queue overhead, the tag helpers utilize some
+of the same request members that are used for normal request queue management.
+This means that a request cannot both be an active tag and be on the queue
+list at the same time. blk_queue_start_tag() will remove the request, but
+the driver must remember to call blk_queue_end_tag() before signalling
+completion of the request to the block layer. This means ending tag
+operations before calling end_that_request_last()! For an example of a user
+of these helpers, see the IDE tagged command queueing support.
+
+Certain hardware conditions may dictate a need to invalidate the block tag
+queue. For instance, on IDE any tagged request error needs to clear both
+the hardware and software block queue and enable the driver to sanely restart
+all the outstanding requests. There's a third helper to do that:
+
+ blk_queue_invalidate_tags(struct request_queue *q)
+
+ Clear the internal block tag queue and re-add all the pending requests
+ to the request queue. The driver will receive them again on the
+ next request_fn run, just like it did the first time it encountered
+ them.
+
+3.2.5.2 Tag info
+
+Some block functions exist to query current tag status or to go from a
+tag number to the associated request. These are, in no particular order:
+
+ blk_queue_tagged(q)
+
+ Returns 1 if the queue 'q' is using tagging, 0 if not.
+
+ blk_queue_tag_request(q, tag)
+
+ Returns a pointer to the request associated with tag 'tag'.
+
+ blk_queue_tag_depth(q)
+
+ Return current queue depth.
+
+ blk_queue_tag_queue(q)
+
+ Returns 1 if the queue can accept a new queued command, 0 if we are
+ at the maximum depth already.
+
+ blk_queue_rq_tagged(rq)
+
+ Returns 1 if the request 'rq' is tagged.
+
+3.2.5.2 Internal structure
+
+Internally, block manages tags in the blk_queue_tag structure:
+
+ struct blk_queue_tag {
+ struct request **tag_index; /* array or pointers to rq */
+ unsigned long *tag_map; /* bitmap of free tags */
+ struct list_head busy_list; /* fifo list of busy tags */
+ int busy; /* queue depth */
+ int max_depth; /* max queue depth */
+ };
+
+Most of the above is simple and straight forward, however busy_list may need
+a bit of explaining. Normally we don't care too much about request ordering,
+but in the event of any barrier requests in the tag queue we need to ensure
+that requests are restarted in the order they were queue. This may happen
+if the driver needs to use blk_queue_invalidate_tags().
+
+Tagging also defines a new request flag, REQ_QUEUED. This is set whenever
+a request is currently tagged. You should not use this flag directly,
+blk_rq_tagged(rq) is the portable way to do so.
+
+3.3 I/O Submission
+
+The routine submit_bio() is used to submit a single io. Higher level i/o
+routines make use of this:
+
+(a) Buffered i/o:
+The routine submit_bh() invokes submit_bio() on a bio corresponding to the
+bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before.
+
+(b) Kiobuf i/o (for raw/direct i/o):
+The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and
+maps the array to one or more multi-page bios, issuing submit_bio() to
+perform the i/o on each of these.
+
+The embedded bh array in the kiobuf structure has been removed and no
+preallocation of bios is done for kiobufs. [The intent is to remove the
+blocks array as well, but it's currently in there to kludge around direct i/o.]
+Thus kiobuf allocation has switched back to using kmalloc rather than vmalloc.
+
+Todo/Observation:
+
+ A single kiobuf structure is assumed to correspond to a contiguous range
+ of data, so brw_kiovec() invokes ll_rw_kio for each kiobuf in a kiovec.
+ So right now it wouldn't work for direct i/o on non-contiguous blocks.
+ This is to be resolved. The eventual direction is to replace kiobuf
+ by kvec's.
+
+ Badari Pulavarty has a patch to implement direct i/o correctly using
+ bio and kvec.
+
+
+(c) Page i/o:
+Todo/Under discussion:
+
+ Andrew Morton's multi-page bio patches attempt to issue multi-page
+ writeouts (and reads) from the page cache, by directly building up
+ large bios for submission completely bypassing the usage of buffer
+ heads. This work is still in progress.
+
+ Christoph Hellwig had some code that uses bios for page-io (rather than
+ bh). This isn't included in bio as yet. Christoph was also working on a
+ design for representing virtual/real extents as an entity and modifying
+ some of the address space ops interfaces to utilize this abstraction rather
+ than buffer_heads. (This is somewhat along the lines of the SGI XFS pagebuf
+ abstraction, but intended to be as lightweight as possible).
+
+(d) Direct access i/o:
+Direct access requests that do not contain bios would be submitted differently
+as discussed earlier in section 1.3.
+
+Aside:
+
+ Kvec i/o:
+
+ Ben LaHaise's aio code uses a slightly different structure instead
+ of kiobufs, called a kvec_cb. This contains an array of <page, offset, len>
+ tuples (very much like the networking code), together with a callback function
+ and data pointer. This is embedded into a brw_cb structure when passed
+ to brw_kvec_async().
+
+ Now it should be possible to directly map these kvecs to a bio. Just as while
+ cloning, in this case rather than PRE_BUILT bio_vecs, we set the bi_io_vec
+ array pointer to point to the veclet array in kvecs.
+
+ TBD: In order for this to work, some changes are needed in the way multi-page
+ bios are handled today. The values of the tuples in such a vector passed in
+ from higher level code should not be modified by the block layer in the course
+ of its request processing, since that would make it hard for the higher layer
+ to continue to use the vector descriptor (kvec) after i/o completes. Instead,
+ all such transient state should either be maintained in the request structure,
+ and passed on in some way to the endio completion routine.
+
+
+4. The I/O scheduler
+I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch
+queue and specific I/O schedulers. Unless stated otherwise, elevator is used
+to refer to both parts and I/O scheduler to specific I/O schedulers.
+
+Block layer implements generic dispatch queue in ll_rw_blk.c and elevator.c.
+The generic dispatch queue is responsible for properly ordering barrier
+requests, requeueing, handling non-fs requests and all other subtleties.
+
+Specific I/O schedulers are responsible for ordering normal filesystem
+requests. They can also choose to delay certain requests to improve
+throughput or whatever purpose. As the plural form indicates, there are
+multiple I/O schedulers. They can be built as modules but at least one should
+be built inside the kernel. Each queue can choose different one and can also
+change to another one dynamically.
+
+A block layer call to the i/o scheduler follows the convention elv_xxx(). This
+calls elevator_xxx_fn in the elevator switch (drivers/block/elevator.c). Oh,
+xxx and xxx might not match exactly, but use your imagination. If an elevator
+doesn't implement a function, the switch does nothing or some minimal house
+keeping work.
+
+4.1. I/O scheduler API
+
+The functions an elevator may implement are: (* are mandatory)
+elevator_merge_fn called to query requests for merge with a bio
+
+elevator_merge_req_fn called when two requests get merged. the one
+ which gets merged into the other one will be
+ never seen by I/O scheduler again. IOW, after
+ being merged, the request is gone.
+
+elevator_merged_fn called when a request in the scheduler has been
+ involved in a merge. It is used in the deadline
+ scheduler for example, to reposition the request
+ if its sorting order has changed.
+
+elevator_allow_merge_fn called whenever the block layer determines
+ that a bio can be merged into an existing
+ request safely. The io scheduler may still
+ want to stop a merge at this point if it
+ results in some sort of conflict internally,
+ this hook allows it to do that.
+
+elevator_dispatch_fn fills the dispatch queue with ready requests.
+ I/O schedulers are free to postpone requests by
+ not filling the dispatch queue unless @force
+ is non-zero. Once dispatched, I/O schedulers
+ are not allowed to manipulate the requests -
+ they belong to generic dispatch queue.
+
+elevator_add_req_fn called to add a new request into the scheduler
+
+elevator_queue_empty_fn returns true if the merge queue is empty.
+ Drivers shouldn't use this, but rather check
+ if elv_next_request is NULL (without losing the
+ request if one exists!)
+
+elevator_former_req_fn
+elevator_latter_req_fn These return the request before or after the
+ one specified in disk sort order. Used by the
+ block layer to find merge possibilities.
+
+elevator_completed_req_fn called when a request is completed.
+
+elevator_may_queue_fn returns true if the scheduler wants to allow the
+ current context to queue a new request even if
+ it is over the queue limit. This must be used
+ very carefully!!
+
+elevator_set_req_fn
+elevator_put_req_fn Must be used to allocate and free any elevator
+ specific storage for a request.
+
+elevator_activate_req_fn Called when device driver first sees a request.
+ I/O schedulers can use this callback to
+ determine when actual execution of a request
+ starts.
+elevator_deactivate_req_fn Called when device driver decides to delay
+ a request by requeueing it.
+
+elevator_init_fn
+elevator_exit_fn Allocate and free any elevator specific storage
+ for a queue.
+
+4.2 Request flows seen by I/O schedulers
+All requests seen by I/O schedulers strictly follow one of the following three
+flows.
+
+ set_req_fn ->
+
+ i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn ->
+ (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
+ ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn
+ iii. [none]
+
+ -> put_req_fn
+
+4.3 I/O scheduler implementation
+The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
+optimal disk scan and request servicing performance (based on generic
+principles and device capabilities), optimized for:
+i. improved throughput
+ii. improved latency
+iii. better utilization of h/w & CPU time
+
+Characteristics:
+
+i. Binary tree
+AS and deadline i/o schedulers use red black binary trees for disk position
+sorting and searching, and a fifo linked list for time-based searching. This
+gives good scalability and good availability of information. Requests are
+almost always dispatched in disk sort order, so a cache is kept of the next
+request in sort order to prevent binary tree lookups.
+
+This arrangement is not a generic block layer characteristic however, so
+elevators may implement queues as they please.
+
+ii. Merge hash
+AS and deadline use a hash table indexed by the last sector of a request. This
+enables merging code to quickly look up "back merge" candidates, even when
+multiple I/O streams are being performed at once on one disk.
+
+"Front merges", a new request being merged at the front of an existing request,
+are far less common than "back merges" due to the nature of most I/O patterns.
+Front merges are handled by the binary trees in AS and deadline schedulers.
+
+iii. Plugging the queue to batch requests in anticipation of opportunities for
+ merge/sort optimizations
+
+This is just the same as in 2.4 so far, though per-device unplugging
+support is anticipated for 2.5. Also with a priority-based i/o scheduler,
+such decisions could be based on request priorities.
+
+Plugging is an approach that the current i/o scheduling algorithm resorts to so
+that it collects up enough requests in the queue to be able to take
+advantage of the sorting/merging logic in the elevator. If the
+queue is empty when a request comes in, then it plugs the request queue
+(sort of like plugging the bottom of a vessel to get fluid to build up)
+till it fills up with a few more requests, before starting to service
+the requests. This provides an opportunity to merge/sort the requests before
+passing them down to the device. There are various conditions when the queue is
+unplugged (to open up the flow again), either through a scheduled task or
+could be on demand. For example wait_on_buffer sets the unplugging going
+(by running tq_disk) so the read gets satisfied soon. So in the read case,
+the queue gets explicitly unplugged as part of waiting for completion,
+in fact all queues get unplugged as a side-effect.
+
+Aside:
+ This is kind of controversial territory, as it's not clear if plugging is
+ always the right thing to do. Devices typically have their own queues,
+ and allowing a big queue to build up in software, while letting the device be
+ idle for a while may not always make sense. The trick is to handle the fine
+ balance between when to plug and when to open up. Also now that we have
+ multi-page bios being queued in one shot, we may not need to wait to merge
+ a big request from the broken up pieces coming by.
+
+ Per-queue granularity unplugging (still a Todo) may help reduce some of the
+ concerns with just a single tq_disk flush approach. Something like
+ blk_kick_queue() to unplug a specific queue (right away ?)
+ or optionally, all queues, is in the plan.
+
+4.4 I/O contexts
+I/O contexts provide a dynamically allocated per process data area. They may
+be used in I/O schedulers, and in the block layer (could be used for IO statis,
+priorities for example). See *io_context in block/ll_rw_blk.c, and as-iosched.c
+for an example of usage in an i/o scheduler.
+
+
+5. Scalability related changes
+
+5.1 Granular Locking: io_request_lock replaced by a per-queue lock
+
+The global io_request_lock has been removed as of 2.5, to avoid
+the scalability bottleneck it was causing, and has been replaced by more
+granular locking. The request queue structure has a pointer to the
+lock to be used for that queue. As a result, locking can now be
+per-queue, with a provision for sharing a lock across queues if
+necessary (e.g the scsi layer sets the queue lock pointers to the
+corresponding adapter lock, which results in a per host locking
+granularity). The locking semantics are the same, i.e. locking is
+still imposed by the block layer, grabbing the lock before
+request_fn execution which it means that lots of older drivers
+should still be SMP safe. Drivers are free to drop the queue
+lock themselves, if required. Drivers that explicitly used the
+io_request_lock for serialization need to be modified accordingly.
+Usually it's as easy as adding a global lock:
+
+ static DEFINE_SPINLOCK(my_driver_lock);
+
+and passing the address to that lock to blk_init_queue().
+
+5.2 64 bit sector numbers (sector_t prepares for 64 bit support)
+
+The sector number used in the bio structure has been changed to sector_t,
+which could be defined as 64 bit in preparation for 64 bit sector support.
+
+6. Other Changes/Implications
+
+6.1 Partition re-mapping handled by the generic block layer
+
+In 2.5 some of the gendisk/partition related code has been reorganized.
+Now the generic block layer performs partition-remapping early and thus
+provides drivers with a sector number relative to whole device, rather than
+having to take partition number into account in order to arrive at the true
+sector number. The routine blk_partition_remap() is invoked by
+generic_make_request even before invoking the queue specific make_request_fn,
+so the i/o scheduler also gets to operate on whole disk sector numbers. This
+should typically not require changes to block drivers, it just never gets
+to invoke its own partition sector offset calculations since all bios
+sent are offset from the beginning of the device.
+
+
+7. A Few Tips on Migration of older drivers
+
+Old-style drivers that just use CURRENT and ignores clustered requests,
+may not need much change. The generic layer will automatically handle
+clustered requests, multi-page bios, etc for the driver.
+
+For a low performance driver or hardware that is PIO driven or just doesn't
+support scatter-gather changes should be minimal too.
+
+The following are some points to keep in mind when converting old drivers
+to bio.
+
+Drivers should use elv_next_request to pick up requests and are no longer
+supposed to handle looping directly over the request list.
+(struct request->queue has been removed)
+
+Now end_that_request_first takes an additional number_of_sectors argument.
+It used to handle always just the first buffer_head in a request, now
+it will loop and handle as many sectors (on a bio-segment granularity)
+as specified.
+
+Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the
+right thing to use is bio_endio(bio, uptodate) instead.
+
+If the driver is dropping the io_request_lock from its request_fn strategy,
+then it just needs to replace that with q->queue_lock instead.
+
+As described in Sec 1.1, drivers can set max sector size, max segment size
+etc per queue now. Drivers that used to define their own merge functions i
+to handle things like this can now just use the blk_queue_* functions at
+blk_init_queue time.
+
+Drivers no longer have to map a {partition, sector offset} into the
+correct absolute location anymore, this is done by the block layer, so
+where a driver received a request ala this before:
+
+ rq->rq_dev = mk_kdev(3, 5); /* /dev/hda5 */
+ rq->sector = 0; /* first sector on hda5 */
+
+ it will now see
+
+ rq->rq_dev = mk_kdev(3, 0); /* /dev/hda */
+ rq->sector = 123128; /* offset from start of disk */
+
+As mentioned, there is no virtual mapping of a bio. For DMA, this is
+not a problem as the driver probably never will need a virtual mapping.
+Instead it needs a bus mapping (pci_map_page for a single segment or
+use blk_rq_map_sg for scatter gather) to be able to ship it to the driver. For
+PIO drivers (or drivers that need to revert to PIO transfer once in a
+while (IDE for example)), where the CPU is doing the actual data
+transfer a virtual mapping is needed. If the driver supports highmem I/O,
+(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic and bio_kmap_irq to
+temporarily map a bio into the virtual address space.
+
+
+8. Prior/Related/Impacted patches
+
+8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp)
+- orig kiobuf & raw i/o patches (now in 2.4 tree)
+- direct kiobuf based i/o to devices (no intermediate bh's)
+- page i/o using kiobuf
+- kiobuf splitting for lvm (mkp)
+- elevator support for kiobuf request merging (axboe)
+8.2. Zero-copy networking (Dave Miller)
+8.3. SGI XFS - pagebuf patches - use of kiobufs
+8.4. Multi-page pioent patch for bio (Christoph Hellwig)
+8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11
+8.6. Async i/o implementation patch (Ben LaHaise)
+8.7. EVMS layering design (IBM EVMS team)
+8.8. Larger page cache size patch (Ben LaHaise) and
+ Large page size (Daniel Phillips)
+ => larger contiguous physical memory buffers
+8.9. VM reservations patch (Ben LaHaise)
+8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?)
+8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+
+8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar,
+ Badari)
+8.13 Priority based i/o scheduler - prepatches (Arjan van de Ven)
+8.14 IDE Taskfile i/o patch (Andre Hedrick)
+8.15 Multi-page writeout and readahead patches (Andrew Morton)
+8.16 Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy)
+
+9. Other References:
+
+9.1 The Splice I/O Model - Larry McVoy (and subsequent discussions on lkml,
+and Linus' comments - Jan 2001)
+9.2 Discussions about kiobuf and bh design on lkml between sct, linus, alan
+et al - Feb-March 2001 (many of the initial thoughts that led to bio were
+brought up in this discussion thread)
+9.3 Discussions on mempool on lkml - Dec 2001.
+
diff --git a/Documentation/block/capability.txt b/Documentation/block/capability.txt
new file mode 100644
index 0000000..2f17294
--- /dev/null
+++ b/Documentation/block/capability.txt
@@ -0,0 +1,15 @@
+Generic Block Device Capability
+===============================================================================
+This file documents the sysfs file block/<disk>/capability
+
+capability is a hex word indicating which capabilities a specific disk
+supports. For more information on bits not listed here, see
+include/linux/genhd.h
+
+Capability Value
+-------------------------------------------------------------------------------
+GENHD_FL_MEDIA_CHANGE_NOTIFY 4
+ When this bit is set, the disk supports Asynchronous Notification
+ of media change events. These events will be broadcast to user
+ space via kernel uevent.
+
diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt
new file mode 100644
index 0000000..e8ca040
--- /dev/null
+++ b/Documentation/block/data-integrity.txt
@@ -0,0 +1,327 @@
+----------------------------------------------------------------------
+1. INTRODUCTION
+
+Modern filesystems feature checksumming of data and metadata to
+protect against data corruption. However, the detection of the
+corruption is done at read time which could potentially be months
+after the data was written. At that point the original data that the
+application tried to write is most likely lost.
+
+The solution is to ensure that the disk is actually storing what the
+application meant it to. Recent additions to both the SCSI family
+protocols (SBC Data Integrity Field, SCC protection proposal) as well
+as SATA/T13 (External Path Protection) try to remedy this by adding
+support for appending integrity metadata to an I/O. The integrity
+metadata (or protection information in SCSI terminology) includes a
+checksum for each sector as well as an incrementing counter that
+ensures the individual sectors are written in the right order. And
+for some protection schemes also that the I/O is written to the right
+place on disk.
+
+Current storage controllers and devices implement various protective
+measures, for instance checksumming and scrubbing. But these
+technologies are working in their own isolated domains or at best
+between adjacent nodes in the I/O path. The interesting thing about
+DIF and the other integrity extensions is that the protection format
+is well defined and every node in the I/O path can verify the
+integrity of the I/O and reject it if corruption is detected. This
+allows not only corruption prevention but also isolation of the point
+of failure.
+
+----------------------------------------------------------------------
+2. THE DATA INTEGRITY EXTENSIONS
+
+As written, the protocol extensions only protect the path between
+controller and storage device. However, many controllers actually
+allow the operating system to interact with the integrity metadata
+(IMD). We have been working with several FC/SAS HBA vendors to enable
+the protection information to be transferred to and from their
+controllers.
+
+The SCSI Data Integrity Field works by appending 8 bytes of protection
+information to each sector. The data + integrity metadata is stored
+in 520 byte sectors on disk. Data + IMD are interleaved when
+transferred between the controller and target. The T13 proposal is
+similar.
+
+Because it is highly inconvenient for operating systems to deal with
+520 (and 4104) byte sectors, we approached several HBA vendors and
+encouraged them to allow separation of the data and integrity metadata
+scatter-gather lists.
+
+The controller will interleave the buffers on write and split them on
+read. This means that the Linux can DMA the data buffers to and from
+host memory without changes to the page cache.
+
+Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs
+is somewhat heavy to compute in software. Benchmarks found that
+calculating this checksum had a significant impact on system
+performance for a number of workloads. Some controllers allow a
+lighter-weight checksum to be used when interfacing with the operating
+system. Emulex, for instance, supports the TCP/IP checksum instead.
+The IP checksum received from the OS is converted to the 16-bit CRC
+when writing and vice versa. This allows the integrity metadata to be
+generated by Linux or the application at very low cost (comparable to
+software RAID5).
+
+The IP checksum is weaker than the CRC in terms of detecting bit
+errors. However, the strength is really in the separation of the data
+buffers and the integrity metadata. These two distinct buffers much
+match up for an I/O to complete.
+
+The separation of the data and integrity metadata buffers as well as
+the choice in checksums is referred to as the Data Integrity
+Extensions. As these extensions are outside the scope of the protocol
+bodies (T10, T13), Oracle and its partners are trying to standardize
+them within the Storage Networking Industry Association.
+
+----------------------------------------------------------------------
+3. KERNEL CHANGES
+
+The data integrity framework in Linux enables protection information
+to be pinned to I/Os and sent to/received from controllers that
+support it.
+
+The advantage to the integrity extensions in SCSI and SATA is that
+they enable us to protect the entire path from application to storage
+device. However, at the same time this is also the biggest
+disadvantage. It means that the protection information must be in a
+format that can be understood by the disk.
+
+Generally Linux/POSIX applications are agnostic to the intricacies of
+the storage devices they are accessing. The virtual filesystem switch
+and the block layer make things like hardware sector size and
+transport protocols completely transparent to the application.
+
+However, this level of detail is required when preparing the
+protection information to send to a disk. Consequently, the very
+concept of an end-to-end protection scheme is a layering violation.
+It is completely unreasonable for an application to be aware whether
+it is accessing a SCSI or SATA disk.
+
+The data integrity support implemented in Linux attempts to hide this
+from the application. As far as the application (and to some extent
+the kernel) is concerned, the integrity metadata is opaque information
+that's attached to the I/O.
+
+The current implementation allows the block layer to automatically
+generate the protection information for any I/O. Eventually the
+intent is to move the integrity metadata calculation to userspace for
+user data. Metadata and other I/O that originates within the kernel
+will still use the automatic generation interface.
+
+Some storage devices allow each hardware sector to be tagged with a
+16-bit value. The owner of this tag space is the owner of the block
+device. I.e. the filesystem in most cases. The filesystem can use
+this extra space to tag sectors as they see fit. Because the tag
+space is limited, the block interface allows tagging bigger chunks by
+way of interleaving. This way, 8*16 bits of information can be
+attached to a typical 4KB filesystem block.
+
+This also means that applications such as fsck and mkfs will need
+access to manipulate the tags from user space. A passthrough
+interface for this is being worked on.
+
+
+----------------------------------------------------------------------
+4. BLOCK LAYER IMPLEMENTATION DETAILS
+
+4.1 BIO
+
+The data integrity patches add a new field to struct bio when
+CONFIG_BLK_DEV_INTEGRITY is enabled. bio->bi_integrity is a pointer
+to a struct bip which contains the bio integrity payload. Essentially
+a bip is a trimmed down struct bio which holds a bio_vec containing
+the integrity metadata and the required housekeeping information (bvec
+pool, vector count, etc.)
+
+A kernel subsystem can enable data integrity protection on a bio by
+calling bio_integrity_alloc(bio). This will allocate and attach the
+bip to the bio.
+
+Individual pages containing integrity metadata can subsequently be
+attached using bio_integrity_add_page().
+
+bio_free() will automatically free the bip.
+
+
+4.2 BLOCK DEVICE
+
+Because the format of the protection data is tied to the physical
+disk, each block device has been extended with a block integrity
+profile (struct blk_integrity). This optional profile is registered
+with the block layer using blk_integrity_register().
+
+The profile contains callback functions for generating and verifying
+the protection data, as well as getting and setting application tags.
+The profile also contains a few constants to aid in completing,
+merging and splitting the integrity metadata.
+
+Layered block devices will need to pick a profile that's appropriate
+for all subdevices. blk_integrity_compare() can help with that. DM
+and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6
+will require extra work due to the application tag.
+
+
+----------------------------------------------------------------------
+5.0 BLOCK LAYER INTEGRITY API
+
+5.1 NORMAL FILESYSTEM
+
+ The normal filesystem is unaware that the underlying block device
+ is capable of sending/receiving integrity metadata. The IMD will
+ be automatically generated by the block layer at submit_bio() time
+ in case of a WRITE. A READ request will cause the I/O integrity
+ to be verified upon completion.
+
+ IMD generation and verification can be toggled using the
+
+ /sys/block/<bdev>/integrity/write_generate
+
+ and
+
+ /sys/block/<bdev>/integrity/read_verify
+
+ flags.
+
+
+5.2 INTEGRITY-AWARE FILESYSTEM
+
+ A filesystem that is integrity-aware can prepare I/Os with IMD
+ attached. It can also use the application tag space if this is
+ supported by the block device.
+
+
+ int bdev_integrity_enabled(block_device, int rw);
+
+ bdev_integrity_enabled() will return 1 if the block device
+ supports integrity metadata transfer for the data direction
+ specified in 'rw'.
+
+ bdev_integrity_enabled() honors the write_generate and
+ read_verify flags in sysfs and will respond accordingly.
+
+
+ int bio_integrity_prep(bio);
+
+ To generate IMD for WRITE and to set up buffers for READ, the
+ filesystem must call bio_integrity_prep(bio).
+
+ Prior to calling this function, the bio data direction and start
+ sector must be set, and the bio should have all data pages
+ added. It is up to the caller to ensure that the bio does not
+ change while I/O is in progress.
+
+ bio_integrity_prep() should only be called if
+ bio_integrity_enabled() returned 1.
+
+
+ int bio_integrity_tag_size(bio);
+
+ If the filesystem wants to use the application tag space it will
+ first have to find out how much storage space is available.
+ Because tag space is generally limited (usually 2 bytes per
+ sector regardless of sector size), the integrity framework
+ supports interleaving the information between the sectors in an
+ I/O.
+
+ Filesystems can call bio_integrity_tag_size(bio) to find out how
+ many bytes of storage are available for that particular bio.
+
+ Another option is bdev_get_tag_size(block_device) which will
+ return the number of available bytes per hardware sector.
+
+
+ int bio_integrity_set_tag(bio, void *tag_buf, len);
+
+ After a successful return from bio_integrity_prep(),
+ bio_integrity_set_tag() can be used to attach an opaque tag
+ buffer to a bio. Obviously this only makes sense if the I/O is
+ a WRITE.
+
+
+ int bio_integrity_get_tag(bio, void *tag_buf, len);
+
+ Similarly, at READ I/O completion time the filesystem can
+ retrieve the tag buffer using bio_integrity_get_tag().
+
+
+5.3 PASSING EXISTING INTEGRITY METADATA
+
+ Filesystems that either generate their own integrity metadata or
+ are capable of transferring IMD from user space can use the
+ following calls:
+
+
+ struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages);
+
+ Allocates the bio integrity payload and hangs it off of the bio.
+ nr_pages indicate how many pages of protection data need to be
+ stored in the integrity bio_vec list (similar to bio_alloc()).
+
+ The integrity payload will be freed at bio_free() time.
+
+
+ int bio_integrity_add_page(bio, page, len, offset);
+
+ Attaches a page containing integrity metadata to an existing
+ bio. The bio must have an existing bip,
+ i.e. bio_integrity_alloc() must have been called. For a WRITE,
+ the integrity metadata in the pages must be in a format
+ understood by the target device with the notable exception that
+ the sector numbers will be remapped as the request traverses the
+ I/O stack. This implies that the pages added using this call
+ will be modified during I/O! The first reference tag in the
+ integrity metadata must have a value of bip->bip_sector.
+
+ Pages can be added using bio_integrity_add_page() as long as
+ there is room in the bip bio_vec array (nr_pages).
+
+ Upon completion of a READ operation, the attached pages will
+ contain the integrity metadata received from the storage device.
+ It is up to the receiver to process them and verify data
+ integrity upon completion.
+
+
+5.4 REGISTERING A BLOCK DEVICE AS CAPABLE OF EXCHANGING INTEGRITY
+ METADATA
+
+ To enable integrity exchange on a block device the gendisk must be
+ registered as capable:
+
+ int blk_integrity_register(gendisk, blk_integrity);
+
+ The blk_integrity struct is a template and should contain the
+ following:
+
+ static struct blk_integrity my_profile = {
+ .name = "STANDARDSBODY-TYPE-VARIANT-CSUM",
+ .generate_fn = my_generate_fn,
+ .verify_fn = my_verify_fn,
+ .get_tag_fn = my_get_tag_fn,
+ .set_tag_fn = my_set_tag_fn,
+ .tuple_size = sizeof(struct my_tuple_size),
+ .tag_size = <tag bytes per hw sector>,
+ };
+
+ 'name' is a text string which will be visible in sysfs. This is
+ part of the userland API so chose it carefully and never change
+ it. The format is standards body-type-variant.
+ E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC.
+
+ 'generate_fn' generates appropriate integrity metadata (for WRITE).
+
+ 'verify_fn' verifies that the data buffer matches the integrity
+ metadata.
+
+ 'tuple_size' must be set to match the size of the integrity
+ metadata per sector. I.e. 8 for DIF and EPP.
+
+ 'tag_size' must be set to identify how many bytes of tag space
+ are available per hardware sector. For DIF this is either 2 or
+ 0 depending on the value of the Control Mode Page ATO bit.
+
+ See 6.2 for a description of get_tag_fn and set_tag_fn.
+
+----------------------------------------------------------------------
+2007-12-24 Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
new file mode 100644
index 0000000..7257676
--- /dev/null
+++ b/Documentation/block/deadline-iosched.txt
@@ -0,0 +1,75 @@
+Deadline IO scheduler tunables
+==============================
+
+This little file attempts to document how the deadline io scheduler works.
+In particular, it will clarify the meaning of the exposed tunables that may be
+of interest to power users.
+
+Selecting IO schedulers
+-----------------------
+Refer to Documentation/block/switching-sched.txt for information on
+selecting an io scheduler on a per-device basis.
+
+
+********************************************************************************
+
+
+read_expire (in ms)
+-----------
+
+The goal of the deadline io scheduler is to attempt to guarantee a start
+service time for a request. As we focus mainly on read latencies, this is
+tunable. When a read request first enters the io scheduler, it is assigned
+a deadline that is the current time + the read_expire value in units of
+milliseconds.
+
+
+write_expire (in ms)
+-----------
+
+Similar to read_expire mentioned above, but for writes.
+
+
+fifo_batch (number of requests)
+----------
+
+Requests are grouped into ``batches'' of a particular data direction (read or
+write) which are serviced in increasing sector order. To limit extra seeking,
+deadline expiries are only checked between batches. fifo_batch controls the
+maximum number of requests per batch.
+
+This parameter tunes the balance between per-request latency and aggregate
+throughput. When low latency is the primary concern, smaller is better (where
+a value of 1 yields first-come first-served behaviour). Increasing fifo_batch
+generally improves throughput, at the cost of latency variation.
+
+
+writes_starved (number of dispatches)
+--------------
+
+When we have to move requests from the io scheduler queue to the block
+device dispatch queue, we always give a preference to reads. However, we
+don't want to starve writes indefinitely either. So writes_starved controls
+how many times we give preference to reads over writes. When that has been
+done writes_starved number of times, we dispatch some writes based on the
+same criteria as reads.
+
+
+front_merges (bool)
+------------
+
+Sometimes it happens that a request enters the io scheduler that is contigious
+with a request that is already on the queue. Either it fits in the back of that
+request, or it fits at the front. That is called either a back merge candidate
+or a front merge candidate. Due to the way files are typically laid out,
+back merges are much more common than front merges. For some work loads, you
+may even know that it is a waste of time to spend any time attempting to
+front merge requests. Setting front_merges to 0 disables this functionality.
+Front merges may still occur due to the cached last_merge hint, but since
+that comes at basically 0 cost we leave that on. We simply disable the
+rbtree front sector lookup when the io scheduler merge function is called.
+
+
+Nov 11 2002, Jens Axboe <jens.axboe@oracle.com>
+
+
diff --git a/Documentation/block/ioprio.txt b/Documentation/block/ioprio.txt
new file mode 100644
index 0000000..8ed8c59
--- /dev/null
+++ b/Documentation/block/ioprio.txt
@@ -0,0 +1,183 @@
+Block io priorities
+===================
+
+
+Intro
+-----
+
+With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io
+priorities are supported for reads on files. This enables users to io nice
+processes or process groups, similar to what has been possible with cpu
+scheduling for ages. This document mainly details the current possibilities
+with cfq; other io schedulers do not support io priorities thus far.
+
+Scheduling classes
+------------------
+
+CFQ implements three generic scheduling classes that determine how io is
+served for a process.
+
+IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
+higher priority than any other in the system, processes from this class are
+given first access to the disk every time. Thus it needs to be used with some
+care, one io RT process can starve the entire system. Within the RT class,
+there are 8 levels of class data that determine exactly how much time this
+process needs the disk for on each service. In the future this might change
+to be more directly mappable to performance, by passing in a wanted data
+rate instead.
+
+IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default
+for any process that hasn't set a specific io priority. The class data
+determines how much io bandwidth the process will get, it's directly mappable
+to the cpu nice levels just more coarsely implemented. 0 is the highest
+BE prio level, 7 is the lowest. The mapping between cpu nice level and io
+nice level is determined as: io_nice = (cpu_nice + 20) / 5.
+
+IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this
+level only get io time when no one else needs the disk. The idle class has no
+class data, since it doesn't really apply here.
+
+Tools
+-----
+
+See below for a sample ionice tool. Usage:
+
+# ionice -c<class> -n<level> -p<pid>
+
+If pid isn't given, the current process is assumed. IO priority settings
+are inherited on fork, so you can use ionice to start the process at a given
+level:
+
+# ionice -c2 -n0 /bin/ls
+
+will run ls at the best-effort scheduling class at the highest priority.
+For a running process, you can give the pid instead:
+
+# ionice -c1 -n2 -p100
+
+will change pid 100 to run at the realtime scheduling class, at priority 2.
+
+---> snip ionice.c tool <---
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <asm/unistd.h>
+
+extern int sys_ioprio_set(int, int, int);
+extern int sys_ioprio_get(int, int);
+
+#if defined(__i386__)
+#define __NR_ioprio_set 289
+#define __NR_ioprio_get 290
+#elif defined(__ppc__)
+#define __NR_ioprio_set 273
+#define __NR_ioprio_get 274
+#elif defined(__x86_64__)
+#define __NR_ioprio_set 251
+#define __NR_ioprio_get 252
+#elif defined(__ia64__)
+#define __NR_ioprio_set 1274
+#define __NR_ioprio_get 1275
+#else
+#error "Unsupported arch"
+#endif
+
+static inline int ioprio_set(int which, int who, int ioprio)
+{
+ return syscall(__NR_ioprio_set, which, who, ioprio);
+}
+
+static inline int ioprio_get(int which, int who)
+{
+ return syscall(__NR_ioprio_get, which, who);
+}
+
+enum {
+ IOPRIO_CLASS_NONE,
+ IOPRIO_CLASS_RT,
+ IOPRIO_CLASS_BE,
+ IOPRIO_CLASS_IDLE,
+};
+
+enum {
+ IOPRIO_WHO_PROCESS = 1,
+ IOPRIO_WHO_PGRP,
+ IOPRIO_WHO_USER,
+};
+
+#define IOPRIO_CLASS_SHIFT 13
+
+const char *to_prio[] = { "none", "realtime", "best-effort", "idle", };
+
+int main(int argc, char *argv[])
+{
+ int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE;
+ int c, pid = 0;
+
+ while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) {
+ switch (c) {
+ case 'n':
+ ioprio = strtol(optarg, NULL, 10);
+ set = 1;
+ break;
+ case 'c':
+ ioprio_class = strtol(optarg, NULL, 10);
+ set = 1;
+ break;
+ case 'p':
+ pid = strtol(optarg, NULL, 10);
+ break;
+ }
+ }
+
+ switch (ioprio_class) {
+ case IOPRIO_CLASS_NONE:
+ ioprio_class = IOPRIO_CLASS_BE;
+ break;
+ case IOPRIO_CLASS_RT:
+ case IOPRIO_CLASS_BE:
+ break;
+ case IOPRIO_CLASS_IDLE:
+ ioprio = 7;
+ break;
+ default:
+ printf("bad prio class %d\n", ioprio_class);
+ return 1;
+ }
+
+ if (!set) {
+ if (!pid && argv[optind])
+ pid = strtol(argv[optind], NULL, 10);
+
+ ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid);
+
+ printf("pid=%d, %d\n", pid, ioprio);
+
+ if (ioprio == -1)
+ perror("ioprio_get");
+ else {
+ ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT;
+ ioprio = ioprio & 0xff;
+ printf("%s: prio %d\n", to_prio[ioprio_class], ioprio);
+ }
+ } else {
+ if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) {
+ perror("ioprio_set");
+ return 1;
+ }
+
+ if (argv[optind])
+ execvp(argv[optind], &argv[optind]);
+ }
+
+ return 0;
+}
+
+---> snip ionice.c tool <---
+
+
+March 11 2005, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/request.txt b/Documentation/block/request.txt
new file mode 100644
index 0000000..754e104
--- /dev/null
+++ b/Documentation/block/request.txt
@@ -0,0 +1,88 @@
+
+struct request documentation
+
+Jens Axboe <jens.axboe@oracle.com> 27/05/02
+
+1.0
+Index
+
+2.0 Struct request members classification
+
+ 2.1 struct request members explanation
+
+3.0
+
+
+2.0
+Short explanation of request members
+
+Classification flags:
+
+ D driver member
+ B block layer member
+ I I/O scheduler member
+
+Unless an entry contains a D classification, a device driver must not access
+this member. Some members may contain D classifications, but should only be
+access through certain macros or functions (eg ->flags).
+
+<linux/blkdev.h>
+
+2.1
+Member Flag Comment
+------ ---- -------
+
+struct list_head queuelist BI Organization on various internal
+ queues
+
+void *elevator_private I I/O scheduler private data
+
+unsigned char cmd[16] D Driver can use this for setting up
+ a cdb before execution, see
+ blk_queue_prep_rq
+
+unsigned long flags DBI Contains info about data direction,
+ request type, etc.
+
+int rq_status D Request status bits
+
+kdev_t rq_dev DBI Target device
+
+int errors DB Error counts
+
+sector_t sector DBI Target location
+
+unsigned long hard_nr_sectors B Used to keep sector sane
+
+unsigned long nr_sectors DBI Total number of sectors in request
+
+unsigned long hard_nr_sectors B Used to keep nr_sectors sane
+
+unsigned short nr_phys_segments DB Number of physical scatter gather
+ segments in a request
+
+unsigned short nr_hw_segments DB Number of hardware scatter gather
+ segments in a request
+
+unsigned int current_nr_sectors DB Number of sectors in first segment
+ of request
+
+unsigned int hard_cur_sectors B Used to keep current_nr_sectors sane
+
+int tag DB TCQ tag, if assigned
+
+void *special D Free to be used by driver
+
+char *buffer D Map of first segment, also see
+ section on bouncing SECTION
+
+struct completion *waiting D Can be used by driver to get signalled
+ on request completion
+
+struct bio *bio DBI First bio in request
+
+struct bio *biotail DBI Last bio in request
+
+struct request_queue *q DB Request queue this request belongs to
+
+struct request_list *rl B Request list this request came from
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt
new file mode 100644
index 0000000..0dbc946
--- /dev/null
+++ b/Documentation/block/stat.txt
@@ -0,0 +1,82 @@
+Block layer statistics in /sys/block/<dev>/stat
+===============================================
+
+This file documents the contents of the /sys/block/<dev>/stat file.
+
+The stat file provides several statistics about the state of block
+device <dev>.
+
+Q. Why are there multiple statistics in a single file? Doesn't sysfs
+ normally contain a single value per file?
+A. By having a single file, the kernel can guarantee that the statistics
+ represent a consistent snapshot of the state of the device. If the
+ statistics were exported as multiple files containing one statistic
+ each, it would be impossible to guarantee that a set of readings
+ represent a single point in time.
+
+The stat file consists of a single line of text containing 11 decimal
+values separated by whitespace. The fields are summarized in the
+following table, and described in more detail below.
+
+Name units description
+---- ----- -----------
+read I/Os requests number of read I/Os processed
+read merges requests number of read I/Os merged with in-queue I/O
+read sectors sectors number of sectors read
+read ticks milliseconds total wait time for read requests
+write I/Os requests number of write I/Os processed
+write merges requests number of write I/Os merged with in-queue I/O
+write sectors sectors number of sectors written
+write ticks milliseconds total wait time for write requests
+in_flight requests number of I/Os currently in flight
+io_ticks milliseconds total time this block device has been active
+time_in_queue milliseconds total wait time for all requests
+
+read I/Os, write I/Os
+=====================
+
+These values increment when an I/O request completes.
+
+read merges, write merges
+=========================
+
+These values increment when an I/O request is merged with an
+already-queued I/O request.
+
+read sectors, write sectors
+===========================
+
+These values count the number of sectors read from or written to this
+block device. The "sectors" in question are the standard UNIX 512-byte
+sectors, not any device- or filesystem-specific block size. The
+counters are incremented when the I/O completes.
+
+read ticks, write ticks
+=======================
+
+These values count the number of milliseconds that I/O requests have
+waited on this block device. If there are multiple I/O requests waiting,
+these values will increase at a rate greater than 1000/second; for
+example, if 60 read requests wait for an average of 30 ms, the read_ticks
+field will increase by 60*30 = 1800.
+
+in_flight
+=========
+
+This value counts the number of I/O requests that have been issued to
+the device driver but have not yet completed. It does not include I/O
+requests that are in the queue but not yet issued to the device driver.
+
+io_ticks
+========
+
+This value counts the number of milliseconds during which the device has
+had I/O requests queued.
+
+time_in_queue
+=============
+
+This value counts the number of milliseconds that I/O requests have waited
+on this block device. If there are multiple I/O requests waiting, this
+value will increase as the product of the number of milliseconds times the
+number of requests waiting (see "read ticks" above for an example).
diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt
new file mode 100644
index 0000000..634c952
--- /dev/null
+++ b/Documentation/block/switching-sched.txt
@@ -0,0 +1,43 @@
+To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
+'noop', 'as' and 'cfq' (the default) are also available. IO schedulers are
+assigned globally at boot time only presently.
+
+Each io queue has a set of io scheduler tunables associated with it. These
+tunables control how the io scheduler works. You can find these entries
+in:
+
+/sys/block/<device>/queue/iosched
+
+assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
+you can do so by typing:
+
+# mount none /sys -t sysfs
+
+As of the Linux 2.6.10 kernel, it is now possible to change the
+IO scheduler for a given block device on the fly (thus making it possible,
+for instance, to set the CFQ scheduler for the system default, but
+set a specific device to use the anticipatory or noop schedulers - which
+can improve that device's throughput).
+
+To set a specific scheduler, simply do this:
+
+echo SCHEDNAME > /sys/block/DEV/queue/scheduler
+
+where SCHEDNAME is the name of a defined IO scheduler, and DEV is the
+device name (hda, hdb, sga, or whatever you happen to have).
+
+The list of defined schedulers can be found by simply doing
+a "cat /sys/block/DEV/queue/scheduler" - the list of valid names
+will be displayed, with the currently selected scheduler in brackets:
+
+# cat /sys/block/hda/queue/scheduler
+noop anticipatory deadline [cfq]
+# echo anticipatory > /sys/block/hda/queue/scheduler
+# cat /sys/block/hda/queue/scheduler
+noop [anticipatory] deadline cfq
+
+Each io queue has a set of io scheduler tunables associated with it. These
+tunables control how the io scheduler works. You can find these entries
+in:
+
+/sys/block/<device>/queue/iosched
diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX
new file mode 100644
index 0000000..86f054c
--- /dev/null
+++ b/Documentation/blockdev/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+ - this file
+README.DAC960
+ - info on Mylex DAC960/DAC1100 PCI RAID Controller Driver for Linux.
+cciss.txt
+ - info, major/minor #'s for Compaq's SMART Array Controllers.
+cpqarray.txt
+ - info on using Compaq's SMART2 Intelligent Disk Array Controllers.
+floppy.txt
+ - notes and driver options for the floppy disk driver.
+nbd.txt
+ - info on a TCP implementation of a network block device.
+paride.txt
+ - information about the parallel port IDE subsystem.
+ramdisk.txt
+ - short guide on how to set up and use the RAM disk.
diff --git a/Documentation/blockdev/README.DAC960 b/Documentation/blockdev/README.DAC960
new file mode 100644
index 0000000..0e8f618
--- /dev/null
+++ b/Documentation/blockdev/README.DAC960
@@ -0,0 +1,756 @@
+ Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers
+
+ Version 2.2.11 for Linux 2.2.19
+ Version 2.4.11 for Linux 2.4.12
+
+ PRODUCTION RELEASE
+
+ 11 October 2001
+
+ Leonard N. Zubkoff
+ Dandelion Digital
+ lnz@dandelion.com
+
+ Copyright 1998-2001 by Leonard N. Zubkoff <lnz@dandelion.com>
+
+
+ INTRODUCTION
+
+Mylex, Inc. designs and manufactures a variety of high performance PCI RAID
+controllers. Mylex Corporation is located at 34551 Ardenwood Blvd., Fremont,
+California 94555, USA and can be reached at 510.796.6100 or on the World Wide
+Web at http://www.mylex.com. Mylex Technical Support can be reached by
+electronic mail at mylexsup@us.ibm.com, by voice at 510.608.2400, or by FAX at
+510.745.7715. Contact information for offices in Europe and Japan is available
+on their Web site.
+
+The latest information on Linux support for DAC960 PCI RAID Controllers, as
+well as the most recent release of this driver, will always be available from
+my Linux Home Page at URL "http://www.dandelion.com/Linux/". The Linux DAC960
+driver supports all current Mylex PCI RAID controllers including the new
+eXtremeRAID 2000/3000 and AcceleRAID 352/170/160 models which have an entirely
+new firmware interface from the older eXtremeRAID 1100, AcceleRAID 150/200/250,
+and DAC960PJ/PG/PU/PD/PL. See below for a complete controller list as well as
+minimum firmware version requirements. For simplicity, in most places this
+documentation refers to DAC960 generically rather than explicitly listing all
+the supported models.
+
+Driver bug reports should be sent via electronic mail to "lnz@dandelion.com".
+Please include with the bug report the complete configuration messages reported
+by the driver at startup, along with any subsequent system messages relevant to
+the controller's operation, and a detailed description of your system's
+hardware configuration. Driver bugs are actually quite rare; if you encounter
+problems with disks being marked offline, for example, please contact Mylex
+Technical Support as the problem is related to the hardware configuration
+rather than the Linux driver.
+
+Please consult the RAID controller documentation for detailed information
+regarding installation and configuration of the controllers. This document
+primarily provides information specific to the Linux support.
+
+
+ DRIVER FEATURES
+
+The DAC960 RAID controllers are supported solely as high performance RAID
+controllers, not as interfaces to arbitrary SCSI devices. The Linux DAC960
+driver operates at the block device level, the same level as the SCSI and IDE
+drivers. Unlike other RAID controllers currently supported on Linux, the
+DAC960 driver is not dependent on the SCSI subsystem, and hence avoids all the
+complexity and unnecessary code that would be associated with an implementation
+as a SCSI driver. The DAC960 driver is designed for as high a performance as
+possible with no compromises or extra code for compatibility with lower
+performance devices. The DAC960 driver includes extensive error logging and
+online configuration management capabilities. Except for initial configuration
+of the controller and adding new disk drives, most everything can be handled
+from Linux while the system is operational.
+
+The DAC960 driver is architected to support up to 8 controllers per system.
+Each DAC960 parallel SCSI controller can support up to 15 disk drives per
+channel, for a maximum of 60 drives on a four channel controller; the fibre
+channel eXtremeRAID 3000 controller supports up to 125 disk drives per loop for
+a total of 250 drives. The drives installed on a controller are divided into
+one or more "Drive Groups", and then each Drive Group is subdivided further
+into 1 to 32 "Logical Drives". Each Logical Drive has a specific RAID Level
+and caching policy associated with it, and it appears to Linux as a single
+block device. Logical Drives are further subdivided into up to 7 partitions
+through the normal Linux and PC disk partitioning schemes. Logical Drives are
+also known as "System Drives", and Drive Groups are also called "Packs". Both
+terms are in use in the Mylex documentation; I have chosen to standardize on
+the more generic "Logical Drive" and "Drive Group".
+
+DAC960 RAID disk devices are named in the style of the obsolete Device File
+System (DEVFS). The device corresponding to Logical Drive D on Controller C
+is referred to as /dev/rd/cCdD, and the partitions are called /dev/rd/cCdDp1
+through /dev/rd/cCdDp7. For example, partition 3 of Logical Drive 5 on
+Controller 2 is referred to as /dev/rd/c2d5p3. Note that unlike with SCSI
+disks the device names will not change in the event of a disk drive failure.
+The DAC960 driver is assigned major numbers 48 - 55 with one major number per
+controller. The 8 bits of minor number are divided into 5 bits for the Logical
+Drive and 3 bits for the partition.
+
+
+ SUPPORTED DAC960/AcceleRAID/eXtremeRAID PCI RAID CONTROLLERS
+
+The following list comprises the supported DAC960, AcceleRAID, and eXtremeRAID
+PCI RAID Controllers as of the date of this document. It is recommended that
+anyone purchasing a Mylex PCI RAID Controller not in the following table
+contact the author beforehand to verify that it is or will be supported.
+
+eXtremeRAID 3000
+ 1 Wide Ultra-2/LVD SCSI channel
+ 2 External Fibre FC-AL channels
+ 233MHz StrongARM SA 110 Processor
+ 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots)
+ 32MB/64MB ECC SDRAM Memory
+
+eXtremeRAID 2000
+ 4 Wide Ultra-160 LVD SCSI channels
+ 233MHz StrongARM SA 110 Processor
+ 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots)
+ 32MB/64MB ECC SDRAM Memory
+
+AcceleRAID 352
+ 2 Wide Ultra-160 LVD SCSI channels
+ 100MHz Intel i960RN RISC Processor
+ 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots)
+ 32MB/64MB ECC SDRAM Memory
+
+AcceleRAID 170
+ 1 Wide Ultra-160 LVD SCSI channel
+ 100MHz Intel i960RM RISC Processor
+ 16MB/32MB/64MB ECC SDRAM Memory
+
+AcceleRAID 160 (AcceleRAID 170LP)
+ 1 Wide Ultra-160 LVD SCSI channel
+ 100MHz Intel i960RS RISC Processor
+ Built in 16M ECC SDRAM Memory
+ PCI Low Profile Form Factor - fit for 2U height
+
+eXtremeRAID 1100 (DAC1164P)
+ 3 Wide Ultra-2/LVD SCSI channels
+ 233MHz StrongARM SA 110 Processor
+ 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots)
+ 16MB/32MB/64MB Parity SDRAM Memory with Battery Backup
+
+AcceleRAID 250 (DAC960PTL1)
+ Uses onboard Symbios SCSI chips on certain motherboards
+ Also includes one onboard Wide Ultra-2/LVD SCSI Channel
+ 66MHz Intel i960RD RISC Processor
+ 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory
+
+AcceleRAID 200 (DAC960PTL0)
+ Uses onboard Symbios SCSI chips on certain motherboards
+ Includes no onboard SCSI Channels
+ 66MHz Intel i960RD RISC Processor
+ 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory
+
+AcceleRAID 150 (DAC960PRL)
+ Uses onboard Symbios SCSI chips on certain motherboards
+ Also includes one onboard Wide Ultra-2/LVD SCSI Channel
+ 33MHz Intel i960RP RISC Processor
+ 4MB Parity EDO Memory
+
+DAC960PJ 1/2/3 Wide Ultra SCSI-3 Channels
+ 66MHz Intel i960RD RISC Processor
+ 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory
+
+DAC960PG 1/2/3 Wide Ultra SCSI-3 Channels
+ 33MHz Intel i960RP RISC Processor
+ 4MB/8MB ECC EDO Memory
+
+DAC960PU 1/2/3 Wide Ultra SCSI-3 Channels
+ Intel i960CF RISC Processor
+ 4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory
+
+DAC960PD 1/2/3 Wide Fast SCSI-2 Channels
+ Intel i960CF RISC Processor
+ 4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory
+
+DAC960PL 1/2/3 Wide Fast SCSI-2 Channels
+ Intel i960 RISC Processor
+ 2MB/4MB/8MB/16MB/32MB DRAM Memory
+
+DAC960P 1/2/3 Wide Fast SCSI-2 Channels
+ Intel i960 RISC Processor
+ 2MB/4MB/8MB/16MB/32MB DRAM Memory
+
+For the eXtremeRAID 2000/3000 and AcceleRAID 352/170/160, firmware version
+6.00-01 or above is required.
+
+For the eXtremeRAID 1100, firmware version 5.06-0-52 or above is required.
+
+For the AcceleRAID 250, 200, and 150, firmware version 4.06-0-57 or above is
+required.
+
+For the DAC960PJ and DAC960PG, firmware version 4.06-0-00 or above is required.
+
+For the DAC960PU, DAC960PD, DAC960PL, and DAC960P, either firmware version
+3.51-0-04 or above is required (for dual Flash ROM controllers), or firmware
+version 2.73-0-00 or above is required (for single Flash ROM controllers)
+
+Please note that not all SCSI disk drives are suitable for use with DAC960
+controllers, and only particular firmware versions of any given model may
+actually function correctly. Similarly, not all motherboards have a BIOS that
+properly initializes the AcceleRAID 250, AcceleRAID 200, AcceleRAID 150,
+DAC960PJ, and DAC960PG because the Intel i960RD/RP is a multi-function device.
+If in doubt, contact Mylex RAID Technical Support (mylexsup@us.ibm.com) to
+verify compatibility. Mylex makes available a hard disk compatibility list at
+http://www.mylex.com/support/hdcomp/hd-lists.html.
+
+
+ DRIVER INSTALLATION
+
+This distribution was prepared for Linux kernel version 2.2.19 or 2.4.12.
+
+To install the DAC960 RAID driver, you may use the following commands,
+replacing "/usr/src" with wherever you keep your Linux kernel source tree:
+
+ cd /usr/src
+ tar -xvzf DAC960-2.2.11.tar.gz (or DAC960-2.4.11.tar.gz)
+ mv README.DAC960 linux/Documentation
+ mv DAC960.[ch] linux/drivers/block
+ patch -p0 < DAC960.patch (if DAC960.patch is included)
+ cd linux
+ make config
+ make bzImage (or zImage)
+
+Then install "arch/i386/boot/bzImage" or "arch/i386/boot/zImage" as your
+standard kernel, run lilo if appropriate, and reboot.
+
+To create the necessary devices in /dev, the "make_rd" script included in
+"DAC960-Utilities.tar.gz" from http://www.dandelion.com/Linux/ may be used.
+LILO 21 and FDISK v2.9 include DAC960 support; also included in this archive
+are patches to LILO 20 and FDISK v2.8 that add DAC960 support, along with
+statically linked executables of LILO and FDISK. This modified version of LILO
+will allow booting from a DAC960 controller and/or mounting the root file
+system from a DAC960.
+
+Red Hat Linux 6.0 and SuSE Linux 6.1 include support for Mylex PCI RAID
+controllers. Installing directly onto a DAC960 may be problematic from other
+Linux distributions until their installation utilities are updated.
+
+
+ INSTALLATION NOTES
+
+Before installing Linux or adding DAC960 logical drives to an existing Linux
+system, the controller must first be configured to provide one or more logical
+drives using the BIOS Configuration Utility or DACCF. Please note that since
+there are only at most 6 usable partitions on each logical drive, systems
+requiring more partitions should subdivide a drive group into multiple logical
+drives, each of which can have up to 6 usable partitions. Also, note that with
+large disk arrays it is advisable to enable the 8GB BIOS Geometry (255/63)
+rather than accepting the default 2GB BIOS Geometry (128/32); failing to so do
+will cause the logical drive geometry to have more than 65535 cylinders which
+will make it impossible for FDISK to be used properly. The 8GB BIOS Geometry
+can be enabled by configuring the DAC960 BIOS, which is accessible via Alt-M
+during the BIOS initialization sequence.
+
+For maximum performance and the most efficient E2FSCK performance, it is
+recommended that EXT2 file systems be built with a 4KB block size and 16 block
+stride to match the DAC960 controller's 64KB default stripe size. The command
+"mke2fs -b 4096 -R stride=16 <device>" is appropriate. Unless there will be a
+large number of small files on the file systems, it is also beneficial to add
+the "-i 16384" option to increase the bytes per inode parameter thereby
+reducing the file system metadata. Finally, on systems that will only be run
+with Linux 2.2 or later kernels it is beneficial to enable sparse superblocks
+with the "-s 1" option.
+
+
+ DAC960 ANNOUNCEMENTS MAILING LIST
+
+The DAC960 Announcements Mailing List provides a forum for informing Linux
+users of new driver releases and other announcements regarding Linux support
+for DAC960 PCI RAID Controllers. To join the mailing list, send a message to
+"dac960-announce-request@dandelion.com" with the line "subscribe" in the
+message body.
+
+
+ CONTROLLER CONFIGURATION AND STATUS MONITORING
+
+The DAC960 RAID controllers running firmware 4.06 or above include a Background
+Initialization facility so that system downtime is minimized both for initial
+installation and subsequent configuration of additional storage. The BIOS
+Configuration Utility (accessible via Alt-R during the BIOS initialization
+sequence) is used to quickly configure the controller, and then the logical
+drives that have been created are available for immediate use even while they
+are still being initialized by the controller. The primary need for online
+configuration and status monitoring is then to avoid system downtime when disk
+drives fail and must be replaced. Mylex's online monitoring and configuration
+utilities are being ported to Linux and will become available at some point in
+the future. Note that with a SAF-TE (SCSI Accessed Fault-Tolerant Enclosure)
+enclosure, the controller is able to rebuild failed drives automatically as
+soon as a drive replacement is made available.
+
+The primary interfaces for controller configuration and status monitoring are
+special files created in the /proc/rd/... hierarchy along with the normal
+system console logging mechanism. Whenever the system is operating, the DAC960
+driver queries each controller for status information every 10 seconds, and
+checks for additional conditions every 60 seconds. The initial status of each
+controller is always available for controller N in /proc/rd/cN/initial_status,
+and the current status as of the last status monitoring query is available in
+/proc/rd/cN/current_status. In addition, status changes are also logged by the
+driver to the system console and will appear in the log files maintained by
+syslog. The progress of asynchronous rebuild or consistency check operations
+is also available in /proc/rd/cN/current_status, and progress messages are
+logged to the system console at most every 60 seconds.
+
+Starting with the 2.2.3/2.0.3 versions of the driver, the status information
+available in /proc/rd/cN/initial_status and /proc/rd/cN/current_status has been
+augmented to include the vendor, model, revision, and serial number (if
+available) for each physical device found connected to the controller:
+
+***** DAC960 RAID Driver Version 2.2.3 of 19 August 1999 *****
+Copyright 1998-1999 by Leonard N. Zubkoff <lnz@dandelion.com>
+Configuring Mylex DAC960PRL PCI RAID Controller
+ Firmware Version: 4.07-0-07, Channels: 1, Memory Size: 16MB
+ PCI Bus: 1, Device: 4, Function: 1, I/O Address: Unassigned
+ PCI Address: 0xFE300000 mapped at 0xA0800000, IRQ Channel: 21
+ Controller Queue Depth: 128, Maximum Blocks per Command: 128
+ Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33
+ Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63
+ SAF-TE Enclosure Management Enabled
+ Physical Devices:
+ 0:0 Vendor: IBM Model: DRVS09D Revision: 0270
+ Serial Number: 68016775HA
+ Disk Status: Online, 17928192 blocks
+ 0:1 Vendor: IBM Model: DRVS09D Revision: 0270
+ Serial Number: 68004E53HA
+ Disk Status: Online, 17928192 blocks
+ 0:2 Vendor: IBM Model: DRVS09D Revision: 0270
+ Serial Number: 13013935HA
+ Disk Status: Online, 17928192 blocks
+ 0:3 Vendor: IBM Model: DRVS09D Revision: 0270
+ Serial Number: 13016897HA
+ Disk Status: Online, 17928192 blocks
+ 0:4 Vendor: IBM Model: DRVS09D Revision: 0270
+ Serial Number: 68019905HA
+ Disk Status: Online, 17928192 blocks
+ 0:5 Vendor: IBM Model: DRVS09D Revision: 0270
+ Serial Number: 68012753HA
+ Disk Status: Online, 17928192 blocks
+ 0:6 Vendor: ESG-SHV Model: SCA HSBP M6 Revision: 0.61
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Online, 89640960 blocks, Write Thru
+ No Rebuild or Consistency Check in Progress
+
+To simplify the monitoring process for custom software, the special file
+/proc/rd/status returns "OK" when all DAC960 controllers in the system are
+operating normally and no failures have occurred, or "ALERT" if any logical
+drives are offline or critical or any non-standby physical drives are dead.
+
+Configuration commands for controller N are available via the special file
+/proc/rd/cN/user_command. A human readable command can be written to this
+special file to initiate a configuration operation, and the results of the
+operation can then be read back from the special file in addition to being
+logged to the system console. The shell command sequence
+
+ echo "<configuration-command>" > /proc/rd/c0/user_command
+ cat /proc/rd/c0/user_command
+
+is typically used to execute configuration commands. The configuration
+commands are:
+
+ flush-cache
+
+ The "flush-cache" command flushes the controller's cache. The system
+ automatically flushes the cache at shutdown or if the driver module is
+ unloaded, so this command is only needed to be certain a write back cache
+ is flushed to disk before the system is powered off by a command to a UPS.
+ Note that the flush-cache command also stops an asynchronous rebuild or
+ consistency check, so it should not be used except when the system is being
+ halted.
+
+ kill <channel>:<target-id>
+
+ The "kill" command marks the physical drive <channel>:<target-id> as DEAD.
+ This command is provided primarily for testing, and should not be used
+ during normal system operation.
+
+ make-online <channel>:<target-id>
+
+ The "make-online" command changes the physical drive <channel>:<target-id>
+ from status DEAD to status ONLINE. In cases where multiple physical drives
+ have been killed simultaneously, this command may be used to bring all but
+ one of them back online, after which a rebuild to the final drive is
+ necessary.
+
+ Warning: make-online should only be used on a dead physical drive that is
+ an active part of a drive group, never on a standby drive. The command
+ should never be used on a dead drive that is part of a critical logical
+ drive; rebuild should be used if only a single drive is dead.
+
+ make-standby <channel>:<target-id>
+
+ The "make-standby" command changes physical drive <channel>:<target-id>
+ from status DEAD to status STANDBY. It should only be used in cases where
+ a dead drive was replaced after an automatic rebuild was performed onto a
+ standby drive. It cannot be used to add a standby drive to the controller
+ configuration if one was not created initially; the BIOS Configuration
+ Utility must be used for that currently.
+
+ rebuild <channel>:<target-id>
+
+ The "rebuild" command initiates an asynchronous rebuild onto physical drive
+ <channel>:<target-id>. It should only be used when a dead drive has been
+ replaced.
+
+ check-consistency <logical-drive-number>
+
+ The "check-consistency" command initiates an asynchronous consistency check
+ of <logical-drive-number> with automatic restoration. It can be used
+ whenever it is desired to verify the consistency of the redundancy
+ information.
+
+ cancel-rebuild
+ cancel-consistency-check
+
+ The "cancel-rebuild" and "cancel-consistency-check" commands cancel any
+ rebuild or consistency check operations previously initiated.
+
+
+ EXAMPLE I - DRIVE FAILURE WITHOUT A STANDBY DRIVE
+
+The following annotated logs demonstrate the controller configuration and and
+online status monitoring capabilities of the Linux DAC960 Driver. The test
+configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a
+DAC960PJ controller. The physical drives are configured into a single drive
+group without a standby drive, and the drive group has been configured into two
+logical drives, one RAID-5 and one RAID-6. Note that these logs are from an
+earlier version of the driver and the messages have changed somewhat with newer
+releases, but the functionality remains similar. First, here is the current
+status of the RAID configuration:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 *****
+Copyright 1998-1999 by Leonard N. Zubkoff <lnz@dandelion.com>
+Configuring Mylex DAC960PJ PCI RAID Controller
+ Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB
+ PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned
+ PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9
+ Controller Queue Depth: 128, Maximum Blocks per Command: 128
+ Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33
+ Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Online, 2201600 blocks
+ 1:2 - Disk: Online, 2201600 blocks
+ 1:3 - Disk: Online, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru
+ No Rebuild or Consistency Check in Progress
+
+gwynedd:/u/lnz# cat /proc/rd/status
+OK
+
+The above messages indicate that everything is healthy, and /proc/rd/status
+returns "OK" indicating that there are no problems with any DAC960 controller
+in the system. For demonstration purposes, while I/O is active Physical Drive
+1:1 is now disconnected, simulating a drive failure. The failure is noted by
+the driver within 10 seconds of the controller's having detected it, and the
+driver logs the following console status messages indicating that Logical
+Drives 0 and 1 are now CRITICAL as a result of Physical Drive 1:1 being DEAD:
+
+DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02
+DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02
+DAC960#0: Physical Drive 1:1 killed because of timeout on SCSI command
+DAC960#0: Physical Drive 1:1 is now DEAD
+DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL
+DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL
+
+The Sense Keys logged here are just Check Condition / Unit Attention conditions
+arising from a SCSI bus reset that is forced by the controller during its error
+recovery procedures. Concurrently with the above, the driver status available
+from /proc/rd also reflects the drive failure. The status message in
+/proc/rd/status has changed from "OK" to "ALERT":
+
+gwynedd:/u/lnz# cat /proc/rd/status
+ALERT
+
+and /proc/rd/c0/current_status has been updated:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+ ...
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Dead, 2201600 blocks
+ 1:2 - Disk: Online, 2201600 blocks
+ 1:3 - Disk: Online, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru
+ No Rebuild or Consistency Check in Progress
+
+Since there are no standby drives configured, the system can continue to access
+the logical drives in a performance degraded mode until the failed drive is
+replaced and a rebuild operation completed to restore the redundancy of the
+logical drives. Once Physical Drive 1:1 is replaced with a properly
+functioning drive, or if the physical drive was killed without having failed
+(e.g., due to electrical problems on the SCSI bus), the user can instruct the
+controller to initiate a rebuild operation onto the newly replaced drive:
+
+gwynedd:/u/lnz# echo "rebuild 1:1" > /proc/rd/c0/user_command
+gwynedd:/u/lnz# cat /proc/rd/c0/user_command
+Rebuild of Physical Drive 1:1 Initiated
+
+The echo command instructs the controller to initiate an asynchronous rebuild
+operation onto Physical Drive 1:1, and the status message that results from the
+operation is then available for reading from /proc/rd/c0/user_command, as well
+as being logged to the console by the driver.
+
+Within 10 seconds of this command the driver logs the initiation of the
+asynchronous rebuild operation:
+
+DAC960#0: Rebuild of Physical Drive 1:1 Initiated
+DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01
+DAC960#0: Physical Drive 1:1 is now WRITE-ONLY
+DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 1% completed
+
+and /proc/rd/c0/current_status is updated:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+ ...
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Write-Only, 2201600 blocks
+ 1:2 - Disk: Online, 2201600 blocks
+ 1:3 - Disk: Online, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru
+ Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 6% completed
+
+As the rebuild progresses, the current status in /proc/rd/c0/current_status is
+updated every 10 seconds:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+ ...
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Write-Only, 2201600 blocks
+ 1:2 - Disk: Online, 2201600 blocks
+ 1:3 - Disk: Online, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru
+ Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 15% completed
+
+and every minute a progress message is logged to the console by the driver:
+
+DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 32% completed
+DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 63% completed
+DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 94% completed
+DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 94% completed
+
+Finally, the rebuild completes successfully. The driver logs the status of the
+logical and physical drives and the rebuild completion:
+
+DAC960#0: Rebuild Completed Successfully
+DAC960#0: Physical Drive 1:1 is now ONLINE
+DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE
+DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE
+
+/proc/rd/c0/current_status is updated:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+ ...
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Online, 2201600 blocks
+ 1:2 - Disk: Online, 2201600 blocks
+ 1:3 - Disk: Online, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru
+ Rebuild Completed Successfully
+
+and /proc/rd/status indicates that everything is healthy once again:
+
+gwynedd:/u/lnz# cat /proc/rd/status
+OK
+
+
+ EXAMPLE II - DRIVE FAILURE WITH A STANDBY DRIVE
+
+The following annotated logs demonstrate the controller configuration and and
+online status monitoring capabilities of the Linux DAC960 Driver. The test
+configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a
+DAC960PJ controller. The physical drives are configured into a single drive
+group with a standby drive, and the drive group has been configured into two
+logical drives, one RAID-5 and one RAID-6. Note that these logs are from an
+earlier version of the driver and the messages have changed somewhat with newer
+releases, but the functionality remains similar. First, here is the current
+status of the RAID configuration:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 *****
+Copyright 1998-1999 by Leonard N. Zubkoff <lnz@dandelion.com>
+Configuring Mylex DAC960PJ PCI RAID Controller
+ Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB
+ PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned
+ PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9
+ Controller Queue Depth: 128, Maximum Blocks per Command: 128
+ Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33
+ Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Online, 2201600 blocks
+ 1:2 - Disk: Online, 2201600 blocks
+ 1:3 - Disk: Standby, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru
+ No Rebuild or Consistency Check in Progress
+
+gwynedd:/u/lnz# cat /proc/rd/status
+OK
+
+The above messages indicate that everything is healthy, and /proc/rd/status
+returns "OK" indicating that there are no problems with any DAC960 controller
+in the system. For demonstration purposes, while I/O is active Physical Drive
+1:2 is now disconnected, simulating a drive failure. The failure is noted by
+the driver within 10 seconds of the controller's having detected it, and the
+driver logs the following console status messages:
+
+DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02
+DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02
+DAC960#0: Physical Drive 1:2 killed because of timeout on SCSI command
+DAC960#0: Physical Drive 1:2 is now DEAD
+DAC960#0: Physical Drive 1:2 killed because it was removed
+DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL
+DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL
+
+Since a standby drive is configured, the controller automatically begins
+rebuilding onto the standby drive:
+
+DAC960#0: Physical Drive 1:3 is now WRITE-ONLY
+DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed
+
+Concurrently with the above, the driver status available from /proc/rd also
+reflects the drive failure and automatic rebuild. The status message in
+/proc/rd/status has changed from "OK" to "ALERT":
+
+gwynedd:/u/lnz# cat /proc/rd/status
+ALERT
+
+and /proc/rd/c0/current_status has been updated:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+ ...
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Online, 2201600 blocks
+ 1:2 - Disk: Dead, 2201600 blocks
+ 1:3 - Disk: Write-Only, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru
+ Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed
+
+As the rebuild progresses, the current status in /proc/rd/c0/current_status is
+updated every 10 seconds:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+ ...
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Online, 2201600 blocks
+ 1:2 - Disk: Dead, 2201600 blocks
+ 1:3 - Disk: Write-Only, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru
+ Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed
+
+and every minute a progress message is logged on the console by the driver:
+
+DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed
+DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 76% completed
+DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 66% completed
+DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 84% completed
+
+Finally, the rebuild completes successfully. The driver logs the status of the
+logical and physical drives and the rebuild completion:
+
+DAC960#0: Rebuild Completed Successfully
+DAC960#0: Physical Drive 1:3 is now ONLINE
+DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE
+DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE
+
+/proc/rd/c0/current_status is updated:
+
+***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 *****
+Copyright 1998-1999 by Leonard N. Zubkoff <lnz@dandelion.com>
+Configuring Mylex DAC960PJ PCI RAID Controller
+ Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB
+ PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned
+ PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9
+ Controller Queue Depth: 128, Maximum Blocks per Command: 128
+ Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33
+ Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Online, 2201600 blocks
+ 1:2 - Disk: Dead, 2201600 blocks
+ 1:3 - Disk: Online, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru
+ Rebuild Completed Successfully
+
+and /proc/rd/status indicates that everything is healthy once again:
+
+gwynedd:/u/lnz# cat /proc/rd/status
+OK
+
+Note that the absence of a viable standby drive does not create an "ALERT"
+status. Once dead Physical Drive 1:2 has been replaced, the controller must be
+told that this has occurred and that the newly replaced drive should become the
+new standby drive:
+
+gwynedd:/u/lnz# echo "make-standby 1:2" > /proc/rd/c0/user_command
+gwynedd:/u/lnz# cat /proc/rd/c0/user_command
+Make Standby of Physical Drive 1:2 Succeeded
+
+The echo command instructs the controller to make Physical Drive 1:2 into a
+standby drive, and the status message that results from the operation is then
+available for reading from /proc/rd/c0/user_command, as well as being logged to
+the console by the driver. Within 60 seconds of this command the driver logs:
+
+DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01
+DAC960#0: Physical Drive 1:2 is now STANDBY
+DAC960#0: Make Standby of Physical Drive 1:2 Succeeded
+
+and /proc/rd/c0/current_status is updated:
+
+gwynedd:/u/lnz# cat /proc/rd/c0/current_status
+ ...
+ Physical Devices:
+ 0:1 - Disk: Online, 2201600 blocks
+ 0:2 - Disk: Online, 2201600 blocks
+ 0:3 - Disk: Online, 2201600 blocks
+ 1:1 - Disk: Online, 2201600 blocks
+ 1:2 - Disk: Standby, 2201600 blocks
+ 1:3 - Disk: Online, 2201600 blocks
+ Logical Drives:
+ /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru
+ /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru
+ Rebuild Completed Successfully
diff --git a/Documentation/blockdev/cciss.txt b/Documentation/blockdev/cciss.txt
new file mode 100644
index 0000000..89698e8
--- /dev/null
+++ b/Documentation/blockdev/cciss.txt
@@ -0,0 +1,171 @@
+This driver is for Compaq's SMART Array Controllers.
+
+Supported Cards:
+----------------
+
+This driver is known to work with the following cards:
+
+ * SA 5300
+ * SA 5i
+ * SA 532
+ * SA 5312
+ * SA 641
+ * SA 642
+ * SA 6400
+ * SA 6400 U320 Expansion Module
+ * SA 6i
+ * SA P600
+ * SA P800
+ * SA E400
+ * SA P400i
+ * SA E200
+ * SA E200i
+ * SA E500
+ * SA P700m
+ * SA P212
+ * SA P410
+ * SA P410i
+ * SA P411
+ * SA P812
+ * SA P712m
+ * SA P711m
+
+Detecting drive failures:
+-------------------------
+
+To get the status of logical volumes and to detect physical drive
+failures, you can use the cciss_vol_status program found here:
+http://cciss.sourceforge.net/#cciss_utils
+
+Device Naming:
+--------------
+
+If nodes are not already created in the /dev/cciss directory, run as root:
+
+# cd /dev
+# ./MAKEDEV cciss
+
+You need some entries in /dev for the cciss device. The MAKEDEV script
+can make device nodes for you automatically. Currently the device setup
+is as follows:
+
+Major numbers:
+ 104 cciss0
+ 105 cciss1
+ 106 cciss2
+ 105 cciss3
+ 108 cciss4
+ 109 cciss5
+ 110 cciss6
+ 111 cciss7
+
+Minor numbers:
+ b7 b6 b5 b4 b3 b2 b1 b0
+ |----+----| |----+----|
+ | |
+ | +-------- Partition ID (0=wholedev, 1-15 partition)
+ |
+ +-------------------- Logical Volume number
+
+The device naming scheme is:
+/dev/cciss/c0d0 Controller 0, disk 0, whole device
+/dev/cciss/c0d0p1 Controller 0, disk 0, partition 1
+/dev/cciss/c0d0p2 Controller 0, disk 0, partition 2
+/dev/cciss/c0d0p3 Controller 0, disk 0, partition 3
+
+/dev/cciss/c1d1 Controller 1, disk 1, whole device
+/dev/cciss/c1d1p1 Controller 1, disk 1, partition 1
+/dev/cciss/c1d1p2 Controller 1, disk 1, partition 2
+/dev/cciss/c1d1p3 Controller 1, disk 1, partition 3
+
+SCSI tape drive and medium changer support
+------------------------------------------
+
+SCSI sequential access devices and medium changer devices are supported and
+appropriate device nodes are automatically created. (e.g.
+/dev/st0, /dev/st1, etc. See the "st" man page for more details.)
+You must enable "SCSI tape drive support for Smart Array 5xxx" and
+"SCSI support" in your kernel configuration to be able to use SCSI
+tape drives with your Smart Array 5xxx controller.
+
+Additionally, note that the driver will not engage the SCSI core at init
+time. The driver must be directed to dynamically engage the SCSI core via
+the /proc filesystem entry which the "block" side of the driver creates as
+/proc/driver/cciss/cciss* at runtime. This is because at driver init time,
+the SCSI core may not yet be initialized (because the driver is a block
+driver) and attempting to register it with the SCSI core in such a case
+would cause a hang. This is best done via an initialization script
+(typically in /etc/init.d, but could vary depending on distribution).
+For example:
+
+ for x in /proc/driver/cciss/cciss[0-9]*
+ do
+ echo "engage scsi" > $x
+ done
+
+Once the SCSI core is engaged by the driver, it cannot be disengaged
+(except by unloading the driver, if it happens to be linked as a module.)
+
+Note also that if no sequential access devices or medium changers are
+detected, the SCSI core will not be engaged by the action of the above
+script.
+
+Hot plug support for SCSI tape drives
+-------------------------------------
+
+Hot plugging of SCSI tape drives is supported, with some caveats.
+The cciss driver must be informed that changes to the SCSI bus
+have been made. This may be done via the /proc filesystem.
+For example:
+
+ echo "rescan" > /proc/scsi/cciss0/1
+
+This causes the driver to query the adapter about changes to the
+physical SCSI buses and/or fibre channel arbitrated loop and the
+driver to make note of any new or removed sequential access devices
+or medium changers. The driver will output messages indicating what
+devices have been added or removed and the controller, bus, target and
+lun used to address the device. It then notifies the SCSI mid layer
+of these changes.
+
+Note that the naming convention of the /proc filesystem entries
+contains a number in addition to the driver name. (E.g. "cciss0"
+instead of just "cciss" which you might expect.)
+
+Note: ONLY sequential access devices and medium changers are presented
+as SCSI devices to the SCSI mid layer by the cciss driver. Specifically,
+physical SCSI disk drives are NOT presented to the SCSI mid layer. The
+physical SCSI disk drives are controlled directly by the array controller
+hardware and it is important to prevent the kernel from attempting to directly
+access these devices too, as if the array controller were merely a SCSI
+controller in the same way that we are allowing it to access SCSI tape drives.
+
+SCSI error handling for tape drives and medium changers
+-------------------------------------------------------
+
+The linux SCSI mid layer provides an error handling protocol which
+kicks into gear whenever a SCSI command fails to complete within a
+certain amount of time (which can vary depending on the command).
+The cciss driver participates in this protocol to some extent. The
+normal protocol is a four step process. First the device is told
+to abort the command. If that doesn't work, the device is reset.
+If that doesn't work, the SCSI bus is reset. If that doesn't work
+the host bus adapter is reset. Because the cciss driver is a block
+driver as well as a SCSI driver and only the tape drives and medium
+changers are presented to the SCSI mid layer, and unlike more
+straightforward SCSI drivers, disk i/o continues through the block
+side during the SCSI error recovery process, the cciss driver only
+implements the first two of these actions, aborting the command, and
+resetting the device. Additionally, most tape drives will not oblige
+in aborting commands, and sometimes it appears they will not even
+obey a reset command, though in most circumstances they will. In
+the case that the command cannot be aborted and the device cannot be
+reset, the device will be set offline.
+
+In the event the error handling code is triggered and a tape drive is
+successfully reset or the tardy command is successfully aborted, the
+tape drive may still not allow i/o to continue until some command
+is issued which positions the tape to a known position. Typically you
+must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example)
+before i/o can proceed again to a tape drive which was reset.
+
diff --git a/Documentation/blockdev/cpqarray.txt b/Documentation/blockdev/cpqarray.txt
new file mode 100644
index 0000000..c7154e2
--- /dev/null
+++ b/Documentation/blockdev/cpqarray.txt
@@ -0,0 +1,93 @@
+This driver is for Compaq's SMART2 Intelligent Disk Array Controllers.
+
+Supported Cards:
+----------------
+
+This driver is known to work with the following cards:
+
+ * SMART (EISA)
+ * SMART-2/E (EISA)
+ * SMART-2/P
+ * SMART-2DH
+ * SMART-2SL
+ * SMART-221
+ * SMART-3100ES
+ * SMART-3200
+ * Integrated Smart Array Controller
+ * SA 4200
+ * SA 4250ES
+ * SA 431
+ * RAID LC2 Controller
+
+It should also work with some really old Disk array adapters, but I am
+unable to test against these cards:
+
+ * IDA
+ * IDA-2
+ * IAES
+
+
+EISA Controllers:
+-----------------
+
+If you want to use an EISA controller you'll have to supply some
+modprobe/lilo parameters. If the driver is compiled into the kernel, must
+give it the controller's IO port address at boot time (it is not
+necessary to specify the IRQ). For example, if you had two SMART-2/E
+controllers, in EISA slots 1 and 2 you'd give it a boot argument like
+this:
+
+ smart2=0x1000,0x2000
+
+If you were loading the driver as a module, you'd give load it like this:
+
+ modprobe cpqarray eisa=0x1000,0x2000
+
+You can use EISA and PCI adapters at the same time.
+
+
+Device Naming:
+--------------
+
+You need some entries in /dev for the ida device. MAKEDEV in the /dev
+directory can make device nodes for you automatically. The device setup is
+as follows:
+
+Major numbers:
+ 72 ida0
+ 73 ida1
+ 74 ida2
+ 75 ida3
+ 76 ida4
+ 77 ida5
+ 78 ida6
+ 79 ida7
+
+Minor numbers:
+ b7 b6 b5 b4 b3 b2 b1 b0
+ |----+----| |----+----|
+ | |
+ | +-------- Partition ID (0=wholedev, 1-15 partition)
+ |
+ +-------------------- Logical Volume number
+
+The device naming scheme is:
+/dev/ida/c0d0 Controller 0, disk 0, whole device
+/dev/ida/c0d0p1 Controller 0, disk 0, partition 1
+/dev/ida/c0d0p2 Controller 0, disk 0, partition 2
+/dev/ida/c0d0p3 Controller 0, disk 0, partition 3
+
+/dev/ida/c1d1 Controller 1, disk 1, whole device
+/dev/ida/c1d1p1 Controller 1, disk 1, partition 1
+/dev/ida/c1d1p2 Controller 1, disk 1, partition 2
+/dev/ida/c1d1p3 Controller 1, disk 1, partition 3
+
+
+Changelog:
+==========
+
+10-28-2004 : General cleanup, syntax fixes for in-kernel driver version.
+ James Nelson <james4765@gmail.com>
+
+
+1999 : Original Document
diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt
new file mode 100644
index 0000000..6ccab88
--- /dev/null
+++ b/Documentation/blockdev/floppy.txt
@@ -0,0 +1,245 @@
+This file describes the floppy driver.
+
+FAQ list:
+=========
+
+ A FAQ list may be found in the fdutils package (see below), and also
+at <http://fdutils.linux.lu/faq.html>.
+
+
+LILO configuration options (Thinkpad users, read this)
+======================================================
+
+ The floppy driver is configured using the 'floppy=' option in
+lilo. This option can be typed at the boot prompt, or entered in the
+lilo configuration file.
+
+ Example: If your kernel is called linux-2.6.9, type the following line
+at the lilo boot prompt (if you have a thinkpad):
+
+ linux-2.6.9 floppy=thinkpad
+
+You may also enter the following line in /etc/lilo.conf, in the description
+of linux-2.6.9:
+
+ append = "floppy=thinkpad"
+
+ Several floppy related options may be given, example:
+
+ linux-2.6.9 floppy=daring floppy=two_fdc
+ append = "floppy=daring floppy=two_fdc"
+
+ If you give options both in the lilo config file and on the boot
+prompt, the option strings of both places are concatenated, the boot
+prompt options coming last. That's why there are also options to
+restore the default behavior.
+
+
+Module configuration options
+============================
+
+ If you use the floppy driver as a module, use the following syntax:
+modprobe floppy <options>
+
+Example:
+ modprobe floppy omnibook messages
+
+ If you need certain options enabled every time you load the floppy driver,
+you can put:
+
+ options floppy omnibook messages
+
+in /etc/modprobe.conf.
+
+
+ The floppy driver related options are:
+
+ floppy=asus_pci
+ Sets the bit mask to allow only units 0 and 1. (default)
+
+ floppy=daring
+ Tells the floppy driver that you have a well behaved floppy controller.
+ This allows more efficient and smoother operation, but may fail on
+ certain controllers. This may speed up certain operations.
+
+ floppy=0,daring
+ Tells the floppy driver that your floppy controller should be used
+ with caution.
+
+ floppy=one_fdc
+ Tells the floppy driver that you have only one floppy controller.
+ (default)
+
+ floppy=two_fdc
+ floppy=<address>,two_fdc
+ Tells the floppy driver that you have two floppy controllers.
+ The second floppy controller is assumed to be at <address>.
+ This option is not needed if the second controller is at address
+ 0x370, and if you use the 'cmos' option.
+
+ floppy=thinkpad
+ Tells the floppy driver that you have a Thinkpad. Thinkpads use an
+ inverted convention for the disk change line.
+
+ floppy=0,thinkpad
+ Tells the floppy driver that you don't have a Thinkpad.
+
+ floppy=omnibook
+ floppy=nodma
+ Tells the floppy driver not to use Dma for data transfers.
+ This is needed on HP Omnibooks, which don't have a workable
+ DMA channel for the floppy driver. This option is also useful
+ if you frequently get "Unable to allocate DMA memory" messages.
+ Indeed, dma memory needs to be continuous in physical memory,
+ and is thus harder to find, whereas non-dma buffers may be
+ allocated in virtual memory. However, I advise against this if
+ you have an FDC without a FIFO (8272A or 82072). 82072A and
+ later are OK. You also need at least a 486 to use nodma.
+ If you use nodma mode, I suggest you also set the FIFO
+ threshold to 10 or lower, in order to limit the number of data
+ transfer interrupts.
+
+ If you have a FIFO-able FDC, the floppy driver automatically
+ falls back on non DMA mode if no DMA-able memory can be found.
+ If you want to avoid this, explicitly ask for 'yesdma'.
+
+ floppy=yesdma
+ Tells the floppy driver that a workable DMA channel is available.
+ (default)
+
+ floppy=nofifo
+ Disables the FIFO entirely. This is needed if you get "Bus
+ master arbitration error" messages from your Ethernet card (or
+ from other devices) while accessing the floppy.
+
+ floppy=usefifo
+ Enables the FIFO. (default)
+
+ floppy=<threshold>,fifo_depth
+ Sets the FIFO threshold. This is mostly relevant in DMA
+ mode. If this is higher, the floppy driver tolerates more
+ interrupt latency, but it triggers more interrupts (i.e. it
+ imposes more load on the rest of the system). If this is
+ lower, the interrupt latency should be lower too (faster
+ processor). The benefit of a lower threshold is less
+ interrupts.
+
+ To tune the fifo threshold, switch on over/underrun messages
+ using 'floppycontrol --messages'. Then access a floppy
+ disk. If you get a huge amount of "Over/Underrun - retrying"
+ messages, then the fifo threshold is too low. Try with a
+ higher value, until you only get an occasional Over/Underrun.
+ It is a good idea to compile the floppy driver as a module
+ when doing this tuning. Indeed, it allows to try different
+ fifo values without rebooting the machine for each test. Note
+ that you need to do 'floppycontrol --messages' every time you
+ re-insert the module.
+
+ Usually, tuning the fifo threshold should not be needed, as
+ the default (0xa) is reasonable.
+
+ floppy=<drive>,<type>,cmos
+ Sets the CMOS type of <drive> to <type>. This is mandatory if
+ you have more than two floppy drives (only two can be
+ described in the physical CMOS), or if your BIOS uses
+ non-standard CMOS types. The CMOS types are:
+
+ 0 - Use the value of the physical CMOS
+ 1 - 5 1/4 DD
+ 2 - 5 1/4 HD
+ 3 - 3 1/2 DD
+ 4 - 3 1/2 HD
+ 5 - 3 1/2 ED
+ 6 - 3 1/2 ED
+ 16 - unknown or not installed
+
+ (Note: there are two valid types for ED drives. This is because 5 was
+ initially chosen to represent floppy *tapes*, and 6 for ED drives.
+ AMI ignored this, and used 5 for ED drives. That's why the floppy
+ driver handles both.)
+
+ floppy=unexpected_interrupts
+ Print a warning message when an unexpected interrupt is received.
+ (default)
+
+ floppy=no_unexpected_interrupts
+ floppy=L40SX
+ Don't print a message when an unexpected interrupt is received. This
+ is needed on IBM L40SX laptops in certain video modes. (There seems
+ to be an interaction between video and floppy. The unexpected
+ interrupts affect only performance, and can be safely ignored.)
+
+ floppy=broken_dcl
+ Don't use the disk change line, but assume that the disk was
+ changed whenever the device node is reopened. Needed on some
+ boxes where the disk change line is broken or unsupported.
+ This should be regarded as a stopgap measure, indeed it makes
+ floppy operation less efficient due to unneeded cache
+ flushings, and slightly more unreliable. Please verify your
+ cable, connection and jumper settings if you have any DCL
+ problems. However, some older drives, and also some laptops
+ are known not to have a DCL.
+
+ floppy=debug
+ Print debugging messages.
+
+ floppy=messages
+ Print informational messages for some operations (disk change
+ notifications, warnings about over and underruns, and about
+ autodetection).
+
+ floppy=silent_dcl_clear
+ Uses a less noisy way to clear the disk change line (which
+ doesn't involve seeks). Implied by 'daring' option.
+
+ floppy=<nr>,irq
+ Sets the floppy IRQ to <nr> instead of 6.
+
+ floppy=<nr>,dma
+ Sets the floppy DMA channel to <nr> instead of 2.
+
+ floppy=slow
+ Use PS/2 stepping rate:
+ " PS/2 floppies have much slower step rates than regular floppies.
+ It's been recommended that take about 1/4 of the default speed
+ in some more extreme cases."
+
+
+Supporting utilities and additional documentation:
+==================================================
+
+ Additional parameters of the floppy driver can be configured at
+runtime. Utilities which do this can be found in the fdutils package.
+This package also contains a new version of mtools which allows to
+access high capacity disks (up to 1992K on a high density 3 1/2 disk!).
+It also contains additional documentation about the floppy driver.
+
+The latest version can be found at fdutils homepage:
+ http://fdutils.linux.lu
+
+The fdutils releases can be found at:
+ http://fdutils.linux.lu/download.html
+ http://www.tux.org/pub/knaff/fdutils/
+ ftp://metalab.unc.edu/pub/Linux/utils/disk-management/
+
+Reporting problems about the floppy driver
+==========================================
+
+ If you have a question or a bug report about the floppy driver, mail
+me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use
+comp.os.linux.hardware. As the volume in these groups is rather high,
+be sure to include the word "floppy" (or "FLOPPY") in the subject
+line. If the reported problem happens when mounting floppy disks, be
+sure to mention also the type of the filesystem in the subject line.
+
+ Be sure to read the FAQ before mailing/posting any bug reports!
+
+ Alain
+
+Changelog
+=========
+
+10-30-2004 : Cleanup, updating, add reference to module configuration.
+ James Nelson <james4765@gmail.com>
+
+6-3-2000 : Original Document
diff --git a/Documentation/blockdev/nbd.txt b/Documentation/blockdev/nbd.txt
new file mode 100644
index 0000000..aeb93ff
--- /dev/null
+++ b/Documentation/blockdev/nbd.txt
@@ -0,0 +1,47 @@
+ Network Block Device (TCP version)
+
+ What is it: With this compiled in the kernel (or as a module), Linux
+ can use a remote server as one of its block devices. So every time
+ the client computer wants to read, e.g., /dev/nb0, it sends a
+ request over TCP to the server, which will reply with the data read.
+ This can be used for stations with low disk space (or even diskless -
+ if you boot from floppy) to borrow disk space from another computer.
+ Unlike NFS, it is possible to put any filesystem on it, etc. It should
+ even be possible to use NBD as a root filesystem (I've never tried),
+ but it requires a user-level program to be in the initrd to start.
+ It also allows you to run block-device in user land (making server
+ and client physically the same computer, communicating using loopback).
+
+ Current state: It currently works. Network block device is stable.
+ I originally thought that it was impossible to swap over TCP. It
+ turned out not to be true - swapping over TCP now works and seems
+ to be deadlock-free, but it requires heavy patches into Linux's
+ network layer.
+
+ For more information, or to download the nbd-client and nbd-server
+ tools, go to http://nbd.sf.net/.
+
+ Howto: To setup nbd, you can simply do the following:
+
+ First, serve a device or file from a remote server:
+
+ nbd-server <port-number> <device-or-file-to-serve-to-client>
+
+ e.g.,
+ root@server1 # nbd-server 1234 /dev/sdb1
+
+ (serves sdb1 partition on TCP port 1234)
+
+ Then, on the local (client) system:
+
+ nbd-client <server-name-or-IP> <server-port-number> /dev/nb[0-n]
+
+ e.g.,
+ root@client1 # nbd-client server1 1234 /dev/nb0
+
+ (creates the nb0 device on client1)
+
+ The nbd kernel module need only be installed on the client
+ system, as the nbd-server is completely in userspace. In fact,
+ the nbd-server has been successfully ported to other operating
+ systems, including Windows.
diff --git a/Documentation/blockdev/paride.txt b/Documentation/blockdev/paride.txt
new file mode 100644
index 0000000..e431267
--- /dev/null
+++ b/Documentation/blockdev/paride.txt
@@ -0,0 +1,417 @@
+
+ Linux and parallel port IDE devices
+
+PARIDE v1.03 (c) 1997-8 Grant Guenther <grant@torque.net>
+
+1. Introduction
+
+Owing to the simplicity and near universality of the parallel port interface
+to personal computers, many external devices such as portable hard-disk,
+CD-ROM, LS-120 and tape drives use the parallel port to connect to their
+host computer. While some devices (notably scanners) use ad-hoc methods
+to pass commands and data through the parallel port interface, most
+external devices are actually identical to an internal model, but with
+a parallel-port adapter chip added in. Some of the original parallel port
+adapters were little more than mechanisms for multiplexing a SCSI bus.
+(The Iomega PPA-3 adapter used in the ZIP drives is an example of this
+approach). Most current designs, however, take a different approach.
+The adapter chip reproduces a small ISA or IDE bus in the external device
+and the communication protocol provides operations for reading and writing
+device registers, as well as data block transfer functions. Sometimes,
+the device being addressed via the parallel cable is a standard SCSI
+controller like an NCR 5380. The "ditto" family of external tape
+drives use the ISA replicator to interface a floppy disk controller,
+which is then connected to a floppy-tape mechanism. The vast majority
+of external parallel port devices, however, are now based on standard
+IDE type devices, which require no intermediate controller. If one
+were to open up a parallel port CD-ROM drive, for instance, one would
+find a standard ATAPI CD-ROM drive, a power supply, and a single adapter
+that interconnected a standard PC parallel port cable and a standard
+IDE cable. It is usually possible to exchange the CD-ROM device with
+any other device using the IDE interface.
+
+The document describes the support in Linux for parallel port IDE
+devices. It does not cover parallel port SCSI devices, "ditto" tape
+drives or scanners. Many different devices are supported by the
+parallel port IDE subsystem, including:
+
+ MicroSolutions backpack CD-ROM
+ MicroSolutions backpack PD/CD
+ MicroSolutions backpack hard-drives
+ MicroSolutions backpack 8000t tape drive
+ SyQuest EZ-135, EZ-230 & SparQ drives
+ Avatar Shark
+ Imation Superdisk LS-120
+ Maxell Superdisk LS-120
+ FreeCom Power CD
+ Hewlett-Packard 5GB and 8GB tape drives
+ Hewlett-Packard 7100 and 7200 CD-RW drives
+
+as well as most of the clone and no-name products on the market.
+
+To support such a wide range of devices, PARIDE, the parallel port IDE
+subsystem, is actually structured in three parts. There is a base
+paride module which provides a registry and some common methods for
+accessing the parallel ports. The second component is a set of
+high-level drivers for each of the different types of supported devices:
+
+ pd IDE disk
+ pcd ATAPI CD-ROM
+ pf ATAPI disk
+ pt ATAPI tape
+ pg ATAPI generic
+
+(Currently, the pg driver is only used with CD-R drives).
+
+The high-level drivers function according to the relevant standards.
+The third component of PARIDE is a set of low-level protocol drivers
+for each of the parallel port IDE adapter chips. Thanks to the interest
+and encouragement of Linux users from many parts of the world,
+support is available for almost all known adapter protocols:
+
+ aten ATEN EH-100 (HK)
+ bpck Microsolutions backpack (US)
+ comm DataStor (old-type) "commuter" adapter (TW)
+ dstr DataStor EP-2000 (TW)
+ epat Shuttle EPAT (UK)
+ epia Shuttle EPIA (UK)
+ fit2 FIT TD-2000 (US)
+ fit3 FIT TD-3000 (US)
+ friq Freecom IQ cable (DE)
+ frpw Freecom Power (DE)
+ kbic KingByte KBIC-951A and KBIC-971A (TW)
+ ktti KT Technology PHd adapter (SG)
+ on20 OnSpec 90c20 (US)
+ on26 OnSpec 90c26 (US)
+
+
+2. Using the PARIDE subsystem
+
+While configuring the Linux kernel, you may choose either to build
+the PARIDE drivers into your kernel, or to build them as modules.
+
+In either case, you will need to select "Parallel port IDE device support"
+as well as at least one of the high-level drivers and at least one
+of the parallel port communication protocols. If you do not know
+what kind of parallel port adapter is used in your drive, you could
+begin by checking the file names and any text files on your DOS
+installation floppy. Alternatively, you can look at the markings on
+the adapter chip itself. That's usually sufficient to identify the
+correct device.
+
+You can actually select all the protocol modules, and allow the PARIDE
+subsystem to try them all for you.
+
+For the "brand-name" products listed above, here are the protocol
+and high-level drivers that you would use:
+
+ Manufacturer Model Driver Protocol
+
+ MicroSolutions CD-ROM pcd bpck
+ MicroSolutions PD drive pf bpck
+ MicroSolutions hard-drive pd bpck
+ MicroSolutions 8000t tape pt bpck
+ SyQuest EZ, SparQ pd epat
+ Imation Superdisk pf epat
+ Maxell Superdisk pf friq
+ Avatar Shark pd epat
+ FreeCom CD-ROM pcd frpw
+ Hewlett-Packard 5GB Tape pt epat
+ Hewlett-Packard 7200e (CD) pcd epat
+ Hewlett-Packard 7200e (CD-R) pg epat
+
+2.1 Configuring built-in drivers
+
+We recommend that you get to know how the drivers work and how to
+configure them as loadable modules, before attempting to compile a
+kernel with the drivers built-in.
+
+If you built all of your PARIDE support directly into your kernel,
+and you have just a single parallel port IDE device, your kernel should
+locate it automatically for you. If you have more than one device,
+you may need to give some command line options to your bootloader
+(eg: LILO), how to do that is beyond the scope of this document.
+
+The high-level drivers accept a number of command line parameters, all
+of which are documented in the source files in linux/drivers/block/paride.
+By default, each driver will automatically try all parallel ports it
+can find, and all protocol types that have been installed, until it finds
+a parallel port IDE adapter. Once it finds one, the probe stops. So,
+if you have more than one device, you will need to tell the drivers
+how to identify them. This requires specifying the port address, the
+protocol identification number and, for some devices, the drive's
+chain ID. While your system is booting, a number of messages are
+displayed on the console. Like all such messages, they can be
+reviewed with the 'dmesg' command. Among those messages will be
+some lines like:
+
+ paride: bpck registered as protocol 0
+ paride: epat registered as protocol 1
+
+The numbers will always be the same until you build a new kernel with
+different protocol selections. You should note these numbers as you
+will need them to identify the devices.
+
+If you happen to be using a MicroSolutions backpack device, you will
+also need to know the unit ID number for each drive. This is usually
+the last two digits of the drive's serial number (but read MicroSolutions'
+documentation about this).
+
+As an example, let's assume that you have a MicroSolutions PD/CD drive
+with unit ID number 36 connected to the parallel port at 0x378, a SyQuest
+EZ-135 connected to the chained port on the PD/CD drive and also an
+Imation Superdisk connected to port 0x278. You could give the following
+options on your boot command:
+
+ pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36
+
+In the last option, pf.drive1 configures device /dev/pf1, the 0x378
+is the parallel port base address, the 0 is the protocol registration
+number and 36 is the chain ID.
+
+Please note: while PARIDE will work both with and without the
+PARPORT parallel port sharing system that is included by the
+"Parallel port support" option, PARPORT must be included and enabled
+if you want to use chains of devices on the same parallel port.
+
+2.2 Loading and configuring PARIDE as modules
+
+It is much faster and simpler to get to understand the PARIDE drivers
+if you use them as loadable kernel modules.
+
+Note 1: using these drivers with the "kerneld" automatic module loading
+system is not recommended for beginners, and is not documented here.
+
+Note 2: if you build PARPORT support as a loadable module, PARIDE must
+also be built as loadable modules, and PARPORT must be loaded before the
+PARIDE modules.
+
+To use PARIDE, you must begin by
+
+ insmod paride
+
+this loads a base module which provides a registry for the protocols,
+among other tasks.
+
+Then, load as many of the protocol modules as you think you might need.
+As you load each module, it will register the protocols that it supports,
+and print a log message to your kernel log file and your console. For
+example:
+
+ # insmod epat
+ paride: epat registered as protocol 0
+ # insmod kbic
+ paride: k951 registered as protocol 1
+ paride: k971 registered as protocol 2
+
+Finally, you can load high-level drivers for each kind of device that
+you have connected. By default, each driver will autoprobe for a single
+device, but you can support up to four similar devices by giving their
+individual co-ordinates when you load the driver.
+
+For example, if you had two no-name CD-ROM drives both using the
+KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc
+you could give the following command:
+
+ # insmod pcd drive0=0x378,1 drive1=0x3bc,1
+
+For most adapters, giving a port address and protocol number is sufficient,
+but check the source files in linux/drivers/block/paride for more
+information. (Hopefully someone will write some man pages one day !).
+
+As another example, here's what happens when PARPORT is installed, and
+a SyQuest EZ-135 is attached to port 0x378:
+
+ # insmod paride
+ paride: version 1.0 installed
+ # insmod epat
+ paride: epat registered as protocol 0
+ # insmod pd
+ pd: pd version 1.0, major 45, cluster 64, nice 0
+ pda: Sharing parport1 at 0x378
+ pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1
+ pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media
+ pda: pda1
+
+Note that the last line is the output from the generic partition table
+scanner - in this case it reports that it has found a disk with one partition.
+
+2.3 Using a PARIDE device
+
+Once the drivers have been loaded, you can access PARIDE devices in the
+same way as their traditional counterparts. You will probably need to
+create the device "special files". Here is a simple script that you can
+cut to a file and execute:
+
+#!/bin/bash
+#
+# mkd -- a script to create the device special files for the PARIDE subsystem
+#
+function mkdev {
+ mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
+}
+#
+function pd {
+ D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
+ mkdev pd$D b 45 $[ $1 * 16 ]
+ for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
+ done
+}
+#
+cd /dev
+#
+for u in 0 1 2 3 ; do pd $u ; done
+for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done
+for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done
+for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done
+for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done
+for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done
+#
+# end of mkd
+
+With the device files and drivers in place, you can access PARIDE devices
+like any other Linux device. For example, to mount a CD-ROM in pcd0, use:
+
+ mount /dev/pcd0 /cdrom
+
+If you have a fresh Avatar Shark cartridge, and the drive is pda, you
+might do something like:
+
+ fdisk /dev/pda -- make a new partition table with
+ partition 1 of type 83
+
+ mke2fs /dev/pda1 -- to build the file system
+
+ mkdir /shark -- make a place to mount the disk
+
+ mount /dev/pda1 /shark
+
+Devices like the Imation superdisk work in the same way, except that
+they do not have a partition table. For example to make a 120MB
+floppy that you could share with a DOS system:
+
+ mkdosfs /dev/pf0
+ mount /dev/pf0 /mnt
+
+
+2.4 The pf driver
+
+The pf driver is intended for use with parallel port ATAPI disk
+devices. The most common devices in this category are PD drives
+and LS-120 drives. Traditionally, media for these devices are not
+partitioned. Consequently, the pf driver does not support partitioned
+media. This may be changed in a future version of the driver.
+
+2.5 Using the pt driver
+
+The pt driver for parallel port ATAPI tape drives is a minimal driver.
+It does not yet support many of the standard tape ioctl operations.
+For best performance, a block size of 32KB should be used. You will
+probably want to set the parallel port delay to 0, if you can.
+
+2.6 Using the pg driver
+
+The pg driver can be used in conjunction with the cdrecord program
+to create CD-ROMs. Please get cdrecord version 1.6.1 or later
+from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media
+your parallel port should ideally be set to EPP mode, and the "port delay"
+should be set to 0. With those settings it is possible to record at 2x
+speed without any buffer underruns. If you cannot get the driver to work
+in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
+
+
+3. Troubleshooting
+
+3.1 Use EPP mode if you can
+
+The most common problems that people report with the PARIDE drivers
+concern the parallel port CMOS settings. At this time, none of the
+PARIDE protocol modules support ECP mode, or any ECP combination modes.
+If you are able to do so, please set your parallel port into EPP mode
+using your CMOS setup procedure.
+
+3.2 Check the port delay
+
+Some parallel ports cannot reliably transfer data at full speed. To
+offset the errors, the PARIDE protocol modules introduce a "port
+delay" between each access to the i/o ports. Each protocol sets
+a default value for this delay. In most cases, the user can override
+the default and set it to 0 - resulting in somewhat higher transfer
+rates. In some rare cases (especially with older 486 systems) the
+default delays are not long enough. if you experience corrupt data
+transfers, or unexpected failures, you may wish to increase the
+port delay. The delay can be programmed using the "driveN" parameters
+to each of the high-level drivers. Please see the notes above, or
+read the comments at the beginning of the driver source files in
+linux/drivers/block/paride.
+
+3.3 Some drives need a printer reset
+
+There appear to be a number of "noname" external drives on the market
+that do not always power up correctly. We have noticed this with some
+drives based on OnSpec and older Freecom adapters. In these rare cases,
+the adapter can often be reinitialised by issuing a "printer reset" on
+the parallel port. As the reset operation is potentially disruptive in
+multiple device environments, the PARIDE drivers will not do it
+automatically. You can however, force a printer reset by doing:
+
+ insmod lp reset=1
+ rmmod lp
+
+If you have one of these marginal cases, you should probably build
+your paride drivers as modules, and arrange to do the printer reset
+before loading the PARIDE drivers.
+
+3.4 Use the verbose option and dmesg if you need help
+
+While a lot of testing has gone into these drivers to make them work
+as smoothly as possible, problems will arise. If you do have problems,
+please check all the obvious things first: does the drive work in
+DOS with the manufacturer's drivers ? If that doesn't yield any useful
+clues, then please make sure that only one drive is hooked to your system,
+and that either (a) PARPORT is enabled or (b) no other device driver
+is using your parallel port (check in /proc/ioports). Then, load the
+appropriate drivers (you can load several protocol modules if you want)
+as in:
+
+ # insmod paride
+ # insmod epat
+ # insmod bpck
+ # insmod kbic
+ ...
+ # insmod pd verbose=1
+
+(using the correct driver for the type of device you have, of course).
+The verbose=1 parameter will cause the drivers to log a trace of their
+activity as they attempt to locate your drive.
+
+Use 'dmesg' to capture a log of all the PARIDE messages (any messages
+beginning with paride:, a protocol module's name or a driver's name) and
+include that with your bug report. You can submit a bug report in one
+of two ways. Either send it directly to the author of the PARIDE suite,
+by e-mail to grant@torque.net, or join the linux-parport mailing list
+and post your report there.
+
+3.5 For more information or help
+
+You can join the linux-parport mailing list by sending a mail message
+to
+ linux-parport-request@torque.net
+
+with the single word
+
+ subscribe
+
+in the body of the mail message (not in the subject line). Please be
+sure that your mail program is correctly set up when you do this, as
+the list manager is a robot that will subscribe you using the reply
+address in your mail headers. REMOVE any anti-spam gimmicks you may
+have in your mail headers, when sending mail to the list server.
+
+You might also find some useful information on the linux-parport
+web pages (although they are not always up to date) at
+
+ http://www.torque.net/parport/
+
+
diff --git a/Documentation/blockdev/ramdisk.txt b/Documentation/blockdev/ramdisk.txt
new file mode 100644
index 0000000..6c820ba
--- /dev/null
+++ b/Documentation/blockdev/ramdisk.txt
@@ -0,0 +1,165 @@
+Using the RAM disk block device with Linux
+------------------------------------------
+
+Contents:
+
+ 1) Overview
+ 2) Kernel Command Line Parameters
+ 3) Using "rdev -r"
+ 4) An Example of Creating a Compressed RAM Disk
+
+
+1) Overview
+-----------
+
+The RAM disk driver is a way to use main system memory as a block device. It
+is required for initrd, an initial filesystem used if you need to load modules
+in order to access the root filesystem (see Documentation/initrd.txt). It can
+also be used for a temporary filesystem for crypto work, since the contents
+are erased on reboot.
+
+The RAM disk dynamically grows as more space is required. It does this by using
+RAM from the buffer cache. The driver marks the buffers it is using as dirty
+so that the VM subsystem does not try to reclaim them later.
+
+The RAM disk supports up to 16 RAM disks by default, and can be reconfigured
+to support an unlimited number of RAM disks (at your own risk). Just change
+the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu
+and (re)build the kernel.
+
+To use RAM disk support with your system, run './MAKEDEV ram' from the /dev
+directory. RAM disks are all major number 1, and start with minor number 0
+for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd.
+
+The new RAM disk also has the ability to load compressed RAM disk images,
+allowing one to squeeze more programs onto an average installation or
+rescue floppy disk.
+
+
+2) Kernel Command Line Parameters
+---------------------------------
+
+ ramdisk_size=N
+ ==============
+
+This parameter tells the RAM disk driver to set up RAM disks of N k size. The
+default is 4096 (4 MB) (8192 (8 MB) on S390).
+
+ ramdisk_blocksize=N
+ ===================
+
+This parameter tells the RAM disk driver how many bytes to use per block. The
+default is 1024 (BLOCK_SIZE).
+
+
+3) Using "rdev -r"
+------------------
+
+The usage of the word (two bytes) that "rdev -r" sets in the kernel image is
+as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up
+to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit
+14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a
+prompt/wait sequence is to be given before trying to read the RAM disk. Since
+the RAM disk dynamically grows as data is being written into it, a size field
+is not required. Bits 11 to 13 are not currently used and may as well be zero.
+These numbers are no magical secrets, as seen below:
+
+./arch/i386/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF
+./arch/i386/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000
+./arch/i386/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000
+
+Consider a typical two floppy disk setup, where you will have the
+kernel on disk one, and have already put a RAM disk image onto disk #2.
+
+Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk
+starts at an offset of 0 kB from the beginning of the floppy.
+The command line equivalent is: "ramdisk_start=0"
+
+You want bit 14 as one, indicating that a RAM disk is to be loaded.
+The command line equivalent is: "load_ramdisk=1"
+
+You want bit 15 as one, indicating that you want a prompt/keypress
+sequence so that you have a chance to switch floppy disks.
+The command line equivalent is: "prompt_ramdisk=1"
+
+Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word.
+So to create disk one of the set, you would do:
+
+ /usr/src/linux# cat arch/i386/boot/zImage > /dev/fd0
+ /usr/src/linux# rdev /dev/fd0 /dev/fd0
+ /usr/src/linux# rdev -r /dev/fd0 49152
+
+If you make a boot disk that has LILO, then for the above, you would use:
+ append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1"
+Since the default start = 0 and the default prompt = 1, you could use:
+ append = "load_ramdisk=1"
+
+
+4) An Example of Creating a Compressed RAM Disk
+----------------------------------------------
+
+To create a RAM disk image, you will need a spare block device to
+construct it on. This can be the RAM disk device itself, or an
+unused disk partition (such as an unmounted swap partition). For this
+example, we will use the RAM disk device, "/dev/ram0".
+
+Note: This technique should not be done on a machine with less than 8 MB
+of RAM. If using a spare disk partition instead of /dev/ram0, then this
+restriction does not apply.
+
+a) Decide on the RAM disk size that you want. Say 2 MB for this example.
+ Create it by writing to the RAM disk device. (This step is not currently
+ required, but may be in the future.) It is wise to zero out the
+ area (esp. for disks) so that maximal compression is achieved for
+ the unused blocks of the image that you are about to create.
+
+ dd if=/dev/zero of=/dev/ram0 bs=1k count=2048
+
+b) Make a filesystem on it. Say ext2fs for this example.
+
+ mke2fs -vm0 /dev/ram0 2048
+
+c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...)
+ and unmount it again.
+
+d) Compress the contents of the RAM disk. The level of compression
+ will be approximately 50% of the space used by the files. Unused
+ space on the RAM disk will compress to almost nothing.
+
+ dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz
+
+e) Put the kernel onto the floppy
+
+ dd if=zImage of=/dev/fd0 bs=1k
+
+f) Put the RAM disk image onto the floppy, after the kernel. Use an offset
+ that is slightly larger than the kernel, so that you can put another
+ (possibly larger) kernel onto the same floppy later without overlapping
+ the RAM disk image. An offset of 400 kB for kernels about 350 kB in
+ size would be reasonable. Make sure offset+size of ram_image.gz is
+ not larger than the total space on your floppy (usually 1440 kB).
+
+ dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400
+
+g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc.
+ For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would
+ have 2^15 + 2^14 + 400 = 49552.
+
+ rdev /dev/fd0 /dev/fd0
+ rdev -r /dev/fd0 49552
+
+That is it. You now have your boot/root compressed RAM disk floppy. Some
+users may wish to combine steps (d) and (f) by using a pipe.
+
+--------------------------------------------------------------------------
+ Paul Gortmaker 12/95
+
+Changelog:
+----------
+
+10-22-04 : Updated to reflect changes in command line options, remove
+ obsolete references, general cleanup.
+ James Nelson (james4765@gmail.com)
+
+
+12-95 : Original Document
diff --git a/Documentation/braille-console.txt b/Documentation/braille-console.txt
new file mode 100644
index 0000000..000b0fb
--- /dev/null
+++ b/Documentation/braille-console.txt
@@ -0,0 +1,34 @@
+ Linux Braille Console
+
+To get early boot messages on a braille device (before userspace screen
+readers can start), you first need to compile the support for the usual serial
+console (see serial-console.txt), and for braille device (in Device Drivers -
+Accessibility).
+
+Then you need to specify a console=brl, option on the kernel command line, the
+format is:
+
+ console=brl,serial_options...
+
+where serial_options... are the same as described in serial-console.txt
+
+So for instance you can use console=brl,ttyS0 if the braille device is connected
+to the first serial port, and console=brl,ttyS0,115200 to override the baud rate
+to 115200, etc.
+
+By default, the braille device will just show the last kernel message (console
+mode). To review previous messages, press the Insert key to switch to the VT
+review mode. In review mode, the arrow keys permit to browse in the VT content,
+page up/down keys go at the top/bottom of the screen, and the home key goes back
+to the cursor, hence providing very basic screen reviewing facility.
+
+Sound feedback can be obtained by adding the braille_console.sound=1 kernel
+parameter.
+
+For simplicity, only one braille console can be enabled, other uses of
+console=brl,... will be discarded. Also note that it does not interfere with
+the console selection mecanism described in serial-console.txt
+
+For now, only the VisioBraille device is supported.
+
+Samuel Thibault <samuel.thibault@ens-lyon.org>
diff --git a/Documentation/bt8xxgpio.txt b/Documentation/bt8xxgpio.txt
new file mode 100644
index 0000000..d8297e4
--- /dev/null
+++ b/Documentation/bt8xxgpio.txt
@@ -0,0 +1,67 @@
+===============================================================
+== BT8XXGPIO driver ==
+== ==
+== A driver for a selfmade cheap BT8xx based PCI GPIO-card ==
+== ==
+== For advanced documentation, see ==
+== http://www.bu3sch.de/btgpio.php ==
+===============================================================
+
+
+A generic digital 24-port PCI GPIO card can be built out of an ordinary
+Brooktree bt848, bt849, bt878 or bt879 based analog TV tuner card. The
+Brooktree chip is used in old analog Hauppauge WinTV PCI cards. You can easily
+find them used for low prices on the net.
+
+The bt8xx chip does have 24 digital GPIO ports.
+These ports are accessible via 24 pins on the SMD chip package.
+
+
+==============================================
+== How to physically access the GPIO pins ==
+==============================================
+
+The are several ways to access these pins. One might unsolder the whole chip
+and put it on a custom PCI board, or one might only unsolder each individual
+GPIO pin and solder that to some tiny wire. As the chip package really is tiny
+there are some advanced soldering skills needed in any case.
+
+The physical pinouts are drawn in the following ASCII art.
+The GPIO pins are marked with G00-G23
+
+ G G G G G G G G G G G G G G G G G G
+ 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7
+ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ ---------------------------------------------------------------------------
+ --| ^ ^ |--
+ --| pin 86 pin 67 |--
+ --| |--
+ --| pin 61 > |-- G18
+ --| |-- G19
+ --| |-- G20
+ --| |-- G21
+ --| |-- G22
+ --| pin 56 > |-- G23
+ --| |--
+ --| Brooktree 878/879 |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| |--
+ --| O |--
+ --| |--
+ ---------------------------------------------------------------------------
+ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ ^
+ This is pin 1
+
diff --git a/Documentation/c2port.txt b/Documentation/c2port.txt
new file mode 100644
index 0000000..d9bf93e
--- /dev/null
+++ b/Documentation/c2port.txt
@@ -0,0 +1,90 @@
+ C2 port support
+ ---------------
+
+(C) Copyright 2007 Rodolfo Giometti <giometti@enneenne.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+
+
+Overview
+--------
+
+This driver implements the support for Linux of Silicon Labs (Silabs)
+C2 Interface used for in-system programming of micro controllers.
+
+By using this driver you can reprogram the in-system flash without EC2
+or EC3 debug adapter. This solution is also useful in those systems
+where the micro controller is connected via special GPIOs pins.
+
+References
+----------
+
+The C2 Interface main references are at (http://www.silabs.com)
+Silicon Laboratories site], see:
+
+- AN127: FLASH Programming via the C2 Interface at
+http://www.silabs.com/public/documents/tpub_doc/anote/Microcontrollers/Small_Form_Factor/en/an127.pdf, and
+
+- C2 Specification at
+http://www.silabs.com/public/documents/tpub_doc/spec/Microcontrollers/en/C2spec.pdf,
+
+however it implements a two wire serial communication protocol (bit
+banging) designed to enable in-system programming, debugging, and
+boundary-scan testing on low pin-count Silicon Labs devices. Currently
+this code supports only flash programming but extensions are easy to
+add.
+
+Using the driver
+----------------
+
+Once the driver is loaded you can use sysfs support to get C2port's
+info or read/write in-system flash.
+
+# ls /sys/class/c2port/c2port0/
+access flash_block_size flash_erase rev_id
+dev_id flash_blocks_num flash_size subsystem/
+flash_access flash_data reset uevent
+
+Initially the C2port access is disabled since you hardware may have
+such lines multiplexed with other devices so, to get access to the
+C2port, you need the command:
+
+# echo 1 > /sys/class/c2port/c2port0/access
+
+after that you should read the device ID and revision ID of the
+connected micro controller:
+
+# cat /sys/class/c2port/c2port0/dev_id
+8
+# cat /sys/class/c2port/c2port0/rev_id
+1
+
+However, for security reasons, the in-system flash access in not
+enabled yet, to do so you need the command:
+
+# echo 1 > /sys/class/c2port/c2port0/flash_access
+
+After that you can read the whole flash:
+
+# cat /sys/class/c2port/c2port0/flash_data > image
+
+erase it:
+
+# echo 1 > /sys/class/c2port/c2port0/flash_erase
+
+and write it:
+
+# cat image > /sys/class/c2port/c2port0/flash_data
+
+after writing you have to reset the device to execute the new code:
+
+# echo 1 > /sys/class/c2port/c2port0/reset
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
new file mode 100644
index 0000000..da42ab4
--- /dev/null
+++ b/Documentation/cachetlb.txt
@@ -0,0 +1,379 @@
+ Cache and TLB Flushing
+ Under Linux
+
+ David S. Miller <davem@redhat.com>
+
+This document describes the cache/tlb flushing interfaces called
+by the Linux VM subsystem. It enumerates over each interface,
+describes it's intended purpose, and what side effect is expected
+after the interface is invoked.
+
+The side effects described below are stated for a uniprocessor
+implementation, and what is to happen on that single processor. The
+SMP cases are a simple extension, in that you just extend the
+definition such that the side effect for a particular interface occurs
+on all processors in the system. Don't let this scare you into
+thinking SMP cache/tlb flushing must be so inefficient, this is in
+fact an area where many optimizations are possible. For example,
+if it can be proven that a user address space has never executed
+on a cpu (see vma->cpu_vm_mask), one need not perform a flush
+for this address space on that cpu.
+
+First, the TLB flushing interfaces, since they are the simplest. The
+"TLB" is abstracted under Linux as something the cpu uses to cache
+virtual-->physical address translations obtained from the software
+page tables. Meaning that if the software page tables change, it is
+possible for stale translations to exist in this "TLB" cache.
+Therefore when software page table changes occur, the kernel will
+invoke one of the following flush methods _after_ the page table
+changes occur:
+
+1) void flush_tlb_all(void)
+
+ The most severe flush of all. After this interface runs,
+ any previous page table modification whatsoever will be
+ visible to the cpu.
+
+ This is usually invoked when the kernel page tables are
+ changed, since such translations are "global" in nature.
+
+2) void flush_tlb_mm(struct mm_struct *mm)
+
+ This interface flushes an entire user address space from
+ the TLB. After running, this interface must make sure that
+ any previous page table modifications for the address space
+ 'mm' will be visible to the cpu. That is, after running,
+ there will be no entries in the TLB for 'mm'.
+
+ This interface is used to handle whole address space
+ page table operations such as what happens during
+ fork, and exec.
+
+3) void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+
+ Here we are flushing a specific range of (user) virtual
+ address translations from the TLB. After running, this
+ interface must make sure that any previous page table
+ modifications for the address space 'vma->vm_mm' in the range
+ 'start' to 'end-1' will be visible to the cpu. That is, after
+ running, here will be no entries in the TLB for 'mm' for
+ virtual addresses in the range 'start' to 'end-1'.
+
+ The "vma" is the backing store being used for the region.
+ Primarily, this is used for munmap() type operations.
+
+ The interface is provided in hopes that the port can find
+ a suitably efficient method for removing multiple page
+ sized translations from the TLB, instead of having the kernel
+ call flush_tlb_page (see below) for each entry which may be
+ modified.
+
+4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
+
+ This time we need to remove the PAGE_SIZE sized translation
+ from the TLB. The 'vma' is the backing structure used by
+ Linux to keep track of mmap'd regions for a process, the
+ address space is available via vma->vm_mm. Also, one may
+ test (vma->vm_flags & VM_EXEC) to see if this region is
+ executable (and thus could be in the 'instruction TLB' in
+ split-tlb type setups).
+
+ After running, this interface must make sure that any previous
+ page table modification for address space 'vma->vm_mm' for
+ user virtual address 'addr' will be visible to the cpu. That
+ is, after running, there will be no entries in the TLB for
+ 'vma->vm_mm' for virtual address 'addr'.
+
+ This is used primarily during fault processing.
+
+5) void update_mmu_cache(struct vm_area_struct *vma,
+ unsigned long address, pte_t pte)
+
+ At the end of every page fault, this routine is invoked to
+ tell the architecture specific code that a translation
+ described by "pte" now exists at virtual address "address"
+ for address space "vma->vm_mm", in the software page tables.
+
+ A port may use this information in any way it so chooses.
+ For example, it could use this event to pre-load TLB
+ translations for software managed TLB configurations.
+ The sparc64 port currently does this.
+
+6) void tlb_migrate_finish(struct mm_struct *mm)
+
+ This interface is called at the end of an explicit
+ process migration. This interface provides a hook
+ to allow a platform to update TLB or context-specific
+ information for the address space.
+
+ The ia64 sn2 platform is one example of a platform
+ that uses this interface.
+
+Next, we have the cache flushing interfaces. In general, when Linux
+is changing an existing virtual-->physical mapping to a new value,
+the sequence will be in one of the following forms:
+
+ 1) flush_cache_mm(mm);
+ change_all_page_tables_of(mm);
+ flush_tlb_mm(mm);
+
+ 2) flush_cache_range(vma, start, end);
+ change_range_of_page_tables(mm, start, end);
+ flush_tlb_range(vma, start, end);
+
+ 3) flush_cache_page(vma, addr, pfn);
+ set_pte(pte_pointer, new_pte_val);
+ flush_tlb_page(vma, addr);
+
+The cache level flush will always be first, because this allows
+us to properly handle systems whose caches are strict and require
+a virtual-->physical translation to exist for a virtual address
+when that virtual address is flushed from the cache. The HyperSparc
+cpu is one such cpu with this attribute.
+
+The cache flushing routines below need only deal with cache flushing
+to the extent that it is necessary for a particular cpu. Mostly,
+these routines must be implemented for cpus which have virtually
+indexed caches which must be flushed when virtual-->physical
+translations are changed or removed. So, for example, the physically
+indexed physically tagged caches of IA32 processors have no need to
+implement these interfaces since the caches are fully synchronized
+and have no dependency on translation information.
+
+Here are the routines, one by one:
+
+1) void flush_cache_mm(struct mm_struct *mm)
+
+ This interface flushes an entire user address space from
+ the caches. That is, after running, there will be no cache
+ lines associated with 'mm'.
+
+ This interface is used to handle whole address space
+ page table operations such as what happens during exit and exec.
+
+2) void flush_cache_dup_mm(struct mm_struct *mm)
+
+ This interface flushes an entire user address space from
+ the caches. That is, after running, there will be no cache
+ lines associated with 'mm'.
+
+ This interface is used to handle whole address space
+ page table operations such as what happens during fork.
+
+ This option is separate from flush_cache_mm to allow some
+ optimizations for VIPT caches.
+
+3) void flush_cache_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+
+ Here we are flushing a specific range of (user) virtual
+ addresses from the cache. After running, there will be no
+ entries in the cache for 'vma->vm_mm' for virtual addresses in
+ the range 'start' to 'end-1'.
+
+ The "vma" is the backing store being used for the region.
+ Primarily, this is used for munmap() type operations.
+
+ The interface is provided in hopes that the port can find
+ a suitably efficient method for removing multiple page
+ sized regions from the cache, instead of having the kernel
+ call flush_cache_page (see below) for each entry which may be
+ modified.
+
+4) void flush_cache_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn)
+
+ This time we need to remove a PAGE_SIZE sized range
+ from the cache. The 'vma' is the backing structure used by
+ Linux to keep track of mmap'd regions for a process, the
+ address space is available via vma->vm_mm. Also, one may
+ test (vma->vm_flags & VM_EXEC) to see if this region is
+ executable (and thus could be in the 'instruction cache' in
+ "Harvard" type cache layouts).
+
+ The 'pfn' indicates the physical page frame (shift this value
+ left by PAGE_SHIFT to get the physical address) that 'addr'
+ translates to. It is this mapping which should be removed from
+ the cache.
+
+ After running, there will be no entries in the cache for
+ 'vma->vm_mm' for virtual address 'addr' which translates
+ to 'pfn'.
+
+ This is used primarily during fault processing.
+
+5) void flush_cache_kmaps(void)
+
+ This routine need only be implemented if the platform utilizes
+ highmem. It will be called right before all of the kmaps
+ are invalidated.
+
+ After running, there will be no entries in the cache for
+ the kernel virtual address range PKMAP_ADDR(0) to
+ PKMAP_ADDR(LAST_PKMAP).
+
+ This routing should be implemented in asm/highmem.h
+
+6) void flush_cache_vmap(unsigned long start, unsigned long end)
+ void flush_cache_vunmap(unsigned long start, unsigned long end)
+
+ Here in these two interfaces we are flushing a specific range
+ of (kernel) virtual addresses from the cache. After running,
+ there will be no entries in the cache for the kernel address
+ space for virtual addresses in the range 'start' to 'end-1'.
+
+ The first of these two routines is invoked after map_vm_area()
+ has installed the page table entries. The second is invoked
+ before unmap_kernel_range() deletes the page table entries.
+
+There exists another whole class of cpu cache issues which currently
+require a whole different set of interfaces to handle properly.
+The biggest problem is that of virtual aliasing in the data cache
+of a processor.
+
+Is your port susceptible to virtual aliasing in it's D-cache?
+Well, if your D-cache is virtually indexed, is larger in size than
+PAGE_SIZE, and does not prevent multiple cache lines for the same
+physical address from existing at once, you have this problem.
+
+If your D-cache has this problem, first define asm/shmparam.h SHMLBA
+properly, it should essentially be the size of your virtually
+addressed D-cache (or if the size is variable, the largest possible
+size). This setting will force the SYSv IPC layer to only allow user
+processes to mmap shared memory at address which are a multiple of
+this value.
+
+NOTE: This does not fix shared mmaps, check out the sparc64 port for
+one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
+
+Next, you have to solve the D-cache aliasing issue for all
+other cases. Please keep in mind that fact that, for a given page
+mapped into some user address space, there is always at least one more
+mapping, that of the kernel in it's linear mapping starting at
+PAGE_OFFSET. So immediately, once the first user maps a given
+physical page into its address space, by implication the D-cache
+aliasing problem has the potential to exist since the kernel already
+maps this page at its virtual address.
+
+ void copy_user_page(void *to, void *from, unsigned long addr, struct page *page)
+ void clear_user_page(void *to, unsigned long addr, struct page *page)
+
+ These two routines store data in user anonymous or COW
+ pages. It allows a port to efficiently avoid D-cache alias
+ issues between userspace and the kernel.
+
+ For example, a port may temporarily map 'from' and 'to' to
+ kernel virtual addresses during the copy. The virtual address
+ for these two pages is chosen in such a way that the kernel
+ load/store instructions happen to virtual addresses which are
+ of the same "color" as the user mapping of the page. Sparc64
+ for example, uses this technique.
+
+ The 'addr' parameter tells the virtual address where the
+ user will ultimately have this page mapped, and the 'page'
+ parameter gives a pointer to the struct page of the target.
+
+ If D-cache aliasing is not an issue, these two routines may
+ simply call memcpy/memset directly and do nothing more.
+
+ void flush_dcache_page(struct page *page)
+
+ Any time the kernel writes to a page cache page, _OR_
+ the kernel is about to read from a page cache page and
+ user space shared/writable mappings of this page potentially
+ exist, this routine is called.
+
+ NOTE: This routine need only be called for page cache pages
+ which can potentially ever be mapped into the address
+ space of a user process. So for example, VFS layer code
+ handling vfs symlinks in the page cache need not call
+ this interface at all.
+
+ The phrase "kernel writes to a page cache page" means,
+ specifically, that the kernel executes store instructions
+ that dirty data in that page at the page->virtual mapping
+ of that page. It is important to flush here to handle
+ D-cache aliasing, to make sure these kernel stores are
+ visible to user space mappings of that page.
+
+ The corollary case is just as important, if there are users
+ which have shared+writable mappings of this file, we must make
+ sure that kernel reads of these pages will see the most recent
+ stores done by the user.
+
+ If D-cache aliasing is not an issue, this routine may
+ simply be defined as a nop on that architecture.
+
+ There is a bit set aside in page->flags (PG_arch_1) as
+ "architecture private". The kernel guarantees that,
+ for pagecache pages, it will clear this bit when such
+ a page first enters the pagecache.
+
+ This allows these interfaces to be implemented much more
+ efficiently. It allows one to "defer" (perhaps indefinitely)
+ the actual flush if there are currently no user processes
+ mapping this page. See sparc64's flush_dcache_page and
+ update_mmu_cache implementations for an example of how to go
+ about doing this.
+
+ The idea is, first at flush_dcache_page() time, if
+ page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear
+ an empty list, just mark the architecture private page flag bit.
+ Later, in update_mmu_cache(), a check is made of this flag bit,
+ and if set the flush is done and the flag bit is cleared.
+
+ IMPORTANT NOTE: It is often important, if you defer the flush,
+ that the actual flush occurs on the same CPU
+ as did the cpu stores into the page to make it
+ dirty. Again, see sparc64 for examples of how
+ to deal with this.
+
+ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
+ unsigned long user_vaddr,
+ void *dst, void *src, int len)
+ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
+ unsigned long user_vaddr,
+ void *dst, void *src, int len)
+ When the kernel needs to copy arbitrary data in and out
+ of arbitrary user pages (f.e. for ptrace()) it will use
+ these two routines.
+
+ Any necessary cache flushing or other coherency operations
+ that need to occur should happen here. If the processor's
+ instruction cache does not snoop cpu stores, it is very
+ likely that you will need to flush the instruction cache
+ for copy_to_user_page().
+
+ void flush_anon_page(struct vm_area_struct *vma, struct page *page,
+ unsigned long vmaddr)
+ When the kernel needs to access the contents of an anonymous
+ page, it calls this function (currently only
+ get_user_pages()). Note: flush_dcache_page() deliberately
+ doesn't work for an anonymous page. The default
+ implementation is a nop (and should remain so for all coherent
+ architectures). For incoherent architectures, it should flush
+ the cache of the page at vmaddr.
+
+ void flush_kernel_dcache_page(struct page *page)
+ When the kernel needs to modify a user page is has obtained
+ with kmap, it calls this function after all modifications are
+ complete (but before kunmapping it) to bring the underlying
+ page up to date. It is assumed here that the user has no
+ incoherent cached copies (i.e. the original page was obtained
+ from a mechanism like get_user_pages()). The default
+ implementation is a nop and should remain so on all coherent
+ architectures. On incoherent architectures, this should flush
+ the kernel cache for page (using page_address(page)).
+
+
+ void flush_icache_range(unsigned long start, unsigned long end)
+ When the kernel stores into addresses that it will execute
+ out of (eg when loading modules), this function is called.
+
+ If the icache does not snoop stores then this routine will need
+ to flush it.
+
+ void flush_icache_page(struct vm_area_struct *vma, struct page *page)
+ All the functionality of flush_icache_page can be implemented in
+ flush_dcache_page and update_mmu_cache. In 2.7 the hope is to
+ remove this interface completely.
diff --git a/Documentation/cdrom/00-INDEX b/Documentation/cdrom/00-INDEX
new file mode 100644
index 0000000..433edf2
--- /dev/null
+++ b/Documentation/cdrom/00-INDEX
@@ -0,0 +1,11 @@
+00-INDEX
+ - this file (info on CD-ROMs and Linux)
+Makefile
+ - only used to generate TeX output from the documentation.
+cdrom-standard.tex
+ - LaTeX document on standardizing the CD-ROM programming interface.
+ide-cd
+ - info on setting up and using ATAPI (aka IDE) CD-ROMs.
+packet-writing.txt
+ - Info on the CDRW packet writing module
+
diff --git a/Documentation/cdrom/Makefile b/Documentation/cdrom/Makefile
new file mode 100644
index 0000000..a19e321
--- /dev/null
+++ b/Documentation/cdrom/Makefile
@@ -0,0 +1,21 @@
+LATEXFILE = cdrom-standard
+
+all:
+ make clean
+ latex $(LATEXFILE)
+ latex $(LATEXFILE)
+ @if [ -x `which gv` ]; then \
+ `dvips -q -t letter -o $(LATEXFILE).ps $(LATEXFILE).dvi` ;\
+ `gv -antialias -media letter -nocenter $(LATEXFILE).ps` ;\
+ else \
+ `xdvi $(LATEXFILE).dvi &` ;\
+ fi
+ make sortofclean
+
+clean:
+ rm -f $(LATEXFILE).ps $(LATEXFILE).dvi $(LATEXFILE).aux $(LATEXFILE).log
+
+sortofclean:
+ rm -f $(LATEXFILE).aux $(LATEXFILE).log
+
+
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex
new file mode 100644
index 0000000..c06233f
--- /dev/null
+++ b/Documentation/cdrom/cdrom-standard.tex
@@ -0,0 +1,1022 @@
+\documentclass{article}
+\def\version{$Id: cdrom-standard.tex,v 1.9 1997/12/28 15:42:49 david Exp $}
+\newcommand{\newsection}[1]{\newpage\section{#1}}
+
+\evensidemargin=0pt
+\oddsidemargin=0pt
+\topmargin=-\headheight \advance\topmargin by -\headsep
+\textwidth=15.99cm \textheight=24.62cm % normal A4, 1'' margin
+
+\def\linux{{\sc Linux}}
+\def\cdrom{{\sc cd-rom}}
+\def\UCD{{\sc Uniform cd-rom Driver}}
+\def\cdromc{{\tt {cdrom.c}}}
+\def\cdromh{{\tt {cdrom.h}}}
+\def\fo{\sl} % foreign words
+\def\ie{{\fo i.e.}}
+\def\eg{{\fo e.g.}}
+
+\everymath{\it} \everydisplay{\it}
+\catcode `\_=\active \def_{\_\penalty100 }
+\catcode`\<=\active \def<#1>{{\langle\hbox{\rm#1}\rangle}}
+
+\begin{document}
+\title{A \linux\ \cdrom\ standard}
+\author{David van Leeuwen\\{\normalsize\tt david@ElseWare.cistron.nl}
+\\{\footnotesize updated by Erik Andersen {\tt(andersee@debian.org)}}
+\\{\footnotesize updated by Jens Axboe {\tt(axboe@image.dk)}}}
+\date{12 March 1999}
+
+\maketitle
+
+\newsection{Introduction}
+
+\linux\ is probably the Unix-like operating system that supports
+the widest variety of hardware devices. The reasons for this are
+presumably
+\begin{itemize}
+\item
+ The large list of hardware devices available for the many platforms
+ that \linux\ now supports (\ie, i386-PCs, Sparc Suns, etc.)
+\item
+ The open design of the operating system, such that anybody can write a
+ driver for \linux.
+\item
+ There is plenty of source code around as examples of how to write a driver.
+\end{itemize}
+The openness of \linux, and the many different types of available
+hardware has allowed \linux\ to support many different hardware devices.
+Unfortunately, the very openness that has allowed \linux\ to support
+all these different devices has also allowed the behavior of each
+device driver to differ significantly from one device to another.
+This divergence of behavior has been very significant for \cdrom\
+devices; the way a particular drive reacts to a `standard' $ioctl()$
+call varies greatly from one device driver to another. To avoid making
+their drivers totally inconsistent, the writers of \linux\ \cdrom\
+drivers generally created new device drivers by understanding, copying,
+and then changing an existing one. Unfortunately, this practice did not
+maintain uniform behavior across all the \linux\ \cdrom\ drivers.
+
+This document describes an effort to establish Uniform behavior across
+all the different \cdrom\ device drivers for \linux. This document also
+defines the various $ioctl$s, and how the low-level \cdrom\ device
+drivers should implement them. Currently (as of the \linux\ 2.1.$x$
+development kernels) several low-level \cdrom\ device drivers, including
+both IDE/ATAPI and SCSI, now use this Uniform interface.
+
+When the \cdrom\ was developed, the interface between the \cdrom\ drive
+and the computer was not specified in the standards. As a result, many
+different \cdrom\ interfaces were developed. Some of them had their
+own proprietary design (Sony, Mitsumi, Panasonic, Philips), other
+manufacturers adopted an existing electrical interface and changed
+the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply
+adapted their drives to one or more of the already existing electrical
+interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and
+most of the `NoName' manufacturers). In cases where a new drive really
+brought its own interface or used its own command set and flow control
+scheme, either a separate driver had to be written, or an existing
+driver had to be enhanced. History has delivered us \cdrom\ support for
+many of these different interfaces. Nowadays, almost all new \cdrom\
+drives are either IDE/ATAPI or SCSI, and it is very unlikely that any
+manufacturer will create a new interface. Even finding drives for the
+old proprietary interfaces is getting difficult.
+
+When (in the 1.3.70's) I looked at the existing software interface,
+which was expressed through \cdromh, it appeared to be a rather wild
+set of commands and data formats.\footnote{I cannot recollect what
+kernel version I looked at, then, presumably 1.2.13 and 1.3.34---the
+latest kernel that I was indirectly involved in.} It seemed that many
+features of the software interface had been added to accommodate the
+capabilities of a particular drive, in an {\fo ad hoc\/} manner. More
+importantly, it appeared that the behavior of the `standard' commands
+was different for most of the different drivers: \eg, some drivers
+close the tray if an $open()$ call occurs when the tray is open, while
+others do not. Some drivers lock the door upon opening the device, to
+prevent an incoherent file system, but others don't, to allow software
+ejection. Undoubtedly, the capabilities of the different drives vary,
+but even when two drives have the same capability their drivers'
+behavior was usually different.
+
+I decided to start a discussion on how to make all the \linux\ \cdrom\
+drivers behave more uniformly. I began by contacting the developers of
+the many \cdrom\ drivers found in the \linux\ kernel. Their reactions
+encouraged me to write the \UCD\ which this document is intended to
+describe. The implementation of the \UCD\ is in the file \cdromc. This
+driver is intended to be an additional software layer that sits on top
+of the low-level device drivers for each \cdrom\ drive. By adding this
+additional layer, it is possible to have all the different \cdrom\
+devices behave {\em exactly\/} the same (insofar as the underlying
+hardware will allow).
+
+The goal of the \UCD\ is {\em not\/} to alienate driver developers who
+have not yet taken steps to support this effort. The goal of \UCD\ is
+simply to give people writing application programs for \cdrom\ drives
+{\em one\/} \linux\ \cdrom\ interface with consistent behavior for all
+\cdrom\ devices. In addition, this also provides a consistent interface
+between the low-level device driver code and the \linux\ kernel. Care
+is taken that 100\,\% compatibility exists with the data structures and
+programmer's interface defined in \cdromh. This guide was written to
+help \cdrom\ driver developers adapt their code to use the \UCD\ code
+defined in \cdromc.
+
+Personally, I think that the most important hardware interfaces are
+the IDE/ATAPI drives and, of course, the SCSI drives, but as prices
+of hardware drop continuously, it is also likely that people may have
+more than one \cdrom\ drive, possibly of mixed types. It is important
+that these drives behave in the same way. In December 1994, one of the
+cheapest \cdrom\ drives was a Philips cm206, a double-speed proprietary
+drive. In the months that I was busy writing a \linux\ driver for it,
+proprietary drives became obsolete and IDE/ATAPI drives became the
+standard. At the time of the last update to this document (November
+1997) it is becoming difficult to even {\em find} anything less than a
+16 speed \cdrom\ drive, and 24 speed drives are common.
+
+\newsection{Standardizing through another software level}
+\label{cdrom.c}
+
+At the time this document was conceived, all drivers directly
+implemented the \cdrom\ $ioctl()$ calls through their own routines. This
+led to the danger of different drivers forgetting to do important things
+like checking that the user was giving the driver valid data. More
+importantly, this led to the divergence of behavior, which has already
+been discussed.
+
+For this reason, the \UCD\ was created to enforce consistent \cdrom\
+drive behavior, and to provide a common set of services to the various
+low-level \cdrom\ device drivers. The \UCD\ now provides another
+software-level, that separates the $ioctl()$ and $open()$ implementation
+from the actual hardware implementation. Note that this effort has
+made few changes which will affect a user's application programs. The
+greatest change involved moving the contents of the various low-level
+\cdrom\ drivers' header files to the kernel's cdrom directory. This was
+done to help ensure that the user is only presented with only one cdrom
+interface, the interface defined in \cdromh.
+
+\cdrom\ drives are specific enough (\ie, different from other
+block-devices such as floppy or hard disc drives), to define a set
+of common {\em \cdrom\ device operations}, $<cdrom-device>_dops$.
+These operations are different from the classical block-device file
+operations, $<block-device>_fops$.
+
+The routines for the \UCD\ interface level are implemented in the file
+\cdromc. In this file, the \UCD\ interfaces with the kernel as a block
+device by registering the following general $struct\ file_operations$:
+$$
+\halign{$#$\ \hfil&$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
+struct& file_operations\ cdrom_fops = \{\hidewidth\cr
+ &NULL, & lseek \cr
+ &block_read, & read---general block-dev read \cr
+ &block_write, & write---general block-dev write \cr
+ &NULL, & readdir \cr
+ &NULL, & select \cr
+ &cdrom_ioctl, & ioctl \cr
+ &NULL, & mmap \cr
+ &cdrom_open, & open \cr
+ &cdrom_release, & release \cr
+ &NULL, & fsync \cr
+ &NULL, & fasync \cr
+ &cdrom_media_changed, & media change \cr
+ &NULL & revalidate \cr
+\};\cr
+}
+$$
+
+Every active \cdrom\ device shares this $struct$. The routines
+declared above are all implemented in \cdromc, since this file is the
+place where the behavior of all \cdrom-devices is defined and
+standardized. The actual interface to the various types of \cdrom\
+hardware is still performed by various low-level \cdrom-device
+drivers. These routines simply implement certain {\em capabilities\/}
+that are common to all \cdrom\ (and really, all removable-media
+devices).
+
+Registration of a low-level \cdrom\ device driver is now done through
+the general routines in \cdromc, not through the Virtual File System
+(VFS) any more. The interface implemented in \cdromc\ is carried out
+through two general structures that contain information about the
+capabilities of the driver, and the specific drives on which the
+driver operates. The structures are:
+\begin{description}
+\item[$cdrom_device_ops$]
+ This structure contains information about the low-level driver for a
+ \cdrom\ device. This structure is conceptually connected to the major
+ number of the device (although some drivers may have different
+ major numbers, as is the case for the IDE driver).
+\item[$cdrom_device_info$]
+ This structure contains information about a particular \cdrom\ drive,
+ such as its device name, speed, etc. This structure is conceptually
+ connected to the minor number of the device.
+\end{description}
+
+Registering a particular \cdrom\ drive with the \UCD\ is done by the
+low-level device driver though a call to:
+$$register_cdrom(struct\ cdrom_device_info * <device>_info)
+$$
+The device information structure, $<device>_info$, contains all the
+information needed for the kernel to interface with the low-level
+\cdrom\ device driver. One of the most important entries in this
+structure is a pointer to the $cdrom_device_ops$ structure of the
+low-level driver.
+
+The device operations structure, $cdrom_device_ops$, contains a list
+of pointers to the functions which are implemented in the low-level
+device driver. When \cdromc\ accesses a \cdrom\ device, it does it
+through the functions in this structure. It is impossible to know all
+the capabilities of future \cdrom\ drives, so it is expected that this
+list may need to be expanded from time to time as new technologies are
+developed. For example, CD-R and CD-R/W drives are beginning to become
+popular, and support will soon need to be added for them. For now, the
+current $struct$ is:
+$$
+\halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}&
+ $/*$ \rm# $*/$\hfil\cr
+struct& cdrom_device_ops\ \{ \hidewidth\cr
+ &int& (* open)(struct\ cdrom_device_info *, int)\cr
+ &void& (* release)(struct\ cdrom_device_info *);\cr
+ &int& (* drive_status)(struct\ cdrom_device_info *, int);\cr
+ &int& (* media_changed)(struct\ cdrom_device_info *, int);\cr
+ &int& (* tray_move)(struct\ cdrom_device_info *, int);\cr
+ &int& (* lock_door)(struct\ cdrom_device_info *, int);\cr
+ &int& (* select_speed)(struct\ cdrom_device_info *, int);\cr
+ &int& (* select_disc)(struct\ cdrom_device_info *, int);\cr
+ &int& (* get_last_session) (struct\ cdrom_device_info *,
+ struct\ cdrom_multisession *{});\cr
+ &int& (* get_mcn)(struct\ cdrom_device_info *, struct\ cdrom_mcn *{});\cr
+ &int& (* reset)(struct\ cdrom_device_info *);\cr
+ &int& (* audio_ioctl)(struct\ cdrom_device_info *, unsigned\ int,
+ void *{});\cr
+ &int& (* dev_ioctl)(struct\ cdrom_device_info *, unsigned\ int,
+ unsigned\ long);\cr
+\noalign{\medskip}
+ &const\ int& capability;& capability flags \cr
+ &int& n_minors;& number of active minor devices \cr
+\};\cr
+}
+$$
+When a low-level device driver implements one of these capabilities,
+it should add a function pointer to this $struct$. When a particular
+function is not implemented, however, this $struct$ should contain a
+NULL instead. The $capability$ flags specify the capabilities of the
+\cdrom\ hardware and/or low-level \cdrom\ driver when a \cdrom\ drive
+is registered with the \UCD. The value $n_minors$ should be a positive
+value indicating the number of minor devices that are supported by
+the low-level device driver, normally~1. Although these two variables
+are `informative' rather than `operational,' they are included in
+$cdrom_device_ops$ because they describe the capability of the {\em
+driver\/} rather than the {\em drive}. Nomenclature has always been
+difficult in computer programming.
+
+Note that most functions have fewer parameters than their
+$blkdev_fops$ counterparts. This is because very little of the
+information in the structures $inode$ and $file$ is used. For most
+drivers, the main parameter is the $struct$ $cdrom_device_info$, from
+which the major and minor number can be extracted. (Most low-level
+\cdrom\ drivers don't even look at the major and minor number though,
+since many of them only support one device.) This will be available
+through $dev$ in $cdrom_device_info$ described below.
+
+The drive-specific, minor-like information that is registered with
+\cdromc, currently contains the following fields:
+$$
+\halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}&
+ $/*$ \rm# $*/$\hfil\cr
+struct& cdrom_device_info\ \{ \hidewidth\cr
+ & struct\ cdrom_device_ops *& ops;& device operations for this major\cr
+ & struct\ cdrom_device_info *& next;& next device_info for this major\cr
+ & void *& handle;& driver-dependent data\cr
+\noalign{\medskip}
+ & kdev_t& dev;& device number (incorporates minor)\cr
+ & int& mask;& mask of capability: disables them \cr
+ & int& speed;& maximum speed for reading data \cr
+ & int& capacity;& number of discs in a jukebox \cr
+\noalign{\medskip}
+ &int& options : 30;& options flags \cr
+ &unsigned& mc_flags : 2;& media-change buffer flags \cr
+ & int& use_count;& number of times device is opened\cr
+ & char& name[20];& name of the device type\cr
+\}\cr
+}$$
+Using this $struct$, a linked list of the registered minor devices is
+built, using the $next$ field. The device number, the device operations
+struct and specifications of properties of the drive are stored in this
+structure.
+
+The $mask$ flags can be used to mask out some of the capabilities listed
+in $ops\to capability$, if a specific drive doesn't support a feature
+of the driver. The value $speed$ specifies the maximum head-rate of the
+drive, measured in units of normal audio speed (176\,kB/sec raw data or
+150\,kB/sec file system data). The value $n_discs$ should reflect the
+number of discs the drive can hold simultaneously, if it is designed
+as a juke-box, or otherwise~1. The parameters are declared $const$
+because they describe properties of the drive, which don't change after
+registration.
+
+A few registers contain variables local to the \cdrom\ drive. The
+flags $options$ are used to specify how the general \cdrom\ routines
+should behave. These various flags registers should provide enough
+flexibility to adapt to the different users' wishes (and {\em not\/} the
+`arbitrary' wishes of the author of the low-level device driver, as is
+the case in the old scheme). The register $mc_flags$ is used to buffer
+the information from $media_changed()$ to two separate queues. Other
+data that is specific to a minor drive, can be accessed through $handle$,
+which can point to a data structure specific to the low-level driver.
+The fields $use_count$, $next$, $options$ and $mc_flags$ need not be
+initialized.
+
+The intermediate software layer that \cdromc\ forms will perform some
+additional bookkeeping. The use count of the device (the number of
+processes that have the device opened) is registered in $use_count$. The
+function $cdrom_ioctl()$ will verify the appropriate user-memory regions
+for read and write, and in case a location on the CD is transferred,
+it will `sanitize' the format by making requests to the low-level
+drivers in a standard format, and translating all formats between the
+user-software and low level drivers. This relieves much of the drivers'
+memory checking and format checking and translation. Also, the necessary
+structures will be declared on the program stack.
+
+The implementation of the functions should be as defined in the
+following sections. Two functions {\em must\/} be implemented, namely
+$open()$ and $release()$. Other functions may be omitted, their
+corresponding capability flags will be cleared upon registration.
+Generally, a function returns zero on success and negative on error. A
+function call should return only after the command has completed, but of
+course waiting for the device should not use processor time.
+
+\subsection{$Int\ open(struct\ cdrom_device_info * cdi, int\ purpose)$}
+
+$Open()$ should try to open the device for a specific $purpose$, which
+can be either:
+\begin{itemize}
+\item[0] Open for reading data, as done by {\tt {mount()}} (2), or the
+user commands {\tt {dd}} or {\tt {cat}}.
+\item[1] Open for $ioctl$ commands, as done by audio-CD playing
+programs.
+\end{itemize}
+Notice that any strategic code (closing tray upon $open()$, etc.)\ is
+done by the calling routine in \cdromc, so the low-level routine
+should only be concerned with proper initialization, such as spinning
+up the disc, etc. % and device-use count
+
+
+\subsection{$Void\ release(struct\ cdrom_device_info * cdi)$}
+
+
+Device-specific actions should be taken such as spinning down the device.
+However, strategic actions such as ejection of the tray, or unlocking
+the door, should be left over to the general routine $cdrom_release()$.
+This is the only function returning type $void$.
+
+\subsection{$Int\ drive_status(struct\ cdrom_device_info * cdi, int\ slot_nr)$}
+\label{drive status}
+
+The function $drive_status$, if implemented, should provide
+information on the status of the drive (not the status of the disc,
+which may or may not be in the drive). If the drive is not a changer,
+$slot_nr$ should be ignored. In \cdromh\ the possibilities are listed:
+$$
+\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
+CDS_NO_INFO& no information available\cr
+CDS_NO_DISC& no disc is inserted, tray is closed\cr
+CDS_TRAY_OPEN& tray is opened\cr
+CDS_DRIVE_NOT_READY& something is wrong, tray is moving?\cr
+CDS_DISC_OK& a disc is loaded and everything is fine\cr
+}
+$$
+
+\subsection{$Int\ media_changed(struct\ cdrom_device_info * cdi, int\ disc_nr)$}
+
+This function is very similar to the original function in $struct\
+file_operations$. It returns 1 if the medium of the device $cdi\to
+dev$ has changed since the last call, and 0 otherwise. The parameter
+$disc_nr$ identifies a specific slot in a juke-box, it should be
+ignored for single-disc drives. Note that by `re-routing' this
+function through $cdrom_media_changed()$, we can implement separate
+queues for the VFS and a new $ioctl()$ function that can report device
+changes to software (\eg, an auto-mounting daemon).
+
+\subsection{$Int\ tray_move(struct\ cdrom_device_info * cdi, int\ position)$}
+
+This function, if implemented, should control the tray movement. (No
+other function should control this.) The parameter $position$ controls
+the desired direction of movement:
+\begin{itemize}
+\item[0] Close tray
+\item[1] Open tray
+\end{itemize}
+This function returns 0 upon success, and a non-zero value upon
+error. Note that if the tray is already in the desired position, no
+action need be taken, and the return value should be 0.
+
+\subsection{$Int\ lock_door(struct\ cdrom_device_info * cdi, int\ lock)$}
+
+This function (and no other code) controls locking of the door, if the
+drive allows this. The value of $lock$ controls the desired locking
+state:
+\begin{itemize}
+\item[0] Unlock door, manual opening is allowed
+\item[1] Lock door, tray cannot be ejected manually
+\end{itemize}
+This function returns 0 upon success, and a non-zero value upon
+error. Note that if the door is already in the requested state, no
+action need be taken, and the return value should be 0.
+
+\subsection{$Int\ select_speed(struct\ cdrom_device_info * cdi, int\ speed)$}
+
+Some \cdrom\ drives are capable of changing their head-speed. There
+are several reasons for changing the speed of a \cdrom\ drive. Badly
+pressed \cdrom s may benefit from less-than-maximum head rate. Modern
+\cdrom\ drives can obtain very high head rates (up to $24\times$ is
+common). It has been reported that these drives can make reading
+errors at these high speeds, reducing the speed can prevent data loss
+in these circumstances. Finally, some of these drives can
+make an annoyingly loud noise, which a lower speed may reduce. %Finally,
+%although the audio-low-pass filters probably aren't designed for it,
+%more than real-time playback of audio might be used for high-speed
+%copying of audio tracks.
+
+This function specifies the speed at which data is read or audio is
+played back. The value of $speed$ specifies the head-speed of the
+drive, measured in units of standard cdrom speed (176\,kB/sec raw data
+or 150\,kB/sec file system data). So to request that a \cdrom\ drive
+operate at 300\,kB/sec you would call the CDROM_SELECT_SPEED $ioctl$
+with $speed=2$. The special value `0' means `auto-selection', \ie,
+maximum data-rate or real-time audio rate. If the drive doesn't have
+this `auto-selection' capability, the decision should be made on the
+current disc loaded and the return value should be positive. A negative
+return value indicates an error.
+
+\subsection{$Int\ select_disc(struct\ cdrom_device_info * cdi, int\ number)$}
+
+If the drive can store multiple discs (a juke-box) this function
+will perform disc selection. It should return the number of the
+selected disc on success, a negative value on error. Currently, only
+the ide-cd driver supports this functionality.
+
+\subsection{$Int\ get_last_session(struct\ cdrom_device_info * cdi, struct\
+ cdrom_multisession * ms_info)$}
+
+This function should implement the old corresponding $ioctl()$. For
+device $cdi\to dev$, the start of the last session of the current disc
+should be returned in the pointer argument $ms_info$. Note that
+routines in \cdromc\ have sanitized this argument: its requested
+format will {\em always\/} be of the type $CDROM_LBA$ (linear block
+addressing mode), whatever the calling software requested. But
+sanitization goes even further: the low-level implementation may
+return the requested information in $CDROM_MSF$ format if it wishes so
+(setting the $ms_info\rightarrow addr_format$ field appropriately, of
+course) and the routines in \cdromc\ will make the transformation if
+necessary. The return value is 0 upon success.
+
+\subsection{$Int\ get_mcn(struct\ cdrom_device_info * cdi, struct\
+ cdrom_mcn * mcn)$}
+
+Some discs carry a `Media Catalog Number' (MCN), also called
+`Universal Product Code' (UPC). This number should reflect the number
+that is generally found in the bar-code on the product. Unfortunately,
+the few discs that carry such a number on the disc don't even use the
+same format. The return argument to this function is a pointer to a
+pre-declared memory region of type $struct\ cdrom_mcn$. The MCN is
+expected as a 13-character string, terminated by a null-character.
+
+\subsection{$Int\ reset(struct\ cdrom_device_info * cdi)$}
+
+This call should perform a hard-reset on the drive (although in
+circumstances that a hard-reset is necessary, a drive may very well not
+listen to commands anymore). Preferably, control is returned to the
+caller only after the drive has finished resetting. If the drive is no
+longer listening, it may be wise for the underlying low-level cdrom
+driver to time out.
+
+\subsection{$Int\ audio_ioctl(struct\ cdrom_device_info * cdi, unsigned\
+ int\ cmd, void * arg)$}
+
+Some of the \cdrom-$ioctl$s defined in \cdromh\ can be
+implemented by the routines described above, and hence the function
+$cdrom_ioctl$ will use those. However, most $ioctl$s deal with
+audio-control. We have decided to leave these to be accessed through a
+single function, repeating the arguments $cmd$ and $arg$. Note that
+the latter is of type $void*{}$, rather than $unsigned\ long\
+int$. The routine $cdrom_ioctl()$ does do some useful things,
+though. It sanitizes the address format type to $CDROM_MSF$ (Minutes,
+Seconds, Frames) for all audio calls. It also verifies the memory
+location of $arg$, and reserves stack-memory for the argument. This
+makes implementation of the $audio_ioctl()$ much simpler than in the
+old driver scheme. For example, you may look up the function
+$cm206_audio_ioctl()$ in {\tt {cm206.c}} that should be updated with
+this documentation.
+
+An unimplemented ioctl should return $-ENOSYS$, but a harmless request
+(\eg, $CDROMSTART$) may be ignored by returning 0 (success). Other
+errors should be according to the standards, whatever they are. When
+an error is returned by the low-level driver, the \UCD\ tries whenever
+possible to return the error code to the calling program. (We may decide
+to sanitize the return value in $cdrom_ioctl()$ though, in order to
+guarantee a uniform interface to the audio-player software.)
+
+\subsection{$Int\ dev_ioctl(struct\ cdrom_device_info * cdi, unsigned\ int\
+ cmd, unsigned\ long\ arg)$}
+
+Some $ioctl$s seem to be specific to certain \cdrom\ drives. That is,
+they are introduced to service some capabilities of certain drives. In
+fact, there are 6 different $ioctl$s for reading data, either in some
+particular kind of format, or audio data. Not many drives support
+reading audio tracks as data, I believe this is because of protection
+of copyrights of artists. Moreover, I think that if audio-tracks are
+supported, it should be done through the VFS and not via $ioctl$s. A
+problem here could be the fact that audio-frames are 2352 bytes long,
+so either the audio-file-system should ask for 75264 bytes at once
+(the least common multiple of 512 and 2352), or the drivers should
+bend their backs to cope with this incoherence (to which I would be
+opposed). Furthermore, it is very difficult for the hardware to find
+the exact frame boundaries, since there are no synchronization headers
+in audio frames. Once these issues are resolved, this code should be
+standardized in \cdromc.
+
+Because there are so many $ioctl$s that seem to be introduced to
+satisfy certain drivers,\footnote{Is there software around that
+ actually uses these? I'd be interested!} any `non-standard' $ioctl$s
+are routed through the call $dev_ioctl()$. In principle, `private'
+$ioctl$s should be numbered after the device's major number, and not
+the general \cdrom\ $ioctl$ number, {\tt {0x53}}. Currently the
+non-supported $ioctl$s are: {\it CDROMREADMODE1, CDROMREADMODE2,
+ CDROMREADAUDIO, CDROMREADRAW, CDROMREADCOOKED, CDROMSEEK,
+ CDROMPLAY\-BLK and CDROM\-READALL}.
+
+
+\subsection{\cdrom\ capabilities}
+\label{capability}
+
+Instead of just implementing some $ioctl$ calls, the interface in
+\cdromc\ supplies the possibility to indicate the {\em capabilities\/}
+of a \cdrom\ drive. This can be done by ORing any number of
+capability-constants that are defined in \cdromh\ at the registration
+phase. Currently, the capabilities are any of:
+$$
+\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
+CDC_CLOSE_TRAY& can close tray by software control\cr
+CDC_OPEN_TRAY& can open tray\cr
+CDC_LOCK& can lock and unlock the door\cr
+CDC_SELECT_SPEED& can select speed, in units of $\sim$150\,kB/s\cr
+CDC_SELECT_DISC& drive is juke-box\cr
+CDC_MULTI_SESSION& can read sessions $>\rm1$\cr
+CDC_MCN& can read Media Catalog Number\cr
+CDC_MEDIA_CHANGED& can report if disc has changed\cr
+CDC_PLAY_AUDIO& can perform audio-functions (play, pause, etc)\cr
+CDC_RESET& hard reset device\cr
+CDC_IOCTLS& driver has non-standard ioctls\cr
+CDC_DRIVE_STATUS& driver implements drive status\cr
+}
+$$
+The capability flag is declared $const$, to prevent drivers from
+accidentally tampering with the contents. The capability fags actually
+inform \cdromc\ of what the driver can do. If the drive found
+by the driver does not have the capability, is can be masked out by
+the $cdrom_device_info$ variable $mask$. For instance, the SCSI \cdrom\
+driver has implemented the code for loading and ejecting \cdrom's, and
+hence its corresponding flags in $capability$ will be set. But a SCSI
+\cdrom\ drive might be a caddy system, which can't load the tray, and
+hence for this drive the $cdrom_device_info$ struct will have set
+the $CDC_CLOSE_TRAY$ bit in $mask$.
+
+In the file \cdromc\ you will encounter many constructions of the type
+$$\it
+if\ (cdo\rightarrow capability \mathrel\& \mathord{\sim} cdi\rightarrow mask
+ \mathrel{\&} CDC_<capability>) \ldots
+$$
+There is no $ioctl$ to set the mask\dots The reason is that
+I think it is better to control the {\em behavior\/} rather than the
+{\em capabilities}.
+
+\subsection{Options}
+
+A final flag register controls the {\em behavior\/} of the \cdrom\
+drives, in order to satisfy different users' wishes, hopefully
+independently of the ideas of the respective author who happened to
+have made the drive's support available to the \linux\ community. The
+current behavior options are:
+$$
+\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
+CDO_AUTO_CLOSE& try to close tray upon device $open()$\cr
+CDO_AUTO_EJECT& try to open tray on last device $close()$\cr
+CDO_USE_FFLAGS& use $file_pointer\rightarrow f_flags$ to indicate
+ purpose for $open()$\cr
+CDO_LOCK& try to lock door if device is opened\cr
+CDO_CHECK_TYPE& ensure disc type is data if opened for data\cr
+}
+$$
+
+The initial value of this register is $CDO_AUTO_CLOSE \mathrel|
+CDO_USE_FFLAGS \mathrel| CDO_LOCK$, reflecting my own view on user
+interface and software standards. Before you protest, there are two
+new $ioctl$s implemented in \cdromc, that allow you to control the
+behavior by software. These are:
+$$
+\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
+CDROM_SET_OPTIONS& set options specified in $(int)\ arg$\cr
+CDROM_CLEAR_OPTIONS& clear options specified in $(int)\ arg$\cr
+}
+$$
+One option needs some more explanation: $CDO_USE_FFLAGS$. In the next
+newsection we explain what the need for this option is.
+
+A software package {\tt setcd}, available from the Debian distribution
+and {\tt sunsite.unc.edu}, allows user level control of these flags.
+
+\newsection{The need to know the purpose of opening the \cdrom\ device}
+
+Traditionally, Unix devices can be used in two different `modes',
+either by reading/writing to the device file, or by issuing
+controlling commands to the device, by the device's $ioctl()$
+call. The problem with \cdrom\ drives, is that they can be used for
+two entirely different purposes. One is to mount removable
+file systems, \cdrom s, the other is to play audio CD's. Audio commands
+are implemented entirely through $ioctl$s, presumably because the
+first implementation (SUN?) has been such. In principle there is
+nothing wrong with this, but a good control of the `CD player' demands
+that the device can {\em always\/} be opened in order to give the
+$ioctl$ commands, regardless of the state the drive is in.
+
+On the other hand, when used as a removable-media disc drive (what the
+original purpose of \cdrom s is) we would like to make sure that the
+disc drive is ready for operation upon opening the device. In the old
+scheme, some \cdrom\ drivers don't do any integrity checking, resulting
+in a number of i/o errors reported by the VFS to the kernel when an
+attempt for mounting a \cdrom\ on an empty drive occurs. This is not a
+particularly elegant way to find out that there is no \cdrom\ inserted;
+it more-or-less looks like the old IBM-PC trying to read an empty floppy
+drive for a couple of seconds, after which the system complains it
+can't read from it. Nowadays we can {\em sense\/} the existence of a
+removable medium in a drive, and we believe we should exploit that
+fact. An integrity check on opening of the device, that verifies the
+availability of a \cdrom\ and its correct type (data), would be
+desirable.
+
+These two ways of using a \cdrom\ drive, principally for data and
+secondarily for playing audio discs, have different demands for the
+behavior of the $open()$ call. Audio use simply wants to open the
+device in order to get a file handle which is needed for issuing
+$ioctl$ commands, while data use wants to open for correct and
+reliable data transfer. The only way user programs can indicate what
+their {\em purpose\/} of opening the device is, is through the $flags$
+parameter (see {\tt {open(2)}}). For \cdrom\ devices, these flags aren't
+implemented (some drivers implement checking for write-related flags,
+but this is not strictly necessary if the device file has correct
+permission flags). Most option flags simply don't make sense to
+\cdrom\ devices: $O_CREAT$, $O_NOCTTY$, $O_TRUNC$, $O_APPEND$, and
+$O_SYNC$ have no meaning to a \cdrom.
+
+We therefore propose to use the flag $O_NONBLOCK$ to indicate
+that the device is opened just for issuing $ioctl$
+commands. Strictly, the meaning of $O_NONBLOCK$ is that opening and
+subsequent calls to the device don't cause the calling process to
+wait. We could interpret this as ``don't wait until someone has
+inserted some valid data-\cdrom.'' Thus, our proposal of the
+implementation for the $open()$ call for \cdrom s is:
+\begin{itemize}
+\item If no other flags are set than $O_RDONLY$, the device is opened
+for data transfer, and the return value will be 0 only upon successful
+initialization of the transfer. The call may even induce some actions
+on the \cdrom, such as closing the tray.
+\item If the option flag $O_NONBLOCK$ is set, opening will always be
+successful, unless the whole device doesn't exist. The drive will take
+no actions whatsoever.
+\end{itemize}
+
+\subsection{And what about standards?}
+
+You might hesitate to accept this proposal as it comes from the
+\linux\ community, and not from some standardizing institute. What
+about SUN, SGI, HP and all those other Unix and hardware vendors?
+Well, these companies are in the lucky position that they generally
+control both the hardware and software of their supported products,
+and are large enough to set their own standard. They do not have to
+deal with a dozen or more different, competing hardware
+configurations.\footnote{Incidentally, I think that SUN's approach to
+mounting \cdrom s is very good in origin: under Solaris a
+volume-daemon automatically mounts a newly inserted \cdrom\ under {\tt
+{/cdrom/$<volume-name>$/}}. In my opinion they should have pushed this
+further and have {\em every\/} \cdrom\ on the local area network be
+mounted at the similar location, \ie, no matter in which particular
+machine you insert a \cdrom, it will always appear at the same
+position in the directory tree, on every system. When I wanted to
+implement such a user-program for \linux, I came across the
+differences in behavior of the various drivers, and the need for an
+$ioctl$ informing about media changes.}
+
+We believe that using $O_NONBLOCK$ to indicate that a device is being opened
+for $ioctl$ commands only can be easily introduced in the \linux\
+community. All the CD-player authors will have to be informed, we can
+even send in our own patches to the programs. The use of $O_NONBLOCK$
+has most likely no influence on the behavior of the CD-players on
+other operating systems than \linux. Finally, a user can always revert
+to old behavior by a call to $ioctl(file_descriptor, CDROM_CLEAR_OPTIONS,
+CDO_USE_FFLAGS)$.
+
+\subsection{The preferred strategy of $open()$}
+
+The routines in \cdromc\ are designed in such a way that run-time
+configuration of the behavior of \cdrom\ devices (of {\em any\/} type)
+can be carried out, by the $CDROM_SET/CLEAR_OPTIONS$ $ioctls$. Thus, various
+modes of operation can be set:
+\begin{description}
+\item[$CDO_AUTO_CLOSE \mathrel| CDO_USE_FFLAGS \mathrel| CDO_LOCK$] This
+is the default setting. (With $CDO_CHECK_TYPE$ it will be better, in the
+future.) If the device is not yet opened by any other process, and if
+the device is being opened for data ($O_NONBLOCK$ is not set) and the
+tray is found to be open, an attempt to close the tray is made. Then,
+it is verified that a disc is in the drive and, if $CDO_CHECK_TYPE$ is
+set, that it contains tracks of type `data mode 1.' Only if all tests
+are passed is the return value zero. The door is locked to prevent file
+system corruption. If the drive is opened for audio ($O_NONBLOCK$ is
+set), no actions are taken and a value of 0 will be returned.
+\item[$CDO_AUTO_CLOSE \mathrel| CDO_AUTO_EJECT \mathrel| CDO_LOCK$] This
+mimics the behavior of the current sbpcd-driver. The option flags are
+ignored, the tray is closed on the first open, if necessary. Similarly,
+the tray is opened on the last release, \ie, if a \cdrom\ is unmounted,
+it is automatically ejected, such that the user can replace it.
+\end{description}
+We hope that these option can convince everybody (both driver
+maintainers and user program developers) to adopt the new \cdrom\
+driver scheme and option flag interpretation.
+
+\newsection{Description of routines in \cdromc}
+
+Only a few routines in \cdromc\ are exported to the drivers. In this
+new section we will discuss these, as well as the functions that `take
+over' the \cdrom\ interface to the kernel. The header file belonging
+to \cdromc\ is called \cdromh. Formerly, some of the contents of this
+file were placed in the file {\tt {ucdrom.h}}, but this file has now been
+merged back into \cdromh.
+
+\subsection{$Struct\ file_operations\ cdrom_fops$}
+
+The contents of this structure were described in section~\ref{cdrom.c}.
+A pointer to this structure is assigned to the $fops$ field
+of the $struct gendisk$.
+
+\subsection{$Int\ register_cdrom( struct\ cdrom_device_info\ * cdi)$}
+
+This function is used in about the same way one registers $cdrom_fops$
+with the kernel, the device operations and information structures,
+as described in section~\ref{cdrom.c}, should be registered with the
+\UCD:
+$$
+register_cdrom(\&<device>_info));
+$$
+This function returns zero upon success, and non-zero upon
+failure. The structure $<device>_info$ should have a pointer to the
+driver's $<device>_dops$, as in
+$$
+\vbox{\halign{&$#$\hfil\cr
+struct\ &cdrom_device_info\ <device>_info = \{\cr
+& <device>_dops;\cr
+&\ldots\cr
+\}\cr
+}}$$
+Note that a driver must have one static structure, $<device>_dops$, while
+it may have as many structures $<device>_info$ as there are minor devices
+active. $Register_cdrom()$ builds a linked list from these.
+
+\subsection{$Void\ unregister_cdrom(struct\ cdrom_device_info * cdi)$}
+
+Unregistering device $cdi$ with minor number $MINOR(cdi\to dev)$ removes
+the minor device from the list. If it was the last registered minor for
+the low-level driver, this disconnects the registered device-operation
+routines from the \cdrom\ interface. This function returns zero upon
+success, and non-zero upon failure.
+
+\subsection{$Int\ cdrom_open(struct\ inode * ip, struct\ file * fp)$}
+
+This function is not called directly by the low-level drivers, it is
+listed in the standard $cdrom_fops$. If the VFS opens a file, this
+function becomes active. A strategy is implemented in this routine,
+taking care of all capabilities and options that are set in the
+$cdrom_device_ops$ connected to the device. Then, the program flow is
+transferred to the device_dependent $open()$ call.
+
+\subsection{$Void\ cdrom_release(struct\ inode *ip, struct\ file
+*fp)$}
+
+This function implements the reverse-logic of $cdrom_open()$, and then
+calls the device-dependent $release()$ routine. When the use-count has
+reached 0, the allocated buffers are flushed by calls to $sync_dev(dev)$
+and $invalidate_buffers(dev)$.
+
+
+\subsection{$Int\ cdrom_ioctl(struct\ inode *ip, struct\ file *fp,
+unsigned\ int\ cmd, unsigned\ long\ arg)$}
+\label{cdrom-ioctl}
+
+This function handles all the standard $ioctl$ requests for \cdrom\
+devices in a uniform way. The different calls fall into three
+categories: $ioctl$s that can be directly implemented by device
+operations, ones that are routed through the call $audio_ioctl()$, and
+the remaining ones, that are presumable device-dependent. Generally, a
+negative return value indicates an error.
+
+\subsubsection{Directly implemented $ioctl$s}
+\label{ioctl-direct}
+
+The following `old' \cdrom-$ioctl$s are implemented by directly
+calling device-operations in $cdrom_device_ops$, if implemented and
+not masked:
+\begin{description}
+\item[CDROMMULTISESSION] Requests the last session on a \cdrom.
+\item[CDROMEJECT] Open tray.
+\item[CDROMCLOSETRAY] Close tray.
+\item[CDROMEJECT_SW] If $arg\not=0$, set behavior to auto-close (close
+tray on first open) and auto-eject (eject on last release), otherwise
+set behavior to non-moving on $open()$ and $release()$ calls.
+\item[CDROM_GET_MCN] Get the Media Catalog Number from a CD.
+\end{description}
+
+\subsubsection{$Ioctl$s routed through $audio_ioctl()$}
+\label{ioctl-audio}
+
+The following set of $ioctl$s are all implemented through a call to
+the $cdrom_fops$ function $audio_ioctl()$. Memory checks and
+allocation are performed in $cdrom_ioctl()$, and also sanitization of
+address format ($CDROM_LBA$/$CDROM_MSF$) is done.
+\begin{description}
+\item[CDROMSUBCHNL] Get sub-channel data in argument $arg$ of type $struct\
+cdrom_subchnl *{}$.
+\item[CDROMREADTOCHDR] Read Table of Contents header, in $arg$ of type
+$struct\ cdrom_tochdr *{}$.
+\item[CDROMREADTOCENTRY] Read a Table of Contents entry in $arg$ and
+specified by $arg$ of type $struct\ cdrom_tocentry *{}$.
+\item[CDROMPLAYMSF] Play audio fragment specified in Minute, Second,
+Frame format, delimited by $arg$ of type $struct\ cdrom_msf *{}$.
+\item[CDROMPLAYTRKIND] Play audio fragment in track-index format
+delimited by $arg$ of type $struct\ \penalty-1000 cdrom_ti *{}$.
+\item[CDROMVOLCTRL] Set volume specified by $arg$ of type $struct\
+cdrom_volctrl *{}$.
+\item[CDROMVOLREAD] Read volume into by $arg$ of type $struct\
+cdrom_volctrl *{}$.
+\item[CDROMSTART] Spin up disc.
+\item[CDROMSTOP] Stop playback of audio fragment.
+\item[CDROMPAUSE] Pause playback of audio fragment.
+\item[CDROMRESUME] Resume playing.
+\end{description}
+
+\subsubsection{New $ioctl$s in \cdromc}
+
+The following $ioctl$s have been introduced to allow user programs to
+control the behavior of individual \cdrom\ devices. New $ioctl$
+commands can be identified by the underscores in their names.
+\begin{description}
+\item[CDROM_SET_OPTIONS] Set options specified by $arg$. Returns the
+option flag register after modification. Use $arg = \rm0$ for reading
+the current flags.
+\item[CDROM_CLEAR_OPTIONS] Clear options specified by $arg$. Returns
+ the option flag register after modification.
+\item[CDROM_SELECT_SPEED] Select head-rate speed of disc specified as
+ by $arg$ in units of standard cdrom speed (176\,kB/sec raw data or
+ 150\,kB/sec file system data). The value 0 means `auto-select', \ie,
+ play audio discs at real time and data discs at maximum speed. The value
+ $arg$ is checked against the maximum head rate of the drive found in the
+ $cdrom_dops$.
+\item[CDROM_SELECT_DISC] Select disc numbered $arg$ from a juke-box.
+ First disc is numbered 0. The number $arg$ is checked against the
+ maximum number of discs in the juke-box found in the $cdrom_dops$.
+\item[CDROM_MEDIA_CHANGED] Returns 1 if a disc has been changed since
+ the last call. Note that calls to $cdrom_media_changed$ by the VFS
+ are treated by an independent queue, so both mechanisms will detect
+ a media change once. For juke-boxes, an extra argument $arg$
+ specifies the slot for which the information is given. The special
+ value $CDSL_CURRENT$ requests that information about the currently
+ selected slot be returned.
+\item[CDROM_DRIVE_STATUS] Returns the status of the drive by a call to
+ $drive_status()$. Return values are defined in section~\ref{drive
+ status}. Note that this call doesn't return information on the
+ current playing activity of the drive; this can be polled through an
+ $ioctl$ call to $CDROMSUBCHNL$. For juke-boxes, an extra argument
+ $arg$ specifies the slot for which (possibly limited) information is
+ given. The special value $CDSL_CURRENT$ requests that information
+ about the currently selected slot be returned.
+\item[CDROM_DISC_STATUS] Returns the type of the disc currently in the
+ drive. It should be viewed as a complement to $CDROM_DRIVE_STATUS$.
+ This $ioctl$ can provide \emph {some} information about the current
+ disc that is inserted in the drive. This functionality used to be
+ implemented in the low level drivers, but is now carried out
+ entirely in \UCD.
+
+ The history of development of the CD's use as a carrier medium for
+ various digital information has lead to many different disc types.
+ This $ioctl$ is useful only in the case that CDs have \emph {only
+ one} type of data on them. While this is often the case, it is
+ also very common for CDs to have some tracks with data, and some
+ tracks with audio. Because this is an existing interface, rather
+ than fixing this interface by changing the assumptions it was made
+ under, thereby breaking all user applications that use this
+ function, the \UCD\ implements this $ioctl$ as follows: If the CD in
+ question has audio tracks on it, and it has absolutely no CD-I, XA,
+ or data tracks on it, it will be reported as $CDS_AUDIO$. If it has
+ both audio and data tracks, it will return $CDS_MIXED$. If there
+ are no audio tracks on the disc, and if the CD in question has any
+ CD-I tracks on it, it will be reported as $CDS_XA_2_2$. Failing
+ that, if the CD in question has any XA tracks on it, it will be
+ reported as $CDS_XA_2_1$. Finally, if the CD in question has any
+ data tracks on it, it will be reported as a data CD ($CDS_DATA_1$).
+
+ This $ioctl$ can return:
+ $$
+ \halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr
+ CDS_NO_INFO& no information available\cr
+ CDS_NO_DISC& no disc is inserted, or tray is opened\cr
+ CDS_AUDIO& Audio disc (2352 audio bytes/frame)\cr
+ CDS_DATA_1& data disc, mode 1 (2048 user bytes/frame)\cr
+ CDS_XA_2_1& mixed data (XA), mode 2, form 1 (2048 user bytes)\cr
+ CDS_XA_2_2& mixed data (XA), mode 2, form 1 (2324 user bytes)\cr
+ CDS_MIXED& mixed audio/data disc\cr
+ }
+ $$
+ For some information concerning frame layout of the various disc
+ types, see a recent version of \cdromh.
+
+\item[CDROM_CHANGER_NSLOTS] Returns the number of slots in a
+ juke-box.
+\item[CDROMRESET] Reset the drive.
+\item[CDROM_GET_CAPABILITY] Returns the $capability$ flags for the
+ drive. Refer to section \ref{capability} for more information on
+ these flags.
+\item[CDROM_LOCKDOOR] Locks the door of the drive. $arg == \rm0$
+ unlocks the door, any other value locks it.
+\item[CDROM_DEBUG] Turns on debugging info. Only root is allowed
+ to do this. Same semantics as CDROM_LOCKDOOR.
+\end{description}
+
+\subsubsection{Device dependent $ioctl$s}
+
+Finally, all other $ioctl$s are passed to the function $dev_ioctl()$,
+if implemented. No memory allocation or verification is carried out.
+
+\newsection{How to update your driver}
+
+\begin{enumerate}
+\item Make a backup of your current driver.
+\item Get hold of the files \cdromc\ and \cdromh, they should be in
+ the directory tree that came with this documentation.
+\item Make sure you include \cdromh.
+\item Change the 3rd argument of $register_blkdev$ from
+$\&<your-drive>_fops$ to $\&cdrom_fops$.
+\item Just after that line, add the following to register with the \UCD:
+ $$register_cdrom(\&<your-drive>_info);$$
+ Similarly, add a call to $unregister_cdrom()$ at the appropriate place.
+\item Copy an example of the device-operations $struct$ to your
+ source, \eg, from {\tt {cm206.c}} $cm206_dops$, and change all
+ entries to names corresponding to your driver, or names you just
+ happen to like. If your driver doesn't support a certain function,
+ make the entry $NULL$. At the entry $capability$ you should list all
+ capabilities your driver currently supports. If your driver
+ has a capability that is not listed, please send me a message.
+\item Copy the $cdrom_device_info$ declaration from the same example
+ driver, and modify the entries according to your needs. If your
+ driver dynamically determines the capabilities of the hardware, this
+ structure should also be declared dynamically.
+\item Implement all functions in your $<device>_dops$ structure,
+ according to prototypes listed in \cdromh, and specifications given
+ in section~\ref{cdrom.c}. Most likely you have already implemented
+ the code in a large part, and you will almost certainly need to adapt the
+ prototype and return values.
+\item Rename your $<device>_ioctl()$ function to $audio_ioctl$ and
+ change the prototype a little. Remove entries listed in the first
+ part in section~\ref{cdrom-ioctl}, if your code was OK, these are
+ just calls to the routines you adapted in the previous step.
+\item You may remove all remaining memory checking code in the
+ $audio_ioctl()$ function that deals with audio commands (these are
+ listed in the second part of section~\ref{cdrom-ioctl}). There is no
+ need for memory allocation either, so most $case$s in the $switch$
+ statement look similar to:
+ $$
+ case\ CDROMREADTOCENTRY\colon get_toc_entry\bigl((struct\
+ cdrom_tocentry *{})\ arg\bigr);
+ $$
+\item All remaining $ioctl$ cases must be moved to a separate
+ function, $<device>_ioctl$, the device-dependent $ioctl$s. Note that
+ memory checking and allocation must be kept in this code!
+\item Change the prototypes of $<device>_open()$ and
+ $<device>_release()$, and remove any strategic code (\ie, tray
+ movement, door locking, etc.).
+\item Try to recompile the drivers. We advise you to use modules, both
+ for {\tt {cdrom.o}} and your driver, as debugging is much easier this
+ way.
+\end{enumerate}
+
+\newsection{Thanks}
+
+Thanks to all the people involved. First, Erik Andersen, who has
+taken over the torch in maintaining \cdromc\ and integrating much
+\cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and
+Gerd Knorr, who were the first to implement this interface for SCSI
+and IDE-CD drivers and added many ideas for extension of the data
+structures relative to kernel~2.0. Further thanks to Heiko Ei{\sz}feldt,
+Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew
+Kroll, the \linux\ \cdrom\ device driver developers who were kind
+enough to give suggestions and criticisms during the writing. Finally
+of course, I want to thank Linus Torvalds for making this possible in
+the first place.
+
+\vfill
+$ \version\ $
+\eject
+\end{document}
diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd
new file mode 100644
index 0000000..2c558cd
--- /dev/null
+++ b/Documentation/cdrom/ide-cd
@@ -0,0 +1,573 @@
+IDE-CD driver documentation
+Originally by scott snyder <snyder@fnald0.fnal.gov> (19 May 1996)
+Carrying on the torch is: Erik Andersen <andersee@debian.org>
+New maintainers (19 Oct 1998): Jens Axboe <axboe@image.dk>
+
+1. Introduction
+---------------
+
+The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant
+CDROM drives which attach to an IDE interface. Note that some CDROM vendors
+(including Mitsumi, Sony, Creative, Aztech, and Goldstar) have made
+both ATAPI-compliant drives and drives which use a proprietary
+interface. If your drive uses one of those proprietary interfaces,
+this driver will not work with it (but one of the other CDROM drivers
+probably will). This driver will not work with `ATAPI' drives which
+attach to the parallel port. In addition, there is at least one drive
+(CyCDROM CR520ie) which attaches to the IDE port but is not ATAPI;
+this driver will not work with drives like that either (but see the
+aztcd driver).
+
+This driver provides the following features:
+
+ - Reading from data tracks, and mounting ISO 9660 filesystems.
+
+ - Playing audio tracks. Most of the CDROM player programs floating
+ around should work; I usually use Workman.
+
+ - Multisession support.
+
+ - On drives which support it, reading digital audio data directly
+ from audio tracks. The program cdda2wav can be used for this.
+ Note, however, that only some drives actually support this.
+
+ - There is now support for CDROM changers which comply with the
+ ATAPI 2.6 draft standard (such as the NEC CDR-251). This additional
+ functionality includes a function call to query which slot is the
+ currently selected slot, a function call to query which slots contain
+ CDs, etc. A sample program which demonstrates this functionality is
+ appended to the end of this file. The Sanyo 3-disc changer
+ (which does not conform to the standard) is also now supported.
+ Please note the driver refers to the first CD as slot # 0.
+
+
+2. Installation
+---------------
+
+0. The ide-cd relies on the ide disk driver. See
+ Documentation/ide/ide.txt for up-to-date information on the ide
+ driver.
+
+1. Make sure that the ide and ide-cd drivers are compiled into the
+ kernel you're using. When configuring the kernel, in the section
+ entitled "Floppy, IDE, and other block devices", say either `Y'
+ (which will compile the support directly into the kernel) or `M'
+ (to compile support as a module which can be loaded and unloaded)
+ to the options:
+
+ Enhanced IDE/MFM/RLL disk/cdrom/tape/floppy support
+ Include IDE/ATAPI CDROM support
+
+ and `no' to
+
+ Use old disk-only driver on primary interface
+
+ Depending on what type of IDE interface you have, you may need to
+ specify additional configuration options. See
+ Documentation/ide/ide.txt.
+
+2. You should also ensure that the iso9660 filesystem is either
+ compiled into the kernel or available as a loadable module. You
+ can see if a filesystem is known to the kernel by catting
+ /proc/filesystems.
+
+3. The CDROM drive should be connected to the host on an IDE
+ interface. Each interface on a system is defined by an I/O port
+ address and an IRQ number, the standard assignments being
+ 0x1f0 and 14 for the primary interface and 0x170 and 15 for the
+ secondary interface. Each interface can control up to two devices,
+ where each device can be a hard drive, a CDROM drive, a floppy drive,
+ or a tape drive. The two devices on an interface are called `master'
+ and `slave'; this is usually selectable via a jumper on the drive.
+
+ Linux names these devices as follows. The master and slave devices
+ on the primary IDE interface are called `hda' and `hdb',
+ respectively. The drives on the secondary interface are called
+ `hdc' and `hdd'. (Interfaces at other locations get other letters
+ in the third position; see Documentation/ide/ide.txt.)
+
+ If you want your CDROM drive to be found automatically by the
+ driver, you should make sure your IDE interface uses either the
+ primary or secondary addresses mentioned above. In addition, if
+ the CDROM drive is the only device on the IDE interface, it should
+ be jumpered as `master'. (If for some reason you cannot configure
+ your system in this manner, you can probably still use the driver.
+ You may have to pass extra configuration information to the kernel
+ when you boot, however. See Documentation/ide/ide.txt for more
+ information.)
+
+4. Boot the system. If the drive is recognized, you should see a
+ message which looks like
+
+ hdb: NEC CD-ROM DRIVE:260, ATAPI CDROM drive
+
+ If you do not see this, see section 5 below.
+
+5. You may want to create a symbolic link /dev/cdrom pointing to the
+ actual device. You can do this with the command
+
+ ln -s /dev/hdX /dev/cdrom
+
+ where X should be replaced by the letter indicating where your
+ drive is installed.
+
+6. You should be able to see any error messages from the driver with
+ the `dmesg' command.
+
+
+3. Basic usage
+--------------
+
+An ISO 9660 CDROM can be mounted by putting the disc in the drive and
+typing (as root)
+
+ mount -t iso9660 /dev/cdrom /mnt/cdrom
+
+where it is assumed that /dev/cdrom is a link pointing to the actual
+device (as described in step 5 of the last section) and /mnt/cdrom is
+an empty directory. You should now be able to see the contents of the
+CDROM under the /mnt/cdrom directory. If you want to eject the CDROM,
+you must first dismount it with a command like
+
+ umount /mnt/cdrom
+
+Note that audio CDs cannot be mounted.
+
+Some distributions set up /etc/fstab to always try to mount a CDROM
+filesystem on bootup. It is not required to mount the CDROM in this
+manner, though, and it may be a nuisance if you change CDROMs often.
+You should feel free to remove the cdrom line from /etc/fstab and
+mount CDROMs manually if that suits you better.
+
+Multisession and photocd discs should work with no special handling.
+The hpcdtoppm package (ftp.gwdg.de:/pub/linux/hpcdtoppm/) may be
+useful for reading photocds.
+
+To play an audio CD, you should first unmount and remove any data
+CDROM. Any of the CDROM player programs should then work (workman,
+workbone, cdplayer, etc.).
+
+On a few drives, you can read digital audio directly using a program
+such as cdda2wav. The only types of drive which I've heard support
+this are Sony and Toshiba drives. You will get errors if you try to
+use this function on a drive which does not support it.
+
+For supported changers, you can use the `cdchange' program (appended to
+the end of this file) to switch between changer slots. Note that the
+drive should be unmounted before attempting this. The program takes
+two arguments: the CDROM device, and the slot number to which you wish
+to change. If the slot number is -1, the drive is unloaded.
+
+
+4. Compilation options
+----------------------
+
+There are a few additional options which can be set when compiling the
+driver. Most people should not need to mess with any of these; they
+are listed here simply for completeness. A compilation option can be
+enabled by adding a line of the form `#define <option> 1' to the top
+of ide-cd.c. All these options are disabled by default.
+
+VERBOSE_IDE_CD_ERRORS
+ If this is set, ATAPI error codes will be translated into textual
+ descriptions. In addition, a dump is made of the command which
+ provoked the error. This is off by default to save the memory used
+ by the (somewhat long) table of error descriptions.
+
+STANDARD_ATAPI
+ If this is set, the code needed to deal with certain drives which do
+ not properly implement the ATAPI spec will be disabled. If you know
+ your drive implements ATAPI properly, you can turn this on to get a
+ slightly smaller kernel.
+
+NO_DOOR_LOCKING
+ If this is set, the driver will never attempt to lock the door of
+ the drive.
+
+CDROM_NBLOCKS_BUFFER
+ This sets the size of the buffer to be used for a CDROMREADAUDIO
+ ioctl. The default is 8.
+
+TEST
+ This currently enables an additional ioctl which enables a user-mode
+ program to execute an arbitrary packet command. See the source for
+ details. This should be left off unless you know what you're doing.
+
+
+5. Common problems
+------------------
+
+This section discusses some common problems encountered when trying to
+use the driver, and some possible solutions. Note that if you are
+experiencing problems, you should probably also review
+Documentation/ide/ide.txt for current information about the underlying
+IDE support code. Some of these items apply only to earlier versions
+of the driver, but are mentioned here for completeness.
+
+In most cases, you should probably check with `dmesg' for any errors
+from the driver.
+
+a. Drive is not detected during booting.
+
+ - Review the configuration instructions above and in
+ Documentation/ide/ide.txt, and check how your hardware is
+ configured.
+
+ - If your drive is the only device on an IDE interface, it should
+ be jumpered as master, if at all possible.
+
+ - If your IDE interface is not at the standard addresses of 0x170
+ or 0x1f0, you'll need to explicitly inform the driver using a
+ lilo option. See Documentation/ide/ide.txt. (This feature was
+ added around kernel version 1.3.30.)
+
+ - If the autoprobing is not finding your drive, you can tell the
+ driver to assume that one exists by using a lilo option of the
+ form `hdX=cdrom', where X is the drive letter corresponding to
+ where your drive is installed. Note that if you do this and you
+ see a boot message like
+
+ hdX: ATAPI cdrom (?)
+
+ this does _not_ mean that the driver has successfully detected
+ the drive; rather, it means that the driver has not detected a
+ drive, but is assuming there's one there anyway because you told
+ it so. If you actually try to do I/O to a drive defined at a
+ nonexistent or nonresponding I/O address, you'll probably get
+ errors with a status value of 0xff.
+
+ - Some IDE adapters require a nonstandard initialization sequence
+ before they'll function properly. (If this is the case, there
+ will often be a separate MS-DOS driver just for the controller.)
+ IDE interfaces on sound cards often fall into this category.
+
+ Support for some interfaces needing extra initialization is
+ provided in later 1.3.x kernels. You may need to turn on
+ additional kernel configuration options to get them to work;
+ see Documentation/ide/ide.txt.
+
+ Even if support is not available for your interface, you may be
+ able to get it to work with the following procedure. First boot
+ MS-DOS and load the appropriate drivers. Then warm-boot linux
+ (i.e., without powering off). If this works, it can be automated
+ by running loadlin from the MS-DOS autoexec.
+
+
+b. Timeout/IRQ errors.
+
+ - If you always get timeout errors, interrupts from the drive are
+ probably not making it to the host.
+
+ - IRQ problems may also be indicated by the message
+ `IRQ probe failed (<n>)' while booting. If <n> is zero, that
+ means that the system did not see an interrupt from the drive when
+ it was expecting one (on any feasible IRQ). If <n> is negative,
+ that means the system saw interrupts on multiple IRQ lines, when
+ it was expecting to receive just one from the CDROM drive.
+
+ - Double-check your hardware configuration to make sure that the IRQ
+ number of your IDE interface matches what the driver expects.
+ (The usual assignments are 14 for the primary (0x1f0) interface
+ and 15 for the secondary (0x170) interface.) Also be sure that
+ you don't have some other hardware which might be conflicting with
+ the IRQ you're using. Also check the BIOS setup for your system;
+ some have the ability to disable individual IRQ levels, and I've
+ had one report of a system which was shipped with IRQ 15 disabled
+ by default.
+
+ - Note that many MS-DOS CDROM drivers will still function even if
+ there are hardware problems with the interrupt setup; they
+ apparently don't use interrupts.
+
+ - If you own a Pioneer DR-A24X, you _will_ get nasty error messages
+ on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }"
+ The Pioneer DR-A24X CDROM drives are fairly popular these days.
+ Unfortunately, these drives seem to become very confused when we perform
+ the standard Linux ATA disk drive probe. If you own one of these drives,
+ you can bypass the ATA probing which confuses these CDROM drives, by
+ adding `append="hdX=noprobe hdX=cdrom"' to your lilo.conf file and running
+ lilo (again where X is the drive letter corresponding to where your drive
+ is installed.)
+
+c. System hangups.
+
+ - If the system locks up when you try to access the CDROM, the most
+ likely cause is that you have a buggy IDE adapter which doesn't
+ properly handle simultaneous transactions on multiple interfaces.
+ The most notorious of these is the CMD640B chip. This problem can
+ be worked around by specifying the `serialize' option when
+ booting. Recent kernels should be able to detect the need for
+ this automatically in most cases, but the detection is not
+ foolproof. See Documentation/ide/ide.txt for more information
+ about the `serialize' option and the CMD640B.
+
+ - Note that many MS-DOS CDROM drivers will work with such buggy
+ hardware, apparently because they never attempt to overlap CDROM
+ operations with other disk activity.
+
+
+d. Can't mount a CDROM.
+
+ - If you get errors from mount, it may help to check `dmesg' to see
+ if there are any more specific errors from the driver or from the
+ filesystem.
+
+ - Make sure there's a CDROM loaded in the drive, and that's it's an
+ ISO 9660 disc. You can't mount an audio CD.
+
+ - With the CDROM in the drive and unmounted, try something like
+
+ cat /dev/cdrom | od | more
+
+ If you see a dump, then the drive and driver are probably working
+ OK, and the problem is at the filesystem level (i.e., the CDROM is
+ not ISO 9660 or has errors in the filesystem structure).
+
+ - If you see `not a block device' errors, check that the definitions
+ of the device special files are correct. They should be as
+ follows:
+
+ brw-rw---- 1 root disk 3, 0 Nov 11 18:48 /dev/hda
+ brw-rw---- 1 root disk 3, 64 Nov 11 18:48 /dev/hdb
+ brw-rw---- 1 root disk 22, 0 Nov 11 18:48 /dev/hdc
+ brw-rw---- 1 root disk 22, 64 Nov 11 18:48 /dev/hdd
+
+ Some early Slackware releases had these defined incorrectly. If
+ these are wrong, you can remake them by running the script
+ scripts/MAKEDEV.ide. (You may have to make it executable
+ with chmod first.)
+
+ If you have a /dev/cdrom symbolic link, check that it is pointing
+ to the correct device file.
+
+ If you hear people talking of the devices `hd1a' and `hd1b', these
+ were old names for what are now called hdc and hdd. Those names
+ should be considered obsolete.
+
+ - If mount is complaining that the iso9660 filesystem is not
+ available, but you know it is (check /proc/filesystems), you
+ probably need a newer version of mount. Early versions would not
+ always give meaningful error messages.
+
+
+e. Directory listings are unpredictably truncated, and `dmesg' shows
+ `buffer botch' error messages from the driver.
+
+ - There was a bug in the version of the driver in 1.2.x kernels
+ which could cause this. It was fixed in 1.3.0. If you can't
+ upgrade, you can probably work around the problem by specifying a
+ blocksize of 2048 when mounting. (Note that you won't be able to
+ directly execute binaries off the CDROM in that case.)
+
+ If you see this in kernels later than 1.3.0, please report it as a
+ bug.
+
+
+f. Data corruption.
+
+ - Random data corruption was occasionally observed with the Hitachi
+ CDR-7730 CDROM. If you experience data corruption, using "hdx=slow"
+ as a command line parameter may work around the problem, at the
+ expense of low system performance.
+
+
+6. cdchange.c
+-------------
+
+/*
+ * cdchange.c [-v] <device> [<slot>]
+ *
+ * This loads a CDROM from a specified slot in a changer, and displays
+ * information about the changer status. The drive should be unmounted before
+ * using this program.
+ *
+ * Changer information is displayed if either the -v flag is specified
+ * or no slot was specified.
+ *
+ * Based on code originally from Gerhard Zuber <zuber@berlin.snafu.de>.
+ * Changer status information, and rewrite for the new Uniform CDROM driver
+ * interface by Erik Andersen <andersee@debian.org>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/cdrom.h>
+
+
+int
+main (int argc, char **argv)
+{
+ char *program;
+ char *device;
+ int fd; /* file descriptor for CD-ROM device */
+ int status; /* return status for system calls */
+ int verbose = 0;
+ int slot=-1, x_slot;
+ int total_slots_available;
+
+ program = argv[0];
+
+ ++argv;
+ --argc;
+
+ if (argc < 1 || argc > 3) {
+ fprintf (stderr, "usage: %s [-v] <device> [<slot>]\n",
+ program);
+ fprintf (stderr, " Slots are numbered 1 -- n.\n");
+ exit (1);
+ }
+
+ if (strcmp (argv[0], "-v") == 0) {
+ verbose = 1;
+ ++argv;
+ --argc;
+ }
+
+ device = argv[0];
+
+ if (argc == 2)
+ slot = atoi (argv[1]) - 1;
+
+ /* open device */
+ fd = open(device, O_RDONLY | O_NONBLOCK);
+ if (fd < 0) {
+ fprintf (stderr, "%s: open failed for `%s': %s\n",
+ program, device, strerror (errno));
+ exit (1);
+ }
+
+ /* Check CD player status */
+ total_slots_available = ioctl (fd, CDROM_CHANGER_NSLOTS);
+ if (total_slots_available <= 1 ) {
+ fprintf (stderr, "%s: Device `%s' is not an ATAPI "
+ "compliant CD changer.\n", program, device);
+ exit (1);
+ }
+
+ if (slot >= 0) {
+ if (slot >= total_slots_available) {
+ fprintf (stderr, "Bad slot number. "
+ "Should be 1 -- %d.\n",
+ total_slots_available);
+ exit (1);
+ }
+
+ /* load */
+ slot=ioctl (fd, CDROM_SELECT_DISC, slot);
+ if (slot<0) {
+ fflush(stdout);
+ perror ("CDROM_SELECT_DISC ");
+ exit(1);
+ }
+ }
+
+ if (slot < 0 || verbose) {
+
+ status=ioctl (fd, CDROM_SELECT_DISC, CDSL_CURRENT);
+ if (status<0) {
+ fflush(stdout);
+ perror (" CDROM_SELECT_DISC");
+ exit(1);
+ }
+ slot=status;
+
+ printf ("Current slot: %d\n", slot+1);
+ printf ("Total slots available: %d\n",
+ total_slots_available);
+
+ printf ("Drive status: ");
+ status = ioctl (fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
+ if (status<0) {
+ perror(" CDROM_DRIVE_STATUS");
+ } else switch(status) {
+ case CDS_DISC_OK:
+ printf ("Ready.\n");
+ break;
+ case CDS_TRAY_OPEN:
+ printf ("Tray Open.\n");
+ break;
+ case CDS_DRIVE_NOT_READY:
+ printf ("Drive Not Ready.\n");
+ break;
+ default:
+ printf ("This Should not happen!\n");
+ break;
+ }
+
+ for (x_slot=0; x_slot<total_slots_available; x_slot++) {
+ printf ("Slot %2d: ", x_slot+1);
+ status = ioctl (fd, CDROM_DRIVE_STATUS, x_slot);
+ if (status<0) {
+ perror(" CDROM_DRIVE_STATUS");
+ } else switch(status) {
+ case CDS_DISC_OK:
+ printf ("Disc present.");
+ break;
+ case CDS_NO_DISC:
+ printf ("Empty slot.");
+ break;
+ case CDS_TRAY_OPEN:
+ printf ("CD-ROM tray open.\n");
+ break;
+ case CDS_DRIVE_NOT_READY:
+ printf ("CD-ROM drive not ready.\n");
+ break;
+ case CDS_NO_INFO:
+ printf ("No Information available.");
+ break;
+ default:
+ printf ("This Should not happen!\n");
+ break;
+ }
+ if (slot == x_slot) {
+ status = ioctl (fd, CDROM_DISC_STATUS);
+ if (status<0) {
+ perror(" CDROM_DISC_STATUS");
+ }
+ switch (status) {
+ case CDS_AUDIO:
+ printf ("\tAudio disc.\t");
+ break;
+ case CDS_DATA_1:
+ case CDS_DATA_2:
+ printf ("\tData disc type %d.\t", status-CDS_DATA_1+1);
+ break;
+ case CDS_XA_2_1:
+ case CDS_XA_2_2:
+ printf ("\tXA data disc type %d.\t", status-CDS_XA_2_1+1);
+ break;
+ default:
+ printf ("\tUnknown disc type 0x%x!\t", status);
+ break;
+ }
+ }
+ status = ioctl (fd, CDROM_MEDIA_CHANGED, x_slot);
+ if (status<0) {
+ perror(" CDROM_MEDIA_CHANGED");
+ }
+ switch (status) {
+ case 1:
+ printf ("Changed.\n");
+ break;
+ default:
+ printf ("\n");
+ break;
+ }
+ }
+ }
+
+ /* close device */
+ status = close (fd);
+ if (status != 0) {
+ fprintf (stderr, "%s: close failed for `%s': %s\n",
+ program, device, strerror (errno));
+ exit (1);
+ }
+
+ exit (0);
+}
diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt
new file mode 100644
index 0000000..cf1f812
--- /dev/null
+++ b/Documentation/cdrom/packet-writing.txt
@@ -0,0 +1,132 @@
+Getting started quick
+---------------------
+
+- Select packet support in the block device section and UDF support in
+ the file system section.
+
+- Compile and install kernel and modules, reboot.
+
+- You need the udftools package (pktsetup, mkudffs, cdrwtool).
+ Download from http://sourceforge.net/projects/linux-udf/
+
+- Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute
+ as appropriate):
+ # cdrwtool -d /dev/hdc -q
+
+- Setup your writer
+ # pktsetup dev_name /dev/hdc
+
+- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy!
+ # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
+
+
+Packet writing for DVD-RW media
+-------------------------------
+
+DVD-RW discs can be written to much like CD-RW discs if they are in
+the so called "restricted overwrite" mode. To put a disc in restricted
+overwrite mode, run:
+
+ # dvd+rw-format /dev/hdc
+
+You can then use the disc the same way you would use a CD-RW disc:
+
+ # pktsetup dev_name /dev/hdc
+ # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
+
+
+Packet writing for DVD+RW media
+-------------------------------
+
+According to the DVD+RW specification, a drive supporting DVD+RW discs
+shall implement "true random writes with 2KB granularity", which means
+that it should be possible to put any filesystem with a block size >=
+2KB on such a disc. For example, it should be possible to do:
+
+ # dvd+rw-format /dev/hdc (only needed if the disc has never
+ been formatted)
+ # mkudffs /dev/hdc
+ # mount /dev/hdc /cdrom -t udf -o rw,noatime
+
+However, some drives don't follow the specification and expect the
+host to perform aligned writes at 32KB boundaries. Other drives do
+follow the specification, but suffer bad performance problems if the
+writes are not 32KB aligned.
+
+Both problems can be solved by using the pktcdvd driver, which always
+generates aligned writes.
+
+ # dvd+rw-format /dev/hdc
+ # pktsetup dev_name /dev/hdc
+ # mkudffs /dev/pktcdvd/dev_name
+ # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
+
+
+Packet writing for DVD-RAM media
+--------------------------------
+
+DVD-RAM discs are random writable, so using the pktcdvd driver is not
+necessary. However, using the pktcdvd driver can improve performance
+in the same way it does for DVD+RW media.
+
+
+Notes
+-----
+
+- CD-RW media can usually not be overwritten more than about 1000
+ times, so to avoid unnecessary wear on the media, you should always
+ use the noatime mount option.
+
+- Defect management (ie automatic remapping of bad sectors) has not
+ been implemented yet, so you are likely to get at least some
+ filesystem corruption if the disc wears out.
+
+- Since the pktcdvd driver makes the disc appear as a regular block
+ device with a 2KB block size, you can put any filesystem you like on
+ the disc. For example, run:
+
+ # /sbin/mke2fs /dev/pktcdvd/dev_name
+
+ to create an ext2 filesystem on the disc.
+
+
+Using the pktcdvd sysfs interface
+---------------------------------
+
+Since Linux 2.6.20, the pktcdvd module has a sysfs interface
+and can be controlled by it. For example the "pktcdvd" tool uses
+this interface. (see http://people.freenet.de/BalaGi#pktcdvd )
+
+"pktcdvd" works similar to "pktsetup", e.g.:
+
+ # pktcdvd -a dev_name /dev/hdc
+ # mkudffs /dev/pktcdvd/dev_name
+ # mount -t udf -o rw,noatime /dev/pktcdvd/dev_name /dvdram
+ # cp files /dvdram
+ # umount /dvdram
+ # pktcdvd -r dev_name
+
+
+For a description of the sysfs interface look into the file:
+
+ Documentation/ABI/testing/sysfs-block-pktcdvd
+
+
+Using the pktcdvd debugfs interface
+-----------------------------------
+
+To read pktcdvd device infos in human readable form, do:
+
+ # cat /debug/pktcdvd/pktcdvd[0-7]/info
+
+For a description of the debugfs interface look into the file:
+
+ Documentation/ABI/testing/debugfs-pktcdvd
+
+
+
+Links
+-----
+
+See http://fy.chalmers.se/~appro/linux/DVD+RW/ for more information
+about DVD writing.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
new file mode 100644
index 0000000..d9014aa
--- /dev/null
+++ b/Documentation/cgroups/cgroups.txt
@@ -0,0 +1,548 @@
+ CGROUPS
+ -------
+
+Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
+
+Original copyright statements from cpusets.txt:
+Portions Copyright (C) 2004 BULL SA.
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+
+CONTENTS:
+=========
+
+1. Control Groups
+ 1.1 What are cgroups ?
+ 1.2 Why are cgroups needed ?
+ 1.3 How are cgroups implemented ?
+ 1.4 What does notify_on_release do ?
+ 1.5 How do I use cgroups ?
+2. Usage Examples and Syntax
+ 2.1 Basic Usage
+ 2.2 Attaching processes
+3. Kernel API
+ 3.1 Overview
+ 3.2 Synchronization
+ 3.3 Subsystem API
+4. Questions
+
+1. Control Groups
+=================
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy. Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierachies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task pids assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cpusets.txt) allows
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines:
+
+ CPU : Top cpuset
+ / \
+ CPUSet1 CPUSet2
+ | |
+ (Profs) (Students)
+
+ In addition (system tasks) are attached to topcpuset (so
+ that they can run anywhere) with a limit of 20%
+
+ Memory : Professors (50%), students (30%), system (20%)
+
+ Disk : Prof (50%), students (30%), system (20%)
+
+ Network : WWW browsing (20%), Network File System (60%), others (20%)
+ / \
+ Prof (15%) students (5%)
+
+Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
+into NFS network class.
+
+At the same time firefox/lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies) then
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can
+
+ # echo browser_pid > /mnt/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+approp network and other resource class. This may lead to
+proliferation of such cgroups.
+
+Also lets say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :)) OR give one of the students simulation
+apps enhanced CPU power,
+
+With ability to write pids directly to resource classes, it's just a
+matter of :
+
+ # echo pid > /mnt/network/<new_class>/tasks
+ (after some time)
+ # echo pid > /mnt/network/<orig_class>/tasks
+
+Without this ability, he would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+ css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+ cgroup_subsys_state objects, one for each cgroup subsystem
+ registered in the system. There is no direct link from a task to
+ the cgroup of which it's a member in each hierarchy, but this
+ can be determined by following pointers through the
+ cgroup_subsys_state objects. This is because accessing the
+ subsystem state is something that's expected to happen frequently
+ and in performance-critical code, whereas operations that require a
+ task's actual cgroup assignments (in particular, moving between
+ cgroups) are less common. A linked list runs through the cg_list
+ field of each task_struct using the css_set, anchored at
+ css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+ manipulation from user space.
+
+ - You can list all the tasks (by pid) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+ css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition a new file system, of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel. When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options. By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by pid) attached to that cgroup
+ - releasable flag: cgroup currently removeable?
+ - notify_on_release flag: run the release agent on exit?
+ - release_agent: the path to use for release notifications (this file
+ exists in the top cgroup only)
+
+Other subsystems such as cpusets may add additional files in each
+cgroup dir.
+
+New cgroups are created using the mkdir system call or shell
+command. The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroups
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks. A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, else a new
+css_set is allocated. Note that the current implementation uses a
+linear search to locate an appropriate existing css_set, so isn't
+very efficient. A future version will use a hash table for better
+performance.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cgrp_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup. This enables automatic
+removal of abandoned cgroups. The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0). The default value of other cgroups at creation is the current
+value of their parents notify_on_release setting. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like:
+
+ 1) mkdir /dev/cgroup
+ 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
+ 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
+ the /dev/cgroup virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cgroup by writing its pid to the
+ /dev/cgroup tasks file for that cgroup.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup:
+
+ mount -t cgroup cpuset -ocpuset /dev/cgroup
+ cd /dev/cgroup
+ mkdir Charlie
+ cd Charlie
+ /bin/echo 2-3 > cpuset.cpus
+ /bin/echo 1 > cpuset.mems
+ /bin/echo $$ > tasks
+ sh
+ # The subshell 'sh' is now running in cgroup Charlie
+ # The next line should display '/Charlie'
+ cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy will all available subsystems, type:
+# mount -t cgroup xxx /dev/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+To mount a cgroup hierarchy with just the cpuset and numtasks
+subsystems, type:
+# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
+
+To change the set of subsystems bound to a mounted hierarchy, just
+remount with different options:
+
+# mount -o remount,cpuset,ns /dev/cgroup
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /dev/cgroup you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /dev/cgroup
+is the cgroup that holds the whole system.
+
+If you want to create a new cgroup under /dev/cgroup:
+# cd /dev/cgroup
+# mkdir my_cgroup
+
+Now you want to do something with this cgroup.
+# cd my_cgroup
+
+In this directory you can find several files:
+# ls
+notify_on_release releasable tasks
+(plus whatever files added by the attached subsystems)
+
+Now attach your shell to this cgroup:
+# /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory.
+# mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir:
+# rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+# /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+ ...
+# /bin/echo PIDn > tasks
+
+You can attach the current shell task by echoing 0:
+
+# echo 0 > tasks
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem id which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+ entry in cgroup->subsys[] this subsystem should be managing.
+
+- name: should be initialized to a unique subsystem name. Should be
+ no longer than MAX_CGROUP_TYPE_NAMELEN.
+
+- early_init: indicate if the subsystem needs early initialization
+ at system boot.
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem id; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_subsys
+
+Each subsystem may export the following methods. The only mandatory
+methods are create/destroy. Any others that are null are presumed to
+be successful no-ops.
+
+struct cgroup_subsys_state *create(struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+Called to create a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or a
+negative error code. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+The cgroup system is about to destroy the passed cgroup; the subsystem
+should do any necessary cleanup and free its subsystem state
+object. By the time this method is called, the cgroup has already been
+unlinked from the file system and from the child list of its parent;
+cgroup->parent is still valid. (Note - can also be called for a
+newly-created cgroup if an error occurs after this subsystem's
+create() method has been called for the new cgroup).
+
+void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+(cgroup_mutex held by caller)
+
+Called before checking the reference count on each subsystem. This may
+be useful for subsystems which have some extra references even if
+there are not tasks in the cgroup.
+
+int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *task)
+(cgroup_mutex held by caller)
+
+Called prior to moving a task into a cgroup; if the subsystem
+returns an error, this will abort the attach operation. If a NULL
+task is passed, then a successful result indicates that *any*
+unspecified task can be moved into the cgroup. Note that this isn't
+called on a fork. If this method returns 0 (success) then this should
+remain valid while the caller holds cgroup_mutex.
+
+void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+
+void fork(struct cgroup_subsy *ss, struct task_struct *task)
+
+Called when a task is forked into a cgroup.
+
+void exit(struct cgroup_subsys *ss, struct task_struct *task)
+
+Called during task exit.
+
+int populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+
+Called after creation of a cgroup to allow a subsystem to populate
+the cgroup directory with file entries. The subsystem should make
+calls to cgroup_add_file() with objects of type cftype (see
+include/linux/cgroup.h for details). Note that although this
+method can return an error code, the error code is currently not
+always handled well.
+
+void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
+
+Called at the end of cgroup_clone() to do any paramater
+initialization which might be required before a task could attach. For
+example in cpusets, no task may attach before 'cpus' and 'mems' are set
+up.
+
+void bind(struct cgroup_subsys *ss, struct cgroup *root)
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Questions
+============
+
+Q: what's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check calls to write() against
+ errors. If you use it in the cgroup file system, you won't be
+ able to tell whether a command succeeded or failed.
+
+Q: When I attach processes, only the first of the line gets really attached !
+A: We can only return one error code per call to write(). So you should also
+ put only ONE pid.
+
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt
new file mode 100644
index 0000000..41f37fe
--- /dev/null
+++ b/Documentation/cgroups/freezer-subsystem.txt
@@ -0,0 +1,102 @@
+The cgroup freezer is useful to batch job management system which start
+and stop sets of tasks in order to schedule the resources of a machine
+according to the desires of a system administrator. This sort of program
+is often used on HPC clusters to schedule access to the cluster as a
+whole. The cgroup freezer uses cgroups to describe the set of tasks to
+be started/stopped by the batch job management system. It also provides
+a means to start and stop the tasks composing the job.
+
+The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells:
+
+ $ echo $$
+ 16644
+ $ bash
+ $ echo $$
+ 16690
+
+ From a second, unrelated bash shell:
+ $ kill -SIGSTOP 16690
+ $ kill -SIGCONT 16990
+
+ <at this point 16990 exits and causes 16644 to exit too>
+
+This happens because bash can observe both signals and choose how it
+responds to them.
+
+Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+The freezer subsystem in the container filesystem defines a file named
+freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
+cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
+Reading will return the current state.
+
+Note freezer.state doesn't exist in root cgroup, which means root cgroup
+is non-freezable.
+
+* Examples of usage :
+
+ # mkdir /containers
+ # mount -t cgroup -ofreezer freezer /containers
+ # mkdir /containers/0
+ # echo $some_pid > /containers/0/tasks
+
+to get status of the freezer subsystem :
+
+ # cat /containers/0/freezer.state
+ THAWED
+
+to freeze all tasks in the container :
+
+ # echo FROZEN > /containers/0/freezer.state
+ # cat /containers/0/freezer.state
+ FREEZING
+ # cat /containers/0/freezer.state
+ FROZEN
+
+to unfreeze all tasks in the container :
+
+ # echo THAWED > /containers/0/freezer.state
+ # cat /containers/0/freezer.state
+ THAWED
+
+This is the basic mechanism which should do the right thing for user space task
+in a simple scenario.
+
+It's important to note that freezing can be incomplete. In that case we return
+EBUSY. This means that some tasks in the cgroup are busy doing something that
+prevents us from completely freezing the cgroup at this time. After EBUSY,
+the cgroup will remain partially frozen -- reflected by freezer.state reporting
+"FREEZING" when read. The state will remain "FREEZING" until one of these
+things happens:
+
+ 1) Userspace cancels the freezing operation by writing "THAWED" to
+ the freezer.state file
+ 2) Userspace retries the freezing operation by writing "FROZEN" to
+ the freezer.state file (writing "FREEZING" is not legal
+ and returns EINVAL)
+ 3) The tasks that blocked the cgroup from entering the "FROZEN"
+ state disappear from the cgroup's set of tasks.
diff --git a/Documentation/connector/.gitignore b/Documentation/connector/.gitignore
new file mode 100644
index 0000000..d2b9c32
--- /dev/null
+++ b/Documentation/connector/.gitignore
@@ -0,0 +1 @@
+ucon
diff --git a/Documentation/connector/Makefile b/Documentation/connector/Makefile
new file mode 100644
index 0000000..8df1a72
--- /dev/null
+++ b/Documentation/connector/Makefile
@@ -0,0 +1,11 @@
+ifneq ($(CONFIG_CONNECTOR),)
+obj-m += cn_test.o
+endif
+
+# List of programs to build
+hostprogs-y := ucon
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include
diff --git a/Documentation/connector/cn_test.c b/Documentation/connector/cn_test.c
new file mode 100644
index 0000000..be7af14
--- /dev/null
+++ b/Documentation/connector/cn_test.c
@@ -0,0 +1,193 @@
+/*
+ * cn_test.c
+ *
+ * 2004-2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+
+#include <linux/connector.h>
+
+static struct cb_id cn_test_id = { 0x123, 0x456 };
+static char cn_test_name[] = "cn_test";
+static struct sock *nls;
+static struct timer_list cn_test_timer;
+
+void cn_test_callback(void *data)
+{
+ struct cn_msg *msg = (struct cn_msg *)data;
+
+ printk("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n",
+ __func__, jiffies, msg->id.idx, msg->id.val,
+ msg->seq, msg->ack, msg->len, (char *)msg->data);
+}
+
+static int cn_test_want_notify(void)
+{
+ struct cn_ctl_msg *ctl;
+ struct cn_notify_req *req;
+ struct cn_msg *msg = NULL;
+ int size, size0;
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ u32 group = 1;
+
+ size0 = sizeof(*msg) + sizeof(*ctl) + 3 * sizeof(*req);
+
+ size = NLMSG_SPACE(size0);
+
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb) {
+ printk(KERN_ERR "Failed to allocate new skb with size=%u.\n",
+ size);
+
+ return -ENOMEM;
+ }
+
+ nlh = NLMSG_PUT(skb, 0, 0x123, NLMSG_DONE, size - sizeof(*nlh));
+
+ msg = (struct cn_msg *)NLMSG_DATA(nlh);
+
+ memset(msg, 0, size0);
+
+ msg->id.idx = -1;
+ msg->id.val = -1;
+ msg->seq = 0x123;
+ msg->ack = 0x345;
+ msg->len = size0 - sizeof(*msg);
+
+ ctl = (struct cn_ctl_msg *)(msg + 1);
+
+ ctl->idx_notify_num = 1;
+ ctl->val_notify_num = 2;
+ ctl->group = group;
+ ctl->len = msg->len - sizeof(*ctl);
+
+ req = (struct cn_notify_req *)(ctl + 1);
+
+ /*
+ * Idx.
+ */
+ req->first = cn_test_id.idx;
+ req->range = 10;
+
+ /*
+ * Val 0.
+ */
+ req++;
+ req->first = cn_test_id.val;
+ req->range = 10;
+
+ /*
+ * Val 1.
+ */
+ req++;
+ req->first = cn_test_id.val + 20;
+ req->range = 10;
+
+ NETLINK_CB(skb).dst_group = ctl->group;
+ //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC);
+ netlink_unicast(nls, skb, 0, 0);
+
+ printk(KERN_INFO "Request was sent. Group=0x%x.\n", ctl->group);
+
+ return 0;
+
+nlmsg_failure:
+ printk(KERN_ERR "Failed to send %u.%u\n", msg->seq, msg->ack);
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static u32 cn_test_timer_counter;
+static void cn_test_timer_func(unsigned long __data)
+{
+ struct cn_msg *m;
+ char data[32];
+
+ m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC);
+ if (m) {
+
+ memcpy(&m->id, &cn_test_id, sizeof(m->id));
+ m->seq = cn_test_timer_counter;
+ m->len = sizeof(data);
+
+ m->len =
+ scnprintf(data, sizeof(data), "counter = %u",
+ cn_test_timer_counter) + 1;
+
+ memcpy(m + 1, data, m->len);
+
+ cn_netlink_send(m, 0, gfp_any());
+ kfree(m);
+ }
+
+ cn_test_timer_counter++;
+
+ mod_timer(&cn_test_timer, jiffies + HZ);
+}
+
+static int cn_test_init(void)
+{
+ int err;
+
+ err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback);
+ if (err)
+ goto err_out;
+ cn_test_id.val++;
+ err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback);
+ if (err) {
+ cn_del_callback(&cn_test_id);
+ goto err_out;
+ }
+
+ init_timer(&cn_test_timer);
+ cn_test_timer.function = cn_test_timer_func;
+ cn_test_timer.expires = jiffies + HZ;
+ cn_test_timer.data = 0;
+ add_timer(&cn_test_timer);
+
+ return 0;
+
+ err_out:
+ if (nls && nls->sk_socket)
+ sock_release(nls->sk_socket);
+
+ return err;
+}
+
+static void cn_test_fini(void)
+{
+ del_timer_sync(&cn_test_timer);
+ cn_del_callback(&cn_test_id);
+ cn_test_id.val--;
+ cn_del_callback(&cn_test_id);
+ if (nls && nls->sk_socket)
+ sock_release(nls->sk_socket);
+}
+
+module_init(cn_test_init);
+module_exit(cn_test_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Evgeniy Polyakov <johnpol@2ka.mipt.ru>");
+MODULE_DESCRIPTION("Connector's test module");
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt
new file mode 100644
index 0000000..ad6e0ba
--- /dev/null
+++ b/Documentation/connector/connector.txt
@@ -0,0 +1,178 @@
+/*****************************************/
+Kernel Connector.
+/*****************************************/
+
+Kernel connector - new netlink based userspace <-> kernel space easy
+to use communication module.
+
+Connector driver adds possibility to connect various agents using
+netlink based network. One must register callback and
+identifier. When driver receives special netlink message with
+appropriate identifier, appropriate callback will be called.
+
+From the userspace point of view it's quite straightforward:
+
+ socket();
+ bind();
+ send();
+ recv();
+
+But if kernelspace want to use full power of such connections, driver
+writer must create special sockets, must know about struct sk_buff
+handling... Connector allows any kernelspace agents to use netlink
+based networking for inter-process communication in a significantly
+easier way:
+
+int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *));
+void cn_netlink_send(struct cn_msg *msg, u32 __group, int gfp_mask);
+
+struct cb_id
+{
+ __u32 idx;
+ __u32 val;
+};
+
+idx and val are unique identifiers which must be registered in
+connector.h for in-kernel usage. void (*callback) (void *) - is a
+callback function which will be called when message with above idx.val
+will be received by connector core. Argument for that function must
+be dereferenced to struct cn_msg *.
+
+struct cn_msg
+{
+ struct cb_id id;
+
+ __u32 seq;
+ __u32 ack;
+
+ __u32 len; /* Length of the following data */
+ __u8 data[0];
+};
+
+/*****************************************/
+Connector interfaces.
+/*****************************************/
+
+int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *));
+
+Registers new callback with connector core.
+
+struct cb_id *id - unique connector's user identifier.
+ It must be registered in connector.h for legal in-kernel users.
+char *name - connector's callback symbolic name.
+void (*callback) (void *) - connector's callback.
+ Argument must be dereferenced to struct cn_msg *.
+
+void cn_del_callback(struct cb_id *id);
+
+Unregisters new callback with connector core.
+
+struct cb_id *id - unique connector's user identifier.
+
+int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask);
+
+Sends message to the specified groups. It can be safely called from
+softirq context, but may silently fail under strong memory pressure.
+If there are no listeners for given group -ESRCH can be returned.
+
+struct cn_msg * - message header(with attached data).
+u32 __group - destination group.
+ If __group is zero, then appropriate group will
+ be searched through all registered connector users,
+ and message will be delivered to the group which was
+ created for user with the same ID as in msg.
+ If __group is not zero, then message will be delivered
+ to the specified group.
+int gfp_mask - GFP mask.
+
+Note: When registering new callback user, connector core assigns
+netlink group to the user which is equal to it's id.idx.
+
+/*****************************************/
+Protocol description.
+/*****************************************/
+
+Current offers transport layer with fixed header. Recommended
+protocol which uses such header is following:
+
+msg->seq and msg->ack are used to determine message genealogy. When
+someone sends message it puts there locally unique sequence and random
+acknowledge numbers. Sequence number may be copied into
+nlmsghdr->nlmsg_seq too.
+
+Sequence number is incremented with each message to be sent.
+
+If we expect reply to our message, then sequence number in received
+message MUST be the same as in original message, and acknowledge
+number MUST be the same + 1.
+
+If we receive message and it's sequence number is not equal to one we
+are expecting, then it is new message. If we receive message and it's
+sequence number is the same as one we are expecting, but it's
+acknowledge is not equal acknowledge number in original message + 1,
+then it is new message.
+
+Obviously, protocol header contains above id.
+
+connector allows event notification in the following form: kernel
+driver or userspace process can ask connector to notify it when
+selected id's will be turned on or off(registered or unregistered it's
+callback). It is done by sending special command to connector
+driver(it also registers itself with id={-1, -1}).
+
+As example of usage Documentation/connector now contains cn_test.c -
+testing module which uses connector to request notification and to
+send messages.
+
+/*****************************************/
+Reliability.
+/*****************************************/
+
+Netlink itself is not reliable protocol, that means that messages can
+be lost due to memory pressure or process' receiving queue overflowed,
+so caller is warned must be prepared. That is why struct cn_msg [main
+connector's message header] contains u32 seq and u32 ack fields.
+
+/*****************************************/
+Userspace usage.
+/*****************************************/
+2.6.14 has a new netlink socket implementation, which by default does not
+allow to send data to netlink groups other than 1.
+So, if to use netlink socket (for example using connector)
+with different group number userspace application must subscribe to
+that group. It can be achieved by following pseudocode:
+
+s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+
+l_local.nl_family = AF_NETLINK;
+l_local.nl_groups = 12345;
+l_local.nl_pid = 0;
+
+if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
+ perror("bind");
+ close(s);
+ return -1;
+}
+
+{
+ int on = l_local.nl_groups;
+ setsockopt(s, 270, 1, &on, sizeof(on));
+}
+
+Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
+option. To drop multicast subscription one should call above socket option
+with NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
+
+2.6.14 netlink code only allows to select a group which is less or equal to
+the maximum group number, which is used at netlink_kernel_create() time.
+In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
+group number 12345, you must increment CN_NETLINK_USERS to that number.
+Additional 0xf numbers are allocated to be used by non-in-kernel users.
+
+Due to this limitation, group 0xffffffff does not work now, so one can
+not use add/remove connector's group notifications, but as far as I know,
+only cn_test.c test module used it.
+
+Some work in netlink area is still being done, so things can be changed in
+2.6.15 timeframe, if it will happen, documentation will be updated for that
+kernel.
diff --git a/Documentation/connector/ucon.c b/Documentation/connector/ucon.c
new file mode 100644
index 0000000..d738cde
--- /dev/null
+++ b/Documentation/connector/ucon.c
@@ -0,0 +1,206 @@
+/*
+ * ucon.c
+ *
+ * Copyright (c) 2004+ Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/types.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <arpa/inet.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+
+#include <linux/connector.h>
+
+#define DEBUG
+#define NETLINK_CONNECTOR 11
+
+#ifdef DEBUG
+#define ulog(f, a...) fprintf(stdout, f, ##a)
+#else
+#define ulog(f, a...) do {} while (0)
+#endif
+
+static int need_exit;
+static __u32 seq;
+
+static int netlink_send(int s, struct cn_msg *msg)
+{
+ struct nlmsghdr *nlh;
+ unsigned int size;
+ int err;
+ char buf[128];
+ struct cn_msg *m;
+
+ size = NLMSG_SPACE(sizeof(struct cn_msg) + msg->len);
+
+ nlh = (struct nlmsghdr *)buf;
+ nlh->nlmsg_seq = seq++;
+ nlh->nlmsg_pid = getpid();
+ nlh->nlmsg_type = NLMSG_DONE;
+ nlh->nlmsg_len = NLMSG_LENGTH(size - sizeof(*nlh));
+ nlh->nlmsg_flags = 0;
+
+ m = NLMSG_DATA(nlh);
+#if 0
+ ulog("%s: [%08x.%08x] len=%u, seq=%u, ack=%u.\n",
+ __func__, msg->id.idx, msg->id.val, msg->len, msg->seq, msg->ack);
+#endif
+ memcpy(m, msg, sizeof(*m) + msg->len);
+
+ err = send(s, nlh, size, 0);
+ if (err == -1)
+ ulog("Failed to send: %s [%d].\n",
+ strerror(errno), errno);
+
+ return err;
+}
+
+int main(int argc, char *argv[])
+{
+ int s;
+ char buf[1024];
+ int len;
+ struct nlmsghdr *reply;
+ struct sockaddr_nl l_local;
+ struct cn_msg *data;
+ FILE *out;
+ time_t tm;
+ struct pollfd pfd;
+
+ if (argc < 2)
+ out = stdout;
+ else {
+ out = fopen(argv[1], "a+");
+ if (!out) {
+ ulog("Unable to open %s for writing: %s\n",
+ argv[1], strerror(errno));
+ out = stdout;
+ }
+ }
+
+ memset(buf, 0, sizeof(buf));
+
+ s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+ if (s == -1) {
+ perror("socket");
+ return -1;
+ }
+
+ l_local.nl_family = AF_NETLINK;
+ l_local.nl_groups = 0x123; /* bitmask of requested groups */
+ l_local.nl_pid = 0;
+
+ if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
+ perror("bind");
+ close(s);
+ return -1;
+ }
+
+#if 0
+ {
+ int on = 0x57; /* Additional group number */
+ setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on));
+ }
+#endif
+ if (0) {
+ int i, j;
+
+ memset(buf, 0, sizeof(buf));
+
+ data = (struct cn_msg *)buf;
+
+ data->id.idx = 0x123;
+ data->id.val = 0x456;
+ data->seq = seq++;
+ data->ack = 0;
+ data->len = 0;
+
+ for (j=0; j<10; ++j) {
+ for (i=0; i<1000; ++i) {
+ len = netlink_send(s, data);
+ }
+
+ ulog("%d messages have been sent to %08x.%08x.\n", i, data->id.idx, data->id.val);
+ }
+
+ return 0;
+ }
+
+
+ pfd.fd = s;
+
+ while (!need_exit) {
+ pfd.events = POLLIN;
+ pfd.revents = 0;
+ switch (poll(&pfd, 1, -1)) {
+ case 0:
+ need_exit = 1;
+ break;
+ case -1:
+ if (errno != EINTR) {
+ need_exit = 1;
+ break;
+ }
+ continue;
+ }
+ if (need_exit)
+ break;
+
+ memset(buf, 0, sizeof(buf));
+ len = recv(s, buf, sizeof(buf), 0);
+ if (len == -1) {
+ perror("recv buf");
+ close(s);
+ return -1;
+ }
+ reply = (struct nlmsghdr *)buf;
+
+ switch (reply->nlmsg_type) {
+ case NLMSG_ERROR:
+ fprintf(out, "Error message received.\n");
+ fflush(out);
+ break;
+ case NLMSG_DONE:
+ data = (struct cn_msg *)NLMSG_DATA(reply);
+
+ time(&tm);
+ fprintf(out, "%.24s : [%x.%x] [%08u.%08u].\n",
+ ctime(&tm), data->id.idx, data->id.val, data->seq, data->ack);
+ fflush(out);
+ break;
+ default:
+ break;
+ }
+ }
+
+ close(s);
+ return 0;
+}
diff --git a/Documentation/console/console.txt b/Documentation/console/console.txt
new file mode 100644
index 0000000..877a1b2
--- /dev/null
+++ b/Documentation/console/console.txt
@@ -0,0 +1,144 @@
+Console Drivers
+===============
+
+The linux kernel has 2 general types of console drivers. The first type is
+assigned by the kernel to all the virtual consoles during the boot process.
+This type will be called 'system driver', and only one system driver is allowed
+to exist. The system driver is persistent and it can never be unloaded, though
+it may become inactive.
+
+The second type has to be explicitly loaded and unloaded. This will be called
+'modular driver' by this document. Multiple modular drivers can coexist at
+any time with each driver sharing the console with other drivers including
+the system driver. However, modular drivers cannot take over the console
+that is currently occupied by another modular driver. (Exception: Drivers that
+call take_over_console() will succeed in the takeover regardless of the type
+of driver occupying the consoles.) They can only take over the console that is
+occupied by the system driver. In the same token, if the modular driver is
+released by the console, the system driver will take over.
+
+Modular drivers, from the programmer's point of view, has to call:
+
+ take_over_console() - load and bind driver to console layer
+ give_up_console() - unbind and unload driver
+
+In newer kernels, the following are also available:
+
+ register_con_driver()
+ unregister_con_driver()
+
+If sysfs is enabled, the contents of /sys/class/vtconsole can be
+examined. This shows the console backends currently registered by the
+system which are named vtcon<n> where <n> is an integer from 0 to 15. Thus:
+
+ ls /sys/class/vtconsole
+ . .. vtcon0 vtcon1
+
+Each directory in /sys/class/vtconsole has 3 files:
+
+ ls /sys/class/vtconsole/vtcon0
+ . .. bind name uevent
+
+What do these files signify?
+
+ 1. bind - this is a read/write file. It shows the status of the driver if
+ read, or acts to bind or unbind the driver to the virtual consoles
+ when written to. The possible values are:
+
+ 0 - means the driver is not bound and if echo'ed, commands the driver
+ to unbind
+
+ 1 - means the driver is bound and if echo'ed, commands the driver to
+ bind
+
+ 2. name - read-only file. Shows the name of the driver in this format:
+
+ cat /sys/class/vtconsole/vtcon0/name
+ (S) VGA+
+
+ '(S)' stands for a (S)ystem driver, ie, it cannot be directly
+ commanded to bind or unbind
+
+ 'VGA+' is the name of the driver
+
+ cat /sys/class/vtconsole/vtcon1/name
+ (M) frame buffer device
+
+ In this case, '(M)' stands for a (M)odular driver, one that can be
+ directly commanded to bind or unbind.
+
+ 3. uevent - ignore this file
+
+When unbinding, the modular driver is detached first, and then the system
+driver takes over the consoles vacated by the driver. Binding, on the other
+hand, will bind the driver to the consoles that are currently occupied by a
+system driver.
+
+NOTE1: Binding and binding must be selected in Kconfig. It's under:
+
+Device Drivers -> Character devices -> Support for binding and unbinding
+console drivers
+
+NOTE2: If any of the virtual consoles are in KD_GRAPHICS mode, then binding or
+unbinding will not succeed. An example of an application that sets the console
+to KD_GRAPHICS is X.
+
+How useful is this feature? This is very useful for console driver
+developers. By unbinding the driver from the console layer, one can unload the
+driver, make changes, recompile, reload and rebind the driver without any need
+for rebooting the kernel. For regular users who may want to switch from
+framebuffer console to VGA console and vice versa, this feature also makes
+this possible. (NOTE NOTE NOTE: Please read fbcon.txt under Documentation/fb
+for more details).
+
+Notes for developers:
+=====================
+
+take_over_console() is now broken up into:
+
+ register_con_driver()
+ bind_con_driver() - private function
+
+give_up_console() is a wrapper to unregister_con_driver(), and a driver must
+be fully unbound for this call to succeed. con_is_bound() will check if the
+driver is bound or not.
+
+Guidelines for console driver writers:
+=====================================
+
+In order for binding to and unbinding from the console to properly work,
+console drivers must follow these guidelines:
+
+1. All drivers, except system drivers, must call either register_con_driver()
+ or take_over_console(). register_con_driver() will just add the driver to
+ the console's internal list. It won't take over the
+ console. take_over_console(), as it name implies, will also take over (or
+ bind to) the console.
+
+2. All resources allocated during con->con_init() must be released in
+ con->con_deinit().
+
+3. All resources allocated in con->con_startup() must be released when the
+ driver, which was previously bound, becomes unbound. The console layer
+ does not have a complementary call to con->con_startup() so it's up to the
+ driver to check when it's legal to release these resources. Calling
+ con_is_bound() in con->con_deinit() will help. If the call returned
+ false(), then it's safe to release the resources. This balance has to be
+ ensured because con->con_startup() can be called again when a request to
+ rebind the driver to the console arrives.
+
+4. Upon exit of the driver, ensure that the driver is totally unbound. If the
+ condition is satisfied, then the driver must call unregister_con_driver()
+ or give_up_console().
+
+5. unregister_con_driver() can also be called on conditions which make it
+ impossible for the driver to service console requests. This can happen
+ with the framebuffer console that suddenly lost all of its drivers.
+
+The current crop of console drivers should still work correctly, but binding
+and unbinding them may cause problems. With minimal fixes, these drivers can
+be made to work correctly.
+
+==========================
+Antonino Daplas <adaplas@pol.net>
+
diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt
new file mode 100644
index 0000000..7cc6e6a
--- /dev/null
+++ b/Documentation/controllers/devices.txt
@@ -0,0 +1,52 @@
+Device Whitelist Controller
+
+1. Description:
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files. A device cgroup associates a device access
+whitelist with each cgroup. A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block). 'all' means it applies
+to all types and all major and minor numbers. Major and minor are
+either an integer or * for all. Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'. A child device
+cgroup gets a copy of the parent. Administrators can then remove
+devices from the whitelist or add new entries. A child cgroup can
+never receive a device access which is denied by its parent. However
+when a device access is removed from a parent it will not also be
+removed from the child(ren).
+
+2. User Interface
+
+An entry is added using devices.allow, and removed using
+devices.deny. For instance
+
+ echo 'c 1:3 mr' > /cgroups/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null. Doing
+
+ echo a > /cgroups/1/devices.deny
+
+will remove the default 'a *:* rwm' entry. Doing
+
+ echo a > /cgroups/1/devices.allow
+
+will add the 'a *:* rwm' entry to the whitelist.
+
+3. Security
+
+Any task can move itself between cgroups. This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this. We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD. We may want to just refuse moving to a cgroup which
+isn't a descendent of the current one. Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup. (Again we'll probably want to change that).
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt
new file mode 100644
index 0000000..1c07547
--- /dev/null
+++ b/Documentation/controllers/memory.txt
@@ -0,0 +1,284 @@
+Memory Resource Controller
+
+NOTE: The Memory Resource Controller has been generically been referred
+to as the memory controller in this document. Do not confuse memory controller
+used here with the memory controller that is used in hardware.
+
+Salient features
+
+a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages
+b. The infrastructure allows easy addition of other types of memory to control
+c. Provides *zero overhead* for non memory controller users
+d. Provides a double LRU: global memory pressure causes reclaim from the
+ global LRU; a cgroup on hitting a limit, reclaims from the per
+ cgroup LRU
+
+NOTE: Swap Cache (unmapped) is not accounted now.
+
+Benefits and Purpose of the memory controller
+
+The memory controller isolates the memory behaviour of a group of tasks
+from the rest of the system. The article on LWN [12] mentions some probable
+uses of the memory controller. The memory controller can be used to
+
+a. Isolate an application or a group of applications
+ Memory hungry applications can be isolated and limited to a smaller
+ amount of memory.
+b. Create a cgroup with limited amount of memory, this can be used
+ as a good alternative to booting with mem=XXXX.
+c. Virtualization solutions can control the amount of memory they want
+ to assign to a virtual machine instance.
+d. A CD/DVD burner could control the amount of memory used by the
+ rest of the system to ensure that burning does not fail due to lack
+ of available memory.
+e. There are several other use cases, find one or use the controller just
+ for fun (to learn and hack on the VM subsystem).
+
+1. History
+
+The memory controller has a long history. A request for comments for the memory
+controller was posted by Balbir Singh [1]. At the time the RFC was posted
+there were several implementations for memory control. The goal of the
+RFC was to build consensus and agreement for the minimal features required
+for memory control. The first RSS controller was posted by Balbir Singh[2]
+in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
+RSS controller. At OLS, at the resource management BoF, everyone suggested
+that we handle both page cache and RSS together. Another request was raised
+to allow user space handling of OOM. The current memory controller is
+at version 6; it combines both mapped (RSS) and unmapped Page
+Cache Control [11].
+
+2. Memory Control
+
+Memory is a unique resource in the sense that it is present in a limited
+amount. If a task requires a lot of CPU processing, the task can spread
+its processing over a period of hours, days, months or years, but with
+memory, the same physical memory needs to be reused to accomplish the task.
+
+The memory controller implementation has been divided into phases. These
+are:
+
+1. Memory controller
+2. mlock(2) controller
+3. Kernel user memory accounting and slab control
+4. user mappings length controller
+
+The memory controller is the first controller developed.
+
+2.1. Design
+
+The core of the design is a counter called the res_counter. The res_counter
+tracks the current memory usage and limit of the group of processes associated
+with the controller. Each cgroup has a memory controller specific data
+structure (mem_cgroup) associated with it.
+
+2.2. Accounting
+
+ +--------------------+
+ | mem_cgroup |
+ | (res_counter) |
+ +--------------------+
+ / ^ \
+ / | \
+ +---------------+ | +---------------+
+ | mm_struct | |.... | mm_struct |
+ | | | | |
+ +---------------+ | +---------------+
+ |
+ + --------------+
+ |
+ +---------------+ +------+--------+
+ | page +----------> page_cgroup|
+ | | | |
+ +---------------+ +---------------+
+
+ (Figure 1: Hierarchy of Accounting)
+
+
+Figure 1 shows the important aspects of the controller
+
+1. Accounting happens per cgroup
+2. Each mm_struct knows about which cgroup it belongs to
+3. Each page has a pointer to the page_cgroup, which in turn knows the
+ cgroup it belongs to
+
+The accounting is done as follows: mem_cgroup_charge() is invoked to setup
+the necessary data structures and check if the cgroup that is being charged
+is over its limit. If it is then reclaim is invoked on the cgroup.
+More details can be found in the reclaim section of this document.
+If everything goes well, a page meta-data-structure called page_cgroup is
+allocated and associated with the page. This routine also adds the page to
+the per cgroup LRU.
+
+2.2.1 Accounting details
+
+All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
+(some pages which never be reclaimable and will not be on global LRU
+ are not accounted. we just accounts pages under usual vm management.)
+
+RSS pages are accounted at page_fault unless they've already been accounted
+for earlier. A file page will be accounted for as Page Cache when it's
+inserted into inode (radix-tree). While it's mapped into the page tables of
+processes, duplicate accounting is carefully avoided.
+
+A RSS page is unaccounted when it's fully unmapped. A PageCache page is
+unaccounted when it's removed from radix-tree.
+
+At page migration, accounting information is kept.
+
+Note: we just account pages-on-lru because our purpose is to control amount
+of used pages. not-on-lru pages are tend to be out-of-control from vm view.
+
+2.3 Shared Page Accounting
+
+Shared pages are accounted on the basis of the first touch approach. The
+cgroup that first touches a page is accounted for the page. The principle
+behind this approach is that a cgroup that aggressively uses a shared
+page will eventually get charged for it (once it is uncharged from
+the cgroup that brought it in -- this will happen on memory pressure).
+
+2.4 Reclaim
+
+Each cgroup maintains a per cgroup LRU that consists of an active
+and inactive list. When a cgroup goes over its limit, we first try
+to reclaim memory from the cgroup so as to make space for the new
+pages that the cgroup has touched. If the reclaim is unsuccessful,
+an OOM routine is invoked to select and kill the bulkiest task in the
+cgroup.
+
+The reclaim algorithm has not been modified for cgroups, except that
+pages that are selected for reclaiming come from the per cgroup LRU
+list.
+
+2. Locking
+
+The memory controller uses the following hierarchy
+
+1. zone->lru_lock is used for selecting pages to be isolated
+2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone)
+3. lock_page_cgroup() is used to protect page->page_cgroup
+
+3. User Interface
+
+0. Configuration
+
+a. Enable CONFIG_CGROUPS
+b. Enable CONFIG_RESOURCE_COUNTERS
+c. Enable CONFIG_CGROUP_MEM_RES_CTLR
+
+1. Prepare the cgroups
+# mkdir -p /cgroups
+# mount -t cgroup none /cgroups -o memory
+
+2. Make the new group and move bash into it
+# mkdir /cgroups/0
+# echo $$ > /cgroups/0/tasks
+
+Since now we're in the 0 cgroup,
+We can alter the memory limit:
+# echo 4M > /cgroups/0/memory.limit_in_bytes
+
+NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+mega or gigabytes.
+
+# cat /cgroups/0/memory.limit_in_bytes
+4194304
+
+NOTE: The interface has now changed to display the usage in bytes
+instead of pages
+
+We can check the usage:
+# cat /cgroups/0/memory.usage_in_bytes
+1216512
+
+A successful write to this file does not guarantee a successful set of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel.
+
+# echo 1 > memory.limit_in_bytes
+# cat memory.limit_in_bytes
+4096
+
+The memory.failcnt field gives the number of times that the cgroup limit was
+exceeded.
+
+The memory.stat file gives accounting information. Now, the number of
+caches, RSS and Active pages/Inactive pages are shown.
+
+The memory.force_empty gives an interface to drop *all* charges by force.
+
+# echo 1 > memory.force_empty
+
+will drop all charges in cgroup. Currently, this is maintained for test.
+
+4. Testing
+
+Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11].
+Apart from that v6 has been tested with several applications and regular
+daily use. The controller has also been tested on the PPC64, x86_64 and
+UML platforms.
+
+4.1 Troubleshooting
+
+Sometimes a user might find that the application under a cgroup is
+terminated. There are several causes for this:
+
+1. The cgroup limit is too low (just too low to do anything useful)
+2. The user is using anonymous memory and swap is turned off or too low
+
+A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
+some of the pages cached in the cgroup (page cache pages).
+
+4.2 Task migration
+
+When a task migrates from one cgroup to another, it's charge is not
+carried forward. The pages allocated from the original cgroup still
+remain charged to it, the charge is dropped when the page is freed or
+reclaimed.
+
+4.3 Removing a cgroup
+
+A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
+cgroup might have some charge associated with it, even though all
+tasks have migrated away from it. Such charges are automatically dropped at
+rmdir() if there are no tasks.
+
+5. TODO
+
+1. Add support for accounting huge pages (as a separate controller)
+2. Make per-cgroup scanner reclaim not-shared pages first
+3. Teach controller to account for shared-pages
+4. Start reclamation in the background when the limit is
+ not yet hit but the usage is getting closer
+
+Summary
+
+Overall, the memory controller has been a stable controller and has been
+commented and discussed quite extensively in the community.
+
+References
+
+1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
+2. Singh, Balbir. Memory Controller (RSS Control),
+ http://lwn.net/Articles/222762/
+3. Emelianov, Pavel. Resource controllers based on process cgroups
+ http://lkml.org/lkml/2007/3/6/198
+4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
+ http://lkml.org/lkml/2007/4/9/78
+5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
+ http://lkml.org/lkml/2007/5/30/244
+6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
+7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
+ subsystem (v3), http://lwn.net/Articles/235534/
+8. Singh, Balbir. RSS controller v2 test results (lmbench),
+ http://lkml.org/lkml/2007/5/17/232
+9. Singh, Balbir. RSS controller v2 AIM9 results
+ http://lkml.org/lkml/2007/5/18/1
+10. Singh, Balbir. Memory controller v6 test results,
+ http://lkml.org/lkml/2007/8/19/36
+11. Singh, Balbir. Memory controller introduction (v6),
+ http://lkml.org/lkml/2007/8/17/69
+12. Corbet, Jonathan, Controlling memory use in cgroups,
+ http://lwn.net/Articles/243795/
diff --git a/Documentation/controllers/resource_counter.txt b/Documentation/controllers/resource_counter.txt
new file mode 100644
index 0000000..f196ac1
--- /dev/null
+++ b/Documentation/controllers/resource_counter.txt
@@ -0,0 +1,181 @@
+
+ The Resource Counter
+
+The resource counter, declared at include/linux/res_counter.h,
+is supposed to facilitate the resource management by controllers
+by providing common stuff for accounting.
+
+This "stuff" includes the res_counter structure and routines
+to work with it.
+
+
+
+1. Crucial parts of the res_counter structure
+
+ a. unsigned long long usage
+
+ The usage value shows the amount of a resource that is consumed
+ by a group at a given time. The units of measurement should be
+ determined by the controller that uses this counter. E.g. it can
+ be bytes, items or any other unit the controller operates on.
+
+ b. unsigned long long max_usage
+
+ The maximal value of the usage over time.
+
+ This value is useful when gathering statistical information about
+ the particular group, as it shows the actual resource requirements
+ for a particular group, not just some usage snapshot.
+
+ c. unsigned long long limit
+
+ The maximal allowed amount of resource to consume by the group. In
+ case the group requests for more resources, so that the usage value
+ would exceed the limit, the resource allocation is rejected (see
+ the next section).
+
+ d. unsigned long long failcnt
+
+ The failcnt stands for "failures counter". This is the number of
+ resource allocation attempts that failed.
+
+ c. spinlock_t lock
+
+ Protects changes of the above values.
+
+
+
+2. Basic accounting routines
+
+ a. void res_counter_init(struct res_counter *rc)
+
+ Initializes the resource counter. As usual, should be the first
+ routine called for a new counter.
+
+ b. int res_counter_charge[_locked]
+ (struct res_counter *rc, unsigned long val)
+
+ When a resource is about to be allocated it has to be accounted
+ with the appropriate resource counter (controller should determine
+ which one to use on its own). This operation is called "charging".
+
+ This is not very important which operation - resource allocation
+ or charging - is performed first, but
+ * if the allocation is performed first, this may create a
+ temporary resource over-usage by the time resource counter is
+ charged;
+ * if the charging is performed first, then it should be uncharged
+ on error path (if the one is called).
+
+ c. void res_counter_uncharge[_locked]
+ (struct res_counter *rc, unsigned long val)
+
+ When a resource is released (freed) it should be de-accounted
+ from the resource counter it was accounted to. This is called
+ "uncharging".
+
+ The _locked routines imply that the res_counter->lock is taken.
+
+
+ 2.1 Other accounting routines
+
+ There are more routines that may help you with common needs, like
+ checking whether the limit is reached or resetting the max_usage
+ value. They are all declared in include/linux/res_counter.h.
+
+
+
+3. Analyzing the resource counter registrations
+
+ a. If the failcnt value constantly grows, this means that the counter's
+ limit is too tight. Either the group is misbehaving and consumes too
+ many resources, or the configuration is not suitable for the group
+ and the limit should be increased.
+
+ b. The max_usage value can be used to quickly tune the group. One may
+ set the limits to maximal values and either load the container with
+ a common pattern or leave one for a while. After this the max_usage
+ value shows the amount of memory the container would require during
+ its common activity.
+
+ Setting the limit a bit above this value gives a pretty good
+ configuration that works in most of the cases.
+
+ c. If the max_usage is much less than the limit, but the failcnt value
+ is growing, then the group tries to allocate a big chunk of resource
+ at once.
+
+ d. If the max_usage is much less than the limit, but the failcnt value
+ is 0, then this group is given too high limit, that it does not
+ require. It is better to lower the limit a bit leaving more resource
+ for other groups.
+
+
+
+4. Communication with the control groups subsystem (cgroups)
+
+All the resource controllers that are using cgroups and resource counters
+should provide files (in the cgroup filesystem) to work with the resource
+counter fields. They are recommended to adhere to the following rules:
+
+ a. File names
+
+ Field name File name
+ ---------------------------------------------------
+ usage usage_in_<unit_of_measurement>
+ max_usage max_usage_in_<unit_of_measurement>
+ limit limit_in_<unit_of_measurement>
+ failcnt failcnt
+ lock no file :)
+
+ b. Reading from file should show the corresponding field value in the
+ appropriate format.
+
+ c. Writing to file
+
+ Field Expected behavior
+ ----------------------------------
+ usage prohibited
+ max_usage reset to usage
+ limit set the limit
+ failcnt reset to zero
+
+
+
+5. Usage example
+
+ a. Declare a task group (take a look at cgroups subsystem for this) and
+ fold a res_counter into it
+
+ struct my_group {
+ struct res_counter res;
+
+ <other fields>
+ }
+
+ b. Put hooks in resource allocation/release paths
+
+ int alloc_something(...)
+ {
+ if (res_counter_charge(res_counter_ptr, amount) < 0)
+ return -ENOMEM;
+
+ <allocate the resource and return to the caller>
+ }
+
+ void release_something(...)
+ {
+ res_counter_uncharge(res_counter_ptr, amount);
+
+ <release the resource>
+ }
+
+ In order to keep the usage value self-consistent, both the
+ "res_counter_ptr" and the "amount" in release_something() should be
+ the same as they were in the alloc_something() when the releasing
+ resource was allocated.
+
+ c. Provide the way to read res_counter values and set them (the cgroups
+ still can help with it).
+
+ c. Compile and run :)
diff --git a/Documentation/cpu-freq/amd-powernow.txt b/Documentation/cpu-freq/amd-powernow.txt
new file mode 100644
index 0000000..254da15
--- /dev/null
+++ b/Documentation/cpu-freq/amd-powernow.txt
@@ -0,0 +1,38 @@
+
+PowerNow! and Cool'n'Quiet are AMD names for frequency
+management capabilities in AMD processors. As the hardware
+implementation changes in new generations of the processors,
+there is a different cpu-freq driver for each generation.
+
+Note that the driver's will not load on the "wrong" hardware,
+so it is safe to try each driver in turn when in doubt as to
+which is the correct driver.
+
+Note that the functionality to change frequency (and voltage)
+is not available in all processors. The drivers will refuse
+to load on processors without this capability. The capability
+is detected with the cpuid instruction.
+
+The drivers use BIOS supplied tables to obtain frequency and
+voltage information appropriate for a particular platform.
+Frequency transitions will be unavailable if the BIOS does
+not supply these tables.
+
+6th Generation: powernow-k6
+
+7th Generation: powernow-k7: Athlon, Duron, Geode.
+
+8th Generation: powernow-k8: Athlon, Athlon 64, Opteron, Sempron.
+Documentation on this functionality in 8th generation processors
+is available in the "BIOS and Kernel Developer's Guide", publication
+26094, in chapter 9, available for download from www.amd.com.
+
+BIOS supplied data, for powernow-k7 and for powernow-k8, may be
+from either the PSB table or from ACPI objects. The ACPI support
+is only available if the kernel config sets CONFIG_ACPI_PROCESSOR.
+The powernow-k8 driver will attempt to use ACPI if so configured,
+and fall back to PST if that fails.
+The powernow-k7 driver will try to use the PSB support first, and
+fall back to ACPI if the PSB support fails. A module parameter,
+acpi_force, is provided to force ACPI support to be used instead
+of PSB support.
diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt
new file mode 100644
index 0000000..ce0666e
--- /dev/null
+++ b/Documentation/cpu-freq/core.txt
@@ -0,0 +1,98 @@
+ CPU frequency and voltage scaling code in the Linux(TM) kernel
+
+
+ L i n u x C P U F r e q
+
+ C P U F r e q C o r e
+
+
+ Dominik Brodowski <linux@brodo.de>
+ David Kimdon <dwhedon@debian.org>
+
+
+
+ Clock scaling allows you to change the clock speed of the CPUs on the
+ fly. This is a nice method to save battery power, because the lower
+ the clock speed, the less power the CPU consumes.
+
+
+Contents:
+---------
+1. CPUFreq core and interfaces
+2. CPUFreq notifiers
+
+1. General Information
+=======================
+
+The CPUFreq core code is located in drivers/cpufreq/cpufreq.c. This
+cpufreq code offers a standardized interface for the CPUFreq
+architecture drivers (those pieces of code that do actual
+frequency transitions), as well as to "notifiers". These are device
+drivers or other part of the kernel that need to be informed of
+policy changes (ex. thermal modules like ACPI) or of all
+frequency changes (ex. timing code) or even need to force certain
+speed limits (like LCD drivers on ARM architecture). Additionally, the
+kernel "constant" loops_per_jiffy is updated on frequency changes
+here.
+
+Reference counting is done by cpufreq_get_cpu and cpufreq_put_cpu,
+which make sure that the cpufreq processor driver is correctly
+registered with the core, and will not be unloaded until
+cpufreq_put_cpu is called.
+
+2. CPUFreq notifiers
+====================
+
+CPUFreq notifiers conform to the standard kernel notifier interface.
+See linux/include/linux/notifier.h for details on notifiers.
+
+There are two different CPUFreq notifiers - policy notifiers and
+transition notifiers.
+
+
+2.1 CPUFreq policy notifiers
+----------------------------
+
+These are notified when a new policy is intended to be set. Each
+CPUFreq policy notifier is called three times for a policy transition:
+
+1.) During CPUFREQ_ADJUST all CPUFreq notifiers may change the limit if
+ they see a need for this - may it be thermal considerations or
+ hardware limitations.
+
+2.) During CPUFREQ_INCOMPATIBLE only changes may be done in order to avoid
+ hardware failure.
+
+3.) And during CPUFREQ_NOTIFY all notifiers are informed of the new policy
+ - if two hardware drivers failed to agree on a new policy before this
+ stage, the incompatible hardware shall be shut down, and the user
+ informed of this.
+
+The phase is specified in the second argument to the notifier.
+
+The third argument, a void *pointer, points to a struct cpufreq_policy
+consisting of five values: cpu, min, max, policy and max_cpu_freq. min
+and max are the lower and upper frequencies (in kHz) of the new
+policy, policy the new policy, cpu the number of the affected CPU; and
+max_cpu_freq the maximum supported CPU frequency. This value is given
+for informational purposes only.
+
+
+2.2 CPUFreq transition notifiers
+--------------------------------
+
+These are notified twice when the CPUfreq driver switches the CPU core
+frequency and this change has any external implications.
+
+The second argument specifies the phase - CPUFREQ_PRECHANGE or
+CPUFREQ_POSTCHANGE.
+
+The third argument is a struct cpufreq_freqs with the following
+values:
+cpu - number of the affected CPU
+old - old frequency
+new - new frequency
+
+If the cpufreq core detects the frequency has changed while the system
+was suspended, these notifiers are called with CPUFREQ_RESUMECHANGE as
+second argument.
diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt
new file mode 100644
index 0000000..43c7439
--- /dev/null
+++ b/Documentation/cpu-freq/cpu-drivers.txt
@@ -0,0 +1,216 @@
+ CPU frequency and voltage scaling code in the Linux(TM) kernel
+
+
+ L i n u x C P U F r e q
+
+ C P U D r i v e r s
+
+ - information for developers -
+
+
+ Dominik Brodowski <linux@brodo.de>
+
+
+
+ Clock scaling allows you to change the clock speed of the CPUs on the
+ fly. This is a nice method to save battery power, because the lower
+ the clock speed, the less power the CPU consumes.
+
+
+Contents:
+---------
+1. What To Do?
+1.1 Initialization
+1.2 Per-CPU Initialization
+1.3 verify
+1.4 target or setpolicy?
+1.5 target
+1.6 setpolicy
+2. Frequency Table Helpers
+
+
+
+1. What To Do?
+==============
+
+So, you just got a brand-new CPU / chipset with datasheets and want to
+add cpufreq support for this CPU / chipset? Great. Here are some hints
+on what is necessary:
+
+
+1.1 Initialization
+------------------
+
+First of all, in an __initcall level 7 (module_init()) or later
+function check whether this kernel runs on the right CPU and the right
+chipset. If so, register a struct cpufreq_driver with the CPUfreq core
+using cpufreq_register_driver()
+
+What shall this struct cpufreq_driver contain?
+
+cpufreq_driver.name - The name of this driver.
+
+cpufreq_driver.owner - THIS_MODULE;
+
+cpufreq_driver.init - A pointer to the per-CPU initialization
+ function.
+
+cpufreq_driver.verify - A pointer to a "verification" function.
+
+cpufreq_driver.setpolicy _or_
+cpufreq_driver.target - See below on the differences.
+
+And optionally
+
+cpufreq_driver.exit - A pointer to a per-CPU cleanup function.
+
+cpufreq_driver.resume - A pointer to a per-CPU resume function
+ which is called with interrupts disabled
+ and _before_ the pre-suspend frequency
+ and/or policy is restored by a call to
+ ->target or ->setpolicy.
+
+cpufreq_driver.attr - A pointer to a NULL-terminated list of
+ "struct freq_attr" which allow to
+ export values to sysfs.
+
+
+1.2 Per-CPU Initialization
+--------------------------
+
+Whenever a new CPU is registered with the device model, or after the
+cpufreq driver registers itself, the per-CPU initialization function
+cpufreq_driver.init is called. It takes a struct cpufreq_policy
+*policy as argument. What to do now?
+
+If necessary, activate the CPUfreq support on your CPU.
+
+Then, the driver must fill in the following values:
+
+policy->cpuinfo.min_freq _and_
+policy->cpuinfo.max_freq - the minimum and maximum frequency
+ (in kHz) which is supported by
+ this CPU
+policy->cpuinfo.transition_latency the time it takes on this CPU to
+ switch between two frequencies (if
+ appropriate, else specify
+ CPUFREQ_ETERNAL)
+
+policy->cur The current operating frequency of
+ this CPU (if appropriate)
+policy->min,
+policy->max,
+policy->policy and, if necessary,
+policy->governor must contain the "default policy" for
+ this CPU. A few moments later,
+ cpufreq_driver.verify and either
+ cpufreq_driver.setpolicy or
+ cpufreq_driver.target is called with
+ these values.
+
+For setting some of these values, the frequency table helpers might be
+helpful. See the section 2 for more information on them.
+
+
+1.3 verify
+------------
+
+When the user decides a new policy (consisting of
+"policy,governor,min,max") shall be set, this policy must be validated
+so that incompatible values can be corrected. For verifying these
+values, a frequency table helper and/or the
+cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned
+int min_freq, unsigned int max_freq) function might be helpful. See
+section 2 for details on frequency table helpers.
+
+You need to make sure that at least one valid frequency (or operating
+range) is within policy->min and policy->max. If necessary, increase
+policy->max first, and only if this is no solution, decrease policy->min.
+
+
+1.4 target or setpolicy?
+----------------------------
+
+Most cpufreq drivers or even most cpu frequency scaling algorithms
+only allow the CPU to be set to one frequency. For these, you use the
+->target call.
+
+Some cpufreq-capable processors switch the frequency between certain
+limits on their own. These shall use the ->setpolicy call
+
+
+1.4. target
+-------------
+
+The target call has three arguments: struct cpufreq_policy *policy,
+unsigned int target_frequency, unsigned int relation.
+
+The CPUfreq driver must set the new frequency when called here. The
+actual frequency must be determined using the following rules:
+
+- keep close to "target_freq"
+- policy->min <= new_freq <= policy->max (THIS MUST BE VALID!!!)
+- if relation==CPUFREQ_REL_L, try to select a new_freq higher than or equal
+ target_freq. ("L for lowest, but no lower than")
+- if relation==CPUFREQ_REL_H, try to select a new_freq lower than or equal
+ target_freq. ("H for highest, but no higher than")
+
+Here again the frequency table helper might assist you - see section 3
+for details.
+
+
+1.5 setpolicy
+---------------
+
+The setpolicy call only takes a struct cpufreq_policy *policy as
+argument. You need to set the lower limit of the in-processor or
+in-chipset dynamic frequency switching to policy->min, the upper limit
+to policy->max, and -if supported- select a performance-oriented
+setting when policy->policy is CPUFREQ_POLICY_PERFORMANCE, and a
+powersaving-oriented setting when CPUFREQ_POLICY_POWERSAVE. Also check
+the reference implementation in arch/i386/kernel/cpu/cpufreq/longrun.c
+
+
+
+2. Frequency Table Helpers
+==========================
+
+As most cpufreq processors only allow for being set to a few specific
+frequencies, a "frequency table" with some functions might assist in
+some work of the processor driver. Such a "frequency table" consists
+of an array of struct cpufreq_freq_table entries, with any value in
+"index" you want to use, and the corresponding frequency in
+"frequency". At the end of the table, you need to add a
+cpufreq_freq_table entry with frequency set to CPUFREQ_TABLE_END. And
+if you want to skip one entry in the table, set the frequency to
+CPUFREQ_ENTRY_INVALID. The entries don't need to be in ascending
+order.
+
+By calling cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
+ struct cpufreq_frequency_table *table);
+the cpuinfo.min_freq and cpuinfo.max_freq values are detected, and
+policy->min and policy->max are set to the same values. This is
+helpful for the per-CPU initialization stage.
+
+int cpufreq_frequency_table_verify(struct cpufreq_policy *policy,
+ struct cpufreq_frequency_table *table);
+assures that at least one valid frequency is within policy->min and
+policy->max, and all other criteria are met. This is helpful for the
+->verify call.
+
+int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
+ struct cpufreq_frequency_table *table,
+ unsigned int target_freq,
+ unsigned int relation,
+ unsigned int *index);
+
+is the corresponding frequency table helper for the ->target
+stage. Just pass the values to this function, and the unsigned int
+index returns the number of the frequency table entry which contains
+the frequency the CPU shall be set to. PLEASE NOTE: This is not the
+"index" which is in this cpufreq_table_entry.index, but instead
+cpufreq_table[index]. So, the new frequency is
+cpufreq_table[index].frequency, and the value you stored into the
+frequency table "index" field is
+cpufreq_table[index].index.
+
diff --git a/Documentation/cpu-freq/cpufreq-nforce2.txt b/Documentation/cpu-freq/cpufreq-nforce2.txt
new file mode 100644
index 0000000..babce13
--- /dev/null
+++ b/Documentation/cpu-freq/cpufreq-nforce2.txt
@@ -0,0 +1,19 @@
+
+The cpufreq-nforce2 driver changes the FSB on nVidia nForce2 platforms.
+
+This works better than on other platforms, because the FSB of the CPU
+can be controlled independently from the PCI/AGP clock.
+
+The module has two options:
+
+ fid: multiplier * 10 (for example 8.5 = 85)
+ min_fsb: minimum FSB
+
+If not set, fid is calculated from the current CPU speed and the FSB.
+min_fsb defaults to FSB at boot time - 50 MHz.
+
+IMPORTANT: The available range is limited downwards!
+ Also the minimum available FSB can differ, for systems
+ booting with 200 MHz, 150 should always work.
+
+
diff --git a/Documentation/cpu-freq/cpufreq-stats.txt b/Documentation/cpu-freq/cpufreq-stats.txt
new file mode 100644
index 0000000..fc64749
--- /dev/null
+++ b/Documentation/cpu-freq/cpufreq-stats.txt
@@ -0,0 +1,128 @@
+
+ CPU frequency and voltage scaling statistics in the Linux(TM) kernel
+
+
+ L i n u x c p u f r e q - s t a t s d r i v e r
+
+ - information for users -
+
+
+ Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+
+Contents
+1. Introduction
+2. Statistics Provided (with example)
+3. Configuring cpufreq-stats
+
+
+1. Introduction
+
+cpufreq-stats is a driver that provides CPU frequency statistics for each CPU.
+These statistics are provided in /sysfs as a bunch of read_only interfaces. This
+interface (when configured) will appear in a separate directory under cpufreq
+in /sysfs (<sysfs root>/devices/system/cpu/cpuX/cpufreq/stats/) for each CPU.
+Various statistics will form read_only files under this directory.
+
+This driver is designed to be independent of any particular cpufreq_driver
+that may be running on your CPU. So, it will work with any cpufreq_driver.
+
+
+2. Statistics Provided (with example)
+
+cpufreq stats provides following statistics (explained in detail below).
+- time_in_state
+- total_trans
+- trans_table
+
+All the statistics will be from the time the stats driver has been inserted
+to the time when a read of a particular statistic is done. Obviously, stats
+driver will not have any information about the frequency transitions before
+the stats driver insertion.
+
+--------------------------------------------------------------------------------
+<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # ls -l
+total 0
+drwxr-xr-x 2 root root 0 May 14 16:06 .
+drwxr-xr-x 3 root root 0 May 14 15:58 ..
+-r--r--r-- 1 root root 4096 May 14 16:06 time_in_state
+-r--r--r-- 1 root root 4096 May 14 16:06 total_trans
+-r--r--r-- 1 root root 4096 May 14 16:06 trans_table
+--------------------------------------------------------------------------------
+
+- time_in_state
+This gives the amount of time spent in each of the frequencies supported by
+this CPU. The cat output will have "<frequency> <time>" pair in each line, which
+will mean this CPU spent <time> usertime units of time at <frequency>. Output
+will have one line for each of the supported frequencies. usertime units here
+is 10mS (similar to other time exported in /proc).
+
+--------------------------------------------------------------------------------
+<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat time_in_state
+3600000 2089
+3400000 136
+3200000 34
+3000000 67
+2800000 172488
+--------------------------------------------------------------------------------
+
+
+- total_trans
+This gives the total number of frequency transitions on this CPU. The cat
+output will have a single count which is the total number of frequency
+transitions.
+
+--------------------------------------------------------------------------------
+<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat total_trans
+20
+--------------------------------------------------------------------------------
+
+- trans_table
+This will give a fine grained information about all the CPU frequency
+transitions. The cat output here is a two dimensional matrix, where an entry
+<i,j> (row i, column j) represents the count of number of transitions from
+Freq_i to Freq_j. Freq_i is in descending order with increasing rows and
+Freq_j is in descending order with increasing columns. The output here also
+contains the actual freq values for each row and column for better readability.
+
+--------------------------------------------------------------------------------
+<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat trans_table
+ From : To
+ : 3600000 3400000 3200000 3000000 2800000
+ 3600000: 0 5 0 0 0
+ 3400000: 4 0 2 0 0
+ 3200000: 0 1 0 2 0
+ 3000000: 0 0 1 0 3
+ 2800000: 0 0 0 2 0
+--------------------------------------------------------------------------------
+
+
+3. Configuring cpufreq-stats
+
+To configure cpufreq-stats in your kernel
+Config Main Menu
+ Power management options (ACPI, APM) --->
+ CPU Frequency scaling --->
+ [*] CPU Frequency scaling
+ <*> CPU frequency translation statistics
+ [*] CPU frequency translation statistics details
+
+
+"CPU Frequency scaling" (CONFIG_CPU_FREQ) should be enabled to configure
+cpufreq-stats.
+
+"CPU frequency translation statistics" (CONFIG_CPU_FREQ_STAT) provides the
+basic statistics which includes time_in_state and total_trans.
+
+"CPU frequency translation statistics details" (CONFIG_CPU_FREQ_STAT_DETAILS)
+provides fine grained cpufreq stats by trans_table. The reason for having a
+separate config option for trans_table is:
+- trans_table goes against the traditional /sysfs rule of one value per
+ interface. It provides a whole bunch of value in a 2 dimensional matrix
+ form.
+
+Once these two options are enabled and your CPU supports cpufrequency, you
+will be able to see the CPU frequency statistics in /sysfs.
+
+
+
+
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
new file mode 100644
index 0000000..5b0cfa6
--- /dev/null
+++ b/Documentation/cpu-freq/governors.txt
@@ -0,0 +1,213 @@
+ CPU frequency and voltage scaling code in the Linux(TM) kernel
+
+
+ L i n u x C P U F r e q
+
+ C P U F r e q G o v e r n o r s
+
+ - information for users and developers -
+
+
+ Dominik Brodowski <linux@brodo.de>
+ some additions and corrections by Nico Golde <nico@ngolde.de>
+
+
+
+ Clock scaling allows you to change the clock speed of the CPUs on the
+ fly. This is a nice method to save battery power, because the lower
+ the clock speed, the less power the CPU consumes.
+
+
+Contents:
+---------
+1. What is a CPUFreq Governor?
+
+2. Governors In the Linux Kernel
+2.1 Performance
+2.2 Powersave
+2.3 Userspace
+2.4 Ondemand
+2.5 Conservative
+
+3. The Governor Interface in the CPUfreq Core
+
+
+
+1. What Is A CPUFreq Governor?
+==============================
+
+Most cpufreq drivers (in fact, all except one, longrun) or even most
+cpu frequency scaling algorithms only offer the CPU to be set to one
+frequency. In order to offer dynamic frequency scaling, the cpufreq
+core must be able to tell these drivers of a "target frequency". So
+these specific drivers will be transformed to offer a "->target"
+call instead of the existing "->setpolicy" call. For "longrun", all
+stays the same, though.
+
+How to decide what frequency within the CPUfreq policy should be used?
+That's done using "cpufreq governors". Two are already in this patch
+-- they're the already existing "powersave" and "performance" which
+set the frequency statically to the lowest or highest frequency,
+respectively. At least two more such governors will be ready for
+addition in the near future, but likely many more as there are various
+different theories and models about dynamic frequency scaling
+around. Using such a generic interface as cpufreq offers to scaling
+governors, these can be tested extensively, and the best one can be
+selected for each specific use.
+
+Basically, it's the following flow graph:
+
+CPU can be set to switch independently | CPU can only be set
+ within specific "limits" | to specific frequencies
+
+ "CPUfreq policy"
+ consists of frequency limits (policy->{min,max})
+ and CPUfreq governor to be used
+ / \
+ / \
+ / the cpufreq governor decides
+ / (dynamically or statically)
+ / what target_freq to set within
+ / the limits of policy->{min,max}
+ / \
+ / \
+ Using the ->setpolicy call, Using the ->target call,
+ the limits and the the frequency closest
+ "policy" is set. to target_freq is set.
+ It is assured that it
+ is within policy->{min,max}
+
+
+2. Governors In the Linux Kernel
+================================
+
+2.1 Performance
+---------------
+
+The CPUfreq governor "performance" sets the CPU statically to the
+highest frequency within the borders of scaling_min_freq and
+scaling_max_freq.
+
+
+2.2 Powersave
+-------------
+
+The CPUfreq governor "powersave" sets the CPU statically to the
+lowest frequency within the borders of scaling_min_freq and
+scaling_max_freq.
+
+
+2.3 Userspace
+-------------
+
+The CPUfreq governor "userspace" allows the user, or any userspace
+program running with UID "root", to set the CPU to a specific frequency
+by making a sysfs file "scaling_setspeed" available in the CPU-device
+directory.
+
+
+2.4 Ondemand
+------------
+
+The CPUfreq governor "ondemand" sets the CPU depending on the
+current usage. To do this the CPU must have the capability to
+switch the frequency very quickly. There are a number of sysfs file
+accessible parameters:
+
+sampling_rate: measured in uS (10^-6 seconds), this is how often you
+want the kernel to look at the CPU usage and to make decisions on
+what to do about the frequency. Typically this is set to values of
+around '10000' or more.
+
+show_sampling_rate_(min|max): the minimum and maximum sampling rates
+available that you may set 'sampling_rate' to.
+
+up_threshold: defines what the average CPU usage between the samplings
+of 'sampling_rate' needs to be for the kernel to make a decision on
+whether it should increase the frequency. For example when it is set
+to its default value of '80' it means that between the checking
+intervals the CPU needs to be on average more than 80% in use to then
+decide that the CPU frequency needs to be increased.
+
+ignore_nice_load: this parameter takes a value of '0' or '1'. When
+set to '0' (its default), all processes are counted towards the
+'cpu utilisation' value. When set to '1', the processes that are
+run with a 'nice' value will not count (and thus be ignored) in the
+overall usage calculation. This is useful if you are running a CPU
+intensive calculation on your laptop that you do not care how long it
+takes to complete as you can 'nice' it and prevent it from taking part
+in the deciding process of whether to increase your CPU frequency.
+
+
+2.5 Conservative
+----------------
+
+The CPUfreq governor "conservative", much like the "ondemand"
+governor, sets the CPU depending on the current usage. It differs in
+behaviour in that it gracefully increases and decreases the CPU speed
+rather than jumping to max speed the moment there is any load on the
+CPU. This behaviour more suitable in a battery powered environment.
+The governor is tweaked in the same manner as the "ondemand" governor
+through sysfs with the addition of:
+
+freq_step: this describes what percentage steps the cpu freq should be
+increased and decreased smoothly by. By default the cpu frequency will
+increase in 5% chunks of your maximum cpu frequency. You can change this
+value to anywhere between 0 and 100 where '0' will effectively lock your
+CPU at a speed regardless of its load whilst '100' will, in theory, make
+it behave identically to the "ondemand" governor.
+
+down_threshold: same as the 'up_threshold' found for the "ondemand"
+governor but for the opposite direction. For example when set to its
+default value of '20' it means that if the CPU usage needs to be below
+20% between samples to have the frequency decreased.
+
+3. The Governor Interface in the CPUfreq Core
+=============================================
+
+A new governor must register itself with the CPUfreq core using
+"cpufreq_register_governor". The struct cpufreq_governor, which has to
+be passed to that function, must contain the following values:
+
+governor->name - A unique name for this governor
+governor->governor - The governor callback function
+governor->owner - .THIS_MODULE for the governor module (if
+ appropriate)
+
+The governor->governor callback is called with the current (or to-be-set)
+cpufreq_policy struct for that CPU, and an unsigned int event. The
+following events are currently defined:
+
+CPUFREQ_GOV_START: This governor shall start its duty for the CPU
+ policy->cpu
+CPUFREQ_GOV_STOP: This governor shall end its duty for the CPU
+ policy->cpu
+CPUFREQ_GOV_LIMITS: The limits for CPU policy->cpu have changed to
+ policy->min and policy->max.
+
+If you need other "events" externally of your driver, _only_ use the
+cpufreq_governor_l(unsigned int cpu, unsigned int event) call to the
+CPUfreq core to ensure proper locking.
+
+
+The CPUfreq governor may call the CPU processor driver using one of
+these two functions:
+
+int cpufreq_driver_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation);
+
+int __cpufreq_driver_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation);
+
+target_freq must be within policy->min and policy->max, of course.
+What's the difference between these two functions? When your governor
+still is in a direct code path of a call to governor->governor, the
+per-CPU cpufreq lock is still held in the cpufreq core, and there's
+no need to lock it again (in fact, this would cause a deadlock). So
+use __cpufreq_driver_target only in these cases. In all other cases
+(for example, when there's a "daemonized" function that wakes up
+every second), use cpufreq_driver_target to lock the cpufreq per-CPU
+lock before the command is passed to the cpufreq processor driver.
+
diff --git a/Documentation/cpu-freq/index.txt b/Documentation/cpu-freq/index.txt
new file mode 100644
index 0000000..3d0b915
--- /dev/null
+++ b/Documentation/cpu-freq/index.txt
@@ -0,0 +1,54 @@
+ CPU frequency and voltage scaling code in the Linux(TM) kernel
+
+
+ L i n u x C P U F r e q
+
+
+
+
+ Dominik Brodowski <linux@brodo.de>
+
+
+
+ Clock scaling allows you to change the clock speed of the CPUs on the
+ fly. This is a nice method to save battery power, because the lower
+ the clock speed, the less power the CPU consumes.
+
+
+
+Documents in this directory:
+----------------------------
+core.txt - General description of the CPUFreq core and
+ of CPUFreq notifiers
+
+cpu-drivers.txt - How to implement a new cpufreq processor driver
+
+governors.txt - What are cpufreq governors and how to
+ implement them?
+
+index.txt - File index, Mailing list and Links (this document)
+
+user-guide.txt - User Guide to CPUFreq
+
+
+Mailing List
+------------
+There is a CPU frequency changing CVS commit and general list where
+you can report bugs, problems or submit patches. To post a message,
+send an email to cpufreq@vger.kernel.org, to subscribe go to
+http://vger.kernel.org/vger-lists.html#cpufreq and follow the
+instructions there.
+
+Links
+-----
+the FTP archives:
+* ftp://ftp.linux.org.uk/pub/linux/cpufreq/
+
+how to access the CVS repository:
+* http://cvs.arm.linux.org.uk/
+
+the CPUFreq Mailing list:
+* http://vger.kernel.org/vger-lists.html#cpufreq
+
+Clock and voltage scaling for the SA-1100:
+* http://www.lartmaker.nl/projects/scaling
diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt
new file mode 100644
index 0000000..4f3f384
--- /dev/null
+++ b/Documentation/cpu-freq/user-guide.txt
@@ -0,0 +1,215 @@
+ CPU frequency and voltage scaling code in the Linux(TM) kernel
+
+
+ L i n u x C P U F r e q
+
+ U S E R G U I D E
+
+
+ Dominik Brodowski <linux@brodo.de>
+
+
+
+ Clock scaling allows you to change the clock speed of the CPUs on the
+ fly. This is a nice method to save battery power, because the lower
+ the clock speed, the less power the CPU consumes.
+
+
+Contents:
+---------
+1. Supported Architectures and Processors
+1.1 ARM
+1.2 x86
+1.3 sparc64
+1.4 ppc
+1.5 SuperH
+1.6 Blackfin
+
+2. "Policy" / "Governor"?
+2.1 Policy
+2.2 Governor
+
+3. How to change the CPU cpufreq policy and/or speed
+3.1 Preferred interface: sysfs
+3.2 Deprecated interfaces
+
+
+
+1. Supported Architectures and Processors
+=========================================
+
+1.1 ARM
+-------
+
+The following ARM processors are supported by cpufreq:
+
+ARM Integrator
+ARM-SA1100
+ARM-SA1110
+Intel PXA
+
+
+1.2 x86
+-------
+
+The following processors for the x86 architecture are supported by cpufreq:
+
+AMD Elan - SC400, SC410
+AMD mobile K6-2+
+AMD mobile K6-3+
+AMD mobile Duron
+AMD mobile Athlon
+AMD Opteron
+AMD Athlon 64
+Cyrix Media GXm
+Intel mobile PIII and Intel mobile PIII-M on certain chipsets
+Intel Pentium 4, Intel Xeon
+Intel Pentium M (Centrino)
+National Semiconductors Geode GX
+Transmeta Crusoe
+Transmeta Efficeon
+VIA Cyrix 3 / C3
+various processors on some ACPI 2.0-compatible systems [*]
+
+[*] Only if "ACPI Processor Performance States" are available
+to the ACPI<->BIOS interface.
+
+
+1.3 sparc64
+-----------
+
+The following processors for the sparc64 architecture are supported by
+cpufreq:
+
+UltraSPARC-III
+
+
+1.4 ppc
+-------
+
+Several "PowerBook" and "iBook2" notebooks are supported.
+
+
+1.5 SuperH
+----------
+
+The following SuperH processors are supported by cpufreq:
+
+SH-3
+SH-4
+
+1.6 Blackfin
+------------
+
+The following Blackfin processors are supported by cpufreq:
+
+BF522, BF523, BF524, BF525, BF526, BF527, Rev 0.1 or higher
+BF531, BF532, BF533, Rev 0.3 or higher
+BF534, BF536, BF537, Rev 0.2 or higher
+BF561, Rev 0.3 or higher
+BF542, BF544, BF547, BF548, BF549, Rev 0.1 or higher
+
+
+2. "Policy" / "Governor" ?
+==========================
+
+Some CPU frequency scaling-capable processor switch between various
+frequencies and operating voltages "on the fly" without any kernel or
+user involvement. This guarantees very fast switching to a frequency
+which is high enough to serve the user's needs, but low enough to save
+power.
+
+
+2.1 Policy
+----------
+
+On these systems, all you can do is select the lower and upper
+frequency limit as well as whether you want more aggressive
+power-saving or more instantly available processing power.
+
+
+2.2 Governor
+------------
+
+On all other cpufreq implementations, these boundaries still need to
+be set. Then, a "governor" must be selected. Such a "governor" decides
+what speed the processor shall run within the boundaries. One such
+"governor" is the "userspace" governor. This one allows the user - or
+a yet-to-implement userspace program - to decide what specific speed
+the processor shall run at.
+
+
+3. How to change the CPU cpufreq policy and/or speed
+====================================================
+
+3.1 Preferred Interface: sysfs
+------------------------------
+
+The preferred interface is located in the sysfs filesystem. If you
+mounted it at /sys, the cpufreq interface is located in a subdirectory
+"cpufreq" within the cpu-device directory
+(e.g. /sys/devices/system/cpu/cpu0/cpufreq/ for the first CPU).
+
+cpuinfo_min_freq : this file shows the minimum operating
+ frequency the processor can run at(in kHz)
+cpuinfo_max_freq : this file shows the maximum operating
+ frequency the processor can run at(in kHz)
+scaling_driver : this file shows what cpufreq driver is
+ used to set the frequency on this CPU
+
+scaling_available_governors : this file shows the CPUfreq governors
+ available in this kernel. You can see the
+ currently activated governor in
+
+scaling_governor, and by "echoing" the name of another
+ governor you can change it. Please note
+ that some governors won't load - they only
+ work on some specific architectures or
+ processors.
+
+cpuinfo_cur_freq : Current speed of the CPU, in KHz.
+
+scaling_available_frequencies : List of available frequencies, in KHz.
+
+scaling_min_freq and
+scaling_max_freq show the current "policy limits" (in
+ kHz). By echoing new values into these
+ files, you can change these limits.
+ NOTE: when setting a policy you need to
+ first set scaling_max_freq, then
+ scaling_min_freq.
+
+affected_cpus : List of CPUs that require software coordination
+ of frequency.
+
+related_cpus : List of CPUs that need some sort of frequency
+ coordination, whether software or hardware.
+
+scaling_driver : Hardware driver for cpufreq.
+
+scaling_cur_freq : Current frequency of the CPU, in KHz.
+
+If you have selected the "userspace" governor which allows you to
+set the CPU operating frequency to a specific value, you can read out
+the current frequency in
+
+scaling_setspeed. By "echoing" a new frequency into this
+ you can change the speed of the CPU,
+ but only within the limits of
+ scaling_min_freq and scaling_max_freq.
+
+
+3.2 Deprecated Interfaces
+-------------------------
+
+Depending on your kernel configuration, you might find the following
+cpufreq-related files:
+/proc/cpufreq
+/proc/sys/cpu/*/speed
+/proc/sys/cpu/*/speed-min
+/proc/sys/cpu/*/speed-max
+
+These are files for deprecated interfaces to cpufreq, which offer far
+less functionality. Because of this, these interfaces aren't described
+here.
+
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
new file mode 100644
index 0000000..94bbc27
--- /dev/null
+++ b/Documentation/cpu-hotplug.txt
@@ -0,0 +1,389 @@
+ CPU hotplug Support in Linux(tm) Kernel
+
+ Maintainers:
+ CPU Hotplug Core:
+ Rusty Russell <rusty@rustycorp.com.au>
+ Srivatsa Vaddagiri <vatsa@in.ibm.com>
+ i386:
+ Zwane Mwaikambo <zwane@arm.linux.org.uk>
+ ppc64:
+ Nathan Lynch <nathanl@austin.ibm.com>
+ Joel Schopp <jschopp@austin.ibm.com>
+ ia64/x86_64:
+ Ashok Raj <ashok.raj@intel.com>
+ s390:
+ Heiko Carstens <heiko.carstens@de.ibm.com>
+
+Authors: Ashok Raj <ashok.raj@intel.com>
+Lots of feedback: Nathan Lynch <nathanl@austin.ibm.com>,
+ Joel Schopp <jschopp@austin.ibm.com>
+
+Introduction
+
+Modern advances in system architectures have introduced advanced error
+reporting and correction capabilities in processors. CPU architectures permit
+partitioning support, where compute resources of a single CPU could be made
+available to virtual machine environments. There are couple OEMS that
+support NUMA hardware which are hot pluggable as well, where physical
+node insertion and removal require support for CPU hotplug.
+
+Such advances require CPUs available to a kernel to be removed either for
+provisioning reasons, or for RAS purposes to keep an offending CPU off
+system execution path. Hence the need for CPU hotplug support in the
+Linux kernel.
+
+A more novel use of CPU-hotplug support is its use today in suspend
+resume support for SMP. Dual-core and HT support makes even
+a laptop run SMP kernels which didn't support these methods. SMP support
+for suspend/resume is a work in progress.
+
+General Stuff about CPU Hotplug
+--------------------------------
+
+Command Line Switches
+---------------------
+maxcpus=n Restrict boot time cpus to n. Say if you have 4 cpus, using
+ maxcpus=2 will only boot 2. You can choose to bring the
+ other cpus later online, read FAQ's for more info.
+
+additional_cpus=n (*) Use this to limit hotpluggable cpus. This option sets
+ cpu_possible_map = cpu_present_map + additional_cpus
+
+(*) Option valid only for following architectures
+- x86_64, ia64
+
+ia64 and x86_64 use the number of disabled local apics in ACPI tables MADT
+to determine the number of potentially hot-pluggable cpus. The implementation
+should only rely on this to count the # of cpus, but *MUST* not rely on the
+apicid values in those tables for disabled apics. In the event BIOS doesn't
+mark such hot-pluggable cpus as disabled entries, one could use this
+parameter "additional_cpus=x" to represent those cpus in the cpu_possible_map.
+
+possible_cpus=n [s390 only] use this to set hotpluggable cpus.
+ This option sets possible_cpus bits in
+ cpu_possible_map. Thus keeping the numbers of bits set
+ constant even if the machine gets rebooted.
+
+CPU maps and such
+-----------------
+[More on cpumaps and primitive to manipulate, please check
+include/linux/cpumask.h that has more descriptive text.]
+
+cpu_possible_map: Bitmap of possible CPUs that can ever be available in the
+system. This is used to allocate some boot time memory for per_cpu variables
+that aren't designed to grow/shrink as CPUs are made available or removed.
+Once set during boot time discovery phase, the map is static, i.e no bits
+are added or removed anytime. Trimming it accurately for your system needs
+upfront can save some boot time memory. See below for how we use heuristics
+in x86_64 case to keep this under check.
+
+cpu_online_map: Bitmap of all CPUs currently online. Its set in __cpu_up()
+after a cpu is available for kernel scheduling and ready to receive
+interrupts from devices. Its cleared when a cpu is brought down using
+__cpu_disable(), before which all OS services including interrupts are
+migrated to another target CPU.
+
+cpu_present_map: Bitmap of CPUs currently present in the system. Not all
+of them may be online. When physical hotplug is processed by the relevant
+subsystem (e.g ACPI) can change and new bit either be added or removed
+from the map depending on the event is hot-add/hot-remove. There are currently
+no locking rules as of now. Typical usage is to init topology during boot,
+at which time hotplug is disabled.
+
+You really dont need to manipulate any of the system cpu maps. They should
+be read-only for most use. When setting up per-cpu resources almost always use
+cpu_possible_map/for_each_possible_cpu() to iterate.
+
+Never use anything other than cpumask_t to represent bitmap of CPUs.
+
+ #include <linux/cpumask.h>
+
+ for_each_possible_cpu - Iterate over cpu_possible_map
+ for_each_online_cpu - Iterate over cpu_online_map
+ for_each_present_cpu - Iterate over cpu_present_map
+ for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask.
+
+ #include <linux/cpu.h>
+ get_online_cpus() and put_online_cpus():
+
+The above calls are used to inhibit cpu hotplug operations. While the
+cpu_hotplug.refcount is non zero, the cpu_online_map will not change.
+If you merely need to avoid cpus going away, you could also use
+preempt_disable() and preempt_enable() for those sections.
+Just remember the critical section cannot call any
+function that can sleep or schedule this process away. The preempt_disable()
+will work as long as stop_machine_run() is used to take a cpu down.
+
+CPU Hotplug - Frequently Asked Questions.
+
+Q: How to enable my kernel to support CPU hotplug?
+A: When doing make defconfig, Enable CPU hotplug support
+
+ "Processor type and Features" -> Support for Hotpluggable CPUs
+
+Make sure that you have CONFIG_HOTPLUG, and CONFIG_SMP turned on as well.
+
+You would need to enable CONFIG_HOTPLUG_CPU for SMP suspend/resume support
+as well.
+
+Q: What architectures support CPU hotplug?
+A: As of 2.6.14, the following architectures support CPU hotplug.
+
+i386 (Intel), ppc, ppc64, parisc, s390, ia64 and x86_64
+
+Q: How to test if hotplug is supported on the newly built kernel?
+A: You should now notice an entry in sysfs.
+
+Check if sysfs is mounted, using the "mount" command. You should notice
+an entry as shown below in the output.
+
+ ....
+ none on /sys type sysfs (rw)
+ ....
+
+If this is not mounted, do the following.
+
+ #mkdir /sysfs
+ #mount -t sysfs sys /sys
+
+Now you should see entries for all present cpu, the following is an example
+in a 8-way system.
+
+ #pwd
+ #/sys/devices/system/cpu
+ #ls -l
+ total 0
+ drwxr-xr-x 10 root root 0 Sep 19 07:44 .
+ drwxr-xr-x 13 root root 0 Sep 19 07:45 ..
+ drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu0
+ drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu1
+ drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu2
+ drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu3
+ drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu4
+ drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu5
+ drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu6
+ drwxr-xr-x 3 root root 0 Sep 19 07:48 cpu7
+
+Under each directory you would find an "online" file which is the control
+file to logically online/offline a processor.
+
+Q: Does hot-add/hot-remove refer to physical add/remove of cpus?
+A: The usage of hot-add/remove may not be very consistently used in the code.
+CONFIG_HOTPLUG_CPU enables logical online/offline capability in the kernel.
+To support physical addition/removal, one would need some BIOS hooks and
+the platform should have something like an attention button in PCI hotplug.
+CONFIG_ACPI_HOTPLUG_CPU enables ACPI support for physical add/remove of CPUs.
+
+Q: How do i logically offline a CPU?
+A: Do the following.
+
+ #echo 0 > /sys/devices/system/cpu/cpuX/online
+
+Once the logical offline is successful, check
+
+ #cat /proc/interrupts
+
+You should now not see the CPU that you removed. Also online file will report
+the state as 0 when a cpu if offline and 1 when its online.
+
+ #To display the current cpu state.
+ #cat /sys/devices/system/cpu/cpuX/online
+
+Q: Why cant i remove CPU0 on some systems?
+A: Some architectures may have some special dependency on a certain CPU.
+
+For e.g in IA64 platforms we have ability to sent platform interrupts to the
+OS. a.k.a Corrected Platform Error Interrupts (CPEI). In current ACPI
+specifications, we didn't have a way to change the target CPU. Hence if the
+current ACPI version doesn't support such re-direction, we disable that CPU
+by making it not-removable.
+
+In such cases you will also notice that the online file is missing under cpu0.
+
+Q: How do i find out if a particular CPU is not removable?
+A: Depending on the implementation, some architectures may show this by the
+absence of the "online" file. This is done if it can be determined ahead of
+time that this CPU cannot be removed.
+
+In some situations, this can be a run time check, i.e if you try to remove the
+last CPU, this will not be permitted. You can find such failures by
+investigating the return value of the "echo" command.
+
+Q: What happens when a CPU is being logically offlined?
+A: The following happen, listed in no particular order :-)
+
+- A notification is sent to in-kernel registered modules by sending an event
+ CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
+ CPU is being offlined while tasks are frozen due to a suspend operation in
+ progress
+- All processes are migrated away from this outgoing CPU to new CPUs.
+ The new CPU is chosen from each process' current cpuset, which may be
+ a subset of all online CPUs.
+- All interrupts targeted to this CPU is migrated to a new CPU
+- timers/bottom half/task lets are also migrated to a new CPU
+- Once all services are migrated, kernel calls an arch specific routine
+ __cpu_disable() to perform arch specific cleanup.
+- Once this is successful, an event for successful cleanup is sent by an event
+ CPU_DEAD (or CPU_DEAD_FROZEN if tasks are frozen due to a suspend while the
+ CPU is being offlined).
+
+ "It is expected that each service cleans up when the CPU_DOWN_PREPARE
+ notifier is called, when CPU_DEAD is called its expected there is nothing
+ running on behalf of this CPU that was offlined"
+
+Q: If i have some kernel code that needs to be aware of CPU arrival and
+ departure, how to i arrange for proper notification?
+A: This is what you would need in your kernel code to receive notifications.
+
+ #include <linux/cpu.h>
+ static int __cpuinit foobar_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+ {
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ foobar_online_action(cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ foobar_dead_action(cpu);
+ break;
+ }
+ return NOTIFY_OK;
+ }
+
+ static struct notifier_block __cpuinitdata foobar_cpu_notifer =
+ {
+ .notifier_call = foobar_cpu_callback,
+ };
+
+You need to call register_cpu_notifier() from your init function.
+Init functions could be of two types:
+1. early init (init function called when only the boot processor is online).
+2. late init (init function called _after_ all the CPUs are online).
+
+For the first case, you should add the following to your init function
+
+ register_cpu_notifier(&foobar_cpu_notifier);
+
+For the second case, you should add the following to your init function
+
+ register_hotcpu_notifier(&foobar_cpu_notifier);
+
+You can fail PREPARE notifiers if something doesn't work to prepare resources.
+This will stop the activity and send a following CANCELED event back.
+
+CPU_DEAD should not be failed, its just a goodness indication, but bad
+things will happen if a notifier in path sent a BAD notify code.
+
+Q: I don't see my action being called for all CPUs already up and running?
+A: Yes, CPU notifiers are called only when new CPUs are on-lined or offlined.
+ If you need to perform some action for each cpu already in the system, then
+
+ for_each_online_cpu(i) {
+ foobar_cpu_callback(&foobar_cpu_notifier, CPU_UP_PREPARE, i);
+ foobar_cpu_callback(&foobar_cpu_notifier, CPU_ONLINE, i);
+ }
+
+Q: If i would like to develop cpu hotplug support for a new architecture,
+ what do i need at a minimum?
+A: The following are what is required for CPU hotplug infrastructure to work
+ correctly.
+
+ - Make sure you have an entry in Kconfig to enable CONFIG_HOTPLUG_CPU
+ - __cpu_up() - Arch interface to bring up a CPU
+ - __cpu_disable() - Arch interface to shutdown a CPU, no more interrupts
+ can be handled by the kernel after the routine
+ returns. Including local APIC timers etc are
+ shutdown.
+ - __cpu_die() - This actually supposed to ensure death of the CPU.
+ Actually look at some example code in other arch
+ that implement CPU hotplug. The processor is taken
+ down from the idle() loop for that specific
+ architecture. __cpu_die() typically waits for some
+ per_cpu state to be set, to ensure the processor
+ dead routine is called to be sure positively.
+
+Q: I need to ensure that a particular cpu is not removed when there is some
+ work specific to this cpu is in progress.
+A: First switch the current thread context to preferred cpu
+
+ int my_func_on_cpu(int cpu)
+ {
+ cpumask_t saved_mask, new_mask = CPU_MASK_NONE;
+ int curr_cpu, err = 0;
+
+ saved_mask = current->cpus_allowed;
+ cpu_set(cpu, new_mask);
+ err = set_cpus_allowed(current, new_mask);
+
+ if (err)
+ return err;
+
+ /*
+ * If we got scheduled out just after the return from
+ * set_cpus_allowed() before running the work, this ensures
+ * we stay locked.
+ */
+ curr_cpu = get_cpu();
+
+ if (curr_cpu != cpu) {
+ err = -EAGAIN;
+ goto ret;
+ } else {
+ /*
+ * Do work : But cant sleep, since get_cpu() disables preempt
+ */
+ }
+ ret:
+ put_cpu();
+ set_cpus_allowed(current, saved_mask);
+ return err;
+ }
+
+
+Q: How do we determine how many CPUs are available for hotplug.
+A: There is no clear spec defined way from ACPI that can give us that
+ information today. Based on some input from Natalie of Unisys,
+ that the ACPI MADT (Multiple APIC Description Tables) marks those possible
+ CPUs in a system with disabled status.
+
+ Andi implemented some simple heuristics that count the number of disabled
+ CPUs in MADT as hotpluggable CPUS. In the case there are no disabled CPUS
+ we assume 1/2 the number of CPUs currently present can be hotplugged.
+
+ Caveat: Today's ACPI MADT can only provide 256 entries since the apicid field
+ in MADT is only 8 bits.
+
+User Space Notification
+
+Hotplug support for devices is common in Linux today. Its being used today to
+support automatic configuration of network, usb and pci devices. A hotplug
+event can be used to invoke an agent script to perform the configuration task.
+
+You can add /etc/hotplug/cpu.agent to handle hotplug notification user space
+scripts.
+
+ #!/bin/bash
+ # $Id: cpu.agent
+ # Kernel hotplug params include:
+ #ACTION=%s [online or offline]
+ #DEVPATH=%s
+ #
+ cd /etc/hotplug
+ . ./hotplug.functions
+
+ case $ACTION in
+ online)
+ echo `date` ":cpu.agent" add cpu >> /tmp/hotplug.txt
+ ;;
+ offline)
+ echo `date` ":cpu.agent" remove cpu >>/tmp/hotplug.txt
+ ;;
+ *)
+ debug_mesg CPU $ACTION event not supported
+ exit 1
+ ;;
+ esac
diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt
new file mode 100644
index 0000000..287224e
--- /dev/null
+++ b/Documentation/cpu-load.txt
@@ -0,0 +1,113 @@
+CPU load
+--------
+
+Linux exports various bits of information via `/proc/stat' and
+`/proc/uptime' that userland tools, such as top(1), use to calculate
+the average time system spent in a particular state, for example:
+
+ $ iostat
+ Linux 2.6.18.3-exp (linmac) 02/20/2007
+
+ avg-cpu: %user %nice %system %iowait %steal %idle
+ 10.01 0.00 2.92 5.44 0.00 81.63
+
+ ...
+
+Here the system thinks that over the default sampling period the
+system spent 10.01% of the time doing work in user space, 2.92% in the
+kernel, and was overall 81.63% of the time idle.
+
+In most cases the `/proc/stat' information reflects the reality quite
+closely, however due to the nature of how/when the kernel collects
+this data sometimes it can not be trusted at all.
+
+So how is this information collected? Whenever timer interrupt is
+signalled the kernel looks what kind of task was running at this
+moment and increments the counter that corresponds to this tasks
+kind/state. The problem with this is that the system could have
+switched between various states multiple times between two timer
+interrupts yet the counter is incremented only for the last state.
+
+
+Example
+-------
+
+If we imagine the system with one task that periodically burns cycles
+in the following manner:
+
+ time line between two timer interrupts
+|--------------------------------------|
+ ^ ^
+ |_ something begins working |
+ |_ something goes to sleep
+ (only to be awaken quite soon)
+
+In the above situation the system will be 0% loaded according to the
+`/proc/stat' (since the timer interrupt will always happen when the
+system is executing the idle handler), but in reality the load is
+closer to 99%.
+
+One can imagine many more situations where this behavior of the kernel
+will lead to quite erratic information inside `/proc/stat'.
+
+
+/* gcc -o hog smallhog.c */
+#include <time.h>
+#include <limits.h>
+#include <signal.h>
+#include <sys/time.h>
+#define HIST 10
+
+static volatile sig_atomic_t stop;
+
+static void sighandler (int signr)
+{
+ (void) signr;
+ stop = 1;
+}
+static unsigned long hog (unsigned long niters)
+{
+ stop = 0;
+ while (!stop && --niters);
+ return niters;
+}
+int main (void)
+{
+ int i;
+ struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
+ .it_value = { .tv_sec = 0, .tv_usec = 1 } };
+ sigset_t set;
+ unsigned long v[HIST];
+ double tmp = 0.0;
+ unsigned long n;
+ signal (SIGALRM, &sighandler);
+ setitimer (ITIMER_REAL, &it, NULL);
+
+ hog (ULONG_MAX);
+ for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
+ for (i = 0; i < HIST; ++i) tmp += v[i];
+ tmp /= HIST;
+ n = tmp - (tmp / 3.0);
+
+ sigemptyset (&set);
+ sigaddset (&set, SIGALRM);
+
+ for (;;) {
+ hog (n);
+ sigwait (&set, &i);
+ }
+ return 0;
+}
+
+
+References
+----------
+
+http://lkml.org/lkml/2007/2/12/6
+Documentation/filesystems/proc.txt (1.8)
+
+
+Thanks
+------
+
+Con Kolivas, Pavel Machek
diff --git a/Documentation/cpuidle/core.txt b/Documentation/cpuidle/core.txt
new file mode 100644
index 0000000..63ecc5d
--- /dev/null
+++ b/Documentation/cpuidle/core.txt
@@ -0,0 +1,23 @@
+
+ Supporting multiple CPU idle levels in kernel
+
+ cpuidle
+
+General Information:
+
+Various CPUs today support multiple idle levels that are differentiated
+by varying exit latencies and power consumption during idle.
+cpuidle is a generic in-kernel infrastructure that separates
+idle policy (governor) from idle mechanism (driver) and provides a
+standardized infrastructure to support independent development of
+governors and drivers.
+
+cpuidle resides under drivers/cpuidle.
+
+Boot options:
+"cpuidle_sysfs_switch"
+enables current_governor interface in /sys/devices/system/cpu/cpuidle/,
+which can be used to switch governors at run time. This boot option
+is meant for developer testing only. In normal usage, kernel picks the
+best governor based on governor ratings.
+SEE ALSO: sysfs.txt in this directory.
diff --git a/Documentation/cpuidle/driver.txt b/Documentation/cpuidle/driver.txt
new file mode 100644
index 0000000..7a9e09e
--- /dev/null
+++ b/Documentation/cpuidle/driver.txt
@@ -0,0 +1,31 @@
+
+
+ Supporting multiple CPU idle levels in kernel
+
+ cpuidle drivers
+
+
+
+
+cpuidle driver hooks into the cpuidle infrastructure and handles the
+architecture/platform dependent part of CPU idle states. Driver
+provides the platform idle state detection capability and also
+has mechanisms in place to support actual entry-exit into CPU idle states.
+
+cpuidle driver initializes the cpuidle_device structure for each CPU device
+and registers with cpuidle using cpuidle_register_device.
+
+It can also support the dynamic changes (like battery <-> AC), by using
+cpuidle_pause_and_lock, cpuidle_disable_device and cpuidle_enable_device,
+cpuidle_resume_and_unlock.
+
+Interfaces:
+extern int cpuidle_register_driver(struct cpuidle_driver *drv);
+extern void cpuidle_unregister_driver(struct cpuidle_driver *drv);
+extern int cpuidle_register_device(struct cpuidle_device *dev);
+extern void cpuidle_unregister_device(struct cpuidle_device *dev);
+
+extern void cpuidle_pause_and_lock(void);
+extern void cpuidle_resume_and_unlock(void);
+extern int cpuidle_enable_device(struct cpuidle_device *dev);
+extern void cpuidle_disable_device(struct cpuidle_device *dev);
diff --git a/Documentation/cpuidle/governor.txt b/Documentation/cpuidle/governor.txt
new file mode 100644
index 0000000..12c6bd5
--- /dev/null
+++ b/Documentation/cpuidle/governor.txt
@@ -0,0 +1,29 @@
+
+
+
+ Supporting multiple CPU idle levels in kernel
+
+ cpuidle governors
+
+
+
+
+cpuidle governor is policy routine that decides what idle state to enter at
+any given time. cpuidle core uses different callbacks to the governor.
+
+* enable() to enable governor for a particular device
+* disable() to disable governor for a particular device
+* select() to select an idle state to enter
+* reflect() called after returning from the idle state, which can be used
+ by the governor for some record keeping.
+
+More than one governor can be registered at the same time and
+users can switch between drivers using /sysfs interface (when enabled).
+More than one governor part is supported for developers to easily experiment
+with different governors. By default, most optimal governor based on your
+kernel configuration and platform will be selected by cpuidle.
+
+Interfaces:
+extern int cpuidle_register_governor(struct cpuidle_governor *gov);
+extern void cpuidle_unregister_governor(struct cpuidle_governor *gov);
+struct cpuidle_governor
diff --git a/Documentation/cpuidle/sysfs.txt b/Documentation/cpuidle/sysfs.txt
new file mode 100644
index 0000000..50d7b16
--- /dev/null
+++ b/Documentation/cpuidle/sysfs.txt
@@ -0,0 +1,79 @@
+
+
+ Supporting multiple CPU idle levels in kernel
+
+ cpuidle sysfs
+
+System global cpuidle related information and tunables are under
+/sys/devices/system/cpu/cpuidle
+
+The current interfaces in this directory has self-explanatory names:
+* current_driver
+* current_governor_ro
+
+With cpuidle_sysfs_switch boot option (meant for developer testing)
+following objects are visible instead.
+* current_driver
+* available_governors
+* current_governor
+In this case users can switch the governor at run time by writing
+to current_governor.
+
+
+Per logical CPU specific cpuidle information are under
+/sys/devices/system/cpu/cpuX/cpuidle
+for each online cpu X
+
+--------------------------------------------------------------------------------
+# ls -lR /sys/devices/system/cpu/cpu0/cpuidle/
+/sys/devices/system/cpu/cpu0/cpuidle/:
+total 0
+drwxr-xr-x 2 root root 0 Feb 8 10:42 state0
+drwxr-xr-x 2 root root 0 Feb 8 10:42 state1
+drwxr-xr-x 2 root root 0 Feb 8 10:42 state2
+drwxr-xr-x 2 root root 0 Feb 8 10:42 state3
+
+/sys/devices/system/cpu/cpu0/cpuidle/state0:
+total 0
+-r--r--r-- 1 root root 4096 Feb 8 10:42 desc
+-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
+-r--r--r-- 1 root root 4096 Feb 8 10:42 name
+-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 time
+-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
+
+/sys/devices/system/cpu/cpu0/cpuidle/state1:
+total 0
+-r--r--r-- 1 root root 4096 Feb 8 10:42 desc
+-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
+-r--r--r-- 1 root root 4096 Feb 8 10:42 name
+-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 time
+-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
+
+/sys/devices/system/cpu/cpu0/cpuidle/state2:
+total 0
+-r--r--r-- 1 root root 4096 Feb 8 10:42 desc
+-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
+-r--r--r-- 1 root root 4096 Feb 8 10:42 name
+-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 time
+-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
+
+/sys/devices/system/cpu/cpu0/cpuidle/state3:
+total 0
+-r--r--r-- 1 root root 4096 Feb 8 10:42 desc
+-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
+-r--r--r-- 1 root root 4096 Feb 8 10:42 name
+-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 time
+-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
+--------------------------------------------------------------------------------
+
+
+* desc : Small description about the idle state (string)
+* latency : Latency to exit out of this idle state (in microseconds)
+* name : Name of the idle state (string)
+* power : Power consumed while in this idle state (in milliwatts)
+* time : Total time spent in this idle state (in microseconds)
+* usage : Number of times this state was entered (count)
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
new file mode 100644
index 0000000..5c86c25
--- /dev/null
+++ b/Documentation/cpusets.txt
@@ -0,0 +1,808 @@
+ CPUSETS
+ -------
+
+Copyright (C) 2004 BULL SA.
+Written by Simon.Derr@bull.net
+
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+Modified by Paul Menage <menage@google.com>
+Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+CONTENTS:
+=========
+
+1. Cpusets
+ 1.1 What are cpusets ?
+ 1.2 Why are cpusets needed ?
+ 1.3 How are cpusets implemented ?
+ 1.4 What are exclusive cpusets ?
+ 1.5 What is memory_pressure ?
+ 1.6 What is memory spread ?
+ 1.7 What is sched_load_balance ?
+ 1.8 What is sched_relax_domain_level ?
+ 1.9 How do I use cpusets ?
+2. Usage Examples and Syntax
+ 2.1 Basic Usage
+ 2.2 Adding/removing cpus
+ 2.3 Setting flags
+ 2.4 Attaching processes
+3. Questions
+4. Contact
+
+1. Cpusets
+==========
+
+1.1 What are cpusets ?
+----------------------
+
+Cpusets provide a mechanism for assigning a set of CPUs and Memory
+Nodes to a set of tasks. In this document "Memory Node" refers to
+an on-line node that contains memory.
+
+Cpusets constrain the CPU and Memory placement of tasks to only
+the resources within a tasks current cpuset. They form a nested
+hierarchy visible in a virtual file system. These are the essential
+hooks, beyond what is already present, required to manage dynamic
+job placement on large systems.
+
+Cpusets use the generic cgroup subsystem described in
+Documentation/cgroups/cgroups.txt.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that tasks cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset. The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting tasks mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
+virtual file system, manage the attributes and permissions of these
+cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
+specify and query to which cpuset a task is assigned, and list the
+task pids assigned to a cpuset.
+
+
+1.2 Why are cpusets needed ?
+----------------------------
+
+The management of large computer systems, with many processors (CPUs),
+complex memory cache hierarchies and multiple Memory Nodes having
+non-uniform access times (NUMA) presents additional challenges for
+the efficient scheduling and memory placement of processes.
+
+Frequently more modest sized systems can be operated with adequate
+efficiency just by letting the operating system automatically share
+the available CPU and Memory resources amongst the requesting tasks.
+
+But larger systems, which benefit more from careful processor and
+memory placement to reduce memory access times and contention,
+and which typically represent a larger investment for the customer,
+can benefit from explicitly placing jobs on properly sized subsets of
+the system.
+
+This can be especially valuable on:
+
+ * Web Servers running multiple instances of the same web application,
+ * Servers running different applications (for instance, a web server
+ and a database), or
+ * NUMA systems running large HPC applications with demanding
+ performance characteristics.
+
+These subsets, or "soft partitions" must be able to be dynamically
+adjusted, as the job mix changes, without impacting other concurrently
+executing jobs. The location of the running jobs pages may also be moved
+when the memory locations are changed.
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets. It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel mechanism to constrain which CPUs and
+Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extends these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+ kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+ in the task structure to a reference counted cgroup structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+ allowed in that tasks cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+ those Memory Nodes allowed in that tasks cpuset.
+ - The root cpuset contains all the systems CPUs and Memory
+ Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+ of the parents CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+ browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+ cpuset (except direct ancestors and descendents) may contain
+ any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+ allowed in that tasks cpuset.
+ - in sched.c migrate_all_tasks(), to keep migrating tasks within
+ the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+ Memory Nodes by what's allowed in that tasks cpuset.
+ - in page_alloc.c, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page recovery to the current cpuset.
+
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel. No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
+
+The /proc/<pid>/status file for each task has four added lines,
+displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
+and mems_allowed (on which Memory Nodes it may obtain memory),
+in the two formats seen in the following example:
+
+ Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
+ Cpus_allowed_list: 0-127
+ Mems_allowed: ffffffff,ffffffff
+ Mems_allowed_list: 0-63
+
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
+
+ - cpus: list of CPUs in that cpuset
+ - mems: list of Memory Nodes in that cpuset
+ - memory_migrate flag: if set, move pages to cpusets nodes
+ - cpu_exclusive flag: is cpu placement exclusive?
+ - mem_exclusive flag: is memory placement exclusive?
+ - mem_hardwall flag: is memory allocation hardwalled
+ - memory_pressure: measure of how much paging pressure in cpuset
+
+In addition, the root cpuset only has the following file:
+ - memory_pressure_enabled flag: compute memory_pressure?
+
+New cpusets are created using the mkdir system call or shell
+command. The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpusets directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset. A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parents.
+ - It can't be marked exclusive unless its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them change to ensure nothing overlaps a
+exclusive cpuset. Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only. The cpus file automatically tracks the value of
+cpu_online_map using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
+
+
+1.4 What are exclusive cpusets ?
+--------------------------------
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than
+a direct ancestor or descendent, may share any of the same CPUs or
+Memory Nodes.
+
+A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users. All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space. This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset. To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
+
+
+1.5 What is memory_pressure ?
+-----------------------------
+The memory_pressure of a cpuset provides a simple per-cpuset metric
+of the rate that the tasks in a cpuset are attempting to free up in
+use memory on the nodes of the cpuset to satisfy additional memory
+requests.
+
+This enables batch managers monitoring jobs running in dedicated
+cpusets to efficiently detect what level of memory pressure that job
+is causing.
+
+This is useful both on tightly managed systems running a wide mix of
+submitted jobs, which may choose to terminate or re-prioritize jobs that
+are trying to use more memory than allowed on the nodes assigned them,
+and with tightly coupled, long running, massively parallel scientific
+computing jobs that will dramatically fail to meet required performance
+goals if they start to use more memory than allowed to them.
+
+This mechanism provides a very economical way for the batch manager
+to monitor a cpuset for signs of memory pressure. It's up to the
+batch manager or other user code to decide what to do about it and
+take action.
+
+==> Unless this feature is enabled by writing "1" to the special file
+ /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
+ code of __alloc_pages() for this metric reduces to simply noticing
+ that the cpuset_memory_pressure_enabled flag is zero. So only
+ systems that enable this feature will compute the metric.
+
+Why a per-cpuset, running average:
+
+ Because this meter is per-cpuset, rather than per-task or mm,
+ the system load imposed by a batch scheduler monitoring this
+ metric is sharply reduced on large systems, because a scan of
+ the tasklist can be avoided on each set of queries.
+
+ Because this meter is a running average, instead of an accumulating
+ counter, a batch scheduler can detect memory pressure with a
+ single read, instead of having to read and accumulate results
+ for a period of time.
+
+ Because this meter is per-cpuset rather than per-task or mm,
+ the batch scheduler can obtain the key information, memory
+ pressure in a cpuset, with a single read, rather than having to
+ query and accumulate results over all the (dynamically changing)
+ set of tasks in the cpuset.
+
+A per-cpuset simple digital filter (requires a spinlock and 3 words
+of data per-cpuset) is kept, and updated by any task attached to that
+cpuset, if it enters the synchronous (direct) page reclaim code.
+
+A per-cpuset file provides an integer number representing the recent
+(half-life of 10 seconds) rate of direct page reclaims caused by
+the tasks in the cpuset, in units of reclaims attempted per second,
+times 1000.
+
+
+1.6 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related in
+kernel data structures. They are called 'memory_spread_page' and
+'memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the tasks NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the tasks NUMA mempolicy and be spread
+instead. Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing tasks memory spread settings. If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag
+files. By default they contain "0", meaning that the feature is off
+for that cpuset. If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'memory_spread_page' turns on a per-process flag
+PF_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset. The page allocation calls for the page cache
+is modified to perform an inline check for this PF_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'memory_spread_slab' turns on the flag
+PF_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple. It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current tasks mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the jobs cpuset in order to fit. Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the jobs cpuset
+can become very uneven.
+
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched.c) automatically load balances
+tasks. If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced. So the scheduler
+has support to partition the systems CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, except those
+marked isolated using the kernel boot time "isolcpus=" argument.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+ 1) On large systems, load balancing across many CPUs is expensive.
+ If the system is managed using cpusets to place independent jobs
+ on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+ system overhead on those CPUs, including avoiding task load
+ balancing if that is not needed.
+
+When the per-cpuset flag "sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwised pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendent cpusets. Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest. Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding. So if each of two partially
+overlapping cpusets enables the flag 'sched_load_balance', then we
+form a single sched domain that is a superset of both. We won't move
+a task to a CPU outside it cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "sched_load_balance" enabled,
+and the sched domain configuration. If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above. In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.) When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can. It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all
+the CPUs that must be load balanced.
+
+Whenever the 'sched_load_balance' flag changes, or CPUs come or go
+from a cpuset with this flag enabled, or a cpuset with this flag
+enabled is removed, the cpuset code builds a new such partition and
+passes it to the scheduler sched domain setup code, to have the sched
+domains rebuilt as necessary.
+
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (cpumask_t) in the partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
+
+
+1.8 What is sched_relax_domain_level ?
+--------------------------------------
+
+In sched domain, the scheduler migrates tasks in 2 ways; periodic load
+balance on tick, and at time of some schedule events.
+
+When a task is woken up, scheduler try to move the task on idle CPU.
+For example, if a task A running on CPU X activates another task B
+on the same CPU X, and if CPU Y is X's sibling and performing idle,
+then scheduler migrate task B to CPU Y so that task B can start on
+CPU Y without waiting task A on CPU X.
+
+And if a CPU run out of tasks in its runqueue, the CPU try to pull
+extra tasks from other busy CPUs to help them before it is going to
+be idle.
+
+Of course it takes some searching cost to find movable tasks and/or
+idle CPUs, the scheduler might not search all CPUs in the domain
+everytime. In fact, in some architectures, the searching ranges on
+events are limited in the same socket or node where the CPU locates,
+while the load balance on tick searchs all.
+
+For example, assume CPU Z is relatively far from CPU X. Even if CPU Z
+is idle while CPU X and the siblings are busy, scheduler can't migrate
+woken task B from X to Z since it is out of its searching range.
+As the result, task B on CPU X need to wait task A or wait load balance
+on the next tick. For some applications in special situation, waiting
+1 tick may be too long.
+
+The 'sched_relax_domain_level' file allows you to request changing
+this searching range as you like. This file takes int value which
+indicates size of searching range in levels ideally as follows,
+otherwise initial value -1 that indicates the cpuset has no request.
+
+ -1 : no request. use system default or follow request of others.
+ 0 : no search.
+ 1 : search siblings (hyperthreads in a core).
+ 2 : search cores in a package.
+ 3 : search cpus in a node [= system wide on non-NUMA system]
+ ( 4 : search nodes in a chunk of node [on NUMA system] )
+ ( 5 : search system wide [on NUMA system] )
+
+The system default is architecture dependent. The system default
+can be changed using the relax_domain_level= boot parameter.
+
+This file is per-cpuset and affect the sched domain where the cpuset
+belongs to. Therefore if the flag 'sched_load_balance' of a cpuset
+is disabled, then 'sched_relax_domain_level' have no effect since
+there is no sched domain belonging the cpuset.
+
+If multiple cpusets are overlapping and hence they form a single sched
+domain, the largest value among those is used. Be careful, if one
+requests 0 and others are -1 then 0 is used.
+
+Note that modifying this file will have both good and bad effects,
+and whether it is acceptable or not will be depend on your situation.
+Don't modify this file if you are not sure.
+
+If your situation is:
+ - The migration costs between each cpu can be assumed considerably
+ small(for you) due to your special application's behavior or
+ special hardware support for CPU cache etc.
+ - The searching cost doesn't have impact(for you) or you can make
+ the searching cost enough small by managing cpuset to compact etc.
+ - The latency is required even it sacrifices cache hit rate etc.
+then increasing 'sched_relax_domain_level' would benefit you.
+
+
+1.9 How do I use cpusets ?
+--------------------------
+
+In order to minimize the impact of cpusets on critical kernel
+code, such as the scheduler, and due to the fact that the kernel
+does not support one task updating the memory placement of another
+task directly, the impact on a task of changing its cpuset CPU
+or Memory Node placement, or of changing to which cpuset a task
+is attached, is subtle.
+
+If a cpuset has its Memory Nodes modified, then for each task attached
+to that cpuset, the next time that the kernel attempts to allocate
+a page of memory for that task, the kernel will notice the change
+in the tasks cpuset, and update its per-task memory placement to
+remain within the new cpusets memory placement. If the task was using
+mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
+its new cpuset, then the task will continue to use whatever subset
+of MPOL_BIND nodes are still allowed in the new cpuset. If the task
+was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
+in the new cpuset, then the task will be essentially treated as if it
+was MPOL_BIND bound to the new cpuset (even though its numa placement,
+as queried by get_mempolicy(), doesn't change). If a task is moved
+from one cpuset to another, then the kernel will adjust the tasks
+memory placement, as above, the next time that the kernel attempts
+to allocate a page of memory for that task.
+
+If a cpuset has its 'cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately. Similarly,
+if a tasks pid is written to a cpusets 'tasks' file, in either its
+current cpuset or another cpuset, then its allowed CPU placement is
+changed immediately. If such a task had been bound to some subset
+of its cpuset using the sched_setaffinity() call, the task will be
+allowed to run on any CPU allowed in its new cpuset, negating the
+affect of the prior sched_setaffinity() call.
+
+In summary, the memory placement of a task whose cpuset is changed is
+updated by the kernel, on the next allocation of a page for that task,
+but the processor placement is not updated, until that tasks pid is
+rewritten to the 'tasks' file of its cpuset. This is done to avoid
+impacting the scheduler code in the kernel with a check for changes
+in a tasks processor placement.
+
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpusets memory placement policy 'mems' subsequently changes.
+If the cpuset flag file 'memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the tasks new cpuset. The relative placement of the page within
+the cpuset is preserved during these migration operations if possible.
+For example if the page was on the second valid node of the prior cpuset
+then the page will be placed on the second valid node of the new cpuset.
+
+Also if 'memory_migrate' is set true, then if that cpusets
+'mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'mems',
+will be moved to nodes in the new setting of 'mems.'
+Pages that were not in the tasks prior cpuset, or in the cpusets
+prior 'mems' setting, will not be moved.
+
+There is an exception to the above. If hotplug functionality is used
+to remove all the CPUs that are currently assigned to a cpuset,
+then all the tasks in that cpuset will be moved to the nearest ancestor
+with non-empty cpus. But the moving of some (or all) tasks might fail if
+cpuset is bound with another cgroup subsystem which has some restrictions
+on task attaching. In this failing case, those tasks will stay
+in the original cpuset, and the kernel will automatically update
+their cpus_allowed to allow all online CPUs. When memory hotplug
+functionality for removing Memory Nodes is available, a similar exception
+is expected to apply there as well. In general, the kernel prefers to
+violate cpuset placement, over starving a task that has had all
+its allowed CPUs or Memory Nodes taken offline.
+
+There is a second exception to the above. GFP_ATOMIC requests are
+kernel internal allocations that must be satisfied, immediately.
+The kernel may drop some request, in rare cases even panic, if a
+GFP_ATOMIC alloc fails. If the request cannot be satisfied within
+the current tasks cpuset, then we relax the cpuset, and look for
+memory anywhere we can find it. It's better to violate the cpuset
+than stress the kernel.
+
+To start a new job that is to be contained within a cpuset, the steps are:
+
+ 1) mkdir /dev/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /dev/cpuset
+ 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
+ the /dev/cpuset virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cpuset by writing its pid to the
+ /dev/cpuset tasks file for that cpuset.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cpuset
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cpuset:
+
+ mount -t cgroup -ocpuset cpuset /dev/cpuset
+ cd /dev/cpuset
+ mkdir Charlie
+ cd Charlie
+ /bin/echo 2-3 > cpus
+ /bin/echo 1 > mems
+ /bin/echo $$ > tasks
+ sh
+ # The subshell 'sh' is now running in cpuset Charlie
+ # The next line should display '/Charlie'
+ cat /proc/self/cpuset
+
+In the future, a C library interface to cpusets will likely be
+available. For now, the only way to query or modify cpusets is
+via the cpuset file system, using the various cd, mkdir, echo, cat,
+rmdir commands from the shell, or their equivalent from C.
+
+The sched_setaffinity calls can also be done at the shell prompt using
+SGI's runon or Robert Love's taskset. The mbind and set_mempolicy
+calls can be done at the shell prompt using the numactl command
+(part of Andi Kleen's numa package).
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cpusets can be done through the cpuset
+virtual filesystem.
+
+To mount it, type:
+# mount -t cgroup -o cpuset cpuset /dev/cpuset
+
+Then under /dev/cpuset you can find a tree that corresponds to the
+tree of the cpusets in the system. For instance, /dev/cpuset
+is the cpuset that holds the whole system.
+
+If you want to create a new cpuset under /dev/cpuset:
+# cd /dev/cpuset
+# mkdir my_cpuset
+
+Now you want to do something with this cpuset.
+# cd my_cpuset
+
+In this directory you can find several files:
+# ls
+cpu_exclusive memory_migrate mems tasks
+cpus memory_pressure notify_on_release
+mem_exclusive memory_spread_page sched_load_balance
+mem_hardwall memory_spread_slab sched_relax_domain_level
+
+Reading them will give you information about the state of this cpuset:
+the CPUs and Memory Nodes it can use, the processes that are using
+it, its properties. By writing to these files you can manipulate
+the cpuset.
+
+Set some flags:
+# /bin/echo 1 > cpu_exclusive
+
+Add some cpus:
+# /bin/echo 0-7 > cpus
+
+Add some mems:
+# /bin/echo 0-7 > mems
+
+Now attach your shell to this cpuset:
+# /bin/echo $$ > tasks
+
+You can also create cpusets inside your cpuset by using mkdir in this
+directory.
+# mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir:
+# rmdir my_sub_cs
+This will fail if the cpuset is in use (has cpusets inside, or has
+processes attached).
+
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command
+
+mount -t cpuset X /dev/cpuset
+
+is equivalent to
+
+mount -t cgroup -ocpuset X /dev/cpuset
+echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
+
+2.2 Adding/removing cpus
+------------------------
+
+This is the syntax to use when writing in the cpus or mems files
+in cpuset directories:
+
+# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4
+# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4
+
+2.3 Setting flags
+-----------------
+
+The syntax is very simple:
+
+# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive'
+# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive'
+
+2.4 Attaching processes
+-----------------------
+
+# /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+ ...
+# /bin/echo PIDn > tasks
+
+
+3. Questions
+============
+
+Q: what's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check calls to write() against
+ errors. If you use it in the cpuset file system, you won't be
+ able to tell whether a command succeeded or failed.
+
+Q: When I attach processes, only the first of the line gets really attached !
+A: We can only return one error code per call to write(). So you should also
+ put only ONE pid.
+
+4. Contact
+==========
+
+Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
new file mode 100644
index 0000000..bd699da
--- /dev/null
+++ b/Documentation/cputopology.txt
@@ -0,0 +1,33 @@
+
+Export cpu topology info via sysfs. Items (attributes) are similar
+to /proc/cpuinfo.
+
+1) /sys/devices/system/cpu/cpuX/topology/physical_package_id:
+represent the physical package id of cpu X;
+2) /sys/devices/system/cpu/cpuX/topology/core_id:
+represent the cpu core id to cpu X;
+3) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
+represent the thread siblings to cpu X in the same core;
+4) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+represent the thread siblings to cpu X in the same physical package;
+
+To implement it in an architecture-neutral way, a new source file,
+drivers/base/topology.c, is to export the 4 attributes.
+
+For an architecture to support this feature, it must define some of
+these macros in include/asm-XXX/topology.h:
+#define topology_physical_package_id(cpu)
+#define topology_core_id(cpu)
+#define topology_thread_siblings(cpu)
+#define topology_core_siblings(cpu)
+
+The type of **_id is int.
+The type of siblings is cpumask_t.
+
+To be consistent on all architectures, include/linux/topology.h
+provides default definitions for any of the above macros that are
+not defined by include/asm-XXX/topology.h:
+1) physical_package_id: -1
+2) core_id: 0
+3) thread_siblings: just the given CPU
+4) core_siblings: just the given CPU
diff --git a/Documentation/cris/README b/Documentation/cris/README
new file mode 100644
index 0000000..d9b0868
--- /dev/null
+++ b/Documentation/cris/README
@@ -0,0 +1,195 @@
+Linux 2.4 on the CRIS architecture
+==================================
+$Id: README,v 1.7 2001/04/19 12:38:32 bjornw Exp $
+
+This is a port of Linux 2.4 to Axis Communications ETRAX 100LX embedded
+network CPU. For more information about CRIS and ETRAX please see further
+below.
+
+In order to compile this you need a version of gcc with support for the
+ETRAX chip family. Please see this link for more information on how to
+download the compiler and other tools useful when building and booting
+software for the ETRAX platform:
+
+http://developer.axis.com/doc/software/devboard_lx/install-howto.html
+
+<more specific information should come in this document later>
+
+What is CRIS ?
+--------------
+
+CRIS is an acronym for 'Code Reduced Instruction Set'. It is the CPU
+architecture in Axis Communication AB's range of embedded network CPU's,
+called ETRAX. The latest CPU is called ETRAX 100LX, where LX stands for
+'Linux' because the chip was designed to be a good host for the Linux
+operating system.
+
+The ETRAX 100LX chip
+--------------------
+
+For reference, please see the press-release:
+
+http://www.axis.com/news/us/001101_etrax.htm
+
+The ETRAX 100LX is a 100 MIPS processor with 8kB cache, MMU, and a very broad
+range of built-in interfaces, all with modern scatter/gather DMA.
+
+Memory interfaces:
+
+ * SRAM
+ * NOR-flash/ROM
+ * EDO or page-mode DRAM
+ * SDRAM
+
+I/O interfaces:
+
+ * one 10/100 Mbit/s ethernet controller
+ * four serial-ports (up to 6 Mbit/s)
+ * two synchronous serial-ports for multimedia codec's etc.
+ * USB host controller and USB slave
+ * ATA
+ * SCSI
+ * two parallel-ports
+ * two generic 8-bit ports
+
+ (not all interfaces are available at the same time due to chip pin
+ multiplexing)
+
+The previous version of the ETRAX, the ETRAX 100, sits in almost all of
+Axis shipping thin-servers like the Axis 2100 web camera or the ETRAX 100
+developer-board. It lacks an MMU so the Linux we run on that is a version
+of uClinux (Linux 2.0 without MM-support) ported to the CRIS architecture.
+The new Linux 2.4 port has full MM and needs a CPU with an MMU, so it will
+not run on the ETRAX 100.
+
+A version of the Axis developer-board with ETRAX 100LX (running Linux
+2.4) is now available. For more information please see developer.axis.com.
+
+
+Bootlog
+-------
+
+Just as an example, this is the debug-output from a boot of Linux 2.4 on
+a board with ETRAX 100LX. The displayed BogoMIPS value is 5 times too small :)
+At the end you see some user-mode programs booting like telnet and ftp daemons.
+
+Linux version 2.4.1 (bjornw@godzilla.axis.se) (gcc version 2.96 20000427 (experimental)) #207 Wed Feb 21 15:48:15 CET 2001
+ROM fs in RAM, size 1376256 bytes
+Setting up paging and the MMU.
+On node 0 totalpages: 2048
+zone(0): 2048 pages.
+zone(1): 0 pages.
+zone(2): 0 pages.
+Linux/CRIS port on ETRAX 100LX (c) 2001 Axis Communications AB
+Kernel command line:
+Calibrating delay loop... 19.91 BogoMIPS
+Memory: 13872k/16384k available (587k kernel code, 2512k reserved, 44k data, 24k init)
+kmem_create: Forcing size word alignment - vm_area_struct
+kmem_create: Forcing size word alignment - filp
+Dentry-cache hash table entries: 2048 (order: 1, 16384 bytes)
+Buffer-cache hash table entries: 2048 (order: 0, 8192 bytes)
+Page-cache hash table entries: 2048 (order: 0, 8192 bytes)
+kmem_create: Forcing size word alignment - kiobuf
+kmem_create: Forcing size word alignment - bdev_cache
+Inode-cache hash table entries: 1024 (order: 0, 8192 bytes)
+kmem_create: Forcing size word alignment - inode_cache
+POSIX conformance testing by UNIFIX
+Linux NET4.0 for Linux 2.4
+Based upon Swansea University Computer Society NET3.039
+Starting kswapd v1.8
+kmem_create: Forcing size word alignment - file lock cache
+kmem_create: Forcing size word alignment - blkdev_requests
+block: queued sectors max/low 9109kB/3036kB, 64 slots per queue
+ETRAX 100LX 10/100MBit ethernet v2.0 (c) 2000 Axis Communications AB
+eth0 initialized
+eth0: changed MAC to 00:40:8C:CD:00:00
+ETRAX 100LX serial-driver $Revision: 1.7 $, (c) 2000 Axis Communications AB
+ttyS0 at 0xb0000060 is a builtin UART with DMA
+ttyS1 at 0xb0000068 is a builtin UART with DMA
+ttyS2 at 0xb0000070 is a builtin UART with DMA
+ttyS3 at 0xb0000078 is a builtin UART with DMA
+Axis flash mapping: 200000 at 50000000
+Axis flash: Found 1 x16 CFI device at 0x0 in 16 bit mode
+ Amd/Fujitsu Extended Query Table v1.0 at 0x0040
+Axis flash: JEDEC Device ID is 0xC4. Assuming broken CFI table.
+Axis flash: Swapping erase regions for broken CFI table.
+number of CFI chips: 1
+ Using default partition table
+I2C driver v2.2, (c) 1999-2001 Axis Communications AB
+ETRAX 100LX GPIO driver v2.1, (c) 2001 Axis Communications AB
+NET4: Linux TCP/IP 1.0 for NET4.0
+IP Protocols: ICMP, UDP, TCP
+kmem_create: Forcing size word alignment - ip_dst_cache
+IP: routing cache hash table of 1024 buckets, 8Kbytes
+TCP: Hash tables configured (established 2048 bind 2048)
+NET4: Unix domain sockets 1.0/SMP for Linux NET4.0.
+VFS: Mounted root (cramfs filesystem) readonly.
+Init starts up...
+Mounted none on /proc ok.
+Setting up eth0 with ip 10.13.9.116 and mac 00:40:8c:18:04:60
+eth0: changed MAC to 00:40:8C:18:04:60
+Setting up lo with ip 127.0.0.1
+Default gateway is 10.13.9.1
+Hostname is bbox1
+Telnetd starting, using port 23.
+ using /bin/sash as shell.
+sftpd[15]: sftpd $Revision: 1.7 $ starting up
+
+
+
+And here is how some /proc entries look:
+
+17# cd /proc
+17# cat cpuinfo
+cpu : CRIS
+cpu revision : 10
+cpu model : ETRAX 100LX
+cache size : 8 kB
+fpu : no
+mmu : yes
+ethernet : 10/100 Mbps
+token ring : no
+scsi : yes
+ata : yes
+usb : yes
+bogomips : 99.84
+
+17# cat meminfo
+ total: used: free: shared: buffers: cached:
+Mem: 7028736 925696 6103040 114688 0 229376
+Swap: 0 0 0
+MemTotal: 6864 kB
+MemFree: 5960 kB
+MemShared: 112 kB
+Buffers: 0 kB
+Cached: 224 kB
+Active: 224 kB
+Inact_dirty: 0 kB
+Inact_clean: 0 kB
+Inact_target: 0 kB
+HighTotal: 0 kB
+HighFree: 0 kB
+LowTotal: 6864 kB
+LowFree: 5960 kB
+SwapTotal: 0 kB
+SwapFree: 0 kB
+17# ls -l /bin
+-rwxr-xr-x 1 342 100 10356 Jan 01 00:00 ifconfig
+-rwxr-xr-x 1 342 100 17548 Jan 01 00:00 init
+-rwxr-xr-x 1 342 100 9488 Jan 01 00:00 route
+-rwxr-xr-x 1 342 100 46036 Jan 01 00:00 sftpd
+-rwxr-xr-x 1 342 100 48104 Jan 01 00:00 sh
+-rwxr-xr-x 1 342 100 16252 Jan 01 00:00 telnetd
+
+
+(All programs are statically linked to the libc at this point - we have not ported the
+ shared libraries yet)
+
+
+
+
+
+
+
+
+
diff --git a/Documentation/crypto/api-intro.txt b/Documentation/crypto/api-intro.txt
new file mode 100644
index 0000000..8b49302
--- /dev/null
+++ b/Documentation/crypto/api-intro.txt
@@ -0,0 +1,248 @@
+
+ Scatterlist Cryptographic API
+
+INTRODUCTION
+
+The Scatterlist Crypto API takes page vectors (scatterlists) as
+arguments, and works directly on pages. In some cases (e.g. ECB
+mode ciphers), this will allow for pages to be encrypted in-place
+with no copying.
+
+One of the initial goals of this design was to readily support IPsec,
+so that processing can be applied to paged skb's without the need
+for linearization.
+
+
+DETAILS
+
+At the lowest level are algorithms, which register dynamically with the
+API.
+
+'Transforms' are user-instantiated objects, which maintain state, handle all
+of the implementation logic (e.g. manipulating page vectors) and provide an
+abstraction to the underlying algorithms. However, at the user
+level they are very simple.
+
+Conceptually, the API layering looks like this:
+
+ [transform api] (user interface)
+ [transform ops] (per-type logic glue e.g. cipher.c, compress.c)
+ [algorithm api] (for registering algorithms)
+
+The idea is to make the user interface and algorithm registration API
+very simple, while hiding the core logic from both. Many good ideas
+from existing APIs such as Cryptoapi and Nettle have been adapted for this.
+
+The API currently supports five main types of transforms: AEAD (Authenticated
+Encryption with Associated Data), Block Ciphers, Ciphers, Compressors and
+Hashes.
+
+Please note that Block Ciphers is somewhat of a misnomer. It is in fact
+meant to support all ciphers including stream ciphers. The difference
+between Block Ciphers and Ciphers is that the latter operates on exactly
+one block while the former can operate on an arbitrary amount of data,
+subject to block size requirements (i.e., non-stream ciphers can only
+process multiples of blocks).
+
+Support for hardware crypto devices via an asynchronous interface is
+under development.
+
+Here's an example of how to use the API:
+
+ #include <linux/crypto.h>
+ #include <linux/err.h>
+ #include <linux/scatterlist.h>
+
+ struct scatterlist sg[2];
+ char result[128];
+ struct crypto_hash *tfm;
+ struct hash_desc desc;
+
+ tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ fail();
+
+ /* ... set up the scatterlists ... */
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ if (crypto_hash_digest(&desc, sg, 2, result))
+ fail();
+
+ crypto_free_hash(tfm);
+
+
+Many real examples are available in the regression test module (tcrypt.c).
+
+
+DEVELOPER NOTES
+
+Transforms may only be allocated in user context, and cryptographic
+methods may only be called from softirq and user contexts. For
+transforms with a setkey method it too should only be called from
+user context.
+
+When using the API for ciphers, performance will be optimal if each
+scatterlist contains data which is a multiple of the cipher's block
+size (typically 8 bytes). This prevents having to do any copying
+across non-aligned page fragment boundaries.
+
+
+ADDING NEW ALGORITHMS
+
+When submitting a new algorithm for inclusion, a mandatory requirement
+is that at least a few test vectors from known sources (preferably
+standards) be included.
+
+Converting existing well known code is preferred, as it is more likely
+to have been reviewed and widely tested. If submitting code from LGPL
+sources, please consider changing the license to GPL (see section 3 of
+the LGPL).
+
+Algorithms submitted must also be generally patent-free (e.g. IDEA
+will not be included in the mainline until around 2011), and be based
+on a recognized standard and/or have been subjected to appropriate
+peer review.
+
+Also check for any RFCs which may relate to the use of specific algorithms,
+as well as general application notes such as RFC2451 ("The ESP CBC-Mode
+Cipher Algorithms").
+
+It's a good idea to avoid using lots of macros and use inlined functions
+instead, as gcc does a good job with inlining, while excessive use of
+macros can cause compilation problems on some platforms.
+
+Also check the TODO list at the web site listed below to see what people
+might already be working on.
+
+
+BUGS
+
+Send bug reports to:
+linux-crypto@vger.kernel.org
+Cc: Herbert Xu <herbert@gondor.apana.org.au>,
+ David S. Miller <davem@redhat.com>
+
+
+FURTHER INFORMATION
+
+For further patches and various updates, including the current TODO
+list, see:
+http://gondor.apana.org.au/~herbert/crypto/
+
+
+AUTHORS
+
+James Morris
+David S. Miller
+Herbert Xu
+
+
+CREDITS
+
+The following people provided invaluable feedback during the development
+of the API:
+
+ Alexey Kuznetzov
+ Rusty Russell
+ Herbert Valerio Riedel
+ Jeff Garzik
+ Michael Richardson
+ Andrew Morton
+ Ingo Oeser
+ Christoph Hellwig
+
+Portions of this API were derived from the following projects:
+
+ Kerneli Cryptoapi (http://www.kerneli.org/)
+ Alexander Kjeldaas
+ Herbert Valerio Riedel
+ Kyle McMartin
+ Jean-Luc Cooke
+ David Bryson
+ Clemens Fruhwirth
+ Tobias Ringstrom
+ Harald Welte
+
+and;
+
+ Nettle (http://www.lysator.liu.se/~nisse/nettle/)
+ Niels Möller
+
+Original developers of the crypto algorithms:
+
+ Dana L. How (DES)
+ Andrew Tridgell and Steve French (MD4)
+ Colin Plumb (MD5)
+ Steve Reid (SHA1)
+ Jean-Luc Cooke (SHA256, SHA384, SHA512)
+ Kazunori Miyazawa / USAGI (HMAC)
+ Matthew Skala (Twofish)
+ Dag Arne Osvik (Serpent)
+ Brian Gladman (AES)
+ Kartikey Mahendra Bhatt (CAST6)
+ Jon Oberheide (ARC4)
+ Jouni Malinen (Michael MIC)
+ NTT(Nippon Telegraph and Telephone Corporation) (Camellia)
+
+SHA1 algorithm contributors:
+ Jean-Francois Dive
+
+DES algorithm contributors:
+ Raimar Falke
+ Gisle Sælensminde
+ Niels Möller
+
+Blowfish algorithm contributors:
+ Herbert Valerio Riedel
+ Kyle McMartin
+
+Twofish algorithm contributors:
+ Werner Koch
+ Marc Mutz
+
+SHA256/384/512 algorithm contributors:
+ Andrew McDonald
+ Kyle McMartin
+ Herbert Valerio Riedel
+
+AES algorithm contributors:
+ Alexander Kjeldaas
+ Herbert Valerio Riedel
+ Kyle McMartin
+ Adam J. Richter
+ Fruhwirth Clemens (i586)
+ Linus Torvalds (i586)
+
+CAST5 algorithm contributors:
+ Kartikey Mahendra Bhatt (original developers unknown, FSF copyright).
+
+TEA/XTEA algorithm contributors:
+ Aaron Grothe
+ Michael Ringe
+
+Khazad algorithm contributors:
+ Aaron Grothe
+
+Whirlpool algorithm contributors:
+ Aaron Grothe
+ Jean-Luc Cooke
+
+Anubis algorithm contributors:
+ Aaron Grothe
+
+Tiger algorithm contributors:
+ Aaron Grothe
+
+VIA PadLock contributors:
+ Michal Ludvig
+
+Camellia algorithm contributors:
+ NTT(Nippon Telegraph and Telephone Corporation) (Camellia)
+
+Generic scatterwalk code by Adam J. Richter <adam@yggdrasil.com>
+
+Please send any credits updates or corrections to:
+Herbert Xu <herbert@gondor.apana.org.au>
+
diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
new file mode 100644
index 0000000..c1e9545
--- /dev/null
+++ b/Documentation/crypto/async-tx-api.txt
@@ -0,0 +1,219 @@
+ Asynchronous Transfers/Transforms API
+
+1 INTRODUCTION
+
+2 GENEALOGY
+
+3 USAGE
+3.1 General format of the API
+3.2 Supported operations
+3.3 Descriptor management
+3.4 When does the operation execute?
+3.5 When does the operation complete?
+3.6 Constraints
+3.7 Example
+
+4 DRIVER DEVELOPER NOTES
+4.1 Conformance points
+4.2 "My application needs finer control of hardware channels"
+
+5 SOURCE
+
+---
+
+1 INTRODUCTION
+
+The async_tx API provides methods for describing a chain of asynchronous
+bulk memory transfers/transforms with support for inter-transactional
+dependencies. It is implemented as a dmaengine client that smooths over
+the details of different hardware offload engine implementations. Code
+that is written to the API can optimize for asynchronous operation and
+the API will fit the chain of operations to the available offload
+resources.
+
+2 GENEALOGY
+
+The API was initially designed to offload the memory copy and
+xor-parity-calculations of the md-raid5 driver using the offload engines
+present in the Intel(R) Xscale series of I/O processors. It also built
+on the 'dmaengine' layer developed for offloading memory copies in the
+network stack using Intel(R) I/OAT engines. The following design
+features surfaced as a result:
+1/ implicit synchronous path: users of the API do not need to know if
+ the platform they are running on has offload capabilities. The
+ operation will be offloaded when an engine is available and carried out
+ in software otherwise.
+2/ cross channel dependency chains: the API allows a chain of dependent
+ operations to be submitted, like xor->copy->xor in the raid5 case. The
+ API automatically handles cases where the transition from one operation
+ to another implies a hardware channel switch.
+3/ dmaengine extensions to support multiple clients and operation types
+ beyond 'memcpy'
+
+3 USAGE
+
+3.1 General format of the API:
+struct dma_async_tx_descriptor *
+async_<operation>(<op specific parameters>,
+ enum async_tx_flags flags,
+ struct dma_async_tx_descriptor *dependency,
+ dma_async_tx_callback callback_routine,
+ void *callback_parameter);
+
+3.2 Supported operations:
+memcpy - memory copy between a source and a destination buffer
+memset - fill a destination buffer with a byte value
+xor - xor a series of source buffers and write the result to a
+ destination buffer
+xor_zero_sum - xor a series of source buffers and set a flag if the
+ result is zero. The implementation attempts to prevent
+ writes to memory
+
+3.3 Descriptor management:
+The return value is non-NULL and points to a 'descriptor' when the operation
+has been queued to execute asynchronously. Descriptors are recycled
+resources, under control of the offload engine driver, to be reused as
+operations complete. When an application needs to submit a chain of
+operations it must guarantee that the descriptor is not automatically recycled
+before the dependency is submitted. This requires that all descriptors be
+acknowledged by the application before the offload engine driver is allowed to
+recycle (or free) the descriptor. A descriptor can be acked by one of the
+following methods:
+1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted
+2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent
+ descriptor of a new operation.
+3/ calling async_tx_ack() on the descriptor.
+
+3.4 When does the operation execute?
+Operations do not immediately issue after return from the
+async_<operation> call. Offload engine drivers batch operations to
+improve performance by reducing the number of mmio cycles needed to
+manage the channel. Once a driver-specific threshold is met the driver
+automatically issues pending operations. An application can force this
+event by calling async_tx_issue_pending_all(). This operates on all
+channels since the application has no knowledge of channel to operation
+mapping.
+
+3.5 When does the operation complete?
+There are two methods for an application to learn about the completion
+of an operation.
+1/ Call dma_wait_for_async_tx(). This call causes the CPU to spin while
+ it polls for the completion of the operation. It handles dependency
+ chains and issuing pending operations.
+2/ Specify a completion callback. The callback routine runs in tasklet
+ context if the offload engine driver supports interrupts, or it is
+ called in application context if the operation is carried out
+ synchronously in software. The callback can be set in the call to
+ async_<operation>, or when the application needs to submit a chain of
+ unknown length it can use the async_trigger_callback() routine to set a
+ completion interrupt/callback at the end of the chain.
+
+3.6 Constraints:
+1/ Calls to async_<operation> are not permitted in IRQ context. Other
+ contexts are permitted provided constraint #2 is not violated.
+2/ Completion callback routines cannot submit new operations. This
+ results in recursion in the synchronous case and spin_locks being
+ acquired twice in the asynchronous case.
+
+3.7 Example:
+Perform a xor->copy->xor operation where each operation depends on the
+result from the previous operation:
+
+void complete_xor_copy_xor(void *param)
+{
+ printk("complete\n");
+}
+
+int run_xor_copy_xor(struct page **xor_srcs,
+ int xor_src_cnt,
+ struct page *xor_dest,
+ size_t xor_len,
+ struct page *copy_src,
+ struct page *copy_dest,
+ size_t copy_len)
+{
+ struct dma_async_tx_descriptor *tx;
+
+ tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
+ ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL);
+ tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len,
+ ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+ tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
+ ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK,
+ tx, complete_xor_copy_xor, NULL);
+
+ async_tx_issue_pending_all();
+}
+
+See include/linux/async_tx.h for more information on the flags. See the
+ops_run_* and ops_complete_* routines in drivers/md/raid5.c for more
+implementation examples.
+
+4 DRIVER DEVELOPMENT NOTES
+4.1 Conformance points:
+There are a few conformance points required in dmaengine drivers to
+accommodate assumptions made by applications using the async_tx API:
+1/ Completion callbacks are expected to happen in tasklet context
+2/ dma_async_tx_descriptor fields are never manipulated in IRQ context
+3/ Use async_tx_run_dependencies() in the descriptor clean up path to
+ handle submission of dependent operations
+
+4.2 "My application needs finer control of hardware channels"
+This requirement seems to arise from cases where a DMA engine driver is
+trying to support device-to-memory DMA. The dmaengine and async_tx
+implementations were designed for offloading memory-to-memory
+operations; however, there are some capabilities of the dmaengine layer
+that can be used for platform-specific channel management.
+Platform-specific constraints can be handled by registering the
+application as a 'dma_client' and implementing a 'dma_event_callback' to
+apply a filter to the available channels in the system. Before showing
+how to implement a custom dma_event callback some background of
+dmaengine's client support is required.
+
+The following routines in dmaengine support multiple clients requesting
+use of a channel:
+- dma_async_client_register(struct dma_client *client)
+- dma_async_client_chan_request(struct dma_client *client)
+
+dma_async_client_register takes a pointer to an initialized dma_client
+structure. It expects that the 'event_callback' and 'cap_mask' fields
+are already initialized.
+
+dma_async_client_chan_request triggers dmaengine to notify the client of
+all channels that satisfy the capability mask. It is up to the client's
+event_callback routine to track how many channels the client needs and
+how many it is currently using. The dma_event_callback routine returns a
+dma_state_client code to let dmaengine know the status of the
+allocation.
+
+Below is the example of how to extend this functionality for
+platform-specific filtering of the available channels beyond the
+standard capability mask:
+
+static enum dma_state_client
+my_dma_client_callback(struct dma_client *client,
+ struct dma_chan *chan, enum dma_state state)
+{
+ struct dma_device *dma_dev;
+ struct my_platform_specific_dma *plat_dma_dev;
+
+ dma_dev = chan->device;
+ plat_dma_dev = container_of(dma_dev,
+ struct my_platform_specific_dma,
+ dma_dev);
+
+ if (!plat_dma_dev->platform_specific_capability)
+ return DMA_DUP;
+
+ . . .
+}
+
+5 SOURCE
+include/linux/dmaengine.h: core header file for DMA drivers and clients
+drivers/dma/dmaengine.c: offload engine channel management routines
+drivers/dma/: location for offload engine drivers
+include/linux/async_tx.h: core header file for the async_tx api
+crypto/async_tx/async_tx.c: async_tx interface to dmaengine and common code
+crypto/async_tx/async_memcpy.c: copy offload
+crypto/async_tx/async_memset.c: memory fill offload
+crypto/async_tx/async_xor.c: xor and xor zero sum offload
diff --git a/Documentation/crypto/descore-readme.txt b/Documentation/crypto/descore-readme.txt
new file mode 100644
index 0000000..16e9e63
--- /dev/null
+++ b/Documentation/crypto/descore-readme.txt
@@ -0,0 +1,352 @@
+Below is the original README file from the descore.shar package.
+------------------------------------------------------------------------------
+
+des - fast & portable DES encryption & decryption.
+Copyright (C) 1992 Dana L. How
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+Author's address: how@isl.stanford.edu
+
+$Id: README,v 1.15 1992/05/20 00:25:32 how E $
+
+
+==>> To compile after untarring/unsharring, just `make' <<==
+
+
+This package was designed with the following goals:
+1. Highest possible encryption/decryption PERFORMANCE.
+2. PORTABILITY to any byte-addressable host with a 32bit unsigned C type
+3. Plug-compatible replacement for KERBEROS's low-level routines.
+
+This second release includes a number of performance enhancements for
+register-starved machines. My discussions with Richard Outerbridge,
+71755.204@compuserve.com, sparked a number of these enhancements.
+
+To more rapidly understand the code in this package, inspect desSmallFips.i
+(created by typing `make') BEFORE you tackle desCode.h. The latter is set
+up in a parameterized fashion so it can easily be modified by speed-daemon
+hackers in pursuit of that last microsecond. You will find it more
+illuminating to inspect one specific implementation,
+and then move on to the common abstract skeleton with this one in mind.
+
+
+performance comparison to other available des code which i could
+compile on a SPARCStation 1 (cc -O4, gcc -O2):
+
+this code (byte-order independent):
+ 30us per encryption (options: 64k tables, no IP/FP)
+ 33us per encryption (options: 64k tables, FIPS standard bit ordering)
+ 45us per encryption (options: 2k tables, no IP/FP)
+ 48us per encryption (options: 2k tables, FIPS standard bit ordering)
+ 275us to set a new key (uses 1k of key tables)
+ this has the quickest encryption/decryption routines i've seen.
+ since i was interested in fast des filters rather than crypt(3)
+ and password cracking, i haven't really bothered yet to speed up
+ the key setting routine. also, i have no interest in re-implementing
+ all the other junk in the mit kerberos des library, so i've just
+ provided my routines with little stub interfaces so they can be
+ used as drop-in replacements with mit's code or any of the mit-
+ compatible packages below. (note that the first two timings above
+ are highly variable because of cache effects).
+
+kerberos des replacement from australia (version 1.95):
+ 53us per encryption (uses 2k of tables)
+ 96us to set a new key (uses 2.25k of key tables)
+ so despite the author's inclusion of some of the performance
+ improvements i had suggested to him, this package's
+ encryption/decryption is still slower on the sparc and 68000.
+ more specifically, 19-40% slower on the 68020 and 11-35% slower
+ on the sparc, depending on the compiler;
+ in full gory detail (ALT_ECB is a libdes variant):
+ compiler machine desCore libdes ALT_ECB slower by
+ gcc 2.1 -O2 Sun 3/110 304 uS 369.5uS 461.8uS 22%
+ cc -O1 Sun 3/110 336 uS 436.6uS 399.3uS 19%
+ cc -O2 Sun 3/110 360 uS 532.4uS 505.1uS 40%
+ cc -O4 Sun 3/110 365 uS 532.3uS 505.3uS 38%
+ gcc 2.1 -O2 Sun 4/50 48 uS 53.4uS 57.5uS 11%
+ cc -O2 Sun 4/50 48 uS 64.6uS 64.7uS 35%
+ cc -O4 Sun 4/50 48 uS 64.7uS 64.9uS 35%
+ (my time measurements are not as accurate as his).
+ the comments in my first release of desCore on version 1.92:
+ 68us per encryption (uses 2k of tables)
+ 96us to set a new key (uses 2.25k of key tables)
+ this is a very nice package which implements the most important
+ of the optimizations which i did in my encryption routines.
+ it's a bit weak on common low-level optimizations which is why
+ it's 39%-106% slower. because he was interested in fast crypt(3) and
+ password-cracking applications, he also used the same ideas to
+ speed up the key-setting routines with impressive results.
+ (at some point i may do the same in my package). he also implements
+ the rest of the mit des library.
+ (code from eay@psych.psy.uq.oz.au via comp.sources.misc)
+
+fast crypt(3) package from denmark:
+ the des routine here is buried inside a loop to do the
+ crypt function and i didn't feel like ripping it out and measuring
+ performance. his code takes 26 sparc instructions to compute one
+ des iteration; above, Quick (64k) takes 21 and Small (2k) takes 37.
+ he claims to use 280k of tables but the iteration calculation seems
+ to use only 128k. his tables and code are machine independent.
+ (code from glad@daimi.aau.dk via alt.sources or comp.sources.misc)
+
+swedish reimplementation of Kerberos des library
+ 108us per encryption (uses 34k worth of tables)
+ 134us to set a new key (uses 32k of key tables to get this speed!)
+ the tables used seem to be machine-independent;
+ he seems to have included a lot of special case code
+ so that, e.g., `long' loads can be used instead of 4 `char' loads
+ when the machine's architecture allows it.
+ (code obtained from chalmers.se:pub/des)
+
+crack 3.3c package from england:
+ as in crypt above, the des routine is buried in a loop. it's
+ also very modified for crypt. his iteration code uses 16k
+ of tables and appears to be slow.
+ (code obtained from aem@aber.ac.uk via alt.sources or comp.sources.misc)
+
+``highly optimized'' and tweaked Kerberos/Athena code (byte-order dependent):
+ 165us per encryption (uses 6k worth of tables)
+ 478us to set a new key (uses <1k of key tables)
+ so despite the comments in this code, it was possible to get
+ faster code AND smaller tables, as well as making the tables
+ machine-independent.
+ (code obtained from prep.ai.mit.edu)
+
+UC Berkeley code (depends on machine-endedness):
+ 226us per encryption
+10848us to set a new key
+ table sizes are unclear, but they don't look very small
+ (code obtained from wuarchive.wustl.edu)
+
+
+motivation and history
+
+a while ago i wanted some des routines and the routines documented on sun's
+man pages either didn't exist or dumped core. i had heard of kerberos,
+and knew that it used des, so i figured i'd use its routines. but once
+i got it and looked at the code, it really set off a lot of pet peeves -
+it was too convoluted, the code had been written without taking
+advantage of the regular structure of operations such as IP, E, and FP
+(i.e. the author didn't sit down and think before coding),
+it was excessively slow, the author had attempted to clarify the code
+by adding MORE statements to make the data movement more `consistent'
+instead of simplifying his implementation and cutting down on all data
+movement (in particular, his use of L1, R1, L2, R2), and it was full of
+idiotic `tweaks' for particular machines which failed to deliver significant
+speedups but which did obfuscate everything. so i took the test data
+from his verification program and rewrote everything else.
+
+a while later i ran across the great crypt(3) package mentioned above.
+the fact that this guy was computing 2 sboxes per table lookup rather
+than one (and using a MUCH larger table in the process) emboldened me to
+do the same - it was a trivial change from which i had been scared away
+by the larger table size. in his case he didn't realize you don't need to keep
+the working data in TWO forms, one for easy use of half the sboxes in
+indexing, the other for easy use of the other half; instead you can keep
+it in the form for the first half and use a simple rotate to get the other
+half. this means i have (almost) half the data manipulation and half
+the table size. in fairness though he might be encoding something particular
+to crypt(3) in his tables - i didn't check.
+
+i'm glad that i implemented it the way i did, because this C version is
+portable (the ifdef's are performance enhancements) and it is faster
+than versions hand-written in assembly for the sparc!
+
+
+porting notes
+
+one thing i did not want to do was write an enormous mess
+which depended on endedness and other machine quirks,
+and which necessarily produced different code and different lookup tables
+for different machines. see the kerberos code for an example
+of what i didn't want to do; all their endedness-specific `optimizations'
+obfuscate the code and in the end were slower than a simpler machine
+independent approach. however, there are always some portability
+considerations of some kind, and i have included some options
+for varying numbers of register variables.
+perhaps some will still regard the result as a mess!
+
+1) i assume everything is byte addressable, although i don't actually
+ depend on the byte order, and that bytes are 8 bits.
+ i assume word pointers can be freely cast to and from char pointers.
+ note that 99% of C programs make these assumptions.
+ i always use unsigned char's if the high bit could be set.
+2) the typedef `word' means a 32 bit unsigned integral type.
+ if `unsigned long' is not 32 bits, change the typedef in desCore.h.
+ i assume sizeof(word) == 4 EVERYWHERE.
+
+the (worst-case) cost of my NOT doing endedness-specific optimizations
+in the data loading and storing code surrounding the key iterations
+is less than 12%. also, there is the added benefit that
+the input and output work areas do not need to be word-aligned.
+
+
+OPTIONAL performance optimizations
+
+1) you should define one of `i386,' `vax,' `mc68000,' or `sparc,'
+ whichever one is closest to the capabilities of your machine.
+ see the start of desCode.h to see exactly what this selection implies.
+ note that if you select the wrong one, the des code will still work;
+ these are just performance tweaks.
+2) for those with functional `asm' keywords: you should change the
+ ROR and ROL macros to use machine rotate instructions if you have them.
+ this will save 2 instructions and a temporary per use,
+ or about 32 to 40 instructions per en/decryption.
+ note that gcc is smart enough to translate the ROL/R macros into
+ machine rotates!
+
+these optimizations are all rather persnickety, yet with them you should
+be able to get performance equal to assembly-coding, except that:
+1) with the lack of a bit rotate operator in C, rotates have to be synthesized
+ from shifts. so access to `asm' will speed things up if your machine
+ has rotates, as explained above in (3) (not necessary if you use gcc).
+2) if your machine has less than 12 32-bit registers i doubt your compiler will
+ generate good code.
+ `i386' tries to configure the code for a 386 by only declaring 3 registers
+ (it appears that gcc can use ebx, esi and edi to hold register variables).
+ however, if you like assembly coding, the 386 does have 7 32-bit registers,
+ and if you use ALL of them, use `scaled by 8' address modes with displacement
+ and other tricks, you can get reasonable routines for DesQuickCore... with
+ about 250 instructions apiece. For DesSmall... it will help to rearrange
+ des_keymap, i.e., now the sbox # is the high part of the index and
+ the 6 bits of data is the low part; it helps to exchange these.
+ since i have no way to conveniently test it i have not provided my
+ shoehorned 386 version. note that with this release of desCore, gcc is able
+ to put everything in registers(!), and generate about 370 instructions apiece
+ for the DesQuickCore... routines!
+
+coding notes
+
+the en/decryption routines each use 6 necessary register variables,
+with 4 being actively used at once during the inner iterations.
+if you don't have 4 register variables get a new machine.
+up to 8 more registers are used to hold constants in some configurations.
+
+i assume that the use of a constant is more expensive than using a register:
+a) additionally, i have tried to put the larger constants in registers.
+ registering priority was by the following:
+ anything more than 12 bits (bad for RISC and CISC)
+ greater than 127 in value (can't use movq or byte immediate on CISC)
+ 9-127 (may not be able to use CISC shift immediate or add/sub quick),
+ 1-8 were never registered, being the cheapest constants.
+b) the compiler may be too stupid to realize table and table+256 should
+ be assigned to different constant registers and instead repetitively
+ do the arithmetic, so i assign these to explicit `m' register variables
+ when possible and helpful.
+
+i assume that indexing is cheaper or equivalent to auto increment/decrement,
+where the index is 7 bits unsigned or smaller.
+this assumption is reversed for 68k and vax.
+
+i assume that addresses can be cheaply formed from two registers,
+or from a register and a small constant.
+for the 68000, the `two registers and small offset' form is used sparingly.
+all index scaling is done explicitly - no hidden shifts by log2(sizeof).
+
+the code is written so that even a dumb compiler
+should never need more than one hidden temporary,
+increasing the chance that everything will fit in the registers.
+KEEP THIS MORE SUBTLE POINT IN MIND IF YOU REWRITE ANYTHING.
+(actually, there are some code fragments now which do require two temps,
+but fixing it would either break the structure of the macros or
+require declaring another temporary).
+
+
+special efficient data format
+
+bits are manipulated in this arrangement most of the time (S7 S5 S3 S1):
+ 003130292827xxxx242322212019xxxx161514131211xxxx080706050403xxxx
+(the x bits are still there, i'm just emphasizing where the S boxes are).
+bits are rotated left 4 when computing S6 S4 S2 S0:
+ 282726252423xxxx201918171615xxxx121110090807xxxx040302010031xxxx
+the rightmost two bits are usually cleared so the lower byte can be used
+as an index into an sbox mapping table. the next two x'd bits are set
+to various values to access different parts of the tables.
+
+
+how to use the routines
+
+datatypes:
+ pointer to 8 byte area of type DesData
+ used to hold keys and input/output blocks to des.
+
+ pointer to 128 byte area of type DesKeys
+ used to hold full 768-bit key.
+ must be long-aligned.
+
+DesQuickInit()
+ call this before using any other routine with `Quick' in its name.
+ it generates the special 64k table these routines need.
+DesQuickDone()
+ frees this table
+
+DesMethod(m, k)
+ m points to a 128byte block, k points to an 8 byte des key
+ which must have odd parity (or -1 is returned) and which must
+ not be a (semi-)weak key (or -2 is returned).
+ normally DesMethod() returns 0.
+ m is filled in from k so that when one of the routines below
+ is called with m, the routine will act like standard des
+ en/decryption with the key k. if you use DesMethod,
+ you supply a standard 56bit key; however, if you fill in
+ m yourself, you will get a 768bit key - but then it won't
+ be standard. it's 768bits not 1024 because the least significant
+ two bits of each byte are not used. note that these two bits
+ will be set to magic constants which speed up the encryption/decryption
+ on some machines. and yes, each byte controls
+ a specific sbox during a specific iteration.
+ you really shouldn't use the 768bit format directly; i should
+ provide a routine that converts 128 6-bit bytes (specified in
+ S-box mapping order or something) into the right format for you.
+ this would entail some byte concatenation and rotation.
+
+Des{Small|Quick}{Fips|Core}{Encrypt|Decrypt}(d, m, s)
+ performs des on the 8 bytes at s into the 8 bytes at d. (d,s: char *).
+ uses m as a 768bit key as explained above.
+ the Encrypt|Decrypt choice is obvious.
+ Fips|Core determines whether a completely standard FIPS initial
+ and final permutation is done; if not, then the data is loaded
+ and stored in a nonstandard bit order (FIPS w/o IP/FP).
+ Fips slows down Quick by 10%, Small by 9%.
+ Small|Quick determines whether you use the normal routine
+ or the crazy quick one which gobbles up 64k more of memory.
+ Small is 50% slower then Quick, but Quick needs 32 times as much
+ memory. Quick is included for programs that do nothing but DES,
+ e.g., encryption filters, etc.
+
+
+Getting it to compile on your machine
+
+there are no machine-dependencies in the code (see porting),
+except perhaps the `now()' macro in desTest.c.
+ALL generated tables are machine independent.
+you should edit the Makefile with the appropriate optimization flags
+for your compiler (MAX optimization).
+
+
+Speeding up kerberos (and/or its des library)
+
+note that i have included a kerberos-compatible interface in desUtil.c
+through the functions des_key_sched() and des_ecb_encrypt().
+to use these with kerberos or kerberos-compatible code put desCore.a
+ahead of the kerberos-compatible library on your linker's command line.
+you should not need to #include desCore.h; just include the header
+file provided with the kerberos library.
+
+Other uses
+
+the macros in desCode.h would be very useful for putting inline des
+functions in more complicated encryption routines.
diff --git a/Documentation/dcdbas.txt b/Documentation/dcdbas.txt
new file mode 100644
index 0000000..e1c52e2
--- /dev/null
+++ b/Documentation/dcdbas.txt
@@ -0,0 +1,91 @@
+Overview
+
+The Dell Systems Management Base Driver provides a sysfs interface for
+systems management software such as Dell OpenManage to perform system
+management interrupts and host control actions (system power cycle or
+power off after OS shutdown) on certain Dell systems.
+
+Dell OpenManage requires this driver on the following Dell PowerEdge systems:
+300, 1300, 1400, 400SC, 500SC, 1500SC, 1550, 600SC, 1600SC, 650, 1655MC,
+700, and 750. Other Dell software such as the open source libsmbios project
+is expected to make use of this driver, and it may include the use of this
+driver on other Dell systems.
+
+The Dell libsmbios project aims towards providing access to as much BIOS
+information as possible. See http://linux.dell.com/libsmbios/main/ for
+more information about the libsmbios project.
+
+
+System Management Interrupt
+
+On some Dell systems, systems management software must access certain
+management information via a system management interrupt (SMI). The SMI data
+buffer must reside in 32-bit address space, and the physical address of the
+buffer is required for the SMI. The driver maintains the memory required for
+the SMI and provides a way for the application to generate the SMI.
+The driver creates the following sysfs entries for systems management
+software to perform these system management interrupts:
+
+/sys/devices/platform/dcdbas/smi_data
+/sys/devices/platform/dcdbas/smi_data_buf_phys_addr
+/sys/devices/platform/dcdbas/smi_data_buf_size
+/sys/devices/platform/dcdbas/smi_request
+
+Systems management software must perform the following steps to execute
+a SMI using this driver:
+
+1) Lock smi_data.
+2) Write system management command to smi_data.
+3) Write "1" to smi_request to generate a calling interface SMI or
+ "2" to generate a raw SMI.
+4) Read system management command response from smi_data.
+5) Unlock smi_data.
+
+
+Host Control Action
+
+Dell OpenManage supports a host control feature that allows the administrator
+to perform a power cycle or power off of the system after the OS has finished
+shutting down. On some Dell systems, this host control feature requires that
+a driver perform a SMI after the OS has finished shutting down.
+
+The driver creates the following sysfs entries for systems management software
+to schedule the driver to perform a power cycle or power off host control
+action after the system has finished shutting down:
+
+/sys/devices/platform/dcdbas/host_control_action
+/sys/devices/platform/dcdbas/host_control_smi_type
+/sys/devices/platform/dcdbas/host_control_on_shutdown
+
+Dell OpenManage performs the following steps to execute a power cycle or
+power off host control action using this driver:
+
+1) Write host control action to be performed to host_control_action.
+2) Write type of SMI that driver needs to perform to host_control_smi_type.
+3) Write "1" to host_control_on_shutdown to enable host control action.
+4) Initiate OS shutdown.
+ (Driver will perform host control SMI when it is notified that the OS
+ has finished shutting down.)
+
+
+Host Control SMI Type
+
+The following table shows the value to write to host_control_smi_type to
+perform a power cycle or power off host control action:
+
+PowerEdge System Host Control SMI Type
+---------------- ---------------------
+ 300 HC_SMITYPE_TYPE1
+ 1300 HC_SMITYPE_TYPE1
+ 1400 HC_SMITYPE_TYPE2
+ 500SC HC_SMITYPE_TYPE2
+ 1500SC HC_SMITYPE_TYPE2
+ 1550 HC_SMITYPE_TYPE2
+ 600SC HC_SMITYPE_TYPE2
+ 1600SC HC_SMITYPE_TYPE2
+ 650 HC_SMITYPE_TYPE2
+ 1655MC HC_SMITYPE_TYPE2
+ 700 HC_SMITYPE_TYPE3
+ 750 HC_SMITYPE_TYPE3
+
+
diff --git a/Documentation/debugging-modules.txt b/Documentation/debugging-modules.txt
new file mode 100644
index 0000000..172ad4a
--- /dev/null
+++ b/Documentation/debugging-modules.txt
@@ -0,0 +1,22 @@
+Debugging Modules after 2.6.3
+-----------------------------
+
+In almost all distributions, the kernel asks for modules which don't
+exist, such as "net-pf-10" or whatever. Changing "modprobe -q" to
+"succeed" in this case is hacky and breaks some setups, and also we
+want to know if it failed for the fallback code for old aliases in
+fs/char_dev.c, for example.
+
+In the past a debugging message which would fill people's logs was
+emitted. This debugging message has been removed. The correct way
+of debugging module problems is something like this:
+
+echo '#! /bin/sh' > /tmp/modprobe
+echo 'echo "$@" >> /tmp/modprobe.log' >> /tmp/modprobe
+echo 'exec /sbin/modprobe "$@"' >> /tmp/modprobe
+chmod a+x /tmp/modprobe
+echo /tmp/modprobe > /proc/sys/kernel/modprobe
+
+Note that the above applies only when the *kernel* is requesting
+that the module be loaded -- it won't have any effect if that module
+is being loaded explicitly using "modprobe" from userspace.
diff --git a/Documentation/debugging-via-ohci1394.txt b/Documentation/debugging-via-ohci1394.txt
new file mode 100644
index 0000000..59a91e5
--- /dev/null
+++ b/Documentation/debugging-via-ohci1394.txt
@@ -0,0 +1,184 @@
+
+ Using physical DMA provided by OHCI-1394 FireWire controllers for debugging
+ ---------------------------------------------------------------------------
+
+Introduction
+------------
+
+Basically all FireWire controllers which are in use today are compliant
+to the OHCI-1394 specification which defines the controller to be a PCI
+bus master which uses DMA to offload data transfers from the CPU and has
+a "Physical Response Unit" which executes specific requests by employing
+PCI-Bus master DMA after applying filters defined by the OHCI-1394 driver.
+
+Once properly configured, remote machines can send these requests to
+ask the OHCI-1394 controller to perform read and write requests on
+physical system memory and, for read requests, send the result of
+the physical memory read back to the requester.
+
+With that, it is possible to debug issues by reading interesting memory
+locations such as buffers like the printk buffer or the process table.
+
+Retrieving a full system memory dump is also possible over the FireWire,
+using data transfer rates in the order of 10MB/s or more.
+
+Memory access is currently limited to the low 4G of physical address
+space which can be a problem on IA64 machines where memory is located
+mostly above that limit, but it is rarely a problem on more common
+hardware such as hardware based on x86, x86-64 and PowerPC.
+
+Together with a early initialization of the OHCI-1394 controller for debugging,
+this facility proved most useful for examining long debugs logs in the printk
+buffer on to debug early boot problems in areas like ACPI where the system
+fails to boot and other means for debugging (serial port) are either not
+available (notebooks) or too slow for extensive debug information (like ACPI).
+
+Drivers
+-------
+
+The ohci1394 driver in drivers/ieee1394 initializes the OHCI-1394 controllers
+to a working state and enables physical DMA by default for all remote nodes.
+This can be turned off by ohci1394's module parameter phys_dma=0.
+
+The alternative firewire-ohci driver in drivers/firewire uses filtered physical
+DMA by default, which is more secure but not suitable for remote debugging.
+Compile the driver with CONFIG_FIREWIRE_OHCI_REMOTE_DMA (Kernel hacking menu:
+Remote debugging over FireWire with firewire-ohci) to get unfiltered physical
+DMA.
+
+Because ohci1394 and firewire-ohci depend on the PCI enumeration to be
+completed, an initialization routine which runs pretty early has been
+implemented for x86. This routine runs long before console_init() can be
+called, i.e. before the printk buffer appears on the console.
+
+To activate it, enable CONFIG_PROVIDE_OHCI1394_DMA_INIT (Kernel hacking menu:
+Remote debugging over FireWire early on boot) and pass the parameter
+"ohci1394_dma=early" to the recompiled kernel on boot.
+
+Tools
+-----
+
+firescope - Originally developed by Benjamin Herrenschmidt, Andi Kleen ported
+it from PowerPC to x86 and x86_64 and added functionality, firescope can now
+be used to view the printk buffer of a remote machine, even with live update.
+
+Bernhard Kaindl enhanced firescope to support accessing 64-bit machines
+from 32-bit firescope and vice versa:
+- ftp://ftp.suse.de/private/bk/firewire/tools/firescope-0.2.2.tar.bz2
+
+and he implemented fast system dump (alpha version - read README.txt):
+- ftp://ftp.suse.de/private/bk/firewire/tools/firedump-0.1.tar.bz2
+
+There is also a gdb proxy for firewire which allows to use gdb to access
+data which can be referenced from symbols found by gdb in vmlinux:
+- ftp://ftp.suse.de/private/bk/firewire/tools/fireproxy-0.33.tar.bz2
+
+The latest version of this gdb proxy (fireproxy-0.34) can communicate (not
+yet stable) with kgdb over an memory-based communication module (kgdbom).
+
+Getting Started
+---------------
+
+The OHCI-1394 specification regulates that the OHCI-1394 controller must
+disable all physical DMA on each bus reset.
+
+This means that if you want to debug an issue in a system state where
+interrupts are disabled and where no polling of the OHCI-1394 controller
+for bus resets takes place, you have to establish any FireWire cable
+connections and fully initialize all FireWire hardware __before__ the
+system enters such state.
+
+Step-by-step instructions for using firescope with early OHCI initialization:
+
+1) Verify that your hardware is supported:
+
+ Load the ohci1394 or the fw-ohci module and check your kernel logs.
+ You should see a line similar to
+
+ ohci1394: fw-host0: OHCI-1394 1.1 (PCI): IRQ=[18] MMIO=[fe9ff800-fe9fffff]
+ ... Max Packet=[2048] IR/IT contexts=[4/8]
+
+ when loading the driver. If you have no supported controller, many PCI,
+ CardBus and even some Express cards which are fully compliant to OHCI-1394
+ specification are available. If it requires no driver for Windows operating
+ systems, it most likely is. Only specialized shops have cards which are not
+ compliant, they are based on TI PCILynx chips and require drivers for Win-
+ dows operating systems.
+
+2) Establish a working FireWire cable connection:
+
+ Any FireWire cable, as long at it provides electrically and mechanically
+ stable connection and has matching connectors (there are small 4-pin and
+ large 6-pin FireWire ports) will do.
+
+ If an driver is running on both machines you should see a line like
+
+ ieee1394: Node added: ID:BUS[0-01:1023] GUID[0090270001b84bba]
+
+ on both machines in the kernel log when the cable is plugged in
+ and connects the two machines.
+
+3) Test physical DMA using firescope:
+
+ On the debug host,
+ - load the raw1394 module,
+ - make sure that /dev/raw1394 is accessible,
+ then start firescope:
+
+ $ firescope
+ Port 0 (ohci1394) opened, 2 nodes detected
+
+ FireScope
+ ---------
+ Target : <unspecified>
+ Gen : 1
+ [Ctrl-T] choose target
+ [Ctrl-H] this menu
+ [Ctrl-Q] quit
+
+ ------> Press Ctrl-T now, the output should be similar to:
+
+ 2 nodes available, local node is: 0
+ 0: ffc0, uuid: 00000000 00000000 [LOCAL]
+ 1: ffc1, uuid: 00279000 ba4bb801
+
+ Besides the [LOCAL] node, it must show another node without error message.
+
+4) Prepare for debugging with early OHCI-1394 initialization:
+
+ 4.1) Kernel compilation and installation on debug target
+
+ Compile the kernel to be debugged with CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ (Kernel hacking: Provide code for enabling DMA over FireWire early on boot)
+ enabled and install it on the machine to be debugged (debug target).
+
+ 4.2) Transfer the System.map of the debugged kernel to the debug host
+
+ Copy the System.map of the kernel be debugged to the debug host (the host
+ which is connected to the debugged machine over the FireWire cable).
+
+5) Retrieving the printk buffer contents:
+
+ With the FireWire cable connected, the OHCI-1394 driver on the debugging
+ host loaded, reboot the debugged machine, booting the kernel which has
+ CONFIG_PROVIDE_OHCI1394_DMA_INIT enabled, with the option ohci1394_dma=early.
+
+ Then, on the debugging host, run firescope, for example by using -A:
+
+ firescope -A System.map-of-debug-target-kernel
+
+ Note: -A automatically attaches to the first non-local node. It only works
+ reliably if only connected two machines are connected using FireWire.
+
+ After having attached to the debug target, press Ctrl-D to view the
+ complete printk buffer or Ctrl-U to enter auto update mode and get an
+ updated live view of recent kernel messages logged on the debug target.
+
+ Call "firescope -h" to get more information on firescope's options.
+
+Notes
+-----
+Documentation and specifications: ftp://ftp.suse.de/private/bk/firewire/docs
+
+FireWire is a trademark of Apple Inc. - for more information please refer to:
+http://en.wikipedia.org/wiki/FireWire
diff --git a/Documentation/dell_rbu.txt b/Documentation/dell_rbu.txt
new file mode 100644
index 0000000..2c0d631
--- /dev/null
+++ b/Documentation/dell_rbu.txt
@@ -0,0 +1,100 @@
+Purpose:
+Demonstrate the usage of the new open sourced rbu (Remote BIOS Update) driver
+for updating BIOS images on Dell servers and desktops.
+
+Scope:
+This document discusses the functionality of the rbu driver only.
+It does not cover the support needed from applications to enable the BIOS to
+update itself with the image downloaded in to the memory.
+
+Overview:
+This driver works with Dell OpenManage or Dell Update Packages for updating
+the BIOS on Dell servers (starting from servers sold since 1999), desktops
+and notebooks (starting from those sold in 2005).
+Please go to http://support.dell.com register and you can find info on
+OpenManage and Dell Update packages (DUP).
+Libsmbios can also be used to update BIOS on Dell systems go to
+http://linux.dell.com/libsmbios/ for details.
+
+Dell_RBU driver supports BIOS update using the monolithic image and packetized
+image methods. In case of monolithic the driver allocates a contiguous chunk
+of physical pages having the BIOS image. In case of packetized the app
+using the driver breaks the image in to packets of fixed sizes and the driver
+would place each packet in contiguous physical memory. The driver also
+maintains a link list of packets for reading them back.
+If the dell_rbu driver is unloaded all the allocated memory is freed.
+
+The rbu driver needs to have an application (as mentioned above)which will
+inform the BIOS to enable the update in the next system reboot.
+
+The user should not unload the rbu driver after downloading the BIOS image
+or updating.
+
+The driver load creates the following directories under the /sys file system.
+/sys/class/firmware/dell_rbu/loading
+/sys/class/firmware/dell_rbu/data
+/sys/devices/platform/dell_rbu/image_type
+/sys/devices/platform/dell_rbu/data
+/sys/devices/platform/dell_rbu/packet_size
+
+The driver supports two types of update mechanism; monolithic and packetized.
+These update mechanism depends upon the BIOS currently running on the system.
+Most of the Dell systems support a monolithic update where the BIOS image is
+copied to a single contiguous block of physical memory.
+In case of packet mechanism the single memory can be broken in smaller chunks
+of contiguous memory and the BIOS image is scattered in these packets.
+
+By default the driver uses monolithic memory for the update type. This can be
+changed to packets during the driver load time by specifying the load
+parameter image_type=packet. This can also be changed later as below
+echo packet > /sys/devices/platform/dell_rbu/image_type
+
+In packet update mode the packet size has to be given before any packets can
+be downloaded. It is done as below
+echo XXXX > /sys/devices/platform/dell_rbu/packet_size
+In the packet update mechanism, the user needs to create a new file having
+packets of data arranged back to back. It can be done as follows
+The user creates packets header, gets the chunk of the BIOS image and
+places it next to the packetheader; now, the packetheader + BIOS image chunk
+added together should match the specified packet_size. This makes one
+packet, the user needs to create more such packets out of the entire BIOS
+image file and then arrange all these packets back to back in to one single
+file.
+This file is then copied to /sys/class/firmware/dell_rbu/data.
+Once this file gets to the driver, the driver extracts packet_size data from
+the file and spreads it accross the physical memory in contiguous packet_sized
+space.
+This method makes sure that all the packets get to the driver in a single operation.
+
+In monolithic update the user simply get the BIOS image (.hdr file) and copies
+to the data file as is without any change to the BIOS image itself.
+
+Do the steps below to download the BIOS image.
+1) echo 1 > /sys/class/firmware/dell_rbu/loading
+2) cp bios_image.hdr /sys/class/firmware/dell_rbu/data
+3) echo 0 > /sys/class/firmware/dell_rbu/loading
+
+The /sys/class/firmware/dell_rbu/ entries will remain till the following is
+done.
+echo -1 > /sys/class/firmware/dell_rbu/loading.
+Until this step is completed the driver cannot be unloaded.
+Also echoing either mono ,packet or init in to image_type will free up the
+memory allocated by the driver.
+
+If an user by accident executes steps 1 and 3 above without executing step 2;
+it will make the /sys/class/firmware/dell_rbu/ entries to disappear.
+The entries can be recreated by doing the following
+echo init > /sys/devices/platform/dell_rbu/image_type
+NOTE: echoing init in image_type does not change it original value.
+
+Also the driver provides /sys/devices/platform/dell_rbu/data readonly file to
+read back the image downloaded.
+
+NOTE:
+This driver requires a patch for firmware_class.c which has the modified
+request_firmware_nowait function.
+Also after updating the BIOS image a user mode application needs to execute
+code which sends the BIOS update request to the BIOS. So on the next reboot
+the BIOS knows about the new image downloaded and it updates itself.
+Also don't unload the rbu driver if the image has to be updated.
+
diff --git a/Documentation/development-process/1.Intro b/Documentation/development-process/1.Intro
new file mode 100644
index 0000000..8cc2cba
--- /dev/null
+++ b/Documentation/development-process/1.Intro
@@ -0,0 +1,274 @@
+1: A GUIDE TO THE KERNEL DEVELOPMENT PROCESS
+
+The purpose of this document is to help developers (and their managers)
+work with the development community with a minimum of frustration. It is
+an attempt to document how this community works in a way which is
+accessible to those who are not intimately familiar with Linux kernel
+development (or, indeed, free software development in general). While
+there is some technical material here, this is very much a process-oriented
+discussion which does not require a deep knowledge of kernel programming to
+understand.
+
+
+1.1: EXECUTIVE SUMMARY
+
+The rest of this section covers the scope of the kernel development process
+and the kinds of frustrations that developers and their employers can
+encounter there. There are a great many reasons why kernel code should be
+merged into the official ("mainline") kernel, including automatic
+availability to users, community support in many forms, and the ability to
+influence the direction of kernel development. Code contributed to the
+Linux kernel must be made available under a GPL-compatible license.
+
+Section 2 introduces the development process, the kernel release cycle, and
+the mechanics of the merge window. The various phases in the patch
+development, review, and merging cycle are covered. There is some
+discussion of tools and mailing lists. Developers wanting to get started
+with kernel development are encouraged to track down and fix bugs as an
+initial exercise.
+
+Section 3 covers early-stage project planning, with an emphasis on
+involving the development community as soon as possible.
+
+Section 4 is about the coding process; several pitfalls which have been
+encountered by other developers are discussed. Some requirements for
+patches are covered, and there is an introduction to some of the tools
+which can help to ensure that kernel patches are correct.
+
+Section 5 talks about the process of posting patches for review. To be
+taken seriously by the development community, patches must be properly
+formatted and described, and they must be sent to the right place.
+Following the advice in this section should help to ensure the best
+possible reception for your work.
+
+Section 6 covers what happens after posting patches; the job is far from
+done at that point. Working with reviewers is a crucial part of the
+development process; this section offers a number of tips on how to avoid
+problems at this important stage. Developers are cautioned against
+assuming that the job is done when a patch is merged into the mainline.
+
+Section 7 introduces a couple of "advanced" topics: managing patches with
+git and reviewing patches posted by others.
+
+Section 8 concludes the document with pointers to sources for more
+information on kernel development.
+
+
+1.2: WHAT THIS DOCUMENT IS ABOUT
+
+The Linux kernel, at over 6 million lines of code and well over 1000 active
+contributors, is one of the largest and most active free software projects
+in existence. Since its humble beginning in 1991, this kernel has evolved
+into a best-of-breed operating system component which runs on pocket-sized
+digital music players, desktop PCs, the largest supercomputers in
+existence, and all types of systems in between. It is a robust, efficient,
+and scalable solution for almost any situation.
+
+With the growth of Linux has come an increase in the number of developers
+(and companies) wishing to participate in its development. Hardware
+vendors want to ensure that Linux supports their products well, making
+those products attractive to Linux users. Embedded systems vendors, who
+use Linux as a component in an integrated product, want Linux to be as
+capable and well-suited to the task at hand as possible. Distributors and
+other software vendors who base their products on Linux have a clear
+interest in the capabilities, performance, and reliability of the Linux
+kernel. And end users, too, will often wish to change Linux to make it
+better suit their needs.
+
+One of the most compelling features of Linux is that it is accessible to
+these developers; anybody with the requisite skills can improve Linux and
+influence the direction of its development. Proprietary products cannot
+offer this kind of openness, which is a characteristic of the free software
+process. But, if anything, the kernel is even more open than most other
+free software projects. A typical three-month kernel development cycle can
+involve over 1000 developers working for more than 100 different companies
+(or for no company at all).
+
+Working with the kernel development community is not especially hard. But,
+that notwithstanding, many potential contributors have experienced
+difficulties when trying to do kernel work. The kernel community has
+evolved its own distinct ways of operating which allow it to function
+smoothly (and produce a high-quality product) in an environment where
+thousands of lines of code are being changed every day. So it is not
+surprising that Linux kernel development process differs greatly from
+proprietary development methods.
+
+The kernel's development process may come across as strange and
+intimidating to new developers, but there are good reasons and solid
+experience behind it. A developer who does not understand the kernel
+community's ways (or, worse, who tries to flout or circumvent them) will
+have a frustrating experience in store. The development community, while
+being helpful to those who are trying to learn, has little time for those
+who will not listen or who do not care about the development process.
+
+It is hoped that those who read this document will be able to avoid that
+frustrating experience. There is a lot of material here, but the effort
+involved in reading it will be repaid in short order. The development
+community is always in need of developers who will help to make the kernel
+better; the following text should help you - or those who work for you -
+join our community.
+
+
+1.3: CREDITS
+
+This document was written by Jonathan Corbet, corbet@lwn.net. It has been
+improved by comments from Johannes Berg, James Berry, Alex Chiang, Roland
+Dreier, Randy Dunlap, Jake Edge, Jiri Kosina, Matt Mackall, Arthur Marsh,
+Amanda McPherson, Andrew Morton, Andrew Price, Tsugikazu Shibata, and
+Jochen Voß.
+
+This work was supported by the Linux Foundation; thanks especially to
+Amanda McPherson, who saw the value of this effort and made it all happen.
+
+
+1.4: THE IMPORTANCE OF GETTING CODE INTO THE MAINLINE
+
+Some companies and developers occasionally wonder why they should bother
+learning how to work with the kernel community and get their code into the
+mainline kernel (the "mainline" being the kernel maintained by Linus
+Torvalds and used as a base by Linux distributors). In the short term,
+contributing code can look like an avoidable expense; it seems easier to
+just keep the code separate and support users directly. The truth of the
+matter is that keeping code separate ("out of tree") is a false economy.
+
+As a way of illustrating the costs of out-of-tree code, here are a few
+relevant aspects of the kernel development process; most of these will be
+discussed in greater detail later in this document. Consider:
+
+- Code which has been merged into the mainline kernel is available to all
+ Linux users. It will automatically be present on all distributions which
+ enable it. There is no need for driver disks, downloads, or the hassles
+ of supporting multiple versions of multiple distributions; it all just
+ works, for the developer and for the user. Incorporation into the
+ mainline solves a large number of distribution and support problems.
+
+- While kernel developers strive to maintain a stable interface to user
+ space, the internal kernel API is in constant flux. The lack of a stable
+ internal interface is a deliberate design decision; it allows fundamental
+ improvements to be made at any time and results in higher-quality code.
+ But one result of that policy is that any out-of-tree code requires
+ constant upkeep if it is to work with new kernels. Maintaining
+ out-of-tree code requires significant amounts of work just to keep that
+ code working.
+
+ Code which is in the mainline, instead, does not require this work as the
+ result of a simple rule requiring any developer who makes an API change
+ to also fix any code that breaks as the result of that change. So code
+ which has been merged into the mainline has significantly lower
+ maintenance costs.
+
+- Beyond that, code which is in the kernel will often be improved by other
+ developers. Surprising results can come from empowering your user
+ community and customers to improve your product.
+
+- Kernel code is subjected to review, both before and after merging into
+ the mainline. No matter how strong the original developer's skills are,
+ this review process invariably finds ways in which the code can be
+ improved. Often review finds severe bugs and security problems. This is
+ especially true for code which has been developed in a closed
+ environment; such code benefits strongly from review by outside
+ developers. Out-of-tree code is lower-quality code.
+
+- Participation in the development process is your way to influence the
+ direction of kernel development. Users who complain from the sidelines
+ are heard, but active developers have a stronger voice - and the ability
+ to implement changes which make the kernel work better for their needs.
+
+- When code is maintained separately, the possibility that a third party
+ will contribute a different implementation of a similar feature always
+ exists. Should that happen, getting your code merged will become much
+ harder - to the point of impossibility. Then you will be faced with the
+ unpleasant alternatives of either (1) maintaining a nonstandard feature
+ out of tree indefinitely, or (2) abandoning your code and migrating your
+ users over to the in-tree version.
+
+- Contribution of code is the fundamental action which makes the whole
+ process work. By contributing your code you can add new functionality to
+ the kernel and provide capabilities and examples which are of use to
+ other kernel developers. If you have developed code for Linux (or are
+ thinking about doing so), you clearly have an interest in the continued
+ success of this platform; contributing code is one of the best ways to
+ help ensure that success.
+
+All of the reasoning above applies to any out-of-tree kernel code,
+including code which is distributed in proprietary, binary-only form.
+There are, however, additional factors which should be taken into account
+before considering any sort of binary-only kernel code distribution. These
+include:
+
+- The legal issues around the distribution of proprietary kernel modules
+ are cloudy at best; quite a few kernel copyright holders believe that
+ most binary-only modules are derived products of the kernel and that, as
+ a result, their distribution is a violation of the GNU General Public
+ license (about which more will be said below). Your author is not a
+ lawyer, and nothing in this document can possibly be considered to be
+ legal advice. The true legal status of closed-source modules can only be
+ determined by the courts. But the uncertainty which haunts those modules
+ is there regardless.
+
+- Binary modules greatly increase the difficulty of debugging kernel
+ problems, to the point that most kernel developers will not even try. So
+ the distribution of binary-only modules will make it harder for your
+ users to get support from the community.
+
+- Support is also harder for distributors of binary-only modules, who must
+ provide a version of the module for every distribution and every kernel
+ version they wish to support. Dozens of builds of a single module can
+ be required to provide reasonably comprehensive coverage, and your users
+ will have to upgrade your module separately every time they upgrade their
+ kernel.
+
+- Everything that was said above about code review applies doubly to
+ closed-source code. Since this code is not available at all, it cannot
+ have been reviewed by the community and will, beyond doubt, have serious
+ problems.
+
+Makers of embedded systems, in particular, may be tempted to disregard much
+of what has been said in this section in the belief that they are shipping
+a self-contained product which uses a frozen kernel version and requires no
+more development after its release. This argument misses the value of
+widespread code review and the value of allowing your users to add
+capabilities to your product. But these products, too, have a limited
+commercial life, after which a new version must be released. At that
+point, vendors whose code is in the mainline and well maintained will be
+much better positioned to get the new product ready for market quickly.
+
+
+1.5: LICENSING
+
+Code is contributed to the Linux kernel under a number of licenses, but all
+code must be compatible with version 2 of the GNU General Public License
+(GPLv2), which is the license covering the kernel distribution as a whole.
+In practice, that means that all code contributions are covered either by
+GPLv2 (with, optionally, language allowing distribution under later
+versions of the GPL) or the three-clause BSD license. Any contributions
+which are not covered by a compatible license will not be accepted into the
+kernel.
+
+Copyright assignments are not required (or requested) for code contributed
+to the kernel. All code merged into the mainline kernel retains its
+original ownership; as a result, the kernel now has thousands of owners.
+
+One implication of this ownership structure is that any attempt to change
+the licensing of the kernel is doomed to almost certain failure. There are
+few practical scenarios where the agreement of all copyright holders could
+be obtained (or their code removed from the kernel). So, in particular,
+there is no prospect of a migration to version 3 of the GPL in the
+foreseeable future.
+
+It is imperative that all code contributed to the kernel be legitimately
+free software. For that reason, code from anonymous (or pseudonymous)
+contributors will not be accepted. All contributors are required to "sign
+off" on their code, stating that the code can be distributed with the
+kernel under the GPL. Code which has not been licensed as free software by
+its owner, or which risks creating copyright-related problems for the
+kernel (such as code which derives from reverse-engineering efforts lacking
+proper safeguards) cannot be contributed.
+
+Questions about copyright-related issues are common on Linux development
+mailing lists. Such questions will normally receive no shortage of
+answers, but one should bear in mind that the people answering those
+questions are not lawyers and cannot provide legal advice. If you have
+legal questions relating to Linux source code, there is no substitute for
+talking with a lawyer who understands this field. Relying on answers
+obtained on technical mailing lists is a risky affair.
diff --git a/Documentation/development-process/2.Process b/Documentation/development-process/2.Process
new file mode 100644
index 0000000..d750321
--- /dev/null
+++ b/Documentation/development-process/2.Process
@@ -0,0 +1,459 @@
+2: HOW THE DEVELOPMENT PROCESS WORKS
+
+Linux kernel development in the early 1990's was a pretty loose affair,
+with relatively small numbers of users and developers involved. With a
+user base in the millions and with some 2,000 developers involved over the
+course of one year, the kernel has since had to evolve a number of
+processes to keep development happening smoothly. A solid understanding of
+how the process works is required in order to be an effective part of it.
+
+
+2.1: THE BIG PICTURE
+
+The kernel developers use a loosely time-based release process, with a new
+major kernel release happening every two or three months. The recent
+release history looks like this:
+
+ 2.6.26 July 13, 2008
+ 2.6.25 April 16, 2008
+ 2.6.24 January 24, 2008
+ 2.6.23 October 9, 2007
+ 2.6.22 July 8, 2007
+ 2.6.21 April 25, 2007
+ 2.6.20 February 4, 2007
+
+Every 2.6.x release is a major kernel release with new features, internal
+API changes, and more. A typical 2.6 release can contain over 10,000
+changesets with changes to several hundred thousand lines of code. 2.6 is
+thus the leading edge of Linux kernel development; the kernel uses a
+rolling development model which is continually integrating major changes.
+
+A relatively straightforward discipline is followed with regard to the
+merging of patches for each release. At the beginning of each development
+cycle, the "merge window" is said to be open. At that time, code which is
+deemed to be sufficiently stable (and which is accepted by the development
+community) is merged into the mainline kernel. The bulk of changes for a
+new development cycle (and all of the major changes) will be merged during
+this time, at a rate approaching 1,000 changes ("patches," or "changesets")
+per day.
+
+(As an aside, it is worth noting that the changes integrated during the
+merge window do not come out of thin air; they have been collected, tested,
+and staged ahead of time. How that process works will be described in
+detail later on).
+
+The merge window lasts for two weeks. At the end of this time, Linus
+Torvalds will declare that the window is closed and release the first of
+the "rc" kernels. For the kernel which is destined to be 2.6.26, for
+example, the release which happens at the end of the merge window will be
+called 2.6.26-rc1. The -rc1 release is the signal that the time to merge
+new features has passed, and that the time to stabilize the next kernel has
+begun.
+
+Over the next six to ten weeks, only patches which fix problems should be
+submitted to the mainline. On occasion a more significant change will be
+allowed, but such occasions are rare; developers who try to merge new
+features outside of the merge window tend to get an unfriendly reception.
+As a general rule, if you miss the merge window for a given feature, the
+best thing to do is to wait for the next development cycle. (An occasional
+exception is made for drivers for previously-unsupported hardware; if they
+touch no in-tree code, they cannot cause regressions and should be safe to
+add at any time).
+
+As fixes make their way into the mainline, the patch rate will slow over
+time. Linus releases new -rc kernels about once a week; a normal series
+will get up to somewhere between -rc6 and -rc9 before the kernel is
+considered to be sufficiently stable and the final 2.6.x release is made.
+At that point the whole process starts over again.
+
+As an example, here is how the 2.6.25 development cycle went (all dates in
+2008):
+
+ January 24 2.6.24 stable release
+ February 10 2.6.25-rc1, merge window closes
+ February 15 2.6.25-rc2
+ February 24 2.6.25-rc3
+ March 4 2.6.25-rc4
+ March 9 2.6.25-rc5
+ March 16 2.6.25-rc6
+ March 25 2.6.25-rc7
+ April 1 2.6.25-rc8
+ April 11 2.6.25-rc9
+ April 16 2.6.25 stable release
+
+How do the developers decide when to close the development cycle and create
+the stable release? The most significant metric used is the list of
+regressions from previous releases. No bugs are welcome, but those which
+break systems which worked in the past are considered to be especially
+serious. For this reason, patches which cause regressions are looked upon
+unfavorably and are quite likely to be reverted during the stabilization
+period.
+
+The developers' goal is to fix all known regressions before the stable
+release is made. In the real world, this kind of perfection is hard to
+achieve; there are just too many variables in a project of this size.
+There comes a point where delaying the final release just makes the problem
+worse; the pile of changes waiting for the next merge window will grow
+larger, creating even more regressions the next time around. So most 2.6.x
+kernels go out with a handful of known regressions though, hopefully, none
+of them are serious.
+
+Once a stable release is made, its ongoing maintenance is passed off to the
+"stable team," currently comprised of Greg Kroah-Hartman and Chris Wright.
+The stable team will release occasional updates to the stable release using
+the 2.6.x.y numbering scheme. To be considered for an update release, a
+patch must (1) fix a significant bug, and (2) already be merged into the
+mainline for the next development kernel. Continuing our 2.6.25 example,
+the history (as of this writing) is:
+
+ May 1 2.6.25.1
+ May 6 2.6.25.2
+ May 9 2.6.25.3
+ May 15 2.6.25.4
+ June 7 2.6.25.5
+ June 9 2.6.25.6
+ June 16 2.6.25.7
+ June 21 2.6.25.8
+ June 24 2.6.25.9
+
+Stable updates for a given kernel are made for approximately six months;
+after that, the maintenance of stable releases is solely the responsibility
+of the distributors which have shipped that particular kernel.
+
+
+2.2: THE LIFECYCLE OF A PATCH
+
+Patches do not go directly from the developer's keyboard into the mainline
+kernel. There is, instead, a somewhat involved (if somewhat informal)
+process designed to ensure that each patch is reviewed for quality and that
+each patch implements a change which is desirable to have in the mainline.
+This process can happen quickly for minor fixes, or, in the case of large
+and controversial changes, go on for years. Much developer frustration
+comes from a lack of understanding of this process or from attempts to
+circumvent it.
+
+In the hopes of reducing that frustration, this document will describe how
+a patch gets into the kernel. What follows below is an introduction which
+describes the process in a somewhat idealized way. A much more detailed
+treatment will come in later sections.
+
+The stages that a patch goes through are, generally:
+
+ - Design. This is where the real requirements for the patch - and the way
+ those requirements will be met - are laid out. Design work is often
+ done without involving the community, but it is better to do this work
+ in the open if at all possible; it can save a lot of time redesigning
+ things later.
+
+ - Early review. Patches are posted to the relevant mailing list, and
+ developers on that list reply with any comments they may have. This
+ process should turn up any major problems with a patch if all goes
+ well.
+
+ - Wider review. When the patch is getting close to ready for mainline
+ inclusion, it will be accepted by a relevant subsystem maintainer -
+ though this acceptance is not a guarantee that the patch will make it
+ all the way to the mainline. The patch will show up in the maintainer's
+ subsystem tree and into the staging trees (described below). When the
+ process works, this step leads to more extensive review of the patch and
+ the discovery of any problems resulting from the integration of this
+ patch with work being done by others.
+
+ - Merging into the mainline. Eventually, a successful patch will be
+ merged into the mainline repository managed by Linus Torvalds. More
+ comments and/or problems may surface at this time; it is important that
+ the developer be responsive to these and fix any issues which arise.
+
+ - Stable release. The number of users potentially affected by the patch
+ is now large, so, once again, new problems may arise.
+
+ - Long-term maintenance. While it is certainly possible for a developer
+ to forget about code after merging it, that sort of behavior tends to
+ leave a poor impression in the development community. Merging code
+ eliminates some of the maintenance burden, in that others will fix
+ problems caused by API changes. But the original developer should
+ continue to take responsibility for the code if it is to remain useful
+ in the longer term.
+
+One of the largest mistakes made by kernel developers (or their employers)
+is to try to cut the process down to a single "merging into the mainline"
+step. This approach invariably leads to frustration for everybody
+involved.
+
+
+2.3: HOW PATCHES GET INTO THE KERNEL
+
+There is exactly one person who can merge patches into the mainline kernel
+repository: Linus Torvalds. But, of the over 12,000 patches which went
+into the 2.6.25 kernel, only 250 (around 2%) were directly chosen by Linus
+himself. The kernel project has long since grown to a size where no single
+developer could possibly inspect and select every patch unassisted. The
+way the kernel developers have addressed this growth is through the use of
+a lieutenant system built around a chain of trust.
+
+The kernel code base is logically broken down into a set of subsystems:
+networking, specific architecture support, memory management, video
+devices, etc. Most subsystems have a designated maintainer, a developer
+who has overall responsibility for the code within that subsystem. These
+subsystem maintainers are the gatekeepers (in a loose way) for the portion
+of the kernel they manage; they are the ones who will (usually) accept a
+patch for inclusion into the mainline kernel.
+
+Subsystem maintainers each manage their own version of the kernel source
+tree, usually (but certainly not always) using the git source management
+tool. Tools like git (and related tools like quilt or mercurial) allow
+maintainers to track a list of patches, including authorship information
+and other metadata. At any given time, the maintainer can identify which
+patches in his or her repository are not found in the mainline.
+
+When the merge window opens, top-level maintainers will ask Linus to "pull"
+the patches they have selected for merging from their repositories. If
+Linus agrees, the stream of patches will flow up into his repository,
+becoming part of the mainline kernel. The amount of attention that Linus
+pays to specific patches received in a pull operation varies. It is clear
+that, sometimes, he looks quite closely. But, as a general rule, Linus
+trusts the subsystem maintainers to not send bad patches upstream.
+
+Subsystem maintainers, in turn, can pull patches from other maintainers.
+For example, the networking tree is built from patches which accumulated
+first in trees dedicated to network device drivers, wireless networking,
+etc. This chain of repositories can be arbitrarily long, though it rarely
+exceeds two or three links. Since each maintainer in the chain trusts
+those managing lower-level trees, this process is known as the "chain of
+trust."
+
+Clearly, in a system like this, getting patches into the kernel depends on
+finding the right maintainer. Sending patches directly to Linus is not
+normally the right way to go.
+
+
+2.4: STAGING TREES
+
+The chain of subsystem trees guides the flow of patches into the kernel,
+but it also raises an interesting question: what if somebody wants to look
+at all of the patches which are being prepared for the next merge window?
+Developers will be interested in what other changes are pending to see
+whether there are any conflicts to worry about; a patch which changes a
+core kernel function prototype, for example, will conflict with any other
+patches which use the older form of that function. Reviewers and testers
+want access to the changes in their integrated form before all of those
+changes land in the mainline kernel. One could pull changes from all of
+the interesting subsystem trees, but that would be a big and error-prone
+job.
+
+The answer comes in the form of staging trees, where subsystem trees are
+collected for testing and review. The older of these trees, maintained by
+Andrew Morton, is called "-mm" (for memory management, which is how it got
+started). The -mm tree integrates patches from a long list of subsystem
+trees; it also has some patches aimed at helping with debugging.
+
+Beyond that, -mm contains a significant collection of patches which have
+been selected by Andrew directly. These patches may have been posted on a
+mailing list, or they may apply to a part of the kernel for which there is
+no designated subsystem tree. As a result, -mm operates as a sort of
+subsystem tree of last resort; if there is no other obvious path for a
+patch into the mainline, it is likely to end up in -mm. Miscellaneous
+patches which accumulate in -mm will eventually either be forwarded on to
+an appropriate subsystem tree or be sent directly to Linus. In a typical
+development cycle, approximately 10% of the patches going into the mainline
+get there via -mm.
+
+The current -mm patch can always be found from the front page of
+
+ http://kernel.org/
+
+Those who want to see the current state of -mm can get the "-mm of the
+moment" tree, found at:
+
+ http://userweb.kernel.org/~akpm/mmotm/
+
+Use of the MMOTM tree is likely to be a frustrating experience, though;
+there is a definite chance that it will not even compile.
+
+The other staging tree, started more recently, is linux-next, maintained by
+Stephen Rothwell. The linux-next tree is, by design, a snapshot of what
+the mainline is expected to look like after the next merge window closes.
+Linux-next trees are announced on the linux-kernel and linux-next mailing
+lists when they are assembled; they can be downloaded from:
+
+ http://www.kernel.org/pub/linux/kernel/people/sfr/linux-next/
+
+Some information about linux-next has been gathered at:
+
+ http://linux.f-seidel.de/linux-next/pmwiki/
+
+How the linux-next tree will fit into the development process is still
+changing. As of this writing, the first full development cycle involving
+linux-next (2.6.26) is coming to an end; thus far, it has proved to be a
+valuable resource for finding and fixing integration problems before the
+beginning of the merge window. See http://lwn.net/Articles/287155/ for
+more information on how linux-next has worked to set up the 2.6.27 merge
+window.
+
+Some developers have begun to suggest that linux-next should be used as the
+target for future development as well. The linux-next tree does tend to be
+far ahead of the mainline and is more representative of the tree into which
+any new work will be merged. The downside to this idea is that the
+volatility of linux-next tends to make it a difficult development target.
+See http://lwn.net/Articles/289013/ for more information on this topic, and
+stay tuned; much is still in flux where linux-next is involved.
+
+
+2.5: TOOLS
+
+As can be seen from the above text, the kernel development process depends
+heavily on the ability to herd collections of patches in various
+directions. The whole thing would not work anywhere near as well as it
+does without suitably powerful tools. Tutorials on how to use these tools
+are well beyond the scope of this document, but there is space for a few
+pointers.
+
+By far the dominant source code management system used by the kernel
+community is git. Git is one of a number of distributed version control
+systems being developed in the free software community. It is well tuned
+for kernel development, in that it performs quite well when dealing with
+large repositories and large numbers of patches. It also has a reputation
+for being difficult to learn and use, though it has gotten better over
+time. Some sort of familiarity with git is almost a requirement for kernel
+developers; even if they do not use it for their own work, they'll need git
+to keep up with what other developers (and the mainline) are doing.
+
+Git is now packaged by almost all Linux distributions. There is a home
+page at
+
+ http://git.or.cz/
+
+That page has pointers to documentation and tutorials. One should be
+aware, in particular, of the Kernel Hacker's Guide to git, which has
+information specific to kernel development:
+
+ http://linux.yyz.us/git-howto.html
+
+Among the kernel developers who do not use git, the most popular choice is
+almost certainly Mercurial:
+
+ http://www.selenic.com/mercurial/
+
+Mercurial shares many features with git, but it provides an interface which
+many find easier to use.
+
+The other tool worth knowing about is Quilt:
+
+ http://savannah.nongnu.org/projects/quilt/
+
+Quilt is a patch management system, rather than a source code management
+system. It does not track history over time; it is, instead, oriented
+toward tracking a specific set of changes against an evolving code base.
+Some major subsystem maintainers use quilt to manage patches intended to go
+upstream. For the management of certain kinds of trees (-mm, for example),
+quilt is the best tool for the job.
+
+
+2.6: MAILING LISTS
+
+A great deal of Linux kernel development work is done by way of mailing
+lists. It is hard to be a fully-functioning member of the community
+without joining at least one list somewhere. But Linux mailing lists also
+represent a potential hazard to developers, who risk getting buried under a
+load of electronic mail, running afoul of the conventions used on the Linux
+lists, or both.
+
+Most kernel mailing lists are run on vger.kernel.org; the master list can
+be found at:
+
+ http://vger.kernel.org/vger-lists.html
+
+There are lists hosted elsewhere, though; a number of them are at
+lists.redhat.com.
+
+The core mailing list for kernel development is, of course, linux-kernel.
+This list is an intimidating place to be; volume can reach 500 messages per
+day, the amount of noise is high, the conversation can be severely
+technical, and participants are not always concerned with showing a high
+degree of politeness. But there is no other place where the kernel
+development community comes together as a whole; developers who avoid this
+list will miss important information.
+
+There are a few hints which can help with linux-kernel survival:
+
+- Have the list delivered to a separate folder, rather than your main
+ mailbox. One must be able to ignore the stream for sustained periods of
+ time.
+
+- Do not try to follow every conversation - nobody else does. It is
+ important to filter on both the topic of interest (though note that
+ long-running conversations can drift away from the original subject
+ without changing the email subject line) and the people who are
+ participating.
+
+- Do not feed the trolls. If somebody is trying to stir up an angry
+ response, ignore them.
+
+- When responding to linux-kernel email (or that on other lists) preserve
+ the Cc: header for all involved. In the absence of a strong reason (such
+ as an explicit request), you should never remove recipients. Always make
+ sure that the person you are responding to is in the Cc: list. This
+ convention also makes it unnecessary to explicitly ask to be copied on
+ replies to your postings.
+
+- Search the list archives (and the net as a whole) before asking
+ questions. Some developers can get impatient with people who clearly
+ have not done their homework.
+
+- Avoid top-posting (the practice of putting your answer above the quoted
+ text you are responding to). It makes your response harder to read and
+ makes a poor impression.
+
+- Ask on the correct mailing list. Linux-kernel may be the general meeting
+ point, but it is not the best place to find developers from all
+ subsystems.
+
+The last point - finding the correct mailing list - is a common place for
+beginning developers to go wrong. Somebody who asks a networking-related
+question on linux-kernel will almost certainly receive a polite suggestion
+to ask on the netdev list instead, as that is the list frequented by most
+networking developers. Other lists exist for the SCSI, video4linux, IDE,
+filesystem, etc. subsystems. The best place to look for mailing lists is
+in the MAINTAINERS file packaged with the kernel source.
+
+
+2.7: GETTING STARTED WITH KERNEL DEVELOPMENT
+
+Questions about how to get started with the kernel development process are
+common - from both individuals and companies. Equally common are missteps
+which make the beginning of the relationship harder than it has to be.
+
+Companies often look to hire well-known developers to get a development
+group started. This can, in fact, be an effective technique. But it also
+tends to be expensive and does not do much to grow the pool of experienced
+kernel developers. It is possible to bring in-house developers up to speed
+on Linux kernel development, given the investment of a bit of time. Taking
+this time can endow an employer with a group of developers who understand
+the kernel and the company both, and who can help to train others as well.
+Over the medium term, this is often the more profitable approach.
+
+Individual developers are often, understandably, at a loss for a place to
+start. Beginning with a large project can be intimidating; one often wants
+to test the waters with something smaller first. This is the point where
+some developers jump into the creation of patches fixing spelling errors or
+minor coding style issues. Unfortunately, such patches create a level of
+noise which is distracting for the development community as a whole, so,
+increasingly, they are looked down upon. New developers wishing to
+introduce themselves to the community will not get the sort of reception
+they wish for by these means.
+
+Andrew Morton gives this advice for aspiring kernel developers
+
+ The #1 project for all kernel beginners should surely be "make sure
+ that the kernel runs perfectly at all times on all machines which
+ you can lay your hands on". Usually the way to do this is to work
+ with others on getting things fixed up (this can require
+ persistence!) but that's fine - it's a part of kernel development.
+
+(http://lwn.net/Articles/283982/).
+
+In the absence of obvious problems to fix, developers are advised to look
+at the current lists of regressions and open bugs in general. There is
+never any shortage of issues in need of fixing; by addressing these issues,
+developers will gain experience with the process while, at the same time,
+building respect with the rest of the development community.
diff --git a/Documentation/development-process/3.Early-stage b/Documentation/development-process/3.Early-stage
new file mode 100644
index 0000000..307a159
--- /dev/null
+++ b/Documentation/development-process/3.Early-stage
@@ -0,0 +1,195 @@
+3: EARLY-STAGE PLANNING
+
+When contemplating a Linux kernel development project, it can be tempting
+to jump right in and start coding. As with any significant project,
+though, much of the groundwork for success is best laid before the first
+line of code is written. Some time spent in early planning and
+communication can save far more time later on.
+
+
+3.1: SPECIFYING THE PROBLEM
+
+Like any engineering project, a successful kernel enhancement starts with a
+clear description of the problem to be solved. In some cases, this step is
+easy: when a driver is needed for a specific piece of hardware, for
+example. In others, though, it is tempting to confuse the real problem
+with the proposed solution, and that can lead to difficulties.
+
+Consider an example: some years ago, developers working with Linux audio
+sought a way to run applications without dropouts or other artifacts caused
+by excessive latency in the system. The solution they arrived at was a
+kernel module intended to hook into the Linux Security Module (LSM)
+framework; this module could be configured to give specific applications
+access to the realtime scheduler. This module was implemented and sent to
+the linux-kernel mailing list, where it immediately ran into problems.
+
+To the audio developers, this security module was sufficient to solve their
+immediate problem. To the wider kernel community, though, it was seen as a
+misuse of the LSM framework (which is not intended to confer privileges
+onto processes which they would not otherwise have) and a risk to system
+stability. Their preferred solutions involved realtime scheduling access
+via the rlimit mechanism for the short term, and ongoing latency reduction
+work in the long term.
+
+The audio community, however, could not see past the particular solution
+they had implemented; they were unwilling to accept alternatives. The
+resulting disagreement left those developers feeling disillusioned with the
+entire kernel development process; one of them went back to an audio list
+and posted this:
+
+ There are a number of very good Linux kernel developers, but they
+ tend to get outshouted by a large crowd of arrogant fools. Trying
+ to communicate user requirements to these people is a waste of
+ time. They are much too "intelligent" to listen to lesser mortals.
+
+(http://lwn.net/Articles/131776/).
+
+The reality of the situation was different; the kernel developers were far
+more concerned about system stability, long-term maintenance, and finding
+the right solution to the problem than they were with a specific module.
+The moral of the story is to focus on the problem - not a specific solution
+- and to discuss it with the development community before investing in the
+creation of a body of code.
+
+So, when contemplating a kernel development project, one should obtain
+answers to a short set of questions:
+
+ - What, exactly, is the problem which needs to be solved?
+
+ - Who are the users affected by this problem? Which use cases should the
+ solution address?
+
+ - How does the kernel fall short in addressing that problem now?
+
+Only then does it make sense to start considering possible solutions.
+
+
+3.2: EARLY DISCUSSION
+
+When planning a kernel development project, it makes great sense to hold
+discussions with the community before launching into implementation. Early
+communication can save time and trouble in a number of ways:
+
+ - It may well be that the problem is addressed by the kernel in ways which
+ you have not understood. The Linux kernel is large and has a number of
+ features and capabilities which are not immediately obvious. Not all
+ kernel capabilities are documented as well as one might like, and it is
+ easy to miss things. Your author has seen the posting of a complete
+ driver which duplicated an existing driver that the new author had been
+ unaware of. Code which reinvents existing wheels is not only wasteful;
+ it will also not be accepted into the mainline kernel.
+
+ - There may be elements of the proposed solution which will not be
+ acceptable for mainline merging. It is better to find out about
+ problems like this before writing the code.
+
+ - It's entirely possible that other developers have thought about the
+ problem; they may have ideas for a better solution, and may be willing
+ to help in the creation of that solution.
+
+Years of experience with the kernel development community have taught a
+clear lesson: kernel code which is designed and developed behind closed
+doors invariably has problems which are only revealed when the code is
+released into the community. Sometimes these problems are severe,
+requiring months or years of effort before the code can be brought up to
+the kernel community's standards. Some examples include:
+
+ - The Devicescape network stack was designed and implemented for
+ single-processor systems. It could not be merged into the mainline
+ until it was made suitable for multiprocessor systems. Retrofitting
+ locking and such into code is a difficult task; as a result, the merging
+ of this code (now called mac80211) was delayed for over a year.
+
+ - The Reiser4 filesystem included a number of capabilities which, in the
+ core kernel developers' opinion, should have been implemented in the
+ virtual filesystem layer instead. It also included features which could
+ not easily be implemented without exposing the system to user-caused
+ deadlocks. The late revelation of these problems - and refusal to
+ address some of them - has caused Reiser4 to stay out of the mainline
+ kernel.
+
+ - The AppArmor security module made use of internal virtual filesystem
+ data structures in ways which were considered to be unsafe and
+ unreliable. This code has since been significantly reworked, but
+ remains outside of the mainline.
+
+In each of these cases, a great deal of pain and extra work could have been
+avoided with some early discussion with the kernel developers.
+
+
+3.3: WHO DO YOU TALK TO?
+
+When developers decide to take their plans public, the next question will
+be: where do we start? The answer is to find the right mailing list(s) and
+the right maintainer. For mailing lists, the best approach is to look in
+the MAINTAINERS file for a relevant place to post. If there is a suitable
+subsystem list, posting there is often preferable to posting on
+linux-kernel; you are more likely to reach developers with expertise in the
+relevant subsystem and the environment may be more supportive.
+
+Finding maintainers can be a bit harder. Again, the MAINTAINERS file is
+the place to start. That file tends to not always be up to date, though,
+and not all subsystems are represented there. The person listed in the
+MAINTAINERS file may, in fact, not be the person who is actually acting in
+that role currently. So, when there is doubt about who to contact, a
+useful trick is to use git (and "git log" in particular) to see who is
+currently active within the subsystem of interest. Look at who is writing
+patches, and who, if anybody, is attaching Signed-off-by lines to those
+patches. Those are the people who will be best placed to help with a new
+development project.
+
+If all else fails, talking to Andrew Morton can be an effective way to
+track down a maintainer for a specific piece of code.
+
+
+3.4: WHEN TO POST?
+
+If possible, posting your plans during the early stages can only be
+helpful. Describe the problem being solved and any plans that have been
+made on how the implementation will be done. Any information you can
+provide can help the development community provide useful input on the
+project.
+
+One discouraging thing which can happen at this stage is not a hostile
+reaction, but, instead, little or no reaction at all. The sad truth of the
+matter is (1) kernel developers tend to be busy, (2) there is no shortage
+of people with grand plans and little code (or even prospect of code) to
+back them up, and (3) nobody is obligated to review or comment on ideas
+posted by others. If a request-for-comments posting yields little in the
+way of comments, do not assume that it means there is no interest in the
+project. Unfortunately, you also cannot assume that there are no problems
+with your idea. The best thing to do in this situation is to proceed,
+keeping the community informed as you go.
+
+
+3.5: GETTING OFFICIAL BUY-IN
+
+If your work is being done in a corporate environment - as most Linux
+kernel work is - you must, obviously, have permission from suitably
+empowered managers before you can post your company's plans or code to a
+public mailing list. The posting of code which has not been cleared for
+release under a GPL-compatible license can be especially problematic; the
+sooner that a company's management and legal staff can agree on the posting
+of a kernel development project, the better off everybody involved will be.
+
+Some readers may be thinking at this point that their kernel work is
+intended to support a product which does not yet have an officially
+acknowledged existence. Revealing their employer's plans on a public
+mailing list may not be a viable option. In cases like this, it is worth
+considering whether the secrecy is really necessary; there is often no real
+need to keep development plans behind closed doors.
+
+That said, there are also cases where a company legitimately cannot
+disclose its plans early in the development process. Companies with
+experienced kernel developers may choose to proceed in an open-loop manner
+on the assumption that they will be able to avoid serious integration
+problems later. For companies without that sort of in-house expertise, the
+best option is often to hire an outside developer to review the plans under
+a non-disclosure agreement. The Linux Foundation operates an NDA program
+designed to help with this sort of situation; more information can be found
+at:
+
+ http://www.linuxfoundation.org/en/NDA_program
+
+This kind of review is often enough to avoid serious problems later on
+without requiring public disclosure of the project.
diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding
new file mode 100644
index 0000000..014aca8
--- /dev/null
+++ b/Documentation/development-process/4.Coding
@@ -0,0 +1,384 @@
+4: GETTING THE CODE RIGHT
+
+While there is much to be said for a solid and community-oriented design
+process, the proof of any kernel development project is in the resulting
+code. It is the code which will be examined by other developers and merged
+(or not) into the mainline tree. So it is the quality of this code which
+will determine the ultimate success of the project.
+
+This section will examine the coding process. We'll start with a look at a
+number of ways in which kernel developers can go wrong. Then the focus
+will shift toward doing things right and the tools which can help in that
+quest.
+
+
+4.1: PITFALLS
+
+* Coding style
+
+The kernel has long had a standard coding style, described in
+Documentation/CodingStyle. For much of that time, the policies described
+in that file were taken as being, at most, advisory. As a result, there is
+a substantial amount of code in the kernel which does not meet the coding
+style guidelines. The presence of that code leads to two independent
+hazards for kernel developers.
+
+The first of these is to believe that the kernel coding standards do not
+matter and are not enforced. The truth of the matter is that adding new
+code to the kernel is very difficult if that code is not coded according to
+the standard; many developers will request that the code be reformatted
+before they will even review it. A code base as large as the kernel
+requires some uniformity of code to make it possible for developers to
+quickly understand any part of it. So there is no longer room for
+strangely-formatted code.
+
+Occasionally, the kernel's coding style will run into conflict with an
+employer's mandated style. In such cases, the kernel's style will have to
+win before the code can be merged. Putting code into the kernel means
+giving up a degree of control in a number of ways - including control over
+how the code is formatted.
+
+The other trap is to assume that code which is already in the kernel is
+urgently in need of coding style fixes. Developers may start to generate
+reformatting patches as a way of gaining familiarity with the process, or
+as a way of getting their name into the kernel changelogs - or both. But
+pure coding style fixes are seen as noise by the development community;
+they tend to get a chilly reception. So this type of patch is best
+avoided. It is natural to fix the style of a piece of code while working
+on it for other reasons, but coding style changes should not be made for
+their own sake.
+
+The coding style document also should not be read as an absolute law which
+can never be transgressed. If there is a good reason to go against the
+style (a line which becomes far less readable if split to fit within the
+80-column limit, for example), just do it.
+
+
+* Abstraction layers
+
+Computer Science professors teach students to make extensive use of
+abstraction layers in the name of flexibility and information hiding.
+Certainly the kernel makes extensive use of abstraction; no project
+involving several million lines of code could do otherwise and survive.
+But experience has shown that excessive or premature abstraction can be
+just as harmful as premature optimization. Abstraction should be used to
+the level required and no further.
+
+At a simple level, consider a function which has an argument which is
+always passed as zero by all callers. One could retain that argument just
+in case somebody eventually needs to use the extra flexibility that it
+provides. By that time, though, chances are good that the code which
+implements this extra argument has been broken in some subtle way which was
+never noticed - because it has never been used. Or, when the need for
+extra flexibility arises, it does not do so in a way which matches the
+programmer's early expectation. Kernel developers will routinely submit
+patches to remove unused arguments; they should, in general, not be added
+in the first place.
+
+Abstraction layers which hide access to hardware - often to allow the bulk
+of a driver to be used with multiple operating systems - are especially
+frowned upon. Such layers obscure the code and may impose a performance
+penalty; they do not belong in the Linux kernel.
+
+On the other hand, if you find yourself copying significant amounts of code
+from another kernel subsystem, it is time to ask whether it would, in fact,
+make sense to pull out some of that code into a separate library or to
+implement that functionality at a higher level. There is no value in
+replicating the same code throughout the kernel.
+
+
+* #ifdef and preprocessor use in general
+
+The C preprocessor seems to present a powerful temptation to some C
+programmers, who see it as a way to efficiently encode a great deal of
+flexibility into a source file. But the preprocessor is not C, and heavy
+use of it results in code which is much harder for others to read and
+harder for the compiler to check for correctness. Heavy preprocessor use
+is almost always a sign of code which needs some cleanup work.
+
+Conditional compilation with #ifdef is, indeed, a powerful feature, and it
+is used within the kernel. But there is little desire to see code which is
+sprinkled liberally with #ifdef blocks. As a general rule, #ifdef use
+should be confined to header files whenever possible.
+Conditionally-compiled code can be confined to functions which, if the code
+is not to be present, simply become empty. The compiler will then quietly
+optimize out the call to the empty function. The result is far cleaner
+code which is easier to follow.
+
+C preprocessor macros present a number of hazards, including possible
+multiple evaluation of expressions with side effects and no type safety.
+If you are tempted to define a macro, consider creating an inline function
+instead. The code which results will be the same, but inline functions are
+easier to read, do not evaluate their arguments multiple times, and allow
+the compiler to perform type checking on the arguments and return value.
+
+
+* Inline functions
+
+Inline functions present a hazard of their own, though. Programmers can
+become enamored of the perceived efficiency inherent in avoiding a function
+call and fill a source file with inline functions. Those functions,
+however, can actually reduce performance. Since their code is replicated
+at each call site, they end up bloating the size of the compiled kernel.
+That, in turn, creates pressure on the processor's memory caches, which can
+slow execution dramatically. Inline functions, as a rule, should be quite
+small and relatively rare. The cost of a function call, after all, is not
+that high; the creation of large numbers of inline functions is a classic
+example of premature optimization.
+
+In general, kernel programmers ignore cache effects at their peril. The
+classic time/space tradeoff taught in beginning data structures classes
+often does not apply to contemporary hardware. Space *is* time, in that a
+larger program will run slower than one which is more compact.
+
+
+* Locking
+
+In May, 2006, the "Devicescape" networking stack was, with great
+fanfare, released under the GPL and made available for inclusion in the
+mainline kernel. This donation was welcome news; support for wireless
+networking in Linux was considered substandard at best, and the Devicescape
+stack offered the promise of fixing that situation. Yet, this code did not
+actually make it into the mainline until June, 2007 (2.6.22). What
+happened?
+
+This code showed a number of signs of having been developed behind
+corporate doors. But one large problem in particular was that it was not
+designed to work on multiprocessor systems. Before this networking stack
+(now called mac80211) could be merged, a locking scheme needed to be
+retrofitted onto it.
+
+Once upon a time, Linux kernel code could be developed without thinking
+about the concurrency issues presented by multiprocessor systems. Now,
+however, this document is being written on a dual-core laptop. Even on
+single-processor systems, work being done to improve responsiveness will
+raise the level of concurrency within the kernel. The days when kernel
+code could be written without thinking about locking are long past.
+
+Any resource (data structures, hardware registers, etc.) which could be
+accessed concurrently by more than one thread must be protected by a lock.
+New code should be written with this requirement in mind; retrofitting
+locking after the fact is a rather more difficult task. Kernel developers
+should take the time to understand the available locking primitives well
+enough to pick the right tool for the job. Code which shows a lack of
+attention to concurrency will have a difficult path into the mainline.
+
+
+* Regressions
+
+One final hazard worth mentioning is this: it can be tempting to make a
+change (which may bring big improvements) which causes something to break
+for existing users. This kind of change is called a "regression," and
+regressions have become most unwelcome in the mainline kernel. With few
+exceptions, changes which cause regressions will be backed out if the
+regression cannot be fixed in a timely manner. Far better to avoid the
+regression in the first place.
+
+It is often argued that a regression can be justified if it causes things
+to work for more people than it creates problems for. Why not make a
+change if it brings new functionality to ten systems for each one it
+breaks? The best answer to this question was expressed by Linus in July,
+2007:
+
+ So we don't fix bugs by introducing new problems. That way lies
+ madness, and nobody ever knows if you actually make any real
+ progress at all. Is it two steps forwards, one step back, or one
+ step forward and two steps back?
+
+(http://lwn.net/Articles/243460/).
+
+An especially unwelcome type of regression is any sort of change to the
+user-space ABI. Once an interface has been exported to user space, it must
+be supported indefinitely. This fact makes the creation of user-space
+interfaces particularly challenging: since they cannot be changed in
+incompatible ways, they must be done right the first time. For this
+reason, a great deal of thought, clear documentation, and wide review for
+user-space interfaces is always required.
+
+
+
+4.2: CODE CHECKING TOOLS
+
+For now, at least, the writing of error-free code remains an ideal that few
+of us can reach. What we can hope to do, though, is to catch and fix as
+many of those errors as possible before our code goes into the mainline
+kernel. To that end, the kernel developers have put together an impressive
+array of tools which can catch a wide variety of obscure problems in an
+automated way. Any problem caught by the computer is a problem which will
+not afflict a user later on, so it stands to reason that the automated
+tools should be used whenever possible.
+
+The first step is simply to heed the warnings produced by the compiler.
+Contemporary versions of gcc can detect (and warn about) a large number of
+potential errors. Quite often, these warnings point to real problems.
+Code submitted for review should, as a rule, not produce any compiler
+warnings. When silencing warnings, take care to understand the real cause
+and try to avoid "fixes" which make the warning go away without addressing
+its cause.
+
+Note that not all compiler warnings are enabled by default. Build the
+kernel with "make EXTRA_CFLAGS=-W" to get the full set.
+
+The kernel provides several configuration options which turn on debugging
+features; most of these are found in the "kernel hacking" submenu. Several
+of these options should be turned on for any kernel used for development or
+testing purposes. In particular, you should turn on:
+
+ - ENABLE_WARN_DEPRECATED, ENABLE_MUST_CHECK, and FRAME_WARN to get an
+ extra set of warnings for problems like the use of deprecated interfaces
+ or ignoring an important return value from a function. The output
+ generated by these warnings can be verbose, but one need not worry about
+ warnings from other parts of the kernel.
+
+ - DEBUG_OBJECTS will add code to track the lifetime of various objects
+ created by the kernel and warn when things are done out of order. If
+ you are adding a subsystem which creates (and exports) complex objects
+ of its own, consider adding support for the object debugging
+ infrastructure.
+
+ - DEBUG_SLAB can find a variety of memory allocation and use errors; it
+ should be used on most development kernels.
+
+ - DEBUG_SPINLOCK, DEBUG_SPINLOCK_SLEEP, and DEBUG_MUTEXES will find a
+ number of common locking errors.
+
+There are quite a few other debugging options, some of which will be
+discussed below. Some of them have a significant performance impact and
+should not be used all of the time. But some time spent learning the
+available options will likely be paid back many times over in short order.
+
+One of the heavier debugging tools is the locking checker, or "lockdep."
+This tool will track the acquisition and release of every lock (spinlock or
+mutex) in the system, the order in which locks are acquired relative to
+each other, the current interrupt environment, and more. It can then
+ensure that locks are always acquired in the same order, that the same
+interrupt assumptions apply in all situations, and so on. In other words,
+lockdep can find a number of scenarios in which the system could, on rare
+occasion, deadlock. This kind of problem can be painful (for both
+developers and users) in a deployed system; lockdep allows them to be found
+in an automated manner ahead of time. Code with any sort of non-trivial
+locking should be run with lockdep enabled before being submitted for
+inclusion.
+
+As a diligent kernel programmer, you will, beyond doubt, check the return
+status of any operation (such as a memory allocation) which can fail. The
+fact of the matter, though, is that the resulting failure recovery paths
+are, probably, completely untested. Untested code tends to be broken code;
+you could be much more confident of your code if all those error-handling
+paths had been exercised a few times.
+
+The kernel provides a fault injection framework which can do exactly that,
+especially where memory allocations are involved. With fault injection
+enabled, a configurable percentage of memory allocations will be made to
+fail; these failures can be restricted to a specific range of code.
+Running with fault injection enabled allows the programmer to see how the
+code responds when things go badly. See
+Documentation/fault-injection/fault-injection.text for more information on
+how to use this facility.
+
+Other kinds of errors can be found with the "sparse" static analysis tool.
+With sparse, the programmer can be warned about confusion between
+user-space and kernel-space addresses, mixture of big-endian and
+small-endian quantities, the passing of integer values where a set of bit
+flags is expected, and so on. Sparse must be installed separately (it can
+be found at http://www.kernel.org/pub/software/devel/sparse/ if your
+distributor does not package it); it can then be run on the code by adding
+"C=1" to your make command.
+
+Other kinds of portability errors are best found by compiling your code for
+other architectures. If you do not happen to have an S/390 system or a
+Blackfin development board handy, you can still perform the compilation
+step. A large set of cross compilers for x86 systems can be found at
+
+ http://www.kernel.org/pub/tools/crosstool/
+
+Some time spent installing and using these compilers will help avoid
+embarrassment later.
+
+
+4.3: DOCUMENTATION
+
+Documentation has often been more the exception than the rule with kernel
+development. Even so, adequate documentation will help to ease the merging
+of new code into the kernel, make life easier for other developers, and
+will be helpful for your users. In many cases, the addition of
+documentation has become essentially mandatory.
+
+The first piece of documentation for any patch is its associated
+changelog. Log entries should describe the problem being solved, the form
+of the solution, the people who worked on the patch, any relevant
+effects on performance, and anything else that might be needed to
+understand the patch.
+
+Any code which adds a new user-space interface - including new sysfs or
+/proc files - should include documentation of that interface which enables
+user-space developers to know what they are working with. See
+Documentation/ABI/README for a description of how this documentation should
+be formatted and what information needs to be provided.
+
+The file Documentation/kernel-parameters.txt describes all of the kernel's
+boot-time parameters. Any patch which adds new parameters should add the
+appropriate entries to this file.
+
+Any new configuration options must be accompanied by help text which
+clearly explains the options and when the user might want to select them.
+
+Internal API information for many subsystems is documented by way of
+specially-formatted comments; these comments can be extracted and formatted
+in a number of ways by the "kernel-doc" script. If you are working within
+a subsystem which has kerneldoc comments, you should maintain them and add
+them, as appropriate, for externally-available functions. Even in areas
+which have not been so documented, there is no harm in adding kerneldoc
+comments for the future; indeed, this can be a useful activity for
+beginning kernel developers. The format of these comments, along with some
+information on how to create kerneldoc templates can be found in the file
+Documentation/kernel-doc-nano-HOWTO.txt.
+
+Anybody who reads through a significant amount of existing kernel code will
+note that, often, comments are most notable by their absence. Once again,
+the expectations for new code are higher than they were in the past;
+merging uncommented code will be harder. That said, there is little desire
+for verbosely-commented code. The code should, itself, be readable, with
+comments explaining the more subtle aspects.
+
+Certain things should always be commented. Uses of memory barriers should
+be accompanied by a line explaining why the barrier is necessary. The
+locking rules for data structures generally need to be explained somewhere.
+Major data structures need comprehensive documentation in general.
+Non-obvious dependencies between separate bits of code should be pointed
+out. Anything which might tempt a code janitor to make an incorrect
+"cleanup" needs a comment saying why it is done the way it is. And so on.
+
+
+4.4: INTERNAL API CHANGES
+
+The binary interface provided by the kernel to user space cannot be broken
+except under the most severe circumstances. The kernel's internal
+programming interfaces, instead, are highly fluid and can be changed when
+the need arises. If you find yourself having to work around a kernel API,
+or simply not using a specific functionality because it does not meet your
+needs, that may be a sign that the API needs to change. As a kernel
+developer, you are empowered to make such changes.
+
+There are, of course, some catches. API changes can be made, but they need
+to be well justified. So any patch making an internal API change should be
+accompanied by a description of what the change is and why it is
+necessary. This kind of change should also be broken out into a separate
+patch, rather than buried within a larger patch.
+
+The other catch is that a developer who changes an internal API is
+generally charged with the task of fixing any code within the kernel tree
+which is broken by the change. For a widely-used function, this duty can
+lead to literally hundreds or thousands of changes - many of which are
+likely to conflict with work being done by other developers. Needless to
+say, this can be a large job, so it is best to be sure that the
+justification is solid.
+
+When making an incompatible API change, one should, whenever possible,
+ensure that code which has not been updated is caught by the compiler.
+This will help you to be sure that you have found all in-tree uses of that
+interface. It will also alert developers of out-of-tree code that there is
+a change that they need to respond to. Supporting out-of-tree code is not
+something that kernel developers need to be worried about, but we also do
+not have to make life harder for out-of-tree developers than it it needs to
+be.
diff --git a/Documentation/development-process/5.Posting b/Documentation/development-process/5.Posting
new file mode 100644
index 0000000..dd48132
--- /dev/null
+++ b/Documentation/development-process/5.Posting
@@ -0,0 +1,278 @@
+5: POSTING PATCHES
+
+Sooner or later, the time comes when your work is ready to be presented to
+the community for review and, eventually, inclusion into the mainline
+kernel. Unsurprisingly, the kernel development community has evolved a set
+of conventions and procedures which are used in the posting of patches;
+following them will make life much easier for everybody involved. This
+document will attempt to cover these expectations in reasonable detail;
+more information can also be found in the files SubmittingPatches,
+SubmittingDrivers, and SubmitChecklist in the kernel documentation
+directory.
+
+
+5.1: WHEN TO POST
+
+There is a constant temptation to avoid posting patches before they are
+completely "ready." For simple patches, that is not a problem. If the
+work being done is complex, though, there is a lot to be gained by getting
+feedback from the community before the work is complete. So you should
+consider posting in-progress work, or even making a git tree available so
+that interested developers can catch up with your work at any time.
+
+When posting code which is not yet considered ready for inclusion, it is a
+good idea to say so in the posting itself. Also mention any major work
+which remains to be done and any known problems. Fewer people will look at
+patches which are known to be half-baked, but those who do will come in
+with the idea that they can help you drive the work in the right direction.
+
+
+5.2: BEFORE CREATING PATCHES
+
+There are a number of things which should be done before you consider
+sending patches to the development community. These include:
+
+ - Test the code to the extent that you can. Make use of the kernel's
+ debugging tools, ensure that the kernel will build with all reasonable
+ combinations of configuration options, use cross-compilers to build for
+ different architectures, etc.
+
+ - Make sure your code is compliant with the kernel coding style
+ guidelines.
+
+ - Does your change have performance implications? If so, you should run
+ benchmarks showing what the impact (or benefit) of your change is; a
+ summary of the results should be included with the patch.
+
+ - Be sure that you have the right to post the code. If this work was done
+ for an employer, the employer likely has a right to the work and must be
+ agreeable with its release under the GPL.
+
+As a general rule, putting in some extra thought before posting code almost
+always pays back the effort in short order.
+
+
+5.3: PATCH PREPARATION
+
+The preparation of patches for posting can be a surprising amount of work,
+but, once again, attempting to save time here is not generally advisable
+even in the short term.
+
+Patches must be prepared against a specific version of the kernel. As a
+general rule, a patch should be based on the current mainline as found in
+Linus's git tree. It may become necessary to make versions against -mm,
+linux-next, or a subsystem tree, though, to facilitate wider testing and
+review. Depending on the area of your patch and what is going on
+elsewhere, basing a patch against these other trees can require a
+significant amount of work resolving conflicts and dealing with API
+changes.
+
+Only the most simple changes should be formatted as a single patch;
+everything else should be made as a logical series of changes. Splitting
+up patches is a bit of an art; some developers spend a long time figuring
+out how to do it in the way that the community expects. There are a few
+rules of thumb, however, which can help considerably:
+
+ - The patch series you post will almost certainly not be the series of
+ changes found in your working revision control system. Instead, the
+ changes you have made need to be considered in their final form, then
+ split apart in ways which make sense. The developers are interested in
+ discrete, self-contained changes, not the path you took to get to those
+ changes.
+
+ - Each logically independent change should be formatted as a separate
+ patch. These changes can be small ("add a field to this structure") or
+ large (adding a significant new driver, for example), but they should be
+ conceptually small and amenable to a one-line description. Each patch
+ should make a specific change which can be reviewed on its own and
+ verified to do what it says it does.
+
+ - As a way of restating the guideline above: do not mix different types of
+ changes in the same patch. If a single patch fixes a critical security
+ bug, rearranges a few structures, and reformats the code, there is a
+ good chance that it will be passed over and the important fix will be
+ lost.
+
+ - Each patch should yield a kernel which builds and runs properly; if your
+ patch series is interrupted in the middle, the result should still be a
+ working kernel. Partial application of a patch series is a common
+ scenario when the "git bisect" tool is used to find regressions; if the
+ result is a broken kernel, you will make life harder for developers and
+ users who are engaging in the noble work of tracking down problems.
+
+ - Do not overdo it, though. One developer recently posted a set of edits
+ to a single file as 500 separate patches - an act which did not make him
+ the most popular person on the kernel mailing list. A single patch can
+ be reasonably large as long as it still contains a single *logical*
+ change.
+
+ - It can be tempting to add a whole new infrastructure with a series of
+ patches, but to leave that infrastructure unused until the final patch
+ in the series enables the whole thing. This temptation should be
+ avoided if possible; if that series adds regressions, bisection will
+ finger the last patch as the one which caused the problem, even though
+ the real bug is elsewhere. Whenever possible, a patch which adds new
+ code should make that code active immediately.
+
+Working to create the perfect patch series can be a frustrating process
+which takes quite a bit of time and thought after the "real work" has been
+done. When done properly, though, it is time well spent.
+
+
+5.4: PATCH FORMATTING
+
+So now you have a perfect series of patches for posting, but the work is
+not done quite yet. Each patch needs to be formatted into a message which
+quickly and clearly communicates its purpose to the rest of the world. To
+that end, each patch will be composed of the following:
+
+ - An optional "From" line naming the author of the patch. This line is
+ only necessary if you are passing on somebody else's patch via email,
+ but it never hurts to add it when in doubt.
+
+ - A one-line description of what the patch does. This message should be
+ enough for a reader who sees it with no other context to figure out the
+ scope of the patch; it is the line that will show up in the "short form"
+ changelogs. This message is usually formatted with the relevant
+ subsystem name first, followed by the purpose of the patch. For
+ example:
+
+ gpio: fix build on CONFIG_GPIO_SYSFS=n
+
+ - A blank line followed by a detailed description of the contents of the
+ patch. This description can be as long as is required; it should say
+ what the patch does and why it should be applied to the kernel.
+
+ - One or more tag lines, with, at a minimum, one Signed-off-by: line from
+ the author of the patch. Tags will be described in more detail below.
+
+The above three items should, normally, be the text used when committing
+the change to a revision control system. They are followed by:
+
+ - The patch itself, in the unified ("-u") patch format. Using the "-p"
+ option to diff will associate function names with changes, making the
+ resulting patch easier for others to read.
+
+You should avoid including changes to irrelevant files (those generated by
+the build process, for example, or editor backup files) in the patch. The
+file "dontdiff" in the Documentation directory can help in this regard;
+pass it to diff with the "-X" option.
+
+The tags mentioned above are used to describe how various developers have
+been associated with the development of this patch. They are described in
+detail in the SubmittingPatches document; what follows here is a brief
+summary. Each of these lines has the format:
+
+ tag: Full Name <email address> optional-other-stuff
+
+The tags in common use are:
+
+ - Signed-off-by: this is a developer's certification that he or she has
+ the right to submit the patch for inclusion into the kernel. It is an
+ agreement to the Developer's Certificate of Origin, the full text of
+ which can be found in Documentation/SubmittingPatches. Code without a
+ proper signoff cannot be merged into the mainline.
+
+ - Acked-by: indicates an agreement by another developer (often a
+ maintainer of the relevant code) that the patch is appropriate for
+ inclusion into the kernel.
+
+ - Tested-by: states that the named person has tested the patch and found
+ it to work.
+
+ - Reviewed-by: the named developer has reviewed the patch for correctness;
+ see the reviewer's statement in Documentation/SubmittingPatches for more
+ detail.
+
+ - Reported-by: names a user who reported a problem which is fixed by this
+ patch; this tag is used to give credit to the (often underappreciated)
+ people who test our code and let us know when things do not work
+ correctly.
+
+ - Cc: the named person received a copy of the patch and had the
+ opportunity to comment on it.
+
+Be careful in the addition of tags to your patches: only Cc: is appropriate
+for addition without the explicit permission of the person named.
+
+
+5.5: SENDING THE PATCH
+
+Before you mail your patches, there are a couple of other things you should
+take care of:
+
+ - Are you sure that your mailer will not corrupt the patches? Patches
+ which have had gratuitous white-space changes or line wrapping performed
+ by the mail client will not apply at the other end, and often will not
+ be examined in any detail. If there is any doubt at all, mail the patch
+ to yourself and convince yourself that it shows up intact.
+
+ Documentation/email-clients.txt has some helpful hints on making
+ specific mail clients work for sending patches.
+
+ - Are you sure your patch is free of silly mistakes? You should always
+ run patches through scripts/checkpatch.pl and address the complaints it
+ comes up with. Please bear in mind that checkpatch.pl, while being the
+ embodiment of a fair amount of thought about what kernel patches should
+ look like, is not smarter than you. If fixing a checkpatch.pl complaint
+ would make the code worse, don't do it.
+
+Patches should always be sent as plain text. Please do not send them as
+attachments; that makes it much harder for reviewers to quote sections of
+the patch in their replies. Instead, just put the patch directly into your
+message.
+
+When mailing patches, it is important to send copies to anybody who might
+be interested in it. Unlike some other projects, the kernel encourages
+people to err on the side of sending too many copies; don't assume that the
+relevant people will see your posting on the mailing lists. In particular,
+copies should go to:
+
+ - The maintainer(s) of the affected subsystem(s). As described earlier,
+ the MAINTAINERS file is the first place to look for these people.
+
+ - Other developers who have been working in the same area - especially
+ those who might be working there now. Using git to see who else has
+ modified the files you are working on can be helpful.
+
+ - If you are responding to a bug report or a feature request, copy the
+ original poster as well.
+
+ - Send a copy to the relevant mailing list, or, if nothing else applies,
+ the linux-kernel list.
+
+ - If you are fixing a bug, think about whether the fix should go into the
+ next stable update. If so, stable@kernel.org should get a copy of the
+ patch. Also add a "Cc: stable@kernel.org" to the tags within the patch
+ itself; that will cause the stable team to get a notification when your
+ fix goes into the mainline.
+
+When selecting recipients for a patch, it is good to have an idea of who
+you think will eventually accept the patch and get it merged. While it
+is possible to send patches directly to Linus Torvalds and have him merge
+them, things are not normally done that way. Linus is busy, and there are
+subsystem maintainers who watch over specific parts of the kernel. Usually
+you will be wanting that maintainer to merge your patches. If there is no
+obvious maintainer, Andrew Morton is often the patch target of last resort.
+
+Patches need good subject lines. The canonical format for a patch line is
+something like:
+
+ [PATCH nn/mm] subsys: one-line description of the patch
+
+where "nn" is the ordinal number of the patch, "mm" is the total number of
+patches in the series, and "subsys" is the name of the affected subsystem.
+Clearly, nn/mm can be omitted for a single, standalone patch.
+
+If you have a significant series of patches, it is customary to send an
+introductory description as part zero. This convention is not universally
+followed though; if you use it, remember that information in the
+introduction does not make it into the kernel changelogs. So please ensure
+that the patches, themselves, have complete changelog information.
+
+In general, the second and following parts of a multi-part patch should be
+sent as a reply to the first part so that they all thread together at the
+receiving end. Tools like git and quilt have commands to mail out a set of
+patches with the proper threading. If you have a long series, though, and
+are using git, please provide the --no-chain-reply-to option to avoid
+creating exceptionally deep nesting.
diff --git a/Documentation/development-process/6.Followthrough b/Documentation/development-process/6.Followthrough
new file mode 100644
index 0000000..a8fba3d
--- /dev/null
+++ b/Documentation/development-process/6.Followthrough
@@ -0,0 +1,202 @@
+6: FOLLOWTHROUGH
+
+At this point, you have followed the guidelines given so far and, with the
+addition of your own engineering skills, have posted a perfect series of
+patches. One of the biggest mistakes that even experienced kernel
+developers can make is to conclude that their work is now done. In truth,
+posting patches indicates a transition into the next stage of the process,
+with, possibly, quite a bit of work yet to be done.
+
+It is a rare patch which is so good at its first posting that there is no
+room for improvement. The kernel development process recognizes this fact,
+and, as a result, is heavily oriented toward the improvement of posted
+code. You, as the author of that code, will be expected to work with the
+kernel community to ensure that your code is up to the kernel's quality
+standards. A failure to participate in this process is quite likely to
+prevent the inclusion of your patches into the mainline.
+
+
+6.1: WORKING WITH REVIEWERS
+
+A patch of any significance will result in a number of comments from other
+developers as they review the code. Working with reviewers can be, for
+many developers, the most intimidating part of the kernel development
+process. Life can be made much easier, though, if you keep a few things in
+mind:
+
+ - If you have explained your patch well, reviewers will understand its
+ value and why you went to the trouble of writing it. But that value
+ will not keep them from asking a fundamental question: what will it be
+ like to maintain a kernel with this code in it five or ten years later?
+ Many of the changes you may be asked to make - from coding style tweaks
+ to substantial rewrites - come from the understanding that Linux will
+ still be around and under development a decade from now.
+
+ - Code review is hard work, and it is a relatively thankless occupation;
+ people remember who wrote kernel code, but there is little lasting fame
+ for those who reviewed it. So reviewers can get grumpy, especially when
+ they see the same mistakes being made over and over again. If you get a
+ review which seems angry, insulting, or outright offensive, resist the
+ impulse to respond in kind. Code review is about the code, not about
+ the people, and code reviewers are not attacking you personally.
+
+ - Similarly, code reviewers are not trying to promote their employers'
+ agendas at the expense of your own. Kernel developers often expect to
+ be working on the kernel years from now, but they understand that their
+ employer could change. They truly are, almost without exception,
+ working toward the creation of the best kernel they can; they are not
+ trying to create discomfort for their employers' competitors.
+
+What all of this comes down to is that, when reviewers send you comments,
+you need to pay attention to the technical observations that they are
+making. Do not let their form of expression or your own pride keep that
+from happening. When you get review comments on a patch, take the time to
+understand what the reviewer is trying to say. If possible, fix the things
+that the reviewer is asking you to fix. And respond back to the reviewer:
+thank them, and describe how you will answer their questions.
+
+Note that you do not have to agree with every change suggested by
+reviewers. If you believe that the reviewer has misunderstood your code,
+explain what is really going on. If you have a technical objection to a
+suggested change, describe it and justify your solution to the problem. If
+your explanations make sense, the reviewer will accept them. Should your
+explanation not prove persuasive, though, especially if others start to
+agree with the reviewer, take some time to think things over again. It can
+be easy to become blinded by your own solution to a problem to the point
+that you don't realize that something is fundamentally wrong or, perhaps,
+you're not even solving the right problem.
+
+One fatal mistake is to ignore review comments in the hope that they will
+go away. They will not go away. If you repost code without having
+responded to the comments you got the time before, you're likely to find
+that your patches go nowhere.
+
+Speaking of reposting code: please bear in mind that reviewers are not
+going to remember all the details of the code you posted the last time
+around. So it is always a good idea to remind reviewers of previously
+raised issues and how you dealt with them; the patch changelog is a good
+place for this kind of information. Reviewers should not have to search
+through list archives to familiarize themselves with what was said last
+time; if you help them get a running start, they will be in a better mood
+when they revisit your code.
+
+What if you've tried to do everything right and things still aren't going
+anywhere? Most technical disagreements can be resolved through discussion,
+but there are times when somebody simply has to make a decision. If you
+honestly believe that this decision is going against you wrongly, you can
+always try appealing to a higher power. As of this writing, that higher
+power tends to be Andrew Morton. Andrew has a great deal of respect in the
+kernel development community; he can often unjam a situation which seems to
+be hopelessly blocked. Appealing to Andrew should not be done lightly,
+though, and not before all other alternatives have been explored. And bear
+in mind, of course, that he may not agree with you either.
+
+
+6.2: WHAT HAPPENS NEXT
+
+If a patch is considered to be a good thing to add to the kernel, and once
+most of the review issues have been resolved, the next step is usually
+entry into a subsystem maintainer's tree. How that works varies from one
+subsystem to the next; each maintainer has his or her own way of doing
+things. In particular, there may be more than one tree - one, perhaps,
+dedicated to patches planned for the next merge window, and another for
+longer-term work.
+
+For patches applying to areas for which there is no obvious subsystem tree
+(memory management patches, for example), the default tree often ends up
+being -mm. Patches which affect multiple subsystems can also end up going
+through the -mm tree.
+
+Inclusion into a subsystem tree can bring a higher level of visibility to a
+patch. Now other developers working with that tree will get the patch by
+default. Subsystem trees typically feed into -mm and linux-next as well,
+making their contents visible to the development community as a whole. At
+this point, there's a good chance that you will get more comments from a
+new set of reviewers; these comments need to be answered as in the previous
+round.
+
+What may also happen at this point, depending on the nature of your patch,
+is that conflicts with work being done by others turn up. In the worst
+case, heavy patch conflicts can result in some work being put on the back
+burner so that the remaining patches can be worked into shape and merged.
+Other times, conflict resolution will involve working with the other
+developers and, possibly, moving some patches between trees to ensure that
+everything applies cleanly. This work can be a pain, but count your
+blessings: before the advent of the linux-next tree, these conflicts often
+only turned up during the merge window and had to be addressed in a hurry.
+Now they can be resolved at leisure, before the merge window opens.
+
+Some day, if all goes well, you'll log on and see that your patch has been
+merged into the mainline kernel. Congratulations! Once the celebration is
+complete (and you have added yourself to the MAINTAINERS file), though, it
+is worth remembering an important little fact: the job still is not done.
+Merging into the mainline brings its own challenges.
+
+To begin with, the visibility of your patch has increased yet again. There
+may be a new round of comments from developers who had not been aware of
+the patch before. It may be tempting to ignore them, since there is no
+longer any question of your code being merged. Resist that temptation,
+though; you still need to be responsive to developers who have questions or
+suggestions.
+
+More importantly, though: inclusion into the mainline puts your code into
+the hands of a much larger group of testers. Even if you have contributed
+a driver for hardware which is not yet available, you will be surprised by
+how many people will build your code into their kernels. And, of course,
+where there are testers, there will be bug reports.
+
+The worst sort of bug reports are regressions. If your patch causes a
+regression, you'll find an uncomfortable number of eyes upon you;
+regressions need to be fixed as soon as possible. If you are unwilling or
+unable to fix the regression (and nobody else does it for you), your patch
+will almost certainly be removed during the stabilization period. Beyond
+negating all of the work you have done to get your patch into the mainline,
+having a patch pulled as the result of a failure to fix a regression could
+well make it harder for you to get work merged in the future.
+
+After any regressions have been dealt with, there may be other, ordinary
+bugs to deal with. The stabilization period is your best opportunity to
+fix these bugs and ensure that your code's debut in a mainline kernel
+release is as solid as possible. So, please, answer bug reports, and fix
+the problems if at all possible. That's what the stabilization period is
+for; you can start creating cool new patches once any problems with the old
+ones have been taken care of.
+
+And don't forget that there are other milestones which may also create bug
+reports: the next mainline stable release, when prominent distributors pick
+up a version of the kernel containing your patch, etc. Continuing to
+respond to these reports is a matter of basic pride in your work. If that
+is insufficient motivation, though, it's also worth considering that the
+development community remembers developers who lose interest in their code
+after it's merged. The next time you post a patch, they will be evaluating
+it with the assumption that you will not be around to maintain it
+afterward.
+
+
+6.3: OTHER THINGS THAT CAN HAPPEN
+
+One day, you may open your mail client and see that somebody has mailed you
+a patch to your code. That is one of the advantages of having your code
+out there in the open, after all. If you agree with the patch, you can
+either forward it on to the subsystem maintainer (be sure to include a
+proper From: line so that the attribution is correct, and add a signoff of
+your own), or send an Acked-by: response back and let the original poster
+send it upward.
+
+If you disagree with the patch, send a polite response explaining why. If
+possible, tell the author what changes need to be made to make the patch
+acceptable to you. There is a certain resistance to merging patches which
+are opposed by the author and maintainer of the code, but it only goes so
+far. If you are seen as needlessly blocking good work, those patches will
+eventually flow around you and get into the mainline anyway. In the Linux
+kernel, nobody has absolute veto power over any code. Except maybe Linus.
+
+On very rare occasion, you may see something completely different: another
+developer posts a different solution to your problem. At that point,
+chances are that one of the two patches will not be merged, and "mine was
+here first" is not considered to be a compelling technical argument. If
+somebody else's patch displaces yours and gets into the mainline, there is
+really only one way to respond: be pleased that your problem got solved and
+get on with your work. Having one's work shoved aside in this manner can
+be hurtful and discouraging, but the community will remember your reaction
+long after they have forgotten whose patch actually got merged.
diff --git a/Documentation/development-process/7.AdvancedTopics b/Documentation/development-process/7.AdvancedTopics
new file mode 100644
index 0000000..a2cf740
--- /dev/null
+++ b/Documentation/development-process/7.AdvancedTopics
@@ -0,0 +1,173 @@
+7: ADVANCED TOPICS
+
+At this point, hopefully, you have a handle on how the development process
+works. There is still more to learn, however! This section will cover a
+number of topics which can be helpful for developers wanting to become a
+regular part of the Linux kernel development process.
+
+7.1: MANAGING PATCHES WITH GIT
+
+The use of distributed version control for the kernel began in early 2002,
+when Linus first started playing with the proprietary BitKeeper
+application. While BitKeeper was controversial, the approach to software
+version management it embodied most certainly was not. Distributed version
+control enabled an immediate acceleration of the kernel development
+project. In current times, there are several free alternatives to
+BitKeeper. For better or for worse, the kernel project has settled on git
+as its tool of choice.
+
+Managing patches with git can make life much easier for the developer,
+especially as the volume of those patches grows. Git also has its rough
+edges and poses certain hazards; it is a young and powerful tool which is
+still being civilized by its developers. This document will not attempt to
+teach the reader how to use git; that would be sufficient material for a
+long document in its own right. Instead, the focus here will be on how git
+fits into the kernel development process in particular. Developers who
+wish to come up to speed with git will find more information at:
+
+ http://git.or.cz/
+
+ http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
+
+and on various tutorials found on the web.
+
+The first order of business is to read the above sites and get a solid
+understanding of how git works before trying to use it to make patches
+available to others. A git-using developer should be able to obtain a copy
+of the mainline repository, explore the revision history, commit changes to
+the tree, use branches, etc. An understanding of git's tools for the
+rewriting of history (such as rebase) is also useful. Git comes with its
+own terminology and concepts; a new user of git should know about refs,
+remote branches, the index, fast-forward merges, pushes and pulls, detached
+heads, etc. It can all be a little intimidating at the outset, but the
+concepts are not that hard to grasp with a bit of study.
+
+Using git to generate patches for submission by email can be a good
+exercise while coming up to speed.
+
+When you are ready to start putting up git trees for others to look at, you
+will, of course, need a server that can be pulled from. Setting up such a
+server with git-daemon is relatively straightforward if you have a system
+which is accessible to the Internet. Otherwise, free, public hosting sites
+(Github, for example) are starting to appear on the net. Established
+developers can get an account on kernel.org, but those are not easy to come
+by; see http://kernel.org/faq/ for more information.
+
+The normal git workflow involves the use of a lot of branches. Each line
+of development can be separated into a separate "topic branch" and
+maintained independently. Branches in git are cheap, there is no reason to
+not make free use of them. And, in any case, you should not do your
+development in any branch which you intend to ask others to pull from.
+Publicly-available branches should be created with care; merge in patches
+from development branches when they are in complete form and ready to go -
+not before.
+
+Git provides some powerful tools which can allow you to rewrite your
+development history. An inconvenient patch (one which breaks bisection,
+say, or which has some other sort of obvious bug) can be fixed in place or
+made to disappear from the history entirely. A patch series can be
+rewritten as if it had been written on top of today's mainline, even though
+you have been working on it for months. Changes can be transparently
+shifted from one branch to another. And so on. Judicious use of git's
+ability to revise history can help in the creation of clean patch sets with
+fewer problems.
+
+Excessive use of this capability can lead to other problems, though, beyond
+a simple obsession for the creation of the perfect project history.
+Rewriting history will rewrite the changes contained in that history,
+turning a tested (hopefully) kernel tree into an untested one. But, beyond
+that, developers cannot easily collaborate if they do not have a shared
+view of the project history; if you rewrite history which other developers
+have pulled into their repositories, you will make life much more difficult
+for those developers. So a simple rule of thumb applies here: history
+which has been exported to others should generally be seen as immutable
+thereafter.
+
+So, once you push a set of changes to your publicly-available server, those
+changes should not be rewritten. Git will attempt to enforce this rule if
+you try to push changes which do not result in a fast-forward merge
+(i.e. changes which do not share the same history). It is possible to
+override this check, and there may be times when it is necessary to rewrite
+an exported tree. Moving changesets between trees to avoid conflicts in
+linux-next is one example. But such actions should be rare. This is one
+of the reasons why development should be done in private branches (which
+can be rewritten if necessary) and only moved into public branches when
+it's in a reasonably advanced state.
+
+As the mainline (or other tree upon which a set of changes is based)
+advances, it is tempting to merge with that tree to stay on the leading
+edge. For a private branch, rebasing can be an easy way to keep up with
+another tree, but rebasing is not an option once a tree is exported to the
+world. Once that happens, a full merge must be done. Merging occasionally
+makes good sense, but overly frequent merges can clutter the history
+needlessly. Suggested technique in this case is to merge infrequently, and
+generally only at specific release points (such as a mainline -rc
+release). If you are nervous about specific changes, you can always
+perform test merges in a private branch. The git "rerere" tool can be
+useful in such situations; it remembers how merge conflicts were resolved
+so that you don't have to do the same work twice.
+
+One of the biggest recurring complaints about tools like git is this: the
+mass movement of patches from one repository to another makes it easy to
+slip in ill-advised changes which go into the mainline below the review
+radar. Kernel developers tend to get unhappy when they see that kind of
+thing happening; putting up a git tree with unreviewed or off-topic patches
+can affect your ability to get trees pulled in the future. Quoting Linus:
+
+ You can send me patches, but for me to pull a git patch from you, I
+ need to know that you know what you're doing, and I need to be able
+ to trust things *without* then having to go and check every
+ individual change by hand.
+
+(http://lwn.net/Articles/224135/).
+
+To avoid this kind of situation, ensure that all patches within a given
+branch stick closely to the associated topic; a "driver fixes" branch
+should not be making changes to the core memory management code. And, most
+importantly, do not use a git tree to bypass the review process. Post an
+occasional summary of the tree to the relevant list, and, when the time is
+right, request that the tree be included in linux-next.
+
+If and when others start to send patches for inclusion into your tree,
+don't forget to review them. Also ensure that you maintain the correct
+authorship information; the git "am" tool does its best in this regard, but
+you may have to add a "From:" line to the patch if it has been relayed to
+you via a third party.
+
+When requesting a pull, be sure to give all the relevant information: where
+your tree is, what branch to pull, and what changes will result from the
+pull. The git request-pull command can be helpful in this regard; it will
+format the request as other developers expect, and will also check to be
+sure that you have remembered to push those changes to the public server.
+
+
+7.2: REVIEWING PATCHES
+
+Some readers will certainly object to putting this section with "advanced
+topics" on the grounds that even beginning kernel developers should be
+reviewing patches. It is certainly true that there is no better way to
+learn how to program in the kernel environment than by looking at code
+posted by others. In addition, reviewers are forever in short supply; by
+looking at code you can make a significant contribution to the process as a
+whole.
+
+Reviewing code can be an intimidating prospect, especially for a new kernel
+developer who may well feel nervous about questioning code - in public -
+which has been posted by those with more experience. Even code written by
+the most experienced developers can be improved, though. Perhaps the best
+piece of advice for reviewers (all reviewers) is this: phrase review
+comments as questions rather than criticisms. Asking "how does the lock
+get released in this path?" will always work better than stating "the
+locking here is wrong."
+
+Different developers will review code from different points of view. Some
+are mostly concerned with coding style and whether code lines have trailing
+white space. Others will focus primarily on whether the change implemented
+by the patch as a whole is a good thing for the kernel or not. Yet others
+will check for problematic locking, excessive stack usage, possible
+security issues, duplication of code found elsewhere, adequate
+documentation, adverse effects on performance, user-space ABI changes, etc.
+All types of review, if they lead to better code going into the kernel, are
+welcome and worthwhile.
+
+
diff --git a/Documentation/development-process/8.Conclusion b/Documentation/development-process/8.Conclusion
new file mode 100644
index 0000000..1990ab4
--- /dev/null
+++ b/Documentation/development-process/8.Conclusion
@@ -0,0 +1,74 @@
+8: FOR MORE INFORMATION
+
+There are numerous sources of information on Linux kernel development and
+related topics. First among those will always be the Documentation
+directory found in the kernel source distribution. The top-level HOWTO
+file is an important starting point; SubmittingPatches and
+SubmittingDrivers are also something which all kernel developers should
+read. Many internal kernel APIs are documented using the kerneldoc
+mechanism; "make htmldocs" or "make pdfdocs" can be used to generate those
+documents in HTML or PDF format (though the version of TeX shipped by some
+distributions runs into internal limits and fails to process the documents
+properly).
+
+Various web sites discuss kernel development at all levels of detail. Your
+author would like to humbly suggest http://lwn.net/ as a source;
+information on many specific kernel topics can be found via the LWN kernel
+index at:
+
+ http://lwn.net/Kernel/Index/
+
+Beyond that, a valuable resource for kernel developers is:
+
+ http://kernelnewbies.org/
+
+Information about the linux-next tree gathers at:
+
+ http://linux.f-seidel.de/linux-next/pmwiki/
+
+And, of course, one should not forget http://kernel.org/, the definitive
+location for kernel release information.
+
+There are a number of books on kernel development:
+
+ Linux Device Drivers, 3rd Edition (Jonathan Corbet, Alessandro
+ Rubini, and Greg Kroah-Hartman). Online at
+ http://lwn.net/Kernel/LDD3/.
+
+ Linux Kernel Development (Robert Love).
+
+ Understanding the Linux Kernel (Daniel Bovet and Marco Cesati).
+
+All of these books suffer from a common fault, though: they tend to be
+somewhat obsolete by the time they hit the shelves, and they have been on
+the shelves for a while now. Still, there is quite a bit of good
+information to be found there.
+
+Documentation for git can be found at:
+
+ http://www.kernel.org/pub/software/scm/git/docs/
+
+ http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
+
+
+9: CONCLUSION
+
+Congratulations to anybody who has made it through this long-winded
+document. Hopefully it has provided a helpful understanding of how the
+Linux kernel is developed and how you can participate in that process.
+
+In the end, it's the participation that matters. Any open source software
+project is no more than the sum of what its contributors put into it. The
+Linux kernel has progressed as quickly and as well as it has because it has
+been helped by an impressively large group of developers, all of whom are
+working to make it better. The kernel is a premier example of what can be
+done when thousands of people work together toward a common goal.
+
+The kernel can always benefit from a larger developer base, though. There
+is always more work to do. But, just as importantly, most other
+participants in the Linux ecosystem can benefit through contributing to the
+kernel. Getting code into the mainline is the key to higher code quality,
+lower maintenance and distribution costs, a higher level of influence over
+the direction of kernel development, and more. It is a situation where
+everybody involved wins. Fire up your editor and come join us; you will be
+more than welcome.
diff --git a/Documentation/device-mapper/delay.txt b/Documentation/device-mapper/delay.txt
new file mode 100644
index 0000000..15adc55
--- /dev/null
+++ b/Documentation/device-mapper/delay.txt
@@ -0,0 +1,26 @@
+dm-delay
+========
+
+Device-Mapper's "delay" target delays reads and/or writes
+and maps them to different devices.
+
+Parameters:
+ <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+
+With separate write parameters, the first set is only used for reads.
+Delays are specified in milliseconds.
+
+Example scripts
+===============
+[[
+#!/bin/sh
+# Create device delaying rw operation for 500ms
+echo "0 `blockdev --getsize $1` delay $1 0 500" | dmsetup create delayed
+]]
+
+[[
+#!/bin/sh
+# Create device delaying only write operation for 500ms and
+# splitting reads and writes to different devices $1 $2
+echo "0 `blockdev --getsize $1` delay $1 0 0 $2 0 500" | dmsetup create delayed
+]]
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
new file mode 100644
index 0000000..6680cab
--- /dev/null
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -0,0 +1,52 @@
+dm-crypt
+=========
+
+Device-Mapper's "crypt" target provides transparent encryption of block devices
+using the kernel crypto API.
+
+Parameters: <cipher> <key> <iv_offset> <device path> <offset>
+
+<cipher>
+ Encryption cipher and an optional IV generation mode.
+ (In format cipher-chainmode-ivopts:ivmode).
+ Examples:
+ des
+ aes-cbc-essiv:sha256
+ twofish-ecb
+
+ /proc/crypto contains supported crypto modes
+
+<key>
+ Key used for encryption. It is encoded as a hexadecimal number.
+ You can only use key sizes that are valid for the selected cipher.
+
+<iv_offset>
+ The IV offset is a sector count that is added to the sector number
+ before creating the IV.
+
+<device path>
+ This is the device that is going to be used as backend and contains the
+ encrypted data. You can specify it as a path like /dev/xxx or a device
+ number <major>:<minor>.
+
+<offset>
+ Starting sector within the device where the encrypted data begins.
+
+Example scripts
+===============
+LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
+encryption with dm-crypt using the 'cryptsetup' utility, see
+http://luks.endorphin.org/
+
+[[
+#!/bin/sh
+# Create a crypt device using dmsetup
+dmsetup create crypt1 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0"
+]]
+
+[[
+#!/bin/sh
+# Create a crypt device using cryptsetup and LUKS header with default cipher
+cryptsetup luksFormat $1
+cryptsetup luksOpen $1 crypt1
+]]
diff --git a/Documentation/device-mapper/dm-io.txt b/Documentation/device-mapper/dm-io.txt
new file mode 100644
index 0000000..3b5d9a5
--- /dev/null
+++ b/Documentation/device-mapper/dm-io.txt
@@ -0,0 +1,75 @@
+dm-io
+=====
+
+Dm-io provides synchronous and asynchronous I/O services. There are three
+types of I/O services available, and each type has a sync and an async
+version.
+
+The user must set up an io_region structure to describe the desired location
+of the I/O. Each io_region indicates a block-device along with the starting
+sector and size of the region.
+
+ struct io_region {
+ struct block_device *bdev;
+ sector_t sector;
+ sector_t count;
+ };
+
+Dm-io can read from one io_region or write to one or more io_regions. Writes
+to multiple regions are specified by an array of io_region structures.
+
+The first I/O service type takes a list of memory pages as the data buffer for
+the I/O, along with an offset into the first page.
+
+ struct page_list {
+ struct page_list *next;
+ struct page *page;
+ };
+
+ int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
+ struct page_list *pl, unsigned int offset,
+ unsigned long *error_bits);
+ int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
+ struct page_list *pl, unsigned int offset,
+ io_notify_fn fn, void *context);
+
+The second I/O service type takes an array of bio vectors as the data buffer
+for the I/O. This service can be handy if the caller has a pre-assembled bio,
+but wants to direct different portions of the bio to different devices.
+
+ int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where,
+ int rw, struct bio_vec *bvec,
+ unsigned long *error_bits);
+ int dm_io_async_bvec(unsigned int num_regions, struct io_region *where,
+ int rw, struct bio_vec *bvec,
+ io_notify_fn fn, void *context);
+
+The third I/O service type takes a pointer to a vmalloc'd memory buffer as the
+data buffer for the I/O. This service can be handy if the caller needs to do
+I/O to a large region but doesn't want to allocate a large number of individual
+memory pages.
+
+ int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
+ void *data, unsigned long *error_bits);
+ int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
+ void *data, io_notify_fn fn, void *context);
+
+Callers of the asynchronous I/O services must include the name of a completion
+callback routine and a pointer to some context data for the I/O.
+
+ typedef void (*io_notify_fn)(unsigned long error, void *context);
+
+The "error" parameter in this callback, as well as the "*error" parameter in
+all of the synchronous versions, is a bitset (instead of a simple error value).
+In the case of an write-I/O to multiple regions, this bitset allows dm-io to
+indicate success or failure on each individual region.
+
+Before using any of the dm-io services, the user should call dm_io_get()
+and specify the number of pages they expect to perform I/O on concurrently.
+Dm-io will attempt to resize its mempool to make sure enough pages are
+always available in order to avoid unnecessary waiting while performing I/O.
+
+When the user is finished using the dm-io services, they should call
+dm_io_put() and specify the same number of pages that were given on the
+dm_io_get() call.
+
diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt
new file mode 100644
index 0000000..07edbd8
--- /dev/null
+++ b/Documentation/device-mapper/dm-uevent.txt
@@ -0,0 +1,97 @@
+The device-mapper uevent code adds the capability to device-mapper to create
+and send kobject uevents (uevents). Previously device-mapper events were only
+available through the ioctl interface. The advantage of the uevents interface
+is the event contains environment attributes providing increased context for
+the event avoiding the need to query the state of the device-mapper device after
+the event is received.
+
+There are two functions currently for device-mapper events. The first function
+listed creates the event and the second function sends the event(s).
+
+void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
+ const char *path, unsigned nr_valid_paths)
+
+void dm_send_uevents(struct list_head *events, struct kobject *kobj)
+
+
+The variables added to the uevent environment are:
+
+Variable Name: DM_TARGET
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description:
+Value: Name of device-mapper target that generated the event.
+
+Variable Name: DM_ACTION
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description:
+Value: Device-mapper specific action that caused the uevent action.
+ PATH_FAILED - A path has failed.
+ PATH_REINSTATED - A path has been reinstated.
+
+Variable Name: DM_SEQNUM
+Uevent Action(s): KOBJ_CHANGE
+Type: unsigned integer
+Description: A sequence number for this specific device-mapper device.
+Value: Valid unsigned integer range.
+
+Variable Name: DM_PATH
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: Major and minor number of the path device pertaining to this
+event.
+Value: Path name in the form of "Major:Minor"
+
+Variable Name: DM_NR_VALID_PATHS
+Uevent Action(s): KOBJ_CHANGE
+Type: unsigned integer
+Description:
+Value: Valid unsigned integer range.
+
+Variable Name: DM_NAME
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: Name of the device-mapper device.
+Value: Name
+
+Variable Name: DM_UUID
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: UUID of the device-mapper device.
+Value: UUID. (Empty string if there isn't one.)
+
+An example of the uevents generated as captured by udevmonitor is shown
+below.
+
+1.) Path failure.
+UEVENT[1192521009.711215] change@/block/dm-3
+ACTION=change
+DEVPATH=/block/dm-3
+SUBSYSTEM=block
+DM_TARGET=multipath
+DM_ACTION=PATH_FAILED
+DM_SEQNUM=1
+DM_PATH=8:32
+DM_NR_VALID_PATHS=0
+DM_NAME=mpath2
+DM_UUID=mpath-35333333000002328
+MINOR=3
+MAJOR=253
+SEQNUM=1130
+
+2.) Path reinstate.
+UEVENT[1192521132.989927] change@/block/dm-3
+ACTION=change
+DEVPATH=/block/dm-3
+SUBSYSTEM=block
+DM_TARGET=multipath
+DM_ACTION=PATH_REINSTATED
+DM_SEQNUM=2
+DM_PATH=8:32
+DM_NR_VALID_PATHS=1
+DM_NAME=mpath2
+DM_UUID=mpath-35333333000002328
+MINOR=3
+MAJOR=253
+SEQNUM=1131
diff --git a/Documentation/device-mapper/kcopyd.txt b/Documentation/device-mapper/kcopyd.txt
new file mode 100644
index 0000000..820382c
--- /dev/null
+++ b/Documentation/device-mapper/kcopyd.txt
@@ -0,0 +1,47 @@
+kcopyd
+======
+
+Kcopyd provides the ability to copy a range of sectors from one block-device
+to one or more other block-devices, with an asynchronous completion
+notification. It is used by dm-snapshot and dm-mirror.
+
+Users of kcopyd must first create a client and indicate how many memory pages
+to set aside for their copy jobs. This is done with a call to
+kcopyd_client_create().
+
+ int kcopyd_client_create(unsigned int num_pages,
+ struct kcopyd_client **result);
+
+To start a copy job, the user must set up io_region structures to describe
+the source and destinations of the copy. Each io_region indicates a
+block-device along with the starting sector and size of the region. The source
+of the copy is given as one io_region structure, and the destinations of the
+copy are given as an array of io_region structures.
+
+ struct io_region {
+ struct block_device *bdev;
+ sector_t sector;
+ sector_t count;
+ };
+
+To start the copy, the user calls kcopyd_copy(), passing in the client
+pointer, pointers to the source and destination io_regions, the name of a
+completion callback routine, and a pointer to some context data for the copy.
+
+ int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
+ unsigned int num_dests, struct io_region *dests,
+ unsigned int flags, kcopyd_notify_fn fn, void *context);
+
+ typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err,
+ void *context);
+
+When the copy completes, kcopyd will call the user's completion routine,
+passing back the user's context pointer. It will also indicate if a read or
+write error occurred during the copy.
+
+When a user is done with all their copy jobs, they should call
+kcopyd_client_destroy() to delete the kcopyd client, which will release the
+associated memory pages.
+
+ void kcopyd_client_destroy(struct kcopyd_client *kc);
+
diff --git a/Documentation/device-mapper/linear.txt b/Documentation/device-mapper/linear.txt
new file mode 100644
index 0000000..d5307d3
--- /dev/null
+++ b/Documentation/device-mapper/linear.txt
@@ -0,0 +1,61 @@
+dm-linear
+=========
+
+Device-Mapper's "linear" target maps a linear range of the Device-Mapper
+device onto a linear range of another device. This is the basic building
+block of logical volume managers.
+
+Parameters: <dev path> <offset>
+ <dev path>: Full pathname to the underlying block-device, or a
+ "major:minor" device-number.
+ <offset>: Starting sector within the device.
+
+
+Example scripts
+===============
+[[
+#!/bin/sh
+# Create an identity mapping for a device
+echo "0 `blockdev --getsize $1` linear $1 0" | dmsetup create identity
+]]
+
+
+[[
+#!/bin/sh
+# Join 2 devices together
+size1=`blockdev --getsize $1`
+size2=`blockdev --getsize $2`
+echo "0 $size1 linear $1 0
+$size1 $size2 linear $2 0" | dmsetup create joined
+]]
+
+
+[[
+#!/usr/bin/perl -w
+# Split a device into 4M chunks and then join them together in reverse order.
+
+my $name = "reverse";
+my $extent_size = 4 * 1024 * 2;
+my $dev = $ARGV[0];
+my $table = "";
+my $count = 0;
+
+if (!defined($dev)) {
+ die("Please specify a device.\n");
+}
+
+my $dev_size = `blockdev --getsize $dev`;
+my $extents = int($dev_size / $extent_size) -
+ (($dev_size % $extent_size) ? 1 : 0);
+
+while ($extents > 0) {
+ my $this_start = $count * $extent_size;
+ $extents--;
+ $count++;
+ my $this_offset = $extents * $extent_size;
+
+ $table .= "$this_start $extent_size linear $dev $this_offset\n";
+}
+
+`echo \"$table\" | dmsetup create $name`;
+]]
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt
new file mode 100644
index 0000000..a5009c8
--- /dev/null
+++ b/Documentation/device-mapper/snapshot.txt
@@ -0,0 +1,74 @@
+Device-mapper snapshot support
+==============================
+
+Device-mapper allows you, without massive data copying:
+
+*) To create snapshots of any block device i.e. mountable, saved states of
+the block device which are also writable without interfering with the
+original content;
+*) To create device "forks", i.e. multiple different versions of the
+same data stream.
+
+
+In both cases, dm copies only the chunks of data that get changed and
+uses a separate copy-on-write (COW) block device for storage.
+
+
+There are two dm targets available: snapshot and snapshot-origin.
+
+*) snapshot-origin <origin>
+
+which will normally have one or more snapshots based on it.
+Reads will be mapped directly to the backing device. For each write, the
+original data will be saved in the <COW device> of each snapshot to keep
+its visible content unchanged, at least until the <COW device> fills up.
+
+
+*) snapshot <origin> <COW device> <persistent?> <chunksize>
+
+A snapshot of the <origin> block device is created. Changed chunks of
+<chunksize> sectors will be stored on the <COW device>. Writes will
+only go to the <COW device>. Reads will come from the <COW device> or
+from <origin> for unchanged data. <COW device> will often be
+smaller than the origin and if it fills up the snapshot will become
+useless and be disabled, returning errors. So it is important to monitor
+the amount of free space and expand the <COW device> before it fills up.
+
+<persistent?> is P (Persistent) or N (Not persistent - will not survive
+after reboot).
+The difference is that for transient snapshots less metadata must be
+saved on disk - they can be kept in memory by the kernel.
+
+
+How this is used by LVM2
+========================
+When you create the first LVM2 snapshot of a volume, four dm devices are used:
+
+1) a device containing the original mapping table of the source volume;
+2) a device used as the <COW device>;
+3) a "snapshot" device, combining #1 and #2, which is the visible snapshot
+ volume;
+4) the "original" volume (which uses the device number used by the original
+ source volume), whose table is replaced by a "snapshot-origin" mapping
+ from device #1.
+
+A fixed naming scheme is used, so with the following commands:
+
+lvcreate -L 1G -n base volumeGroup
+lvcreate -L 100M --snapshot -n snap volumeGroup/base
+
+we'll have this situation (with volumes in above order):
+
+# dmsetup table|grep volumeGroup
+
+volumeGroup-base-real: 0 2097152 linear 8:19 384
+volumeGroup-snap-cow: 0 204800 linear 8:19 2097536
+volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16
+volumeGroup-base: 0 2097152 snapshot-origin 254:11
+
+# ls -lL /dev/mapper/volumeGroup-*
+brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
+brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
+brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
+
diff --git a/Documentation/device-mapper/striped.txt b/Documentation/device-mapper/striped.txt
new file mode 100644
index 0000000..f34d323
--- /dev/null
+++ b/Documentation/device-mapper/striped.txt
@@ -0,0 +1,58 @@
+dm-stripe
+=========
+
+Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0)
+device across one or more underlying devices. Data is written in "chunks",
+with consecutive chunks rotating among the underlying devices. This can
+potentially provide improved I/O throughput by utilizing several physical
+devices in parallel.
+
+Parameters: <num devs> <chunk size> [<dev path> <offset>]+
+ <num devs>: Number of underlying devices.
+ <chunk size>: Size of each chunk of data. Must be a power-of-2 and at
+ least as large as the system's PAGE_SIZE.
+ <dev path>: Full pathname to the underlying block-device, or a
+ "major:minor" device-number.
+ <offset>: Starting sector within the device.
+
+One or more underlying devices can be specified. The striped device size must
+be a multiple of the chunk size and a multiple of the number of underlying
+devices.
+
+
+Example scripts
+===============
+
+[[
+#!/usr/bin/perl -w
+# Create a striped device across any number of underlying devices. The device
+# will be called "stripe_dev" and have a chunk-size of 128k.
+
+my $chunk_size = 128 * 2;
+my $dev_name = "stripe_dev";
+my $num_devs = @ARGV;
+my @devs = @ARGV;
+my ($min_dev_size, $stripe_dev_size, $i);
+
+if (!$num_devs) {
+ die("Specify at least one device\n");
+}
+
+$min_dev_size = `blockdev --getsize $devs[0]`;
+for ($i = 1; $i < $num_devs; $i++) {
+ my $this_size = `blockdev --getsize $devs[$i]`;
+ $min_dev_size = ($min_dev_size < $this_size) ?
+ $min_dev_size : $this_size;
+}
+
+$stripe_dev_size = $min_dev_size * $num_devs;
+$stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs);
+
+$table = "0 $stripe_dev_size striped $num_devs $chunk_size";
+for ($i = 0; $i < $num_devs; $i++) {
+ $table .= " $devs[$i] 0";
+}
+
+`echo $table | dmsetup create $dev_name`;
+]]
+
diff --git a/Documentation/device-mapper/zero.txt b/Documentation/device-mapper/zero.txt
new file mode 100644
index 0000000..20fb38e
--- /dev/null
+++ b/Documentation/device-mapper/zero.txt
@@ -0,0 +1,37 @@
+dm-zero
+=======
+
+Device-Mapper's "zero" target provides a block-device that always returns
+zero'd data on reads and silently drops writes. This is similar behavior to
+/dev/zero, but as a block-device instead of a character-device.
+
+Dm-zero has no target-specific parameters.
+
+One very interesting use of dm-zero is for creating "sparse" devices in
+conjunction with dm-snapshot. A sparse device reports a device-size larger
+than the amount of actual storage space available for that device. A user can
+write data anywhere within the sparse device and read it back like a normal
+device. Reads to previously unwritten areas will return a zero'd buffer. When
+enough data has been written to fill up the actual storage space, the sparse
+device is deactivated. This can be very useful for testing device and
+filesystem limitations.
+
+To create a sparse device, start by creating a dm-zero device that's the
+desired size of the sparse device. For this example, we'll assume a 10TB
+sparse device.
+
+TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors
+echo "0 $TEN_TERABYTES zero" | dmsetup create zero1
+
+Then create a snapshot of the zero device, using any available block-device as
+the COW device. The size of the COW device will determine the amount of real
+space available to the sparse device. For this example, we'll assume /dev/sdb1
+is an available 10GB partition.
+
+echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \
+ dmsetup create sparse1
+
+This will create a 10TB sparse device called /dev/mapper/sparse1 that has
+10GB of actual storage space available. If more than 10GB of data is written
+to this device, it will start returning I/O errors.
+
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
new file mode 100644
index 0000000..2be0824
--- /dev/null
+++ b/Documentation/devices.txt
@@ -0,0 +1,3335 @@
+
+ LINUX ALLOCATED DEVICES (2.6+ version)
+
+ Maintained by Torben Mathiasen <device@lanana.org>
+
+ Last revised: 29 November 2006
+
+This list is the Linux Device List, the official registry of allocated
+device numbers and /dev directory nodes for the Linux operating
+system.
+
+The latest version of this list is available from
+http://www.lanana.org/docs/device-list/ or
+ftp://ftp.kernel.org/pub/linux/docs/device-list/. This version may be
+newer than the one distributed with the Linux kernel.
+
+The LaTeX version of this document is no longer maintained.
+
+This document is included by reference into the Filesystem Hierarchy
+Standard (FHS). The FHS is available from http://www.pathname.com/fhs/.
+
+Allocations marked (68k/Amiga) apply to Linux/68k on the Amiga
+platform only. Allocations marked (68k/Atari) apply to Linux/68k on
+the Atari platform only.
+
+The symbol {2.6} means the allocation is obsolete and scheduled for
+removal once kernel version 2.6 (or equivalent) is released. Some of these
+allocations have already been removed.
+
+This document is in the public domain. The author requests, however,
+that semantically altered versions are not distributed without
+permission of the author, assuming the author can be contacted without
+an unreasonable effort.
+
+In particular, please don't sent patches for this list to Linus, at
+least not without contacting me first.
+
+I do not have any information about these devices beyond what appears
+on this list. Any such information requests will be deleted without
+reply.
+
+
+ **** DEVICE DRIVERS AUTHORS PLEASE READ THIS ****
+
+To have a major number allocated, or a minor number in situations
+where that applies (e.g. busmice), please contact me with the
+appropriate device information. Also, if you have additional
+information regarding any of the devices listed below, or if I have
+made a mistake, I would greatly appreciate a note.
+
+I do, however, make a few requests about the nature of your report.
+This is necessary for me to be able to keep this list up to date and
+correct in a timely manner. First of all, *please* send it to the
+correct address... <device@lanana.org>. I receive hundreds of email
+messages a day, so mail sent to other addresses may very well get lost
+in the avalanche. Please put in a descriptive subject, so I can find
+your mail again should I need to. Too many people send me email
+saying just "device number request" in the subject.
+
+Second, please include a description of the device *in the same format
+as this list*. The reason for this is that it is the only way I have
+found to ensure I have all the requisite information to publish your
+device and avoid conflicts.
+
+Third, please don't assume that the distributed version of the list is
+up to date. Due to the number of registrations I have to maintain it
+in "batch mode", so there is likely additional registrations that
+haven't been listed yet.
+
+Finally, sometimes I have to play "namespace police." Please don't be
+offended. I often get submissions for /dev names that would be bound
+to cause conflicts down the road. I am trying to avoid getting in a
+situation where we would have to suffer an incompatible forward
+change. Therefore, please consult with me *before* you make your
+device names and numbers in any way public, at least to the point
+where it would be at all difficult to get them changed.
+
+Your cooperation is appreciated.
+
+
+ 0 Unnamed devices (e.g. non-device mounts)
+ 0 = reserved as null device number
+ See block major 144, 145, 146 for expansion areas.
+
+ 1 char Memory devices
+ 1 = /dev/mem Physical memory access
+ 2 = /dev/kmem Kernel virtual memory access
+ 3 = /dev/null Null device
+ 4 = /dev/port I/O port access
+ 5 = /dev/zero Null byte source
+ 6 = /dev/core OBSOLETE - replaced by /proc/kcore
+ 7 = /dev/full Returns ENOSPC on write
+ 8 = /dev/random Nondeterministic random number gen.
+ 9 = /dev/urandom Faster, less secure random number gen.
+ 10 = /dev/aio Asynchronous I/O notification interface
+ 11 = /dev/kmsg Writes to this come out as printk's
+ 12 = /dev/oldmem Used by crashdump kernels to access
+ the memory of the kernel that crashed.
+
+ 1 block RAM disk
+ 0 = /dev/ram0 First RAM disk
+ 1 = /dev/ram1 Second RAM disk
+ ...
+ 250 = /dev/initrd Initial RAM disk {2.6}
+
+ Older kernels had /dev/ramdisk (1, 1) here.
+ /dev/initrd refers to a RAM disk which was preloaded
+ by the boot loader; newer kernels use /dev/ram0 for
+ the initrd.
+
+ 2 char Pseudo-TTY masters
+ 0 = /dev/ptyp0 First PTY master
+ 1 = /dev/ptyp1 Second PTY master
+ ...
+ 255 = /dev/ptyef 256th PTY master
+
+ Pseudo-tty's are named as follows:
+ * Masters are "pty", slaves are "tty";
+ * the fourth letter is one of pqrstuvwxyzabcde indicating
+ the 1st through 16th series of 16 pseudo-ttys each, and
+ * the fifth letter is one of 0123456789abcdef indicating
+ the position within the series.
+
+ These are the old-style (BSD) PTY devices; Unix98
+ devices are on major 128 and above and use the PTY
+ master multiplex (/dev/ptmx) to acquire a PTY on
+ demand.
+
+ 2 block Floppy disks
+ 0 = /dev/fd0 Controller 0, drive 0, autodetect
+ 1 = /dev/fd1 Controller 0, drive 1, autodetect
+ 2 = /dev/fd2 Controller 0, drive 2, autodetect
+ 3 = /dev/fd3 Controller 0, drive 3, autodetect
+ 128 = /dev/fd4 Controller 1, drive 0, autodetect
+ 129 = /dev/fd5 Controller 1, drive 1, autodetect
+ 130 = /dev/fd6 Controller 1, drive 2, autodetect
+ 131 = /dev/fd7 Controller 1, drive 3, autodetect
+
+ To specify format, add to the autodetect device number:
+ 0 = /dev/fd? Autodetect format
+ 4 = /dev/fd?d360 5.25" 360K in a 360K drive(1)
+ 20 = /dev/fd?h360 5.25" 360K in a 1200K drive(1)
+ 48 = /dev/fd?h410 5.25" 410K in a 1200K drive
+ 64 = /dev/fd?h420 5.25" 420K in a 1200K drive
+ 24 = /dev/fd?h720 5.25" 720K in a 1200K drive
+ 80 = /dev/fd?h880 5.25" 880K in a 1200K drive(1)
+ 8 = /dev/fd?h1200 5.25" 1200K in a 1200K drive(1)
+ 40 = /dev/fd?h1440 5.25" 1440K in a 1200K drive(1)
+ 56 = /dev/fd?h1476 5.25" 1476K in a 1200K drive
+ 72 = /dev/fd?h1494 5.25" 1494K in a 1200K drive
+ 92 = /dev/fd?h1600 5.25" 1600K in a 1200K drive(1)
+
+ 12 = /dev/fd?u360 3.5" 360K Double Density(2)
+ 16 = /dev/fd?u720 3.5" 720K Double Density(1)
+ 120 = /dev/fd?u800 3.5" 800K Double Density(2)
+ 52 = /dev/fd?u820 3.5" 820K Double Density
+ 68 = /dev/fd?u830 3.5" 830K Double Density
+ 84 = /dev/fd?u1040 3.5" 1040K Double Density(1)
+ 88 = /dev/fd?u1120 3.5" 1120K Double Density(1)
+ 28 = /dev/fd?u1440 3.5" 1440K High Density(1)
+ 124 = /dev/fd?u1600 3.5" 1600K High Density(1)
+ 44 = /dev/fd?u1680 3.5" 1680K High Density(3)
+ 60 = /dev/fd?u1722 3.5" 1722K High Density
+ 76 = /dev/fd?u1743 3.5" 1743K High Density
+ 96 = /dev/fd?u1760 3.5" 1760K High Density
+ 116 = /dev/fd?u1840 3.5" 1840K High Density(3)
+ 100 = /dev/fd?u1920 3.5" 1920K High Density(1)
+ 32 = /dev/fd?u2880 3.5" 2880K Extra Density(1)
+ 104 = /dev/fd?u3200 3.5" 3200K Extra Density
+ 108 = /dev/fd?u3520 3.5" 3520K Extra Density
+ 112 = /dev/fd?u3840 3.5" 3840K Extra Density(1)
+
+ 36 = /dev/fd?CompaQ Compaq 2880K drive; obsolete?
+
+ (1) Autodetectable format
+ (2) Autodetectable format in a Double Density (720K) drive only
+ (3) Autodetectable format in a High Density (1440K) drive only
+
+ NOTE: The letter in the device name (d, q, h or u)
+ signifies the type of drive: 5.25" Double Density (d),
+ 5.25" Quad Density (q), 5.25" High Density (h) or 3.5"
+ (any model, u). The use of the capital letters D, H
+ and E for the 3.5" models have been deprecated, since
+ the drive type is insignificant for these devices.
+
+ 3 char Pseudo-TTY slaves
+ 0 = /dev/ttyp0 First PTY slave
+ 1 = /dev/ttyp1 Second PTY slave
+ ...
+ 255 = /dev/ttyef 256th PTY slave
+
+ These are the old-style (BSD) PTY devices; Unix98
+ devices are on major 136 and above.
+
+ 3 block First MFM, RLL and IDE hard disk/CD-ROM interface
+ 0 = /dev/hda Master: whole disk (or CD-ROM)
+ 64 = /dev/hdb Slave: whole disk (or CD-ROM)
+
+ For partitions, add to the whole disk device number:
+ 0 = /dev/hd? Whole disk
+ 1 = /dev/hd?1 First partition
+ 2 = /dev/hd?2 Second partition
+ ...
+ 63 = /dev/hd?63 63rd partition
+
+ For Linux/i386, partitions 1-4 are the primary
+ partitions, and 5 and above are logical partitions.
+ Other versions of Linux use partitioning schemes
+ appropriate to their respective architectures.
+
+ 4 char TTY devices
+ 0 = /dev/tty0 Current virtual console
+
+ 1 = /dev/tty1 First virtual console
+ ...
+ 63 = /dev/tty63 63rd virtual console
+ 64 = /dev/ttyS0 First UART serial port
+ ...
+ 255 = /dev/ttyS191 192nd UART serial port
+
+ UART serial ports refer to 8250/16450/16550 series devices.
+
+ Older versions of the Linux kernel used this major
+ number for BSD PTY devices. As of Linux 2.1.115, this
+ is no longer supported. Use major numbers 2 and 3.
+
+ 4 block Aliases for dynamically allocated major devices to be used
+ when its not possible to create the real device nodes
+ because the root filesystem is mounted read-only.
+
+ 0 = /dev/root
+
+ 5 char Alternate TTY devices
+ 0 = /dev/tty Current TTY device
+ 1 = /dev/console System console
+ 2 = /dev/ptmx PTY master multiplex
+ 64 = /dev/cua0 Callout device for ttyS0
+ ...
+ 255 = /dev/cua191 Callout device for ttyS191
+
+ (5,1) is /dev/console starting with Linux 2.1.71. See
+ the section on terminal devices for more information
+ on /dev/console.
+
+ 6 char Parallel printer devices
+ 0 = /dev/lp0 Parallel printer on parport0
+ 1 = /dev/lp1 Parallel printer on parport1
+ ...
+
+ Current Linux kernels no longer have a fixed mapping
+ between parallel ports and I/O addresses. Instead,
+ they are redirected through the parport multiplex layer.
+
+ 7 char Virtual console capture devices
+ 0 = /dev/vcs Current vc text contents
+ 1 = /dev/vcs1 tty1 text contents
+ ...
+ 63 = /dev/vcs63 tty63 text contents
+ 128 = /dev/vcsa Current vc text/attribute contents
+ 129 = /dev/vcsa1 tty1 text/attribute contents
+ ...
+ 191 = /dev/vcsa63 tty63 text/attribute contents
+
+ NOTE: These devices permit both read and write access.
+
+ 7 block Loopback devices
+ 0 = /dev/loop0 First loop device
+ 1 = /dev/loop1 Second loop device
+ ...
+
+ The loop devices are used to mount filesystems not
+ associated with block devices. The binding to the
+ loop devices is handled by mount(8) or losetup(8).
+
+ 8 block SCSI disk devices (0-15)
+ 0 = /dev/sda First SCSI disk whole disk
+ 16 = /dev/sdb Second SCSI disk whole disk
+ 32 = /dev/sdc Third SCSI disk whole disk
+ ...
+ 240 = /dev/sdp Sixteenth SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 9 char SCSI tape devices
+ 0 = /dev/st0 First SCSI tape, mode 0
+ 1 = /dev/st1 Second SCSI tape, mode 0
+ ...
+ 32 = /dev/st0l First SCSI tape, mode 1
+ 33 = /dev/st1l Second SCSI tape, mode 1
+ ...
+ 64 = /dev/st0m First SCSI tape, mode 2
+ 65 = /dev/st1m Second SCSI tape, mode 2
+ ...
+ 96 = /dev/st0a First SCSI tape, mode 3
+ 97 = /dev/st1a Second SCSI tape, mode 3
+ ...
+ 128 = /dev/nst0 First SCSI tape, mode 0, no rewind
+ 129 = /dev/nst1 Second SCSI tape, mode 0, no rewind
+ ...
+ 160 = /dev/nst0l First SCSI tape, mode 1, no rewind
+ 161 = /dev/nst1l Second SCSI tape, mode 1, no rewind
+ ...
+ 192 = /dev/nst0m First SCSI tape, mode 2, no rewind
+ 193 = /dev/nst1m Second SCSI tape, mode 2, no rewind
+ ...
+ 224 = /dev/nst0a First SCSI tape, mode 3, no rewind
+ 225 = /dev/nst1a Second SCSI tape, mode 3, no rewind
+ ...
+
+ "No rewind" refers to the omission of the default
+ automatic rewind on device close. The MTREW or MTOFFL
+ ioctl()'s can be used to rewind the tape regardless of
+ the device used to access it.
+
+ 9 block Metadisk (RAID) devices
+ 0 = /dev/md0 First metadisk group
+ 1 = /dev/md1 Second metadisk group
+ ...
+
+ The metadisk driver is used to span a
+ filesystem across multiple physical disks.
+
+ 10 char Non-serial mice, misc features
+ 0 = /dev/logibm Logitech bus mouse
+ 1 = /dev/psaux PS/2-style mouse port
+ 2 = /dev/inportbm Microsoft Inport bus mouse
+ 3 = /dev/atibm ATI XL bus mouse
+ 4 = /dev/jbm J-mouse
+ 4 = /dev/amigamouse Amiga mouse (68k/Amiga)
+ 5 = /dev/atarimouse Atari mouse
+ 6 = /dev/sunmouse Sun mouse
+ 7 = /dev/amigamouse1 Second Amiga mouse
+ 8 = /dev/smouse Simple serial mouse driver
+ 9 = /dev/pc110pad IBM PC-110 digitizer pad
+ 10 = /dev/adbmouse Apple Desktop Bus mouse
+ 11 = /dev/vrtpanel Vr41xx embedded touch panel
+ 13 = /dev/vpcmouse Connectix Virtual PC Mouse
+ 14 = /dev/touchscreen/ucb1x00 UCB 1x00 touchscreen
+ 15 = /dev/touchscreen/mk712 MK712 touchscreen
+ 128 = /dev/beep Fancy beep device
+ 129 = /dev/modreq Kernel module load request {2.6}
+ 130 = /dev/watchdog Watchdog timer port
+ 131 = /dev/temperature Machine internal temperature
+ 132 = /dev/hwtrap Hardware fault trap
+ 133 = /dev/exttrp External device trap
+ 134 = /dev/apm_bios Advanced Power Management BIOS
+ 135 = /dev/rtc Real Time Clock
+ 139 = /dev/openprom SPARC OpenBoot PROM
+ 140 = /dev/relay8 Berkshire Products Octal relay card
+ 141 = /dev/relay16 Berkshire Products ISO-16 relay card
+ 142 = /dev/msr x86 model-specific registers {2.6}
+ 143 = /dev/pciconf PCI configuration space
+ 144 = /dev/nvram Non-volatile configuration RAM
+ 145 = /dev/hfmodem Soundcard shortwave modem control {2.6}
+ 146 = /dev/graphics Linux/SGI graphics device
+ 147 = /dev/opengl Linux/SGI OpenGL pipe
+ 148 = /dev/gfx Linux/SGI graphics effects device
+ 149 = /dev/input/mouse Linux/SGI Irix emulation mouse
+ 150 = /dev/input/keyboard Linux/SGI Irix emulation keyboard
+ 151 = /dev/led Front panel LEDs
+ 152 = /dev/kpoll Kernel Poll Driver
+ 153 = /dev/mergemem Memory merge device
+ 154 = /dev/pmu Macintosh PowerBook power manager
+ 155 = /dev/isictl MultiTech ISICom serial control
+ 156 = /dev/lcd Front panel LCD display
+ 157 = /dev/ac Applicom Intl Profibus card
+ 158 = /dev/nwbutton Netwinder external button
+ 159 = /dev/nwdebug Netwinder debug interface
+ 160 = /dev/nwflash Netwinder flash memory
+ 161 = /dev/userdma User-space DMA access
+ 162 = /dev/smbus System Management Bus
+ 163 = /dev/lik Logitech Internet Keyboard
+ 164 = /dev/ipmo Intel Intelligent Platform Management
+ 165 = /dev/vmmon VMWare virtual machine monitor
+ 166 = /dev/i2o/ctl I2O configuration manager
+ 167 = /dev/specialix_sxctl Specialix serial control
+ 168 = /dev/tcldrv Technology Concepts serial control
+ 169 = /dev/specialix_rioctl Specialix RIO serial control
+ 170 = /dev/thinkpad/thinkpad IBM Thinkpad devices
+ 171 = /dev/srripc QNX4 API IPC manager
+ 172 = /dev/usemaclone Semaphore clone device
+ 173 = /dev/ipmikcs Intelligent Platform Management
+ 174 = /dev/uctrl SPARCbook 3 microcontroller
+ 175 = /dev/agpgart AGP Graphics Address Remapping Table
+ 176 = /dev/gtrsc Gorgy Timing radio clock
+ 177 = /dev/cbm Serial CBM bus
+ 178 = /dev/jsflash JavaStation OS flash SIMM
+ 179 = /dev/xsvc High-speed shared-mem/semaphore service
+ 180 = /dev/vrbuttons Vr41xx button input device
+ 181 = /dev/toshiba Toshiba laptop SMM support
+ 182 = /dev/perfctr Performance-monitoring counters
+ 183 = /dev/hwrng Generic random number generator
+ 184 = /dev/cpu/microcode CPU microcode update interface
+ 186 = /dev/atomicps Atomic shapshot of process state data
+ 187 = /dev/irnet IrNET device
+ 188 = /dev/smbusbios SMBus BIOS
+ 189 = /dev/ussp_ctl User space serial port control
+ 190 = /dev/crash Mission Critical Linux crash dump facility
+ 191 = /dev/pcl181 <information missing>
+ 192 = /dev/nas_xbus NAS xbus LCD/buttons access
+ 193 = /dev/d7s SPARC 7-segment display
+ 194 = /dev/zkshim Zero-Knowledge network shim control
+ 195 = /dev/elographics/e2201 Elographics touchscreen E271-2201
+ 198 = /dev/sexec Signed executable interface
+ 199 = /dev/scanners/cuecat :CueCat barcode scanner
+ 200 = /dev/net/tun TAP/TUN network device
+ 201 = /dev/button/gulpb Transmeta GULP-B buttons
+ 202 = /dev/emd/ctl Enhanced Metadisk RAID (EMD) control
+ 204 = /dev/video/em8300 EM8300 DVD decoder control
+ 205 = /dev/video/em8300_mv EM8300 DVD decoder video
+ 206 = /dev/video/em8300_ma EM8300 DVD decoder audio
+ 207 = /dev/video/em8300_sp EM8300 DVD decoder subpicture
+ 208 = /dev/compaq/cpqphpc Compaq PCI Hot Plug Controller
+ 209 = /dev/compaq/cpqrid Compaq Remote Insight Driver
+ 210 = /dev/impi/bt IMPI coprocessor block transfer
+ 211 = /dev/impi/smic IMPI coprocessor stream interface
+ 212 = /dev/watchdogs/0 First watchdog device
+ 213 = /dev/watchdogs/1 Second watchdog device
+ 214 = /dev/watchdogs/2 Third watchdog device
+ 215 = /dev/watchdogs/3 Fourth watchdog device
+ 216 = /dev/fujitsu/apanel Fujitsu/Siemens application panel
+ 217 = /dev/ni/natmotn National Instruments Motion
+ 218 = /dev/kchuid Inter-process chuid control
+ 219 = /dev/modems/mwave MWave modem firmware upload
+ 220 = /dev/mptctl Message passing technology (MPT) control
+ 221 = /dev/mvista/hssdsi Montavista PICMG hot swap system driver
+ 222 = /dev/mvista/hasi Montavista PICMG high availability
+ 223 = /dev/input/uinput User level driver support for input
+ 224 = /dev/tpm TCPA TPM driver
+ 225 = /dev/pps Pulse Per Second driver
+ 226 = /dev/systrace Systrace device
+ 227 = /dev/mcelog X86_64 Machine Check Exception driver
+ 228 = /dev/hpet HPET driver
+ 229 = /dev/fuse Fuse (virtual filesystem in user-space)
+ 230 = /dev/midishare MidiShare driver
+ 240-254 Reserved for local use
+ 255 Reserved for MISC_DYNAMIC_MINOR
+
+ 11 char Raw keyboard device (Linux/SPARC only)
+ 0 = /dev/kbd Raw keyboard device
+
+ 11 char Serial Mux device (Linux/PA-RISC only)
+ 0 = /dev/ttyB0 First mux port
+ 1 = /dev/ttyB1 Second mux port
+ ...
+
+ 11 block SCSI CD-ROM devices
+ 0 = /dev/scd0 First SCSI CD-ROM
+ 1 = /dev/scd1 Second SCSI CD-ROM
+ ...
+
+ The prefix /dev/sr (instead of /dev/scd) has been deprecated.
+
+ 12 char QIC-02 tape
+ 2 = /dev/ntpqic11 QIC-11, no rewind-on-close
+ 3 = /dev/tpqic11 QIC-11, rewind-on-close
+ 4 = /dev/ntpqic24 QIC-24, no rewind-on-close
+ 5 = /dev/tpqic24 QIC-24, rewind-on-close
+ 6 = /dev/ntpqic120 QIC-120, no rewind-on-close
+ 7 = /dev/tpqic120 QIC-120, rewind-on-close
+ 8 = /dev/ntpqic150 QIC-150, no rewind-on-close
+ 9 = /dev/tpqic150 QIC-150, rewind-on-close
+
+ The device names specified are proposed -- if there
+ are "standard" names for these devices, please let me know.
+
+ 12 block MSCDEX CD-ROM callback support {2.6}
+ 0 = /dev/dos_cd0 First MSCDEX CD-ROM
+ 1 = /dev/dos_cd1 Second MSCDEX CD-ROM
+ ...
+
+ 13 char Input core
+ 0 = /dev/input/js0 First joystick
+ 1 = /dev/input/js1 Second joystick
+ ...
+ 32 = /dev/input/mouse0 First mouse
+ 33 = /dev/input/mouse1 Second mouse
+ ...
+ 63 = /dev/input/mice Unified mouse
+ 64 = /dev/input/event0 First event queue
+ 65 = /dev/input/event1 Second event queue
+ ...
+
+ Each device type has 5 bits (32 minors).
+
+ 13 block 8-bit MFM/RLL/IDE controller
+ 0 = /dev/xda First XT disk whole disk
+ 64 = /dev/xdb Second XT disk whole disk
+
+ Partitions are handled in the same way as IDE disks
+ (see major number 3).
+
+ 14 char Open Sound System (OSS)
+ 0 = /dev/mixer Mixer control
+ 1 = /dev/sequencer Audio sequencer
+ 2 = /dev/midi00 First MIDI port
+ 3 = /dev/dsp Digital audio
+ 4 = /dev/audio Sun-compatible digital audio
+ 6 = /dev/sndstat Sound card status information {2.6}
+ 7 = /dev/audioctl SPARC audio control device
+ 8 = /dev/sequencer2 Sequencer -- alternate device
+ 16 = /dev/mixer1 Second soundcard mixer control
+ 17 = /dev/patmgr0 Sequencer patch manager
+ 18 = /dev/midi01 Second MIDI port
+ 19 = /dev/dsp1 Second soundcard digital audio
+ 20 = /dev/audio1 Second soundcard Sun digital audio
+ 33 = /dev/patmgr1 Sequencer patch manager
+ 34 = /dev/midi02 Third MIDI port
+ 50 = /dev/midi03 Fourth MIDI port
+
+ 14 block BIOS harddrive callback support {2.6}
+ 0 = /dev/dos_hda First BIOS harddrive whole disk
+ 64 = /dev/dos_hdb Second BIOS harddrive whole disk
+ 128 = /dev/dos_hdc Third BIOS harddrive whole disk
+ 192 = /dev/dos_hdd Fourth BIOS harddrive whole disk
+
+ Partitions are handled in the same way as IDE disks
+ (see major number 3).
+
+ 15 char Joystick
+ 0 = /dev/js0 First analog joystick
+ 1 = /dev/js1 Second analog joystick
+ ...
+ 128 = /dev/djs0 First digital joystick
+ 129 = /dev/djs1 Second digital joystick
+ ...
+ 15 block Sony CDU-31A/CDU-33A CD-ROM
+ 0 = /dev/sonycd Sony CDU-31a CD-ROM
+
+ 16 char Non-SCSI scanners
+ 0 = /dev/gs4500 Genius 4500 handheld scanner
+
+ 16 block GoldStar CD-ROM
+ 0 = /dev/gscd GoldStar CD-ROM
+
+ 17 char Chase serial card
+ 0 = /dev/ttyH0 First Chase port
+ 1 = /dev/ttyH1 Second Chase port
+ ...
+ 17 block Optics Storage CD-ROM
+ 0 = /dev/optcd Optics Storage CD-ROM
+
+ 18 char Chase serial card - alternate devices
+ 0 = /dev/cuh0 Callout device for ttyH0
+ 1 = /dev/cuh1 Callout device for ttyH1
+ ...
+ 18 block Sanyo CD-ROM
+ 0 = /dev/sjcd Sanyo CD-ROM
+
+ 19 char Cyclades serial card
+ 0 = /dev/ttyC0 First Cyclades port
+ ...
+ 31 = /dev/ttyC31 32nd Cyclades port
+
+ 19 block "Double" compressed disk
+ 0 = /dev/double0 First compressed disk
+ ...
+ 7 = /dev/double7 Eighth compressed disk
+ 128 = /dev/cdouble0 Mirror of first compressed disk
+ ...
+ 135 = /dev/cdouble7 Mirror of eighth compressed disk
+
+ See the Double documentation for the meaning of the
+ mirror devices.
+
+ 20 char Cyclades serial card - alternate devices
+ 0 = /dev/cub0 Callout device for ttyC0
+ ...
+ 31 = /dev/cub31 Callout device for ttyC31
+
+ 20 block Hitachi CD-ROM (under development)
+ 0 = /dev/hitcd Hitachi CD-ROM
+
+ 21 char Generic SCSI access
+ 0 = /dev/sg0 First generic SCSI device
+ 1 = /dev/sg1 Second generic SCSI device
+ ...
+
+ Most distributions name these /dev/sga, /dev/sgb...;
+ this sets an unnecessary limit of 26 SCSI devices in
+ the system and is counter to standard Linux
+ device-naming practice.
+
+ 21 block Acorn MFM hard drive interface
+ 0 = /dev/mfma First MFM drive whole disk
+ 64 = /dev/mfmb Second MFM drive whole disk
+
+ This device is used on the ARM-based Acorn RiscPC.
+ Partitions are handled the same way as for IDE disks
+ (see major number 3).
+
+ 22 char Digiboard serial card
+ 0 = /dev/ttyD0 First Digiboard port
+ 1 = /dev/ttyD1 Second Digiboard port
+ ...
+ 22 block Second IDE hard disk/CD-ROM interface
+ 0 = /dev/hdc Master: whole disk (or CD-ROM)
+ 64 = /dev/hdd Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 23 char Digiboard serial card - alternate devices
+ 0 = /dev/cud0 Callout device for ttyD0
+ 1 = /dev/cud1 Callout device for ttyD1
+ ...
+ 23 block Mitsumi proprietary CD-ROM
+ 0 = /dev/mcd Mitsumi CD-ROM
+
+ 24 char Stallion serial card
+ 0 = /dev/ttyE0 Stallion port 0 card 0
+ 1 = /dev/ttyE1 Stallion port 1 card 0
+ ...
+ 64 = /dev/ttyE64 Stallion port 0 card 1
+ 65 = /dev/ttyE65 Stallion port 1 card 1
+ ...
+ 128 = /dev/ttyE128 Stallion port 0 card 2
+ 129 = /dev/ttyE129 Stallion port 1 card 2
+ ...
+ 192 = /dev/ttyE192 Stallion port 0 card 3
+ 193 = /dev/ttyE193 Stallion port 1 card 3
+ ...
+ 24 block Sony CDU-535 CD-ROM
+ 0 = /dev/cdu535 Sony CDU-535 CD-ROM
+
+ 25 char Stallion serial card - alternate devices
+ 0 = /dev/cue0 Callout device for ttyE0
+ 1 = /dev/cue1 Callout device for ttyE1
+ ...
+ 64 = /dev/cue64 Callout device for ttyE64
+ 65 = /dev/cue65 Callout device for ttyE65
+ ...
+ 128 = /dev/cue128 Callout device for ttyE128
+ 129 = /dev/cue129 Callout device for ttyE129
+ ...
+ 192 = /dev/cue192 Callout device for ttyE192
+ 193 = /dev/cue193 Callout device for ttyE193
+ ...
+ 25 block First Matsushita (Panasonic/SoundBlaster) CD-ROM
+ 0 = /dev/sbpcd0 Panasonic CD-ROM controller 0 unit 0
+ 1 = /dev/sbpcd1 Panasonic CD-ROM controller 0 unit 1
+ 2 = /dev/sbpcd2 Panasonic CD-ROM controller 0 unit 2
+ 3 = /dev/sbpcd3 Panasonic CD-ROM controller 0 unit 3
+
+ 26 char Quanta WinVision frame grabber {2.6}
+ 0 = /dev/wvisfgrab Quanta WinVision frame grabber
+
+ 26 block Second Matsushita (Panasonic/SoundBlaster) CD-ROM
+ 0 = /dev/sbpcd4 Panasonic CD-ROM controller 1 unit 0
+ 1 = /dev/sbpcd5 Panasonic CD-ROM controller 1 unit 1
+ 2 = /dev/sbpcd6 Panasonic CD-ROM controller 1 unit 2
+ 3 = /dev/sbpcd7 Panasonic CD-ROM controller 1 unit 3
+
+ 27 char QIC-117 tape
+ 0 = /dev/qft0 Unit 0, rewind-on-close
+ 1 = /dev/qft1 Unit 1, rewind-on-close
+ 2 = /dev/qft2 Unit 2, rewind-on-close
+ 3 = /dev/qft3 Unit 3, rewind-on-close
+ 4 = /dev/nqft0 Unit 0, no rewind-on-close
+ 5 = /dev/nqft1 Unit 1, no rewind-on-close
+ 6 = /dev/nqft2 Unit 2, no rewind-on-close
+ 7 = /dev/nqft3 Unit 3, no rewind-on-close
+ 16 = /dev/zqft0 Unit 0, rewind-on-close, compression
+ 17 = /dev/zqft1 Unit 1, rewind-on-close, compression
+ 18 = /dev/zqft2 Unit 2, rewind-on-close, compression
+ 19 = /dev/zqft3 Unit 3, rewind-on-close, compression
+ 20 = /dev/nzqft0 Unit 0, no rewind-on-close, compression
+ 21 = /dev/nzqft1 Unit 1, no rewind-on-close, compression
+ 22 = /dev/nzqft2 Unit 2, no rewind-on-close, compression
+ 23 = /dev/nzqft3 Unit 3, no rewind-on-close, compression
+ 32 = /dev/rawqft0 Unit 0, rewind-on-close, no file marks
+ 33 = /dev/rawqft1 Unit 1, rewind-on-close, no file marks
+ 34 = /dev/rawqft2 Unit 2, rewind-on-close, no file marks
+ 35 = /dev/rawqft3 Unit 3, rewind-on-close, no file marks
+ 36 = /dev/nrawqft0 Unit 0, no rewind-on-close, no file marks
+ 37 = /dev/nrawqft1 Unit 1, no rewind-on-close, no file marks
+ 38 = /dev/nrawqft2 Unit 2, no rewind-on-close, no file marks
+ 39 = /dev/nrawqft3 Unit 3, no rewind-on-close, no file marks
+
+ 27 block Third Matsushita (Panasonic/SoundBlaster) CD-ROM
+ 0 = /dev/sbpcd8 Panasonic CD-ROM controller 2 unit 0
+ 1 = /dev/sbpcd9 Panasonic CD-ROM controller 2 unit 1
+ 2 = /dev/sbpcd10 Panasonic CD-ROM controller 2 unit 2
+ 3 = /dev/sbpcd11 Panasonic CD-ROM controller 2 unit 3
+
+ 28 char Stallion serial card - card programming
+ 0 = /dev/staliomem0 First Stallion card I/O memory
+ 1 = /dev/staliomem1 Second Stallion card I/O memory
+ 2 = /dev/staliomem2 Third Stallion card I/O memory
+ 3 = /dev/staliomem3 Fourth Stallion card I/O memory
+
+ 28 char Atari SLM ACSI laser printer (68k/Atari)
+ 0 = /dev/slm0 First SLM laser printer
+ 1 = /dev/slm1 Second SLM laser printer
+ ...
+ 28 block Fourth Matsushita (Panasonic/SoundBlaster) CD-ROM
+ 0 = /dev/sbpcd12 Panasonic CD-ROM controller 3 unit 0
+ 1 = /dev/sbpcd13 Panasonic CD-ROM controller 3 unit 1
+ 2 = /dev/sbpcd14 Panasonic CD-ROM controller 3 unit 2
+ 3 = /dev/sbpcd15 Panasonic CD-ROM controller 3 unit 3
+
+ 28 block ACSI disk (68k/Atari)
+ 0 = /dev/ada First ACSI disk whole disk
+ 16 = /dev/adb Second ACSI disk whole disk
+ 32 = /dev/adc Third ACSI disk whole disk
+ ...
+ 240 = /dev/adp 16th ACSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15, like SCSI.
+
+ 29 char Universal frame buffer
+ 0 = /dev/fb0 First frame buffer
+ 1 = /dev/fb1 Second frame buffer
+ ...
+ 31 = /dev/fb31 32nd frame buffer
+
+ 29 block Aztech/Orchid/Okano/Wearnes CD-ROM
+ 0 = /dev/aztcd Aztech CD-ROM
+
+ 30 char iBCS-2 compatibility devices
+ 0 = /dev/socksys Socket access
+ 1 = /dev/spx SVR3 local X interface
+ 32 = /dev/inet/ip Network access
+ 33 = /dev/inet/icmp
+ 34 = /dev/inet/ggp
+ 35 = /dev/inet/ipip
+ 36 = /dev/inet/tcp
+ 37 = /dev/inet/egp
+ 38 = /dev/inet/pup
+ 39 = /dev/inet/udp
+ 40 = /dev/inet/idp
+ 41 = /dev/inet/rawip
+
+ Additionally, iBCS-2 requires the following links:
+
+ /dev/ip -> /dev/inet/ip
+ /dev/icmp -> /dev/inet/icmp
+ /dev/ggp -> /dev/inet/ggp
+ /dev/ipip -> /dev/inet/ipip
+ /dev/tcp -> /dev/inet/tcp
+ /dev/egp -> /dev/inet/egp
+ /dev/pup -> /dev/inet/pup
+ /dev/udp -> /dev/inet/udp
+ /dev/idp -> /dev/inet/idp
+ /dev/rawip -> /dev/inet/rawip
+ /dev/inet/arp -> /dev/inet/udp
+ /dev/inet/rip -> /dev/inet/udp
+ /dev/nfsd -> /dev/socksys
+ /dev/X0R -> /dev/null (? apparently not required ?)
+
+ 30 block Philips LMS CM-205 CD-ROM
+ 0 = /dev/cm205cd Philips LMS CM-205 CD-ROM
+
+ /dev/lmscd is an older name for this device. This
+ driver does not work with the CM-205MS CD-ROM.
+
+ 31 char MPU-401 MIDI
+ 0 = /dev/mpu401data MPU-401 data port
+ 1 = /dev/mpu401stat MPU-401 status port
+
+ 31 block ROM/flash memory card
+ 0 = /dev/rom0 First ROM card (rw)
+ ...
+ 7 = /dev/rom7 Eighth ROM card (rw)
+ 8 = /dev/rrom0 First ROM card (ro)
+ ...
+ 15 = /dev/rrom7 Eighth ROM card (ro)
+ 16 = /dev/flash0 First flash memory card (rw)
+ ...
+ 23 = /dev/flash7 Eighth flash memory card (rw)
+ 24 = /dev/rflash0 First flash memory card (ro)
+ ...
+ 31 = /dev/rflash7 Eighth flash memory card (ro)
+
+ The read-write (rw) devices support back-caching
+ written data in RAM, as well as writing to flash RAM
+ devices. The read-only devices (ro) support reading
+ only.
+
+ 32 char Specialix serial card
+ 0 = /dev/ttyX0 First Specialix port
+ 1 = /dev/ttyX1 Second Specialix port
+ ...
+ 32 block Philips LMS CM-206 CD-ROM
+ 0 = /dev/cm206cd Philips LMS CM-206 CD-ROM
+
+ 33 char Specialix serial card - alternate devices
+ 0 = /dev/cux0 Callout device for ttyX0
+ 1 = /dev/cux1 Callout device for ttyX1
+ ...
+ 33 block Third IDE hard disk/CD-ROM interface
+ 0 = /dev/hde Master: whole disk (or CD-ROM)
+ 64 = /dev/hdf Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 34 char Z8530 HDLC driver
+ 0 = /dev/scc0 First Z8530, first port
+ 1 = /dev/scc1 First Z8530, second port
+ 2 = /dev/scc2 Second Z8530, first port
+ 3 = /dev/scc3 Second Z8530, second port
+ ...
+
+ In a previous version these devices were named
+ /dev/sc1 for /dev/scc0, /dev/sc2 for /dev/scc1, and so
+ on.
+
+ 34 block Fourth IDE hard disk/CD-ROM interface
+ 0 = /dev/hdg Master: whole disk (or CD-ROM)
+ 64 = /dev/hdh Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 35 char tclmidi MIDI driver
+ 0 = /dev/midi0 First MIDI port, kernel timed
+ 1 = /dev/midi1 Second MIDI port, kernel timed
+ 2 = /dev/midi2 Third MIDI port, kernel timed
+ 3 = /dev/midi3 Fourth MIDI port, kernel timed
+ 64 = /dev/rmidi0 First MIDI port, untimed
+ 65 = /dev/rmidi1 Second MIDI port, untimed
+ 66 = /dev/rmidi2 Third MIDI port, untimed
+ 67 = /dev/rmidi3 Fourth MIDI port, untimed
+ 128 = /dev/smpte0 First MIDI port, SMPTE timed
+ 129 = /dev/smpte1 Second MIDI port, SMPTE timed
+ 130 = /dev/smpte2 Third MIDI port, SMPTE timed
+ 131 = /dev/smpte3 Fourth MIDI port, SMPTE timed
+
+ 35 block Slow memory ramdisk
+ 0 = /dev/slram Slow memory ramdisk
+
+ 36 char Netlink support
+ 0 = /dev/route Routing, device updates, kernel to user
+ 1 = /dev/skip enSKIP security cache control
+ 3 = /dev/fwmonitor Firewall packet copies
+ 16 = /dev/tap0 First Ethertap device
+ ...
+ 31 = /dev/tap15 16th Ethertap device
+
+ 36 block MCA ESDI hard disk
+ 0 = /dev/eda First ESDI disk whole disk
+ 64 = /dev/edb Second ESDI disk whole disk
+ ...
+
+ Partitions are handled in the same way as IDE disks
+ (see major number 3).
+
+ 37 char IDE tape
+ 0 = /dev/ht0 First IDE tape
+ 1 = /dev/ht1 Second IDE tape
+ ...
+ 128 = /dev/nht0 First IDE tape, no rewind-on-close
+ 129 = /dev/nht1 Second IDE tape, no rewind-on-close
+ ...
+
+ Currently, only one IDE tape drive is supported.
+
+ 37 block Zorro II ramdisk
+ 0 = /dev/z2ram Zorro II ramdisk
+
+ 38 char Myricom PCI Myrinet board
+ 0 = /dev/mlanai0 First Myrinet board
+ 1 = /dev/mlanai1 Second Myrinet board
+ ...
+
+ This device is used for status query, board control
+ and "user level packet I/O." This board is also
+ accessible as a standard networking "eth" device.
+
+ 38 block Reserved for Linux/AP+
+
+ 39 char ML-16P experimental I/O board
+ 0 = /dev/ml16pa-a0 First card, first analog channel
+ 1 = /dev/ml16pa-a1 First card, second analog channel
+ ...
+ 15 = /dev/ml16pa-a15 First card, 16th analog channel
+ 16 = /dev/ml16pa-d First card, digital lines
+ 17 = /dev/ml16pa-c0 First card, first counter/timer
+ 18 = /dev/ml16pa-c1 First card, second counter/timer
+ 19 = /dev/ml16pa-c2 First card, third counter/timer
+ 32 = /dev/ml16pb-a0 Second card, first analog channel
+ 33 = /dev/ml16pb-a1 Second card, second analog channel
+ ...
+ 47 = /dev/ml16pb-a15 Second card, 16th analog channel
+ 48 = /dev/ml16pb-d Second card, digital lines
+ 49 = /dev/ml16pb-c0 Second card, first counter/timer
+ 50 = /dev/ml16pb-c1 Second card, second counter/timer
+ 51 = /dev/ml16pb-c2 Second card, third counter/timer
+ ...
+ 39 block Reserved for Linux/AP+
+
+ 40 char Matrox Meteor frame grabber {2.6}
+ 0 = /dev/mmetfgrab Matrox Meteor frame grabber
+
+ 40 block Syquest EZ135 parallel port removable drive
+ 0 = /dev/eza Parallel EZ135 drive, whole disk
+
+ This device is obsolete and will be removed in a
+ future version of Linux. It has been replaced with
+ the parallel port IDE disk driver at major number 45.
+ Partitions are handled in the same way as IDE disks
+ (see major number 3).
+
+ 41 char Yet Another Micro Monitor
+ 0 = /dev/yamm Yet Another Micro Monitor
+
+ 41 block MicroSolutions BackPack parallel port CD-ROM
+ 0 = /dev/bpcd BackPack CD-ROM
+
+ This device is obsolete and will be removed in a
+ future version of Linux. It has been replaced with
+ the parallel port ATAPI CD-ROM driver at major number 46.
+
+ 42 char Demo/sample use
+
+ 42 block Demo/sample use
+
+ This number is intended for use in sample code, as
+ well as a general "example" device number. It
+ should never be used for a device driver that is being
+ distributed; either obtain an official number or use
+ the local/experimental range. The sudden addition or
+ removal of a driver with this number should not cause
+ ill effects to the system (bugs excepted.)
+
+ IN PARTICULAR, ANY DISTRIBUTION WHICH CONTAINS A
+ DEVICE DRIVER USING MAJOR NUMBER 42 IS NONCOMPLIANT.
+
+ 43 char isdn4linux virtual modem
+ 0 = /dev/ttyI0 First virtual modem
+ ...
+ 63 = /dev/ttyI63 64th virtual modem
+
+ 43 block Network block devices
+ 0 = /dev/nb0 First network block device
+ 1 = /dev/nb1 Second network block device
+ ...
+
+ Network Block Device is somehow similar to loopback
+ devices: If you read from it, it sends packet across
+ network asking server for data. If you write to it, it
+ sends packet telling server to write. It could be used
+ to mounting filesystems over the net, swapping over
+ the net, implementing block device in userland etc.
+
+ 44 char isdn4linux virtual modem - alternate devices
+ 0 = /dev/cui0 Callout device for ttyI0
+ ...
+ 63 = /dev/cui63 Callout device for ttyI63
+
+ 44 block Flash Translation Layer (FTL) filesystems
+ 0 = /dev/ftla FTL on first Memory Technology Device
+ 16 = /dev/ftlb FTL on second Memory Technology Device
+ 32 = /dev/ftlc FTL on third Memory Technology Device
+ ...
+ 240 = /dev/ftlp FTL on 16th Memory Technology Device
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the partition
+ limit is 15 rather than 63 per disk (same as SCSI.)
+
+ 45 char isdn4linux ISDN BRI driver
+ 0 = /dev/isdn0 First virtual B channel raw data
+ ...
+ 63 = /dev/isdn63 64th virtual B channel raw data
+ 64 = /dev/isdnctrl0 First channel control/debug
+ ...
+ 127 = /dev/isdnctrl63 64th channel control/debug
+
+ 128 = /dev/ippp0 First SyncPPP device
+ ...
+ 191 = /dev/ippp63 64th SyncPPP device
+
+ 255 = /dev/isdninfo ISDN monitor interface
+
+ 45 block Parallel port IDE disk devices
+ 0 = /dev/pda First parallel port IDE disk
+ 16 = /dev/pdb Second parallel port IDE disk
+ 32 = /dev/pdc Third parallel port IDE disk
+ 48 = /dev/pdd Fourth parallel port IDE disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the partition
+ limit is 15 rather than 63 per disk.
+
+ 46 char Comtrol Rocketport serial card
+ 0 = /dev/ttyR0 First Rocketport port
+ 1 = /dev/ttyR1 Second Rocketport port
+ ...
+ 46 block Parallel port ATAPI CD-ROM devices
+ 0 = /dev/pcd0 First parallel port ATAPI CD-ROM
+ 1 = /dev/pcd1 Second parallel port ATAPI CD-ROM
+ 2 = /dev/pcd2 Third parallel port ATAPI CD-ROM
+ 3 = /dev/pcd3 Fourth parallel port ATAPI CD-ROM
+
+ 47 char Comtrol Rocketport serial card - alternate devices
+ 0 = /dev/cur0 Callout device for ttyR0
+ 1 = /dev/cur1 Callout device for ttyR1
+ ...
+ 47 block Parallel port ATAPI disk devices
+ 0 = /dev/pf0 First parallel port ATAPI disk
+ 1 = /dev/pf1 Second parallel port ATAPI disk
+ 2 = /dev/pf2 Third parallel port ATAPI disk
+ 3 = /dev/pf3 Fourth parallel port ATAPI disk
+
+ This driver is intended for floppy disks and similar
+ devices and hence does not support partitioning.
+
+ 48 char SDL RISCom serial card
+ 0 = /dev/ttyL0 First RISCom port
+ 1 = /dev/ttyL1 Second RISCom port
+ ...
+ 48 block Mylex DAC960 PCI RAID controller; first controller
+ 0 = /dev/rd/c0d0 First disk, whole disk
+ 8 = /dev/rd/c0d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c0d31 32nd disk, whole disk
+
+ For partitions add:
+ 0 = /dev/rd/c?d? Whole disk
+ 1 = /dev/rd/c?d?p1 First partition
+ ...
+ 7 = /dev/rd/c?d?p7 Seventh partition
+
+ 49 char SDL RISCom serial card - alternate devices
+ 0 = /dev/cul0 Callout device for ttyL0
+ 1 = /dev/cul1 Callout device for ttyL1
+ ...
+ 49 block Mylex DAC960 PCI RAID controller; second controller
+ 0 = /dev/rd/c1d0 First disk, whole disk
+ 8 = /dev/rd/c1d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c1d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+ 50 char Reserved for GLINT
+
+ 50 block Mylex DAC960 PCI RAID controller; third controller
+ 0 = /dev/rd/c2d0 First disk, whole disk
+ 8 = /dev/rd/c2d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c2d31 32nd disk, whole disk
+
+ 51 char Baycom radio modem OR Radio Tech BIM-XXX-RS232 radio modem
+ 0 = /dev/bc0 First Baycom radio modem
+ 1 = /dev/bc1 Second Baycom radio modem
+ ...
+ 51 block Mylex DAC960 PCI RAID controller; fourth controller
+ 0 = /dev/rd/c3d0 First disk, whole disk
+ 8 = /dev/rd/c3d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c3d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+ 52 char Spellcaster DataComm/BRI ISDN card
+ 0 = /dev/dcbri0 First DataComm card
+ 1 = /dev/dcbri1 Second DataComm card
+ 2 = /dev/dcbri2 Third DataComm card
+ 3 = /dev/dcbri3 Fourth DataComm card
+
+ 52 block Mylex DAC960 PCI RAID controller; fifth controller
+ 0 = /dev/rd/c4d0 First disk, whole disk
+ 8 = /dev/rd/c4d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c4d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+ 53 char BDM interface for remote debugging MC683xx microcontrollers
+ 0 = /dev/pd_bdm0 PD BDM interface on lp0
+ 1 = /dev/pd_bdm1 PD BDM interface on lp1
+ 2 = /dev/pd_bdm2 PD BDM interface on lp2
+ 4 = /dev/icd_bdm0 ICD BDM interface on lp0
+ 5 = /dev/icd_bdm1 ICD BDM interface on lp1
+ 6 = /dev/icd_bdm2 ICD BDM interface on lp2
+
+ This device is used for the interfacing to the MC683xx
+ microcontrollers via Background Debug Mode by use of a
+ Parallel Port interface. PD is the Motorola Public
+ Domain Interface and ICD is the commercial interface
+ by P&E.
+
+ 53 block Mylex DAC960 PCI RAID controller; sixth controller
+ 0 = /dev/rd/c5d0 First disk, whole disk
+ 8 = /dev/rd/c5d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c5d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+ 54 char Electrocardiognosis Holter serial card
+ 0 = /dev/holter0 First Holter port
+ 1 = /dev/holter1 Second Holter port
+ 2 = /dev/holter2 Third Holter port
+
+ A custom serial card used by Electrocardiognosis SRL
+ <mseritan@ottonel.pub.ro> to transfer data from Holter
+ 24-hour heart monitoring equipment.
+
+ 54 block Mylex DAC960 PCI RAID controller; seventh controller
+ 0 = /dev/rd/c6d0 First disk, whole disk
+ 8 = /dev/rd/c6d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c6d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+ 55 char DSP56001 digital signal processor
+ 0 = /dev/dsp56k First DSP56001
+
+ 55 block Mylex DAC960 PCI RAID controller; eighth controller
+ 0 = /dev/rd/c7d0 First disk, whole disk
+ 8 = /dev/rd/c7d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c7d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+ 56 char Apple Desktop Bus
+ 0 = /dev/adb ADB bus control
+
+ Additional devices will be added to this number, all
+ starting with /dev/adb.
+
+ 56 block Fifth IDE hard disk/CD-ROM interface
+ 0 = /dev/hdi Master: whole disk (or CD-ROM)
+ 64 = /dev/hdj Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 57 char Hayes ESP serial card
+ 0 = /dev/ttyP0 First ESP port
+ 1 = /dev/ttyP1 Second ESP port
+ ...
+
+ 57 block Sixth IDE hard disk/CD-ROM interface
+ 0 = /dev/hdk Master: whole disk (or CD-ROM)
+ 64 = /dev/hdl Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 58 char Hayes ESP serial card - alternate devices
+ 0 = /dev/cup0 Callout device for ttyP0
+ 1 = /dev/cup1 Callout device for ttyP1
+ ...
+
+ 58 block Reserved for logical volume manager
+
+ 59 char sf firewall package
+ 0 = /dev/firewall Communication with sf kernel module
+
+ 59 block Generic PDA filesystem device
+ 0 = /dev/pda0 First PDA device
+ 1 = /dev/pda1 Second PDA device
+ ...
+
+ The pda devices are used to mount filesystems on
+ remote pda's (basically slow handheld machines with
+ proprietary OS's and limited memory and storage
+ running small fs translation drivers) through serial /
+ IRDA / parallel links.
+
+ NAMING CONFLICT -- PROPOSED REVISED NAME /dev/rpda0 etc
+
+ 60-63 char LOCAL/EXPERIMENTAL USE
+
+ 60-63 block LOCAL/EXPERIMENTAL USE
+ Allocated for local/experimental use. For devices not
+ assigned official numbers, these ranges should be
+ used in order to avoid conflicting with future assignments.
+
+ 64 char ENskip kernel encryption package
+ 0 = /dev/enskip Communication with ENskip kernel module
+
+ 64 block Scramdisk/DriveCrypt encrypted devices
+ 0 = /dev/scramdisk/master Master node for ioctls
+ 1 = /dev/scramdisk/1 First encrypted device
+ 2 = /dev/scramdisk/2 Second encrypted device
+ ...
+ 255 = /dev/scramdisk/255 255th encrypted device
+
+ The filename of the encrypted container and the passwords
+ are sent via ioctls (using the sdmount tool) to the master
+ node which then activates them via one of the
+ /dev/scramdisk/x nodes for loop mounting (all handled
+ through the sdmount tool).
+
+ Requested by: andy@scramdisklinux.org
+
+ 65 char Sundance "plink" Transputer boards (obsolete, unused)
+ 0 = /dev/plink0 First plink device
+ 1 = /dev/plink1 Second plink device
+ 2 = /dev/plink2 Third plink device
+ 3 = /dev/plink3 Fourth plink device
+ 64 = /dev/rplink0 First plink device, raw
+ 65 = /dev/rplink1 Second plink device, raw
+ 66 = /dev/rplink2 Third plink device, raw
+ 67 = /dev/rplink3 Fourth plink device, raw
+ 128 = /dev/plink0d First plink device, debug
+ 129 = /dev/plink1d Second plink device, debug
+ 130 = /dev/plink2d Third plink device, debug
+ 131 = /dev/plink3d Fourth plink device, debug
+ 192 = /dev/rplink0d First plink device, raw, debug
+ 193 = /dev/rplink1d Second plink device, raw, debug
+ 194 = /dev/rplink2d Third plink device, raw, debug
+ 195 = /dev/rplink3d Fourth plink device, raw, debug
+
+ This is a commercial driver; contact James Howes
+ <jth@prosig.demon.co.uk> for information.
+
+ 65 block SCSI disk devices (16-31)
+ 0 = /dev/sdq 17th SCSI disk whole disk
+ 16 = /dev/sdr 18th SCSI disk whole disk
+ 32 = /dev/sds 19th SCSI disk whole disk
+ ...
+ 240 = /dev/sdaf 32nd SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 66 char YARC PowerPC PCI coprocessor card
+ 0 = /dev/yppcpci0 First YARC card
+ 1 = /dev/yppcpci1 Second YARC card
+ ...
+
+ 66 block SCSI disk devices (32-47)
+ 0 = /dev/sdag 33th SCSI disk whole disk
+ 16 = /dev/sdah 34th SCSI disk whole disk
+ 32 = /dev/sdai 35th SCSI disk whole disk
+ ...
+ 240 = /dev/sdav 48nd SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 67 char Coda network file system
+ 0 = /dev/cfs0 Coda cache manager
+
+ See http://www.coda.cs.cmu.edu for information about Coda.
+
+ 67 block SCSI disk devices (48-63)
+ 0 = /dev/sdaw 49th SCSI disk whole disk
+ 16 = /dev/sdax 50th SCSI disk whole disk
+ 32 = /dev/sday 51st SCSI disk whole disk
+ ...
+ 240 = /dev/sdbl 64th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 68 char CAPI 2.0 interface
+ 0 = /dev/capi20 Control device
+ 1 = /dev/capi20.00 First CAPI 2.0 application
+ 2 = /dev/capi20.01 Second CAPI 2.0 application
+ ...
+ 20 = /dev/capi20.19 19th CAPI 2.0 application
+
+ ISDN CAPI 2.0 driver for use with CAPI 2.0
+ applications; currently supports the AVM B1 card.
+
+ 68 block SCSI disk devices (64-79)
+ 0 = /dev/sdbm 65th SCSI disk whole disk
+ 16 = /dev/sdbn 66th SCSI disk whole disk
+ 32 = /dev/sdbo 67th SCSI disk whole disk
+ ...
+ 240 = /dev/sdcb 80th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 69 char MA16 numeric accelerator card
+ 0 = /dev/ma16 Board memory access
+
+ 69 block SCSI disk devices (80-95)
+ 0 = /dev/sdcc 81st SCSI disk whole disk
+ 16 = /dev/sdcd 82nd SCSI disk whole disk
+ 32 = /dev/sdce 83th SCSI disk whole disk
+ ...
+ 240 = /dev/sdcr 96th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 70 char SpellCaster Protocol Services Interface
+ 0 = /dev/apscfg Configuration interface
+ 1 = /dev/apsauth Authentication interface
+ 2 = /dev/apslog Logging interface
+ 3 = /dev/apsdbg Debugging interface
+ 64 = /dev/apsisdn ISDN command interface
+ 65 = /dev/apsasync Async command interface
+ 128 = /dev/apsmon Monitor interface
+
+ 70 block SCSI disk devices (96-111)
+ 0 = /dev/sdcs 97th SCSI disk whole disk
+ 16 = /dev/sdct 98th SCSI disk whole disk
+ 32 = /dev/sdcu 99th SCSI disk whole disk
+ ...
+ 240 = /dev/sddh 112nd SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 71 char Computone IntelliPort II serial card
+ 0 = /dev/ttyF0 IntelliPort II board 0, port 0
+ 1 = /dev/ttyF1 IntelliPort II board 0, port 1
+ ...
+ 63 = /dev/ttyF63 IntelliPort II board 0, port 63
+ 64 = /dev/ttyF64 IntelliPort II board 1, port 0
+ 65 = /dev/ttyF65 IntelliPort II board 1, port 1
+ ...
+ 127 = /dev/ttyF127 IntelliPort II board 1, port 63
+ 128 = /dev/ttyF128 IntelliPort II board 2, port 0
+ 129 = /dev/ttyF129 IntelliPort II board 2, port 1
+ ...
+ 191 = /dev/ttyF191 IntelliPort II board 2, port 63
+ 192 = /dev/ttyF192 IntelliPort II board 3, port 0
+ 193 = /dev/ttyF193 IntelliPort II board 3, port 1
+ ...
+ 255 = /dev/ttyF255 IntelliPort II board 3, port 63
+
+ 71 block SCSI disk devices (112-127)
+ 0 = /dev/sddi 113th SCSI disk whole disk
+ 16 = /dev/sddj 114th SCSI disk whole disk
+ 32 = /dev/sddk 115th SCSI disk whole disk
+ ...
+ 240 = /dev/sddx 128th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 72 char Computone IntelliPort II serial card - alternate devices
+ 0 = /dev/cuf0 Callout device for ttyF0
+ 1 = /dev/cuf1 Callout device for ttyF1
+ ...
+ 63 = /dev/cuf63 Callout device for ttyF63
+ 64 = /dev/cuf64 Callout device for ttyF64
+ 65 = /dev/cuf65 Callout device for ttyF65
+ ...
+ 127 = /dev/cuf127 Callout device for ttyF127
+ 128 = /dev/cuf128 Callout device for ttyF128
+ 129 = /dev/cuf129 Callout device for ttyF129
+ ...
+ 191 = /dev/cuf191 Callout device for ttyF191
+ 192 = /dev/cuf192 Callout device for ttyF192
+ 193 = /dev/cuf193 Callout device for ttyF193
+ ...
+ 255 = /dev/cuf255 Callout device for ttyF255
+
+ 72 block Compaq Intelligent Drive Array, first controller
+ 0 = /dev/ida/c0d0 First logical drive whole disk
+ 16 = /dev/ida/c0d1 Second logical drive whole disk
+ ...
+ 240 = /dev/ida/c0d15 16th logical drive whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+ 73 char Computone IntelliPort II serial card - control devices
+ 0 = /dev/ip2ipl0 Loadware device for board 0
+ 1 = /dev/ip2stat0 Status device for board 0
+ 4 = /dev/ip2ipl1 Loadware device for board 1
+ 5 = /dev/ip2stat1 Status device for board 1
+ 8 = /dev/ip2ipl2 Loadware device for board 2
+ 9 = /dev/ip2stat2 Status device for board 2
+ 12 = /dev/ip2ipl3 Loadware device for board 3
+ 13 = /dev/ip2stat3 Status device for board 3
+
+ 73 block Compaq Intelligent Drive Array, second controller
+ 0 = /dev/ida/c1d0 First logical drive whole disk
+ 16 = /dev/ida/c1d1 Second logical drive whole disk
+ ...
+ 240 = /dev/ida/c1d15 16th logical drive whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+ 74 char SCI bridge
+ 0 = /dev/SCI/0 SCI device 0
+ 1 = /dev/SCI/1 SCI device 1
+ ...
+
+ Currently for Dolphin Interconnect Solutions' PCI-SCI
+ bridge.
+
+ 74 block Compaq Intelligent Drive Array, third controller
+ 0 = /dev/ida/c2d0 First logical drive whole disk
+ 16 = /dev/ida/c2d1 Second logical drive whole disk
+ ...
+ 240 = /dev/ida/c2d15 16th logical drive whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+ 75 char Specialix IO8+ serial card
+ 0 = /dev/ttyW0 First IO8+ port, first card
+ 1 = /dev/ttyW1 Second IO8+ port, first card
+ ...
+ 8 = /dev/ttyW8 First IO8+ port, second card
+ ...
+
+ 75 block Compaq Intelligent Drive Array, fourth controller
+ 0 = /dev/ida/c3d0 First logical drive whole disk
+ 16 = /dev/ida/c3d1 Second logical drive whole disk
+ ...
+ 240 = /dev/ida/c3d15 16th logical drive whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+ 76 char Specialix IO8+ serial card - alternate devices
+ 0 = /dev/cuw0 Callout device for ttyW0
+ 1 = /dev/cuw1 Callout device for ttyW1
+ ...
+ 8 = /dev/cuw8 Callout device for ttyW8
+ ...
+
+ 76 block Compaq Intelligent Drive Array, fifth controller
+ 0 = /dev/ida/c4d0 First logical drive whole disk
+ 16 = /dev/ida/c4d1 Second logical drive whole disk
+ ...
+ 240 = /dev/ida/c4d15 16th logical drive whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+
+ 77 char ComScire Quantum Noise Generator
+ 0 = /dev/qng ComScire Quantum Noise Generator
+
+ 77 block Compaq Intelligent Drive Array, sixth controller
+ 0 = /dev/ida/c5d0 First logical drive whole disk
+ 16 = /dev/ida/c5d1 Second logical drive whole disk
+ ...
+ 240 = /dev/ida/c5d15 16th logical drive whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+ 78 char PAM Software's multimodem boards
+ 0 = /dev/ttyM0 First PAM modem
+ 1 = /dev/ttyM1 Second PAM modem
+ ...
+
+ 78 block Compaq Intelligent Drive Array, seventh controller
+ 0 = /dev/ida/c6d0 First logical drive whole disk
+ 16 = /dev/ida/c6d1 Second logical drive whole disk
+ ...
+ 240 = /dev/ida/c6d15 16th logical drive whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+ 79 char PAM Software's multimodem boards - alternate devices
+ 0 = /dev/cum0 Callout device for ttyM0
+ 1 = /dev/cum1 Callout device for ttyM1
+ ...
+
+ 79 block Compaq Intelligent Drive Array, eighth controller
+ 0 = /dev/ida/c7d0 First logical drive whole disk
+ 16 = /dev/ida/c7d1 Second logical drive whole disk
+ ...
+ 240 = /dev/ida/c715 16th logical drive whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+ 80 char Photometrics AT200 CCD camera
+ 0 = /dev/at200 Photometrics AT200 CCD camera
+
+ 80 block I2O hard disk
+ 0 = /dev/i2o/hda First I2O hard disk, whole disk
+ 16 = /dev/i2o/hdb Second I2O hard disk, whole disk
+ ...
+ 240 = /dev/i2o/hdp 16th I2O hard disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 81 char video4linux
+ 0 = /dev/video0 Video capture/overlay device
+ ...
+ 63 = /dev/video63 Video capture/overlay device
+ 64 = /dev/radio0 Radio device
+ ...
+ 127 = /dev/radio63 Radio device
+ 192 = /dev/vtx0 Teletext device
+ ...
+ 223 = /dev/vtx31 Teletext device
+ 224 = /dev/vbi0 Vertical blank interrupt
+ ...
+ 255 = /dev/vbi31 Vertical blank interrupt
+
+ 81 block I2O hard disk
+ 0 = /dev/i2o/hdq 17th I2O hard disk, whole disk
+ 16 = /dev/i2o/hdr 18th I2O hard disk, whole disk
+ ...
+ 240 = /dev/i2o/hdaf 32nd I2O hard disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 82 char WiNRADiO communications receiver card
+ 0 = /dev/winradio0 First WiNRADiO card
+ 1 = /dev/winradio1 Second WiNRADiO card
+ ...
+
+ The driver and documentation may be obtained from
+ http://www.proximity.com.au/~brian/winradio/
+
+ 82 block I2O hard disk
+ 0 = /dev/i2o/hdag 33rd I2O hard disk, whole disk
+ 16 = /dev/i2o/hdah 34th I2O hard disk, whole disk
+ ...
+ 240 = /dev/i2o/hdav 48th I2O hard disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 83 char Matrox mga_vid video driver
+ 0 = /dev/mga_vid0 1st video card
+ 1 = /dev/mga_vid1 2nd video card
+ 2 = /dev/mga_vid2 3rd video card
+ ...
+ 15 = /dev/mga_vid15 16th video card
+
+ 83 block I2O hard disk
+ 0 = /dev/i2o/hdaw 49th I2O hard disk, whole disk
+ 16 = /dev/i2o/hdax 50th I2O hard disk, whole disk
+ ...
+ 240 = /dev/i2o/hdbl 64th I2O hard disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 84 char Ikon 1011[57] Versatec Greensheet Interface
+ 0 = /dev/ihcp0 First Greensheet port
+ 1 = /dev/ihcp1 Second Greensheet port
+
+ 84 block I2O hard disk
+ 0 = /dev/i2o/hdbm 65th I2O hard disk, whole disk
+ 16 = /dev/i2o/hdbn 66th I2O hard disk, whole disk
+ ...
+ 240 = /dev/i2o/hdcb 80th I2O hard disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 85 char Linux/SGI shared memory input queue
+ 0 = /dev/shmiq Master shared input queue
+ 1 = /dev/qcntl0 First device pushed
+ 2 = /dev/qcntl1 Second device pushed
+ ...
+
+ 85 block I2O hard disk
+ 0 = /dev/i2o/hdcc 81st I2O hard disk, whole disk
+ 16 = /dev/i2o/hdcd 82nd I2O hard disk, whole disk
+ ...
+ 240 = /dev/i2o/hdcr 96th I2O hard disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 86 char SCSI media changer
+ 0 = /dev/sch0 First SCSI media changer
+ 1 = /dev/sch1 Second SCSI media changer
+ ...
+
+ 86 block I2O hard disk
+ 0 = /dev/i2o/hdcs 97th I2O hard disk, whole disk
+ 16 = /dev/i2o/hdct 98th I2O hard disk, whole disk
+ ...
+ 240 = /dev/i2o/hddh 112th I2O hard disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 87 char Sony Control-A1 stereo control bus
+ 0 = /dev/controla0 First device on chain
+ 1 = /dev/controla1 Second device on chain
+ ...
+
+ 87 block I2O hard disk
+ 0 = /dev/i2o/hddi 113rd I2O hard disk, whole disk
+ 16 = /dev/i2o/hddj 114th I2O hard disk, whole disk
+ ...
+ 240 = /dev/i2o/hddx 128th I2O hard disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 88 char COMX synchronous serial card
+ 0 = /dev/comx0 COMX channel 0
+ 1 = /dev/comx1 COMX channel 1
+ ...
+
+ 88 block Seventh IDE hard disk/CD-ROM interface
+ 0 = /dev/hdm Master: whole disk (or CD-ROM)
+ 64 = /dev/hdn Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 89 char I2C bus interface
+ 0 = /dev/i2c-0 First I2C adapter
+ 1 = /dev/i2c-1 Second I2C adapter
+ ...
+
+ 89 block Eighth IDE hard disk/CD-ROM interface
+ 0 = /dev/hdo Master: whole disk (or CD-ROM)
+ 64 = /dev/hdp Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 90 char Memory Technology Device (RAM, ROM, Flash)
+ 0 = /dev/mtd0 First MTD (rw)
+ 1 = /dev/mtdr0 First MTD (ro)
+ ...
+ 30 = /dev/mtd15 16th MTD (rw)
+ 31 = /dev/mtdr15 16th MTD (ro)
+
+ 90 block Ninth IDE hard disk/CD-ROM interface
+ 0 = /dev/hdq Master: whole disk (or CD-ROM)
+ 64 = /dev/hdr Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 91 char CAN-Bus devices
+ 0 = /dev/can0 First CAN-Bus controller
+ 1 = /dev/can1 Second CAN-Bus controller
+ ...
+
+ 91 block Tenth IDE hard disk/CD-ROM interface
+ 0 = /dev/hds Master: whole disk (or CD-ROM)
+ 64 = /dev/hdt Slave: whole disk (or CD-ROM)
+
+ Partitions are handled the same way as for the first
+ interface (see major number 3).
+
+ 92 char Reserved for ith Kommunikationstechnik MIC ISDN card
+
+ 92 block PPDD encrypted disk driver
+ 0 = /dev/ppdd0 First encrypted disk
+ 1 = /dev/ppdd1 Second encrypted disk
+ ...
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ 93 char IBM Smart Capture Card frame grabber {2.6}
+ 0 = /dev/iscc0 First Smart Capture Card
+ 1 = /dev/iscc1 Second Smart Capture Card
+ ...
+ 128 = /dev/isccctl0 First Smart Capture Card control
+ 129 = /dev/isccctl1 Second Smart Capture Card control
+ ...
+
+ 93 block NAND Flash Translation Layer filesystem
+ 0 = /dev/nftla First NFTL layer
+ 16 = /dev/nftlb Second NFTL layer
+ ...
+ 240 = /dev/nftlp 16th NTFL layer
+
+ 94 char miroVIDEO DC10/30 capture/playback device {2.6}
+ 0 = /dev/dcxx0 First capture card
+ 1 = /dev/dcxx1 Second capture card
+ ...
+
+ 94 block IBM S/390 DASD block storage
+ 0 = /dev/dasda First DASD device, major
+ 1 = /dev/dasda1 First DASD device, block 1
+ 2 = /dev/dasda2 First DASD device, block 2
+ 3 = /dev/dasda3 First DASD device, block 3
+ 4 = /dev/dasdb Second DASD device, major
+ 5 = /dev/dasdb1 Second DASD device, block 1
+ 6 = /dev/dasdb2 Second DASD device, block 2
+ 7 = /dev/dasdb3 Second DASD device, block 3
+ ...
+
+ 95 char IP filter
+ 0 = /dev/ipl Filter control device/log file
+ 1 = /dev/ipnat NAT control device/log file
+ 2 = /dev/ipstate State information log file
+ 3 = /dev/ipauth Authentication control device/log file
+ ...
+
+ 96 char Parallel port ATAPI tape devices
+ 0 = /dev/pt0 First parallel port ATAPI tape
+ 1 = /dev/pt1 Second parallel port ATAPI tape
+ ...
+ 128 = /dev/npt0 First p.p. ATAPI tape, no rewind
+ 129 = /dev/npt1 Second p.p. ATAPI tape, no rewind
+ ...
+
+ 96 block Inverse NAND Flash Translation Layer
+ 0 = /dev/inftla First INFTL layer
+ 16 = /dev/inftlb Second INFTL layer
+ ...
+ 240 = /dev/inftlp 16th INTFL layer
+
+ 97 char Parallel port generic ATAPI interface
+ 0 = /dev/pg0 First parallel port ATAPI device
+ 1 = /dev/pg1 Second parallel port ATAPI device
+ 2 = /dev/pg2 Third parallel port ATAPI device
+ 3 = /dev/pg3 Fourth parallel port ATAPI device
+
+ These devices support the same API as the generic SCSI
+ devices.
+
+ 98 char Control and Measurement Device (comedi)
+ 0 = /dev/comedi0 First comedi device
+ 1 = /dev/comedi1 Second comedi device
+ ...
+
+ See http://stm.lbl.gov/comedi or http://www.llp.fu-berlin.de/.
+
+ 98 block User-mode virtual block device
+ 0 = /dev/ubda First user-mode block device
+ 16 = /dev/udbb Second user-mode block device
+ ...
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+ This device is used by the user-mode virtual kernel port.
+
+ 99 char Raw parallel ports
+ 0 = /dev/parport0 First parallel port
+ 1 = /dev/parport1 Second parallel port
+ ...
+
+ 99 block JavaStation flash disk
+ 0 = /dev/jsfd JavaStation flash disk
+
+100 char Telephony for Linux
+ 0 = /dev/phone0 First telephony device
+ 1 = /dev/phone1 Second telephony device
+ ...
+
+101 char Motorola DSP 56xxx board
+ 0 = /dev/mdspstat Status information
+ 1 = /dev/mdsp1 First DSP board I/O controls
+ ...
+ 16 = /dev/mdsp16 16th DSP board I/O controls
+
+101 block AMI HyperDisk RAID controller
+ 0 = /dev/amiraid/ar0 First array whole disk
+ 16 = /dev/amiraid/ar1 Second array whole disk
+ ...
+ 240 = /dev/amiraid/ar15 16th array whole disk
+
+ For each device, partitions are added as:
+ 0 = /dev/amiraid/ar? Whole disk
+ 1 = /dev/amiraid/ar?p1 First partition
+ 2 = /dev/amiraid/ar?p2 Second partition
+ ...
+ 15 = /dev/amiraid/ar?p15 15th partition
+
+102 char Philips SAA5249 Teletext signal decoder {2.6}
+ 0 = /dev/tlk0 First Teletext decoder
+ 1 = /dev/tlk1 Second Teletext decoder
+ 2 = /dev/tlk2 Third Teletext decoder
+ 3 = /dev/tlk3 Fourth Teletext decoder
+
+102 block Compressed block device
+ 0 = /dev/cbd/a First compressed block device, whole device
+ 16 = /dev/cbd/b Second compressed block device, whole device
+ ...
+ 240 = /dev/cbd/p 16th compressed block device, whole device
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+103 char Arla network file system
+ 0 = /dev/nnpfs0 First NNPFS device
+ 1 = /dev/nnpfs1 Second NNPFS device
+
+ Arla is a free clone of the Andrew File System, AFS.
+ The NNPFS device gives user mode filesystem
+ implementations a kernel presence for caching and easy
+ mounting. For more information about the project,
+ write to <arla-drinkers@stacken.kth.se> or see
+ http://www.stacken.kth.se/project/arla/
+
+103 block Audit device
+ 0 = /dev/audit Audit device
+
+104 char Flash BIOS support
+
+104 block Compaq Next Generation Drive Array, first controller
+ 0 = /dev/cciss/c0d0 First logical drive, whole disk
+ 16 = /dev/cciss/c0d1 Second logical drive, whole disk
+ ...
+ 240 = /dev/cciss/c0d15 16th logical drive, whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+105 char Comtrol VS-1000 serial controller
+ 0 = /dev/ttyV0 First VS-1000 port
+ 1 = /dev/ttyV1 Second VS-1000 port
+ ...
+
+105 block Compaq Next Generation Drive Array, second controller
+ 0 = /dev/cciss/c1d0 First logical drive, whole disk
+ 16 = /dev/cciss/c1d1 Second logical drive, whole disk
+ ...
+ 240 = /dev/cciss/c1d15 16th logical drive, whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+106 char Comtrol VS-1000 serial controller - alternate devices
+ 0 = /dev/cuv0 First VS-1000 port
+ 1 = /dev/cuv1 Second VS-1000 port
+ ...
+
+106 block Compaq Next Generation Drive Array, third controller
+ 0 = /dev/cciss/c2d0 First logical drive, whole disk
+ 16 = /dev/cciss/c2d1 Second logical drive, whole disk
+ ...
+ 240 = /dev/cciss/c2d15 16th logical drive, whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+107 char 3Dfx Voodoo Graphics device
+ 0 = /dev/3dfx Primary 3Dfx graphics device
+
+107 block Compaq Next Generation Drive Array, fourth controller
+ 0 = /dev/cciss/c3d0 First logical drive, whole disk
+ 16 = /dev/cciss/c3d1 Second logical drive, whole disk
+ ...
+ 240 = /dev/cciss/c3d15 16th logical drive, whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+108 char Device independent PPP interface
+ 0 = /dev/ppp Device independent PPP interface
+
+108 block Compaq Next Generation Drive Array, fifth controller
+ 0 = /dev/cciss/c4d0 First logical drive, whole disk
+ 16 = /dev/cciss/c4d1 Second logical drive, whole disk
+ ...
+ 240 = /dev/cciss/c4d15 16th logical drive, whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+109 char Reserved for logical volume manager
+
+109 block Compaq Next Generation Drive Array, sixth controller
+ 0 = /dev/cciss/c5d0 First logical drive, whole disk
+ 16 = /dev/cciss/c5d1 Second logical drive, whole disk
+ ...
+ 240 = /dev/cciss/c5d15 16th logical drive, whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+110 char miroMEDIA Surround board
+ 0 = /dev/srnd0 First miroMEDIA Surround board
+ 1 = /dev/srnd1 Second miroMEDIA Surround board
+ ...
+
+110 block Compaq Next Generation Drive Array, seventh controller
+ 0 = /dev/cciss/c6d0 First logical drive, whole disk
+ 16 = /dev/cciss/c6d1 Second logical drive, whole disk
+ ...
+ 240 = /dev/cciss/c6d15 16th logical drive, whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+111 char Philips SAA7146-based audio/video card {2.6}
+ 0 = /dev/av0 First A/V card
+ 1 = /dev/av1 Second A/V card
+ ...
+
+111 block Compaq Next Generation Drive Array, eighth controller
+ 0 = /dev/cciss/c7d0 First logical drive, whole disk
+ 16 = /dev/cciss/c7d1 Second logical drive, whole disk
+ ...
+ 240 = /dev/cciss/c7d15 16th logical drive, whole disk
+
+ Partitions are handled the same way as for Mylex
+ DAC960 (see major number 48) except that the limit on
+ partitions is 15.
+
+112 char ISI serial card
+ 0 = /dev/ttyM0 First ISI port
+ 1 = /dev/ttyM1 Second ISI port
+ ...
+
+ There is currently a device-naming conflict between
+ these and PAM multimodems (major 78).
+
+112 block IBM iSeries virtual disk
+ 0 = /dev/iseries/vda First virtual disk, whole disk
+ 8 = /dev/iseries/vdb Second virtual disk, whole disk
+ ...
+ 200 = /dev/iseries/vdz 26th virtual disk, whole disk
+ 208 = /dev/iseries/vdaa 27th virtual disk, whole disk
+ ...
+ 248 = /dev/iseries/vdaf 32nd virtual disk, whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 7.
+
+113 char ISI serial card - alternate devices
+ 0 = /dev/cum0 Callout device for ttyM0
+ 1 = /dev/cum1 Callout device for ttyM1
+ ...
+
+113 block IBM iSeries virtual CD-ROM
+ 0 = /dev/iseries/vcda First virtual CD-ROM
+ 1 = /dev/iseries/vcdb Second virtual CD-ROM
+ ...
+
+114 char Picture Elements ISE board
+ 0 = /dev/ise0 First ISE board
+ 1 = /dev/ise1 Second ISE board
+ ...
+ 128 = /dev/isex0 Control node for first ISE board
+ 129 = /dev/isex1 Control node for second ISE board
+ ...
+
+ The ISE board is an embedded computer, optimized for
+ image processing. The /dev/iseN nodes are the general
+ I/O access to the board, the /dev/isex0 nodes command
+ nodes used to control the board.
+
+114 block IDE BIOS powered software RAID interfaces such as the
+ Promise Fastrak
+
+ 0 = /dev/ataraid/d0
+ 1 = /dev/ataraid/d0p1
+ 2 = /dev/ataraid/d0p2
+ ...
+ 16 = /dev/ataraid/d1
+ 17 = /dev/ataraid/d1p1
+ 18 = /dev/ataraid/d1p2
+ ...
+ 255 = /dev/ataraid/d15p15
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+115 char TI link cable devices (115 was formerly the console driver speaker)
+ 0 = /dev/tipar0 Parallel cable on first parallel port
+ ...
+ 7 = /dev/tipar7 Parallel cable on seventh parallel port
+
+ 8 = /dev/tiser0 Serial cable on first serial port
+ ...
+ 15 = /dev/tiser7 Serial cable on seventh serial port
+
+ 16 = /dev/tiusb0 First USB cable
+ ...
+ 47 = /dev/tiusb31 32nd USB cable
+
+115 block NetWare (NWFS) Devices (0-255)
+
+ The NWFS (NetWare) devices are used to present a
+ collection of NetWare Mirror Groups or NetWare
+ Partitions as a logical storage segment for
+ use in mounting NetWare volumes. A maximum of
+ 256 NetWare volumes can be supported in a single
+ machine.
+
+ http://www.kernel.org/pub/linux/kernel/people/jmerkey/nwfs
+
+ 0 = /dev/nwfs/v0 First NetWare (NWFS) Logical Volume
+ 1 = /dev/nwfs/v1 Second NetWare (NWFS) Logical Volume
+ 2 = /dev/nwfs/v2 Third NetWare (NWFS) Logical Volume
+ ...
+ 255 = /dev/nwfs/v255 Last NetWare (NWFS) Logical Volume
+
+116 char Advanced Linux Sound Driver (ALSA)
+
+116 block MicroMemory battery backed RAM adapter (NVRAM)
+ Supports 16 boards, 15 partitions each.
+ Requested by neilb at cse.unsw.edu.au.
+
+ 0 = /dev/umem/d0 Whole of first board
+ 1 = /dev/umem/d0p1 First partition of first board
+ 2 = /dev/umem/d0p2 Second partition of first board
+ 15 = /dev/umem/d0p15 15th partition of first board
+
+ 16 = /dev/umem/d1 Whole of second board
+ 17 = /dev/umem/d1p1 First partition of second board
+ ...
+ 255= /dev/umem/d15p15 15th partition of 16th board.
+
+117 char COSA/SRP synchronous serial card
+ 0 = /dev/cosa0c0 1st board, 1st channel
+ 1 = /dev/cosa0c1 1st board, 2nd channel
+ ...
+ 16 = /dev/cosa1c0 2nd board, 1st channel
+ 17 = /dev/cosa1c1 2nd board, 2nd channel
+ ...
+
+117 block Enterprise Volume Management System (EVMS)
+
+ The EVMS driver uses a layered, plug-in model to provide
+ unparalleled flexibility and extensibility in managing
+ storage. This allows for easy expansion or customization
+ of various levels of volume management. Requested by
+ Mark Peloquin (peloquin at us.ibm.com).
+
+ Note: EVMS populates and manages all the devnodes in
+ /dev/evms.
+
+ http://sf.net/projects/evms
+
+ 0 = /dev/evms/block_device EVMS block device
+ 1 = /dev/evms/legacyname1 First EVMS legacy device
+ 2 = /dev/evms/legacyname2 Second EVMS legacy device
+ ...
+ Both ranges can grow (down or up) until they meet.
+ ...
+ 254 = /dev/evms/EVMSname2 Second EVMS native device
+ 255 = /dev/evms/EVMSname1 First EVMS native device
+
+ Note: legacyname(s) are derived from the normal legacy
+ device names. For example, /dev/hda5 would become
+ /dev/evms/hda5.
+
+118 char IBM Cryptographic Accelerator
+ 0 = /dev/ica Virtual interface to all IBM Crypto Accelerators
+ 1 = /dev/ica0 IBMCA Device 0
+ 2 = /dev/ica1 IBMCA Device 1
+ ...
+
+119 char VMware virtual network control
+ 0 = /dev/vmnet0 1st virtual network
+ 1 = /dev/vmnet1 2nd virtual network
+ ...
+
+120-127 char LOCAL/EXPERIMENTAL USE
+
+120-127 block LOCAL/EXPERIMENTAL USE
+ Allocated for local/experimental use. For devices not
+ assigned official numbers, these ranges should be
+ used in order to avoid conflicting with future assignments.
+
+128-135 char Unix98 PTY masters
+
+ These devices should not have corresponding device
+ nodes; instead they should be accessed through the
+ /dev/ptmx cloning interface.
+
+128 block SCSI disk devices (128-143)
+ 0 = /dev/sddy 129th SCSI disk whole disk
+ 16 = /dev/sddz 130th SCSI disk whole disk
+ 32 = /dev/sdea 131th SCSI disk whole disk
+ ...
+ 240 = /dev/sden 144th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+129 block SCSI disk devices (144-159)
+ 0 = /dev/sdeo 145th SCSI disk whole disk
+ 16 = /dev/sdep 146th SCSI disk whole disk
+ 32 = /dev/sdeq 147th SCSI disk whole disk
+ ...
+ 240 = /dev/sdfd 160th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+130 char (Misc devices)
+
+130 block SCSI disk devices (160-175)
+ 0 = /dev/sdfe 161st SCSI disk whole disk
+ 16 = /dev/sdff 162nd SCSI disk whole disk
+ 32 = /dev/sdfg 163rd SCSI disk whole disk
+ ...
+ 240 = /dev/sdft 176th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+131 block SCSI disk devices (176-191)
+ 0 = /dev/sdfu 177th SCSI disk whole disk
+ 16 = /dev/sdfv 178th SCSI disk whole disk
+ 32 = /dev/sdfw 179th SCSI disk whole disk
+ ...
+ 240 = /dev/sdgj 192nd SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+132 block SCSI disk devices (192-207)
+ 0 = /dev/sdgk 193rd SCSI disk whole disk
+ 16 = /dev/sdgl 194th SCSI disk whole disk
+ 32 = /dev/sdgm 195th SCSI disk whole disk
+ ...
+ 240 = /dev/sdgz 208th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+133 block SCSI disk devices (208-223)
+ 0 = /dev/sdha 209th SCSI disk whole disk
+ 16 = /dev/sdhb 210th SCSI disk whole disk
+ 32 = /dev/sdhc 211th SCSI disk whole disk
+ ...
+ 240 = /dev/sdhp 224th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+134 block SCSI disk devices (224-239)
+ 0 = /dev/sdhq 225th SCSI disk whole disk
+ 16 = /dev/sdhr 226th SCSI disk whole disk
+ 32 = /dev/sdhs 227th SCSI disk whole disk
+ ...
+ 240 = /dev/sdif 240th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+135 block SCSI disk devices (240-255)
+ 0 = /dev/sdig 241st SCSI disk whole disk
+ 16 = /dev/sdih 242nd SCSI disk whole disk
+ 32 = /dev/sdih 243rd SCSI disk whole disk
+ ...
+ 240 = /dev/sdiv 256th SCSI disk whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+136-143 char Unix98 PTY slaves
+ 0 = /dev/pts/0 First Unix98 pseudo-TTY
+ 1 = /dev/pts/1 Second Unix98 pseudo-TTY
+ ...
+
+ These device nodes are automatically generated with
+ the proper permissions and modes by mounting the
+ devpts filesystem onto /dev/pts with the appropriate
+ mount options (distribution dependent, however, on
+ *most* distributions the appropriate options are
+ "mode=0620,gid=<gid of the "tty" group>".)
+
+136 block Mylex DAC960 PCI RAID controller; ninth controller
+ 0 = /dev/rd/c8d0 First disk, whole disk
+ 8 = /dev/rd/c8d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c8d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+137 block Mylex DAC960 PCI RAID controller; tenth controller
+ 0 = /dev/rd/c9d0 First disk, whole disk
+ 8 = /dev/rd/c9d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c9d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+138 block Mylex DAC960 PCI RAID controller; eleventh controller
+ 0 = /dev/rd/c10d0 First disk, whole disk
+ 8 = /dev/rd/c10d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c10d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+139 block Mylex DAC960 PCI RAID controller; twelfth controller
+ 0 = /dev/rd/c11d0 First disk, whole disk
+ 8 = /dev/rd/c11d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c11d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+140 block Mylex DAC960 PCI RAID controller; thirteenth controller
+ 0 = /dev/rd/c12d0 First disk, whole disk
+ 8 = /dev/rd/c12d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c12d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+141 block Mylex DAC960 PCI RAID controller; fourteenth controller
+ 0 = /dev/rd/c13d0 First disk, whole disk
+ 8 = /dev/rd/c13d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c13d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+142 block Mylex DAC960 PCI RAID controller; fifteenth controller
+ 0 = /dev/rd/c14d0 First disk, whole disk
+ 8 = /dev/rd/c14d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c14d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+143 block Mylex DAC960 PCI RAID controller; sixteenth controller
+ 0 = /dev/rd/c15d0 First disk, whole disk
+ 8 = /dev/rd/c15d1 Second disk, whole disk
+ ...
+ 248 = /dev/rd/c15d31 32nd disk, whole disk
+
+ Partitions are handled as for major 48.
+
+144 char Encapsulated PPP
+ 0 = /dev/pppox0 First PPP over Ethernet
+ ...
+ 63 = /dev/pppox63 64th PPP over Ethernet
+
+ This is primarily used for ADSL.
+
+ The SST 5136-DN DeviceNet interface driver has been
+ relocated to major 183 due to an unfortunate conflict.
+
+144 block Expansion Area #1 for more non-device (e.g. NFS) mounts
+ 0 = mounted device 256
+ 255 = mounted device 511
+
+145 char SAM9407-based soundcard
+ 0 = /dev/sam0_mixer
+ 1 = /dev/sam0_sequencer
+ 2 = /dev/sam0_midi00
+ 3 = /dev/sam0_dsp
+ 4 = /dev/sam0_audio
+ 6 = /dev/sam0_sndstat
+ 18 = /dev/sam0_midi01
+ 34 = /dev/sam0_midi02
+ 50 = /dev/sam0_midi03
+ 64 = /dev/sam1_mixer
+ ...
+ 128 = /dev/sam2_mixer
+ ...
+ 192 = /dev/sam3_mixer
+ ...
+
+ Device functions match OSS, but offer a number of
+ addons, which are sam9407 specific. OSS can be
+ operated simultaneously, taking care of the codec.
+
+145 block Expansion Area #2 for more non-device (e.g. NFS) mounts
+ 0 = mounted device 512
+ 255 = mounted device 767
+
+146 char SYSTRAM SCRAMNet mirrored-memory network
+ 0 = /dev/scramnet0 First SCRAMNet device
+ 1 = /dev/scramnet1 Second SCRAMNet device
+ ...
+
+146 block Expansion Area #3 for more non-device (e.g. NFS) mounts
+ 0 = mounted device 768
+ 255 = mounted device 1023
+
+147 char Aureal Semiconductor Vortex Audio device
+ 0 = /dev/aureal0 First Aureal Vortex
+ 1 = /dev/aureal1 Second Aureal Vortex
+ ...
+
+147 block Distributed Replicated Block Device (DRBD)
+ 0 = /dev/drbd0 First DRBD device
+ 1 = /dev/drbd1 Second DRBD device
+ ...
+
+148 char Technology Concepts serial card
+ 0 = /dev/ttyT0 First TCL port
+ 1 = /dev/ttyT1 Second TCL port
+ ...
+
+149 char Technology Concepts serial card - alternate devices
+ 0 = /dev/cut0 Callout device for ttyT0
+ 1 = /dev/cut0 Callout device for ttyT1
+ ...
+
+150 char Real-Time Linux FIFOs
+ 0 = /dev/rtf0 First RTLinux FIFO
+ 1 = /dev/rtf1 Second RTLinux FIFO
+ ...
+
+151 char DPT I2O SmartRaid V controller
+ 0 = /dev/dpti0 First DPT I2O adapter
+ 1 = /dev/dpti1 Second DPT I2O adapter
+ ...
+
+152 char EtherDrive Control Device
+ 0 = /dev/etherd/ctl Connect/Disconnect an EtherDrive
+ 1 = /dev/etherd/err Monitor errors
+ 2 = /dev/etherd/raw Raw AoE packet monitor
+
+152 block EtherDrive Block Devices
+ 0 = /dev/etherd/0 EtherDrive 0
+ ...
+ 255 = /dev/etherd/255 EtherDrive 255
+
+153 char SPI Bus Interface (sometimes referred to as MicroWire)
+ 0 = /dev/spi0 First SPI device on the bus
+ 1 = /dev/spi1 Second SPI device on the bus
+ ...
+ 15 = /dev/spi15 Sixteenth SPI device on the bus
+
+153 block Enhanced Metadisk RAID (EMD) storage units
+ 0 = /dev/emd/0 First unit
+ 1 = /dev/emd/0p1 Partition 1 on First unit
+ 2 = /dev/emd/0p2 Partition 2 on First unit
+ ...
+ 15 = /dev/emd/0p15 Partition 15 on First unit
+
+ 16 = /dev/emd/1 Second unit
+ 32 = /dev/emd/2 Third unit
+ ...
+ 240 = /dev/emd/15 Sixteenth unit
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+154 char Specialix RIO serial card
+ 0 = /dev/ttySR0 First RIO port
+ ...
+ 255 = /dev/ttySR255 256th RIO port
+
+155 char Specialix RIO serial card - alternate devices
+ 0 = /dev/cusr0 Callout device for ttySR0
+ ...
+ 255 = /dev/cusr255 Callout device for ttySR255
+
+156 char Specialix RIO serial card
+ 0 = /dev/ttySR256 257th RIO port
+ ...
+ 255 = /dev/ttySR511 512th RIO port
+
+157 char Specialix RIO serial card - alternate devices
+ 0 = /dev/cusr256 Callout device for ttySR256
+ ...
+ 255 = /dev/cusr511 Callout device for ttySR511
+
+158 char Dialogic GammaLink fax driver
+ 0 = /dev/gfax0 GammaLink channel 0
+ 1 = /dev/gfax1 GammaLink channel 1
+ ...
+
+159 char RESERVED
+
+159 block RESERVED
+
+160 char General Purpose Instrument Bus (GPIB)
+ 0 = /dev/gpib0 First GPIB bus
+ 1 = /dev/gpib1 Second GPIB bus
+ ...
+
+160 block Carmel 8-port SATA Disks on First Controller
+ 0 = /dev/carmel/0 SATA disk 0 whole disk
+ 1 = /dev/carmel/0p1 SATA disk 0 partition 1
+ ...
+ 31 = /dev/carmel/0p31 SATA disk 0 partition 31
+
+ 32 = /dev/carmel/1 SATA disk 1 whole disk
+ 64 = /dev/carmel/2 SATA disk 2 whole disk
+ ...
+ 224 = /dev/carmel/7 SATA disk 7 whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 31.
+
+161 char IrCOMM devices (IrDA serial/parallel emulation)
+ 0 = /dev/ircomm0 First IrCOMM device
+ 1 = /dev/ircomm1 Second IrCOMM device
+ ...
+ 16 = /dev/irlpt0 First IrLPT device
+ 17 = /dev/irlpt1 Second IrLPT device
+ ...
+
+161 block Carmel 8-port SATA Disks on Second Controller
+ 0 = /dev/carmel/8 SATA disk 8 whole disk
+ 1 = /dev/carmel/8p1 SATA disk 8 partition 1
+ ...
+ 31 = /dev/carmel/8p31 SATA disk 8 partition 31
+
+ 32 = /dev/carmel/9 SATA disk 9 whole disk
+ 64 = /dev/carmel/10 SATA disk 10 whole disk
+ ...
+ 224 = /dev/carmel/15 SATA disk 15 whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 31.
+
+162 char Raw block device interface
+ 0 = /dev/rawctl Raw I/O control device
+ 1 = /dev/raw/raw1 First raw I/O device
+ 2 = /dev/raw/raw2 Second raw I/O device
+ ...
+
+163 char UNASSIGNED (was Radio Tech BIM-XXX-RS232 radio modem - see 51)
+
+164 char Chase Research AT/PCI-Fast serial card
+ 0 = /dev/ttyCH0 AT/PCI-Fast board 0, port 0
+ ...
+ 15 = /dev/ttyCH15 AT/PCI-Fast board 0, port 15
+ 16 = /dev/ttyCH16 AT/PCI-Fast board 1, port 0
+ ...
+ 31 = /dev/ttyCH31 AT/PCI-Fast board 1, port 15
+ 32 = /dev/ttyCH32 AT/PCI-Fast board 2, port 0
+ ...
+ 47 = /dev/ttyCH47 AT/PCI-Fast board 2, port 15
+ 48 = /dev/ttyCH48 AT/PCI-Fast board 3, port 0
+ ...
+ 63 = /dev/ttyCH63 AT/PCI-Fast board 3, port 15
+
+165 char Chase Research AT/PCI-Fast serial card - alternate devices
+ 0 = /dev/cuch0 Callout device for ttyCH0
+ ...
+ 63 = /dev/cuch63 Callout device for ttyCH63
+
+166 char ACM USB modems
+ 0 = /dev/ttyACM0 First ACM modem
+ 1 = /dev/ttyACM1 Second ACM modem
+ ...
+
+167 char ACM USB modems - alternate devices
+ 0 = /dev/cuacm0 Callout device for ttyACM0
+ 1 = /dev/cuacm1 Callout device for ttyACM1
+ ...
+
+168 char Eracom CSA7000 PCI encryption adaptor
+ 0 = /dev/ecsa0 First CSA7000
+ 1 = /dev/ecsa1 Second CSA7000
+ ...
+
+169 char Eracom CSA8000 PCI encryption adaptor
+ 0 = /dev/ecsa8-0 First CSA8000
+ 1 = /dev/ecsa8-1 Second CSA8000
+ ...
+
+170 char AMI MegaRAC remote access controller
+ 0 = /dev/megarac0 First MegaRAC card
+ 1 = /dev/megarac1 Second MegaRAC card
+ ...
+
+171 char Reserved for IEEE 1394 (Firewire)
+
+172 char Moxa Intellio serial card
+ 0 = /dev/ttyMX0 First Moxa port
+ 1 = /dev/ttyMX1 Second Moxa port
+ ...
+ 127 = /dev/ttyMX127 128th Moxa port
+ 128 = /dev/moxactl Moxa control port
+
+173 char Moxa Intellio serial card - alternate devices
+ 0 = /dev/cumx0 Callout device for ttyMX0
+ 1 = /dev/cumx1 Callout device for ttyMX1
+ ...
+ 127 = /dev/cumx127 Callout device for ttyMX127
+
+174 char SmartIO serial card
+ 0 = /dev/ttySI0 First SmartIO port
+ 1 = /dev/ttySI1 Second SmartIO port
+ ...
+
+175 char SmartIO serial card - alternate devices
+ 0 = /dev/cusi0 Callout device for ttySI0
+ 1 = /dev/cusi1 Callout device for ttySI1
+ ...
+
+176 char nCipher nFast PCI crypto accelerator
+ 0 = /dev/nfastpci0 First nFast PCI device
+ 1 = /dev/nfastpci1 First nFast PCI device
+ ...
+
+177 char TI PCILynx memory spaces
+ 0 = /dev/pcilynx/aux0 AUX space of first PCILynx card
+ ...
+ 15 = /dev/pcilynx/aux15 AUX space of 16th PCILynx card
+ 16 = /dev/pcilynx/rom0 ROM space of first PCILynx card
+ ...
+ 31 = /dev/pcilynx/rom15 ROM space of 16th PCILynx card
+ 32 = /dev/pcilynx/ram0 RAM space of first PCILynx card
+ ...
+ 47 = /dev/pcilynx/ram15 RAM space of 16th PCILynx card
+
+178 char Giganet cLAN1xxx virtual interface adapter
+ 0 = /dev/clanvi0 First cLAN adapter
+ 1 = /dev/clanvi1 Second cLAN adapter
+ ...
+
+179 char CCube DVXChip-based PCI products
+ 0 = /dev/dvxirq0 First DVX device
+ 1 = /dev/dvxirq1 Second DVX device
+ ...
+
+180 char USB devices
+ 0 = /dev/usb/lp0 First USB printer
+ ...
+ 15 = /dev/usb/lp15 16th USB printer
+ 48 = /dev/usb/scanner0 First USB scanner
+ ...
+ 63 = /dev/usb/scanner15 16th USB scanner
+ 64 = /dev/usb/rio500 Diamond Rio 500
+ 65 = /dev/usb/usblcd USBLCD Interface (info@usblcd.de)
+ 66 = /dev/usb/cpad0 Synaptics cPad (mouse/LCD)
+ 96 = /dev/usb/hiddev0 1st USB HID device
+ ...
+ 111 = /dev/usb/hiddev15 16th USB HID device
+ 128 = /dev/usb/brlvgr0 First Braille Voyager device
+ ...
+ 131 = /dev/usb/brlvgr3 Fourth Braille Voyager device
+ 132 = /dev/usb/idmouse ID Mouse (fingerprint scanner) device
+ 133 = /dev/usb/sisusbvga1 First SiSUSB VGA device
+ ...
+ 140 = /dev/usb/sisusbvga8 Eighth SISUSB VGA device
+ 144 = /dev/usb/lcd USB LCD device
+ 160 = /dev/usb/legousbtower0 1st USB Legotower device
+ ...
+ 175 = /dev/usb/legousbtower15 16th USB Legotower device
+ 176 = /dev/usb/usbtmc1 First USB TMC device
+ ...
+ 192 = /dev/usb/usbtmc16 16th USB TMC device
+ 240 = /dev/usb/dabusb0 First daubusb device
+ ...
+ 243 = /dev/usb/dabusb3 Fourth dabusb device
+
+180 block USB block devices
+ 0 = /dev/uba First USB block device
+ 8 = /dev/ubb Second USB block device
+ 16 = /dev/ubc Third USB block device
+ ...
+
+181 char Conrad Electronic parallel port radio clocks
+ 0 = /dev/pcfclock0 First Conrad radio clock
+ 1 = /dev/pcfclock1 Second Conrad radio clock
+ ...
+
+182 char Picture Elements THR2 binarizer
+ 0 = /dev/pethr0 First THR2 board
+ 1 = /dev/pethr1 Second THR2 board
+ ...
+
+183 char SST 5136-DN DeviceNet interface
+ 0 = /dev/ss5136dn0 First DeviceNet interface
+ 1 = /dev/ss5136dn1 Second DeviceNet interface
+ ...
+
+ This device used to be assigned to major number 144.
+ It had to be moved due to an unfortunate conflict.
+
+184 char Picture Elements' video simulator/sender
+ 0 = /dev/pevss0 First sender board
+ 1 = /dev/pevss1 Second sender board
+ ...
+
+185 char InterMezzo high availability file system
+ 0 = /dev/intermezzo0 First cache manager
+ 1 = /dev/intermezzo1 Second cache manager
+ ...
+
+ See http://www.inter-mezzo.org/ for more information.
+
+186 char Object-based storage control device
+ 0 = /dev/obd0 First obd control device
+ 1 = /dev/obd1 Second obd control device
+ ...
+
+ See ftp://ftp.lustre.org/pub/obd for code and information.
+
+187 char DESkey hardware encryption device
+ 0 = /dev/deskey0 First DES key
+ 1 = /dev/deskey1 Second DES key
+ ...
+
+188 char USB serial converters
+ 0 = /dev/ttyUSB0 First USB serial converter
+ 1 = /dev/ttyUSB1 Second USB serial converter
+ ...
+
+189 char USB serial converters - alternate devices
+ 0 = /dev/cuusb0 Callout device for ttyUSB0
+ 1 = /dev/cuusb1 Callout device for ttyUSB1
+ ...
+
+190 char Kansas City tracker/tuner card
+ 0 = /dev/kctt0 First KCT/T card
+ 1 = /dev/kctt1 Second KCT/T card
+ ...
+
+191 char Reserved for PCMCIA
+
+192 char Kernel profiling interface
+ 0 = /dev/profile Profiling control device
+ 1 = /dev/profile0 Profiling device for CPU 0
+ 2 = /dev/profile1 Profiling device for CPU 1
+ ...
+
+193 char Kernel event-tracing interface
+ 0 = /dev/trace Tracing control device
+ 1 = /dev/trace0 Tracing device for CPU 0
+ 2 = /dev/trace1 Tracing device for CPU 1
+ ...
+
+194 char linVideoStreams (LINVS)
+ 0 = /dev/mvideo/status0 Video compression status
+ 1 = /dev/mvideo/stream0 Video stream
+ 2 = /dev/mvideo/frame0 Single compressed frame
+ 3 = /dev/mvideo/rawframe0 Raw uncompressed frame
+ 4 = /dev/mvideo/codec0 Direct codec access
+ 5 = /dev/mvideo/video4linux0 Video4Linux compatibility
+
+ 16 = /dev/mvideo/status1 Second device
+ ...
+ 32 = /dev/mvideo/status2 Third device
+ ...
+ ...
+ 240 = /dev/mvideo/status15 16th device
+ ...
+
+195 char Nvidia graphics devices
+ 0 = /dev/nvidia0 First Nvidia card
+ 1 = /dev/nvidia1 Second Nvidia card
+ ...
+ 255 = /dev/nvidiactl Nvidia card control device
+
+196 char Tormenta T1 card
+ 0 = /dev/tor/0 Master control channel for all cards
+ 1 = /dev/tor/1 First DS0
+ 2 = /dev/tor/2 Second DS0
+ ...
+ 48 = /dev/tor/48 48th DS0
+ 49 = /dev/tor/49 First pseudo-channel
+ 50 = /dev/tor/50 Second pseudo-channel
+ ...
+
+197 char OpenTNF tracing facility
+ 0 = /dev/tnf/t0 Trace 0 data extraction
+ 1 = /dev/tnf/t1 Trace 1 data extraction
+ ...
+ 128 = /dev/tnf/status Tracing facility status
+ 130 = /dev/tnf/trace Tracing device
+
+198 char Total Impact TPMP2 quad coprocessor PCI card
+ 0 = /dev/tpmp2/0 First card
+ 1 = /dev/tpmp2/1 Second card
+ ...
+
+199 char Veritas volume manager (VxVM) volumes
+ 0 = /dev/vx/rdsk/*/* First volume
+ 1 = /dev/vx/rdsk/*/* Second volume
+ ...
+
+199 block Veritas volume manager (VxVM) volumes
+ 0 = /dev/vx/dsk/*/* First volume
+ 1 = /dev/vx/dsk/*/* Second volume
+ ...
+
+ The namespace in these directories is maintained by
+ the user space VxVM software.
+
+200 char Veritas VxVM configuration interface
+ 0 = /dev/vx/config Configuration access node
+ 1 = /dev/vx/trace Volume i/o trace access node
+ 2 = /dev/vx/iod Volume i/o daemon access node
+ 3 = /dev/vx/info Volume information access node
+ 4 = /dev/vx/task Volume tasks access node
+ 5 = /dev/vx/taskmon Volume tasks monitor daemon
+
+201 char Veritas VxVM dynamic multipathing driver
+ 0 = /dev/vx/rdmp/* First multipath device
+ 1 = /dev/vx/rdmp/* Second multipath device
+ ...
+201 block Veritas VxVM dynamic multipathing driver
+ 0 = /dev/vx/dmp/* First multipath device
+ 1 = /dev/vx/dmp/* Second multipath device
+ ...
+
+ The namespace in these directories is maintained by
+ the user space VxVM software.
+
+202 char CPU model-specific registers
+ 0 = /dev/cpu/0/msr MSRs on CPU 0
+ 1 = /dev/cpu/1/msr MSRs on CPU 1
+ ...
+
+202 block Xen Virtual Block Device
+ 0 = /dev/xvda First Xen VBD whole disk
+ 16 = /dev/xvdb Second Xen VBD whole disk
+ 32 = /dev/xvdc Third Xen VBD whole disk
+ ...
+ 240 = /dev/xvdp Sixteenth Xen VBD whole disk
+
+ Partitions are handled in the same way as for IDE
+ disks (see major number 3) except that the limit on
+ partitions is 15.
+
+203 char CPU CPUID information
+ 0 = /dev/cpu/0/cpuid CPUID on CPU 0
+ 1 = /dev/cpu/1/cpuid CPUID on CPU 1
+ ...
+
+204 char Low-density serial ports
+ 0 = /dev/ttyLU0 LinkUp Systems L72xx UART - port 0
+ 1 = /dev/ttyLU1 LinkUp Systems L72xx UART - port 1
+ 2 = /dev/ttyLU2 LinkUp Systems L72xx UART - port 2
+ 3 = /dev/ttyLU3 LinkUp Systems L72xx UART - port 3
+ 4 = /dev/ttyFB0 Intel Footbridge (ARM)
+ 5 = /dev/ttySA0 StrongARM builtin serial port 0
+ 6 = /dev/ttySA1 StrongARM builtin serial port 1
+ 7 = /dev/ttySA2 StrongARM builtin serial port 2
+ 8 = /dev/ttySC0 SCI serial port (SuperH) - port 0
+ 9 = /dev/ttySC1 SCI serial port (SuperH) - port 1
+ 10 = /dev/ttySC2 SCI serial port (SuperH) - port 2
+ 11 = /dev/ttySC3 SCI serial port (SuperH) - port 3
+ 12 = /dev/ttyFW0 Firmware console - port 0
+ 13 = /dev/ttyFW1 Firmware console - port 1
+ 14 = /dev/ttyFW2 Firmware console - port 2
+ 15 = /dev/ttyFW3 Firmware console - port 3
+ 16 = /dev/ttyAM0 ARM "AMBA" serial port 0
+ ...
+ 31 = /dev/ttyAM15 ARM "AMBA" serial port 15
+ 32 = /dev/ttyDB0 DataBooster serial port 0
+ ...
+ 39 = /dev/ttyDB7 DataBooster serial port 7
+ 40 = /dev/ttySG0 SGI Altix console port
+ 41 = /dev/ttySMX0 Motorola i.MX - port 0
+ 42 = /dev/ttySMX1 Motorola i.MX - port 1
+ 43 = /dev/ttySMX2 Motorola i.MX - port 2
+ 44 = /dev/ttyMM0 Marvell MPSC - port 0
+ 45 = /dev/ttyMM1 Marvell MPSC - port 1
+ 46 = /dev/ttyCPM0 PPC CPM (SCC or SMC) - port 0
+ ...
+ 47 = /dev/ttyCPM5 PPC CPM (SCC or SMC) - port 5
+ 50 = /dev/ttyIOC0 Altix serial card
+ ...
+ 81 = /dev/ttyIOC31 Altix serial card
+ 82 = /dev/ttyVR0 NEC VR4100 series SIU
+ 83 = /dev/ttyVR1 NEC VR4100 series DSIU
+ 84 = /dev/ttyIOC84 Altix ioc4 serial card
+ ...
+ 115 = /dev/ttyIOC115 Altix ioc4 serial card
+ 116 = /dev/ttySIOC0 Altix ioc3 serial card
+ ...
+ 147 = /dev/ttySIOC31 Altix ioc3 serial card
+ 148 = /dev/ttyPSC0 PPC PSC - port 0
+ ...
+ 153 = /dev/ttyPSC5 PPC PSC - port 5
+ 154 = /dev/ttyAT0 ATMEL serial port 0
+ ...
+ 169 = /dev/ttyAT15 ATMEL serial port 15
+ 170 = /dev/ttyNX0 Hilscher netX serial port 0
+ ...
+ 185 = /dev/ttyNX15 Hilscher netX serial port 15
+ 186 = /dev/ttyJ0 JTAG1 DCC protocol based serial port emulation
+ 187 = /dev/ttyUL0 Xilinx uartlite - port 0
+ ...
+ 190 = /dev/ttyUL3 Xilinx uartlite - port 3
+ 191 = /dev/xvc0 Xen virtual console - port 0
+
+205 char Low-density serial ports (alternate device)
+ 0 = /dev/culu0 Callout device for ttyLU0
+ 1 = /dev/culu1 Callout device for ttyLU1
+ 2 = /dev/culu2 Callout device for ttyLU2
+ 3 = /dev/culu3 Callout device for ttyLU3
+ 4 = /dev/cufb0 Callout device for ttyFB0
+ 5 = /dev/cusa0 Callout device for ttySA0
+ 6 = /dev/cusa1 Callout device for ttySA1
+ 7 = /dev/cusa2 Callout device for ttySA2
+ 8 = /dev/cusc0 Callout device for ttySC0
+ 9 = /dev/cusc1 Callout device for ttySC1
+ 10 = /dev/cusc2 Callout device for ttySC2
+ 11 = /dev/cusc3 Callout device for ttySC3
+ 12 = /dev/cufw0 Callout device for ttyFW0
+ 13 = /dev/cufw1 Callout device for ttyFW1
+ 14 = /dev/cufw2 Callout device for ttyFW2
+ 15 = /dev/cufw3 Callout device for ttyFW3
+ 16 = /dev/cuam0 Callout device for ttyAM0
+ ...
+ 31 = /dev/cuam15 Callout device for ttyAM15
+ 32 = /dev/cudb0 Callout device for ttyDB0
+ ...
+ 39 = /dev/cudb7 Callout device for ttyDB7
+ 40 = /dev/cusg0 Callout device for ttySG0
+ 41 = /dev/ttycusmx0 Callout device for ttySMX0
+ 42 = /dev/ttycusmx1 Callout device for ttySMX1
+ 43 = /dev/ttycusmx2 Callout device for ttySMX2
+ 46 = /dev/cucpm0 Callout device for ttyCPM0
+ ...
+ 49 = /dev/cucpm5 Callout device for ttyCPM5
+ 50 = /dev/cuioc40 Callout device for ttyIOC40
+ ...
+ 81 = /dev/cuioc431 Callout device for ttyIOC431
+ 82 = /dev/cuvr0 Callout device for ttyVR0
+ 83 = /dev/cuvr1 Callout device for ttyVR1
+
+206 char OnStream SC-x0 tape devices
+ 0 = /dev/osst0 First OnStream SCSI tape, mode 0
+ 1 = /dev/osst1 Second OnStream SCSI tape, mode 0
+ ...
+ 32 = /dev/osst0l First OnStream SCSI tape, mode 1
+ 33 = /dev/osst1l Second OnStream SCSI tape, mode 1
+ ...
+ 64 = /dev/osst0m First OnStream SCSI tape, mode 2
+ 65 = /dev/osst1m Second OnStream SCSI tape, mode 2
+ ...
+ 96 = /dev/osst0a First OnStream SCSI tape, mode 3
+ 97 = /dev/osst1a Second OnStream SCSI tape, mode 3
+ ...
+ 128 = /dev/nosst0 No rewind version of /dev/osst0
+ 129 = /dev/nosst1 No rewind version of /dev/osst1
+ ...
+ 160 = /dev/nosst0l No rewind version of /dev/osst0l
+ 161 = /dev/nosst1l No rewind version of /dev/osst1l
+ ...
+ 192 = /dev/nosst0m No rewind version of /dev/osst0m
+ 193 = /dev/nosst1m No rewind version of /dev/osst1m
+ ...
+ 224 = /dev/nosst0a No rewind version of /dev/osst0a
+ 225 = /dev/nosst1a No rewind version of /dev/osst1a
+ ...
+
+ The OnStream SC-x0 SCSI tapes do not support the
+ standard SCSI SASD command set and therefore need
+ their own driver "osst". Note that the IDE, USB (and
+ maybe ParPort) versions may be driven via ide-scsi or
+ usb-storage SCSI emulation and this osst device and
+ driver as well. The ADR-x0 drives are QIC-157
+ compliant and don't need osst.
+
+207 char Compaq ProLiant health feature indicate
+ 0 = /dev/cpqhealth/cpqw Redirector interface
+ 1 = /dev/cpqhealth/crom EISA CROM
+ 2 = /dev/cpqhealth/cdt Data Table
+ 3 = /dev/cpqhealth/cevt Event Log
+ 4 = /dev/cpqhealth/casr Automatic Server Recovery
+ 5 = /dev/cpqhealth/cecc ECC Memory
+ 6 = /dev/cpqhealth/cmca Machine Check Architecture
+ 7 = /dev/cpqhealth/ccsm Deprecated CDT
+ 8 = /dev/cpqhealth/cnmi NMI Handling
+ 9 = /dev/cpqhealth/css Sideshow Management
+ 10 = /dev/cpqhealth/cram CMOS interface
+ 11 = /dev/cpqhealth/cpci PCI IRQ interface
+
+208 char User space serial ports
+ 0 = /dev/ttyU0 First user space serial port
+ 1 = /dev/ttyU1 Second user space serial port
+ ...
+
+209 char User space serial ports (alternate devices)
+ 0 = /dev/cuu0 Callout device for ttyU0
+ 1 = /dev/cuu1 Callout device for ttyU1
+ ...
+
+210 char SBE, Inc. sync/async serial card
+ 0 = /dev/sbei/wxcfg0 Configuration device for board 0
+ 1 = /dev/sbei/dld0 Download device for board 0
+ 2 = /dev/sbei/wan00 WAN device, port 0, board 0
+ 3 = /dev/sbei/wan01 WAN device, port 1, board 0
+ 4 = /dev/sbei/wan02 WAN device, port 2, board 0
+ 5 = /dev/sbei/wan03 WAN device, port 3, board 0
+ 6 = /dev/sbei/wanc00 WAN clone device, port 0, board 0
+ 7 = /dev/sbei/wanc01 WAN clone device, port 1, board 0
+ 8 = /dev/sbei/wanc02 WAN clone device, port 2, board 0
+ 9 = /dev/sbei/wanc03 WAN clone device, port 3, board 0
+ 10 = /dev/sbei/wxcfg1 Configuration device for board 1
+ 11 = /dev/sbei/dld1 Download device for board 1
+ 12 = /dev/sbei/wan10 WAN device, port 0, board 1
+ 13 = /dev/sbei/wan11 WAN device, port 1, board 1
+ 14 = /dev/sbei/wan12 WAN device, port 2, board 1
+ 15 = /dev/sbei/wan13 WAN device, port 3, board 1
+ 16 = /dev/sbei/wanc10 WAN clone device, port 0, board 1
+ 17 = /dev/sbei/wanc11 WAN clone device, port 1, board 1
+ 18 = /dev/sbei/wanc12 WAN clone device, port 2, board 1
+ 19 = /dev/sbei/wanc13 WAN clone device, port 3, board 1
+ ...
+
+ Yes, each board is really spaced 10 (decimal) apart.
+
+211 char Addinum CPCI1500 digital I/O card
+ 0 = /dev/addinum/cpci1500/0 First CPCI1500 card
+ 1 = /dev/addinum/cpci1500/1 Second CPCI1500 card
+ ...
+
+212 char LinuxTV.org DVB driver subsystem
+ 0 = /dev/dvb/adapter0/video0 first video decoder of first card
+ 1 = /dev/dvb/adapter0/audio0 first audio decoder of first card
+ 2 = /dev/dvb/adapter0/sec0 (obsolete/unused)
+ 3 = /dev/dvb/adapter0/frontend0 first frontend device of first card
+ 4 = /dev/dvb/adapter0/demux0 first demux device of first card
+ 5 = /dev/dvb/adapter0/dvr0 first digital video recoder device of first card
+ 6 = /dev/dvb/adapter0/ca0 first common access port of first card
+ 7 = /dev/dvb/adapter0/net0 first network device of first card
+ 8 = /dev/dvb/adapter0/osd0 first on-screen-display device of first card
+ 9 = /dev/dvb/adapter0/video1 second video decoder of first card
+ ...
+ 64 = /dev/dvb/adapter1/video0 first video decoder of second card
+ ...
+ 128 = /dev/dvb/adapter2/video0 first video decoder of third card
+ ...
+ 196 = /dev/dvb/adapter3/video0 first video decoder of fourth card
+
+216 char Bluetooth RFCOMM TTY devices
+ 0 = /dev/rfcomm0 First Bluetooth RFCOMM TTY device
+ 1 = /dev/rfcomm1 Second Bluetooth RFCOMM TTY device
+ ...
+
+217 char Bluetooth RFCOMM TTY devices (alternate devices)
+ 0 = /dev/curf0 Callout device for rfcomm0
+ 1 = /dev/curf1 Callout device for rfcomm1
+ ...
+
+218 char The Logical Company bus Unibus/Qbus adapters
+ 0 = /dev/logicalco/bci/0 First bus adapter
+ 1 = /dev/logicalco/bci/1 First bus adapter
+ ...
+
+219 char The Logical Company DCI-1300 digital I/O card
+ 0 = /dev/logicalco/dci1300/0 First DCI-1300 card
+ 1 = /dev/logicalco/dci1300/1 Second DCI-1300 card
+ ...
+
+220 char Myricom Myrinet "GM" board
+ 0 = /dev/myricom/gm0 First Myrinet GM board
+ 1 = /dev/myricom/gmp0 First board "root access"
+ 2 = /dev/myricom/gm1 Second Myrinet GM board
+ 3 = /dev/myricom/gmp1 Second board "root access"
+ ...
+
+221 char VME bus
+ 0 = /dev/bus/vme/m0 First master image
+ 1 = /dev/bus/vme/m1 Second master image
+ 2 = /dev/bus/vme/m2 Third master image
+ 3 = /dev/bus/vme/m3 Fourth master image
+ 4 = /dev/bus/vme/s0 First slave image
+ 5 = /dev/bus/vme/s1 Second slave image
+ 6 = /dev/bus/vme/s2 Third slave image
+ 7 = /dev/bus/vme/s3 Fourth slave image
+ 8 = /dev/bus/vme/ctl Control
+
+ It is expected that all VME bus drivers will use the
+ same interface. For interface documentation see
+ http://www.vmelinux.org/.
+
+224 char A2232 serial card
+ 0 = /dev/ttyY0 First A2232 port
+ 1 = /dev/ttyY1 Second A2232 port
+ ...
+
+225 char A2232 serial card (alternate devices)
+ 0 = /dev/cuy0 Callout device for ttyY0
+ 1 = /dev/cuy1 Callout device for ttyY1
+ ...
+
+226 char Direct Rendering Infrastructure (DRI)
+ 0 = /dev/dri/card0 First graphics card
+ 1 = /dev/dri/card1 Second graphics card
+ ...
+
+227 char IBM 3270 terminal Unix tty access
+ 1 = /dev/3270/tty1 First 3270 terminal
+ 2 = /dev/3270/tty2 Seconds 3270 terminal
+ ...
+
+228 char IBM 3270 terminal block-mode access
+ 0 = /dev/3270/tub Controlling interface
+ 1 = /dev/3270/tub1 First 3270 terminal
+ 2 = /dev/3270/tub2 Second 3270 terminal
+ ...
+
+229 char IBM iSeries/pSeries virtual console
+ 0 = /dev/hvc0 First console port
+ 1 = /dev/hvc1 Second console port
+ ...
+
+230 char IBM iSeries virtual tape
+ 0 = /dev/iseries/vt0 First virtual tape, mode 0
+ 1 = /dev/iseries/vt1 Second virtual tape, mode 0
+ ...
+ 32 = /dev/iseries/vt0l First virtual tape, mode 1
+ 33 = /dev/iseries/vt1l Second virtual tape, mode 1
+ ...
+ 64 = /dev/iseries/vt0m First virtual tape, mode 2
+ 65 = /dev/iseries/vt1m Second virtual tape, mode 2
+ ...
+ 96 = /dev/iseries/vt0a First virtual tape, mode 3
+ 97 = /dev/iseries/vt1a Second virtual tape, mode 3
+ ...
+ 128 = /dev/iseries/nvt0 First virtual tape, mode 0, no rewind
+ 129 = /dev/iseries/nvt1 Second virtual tape, mode 0, no rewind
+ ...
+ 160 = /dev/iseries/nvt0l First virtual tape, mode 1, no rewind
+ 161 = /dev/iseries/nvt1l Second virtual tape, mode 1, no rewind
+ ...
+ 192 = /dev/iseries/nvt0m First virtual tape, mode 2, no rewind
+ 193 = /dev/iseries/nvt1m Second virtual tape, mode 2, no rewind
+ ...
+ 224 = /dev/iseries/nvt0a First virtual tape, mode 3, no rewind
+ 225 = /dev/iseries/nvt1a Second virtual tape, mode 3, no rewind
+ ...
+
+ "No rewind" refers to the omission of the default
+ automatic rewind on device close. The MTREW or MTOFFL
+ ioctl()'s can be used to rewind the tape regardless of
+ the device used to access it.
+
+231 char InfiniBand
+ 0 = /dev/infiniband/umad0
+ 1 = /dev/infiniband/umad1
+ ...
+ 63 = /dev/infiniband/umad63 63rd InfiniBandMad device
+ 64 = /dev/infiniband/issm0 First InfiniBand IsSM device
+ 65 = /dev/infiniband/issm1 Second InfiniBand IsSM device
+ ...
+ 127 = /dev/infiniband/issm63 63rd InfiniBand IsSM device
+ 128 = /dev/infiniband/uverbs0 First InfiniBand verbs device
+ 129 = /dev/infiniband/uverbs1 Second InfiniBand verbs device
+ ...
+ 159 = /dev/infiniband/uverbs31 31st InfiniBand verbs device
+
+232 char Biometric Devices
+ 0 = /dev/biometric/sensor0/fingerprint first fingerprint sensor on first device
+ 1 = /dev/biometric/sensor0/iris first iris sensor on first device
+ 2 = /dev/biometric/sensor0/retina first retina sensor on first device
+ 3 = /dev/biometric/sensor0/voiceprint first voiceprint sensor on first device
+ 4 = /dev/biometric/sensor0/facial first facial sensor on first device
+ 5 = /dev/biometric/sensor0/hand first hand sensor on first device
+ ...
+ 10 = /dev/biometric/sensor1/fingerprint first fingerprint sensor on second device
+ ...
+ 20 = /dev/biometric/sensor2/fingerprint first fingerprint sensor on third device
+ ...
+
+233 char PathScale InfiniPath interconnect
+ 0 = /dev/ipath Primary device for programs (any unit)
+ 1 = /dev/ipath0 Access specifically to unit 0
+ 2 = /dev/ipath1 Access specifically to unit 1
+ ...
+ 4 = /dev/ipath3 Access specifically to unit 3
+ 129 = /dev/ipath_sma Device used by Subnet Management Agent
+ 130 = /dev/ipath_diag Device used by diagnostics programs
+
+234-239 UNASSIGNED
+
+240-254 char LOCAL/EXPERIMENTAL USE
+
+240-254 block LOCAL/EXPERIMENTAL USE
+ Allocated for local/experimental use. For devices not
+ assigned official numbers, these ranges should be
+ used in order to avoid conflicting with future assignments.
+
+255 char RESERVED
+
+255 block RESERVED
+
+ This major is reserved to assist the expansion to a
+ larger number space. No device nodes with this major
+ should ever be created on the filesystem.
+ (This is probably not true anymore, but I'll leave it
+ for now /Torben)
+
+---LARGE MAJORS!!!!!---
+
+256 char Equinox SST multi-port serial boards
+ 0 = /dev/ttyEQ0 First serial port on first Equinox SST board
+ 127 = /dev/ttyEQ127 Last serial port on first Equinox SST board
+ 128 = /dev/ttyEQ128 First serial port on second Equinox SST board
+ ...
+ 1027 = /dev/ttyEQ1027 Last serial port on eighth Equinox SST board
+
+256 block Resident Flash Disk Flash Translation Layer
+ 0 = /dev/rfda First RFD FTL layer
+ 16 = /dev/rfdb Second RFD FTL layer
+ ...
+ 240 = /dev/rfdp 16th RFD FTL layer
+
+257 char Phoenix Technologies Cryptographic Services Driver
+ 0 = /dev/ptlsec Crypto Services Driver
+
+257 block SSFDC Flash Translation Layer filesystem
+ 0 = /dev/ssfdca First SSFDC layer
+ 8 = /dev/ssfdcb Second SSFDC layer
+ 16 = /dev/ssfdcc Third SSFDC layer
+ 24 = /dev/ssfdcd 4th SSFDC layer
+ 32 = /dev/ssfdce 5th SSFDC layer
+ 40 = /dev/ssfdcf 6th SSFDC layer
+ 48 = /dev/ssfdcg 7th SSFDC layer
+ 56 = /dev/ssfdch 8th SSFDC layer
+
+258 block ROM/Flash read-only translation layer
+ 0 = /dev/blockrom0 First ROM card's translation layer interface
+ 1 = /dev/blockrom1 Second ROM card's translation layer interface
+ ...
+
+ **** ADDITIONAL /dev DIRECTORY ENTRIES
+
+This section details additional entries that should or may exist in
+the /dev directory. It is preferred that symbolic links use the same
+form (absolute or relative) as is indicated here. Links are
+classified as "hard" or "symbolic" depending on the preferred type of
+link; if possible, the indicated type of link should be used.
+
+
+ Compulsory links
+
+These links should exist on all systems:
+
+/dev/fd /proc/self/fd symbolic File descriptors
+/dev/stdin fd/0 symbolic stdin file descriptor
+/dev/stdout fd/1 symbolic stdout file descriptor
+/dev/stderr fd/2 symbolic stderr file descriptor
+/dev/nfsd socksys symbolic Required by iBCS-2
+/dev/X0R null symbolic Required by iBCS-2
+
+Note: /dev/X0R is <letter X>-<digit 0>-<letter R>.
+
+ Recommended links
+
+It is recommended that these links exist on all systems:
+
+/dev/core /proc/kcore symbolic Backward compatibility
+/dev/ramdisk ram0 symbolic Backward compatibility
+/dev/ftape qft0 symbolic Backward compatibility
+/dev/bttv0 video0 symbolic Backward compatibility
+/dev/radio radio0 symbolic Backward compatibility
+/dev/i2o* /dev/i2o/* symbolic Backward compatibility
+/dev/scd? sr? hard Alternate SCSI CD-ROM name
+
+ Locally defined links
+
+The following links may be established locally to conform to the
+configuration of the system. This is merely a tabulation of existing
+practice, and does not constitute a recommendation. However, if they
+exist, they should have the following uses.
+
+/dev/mouse mouse port symbolic Current mouse device
+/dev/tape tape device symbolic Current tape device
+/dev/cdrom CD-ROM device symbolic Current CD-ROM device
+/dev/cdwriter CD-writer symbolic Current CD-writer device
+/dev/scanner scanner symbolic Current scanner device
+/dev/modem modem port symbolic Current dialout device
+/dev/root root device symbolic Current root filesystem
+/dev/swap swap device symbolic Current swap device
+
+/dev/modem should not be used for a modem which supports dialin as
+well as dialout, as it tends to cause lock file problems. If it
+exists, /dev/modem should point to the appropriate primary TTY device
+(the use of the alternate callout devices is deprecated).
+
+For SCSI devices, /dev/tape and /dev/cdrom should point to the
+``cooked'' devices (/dev/st* and /dev/sr*, respectively), whereas
+/dev/cdwriter and /dev/scanner should point to the appropriate generic
+SCSI devices (/dev/sg*).
+
+/dev/mouse may point to a primary serial TTY device, a hardware mouse
+device, or a socket for a mouse driver program (e.g. /dev/gpmdata).
+
+ Sockets and pipes
+
+Non-transient sockets and named pipes may exist in /dev. Common entries are:
+
+/dev/printer socket lpd local socket
+/dev/log socket syslog local socket
+/dev/gpmdata socket gpm mouse multiplexer
+
+ Mount points
+
+The following names are reserved for mounting special filesystems
+under /dev. These special filesystems provide kernel interfaces that
+cannot be provided with standard device nodes.
+
+/dev/pts devpts PTY slave filesystem
+/dev/shm tmpfs POSIX shared memory maintenance access
+
+ **** TERMINAL DEVICES
+
+Terminal, or TTY devices are a special class of character devices. A
+terminal device is any device that could act as a controlling terminal
+for a session; this includes virtual consoles, serial ports, and
+pseudoterminals (PTYs).
+
+All terminal devices share a common set of capabilities known as line
+disciplines; these include the common terminal line discipline as well
+as SLIP and PPP modes.
+
+All terminal devices are named similarly; this section explains the
+naming and use of the various types of TTYs. Note that the naming
+conventions include several historical warts; some of these are
+Linux-specific, some were inherited from other systems, and some
+reflect Linux outgrowing a borrowed convention.
+
+A hash mark (#) in a device name is used here to indicate a decimal
+number without leading zeroes.
+
+ Virtual consoles and the console device
+
+Virtual consoles are full-screen terminal displays on the system video
+monitor. Virtual consoles are named /dev/tty#, with numbering
+starting at /dev/tty1; /dev/tty0 is the current virtual console.
+/dev/tty0 is the device that should be used to access the system video
+card on those architectures for which the frame buffer devices
+(/dev/fb*) are not applicable. Do not use /dev/console
+for this purpose.
+
+The console device, /dev/console, is the device to which system
+messages should be sent, and on which logins should be permitted in
+single-user mode. Starting with Linux 2.1.71, /dev/console is managed
+by the kernel; for previous versions it should be a symbolic link to
+either /dev/tty0, a specific virtual console such as /dev/tty1, or to
+a serial port primary (tty*, not cu*) device, depending on the
+configuration of the system.
+
+ Serial ports
+
+Serial ports are RS-232 serial ports and any device which simulates
+one, either in hardware (such as internal modems) or in software (such
+as the ISDN driver.) Under Linux, each serial ports has two device
+names, the primary or callin device and the alternate or callout one.
+Each kind of device is indicated by a different letter. For any
+letter X, the names of the devices are /dev/ttyX# and /dev/cux#,
+respectively; for historical reasons, /dev/ttyS# and /dev/ttyC#
+correspond to /dev/cua# and /dev/cub#. In the future, it should be
+expected that multiple letters will be used; all letters will be upper
+case for the "tty" device (e.g. /dev/ttyDP#) and lower case for the
+"cu" device (e.g. /dev/cudp#).
+
+The names /dev/ttyQ# and /dev/cuq# are reserved for local use.
+
+The alternate devices provide for kernel-based exclusion and somewhat
+different defaults than the primary devices. Their main purpose is to
+allow the use of serial ports with programs with no inherent or broken
+support for serial ports. Their use is deprecated, and they may be
+removed from a future version of Linux.
+
+Arbitration of serial ports is provided by the use of lock files with
+the names /var/lock/LCK..ttyX#. The contents of the lock file should
+be the PID of the locking process as an ASCII number.
+
+It is common practice to install links such as /dev/modem
+which point to serial ports. In order to ensure proper locking in the
+presence of these links, it is recommended that software chase
+symlinks and lock all possible names; additionally, it is recommended
+that a lock file be installed with the corresponding alternate
+device. In order to avoid deadlocks, it is recommended that the locks
+are acquired in the following order, and released in the reverse:
+
+ 1. The symbolic link name, if any (/var/lock/LCK..modem)
+ 2. The "tty" name (/var/lock/LCK..ttyS2)
+ 3. The alternate device name (/var/lock/LCK..cua2)
+
+In the case of nested symbolic links, the lock files should be
+installed in the order the symlinks are resolved.
+
+Under no circumstances should an application hold a lock while waiting
+for another to be released. In addition, applications which attempt
+to create lock files for the corresponding alternate device names
+should take into account the possibility of being used on a non-serial
+port TTY, for which no alternate device would exist.
+
+ Pseudoterminals (PTYs)
+
+Pseudoterminals, or PTYs, are used to create login sessions or provide
+other capabilities requiring a TTY line discipline (including SLIP or
+PPP capability) to arbitrary data-generation processes. Each PTY has
+a master side, named /dev/pty[p-za-e][0-9a-f], and a slave side, named
+/dev/tty[p-za-e][0-9a-f]. The kernel arbitrates the use of PTYs by
+allowing each master side to be opened only once.
+
+Once the master side has been opened, the corresponding slave device
+can be used in the same manner as any TTY device. The master and
+slave devices are connected by the kernel, generating the equivalent
+of a bidirectional pipe with TTY capabilities.
+
+Recent versions of the Linux kernels and GNU libc contain support for
+the System V/Unix98 naming scheme for PTYs, which assigns a common
+device, /dev/ptmx, to all the masters (opening it will automatically
+give you a previously unassigned PTY) and a subdirectory, /dev/pts,
+for the slaves; the slaves are named with decimal integers (/dev/pts/#
+in our notation). This removes the problem of exhausting the
+namespace and enables the kernel to automatically create the device
+nodes for the slaves on demand using the "devpts" filesystem.
+
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
new file mode 100644
index 0000000..1e89a51
--- /dev/null
+++ b/Documentation/dontdiff
@@ -0,0 +1,201 @@
+*.a
+*.aux
+*.bin
+*.cpio
+*.csp
+*.dsp
+*.dvi
+*.elf
+*.eps
+*.fw
+*.gen.S
+*.gif
+*.grep
+*.grp
+*.gz
+*.html
+*.i
+*.jpeg
+*.ko
+*.log
+*.lst
+*.moc
+*.mod.c
+*.o
+*.o.*
+*.orig
+*.out
+*.pdf
+*.png
+*.ps
+*.rej
+*.s
+*.sgml
+*.so
+*.so.dbg
+*.symtypes
+*.tab.c
+*.tab.h
+*.tex
+*.ver
+*.xml
+*_MODULES
+*_vga16.c
+*~
+*.9
+*.9.gz
+.*
+.mm
+53c700_d.h
+CVS
+ChangeSet
+Image
+Kerntypes
+Module.markers
+Module.symvers
+PENDING
+SCCS
+System.map*
+TAGS
+aic7*reg.h*
+aic7*reg_print.c*
+aic7*seq.h*
+aicasm
+aicdb.h*
+asm
+asm-offsets.h
+asm_offsets.h
+autoconf.h*
+bbootsect
+bin2c
+binkernel.spec
+binoffset
+bootsect
+bounds.h
+bsetup
+btfixupprep
+build
+bvmlinux
+bzImage*
+classlist.h*
+comp*.log
+compile.h*
+conf
+config
+config-*
+config_data.h*
+config_data.gz*
+conmakehash
+consolemap_deftbl.c*
+cpustr.h
+crc32table.h*
+cscope.*
+defkeymap.c
+devlist.h*
+docproc
+elf2ecoff
+elfconfig.h*
+fixdep
+fore200e_mkfirm
+fore200e_pca_fw.c*
+gconf
+gen-devlist
+gen_crc32table
+gen_init_cpio
+genksyms
+*_gray256.c
+ihex2fw
+ikconfig.h*
+initramfs_data.cpio
+initramfs_data.cpio.gz
+initramfs_list
+kallsyms
+kconfig
+keywords.c
+ksym.c*
+ksym.h*
+kxgettext
+lkc_defs.h
+lex.c
+lex.*.c
+logo_*.c
+logo_*_clut224.c
+logo_*_mono.c
+lxdialog
+mach-types
+mach-types.h
+machtypes.h
+map
+maui_boot.h
+mconf
+miboot*
+mk_elfconfig
+mkboot
+mkbugboot
+mkcpustr
+mkdep
+mkprep
+mktables
+mktree
+modpost
+modules.order
+modversions.h*
+ncscope.*
+offset.h
+offsets.h
+oui.c*
+parse.c
+parse.h
+patches*
+pca200e.bin
+pca200e_ecd.bin2
+piggy.gz
+piggyback
+pnmtologo
+ppc_defs.h*
+promcon_tbl.c
+pss_boot.h
+qconf
+raid6altivec*.c
+raid6int*.c
+raid6tables.c
+relocs
+series
+setup
+setup.bin
+setup.elf
+sImage
+sm_tbl*
+split-include
+syscalltab.h
+tags
+tftpboot.img
+timeconst.h
+times.h*
+trix_boot.h
+utsrelease.h*
+vdso-syms.lds
+vdso.lds
+vdso32-int80-syms.lds
+vdso32-syms.lds
+vdso32-syscall-syms.lds
+vdso32-sysenter-syms.lds
+vdso32.lds
+vdso32.so.dbg
+vdso64.lds
+vdso64.so.dbg
+version.h*
+vmlinux
+vmlinux-*
+vmlinux.aout
+vmlinux.lds
+vsyscall.lds
+vsyscall_32.lds
+wanxlfw.inc
+uImage
+unifdef
+wakeup.bin
+wakeup.elf
+wakeup.lds
+zImage*
+zconf.hash.c
diff --git a/Documentation/driver-model/binding.txt b/Documentation/driver-model/binding.txt
new file mode 100644
index 0000000..f7ec9d6
--- /dev/null
+++ b/Documentation/driver-model/binding.txt
@@ -0,0 +1,102 @@
+
+Driver Binding
+
+Driver binding is the process of associating a device with a device
+driver that can control it. Bus drivers have typically handled this
+because there have been bus-specific structures to represent the
+devices and the drivers. With generic device and device driver
+structures, most of the binding can take place using common code.
+
+
+Bus
+~~~
+
+The bus type structure contains a list of all devices that are on that bus
+type in the system. When device_register is called for a device, it is
+inserted into the end of this list. The bus object also contains a
+list of all drivers of that bus type. When driver_register is called
+for a driver, it is inserted at the end of this list. These are the
+two events which trigger driver binding.
+
+
+device_register
+~~~~~~~~~~~~~~~
+
+When a new device is added, the bus's list of drivers is iterated over
+to find one that supports it. In order to determine that, the device
+ID of the device must match one of the device IDs that the driver
+supports. The format and semantics for comparing IDs is bus-specific.
+Instead of trying to derive a complex state machine and matching
+algorithm, it is up to the bus driver to provide a callback to compare
+a device against the IDs of a driver. The bus returns 1 if a match was
+found; 0 otherwise.
+
+int match(struct device * dev, struct device_driver * drv);
+
+If a match is found, the device's driver field is set to the driver
+and the driver's probe callback is called. This gives the driver a
+chance to verify that it really does support the hardware, and that
+it's in a working state.
+
+Device Class
+~~~~~~~~~~~~
+
+Upon the successful completion of probe, the device is registered with
+the class to which it belongs. Device drivers belong to one and only one
+class, and that is set in the driver's devclass field.
+devclass_add_device is called to enumerate the device within the class
+and actually register it with the class, which happens with the
+class's register_dev callback.
+
+NOTE: The device class structures and core routines to manipulate them
+are not in the mainline kernel, so the discussion is still a bit
+speculative.
+
+
+Driver
+~~~~~~
+
+When a driver is attached to a device, the device is inserted into the
+driver's list of devices.
+
+
+sysfs
+~~~~~
+
+A symlink is created in the bus's 'devices' directory that points to
+the device's directory in the physical hierarchy.
+
+A symlink is created in the driver's 'devices' directory that points
+to the device's directory in the physical hierarchy.
+
+A directory for the device is created in the class's directory. A
+symlink is created in that directory that points to the device's
+physical location in the sysfs tree.
+
+A symlink can be created (though this isn't done yet) in the device's
+physical directory to either its class directory, or the class's
+top-level directory. One can also be created to point to its driver's
+directory also.
+
+
+driver_register
+~~~~~~~~~~~~~~~
+
+The process is almost identical for when a new driver is added.
+The bus's list of devices is iterated over to find a match. Devices
+that already have a driver are skipped. All the devices are iterated
+over, to bind as many devices as possible to the driver.
+
+
+Removal
+~~~~~~~
+
+When a device is removed, the reference count for it will eventually
+go to 0. When it does, the remove callback of the driver is called. It
+is removed from the driver's list of devices and the reference count
+of the driver is decremented. All symlinks between the two are removed.
+
+When a driver is removed, the list of devices that it supports is
+iterated over, and the driver's remove callback is called for each
+one. The device is removed from that list and the symlinks removed.
+
diff --git a/Documentation/driver-model/bus.txt b/Documentation/driver-model/bus.txt
new file mode 100644
index 0000000..5001b75
--- /dev/null
+++ b/Documentation/driver-model/bus.txt
@@ -0,0 +1,160 @@
+
+Bus Types
+
+Definition
+~~~~~~~~~~
+
+struct bus_type {
+ char * name;
+
+ struct subsystem subsys;
+ struct kset drivers;
+ struct kset devices;
+
+ struct bus_attribute * bus_attrs;
+ struct device_attribute * dev_attrs;
+ struct driver_attribute * drv_attrs;
+
+ int (*match)(struct device * dev, struct device_driver * drv);
+ int (*hotplug) (struct device *dev, char **envp,
+ int num_envp, char *buffer, int buffer_size);
+ int (*suspend)(struct device * dev, pm_message_t state);
+ int (*resume)(struct device * dev);
+};
+
+int bus_register(struct bus_type * bus);
+
+
+Declaration
+~~~~~~~~~~~
+
+Each bus type in the kernel (PCI, USB, etc) should declare one static
+object of this type. They must initialize the name field, and may
+optionally initialize the match callback.
+
+struct bus_type pci_bus_type = {
+ .name = "pci",
+ .match = pci_bus_match,
+};
+
+The structure should be exported to drivers in a header file:
+
+extern struct bus_type pci_bus_type;
+
+
+Registration
+~~~~~~~~~~~~
+
+When a bus driver is initialized, it calls bus_register. This
+initializes the rest of the fields in the bus object and inserts it
+into a global list of bus types. Once the bus object is registered,
+the fields in it are usable by the bus driver.
+
+
+Callbacks
+~~~~~~~~~
+
+match(): Attaching Drivers to Devices
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The format of device ID structures and the semantics for comparing
+them are inherently bus-specific. Drivers typically declare an array
+of device IDs of devices they support that reside in a bus-specific
+driver structure.
+
+The purpose of the match callback is provide the bus an opportunity to
+determine if a particular driver supports a particular device by
+comparing the device IDs the driver supports with the device ID of a
+particular device, without sacrificing bus-specific functionality or
+type-safety.
+
+When a driver is registered with the bus, the bus's list of devices is
+iterated over, and the match callback is called for each device that
+does not have a driver associated with it.
+
+
+
+Device and Driver Lists
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The lists of devices and drivers are intended to replace the local
+lists that many buses keep. They are lists of struct devices and
+struct device_drivers, respectively. Bus drivers are free to use the
+lists as they please, but conversion to the bus-specific type may be
+necessary.
+
+The LDM core provides helper functions for iterating over each list.
+
+int bus_for_each_dev(struct bus_type * bus, struct device * start, void * data,
+ int (*fn)(struct device *, void *));
+
+int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+ void * data, int (*fn)(struct device_driver *, void *));
+
+These helpers iterate over the respective list, and call the callback
+for each device or driver in the list. All list accesses are
+synchronized by taking the bus's lock (read currently). The reference
+count on each object in the list is incremented before the callback is
+called; it is decremented after the next object has been obtained. The
+lock is not held when calling the callback.
+
+
+sysfs
+~~~~~~~~
+There is a top-level directory named 'bus'.
+
+Each bus gets a directory in the bus directory, along with two default
+directories:
+
+ /sys/bus/pci/
+ |-- devices
+ `-- drivers
+
+Drivers registered with the bus get a directory in the bus's drivers
+directory:
+
+ /sys/bus/pci/
+ |-- devices
+ `-- drivers
+ |-- Intel ICH
+ |-- Intel ICH Joystick
+ |-- agpgart
+ `-- e100
+
+Each device that is discovered on a bus of that type gets a symlink in
+the bus's devices directory to the device's directory in the physical
+hierarchy:
+
+ /sys/bus/pci/
+ |-- devices
+ | |-- 00:00.0 -> ../../../root/pci0/00:00.0
+ | |-- 00:01.0 -> ../../../root/pci0/00:01.0
+ | `-- 00:02.0 -> ../../../root/pci0/00:02.0
+ `-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+struct bus_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct bus_type *, char * buf);
+ ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+};
+
+Bus drivers can export attributes using the BUS_ATTR macro that works
+similarly to the DEVICE_ATTR macro for devices. For example, a definition
+like this:
+
+static BUS_ATTR(debug,0644,show_debug,store_debug);
+
+is equivalent to declaring:
+
+static bus_attribute bus_attr_debug;
+
+This can then be used to add and remove the attribute from the bus's
+sysfs directory using:
+
+int bus_create_file(struct bus_type *, struct bus_attribute *);
+void bus_remove_file(struct bus_type *, struct bus_attribute *);
+
+
diff --git a/Documentation/driver-model/class.txt b/Documentation/driver-model/class.txt
new file mode 100644
index 0000000..548505f
--- /dev/null
+++ b/Documentation/driver-model/class.txt
@@ -0,0 +1,162 @@
+
+Device Classes
+
+
+Introduction
+~~~~~~~~~~~~
+A device class describes a type of device, like an audio or network
+device. The following device classes have been identified:
+
+<Insert List of Device Classes Here>
+
+
+Each device class defines a set of semantics and a programming interface
+that devices of that class adhere to. Device drivers are the
+implementation of that programming interface for a particular device on
+a particular bus.
+
+Device classes are agnostic with respect to what bus a device resides
+on.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The device class structure looks like:
+
+
+typedef int (*devclass_add)(struct device *);
+typedef void (*devclass_remove)(struct device *);
+
+struct device_class {
+ char * name;
+ rwlock_t lock;
+ u32 devnum;
+ struct list_head node;
+
+ struct list_head drivers;
+ struct list_head intf_list;
+
+ struct driver_dir_entry dir;
+ struct driver_dir_entry device_dir;
+ struct driver_dir_entry driver_dir;
+
+ devclass_add add_device;
+ devclass_remove remove_device;
+};
+
+A typical device class definition would look like:
+
+struct device_class input_devclass = {
+ .name = "input",
+ .add_device = input_add_device,
+ .remove_device = input_remove_device,
+};
+
+Each device class structure should be exported in a header file so it
+can be used by drivers, extensions and interfaces.
+
+Device classes are registered and unregistered with the core using:
+
+int devclass_register(struct device_class * cls);
+void devclass_unregister(struct device_class * cls);
+
+
+Devices
+~~~~~~~
+As devices are bound to drivers, they are added to the device class
+that the driver belongs to. Before the driver model core, this would
+typically happen during the driver's probe() callback, once the device
+has been initialized. It now happens after the probe() callback
+finishes from the core.
+
+The device is enumerated in the class. Each time a device is added to
+the class, the class's devnum field is incremented and assigned to the
+device. The field is never decremented, so if the device is removed
+from the class and re-added, it will receive a different enumerated
+value.
+
+The class is allowed to create a class-specific structure for the
+device and store it in the device's class_data pointer.
+
+There is no list of devices in the device class. Each driver has a
+list of devices that it supports. The device class has a list of
+drivers of that particular class. To access all of the devices in the
+class, iterate over the device lists of each driver in the class.
+
+
+Device Drivers
+~~~~~~~~~~~~~~
+Device drivers are added to device classes when they are registered
+with the core. A driver specifies the class it belongs to by setting
+the struct device_driver::devclass field.
+
+
+sysfs directory structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+There is a top-level sysfs directory named 'class'.
+
+Each class gets a directory in the class directory, along with two
+default subdirectories:
+
+ class/
+ `-- input
+ |-- devices
+ `-- drivers
+
+
+Drivers registered with the class get a symlink in the drivers/ directory
+that points to the driver's directory (under its bus directory):
+
+ class/
+ `-- input
+ |-- devices
+ `-- drivers
+ `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
+
+
+Each device gets a symlink in the devices/ directory that points to the
+device's directory in the physical hierarchy:
+
+ class/
+ `-- input
+ |-- devices
+ | `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
+ `-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+struct devclass_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
+ ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
+};
+
+Class drivers can export attributes using the DEVCLASS_ATTR macro that works
+similarly to the DEVICE_ATTR macro for devices. For example, a definition
+like this:
+
+static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
+
+is equivalent to declaring:
+
+static devclass_attribute devclass_attr_debug;
+
+The bus driver can add and remove the attribute from the class's
+sysfs directory using:
+
+int devclass_create_file(struct device_class *, struct devclass_attribute *);
+void devclass_remove_file(struct device_class *, struct devclass_attribute *);
+
+In the example above, the file will be named 'debug' in placed in the
+class's directory in sysfs.
+
+
+Interfaces
+~~~~~~~~~~
+There may exist multiple mechanisms for accessing the same device of a
+particular class type. Device interfaces describe these mechanisms.
+
+When a device is added to a device class, the core attempts to add it
+to every interface that is registered with the device class.
+
diff --git a/Documentation/driver-model/device.txt b/Documentation/driver-model/device.txt
new file mode 100644
index 0000000..a05ec50
--- /dev/null
+++ b/Documentation/driver-model/device.txt
@@ -0,0 +1,162 @@
+
+The Basic Device Structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+struct device {
+ struct list_head g_list;
+ struct list_head node;
+ struct list_head bus_list;
+ struct list_head driver_list;
+ struct list_head intf_list;
+ struct list_head children;
+ struct device * parent;
+
+ char name[DEVICE_NAME_SIZE];
+ char bus_id[BUS_ID_SIZE];
+
+ spinlock_t lock;
+ atomic_t refcount;
+
+ struct bus_type * bus;
+ struct driver_dir_entry dir;
+
+ u32 class_num;
+
+ struct device_driver *driver;
+ void *driver_data;
+ void *platform_data;
+
+ u32 current_state;
+ unsigned char *saved_state;
+
+ void (*release)(struct device * dev);
+};
+
+Fields
+~~~~~~
+g_list: Node in the global device list.
+
+node: Node in device's parent's children list.
+
+bus_list: Node in device's bus's devices list.
+
+driver_list: Node in device's driver's devices list.
+
+intf_list: List of intf_data. There is one structure allocated for
+ each interface that the device supports.
+
+children: List of child devices.
+
+parent: *** FIXME ***
+
+name: ASCII description of device.
+ Example: " 3Com Corporation 3c905 100BaseTX [Boomerang]"
+
+bus_id: ASCII representation of device's bus position. This
+ field should be a name unique across all devices on the
+ bus type the device belongs to.
+
+ Example: PCI bus_ids are in the form of
+ <bus number>:<slot number>.<function number>
+ This name is unique across all PCI devices in the system.
+
+lock: Spinlock for the device.
+
+refcount: Reference count on the device.
+
+bus: Pointer to struct bus_type that device belongs to.
+
+dir: Device's sysfs directory.
+
+class_num: Class-enumerated value of the device.
+
+driver: Pointer to struct device_driver that controls the device.
+
+driver_data: Driver-specific data.
+
+platform_data: Platform data specific to the device.
+
+ Example: for devices on custom boards, as typical of embedded
+ and SOC based hardware, Linux often uses platform_data to point
+ to board-specific structures describing devices and how they
+ are wired. That can include what ports are available, chip
+ variants, which GPIO pins act in what additional roles, and so
+ on. This shrinks the "Board Support Packages" (BSPs) and
+ minimizes board-specific #ifdefs in drivers.
+
+current_state: Current power state of the device.
+
+saved_state: Pointer to saved state of the device. This is usable by
+ the device driver controlling the device.
+
+release: Callback to free the device after all references have
+ gone away. This should be set by the allocator of the
+ device (i.e. the bus driver that discovered the device).
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The bus driver that discovers the device uses this to register the
+device with the core:
+
+int device_register(struct device * dev);
+
+The bus should initialize the following fields:
+
+ - parent
+ - name
+ - bus_id
+ - bus
+
+A device is removed from the core when its reference count goes to
+0. The reference count can be adjusted using:
+
+struct device * get_device(struct device * dev);
+void put_device(struct device * dev);
+
+get_device() will return a pointer to the struct device passed to it
+if the reference is not already 0 (if it's in the process of being
+removed already).
+
+A driver can access the lock in the device structure using:
+
+void lock_device(struct device * dev);
+void unlock_device(struct device * dev);
+
+
+Attributes
+~~~~~~~~~~
+struct device_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device * dev, char * buf, size_t count, loff_t off);
+ ssize_t (*store)(struct device * dev, const char * buf, size_t count, loff_t off);
+};
+
+Attributes of devices can be exported via drivers using a simple
+procfs-like interface.
+
+Please see Documentation/filesystems/sysfs.txt for more information
+on how sysfs works.
+
+Attributes are declared using a macro called DEVICE_ATTR:
+
+#define DEVICE_ATTR(name,mode,show,store)
+
+Example:
+
+DEVICE_ATTR(power,0644,show_power,store_power);
+
+This declares a structure of type struct device_attribute named
+'dev_attr_power'. This can then be added and removed to the device's
+directory using:
+
+int device_create_file(struct device *device, struct device_attribute * entry);
+void device_remove_file(struct device * dev, struct device_attribute * attr);
+
+Example:
+
+device_create_file(dev,&dev_attr_power);
+device_remove_file(dev,&dev_attr_power);
+
+The file name will be 'power' with a mode of 0644 (-rw-r--r--).
+
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
new file mode 100644
index 0000000..387b8a7
--- /dev/null
+++ b/Documentation/driver-model/devres.txt
@@ -0,0 +1,268 @@
+Devres - Managed Device Resource
+================================
+
+Tejun Heo <teheo@suse.de>
+
+First draft 10 January 2007
+
+
+1. Intro : Huh? Devres?
+2. Devres : Devres in a nutshell
+3. Devres Group : Group devres'es and release them together
+4. Details : Life time rules, calling context, ...
+5. Overhead : How much do we have to pay for this?
+6. List of managed interfaces : Currently implemented managed interfaces
+
+
+ 1. Intro
+ --------
+
+devres came up while trying to convert libata to use iomap. Each
+iomapped address should be kept and unmapped on driver detach. For
+example, a plain SFF ATA controller (that is, good old PCI IDE) in
+native mode makes use of 5 PCI BARs and all of them should be
+maintained.
+
+As with many other device drivers, libata low level drivers have
+sufficient bugs in ->remove and ->probe failure path. Well, yes,
+that's probably because libata low level driver developers are lazy
+bunch, but aren't all low level driver developers? After spending a
+day fiddling with braindamaged hardware with no document or
+braindamaged document, if it's finally working, well, it's working.
+
+For one reason or another, low level drivers don't receive as much
+attention or testing as core code, and bugs on driver detach or
+initialization failure don't happen often enough to be noticeable.
+Init failure path is worse because it's much less travelled while
+needs to handle multiple entry points.
+
+So, many low level drivers end up leaking resources on driver detach
+and having half broken failure path implementation in ->probe() which
+would leak resources or even cause oops when failure occurs. iomap
+adds more to this mix. So do msi and msix.
+
+
+ 2. Devres
+ ---------
+
+devres is basically linked list of arbitrarily sized memory areas
+associated with a struct device. Each devres entry is associated with
+a release function. A devres can be released in several ways. No
+matter what, all devres entries are released on driver detach. On
+release, the associated release function is invoked and then the
+devres entry is freed.
+
+Managed interface is created for resources commonly used by device
+drivers using devres. For example, coherent DMA memory is acquired
+using dma_alloc_coherent(). The managed version is called
+dmam_alloc_coherent(). It is identical to dma_alloc_coherent() except
+for the DMA memory allocated using it is managed and will be
+automatically released on driver detach. Implementation looks like
+the following.
+
+ struct dma_devres {
+ size_t size;
+ void *vaddr;
+ dma_addr_t dma_handle;
+ };
+
+ static void dmam_coherent_release(struct device *dev, void *res)
+ {
+ struct dma_devres *this = res;
+
+ dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
+ }
+
+ dmam_alloc_coherent(dev, size, dma_handle, gfp)
+ {
+ struct dma_devres *dr;
+ void *vaddr;
+
+ dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
+ ...
+
+ /* alloc DMA memory as usual */
+ vaddr = dma_alloc_coherent(...);
+ ...
+
+ /* record size, vaddr, dma_handle in dr */
+ dr->vaddr = vaddr;
+ ...
+
+ devres_add(dev, dr);
+
+ return vaddr;
+ }
+
+If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
+freed whether initialization fails half-way or the device gets
+detached. If most resources are acquired using managed interface, a
+driver can have much simpler init and exit code. Init path basically
+looks like the following.
+
+ my_init_one()
+ {
+ struct mydev *d;
+
+ d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->ring = dmam_alloc_coherent(...);
+ if (!d->ring)
+ return -ENOMEM;
+
+ if (check something)
+ return -EINVAL;
+ ...
+
+ return register_to_upper_layer(d);
+ }
+
+And exit path,
+
+ my_remove_one()
+ {
+ unregister_from_upper_layer(d);
+ shutdown_my_hardware();
+ }
+
+As shown above, low level drivers can be simplified a lot by using
+devres. Complexity is shifted from less maintained low level drivers
+to better maintained higher layer. Also, as init failure path is
+shared with exit path, both can get more testing.
+
+
+ 3. Devres group
+ ---------------
+
+Devres entries can be grouped using devres group. When a group is
+released, all contained normal devres entries and properly nested
+groups are released. One usage is to rollback series of acquired
+resources on failure. For example,
+
+ if (!devres_open_group(dev, NULL, GFP_KERNEL))
+ return -ENOMEM;
+
+ acquire A;
+ if (failed)
+ goto err;
+
+ acquire B;
+ if (failed)
+ goto err;
+ ...
+
+ devres_remove_group(dev, NULL);
+ return 0;
+
+ err:
+ devres_release_group(dev, NULL);
+ return err_code;
+
+As resource acquisition failure usually means probe failure, constructs
+like above are usually useful in midlayer driver (e.g. libata core
+layer) where interface function shouldn't have side effect on failure.
+For LLDs, just returning error code suffices in most cases.
+
+Each group is identified by void *id. It can either be explicitly
+specified by @id argument to devres_open_group() or automatically
+created by passing NULL as @id as in the above example. In both
+cases, devres_open_group() returns the group's id. The returned id
+can be passed to other devres functions to select the target group.
+If NULL is given to those functions, the latest open group is
+selected.
+
+For example, you can do something like the following.
+
+ int my_midlayer_create_something()
+ {
+ if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
+ return -ENOMEM;
+
+ ...
+
+ devres_close_group(dev, my_midlayer_create_something);
+ return 0;
+ }
+
+ void my_midlayer_destroy_something()
+ {
+ devres_release_group(dev, my_midlayer_create_soemthing);
+ }
+
+
+ 4. Details
+ ----------
+
+Lifetime of a devres entry begins on devres allocation and finishes
+when it is released or destroyed (removed and freed) - no reference
+counting.
+
+devres core guarantees atomicity to all basic devres operations and
+has support for single-instance devres types (atomic
+lookup-and-add-if-not-found). Other than that, synchronizing
+concurrent accesses to allocated devres data is caller's
+responsibility. This is usually non-issue because bus ops and
+resource allocations already do the job.
+
+For an example of single-instance devres type, read pcim_iomap_table()
+in lib/devres.c.
+
+All devres interface functions can be called without context if the
+right gfp mask is given.
+
+
+ 5. Overhead
+ -----------
+
+Each devres bookkeeping info is allocated together with requested data
+area. With debug option turned off, bookkeeping info occupies 16
+bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
+up to ull alignment). If singly linked list is used, it can be
+reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
+
+Each devres group occupies 8 pointers. It can be reduced to 6 if
+singly linked list is used.
+
+Memory space overhead on ahci controller with two ports is between 300
+and 400 bytes on 32bit machine after naive conversion (we can
+certainly invest a bit more effort into libata core layer).
+
+
+ 6. List of managed interfaces
+ -----------------------------
+
+IO region
+ devm_request_region()
+ devm_request_mem_region()
+ devm_release_region()
+ devm_release_mem_region()
+
+IRQ
+ devm_request_irq()
+ devm_free_irq()
+
+DMA
+ dmam_alloc_coherent()
+ dmam_free_coherent()
+ dmam_alloc_noncoherent()
+ dmam_free_noncoherent()
+ dmam_declare_coherent_memory()
+ dmam_pool_create()
+ dmam_pool_destroy()
+
+PCI
+ pcim_enable_device() : after success, all PCI ops become managed
+ pcim_pin_device() : keep PCI device enabled after release
+
+IOMAP
+ devm_ioport_map()
+ devm_ioport_unmap()
+ devm_ioremap()
+ devm_ioremap_nocache()
+ devm_iounmap()
+ pcim_iomap()
+ pcim_iounmap()
+ pcim_iomap_table() : array of mapped addresses indexed by BAR
+ pcim_iomap_regions() : do request_region() and iomap() on multiple BARs
diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt
new file mode 100644
index 0000000..8213216
--- /dev/null
+++ b/Documentation/driver-model/driver.txt
@@ -0,0 +1,230 @@
+
+Device Drivers
+
+struct device_driver {
+ char * name;
+ struct bus_type * bus;
+
+ struct completion unloaded;
+ struct kobject kobj;
+ list_t devices;
+
+ struct module *owner;
+
+ int (*probe) (struct device * dev);
+ int (*remove) (struct device * dev);
+
+ int (*suspend) (struct device * dev, pm_message_t state);
+ int (*resume) (struct device * dev);
+};
+
+
+
+Allocation
+~~~~~~~~~~
+
+Device drivers are statically allocated structures. Though there may
+be multiple devices in a system that a driver supports, struct
+device_driver represents the driver as a whole (not a particular
+device instance).
+
+Initialization
+~~~~~~~~~~~~~~
+
+The driver must initialize at least the name and bus fields. It should
+also initialize the devclass field (when it arrives), so it may obtain
+the proper linkage internally. It should also initialize as many of
+the callbacks as possible, though each is optional.
+
+Declaration
+~~~~~~~~~~~
+
+As stated above, struct device_driver objects are statically
+allocated. Below is an example declaration of the eepro100
+driver. This declaration is hypothetical only; it relies on the driver
+being converted completely to the new model.
+
+static struct device_driver eepro100_driver = {
+ .name = "eepro100",
+ .bus = &pci_bus_type,
+
+ .probe = eepro100_probe,
+ .remove = eepro100_remove,
+ .suspend = eepro100_suspend,
+ .resume = eepro100_resume,
+};
+
+Most drivers will not be able to be converted completely to the new
+model because the bus they belong to has a bus-specific structure with
+bus-specific fields that cannot be generalized.
+
+The most common example of this are device ID structures. A driver
+typically defines an array of device IDs that it supports. The format
+of these structures and the semantics for comparing device IDs are
+completely bus-specific. Defining them as bus-specific entities would
+sacrifice type-safety, so we keep bus-specific structures around.
+
+Bus-specific drivers should include a generic struct device_driver in
+the definition of the bus-specific driver. Like this:
+
+struct pci_driver {
+ const struct pci_device_id *id_table;
+ struct device_driver driver;
+};
+
+A definition that included bus-specific fields would look like
+(using the eepro100 driver again):
+
+static struct pci_driver eepro100_driver = {
+ .id_table = eepro100_pci_tbl,
+ .driver = {
+ .name = "eepro100",
+ .bus = &pci_bus_type,
+ .probe = eepro100_probe,
+ .remove = eepro100_remove,
+ .suspend = eepro100_suspend,
+ .resume = eepro100_resume,
+ },
+};
+
+Some may find the syntax of embedded struct initialization awkward or
+even a bit ugly. So far, it's the best way we've found to do what we want...
+
+Registration
+~~~~~~~~~~~~
+
+int driver_register(struct device_driver * drv);
+
+The driver registers the structure on startup. For drivers that have
+no bus-specific fields (i.e. don't have a bus-specific driver
+structure), they would use driver_register and pass a pointer to their
+struct device_driver object.
+
+Most drivers, however, will have a bus-specific structure and will
+need to register with the bus using something like pci_driver_register.
+
+It is important that drivers register their driver structure as early as
+possible. Registration with the core initializes several fields in the
+struct device_driver object, including the reference count and the
+lock. These fields are assumed to be valid at all times and may be
+used by the device model core or the bus driver.
+
+
+Transition Bus Drivers
+~~~~~~~~~~~~~~~~~~~~~~
+
+By defining wrapper functions, the transition to the new model can be
+made easier. Drivers can ignore the generic structure altogether and
+let the bus wrapper fill in the fields. For the callbacks, the bus can
+define generic callbacks that forward the call to the bus-specific
+callbacks of the drivers.
+
+This solution is intended to be only temporary. In order to get class
+information in the driver, the drivers must be modified anyway. Since
+converting drivers to the new model should reduce some infrastructural
+complexity and code size, it is recommended that they are converted as
+class information is added.
+
+Access
+~~~~~~
+
+Once the object has been registered, it may access the common fields of
+the object, like the lock and the list of devices.
+
+int driver_for_each_dev(struct device_driver * drv, void * data,
+ int (*callback)(struct device * dev, void * data));
+
+The devices field is a list of all the devices that have been bound to
+the driver. The LDM core provides a helper function to operate on all
+the devices a driver controls. This helper locks the driver on each
+node access, and does proper reference counting on each device as it
+accesses it.
+
+
+sysfs
+~~~~~
+
+When a driver is registered, a sysfs directory is created in its
+bus's directory. In this directory, the driver can export an interface
+to userspace to control operation of the driver on a global basis;
+e.g. toggling debugging output in the driver.
+
+A future feature of this directory will be a 'devices' directory. This
+directory will contain symlinks to the directories of devices it
+supports.
+
+
+
+Callbacks
+~~~~~~~~~
+
+ int (*probe) (struct device * dev);
+
+The probe() entry is called in task context, with the bus's rwsem locked
+and the driver partially bound to the device. Drivers commonly use
+container_of() to convert "dev" to a bus-specific type, both in probe()
+and other routines. That type often provides device resource data, such
+as pci_dev.resource[] or platform_device.resources, which is used in
+addition to dev->platform_data to initialize the driver.
+
+This callback holds the driver-specific logic to bind the driver to a
+given device. That includes verifying that the device is present, that
+it's a version the driver can handle, that driver data structures can
+be allocated and initialized, and that any hardware can be initialized.
+Drivers often store a pointer to their state with dev_set_drvdata().
+When the driver has successfully bound itself to that device, then probe()
+returns zero and the driver model code will finish its part of binding
+the driver to that device.
+
+A driver's probe() may return a negative errno value to indicate that
+the driver did not bind to this device, in which case it should have
+released all resources it allocated.
+
+ int (*remove) (struct device * dev);
+
+remove is called to unbind a driver from a device. This may be
+called if a device is physically removed from the system, if the
+driver module is being unloaded, during a reboot sequence, or
+in other cases.
+
+It is up to the driver to determine if the device is present or
+not. It should free any resources allocated specifically for the
+device; i.e. anything in the device's driver_data field.
+
+If the device is still present, it should quiesce the device and place
+it into a supported low-power state.
+
+ int (*suspend) (struct device * dev, pm_message_t state);
+
+suspend is called to put the device in a low power state.
+
+ int (*resume) (struct device * dev);
+
+Resume is used to bring a device back from a low power state.
+
+
+Attributes
+~~~~~~~~~~
+struct driver_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device_driver *, char * buf, size_t count, loff_t off);
+ ssize_t (*store)(struct device_driver *, const char * buf, size_t count, loff_t off);
+};
+
+Device drivers can export attributes via their sysfs directories.
+Drivers can declare attributes using a DRIVER_ATTR macro that works
+identically to the DEVICE_ATTR macro.
+
+Example:
+
+DRIVER_ATTR(debug,0644,show_debug,store_debug);
+
+This is equivalent to declaring:
+
+struct driver_attribute driver_attr_debug;
+
+This can then be used to add and remove the attribute from the
+driver's directory using:
+
+int driver_create_file(struct device_driver *, struct driver_attribute *);
+void driver_remove_file(struct device_driver *, struct driver_attribute *);
diff --git a/Documentation/driver-model/interface.txt b/Documentation/driver-model/interface.txt
new file mode 100644
index 0000000..c66912b
--- /dev/null
+++ b/Documentation/driver-model/interface.txt
@@ -0,0 +1,129 @@
+
+Device Interfaces
+
+Introduction
+~~~~~~~~~~~~
+
+Device interfaces are the logical interfaces of device classes that correlate
+directly to userspace interfaces, like device nodes.
+
+Each device class may have multiple interfaces through which you can
+access the same device. An input device may support the mouse interface,
+the 'evdev' interface, and the touchscreen interface. A SCSI disk would
+support the disk interface, the SCSI generic interface, and possibly a raw
+device interface.
+
+Device interfaces are registered with the class they belong to. As devices
+are added to the class, they are added to each interface registered with
+the class. The interface is responsible for determining whether the device
+supports the interface or not.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+
+struct device_interface {
+ char * name;
+ rwlock_t lock;
+ u32 devnum;
+ struct device_class * devclass;
+
+ struct list_head node;
+ struct driver_dir_entry dir;
+
+ int (*add_device)(struct device *);
+ int (*add_device)(struct intf_data *);
+};
+
+int interface_register(struct device_interface *);
+void interface_unregister(struct device_interface *);
+
+
+An interface must specify the device class it belongs to. It is added
+to that class's list of interfaces on registration.
+
+
+Interfaces can be added to a device class at any time. Whenever it is
+added, each device in the class is passed to the interface's
+add_device callback. When an interface is removed, each device is
+removed from the interface.
+
+
+Devices
+~~~~~~~
+Once a device is added to a device class, it is added to each
+interface that is registered with the device class. The class
+is expected to place a class-specific data structure in
+struct device::class_data. The interface can use that (along with
+other fields of struct device) to determine whether or not the driver
+and/or device support that particular interface.
+
+
+Data
+~~~~
+
+struct intf_data {
+ struct list_head node;
+ struct device_interface * intf;
+ struct device * dev;
+ u32 intf_num;
+};
+
+int interface_add_data(struct interface_data *);
+
+The interface is responsible for allocating and initializing a struct
+intf_data and calling interface_add_data() to add it to the device's list
+of interfaces it belongs to. This list will be iterated over when the device
+is removed from the class (instead of all possible interfaces for a class).
+This structure should probably be embedded in whatever per-device data
+structure the interface is allocating anyway.
+
+Devices are enumerated within the interface. This happens in interface_add_data()
+and the enumerated value is stored in the struct intf_data for that device.
+
+sysfs
+~~~~~
+Each interface is given a directory in the directory of the device
+class it belongs to:
+
+Interfaces get a directory in the class's directory as well:
+
+ class/
+ `-- input
+ |-- devices
+ |-- drivers
+ |-- mouse
+ `-- evdev
+
+When a device is added to the interface, a symlink is created that points
+to the device's directory in the physical hierarchy:
+
+ class/
+ `-- input
+ |-- devices
+ | `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
+ |-- drivers
+ | `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
+ |-- mouse
+ | `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
+ `-- evdev
+ `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
+
+
+Future Plans
+~~~~~~~~~~~~
+A device interface is correlated directly with a userspace interface
+for a device, specifically a device node. For instance, a SCSI disk
+exposes at least two interfaces to userspace: the standard SCSI disk
+interface and the SCSI generic interface. It might also export a raw
+device interface.
+
+Many interfaces have a major number associated with them and each
+device gets a minor number. Or, multiple interfaces might share one
+major number, and each will receive a range of minor numbers (like in
+the case of input devices).
+
+These major and minor numbers could be stored in the interface
+structure. Major and minor allocations could happen when the interface
+is registered with the class, or via a helper function.
+
diff --git a/Documentation/driver-model/overview.txt b/Documentation/driver-model/overview.txt
new file mode 100644
index 0000000..07236ed
--- /dev/null
+++ b/Documentation/driver-model/overview.txt
@@ -0,0 +1,107 @@
+The Linux Kernel Device Model
+
+Patrick Mochel <mochel@digitalimplant.org>
+
+Drafted 26 August 2002
+Updated 31 January 2006
+
+
+Overview
+~~~~~~~~
+
+The Linux Kernel Driver Model is a unification of all the disparate driver
+models that were previously used in the kernel. It is intended to augment the
+bus-specific drivers for bridges and devices by consolidating a set of data
+and operations into globally accessible data structures.
+
+Traditional driver models implemented some sort of tree-like structure
+(sometimes just a list) for the devices they control. There wasn't any
+uniformity across the different bus types.
+
+The current driver model provides a common, uniform data model for describing
+a bus and the devices that can appear under the bus. The unified bus
+model includes a set of common attributes which all busses carry, and a set
+of common callbacks, such as device discovery during bus probing, bus
+shutdown, bus power management, etc.
+
+The common device and bridge interface reflects the goals of the modern
+computer: namely the ability to do seamless device "plug and play", power
+management, and hot plug. In particular, the model dictated by Intel and
+Microsoft (namely ACPI) ensures that almost every device on almost any bus
+on an x86-compatible system can work within this paradigm. Of course,
+not every bus is able to support all such operations, although most
+buses support a most of those operations.
+
+
+Downstream Access
+~~~~~~~~~~~~~~~~~
+
+Common data fields have been moved out of individual bus layers into a common
+data structure. These fields must still be accessed by the bus layers,
+and sometimes by the device-specific drivers.
+
+Other bus layers are encouraged to do what has been done for the PCI layer.
+struct pci_dev now looks like this:
+
+struct pci_dev {
+ ...
+
+ struct device dev;
+};
+
+Note first that it is statically allocated. This means only one allocation on
+device discovery. Note also that it is at the _end_ of struct pci_dev. This is
+to make people think about what they're doing when switching between the bus
+driver and the global driver; and to prevent against mindless casts between
+the two.
+
+The PCI bus layer freely accesses the fields of struct device. It knows about
+the structure of struct pci_dev, and it should know the structure of struct
+device. Individual PCI device drivers that have been converted to the current
+driver model generally do not and should not touch the fields of struct device,
+unless there is a strong compelling reason to do so.
+
+This abstraction is prevention of unnecessary pain during transitional phases.
+If the name of the field changes or is removed, then every downstream driver
+will break. On the other hand, if only the bus layer (and not the device
+layer) accesses struct device, it is only that layer that needs to change.
+
+
+User Interface
+~~~~~~~~~~~~~~
+
+By virtue of having a complete hierarchical view of all the devices in the
+system, exporting a complete hierarchical view to userspace becomes relatively
+easy. This has been accomplished by implementing a special purpose virtual
+file system named sysfs. It is hence possible for the user to mount the
+whole sysfs filesystem anywhere in userspace.
+
+This can be done permanently by providing the following entry into the
+/etc/fstab (under the provision that the mount point does exist, of course):
+
+none /sys sysfs defaults 0 0
+
+Or by hand on the command line:
+
+# mount -t sysfs sysfs /sys
+
+Whenever a device is inserted into the tree, a directory is created for it.
+This directory may be populated at each layer of discovery - the global layer,
+the bus layer, or the device layer.
+
+The global layer currently creates two files - 'name' and 'power'. The
+former only reports the name of the device. The latter reports the
+current power state of the device. It will also be used to set the current
+power state.
+
+The bus layer may also create files for the devices it finds while probing the
+bus. For example, the PCI layer currently creates 'irq' and 'resource' files
+for each PCI device.
+
+A device-specific driver may also export files in its directory to expose
+device-specific data or tunable interfaces.
+
+More information about the sysfs directory layout can be found in
+the other documents in this directory and in the file
+Documentation/filesystems/sysfs.txt.
+
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt
new file mode 100644
index 0000000..83009fd
--- /dev/null
+++ b/Documentation/driver-model/platform.txt
@@ -0,0 +1,171 @@
+Platform Devices and Drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+See <linux/platform_device.h> for the driver model interface to the
+platform bus: platform_device, and platform_driver. This pseudo-bus
+is used to connect devices on busses with minimal infrastructure,
+like those used to integrate peripherals on many system-on-chip
+processors, or some "legacy" PC interconnects; as opposed to large
+formally specified ones like PCI or USB.
+
+
+Platform devices
+~~~~~~~~~~~~~~~~
+Platform devices are devices that typically appear as autonomous
+entities in the system. This includes legacy port-based devices and
+host bridges to peripheral buses, and most controllers integrated
+into system-on-chip platforms. What they usually have in common
+is direct addressing from a CPU bus. Rarely, a platform_device will
+be connected through a segment of some other kind of bus; but its
+registers will still be directly addressable.
+
+Platform devices are given a name, used in driver binding, and a
+list of resources such as addresses and IRQs.
+
+struct platform_device {
+ const char *name;
+ u32 id;
+ struct device dev;
+ u32 num_resources;
+ struct resource *resource;
+};
+
+
+Platform drivers
+~~~~~~~~~~~~~~~~
+Platform drivers follow the standard driver model convention, where
+discovery/enumeration is handled outside the drivers, and drivers
+provide probe() and remove() methods. They support power management
+and shutdown notifications using the standard conventions.
+
+struct platform_driver {
+ int (*probe)(struct platform_device *);
+ int (*remove)(struct platform_device *);
+ void (*shutdown)(struct platform_device *);
+ int (*suspend)(struct platform_device *, pm_message_t state);
+ int (*suspend_late)(struct platform_device *, pm_message_t state);
+ int (*resume_early)(struct platform_device *);
+ int (*resume)(struct platform_device *);
+ struct device_driver driver;
+};
+
+Note that probe() should general verify that the specified device hardware
+actually exists; sometimes platform setup code can't be sure. The probing
+can use device resources, including clocks, and device platform_data.
+
+Platform drivers register themselves the normal way:
+
+ int platform_driver_register(struct platform_driver *drv);
+
+Or, in common situations where the device is known not to be hot-pluggable,
+the probe() routine can live in an init section to reduce the driver's
+runtime memory footprint:
+
+ int platform_driver_probe(struct platform_driver *drv,
+ int (*probe)(struct platform_device *))
+
+
+Device Enumeration
+~~~~~~~~~~~~~~~~~~
+As a rule, platform specific (and often board-specific) setup code will
+register platform devices:
+
+ int platform_device_register(struct platform_device *pdev);
+
+ int platform_add_devices(struct platform_device **pdevs, int ndev);
+
+The general rule is to register only those devices that actually exist,
+but in some cases extra devices might be registered. For example, a kernel
+might be configured to work with an external network adapter that might not
+be populated on all boards, or likewise to work with an integrated controller
+that some boards might not hook up to any peripherals.
+
+In some cases, boot firmware will export tables describing the devices
+that are populated on a given board. Without such tables, often the
+only way for system setup code to set up the correct devices is to build
+a kernel for a specific target board. Such board-specific kernels are
+common with embedded and custom systems development.
+
+In many cases, the memory and IRQ resources associated with the platform
+device are not enough to let the device's driver work. Board setup code
+will often provide additional information using the device's platform_data
+field to hold additional information.
+
+Embedded systems frequently need one or more clocks for platform devices,
+which are normally kept off until they're actively needed (to save power).
+System setup also associates those clocks with the device, so that that
+calls to clk_get(&pdev->dev, clock_name) return them as needed.
+
+
+Legacy Drivers: Device Probing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Some drivers are not fully converted to the driver model, because they take
+on a non-driver role: the driver registers its platform device, rather than
+leaving that for system infrastructure. Such drivers can't be hotplugged
+or coldplugged, since those mechanisms require device creation to be in a
+different system component than the driver.
+
+The only "good" reason for this is to handle older system designs which, like
+original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
+configuration. Newer systems have largely abandoned that model, in favor of
+bus-level support for dynamic configuration (PCI, USB), or device tables
+provided by the boot firmware (e.g. PNPACPI on x86). There are too many
+conflicting options about what might be where, and even educated guesses by
+an operating system will be wrong often enough to make trouble.
+
+This style of driver is discouraged. If you're updating such a driver,
+please try to move the device enumeration to a more appropriate location,
+outside the driver. This will usually be cleanup, since such drivers
+tend to already have "normal" modes, such as ones using device nodes that
+were created by PNP or by platform device setup.
+
+None the less, there are some APIs to support such legacy drivers. Avoid
+using these calls except with such hotplug-deficient drivers.
+
+ struct platform_device *platform_device_alloc(
+ const char *name, int id);
+
+You can use platform_device_alloc() to dynamically allocate a device, which
+you will then initialize with resources and platform_device_register().
+A better solution is usually:
+
+ struct platform_device *platform_device_register_simple(
+ const char *name, int id,
+ struct resource *res, unsigned int nres);
+
+You can use platform_device_register_simple() as a one-step call to allocate
+and register a device.
+
+
+Device Naming and Driver Binding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The platform_device.dev.bus_id is the canonical name for the devices.
+It's built from two components:
+
+ * platform_device.name ... which is also used to for driver matching.
+
+ * platform_device.id ... the device instance number, or else "-1"
+ to indicate there's only one.
+
+These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
+"serial/3" indicates bus_id "serial.3"; both would use the platform_driver
+named "serial". While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
+and use the platform_driver called "my_rtc".
+
+Driver binding is performed automatically by the driver core, invoking
+driver probe() after finding a match between device and driver. If the
+probe() succeeds, the driver and device are bound as usual. There are
+three different ways to find such a match:
+
+ - Whenever a device is registered, the drivers for that bus are
+ checked for matches. Platform devices should be registered very
+ early during system boot.
+
+ - When a driver is registered using platform_driver_register(), all
+ unbound devices on that bus are checked for matches. Drivers
+ usually register later during booting, or by module loading.
+
+ - Registering a driver using platform_driver_probe() works just like
+ using platform_driver_register(), except that the driver won't
+ be probed later if another device registers. (Which is OK, since
+ this interface is only for use with non-hotpluggable devices.)
+
diff --git a/Documentation/driver-model/porting.txt b/Documentation/driver-model/porting.txt
new file mode 100644
index 0000000..92d86f7
--- /dev/null
+++ b/Documentation/driver-model/porting.txt
@@ -0,0 +1,445 @@
+
+Porting Drivers to the New Driver Model
+
+Patrick Mochel
+
+7 January 2003
+
+
+Overview
+
+Please refer to Documentation/driver-model/*.txt for definitions of
+various driver types and concepts.
+
+Most of the work of porting devices drivers to the new model happens
+at the bus driver layer. This was intentional, to minimize the
+negative effect on kernel drivers, and to allow a gradual transition
+of bus drivers.
+
+In a nutshell, the driver model consists of a set of objects that can
+be embedded in larger, bus-specific objects. Fields in these generic
+objects can replace fields in the bus-specific objects.
+
+The generic objects must be registered with the driver model core. By
+doing so, they will exported via the sysfs filesystem. sysfs can be
+mounted by doing
+
+ # mount -t sysfs sysfs /sys
+
+
+
+The Process
+
+Step 0: Read include/linux/device.h for object and function definitions.
+
+Step 1: Registering the bus driver.
+
+
+- Define a struct bus_type for the bus driver.
+
+struct bus_type pci_bus_type = {
+ .name = "pci",
+};
+
+
+- Register the bus type.
+ This should be done in the initialization function for the bus type,
+ which is usually the module_init(), or equivalent, function.
+
+static int __init pci_driver_init(void)
+{
+ return bus_register(&pci_bus_type);
+}
+
+subsys_initcall(pci_driver_init);
+
+
+ The bus type may be unregistered (if the bus driver may be compiled
+ as a module) by doing:
+
+ bus_unregister(&pci_bus_type);
+
+
+- Export the bus type for others to use.
+
+ Other code may wish to reference the bus type, so declare it in a
+ shared header file and export the symbol.
+
+From include/linux/pci.h:
+
+extern struct bus_type pci_bus_type;
+
+
+From file the above code appears in:
+
+EXPORT_SYMBOL(pci_bus_type);
+
+
+
+- This will cause the bus to show up in /sys/bus/pci/ with two
+ subdirectories: 'devices' and 'drivers'.
+
+# tree -d /sys/bus/pci/
+/sys/bus/pci/
+|-- devices
+`-- drivers
+
+
+
+Step 2: Registering Devices.
+
+struct device represents a single device. It mainly contains metadata
+describing the relationship the device has to other entities.
+
+
+- Embed a struct device in the bus-specific device type.
+
+
+struct pci_dev {
+ ...
+ struct device dev; /* Generic device interface */
+ ...
+};
+
+ It is recommended that the generic device not be the first item in
+ the struct to discourage programmers from doing mindless casts
+ between the object types. Instead macros, or inline functions,
+ should be created to convert from the generic object type.
+
+
+#define to_pci_dev(n) container_of(n, struct pci_dev, dev)
+
+or
+
+static inline struct pci_dev * to_pci_dev(struct kobject * kobj)
+{
+ return container_of(n, struct pci_dev, dev);
+}
+
+ This allows the compiler to verify type-safety of the operations
+ that are performed (which is Good).
+
+
+- Initialize the device on registration.
+
+ When devices are discovered or registered with the bus type, the
+ bus driver should initialize the generic device. The most important
+ things to initialize are the bus_id, parent, and bus fields.
+
+ The bus_id is an ASCII string that contains the device's address on
+ the bus. The format of this string is bus-specific. This is
+ necessary for representing devices in sysfs.
+
+ parent is the physical parent of the device. It is important that
+ the bus driver sets this field correctly.
+
+ The driver model maintains an ordered list of devices that it uses
+ for power management. This list must be in order to guarantee that
+ devices are shutdown before their physical parents, and vice versa.
+ The order of this list is determined by the parent of registered
+ devices.
+
+ Also, the location of the device's sysfs directory depends on a
+ device's parent. sysfs exports a directory structure that mirrors
+ the device hierarchy. Accurately setting the parent guarantees that
+ sysfs will accurately represent the hierarchy.
+
+ The device's bus field is a pointer to the bus type the device
+ belongs to. This should be set to the bus_type that was declared
+ and initialized before.
+
+ Optionally, the bus driver may set the device's name and release
+ fields.
+
+ The name field is an ASCII string describing the device, like
+
+ "ATI Technologies Inc Radeon QD"
+
+ The release field is a callback that the driver model core calls
+ when the device has been removed, and all references to it have
+ been released. More on this in a moment.
+
+
+- Register the device.
+
+ Once the generic device has been initialized, it can be registered
+ with the driver model core by doing:
+
+ device_register(&dev->dev);
+
+ It can later be unregistered by doing:
+
+ device_unregister(&dev->dev);
+
+ This should happen on buses that support hotpluggable devices.
+ If a bus driver unregisters a device, it should not immediately free
+ it. It should instead wait for the driver model core to call the
+ device's release method, then free the bus-specific object.
+ (There may be other code that is currently referencing the device
+ structure, and it would be rude to free the device while that is
+ happening).
+
+
+ When the device is registered, a directory in sysfs is created.
+ The PCI tree in sysfs looks like:
+
+/sys/devices/pci0/
+|-- 00:00.0
+|-- 00:01.0
+| `-- 01:00.0
+|-- 00:02.0
+| `-- 02:1f.0
+| `-- 03:00.0
+|-- 00:1e.0
+| `-- 04:04.0
+|-- 00:1f.0
+|-- 00:1f.1
+| |-- ide0
+| | |-- 0.0
+| | `-- 0.1
+| `-- ide1
+| `-- 1.0
+|-- 00:1f.2
+|-- 00:1f.3
+`-- 00:1f.5
+
+ Also, symlinks are created in the bus's 'devices' directory
+ that point to the device's directory in the physical hierarchy.
+
+/sys/bus/pci/devices/
+|-- 00:00.0 -> ../../../devices/pci0/00:00.0
+|-- 00:01.0 -> ../../../devices/pci0/00:01.0
+|-- 00:02.0 -> ../../../devices/pci0/00:02.0
+|-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
+|-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
+|-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
+|-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
+|-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
+|-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
+|-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
+|-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
+|-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
+`-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
+
+
+
+Step 3: Registering Drivers.
+
+struct device_driver is a simple driver structure that contains a set
+of operations that the driver model core may call.
+
+
+- Embed a struct device_driver in the bus-specific driver.
+
+ Just like with devices, do something like:
+
+struct pci_driver {
+ ...
+ struct device_driver driver;
+};
+
+
+- Initialize the generic driver structure.
+
+ When the driver registers with the bus (e.g. doing pci_register_driver()),
+ initialize the necessary fields of the driver: the name and bus
+ fields.
+
+
+- Register the driver.
+
+ After the generic driver has been initialized, call
+
+ driver_register(&drv->driver);
+
+ to register the driver with the core.
+
+ When the driver is unregistered from the bus, unregister it from the
+ core by doing:
+
+ driver_unregister(&drv->driver);
+
+ Note that this will block until all references to the driver have
+ gone away. Normally, there will not be any.
+
+
+- Sysfs representation.
+
+ Drivers are exported via sysfs in their bus's 'driver's directory.
+ For example:
+
+/sys/bus/pci/drivers/
+|-- 3c59x
+|-- Ensoniq AudioPCI
+|-- agpgart-amdk7
+|-- e100
+`-- serial
+
+
+Step 4: Define Generic Methods for Drivers.
+
+struct device_driver defines a set of operations that the driver model
+core calls. Most of these operations are probably similar to
+operations the bus already defines for drivers, but taking different
+parameters.
+
+It would be difficult and tedious to force every driver on a bus to
+simultaneously convert their drivers to generic format. Instead, the
+bus driver should define single instances of the generic methods that
+forward call to the bus-specific drivers. For instance:
+
+
+static int pci_device_remove(struct device * dev)
+{
+ struct pci_dev * pci_dev = to_pci_dev(dev);
+ struct pci_driver * drv = pci_dev->driver;
+
+ if (drv) {
+ if (drv->remove)
+ drv->remove(pci_dev);
+ pci_dev->driver = NULL;
+ }
+ return 0;
+}
+
+
+The generic driver should be initialized with these methods before it
+is registered.
+
+ /* initialize common driver fields */
+ drv->driver.name = drv->name;
+ drv->driver.bus = &pci_bus_type;
+ drv->driver.probe = pci_device_probe;
+ drv->driver.resume = pci_device_resume;
+ drv->driver.suspend = pci_device_suspend;
+ drv->driver.remove = pci_device_remove;
+
+ /* register with core */
+ driver_register(&drv->driver);
+
+
+Ideally, the bus should only initialize the fields if they are not
+already set. This allows the drivers to implement their own generic
+methods.
+
+
+Step 5: Support generic driver binding.
+
+The model assumes that a device or driver can be dynamically
+registered with the bus at any time. When registration happens,
+devices must be bound to a driver, or drivers must be bound to all
+devices that it supports.
+
+A driver typically contains a list of device IDs that it supports. The
+bus driver compares these IDs to the IDs of devices registered with it.
+The format of the device IDs, and the semantics for comparing them are
+bus-specific, so the generic model does attempt to generalize them.
+
+Instead, a bus may supply a method in struct bus_type that does the
+comparison:
+
+ int (*match)(struct device * dev, struct device_driver * drv);
+
+match should return '1' if the driver supports the device, and '0'
+otherwise.
+
+When a device is registered, the bus's list of drivers is iterated
+over. bus->match() is called for each one until a match is found.
+
+When a driver is registered, the bus's list of devices is iterated
+over. bus->match() is called for each device that is not already
+claimed by a driver.
+
+When a device is successfully bound to a driver, device->driver is
+set, the device is added to a per-driver list of devices, and a
+symlink is created in the driver's sysfs directory that points to the
+device's physical directory:
+
+/sys/bus/pci/drivers/
+|-- 3c59x
+| `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
+|-- Ensoniq AudioPCI
+|-- agpgart-amdk7
+| `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
+|-- e100
+| `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
+`-- serial
+
+
+This driver binding should replace the existing driver binding
+mechanism the bus currently uses.
+
+
+Step 6: Supply a hotplug callback.
+
+Whenever a device is registered with the driver model core, the
+userspace program /sbin/hotplug is called to notify userspace.
+Users can define actions to perform when a device is inserted or
+removed.
+
+The driver model core passes several arguments to userspace via
+environment variables, including
+
+- ACTION: set to 'add' or 'remove'
+- DEVPATH: set to the device's physical path in sysfs.
+
+A bus driver may also supply additional parameters for userspace to
+consume. To do this, a bus must implement the 'hotplug' method in
+struct bus_type:
+
+ int (*hotplug) (struct device *dev, char **envp,
+ int num_envp, char *buffer, int buffer_size);
+
+This is called immediately before /sbin/hotplug is executed.
+
+
+Step 7: Cleaning up the bus driver.
+
+The generic bus, device, and driver structures provide several fields
+that can replace those defined privately to the bus driver.
+
+- Device list.
+
+struct bus_type contains a list of all devices registered with the bus
+type. This includes all devices on all instances of that bus type.
+An internal list that the bus uses may be removed, in favor of using
+this one.
+
+The core provides an iterator to access these devices.
+
+int bus_for_each_dev(struct bus_type * bus, struct device * start,
+ void * data, int (*fn)(struct device *, void *));
+
+
+- Driver list.
+
+struct bus_type also contains a list of all drivers registered with
+it. An internal list of drivers that the bus driver maintains may
+be removed in favor of using the generic one.
+
+The drivers may be iterated over, like devices:
+
+int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+ void * data, int (*fn)(struct device_driver *, void *));
+
+
+Please see drivers/base/bus.c for more information.
+
+
+- rwsem
+
+struct bus_type contains an rwsem that protects all core accesses to
+the device and driver lists. This can be used by the bus driver
+internally, and should be used when accessing the device or driver
+lists the bus maintains.
+
+
+- Device and driver fields.
+
+Some of the fields in struct device and struct device_driver duplicate
+fields in the bus-specific representations of these objects. Feel free
+to remove the bus-specific ones and favor the generic ones. Note
+though, that this will likely mean fixing up all the drivers that
+reference the bus-specific fields (though those should all be 1-line
+changes).
+
diff --git a/Documentation/dvb/README.dvb-usb b/Documentation/dvb/README.dvb-usb
new file mode 100644
index 0000000..bf2a9cd
--- /dev/null
+++ b/Documentation/dvb/README.dvb-usb
@@ -0,0 +1,232 @@
+Documentation for dvb-usb-framework module and its devices
+
+Idea behind the dvb-usb-framework
+=================================
+
+In March 2005 I got the new Twinhan USB2.0 DVB-T device. They provided specs and a firmware.
+
+Quite keen I wanted to put the driver (with some quirks of course) into dibusb.
+After reading some specs and doing some USB snooping, it realized, that the
+dibusb-driver would be a complete mess afterwards. So I decided to do it in a
+different way: With the help of a dvb-usb-framework.
+
+The framework provides generic functions (mostly kernel API calls), such as:
+
+- Transport Stream URB handling in conjunction with dvb-demux-feed-control
+ (bulk and isoc are supported)
+- registering the device for the DVB-API
+- registering an I2C-adapter if applicable
+- remote-control/input-device handling
+- firmware requesting and loading (currently just for the Cypress USB
+ controllers)
+- other functions/methods which can be shared by several drivers (such as
+ functions for bulk-control-commands)
+- TODO: a I2C-chunker. It creates device-specific chunks of register-accesses
+ depending on length of a register and the number of values that can be
+ multi-written and multi-read.
+
+The source code of the particular DVB USB devices does just the communication
+with the device via the bus. The connection between the DVB-API-functionality
+is done via callbacks, assigned in a static device-description (struct
+dvb_usb_device) each device-driver has to have.
+
+For an example have a look in drivers/media/dvb/dvb-usb/vp7045*.
+
+Objective is to migrate all the usb-devices (dibusb, cinergyT2, maybe the
+ttusb; flexcop-usb already benefits from the generic flexcop-device) to use
+the dvb-usb-lib.
+
+TODO: dynamic enabling and disabling of the pid-filter in regard to number of
+feeds requested.
+
+Supported devices
+========================
+
+See the LinuxTV DVB Wiki at www.linuxtv.org for a complete list of
+cards/drivers/firmwares:
+
+http://www.linuxtv.org/wiki/index.php/DVB_USB
+
+0. History & News:
+ 2005-06-30 - added support for WideView WT-220U (Thanks to Steve Chang)
+ 2005-05-30 - added basic isochronous support to the dvb-usb-framework
+ added support for Conexant Hybrid reference design and Nebula DigiTV USB
+ 2005-04-17 - all dibusb devices ported to make use of the dvb-usb-framework
+ 2005-04-02 - re-enabled and improved remote control code.
+ 2005-03-31 - ported the Yakumo/Hama/Typhoon DVB-T USB2.0 device to dvb-usb.
+ 2005-03-30 - first commit of the dvb-usb-module based on the dibusb-source. First device is a new driver for the
+ TwinhanDTV Alpha / MagicBox II USB2.0-only DVB-T device.
+
+ (change from dvb-dibusb to dvb-usb)
+ 2005-03-28 - added support for the AVerMedia AverTV DVB-T USB2.0 device (Thanks to Glen Harris and Jiun-Kuei Jung, AVerMedia)
+ 2005-03-14 - added support for the Typhoon/Yakumo/HAMA DVB-T mobile USB2.0
+ 2005-02-11 - added support for the KWorld/ADSTech Instant DVB-T USB2.0. Thanks a lot to Joachim von Caron
+ 2005-02-02 - added support for the Hauppauge Win-TV Nova-T USB2
+ 2005-01-31 - distorted streaming is gone for USB1.1 devices
+ 2005-01-13 - moved the mirrored pid_filter_table back to dvb-dibusb
+ - first almost working version for HanfTek UMT-010
+ - found out, that Yakumo/HAMA/Typhoon are predecessors of the HanfTek UMT-010
+ 2005-01-10 - refactoring completed, now everything is very delightful
+ - tuner quirks for some weird devices (Artec T1 AN2235 device has sometimes a
+ Panasonic Tuner assembled). Tunerprobing implemented. Thanks a lot to Gunnar Wittich.
+ 2004-12-29 - after several days of struggling around bug of no returning URBs fixed.
+ 2004-12-26 - refactored the dibusb-driver, splitted into separate files
+ - i2c-probing enabled
+ 2004-12-06 - possibility for demod i2c-address probing
+ - new usb IDs (Compro, Artec)
+ 2004-11-23 - merged changes from DiB3000MC_ver2.1
+ - revised the debugging
+ - possibility to deliver the complete TS for USB2.0
+ 2004-11-21 - first working version of the dib3000mc/p frontend driver.
+ 2004-11-12 - added additional remote control keys. Thanks to Uwe Hanke.
+ 2004-11-07 - added remote control support. Thanks to David Matthews.
+ 2004-11-05 - added support for a new devices (Grandtec/Avermedia/Artec)
+ - merged my changes (for dib3000mb/dibusb) to the FE_REFACTORING, because it became HEAD
+ - moved transfer control (pid filter, fifo control) from usb driver to frontend, it seems
+ better settled there (added xfer_ops-struct)
+ - created a common files for frontends (mc/p/mb)
+ 2004-09-28 - added support for a new device (Unkown, vendor ID is Hyper-Paltek)
+ 2004-09-20 - added support for a new device (Compro DVB-U2000), thanks
+ to Amaury Demol for reporting
+ - changed usb TS transfer method (several urbs, stopping transfer
+ before setting a new pid)
+ 2004-09-13 - added support for a new device (Artec T1 USB TVBOX), thanks
+ to Christian Motschke for reporting
+ 2004-09-05 - released the dibusb device and dib3000mb-frontend driver
+
+ (old news for vp7041.c)
+ 2004-07-15 - found out, by accident, that the device has a TUA6010XS for
+ PLL
+ 2004-07-12 - figured out, that the driver should also work with the
+ CTS Portable (Chinese Television System)
+ 2004-07-08 - firmware-extraction-2.422-problem solved, driver is now working
+ properly with firmware extracted from 2.422
+ - #if for 2.6.4 (dvb), compile issue
+ - changed firmware handling, see vp7041.txt sec 1.1
+ 2004-07-02 - some tuner modifications, v0.1, cleanups, first public
+ 2004-06-28 - now using the dvb_dmx_swfilter_packets, everything
+ runs fine now
+ 2004-06-27 - able to watch and switching channels (pre-alpha)
+ - no section filtering yet
+ 2004-06-06 - first TS received, but kernel oops :/
+ 2004-05-14 - firmware loader is working
+ 2004-05-11 - start writing the driver
+
+1. How to use?
+1.1. Firmware
+
+Most of the USB drivers need to download a firmware to the device before start
+working.
+
+Have a look at the Wikipage for the DVB-USB-drivers to find out, which firmware
+you need for your device:
+
+http://www.linuxtv.org/wiki/index.php/DVB_USB
+
+1.2. Compiling
+
+Since the driver is in the linux kernel, activating the driver in
+your favorite config-environment should sufficient. I recommend
+to compile the driver as module. Hotplug does the rest.
+
+If you use dvb-kernel enter the build-2.6 directory run 'make' and 'insmod.sh
+load' afterwards.
+
+1.3. Loading the drivers
+
+Hotplug is able to load the driver, when it is needed (because you plugged
+in the device).
+
+If you want to enable debug output, you have to load the driver manually and
+from withing the dvb-kernel cvs repository.
+
+first have a look, which debug level are available:
+
+modinfo dvb-usb
+modinfo dvb-usb-vp7045
+etc.
+
+modprobe dvb-usb debug=<level>
+modprobe dvb-usb-vp7045 debug=<level>
+etc.
+
+should do the trick.
+
+When the driver is loaded successfully, the firmware file was in
+the right place and the device is connected, the "Power"-LED should be
+turned on.
+
+At this point you should be able to start a dvb-capable application. I'm use
+(t|s)zap, mplayer and dvbscan to test the basics. VDR-xine provides the
+long-term test scenario.
+
+2. Known problems and bugs
+
+- Don't remove the USB device while running an DVB application, your system
+ will go crazy or die most likely.
+
+2.1. Adding support for devices
+
+TODO
+
+2.2. USB1.1 Bandwidth limitation
+
+A lot of the currently supported devices are USB1.1 and thus they have a
+maximum bandwidth of about 5-6 MBit/s when connected to a USB2.0 hub.
+This is not enough for receiving the complete transport stream of a
+DVB-T channel (which is about 16 MBit/s). Normally this is not a
+problem, if you only want to watch TV (this does not apply for HDTV),
+but watching a channel while recording another channel on the same
+frequency simply does not work very well. This applies to all USB1.1
+DVB-T devices, not just the dvb-usb-devices)
+
+The bug, where the TS is distorted by a heavy usage of the device is gone
+definitely. All dvb-usb-devices I was using (Twinhan, Kworld, DiBcom) are
+working like charm now with VDR. Sometimes I even was able to record a channel
+and watch another one.
+
+2.3. Comments
+
+Patches, comments and suggestions are very very welcome.
+
+3. Acknowledgements
+ Amaury Demol (ademol@dibcom.fr) and Francois Kanounnikoff from DiBcom for
+ providing specs, code and help, on which the dvb-dibusb, dib3000mb and
+ dib3000mc are based.
+
+ David Matthews for identifying a new device type (Artec T1 with AN2235)
+ and for extending dibusb with remote control event handling. Thank you.
+
+ Alex Woods for frequently answering question about usb and dvb
+ stuff, a big thank you.
+
+ Bernd Wagner for helping with huge bug reports and discussions.
+
+ Gunnar Wittich and Joachim von Caron for their trust for providing
+ root-shells on their machines to implement support for new devices.
+
+ Allan Third and Michael Hutchinson for their help to write the Nebula
+ digitv-driver.
+
+ Glen Harris for bringing up, that there is a new dibusb-device and Jiun-Kuei
+ Jung from AVerMedia who kindly provided a special firmware to get the device
+ up and running in Linux.
+
+ Jennifer Chen, Jeff and Jack from Twinhan for kindly supporting by
+ writing the vp7045-driver.
+
+ Steve Chang from WideView for providing information for new devices and
+ firmware files.
+
+ Michael Paxton for submitting remote control keymaps.
+
+ Some guys on the linux-dvb mailing list for encouraging me.
+
+ Peter Schildmann >peter.schildmann-nospam-at-web.de< for his
+ user-level firmware loader, which saves a lot of time
+ (when writing the vp7041 driver)
+
+ Ulf Hermenau for helping me out with traditional chinese.
+
+ André Smoktun and Christian Frömmel for supporting me with
+ hardware and listening to my problems very patiently.
diff --git a/Documentation/dvb/README.flexcop b/Documentation/dvb/README.flexcop
new file mode 100644
index 0000000..5515469
--- /dev/null
+++ b/Documentation/dvb/README.flexcop
@@ -0,0 +1,205 @@
+This README escorted the skystar2-driver rewriting procedure. It describes the
+state of the new flexcop-driver set and some internals are written down here
+too.
+
+This document hopefully describes things about the flexcop and its
+device-offsprings. Goal was to write an easy-to-write and easy-to-read set of
+drivers based on the skystar2.c and other information.
+
+Remark: flexcop-pci.c was a copy of skystar2.c, but every line has been
+touched and rewritten.
+
+History & News
+==============
+ 2005-04-01 - correct USB ISOC transfers (thanks to Vadim Catana)
+
+
+
+
+General coding processing
+=========================
+
+We should proceed as follows (as long as no one complains):
+
+0) Think before start writing code!
+
+1) rewriting the skystar2.c with the help of the flexcop register descriptions
+and splitting up the files to a pci-bus-part and a flexcop-part.
+The new driver will be called b2c2-flexcop-pci.ko/b2c2-flexcop-usb.ko for the
+device-specific part and b2c2-flexcop.ko for the common flexcop-functions.
+
+2) Search for errors in the leftover of flexcop-pci.c (compare with pluto2.c
+and other pci drivers)
+
+3) make some beautification (see 'Improvements when rewriting (refactoring) is
+done')
+
+4) Testing the new driver and maybe substitute the skystar2.c with it, to reach
+a wider tester audience.
+
+5) creating an usb-bus-part using the already written flexcop code for the pci
+card.
+
+Idea: create a kernel-object for the flexcop and export all important
+functions. This option saves kernel-memory, but maybe a lot of functions have
+to be exported to kernel namespace.
+
+
+Current situation
+=================
+
+0) Done :)
+1) Done (some minor issues left)
+2) Done
+3) Not ready yet, more information is necessary
+4) next to be done (see the table below)
+5) USB driver is working (yes, there are some minor issues)
+
+What seems to be ready?
+-----------------------
+
+1) Rewriting
+1a) i2c is cut off from the flexcop-pci.c and seems to work
+1b) moved tuner and demod stuff from flexcop-pci.c to flexcop-tuner-fe.c
+1c) moved lnb and diseqc stuff from flexcop-pci.c to flexcop-tuner-fe.c
+1e) eeprom (reading MAC address)
+1d) sram (no dynamic sll size detection (commented out) (using default as JJ told me))
+1f) misc. register accesses for reading parameters (e.g. resetting, revision)
+1g) pid/mac filter (flexcop-hw-filter.c)
+1i) dvb-stuff initialization in flexcop.c (done)
+1h) dma stuff (now just using the size-irq, instead of all-together, to be done)
+1j) remove flexcop initialization from flexcop-pci.c completely (done)
+1l) use a well working dma IRQ method (done, see 'Known bugs and problems and TODO')
+1k) cleanup flexcop-files (remove unused EXPORT_SYMBOLs, make static from
+non-static where possible, moved code to proper places)
+
+2) Search for errors in the leftover of flexcop-pci.c (partially done)
+5a) add MAC address reading
+5c) feeding of ISOC data to the software demux (format of the isochronous data
+and speed optimization, no real error) (thanks to Vadim Catana)
+
+What to do in the near future?
+--------------------------------------
+(no special order here)
+
+5) USB driver
+5b) optimize isoc-transfer (submitting/killing isoc URBs when transfer is starting)
+
+Testing changes
+---------------
+
+O = item is working
+P = item is partially working
+X = item is not working
+N = item does not apply here
+<empty field> = item need to be examined
+
+ | PCI | USB
+item | mt352 | nxt2002 | stv0299 | mt312 | mt352 | nxt2002 | stv0299 | mt312
+-------+-------+---------+---------+-------+-------+---------+---------+-------
+1a) | O | | | | N | N | N | N
+1b) | O | | | | | | O |
+1c) | N | N | | | N | N | O |
+1d) | O | O
+1e) | O | O
+1f) | P
+1g) | O
+1h) | P |
+1i) | O | N
+1j) | O | N
+1l) | O | N
+2) | O | N
+5a) | N | O
+5b)* | N |
+5c) | N | O
+
+* - not done yet
+
+Known bugs and problems and TODO
+--------------------------------
+
+1g/h/l) when pid filtering is enabled on the pci card
+
+DMA usage currently:
+ The DMA is splitted in 2 equal-sized subbuffers. The Flexcop writes to first
+ address and triggers an IRQ when it's full and starts writing to the second
+ address. When the second address is full, the IRQ is triggered again, and
+ the flexcop writes to first address again, and so on.
+ The buffersize of each address is currently 640*188 bytes.
+
+ Problem is, when using hw-pid-filtering and doing some low-bandwidth
+ operation (like scanning) the buffers won't be filled enough to trigger
+ the IRQ. That's why:
+
+ When PID filtering is activated, the timer IRQ is used. Every 1.97 ms the IRQ
+ is triggered. Is the current write address of DMA1 different to the one
+ during the last IRQ, then the data is passed to the demuxer.
+
+ There is an additional DMA-IRQ-method: packet count IRQ. This isn't
+ implemented correctly yet.
+
+ The solution is to disable HW PID filtering, but I don't know how the DVB
+ API software demux behaves on slow systems with 45MBit/s TS.
+
+Solved bugs :)
+--------------
+1g) pid-filtering (somehow pid index 4 and 5 (EMM_PID and ECM_PID) aren't
+working)
+SOLUTION: also index 0 was affected, because net_translation is done for
+these indexes by default
+
+5b) isochronous transfer does only work in the first attempt (for the Sky2PC
+USB, Air2PC is working) SOLUTION: the flexcop was going asleep and never really
+woke up again (don't know if this need fixes, see
+flexcop-fe-tuner.c:flexcop_sleep)
+
+NEWS: when the driver is loaded and unloaded and loaded again (w/o doing
+anything in the while the driver is loaded the first time), no transfers take
+place anymore.
+
+Improvements when rewriting (refactoring) is done
+=================================================
+
+- split sleeping of the flexcop (misc_204.ACPI3_sig = 1;) from lnb_control
+ (enable sleeping for other demods than dvb-s)
+- add support for CableStar (stv0297 Microtune 203x/ALPS) (almost done, incompatibilities with the Nexus-CA)
+
+Debugging
+---------
+- add verbose debugging to skystar2.c (dump the reg_dw_data) and compare it
+ with this flexcop, this is important, because i2c is now using the
+ flexcop_ibi_value union from flexcop-reg.h (do you have a better idea for
+ that, please tell us so).
+
+Everything which is identical in the following table, can be put into a common
+flexcop-module.
+
+ PCI USB
+-------------------------------------------------------------------------------
+Different:
+Register access: accessing IO memory USB control message
+I2C bus: I2C bus of the FC USB control message
+Data transfer: DMA isochronous transfer
+EEPROM transfer: through i2c bus not clear yet
+
+Identical:
+Streaming: accessing registers
+PID Filtering: accessing registers
+Sram destinations: accessing registers
+Tuner/Demod: I2C bus
+DVB-stuff: can be written for common use
+
+Acknowledgements (just for the rewriting part)
+================
+
+Bjarne Steinsbo thought a lot in the first place of the pci part for this code
+sharing idea.
+
+Andreas Oberritter for providing a recent PCI initialization template
+(pluto2.c).
+
+Boleslaw Ciesielski for pointing out a problem with firmware loader.
+
+Vadim Catana for correcting the USB transfer.
+
+comments, critics and ideas to linux-dvb@linuxtv.org.
diff --git a/Documentation/dvb/avermedia.txt b/Documentation/dvb/avermedia.txt
new file mode 100644
index 0000000..e44c009
--- /dev/null
+++ b/Documentation/dvb/avermedia.txt
@@ -0,0 +1,301 @@
+HOWTO: Get An Avermedia DVB-T working under Linux
+ ______________________________________________
+
+ Table of Contents
+ Assumptions and Introduction
+ The Avermedia DVB-T
+ Getting the card going
+ Receiving DVB-T in Australia
+ Known Limitations
+ Further Update
+
+Assumptions and Introduction
+
+ It is assumed that the reader understands the basic structure
+ of the Linux Kernel DVB drivers and the general principles of
+ Digital TV.
+
+ One significant difference between Digital TV and Analogue TV
+ that the unwary (like myself) should consider is that,
+ although the component structure of budget DVB-T cards are
+ substantially similar to Analogue TV cards, they function in
+ substantially different ways.
+
+ The purpose of an Analogue TV is to receive and display an
+ Analogue Television signal. An Analogue TV signal (otherwise
+ known as composite video) is an analogue encoding of a
+ sequence of image frames (25 per second) rasterised using an
+ interlacing technique. Interlacing takes two fields to
+ represent one frame. Computers today are at their best when
+ dealing with digital signals, not analogue signals and a
+ composite video signal is about as far removed from a digital
+ data stream as you can get. Therefore, an Analogue TV card for
+ a PC has the following purpose:
+
+ * Tune the receiver to receive a broadcast signal
+ * demodulate the broadcast signal
+ * demultiplex the analogue video signal and analogue audio
+ signal (note some countries employ a digital audio signal
+ embedded within the modulated composite analogue signal -
+ NICAM.)
+ * digitize the analogue video signal and make the resulting
+ datastream available to the data bus.
+
+ The digital datastream from an Analogue TV card is generated
+ by circuitry on the card and is often presented uncompressed.
+ For a PAL TV signal encoded at a resolution of 768x576 24-bit
+ color pixels over 25 frames per second - a fair amount of data
+ is generated and must be processed by the PC before it can be
+ displayed on the video monitor screen. Some Analogue TV cards
+ for PCs have onboard MPEG2 encoders which permit the raw
+ digital data stream to be presented to the PC in an encoded
+ and compressed form - similar to the form that is used in
+ Digital TV.
+
+ The purpose of a simple budget digital TV card (DVB-T,C or S)
+ is to simply:
+
+ * Tune the received to receive a broadcast signal.
+ * Extract the encoded digital datastream from the broadcast
+ signal.
+ * Make the encoded digital datastream (MPEG2) available to
+ the data bus.
+
+ The significant difference between the two is that the tuner
+ on the analogue TV card spits out an Analogue signal, whereas
+ the tuner on the digital TV card spits out a compressed
+ encoded digital datastream. As the signal is already
+ digitised, it is trivial to pass this datastream to the PC
+ databus with minimal additional processing and then extract
+ the digital video and audio datastreams passing them to the
+ appropriate software or hardware for decoding and viewing.
+ _________________________________________________________
+
+The Avermedia DVB-T
+
+ The Avermedia DVB-T is a budget PCI DVB card. It has 3 inputs:
+
+ * RF Tuner Input
+ * Composite Video Input (RCA Jack)
+ * SVIDEO Input (Mini-DIN)
+
+ The RF Tuner Input is the input to the tuner module of the
+ card. The Tuner is otherwise known as the "Frontend" . The
+ Frontend of the Avermedia DVB-T is a Microtune 7202D. A timely
+ post to the linux-dvb mailing list ascertained that the
+ Microtune 7202D is supported by the sp887x driver which is
+ found in the dvb-hw CVS module.
+
+ The DVB-T card is based around the BT878 chip which is a very
+ common multimedia bridge and often found on Analogue TV cards.
+ There is no on-board MPEG2 decoder, which means that all MPEG2
+ decoding must be done in software, or if you have one, on an
+ MPEG2 hardware decoding card or chipset.
+ _________________________________________________________
+
+Getting the card going
+
+ In order to fire up the card, it is necessary to load a number
+ of modules from the DVB driver set. Prior to this it will have
+ been necessary to download these drivers from the linuxtv CVS
+ server and compile them successfully.
+
+ Depending on the card's feature set, the Device Driver API for
+ DVB under Linux will expose some of the following device files
+ in the /dev tree:
+
+ * /dev/dvb/adapter0/audio0
+ * /dev/dvb/adapter0/ca0
+ * /dev/dvb/adapter0/demux0
+ * /dev/dvb/adapter0/dvr0
+ * /dev/dvb/adapter0/frontend0
+ * /dev/dvb/adapter0/net0
+ * /dev/dvb/adapter0/osd0
+ * /dev/dvb/adapter0/video0
+
+ The primary device nodes that we are interested in (at this
+ stage) for the Avermedia DVB-T are:
+
+ * /dev/dvb/adapter0/dvr0
+ * /dev/dvb/adapter0/frontend0
+
+ The dvr0 device node is used to read the MPEG2 Data Stream and
+ the frontend0 node is used to tune the frontend tuner module.
+
+ At this stage, it has not been able to ascertain the
+ functionality of the remaining device nodes in respect of the
+ Avermedia DVBT. However, full functionality in respect of
+ tuning, receiving and supplying the MPEG2 data stream is
+ possible with the currently available versions of the driver.
+ It may be possible that additional functionality is available
+ from the card (i.e. viewing the additional analogue inputs
+ that the card presents), but this has not been tested yet. If
+ I get around to this, I'll update the document with whatever I
+ find.
+
+ To power up the card, load the following modules in the
+ following order:
+
+ * modprobe bttv (normally loaded automatically)
+ * modprobe dvb-bt8xx (or place dvb-bt8xx in /etc/modules)
+
+ Insertion of these modules into the running kernel will
+ activate the appropriate DVB device nodes. It is then possible
+ to start accessing the card with utilities such as scan, tzap,
+ dvbstream etc.
+
+ The frontend module sp887x.o, requires an external firmware.
+ Please use the command "get_dvb_firmware sp887x" to download
+ it. Then copy it to /usr/lib/hotplug/firmware or /lib/firmware/
+ (depending on configuration of firmware hotplug).
+
+Receiving DVB-T in Australia
+
+ I have no experience of DVB-T in other countries other than
+ Australia, so I will attempt to explain how it works here in
+ Melbourne and how this affects the configuration of the DVB-T
+ card.
+
+ The Digital Broadcasting Australia website has a Reception
+ locatortool which provides information on transponder channels
+ and frequencies. My local transmitter happens to be Mount
+ Dandenong.
+
+ The frequencies broadcast by Mount Dandenong are:
+
+ Table 1. Transponder Frequencies Mount Dandenong, Vic, Aus.
+ Broadcaster Channel Frequency
+ ABC VHF 12 226.5 MHz
+ TEN VHF 11 219.5 MHz
+ NINE VHF 8 191.625 MHz
+ SEVEN VHF 6 177.5 MHz
+ SBS UHF 29 536.5 MHz
+
+ The Scan utility has a set of compiled-in defaults for various
+ countries and regions, but if they do not suit, or if you have
+ a pre-compiled scan binary, you can specify a data file on the
+ command line which contains the transponder frequencies. Here
+ is a sample file for the above channel transponders:
+# Data file for DVB scan program
+#
+# C Frequency SymbolRate FEC QAM
+# S Frequency Polarisation SymbolRate FEC
+# T Frequency Bandwidth FEC FEC2 QAM Mode Guard Hier
+T 226500000 7MHz 2/3 NONE QAM64 8k 1/8 NONE
+T 191625000 7MHz 2/3 NONE QAM64 8k 1/8 NONE
+T 219500000 7MHz 2/3 NONE QAM64 8k 1/8 NONE
+T 177500000 7MHz 2/3 NONE QAM64 8k 1/8 NONE
+T 536500000 7MHz 2/3 NONE QAM64 8k 1/8 NONE
+
+ The defaults for the transponder frequency and other
+ modulation parameters were obtained from www.dba.org.au.
+
+ When Scan runs, it will output channels.conf information for
+ any channel's transponders which the card's frontend can lock
+ onto. (i.e. any whose signal is strong enough at your
+ antenna).
+
+ Here's my channels.conf file for anyone who's interested:
+ABC HDTV:226500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_3_4:QAM_64
+:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:2307:0:560
+ABC TV Melbourne:226500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_3_
+4:QAM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:65
+0:561
+ABC TV 2:226500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_3_4:QAM_64
+:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:562
+ABC TV 3:226500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_3_4:QAM_64
+:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:563
+ABC TV 4:226500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_3_4:QAM_64
+:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:564
+ABC DiG Radio:226500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_3_4:Q
+AM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:0:2311:56
+6
+TEN Digital:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:158
+5
+TEN Digital 1:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:Q
+AM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:1
+586
+TEN Digital 2:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:Q
+AM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:1
+587
+TEN Digital 3:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:Q
+AM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:1
+588
+TEN Digital:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:158
+9
+TEN Digital 4:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:Q
+AM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:1
+590
+TEN Digital:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:159
+1
+TEN HD:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:QAM_64:T
+RANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:514:0:1592
+TEN Digital:219500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:650:159
+3
+Nine Digital:191625000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:QA
+M_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:513:660:10
+72
+Nine Digital HD:191625000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2
+:QAM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:512:0:1
+073
+Nine Guide:191625000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_3_4:FEC_1_2:QAM_
+64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_16:HIERARCHY_NONE:514:670:1074
+7 Digital:177500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QAM_6
+4:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:769:770:1328
+7 Digital 1:177500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:769:770:1329
+7 Digital 2:177500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:769:770:1330
+7 Digital 3:177500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:769:770:1331
+7 HD Digital:177500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QA
+M_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:833:834:133
+2
+7 Program Guide:177500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3
+:QAM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:865:866:
+1334
+SBS HD:536500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QAM_64:T
+RANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:102:103:784
+SBS DIGITAL 1:536500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:Q
+AM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:161:81:785
+SBS DIGITAL 2:536500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:Q
+AM_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:162:83:786
+SBS EPG:536500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QAM_64:
+TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:163:85:787
+SBS RADIO 1:536500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:0:201:798
+SBS RADIO 2:536500000:INVERSION_OFF:BANDWIDTH_7_MHZ:FEC_2_3:FEC_2_3:QAM
+_64:TRANSMISSION_MODE_8K:GUARD_INTERVAL_1_8:HIERARCHY_NONE:0:202:799
+ _________________________________________________________
+
+Known Limitations
+
+ At present I can say with confidence that the frontend tunes
+ via /dev/dvb/adapter{x}/frontend0 and supplies an MPEG2 stream
+ via /dev/dvb/adapter{x}/dvr0. I have not tested the
+ functionality of any other part of the card yet. I will do so
+ over time and update this document.
+
+ There are some limitations in the i2c layer due to a returned
+ error message inconsistency. Although this generates errors in
+ dmesg and the system logs, it does not appear to affect the
+ ability of the frontend to function correctly.
+ _________________________________________________________
+
+Further Update
+
+ dvbstream and VideoLAN Client on windows works a treat with
+ DVB, in fact this is currently serving as my main way of
+ viewing DVB-T at the moment. Additionally, VLC is happily
+ decoding HDTV signals, although the PC is dropping the odd
+ frame here and there - I assume due to processing capability -
+ as all the decoding is being done under windows in software.
+
+ Many thanks to Nigel Pearson for the updates to this document
+ since the recent revision of the driver.
+
+ February 14th 2006
diff --git a/Documentation/dvb/bt8xx.txt b/Documentation/dvb/bt8xx.txt
new file mode 100644
index 0000000..b7b1d1b
--- /dev/null
+++ b/Documentation/dvb/bt8xx.txt
@@ -0,0 +1,98 @@
+How to get the bt8xx cards working
+==================================
+
+1) General information
+======================
+
+This class of cards has a bt878a as the PCI interface, and require the bttv driver
+for accessing the i2c bus and the gpio pins of the bt8xx chipset.
+Please see Documentation/dvb/cards.txt => o Cards based on the Conexant Bt8xx PCI bridge:
+
+Compiling kernel please enable:
+a.)"Device drivers" => "Multimedia devices" => "Video For Linux" => "Enable Video for Linux API 1 (DEPRECATED)"
+b.)"Device drivers" => "Multimedia devices" => "Video For Linux" => "Video Capture Adapters" => "BT848 Video For Linux"
+c.)"Device drivers" => "Multimedia devices" => "Digital Video Broadcasting Devices" => "DVB for Linux" "DVB Core Support" "Bt8xx based PCI Cards"
+
+Please use the following options with care as deselection of drivers which are in fact necessary
+may result in DVB devices that cannot be tuned due to lack of driver support:
+You can save RAM by deselecting every frontend module that your DVB card does not need.
+
+First please remove the static dependency of DVB card drivers on all frontend modules for all possible card variants by enabling:
+d.) "Device drivers" => "Multimedia devices" => "Digital Video Broadcasting Devices"
+ => "DVB for Linux" "DVB Core Support" "Load and attach frontend modules as needed"
+
+If you know the frontend driver that your card needs please enable:
+e.)"Device drivers" => "Multimedia devices" => "Digital Video Broadcasting Devices"
+ => "DVB for Linux" "DVB Core Support" "Customise DVB Frontends" => "Customise the frontend modules to build"
+ Then please select your card-specific frontend module.
+
+2) Loading Modules
+==================
+
+Regular case: If the bttv driver detects a bt8xx-based DVB card, all frontend and backend modules will be loaded automatically.
+Exceptions are:
+- Old TwinHan DST cards or clones with or without CA slot and not containing an Eeprom.
+People running udev please see Documentation/dvb/udev.txt.
+
+In the following cases overriding the PCI type detection for dvb-bt8xx might be necessary:
+
+2a) Running TwinHan and Clones
+------------------------------
+
+ $ modprobe bttv card=113
+ $ modprobe dst
+
+Useful parameters for verbosity level and debugging the dst module:
+
+verbose=0: messages are disabled
+ 1: only error messages are displayed
+ 2: notifications are displayed
+ 3: other useful messages are displayed
+ 4: debug setting
+dst_addons=0: card is a free to air (FTA) card only
+ 0x20: card has a conditional access slot for scrambled channels
+
+The autodetected values are determined by the cards' "response string".
+In your logs see f. ex.: dst_get_device_id: Recognize [DSTMCI].
+For bug reports please send in a complete log with verbose=4 activated.
+Please also see Documentation/dvb/ci.txt.
+
+2b) Running multiple cards
+--------------------------
+
+Examples of card ID's:
+
+Pinnacle PCTV Sat: 94
+Nebula Electronics Digi TV: 104
+pcHDTV HD-2000 TV: 112
+Twinhan DST and clones: 113
+Avermedia AverTV DVB-T 771: 123
+Avermedia AverTV DVB-T 761: 124
+DViCO FusionHDTV DVB-T Lite: 128
+DViCO FusionHDTV 5 Lite: 135
+
+Notice: The order of the card ID should be uprising:
+Example:
+ $ modprobe bttv card=113 card=135
+
+For a full list of card ID's please see Documentation/video4linux/CARDLIST.bttv.
+In case of further problems please subscribe and send questions to the mailing list: linux-dvb@linuxtv.org.
+
+2c) Probing the cards with broken PCI subsystem ID
+--------------------------------------------------
+There are some TwinHan cards that the EEPROM has become corrupted for some
+reason. The cards do not have correct PCI subsystem ID. But we can force
+probing the cards with broken PCI subsystem ID
+
+ $ echo 109e 0878 $subvendor $subdevice > \
+ /sys/bus/pci/drivers/bt878/new_id
+
+109e: PCI_VENDOR_ID_BROOKTREE
+0878: PCI_DEVICE_ID_BROOKTREE_878
+
+Authors: Richard Walker,
+ Jamie Honan,
+ Michael Hunold,
+ Manu Abraham,
+ Uwe Bugla,
+ Michael Krufky
diff --git a/Documentation/dvb/cards.txt b/Documentation/dvb/cards.txt
new file mode 100644
index 0000000..cc09187
--- /dev/null
+++ b/Documentation/dvb/cards.txt
@@ -0,0 +1,122 @@
+Hardware supported by the linuxtv.org DVB drivers
+=================================================
+
+ Generally, the DVB hardware manufacturers frequently change the
+ frontends (i.e. tuner / demodulator units) used, usually without
+ changing the product name, revision number or specs. Some cards
+ are also available in versions with different frontends for
+ DVB-S/DVB-C/DVB-T. Thus the frontend drivers are listed separately.
+
+ Note 1: There is no guarantee that every frontend driver works
+ out of the box with every card, because of different wiring.
+
+ Note 2: The demodulator chips can be used with a variety of
+ tuner/PLL chips, and not all combinations are supported. Often
+ the demodulator and tuner/PLL chip are inside a metal box for
+ shielding, and the whole metal box has its own part number.
+
+
+o Frontends drivers:
+ - dvb_dummy_fe: for testing...
+ DVB-S:
+ - ves1x93 : Alps BSRV2 (ves1893 demodulator) and dbox2 (ves1993)
+ - cx24110 : Conexant HM1221/HM1811 (cx24110 or cx24106 demod, cx24108 PLL)
+ - grundig_29504-491 : Grundig 29504-491 (Philips TDA8083 demodulator), tsa5522 PLL
+ - mt312 : Zarlink mt312 or Mitel vp310 demodulator, sl1935 or tsa5059 PLLi, Technisat Sky2Pc with bios Rev. 2.3
+ - stv0299 : Alps BSRU6 (tsa5059 PLL), LG TDQB-S00x (tsa5059 PLL),
+ LG TDQF-S001F (sl1935 PLL), Philips SU1278 (tua6100 PLL),
+ Philips SU1278SH (tsa5059 PLL), Samsung TBMU24112IMB, Technisat Sky2Pc with bios Rev. 2.6
+ DVB-C:
+ - ves1820 : various (ves1820 demodulator, sp5659c or spXXXX PLL)
+ - at76c651 : Atmel AT76c651(B) with DAT7021 PLL
+ DVB-T:
+ - alps_tdlb7 : Alps TDLB7 (sp8870 demodulator, sp5659 PLL)
+ - alps_tdmb7 : Alps TDMB7 (cx22700 demodulator)
+ - grundig_29504-401 : Grundig 29504-401 (LSI L64781 demodulator), tsa5060 PLL
+ - tda1004x : Philips tda10045h (td1344 or tdm1316l PLL)
+ - nxt6000 : Alps TDME7 (MITEL SP5659 PLL), Alps TDED4 (TI ALP510 PLL),
+ Comtech DVBT-6k07 (SP5730 PLL)
+ (NxtWave Communications NXT6000 demodulator)
+ - sp887x : Microtune 7202D
+ - dib3000mb : DiBcom 3000-MB demodulator
+ DVB-S/C/T:
+ - dst : TwinHan DST Frontend
+ ATSC:
+ - nxt200x : Nxtwave NXT2002 & NXT2004
+ - or51211 : or51211 based (pcHDTV HD2000 card)
+ - or51132 : or51132 based (pcHDTV HD3000 card)
+ - bcm3510 : Broadcom BCM3510
+ - lgdt330x : LG Electronics DT3302 & DT3303
+
+
+o Cards based on the Phillips saa7146 multimedia PCI bridge chip:
+ - TI AV7110 based cards (i.e. with hardware MPEG decoder):
+ - Siemens/Technotrend/Hauppauge PCI DVB card revision 1.1, 1.3, 1.5, 1.6, 2.1
+ (aka Hauppauge Nexus)
+ - "budget" cards (i.e. without hardware MPEG decoder):
+ - Technotrend Budget / Hauppauge WinTV-Nova PCI Cards
+ - SATELCO Multimedia PCI
+ - KNC1 DVB-S, Typhoon DVB-S, Terratec Cinergy 1200 DVB-S (no CI support)
+ - Typhoon DVB-S budget
+ - Fujitsu-Siemens Activy DVB-S budget card
+
+o Cards based on the B2C2 Inc. FlexCopII/IIb/III:
+ - Technisat SkyStar2 PCI DVB card revision 2.3, 2.6B, 2.6C
+
+o Cards based on the Conexant Bt8xx PCI bridge:
+ - Pinnacle PCTV Sat DVB
+ - Nebula Electronics DigiTV
+ - TwinHan DST
+ - Avermedia DVB-T
+ - ChainTech digitop DST-1000 DVB-S
+ - pcHDTV HD-2000 TV
+ - DViCO FusionHDTV DVB-T Lite
+ - DViCO FusionHDTV5 Lite
+
+o Technotrend / Hauppauge DVB USB devices:
+ - Nova USB
+ - DEC 2000-T, 3000-S, 2540-T
+
+o DiBcom DVB-T USB based devices:
+ - Twinhan VisionPlus VisionDTV USB-Ter DVB-T Device
+ - HAMA DVB-T USB device
+ - CTS Portable (Chinese Television System)
+ - KWorld V-Stream XPERT DTV DVB-T USB
+ - JetWay DTV DVB-T USB
+ - ADSTech Instant TV DVB-T USB
+ - Ultima Electronic/Artec T1 USB TVBOX (AN2135 and AN2235)
+ - Compro Videomate DVB-U2000 - DVB-T USB
+ - Grandtec USB DVB-T
+ - Avermedia AverTV DVBT USB
+ - DiBcom USB DVB-T reference device (non-public)
+ - Yakumo DVB-T mobile USB2.0
+ - DiBcom USB2.0 DVB-T reference device (non-public)
+
+o Experimental support for the analog module of the Siemens DVB-C PCI card
+
+o Cards based on the Conexant cx2388x PCI bridge:
+ - ADS Tech Instant TV DVB-T PCI
+ - ATI HDTV Wonder
+ - digitalnow DNTV Live! DVB-T
+ - DViCO FusionHDTV DVB-T1
+ - DViCO FusionHDTV DVB-T Plus
+ - DViCO FusionHDTV3 Gold-Q
+ - DViCO FusionHDTV3 Gold-T
+ - DViCO FusionHDTV5 Gold
+ - Hauppauge Nova-T DVB-T
+ - KWorld/VStream XPert DVB-T
+ - pcHDTV HD3000 HDTV
+ - TerraTec Cinergy 1400 DVB-T
+ - WinFast DTV1000-T
+
+o Cards based on the Phillips saa7134 PCI bridge:
+ - Medion 7134
+ - Pinnacle PCTV 300i DVB-T + PAL
+ - LifeView FlyDVB-T DUO
+ - Typhoon DVB-T Duo Digital/Analog Cardbus
+ - Philips TOUGH DVB-T reference design
+ - Philips EUROPA V3 reference design
+ - Compro Videomate DVB-T300
+ - Compro Videomate DVB-T200
+ - AVerMedia AVerTVHD MCE A180
+
diff --git a/Documentation/dvb/ci.txt b/Documentation/dvb/ci.txt
new file mode 100644
index 0000000..2ecd834
--- /dev/null
+++ b/Documentation/dvb/ci.txt
@@ -0,0 +1,212 @@
+* For the user
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+NOTE: This document describes the usage of the high level CI API as
+in accordance to the Linux DVB API. This is a not a documentation for the,
+existing low level CI API.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To utilize the High Level CI capabilities,
+
+(1*) This point is valid only for the Twinhan/clones
+ For the Twinhan/Twinhan clones, the dst_ca module handles the CI
+ hardware handling.This module is loaded automatically if a CI
+ (Common Interface, that holds the CAM (Conditional Access Module)
+ is detected.
+
+(2) one requires a userspace application, ca_zap. This small userland
+ application is in charge of sending the descrambling related information
+ to the CAM.
+
+This application requires the following to function properly as of now.
+
+ (a) Tune to a valid channel, with szap.
+ eg: $ szap -c channels.conf -r "TMC" -x
+
+ (b) a channels.conf containing a valid PMT PID
+ eg: TMC:11996:h:0:27500:278:512:650:321
+
+ here 278 is a valid PMT PID. the rest of the values are the
+ same ones that szap uses.
+
+ (c) after running a szap, you have to run ca_zap, for the
+ descrambler to function,
+ eg: $ ca_zap channels.conf "TMC"
+
+ (d) Hopefully enjoy your favourite subscribed channel as you do with
+ a FTA card.
+
+(3) Currently ca_zap, and dst_test, both are meant for demonstration
+ purposes only, they can become full fledged applications if necessary.
+
+
+* Cards that fall in this category
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+At present the cards that fall in this category are the Twinhan and it's
+clones, these cards are available as VVMER, Tomato, Hercules, Orange and
+so on.
+
+* CI modules that are supported
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The CI module support is largely dependant upon the firmware on the cards
+Some cards do support almost all of the available CI modules. There is
+nothing much that can be done in order to make additional CI modules
+working with these cards.
+
+Modules that have been tested by this driver at present are
+
+(1) Irdeto 1 and 2 from SCM
+(2) Viaccess from SCM
+(3) Dragoncam
+
+* The High level CI API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* For the programmer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+With the High Level CI approach any new card with almost any random
+architecture can be implemented with this style, the definitions
+inside the switch statement can be easily adapted for any card, thereby
+eliminating the need for any additional ioctls.
+
+The disadvantage is that the driver/hardware has to manage the rest. For
+the application programmer it would be as simple as sending/receiving an
+array to/from the CI ioctls as defined in the Linux DVB API. No changes
+have been made in the API to accommodate this feature.
+
+
+* Why the need for another CI interface ?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This is one of the most commonly asked question. Well a nice question.
+Strictly speaking this is not a new interface.
+
+The CI interface is defined in the DVB API in ca.h as
+
+typedef struct ca_slot_info {
+ int num; /* slot number */
+
+ int type; /* CA interface this slot supports */
+#define CA_CI 1 /* CI high level interface */
+#define CA_CI_LINK 2 /* CI link layer level interface */
+#define CA_CI_PHYS 4 /* CI physical layer level interface */
+#define CA_DESCR 8 /* built-in descrambler */
+#define CA_SC 128 /* simple smart card interface */
+
+ unsigned int flags;
+#define CA_CI_MODULE_PRESENT 1 /* module (or card) inserted */
+#define CA_CI_MODULE_READY 2
+} ca_slot_info_t;
+
+
+
+This CI interface follows the CI high level interface, which is not
+implemented by most applications. Hence this area is revisited.
+
+This CI interface is quite different in the case that it tries to
+accommodate all other CI based devices, that fall into the other categories.
+
+This means that this CI interface handles the EN50221 style tags in the
+Application layer only and no session management is taken care of by the
+application. The driver/hardware will take care of all that.
+
+This interface is purely an EN50221 interface exchanging APDU's. This
+means that no session management, link layer or a transport layer do
+exist in this case in the application to driver communication. It is
+as simple as that. The driver/hardware has to take care of that.
+
+
+With this High Level CI interface, the interface can be defined with the
+regular ioctls.
+
+All these ioctls are also valid for the High level CI interface
+
+#define CA_RESET _IO('o', 128)
+#define CA_GET_CAP _IOR('o', 129, ca_caps_t)
+#define CA_GET_SLOT_INFO _IOR('o', 130, ca_slot_info_t)
+#define CA_GET_DESCR_INFO _IOR('o', 131, ca_descr_info_t)
+#define CA_GET_MSG _IOR('o', 132, ca_msg_t)
+#define CA_SEND_MSG _IOW('o', 133, ca_msg_t)
+#define CA_SET_DESCR _IOW('o', 134, ca_descr_t)
+#define CA_SET_PID _IOW('o', 135, ca_pid_t)
+
+
+On querying the device, the device yields information thus
+
+CA_GET_SLOT_INFO
+----------------------------
+Command = [info]
+APP: Number=[1]
+APP: Type=[1]
+APP: flags=[1]
+APP: CI High level interface
+APP: CA/CI Module Present
+
+CA_GET_CAP
+----------------------------
+Command = [caps]
+APP: Slots=[1]
+APP: Type=[1]
+APP: Descrambler keys=[16]
+APP: Type=[1]
+
+CA_SEND_MSG
+----------------------------
+Descriptors(Program Level)=[ 09 06 06 04 05 50 ff f1]
+Found CA descriptor @ program level
+
+(20) ES type=[2] ES pid=[201] ES length =[0 (0x0)]
+(25) ES type=[4] ES pid=[301] ES length =[0 (0x0)]
+ca_message length is 25 (0x19) bytes
+EN50221 CA MSG=[ 9f 80 32 19 03 01 2d d1 f0 08 01 09 06 06 04 05 50 ff f1 02 e0 c9 00 00 04 e1 2d 00 00]
+
+
+Not all ioctl's are implemented in the driver from the API, the other
+features of the hardware that cannot be implemented by the API are achieved
+using the CA_GET_MSG and CA_SEND_MSG ioctls. An EN50221 style wrapper is
+used to exchange the data to maintain compatibility with other hardware.
+
+
+/* a message to/from a CI-CAM */
+typedef struct ca_msg {
+ unsigned int index;
+ unsigned int type;
+ unsigned int length;
+ unsigned char msg[256];
+} ca_msg_t;
+
+
+The flow of data can be described thus,
+
+
+
+
+
+ App (User)
+ -----
+ parse
+ |
+ |
+ v
+ en50221 APDU (package)
+ --------------------------------------
+ | | | High Level CI driver
+ | | |
+ | v |
+ | en50221 APDU (unpackage) |
+ | | |
+ | | |
+ | v |
+ | sanity checks |
+ | | |
+ | | |
+ | v |
+ | do (H/W dep) |
+ --------------------------------------
+ | Hardware
+ |
+ v
+
+
+
+
+The High Level CI interface uses the EN50221 DVB standard, following a
+standard ensures futureproofness.
diff --git a/Documentation/dvb/contributors.txt b/Documentation/dvb/contributors.txt
new file mode 100644
index 0000000..4865add
--- /dev/null
+++ b/Documentation/dvb/contributors.txt
@@ -0,0 +1,96 @@
+Thanks go to the following people for patches and contributions:
+
+Michael Hunold <m.hunold@gmx.de>
+ for the initial saa7146 driver and it's recent overhaul
+
+Christian Theiss
+ for his work on the initial Linux DVB driver
+
+Marcus Metzler <mocm@metzlerbros.de>
+Ralph Metzler <rjkm@metzlerbros.de>
+ for their continuing work on the DVB driver
+
+Michael Holzt <kju@debian.org>
+ for his contributions to the dvb-net driver
+
+Diego Picciani <d.picciani@novacomp.it>
+ for CyberLogin for Linux which allows logging onto EON
+ (in case you are wondering where CyberLogin is, EON changed its login
+ procedure and CyberLogin is no longer used.)
+
+Martin Schaller <martin@smurf.franken.de>
+ for patching the cable card decoder driver
+
+Klaus Schmidinger <Klaus.Schmidinger@cadsoft.de>
+ for various fixes regarding tuning, OSD and CI stuff and his work on VDR
+
+Steve Brown <sbrown@cortland.com>
+ for his AFC kernel thread
+
+Christoph Martin <martin@uni-mainz.de>
+ for his LIRC infrared handler
+
+Andreas Oberritter <obi@linuxtv.org>
+Dennis Noermann <dennis.noermann@noernet.de>
+Felix Domke <tmbinc@elitedvb.net>
+Florian Schirmer <jolt@tuxbox.org>
+Ronny Strutz <3des@elitedvb.de>
+Wolfram Joost <dbox2@frokaschwei.de>
+...and all the other dbox2 people
+ for many bugfixes in the generic DVB Core, frontend drivers and
+ their work on the dbox2 port of the DVB driver
+
+Oliver Endriss <o.endriss@gmx.de>
+ for many bugfixes
+
+Andrew de Quincey <adq_dvb@lidskialf.net>
+ for the tda1004x frontend driver, and various bugfixes
+
+Peter Schildmann <peter.schildmann@web.de>
+ for the driver for the Technisat SkyStar2 PCI DVB card
+
+Vadim Catana <skystar@moldova.cc>
+Roberto Ragusa <r.ragusa@libero.it>
+Augusto Cardoso <augusto@carhil.net>
+ for all the work for the FlexCopII chipset by B2C2,Inc.
+
+Davor Emard <emard@softhome.net>
+ for his work on the budget drivers, the demux code,
+ the module unloading problems, ...
+
+Hans-Frieder Vogt <hfvogt@arcor.de>
+ for his work on calculating and checking the crc's for the
+ TechnoTrend/Hauppauge DEC driver firmware
+
+Michael Dreher <michael@5dot1.de>
+Andreas 'randy' Weinberger
+ for the support of the Fujitsu-Siemens Activy budget DVB-S
+
+Kenneth Aafløy <ke-aa@frisurf.no>
+ for adding support for Typhoon DVB-S budget card
+
+Ernst Peinlich <e.peinlich@inode.at>
+ for tuning/DiSEqC support for the DEC 3000-s
+
+Peter Beutner <p.beutner@gmx.net>
+ for the IR code for the ttusb-dec driver
+
+Wilson Michaels <wilsonmichaels@earthlink.net>
+ for the lgdt330x frontend driver, and various bugfixes
+
+Michael Krufky <mkrufky@m1k.net>
+ for maintaining v4l/dvb inter-tree dependencies
+
+Taylor Jacob <rtjacob@earthlink.net>
+ for the nxt2002 frontend driver
+
+Jean-Francois Thibert <jeanfrancois@sagetv.com>
+ for the nxt2004 frontend driver
+
+Kirk Lapray <kirk.lapray@gmail.com>
+ for the or51211 and or51132 frontend drivers, and
+ for merging the nxt2002 and nxt2004 modules into a
+ single nxt200x frontend driver.
+
+(If you think you should be in this list, but you are not, drop a
+ line to the DVB mailing list)
diff --git a/Documentation/dvb/faq.txt b/Documentation/dvb/faq.txt
new file mode 100644
index 0000000..2511a33
--- /dev/null
+++ b/Documentation/dvb/faq.txt
@@ -0,0 +1,159 @@
+Some very frequently asked questions about linuxtv-dvb
+
+1. The signal seems to die a few seconds after tuning.
+
+ It's not a bug, it's a feature. Because the frontends have
+ significant power requirements (and hence get very hot), they
+ are powered down if they are unused (i.e. if the frontend device
+ is closed). The dvb-core.o module parameter "dvb_shutdown_timeout"
+ allow you to change the timeout (default 5 seconds). Setting the
+ timeout to 0 disables the timeout feature.
+
+2. How can I watch TV?
+
+ The driver distribution includes some simple utilities which
+ are mainly intended for testing and to demonstrate how the
+ DVB API works.
+
+ Depending on whether you have a DVB-S, DVB-C or DVB-T card, use
+ apps/szap/szap, czap or tzap. You must supply a channel list
+ in ~/.[sct]zap/channels.conf. If you are lucky you can just copy
+ one of the supplied channel lists, or you can create a new one
+ by running apps/scan/scan. If you run scan on an unknown network
+ you might have to supply some start data in apps/scan/initial.h.
+
+ If you have a card with a built-in hardware MPEG-decoder the
+ drivers create a video4linux device (/dev/v4l/video0) which
+ you can use to watch TV with any v4l application. xawtv is known
+ to work. Note that you cannot change channels with xawtv, you
+ have to zap using [sct]zap. If you want a nice application for
+ TV watching and record/playback, have a look at VDR.
+
+ If your card does not have a hardware MPEG decoder you need
+ a software MPEG decoder. Mplayer or xine are known to work.
+ Newsflash: MythTV also has DVB support now.
+ Note: Only very recent versions of Mplayer and xine can decode.
+ MPEG2 transport streams (TS) directly. Then, run
+ '[sct]zap channelname -r' in one xterm, and keep it running,
+ and start 'mplayer - < /dev/dvb/adapter0/dvr0' or
+ 'xine stdin://mpeg2 < /dev/dvb/adapter0/dvr0' in a second xterm.
+ That's all far from perfect, but it seems no one has written
+ a nice DVB application which includes a builtin software MPEG
+ decoder yet.
+
+ Newsflash: Newest xine directly supports DVB. Just copy your
+ channels.conf to ~/.xine and start 'xine dvb://', or select
+ the DVB button in the xine GUI. Channel switching works using the
+ numpad pgup/pgdown (NP9 / NP3) keys to scroll through the channel osd
+ menu and pressing numpad-enter to switch to the selected channel.
+
+ Note: Older versions of xine and mplayer understand MPEG program
+ streams (PS) only, and can be used in conjunction with the
+ ts2ps tool from the Metzler Brother's dvb-mpegtools package.
+
+3. Which other DVB applications exist?
+
+ http://www.cadsoft.de/people/kls/vdr/
+ Klaus Schmidinger's Video Disk Recorder
+
+ http://www.metzlerbros.org/dvb/
+ Metzler Bros. DVB development; alternate drivers and
+ DVB utilities, include dvb-mpegtools and tuxzap.
+
+ http://sourceforge.net/projects/dvbtools/
+ Dave Chapman's dvbtools package, including
+ dvbstream and dvbtune
+
+ http://www.linuxdvb.tv/
+ Henning Holtschneider's site with many interesting
+ links and docs
+
+ http://www.dbox2.info/
+ LinuxDVB on the dBox2
+
+ http://www.tuxbox.org/
+ http://cvs.tuxbox.org/
+ the TuxBox CVS many interesting DVB applications and the dBox2
+ DVB source
+
+ http://sourceforge.net/projects/dvbsak/
+ DVB Swiss Army Knife library and utilities
+
+ http://www.nenie.org/misc/mpsys/
+ MPSYS: a MPEG2 system library and tools
+
+ http://mplayerhq.hu/
+ mplayer
+
+ http://xine.sourceforge.net/
+ http://xinehq.de/
+ xine
+
+ http://www.mythtv.org/
+ MythTV - analog TV PVR, but now with DVB support, too
+ (with software MPEG decode)
+
+ http://dvbsnoop.sourceforge.net/
+ DVB sniffer program to monitor, analyze, debug, dump
+ or view dvb/mpeg/dsm-cc/mhp stream information (TS,
+ PES, SECTION)
+
+4. Can't get a signal tuned correctly
+
+ If you are using a Technotrend/Hauppauge DVB-C card *without* analog
+ module, you might have to use module parameter adac=-1 (dvb-ttpci.o).
+
+5. The dvb_net device doesn't give me any packets at all
+
+ Run tcpdump on the dvb0_0 interface. This sets the interface
+ into promiscous mode so it accepts any packets from the PID
+ you have configured with the dvbnet utility. Check if there
+ are any packets with the IP addr and MAC addr you have
+ configured with ifconfig.
+
+ If tcpdump doesn't give you any output, check the statistics
+ which ifconfig outputs. (Note: If the MAC address is wrong,
+ dvb_net won't get any input; thus you have to run tcpdump
+ before checking the statistics.) If there are no packets at
+ all then maybe the PID is wrong. If there are error packets,
+ then either the PID is wrong or the stream does not conform to
+ the MPE standard (EN 301 192, http://www.etsi.org/). You can
+ use e.g. dvbsnoop for debugging.
+
+6. The dvb_net device doesn't give me any multicast packets
+
+ Check your routes if they include the multicast address range.
+ Additionally make sure that "source validation by reversed path
+ lookup" is disabled:
+ $ "echo 0 > /proc/sys/net/ipv4/conf/dvb0/rp_filter"
+
+7. What the hell are all those modules that need to be loaded?
+
+ For a dvb-ttpci av7110 based full-featured card the following
+ modules are loaded:
+
+ - videodev: Video4Linux core module. This is the base module that
+ gives you access to the "analog" tv picture of the av7110 mpeg2
+ decoder.
+
+ - v4l2-common: common functions for Video4Linux-2 drivers
+
+ - v4l1-compat: backward compatibility layer for Video4Linux-1 legacy
+ applications
+
+ - dvb-core: DVB core module. This provides you with the
+ /dev/dvb/adapter entries
+
+ - saa7146: SAA7146 core driver. This is need to access any SAA7146
+ based card in your system.
+
+ - saa7146_vv: SAA7146 video and vbi functions. These are only needed
+ for full-featured cards.
+
+ - videobuf-dma-sg: capture helper module for the saa7146_vv driver. This
+ one is responsible to handle capture buffers.
+
+ - dvb-ttpci: The main driver for AV7110 based, full-featured
+ DVB-S/C/T cards
+
+eof
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware
new file mode 100644
index 0000000..f2e908d
--- /dev/null
+++ b/Documentation/dvb/get_dvb_firmware
@@ -0,0 +1,509 @@
+#!/usr/bin/perl
+# DVB firmware extractor
+#
+# (c) 2004 Andrew de Quincey
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+use File::Temp qw/ tempdir /;
+use IO::Handle;
+
+@components = ( "sp8870", "sp887x", "tda10045", "tda10046",
+ "tda10046lifeview", "av7110", "dec2000t", "dec2540t",
+ "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004",
+ "or51211", "or51132_qam", "or51132_vsb", "bluebird",
+ "opera1");
+
+# Check args
+syntax() if (scalar(@ARGV) != 1);
+$cid = $ARGV[0];
+
+# Do it!
+for ($i=0; $i < scalar(@components); $i++) {
+ if ($cid eq $components[$i]) {
+ $outfile = eval($cid);
+ die $@ if $@;
+ print STDERR <<EOF;
+Firmware $outfile extracted successfully.
+Now copy it to either /usr/lib/hotplug/firmware or /lib/firmware
+(depending on configuration of firmware hotplug).
+EOF
+ exit(0);
+ }
+}
+
+# If we get here, it wasn't found
+print STDERR "Unknown component \"$cid\"\n";
+syntax();
+
+
+
+
+# ---------------------------------------------------------------
+# Firmware-specific extraction subroutines
+
+sub sp8870 {
+ my $sourcefile = "tt_Premium_217g.zip";
+ my $url = "http://www.softwarepatch.pl/9999ccd06a4813cb827dbb0005071c71/$sourcefile";
+ my $hash = "53970ec17a538945a6d8cb608a7b3899";
+ my $outfile = "dvb-fe-sp8870.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ verify("$tmpdir/software/OEM/HE/App/boot/SC_MAIN.MC", $hash);
+ copy("$tmpdir/software/OEM/HE/App/boot/SC_MAIN.MC", $outfile);
+
+ $outfile;
+}
+
+sub sp887x {
+ my $sourcefile = "Dvbt1.3.57.6.zip";
+ my $url = "http://www.avermedia.com/software/$sourcefile";
+ my $cabfile = "DVBT Net Ver1.3.57.6/disk1/data1.cab";
+ my $hash = "237938d53a7f834c05c42b894ca68ac3";
+ my $outfile = "dvb-fe-sp887x.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+ checkunshield();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ unshield("$tmpdir/$cabfile", $tmpdir);
+ verify("$tmpdir/ZEnglish/sc_main.mc", $hash);
+ copy("$tmpdir/ZEnglish/sc_main.mc", $outfile);
+
+ $outfile;
+}
+
+sub tda10045 {
+ my $sourcefile = "tt_budget_217g.zip";
+ my $url = "http://www.technotrend.de/new/217g/$sourcefile";
+ my $hash = "2105fd5bf37842fbcdfa4bfd58f3594a";
+ my $outfile = "dvb-fe-tda10045.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ extract("$tmpdir/software/OEM/PCI/App/ttlcdacc.dll", 0x37ef9, 30555, "$tmpdir/fwtmp");
+ verify("$tmpdir/fwtmp", $hash);
+ copy("$tmpdir/fwtmp", $outfile);
+
+ $outfile;
+}
+
+sub tda10046 {
+ my $sourcefile = "TT_PCI_2.19h_28_11_2006.zip";
+ my $url = "http://technotrend-online.com/download/software/219/$sourcefile";
+ my $hash = "6a7e1e2f2644b162ff0502367553c72d";
+ my $outfile = "dvb-fe-tda10046.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ extract("$tmpdir/TT_PCI_2.19h_28_11_2006/software/OEM/PCI/App/ttlcdacc.dll", 0x65389, 24478, "$tmpdir/fwtmp");
+ verify("$tmpdir/fwtmp", $hash);
+ copy("$tmpdir/fwtmp", $outfile);
+
+ $outfile;
+}
+
+sub tda10046lifeview {
+ my $sourcefile = "Drv_2.11.02.zip";
+ my $url = "http://www.lifeview.com.tw/drivers/pci_card/FlyDVB-T/$sourcefile";
+ my $hash = "1ea24dee4eea8fe971686981f34fd2e0";
+ my $outfile = "dvb-fe-tda10046.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ extract("$tmpdir/LVHybrid.sys", 0x8b088, 24602, "$tmpdir/fwtmp");
+ verify("$tmpdir/fwtmp", $hash);
+ copy("$tmpdir/fwtmp", $outfile);
+
+ $outfile;
+}
+
+sub av7110 {
+ my $sourcefile = "dvb-ttpci-01.fw-261d";
+ my $url = "http://www.linuxtv.org/downloads/firmware/$sourcefile";
+ my $hash = "603431b6259715a8e88f376a53b64e2f";
+ my $outfile = "dvb-ttpci-01.fw";
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ verify($sourcefile, $hash);
+ copy($sourcefile, $outfile);
+
+ $outfile;
+}
+
+sub dec2000t {
+ my $sourcefile = "dec217g.exe";
+ my $url = "http://hauppauge.lightpath.net/de/$sourcefile";
+ my $hash = "bd86f458cee4a8f0a8ce2d20c66215a9";
+ my $outfile = "dvb-ttusb-dec-2000t.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ verify("$tmpdir/software/OEM/STB/App/Boot/STB_PC_T.bin", $hash);
+ copy("$tmpdir/software/OEM/STB/App/Boot/STB_PC_T.bin", $outfile);
+
+ $outfile;
+}
+
+sub dec2540t {
+ my $sourcefile = "dec217g.exe";
+ my $url = "http://hauppauge.lightpath.net/de/$sourcefile";
+ my $hash = "53e58f4f5b5c2930beee74a7681fed92";
+ my $outfile = "dvb-ttusb-dec-2540t.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ verify("$tmpdir/software/OEM/STB/App/Boot/STB_PC_X.bin", $hash);
+ copy("$tmpdir/software/OEM/STB/App/Boot/STB_PC_X.bin", $outfile);
+
+ $outfile;
+}
+
+sub dec3000s {
+ my $sourcefile = "dec217g.exe";
+ my $url = "http://hauppauge.lightpath.net/de/$sourcefile";
+ my $hash = "b013ececea83f4d6d8d2a29ac7c1b448";
+ my $outfile = "dvb-ttusb-dec-3000s.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ verify("$tmpdir/software/OEM/STB/App/Boot/STB_PC_S.bin", $hash);
+ copy("$tmpdir/software/OEM/STB/App/Boot/STB_PC_S.bin", $outfile);
+
+ $outfile;
+}
+sub opera1{
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 0);
+
+ checkstandard();
+ my $fwfile1="dvb-usb-opera1-fpga-01.fw";
+ my $fwfile2="dvb-usb-opera-01.fw";
+ extract("2830SCap2.sys", 0x62e8, 55024, "$tmpdir/opera1-fpga.fw");
+ extract("2830SLoad2.sys",0x3178,0x3685-0x3178,"$tmpdir/fw1part1");
+ extract("2830SLoad2.sys",0x0980,0x3150-0x0980,"$tmpdir/fw1part2");
+ delzero("$tmpdir/fw1part1","$tmpdir/fw1part1-1");
+ delzero("$tmpdir/fw1part2","$tmpdir/fw1part2-1");
+ verify("$tmpdir/fw1part1-1","5e0909858fdf0b5b09ad48b9fe622e70");
+ verify("$tmpdir/fw1part2-1","d6e146f321427e931df2c6fcadac37a1");
+ verify("$tmpdir/opera1-fpga.fw","0f8133f5e9051f5f3c1928f7e5a1b07d");
+
+ my $RES1="\x01\x92\x7f\x00\x01\x00";
+ my $RES0="\x01\x92\x7f\x00\x00\x00";
+ my $DAT1="\x01\x00\xe6\x00\x01\x00";
+ my $DAT0="\x01\x00\xe6\x00\x00\x00";
+ open FW,">$tmpdir/opera.fw";
+ print FW "$RES1";
+ print FW "$DAT1";
+ print FW "$RES1";
+ print FW "$DAT1";
+ appendfile(FW,"$tmpdir/fw1part1-1");
+ print FW "$RES0";
+ print FW "$DAT0";
+ print FW "$RES1";
+ print FW "$DAT1";
+ appendfile(FW,"$tmpdir/fw1part2-1");
+ print FW "$RES1";
+ print FW "$DAT1";
+ print FW "$RES0";
+ print FW "$DAT0";
+ copy ("$tmpdir/opera1-fpga.fw",$fwfile1);
+ copy ("$tmpdir/opera.fw",$fwfile2);
+
+ $fwfile1.",".$fwfile2;
+}
+
+sub vp7041 {
+ my $sourcefile = "2.422.zip";
+ my $url = "http://www.twinhan.com/files/driver/USB-Ter/$sourcefile";
+ my $hash = "e88c9372d1f66609a3e7b072c53fbcfe";
+ my $outfile = "dvb-vp7041-2.422.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ extract("$tmpdir/VisionDTV/Drivers/Win2K&XP/UDTTload.sys", 12503, 3036, "$tmpdir/fwtmp1");
+ extract("$tmpdir/VisionDTV/Drivers/Win2K&XP/UDTTload.sys", 2207, 10274, "$tmpdir/fwtmp2");
+
+ my $CMD = "\000\001\000\222\177\000";
+ my $PAD = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000";
+ my ($FW);
+ open $FW, ">$tmpdir/fwtmp3";
+ print $FW "$CMD\001$PAD";
+ print $FW "$CMD\001$PAD";
+ appendfile($FW, "$tmpdir/fwtmp1");
+ print $FW "$CMD\000$PAD";
+ print $FW "$CMD\001$PAD";
+ appendfile($FW, "$tmpdir/fwtmp2");
+ print $FW "$CMD\001$PAD";
+ print $FW "$CMD\000$PAD";
+ close($FW);
+
+ verify("$tmpdir/fwtmp3", $hash);
+ copy("$tmpdir/fwtmp3", $outfile);
+
+ $outfile;
+}
+
+sub dibusb {
+ my $url = "http://www.linuxtv.org/downloads/firmware/dvb-usb-dibusb-5.0.0.11.fw";
+ my $outfile = "dvb-dibusb-5.0.0.11.fw";
+ my $hash = "fa490295a527360ca16dcdf3224ca243";
+
+ checkstandard();
+
+ wgetfile($outfile, $url);
+ verify($outfile,$hash);
+
+ $outfile;
+}
+
+sub nxt2002 {
+ my $sourcefile = "Technisat_DVB-PC_4_4_COMPACT.zip";
+ my $url = "http://www.bbti.us/download/windows/$sourcefile";
+ my $hash = "476befae8c7c1bb9648954060b1eec1f";
+ my $outfile = "dvb-fe-nxt2002.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ verify("$tmpdir/SkyNET.sys", $hash);
+ extract("$tmpdir/SkyNET.sys", 331624, 5908, $outfile);
+
+ $outfile;
+}
+
+sub nxt2004 {
+ my $sourcefile = "AVerTVHD_MCE_A180_Drv_v1.2.2.16.zip";
+ my $url = "http://www.aver.com/support/Drivers/$sourcefile";
+ my $hash = "111cb885b1e009188346d72acfed024c";
+ my $outfile = "dvb-fe-nxt2004.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+ checkstandard();
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+ verify("$tmpdir/3xHybrid.sys", $hash);
+ extract("$tmpdir/3xHybrid.sys", 465304, 9584, $outfile);
+
+ $outfile;
+}
+
+sub or51211 {
+ my $fwfile = "dvb-fe-or51211.fw";
+ my $url = "http://linuxtv.org/downloads/firmware/$fwfile";
+ my $hash = "d830949c771a289505bf9eafc225d491";
+
+ checkstandard();
+
+ wgetfile($fwfile, $url);
+ verify($fwfile, $hash);
+
+ $fwfile;
+}
+
+sub or51132_qam {
+ my $fwfile = "dvb-fe-or51132-qam.fw";
+ my $url = "http://linuxtv.org/downloads/firmware/$fwfile";
+ my $hash = "7702e8938612de46ccadfe9b413cb3b5";
+
+ checkstandard();
+
+ wgetfile($fwfile, $url);
+ verify($fwfile, $hash);
+
+ $fwfile;
+}
+
+sub or51132_vsb {
+ my $fwfile = "dvb-fe-or51132-vsb.fw";
+ my $url = "http://linuxtv.org/downloads/firmware/$fwfile";
+ my $hash = "c16208e02f36fc439a557ad4c613364a";
+
+ checkstandard();
+
+ wgetfile($fwfile, $url);
+ verify($fwfile, $hash);
+
+ $fwfile;
+}
+
+sub bluebird {
+ my $url = "http://www.linuxtv.org/download/dvb/firmware/dvb-usb-bluebird-01.fw";
+ my $outfile = "dvb-usb-bluebird-01.fw";
+ my $hash = "658397cb9eba9101af9031302671f49d";
+
+ checkstandard();
+
+ wgetfile($outfile, $url);
+ verify($outfile,$hash);
+
+ $outfile;
+}
+
+# ---------------------------------------------------------------
+# Utilities
+
+sub checkstandard {
+ if (system("which unzip > /dev/null 2>&1")) {
+ die "This firmware requires the unzip command - see ftp://ftp.info-zip.org/pub/infozip/UnZip.html\n";
+ }
+ if (system("which md5sum > /dev/null 2>&1")) {
+ die "This firmware requires the md5sum command - see http://www.gnu.org/software/coreutils/\n";
+ }
+ if (system("which wget > /dev/null 2>&1")) {
+ die "This firmware requires the wget command - see http://wget.sunsite.dk/\n";
+ }
+}
+
+sub checkunshield {
+ if (system("which unshield > /dev/null 2>&1")) {
+ die "This firmware requires the unshield command - see http://sourceforge.net/projects/synce/\n";
+ }
+}
+
+sub wgetfile {
+ my ($sourcefile, $url) = @_;
+
+ if (! -f $sourcefile) {
+ system("wget -O \"$sourcefile\" \"$url\"") and die "wget failed - unable to download firmware";
+ }
+}
+
+sub unzip {
+ my ($sourcefile, $todir) = @_;
+
+ $status = system("unzip -q -o -d \"$todir\" \"$sourcefile\" 2>/dev/null" );
+ if ((($status >> 8) > 2) || (($status & 0xff) != 0)) {
+ die ("unzip failed - unable to extract firmware");
+ }
+}
+
+sub unshield {
+ my ($sourcefile, $todir) = @_;
+
+ system("unshield x -d \"$todir\" \"$sourcefile\" > /dev/null" ) and die ("unshield failed - unable to extract firmware");
+}
+
+sub verify {
+ my ($filename, $hash) = @_;
+ my ($testhash);
+
+ open(CMD, "md5sum \"$filename\"|");
+ $testhash = <CMD>;
+ $testhash =~ /([a-zA-Z0-9]*)/;
+ $testhash = $1;
+ close CMD;
+ die "Hash of extracted file does not match!\n" if ($testhash ne $hash);
+}
+
+sub copy {
+ my ($from, $to) = @_;
+
+ system("cp -f \"$from\" \"$to\"") and die ("cp failed");
+}
+
+sub extract {
+ my ($infile, $offset, $length, $outfile) = @_;
+ my ($chunklength, $buf, $rcount);
+
+ open INFILE, "<$infile";
+ open OUTFILE, ">$outfile";
+ sysseek(INFILE, $offset, SEEK_SET);
+ while($length > 0) {
+ # Calc chunk size
+ $chunklength = 2048;
+ $chunklength = $length if ($chunklength > $length);
+
+ $rcount = sysread(INFILE, $buf, $chunklength);
+ die "Ran out of data\n" if ($rcount != $chunklength);
+ syswrite(OUTFILE, $buf);
+ $length -= $rcount;
+ }
+ close INFILE;
+ close OUTFILE;
+}
+
+sub appendfile {
+ my ($FH, $infile) = @_;
+ my ($buf);
+
+ open INFILE, "<$infile";
+ while(1) {
+ $rcount = sysread(INFILE, $buf, 2048);
+ last if ($rcount == 0);
+ print $FH $buf;
+ }
+ close(INFILE);
+}
+
+sub delzero{
+ my ($infile,$outfile) =@_;
+
+ open INFILE,"<$infile";
+ open OUTFILE,">$outfile";
+ while (1){
+ $rcount=sysread(INFILE,$buf,22);
+ $len=ord(substr($buf,0,1));
+ print OUTFILE substr($buf,0,1);
+ print OUTFILE substr($buf,2,$len+3);
+ last if ($rcount<1);
+ printf OUTFILE "%c",0;
+#print $len." ".length($buf)."\n";
+
+ }
+ close(INFILE);
+ close(OUTFILE);
+}
+
+sub syntax() {
+ print STDERR "syntax: get_dvb_firmware <component>\n";
+ print STDERR "Supported components:\n";
+ for($i=0; $i < scalar(@components); $i++) {
+ print STDERR "\t" . $components[$i] . "\n";
+ }
+ exit(1);
+}
diff --git a/Documentation/dvb/opera-firmware.txt b/Documentation/dvb/opera-firmware.txt
new file mode 100644
index 0000000..93e784c
--- /dev/null
+++ b/Documentation/dvb/opera-firmware.txt
@@ -0,0 +1,27 @@
+To extract the firmware for the Opera DVB-S1 USB-Box
+you need to copy the files:
+
+2830SCap2.sys
+2830SLoad2.sys
+
+from the windriver disk into this directory.
+
+Then run
+
+./get_dvb_firware opera1
+
+and after that you have 2 files:
+
+dvb-usb-opera-01.fw
+dvb-usb-opera1-fpga-01.fw
+
+in here.
+
+Copy them into /lib/firmware/ .
+
+After that the driver can load the firmware
+(if you have enabled firmware loading
+in kernel config and have hotplug running).
+
+
+Marco Gittler <g.marco@freenet.de> \ No newline at end of file
diff --git a/Documentation/dvb/readme.txt b/Documentation/dvb/readme.txt
new file mode 100644
index 0000000..0b0380c
--- /dev/null
+++ b/Documentation/dvb/readme.txt
@@ -0,0 +1,62 @@
+Linux Digital Video Broadcast (DVB) subsystem
+=============================================
+
+The main development site and CVS repository for these
+drivers is http://linuxtv.org/.
+
+The developer mailing list linux-dvb is also hosted there,
+see http://linuxtv.org/lists.php. Please check
+the archive http://linuxtv.org/pipermail/linux-dvb/
+and the Wiki http://linuxtv.org/wiki/
+before asking newbie questions on the list.
+
+API documentation, utilities and test/example programs
+are available as part of the old driver package for Linux 2.4
+(linuxtv-dvb-1.0.x.tar.gz), or from CVS (module DVB).
+We plan to split this into separate packages, but it's not
+been done yet.
+
+http://linuxtv.org/downloads/
+
+What's inside this directory:
+
+"avermedia.txt"
+contains detailed information about the
+Avermedia DVB-T cards. See also "bt8xx.txt".
+
+"bt8xx.txt"
+contains detailed information about the
+various bt8xx based "budget" DVB cards.
+
+"cards.txt"
+contains a list of supported hardware.
+
+"ci.txt"
+contains detailed information about the
+CI module as part from TwinHan cards and Clones.
+
+"contributors.txt"
+is the who-is-who of DVB development.
+
+"faq.txt"
+contains frequently asked questions and their answers.
+
+"get_dvb_firmware"
+script to download and extract firmware for those devices
+that require it.
+
+"ttusb-dec.txt"
+contains detailed information about the
+TT DEC2000/DEC3000 USB DVB hardware.
+
+"udev.txt"
+how to get DVB and udev up and running.
+
+"README.dvb-usb"
+contains detailed information about the DVB USB cards.
+
+"README.flexcop"
+contains detailed information about the
+Technisat- and Flexcop B2C2 drivers.
+
+Good luck and have fun!
diff --git a/Documentation/dvb/ttusb-dec.txt b/Documentation/dvb/ttusb-dec.txt
new file mode 100644
index 0000000..b2f271c
--- /dev/null
+++ b/Documentation/dvb/ttusb-dec.txt
@@ -0,0 +1,45 @@
+TechnoTrend/Hauppauge DEC USB Driver
+====================================
+
+Driver Status
+-------------
+
+Supported:
+ DEC2000-t
+ DEC2450-t
+ DEC3000-s
+ Linux Kernels 2.4 and 2.6
+ Video Streaming
+ Audio Streaming
+ Section Filters
+ Channel Zapping
+ Hotplug firmware loader under 2.6 kernels
+
+To Do:
+ Tuner status information
+ DVB network interface
+ Streaming video PC->DEC
+ Conax support for 2450-t
+
+Getting the Firmware
+--------------------
+To download the firmware, use the following commands:
+"get_dvb_firmware dec2000t"
+"get_dvb_firmware dec2540t"
+"get_dvb_firmware dec3000s"
+
+
+Compilation Notes for 2.4 kernels
+---------------------------------
+For 2.4 kernels the firmware for the DECs is compiled into the driver itself.
+
+Copy the three files downloaded above into the build-2.4 directory.
+
+
+Hotplug Firmware Loading for 2.6 kernels
+----------------------------------------
+For 2.6 kernels the firmware is loaded at the point that the driver module is
+loaded. See linux/Documentation/dvb/firmware.txt for more information.
+
+Copy the three files downloaded above into the /usr/lib/hotplug/firmware or
+/lib/firmware directory (depending on configuration of firmware hotplug).
diff --git a/Documentation/dvb/udev.txt b/Documentation/dvb/udev.txt
new file mode 100644
index 0000000..68ee224
--- /dev/null
+++ b/Documentation/dvb/udev.txt
@@ -0,0 +1,46 @@
+The DVB subsystem currently registers to the sysfs subsystem using the
+"class_simple" interface.
+
+This means that only the basic informations like module loading parameters
+are presented through sysfs. Other things that might be interesting are
+currently *not* available.
+
+Nevertheless it's now possible to add proper udev rules so that the
+DVB device nodes are created automatically.
+
+We assume that you have udev already up and running and that have been
+creating the DVB device nodes manually up to now due to the missing sysfs
+support.
+
+0. Don't forget to disable your current method of creating the
+device nodes manually.
+
+1. Unfortunately, you'll need a helper script to transform the kernel
+sysfs device name into the well known dvb adapter / device naming scheme.
+The script should be called "dvb.sh" and should be placed into a script
+dir where udev can execute it, most likely /etc/udev/scripts/
+
+So, create a new file /etc/udev/scripts/dvb.sh and add the following:
+------------------------------schnipp------------------------------------------------
+#!/bin/sh
+/bin/echo $1 | /bin/sed -e 's,dvb\([0-9]\)\.\([^0-9]*\)\([0-9]\),dvb/adapter\1/\2\3,'
+------------------------------schnipp------------------------------------------------
+
+Don't forget to make the script executable with "chmod".
+
+1. You need to create a proper udev rule that will create the device nodes
+like you know them. All real distributions out there scan the /etc/udev/rules.d
+directory for rule files. The main udev configuration file /etc/udev/udev.conf
+will tell you the directory where the rules are, most likely it's /etc/udev/rules.d/
+
+Create a new rule file in that directory called "dvb.rule" and add the following line:
+------------------------------schnipp------------------------------------------------
+KERNEL="dvb*", PROGRAM="/etc/udev/scripts/dvb.sh %k", NAME="%c"
+------------------------------schnipp------------------------------------------------
+
+If you want more control over the device nodes (for example a special group membership)
+have a look at "man udev".
+
+For every device that registers to the sysfs subsystem with a "dvb" prefix,
+the helper script /etc/udev/scripts/dvb.sh is invoked, which will then
+create the proper device node in your /dev/ directory.
diff --git a/Documentation/early-userspace/README b/Documentation/early-userspace/README
new file mode 100644
index 0000000..e35d830
--- /dev/null
+++ b/Documentation/early-userspace/README
@@ -0,0 +1,152 @@
+Early userspace support
+=======================
+
+Last update: 2004-12-20 tlh
+
+
+"Early userspace" is a set of libraries and programs that provide
+various pieces of functionality that are important enough to be
+available while a Linux kernel is coming up, but that don't need to be
+run inside the kernel itself.
+
+It consists of several major infrastructure components:
+
+- gen_init_cpio, a program that builds a cpio-format archive
+ containing a root filesystem image. This archive is compressed, and
+ the compressed image is linked into the kernel image.
+- initramfs, a chunk of code that unpacks the compressed cpio image
+ midway through the kernel boot process.
+- klibc, a userspace C library, currently packaged separately, that is
+ optimized for correctness and small size.
+
+The cpio file format used by initramfs is the "newc" (aka "cpio -H newc")
+format, and is documented in the file "buffer-format.txt". There are
+two ways to add an early userspace image: specify an existing cpio
+archive to be used as the image or have the kernel build process build
+the image from specifications.
+
+CPIO ARCHIVE method
+
+You can create a cpio archive that contains the early userspace image.
+Your cpio archive should be specified in CONFIG_INITRAMFS_SOURCE and it
+will be used directly. Only a single cpio file may be specified in
+CONFIG_INITRAMFS_SOURCE and directory and file names are not allowed in
+combination with a cpio archive.
+
+IMAGE BUILDING method
+
+The kernel build process can also build an early userspace image from
+source parts rather than supplying a cpio archive. This method provides
+a way to create images with root-owned files even though the image was
+built by an unprivileged user.
+
+The image is specified as one or more sources in
+CONFIG_INITRAMFS_SOURCE. Sources can be either directories or files -
+cpio archives are *not* allowed when building from sources.
+
+A source directory will have it and all of its contents packaged. The
+specified directory name will be mapped to '/'. When packaging a
+directory, limited user and group ID translation can be performed.
+INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to
+user root (0). INITRAMFS_ROOT_GID can be set to a group ID that needs
+to be mapped to group root (0).
+
+A source file must be directives in the format required by the
+usr/gen_init_cpio utility (run 'usr/gen_init_cpio --help' to get the
+file format). The directives in the file will be passed directly to
+usr/gen_init_cpio.
+
+When a combination of directories and files are specified then the
+initramfs image will be an aggregate of all of them. In this way a user
+can create a 'root-image' directory and install all files into it.
+Because device-special files cannot be created by a unprivileged user,
+special files can be listed in a 'root-files' file. Both 'root-image'
+and 'root-files' can be listed in CONFIG_INITRAMFS_SOURCE and a complete
+early userspace image can be built by an unprivileged user.
+
+As a technical note, when directories and files are specified, the
+entire CONFIG_INITRAMFS_SOURCE is passed to
+scripts/gen_initramfs_list.sh. This means that CONFIG_INITRAMFS_SOURCE
+can really be interpreted as any legal argument to
+gen_initramfs_list.sh. If a directory is specified as an argument then
+the contents are scanned, uid/gid translation is performed, and
+usr/gen_init_cpio file directives are output. If a directory is
+specified as an arugemnt to scripts/gen_initramfs_list.sh then the
+contents of the file are simply copied to the output. All of the output
+directives from directory scanning and file contents copying are
+processed by usr/gen_init_cpio.
+
+See also 'scripts/gen_initramfs_list.sh -h'.
+
+Where's this all leading?
+=========================
+
+The klibc distribution contains some of the necessary software to make
+early userspace useful. The klibc distribution is currently
+maintained separately from the kernel, but this may change early in
+the 2.7 era (it missed the boat for 2.5).
+
+You can obtain somewhat infrequent snapshots of klibc from
+ftp://ftp.kernel.org/pub/linux/libs/klibc/
+
+For active users, you are better off using the klibc git
+repository, at http://git.kernel.org/?p=libs/klibc/klibc.git
+
+The standalone klibc distribution currently provides three components,
+in addition to the klibc library:
+
+- ipconfig, a program that configures network interfaces. It can
+ configure them statically, or use DHCP to obtain information
+ dynamically (aka "IP autoconfiguration").
+- nfsmount, a program that can mount an NFS filesystem.
+- kinit, the "glue" that uses ipconfig and nfsmount to replace the old
+ support for IP autoconfig, mount a filesystem over NFS, and continue
+ system boot using that filesystem as root.
+
+kinit is built as a single statically linked binary to save space.
+
+Eventually, several more chunks of kernel functionality will hopefully
+move to early userspace:
+
+- Almost all of init/do_mounts* (the beginning of this is already in
+ place)
+- ACPI table parsing
+- Insert unwieldy subsystem that doesn't really need to be in kernel
+ space here
+
+If kinit doesn't meet your current needs and you've got bytes to burn,
+the klibc distribution includes a small Bourne-compatible shell (ash)
+and a number of other utilities, so you can replace kinit and build
+custom initramfs images that meet your needs exactly.
+
+For questions and help, you can sign up for the early userspace
+mailing list at http://www.zytor.com/mailman/listinfo/klibc
+
+How does it work?
+=================
+
+The kernel has currently 3 ways to mount the root filesystem:
+
+a) all required device and filesystem drivers compiled into the kernel, no
+ initrd. init/main.c:init() will call prepare_namespace() to mount the
+ final root filesystem, based on the root= option and optional init= to run
+ some other init binary than listed at the end of init/main.c:init().
+
+b) some device and filesystem drivers built as modules and stored in an
+ initrd. The initrd must contain a binary '/linuxrc' which is supposed to
+ load these driver modules. It is also possible to mount the final root
+ filesystem via linuxrc and use the pivot_root syscall. The initrd is
+ mounted and executed via prepare_namespace().
+
+c) using initramfs. The call to prepare_namespace() must be skipped.
+ This means that a binary must do all the work. Said binary can be stored
+ into initramfs either via modifying usr/gen_init_cpio.c or via the new
+ initrd format, an cpio archive. It must be called "/init". This binary
+ is responsible to do all the things prepare_namespace() would do.
+
+ To maintain backwards compatibility, the /init binary will only run if it
+ comes via an initramfs cpio archive. If this is not the case,
+ init/main.c:init() will run prepare_namespace() to mount the final root
+ and exec one of the predefined init binaries.
+
+Bryan O'Sullivan <bos@serpentine.com>
diff --git a/Documentation/early-userspace/buffer-format.txt b/Documentation/early-userspace/buffer-format.txt
new file mode 100644
index 0000000..e1fd7f9
--- /dev/null
+++ b/Documentation/early-userspace/buffer-format.txt
@@ -0,0 +1,112 @@
+ initramfs buffer format
+ -----------------------
+
+ Al Viro, H. Peter Anvin
+ Last revision: 2002-01-13
+
+Starting with kernel 2.5.x, the old "initial ramdisk" protocol is
+getting {replaced/complemented} with the new "initial ramfs"
+(initramfs) protocol. The initramfs contents is passed using the same
+memory buffer protocol used by the initrd protocol, but the contents
+is different. The initramfs buffer contains an archive which is
+expanded into a ramfs filesystem; this document details the format of
+the initramfs buffer format.
+
+The initramfs buffer format is based around the "newc" or "crc" CPIO
+formats, and can be created with the cpio(1) utility. The cpio
+archive can be compressed using gzip(1). One valid version of an
+initramfs buffer is thus a single .cpio.gz file.
+
+The full format of the initramfs buffer is defined by the following
+grammar, where:
+ * is used to indicate "0 or more occurrences of"
+ (|) indicates alternatives
+ + indicates concatenation
+ GZIP() indicates the gzip(1) of the operand
+ ALGN(n) means padding with null bytes to an n-byte boundary
+
+ initramfs := ("\0" | cpio_archive | cpio_gzip_archive)*
+
+ cpio_gzip_archive := GZIP(cpio_archive)
+
+ cpio_archive := cpio_file* + (<nothing> | cpio_trailer)
+
+ cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data
+
+ cpio_trailer := ALGN(4) + cpio_header + "TRAILER!!!\0" + ALGN(4)
+
+
+In human terms, the initramfs buffer contains a collection of
+compressed and/or uncompressed cpio archives (in the "newc" or "crc"
+formats); arbitrary amounts zero bytes (for padding) can be added
+between members.
+
+The cpio "TRAILER!!!" entry (cpio end-of-archive) is optional, but is
+not ignored; see "handling of hard links" below.
+
+The structure of the cpio_header is as follows (all fields contain
+hexadecimal ASCII numbers fully padded with '0' on the left to the
+full width of the field, for example, the integer 4780 is represented
+by the ASCII string "000012ac"):
+
+Field name Field size Meaning
+c_magic 6 bytes The string "070701" or "070702"
+c_ino 8 bytes File inode number
+c_mode 8 bytes File mode and permissions
+c_uid 8 bytes File uid
+c_gid 8 bytes File gid
+c_nlink 8 bytes Number of links
+c_mtime 8 bytes Modification time
+c_filesize 8 bytes Size of data field
+c_maj 8 bytes Major part of file device number
+c_min 8 bytes Minor part of file device number
+c_rmaj 8 bytes Major part of device node reference
+c_rmin 8 bytes Minor part of device node reference
+c_namesize 8 bytes Length of filename, including final \0
+c_chksum 8 bytes Checksum of data field if c_magic is 070702;
+ otherwise zero
+
+The c_mode field matches the contents of st_mode returned by stat(2)
+on Linux, and encodes the file type and file permissions.
+
+The c_filesize should be zero for any file which is not a regular file
+or symlink.
+
+The c_chksum field contains a simple 32-bit unsigned sum of all the
+bytes in the data field. cpio(1) refers to this as "crc", which is
+clearly incorrect (a cyclic redundancy check is a different and
+significantly stronger integrity check), however, this is the
+algorithm used.
+
+If the filename is "TRAILER!!!" this is actually an end-of-archive
+marker; the c_filesize for an end-of-archive marker must be zero.
+
+
+*** Handling of hard links
+
+When a nondirectory with c_nlink > 1 is seen, the (c_maj,c_min,c_ino)
+tuple is looked up in a tuple buffer. If not found, it is entered in
+the tuple buffer and the entry is created as usual; if found, a hard
+link rather than a second copy of the file is created. It is not
+necessary (but permitted) to include a second copy of the file
+contents; if the file contents is not included, the c_filesize field
+should be set to zero to indicate no data section follows. If data is
+present, the previous instance of the file is overwritten; this allows
+the data-carrying instance of a file to occur anywhere in the sequence
+(GNU cpio is reported to attach the data to the last instance of a
+file only.)
+
+c_filesize must not be zero for a symlink.
+
+When a "TRAILER!!!" end-of-archive marker is seen, the tuple buffer is
+reset. This permits archives which are generated independently to be
+concatenated.
+
+To combine file data from different sources (without having to
+regenerate the (c_maj,c_min,c_ino) fields), therefore, either one of
+the following techniques can be used:
+
+a) Separate the different file data sources with a "TRAILER!!!"
+ end-of-archive marker, or
+
+b) Make sure c_nlink == 1 for all nondirectory entries.
diff --git a/Documentation/edac.txt b/Documentation/edac.txt
new file mode 100644
index 0000000..8eda3fb
--- /dev/null
+++ b/Documentation/edac.txt
@@ -0,0 +1,720 @@
+
+
+EDAC - Error Detection And Correction
+
+Written by Doug Thompson <dougthompson@xmission.com>
+7 Dec 2005
+17 Jul 2007 Updated
+
+
+EDAC is maintained and written by:
+
+ Doug Thompson, Dave Jiang, Dave Peterson et al,
+ original author: Thayne Harbaugh,
+
+Contact:
+ website: bluesmoke.sourceforge.net
+ mailing list: bluesmoke-devel@lists.sourceforge.net
+
+"bluesmoke" was the name for this device driver when it was "out-of-tree"
+and maintained at sourceforge.net. When it was pushed into 2.6.16 for the
+first time, it was renamed to 'EDAC'.
+
+The bluesmoke project at sourceforge.net is now utilized as a 'staging area'
+for EDAC development, before it is sent upstream to kernel.org
+
+At the bluesmoke/EDAC project site, is a series of quilt patches against
+recent kernels, stored in a SVN respository. For easier downloading, there
+is also a tarball snapshot available.
+
+============================================================================
+EDAC PURPOSE
+
+The 'edac' kernel module goal is to detect and report errors that occur
+within the computer system running under linux.
+
+MEMORY
+
+In the initial release, memory Correctable Errors (CE) and Uncorrectable
+Errors (UE) are the primary errors being harvested. These types of errors
+are harvested by the 'edac_mc' class of device.
+
+Detecting CE events, then harvesting those events and reporting them,
+CAN be a predictor of future UE events. With CE events, the system can
+continue to operate, but with less safety. Preventive maintenance and
+proactive part replacement of memory DIMMs exhibiting CEs can reduce
+the likelihood of the dreaded UE events and system 'panics'.
+
+NON-MEMORY
+
+A new feature for EDAC, the edac_device class of device, was added in
+the 2.6.23 version of the kernel.
+
+This new device type allows for non-memory type of ECC hardware detectors
+to have their states harvested and presented to userspace via the sysfs
+interface.
+
+Some architectures have ECC detectors for L1, L2 and L3 caches, along with DMA
+engines, fabric switches, main data path switches, interconnections,
+and various other hardware data paths. If the hardware reports it, then
+a edac_device device probably can be constructed to harvest and present
+that to userspace.
+
+
+PCI BUS SCANNING
+
+In addition, PCI Bus Parity and SERR Errors are scanned for on PCI devices
+in order to determine if errors are occurring on data transfers.
+
+The presence of PCI Parity errors must be examined with a grain of salt.
+There are several add-in adapters that do NOT follow the PCI specification
+with regards to Parity generation and reporting. The specification says
+the vendor should tie the parity status bits to 0 if they do not intend
+to generate parity. Some vendors do not do this, and thus the parity bit
+can "float" giving false positives.
+
+In the kernel there is a pci device attribute located in sysfs that is
+checked by the EDAC PCI scanning code. If that attribute is set,
+PCI parity/error scannining is skipped for that device. The attribute
+is:
+
+ broken_parity_status
+
+as is located in /sys/devices/pci<XXX>/0000:XX:YY.Z directorys for
+PCI devices.
+
+FUTURE HARDWARE SCANNING
+
+EDAC will have future error detectors that will be integrated with
+EDAC or added to it, in the following list:
+
+ MCE Machine Check Exception
+ MCA Machine Check Architecture
+ NMI NMI notification of ECC errors
+ MSRs Machine Specific Register error cases
+ and other mechanisms.
+
+These errors are usually bus errors, ECC errors, thermal throttling
+and the like.
+
+
+============================================================================
+EDAC VERSIONING
+
+EDAC is composed of a "core" module (edac_core.ko) and several Memory
+Controller (MC) driver modules. On a given system, the CORE
+is loaded and one MC driver will be loaded. Both the CORE and
+the MC driver (or edac_device driver) have individual versions that reflect
+current release level of their respective modules.
+
+Thus, to "report" on what version a system is running, one must report both
+the CORE's and the MC driver's versions.
+
+
+LOADING
+
+If 'edac' was statically linked with the kernel then no loading is
+necessary. If 'edac' was built as modules then simply modprobe the
+'edac' pieces that you need. You should be able to modprobe
+hardware-specific modules and have the dependencies load the necessary core
+modules.
+
+Example:
+
+$> modprobe amd76x_edac
+
+loads both the amd76x_edac.ko memory controller module and the edac_mc.ko
+core module.
+
+
+============================================================================
+EDAC sysfs INTERFACE
+
+EDAC presents a 'sysfs' interface for control, reporting and attribute
+reporting purposes.
+
+EDAC lives in the /sys/devices/system/edac directory.
+
+Within this directory there currently reside 2 'edac' components:
+
+ mc memory controller(s) system
+ pci PCI control and status system
+
+
+============================================================================
+Memory Controller (mc) Model
+
+First a background on the memory controller's model abstracted in EDAC.
+Each 'mc' device controls a set of DIMM memory modules. These modules are
+laid out in a Chip-Select Row (csrowX) and Channel table (chX). There can
+be multiple csrows and multiple channels.
+
+Memory controllers allow for several csrows, with 8 csrows being a typical value.
+Yet, the actual number of csrows depends on the electrical "loading"
+of a given motherboard, memory controller and DIMM characteristics.
+
+Dual channels allows for 128 bit data transfers to the CPU from memory.
+Some newer chipsets allow for more than 2 channels, like Fully Buffered DIMMs
+(FB-DIMMs). The following example will assume 2 channels:
+
+
+ Channel 0 Channel 1
+ ===================================
+ csrow0 | DIMM_A0 | DIMM_B0 |
+ csrow1 | DIMM_A0 | DIMM_B0 |
+ ===================================
+
+ ===================================
+ csrow2 | DIMM_A1 | DIMM_B1 |
+ csrow3 | DIMM_A1 | DIMM_B1 |
+ ===================================
+
+In the above example table there are 4 physical slots on the motherboard
+for memory DIMMs:
+
+ DIMM_A0
+ DIMM_B0
+ DIMM_A1
+ DIMM_B1
+
+Labels for these slots are usually silk screened on the motherboard. Slots
+labeled 'A' are channel 0 in this example. Slots labeled 'B'
+are channel 1. Notice that there are two csrows possible on a
+physical DIMM. These csrows are allocated their csrow assignment
+based on the slot into which the memory DIMM is placed. Thus, when 1 DIMM
+is placed in each Channel, the csrows cross both DIMMs.
+
+Memory DIMMs come single or dual "ranked". A rank is a populated csrow.
+Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above
+will have 1 csrow, csrow0. csrow1 will be empty. On the other hand,
+when 2 dual ranked DIMMs are similarly placed, then both csrow0 and
+csrow1 will be populated. The pattern repeats itself for csrow2 and
+csrow3.
+
+The representation of the above is reflected in the directory tree
+in EDAC's sysfs interface. Starting in directory
+/sys/devices/system/edac/mc each memory controller will be represented
+by its own 'mcX' directory, where 'X" is the index of the MC.
+
+
+ ..../edac/mc/
+ |
+ |->mc0
+ |->mc1
+ |->mc2
+ ....
+
+Under each 'mcX' directory each 'csrowX' is again represented by a
+'csrowX', where 'X" is the csrow index:
+
+
+ .../mc/mc0/
+ |
+ |->csrow0
+ |->csrow2
+ |->csrow3
+ ....
+
+Notice that there is no csrow1, which indicates that csrow0 is
+composed of a single ranked DIMMs. This should also apply in both
+Channels, in order to have dual-channel mode be operational. Since
+both csrow2 and csrow3 are populated, this indicates a dual ranked
+set of DIMMs for channels 0 and 1.
+
+
+Within each of the 'mcX' and 'csrowX' directories are several
+EDAC control and attribute files.
+
+============================================================================
+'mcX' DIRECTORIES
+
+
+In 'mcX' directories are EDAC control and attribute files for
+this 'X" instance of the memory controllers:
+
+
+Counter reset control file:
+
+ 'reset_counters'
+
+ This write-only control file will zero all the statistical counters
+ for UE and CE errors. Zeroing the counters will also reset the timer
+ indicating how long since the last counter zero. This is useful
+ for computing errors/time. Since the counters are always reset at
+ driver initialization time, no module/kernel parameter is available.
+
+ RUN TIME: echo "anything" >/sys/devices/system/edac/mc/mc0/counter_reset
+
+ This resets the counters on memory controller 0
+
+
+Seconds since last counter reset control file:
+
+ 'seconds_since_reset'
+
+ This attribute file displays how many seconds have elapsed since the
+ last counter reset. This can be used with the error counters to
+ measure error rates.
+
+
+
+Memory Controller name attribute file:
+
+ 'mc_name'
+
+ This attribute file displays the type of memory controller
+ that is being utilized.
+
+
+Total memory managed by this memory controller attribute file:
+
+ 'size_mb'
+
+ This attribute file displays, in count of megabytes, of memory
+ that this instance of memory controller manages.
+
+
+Total Uncorrectable Errors count attribute file:
+
+ 'ue_count'
+
+ This attribute file displays the total count of uncorrectable
+ errors that have occurred on this memory controller. If panic_on_ue
+ is set this counter will not have a chance to increment,
+ since EDAC will panic the system.
+
+
+Total UE count that had no information attribute fileY:
+
+ 'ue_noinfo_count'
+
+ This attribute file displays the number of UEs that
+ have occurred have occurred with no informations as to which DIMM
+ slot is having errors.
+
+
+Total Correctable Errors count attribute file:
+
+ 'ce_count'
+
+ This attribute file displays the total count of correctable
+ errors that have occurred on this memory controller. This
+ count is very important to examine. CEs provide early
+ indications that a DIMM is beginning to fail. This count
+ field should be monitored for non-zero values and report
+ such information to the system administrator.
+
+
+Total Correctable Errors count attribute file:
+
+ 'ce_noinfo_count'
+
+ This attribute file displays the number of CEs that
+ have occurred wherewith no informations as to which DIMM slot
+ is having errors. Memory is handicapped, but operational,
+ yet no information is available to indicate which slot
+ the failing memory is in. This count field should be also
+ be monitored for non-zero values.
+
+Device Symlink:
+
+ 'device'
+
+ Symlink to the memory controller device.
+
+Sdram memory scrubbing rate:
+
+ 'sdram_scrub_rate'
+
+ Read/Write attribute file that controls memory scrubbing. The scrubbing
+ rate is set by writing a minimum bandwidth in bytes/sec to the attribute
+ file. The rate will be translated to an internal value that gives at
+ least the specified rate.
+
+ Reading the file will return the actual scrubbing rate employed.
+
+ If configuration fails or memory scrubbing is not implemented, the value
+ of the attribute file will be -1.
+
+
+
+============================================================================
+'csrowX' DIRECTORIES
+
+In the 'csrowX' directories are EDAC control and attribute files for
+this 'X" instance of csrow:
+
+
+Total Uncorrectable Errors count attribute file:
+
+ 'ue_count'
+
+ This attribute file displays the total count of uncorrectable
+ errors that have occurred on this csrow. If panic_on_ue is set
+ this counter will not have a chance to increment, since EDAC
+ will panic the system.
+
+
+Total Correctable Errors count attribute file:
+
+ 'ce_count'
+
+ This attribute file displays the total count of correctable
+ errors that have occurred on this csrow. This
+ count is very important to examine. CEs provide early
+ indications that a DIMM is beginning to fail. This count
+ field should be monitored for non-zero values and report
+ such information to the system administrator.
+
+
+Total memory managed by this csrow attribute file:
+
+ 'size_mb'
+
+ This attribute file displays, in count of megabytes, of memory
+ that this csrow contains.
+
+
+Memory Type attribute file:
+
+ 'mem_type'
+
+ This attribute file will display what type of memory is currently
+ on this csrow. Normally, either buffered or unbuffered memory.
+ Examples:
+ Registered-DDR
+ Unbuffered-DDR
+
+
+EDAC Mode of operation attribute file:
+
+ 'edac_mode'
+
+ This attribute file will display what type of Error detection
+ and correction is being utilized.
+
+
+Device type attribute file:
+
+ 'dev_type'
+
+ This attribute file will display what type of DRAM device is
+ being utilized on this DIMM.
+ Examples:
+ x1
+ x2
+ x4
+ x8
+
+
+Channel 0 CE Count attribute file:
+
+ 'ch0_ce_count'
+
+ This attribute file will display the count of CEs on this
+ DIMM located in channel 0.
+
+
+Channel 0 UE Count attribute file:
+
+ 'ch0_ue_count'
+
+ This attribute file will display the count of UEs on this
+ DIMM located in channel 0.
+
+
+Channel 0 DIMM Label control file:
+
+ 'ch0_dimm_label'
+
+ This control file allows this DIMM to have a label assigned
+ to it. With this label in the module, when errors occur
+ the output can provide the DIMM label in the system log.
+ This becomes vital for panic events to isolate the
+ cause of the UE event.
+
+ DIMM Labels must be assigned after booting, with information
+ that correctly identifies the physical slot with its
+ silk screen label. This information is currently very
+ motherboard specific and determination of this information
+ must occur in userland at this time.
+
+
+Channel 1 CE Count attribute file:
+
+ 'ch1_ce_count'
+
+ This attribute file will display the count of CEs on this
+ DIMM located in channel 1.
+
+
+Channel 1 UE Count attribute file:
+
+ 'ch1_ue_count'
+
+ This attribute file will display the count of UEs on this
+ DIMM located in channel 0.
+
+
+Channel 1 DIMM Label control file:
+
+ 'ch1_dimm_label'
+
+ This control file allows this DIMM to have a label assigned
+ to it. With this label in the module, when errors occur
+ the output can provide the DIMM label in the system log.
+ This becomes vital for panic events to isolate the
+ cause of the UE event.
+
+ DIMM Labels must be assigned after booting, with information
+ that correctly identifies the physical slot with its
+ silk screen label. This information is currently very
+ motherboard specific and determination of this information
+ must occur in userland at this time.
+
+============================================================================
+SYSTEM LOGGING
+
+If logging for UEs and CEs are enabled then system logs will have
+error notices indicating errors that have been detected:
+
+EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0,
+channel 1 "DIMM_B1": amd76x_edac
+
+EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0,
+channel 1 "DIMM_B1": amd76x_edac
+
+
+The structure of the message is:
+ the memory controller (MC0)
+ Error type (CE)
+ memory page (0x283)
+ offset in the page (0xce0)
+ the byte granularity (grain 8)
+ or resolution of the error
+ the error syndrome (0xb741)
+ memory row (row 0)
+ memory channel (channel 1)
+ DIMM label, if set prior (DIMM B1
+ and then an optional, driver-specific message that may
+ have additional information.
+
+Both UEs and CEs with no info will lack all but memory controller,
+error type, a notice of "no info" and then an optional,
+driver-specific error message.
+
+
+============================================================================
+PCI Bus Parity Detection
+
+
+On Header Type 00 devices the primary status is looked at
+for any parity error regardless of whether Parity is enabled on the
+device. (The spec indicates parity is generated in some cases).
+On Header Type 01 bridges, the secondary status register is also
+looked at to see if parity occurred on the bus on the other side of
+the bridge.
+
+
+SYSFS CONFIGURATION
+
+Under /sys/devices/system/edac/pci are control and attribute files as follows:
+
+
+Enable/Disable PCI Parity checking control file:
+
+ 'check_pci_parity'
+
+
+ This control file enables or disables the PCI Bus Parity scanning
+ operation. Writing a 1 to this file enables the scanning. Writing
+ a 0 to this file disables the scanning.
+
+ Enable:
+ echo "1" >/sys/devices/system/edac/pci/check_pci_parity
+
+ Disable:
+ echo "0" >/sys/devices/system/edac/pci/check_pci_parity
+
+
+Parity Count:
+
+ 'pci_parity_count'
+
+ This attribute file will display the number of parity errors that
+ have been detected.
+
+
+============================================================================
+MODULE PARAMETERS
+
+Panic on UE control file:
+
+ 'edac_mc_panic_on_ue'
+
+ An uncorrectable error will cause a machine panic. This is usually
+ desirable. It is a bad idea to continue when an uncorrectable error
+ occurs - it is indeterminate what was uncorrected and the operating
+ system context might be so mangled that continuing will lead to further
+ corruption. If the kernel has MCE configured, then EDAC will never
+ notice the UE.
+
+ LOAD TIME: module/kernel parameter: edac_mc_panic_on_ue=[0|1]
+
+ RUN TIME: echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue
+
+
+Log UE control file:
+
+ 'edac_mc_log_ue'
+
+ Generate kernel messages describing uncorrectable errors. These errors
+ are reported through the system message log system. UE statistics
+ will be accumulated even when UE logging is disabled.
+
+ LOAD TIME: module/kernel parameter: edac_mc_log_ue=[0|1]
+
+ RUN TIME: echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue
+
+
+Log CE control file:
+
+ 'edac_mc_log_ce'
+
+ Generate kernel messages describing correctable errors. These
+ errors are reported through the system message log system.
+ CE statistics will be accumulated even when CE logging is disabled.
+
+ LOAD TIME: module/kernel parameter: edac_mc_log_ce=[0|1]
+
+ RUN TIME: echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce
+
+
+Polling period control file:
+
+ 'edac_mc_poll_msec'
+
+ The time period, in milliseconds, for polling for error information.
+ Too small a value wastes resources. Too large a value might delay
+ necessary handling of errors and might loose valuable information for
+ locating the error. 1000 milliseconds (once each second) is the current
+ default. Systems which require all the bandwidth they can get, may
+ increase this.
+
+ LOAD TIME: module/kernel parameter: edac_mc_poll_msec=[0|1]
+
+ RUN TIME: echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec
+
+
+Panic on PCI PARITY Error:
+
+ 'panic_on_pci_parity'
+
+
+ This control files enables or disables panicking when a parity
+ error has been detected.
+
+
+ module/kernel parameter: edac_panic_on_pci_pe=[0|1]
+
+ Enable:
+ echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
+
+ Disable:
+ echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe
+
+
+
+=======================================================================
+
+
+EDAC_DEVICE type of device
+
+In the header file, edac_core.h, there is a series of edac_device structures
+and APIs for the EDAC_DEVICE.
+
+User space access to an edac_device is through the sysfs interface.
+
+At the location /sys/devices/system/edac (sysfs) new edac_device devices will
+appear.
+
+There is a three level tree beneath the above 'edac' directory. For example,
+the 'test_device_edac' device (found at the bluesmoke.sourceforget.net website)
+installs itself as:
+
+ /sys/devices/systm/edac/test-instance
+
+in this directory are various controls, a symlink and one or more 'instance'
+directorys.
+
+The standard default controls are:
+
+ log_ce boolean to log CE events
+ log_ue boolean to log UE events
+ panic_on_ue boolean to 'panic' the system if an UE is encountered
+ (default off, can be set true via startup script)
+ poll_msec time period between POLL cycles for events
+
+The test_device_edac device adds at least one of its own custom control:
+
+ test_bits which in the current test driver does nothing but
+ show how it is installed. A ported driver can
+ add one or more such controls and/or attributes
+ for specific uses.
+ One out-of-tree driver uses controls here to allow
+ for ERROR INJECTION operations to hardware
+ injection registers
+
+The symlink points to the 'struct dev' that is registered for this edac_device.
+
+INSTANCES
+
+One or more instance directories are present. For the 'test_device_edac' case:
+
+ test-instance0
+
+
+In this directory there are two default counter attributes, which are totals of
+counter in deeper subdirectories.
+
+ ce_count total of CE events of subdirectories
+ ue_count total of UE events of subdirectories
+
+BLOCKS
+
+At the lowest directory level is the 'block' directory. There can be 0, 1
+or more blocks specified in each instance.
+
+ test-block0
+
+
+In this directory the default attributes are:
+
+ ce_count which is counter of CE events for this 'block'
+ of hardware being monitored
+ ue_count which is counter of UE events for this 'block'
+ of hardware being monitored
+
+
+The 'test_device_edac' device adds 4 attributes and 1 control:
+
+ test-block-bits-0 for every POLL cycle this counter
+ is incremented
+ test-block-bits-1 every 10 cycles, this counter is bumped once,
+ and test-block-bits-0 is set to 0
+ test-block-bits-2 every 100 cycles, this counter is bumped once,
+ and test-block-bits-1 is set to 0
+ test-block-bits-3 every 1000 cycles, this counter is bumped once,
+ and test-block-bits-2 is set to 0
+
+
+ reset-counters writing ANY thing to this control will
+ reset all the above counters.
+
+
+Use of the 'test_device_edac' driver should any others to create their own
+unique drivers for their hardware systems.
+
+The 'test_device_edac' sample driver is located at the
+bluesmoke.sourceforge.net project site for EDAC.
+
diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt
new file mode 100644
index 0000000..60e361b
--- /dev/null
+++ b/Documentation/eisa.txt
@@ -0,0 +1,203 @@
+EISA bus support (Marc Zyngier <maz@wild-wind.fr.eu.org>)
+
+This document groups random notes about porting EISA drivers to the
+new EISA/sysfs API.
+
+Starting from version 2.5.59, the EISA bus is almost given the same
+status as other much more mainstream busses such as PCI or USB. This
+has been possible through sysfs, which defines a nice enough set of
+abstractions to manage busses, devices and drivers.
+
+Although the new API is quite simple to use, converting existing
+drivers to the new infrastructure is not an easy task (mostly because
+detection code is generally also used to probe ISA cards). Moreover,
+most EISA drivers are among the oldest Linux drivers so, as you can
+imagine, some dust has settled here over the years.
+
+The EISA infrastructure is made up of three parts :
+
+ - The bus code implements most of the generic code. It is shared
+ among all the architectures that the EISA code runs on. It
+ implements bus probing (detecting EISA cards available on the bus),
+ allocates I/O resources, allows fancy naming through sysfs, and
+ offers interfaces for driver to register.
+
+ - The bus root driver implements the glue between the bus hardware
+ and the generic bus code. It is responsible for discovering the
+ device implementing the bus, and setting it up to be latter probed
+ by the bus code. This can go from something as simple as reserving
+ an I/O region on x86, to the rather more complex, like the hppa
+ EISA code. This is the part to implement in order to have EISA
+ running on an "new" platform.
+
+ - The driver offers the bus a list of devices that it manages, and
+ implements the necessary callbacks to probe and release devices
+ whenever told to.
+
+Every function/structure below lives in <linux/eisa.h>, which depends
+heavily on <linux/device.h>.
+
+** Bus root driver :
+
+int eisa_root_register (struct eisa_root_device *root);
+
+The eisa_root_register function is used to declare a device as the
+root of an EISA bus. The eisa_root_device structure holds a reference
+to this device, as well as some parameters for probing purposes.
+
+struct eisa_root_device {
+ struct device *dev; /* Pointer to bridge device */
+ struct resource *res;
+ unsigned long bus_base_addr;
+ int slots; /* Max slot number */
+ int force_probe; /* Probe even when no slot 0 */
+ u64 dma_mask; /* from bridge device */
+ int bus_nr; /* Set by eisa_root_register */
+ struct resource eisa_root_res; /* ditto */
+};
+
+node : used for eisa_root_register internal purpose
+dev : pointer to the root device
+res : root device I/O resource
+bus_base_addr : slot 0 address on this bus
+slots : max slot number to probe
+force_probe : Probe even when slot 0 is empty (no EISA mainboard)
+dma_mask : Default DMA mask. Usually the bridge device dma_mask.
+bus_nr : unique bus id, set by eisa_root_register
+
+** Driver :
+
+int eisa_driver_register (struct eisa_driver *edrv);
+void eisa_driver_unregister (struct eisa_driver *edrv);
+
+Clear enough ?
+
+struct eisa_device_id {
+ char sig[EISA_SIG_LEN];
+ unsigned long driver_data;
+};
+
+struct eisa_driver {
+ const struct eisa_device_id *id_table;
+ struct device_driver driver;
+};
+
+id_table : an array of NULL terminated EISA id strings,
+ followed by an empty string. Each string can
+ optionally be paired with a driver-dependant value
+ (driver_data).
+
+driver : a generic driver, such as described in
+ Documentation/driver-model/driver.txt. Only .name,
+ .probe and .remove members are mandatory.
+
+An example is the 3c59x driver :
+
+static struct eisa_device_id vortex_eisa_ids[] = {
+ { "TCM5920", EISA_3C592_OFFSET },
+ { "TCM5970", EISA_3C597_OFFSET },
+ { "" }
+};
+
+static struct eisa_driver vortex_eisa_driver = {
+ .id_table = vortex_eisa_ids,
+ .driver = {
+ .name = "3c59x",
+ .probe = vortex_eisa_probe,
+ .remove = vortex_eisa_remove
+ }
+};
+
+** Device :
+
+The sysfs framework calls .probe and .remove functions upon device
+discovery and removal (note that the .remove function is only called
+when driver is built as a module).
+
+Both functions are passed a pointer to a 'struct device', which is
+encapsulated in a 'struct eisa_device' described as follows :
+
+struct eisa_device {
+ struct eisa_device_id id;
+ int slot;
+ int state;
+ unsigned long base_addr;
+ struct resource res[EISA_MAX_RESOURCES];
+ u64 dma_mask;
+ struct device dev; /* generic device */
+};
+
+id : EISA id, as read from device. id.driver_data is set from the
+ matching driver EISA id.
+slot : slot number which the device was detected on
+state : set of flags indicating the state of the device. Current
+ flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
+res : set of four 256 bytes I/O regions allocated to this device
+dma_mask: DMA mask set from the parent device.
+dev : generic device (see Documentation/driver-model/device.txt)
+
+You can get the 'struct eisa_device' from 'struct device' using the
+'to_eisa_device' macro.
+
+** Misc stuff :
+
+void eisa_set_drvdata (struct eisa_device *edev, void *data);
+
+Stores data into the device's driver_data area.
+
+void *eisa_get_drvdata (struct eisa_device *edev):
+
+Gets the pointer previously stored into the device's driver_data area.
+
+int eisa_get_region_index (void *addr);
+
+Returns the region number (0 <= x < EISA_MAX_RESOURCES) of a given
+address.
+
+** Kernel parameters :
+
+eisa_bus.enable_dev :
+
+A comma-separated list of slots to be enabled, even if the firmware
+set the card as disabled. The driver must be able to properly
+initialize the device in such conditions.
+
+eisa_bus.disable_dev :
+
+A comma-separated list of slots to be enabled, even if the firmware
+set the card as enabled. The driver won't be called to handle this
+device.
+
+virtual_root.force_probe :
+
+Force the probing code to probe EISA slots even when it cannot find an
+EISA compliant mainboard (nothing appears on slot 0). Defaultd to 0
+(don't force), and set to 1 (force probing) when either
+CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
+
+** Random notes :
+
+Converting an EISA driver to the new API mostly involves *deleting*
+code (since probing is now in the core EISA code). Unfortunately, most
+drivers share their probing routine between ISA, MCA and EISA. Special
+care must be taken when ripping out the EISA code, so other busses
+won't suffer from these surgical strikes...
+
+You *must not* expect any EISA device to be detected when returning
+from eisa_driver_register, since the chances are that the bus has not
+yet been probed. In fact, that's what happens most of the time (the
+bus root driver usually kicks in rather late in the boot process).
+Unfortunately, most drivers are doing the probing by themselves, and
+expect to have explored the whole machine when they exit their probe
+routine.
+
+For example, switching your favorite EISA SCSI card to the "hotplug"
+model is "the right thing"(tm).
+
+** Thanks :
+
+I'd like to thank the following people for their help :
+- Xavier Benigni for lending me a wonderful Alpha Jensen,
+- James Bottomley, Jeff Garzik for getting this stuff into the kernel,
+- Andries Brouwer for contributing numerous EISA ids,
+- Catrin Jones for coping with far too many machines at home.
diff --git a/Documentation/email-clients.txt b/Documentation/email-clients.txt
new file mode 100644
index 0000000..a618efa
--- /dev/null
+++ b/Documentation/email-clients.txt
@@ -0,0 +1,241 @@
+Email clients info for Linux
+======================================================================
+
+General Preferences
+----------------------------------------------------------------------
+Patches for the Linux kernel are submitted via email, preferably as
+inline text in the body of the email. Some maintainers accept
+attachments, but then the attachments should have content-type
+"text/plain". However, attachments are generally frowned upon because
+it makes quoting portions of the patch more difficult in the patch
+review process.
+
+Email clients that are used for Linux kernel patches should send the
+patch text untouched. For example, they should not modify or delete tabs
+or spaces, even at the beginning or end of lines.
+
+Don't send patches with "format=flowed". This can cause unexpected
+and unwanted line breaks.
+
+Don't let your email client do automatic word wrapping for you.
+This can also corrupt your patch.
+
+Email clients should not modify the character set encoding of the text.
+Emailed patches should be in ASCII or UTF-8 encoding only.
+If you configure your email client to send emails with UTF-8 encoding,
+you avoid some possible charset problems.
+
+Email clients should generate and maintain References: or In-Reply-To:
+headers so that mail threading is not broken.
+
+Copy-and-paste (or cut-and-paste) usually does not work for patches
+because tabs are converted to spaces. Using xclipboard, xclip, and/or
+xcutsel may work, but it's best to test this for yourself or just avoid
+copy-and-paste.
+
+Don't use PGP/GPG signatures in mail that contains patches.
+This breaks many scripts that read and apply the patches.
+(This should be fixable.)
+
+It's a good idea to send a patch to yourself, save the received message,
+and successfully apply it with 'patch' before sending patches to Linux
+mailing lists.
+
+
+Some email client (MUA) hints
+----------------------------------------------------------------------
+Here are some specific MUA configuration hints for editing and sending
+patches for the Linux kernel. These are not meant to be complete
+software package configuration summaries.
+
+Legend:
+TUI = text-based user interface
+GUI = graphical user interface
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Alpine (TUI)
+
+Config options:
+In the "Sending Preferences" section:
+
+- "Do Not Send Flowed Text" must be enabled
+- "Strip Whitespace Before Sending" must be disabled
+
+When composing the message, the cursor should be placed where the patch
+should appear, and then pressing CTRL-R let you specify the patch file
+to insert into the message.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Evolution (GUI)
+
+Some people use this successfully for patches.
+
+When composing mail select: Preformat
+ from Format->Heading->Preformatted (Ctrl-7)
+ or the toolbar
+
+Then use:
+ Insert->Text File... (Alt-n x)
+to insert the patch.
+
+You can also "diff -Nru old.c new.c | xclip", select Preformat, then
+paste with the middle button.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Kmail (GUI)
+
+Some people use Kmail successfully for patches.
+
+The default setting of not composing in HTML is appropriate; do not
+enable it.
+
+When composing an email, under options, uncheck "word wrap". The only
+disadvantage is any text you type in the email will not be word-wrapped
+so you will have to manually word wrap text before the patch. The easiest
+way around this is to compose your email with word wrap enabled, then save
+it as a draft. Once you pull it up again from your drafts it is now hard
+word-wrapped and you can uncheck "word wrap" without losing the existing
+wrapping.
+
+At the bottom of your email, put the commonly-used patch delimiter before
+inserting your patch: three hyphens (---).
+
+Then from the "Message" menu item, select insert file and choose your patch.
+As an added bonus you can customise the message creation toolbar menu
+and put the "insert file" icon there.
+
+You can safely GPG sign attachments, but inlined text is preferred for
+patches so do not GPG sign them. Signing patches that have been inserted
+as inlined text will make them tricky to extract from their 7-bit encoding.
+
+If you absolutely must send patches as attachments instead of inlining
+them as text, right click on the attachment and select properties, and
+highlight "Suggest automatic display" to make the attachment inlined to
+make it more viewable.
+
+When saving patches that are sent as inlined text, select the email that
+contains the patch from the message list pane, right click and select
+"save as". You can use the whole email unmodified as a patch if it was
+properly composed. There is no option currently to save the email when you
+are actually viewing it in its own window -- there has been a request filed
+at kmail's bugzilla and hopefully this will be addressed. Emails are saved
+as read-write for user only so you will have to chmod them to make them
+group and world readable if you copy them elsewhere.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Lotus Notes (GUI)
+
+Run away from it.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Mutt (TUI)
+
+Plenty of Linux developers use mutt, so it must work pretty well.
+
+Mutt doesn't come with an editor, so whatever editor you use should be
+used in a way that there are no automatic linebreaks. Most editors have
+an "insert file" option that inserts the contents of a file unaltered.
+
+To use 'vim' with mutt:
+ set editor="vi"
+
+ If using xclip, type the command
+ :set paste
+ before middle button or shift-insert or use
+ :r filename
+
+if you want to include the patch inline.
+(a)ttach works fine without "set paste".
+
+Config options:
+It should work with default settings.
+However, it's a good idea to set the "send_charset" to:
+ set send_charset="us-ascii:utf-8"
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Pine (TUI)
+
+Pine has had some whitespace truncation issues in the past, but these
+should all be fixed now.
+
+Use alpine (pine's successor) if you can.
+
+Config options:
+- quell-flowed-text is needed for recent versions
+- the "no-strip-whitespace-before-send" option is needed
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Sylpheed (GUI)
+
+- Works well for inlining text (or using attachments).
+- Allows use of an external editor.
+- Is slow on large folders.
+- Won't do TLS SMTP auth over a non-SSL connection.
+- Has a helpful ruler bar in the compose window.
+- Adding addresses to address book doesn't understand the display name
+ properly.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Thunderbird (GUI)
+
+By default, thunderbird likes to mangle text, but there are ways to
+coerce it into being nice.
+
+- Under account settings, composition and addressing, uncheck "Compose
+ messages in HTML format".
+
+- Edit your Thunderbird config settings to tell it not to wrap lines:
+ user_pref("mailnews.wraplength", 0);
+
+- Edit your Thunderbird config settings so that it won't use format=flowed:
+ user_pref("mailnews.send_plaintext_flowed", false);
+
+- You need to get Thunderbird into preformat mode:
+. If you compose HTML messages by default, it's not too hard. Just select
+ "Preformat" from the drop-down box just under the subject line.
+. If you compose in text by default, you have to tell it to compose a new
+ message in HTML (just as a one-off), and then force it from there back to
+ text, else it will wrap lines. To do this, use shift-click on the Write
+ icon to compose to get HTML compose mode, then select "Preformat" from
+ the drop-down box just under the subject line.
+
+- Allows use of an external editor:
+ The easiest thing to do with Thunderbird and patches is to use an
+ "external editor" extension and then just use your favorite $EDITOR
+ for reading/merging patches into the body text. To do this, download
+ and install the extension, then add a button for it using
+ View->Toolbars->Customize... and finally just click on it when in the
+ Compose dialog.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TkRat (GUI)
+
+Works. Use "Insert file..." or external editor.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Gmail (Web GUI)
+
+If you just have to use Gmail to send patches, it CAN be made to work. It
+requires a bit of external help, though.
+
+The first problem is that Gmail converts tabs to spaces. This will
+totally break your patches. To prevent this, you have to use a different
+editor. There is a firefox extension called "ViewSourceWith"
+(https://addons.mozilla.org/en-US/firefox/addon/394) which allows you to
+edit any text box in the editor of your choice. Configure it to launch
+your favorite editor. When you want to send a patch, use this technique.
+Once you have crafted your messsage + patch, save and exit the editor,
+which should reload the Gmail edit box. GMAIL WILL PRESERVE THE TABS.
+Hoorah. Apparently you can cut-n-paste literal tabs, but Gmail will
+convert those to spaces upon sending!
+
+The second problem is that Gmail converts tabs to spaces on replies. If
+you reply to a patch, don't expect to be able to apply it as a patch.
+
+The last problem is that Gmail will base64-encode any message that has a
+non-ASCII character. That includes things like European names. Be aware.
+
+Gmail is not convenient for lkml patches, but CAN be made to work.
+
+ ###
diff --git a/Documentation/exception.txt b/Documentation/exception.txt
new file mode 100644
index 0000000..2d5aded
--- /dev/null
+++ b/Documentation/exception.txt
@@ -0,0 +1,292 @@
+ Kernel level exception handling in Linux 2.1.8
+ Commentary by Joerg Pommnitz <joerg@raleigh.ibm.com>
+
+When a process runs in kernel mode, it often has to access user
+mode memory whose address has been passed by an untrusted program.
+To protect itself the kernel has to verify this address.
+
+In older versions of Linux this was done with the
+int verify_area(int type, const void * addr, unsigned long size)
+function (which has since been replaced by access_ok()).
+
+This function verified that the memory area starting at address
+'addr' and of size 'size' was accessible for the operation specified
+in type (read or write). To do this, verify_read had to look up the
+virtual memory area (vma) that contained the address addr. In the
+normal case (correctly working program), this test was successful.
+It only failed for a few buggy programs. In some kernel profiling
+tests, this normally unneeded verification used up a considerable
+amount of time.
+
+To overcome this situation, Linus decided to let the virtual memory
+hardware present in every Linux-capable CPU handle this test.
+
+How does this work?
+
+Whenever the kernel tries to access an address that is currently not
+accessible, the CPU generates a page fault exception and calls the
+page fault handler
+
+void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+
+in arch/i386/mm/fault.c. The parameters on the stack are set up by
+the low level assembly glue in arch/i386/kernel/entry.S. The parameter
+regs is a pointer to the saved registers on the stack, error_code
+contains a reason code for the exception.
+
+do_page_fault first obtains the unaccessible address from the CPU
+control register CR2. If the address is within the virtual address
+space of the process, the fault probably occurred, because the page
+was not swapped in, write protected or something similar. However,
+we are interested in the other case: the address is not valid, there
+is no vma that contains this address. In this case, the kernel jumps
+to the bad_area label.
+
+There it uses the address of the instruction that caused the exception
+(i.e. regs->eip) to find an address where the execution can continue
+(fixup). If this search is successful, the fault handler modifies the
+return address (again regs->eip) and returns. The execution will
+continue at the address in fixup.
+
+Where does fixup point to?
+
+Since we jump to the contents of fixup, fixup obviously points
+to executable code. This code is hidden inside the user access macros.
+I have picked the get_user macro defined in include/asm/uaccess.h as an
+example. The definition is somewhat hard to follow, so let's peek at
+the code generated by the preprocessor and the compiler. I selected
+the get_user call in drivers/char/console.c for a detailed examination.
+
+The original code in console.c line 1405:
+ get_user(c, buf);
+
+The preprocessor output (edited to become somewhat readable):
+
+(
+ {
+ long __gu_err = - 14 , __gu_val = 0;
+ const __typeof__(*( ( buf ) )) *__gu_addr = ((buf));
+ if (((((0 + current_set[0])->tss.segment) == 0x18 ) ||
+ (((sizeof(*(buf))) <= 0xC0000000UL) &&
+ ((unsigned long)(__gu_addr ) <= 0xC0000000UL - (sizeof(*(buf)))))))
+ do {
+ __gu_err = 0;
+ switch ((sizeof(*(buf)))) {
+ case 1:
+ __asm__ __volatile__(
+ "1: mov" "b" " %2,%" "b" "1\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl %3,%0\n"
+ " xor" "b" " %" "b" "1,%" "b" "1\n"
+ " jmp 2b\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b,3b\n"
+ ".text" : "=r"(__gu_err), "=q" (__gu_val): "m"((*(struct __large_struct *)
+ ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )) ;
+ break;
+ case 2:
+ __asm__ __volatile__(
+ "1: mov" "w" " %2,%" "w" "1\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl %3,%0\n"
+ " xor" "w" " %" "w" "1,%" "w" "1\n"
+ " jmp 2b\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b,3b\n"
+ ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *)
+ ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err ));
+ break;
+ case 4:
+ __asm__ __volatile__(
+ "1: mov" "l" " %2,%" "" "1\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl %3,%0\n"
+ " xor" "l" " %" "" "1,%" "" "1\n"
+ " jmp 2b\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n" " .long 1b,3b\n"
+ ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *)
+ ( __gu_addr )) ), "i"(- 14 ), "0"(__gu_err));
+ break;
+ default:
+ (__gu_val) = __get_user_bad();
+ }
+ } while (0) ;
+ ((c)) = (__typeof__(*((buf))))__gu_val;
+ __gu_err;
+ }
+);
+
+WOW! Black GCC/assembly magic. This is impossible to follow, so let's
+see what code gcc generates:
+
+ > xorl %edx,%edx
+ > movl current_set,%eax
+ > cmpl $24,788(%eax)
+ > je .L1424
+ > cmpl $-1073741825,64(%esp)
+ > ja .L1423
+ > .L1424:
+ > movl %edx,%eax
+ > movl 64(%esp),%ebx
+ > #APP
+ > 1: movb (%ebx),%dl /* this is the actual user access */
+ > 2:
+ > .section .fixup,"ax"
+ > 3: movl $-14,%eax
+ > xorb %dl,%dl
+ > jmp 2b
+ > .section __ex_table,"a"
+ > .align 4
+ > .long 1b,3b
+ > .text
+ > #NO_APP
+ > .L1423:
+ > movzbl %dl,%esi
+
+The optimizer does a good job and gives us something we can actually
+understand. Can we? The actual user access is quite obvious. Thanks
+to the unified address space we can just access the address in user
+memory. But what does the .section stuff do?????
+
+To understand this we have to look at the final kernel:
+
+ > objdump --section-headers vmlinux
+ >
+ > vmlinux: file format elf32-i386
+ >
+ > Sections:
+ > Idx Name Size VMA LMA File off Algn
+ > 0 .text 00098f40 c0100000 c0100000 00001000 2**4
+ > CONTENTS, ALLOC, LOAD, READONLY, CODE
+ > 1 .fixup 000016bc c0198f40 c0198f40 00099f40 2**0
+ > CONTENTS, ALLOC, LOAD, READONLY, CODE
+ > 2 .rodata 0000f127 c019a5fc c019a5fc 0009b5fc 2**2
+ > CONTENTS, ALLOC, LOAD, READONLY, DATA
+ > 3 __ex_table 000015c0 c01a9724 c01a9724 000aa724 2**2
+ > CONTENTS, ALLOC, LOAD, READONLY, DATA
+ > 4 .data 0000ea58 c01abcf0 c01abcf0 000abcf0 2**4
+ > CONTENTS, ALLOC, LOAD, DATA
+ > 5 .bss 00018e21 c01ba748 c01ba748 000ba748 2**2
+ > ALLOC
+ > 6 .comment 00000ec4 00000000 00000000 000ba748 2**0
+ > CONTENTS, READONLY
+ > 7 .note 00001068 00000ec4 00000ec4 000bb60c 2**0
+ > CONTENTS, READONLY
+
+There are obviously 2 non standard ELF sections in the generated object
+file. But first we want to find out what happened to our code in the
+final kernel executable:
+
+ > objdump --disassemble --section=.text vmlinux
+ >
+ > c017e785 <do_con_write+c1> xorl %edx,%edx
+ > c017e787 <do_con_write+c3> movl 0xc01c7bec,%eax
+ > c017e78c <do_con_write+c8> cmpl $0x18,0x314(%eax)
+ > c017e793 <do_con_write+cf> je c017e79f <do_con_write+db>
+ > c017e795 <do_con_write+d1> cmpl $0xbfffffff,0x40(%esp,1)
+ > c017e79d <do_con_write+d9> ja c017e7a7 <do_con_write+e3>
+ > c017e79f <do_con_write+db> movl %edx,%eax
+ > c017e7a1 <do_con_write+dd> movl 0x40(%esp,1),%ebx
+ > c017e7a5 <do_con_write+e1> movb (%ebx),%dl
+ > c017e7a7 <do_con_write+e3> movzbl %dl,%esi
+
+The whole user memory access is reduced to 10 x86 machine instructions.
+The instructions bracketed in the .section directives are no longer
+in the normal execution path. They are located in a different section
+of the executable file:
+
+ > objdump --disassemble --section=.fixup vmlinux
+ >
+ > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax
+ > c0199ffa <.fixup+10ba> xorb %dl,%dl
+ > c0199ffc <.fixup+10bc> jmp c017e7a7 <do_con_write+e3>
+
+And finally:
+ > objdump --full-contents --section=__ex_table vmlinux
+ >
+ > c01aa7c4 93c017c0 e09f19c0 97c017c0 99c017c0 ................
+ > c01aa7d4 f6c217c0 e99f19c0 a5e717c0 f59f19c0 ................
+ > c01aa7e4 080a18c0 01a019c0 0a0a18c0 04a019c0 ................
+
+or in human readable byte order:
+
+ > c01aa7c4 c017c093 c0199fe0 c017c097 c017c099 ................
+ > c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................
+ ^^^^^^^^^^^^^^^^^
+ this is the interesting part!
+ > c01aa7e4 c0180a08 c019a001 c0180a0a c019a004 ................
+
+What happened? The assembly directives
+
+.section .fixup,"ax"
+.section __ex_table,"a"
+
+told the assembler to move the following code to the specified
+sections in the ELF object file. So the instructions
+3: movl $-14,%eax
+ xorb %dl,%dl
+ jmp 2b
+ended up in the .fixup section of the object file and the addresses
+ .long 1b,3b
+ended up in the __ex_table section of the object file. 1b and 3b
+are local labels. The local label 1b (1b stands for next label 1
+backward) is the address of the instruction that might fault, i.e.
+in our case the address of the label 1 is c017e7a5:
+the original assembly code: > 1: movb (%ebx),%dl
+and linked in vmlinux : > c017e7a5 <do_con_write+e1> movb (%ebx),%dl
+
+The local label 3 (backwards again) is the address of the code to handle
+the fault, in our case the actual value is c0199ff5:
+the original assembly code: > 3: movl $-14,%eax
+and linked in vmlinux : > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax
+
+The assembly code
+ > .section __ex_table,"a"
+ > .align 4
+ > .long 1b,3b
+
+becomes the value pair
+ > c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................
+ ^this is ^this is
+ 1b 3b
+c017e7a5,c0199ff5 in the exception table of the kernel.
+
+So, what actually happens if a fault from kernel mode with no suitable
+vma occurs?
+
+1.) access to invalid address:
+ > c017e7a5 <do_con_write+e1> movb (%ebx),%dl
+2.) MMU generates exception
+3.) CPU calls do_page_fault
+4.) do page fault calls search_exception_table (regs->eip == c017e7a5);
+5.) search_exception_table looks up the address c017e7a5 in the
+ exception table (i.e. the contents of the ELF section __ex_table)
+ and returns the address of the associated fault handle code c0199ff5.
+6.) do_page_fault modifies its own return address to point to the fault
+ handle code and returns.
+7.) execution continues in the fault handling code.
+8.) 8a) EAX becomes -EFAULT (== -14)
+ 8b) DL becomes zero (the value we "read" from user space)
+ 8c) execution continues at local label 2 (address of the
+ instruction immediately after the faulting user access).
+
+The steps 8a to 8c in a certain way emulate the faulting instruction.
+
+That's it, mostly. If you look at our example, you might ask why
+we set EAX to -EFAULT in the exception handler code. Well, the
+get_user macro actually returns a value: 0, if the user access was
+successful, -EFAULT on failure. Our original code did not test this
+return value, however the inline assembly code in get_user tries to
+return -EFAULT. GCC selected EAX to return this value.
+
+NOTE:
+Due to the way that the exception table is built and needs to be ordered,
+only use exceptions for code in the .text section. Any other section
+will cause the exception table to not be sorted correctly, and the
+exceptions will fail.
diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
new file mode 100644
index 0000000..4bc374a
--- /dev/null
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -0,0 +1,237 @@
+Fault injection capabilities infrastructure
+===========================================
+
+See also drivers/md/faulty.c and "every_nth" module option for scsi_debug.
+
+
+Available fault injection capabilities
+--------------------------------------
+
+o failslab
+
+ injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), ...)
+
+o fail_page_alloc
+
+ injects page allocation failures. (alloc_pages(), get_free_pages(), ...)
+
+o fail_make_request
+
+ injects disk IO errors on devices permitted by setting
+ /sys/block/<device>/make-it-fail or
+ /sys/block/<device>/<partition>/make-it-fail. (generic_make_request())
+
+Configure fault-injection capabilities behavior
+-----------------------------------------------
+
+o debugfs entries
+
+fault-inject-debugfs kernel module provides some debugfs entries for runtime
+configuration of fault-injection capabilities.
+
+- /debug/fail*/probability:
+
+ likelihood of failure injection, in percent.
+ Format: <percent>
+
+ Note that one-failure-per-hundred is a very high error rate
+ for some testcases. Consider setting probability=100 and configure
+ /debug/fail*/interval for such testcases.
+
+- /debug/fail*/interval:
+
+ specifies the interval between failures, for calls to
+ should_fail() that pass all the other tests.
+
+ Note that if you enable this, by setting interval>1, you will
+ probably want to set probability=100.
+
+- /debug/fail*/times:
+
+ specifies how many times failures may happen at most.
+ A value of -1 means "no limit".
+
+- /debug/fail*/space:
+
+ specifies an initial resource "budget", decremented by "size"
+ on each call to should_fail(,size). Failure injection is
+ suppressed until "space" reaches zero.
+
+- /debug/fail*/verbose
+
+ Format: { 0 | 1 | 2 }
+ specifies the verbosity of the messages when failure is
+ injected. '0' means no messages; '1' will print only a single
+ log line per failure; '2' will print a call trace too -- useful
+ to debug the problems revealed by fault injection.
+
+- /debug/fail*/task-filter:
+
+ Format: { 'Y' | 'N' }
+ A value of 'N' disables filtering by process (default).
+ Any positive value limits failures to only processes indicated by
+ /proc/<pid>/make-it-fail==1.
+
+- /debug/fail*/require-start:
+- /debug/fail*/require-end:
+- /debug/fail*/reject-start:
+- /debug/fail*/reject-end:
+
+ specifies the range of virtual addresses tested during
+ stacktrace walking. Failure is injected only if some caller
+ in the walked stacktrace lies within the required range, and
+ none lies within the rejected range.
+ Default required range is [0,ULONG_MAX) (whole of virtual address space).
+ Default rejected range is [0,0).
+
+- /debug/fail*/stacktrace-depth:
+
+ specifies the maximum stacktrace depth walked during search
+ for a caller within [require-start,require-end) OR
+ [reject-start,reject-end).
+
+- /debug/fail_page_alloc/ignore-gfp-highmem:
+
+ Format: { 'Y' | 'N' }
+ default is 'N', setting it to 'Y' won't inject failures into
+ highmem/user allocations.
+
+- /debug/failslab/ignore-gfp-wait:
+- /debug/fail_page_alloc/ignore-gfp-wait:
+
+ Format: { 'Y' | 'N' }
+ default is 'N', setting it to 'Y' will inject failures
+ only into non-sleep allocations (GFP_ATOMIC allocations).
+
+- /debug/fail_page_alloc/min-order:
+
+ specifies the minimum page allocation order to be injected
+ failures.
+
+o Boot option
+
+In order to inject faults while debugfs is not available (early boot time),
+use the boot option:
+
+ failslab=
+ fail_page_alloc=
+ fail_make_request=<interval>,<probability>,<space>,<times>
+
+How to add new fault injection capability
+-----------------------------------------
+
+o #include <linux/fault-inject.h>
+
+o define the fault attributes
+
+ DECLARE_FAULT_INJECTION(name);
+
+ Please see the definition of struct fault_attr in fault-inject.h
+ for details.
+
+o provide a way to configure fault attributes
+
+- boot option
+
+ If you need to enable the fault injection capability from boot time, you can
+ provide boot option to configure it. There is a helper function for it:
+
+ setup_fault_attr(attr, str);
+
+- debugfs entries
+
+ failslab, fail_page_alloc, and fail_make_request use this way.
+ Helper functions:
+
+ init_fault_attr_entries(entries, attr, name);
+ void cleanup_fault_attr_entries(entries);
+
+- module parameters
+
+ If the scope of the fault injection capability is limited to a
+ single kernel module, it is better to provide module parameters to
+ configure the fault attributes.
+
+o add a hook to insert failures
+
+ Upon should_fail() returning true, client code should inject a failure.
+
+ should_fail(attr, size);
+
+Application Examples
+--------------------
+
+o Inject slab allocation failures into module init/exit code
+
+#!/bin/bash
+
+FAILTYPE=failslab
+echo Y > /debug/$FAILTYPE/task-filter
+echo 10 > /debug/$FAILTYPE/probability
+echo 100 > /debug/$FAILTYPE/interval
+echo -1 > /debug/$FAILTYPE/times
+echo 0 > /debug/$FAILTYPE/space
+echo 2 > /debug/$FAILTYPE/verbose
+echo 1 > /debug/$FAILTYPE/ignore-gfp-wait
+
+faulty_system()
+{
+ bash -c "echo 1 > /proc/self/make-it-fail && exec $*"
+}
+
+if [ $# -eq 0 ]
+then
+ echo "Usage: $0 modulename [ modulename ... ]"
+ exit 1
+fi
+
+for m in $*
+do
+ echo inserting $m...
+ faulty_system modprobe $m
+
+ echo removing $m...
+ faulty_system modprobe -r $m
+done
+
+------------------------------------------------------------------------------
+
+o Inject page allocation failures only for a specific module
+
+#!/bin/bash
+
+FAILTYPE=fail_page_alloc
+module=$1
+
+if [ -z $module ]
+then
+ echo "Usage: $0 <modulename>"
+ exit 1
+fi
+
+modprobe $module
+
+if [ ! -d /sys/module/$module/sections ]
+then
+ echo Module $module is not loaded
+ exit 1
+fi
+
+cat /sys/module/$module/sections/.text > /debug/$FAILTYPE/require-start
+cat /sys/module/$module/sections/.data > /debug/$FAILTYPE/require-end
+
+echo N > /debug/$FAILTYPE/task-filter
+echo 10 > /debug/$FAILTYPE/probability
+echo 100 > /debug/$FAILTYPE/interval
+echo -1 > /debug/$FAILTYPE/times
+echo 0 > /debug/$FAILTYPE/space
+echo 2 > /debug/$FAILTYPE/verbose
+echo 1 > /debug/$FAILTYPE/ignore-gfp-wait
+echo 1 > /debug/$FAILTYPE/ignore-gfp-highmem
+echo 10 > /debug/$FAILTYPE/stacktrace-depth
+
+trap "echo 0 > /debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
+
+echo "Injecting errors into the module $module... (interrupt to stop)"
+sleep 1000000
+
diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX
new file mode 100644
index 0000000..caabbd3
--- /dev/null
+++ b/Documentation/fb/00-INDEX
@@ -0,0 +1,53 @@
+Index of files in Documentation/fb. If you think something about frame
+buffer devices needs an entry here, needs correction or you've written one
+please mail me.
+ Geert Uytterhoeven <geert@linux-m68k.org>
+
+00-INDEX
+ - this file
+arkfb.txt
+ - info on the fbdev driver for ARK Logic chips.
+aty128fb.txt
+ - info on the ATI Rage128 frame buffer driver.
+cirrusfb.txt
+ - info on the driver for Cirrus Logic chipsets.
+cyblafb/
+ - directory with documentation files related to the cyblafb driver.
+deferred_io.txt
+ - an introduction to deferred IO.
+fbcon.txt
+ - intro to and usage guide for the framebuffer console (fbcon).
+framebuffer.txt
+ - introduction to frame buffer devices.
+imacfb.txt
+ - info on the generic EFI platform driver for Intel based Macs.
+intel810.txt
+ - documentation for the Intel 810/815 framebuffer driver.
+intelfb.txt
+ - docs for Intel 830M/845G/852GM/855GM/865G/915G/945G fb driver.
+internals.txt
+ - quick overview of frame buffer device internals.
+matroxfb.txt
+ - info on the Matrox framebuffer driver for Alpha, Intel and PPC.
+modedb.txt
+ - info on the video mode database.
+matroxfb.txt
+ - info on the Matrox frame buffer driver.
+pvr2fb.txt
+ - info on the PowerVR 2 frame buffer driver.
+pxafb.txt
+ - info on the driver for the PXA25x LCD controller.
+s3fb.txt
+ - info on the fbdev driver for S3 Trio/Virge chips.
+sa1100fb.txt
+ - information about the driver for the SA-1100 LCD controller.
+sisfb.txt
+ - info on the framebuffer device driver for various SiS chips.
+sstfb.txt
+ - info on the frame buffer driver for 3dfx' Voodoo Graphics boards.
+tgafb.txt
+ - info on the TGA (DECChip 21030) frame buffer driver
+vesafb.txt
+ - info on the VESA frame buffer device
+vt8623fb.txt
+ - info on the fb driver for the graphics core in VIA VT8623 chipsets.
diff --git a/Documentation/fb/arkfb.txt b/Documentation/fb/arkfb.txt
new file mode 100644
index 0000000..e8487a9
--- /dev/null
+++ b/Documentation/fb/arkfb.txt
@@ -0,0 +1,68 @@
+
+ arkfb - fbdev driver for ARK Logic chips
+ ========================================
+
+
+Supported Hardware
+==================
+
+ ARK 2000PV chip
+ ICS 5342 ramdac
+
+ - only BIOS initialized VGA devices supported
+ - probably not working on big endian
+
+
+Supported Features
+==================
+
+ * 4 bpp pseudocolor modes (with 18bit palette, two variants)
+ * 8 bpp pseudocolor mode (with 18bit palette)
+ * 16 bpp truecolor modes (RGB 555 and RGB 565)
+ * 24 bpp truecolor mode (RGB 888)
+ * 32 bpp truecolor mode (RGB 888)
+ * text mode (activated by bpp = 0)
+ * doublescan mode variant (not available in text mode)
+ * panning in both directions
+ * suspend/resume support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (i got maximum about 70 MHz, it is dependent on specific
+hardware). This limitation is not enforced by driver. Text mode supports 8bit
+wide fonts only (hardware limitation) and 16bit tall fonts (driver
+limitation). Unfortunately character attributes (like color) in text mode are
+broken for unknown reason, so its usefulness is limited.
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+ * secondary (not initialized by BIOS) device support
+ * big endian support
+ * DPMS support
+ * MMIO support
+ * interlaced mode variant
+ * support for fontwidths != 8 in 4 bpp modes
+ * support for fontheight != 16 in text mode
+ * hardware cursor
+ * vsync synchronization
+ * feature connector support
+ * acceleration support (8514-like 2D)
+
+
+Known bugs
+==========
+
+ * character attributes (and cursor) in text mode are broken
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/fb/aty128fb.txt b/Documentation/fb/aty128fb.txt
new file mode 100644
index 0000000..b605204
--- /dev/null
+++ b/Documentation/fb/aty128fb.txt
@@ -0,0 +1,72 @@
+[This file is cloned from VesaFB/matroxfb]
+
+What is aty128fb?
+=================
+
+This is a driver for a graphic framebuffer for ATI Rage128 based devices
+on Intel and PPC boxes.
+
+Advantages:
+
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+ without using tiny, unreadable fonts.
+ * You can run XF68_FBDev on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode... but you should not notice
+ if you use same resolution as you used in textmode.
+ * still experimental.
+
+
+How to use it?
+==============
+
+Switching modes is done using the video=aty128fb:<resolution>... modedb
+boot parameter or using `fbset' program.
+
+See Documentation/fb/modedb.txt for more information on modedb
+resolutions.
+
+You should compile in both vgacon (to boot if you remove your Rage128 from
+box) and aty128fb (for graphics mode). You should not compile-in vesafb
+unless you have primary display on non-Rage128 VBE2.0 device (see
+Documentation/fb/vesafb.txt for details).
+
+
+X11
+===
+
+XF68_FBDev should generally work fine, but it is non-accelerated. As of
+this document, 8 and 32bpp works fine. There have been palette issues
+when switching from X to console and back to X. You will have to restart
+X to fix this.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to vesafb with
+`video=aty128fb:option1,option2:value2,option3' (multiple options should
+be separated by comma, values are separated from options by `:').
+Accepted options:
+
+noaccel - do not use acceleration engine. It is default.
+accel - use acceleration engine. Not finished.
+vmode:x - chooses PowerMacintosh video mode <x>. Deprecated.
+cmode:x - chooses PowerMacintosh colour mode <x>. Deprecated.
+<XxX@X> - selects startup videomode. See modedb.txt for detailed
+ explanation. Default is 640x480x8bpp.
+
+
+Limitations
+===========
+
+There are known and unknown bugs, features and misfeatures.
+Currently there are following known bugs:
+ + This driver is still experimental and is not finished. Too many
+ bugs/errata to list here.
+
+--
+Brad Douglas <brad@neruo.com>
diff --git a/Documentation/fb/cirrusfb.txt b/Documentation/fb/cirrusfb.txt
new file mode 100644
index 0000000..f943684
--- /dev/null
+++ b/Documentation/fb/cirrusfb.txt
@@ -0,0 +1,97 @@
+
+ Framebuffer driver for Cirrus Logic chipsets
+ Copyright 1999 Jeff Garzik <jgarzik@pobox.com>
+
+
+
+{ just a little something to get people going; contributors welcome! }
+
+
+
+Chip families supported:
+ SD64
+ Piccolo
+ Picasso
+ Spectrum
+ Alpine (GD-543x/4x)
+ Picasso4 (GD-5446)
+ GD-5480
+ Laguna (GD-546x)
+
+Bus's supported:
+ PCI
+ Zorro
+
+Architectures supported:
+ i386
+ Alpha
+ PPC (Motorola Powerstack)
+ m68k (Amiga)
+
+
+
+Default video modes
+-------------------
+At the moment, there are two kernel command line arguments supported:
+
+mode:640x480
+mode:800x600
+ or
+mode:1024x768
+
+Full support for startup video modes (modedb) will be integrated soon.
+
+Version 1.9.9.1
+---------------
+* Fix memory detection for 512kB case
+* 800x600 mode
+* Fixed timings
+* Hint for AXP: Use -accel false -vyres -1 when changing resolution
+
+
+Version 1.9.4.4
+---------------
+* Preliminary Laguna support
+* Overhaul color register routines.
+* Associated with the above, console colors are now obtained from a LUT
+ called 'palette' instead of from the VGA registers. This code was
+ modeled after that in atyfb and matroxfb.
+* Code cleanup, add comments.
+* Overhaul SR07 handling.
+* Bug fixes.
+
+
+Version 1.9.4.3
+---------------
+* Correctly set default startup video mode.
+* Do not override ram size setting. Define
+ CLGEN_USE_HARDCODED_RAM_SETTINGS if you _do_ want to override the RAM
+ setting.
+* Compile fixes related to new 2.3.x IORESOURCE_IO[PORT] symbol changes.
+* Use new 2.3.x resource allocation.
+* Some code cleanup.
+
+
+Version 1.9.4.2
+---------------
+* Casting fixes.
+* Assertions no longer cause an oops on purpose.
+* Bug fixes.
+
+
+Version 1.9.4.1
+---------------
+* Add compatibility support. Now requires a 2.1.x, 2.2.x or 2.3.x kernel.
+
+
+Version 1.9.4
+-------------
+* Several enhancements, smaller memory footprint, a few bugfixes.
+* Requires kernel 2.3.14-pre1 or later.
+
+
+Version 1.9.3
+-------------
+* Bundled with kernel 2.3.14-pre1 or later.
+
+
diff --git a/Documentation/fb/cmap_xfbdev.txt b/Documentation/fb/cmap_xfbdev.txt
new file mode 100644
index 0000000..55e1f0a
--- /dev/null
+++ b/Documentation/fb/cmap_xfbdev.txt
@@ -0,0 +1,53 @@
+Understanding fbdev's cmap
+--------------------------
+
+These notes explain how X's dix layer uses fbdev's cmap structures.
+
+*. example of relevant structures in fbdev as used for a 3-bit grayscale cmap
+struct fb_var_screeninfo {
+ .bits_per_pixel = 8,
+ .grayscale = 1,
+ .red = { 4, 3, 0 },
+ .green = { 0, 0, 0 },
+ .blue = { 0, 0, 0 },
+}
+struct fb_fix_screeninfo {
+ .visual = FB_VISUAL_STATIC_PSEUDOCOLOR,
+}
+for (i = 0; i < 8; i++)
+ info->cmap.red[i] = (((2*i)+1)*(0xFFFF))/16;
+memcpy(info->cmap.green, info->cmap.red, sizeof(u16)*8);
+memcpy(info->cmap.blue, info->cmap.red, sizeof(u16)*8);
+
+*. X11 apps do something like the following when trying to use grayscale.
+for (i=0; i < 8; i++) {
+ char colorspec[64];
+ memset(colorspec,0,64);
+ sprintf(colorspec, "rgb:%x/%x/%x", i*36,i*36,i*36);
+ if (!XParseColor(outputDisplay, testColormap, colorspec, &wantedColor))
+ printf("Can't get color %s\n",colorspec);
+ XAllocColor(outputDisplay, testColormap, &wantedColor);
+ grays[i] = wantedColor;
+}
+There's also named equivalents like gray1..x provided you have an rgb.txt.
+
+Somewhere in X's callchain, this results in a call to X code that handles the
+colormap. For example, Xfbdev hits the following:
+
+xc-011010/programs/Xserver/dix/colormap.c:
+
+FindBestPixel(pentFirst, size, prgb, channel)
+
+dr = (long) pent->co.local.red - prgb->red;
+dg = (long) pent->co.local.green - prgb->green;
+db = (long) pent->co.local.blue - prgb->blue;
+sq = dr * dr;
+UnsignedToBigNum (sq, &sum);
+BigNumAdd (&sum, &temp, &sum);
+
+co.local.red are entries that were brought in through FBIOGETCMAP which come
+directly from the info->cmap.red that was listed above. The prgb is the rgb
+that the app wants to match to. The above code is doing what looks like a least
+squares matching function. That's why the cmap entries can't be set to the left
+hand side boundaries of a color range.
+
diff --git a/Documentation/fb/cyblafb/bugs b/Documentation/fb/cyblafb/bugs
new file mode 100644
index 0000000..9443a6d
--- /dev/null
+++ b/Documentation/fb/cyblafb/bugs
@@ -0,0 +1,13 @@
+Bugs
+====
+
+I currently don't know of any bug. Please do send reports to:
+ - linux-fbdev-devel@lists.sourceforge.net
+ - Knut_Petersen@t-online.de.
+
+
+Untested features
+=================
+
+All LCD stuff is untested. If it worked in tridentfb, it should work in
+cyblafb. Please test and report the results to Knut_Petersen@t-online.de.
diff --git a/Documentation/fb/cyblafb/credits b/Documentation/fb/cyblafb/credits
new file mode 100644
index 0000000..0eb3b44
--- /dev/null
+++ b/Documentation/fb/cyblafb/credits
@@ -0,0 +1,7 @@
+Thanks to
+=========
+ * Alan Hourihane, for writing the X trident driver
+ * Jani Monoses, for writing the tridentfb driver
+ * Antonino A. Daplas, for review of the first published
+ version of cyblafb and some code
+ * Jochen Hein, for testing and a helpfull bug report
diff --git a/Documentation/fb/cyblafb/documentation b/Documentation/fb/cyblafb/documentation
new file mode 100644
index 0000000..bb1aac0
--- /dev/null
+++ b/Documentation/fb/cyblafb/documentation
@@ -0,0 +1,17 @@
+Available Documentation
+=======================
+
+Apollo PLE 133 Chipset VT8601A North Bridge Datasheet, Rev. 1.82, October 22,
+2001, available from VIA:
+
+ http://www.viavpsd.com/product/6/15/DS8601A182.pdf
+
+The datasheet is incomplete, some registers that need to be programmed are not
+explained at all and important bits are listed as "reserved". But you really
+need the datasheet to understand the code. "p. xxx" comments refer to page
+numbers of this document.
+
+XFree/XOrg drivers are available and of good quality, looking at the code
+there is a good idea if the datasheet does not provide enough information
+or if the datasheet seems to be wrong.
+
diff --git a/Documentation/fb/cyblafb/fb.modes b/Documentation/fb/cyblafb/fb.modes
new file mode 100644
index 0000000..fe0e522
--- /dev/null
+++ b/Documentation/fb/cyblafb/fb.modes
@@ -0,0 +1,154 @@
+#
+# Sample fb.modes file
+#
+# Provides an incomplete list of working modes for
+# the cyberblade/i1 graphics core.
+#
+# The value 4294967256 is used instead of -40. Of course, -40 is not
+# a really reasonable value, but chip design does not always follow
+# logic. Believe me, it's ok, and it's the way the BIOS does it.
+#
+# fbset requires 4294967256 in fb.modes and -40 as an argument to
+# the -t parameter. That's also not too reasonable, and it might change
+# in the future or might even be differt for your current version.
+#
+
+mode "640x480-50"
+ geometry 640 480 2048 4096 8
+ timings 47619 4294967256 24 17 0 216 3
+endmode
+
+mode "640x480-60"
+ geometry 640 480 2048 4096 8
+ timings 39682 4294967256 24 17 0 216 3
+endmode
+
+mode "640x480-70"
+ geometry 640 480 2048 4096 8
+ timings 34013 4294967256 24 17 0 216 3
+endmode
+
+mode "640x480-72"
+ geometry 640 480 2048 4096 8
+ timings 33068 4294967256 24 17 0 216 3
+endmode
+
+mode "640x480-75"
+ geometry 640 480 2048 4096 8
+ timings 31746 4294967256 24 17 0 216 3
+endmode
+
+mode "640x480-80"
+ geometry 640 480 2048 4096 8
+ timings 29761 4294967256 24 17 0 216 3
+endmode
+
+mode "640x480-85"
+ geometry 640 480 2048 4096 8
+ timings 28011 4294967256 24 17 0 216 3
+endmode
+
+mode "800x600-50"
+ geometry 800 600 2048 4096 8
+ timings 30303 96 24 14 0 136 11
+endmode
+
+mode "800x600-60"
+ geometry 800 600 2048 4096 8
+ timings 25252 96 24 14 0 136 11
+endmode
+
+mode "800x600-70"
+ geometry 800 600 2048 4096 8
+ timings 21645 96 24 14 0 136 11
+endmode
+
+mode "800x600-72"
+ geometry 800 600 2048 4096 8
+ timings 21043 96 24 14 0 136 11
+endmode
+
+mode "800x600-75"
+ geometry 800 600 2048 4096 8
+ timings 20202 96 24 14 0 136 11
+endmode
+
+mode "800x600-80"
+ geometry 800 600 2048 4096 8
+ timings 18939 96 24 14 0 136 11
+endmode
+
+mode "800x600-85"
+ geometry 800 600 2048 4096 8
+ timings 17825 96 24 14 0 136 11
+endmode
+
+mode "1024x768-50"
+ geometry 1024 768 2048 4096 8
+ timings 19054 144 24 29 0 120 3
+endmode
+
+mode "1024x768-60"
+ geometry 1024 768 2048 4096 8
+ timings 15880 144 24 29 0 120 3
+endmode
+
+mode "1024x768-70"
+ geometry 1024 768 2048 4096 8
+ timings 13610 144 24 29 0 120 3
+endmode
+
+mode "1024x768-72"
+ geometry 1024 768 2048 4096 8
+ timings 13232 144 24 29 0 120 3
+endmode
+
+mode "1024x768-75"
+ geometry 1024 768 2048 4096 8
+ timings 12703 144 24 29 0 120 3
+endmode
+
+mode "1024x768-80"
+ geometry 1024 768 2048 4096 8
+ timings 11910 144 24 29 0 120 3
+endmode
+
+mode "1024x768-85"
+ geometry 1024 768 2048 4096 8
+ timings 11209 144 24 29 0 120 3
+endmode
+
+mode "1280x1024-50"
+ geometry 1280 1024 2048 4096 8
+ timings 11114 232 16 39 0 160 3
+endmode
+
+mode "1280x1024-60"
+ geometry 1280 1024 2048 4096 8
+ timings 9262 232 16 39 0 160 3
+endmode
+
+mode "1280x1024-70"
+ geometry 1280 1024 2048 4096 8
+ timings 7939 232 16 39 0 160 3
+endmode
+
+mode "1280x1024-72"
+ geometry 1280 1024 2048 4096 8
+ timings 7719 232 16 39 0 160 3
+endmode
+
+mode "1280x1024-75"
+ geometry 1280 1024 2048 4096 8
+ timings 7410 232 16 39 0 160 3
+endmode
+
+mode "1280x1024-80"
+ geometry 1280 1024 2048 4096 8
+ timings 6946 232 16 39 0 160 3
+endmode
+
+mode "1280x1024-85"
+ geometry 1280 1024 2048 4096 8
+ timings 6538 232 16 39 0 160 3
+endmode
diff --git a/Documentation/fb/cyblafb/performance b/Documentation/fb/cyblafb/performance
new file mode 100644
index 0000000..8d15d5d
--- /dev/null
+++ b/Documentation/fb/cyblafb/performance
@@ -0,0 +1,79 @@
+Speed
+=====
+
+CyBlaFB is much faster than tridentfb and vesafb. Compare the performance data
+for mode 1280x1024-[8,16,32]@61 Hz.
+
+Test 1: Cat a file with 2000 lines of 0 characters.
+Test 2: Cat a file with 2000 lines of 80 characters.
+Test 3: Cat a file with 2000 lines of 160 characters.
+
+All values show system time use in seconds, kernel 2.6.12 was used for
+the measurements. 2.6.13 is a bit slower, 2.6.14 hopefully will include a
+patch that speeds up kernel bitblitting a lot ( > 20%).
+
++-----------+-----------------------------------------------------+
+| | not accelerated |
+| TRIDENTFB +-----------------+-----------------+-----------------+
+| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
+| | noypan | ypan | noypan | ypan | noypan | ypan |
++-----------+--------+--------+--------+--------+--------+--------+
+| Test 1 | 4.31 | 4.33 | 6.05 | 12.81 | ---- | ---- |
+| Test 2 | 67.94 | 5.44 | 123.16 | 14.79 | ---- | ---- |
+| Test 3 | 131.36 | 6.55 | 240.12 | 16.76 | ---- | ---- |
++-----------+--------+--------+--------+--------+--------+--------+
+| Comments | | | completely bro- |
+| | | | ken, monitor |
+| | | | switches off |
++-----------+-----------------+-----------------+-----------------+
+
+
++-----------+-----------------------------------------------------+
+| | accelerated |
+| TRIDENTFB +-----------------+-----------------+-----------------+
+| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
+| | noypan | ypan | noypan | ypan | noypan | ypan |
++-----------+--------+--------+--------+--------+--------+--------+
+| Test 1 | ---- | ---- | 20.62 | 1.22 | ---- | ---- |
+| Test 2 | ---- | ---- | 22.61 | 3.19 | ---- | ---- |
+| Test 3 | ---- | ---- | 24.59 | 5.16 | ---- | ---- |
++-----------+--------+--------+--------+--------+--------+--------+
+| Comments | broken, writing | broken, ok only | completely bro- |
+| | to wrong places | if bgcolor is | ken, monitor |
+| | on screen + bug | black, bug in | switches off |
+| | in fillrect() | fillrect() | |
++-----------+-----------------+-----------------+-----------------+
+
+
++-----------+-----------------------------------------------------+
+| | not accelerated |
+| VESAFB +-----------------+-----------------+-----------------+
+| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
+| | noypan | ypan | noypan | ypan | noypan | ypan |
++-----------+--------+--------+--------+--------+--------+--------+
+| Test 1 | 4.26 | 3.76 | 5.99 | 7.23 | ---- | ---- |
+| Test 2 | 65.65 | 4.89 | 120.88 | 9.08 | ---- | ---- |
+| Test 3 | 126.91 | 5.94 | 235.77 | 11.03 | ---- | ---- |
++-----------+--------+--------+--------+--------+--------+--------+
+| Comments | vga=0x307 | vga=0x31a | vga=0x31b not |
+| | fh=80kHz | fh=80kHz | supported by |
+| | fv=75kHz | fv=75kHz | video BIOS and |
+| | | | hardware |
++-----------+-----------------+-----------------+-----------------+
+
+
++-----------+-----------------------------------------------------+
+| | accelerated |
+| CYBLAFB +-----------------+-----------------+-----------------+
+| | 8 bpp | 16 bpp | 32 bpp |
+| | noypan | ypan | noypan | ypan | noypan | ypan |
++-----------+--------+--------+--------+--------+--------+--------+
+| Test 1 | 8.02 | 0.23 | 19.04 | 0.61 | 57.12 | 2.74 |
+| Test 2 | 8.38 | 0.55 | 19.39 | 0.92 | 57.54 | 3.13 |
+| Test 3 | 8.73 | 0.86 | 19.74 | 1.24 | 57.95 | 3.51 |
++-----------+--------+--------+--------+--------+--------+--------+
+| Comments | | | |
+| | | | |
+| | | | |
+| | | | |
++-----------+-----------------+-----------------+-----------------+
diff --git a/Documentation/fb/cyblafb/todo b/Documentation/fb/cyblafb/todo
new file mode 100644
index 0000000..c5f6d0e
--- /dev/null
+++ b/Documentation/fb/cyblafb/todo
@@ -0,0 +1,31 @@
+TODO / Missing features
+=======================
+
+Verify LCD stuff "stretch" and "center" options are
+ completely untested ... this code needs to be
+ verified. As I don't have access to such
+ hardware, please contact me if you are
+ willing run some tests.
+
+Interlaced video modes The reason that interleaved
+ modes are disabled is that I do not know
+ the meaning of the vertical interlace
+ parameter. Also the datasheet mentions a
+ bit d8 of a horizontal interlace parameter,
+ but nowhere the lower 8 bits. Please help
+ if you can.
+
+low-res double scan modes Who needs it?
+
+accelerated color blitting Who needs it? The console driver does use color
+ blitting for nothing but drawing the penguine,
+ everything else is done using color expanding
+ blitting of 1bpp character bitmaps.
+
+ioctls Who needs it?
+
+TV-out Will be done later. Use "vga= " at boot time
+ to set a suitable video mode.
+
+??? Feel free to contact me if you have any
+ feature requests
diff --git a/Documentation/fb/cyblafb/usage b/Documentation/fb/cyblafb/usage
new file mode 100644
index 0000000..a39bb3d
--- /dev/null
+++ b/Documentation/fb/cyblafb/usage
@@ -0,0 +1,217 @@
+CyBlaFB is a framebuffer driver for the Cyberblade/i1 graphics core integrated
+into the VIA Apollo PLE133 (aka vt8601) south bridge. It is developed and
+tested using a VIA EPIA 5000 board.
+
+Cyblafb - compiled into the kernel or as a module?
+==================================================
+
+You might compile cyblafb either as a module or compile it permanently into the
+kernel.
+
+Unless you have a real reason to do so you should not compile both vesafb and
+cyblafb permanently into the kernel. It's possible and it helps during the
+developement cycle, but it's useless and will at least block some otherwise
+usefull memory for ordinary users.
+
+Selecting Modes
+===============
+
+ Startup Mode
+ ============
+
+ First of all, you might use the "vga=???" boot parameter as it is
+ documented in vesafb.txt and svga.txt. Cyblafb will detect the video
+ mode selected and will use the geometry and timings found by
+ inspecting the hardware registers.
+
+ video=cyblafb vga=0x317
+
+ Alternatively you might use a combination of the mode, ref and bpp
+ parameters. If you compiled the driver into the kernel, add something
+ like this to the kernel command line:
+
+ video=cyblafb:1280x1024,bpp=16,ref=50 ...
+
+ If you compiled the driver as a module, the same mode would be
+ selected by the following command:
+
+ modprobe cyblafb mode=1280x1024 bpp=16 ref=50 ...
+
+ None of the modes possible to select as startup modes are affected by
+ the problems described at the end of the next subsection.
+
+ For all startup modes cyblafb chooses a virtual x resolution of 2048,
+ the only exception is mode 1280x1024 in combination with 32 bpp. This
+ allows ywrap scrolling for all those modes if rotation is 0 or 2, and
+ also fast scrolling if rotation is 1 or 3. The default virtual y reso-
+ lution is 4096 for bpp == 8, 2048 for bpp==16 and 1024 for bpp == 32,
+ again with the only exception of 1280x1024 at 32 bpp.
+
+ Please do set your video memory size to 8 Mb in the Bios setup. Other
+ values will work, but performace is decreased for a lot of modes.
+
+ Mode changes using fbset
+ ========================
+
+ You might use fbset to change the video mode, see "man fbset". Cyblafb
+ generally does assume that you know what you are doing. But it does
+ some checks, especially those that are needed to prevent you from
+ damaging your hardware.
+
+ - only 8, 16, 24 and 32 bpp video modes are accepted
+ - interlaced video modes are not accepted
+ - double scan video modes are not accepted
+ - if a flat panel is found, cyblafb does not allow you
+ to program a resolution higher than the physical
+ resolution of the flat panel monitor
+ - cyblafb does not allow vclk to exceed 230 MHz. As 32 bpp
+ and (currently) 24 bit modes use a doubled vclk internally,
+ the dotclock limit as seen by fbset is 115 MHz for those
+ modes and 230 MHz for 8 and 16 bpp modes.
+ - cyblafb will allow you to select very high resolutions as
+ long as the hardware can be programmed to these modes. The
+ documented limit 1600x1200 is not enforced, but don't expect
+ perfect signal quality.
+
+ Any request that violates the rules given above will be either changed
+ to something the hardware supports or an error value will be returned.
+
+ If you program a virtual y resolution higher than the hardware limit,
+ cyblafb will silently decrease that value to the highest possible
+ value. The same is true for a virtual x resolution that is not
+ supported by the hardware. Cyblafb tries to adapt vyres first because
+ vxres decides if ywrap scrolling is possible or not.
+
+ Attempts to disable acceleration are ignored, I believe that this is
+ safe.
+
+ Some video modes that should work do not work as expected. If you use
+ the standard fb.modes, fbset 640x480-60 will program that mode, but
+ you will see a vertical area, about two characters wide, with only
+ much darker characters than the other characters on the screen.
+ Cyblafb does allow that mode to be set, as it does not violate the
+ official specifications. It would need a lot of code to reliably sort
+ out all invalid modes, playing around with the margin values will
+ give a valid mode quickly. And if cyblafb would detect such an invalid
+ mode, should it silently alter the requested values or should it
+ report an error? Both options have some pros and cons. As stated
+ above, none of the startup modes are affected, and if you set
+ verbosity to 1 or higher, cyblafb will print the fbset command that
+ would be needed to program that mode using fbset.
+
+
+Other Parameters
+================
+
+
+crt don't autodetect, assume monitor connected to
+ standard VGA connector
+
+fp don't autodetect, assume flat panel display
+ connected to flat panel monitor interface
+
+nativex inform driver about native x resolution of
+ flat panel monitor connected to special
+ interface (should be autodetected)
+
+stretch stretch image to adapt low resolution modes to
+ higer resolutions of flat panel monitors
+ connected to special interface
+
+center center image to adapt low resolution modes to
+ higer resolutions of flat panel monitors
+ connected to special interface
+
+memsize use if autodetected memsize is wrong ...
+ should never be necessary
+
+nopcirr disable PCI read retry
+nopciwr disable PCI write retry
+nopcirb disable PCI read bursts
+nopciwb disable PCI write bursts
+
+bpp bpp for specified modes
+ valid values: 8 || 16 || 24 || 32
+
+ref refresh rate for specified mode
+ valid values: 50 <= ref <= 85
+
+mode 640x480 or 800x600 or 1024x768 or 1280x1024
+ if not specified, the startup mode will be detected
+ and used, so you might also use the vga=??? parameter
+ described in vesafb.txt. If you do not specify a mode,
+ bpp and ref parameters are ignored.
+
+verbosity 0 is the default, increase to at least 2 for every
+ bug report!
+
+Development hints
+=================
+
+It's much faster do compile a module and to load the new version after
+unloading the old module than to compile a new kernel and to reboot. So if you
+try to work on cyblafb, it might be a good idea to use cyblafb as a module.
+In real life, fast often means dangerous, and that's also the case here. If
+you introduce a serious bug when cyblafb is compiled into the kernel, the
+kernel will lock or oops with a high probability before the file system is
+mounted, and the danger for your data is low. If you load a broken own version
+of cyblafb on a running system, the danger for the integrity of the file
+system is much higher as you might need a hard reset afterwards. Decide
+yourself.
+
+Module unloading, the vfb method
+================================
+
+If you want to unload/reload cyblafb using the virtual framebuffer, you need
+to enable vfb support in the kernel first. After that, load the modules as
+shown below:
+
+ modprobe vfb vfb_enable=1
+ modprobe fbcon
+ modprobe cyblafb
+ fbset -fb /dev/fb1 1280x1024-60 -vyres 2662
+ con2fb /dev/fb1 /dev/tty1
+ ...
+
+If you now made some changes to cyblafb and want to reload it, you might do it
+as show below:
+
+ con2fb /dev/fb0 /dev/tty1
+ ...
+ rmmod cyblafb
+ modprobe cyblafb
+ con2fb /dev/fb1 /dev/tty1
+ ...
+
+Of course, you might choose another mode, and most certainly you also want to
+map some other /dev/tty* to the real framebuffer device. You might also choose
+to compile fbcon as a kernel module or place it permanently in the kernel.
+
+I do not know of any way to unload fbcon, and fbcon will prevent the
+framebuffer device loaded first from unloading. [If there is a way, then
+please add a description here!]
+
+Module unloading, the vesafb method
+===================================
+
+Configure the kernel:
+
+ <*> Support for frame buffer devices
+ [*] VESA VGA graphics support
+ <M> Cyberblade/i1 support
+
+Add e.g. "video=vesafb:ypan vga=0x307" to the kernel parameters. The ypan
+parameter is important, choose any vga parameter you like as long as it is
+a graphics mode.
+
+After booting, load cyblafb without any mode and bpp parameter and assign
+cyblafb to individual ttys using con2fb, e.g.:
+
+ modprobe cyblafb
+ con2fb /dev/fb1 /dev/tty1
+
+Unloading cyblafb works without problems after you assign vesafb to all
+ttys again, e.g.:
+
+ con2fb /dev/fb0 /dev/tty1
+ rmmod cyblafb
diff --git a/Documentation/fb/cyblafb/whatsnew b/Documentation/fb/cyblafb/whatsnew
new file mode 100644
index 0000000..76c07a2
--- /dev/null
+++ b/Documentation/fb/cyblafb/whatsnew
@@ -0,0 +1,29 @@
+0.62
+====
+
+ - the vesafb parameter has been removed as I decided to allow the
+ feature without any special parameter.
+
+ - Cyblafb does not use the vga style of panning any longer, now the
+ "right view" register in the graphics engine IO space is used. Without
+ that change it was impossible to use all available memory, and without
+ access to all available memory it is impossible to ywrap.
+
+ - The imageblit function now uses hardware acceleration for all font
+ widths. Hardware blitting across pixel column 2048 is broken in the
+ cyberblade/i1 graphics core, but we work around that hardware bug.
+
+ - modes with vxres != xres are supported now.
+
+ - ywrap scrolling is supported now and the default. This is a big
+ performance gain.
+
+ - default video modes use vyres > yres and vxres > xres to allow
+ almost optimal scrolling speed for normal and rotated screens
+
+ - some features mainly usefull for debugging the upper layers of the
+ framebuffer system have been added, have a look at the code
+
+ - fixed: Oops after unloading cyblafb when reading /proc/io*
+
+ - we work around some bugs of the higher framebuffer layers.
diff --git a/Documentation/fb/cyblafb/whycyblafb b/Documentation/fb/cyblafb/whycyblafb
new file mode 100644
index 0000000..a123bc1
--- /dev/null
+++ b/Documentation/fb/cyblafb/whycyblafb
@@ -0,0 +1,85 @@
+I tried the following framebuffer drivers:
+
+ - TRIDENTFB is full of bugs. Acceleration is broken for Blade3D
+ graphics cores like the cyberblade/i1. It claims to support a great
+ number of devices, but documentation for most of these devices is
+ unfortunately not available. There is _no_ reason to use tridentfb
+ for cyberblade/i1 + CRT users. VESAFB is faster, and the one
+ advantage, mode switching, is broken in tridentfb.
+
+ - VESAFB is used by many distributions as a standard. Vesafb does
+ not support mode switching. VESAFB is a bit faster than the working
+ configurations of TRIDENTFB, but it is still too slow, even if you
+ use ypan.
+
+ - EPIAFB (you'll find it on sourceforge) supports the Cyberblade/i1
+ graphics core, but it still has serious bugs and developement seems
+ to have stopped. This is the one driver with TV-out support. If you
+ do need this feature, try epiafb.
+
+None of these drivers was a real option for me.
+
+I believe that is unreasonable to change code that announces to support 20
+devices if I only have more or less sufficient documentation for exactly one
+of these. The risk of breaking device foo while fixing device bar is too high.
+
+So I decided to start CyBlaFB as a stripped down tridentfb.
+
+All code specific to other Trident chips has been removed. After that there
+were a lot of cosmetic changes to increase the readability of the code. All
+register names were changed to those mnemonics used in the datasheet. Function
+and macro names were changed if they hindered easy understanding of the code.
+
+After that I debugged the code and implemented some new features. I'll try to
+give a little summary of the main changes:
+
+ - calculation of vertical and horizontal timings was fixed
+
+ - video signal quality has been improved dramatically
+
+ - acceleration:
+
+ - fillrect and copyarea were fixed and reenabled
+
+ - color expanding imageblit was newly implemented, color
+ imageblit (only used to draw the penguine) still uses the
+ generic code.
+
+ - init of the acceleration engine was improved and moved to a
+ place where it really works ...
+
+ - sync function has a timeout now and tries to reset and
+ reinit the accel engine if necessary
+
+ - fewer slow copyarea calls when doing ypan scrolling by using
+ undocumented bit d21 of screen start address stored in
+ CR2B[5]. BIOS does use it also, so this should be safe.
+
+ - cyblafb rejects any attempt to set modes that would cause vclk
+ values above reasonable 230 MHz. 32bit modes use a clock
+ multiplicator of 2, so fbset does show the correct values for
+ pixclock but not for vclk in this case. The fbset limit is 115 MHz
+ for 32 bpp modes.
+
+ - cyblafb rejects modes known to be broken or unimplemented (all
+ interlaced modes, all doublescan modes for now)
+
+ - cyblafb now works independant of the video mode in effect at startup
+ time (tridentfb does not init all needed registers to reasonable
+ values)
+
+ - switching between video modes does work reliably now
+
+ - the first video mode now is the one selected on startup using the
+ vga=???? mechanism or any of
+ - 640x480, 800x600, 1024x768, 1280x1024
+ - 8, 16, 24 or 32 bpp
+ - refresh between 50 Hz and 85 Hz, 1 Hz steps (1280x1024-32
+ is limited to 63Hz)
+
+ - pci retry and pci burst mode are settable (try to disable if you
+ experience latency problems)
+
+ - built as a module cyblafb might be unloaded and reloaded using
+ the vfb module and con2vt or might be used together with vesafb
+
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt
new file mode 100644
index 0000000..7483283
--- /dev/null
+++ b/Documentation/fb/deferred_io.txt
@@ -0,0 +1,75 @@
+Deferred IO
+-----------
+
+Deferred IO is a way to delay and repurpose IO. It uses host memory as a
+buffer and the MMU pagefault as a pretrigger for when to perform the device
+IO. The following example may be a useful explanation of how one such setup
+works:
+
+- userspace app like Xfbdev mmaps framebuffer
+- deferred IO and driver sets up fault and page_mkwrite handlers
+- userspace app tries to write to mmaped vaddress
+- we get pagefault and reach fault handler
+- fault handler finds and returns physical page
+- we get page_mkwrite where we add this page to a list
+- schedule a workqueue task to be run after a delay
+- app continues writing to that page with no additional cost. this is
+ the key benefit.
+- the workqueue task comes in and mkcleans the pages on the list, then
+ completes the work associated with updating the framebuffer. this is
+ the real work talking to the device.
+- app tries to write to the address (that has now been mkcleaned)
+- get pagefault and the above sequence occurs again
+
+As can be seen from above, one benefit is roughly to allow bursty framebuffer
+writes to occur at minimum cost. Then after some time when hopefully things
+have gone quiet, we go and really update the framebuffer which would be
+a relatively more expensive operation.
+
+For some types of nonvolatile high latency displays, the desired image is
+the final image rather than the intermediate stages which is why it's okay
+to not update for each write that is occurring.
+
+It may be the case that this is useful in other scenarios as well. Paul Mundt
+has mentioned a case where it is beneficial to use the page count to decide
+whether to coalesce and issue SG DMA or to do memory bursts.
+
+Another one may be if one has a device framebuffer that is in an usual format,
+say diagonally shifting RGB, this may then be a mechanism for you to allow
+apps to pretend to have a normal framebuffer but reswizzle for the device
+framebuffer at vsync time based on the touched pagelist.
+
+How to use it: (for applications)
+---------------------------------
+No changes needed. mmap the framebuffer like normal and just use it.
+
+How to use it: (for fbdev drivers)
+----------------------------------
+The following example may be helpful.
+
+1. Setup your structure. Eg:
+
+static struct fb_deferred_io hecubafb_defio = {
+ .delay = HZ,
+ .deferred_io = hecubafb_dpy_deferred_io,
+};
+
+The delay is the minimum delay between when the page_mkwrite trigger occurs
+and when the deferred_io callback is called. The deferred_io callback is
+explained below.
+
+2. Setup your deferred IO callback. Eg:
+static void hecubafb_dpy_deferred_io(struct fb_info *info,
+ struct list_head *pagelist)
+
+The deferred_io callback is where you would perform all your IO to the display
+device. You receive the pagelist which is the list of pages that were written
+to during the delay. You must not modify this list. This callback is called
+from a workqueue.
+
+3. Call init
+ info->fbdefio = &hecubafb_defio;
+ fb_deferred_io_init(info);
+
+4. Call cleanup
+ fb_deferred_io_cleanup(info);
diff --git a/Documentation/fb/fbcon.txt b/Documentation/fb/fbcon.txt
new file mode 100644
index 0000000..99ea58e
--- /dev/null
+++ b/Documentation/fb/fbcon.txt
@@ -0,0 +1,324 @@
+The Framebuffer Console
+=======================
+
+ The framebuffer console (fbcon), as its name implies, is a text
+console running on top of the framebuffer device. It has the functionality of
+any standard text console driver, such as the VGA console, with the added
+features that can be attributed to the graphical nature of the framebuffer.
+
+ In the x86 architecture, the framebuffer console is optional, and
+some even treat it as a toy. For other architectures, it is the only available
+display device, text or graphical.
+
+ What are the features of fbcon? The framebuffer console supports
+high resolutions, varying font types, display rotation, primitive multihead,
+etc. Theoretically, multi-colored fonts, blending, aliasing, and any feature
+made available by the underlying graphics card are also possible.
+
+A. Configuration
+
+ The framebuffer console can be enabled by using your favorite kernel
+configuration tool. It is under Device Drivers->Graphics Support->Support for
+framebuffer devices->Framebuffer Console Support. Select 'y' to compile
+support statically, or 'm' for module support. The module will be fbcon.
+
+ In order for fbcon to activate, at least one framebuffer driver is
+required, so choose from any of the numerous drivers available. For x86
+systems, they almost universally have VGA cards, so vga16fb and vesafb will
+always be available. However, using a chipset-specific driver will give you
+more speed and features, such as the ability to change the video mode
+dynamically.
+
+ To display the penguin logo, choose any logo available in Logo
+Configuration->Boot up logo.
+
+ Also, you will need to select at least one compiled-in fonts, but if
+you don't do anything, the kernel configuration tool will select one for you,
+usually an 8x16 font.
+
+GOTCHA: A common bug report is enabling the framebuffer without enabling the
+framebuffer console. Depending on the driver, you may get a blanked or
+garbled display, but the system still boots to completion. If you are
+fortunate to have a driver that does not alter the graphics chip, then you
+will still get a VGA console.
+
+B. Loading
+
+Possible scenarios:
+
+1. Driver and fbcon are compiled statically
+
+ Usually, fbcon will automatically take over your console. The notable
+ exception is vesafb. It needs to be explicitly activated with the
+ vga= boot option parameter.
+
+2. Driver is compiled statically, fbcon is compiled as a module
+
+ Depending on the driver, you either get a standard console, or a
+ garbled display, as mentioned above. To get a framebuffer console,
+ do a 'modprobe fbcon'.
+
+3. Driver is compiled as a module, fbcon is compiled statically
+
+ You get your standard console. Once the driver is loaded with
+ 'modprobe xxxfb', fbcon automatically takes over the console with
+ the possible exception of using the fbcon=map:n option. See below.
+
+4. Driver and fbcon are compiled as a module.
+
+ You can load them in any order. Once both are loaded, fbcon will take
+ over the console.
+
+C. Boot options
+
+ The framebuffer console has several, largely unknown, boot options
+ that can change its behavior.
+
+1. fbcon=font:<name>
+
+ Select the initial font to use. The value 'name' can be any of the
+ compiled-in fonts: VGA8x16, 7x14, 10x18, VGA8x8, MINI4x6, RomanLarge,
+ SUN8x16, SUN12x22, ProFont6x11, Acorn8x8, PEARL8x8.
+
+ Note, not all drivers can handle font with widths not divisible by 8,
+ such as vga16fb.
+
+2. fbcon=scrollback:<value>[k]
+
+ The scrollback buffer is memory that is used to preserve display
+ contents that has already scrolled past your view. This is accessed
+ by using the Shift-PageUp key combination. The value 'value' is any
+ integer. It defaults to 32KB. The 'k' suffix is optional, and will
+ multiply the 'value' by 1024.
+
+3. fbcon=map:<0123>
+
+ This is an interesting option. It tells which driver gets mapped to
+ which console. The value '0123' is a sequence that gets repeated until
+ the total length is 64 which is the number of consoles available. In
+ the above example, it is expanded to 012301230123... and the mapping
+ will be:
+
+ tty | 1 2 3 4 5 6 7 8 9 ...
+ fb | 0 1 2 3 0 1 2 3 0 ...
+
+ ('cat /proc/fb' should tell you what the fb numbers are)
+
+ One side effect that may be useful is using a map value that exceeds
+ the number of loaded fb drivers. For example, if only one driver is
+ available, fb0, adding fbcon=map:1 tells fbcon not to take over the
+ console.
+
+ Later on, when you want to map the console the to the framebuffer
+ device, you can use the con2fbmap utility.
+
+4. fbcon=vc:<n1>-<n2>
+
+ This option tells fbcon to take over only a range of consoles as
+ specified by the values 'n1' and 'n2'. The rest of the consoles
+ outside the given range will still be controlled by the standard
+ console driver.
+
+ NOTE: For x86 machines, the standard console is the VGA console which
+ is typically located on the same video card. Thus, the consoles that
+ are controlled by the VGA console will be garbled.
+
+4. fbcon=rotate:<n>
+
+ This option changes the orientation angle of the console display. The
+ value 'n' accepts the following:
+
+ 0 - normal orientation (0 degree)
+ 1 - clockwise orientation (90 degrees)
+ 2 - upside down orientation (180 degrees)
+ 3 - counterclockwise orientation (270 degrees)
+
+ The angle can be changed anytime afterwards by 'echoing' the same
+ numbers to any one of the 2 attributes found in
+ /sys/class/graphics/fbcon
+
+ rotate - rotate the display of the active console
+ rotate_all - rotate the display of all consoles
+
+ Console rotation will only become available if Console Rotation
+ Support is compiled in your kernel.
+
+ NOTE: This is purely console rotation. Any other applications that
+ use the framebuffer will remain at their 'normal'orientation.
+ Actually, the underlying fb driver is totally ignorant of console
+ rotation.
+
+C. Attaching, Detaching and Unloading
+
+Before going on on how to attach, detach and unload the framebuffer console, an
+illustration of the dependencies may help.
+
+The console layer, as with most subsystems, needs a driver that interfaces with
+the hardware. Thus, in a VGA console:
+
+console ---> VGA driver ---> hardware.
+
+Assuming the VGA driver can be unloaded, one must first unbind the VGA driver
+from the console layer before unloading the driver. The VGA driver cannot be
+unloaded if it is still bound to the console layer. (See
+Documentation/console/console.txt for more information).
+
+This is more complicated in the case of the framebuffer console (fbcon),
+because fbcon is an intermediate layer between the console and the drivers:
+
+console ---> fbcon ---> fbdev drivers ---> hardware
+
+The fbdev drivers cannot be unloaded if it's bound to fbcon, and fbcon cannot
+be unloaded if it's bound to the console layer.
+
+So to unload the fbdev drivers, one must first unbind fbcon from the console,
+then unbind the fbdev drivers from fbcon. Fortunately, unbinding fbcon from
+the console layer will automatically unbind framebuffer drivers from
+fbcon. Thus, there is no need to explicitly unbind the fbdev drivers from
+fbcon.
+
+So, how do we unbind fbcon from the console? Part of the answer is in
+Documentation/console/console.txt. To summarize:
+
+Echo a value to the bind file that represents the framebuffer console
+driver. So assuming vtcon1 represents fbcon, then:
+
+echo 1 > sys/class/vtconsole/vtcon1/bind - attach framebuffer console to
+ console layer
+echo 0 > sys/class/vtconsole/vtcon1/bind - detach framebuffer console from
+ console layer
+
+If fbcon is detached from the console layer, your boot console driver (which is
+usually VGA text mode) will take over. A few drivers (rivafb and i810fb) will
+restore VGA text mode for you. With the rest, before detaching fbcon, you
+must take a few additional steps to make sure that your VGA text mode is
+restored properly. The following is one of the several methods that you can do:
+
+1. Download or install vbetool. This utility is included with most
+ distributions nowadays, and is usually part of the suspend/resume tool.
+
+2. In your kernel configuration, ensure that CONFIG_FRAMEBUFFER_CONSOLE is set
+ to 'y' or 'm'. Enable one or more of your favorite framebuffer drivers.
+
+3. Boot into text mode and as root run:
+
+ vbetool vbestate save > <vga state file>
+
+ The above command saves the register contents of your graphics
+ hardware to <vga state file>. You need to do this step only once as
+ the state file can be reused.
+
+4. If fbcon is compiled as a module, load fbcon by doing:
+
+ modprobe fbcon
+
+5. Now to detach fbcon:
+
+ vbetool vbestate restore < <vga state file> && \
+ echo 0 > /sys/class/vtconsole/vtcon1/bind
+
+6. That's it, you're back to VGA mode. And if you compiled fbcon as a module,
+ you can unload it by 'rmmod fbcon'
+
+7. To reattach fbcon:
+
+ echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+8. Once fbcon is unbound, all drivers registered to the system will also
+become unbound. This means that fbcon and individual framebuffer drivers
+can be unloaded or reloaded at will. Reloading the drivers or fbcon will
+automatically bind the console, fbcon and the drivers together. Unloading
+all the drivers without unloading fbcon will make it impossible for the
+console to bind fbcon.
+
+Notes for vesafb users:
+=======================
+
+Unfortunately, if your bootline includes a vga=xxx parameter that sets the
+hardware in graphics mode, such as when loading vesafb, vgacon will not load.
+Instead, vgacon will replace the default boot console with dummycon, and you
+won't get any display after detaching fbcon. Your machine is still alive, so
+you can reattach vesafb. However, to reattach vesafb, you need to do one of
+the following:
+
+Variation 1:
+
+ a. Before detaching fbcon, do
+
+ vbetool vbemode save > <vesa state file> # do once for each vesafb mode,
+ # the file can be reused
+
+ b. Detach fbcon as in step 5.
+
+ c. Attach fbcon
+
+ vbetool vbestate restore < <vesa state file> && \
+ echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+Variation 2:
+
+ a. Before detaching fbcon, do:
+ echo <ID> > /sys/class/tty/console/bind
+
+
+ vbetool vbemode get
+
+ b. Take note of the mode number
+
+ b. Detach fbcon as in step 5.
+
+ c. Attach fbcon:
+
+ vbetool vbemode set <mode number> && \
+ echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+Samples:
+========
+
+Here are 2 sample bash scripts that you can use to bind or unbind the
+framebuffer console driver if you are in an X86 box:
+
+---------------------------------------------------------------------------
+#!/bin/bash
+# Unbind fbcon
+
+# Change this to where your actual vgastate file is located
+# Or Use VGASTATE=$1 to indicate the state file at runtime
+VGASTATE=/tmp/vgastate
+
+# path to vbetool
+VBETOOL=/usr/local/bin
+
+
+for (( i = 0; i < 16; i++))
+do
+ if test -x /sys/class/vtconsole/vtcon$i; then
+ if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \
+ = 1 ]; then
+ if test -x $VBETOOL/vbetool; then
+ echo Unbinding vtcon$i
+ $VBETOOL/vbetool vbestate restore < $VGASTATE
+ echo 0 > /sys/class/vtconsole/vtcon$i/bind
+ fi
+ fi
+ fi
+done
+
+---------------------------------------------------------------------------
+#!/bin/bash
+# Bind fbcon
+
+for (( i = 0; i < 16; i++))
+do
+ if test -x /sys/class/vtconsole/vtcon$i; then
+ if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \
+ = 1 ]; then
+ echo Unbinding vtcon$i
+ echo 1 > /sys/class/vtconsole/vtcon$i/bind
+ fi
+ fi
+done
+---------------------------------------------------------------------------
+
+--
+Antonino Daplas <adaplas@pol.net>
diff --git a/Documentation/fb/framebuffer.txt b/Documentation/fb/framebuffer.txt
new file mode 100644
index 0000000..b3e3a03
--- /dev/null
+++ b/Documentation/fb/framebuffer.txt
@@ -0,0 +1,345 @@
+ The Frame Buffer Device
+ -----------------------
+
+Maintained by Geert Uytterhoeven <geert@linux-m68k.org>
+Last revised: May 10, 2001
+
+
+0. Introduction
+---------------
+
+The frame buffer device provides an abstraction for the graphics hardware. It
+represents the frame buffer of some video hardware and allows application
+software to access the graphics hardware through a well-defined interface, so
+the software doesn't need to know anything about the low-level (hardware
+register) stuff.
+
+The device is accessed through special device nodes, usually located in the
+/dev directory, i.e. /dev/fb*.
+
+
+1. User's View of /dev/fb*
+--------------------------
+
+From the user's point of view, the frame buffer device looks just like any
+other device in /dev. It's a character device using major 29; the minor
+specifies the frame buffer number.
+
+By convention, the following device nodes are used (numbers indicate the device
+minor numbers):
+
+ 0 = /dev/fb0 First frame buffer
+ 1 = /dev/fb1 Second frame buffer
+ ...
+ 31 = /dev/fb31 32nd frame buffer
+
+For backwards compatibility, you may want to create the following symbolic
+links:
+
+ /dev/fb0current -> fb0
+ /dev/fb1current -> fb1
+
+and so on...
+
+The frame buffer devices are also `normal' memory devices, this means, you can
+read and write their contents. You can, for example, make a screen snapshot by
+
+ cp /dev/fb0 myfile
+
+There also can be more than one frame buffer at a time, e.g. if you have a
+graphics card in addition to the built-in hardware. The corresponding frame
+buffer devices (/dev/fb0 and /dev/fb1 etc.) work independently.
+
+Application software that uses the frame buffer device (e.g. the X server) will
+use /dev/fb0 by default (older software uses /dev/fb0current). You can specify
+an alternative frame buffer device by setting the environment variable
+$FRAMEBUFFER to the path name of a frame buffer device, e.g. (for sh/bash
+users):
+
+ export FRAMEBUFFER=/dev/fb1
+
+or (for csh users):
+
+ setenv FRAMEBUFFER /dev/fb1
+
+After this the X server will use the second frame buffer.
+
+
+2. Programmer's View of /dev/fb*
+--------------------------------
+
+As you already know, a frame buffer device is a memory device like /dev/mem and
+it has the same features. You can read it, write it, seek to some location in
+it and mmap() it (the main usage). The difference is just that the memory that
+appears in the special file is not the whole memory, but the frame buffer of
+some video hardware.
+
+/dev/fb* also allows several ioctls on it, by which lots of information about
+the hardware can be queried and set. The color map handling works via ioctls,
+too. Look into <linux/fb.h> for more information on what ioctls exist and on
+which data structures they work. Here's just a brief overview:
+
+ - You can request unchangeable information about the hardware, like name,
+ organization of the screen memory (planes, packed pixels, ...) and address
+ and length of the screen memory.
+
+ - You can request and change variable information about the hardware, like
+ visible and virtual geometry, depth, color map format, timing, and so on.
+ If you try to change that information, the driver maybe will round up some
+ values to meet the hardware's capabilities (or return EINVAL if that isn't
+ possible).
+
+ - You can get and set parts of the color map. Communication is done with 16
+ bits per color part (red, green, blue, transparency) to support all
+ existing hardware. The driver does all the computations needed to apply
+ it to the hardware (round it down to less bits, maybe throw away
+ transparency).
+
+All this hardware abstraction makes the implementation of application programs
+easier and more portable. E.g. the X server works completely on /dev/fb* and
+thus doesn't need to know, for example, how the color registers of the concrete
+hardware are organized. XF68_FBDev is a general X server for bitmapped,
+unaccelerated video hardware. The only thing that has to be built into
+application programs is the screen organization (bitplanes or chunky pixels
+etc.), because it works on the frame buffer image data directly.
+
+For the future it is planned that frame buffer drivers for graphics cards and
+the like can be implemented as kernel modules that are loaded at runtime. Such
+a driver just has to call register_framebuffer() and supply some functions.
+Writing and distributing such drivers independently from the kernel will save
+much trouble...
+
+
+3. Frame Buffer Resolution Maintenance
+--------------------------------------
+
+Frame buffer resolutions are maintained using the utility `fbset'. It can
+change the video mode properties of a frame buffer device. Its main usage is
+to change the current video mode, e.g. during boot up in one of your /etc/rc.*
+or /etc/init.d/* files.
+
+Fbset uses a video mode database stored in a configuration file, so you can
+easily add your own modes and refer to them with a simple identifier.
+
+
+4. The X Server
+---------------
+
+The X server (XF68_FBDev) is the most notable application program for the frame
+buffer device. Starting with XFree86 release 3.2, the X server is part of
+XFree86 and has 2 modes:
+
+ - If the `Display' subsection for the `fbdev' driver in the /etc/XF86Config
+ file contains a
+
+ Modes "default"
+
+ line, the X server will use the scheme discussed above, i.e. it will start
+ up in the resolution determined by /dev/fb0 (or $FRAMEBUFFER, if set). You
+ still have to specify the color depth (using the Depth keyword) and virtual
+ resolution (using the Virtual keyword) though. This is the default for the
+ configuration file supplied with XFree86. It's the most simple
+ configuration, but it has some limitations.
+
+ - Therefore it's also possible to specify resolutions in the /etc/XF86Config
+ file. This allows for on-the-fly resolution switching while retaining the
+ same virtual desktop size. The frame buffer device that's used is still
+ /dev/fb0current (or $FRAMEBUFFER), but the available resolutions are
+ defined by /etc/XF86Config now. The disadvantage is that you have to
+ specify the timings in a different format (but `fbset -x' may help).
+
+To tune a video mode, you can use fbset or xvidtune. Note that xvidtune doesn't
+work 100% with XF68_FBDev: the reported clock values are always incorrect.
+
+
+5. Video Mode Timings
+---------------------
+
+A monitor draws an image on the screen by using an electron beam (3 electron
+beams for color models, 1 electron beam for monochrome monitors). The front of
+the screen is covered by a pattern of colored phosphors (pixels). If a phosphor
+is hit by an electron, it emits a photon and thus becomes visible.
+
+The electron beam draws horizontal lines (scanlines) from left to right, and
+from the top to the bottom of the screen. By modifying the intensity of the
+electron beam, pixels with various colors and intensities can be shown.
+
+After each scanline the electron beam has to move back to the left side of the
+screen and to the next line: this is called the horizontal retrace. After the
+whole screen (frame) was painted, the beam moves back to the upper left corner:
+this is called the vertical retrace. During both the horizontal and vertical
+retrace, the electron beam is turned off (blanked).
+
+The speed at which the electron beam paints the pixels is determined by the
+dotclock in the graphics board. For a dotclock of e.g. 28.37516 MHz (millions
+of cycles per second), each pixel is 35242 ps (picoseconds) long:
+
+ 1/(28.37516E6 Hz) = 35.242E-9 s
+
+If the screen resolution is 640x480, it will take
+
+ 640*35.242E-9 s = 22.555E-6 s
+
+to paint the 640 (xres) pixels on one scanline. But the horizontal retrace
+also takes time (e.g. 272 `pixels'), so a full scanline takes
+
+ (640+272)*35.242E-9 s = 32.141E-6 s
+
+We'll say that the horizontal scanrate is about 31 kHz:
+
+ 1/(32.141E-6 s) = 31.113E3 Hz
+
+A full screen counts 480 (yres) lines, but we have to consider the vertical
+retrace too (e.g. 49 `lines'). So a full screen will take
+
+ (480+49)*32.141E-6 s = 17.002E-3 s
+
+The vertical scanrate is about 59 Hz:
+
+ 1/(17.002E-3 s) = 58.815 Hz
+
+This means the screen data is refreshed about 59 times per second. To have a
+stable picture without visible flicker, VESA recommends a vertical scanrate of
+at least 72 Hz. But the perceived flicker is very human dependent: some people
+can use 50 Hz without any trouble, while I'll notice if it's less than 80 Hz.
+
+Since the monitor doesn't know when a new scanline starts, the graphics board
+will supply a synchronization pulse (horizontal sync or hsync) for each
+scanline. Similarly it supplies a synchronization pulse (vertical sync or
+vsync) for each new frame. The position of the image on the screen is
+influenced by the moments at which the synchronization pulses occur.
+
+The following picture summarizes all timings. The horizontal retrace time is
+the sum of the left margin, the right margin and the hsync length, while the
+vertical retrace time is the sum of the upper margin, the lower margin and the
+vsync length.
+
+ +----------+---------------------------------------------+----------+-------+
+ | | ↑ | | |
+ | | |upper_margin | | |
+ | | ↓ | | |
+ +----------###############################################----------+-------+
+ | # ↑ # | |
+ | # | # | |
+ | # | # | |
+ | # | # | |
+ | left # | # right | hsync |
+ | margin # | xres # margin | len |
+ |<-------->#<---------------+--------------------------->#<-------->|<----->|
+ | # | # | |
+ | # | # | |
+ | # | # | |
+ | # |yres # | |
+ | # | # | |
+ | # | # | |
+ | # | # | |
+ | # | # | |
+ | # | # | |
+ | # | # | |
+ | # | # | |
+ | # | # | |
+ | # ↓ # | |
+ +----------###############################################----------+-------+
+ | | ↑ | | |
+ | | |lower_margin | | |
+ | | ↓ | | |
+ +----------+---------------------------------------------+----------+-------+
+ | | ↑ | | |
+ | | |vsync_len | | |
+ | | ↓ | | |
+ +----------+---------------------------------------------+----------+-------+
+
+The frame buffer device expects all horizontal timings in number of dotclocks
+(in picoseconds, 1E-12 s), and vertical timings in number of scanlines.
+
+
+6. Converting XFree86 timing values info frame buffer device timings
+--------------------------------------------------------------------
+
+An XFree86 mode line consists of the following fields:
+ "800x600" 50 800 856 976 1040 600 637 643 666
+ < name > DCF HR SH1 SH2 HFL VR SV1 SV2 VFL
+
+The frame buffer device uses the following fields:
+
+ - pixclock: pixel clock in ps (pico seconds)
+ - left_margin: time from sync to picture
+ - right_margin: time from picture to sync
+ - upper_margin: time from sync to picture
+ - lower_margin: time from picture to sync
+ - hsync_len: length of horizontal sync
+ - vsync_len: length of vertical sync
+
+1) Pixelclock:
+ xfree: in MHz
+ fb: in picoseconds (ps)
+
+ pixclock = 1000000 / DCF
+
+2) horizontal timings:
+ left_margin = HFL - SH2
+ right_margin = SH1 - HR
+ hsync_len = SH2 - SH1
+
+3) vertical timings:
+ upper_margin = VFL - SV2
+ lower_margin = SV1 - VR
+ vsync_len = SV2 - SV1
+
+Good examples for VESA timings can be found in the XFree86 source tree,
+under "xc/programs/Xserver/hw/xfree86/doc/modeDB.txt".
+
+
+7. References
+-------------
+
+For more specific information about the frame buffer device and its
+applications, please refer to the Linux-fbdev website:
+
+ http://linux-fbdev.sourceforge.net/
+
+and to the following documentation:
+
+ - The manual pages for fbset: fbset(8), fb.modes(5)
+ - The manual pages for XFree86: XF68_FBDev(1), XF86Config(4/5)
+ - The mighty kernel sources:
+ o linux/drivers/video/
+ o linux/include/linux/fb.h
+ o linux/include/video/
+
+
+
+8. Mailing list
+---------------
+
+There are several frame buffer device related mailing lists at SourceForge:
+ - linux-fbdev-announce@lists.sourceforge.net, for announcements,
+ - linux-fbdev-user@lists.sourceforge.net, for generic user support,
+ - linux-fbdev-devel@lists.sourceforge.net, for project developers.
+
+Point your web browser to http://sourceforge.net/projects/linux-fbdev/ for
+subscription information and archive browsing.
+
+
+9. Downloading
+--------------
+
+All necessary files can be found at
+
+ ftp://ftp.uni-erlangen.de/pub/Linux/LOCAL/680x0/
+
+and on its mirrors.
+
+The latest version of fbset can be found at
+
+ http://home.tvd.be/cr26864/Linux/fbdev/
+
+
+10. Credits
+----------
+
+This readme was written by Geert Uytterhoeven, partly based on the original
+`X-framebuffer.README' by Roman Hodek and Martin Schaller. Section 6 was
+provided by Frank Neumann.
+
+The frame buffer device abstraction was designed by Martin Schaller.
diff --git a/Documentation/fb/gxfb.txt b/Documentation/fb/gxfb.txt
new file mode 100644
index 0000000..2f64090
--- /dev/null
+++ b/Documentation/fb/gxfb.txt
@@ -0,0 +1,52 @@
+[This file is cloned from VesaFB/aty128fb]
+
+What is gxfb?
+=================
+
+This is a graphics framebuffer driver for AMD Geode GX2 based processors.
+
+Advantages:
+
+ * No need to use AMD's VSA code (or other VESA emulation layer) in the
+ BIOS.
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+ without using tiny, unreadable fonts.
+ * You can run XF68_FBDev on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode...
+
+
+How to use it?
+==============
+
+Switching modes is done using gxfb.mode_option=<resolution>... boot
+parameter or using `fbset' program.
+
+See Documentation/fb/modedb.txt for more information on modedb
+resolutions.
+
+
+X11
+===
+
+XF68_FBDev should generally work fine, but it is non-accelerated.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to gxfb with gxfb.<option>.
+For example, gxfb.mode_option=800x600@75.
+Accepted options:
+
+mode_option - specify the video mode. Of the form
+ <x>x<y>[-<bpp>][@<refresh>]
+vram - size of video ram (normally auto-detected)
+vt_switch - enable vt switching during suspend/resume. The vt
+ switch is slow, but harmless.
+
+--
+Andres Salomon <dilinger@debian.org>
diff --git a/Documentation/fb/imacfb.txt b/Documentation/fb/imacfb.txt
new file mode 100644
index 0000000..316ec9b
--- /dev/null
+++ b/Documentation/fb/imacfb.txt
@@ -0,0 +1,31 @@
+
+What is imacfb?
+===============
+
+This is a generic EFI platform driver for Intel based Apple computers.
+Imacfb is only for EFI booted Intel Macs.
+
+Supported Hardware
+==================
+
+iMac 17"/20"
+Macbook
+Macbook Pro 15"/17"
+MacMini
+
+How to use it?
+==============
+
+Imacfb does not have any kind of autodetection of your machine.
+You have to add the following kernel parameters in your elilo.conf:
+ Macbook :
+ video=imacfb:macbook
+ MacMini :
+ video=imacfb:mini
+ Macbook Pro 15", iMac 17" :
+ video=imacfb:i17
+ Macbook Pro 17", iMac 20" :
+ video=imacfb:i20
+
+--
+Edgar Hucek <gimli@dark-green.com>
diff --git a/Documentation/fb/intel810.txt b/Documentation/fb/intel810.txt
new file mode 100644
index 0000000..be3e783
--- /dev/null
+++ b/Documentation/fb/intel810.txt
@@ -0,0 +1,278 @@
+Intel 810/815 Framebuffer driver
+ Tony Daplas <adaplas@pol.net>
+ http://i810fb.sourceforge.net
+
+ March 17, 2002
+
+ First Released: July 2001
+ Last Update: September 12, 2005
+================================================================
+
+A. Introduction
+
+ This is a framebuffer driver for various Intel 810/815 compatible
+ graphics devices. These include:
+
+ Intel 810
+ Intel 810E
+ Intel 810-DC100
+ Intel 815 Internal graphics only, 100Mhz FSB
+ Intel 815 Internal graphics only
+ Intel 815 Internal graphics and AGP
+
+B. Features
+
+ - Choice of using Discrete Video Timings, VESA Generalized Timing
+ Formula, or a framebuffer specific database to set the video mode
+
+ - Supports a variable range of horizontal and vertical resolution and
+ vertical refresh rates if the VESA Generalized Timing Formula is
+ enabled.
+
+ - Supports color depths of 8, 16, 24 and 32 bits per pixel
+
+ - Supports pseudocolor, directcolor, or truecolor visuals
+
+ - Full and optimized hardware acceleration at 8, 16 and 24 bpp
+
+ - Robust video state save and restore
+
+ - MTRR support
+
+ - Utilizes user-entered monitor specifications to automatically
+ calculate required video mode parameters.
+
+ - Can concurrently run with xfree86 running with native i810 drivers
+
+ - Hardware Cursor Support
+
+ - Supports EDID probing either by DDC/I2C or through the BIOS
+
+C. List of available options
+
+ a. "video=i810fb"
+ enables the i810 driver
+
+ Recommendation: required
+
+ b. "xres:<value>"
+ select horizontal resolution in pixels. (This parameter will be
+ ignored if 'mode_option' is specified. See 'o' below).
+
+ Recommendation: user preference
+ (default = 640)
+
+ c. "yres:<value>"
+ select vertical resolution in scanlines. If Discrete Video Timings
+ is enabled, this will be ignored and computed as 3*xres/4. (This
+ parameter will be ignored if 'mode_option' is specified. See 'o'
+ below)
+
+ Recommendation: user preference
+ (default = 480)
+
+ d. "vyres:<value>"
+ select virtual vertical resolution in scanlines. If (0) or none
+ is specified, this will be computed against maximum available memory.
+
+ Recommendation: do not set
+ (default = 480)
+
+ e. "vram:<value>"
+ select amount of system RAM in MB to allocate for the video memory
+
+ Recommendation: 1 - 4 MB.
+ (default = 4)
+
+ f. "bpp:<value>"
+ select desired pixel depth
+
+ Recommendation: 8
+ (default = 8)
+
+ g. "hsync1/hsync2:<value>"
+ select the minimum and maximum Horizontal Sync Frequency of the
+ monitor in kHz. If using a fixed frequency monitor, hsync1 must
+ be equal to hsync2. If EDID probing is successful, these will be
+ ignored and values will be taken from the EDID block.
+
+ Recommendation: check monitor manual for correct values
+ (default = 29/30)
+
+ h. "vsync1/vsync2:<value>"
+ select the minimum and maximum Vertical Sync Frequency of the monitor
+ in Hz. You can also use this option to lock your monitor's refresh
+ rate. If EDID probing is successful, these will be ignored and values
+ will be taken from the EDID block.
+
+ Recommendation: check monitor manual for correct values
+ (default = 60/60)
+
+ IMPORTANT: If you need to clamp your timings, try to give some
+ leeway for computational errors (over/underflows). Example: if
+ using vsync1/vsync2 = 60/60, make sure hsync1/hsync2 has at least
+ a 1 unit difference, and vice versa.
+
+ i. "voffset:<value>"
+ select at what offset in MB of the logical memory to allocate the
+ framebuffer memory. The intent is to avoid the memory blocks
+ used by standard graphics applications (XFree86). The default
+ offset (16 MB for a 64 MB aperture, 8 MB for a 32 MB aperture) will
+ avoid XFree86's usage and allows up to 7 MB/15 MB of framebuffer
+ memory. Depending on your usage, adjust the value up or down
+ (0 for maximum usage, 31/63 MB for the least amount). Note, an
+ arbitrary setting may conflict with XFree86.
+
+ Recommendation: do not set
+ (default = 8 or 16 MB)
+
+ j. "accel"
+ enable text acceleration. This can be enabled/reenabled anytime
+ by using 'fbset -accel true/false'.
+
+ Recommendation: enable
+ (default = not set)
+
+ k. "mtrr"
+ enable MTRR. This allows data transfers to the framebuffer memory
+ to occur in bursts which can significantly increase performance.
+ Not very helpful with the i810/i815 because of 'shared memory'.
+
+ Recommendation: do not set
+ (default = not set)
+
+ l. "extvga"
+ if specified, secondary/external VGA output will always be enabled.
+ Useful if the BIOS turns off the VGA port when no monitor is attached.
+ The external VGA monitor can then be attached without rebooting.
+
+ Recommendation: do not set
+ (default = not set)
+
+ m. "sync"
+ Forces the hardware engine to do a "sync" or wait for the hardware
+ to finish before starting another instruction. This will produce a
+ more stable setup, but will be slower.
+
+ Recommendation: do not set
+ (default = not set)
+
+ n. "dcolor"
+ Use directcolor visual instead of truecolor for pixel depths greater
+ than 8 bpp. Useful for color tuning, such as gamma control.
+
+ Recommendation: do not set
+ (default = not set)
+
+ o. <xres>x<yres>[-<bpp>][@<refresh>]
+ The driver will now accept specification of boot mode option. If this
+ is specified, the options 'xres' and 'yres' will be ignored. See
+ Documentation/fb/modedb.txt for usage.
+
+D. Kernel booting
+
+Separate each option/option-pair by commas (,) and the option from its value
+with a colon (:) as in the following:
+
+video=i810fb:option1,option2:value2
+
+Sample Usage
+------------
+
+In /etc/lilo.conf, add the line:
+
+append="video=i810fb:vram:2,xres:1024,yres:768,bpp:8,hsync1:30,hsync2:55, \
+ vsync1:50,vsync2:85,accel,mtrr"
+
+This will initialize the framebuffer to 1024x768 at 8bpp. The framebuffer
+will use 2 MB of System RAM. MTRR support will be enabled. The refresh rate
+will be computed based on the hsync1/hsync2 and vsync1/vsync2 values.
+
+IMPORTANT:
+You must include hsync1, hsync2, vsync1 and vsync2 to enable video modes
+better than 640x480 at 60Hz. HOWEVER, if your chipset/display combination
+supports I2C and has an EDID block, you can safely exclude hsync1, hsync2,
+vsync1 and vsync2 parameters. These parameters will be taken from the EDID
+block.
+
+E. Module options
+
+The module parameters are essentially similar to the kernel
+parameters. The main difference is that you need to include a Boolean value
+(1 for TRUE, and 0 for FALSE) for those options which don't need a value.
+
+Example, to enable MTRR, include "mtrr=1".
+
+Sample Usage
+------------
+
+Using the same setup as described above, load the module like this:
+
+ modprobe i810fb vram=2 xres=1024 bpp=8 hsync1=30 hsync2=55 vsync1=50 \
+ vsync2=85 accel=1 mtrr=1
+
+Or just add the following to /etc/modprobe.conf
+
+ options i810fb vram=2 xres=1024 bpp=16 hsync1=30 hsync2=55 vsync1=50 \
+ vsync2=85 accel=1 mtrr=1
+
+and just do a
+
+ modprobe i810fb
+
+
+F. Setup
+
+ a. Do your usual method of configuring the kernel.
+
+ make menuconfig/xconfig/config
+
+ b. Under "Code maturity level options" enable "Prompt for development
+ and/or incomplete code/drivers".
+
+ c. Enable agpgart support for the Intel 810/815 on-board graphics.
+ This is required. The option is under "Character Devices".
+
+ d. Under "Graphics Support", select "Intel 810/815" either statically
+ or as a module. Choose "use VESA Generalized Timing Formula" if
+ you need to maximize the capability of your display. To be on the
+ safe side, you can leave this unselected.
+
+ e. If you want support for DDC/I2C probing (Plug and Play Displays),
+ set 'Enable DDC Support' to 'y'. To make this option appear, set
+ 'use VESA Generalized Timing Formula' to 'y'.
+
+ f. If you want a framebuffer console, enable it under "Console
+ Drivers".
+
+ g. Compile your kernel.
+
+ h. Load the driver as described in sections D and E.
+
+ i. Try the DirectFB (http://www.directfb.org) + the i810 gfxdriver
+ patch to see the chipset in action (or inaction :-).
+
+G. Acknowledgment:
+
+ 1. Geert Uytterhoeven - his excellent howto and the virtual
+ framebuffer driver code made this possible.
+
+ 2. Jeff Hartmann for his agpgart code.
+
+ 3. The X developers. Insights were provided just by reading the
+ XFree86 source code.
+
+ 4. Intel(c). For this value-oriented chipset driver and for
+ providing documentation.
+
+ 5. Matt Sottek. His inputs and ideas helped in making some
+ optimizations possible.
+
+H. Home Page:
+
+ A more complete, and probably updated information is provided at
+ http://i810fb.sourceforge.net.
+
+###########################
+Tony
+
diff --git a/Documentation/fb/intelfb.txt b/Documentation/fb/intelfb.txt
new file mode 100644
index 0000000..dd9e944
--- /dev/null
+++ b/Documentation/fb/intelfb.txt
@@ -0,0 +1,149 @@
+Intel 830M/845G/852GM/855GM/865G/915G/945G Framebuffer driver
+================================================================
+
+A. Introduction
+ This is a framebuffer driver for various Intel 8xx/9xx compatible
+graphics devices. These would include:
+
+ Intel 830M
+ Intel 845G
+ Intel 852GM
+ Intel 855GM
+ Intel 865G
+ Intel 915G
+ Intel 915GM
+ Intel 945G
+ Intel 945GM
+ Intel 945GME
+ Intel 965G
+ Intel 965GM
+
+B. List of available options
+
+ a. "video=intelfb"
+ enables the intelfb driver
+
+ Recommendation: required
+
+ b. "mode=<xres>x<yres>[-<bpp>][@<refresh>]"
+ select mode
+
+ Recommendation: user preference
+ (default = 1024x768-32@70)
+
+ c. "vram=<value>"
+ select amount of system RAM in MB to allocate for the video memory
+ if not enough RAM was already allocated by the BIOS.
+
+ Recommendation: 1 - 4 MB.
+ (default = 4 MB)
+
+ d. "voffset=<value>"
+ select at what offset in MB of the logical memory to allocate the
+ framebuffer memory. The intent is to avoid the memory blocks
+ used by standard graphics applications (XFree86). Depending on your
+ usage, adjust the value up or down, (0 for maximum usage, 63/127 MB
+ for the least amount). Note, an arbitrary setting may conflict
+ with XFree86.
+
+ Recommendation: do not set
+ (default = 48 MB)
+
+ e. "accel"
+ enable text acceleration. This can be enabled/reenabled anytime
+ by using 'fbset -accel true/false'.
+
+ Recommendation: enable
+ (default = set)
+
+ f. "hwcursor"
+ enable cursor acceleration.
+
+ Recommendation: enable
+ (default = set)
+
+ g. "mtrr"
+ enable MTRR. This allows data transfers to the framebuffer memory
+ to occur in bursts which can significantly increase performance.
+ Not very helpful with the intel chips because of 'shared memory'.
+
+ Recommendation: set
+ (default = set)
+
+ h. "fixed"
+ disable mode switching.
+
+ Recommendation: do not set
+ (default = not set)
+
+ The binary parameters can be unset with a "no" prefix, example "noaccel".
+ The default parameter (not named) is the mode.
+
+C. Kernel booting
+
+Separate each option/option-pair by commas (,) and the option from its value
+with an equals sign (=) as in the following:
+
+video=intelfb:option1,option2=value2
+
+Sample Usage
+------------
+
+In /etc/lilo.conf, add the line:
+
+append="video=intelfb:mode=800x600-32@75,accel,hwcursor,vram=8"
+
+This will initialize the framebuffer to 800x600 at 32bpp and 75Hz. The
+framebuffer will use 8 MB of System RAM. hw acceleration of text and cursor
+will be enabled.
+
+Remarks
+-------
+
+If setting this parameter doesn't work (you stay in a 80x25 text-mode),
+you might need to set the "vga=<mode>" parameter too - see vesafb.txt
+in this directory.
+
+
+D. Module options
+
+ The module parameters are essentially similar to the kernel
+parameters. The main difference is that you need to include a Boolean value
+(1 for TRUE, and 0 for FALSE) for those options which don't need a value.
+
+Example, to enable MTRR, include "mtrr=1".
+
+Sample Usage
+------------
+
+Using the same setup as described above, load the module like this:
+
+ modprobe intelfb mode=800x600-32@75 vram=8 accel=1 hwcursor=1
+
+Or just add the following to /etc/modprobe.conf
+
+ options intelfb mode=800x600-32@75 vram=8 accel=1 hwcursor=1
+
+and just do a
+
+ modprobe intelfb
+
+
+E. Acknowledgment:
+
+ 1. Geert Uytterhoeven - his excellent howto and the virtual
+ framebuffer driver code made this possible.
+
+ 2. Jeff Hartmann for his agpgart code.
+
+ 3. David Dawes for his original kernel 2.4 code.
+
+ 4. The X developers. Insights were provided just by reading the
+ XFree86 source code.
+
+ 5. Antonino A. Daplas for his inspiring i810fb driver.
+
+ 6. Andrew Morton for his kernel patches maintenance.
+
+###########################
+Sylvain
diff --git a/Documentation/fb/internals.txt b/Documentation/fb/internals.txt
new file mode 100644
index 0000000..9b2a2b2
--- /dev/null
+++ b/Documentation/fb/internals.txt
@@ -0,0 +1,82 @@
+
+This is a first start for some documentation about frame buffer device
+internals.
+
+Geert Uytterhoeven <geert@linux-m68k.org>, 21 July 1998
+James Simmons <jsimmons@user.sf.net>, Nov 26 2002
+
+--------------------------------------------------------------------------------
+
+ *** STRUCTURES USED BY THE FRAME BUFFER DEVICE API ***
+
+The following structures play a role in the game of frame buffer devices. They
+are defined in <linux/fb.h>.
+
+1. Outside the kernel (user space)
+
+ - struct fb_fix_screeninfo
+
+ Device independent unchangeable information about a frame buffer device and
+ a specific video mode. This can be obtained using the FBIOGET_FSCREENINFO
+ ioctl.
+
+ - struct fb_var_screeninfo
+
+ Device independent changeable information about a frame buffer device and a
+ specific video mode. This can be obtained using the FBIOGET_VSCREENINFO
+ ioctl, and updated with the FBIOPUT_VSCREENINFO ioctl. If you want to pan
+ the screen only, you can use the FBIOPAN_DISPLAY ioctl.
+
+ - struct fb_cmap
+
+ Device independent colormap information. You can get and set the colormap
+ using the FBIOGETCMAP and FBIOPUTCMAP ioctls.
+
+
+2. Inside the kernel
+
+ - struct fb_info
+
+ Generic information, API and low level information about a specific frame
+ buffer device instance (slot number, board address, ...).
+
+ - struct `par'
+
+ Device dependent information that uniquely defines the video mode for this
+ particular piece of hardware.
+
+
+--------------------------------------------------------------------------------
+
+ *** VISUALS USED BY THE FRAME BUFFER DEVICE API ***
+
+
+Monochrome (FB_VISUAL_MONO01 and FB_VISUAL_MONO10)
+-------------------------------------------------
+Each pixel is either black or white.
+
+
+Pseudo color (FB_VISUAL_PSEUDOCOLOR and FB_VISUAL_STATIC_PSEUDOCOLOR)
+---------------------------------------------------------------------
+The whole pixel value is fed through a programmable lookup table that has one
+color (including red, green, and blue intensities) for each possible pixel
+value, and that color is displayed.
+
+
+True color (FB_VISUAL_TRUECOLOR)
+--------------------------------
+The pixel value is broken up into red, green, and blue fields.
+
+
+Direct color (FB_VISUAL_DIRECTCOLOR)
+------------------------------------
+The pixel value is broken up into red, green, and blue fields, each of which
+are looked up in separate red, green, and blue lookup tables.
+
+
+Grayscale displays
+------------------
+Grayscale and static grayscale are special variants of pseudo color and static
+pseudo color, where the red, green and blue components are always equal to
+each other.
+
diff --git a/Documentation/fb/lxfb.txt b/Documentation/fb/lxfb.txt
new file mode 100644
index 0000000..38b3ca6
--- /dev/null
+++ b/Documentation/fb/lxfb.txt
@@ -0,0 +1,52 @@
+[This file is cloned from VesaFB/aty128fb]
+
+What is lxfb?
+=================
+
+This is a graphics framebuffer driver for AMD Geode LX based processors.
+
+Advantages:
+
+ * No need to use AMD's VSA code (or other VESA emulation layer) in the
+ BIOS.
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+ without using tiny, unreadable fonts.
+ * You can run XF68_FBDev on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode...
+
+
+How to use it?
+==============
+
+Switching modes is done using lxfb.mode_option=<resolution>... boot
+parameter or using `fbset' program.
+
+See Documentation/fb/modedb.txt for more information on modedb
+resolutions.
+
+
+X11
+===
+
+XF68_FBDev should generally work fine, but it is non-accelerated.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to lxfb with lxfb.<option>.
+For example, lxfb.mode_option=800x600@75.
+Accepted options:
+
+mode_option - specify the video mode. Of the form
+ <x>x<y>[-<bpp>][@<refresh>]
+vram - size of video ram (normally auto-detected)
+vt_switch - enable vt switching during suspend/resume. The vt
+ switch is slow, but harmless.
+
+--
+Andres Salomon <dilinger@debian.org>
diff --git a/Documentation/fb/matroxfb.txt b/Documentation/fb/matroxfb.txt
new file mode 100644
index 0000000..ad7a677
--- /dev/null
+++ b/Documentation/fb/matroxfb.txt
@@ -0,0 +1,415 @@
+[This file is cloned from VesaFB. Thanks go to Gerd Knorr]
+
+What is matroxfb?
+=================
+
+This is a driver for a graphic framebuffer for Matrox devices on
+Alpha, Intel and PPC boxes.
+
+Advantages:
+
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+ without using tiny, unreadable fonts.
+ * You can run XF{68,86}_FBDev or XFree86 fbdev driver on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode... but you should not notice
+ if you use same resolution as you used in textmode.
+
+
+How to use it?
+==============
+
+Switching modes is done using the video=matroxfb:vesa:... boot parameter
+or using `fbset' program.
+
+If you want, for example, enable a resolution of 1280x1024x24bpp you should
+pass to the kernel this command line: "video=matroxfb:vesa:0x1BB".
+
+You should compile in both vgacon (to boot if you remove you Matrox from
+box) and matroxfb (for graphics mode). You should not compile-in vesafb
+unless you have primary display on non-Matrox VBE2.0 device (see
+Documentation/fb/vesafb.txt for details).
+
+Currently supported video modes are (through vesa:... interface, PowerMac
+has [as addon] compatibility code):
+
+
+[Graphic modes]
+
+bpp | 640x400 640x480 768x576 800x600 960x720
+----+--------------------------------------------
+ 4 | 0x12 0x102
+ 8 | 0x100 0x101 0x180 0x103 0x188
+ 15 | 0x110 0x181 0x113 0x189
+ 16 | 0x111 0x182 0x114 0x18A
+ 24 | 0x1B2 0x184 0x1B5 0x18C
+ 32 | 0x112 0x183 0x115 0x18B
+
+
+[Graphic modes (continued)]
+
+bpp | 1024x768 1152x864 1280x1024 1408x1056 1600x1200
+----+------------------------------------------------
+ 4 | 0x104 0x106
+ 8 | 0x105 0x190 0x107 0x198 0x11C
+ 15 | 0x116 0x191 0x119 0x199 0x11D
+ 16 | 0x117 0x192 0x11A 0x19A 0x11E
+ 24 | 0x1B8 0x194 0x1BB 0x19C 0x1BF
+ 32 | 0x118 0x193 0x11B 0x19B
+
+
+[Text modes]
+
+text | 640x400 640x480 1056x344 1056x400 1056x480
+-----+------------------------------------------------
+ 8x8 | 0x1C0 0x108 0x10A 0x10B 0x10C
+8x16 | 2, 3, 7 0x109
+
+You can enter these number either hexadecimal (leading `0x') or decimal
+(0x100 = 256). You can also use value + 512 to achieve compatibility
+with your old number passed to vesafb.
+
+Non-listed number can be achieved by more complicated command-line, for
+example 1600x1200x32bpp can be specified by `video=matroxfb:vesa:0x11C,depth:32'.
+
+
+X11
+===
+
+XF{68,86}_FBDev should work just fine, but it is non-accelerated. On non-intel
+architectures there are some glitches for 24bpp videomodes. 8, 16 and 32bpp
+works fine.
+
+Running another (accelerated) X-Server like XF86_SVGA works too. But (at least)
+XFree servers have big troubles in multihead configurations (even on first
+head, not even talking about second). Running XFree86 4.x accelerated mga
+driver is possible, but you must not enable DRI - if you do, resolution and
+color depth of your X desktop must match resolution and color depths of your
+virtual consoles, otherwise X will corrupt accelerator settings.
+
+
+SVGALib
+=======
+
+Driver contains SVGALib compatibility code. It is turned on by choosing textual
+mode for console. You can do it at boot time by using videomode
+2,3,7,0x108-0x10C or 0x1C0. At runtime, `fbset -depth 0' does this work.
+Unfortunately, after SVGALib application exits, screen contents is corrupted.
+Switching to another console and back fixes it. I hope that it is SVGALib's
+problem and not mine, but I'm not sure.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to matroxfb with
+`video=matroxfb:option1,option2:value2,option3' (multiple options should be
+separated by comma, values are separated from options by `:').
+Accepted options:
+
+mem:X - size of memory (X can be in megabytes, kilobytes or bytes)
+ You can only decrease value determined by driver because of
+ it always probe for memory. Default is to use whole detected
+ memory usable for on-screen display (i.e. max. 8 MB).
+disabled - do not load driver; you can use also `off', but `disabled'
+ is here too.
+enabled - load driver, if you have `video=matroxfb:disabled' in LILO
+ configuration, you can override it by this (you cannot override
+ `off'). It is default.
+noaccel - do not use acceleration engine. It does not work on Alphas.
+accel - use acceleration engine. It is default.
+nopan - create initial consoles with vyres = yres, thus disabling virtual
+ scrolling.
+pan - create initial consoles as tall as possible (vyres = memory/vxres).
+ It is default.
+nopciretry - disable PCI retries. It is needed for some broken chipsets,
+ it is autodetected for intel's 82437. In this case device does
+ not comply to PCI 2.1 specs (it will not guarantee that every
+ transaction terminate with success or retry in 32 PCLK).
+pciretry - enable PCI retries. It is default, except for intel's 82437.
+novga - disables VGA I/O ports. It is default if BIOS did not enable device.
+ You should not use this option, some boards then do not restart
+ without power off.
+vga - preserve state of VGA I/O ports. It is default. Driver does not
+ enable VGA I/O if BIOS did not it (it is not safe to enable it in
+ most cases).
+nobios - disables BIOS ROM. It is default if BIOS did not enable BIOS itself.
+ You should not use this option, some boards then do not restart
+ without power off.
+bios - preserve state of BIOS ROM. It is default. Driver does not enable
+ BIOS if BIOS was not enabled before.
+noinit - tells driver, that devices were already initialized. You should use
+ it if you have G100 and/or if driver cannot detect memory, you see
+ strange pattern on screen and so on. Devices not enabled by BIOS
+ are still initialized. It is default.
+init - driver initializes every device it knows about.
+memtype - specifies memory type, implies 'init'. This is valid only for G200
+ and G400 and has following meaning:
+ G200: 0 -> 2x128Kx32 chips, 2MB onboard, probably sgram
+ 1 -> 2x128Kx32 chips, 4MB onboard, probably sgram
+ 2 -> 2x256Kx32 chips, 4MB onboard, probably sgram
+ 3 -> 2x256Kx32 chips, 8MB onboard, probably sgram
+ 4 -> 2x512Kx16 chips, 8/16MB onboard, probably sdram only
+ 5 -> same as above
+ 6 -> 4x128Kx32 chips, 4MB onboard, probably sgram
+ 7 -> 4x128Kx32 chips, 8MB onboard, probably sgram
+ G400: 0 -> 2x512Kx16 SDRAM, 16/32MB
+ 2x512Kx32 SGRAM, 16/32MB
+ 1 -> 2x256Kx32 SGRAM, 8/16MB
+ 2 -> 4x128Kx32 SGRAM, 8/16MB
+ 3 -> 4x512Kx32 SDRAM, 32MB
+ 4 -> 4x256Kx32 SGRAM, 16/32MB
+ 5 -> 2x1Mx32 SDRAM, 32MB
+ 6 -> reserved
+ 7 -> reserved
+ You should use sdram or sgram parameter in addition to memtype
+ parameter.
+nomtrr - disables write combining on frame buffer. This slows down driver but
+ there is reported minor incompatibility between GUS DMA and XFree
+ under high loads if write combining is enabled (sound dropouts).
+mtrr - enables write combining on frame buffer. It speeds up video accesses
+ much. It is default. You must have MTRR support enabled in kernel
+ and your CPU must have MTRR (f.e. Pentium II have them).
+sgram - tells to driver that you have Gxx0 with SGRAM memory. It has no
+ effect without `init'.
+sdram - tells to driver that you have Gxx0 with SDRAM memory.
+ It is a default.
+inv24 - change timings parameters for 24bpp modes on Millenium and
+ Millenium II. Specify this if you see strange color shadows around
+ characters.
+noinv24 - use standard timings. It is the default.
+inverse - invert colors on screen (for LCD displays)
+noinverse - show true colors on screen. It is default.
+dev:X - bind driver to device X. Driver numbers device from 0 up to N,
+ where device 0 is first `known' device found, 1 second and so on.
+ lspci lists devices in this order.
+ Default is `every' known device for driver with multihead support
+ and first working device (usually dev:0) for driver without
+ multihead support.
+nohwcursor - disables hardware cursor (use software cursor instead).
+hwcursor - enables hardware cursor. It is default. If you are using
+ non-accelerated mode (`noaccel' or `fbset -accel false'), software
+ cursor is used (except for text mode).
+noblink - disables cursor blinking. Cursor in text mode always blinks (hw
+ limitation).
+blink - enables cursor blinking. It is default.
+nofastfont - disables fastfont feature. It is default.
+fastfont:X - enables fastfont feature. X specifies size of memory reserved for
+ font data, it must be >= (fontwidth*fontheight*chars_in_font)/8.
+ It is faster on Gx00 series, but slower on older cards.
+grayscale - enable grayscale summing. It works in PSEUDOCOLOR modes (text,
+ 4bpp, 8bpp). In DIRECTCOLOR modes it is limited to characters
+ displayed through putc/putcs. Direct accesses to framebuffer
+ can paint colors.
+nograyscale - disable grayscale summing. It is default.
+cross4MB - enables that pixel line can cross 4MB boundary. It is default for
+ non-Millenium.
+nocross4MB - pixel line must not cross 4MB boundary. It is default for
+ Millenium I or II, because of these devices have hardware
+ limitations which do not allow this. But this option is
+ incompatible with some (if not all yet released) versions of
+ XF86_FBDev.
+dfp - enables digital flat panel interface. This option is incompatible with
+ secondary (TV) output - if DFP is active, TV output must be
+ inactive and vice versa. DFP always uses same timing as primary
+ (monitor) output.
+dfp:X - use settings X for digital flat panel interface. X is number from
+ 0 to 0xFF, and meaning of each individual bit is described in
+ G400 manual, in description of DAC register 0x1F. For normal operation
+ you should set all bits to zero, except lowest bit. This lowest bit
+ selects who is source of display clocks, whether G400, or panel.
+ Default value is now read back from hardware - so you should specify
+ this value only if you are also using `init' parameter.
+outputs:XYZ - set mapping between CRTC and outputs. Each letter can have value
+ of 0 (for no CRTC), 1 (CRTC1) or 2 (CRTC2), and first letter corresponds
+ to primary analog output, second letter to the secondary analog output
+ and third letter to the DVI output. Default setting is 100 for
+ cards below G400 or G400 without DFP, 101 for G400 with DFP, and
+ 111 for G450 and G550. You can set mapping only on first card,
+ use matroxset for setting up other devices.
+vesa:X - selects startup videomode. X is number from 0 to 0x1FF, see table
+ above for detailed explanation. Default is 640x480x8bpp if driver
+ has 8bpp support. Otherwise first available of 640x350x4bpp,
+ 640x480x15bpp, 640x480x24bpp, 640x480x32bpp or 80x25 text
+ (80x25 text is always available).
+
+If you are not satisfied with videomode selected by `vesa' option, you
+can modify it with these options:
+
+xres:X - horizontal resolution, in pixels. Default is derived from `vesa'
+ option.
+yres:X - vertical resolution, in pixel lines. Default is derived from `vesa'
+ option.
+upper:X - top boundary: lines between end of VSYNC pulse and start of first
+ pixel line of picture. Default is derived from `vesa' option.
+lower:X - bottom boundary: lines between end of picture and start of VSYNC
+ pulse. Default is derived from `vesa' option.
+vslen:X - length of VSYNC pulse, in lines. Default is derived from `vesa'
+ option.
+left:X - left boundary: pixels between end of HSYNC pulse and first pixel.
+ Default is derived from `vesa' option.
+right:X - right boundary: pixels between end of picture and start of HSYNC
+ pulse. Default is derived from `vesa' option.
+hslen:X - length of HSYNC pulse, in pixels. Default is derived from `vesa'
+ option.
+pixclock:X - dotclocks, in ps (picoseconds). Default is derived from `vesa'
+ option and from `fh' and `fv' options.
+sync:X - sync. pulse - bit 0 inverts HSYNC polarity, bit 1 VSYNC polarity.
+ If bit 3 (value 0x08) is set, composite sync instead of HSYNC is
+ generated. If bit 5 (value 0x20) is set, sync on green is turned on.
+ Do not forget that if you want sync on green, you also probably
+ want composite sync.
+ Default depends on `vesa'.
+depth:X - Bits per pixel: 0=text, 4,8,15,16,24 or 32. Default depends on
+ `vesa'.
+
+If you know capabilities of your monitor, you can specify some (or all) of
+`maxclk', `fh' and `fv'. In this case, `pixclock' is computed so that
+pixclock <= maxclk, real_fh <= fh and real_fv <= fv.
+
+maxclk:X - maximum dotclock. X can be specified in MHz, kHz or Hz. Default is
+ `don't care'.
+fh:X - maximum horizontal synchronization frequency. X can be specified
+ in kHz or Hz. Default is `don't care'.
+fv:X - maximum vertical frequency. X must be specified in Hz. Default is
+ 70 for modes derived from `vesa' with yres <= 400, 60Hz for
+ yres > 400.
+
+
+Limitations
+===========
+
+There are known and unknown bugs, features and misfeatures.
+Currently there are following known bugs:
+ + SVGALib does not restore screen on exit
+ + generic fbcon-cfbX procedures do not work on Alphas. Due to this,
+ `noaccel' (and cfb4 accel) driver does not work on Alpha. So everyone
+ with access to /dev/fb* on Alpha can hang machine (you should restrict
+ access to /dev/fb* - everyone with access to this device can destroy
+ your monitor, believe me...).
+ + 24bpp does not support correctly XF-FBDev on big-endian architectures.
+ + interlaced text mode is not supported; it looks like hardware limitation,
+ but I'm not sure.
+ + Gxx0 SGRAM/SDRAM is not autodetected.
+ + If you are using more than one framebuffer device, you must boot kernel
+ with 'video=scrollback:0'.
+ + maybe more...
+And following misfeatures:
+ + SVGALib does not restore screen on exit.
+ + pixclock for text modes is limited by hardware to
+ 83 MHz on G200
+ 66 MHz on Millennium I
+ 60 MHz on Millennium II
+ Because I have no access to other devices, I do not know specific
+ frequencies for them. So driver does not check this and allows you to
+ set frequency higher that this. It causes sparks, black holes and other
+ pretty effects on screen. Device was not destroyed during tests. :-)
+ + my Millennium G200 oscillator has frequency range from 35 MHz to 380 MHz
+ (and it works with 8bpp on about 320 MHz dotclocks (and changed mclk)).
+ But Matrox says on product sheet that VCO limit is 50-250 MHz, so I believe
+ them (maybe that chip overheats, but it has a very big cooler (G100 has
+ none), so it should work).
+ + special mixed video/graphics videomodes of Mystique and Gx00 - 2G8V16 and
+ G16V16 are not supported
+ + color keying is not supported
+ + feature connector of Mystique and Gx00 is set to VGA mode (it is disabled
+ by BIOS)
+ + DDC (monitor detection) is supported through dualhead driver
+ + some check for input values are not so strict how it should be (you can
+ specify vslen=4000 and so on).
+ + maybe more...
+And following features:
+ + 4bpp is available only on Millennium I and Millennium II. It is hardware
+ limitation.
+ + selection between 1:5:5:5 and 5:6:5 16bpp videomode is done by -rgba
+ option of fbset: "fbset -depth 16 -rgba 5,5,5" selects 1:5:5:5, anything
+ else selects 5:6:5 mode.
+ + text mode uses 6 bit VGA palette instead of 8 bit (one of 262144 colors
+ instead of one of 16M colors). It is due to hardware limitation of
+ Millennium I/II and SVGALib compatibility.
+
+
+Benchmarks
+==========
+It is time to redraw whole screen 1000 times in 1024x768, 60Hz. It is
+time for draw 6144000 characters on screen through /dev/vcsa
+(for 32bpp it is about 3GB of data (exactly 3000 MB); for 8x16 font in
+16 seconds, i.e. 187 MBps).
+Times were obtained from one older version of driver, now they are about 3%
+faster, it is kernel-space only time on P-II/350 MHz, Millennium I in 33 MHz
+PCI slot, G200 in AGP 2x slot. I did not test vgacon.
+
+NOACCEL
+ 8x16 12x22
+ Millennium I G200 Millennium I G200
+8bpp 16.42 9.54 12.33 9.13
+16bpp 21.00 15.70 19.11 15.02
+24bpp 36.66 36.66 35.00 35.00
+32bpp 35.00 30.00 33.85 28.66
+
+ACCEL, nofastfont
+ 8x16 12x22 6x11
+ Millennium I G200 Millennium I G200 Millennium I G200
+8bpp 7.79 7.24 13.55 7.78 30.00 21.01
+16bpp 9.13 7.78 16.16 7.78 30.00 21.01
+24bpp 14.17 10.72 18.69 10.24 34.99 21.01
+32bpp 16.15 16.16 18.73 13.09 34.99 21.01
+
+ACCEL, fastfont
+ 8x16 12x22 6x11
+ Millennium I G200 Millennium I G200 Millennium I G200
+8bpp 8.41 6.01 6.54 4.37 16.00 10.51
+16bpp 9.54 9.12 8.76 6.17 17.52 14.01
+24bpp 15.00 12.36 11.67 10.00 22.01 18.32
+32bpp 16.18 18.29* 12.71 12.74 24.44 21.00
+
+TEXT
+ 8x16
+ Millennium I G200
+TEXT 3.29 1.50
+
+* Yes, it is slower than Millennium I.
+
+
+Dualhead G400
+=============
+Driver supports dualhead G400 with some limitations:
+ + secondary head shares videomemory with primary head. It is not problem
+ if you have 32MB of videoram, but if you have only 16MB, you may have
+ to think twice before choosing videomode (for example twice 1880x1440x32bpp
+ is not possible).
+ + due to hardware limitation, secondary head can use only 16 and 32bpp
+ videomodes.
+ + secondary head is not accelerated. There were bad problems with accelerated
+ XFree when secondary head used to use acceleration.
+ + secondary head always powerups in 640x480@60-32 videomode. You have to use
+ fbset to change this mode.
+ + secondary head always powerups in monitor mode. You have to use fbmatroxset
+ to change it to TV mode. Also, you must select at least 525 lines for
+ NTSC output and 625 lines for PAL output.
+ + kernel is not fully multihead ready. So some things are impossible to do.
+ + if you compiled it as module, you must insert i2c-matroxfb, matroxfb_maven
+ and matroxfb_crtc2 into kernel.
+
+
+Dualhead G450
+=============
+Driver supports dualhead G450 with some limitations:
+ + secondary head shares videomemory with primary head. It is not problem
+ if you have 32MB of videoram, but if you have only 16MB, you may have
+ to think twice before choosing videomode.
+ + due to hardware limitation, secondary head can use only 16 and 32bpp
+ videomodes.
+ + secondary head is not accelerated.
+ + secondary head always powerups in 640x480@60-32 videomode. You have to use
+ fbset to change this mode.
+ + TV output is not supported
+ + kernel is not fully multihead ready, so some things are impossible to do.
+ + if you compiled it as module, you must insert matroxfb_g450 and matroxfb_crtc2
+ into kernel.
+
+--
+Petr Vandrovec <vandrove@vc.cvut.cz>
diff --git a/Documentation/fb/metronomefb.txt b/Documentation/fb/metronomefb.txt
new file mode 100644
index 0000000..237ca41
--- /dev/null
+++ b/Documentation/fb/metronomefb.txt
@@ -0,0 +1,36 @@
+ Metronomefb
+ -----------
+Maintained by Jaya Kumar <jayakumar.lkml.gmail.com>
+Last revised: Mar 10, 2008
+
+Metronomefb is a driver for the Metronome display controller. The controller
+is from E-Ink Corporation. It is intended to be used to drive the E-Ink
+Vizplex display media. E-Ink hosts some details of this controller and the
+display media here http://www.e-ink.com/products/matrix/metronome.html .
+
+Metronome is interfaced to the host CPU through the AMLCD interface. The
+host CPU generates the control information and the image in a framebuffer
+which is then delivered to the AMLCD interface by a host specific method.
+The display and error status are each pulled through individual GPIOs.
+
+Metronomefb is platform independent and depends on a board specific driver
+to do all physical IO work. Currently, an example is implemented for the
+PXA board used in the AM-200 EPD devkit. This example is am200epd.c
+
+Metronomefb requires waveform information which is delivered via the AMLCD
+interface to the metronome controller. The waveform information is expected to
+be delivered from userspace via the firmware class interface. The waveform file
+can be compressed as long as your udev or hotplug script is aware of the need
+to uncompress it before delivering it. metronomefb will ask for metronome.wbf
+which would typically go into /lib/firmware/metronome.wbf depending on your
+udev/hotplug setup. I have only tested with a single waveform file which was
+originally labeled 23P01201_60_WT0107_MTC. I do not know what it stands for.
+Caution should be exercised when manipulating the waveform as there may be
+a possibility that it could have some permanent effects on the display media.
+I neither have access to nor know exactly what the waveform does in terms of
+the physical media.
+
+Metronomefb uses the deferred IO interface so that it can provide a memory
+mappable frame buffer. It has been tested with tinyx (Xfbdev). It is known
+to work at this time with xeyes, xclock, xloadimage, xpdf.
+
diff --git a/Documentation/fb/modedb.txt b/Documentation/fb/modedb.txt
new file mode 100644
index 0000000..ec4dee7
--- /dev/null
+++ b/Documentation/fb/modedb.txt
@@ -0,0 +1,136 @@
+
+
+ modedb default video mode support
+
+
+Currently all frame buffer device drivers have their own video mode databases,
+which is a mess and a waste of resources. The main idea of modedb is to have
+
+ - one routine to probe for video modes, which can be used by all frame buffer
+ devices
+ - one generic video mode database with a fair amount of standard videomodes
+ (taken from XFree86)
+ - the possibility to supply your own mode database for graphics hardware that
+ needs non-standard modes, like amifb and Mac frame buffer drivers (which
+ use macmodes.c)
+
+When a frame buffer device receives a video= option it doesn't know, it should
+consider that to be a video mode option. If no frame buffer device is specified
+in a video= option, fbmem considers that to be a global video mode option.
+
+Valid mode specifiers (mode_option argument):
+
+ <xres>x<yres>[M][R][-<bpp>][@<refresh>][i][m]
+ <name>[-<bpp>][@<refresh>]
+
+with <xres>, <yres>, <bpp> and <refresh> decimal numbers and <name> a string.
+Things between square brackets are optional.
+
+If 'M' is specified in the mode_option argument (after <yres> and before
+<bpp> and <refresh>, if specified) the timings will be calculated using
+VESA(TM) Coordinated Video Timings instead of looking up the mode from a table.
+If 'R' is specified, do a 'reduced blanking' calculation for digital displays.
+If 'i' is specified, calculate for an interlaced mode. And if 'm' is
+specified, add margins to the calculation (1.8% of xres rounded down to 8
+pixels and 1.8% of yres).
+
+ Sample usage: 1024x768M@60m - CVT timing with margins
+
+***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo *****
+
+What is the VESA(TM) Coordinated Video Timings (CVT)?
+
+From the VESA(TM) Website:
+
+ "The purpose of CVT is to provide a method for generating a consistent
+ and coordinated set of standard formats, display refresh rates, and
+ timing specifications for computer display products, both those
+ employing CRTs, and those using other display technologies. The
+ intention of CVT is to give both source and display manufacturers a
+ common set of tools to enable new timings to be developed in a
+ consistent manner that ensures greater compatibility."
+
+This is the third standard approved by VESA(TM) concerning video timings. The
+first was the Discrete Video Timings (DVT) which is a collection of
+pre-defined modes approved by VESA(TM). The second is the Generalized Timing
+Formula (GTF) which is an algorithm to calculate the timings, given the
+pixelclock, the horizontal sync frequency, or the vertical refresh rate.
+
+The GTF is limited by the fact that it is designed mainly for CRT displays.
+It artificially increases the pixelclock because of its high blanking
+requirement. This is inappropriate for digital display interface with its high
+data rate which requires that it conserves the pixelclock as much as possible.
+Also, GTF does not take into account the aspect ratio of the display.
+
+The CVT addresses these limitations. If used with CRT's, the formula used
+is a derivation of GTF with a few modifications. If used with digital
+displays, the "reduced blanking" calculation can be used.
+
+From the framebuffer subsystem perspective, new formats need not be added
+to the global mode database whenever a new mode is released by display
+manufacturers. Specifying for CVT will work for most, if not all, relatively
+new CRT displays and probably with most flatpanels, if 'reduced blanking'
+calculation is specified. (The CVT compatibility of the display can be
+determined from its EDID. The version 1.3 of the EDID has extra 128-byte
+blocks where additional timing information is placed. As of this time, there
+is no support yet in the layer to parse this additional blocks.)
+
+CVT also introduced a new naming convention (should be seen from dmesg output):
+
+ <pix>M<a>[-R]
+
+ where: pix = total amount of pixels in MB (xres x yres)
+ M = always present
+ a = aspect ratio (3 - 4:3; 4 - 5:4; 9 - 15:9, 16:9; A - 16:10)
+ -R = reduced blanking
+
+ example: .48M3-R - 800x600 with reduced blanking
+
+Note: VESA(TM) has restrictions on what is a standard CVT timing:
+
+ - aspect ratio can only be one of the above values
+ - acceptable refresh rates are 50, 60, 70 or 85 Hz only
+ - if reduced blanking, the refresh rate must be at 60Hz
+
+If one of the above are not satisfied, the kernel will print a warning but the
+timings will still be calculated.
+
+***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo *****
+
+To find a suitable video mode, you just call
+
+int __init fb_find_mode(struct fb_var_screeninfo *var,
+ struct fb_info *info, const char *mode_option,
+ const struct fb_videomode *db, unsigned int dbsize,
+ const struct fb_videomode *default_mode,
+ unsigned int default_bpp)
+
+with db/dbsize your non-standard video mode database, or NULL to use the
+standard video mode database.
+
+fb_find_mode() first tries the specified video mode (or any mode that matches,
+e.g. there can be multiple 640x480 modes, each of them is tried). If that
+fails, the default mode is tried. If that fails, it walks over all modes.
+
+To specify a video mode at bootup, use the following boot options:
+ video=<driver>:<xres>x<yres>[-<bpp>][@refresh]
+
+where <driver> is a name from the table below. Valid default modes can be
+found in linux/drivers/video/modedb.c. Check your driver's documentation.
+There may be more modes.
+
+ Drivers that support modedb boot options
+ Boot Name Cards Supported
+
+ amifb - Amiga chipset frame buffer
+ aty128fb - ATI Rage128 / Pro frame buffer
+ atyfb - ATI Mach64 frame buffer
+ pm2fb - Permedia 2/2V frame buffer
+ pm3fb - Permedia 3 frame buffer
+ sstfb - Voodoo 1/2 (SST1) chipset frame buffer
+ tdfxfb - 3D Fx frame buffer
+ tridentfb - Trident (Cyber)blade chipset frame buffer
+ vt8623fb - VIA 8623 frame buffer
+
+BTW, only a few drivers use this at the moment. Others are to follow
+(feel free to send patches).
diff --git a/Documentation/fb/pvr2fb.txt b/Documentation/fb/pvr2fb.txt
new file mode 100644
index 0000000..36bdeff
--- /dev/null
+++ b/Documentation/fb/pvr2fb.txt
@@ -0,0 +1,65 @@
+$Id: pvr2fb.txt,v 1.1 2001/05/24 05:09:16 mrbrown Exp $
+
+What is pvr2fb?
+===============
+
+This is a driver for PowerVR 2 based graphics frame buffers, such as the
+one found in the Dreamcast.
+
+Advantages:
+
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+ without using tiny, unreadable fonts (NOT on the Dreamcast)
+ * You can run XF86_FBDev on top of /dev/fb0
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * Driver is largely untested on non-Dreamcast systems.
+
+Configuration
+=============
+
+You can pass kernel command line options to pvr2fb with
+`video=pvr2fb:option1,option2:value2,option3' (multiple options should be
+separated by comma, values are separated from options by `:').
+Accepted options:
+
+font:X - default font to use. All fonts are supported, including the
+ SUN12x22 font which is very nice at high resolutions.
+
+
+mode:X - default video mode with format [xres]x[yres]-<bpp>@<refresh rate>
+ The following video modes are supported:
+ 640x640-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast
+ defaults to 640x480-16@60. At the time of writing the
+ 24bpp and 32bpp modes function poorly. Work to fix that is
+ ongoing
+
+ Note: the 640x240 mode is currently broken, and should not be
+ used for any reason. It is only mentioned here as a reference.
+
+inverse - invert colors on screen (for LCD displays)
+
+nomtrr - disables write combining on frame buffer. This slows down driver
+ but there is reported minor incompatibility between GUS DMA and
+ XFree under high loads if write combining is enabled (sound
+ dropouts). MTRR is enabled by default on systems that have it
+ configured and that support it.
+
+cable:X - cable type. This can be any of the following: vga, rgb, and
+ composite. If none is specified, we guess.
+
+output:X - output type. This can be any of the following: pal, ntsc, and
+ vga. If none is specified, we guess.
+
+X11
+===
+
+XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet
+on any 2.6 series kernel.
+
+--
+Paul Mundt <lethal@linuxdc.org>
+Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk>
+
diff --git a/Documentation/fb/pxafb.txt b/Documentation/fb/pxafb.txt
new file mode 100644
index 0000000..db9b850
--- /dev/null
+++ b/Documentation/fb/pxafb.txt
@@ -0,0 +1,54 @@
+Driver for PXA25x LCD controller
+================================
+
+The driver supports the following options, either via
+options=<OPTIONS> when modular or video=pxafb:<OPTIONS> when built in.
+
+For example:
+ modprobe pxafb options=mode:640x480-8,passive
+or on the kernel command line
+ video=pxafb:mode:640x480-8,passive
+
+mode:XRESxYRES[-BPP]
+ XRES == LCCR1_PPL + 1
+ YRES == LLCR2_LPP + 1
+ The resolution of the display in pixels
+ BPP == The bit depth. Valid values are 1, 2, 4, 8 and 16.
+
+pixclock:PIXCLOCK
+ Pixel clock in picoseconds
+
+left:LEFT == LCCR1_BLW + 1
+right:RIGHT == LCCR1_ELW + 1
+hsynclen:HSYNC == LCCR1_HSW + 1
+upper:UPPER == LCCR2_BFW
+lower:LOWER == LCCR2_EFR
+vsynclen:VSYNC == LCCR2_VSW + 1
+ Display margins and sync times
+
+color | mono => LCCR0_CMS
+ umm...
+
+active | passive => LCCR0_PAS
+ Active (TFT) or Passive (STN) display
+
+single | dual => LCCR0_SDS
+ Single or dual panel passive display
+
+4pix | 8pix => LCCR0_DPD
+ 4 or 8 pixel monochrome single panel data
+
+hsync:HSYNC
+vsync:VSYNC
+ Horizontal and vertical sync. 0 => active low, 1 => active
+ high.
+
+dpc:DPC
+ Double pixel clock. 1=>true, 0=>false
+
+outputen:POLARITY
+ Output Enable Polarity. 0 => active low, 1 => active high
+
+pixclockpol:POLARITY
+ pixel clock polarity
+ 0 => falling edge, 1 => rising edge
diff --git a/Documentation/fb/s3fb.txt b/Documentation/fb/s3fb.txt
new file mode 100644
index 0000000..2c97770
--- /dev/null
+++ b/Documentation/fb/s3fb.txt
@@ -0,0 +1,82 @@
+
+ s3fb - fbdev driver for S3 Trio/Virge chips
+ ===========================================
+
+
+Supported Hardware
+==================
+
+ S3 Trio32
+ S3 Trio64 (and variants V+, UV+, V2/DX, V2/GX)
+ S3 Virge (and variants VX, DX, GX and GX2+)
+ S3 Plato/PX (completely untested)
+ S3 Aurora64V+ (completely untested)
+
+ - only PCI bus supported
+ - only BIOS initialized VGA devices supported
+ - probably not working on big endian
+
+I tested s3fb on Trio64 (plain, V+ and V2/DX) and Virge (plain, VX, DX),
+all on i386.
+
+
+Supported Features
+==================
+
+ * 4 bpp pseudocolor modes (with 18bit palette, two variants)
+ * 8 bpp pseudocolor mode (with 18bit palette)
+ * 16 bpp truecolor modes (RGB 555 and RGB 565)
+ * 24 bpp truecolor mode (RGB 888) on (only on Virge VX)
+ * 32 bpp truecolor mode (RGB 888) on (not on Virge VX)
+ * text mode (activated by bpp = 0)
+ * interlaced mode variant (not available in text mode)
+ * doublescan mode variant (not available in text mode)
+ * panning in both directions
+ * suspend/resume support
+ * DPMS support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (maximum usually between 50-60 MHz, depending on specific
+hardware, i get best results from plain S3 Trio32 card - about 75 MHz). This
+limitation is not enforced by driver. Text mode supports 8bit wide fonts only
+(hardware limitation) and 16bit tall fonts (driver limitation). Text mode
+support is broken on S3 Trio64 V2/DX.
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+ * secondary (not initialized by BIOS) device support
+ * big endian support
+ * Zorro bus support
+ * MMIO support
+ * 24 bpp mode support on more cards
+ * support for fontwidths != 8 in 4 bpp modes
+ * support for fontheight != 16 in text mode
+ * composite and external sync (is anyone able to test this?)
+ * hardware cursor
+ * video overlay support
+ * vsync synchronization
+ * feature connector support
+ * acceleration support (8514-like 2D, Virge 3D, busmaster transfers)
+ * better values for some magic registers (performance issues)
+
+
+Known bugs
+==========
+
+ * cursor disable in text mode doesn't work
+ * text mode broken on S3 Trio64 V2/DX
+
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/fb/sa1100fb.txt b/Documentation/fb/sa1100fb.txt
new file mode 100644
index 0000000..f1b4220
--- /dev/null
+++ b/Documentation/fb/sa1100fb.txt
@@ -0,0 +1,39 @@
+[This file is cloned from VesaFB/matroxfb]
+
+What is sa1100fb?
+=================
+
+This is a driver for a graphic framebuffer for the SA-1100 LCD
+controller.
+
+Configuration
+==============
+
+For most common passive displays, giving the option
+
+video=sa1100fb:bpp:<value>,lccr0:<value>,lccr1:<value>,lccr2:<value>,lccr3:<value>
+
+on the kernel command line should be enough to configure the
+controller. The bits per pixel (bpp) value should be 4, 8, 12, or
+16. LCCR values are display-specific and should be computed as
+documented in the SA-1100 Developer's Manual, Section 11.7. Dual-panel
+displays are supported as long as the SDS bit is set in LCCR0; GPIO<9:2>
+are used for the lower panel.
+
+For active displays or displays requiring additional configuration
+(controlling backlights, powering on the LCD, etc.), the command line
+options may not be enough to configure the display. Adding sections to
+sa1100fb_init_fbinfo(), sa1100fb_activate_var(),
+sa1100fb_disable_lcd_controller(), and sa1100fb_enable_lcd_controller()
+will probably be necessary.
+
+Accepted options:
+
+bpp:<value> Configure for <value> bits per pixel
+lccr0:<value> Configure LCD control register 0 (11.7.3)
+lccr1:<value> Configure LCD control register 1 (11.7.4)
+lccr2:<value> Configure LCD control register 2 (11.7.5)
+lccr3:<value> Configure LCD control register 3 (11.7.6)
+
+--
+Mark Huang <mhuang@livetoy.com>
diff --git a/Documentation/fb/sh7760fb.txt b/Documentation/fb/sh7760fb.txt
new file mode 100644
index 0000000..c87bfe5
--- /dev/null
+++ b/Documentation/fb/sh7760fb.txt
@@ -0,0 +1,131 @@
+SH7760/SH7763 integrated LCDC Framebuffer driver
+================================================
+
+0. Overwiew
+-----------
+The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which
+supports (in theory) resolutions ranging from 1x1 to 1024x1024,
+with color depths ranging from 1 to 16 bits, on STN, DSTN and TFT Panels.
+
+Caveats:
+* Framebuffer memory must be a large chunk allocated at the top
+ of Area3 (HW requirement). Because of this requirement you should NOT
+ make the driver a module since at runtime it may become impossible to
+ get a large enough contiguous chunk of memory.
+
+* The driver does not support changing resolution while loaded
+ (displays aren't hotpluggable anyway)
+
+* Heavy flickering may be observed
+ a) if you're using 15/16bit color modes at >= 640x480 px resolutions,
+ b) during PCMCIA (or any other slow bus) activity.
+
+* Rotation works only 90degress clockwise, and only if horizontal
+ resolution is <= 320 pixels.
+
+files: drivers/video/sh7760fb.c
+ include/asm-sh/sh7760fb.h
+ Documentation/fb/sh7760fb.txt
+
+1. Platform setup
+-----------------
+SH7760:
+ Video data is fetched via the DMABRG DMA engine, so you have to
+ configure the SH DMAC for DMABRG mode (write 0x94808080 to the
+ DMARSRA register somewhere at boot).
+
+ PFC registers PCCR and PCDR must be set to peripheral mode.
+ (write zeros to both).
+
+The driver does NOT do the above for you since board setup is, well, job
+of the board setup code.
+
+2. Panel definitions
+--------------------
+The LCDC must explicitly be told about the type of LCD panel
+attached. Data must be wrapped in a "struct sh7760fb_platdata" and
+passed to the driver as platform_data.
+
+Suggest you take a closer look at the SH7760 Manual, Section 30.
+(http://documentation.renesas.com/eng/products/mpumcu/e602291_sh7760.pdf)
+
+The following code illustrates what needs to be done to
+get the framebuffer working on a 640x480 TFT:
+
+====================== cut here ======================================
+
+#include <linux/fb.h>
+#include <asm/sh7760fb.h>
+
+/*
+ * NEC NL6440bc26-01 640x480 TFT
+ * dotclock 25175 kHz
+ * Xres 640 Yres 480
+ * Htotal 800 Vtotal 525
+ * HsynStart 656 VsynStart 490
+ * HsynLenn 30 VsynLenn 2
+ *
+ * The linux framebuffer layer does not use the syncstart/synclen
+ * values but right/left/upper/lower margin values. The comments
+ * for the x_margin explain how to calculate those from given
+ * panel sync timings.
+ */
+static struct fb_videomode nl6448bc26 = {
+ .name = "NL6448BC26",
+ .refresh = 60,
+ .xres = 640,
+ .yres = 480,
+ .pixclock = 39683, /* in picoseconds! */
+ .hsync_len = 30,
+ .vsync_len = 2,
+ .left_margin = 114, /* HTOT - (HSYNSLEN + HSYNSTART) */
+ .right_margin = 16, /* HSYNSTART - XRES */
+ .upper_margin = 33, /* VTOT - (VSYNLEN + VSYNSTART) */
+ .lower_margin = 10, /* VSYNSTART - YRES */
+ .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT,
+ .vmode = FB_VMODE_NONINTERLACED,
+ .flag = 0,
+};
+
+static struct sh7760fb_platdata sh7760fb_nl6448 = {
+ .def_mode = &nl6448bc26,
+ .ldmtr = LDMTR_TFT_COLOR_16, /* 16bit TFT panel */
+ .lddfr = LDDFR_8BPP, /* we want 8bit output */
+ .ldpmmr = 0x0070,
+ .ldpspr = 0x0500,
+ .ldaclnr = 0,
+ .ldickr = LDICKR_CLKSRC(LCDC_CLKSRC_EXTERNAL) |
+ LDICKR_CLKDIV(1),
+ .rotate = 0,
+ .novsync = 1,
+ .blank = NULL,
+};
+
+/* SH7760:
+ * 0xFE300800: 256 * 4byte xRGB palette ram
+ * 0xFE300C00: 42 bytes ctrl registers
+ */
+static struct resource sh7760_lcdc_res[] = {
+ [0] = {
+ .start = 0xFE300800,
+ .end = 0xFE300CFF,
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .start = 65,
+ .end = 65,
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+static struct platform_device sh7760_lcdc_dev = {
+ .dev = {
+ .platform_data = &sh7760fb_nl6448,
+ },
+ .name = "sh7760-lcdc",
+ .id = -1,
+ .resource = sh7760_lcdc_res,
+ .num_resources = ARRAY_SIZE(sh7760_lcdc_res),
+};
+
+====================== cut here ======================================
diff --git a/Documentation/fb/sisfb.txt b/Documentation/fb/sisfb.txt
new file mode 100644
index 0000000..2e68e50
--- /dev/null
+++ b/Documentation/fb/sisfb.txt
@@ -0,0 +1,158 @@
+
+What is sisfb?
+==============
+
+sisfb is a framebuffer device driver for SiS (Silicon Integrated Systems)
+graphics chips. Supported are:
+
+- SiS 300 series: SiS 300/305, 540, 630(S), 730(S)
+- SiS 315 series: SiS 315/H/PRO, 55x, (M)65x, 740, (M)661(F/M)X, (M)741(GX)
+- SiS 330 series: SiS 330 ("Xabre"), (M)760
+
+
+Why do I need a framebuffer driver?
+===================================
+
+sisfb is eg. useful if you want a high-resolution text console. Besides that,
+sisfb is required to run DirectFB (which comes with an additional, dedicated
+driver for the 315 series).
+
+On the 300 series, sisfb on kernels older than 2.6.3 furthermore plays an
+important role in connection with DRM/DRI: Sisfb manages the memory heap
+used by DRM/DRI for 3D texture and other data. This memory management is
+required for using DRI/DRM.
+
+Kernels >= around 2.6.3 do not need sisfb any longer for DRI/DRM memory
+management. The SiS DRM driver has been updated and features a memory manager
+of its own (which will be used if sisfb is not compiled). So unless you want
+a graphical console, you don't need sisfb on kernels >=2.6.3.
+
+Sidenote: Since this seems to be a commonly made mistake: sisfb and vesafb
+cannot be active at the same time! Do only select one of them in your kernel
+configuration.
+
+
+How are parameters passed to sisfb?
+===================================
+
+Well, it depends: If compiled statically into the kernel, use lilo's append
+statement to add the parameters to the kernel command line. Please see lilo's
+(or GRUB's) documentation for more information. If sisfb is a kernel module,
+parameters are given with the modprobe (or insmod) command.
+
+Example for sisfb as part of the static kernel: Add the following line to your
+lilo.conf:
+
+ append="video=sisfb:mode:1024x768x16,mem:12288,rate:75"
+
+Example for sisfb as a module: Start sisfb by typing
+
+ modprobe sisfb mode=1024x768x16 rate=75 mem=12288
+
+A common mistake is that folks use a wrong parameter format when using the
+driver compiled into the kernel. Please note: If compiled into the kernel,
+the parameter format is video=sisfb:mode:none or video=sisfb:mode:1024x768x16
+(or whatever mode you want to use, alternatively using any other format
+described above or the vesa keyword instead of mode). If compiled as a module,
+the parameter format reads mode=none or mode=1024x768x16 (or whatever mode you
+want to use). Using a "=" for a ":" (and vice versa) is a huge difference!
+Additionally: If you give more than one argument to the in-kernel sisfb, the
+arguments are separated with ",". For example:
+
+ video=sisfb:mode:1024x768x16,rate:75,mem:12288
+
+
+How do I use it?
+================
+
+Preface statement: This file only covers very little of the driver's
+capabilities and features. Please refer to the author's and maintainer's
+website at http://www.winischhofer.net/linuxsisvga.shtml for more
+information. Additionally, "modinfo sisfb" gives an overview over all
+supported options including some explanation.
+
+The desired display mode can be specified using the keyword "mode" with
+a parameter in one of the following formats:
+ - XxYxDepth or
+ - XxY-Depth or
+ - XxY-Depth@Rate or
+ - XxY
+ - or simply use the VESA mode number in hexadecimal or decimal.
+
+For example: 1024x768x16, 1024x768-16@75, 1280x1024-16. If no depth is
+specified, it defaults to 8. If no rate is given, it defaults to 60Hz. Depth 32
+means 24bit color depth (but 32 bit framebuffer depth, which is not relevant
+to the user).
+
+Additionally, sisfb understands the keyword "vesa" followed by a VESA mode
+number in decimal or hexadecimal. For example: vesa=791 or vesa=0x117. Please
+use either "mode" or "vesa" but not both.
+
+Linux 2.4 only: If no mode is given, sisfb defaults to "no mode" (mode=none) if
+compiled as a module; if sisfb is statically compiled into the kernel, it
+defaults to 800x600x8 unless CRT2 type is LCD, in which case the LCD's native
+resolution is used. If you want to switch to a different mode, use the fbset
+shell command.
+
+Linux 2.6 only: If no mode is given, sisfb defaults to 800x600x8 unless CRT2
+type is LCD, in which case it defaults to the LCD's native resolution. If
+you want to switch to another mode, use the stty shell command.
+
+You should compile in both vgacon (to boot if you remove you SiS card from
+your system) and sisfb (for graphics mode). Under Linux 2.6, also "Framebuffer
+console support" (fbcon) is needed for a graphical console.
+
+You should *not* compile-in vesafb. And please do not use the "vga=" keyword
+in lilo's or grub's configuration file; mode selection is done using the
+"mode" or "vesa" keywords as a parameter. See above and below.
+
+
+X11
+===
+
+If using XFree86 or X.org, it is recommended that you don't use the "fbdev"
+driver but the dedicated "sis" X driver. The "sis" X driver and sisfb are
+developed by the same person (Thomas Winischhofer) and cooperate well with
+each other.
+
+
+SVGALib
+=======
+
+SVGALib, if directly accessing the hardware, never restores the screen
+correctly, especially on laptops or if the output devices are LCD or TV.
+Therefore, use the chipset "FBDEV" in SVGALib configuration. This will make
+SVGALib use the framebuffer device for mode switches and restoration.
+
+
+Configuration
+=============
+
+(Some) accepted options:
+
+off - Disable sisfb. This option is only understood if sisfb is
+ in-kernel, not a module.
+mem:X - size of memory for the console, rest will be used for DRI/DRM. X
+ is in kilobytes. On 300 series, the default is 4096, 8192 or
+ 16384 (each in kilobyte) depending on how much video ram the card
+ has. On 315/330 series, the default is the maximum available ram
+ (since DRI/DRM is not supported for these chipsets).
+noaccel - do not use 2D acceleration engine. (Default: use acceleration)
+noypan - disable y-panning and scroll by redrawing the entire screen.
+ This is much slower than y-panning. (Default: use y-panning)
+vesa:X - selects startup videomode. X is number from 0 to 0x1FF and
+ represents the VESA mode number (can be given in decimal or
+ hexadecimal form, the latter prefixed with "0x").
+mode:X - selects startup videomode. Please see above for the format of
+ "X".
+
+Boolean options such as "noaccel" or "noypan" are to be given without a
+parameter if sisfb is in-kernel (for example "video=sisfb:noypan). If
+sisfb is a module, these are to be set to 1 (for example "modprobe sisfb
+noypan=1").
+
+--
+Thomas Winischhofer <thomas@winischhofer.net>
+May 27, 2004
+
+
diff --git a/Documentation/fb/sstfb.txt b/Documentation/fb/sstfb.txt
new file mode 100644
index 0000000..550ca77
--- /dev/null
+++ b/Documentation/fb/sstfb.txt
@@ -0,0 +1,174 @@
+
+Introduction
+
+ This is a frame buffer device driver for 3dfx' Voodoo Graphics
+ (aka voodoo 1, aka sst1) and Voodoo² (aka Voodoo 2, aka CVG) based
+ video boards. It's highly experimental code, but is guaranteed to work
+ on my computer, with my "Maxi Gamer 3D" and "Maxi Gamer 3d²" boards,
+ and with me "between chair and keyboard". Some people tested other
+ combinations and it seems that it works.
+ The main page is located at <http://sstfb.sourceforge.net>, and if
+ you want the latest version, check out the CVS, as the driver is a work
+ in progress, I feel uncomfortable with releasing tarballs of something
+ not completely working...Don't worry, it's still more than useable
+ (I eat my own dog food)
+
+ Please read the Bug section, and report any success or failure to me
+ (Ghozlane Toumi <gtoumi@laposte.net>).
+ BTW, If you have only one monitor , and you don't feel like playing
+ with the vga passthrou cable, I can only suggest borrowing a screen
+ somewhere...
+
+
+Installation
+
+ This driver (should) work on ix86, with "late" 2.2.x kernel (tested
+ with x = 19) and "recent" 2.4.x kernel, as a module or compiled in.
+ It has been included in mainstream kernel since the infamous 2.4.10.
+ You can apply the patches found in sstfb/kernel/*-2.{2|4}.x.patch,
+ and copy sstfb.c to linux/drivers/video/, or apply a single patch,
+ sstfb/patch-2.{2|4}.x-sstfb-yymmdd to your linux source tree.
+
+ Then configure your kernel as usual: choose "m" or "y" to 3Dfx Voodoo
+ Graphics in section "console". Compile, install, have fun... and please
+ drop me a report :)
+
+
+Module Usage
+
+ Warnings.
+ # You should read completely this section before issuing any command.
+ # If you have only one monitor to play with, once you insmod the
+ module, the 3dfx takes control of the output, so you'll have to
+ plug the monitor to the "normal" video board in order to issue
+ the commands, or you can blindly use sst_dbg_vgapass
+ in the tools directory (See Tools). The latest solution is pass the
+ parameter vgapass=1 when insmodding the driver. (See Kernel/Modules
+ Options)
+
+ Module insertion:
+ # insmod sstfb.o
+ you should see some strange output from the board:
+ a big blue square, a green and a red small squares and a vertical
+ white rectangle. why? the function's name is self-explanatory:
+ "sstfb_test()"...
+ (if you don't have a second monitor, you'll have to plug your monitor
+ directly to the 2D videocard to see what you're typing)
+ # con2fb /dev/fbx /dev/ttyx
+ bind a tty to the new frame buffer. if you already have a frame
+ buffer driver, the voodoo fb will likely be /dev/fb1. if not,
+ the device will be /dev/fb0. You can check this by doing a
+ cat /proc/fb. You can find a copy of con2fb in tools/ directory.
+ if you don't have another fb device, this step is superfluous,
+ as the console subsystem automagicaly binds ttys to the fb.
+ # switch to the virtual console you just mapped. "tadaaa" ...
+
+ Module removal:
+ # con2fb /dev/fbx /dev/ttyx
+ bind the tty to the old frame buffer so the module can be removed.
+ (how does it work with vgacon ? short answer : it doesn't work)
+ # rmmod sstfb
+
+
+Kernel/Modules Options
+
+ You can pass some options to the sstfb module, and via the kernel
+ command line when the driver is compiled in:
+ for module : insmod sstfb.o option1=value1 option2=value2 ...
+ in kernel : video=sstfb:option1,option2:value2,option3 ...
+
+ sstfb supports the following options :
+
+Module Kernel Description
+
+vgapass=0 vganopass Enable or disable VGA passthrou cable.
+vgapass=1 vgapass When enabled, the monitor will get the signal
+ from the VGA board and not from the voodoo.
+ Default: nopass
+
+mem=x mem:x Force frame buffer memory in MiB
+ allowed values: 0, 1, 2, 4.
+ Default: 0 (= autodetect)
+
+inverse=1 inverse Supposed to enable inverse console.
+ doesn't work yet...
+
+clipping=1 clipping Enable or disable clipping.
+clipping=0 noclipping With clipping enabled, all offscreen
+ reads and writes are discarded.
+ Default: enable clipping.
+
+gfxclk=x gfxclk:x Force graphic clock frequency (in MHz).
+ Be careful with this option, it may be
+ DANGEROUS.
+ Default: auto
+ 50Mhz for Voodoo 1,
+ 75MHz for Voodoo 2.
+
+slowpci=1 fastpci Enable or disable fast PCI read/writes.
+slowpci=1 slowpci Default : fastpci
+
+dev=x dev:x Attach the driver to device number x.
+ 0 is the first compatible board (in
+ lspci order)
+
+Tools
+
+ These tools are mostly for debugging purposes, but you can
+ find some of these interesting :
+ - con2fb , maps a tty to a fbramebuffer .
+ con2fb /dev/fb1 /dev/tty5
+ - sst_dbg_vgapass , changes vga passthrou. You have to recompile the
+ driver with SST_DEBUG and SST_DEBUG_IOCTL set to 1
+ sst_dbg_vgapass /dev/fb1 1 (enables vga cable)
+ sst_dbg_vgapass /dev/fb1 0 (disables vga cable)
+ - glide_reset , resets the voodoo using glide
+ use this after rmmoding sstfb, if the module refuses to
+ reinsert .
+
+Bugs
+
+ - DO NOT use glide while the sstfb module is in, you'll most likely
+ hang your computer.
+ - If you see some artefacts (pixels not cleaning and stuff like that),
+ try turning off clipping (clipping=0), and/or using slowpci
+ - the driver don't detect the 4Mb frame buffer voodoos, it seems that
+ the 2 last Mbs wrap around. looking into that .
+ - The driver is 16 bpp only, 24/32 won't work.
+ - The driver is not your_favorite_toy-safe. this includes SMP...
+ [Actually from inspection it seems to be safe - Alan]
+ - When using XFree86 FBdev (X over fbdev) you may see strange color
+ patterns at the border of your windows (the pixels lose the lowest
+ byte -> basically the blue component and some of the green). I'm unable
+ to reproduce this with XFree86-3.3, but one of the testers has this
+ problem with XFree86-4. Apparently recent Xfree86-4.x solve this
+ problem.
+ - I didn't really test changing the palette, so you may find some weird
+ things when playing with that.
+ - Sometimes the driver will not recognise the DAC, and the
+ initialisation will fail. This is specifically true for
+ voodoo 2 boards, but it should be solved in recent versions. Please
+ contact me.
+ - The 24/32 is not likely to work anytime soon, knowing that the
+ hardware does ... unusual things in 24/32 bpp.
+ - When used with another video board, current limitations of the linux
+ console subsystem can cause some troubles, specifically, you should
+ disable software scrollback, as it can oops badly ...
+
+Todo
+
+ - Get rid of the previous paragraph.
+ - Buy more coffee.
+ - test/port to other arch.
+ - try to add panning using tweeks with front and back buffer .
+ - try to implement accel on voodoo2, this board can actually do a
+ lot in 2D even if it was sold as a 3D only board ...
+
+ghoz.
+
+--
+Ghozlane Toumi <gtoumi@laposte.net>
+
+
+$Date: 2002/05/09 20:11:45 $
+http://sstfb.sourceforge.net/README
diff --git a/Documentation/fb/tgafb.txt b/Documentation/fb/tgafb.txt
new file mode 100644
index 0000000..250083a
--- /dev/null
+++ b/Documentation/fb/tgafb.txt
@@ -0,0 +1,69 @@
+$Id: tgafb.txt,v 1.1.2.2 2000/04/04 06:50:18 mato Exp $
+
+What is tgafb?
+===============
+
+This is a driver for DECChip 21030 based graphics framebuffers, a.k.a. TGA
+cards, which are usually found in older Digital Alpha systems. The
+following models are supported:
+
+ZLxP-E1 (8bpp, 2 MB VRAM)
+ZLxP-E2 (32bpp, 8 MB VRAM)
+ZLxP-E3 (32bpp, 16 MB VRAM, Zbuffer)
+
+This version is an almost complete rewrite of the code written by Geert
+Uytterhoeven, which was based on the original TGA console code written by
+Jay Estabrook.
+
+Major new features since Linux 2.0.x:
+
+ * Support for multiple resolutions
+ * Support for fixed-frequency and other oddball monitors
+ (by allowing the video mode to be set at boot time)
+
+User-visible changes since Linux 2.2.x:
+
+ * Sync-on-green is now handled properly
+ * More useful information is printed on bootup
+ (this helps if people run into problems)
+
+This driver does not (yet) support the TGA2 family of framebuffers, so the
+PowerStorm 3D30/4D20 (also known as PBXGB) cards are not supported. These
+can however be used with the standard VGA Text Console driver.
+
+
+Configuration
+=============
+
+You can pass kernel command line options to tgafb with
+`video=tgafb:option1,option2:value2,option3' (multiple options should be
+separated by comma, values are separated from options by `:').
+Accepted options:
+
+font:X - default font to use. All fonts are supported, including the
+ SUN12x22 font which is very nice at high resolutions.
+
+mode:X - default video mode. The following video modes are supported:
+ 640x480-60, 800x600-56, 640x480-72, 800x600-60, 800x600-72,
+ 1024x768-60, 1152x864-60, 1024x768-70, 1024x768-76,
+ 1152x864-70, 1280x1024-61, 1024x768-85, 1280x1024-70,
+ 1152x864-84, 1280x1024-76, 1280x1024-85
+
+
+Known Issues
+============
+
+The XFree86 FBDev server has been reported not to work, since tgafb doesn't do
+mmap(). Running the standard XF86_TGA server from XFree86 3.3.x works fine for
+me, however this server does not do acceleration, which make certain operations
+quite slow. Support for acceleration is being progressively integrated in
+XFree86 4.x.
+
+When running tgafb in resolutions higher than 640x480, on switching VCs from
+tgafb to XF86_TGA 3.3.x, the entire screen is not re-drawn and must be manually
+refreshed. This is an X server problem, not a tgafb problem, and is fixed in
+XFree86 4.0.
+
+Enjoy!
+
+Martin Lucina <mato@kotelna.sk>
diff --git a/Documentation/fb/tridentfb.txt b/Documentation/fb/tridentfb.txt
new file mode 100644
index 0000000..45d9de5
--- /dev/null
+++ b/Documentation/fb/tridentfb.txt
@@ -0,0 +1,70 @@
+Tridentfb is a framebuffer driver for some Trident chip based cards.
+
+The following list of chips is thought to be supported although not all are
+tested:
+
+those from the TGUI series 9440/96XX and with Cyber in their names
+those from the Image series and with Cyber in their names
+those with Blade in their names (Blade3D,CyberBlade...)
+the newer CyberBladeXP family
+
+All families are accelerated. Only PCI/AGP based cards are supported,
+none of the older Tridents.
+The driver supports 8, 16 and 32 bits per pixel depths.
+The TGUI family requires a line length to be power of 2 if acceleration
+is enabled. This means that range of possible resolutions and bpp is
+limited comparing to the range if acceleration is disabled (see list
+of parameters below).
+
+Known bugs:
+1. The driver randomly locks up on 3DImage975 chip with acceleration
+ enabled. The same happens in X11 (Xorg).
+2. The ramdac speeds require some more fine tuning. It is possible to
+ switch resolution which the chip does not support at some depths for
+ older chips.
+
+How to use it?
+==============
+
+When booting you can pass the video parameter.
+video=tridentfb
+
+The parameters for tridentfb are concatenated with a ':' as in this example.
+
+video=tridentfb:800x600-16@75,noaccel
+
+The second level parameters that tridentfb understands are:
+
+noaccel - turns off acceleration (when it doesn't work for your card)
+
+fp - use flat panel related stuff
+crt - assume monitor is present instead of fp
+
+center - for flat panels and resolutions smaller than native size center the
+ image, otherwise use
+stretch
+
+memsize - integer value in KB, use if your card's memory size is misdetected.
+ look at the driver output to see what it says when initializing.
+
+memdiff - integer value in KB, should be nonzero if your card reports
+ more memory than it actually has. For instance mine is 192K less than
+ detection says in all three BIOS selectable situations 2M, 4M, 8M.
+ Only use if your video memory is taken from main memory hence of
+ configurable size. Otherwise use memsize.
+ If in some modes which barely fit the memory you see garbage
+ at the bottom this might help by not letting change to that mode
+ anymore.
+
+nativex - the width in pixels of the flat panel.If you know it (usually 1024
+ 800 or 1280) and it is not what the driver seems to detect use it.
+
+bpp - bits per pixel (8,16 or 32)
+mode - a mode name like 800x600-8@75 as described in
+ Documentation/fb/modedb.txt
+
+Using insane values for the above parameters will probably result in driver
+misbehaviour so take care(for instance memsize=12345678 or memdiff=23784 or
+nativex=93)
+
+Contact: jani@astechnix.ro
diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.txt
new file mode 100644
index 0000000..7ac3c40
--- /dev/null
+++ b/Documentation/fb/uvesafb.txt
@@ -0,0 +1,188 @@
+
+uvesafb - A Generic Driver for VBE2+ compliant video cards
+==========================================================
+
+1. Requirements
+---------------
+
+uvesafb should work with any video card that has a Video BIOS compliant
+with the VBE 2.0 standard.
+
+Unlike other drivers, uvesafb makes use of a userspace helper called
+v86d. v86d is used to run the x86 Video BIOS code in a simulated and
+controlled environment. This allows uvesafb to function on arches other
+than x86. Check the v86d documentation for a list of currently supported
+arches.
+
+v86d source code can be downloaded from the following website:
+ http://dev.gentoo.org/~spock/projects/uvesafb
+
+Please refer to the v86d documentation for detailed configuration and
+installation instructions.
+
+Note that the v86d userspace helper has to be available at all times in
+order for uvesafb to work properly. If you want to use uvesafb during
+early boot, you will have to include v86d into an initramfs image, and
+either compile it into the kernel or use it as an initrd.
+
+2. Caveats and limitations
+--------------------------
+
+uvesafb is a _generic_ driver which supports a wide variety of video
+cards, but which is ultimately limited by the Video BIOS interface.
+The most important limitations are:
+
+- Lack of any type of acceleration.
+- A strict and limited set of supported video modes. Often the native
+ or most optimal resolution/refresh rate for your setup will not work
+ with uvesafb, simply because the Video BIOS doesn't support the
+ video mode you want to use. This can be especially painful with
+ widescreen panels, where native video modes don't have the 4:3 aspect
+ ratio, which is what most BIOS-es are limited to.
+- Adjusting the refresh rate is only possible with a VBE 3.0 compliant
+ Video BIOS. Note that many nVidia Video BIOS-es claim to be VBE 3.0
+ compliant, while they simply ignore any refresh rate settings.
+
+3. Configuration
+----------------
+
+uvesafb can be compiled either as a module, or directly into the kernel.
+In both cases it supports the same set of configuration options, which
+are either given on the kernel command line or as module parameters, e.g.:
+
+ video=uvesafb:1024x768-32,mtrr:3,ywrap (compiled into the kernel)
+
+ # modprobe uvesafb mode_option=1024x768-32 mtrr=3 scroll=ywrap (module)
+
+Accepted options:
+
+ypan Enable display panning using the VESA protected mode
+ interface. The visible screen is just a window of the
+ video memory, console scrolling is done by changing the
+ start of the window. Available on x86 only.
+
+ywrap Same as ypan, but assumes your gfx board can wrap-around
+ the video memory (i.e. starts reading from top if it
+ reaches the end of video memory). Faster than ypan.
+ Available on x86 only.
+
+redraw Scroll by redrawing the affected part of the screen, this
+ is the safe (and slow) default.
+
+(If you're using uvesafb as a module, the above three options are
+ used a parameter of the scroll option, e.g. scroll=ypan.)
+
+vgapal Use the standard VGA registers for palette changes.
+
+pmipal Use the protected mode interface for palette changes.
+ This is the default if the protected mode interface is
+ available. Available on x86 only.
+
+mtrr:n Setup memory type range registers for the framebuffer
+ where n:
+ 0 - disabled (equivalent to nomtrr) (default)
+ 1 - uncachable
+ 2 - write-back
+ 3 - write-combining
+ 4 - write-through
+
+ If you see the following in dmesg, choose the type that matches
+ the old one. In this example, use "mtrr:2".
+...
+mtrr: type mismatch for e0000000,8000000 old: write-back new: write-combining
+...
+
+nomtrr Do not use memory type range registers.
+
+vremap:n
+ Remap 'n' MiB of video RAM. If 0 or not specified, remap memory
+ according to video mode.
+
+vtotal:n
+ If the video BIOS of your card incorrectly determines the total
+ amount of video RAM, use this option to override the BIOS (in MiB).
+
+<mode> The mode you want to set, in the standard modedb format. Refer to
+ modedb.txt for a detailed description. When uvesafb is compiled as
+ a module, the mode string should be provided as a value of the
+ 'mode_option' option.
+
+vbemode:x
+ Force the use of VBE mode x. The mode will only be set if it's
+ found in the VBE-provided list of supported modes.
+ NOTE: The mode number 'x' should be specified in VESA mode number
+ notation, not the Linux kernel one (eg. 257 instead of 769).
+ HINT: If you use this option because normal <mode> parameter does
+ not work for you and you use a X server, you'll probably want to
+ set the 'nocrtc' option to ensure that the video mode is properly
+ restored after console <-> X switches.
+
+nocrtc Do not use CRTC timings while setting the video mode. This option
+ has any effect only if the Video BIOS is VBE 3.0 compliant. Use it
+ if you have problems with modes set the standard way. Note that
+ using this option implies that any refresh rate adjustments will
+ be ignored and the refresh rate will stay at your BIOS default (60 Hz).
+
+noedid Do not try to fetch and use EDID-provided modes.
+
+noblank Disable hardware blanking.
+
+v86d:path
+ Set path to the v86d executable. This option is only available as
+ a module parameter, and not as a part of the video= string. If you
+ need to use it and have uvesafb built into the kernel, use
+ uvesafb.v86d="path".
+
+Additionally, the following parameters may be provided. They all override the
+EDID-provided values and BIOS defaults. Refer to your monitor's specs to get
+the correct values for maxhf, maxvf and maxclk for your hardware.
+
+maxhf:n Maximum horizontal frequency (in kHz).
+maxvf:n Maximum vertical frequency (in Hz).
+maxclk:n Maximum pixel clock (in MHz).
+
+4. The sysfs interface
+----------------------
+
+uvesafb provides several sysfs nodes for configurable parameters and
+additional information.
+
+Driver attributes:
+
+/sys/bus/platform/drivers/uvesafb
+ - v86d (default: /sbin/v86d)
+ Path to the v86d executable. v86d is started by uvesafb
+ if an instance of the daemon isn't already running.
+
+Device attributes:
+
+/sys/bus/platform/drivers/uvesafb/uvesafb.0
+ - nocrtc
+ Use the default refresh rate (60 Hz) if set to 1.
+
+ - oem_product_name
+ - oem_product_rev
+ - oem_string
+ - oem_vendor
+ Information about the card and its maker.
+
+ - vbe_modes
+ A list of video modes supported by the Video BIOS along with their
+ VBE mode numbers in hex.
+
+ - vbe_version
+ A BCD value indicating the implemented VBE standard.
+
+5. Miscellaneous
+----------------
+
+Uvesafb will set a video mode with the default refresh rate and timings
+from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo.
+
+
+--
+ Michal Januszewski <spock@gentoo.org>
+ Last updated: 2007-06-16
+
+ Documentation of the uvesafb options is loosely based on vesafb.txt.
+
diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt
new file mode 100644
index 0000000..ee277dd
--- /dev/null
+++ b/Documentation/fb/vesafb.txt
@@ -0,0 +1,181 @@
+
+What is vesafb?
+===============
+
+This is a generic driver for a graphic framebuffer on intel boxes.
+
+The idea is simple: Turn on graphics mode at boot time with the help
+of the BIOS, and use this as framebuffer device /dev/fb0, like the m68k
+(and other) ports do.
+
+This means we decide at boot time whenever we want to run in text or
+graphics mode. Switching mode later on (in protected mode) is
+impossible; BIOS calls work in real mode only. VESA BIOS Extensions
+Version 2.0 are required, because we need a linear frame buffer.
+
+Advantages:
+
+ * It provides a nice large console (128 cols + 48 lines with 1024x768)
+ without using tiny, unreadable fonts.
+ * You can run XF68_FBDev on top of /dev/fb0 (=> non-accelerated X11
+ support for every VBE 2.0 compliant graphics board).
+ * Most important: boot logo :-)
+
+Disadvantages:
+
+ * graphic mode is slower than text mode...
+
+
+How to use it?
+==============
+
+Switching modes is done using the vga=... boot parameter. Read
+Documentation/svga.txt for details.
+
+You should compile in both vgacon (for text mode) and vesafb (for
+graphics mode). Which of them takes over the console depends on
+whenever the specified mode is text or graphics.
+
+The graphic modes are NOT in the list which you get if you boot with
+vga=ask and hit return. The mode you wish to use is derived from the
+VESA mode number. Here are those VESA mode numbers:
+
+ | 640x480 800x600 1024x768 1280x1024
+----+-------------------------------------
+256 | 0x101 0x103 0x105 0x107
+32k | 0x110 0x113 0x116 0x119
+64k | 0x111 0x114 0x117 0x11A
+16M | 0x112 0x115 0x118 0x11B
+
+The video mode number of the Linux kernel is the VESA mode number plus
+0x200.
+
+ Linux_kernel_mode_number = VESA_mode_number + 0x200
+
+So the table for the Kernel mode numbers are:
+
+ | 640x480 800x600 1024x768 1280x1024
+----+-------------------------------------
+256 | 0x301 0x303 0x305 0x307
+32k | 0x310 0x313 0x316 0x319
+64k | 0x311 0x314 0x317 0x31A
+16M | 0x312 0x315 0x318 0x31B
+
+To enable one of those modes you have to specify "vga=ask" in the
+lilo.conf file and rerun LILO. Then you can type in the desired
+mode at the "vga=ask" prompt. For example if you like to use
+1024x768x256 colors you have to say "305" at this prompt.
+
+If this does not work, this might be because your BIOS does not support
+linear framebuffers or because it does not support this mode at all.
+Even if your board does, it might be the BIOS which does not. VESA BIOS
+Extensions v2.0 are required, 1.2 is NOT sufficient. You will get a
+"bad mode number" message if something goes wrong.
+
+1. Note: LILO cannot handle hex, for booting directly with
+ "vga=mode-number" you have to transform the numbers to decimal.
+2. Note: Some newer versions of LILO appear to work with those hex values,
+ if you set the 0x in front of the numbers.
+
+X11
+===
+
+XF68_FBDev should work just fine, but it is non-accelerated. Running
+another (accelerated) X-Server like XF86_SVGA might or might not work.
+It depends on X-Server and graphics board.
+
+The X-Server must restore the video mode correctly, else you end up
+with a broken console (and vesafb cannot do anything about this).
+
+
+Refresh rates
+=============
+
+There is no way to change the vesafb video mode and/or timings after
+booting linux. If you are not happy with the 60 Hz refresh rate, you
+have these options:
+
+ * configure and load the DOS-Tools for your the graphics board (if
+ available) and boot linux with loadlin.
+ * use a native driver (matroxfb/atyfb) instead if vesafb. If none
+ is available, write a new one!
+ * VBE 3.0 might work too. I have neither a gfx board with VBE 3.0
+ support nor the specs, so I have not checked this yet.
+
+
+Configuration
+=============
+
+The VESA BIOS provides protected mode interface for changing
+some parameters. vesafb can use it for palette changes and
+to pan the display. It is turned off by default because it
+seems not to work with some BIOS versions, but there are options
+to turn it on.
+
+You can pass options to vesafb using "video=vesafb:option" on
+the kernel command line. Multiple options should be separated
+by comma, like this: "video=vesafb:ypan,invers"
+
+Accepted options:
+
+invers no comment...
+
+ypan enable display panning using the VESA protected mode
+ interface. The visible screen is just a window of the
+ video memory, console scrolling is done by changing the
+ start of the window.
+ pro: * scrolling (fullscreen) is fast, because there is
+ no need to copy around data.
+ * You'll get scrollback (the Shift-PgUp thing),
+ the video memory can be used as scrollback buffer
+ kontra: * scrolling only parts of the screen causes some
+ ugly flicker effects (boot logo flickers for
+ example).
+
+ywrap Same as ypan, but assumes your gfx board can wrap-around
+ the video memory (i.e. starts reading from top if it
+ reaches the end of video memory). Faster than ypan.
+
+redraw scroll by redrawing the affected part of the screen, this
+ is the safe (and slow) default.
+
+
+vgapal Use the standard vga registers for palette changes.
+ This is the default.
+pmipal Use the protected mode interface for palette changes.
+
+mtrr:n setup memory type range registers for the vesafb framebuffer
+ where n:
+ 0 - disabled (equivalent to nomtrr) (default)
+ 1 - uncachable
+ 2 - write-back
+ 3 - write-combining
+ 4 - write-through
+
+ If you see the following in dmesg, choose the type that matches the
+ old one. In this example, use "mtrr:2".
+...
+mtrr: type mismatch for e0000000,8000000 old: write-back new: write-combining
+...
+
+nomtrr disable mtrr
+
+vremap:n
+ remap 'n' MiB of video RAM. If 0 or not specified, remap memory
+ according to video mode. (2.5.66 patch/idea by Antonino Daplas
+ reversed to give override possibility (allocate more fb memory
+ than the kernel would) to 2.4 by tmb@iki.fi)
+
+vtotal:n
+ if the video BIOS of your card incorrectly determines the total
+ amount of video RAM, use this option to override the BIOS (in MiB).
+
+Have fun!
+
+ Gerd
+
+--
+Gerd Knorr <kraxel@goldbach.in-berlin.de>
+
+Minor (mostly typo) changes
+by Nico Schmoigl <schmoigl@rumms.uni-mannheim.de>
diff --git a/Documentation/fb/viafb.modes b/Documentation/fb/viafb.modes
new file mode 100644
index 0000000..02e5b48
--- /dev/null
+++ b/Documentation/fb/viafb.modes
@@ -0,0 +1,870 @@
+#
+#
+# These data are based on the CRTC parameters in
+#
+# VIA Integration Graphics Chip
+# (C) 2004 VIA Technologies Inc.
+#
+
+#
+# 640x480, 60 Hz, Non-Interlaced (25.175 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 640 480
+# Scan Frequency 31.469 kHz 59.94 Hz
+# Sync Width 3.813 us 0.064 ms
+# 12 chars 2 lines
+# Front Porch 0.636 us 0.318 ms
+# 2 chars 10 lines
+# Back Porch 1.907 us 1.048 ms
+# 6 chars 33 lines
+# Active Time 25.422 us 15.253 ms
+# 80 chars 480 lines
+# Blank Time 6.356 us 1.430 ms
+# 20 chars 45 lines
+# Polarity negative negative
+#
+
+mode "640x480-60"
+# D: 25.175 MHz, H: 31.469 kHz, V: 59.94 Hz
+ geometry 640 480 640 480 32
+ timings 39722 48 16 33 10 96 2 endmode mode "480x640-60"
+# D: 24.823 MHz, H: 39.780 kHz, V: 60.00 Hz
+ geometry 480 640 480 640 32 timings 39722 72 24 19 1 48 3 endmode
+#
+# 640x480, 75 Hz, Non-Interlaced (31.50 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 640 480
+# Scan Frequency 37.500 kHz 75.00 Hz
+# Sync Width 2.032 us 0.080 ms
+# 8 chars 3 lines
+# Front Porch 0.508 us 0.027 ms
+# 2 chars 1 lines
+# Back Porch 3.810 us 0.427 ms
+# 15 chars 16 lines
+# Active Time 20.317 us 12.800 ms
+# 80 chars 480 lines
+# Blank Time 6.349 us 0.533 ms
+# 25 chars 20 lines
+# Polarity negative negative
+#
+ mode "640x480-75"
+# D: 31.50 MHz, H: 37.500 kHz, V: 75.00 Hz
+ geometry 640 480 640 480 32 timings 31747 120 16 16 1 64 3 endmode
+#
+# 640x480, 85 Hz, Non-Interlaced (36.000 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 640 480
+# Scan Frequency 43.269 kHz 85.00 Hz
+# Sync Width 1.556 us 0.069 ms
+# 7 chars 3 lines
+# Front Porch 1.556 us 0.023 ms
+# 7 chars 1 lines
+# Back Porch 2.222 us 0.578 ms
+# 10 chars 25 lines
+# Active Time 17.778 us 11.093 ms
+# 80 chars 480 lines
+# Blank Time 5.333 us 0.670 ms
+# 24 chars 29 lines
+# Polarity negative negative
+#
+ mode "640x480-85"
+# D: 36.000 MHz, H: 43.269 kHz, V: 85.00 Hz
+ geometry 640 480 640 480 32 timings 27777 80 56 25 1 56 3 endmode
+#
+# 640x480, 100 Hz, Non-Interlaced (43.163 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 640 480
+# Scan Frequency 50.900 kHz 100.00 Hz
+# Sync Width 1.483 us 0.058 ms
+# 8 chars 3 lines
+# Front Porch 0.927 us 0.019 ms
+# 5 chars 1 lines
+# Back Porch 2.409 us 0.475 ms
+# 13 chars 25 lines
+# Active Time 14.827 us 9.430 ms
+# 80 chars 480 lines
+# Blank Time 4.819 us 0.570 ms
+# 26 chars 29 lines
+# Polarity positive positive
+#
+ mode "640x480-100"
+# D: 43.163 MHz, H: 50.900 kHz, V: 100.00 Hz
+ geometry 640 480 640 480 32 timings 23168 104 40 25 1 64 3 endmode
+#
+# 640x480, 120 Hz, Non-Interlaced (52.406 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 640 480
+# Scan Frequency 61.800 kHz 120.00 Hz
+# Sync Width 1.221 us 0.048 ms
+# 8 chars 3 lines
+# Front Porch 0.763 us 0.016 ms
+# 5 chars 1 lines
+# Back Porch 1.984 us 0.496 ms
+# 13 chars 31 lines
+# Active Time 12.212 us 7.767 ms
+# 80 chars 480 lines
+# Blank Time 3.969 us 0.566 ms
+# 26 chars 35 lines
+# Polarity positive positive
+#
+ mode "640x480-120"
+# D: 52.406 MHz, H: 61.800 kHz, V: 120.00 Hz
+ geometry 640 480 640 480 32 timings 19081 104 40 31 1 64 3 endmode
+#
+# 720x480, 60 Hz, Non-Interlaced (26.880 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 720 480
+# Scan Frequency 30.000 kHz 60.241 Hz
+# Sync Width 2.679 us 0.099 ms
+# 9 chars 3 lines
+# Front Porch 0.595 us 0.033 ms
+# 2 chars 1 lines
+# Back Porch 3.274 us 0.462 ms
+# 11 chars 14 lines
+# Active Time 26.786 us 16.000 ms
+# 90 chars 480 lines
+# Blank Time 6.548 us 0.600 ms
+# 22 chars 18 lines
+# Polarity positive positive
+#
+ mode "720x480-60"
+# D: 26.880 MHz, H: 30.000 kHz, V: 60.24 Hz
+ geometry 720 480 720 480 32 timings 37202 88 16 14 1 72 3 endmode
+#
+# 800x480, 60 Hz, Non-Interlaced (29.581 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 800 480
+# Scan Frequency 29.892 kHz 60.00 Hz
+# Sync Width 2.704 us 100.604 us
+# 10 chars 3 lines
+# Front Porch 0.541 us 33.535 us
+# 2 chars 1 lines
+# Back Porch 3.245 us 435.949 us
+# 12 chars 13 lines
+# Active Time 27.044 us 16.097 ms
+# 100 chars 480 lines
+# Blank Time 6.491 us 0.570 ms
+# 24 chars 17 lines
+# Polarity positive positive
+#
+ mode "800x480-60"
+# D: 29.500 MHz, H: 29.738 kHz, V: 60.00 Hz
+ geometry 800 480 800 480 32 timings 33805 96 24 10 3 72 7 endmode
+#
+# 720x576, 60 Hz, Non-Interlaced (32.668 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 720 576
+# Scan Frequency 35.820 kHz 60.00 Hz
+# Sync Width 2.204 us 0.083 ms
+# 9 chars 3 lines
+# Front Porch 0.735 us 0.027 ms
+# 3 chars 1 lines
+# Back Porch 2.939 us 0.459 ms
+# 12 chars 17 lines
+# Active Time 22.040 us 16.080 ms
+# 90 chars 476 lines
+# Blank Time 5.877 us 0.586 ms
+# 24 chars 21 lines
+# Polarity positive positive
+#
+ mode "720x576-60"
+# D: 32.668 MHz, H: 35.820 kHz, V: 60.00 Hz
+ geometry 720 576 720 576 32 timings 30611 96 24 17 1 72 3 endmode
+#
+# 800x600, 60 Hz, Non-Interlaced (40.00 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 800 600
+# Scan Frequency 37.879 kHz 60.32 Hz
+# Sync Width 3.200 us 0.106 ms
+# 16 chars 4 lines
+# Front Porch 1.000 us 0.026 ms
+# 5 chars 1 lines
+# Back Porch 2.200 us 0.607 ms
+# 11 chars 23 lines
+# Active Time 20.000 us 15.840 ms
+# 100 chars 600 lines
+# Blank Time 6.400 us 0.739 ms
+# 32 chars 28 lines
+# Polarity positive positive
+#
+ mode "800x600-60"
+# D: 40.00 MHz, H: 37.879 kHz, V: 60.32 Hz
+ geometry 800 600 800 600 32
+ timings 25000 88 40 23 1 128 4 hsync high vsync high endmode
+#
+# 800x600, 75 Hz, Non-Interlaced (49.50 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 800 600
+# Scan Frequency 46.875 kHz 75.00 Hz
+# Sync Width 1.616 us 0.064 ms
+# 10 chars 3 lines
+# Front Porch 0.323 us 0.021 ms
+# 2 chars 1 lines
+# Back Porch 3.232 us 0.448 ms
+# 20 chars 21 lines
+# Active Time 16.162 us 12.800 ms
+# 100 chars 600 lines
+# Blank Time 5.172 us 0.533 ms
+# 32 chars 25 lines
+# Polarity positive positive
+#
+ mode "800x600-75"
+# D: 49.50 MHz, H: 46.875 kHz, V: 75.00 Hz
+ geometry 800 600 800 600 32
+ timings 20203 160 16 21 1 80 3 hsync high vsync high endmode
+#
+# 800x600, 85 Hz, Non-Interlaced (56.25 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 800 600
+# Scan Frequency 53.674 kHz 85.061 Hz
+# Sync Width 1.138 us 0.056 ms
+# 8 chars 3 lines
+# Front Porch 0.569 us 0.019 ms
+# 4 chars 1 lines
+# Back Porch 2.702 us 0.503 ms
+# 19 chars 27 lines
+# Active Time 14.222 us 11.179 ms
+# 100 chars 600 lines
+# Blank Time 4.409 us 0.578 ms
+# 31 chars 31 lines
+# Polarity positive positive
+#
+ mode "800x600-85"
+# D: 56.25 MHz, H: 53.674 kHz, V: 85.061 Hz
+ geometry 800 600 800 600 32
+ timings 17777 152 32 27 1 64 3 hsync high vsync high endmode
+#
+# 800x600, 100 Hz, Non-Interlaced (67.50 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 800 600
+# Scan Frequency 62.500 kHz 100.00 Hz
+# Sync Width 0.948 us 0.064 ms
+# 8 chars 4 lines
+# Front Porch 0.000 us 0.112 ms
+# 0 chars 7 lines
+# Back Porch 3.200 us 0.224 ms
+# 27 chars 14 lines
+# Active Time 11.852 us 9.600 ms
+# 100 chars 600 lines
+# Blank Time 4.148 us 0.400 ms
+# 35 chars 25 lines
+# Polarity positive positive
+#
+ mode "800x600-100"
+# D: 67.50 MHz, H: 62.500 kHz, V: 100.00 Hz
+ geometry 800 600 800 600 32
+ timings 14667 216 0 14 7 64 4 hsync high vsync high endmode
+#
+# 800x600, 120 Hz, Non-Interlaced (83.950 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 800 600
+# Scan Frequency 77.160 kHz 120.00 Hz
+# Sync Width 1.048 us 0.039 ms
+# 11 chars 3 lines
+# Front Porch 0.667 us 0.013 ms
+# 7 chars 1 lines
+# Back Porch 1.715 us 0.507 ms
+# 18 chars 39 lines
+# Active Time 9.529 us 7.776 ms
+# 100 chars 600 lines
+# Blank Time 3.431 us 0.557 ms
+# 36 chars 43 lines
+# Polarity positive positive
+#
+ mode "800x600-120"
+# D: 83.950 MHz, H: 77.160 kHz, V: 120.00 Hz
+ geometry 800 600 800 600 32
+ timings 11912 144 56 39 1 88 3 hsync high vsync high endmode
+#
+# 848x480, 60 Hz, Non-Interlaced (31.490 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 848 480
+# Scan Frequency 29.820 kHz 60.00 Hz
+# Sync Width 2.795 us 0.099 ms
+# 11 chars 3 lines
+# Front Porch 0.508 us 0.033 ms
+# 2 chars 1 lines
+# Back Porch 3.303 us 0.429 ms
+# 13 chars 13 lines
+# Active Time 26.929 us 16.097 ms
+# 106 chars 480 lines
+# Blank Time 6.605 us 0.570 ms
+# 26 chars 17 lines
+# Polarity positive positive
+#
+ mode "848x480-60"
+# D: 31.500 MHz, H: 29.830 kHz, V: 60.00 Hz
+ geometry 848 480 848 480 32
+ timings 31746 104 24 12 3 80 5 hsync high vsync high endmode
+#
+# 856x480, 60 Hz, Non-Interlaced (31.728 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 856 480
+# Scan Frequency 29.820 kHz 60.00 Hz
+# Sync Width 2.774 us 0.099 ms
+# 11 chars 3 lines
+# Front Porch 0.504 us 0.033 ms
+# 2 chars 1 lines
+# Back Porch 3.728 us 0.429 ms
+# 13 chars 13 lines
+# Active Time 26.979 us 16.097 ms
+# 107 chars 480 lines
+# Blank Time 6.556 us 0.570 ms
+# 26 chars 17 lines
+# Polarity positive positive
+#
+ mode "856x480-60"
+# D: 31.728 MHz, H: 29.820 kHz, V: 60.00 Hz
+ geometry 856 480 856 480 32
+ timings 31518 104 16 13 1 88 3
+ hsync high vsync high endmode mode "960x600-60"
+# D: 45.250 MHz, H: 37.212 kHz, V: 60.00 Hz
+ geometry 960 600 960 600 32 timings 22099 128 32 15 3 96 6 endmode
+#
+# 1000x600, 60 Hz, Non-Interlaced (48.068 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1000 600
+# Scan Frequency 37.320 kHz 60.00 Hz
+# Sync Width 2.164 us 0.080 ms
+# 13 chars 3 lines
+# Front Porch 0.832 us 0.027 ms
+# 5 chars 1 lines
+# Back Porch 2.996 us 0.483 ms
+# 18 chars 18 lines
+# Active Time 20.804 us 16.077 ms
+# 125 chars 600 lines
+# Blank Time 5.991 us 0.589 ms
+# 36 chars 22 lines
+# Polarity negative positive
+#
+ mode "1000x600-60"
+# D: 48.068 MHz, H: 37.320 kHz, V: 60.00 Hz
+ geometry 1000 600 1000 600 32
+ timings 20834 144 40 18 1 104 3 endmode mode "1024x576-60"
+# D: 46.996 MHz, H: 35.820 kHz, V: 60.00 Hz
+ geometry 1024 576 1024 576 32
+ timings 21278 144 40 17 1 104 3 endmode mode "1024x600-60"
+# D: 48.964 MHz, H: 37.320 kHz, V: 60.00 Hz
+ geometry 1024 600 1024 600 32
+ timings 20461 144 40 18 1 104 3 endmode mode "1088x612-60"
+# D: 52.952 MHz, H: 38.040 kHz, V: 60.00 Hz
+ geometry 1088 612 1088 612 32 timings 18877 152 48 16 3 104 5 endmode
+#
+# 1024x512, 60 Hz, Non-Interlaced (41.291 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1024 512
+# Scan Frequency 31.860 kHz 60.00 Hz
+# Sync Width 2.519 us 0.094 ms
+# 13 chars 3 lines
+# Front Porch 0.775 us 0.031 ms
+# 4 chars 1 lines
+# Back Porch 3.294 us 0.465 ms
+# 17 chars 15 lines
+# Active Time 24.800 us 16.070 ms
+# 128 chars 512 lines
+# Blank Time 6.587 us 0.596 ms
+# 34 chars 19 lines
+# Polarity positive positive
+#
+ mode "1024x512-60"
+# D: 41.291 MHz, H: 31.860 kHz, V: 60.00 Hz
+ geometry 1024 512 1024 512 32
+ timings 24218 126 32 15 1 104 3 hsync high vsync high endmode
+#
+# 1024x600, 60 Hz, Non-Interlaced (48.875 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1024 768
+# Scan Frequency 37.252 kHz 60.00 Hz
+# Sync Width 2.128 us 80.532us
+# 13 chars 3 lines
+# Front Porch 0.818 us 26.844 us
+# 5 chars 1 lines
+# Back Porch 2.946 us 483.192 us
+# 18 chars 18 lines
+# Active Time 20.951 us 16.697 ms
+# 128 chars 622 lines
+# Blank Time 5.893 us 0.591 ms
+# 36 chars 22 lines
+# Polarity negative positive
+#
+#mode "1024x600-60"
+# # D: 48.875 MHz, H: 37.252 kHz, V: 60.00 Hz
+# geometry 1024 600 1024 600 32
+# timings 20460 144 40 18 1 104 3
+# endmode
+#
+# 1024x768, 60 Hz, Non-Interlaced (65.00 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1024 768
+# Scan Frequency 48.363 kHz 60.00 Hz
+# Sync Width 2.092 us 0.124 ms
+# 17 chars 6 lines
+# Front Porch 0.369 us 0.062 ms
+# 3 chars 3 lines
+# Back Porch 2.462 us 0.601 ms
+# 20 chars 29 lines
+# Active Time 15.754 us 15.880 ms
+# 128 chars 768 lines
+# Blank Time 4.923 us 0.786 ms
+# 40 chars 38 lines
+# Polarity negative negative
+#
+ mode "1024x768-60"
+# D: 65.00 MHz, H: 48.363 kHz, V: 60.00 Hz
+ geometry 1024 768 1024 768 32 timings 15385 160 24 29 3 136 6 endmode
+#
+# 1024x768, 75 Hz, Non-Interlaced (78.75 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1024 768
+# Scan Frequency 60.023 kHz 75.03 Hz
+# Sync Width 1.219 us 0.050 ms
+# 12 chars 3 lines
+# Front Porch 0.203 us 0.017 ms
+# 2 chars 1 lines
+# Back Porch 2.235 us 0.466 ms
+# 22 chars 28 lines
+# Active Time 13.003 us 12.795 ms
+# 128 chars 768 lines
+# Blank Time 3.657 us 0.533 ms
+# 36 chars 32 lines
+# Polarity positive positive
+#
+ mode "1024x768-75"
+# D: 78.75 MHz, H: 60.023 kHz, V: 75.03 Hz
+ geometry 1024 768 1024 768 32
+ timings 12699 176 16 28 1 96 3 hsync high vsync high endmode
+#
+# 1024x768, 85 Hz, Non-Interlaced (94.50 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1024 768
+# Scan Frequency 68.677 kHz 85.00 Hz
+# Sync Width 1.016 us 0.044 ms
+# 12 chars 3 lines
+# Front Porch 0.508 us 0.015 ms
+# 6 chars 1 lines
+# Back Porch 2.201 us 0.524 ms
+# 26 chars 36 lines
+# Active Time 10.836 us 11.183 ms
+# 128 chars 768 lines
+# Blank Time 3.725 us 0.582 ms
+# 44 chars 40 lines
+# Polarity positive positive
+#
+ mode "1024x768-85"
+# D: 94.50 MHz, H: 68.677 kHz, V: 85.00 Hz
+ geometry 1024 768 1024 768 32
+ timings 10582 208 48 36 1 96 3 hsync high vsync high endmode
+#
+# 1024x768, 100 Hz, Non-Interlaced (110.0 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1024 768
+# Scan Frequency 79.023 kHz 99.78 Hz
+# Sync Width 0.800 us 0.101 ms
+# 11 chars 8 lines
+# Front Porch 0.000 us 0.000 ms
+# 0 chars 0 lines
+# Back Porch 2.545 us 0.202 ms
+# 35 chars 16 lines
+# Active Time 9.309 us 9.719 ms
+# 128 chars 768 lines
+# Blank Time 3.345 us 0.304 ms
+# 46 chars 24 lines
+# Polarity negative negative
+#
+ mode "1024x768-100"
+# D: 113.3 MHz, H: 79.023 kHz, V: 99.78 Hz
+ geometry 1024 768 1024 768 32
+ timings 8825 280 0 16 0 88 8 endmode mode "1152x720-60"
+# D: 66.750 MHz, H: 44.859 kHz, V: 60.00 Hz
+ geometry 1152 720 1152 720 32 timings 14981 168 56 19 3 112 6 endmode
+#
+# 1152x864, 75 Hz, Non-Interlaced (110.0 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1152 864
+# Scan Frequency 75.137 kHz 74.99 Hz
+# Sync Width 1.309 us 0.106 ms
+# 18 chars 8 lines
+# Front Porch 0.245 us 0.599 ms
+# 3 chars 45 lines
+# Back Porch 1.282 us 1.132 ms
+# 18 chars 85 lines
+# Active Time 10.473 us 11.499 ms
+# 144 chars 864 lines
+# Blank Time 2.836 us 1.837 ms
+# 39 chars 138 lines
+# Polarity positive positive
+#
+ mode "1152x864-75"
+# D: 110.0 MHz, H: 75.137 kHz, V: 74.99 Hz
+ geometry 1152 864 1152 864 32
+ timings 9259 144 24 85 45 144 8
+ hsync high vsync high endmode mode "1200x720-60"
+# D: 70.184 MHz, H: 44.760 kHz, V: 60.00 Hz
+ geometry 1200 720 1200 720 32
+ timings 14253 184 28 22 1 128 3 endmode mode "1280x600-60"
+# D: 61.503 MHz, H: 37.320 kHz, V: 60.00 Hz
+ geometry 1280 600 1280 600 32
+ timings 16260 184 28 18 1 128 3 endmode mode "1280x720-50"
+# D: 60.466 MHz, H: 37.050 kHz, V: 50.00 Hz
+ geometry 1280 720 1280 720 32
+ timings 16538 176 48 17 1 128 3 endmode mode "1280x768-50"
+# D: 65.178 MHz, H: 39.550 kHz, V: 50.00 Hz
+ geometry 1280 768 1280 768 32 timings 15342 184 28 19 1 128 3 endmode
+#
+# 1280x768, 60 Hz, Non-Interlaced (80.136 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1280 768
+# Scan Frequency 47.700 kHz 60.00 Hz
+# Sync Width 1.697 us 0.063 ms
+# 17 chars 3 lines
+# Front Porch 0.799 us 0.021 ms
+# 8 chars 1 lines
+# Back Porch 2.496 us 0.483 ms
+# 25 chars 23 lines
+# Active Time 15.973 us 16.101 ms
+# 160 chars 768 lines
+# Blank Time 4.992 us 0.566 ms
+# 50 chars 27 lines
+# Polarity positive positive
+#
+ mode "1280x768-60"
+# D: 80.13 MHz, H: 47.700 kHz, V: 60.00 Hz
+ geometry 1280 768 1280 768 32
+ timings 12480 200 48 23 1 126 3 hsync high vsync high endmode
+#
+# 1280x800, 60 Hz, Non-Interlaced (83.375 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1280 800
+# Scan Frequency 49.628 kHz 60.00 Hz
+# Sync Width 1.631 us 60.450 us
+# 17 chars 3 lines
+# Front Porch 0.768 us 20.15 us
+# 8 chars 1 lines
+# Back Porch 2.399 us 0.483 ms
+# 25 chars 24 lines
+# Active Time 15.352 us 16.120 ms
+# 160 chars 800 lines
+# Blank Time 4.798 us 0.564 ms
+# 50 chars 28 lines
+# Polarity negtive positive
+#
+ mode "1280x800-60"
+# D: 83.500 MHz, H: 49.702 kHz, V: 60.00 Hz
+ geometry 1280 800 1280 800 32 timings 11994 200 72 22 3 128 6 endmode
+#
+# 1280x960, 60 Hz, Non-Interlaced (108.00 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1280 960
+# Scan Frequency 60.000 kHz 60.00 Hz
+# Sync Width 1.037 us 0.050 ms
+# 14 chars 3 lines
+# Front Porch 0.889 us 0.017 ms
+# 12 chars 1 lines
+# Back Porch 2.889 us 0.600 ms
+# 39 chars 36 lines
+# Active Time 11.852 us 16.000 ms
+# 160 chars 960 lines
+# Blank Time 4.815 us 0.667 ms
+# 65 chars 40 lines
+# Polarity positive positive
+#
+ mode "1280x960-60"
+# D: 108.00 MHz, H: 60.000 kHz, V: 60.00 Hz
+ geometry 1280 960 1280 960 32
+ timings 9259 312 96 36 1 112 3 hsync high vsync high endmode
+#
+# 1280x1024, 60 Hz, Non-Interlaced (108.00 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1280 1024
+# Scan Frequency 63.981 kHz 60.02 Hz
+# Sync Width 1.037 us 0.047 ms
+# 14 chars 3 lines
+# Front Porch 0.444 us 0.015 ms
+# 6 chars 1 lines
+# Back Porch 2.297 us 0.594 ms
+# 31 chars 38 lines
+# Active Time 11.852 us 16.005 ms
+# 160 chars 1024 lines
+# Blank Time 3.778 us 0.656 ms
+# 51 chars 42 lines
+# Polarity positive positive
+#
+ mode "1280x1024-60"
+# D: 108.00 MHz, H: 63.981 kHz, V: 60.02 Hz
+ geometry 1280 1024 1280 1024 32
+ timings 9260 248 48 38 1 112 3 hsync high vsync high endmode
+#
+# 1280x1024, 75 Hz, Non-Interlaced (135.00 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1280 1024
+# Scan Frequency 79.976 kHz 75.02 Hz
+# Sync Width 1.067 us 0.038 ms
+# 18 chars 3 lines
+# Front Porch 0.119 us 0.012 ms
+# 2 chars 1 lines
+# Back Porch 1.837 us 0.475 ms
+# 31 chars 38 lines
+# Active Time 9.481 us 12.804 ms
+# 160 chars 1024 lines
+# Blank Time 3.022 us 0.525 ms
+# 51 chars 42 lines
+# Polarity positive positive
+#
+ mode "1280x1024-75"
+# D: 135.00 MHz, H: 79.976 kHz, V: 75.02 Hz
+ geometry 1280 1024 1280 1024 32
+ timings 7408 248 16 38 1 144 3 hsync high vsync high endmode
+#
+# 1280x1024, 85 Hz, Non-Interlaced (157.50 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1280 1024
+# Scan Frequency 91.146 kHz 85.02 Hz
+# Sync Width 1.016 us 0.033 ms
+# 20 chars 3 lines
+# Front Porch 0.406 us 0.011 ms
+# 8 chars 1 lines
+# Back Porch 1.422 us 0.483 ms
+# 28 chars 44 lines
+# Active Time 8.127 us 11.235 ms
+# 160 chars 1024 lines
+# Blank Time 2.844 us 0.527 ms
+# 56 chars 48 lines
+# Polarity positive positive
+#
+ mode "1280x1024-85"
+# D: 157.50 MHz, H: 91.146 kHz, V: 85.02 Hz
+ geometry 1280 1024 1280 1024 32
+ timings 6349 224 64 44 1 160 3
+ hsync high vsync high endmode mode "1440x900-60"
+# D: 106.500 MHz, H: 55.935 kHz, V: 60.00 Hz
+ geometry 1440 900 1440 900 32
+ timings 9390 232 80 25 3 152 6
+ hsync high vsync high endmode mode "1440x900-75"
+# D: 136.750 MHz, H: 70.635 kHz, V: 75.00 Hz
+ geometry 1440 900 1440 900 32
+ timings 7315 248 96 33 3 152 6 hsync high vsync high endmode
+#
+# 1440x1050, 60 Hz, Non-Interlaced (125.10 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1440 1050
+# Scan Frequency 65.220 kHz 60.00 Hz
+# Sync Width 1.204 us 0.046 ms
+# 19 chars 3 lines
+# Front Porch 0.760 us 0.015 ms
+# 12 chars 1 lines
+# Back Porch 1.964 us 0.495 ms
+# 31 chars 33 lines
+# Active Time 11.405 us 16.099 ms
+# 180 chars 1050 lines
+# Blank Time 3.928 us 0.567 ms
+# 62 chars 37 lines
+# Polarity positive positive
+#
+ mode "1440x1050-60"
+# D: 125.10 MHz, H: 65.220 kHz, V: 60.00 Hz
+ geometry 1440 1050 1440 1050 32
+ timings 7993 248 96 33 1 152 3
+ hsync high vsync high endmode mode "1600x900-60"
+# D: 118.250 MHz, H: 55.990 kHz, V: 60.00 Hz
+ geometry 1600 900 1600 900 32
+ timings 8415 256 88 26 3 168 5 endmode mode "1600x1024-60"
+# D: 136.358 MHz, H: 63.600 kHz, V: 60.00 Hz
+ geometry 1600 1024 1600 1024 32 timings 7315 272 104 32 1 168 3 endmode
+#
+# 1600x1200, 60 Hz, Non-Interlaced (156.00 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1600 1200
+# Scan Frequency 76.200 kHz 60.00 Hz
+# Sync Width 1.026 us 0.105 ms
+# 20 chars 8 lines
+# Front Porch 0.205 us 0.131 ms
+# 4 chars 10 lines
+# Back Porch 1.636 us 0.682 ms
+# 32 chars 52 lines
+# Active Time 10.256 us 15.748 ms
+# 200 chars 1200 lines
+# Blank Time 2.872 us 0.866 ms
+# 56 chars 66 lines
+# Polarity negative negative
+#
+ mode "1600x1200-60"
+# D: 156.00 MHz, H: 76.200 kHz, V: 60.00 Hz
+ geometry 1600 1200 1600 1200 32 timings 6172 256 32 52 10 160 8 endmode
+#
+# 1600x1200, 75 Hz, Non-Interlaced (202.50 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1600 1200
+# Scan Frequency 93.750 kHz 75.00 Hz
+# Sync Width 0.948 us 0.032 ms
+# 24 chars 3 lines
+# Front Porch 0.316 us 0.011 ms
+# 8 chars 1 lines
+# Back Porch 1.501 us 0.491 ms
+# 38 chars 46 lines
+# Active Time 7.901 us 12.800 ms
+# 200 chars 1200 lines
+# Blank Time 2.765 us 0.533 ms
+# 70 chars 50 lines
+# Polarity positive positive
+#
+ mode "1600x1200-75"
+# D: 202.50 MHz, H: 93.750 kHz, V: 75.00 Hz
+ geometry 1600 1200 1600 1200 32
+ timings 4938 304 64 46 1 192 3
+ hsync high vsync high endmode mode "1680x1050-60"
+# D: 146.250 MHz, H: 65.290 kHz, V: 59.954 Hz
+ geometry 1680 1050 1680 1050 32
+ timings 6814 280 104 30 3 176 6
+ hsync high vsync high endmode mode "1680x1050-75"
+# D: 187.000 MHz, H: 82.306 kHz, V: 74.892 Hz
+ geometry 1680 1050 1680 1050 32
+ timings 5348 296 120 40 3 176 6
+ hsync high vsync high endmode mode "1792x1344-60"
+# D: 202.975 MHz, H: 83.460 kHz, V: 60.00 Hz
+ geometry 1792 1344 1792 1344 32
+ timings 4902 320 128 43 1 192 3
+ hsync high vsync high endmode mode "1856x1392-60"
+# D: 218.571 MHz, H: 86.460 kHz, V: 60.00 Hz
+ geometry 1856 1392 1856 1392 32
+ timings 4577 336 136 45 1 200 3
+ hsync high vsync high endmode mode "1920x1200-60"
+# D: 193.250 MHz, H: 74.556 kHz, V: 60.00 Hz
+ geometry 1920 1200 1920 1200 32
+ timings 5173 336 136 36 3 200 6
+ hsync high vsync high endmode mode "1920x1440-60"
+# D: 234.000 MHz, H:90.000 kHz, V: 60.00 Hz
+ geometry 1920 1440 1920 1440 32
+ timings 4274 344 128 56 1 208 3
+ hsync high vsync high endmode mode "1920x1440-75"
+# D: 297.000 MHz, H:112.500 kHz, V: 75.00 Hz
+ geometry 1920 1440 1920 1440 32
+ timings 3367 352 144 56 1 224 3
+ hsync high vsync high endmode mode "2048x1536-60"
+# D: 267.250 MHz, H: 95.446 kHz, V: 60.00 Hz
+ geometry 2048 1536 2048 1536 32
+ timings 3742 376 152 49 3 224 4 hsync high vsync high endmode
+#
+# 1280x720, 60 Hz, Non-Interlaced (74.481 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1280 720
+# Scan Frequency 44.760 kHz 60.00 Hz
+# Sync Width 1.826 us 67.024 ms
+# 17 chars 3 lines
+# Front Porch 0.752 us 22.341 ms
+# 7 chars 1 lines
+# Back Porch 2.578 us 491.510 ms
+# 24 chars 22 lines
+# Active Time 17.186 us 16.086 ms
+# 160 chars 720 lines
+# Blank Time 5.156 us 0.581 ms
+# 48 chars 26 lines
+# Polarity negative negative
+#
+ mode "1280x720-60"
+# D: 74.481 MHz, H: 44.760 kHz, V: 60.00 Hz
+ geometry 1280 720 1280 720 32 timings 13426 192 64 22 1 136 3 endmode
+#
+# 1920x1080, 60 Hz, Non-Interlaced (172.798 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1920 1080
+# Scan Frequency 67.080 kHz 60.00 Hz
+# Sync Width 1.204 us 44.723 ms
+# 26 chars 3 lines
+# Front Porch 0.694 us 14.908 ms
+# 15 chars 1 lines
+# Back Porch 1.898 us 506.857 ms
+# 41 chars 34 lines
+# Active Time 11.111 us 16.100 ms
+# 240 chars 1080 lines
+# Blank Time 3.796 us 0.566 ms
+# 82 chars 38 lines
+# Polarity negative negative
+#
+ mode "1920x1080-60"
+# D: 74.481 MHz, H: 67.080 kHz, V: 60.00 Hz
+ geometry 1920 1080 1920 1080 32 timings 5787 328 120 34 1 208 3 endmode
+#
+# 1400x1050, 60 Hz, Non-Interlaced (122.61 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1400 1050
+# Scan Frequency 65.218 kHz 59.99 Hz
+# Sync Width 1.037 us 0.047 ms
+# 19 chars 3 lines
+# Front Porch 0.444 us 0.015 ms
+# 11 chars 1 lines
+# Back Porch 1.185 us 0.188 ms
+# 30 chars 33 lines
+# Active Time 12.963 us 16.411 ms
+# 175 chars 1050 lines
+# Blank Time 2.667 us 0.250 ms
+# 60 chars 37 lines
+# Polarity negative positive
+#
+ mode "1400x1050-60"
+# D: 122.750 MHz, H: 65.317 kHz, V: 59.99 Hz
+ geometry 1400 1050 1408 1050 32
+ timings 8214 232 88 32 3 144 4 endmode mode "1400x1050-75"
+# D: 156.000 MHz, H: 82.278 kHz, V: 74.867 Hz
+ geometry 1400 1050 1408 1050 32 timings 6410 248 104 42 3 144 4 endmode
+#
+# 1366x768, 60 Hz, Non-Interlaced (85.86 MHz dotclock)
+#
+# Horizontal Vertical
+# Resolution 1366 768
+# Scan Frequency 47.700 kHz 60.00 Hz
+# Sync Width 1.677 us 0.063 ms
+# 18 chars 3 lines
+# Front Porch 0.839 us 0.021 ms
+# 9 chars 1 lines
+# Back Porch 2.516 us 0.482 ms
+# 27 chars 23 lines
+# Active Time 15.933 us 16.101 ms
+# 171 chars 768 lines
+# Blank Time 5.031 us 0.566 ms
+# 54 chars 27 lines
+# Polarity negative positive
+#
+ mode "1360x768-60"
+# D: 84.750 MHz, H: 47.720 kHz, V: 60.00 Hz
+ geometry 1360 768 1360 768 32
+ timings 11799 208 72 22 3 136 5 endmode mode "1366x768-60"
+# D: 85.86 MHz, H: 47.700 kHz, V: 60.00 Hz
+ geometry 1366 768 1366 768 32
+ timings 11647 216 72 23 1 144 3 endmode mode "1366x768-50"
+# D: 69,924 MHz, H: 39.550 kHz, V: 50.00 Hz
+ geometry 1366 768 1366 768 32 timings 14301 200 56 19 1 144 3 endmode
diff --git a/Documentation/fb/viafb.txt b/Documentation/fb/viafb.txt
new file mode 100644
index 0000000..67dbf44
--- /dev/null
+++ b/Documentation/fb/viafb.txt
@@ -0,0 +1,214 @@
+
+ VIA Integration Graphic Chip Console Framebuffer Driver
+
+[Platform]
+-----------------------
+ The console framebuffer driver is for graphics chips of
+ VIA UniChrome Family(CLE266, PM800 / CN400 / CN300,
+ P4M800CE / P4M800Pro / CN700 / VN800,
+ CX700 / VX700, K8M890, P4M890,
+ CN896 / P4M900, VX800)
+
+[Driver features]
+------------------------
+ Device: CRT, LCD, DVI
+
+ Support viafb_mode:
+ CRT:
+ 640x480(60, 75, 85, 100, 120 Hz), 720x480(60 Hz),
+ 720x576(60 Hz), 800x600(60, 75, 85, 100, 120 Hz),
+ 848x480(60 Hz), 856x480(60 Hz), 1024x512(60 Hz),
+ 1024x768(60, 75, 85, 100 Hz), 1152x864(75 Hz),
+ 1280x768(60 Hz), 1280x960(60 Hz), 1280x1024(60, 75, 85 Hz),
+ 1440x1050(60 Hz), 1600x1200(60, 75 Hz), 1280x720(60 Hz),
+ 1920x1080(60 Hz), 1400x1050(60 Hz), 800x480(60 Hz)
+
+ color depth: 8 bpp, 16 bpp, 32 bpp supports.
+
+ Support 2D hardware accelerator.
+
+[Using the viafb module]
+-- -- --------------------
+ Start viafb with default settings:
+ #modprobe viafb
+
+ Start viafb with with user options:
+ #modprobe viafb viafb_mode=800x600 viafb_bpp=16 viafb_refresh=60
+ viafb_active_dev=CRT+DVI viafb_dvi_port=DVP1
+ viafb_mode1=1024x768 viafb_bpp=16 viafb_refresh1=60
+ viafb_SAMM_ON=1
+
+ viafb_mode:
+ 640x480 (default)
+ 720x480
+ 800x600
+ 1024x768
+ ......
+
+ viafb_bpp:
+ 8, 16, 32 (default:32)
+
+ viafb_refresh:
+ 60, 75, 85, 100, 120 (default:60)
+
+ viafb_lcd_dsp_method:
+ 0 : expansion (default)
+ 1 : centering
+
+ viafb_lcd_mode:
+ 0 : LCD panel with LSB data format input (default)
+ 1 : LCD panel with MSB data format input
+
+ viafb_lcd_panel_id:
+ 0 : Resolution: 640x480, Channel: single, Dithering: Enable
+ 1 : Resolution: 800x600, Channel: single, Dithering: Enable
+ 2 : Resolution: 1024x768, Channel: single, Dithering: Enable (default)
+ 3 : Resolution: 1280x768, Channel: single, Dithering: Enable
+ 4 : Resolution: 1280x1024, Channel: dual, Dithering: Enable
+ 5 : Resolution: 1400x1050, Channel: dual, Dithering: Enable
+ 6 : Resolution: 1600x1200, Channel: dual, Dithering: Enable
+
+ 8 : Resolution: 800x480, Channel: single, Dithering: Enable
+ 9 : Resolution: 1024x768, Channel: dual, Dithering: Enable
+ 10: Resolution: 1024x768, Channel: single, Dithering: Disable
+ 11: Resolution: 1024x768, Channel: dual, Dithering: Disable
+ 12: Resolution: 1280x768, Channel: single, Dithering: Disable
+ 13: Resolution: 1280x1024, Channel: dual, Dithering: Disable
+ 14: Resolution: 1400x1050, Channel: dual, Dithering: Disable
+ 15: Resolution: 1600x1200, Channel: dual, Dithering: Disable
+ 16: Resolution: 1366x768, Channel: single, Dithering: Disable
+ 17: Resolution: 1024x600, Channel: single, Dithering: Enable
+ 18: Resolution: 1280x768, Channel: dual, Dithering: Enable
+ 19: Resolution: 1280x800, Channel: single, Dithering: Enable
+
+ viafb_accel:
+ 0 : No 2D Hardware Acceleration
+ 1 : 2D Hardware Acceleration (default)
+
+ viafb_SAMM_ON:
+ 0 : viafb_SAMM_ON disable (default)
+ 1 : viafb_SAMM_ON enable
+
+ viafb_mode1: (secondary display device)
+ 640x480 (default)
+ 720x480
+ 800x600
+ 1024x768
+ ... ...
+
+ viafb_bpp1: (secondary display device)
+ 8, 16, 32 (default:32)
+
+ viafb_refresh1: (secondary display device)
+ 60, 75, 85, 100, 120 (default:60)
+
+ viafb_active_dev:
+ This option is used to specify active devices.(CRT, DVI, CRT+LCD...)
+ DVI stands for DVI or HDMI, E.g., If you want to enable HDMI,
+ set viafb_active_dev=DVI. In SAMM case, the previous of
+ viafb_active_dev is primary device, and the following is
+ secondary device.
+
+ For example:
+ To enable one device, such as DVI only, we can use:
+ modprobe viafb viafb_active_dev=DVI
+ To enable two devices, such as CRT+DVI:
+ modprobe viafb viafb_active_dev=CRT+DVI;
+
+ For DuoView case, we can use:
+ modprobe viafb viafb_active_dev=CRT+DVI
+ OR
+ modprobe viafb viafb_active_dev=DVI+CRT...
+
+ For SAMM case:
+ If CRT is primary and DVI is secondary, we should use:
+ modprobe viafb viafb_active_dev=CRT+DVI viafb_SAMM_ON=1...
+ If DVI is primary and CRT is secondary, we should use:
+ modprobe viafb viafb_active_dev=DVI+CRT viafb_SAMM_ON=1...
+
+ viafb_display_hardware_layout:
+ This option is used to specify display hardware layout for CX700 chip.
+ 1 : LCD only
+ 2 : DVI only
+ 3 : LCD+DVI (default)
+ 4 : LCD1+LCD2 (internal + internal)
+ 16: LCD1+ExternalLCD2 (internal + external)
+
+ viafb_second_size:
+ This option is used to set second device memory size(MB) in SAMM case.
+ The minimal size is 16.
+
+ viafb_platform_epia_dvi:
+ This option is used to enable DVI on EPIA - M
+ 0 : No DVI on EPIA - M (default)
+ 1 : DVI on EPIA - M
+
+ viafb_bus_width:
+ When using 24 - Bit Bus Width Digital Interface,
+ this option should be set.
+ 12: 12-Bit LVDS or 12-Bit TMDS (default)
+ 24: 24-Bit LVDS or 24-Bit TMDS
+
+ viafb_device_lcd_dualedge:
+ When using Dual Edge Panel, this option should be set.
+ 0 : No Dual Edge Panel (default)
+ 1 : Dual Edge Panel
+
+ viafb_video_dev:
+ This option is used to specify video output devices(CRT, DVI, LCD) for
+ duoview case.
+ For example:
+ To output video on DVI, we should use:
+ modprobe viafb viafb_video_dev=DVI...
+
+ viafb_lcd_port:
+ This option is used to specify LCD output port,
+ available values are "DVP0" "DVP1" "DFP_HIGHLOW" "DFP_HIGH" "DFP_LOW".
+ for external LCD + external DVI on CX700(External LCD is on DVP0),
+ we should use:
+ modprobe viafb viafb_lcd_port=DVP0...
+
+Notes:
+ 1. CRT may not display properly for DuoView CRT & DVI display at
+ the "640x480" PAL mode with DVI overscan enabled.
+ 2. SAMM stands for single adapter multi monitors. It is different from
+ multi-head since SAMM support multi monitor at driver layers, thus fbcon
+ layer doesn't even know about it; SAMM's second screen doesn't have a
+ device node file, thus a user mode application can't access it directly.
+ When SAMM is enabled, viafb_mode and viafb_mode1, viafb_bpp and
+ viafb_bpp1, viafb_refresh and viafb_refresh1 can be different.
+ 3. When console is depending on viafbinfo1, dynamically change resolution
+ and bpp, need to call VIAFB specified ioctl interface VIAFB_SET_DEVICE
+ instead of calling common ioctl function FBIOPUT_VSCREENINFO since
+ viafb doesn't support multi-head well, or it will cause screen crush.
+ 4. VX800 2D accelerator hasn't been supported in this driver yet. When
+ using driver on VX800, the driver will disable the acceleration
+ function as default.
+
+
+[Configure viafb with "fbset" tool]
+-----------------------------------
+ "fbset" is an inbox utility of Linux.
+ 1. Inquire current viafb information, type,
+ # fbset -i
+
+ 2. Set various resolutions and viafb_refresh rates,
+ # fbset <resolution-vertical_sync>
+
+ example,
+ # fbset "1024x768-75"
+ or
+ # fbset -g 1024 768 1024 768 32
+ Check the file "/etc/fb.modes" to find display modes available.
+
+ 3. Set the color depth,
+ # fbset -depth <value>
+
+ example,
+ # fbset -depth 16
+
+[Bootup with viafb]:
+--------------------
+ Add the following line to your grub.conf:
+ append = "video=viafb:viafb_mode=1024x768,viafb_bpp=32,viafb_refresh=85"
+
diff --git a/Documentation/fb/vt8623fb.txt b/Documentation/fb/vt8623fb.txt
new file mode 100644
index 0000000..f654576
--- /dev/null
+++ b/Documentation/fb/vt8623fb.txt
@@ -0,0 +1,64 @@
+
+ vt8623fb - fbdev driver for graphics core in VIA VT8623 chipset
+ ===============================================================
+
+
+Supported Hardware
+==================
+
+ VIA VT8623 [CLE266] chipset and its graphics core
+ (known as CastleRock or Unichrome)
+
+I tested vt8623fb on VIA EPIA ML-6000
+
+
+Supported Features
+==================
+
+ * 4 bpp pseudocolor modes (with 18bit palette, two variants)
+ * 8 bpp pseudocolor mode (with 18bit palette)
+ * 16 bpp truecolor mode (RGB 565)
+ * 32 bpp truecolor mode (RGB 888)
+ * text mode (activated by bpp = 0)
+ * doublescan mode variant (not available in text mode)
+ * panning in both directions
+ * suspend/resume support
+ * DPMS support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (maximum about 100 MHz). This limitation is not enforced by
+driver. Text mode supports 8bit wide fonts only (hardware limitation) and
+16bit tall fonts (driver limitation).
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+ * secondary (not initialized by BIOS) device support
+ * MMIO support
+ * interlaced mode variant
+ * support for fontwidths != 8 in 4 bpp modes
+ * support for fontheight != 16 in text mode
+ * hardware cursor
+ * video overlay support
+ * vsync synchronization
+ * acceleration support (8514-like 2D, busmaster transfers)
+
+
+Known bugs
+==========
+
+ * cursor disable in text mode doesn't work
+
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
new file mode 100644
index 0000000..c28a2ac
--- /dev/null
+++ b/Documentation/feature-removal-schedule.txt
@@ -0,0 +1,345 @@
+The following is a list of files and features that are going to be
+removed in the kernel source tree. Every entry should contain what
+exactly is going away, why it is happening, and who is going to be doing
+the work. When the feature is removed from the kernel, it should also
+be removed from this file.
+
+---------------------------
+
+What: old static regulatory information and ieee80211_regdom module parameter
+When: 2.6.29
+Why: The old regulatory infrastructure has been replaced with a new one
+ which does not require statically defined regulatory domains. We do
+ not want to keep static regulatory domains in the kernel due to the
+ the dynamic nature of regulatory law and localization. We kept around
+ the old static definitions for the regulatory domains of:
+ * US
+ * JP
+ * EU
+ and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was
+ set. We also kept around the ieee80211_regdom module parameter in case
+ some applications were relying on it. Changing regulatory domains
+ can now be done instead by using nl80211, as is done with iw.
+Who: Luis R. Rodriguez <lrodriguez@atheros.com>
+
+---------------------------
+
+What: dev->power.power_state
+When: July 2007
+Why: Broken design for runtime control over driver power states, confusing
+ driver-internal runtime power management with: mechanisms to support
+ system-wide sleep state transitions; event codes that distinguish
+ different phases of swsusp "sleep" transitions; and userspace policy
+ inputs. This framework was never widely used, and most attempts to
+ use it were broken. Drivers should instead be exposing domain-specific
+ interfaces either to kernel or to userspace.
+Who: Pavel Machek <pavel@suse.cz>
+
+---------------------------
+
+What: Video4Linux API 1 ioctls and video_decoder.h from Video devices.
+When: December 2008
+Files: include/linux/video_decoder.h include/linux/videodev.h
+Check: include/linux/video_decoder.h include/linux/videodev.h
+Why: V4L1 AP1 was replaced by V4L2 API during migration from 2.4 to 2.6
+ series. The old API have lots of drawbacks and don't provide enough
+ means to work with all video and audio standards. The newer API is
+ already available on the main drivers and should be used instead.
+ Newer drivers should use v4l_compat_translate_ioctl function to handle
+ old calls, replacing to newer ones.
+ Decoder iocts are using internally to allow video drivers to
+ communicate with video decoders. This should also be improved to allow
+ V4L2 calls being translated into compatible internal ioctls.
+ Compatibility ioctls will be provided, for a while, via
+ v4l1-compat module.
+Who: Mauro Carvalho Chehab <mchehab@infradead.org>
+
+---------------------------
+
+What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl])
+When: November 2005
+Files: drivers/pcmcia/: pcmcia_ioctl.c
+Why: With the 16-bit PCMCIA subsystem now behaving (almost) like a
+ normal hotpluggable bus, and with it using the default kernel
+ infrastructure (hotplug, driver core, sysfs) keeping the PCMCIA
+ control ioctl needed by cardmgr and cardctl from pcmcia-cs is
+ unnecessary, and makes further cleanups and integration of the
+ PCMCIA subsystem into the Linux kernel device driver model more
+ difficult. The features provided by cardmgr and cardctl are either
+ handled by the kernel itself now or are available in the new
+ pcmciautils package available at
+ http://kernel.org/pub/linux/utils/kernel/pcmcia/
+Who: Dominik Brodowski <linux@brodo.de>
+
+---------------------------
+
+What: sys_sysctl
+When: September 2010
+Option: CONFIG_SYSCTL_SYSCALL
+Why: The same information is available in a more convenient from
+ /proc/sys, and none of the sysctl variables appear to be
+ important performance wise.
+
+ Binary sysctls are a long standing source of subtle kernel
+ bugs and security issues.
+
+ When I looked several months ago all I could find after
+ searching several distributions were 5 user space programs and
+ glibc (which falls back to /proc/sys) using this syscall.
+
+ The man page for sysctl(2) documents it as unusable for user
+ space programs.
+
+ sysctl(2) is not generally ABI compatible to a 32bit user
+ space application on a 64bit and a 32bit kernel.
+
+ For the last several months the policy has been no new binary
+ sysctls and no one has put forward an argument to use them.
+
+ Binary sysctls issues seem to keep happening appearing so
+ properly deprecating them (with a warning to user space) and a
+ 2 year grace warning period will mean eventually we can kill
+ them and end the pain.
+
+ In the mean time individual binary sysctls can be dealt with
+ in a piecewise fashion.
+
+Who: Eric Biederman <ebiederm@xmission.com>
+
+---------------------------
+
+What: remove EXPORT_SYMBOL(kernel_thread)
+When: August 2006
+Files: arch/*/kernel/*_ksyms.c
+Check: kernel_thread
+Why: kernel_thread is a low-level implementation detail. Drivers should
+ use the <linux/kthread.h> API instead which shields them from
+ implementation details and provides a higherlevel interface that
+ prevents bugs and code duplication
+Who: Christoph Hellwig <hch@lst.de>
+
+---------------------------
+
+What: eepro100 network driver
+When: January 2007
+Why: replaced by the e100 driver
+Who: Adrian Bunk <bunk@stusta.de>
+
+---------------------------
+
+What: Unused EXPORT_SYMBOL/EXPORT_SYMBOL_GPL exports
+ (temporary transition config option provided until then)
+ The transition config option will also be removed at the same time.
+When: before 2.6.19
+Why: Unused symbols are both increasing the size of the kernel binary
+ and are often a sign of "wrong API"
+Who: Arjan van de Ven <arjan@linux.intel.com>
+
+---------------------------
+
+What: PHYSDEVPATH, PHYSDEVBUS, PHYSDEVDRIVER in the uevent environment
+When: October 2008
+Why: The stacking of class devices makes these values misleading and
+ inconsistent.
+ Class devices should not carry any of these properties, and bus
+ devices have SUBSYTEM and DRIVER as a replacement.
+Who: Kay Sievers <kay.sievers@suse.de>
+
+---------------------------
+
+What: ACPI procfs interface
+When: July 2008
+Why: ACPI sysfs conversion should be finished by January 2008.
+ ACPI procfs interface will be removed in July 2008 so that
+ there is enough time for the user space to catch up.
+Who: Zhang Rui <rui.zhang@intel.com>
+
+---------------------------
+
+What: /proc/acpi/button
+When: August 2007
+Why: /proc/acpi/button has been replaced by events to the input layer
+ since 2.6.20.
+Who: Len Brown <len.brown@intel.com>
+
+---------------------------
+
+What: /proc/acpi/event
+When: February 2008
+Why: /proc/acpi/event has been replaced by events via the input layer
+ and netlink since 2.6.23.
+Who: Len Brown <len.brown@intel.com>
+
+---------------------------
+
+What: libata spindown skipping and warning
+When: Dec 2008
+Why: Some halt(8) implementations synchronize caches for and spin
+ down libata disks because libata didn't use to spin down disk on
+ system halt (only synchronized caches).
+ Spin down on system halt is now implemented. sysfs node
+ /sys/class/scsi_disk/h:c:i:l/manage_start_stop is present if
+ spin down support is available.
+ Because issuing spin down command to an already spun down disk
+ makes some disks spin up just to spin down again, libata tracks
+ device spindown status to skip the extra spindown command and
+ warn about it.
+ This is to give userspace tools the time to get updated and will
+ be removed after userspace is reasonably updated.
+Who: Tejun Heo <htejun@gmail.com>
+
+---------------------------
+
+What: i386/x86_64 bzImage symlinks
+When: April 2010
+
+Why: The i386/x86_64 merge provides a symlink to the old bzImage
+ location so not yet updated user space tools, e.g. package
+ scripts, do not break.
+Who: Thomas Gleixner <tglx@linutronix.de>
+
+---------------------------
+
+What (Why):
+ - include/linux/netfilter_ipv4/ipt_TOS.h ipt_tos.h header files
+ (superseded by xt_TOS/xt_tos target & match)
+
+ - "forwarding" header files like ipt_mac.h in
+ include/linux/netfilter_ipv4/ and include/linux/netfilter_ipv6/
+
+ - xt_CONNMARK match revision 0
+ (superseded by xt_CONNMARK match revision 1)
+
+ - xt_MARK target revisions 0 and 1
+ (superseded by xt_MARK match revision 2)
+
+ - xt_connmark match revision 0
+ (superseded by xt_connmark match revision 1)
+
+ - xt_conntrack match revision 0
+ (superseded by xt_conntrack match revision 1)
+
+ - xt_iprange match revision 0,
+ include/linux/netfilter_ipv4/ipt_iprange.h
+ (superseded by xt_iprange match revision 1)
+
+ - xt_mark match revision 0
+ (superseded by xt_mark match revision 1)
+
+ - xt_recent: the old ipt_recent proc dir
+ (superseded by /proc/net/xt_recent)
+
+When: January 2009 or Linux 2.7.0, whichever comes first
+Why: Superseded by newer revisions or modules
+Who: Jan Engelhardt <jengelh@computergmbh.de>
+
+---------------------------
+
+What: b43 support for firmware revision < 410
+When: July 2008
+Why: The support code for the old firmware hurts code readability/maintainability
+ and slightly hurts runtime performance. Bugfixes for the old firmware
+ are not provided by Broadcom anymore.
+Who: Michael Buesch <mb@bu3sch.de>
+
+---------------------------
+
+What: init_mm export
+When: 2.6.26
+Why: Not used in-tree. The current out-of-tree users used it to
+ work around problems in the CPA code which should be resolved
+ by now. One usecase was described to provide verification code
+ of the CPA operation. That's a good idea in general, but such
+ code / infrastructure should be in the kernel and not in some
+ out-of-tree driver.
+Who: Thomas Gleixner <tglx@linutronix.de>
+
+----------------------------
+
+What: usedac i386 kernel parameter
+When: 2.6.27
+Why: replaced by allowdac and no dac combination
+Who: Glauber Costa <gcosta@redhat.com>
+
+---------------------------
+
+What: remove HID compat support
+When: 2.6.29
+Why: needed only as a temporary solution until distros fix themselves up
+Who: Jiri Slaby <jirislaby@gmail.com>
+
+---------------------------
+
+What: print_fn_descriptor_symbol()
+When: October 2009
+Why: The %pF vsprintf format provides the same functionality in a
+ simpler way. print_fn_descriptor_symbol() is deprecated but
+ still present to give out-of-tree modules time to change.
+Who: Bjorn Helgaas <bjorn.helgaas@hp.com>
+
+---------------------------
+
+What: /sys/o2cb symlink
+When: January 2010
+Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb
+ exists as a symlink for backwards compatibility for old versions of
+ ocfs2-tools. 2 years should be sufficient time to phase in new versions
+ which know to look in /sys/fs/o2cb.
+Who: ocfs2-devel@oss.oracle.com
+
+---------------------------
+
+What: SCTP_GET_PEER_ADDRS_NUM_OLD, SCTP_GET_PEER_ADDRS_OLD,
+ SCTP_GET_LOCAL_ADDRS_NUM_OLD, SCTP_GET_LOCAL_ADDRS_OLD
+When: June 2009
+Why: A newer version of the options have been introduced in 2005 that
+ removes the limitions of the old API. The sctp library has been
+ converted to use these new options at the same time. Any user
+ space app that directly uses the old options should convert to using
+ the new options.
+Who: Vlad Yasevich <vladislav.yasevich@hp.com>
+
+---------------------------
+
+What: CONFIG_THERMAL_HWMON
+When: January 2009
+Why: This option was introduced just to allow older lm-sensors userspace
+ to keep working over the upgrade to 2.6.26. At the scheduled time of
+ removal fixed lm-sensors (2.x or 3.x) should be readily available.
+Who: Rene Herman <rene.herman@gmail.com>
+
+---------------------------
+
+What: Code that is now under CONFIG_WIRELESS_EXT_SYSFS
+ (in net/core/net-sysfs.c)
+When: After the only user (hal) has seen a release with the patches
+ for enough time, probably some time in 2010.
+Why: Over 1K .text/.data size reduction, data is available in other
+ ways (ioctls)
+Who: Johannes Berg <johannes@sipsolutions.net>
+
+---------------------------
+
+What: CONFIG_NF_CT_ACCT
+When: 2.6.29
+Why: Accounting can now be enabled/disabled without kernel recompilation.
+ Currently used only to set a default value for a feature that is also
+ controlled by a kernel/module/sysfs/sysctl parameter.
+Who: Krzysztof Piotr Oledzki <ole@ans.pl>
+
+---------------------------
+
+What: ide-scsi (BLK_DEV_IDESCSI)
+When: 2.6.29
+Why: The 2.6 kernel supports direct writing to ide CD drives, which
+ eliminates the need for ide-scsi. The new method is more
+ efficient in every way.
+Who: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
+
+---------------------------
+
+What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client()
+When: 2.6.29 (ideally) or 2.6.30 (more likely)
+Why: Deprecated by the new (standard) device driver binding model. Use
+ i2c_driver->probe() and ->remove() instead.
+Who: Jean Delvare <khali@linux-fr.org>
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
new file mode 100644
index 0000000..bc6b437
--- /dev/null
+++ b/Documentation/filesystems/00-INDEX
@@ -0,0 +1,118 @@
+00-INDEX
+ - this file (info on some of the filesystems supported by linux).
+Exporting
+ - explanation of how to make filesystems exportable.
+Locking
+ - info on locking rules as they pertain to Linux VFS.
+9p.txt
+ - 9p (v9fs) is an implementation of the Plan 9 remote fs protocol.
+adfs.txt
+ - info and mount options for the Acorn Advanced Disc Filing System.
+afs.txt
+ - info and examples for the distributed AFS (Andrew File System) fs.
+affs.txt
+ - info and mount options for the Amiga Fast File System.
+automount-support.txt
+ - information about filesystem automount support.
+befs.txt
+ - information about the BeOS filesystem for Linux.
+bfs.txt
+ - info for the SCO UnixWare Boot Filesystem (BFS).
+cifs.txt
+ - description of the CIFS filesystem.
+coda.txt
+ - description of the CODA filesystem.
+configfs/
+ - directory containing configfs documentation and example code.
+cramfs.txt
+ - info on the cram filesystem for small storage (ROMs etc).
+dentry-locking.txt
+ - info on the RCU-based dcache locking model.
+directory-locking
+ - info about the locking scheme used for directory operations.
+dlmfs.txt
+ - info on the userspace interface to the OCFS2 DLM.
+dnotify.txt
+ - info about directory notification in Linux.
+ecryptfs.txt
+ - docs on eCryptfs: stacked cryptographic filesystem for Linux.
+ext2.txt
+ - info, mount options and specifications for the Ext2 filesystem.
+ext3.txt
+ - info, mount options and specifications for the Ext3 filesystem.
+ext4.txt
+ - info, mount options and specifications for the Ext4 filesystem.
+files.txt
+ - info on file management in the Linux kernel.
+fuse.txt
+ - info on the Filesystem in User SpacE including mount options.
+gfs2.txt
+ - info on the Global File System 2.
+hfs.txt
+ - info on the Macintosh HFS Filesystem for Linux.
+hfsplus.txt
+ - info on the Macintosh HFSPlus Filesystem for Linux.
+hpfs.txt
+ - info and mount options for the OS/2 HPFS.
+inotify.txt
+ - info on the powerful yet simple file change notification system.
+isofs.txt
+ - info and mount options for the ISO 9660 (CDROM) filesystem.
+jfs.txt
+ - info and mount options for the JFS filesystem.
+locks.txt
+ - info on file locking implementations, flock() vs. fcntl(), etc.
+mandatory-locking.txt
+ - info on the Linux implementation of Sys V mandatory file locking.
+ncpfs.txt
+ - info on Novell Netware(tm) filesystem using NCP protocol.
+nfsroot.txt
+ - short guide on setting up a diskless box with NFS root filesystem.
+ntfs.txt
+ - info and mount options for the NTFS filesystem (Windows NT).
+ocfs2.txt
+ - info and mount options for the OCFS2 clustered filesystem.
+porting
+ - various information on filesystem porting.
+proc.txt
+ - info on Linux's /proc filesystem.
+ramfs-rootfs-initramfs.txt
+ - info on the 'in memory' filesystems ramfs, rootfs and initramfs.
+reiser4.txt
+ - info on the Reiser4 filesystem based on dancing tree algorithms.
+relay.txt
+ - info on relay, for efficient streaming from kernel to user space.
+romfs.txt
+ - description of the ROMFS filesystem.
+rpc-cache.txt
+ - introduction to the caching mechanisms in the sunrpc layer.
+seq_file.txt
+ - how to use the seq_file API
+sharedsubtree.txt
+ - a description of shared subtrees for namespaces.
+smbfs.txt
+ - info on using filesystems with the SMB protocol (Win 3.11 and NT).
+spufs.txt
+ - info and mount options for the SPU filesystem used on Cell.
+sysfs-pci.txt
+ - info on accessing PCI device resources through sysfs.
+sysfs.txt
+ - info on sysfs, a ram-based filesystem for exporting kernel objects.
+sysv-fs.txt
+ - info on the SystemV/V7/Xenix/Coherent filesystem.
+tmpfs.txt
+ - info on tmpfs, a filesystem that holds all files in virtual memory.
+udf.txt
+ - info and mount options for the UDF filesystem.
+ufs.txt
+ - info on the ufs filesystem.
+unionfs/
+ - info on the unionfs filesystem
+vfat.txt
+ - info on using the VFAT filesystem used in Windows NT and Windows 95
+vfs.txt
+ - overview of the Virtual File System
+xfs.txt
+ - info and mount options for the XFS filesystem.
+xip.txt
+ - info on execute-in-place for file mappings.
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
new file mode 100644
index 0000000..bf80806
--- /dev/null
+++ b/Documentation/filesystems/9p.txt
@@ -0,0 +1,144 @@
+ v9fs: Plan 9 Resource Sharing for Linux
+ =======================================
+
+ABOUT
+=====
+
+v9fs is a Unix implementation of the Plan 9 9p remote filesystem protocol.
+
+This software was originally developed by Ron Minnich <rminnich@sandia.gov>
+and Maya Gokhale. Additional development by Greg Watson
+<gwatson@lanl.gov> and most recently Eric Van Hensbergen
+<ericvh@gmail.com>, Latchesar Ionkov <lucho@ionkov.net> and Russ Cox
+<rsc@swtch.com>.
+
+The best detailed explanation of the Linux implementation and applications of
+the 9p client is available in the form of a USENIX paper:
+ http://www.usenix.org/events/usenix05/tech/freenix/hensbergen.html
+
+Other applications are described in the following papers:
+ * XCPU & Clustering
+ http://www.xcpu.org/xcpu-talk.pdf
+ * KVMFS: control file system for KVM
+ http://www.xcpu.org/kvmfs.pdf
+ * CellFS: A New ProgrammingModel for the Cell BE
+ http://www.xcpu.org/cellfs-talk.pdf
+ * PROSE I/O: Using 9p to enable Application Partitions
+ http://plan9.escet.urjc.es/iwp9/cready/PROSE_iwp9_2006.pdf
+
+USAGE
+=====
+
+For remote file server:
+
+ mount -t 9p 10.10.1.2 /mnt/9
+
+For Plan 9 From User Space applications (http://swtch.com/plan9)
+
+ mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER
+
+OPTIONS
+=======
+
+ trans=name select an alternative transport. Valid options are
+ currently:
+ unix - specifying a named pipe mount point
+ tcp - specifying a normal TCP/IP connection
+ fd - used passed file descriptors for connection
+ (see rfdno and wfdno)
+ virtio - connect to the next virtio channel available
+ (from lguest or KVM with trans_virtio module)
+
+ uname=name user name to attempt mount as on the remote server. The
+ server may override or ignore this value. Certain user
+ names may require authentication.
+
+ aname=name aname specifies the file tree to access when the server is
+ offering several exported file systems.
+
+ cache=mode specifies a caching policy. By default, no caches are used.
+ loose = no attempts are made at consistency,
+ intended for exclusive, read-only mounts
+
+ debug=n specifies debug level. The debug level is a bitmask.
+ 0x01 = display verbose error messages
+ 0x02 = developer debug (DEBUG_CURRENT)
+ 0x04 = display 9p trace
+ 0x08 = display VFS trace
+ 0x10 = display Marshalling debug
+ 0x20 = display RPC debug
+ 0x40 = display transport debug
+ 0x80 = display allocation debug
+
+ rfdno=n the file descriptor for reading with trans=fd
+
+ wfdno=n the file descriptor for writing with trans=fd
+
+ maxdata=n the number of bytes to use for 9p packet payload (msize)
+
+ port=n port to connect to on the remote server
+
+ noextend force legacy mode (no 9p2000.u semantics)
+
+ dfltuid attempt to mount as a particular uid
+
+ dfltgid attempt to mount with a particular gid
+
+ afid security channel - used by Plan 9 authentication protocols
+
+ nodevmap do not map special files - represent them as normal files.
+ This can be used to share devices/named pipes/sockets between
+ hosts. This functionality will be expanded in later versions.
+
+ access there are three access modes.
+ user = if a user tries to access a file on v9fs
+ filesystem for the first time, v9fs sends an
+ attach command (Tattach) for that user.
+ This is the default mode.
+ <uid> = allows only user with uid=<uid> to access
+ the files on the mounted filesystem
+ any = v9fs does single attach and performs all
+ operations as one user
+
+RESOURCES
+=========
+
+Our current recommendation is to use Inferno (http://www.vitanuova.com/inferno)
+as the 9p server. You can start a 9p server under Inferno by issuing the
+following command:
+ ; styxlisten -A tcp!*!564 export '#U*'
+
+The -A specifies an unauthenticated export. The 564 is the port # (you may
+have to choose a higher port number if running as a normal user). The '#U*'
+specifies exporting the root of the Linux name space. You may specify a
+subset of the namespace by extending the path: '#U*'/tmp would just export
+/tmp. For more information, see the Inferno manual pages covering styxlisten
+and export.
+
+A Linux version of the 9p server is now maintained under the npfs project
+on sourceforge (http://sourceforge.net/projects/npfs). The currently
+maintained version is the single-threaded version of the server (named spfs)
+available from the same CVS repository.
+
+There are user and developer mailing lists available through the v9fs project
+on sourceforge (http://sourceforge.net/projects/v9fs).
+
+News and other information is maintained on SWiK (http://swik.net/v9fs).
+
+Bug reports may be issued through the kernel.org bugzilla
+(http://bugzilla.kernel.org)
+
+For more information on the Plan 9 Operating System check out
+http://plan9.bell-labs.com/plan9
+
+For information on Plan 9 from User Space (Plan 9 applications and libraries
+ported to Linux/BSD/OSX/etc) check out http://swtch.com/plan9
+
+
+STATUS
+======
+
+The 2.6 kernel support is working on PPC and x86.
+
+PLEASE USE THE KERNEL BUGZILLA TO REPORT PROBLEMS. (http://bugzilla.kernel.org)
+
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting
new file mode 100644
index 0000000..87019d2
--- /dev/null
+++ b/Documentation/filesystems/Exporting
@@ -0,0 +1,147 @@
+
+Making Filesystems Exportable
+=============================
+
+Overview
+--------
+
+All filesystem operations require a dentry (or two) as a starting
+point. Local applications have a reference-counted hold on suitable
+dentries via open file descriptors or cwd/root. However remote
+applications that access a filesystem via a remote filesystem protocol
+such as NFS may not be able to hold such a reference, and so need a
+different way to refer to a particular dentry. As the alternative
+form of reference needs to be stable across renames, truncates, and
+server-reboot (among other things, though these tend to be the most
+problematic), there is no simple answer like 'filename'.
+
+The mechanism discussed here allows each filesystem implementation to
+specify how to generate an opaque (outside of the filesystem) byte
+string for any dentry, and how to find an appropriate dentry for any
+given opaque byte string.
+This byte string will be called a "filehandle fragment" as it
+corresponds to part of an NFS filehandle.
+
+A filesystem which supports the mapping between filehandle fragments
+and dentries will be termed "exportable".
+
+
+
+Dcache Issues
+-------------
+
+The dcache normally contains a proper prefix of any given filesystem
+tree. This means that if any filesystem object is in the dcache, then
+all of the ancestors of that filesystem object are also in the dcache.
+As normal access is by filename this prefix is created naturally and
+maintained easily (by each object maintaining a reference count on
+its parent).
+
+However when objects are included into the dcache by interpreting a
+filehandle fragment, there is no automatic creation of a path prefix
+for the object. This leads to two related but distinct features of
+the dcache that are not needed for normal filesystem access.
+
+1/ The dcache must sometimes contain objects that are not part of the
+ proper prefix. i.e that are not connected to the root.
+2/ The dcache must be prepared for a newly found (via ->lookup) directory
+ to already have a (non-connected) dentry, and must be able to move
+ that dentry into place (based on the parent and name in the
+ ->lookup). This is particularly needed for directories as
+ it is a dcache invariant that directories only have one dentry.
+
+To implement these features, the dcache has:
+
+a/ A dentry flag DCACHE_DISCONNECTED which is set on
+ any dentry that might not be part of the proper prefix.
+ This is set when anonymous dentries are created, and cleared when a
+ dentry is noticed to be a child of a dentry which is in the proper
+ prefix.
+
+b/ A per-superblock list "s_anon" of dentries which are the roots of
+ subtrees that are not in the proper prefix. These dentries, as
+ well as the proper prefix, need to be released at unmount time. As
+ these dentries will not be hashed, they are linked together on the
+ d_hash list_head.
+
+c/ Helper routines to allocate anonymous dentries, and to help attach
+ loose directory dentries at lookup time. They are:
+ d_alloc_anon(inode) will return a dentry for the given inode.
+ If the inode already has a dentry, one of those is returned.
+ If it doesn't, a new anonymous (IS_ROOT and
+ DCACHE_DISCONNECTED) dentry is allocated and attached.
+ In the case of a directory, care is taken that only one dentry
+ can ever be attached.
+ d_splice_alias(inode, dentry) will make sure that there is a
+ dentry with the same name and parent as the given dentry, and
+ which refers to the given inode.
+ If the inode is a directory and already has a dentry, then that
+ dentry is d_moved over the given dentry.
+ If the passed dentry gets attached, care is taken that this is
+ mutually exclusive to a d_alloc_anon operation.
+ If the passed dentry is used, NULL is returned, else the used
+ dentry is returned. This corresponds to the calling pattern of
+ ->lookup.
+
+
+Filesystem Issues
+-----------------
+
+For a filesystem to be exportable it must:
+
+ 1/ provide the filehandle fragment routines described below.
+ 2/ make sure that d_splice_alias is used rather than d_add
+ when ->lookup finds an inode for a given parent and name.
+ Typically the ->lookup routine will end with a:
+
+ return d_splice_alias(inode, dentry);
+ }
+
+
+
+ A file system implementation declares that instances of the filesystem
+are exportable by setting the s_export_op field in the struct
+super_block. This field must point to a "struct export_operations"
+struct which has the following members:
+
+ encode_fh (optional)
+ Takes a dentry and creates a filehandle fragment which can later be used
+ to find or create a dentry for the same object. The default
+ implementation creates a filehandle fragment that encodes a 32bit inode
+ and generation number for the inode encoded, and if necessary the
+ same information for the parent.
+
+ fh_to_dentry (mandatory)
+ Given a filehandle fragment, this should find the implied object and
+ create a dentry for it (possibly with d_alloc_anon).
+
+ fh_to_parent (optional but strongly recommended)
+ Given a filehandle fragment, this should find the parent of the
+ implied object and create a dentry for it (possibly with d_alloc_anon).
+ May fail if the filehandle fragment is too small.
+
+ get_parent (optional but strongly recommended)
+ When given a dentry for a directory, this should return a dentry for
+ the parent. Quite possibly the parent dentry will have been allocated
+ by d_alloc_anon. The default get_parent function just returns an error
+ so any filehandle lookup that requires finding a parent will fail.
+ ->lookup("..") is *not* used as a default as it can leave ".." entries
+ in the dcache which are too messy to work with.
+
+ get_name (optional)
+ When given a parent dentry and a child dentry, this should find a name
+ in the directory identified by the parent dentry, which leads to the
+ object identified by the child dentry. If no get_name function is
+ supplied, a default implementation is provided which uses vfs_readdir
+ to find potential names, and matches inode numbers to find the correct
+ match.
+
+
+A filehandle fragment consists of an array of 1 or more 4byte words,
+together with a one byte "type".
+The decode_fh routine should not depend on the stated size that is
+passed to it. This size may be larger than the original filehandle
+generated by encode_fh, in which case it will have been padded with
+nuls. Rather, the encode_fh routine should choose a "type" which
+indicates the decode_fh how much of the filehandle is valid, and how
+it should be interpreted.
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
new file mode 100644
index 0000000..23d2f44
--- /dev/null
+++ b/Documentation/filesystems/Locking
@@ -0,0 +1,537 @@
+ The text below describes the locking rules for VFS-related methods.
+It is (believed to be) up-to-date. *Please*, if you change anything in
+prototypes or locking protocols - update this file. And update the relevant
+instances in the tree, don't leave that to maintainers of filesystems/devices/
+etc. At the very least, put the list of dubious cases in the end of this file.
+Don't turn it into log - maintainers of out-of-the-tree code are supposed to
+be able to use diff(1).
+ Thing currently missing here: socket operations. Alexey?
+
+--------------------------- dentry_operations --------------------------
+prototypes:
+ int (*d_revalidate)(struct dentry *, int);
+ int (*d_hash) (struct dentry *, struct qstr *);
+ int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
+ int (*d_delete)(struct dentry *);
+ void (*d_release)(struct dentry *);
+ void (*d_iput)(struct dentry *, struct inode *);
+ char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
+
+locking rules:
+ none have BKL
+ dcache_lock rename_lock ->d_lock may block
+d_revalidate: no no no yes
+d_hash no no no yes
+d_compare: no yes no no
+d_delete: yes no yes no
+d_release: no no no yes
+d_iput: no no no yes
+d_dname: no no no no
+
+--------------------------- inode_operations ---------------------------
+prototypes:
+ int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
+ struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameid
+ata *);
+ int (*link) (struct dentry *,struct inode *,struct dentry *);
+ int (*unlink) (struct inode *,struct dentry *);
+ int (*symlink) (struct inode *,struct dentry *,const char *);
+ int (*mkdir) (struct inode *,struct dentry *,int);
+ int (*rmdir) (struct inode *,struct dentry *);
+ int (*mknod) (struct inode *,struct dentry *,int,dev_t);
+ int (*rename) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *);
+ int (*readlink) (struct dentry *, char __user *,int);
+ int (*follow_link) (struct dentry *, struct nameidata *);
+ void (*truncate) (struct inode *);
+ int (*permission) (struct inode *, int, struct nameidata *);
+ int (*setattr) (struct dentry *, struct iattr *);
+ int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
+ int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+ int (*removexattr) (struct dentry *, const char *);
+
+locking rules:
+ all may block, none have BKL
+ i_mutex(inode)
+lookup: yes
+create: yes
+link: yes (both)
+mknod: yes
+symlink: yes
+mkdir: yes
+unlink: yes (both)
+rmdir: yes (both) (see below)
+rename: yes (all) (see below)
+readlink: no
+follow_link: no
+truncate: yes (see below)
+setattr: yes
+permission: no
+getattr: no
+setxattr: yes
+getxattr: no
+listxattr: no
+removexattr: yes
+ Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
+victim.
+ cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
+ ->truncate() is never called directly - it's a callback, not a
+method. It's called by vmtruncate() - library function normally used by
+->setattr(). Locking information above applies to that call (i.e. is
+inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
+passed).
+
+See Documentation/filesystems/directory-locking for more detailed discussion
+of the locking scheme for directory operations.
+
+--------------------------- super_operations ---------------------------
+prototypes:
+ struct inode *(*alloc_inode)(struct super_block *sb);
+ void (*destroy_inode)(struct inode *);
+ void (*dirty_inode) (struct inode *);
+ int (*write_inode) (struct inode *, int);
+ void (*drop_inode) (struct inode *);
+ void (*delete_inode) (struct inode *);
+ void (*put_super) (struct super_block *);
+ void (*write_super) (struct super_block *);
+ int (*sync_fs)(struct super_block *sb, int wait);
+ void (*write_super_lockfs) (struct super_block *);
+ void (*unlockfs) (struct super_block *);
+ int (*statfs) (struct dentry *, struct kstatfs *);
+ int (*remount_fs) (struct super_block *, int *, char *);
+ void (*clear_inode) (struct inode *);
+ void (*umount_begin) (struct super_block *);
+ int (*show_options)(struct seq_file *, struct vfsmount *);
+ ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+ ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+
+locking rules:
+ All may block.
+ BKL s_lock s_umount
+alloc_inode: no no no
+destroy_inode: no
+dirty_inode: no (must not sleep)
+write_inode: no
+drop_inode: no !!!inode_lock!!!
+delete_inode: no
+put_super: yes yes no
+write_super: no yes read
+sync_fs: no no read
+write_super_lockfs: ?
+unlockfs: ?
+statfs: no no no
+remount_fs: yes yes maybe (see below)
+clear_inode: no
+umount_begin: yes no no
+show_options: no (vfsmount->sem)
+quota_read: no no no (see below)
+quota_write: no no no (see below)
+
+->remount_fs() will have the s_umount lock if it's already mounted.
+When called from get_sb_single, it does NOT have the s_umount lock.
+->quota_read() and ->quota_write() functions are both guaranteed to
+be the only ones operating on the quota file by the quota code (via
+dqio_sem) (unless an admin really wants to screw up something and
+writes to quota files with quotas on). For other details about locking
+see also dquot_operations section.
+
+--------------------------- file_system_type ---------------------------
+prototypes:
+ int (*get_sb) (struct file_system_type *, int,
+ const char *, void *, struct vfsmount *);
+ void (*kill_sb) (struct super_block *);
+locking rules:
+ may block BKL
+get_sb yes no
+kill_sb yes no
+
+->get_sb() returns error or 0 with locked superblock attached to the vfsmount
+(exclusive on ->s_umount).
+->kill_sb() takes a write-locked superblock, does all shutdown work on it,
+unlocks and drops the reference.
+
+--------------------------- address_space_operations --------------------------
+prototypes:
+ int (*writepage)(struct page *page, struct writeback_control *wbc);
+ int (*readpage)(struct file *, struct page *);
+ int (*sync_page)(struct page *);
+ int (*writepages)(struct address_space *, struct writeback_control *);
+ int (*set_page_dirty)(struct page *page);
+ int (*readpages)(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages);
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
+ sector_t (*bmap)(struct address_space *, sector_t);
+ int (*invalidatepage) (struct page *, unsigned long);
+ int (*releasepage) (struct page *, int);
+ int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
+ loff_t offset, unsigned long nr_segs);
+ int (*launder_page) (struct page *);
+
+locking rules:
+ All except set_page_dirty may block
+
+ BKL PageLocked(page) i_sem
+writepage: no yes, unlocks (see below)
+readpage: no yes, unlocks
+sync_page: no maybe
+writepages: no
+set_page_dirty no no
+readpages: no
+write_begin: no locks the page yes
+write_end: no yes, unlocks yes
+perform_write: no n/a yes
+bmap: yes
+invalidatepage: no yes
+releasepage: no yes
+direct_IO: no
+launder_page: no yes
+
+ ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
+may be called from the request handler (/dev/loop).
+
+ ->readpage() unlocks the page, either synchronously or via I/O
+completion.
+
+ ->readpages() populates the pagecache with the passed pages and starts
+I/O against them. They come unlocked upon I/O completion.
+
+ ->writepage() is used for two purposes: for "memory cleansing" and for
+"sync". These are quite different operations and the behaviour may differ
+depending upon the mode.
+
+If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then
+it *must* start I/O against the page, even if that would involve
+blocking on in-progress I/O.
+
+If writepage is called for memory cleansing (sync_mode ==
+WBC_SYNC_NONE) then its role is to get as much writeout underway as
+possible. So writepage should try to avoid blocking against
+currently-in-progress I/O.
+
+If the filesystem is not called for "sync" and it determines that it
+would need to block against in-progress I/O to be able to start new I/O
+against the page the filesystem should redirty the page with
+redirty_page_for_writepage(), then unlock the page and return zero.
+This may also be done to avoid internal deadlocks, but rarely.
+
+If the filesystem is called for sync then it must wait on any
+in-progress I/O and then start new I/O.
+
+The filesystem should unlock the page synchronously, before returning to the
+caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE
+value. WRITEPAGE_ACTIVATE means that page cannot really be written out
+currently, and VM should stop calling ->writepage() on this page for some
+time. VM does this by moving page to the head of the active list, hence the
+name.
+
+Unless the filesystem is going to redirty_page_for_writepage(), unlock the page
+and return zero, writepage *must* run set_page_writeback() against the page,
+followed by unlocking it. Once set_page_writeback() has been run against the
+page, write I/O can be submitted and the write I/O completion handler must run
+end_page_writeback() once the I/O is complete. If no I/O is submitted, the
+filesystem must run end_page_writeback() against the page before returning from
+writepage.
+
+That is: after 2.5.12, pages which are under writeout are *not* locked. Note,
+if the filesystem needs the page to be locked during writeout, that is ok, too,
+the page is allowed to be unlocked at any point in time between the calls to
+set_page_writeback() and end_page_writeback().
+
+Note, failure to run either redirty_page_for_writepage() or the combination of
+set_page_writeback()/end_page_writeback() on a page submitted to writepage
+will leave the page itself marked clean but it will be tagged as dirty in the
+radix tree. This incoherency can lead to all sorts of hard-to-debug problems
+in the filesystem like having dirty inodes at umount and losing written data.
+
+ ->sync_page() locking rules are not well-defined - usually it is called
+with lock on page, but that is not guaranteed. Considering the currently
+existing instances of this method ->sync_page() itself doesn't look
+well-defined...
+
+ ->writepages() is used for periodic writeback and for syscall-initiated
+sync operations. The address_space should start I/O against at least
+*nr_to_write pages. *nr_to_write must be decremented for each page which is
+written. The address_space implementation may write more (or less) pages
+than *nr_to_write asks for, but it should try to be reasonably close. If
+nr_to_write is NULL, all dirty pages must be written.
+
+writepages should _only_ write pages which are present on
+mapping->io_pages.
+
+ ->set_page_dirty() is called from various places in the kernel
+when the target page is marked as needing writeback. It may be called
+under spinlock (it cannot block) and is sometimes called with the page
+not locked.
+
+ ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
+filesystems and by the swapper. The latter will eventually go away. All
+instances do not actually need the BKL. Please, keep it that way and don't
+breed new callers.
+
+ ->invalidatepage() is called when the filesystem must attempt to drop
+some or all of the buffers from the page when it is being truncated. It
+returns zero on success. If ->invalidatepage is zero, the kernel uses
+block_invalidatepage() instead.
+
+ ->releasepage() is called when the kernel is about to try to drop the
+buffers from the page in preparation for freeing it. It returns zero to
+indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
+the kernel assumes that the fs has no private interest in the buffers.
+
+ ->launder_page() may be called prior to releasing a page if
+it is still found to be dirty. It returns zero if the page was successfully
+cleaned, or an error value if not. Note that in order to prevent the page
+getting mapped back in and redirtied, it needs to be kept locked
+across the entire operation.
+
+ Note: currently almost all instances of address_space methods are
+using BKL for internal serialization and that's one of the worst sources
+of contention. Normally they are calling library functions (in fs/buffer.c)
+and pass foo_get_block() as a callback (on local block-based filesystems,
+indeed). BKL is not needed for library stuff and is usually taken by
+foo_get_block(). It's an overkill, since block bitmaps can be protected by
+internal fs locking and real critical areas are much smaller than the areas
+filesystems protect now.
+
+----------------------- file_lock_operations ------------------------------
+prototypes:
+ void (*fl_insert)(struct file_lock *); /* lock insertion callback */
+ void (*fl_remove)(struct file_lock *); /* lock removal callback */
+ void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
+ void (*fl_release_private)(struct file_lock *);
+
+
+locking rules:
+ BKL may block
+fl_insert: yes no
+fl_remove: yes no
+fl_copy_lock: yes no
+fl_release_private: yes yes
+
+----------------------- lock_manager_operations ---------------------------
+prototypes:
+ int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
+ void (*fl_notify)(struct file_lock *); /* unblock callback */
+ void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
+ void (*fl_release_private)(struct file_lock *);
+ void (*fl_break)(struct file_lock *); /* break_lease callback */
+
+locking rules:
+ BKL may block
+fl_compare_owner: yes no
+fl_notify: yes no
+fl_copy_lock: yes no
+fl_release_private: yes yes
+fl_break: yes no
+
+ Currently only NFSD and NLM provide instances of this class. None of the
+them block. If you have out-of-tree instances - please, show up. Locking
+in that area will change.
+--------------------------- buffer_head -----------------------------------
+prototypes:
+ void (*b_end_io)(struct buffer_head *bh, int uptodate);
+
+locking rules:
+ called from interrupts. In other words, extreme care is needed here.
+bh is locked, but that's all warranties we have here. Currently only RAID1,
+highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices
+call this method upon the IO completion.
+
+--------------------------- block_device_operations -----------------------
+prototypes:
+ int (*open) (struct inode *, struct file *);
+ int (*release) (struct inode *, struct file *);
+ int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+ int (*media_changed) (struct gendisk *);
+ int (*revalidate_disk) (struct gendisk *);
+
+locking rules:
+ BKL bd_sem
+open: yes yes
+release: yes yes
+ioctl: yes no
+media_changed: no no
+revalidate_disk: no no
+
+The last two are called only from check_disk_change().
+
+--------------------------- file_operations -------------------------------
+prototypes:
+ loff_t (*llseek) (struct file *, loff_t, int);
+ ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
+ ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
+ ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ int (*readdir) (struct file *, void *, filldir_t);
+ unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ int (*ioctl) (struct inode *, struct file *, unsigned int,
+ unsigned long);
+ long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+ long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+ int (*mmap) (struct file *, struct vm_area_struct *);
+ int (*open) (struct inode *, struct file *);
+ int (*flush) (struct file *);
+ int (*release) (struct inode *, struct file *);
+ int (*fsync) (struct file *, struct dentry *, int datasync);
+ int (*aio_fsync) (struct kiocb *, int datasync);
+ int (*fasync) (int, struct file *, int);
+ int (*lock) (struct file *, int, struct file_lock *);
+ ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
+ loff_t *);
+ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
+ loff_t *);
+ ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
+ void __user *);
+ ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
+ loff_t *, int);
+ unsigned long (*get_unmapped_area)(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+ int (*check_flags)(int);
+ int (*dir_notify)(struct file *, unsigned long);
+};
+
+locking rules:
+ All except ->poll() may block.
+ BKL
+llseek: no (see below)
+read: no
+aio_read: no
+write: no
+aio_write: no
+readdir: no
+poll: no
+ioctl: yes (see below)
+unlocked_ioctl: no (see below)
+compat_ioctl: no
+mmap: no
+open: no
+flush: no
+release: no
+fsync: no (see below)
+aio_fsync: no
+fasync: no
+lock: yes
+readv: no
+writev: no
+sendfile: no
+sendpage: no
+get_unmapped_area: no
+check_flags: no
+dir_notify: no
+
+->llseek() locking has moved from llseek to the individual llseek
+implementations. If your fs is not using generic_file_llseek, you
+need to acquire and release the appropriate locks in your ->llseek().
+For many filesystems, it is probably safe to acquire the inode
+semaphore. Note some filesystems (i.e. remote ones) provide no
+protection for i_size so you will need to use the BKL.
+
+Note: ext2_release() was *the* source of contention on fs-intensive
+loads and dropping BKL on ->release() helps to get rid of that (we still
+grab BKL for cases when we close a file that had been opened r/w, but that
+can and should be done using the internal locking with smaller critical areas).
+Current worst offender is ext2_get_block()...
+
+->fasync() is a mess. This area needs a big cleanup and that will probably
+affect locking.
+
+->readdir() and ->ioctl() on directories must be changed. Ideally we would
+move ->readdir() to inode_operations and use a separate method for directory
+->ioctl() or kill the latter completely. One of the problems is that for
+anything that resembles union-mount we won't have a struct file for all
+components. And there are other reasons why the current interface is a mess...
+
+->ioctl() on regular files is superceded by the ->unlocked_ioctl() that
+doesn't take the BKL.
+
+->read on directories probably must go away - we should just enforce -EISDIR
+in sys_read() and friends.
+
+->fsync() has i_mutex on inode.
+
+--------------------------- dquot_operations -------------------------------
+prototypes:
+ int (*initialize) (struct inode *, int);
+ int (*drop) (struct inode *);
+ int (*alloc_space) (struct inode *, qsize_t, int);
+ int (*alloc_inode) (const struct inode *, unsigned long);
+ int (*free_space) (struct inode *, qsize_t);
+ int (*free_inode) (const struct inode *, unsigned long);
+ int (*transfer) (struct inode *, struct iattr *);
+ int (*write_dquot) (struct dquot *);
+ int (*acquire_dquot) (struct dquot *);
+ int (*release_dquot) (struct dquot *);
+ int (*mark_dirty) (struct dquot *);
+ int (*write_info) (struct super_block *, int);
+
+These operations are intended to be more or less wrapping functions that ensure
+a proper locking wrt the filesystem and call the generic quota operations.
+
+What filesystem should expect from the generic quota functions:
+
+ FS recursion Held locks when called
+initialize: yes maybe dqonoff_sem
+drop: yes -
+alloc_space: ->mark_dirty() -
+alloc_inode: ->mark_dirty() -
+free_space: ->mark_dirty() -
+free_inode: ->mark_dirty() -
+transfer: yes -
+write_dquot: yes dqonoff_sem or dqptr_sem
+acquire_dquot: yes dqonoff_sem or dqptr_sem
+release_dquot: yes dqonoff_sem or dqptr_sem
+mark_dirty: no -
+write_info: yes dqonoff_sem
+
+FS recursion means calling ->quota_read() and ->quota_write() from superblock
+operations.
+
+->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
+only directly by the filesystem and do not call any fs functions only
+the ->mark_dirty() operation.
+
+More details about quota locking can be found in fs/dquot.c.
+
+--------------------------- vm_operations_struct -----------------------------
+prototypes:
+ void (*open)(struct vm_area_struct*);
+ void (*close)(struct vm_area_struct*);
+ int (*fault)(struct vm_area_struct*, struct vm_fault *);
+ int (*page_mkwrite)(struct vm_area_struct *, struct page *);
+ int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
+
+locking rules:
+ BKL mmap_sem PageLocked(page)
+open: no yes
+close: no yes
+fault: no yes
+page_mkwrite: no yes no
+access: no yes
+
+ ->page_mkwrite() is called when a previously read-only page is
+about to become writeable. The file system is responsible for
+protecting against truncate races. Once appropriate action has been
+taking to lock out truncate, the page range should be verified to be
+within i_size. The page mapping should also be checked that it is not
+NULL.
+
+ ->access() is called when get_user_pages() fails in
+acces_process_vm(), typically used to debug a process through
+/proc/pid/mem or ptrace. This function is needed only for
+VM_IO | VM_PFNMAP VMAs.
+
+================================================================================
+ Dubious stuff
+
+(if you break something or notice that it is broken and do not fix it yourself
+- at least put it here)
+
+ipc/shm.c::shm_delete() - may need BKL.
+->read() and ->write() in many drivers are (probably) missing BKL.
diff --git a/Documentation/filesystems/adfs.txt b/Documentation/filesystems/adfs.txt
new file mode 100644
index 0000000..9e8811f
--- /dev/null
+++ b/Documentation/filesystems/adfs.txt
@@ -0,0 +1,57 @@
+Mount options for ADFS
+----------------------
+
+ uid=nnn All files in the partition will be owned by
+ user id nnn. Default 0 (root).
+ gid=nnn All files in the partition will be in group
+ nnn. Default 0 (root).
+ ownmask=nnn The permission mask for ADFS 'owner' permissions
+ will be nnn. Default 0700.
+ othmask=nnn The permission mask for ADFS 'other' permissions
+ will be nnn. Default 0077.
+
+Mapping of ADFS permissions to Linux permissions
+------------------------------------------------
+
+ ADFS permissions consist of the following:
+
+ Owner read
+ Owner write
+ Other read
+ Other write
+
+ (In older versions, an 'execute' permission did exist, but this
+ does not hold the same meaning as the Linux 'execute' permission
+ and is now obsolete).
+
+ The mapping is performed as follows:
+
+ Owner read -> -r--r--r--
+ Owner write -> --w--w---w
+ Owner read and filetype UnixExec -> ---x--x--x
+ These are then masked by ownmask, eg 700 -> -rwx------
+ Possible owner mode permissions -> -rwx------
+
+ Other read -> -r--r--r--
+ Other write -> --w--w--w-
+ Other read and filetype UnixExec -> ---x--x--x
+ These are then masked by othmask, eg 077 -> ----rwxrwx
+ Possible other mode permissions -> ----rwxrwx
+
+ Hence, with the default masks, if a file is owner read/write, and
+ not a UnixExec filetype, then the permissions will be:
+
+ -rw-------
+
+ However, if the masks were ownmask=0770,othmask=0007, then this would
+ be modified to:
+ -rw-rw----
+
+ There is no restriction on what you can do with these masks. You may
+ wish that either read bits give read access to the file for all, but
+ keep the default write protection (ownmask=0755,othmask=0577):
+
+ -rw-r--r--
+
+ You can therefore tailor the permission translation to whatever you
+ desire the permissions should be under Linux.
diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt
new file mode 100644
index 0000000..2d15244
--- /dev/null
+++ b/Documentation/filesystems/affs.txt
@@ -0,0 +1,219 @@
+Overview of Amiga Filesystems
+=============================
+
+Not all varieties of the Amiga filesystems are supported for reading and
+writing. The Amiga currently knows six different filesystems:
+
+DOS\0 The old or original filesystem, not really suited for
+ hard disks and normally not used on them, either.
+ Supported read/write.
+
+DOS\1 The original Fast File System. Supported read/write.
+
+DOS\2 The old "international" filesystem. International means that
+ a bug has been fixed so that accented ("international") letters
+ in file names are case-insensitive, as they ought to be.
+ Supported read/write.
+
+DOS\3 The "international" Fast File System. Supported read/write.
+
+DOS\4 The original filesystem with directory cache. The directory
+ cache speeds up directory accesses on floppies considerably,
+ but slows down file creation/deletion. Doesn't make much
+ sense on hard disks. Supported read only.
+
+DOS\5 The Fast File System with directory cache. Supported read only.
+
+All of the above filesystems allow block sizes from 512 to 32K bytes.
+Supported block sizes are: 512, 1024, 2048 and 4096 bytes. Larger blocks
+speed up almost everything at the expense of wasted disk space. The speed
+gain above 4K seems not really worth the price, so you don't lose too
+much here, either.
+
+The muFS (multi user File System) equivalents of the above file systems
+are supported, too.
+
+Mount options for the AFFS
+==========================
+
+protect If this option is set, the protection bits cannot be altered.
+
+setuid[=uid] This sets the owner of all files and directories in the file
+ system to uid or the uid of the current user, respectively.
+
+setgid[=gid] Same as above, but for gid.
+
+mode=mode Sets the mode flags to the given (octal) value, regardless
+ of the original permissions. Directories will get an x
+ permission if the corresponding r bit is set.
+ This is useful since most of the plain AmigaOS files
+ will map to 600.
+
+reserved=num Sets the number of reserved blocks at the start of the
+ partition to num. You should never need this option.
+ Default is 2.
+
+root=block Sets the block number of the root block. This should never
+ be necessary.
+
+bs=blksize Sets the blocksize to blksize. Valid block sizes are 512,
+ 1024, 2048 and 4096. Like the root option, this should
+ never be necessary, as the affs can figure it out itself.
+
+quiet The file system will not return an error for disallowed
+ mode changes.
+
+verbose The volume name, file system type and block size will
+ be written to the syslog when the filesystem is mounted.
+
+mufs The filesystem is really a muFS, also it doesn't
+ identify itself as one. This option is necessary if
+ the filesystem wasn't formatted as muFS, but is used
+ as one.
+
+prefix=path Path will be prefixed to every absolute path name of
+ symbolic links on an AFFS partition. Default = "/".
+ (See below.)
+
+volume=name When symbolic links with an absolute path are created
+ on an AFFS partition, name will be prepended as the
+ volume name. Default = "" (empty string).
+ (See below.)
+
+Handling of the Users/Groups and protection flags
+=================================================
+
+Amiga -> Linux:
+
+The Amiga protection flags RWEDRWEDHSPARWED are handled as follows:
+
+ - R maps to r for user, group and others. On directories, R implies x.
+
+ - If both W and D are allowed, w will be set.
+
+ - E maps to x.
+
+ - H and P are always retained and ignored under Linux.
+
+ - A is always reset when a file is written to.
+
+User id and group id will be used unless set[gu]id are given as mount
+options. Since most of the Amiga file systems are single user systems
+they will be owned by root. The root directory (the mount point) of the
+Amiga filesystem will be owned by the user who actually mounts the
+filesystem (the root directory doesn't have uid/gid fields).
+
+Linux -> Amiga:
+
+The Linux rwxrwxrwx file mode is handled as follows:
+
+ - r permission will set R for user, group and others.
+
+ - w permission will set W and D for user, group and others.
+
+ - x permission of the user will set E for plain files.
+
+ - All other flags (suid, sgid, ...) are ignored and will
+ not be retained.
+
+Newly created files and directories will get the user and group ID
+of the current user and a mode according to the umask.
+
+Symbolic links
+==============
+
+Although the Amiga and Linux file systems resemble each other, there
+are some, not always subtle, differences. One of them becomes apparent
+with symbolic links. While Linux has a file system with exactly one
+root directory, the Amiga has a separate root directory for each
+file system (for example, partition, floppy disk, ...). With the Amiga,
+these entities are called "volumes". They have symbolic names which
+can be used to access them. Thus, symbolic links can point to a
+different volume. AFFS turns the volume name into a directory name
+and prepends the prefix path (see prefix option) to it.
+
+Example:
+You mount all your Amiga partitions under /amiga/<volume> (where
+<volume> is the name of the volume), and you give the option
+"prefix=/amiga/" when mounting all your AFFS partitions. (They
+might be "User", "WB" and "Graphics", the mount points /amiga/User,
+/amiga/WB and /amiga/Graphics). A symbolic link referring to
+"User:sc/include/dos/dos.h" will be followed to
+"/amiga/User/sc/include/dos/dos.h".
+
+Examples
+========
+
+Command line:
+ mount Archive/Amiga/Workbench3.1.adf /mnt -t affs -o loop,verbose
+ mount /dev/sda3 /Amiga -t affs
+
+/etc/fstab entry:
+ /dev/sdb5 /amiga/Workbench affs noauto,user,exec,verbose 0 0
+
+IMPORTANT NOTE
+==============
+
+If you boot Windows 95 (don't know about 3.x, 98 and NT) while you
+have an Amiga harddisk connected to your PC, it will overwrite
+the bytes 0x00dc..0x00df of block 0 with garbage, thus invalidating
+the Rigid Disk Block. Sheer luck has it that this is an unused
+area of the RDB, so only the checksum doesn't match anymore.
+Linux will ignore this garbage and recognize the RDB anyway, but
+before you connect that drive to your Amiga again, you must
+restore or repair your RDB. So please do make a backup copy of it
+before booting Windows!
+
+If the damage is already done, the following should fix the RDB
+(where <disk> is the device name).
+DO AT YOUR OWN RISK:
+
+ dd if=/dev/<disk> of=rdb.tmp count=1
+ cp rdb.tmp rdb.fixed
+ dd if=/dev/zero of=rdb.fixed bs=1 seek=220 count=4
+ dd if=rdb.fixed of=/dev/<disk>
+
+Bugs, Restrictions, Caveats
+===========================
+
+Quite a few things may not work as advertised. Not everything is
+tested, though several hundred MB have been read and written using
+this fs. For a most up-to-date list of bugs please consult
+fs/affs/Changes.
+
+Filenames are truncated to 30 characters without warning (this
+can be changed by setting the compile-time option AFFS_NO_TRUNCATE
+in include/linux/amigaffs.h).
+
+Case is ignored by the affs in filename matching, but Linux shells
+do care about the case. Example (with /wb being an affs mounted fs):
+ rm /wb/WRONGCASE
+will remove /mnt/wrongcase, but
+ rm /wb/WR*
+will not since the names are matched by the shell.
+
+The block allocation is designed for hard disk partitions. If more
+than 1 process writes to a (small) diskette, the blocks are allocated
+in an ugly way (but the real AFFS doesn't do much better). This
+is also true when space gets tight.
+
+You cannot execute programs on an OFS (Old File System), since the
+program files cannot be memory mapped due to the 488 byte blocks.
+For the same reason you cannot mount an image on such a filesystem
+via the loopback device.
+
+The bitmap valid flag in the root block may not be accurate when the
+system crashes while an affs partition is mounted. There's currently
+no way to fix a garbled filesystem without an Amiga (disk validator)
+or manually (who would do this?). Maybe later.
+
+If you mount affs partitions on system startup, you may want to tell
+fsck that the fs should not be checked (place a '0' in the sixth field
+of /etc/fstab).
+
+It's not possible to read floppy disks with a normal PC or workstation
+due to an incompatibility with the Amiga floppy controller.
+
+If you are interested in an Amiga Emulator for Linux, look at
+
+http://www.freiburg.linux.de/~uae/
diff --git a/Documentation/filesystems/afs.txt b/Documentation/filesystems/afs.txt
new file mode 100644
index 0000000..12ad6c7
--- /dev/null
+++ b/Documentation/filesystems/afs.txt
@@ -0,0 +1,249 @@
+ ====================
+ kAFS: AFS FILESYSTEM
+ ====================
+
+Contents:
+
+ - Overview.
+ - Usage.
+ - Mountpoints.
+ - Proc filesystem.
+ - The cell database.
+ - Security.
+ - Examples.
+
+
+========
+OVERVIEW
+========
+
+This filesystem provides a fairly simple secure AFS filesystem driver. It is
+under development and does not yet provide the full feature set. The features
+it does support include:
+
+ (*) Security (currently only AFS kaserver and KerberosIV tickets).
+
+ (*) File reading.
+
+ (*) Automounting.
+
+It does not yet support the following AFS features:
+
+ (*) Write support.
+
+ (*) Local caching.
+
+ (*) pioctl() system call.
+
+
+===========
+COMPILATION
+===========
+
+The filesystem should be enabled by turning on the kernel configuration
+options:
+
+ CONFIG_AF_RXRPC - The RxRPC protocol transport
+ CONFIG_RXKAD - The RxRPC Kerberos security handler
+ CONFIG_AFS - The AFS filesystem
+
+Additionally, the following can be turned on to aid debugging:
+
+ CONFIG_AF_RXRPC_DEBUG - Permit AF_RXRPC debugging to be enabled
+ CONFIG_AFS_DEBUG - Permit AFS debugging to be enabled
+
+They permit the debugging messages to be turned on dynamically by manipulating
+the masks in the following files:
+
+ /sys/module/af_rxrpc/parameters/debug
+ /sys/module/afs/parameters/debug
+
+
+=====
+USAGE
+=====
+
+When inserting the driver modules the root cell must be specified along with a
+list of volume location server IP addresses:
+
+ insmod af_rxrpc.o
+ insmod rxkad.o
+ insmod kafs.o rootcell=cambridge.redhat.com:172.16.18.73:172.16.18.91
+
+The first module is the AF_RXRPC network protocol driver. This provides the
+RxRPC remote operation protocol and may also be accessed from userspace. See:
+
+ Documentation/networking/rxrpc.txt
+
+The second module is the kerberos RxRPC security driver, and the third module
+is the actual filesystem driver for the AFS filesystem.
+
+Once the module has been loaded, more modules can be added by the following
+procedure:
+
+ echo add grand.central.org 18.7.14.88:128.2.191.224 >/proc/fs/afs/cells
+
+Where the parameters to the "add" command are the name of a cell and a list of
+volume location servers within that cell, with the latter separated by colons.
+
+Filesystems can be mounted anywhere by commands similar to the following:
+
+ mount -t afs "%cambridge.redhat.com:root.afs." /afs
+ mount -t afs "#cambridge.redhat.com:root.cell." /afs/cambridge
+ mount -t afs "#root.afs." /afs
+ mount -t afs "#root.cell." /afs/cambridge
+
+Where the initial character is either a hash or a percent symbol depending on
+whether you definitely want a R/W volume (hash) or whether you'd prefer a R/O
+volume, but are willing to use a R/W volume instead (percent).
+
+The name of the volume can be suffixes with ".backup" or ".readonly" to
+specify connection to only volumes of those types.
+
+The name of the cell is optional, and if not given during a mount, then the
+named volume will be looked up in the cell specified during insmod.
+
+Additional cells can be added through /proc (see later section).
+
+
+===========
+MOUNTPOINTS
+===========
+
+AFS has a concept of mountpoints. In AFS terms, these are specially formatted
+symbolic links (of the same form as the "device name" passed to mount). kAFS
+presents these to the user as directories that have a follow-link capability
+(ie: symbolic link semantics). If anyone attempts to access them, they will
+automatically cause the target volume to be mounted (if possible) on that site.
+
+Automatically mounted filesystems will be automatically unmounted approximately
+twenty minutes after they were last used. Alternatively they can be unmounted
+directly with the umount() system call.
+
+Manually unmounting an AFS volume will cause any idle submounts upon it to be
+culled first. If all are culled, then the requested volume will also be
+unmounted, otherwise error EBUSY will be returned.
+
+This can be used by the administrator to attempt to unmount the whole AFS tree
+mounted on /afs in one go by doing:
+
+ umount /afs
+
+
+===============
+PROC FILESYSTEM
+===============
+
+The AFS modules creates a "/proc/fs/afs/" directory and populates it:
+
+ (*) A "cells" file that lists cells currently known to the afs module and
+ their usage counts:
+
+ [root@andromeda ~]# cat /proc/fs/afs/cells
+ USE NAME
+ 3 cambridge.redhat.com
+
+ (*) A directory per cell that contains files that list volume location
+ servers, volumes, and active servers known within that cell.
+
+ [root@andromeda ~]# cat /proc/fs/afs/cambridge.redhat.com/servers
+ USE ADDR STATE
+ 4 172.16.18.91 0
+ [root@andromeda ~]# cat /proc/fs/afs/cambridge.redhat.com/vlservers
+ ADDRESS
+ 172.16.18.91
+ [root@andromeda ~]# cat /proc/fs/afs/cambridge.redhat.com/volumes
+ USE STT VLID[0] VLID[1] VLID[2] NAME
+ 1 Val 20000000 20000001 20000002 root.afs
+
+
+=================
+THE CELL DATABASE
+=================
+
+The filesystem maintains an internal database of all the cells it knows and the
+IP addresses of the volume location servers for those cells. The cell to which
+the system belongs is added to the database when insmod is performed by the
+"rootcell=" argument or, if compiled in, using a "kafs.rootcell=" argument on
+the kernel command line.
+
+Further cells can be added by commands similar to the following:
+
+ echo add CELLNAME VLADDR[:VLADDR][:VLADDR]... >/proc/fs/afs/cells
+ echo add grand.central.org 18.7.14.88:128.2.191.224 >/proc/fs/afs/cells
+
+No other cell database operations are available at this time.
+
+
+========
+SECURITY
+========
+
+Secure operations are initiated by acquiring a key using the klog program. A
+very primitive klog program is available at:
+
+ http://people.redhat.com/~dhowells/rxrpc/klog.c
+
+This should be compiled by:
+
+ make klog LDLIBS="-lcrypto -lcrypt -lkrb4 -lkeyutils"
+
+And then run as:
+
+ ./klog
+
+Assuming it's successful, this adds a key of type RxRPC, named for the service
+and cell, eg: "afs@<cellname>". This can be viewed with the keyctl program or
+by cat'ing /proc/keys:
+
+ [root@andromeda ~]# keyctl show
+ Session Keyring
+ -3 --alswrv 0 0 keyring: _ses.3268
+ 2 --alswrv 0 0 \_ keyring: _uid.0
+ 111416553 --als--v 0 0 \_ rxrpc: afs@CAMBRIDGE.REDHAT.COM
+
+Currently the username, realm, password and proposed ticket lifetime are
+compiled in to the program.
+
+It is not required to acquire a key before using AFS facilities, but if one is
+not acquired then all operations will be governed by the anonymous user parts
+of the ACLs.
+
+If a key is acquired, then all AFS operations, including mounts and automounts,
+made by a possessor of that key will be secured with that key.
+
+If a file is opened with a particular key and then the file descriptor is
+passed to a process that doesn't have that key (perhaps over an AF_UNIX
+socket), then the operations on the file will be made with key that was used to
+open the file.
+
+
+========
+EXAMPLES
+========
+
+Here's what I use to test this. Some of the names and IP addresses are local
+to my internal DNS. My "root.afs" partition has a mount point within it for
+some public volumes volumes.
+
+insmod /tmp/rxrpc.o
+insmod /tmp/rxkad.o
+insmod /tmp/kafs.o rootcell=cambridge.redhat.com:172.16.18.91
+
+mount -t afs \%root.afs. /afs
+mount -t afs \%cambridge.redhat.com:root.cell. /afs/cambridge.redhat.com/
+
+echo add grand.central.org 18.7.14.88:128.2.191.224 > /proc/fs/afs/cells
+mount -t afs "#grand.central.org:root.cell." /afs/grand.central.org/
+mount -t afs "#grand.central.org:root.archive." /afs/grand.central.org/archive
+mount -t afs "#grand.central.org:root.contrib." /afs/grand.central.org/contrib
+mount -t afs "#grand.central.org:root.doc." /afs/grand.central.org/doc
+mount -t afs "#grand.central.org:root.project." /afs/grand.central.org/project
+mount -t afs "#grand.central.org:root.service." /afs/grand.central.org/service
+mount -t afs "#grand.central.org:root.software." /afs/grand.central.org/software
+mount -t afs "#grand.central.org:root.user." /afs/grand.central.org/user
+
+umount /afs
+rmmod kafs
+rmmod rxkad
+rmmod rxrpc
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
new file mode 100644
index 0000000..c634174
--- /dev/null
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -0,0 +1,393 @@
+
+Miscellaneous Device control operations for the autofs4 kernel module
+====================================================================
+
+The problem
+===========
+
+There is a problem with active restarts in autofs (that is to say
+restarting autofs when there are busy mounts).
+
+During normal operation autofs uses a file descriptor opened on the
+directory that is being managed in order to be able to issue control
+operations. Using a file descriptor gives ioctl operations access to
+autofs specific information stored in the super block. The operations
+are things such as setting an autofs mount catatonic, setting the
+expire timeout and requesting expire checks. As is explained below,
+certain types of autofs triggered mounts can end up covering an autofs
+mount itself which prevents us being able to use open(2) to obtain a
+file descriptor for these operations if we don't already have one open.
+
+Currently autofs uses "umount -l" (lazy umount) to clear active mounts
+at restart. While using lazy umount works for most cases, anything that
+needs to walk back up the mount tree to construct a path, such as
+getcwd(2) and the proc file system /proc/<pid>/cwd, no longer works
+because the point from which the path is constructed has been detached
+from the mount tree.
+
+The actual problem with autofs is that it can't reconnect to existing
+mounts. Immediately one thinks of just adding the ability to remount
+autofs file systems would solve it, but alas, that can't work. This is
+because autofs direct mounts and the implementation of "on demand mount
+and expire" of nested mount trees have the file system mounted directly
+on top of the mount trigger directory dentry.
+
+For example, there are two types of automount maps, direct (in the kernel
+module source you will see a third type called an offset, which is just
+a direct mount in disguise) and indirect.
+
+Here is a master map with direct and indirect map entries:
+
+/- /etc/auto.direct
+/test /etc/auto.indirect
+
+and the corresponding map files:
+
+/etc/auto.direct:
+
+/automount/dparse/g6 budgie:/autofs/export1
+/automount/dparse/g1 shark:/autofs/export1
+and so on.
+
+/etc/auto.indirect:
+
+g1 shark:/autofs/export1
+g6 budgie:/autofs/export1
+and so on.
+
+For the above indirect map an autofs file system is mounted on /test and
+mounts are triggered for each sub-directory key by the inode lookup
+operation. So we see a mount of shark:/autofs/export1 on /test/g1, for
+example.
+
+The way that direct mounts are handled is by making an autofs mount on
+each full path, such as /automount/dparse/g1, and using it as a mount
+trigger. So when we walk on the path we mount shark:/autofs/export1 "on
+top of this mount point". Since these are always directories we can
+use the follow_link inode operation to trigger the mount.
+
+But, each entry in direct and indirect maps can have offsets (making
+them multi-mount map entries).
+
+For example, an indirect mount map entry could also be:
+
+g1 \
+ / shark:/autofs/export5/testing/test \
+ /s1 shark:/autofs/export/testing/test/s1 \
+ /s2 shark:/autofs/export5/testing/test/s2 \
+ /s1/ss1 shark:/autofs/export1 \
+ /s2/ss2 shark:/autofs/export2
+
+and a similarly a direct mount map entry could also be:
+
+/automount/dparse/g1 \
+ / shark:/autofs/export5/testing/test \
+ /s1 shark:/autofs/export/testing/test/s1 \
+ /s2 shark:/autofs/export5/testing/test/s2 \
+ /s1/ss1 shark:/autofs/export2 \
+ /s2/ss2 shark:/autofs/export2
+
+One of the issues with version 4 of autofs was that, when mounting an
+entry with a large number of offsets, possibly with nesting, we needed
+to mount and umount all of the offsets as a single unit. Not really a
+problem, except for people with a large number of offsets in map entries.
+This mechanism is used for the well known "hosts" map and we have seen
+cases (in 2.4) where the available number of mounts are exhausted or
+where the number of privileged ports available is exhausted.
+
+In version 5 we mount only as we go down the tree of offsets and
+similarly for expiring them which resolves the above problem. There is
+somewhat more detail to the implementation but it isn't needed for the
+sake of the problem explanation. The one important detail is that these
+offsets are implemented using the same mechanism as the direct mounts
+above and so the mount points can be covered by a mount.
+
+The current autofs implementation uses an ioctl file descriptor opened
+on the mount point for control operations. The references held by the
+descriptor are accounted for in checks made to determine if a mount is
+in use and is also used to access autofs file system information held
+in the mount super block. So the use of a file handle needs to be
+retained.
+
+
+The Solution
+============
+
+To be able to restart autofs leaving existing direct, indirect and
+offset mounts in place we need to be able to obtain a file handle
+for these potentially covered autofs mount points. Rather than just
+implement an isolated operation it was decided to re-implement the
+existing ioctl interface and add new operations to provide this
+functionality.
+
+In addition, to be able to reconstruct a mount tree that has busy mounts,
+the uid and gid of the last user that triggered the mount needs to be
+available because these can be used as macro substitution variables in
+autofs maps. They are recorded at mount request time and an operation
+has been added to retrieve them.
+
+Since we're re-implementing the control interface, a couple of other
+problems with the existing interface have been addressed. First, when
+a mount or expire operation completes a status is returned to the
+kernel by either a "send ready" or a "send fail" operation. The
+"send fail" operation of the ioctl interface could only ever send
+ENOENT so the re-implementation allows user space to send an actual
+status. Another expensive operation in user space, for those using
+very large maps, is discovering if a mount is present. Usually this
+involves scanning /proc/mounts and since it needs to be done quite
+often it can introduce significant overhead when there are many entries
+in the mount table. An operation to lookup the mount status of a mount
+point dentry (covered or not) has also been added.
+
+Current kernel development policy recommends avoiding the use of the
+ioctl mechanism in favor of systems such as Netlink. An implementation
+using this system was attempted to evaluate its suitability and it was
+found to be inadequate, in this case. The Generic Netlink system was
+used for this as raw Netlink would lead to a significant increase in
+complexity. There's no question that the Generic Netlink system is an
+elegant solution for common case ioctl functions but it's not a complete
+replacement probably because it's primary purpose in life is to be a
+message bus implementation rather than specifically an ioctl replacement.
+While it would be possible to work around this there is one concern
+that lead to the decision to not use it. This is that the autofs
+expire in the daemon has become far to complex because umount
+candidates are enumerated, almost for no other reason than to "count"
+the number of times to call the expire ioctl. This involves scanning
+the mount table which has proved to be a big overhead for users with
+large maps. The best way to improve this is try and get back to the
+way the expire was done long ago. That is, when an expire request is
+issued for a mount (file handle) we should continually call back to
+the daemon until we can't umount any more mounts, then return the
+appropriate status to the daemon. At the moment we just expire one
+mount at a time. A Generic Netlink implementation would exclude this
+possibility for future development due to the requirements of the
+message bus architecture.
+
+
+autofs4 Miscellaneous Device mount control interface
+====================================================
+
+The control interface is opening a device node, typically /dev/autofs.
+
+All the ioctls use a common structure to pass the needed parameter
+information and return operation results:
+
+struct autofs_dev_ioctl {
+ __u32 ver_major;
+ __u32 ver_minor;
+ __u32 size; /* total size of data passed in
+ * including this struct */
+ __s32 ioctlfd; /* automount command fd */
+
+ __u32 arg1; /* Command parameters */
+ __u32 arg2;
+
+ char path[0];
+};
+
+The ioctlfd field is a mount point file descriptor of an autofs mount
+point. It is returned by the open call and is used by all calls except
+the check for whether a given path is a mount point, where it may
+optionally be used to check a specific mount corresponding to a given
+mount point file descriptor, and when requesting the uid and gid of the
+last successful mount on a directory within the autofs file system.
+
+The fields arg1 and arg2 are used to communicate parameters and results of
+calls made as described below.
+
+The path field is used to pass a path where it is needed and the size field
+is used account for the increased structure length when translating the
+structure sent from user space.
+
+This structure can be initialized before setting specific fields by using
+the void function call init_autofs_dev_ioctl(struct autofs_dev_ioctl *).
+
+All of the ioctls perform a copy of this structure from user space to
+kernel space and return -EINVAL if the size parameter is smaller than
+the structure size itself, -ENOMEM if the kernel memory allocation fails
+or -EFAULT if the copy itself fails. Other checks include a version check
+of the compiled in user space version against the module version and a
+mismatch results in a -EINVAL return. If the size field is greater than
+the structure size then a path is assumed to be present and is checked to
+ensure it begins with a "/" and is NULL terminated, otherwise -EINVAL is
+returned. Following these checks, for all ioctl commands except
+AUTOFS_DEV_IOCTL_VERSION_CMD, AUTOFS_DEV_IOCTL_OPENMOUNT_CMD and
+AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD the ioctlfd is validated and if it is
+not a valid descriptor or doesn't correspond to an autofs mount point
+an error of -EBADF, -ENOTTY or -EINVAL (not an autofs descriptor) is
+returned.
+
+
+The ioctls
+==========
+
+An example of an implementation which uses this interface can be seen
+in autofs version 5.0.4 and later in file lib/dev-ioctl-lib.c of the
+distribution tar available for download from kernel.org in directory
+/pub/linux/daemons/autofs/v5.
+
+The device node ioctl operations implemented by this interface are:
+
+
+AUTOFS_DEV_IOCTL_VERSION
+------------------------
+
+Get the major and minor version of the autofs4 device ioctl kernel module
+implementation. It requires an initialized struct autofs_dev_ioctl as an
+input parameter and sets the version information in the passed in structure.
+It returns 0 on success or the error -EINVAL if a version mismatch is
+detected.
+
+
+AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD
+------------------------------------------------------------------
+
+Get the major and minor version of the autofs4 protocol version understood
+by loaded module. This call requires an initialized struct autofs_dev_ioctl
+with the ioctlfd field set to a valid autofs mount point descriptor
+and sets the requested version number in structure field arg1. These
+commands return 0 on success or one of the negative error codes if
+validation fails.
+
+
+AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
+----------------------------------------------------------
+
+Obtain and release a file descriptor for an autofs managed mount point
+path. The open call requires an initialized struct autofs_dev_ioctl with
+the the path field set and the size field adjusted appropriately as well
+as the arg1 field set to the device number of the autofs mount. The
+device number can be obtained from the mount options shown in
+/proc/mounts. The close call requires an initialized struct
+autofs_dev_ioct with the ioctlfd field set to the descriptor obtained
+from the open call. The release of the file descriptor can also be done
+with close(2) so any open descriptors will also be closed at process exit.
+The close call is included in the implemented operations largely for
+completeness and to provide for a consistent user space implementation.
+
+
+AUTOFS_DEV_IOCTL_READY_CMD and AUTOFS_DEV_IOCTL_FAIL_CMD
+--------------------------------------------------------
+
+Return mount and expire result status from user space to the kernel.
+Both of these calls require an initialized struct autofs_dev_ioctl
+with the ioctlfd field set to the descriptor obtained from the open
+call and the arg1 field set to the wait queue token number, received
+by user space in the foregoing mount or expire request. The arg2 field
+is set to the status to be returned. For the ready call this is always
+0 and for the fail call it is set to the errno of the operation.
+
+
+AUTOFS_DEV_IOCTL_SETPIPEFD_CMD
+------------------------------
+
+Set the pipe file descriptor used for kernel communication to the daemon.
+Normally this is set at mount time using an option but when reconnecting
+to a existing mount we need to use this to tell the autofs mount about
+the new kernel pipe descriptor. In order to protect mounts against
+incorrectly setting the pipe descriptor we also require that the autofs
+mount be catatonic (see next call).
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call and
+the arg1 field set to descriptor of the pipe. On success the call
+also sets the process group id used to identify the controlling process
+(eg. the owning automount(8) daemon) to the process group of the caller.
+
+
+AUTOFS_DEV_IOCTL_CATATONIC_CMD
+------------------------------
+
+Make the autofs mount point catatonic. The autofs mount will no longer
+issue mount requests, the kernel communication pipe descriptor is released
+and any remaining waits in the queue released.
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call.
+
+
+AUTOFS_DEV_IOCTL_TIMEOUT_CMD
+----------------------------
+
+Set the expire timeout for mounts withing an autofs mount point.
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call.
+
+
+AUTOFS_DEV_IOCTL_REQUESTER_CMD
+------------------------------
+
+Return the uid and gid of the last process to successfully trigger a the
+mount on the given path dentry.
+
+The call requires an initialized struct autofs_dev_ioctl with the path
+field set to the mount point in question and the size field adjusted
+appropriately as well as the arg1 field set to the device number of the
+containing autofs mount. Upon return the struct field arg1 contains the
+uid and arg2 the gid.
+
+When reconstructing an autofs mount tree with active mounts we need to
+re-connect to mounts that may have used the original process uid and
+gid (or string variations of them) for mount lookups within the map entry.
+This call provides the ability to obtain this uid and gid so they may be
+used by user space for the mount map lookups.
+
+
+AUTOFS_DEV_IOCTL_EXPIRE_CMD
+---------------------------
+
+Issue an expire request to the kernel for an autofs mount. Typically
+this ioctl is called until no further expire candidates are found.
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call. In
+addition an immediate expire, independent of the mount timeout, can be
+requested by setting the arg1 field to 1. If no expire candidates can
+be found the ioctl returns -1 with errno set to EAGAIN.
+
+This call causes the kernel module to check the mount corresponding
+to the given ioctlfd for mounts that can be expired, issues an expire
+request back to the daemon and waits for completion.
+
+AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD
+------------------------------
+
+Checks if an autofs mount point is in use.
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call and
+it returns the result in the arg1 field, 1 for busy and 0 otherwise.
+
+
+AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD
+---------------------------------
+
+Check if the given path is a mountpoint.
+
+The call requires an initialized struct autofs_dev_ioctl. There are two
+possible variations. Both use the path field set to the path of the mount
+point to check and the size field adjusted appropriately. One uses the
+ioctlfd field to identify a specific mount point to check while the other
+variation uses the path and optionaly arg1 set to an autofs mount type.
+The call returns 1 if this is a mount point and sets arg1 to the device
+number of the mount and field arg2 to the relevant super block magic
+number (described below) or 0 if it isn't a mountpoint. In both cases
+the the device number (as returned by new_encode_dev()) is returned
+in field arg1.
+
+If supplied with a file descriptor we're looking for a specific mount,
+not necessarily at the top of the mounted stack. In this case the path
+the descriptor corresponds to is considered a mountpoint if it is itself
+a mountpoint or contains a mount, such as a multi-mount without a root
+mount. In this case we return 1 if the descriptor corresponds to a mount
+point and and also returns the super magic of the covering mount if there
+is one or 0 if it isn't a mountpoint.
+
+If a path is supplied (and the ioctlfd field is set to -1) then the path
+is looked up and is checked to see if it is the root of a mount. If a
+type is also given we are looking for a particular autofs mount and if
+a match isn't found a fail is returned. If the the located path is the
+root of a mount 1 is returned along with the super magic of the mount
+or 0 otherwise.
+
diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt
new file mode 100644
index 0000000..7cac200
--- /dev/null
+++ b/Documentation/filesystems/automount-support.txt
@@ -0,0 +1,118 @@
+Support is available for filesystems that wish to do automounting support (such
+as kAFS which can be found in fs/afs/). This facility includes allowing
+in-kernel mounts to be performed and mountpoint degradation to be
+requested. The latter can also be requested by userspace.
+
+
+======================
+IN-KERNEL AUTOMOUNTING
+======================
+
+A filesystem can now mount another filesystem on one of its directories by the
+following procedure:
+
+ (1) Give the directory a follow_link() operation.
+
+ When the directory is accessed, the follow_link op will be called, and
+ it will be provided with the location of the mountpoint in the nameidata
+ structure (vfsmount and dentry).
+
+ (2) Have the follow_link() op do the following steps:
+
+ (a) Call vfs_kern_mount() to call the appropriate filesystem to set up a
+ superblock and gain a vfsmount structure representing it.
+
+ (b) Copy the nameidata provided as an argument and substitute the dentry
+ argument into it the copy.
+
+ (c) Call do_add_mount() to install the new vfsmount into the namespace's
+ mountpoint tree, thus making it accessible to userspace. Use the
+ nameidata set up in (b) as the destination.
+
+ If the mountpoint will be automatically expired, then do_add_mount()
+ should also be given the location of an expiration list (see further
+ down).
+
+ (d) Release the path in the nameidata argument and substitute in the new
+ vfsmount and its root dentry. The ref counts on these will need
+ incrementing.
+
+Then from userspace, you can just do something like:
+
+ [root@andromeda root]# mount -t afs \#root.afs. /afs
+ [root@andromeda root]# ls /afs
+ asd cambridge cambridge.redhat.com grand.central.org
+ [root@andromeda root]# ls /afs/cambridge
+ afsdoc
+ [root@andromeda root]# ls /afs/cambridge/afsdoc/
+ ChangeLog html LICENSE pdf RELNOTES-1.2.2
+
+And then if you look in the mountpoint catalogue, you'll see something like:
+
+ [root@andromeda root]# cat /proc/mounts
+ ...
+ #root.afs. /afs afs rw 0 0
+ #root.cell. /afs/cambridge.redhat.com afs rw 0 0
+ #afsdoc. /afs/cambridge.redhat.com/afsdoc afs rw 0 0
+
+
+===========================
+AUTOMATIC MOUNTPOINT EXPIRY
+===========================
+
+Automatic expiration of mountpoints is easy, provided you've mounted the
+mountpoint to be expired in the automounting procedure outlined above.
+
+To do expiration, you need to follow these steps:
+
+ (3) Create at least one list off which the vfsmounts to be expired can be
+ hung. Access to this list will be governed by the vfsmount_lock.
+
+ (4) In step (2c) above, the call to do_add_mount() should be provided with a
+ pointer to this list. It will hang the vfsmount off of it if it succeeds.
+
+ (5) When you want mountpoints to be expired, call mark_mounts_for_expiry()
+ with a pointer to this list. This will process the list, marking every
+ vfsmount thereon for potential expiry on the next call.
+
+ If a vfsmount was already flagged for expiry, and if its usage count is 1
+ (it's only referenced by its parent vfsmount), then it will be deleted
+ from the namespace and thrown away (effectively unmounted).
+
+ It may prove simplest to simply call this at regular intervals, using
+ some sort of timed event to drive it.
+
+The expiration flag is cleared by calls to mntput. This means that expiration
+will only happen on the second expiration request after the last time the
+mountpoint was accessed.
+
+If a mountpoint is moved, it gets removed from the expiration list. If a bind
+mount is made on an expirable mount, the new vfsmount will not be on the
+expiration list and will not expire.
+
+If a namespace is copied, all mountpoints contained therein will be copied,
+and the copies of those that are on an expiration list will be added to the
+same expiration list.
+
+
+=======================
+USERSPACE DRIVEN EXPIRY
+=======================
+
+As an alternative, it is possible for userspace to request expiry of any
+mountpoint (though some will be rejected - the current process's idea of the
+rootfs for example). It does this by passing the MNT_EXPIRE flag to
+umount(). This flag is considered incompatible with MNT_FORCE and MNT_DETACH.
+
+If the mountpoint in question is in referenced by something other than
+umount() or its parent mountpoint, an EBUSY error will be returned and the
+mountpoint will not be marked for expiration or unmounted.
+
+If the mountpoint was not already marked for expiry at that time, an EAGAIN
+error will be given and it won't be unmounted.
+
+Otherwise if it was already marked and it wasn't referenced, unmounting will
+take place as usual.
+
+Again, the expiration flag is cleared every time anything other than umount()
+looks at a mountpoint.
diff --git a/Documentation/filesystems/befs.txt b/Documentation/filesystems/befs.txt
new file mode 100644
index 0000000..67391a1
--- /dev/null
+++ b/Documentation/filesystems/befs.txt
@@ -0,0 +1,117 @@
+BeOS filesystem for Linux
+
+Document last updated: Dec 6, 2001
+
+WARNING
+=======
+Make sure you understand that this is alpha software. This means that the
+implementation is neither complete nor well-tested.
+
+I DISCLAIM ALL RESPONSIBILITY FOR ANY POSSIBLE BAD EFFECTS OF THIS CODE!
+
+LICENSE
+=====
+This software is covered by the GNU General Public License.
+See the file COPYING for the complete text of the license.
+Or the GNU website: <http://www.gnu.org/licenses/licenses.html>
+
+AUTHOR
+=====
+The largest part of the code written by Will Dyson <will_dyson@pobox.com>
+He has been working on the code since Aug 13, 2001. See the changelog for
+details.
+
+Original Author: Makoto Kato <m_kato@ga2.so-net.ne.jp>
+His original code can still be found at:
+<http://hp.vector.co.jp/authors/VA008030/bfs/>
+Does anyone know of a more current email address for Makoto? He doesn't
+respond to the address given above...
+
+Current maintainer: Sergey S. Kostyliov <rathamahata@php4.ru>
+
+WHAT IS THIS DRIVER?
+==================
+This module implements the native filesystem of BeOS <http://www.be.com/>
+for the linux 2.4.1 and later kernels. Currently it is a read-only
+implementation.
+
+Which is it, BFS or BEFS?
+================
+Be, Inc said, "BeOS Filesystem is officially called BFS, not BeFS".
+But Unixware Boot Filesystem is called bfs, too. And they are already in
+the kernel. Because of this naming conflict, on Linux the BeOS
+filesystem is called befs.
+
+HOW TO INSTALL
+==============
+step 1. Install the BeFS patch into the source code tree of linux.
+
+Apply the patchfile to your kernel source tree.
+Assuming that your kernel source is in /foo/bar/linux and the patchfile
+is called patch-befs-xxx, you would do the following:
+
+ cd /foo/bar/linux
+ patch -p1 < /path/to/patch-befs-xxx
+
+if the patching step fails (i.e. there are rejected hunks), you can try to
+figure it out yourself (it shouldn't be hard), or mail the maintainer
+(Will Dyson <will_dyson@pobox.com>) for help.
+
+step 2. Configuration & make kernel
+
+The linux kernel has many compile-time options. Most of them are beyond the
+scope of this document. I suggest the Kernel-HOWTO document as a good general
+reference on this topic. <http://www.linux.com/howto/Kernel-HOWTO.html>
+
+However, to use the BeFS module, you must enable it at configure time.
+
+ cd /foo/bar/linux
+ make menuconfig (or xconfig)
+
+The BeFS module is not a standard part of the linux kernel, so you must first
+enable support for experimental code under the "Code maturity level" menu.
+
+Then, under the "Filesystems" menu will be an option called "BeFS
+filesystem (experimental)", or something like that. Enable that option
+(it is fine to make it a module).
+
+Save your kernel configuration and then build your kernel.
+
+step 3. Install
+
+See the kernel howto <http://www.linux.com/howto/Kernel-HOWTO.html> for
+instructions on this critical step.
+
+USING BFS
+=========
+To use the BeOS filesystem, use filesystem type 'befs'.
+
+ex)
+ mount -t befs /dev/fd0 /beos
+
+MOUNT OPTIONS
+=============
+uid=nnn All files in the partition will be owned by user id nnn.
+gid=nnn All files in the partition will be in group nnn.
+iocharset=xxx Use xxx as the name of the NLS translation table.
+debug The driver will output debugging information to the syslog.
+
+HOW TO GET LASTEST VERSION
+==========================
+
+The latest version is currently available at:
+<http://befs-driver.sourceforge.net/>
+
+ANY KNOWN BUGS?
+===========
+As of Jan 20, 2002:
+
+ None
+
+SPECIAL THANKS
+==============
+Dominic Giampalo ... Writing "Practical file system design with Be filesystem"
+Hiroyuki Yamada ... Testing LinuxPPC.
+
+
+
diff --git a/Documentation/filesystems/bfs.txt b/Documentation/filesystems/bfs.txt
new file mode 100644
index 0000000..78043d5
--- /dev/null
+++ b/Documentation/filesystems/bfs.txt
@@ -0,0 +1,57 @@
+BFS FILESYSTEM FOR LINUX
+========================
+
+The BFS filesystem is used by SCO UnixWare OS for the /stand slice, which
+usually contains the kernel image and a few other files required for the
+boot process.
+
+In order to access /stand partition under Linux you obviously need to
+know the partition number and the kernel must support UnixWare disk slices
+(CONFIG_UNIXWARE_DISKLABEL config option). However BFS support does not
+depend on having UnixWare disklabel support because one can also mount
+BFS filesystem via loopback:
+
+# losetup /dev/loop0 stand.img
+# mount -t bfs /dev/loop0 /mnt/stand
+
+where stand.img is a file containing the image of BFS filesystem.
+When you have finished using it and umounted you need to also deallocate
+/dev/loop0 device by:
+
+# losetup -d /dev/loop0
+
+You can simplify mounting by just typing:
+
+# mount -t bfs -o loop stand.img /mnt/stand
+
+this will allocate the first available loopback device (and load loop.o
+kernel module if necessary) automatically. If the loopback driver is not
+loaded automatically, make sure that you have compiled the module and
+that modprobe is functioning. Beware that umount will not deallocate
+/dev/loopN device if /etc/mtab file on your system is a symbolic link to
+/proc/mounts. You will need to do it manually using "-d" switch of
+losetup(8). Read losetup(8) manpage for more info.
+
+To create the BFS image under UnixWare you need to find out first which
+slice contains it. The command prtvtoc(1M) is your friend:
+
+# prtvtoc /dev/rdsk/c0b0t0d0s0
+
+(assuming your root disk is on target=0, lun=0, bus=0, controller=0). Then you
+look for the slice with tag "STAND", which is usually slice 10. With this
+information you can use dd(1) to create the BFS image:
+
+# umount /stand
+# dd if=/dev/rdsk/c0b0t0d0sa of=stand.img bs=512
+
+Just in case, you can verify that you have done the right thing by checking
+the magic number:
+
+# od -Ad -tx4 stand.img | more
+
+The first 4 bytes should be 0x1badface.
+
+If you have any patches, questions or suggestions regarding this BFS
+implementation please contact the author:
+
+Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
diff --git a/Documentation/filesystems/cifs.txt b/Documentation/filesystems/cifs.txt
new file mode 100644
index 0000000..49cc923
--- /dev/null
+++ b/Documentation/filesystems/cifs.txt
@@ -0,0 +1,51 @@
+ This is the client VFS module for the Common Internet File System
+ (CIFS) protocol which is the successor to the Server Message Block
+ (SMB) protocol, the native file sharing mechanism for most early
+ PC operating systems. CIFS is fully supported by current network
+ file servers such as Windows 2000, Windows 2003 (including
+ Windows XP) as well by Samba (which provides excellent CIFS
+ server support for Linux and many other operating systems), so
+ this network filesystem client can mount to a wide variety of
+ servers. The smbfs module should be used instead of this cifs module
+ for mounting to older SMB servers such as OS/2. The smbfs and cifs
+ modules can coexist and do not conflict. The CIFS VFS filesystem
+ module is designed to work well with servers that implement the
+ newer versions (dialects) of the SMB/CIFS protocol such as Samba,
+ the program written by Andrew Tridgell that turns any Unix host
+ into a SMB/CIFS file server.
+
+ The intent of this module is to provide the most advanced network
+ file system function for CIFS compliant servers, including better
+ POSIX compliance, secure per-user session establishment, high
+ performance safe distributed caching (oplock), optional packet
+ signing, large files, Unicode support and other internationalization
+ improvements. Since both Samba server and this filesystem client support
+ the CIFS Unix extensions, the combination can provide a reasonable
+ alternative to NFSv4 for fileserving in some Linux to Linux environments,
+ not just in Linux to Windows environments.
+
+ This filesystem has an optional mount utility (mount.cifs) that can
+ be obtained from the project page and installed in the path in the same
+ directory with the other mount helpers (such as mount.smbfs).
+ Mounting using the cifs filesystem without installing the mount helper
+ requires specifying the server's ip address.
+
+ For Linux 2.4:
+ mount //anything/here /mnt_target -o
+ user=username,pass=password,unc=//ip_address_of_server/sharename
+
+ For Linux 2.5:
+ mount //ip_address_of_server/sharename /mnt_target -o user=username, pass=password
+
+
+ For more information on the module see the project page at
+
+ http://us1.samba.org/samba/Linux_CIFS_client.html
+
+ For more information on CIFS see:
+
+ http://www.snia.org/tech_activities/CIFS
+
+ or the Samba site:
+
+ http://www.samba.org
diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt
new file mode 100644
index 0000000..6131135
--- /dev/null
+++ b/Documentation/filesystems/coda.txt
@@ -0,0 +1,1673 @@
+NOTE:
+This is one of the technical documents describing a component of
+Coda -- this document describes the client kernel-Venus interface.
+
+For more information:
+ http://www.coda.cs.cmu.edu
+For user level software needed to run Coda:
+ ftp://ftp.coda.cs.cmu.edu
+
+To run Coda you need to get a user level cache manager for the client,
+named Venus, as well as tools to manipulate ACLs, to log in, etc. The
+client needs to have the Coda filesystem selected in the kernel
+configuration.
+
+The server needs a user level server and at present does not depend on
+kernel support.
+
+
+
+
+
+
+
+ The Venus kernel interface
+ Peter J. Braam
+ v1.0, Nov 9, 1997
+
+ This document describes the communication between Venus and kernel
+ level filesystem code needed for the operation of the Coda file sys-
+ tem. This document version is meant to describe the current interface
+ (version 1.0) as well as improvements we envisage.
+ ______________________________________________________________________
+
+ Table of Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1. Introduction
+
+ 2. Servicing Coda filesystem calls
+
+ 3. The message layer
+
+ 3.1 Implementation details
+
+ 4. The interface at the call level
+
+ 4.1 Data structures shared by the kernel and Venus
+ 4.2 The pioctl interface
+ 4.3 root
+ 4.4 lookup
+ 4.5 getattr
+ 4.6 setattr
+ 4.7 access
+ 4.8 create
+ 4.9 mkdir
+ 4.10 link
+ 4.11 symlink
+ 4.12 remove
+ 4.13 rmdir
+ 4.14 readlink
+ 4.15 open
+ 4.16 close
+ 4.17 ioctl
+ 4.18 rename
+ 4.19 readdir
+ 4.20 vget
+ 4.21 fsync
+ 4.22 inactive
+ 4.23 rdwr
+ 4.24 odymount
+ 4.25 ody_lookup
+ 4.26 ody_expand
+ 4.27 prefetch
+ 4.28 signal
+
+ 5. The minicache and downcalls
+
+ 5.1 INVALIDATE
+ 5.2 FLUSH
+ 5.3 PURGEUSER
+ 5.4 ZAPFILE
+ 5.5 ZAPDIR
+ 5.6 ZAPVNODE
+ 5.7 PURGEFID
+ 5.8 REPLACE
+
+ 6. Initialization and cleanup
+
+ 6.1 Requirements
+
+
+ ______________________________________________________________________
+ 0wpage
+
+ 11.. IInnttrroodduuccttiioonn
+
+
+
+ A key component in the Coda Distributed File System is the cache
+ manager, _V_e_n_u_s.
+
+
+ When processes on a Coda enabled system access files in the Coda
+ filesystem, requests are directed at the filesystem layer in the
+ operating system. The operating system will communicate with Venus to
+ service the request for the process. Venus manages a persistent
+ client cache and makes remote procedure calls to Coda file servers and
+ related servers (such as authentication servers) to service these
+ requests it receives from the operating system. When Venus has
+ serviced a request it replies to the operating system with appropriate
+ return codes, and other data related to the request. Optionally the
+ kernel support for Coda may maintain a minicache of recently processed
+ requests to limit the number of interactions with Venus. Venus
+ possesses the facility to inform the kernel when elements from its
+ minicache are no longer valid.
+
+ This document describes precisely this communication between the
+ kernel and Venus. The definitions of so called upcalls and downcalls
+ will be given with the format of the data they handle. We shall also
+ describe the semantic invariants resulting from the calls.
+
+ Historically Coda was implemented in a BSD file system in Mach 2.6.
+ The interface between the kernel and Venus is very similar to the BSD
+ VFS interface. Similar functionality is provided, and the format of
+ the parameters and returned data is very similar to the BSD VFS. This
+ leads to an almost natural environment for implementing a kernel-level
+ filesystem driver for Coda in a BSD system. However, other operating
+ systems such as Linux and Windows 95 and NT have virtual filesystem
+ with different interfaces.
+
+ To implement Coda on these systems some reverse engineering of the
+ Venus/Kernel protocol is necessary. Also it came to light that other
+ systems could profit significantly from certain small optimizations
+ and modifications to the protocol. To facilitate this work as well as
+ to make future ports easier, communication between Venus and the
+ kernel should be documented in great detail. This is the aim of this
+ document.
+
+ 0wpage
+
+ 22.. SSeerrvviicciinngg CCooddaa ffiilleessyysstteemm ccaallllss
+
+ The service of a request for a Coda file system service originates in
+ a process PP which accessing a Coda file. It makes a system call which
+ traps to the OS kernel. Examples of such calls trapping to the kernel
+ are _r_e_a_d_, _w_r_i_t_e_, _o_p_e_n_, _c_l_o_s_e_, _c_r_e_a_t_e_, _m_k_d_i_r_, _r_m_d_i_r_, _c_h_m_o_d in a Unix
+ context. Similar calls exist in the Win32 environment, and are named
+ _C_r_e_a_t_e_F_i_l_e_, .
+
+ Generally the operating system handles the request in a virtual
+ filesystem (VFS) layer, which is named I/O Manager in NT and IFS
+ manager in Windows 95. The VFS is responsible for partial processing
+ of the request and for locating the specific filesystem(s) which will
+ service parts of the request. Usually the information in the path
+ assists in locating the correct FS drivers. Sometimes after extensive
+ pre-processing, the VFS starts invoking exported routines in the FS
+ driver. This is the point where the FS specific processing of the
+ request starts, and here the Coda specific kernel code comes into
+ play.
+
+ The FS layer for Coda must expose and implement several interfaces.
+ First and foremost the VFS must be able to make all necessary calls to
+ the Coda FS layer, so the Coda FS driver must expose the VFS interface
+ as applicable in the operating system. These differ very significantly
+ among operating systems, but share features such as facilities to
+ read/write and create and remove objects. The Coda FS layer services
+ such VFS requests by invoking one or more well defined services
+ offered by the cache manager Venus. When the replies from Venus have
+ come back to the FS driver, servicing of the VFS call continues and
+ finishes with a reply to the kernel's VFS. Finally the VFS layer
+ returns to the process.
+
+ As a result of this design a basic interface exposed by the FS driver
+ must allow Venus to manage message traffic. In particular Venus must
+ be able to retrieve and place messages and to be notified of the
+ arrival of a new message. The notification must be through a mechanism
+ which does not block Venus since Venus must attend to other tasks even
+ when no messages are waiting or being processed.
+
+
+
+
+
+
+ Interfaces of the Coda FS Driver
+
+ Furthermore the FS layer provides for a special path of communication
+ between a user process and Venus, called the pioctl interface. The
+ pioctl interface is used for Coda specific services, such as
+ requesting detailed information about the persistent cache managed by
+ Venus. Here the involvement of the kernel is minimal. It identifies
+ the calling process and passes the information on to Venus. When
+ Venus replies the response is passed back to the caller in unmodified
+ form.
+
+ Finally Venus allows the kernel FS driver to cache the results from
+ certain services. This is done to avoid excessive context switches
+ and results in an efficient system. However, Venus may acquire
+ information, for example from the network which implies that cached
+ information must be flushed or replaced. Venus then makes a downcall
+ to the Coda FS layer to request flushes or updates in the cache. The
+ kernel FS driver handles such requests synchronously.
+
+ Among these interfaces the VFS interface and the facility to place,
+ receive and be notified of messages are platform specific. We will
+ not go into the calls exported to the VFS layer but we will state the
+ requirements of the message exchange mechanism.
+
+ 0wpage
+
+ 33.. TThhee mmeessssaaggee llaayyeerr
+
+
+
+ At the lowest level the communication between Venus and the FS driver
+ proceeds through messages. The synchronization between processes
+ requesting Coda file service and Venus relies on blocking and waking
+ up processes. The Coda FS driver processes VFS- and pioctl-requests
+ on behalf of a process P, creates messages for Venus, awaits replies
+ and finally returns to the caller. The implementation of the exchange
+ of messages is platform specific, but the semantics have (so far)
+ appeared to be generally applicable. Data buffers are created by the
+ FS Driver in kernel memory on behalf of P and copied to user memory in
+ Venus.
+
+ The FS Driver while servicing P makes upcalls to Venus. Such an
+ upcall is dispatched to Venus by creating a message structure. The
+ structure contains the identification of P, the message sequence
+ number, the size of the request and a pointer to the data in kernel
+ memory for the request. Since the data buffer is re-used to hold the
+ reply from Venus, there is a field for the size of the reply. A flags
+ field is used in the message to precisely record the status of the
+ message. Additional platform dependent structures involve pointers to
+ determine the position of the message on queues and pointers to
+ synchronization objects. In the upcall routine the message structure
+ is filled in, flags are set to 0, and it is placed on the _p_e_n_d_i_n_g
+ queue. The routine calling upcall is responsible for allocating the
+ data buffer; its structure will be described in the next section.
+
+ A facility must exist to notify Venus that the message has been
+ created, and implemented using available synchronization objects in
+ the OS. This notification is done in the upcall context of the process
+ P. When the message is on the pending queue, process P cannot proceed
+ in upcall. The (kernel mode) processing of P in the filesystem
+ request routine must be suspended until Venus has replied. Therefore
+ the calling thread in P is blocked in upcall. A pointer in the
+ message structure will locate the synchronization object on which P is
+ sleeping.
+
+ Venus detects the notification that a message has arrived, and the FS
+ driver allow Venus to retrieve the message with a getmsg_from_kernel
+ call. This action finishes in the kernel by putting the message on the
+ queue of processing messages and setting flags to READ. Venus is
+ passed the contents of the data buffer. The getmsg_from_kernel call
+ now returns and Venus processes the request.
+
+ At some later point the FS driver receives a message from Venus,
+ namely when Venus calls sendmsg_to_kernel. At this moment the Coda FS
+ driver looks at the contents of the message and decides if:
+
+
+ +o the message is a reply for a suspended thread P. If so it removes
+ the message from the processing queue and marks the message as
+ WRITTEN. Finally, the FS driver unblocks P (still in the kernel
+ mode context of Venus) and the sendmsg_to_kernel call returns to
+ Venus. The process P will be scheduled at some point and continues
+ processing its upcall with the data buffer replaced with the reply
+ from Venus.
+
+ +o The message is a _d_o_w_n_c_a_l_l. A downcall is a request from Venus to
+ the FS Driver. The FS driver processes the request immediately
+ (usually a cache eviction or replacement) and when it finishes
+ sendmsg_to_kernel returns.
+
+ Now P awakes and continues processing upcall. There are some
+ subtleties to take account of. First P will determine if it was woken
+ up in upcall by a signal from some other source (for example an
+ attempt to terminate P) or as is normally the case by Venus in its
+ sendmsg_to_kernel call. In the normal case, the upcall routine will
+ deallocate the message structure and return. The FS routine can proceed
+ with its processing.
+
+
+
+
+
+
+
+ Sleeping and IPC arrangements
+
+ In case P is woken up by a signal and not by Venus, it will first look
+ at the flags field. If the message is not yet READ, the process P can
+ handle its signal without notifying Venus. If Venus has READ, and
+ the request should not be processed, P can send Venus a signal message
+ to indicate that it should disregard the previous message. Such
+ signals are put in the queue at the head, and read first by Venus. If
+ the message is already marked as WRITTEN it is too late to stop the
+ processing. The VFS routine will now continue. (-- If a VFS request
+ involves more than one upcall, this can lead to complicated state, an
+ extra field "handle_signals" could be added in the message structure
+ to indicate points of no return have been passed.--)
+
+
+
+ 33..11.. IImmpplleemmeennttaattiioonn ddeettaaiillss
+
+ The Unix implementation of this mechanism has been through the
+ implementation of a character device associated with Coda. Venus
+ retrieves messages by doing a read on the device, replies are sent
+ with a write and notification is through the select system call on the
+ file descriptor for the device. The process P is kept waiting on an
+ interruptible wait queue object.
+
+ In Windows NT and the DPMI Windows 95 implementation a DeviceIoControl
+ call is used. The DeviceIoControl call is designed to copy buffers
+ from user memory to kernel memory with OPCODES. The sendmsg_to_kernel
+ is issued as a synchronous call, while the getmsg_from_kernel call is
+ asynchronous. Windows EventObjects are used for notification of
+ message arrival. The process P is kept waiting on a KernelEvent
+ object in NT and a semaphore in Windows 95.
+
+ 0wpage
+
+ 44.. TThhee iinntteerrffaaccee aatt tthhee ccaallll lleevveell
+
+
+ This section describes the upcalls a Coda FS driver can make to Venus.
+ Each of these upcalls make use of two structures: inputArgs and
+ outputArgs. In pseudo BNF form the structures take the following
+ form:
+
+
+ struct inputArgs {
+ u_long opcode;
+ u_long unique; /* Keep multiple outstanding msgs distinct */
+ u_short pid; /* Common to all */
+ u_short pgid; /* Common to all */
+ struct CodaCred cred; /* Common to all */
+
+ <union "in" of call dependent parts of inputArgs>
+ };
+
+ struct outputArgs {
+ u_long opcode;
+ u_long unique; /* Keep multiple outstanding msgs distinct */
+ u_long result;
+
+ <union "out" of call dependent parts of inputArgs>
+ };
+
+
+
+ Before going on let us elucidate the role of the various fields. The
+ inputArgs start with the opcode which defines the type of service
+ requested from Venus. There are approximately 30 upcalls at present
+ which we will discuss. The unique field labels the inputArg with a
+ unique number which will identify the message uniquely. A process and
+ process group id are passed. Finally the credentials of the caller
+ are included.
+
+ Before delving into the specific calls we need to discuss a variety of
+ data structures shared by the kernel and Venus.
+
+
+
+
+ 44..11.. DDaattaa ssttrruuccttuurreess sshhaarreedd bbyy tthhee kkeerrnneell aanndd VVeennuuss
+
+
+ The CodaCred structure defines a variety of user and group ids as
+ they are set for the calling process. The vuid_t and guid_t are 32 bit
+ unsigned integers. It also defines group membership in an array. On
+ Unix the CodaCred has proven sufficient to implement good security
+ semantics for Coda but the structure may have to undergo modification
+ for the Windows environment when these mature.
+
+ struct CodaCred {
+ vuid_t cr_uid, cr_euid, cr_suid, cr_fsuid; /* Real, effective, set, fs uid*/
+ vgid_t cr_gid, cr_egid, cr_sgid, cr_fsgid; /* same for groups */
+ vgid_t cr_groups[NGROUPS]; /* Group membership for caller */
+ };
+
+
+
+ NNOOTTEE It is questionable if we need CodaCreds in Venus. Finally Venus
+ doesn't know about groups, although it does create files with the
+ default uid/gid. Perhaps the list of group membership is superfluous.
+
+
+ The next item is the fundamental identifier used to identify Coda
+ files, the ViceFid. A fid of a file uniquely defines a file or
+ directory in the Coda filesystem within a _c_e_l_l. (-- A _c_e_l_l is a
+ group of Coda servers acting under the aegis of a single system
+ control machine or SCM. See the Coda Administration manual for a
+ detailed description of the role of the SCM.--)
+
+
+ typedef struct ViceFid {
+ VolumeId Volume;
+ VnodeId Vnode;
+ Unique_t Unique;
+ } ViceFid;
+
+
+
+ Each of the constituent fields: VolumeId, VnodeId and Unique_t are
+ unsigned 32 bit integers. We envisage that a further field will need
+ to be prefixed to identify the Coda cell; this will probably take the
+ form of a Ipv6 size IP address naming the Coda cell through DNS.
+
+ The next important structure shared between Venus and the kernel is
+ the attributes of the file. The following structure is used to
+ exchange information. It has room for future extensions such as
+ support for device files (currently not present in Coda).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ struct coda_vattr {
+ enum coda_vtype va_type; /* vnode type (for create) */
+ u_short va_mode; /* files access mode and type */
+ short va_nlink; /* number of references to file */
+ vuid_t va_uid; /* owner user id */
+ vgid_t va_gid; /* owner group id */
+ long va_fsid; /* file system id (dev for now) */
+ long va_fileid; /* file id */
+ u_quad_t va_size; /* file size in bytes */
+ long va_blocksize; /* blocksize preferred for i/o */
+ struct timespec va_atime; /* time of last access */
+ struct timespec va_mtime; /* time of last modification */
+ struct timespec va_ctime; /* time file changed */
+ u_long va_gen; /* generation number of file */
+ u_long va_flags; /* flags defined for file */
+ dev_t va_rdev; /* device special file represents */
+ u_quad_t va_bytes; /* bytes of disk space held by file */
+ u_quad_t va_filerev; /* file modification number */
+ u_int va_vaflags; /* operations flags, see below */
+ long va_spare; /* remain quad aligned */
+ };
+
+
+
+
+ 44..22.. TThhee ppiiooccttll iinntteerrffaaccee
+
+
+ Coda specific requests can be made by application through the pioctl
+ interface. The pioctl is implemented as an ordinary ioctl on a
+ fictitious file /coda/.CONTROL. The pioctl call opens this file, gets
+ a file handle and makes the ioctl call. Finally it closes the file.
+
+ The kernel involvement in this is limited to providing the facility to
+ open and close and pass the ioctl message _a_n_d to verify that a path in
+ the pioctl data buffers is a file in a Coda filesystem.
+
+ The kernel is handed a data packet of the form:
+
+ struct {
+ const char *path;
+ struct ViceIoctl vidata;
+ int follow;
+ } data;
+
+
+
+ where
+
+
+ struct ViceIoctl {
+ caddr_t in, out; /* Data to be transferred in, or out */
+ short in_size; /* Size of input buffer <= 2K */
+ short out_size; /* Maximum size of output buffer, <= 2K */
+ };
+
+
+
+ The path must be a Coda file, otherwise the ioctl upcall will not be
+ made.
+
+ NNOOTTEE The data structures and code are a mess. We need to clean this
+ up.
+
+ We now proceed to document the individual calls:
+
+ 0wpage
+
+ 44..33.. rroooott
+
+
+ AArrgguummeennttss
+
+ iinn empty
+
+ oouutt
+
+ struct cfs_root_out {
+ ViceFid VFid;
+ } cfs_root;
+
+
+
+ DDeessccrriippttiioonn This call is made to Venus during the initialization of
+ the Coda filesystem. If the result is zero, the cfs_root structure
+ contains the ViceFid of the root of the Coda filesystem. If a non-zero
+ result is generated, its value is a platform dependent error code
+ indicating the difficulty Venus encountered in locating the root of
+ the Coda filesystem.
+
+ 0wpage
+
+ 44..44.. llooookkuupp
+
+
+ SSuummmmaarryy Find the ViceFid and type of an object in a directory if it
+ exists.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_lookup_in {
+ ViceFid VFid;
+ char *name; /* Place holder for data. */
+ } cfs_lookup;
+
+
+
+ oouutt
+
+ struct cfs_lookup_out {
+ ViceFid VFid;
+ int vtype;
+ } cfs_lookup;
+
+
+
+ DDeessccrriippttiioonn This call is made to determine the ViceFid and filetype of
+ a directory entry. The directory entry requested carries name name
+ and Venus will search the directory identified by cfs_lookup_in.VFid.
+ The result may indicate that the name does not exist, or that
+ difficulty was encountered in finding it (e.g. due to disconnection).
+ If the result is zero, the field cfs_lookup_out.VFid contains the
+ targets ViceFid and cfs_lookup_out.vtype the coda_vtype giving the
+ type of object the name designates.
+
+ The name of the object is an 8 bit character string of maximum length
+ CFS_MAXNAMLEN, currently set to 256 (including a 0 terminator.)
+
+ It is extremely important to realize that Venus bitwise ors the field
+ cfs_lookup.vtype with CFS_NOCACHE to indicate that the object should
+ not be put in the kernel name cache.
+
+ NNOOTTEE The type of the vtype is currently wrong. It should be
+ coda_vtype. Linux does not take note of CFS_NOCACHE. It should.
+
+ 0wpage
+
+ 44..55.. ggeettaattttrr
+
+
+ SSuummmmaarryy Get the attributes of a file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_getattr_in {
+ ViceFid VFid;
+ struct coda_vattr attr; /* XXXXX */
+ } cfs_getattr;
+
+
+
+ oouutt
+
+ struct cfs_getattr_out {
+ struct coda_vattr attr;
+ } cfs_getattr;
+
+
+
+ DDeessccrriippttiioonn This call returns the attributes of the file identified by
+ fid.
+
+ EErrrroorrss Errors can occur if the object with fid does not exist, is
+ unaccessible or if the caller does not have permission to fetch
+ attributes.
+
+ NNoottee Many kernel FS drivers (Linux, NT and Windows 95) need to acquire
+ the attributes as well as the Fid for the instantiation of an internal
+ "inode" or "FileHandle". A significant improvement in performance on
+ such systems could be made by combining the _l_o_o_k_u_p and _g_e_t_a_t_t_r calls
+ both at the Venus/kernel interaction level and at the RPC level.
+
+ The vattr structure included in the input arguments is superfluous and
+ should be removed.
+
+ 0wpage
+
+ 44..66.. sseettaattttrr
+
+
+ SSuummmmaarryy Set the attributes of a file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_setattr_in {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ } cfs_setattr;
+
+
+
+
+ oouutt
+ empty
+
+ DDeessccrriippttiioonn The structure attr is filled with attributes to be changed
+ in BSD style. Attributes not to be changed are set to -1, apart from
+ vtype which is set to VNON. Other are set to the value to be assigned.
+ The only attributes which the FS driver may request to change are the
+ mode, owner, groupid, atime, mtime and ctime. The return value
+ indicates success or failure.
+
+ EErrrroorrss A variety of errors can occur. The object may not exist, may
+ be inaccessible, or permission may not be granted by Venus.
+
+ 0wpage
+
+ 44..77.. aacccceessss
+
+
+ SSuummmmaarryy
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_access_in {
+ ViceFid VFid;
+ int flags;
+ } cfs_access;
+
+
+
+ oouutt
+ empty
+
+ DDeessccrriippttiioonn Verify if access to the object identified by VFid for
+ operations described by flags is permitted. The result indicates if
+ access will be granted. It is important to remember that Coda uses
+ ACLs to enforce protection and that ultimately the servers, not the
+ clients enforce the security of the system. The result of this call
+ will depend on whether a _t_o_k_e_n is held by the user.
+
+ EErrrroorrss The object may not exist, or the ACL describing the protection
+ may not be accessible.
+
+ 0wpage
+
+ 44..88.. ccrreeaattee
+
+
+ SSuummmmaarryy Invoked to create a file
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_create_in {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ int excl;
+ int mode;
+ char *name; /* Place holder for data. */
+ } cfs_create;
+
+
+
+
+ oouutt
+
+ struct cfs_create_out {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ } cfs_create;
+
+
+
+ DDeessccrriippttiioonn This upcall is invoked to request creation of a file.
+ The file will be created in the directory identified by VFid, its name
+ will be name, and the mode will be mode. If excl is set an error will
+ be returned if the file already exists. If the size field in attr is
+ set to zero the file will be truncated. The uid and gid of the file
+ are set by converting the CodaCred to a uid using a macro CRTOUID
+ (this macro is platform dependent). Upon success the VFid and
+ attributes of the file are returned. The Coda FS Driver will normally
+ instantiate a vnode, inode or file handle at kernel level for the new
+ object.
+
+
+ EErrrroorrss A variety of errors can occur. Permissions may be insufficient.
+ If the object exists and is not a file the error EISDIR is returned
+ under Unix.
+
+ NNOOTTEE The packing of parameters is very inefficient and appears to
+ indicate confusion between the system call creat and the VFS operation
+ create. The VFS operation create is only called to create new objects.
+ This create call differs from the Unix one in that it is not invoked
+ to return a file descriptor. The truncate and exclusive options,
+ together with the mode, could simply be part of the mode as it is
+ under Unix. There should be no flags argument; this is used in open
+ (2) to return a file descriptor for READ or WRITE mode.
+
+ The attributes of the directory should be returned too, since the size
+ and mtime changed.
+
+ 0wpage
+
+ 44..99.. mmkkddiirr
+
+
+ SSuummmmaarryy Create a new directory.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_mkdir_in {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ char *name; /* Place holder for data. */
+ } cfs_mkdir;
+
+
+
+ oouutt
+
+ struct cfs_mkdir_out {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ } cfs_mkdir;
+
+
+
+
+ DDeessccrriippttiioonn This call is similar to create but creates a directory.
+ Only the mode field in the input parameters is used for creation.
+ Upon successful creation, the attr returned contains the attributes of
+ the new directory.
+
+ EErrrroorrss As for create.
+
+ NNOOTTEE The input parameter should be changed to mode instead of
+ attributes.
+
+ The attributes of the parent should be returned since the size and
+ mtime changes.
+
+ 0wpage
+
+ 44..1100.. lliinnkk
+
+
+ SSuummmmaarryy Create a link to an existing file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_link_in {
+ ViceFid sourceFid; /* cnode to link *to* */
+ ViceFid destFid; /* Directory in which to place link */
+ char *tname; /* Place holder for data. */
+ } cfs_link;
+
+
+
+ oouutt
+ empty
+
+ DDeessccrriippttiioonn This call creates a link to the sourceFid in the directory
+ identified by destFid with name tname. The source must reside in the
+ target's parent, i.e. the source must be have parent destFid, i.e. Coda
+ does not support cross directory hard links. Only the return value is
+ relevant. It indicates success or the type of failure.
+
+ EErrrroorrss The usual errors can occur.0wpage
+
+ 44..1111.. ssyymmlliinnkk
+
+
+ SSuummmmaarryy create a symbolic link
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_symlink_in {
+ ViceFid VFid; /* Directory to put symlink in */
+ char *srcname;
+ struct coda_vattr attr;
+ char *tname;
+ } cfs_symlink;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Create a symbolic link. The link is to be placed in the
+ directory identified by VFid and named tname. It should point to the
+ pathname srcname. The attributes of the newly created object are to
+ be set to attr.
+
+ EErrrroorrss
+
+ NNOOTTEE The attributes of the target directory should be returned since
+ its size changed.
+
+ 0wpage
+
+ 44..1122.. rreemmoovvee
+
+
+ SSuummmmaarryy Remove a file
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_remove_in {
+ ViceFid VFid;
+ char *name; /* Place holder for data. */
+ } cfs_remove;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Remove file named cfs_remove_in.name in directory
+ identified by VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE The attributes of the directory should be returned since its
+ mtime and size may change.
+
+ 0wpage
+
+ 44..1133.. rrmmddiirr
+
+
+ SSuummmmaarryy Remove a directory
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_rmdir_in {
+ ViceFid VFid;
+ char *name; /* Place holder for data. */
+ } cfs_rmdir;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Remove the directory with name name from the directory
+ identified by VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE The attributes of the parent directory should be returned since
+ its mtime and size may change.
+
+ 0wpage
+
+ 44..1144.. rreeaaddlliinnkk
+
+
+ SSuummmmaarryy Read the value of a symbolic link.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_readlink_in {
+ ViceFid VFid;
+ } cfs_readlink;
+
+
+
+ oouutt
+
+ struct cfs_readlink_out {
+ int count;
+ caddr_t data; /* Place holder for data. */
+ } cfs_readlink;
+
+
+
+ DDeessccrriippttiioonn This routine reads the contents of symbolic link
+ identified by VFid into the buffer data. The buffer data must be able
+ to hold any name up to CFS_MAXNAMLEN (PATH or NAM??).
+
+ EErrrroorrss No unusual errors.
+
+ 0wpage
+
+ 44..1155.. ooppeenn
+
+
+ SSuummmmaarryy Open a file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_open_in {
+ ViceFid VFid;
+ int flags;
+ } cfs_open;
+
+
+
+ oouutt
+
+ struct cfs_open_out {
+ dev_t dev;
+ ino_t inode;
+ } cfs_open;
+
+
+
+ DDeessccrriippttiioonn This request asks Venus to place the file identified by
+ VFid in its cache and to note that the calling process wishes to open
+ it with flags as in open(2). The return value to the kernel differs
+ for Unix and Windows systems. For Unix systems the Coda FS Driver is
+ informed of the device and inode number of the container file in the
+ fields dev and inode. For Windows the path of the container file is
+ returned to the kernel.
+ EErrrroorrss
+
+ NNOOTTEE Currently the cfs_open_out structure is not properly adapted to
+ deal with the Windows case. It might be best to implement two
+ upcalls, one to open aiming at a container file name, the other at a
+ container file inode.
+
+ 0wpage
+
+ 44..1166.. cclloossee
+
+
+ SSuummmmaarryy Close a file, update it on the servers.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_close_in {
+ ViceFid VFid;
+ int flags;
+ } cfs_close;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Close the file identified by VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE The flags argument is bogus and not used. However, Venus' code
+ has room to deal with an execp input field, probably this field should
+ be used to inform Venus that the file was closed but is still memory
+ mapped for execution. There are comments about fetching versus not
+ fetching the data in Venus vproc_vfscalls. This seems silly. If a
+ file is being closed, the data in the container file is to be the new
+ data. Here again the execp flag might be in play to create confusion:
+ currently Venus might think a file can be flushed from the cache when
+ it is still memory mapped. This needs to be understood.
+
+ 0wpage
+
+ 44..1177.. iiooccttll
+
+
+ SSuummmmaarryy Do an ioctl on a file. This includes the pioctl interface.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_ioctl_in {
+ ViceFid VFid;
+ int cmd;
+ int len;
+ int rwflag;
+ char *data; /* Place holder for data. */
+ } cfs_ioctl;
+
+
+
+ oouutt
+
+
+ struct cfs_ioctl_out {
+ int len;
+ caddr_t data; /* Place holder for data. */
+ } cfs_ioctl;
+
+
+
+ DDeessccrriippttiioonn Do an ioctl operation on a file. The command, len and
+ data arguments are filled as usual. flags is not used by Venus.
+
+ EErrrroorrss
+
+ NNOOTTEE Another bogus parameter. flags is not used. What is the
+ business about PREFETCHING in the Venus code?
+
+
+ 0wpage
+
+ 44..1188.. rreennaammee
+
+
+ SSuummmmaarryy Rename a fid.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_rename_in {
+ ViceFid sourceFid;
+ char *srcname;
+ ViceFid destFid;
+ char *destname;
+ } cfs_rename;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Rename the object with name srcname in directory
+ sourceFid to destname in destFid. It is important that the names
+ srcname and destname are 0 terminated strings. Strings in Unix
+ kernels are not always null terminated.
+
+ EErrrroorrss
+
+ 0wpage
+
+ 44..1199.. rreeaaddddiirr
+
+
+ SSuummmmaarryy Read directory entries.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_readdir_in {
+ ViceFid VFid;
+ int count;
+ int offset;
+ } cfs_readdir;
+
+
+
+
+ oouutt
+
+ struct cfs_readdir_out {
+ int size;
+ caddr_t data; /* Place holder for data. */
+ } cfs_readdir;
+
+
+
+ DDeessccrriippttiioonn Read directory entries from VFid starting at offset and
+ read at most count bytes. Returns the data in data and returns
+ the size in size.
+
+ EErrrroorrss
+
+ NNOOTTEE This call is not used. Readdir operations exploit container
+ files. We will re-evaluate this during the directory revamp which is
+ about to take place.
+
+ 0wpage
+
+ 44..2200.. vvggeett
+
+
+ SSuummmmaarryy instructs Venus to do an FSDB->Get.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_vget_in {
+ ViceFid VFid;
+ } cfs_vget;
+
+
+
+ oouutt
+
+ struct cfs_vget_out {
+ ViceFid VFid;
+ int vtype;
+ } cfs_vget;
+
+
+
+ DDeessccrriippttiioonn This upcall asks Venus to do a get operation on an fsobj
+ labelled by VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE This operation is not used. However, it is extremely useful
+ since it can be used to deal with read/write memory mapped files.
+ These can be "pinned" in the Venus cache using vget and released with
+ inactive.
+
+ 0wpage
+
+ 44..2211.. ffssyynncc
+
+
+ SSuummmmaarryy Tell Venus to update the RVM attributes of a file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_fsync_in {
+ ViceFid VFid;
+ } cfs_fsync;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Ask Venus to update RVM attributes of object VFid. This
+ should be called as part of kernel level fsync type calls. The
+ result indicates if the syncing was successful.
+
+ EErrrroorrss
+
+ NNOOTTEE Linux does not implement this call. It should.
+
+ 0wpage
+
+ 44..2222.. iinnaaccttiivvee
+
+
+ SSuummmmaarryy Tell Venus a vnode is no longer in use.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_inactive_in {
+ ViceFid VFid;
+ } cfs_inactive;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn This operation returns EOPNOTSUPP.
+
+ EErrrroorrss
+
+ NNOOTTEE This should perhaps be removed.
+
+ 0wpage
+
+ 44..2233.. rrddwwrr
+
+
+ SSuummmmaarryy Read or write from a file
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_rdwr_in {
+ ViceFid VFid;
+ int rwflag;
+ int count;
+ int offset;
+ int ioflag;
+ caddr_t data; /* Place holder for data. */
+ } cfs_rdwr;
+
+
+
+
+ oouutt
+
+ struct cfs_rdwr_out {
+ int rwflag;
+ int count;
+ caddr_t data; /* Place holder for data. */
+ } cfs_rdwr;
+
+
+
+ DDeessccrriippttiioonn This upcall asks Venus to read or write from a file.
+
+ EErrrroorrss
+
+ NNOOTTEE It should be removed since it is against the Coda philosophy that
+ read/write operations never reach Venus. I have been told the
+ operation does not work. It is not currently used.
+
+
+ 0wpage
+
+ 44..2244.. ooddyymmoouunntt
+
+
+ SSuummmmaarryy Allows mounting multiple Coda "filesystems" on one Unix mount
+ point.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct ody_mount_in {
+ char *name; /* Place holder for data. */
+ } ody_mount;
+
+
+
+ oouutt
+
+ struct ody_mount_out {
+ ViceFid VFid;
+ } ody_mount;
+
+
+
+ DDeessccrriippttiioonn Asks Venus to return the rootfid of a Coda system named
+ name. The fid is returned in VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE This call was used by David for dynamic sets. It should be
+ removed since it causes a jungle of pointers in the VFS mounting area.
+ It is not used by Coda proper. Call is not implemented by Venus.
+
+ 0wpage
+
+ 44..2255.. ooddyy__llooookkuupp
+
+
+ SSuummmmaarryy Looks up something.
+
+ AArrgguummeennttss
+
+ iinn irrelevant
+
+
+ oouutt
+ irrelevant
+
+ DDeessccrriippttiioonn
+
+ EErrrroorrss
+
+ NNOOTTEE Gut it. Call is not implemented by Venus.
+
+ 0wpage
+
+ 44..2266.. ooddyy__eexxppaanndd
+
+
+ SSuummmmaarryy expands something in a dynamic set.
+
+ AArrgguummeennttss
+
+ iinn irrelevant
+
+ oouutt
+ irrelevant
+
+ DDeessccrriippttiioonn
+
+ EErrrroorrss
+
+ NNOOTTEE Gut it. Call is not implemented by Venus.
+
+ 0wpage
+
+ 44..2277.. pprreeffeettcchh
+
+
+ SSuummmmaarryy Prefetch a dynamic set.
+
+ AArrgguummeennttss
+
+ iinn Not documented.
+
+ oouutt
+ Not documented.
+
+ DDeessccrriippttiioonn Venus worker.cc has support for this call, although it is
+ noted that it doesn't work. Not surprising, since the kernel does not
+ have support for it. (ODY_PREFETCH is not a defined operation).
+
+ EErrrroorrss
+
+ NNOOTTEE Gut it. It isn't working and isn't used by Coda.
+
+
+ 0wpage
+
+ 44..2288.. ssiiggnnaall
+
+
+ SSuummmmaarryy Send Venus a signal about an upcall.
+
+ AArrgguummeennttss
+
+ iinn none
+
+ oouutt
+ not applicable.
+
+ DDeessccrriippttiioonn This is an out-of-band upcall to Venus to inform Venus
+ that the calling process received a signal after Venus read the
+ message from the input queue. Venus is supposed to clean up the
+ operation.
+
+ EErrrroorrss No reply is given.
+
+ NNOOTTEE We need to better understand what Venus needs to clean up and if
+ it is doing this correctly. Also we need to handle multiple upcall
+ per system call situations correctly. It would be important to know
+ what state changes in Venus take place after an upcall for which the
+ kernel is responsible for notifying Venus to clean up (e.g. open
+ definitely is such a state change, but many others are maybe not).
+
+ 0wpage
+
+ 55.. TThhee mmiinniiccaacchhee aanndd ddoowwnnccaallllss
+
+
+ The Coda FS Driver can cache results of lookup and access upcalls, to
+ limit the frequency of upcalls. Upcalls carry a price since a process
+ context switch needs to take place. The counterpart of caching the
+ information is that Venus will notify the FS Driver that cached
+ entries must be flushed or renamed.
+
+ The kernel code generally has to maintain a structure which links the
+ internal file handles (called vnodes in BSD, inodes in Linux and
+ FileHandles in Windows) with the ViceFid's which Venus maintains. The
+ reason is that frequent translations back and forth are needed in
+ order to make upcalls and use the results of upcalls. Such linking
+ objects are called ccnnooddeess.
+
+ The current minicache implementations have cache entries which record
+ the following:
+
+ 1. the name of the file
+
+ 2. the cnode of the directory containing the object
+
+ 3. a list of CodaCred's for which the lookup is permitted.
+
+ 4. the cnode of the object
+
+ The lookup call in the Coda FS Driver may request the cnode of the
+ desired object from the cache, by passing its name, directory and the
+ CodaCred's of the caller. The cache will return the cnode or indicate
+ that it cannot be found. The Coda FS Driver must be careful to
+ invalidate cache entries when it modifies or removes objects.
+
+ When Venus obtains information that indicates that cache entries are
+ no longer valid, it will make a downcall to the kernel. Downcalls are
+ intercepted by the Coda FS Driver and lead to cache invalidations of
+ the kind described below. The Coda FS Driver does not return an error
+ unless the downcall data could not be read into kernel memory.
+
+
+ 55..11.. IINNVVAALLIIDDAATTEE
+
+
+ No information is available on this call.
+
+
+ 55..22.. FFLLUUSSHH
+
+
+
+ AArrgguummeennttss None
+
+ SSuummmmaarryy Flush the name cache entirely.
+
+ DDeessccrriippttiioonn Venus issues this call upon startup and when it dies. This
+ is to prevent stale cache information being held. Some operating
+ systems allow the kernel name cache to be switched off dynamically.
+ When this is done, this downcall is made.
+
+
+ 55..33.. PPUURRGGEEUUSSEERR
+
+
+ AArrgguummeennttss
+
+ struct cfs_purgeuser_out {/* CFS_PURGEUSER is a venus->kernel call */
+ struct CodaCred cred;
+ } cfs_purgeuser;
+
+
+
+ DDeessccrriippttiioonn Remove all entries in the cache carrying the Cred. This
+ call is issued when tokens for a user expire or are flushed.
+
+
+ 55..44.. ZZAAPPFFIILLEE
+
+
+ AArrgguummeennttss
+
+ struct cfs_zapfile_out { /* CFS_ZAPFILE is a venus->kernel call */
+ ViceFid CodaFid;
+ } cfs_zapfile;
+
+
+
+ DDeessccrriippttiioonn Remove all entries which have the (dir vnode, name) pair.
+ This is issued as a result of an invalidation of cached attributes of
+ a vnode.
+
+ NNOOTTEE Call is not named correctly in NetBSD and Mach. The minicache
+ zapfile routine takes different arguments. Linux does not implement
+ the invalidation of attributes correctly.
+
+
+
+ 55..55.. ZZAAPPDDIIRR
+
+
+ AArrgguummeennttss
+
+ struct cfs_zapdir_out { /* CFS_ZAPDIR is a venus->kernel call */
+ ViceFid CodaFid;
+ } cfs_zapdir;
+
+
+
+ DDeessccrriippttiioonn Remove all entries in the cache lying in a directory
+ CodaFid, and all children of this directory. This call is issued when
+ Venus receives a callback on the directory.
+
+
+ 55..66.. ZZAAPPVVNNOODDEE
+
+
+
+ AArrgguummeennttss
+
+ struct cfs_zapvnode_out { /* CFS_ZAPVNODE is a venus->kernel call */
+ struct CodaCred cred;
+ ViceFid VFid;
+ } cfs_zapvnode;
+
+
+
+ DDeessccrriippttiioonn Remove all entries in the cache carrying the cred and VFid
+ as in the arguments. This downcall is probably never issued.
+
+
+ 55..77.. PPUURRGGEEFFIIDD
+
+
+ SSuummmmaarryy
+
+ AArrgguummeennttss
+
+ struct cfs_purgefid_out { /* CFS_PURGEFID is a venus->kernel call */
+ ViceFid CodaFid;
+ } cfs_purgefid;
+
+
+
+ DDeessccrriippttiioonn Flush the attribute for the file. If it is a dir (odd
+ vnode), purge its children from the namecache and remove the file from the
+ namecache.
+
+
+
+ 55..88.. RREEPPLLAACCEE
+
+
+ SSuummmmaarryy Replace the Fid's for a collection of names.
+
+ AArrgguummeennttss
+
+ struct cfs_replace_out { /* cfs_replace is a venus->kernel call */
+ ViceFid NewFid;
+ ViceFid OldFid;
+ } cfs_replace;
+
+
+
+ DDeessccrriippttiioonn This routine replaces a ViceFid in the name cache with
+ another. It is added to allow Venus during reintegration to replace
+ locally allocated temp fids while disconnected with global fids even
+ when the reference counts on those fids are not zero.
+
+ 0wpage
+
+ 66.. IInniittiiaalliizzaattiioonn aanndd cclleeaannuupp
+
+
+ This section gives brief hints as to desirable features for the Coda
+ FS Driver at startup and upon shutdown or Venus failures. Before
+ entering the discussion it is useful to repeat that the Coda FS Driver
+ maintains the following data:
+
+
+ 1. message queues
+
+ 2. cnodes
+
+ 3. name cache entries
+
+ The name cache entries are entirely private to the driver, so they
+ can easily be manipulated. The message queues will generally have
+ clear points of initialization and destruction. The cnodes are
+ much more delicate. User processes hold reference counts in Coda
+ filesystems and it can be difficult to clean up the cnodes.
+
+ It can expect requests through:
+
+ 1. the message subsystem
+
+ 2. the VFS layer
+
+ 3. pioctl interface
+
+ Currently the _p_i_o_c_t_l passes through the VFS for Coda so we can
+ treat these similarly.
+
+
+ 66..11.. RReeqquuiirreemmeennttss
+
+
+ The following requirements should be accommodated:
+
+ 1. The message queues should have open and close routines. On Unix
+ the opening of the character devices are such routines.
+
+ +o Before opening, no messages can be placed.
+
+ +o Opening will remove any old messages still pending.
+
+ +o Close will notify any sleeping processes that their upcall cannot
+ be completed.
+
+ +o Close will free all memory allocated by the message queues.
+
+
+ 2. At open the namecache shall be initialized to empty state.
+
+ 3. Before the message queues are open, all VFS operations will fail.
+ Fortunately this can be achieved by making sure than mounting the
+ Coda filesystem cannot succeed before opening.
+
+ 4. After closing of the queues, no VFS operations can succeed. Here
+ one needs to be careful, since a few operations (lookup,
+ read/write, readdir) can proceed without upcalls. These must be
+ explicitly blocked.
+
+ 5. Upon closing the namecache shall be flushed and disabled.
+
+ 6. All memory held by cnodes can be freed without relying on upcalls.
+
+ 7. Unmounting the file system can be done without relying on upcalls.
+
+ 8. Mounting the Coda filesystem should fail gracefully if Venus cannot
+ get the rootfid or the attributes of the rootfid. The latter is
+ best implemented by Venus fetching these objects before attempting
+ to mount.
+
+ NNOOTTEE NetBSD in particular but also Linux have not implemented the
+ above requirements fully. For smooth operation this needs to be
+ corrected.
+
+
+
diff --git a/Documentation/filesystems/configfs/Makefile b/Documentation/filesystems/configfs/Makefile
new file mode 100644
index 0000000..be7ec5e
--- /dev/null
+++ b/Documentation/filesystems/configfs/Makefile
@@ -0,0 +1,3 @@
+ifneq ($(CONFIG_CONFIGFS_FS),)
+obj-m += configfs_example_explicit.o configfs_example_macros.o
+endif
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
new file mode 100644
index 0000000..fabcb0e
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -0,0 +1,484 @@
+
+configfs - Userspace-driven kernel object configuration.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+ Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality. Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs. Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2). It may allow some attributes to be modified via
+write(2). The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2). It is destroyed via rmdir(2). The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together. Unlike sysfs, the
+lifetime of the representation is completely driven by userspace. The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system. One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel. You can access
+it by doing
+
+ mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems. Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config. Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2). The item's attributes will also
+appear at this time. readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values. Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file. The same efficiency caveats from sysfs
+apply. Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once. When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back. Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2). An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)). Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices. Call it FakeNBD. FakeNBD uses configfs
+for its configuration. Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it. Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+ # ls /config
+ fakenbd
+
+A fakenbd connection can be created with mkdir(2). The name is
+arbitrary, but likely the tool will make some use of the name. Perhaps
+it is a uuid or a disk name:
+
+ # mkdir /config/fakenbd/disk1
+ # ls /config/fakenbd/disk1
+ target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to. The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+ # echo 10.0.0.1 > /config/fakenbd/disk1/target
+ # echo /dev/sda1 > /config/fakenbd/disk1/device
+ # echo 1 > /config/fakenbd/disk1/rw
+
+That's it. That's all there is. Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item. A config_item reflects an
+object in the subsystem. It has attributes that match values on that
+object. configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group. A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that. The group has a set of operations to perform these tasks
+
+A subsystem is the top level of a client module. During initialization,
+the client module registers the subsystem with configfs, the subsystem
+appears as a directory at the top of the configfs filesystem. A
+subsystem is also a config_group, and can do everything a config_group
+can.
+
+[struct config_item]
+
+ struct config_item {
+ char *ci_name;
+ char ci_namebuf[UOBJ_NAME_LEN];
+ struct kref ci_kref;
+ struct list_head ci_entry;
+ struct config_item *ci_parent;
+ struct config_group *ci_group;
+ struct config_item_type *ci_type;
+ struct dentry *ci_dentry;
+ };
+
+ void config_item_init(struct config_item *);
+ void config_item_init_type_name(struct config_item *,
+ const char *name,
+ struct config_item_type *type);
+ struct config_item *config_item_get(struct config_item *);
+ void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing. The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it. This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things. For that, it needs a type.
+
+[struct config_item_type]
+
+ struct configfs_item_operations {
+ void (*release)(struct config_item *);
+ ssize_t (*show_attribute)(struct config_item *,
+ struct configfs_attribute *,
+ char *);
+ ssize_t (*store_attribute)(struct config_item *,
+ struct configfs_attribute *,
+ const char *, size_t);
+ int (*allow_link)(struct config_item *src,
+ struct config_item *target);
+ int (*drop_link)(struct config_item *src,
+ struct config_item *target);
+ };
+
+ struct config_item_type {
+ struct module *ct_owner;
+ struct configfs_item_operations *ct_item_ops;
+ struct configfs_group_operations *ct_group_ops;
+ struct configfs_attribute **ct_attrs;
+ };
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item. All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method. This method is called when the config_item's reference count
+reaches zero. Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method. Similarly, storing a new
+attribute value uses the store_attribute() method.
+
+[struct configfs_attribute]
+
+ struct configfs_attribute {
+ char *ca_name;
+ struct module *ca_owner;
+ mode_t ca_mode;
+ };
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs. When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename. configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute. The converse
+will happen for write(2).
+
+[struct config_group]
+
+A config_item cannot live in a vacuum. The only way one can be created
+is via mkdir(2) on a config_group. This will trigger creation of a
+child item.
+
+ struct config_group {
+ struct config_item cg_item;
+ struct list_head cg_children;
+ struct configfs_subsystem *cg_subsys;
+ struct config_group **default_groups;
+ };
+
+ void config_group_init(struct config_group *group);
+ void config_group_init_type_name(struct config_group *group,
+ const char *name,
+ struct config_item_type *type);
+
+
+The config_group structure contains a config_item. Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups. This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+ struct configfs_group_operations {
+ struct config_item *(*make_item)(struct config_group *group,
+ const char *name);
+ struct config_group *(*make_group)(struct config_group *group,
+ const char *name);
+ int (*commit_item)(struct config_item *item);
+ void (*disconnect_notify)(struct config_group *group,
+ struct config_item *item);
+ void (*drop_item)(struct config_group *group,
+ struct config_item *item);
+ };
+
+A group creates child items by providing the
+ct_group_ops->make_item() method. If provided, this method is called from mkdir(2) in the group's directory. The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs. Configfs will then populate the filesystem
+tree to reflect the new item.
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group(). Everything else behaves the same,
+using the group _init() functions on the group.
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called. As a config_group is also a
+config_item, it is not necessary for a separate drop_group() method.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation. If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
+
+IMPORTANT: drop_item() is void, and as such cannot fail. When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy). The subsystem is
+responsible for responding to this. If the subsystem has references to
+the item in other threads, the memory is safe. It may take some time
+for the item to actually disappear from the subsystem's usage. But it
+is gone from configfs.
+
+When drop_item() is called, the item's linkage has already been torn
+down. It no longer has a reference on its parent and has no place in
+the item hierarchy. If a client needs to do some cleanup before this
+teardown happens, the subsystem can implement the
+ct_group_ops->disconnect_notify() method. The method is called after
+configfs has removed the item from the filesystem view but before the
+item is removed from its parent group. Like drop_item(),
+disconnect_notify() is void and cannot fail. Client subsystems should
+not drop any references here, as they still must do it in drop_item().
+
+A config_group cannot be removed while it still has child items. This
+is implemented in the configfs rmdir(2) code. ->drop_item() will not be
+called, as the item has not been dropped. rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, usually at module_init time. This
+tells configfs to make the subsystem appear in the file tree.
+
+ struct configfs_subsystem {
+ struct config_group su_group;
+ struct mutex su_mutex;
+ };
+
+ int configfs_register_subsystem(struct configfs_subsystem *subsys);
+ void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+ A subsystem consists of a toplevel config_group and a mutex.
+The group is where child config_items are created. For a subsystem,
+this group is usually defined statically. Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the mutex.
+ When the register call returns, the subsystem is live, and it
+will be visible via configfs. At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example_explicit.c
+and configfs_example_macros.c. It shows a trivial object displaying and
+storing an attribute, and a simple group creating and destroying these
+children.
+
+The only difference between configfs_example_explicit.c and
+configfs_example_macros.c is how the attributes of the childless item
+are defined. The childless item has extended attributes, each with
+their own show()/store() operation. This follows a convention commonly
+used in sysfs. configfs_example_explicit.c creates these attributes
+by explicitly defining the structures involved. Conversely
+configfs_example_macros.c uses some convenience macros from configfs.h
+to define the attributes. These macros are similar to their sysfs
+counterparts.
+
+[Hierarchy Navigation and the Subsystem Mutex]
+
+There is an extra bonus that configfs provides. The config_groups and
+config_items are arranged in a hierarchy due to the fact that they
+appear in a filesystem. A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy. For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem. This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem mutex to
+protect modifications. Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+mutex.
+
+A subsystem will be prevented from acquiring the mutex while a newly
+allocated item has not been linked into this hierarchy. Similarly, it
+will not be able to acquire the mutex while a dropping item has not
+yet been unlinked. This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration. This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+mutex.
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship. Often, however, a larger environment requires aggregation
+outside of the parent/child connection. This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods. If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items. Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item. If the source item
+allows linking to target item, it returns 0. A source item may wish to
+reject a link if it only wants links to a certain type of object (say,
+in its own subsystem).
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method. Like the ->drop_item() method,
+this is a void function and cannot return failure. The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it. Dangling symlinks are not
+allowed in configfs.
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation. Thus,
+mkdir("parent") results in "parent", "parent/subgroup1", up through
+"parent/subgroupN". Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group. If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group. Similarly, they are removed at the same time
+as the parent. No extra notification is provided. When a ->drop_item()
+method call notifies the subsystem the parent group is going away, it
+also means every default group child associated with that parent group.
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2). They also are not considered when rmdir(2) on the parent
+group is checking for children.
+
+[Dependant Subsystems]
+
+Sometimes other drivers depend on particular configfs items. For
+example, ocfs2 mounts depend on a heartbeat region item. If that
+region item is removed with rmdir(2), the ocfs2 mount must BUG or go
+readonly. Not happy.
+
+configfs provides two additional API calls: configfs_depend_item() and
+configfs_undepend_item(). A client driver can call
+configfs_depend_item() on an existing item to tell configfs that it is
+depended on. configfs will then return -EBUSY from rmdir(2) for that
+item. When the item is no longer depended on, the client driver calls
+configfs_undepend_item() on it.
+
+These API cannot be called underneath any configfs callbacks, as
+they will conflict. They can block and allocate. A client driver
+probably shouldn't calling them of its own gumption. Rather it should
+be providing an API that external subsystems call.
+
+How does this work? Imagine the ocfs2 mount process. When it mounts,
+it asks for a heartbeat region item. This is done via a call into the
+heartbeat code. Inside the heartbeat code, the region item is looked
+up. Here, the heartbeat code calls configfs_depend_item(). If it
+succeeds, then heartbeat knows the region is safe to give to ocfs2.
+If it fails, it was being torn down anyway, and heartbeat can gracefully
+pass up an error.
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state. That is, no
+default values can be specified for the item's attributes such that the
+item can do its work. Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above. Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects. This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized. Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go. More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense. configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations. An item is
+committed via rename(2). The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items. When this group appears in configfs, mkdir(2) will
+not work directly in the group. Instead, the group will have two
+subdirectories: "live" and "pending". The "live" directory does not
+support mkdir(2) or rmdir(2) either. It only allows rename(2). The
+"pending" directory does allow mkdir(2) and rmdir(2). An item is
+created in the "pending" directory. Its attributes can be modified at
+will. Userspace commits the item by renaming it into the "live"
+directory. At this point, the subsystem receives the ->commit_item()
+callback. If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shutdown, or "uncommitted". Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one. The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
+
+
diff --git a/Documentation/filesystems/configfs/configfs_example_explicit.c b/Documentation/filesystems/configfs/configfs_example_explicit.c
new file mode 100644
index 0000000..d428cc9
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example_explicit.c
@@ -0,0 +1,485 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example_explicit.c - This file is a demonstration module
+ * containing a number of configfs subsystems. It explicitly defines
+ * each structure without using the helper macros defined in
+ * configfs.h.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle. All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem. It cannot create
+ * any config_items. It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem. See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+ struct configfs_subsystem subsys;
+ int showme;
+ int storeme;
+};
+
+struct childless_attribute {
+ struct configfs_attribute attr;
+ ssize_t (*show)(struct childless *, char *);
+ ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+ return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+ char *page)
+{
+ ssize_t pos;
+
+ pos = sprintf(page, "%d\n", childless->showme);
+ childless->showme++;
+
+ return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+ char *page)
+{
+ return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+ const char *page,
+ size_t count)
+{
+ unsigned long tmp;
+ char *p = (char *) page;
+
+ tmp = simple_strtoul(p, &p, 10);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
+
+ if (tmp > INT_MAX)
+ return -ERANGE;
+
+ childless->storeme = tmp;
+
+ return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+ char *page)
+{
+ return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs. It does not support the creation of child config_items.\n"
+"It only has a few attributes. In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+ .attr = { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+ .show = childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+ .attr = { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+ .show = childless_storeme_read,
+ .store = childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+ .attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+ .show = childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+ &childless_attr_showme.attr,
+ &childless_attr_storeme.attr,
+ &childless_attr_description.attr,
+ NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ struct childless *childless = to_childless(item);
+ struct childless_attribute *childless_attr =
+ container_of(attr, struct childless_attribute, attr);
+ ssize_t ret = 0;
+
+ if (childless_attr->show)
+ ret = childless_attr->show(childless, page);
+ return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+ struct configfs_attribute *attr,
+ const char *page, size_t count)
+{
+ struct childless *childless = to_childless(item);
+ struct childless_attribute *childless_attr =
+ container_of(attr, struct childless_attribute, attr);
+ ssize_t ret = -EINVAL;
+
+ if (childless_attr->store)
+ ret = childless_attr->store(childless, page, count);
+ return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+ .show_attribute = childless_attr_show,
+ .store_attribute = childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+ .ct_item_ops = &childless_item_ops,
+ .ct_attrs = childless_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "01-childless",
+ .ci_type = &childless_type,
+ },
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child. Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go. Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+ struct config_item item;
+ int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+ return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "storeme",
+ .ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+ &simple_child_attr_storeme,
+ NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ ssize_t count;
+ struct simple_child *simple_child = to_simple_child(item);
+
+ count = sprintf(page, "%d\n", simple_child->storeme);
+
+ return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+ struct configfs_attribute *attr,
+ const char *page, size_t count)
+{
+ struct simple_child *simple_child = to_simple_child(item);
+ unsigned long tmp;
+ char *p = (char *) page;
+
+ tmp = simple_strtoul(p, &p, 10);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
+
+ if (tmp > INT_MAX)
+ return -ERANGE;
+
+ simple_child->storeme = tmp;
+
+ return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+ kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+ .release = simple_child_release,
+ .show_attribute = simple_child_attr_show,
+ .store_attribute = simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+ .ct_item_ops = &simple_child_item_ops,
+ .ct_attrs = simple_child_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+
+struct simple_children {
+ struct config_group group;
+};
+
+static inline struct simple_children *to_simple_children(struct config_item *item)
+{
+ return item ? container_of(to_config_group(item), struct simple_children, group) : NULL;
+}
+
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+ struct simple_child *simple_child;
+
+ simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL);
+ if (!simple_child)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&simple_child->item, name,
+ &simple_child_type);
+
+ simple_child->storeme = 0;
+
+ return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "description",
+ .ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+ &simple_children_attr_description,
+ NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items. These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static void simple_children_release(struct config_item *item)
+{
+ kfree(to_simple_children(item));
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+ .release = simple_children_release,
+ .show_attribute = simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+ .make_item = simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+ .ct_item_ops = &simple_children_item_ops,
+ .ct_group_ops = &simple_children_group_ops,
+ .ct_attrs = simple_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "02-simple-children",
+ .ci_type = &simple_children_type,
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above. However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem. Creation of a group in the subsystem creates
+ * a new simple_children group. That group can then have simple_child
+ * children of its own.
+ */
+
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+ struct simple_children *simple_children;
+
+ simple_children = kzalloc(sizeof(struct simple_children),
+ GFP_KERNEL);
+ if (!simple_children)
+ return ERR_PTR(-ENOMEM);
+
+ config_group_init_type_name(&simple_children->group, name,
+ &simple_children_type);
+
+ return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "description",
+ .ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+ &group_children_attr_description,
+ NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups. These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+ .show_attribute = group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+ .make_group = group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+ .ct_item_ops = &group_children_item_ops,
+ .ct_group_ops = &group_children_group_ops,
+ .ct_attrs = group_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "03-group-children",
+ .ci_type = &group_children_type,
+ },
+ },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all. It
+ * allows the init function to easily register them. Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+ &childless_subsys.subsys,
+ &simple_children_subsys,
+ &group_children_subsys,
+ NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+ int ret;
+ int i;
+ struct configfs_subsystem *subsys;
+
+ for (i = 0; example_subsys[i]; i++) {
+ subsys = example_subsys[i];
+
+ config_group_init(&subsys->su_group);
+ mutex_init(&subsys->su_mutex);
+ ret = configfs_register_subsystem(subsys);
+ if (ret) {
+ printk(KERN_ERR "Error %d while registering subsystem %s\n",
+ ret,
+ subsys->su_group.cg_item.ci_namebuf);
+ goto out_unregister;
+ }
+ }
+
+ return 0;
+
+out_unregister:
+ for (; i >= 0; i--) {
+ configfs_unregister_subsystem(example_subsys[i]);
+ }
+
+ return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+ int i;
+
+ for (i = 0; example_subsys[i]; i++) {
+ configfs_unregister_subsystem(example_subsys[i]);
+ }
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/Documentation/filesystems/configfs/configfs_example_macros.c b/Documentation/filesystems/configfs/configfs_example_macros.c
new file mode 100644
index 0000000..d8e30a0
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example_macros.c
@@ -0,0 +1,448 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example_macros.c - This file is a demonstration module
+ * containing a number of configfs subsystems. It uses the helper
+ * macros defined by configfs.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle. All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem. It cannot create
+ * any config_items. It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem. See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+ struct configfs_subsystem subsys;
+ int showme;
+ int storeme;
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+ return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
+}
+
+CONFIGFS_ATTR_STRUCT(childless);
+#define CHILDLESS_ATTR(_name, _mode, _show, _store) \
+struct childless_attribute childless_attr_##_name = __CONFIGFS_ATTR(_name, _mode, _show, _store)
+#define CHILDLESS_ATTR_RO(_name, _show) \
+struct childless_attribute childless_attr_##_name = __CONFIGFS_ATTR_RO(_name, _show);
+
+static ssize_t childless_showme_read(struct childless *childless,
+ char *page)
+{
+ ssize_t pos;
+
+ pos = sprintf(page, "%d\n", childless->showme);
+ childless->showme++;
+
+ return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+ char *page)
+{
+ return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+ const char *page,
+ size_t count)
+{
+ unsigned long tmp;
+ char *p = (char *) page;
+
+ tmp = simple_strtoul(p, &p, 10);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
+
+ if (tmp > INT_MAX)
+ return -ERANGE;
+
+ childless->storeme = tmp;
+
+ return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+ char *page)
+{
+ return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs. It does not support the creation of child config_items.\n"
+"It only has a few attributes. In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+CHILDLESS_ATTR_RO(showme, childless_showme_read);
+CHILDLESS_ATTR(storeme, S_IRUGO | S_IWUSR, childless_storeme_read,
+ childless_storeme_write);
+CHILDLESS_ATTR_RO(description, childless_description_read);
+
+static struct configfs_attribute *childless_attrs[] = {
+ &childless_attr_showme.attr,
+ &childless_attr_storeme.attr,
+ &childless_attr_description.attr,
+ NULL,
+};
+
+CONFIGFS_ATTR_OPS(childless);
+static struct configfs_item_operations childless_item_ops = {
+ .show_attribute = childless_attr_show,
+ .store_attribute = childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+ .ct_item_ops = &childless_item_ops,
+ .ct_attrs = childless_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "01-childless",
+ .ci_type = &childless_type,
+ },
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child. Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go. Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+ struct config_item item;
+ int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+ return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "storeme",
+ .ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+ &simple_child_attr_storeme,
+ NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ ssize_t count;
+ struct simple_child *simple_child = to_simple_child(item);
+
+ count = sprintf(page, "%d\n", simple_child->storeme);
+
+ return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+ struct configfs_attribute *attr,
+ const char *page, size_t count)
+{
+ struct simple_child *simple_child = to_simple_child(item);
+ unsigned long tmp;
+ char *p = (char *) page;
+
+ tmp = simple_strtoul(p, &p, 10);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
+
+ if (tmp > INT_MAX)
+ return -ERANGE;
+
+ simple_child->storeme = tmp;
+
+ return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+ kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+ .release = simple_child_release,
+ .show_attribute = simple_child_attr_show,
+ .store_attribute = simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+ .ct_item_ops = &simple_child_item_ops,
+ .ct_attrs = simple_child_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+
+struct simple_children {
+ struct config_group group;
+};
+
+static inline struct simple_children *to_simple_children(struct config_item *item)
+{
+ return item ? container_of(to_config_group(item), struct simple_children, group) : NULL;
+}
+
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+ struct simple_child *simple_child;
+
+ simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL);
+ if (!simple_child)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&simple_child->item, name,
+ &simple_child_type);
+
+ simple_child->storeme = 0;
+
+ return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "description",
+ .ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+ &simple_children_attr_description,
+ NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items. These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static void simple_children_release(struct config_item *item)
+{
+ kfree(to_simple_children(item));
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+ .release = simple_children_release,
+ .show_attribute = simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+ .make_item = simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+ .ct_item_ops = &simple_children_item_ops,
+ .ct_group_ops = &simple_children_group_ops,
+ .ct_attrs = simple_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "02-simple-children",
+ .ci_type = &simple_children_type,
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above. However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem. Creation of a group in the subsystem creates
+ * a new simple_children group. That group can then have simple_child
+ * children of its own.
+ */
+
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+ struct simple_children *simple_children;
+
+ simple_children = kzalloc(sizeof(struct simple_children),
+ GFP_KERNEL);
+ if (!simple_children)
+ return ERR_PTR(-ENOMEM);
+
+ config_group_init_type_name(&simple_children->group, name,
+ &simple_children_type);
+
+ return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "description",
+ .ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+ &group_children_attr_description,
+ NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups. These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+ .show_attribute = group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+ .make_group = group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+ .ct_item_ops = &group_children_item_ops,
+ .ct_group_ops = &group_children_group_ops,
+ .ct_attrs = group_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "03-group-children",
+ .ci_type = &group_children_type,
+ },
+ },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all. It
+ * allows the init function to easily register them. Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+ &childless_subsys.subsys,
+ &simple_children_subsys,
+ &group_children_subsys,
+ NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+ int ret;
+ int i;
+ struct configfs_subsystem *subsys;
+
+ for (i = 0; example_subsys[i]; i++) {
+ subsys = example_subsys[i];
+
+ config_group_init(&subsys->su_group);
+ mutex_init(&subsys->su_mutex);
+ ret = configfs_register_subsystem(subsys);
+ if (ret) {
+ printk(KERN_ERR "Error %d while registering subsystem %s\n",
+ ret,
+ subsys->su_group.cg_item.ci_namebuf);
+ goto out_unregister;
+ }
+ }
+
+ return 0;
+
+out_unregister:
+ for (; i >= 0; i--) {
+ configfs_unregister_subsystem(example_subsys[i]);
+ }
+
+ return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+ int i;
+
+ for (i = 0; example_subsys[i]; i++) {
+ configfs_unregister_subsystem(example_subsys[i]);
+ }
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/Documentation/filesystems/cramfs.txt b/Documentation/filesystems/cramfs.txt
new file mode 100644
index 0000000..31f53f0
--- /dev/null
+++ b/Documentation/filesystems/cramfs.txt
@@ -0,0 +1,76 @@
+
+ Cramfs - cram a filesystem onto a small ROM
+
+cramfs is designed to be simple and small, and to compress things well.
+
+It uses the zlib routines to compress a file one page at a time, and
+allows random page access. The meta-data is not compressed, but is
+expressed in a very terse representation to make it use much less
+diskspace than traditional filesystems.
+
+You can't write to a cramfs filesystem (making it compressible and
+compact also makes it _very_ hard to update on-the-fly), so you have to
+create the disk image with the "mkcramfs" utility.
+
+
+Usage Notes
+-----------
+
+File sizes are limited to less than 16MB.
+
+Maximum filesystem size is a little over 256MB. (The last file on the
+filesystem is allowed to extend past 256MB.)
+
+Only the low 8 bits of gid are stored. The current version of
+mkcramfs simply truncates to 8 bits, which is a potential security
+issue.
+
+Hard links are supported, but hard linked files
+will still have a link count of 1 in the cramfs image.
+
+Cramfs directories have no `.' or `..' entries. Directories (like
+every other file on cramfs) always have a link count of 1. (There's
+no need to use -noleaf in `find', btw.)
+
+No timestamps are stored in a cramfs, so these default to the epoch
+(1970 GMT). Recently-accessed files may have updated timestamps, but
+the update lasts only as long as the inode is cached in memory, after
+which the timestamp reverts to 1970, i.e. moves backwards in time.
+
+Currently, cramfs must be written and read with architectures of the
+same endianness, and can be read only by kernels with PAGE_CACHE_SIZE
+== 4096. At least the latter of these is a bug, but it hasn't been
+decided what the best fix is. For the moment if you have larger pages
+you can just change the #define in mkcramfs.c, so long as you don't
+mind the filesystem becoming unreadable to future kernels.
+
+
+For /usr/share/magic
+--------------------
+
+0 ulelong 0x28cd3d45 Linux cramfs offset 0
+>4 ulelong x size %d
+>8 ulelong x flags 0x%x
+>12 ulelong x future 0x%x
+>16 string >\0 signature "%.16s"
+>32 ulelong x fsid.crc 0x%x
+>36 ulelong x fsid.edition %d
+>40 ulelong x fsid.blocks %d
+>44 ulelong x fsid.files %d
+>48 string >\0 name "%.16s"
+512 ulelong 0x28cd3d45 Linux cramfs offset 512
+>516 ulelong x size %d
+>520 ulelong x flags 0x%x
+>524 ulelong x future 0x%x
+>528 string >\0 signature "%.16s"
+>544 ulelong x fsid.crc 0x%x
+>548 ulelong x fsid.edition %d
+>552 ulelong x fsid.blocks %d
+>556 ulelong x fsid.files %d
+>560 string >\0 name "%.16s"
+
+
+Hacker Notes
+------------
+
+See fs/cramfs/README for filesystem layout and implementation notes.
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt
new file mode 100644
index 0000000..4c0c575
--- /dev/null
+++ b/Documentation/filesystems/dentry-locking.txt
@@ -0,0 +1,173 @@
+RCU-based dcache locking model
+==============================
+
+On many workloads, the most common operation on dcache is to look up a
+dentry, given a parent dentry and the name of the child. Typically,
+for every open(), stat() etc., the dentry corresponding to the
+pathname will be looked up by walking the tree starting with the first
+component of the pathname and using that dentry along with the next
+component to look up the next level and so on. Since it is a frequent
+operation for workloads like multiuser environments and web servers,
+it is important to optimize this path.
+
+Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in
+every component during path look-up. Since 2.5.10 onwards, fast-walk
+algorithm changed this by holding the dcache_lock at the beginning and
+walking as many cached path component dentries as possible. This
+significantly decreases the number of acquisition of
+dcache_lock. However it also increases the lock hold time
+significantly and affects performance in large SMP machines. Since
+2.5.62 kernel, dcache has been using a new locking model that uses RCU
+to make dcache look-up lock-free.
+
+The current dcache locking model is not very different from the
+existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock
+protected the hash chain, d_child, d_alias, d_lru lists as well as
+d_inode and several other things like mount look-up. RCU-based changes
+affect only the way the hash chain is protected. For everything else
+the dcache_lock must be taken for both traversing as well as
+updating. The hash chain updates too take the dcache_lock. The
+significant change is the way d_lookup traverses the hash chain, it
+doesn't acquire the dcache_lock for this and rely on RCU to ensure
+that the dentry has not been *freed*.
+
+
+Dcache locking details
+======================
+
+For many multi-user workloads, open() and stat() on files are very
+frequently occurring operations. Both involve walking of path names to
+find the dentry corresponding to the concerned file. In 2.4 kernel,
+dcache_lock was held during look-up of each path component. Contention
+and cache-line bouncing of this global lock caused significant
+scalability problems. With the introduction of RCU in Linux kernel,
+this was worked around by making the look-up of path components during
+path walking lock-free.
+
+
+Safe lock-free look-up of dcache hash table
+===========================================
+
+Dcache is a complex data structure with the hash table entries also
+linked together in other lists. In 2.4 kernel, dcache_lock protected
+all the lists. We applied RCU only on hash chain walking. The rest of
+the lists are still protected by dcache_lock. Some of the important
+changes are :
+
+1. The deletion from hash chain is done using hlist_del_rcu() macro
+ which doesn't initialize next pointer of the deleted dentry and
+ this allows us to walk safely lock-free while a deletion is
+ happening.
+
+2. Insertion of a dentry into the hash table is done using
+ hlist_add_head_rcu() which take care of ordering the writes - the
+ writes to the dentry must be visible before the dentry is
+ inserted. This works in conjunction with hlist_for_each_rcu() while
+ walking the hash chain. The only requirement is that all
+ initialization to the dentry must be done before
+ hlist_add_head_rcu() since we don't have dcache_lock protection
+ while traversing the hash chain. This isn't different from the
+ existing code.
+
+3. The dentry looked up without holding dcache_lock by cannot be
+ returned for walking if it is unhashed. It then may have a NULL
+ d_inode or other bogosity since RCU doesn't protect the other
+ fields in the dentry. We therefore use a flag DCACHE_UNHASHED to
+ indicate unhashed dentries and use this in conjunction with a
+ per-dentry lock (d_lock). Once looked up without the dcache_lock,
+ we acquire the per-dentry lock (d_lock) and check if the dentry is
+ unhashed. If so, the look-up is failed. If not, the reference count
+ of the dentry is increased and the dentry is returned.
+
+4. Once a dentry is looked up, it must be ensured during the path walk
+ for that component it doesn't go away. In pre-2.5.10 code, this was
+ done holding a reference to the dentry. dcache_rcu does the same.
+ In some sense, dcache_rcu path walking looks like the pre-2.5.10
+ version.
+
+5. All dentry hash chain updates must take the dcache_lock as well as
+ the per-dentry lock in that order. dput() does this to ensure that
+ a dentry that has just been looked up in another CPU doesn't get
+ deleted before dget() can be done on it.
+
+6. There are several ways to do reference counting of RCU protected
+ objects. One such example is in ipv4 route cache where deferred
+ freeing (using call_rcu()) is done as soon as the reference count
+ goes to zero. This cannot be done in the case of dentries because
+ tearing down of dentries require blocking (dentry_iput()) which
+ isn't supported from RCU callbacks. Instead, tearing down of
+ dentries happen synchronously in dput(), but actual freeing happens
+ later when RCU grace period is over. This allows safe lock-free
+ walking of the hash chains, but a matched dentry may have been
+ partially torn down. The checking of DCACHE_UNHASHED flag with
+ d_lock held detects such dentries and prevents them from being
+ returned from look-up.
+
+
+Maintaining POSIX rename semantics
+==================================
+
+Since look-up of dentries is lock-free, it can race against a
+concurrent rename operation. For example, during rename of file A to
+B, look-up of either A or B must succeed. So, if look-up of B happens
+after A has been removed from the hash chain but not added to the new
+hash chain, it may fail. Also, a comparison while the name is being
+written concurrently by a rename may result in false positive matches
+violating rename semantics. Issues related to race with rename are
+handled as described below :
+
+1. Look-up can be done in two ways - d_lookup() which is safe from
+ simultaneous renames and __d_lookup() which is not. If
+ __d_lookup() fails, it must be followed up by a d_lookup() to
+ correctly determine whether a dentry is in the hash table or
+ not. d_lookup() protects look-ups using a sequence lock
+ (rename_lock).
+
+2. The name associated with a dentry (d_name) may be changed if a
+ rename is allowed to happen simultaneously. To avoid memcmp() in
+ __d_lookup() go out of bounds due to a rename and false positive
+ comparison, the name comparison is done while holding the
+ per-dentry lock. This prevents concurrent renames during this
+ operation.
+
+3. Hash table walking during look-up may move to a different bucket as
+ the current dentry is moved to a different bucket due to rename.
+ But we use hlists in dcache hash table and they are
+ null-terminated. So, even if a dentry moves to a different bucket,
+ hash chain walk will terminate. [with a list_head list, it may not
+ since termination is when the list_head in the original bucket is
+ reached]. Since we redo the d_parent check and compare name while
+ holding d_lock, lock-free look-up will not race against d_move().
+
+4. There can be a theoretical race when a dentry keeps coming back to
+ original bucket due to double moves. Due to this look-up may
+ consider that it has never moved and can end up in a infinite loop.
+ But this is not any worse that theoretical livelocks we already
+ have in the kernel.
+
+
+Important guidelines for filesystem developers related to dcache_rcu
+====================================================================
+
+1. Existing dcache interfaces (pre-2.5.62) exported to filesystem
+ don't change. Only dcache internal implementation changes. However
+ filesystems *must not* delete from the dentry hash chains directly
+ using the list macros like allowed earlier. They must use dcache
+ APIs like d_drop() or __d_drop() depending on the situation.
+
+2. d_flags is now protected by a per-dentry lock (d_lock). All access
+ to d_flags must be protected by it.
+
+3. For a hashed dentry, checking of d_count needs to be protected by
+ d_lock.
+
+
+Papers and other documentation on dcache locking
+================================================
+
+1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124).
+
+2. http://lse.sourceforge.net/locking/dcache/dcache.html
+
+
+
diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking
new file mode 100644
index 0000000..ff7b611
--- /dev/null
+++ b/Documentation/filesystems/directory-locking
@@ -0,0 +1,114 @@
+ Locking scheme used for directory operations is based on two
+kinds of locks - per-inode (->i_mutex) and per-filesystem
+(->s_vfs_rename_mutex).
+
+ For our purposes all operations fall in 5 classes:
+
+1) read access. Locking rules: caller locks directory we are accessing.
+
+2) object creation. Locking rules: same as above.
+
+3) object removal. Locking rules: caller locks parent, finds victim,
+locks victim and calls the method.
+
+4) rename() that is _not_ cross-directory. Locking rules: caller locks
+the parent, finds source and target, if target already exists - locks it
+and then calls the method.
+
+5) link creation. Locking rules:
+ * lock parent
+ * check that source is not a directory
+ * lock source
+ * call the method.
+
+6) cross-directory rename. The trickiest in the whole bunch. Locking
+rules:
+ * lock the filesystem
+ * lock parents in "ancestors first" order.
+ * find source and target.
+ * if old parent is equal to or is a descendent of target
+ fail with -ENOTEMPTY
+ * if new parent is equal to or is a descendent of source
+ fail with -ELOOP
+ * if target exists - lock it.
+ * call the method.
+
+
+The rules above obviously guarantee that all directories that are going to be
+read, modified or removed by method will be locked by caller.
+
+
+If no directory is its own ancestor, the scheme above is deadlock-free.
+Proof:
+
+ First of all, at any moment we have a partial ordering of the
+objects - A < B iff A is an ancestor of B.
+
+ That ordering can change. However, the following is true:
+
+(1) if object removal or non-cross-directory rename holds lock on A and
+ attempts to acquire lock on B, A will remain the parent of B until we
+ acquire the lock on B. (Proof: only cross-directory rename can change
+ the parent of object and it would have to lock the parent).
+
+(2) if cross-directory rename holds the lock on filesystem, order will not
+ change until rename acquires all locks. (Proof: other cross-directory
+ renames will be blocked on filesystem lock and we don't start changing
+ the order until we had acquired all locks).
+
+(3) any operation holds at most one lock on non-directory object and
+ that lock is acquired after all other locks. (Proof: see descriptions
+ of operations).
+
+ Now consider the minimal deadlock. Each process is blocked on
+attempt to acquire some lock and already holds at least one lock. Let's
+consider the set of contended locks. First of all, filesystem lock is
+not contended, since any process blocked on it is not holding any locks.
+Thus all processes are blocked on ->i_mutex.
+
+ Non-directory objects are not contended due to (3). Thus link
+creation can't be a part of deadlock - it can't be blocked on source
+and it means that it doesn't hold any locks.
+
+ Any contended object is either held by cross-directory rename or
+has a child that is also contended. Indeed, suppose that it is held by
+operation other than cross-directory rename. Then the lock this operation
+is blocked on belongs to child of that object due to (1).
+
+ It means that one of the operations is cross-directory rename.
+Otherwise the set of contended objects would be infinite - each of them
+would have a contended child and we had assumed that no object is its
+own descendent. Moreover, there is exactly one cross-directory rename
+(see above).
+
+ Consider the object blocking the cross-directory rename. One
+of its descendents is locked by cross-directory rename (otherwise we
+would again have an infinite set of contended objects). But that
+means that cross-directory rename is taking locks out of order. Due
+to (2) the order hadn't changed since we had acquired filesystem lock.
+But locking rules for cross-directory rename guarantee that we do not
+try to acquire lock on descendent before the lock on ancestor.
+Contradiction. I.e. deadlock is impossible. Q.E.D.
+
+
+ These operations are guaranteed to avoid loop creation. Indeed,
+the only operation that could introduce loops is cross-directory rename.
+Since the only new (parent, child) pair added by rename() is (new parent,
+source), such loop would have to contain these objects and the rest of it
+would have to exist before rename(). I.e. at the moment of loop creation
+rename() responsible for that would be holding filesystem lock and new parent
+would have to be equal to or a descendent of source. But that means that
+new parent had been equal to or a descendent of source since the moment when
+we had acquired filesystem lock and rename() would fail with -ELOOP in that
+case.
+
+ While this locking scheme works for arbitrary DAGs, it relies on
+ability to check that directory is a descendent of another object. Current
+implementation assumes that directory graph is a tree. This assumption is
+also preserved by all operations (cross-directory rename on a tree that would
+not introduce a cycle will leave it a tree and link() fails for directories).
+
+ Notice that "directory" in the above == "anything that might have
+children", so if we are going to introduce hybrid objects we will need
+either to make sure that link(2) doesn't work for them or to make changes
+in is_subdir() that would make it work even in presence of such beasts.
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
new file mode 100644
index 0000000..c50bbb2
--- /dev/null
+++ b/Documentation/filesystems/dlmfs.txt
@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page: http://oss.oracle.com/projects/ocfs2
+Tools web page: http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh <mark.fasheh@oracle.com>
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+ DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt. The
+rest of this document will be geared towards those who want to use
+dlmfs for easy to setup and easy to use clustered locking in
+userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place. Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access. The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find it's heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking api.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call. Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read, and Exclusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call. The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
+small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory)
+
+rmdir(2) signals dlmfs to leave the domain
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory. Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation. If the lock succeeds, you'll get an fd.
+
+open(2) with O_CREAT to ensure the resource inode is created - dlmfs does
+not automatically create inodes for existing lock resources.
+
+Open Flag Lock Request Type
+--------- -----------------
+O_RDONLY Shared Read
+O_RDWR Exclusive
+
+Open Flag Resulting Locking Behavior
+--------- --------------------------
+O_NONBLOCK Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource then open(2) will return ETXTBUSY.
+
+close(2) drops the lock associated with your fd.
+
+Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
+supported locally as well. This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call. It can be written via
+write(2) only when open in Exclusive mode.
+
+Once written, an LVB will be visible to other nodes who obtain Read
+Only or higher level locks on the resource.
+
+See Also
+========
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+
+For more information on the VMS distributed locking API.
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt
new file mode 100644
index 0000000..9f5d338
--- /dev/null
+++ b/Documentation/filesystems/dnotify.txt
@@ -0,0 +1,99 @@
+ Linux Directory Notification
+ ============================
+
+ Stephen Rothwell <sfr@canb.auug.org.au>
+
+The intention of directory notification is to allow user applications
+to be notified when a directory, or any of the files in it, are changed.
+The basic mechanism involves the application registering for notification
+on a directory using a fcntl(2) call and the notifications themselves
+being delivered using signals.
+
+The application decides which "events" it wants to be notified about.
+The currently defined events are:
+
+ DN_ACCESS A file in the directory was accessed (read)
+ DN_MODIFY A file in the directory was modified (write,truncate)
+ DN_CREATE A file was created in the directory
+ DN_DELETE A file was unlinked from directory
+ DN_RENAME A file in the directory was renamed
+ DN_ATTRIB A file in the directory had its attributes
+ changed (chmod,chown)
+
+Usually, the application must reregister after each notification, but
+if DN_MULTISHOT is or'ed with the event mask, then the registration will
+remain until explicitly removed (by registering for no events).
+
+By default, SIGIO will be delivered to the process and no other useful
+information. However, if the F_SETSIG fcntl(2) call is used to let the
+kernel know which signal to deliver, a siginfo structure will be passed to
+the signal handler and the si_fd member of that structure will contain the
+file descriptor associated with the directory in which the event occurred.
+
+Preferably the application will choose one of the real time signals
+(SIGRTMIN + <n>) so that the notifications may be queued. This is
+especially important if DN_MULTISHOT is specified. Note that SIGRTMIN
+is often blocked, so it is better to use (at least) SIGRTMIN + 1.
+
+Implementation expectations (features and bugs :-))
+---------------------------
+
+The notification should work for any local access to files even if the
+actual file system is on a remote server. This implies that remote
+access to files served by local user mode servers should be notified.
+Also, remote accesses to files served by a local kernel NFS server should
+be notified.
+
+In order to make the impact on the file system code as small as possible,
+the problem of hard links to files has been ignored. So if a file (x)
+exists in two directories (a and b) then a change to the file using the
+name "a/x" should be notified to a program expecting notifications on
+directory "a", but will not be notified to one expecting notifications on
+directory "b".
+
+Also, files that are unlinked, will still cause notifications in the
+last directory that they were linked to.
+
+Configuration
+-------------
+
+Dnotify is controlled via the CONFIG_DNOTIFY configuration option. When
+disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.
+
+Example
+-------
+
+ #define _GNU_SOURCE /* needed to get the defines */
+ #include <fcntl.h> /* in glibc 2.2 this has the needed
+ values defined */
+ #include <signal.h>
+ #include <stdio.h>
+ #include <unistd.h>
+
+ static volatile int event_fd;
+
+ static void handler(int sig, siginfo_t *si, void *data)
+ {
+ event_fd = si->si_fd;
+ }
+
+ int main(void)
+ {
+ struct sigaction act;
+ int fd;
+
+ act.sa_sigaction = handler;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ sigaction(SIGRTMIN + 1, &act, NULL);
+
+ fd = open(".", O_RDONLY);
+ fcntl(fd, F_SETSIG, SIGRTMIN + 1);
+ fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
+ /* we will now be notified if any of the files
+ in "." is modified or new files are created */
+ while (1) {
+ pause();
+ printf("Got event on fd=%d\n", event_fd);
+ }
+ }
diff --git a/Documentation/filesystems/ecryptfs.txt b/Documentation/filesystems/ecryptfs.txt
new file mode 100644
index 0000000..01d8a08
--- /dev/null
+++ b/Documentation/filesystems/ecryptfs.txt
@@ -0,0 +1,77 @@
+eCryptfs: A stacked cryptographic filesystem for Linux
+
+eCryptfs is free software. Please see the file COPYING for details.
+For documentation, please see the files in the doc/ subdirectory. For
+building and installation instructions please see the INSTALL file.
+
+Maintainer: Phillip Hellewell
+Lead developer: Michael A. Halcrow <mhalcrow@us.ibm.com>
+Developers: Michael C. Thompson
+ Kent Yoder
+Web Site: http://ecryptfs.sf.net
+
+This software is currently undergoing development. Make sure to
+maintain a backup copy of any data you write into eCryptfs.
+
+eCryptfs requires the userspace tools downloadable from the
+SourceForge site:
+
+http://sourceforge.net/projects/ecryptfs/
+
+Userspace requirements include:
+ - David Howells' userspace keyring headers and libraries (version
+ 1.0 or higher), obtainable from
+ http://people.redhat.com/~dhowells/keyutils/
+ - Libgcrypt
+
+
+NOTES
+
+In the beta/experimental releases of eCryptfs, when you upgrade
+eCryptfs, you should copy the files to an unencrypted location and
+then copy the files back into the new eCryptfs mount to migrate the
+files.
+
+
+MOUNT-WIDE PASSPHRASE
+
+Create a new directory into which eCryptfs will write its encrypted
+files (i.e., /root/crypt). Then, create the mount point directory
+(i.e., /mnt/crypt). Now it's time to mount eCryptfs:
+
+mount -t ecryptfs /root/crypt /mnt/crypt
+
+You should be prompted for a passphrase and a salt (the salt may be
+blank).
+
+Try writing a new file:
+
+echo "Hello, World" > /mnt/crypt/hello.txt
+
+The operation will complete. Notice that there is a new file in
+/root/crypt that is at least 12288 bytes in size (depending on your
+host page size). This is the encrypted underlying file for what you
+just wrote. To test reading, from start to finish, you need to clear
+the user session keyring:
+
+keyctl clear @u
+
+Then umount /mnt/crypt and mount again per the instructions given
+above.
+
+cat /mnt/crypt/hello.txt
+
+
+NOTES
+
+eCryptfs version 0.1 should only be mounted on (1) empty directories
+or (2) directories containing files only created by eCryptfs. If you
+mount a directory that has pre-existing files not created by eCryptfs,
+then behavior is undefined. Do not run eCryptfs in higher verbosity
+levels unless you are doing so for the sole purpose of debugging or
+development, since secret values will be written out to the system log
+in that case.
+
+
+Mike Halcrow
+mhalcrow@us.ibm.com
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
new file mode 100644
index 0000000..4333e83
--- /dev/null
+++ b/Documentation/filesystems/ext2.txt
@@ -0,0 +1,382 @@
+
+The Second Extended Filesystem
+==============================
+
+ext2 was originally released in January 1993. Written by R\'emy Card,
+Theodore Ts'o and Stephen Tweedie, it was a major rewrite of the
+Extended Filesystem. It is currently still (April 2001) the predominant
+filesystem in use by Linux. There are also implementations available
+for NetBSD, FreeBSD, the GNU HURD, Windows 95/98/NT, OS/2 and RISC OS.
+
+Options
+=======
+
+Most defaults are determined by the filesystem superblock, and can be
+set using tune2fs(8). Kernel-determined defaults are indicated by (*).
+
+bsddf (*) Makes `df' act like BSD.
+minixdf Makes `df' act like Minix.
+
+check=none, nocheck (*) Don't do extra checking of bitmaps on mount
+ (check=normal and check=strict options removed)
+
+debug Extra debugging information is sent to the
+ kernel syslog. Useful for developers.
+
+errors=continue Keep going on a filesystem error.
+errors=remount-ro Remount the filesystem read-only on an error.
+errors=panic Panic and halt the machine if an error occurs.
+
+grpid, bsdgroups Give objects the same group ID as their parent.
+nogrpid, sysvgroups New objects have the group ID of their creator.
+
+nouid32 Use 16-bit UIDs and GIDs.
+
+oldalloc Enable the old block allocator. Orlov should
+ have better performance, we'd like to get some
+ feedback if it's the contrary for you.
+orlov (*) Use the Orlov block allocator.
+ (See http://lwn.net/Articles/14633/ and
+ http://lwn.net/Articles/14446/.)
+
+resuid=n The user ID which may use the reserved blocks.
+resgid=n The group ID which may use the reserved blocks.
+
+sb=n Use alternate superblock at this location.
+
+user_xattr Enable "user." POSIX Extended Attributes
+ (requires CONFIG_EXT2_FS_XATTR).
+ See also http://acl.bestbits.at
+nouser_xattr Don't support "user." extended attributes.
+
+acl Enable POSIX Access Control Lists support
+ (requires CONFIG_EXT2_FS_POSIX_ACL).
+ See also http://acl.bestbits.at
+noacl Don't support POSIX ACLs.
+
+nobh Do not attach buffer_heads to file pagecache.
+
+xip Use execute in place (no caching) if possible
+
+grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
+
+
+Specification
+=============
+
+ext2 shares many properties with traditional Unix filesystems. It has
+the concepts of blocks, inodes and directories. It has space in the
+specification for Access Control Lists (ACLs), fragments, undeletion and
+compression though these are not yet implemented (some are available as
+separate patches). There is also a versioning mechanism to allow new
+features (such as journalling) to be added in a maximally compatible
+manner.
+
+Blocks
+------
+
+The space in the device or file is split up into blocks. These are
+a fixed size, of 1024, 2048 or 4096 bytes (8192 bytes on Alpha systems),
+which is decided when the filesystem is created. Smaller blocks mean
+less wasted space per file, but require slightly more accounting overhead,
+and also impose other limits on the size of files and the filesystem.
+
+Block Groups
+------------
+
+Blocks are clustered into block groups in order to reduce fragmentation
+and minimise the amount of head seeking when reading a large amount
+of consecutive data. Information about each block group is kept in a
+descriptor table stored in the block(s) immediately after the superblock.
+Two blocks near the start of each group are reserved for the block usage
+bitmap and the inode usage bitmap which show which blocks and inodes
+are in use. Since each bitmap is limited to a single block, this means
+that the maximum size of a block group is 8 times the size of a block.
+
+The block(s) following the bitmaps in each block group are designated
+as the inode table for that block group and the remainder are the data
+blocks. The block allocation algorithm attempts to allocate data blocks
+in the same block group as the inode which contains them.
+
+The Superblock
+--------------
+
+The superblock contains all the information about the configuration of
+the filing system. The primary copy of the superblock is stored at an
+offset of 1024 bytes from the start of the device, and it is essential
+to mounting the filesystem. Since it is so important, backup copies of
+the superblock are stored in block groups throughout the filesystem.
+The first version of ext2 (revision 0) stores a copy at the start of
+every block group, along with backups of the group descriptor block(s).
+Because this can consume a considerable amount of space for large
+filesystems, later revisions can optionally reduce the number of backup
+copies by only putting backups in specific groups (this is the sparse
+superblock feature). The groups chosen are 0, 1 and powers of 3, 5 and 7.
+
+The information in the superblock contains fields such as the total
+number of inodes and blocks in the filesystem and how many are free,
+how many inodes and blocks are in each block group, when the filesystem
+was mounted (and if it was cleanly unmounted), when it was modified,
+what version of the filesystem it is (see the Revisions section below)
+and which OS created it.
+
+If the filesystem is revision 1 or higher, then there are extra fields,
+such as a volume name, a unique identification number, the inode size,
+and space for optional filesystem features to store configuration info.
+
+All fields in the superblock (as in all other ext2 structures) are stored
+on the disc in little endian format, so a filesystem is portable between
+machines without having to know what machine it was created on.
+
+Inodes
+------
+
+The inode (index node) is a fundamental concept in the ext2 filesystem.
+Each object in the filesystem is represented by an inode. The inode
+structure contains pointers to the filesystem blocks which contain the
+data held in the object and all of the metadata about an object except
+its name. The metadata about an object includes the permissions, owner,
+group, flags, size, number of blocks used, access time, change time,
+modification time, deletion time, number of links, fragments, version
+(for NFS) and extended attributes (EAs) and/or Access Control Lists (ACLs).
+
+There are some reserved fields which are currently unused in the inode
+structure and several which are overloaded. One field is reserved for the
+directory ACL if the inode is a directory and alternately for the top 32
+bits of the file size if the inode is a regular file (allowing file sizes
+larger than 2GB). The translator field is unused under Linux, but is used
+by the HURD to reference the inode of a program which will be used to
+interpret this object. Most of the remaining reserved fields have been
+used up for both Linux and the HURD for larger owner and group fields,
+The HURD also has a larger mode field so it uses another of the remaining
+fields to store the extra more bits.
+
+There are pointers to the first 12 blocks which contain the file's data
+in the inode. There is a pointer to an indirect block (which contains
+pointers to the next set of blocks), a pointer to a doubly-indirect
+block (which contains pointers to indirect blocks) and a pointer to a
+trebly-indirect block (which contains pointers to doubly-indirect blocks).
+
+The flags field contains some ext2-specific flags which aren't catered
+for by the standard chmod flags. These flags can be listed with lsattr
+and changed with the chattr command, and allow specific filesystem
+behaviour on a per-file basis. There are flags for secure deletion,
+undeletable, compression, synchronous updates, immutability, append-only,
+dumpable, no-atime, indexed directories, and data-journaling. Not all
+of these are supported yet.
+
+Directories
+-----------
+
+A directory is a filesystem object and has an inode just like a file.
+It is a specially formatted file containing records which associate
+each name with an inode number. Later revisions of the filesystem also
+encode the type of the object (file, directory, symlink, device, fifo,
+socket) to avoid the need to check the inode itself for this information
+(support for taking advantage of this feature does not yet exist in
+Glibc 2.2).
+
+The inode allocation code tries to assign inodes which are in the same
+block group as the directory in which they are first created.
+
+The current implementation of ext2 uses a singly-linked list to store
+the filenames in the directory; a pending enhancement uses hashing of the
+filenames to allow lookup without the need to scan the entire directory.
+
+The current implementation never removes empty directory blocks once they
+have been allocated to hold more files.
+
+Special files
+-------------
+
+Symbolic links are also filesystem objects with inodes. They deserve
+special mention because the data for them is stored within the inode
+itself if the symlink is less than 60 bytes long. It uses the fields
+which would normally be used to store the pointers to data blocks.
+This is a worthwhile optimisation as it we avoid allocating a full
+block for the symlink, and most symlinks are less than 60 characters long.
+
+Character and block special devices never have data blocks assigned to
+them. Instead, their device number is stored in the inode, again reusing
+the fields which would be used to point to the data blocks.
+
+Reserved Space
+--------------
+
+In ext2, there is a mechanism for reserving a certain number of blocks
+for a particular user (normally the super-user). This is intended to
+allow for the system to continue functioning even if non-privileged users
+fill up all the space available to them (this is independent of filesystem
+quotas). It also keeps the filesystem from filling up entirely which
+helps combat fragmentation.
+
+Filesystem check
+----------------
+
+At boot time, most systems run a consistency check (e2fsck) on their
+filesystems. The superblock of the ext2 filesystem contains several
+fields which indicate whether fsck should actually run (since checking
+the filesystem at boot can take a long time if it is large). fsck will
+run if the filesystem was not cleanly unmounted, if the maximum mount
+count has been exceeded or if the maximum time between checks has been
+exceeded.
+
+Feature Compatibility
+---------------------
+
+The compatibility feature mechanism used in ext2 is sophisticated.
+It safely allows features to be added to the filesystem, without
+unnecessarily sacrificing compatibility with older versions of the
+filesystem code. The feature compatibility mechanism is not supported by
+the original revision 0 (EXT2_GOOD_OLD_REV) of ext2, but was introduced in
+revision 1. There are three 32-bit fields, one for compatible features
+(COMPAT), one for read-only compatible (RO_COMPAT) features and one for
+incompatible (INCOMPAT) features.
+
+These feature flags have specific meanings for the kernel as follows:
+
+A COMPAT flag indicates that a feature is present in the filesystem,
+but the on-disk format is 100% compatible with older on-disk formats, so
+a kernel which didn't know anything about this feature could read/write
+the filesystem without any chance of corrupting the filesystem (or even
+making it inconsistent). This is essentially just a flag which says
+"this filesystem has a (hidden) feature" that the kernel or e2fsck may
+want to be aware of (more on e2fsck and feature flags later). The ext3
+HAS_JOURNAL feature is a COMPAT flag because the ext3 journal is simply
+a regular file with data blocks in it so the kernel does not need to
+take any special notice of it if it doesn't understand ext3 journaling.
+
+An RO_COMPAT flag indicates that the on-disk format is 100% compatible
+with older on-disk formats for reading (i.e. the feature does not change
+the visible on-disk format). However, an old kernel writing to such a
+filesystem would/could corrupt the filesystem, so this is prevented. The
+most common such feature, SPARSE_SUPER, is an RO_COMPAT feature because
+sparse groups allow file data blocks where superblock/group descriptor
+backups used to live, and ext2_free_blocks() refuses to free these blocks,
+which would leading to inconsistent bitmaps. An old kernel would also
+get an error if it tried to free a series of blocks which crossed a group
+boundary, but this is a legitimate layout in a SPARSE_SUPER filesystem.
+
+An INCOMPAT flag indicates the on-disk format has changed in some
+way that makes it unreadable by older kernels, or would otherwise
+cause a problem if an old kernel tried to mount it. FILETYPE is an
+INCOMPAT flag because older kernels would think a filename was longer
+than 256 characters, which would lead to corrupt directory listings.
+The COMPRESSION flag is an obvious INCOMPAT flag - if the kernel
+doesn't understand compression, you would just get garbage back from
+read() instead of it automatically decompressing your data. The ext3
+RECOVER flag is needed to prevent a kernel which does not understand the
+ext3 journal from mounting the filesystem without replaying the journal.
+
+For e2fsck, it needs to be more strict with the handling of these
+flags than the kernel. If it doesn't understand ANY of the COMPAT,
+RO_COMPAT, or INCOMPAT flags it will refuse to check the filesystem,
+because it has no way of verifying whether a given feature is valid
+or not. Allowing e2fsck to succeed on a filesystem with an unknown
+feature is a false sense of security for the user. Refusing to check
+a filesystem with unknown features is a good incentive for the user to
+update to the latest e2fsck. This also means that anyone adding feature
+flags to ext2 also needs to update e2fsck to verify these features.
+
+Metadata
+--------
+
+It is frequently claimed that the ext2 implementation of writing
+asynchronous metadata is faster than the ffs synchronous metadata
+scheme but less reliable. Both methods are equally resolvable by their
+respective fsck programs.
+
+If you're exceptionally paranoid, there are 3 ways of making metadata
+writes synchronous on ext2:
+
+per-file if you have the program source: use the O_SYNC flag to open()
+per-file if you don't have the source: use "chattr +S" on the file
+per-filesystem: add the "sync" option to mount (or in /etc/fstab)
+
+the first and last are not ext2 specific but do force the metadata to
+be written synchronously. See also Journaling below.
+
+Limitations
+-----------
+
+There are various limits imposed by the on-disk layout of ext2. Other
+limits are imposed by the current implementation of the kernel code.
+Many of the limits are determined at the time the filesystem is first
+created, and depend upon the block size chosen. The ratio of inodes to
+data blocks is fixed at filesystem creation time, so the only way to
+increase the number of inodes is to increase the size of the filesystem.
+No tools currently exist which can change the ratio of inodes to blocks.
+
+Most of these limits could be overcome with slight changes in the on-disk
+format and using a compatibility flag to signal the format change (at
+the expense of some compatibility).
+
+Filesystem block size: 1kB 2kB 4kB 8kB
+
+File size limit: 16GB 256GB 2048GB 2048GB
+Filesystem size limit: 2047GB 8192GB 16384GB 32768GB
+
+There is a 2.4 kernel limit of 2048GB for a single block device, so no
+filesystem larger than that can be created at this time. There is also
+an upper limit on the block size imposed by the page size of the kernel,
+so 8kB blocks are only allowed on Alpha systems (and other architectures
+which support larger pages).
+
+There is an upper limit of 32768 subdirectories in a single directory.
+
+There is a "soft" upper limit of about 10-15k files in a single directory
+with the current linear linked-list directory implementation. This limit
+stems from performance problems when creating and deleting (and also
+finding) files in such large directories. Using a hashed directory index
+(under development) allows 100k-1M+ files in a single directory without
+performance problems (although RAM size becomes an issue at this point).
+
+The (meaningless) absolute upper limit of files in a single directory
+(imposed by the file size, the realistic limit is obviously much less)
+is over 130 trillion files. It would be higher except there are not
+enough 4-character names to make up unique directory entries, so they
+have to be 8 character filenames, even then we are fairly close to
+running out of unique filenames.
+
+Journaling
+----------
+
+A journaling extension to the ext2 code has been developed by Stephen
+Tweedie. It avoids the risks of metadata corruption and the need to
+wait for e2fsck to complete after a crash, without requiring a change
+to the on-disk ext2 layout. In a nutshell, the journal is a regular
+file which stores whole metadata (and optionally data) blocks that have
+been modified, prior to writing them into the filesystem. This means
+it is possible to add a journal to an existing ext2 filesystem without
+the need for data conversion.
+
+When changes to the filesystem (e.g. a file is renamed) they are stored in
+a transaction in the journal and can either be complete or incomplete at
+the time of a crash. If a transaction is complete at the time of a crash
+(or in the normal case where the system does not crash), then any blocks
+in that transaction are guaranteed to represent a valid filesystem state,
+and are copied into the filesystem. If a transaction is incomplete at
+the time of the crash, then there is no guarantee of consistency for
+the blocks in that transaction so they are discarded (which means any
+filesystem changes they represent are also lost).
+Check Documentation/filesystems/ext3.txt if you want to read more about
+ext3 and journaling.
+
+References
+==========
+
+The kernel source file:/usr/src/linux/fs/ext2/
+e2fsprogs (e2fsck) http://e2fsprogs.sourceforge.net/
+Design & Implementation http://e2fsprogs.sourceforge.net/ext2intro.html
+Journaling (ext3) ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/
+Filesystem Resizing http://ext2resize.sourceforge.net/
+Compression (*) http://e2compr.sourceforge.net/
+
+Implementations for:
+Windows 95/98/NT/2000 http://uranus.it.swin.edu.au/~jn/linux/Explore2fs.htm
+Windows 95 (*) http://www.yipton.demon.co.uk/content.html#FSDEXT2
+DOS client (*) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/
+OS/2 http://perso.wanadoo.fr/matthieu.willm/ext2-os2/
+RISC OS client ftp://ftp.barnet.ac.uk/pub/acorn/armlinux/iscafs/
+
+(*) no longer actively developed/supported (as of Apr 2001)
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
new file mode 100644
index 0000000..9dd2a3b
--- /dev/null
+++ b/Documentation/filesystems/ext3.txt
@@ -0,0 +1,202 @@
+
+Ext3 Filesystem
+===============
+
+Ext3 was originally released in September 1999. Written by Stephen Tweedie
+for the 2.2 branch, and ported to 2.4 kernels by Peter Braam, Andreas Dilger,
+Andrew Morton, Alexander Viro, Ted Ts'o and Stephen Tweedie.
+
+Ext3 is the ext2 filesystem enhanced with journalling capabilities.
+
+Options
+=======
+
+When mounting an ext3 filesystem, the following option are accepted:
+(*) == default
+
+journal=update Update the ext3 file system's journal to the current
+ format.
+
+journal=inum When a journal already exists, this option is ignored.
+ Otherwise, it specifies the number of the inode which
+ will represent the ext3 file system's journal file.
+
+journal_dev=devnum When the external journal device's major/minor numbers
+ have changed, this option allows the user to specify
+ the new journal location. The journal device is
+ identified through its new major/minor numbers encoded
+ in devnum.
+
+noload Don't load the journal on mounting.
+
+data=journal All data are committed into the journal prior to being
+ written into the main file system.
+
+data=ordered (*) All data are forced directly out to the main file
+ system prior to its metadata being committed to the
+ journal.
+
+data=writeback Data ordering is not preserved, data may be written
+ into the main file system after its metadata has been
+ committed to the journal.
+
+commit=nrsec (*) Ext3 can be told to sync all its data and metadata
+ every 'nrsec' seconds. The default value is 5 seconds.
+ This means that if you lose your power, you will lose
+ as much as the latest 5 seconds of work (your
+ filesystem will not be damaged though, thanks to the
+ journaling). This default value (or any low value)
+ will hurt performance, but it's good for data-safety.
+ Setting it to 0 will have the same effect as leaving
+ it at the default (5 seconds).
+ Setting it to very large values will improve
+ performance.
+
+barrier=1 This enables/disables barriers. barrier=0 disables
+ it, barrier=1 enables it.
+
+orlov (*) This enables the new Orlov block allocator. It is
+ enabled by default.
+
+oldalloc This disables the Orlov block allocator and enables
+ the old block allocator. Orlov should have better
+ performance - we'd like to get some feedback if it's
+ the contrary for you.
+
+user_xattr Enables Extended User Attributes. Additionally, you
+ need to have extended attribute support enabled in the
+ kernel configuration (CONFIG_EXT3_FS_XATTR). See the
+ attr(5) manual page and http://acl.bestbits.at/ to
+ learn more about extended attributes.
+
+nouser_xattr Disables Extended User Attributes.
+
+acl Enables POSIX Access Control Lists support.
+ Additionally, you need to have ACL support enabled in
+ the kernel configuration (CONFIG_EXT3_FS_POSIX_ACL).
+ See the acl(5) manual page and http://acl.bestbits.at/
+ for more information.
+
+noacl This option disables POSIX Access Control List
+ support.
+
+reservation
+
+noreservation
+
+bsddf (*) Make 'df' act like BSD.
+minixdf Make 'df' act like Minix.
+
+check=none Don't do extra checking of bitmaps on mount.
+nocheck
+
+debug Extra debugging information is sent to syslog.
+
+errors=remount-ro(*) Remount the filesystem read-only on an error.
+errors=continue Keep going on a filesystem error.
+errors=panic Panic and halt the machine if an error occurs.
+
+data_err=ignore(*) Just print an error message if an error occurs
+ in a file data buffer in ordered mode.
+data_err=abort Abort the journal if an error occurs in a file
+ data buffer in ordered mode.
+
+grpid Give objects the same group ID as their creator.
+bsdgroups
+
+nogrpid (*) New objects have the group ID of their creator.
+sysvgroups
+
+resgid=n The group ID which may use the reserved blocks.
+
+resuid=n The user ID which may use the reserved blocks.
+
+sb=n Use alternate superblock at this location.
+
+quota
+noquota
+grpquota
+usrquota
+
+bh (*) ext3 associates buffer heads to data pages to
+nobh (a) cache disk block mapping information
+ (b) link pages into transaction to provide
+ ordering guarantees.
+ "bh" option forces use of buffer heads.
+ "nobh" option tries to avoid associating buffer
+ heads (supported only for "writeback" mode).
+
+
+Specification
+=============
+Ext3 shares all disk implementation with the ext2 filesystem, and adds
+transactions capabilities to ext2. Journaling is done by the Journaling Block
+Device layer.
+
+Journaling Block Device layer
+-----------------------------
+The Journaling Block Device layer (JBD) isn't ext3 specific. It was designed
+to add journaling capabilities to a block device. The ext3 filesystem code
+will inform the JBD of modifications it is performing (called a transaction).
+The journal supports the transactions start and stop, and in case of a crash,
+the journal can replay the transactions to quickly put the partition back into
+a consistent state.
+
+Handles represent a single atomic update to a filesystem. JBD can handle an
+external journal on a block device.
+
+Data Mode
+---------
+There are 3 different data modes:
+
+* writeback mode
+In data=writeback mode, ext3 does not journal data at all. This mode provides
+a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
+mode - metadata journaling. A crash+recovery can cause incorrect data to
+appear in files which were written shortly before the crash. This mode will
+typically provide the best ext3 performance.
+
+* ordered mode
+In data=ordered mode, ext3 only officially journals metadata, but it logically
+groups metadata and data blocks into a single unit called a transaction. When
+it's time to write the new metadata out to disk, the associated data blocks
+are written first. In general, this mode performs slightly slower than
+writeback but significantly faster than journal mode.
+
+* journal mode
+data=journal mode provides full data and metadata journaling. All new data is
+written to the journal first, and then to its final location.
+In the event of a crash, the journal can be replayed, bringing both data and
+metadata into a consistent state. This mode is the slowest except when data
+needs to be read from and written to disk at the same time where it
+outperforms all other modes.
+
+Compatibility
+-------------
+
+Ext2 partitions can be easily convert to ext3, with `tune2fs -j <dev>`.
+Ext3 is fully compatible with Ext2. Ext3 partitions can easily be mounted as
+Ext2.
+
+
+External Tools
+==============
+See manual pages to learn more.
+
+tune2fs: create a ext3 journal on a ext2 partition with the -j flag.
+mke2fs: create a ext3 partition with the -j flag.
+debugfs: ext2 and ext3 file system debugger.
+ext2online: online (mounted) ext2 and ext3 filesystem resizer
+
+
+References
+==========
+
+kernel source: <file:fs/ext3/>
+ <file:fs/jbd/>
+
+programs: http://e2fsprogs.sourceforge.net/
+ http://ext2resize.sourceforge.net
+
+useful links: http://www-106.ibm.com/developerworks/linux/library/l-fs7/
+ http://www-106.ibm.com/developerworks/linux/library/l-fs8/
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
new file mode 100644
index 0000000..174eaff
--- /dev/null
+++ b/Documentation/filesystems/ext4.txt
@@ -0,0 +1,302 @@
+
+Ext4 Filesystem
+===============
+
+Ext4 is an an advanced level of the ext3 filesystem which incorporates
+scalability and reliability enhancements for supporting large filesystems
+(64 bit) in keeping with increasing disk capacities and state-of-the-art
+feature requirements.
+
+Mailing list: linux-ext4@vger.kernel.org
+Web site: http://ext4.wiki.kernel.org
+
+
+1. Quick usage instructions:
+===========================
+
+Note: More extensive information for getting started with ext4 can be
+ found at the ext4 wiki site at the URL:
+ http://ext4.wiki.kernel.org/index.php/Ext4_Howto
+
+ - Compile and install the latest version of e2fsprogs (as of this
+ writing version 1.41.3) from:
+
+ http://sourceforge.net/project/showfiles.php?group_id=2406
+
+ or
+
+ ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
+
+ or grab the latest git repository from:
+
+ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+ - Note that it is highly important to install the mke2fs.conf file
+ that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
+ you have edited the /etc/mke2fs.conf file installed on your system,
+ you will need to merge your changes with the version from e2fsprogs
+ 1.41.x.
+
+ - Create a new filesystem using the ext4 filesystem type:
+
+ # mke2fs -t ext4 /dev/hda1
+
+ Or to configure an existing ext3 filesystem to support extents:
+
+ # tune2fs -O extents /dev/hda1
+
+ If the filesystem was created with 128 byte inodes, it can be
+ converted to use 256 byte for greater efficiency via:
+
+ # tune2fs -I 256 /dev/hda1
+
+ (Note: we currently do not have tools to convert an ext4
+ filesystem back to ext3; so please do not do try this on production
+ filesystems.)
+
+ - Mounting:
+
+ # mount -t ext4 /dev/hda1 /wherever
+
+ - When comparing performance with other filesystems, remember that
+ ext3/4 by default offers higher data integrity guarantees than most.
+ So when comparing with a metadata-only journalling filesystem, such
+ as ext3, use `mount -o data=writeback'. And you might as well use
+ `mount -o nobh' too along with it. Making the journal larger than
+ the mke2fs default often helps performance with metadata-intensive
+ workloads.
+
+2. Features
+===========
+
+2.1 Currently available
+
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
+* extent format reduces metadata overhead (RAM, IO for access, transactions)
+* extent format more robust in face of on-disk corruption due to magics,
+* internal redunancy in tree
+* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+ flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
+ the ordering)
+
+2.2 Candidate features for future inclusion
+
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjuction with
+ the uninit_bg feature (capability to do this is available in e2fsprogs
+ but a kernel thread to do lazy zeroing of unused inode table blocks
+ after filesystem is first mounted is required for safety)
+
+There are several others under discussion, whether they all make it in is
+partly a function of how much time everyone has to work on them. Features like
+metadata checksumming have been discussed and planned for a bit but no patches
+exist yet so I'm not sure they're in the near-term roadmap.
+
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables. Some test results available here:
+
+ - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html
+ - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html
+
+3. Options
+==========
+
+When mounting an ext4 filesystem, the following option are accepted:
+(*) == default
+
+extents (*) ext4 will use extents to address file data. The
+ file system will no longer be mountable by ext3.
+
+noextents ext4 will not use extents for newly created files
+
+journal_checksum Enable checksumming of the journal transactions.
+ This will allow the recovery code in e2fsck and the
+ kernel to detect corruption in the kernel. It is a
+ compatible change and will be ignored by older kernels.
+
+journal_async_commit Commit block can be written to disk without waiting
+ for descriptor blocks. If enabled older kernels cannot
+ mount the device. This will enable 'journal_checksum'
+ internally.
+
+journal=update Update the ext4 file system's journal to the current
+ format.
+
+journal=inum When a journal already exists, this option is ignored.
+ Otherwise, it specifies the number of the inode which
+ will represent the ext4 file system's journal file.
+
+journal_dev=devnum When the external journal device's major/minor numbers
+ have changed, this option allows the user to specify
+ the new journal location. The journal device is
+ identified through its new major/minor numbers encoded
+ in devnum.
+
+noload Don't load the journal on mounting.
+
+data=journal All data are committed into the journal prior to being
+ written into the main file system.
+
+data=ordered (*) All data are forced directly out to the main file
+ system prior to its metadata being committed to the
+ journal.
+
+data=writeback Data ordering is not preserved, data may be written
+ into the main file system after its metadata has been
+ committed to the journal.
+
+commit=nrsec (*) Ext4 can be told to sync all its data and metadata
+ every 'nrsec' seconds. The default value is 5 seconds.
+ This means that if you lose your power, you will lose
+ as much as the latest 5 seconds of work (your
+ filesystem will not be damaged though, thanks to the
+ journaling). This default value (or any low value)
+ will hurt performance, but it's good for data-safety.
+ Setting it to 0 will have the same effect as leaving
+ it at the default (5 seconds).
+ Setting it to very large values will improve
+ performance.
+
+barrier=<0|1(*)> This enables/disables the use of write barriers in
+ the jbd code. barrier=0 disables, barrier=1 enables.
+ This also requires an IO stack which can support
+ barriers, and if jbd gets an error on a barrier
+ write, it will disable again with a warning.
+ Write barriers enforce proper on-disk ordering
+ of journal commits, making volatile disk write caches
+ safe to use, at some performance penalty. If
+ your disks are battery-backed in one way or another,
+ disabling barriers may safely improve performance.
+
+inode_readahead=n This tuning parameter controls the maximum
+ number of inode table blocks that ext4's inode
+ table readahead algorithm will pre-read into
+ the buffer cache. The default value is 32 blocks.
+
+orlov (*) This enables the new Orlov block allocator. It is
+ enabled by default.
+
+oldalloc This disables the Orlov block allocator and enables
+ the old block allocator. Orlov should have better
+ performance - we'd like to get some feedback if it's
+ the contrary for you.
+
+user_xattr Enables Extended User Attributes. Additionally, you
+ need to have extended attribute support enabled in the
+ kernel configuration (CONFIG_EXT4_FS_XATTR). See the
+ attr(5) manual page and http://acl.bestbits.at/ to
+ learn more about extended attributes.
+
+nouser_xattr Disables Extended User Attributes.
+
+acl Enables POSIX Access Control Lists support.
+ Additionally, you need to have ACL support enabled in
+ the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL).
+ See the acl(5) manual page and http://acl.bestbits.at/
+ for more information.
+
+noacl This option disables POSIX Access Control List
+ support.
+
+reservation
+
+noreservation
+
+bsddf (*) Make 'df' act like BSD.
+minixdf Make 'df' act like Minix.
+
+debug Extra debugging information is sent to syslog.
+
+errors=remount-ro(*) Remount the filesystem read-only on an error.
+errors=continue Keep going on a filesystem error.
+errors=panic Panic and halt the machine if an error occurs.
+
+data_err=ignore(*) Just print an error message if an error occurs
+ in a file data buffer in ordered mode.
+data_err=abort Abort the journal if an error occurs in a file
+ data buffer in ordered mode.
+
+grpid Give objects the same group ID as their creator.
+bsdgroups
+
+nogrpid (*) New objects have the group ID of their creator.
+sysvgroups
+
+resgid=n The group ID which may use the reserved blocks.
+
+resuid=n The user ID which may use the reserved blocks.
+
+sb=n Use alternate superblock at this location.
+
+quota
+noquota
+grpquota
+usrquota
+
+bh (*) ext4 associates buffer heads to data pages to
+nobh (a) cache disk block mapping information
+ (b) link pages into transaction to provide
+ ordering guarantees.
+ "bh" option forces use of buffer heads.
+ "nobh" option tries to avoid associating buffer
+ heads (supported only for "writeback" mode).
+
+stripe=n Number of filesystem blocks that mballoc will try
+ to use for allocation size and alignment. For RAID5/6
+ systems this should be the number of data
+ disks * RAID chunk size in file system blocks.
+delalloc (*) Deferring block allocation until write-out time.
+nodelalloc Disable delayed allocation. Blocks are allocation
+ when data is copied from user to page cache.
+
+Data Mode
+=========
+There are 3 different data modes:
+
+* writeback mode
+In data=writeback mode, ext4 does not journal data at all. This mode provides
+a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
+mode - metadata journaling. A crash+recovery can cause incorrect data to
+appear in files which were written shortly before the crash. This mode will
+typically provide the best ext4 performance.
+
+* ordered mode
+In data=ordered mode, ext4 only officially journals metadata, but it logically
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction. When it's time to write the new metadata
+out to disk, the associated data blocks are written first. In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.
+
+* journal mode
+data=journal mode provides full data and metadata journaling. All new data is
+written to the journal first, and then to its final location.
+In the event of a crash, the journal can be replayed, bringing both data and
+metadata into a consistent state. This mode is the slowest except when data
+needs to be read from and written to disk at the same time where it
+outperforms all others modes. Curently ext4 does not have delayed
+allocation support if this data journalling mode is selected.
+
+References
+==========
+
+kernel source: <file:fs/ext4/>
+ <file:fs/jbd2/>
+
+programs: http://e2fsprogs.sourceforge.net/
+
+useful links: http://fedoraproject.org/wiki/ext3-devel
+ http://www.bullopensource.org/ext4/
+ http://ext4.wiki.kernel.org/index.php/Main_Page
+ http://fedoraproject.org/wiki/Features/Ext4
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
new file mode 100644
index 0000000..1e3defc
--- /dev/null
+++ b/Documentation/filesystems/fiemap.txt
@@ -0,0 +1,228 @@
+============
+Fiemap Ioctl
+============
+
+The fiemap ioctl is an efficient method for userspace to get file
+extent mappings. Instead of block-by-block mapping (such as bmap), fiemap
+returns a list of extents.
+
+
+Request Basics
+--------------
+
+A fiemap request is encoded within struct fiemap:
+
+struct fiemap {
+ __u64 fm_start; /* logical offset (inclusive) at
+ * which to start mapping (in) */
+ __u64 fm_length; /* logical length of mapping which
+ * userspace cares about (in) */
+ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
+ __u32 fm_mapped_extents; /* number of extents that were
+ * mapped (out) */
+ __u32 fm_extent_count; /* size of fm_extents array (in) */
+ __u32 fm_reserved;
+ struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+
+fm_start, and fm_length specify the logical range within the file
+which the process would like mappings for. Extents returned mirror
+those on disk - that is, the logical offset of the 1st returned extent
+may start before fm_start, and the range covered by the last returned
+extent may end after fm_length. All offsets and lengths are in bytes.
+
+Certain flags to modify the way in which mappings are looked up can be
+set in fm_flags. If the kernel doesn't understand some particular
+flags, it will return EBADR and the contents of fm_flags will contain
+the set of flags which caused the error. If the kernel is compatible
+with all flags passed, the contents of fm_flags will be unmodified.
+It is up to userspace to determine whether rejection of a particular
+flag is fatal to it's operation. This scheme is intended to allow the
+fiemap interface to grow in the future but without losing
+compatibility with old software.
+
+fm_extent_count specifies the number of elements in the fm_extents[] array
+that can be used to return extents. If fm_extent_count is zero, then the
+fm_extents[] array is ignored (no extents will be returned), and the
+fm_mapped_extents count will hold the number of extents needed in
+fm_extents[] to hold the file's current mapping. Note that there is
+nothing to prevent the file from changing between calls to FIEMAP.
+
+The following flags can be set in fm_flags:
+
+* FIEMAP_FLAG_SYNC
+If this flag is set, the kernel will sync the file before mapping extents.
+
+* FIEMAP_FLAG_XATTR
+If this flag is set, the extents returned will describe the inodes
+extended attribute lookup tree, instead of it's data tree.
+
+
+Extent Mapping
+--------------
+
+Extent information is returned within the embedded fm_extents array
+which userspace must allocate along with the fiemap structure. The
+number of elements in the fiemap_extents[] array should be passed via
+fm_extent_count. The number of extents mapped by kernel will be
+returned via fm_mapped_extents. If the number of fiemap_extents
+allocated is less than would be required to map the requested range,
+the maximum number of extents that can be mapped in the fm_extent[]
+array will be returned and fm_mapped_extents will be equal to
+fm_extent_count. In that case, the last extent in the array will not
+complete the requested range and will not have the FIEMAP_EXTENT_LAST
+flag set (see the next section on extent flags).
+
+Each extent is described by a single fiemap_extent structure as
+returned in fm_extents.
+
+struct fiemap_extent {
+ __u64 fe_logical; /* logical offset in bytes for the start of
+ * the extent */
+ __u64 fe_physical; /* physical offset in bytes for the start
+ * of the extent */
+ __u64 fe_length; /* length in bytes for the extent */
+ __u64 fe_reserved64[2];
+ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
+ __u32 fe_reserved[3];
+};
+
+All offsets and lengths are in bytes and mirror those on disk. It is valid
+for an extents logical offset to start before the request or it's logical
+length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is
+returned, fe_logical, fe_physical, and fe_length will be aligned to the
+block size of the file system. With the exception of extents flagged as
+FIEMAP_EXTENT_MERGED, adjacent extents will not be merged.
+
+The fe_flags field contains flags which describe the extent returned.
+A special flag, FIEMAP_EXTENT_LAST is always set on the last extent in
+the file so that the process making fiemap calls can determine when no
+more extents are available, without having to call the ioctl again.
+
+Some flags are intentionally vague and will always be set in the
+presence of other more specific flags. This way a program looking for
+a general property does not have to know all existing and future flags
+which imply that property.
+
+For example, if FIEMAP_EXTENT_DATA_INLINE or FIEMAP_EXTENT_DATA_TAIL
+are set, FIEMAP_EXTENT_NOT_ALIGNED will also be set. A program looking
+for inline or tail-packed data can key on the specific flag. Software
+which simply cares not to try operating on non-aligned extents
+however, can just key on FIEMAP_EXTENT_NOT_ALIGNED, and not have to
+worry about all present and future flags which might imply unaligned
+data. Note that the opposite is not true - it would be valid for
+FIEMAP_EXTENT_NOT_ALIGNED to appear alone.
+
+* FIEMAP_EXTENT_LAST
+This is the last extent in the file. A mapping attempt past this
+extent will return nothing.
+
+* FIEMAP_EXTENT_UNKNOWN
+The location of this extent is currently unknown. This may indicate
+the data is stored on an inaccessible volume or that no storage has
+been allocated for the file yet.
+
+* FIEMAP_EXTENT_DELALLOC
+ - This will also set FIEMAP_EXTENT_UNKNOWN.
+Delayed allocation - while there is data for this extent, it's
+physical location has not been allocated yet.
+
+* FIEMAP_EXTENT_ENCODED
+This extent does not consist of plain filesystem blocks but is
+encoded (e.g. encrypted or compressed). Reading the data in this
+extent via I/O to the block device will have undefined results.
+
+Note that it is *always* undefined to try to update the data
+in-place by writing to the indicated location without the
+assistance of the filesystem, or to access the data using the
+information returned by the FIEMAP interface while the filesystem
+is mounted. In other words, user applications may only read the
+extent data via I/O to the block device while the filesystem is
+unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is
+clear; user applications must not try reading or writing to the
+filesystem via the block device under any other circumstances.
+
+* FIEMAP_EXTENT_DATA_ENCRYPTED
+ - This will also set FIEMAP_EXTENT_ENCODED
+The data in this extent has been encrypted by the file system.
+
+* FIEMAP_EXTENT_NOT_ALIGNED
+Extent offsets and length are not guaranteed to be block aligned.
+
+* FIEMAP_EXTENT_DATA_INLINE
+ This will also set FIEMAP_EXTENT_NOT_ALIGNED
+Data is located within a meta data block.
+
+* FIEMAP_EXTENT_DATA_TAIL
+ This will also set FIEMAP_EXTENT_NOT_ALIGNED
+Data is packed into a block with data from other files.
+
+* FIEMAP_EXTENT_UNWRITTEN
+Unwritten extent - the extent is allocated but it's data has not been
+initialized. This indicates the extent's data will be all zero if read
+through the filesystem but the contents are undefined if read directly from
+the device.
+
+* FIEMAP_EXTENT_MERGED
+This will be set when a file does not support extents, i.e., it uses a block
+based addressing scheme. Since returning an extent for each block back to
+userspace would be highly inefficient, the kernel will try to merge most
+adjacent blocks into 'extents'.
+
+
+VFS -> File System Implementation
+---------------------------------
+
+File systems wishing to support fiemap must implement a ->fiemap callback on
+their inode_operations structure. The fs ->fiemap call is responsible for
+defining it's set of supported fiemap flags, and calling a helper function on
+each discovered extent:
+
+struct inode_operations {
+ ...
+
+ int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
+ u64 len);
+
+->fiemap is passed struct fiemap_extent_info which describes the
+fiemap request:
+
+struct fiemap_extent_info {
+ unsigned int fi_flags; /* Flags as passed from user */
+ unsigned int fi_extents_mapped; /* Number of mapped extents */
+ unsigned int fi_extents_max; /* Size of fiemap_extent array */
+ struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
+};
+
+It is intended that the file system should not need to access any of this
+structure directly.
+
+
+Flag checking should be done at the beginning of the ->fiemap callback via the
+fiemap_check_flags() helper:
+
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
+
+The struct fieinfo should be passed in as recieved from ioctl_fiemap(). The
+set of fiemap flags which the fs understands should be passed via fs_flags. If
+fiemap_check_flags finds invalid user flags, it will place the bad values in
+fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from
+fiemap_check_flags(), it should immediately exit, returning that error back to
+ioctl_fiemap().
+
+
+For each extent in the request range, the file system should call
+the helper function, fiemap_fill_next_extent():
+
+int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
+ u64 phys, u64 len, u32 flags, u32 dev);
+
+fiemap_fill_next_extent() will use the passed values to populate the
+next free extent in the fm_extents array. 'General' extent flags will
+automatically be set from specific flags on behalf of the calling file
+system so that the userspace API is not broken.
+
+fiemap_fill_next_extent() returns 0 on success, and 1 when the
+user-supplied fm_extents array is full. If an error is encountered
+while copying the extent to user memory, -EFAULT will be returned.
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
new file mode 100644
index 0000000..bb0142f
--- /dev/null
+++ b/Documentation/filesystems/files.txt
@@ -0,0 +1,123 @@
+File management in the Linux kernel
+-----------------------------------
+
+This document describes how locking for files (struct file)
+and file descriptor table (struct files) works.
+
+Up until 2.6.12, the file descriptor table has been protected
+with a lock (files->file_lock) and reference count (files->count).
+->file_lock protected accesses to all the file related fields
+of the table. ->count was used for sharing the file descriptor
+table between tasks cloned with CLONE_FILES flag. Typically
+this would be the case for posix threads. As with the common
+refcounting model in the kernel, the last task doing
+a put_files_struct() frees the file descriptor (fd) table.
+The files (struct file) themselves are protected using
+reference count (->f_count).
+
+In the new lock-free model of file descriptor management,
+the reference counting is similar, but the locking is
+based on RCU. The file descriptor table contains multiple
+elements - the fd sets (open_fds and close_on_exec, the
+array of file pointers, the sizes of the sets and the array
+etc.). In order for the updates to appear atomic to
+a lock-free reader, all the elements of the file descriptor
+table are in a separate structure - struct fdtable.
+files_struct contains a pointer to struct fdtable through
+which the actual fd table is accessed. Initially the
+fdtable is embedded in files_struct itself. On a subsequent
+expansion of fdtable, a new fdtable structure is allocated
+and files->fdtab points to the new structure. The fdtable
+structure is freed with RCU and lock-free readers either
+see the old fdtable or the new fdtable making the update
+appear atomic. Here are the locking rules for
+the fdtable structure -
+
+1. All references to the fdtable must be done through
+ the files_fdtable() macro :
+
+ struct fdtable *fdt;
+
+ rcu_read_lock();
+
+ fdt = files_fdtable(files);
+ ....
+ if (n <= fdt->max_fds)
+ ....
+ ...
+ rcu_read_unlock();
+
+ files_fdtable() uses rcu_dereference() macro which takes care of
+ the memory barrier requirements for lock-free dereference.
+ The fdtable pointer must be read within the read-side
+ critical section.
+
+2. Reading of the fdtable as described above must be protected
+ by rcu_read_lock()/rcu_read_unlock().
+
+3. For any update to the fd table, files->file_lock must
+ be held.
+
+4. To look up the file structure given an fd, a reader
+ must use either fcheck() or fcheck_files() APIs. These
+ take care of barrier requirements due to lock-free lookup.
+ An example :
+
+ struct file *file;
+
+ rcu_read_lock();
+ file = fcheck(fd);
+ if (file) {
+ ...
+ }
+ ....
+ rcu_read_unlock();
+
+5. Handling of the file structures is special. Since the look-up
+ of the fd (fget()/fget_light()) are lock-free, it is possible
+ that look-up may race with the last put() operation on the
+ file structure. This is avoided using atomic_inc_not_zero()
+ on ->f_count :
+
+ rcu_read_lock();
+ file = fcheck_files(files, fd);
+ if (file) {
+ if (atomic_inc_not_zero(&file->f_count))
+ *fput_needed = 1;
+ else
+ /* Didn't get the reference, someone's freed */
+ file = NULL;
+ }
+ rcu_read_unlock();
+ ....
+ return file;
+
+ atomic_inc_not_zero() detects if refcounts is already zero or
+ goes to zero during increment. If it does, we fail
+ fget()/fget_light().
+
+6. Since both fdtable and file structures can be looked up
+ lock-free, they must be installed using rcu_assign_pointer()
+ API. If they are looked up lock-free, rcu_dereference()
+ must be used. However it is advisable to use files_fdtable()
+ and fcheck()/fcheck_files() which take care of these issues.
+
+7. While updating, the fdtable pointer must be looked up while
+ holding files->file_lock. If ->file_lock is dropped, then
+ another thread expand the files thereby creating a new
+ fdtable and making the earlier fdtable pointer stale.
+ For example :
+
+ spin_lock(&files->file_lock);
+ fd = locate_fd(files, file, start);
+ if (fd >= 0) {
+ /* locate_fd() may have expanded fdtable, load the ptr */
+ fdt = files_fdtable(files);
+ FD_SET(fd, fdt->open_fds);
+ FD_CLR(fd, fdt->close_on_exec);
+ spin_unlock(&files->file_lock);
+ .....
+
+ Since locate_fd() can drop ->file_lock (and reacquire ->file_lock),
+ the fdtable pointer (fdt) must be loaded after locate_fd().
+
diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
new file mode 100644
index 0000000..397a41a
--- /dev/null
+++ b/Documentation/filesystems/fuse.txt
@@ -0,0 +1,423 @@
+Definitions
+~~~~~~~~~~~
+
+Userspace filesystem:
+
+ A filesystem in which data and metadata are provided by an ordinary
+ userspace process. The filesystem can be accessed normally through
+ the kernel interface.
+
+Filesystem daemon:
+
+ The process(es) providing the data and metadata of the filesystem.
+
+Non-privileged mount (or user mount):
+
+ A userspace filesystem mounted by a non-privileged (non-root) user.
+ The filesystem daemon is running with the privileges of the mounting
+ user. NOTE: this is not the same as mounts allowed with the "user"
+ option in /etc/fstab, which is not discussed here.
+
+Filesystem connection:
+
+ A connection between the filesystem daemon and the kernel. The
+ connection exists until either the daemon dies, or the filesystem is
+ umounted. Note that detaching (or lazy umounting) the filesystem
+ does _not_ break the connection, in this case it will exist until
+ the last reference to the filesystem is released.
+
+Mount owner:
+
+ The user who does the mounting.
+
+User:
+
+ The user who is performing filesystem operations.
+
+What is FUSE?
+~~~~~~~~~~~~~
+
+FUSE is a userspace filesystem framework. It consists of a kernel
+module (fuse.ko), a userspace library (libfuse.*) and a mount utility
+(fusermount).
+
+One of the most important features of FUSE is allowing secure,
+non-privileged mounts. This opens up new possibilities for the use of
+filesystems. A good example is sshfs: a secure network filesystem
+using the sftp protocol.
+
+The userspace library and utilities are available from the FUSE
+homepage:
+
+ http://fuse.sourceforge.net/
+
+Filesystem type
+~~~~~~~~~~~~~~~
+
+The filesystem type given to mount(2) can be one of the following:
+
+'fuse'
+
+ This is the usual way to mount a FUSE filesystem. The first
+ argument of the mount system call may contain an arbitrary string,
+ which is not interpreted by the kernel.
+
+'fuseblk'
+
+ The filesystem is block device based. The first argument of the
+ mount system call is interpreted as the name of the device.
+
+Mount options
+~~~~~~~~~~~~~
+
+'fd=N'
+
+ The file descriptor to use for communication between the userspace
+ filesystem and the kernel. The file descriptor must have been
+ obtained by opening the FUSE device ('/dev/fuse').
+
+'rootmode=M'
+
+ The file mode of the filesystem's root in octal representation.
+
+'user_id=N'
+
+ The numeric user id of the mount owner.
+
+'group_id=N'
+
+ The numeric group id of the mount owner.
+
+'default_permissions'
+
+ By default FUSE doesn't check file access permissions, the
+ filesystem is free to implement it's access policy or leave it to
+ the underlying file access mechanism (e.g. in case of network
+ filesystems). This option enables permission checking, restricting
+ access based on file mode. It is usually useful together with the
+ 'allow_other' mount option.
+
+'allow_other'
+
+ This option overrides the security measure restricting file access
+ to the user mounting the filesystem. This option is by default only
+ allowed to root, but this restriction can be removed with a
+ (userspace) configuration option.
+
+'max_read=N'
+
+ With this option the maximum size of read operations can be set.
+ The default is infinite. Note that the size of read requests is
+ limited anyway to 32 pages (which is 128kbyte on i386).
+
+'blksize=N'
+
+ Set the block size for the filesystem. The default is 512. This
+ option is only valid for 'fuseblk' type mounts.
+
+Control filesystem
+~~~~~~~~~~~~~~~~~~
+
+There's a control filesystem for FUSE, which can be mounted by:
+
+ mount -t fusectl none /sys/fs/fuse/connections
+
+Mounting it under the '/sys/fs/fuse/connections' directory makes it
+backwards compatible with earlier versions.
+
+Under the fuse control filesystem each connection has a directory
+named by a unique number.
+
+For each connection the following files exist within this directory:
+
+ 'waiting'
+
+ The number of requests which are waiting to be transferred to
+ userspace or being processed by the filesystem daemon. If there is
+ no filesystem activity and 'waiting' is non-zero, then the
+ filesystem is hung or deadlocked.
+
+ 'abort'
+
+ Writing anything into this file will abort the filesystem
+ connection. This means that all waiting requests will be aborted an
+ error returned for all aborted and new requests.
+
+Only the owner of the mount may read or write these files.
+
+Interrupting filesystem operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a process issuing a FUSE filesystem request is interrupted, the
+following will happen:
+
+ 1) If the request is not yet sent to userspace AND the signal is
+ fatal (SIGKILL or unhandled fatal signal), then the request is
+ dequeued and returns immediately.
+
+ 2) If the request is not yet sent to userspace AND the signal is not
+ fatal, then an 'interrupted' flag is set for the request. When
+ the request has been successfully transferred to userspace and
+ this flag is set, an INTERRUPT request is queued.
+
+ 3) If the request is already sent to userspace, then an INTERRUPT
+ request is queued.
+
+INTERRUPT requests take precedence over other requests, so the
+userspace filesystem will receive queued INTERRUPTs before any others.
+
+The userspace filesystem may ignore the INTERRUPT requests entirely,
+or may honor them by sending a reply to the _original_ request, with
+the error set to EINTR.
+
+It is also possible that there's a race between processing the
+original request and it's INTERRUPT request. There are two possibilities:
+
+ 1) The INTERRUPT request is processed before the original request is
+ processed
+
+ 2) The INTERRUPT request is processed after the original request has
+ been answered
+
+If the filesystem cannot find the original request, it should wait for
+some timeout and/or a number of new requests to arrive, after which it
+should reply to the INTERRUPT request with an EAGAIN error. In case
+1) the INTERRUPT request will be requeued. In case 2) the INTERRUPT
+reply will be ignored.
+
+Aborting a filesystem connection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It is possible to get into certain situations where the filesystem is
+not responding. Reasons for this may be:
+
+ a) Broken userspace filesystem implementation
+
+ b) Network connection down
+
+ c) Accidental deadlock
+
+ d) Malicious deadlock
+
+(For more on c) and d) see later sections)
+
+In either of these cases it may be useful to abort the connection to
+the filesystem. There are several ways to do this:
+
+ - Kill the filesystem daemon. Works in case of a) and b)
+
+ - Kill the filesystem daemon and all users of the filesystem. Works
+ in all cases except some malicious deadlocks
+
+ - Use forced umount (umount -f). Works in all cases but only if
+ filesystem is still attached (it hasn't been lazy unmounted)
+
+ - Abort filesystem through the FUSE control filesystem. Most
+ powerful method, always works.
+
+How do non-privileged mounts work?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since the mount() system call is a privileged operation, a helper
+program (fusermount) is needed, which is installed setuid root.
+
+The implication of providing non-privileged mounts is that the mount
+owner must not be able to use this capability to compromise the
+system. Obvious requirements arising from this are:
+
+ A) mount owner should not be able to get elevated privileges with the
+ help of the mounted filesystem
+
+ B) mount owner should not get illegitimate access to information from
+ other users' and the super user's processes
+
+ C) mount owner should not be able to induce undesired behavior in
+ other users' or the super user's processes
+
+How are requirements fulfilled?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ A) The mount owner could gain elevated privileges by either:
+
+ 1) creating a filesystem containing a device file, then opening
+ this device
+
+ 2) creating a filesystem containing a suid or sgid application,
+ then executing this application
+
+ The solution is not to allow opening device files and ignore
+ setuid and setgid bits when executing programs. To ensure this
+ fusermount always adds "nosuid" and "nodev" to the mount options
+ for non-privileged mounts.
+
+ B) If another user is accessing files or directories in the
+ filesystem, the filesystem daemon serving requests can record the
+ exact sequence and timing of operations performed. This
+ information is otherwise inaccessible to the mount owner, so this
+ counts as an information leak.
+
+ The solution to this problem will be presented in point 2) of C).
+
+ C) There are several ways in which the mount owner can induce
+ undesired behavior in other users' processes, such as:
+
+ 1) mounting a filesystem over a file or directory which the mount
+ owner could otherwise not be able to modify (or could only
+ make limited modifications).
+
+ This is solved in fusermount, by checking the access
+ permissions on the mountpoint and only allowing the mount if
+ the mount owner can do unlimited modification (has write
+ access to the mountpoint, and mountpoint is not a "sticky"
+ directory)
+
+ 2) Even if 1) is solved the mount owner can change the behavior
+ of other users' processes.
+
+ i) It can slow down or indefinitely delay the execution of a
+ filesystem operation creating a DoS against the user or the
+ whole system. For example a suid application locking a
+ system file, and then accessing a file on the mount owner's
+ filesystem could be stopped, and thus causing the system
+ file to be locked forever.
+
+ ii) It can present files or directories of unlimited length, or
+ directory structures of unlimited depth, possibly causing a
+ system process to eat up diskspace, memory or other
+ resources, again causing DoS.
+
+ The solution to this as well as B) is not to allow processes
+ to access the filesystem, which could otherwise not be
+ monitored or manipulated by the mount owner. Since if the
+ mount owner can ptrace a process, it can do all of the above
+ without using a FUSE mount, the same criteria as used in
+ ptrace can be used to check if a process is allowed to access
+ the filesystem or not.
+
+ Note that the ptrace check is not strictly necessary to
+ prevent B/2/i, it is enough to check if mount owner has enough
+ privilege to send signal to the process accessing the
+ filesystem, since SIGSTOP can be used to get a similar effect.
+
+I think these limitations are unacceptable?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a sysadmin trusts the users enough, or can ensure through other
+measures, that system processes will never enter non-privileged
+mounts, it can relax the last limitation with a "user_allow_other"
+config option. If this config option is set, the mounting user can
+add the "allow_other" mount option which disables the check for other
+users' processes.
+
+Kernel - userspace interface
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following diagram shows how a filesystem operation (in this
+example unlink) is performed in FUSE.
+
+NOTE: everything in this description is greatly simplified
+
+ | "rm /mnt/fuse/file" | FUSE filesystem daemon
+ | |
+ | | >sys_read()
+ | | >fuse_dev_read()
+ | | >request_wait()
+ | | [sleep on fc->waitq]
+ | |
+ | >sys_unlink() |
+ | >fuse_unlink() |
+ | [get request from |
+ | fc->unused_list] |
+ | >request_send() |
+ | [queue req on fc->pending] |
+ | [wake up fc->waitq] | [woken up]
+ | >request_wait_answer() |
+ | [sleep on req->waitq] |
+ | | <request_wait()
+ | | [remove req from fc->pending]
+ | | [copy req to read buffer]
+ | | [add req to fc->processing]
+ | | <fuse_dev_read()
+ | | <sys_read()
+ | |
+ | | [perform unlink]
+ | |
+ | | >sys_write()
+ | | >fuse_dev_write()
+ | | [look up req in fc->processing]
+ | | [remove from fc->processing]
+ | | [copy write buffer to req]
+ | [woken up] | [wake up req->waitq]
+ | | <fuse_dev_write()
+ | | <sys_write()
+ | <request_wait_answer() |
+ | <request_send() |
+ | [add request to |
+ | fc->unused_list] |
+ | <fuse_unlink() |
+ | <sys_unlink() |
+
+There are a couple of ways in which to deadlock a FUSE filesystem.
+Since we are talking about unprivileged userspace programs,
+something must be done about these.
+
+Scenario 1 - Simple deadlock
+-----------------------------
+
+ | "rm /mnt/fuse/file" | FUSE filesystem daemon
+ | |
+ | >sys_unlink("/mnt/fuse/file") |
+ | [acquire inode semaphore |
+ | for "file"] |
+ | >fuse_unlink() |
+ | [sleep on req->waitq] |
+ | | <sys_read()
+ | | >sys_unlink("/mnt/fuse/file")
+ | | [acquire inode semaphore
+ | | for "file"]
+ | | *DEADLOCK*
+
+The solution for this is to allow the filesystem to be aborted.
+
+Scenario 2 - Tricky deadlock
+----------------------------
+
+This one needs a carefully crafted filesystem. It's a variation on
+the above, only the call back to the filesystem is not explicit,
+but is caused by a pagefault.
+
+ | Kamikaze filesystem thread 1 | Kamikaze filesystem thread 2
+ | |
+ | [fd = open("/mnt/fuse/file")] | [request served normally]
+ | [mmap fd to 'addr'] |
+ | [close fd] | [FLUSH triggers 'magic' flag]
+ | [read a byte from addr] |
+ | >do_page_fault() |
+ | [find or create page] |
+ | [lock page] |
+ | >fuse_readpage() |
+ | [queue READ request] |
+ | [sleep on req->waitq] |
+ | | [read request to buffer]
+ | | [create reply header before addr]
+ | | >sys_write(addr - headerlength)
+ | | >fuse_dev_write()
+ | | [look up req in fc->processing]
+ | | [remove from fc->processing]
+ | | [copy write buffer to req]
+ | | >do_page_fault()
+ | | [find or create page]
+ | | [lock page]
+ | | * DEADLOCK *
+
+Solution is basically the same as above.
+
+An additional problem is that while the write buffer is being copied
+to the request, the request must not be interrupted/aborted. This is
+because the destination address of the copy may not be valid after the
+request has returned.
+
+This is solved with doing the copy atomically, and allowing abort
+while the page(s) belonging to the write buffer are faulted with
+get_user_pages(). The 'req->locked' flag indicates when the copy is
+taking place, and abort is delayed until this flag is unset.
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
new file mode 100644
index 0000000..4dae9a3
--- /dev/null
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -0,0 +1,114 @@
+ Glock internal locking rules
+ ------------------------------
+
+This documents the basic principles of the glock state machine
+internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h)
+has two main (internal) locks:
+
+ 1. A spinlock (gl_spin) which protects the internal state such
+ as gl_state, gl_target and the list of holders (gl_holders)
+ 2. A non-blocking bit lock, GLF_LOCK, which is used to prevent other
+ threads from making calls to the DLM, etc. at the same time. If a
+ thread takes this lock, it must then call run_queue (usually via the
+ workqueue) when it releases it in order to ensure any pending tasks
+ are completed.
+
+The gl_holders list contains all the queued lock requests (not
+just the holders) associated with the glock. If there are any
+held locks, then they will be contiguous entries at the head
+of the list. Locks are granted in strictly the order that they
+are queued, except for those marked LM_FLAG_PRIORITY which are
+used only during recovery, and even then only for journal locks.
+
+There are three lock states that users of the glock layer can request,
+namely shared (SH), deferred (DF) and exclusive (EX). Those translate
+to the following DLM lock modes:
+
+Glock mode | DLM lock mode
+------------------------------
+ UN | IV/NL Unlocked (no DLM lock associated with glock) or NL
+ SH | PR (Protected read)
+ DF | CW (Concurrent write)
+ EX | EX (Exclusive)
+
+Thus DF is basically a shared mode which is incompatible with the "normal"
+shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O
+operations. The glocks are basically a lock plus some routines which deal
+with cache management. The following rules apply for the cache:
+
+Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata
+--------------------------------------------------------------------------
+ UN | No | No | No | No
+ SH | Yes | Yes | No | No
+ DF | No | Yes | No | No
+ EX | Yes | Yes | Yes | Yes
+
+These rules are implemented using the various glock operations which
+are defined for each type of glock. Not all types of glocks use
+all the modes. Only inode glocks use the DF mode for example.
+
+Table of glock operations and per type constants:
+
+Field | Purpose
+----------------------------------------------------------------------------
+go_xmote_th | Called before remote state change (e.g. to sync dirty data)
+go_xmote_bh | Called after remote state change (e.g. to refill cache)
+go_inval | Called if remote state change requires invalidating the cache
+go_demote_ok | Returns boolean value of whether its ok to demote a glock
+ | (e.g. checks timeout, and that there is no cached data)
+go_lock | Called for the first local holder of a lock
+go_unlock | Called on the final local unlock of a lock
+go_dump | Called to print content of object for debugfs file, or on
+ | error to dump glock to the log.
+go_type; | The type of the glock, LM_TYPE_.....
+go_min_hold_time | The minimum hold time
+
+The minimum hold time for each lock is the time after a remote lock
+grant for which we ignore remote demote requests. This is in order to
+prevent a situation where locks are being bounced around the cluster
+from node to node with none of the nodes making any progress. This
+tends to show up most with shared mmaped files which are being written
+to by multiple nodes. By delaying the demotion in response to a
+remote callback, that gives the userspace program time to make
+some progress before the pages are unmapped.
+
+There is a plan to try and remove the go_lock and go_unlock callbacks
+if possible, in order to try and speed up the fast path though the locking.
+Also, eventually we hope to make the glock "EX" mode locally shared
+such that any local locking will be done with the i_mutex as required
+rather than via the glock.
+
+Locking rules for glock operations:
+
+Operation | GLF_LOCK bit lock held | gl_spin spinlock held
+-----------------------------------------------------------------
+go_xmote_th | Yes | No
+go_xmote_bh | Yes | No
+go_inval | Yes | No
+go_demote_ok | Sometimes | Yes
+go_lock | Yes | No
+go_unlock | Yes | No
+go_dump | Sometimes | Yes
+
+N.B. Operations must not drop either the bit lock or the spinlock
+if its held on entry. go_dump and do_demote_ok must never block.
+Note that go_dump will only be called if the glock's state
+indicates that it is caching uptodate data.
+
+Glock locking order within GFS2:
+
+ 1. i_mutex (if required)
+ 2. Rename glock (for rename only)
+ 3. Inode glock(s)
+ (Parents before children, inodes at "same level" with same parent in
+ lock number order)
+ 4. Rgrp glock(s) (for (de)allocation operations)
+ 5. Transaction glock (via gfs2_trans_begin) for non-read operations
+ 6. Page lock (always last, very important!)
+
+There are two glocks per inode. One deals with access to the inode
+itself (locking order as above), and the other, known as the iopen
+glock is used in conjunction with the i_nlink field in the inode to
+determine the lifetime of the inode in question. Locking of inodes
+is on a per-inode basis. Locking of rgrps is on a per rgrp basis.
+
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 0000000..593004b
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,43 @@
+Global File System
+------------------
+
+http://sources.redhat.com/cluster/
+
+GFS is a cluster file system. It allows a cluster of computers to
+simultaneously use a block device that is shared between them (with FC,
+iSCSI, NBD, etc). GFS reads and writes to the block device like a local
+file system, but also uses a lock module to allow the computers coordinate
+their I/O so file system consistency is maintained. One of the nifty
+features of GFS is perfect consistency -- changes made to the file system
+on one machine show up immediately on all other machines in the cluster.
+
+GFS uses interchangable inter-node locking mechanisms. Different lock
+modules can plug into GFS and each file system selects the appropriate
+lock module at mount time. Lock modules include:
+
+ lock_nolock -- allows gfs to be used as a local file system
+
+ lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
+ The dlm is found at linux/fs/dlm/
+
+In addition to interfacing with an external locking manager, a gfs lock
+module is responsible for interacting with external cluster management
+systems. Lock_dlm depends on user space cluster management systems found
+at the URL above.
+
+To use gfs as a local file system, no external clustering systems are
+needed, simply:
+
+ $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
+ $ mount -t gfs2 /dev/block_device /dir
+
+GFS2 is not on-disk compatible with previous versions of GFS.
+
+The following man pages can be found at the URL above:
+ gfs2_fsck to repair a filesystem
+ gfs2_grow to expand a filesystem online
+ gfs2_jadd to add journals to a filesystem online
+ gfs2_tool to manipulate, examine and tune a filesystem
+ gfs2_quota to examine and change quota values in a filesystem
+ mount.gfs2 to help mount(8) mount a filesystem
+ mkfs.gfs2 to make a filesystem
diff --git a/Documentation/filesystems/hfs.txt b/Documentation/filesystems/hfs.txt
new file mode 100644
index 0000000..bd0fa77
--- /dev/null
+++ b/Documentation/filesystems/hfs.txt
@@ -0,0 +1,83 @@
+
+Macintosh HFS Filesystem for Linux
+==================================
+
+HFS stands for ``Hierarchical File System'' and is the filesystem used
+by the Mac Plus and all later Macintosh models. Earlier Macintosh
+models used MFS (``Macintosh File System''), which is not supported,
+MacOS 8.1 and newer support a filesystem called HFS+ that's similar to
+HFS but is extended in various areas. Use the hfsplus filesystem driver
+to access such filesystems from Linux.
+
+
+Mount options
+=============
+
+When mounting an HFS filesystem, the following options are accepted:
+
+ creator=cccc, type=cccc
+ Specifies the creator/type values as shown by the MacOS finder
+ used for creating new files. Default values: '????'.
+
+ uid=n, gid=n
+ Specifies the user/group that owns all files on the filesystems.
+ Default: user/group id of the mounting process.
+
+ dir_umask=n, file_umask=n, umask=n
+ Specifies the umask used for all files , all directories or all
+ files and directories. Defaults to the umask of the mounting process.
+
+ session=n
+ Select the CDROM session to mount as HFS filesystem. Defaults to
+ leaving that decision to the CDROM driver. This option will fail
+ with anything but a CDROM as underlying devices.
+
+ part=n
+ Select partition number n from the devices. Does only makes
+ sense for CDROMS because they can't be partitioned under Linux.
+ For disk devices the generic partition parsing code does this
+ for us. Defaults to not parsing the partition table at all.
+
+ quiet
+ Ignore invalid mount options instead of complaining.
+
+
+Writing to HFS Filesystems
+==========================
+
+HFS is not a UNIX filesystem, thus it does not have the usual features you'd
+expect:
+
+ o You can't modify the set-uid, set-gid, sticky or executable bits or the uid
+ and gid of files.
+ o You can't create hard- or symlinks, device files, sockets or FIFOs.
+
+HFS does on the other have the concepts of multiple forks per file. These
+non-standard forks are represented as hidden additional files in the normal
+filesystems namespace which is kind of a cludge and makes the semantics for
+the a little strange:
+
+ o You can't create, delete or rename resource forks of files or the
+ Finder's metadata.
+ o They are however created (with default values), deleted and renamed
+ along with the corresponding data fork or directory.
+ o Copying files to a different filesystem will loose those attributes
+ that are essential for MacOS to work.
+
+
+Creating HFS filesystems
+===================================
+
+The hfsutils package from Robert Leslie contains a program called
+hformat that can be used to create HFS filesystem. See
+<http://www.mars.org/home/rob/proj/hfs/> for details.
+
+
+Credits
+=======
+
+The HFS drivers was written by Paul H. Hargrovea (hargrove@sccm.Stanford.EDU)
+and is now maintained by Roman Zippel (roman@ardistech.com) at Ardis
+Technologies.
+Roman rewrote large parts of the code and brought in btree routines derived
+from Brad Boyer's hfsplus driver (also maintained by Roman now).
diff --git a/Documentation/filesystems/hfsplus.txt b/Documentation/filesystems/hfsplus.txt
new file mode 100644
index 0000000..af1628a
--- /dev/null
+++ b/Documentation/filesystems/hfsplus.txt
@@ -0,0 +1,59 @@
+
+Macintosh HFSPlus Filesystem for Linux
+======================================
+
+HFSPlus is a filesystem first introduced in MacOS 8.1.
+HFSPlus has several extensions to HFS, including 32-bit allocation
+blocks, 255-character unicode filenames, and file sizes of 2^63 bytes.
+
+
+Mount options
+=============
+
+When mounting an HFSPlus filesystem, the following options are accepted:
+
+ creator=cccc, type=cccc
+ Specifies the creator/type values as shown by the MacOS finder
+ used for creating new files. Default values: '????'.
+
+ uid=n, gid=n
+ Specifies the user/group that owns all files on the filesystem
+ that have uninitialized permissions structures.
+ Default: user/group id of the mounting process.
+
+ umask=n
+ Specifies the umask (in octal) used for files and directories
+ that have uninitialized permissions structures.
+ Default: umask of the mounting process.
+
+ session=n
+ Select the CDROM session to mount as HFSPlus filesystem. Defaults to
+ leaving that decision to the CDROM driver. This option will fail
+ with anything but a CDROM as underlying devices.
+
+ part=n
+ Select partition number n from the devices. This option only makes
+ sense for CDROMs because they can't be partitioned under Linux.
+ For disk devices the generic partition parsing code does this
+ for us. Defaults to not parsing the partition table at all.
+
+ decompose
+ Decompose file name characters.
+
+ nodecompose
+ Do not decompose file name characters.
+
+ force
+ Used to force write access to volumes that are marked as journalled
+ or locked. Use at your own risk.
+
+ nls=cccc
+ Encoding to use when presenting file names.
+
+
+References
+==========
+
+kernel source: <file:fs/hfsplus>
+
+Apple Technote 1150 http://developer.apple.com/technotes/tn/tn1150.html
diff --git a/Documentation/filesystems/hpfs.txt b/Documentation/filesystems/hpfs.txt
new file mode 100644
index 0000000..fa45c3b
--- /dev/null
+++ b/Documentation/filesystems/hpfs.txt
@@ -0,0 +1,296 @@
+Read/Write HPFS 2.09
+1998-2004, Mikulas Patocka
+
+email: mikulas@artax.karlin.mff.cuni.cz
+homepage: http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi
+
+CREDITS:
+Chris Smith, 1993, original read-only HPFS, some code and hpfs structures file
+ is taken from it
+Jacques Gelinas, MSDos mmap, Inspired by fs/nfs/mmap.c (Jon Tombs 15 Aug 1993)
+Werner Almesberger, 1992, 1993, MSDos option parser & CR/LF conversion
+
+Mount options
+
+uid=xxx,gid=xxx,umask=xxx (default uid=gid=0 umask=default_system_umask)
+ Set owner/group/mode for files that do not have it specified in extended
+ attributes. Mode is inverted umask - for example umask 027 gives owner
+ all permission, group read permission and anybody else no access. Note
+ that for files mode is anded with 0666. If you want files to have 'x'
+ rights, you must use extended attributes.
+case=lower,asis (default asis)
+ File name lowercasing in readdir.
+conv=binary,text,auto (default binary)
+ CR/LF -> LF conversion, if auto, decision is made according to extension
+ - there is a list of text extensions (I thing it's better to not convert
+ text file than to damage binary file). If you want to change that list,
+ change it in the source. Original readonly HPFS contained some strange
+ heuristic algorithm that I removed. I thing it's danger to let the
+ computer decide whether file is text or binary. For example, DJGPP
+ binaries contain small text message at the beginning and they could be
+ misidentified and damaged under some circumstances.
+check=none,normal,strict (default normal)
+ Check level. Selecting none will cause only little speedup and big
+ danger. I tried to write it so that it won't crash if check=normal on
+ corrupted filesystems. check=strict means many superfluous checks -
+ used for debugging (for example it checks if file is allocated in
+ bitmaps when accessing it).
+errors=continue,remount-ro,panic (default remount-ro)
+ Behaviour when filesystem errors found.
+chkdsk=no,errors,always (default errors)
+ When to mark filesystem dirty so that OS/2 checks it.
+eas=no,ro,rw (default rw)
+ What to do with extended attributes. 'no' - ignore them and use always
+ values specified in uid/gid/mode options. 'ro' - read extended
+ attributes but do not create them. 'rw' - create extended attributes
+ when you use chmod/chown/chgrp/mknod/ln -s on the filesystem.
+timeshift=(-)nnn (default 0)
+ Shifts the time by nnn seconds. For example, if you see under linux
+ one hour more, than under os/2, use timeshift=-3600.
+
+
+File names
+
+As in OS/2, filenames are case insensitive. However, shell thinks that names
+are case sensitive, so for example when you create a file FOO, you can use
+'cat FOO', 'cat Foo', 'cat foo' or 'cat F*' but not 'cat f*'. Note, that you
+also won't be able to compile linux kernel (and maybe other things) on HPFS
+because kernel creates different files with names like bootsect.S and
+bootsect.s. When searching for file thats name has characters >= 128, codepages
+are used - see below.
+OS/2 ignores dots and spaces at the end of file name, so this driver does as
+well. If you create 'a. ...', the file 'a' will be created, but you can still
+access it under names 'a.', 'a..', 'a . . . ' etc.
+
+
+Extended attributes
+
+On HPFS partitions, OS/2 can associate to each file a special information called
+extended attributes. Extended attributes are pairs of (key,value) where key is
+an ascii string identifying that attribute and value is any string of bytes of
+variable length. OS/2 stores window and icon positions and file types there. So
+why not use it for unix-specific info like file owner or access rights? This
+driver can do it. If you chown/chgrp/chmod on a hpfs partition, extended
+attributes with keys "UID", "GID" or "MODE" and 2-byte values are created. Only
+that extended attributes those value differs from defaults specified in mount
+options are created. Once created, the extended attributes are never deleted,
+they're just changed. It means that when your default uid=0 and you type
+something like 'chown luser file; chown root file' the file will contain
+extended attribute UID=0. And when you umount the fs and mount it again with
+uid=luser_uid, the file will be still owned by root! If you chmod file to 444,
+extended attribute "MODE" will not be set, this special case is done by setting
+read-only flag. When you mknod a block or char device, besides "MODE", the
+special 4-byte extended attribute "DEV" will be created containing the device
+number. Currently this driver cannot resize extended attributes - it means
+that if somebody (I don't know who?) has set "UID", "GID", "MODE" or "DEV"
+attributes with different sizes, they won't be rewritten and changing these
+values doesn't work.
+
+
+Symlinks
+
+You can do symlinks on HPFS partition, symlinks are achieved by setting extended
+attribute named "SYMLINK" with symlink value. Like on ext2, you can chown and
+chgrp symlinks but I don't know what is it good for. chmoding symlink results
+in chmoding file where symlink points. These symlinks are just for Linux use and
+incompatible with OS/2. OS/2 PmShell symlinks are not supported because they are
+stored in very crazy way. They tried to do it so that link changes when file is
+moved ... sometimes it works. But the link is partly stored in directory
+extended attributes and partly in OS2SYS.INI. I don't want (and don't know how)
+to analyze or change OS2SYS.INI.
+
+
+Codepages
+
+HPFS can contain several uppercasing tables for several codepages and each
+file has a pointer to codepage it's name is in. However OS/2 was created in
+America where people don't care much about codepages and so multiple codepages
+support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk.
+Once I booted English OS/2 working in cp 850 and I created a file on my 852
+partition. It marked file name codepage as 850 - good. But when I again booted
+Czech OS/2, the file was completely inaccessible under any name. It seems that
+OS/2 uppercases the search pattern with its system code page (852) and file
+name it's comparing to with its code page (850). These could never match. Is it
+really what IBM developers wanted? But problems continued. When I created in
+Czech OS/2 another file in that directory, that file was inaccessible too. OS/2
+probably uses different uppercasing method when searching where to place a file
+(note, that files in HPFS directory must be sorted) and when searching for
+a file. Finally when I opened this directory in PmShell, PmShell crashed (the
+funny thing was that, when rebooted, PmShell tried to reopen this directory
+again :-). chkdsk happily ignores these errors and only low-level disk
+modification saved me. Never mix different language versions of OS/2 on one
+system although HPFS was designed to allow that.
+OK, I could implement complex codepage support to this driver but I think it
+would cause more problems than benefit with such buggy implementation in OS/2.
+So this driver simply uses first codepage it finds for uppercasing and
+lowercasing no matter what's file codepage index. Usually all file names are in
+this codepage - if you don't try to do what I described above :-)
+
+
+Known bugs
+
+HPFS386 on OS/2 server is not supported. HPFS386 installed on normal OS/2 client
+should work. If you have OS/2 server, use only read-only mode. I don't know how
+to handle some HPFS386 structures like access control list or extended perm
+list, I don't know how to delete them when file is deleted and how to not
+overwrite them with extended attributes. Send me some info on these structures
+and I'll make it. However, this driver should detect presence of HPFS386
+structures, remount read-only and not destroy them (I hope).
+
+When there's not enough space for extended attributes, they will be truncated
+and no error is returned.
+
+OS/2 can't access files if the path is longer than about 256 chars but this
+driver allows you to do it. chkdsk ignores such errors.
+
+Sometimes you won't be able to delete some files on a very full filesystem
+(returning error ENOSPC). That's because file in non-leaf node in directory tree
+(one directory, if it's large, has dirents in tree on HPFS) must be replaced
+with another node when deleted. And that new file might have larger name than
+the old one so the new name doesn't fit in directory node (dnode). And that
+would result in directory tree splitting, that takes disk space. Workaround is
+to delete other files that are leaf (probability that the file is non-leaf is
+about 1/50) or to truncate file first to make some space.
+You encounter this problem only if you have many directories so that
+preallocated directory band is full i.e.
+ number_of_directories / size_of_filesystem_in_mb > 4.
+
+You can't delete open directories.
+
+You can't rename over directories (what is it good for?).
+
+Renaming files so that only case changes doesn't work. This driver supports it
+but vfs doesn't. Something like 'mv file FILE' won't work.
+
+All atimes and directory mtimes are not updated. That's because of performance
+reasons. If you extremely wish to update them, let me know, I'll write it (but
+it will be slow).
+
+When the system is out of memory and swap, it may slightly corrupt filesystem
+(lost files, unbalanced directories). (I guess all filesystem may do it).
+
+When compiled, you get warning: function declaration isn't a prototype. Does
+anybody know what does it mean?
+
+
+What does "unbalanced tree" message mean?
+
+Old versions of this driver created sometimes unbalanced dnode trees. OS/2
+chkdsk doesn't scream if the tree is unbalanced (and sometimes creates
+unbalanced trees too :-) but both HPFS and HPFS386 contain bug that it rarely
+crashes when the tree is not balanced. This driver handles unbalanced trees
+correctly and writes warning if it finds them. If you see this message, this is
+probably because of directories created with old version of this driver.
+Workaround is to move all files from that directory to another and then back
+again. Do it in Linux, not OS/2! If you see this message in directory that is
+whole created by this driver, it is BUG - let me know about it.
+
+
+Bugs in OS/2
+
+When you have two (or more) lost directories pointing each to other, chkdsk
+locks up when repairing filesystem.
+
+Sometimes (I think it's random) when you create a file with one-char name under
+OS/2, OS/2 marks it as 'long'. chkdsk then removes this flag saying "Minor fs
+error corrected".
+
+File names like "a .b" are marked as 'long' by OS/2 but chkdsk "corrects" it and
+marks them as short (and writes "minor fs error corrected"). This bug is not in
+HPFS386.
+
+Codepage bugs described above.
+
+If you don't install fixpacks, there are many, many more...
+
+
+History
+
+0.90 First public release
+0.91 Fixed bug that caused shooting to memory when write_inode was called on
+ open inode (rarely happened)
+0.92 Fixed a little memory leak in freeing directory inodes
+0.93 Fixed bug that locked up the machine when there were too many filenames
+ with first 15 characters same
+ Fixed write_file to zero file when writing behind file end
+0.94 Fixed a little memory leak when trying to delete busy file or directory
+0.95 Fixed a bug that i_hpfs_parent_dir was not updated when moving files
+1.90 First version for 2.1.1xx kernels
+1.91 Fixed a bug that chk_sectors failed when sectors were at the end of disk
+ Fixed a race-condition when write_inode is called while deleting file
+ Fixed a bug that could possibly happen (with very low probability) when
+ using 0xff in filenames
+ Rewritten locking to avoid race-conditions
+ Mount option 'eas' now works
+ Fsync no longer returns error
+ Files beginning with '.' are marked hidden
+ Remount support added
+ Alloc is not so slow when filesystem becomes full
+ Atimes are no more updated because it slows down operation
+ Code cleanup (removed all commented debug prints)
+1.92 Corrected a bug when sync was called just before closing file
+1.93 Modified, so that it works with kernels >= 2.1.131, I don't know if it
+ works with previous versions
+ Fixed a possible problem with disks > 64G (but I don't have one, so I can't
+ test it)
+ Fixed a file overflow at 2G
+ Added new option 'timeshift'
+ Changed behaviour on HPFS386: It is now possible to operate on HPFS386 in
+ read-only mode
+ Fixed a bug that slowed down alloc and prevented allocating 100% space
+ (this bug was not destructive)
+1.94 Added workaround for one bug in Linux
+ Fixed one buffer leak
+ Fixed some incompatibilities with large extended attributes (but it's still
+ not 100% ok, I have no info on it and OS/2 doesn't want to create them)
+ Rewritten allocation
+ Fixed a bug with i_blocks (du sometimes didn't display correct values)
+ Directories have no longer archive attribute set (some programs don't like
+ it)
+ Fixed a bug that it set badly one flag in large anode tree (it was not
+ destructive)
+1.95 Fixed one buffer leak, that could happen on corrupted filesystem
+ Fixed one bug in allocation in 1.94
+1.96 Added workaround for one bug in OS/2 (HPFS locked up, HPFS386 reported
+ error sometimes when opening directories in PMSHELL)
+ Fixed a possible bitmap race
+ Fixed possible problem on large disks
+ You can now delete open files
+ Fixed a nondestructive race in rename
+1.97 Support for HPFS v3 (on large partitions)
+ Fixed a bug that it didn't allow creation of files > 128M (it should be 2G)
+1.97.1 Changed names of global symbols
+ Fixed a bug when chmoding or chowning root directory
+1.98 Fixed a deadlock when using old_readdir
+ Better directory handling; workaround for "unbalanced tree" bug in OS/2
+1.99 Corrected a possible problem when there's not enough space while deleting
+ file
+ Now it tries to truncate the file if there's not enough space when deleting
+ Removed a lot of redundant code
+2.00 Fixed a bug in rename (it was there since 1.96)
+ Better anti-fragmentation strategy
+2.01 Fixed problem with directory listing over NFS
+ Directory lseek now checks for proper parameters
+ Fixed race-condition in buffer code - it is in all filesystems in Linux;
+ when reading device (cat /dev/hda) while creating files on it, files
+ could be damaged
+2.02 Workaround for bug in breada in Linux. breada could cause accesses beyond
+ end of partition
+2.03 Char, block devices and pipes are correctly created
+ Fixed non-crashing race in unlink (Alexander Viro)
+ Now it works with Japanese version of OS/2
+2.04 Fixed error when ftruncate used to extend file
+2.05 Fixed crash when got mount parameters without =
+ Fixed crash when allocation of anode failed due to full disk
+ Fixed some crashes when block io or inode allocation failed
+2.06 Fixed some crash on corrupted disk structures
+ Better allocation strategy
+ Reschedule points added so that it doesn't lock CPU long time
+ It should work in read-only mode on Warp Server
+2.07 More fixes for Warp Server. Now it really works
+2.08 Creating new files is not so slow on large disks
+ An attempt to sync deleted file does not generate filesystem error
+2.09 Fixed error on extremely fragmented files
+
+
+ vim: set textwidth=80:
diff --git a/Documentation/filesystems/inotify.txt b/Documentation/filesystems/inotify.txt
new file mode 100644
index 0000000..59a919f
--- /dev/null
+++ b/Documentation/filesystems/inotify.txt
@@ -0,0 +1,269 @@
+ inotify
+ a powerful yet simple file change notification system
+
+
+
+Document started 15 Mar 2005 by Robert Love <rml@novell.com>
+
+
+(i) User Interface
+
+Inotify is controlled by a set of three system calls and normal file I/O on a
+returned file descriptor.
+
+First step in using inotify is to initialise an inotify instance:
+
+ int fd = inotify_init ();
+
+Each instance is associated with a unique, ordered queue.
+
+Change events are managed by "watches". A watch is an (object,mask) pair where
+the object is a file or directory and the mask is a bit mask of one or more
+inotify events that the application wishes to receive. See <linux/inotify.h>
+for valid events. A watch is referenced by a watch descriptor, or wd.
+
+Watches are added via a path to the file.
+
+Watches on a directory will return events on any files inside of the directory.
+
+Adding a watch is simple:
+
+ int wd = inotify_add_watch (fd, path, mask);
+
+Where "fd" is the return value from inotify_init(), path is the path to the
+object to watch, and mask is the watch mask (see <linux/inotify.h>).
+
+You can update an existing watch in the same manner, by passing in a new mask.
+
+An existing watch is removed via
+
+ int ret = inotify_rm_watch (fd, wd);
+
+Events are provided in the form of an inotify_event structure that is read(2)
+from a given inotify instance. The filename is of dynamic length and follows
+the struct. It is of size len. The filename is padded with null bytes to
+ensure proper alignment. This padding is reflected in len.
+
+You can slurp multiple events by passing a large buffer, for example
+
+ size_t len = read (fd, buf, BUF_LEN);
+
+Where "buf" is a pointer to an array of "inotify_event" structures at least
+BUF_LEN bytes in size. The above example will return as many events as are
+available and fit in BUF_LEN.
+
+Each inotify instance fd is also select()- and poll()-able.
+
+You can find the size of the current event queue via the standard FIONREAD
+ioctl on the fd returned by inotify_init().
+
+All watches are destroyed and cleaned up on close.
+
+
+(ii)
+
+Prototypes:
+
+ int inotify_init (void);
+ int inotify_add_watch (int fd, const char *path, __u32 mask);
+ int inotify_rm_watch (int fd, __u32 mask);
+
+
+(iii) Kernel Interface
+
+Inotify's kernel API consists a set of functions for managing watches and an
+event callback.
+
+To use the kernel API, you must first initialize an inotify instance with a set
+of inotify_operations. You are given an opaque inotify_handle, which you use
+for any further calls to inotify.
+
+ struct inotify_handle *ih = inotify_init(my_event_handler);
+
+You must provide a function for processing events and a function for destroying
+the inotify watch.
+
+ void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
+ u32 cookie, const char *name, struct inode *inode)
+
+ watch - the pointer to the inotify_watch that triggered this call
+ wd - the watch descriptor
+ mask - describes the event that occurred
+ cookie - an identifier for synchronizing events
+ name - the dentry name for affected files in a directory-based event
+ inode - the affected inode in a directory-based event
+
+ void destroy_watch(struct inotify_watch *watch)
+
+You may add watches by providing a pre-allocated and initialized inotify_watch
+structure and specifying the inode to watch along with an inotify event mask.
+You must pin the inode during the call. You will likely wish to embed the
+inotify_watch structure in a structure of your own which contains other
+information about the watch. Once you add an inotify watch, it is immediately
+subject to removal depending on filesystem events. You must grab a reference if
+you depend on the watch hanging around after the call.
+
+ inotify_init_watch(&my_watch->iwatch);
+ inotify_get_watch(&my_watch->iwatch); // optional
+ s32 wd = inotify_add_watch(ih, &my_watch->iwatch, inode, mask);
+ inotify_put_watch(&my_watch->iwatch); // optional
+
+You may use the watch descriptor (wd) or the address of the inotify_watch for
+other inotify operations. You must not directly read or manipulate data in the
+inotify_watch. Additionally, you must not call inotify_add_watch() more than
+once for a given inotify_watch structure, unless you have first called either
+inotify_rm_watch() or inotify_rm_wd().
+
+To determine if you have already registered a watch for a given inode, you may
+call inotify_find_watch(), which gives you both the wd and the watch pointer for
+the inotify_watch, or an error if the watch does not exist.
+
+ wd = inotify_find_watch(ih, inode, &watchp);
+
+You may use container_of() on the watch pointer to access your own data
+associated with a given watch. When an existing watch is found,
+inotify_find_watch() bumps the refcount before releasing its locks. You must
+put that reference with:
+
+ put_inotify_watch(watchp);
+
+Call inotify_find_update_watch() to update the event mask for an existing watch.
+inotify_find_update_watch() returns the wd of the updated watch, or an error if
+the watch does not exist.
+
+ wd = inotify_find_update_watch(ih, inode, mask);
+
+An existing watch may be removed by calling either inotify_rm_watch() or
+inotify_rm_wd().
+
+ int ret = inotify_rm_watch(ih, &my_watch->iwatch);
+ int ret = inotify_rm_wd(ih, wd);
+
+A watch may be removed while executing your event handler with the following:
+
+ inotify_remove_watch_locked(ih, iwatch);
+
+Call inotify_destroy() to remove all watches from your inotify instance and
+release it. If there are no outstanding references, inotify_destroy() will call
+your destroy_watch op for each watch.
+
+ inotify_destroy(ih);
+
+When inotify removes a watch, it sends an IN_IGNORED event to your callback.
+You may use this event as an indication to free the watch memory. Note that
+inotify may remove a watch due to filesystem events, as well as by your request.
+If you use IN_ONESHOT, inotify will remove the watch after the first event, at
+which point you may call the final inotify_put_watch.
+
+(iv) Kernel Interface Prototypes
+
+ struct inotify_handle *inotify_init(struct inotify_operations *ops);
+
+ inotify_init_watch(struct inotify_watch *watch);
+
+ s32 inotify_add_watch(struct inotify_handle *ih,
+ struct inotify_watch *watch,
+ struct inode *inode, u32 mask);
+
+ s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+ struct inotify_watch **watchp);
+
+ s32 inotify_find_update_watch(struct inotify_handle *ih,
+ struct inode *inode, u32 mask);
+
+ int inotify_rm_wd(struct inotify_handle *ih, u32 wd);
+
+ int inotify_rm_watch(struct inotify_handle *ih,
+ struct inotify_watch *watch);
+
+ void inotify_remove_watch_locked(struct inotify_handle *ih,
+ struct inotify_watch *watch);
+
+ void inotify_destroy(struct inotify_handle *ih);
+
+ void get_inotify_watch(struct inotify_watch *watch);
+ void put_inotify_watch(struct inotify_watch *watch);
+
+
+(v) Internal Kernel Implementation
+
+Each inotify instance is represented by an inotify_handle structure.
+Inotify's userspace consumers also have an inotify_device which is
+associated with the inotify_handle, and on which events are queued.
+
+Each watch is associated with an inotify_watch structure. Watches are chained
+off of each associated inotify_handle and each associated inode.
+
+See fs/inotify.c and fs/inotify_user.c for the locking and lifetime rules.
+
+
+(vi) Rationale
+
+Q: What is the design decision behind not tying the watch to the open fd of
+ the watched object?
+
+A: Watches are associated with an open inotify device, not an open file.
+ This solves the primary problem with dnotify: keeping the file open pins
+ the file and thus, worse, pins the mount. Dnotify is therefore infeasible
+ for use on a desktop system with removable media as the media cannot be
+ unmounted. Watching a file should not require that it be open.
+
+Q: What is the design decision behind using an-fd-per-instance as opposed to
+ an fd-per-watch?
+
+A: An fd-per-watch quickly consumes more file descriptors than are allowed,
+ more fd's than are feasible to manage, and more fd's than are optimally
+ select()-able. Yes, root can bump the per-process fd limit and yes, users
+ can use epoll, but requiring both is a silly and extraneous requirement.
+ A watch consumes less memory than an open file, separating the number
+ spaces is thus sensible. The current design is what user-space developers
+ want: Users initialize inotify, once, and add n watches, requiring but one
+ fd and no twiddling with fd limits. Initializing an inotify instance two
+ thousand times is silly. If we can implement user-space's preferences
+ cleanly--and we can, the idr layer makes stuff like this trivial--then we
+ should.
+
+ There are other good arguments. With a single fd, there is a single
+ item to block on, which is mapped to a single queue of events. The single
+ fd returns all watch events and also any potential out-of-band data. If
+ every fd was a separate watch,
+
+ - There would be no way to get event ordering. Events on file foo and
+ file bar would pop poll() on both fd's, but there would be no way to tell
+ which happened first. A single queue trivially gives you ordering. Such
+ ordering is crucial to existing applications such as Beagle. Imagine
+ "mv a b ; mv b a" events without ordering.
+
+ - We'd have to maintain n fd's and n internal queues with state,
+ versus just one. It is a lot messier in the kernel. A single, linear
+ queue is the data structure that makes sense.
+
+ - User-space developers prefer the current API. The Beagle guys, for
+ example, love it. Trust me, I asked. It is not a surprise: Who'd want
+ to manage and block on 1000 fd's via select?
+
+ - No way to get out of band data.
+
+ - 1024 is still too low. ;-)
+
+ When you talk about designing a file change notification system that
+ scales to 1000s of directories, juggling 1000s of fd's just does not seem
+ the right interface. It is too heavy.
+
+ Additionally, it _is_ possible to more than one instance and
+ juggle more than one queue and thus more than one associated fd. There
+ need not be a one-fd-per-process mapping; it is one-fd-per-queue and a
+ process can easily want more than one queue.
+
+Q: Why the system call approach?
+
+A: The poor user-space interface is the second biggest problem with dnotify.
+ Signals are a terrible, terrible interface for file notification. Or for
+ anything, for that matter. The ideal solution, from all perspectives, is a
+ file descriptor-based one that allows basic file I/O and poll/select.
+ Obtaining the fd and managing the watches could have been done either via a
+ device file or a family of new system calls. We decided to implement a
+ family of system calls because that is the preferred approach for new kernel
+ interfaces. The only real difference was whether we wanted to use open(2)
+ and ioctl(2) or a couple of new system calls. System calls beat ioctls.
+
diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt
new file mode 100644
index 0000000..6973b98
--- /dev/null
+++ b/Documentation/filesystems/isofs.txt
@@ -0,0 +1,43 @@
+Mount options that are the same as for msdos and vfat partitions.
+
+ gid=nnn All files in the partition will be in group nnn.
+ uid=nnn All files in the partition will be owned by user id nnn.
+ umask=nnn The permission mask (see umask(1)) for the partition.
+
+Mount options that are the same as vfat partitions. These are only useful
+when using discs encoded using Microsoft's Joliet extensions.
+ iocharset=name Character set to use for converting from Unicode to
+ ASCII. Joliet filenames are stored in Unicode format, but
+ Unix for the most part doesn't know how to deal with Unicode.
+ There is also an option of doing UTF-8 translations with the
+ utf8 option.
+ utf8 Encode Unicode names in UTF-8 format. Default is no.
+
+Mount options unique to the isofs filesystem.
+ block=512 Set the block size for the disk to 512 bytes
+ block=1024 Set the block size for the disk to 1024 bytes
+ block=2048 Set the block size for the disk to 2048 bytes
+ check=relaxed Matches filenames with different cases
+ check=strict Matches only filenames with the exact same case
+ cruft Try to handle badly formatted CDs.
+ map=off Do not map non-Rock Ridge filenames to lower case
+ map=normal Map non-Rock Ridge filenames to lower case
+ map=acorn As map=normal but also apply Acorn extensions if present
+ mode=xxx Sets the permissions on files to xxx
+ dmode=xxx Sets the permissions on directories to xxx
+ nojoliet Ignore Joliet extensions if they are present.
+ norock Ignore Rock Ridge extensions if they are present.
+ hide Completely strip hidden files from the file system.
+ showassoc Show files marked with the 'associated' bit
+ unhide Deprecated; showing hidden files is now default;
+ If given, it is a synonym for 'showassoc' which will
+ recreate previous unhide behavior
+ session=x Select number of session on multisession CD
+ sbsector=xxx Session begins from sector xxx
+
+Recommended documents about ISO 9660 standard are located at:
+http://www.y-adagio.com/public/standards/iso_cdromr/tocont.htm
+ftp://ftp.ecma.ch/ecma-st/Ecma-119.pdf
+Quoting from the PDF "This 2nd Edition of Standard ECMA-119 is technically
+identical with ISO 9660.", so it is a valid and gratis substitute of the
+official ISO specification.
diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt
new file mode 100644
index 0000000..26ebde7
--- /dev/null
+++ b/Documentation/filesystems/jfs.txt
@@ -0,0 +1,41 @@
+IBM's Journaled File System (JFS) for Linux
+
+JFS Homepage: http://jfs.sourceforge.net/
+
+The following mount options are supported:
+
+iocharset=name Character set to use for converting from Unicode to
+ ASCII. The default is to do no conversion. Use
+ iocharset=utf8 for UTF-8 translations. This requires
+ CONFIG_NLS_UTF8 to be set in the kernel .config file.
+ iocharset=none specifies the default behavior explicitly.
+
+resize=value Resize the volume to <value> blocks. JFS only supports
+ growing a volume, not shrinking it. This option is only
+ valid during a remount, when the volume is mounted
+ read-write. The resize keyword with no value will grow
+ the volume to the full size of the partition.
+
+nointegrity Do not write to the journal. The primary use of this option
+ is to allow for higher performance when restoring a volume
+ from backup media. The integrity of the volume is not
+ guaranteed if the system abnormally abends.
+
+integrity Default. Commit metadata changes to the journal. Use this
+ option to remount a volume where the nointegrity option was
+ previously specified in order to restore normal behavior.
+
+errors=continue Keep going on a filesystem error.
+errors=remount-ro Default. Remount the filesystem read-only on an error.
+errors=panic Panic and halt the machine if an error occurs.
+
+uid=value Override on-disk uid with specified value
+gid=value Override on-disk gid with specified value
+umask=value Override on-disk umask with specified octal value. For
+ directories, the execute bit will be set if the corresponding
+ read bit is set.
+
+Please send bugs, comments, cards and letters to shaggy@linux.vnet.ibm.com.
+
+The JFS mailing list can be subscribed to by using the link labeled
+"Mail list Subscribe" at our web page http://jfs.sourceforge.net/
diff --git a/Documentation/filesystems/locks.txt b/Documentation/filesystems/locks.txt
new file mode 100644
index 0000000..fab857a
--- /dev/null
+++ b/Documentation/filesystems/locks.txt
@@ -0,0 +1,67 @@
+ File Locking Release Notes
+
+ Andy Walker <andy@lysaker.kvaerner.no>
+
+ 12 May 1997
+
+
+1. What's New?
+--------------
+
+1.1 Broken Flock Emulation
+--------------------------
+
+The old flock(2) emulation in the kernel was swapped for proper BSD
+compatible flock(2) support in the 1.3.x series of kernels. With the
+release of the 2.1.x kernel series, support for the old emulation has
+been totally removed, so that we don't need to carry this baggage
+forever.
+
+This should not cause problems for anybody, since everybody using a
+2.1.x kernel should have updated their C library to a suitable version
+anyway (see the file "Documentation/Changes".)
+
+1.2 Allow Mixed Locks Again
+---------------------------
+
+1.2.1 Typical Problems - Sendmail
+---------------------------------
+Because sendmail was unable to use the old flock() emulation, many sendmail
+installations use fcntl() instead of flock(). This is true of Slackware 3.0
+for example. This gave rise to some other subtle problems if sendmail was
+configured to rebuild the alias file. Sendmail tried to lock the aliases.dir
+file with fcntl() at the same time as the GDBM routines tried to lock this
+file with flock(). With pre 1.3.96 kernels this could result in deadlocks that,
+over time, or under a very heavy mail load, would eventually cause the kernel
+to lock solid with deadlocked processes.
+
+
+1.2.2 The Solution
+------------------
+The solution I have chosen, after much experimentation and discussion,
+is to make flock() and fcntl() locks oblivious to each other. Both can
+exists, and neither will have any effect on the other.
+
+I wanted the two lock styles to be cooperative, but there were so many
+race and deadlock conditions that the current solution was the only
+practical one. It puts us in the same position as, for example, SunOS
+4.1.x and several other commercial Unices. The only OS's that support
+cooperative flock()/fcntl() are those that emulate flock() using
+fcntl(), with all the problems that implies.
+
+
+1.3 Mandatory Locking As A Mount Option
+---------------------------------------
+
+Mandatory locking, as described in 'Documentation/filesystems/mandatory.txt'
+was prior to this release a general configuration option that was valid for
+all mounted filesystems. This had a number of inherent dangers, not the
+least of which was the ability to freeze an NFS server by asking it to read
+a file for which a mandatory lock existed.
+
+From this release of the kernel, mandatory locking can be turned on and off
+on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
+The default is to disallow mandatory locking. The intention is that
+mandatory locking only be enabled on a local filesystem as the specific need
+arises.
+
diff --git a/Documentation/filesystems/mandatory-locking.txt b/Documentation/filesystems/mandatory-locking.txt
new file mode 100644
index 0000000..0979d1d
--- /dev/null
+++ b/Documentation/filesystems/mandatory-locking.txt
@@ -0,0 +1,171 @@
+ Mandatory File Locking For The Linux Operating System
+
+ Andy Walker <andy@lysaker.kvaerner.no>
+
+ 15 April 1996
+ (Updated September 2007)
+
+0. Why you should avoid mandatory locking
+-----------------------------------------
+
+The Linux implementation is prey to a number of difficult-to-fix race
+conditions which in practice make it not dependable:
+
+ - The write system call checks for a mandatory lock only once
+ at its start. It is therefore possible for a lock request to
+ be granted after this check but before the data is modified.
+ A process may then see file data change even while a mandatory
+ lock was held.
+ - Similarly, an exclusive lock may be granted on a file after
+ the kernel has decided to proceed with a read, but before the
+ read has actually completed, and the reading process may see
+ the file data in a state which should not have been visible
+ to it.
+ - Similar races make the claimed mutual exclusion between lock
+ and mmap similarly unreliable.
+
+1. What is mandatory locking?
+------------------------------
+
+Mandatory locking is kernel enforced file locking, as opposed to the more usual
+cooperative file locking used to guarantee sequential access to files among
+processes. File locks are applied using the flock() and fcntl() system calls
+(and the lockf() library routine which is a wrapper around fcntl().) It is
+normally a process' responsibility to check for locks on a file it wishes to
+update, before applying its own lock, updating the file and unlocking it again.
+The most commonly used example of this (and in the case of sendmail, the most
+troublesome) is access to a user's mailbox. The mail user agent and the mail
+transfer agent must guard against updating the mailbox at the same time, and
+prevent reading the mailbox while it is being updated.
+
+In a perfect world all processes would use and honour a cooperative, or
+"advisory" locking scheme. However, the world isn't perfect, and there's
+a lot of poorly written code out there.
+
+In trying to address this problem, the designers of System V UNIX came up
+with a "mandatory" locking scheme, whereby the operating system kernel would
+block attempts by a process to write to a file that another process holds a
+"read" -or- "shared" lock on, and block attempts to both read and write to a
+file that a process holds a "write " -or- "exclusive" lock on.
+
+The System V mandatory locking scheme was intended to have as little impact as
+possible on existing user code. The scheme is based on marking individual files
+as candidates for mandatory locking, and using the existing fcntl()/lockf()
+interface for applying locks just as if they were normal, advisory locks.
+
+Note 1: In saying "file" in the paragraphs above I am actually not telling
+the whole truth. System V locking is based on fcntl(). The granularity of
+fcntl() is such that it allows the locking of byte ranges in files, in addition
+to entire files, so the mandatory locking rules also have byte level
+granularity.
+
+Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite
+borrowing the fcntl() locking scheme from System V. The mandatory locking
+scheme is defined by the System V Interface Definition (SVID) Version 3.
+
+2. Marking a file for mandatory locking
+---------------------------------------
+
+A file is marked as a candidate for mandatory locking by setting the group-id
+bit in its file mode but removing the group-execute bit. This is an otherwise
+meaningless combination, and was chosen by the System V implementors so as not
+to break existing user programs.
+
+Note that the group-id bit is usually automatically cleared by the kernel when
+a setgid file is written to. This is a security measure. The kernel has been
+modified to recognize the special case of a mandatory lock candidate and to
+refrain from clearing this bit. Similarly the kernel has been modified not
+to run mandatory lock candidates with setgid privileges.
+
+3. Available implementations
+----------------------------
+
+I have considered the implementations of mandatory locking available with
+SunOS 4.1.x, Solaris 2.x and HP-UX 9.x.
+
+Generally I have tried to make the most sense out of the behaviour exhibited
+by these three reference systems. There are many anomalies.
+
+All the reference systems reject all calls to open() for a file on which
+another process has outstanding mandatory locks. This is in direct
+contravention of SVID 3, which states that only calls to open() with the
+O_TRUNC flag set should be rejected. The Linux implementation follows the SVID
+definition, which is the "Right Thing", since only calls with O_TRUNC can
+modify the contents of the file.
+
+HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not
+just mandatory locks. That would appear to contravene POSIX.1.
+
+mmap() is another interesting case. All the operating systems mentioned
+prevent mandatory locks from being applied to an mmap()'ed file, but HP-UX
+also disallows advisory locks for such a file. SVID actually specifies the
+paranoid HP-UX behaviour.
+
+In my opinion only MAP_SHARED mappings should be immune from locking, and then
+only from mandatory locks - that is what is currently implemented.
+
+SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for
+mandatory locks, so reads and writes to locked files always block when they
+should return EAGAIN.
+
+I'm afraid that this is such an esoteric area that the semantics described
+below are just as valid as any others, so long as the main points seem to
+agree.
+
+4. Semantics
+------------
+
+1. Mandatory locks can only be applied via the fcntl()/lockf() locking
+ interface - in other words the System V/POSIX interface. BSD style
+ locks using flock() never result in a mandatory lock.
+
+2. If a process has locked a region of a file with a mandatory read lock, then
+ other processes are permitted to read from that region. If any of these
+ processes attempts to write to the region it will block until the lock is
+ released, unless the process has opened the file with the O_NONBLOCK
+ flag in which case the system call will return immediately with the error
+ status EAGAIN.
+
+3. If a process has locked a region of a file with a mandatory write lock, all
+ attempts to read or write to that region block until the lock is released,
+ unless a process has opened the file with the O_NONBLOCK flag in which case
+ the system call will return immediately with the error status EAGAIN.
+
+4. Calls to open() with O_TRUNC, or to creat(), on a existing file that has
+ any mandatory locks owned by other processes will be rejected with the
+ error status EAGAIN.
+
+5. Attempts to apply a mandatory lock to a file that is memory mapped and
+ shared (via mmap() with MAP_SHARED) will be rejected with the error status
+ EAGAIN.
+
+6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED)
+ that has any mandatory locks in effect will be rejected with the error status
+ EAGAIN.
+
+5. Which system calls are affected?
+-----------------------------------
+
+Those which modify a file's contents, not just the inode. That gives read(),
+write(), readv(), writev(), open(), creat(), mmap(), truncate() and
+ftruncate(). truncate() and ftruncate() are considered to be "write" actions
+for the purposes of mandatory locking.
+
+The affected region is usually defined as stretching from the current position
+for the total number of bytes read or written. For the truncate calls it is
+defined as the bytes of a file removed or added (we must also consider bytes
+added, as a lock can specify just "the whole file", rather than a specific
+range of bytes.)
+
+Note 3: I may have overlooked some system calls that need mandatory lock
+checking in my eagerness to get this code out the door. Please let me know, or
+better still fix the system calls yourself and submit a patch to me or Linus.
+
+6. Warning!
+-----------
+
+Not even root can override a mandatory lock, so runaway processes can wreak
+havoc if they lock crucial files. The way around it is to change the file
+permissions (remove the setgid bit) before trying to read or write to it.
+Of course, that might be a bit tricky if the system is hung :-(
+
diff --git a/Documentation/filesystems/ncpfs.txt b/Documentation/filesystems/ncpfs.txt
new file mode 100644
index 0000000..f12c30c
--- /dev/null
+++ b/Documentation/filesystems/ncpfs.txt
@@ -0,0 +1,12 @@
+The ncpfs filesystem understands the NCP protocol, designed by the
+Novell Corporation for their NetWare(tm) product. NCP is functionally
+similar to the NFS used in the TCP/IP community.
+To mount a NetWare filesystem, you need a special mount program, which
+can be found in the ncpfs package. The home site for ncpfs is
+ftp.gwdg.de/pub/linux/misc/ncpfs, but sunsite and its many mirrors
+will have it as well.
+
+Related products are linware and mars_nwe, which will give Linux partial
+NetWare server functionality. Linware's home site is
+klokan.sh.cvut.cz/pub/linux/linware; mars_nwe can be found on
+ftp.gwdg.de/pub/linux/misc/ncpfs.
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt
new file mode 100644
index 0000000..44bd766
--- /dev/null
+++ b/Documentation/filesystems/nfs-rdma.txt
@@ -0,0 +1,271 @@
+################################################################################
+# #
+# NFS/RDMA README #
+# #
+################################################################################
+
+ Author: NetApp and Open Grid Computing
+ Date: May 29, 2008
+
+Table of Contents
+~~~~~~~~~~~~~~~~~
+ - Overview
+ - Getting Help
+ - Installation
+ - Check RDMA and NFS Setup
+ - NFS/RDMA Setup
+
+Overview
+~~~~~~~~
+
+ This document describes how to install and setup the Linux NFS/RDMA client
+ and server software.
+
+ The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server
+ was first included in the following release, Linux 2.6.25.
+
+ In our testing, we have obtained excellent performance results (full 10Gbit
+ wire bandwidth at minimal client CPU) under many workloads. The code passes
+ the full Connectathon test suite and operates over both Infiniband and iWARP
+ RDMA adapters.
+
+Getting Help
+~~~~~~~~~~~~
+
+ If you get stuck, you can ask questions on the
+
+ nfs-rdma-devel@lists.sourceforge.net
+
+ mailing list.
+
+Installation
+~~~~~~~~~~~~
+
+ These instructions are a step by step guide to building a machine for
+ use with NFS/RDMA.
+
+ - Install an RDMA device
+
+ Any device supported by the drivers in drivers/infiniband/hw is acceptable.
+
+ Testing has been performed using several Mellanox-based IB cards, the
+ Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter.
+
+ - Install a Linux distribution and tools
+
+ The first kernel release to contain both the NFS/RDMA client and server was
+ Linux 2.6.25 Therefore, a distribution compatible with this and subsequent
+ Linux kernel release should be installed.
+
+ The procedures described in this document have been tested with
+ distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
+
+ - Install nfs-utils-1.1.2 or greater on the client
+
+ An NFS/RDMA mount point can be obtained by using the mount.nfs command in
+ nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
+ version with support for NFS/RDMA mounts, but for various reasons we
+ recommend using nfs-utils-1.1.2 or greater). To see which version of
+ mount.nfs you are using, type:
+
+ $ /sbin/mount.nfs -V
+
+ If the version is less than 1.1.2 or the command does not exist,
+ you should install the latest version of nfs-utils.
+
+ Download the latest package from:
+
+ http://www.kernel.org/pub/linux/utils/nfs
+
+ Uncompress the package and follow the installation instructions.
+
+ If you will not need the idmapper and gssd executables (you do not need
+ these to create an NFS/RDMA enabled mount command), the installation
+ process can be simplified by disabling these features when running
+ configure:
+
+ $ ./configure --disable-gss --disable-nfsv4
+
+ To build nfs-utils you will need the tcp_wrappers package installed. For
+ more information on this see the package's README and INSTALL files.
+
+ After building the nfs-utils package, there will be a mount.nfs binary in
+ the utils/mount directory. This binary can be used to initiate NFS v2, v3,
+ or v4 mounts. To initiate a v4 mount, the binary must be called
+ mount.nfs4. The standard technique is to create a symlink called
+ mount.nfs4 to mount.nfs.
+
+ This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
+
+ $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
+
+ In this location, mount.nfs will be invoked automatically for NFS mounts
+ by the system mount commmand.
+
+ NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
+ on the NFS client machine. You do not need this specific version of
+ nfs-utils on the server. Furthermore, only the mount.nfs command from
+ nfs-utils-1.1.2 is needed on the client.
+
+ - Install a Linux kernel with NFS/RDMA
+
+ The NFS/RDMA client and server are both included in the mainline Linux
+ kernel version 2.6.25 and later. This and other versions of the 2.6 Linux
+ kernel can be found at:
+
+ ftp://ftp.kernel.org/pub/linux/kernel/v2.6/
+
+ Download the sources and place them in an appropriate location.
+
+ - Configure the RDMA stack
+
+ Make sure your kernel configuration has RDMA support enabled. Under
+ Device Drivers -> InfiniBand support, update the kernel configuration
+ to enable InfiniBand support [NOTE: the option name is misleading. Enabling
+ InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)].
+
+ Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) or
+ iWARP adapter support (amso, cxgb3, etc.).
+
+ If you are using InfiniBand, be sure to enable IP-over-InfiniBand support.
+
+ - Configure the NFS client and server
+
+ Your kernel configuration must also have NFS file system support and/or
+ NFS server support enabled. These and other NFS related configuration
+ options can be found under File Systems -> Network File Systems.
+
+ - Build, install, reboot
+
+ The NFS/RDMA code will be enabled automatically if NFS and RDMA
+ are turned on. The NFS/RDMA client and server are configured via the hidden
+ SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
+ value of SUNRPC_XPRT_RDMA will be:
+
+ - N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
+ and server will not be built
+ - M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M,
+ in this case the NFS/RDMA client and server will be built as modules
+ - Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client
+ and server will be built into the kernel
+
+ Therefore, if you have followed the steps above and turned no NFS and RDMA,
+ the NFS/RDMA client and server will be built.
+
+ Build a new kernel, install it, boot it.
+
+Check RDMA and NFS Setup
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Before configuring the NFS/RDMA software, it is a good idea to test
+ your new kernel to ensure that the kernel is working correctly.
+ In particular, it is a good idea to verify that the RDMA stack
+ is functioning as expected and standard NFS over TCP/IP and/or UDP/IP
+ is working properly.
+
+ - Check RDMA Setup
+
+ If you built the RDMA components as modules, load them at
+ this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
+ card:
+
+ $ modprobe ib_mthca
+ $ modprobe ib_ipoib
+
+ If you are using InfiniBand, make sure there is a Subnet Manager (SM)
+ running on the network. If your IB switch has an embedded SM, you can
+ use it. Otherwise, you will need to run an SM, such as OpenSM, on one
+ of your end nodes.
+
+ If an SM is running on your network, you should see the following:
+
+ $ cat /sys/class/infiniband/driverX/ports/1/state
+ 4: ACTIVE
+
+ where driverX is mthca0, ipath5, ehca3, etc.
+
+ To further test the InfiniBand software stack, use IPoIB (this
+ assumes you have two IB hosts named host1 and host2):
+
+ host1$ ifconfig ib0 a.b.c.x
+ host2$ ifconfig ib0 a.b.c.y
+ host1$ ping a.b.c.y
+ host2$ ping a.b.c.x
+
+ For other device types, follow the appropriate procedures.
+
+ - Check NFS Setup
+
+ For the NFS components enabled above (client and/or server),
+ test their functionality over standard Ethernet using TCP/IP or UDP/IP.
+
+NFS/RDMA Setup
+~~~~~~~~~~~~~~
+
+ We recommend that you use two machines, one to act as the client and
+ one to act as the server.
+
+ One time configuration:
+
+ - On the server system, configure the /etc/exports file and
+ start the NFS/RDMA server.
+
+ Exports entries with the following formats have been tested:
+
+ /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
+ /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
+
+ The IP address(es) is(are) the client's IPoIB address for an InfiniBand
+ HCA or the cleint's iWARP address(es) for an RNIC.
+
+ NOTE: The "insecure" option must be used because the NFS/RDMA client does
+ not use a reserved port.
+
+ Each time a machine boots:
+
+ - Load and configure the RDMA drivers
+
+ For InfiniBand using a Mellanox adapter:
+
+ $ modprobe ib_mthca
+ $ modprobe ib_ipoib
+ $ ifconfig ib0 a.b.c.d
+
+ NOTE: use unique addresses for the client and server
+
+ - Start the NFS server
+
+ If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+ kernel config), load the RDMA transport module:
+
+ $ modprobe svcrdma
+
+ Regardless of how the server was built (module or built-in), start the
+ server:
+
+ $ /etc/init.d/nfs start
+
+ or
+
+ $ service nfs start
+
+ Instruct the server to listen on the RDMA transport:
+
+ $ echo rdma 2050 > /proc/fs/nfsd/portlist
+
+ - On the client system
+
+ If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+ kernel config), load the RDMA client module:
+
+ $ modprobe xprtrdma.ko
+
+ Regardless of how the client was built (module or built-in), use this
+ command to mount the NFS/RDMA server:
+
+ $ mount -o rdma,port=2050 <IPoIB-server-name-or-address>:/<export> /mnt
+
+ To verify that the mount is using RDMA, run "cat /proc/mounts" and check
+ the "proto" field for the given mount.
+
+ Congratulations! You're using NFS/RDMA!
diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfsroot.txt
new file mode 100644
index 0000000..68baddf
--- /dev/null
+++ b/Documentation/filesystems/nfsroot.txt
@@ -0,0 +1,270 @@
+Mounting the root filesystem via NFS (nfsroot)
+===============================================
+
+Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
+Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
+Updated 2006 by Horms <horms@verge.net.au>
+
+
+
+In order to use a diskless system, such as an X-terminal or printer server
+for example, it is necessary for the root filesystem to be present on a
+non-disk device. This may be an initramfs (see Documentation/filesystems/
+ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/initrd.txt) or a
+filesystem mounted via NFS. The following text describes on how to use NFS
+for the root filesystem. For the rest of this text 'client' means the
+diskless system, and 'server' means the NFS server.
+
+
+
+
+1.) Enabling nfsroot capabilities
+ -----------------------------
+
+In order to use nfsroot, NFS client support needs to be selected as
+built-in during configuration. Once this has been selected, the nfsroot
+option will become available, which should also be selected.
+
+In the networking options, kernel level autoconfiguration can be selected,
+along with the types of autoconfiguration to support. Selecting all of
+DHCP, BOOTP and RARP is safe.
+
+
+
+
+2.) Kernel command line
+ -------------------
+
+When the kernel has been loaded by a boot loader (see below) it needs to be
+told what root fs device to use. And in the case of nfsroot, where to find
+both the server and the name of the directory on the server to mount as root.
+This can be established using the following kernel command line parameters:
+
+
+root=/dev/nfs
+
+ This is necessary to enable the pseudo-NFS-device. Note that it's not a
+ real device but just a synonym to tell the kernel to use NFS instead of
+ a real device.
+
+
+nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+
+ If the `nfsroot' parameter is NOT given on the command line,
+ the default "/tftpboot/%s" will be used.
+
+ <server-ip> Specifies the IP address of the NFS server.
+ The default address is determined by the `ip' parameter
+ (see below). This parameter allows the use of different
+ servers for IP autoconfiguration and NFS.
+
+ <root-dir> Name of the directory on the server to mount as root.
+ If there is a "%s" token in the string, it will be
+ replaced by the ASCII-representation of the client's
+ IP address.
+
+ <nfs-options> Standard NFS options. All options are separated by commas.
+ The following defaults are used:
+ port = as given by server portmap daemon
+ rsize = 4096
+ wsize = 4096
+ timeo = 7
+ retrans = 3
+ acregmin = 3
+ acregmax = 60
+ acdirmin = 30
+ acdirmax = 60
+ flags = hard, nointr, noposix, cto, ac
+
+
+ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
+
+ This parameter tells the kernel how to configure IP addresses of devices
+ and also how to set up the IP routing table. It was originally called
+ `nfsaddrs', but now the boot-time IP configuration works independently of
+ NFS, so it was renamed to `ip' and the old name remained as an alias for
+ compatibility reasons.
+
+ If this parameter is missing from the kernel command line, all fields are
+ assumed to be empty, and the defaults mentioned below apply. In general
+ this means that the kernel tries to configure everything using
+ autoconfiguration.
+
+ The <autoconf> parameter can appear alone as the value to the `ip'
+ parameter (without all the ':' characters before). If the value is
+ "ip=off" or "ip=none", no autoconfiguration will take place, otherwise
+ autoconfiguration will take place. The most common way to use this
+ is "ip=dhcp".
+
+ <client-ip> IP address of the client.
+
+ Default: Determined using autoconfiguration.
+
+ <server-ip> IP address of the NFS server. If RARP is used to determine
+ the client address and this parameter is NOT empty only
+ replies from the specified server are accepted.
+
+ Only required for for NFS root. That is autoconfiguration
+ will not be triggered if it is missing and NFS root is not
+ in operation.
+
+ Default: Determined using autoconfiguration.
+ The address of the autoconfiguration server is used.
+
+ <gw-ip> IP address of a gateway if the server is on a different subnet.
+
+ Default: Determined using autoconfiguration.
+
+ <netmask> Netmask for local network interface. If unspecified
+ the netmask is derived from the client IP address assuming
+ classful addressing.
+
+ Default: Determined using autoconfiguration.
+
+ <hostname> Name of the client. May be supplied by autoconfiguration,
+ but its absence will not trigger autoconfiguration.
+
+ Default: Client IP address is used in ASCII notation.
+
+ <device> Name of network device to use.
+
+ Default: If the host only has one device, it is used.
+ Otherwise the device is determined using
+ autoconfiguration. This is done by sending
+ autoconfiguration requests out of all devices,
+ and using the device that received the first reply.
+
+ <autoconf> Method to use for autoconfiguration. In the case of options
+ which specify multiple autoconfiguration protocols,
+ requests are sent using all protocols, and the first one
+ to reply is used.
+
+ Only autoconfiguration protocols that have been compiled
+ into the kernel will be used, regardless of the value of
+ this option.
+
+ off or none: don't use autoconfiguration
+ (do static IP assignment instead)
+ on or any: use any protocol available in the kernel
+ (default)
+ dhcp: use DHCP
+ bootp: use BOOTP
+ rarp: use RARP
+ both: use both BOOTP and RARP but not DHCP
+ (old option kept for backwards compatibility)
+
+ Default: any
+
+
+
+
+3.) Boot Loader
+ ----------
+
+To get the kernel into memory different approaches can be used.
+They depend on various facilities being available:
+
+
+3.1) Booting from a floppy using syslinux
+
+ When building kernels, an easy way to create a boot floppy that uses
+ syslinux is to use the zdisk or bzdisk make targets which use zimage
+ and bzimage images respectively. Both targets accept the
+ FDARGS parameter which can be used to set the kernel command line.
+
+ e.g.
+ make bzdisk FDARGS="root=/dev/nfs"
+
+ Note that the user running this command will need to have
+ access to the floppy drive device, /dev/fd0
+
+ For more information on syslinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+ N.B: Previously it was possible to write a kernel directly to
+ a floppy using dd, configure the boot device using rdev, and
+ boot using the resulting floppy. Linux no longer supports this
+ method of booting.
+
+3.2) Booting from a cdrom using isolinux
+
+ When building kernels, an easy way to create a bootable cdrom that
+ uses isolinux is to use the isoimage target which uses a bzimage
+ image. Like zdisk and bzdisk, this target accepts the FDARGS
+ parameter which can be used to set the kernel command line.
+
+ e.g.
+ make isoimage FDARGS="root=/dev/nfs"
+
+ The resulting iso image will be arch/<ARCH>/boot/image.iso
+ This can be written to a cdrom using a variety of tools including
+ cdrecord.
+
+ e.g.
+ cdrecord dev=ATAPI:1,0,0 arch/i386/boot/image.iso
+
+ For more information on isolinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+3.2) Using LILO
+ When using LILO all the necessary command line parameters may be
+ specified using the 'append=' directive in the LILO configuration
+ file.
+
+ However, to use the 'root=' directive you also need to create
+ a dummy root device, which may be removed after LILO is run.
+
+ mknod /dev/boot255 c 0 255
+
+ For information on configuring LILO, please refer to its documentation.
+
+3.3) Using GRUB
+ When using GRUB, kernel parameter are simply appended after the kernel
+ specification: kernel <kernel> <parameters>
+
+3.4) Using loadlin
+ loadlin may be used to boot Linux from a DOS command prompt without
+ requiring a local hard disk to mount as root. This has not been
+ thoroughly tested by the authors of this document, but in general
+ it should be possible configure the kernel command line similarly
+ to the configuration of LILO.
+
+ Please refer to the loadlin documentation for further information.
+
+3.5) Using a boot ROM
+ This is probably the most elegant way of booting a diskless client.
+ With a boot ROM the kernel is loaded using the TFTP protocol. The
+ authors of this document are not aware of any no commercial boot
+ ROMs that support booting Linux over the network. However, there
+ are two free implementations of a boot ROM, netboot-nfs and
+ etherboot, both of which are available on sunsite.unc.edu, and both
+ of which contain everything you need to boot a diskless Linux client.
+
+3.6) Using pxelinux
+ Pxelinux may be used to boot linux using the PXE boot loader
+ which is present on many modern network cards.
+
+ When using pxelinux, the kernel image is specified using
+ "kernel <relative-path-below /tftpboot>". The nfsroot parameters
+ are passed to the kernel by adding them to the "append" line.
+ It is common to use serial console in conjunction with pxeliunx,
+ see Documentation/serial-console.txt for more information.
+
+ For more information on isolinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+
+
+
+4.) Credits
+ -------
+
+ The nfsroot code in the kernel and the RARP support have been written
+ by Gero Kuhlmann <gero@gkminix.han.de>.
+
+ The rest of the IP layer autoconfiguration code has been written
+ by Martin Mares <mj@atrey.karlin.mff.cuni.cz>.
+
+ In order to write the initial version of nfsroot I would like to thank
+ Jens-Uwe Mager <jum@anubis.han.de> for his help.
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
new file mode 100644
index 0000000..ac2a261
--- /dev/null
+++ b/Documentation/filesystems/ntfs.txt
@@ -0,0 +1,716 @@
+The Linux NTFS filesystem driver
+================================
+
+
+Table of contents
+=================
+
+- Overview
+- Web site
+- Features
+- Supported mount options
+- Known bugs and (mis-)features
+- Using NTFS volume and stripe sets
+ - The Device-Mapper driver
+ - The Software RAID / MD driver
+ - Limitations when using the MD driver
+- ChangeLog
+
+
+Overview
+========
+
+Linux-NTFS comes with a number of user-space programs known as ntfsprogs.
+These include mkntfs, a full-featured ntfs filesystem format utility,
+ntfsundelete used for recovering files that were unintentionally deleted
+from an NTFS volume and ntfsresize which is used to resize an NTFS partition.
+See the web site for more information.
+
+To mount an NTFS 1.2/3.x (Windows NT4/2000/XP/2003) volume, use the file
+system type 'ntfs'. The driver currently supports read-only mode (with no
+fault-tolerance, encryption or journalling) and very limited, but safe, write
+support.
+
+For fault tolerance and raid support (i.e. volume and stripe sets), you can
+use the kernel's Software RAID / MD driver. See section "Using Software RAID
+with NTFS" for details.
+
+
+Web site
+========
+
+There is plenty of additional information on the linux-ntfs web site
+at http://www.linux-ntfs.org/
+
+The web site has a lot of additional information, such as a comprehensive
+FAQ, documentation on the NTFS on-disk format, information on the Linux-NTFS
+userspace utilities, etc.
+
+
+Features
+========
+
+- This is a complete rewrite of the NTFS driver that used to be in the 2.4 and
+ earlier kernels. This new driver implements NTFS read support and is
+ functionally equivalent to the old ntfs driver and it also implements limited
+ write support. The biggest limitation at present is that files/directories
+ cannot be created or deleted. See below for the list of write features that
+ are so far supported. Another limitation is that writing to compressed files
+ is not implemented at all. Also, neither read nor write access to encrypted
+ files is so far implemented.
+- The new driver has full support for sparse files on NTFS 3.x volumes which
+ the old driver isn't happy with.
+- The new driver supports execution of binaries due to mmap() now being
+ supported.
+- The new driver supports loopback mounting of files on NTFS which is used by
+ some Linux distributions to enable the user to run Linux from an NTFS
+ partition by creating a large file while in Windows and then loopback
+ mounting the file while in Linux and creating a Linux filesystem on it that
+ is used to install Linux on it.
+- A comparison of the two drivers using:
+ time find . -type f -exec md5sum "{}" \;
+ run three times in sequence with each driver (after a reboot) on a 1.4GiB
+ NTFS partition, showed the new driver to be 20% faster in total time elapsed
+ (from 9:43 minutes on average down to 7:53). The time spent in user space
+ was unchanged but the time spent in the kernel was decreased by a factor of
+ 2.5 (from 85 CPU seconds down to 33).
+- The driver does not support short file names in general. For backwards
+ compatibility, we implement access to files using their short file names if
+ they exist. The driver will not create short file names however, and a
+ rename will discard any existing short file name.
+- The new driver supports exporting of mounted NTFS volumes via NFS.
+- The new driver supports async io (aio).
+- The new driver supports fsync(2), fdatasync(2), and msync(2).
+- The new driver supports readv(2) and writev(2).
+- The new driver supports access time updates (including mtime and ctime).
+- The new driver supports truncate(2) and open(2) with O_TRUNC. But at present
+ only very limited support for highly fragmented files, i.e. ones which have
+ their data attribute split across multiple extents, is included. Another
+ limitation is that at present truncate(2) will never create sparse files,
+ since to mark a file sparse we need to modify the directory entry for the
+ file and we do not implement directory modifications yet.
+- The new driver supports write(2) which can both overwrite existing data and
+ extend the file size so that you can write beyond the existing data. Also,
+ writing into sparse regions is supported and the holes are filled in with
+ clusters. But at present only limited support for highly fragmented files,
+ i.e. ones which have their data attribute split across multiple extents, is
+ included. Another limitation is that write(2) will never create sparse
+ files, since to mark a file sparse we need to modify the directory entry for
+ the file and we do not implement directory modifications yet.
+
+Supported mount options
+=======================
+
+In addition to the generic mount options described by the manual page for the
+mount command (man 8 mount, also see man 5 fstab), the NTFS driver supports the
+following mount options:
+
+iocharset=name Deprecated option. Still supported but please use
+ nls=name in the future. See description for nls=name.
+
+nls=name Character set to use when returning file names.
+ Unlike VFAT, NTFS suppresses names that contain
+ unconvertible characters. Note that most character
+ sets contain insufficient characters to represent all
+ possible Unicode characters that can exist on NTFS.
+ To be sure you are not missing any files, you are
+ advised to use nls=utf8 which is capable of
+ representing all Unicode characters.
+
+utf8=<bool> Option no longer supported. Currently mapped to
+ nls=utf8 but please use nls=utf8 in the future and
+ make sure utf8 is compiled either as module or into
+ the kernel. See description for nls=name.
+
+uid=
+gid=
+umask= Provide default owner, group, and access mode mask.
+ These options work as documented in mount(8). By
+ default, the files/directories are owned by root and
+ he/she has read and write permissions, as well as
+ browse permission for directories. No one else has any
+ access permissions. I.e. the mode on all files is by
+ default rw------- and for directories rwx------, a
+ consequence of the default fmask=0177 and dmask=0077.
+ Using a umask of zero will grant all permissions to
+ everyone, i.e. all files and directories will have mode
+ rwxrwxrwx.
+
+fmask=
+dmask= Instead of specifying umask which applies both to
+ files and directories, fmask applies only to files and
+ dmask only to directories.
+
+sloppy=<BOOL> If sloppy is specified, ignore unknown mount options.
+ Otherwise the default behaviour is to abort mount if
+ any unknown options are found.
+
+show_sys_files=<BOOL> If show_sys_files is specified, show the system files
+ in directory listings. Otherwise the default behaviour
+ is to hide the system files.
+ Note that even when show_sys_files is specified, "$MFT"
+ will not be visible due to bugs/mis-features in glibc.
+ Further, note that irrespective of show_sys_files, all
+ files are accessible by name, i.e. you can always do
+ "ls -l \$UpCase" for example to specifically show the
+ system file containing the Unicode upcase table.
+
+case_sensitive=<BOOL> If case_sensitive is specified, treat all file names as
+ case sensitive and create file names in the POSIX
+ namespace. Otherwise the default behaviour is to treat
+ file names as case insensitive and to create file names
+ in the WIN32/LONG name space. Note, the Linux NTFS
+ driver will never create short file names and will
+ remove them on rename/delete of the corresponding long
+ file name.
+ Note that files remain accessible via their short file
+ name, if it exists. If case_sensitive, you will need
+ to provide the correct case of the short file name.
+
+disable_sparse=<BOOL> If disable_sparse is specified, creation of sparse
+ regions, i.e. holes, inside files is disabled for the
+ volume (for the duration of this mount only). By
+ default, creation of sparse regions is enabled, which
+ is consistent with the behaviour of traditional Unix
+ filesystems.
+
+errors=opt What to do when critical filesystem errors are found.
+ Following values can be used for "opt":
+ continue: DEFAULT, try to clean-up as much as
+ possible, e.g. marking a corrupt inode as
+ bad so it is no longer accessed, and then
+ continue.
+ recover: At present only supported is recovery of
+ the boot sector from the backup copy.
+ If read-only mount, the recovery is done
+ in memory only and not written to disk.
+ Note that the options are additive, i.e. specifying:
+ errors=continue,errors=recover
+ means the driver will attempt to recover and if that
+ fails it will clean-up as much as possible and
+ continue.
+
+mft_zone_multiplier= Set the MFT zone multiplier for the volume (this
+ setting is not persistent across mounts and can be
+ changed from mount to mount but cannot be changed on
+ remount). Values of 1 to 4 are allowed, 1 being the
+ default. The MFT zone multiplier determines how much
+ space is reserved for the MFT on the volume. If all
+ other space is used up, then the MFT zone will be
+ shrunk dynamically, so this has no impact on the
+ amount of free space. However, it can have an impact
+ on performance by affecting fragmentation of the MFT.
+ In general use the default. If you have a lot of small
+ files then use a higher value. The values have the
+ following meaning:
+ Value MFT zone size (% of volume size)
+ 1 12.5%
+ 2 25%
+ 3 37.5%
+ 4 50%
+ Note this option is irrelevant for read-only mounts.
+
+
+Known bugs and (mis-)features
+=============================
+
+- The link count on each directory inode entry is set to 1, due to Linux not
+ supporting directory hard links. This may well confuse some user space
+ applications, since the directory names will have the same inode numbers.
+ This also speeds up ntfs_read_inode() immensely. And we haven't found any
+ problems with this approach so far. If you find a problem with this, please
+ let us know.
+
+
+Please send bug reports/comments/feedback/abuse to the Linux-NTFS development
+list at sourceforge: linux-ntfs-dev@lists.sourceforge.net
+
+
+Using NTFS volume and stripe sets
+=================================
+
+For support of volume and stripe sets, you can either use the kernel's
+Device-Mapper driver or the kernel's Software RAID / MD driver. The former is
+the recommended one to use for linear raid. But the latter is required for
+raid level 5. For striping and mirroring, either driver should work fine.
+
+
+The Device-Mapper driver
+------------------------
+
+You will need to create a table of the components of the volume/stripe set and
+how they fit together and load this into the kernel using the dmsetup utility
+(see man 8 dmsetup).
+
+Linear volume sets, i.e. linear raid, has been tested and works fine. Even
+though untested, there is no reason why stripe sets, i.e. raid level 0, and
+mirrors, i.e. raid level 1 should not work, too. Stripes with parity, i.e.
+raid level 5, unfortunately cannot work yet because the current version of the
+Device-Mapper driver does not support raid level 5. You may be able to use the
+Software RAID / MD driver for raid level 5, see the next section for details.
+
+To create the table describing your volume you will need to know each of its
+components and their sizes in sectors, i.e. multiples of 512-byte blocks.
+
+For NT4 fault tolerant volumes you can obtain the sizes using fdisk. So for
+example if one of your partitions is /dev/hda2 you would do:
+
+$ fdisk -ul /dev/hda
+
+Disk /dev/hda: 81.9 GB, 81964302336 bytes
+255 heads, 63 sectors/track, 9964 cylinders, total 160086528 sectors
+Units = sectors of 1 * 512 = 512 bytes
+
+ Device Boot Start End Blocks Id System
+ /dev/hda1 * 63 4209029 2104483+ 83 Linux
+ /dev/hda2 4209030 37768814 16779892+ 86 NTFS
+ /dev/hda3 37768815 46170809 4200997+ 83 Linux
+
+And you would know that /dev/hda2 has a size of 37768814 - 4209030 + 1 =
+33559785 sectors.
+
+For Win2k and later dynamic disks, you can for example use the ldminfo utility
+which is part of the Linux LDM tools (the latest version at the time of
+writing is linux-ldm-0.0.8.tar.bz2). You can download it from:
+ http://www.linux-ntfs.org/
+Simply extract the downloaded archive (tar xvjf linux-ldm-0.0.8.tar.bz2), go
+into it (cd linux-ldm-0.0.8) and change to the test directory (cd test). You
+will find the precompiled (i386) ldminfo utility there. NOTE: You will not be
+able to compile this yourself easily so use the binary version!
+
+Then you would use ldminfo in dump mode to obtain the necessary information:
+
+$ ./ldminfo --dump /dev/hda
+
+This would dump the LDM database found on /dev/hda which describes all of your
+dynamic disks and all the volumes on them. At the bottom you will see the
+VOLUME DEFINITIONS section which is all you really need. You may need to look
+further above to determine which of the disks in the volume definitions is
+which device in Linux. Hint: Run ldminfo on each of your dynamic disks and
+look at the Disk Id close to the top of the output for each (the PRIVATE HEADER
+section). You can then find these Disk Ids in the VBLK DATABASE section in the
+<Disk> components where you will get the LDM Name for the disk that is found in
+the VOLUME DEFINITIONS section.
+
+Note you will also need to enable the LDM driver in the Linux kernel. If your
+distribution did not enable it, you will need to recompile the kernel with it
+enabled. This will create the LDM partitions on each device at boot time. You
+would then use those devices (for /dev/hda they would be /dev/hda1, 2, 3, etc)
+in the Device-Mapper table.
+
+You can also bypass using the LDM driver by using the main device (e.g.
+/dev/hda) and then using the offsets of the LDM partitions into this device as
+the "Start sector of device" when creating the table. Once again ldminfo would
+give you the correct information to do this.
+
+Assuming you know all your devices and their sizes things are easy.
+
+For a linear raid the table would look like this (note all values are in
+512-byte sectors):
+
+--- cut here ---
+# Offset into Size of this Raid type Device Start sector
+# volume device of device
+0 1028161 linear /dev/hda1 0
+1028161 3903762 linear /dev/hdb2 0
+4931923 2103211 linear /dev/hdc1 0
+--- cut here ---
+
+For a striped volume, i.e. raid level 0, you will need to know the chunk size
+you used when creating the volume. Windows uses 64kiB as the default, so it
+will probably be this unless you changes the defaults when creating the array.
+
+For a raid level 0 the table would look like this (note all values are in
+512-byte sectors):
+
+--- cut here ---
+# Offset Size Raid Number Chunk 1st Start 2nd Start
+# into of the type of size Device in Device in
+# volume volume stripes device device
+0 2056320 striped 2 128 /dev/hda1 0 /dev/hdb1 0
+--- cut here ---
+
+If there are more than two devices, just add each of them to the end of the
+line.
+
+Finally, for a mirrored volume, i.e. raid level 1, the table would look like
+this (note all values are in 512-byte sectors):
+
+--- cut here ---
+# Ofs Size Raid Log Number Region Should Number Source Start Target Start
+# in of the type type of log size sync? of Device in Device in
+# vol volume params mirrors Device Device
+0 2056320 mirror core 2 16 nosync 2 /dev/hda1 0 /dev/hdb1 0
+--- cut here ---
+
+If you are mirroring to multiple devices you can specify further targets at the
+end of the line.
+
+Note the "Should sync?" parameter "nosync" means that the two mirrors are
+already in sync which will be the case on a clean shutdown of Windows. If the
+mirrors are not clean, you can specify the "sync" option instead of "nosync"
+and the Device-Mapper driver will then copy the entirety of the "Source Device"
+to the "Target Device" or if you specified multipled target devices to all of
+them.
+
+Once you have your table, save it in a file somewhere (e.g. /etc/ntfsvolume1),
+and hand it over to dmsetup to work with, like so:
+
+$ dmsetup create myvolume1 /etc/ntfsvolume1
+
+You can obviously replace "myvolume1" with whatever name you like.
+
+If it all worked, you will now have the device /dev/device-mapper/myvolume1
+which you can then just use as an argument to the mount command as usual to
+mount the ntfs volume. For example:
+
+$ mount -t ntfs -o ro /dev/device-mapper/myvolume1 /mnt/myvol1
+
+(You need to create the directory /mnt/myvol1 first and of course you can use
+anything you like instead of /mnt/myvol1 as long as it is an existing
+directory.)
+
+It is advisable to do the mount read-only to see if the volume has been setup
+correctly to avoid the possibility of causing damage to the data on the ntfs
+volume.
+
+
+The Software RAID / MD driver
+-----------------------------
+
+An alternative to using the Device-Mapper driver is to use the kernel's
+Software RAID / MD driver. For which you need to set up your /etc/raidtab
+appropriately (see man 5 raidtab).
+
+Linear volume sets, i.e. linear raid, as well as stripe sets, i.e. raid level
+0, have been tested and work fine (though see section "Limitations when using
+the MD driver with NTFS volumes" especially if you want to use linear raid).
+Even though untested, there is no reason why mirrors, i.e. raid level 1, and
+stripes with parity, i.e. raid level 5, should not work, too.
+
+You have to use the "persistent-superblock 0" option for each raid-disk in the
+NTFS volume/stripe you are configuring in /etc/raidtab as the persistent
+superblock used by the MD driver would damage the NTFS volume.
+
+Windows by default uses a stripe chunk size of 64k, so you probably want the
+"chunk-size 64k" option for each raid-disk, too.
+
+For example, if you have a stripe set consisting of two partitions /dev/hda5
+and /dev/hdb1 your /etc/raidtab would look like this:
+
+raiddev /dev/md0
+ raid-level 0
+ nr-raid-disks 2
+ nr-spare-disks 0
+ persistent-superblock 0
+ chunk-size 64k
+ device /dev/hda5
+ raid-disk 0
+ device /dev/hdb1
+ raid-disk 1
+
+For linear raid, just change the raid-level above to "raid-level linear", for
+mirrors, change it to "raid-level 1", and for stripe sets with parity, change
+it to "raid-level 5".
+
+Note for stripe sets with parity you will also need to tell the MD driver
+which parity algorithm to use by specifying the option "parity-algorithm
+which", where you need to replace "which" with the name of the algorithm to
+use (see man 5 raidtab for available algorithms) and you will have to try the
+different available algorithms until you find one that works. Make sure you
+are working read-only when playing with this as you may damage your data
+otherwise. If you find which algorithm works please let us know (email the
+linux-ntfs developers list linux-ntfs-dev@lists.sourceforge.net or drop in on
+IRC in channel #ntfs on the irc.freenode.net network) so we can update this
+documentation.
+
+Once the raidtab is setup, run for example raid0run -a to start all devices or
+raid0run /dev/md0 to start a particular md device, in this case /dev/md0.
+
+Then just use the mount command as usual to mount the ntfs volume using for
+example: mount -t ntfs -o ro /dev/md0 /mnt/myntfsvolume
+
+It is advisable to do the mount read-only to see if the md volume has been
+setup correctly to avoid the possibility of causing damage to the data on the
+ntfs volume.
+
+
+Limitations when using the Software RAID / MD driver
+-----------------------------------------------------
+
+Using the md driver will not work properly if any of your NTFS partitions have
+an odd number of sectors. This is especially important for linear raid as all
+data after the first partition with an odd number of sectors will be offset by
+one or more sectors so if you mount such a partition with write support you
+will cause massive damage to the data on the volume which will only become
+apparent when you try to use the volume again under Windows.
+
+So when using linear raid, make sure that all your partitions have an even
+number of sectors BEFORE attempting to use it. You have been warned!
+
+Even better is to simply use the Device-Mapper for linear raid and then you do
+not have this problem with odd numbers of sectors.
+
+
+ChangeLog
+=========
+
+Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
+
+2.1.29:
+ - Fix a deadlock when mounting read-write.
+2.1.28:
+ - Fix a deadlock.
+2.1.27:
+ - Implement page migration support so the kernel can move memory used
+ by NTFS files and directories around for management purposes.
+ - Add support for writing to sparse files created with Windows XP SP2.
+ - Many minor improvements and bug fixes.
+2.1.26:
+ - Implement support for sector sizes above 512 bytes (up to the maximum
+ supported by NTFS which is 4096 bytes).
+ - Enhance support for NTFS volumes which were supported by Windows but
+ not by Linux due to invalid attribute list attribute flags.
+ - A few minor updates and bug fixes.
+2.1.25:
+ - Write support is now extended with write(2) being able to both
+ overwrite existing file data and to extend files. Also, if a write
+ to a sparse region occurs, write(2) will fill in the hole. Note,
+ mmap(2) based writes still do not support writing into holes or
+ writing beyond the initialized size.
+ - Write support has a new feature and that is that truncate(2) and
+ open(2) with O_TRUNC are now implemented thus files can be both made
+ smaller and larger.
+ - Note: Both write(2) and truncate(2)/open(2) with O_TRUNC still have
+ limitations in that they
+ - only provide limited support for highly fragmented files.
+ - only work on regular, i.e. uncompressed and unencrypted files.
+ - never create sparse files although this will change once directory
+ operations are implemented.
+ - Lots of bug fixes and enhancements across the board.
+2.1.24:
+ - Support journals ($LogFile) which have been modified by chkdsk. This
+ means users can boot into Windows after we marked the volume dirty.
+ The Windows boot will run chkdsk and then reboot. The user can then
+ immediately boot into Linux rather than having to do a full Windows
+ boot first before rebooting into Linux and we will recognize such a
+ journal and empty it as it is clean by definition.
+ - Support journals ($LogFile) with only one restart page as well as
+ journals with two different restart pages. We sanity check both and
+ either use the only sane one or the more recent one of the two in the
+ case that both are valid.
+ - Lots of bug fixes and enhancements across the board.
+2.1.23:
+ - Stamp the user space journal, aka transaction log, aka $UsnJrnl, if
+ it is present and active thus telling Windows and applications using
+ the transaction log that changes can have happened on the volume
+ which are not recorded in $UsnJrnl.
+ - Detect the case when Windows has been hibernated (suspended to disk)
+ and if this is the case do not allow (re)mounting read-write to
+ prevent data corruption when you boot back into the suspended
+ Windows session.
+ - Implement extension of resident files using the normal file write
+ code paths, i.e. most very small files can be extended to be a little
+ bit bigger but not by much.
+ - Add new mount option "disable_sparse". (See list of mount options
+ above for details.)
+ - Improve handling of ntfs volumes with errors and strange boot sectors
+ in particular.
+ - Fix various bugs including a nasty deadlock that appeared in recent
+ kernels (around 2.6.11-2.6.12 timeframe).
+2.1.22:
+ - Improve handling of ntfs volumes with errors.
+ - Fix various bugs and race conditions.
+2.1.21:
+ - Fix several race conditions and various other bugs.
+ - Many internal cleanups, code reorganization, optimizations, and mft
+ and index record writing code rewritten to fit in with the changes.
+ - Update Documentation/filesystems/ntfs.txt with instructions on how to
+ use the Device-Mapper driver with NTFS ftdisk/LDM raid.
+2.1.20:
+ - Fix two stupid bugs introduced in 2.1.18 release.
+2.1.19:
+ - Minor bugfix in handling of the default upcase table.
+ - Many internal cleanups and improvements. Many thanks to Linus
+ Torvalds and Al Viro for the help and advice with the sparse
+ annotations and cleanups.
+2.1.18:
+ - Fix scheduling latencies at mount time. (Ingo Molnar)
+ - Fix endianness bug in a little traversed portion of the attribute
+ lookup code.
+2.1.17:
+ - Fix bugs in mount time error code paths.
+2.1.16:
+ - Implement access time updates (including mtime and ctime).
+ - Implement fsync(2), fdatasync(2), and msync(2) system calls.
+ - Enable the readv(2) and writev(2) system calls.
+ - Enable access via the asynchronous io (aio) API by adding support for
+ the aio_read(3) and aio_write(3) functions.
+2.1.15:
+ - Invalidate quotas when (re)mounting read-write.
+ NOTE: This now only leave user space journalling on the side. (See
+ note for version 2.1.13, below.)
+2.1.14:
+ - Fix an NFSd caused deadlock reported by several users.
+2.1.13:
+ - Implement writing of inodes (access time updates are not implemented
+ yet so mounting with -o noatime,nodiratime is enforced).
+ - Enable writing out of resident files so you can now overwrite any
+ uncompressed, unencrypted, nonsparse file as long as you do not
+ change the file size.
+ - Add housekeeping of ntfs system files so that ntfsfix no longer needs
+ to be run after writing to an NTFS volume.
+ NOTE: This still leaves quota tracking and user space journalling on
+ the side but they should not cause data corruption. In the worst
+ case the charged quotas will be out of date ($Quota) and some
+ userspace applications might get confused due to the out of date
+ userspace journal ($UsnJrnl).
+2.1.12:
+ - Fix the second fix to the decompression engine from the 2.1.9 release
+ and some further internals cleanups.
+2.1.11:
+ - Driver internal cleanups.
+2.1.10:
+ - Force read-only (re)mounting of volumes with unsupported volume
+ flags and various cleanups.
+2.1.9:
+ - Fix two bugs in handling of corner cases in the decompression engine.
+2.1.8:
+ - Read the $MFT mirror and compare it to the $MFT and if the two do not
+ match, force a read-only mount and do not allow read-write remounts.
+ - Read and parse the $LogFile journal and if it indicates that the
+ volume was not shutdown cleanly, force a read-only mount and do not
+ allow read-write remounts. If the $LogFile indicates a clean
+ shutdown and a read-write (re)mount is requested, empty $LogFile to
+ ensure that Windows cannot cause data corruption by replaying a stale
+ journal after Linux has written to the volume.
+ - Improve time handling so that the NTFS time is fully preserved when
+ converted to kernel time and only up to 99 nano-seconds are lost when
+ kernel time is converted to NTFS time.
+2.1.7:
+ - Enable NFS exporting of mounted NTFS volumes.
+2.1.6:
+ - Fix minor bug in handling of compressed directories that fixes the
+ erroneous "du" and "stat" output people reported.
+2.1.5:
+ - Minor bug fix in attribute list attribute handling that fixes the
+ I/O errors on "ls" of certain fragmented files found by at least two
+ people running Windows XP.
+2.1.4:
+ - Minor update allowing compilation with all gcc versions (well, the
+ ones the kernel can be compiled with anyway).
+2.1.3:
+ - Major bug fixes for reading files and volumes in corner cases which
+ were being hit by Windows 2k/XP users.
+2.1.2:
+ - Major bug fixes alleviating the hangs in statfs experienced by some
+ users.
+2.1.1:
+ - Update handling of compressed files so people no longer get the
+ frequently reported warning messages about initialized_size !=
+ data_size.
+2.1.0:
+ - Add configuration option for developmental write support.
+ - Initial implementation of file overwriting. (Writes to resident files
+ are not written out to disk yet, so avoid writing to files smaller
+ than about 1kiB.)
+ - Intercept/abort changes in file size as they are not implemented yet.
+2.0.25:
+ - Minor bugfixes in error code paths and small cleanups.
+2.0.24:
+ - Small internal cleanups.
+ - Support for sendfile system call. (Christoph Hellwig)
+2.0.23:
+ - Massive internal locking changes to mft record locking. Fixes
+ various race conditions and deadlocks.
+ - Fix ntfs over loopback for compressed files by adding an
+ optimization barrier. (gcc was screwing up otherwise ?)
+ Thanks go to Christoph Hellwig for pointing these two out:
+ - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
+ - Fix ntfs_free() for ia64 and parisc.
+2.0.22:
+ - Small internal cleanups.
+2.0.21:
+ These only affect 32-bit architectures:
+ - Check for, and refuse to mount too large volumes (maximum is 2TiB).
+ - Check for, and refuse to open too large files and directories
+ (maximum is 16TiB).
+2.0.20:
+ - Support non-resident directory index bitmaps. This means we now cope
+ with huge directories without problems.
+ - Fix a page leak that manifested itself in some cases when reading
+ directory contents.
+ - Internal cleanups.
+2.0.19:
+ - Fix race condition and improvements in block i/o interface.
+ - Optimization when reading compressed files.
+2.0.18:
+ - Fix race condition in reading of compressed files.
+2.0.17:
+ - Cleanups and optimizations.
+2.0.16:
+ - Fix stupid bug introduced in 2.0.15 in new attribute inode API.
+ - Big internal cleanup replacing the mftbmp access hacks by using the
+ new attribute inode API instead.
+2.0.15:
+ - Bug fix in parsing of remount options.
+ - Internal changes implementing attribute (fake) inodes allowing all
+ attribute i/o to go via the page cache and to use all the normal
+ vfs/mm functionality.
+2.0.14:
+ - Internal changes improving run list merging code and minor locking
+ change to not rely on BKL in ntfs_statfs().
+2.0.13:
+ - Internal changes towards using iget5_locked() in preparation for
+ fake inodes and small cleanups to ntfs_volume structure.
+2.0.12:
+ - Internal cleanups in address space operations made possible by the
+ changes introduced in the previous release.
+2.0.11:
+ - Internal updates and cleanups introducing the first step towards
+ fake inode based attribute i/o.
+2.0.10:
+ - Microsoft says that the maximum number of inodes is 2^32 - 1. Update
+ the driver accordingly to only use 32-bits to store inode numbers on
+ 32-bit architectures. This improves the speed of the driver a little.
+2.0.9:
+ - Change decompression engine to use a single buffer. This should not
+ affect performance except perhaps on the most heavy i/o on SMP
+ systems when accessing multiple compressed files from multiple
+ devices simultaneously.
+ - Minor updates and cleanups.
+2.0.8:
+ - Remove now obsolete show_inodes and posix mount option(s).
+ - Restore show_sys_files mount option.
+ - Add new mount option case_sensitive, to determine if the driver
+ treats file names as case sensitive or not.
+ - Mostly drop support for short file names (for backwards compatibility
+ we only support accessing files via their short file name if one
+ exists).
+ - Fix dcache aliasing issues wrt short/long file names.
+ - Cleanups and minor fixes.
+2.0.7:
+ - Just cleanups.
+2.0.6:
+ - Major bugfix to make compatible with other kernel changes. This fixes
+ the hangs/oopses on umount.
+ - Locking cleanup in directory operations (remove BKL usage).
+2.0.5:
+ - Major buffer overflow bug fix.
+ - Minor cleanups and updates for kernel 2.5.12.
+2.0.4:
+ - Cleanups and updates for kernel 2.5.11.
+2.0.3:
+ - Small bug fixes, cleanups, and performance improvements.
+2.0.2:
+ - Use default fmask of 0177 so that files are no executable by default.
+ If you want owner executable files, just use fmask=0077.
+ - Update for kernel 2.5.9 but preserve backwards compatibility with
+ kernel 2.5.7.
+ - Minor bug fixes, cleanups, and updates.
+2.0.1:
+ - Minor updates, primarily set the executable bit by default on files
+ so they can be executed.
+2.0.0:
+ - Started ChangeLog.
+
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 0000000..67310fb
--- /dev/null
+++ b/Documentation/filesystems/ocfs2.txt
@@ -0,0 +1,81 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page: http://oss.oracle.com/projects/ocfs2
+Tools web page: http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker <joel.becker@oracle.com>
+Zach Brown <zach.brown@oracle.com>
+Mark Fasheh <mark.fasheh@oracle.com>
+Kurt Hackel <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh <manish.singh@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+ - quotas
+ - Directory change notification (F_NOTIFY)
+ - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+ - POSIX ACLs
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1 This enables/disables barriers. barrier=0 disables it,
+ barrier=1 enables it.
+errors=remount-ro(*) Remount the filesystem read-only on an error.
+errors=panic Panic and halt the machine if an error occurs.
+intr (*) Allow signals to interrupt cluster operations.
+nointr Do not allow signals to interrupt cluster
+ operations.
+atime_quantum=60(*) OCFS2 will not update atime unless this number
+ of seconds has passed since the last update.
+ Set to zero to always update atime.
+data=ordered (*) All data are forced directly out to the main file
+ system prior to its metadata being committed to the
+ journal.
+data=writeback Data ordering is not preserved, data may be written
+ into the main file system after its metadata has been
+ committed to the journal.
+preferred_slot=0(*) During mount, try to use this filesystem slot first. If
+ it is in use by another node, the first empty one found
+ will be chosen. Invalid values will be ignored.
+commit=nrsec (*) Ocfs2 can be told to sync all its data and metadata
+ every 'nrsec' seconds. The default value is 5 seconds.
+ This means that if you lose your power, you will lose
+ as much as the latest 5 seconds of work (your
+ filesystem will not be damaged though, thanks to the
+ journaling). This default value (or any low value)
+ will hurt performance, but it's good for data-safety.
+ Setting it to 0 will have the same effect as leaving
+ it at the default (5 seconds).
+ Setting it to very large values will improve
+ performance.
+localalloc=8(*) Allows custom localalloc size in MB. If the value is too
+ large, the fs will silently revert it to the default.
+ Localalloc is not enabled for local mounts.
+localflocks This disables cluster aware flock.
+inode64 Indicates that Ocfs2 is allowed to create inodes at
+ any location in the filesystem, including those which
+ will result in inode numbers occupying more than 32
+ bits of significance.
+user_xattr (*) Enables Extended User Attributes.
+nouser_xattr Disables Extended User Attributes.
diff --git a/Documentation/filesystems/omfs.txt b/Documentation/filesystems/omfs.txt
new file mode 100644
index 0000000..1d0d41f
--- /dev/null
+++ b/Documentation/filesystems/omfs.txt
@@ -0,0 +1,106 @@
+Optimized MPEG Filesystem (OMFS)
+
+Overview
+========
+
+OMFS is a filesystem created by SonicBlue for use in the ReplayTV DVR
+and Rio Karma MP3 player. The filesystem is extent-based, utilizing
+block sizes from 2k to 8k, with hash-based directories. This
+filesystem driver may be used to read and write disks from these
+devices.
+
+Note, it is not recommended that this FS be used in place of a general
+filesystem for your own streaming media device. Native Linux filesystems
+will likely perform better.
+
+More information is available at:
+
+ http://linux-karma.sf.net/
+
+Various utilities, including mkomfs and omfsck, are included with
+omfsprogs, available at:
+
+ http://bobcopeland.com/karma/
+
+Instructions are included in its README.
+
+Options
+=======
+
+OMFS supports the following mount-time options:
+
+ uid=n - make all files owned by specified user
+ gid=n - make all files owned by specified group
+ umask=xxx - set permission umask to xxx
+ fmask=xxx - set umask to xxx for files
+ dmask=xxx - set umask to xxx for directories
+
+Disk format
+===========
+
+OMFS discriminates between "sysblocks" and normal data blocks. The sysblock
+group consists of super block information, file metadata, directory structures,
+and extents. Each sysblock has a header containing CRCs of the entire
+sysblock, and may be mirrored in successive blocks on the disk. A sysblock may
+have a smaller size than a data block, but since they are both addressed by the
+same 64-bit block number, any remaining space in the smaller sysblock is
+unused.
+
+Sysblock header information:
+
+struct omfs_header {
+ __be64 h_self; /* FS block where this is located */
+ __be32 h_body_size; /* size of useful data after header */
+ __be16 h_crc; /* crc-ccitt of body_size bytes */
+ char h_fill1[2];
+ u8 h_version; /* version, always 1 */
+ char h_type; /* OMFS_INODE_X */
+ u8 h_magic; /* OMFS_IMAGIC */
+ u8 h_check_xor; /* XOR of header bytes before this */
+ __be32 h_fill2;
+};
+
+Files and directories are both represented by omfs_inode:
+
+struct omfs_inode {
+ struct omfs_header i_head; /* header */
+ __be64 i_parent; /* parent containing this inode */
+ __be64 i_sibling; /* next inode in hash bucket */
+ __be64 i_ctime; /* ctime, in milliseconds */
+ char i_fill1[35];
+ char i_type; /* OMFS_[DIR,FILE] */
+ __be32 i_fill2;
+ char i_fill3[64];
+ char i_name[OMFS_NAMELEN]; /* filename */
+ __be64 i_size; /* size of file, in bytes */
+};
+
+Directories in OMFS are implemented as a large hash table. Filenames are
+hashed then prepended into the bucket list beginning at OMFS_DIR_START.
+Lookup requires hashing the filename, then seeking across i_sibling pointers
+until a match is found on i_name. Empty buckets are represented by block
+pointers with all-1s (~0).
+
+A file is an omfs_inode structure followed by an extent table beginning at
+OMFS_EXTENT_START:
+
+struct omfs_extent_entry {
+ __be64 e_cluster; /* start location of a set of blocks */
+ __be64 e_blocks; /* number of blocks after e_cluster */
+};
+
+struct omfs_extent {
+ __be64 e_next; /* next extent table location */
+ __be32 e_extent_count; /* total # extents in this table */
+ __be32 e_fill;
+ struct omfs_extent_entry e_entry; /* start of extent entries */
+};
+
+Each extent holds the block offset followed by number of blocks allocated to
+the extent. The final extent in each table is a terminator with e_cluster
+being ~0 and e_blocks being ones'-complement of the total number of blocks
+in the table.
+
+If this table overflows, a continuation inode is written and pointed to by
+e_next. These have a header but lack the rest of the inode structure.
+
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
new file mode 100644
index 0000000..92b888d
--- /dev/null
+++ b/Documentation/filesystems/porting
@@ -0,0 +1,275 @@
+Changes since 2.5.0:
+
+---
+[recommended]
+
+New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(),
+ sb_set_blocksize() and sb_min_blocksize().
+
+Use them.
+
+(sb_find_get_block() replaces 2.4's get_hash_table())
+
+---
+[recommended]
+
+New methods: ->alloc_inode() and ->destroy_inode().
+
+Remove inode->u.foo_inode_i
+Declare
+ struct foo_inode_info {
+ /* fs-private stuff */
+ struct inode vfs_inode;
+ };
+ static inline struct foo_inode_info *FOO_I(struct inode *inode)
+ {
+ return list_entry(inode, struct foo_inode_info, vfs_inode);
+ }
+
+Use FOO_I(inode) instead of &inode->u.foo_inode_i;
+
+Add foo_alloc_inode() and foo_destroy_inode() - the former should allocate
+foo_inode_info and return the address of ->vfs_inode, the latter should free
+FOO_I(inode) (see in-tree filesystems for examples).
+
+Make them ->alloc_inode and ->destroy_inode in your super_operations.
+
+Keep in mind that now you need explicit initialization of private data
+typically between calling iget_locked() and unlocking the inode.
+
+At some point that will become mandatory.
+
+---
+[mandatory]
+
+Change of file_system_type method (->read_super to ->get_sb)
+
+->read_super() is no more. Ditto for DECLARE_FSTYPE and DECLARE_FSTYPE_DEV.
+
+Turn your foo_read_super() into a function that would return 0 in case of
+success and negative number in case of error (-EINVAL unless you have more
+informative error value to report). Call it foo_fill_super(). Now declare
+
+int foo_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super,
+ mnt);
+}
+
+(or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of
+filesystem).
+
+Replace DECLARE_FSTYPE... with explicit initializer and have ->get_sb set as
+foo_get_sb.
+
+---
+[mandatory]
+
+Locking change: ->s_vfs_rename_sem is taken only by cross-directory renames.
+Most likely there is no need to change anything, but if you relied on
+global exclusion between renames for some internal purpose - you need to
+change your internal locking. Otherwise exclusion warranties remain the
+same (i.e. parents and victim are locked, etc.).
+
+---
+[informational]
+
+Now we have the exclusion between ->lookup() and directory removal (by
+->rmdir() and ->rename()). If you used to need that exclusion and do
+it by internal locking (most of filesystems couldn't care less) - you
+can relax your locking.
+
+---
+[mandatory]
+
+->lookup(), ->truncate(), ->create(), ->unlink(), ->mknod(), ->mkdir(),
+->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename()
+and ->readdir() are called without BKL now. Grab it on entry, drop upon return
+- that will guarantee the same locking you used to have. If your method or its
+parts do not need BKL - better yet, now you can shift lock_kernel() and
+unlock_kernel() so that they would protect exactly what needs to be
+protected.
+
+---
+[mandatory]
+
+BKL is also moved from around sb operations. ->write_super() Is now called
+without BKL held. BKL should have been shifted into individual fs sb_op
+functions. If you don't need it, remove it.
+
+---
+[informational]
+
+check for ->link() target not being a directory is done by callers. Feel
+free to drop it...
+
+---
+[informational]
+
+->link() callers hold ->i_mutex on the object we are linking to. Some of your
+problems might be over...
+
+---
+[mandatory]
+
+new file_system_type method - kill_sb(superblock). If you are converting
+an existing filesystem, set it according to ->fs_flags:
+ FS_REQUIRES_DEV - kill_block_super
+ FS_LITTER - kill_litter_super
+ neither - kill_anon_super
+FS_LITTER is gone - just remove it from fs_flags.
+
+---
+[mandatory]
+
+ FS_SINGLE is gone (actually, that had happened back when ->get_sb()
+went in - and hadn't been documented ;-/). Just remove it from fs_flags
+(and see ->get_sb() entry for other actions).
+
+---
+[mandatory]
+
+->setattr() is called without BKL now. Caller _always_ holds ->i_mutex, so
+watch for ->i_mutex-grabbing code that might be used by your ->setattr().
+Callers of notify_change() need ->i_mutex now.
+
+---
+[recommended]
+
+New super_block field "struct export_operations *s_export_op" for
+explicit support for exporting, e.g. via NFS. The structure is fully
+documented at its declaration in include/linux/fs.h, and in
+Documentation/filesystems/Exporting.
+
+Briefly it allows for the definition of decode_fh and encode_fh operations
+to encode and decode filehandles, and allows the filesystem to use
+a standard helper function for decode_fh, and provide file-system specific
+support for this helper, particularly get_parent.
+
+It is planned that this will be required for exporting once the code
+settles down a bit.
+
+[mandatory]
+
+s_export_op is now required for exporting a filesystem.
+isofs, ext2, ext3, resierfs, fat
+can be used as examples of very different filesystems.
+
+---
+[mandatory]
+
+iget4() and the read_inode2 callback have been superseded by iget5_locked()
+which has the following prototype,
+
+ struct inode *iget5_locked(struct super_block *sb, unsigned long ino,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *),
+ void *data);
+
+'test' is an additional function that can be used when the inode
+number is not sufficient to identify the actual file object. 'set'
+should be a non-blocking function that initializes those parts of a
+newly created inode to allow the test function to succeed. 'data' is
+passed as an opaque value to both test and set functions.
+
+When the inode has been created by iget5_locked(), it will be returned with the
+I_NEW flag set and will still be locked. The filesystem then needs to finalize
+the initialization. Once the inode is initialized it must be unlocked by
+calling unlock_new_inode().
+
+The filesystem is responsible for setting (and possibly testing) i_ino
+when appropriate. There is also a simpler iget_locked function that
+just takes the superblock and inode number as arguments and does the
+test and set for you.
+
+e.g.
+ inode = iget_locked(sb, ino);
+ if (inode->i_state & I_NEW) {
+ err = read_inode_from_disk(inode);
+ if (err < 0) {
+ iget_failed(inode);
+ return err;
+ }
+ unlock_new_inode(inode);
+ }
+
+Note that if the process of setting up a new inode fails, then iget_failed()
+should be called on the inode to render it dead, and an appropriate error
+should be passed back to the caller.
+
+---
+[recommended]
+
+->getattr() finally getting used. See instances in nfs, minix, etc.
+
+---
+[mandatory]
+
+->revalidate() is gone. If your filesystem had it - provide ->getattr()
+and let it call whatever you had as ->revlidate() + (for symlinks that
+had ->revalidate()) add calls in ->follow_link()/->readlink().
+
+---
+[mandatory]
+
+->d_parent changes are not protected by BKL anymore. Read access is safe
+if at least one of the following is true:
+ * filesystem has no cross-directory rename()
+ * dcache_lock is held
+ * we know that parent had been locked (e.g. we are looking at
+->d_parent of ->lookup() argument).
+ * we are called from ->rename().
+ * the child's ->d_lock is held
+Audit your code and add locking if needed. Notice that any place that is
+not protected by the conditions above is risky even in the old tree - you
+had been relying on BKL and that's prone to screwups. Old tree had quite
+a few holes of that kind - unprotected access to ->d_parent leading to
+anything from oops to silent memory corruption.
+
+---
+[mandatory]
+
+ FS_NOMOUNT is gone. If you use it - just set MS_NOUSER in flags
+(see rootfs for one kind of solution and bdev/socket/pipe for another).
+
+---
+[recommended]
+
+ Use bdev_read_only(bdev) instead of is_read_only(kdev). The latter
+is still alive, but only because of the mess in drivers/s390/block/dasd.c.
+As soon as it gets fixed is_read_only() will die.
+
+---
+[mandatory]
+
+->permission() is called without BKL now. Grab it on entry, drop upon
+return - that will guarantee the same locking you used to have. If
+your method or its parts do not need BKL - better yet, now you can
+shift lock_kernel() and unlock_kernel() so that they would protect
+exactly what needs to be protected.
+
+---
+[mandatory]
+
+->statfs() is now called without BKL held. BKL should have been
+shifted into individual fs sb_op functions where it's not clear that
+it's safe to remove it. If you don't need it, remove it.
+
+---
+[mandatory]
+
+ is_read_only() is gone; use bdev_read_only() instead.
+
+---
+[mandatory]
+
+ destroy_buffers() is gone; use invalidate_bdev().
+
+---
+[mandatory]
+
+ fsync_dev() is gone; use fsync_bdev(). NOTE: lvm breakage is
+deliberate; as soon as struct block_device * is propagated in a reasonable
+way by that code fixing will become trivial; until then nothing can be
+done.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
new file mode 100644
index 0000000..bb1b0dd
--- /dev/null
+++ b/Documentation/filesystems/proc.txt
@@ -0,0 +1,2513 @@
+------------------------------------------------------------------------------
+ T H E /proc F I L E S Y S T E M
+------------------------------------------------------------------------------
+/proc/sys Terrehon Bowden <terrehon@pacbell.net> October 7 1999
+ Bodo Bauer <bb@ricochet.net>
+
+2.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000
+------------------------------------------------------------------------------
+Version 1.3 Kernel version 2.2.12
+ Kernel version 2.4.0-test11-pre4
+------------------------------------------------------------------------------
+
+Table of Contents
+-----------------
+
+ 0 Preface
+ 0.1 Introduction/Credits
+ 0.2 Legal Stuff
+
+ 1 Collecting System Information
+ 1.1 Process-Specific Subdirectories
+ 1.2 Kernel data
+ 1.3 IDE devices in /proc/ide
+ 1.4 Networking info in /proc/net
+ 1.5 SCSI info
+ 1.6 Parallel port info in /proc/parport
+ 1.7 TTY info in /proc/tty
+ 1.8 Miscellaneous kernel statistics in /proc/stat
+
+ 2 Modifying System Parameters
+ 2.1 /proc/sys/fs - File system data
+ 2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats
+ 2.3 /proc/sys/kernel - general kernel parameters
+ 2.4 /proc/sys/vm - The virtual memory subsystem
+ 2.5 /proc/sys/dev - Device specific parameters
+ 2.6 /proc/sys/sunrpc - Remote procedure calls
+ 2.7 /proc/sys/net - Networking stuff
+ 2.8 /proc/sys/net/ipv4 - IPV4 settings
+ 2.9 Appletalk
+ 2.10 IPX
+ 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
+ 2.12 /proc/<pid>/oom_adj - Adjust the oom-killer score
+ 2.13 /proc/<pid>/oom_score - Display current oom-killer score
+ 2.14 /proc/<pid>/io - Display the IO accounting fields
+ 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
+ 2.16 /proc/<pid>/mountinfo - Information about mounts
+ 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
+
+------------------------------------------------------------------------------
+Preface
+------------------------------------------------------------------------------
+
+0.1 Introduction/Credits
+------------------------
+
+This documentation is part of a soon (or so we hope) to be released book on
+the SuSE Linux distribution. As there is no complete documentation for the
+/proc file system and we've used many freely available sources to write these
+chapters, it seems only fair to give the work back to the Linux community.
+This work is based on the 2.2.* kernel version and the upcoming 2.4.*. I'm
+afraid it's still far from complete, but we hope it will be useful. As far as
+we know, it is the first 'all-in-one' document about the /proc file system. It
+is focused on the Intel x86 hardware, so if you are looking for PPC, ARM,
+SPARC, AXP, etc., features, you probably won't find what you are looking for.
+It also only covers IPv4 networking, not IPv6 nor other protocols - sorry. But
+additions and patches are welcome and will be added to this document if you
+mail them to Bodo.
+
+We'd like to thank Alan Cox, Rik van Riel, and Alexey Kuznetsov and a lot of
+other people for help compiling this documentation. We'd also like to extend a
+special thank you to Andi Kleen for documentation, which we relied on heavily
+to create this document, as well as the additional information he provided.
+Thanks to everybody else who contributed source or docs to the Linux kernel
+and helped create a great piece of software... :)
+
+If you have any comments, corrections or additions, please don't hesitate to
+contact Bodo Bauer at bb@ricochet.net. We'll be happy to add them to this
+document.
+
+The latest version of this document is available online at
+http://skaro.nightcrawler.com/~bb/Docs/Proc as HTML version.
+
+If the above direction does not works for you, ypu could try the kernel
+mailing list at linux-kernel@vger.kernel.org and/or try to reach me at
+comandante@zaralinux.com.
+
+0.2 Legal Stuff
+---------------
+
+We don't guarantee the correctness of this document, and if you come to us
+complaining about how you screwed up your system because of incorrect
+documentation, we won't feel responsible...
+
+------------------------------------------------------------------------------
+CHAPTER 1: COLLECTING SYSTEM INFORMATION
+------------------------------------------------------------------------------
+
+------------------------------------------------------------------------------
+In This Chapter
+------------------------------------------------------------------------------
+* Investigating the properties of the pseudo file system /proc and its
+ ability to provide information on the running Linux system
+* Examining /proc's structure
+* Uncovering various information about the kernel and the processes running
+ on the system
+------------------------------------------------------------------------------
+
+
+The proc file system acts as an interface to internal data structures in the
+kernel. It can be used to obtain information about the system and to change
+certain kernel parameters at runtime (sysctl).
+
+First, we'll take a look at the read-only parts of /proc. In Chapter 2, we
+show you how you can use /proc/sys to change settings.
+
+1.1 Process-Specific Subdirectories
+-----------------------------------
+
+The directory /proc contains (among other things) one subdirectory for each
+process running on the system, which is named after the process ID (PID).
+
+The link self points to the process reading the file system. Each process
+subdirectory has the entries listed in Table 1-1.
+
+
+Table 1-1: Process specific entries in /proc
+..............................................................................
+ File Content
+ clear_refs Clears page referenced bits shown in smaps output
+ cmdline Command line arguments
+ cpu Current and last cpu in which it was executed (2.4)(smp)
+ cwd Link to the current working directory
+ environ Values of environment variables
+ exe Link to the executable of this process
+ fd Directory, which contains all file descriptors
+ maps Memory maps to executables and library files (2.4)
+ mem Memory held by this process
+ root Link to the root directory of this process
+ stat Process status
+ statm Process memory status information
+ status Process status in human readable form
+ wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ smaps Extension based on maps, the rss size for each mapped file
+..............................................................................
+
+For example, to get the status information of a process, all you have to do is
+read the file /proc/PID/status:
+
+ >cat /proc/self/status
+ Name: cat
+ State: R (running)
+ Pid: 5452
+ PPid: 743
+ TracerPid: 0 (2.4)
+ Uid: 501 501 501 501
+ Gid: 100 100 100 100
+ Groups: 100 14 16
+ VmSize: 1112 kB
+ VmLck: 0 kB
+ VmRSS: 348 kB
+ VmData: 24 kB
+ VmStk: 12 kB
+ VmExe: 8 kB
+ VmLib: 1044 kB
+ SigPnd: 0000000000000000
+ SigBlk: 0000000000000000
+ SigIgn: 0000000000000000
+ SigCgt: 0000000000000000
+ CapInh: 00000000fffffeff
+ CapPrm: 0000000000000000
+ CapEff: 0000000000000000
+
+
+This shows you nearly the same information you would get if you viewed it with
+the ps command. In fact, ps uses the proc file system to obtain its
+information. The statm file contains more detailed information about the
+process memory usage. Its seven fields are explained in Table 1-2. The stat
+file contains details information about the process itself. Its fields are
+explained in Table 1-3.
+
+
+Table 1-2: Contents of the statm files (as of 2.6.8-rc3)
+..............................................................................
+ Field Content
+ size total program size (pages) (same as VmSize in status)
+ resident size of memory portions (pages) (same as VmRSS in status)
+ shared number of pages that are shared (i.e. backed by a file)
+ trs number of pages that are 'code' (not including libs; broken,
+ includes data segment)
+ lrs number of pages of library (always 0 on 2.6)
+ drs number of pages of data/stack (including libs; broken,
+ includes library text)
+ dt number of dirty pages (always 0 on 2.6)
+..............................................................................
+
+
+Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
+..............................................................................
+ Field Content
+ pid process id
+ tcomm filename of the executable
+ state state (R is running, S is sleeping, D is sleeping in an
+ uninterruptible wait, Z is zombie, T is traced or stopped)
+ ppid process id of the parent process
+ pgrp pgrp of the process
+ sid session id
+ tty_nr tty the process uses
+ tty_pgrp pgrp of the tty
+ flags task flags
+ min_flt number of minor faults
+ cmin_flt number of minor faults with child's
+ maj_flt number of major faults
+ cmaj_flt number of major faults with child's
+ utime user mode jiffies
+ stime kernel mode jiffies
+ cutime user mode jiffies with child's
+ cstime kernel mode jiffies with child's
+ priority priority level
+ nice nice level
+ num_threads number of threads
+ it_real_value (obsolete, always 0)
+ start_time time the process started after system boot
+ vsize virtual memory size
+ rss resident set memory size
+ rsslim current limit in bytes on the rss
+ start_code address above which program text can run
+ end_code address below which program text can run
+ start_stack address of the start of the stack
+ esp current value of ESP
+ eip current value of EIP
+ pending bitmap of pending signals (obsolete)
+ blocked bitmap of blocked signals (obsolete)
+ sigign bitmap of ignored signals (obsolete)
+ sigcatch bitmap of catched signals (obsolete)
+ wchan address where process went to sleep
+ 0 (place holder)
+ 0 (place holder)
+ exit_signal signal to send to parent thread on exit
+ task_cpu which CPU the task is scheduled on
+ rt_priority realtime priority
+ policy scheduling policy (man sched_setscheduler)
+ blkio_ticks time spent waiting for block IO
+..............................................................................
+
+
+1.2 Kernel data
+---------------
+
+Similar to the process entries, the kernel data files give information about
+the running kernel. The files used to obtain this information are contained in
+/proc and are listed in Table 1-4. Not all of these will be present in your
+system. It depends on the kernel configuration and the loaded modules, which
+files are there, and which are missing.
+
+Table 1-4: Kernel info in /proc
+..............................................................................
+ File Content
+ apm Advanced power management info
+ buddyinfo Kernel memory allocator information (see text) (2.5)
+ bus Directory containing bus specific information
+ cmdline Kernel command line
+ cpuinfo Info about the CPU
+ devices Available devices (block and character)
+ dma Used DMS channels
+ filesystems Supported filesystems
+ driver Various drivers grouped here, currently rtc (2.4)
+ execdomains Execdomains, related to security (2.4)
+ fb Frame Buffer devices (2.4)
+ fs File system parameters, currently nfs/exports (2.4)
+ ide Directory containing info about the IDE subsystem
+ interrupts Interrupt usage
+ iomem Memory map (2.4)
+ ioports I/O port usage
+ irq Masks for irq to cpu affinity (2.4)(smp?)
+ isapnp ISA PnP (Plug&Play) Info (2.4)
+ kcore Kernel core image (can be ELF or A.OUT(deprecated in 2.4))
+ kmsg Kernel messages
+ ksyms Kernel symbol table
+ loadavg Load average of last 1, 5 & 15 minutes
+ locks Kernel locks
+ meminfo Memory info
+ misc Miscellaneous
+ modules List of loaded modules
+ mounts Mounted filesystems
+ net Networking info (see text)
+ partitions Table of partitions known to the system
+ pci Deprecated info of PCI bus (new way -> /proc/bus/pci/,
+ decoupled by lspci (2.4)
+ rtc Real time clock
+ scsi SCSI info (see text)
+ slabinfo Slab pool info
+ stat Overall statistics
+ swaps Swap space utilization
+ sys See chapter 2
+ sysvipc Info of SysVIPC Resources (msg, sem, shm) (2.4)
+ tty Info of tty drivers
+ uptime System uptime
+ version Kernel version
+ video bttv info of video resources (2.4)
+ vmallocinfo Show vmalloced areas
+..............................................................................
+
+You can, for example, check which interrupts are currently in use and what
+they are used for by looking in the file /proc/interrupts:
+
+ > cat /proc/interrupts
+ CPU0
+ 0: 8728810 XT-PIC timer
+ 1: 895 XT-PIC keyboard
+ 2: 0 XT-PIC cascade
+ 3: 531695 XT-PIC aha152x
+ 4: 2014133 XT-PIC serial
+ 5: 44401 XT-PIC pcnet_cs
+ 8: 2 XT-PIC rtc
+ 11: 8 XT-PIC i82365
+ 12: 182918 XT-PIC PS/2 Mouse
+ 13: 1 XT-PIC fpu
+ 14: 1232265 XT-PIC ide0
+ 15: 7 XT-PIC ide1
+ NMI: 0
+
+In 2.4.* a couple of lines where added to this file LOC & ERR (this time is the
+output of a SMP machine):
+
+ > cat /proc/interrupts
+
+ CPU0 CPU1
+ 0: 1243498 1214548 IO-APIC-edge timer
+ 1: 8949 8958 IO-APIC-edge keyboard
+ 2: 0 0 XT-PIC cascade
+ 5: 11286 10161 IO-APIC-edge soundblaster
+ 8: 1 0 IO-APIC-edge rtc
+ 9: 27422 27407 IO-APIC-edge 3c503
+ 12: 113645 113873 IO-APIC-edge PS/2 Mouse
+ 13: 0 0 XT-PIC fpu
+ 14: 22491 24012 IO-APIC-edge ide0
+ 15: 2183 2415 IO-APIC-edge ide1
+ 17: 30564 30414 IO-APIC-level eth0
+ 18: 177 164 IO-APIC-level bttv
+ NMI: 2457961 2457959
+ LOC: 2457882 2457881
+ ERR: 2155
+
+NMI is incremented in this case because every timer interrupt generates a NMI
+(Non Maskable Interrupt) which is used by the NMI Watchdog to detect lockups.
+
+LOC is the local interrupt counter of the internal APIC of every CPU.
+
+ERR is incremented in the case of errors in the IO-APIC bus (the bus that
+connects the CPUs in a SMP system. This means that an error has been detected,
+the IO-APIC automatically retry the transmission, so it should not be a big
+problem, but you should read the SMP-FAQ.
+
+In 2.6.2* /proc/interrupts was expanded again. This time the goal was for
+/proc/interrupts to display every IRQ vector in use by the system, not
+just those considered 'most important'. The new vectors are:
+
+ THR -- interrupt raised when a machine check threshold counter
+ (typically counting ECC corrected errors of memory or cache) exceeds
+ a configurable threshold. Only available on some systems.
+
+ TRM -- a thermal event interrupt occurs when a temperature threshold
+ has been exceeded for the CPU. This interrupt may also be generated
+ when the temperature drops back to normal.
+
+ SPU -- a spurious interrupt is some interrupt that was raised then lowered
+ by some IO device before it could be fully processed by the APIC. Hence
+ the APIC sees the interrupt but does not know what device it came from.
+ For this case the APIC will generate the interrupt with a IRQ vector
+ of 0xff. This might also be generated by chipset bugs.
+
+ RES, CAL, TLB -- rescheduling, call and TLB flush interrupts are
+ sent from one CPU to another per the needs of the OS. Typically,
+ their statistics are used by kernel developers and interested users to
+ determine the occurance of interrupt of the given type.
+
+The above IRQ vectors are displayed only when relevent. For example,
+the threshold vector does not exist on x86_64 platforms. Others are
+suppressed when the system is a uniprocessor. As of this writing, only
+i386 and x86_64 platforms support the new IRQ vector displays.
+
+Of some interest is the introduction of the /proc/irq directory to 2.4.
+It could be used to set IRQ to CPU affinity, this means that you can "hook" an
+IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the
+irq subdir is one subdir for each IRQ, and two files; default_smp_affinity and
+prof_cpu_mask.
+
+For example
+ > ls /proc/irq/
+ 0 10 12 14 16 18 2 4 6 8 prof_cpu_mask
+ 1 11 13 15 17 19 3 5 7 9 default_smp_affinity
+ > ls /proc/irq/0/
+ smp_affinity
+
+smp_affinity is a bitmask, in which you can specify which CPUs can handle the
+IRQ, you can set it by doing:
+
+ > echo 1 > /proc/irq/10/smp_affinity
+
+This means that only the first CPU will handle the IRQ, but you can also echo
+5 which means that only the first and fourth CPU can handle the IRQ.
+
+The contents of each smp_affinity file is the same by default:
+
+ > cat /proc/irq/0/smp_affinity
+ ffffffff
+
+The default_smp_affinity mask applies to all non-active IRQs, which are the
+IRQs which have not yet been allocated/activated, and hence which lack a
+/proc/irq/[0-9]* directory.
+
+prof_cpu_mask specifies which CPUs are to be profiled by the system wide
+profiler. Default value is ffffffff (all cpus).
+
+The way IRQs are routed is handled by the IO-APIC, and it's Round Robin
+between all the CPUs which are allowed to handle it. As usual the kernel has
+more info than you and does a better job than you, so the defaults are the
+best choice for almost everyone.
+
+There are three more important subdirectories in /proc: net, scsi, and sys.
+The general rule is that the contents, or even the existence of these
+directories, depend on your kernel configuration. If SCSI is not enabled, the
+directory scsi may not exist. The same is true with the net, which is there
+only when networking support is present in the running kernel.
+
+The slabinfo file gives information about memory usage at the slab level.
+Linux uses slab pools for memory management above page level in version 2.2.
+Commonly used objects have their own slab pool (such as network buffers,
+directory cache, and so on).
+
+..............................................................................
+
+> cat /proc/buddyinfo
+
+Node 0, zone DMA 0 4 5 4 4 3 ...
+Node 0, zone Normal 1 0 0 1 101 8 ...
+Node 0, zone HighMem 2 0 0 1 1 0 ...
+
+Memory fragmentation is a problem under some workloads, and buddyinfo is a
+useful tool for helping diagnose these problems. Buddyinfo will give you a
+clue as to how big an area you can safely allocate, or why a previous
+allocation failed.
+
+Each column represents the number of pages of a certain order which are
+available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
+ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE
+available in ZONE_NORMAL, etc...
+
+..............................................................................
+
+meminfo:
+
+Provides information about distribution and utilization of memory. This
+varies by architecture and compile options. The following is from a
+16GB PIII, which has highmem enabled. You may not have all of these fields.
+
+> cat /proc/meminfo
+
+
+MemTotal: 16344972 kB
+MemFree: 13634064 kB
+Buffers: 3656 kB
+Cached: 1195708 kB
+SwapCached: 0 kB
+Active: 891636 kB
+Inactive: 1077224 kB
+HighTotal: 15597528 kB
+HighFree: 13629632 kB
+LowTotal: 747444 kB
+LowFree: 4432 kB
+SwapTotal: 0 kB
+SwapFree: 0 kB
+Dirty: 968 kB
+Writeback: 0 kB
+AnonPages: 861800 kB
+Mapped: 280372 kB
+Slab: 284364 kB
+SReclaimable: 159856 kB
+SUnreclaim: 124508 kB
+PageTables: 24448 kB
+NFS_Unstable: 0 kB
+Bounce: 0 kB
+WritebackTmp: 0 kB
+CommitLimit: 7669796 kB
+Committed_AS: 100056 kB
+VmallocTotal: 112216 kB
+VmallocUsed: 428 kB
+VmallocChunk: 111088 kB
+
+ MemTotal: Total usable ram (i.e. physical ram minus a few reserved
+ bits and the kernel binary code)
+ MemFree: The sum of LowFree+HighFree
+ Buffers: Relatively temporary storage for raw disk blocks
+ shouldn't get tremendously large (20MB or so)
+ Cached: in-memory cache for files read from the disk (the
+ pagecache). Doesn't include SwapCached
+ SwapCached: Memory that once was swapped out, is swapped back in but
+ still also is in the swapfile (if memory is needed it
+ doesn't need to be swapped out AGAIN because it is already
+ in the swapfile. This saves I/O)
+ Active: Memory that has been used more recently and usually not
+ reclaimed unless absolutely necessary.
+ Inactive: Memory which has been less recently used. It is more
+ eligible to be reclaimed for other purposes
+ HighTotal:
+ HighFree: Highmem is all memory above ~860MB of physical memory
+ Highmem areas are for use by userspace programs, or
+ for the pagecache. The kernel must use tricks to access
+ this memory, making it slower to access than lowmem.
+ LowTotal:
+ LowFree: Lowmem is memory which can be used for everything that
+ highmem can be used for, but it is also available for the
+ kernel's use for its own data structures. Among many
+ other things, it is where everything from the Slab is
+ allocated. Bad things happen when you're out of lowmem.
+ SwapTotal: total amount of swap space available
+ SwapFree: Memory which has been evicted from RAM, and is temporarily
+ on the disk
+ Dirty: Memory which is waiting to get written back to the disk
+ Writeback: Memory which is actively being written back to the disk
+ AnonPages: Non-file backed pages mapped into userspace page tables
+ Mapped: files which have been mmaped, such as libraries
+ Slab: in-kernel data structures cache
+SReclaimable: Part of Slab, that might be reclaimed, such as caches
+ SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
+ PageTables: amount of memory dedicated to the lowest level of page
+ tables.
+NFS_Unstable: NFS pages sent to the server, but not yet committed to stable
+ storage
+ Bounce: Memory used for block device "bounce buffers"
+WritebackTmp: Memory used by FUSE for temporary writeback buffers
+ CommitLimit: Based on the overcommit ratio ('vm.overcommit_ratio'),
+ this is the total amount of memory currently available to
+ be allocated on the system. This limit is only adhered to
+ if strict overcommit accounting is enabled (mode 2 in
+ 'vm.overcommit_memory').
+ The CommitLimit is calculated with the following formula:
+ CommitLimit = ('vm.overcommit_ratio' * Physical RAM) + Swap
+ For example, on a system with 1G of physical RAM and 7G
+ of swap with a `vm.overcommit_ratio` of 30 it would
+ yield a CommitLimit of 7.3G.
+ For more details, see the memory overcommit documentation
+ in vm/overcommit-accounting.
+Committed_AS: The amount of memory presently allocated on the system.
+ The committed memory is a sum of all of the memory which
+ has been allocated by processes, even if it has not been
+ "used" by them as of yet. A process which malloc()'s 1G
+ of memory, but only touches 300M of it will only show up
+ as using 300M of memory even if it has the address space
+ allocated for the entire 1G. This 1G is memory which has
+ been "committed" to by the VM and can be used at any time
+ by the allocating application. With strict overcommit
+ enabled on the system (mode 2 in 'vm.overcommit_memory'),
+ allocations which would exceed the CommitLimit (detailed
+ above) will not be permitted. This is useful if one needs
+ to guarantee that processes will not fail due to lack of
+ memory once that memory has been successfully allocated.
+VmallocTotal: total size of vmalloc memory area
+ VmallocUsed: amount of vmalloc area which is used
+VmallocChunk: largest contigious block of vmalloc area which is free
+
+..............................................................................
+
+vmallocinfo:
+
+Provides information about vmalloced/vmaped areas. One line per area,
+containing the virtual address range of the area, size in bytes,
+caller information of the creator, and optional information depending
+on the kind of area :
+
+ pages=nr number of pages
+ phys=addr if a physical address was specified
+ ioremap I/O mapping (ioremap() and friends)
+ vmalloc vmalloc() area
+ vmap vmap()ed pages
+ user VM_USERMAP area
+ vpages buffer for pages pointers was vmalloced (huge area)
+ N<node>=nr (Only on NUMA kernels)
+ Number of pages allocated on memory node <node>
+
+> cat /proc/vmallocinfo
+0xffffc20000000000-0xffffc20000201000 2101248 alloc_large_system_hash+0x204 ...
+ /0x2c0 pages=512 vmalloc N0=128 N1=128 N2=128 N3=128
+0xffffc20000201000-0xffffc20000302000 1052672 alloc_large_system_hash+0x204 ...
+ /0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64
+0xffffc20000302000-0xffffc20000304000 8192 acpi_tb_verify_table+0x21/0x4f...
+ phys=7fee8000 ioremap
+0xffffc20000304000-0xffffc20000307000 12288 acpi_tb_verify_table+0x21/0x4f...
+ phys=7fee7000 ioremap
+0xffffc2000031d000-0xffffc2000031f000 8192 init_vdso_vars+0x112/0x210
+0xffffc2000031f000-0xffffc2000032b000 49152 cramfs_uncompress_init+0x2e ...
+ /0x80 pages=11 vmalloc N0=3 N1=3 N2=2 N3=3
+0xffffc2000033a000-0xffffc2000033d000 12288 sys_swapon+0x640/0xac0 ...
+ pages=2 vmalloc N1=2
+0xffffc20000347000-0xffffc2000034c000 20480 xt_alloc_table_info+0xfe ...
+ /0x130 [x_tables] pages=4 vmalloc N0=4
+0xffffffffa0000000-0xffffffffa000f000 61440 sys_init_module+0xc27/0x1d00 ...
+ pages=14 vmalloc N2=14
+0xffffffffa000f000-0xffffffffa0014000 20480 sys_init_module+0xc27/0x1d00 ...
+ pages=4 vmalloc N1=4
+0xffffffffa0014000-0xffffffffa0017000 12288 sys_init_module+0xc27/0x1d00 ...
+ pages=2 vmalloc N1=2
+0xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 ...
+ pages=10 vmalloc N0=10
+
+1.3 IDE devices in /proc/ide
+----------------------------
+
+The subdirectory /proc/ide contains information about all IDE devices of which
+the kernel is aware. There is one subdirectory for each IDE controller, the
+file drivers and a link for each IDE device, pointing to the device directory
+in the controller specific subtree.
+
+The file drivers contains general information about the drivers used for the
+IDE devices:
+
+ > cat /proc/ide/drivers
+ ide-cdrom version 4.53
+ ide-disk version 1.08
+
+More detailed information can be found in the controller specific
+subdirectories. These are named ide0, ide1 and so on. Each of these
+directories contains the files shown in table 1-5.
+
+
+Table 1-5: IDE controller info in /proc/ide/ide?
+..............................................................................
+ File Content
+ channel IDE channel (0 or 1)
+ config Configuration (only for PCI/IDE bridge)
+ mate Mate name
+ model Type/Chipset of IDE controller
+..............................................................................
+
+Each device connected to a controller has a separate subdirectory in the
+controllers directory. The files listed in table 1-6 are contained in these
+directories.
+
+
+Table 1-6: IDE device information
+..............................................................................
+ File Content
+ cache The cache
+ capacity Capacity of the medium (in 512Byte blocks)
+ driver driver and version
+ geometry physical and logical geometry
+ identify device identify block
+ media media type
+ model device identifier
+ settings device setup
+ smart_thresholds IDE disk management thresholds
+ smart_values IDE disk management values
+..............................................................................
+
+The most interesting file is settings. This file contains a nice overview of
+the drive parameters:
+
+ # cat /proc/ide/ide0/hda/settings
+ name value min max mode
+ ---- ----- --- --- ----
+ bios_cyl 526 0 65535 rw
+ bios_head 255 0 255 rw
+ bios_sect 63 0 63 rw
+ breada_readahead 4 0 127 rw
+ bswap 0 0 1 r
+ file_readahead 72 0 2097151 rw
+ io_32bit 0 0 3 rw
+ keepsettings 0 0 1 rw
+ max_kb_per_request 122 1 127 rw
+ multcount 0 0 8 rw
+ nice1 1 0 1 rw
+ nowerr 0 0 1 rw
+ pio_mode write-only 0 255 w
+ slow 0 0 1 rw
+ unmaskirq 0 0 1 rw
+ using_dma 0 0 1 rw
+
+
+1.4 Networking info in /proc/net
+--------------------------------
+
+The subdirectory /proc/net follows the usual pattern. Table 1-6 shows the
+additional values you get for IP version 6 if you configure the kernel to
+support this. Table 1-7 lists the files and their meaning.
+
+
+Table 1-6: IPv6 info in /proc/net
+..............................................................................
+ File Content
+ udp6 UDP sockets (IPv6)
+ tcp6 TCP sockets (IPv6)
+ raw6 Raw device statistics (IPv6)
+ igmp6 IP multicast addresses, which this host joined (IPv6)
+ if_inet6 List of IPv6 interface addresses
+ ipv6_route Kernel routing table for IPv6
+ rt6_stats Global IPv6 routing tables statistics
+ sockstat6 Socket statistics (IPv6)
+ snmp6 Snmp data (IPv6)
+..............................................................................
+
+
+Table 1-7: Network info in /proc/net
+..............................................................................
+ File Content
+ arp Kernel ARP table
+ dev network devices with statistics
+ dev_mcast the Layer2 multicast groups a device is listening too
+ (interface index, label, number of references, number of bound
+ addresses).
+ dev_stat network device status
+ ip_fwchains Firewall chain linkage
+ ip_fwnames Firewall chain names
+ ip_masq Directory containing the masquerading tables
+ ip_masquerade Major masquerading table
+ netstat Network statistics
+ raw raw device statistics
+ route Kernel routing table
+ rpc Directory containing rpc info
+ rt_cache Routing cache
+ snmp SNMP data
+ sockstat Socket statistics
+ tcp TCP sockets
+ tr_rif Token ring RIF routing table
+ udp UDP sockets
+ unix UNIX domain sockets
+ wireless Wireless interface data (Wavelan etc)
+ igmp IP multicast addresses, which this host joined
+ psched Global packet scheduler parameters.
+ netlink List of PF_NETLINK sockets
+ ip_mr_vifs List of multicast virtual interfaces
+ ip_mr_cache List of multicast routing cache
+..............................................................................
+
+You can use this information to see which network devices are available in
+your system and how much traffic was routed over those devices:
+
+ > cat /proc/net/dev
+ Inter-|Receive |[...
+ face |bytes packets errs drop fifo frame compressed multicast|[...
+ lo: 908188 5596 0 0 0 0 0 0 [...
+ ppp0:15475140 20721 410 0 0 410 0 0 [...
+ eth0: 614530 7085 0 0 0 0 0 1 [...
+
+ ...] Transmit
+ ...] bytes packets errs drop fifo colls carrier compressed
+ ...] 908188 5596 0 0 0 0 0 0
+ ...] 1375103 17405 0 0 0 0 0 0
+ ...] 1703981 5535 0 0 0 3 0 0
+
+In addition, each Channel Bond interface has it's own directory. For
+example, the bond0 device will have a directory called /proc/net/bond0/.
+It will contain information that is specific to that bond, such as the
+current slaves of the bond, the link status of the slaves, and how
+many times the slaves link has failed.
+
+1.5 SCSI info
+-------------
+
+If you have a SCSI host adapter in your system, you'll find a subdirectory
+named after the driver for this adapter in /proc/scsi. You'll also see a list
+of all recognized SCSI devices in /proc/scsi:
+
+ >cat /proc/scsi/scsi
+ Attached devices:
+ Host: scsi0 Channel: 00 Id: 00 Lun: 00
+ Vendor: IBM Model: DGHS09U Rev: 03E0
+ Type: Direct-Access ANSI SCSI revision: 03
+ Host: scsi0 Channel: 00 Id: 06 Lun: 00
+ Vendor: PIONEER Model: CD-ROM DR-U06S Rev: 1.04
+ Type: CD-ROM ANSI SCSI revision: 02
+
+
+The directory named after the driver has one file for each adapter found in
+the system. These files contain information about the controller, including
+the used IRQ and the IO address range. The amount of information shown is
+dependent on the adapter you use. The example shows the output for an Adaptec
+AHA-2940 SCSI adapter:
+
+ > cat /proc/scsi/aic7xxx/0
+
+ Adaptec AIC7xxx driver version: 5.1.19/3.2.4
+ Compile Options:
+ TCQ Enabled By Default : Disabled
+ AIC7XXX_PROC_STATS : Disabled
+ AIC7XXX_RESET_DELAY : 5
+ Adapter Configuration:
+ SCSI Adapter: Adaptec AHA-294X Ultra SCSI host adapter
+ Ultra Wide Controller
+ PCI MMAPed I/O Base: 0xeb001000
+ Adapter SEEPROM Config: SEEPROM found and used.
+ Adaptec SCSI BIOS: Enabled
+ IRQ: 10
+ SCBs: Active 0, Max Active 2,
+ Allocated 15, HW 16, Page 255
+ Interrupts: 160328
+ BIOS Control Word: 0x18b6
+ Adapter Control Word: 0x005b
+ Extended Translation: Enabled
+ Disconnect Enable Flags: 0xffff
+ Ultra Enable Flags: 0x0001
+ Tag Queue Enable Flags: 0x0000
+ Ordered Queue Tag Flags: 0x0000
+ Default Tag Queue Depth: 8
+ Tagged Queue By Device array for aic7xxx host instance 0:
+ {255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}
+ Actual queue depth per device for aic7xxx host instance 0:
+ {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}
+ Statistics:
+ (scsi0:0:0:0)
+ Device using Wide/Sync transfers at 40.0 MByte/sec, offset 8
+ Transinfo settings: current(12/8/1/0), goal(12/8/1/0), user(12/15/1/0)
+ Total transfers 160151 (74577 reads and 85574 writes)
+ (scsi0:0:6:0)
+ Device using Narrow/Sync transfers at 5.0 MByte/sec, offset 15
+ Transinfo settings: current(50/15/0/0), goal(50/15/0/0), user(50/15/0/0)
+ Total transfers 0 (0 reads and 0 writes)
+
+
+1.6 Parallel port info in /proc/parport
+---------------------------------------
+
+The directory /proc/parport contains information about the parallel ports of
+your system. It has one subdirectory for each port, named after the port
+number (0,1,2,...).
+
+These directories contain the four files shown in Table 1-8.
+
+
+Table 1-8: Files in /proc/parport
+..............................................................................
+ File Content
+ autoprobe Any IEEE-1284 device ID information that has been acquired.
+ devices list of the device drivers using that port. A + will appear by the
+ name of the device currently using the port (it might not appear
+ against any).
+ hardware Parallel port's base address, IRQ line and DMA channel.
+ irq IRQ that parport is using for that port. This is in a separate
+ file to allow you to alter it by writing a new value in (IRQ
+ number or none).
+..............................................................................
+
+1.7 TTY info in /proc/tty
+-------------------------
+
+Information about the available and actually used tty's can be found in the
+directory /proc/tty.You'll find entries for drivers and line disciplines in
+this directory, as shown in Table 1-9.
+
+
+Table 1-9: Files in /proc/tty
+..............................................................................
+ File Content
+ drivers list of drivers and their usage
+ ldiscs registered line disciplines
+ driver/serial usage statistic and status of single tty lines
+..............................................................................
+
+To see which tty's are currently in use, you can simply look into the file
+/proc/tty/drivers:
+
+ > cat /proc/tty/drivers
+ pty_slave /dev/pts 136 0-255 pty:slave
+ pty_master /dev/ptm 128 0-255 pty:master
+ pty_slave /dev/ttyp 3 0-255 pty:slave
+ pty_master /dev/pty 2 0-255 pty:master
+ serial /dev/cua 5 64-67 serial:callout
+ serial /dev/ttyS 4 64-67 serial
+ /dev/tty0 /dev/tty0 4 0 system:vtmaster
+ /dev/ptmx /dev/ptmx 5 2 system
+ /dev/console /dev/console 5 1 system:console
+ /dev/tty /dev/tty 5 0 system:/dev/tty
+ unknown /dev/tty 4 1-63 console
+
+
+1.8 Miscellaneous kernel statistics in /proc/stat
+-------------------------------------------------
+
+Various pieces of information about kernel activity are available in the
+/proc/stat file. All of the numbers reported in this file are aggregates
+since the system first booted. For a quick look, simply cat the file:
+
+ > cat /proc/stat
+ cpu 2255 34 2290 22625563 6290 127 456 0
+ cpu0 1132 34 1441 11311718 3675 127 438 0
+ cpu1 1123 0 849 11313845 2614 0 18 0
+ intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...]
+ ctxt 1990473
+ btime 1062191376
+ processes 2915
+ procs_running 1
+ procs_blocked 0
+
+The very first "cpu" line aggregates the numbers in all of the other "cpuN"
+lines. These numbers identify the amount of time the CPU has spent performing
+different kinds of work. Time units are in USER_HZ (typically hundredths of a
+second). The meanings of the columns are as follows, from left to right:
+
+- user: normal processes executing in user mode
+- nice: niced processes executing in user mode
+- system: processes executing in kernel mode
+- idle: twiddling thumbs
+- iowait: waiting for I/O to complete
+- irq: servicing interrupts
+- softirq: servicing softirqs
+- steal: involuntary wait
+
+The "intr" line gives counts of interrupts serviced since boot time, for each
+of the possible system interrupts. The first column is the total of all
+interrupts serviced; each subsequent column is the total for that particular
+interrupt.
+
+The "ctxt" line gives the total number of context switches across all CPUs.
+
+The "btime" line gives the time at which the system booted, in seconds since
+the Unix epoch.
+
+The "processes" line gives the number of processes and threads created, which
+includes (but is not limited to) those created by calls to the fork() and
+clone() system calls.
+
+The "procs_running" line gives the number of processes currently running on
+CPUs.
+
+The "procs_blocked" line gives the number of processes currently blocked,
+waiting for I/O to complete.
+
+
+1.9 Ext4 file system parameters
+------------------------------
+
+Information about mounted ext4 file systems can be found in
+/proc/fs/ext4. Each mounted filesystem will have a directory in
+/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
+/proc/fs/ext4/dm-0). The files in each per-device directory are shown
+in Table 1-10, below.
+
+Table 1-10: Files in /proc/fs/ext4/<devname>
+..............................................................................
+ File Content
+ mb_groups details of multiblock allocator buddy cache of free blocks
+ mb_history multiblock allocation history
+ stats controls whether the multiblock allocator should start
+ collecting statistics, which are shown during the unmount
+ group_prealloc the multiblock allocator will round up allocation
+ requests to a multiple of this tuning parameter if the
+ stripe size is not set in the ext4 superblock
+ max_to_scan The maximum number of extents the multiblock allocator
+ will search to find the best extent
+ min_to_scan The minimum number of extents the multiblock allocator
+ will search to find the best extent
+ order2_req Tuning parameter which controls the minimum size for
+ requests (as a power of 2) where the buddy cache is
+ used
+ stream_req Files which have fewer blocks than this tunable
+ parameter will have their blocks allocated out of a
+ block group specific preallocation pool, so that small
+ files are packed closely together. Each large file
+ will have its blocks allocated out of its own unique
+ preallocation pool.
+inode_readahead Tuning parameter which controls the maximum number of
+ inode table blocks that ext4's inode table readahead
+ algorithm will pre-read into the buffer cache
+..............................................................................
+
+
+------------------------------------------------------------------------------
+Summary
+------------------------------------------------------------------------------
+The /proc file system serves information about the running system. It not only
+allows access to process data but also allows you to request the kernel status
+by reading files in the hierarchy.
+
+The directory structure of /proc reflects the types of information and makes
+it easy, if not obvious, where to look for specific data.
+------------------------------------------------------------------------------
+
+------------------------------------------------------------------------------
+CHAPTER 2: MODIFYING SYSTEM PARAMETERS
+------------------------------------------------------------------------------
+
+------------------------------------------------------------------------------
+In This Chapter
+------------------------------------------------------------------------------
+* Modifying kernel parameters by writing into files found in /proc/sys
+* Exploring the files which modify certain parameters
+* Review of the /proc/sys file tree
+------------------------------------------------------------------------------
+
+
+A very interesting part of /proc is the directory /proc/sys. This is not only
+a source of information, it also allows you to change parameters within the
+kernel. Be very careful when attempting this. You can optimize your system,
+but you can also cause it to crash. Never alter kernel parameters on a
+production system. Set up a development machine and test to make sure that
+everything works the way you want it to. You may have no alternative but to
+reboot the machine once an error has been made.
+
+To change a value, simply echo the new value into the file. An example is
+given below in the section on the file system data. You need to be root to do
+this. You can create your own boot script to perform this every time your
+system boots.
+
+The files in /proc/sys can be used to fine tune and monitor miscellaneous and
+general things in the operation of the Linux kernel. Since some of the files
+can inadvertently disrupt your system, it is advisable to read both
+documentation and source before actually making adjustments. In any case, be
+very careful when writing to any of these files. The entries in /proc may
+change slightly between the 2.1.* and the 2.2 kernel, so if there is any doubt
+review the kernel documentation in the directory /usr/src/linux/Documentation.
+This chapter is heavily based on the documentation included in the pre 2.2
+kernels, and became part of it in version 2.2.1 of the Linux kernel.
+
+2.1 /proc/sys/fs - File system data
+-----------------------------------
+
+This subdirectory contains specific file system, file handle, inode, dentry
+and quota information.
+
+Currently, these files are in /proc/sys/fs:
+
+dentry-state
+------------
+
+Status of the directory cache. Since directory entries are dynamically
+allocated and deallocated, this file indicates the current status. It holds
+six values, in which the last two are not used and are always zero. The others
+are listed in table 2-1.
+
+
+Table 2-1: Status files of the directory cache
+..............................................................................
+ File Content
+ nr_dentry Almost always zero
+ nr_unused Number of unused cache entries
+ age_limit
+ in seconds after the entry may be reclaimed, when memory is short
+ want_pages internally
+..............................................................................
+
+dquot-nr and dquot-max
+----------------------
+
+The file dquot-max shows the maximum number of cached disk quota entries.
+
+The file dquot-nr shows the number of allocated disk quota entries and the
+number of free disk quota entries.
+
+If the number of available cached disk quotas is very low and you have a large
+number of simultaneous system users, you might want to raise the limit.
+
+file-nr and file-max
+--------------------
+
+The kernel allocates file handles dynamically, but doesn't free them again at
+this time.
+
+The value in file-max denotes the maximum number of file handles that the
+Linux kernel will allocate. When you get a lot of error messages about running
+out of file handles, you might want to raise this limit. The default value is
+10% of RAM in kilobytes. To change it, just write the new number into the
+file:
+
+ # cat /proc/sys/fs/file-max
+ 4096
+ # echo 8192 > /proc/sys/fs/file-max
+ # cat /proc/sys/fs/file-max
+ 8192
+
+
+This method of revision is useful for all customizable parameters of the
+kernel - simply echo the new value to the corresponding file.
+
+Historically, the three values in file-nr denoted the number of allocated file
+handles, the number of allocated but unused file handles, and the maximum
+number of file handles. Linux 2.6 always reports 0 as the number of free file
+handles -- this is not an error, it just means that the number of allocated
+file handles exactly matches the number of used file handles.
+
+Attempts to allocate more file descriptors than file-max are reported with
+printk, look for "VFS: file-max limit <number> reached".
+
+inode-state and inode-nr
+------------------------
+
+The file inode-nr contains the first two items from inode-state, so we'll skip
+to that file...
+
+inode-state contains two actual numbers and five dummy values. The numbers
+are nr_inodes and nr_free_inodes (in order of appearance).
+
+nr_inodes
+~~~~~~~~~
+
+Denotes the number of inodes the system has allocated. This number will
+grow and shrink dynamically.
+
+nr_open
+-------
+
+Denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
+nr_free_inodes
+--------------
+
+Represents the number of free inodes. Ie. The number of inuse inodes is
+(nr_inodes - nr_free_inodes).
+
+aio-nr and aio-max-nr
+---------------------
+
+aio-nr is the running total of the number of events specified on the
+io_setup system call for all currently active aio contexts. If aio-nr
+reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
+raising aio-max-nr does not result in the pre-allocation or re-sizing
+of any kernel data structures.
+
+2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats
+-----------------------------------------------------------
+
+Besides these files, there is the subdirectory /proc/sys/fs/binfmt_misc. This
+handles the kernel support for miscellaneous binary formats.
+
+Binfmt_misc provides the ability to register additional binary formats to the
+Kernel without compiling an additional module/kernel. Therefore, binfmt_misc
+needs to know magic numbers at the beginning or the filename extension of the
+binary.
+
+It works by maintaining a linked list of structs that contain a description of
+a binary format, including a magic with size (or the filename extension),
+offset and mask, and the interpreter name. On request it invokes the given
+interpreter with the original program as argument, as binfmt_java and
+binfmt_em86 and binfmt_mz do. Since binfmt_misc does not define any default
+binary-formats, you have to register an additional binary-format.
+
+There are two general files in binfmt_misc and one file per registered format.
+The two general files are register and status.
+
+Registering a new binary format
+-------------------------------
+
+To register a new binary format you have to issue the command
+
+ echo :name:type:offset:magic:mask:interpreter: > /proc/sys/fs/binfmt_misc/register
+
+
+
+with appropriate name (the name for the /proc-dir entry), offset (defaults to
+0, if omitted), magic, mask (which can be omitted, defaults to all 0xff) and
+last but not least, the interpreter that is to be invoked (for example and
+testing /bin/echo). Type can be M for usual magic matching or E for filename
+extension matching (give extension in place of magic).
+
+Check or reset the status of the binary format handler
+------------------------------------------------------
+
+If you do a cat on the file /proc/sys/fs/binfmt_misc/status, you will get the
+current status (enabled/disabled) of binfmt_misc. Change the status by echoing
+0 (disables) or 1 (enables) or -1 (caution: this clears all previously
+registered binary formats) to status. For example echo 0 > status to disable
+binfmt_misc (temporarily).
+
+Status of a single handler
+--------------------------
+
+Each registered handler has an entry in /proc/sys/fs/binfmt_misc. These files
+perform the same function as status, but their scope is limited to the actual
+binary format. By cating this file, you also receive all related information
+about the interpreter/magic of the binfmt.
+
+Example usage of binfmt_misc (emulate binfmt_java)
+--------------------------------------------------
+
+ cd /proc/sys/fs/binfmt_misc
+ echo ':Java:M::\xca\xfe\xba\xbe::/usr/local/java/bin/javawrapper:' > register
+ echo ':HTML:E::html::/usr/local/java/bin/appletviewer:' > register
+ echo ':Applet:M::<!--applet::/usr/local/java/bin/appletviewer:' > register
+ echo ':DEXE:M::\x0eDEX::/usr/bin/dosexec:' > register
+
+
+These four lines add support for Java executables and Java applets (like
+binfmt_java, additionally recognizing the .html extension with no need to put
+<!--applet> to every applet file). You have to install the JDK and the
+shell-script /usr/local/java/bin/javawrapper too. It works around the
+brokenness of the Java filename handling. To add a Java binary, just create a
+link to the class-file somewhere in the path.
+
+2.3 /proc/sys/kernel - general kernel parameters
+------------------------------------------------
+
+This directory reflects general kernel behaviors. As I've said before, the
+contents depend on your configuration. Here you'll find the most important
+files, along with descriptions of what they mean and how to use them.
+
+acct
+----
+
+The file contains three values; highwater, lowwater, and frequency.
+
+It exists only when BSD-style process accounting is enabled. These values
+control its behavior. If the free space on the file system where the log lives
+goes below lowwater percentage, accounting suspends. If it goes above
+highwater percentage, accounting resumes. Frequency determines how often you
+check the amount of free space (value is in seconds). Default settings are: 4,
+2, and 30. That is, suspend accounting if there is less than 2 percent free;
+resume it if we have a value of 3 or more percent; consider information about
+the amount of free space valid for 30 seconds
+
+ctrl-alt-del
+------------
+
+When the value in this file is 0, ctrl-alt-del is trapped and sent to the init
+program to handle a graceful restart. However, when the value is greater that
+zero, Linux's reaction to this key combination will be an immediate reboot,
+without syncing its dirty buffers.
+
+[NOTE]
+ When a program (like dosemu) has the keyboard in raw mode, the
+ ctrl-alt-del is intercepted by the program before it ever reaches the
+ kernel tty layer, and it is up to the program to decide what to do with
+ it.
+
+domainname and hostname
+-----------------------
+
+These files can be controlled to set the NIS domainname and hostname of your
+box. For the classic darkstar.frop.org a simple:
+
+ # echo "darkstar" > /proc/sys/kernel/hostname
+ # echo "frop.org" > /proc/sys/kernel/domainname
+
+
+would suffice to set your hostname and NIS domainname.
+
+osrelease, ostype and version
+-----------------------------
+
+The names make it pretty obvious what these fields contain:
+
+ > cat /proc/sys/kernel/osrelease
+ 2.2.12
+
+ > cat /proc/sys/kernel/ostype
+ Linux
+
+ > cat /proc/sys/kernel/version
+ #4 Fri Oct 1 12:41:14 PDT 1999
+
+
+The files osrelease and ostype should be clear enough. Version needs a little
+more clarification. The #4 means that this is the 4th kernel built from this
+source base and the date after it indicates the time the kernel was built. The
+only way to tune these values is to rebuild the kernel.
+
+panic
+-----
+
+The value in this file represents the number of seconds the kernel waits
+before rebooting on a panic. When you use the software watchdog, the
+recommended setting is 60. If set to 0, the auto reboot after a kernel panic
+is disabled, which is the default setting.
+
+printk
+------
+
+The four values in printk denote
+* console_loglevel,
+* default_message_loglevel,
+* minimum_console_loglevel and
+* default_console_loglevel
+respectively.
+
+These values influence printk() behavior when printing or logging error
+messages, which come from inside the kernel. See syslog(2) for more
+information on the different log levels.
+
+console_loglevel
+----------------
+
+Messages with a higher priority than this will be printed to the console.
+
+default_message_level
+---------------------
+
+Messages without an explicit priority will be printed with this priority.
+
+minimum_console_loglevel
+------------------------
+
+Minimum (highest) value to which the console_loglevel can be set.
+
+default_console_loglevel
+------------------------
+
+Default value for console_loglevel.
+
+sg-big-buff
+-----------
+
+This file shows the size of the generic SCSI (sg) buffer. At this point, you
+can't tune it yet, but you can change it at compile time by editing
+include/scsi/sg.h and changing the value of SG_BIG_BUFF.
+
+If you use a scanner with SANE (Scanner Access Now Easy) you might want to set
+this to a higher value. Refer to the SANE documentation on this issue.
+
+modprobe
+--------
+
+The location where the modprobe binary is located. The kernel uses this
+program to load modules on demand.
+
+unknown_nmi_panic
+-----------------
+
+The value in this file affects behavior of handling NMI. When the value is
+non-zero, unknown NMI is trapped and then panic occurs. At that time, kernel
+debugging information is displayed on console.
+
+NMI switch that most IA32 servers have fires unknown NMI up, for example.
+If a system hangs up, try pressing the NMI switch.
+
+panic_on_unrecovered_nmi
+------------------------
+
+The default Linux behaviour on an NMI of either memory or unknown is to continue
+operation. For many environments such as scientific computing it is preferable
+that the box is taken out and the error dealt with than an uncorrected
+parity/ECC error get propogated.
+
+A small number of systems do generate NMI's for bizarre random reasons such as
+power management so the default is off. That sysctl works like the existing
+panic controls already in that directory.
+
+nmi_watchdog
+------------
+
+Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero
+the NMI watchdog is enabled and will continuously test all online cpus to
+determine whether or not they are still functioning properly.
+
+Because the NMI watchdog shares registers with oprofile, by disabling the NMI
+watchdog, oprofile may have more registers to utilize.
+
+msgmni
+------
+
+Maximum number of message queue ids on the system.
+This value scales to the amount of lowmem. It is automatically recomputed
+upon memory add/remove or ipc namespace creation/removal.
+When a value is written into this file, msgmni's value becomes fixed, i.e. it
+is not recomputed anymore when one of the above events occurs.
+Use auto_msgmni to change this behavior.
+
+auto_msgmni
+-----------
+
+Enables/Disables automatic recomputing of msgmni upon memory add/remove or
+upon ipc namespace creation/removal (see the msgmni description above).
+Echoing "1" into this file enables msgmni automatic recomputing.
+Echoing "0" turns it off.
+auto_msgmni default value is 1.
+
+
+2.4 /proc/sys/vm - The virtual memory subsystem
+-----------------------------------------------
+
+The files in this directory can be used to tune the operation of the virtual
+memory (VM) subsystem of the Linux kernel.
+
+vfs_cache_pressure
+------------------
+
+Controls the tendency of the kernel to reclaim the memory which is used for
+caching of directory and inode objects.
+
+At the default value of vfs_cache_pressure=100 the kernel will attempt to
+reclaim dentries and inodes at a "fair" rate with respect to pagecache and
+swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer
+to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100
+causes the kernel to prefer to reclaim dentries and inodes.
+
+dirty_background_ratio
+----------------------
+
+Contains, as a percentage of the dirtyable system memory (free pages + mapped
+pages + file cache, not including locked pages and HugePages), the number of
+pages at which the pdflush background writeback daemon will start writing out
+dirty data.
+
+dirty_ratio
+-----------------
+
+Contains, as a percentage of the dirtyable system memory (free pages + mapped
+pages + file cache, not including locked pages and HugePages), the number of
+pages at which a process which is generating disk writes will itself start
+writing out dirty data.
+
+dirty_writeback_centisecs
+-------------------------
+
+The pdflush writeback daemons will periodically wake up and write `old' data
+out to disk. This tunable expresses the interval between those wakeups, in
+100'ths of a second.
+
+Setting this to zero disables periodic writeback altogether.
+
+dirty_expire_centisecs
+----------------------
+
+This tunable is used to define when dirty data is old enough to be eligible
+for writeout by the pdflush daemons. It is expressed in 100'ths of a second.
+Data which has been dirty in-memory for longer than this interval will be
+written out next time a pdflush daemon wakes up.
+
+highmem_is_dirtyable
+--------------------
+
+Only present if CONFIG_HIGHMEM is set.
+
+This defaults to 0 (false), meaning that the ratios set above are calculated
+as a percentage of lowmem only. This protects against excessive scanning
+in page reclaim, swapping and general VM distress.
+
+Setting this to 1 can be useful on 32 bit machines where you want to make
+random changes within an MMAPed file that is larger than your available
+lowmem without causing large quantities of random IO. Is is safe if the
+behavior of all programs running on the machine is known and memory will
+not be otherwise stressed.
+
+legacy_va_layout
+----------------
+
+If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel
+will use the legacy (2.4) layout for all processes.
+
+lowmem_reserve_ratio
+---------------------
+
+For some specialised workloads on highmem machines it is dangerous for
+the kernel to allow process memory to be allocated from the "lowmem"
+zone. This is because that memory could then be pinned via the mlock()
+system call, or by unavailability of swapspace.
+
+And on large highmem machines this lack of reclaimable lowmem memory
+can be fatal.
+
+So the Linux page allocator has a mechanism which prevents allocations
+which _could_ use highmem from using too much lowmem. This means that
+a certain amount of lowmem is defended from the possibility of being
+captured into pinned user memory.
+
+(The same argument applies to the old 16 megabyte ISA DMA region. This
+mechanism will also defend that region from allocations which could use
+highmem or lowmem).
+
+The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is
+in defending these lower zones.
+
+If you have a machine which uses highmem or ISA DMA and your
+applications are using mlock(), or if you are running with no swap then
+you probably should change the lowmem_reserve_ratio setting.
+
+The lowmem_reserve_ratio is an array. You can see them by reading this file.
+-
+% cat /proc/sys/vm/lowmem_reserve_ratio
+256 256 32
+-
+Note: # of this elements is one fewer than number of zones. Because the highest
+ zone's value is not necessary for following calculation.
+
+But, these values are not used directly. The kernel calculates # of protection
+pages for each zones from them. These are shown as array of protection pages
+in /proc/zoneinfo like followings. (This is an example of x86-64 box).
+Each zone has an array of protection pages like this.
+
+-
+Node 0, zone DMA
+ pages free 1355
+ min 3
+ low 3
+ high 4
+ :
+ :
+ numa_other 0
+ protection: (0, 2004, 2004, 2004)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ pagesets
+ cpu: 0 pcp: 0
+ :
+-
+These protections are added to score to judge whether this zone should be used
+for page allocation or should be reclaimed.
+
+In this example, if normal pages (index=2) are required to this DMA zone and
+pages_high is used for watermark, the kernel judges this zone should not be
+used because pages_free(1355) is smaller than watermark + protection[2]
+(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
+normal page requirement. If requirement is DMA zone(index=0), protection[0]
+(=0) is used.
+
+zone[i]'s protection[j] is calculated by following expression.
+
+(i < j):
+ zone[i]->protection[j]
+ = (total sums of present_pages from zone[i+1] to zone[j] on the node)
+ / lowmem_reserve_ratio[i];
+(i = j):
+ (should not be protected. = 0;
+(i > j):
+ (not necessary, but looks 0)
+
+The default values of lowmem_reserve_ratio[i] are
+ 256 (if zone[i] means DMA or DMA32 zone)
+ 32 (others).
+As above expression, they are reciprocal number of ratio.
+256 means 1/256. # of protection pages becomes about "0.39%" of total present
+pages of higher zones on the node.
+
+If you would like to protect more pages, smaller values are effective.
+The minimum value is 1 (1/1 -> 100%).
+
+page-cluster
+------------
+
+page-cluster controls the number of pages which are written to swap in
+a single attempt. The swap I/O size.
+
+It is a logarithmic value - setting it to zero means "1 page", setting
+it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
+
+The default value is three (eight pages at a time). There may be some
+small benefits in tuning this to a different value if your workload is
+swap-intensive.
+
+overcommit_memory
+-----------------
+
+Controls overcommit of system memory, possibly allowing processes
+to allocate (but not use) more memory than is actually available.
+
+
+0 - Heuristic overcommit handling. Obvious overcommits of
+ address space are refused. Used for a typical system. It
+ ensures a seriously wild allocation fails while allowing
+ overcommit to reduce swap usage. root is allowed to
+ allocate slightly more memory in this mode. This is the
+ default.
+
+1 - Always overcommit. Appropriate for some scientific
+ applications.
+
+2 - Don't overcommit. The total address space commit
+ for the system is not permitted to exceed swap plus a
+ configurable percentage (default is 50) of physical RAM.
+ Depending on the percentage you use, in most situations
+ this means a process will not be killed while attempting
+ to use already-allocated memory but will receive errors
+ on memory allocation as appropriate.
+
+overcommit_ratio
+----------------
+
+Percentage of physical memory size to include in overcommit calculations
+(see above.)
+
+Memory allocation limit = swapspace + physmem * (overcommit_ratio / 100)
+
+ swapspace = total size of all swap areas
+ physmem = size of physical memory in system
+
+nr_hugepages and hugetlb_shm_group
+----------------------------------
+
+nr_hugepages configures number of hugetlb page reserved for the system.
+
+hugetlb_shm_group contains group id that is allowed to create SysV shared
+memory segment using hugetlb page.
+
+hugepages_treat_as_movable
+--------------------------
+
+This parameter is only useful when kernelcore= is specified at boot time to
+create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages
+are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero
+value written to hugepages_treat_as_movable allows huge pages to be allocated
+from ZONE_MOVABLE.
+
+Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge
+pages pool can easily grow or shrink within. Assuming that applications are
+not running that mlock() a lot of memory, it is likely the huge pages pool
+can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value
+into nr_hugepages and triggering page reclaim.
+
+laptop_mode
+-----------
+
+laptop_mode is a knob that controls "laptop mode". All the things that are
+controlled by this knob are discussed in Documentation/laptops/laptop-mode.txt.
+
+block_dump
+----------
+
+block_dump enables block I/O debugging when set to a nonzero value. More
+information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
+
+swap_token_timeout
+------------------
+
+This file contains valid hold time of swap out protection token. The Linux
+VM has token based thrashing control mechanism and uses the token to prevent
+unnecessary page faults in thrashing situation. The unit of the value is
+second. The value would be useful to tune thrashing behavior.
+
+drop_caches
+-----------
+
+Writing to this will cause the kernel to drop clean caches, dentries and
+inodes from memory, causing that memory to become free.
+
+To free pagecache:
+ echo 1 > /proc/sys/vm/drop_caches
+To free dentries and inodes:
+ echo 2 > /proc/sys/vm/drop_caches
+To free pagecache, dentries and inodes:
+ echo 3 > /proc/sys/vm/drop_caches
+
+As this is a non-destructive operation and dirty objects are not freeable, the
+user should run `sync' first.
+
+
+2.5 /proc/sys/dev - Device specific parameters
+----------------------------------------------
+
+Currently there is only support for CDROM drives, and for those, there is only
+one read-only file containing information about the CD-ROM drives attached to
+the system:
+
+ >cat /proc/sys/dev/cdrom/info
+ CD-ROM information, Id: cdrom.c 2.55 1999/04/25
+
+ drive name: sr0 hdb
+ drive speed: 32 40
+ drive # of slots: 1 0
+ Can close tray: 1 1
+ Can open tray: 1 1
+ Can lock tray: 1 1
+ Can change speed: 1 1
+ Can select disk: 0 1
+ Can read multisession: 1 1
+ Can read MCN: 1 1
+ Reports media changed: 1 1
+ Can play audio: 1 1
+
+
+You see two drives, sr0 and hdb, along with a list of their features.
+
+2.6 /proc/sys/sunrpc - Remote procedure calls
+---------------------------------------------
+
+This directory contains four files, which enable or disable debugging for the
+RPC functions NFS, NFS-daemon, RPC and NLM. The default values are 0. They can
+be set to one to turn debugging on. (The default value is 0 for each)
+
+2.7 /proc/sys/net - Networking stuff
+------------------------------------
+
+The interface to the networking parts of the kernel is located in
+/proc/sys/net. Table 2-3 shows all possible subdirectories. You may see only
+some of them, depending on your kernel's configuration.
+
+
+Table 2-3: Subdirectories in /proc/sys/net
+..............................................................................
+ Directory Content Directory Content
+ core General parameter appletalk Appletalk protocol
+ unix Unix domain sockets netrom NET/ROM
+ 802 E802 protocol ax25 AX25
+ ethernet Ethernet protocol rose X.25 PLP layer
+ ipv4 IP version 4 x25 X.25 protocol
+ ipx IPX token-ring IBM token ring
+ bridge Bridging decnet DEC net
+ ipv6 IP version 6
+..............................................................................
+
+We will concentrate on IP networking here. Since AX15, X.25, and DEC Net are
+only minor players in the Linux world, we'll skip them in this chapter. You'll
+find some short info on Appletalk and IPX further on in this chapter. Review
+the online documentation and the kernel source to get a detailed view of the
+parameters for those protocols. In this section we'll discuss the
+subdirectories printed in bold letters in the table above. As default values
+are suitable for most needs, there is no need to change these values.
+
+/proc/sys/net/core - Network core options
+-----------------------------------------
+
+rmem_default
+------------
+
+The default setting of the socket receive buffer in bytes.
+
+rmem_max
+--------
+
+The maximum receive socket buffer size in bytes.
+
+wmem_default
+------------
+
+The default setting (in bytes) of the socket send buffer.
+
+wmem_max
+--------
+
+The maximum send socket buffer size in bytes.
+
+message_burst and message_cost
+------------------------------
+
+These parameters are used to limit the warning messages written to the kernel
+log from the networking code. They enforce a rate limit to make a
+denial-of-service attack impossible. A higher message_cost factor, results in
+fewer messages that will be written. Message_burst controls when messages will
+be dropped. The default settings limit warning messages to one every five
+seconds.
+
+warnings
+--------
+
+This controls console messages from the networking stack that can occur because
+of problems on the network like duplicate address or bad checksums. Normally,
+this should be enabled, but if the problem persists the messages can be
+disabled.
+
+
+netdev_max_backlog
+------------------
+
+Maximum number of packets, queued on the INPUT side, when the interface
+receives packets faster than kernel can process them.
+
+optmem_max
+----------
+
+Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
+of struct cmsghdr structures with appended data.
+
+/proc/sys/net/unix - Parameters for Unix domain sockets
+-------------------------------------------------------
+
+There are only two files in this subdirectory. They control the delays for
+deleting and destroying socket descriptors.
+
+2.8 /proc/sys/net/ipv4 - IPV4 settings
+--------------------------------------
+
+IP version 4 is still the most used protocol in Unix networking. It will be
+replaced by IP version 6 in the next couple of years, but for the moment it's
+the de facto standard for the internet and is used in most networking
+environments around the world. Because of the importance of this protocol,
+we'll have a deeper look into the subtree controlling the behavior of the IPv4
+subsystem of the Linux kernel.
+
+Let's start with the entries in /proc/sys/net/ipv4.
+
+ICMP settings
+-------------
+
+icmp_echo_ignore_all and icmp_echo_ignore_broadcasts
+----------------------------------------------------
+
+Turn on (1) or off (0), if the kernel should ignore all ICMP ECHO requests, or
+just those to broadcast and multicast addresses.
+
+Please note that if you accept ICMP echo requests with a broadcast/multi\-cast
+destination address your network may be used as an exploder for denial of
+service packet flooding attacks to other hosts.
+
+icmp_destunreach_rate, icmp_echoreply_rate, icmp_paramprob_rate and icmp_timeexeed_rate
+---------------------------------------------------------------------------------------
+
+Sets limits for sending ICMP packets to specific targets. A value of zero
+disables all limiting. Any positive value sets the maximum package rate in
+hundredth of a second (on Intel systems).
+
+IP settings
+-----------
+
+ip_autoconfig
+-------------
+
+This file contains the number one if the host received its IP configuration by
+RARP, BOOTP, DHCP or a similar mechanism. Otherwise it is zero.
+
+ip_default_ttl
+--------------
+
+TTL (Time To Live) for IPv4 interfaces. This is simply the maximum number of
+hops a packet may travel.
+
+ip_dynaddr
+----------
+
+Enable dynamic socket address rewriting on interface address change. This is
+useful for dialup interface with changing IP addresses.
+
+ip_forward
+----------
+
+Enable or disable forwarding of IP packages between interfaces. Changing this
+value resets all other parameters to their default values. They differ if the
+kernel is configured as host or router.
+
+ip_local_port_range
+-------------------
+
+Range of ports used by TCP and UDP to choose the local port. Contains two
+numbers, the first number is the lowest port, the second number the highest
+local port. Default is 1024-4999. Should be changed to 32768-61000 for
+high-usage systems.
+
+ip_no_pmtu_disc
+---------------
+
+Global switch to turn path MTU discovery off. It can also be set on a per
+socket basis by the applications or on a per route basis.
+
+ip_masq_debug
+-------------
+
+Enable/disable debugging of IP masquerading.
+
+IP fragmentation settings
+-------------------------
+
+ipfrag_high_trash and ipfrag_low_trash
+--------------------------------------
+
+Maximum memory used to reassemble IP fragments. When ipfrag_high_thresh bytes
+of memory is allocated for this purpose, the fragment handler will toss
+packets until ipfrag_low_thresh is reached.
+
+ipfrag_time
+-----------
+
+Time in seconds to keep an IP fragment in memory.
+
+TCP settings
+------------
+
+tcp_ecn
+-------
+
+This file controls the use of the ECN bit in the IPv4 headers. This is a new
+feature about Explicit Congestion Notification, but some routers and firewalls
+block traffic that has this bit set, so it could be necessary to echo 0 to
+/proc/sys/net/ipv4/tcp_ecn if you want to talk to these sites. For more info
+you could read RFC2481.
+
+tcp_retrans_collapse
+--------------------
+
+Bug-to-bug compatibility with some broken printers. On retransmit, try to send
+larger packets to work around bugs in certain TCP stacks. Can be turned off by
+setting it to zero.
+
+tcp_keepalive_probes
+--------------------
+
+Number of keep alive probes TCP sends out, until it decides that the
+connection is broken.
+
+tcp_keepalive_time
+------------------
+
+How often TCP sends out keep alive messages, when keep alive is enabled. The
+default is 2 hours.
+
+tcp_syn_retries
+---------------
+
+Number of times initial SYNs for a TCP connection attempt will be
+retransmitted. Should not be higher than 255. This is only the timeout for
+outgoing connections, for incoming connections the number of retransmits is
+defined by tcp_retries1.
+
+tcp_sack
+--------
+
+Enable select acknowledgments after RFC2018.
+
+tcp_timestamps
+--------------
+
+Enable timestamps as defined in RFC1323.
+
+tcp_stdurg
+----------
+
+Enable the strict RFC793 interpretation of the TCP urgent pointer field. The
+default is to use the BSD compatible interpretation of the urgent pointer
+pointing to the first byte after the urgent data. The RFC793 interpretation is
+to have it point to the last byte of urgent data. Enabling this option may
+lead to interoperability problems. Disabled by default.
+
+tcp_syncookies
+--------------
+
+Only valid when the kernel was compiled with CONFIG_SYNCOOKIES. Send out
+syncookies when the syn backlog queue of a socket overflows. This is to ward
+off the common 'syn flood attack'. Disabled by default.
+
+Note that the concept of a socket backlog is abandoned. This means the peer
+may not receive reliable error messages from an over loaded server with
+syncookies enabled.
+
+tcp_window_scaling
+------------------
+
+Enable window scaling as defined in RFC1323.
+
+tcp_fin_timeout
+---------------
+
+The length of time in seconds it takes to receive a final FIN before the
+socket is always closed. This is strictly a violation of the TCP
+specification, but required to prevent denial-of-service attacks.
+
+tcp_max_ka_probes
+-----------------
+
+Indicates how many keep alive probes are sent per slow timer run. Should not
+be set too high to prevent bursts.
+
+tcp_max_syn_backlog
+-------------------
+
+Length of the per socket backlog queue. Since Linux 2.2 the backlog specified
+in listen(2) only specifies the length of the backlog queue of already
+established sockets. When more connection requests arrive Linux starts to drop
+packets. When syncookies are enabled the packets are still answered and the
+maximum queue is effectively ignored.
+
+tcp_retries1
+------------
+
+Defines how often an answer to a TCP connection request is retransmitted
+before giving up.
+
+tcp_retries2
+------------
+
+Defines how often a TCP packet is retransmitted before giving up.
+
+Interface specific settings
+---------------------------
+
+In the directory /proc/sys/net/ipv4/conf you'll find one subdirectory for each
+interface the system knows about and one directory calls all. Changes in the
+all subdirectory affect all interfaces, whereas changes in the other
+subdirectories affect only one interface. All directories have the same
+entries:
+
+accept_redirects
+----------------
+
+This switch decides if the kernel accepts ICMP redirect messages or not. The
+default is 'yes' if the kernel is configured for a regular host and 'no' for a
+router configuration.
+
+accept_source_route
+-------------------
+
+Should source routed packages be accepted or declined. The default is
+dependent on the kernel configuration. It's 'yes' for routers and 'no' for
+hosts.
+
+bootp_relay
+~~~~~~~~~~~
+
+Accept packets with source address 0.b.c.d with destinations not to this host
+as local ones. It is supposed that a BOOTP relay daemon will catch and forward
+such packets.
+
+The default is 0, since this feature is not implemented yet (kernel version
+2.2.12).
+
+forwarding
+----------
+
+Enable or disable IP forwarding on this interface.
+
+log_martians
+------------
+
+Log packets with source addresses with no known route to kernel log.
+
+mc_forwarding
+-------------
+
+Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE and a
+multicast routing daemon is required.
+
+proxy_arp
+---------
+
+Does (1) or does not (0) perform proxy ARP.
+
+rp_filter
+---------
+
+Integer value determines if a source validation should be made. 1 means yes, 0
+means no. Disabled by default, but local/broadcast address spoofing is always
+on.
+
+If you set this to 1 on a router that is the only connection for a network to
+the net, it will prevent spoofing attacks against your internal networks
+(external addresses can still be spoofed), without the need for additional
+firewall rules.
+
+secure_redirects
+----------------
+
+Accept ICMP redirect messages only for gateways, listed in default gateway
+list. Enabled by default.
+
+shared_media
+------------
+
+If it is not set the kernel does not assume that different subnets on this
+device can communicate directly. Default setting is 'yes'.
+
+send_redirects
+--------------
+
+Determines whether to send ICMP redirects to other hosts.
+
+Routing settings
+----------------
+
+The directory /proc/sys/net/ipv4/route contains several file to control
+routing issues.
+
+error_burst and error_cost
+--------------------------
+
+These parameters are used to limit how many ICMP destination unreachable to
+send from the host in question. ICMP destination unreachable messages are
+sent when we cannot reach the next hop while trying to transmit a packet.
+It will also print some error messages to kernel logs if someone is ignoring
+our ICMP redirects. The higher the error_cost factor is, the fewer
+destination unreachable and error messages will be let through. Error_burst
+controls when destination unreachable messages and error messages will be
+dropped. The default settings limit warning messages to five every second.
+
+flush
+-----
+
+Writing to this file results in a flush of the routing cache.
+
+gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh
+---------------------------------------------------------------------
+
+Values to control the frequency and behavior of the garbage collection
+algorithm for the routing cache. gc_min_interval is deprecated and replaced
+by gc_min_interval_ms.
+
+
+max_size
+--------
+
+Maximum size of the routing cache. Old entries will be purged once the cache
+reached has this size.
+
+redirect_load, redirect_number
+------------------------------
+
+Factors which determine if more ICPM redirects should be sent to a specific
+host. No redirects will be sent once the load limit or the maximum number of
+redirects has been reached.
+
+redirect_silence
+----------------
+
+Timeout for redirects. After this period redirects will be sent again, even if
+this has been stopped, because the load or number limit has been reached.
+
+Network Neighbor handling
+-------------------------
+
+Settings about how to handle connections with direct neighbors (nodes attached
+to the same link) can be found in the directory /proc/sys/net/ipv4/neigh.
+
+As we saw it in the conf directory, there is a default subdirectory which
+holds the default values, and one directory for each interface. The contents
+of the directories are identical, with the single exception that the default
+settings contain additional options to set garbage collection parameters.
+
+In the interface directories you'll find the following entries:
+
+base_reachable_time, base_reachable_time_ms
+-------------------------------------------
+
+A base value used for computing the random reachable time value as specified
+in RFC2461.
+
+Expression of base_reachable_time, which is deprecated, is in seconds.
+Expression of base_reachable_time_ms is in milliseconds.
+
+retrans_time, retrans_time_ms
+-----------------------------
+
+The time between retransmitted Neighbor Solicitation messages.
+Used for address resolution and to determine if a neighbor is
+unreachable.
+
+Expression of retrans_time, which is deprecated, is in 1/100 seconds (for
+IPv4) or in jiffies (for IPv6).
+Expression of retrans_time_ms is in milliseconds.
+
+unres_qlen
+----------
+
+Maximum queue length for a pending arp request - the number of packets which
+are accepted from other layers while the ARP address is still resolved.
+
+anycast_delay
+-------------
+
+Maximum for random delay of answers to neighbor solicitation messages in
+jiffies (1/100 sec). Not yet implemented (Linux does not have anycast support
+yet).
+
+ucast_solicit
+-------------
+
+Maximum number of retries for unicast solicitation.
+
+mcast_solicit
+-------------
+
+Maximum number of retries for multicast solicitation.
+
+delay_first_probe_time
+----------------------
+
+Delay for the first time probe if the neighbor is reachable. (see
+gc_stale_time)
+
+locktime
+--------
+
+An ARP/neighbor entry is only replaced with a new one if the old is at least
+locktime old. This prevents ARP cache thrashing.
+
+proxy_delay
+-----------
+
+Maximum time (real time is random [0..proxytime]) before answering to an ARP
+request for which we have an proxy ARP entry. In some cases, this is used to
+prevent network flooding.
+
+proxy_qlen
+----------
+
+Maximum queue length of the delayed proxy arp timer. (see proxy_delay).
+
+app_solicit
+----------
+
+Determines the number of requests to send to the user level ARP daemon. Use 0
+to turn off.
+
+gc_stale_time
+-------------
+
+Determines how often to check for stale ARP entries. After an ARP entry is
+stale it will be resolved again (which is useful when an IP address migrates
+to another machine). When ucast_solicit is greater than 0 it first tries to
+send an ARP packet directly to the known host When that fails and
+mcast_solicit is greater than 0, an ARP request is broadcasted.
+
+2.9 Appletalk
+-------------
+
+The /proc/sys/net/appletalk directory holds the Appletalk configuration data
+when Appletalk is loaded. The configurable parameters are:
+
+aarp-expiry-time
+----------------
+
+The amount of time we keep an ARP entry before expiring it. Used to age out
+old hosts.
+
+aarp-resolve-time
+-----------------
+
+The amount of time we will spend trying to resolve an Appletalk address.
+
+aarp-retransmit-limit
+---------------------
+
+The number of times we will retransmit a query before giving up.
+
+aarp-tick-time
+--------------
+
+Controls the rate at which expires are checked.
+
+The directory /proc/net/appletalk holds the list of active Appletalk sockets
+on a machine.
+
+The fields indicate the DDP type, the local address (in network:node format)
+the remote address, the size of the transmit pending queue, the size of the
+received queue (bytes waiting for applications to read) the state and the uid
+owning the socket.
+
+/proc/net/atalk_iface lists all the interfaces configured for appletalk.It
+shows the name of the interface, its Appletalk address, the network range on
+that address (or network number for phase 1 networks), and the status of the
+interface.
+
+/proc/net/atalk_route lists each known network route. It lists the target
+(network) that the route leads to, the router (may be directly connected), the
+route flags, and the device the route is using.
+
+2.10 IPX
+--------
+
+The IPX protocol has no tunable values in proc/sys/net.
+
+The IPX protocol does, however, provide proc/net/ipx. This lists each IPX
+socket giving the local and remote addresses in Novell format (that is
+network:node:port). In accordance with the strange Novell tradition,
+everything but the port is in hex. Not_Connected is displayed for sockets that
+are not tied to a specific remote address. The Tx and Rx queue sizes indicate
+the number of bytes pending for transmission and reception. The state
+indicates the state the socket is in and the uid is the owning uid of the
+socket.
+
+The /proc/net/ipx_interface file lists all IPX interfaces. For each interface
+it gives the network number, the node number, and indicates if the network is
+the primary network. It also indicates which device it is bound to (or
+Internal for internal networks) and the Frame Type if appropriate. Linux
+supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for
+IPX.
+
+The /proc/net/ipx_route table holds a list of IPX routes. For each route it
+gives the destination network, the router node (or Directly) and the network
+address of the router (or Connected) for internal networks.
+
+2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
+----------------------------------------------------------
+
+The "mqueue" filesystem provides the necessary kernel features to enable the
+creation of a user space library that implements the POSIX message queues
+API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
+Interfaces specification.)
+
+The "mqueue" filesystem contains values for determining/setting the amount of
+resources used by the file system.
+
+/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
+maximum number of message queues allowed on the system.
+
+/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
+maximum number of messages in a queue value. In fact it is the limiting value
+for another (user) limit which is set in mq_open invocation. This attribute of
+a queue must be less or equal then msg_max.
+
+/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
+maximum message size value (it is every message queue's attribute set during
+its creation).
+
+2.12 /proc/<pid>/oom_adj - Adjust the oom-killer score
+------------------------------------------------------
+
+This file can be used to adjust the score used to select which processes
+should be killed in an out-of-memory situation. Giving it a high score will
+increase the likelihood of this process being killed by the oom-killer. Valid
+values are in the range -16 to +15, plus the special value -17, which disables
+oom-killing altogether for this process.
+
+2.13 /proc/<pid>/oom_score - Display current oom-killer score
+-------------------------------------------------------------
+
+------------------------------------------------------------------------------
+This file can be used to check the current score used by the oom-killer is for
+any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which
+process should be killed in an out-of-memory situation.
+
+------------------------------------------------------------------------------
+Summary
+------------------------------------------------------------------------------
+Certain aspects of kernel behavior can be modified at runtime, without the
+need to recompile the kernel, or even to reboot the system. The files in the
+/proc/sys tree can not only be read, but also modified. You can use the echo
+command to write value into these files, thereby changing the default settings
+of the kernel.
+------------------------------------------------------------------------------
+
+2.14 /proc/<pid>/io - Display the IO accounting fields
+-------------------------------------------------------
+
+This file contains IO statistics for each running process
+
+Example
+-------
+
+test:/tmp # dd if=/dev/zero of=/tmp/test.dat &
+[1] 3828
+
+test:/tmp # cat /proc/3828/io
+rchar: 323934931
+wchar: 323929600
+syscr: 632687
+syscw: 632675
+read_bytes: 0
+write_bytes: 323932160
+cancelled_write_bytes: 0
+
+
+Description
+-----------
+
+rchar
+-----
+
+I/O counter: chars read
+The number of bytes which this task has caused to be read from storage. This
+is simply the sum of bytes which this process passed to read() and pread().
+It includes things like tty IO and it is unaffected by whether or not actual
+physical disk IO was required (the read might have been satisfied from
+pagecache)
+
+
+wchar
+-----
+
+I/O counter: chars written
+The number of bytes which this task has caused, or shall cause to be written
+to disk. Similar caveats apply here as with rchar.
+
+
+syscr
+-----
+
+I/O counter: read syscalls
+Attempt to count the number of read I/O operations, i.e. syscalls like read()
+and pread().
+
+
+syscw
+-----
+
+I/O counter: write syscalls
+Attempt to count the number of write I/O operations, i.e. syscalls like
+write() and pwrite().
+
+
+read_bytes
+----------
+
+I/O counter: bytes read
+Attempt to count the number of bytes which this process really did cause to
+be fetched from the storage layer. Done at the submit_bio() level, so it is
+accurate for block-backed filesystems. <please add status regarding NFS and
+CIFS at a later time>
+
+
+write_bytes
+-----------
+
+I/O counter: bytes written
+Attempt to count the number of bytes which this process caused to be sent to
+the storage layer. This is done at page-dirtying time.
+
+
+cancelled_write_bytes
+---------------------
+
+The big inaccuracy here is truncate. If a process writes 1MB to a file and
+then deletes the file, it will in fact perform no writeout. But it will have
+been accounted as having caused 1MB of write.
+In other words: The number of bytes which this process caused to not happen,
+by truncating pagecache. A task can cause "negative" IO too. If this task
+truncates some dirty pagecache, some IO which another task has been accounted
+for (in it's write_bytes) will not be happening. We _could_ just subtract that
+from the truncating task's write_bytes, but there is information loss in doing
+that.
+
+
+Note
+----
+
+At its current implementation state, this is a bit racy on 32-bit machines: if
+process A reads process B's /proc/pid/io while process B is updating one of
+those 64-bit counters, process A could see an intermediate result.
+
+
+More information about this can be found within the taskstats documentation in
+Documentation/accounting.
+
+2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
+---------------------------------------------------------------
+When a process is dumped, all anonymous memory is written to a core file as
+long as the size of the core file isn't limited. But sometimes we don't want
+to dump some memory segments, for example, huge shared memory. Conversely,
+sometimes we want to save file-backed memory segments into a core file, not
+only the individual files.
+
+/proc/<pid>/coredump_filter allows you to customize which memory segments
+will be dumped when the <pid> process is dumped. coredump_filter is a bitmask
+of memory types. If a bit of the bitmask is set, memory segments of the
+corresponding memory type are dumped, otherwise they are not dumped.
+
+The following 7 memory types are supported:
+ - (bit 0) anonymous private memory
+ - (bit 1) anonymous shared memory
+ - (bit 2) file-backed private memory
+ - (bit 3) file-backed shared memory
+ - (bit 4) ELF header pages in file-backed private memory areas (it is
+ effective only if the bit 2 is cleared)
+ - (bit 5) hugetlb private memory
+ - (bit 6) hugetlb shared memory
+
+ Note that MMIO pages such as frame buffer are never dumped and vDSO pages
+ are always dumped regardless of the bitmask status.
+
+ Note bit 0-4 doesn't effect any hugetlb memory. hugetlb memory are only
+ effected by bit 5-6.
+
+Default value of coredump_filter is 0x23; this means all anonymous memory
+segments and hugetlb private memory are dumped.
+
+If you don't want to dump all shared memory segments attached to pid 1234,
+write 0x21 to the process's proc file.
+
+ $ echo 0x21 > /proc/1234/coredump_filter
+
+When a new process is created, the process inherits the bitmask status from its
+parent. It is useful to set up coredump_filter before the program runs.
+For example:
+
+ $ echo 0x7 > /proc/self/coredump_filter
+ $ ./some_program
+
+2.16 /proc/<pid>/mountinfo - Information about mounts
+--------------------------------------------------------
+
+This file contains lines of the form:
+
+36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+(1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
+
+(1) mount ID: unique identifier of the mount (may be reused after umount)
+(2) parent ID: ID of parent (or of self for the top of the mount tree)
+(3) major:minor: value of st_dev for files on filesystem
+(4) root: root of the mount within the filesystem
+(5) mount point: mount point relative to the process's root
+(6) mount options: per mount options
+(7) optional fields: zero or more fields of the form "tag[:value]"
+(8) separator: marks the end of the optional fields
+(9) filesystem type: name of filesystem of the form "type[.subtype]"
+(10) mount source: filesystem specific information or "none"
+(11) super options: per super block options
+
+Parsers should ignore all unrecognised optional fields. Currently the
+possible optional fields are:
+
+shared:X mount is shared in peer group X
+master:X mount is slave to peer group X
+propagate_from:X mount is slave and receives propagation from peer group X (*)
+unbindable mount is unbindable
+
+(*) X is the closest dominant peer group under the process's root. If
+X is the immediate master of the mount, or if there's no dominant peer
+group under the same root, then only the "master:X" field is present
+and not the "propagate_from:X" field.
+
+For more information on mount propagation see:
+
+ Documentation/filesystems/sharedsubtree.txt
+
+2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
+--------------------------------------------------------
+
+This directory contains configuration options for the epoll(7) interface.
+
+max_user_instances
+------------------
+
+This is the maximum number of epoll file descriptors that a single user can
+have open at a given time. The default value is 128, and should be enough
+for normal users.
+
+max_user_watches
+----------------
+
+Every epoll file descriptor can store a number of files to be monitored
+for event readiness. Each one of these monitored files constitutes a "watch".
+This configuration option sets the maximum number of "watches" that are
+allowed for each user.
+Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
+on a 64bit one.
+The current default value for max_user_watches is the 1/32 of the available
+low memory, divided for the "watch" cost in bytes.
+
+
+------------------------------------------------------------------------------
+
diff --git a/Documentation/filesystems/quota.txt b/Documentation/filesystems/quota.txt
new file mode 100644
index 0000000..5e8de25
--- /dev/null
+++ b/Documentation/filesystems/quota.txt
@@ -0,0 +1,65 @@
+
+Quota subsystem
+===============
+
+Quota subsystem allows system administrator to set limits on used space and
+number of used inodes (inode is a filesystem structure which is associated with
+each file or directory) for users and/or groups. For both used space and number
+of used inodes there are actually two limits. The first one is called softlimit
+and the second one hardlimit. An user can never exceed a hardlimit for any
+resource (unless he has CAP_SYS_RESOURCE capability). User is allowed to exceed
+softlimit but only for limited period of time. This period is called "grace
+period" or "grace time". When grace time is over, user is not able to allocate
+more space/inodes until he frees enough of them to get below softlimit.
+
+Quota limits (and amount of grace time) are set independently for each
+filesystem.
+
+For more details about quota design, see the documentation in quota-tools package
+(http://sourceforge.net/projects/linuxquota).
+
+Quota netlink interface
+=======================
+When user exceeds a softlimit, runs out of grace time or reaches hardlimit,
+quota subsystem traditionally printed a message to the controlling terminal of
+the process which caused the excess. This method has the disadvantage that
+when user is using a graphical desktop he usually cannot see the message.
+Thus quota netlink interface has been designed to pass information about
+the above events to userspace. There they can be captured by an application
+and processed accordingly.
+
+The interface uses generic netlink framework (see
+http://lwn.net/Articles/208755/ and http://people.suug.ch/~tgr/libnl/ for more
+details about this layer). The name of the quota generic netlink interface
+is "VFS_DQUOT". Definitions of constants below are in <linux/quota.h>.
+ Currently, the interface supports only one message type QUOTA_NL_C_WARNING.
+This command is used to send a notification about any of the above mentioned
+events. Each message has six attributes. These are (type of the argument is
+in parentheses):
+ QUOTA_NL_A_QTYPE (u32)
+ - type of quota being exceeded (one of USRQUOTA, GRPQUOTA)
+ QUOTA_NL_A_EXCESS_ID (u64)
+ - UID/GID (depends on quota type) of user / group whose limit
+ is being exceeded.
+ QUOTA_NL_A_CAUSED_ID (u64)
+ - UID of a user who caused the event
+ QUOTA_NL_A_WARNING (u32)
+ - what kind of limit is exceeded:
+ QUOTA_NL_IHARDWARN - inode hardlimit
+ QUOTA_NL_ISOFTLONGWARN - inode softlimit is exceeded longer
+ than given grace period
+ QUOTA_NL_ISOFTWARN - inode softlimit
+ QUOTA_NL_BHARDWARN - space (block) hardlimit
+ QUOTA_NL_BSOFTLONGWARN - space (block) softlimit is exceeded
+ longer than given grace period.
+ QUOTA_NL_BSOFTWARN - space (block) softlimit
+ - four warnings are also defined for the event when user stops
+ exceeding some limit:
+ QUOTA_NL_IHARDBELOW - inode hardlimit
+ QUOTA_NL_ISOFTBELOW - inode softlimit
+ QUOTA_NL_BHARDBELOW - space (block) hardlimit
+ QUOTA_NL_BSOFTBELOW - space (block) softlimit
+ QUOTA_NL_A_DEV_MAJOR (u32)
+ - major number of a device with the affected filesystem
+ QUOTA_NL_A_DEV_MINOR (u32)
+ - minor number of a device with the affected filesystem
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
new file mode 100644
index 0000000..a8273d5
--- /dev/null
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -0,0 +1,355 @@
+ramfs, rootfs and initramfs
+October 17, 2005
+Rob Landley <rob@landley.net>
+=============================
+
+What is ramfs?
+--------------
+
+Ramfs is a very simple filesystem that exports Linux's disk caching
+mechanisms (the page cache and dentry cache) as a dynamically resizable
+RAM-based filesystem.
+
+Normally all files are cached in memory by Linux. Pages of data read from
+backing store (usually the block device the filesystem is mounted on) are kept
+around in case it's needed again, but marked as clean (freeable) in case the
+Virtual Memory system needs the memory for something else. Similarly, data
+written to files is marked clean as soon as it has been written to backing
+store, but kept around for caching purposes until the VM reallocates the
+memory. A similar mechanism (the dentry cache) greatly speeds up access to
+directories.
+
+With ramfs, there is no backing store. Files written into ramfs allocate
+dentries and page cache as usual, but there's nowhere to write them to.
+This means the pages are never marked clean, so they can't be freed by the
+VM when it's looking to recycle memory.
+
+The amount of code required to implement ramfs is tiny, because all the
+work is done by the existing Linux caching infrastructure. Basically,
+you're mounting the disk cache as a filesystem. Because of this, ramfs is not
+an optional component removable via menuconfig, since there would be negligible
+space savings.
+
+ramfs and ramdisk:
+------------------
+
+The older "ram disk" mechanism created a synthetic block device out of
+an area of RAM and used it as backing store for a filesystem. This block
+device was of fixed size, so the filesystem mounted on it was of fixed
+size. Using a ram disk also required unnecessarily copying memory from the
+fake block device into the page cache (and copying changes back out), as well
+as creating and destroying dentries. Plus it needed a filesystem driver
+(such as ext2) to format and interpret this data.
+
+Compared to ramfs, this wastes memory (and memory bus bandwidth), creates
+unnecessary work for the CPU, and pollutes the CPU caches. (There are tricks
+to avoid this copying by playing with the page tables, but they're unpleasantly
+complicated and turn out to be about as expensive as the copying anyway.)
+More to the point, all the work ramfs is doing has to happen _anyway_,
+since all file access goes through the page and dentry caches. The RAM
+disk is simply unnecessary; ramfs is internally much simpler.
+
+Another reason ramdisks are semi-obsolete is that the introduction of
+loopback devices offered a more flexible and convenient way to create
+synthetic block devices, now from files instead of from chunks of memory.
+See losetup (8) for details.
+
+ramfs and tmpfs:
+----------------
+
+One downside of ramfs is you can keep writing data into it until you fill
+up all memory, and the VM can't free it because the VM thinks that files
+should get written to backing store (rather than swap space), but ramfs hasn't
+got any backing store. Because of this, only root (or a trusted user) should
+be allowed write access to a ramfs mount.
+
+A ramfs derivative called tmpfs was created to add size limits, and the ability
+to write the data to swap space. Normal users can be allowed write access to
+tmpfs mounts. See Documentation/filesystems/tmpfs.txt for more information.
+
+What is rootfs?
+---------------
+
+Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is
+always present in 2.6 systems. You can't unmount rootfs for approximately the
+same reason you can't kill the init process; rather than having special code
+to check for and handle an empty list, it's smaller and simpler for the kernel
+to just make sure certain lists can't become empty.
+
+Most systems just mount another filesystem over rootfs and ignore it. The
+amount of space an empty instance of ramfs takes up is tiny.
+
+What is initramfs?
+------------------
+
+All 2.6 Linux kernels contain a gzipped "cpio" format archive, which is
+extracted into rootfs when the kernel boots up. After extracting, the kernel
+checks to see if rootfs contains a file "init", and if so it executes it as PID
+1. If found, this init process is responsible for bringing the system the
+rest of the way up, including locating and mounting the real root device (if
+any). If rootfs does not contain an init program after the embedded cpio
+archive is extracted into it, the kernel will fall through to the older code
+to locate and mount a root partition, then exec some variant of /sbin/init
+out of that.
+
+All this differs from the old initrd in several ways:
+
+ - The old initrd was always a separate file, while the initramfs archive is
+ linked into the linux kernel image. (The directory linux-*/usr is devoted
+ to generating this archive during the build.)
+
+ - The old initrd file was a gzipped filesystem image (in some file format,
+ such as ext2, that needed a driver built into the kernel), while the new
+ initramfs archive is a gzipped cpio archive (like tar only simpler,
+ see cpio(1) and Documentation/early-userspace/buffer-format.txt). The
+ kernel's cpio extraction code is not only extremely small, it's also
+ __init text and data that can be discarded during the boot process.
+
+ - The program run by the old initrd (which was called /initrd, not /init) did
+ some setup and then returned to the kernel, while the init program from
+ initramfs is not expected to return to the kernel. (If /init needs to hand
+ off control it can overmount / with a new root device and exec another init
+ program. See the switch_root utility, below.)
+
+ - When switching another root device, initrd would pivot_root and then
+ umount the ramdisk. But initramfs is rootfs: you can neither pivot_root
+ rootfs, nor unmount it. Instead delete everything out of rootfs to
+ free up the space (find -xdev / -exec rm '{}' ';'), overmount rootfs
+ with the new root (cd /newmount; mount --move . /; chroot .), attach
+ stdin/stdout/stderr to the new /dev/console, and exec the new init.
+
+ Since this is a remarkably persnickety process (and involves deleting
+ commands before you can run them), the klibc package introduced a helper
+ program (utils/run_init.c) to do all this for you. Most other packages
+ (such as busybox) have named this command "switch_root".
+
+Populating initramfs:
+---------------------
+
+The 2.6 kernel build process always creates a gzipped cpio format initramfs
+archive and links it into the resulting kernel binary. By default, this
+archive is empty (consuming 134 bytes on x86).
+
+The config option CONFIG_INITRAMFS_SOURCE (in General Setup in menuconfig,
+and living in usr/Kconfig) can be used to specify a source for the
+initramfs archive, which will automatically be incorporated into the
+resulting binary. This option can point to an existing gzipped cpio
+archive, a directory containing files to be archived, or a text file
+specification such as the following example:
+
+ dir /dev 755 0 0
+ nod /dev/console 644 0 0 c 5 1
+ nod /dev/loop0 644 0 0 b 7 0
+ dir /bin 755 1000 1000
+ slink /bin/sh busybox 777 0 0
+ file /bin/busybox initramfs/busybox 755 0 0
+ dir /proc 755 0 0
+ dir /sys 755 0 0
+ dir /mnt 755 0 0
+ file /init initramfs/init.sh 755 0 0
+
+Run "usr/gen_init_cpio" (after the kernel build) to get a usage message
+documenting the above file format.
+
+One advantage of the configuration file is that root access is not required to
+set permissions or create device nodes in the new archive. (Note that those
+two example "file" entries expect to find files named "init.sh" and "busybox" in
+a directory called "initramfs", under the linux-2.6.* directory. See
+Documentation/early-userspace/README for more details.)
+
+The kernel does not depend on external cpio tools. If you specify a
+directory instead of a configuration file, the kernel's build infrastructure
+creates a configuration file from that directory (usr/Makefile calls
+scripts/gen_initramfs_list.sh), and proceeds to package up that directory
+using the config file (by feeding it to usr/gen_init_cpio, which is created
+from usr/gen_init_cpio.c). The kernel's build-time cpio creation code is
+entirely self-contained, and the kernel's boot-time extractor is also
+(obviously) self-contained.
+
+The one thing you might need external cpio utilities installed for is creating
+or extracting your own preprepared cpio files to feed to the kernel build
+(instead of a config file or directory).
+
+The following command line can extract a cpio image (either by the above script
+or by the kernel build) back into its component files:
+
+ cpio -i -d -H newc -F initramfs_data.cpio --no-absolute-filenames
+
+The following shell script can create a prebuilt cpio archive you can
+use in place of the above config file:
+
+ #!/bin/sh
+
+ # Copyright 2006 Rob Landley <rob@landley.net> and TimeSys Corporation.
+ # Licensed under GPL version 2
+
+ if [ $# -ne 2 ]
+ then
+ echo "usage: mkinitramfs directory imagename.cpio.gz"
+ exit 1
+ fi
+
+ if [ -d "$1" ]
+ then
+ echo "creating $2 from $1"
+ (cd "$1"; find . | cpio -o -H newc | gzip) > "$2"
+ else
+ echo "First argument must be a directory"
+ exit 1
+ fi
+
+Note: The cpio man page contains some bad advice that will break your initramfs
+archive if you follow it. It says "A typical way to generate the list
+of filenames is with the find command; you should give find the -depth option
+to minimize problems with permissions on directories that are unwritable or not
+searchable." Don't do this when creating initramfs.cpio.gz images, it won't
+work. The Linux kernel cpio extractor won't create files in a directory that
+doesn't exist, so the directory entries must go before the files that go in
+those directories. The above script gets them in the right order.
+
+External initramfs images:
+--------------------------
+
+If the kernel has initrd support enabled, an external cpio.gz archive can also
+be passed into a 2.6 kernel in place of an initrd. In this case, the kernel
+will autodetect the type (initramfs, not initrd) and extract the external cpio
+archive into rootfs before trying to run /init.
+
+This has the memory efficiency advantages of initramfs (no ramdisk block
+device) but the separate packaging of initrd (which is nice if you have
+non-GPL code you'd like to run from initramfs, without conflating it with
+the GPL licensed Linux kernel binary).
+
+It can also be used to supplement the kernel's built-in initramfs image. The
+files in the external archive will overwrite any conflicting files in
+the built-in initramfs archive. Some distributors also prefer to customize
+a single kernel image with task-specific initramfs images, without recompiling.
+
+Contents of initramfs:
+----------------------
+
+An initramfs archive is a complete self-contained root filesystem for Linux.
+If you don't already understand what shared libraries, devices, and paths
+you need to get a minimal root filesystem up and running, here are some
+references:
+http://www.tldp.org/HOWTO/Bootdisk-HOWTO/
+http://www.tldp.org/HOWTO/From-PowerUp-To-Bash-Prompt-HOWTO.html
+http://www.linuxfromscratch.org/lfs/view/stable/
+
+The "klibc" package (http://www.kernel.org/pub/linux/libs/klibc) is
+designed to be a tiny C library to statically link early userspace
+code against, along with some related utilities. It is BSD licensed.
+
+I use uClibc (http://www.uclibc.org) and busybox (http://www.busybox.net)
+myself. These are LGPL and GPL, respectively. (A self-contained initramfs
+package is planned for the busybox 1.3 release.)
+
+In theory you could use glibc, but that's not well suited for small embedded
+uses like this. (A "hello world" program statically linked against glibc is
+over 400k. With uClibc it's 7k. Also note that glibc dlopens libnss to do
+name lookups, even when otherwise statically linked.)
+
+A good first step is to get initramfs to run a statically linked "hello world"
+program as init, and test it under an emulator like qemu (www.qemu.org) or
+User Mode Linux, like so:
+
+ cat > hello.c << EOF
+ #include <stdio.h>
+ #include <unistd.h>
+
+ int main(int argc, char *argv[])
+ {
+ printf("Hello world!\n");
+ sleep(999999999);
+ }
+ EOF
+ gcc -static hello.c -o init
+ echo init | cpio -o -H newc | gzip > test.cpio.gz
+ # Testing external initramfs using the initrd loading mechanism.
+ qemu -kernel /boot/vmlinuz -initrd test.cpio.gz /dev/zero
+
+When debugging a normal root filesystem, it's nice to be able to boot with
+"init=/bin/sh". The initramfs equivalent is "rdinit=/bin/sh", and it's
+just as useful.
+
+Why cpio rather than tar?
+-------------------------
+
+This decision was made back in December, 2001. The discussion started here:
+
+ http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1538.html
+
+And spawned a second thread (specifically on tar vs cpio), starting here:
+
+ http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1587.html
+
+The quick and dirty summary version (which is no substitute for reading
+the above threads) is:
+
+1) cpio is a standard. It's decades old (from the AT&T days), and already
+ widely used on Linux (inside RPM, Red Hat's device driver disks). Here's
+ a Linux Journal article about it from 1996:
+
+ http://www.linuxjournal.com/article/1213
+
+ It's not as popular as tar because the traditional cpio command line tools
+ require _truly_hideous_ command line arguments. But that says nothing
+ either way about the archive format, and there are alternative tools,
+ such as:
+
+ http://freshmeat.net/projects/afio/
+
+2) The cpio archive format chosen by the kernel is simpler and cleaner (and
+ thus easier to create and parse) than any of the (literally dozens of)
+ various tar archive formats. The complete initramfs archive format is
+ explained in buffer-format.txt, created in usr/gen_init_cpio.c, and
+ extracted in init/initramfs.c. All three together come to less than 26k
+ total of human-readable text.
+
+3) The GNU project standardizing on tar is approximately as relevant as
+ Windows standardizing on zip. Linux is not part of either, and is free
+ to make its own technical decisions.
+
+4) Since this is a kernel internal format, it could easily have been
+ something brand new. The kernel provides its own tools to create and
+ extract this format anyway. Using an existing standard was preferable,
+ but not essential.
+
+5) Al Viro made the decision (quote: "tar is ugly as hell and not going to be
+ supported on the kernel side"):
+
+ http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1540.html
+
+ explained his reasoning:
+
+ http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1550.html
+ http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1638.html
+
+ and, most importantly, designed and implemented the initramfs code.
+
+Future directions:
+------------------
+
+Today (2.6.16), initramfs is always compiled in, but not always used. The
+kernel falls back to legacy boot code that is reached only if initramfs does
+not contain an /init program. The fallback is legacy code, there to ensure a
+smooth transition and allowing early boot functionality to gradually move to
+"early userspace" (I.E. initramfs).
+
+The move to early userspace is necessary because finding and mounting the real
+root device is complex. Root partitions can span multiple devices (raid or
+separate journal). They can be out on the network (requiring dhcp, setting a
+specific MAC address, logging into a server, etc). They can live on removable
+media, with dynamically allocated major/minor numbers and persistent naming
+issues requiring a full udev implementation to sort out. They can be
+compressed, encrypted, copy-on-write, loopback mounted, strangely partitioned,
+and so on.
+
+This kind of complexity (which inevitably includes policy) is rightly handled
+in userspace. Both klibc and busybox/uClibc are working on simple initramfs
+packages to drop into a kernel build.
+
+The klibc package has now been accepted into Andrew Morton's 2.6.17-mm tree.
+The kernel's current early boot code (partition detection, etc) will probably
+be migrated into a default initramfs, automatically created and used by the
+kernel build.
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt
new file mode 100644
index 0000000..510b722
--- /dev/null
+++ b/Documentation/filesystems/relay.txt
@@ -0,0 +1,494 @@
+relay interface (formerly relayfs)
+==================================
+
+The relay interface provides a means for kernel applications to
+efficiently log and transfer large quantities of data from the kernel
+to userspace via user-defined 'relay channels'.
+
+A 'relay channel' is a kernel->user data relay mechanism implemented
+as a set of per-cpu kernel buffers ('channel buffers'), each
+represented as a regular file ('relay file') in user space. Kernel
+clients write into the channel buffers using efficient write
+functions; these automatically log into the current cpu's channel
+buffer. User space applications mmap() or read() from the relay files
+and retrieve the data as it becomes available. The relay files
+themselves are files created in a host filesystem, e.g. debugfs, and
+are associated with the channel buffers using the API described below.
+
+The format of the data logged into the channel buffers is completely
+up to the kernel client; the relay interface does however provide
+hooks which allow kernel clients to impose some structure on the
+buffer data. The relay interface doesn't implement any form of data
+filtering - this also is left to the kernel client. The purpose is to
+keep things as simple as possible.
+
+This document provides an overview of the relay interface API. The
+details of the function parameters are documented along with the
+functions in the relay interface code - please see that for details.
+
+Semantics
+=========
+
+Each relay channel has one buffer per CPU, each buffer has one or more
+sub-buffers. Messages are written to the first sub-buffer until it is
+too full to contain a new message, in which case it it is written to
+the next (if available). Messages are never split across sub-buffers.
+At this point, userspace can be notified so it empties the first
+sub-buffer, while the kernel continues writing to the next.
+
+When notified that a sub-buffer is full, the kernel knows how many
+bytes of it are padding i.e. unused space occurring because a complete
+message couldn't fit into a sub-buffer. Userspace can use this
+knowledge to copy only valid data.
+
+After copying it, userspace can notify the kernel that a sub-buffer
+has been consumed.
+
+A relay channel can operate in a mode where it will overwrite data not
+yet collected by userspace, and not wait for it to be consumed.
+
+The relay channel itself does not provide for communication of such
+data between userspace and kernel, allowing the kernel side to remain
+simple and not impose a single interface on userspace. It does
+provide a set of examples and a separate helper though, described
+below.
+
+The read() interface both removes padding and internally consumes the
+read sub-buffers; thus in cases where read(2) is being used to drain
+the channel buffers, special-purpose communication between kernel and
+user isn't necessary for basic operation.
+
+One of the major goals of the relay interface is to provide a low
+overhead mechanism for conveying kernel data to userspace. While the
+read() interface is easy to use, it's not as efficient as the mmap()
+approach; the example code attempts to make the tradeoff between the
+two approaches as small as possible.
+
+klog and relay-apps example code
+================================
+
+The relay interface itself is ready to use, but to make things easier,
+a couple simple utility functions and a set of examples are provided.
+
+The relay-apps example tarball, available on the relay sourceforge
+site, contains a set of self-contained examples, each consisting of a
+pair of .c files containing boilerplate code for each of the user and
+kernel sides of a relay application. When combined these two sets of
+boilerplate code provide glue to easily stream data to disk, without
+having to bother with mundane housekeeping chores.
+
+The 'klog debugging functions' patch (klog.patch in the relay-apps
+tarball) provides a couple of high-level logging functions to the
+kernel which allow writing formatted text or raw data to a channel,
+regardless of whether a channel to write into exists or not, or even
+whether the relay interface is compiled into the kernel or not. These
+functions allow you to put unconditional 'trace' statements anywhere
+in the kernel or kernel modules; only when there is a 'klog handler'
+registered will data actually be logged (see the klog and kleak
+examples for details).
+
+It is of course possible to use the relay interface from scratch,
+i.e. without using any of the relay-apps example code or klog, but
+you'll have to implement communication between userspace and kernel,
+allowing both to convey the state of buffers (full, empty, amount of
+padding). The read() interface both removes padding and internally
+consumes the read sub-buffers; thus in cases where read(2) is being
+used to drain the channel buffers, special-purpose communication
+between kernel and user isn't necessary for basic operation. Things
+such as buffer-full conditions would still need to be communicated via
+some channel though.
+
+klog and the relay-apps examples can be found in the relay-apps
+tarball on http://relayfs.sourceforge.net
+
+The relay interface user space API
+==================================
+
+The relay interface implements basic file operations for user space
+access to relay channel buffer data. Here are the file operations
+that are available and some comments regarding their behavior:
+
+open() enables user to open an _existing_ channel buffer.
+
+mmap() results in channel buffer being mapped into the caller's
+ memory space. Note that you can't do a partial mmap - you
+ must map the entire file, which is NRBUF * SUBBUFSIZE.
+
+read() read the contents of a channel buffer. The bytes read are
+ 'consumed' by the reader, i.e. they won't be available
+ again to subsequent reads. If the channel is being used
+ in no-overwrite mode (the default), it can be read at any
+ time even if there's an active kernel writer. If the
+ channel is being used in overwrite mode and there are
+ active channel writers, results may be unpredictable -
+ users should make sure that all logging to the channel has
+ ended before using read() with overwrite mode. Sub-buffer
+ padding is automatically removed and will not be seen by
+ the reader.
+
+sendfile() transfer data from a channel buffer to an output file
+ descriptor. Sub-buffer padding is automatically removed
+ and will not be seen by the reader.
+
+poll() POLLIN/POLLRDNORM/POLLERR supported. User applications are
+ notified when sub-buffer boundaries are crossed.
+
+close() decrements the channel buffer's refcount. When the refcount
+ reaches 0, i.e. when no process or kernel client has the
+ buffer open, the channel buffer is freed.
+
+In order for a user application to make use of relay files, the
+host filesystem must be mounted. For example,
+
+ mount -t debugfs debugfs /sys/kernel/debug
+
+NOTE: the host filesystem doesn't need to be mounted for kernel
+ clients to create or use channels - it only needs to be
+ mounted when user space applications need access to the buffer
+ data.
+
+
+The relay interface kernel API
+==============================
+
+Here's a summary of the API the relay interface provides to in-kernel clients:
+
+TBD(curr. line MT:/API/)
+ channel management functions:
+
+ relay_open(base_filename, parent, subbuf_size, n_subbufs,
+ callbacks, private_data)
+ relay_close(chan)
+ relay_flush(chan)
+ relay_reset(chan)
+
+ channel management typically called on instigation of userspace:
+
+ relay_subbufs_consumed(chan, cpu, subbufs_consumed)
+
+ write functions:
+
+ relay_write(chan, data, length)
+ __relay_write(chan, data, length)
+ relay_reserve(chan, length)
+
+ callbacks:
+
+ subbuf_start(buf, subbuf, prev_subbuf, prev_padding)
+ buf_mapped(buf, filp)
+ buf_unmapped(buf, filp)
+ create_buf_file(filename, parent, mode, buf, is_global)
+ remove_buf_file(dentry)
+
+ helper functions:
+
+ relay_buf_full(buf)
+ subbuf_start_reserve(buf, length)
+
+
+Creating a channel
+------------------
+
+relay_open() is used to create a channel, along with its per-cpu
+channel buffers. Each channel buffer will have an associated file
+created for it in the host filesystem, which can be and mmapped or
+read from in user space. The files are named basename0...basenameN-1
+where N is the number of online cpus, and by default will be created
+in the root of the filesystem (if the parent param is NULL). If you
+want a directory structure to contain your relay files, you should
+create it using the host filesystem's directory creation function,
+e.g. debugfs_create_dir(), and pass the parent directory to
+relay_open(). Users are responsible for cleaning up any directory
+structure they create, when the channel is closed - again the host
+filesystem's directory removal functions should be used for that,
+e.g. debugfs_remove().
+
+In order for a channel to be created and the host filesystem's files
+associated with its channel buffers, the user must provide definitions
+for two callback functions, create_buf_file() and remove_buf_file().
+create_buf_file() is called once for each per-cpu buffer from
+relay_open() and allows the user to create the file which will be used
+to represent the corresponding channel buffer. The callback should
+return the dentry of the file created to represent the channel buffer.
+remove_buf_file() must also be defined; it's responsible for deleting
+the file(s) created in create_buf_file() and is called during
+relay_close().
+
+Here are some typical definitions for these callbacks, in this case
+using debugfs:
+
+/*
+ * create_buf_file() callback. Creates relay file in debugfs.
+ */
+static struct dentry *create_buf_file_handler(const char *filename,
+ struct dentry *parent,
+ int mode,
+ struct rchan_buf *buf,
+ int *is_global)
+{
+ return debugfs_create_file(filename, mode, parent, buf,
+ &relay_file_operations);
+}
+
+/*
+ * remove_buf_file() callback. Removes relay file from debugfs.
+ */
+static int remove_buf_file_handler(struct dentry *dentry)
+{
+ debugfs_remove(dentry);
+
+ return 0;
+}
+
+/*
+ * relay interface callbacks
+ */
+static struct rchan_callbacks relay_callbacks =
+{
+ .create_buf_file = create_buf_file_handler,
+ .remove_buf_file = remove_buf_file_handler,
+};
+
+And an example relay_open() invocation using them:
+
+ chan = relay_open("cpu", NULL, SUBBUF_SIZE, N_SUBBUFS, &relay_callbacks, NULL);
+
+If the create_buf_file() callback fails, or isn't defined, channel
+creation and thus relay_open() will fail.
+
+The total size of each per-cpu buffer is calculated by multiplying the
+number of sub-buffers by the sub-buffer size passed into relay_open().
+The idea behind sub-buffers is that they're basically an extension of
+double-buffering to N buffers, and they also allow applications to
+easily implement random-access-on-buffer-boundary schemes, which can
+be important for some high-volume applications. The number and size
+of sub-buffers is completely dependent on the application and even for
+the same application, different conditions will warrant different
+values for these parameters at different times. Typically, the right
+values to use are best decided after some experimentation; in general,
+though, it's safe to assume that having only 1 sub-buffer is a bad
+idea - you're guaranteed to either overwrite data or lose events
+depending on the channel mode being used.
+
+The create_buf_file() implementation can also be defined in such a way
+as to allow the creation of a single 'global' buffer instead of the
+default per-cpu set. This can be useful for applications interested
+mainly in seeing the relative ordering of system-wide events without
+the need to bother with saving explicit timestamps for the purpose of
+merging/sorting per-cpu files in a postprocessing step.
+
+To have relay_open() create a global buffer, the create_buf_file()
+implementation should set the value of the is_global outparam to a
+non-zero value in addition to creating the file that will be used to
+represent the single buffer. In the case of a global buffer,
+create_buf_file() and remove_buf_file() will be called only once. The
+normal channel-writing functions, e.g. relay_write(), can still be
+used - writes from any cpu will transparently end up in the global
+buffer - but since it is a global buffer, callers should make sure
+they use the proper locking for such a buffer, either by wrapping
+writes in a spinlock, or by copying a write function from relay.h and
+creating a local version that internally does the proper locking.
+
+The private_data passed into relay_open() allows clients to associate
+user-defined data with a channel, and is immediately available
+(including in create_buf_file()) via chan->private_data or
+buf->chan->private_data.
+
+Buffer-only channels
+--------------------
+
+These channels have no files associated and can be created with
+relay_open(NULL, NULL, ...). Such channels are useful in scenarios such
+as when doing early tracing in the kernel, before the VFS is up. In these
+cases, one may open a buffer-only channel and then call
+relay_late_setup_files() when the kernel is ready to handle files,
+to expose the buffered data to the userspace.
+
+Channel 'modes'
+---------------
+
+relay channels can be used in either of two modes - 'overwrite' or
+'no-overwrite'. The mode is entirely determined by the implementation
+of the subbuf_start() callback, as described below. The default if no
+subbuf_start() callback is defined is 'no-overwrite' mode. If the
+default mode suits your needs, and you plan to use the read()
+interface to retrieve channel data, you can ignore the details of this
+section, as it pertains mainly to mmap() implementations.
+
+In 'overwrite' mode, also known as 'flight recorder' mode, writes
+continuously cycle around the buffer and will never fail, but will
+unconditionally overwrite old data regardless of whether it's actually
+been consumed. In no-overwrite mode, writes will fail, i.e. data will
+be lost, if the number of unconsumed sub-buffers equals the total
+number of sub-buffers in the channel. It should be clear that if
+there is no consumer or if the consumer can't consume sub-buffers fast
+enough, data will be lost in either case; the only difference is
+whether data is lost from the beginning or the end of a buffer.
+
+As explained above, a relay channel is made of up one or more
+per-cpu channel buffers, each implemented as a circular buffer
+subdivided into one or more sub-buffers. Messages are written into
+the current sub-buffer of the channel's current per-cpu buffer via the
+write functions described below. Whenever a message can't fit into
+the current sub-buffer, because there's no room left for it, the
+client is notified via the subbuf_start() callback that a switch to a
+new sub-buffer is about to occur. The client uses this callback to 1)
+initialize the next sub-buffer if appropriate 2) finalize the previous
+sub-buffer if appropriate and 3) return a boolean value indicating
+whether or not to actually move on to the next sub-buffer.
+
+To implement 'no-overwrite' mode, the userspace client would provide
+an implementation of the subbuf_start() callback something like the
+following:
+
+static int subbuf_start(struct rchan_buf *buf,
+ void *subbuf,
+ void *prev_subbuf,
+ unsigned int prev_padding)
+{
+ if (prev_subbuf)
+ *((unsigned *)prev_subbuf) = prev_padding;
+
+ if (relay_buf_full(buf))
+ return 0;
+
+ subbuf_start_reserve(buf, sizeof(unsigned int));
+
+ return 1;
+}
+
+If the current buffer is full, i.e. all sub-buffers remain unconsumed,
+the callback returns 0 to indicate that the buffer switch should not
+occur yet, i.e. until the consumer has had a chance to read the
+current set of ready sub-buffers. For the relay_buf_full() function
+to make sense, the consumer is responsible for notifying the relay
+interface when sub-buffers have been consumed via
+relay_subbufs_consumed(). Any subsequent attempts to write into the
+buffer will again invoke the subbuf_start() callback with the same
+parameters; only when the consumer has consumed one or more of the
+ready sub-buffers will relay_buf_full() return 0, in which case the
+buffer switch can continue.
+
+The implementation of the subbuf_start() callback for 'overwrite' mode
+would be very similar:
+
+static int subbuf_start(struct rchan_buf *buf,
+ void *subbuf,
+ void *prev_subbuf,
+ unsigned int prev_padding)
+{
+ if (prev_subbuf)
+ *((unsigned *)prev_subbuf) = prev_padding;
+
+ subbuf_start_reserve(buf, sizeof(unsigned int));
+
+ return 1;
+}
+
+In this case, the relay_buf_full() check is meaningless and the
+callback always returns 1, causing the buffer switch to occur
+unconditionally. It's also meaningless for the client to use the
+relay_subbufs_consumed() function in this mode, as it's never
+consulted.
+
+The default subbuf_start() implementation, used if the client doesn't
+define any callbacks, or doesn't define the subbuf_start() callback,
+implements the simplest possible 'no-overwrite' mode, i.e. it does
+nothing but return 0.
+
+Header information can be reserved at the beginning of each sub-buffer
+by calling the subbuf_start_reserve() helper function from within the
+subbuf_start() callback. This reserved area can be used to store
+whatever information the client wants. In the example above, room is
+reserved in each sub-buffer to store the padding count for that
+sub-buffer. This is filled in for the previous sub-buffer in the
+subbuf_start() implementation; the padding value for the previous
+sub-buffer is passed into the subbuf_start() callback along with a
+pointer to the previous sub-buffer, since the padding value isn't
+known until a sub-buffer is filled. The subbuf_start() callback is
+also called for the first sub-buffer when the channel is opened, to
+give the client a chance to reserve space in it. In this case the
+previous sub-buffer pointer passed into the callback will be NULL, so
+the client should check the value of the prev_subbuf pointer before
+writing into the previous sub-buffer.
+
+Writing to a channel
+--------------------
+
+Kernel clients write data into the current cpu's channel buffer using
+relay_write() or __relay_write(). relay_write() is the main logging
+function - it uses local_irqsave() to protect the buffer and should be
+used if you might be logging from interrupt context. If you know
+you'll never be logging from interrupt context, you can use
+__relay_write(), which only disables preemption. These functions
+don't return a value, so you can't determine whether or not they
+failed - the assumption is that you wouldn't want to check a return
+value in the fast logging path anyway, and that they'll always succeed
+unless the buffer is full and no-overwrite mode is being used, in
+which case you can detect a failed write in the subbuf_start()
+callback by calling the relay_buf_full() helper function.
+
+relay_reserve() is used to reserve a slot in a channel buffer which
+can be written to later. This would typically be used in applications
+that need to write directly into a channel buffer without having to
+stage data in a temporary buffer beforehand. Because the actual write
+may not happen immediately after the slot is reserved, applications
+using relay_reserve() can keep a count of the number of bytes actually
+written, either in space reserved in the sub-buffers themselves or as
+a separate array. See the 'reserve' example in the relay-apps tarball
+at http://relayfs.sourceforge.net for an example of how this can be
+done. Because the write is under control of the client and is
+separated from the reserve, relay_reserve() doesn't protect the buffer
+at all - it's up to the client to provide the appropriate
+synchronization when using relay_reserve().
+
+Closing a channel
+-----------------
+
+The client calls relay_close() when it's finished using the channel.
+The channel and its associated buffers are destroyed when there are no
+longer any references to any of the channel buffers. relay_flush()
+forces a sub-buffer switch on all the channel buffers, and can be used
+to finalize and process the last sub-buffers before the channel is
+closed.
+
+Misc
+----
+
+Some applications may want to keep a channel around and re-use it
+rather than open and close a new channel for each use. relay_reset()
+can be used for this purpose - it resets a channel to its initial
+state without reallocating channel buffer memory or destroying
+existing mappings. It should however only be called when it's safe to
+do so, i.e. when the channel isn't currently being written to.
+
+Finally, there are a couple of utility callbacks that can be used for
+different purposes. buf_mapped() is called whenever a channel buffer
+is mmapped from user space and buf_unmapped() is called when it's
+unmapped. The client can use this notification to trigger actions
+within the kernel application, such as enabling/disabling logging to
+the channel.
+
+
+Resources
+=========
+
+For news, example code, mailing list, etc. see the relay interface homepage:
+
+ http://relayfs.sourceforge.net
+
+
+Credits
+=======
+
+The ideas and specs for the relay interface came about as a result of
+discussions on tracing involving the following:
+
+Michel Dagenais <michel.dagenais@polymtl.ca>
+Richard Moore <richardj_moore@uk.ibm.com>
+Bob Wisniewski <bob@watson.ibm.com>
+Karim Yaghmour <karim@opersys.com>
+Tom Zanussi <zanussi@us.ibm.com>
+
+Also thanks to Hubertus Franke for a lot of useful suggestions and bug
+reports.
diff --git a/Documentation/filesystems/romfs.txt b/Documentation/filesystems/romfs.txt
new file mode 100644
index 0000000..2d2a7b2
--- /dev/null
+++ b/Documentation/filesystems/romfs.txt
@@ -0,0 +1,187 @@
+ROMFS - ROM FILE SYSTEM
+
+This is a quite dumb, read only filesystem, mainly for initial RAM
+disks of installation disks. It has grown up by the need of having
+modules linked at boot time. Using this filesystem, you get a very
+similar feature, and even the possibility of a small kernel, with a
+file system which doesn't take up useful memory from the router
+functions in the basement of your office.
+
+For comparison, both the older minix and xiafs (the latter is now
+defunct) filesystems, compiled as module need more than 20000 bytes,
+while romfs is less than a page, about 4000 bytes (assuming i586
+code). Under the same conditions, the msdos filesystem would need
+about 30K (and does not support device nodes or symlinks), while the
+nfs module with nfsroot is about 57K. Furthermore, as a bit unfair
+comparison, an actual rescue disk used up 3202 blocks with ext2, while
+with romfs, it needed 3079 blocks.
+
+To create such a file system, you'll need a user program named
+genromfs. It is available via anonymous ftp on sunsite.unc.edu and
+its mirrors, in the /pub/Linux/system/recovery/ directory.
+
+As the name suggests, romfs could be also used (space-efficiently) on
+various read-only media, like (E)EPROM disks if someone will have the
+motivation.. :)
+
+However, the main purpose of romfs is to have a very small kernel,
+which has only this filesystem linked in, and then can load any module
+later, with the current module utilities. It can also be used to run
+some program to decide if you need SCSI devices, and even IDE or
+floppy drives can be loaded later if you use the "initrd"--initial
+RAM disk--feature of the kernel. This would not be really news
+flash, but with romfs, you can even spare off your ext2 or minix or
+maybe even affs filesystem until you really know that you need it.
+
+For example, a distribution boot disk can contain only the cd disk
+drivers (and possibly the SCSI drivers), and the ISO 9660 filesystem
+module. The kernel can be small enough, since it doesn't have other
+filesystems, like the quite large ext2fs module, which can then be
+loaded off the CD at a later stage of the installation. Another use
+would be for a recovery disk, when you are reinstalling a workstation
+from the network, and you will have all the tools/modules available
+from a nearby server, so you don't want to carry two disks for this
+purpose, just because it won't fit into ext2.
+
+romfs operates on block devices as you can expect, and the underlying
+structure is very simple. Every accessible structure begins on 16
+byte boundaries for fast access. The minimum space a file will take
+is 32 bytes (this is an empty file, with a less than 16 character
+name). The maximum overhead for any non-empty file is the header, and
+the 16 byte padding for the name and the contents, also 16+14+15 = 45
+bytes. This is quite rare however, since most file names are longer
+than 3 bytes, and shorter than 15 bytes.
+
+The layout of the filesystem is the following:
+
+offset content
+
+ +---+---+---+---+
+ 0 | - | r | o | m | \
+ +---+---+---+---+ The ASCII representation of those bytes
+ 4 | 1 | f | s | - | / (i.e. "-rom1fs-")
+ +---+---+---+---+
+ 8 | full size | The number of accessible bytes in this fs.
+ +---+---+---+---+
+ 12 | checksum | The checksum of the FIRST 512 BYTES.
+ +---+---+---+---+
+ 16 | volume name | The zero terminated name of the volume,
+ : : padded to 16 byte boundary.
+ +---+---+---+---+
+ xx | file |
+ : headers :
+
+Every multi byte value (32 bit words, I'll use the longwords term from
+now on) must be in big endian order.
+
+The first eight bytes identify the filesystem, even for the casual
+inspector. After that, in the 3rd longword, it contains the number of
+bytes accessible from the start of this filesystem. The 4th longword
+is the checksum of the first 512 bytes (or the number of bytes
+accessible, whichever is smaller). The applied algorithm is the same
+as in the AFFS filesystem, namely a simple sum of the longwords
+(assuming bigendian quantities again). For details, please consult
+the source. This algorithm was chosen because although it's not quite
+reliable, it does not require any tables, and it is very simple.
+
+The following bytes are now part of the file system; each file header
+must begin on a 16 byte boundary.
+
+offset content
+
+ +---+---+---+---+
+ 0 | next filehdr|X| The offset of the next file header
+ +---+---+---+---+ (zero if no more files)
+ 4 | spec.info | Info for directories/hard links/devices
+ +---+---+---+---+
+ 8 | size | The size of this file in bytes
+ +---+---+---+---+
+ 12 | checksum | Covering the meta data, including the file
+ +---+---+---+---+ name, and padding
+ 16 | file name | The zero terminated name of the file,
+ : : padded to 16 byte boundary
+ +---+---+---+---+
+ xx | file data |
+ : :
+
+Since the file headers begin always at a 16 byte boundary, the lowest
+4 bits would be always zero in the next filehdr pointer. These four
+bits are used for the mode information. Bits 0..2 specify the type of
+the file; while bit 4 shows if the file is executable or not. The
+permissions are assumed to be world readable, if this bit is not set,
+and world executable if it is; except the character and block devices,
+they are never accessible for other than owner. The owner of every
+file is user and group 0, this should never be a problem for the
+intended use. The mapping of the 8 possible values to file types is
+the following:
+
+ mapping spec.info means
+ 0 hard link link destination [file header]
+ 1 directory first file's header
+ 2 regular file unused, must be zero [MBZ]
+ 3 symbolic link unused, MBZ (file data is the link content)
+ 4 block device 16/16 bits major/minor number
+ 5 char device - " -
+ 6 socket unused, MBZ
+ 7 fifo unused, MBZ
+
+Note that hard links are specifically marked in this filesystem, but
+they will behave as you can expect (i.e. share the inode number).
+Note also that it is your responsibility to not create hard link
+loops, and creating all the . and .. links for directories. This is
+normally done correctly by the genromfs program. Please refrain from
+using the executable bits for special purposes on the socket and fifo
+special files, they may have other uses in the future. Additionally,
+please remember that only regular files, and symlinks are supposed to
+have a nonzero size field; they contain the number of bytes available
+directly after the (padded) file name.
+
+Another thing to note is that romfs works on file headers and data
+aligned to 16 byte boundaries, but most hardware devices and the block
+device drivers are unable to cope with smaller than block-sized data.
+To overcome this limitation, the whole size of the file system must be
+padded to an 1024 byte boundary.
+
+If you have any problems or suggestions concerning this file system,
+please contact me. However, think twice before wanting me to add
+features and code, because the primary and most important advantage of
+this file system is the small code. On the other hand, don't be
+alarmed, I'm not getting that much romfs related mail. Now I can
+understand why Avery wrote poems in the ARCnet docs to get some more
+feedback. :)
+
+romfs has also a mailing list, and to date, it hasn't received any
+traffic, so you are welcome to join it to discuss your ideas. :)
+
+It's run by ezmlm, so you can subscribe to it by sending a message
+to romfs-subscribe@shadow.banki.hu, the content is irrelevant.
+
+Pending issues:
+
+- Permissions and owner information are pretty essential features of a
+Un*x like system, but romfs does not provide the full possibilities.
+I have never found this limiting, but others might.
+
+- The file system is read only, so it can be very small, but in case
+one would want to write _anything_ to a file system, he still needs
+a writable file system, thus negating the size advantages. Possible
+solutions: implement write access as a compile-time option, or a new,
+similarly small writable filesystem for RAM disks.
+
+- Since the files are only required to have alignment on a 16 byte
+boundary, it is currently possibly suboptimal to read or execute files
+from the filesystem. It might be resolved by reordering file data to
+have most of it (i.e. except the start and the end) laying at "natural"
+boundaries, thus it would be possible to directly map a big portion of
+the file contents to the mm subsystem.
+
+- Compression might be an useful feature, but memory is quite a
+limiting factor in my eyes.
+
+- Where it is used?
+
+- Does it work on other architectures than intel and motorola?
+
+
+Have fun,
+Janos Farkas <chexum@shadow.banki.hu>
diff --git a/Documentation/filesystems/rpc-cache.txt b/Documentation/filesystems/rpc-cache.txt
new file mode 100644
index 0000000..8a382be
--- /dev/null
+++ b/Documentation/filesystems/rpc-cache.txt
@@ -0,0 +1,202 @@
+ This document gives a brief introduction to the caching
+mechanisms in the sunrpc layer that is used, in particular,
+for NFS authentication.
+
+CACHES
+======
+The caching replaces the old exports table and allows for
+a wide variety of values to be caches.
+
+There are a number of caches that are similar in structure though
+quite possibly very different in content and use. There is a corpus
+of common code for managing these caches.
+
+Examples of caches that are likely to be needed are:
+ - mapping from IP address to client name
+ - mapping from client name and filesystem to export options
+ - mapping from UID to list of GIDs, to work around NFS's limitation
+ of 16 gids.
+ - mappings between local UID/GID and remote UID/GID for sites that
+ do not have uniform uid assignment
+ - mapping from network identify to public key for crypto authentication.
+
+The common code handles such things as:
+ - general cache lookup with correct locking
+ - supporting 'NEGATIVE' as well as positive entries
+ - allowing an EXPIRED time on cache items, and removing
+ items after they expire, and are no longer in-use.
+ - making requests to user-space to fill in cache entries
+ - allowing user-space to directly set entries in the cache
+ - delaying RPC requests that depend on as-yet incomplete
+ cache entries, and replaying those requests when the cache entry
+ is complete.
+ - clean out old entries as they expire.
+
+Creating a Cache
+----------------
+
+1/ A cache needs a datum to store. This is in the form of a
+ structure definition that must contain a
+ struct cache_head
+ as an element, usually the first.
+ It will also contain a key and some content.
+ Each cache element is reference counted and contains
+ expiry and update times for use in cache management.
+2/ A cache needs a "cache_detail" structure that
+ describes the cache. This stores the hash table, some
+ parameters for cache management, and some operations detailing how
+ to work with particular cache items.
+ The operations requires are:
+ struct cache_head *alloc(void)
+ This simply allocates appropriate memory and returns
+ a pointer to the cache_detail embedded within the
+ structure
+ void cache_put(struct kref *)
+ This is called when the last reference to an item is
+ dropped. The pointer passed is to the 'ref' field
+ in the cache_head. cache_put should release any
+ references create by 'cache_init' and, if CACHE_VALID
+ is set, any references created by cache_update.
+ It should then release the memory allocated by
+ 'alloc'.
+ int match(struct cache_head *orig, struct cache_head *new)
+ test if the keys in the two structures match. Return
+ 1 if they do, 0 if they don't.
+ void init(struct cache_head *orig, struct cache_head *new)
+ Set the 'key' fields in 'new' from 'orig'. This may
+ include taking references to shared objects.
+ void update(struct cache_head *orig, struct cache_head *new)
+ Set the 'content' fileds in 'new' from 'orig'.
+ int cache_show(struct seq_file *m, struct cache_detail *cd,
+ struct cache_head *h)
+ Optional. Used to provide a /proc file that lists the
+ contents of a cache. This should show one item,
+ usually on just one line.
+ int cache_request(struct cache_detail *cd, struct cache_head *h,
+ char **bpp, int *blen)
+ Format a request to be send to user-space for an item
+ to be instantiated. *bpp is a buffer of size *blen.
+ bpp should be moved forward over the encoded message,
+ and *blen should be reduced to show how much free
+ space remains. Return 0 on success or <0 if not
+ enough room or other problem.
+ int cache_parse(struct cache_detail *cd, char *buf, int len)
+ A message from user space has arrived to fill out a
+ cache entry. It is in 'buf' of length 'len'.
+ cache_parse should parse this, find the item in the
+ cache with sunrpc_cache_lookup, and update the item
+ with sunrpc_cache_update.
+
+
+3/ A cache needs to be registered using cache_register(). This
+ includes it on a list of caches that will be regularly
+ cleaned to discard old data.
+
+Using a cache
+-------------
+
+To find a value in a cache, call sunrpc_cache_lookup passing a pointer
+to the cache_head in a sample item with the 'key' fields filled in.
+This will be passed to ->match to identify the target entry. If no
+entry is found, a new entry will be create, added to the cache, and
+marked as not containing valid data.
+
+The item returned is typically passed to cache_check which will check
+if the data is valid, and may initiate an up-call to get fresh data.
+cache_check will return -ENOENT in the entry is negative or if an up
+call is needed but not possible, -EAGAIN if an upcall is pending,
+or 0 if the data is valid;
+
+cache_check can be passed a "struct cache_req *". This structure is
+typically embedded in the actual request and can be used to create a
+deferred copy of the request (struct cache_deferred_req). This is
+done when the found cache item is not uptodate, but the is reason to
+believe that userspace might provide information soon. When the cache
+item does become valid, the deferred copy of the request will be
+revisited (->revisit). It is expected that this method will
+reschedule the request for processing.
+
+The value returned by sunrpc_cache_lookup can also be passed to
+sunrpc_cache_update to set the content for the item. A second item is
+passed which should hold the content. If the item found by _lookup
+has valid data, then it is discarded and a new item is created. This
+saves any user of an item from worrying about content changing while
+it is being inspected. If the item found by _lookup does not contain
+valid data, then the content is copied across and CACHE_VALID is set.
+
+Populating a cache
+------------------
+
+Each cache has a name, and when the cache is registered, a directory
+with that name is created in /proc/net/rpc
+
+This directory contains a file called 'channel' which is a channel
+for communicating between kernel and user for populating the cache.
+This directory may later contain other files of interacting
+with the cache.
+
+The 'channel' works a bit like a datagram socket. Each 'write' is
+passed as a whole to the cache for parsing and interpretation.
+Each cache can treat the write requests differently, but it is
+expected that a message written will contain:
+ - a key
+ - an expiry time
+ - a content.
+with the intention that an item in the cache with the give key
+should be create or updated to have the given content, and the
+expiry time should be set on that item.
+
+Reading from a channel is a bit more interesting. When a cache
+lookup fails, or when it succeeds but finds an entry that may soon
+expire, a request is lodged for that cache item to be updated by
+user-space. These requests appear in the channel file.
+
+Successive reads will return successive requests.
+If there are no more requests to return, read will return EOF, but a
+select or poll for read will block waiting for another request to be
+added.
+
+Thus a user-space helper is likely to:
+ open the channel.
+ select for readable
+ read a request
+ write a response
+ loop.
+
+If it dies and needs to be restarted, any requests that have not been
+answered will still appear in the file and will be read by the new
+instance of the helper.
+
+Each cache should define a "cache_parse" method which takes a message
+written from user-space and processes it. It should return an error
+(which propagates back to the write syscall) or 0.
+
+Each cache should also define a "cache_request" method which
+takes a cache item and encodes a request into the buffer
+provided.
+
+Note: If a cache has no active readers on the channel, and has had not
+active readers for more than 60 seconds, further requests will not be
+added to the channel but instead all lookups that do not find a valid
+entry will fail. This is partly for backward compatibility: The
+previous nfs exports table was deemed to be authoritative and a
+failed lookup meant a definite 'no'.
+
+request/response format
+-----------------------
+
+While each cache is free to use it's own format for requests
+and responses over channel, the following is recommended as
+appropriate and support routines are available to help:
+Each request or response record should be printable ASCII
+with precisely one newline character which should be at the end.
+Fields within the record should be separated by spaces, normally one.
+If spaces, newlines, or nul characters are needed in a field they
+much be quoted. two mechanisms are available:
+1/ If a field begins '\x' then it must contain an even number of
+ hex digits, and pairs of these digits provide the bytes in the
+ field.
+2/ otherwise a \ in the field must be followed by 3 octal digits
+ which give the code for a byte. Other characters are treated
+ as them selves. At the very least, space, newline, nul, and
+ '\' must be quoted in this way.
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
new file mode 100644
index 0000000..b843743
--- /dev/null
+++ b/Documentation/filesystems/seq_file.txt
@@ -0,0 +1,294 @@
+The seq_file interface
+
+ Copyright 2003 Jonathan Corbet <corbet@lwn.net>
+ This file is originally from the LWN.net Driver Porting series at
+ http://lwn.net/Articles/driver-porting/
+
+
+There are numerous ways for a device driver (or other kernel component) to
+provide information to the user or system administrator. One useful
+technique is the creation of virtual files, in debugfs, /proc or elsewhere.
+Virtual files can provide human-readable output that is easy to get at
+without any special utility programs; they can also make life easier for
+script writers. It is not surprising that the use of virtual files has
+grown over the years.
+
+Creating those files correctly has always been a bit of a challenge,
+however. It is not that hard to make a virtual file which returns a
+string. But life gets trickier if the output is long - anything greater
+than an application is likely to read in a single operation. Handling
+multiple reads (and seeks) requires careful attention to the reader's
+position within the virtual file - that position is, likely as not, in the
+middle of a line of output. The kernel has traditionally had a number of
+implementations that got this wrong.
+
+The 2.6 kernel contains a set of functions (implemented by Alexander Viro)
+which are designed to make it easy for virtual file creators to get it
+right.
+
+The seq_file interface is available via <linux/seq_file.h>. There are
+three aspects to seq_file:
+
+ * An iterator interface which lets a virtual file implementation
+ step through the objects it is presenting.
+
+ * Some utility functions for formatting objects for output without
+ needing to worry about things like output buffers.
+
+ * A set of canned file_operations which implement most operations on
+ the virtual file.
+
+We'll look at the seq_file interface via an extremely simple example: a
+loadable module which creates a file called /proc/sequence. The file, when
+read, simply produces a set of increasing integer values, one per line. The
+sequence will continue until the user loses patience and finds something
+better to do. The file is seekable, in that one can do something like the
+following:
+
+ dd if=/proc/sequence of=out1 count=1
+ dd if=/proc/sequence skip=1 out=out2 count=1
+
+Then concatenate the output files out1 and out2 and get the right
+result. Yes, it is a thoroughly useless module, but the point is to show
+how the mechanism works without getting lost in other details. (Those
+wanting to see the full source for this module can find it at
+http://lwn.net/Articles/22359/).
+
+
+The iterator interface
+
+Modules implementing a virtual file with seq_file must implement a simple
+iterator object that allows stepping through the data of interest.
+Iterators must be able to move to a specific position - like the file they
+implement - but the interpretation of that position is up to the iterator
+itself. A seq_file implementation that is formatting firewall rules, for
+example, could interpret position N as the Nth rule in the chain.
+Positioning can thus be done in whatever way makes the most sense for the
+generator of the data, which need not be aware of how a position translates
+to an offset in the virtual file. The one obvious exception is that a
+position of zero should indicate the beginning of the file.
+
+The /proc/sequence iterator just uses the count of the next number it
+will output as its position.
+
+Four functions must be implemented to make the iterator work. The first,
+called start() takes a position as an argument and returns an iterator
+which will start reading at that position. For our simple sequence example,
+the start() function looks like:
+
+ static void *ct_seq_start(struct seq_file *s, loff_t *pos)
+ {
+ loff_t *spos = kmalloc(sizeof(loff_t), GFP_KERNEL);
+ if (! spos)
+ return NULL;
+ *spos = *pos;
+ return spos;
+ }
+
+The entire data structure for this iterator is a single loff_t value
+holding the current position. There is no upper bound for the sequence
+iterator, but that will not be the case for most other seq_file
+implementations; in most cases the start() function should check for a
+"past end of file" condition and return NULL if need be.
+
+For more complicated applications, the private field of the seq_file
+structure can be used. There is also a special value which can be returned
+by the start() function called SEQ_START_TOKEN; it can be used if you wish
+to instruct your show() function (described below) to print a header at the
+top of the output. SEQ_START_TOKEN should only be used if the offset is
+zero, however.
+
+The next function to implement is called, amazingly, next(); its job is to
+move the iterator forward to the next position in the sequence. The
+example module can simply increment the position by one; more useful
+modules will do what is needed to step through some data structure. The
+next() function returns a new iterator, or NULL if the sequence is
+complete. Here's the example version:
+
+ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+ {
+ loff_t *spos = v;
+ *pos = ++*spos;
+ return spos;
+ }
+
+The stop() function is called when iteration is complete; its job, of
+course, is to clean up. If dynamic memory is allocated for the iterator,
+stop() is the place to free it.
+
+ static void ct_seq_stop(struct seq_file *s, void *v)
+ {
+ kfree(v);
+ }
+
+Finally, the show() function should format the object currently pointed to
+by the iterator for output. The example module's show() function is:
+
+ static int ct_seq_show(struct seq_file *s, void *v)
+ {
+ loff_t *spos = v;
+ seq_printf(s, "%lld\n", (long long)*spos);
+ return 0;
+ }
+
+If all is well, the show() function should return zero. A negative error
+code in the usual manner indicates that something went wrong; it will be
+passed back to user space. This function can also return SEQ_SKIP, which
+causes the current item to be skipped; if the show() function has already
+generated output before returning SEQ_SKIP, that output will be dropped.
+
+We will look at seq_printf() in a moment. But first, the definition of the
+seq_file iterator is finished by creating a seq_operations structure with
+the four functions we have just defined:
+
+ static const struct seq_operations ct_seq_ops = {
+ .start = ct_seq_start,
+ .next = ct_seq_next,
+ .stop = ct_seq_stop,
+ .show = ct_seq_show
+ };
+
+This structure will be needed to tie our iterator to the /proc file in
+a little bit.
+
+It's worth noting that the iterator value returned by start() and
+manipulated by the other functions is considered to be completely opaque by
+the seq_file code. It can thus be anything that is useful in stepping
+through the data to be output. Counters can be useful, but it could also be
+a direct pointer into an array or linked list. Anything goes, as long as
+the programmer is aware that things can happen between calls to the
+iterator function. However, the seq_file code (by design) will not sleep
+between the calls to start() and stop(), so holding a lock during that time
+is a reasonable thing to do. The seq_file code will also avoid taking any
+other locks while the iterator is active.
+
+
+Formatted output
+
+The seq_file code manages positioning within the output created by the
+iterator and getting it into the user's buffer. But, for that to work, that
+output must be passed to the seq_file code. Some utility functions have
+been defined which make this task easy.
+
+Most code will simply use seq_printf(), which works pretty much like
+printk(), but which requires the seq_file pointer as an argument. It is
+common to ignore the return value from seq_printf(), but a function
+producing complicated output may want to check that value and quit if
+something non-zero is returned; an error return means that the seq_file
+buffer has been filled and further output will be discarded.
+
+For straight character output, the following functions may be used:
+
+ int seq_putc(struct seq_file *m, char c);
+ int seq_puts(struct seq_file *m, const char *s);
+ int seq_escape(struct seq_file *m, const char *s, const char *esc);
+
+The first two output a single character and a string, just like one would
+expect. seq_escape() is like seq_puts(), except that any character in s
+which is in the string esc will be represented in octal form in the output.
+
+There is also a pair of functions for printing filenames:
+
+ int seq_path(struct seq_file *m, struct path *path, char *esc);
+ int seq_path_root(struct seq_file *m, struct path *path,
+ struct path *root, char *esc)
+
+Here, path indicates the file of interest, and esc is a set of characters
+which should be escaped in the output. A call to seq_path() will output
+the path relative to the current process's filesystem root. If a different
+root is desired, it can be used with seq_path_root(). Note that, if it
+turns out that path cannot be reached from root, the value of root will be
+changed in seq_file_root() to a root which *does* work.
+
+
+Making it all work
+
+So far, we have a nice set of functions which can produce output within the
+seq_file system, but we have not yet turned them into a file that a user
+can see. Creating a file within the kernel requires, of course, the
+creation of a set of file_operations which implement the operations on that
+file. The seq_file interface provides a set of canned operations which do
+most of the work. The virtual file author still must implement the open()
+method, however, to hook everything up. The open function is often a single
+line, as in the example module:
+
+ static int ct_open(struct inode *inode, struct file *file)
+ {
+ return seq_open(file, &ct_seq_ops);
+ }
+
+Here, the call to seq_open() takes the seq_operations structure we created
+before, and gets set up to iterate through the virtual file.
+
+On a successful open, seq_open() stores the struct seq_file pointer in
+file->private_data. If you have an application where the same iterator can
+be used for more than one file, you can store an arbitrary pointer in the
+private field of the seq_file structure; that value can then be retrieved
+by the iterator functions.
+
+The other operations of interest - read(), llseek(), and release() - are
+all implemented by the seq_file code itself. So a virtual file's
+file_operations structure will look like:
+
+ static const struct file_operations ct_file_ops = {
+ .owner = THIS_MODULE,
+ .open = ct_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+ };
+
+There is also a seq_release_private() which passes the contents of the
+seq_file private field to kfree() before releasing the structure.
+
+The final step is the creation of the /proc file itself. In the example
+code, that is done in the initialization code in the usual way:
+
+ static int ct_init(void)
+ {
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("sequence", 0, NULL);
+ if (entry)
+ entry->proc_fops = &ct_file_ops;
+ return 0;
+ }
+
+ module_init(ct_init);
+
+And that is pretty much it.
+
+
+seq_list
+
+If your file will be iterating through a linked list, you may find these
+routines useful:
+
+ struct list_head *seq_list_start(struct list_head *head,
+ loff_t pos);
+ struct list_head *seq_list_start_head(struct list_head *head,
+ loff_t pos);
+ struct list_head *seq_list_next(void *v, struct list_head *head,
+ loff_t *ppos);
+
+These helpers will interpret pos as a position within the list and iterate
+accordingly. Your start() and next() functions need only invoke the
+seq_list_* helpers with a pointer to the appropriate list_head structure.
+
+
+The extra-simple version
+
+For extremely simple virtual files, there is an even easier interface. A
+module can define only the show() function, which should create all the
+output that the virtual file will contain. The file's open() method then
+calls:
+
+ int single_open(struct file *file,
+ int (*show)(struct seq_file *m, void *p),
+ void *data);
+
+When output time comes, the show() function will be called once. The data
+value given to single_open() can be found in the private field of the
+seq_file structure. When using single_open(), the programmer should use
+single_release() instead of seq_release() in the file_operations structure
+to avoid a memory leak.
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
new file mode 100644
index 0000000..7365400
--- /dev/null
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -0,0 +1,1061 @@
+Shared Subtrees
+---------------
+
+Contents:
+ 1) Overview
+ 2) Features
+ 3) smount command
+ 4) Use-case
+ 5) Detailed semantics
+ 6) Quiz
+ 7) FAQ
+ 8) Implementation
+
+
+1) Overview
+-----------
+
+Consider the following situation:
+
+A process wants to clone its own namespace, but still wants to access the CD
+that got mounted recently. Shared subtree semantics provide the necessary
+mechanism to accomplish the above.
+
+It provides the necessary building blocks for features like per-user-namespace
+and versioned filesystem.
+
+2) Features
+-----------
+
+Shared subtree provides four different flavors of mounts; struct vfsmount to be
+precise
+
+ a. shared mount
+ b. slave mount
+ c. private mount
+ d. unbindable mount
+
+
+2a) A shared mount can be replicated to as many mountpoints and all the
+replicas continue to be exactly same.
+
+ Here is an example:
+
+ Lets say /mnt has a mount that is shared.
+ mount --make-shared /mnt
+
+ note: mount command does not yet support the --make-shared flag.
+ I have included a small C program which does the same by executing
+ 'smount /mnt shared'
+
+ #mount --bind /mnt /tmp
+ The above command replicates the mount at /mnt to the mountpoint /tmp
+ and the contents of both the mounts remain identical.
+
+ #ls /mnt
+ a b c
+
+ #ls /tmp
+ a b c
+
+ Now lets say we mount a device at /tmp/a
+ #mount /dev/sd0 /tmp/a
+
+ #ls /tmp/a
+ t1 t2 t2
+
+ #ls /mnt/a
+ t1 t2 t2
+
+ Note that the mount has propagated to the mount at /mnt as well.
+
+ And the same is true even when /dev/sd0 is mounted on /mnt/a. The
+ contents will be visible under /tmp/a too.
+
+
+2b) A slave mount is like a shared mount except that mount and umount events
+ only propagate towards it.
+
+ All slave mounts have a master mount which is a shared.
+
+ Here is an example:
+
+ Lets say /mnt has a mount which is shared.
+ #mount --make-shared /mnt
+
+ Lets bind mount /mnt to /tmp
+ #mount --bind /mnt /tmp
+
+ the new mount at /tmp becomes a shared mount and it is a replica of
+ the mount at /mnt.
+
+ Now lets make the mount at /tmp; a slave of /mnt
+ #mount --make-slave /tmp
+ [or smount /tmp slave]
+
+ lets mount /dev/sd0 on /mnt/a
+ #mount /dev/sd0 /mnt/a
+
+ #ls /mnt/a
+ t1 t2 t3
+
+ #ls /tmp/a
+ t1 t2 t3
+
+ Note the mount event has propagated to the mount at /tmp
+
+ However lets see what happens if we mount something on the mount at /tmp
+
+ #mount /dev/sd1 /tmp/b
+
+ #ls /tmp/b
+ s1 s2 s3
+
+ #ls /mnt/b
+
+ Note how the mount event has not propagated to the mount at
+ /mnt
+
+
+2c) A private mount does not forward or receive propagation.
+
+ This is the mount we are familiar with. Its the default type.
+
+
+2d) A unbindable mount is a unbindable private mount
+
+ lets say we have a mount at /mnt and we make is unbindable
+
+ #mount --make-unbindable /mnt
+ [ smount /mnt unbindable ]
+
+ Lets try to bind mount this mount somewhere else.
+ # mount --bind /mnt /tmp
+ mount: wrong fs type, bad option, bad superblock on /mnt,
+ or too many mounted file systems
+
+ Binding a unbindable mount is a invalid operation.
+
+
+3) smount command
+
+ Currently the mount command is not aware of shared subtree features.
+ Work is in progress to add the support in mount ( util-linux package ).
+ Till then use the following program.
+
+ ------------------------------------------------------------------------
+ //
+ //this code was developed my Miklos Szeredi <miklos@szeredi.hu>
+ //and modified by Ram Pai <linuxram@us.ibm.com>
+ // sample usage:
+ // smount /tmp shared
+ //
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <unistd.h>
+ #include <string.h>
+ #include <sys/mount.h>
+ #include <sys/fsuid.h>
+
+ #ifndef MS_REC
+ #define MS_REC 0x4000 /* 16384: Recursive loopback */
+ #endif
+
+ #ifndef MS_SHARED
+ #define MS_SHARED 1<<20 /* Shared */
+ #endif
+
+ #ifndef MS_PRIVATE
+ #define MS_PRIVATE 1<<18 /* Private */
+ #endif
+
+ #ifndef MS_SLAVE
+ #define MS_SLAVE 1<<19 /* Slave */
+ #endif
+
+ #ifndef MS_UNBINDABLE
+ #define MS_UNBINDABLE 1<<17 /* Unbindable */
+ #endif
+
+ int main(int argc, char *argv[])
+ {
+ int type;
+ if(argc != 3) {
+ fprintf(stderr, "usage: %s dir "
+ "<rshared|rslave|rprivate|runbindable|shared|slave"
+ "|private|unbindable>\n" , argv[0]);
+ return 1;
+ }
+
+ fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);
+
+ if (strcmp(argv[2],"rshared")==0)
+ type=(MS_SHARED|MS_REC);
+ else if (strcmp(argv[2],"rslave")==0)
+ type=(MS_SLAVE|MS_REC);
+ else if (strcmp(argv[2],"rprivate")==0)
+ type=(MS_PRIVATE|MS_REC);
+ else if (strcmp(argv[2],"runbindable")==0)
+ type=(MS_UNBINDABLE|MS_REC);
+ else if (strcmp(argv[2],"shared")==0)
+ type=MS_SHARED;
+ else if (strcmp(argv[2],"slave")==0)
+ type=MS_SLAVE;
+ else if (strcmp(argv[2],"private")==0)
+ type=MS_PRIVATE;
+ else if (strcmp(argv[2],"unbindable")==0)
+ type=MS_UNBINDABLE;
+ else {
+ fprintf(stderr, "invalid operation: %s\n", argv[2]);
+ return 1;
+ }
+ setfsuid(getuid());
+
+ if(mount("", argv[1], "dontcare", type, "") == -1) {
+ perror("mount");
+ return 1;
+ }
+ return 0;
+ }
+ -----------------------------------------------------------------------
+
+ Copy the above code snippet into smount.c
+ gcc -o smount smount.c
+
+
+ (i) To mark all the mounts under /mnt as shared execute the following
+ command:
+
+ smount /mnt rshared
+ the corresponding syntax planned for mount command is
+ mount --make-rshared /mnt
+
+ just to mark a mount /mnt as shared, execute the following
+ command:
+ smount /mnt shared
+ the corresponding syntax planned for mount command is
+ mount --make-shared /mnt
+
+ (ii) To mark all the shared mounts under /mnt as slave execute the
+ following
+
+ command:
+ smount /mnt rslave
+ the corresponding syntax planned for mount command is
+ mount --make-rslave /mnt
+
+ just to mark a mount /mnt as slave, execute the following
+ command:
+ smount /mnt slave
+ the corresponding syntax planned for mount command is
+ mount --make-slave /mnt
+
+ (iii) To mark all the mounts under /mnt as private execute the
+ following command:
+
+ smount /mnt rprivate
+ the corresponding syntax planned for mount command is
+ mount --make-rprivate /mnt
+
+ just to mark a mount /mnt as private, execute the following
+ command:
+ smount /mnt private
+ the corresponding syntax planned for mount command is
+ mount --make-private /mnt
+
+ NOTE: by default all the mounts are created as private. But if
+ you want to change some shared/slave/unbindable mount as
+ private at a later point in time, this command can help.
+
+ (iv) To mark all the mounts under /mnt as unbindable execute the
+ following
+
+ command:
+ smount /mnt runbindable
+ the corresponding syntax planned for mount command is
+ mount --make-runbindable /mnt
+
+ just to mark a mount /mnt as unbindable, execute the following
+ command:
+ smount /mnt unbindable
+ the corresponding syntax planned for mount command is
+ mount --make-unbindable /mnt
+
+
+4) Use cases
+------------
+
+ A) A process wants to clone its own namespace, but still wants to
+ access the CD that got mounted recently.
+
+ Solution:
+
+ The system administrator can make the mount at /cdrom shared
+ mount --bind /cdrom /cdrom
+ mount --make-shared /cdrom
+
+ Now any process that clones off a new namespace will have a
+ mount at /cdrom which is a replica of the same mount in the
+ parent namespace.
+
+ So when a CD is inserted and mounted at /cdrom that mount gets
+ propagated to the other mount at /cdrom in all the other clone
+ namespaces.
+
+ B) A process wants its mounts invisible to any other process, but
+ still be able to see the other system mounts.
+
+ Solution:
+
+ To begin with, the administrator can mark the entire mount tree
+ as shareable.
+
+ mount --make-rshared /
+
+ A new process can clone off a new namespace. And mark some part
+ of its namespace as slave
+
+ mount --make-rslave /myprivatetree
+
+ Hence forth any mounts within the /myprivatetree done by the
+ process will not show up in any other namespace. However mounts
+ done in the parent namespace under /myprivatetree still shows
+ up in the process's namespace.
+
+
+ Apart from the above semantics this feature provides the
+ building blocks to solve the following problems:
+
+ C) Per-user namespace
+
+ The above semantics allows a way to share mounts across
+ namespaces. But namespaces are associated with processes. If
+ namespaces are made first class objects with user API to
+ associate/disassociate a namespace with userid, then each user
+ could have his/her own namespace and tailor it to his/her
+ requirements. Offcourse its needs support from PAM.
+
+ D) Versioned files
+
+ If the entire mount tree is visible at multiple locations, then
+ a underlying versioning file system can return different
+ version of the file depending on the path used to access that
+ file.
+
+ An example is:
+
+ mount --make-shared /
+ mount --rbind / /view/v1
+ mount --rbind / /view/v2
+ mount --rbind / /view/v3
+ mount --rbind / /view/v4
+
+ and if /usr has a versioning filesystem mounted, than that
+ mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and
+ /view/v4/usr too
+
+ A user can request v3 version of the file /usr/fs/namespace.c
+ by accessing /view/v3/usr/fs/namespace.c . The underlying
+ versioning filesystem can then decipher that v3 version of the
+ filesystem is being requested and return the corresponding
+ inode.
+
+5) Detailed semantics:
+-------------------
+ The section below explains the detailed semantics of
+ bind, rbind, move, mount, umount and clone-namespace operations.
+
+ Note: the word 'vfsmount' and the noun 'mount' have been used
+ to mean the same thing, throughout this document.
+
+5a) Mount states
+
+ A given mount can be in one of the following states
+ 1) shared
+ 2) slave
+ 3) shared and slave
+ 4) private
+ 5) unbindable
+
+ A 'propagation event' is defined as event generated on a vfsmount
+ that leads to mount or unmount actions in other vfsmounts.
+
+ A 'peer group' is defined as a group of vfsmounts that propagate
+ events to each other.
+
+ (1) Shared mounts
+
+ A 'shared mount' is defined as a vfsmount that belongs to a
+ 'peer group'.
+
+ For example:
+ mount --make-shared /mnt
+ mount --bin /mnt /tmp
+
+ The mount at /mnt and that at /tmp are both shared and belong
+ to the same peer group. Anything mounted or unmounted under
+ /mnt or /tmp reflect in all the other mounts of its peer
+ group.
+
+
+ (2) Slave mounts
+
+ A 'slave mount' is defined as a vfsmount that receives
+ propagation events and does not forward propagation events.
+
+ A slave mount as the name implies has a master mount from which
+ mount/unmount events are received. Events do not propagate from
+ the slave mount to the master. Only a shared mount can be made
+ a slave by executing the following command
+
+ mount --make-slave mount
+
+ A shared mount that is made as a slave is no more shared unless
+ modified to become shared.
+
+ (3) Shared and Slave
+
+ A vfsmount can be both shared as well as slave. This state
+ indicates that the mount is a slave of some vfsmount, and
+ has its own peer group too. This vfsmount receives propagation
+ events from its master vfsmount, and also forwards propagation
+ events to its 'peer group' and to its slave vfsmounts.
+
+ Strictly speaking, the vfsmount is shared having its own
+ peer group, and this peer-group is a slave of some other
+ peer group.
+
+ Only a slave vfsmount can be made as 'shared and slave' by
+ either executing the following command
+ mount --make-shared mount
+ or by moving the slave vfsmount under a shared vfsmount.
+
+ (4) Private mount
+
+ A 'private mount' is defined as vfsmount that does not
+ receive or forward any propagation events.
+
+ (5) Unbindable mount
+
+ A 'unbindable mount' is defined as vfsmount that does not
+ receive or forward any propagation events and cannot
+ be bind mounted.
+
+
+ State diagram:
+ The state diagram below explains the state transition of a mount,
+ in response to various commands.
+ ------------------------------------------------------------------------
+ | |make-shared | make-slave | make-private |make-unbindab|
+ --------------|------------|--------------|--------------|-------------|
+ |shared |shared |*slave/private| private | unbindable |
+ | | | | | |
+ |-------------|------------|--------------|--------------|-------------|
+ |slave |shared | **slave | private | unbindable |
+ | |and slave | | | |
+ |-------------|------------|--------------|--------------|-------------|
+ |shared |shared | slave | private | unbindable |
+ |and slave |and slave | | | |
+ |-------------|------------|--------------|--------------|-------------|
+ |private |shared | **private | private | unbindable |
+ |-------------|------------|--------------|--------------|-------------|
+ |unbindable |shared |**unbindable | private | unbindable |
+ ------------------------------------------------------------------------
+
+ * if the shared mount is the only mount in its peer group, making it
+ slave, makes it private automatically. Note that there is no master to
+ which it can be slaved to.
+
+ ** slaving a non-shared mount has no effect on the mount.
+
+ Apart from the commands listed below, the 'move' operation also changes
+ the state of a mount depending on type of the destination mount. Its
+ explained in section 5d.
+
+5b) Bind semantics
+
+ Consider the following command
+
+ mount --bind A/a B/b
+
+ where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B'
+ is the destination mount and 'b' is the dentry in the destination mount.
+
+ The outcome depends on the type of mount of 'A' and 'B'. The table
+ below contains quick reference.
+ ---------------------------------------------------------------------------
+ | BIND MOUNT OPERATION |
+ |**************************************************************************
+ |source(A)->| shared | private | slave | unbindable |
+ | dest(B) | | | | |
+ | | | | | | |
+ | v | | | | |
+ |**************************************************************************
+ | shared | shared | shared | shared & slave | invalid |
+ | | | | | |
+ |non-shared| shared | private | slave | invalid |
+ ***************************************************************************
+
+ Details:
+
+ 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C'
+ which is clone of 'A', is created. Its root dentry is 'a' . 'C' is
+ mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ...
+ are created and mounted at the dentry 'b' on all mounts where 'B'
+ propagates to. A new propagation tree containing 'C1',..,'Cn' is
+ created. This propagation tree is identical to the propagation tree of
+ 'B'. And finally the peer-group of 'C' is merged with the peer group
+ of 'A'.
+
+ 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C'
+ which is clone of 'A', is created. Its root dentry is 'a'. 'C' is
+ mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ...
+ are created and mounted at the dentry 'b' on all mounts where 'B'
+ propagates to. A new propagation tree is set containing all new mounts
+ 'C', 'C1', .., 'Cn' with exactly the same configuration as the
+ propagation tree for 'B'.
+
+ 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new
+ mount 'C' which is clone of 'A', is created. Its root dentry is 'a' .
+ 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2',
+ 'C3' ... are created and mounted at the dentry 'b' on all mounts where
+ 'B' propagates to. A new propagation tree containing the new mounts
+ 'C','C1',.. 'Cn' is created. This propagation tree is identical to the
+ propagation tree for 'B'. And finally the mount 'C' and its peer group
+ is made the slave of mount 'Z'. In other words, mount 'C' is in the
+ state 'slave and shared'.
+
+ 4. 'A' is a unbindable mount and 'B' is a shared mount. This is a
+ invalid operation.
+
+ 5. 'A' is a private mount and 'B' is a non-shared(private or slave or
+ unbindable) mount. A new mount 'C' which is clone of 'A', is created.
+ Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'.
+
+ 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C'
+ which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is
+ mounted on mount 'B' at dentry 'b'. 'C' is made a member of the
+ peer-group of 'A'.
+
+ 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A
+ new mount 'C' which is a clone of 'A' is created. Its root dentry is
+ 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a
+ slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of
+ 'Z'. All mount/unmount events on 'Z' propagates to 'A' and 'C'. But
+ mount/unmount on 'A' do not propagate anywhere else. Similarly
+ mount/unmount on 'C' do not propagate anywhere else.
+
+ 8. 'A' is a unbindable mount and 'B' is a non-shared mount. This is a
+ invalid operation. A unbindable mount cannot be bind mounted.
+
+5c) Rbind semantics
+
+ rbind is same as bind. Bind replicates the specified mount. Rbind
+ replicates all the mounts in the tree belonging to the specified mount.
+ Rbind mount is bind mount applied to all the mounts in the tree.
+
+ If the source tree that is rbind has some unbindable mounts,
+ then the subtree under the unbindable mount is pruned in the new
+ location.
+
+ eg: lets say we have the following mount tree.
+
+ A
+ / \
+ B C
+ / \ / \
+ D E F G
+
+ Lets say all the mount except the mount C in the tree are
+ of a type other than unbindable.
+
+ If this tree is rbound to say Z
+
+ We will have the following tree at the new location.
+
+ Z
+ |
+ A'
+ /
+ B' Note how the tree under C is pruned
+ / \ in the new location.
+ D' E'
+
+
+
+5d) Move semantics
+
+ Consider the following command
+
+ mount --move A B/b
+
+ where 'A' is the source mount, 'B' is the destination mount and 'b' is
+ the dentry in the destination mount.
+
+ The outcome depends on the type of the mount of 'A' and 'B'. The table
+ below is a quick reference.
+ ---------------------------------------------------------------------------
+ | MOVE MOUNT OPERATION |
+ |**************************************************************************
+ | source(A)->| shared | private | slave | unbindable |
+ | dest(B) | | | | |
+ | | | | | | |
+ | v | | | | |
+ |**************************************************************************
+ | shared | shared | shared |shared and slave| invalid |
+ | | | | | |
+ |non-shared| shared | private | slave | unbindable |
+ ***************************************************************************
+ NOTE: moving a mount residing under a shared mount is invalid.
+
+ Details follow:
+
+ 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is
+ mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An'
+ are created and mounted at dentry 'b' on all mounts that receive
+ propagation from mount 'B'. A new propagation tree is created in the
+ exact same configuration as that of 'B'. This new propagation tree
+ contains all the new mounts 'A1', 'A2'... 'An'. And this new
+ propagation tree is appended to the already existing propagation tree
+ of 'A'.
+
+ 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is
+ mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 'An'
+ are created and mounted at dentry 'b' on all mounts that receive
+ propagation from mount 'B'. The mount 'A' becomes a shared mount and a
+ propagation tree is created which is identical to that of
+ 'B'. This new propagation tree contains all the new mounts 'A1',
+ 'A2'... 'An'.
+
+ 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The
+ mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1',
+ 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that
+ receive propagation from mount 'B'. A new propagation tree is created
+ in the exact same configuration as that of 'B'. This new propagation
+ tree contains all the new mounts 'A1', 'A2'... 'An'. And this new
+ propagation tree is appended to the already existing propagation tree of
+ 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also
+ becomes 'shared'.
+
+ 4. 'A' is a unbindable mount and 'B' is a shared mount. The operation
+ is invalid. Because mounting anything on the shared mount 'B' can
+ create new mounts that get mounted on the mounts that receive
+ propagation from 'B'. And since the mount 'A' is unbindable, cloning
+ it to mount at other mountpoints is not possible.
+
+ 5. 'A' is a private mount and 'B' is a non-shared(private or slave or
+ unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'.
+
+ 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A'
+ is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a
+ shared mount.
+
+ 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount.
+ The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A'
+ continues to be a slave mount of mount 'Z'.
+
+ 8. 'A' is a unbindable mount and 'B' is a non-shared mount. The mount
+ 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a
+ unbindable mount.
+
+5e) Mount semantics
+
+ Consider the following command
+
+ mount device B/b
+
+ 'B' is the destination mount and 'b' is the dentry in the destination
+ mount.
+
+ The above operation is the same as bind operation with the exception
+ that the source mount is always a private mount.
+
+
+5f) Unmount semantics
+
+ Consider the following command
+
+ umount A
+
+ where 'A' is a mount mounted on mount 'B' at dentry 'b'.
+
+ If mount 'B' is shared, then all most-recently-mounted mounts at dentry
+ 'b' on mounts that receive propagation from mount 'B' and does not have
+ sub-mounts within them are unmounted.
+
+ Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to
+ each other.
+
+ lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
+ 'B1', 'B2' and 'B3' respectively.
+
+ lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
+ mount 'B1', 'B2' and 'B3' respectively.
+
+ if 'C1' is unmounted, all the mounts that are most-recently-mounted on
+ 'B1' and on the mounts that 'B1' propagates-to are unmounted.
+
+ 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount
+ on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'.
+
+ So all 'C1', 'C2' and 'C3' should be unmounted.
+
+ If any of 'C2' or 'C3' has some child mounts, then that mount is not
+ unmounted, but all other mounts are unmounted. However if 'C1' is told
+ to be unmounted and 'C1' has some sub-mounts, the umount operation is
+ failed entirely.
+
+5g) Clone Namespace
+
+ A cloned namespace contains all the mounts as that of the parent
+ namespace.
+
+ Lets say 'A' and 'B' are the corresponding mounts in the parent and the
+ child namespace.
+
+ If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
+ each other.
+
+ If 'A' is a slave mount of 'Z', then 'B' is also the slave mount of
+ 'Z'.
+
+ If 'A' is a private mount, then 'B' is a private mount too.
+
+ If 'A' is unbindable mount, then 'B' is a unbindable mount too.
+
+
+6) Quiz
+
+ A. What is the result of the following command sequence?
+
+ mount --bind /mnt /mnt
+ mount --make-shared /mnt
+ mount --bind /mnt /tmp
+ mount --move /tmp /mnt/1
+
+ what should be the contents of /mnt /mnt/1 /mnt/1/1 should be?
+ Should they all be identical? or should /mnt and /mnt/1 be
+ identical only?
+
+
+ B. What is the result of the following command sequence?
+
+ mount --make-rshared /
+ mkdir -p /v/1
+ mount --rbind / /v/1
+
+ what should be the content of /v/1/v/1 be?
+
+
+ C. What is the result of the following command sequence?
+
+ mount --bind /mnt /mnt
+ mount --make-shared /mnt
+ mkdir -p /mnt/1/2/3 /mnt/1/test
+ mount --bind /mnt/1 /tmp
+ mount --make-slave /mnt
+ mount --make-shared /mnt
+ mount --bind /mnt/1/2 /tmp1
+ mount --make-slave /mnt
+
+ At this point we have the first mount at /tmp and
+ its root dentry is 1. Lets call this mount 'A'
+ And then we have a second mount at /tmp1 with root
+ dentry 2. Lets call this mount 'B'
+ Next we have a third mount at /mnt with root dentry
+ mnt. Lets call this mount 'C'
+
+ 'B' is the slave of 'A' and 'C' is a slave of 'B'
+ A -> B -> C
+
+ at this point if we execute the following command
+
+ mount --bind /bin /tmp/test
+
+ The mount is attempted on 'A'
+
+ will the mount propagate to 'B' and 'C' ?
+
+ what would be the contents of
+ /mnt/1/test be?
+
+7) FAQ
+
+ Q1. Why is bind mount needed? How is it different from symbolic links?
+ symbolic links can get stale if the destination mount gets
+ unmounted or moved. Bind mounts continue to exist even if the
+ other mount is unmounted or moved.
+
+ Q2. Why can't the shared subtree be implemented using exportfs?
+
+ exportfs is a heavyweight way of accomplishing part of what
+ shared subtree can do. I cannot imagine a way to implement the
+ semantics of slave mount using exportfs?
+
+ Q3 Why is unbindable mount needed?
+
+ Lets say we want to replicate the mount tree at multiple
+ locations within the same subtree.
+
+ if one rbind mounts a tree within the same subtree 'n' times
+ the number of mounts created is an exponential function of 'n'.
+ Having unbindable mount can help prune the unneeded bind
+ mounts. Here is a example.
+
+ step 1:
+ lets say the root tree has just two directories with
+ one vfsmount.
+ root
+ / \
+ tmp usr
+
+ And we want to replicate the tree at multiple
+ mountpoints under /root/tmp
+
+ step2:
+ mount --make-shared /root
+
+ mkdir -p /tmp/m1
+
+ mount --rbind /root /tmp/m1
+
+ the new tree now looks like this:
+
+ root
+ / \
+ tmp usr
+ /
+ m1
+ / \
+ tmp usr
+ /
+ m1
+
+ it has two vfsmounts
+
+ step3:
+ mkdir -p /tmp/m2
+ mount --rbind /root /tmp/m2
+
+ the new tree now looks like this:
+
+ root
+ / \
+ tmp usr
+ / \
+ m1 m2
+ / \ / \
+ tmp usr tmp usr
+ / \ /
+ m1 m2 m1
+ / \ / \
+ tmp usr tmp usr
+ / / \
+ m1 m1 m2
+ / \
+ tmp usr
+ / \
+ m1 m2
+
+ it has 6 vfsmounts
+
+ step 4:
+ mkdir -p /tmp/m3
+ mount --rbind /root /tmp/m3
+
+ I wont' draw the tree..but it has 24 vfsmounts
+
+
+ at step i the number of vfsmounts is V[i] = i*V[i-1].
+ This is an exponential function. And this tree has way more
+ mounts than what we really needed in the first place.
+
+ One could use a series of umount at each step to prune
+ out the unneeded mounts. But there is a better solution.
+ Unclonable mounts come in handy here.
+
+ step 1:
+ lets say the root tree has just two directories with
+ one vfsmount.
+ root
+ / \
+ tmp usr
+
+ How do we set up the same tree at multiple locations under
+ /root/tmp
+
+ step2:
+ mount --bind /root/tmp /root/tmp
+
+ mount --make-rshared /root
+ mount --make-unbindable /root/tmp
+
+ mkdir -p /tmp/m1
+
+ mount --rbind /root /tmp/m1
+
+ the new tree now looks like this:
+
+ root
+ / \
+ tmp usr
+ /
+ m1
+ / \
+ tmp usr
+
+ step3:
+ mkdir -p /tmp/m2
+ mount --rbind /root /tmp/m2
+
+ the new tree now looks like this:
+
+ root
+ / \
+ tmp usr
+ / \
+ m1 m2
+ / \ / \
+ tmp usr tmp usr
+
+ step4:
+
+ mkdir -p /tmp/m3
+ mount --rbind /root /tmp/m3
+
+ the new tree now looks like this:
+
+ root
+ / \
+ tmp usr
+ / \ \
+ m1 m2 m3
+ / \ / \ / \
+ tmp usr tmp usr tmp usr
+
+8) Implementation
+
+8A) Datastructure
+
+ 4 new fields are introduced to struct vfsmount
+ ->mnt_share
+ ->mnt_slave_list
+ ->mnt_slave
+ ->mnt_master
+
+ ->mnt_share links together all the mount to/from which this vfsmount
+ send/receives propagation events.
+
+ ->mnt_slave_list links all the mounts to which this vfsmount propagates
+ to.
+
+ ->mnt_slave links together all the slaves that its master vfsmount
+ propagates to.
+
+ ->mnt_master points to the master vfsmount from which this vfsmount
+ receives propagation.
+
+ ->mnt_flags takes two more flags to indicate the propagation status of
+ the vfsmount. MNT_SHARE indicates that the vfsmount is a shared
+ vfsmount. MNT_UNCLONABLE indicates that the vfsmount cannot be
+ replicated.
+
+ All the shared vfsmounts in a peer group form a cyclic list through
+ ->mnt_share.
+
+ All vfsmounts with the same ->mnt_master form on a cyclic list anchored
+ in ->mnt_master->mnt_slave_list and going through ->mnt_slave.
+
+ ->mnt_master can point to arbitrary (and possibly different) members
+ of master peer group. To find all immediate slaves of a peer group
+ you need to go through _all_ ->mnt_slave_list of its members.
+ Conceptually it's just a single set - distribution among the
+ individual lists does not affect propagation or the way propagation
+ tree is modified by operations.
+
+ A example propagation tree looks as shown in the figure below.
+ [ NOTE: Though it looks like a forest, if we consider all the shared
+ mounts as a conceptual entity called 'pnode', it becomes a tree]
+
+
+ A <--> B <--> C <---> D
+ /|\ /| |\
+ / F G J K H I
+ /
+ E<-->K
+ /|\
+ M L N
+
+ In the above figure A,B,C and D all are shared and propagate to each
+ other. 'A' has got 3 slave mounts 'E' 'F' and 'G' 'C' has got 2 slave
+ mounts 'J' and 'K' and 'D' has got two slave mounts 'H' and 'I'.
+ 'E' is also shared with 'K' and they propagate to each other. And
+ 'K' has 3 slaves 'M', 'L' and 'N'
+
+ A's ->mnt_share links with the ->mnt_share of 'B' 'C' and 'D'
+
+ A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G'
+
+ E's ->mnt_share links with ->mnt_share of K
+ 'E', 'K', 'F', 'G' have their ->mnt_master point to struct
+ vfsmount of 'A'
+ 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K'
+ K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N'
+
+ C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K'
+ J and K's ->mnt_master points to struct vfsmount of C
+ and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I'
+ 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'.
+
+
+ NOTE: The propagation tree is orthogonal to the mount tree.
+
+
+8B Algorithm:
+
+ The crux of the implementation resides in rbind/move operation.
+
+ The overall algorithm breaks the operation into 3 phases: (look at
+ attach_recursive_mnt() and propagate_mnt())
+
+ 1. prepare phase.
+ 2. commit phases.
+ 3. abort phases.
+
+ Prepare phase:
+
+ for each mount in the source tree:
+ a) Create the necessary number of mount trees to
+ be attached to each of the mounts that receive
+ propagation from the destination mount.
+ b) Do not attach any of the trees to its destination.
+ However note down its ->mnt_parent and ->mnt_mountpoint
+ c) Link all the new mounts to form a propagation tree that
+ is identical to the propagation tree of the destination
+ mount.
+
+ If this phase is successful, there should be 'n' new
+ propagation trees; where 'n' is the number of mounts in the
+ source tree. Go to the commit phase
+
+ Also there should be 'm' new mount trees, where 'm' is
+ the number of mounts to which the destination mount
+ propagates to.
+
+ if any memory allocations fail, go to the abort phase.
+
+ Commit phase
+ attach each of the mount trees to their corresponding
+ destination mounts.
+
+ Abort phase
+ delete all the newly created trees.
+
+ NOTE: all the propagation related functionality resides in the file
+ pnode.c
+
+
+------------------------------------------------------------------------
+
+version 0.1 (created the initial document, Ram Pai linuxram@us.ibm.com)
+version 0.2 (Incorporated comments from Al Viro)
diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt
new file mode 100644
index 0000000..f673ef0
--- /dev/null
+++ b/Documentation/filesystems/smbfs.txt
@@ -0,0 +1,8 @@
+Smbfs is a filesystem that implements the SMB protocol, which is the
+protocol used by Windows for Workgroups, Windows 95 and Windows NT.
+Smbfs was inspired by Samba, the program written by Andrew Tridgell
+that turns any Unix host into a file server for DOS or Windows clients.
+
+Smbfs is a SMB client, but uses parts of samba for it's operation. For
+more info on samba, including documentation, please go to
+http://www.samba.org/ and then on to your nearest mirror.
diff --git a/Documentation/filesystems/spufs.txt b/Documentation/filesystems/spufs.txt
new file mode 100644
index 0000000..1343d11
--- /dev/null
+++ b/Documentation/filesystems/spufs.txt
@@ -0,0 +1,521 @@
+SPUFS(2) Linux Programmer's Manual SPUFS(2)
+
+
+
+NAME
+ spufs - the SPU file system
+
+
+DESCRIPTION
+ The SPU file system is used on PowerPC machines that implement the Cell
+ Broadband Engine Architecture in order to access Synergistic Processor
+ Units (SPUs).
+
+ The file system provides a name space similar to posix shared memory or
+ message queues. Users that have write permissions on the file system
+ can use spu_create(2) to establish SPU contexts in the spufs root.
+
+ Every SPU context is represented by a directory containing a predefined
+ set of files. These files can be used for manipulating the state of the
+ logical SPU. Users can change permissions on those files, but not actu-
+ ally add or remove files.
+
+
+MOUNT OPTIONS
+ uid=<uid>
+ set the user owning the mount point, the default is 0 (root).
+
+ gid=<gid>
+ set the group owning the mount point, the default is 0 (root).
+
+
+FILES
+ The files in spufs mostly follow the standard behavior for regular sys-
+ tem calls like read(2) or write(2), but often support only a subset of
+ the operations supported on regular file systems. This list details the
+ supported operations and the deviations from the behaviour in the
+ respective man pages.
+
+ All files that support the read(2) operation also support readv(2) and
+ all files that support the write(2) operation also support writev(2).
+ All files support the access(2) and stat(2) family of operations, but
+ only the st_mode, st_nlink, st_uid and st_gid fields of struct stat
+ contain reliable information.
+
+ All files support the chmod(2)/fchmod(2) and chown(2)/fchown(2) opera-
+ tions, but will not be able to grant permissions that contradict the
+ possible operations, e.g. read access on the wbox file.
+
+ The current set of files is:
+
+
+ /mem
+ the contents of the local storage memory of the SPU. This can be
+ accessed like a regular shared memory file and contains both code and
+ data in the address space of the SPU. The possible operations on an
+ open mem file are:
+
+ read(2), pread(2), write(2), pwrite(2), lseek(2)
+ These operate as documented, with the exception that seek(2),
+ write(2) and pwrite(2) are not supported beyond the end of the
+ file. The file size is the size of the local storage of the SPU,
+ which normally is 256 kilobytes.
+
+ mmap(2)
+ Mapping mem into the process address space gives access to the
+ SPU local storage within the process address space. Only
+ MAP_SHARED mappings are allowed.
+
+
+ /mbox
+ The first SPU to CPU communication mailbox. This file is read-only and
+ can be read in units of 32 bits. The file can only be used in non-
+ blocking mode and it even poll() will not block on it. The possible
+ operations on an open mbox file are:
+
+ read(2)
+ If a count smaller than four is requested, read returns -1 and
+ sets errno to EINVAL. If there is no data available in the mail
+ box, the return value is set to -1 and errno becomes EAGAIN.
+ When data has been read successfully, four bytes are placed in
+ the data buffer and the value four is returned.
+
+
+ /ibox
+ The second SPU to CPU communication mailbox. This file is similar to
+ the first mailbox file, but can be read in blocking I/O mode, and the
+ poll family of system calls can be used to wait for it. The possible
+ operations on an open ibox file are:
+
+ read(2)
+ If a count smaller than four is requested, read returns -1 and
+ sets errno to EINVAL. If there is no data available in the mail
+ box and the file descriptor has been opened with O_NONBLOCK, the
+ return value is set to -1 and errno becomes EAGAIN.
+
+ If there is no data available in the mail box and the file
+ descriptor has been opened without O_NONBLOCK, the call will
+ block until the SPU writes to its interrupt mailbox channel.
+ When data has been read successfully, four bytes are placed in
+ the data buffer and the value four is returned.
+
+ poll(2)
+ Poll on the ibox file returns (POLLIN | POLLRDNORM) whenever
+ data is available for reading.
+
+
+ /wbox
+ The CPU to SPU communation mailbox. It is write-only and can be written
+ in units of 32 bits. If the mailbox is full, write() will block and
+ poll can be used to wait for it becoming empty again. The possible
+ operations on an open wbox file are: write(2) If a count smaller than
+ four is requested, write returns -1 and sets errno to EINVAL. If there
+ is no space available in the mail box and the file descriptor has been
+ opened with O_NONBLOCK, the return value is set to -1 and errno becomes
+ EAGAIN.
+
+ If there is no space available in the mail box and the file descriptor
+ has been opened without O_NONBLOCK, the call will block until the SPU
+ reads from its PPE mailbox channel. When data has been read success-
+ fully, four bytes are placed in the data buffer and the value four is
+ returned.
+
+ poll(2)
+ Poll on the ibox file returns (POLLOUT | POLLWRNORM) whenever
+ space is available for writing.
+
+
+ /mbox_stat
+ /ibox_stat
+ /wbox_stat
+ Read-only files that contain the length of the current queue, i.e. how
+ many words can be read from mbox or ibox or how many words can be
+ written to wbox without blocking. The files can be read only in 4-byte
+ units and return a big-endian binary integer number. The possible
+ operations on an open *box_stat file are:
+
+ read(2)
+ If a count smaller than four is requested, read returns -1 and
+ sets errno to EINVAL. Otherwise, a four byte value is placed in
+ the data buffer, containing the number of elements that can be
+ read from (for mbox_stat and ibox_stat) or written to (for
+ wbox_stat) the respective mail box without blocking or resulting
+ in EAGAIN.
+
+
+ /npc
+ /decr
+ /decr_status
+ /spu_tag_mask
+ /event_mask
+ /srr0
+ Internal registers of the SPU. The representation is an ASCII string
+ with the numeric value of the next instruction to be executed. These
+ can be used in read/write mode for debugging, but normal operation of
+ programs should not rely on them because access to any of them except
+ npc requires an SPU context save and is therefore very inefficient.
+
+ The contents of these files are:
+
+ npc Next Program Counter
+
+ decr SPU Decrementer
+
+ decr_status Decrementer Status
+
+ spu_tag_mask MFC tag mask for SPU DMA
+
+ event_mask Event mask for SPU interrupts
+
+ srr0 Interrupt Return address register
+
+
+ The possible operations on an open npc, decr, decr_status,
+ spu_tag_mask, event_mask or srr0 file are:
+
+ read(2)
+ When the count supplied to the read call is shorter than the
+ required length for the pointer value plus a newline character,
+ subsequent reads from the same file descriptor will result in
+ completing the string, regardless of changes to the register by
+ a running SPU task. When a complete string has been read, all
+ subsequent read operations will return zero bytes and a new file
+ descriptor needs to be opened to read the value again.
+
+ write(2)
+ A write operation on the file results in setting the register to
+ the value given in the string. The string is parsed from the
+ beginning to the first non-numeric character or the end of the
+ buffer. Subsequent writes to the same file descriptor overwrite
+ the previous setting.
+
+
+ /fpcr
+ This file gives access to the Floating Point Status and Control Regis-
+ ter as a four byte long file. The operations on the fpcr file are:
+
+ read(2)
+ If a count smaller than four is requested, read returns -1 and
+ sets errno to EINVAL. Otherwise, a four byte value is placed in
+ the data buffer, containing the current value of the fpcr regis-
+ ter.
+
+ write(2)
+ If a count smaller than four is requested, write returns -1 and
+ sets errno to EINVAL. Otherwise, a four byte value is copied
+ from the data buffer, updating the value of the fpcr register.
+
+
+ /signal1
+ /signal2
+ The two signal notification channels of an SPU. These are read-write
+ files that operate on a 32 bit word. Writing to one of these files
+ triggers an interrupt on the SPU. The value written to the signal
+ files can be read from the SPU through a channel read or from host user
+ space through the file. After the value has been read by the SPU, it
+ is reset to zero. The possible operations on an open signal1 or sig-
+ nal2 file are:
+
+ read(2)
+ If a count smaller than four is requested, read returns -1 and
+ sets errno to EINVAL. Otherwise, a four byte value is placed in
+ the data buffer, containing the current value of the specified
+ signal notification register.
+
+ write(2)
+ If a count smaller than four is requested, write returns -1 and
+ sets errno to EINVAL. Otherwise, a four byte value is copied
+ from the data buffer, updating the value of the specified signal
+ notification register. The signal notification register will
+ either be replaced with the input data or will be updated to the
+ bitwise OR or the old value and the input data, depending on the
+ contents of the signal1_type, or signal2_type respectively,
+ file.
+
+
+ /signal1_type
+ /signal2_type
+ These two files change the behavior of the signal1 and signal2 notifi-
+ cation files. The contain a numerical ASCII string which is read as
+ either "1" or "0". In mode 0 (overwrite), the hardware replaces the
+ contents of the signal channel with the data that is written to it. in
+ mode 1 (logical OR), the hardware accumulates the bits that are subse-
+ quently written to it. The possible operations on an open signal1_type
+ or signal2_type file are:
+
+ read(2)
+ When the count supplied to the read call is shorter than the
+ required length for the digit plus a newline character, subse-
+ quent reads from the same file descriptor will result in com-
+ pleting the string. When a complete string has been read, all
+ subsequent read operations will return zero bytes and a new file
+ descriptor needs to be opened to read the value again.
+
+ write(2)
+ A write operation on the file results in setting the register to
+ the value given in the string. The string is parsed from the
+ beginning to the first non-numeric character or the end of the
+ buffer. Subsequent writes to the same file descriptor overwrite
+ the previous setting.
+
+
+EXAMPLES
+ /etc/fstab entry
+ none /spu spufs gid=spu 0 0
+
+
+AUTHORS
+ Arnd Bergmann <arndb@de.ibm.com>, Mark Nutter <mnutter@us.ibm.com>,
+ Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
+
+SEE ALSO
+ capabilities(7), close(2), spu_create(2), spu_run(2), spufs(7)
+
+
+
+Linux 2005-09-28 SPUFS(2)
+
+------------------------------------------------------------------------------
+
+SPU_RUN(2) Linux Programmer's Manual SPU_RUN(2)
+
+
+
+NAME
+ spu_run - execute an spu context
+
+
+SYNOPSIS
+ #include <sys/spu.h>
+
+ int spu_run(int fd, unsigned int *npc, unsigned int *event);
+
+DESCRIPTION
+ The spu_run system call is used on PowerPC machines that implement the
+ Cell Broadband Engine Architecture in order to access Synergistic Pro-
+ cessor Units (SPUs). It uses the fd that was returned from spu_cre-
+ ate(2) to address a specific SPU context. When the context gets sched-
+ uled to a physical SPU, it starts execution at the instruction pointer
+ passed in npc.
+
+ Execution of SPU code happens synchronously, meaning that spu_run does
+ not return while the SPU is still running. If there is a need to exe-
+ cute SPU code in parallel with other code on either the main CPU or
+ other SPUs, you need to create a new thread of execution first, e.g.
+ using the pthread_create(3) call.
+
+ When spu_run returns, the current value of the SPU instruction pointer
+ is written back to npc, so you can call spu_run again without updating
+ the pointers.
+
+ event can be a NULL pointer or point to an extended status code that
+ gets filled when spu_run returns. It can be one of the following con-
+ stants:
+
+ SPE_EVENT_DMA_ALIGNMENT
+ A DMA alignment error
+
+ SPE_EVENT_SPE_DATA_SEGMENT
+ A DMA segmentation error
+
+ SPE_EVENT_SPE_DATA_STORAGE
+ A DMA storage error
+
+ If NULL is passed as the event argument, these errors will result in a
+ signal delivered to the calling process.
+
+RETURN VALUE
+ spu_run returns the value of the spu_status register or -1 to indicate
+ an error and set errno to one of the error codes listed below. The
+ spu_status register value contains a bit mask of status codes and
+ optionally a 14 bit code returned from the stop-and-signal instruction
+ on the SPU. The bit masks for the status codes are:
+
+ 0x02 SPU was stopped by stop-and-signal.
+
+ 0x04 SPU was stopped by halt.
+
+ 0x08 SPU is waiting for a channel.
+
+ 0x10 SPU is in single-step mode.
+
+ 0x20 SPU has tried to execute an invalid instruction.
+
+ 0x40 SPU has tried to access an invalid channel.
+
+ 0x3fff0000
+ The bits masked with this value contain the code returned from
+ stop-and-signal.
+
+ There are always one or more of the lower eight bits set or an error
+ code is returned from spu_run.
+
+ERRORS
+ EAGAIN or EWOULDBLOCK
+ fd is in non-blocking mode and spu_run would block.
+
+ EBADF fd is not a valid file descriptor.
+
+ EFAULT npc is not a valid pointer or status is neither NULL nor a valid
+ pointer.
+
+ EINTR A signal occurred while spu_run was in progress. The npc value
+ has been updated to the new program counter value if necessary.
+
+ EINVAL fd is not a file descriptor returned from spu_create(2).
+
+ ENOMEM Insufficient memory was available to handle a page fault result-
+ ing from an MFC direct memory access.
+
+ ENOSYS the functionality is not provided by the current system, because
+ either the hardware does not provide SPUs or the spufs module is
+ not loaded.
+
+
+NOTES
+ spu_run is meant to be used from libraries that implement a more
+ abstract interface to SPUs, not to be used from regular applications.
+ See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec-
+ ommended libraries.
+
+
+CONFORMING TO
+ This call is Linux specific and only implemented by the ppc64 architec-
+ ture. Programs using this system call are not portable.
+
+
+BUGS
+ The code does not yet fully implement all features lined out here.
+
+
+AUTHOR
+ Arnd Bergmann <arndb@de.ibm.com>
+
+SEE ALSO
+ capabilities(7), close(2), spu_create(2), spufs(7)
+
+
+
+Linux 2005-09-28 SPU_RUN(2)
+
+------------------------------------------------------------------------------
+
+SPU_CREATE(2) Linux Programmer's Manual SPU_CREATE(2)
+
+
+
+NAME
+ spu_create - create a new spu context
+
+
+SYNOPSIS
+ #include <sys/types.h>
+ #include <sys/spu.h>
+
+ int spu_create(const char *pathname, int flags, mode_t mode);
+
+DESCRIPTION
+ The spu_create system call is used on PowerPC machines that implement
+ the Cell Broadband Engine Architecture in order to access Synergistic
+ Processor Units (SPUs). It creates a new logical context for an SPU in
+ pathname and returns a handle to associated with it. pathname must
+ point to a non-existing directory in the mount point of the SPU file
+ system (spufs). When spu_create is successful, a directory gets cre-
+ ated on pathname and it is populated with files.
+
+ The returned file handle can only be passed to spu_run(2) or closed,
+ other operations are not defined on it. When it is closed, all associ-
+ ated directory entries in spufs are removed. When the last file handle
+ pointing either inside of the context directory or to this file
+ descriptor is closed, the logical SPU context is destroyed.
+
+ The parameter flags can be zero or any bitwise or'd combination of the
+ following constants:
+
+ SPU_RAWIO
+ Allow mapping of some of the hardware registers of the SPU into
+ user space. This flag requires the CAP_SYS_RAWIO capability, see
+ capabilities(7).
+
+ The mode parameter specifies the permissions used for creating the new
+ directory in spufs. mode is modified with the user's umask(2) value
+ and then used for both the directory and the files contained in it. The
+ file permissions mask out some more bits of mode because they typically
+ support only read or write access. See stat(2) for a full list of the
+ possible mode values.
+
+
+RETURN VALUE
+ spu_create returns a new file descriptor. It may return -1 to indicate
+ an error condition and set errno to one of the error codes listed
+ below.
+
+
+ERRORS
+ EACCESS
+ The current user does not have write access on the spufs mount
+ point.
+
+ EEXIST An SPU context already exists at the given path name.
+
+ EFAULT pathname is not a valid string pointer in the current address
+ space.
+
+ EINVAL pathname is not a directory in the spufs mount point.
+
+ ELOOP Too many symlinks were found while resolving pathname.
+
+ EMFILE The process has reached its maximum open file limit.
+
+ ENAMETOOLONG
+ pathname was too long.
+
+ ENFILE The system has reached the global open file limit.
+
+ ENOENT Part of pathname could not be resolved.
+
+ ENOMEM The kernel could not allocate all resources required.
+
+ ENOSPC There are not enough SPU resources available to create a new
+ context or the user specific limit for the number of SPU con-
+ texts has been reached.
+
+ ENOSYS the functionality is not provided by the current system, because
+ either the hardware does not provide SPUs or the spufs module is
+ not loaded.
+
+ ENOTDIR
+ A part of pathname is not a directory.
+
+
+
+NOTES
+ spu_create is meant to be used from libraries that implement a more
+ abstract interface to SPUs, not to be used from regular applications.
+ See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec-
+ ommended libraries.
+
+
+FILES
+ pathname must point to a location beneath the mount point of spufs. By
+ convention, it gets mounted in /spu.
+
+
+CONFORMING TO
+ This call is Linux specific and only implemented by the ppc64 architec-
+ ture. Programs using this system call are not portable.
+
+
+BUGS
+ The code does not yet fully implement all features lined out here.
+
+
+AUTHOR
+ Arnd Bergmann <arndb@de.ibm.com>
+
+SEE ALSO
+ capabilities(7), close(2), spu_run(2), spufs(7)
+
+
+
+Linux 2005-09-28 SPU_CREATE(2)
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
new file mode 100644
index 0000000..de99749
--- /dev/null
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -0,0 +1,107 @@
+Accessing PCI device resources through sysfs
+--------------------------------------------
+
+sysfs, usually mounted at /sys, provides access to PCI resources on platforms
+that support it. For example, a given bus might look like this:
+
+ /sys/devices/pci0000:17
+ |-- 0000:17:00.0
+ | |-- class
+ | |-- config
+ | |-- device
+ | |-- enable
+ | |-- irq
+ | |-- local_cpus
+ | |-- resource
+ | |-- resource0
+ | |-- resource1
+ | |-- resource2
+ | |-- rom
+ | |-- subsystem_device
+ | |-- subsystem_vendor
+ | `-- vendor
+ `-- ...
+
+The topmost element describes the PCI domain and bus number. In this case,
+the domain number is 0000 and the bus number is 17 (both values are in hex).
+This bus contains a single function device in slot 0. The domain and bus
+numbers are reproduced for convenience. Under the device directory are several
+files, each with their own function.
+
+ file function
+ ---- --------
+ class PCI class (ascii, ro)
+ config PCI config space (binary, rw)
+ device PCI device (ascii, ro)
+ enable Whether the device is enabled (ascii, rw)
+ irq IRQ number (ascii, ro)
+ local_cpus nearby CPU mask (cpumask, ro)
+ resource PCI resource host addresses (ascii, ro)
+ resource0..N PCI resource N, if present (binary, mmap)
+ resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap)
+ rom PCI ROM resource, if present (binary, ro)
+ subsystem_device PCI subsystem device (ascii, ro)
+ subsystem_vendor PCI subsystem vendor (ascii, ro)
+ vendor PCI vendor (ascii, ro)
+
+ ro - read only file
+ rw - file is readable and writable
+ mmap - file is mmapable
+ ascii - file contains ascii text
+ binary - file contains binary data
+ cpumask - file contains a cpumask type
+
+The read only files are informational, writes to them will be ignored, with
+the exception of the 'rom' file. Writable files can be used to perform
+actions on the device (e.g. changing config space, detaching a device).
+mmapable files are available via an mmap of the file at offset 0 and can be
+used to do actual device programming from userspace. Note that some platforms
+don't support mmapping of certain resources, so be sure to check the return
+value from any attempted mmap.
+
+The 'enable' file provides a counter that indicates how many times the device
+has been enabled. If the 'enable' file currently returns '4', and a '1' is
+echoed into it, it will then return '5'. Echoing a '0' into it will decrease
+the count. Even when it returns to 0, though, some of the initialisation
+may not be reversed.
+
+The 'rom' file is special in that it provides read-only access to the device's
+ROM file, if available. It's disabled by default, however, so applications
+should write the string "1" to the file to enable it before attempting a read
+call, and disable it following the access by writing "0" to the file. Note
+that the device must be enabled for a rom read to return data succesfully.
+In the event a driver is not bound to the device, it can be enabled using the
+'enable' file, documented above.
+
+Accessing legacy resources through sysfs
+----------------------------------------
+
+Legacy I/O port and ISA memory resources are also provided in sysfs if the
+underlying platform supports them. They're located in the PCI class hierarchy,
+e.g.
+
+ /sys/class/pci_bus/0000:17/
+ |-- bridge -> ../../../devices/pci0000:17
+ |-- cpuaffinity
+ |-- legacy_io
+ `-- legacy_mem
+
+The legacy_io file is a read/write file that can be used by applications to
+do legacy port I/O. The application should open the file, seek to the desired
+port (e.g. 0x3e8) and do a read or a write of 1, 2 or 4 bytes. The legacy_mem
+file should be mmapped with an offset corresponding to the memory offset
+desired, e.g. 0xa0000 for the VGA frame buffer. The application can then
+simply dereference the returned pointer (after checking for errors of course)
+to access legacy memory space.
+
+Supporting PCI access on new platforms
+--------------------------------------
+
+In order to support PCI resource mapping as described above, Linux platform
+code must define HAVE_PCI_MMAP and provide a pci_mmap_page_range function.
+Platforms are free to only support subsets of the mmap functionality, but
+useful return codes should be provided.
+
+Legacy resources are protected by the HAVE_PCI_LEGACY define. Platforms
+wishing to support legacy functionality should define it and provide
+pci_legacy_read, pci_legacy_write and pci_mmap_legacy_page_range functions.
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
new file mode 100644
index 0000000..9e9c348
--- /dev/null
+++ b/Documentation/filesystems/sysfs.txt
@@ -0,0 +1,357 @@
+
+sysfs - _The_ filesystem for exporting kernel objects.
+
+Patrick Mochel <mochel@osdl.org>
+
+10 January 2003
+
+
+What it is:
+~~~~~~~~~~~
+
+sysfs is a ram-based filesystem initially based on ramfs. It provides
+a means to export kernel data structures, their attributes, and the
+linkages between them to userspace.
+
+sysfs is tied inherently to the kobject infrastructure. Please read
+Documentation/kobject.txt for more information concerning the kobject
+interface.
+
+
+Using sysfs
+~~~~~~~~~~~
+
+sysfs is always compiled in. You can access it by doing:
+
+ mount -t sysfs sysfs /sys
+
+
+Directory Creation
+~~~~~~~~~~~~~~~~~~
+
+For every kobject that is registered with the system, a directory is
+created for it in sysfs. That directory is created as a subdirectory
+of the kobject's parent, expressing internal object hierarchies to
+userspace. Top-level directories in sysfs represent the common
+ancestors of object hierarchies; i.e. the subsystems the objects
+belong to.
+
+Sysfs internally stores the kobject that owns the directory in the
+->d_fsdata pointer of the directory's dentry. This allows sysfs to do
+reference counting directly on the kobject when the file is opened and
+closed.
+
+
+Attributes
+~~~~~~~~~~
+
+Attributes can be exported for kobjects in the form of regular files in
+the filesystem. Sysfs forwards file I/O operations to methods defined
+for the attributes, providing a means to read and write kernel
+attributes.
+
+Attributes should be ASCII text files, preferably with only one value
+per file. It is noted that it may not be efficient to contain only one
+value per file, so it is socially acceptable to express an array of
+values of the same type.
+
+Mixing types, expressing multiple lines of data, and doing fancy
+formatting of data is heavily frowned upon. Doing these things may get
+you publically humiliated and your code rewritten without notice.
+
+
+An attribute definition is simply:
+
+struct attribute {
+ char * name;
+ mode_t mode;
+};
+
+
+int sysfs_create_file(struct kobject * kobj, struct attribute * attr);
+void sysfs_remove_file(struct kobject * kobj, struct attribute * attr);
+
+
+A bare attribute contains no means to read or write the value of the
+attribute. Subsystems are encouraged to define their own attribute
+structure and wrapper functions for adding and removing attributes for
+a specific object type.
+
+For example, the driver model defines struct device_attribute like:
+
+struct device_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device * dev, char * buf);
+ ssize_t (*store)(struct device * dev, const char * buf);
+};
+
+int device_create_file(struct device *, struct device_attribute *);
+void device_remove_file(struct device *, struct device_attribute *);
+
+It also defines this helper for defining device attributes:
+
+#define DEVICE_ATTR(_name, _mode, _show, _store) \
+struct device_attribute dev_attr_##_name = { \
+ .attr = {.name = __stringify(_name) , .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+};
+
+For example, declaring
+
+static DEVICE_ATTR(foo, S_IWUSR | S_IRUGO, show_foo, store_foo);
+
+is equivalent to doing:
+
+static struct device_attribute dev_attr_foo = {
+ .attr = {
+ .name = "foo",
+ .mode = S_IWUSR | S_IRUGO,
+ },
+ .show = show_foo,
+ .store = store_foo,
+};
+
+
+Subsystem-Specific Callbacks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When a subsystem defines a new attribute type, it must implement a
+set of sysfs operations for forwarding read and write calls to the
+show and store methods of the attribute owners.
+
+struct sysfs_ops {
+ ssize_t (*show)(struct kobject *, struct attribute *, char *);
+ ssize_t (*store)(struct kobject *, struct attribute *, const char *);
+};
+
+[ Subsystems should have already defined a struct kobj_type as a
+descriptor for this type, which is where the sysfs_ops pointer is
+stored. See the kobject documentation for more information. ]
+
+When a file is read or written, sysfs calls the appropriate method
+for the type. The method then translates the generic struct kobject
+and struct attribute pointers to the appropriate pointer types, and
+calls the associated methods.
+
+
+To illustrate:
+
+#define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr)
+#define to_dev(d) container_of(d, struct device, kobj)
+
+static ssize_t
+dev_attr_show(struct kobject * kobj, struct attribute * attr, char * buf)
+{
+ struct device_attribute * dev_attr = to_dev_attr(attr);
+ struct device * dev = to_dev(kobj);
+ ssize_t ret = 0;
+
+ if (dev_attr->show)
+ ret = dev_attr->show(dev, buf);
+ return ret;
+}
+
+
+
+Reading/Writing Attribute Data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To read or write attributes, show() or store() methods must be
+specified when declaring the attribute. The method types should be as
+simple as those defined for device attributes:
+
+ ssize_t (*show)(struct device * dev, char * buf);
+ ssize_t (*store)(struct device * dev, const char * buf);
+
+IOW, they should take only an object and a buffer as parameters.
+
+
+sysfs allocates a buffer of size (PAGE_SIZE) and passes it to the
+method. Sysfs will call the method exactly once for each read or
+write. This forces the following behavior on the method
+implementations:
+
+- On read(2), the show() method should fill the entire buffer.
+ Recall that an attribute should only be exporting one value, or an
+ array of similar values, so this shouldn't be that expensive.
+
+ This allows userspace to do partial reads and forward seeks
+ arbitrarily over the entire file at will. If userspace seeks back to
+ zero or does a pread(2) with an offset of '0' the show() method will
+ be called again, rearmed, to fill the buffer.
+
+- On write(2), sysfs expects the entire buffer to be passed during the
+ first write. Sysfs then passes the entire buffer to the store()
+ method.
+
+ When writing sysfs files, userspace processes should first read the
+ entire file, modify the values it wishes to change, then write the
+ entire buffer back.
+
+ Attribute method implementations should operate on an identical
+ buffer when reading and writing values.
+
+Other notes:
+
+- Writing causes the show() method to be rearmed regardless of current
+ file position.
+
+- The buffer will always be PAGE_SIZE bytes in length. On i386, this
+ is 4096.
+
+- show() methods should return the number of bytes printed into the
+ buffer. This is the return value of snprintf().
+
+- show() should always use snprintf().
+
+- store() should return the number of bytes used from the buffer. This
+ can be done using strlen().
+
+- show() or store() can always return errors. If a bad value comes
+ through, be sure to return an error.
+
+- The object passed to the methods will be pinned in memory via sysfs
+ referencing counting its embedded object. However, the physical
+ entity (e.g. device) the object represents may not be present. Be
+ sure to have a way to check this, if necessary.
+
+
+A very simple (and naive) implementation of a device attribute is:
+
+static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", dev->name);
+}
+
+static ssize_t store_name(struct device * dev, const char * buf)
+{
+ sscanf(buf, "%20s", dev->name);
+ return strnlen(buf, PAGE_SIZE);
+}
+
+static DEVICE_ATTR(name, S_IRUGO, show_name, store_name);
+
+
+(Note that the real implementation doesn't allow userspace to set the
+name for a device.)
+
+
+Top Level Directory Layout
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The sysfs directory arrangement exposes the relationship of kernel
+data structures.
+
+The top level sysfs directory looks like:
+
+block/
+bus/
+class/
+dev/
+devices/
+firmware/
+net/
+fs/
+
+devices/ contains a filesystem representation of the device tree. It maps
+directly to the internal kernel device tree, which is a hierarchy of
+struct device.
+
+bus/ contains flat directory layout of the various bus types in the
+kernel. Each bus's directory contains two subdirectories:
+
+ devices/
+ drivers/
+
+devices/ contains symlinks for each device discovered in the system
+that point to the device's directory under root/.
+
+drivers/ contains a directory for each device driver that is loaded
+for devices on that particular bus (this assumes that drivers do not
+span multiple bus types).
+
+fs/ contains a directory for some filesystems. Currently each
+filesystem wanting to export attributes must create its own hierarchy
+below fs/ (see ./fuse.txt for an example).
+
+dev/ contains two directories char/ and block/. Inside these two
+directories there are symlinks named <major>:<minor>. These symlinks
+point to the sysfs directory for the given device. /sys/dev provides a
+quick way to lookup the sysfs interface for a device from the result of
+a stat(2) operation.
+
+More information can driver-model specific features can be found in
+Documentation/driver-model/.
+
+
+TODO: Finish this section.
+
+
+Current Interfaces
+~~~~~~~~~~~~~~~~~~
+
+The following interface layers currently exist in sysfs:
+
+
+- devices (include/linux/device.h)
+----------------------------------
+Structure:
+
+struct device_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device * dev, char * buf);
+ ssize_t (*store)(struct device * dev, const char * buf);
+};
+
+Declaring:
+
+DEVICE_ATTR(_name, _str, _mode, _show, _store);
+
+Creation/Removal:
+
+int device_create_file(struct device *device, struct device_attribute * attr);
+void device_remove_file(struct device * dev, struct device_attribute * attr);
+
+
+- bus drivers (include/linux/device.h)
+--------------------------------------
+Structure:
+
+struct bus_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct bus_type *, char * buf);
+ ssize_t (*store)(struct bus_type *, const char * buf);
+};
+
+Declaring:
+
+BUS_ATTR(_name, _mode, _show, _store)
+
+Creation/Removal:
+
+int bus_create_file(struct bus_type *, struct bus_attribute *);
+void bus_remove_file(struct bus_type *, struct bus_attribute *);
+
+
+- device drivers (include/linux/device.h)
+-----------------------------------------
+
+Structure:
+
+struct driver_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device_driver *, char * buf);
+ ssize_t (*store)(struct device_driver *, const char * buf);
+};
+
+Declaring:
+
+DRIVER_ATTR(_name, _mode, _show, _store)
+
+Creation/Removal:
+
+int driver_create_file(struct device_driver *, struct driver_attribute *);
+void driver_remove_file(struct device_driver *, struct driver_attribute *);
+
+
diff --git a/Documentation/filesystems/sysv-fs.txt b/Documentation/filesystems/sysv-fs.txt
new file mode 100644
index 0000000..253b50d
--- /dev/null
+++ b/Documentation/filesystems/sysv-fs.txt
@@ -0,0 +1,197 @@
+It implements all of
+ - Xenix FS,
+ - SystemV/386 FS,
+ - Coherent FS.
+
+To install:
+* Answer the 'System V and Coherent filesystem support' question with 'y'
+ when configuring the kernel.
+* To mount a disk or a partition, use
+ mount [-r] -t sysv device mountpoint
+ The file system type names
+ -t sysv
+ -t xenix
+ -t coherent
+ may be used interchangeably, but the last two will eventually disappear.
+
+Bugs in the present implementation:
+- Coherent FS:
+ - The "free list interleave" n:m is currently ignored.
+ - Only file systems with no filesystem name and no pack name are recognized.
+ (See Coherent "man mkfs" for a description of these features.)
+- SystemV Release 2 FS:
+ The superblock is only searched in the blocks 9, 15, 18, which
+ corresponds to the beginning of track 1 on floppy disks. No support
+ for this FS on hard disk yet.
+
+
+These filesystems are rather similar. Here is a comparison with Minix FS:
+
+* Linux fdisk reports on partitions
+ - Minix FS 0x81 Linux/Minix
+ - Xenix FS ??
+ - SystemV FS ??
+ - Coherent FS 0x08 AIX bootable
+
+* Size of a block or zone (data allocation unit on disk)
+ - Minix FS 1024
+ - Xenix FS 1024 (also 512 ??)
+ - SystemV FS 1024 (also 512 and 2048)
+ - Coherent FS 512
+
+* General layout: all have one boot block, one super block and
+ separate areas for inodes and for directories/data.
+ On SystemV Release 2 FS (e.g. Microport) the first track is reserved and
+ all the block numbers (including the super block) are offset by one track.
+
+* Byte ordering of "short" (16 bit entities) on disk:
+ - Minix FS little endian 0 1
+ - Xenix FS little endian 0 1
+ - SystemV FS little endian 0 1
+ - Coherent FS little endian 0 1
+ Of course, this affects only the file system, not the data of files on it!
+
+* Byte ordering of "long" (32 bit entities) on disk:
+ - Minix FS little endian 0 1 2 3
+ - Xenix FS little endian 0 1 2 3
+ - SystemV FS little endian 0 1 2 3
+ - Coherent FS PDP-11 2 3 0 1
+ Of course, this affects only the file system, not the data of files on it!
+
+* Inode on disk: "short", 0 means non-existent, the root dir ino is:
+ - Minix FS 1
+ - Xenix FS, SystemV FS, Coherent FS 2
+
+* Maximum number of hard links to a file:
+ - Minix FS 250
+ - Xenix FS ??
+ - SystemV FS ??
+ - Coherent FS >=10000
+
+* Free inode management:
+ - Minix FS a bitmap
+ - Xenix FS, SystemV FS, Coherent FS
+ There is a cache of a certain number of free inodes in the super-block.
+ When it is exhausted, new free inodes are found using a linear search.
+
+* Free block management:
+ - Minix FS a bitmap
+ - Xenix FS, SystemV FS, Coherent FS
+ Free blocks are organized in a "free list". Maybe a misleading term,
+ since it is not true that every free block contains a pointer to
+ the next free block. Rather, the free blocks are organized in chunks
+ of limited size, and every now and then a free block contains pointers
+ to the free blocks pertaining to the next chunk; the first of these
+ contains pointers and so on. The list terminates with a "block number"
+ 0 on Xenix FS and SystemV FS, with a block zeroed out on Coherent FS.
+
+* Super-block location:
+ - Minix FS block 1 = bytes 1024..2047
+ - Xenix FS block 1 = bytes 1024..2047
+ - SystemV FS bytes 512..1023
+ - Coherent FS block 1 = bytes 512..1023
+
+* Super-block layout:
+ - Minix FS
+ unsigned short s_ninodes;
+ unsigned short s_nzones;
+ unsigned short s_imap_blocks;
+ unsigned short s_zmap_blocks;
+ unsigned short s_firstdatazone;
+ unsigned short s_log_zone_size;
+ unsigned long s_max_size;
+ unsigned short s_magic;
+ - Xenix FS, SystemV FS, Coherent FS
+ unsigned short s_firstdatazone;
+ unsigned long s_nzones;
+ unsigned short s_fzone_count;
+ unsigned long s_fzones[NICFREE];
+ unsigned short s_finode_count;
+ unsigned short s_finodes[NICINOD];
+ char s_flock;
+ char s_ilock;
+ char s_modified;
+ char s_rdonly;
+ unsigned long s_time;
+ short s_dinfo[4]; -- SystemV FS only
+ unsigned long s_free_zones;
+ unsigned short s_free_inodes;
+ short s_dinfo[4]; -- Xenix FS only
+ unsigned short s_interleave_m,s_interleave_n; -- Coherent FS only
+ char s_fname[6];
+ char s_fpack[6];
+ then they differ considerably:
+ Xenix FS
+ char s_clean;
+ char s_fill[371];
+ long s_magic;
+ long s_type;
+ SystemV FS
+ long s_fill[12 or 14];
+ long s_state;
+ long s_magic;
+ long s_type;
+ Coherent FS
+ unsigned long s_unique;
+ Note that Coherent FS has no magic.
+
+* Inode layout:
+ - Minix FS
+ unsigned short i_mode;
+ unsigned short i_uid;
+ unsigned long i_size;
+ unsigned long i_time;
+ unsigned char i_gid;
+ unsigned char i_nlinks;
+ unsigned short i_zone[7+1+1];
+ - Xenix FS, SystemV FS, Coherent FS
+ unsigned short i_mode;
+ unsigned short i_nlink;
+ unsigned short i_uid;
+ unsigned short i_gid;
+ unsigned long i_size;
+ unsigned char i_zone[3*(10+1+1+1)];
+ unsigned long i_atime;
+ unsigned long i_mtime;
+ unsigned long i_ctime;
+
+* Regular file data blocks are organized as
+ - Minix FS
+ 7 direct blocks
+ 1 indirect block (pointers to blocks)
+ 1 double-indirect block (pointer to pointers to blocks)
+ - Xenix FS, SystemV FS, Coherent FS
+ 10 direct blocks
+ 1 indirect block (pointers to blocks)
+ 1 double-indirect block (pointer to pointers to blocks)
+ 1 triple-indirect block (pointer to pointers to pointers to blocks)
+
+* Inode size, inodes per block
+ - Minix FS 32 32
+ - Xenix FS 64 16
+ - SystemV FS 64 16
+ - Coherent FS 64 8
+
+* Directory entry on disk
+ - Minix FS
+ unsigned short inode;
+ char name[14/30];
+ - Xenix FS, SystemV FS, Coherent FS
+ unsigned short inode;
+ char name[14];
+
+* Dir entry size, dir entries per block
+ - Minix FS 16/32 64/32
+ - Xenix FS 16 64
+ - SystemV FS 16 64
+ - Coherent FS 16 32
+
+* How to implement symbolic links such that the host fsck doesn't scream:
+ - Minix FS normal
+ - Xenix FS kludge: as regular files with chmod 1000
+ - SystemV FS ??
+ - Coherent FS kludge: as regular files with chmod 1000
+
+
+Notation: We often speak of a "block" but mean a zone (the allocation unit)
+and not the disk driver's notion of "block".
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
new file mode 100644
index 0000000..222437e
--- /dev/null
+++ b/Documentation/filesystems/tmpfs.txt
@@ -0,0 +1,136 @@
+Tmpfs is a file system which keeps all files in virtual memory.
+
+
+Everything in tmpfs is temporary in the sense that no files will be
+created on your hard drive. If you unmount a tmpfs instance,
+everything stored therein is lost.
+
+tmpfs puts everything into the kernel internal caches and grows and
+shrinks to accommodate the files it contains and is able to swap
+unneeded pages out to swap space. It has maximum size limits which can
+be adjusted on the fly via 'mount -o remount ...'
+
+If you compare it to ramfs (which was the template to create tmpfs)
+you gain swapping and limit checking. Another similar thing is the RAM
+disk (/dev/ram*), which simulates a fixed size hard disk in physical
+RAM, where you have to create an ordinary filesystem on top. Ramdisks
+cannot swap and you do not have the possibility to resize them.
+
+Since tmpfs lives completely in the page cache and on swap, all tmpfs
+pages currently in memory will show up as cached. It will not show up
+as shared or something like that. Further on you can check the actual
+RAM+swap use of a tmpfs instance with df(1) and du(1).
+
+
+tmpfs has the following uses:
+
+1) There is always a kernel internal mount which you will not see at
+ all. This is used for shared anonymous mappings and SYSV shared
+ memory.
+
+ This mount does not depend on CONFIG_TMPFS. If CONFIG_TMPFS is not
+ set, the user visible part of tmpfs is not build. But the internal
+ mechanisms are always present.
+
+2) glibc 2.2 and above expects tmpfs to be mounted at /dev/shm for
+ POSIX shared memory (shm_open, shm_unlink). Adding the following
+ line to /etc/fstab should take care of this:
+
+ tmpfs /dev/shm tmpfs defaults 0 0
+
+ Remember to create the directory that you intend to mount tmpfs on
+ if necessary.
+
+ This mount is _not_ needed for SYSV shared memory. The internal
+ mount is used for that. (In the 2.3 kernel versions it was
+ necessary to mount the predecessor of tmpfs (shm fs) to use SYSV
+ shared memory)
+
+3) Some people (including me) find it very convenient to mount it
+ e.g. on /tmp and /var/tmp and have a big swap partition. And now
+ loop mounts of tmpfs files do work, so mkinitrd shipped by most
+ distributions should succeed with a tmpfs /tmp.
+
+4) And probably a lot more I do not know about :-)
+
+
+tmpfs has three mount options for sizing:
+
+size: The limit of allocated bytes for this tmpfs instance. The
+ default is half of your physical RAM without swap. If you
+ oversize your tmpfs instances the machine will deadlock
+ since the OOM handler will not be able to free that memory.
+nr_blocks: The same as size, but in blocks of PAGE_CACHE_SIZE.
+nr_inodes: The maximum number of inodes for this instance. The default
+ is half of the number of your physical RAM pages, or (on a
+ machine with highmem) the number of lowmem RAM pages,
+ whichever is the lower.
+
+These parameters accept a suffix k, m or g for kilo, mega and giga and
+can be changed on remount. The size parameter also accepts a suffix %
+to limit this tmpfs instance to that percentage of your physical RAM:
+the default, when neither size nor nr_blocks is specified, is size=50%
+
+If nr_blocks=0 (or size=0), blocks will not be limited in that instance;
+if nr_inodes=0, inodes will not be limited. It is generally unwise to
+mount with such options, since it allows any user with write access to
+use up all the memory on the machine; but enhances the scalability of
+that instance in a system with many cpus making intensive use of it.
+
+
+tmpfs has a mount option to set the NUMA memory allocation policy for
+all files in that instance (if CONFIG_NUMA is enabled) - which can be
+adjusted on the fly via 'mount -o remount ...'
+
+mpol=default prefers to allocate memory from the local node
+mpol=prefer:Node prefers to allocate memory from the given Node
+mpol=bind:NodeList allocates memory only from nodes in NodeList
+mpol=interleave prefers to allocate from each node in turn
+mpol=interleave:NodeList allocates from each node of NodeList in turn
+
+NodeList format is a comma-separated list of decimal numbers and ranges,
+a range being two hyphen-separated decimal numbers, the smallest and
+largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15
+
+NUMA memory allocation policies have optional flags that can be used in
+conjunction with their modes. These optional flags can be specified
+when tmpfs is mounted by appending them to the mode before the NodeList.
+See Documentation/vm/numa_memory_policy.txt for a list of all available
+memory allocation policy mode flags.
+
+ =static is equivalent to MPOL_F_STATIC_NODES
+ =relative is equivalent to MPOL_F_RELATIVE_NODES
+
+For example, mpol=bind=static:NodeList, is the equivalent of an
+allocation policy of MPOL_BIND | MPOL_F_STATIC_NODES.
+
+Note that trying to mount a tmpfs with an mpol option will fail if the
+running kernel does not support NUMA; and will fail if its nodelist
+specifies a node which is not online. If your system relies on that
+tmpfs being mounted, but from time to time runs a kernel built without
+NUMA capability (perhaps a safe recovery kernel), or with fewer nodes
+online, then it is advisable to omit the mpol option from automatic
+mount options. It can be added later, when the tmpfs is already mounted
+on MountPoint, by 'mount -o remount,mpol=Policy:NodeList MountPoint'.
+
+
+To specify the initial root directory you can use the following mount
+options:
+
+mode: The permissions as an octal number
+uid: The user id
+gid: The group id
+
+These options do not have any effect on remount. You can change these
+parameters with chmod(1), chown(1) and chgrp(1) on a mounted filesystem.
+
+
+So 'mount -t tmpfs -o size=10G,nr_inodes=10k,mode=700 tmpfs /mytmpfs'
+will give you tmpfs instance on /mytmpfs which can allocate 10GB
+RAM/SWAP in 10240 inodes and it is only accessible by root.
+
+
+Author:
+ Christoph Rohland <cr@sap.com>, 1.12.01
+Updated:
+ Hugh Dickins <hugh@veritas.com>, 4 June 2007
diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt
new file mode 100644
index 0000000..dd84ea3
--- /dev/null
+++ b/Documentation/filesystems/ubifs.txt
@@ -0,0 +1,173 @@
+Introduction
+=============
+
+UBIFS file-system stands for UBI File System. UBI stands for "Unsorted
+Block Images". UBIFS is a flash file system, which means it is designed
+to work with flash devices. It is important to understand, that UBIFS
+is completely different to any traditional file-system in Linux, like
+Ext2, XFS, JFS, etc. UBIFS represents a separate class of file-systems
+which work with MTD devices, not block devices. The other Linux
+file-system of this class is JFFS2.
+
+To make it more clear, here is a small comparison of MTD devices and
+block devices.
+
+1 MTD devices represent flash devices and they consist of eraseblocks of
+ rather large size, typically about 128KiB. Block devices consist of
+ small blocks, typically 512 bytes.
+2 MTD devices support 3 main operations - read from some offset within an
+ eraseblock, write to some offset within an eraseblock, and erase a whole
+ eraseblock. Block devices support 2 main operations - read a whole
+ block and write a whole block.
+3 The whole eraseblock has to be erased before it becomes possible to
+ re-write its contents. Blocks may be just re-written.
+4 Eraseblocks become worn out after some number of erase cycles -
+ typically 100K-1G for SLC NAND and NOR flashes, and 1K-10K for MLC
+ NAND flashes. Blocks do not have the wear-out property.
+5 Eraseblocks may become bad (only on NAND flashes) and software should
+ deal with this. Blocks on hard drives typically do not become bad,
+ because hardware has mechanisms to substitute bad blocks, at least in
+ modern LBA disks.
+
+It should be quite obvious why UBIFS is very different to traditional
+file-systems.
+
+UBIFS works on top of UBI. UBI is a separate software layer which may be
+found in drivers/mtd/ubi. UBI is basically a volume management and
+wear-leveling layer. It provides so called UBI volumes which is a higher
+level abstraction than a MTD device. The programming model of UBI devices
+is very similar to MTD devices - they still consist of large eraseblocks,
+they have read/write/erase operations, but UBI devices are devoid of
+limitations like wear and bad blocks (items 4 and 5 in the above list).
+
+In a sense, UBIFS is a next generation of JFFS2 file-system, but it is
+very different and incompatible to JFFS2. The following are the main
+differences.
+
+* JFFS2 works on top of MTD devices, UBIFS depends on UBI and works on
+ top of UBI volumes.
+* JFFS2 does not have on-media index and has to build it while mounting,
+ which requires full media scan. UBIFS maintains the FS indexing
+ information on the flash media and does not require full media scan,
+ so it mounts many times faster than JFFS2.
+* JFFS2 is a write-through file-system, while UBIFS supports write-back,
+ which makes UBIFS much faster on writes.
+
+Similarly to JFFS2, UBIFS supports on-the-flight compression which makes
+it possible to fit quite a lot of data to the flash.
+
+Similarly to JFFS2, UBIFS is tolerant of unclean reboots and power-cuts.
+It does not need stuff like fsck.ext2. UBIFS automatically replays its
+journal and recovers from crashes, ensuring that the on-flash data
+structures are consistent.
+
+UBIFS scales logarithmically (most of the data structures it uses are
+trees), so the mount time and memory consumption do not linearly depend
+on the flash size, like in case of JFFS2. This is because UBIFS
+maintains the FS index on the flash media. However, UBIFS depends on
+UBI, which scales linearly. So overall UBI/UBIFS stack scales linearly.
+Nevertheless, UBI/UBIFS scales considerably better than JFFS2.
+
+The authors of UBIFS believe, that it is possible to develop UBI2 which
+would scale logarithmically as well. UBI2 would support the same API as UBI,
+but it would be binary incompatible to UBI. So UBIFS would not need to be
+changed to use UBI2
+
+
+Mount options
+=============
+
+(*) == default.
+
+norm_unmount (*) commit on unmount; the journal is committed
+ when the file-system is unmounted so that the
+ next mount does not have to replay the journal
+ and it becomes very fast;
+fast_unmount do not commit on unmount; this option makes
+ unmount faster, but the next mount slower
+ because of the need to replay the journal.
+bulk_read read more in one go to take advantage of flash
+ media that read faster sequentially
+no_bulk_read (*) do not bulk-read
+no_chk_data_crc skip checking of CRCs on data nodes in order to
+ improve read performance. Use this option only
+ if the flash media is highly reliable. The effect
+ of this option is that corruption of the contents
+ of a file can go unnoticed.
+chk_data_crc (*) do not skip checking CRCs on data nodes
+
+
+Quick usage instructions
+========================
+
+The UBI volume to mount is specified using "ubiX_Y" or "ubiX:NAME" syntax,
+where "X" is UBI device number, "Y" is UBI volume number, and "NAME" is
+UBI volume name.
+
+Mount volume 0 on UBI device 0 to /mnt/ubifs:
+$ mount -t ubifs ubi0_0 /mnt/ubifs
+
+Mount "rootfs" volume of UBI device 0 to /mnt/ubifs ("rootfs" is volume
+name):
+$ mount -t ubifs ubi0:rootfs /mnt/ubifs
+
+The following is an example of the kernel boot arguments to attach mtd0
+to UBI and mount volume "rootfs":
+ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs
+
+
+Module Parameters for Debugging
+===============================
+
+When UBIFS has been compiled with debugging enabled, there are 3 module
+parameters that are available to control aspects of testing and debugging.
+The parameters are unsigned integers where each bit controls an option.
+The parameters are:
+
+debug_msgs Selects which debug messages to display, as follows:
+
+ Message Type Flag value
+
+ General messages 1
+ Journal messages 2
+ Mount messages 4
+ Commit messages 8
+ LEB search messages 16
+ Budgeting messages 32
+ Garbage collection messages 64
+ Tree Node Cache (TNC) messages 128
+ LEB properties (lprops) messages 256
+ Input/output messages 512
+ Log messages 1024
+ Scan messages 2048
+ Recovery messages 4096
+
+debug_chks Selects extra checks that UBIFS can do while running:
+
+ Check Flag value
+
+ General checks 1
+ Check Tree Node Cache (TNC) 2
+ Check indexing tree size 4
+ Check orphan area 8
+ Check old indexing tree 16
+ Check LEB properties (lprops) 32
+ Check leaf nodes and inodes 64
+
+debug_tsts Selects a mode of testing, as follows:
+
+ Test mode Flag value
+
+ Force in-the-gaps method 2
+ Failure mode for recovery testing 4
+
+For example, set debug_msgs to 5 to display General messages and Mount
+messages.
+
+
+References
+==========
+
+UBIFS documentation and FAQ/HOWTO at the MTD web site:
+http://www.linux-mtd.infradead.org/doc/ubifs.html
+http://www.linux-mtd.infradead.org/faq/ubifs.html
diff --git a/Documentation/filesystems/udf.txt b/Documentation/filesystems/udf.txt
new file mode 100644
index 0000000..fde829a
--- /dev/null
+++ b/Documentation/filesystems/udf.txt
@@ -0,0 +1,80 @@
+*
+* Documentation/filesystems/udf.txt
+*
+UDF Filesystem version 0.9.8.1
+
+If you encounter problems with reading UDF discs using this driver,
+please report them to linux_udf@hpesjro.fc.hp.com, which is the
+developer's list.
+
+Write support requires a block driver which supports writing. Currently
+dvd+rw drives and media support true random sector writes, and so a udf
+filesystem on such devices can be directly mounted read/write. CD-RW
+media however, does not support this. Instead the media can be formatted
+for packet mode using the utility cdrwtool, then the pktcdvd driver can
+be bound to the underlying cd device to provide the required buffering
+and read-modify-write cycles to allow the filesystem random sector writes
+while providing the hardware with only full packet writes. While not
+required for dvd+rw media, use of the pktcdvd driver often enhances
+performance due to very poor read-modify-write support supplied internally
+by drive firmware.
+
+-------------------------------------------------------------------------------
+The following mount options are supported:
+
+ gid= Set the default group.
+ umask= Set the default umask.
+ uid= Set the default user.
+ bs= Set the block size.
+ unhide Show otherwise hidden files.
+ undelete Show deleted files in lists.
+ adinicb Embed data in the inode (default)
+ noadinicb Don't embed data in the inode
+ shortad Use short ad's
+ longad Use long ad's (default)
+ nostrict Unset strict conformance
+ iocharset= Set the NLS character set
+
+The uid= and gid= options need a bit more explaining. They will accept a
+decimal numeric value which will be used as the default ID for that mount.
+They will also accept the string "ignore" and "forget". For files on the disk
+that are owned by nobody ( -1 ), they will instead look as if they are owned
+by the default ID. The ignore option causes the default ID to override all
+IDs on the disk, not just -1. The forget option causes all IDs to be written
+to disk as -1, so when the media is later remounted, they will appear to be
+owned by whatever default ID it is mounted with at that time.
+
+For typical desktop use of removable media, you should set the ID to that
+of the interactively logged on user, and also specify both the forget and
+ignore options. This way the interactive user will always see the files
+on the disk as belonging to him.
+
+The remaining are for debugging and disaster recovery:
+
+ novrs Skip volume sequence recognition
+
+The following expect a offset from 0.
+
+ session= Set the CDROM session (default= last session)
+ anchor= Override standard anchor location. (default= 256)
+ volume= Override the VolumeDesc location. (unused)
+ partition= Override the PartitionDesc location. (unused)
+ lastblock= Set the last block of the filesystem/
+
+The following expect a offset from the partition root.
+
+ fileset= Override the fileset block location. (unused)
+ rootdir= Override the root directory location. (unused)
+ WARNING: overriding the rootdir to a non-directory may
+ yield highly unpredictable results.
+-------------------------------------------------------------------------------
+
+
+For the latest version and toolset see:
+ http://linux-udf.sourceforge.net/
+
+Documentation on UDF and ECMA 167 is available FREE from:
+ http://www.osta.org/
+ http://www.ecma-international.org/
+
+Ben Fennema <bfennema@falcon.csc.calpoly.edu>
diff --git a/Documentation/filesystems/ufs.txt b/Documentation/filesystems/ufs.txt
new file mode 100644
index 0000000..7a602ad
--- /dev/null
+++ b/Documentation/filesystems/ufs.txt
@@ -0,0 +1,60 @@
+USING UFS
+=========
+
+mount -t ufs -o ufstype=type_of_ufs device dir
+
+
+UFS OPTIONS
+===========
+
+ufstype=type_of_ufs
+ UFS is a file system widely used in different operating systems.
+ The problem are differences among implementations. Features of
+ some implementations are undocumented, so its hard to recognize
+ type of ufs automatically. That's why user must specify type of
+ ufs manually by mount option ufstype. Possible values are:
+
+ old old format of ufs
+ default value, supported as read-only
+
+ 44bsd used in FreeBSD, NetBSD, OpenBSD
+ supported as read-write
+
+ ufs2 used in FreeBSD 5.x
+ supported as read-write
+
+ 5xbsd synonym for ufs2
+
+ sun used in SunOS (Solaris)
+ supported as read-write
+
+ sunx86 used in SunOS for Intel (Solarisx86)
+ supported as read-write
+
+ hp used in HP-UX
+ supported as read-only
+
+ nextstep
+ used in NextStep
+ supported as read-only
+
+ nextstep-cd
+ used for NextStep CDROMs (block_size == 2048)
+ supported as read-only
+
+ openstep
+ used in OpenStep
+ supported as read-only
+
+
+POSSIBLE PROBLEMS
+=================
+
+See next section, if you have any.
+
+
+BUG REPORTS
+===========
+
+Any ufs bug report you can send to daniel.pirkl@email.cz or
+to dushistov@mail.ru (do not send partition tables bug reports).
diff --git a/Documentation/filesystems/unionfs/00-INDEX b/Documentation/filesystems/unionfs/00-INDEX
new file mode 100644
index 0000000..96fdf67
--- /dev/null
+++ b/Documentation/filesystems/unionfs/00-INDEX
@@ -0,0 +1,10 @@
+00-INDEX
+ - this file.
+concepts.txt
+ - A brief introduction of concepts.
+issues.txt
+ - A summary of known issues with unionfs.
+rename.txt
+ - Information regarding rename operations.
+usage.txt
+ - Usage information and examples.
diff --git a/Documentation/filesystems/unionfs/concepts.txt b/Documentation/filesystems/unionfs/concepts.txt
new file mode 100644
index 0000000..b853788
--- /dev/null
+++ b/Documentation/filesystems/unionfs/concepts.txt
@@ -0,0 +1,287 @@
+Unionfs 2.x CONCEPTS:
+=====================
+
+This file describes the concepts needed by a namespace unification file
+system.
+
+
+Branch Priority:
+================
+
+Each branch is assigned a unique priority - starting from 0 (highest
+priority). No two branches can have the same priority.
+
+
+Branch Mode:
+============
+
+Each branch is assigned a mode - read-write or read-only. This allows
+directories on media mounted read-write to be used in a read-only manner.
+
+
+Whiteouts:
+==========
+
+A whiteout removes a file name from the namespace. Whiteouts are needed when
+one attempts to remove a file on a read-only branch.
+
+Suppose we have a two-branch union, where branch 0 is read-write and branch
+1 is read-only. And a file 'foo' on branch 1:
+
+./b0/
+./b1/
+./b1/foo
+
+The unified view would simply be:
+
+./union/
+./union/foo
+
+Since 'foo' is stored on a read-only branch, it cannot be removed. A
+whiteout is used to remove the name 'foo' from the unified namespace. Again,
+since branch 1 is read-only, the whiteout cannot be created there. So, we
+try on a higher priority (lower numerically) branch and create the whiteout
+there.
+
+./b0/
+./b0/.wh.foo
+./b1/
+./b1/foo
+
+Later, when Unionfs traverses branches (due to lookup or readdir), it
+eliminate 'foo' from the namespace (as well as the whiteout itself.)
+
+
+Opaque Directories:
+===================
+
+Assume we have a unionfs mount comprising of two branches. Branch 0 is
+empty; branch 1 has the directory /a and file /a/f. Let's say we mount a
+union of branch 0 as read-write and branch 1 as read-only. Now, let's say
+we try to perform the following operation in the union:
+
+ rm -fr a
+
+Because branch 1 is not writable, we cannot physically remove the file /a/f
+or the directory /a. So instead, we will create a whiteout in branch 0
+named /.wh.a, masking out the name "a" from branch 1. Next, let's say we
+try to create a directory named "a" as follows:
+
+ mkdir a
+
+Because we have a whiteout for "a" already, Unionfs behaves as if "a"
+doesn't exist, and thus will delete the whiteout and replace it with an
+actual directory named "a".
+
+The problem now is that if you try to "ls" in the union, Unionfs will
+perform is normal directory name unification, for *all* directories named
+"a" in all branches. This will cause the file /a/f from branch 1 to
+re-appear in the union's namespace, which violates Unix semantics.
+
+To avoid this problem, we have a different form of whiteouts for
+directories, called "opaque directories" (same as BSD Union Mount does).
+Whenever we replace a whiteout with a directory, that directory is marked as
+opaque. In Unionfs 2.x, it means that we create a file named
+/a/.wh.__dir_opaque in branch 0, after having created directory /a there.
+When unionfs notices that a directory is opaque, it stops all namespace
+operations (including merging readdir contents) at that opaque directory.
+This prevents re-exposing names from masked out directories.
+
+
+Duplicate Elimination:
+======================
+
+It is possible for files on different branches to have the same name.
+Unionfs then has to select which instance of the file to show to the user.
+Given the fact that each branch has a priority associated with it, the
+simplest solution is to take the instance from the highest priority
+(numerically lowest value) and "hide" the others.
+
+
+Unlinking:
+=========
+
+Unlink operation on non-directory instances is optimized to remove the
+maximum possible objects in case multiple underlying branches have the same
+file name. The unlink operation will first try to delete file instances
+from highest priority branch and then move further to delete from remaining
+branches in order of their decreasing priority. Consider a case (F..D..F),
+where F is a file and D is a directory of the same name; here, some
+intermediate branch could have an empty directory instance with the same
+name, so this operation also tries to delete this directory instance and
+proceed further to delete from next possible lower priority branch. The
+unionfs unlink operation will smoothly delete the files with same name from
+all possible underlying branches. In case if some error occurs, it creates
+whiteout in highest priority branch that will hide file instance in rest of
+the branches. An error could occur either if an unlink operations in any of
+the underlying branch failed or if a branch has no write permission.
+
+This unlinking policy is known as "delete all" and it has the benefit of
+overall reducing the number of inodes used by duplicate files, and further
+reducing the total number of inodes consumed by whiteouts. The cost is of
+extra processing, but testing shows this extra processing is well worth the
+savings.
+
+
+Copyup:
+=======
+
+When a change is made to the contents of a file's data or meta-data, they
+have to be stored somewhere. The best way is to create a copy of the
+original file on a branch that is writable, and then redirect the write
+though to this copy. The copy must be made on a higher priority branch so
+that lookup and readdir return this newer "version" of the file rather than
+the original (see duplicate elimination).
+
+An entire unionfs mount can be read-only or read-write. If it's read-only,
+then none of the branches will be written to, even if some of the branches
+are physically writeable. If the unionfs mount is read-write, then the
+leftmost (highest priority) branch must be writeable (for copyup to take
+place); the remaining branches can be any mix of read-write and read-only.
+
+In a writeable mount, unionfs will create new files/dir in the leftmost
+branch. If one tries to modify a file in a read-only branch/media, unionfs
+will copyup the file to the leftmost branch and modify it there. If you try
+to modify a file from a writeable branch which is not the leftmost branch,
+then unionfs will modify it in that branch; this is useful if you, say,
+unify differnet packages (e.g., apache, sendmail, ftpd, etc.) and you want
+changes to specific package files to remain logically in the directory where
+they came from.
+
+Cache Coherency:
+================
+
+Unionfs users often want to be able to modify files and directories directly
+on the lower branches, and have those changes be visible at the Unionfs
+level. This means that data (e.g., pages) and meta-data (dentries, inodes,
+open files, etc.) have to be synchronized between the upper and lower
+layers. In other words, the newest changes from a layer below have to be
+propagated to the Unionfs layer above. If the two layers are not in sync, a
+cache incoherency ensues, which could lead to application failures and even
+oopses. The Linux kernel, however, has a rather limited set of mechanisms
+to ensure this inter-layer cache coherency---so Unionfs has to do most of
+the hard work on its own.
+
+Maintaining Invariants:
+
+The way Unionfs ensures cache coherency is as follows. At each entry point
+to a Unionfs file system method, we call a utility function to validate the
+primary objects of this method. Generally, we call unionfs_file_revalidate
+on open files, and __unionfs_d_revalidate_chain on dentries (which also
+validates inodes). These utility functions check to see whether the upper
+Unionfs object is in sync with any of the lower objects that it represents.
+The checks we perform include whether the Unionfs superblock has a newer
+generation number, or if any of the lower objects mtime's or ctime's are
+newer. (Note: generation numbers change when branch-management commands are
+issued, so in a way, maintaining cache coherency is also very important for
+branch-management.) If indeed we determine that any Unionfs object is no
+longer in sync with its lower counterparts, then we rebuild that object
+similarly to how we do so for branch-management.
+
+While rebuilding Unionfs's objects, we also purge any page mappings and
+truncate inode pages (see fs/unionfs/dentry.c:purge_inode_data). This is to
+ensure that Unionfs will re-get the newer data from the lower branches. We
+perform this purging only if the Unionfs operation in question is a reading
+operation; if Unionfs is performing a data writing operation (e.g., ->write,
+->commit_write, etc.) then we do NOT flush the lower mappings/pages: this is
+because (1) a self-deadlock could occur and (2) the upper Unionfs pages are
+considered more authoritative anyway, as they are newer and will overwrite
+any lower pages.
+
+Unionfs maintains the following important invariant regarding mtime's,
+ctime's, and atime's: the upper inode object's times are the max() of all of
+the lower ones. For non-directory objects, there's only one object below,
+so the mapping is simple; for directory objects, there could me multiple
+lower objects and we have to sync up with the newest one of all the lower
+ones. This invariant is important to maintain, especially for directories
+(besides, we need this to be POSIX compliant). A union could comprise
+multiple writable branches, each of which could change. If we don't reflect
+the newest possible mtime/ctime, some applications could fail. For example,
+NFSv2/v3 exports check for newer directory mtimes on the server to determine
+if the client-side attribute cache should be purged.
+
+To maintain these important invariants, of course, Unionfs carefully
+synchronizes upper and lower times in various places. For example, if we
+copy-up a file to a top-level branch, the parent directory where the file
+was copied up to will now have a new mtime: so after a successful copy-up,
+we sync up with the new top-level branch's parent directory mtime.
+
+Implementation:
+
+This cache-coherency implementation is efficient because it defers any
+synchronizing between the upper and lower layers until absolutely needed.
+Consider the example a common situation where users perform a lot of lower
+changes, such as untarring a whole package. While these take place,
+typically the user doesn't access the files via Unionfs; only after the
+lower changes are done, does the user try to access the lower files. With
+our cache-coherency implementation, the entirety of the changes to the lower
+branches will not result in a single CPU cycle spent at the Unionfs level
+until the user invokes a system call that goes through Unionfs.
+
+We have considered two alternate cache-coherency designs. (1) Using the
+dentry/inode notify functionality to register interest in finding out about
+any lower changes. This is a somewhat limited and also a heavy-handed
+approach which could result in many notifications to the Unionfs layer upon
+each small change at the lower layer (imagine a file being modified multiple
+times in rapid succession). (2) Rewriting the VFS to support explicit
+callbacks from lower objects to upper objects. We began exploring such an
+implementation, but found it to be very complicated--it would have resulted
+in massive VFS/MM changes which are unlikely to be accepted by the LKML
+community. We therefore believe that our current cache-coherency design and
+implementation represent the best approach at this time.
+
+Limitations:
+
+Our implementation works in that as long as a user process will have caused
+Unionfs to be called, directly or indirectly, even to just do
+->d_revalidate; then we will have purged the current Unionfs data and the
+process will see the new data. For example, a process that continually
+re-reads the same file's data will see the NEW data as soon as the lower
+file had changed, upon the next read(2) syscall (even if the file is still
+open!) However, this doesn't work when the process re-reads the open file's
+data via mmap(2) (unless the user unmaps/closes the file and remaps/reopens
+it). Once we respond to ->readpage(s), then the kernel maps the page into
+the process's address space and there doesn't appear to be a way to force
+the kernel to invalidate those pages/mappings, and force the process to
+re-issue ->readpage. If there's a way to invalidate active mappings and
+force a ->readpage, let us know please (invalidate_inode_pages2 doesn't do
+the trick).
+
+Our current Unionfs code has to perform many file-revalidation calls. It
+would be really nice if the VFS would export an optional file system hook
+->file_revalidate (similarly to dentry->d_revalidate) that will be called
+before each VFS op that has a "struct file" in it.
+
+Certain file systems have micro-second granularity (or better) for inode
+times, and asynchronous actions could cause those times to change with some
+small delay. In such cases, Unionfs may see a changed inode time that only
+differs by a tiny fraction of a second: such a change may be a false
+positive indication that the lower object has changed, whereas if unionfs
+waits a little longer, that false indication will not be seen. (These false
+positives are harmless, because they would at most cause unionfs to
+re-validate an object that may need no revalidation, and print a debugging
+message that clutters the console/logs.) Therefore, to minimize the chances
+of these situations, we delay the detection of changed times by a small
+factor of a few seconds, called UNIONFS_MIN_CC_TIME (which defaults to 3
+seconds, as does NFS). This means that we will detect the change, only a
+couple of seconds later, if indeed the time change persists in the lower
+file object. This delayed detection has an added performance benefit: we
+reduce the number of times that unionfs has to revalidate objects, in case
+there's a lot of concurrent activity on both the upper and lower objects,
+for the same file(s). Lastly, this delayed time attribute detection is
+similar to how NFS clients operate (e.g., acregmin).
+
+Finally, there is no way currently in Linux to prevent lower directories
+from being moved around (i.e., topology changes); there's no way to prevent
+modifications to directory sub-trees of whole file systems which are mounted
+read-write. It is therefore possible for in-flight operations in unionfs to
+take place, while a lower directory is being moved around. Therefore, if
+you try to, say, create a new file in a directory through unionfs, while the
+directory is being moved around directly, then the new file may get created
+in the new location where that directory was moved to. This is a somewhat
+similar behaviour in NFS: an NFS client could be creating a new file while
+th NFS server is moving th directory around; the file will get successfully
+created in the new location. (The one exception in unionfs is that if the
+branch is marked read-only by unionfs, then a copyup will take place.)
+
+For more information, see <http://unionfs.filesystems.org/>.
diff --git a/Documentation/filesystems/unionfs/issues.txt b/Documentation/filesystems/unionfs/issues.txt
new file mode 100644
index 0000000..f4b7e7e
--- /dev/null
+++ b/Documentation/filesystems/unionfs/issues.txt
@@ -0,0 +1,28 @@
+KNOWN Unionfs 2.x ISSUES:
+=========================
+
+1. Unionfs should not use lookup_one_len() on the underlying f/s as it
+ confuses NFSv4. Currently, unionfs_lookup() passes lookup intents to the
+ lower file-system, this eliminates part of the problem. The remaining
+ calls to lookup_one_len may need to be changed to pass an intent. We are
+ currently introducing VFS changes to fs/namei.c's do_path_lookup() to
+ allow proper file lookup and opening in stackable file systems.
+
+2. Lockdep (a debugging feature) isn't aware of stacking, and so it
+ incorrectly complains about locking problems. The problem boils down to
+ this: Lockdep considers all objects of a certain type to be in the same
+ class, for example, all inodes. Lockdep doesn't like to see a lock held
+ on two inodes within the same task, and warns that it could lead to a
+ deadlock. However, stackable file systems do precisely that: they lock
+ an upper object, and then a lower object, in a strict order to avoid
+ locking problems; in addition, Unionfs, as a fan-out file system, may
+ have to lock several lower inodes. We are currently looking into Lockdep
+ to see how to make it aware of stackable file systems. For now, we
+ temporarily disable lockdep when calling vfs methods on lower objects,
+ but only for those places where lockdep complained. While this solution
+ may seem unclean, it is not without precedent: other places in the kernel
+ also do similar temporary disabling, of course after carefully having
+ checked that it is the right thing to do. Anyway, you get any warnings
+ from Lockdep, please report them to the Unionfs maintainers.
+
+For more information, see <http://unionfs.filesystems.org/>.
diff --git a/Documentation/filesystems/unionfs/rename.txt b/Documentation/filesystems/unionfs/rename.txt
new file mode 100644
index 0000000..e20bb82
--- /dev/null
+++ b/Documentation/filesystems/unionfs/rename.txt
@@ -0,0 +1,31 @@
+Rename is a complex beast. The following table shows which rename(2) operations
+should succeed and which should fail.
+
+o: success
+E: error (either unionfs or vfs)
+X: EXDEV
+
+none = file does not exist
+file = file is a file
+dir = file is a empty directory
+child= file is a non-empty directory
+wh = file is a directory containing only whiteouts; this makes it logically
+ empty
+
+ none file dir child wh
+file o o E E E
+dir o E o E o
+child X E X E X
+wh o E o E o
+
+
+Renaming directories:
+=====================
+
+Whenever a empty (either physically or logically) directory is being renamed,
+the following sequence of events should take place:
+
+1) Remove whiteouts from both source and destination directory
+2) Rename source to destination
+3) Make destination opaque to prevent anything under it from showing up
+
diff --git a/Documentation/filesystems/unionfs/usage.txt b/Documentation/filesystems/unionfs/usage.txt
new file mode 100644
index 0000000..1adde69
--- /dev/null
+++ b/Documentation/filesystems/unionfs/usage.txt
@@ -0,0 +1,134 @@
+Unionfs is a stackable unification file system, which can appear to merge
+the contents of several directories (branches), while keeping their physical
+content separate. Unionfs is useful for unified source tree management,
+merged contents of split CD-ROM, merged separate software package
+directories, data grids, and more. Unionfs allows any mix of read-only and
+read-write branches, as well as insertion and deletion of branches anywhere
+in the fan-out. To maintain Unix semantics, Unionfs handles elimination of
+duplicates, partial-error conditions, and more.
+
+GENERAL SYNTAX
+==============
+
+# mount -t unionfs -o <OPTIONS>,<BRANCH-OPTIONS> none MOUNTPOINT
+
+OPTIONS can be any legal combination of:
+
+- ro # mount file system read-only
+- rw # mount file system read-write
+- remount # remount the file system (see Branch Management below)
+- incgen # increment generation no. (see Cache Consistency below)
+
+BRANCH-OPTIONS can be either (1) a list of branches given to the "dirs="
+option, or (2) a list of individual branch manipulation commands, combined
+with the "remount" option, and is further described in the "Branch
+Management" section below.
+
+The syntax for the "dirs=" mount option is:
+
+ dirs=branch[=ro|=rw][:...]
+
+The "dirs=" option takes a colon-delimited list of directories to compose
+the union, with an optional branch mode for each of those directories.
+Directories that come earlier (specified first, on the left) in the list
+have a higher precedence than those which come later. Additionally,
+read-only or read-write permissions of the branch can be specified by
+appending =ro or =rw (default) to each directory. See the Copyup section in
+concepts.txt, for a description of Unionfs's behavior when mixing read-only
+and read-write branches and mounts.
+
+Syntax:
+
+ dirs=/branch1[=ro|=rw]:/branch2[=ro|=rw]:...:/branchN[=ro|=rw]
+
+Example:
+
+ dirs=/writable_branch=rw:/read-only_branch=ro
+
+
+BRANCH MANAGEMENT
+=================
+
+Once you mount your union for the first time, using the "dirs=" option, you
+can then change the union's overall mode or reconfigure the branches, using
+the remount option, as follows.
+
+To downgrade a union from read-write to read-only:
+
+# mount -t unionfs -o remount,ro none MOUNTPOINT
+
+To upgrade a union from read-only to read-write:
+
+# mount -t unionfs -o remount,rw none MOUNTPOINT
+
+To delete a branch /foo, regardless where it is in the current union:
+
+# mount -t unionfs -o remount,del=/foo none MOUNTPOINT
+
+To insert (add) a branch /foo before /bar:
+
+# mount -t unionfs -o remount,add=/bar:/foo none MOUNTPOINT
+
+To insert (add) a branch /foo (with the "rw" mode flag) before /bar:
+
+# mount -t unionfs -o remount,add=/bar:/foo=rw none MOUNTPOINT
+
+To insert (add) a branch /foo (in "rw" mode) at the very beginning (i.e., a
+new highest-priority branch), you can use the above syntax, or use a short
+hand version as follows:
+
+# mount -t unionfs -o remount,add=/foo none MOUNTPOINT
+
+To append a branch to the very end (new lowest-priority branch):
+
+# mount -t unionfs -o remount,add=:/foo none MOUNTPOINT
+
+To append a branch to the very end (new lowest-priority branch), in
+read-only mode:
+
+# mount -t unionfs -o remount,add=:/foo=ro none MOUNTPOINT
+
+Finally, to change the mode of one existing branch, say /foo, from read-only
+to read-write, and change /bar from read-write to read-only:
+
+# mount -t unionfs -o remount,mode=/foo=rw,mode=/bar=ro none MOUNTPOINT
+
+Note: in Unionfs 2.x, you cannot set the leftmost branch to readonly because
+then Unionfs won't have any writable place for copyups to take place.
+Moreover, the VFS can get confused when it tries to modify something in a
+file system mounted read-write, but isn't permitted to write to it.
+Instead, you should set the whole union as readonly, as described above.
+If, however, you must set the leftmost branch as readonly, perhaps so you
+can get a snapshot of it at a point in time, then you should insert a new
+writable top-level branch, and mark the one you want as readonly. This can
+be accomplished as follows, assuming that /foo is your current leftmost
+branch:
+
+# mount -t tmpfs -o size=NNN /new
+# mount -t unionfs -o remount,add=/new,mode=/foo=ro none MOUNTPOINT
+<do what you want safely in /foo>
+# mount -t unionfs -o remount,del=/new,mode=/foo=rw none MOUNTPOINT
+<check if there's anything in /new you want to preserve>
+# umount /new
+
+CACHE CONSISTENCY
+=================
+
+If you modify any file on any of the lower branches directly, while there is
+a Unionfs 2.x mounted above any of those branches, you should tell Unionfs
+to purge its caches and re-get the objects. To do that, you have to
+increment the generation number of the superblock using the following
+command:
+
+# mount -t unionfs -o remount,incgen none MOUNTPOINT
+
+Note that the older way of incrementing the generation number using an
+ioctl, is no longer supported in Unionfs 2.0 and newer. Ioctls in general
+are not encouraged. Plus, an ioctl is per-file concept, whereas the
+generation number is a per-file-system concept. Worse, such an ioctl
+requires an open file, which then has to be invalidated by the very nature
+of the generation number increase (read: the old generation increase ioctl
+was pretty racy).
+
+
+For more information, see <http://unionfs.filesystems.org/>.
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
new file mode 100644
index 0000000..3a5ddc9
--- /dev/null
+++ b/Documentation/filesystems/vfat.txt
@@ -0,0 +1,289 @@
+USING VFAT
+----------------------------------------------------------------------
+To use the vfat filesystem, use the filesystem type 'vfat'. i.e.
+ mount -t vfat /dev/fd0 /mnt
+
+No special partition formatter is required. mkdosfs will work fine
+if you want to format from within Linux.
+
+VFAT MOUNT OPTIONS
+----------------------------------------------------------------------
+uid=### -- Set the owner of all files on this filesystem.
+ The default is the uid of current process.
+
+gid=### -- Set the group of all files on this filesystem.
+ The default is the gid of current process.
+
+umask=### -- The permission mask (for files and directories, see umask(1)).
+ The default is the umask of current process.
+
+dmask=### -- The permission mask for the directory.
+ The default is the umask of current process.
+
+fmask=### -- The permission mask for files.
+ The default is the umask of current process.
+
+allow_utime=### -- This option controls the permission check of mtime/atime.
+
+ 20 - If current process is in group of file's group ID,
+ you can change timestamp.
+ 2 - Other users can change timestamp.
+
+ The default is set from `dmask' option. (If the directory is
+ writable, utime(2) is also allowed. I.e. ~dmask & 022)
+
+ Normally utime(2) checks current process is owner of
+ the file, or it has CAP_FOWNER capability. But FAT
+ filesystem doesn't have uid/gid on disk, so normal
+ check is too unflexible. With this option you can
+ relax it.
+
+codepage=### -- Sets the codepage number for converting to shortname
+ characters on FAT filesystem.
+ By default, FAT_DEFAULT_CODEPAGE setting is used.
+
+iocharset=<name> -- Character set to use for converting between the
+ encoding is used for user visible filename and 16 bit
+ Unicode characters. Long filenames are stored on disk
+ in Unicode format, but Unix for the most part doesn't
+ know how to deal with Unicode.
+ By default, FAT_DEFAULT_IOCHARSET setting is used.
+
+ There is also an option of doing UTF-8 translations
+ with the utf8 option.
+
+ NOTE: "iocharset=utf8" is not recommended. If unsure,
+ you should consider the following option instead.
+
+utf8=<bool> -- UTF-8 is the filesystem safe version of Unicode that
+ is used by the console. It can be enabled for the
+ filesystem with this option. If 'uni_xlate' gets set,
+ UTF-8 gets disabled.
+
+uni_xlate=<bool> -- Translate unhandled Unicode characters to special
+ escaped sequences. This would let you backup and
+ restore filenames that are created with any Unicode
+ characters. Until Linux supports Unicode for real,
+ this gives you an alternative. Without this option,
+ a '?' is used when no translation is possible. The
+ escape character is ':' because it is otherwise
+ illegal on the vfat filesystem. The escape sequence
+ that gets used is ':' and the four digits of hexadecimal
+ unicode.
+
+nonumtail=<bool> -- When creating 8.3 aliases, normally the alias will
+ end in '~1' or tilde followed by some number. If this
+ option is set, then if the filename is
+ "longfilename.txt" and "longfile.txt" does not
+ currently exist in the directory, 'longfile.txt' will
+ be the short alias instead of 'longfi~1.txt'.
+
+usefree -- Use the "free clusters" value stored on FSINFO. It'll
+ be used to determine number of free clusters without
+ scanning disk. But it's not used by default, because
+ recent Windows don't update it correctly in some
+ case. If you are sure the "free clusters" on FSINFO is
+ correct, by this option you can avoid scanning disk.
+
+quiet -- Stops printing certain warning messages.
+
+check=s|r|n -- Case sensitivity checking setting.
+ s: strict, case sensitive
+ r: relaxed, case insensitive
+ n: normal, default setting, currently case insensitive
+
+nocase -- This was deprecated for vfat. Use shortname=win95 instead.
+
+shortname=lower|win95|winnt|mixed
+ -- Shortname display/create setting.
+ lower: convert to lowercase for display,
+ emulate the Windows 95 rule for create.
+ win95: emulate the Windows 95 rule for display/create.
+ winnt: emulate the Windows NT rule for display/create.
+ mixed: emulate the Windows NT rule for display,
+ emulate the Windows 95 rule for create.
+ Default setting is `lower'.
+
+tz=UTC -- Interpret timestamps as UTC rather than local time.
+ This option disables the conversion of timestamps
+ between local time (as used by Windows on FAT) and UTC
+ (which Linux uses internally). This is particularly
+ useful when mounting devices (like digital cameras)
+ that are set to UTC in order to avoid the pitfalls of
+ local time.
+
+showexec -- If set, the execute permission bits of the file will be
+ allowed only if the extension part of the name is .EXE,
+ .COM, or .BAT. Not set by default.
+
+debug -- Can be set, but unused by the current implementation.
+
+sys_immutable -- If set, ATTR_SYS attribute on FAT is handled as
+ IMMUTABLE flag on Linux. Not set by default.
+
+flush -- If set, the filesystem will try to flush to disk more
+ early than normal. Not set by default.
+
+rodir -- FAT has the ATTR_RO (read-only) attribute. But on Windows,
+ the ATTR_RO of the directory will be just ignored actually,
+ and is used by only applications as flag. E.g. it's setted
+ for the customized folder.
+
+ If you want to use ATTR_RO as read-only flag even for
+ the directory, set this option.
+
+<bool>: 0,1,yes,no,true,false
+
+TODO
+----------------------------------------------------------------------
+* Need to get rid of the raw scanning stuff. Instead, always use
+ a get next directory entry approach. The only thing left that uses
+ raw scanning is the directory renaming code.
+
+
+POSSIBLE PROBLEMS
+----------------------------------------------------------------------
+* vfat_valid_longname does not properly checked reserved names.
+* When a volume name is the same as a directory name in the root
+ directory of the filesystem, the directory name sometimes shows
+ up as an empty file.
+* autoconv option does not work correctly.
+
+BUG REPORTS
+----------------------------------------------------------------------
+If you have trouble with the VFAT filesystem, mail bug reports to
+chaffee@bmrc.cs.berkeley.edu. Please specify the filename
+and the operation that gave you trouble.
+
+TEST SUITE
+----------------------------------------------------------------------
+If you plan to make any modifications to the vfat filesystem, please
+get the test suite that comes with the vfat distribution at
+
+ http://bmrc.berkeley.edu/people/chaffee/vfat.html
+
+This tests quite a few parts of the vfat filesystem and additional
+tests for new features or untested features would be appreciated.
+
+NOTES ON THE STRUCTURE OF THE VFAT FILESYSTEM
+----------------------------------------------------------------------
+(This documentation was provided by Galen C. Hunt <gchunt@cs.rochester.edu>
+ and lightly annotated by Gordon Chaffee).
+
+This document presents a very rough, technical overview of my
+knowledge of the extended FAT file system used in Windows NT 3.5 and
+Windows 95. I don't guarantee that any of the following is correct,
+but it appears to be so.
+
+The extended FAT file system is almost identical to the FAT
+file system used in DOS versions up to and including 6.223410239847
+:-). The significant change has been the addition of long file names.
+These names support up to 255 characters including spaces and lower
+case characters as opposed to the traditional 8.3 short names.
+
+Here is the description of the traditional FAT entry in the current
+Windows 95 filesystem:
+
+ struct directory { // Short 8.3 names
+ unsigned char name[8]; // file name
+ unsigned char ext[3]; // file extension
+ unsigned char attr; // attribute byte
+ unsigned char lcase; // Case for base and extension
+ unsigned char ctime_ms; // Creation time, milliseconds
+ unsigned char ctime[2]; // Creation time
+ unsigned char cdate[2]; // Creation date
+ unsigned char adate[2]; // Last access date
+ unsigned char reserved[2]; // reserved values (ignored)
+ unsigned char time[2]; // time stamp
+ unsigned char date[2]; // date stamp
+ unsigned char start[2]; // starting cluster number
+ unsigned char size[4]; // size of the file
+ };
+
+The lcase field specifies if the base and/or the extension of an 8.3
+name should be capitalized. This field does not seem to be used by
+Windows 95 but it is used by Windows NT. The case of filenames is not
+completely compatible from Windows NT to Windows 95. It is not completely
+compatible in the reverse direction, however. Filenames that fit in
+the 8.3 namespace and are written on Windows NT to be lowercase will
+show up as uppercase on Windows 95.
+
+Note that the "start" and "size" values are actually little
+endian integer values. The descriptions of the fields in this
+structure are public knowledge and can be found elsewhere.
+
+With the extended FAT system, Microsoft has inserted extra
+directory entries for any files with extended names. (Any name which
+legally fits within the old 8.3 encoding scheme does not have extra
+entries.) I call these extra entries slots. Basically, a slot is a
+specially formatted directory entry which holds up to 13 characters of
+a file's extended name. Think of slots as additional labeling for the
+directory entry of the file to which they correspond. Microsoft
+prefers to refer to the 8.3 entry for a file as its alias and the
+extended slot directory entries as the file name.
+
+The C structure for a slot directory entry follows:
+
+ struct slot { // Up to 13 characters of a long name
+ unsigned char id; // sequence number for slot
+ unsigned char name0_4[10]; // first 5 characters in name
+ unsigned char attr; // attribute byte
+ unsigned char reserved; // always 0
+ unsigned char alias_checksum; // checksum for 8.3 alias
+ unsigned char name5_10[12]; // 6 more characters in name
+ unsigned char start[2]; // starting cluster number
+ unsigned char name11_12[4]; // last 2 characters in name
+ };
+
+If the layout of the slots looks a little odd, it's only
+because of Microsoft's efforts to maintain compatibility with old
+software. The slots must be disguised to prevent old software from
+panicking. To this end, a number of measures are taken:
+
+ 1) The attribute byte for a slot directory entry is always set
+ to 0x0f. This corresponds to an old directory entry with
+ attributes of "hidden", "system", "read-only", and "volume
+ label". Most old software will ignore any directory
+ entries with the "volume label" bit set. Real volume label
+ entries don't have the other three bits set.
+
+ 2) The starting cluster is always set to 0, an impossible
+ value for a DOS file.
+
+Because the extended FAT system is backward compatible, it is
+possible for old software to modify directory entries. Measures must
+be taken to ensure the validity of slots. An extended FAT system can
+verify that a slot does in fact belong to an 8.3 directory entry by
+the following:
+
+ 1) Positioning. Slots for a file always immediately proceed
+ their corresponding 8.3 directory entry. In addition, each
+ slot has an id which marks its order in the extended file
+ name. Here is a very abbreviated view of an 8.3 directory
+ entry and its corresponding long name slots for the file
+ "My Big File.Extension which is long":
+
+ <proceeding files...>
+ <slot #3, id = 0x43, characters = "h is long">
+ <slot #2, id = 0x02, characters = "xtension whic">
+ <slot #1, id = 0x01, characters = "My Big File.E">
+ <directory entry, name = "MYBIGFIL.EXT">
+
+ Note that the slots are stored from last to first. Slots
+ are numbered from 1 to N. The Nth slot is or'ed with 0x40
+ to mark it as the last one.
+
+ 2) Checksum. Each slot has an "alias_checksum" value. The
+ checksum is calculated from the 8.3 name using the
+ following algorithm:
+
+ for (sum = i = 0; i < 11; i++) {
+ sum = (((sum&1)<<7)|((sum&0xfe)>>1)) + name[i]
+ }
+
+ 3) If there is free space in the final slot, a Unicode NULL (0x0000)
+ is stored after the final character. After that, all unused
+ characters in the final slot are set to Unicode 0xFFFF.
+
+Finally, note that the extended name is stored in Unicode. Each Unicode
+character takes two bytes.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
new file mode 100644
index 0000000..5579bda
--- /dev/null
+++ b/Documentation/filesystems/vfs.txt
@@ -0,0 +1,1000 @@
+
+ Overview of the Linux Virtual File System
+
+ Original author: Richard Gooch <rgooch@atnf.csiro.au>
+
+ Last updated on June 24, 2007.
+
+ Copyright (C) 1999 Richard Gooch
+ Copyright (C) 2005 Pekka Enberg
+
+ This file is released under the GPLv2.
+
+
+Introduction
+============
+
+The Virtual File System (also known as the Virtual Filesystem Switch)
+is the software layer in the kernel that provides the filesystem
+interface to userspace programs. It also provides an abstraction
+within the kernel which allows different filesystem implementations to
+coexist.
+
+VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so
+on are called from a process context. Filesystem locking is described
+in the document Documentation/filesystems/Locking.
+
+
+Directory Entry Cache (dcache)
+------------------------------
+
+The VFS implements the open(2), stat(2), chmod(2), and similar system
+calls. The pathname argument that is passed to them is used by the VFS
+to search through the directory entry cache (also known as the dentry
+cache or dcache). This provides a very fast look-up mechanism to
+translate a pathname (filename) into a specific dentry. Dentries live
+in RAM and are never saved to disc: they exist only for performance.
+
+The dentry cache is meant to be a view into your entire filespace. As
+most computers cannot fit all dentries in the RAM at the same time,
+some bits of the cache are missing. In order to resolve your pathname
+into a dentry, the VFS may have to resort to creating dentries along
+the way, and then loading the inode. This is done by looking up the
+inode.
+
+
+The Inode Object
+----------------
+
+An individual dentry usually has a pointer to an inode. Inodes are
+filesystem objects such as regular files, directories, FIFOs and other
+beasts. They live either on the disc (for block device filesystems)
+or in the memory (for pseudo filesystems). Inodes that live on the
+disc are copied into the memory when required and changes to the inode
+are written back to disc. A single inode can be pointed to by multiple
+dentries (hard links, for example, do this).
+
+To look up an inode requires that the VFS calls the lookup() method of
+the parent directory inode. This method is installed by the specific
+filesystem implementation that the inode lives in. Once the VFS has
+the required dentry (and hence the inode), we can do all those boring
+things like open(2) the file, or stat(2) it to peek at the inode
+data. The stat(2) operation is fairly simple: once the VFS has the
+dentry, it peeks at the inode data and passes some of it back to
+userspace.
+
+
+The File Object
+---------------
+
+Opening a file requires another operation: allocation of a file
+structure (this is the kernel-side implementation of file
+descriptors). The freshly allocated file structure is initialized with
+a pointer to the dentry and a set of file operation member functions.
+These are taken from the inode data. The open() file method is then
+called so the specific filesystem implementation can do it's work. You
+can see that this is another switch performed by the VFS. The file
+structure is placed into the file descriptor table for the process.
+
+Reading, writing and closing files (and other assorted VFS operations)
+is done by using the userspace file descriptor to grab the appropriate
+file structure, and then calling the required file structure method to
+do whatever is required. For as long as the file is open, it keeps the
+dentry in use, which in turn means that the VFS inode is still in use.
+
+
+Registering and Mounting a Filesystem
+=====================================
+
+To register and unregister a filesystem, use the following API
+functions:
+
+ #include <linux/fs.h>
+
+ extern int register_filesystem(struct file_system_type *);
+ extern int unregister_filesystem(struct file_system_type *);
+
+The passed struct file_system_type describes your filesystem. When a
+request is made to mount a device onto a directory in your filespace,
+the VFS will call the appropriate get_sb() method for the specific
+filesystem. The dentry for the mount point will then be updated to
+point to the root inode for the new filesystem.
+
+You can see all filesystems that are registered to the kernel in the
+file /proc/filesystems.
+
+
+struct file_system_type
+-----------------------
+
+This describes the filesystem. As of kernel 2.6.22, the following
+members are defined:
+
+struct file_system_type {
+ const char *name;
+ int fs_flags;
+ int (*get_sb) (struct file_system_type *, int,
+ const char *, void *, struct vfsmount *);
+ void (*kill_sb) (struct super_block *);
+ struct module *owner;
+ struct file_system_type * next;
+ struct list_head fs_supers;
+ struct lock_class_key s_lock_key;
+ struct lock_class_key s_umount_key;
+};
+
+ name: the name of the filesystem type, such as "ext2", "iso9660",
+ "msdos" and so on
+
+ fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.)
+
+ get_sb: the method to call when a new instance of this
+ filesystem should be mounted
+
+ kill_sb: the method to call when an instance of this filesystem
+ should be unmounted
+
+ owner: for internal VFS use: you should initialize this to THIS_MODULE in
+ most cases.
+
+ next: for internal VFS use: you should initialize this to NULL
+
+ s_lock_key, s_umount_key: lockdep-specific
+
+The get_sb() method has the following arguments:
+
+ struct file_system_type *fs_type: describes the filesystem, partly initialized
+ by the specific filesystem code
+
+ int flags: mount flags
+
+ const char *dev_name: the device name we are mounting.
+
+ void *data: arbitrary mount options, usually comes as an ASCII
+ string (see "Mount Options" section)
+
+ struct vfsmount *mnt: a vfs-internal representation of a mount point
+
+The get_sb() method must determine if the block device specified
+in the dev_name and fs_type contains a filesystem of the type the method
+supports. If it succeeds in opening the named block device, it initializes a
+struct super_block descriptor for the filesystem contained by the block device.
+On failure it returns an error.
+
+The most interesting member of the superblock structure that the
+get_sb() method fills in is the "s_op" field. This is a pointer to
+a "struct super_operations" which describes the next level of the
+filesystem implementation.
+
+Usually, a filesystem uses one of the generic get_sb() implementations
+and provides a fill_super() method instead. The generic methods are:
+
+ get_sb_bdev: mount a filesystem residing on a block device
+
+ get_sb_nodev: mount a filesystem that is not backed by a device
+
+ get_sb_single: mount a filesystem which shares the instance between
+ all mounts
+
+A fill_super() method implementation has the following arguments:
+
+ struct super_block *sb: the superblock structure. The method fill_super()
+ must initialize this properly.
+
+ void *data: arbitrary mount options, usually comes as an ASCII
+ string (see "Mount Options" section)
+
+ int silent: whether or not to be silent on error
+
+
+The Superblock Object
+=====================
+
+A superblock object represents a mounted filesystem.
+
+
+struct super_operations
+-----------------------
+
+This describes how the VFS can manipulate the superblock of your
+filesystem. As of kernel 2.6.22, the following members are defined:
+
+struct super_operations {
+ struct inode *(*alloc_inode)(struct super_block *sb);
+ void (*destroy_inode)(struct inode *);
+
+ void (*dirty_inode) (struct inode *);
+ int (*write_inode) (struct inode *, int);
+ void (*drop_inode) (struct inode *);
+ void (*delete_inode) (struct inode *);
+ void (*put_super) (struct super_block *);
+ void (*write_super) (struct super_block *);
+ int (*sync_fs)(struct super_block *sb, int wait);
+ void (*write_super_lockfs) (struct super_block *);
+ void (*unlockfs) (struct super_block *);
+ int (*statfs) (struct dentry *, struct kstatfs *);
+ int (*remount_fs) (struct super_block *, int *, char *);
+ void (*clear_inode) (struct inode *);
+ void (*umount_begin) (struct super_block *);
+
+ int (*show_options)(struct seq_file *, struct vfsmount *);
+
+ ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+ ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+};
+
+All methods are called without any locks being held, unless otherwise
+noted. This means that most methods can block safely. All methods are
+only called from a process context (i.e. not from an interrupt handler
+or bottom half).
+
+ alloc_inode: this method is called by inode_alloc() to allocate memory
+ for struct inode and initialize it. If this function is not
+ defined, a simple 'struct inode' is allocated. Normally
+ alloc_inode will be used to allocate a larger structure which
+ contains a 'struct inode' embedded within it.
+
+ destroy_inode: this method is called by destroy_inode() to release
+ resources allocated for struct inode. It is only required if
+ ->alloc_inode was defined and simply undoes anything done by
+ ->alloc_inode.
+
+ dirty_inode: this method is called by the VFS to mark an inode dirty.
+
+ write_inode: this method is called when the VFS needs to write an
+ inode to disc. The second parameter indicates whether the write
+ should be synchronous or not, not all filesystems check this flag.
+
+ drop_inode: called when the last access to the inode is dropped,
+ with the inode_lock spinlock held.
+
+ This method should be either NULL (normal UNIX filesystem
+ semantics) or "generic_delete_inode" (for filesystems that do not
+ want to cache inodes - causing "delete_inode" to always be
+ called regardless of the value of i_nlink)
+
+ The "generic_delete_inode()" behavior is equivalent to the
+ old practice of using "force_delete" in the put_inode() case,
+ but does not have the races that the "force_delete()" approach
+ had.
+
+ delete_inode: called when the VFS wants to delete an inode
+
+ put_super: called when the VFS wishes to free the superblock
+ (i.e. unmount). This is called with the superblock lock held
+
+ write_super: called when the VFS superblock needs to be written to
+ disc. This method is optional
+
+ sync_fs: called when VFS is writing out all dirty data associated with
+ a superblock. The second parameter indicates whether the method
+ should wait until the write out has been completed. Optional.
+
+ write_super_lockfs: called when VFS is locking a filesystem and
+ forcing it into a consistent state. This method is currently
+ used by the Logical Volume Manager (LVM).
+
+ unlockfs: called when VFS is unlocking a filesystem and making it writable
+ again.
+
+ statfs: called when the VFS needs to get filesystem statistics. This
+ is called with the kernel lock held
+
+ remount_fs: called when the filesystem is remounted. This is called
+ with the kernel lock held
+
+ clear_inode: called then the VFS clears the inode. Optional
+
+ umount_begin: called when the VFS is unmounting a filesystem.
+
+ show_options: called by the VFS to show mount options for
+ /proc/<pid>/mounts. (see "Mount Options" section)
+
+ quota_read: called by the VFS to read from filesystem quota file.
+
+ quota_write: called by the VFS to write to filesystem quota file.
+
+Whoever sets up the inode is responsible for filling in the "i_op" field. This
+is a pointer to a "struct inode_operations" which describes the methods that
+can be performed on individual inodes.
+
+
+The Inode Object
+================
+
+An inode object represents an object within the filesystem.
+
+
+struct inode_operations
+-----------------------
+
+This describes how the VFS can manipulate an inode in your
+filesystem. As of kernel 2.6.22, the following members are defined:
+
+struct inode_operations {
+ int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
+ struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
+ int (*link) (struct dentry *,struct inode *,struct dentry *);
+ int (*unlink) (struct inode *,struct dentry *);
+ int (*symlink) (struct inode *,struct dentry *,const char *);
+ int (*mkdir) (struct inode *,struct dentry *,int);
+ int (*rmdir) (struct inode *,struct dentry *);
+ int (*mknod) (struct inode *,struct dentry *,int,dev_t);
+ int (*rename) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *);
+ int (*readlink) (struct dentry *, char __user *,int);
+ void * (*follow_link) (struct dentry *, struct nameidata *);
+ void (*put_link) (struct dentry *, struct nameidata *, void *);
+ void (*truncate) (struct inode *);
+ int (*permission) (struct inode *, int, struct nameidata *);
+ int (*setattr) (struct dentry *, struct iattr *);
+ int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
+ int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+ int (*removexattr) (struct dentry *, const char *);
+ void (*truncate_range)(struct inode *, loff_t, loff_t);
+};
+
+Again, all methods are called without any locks being held, unless
+otherwise noted.
+
+ create: called by the open(2) and creat(2) system calls. Only
+ required if you want to support regular files. The dentry you
+ get should not have an inode (i.e. it should be a negative
+ dentry). Here you will probably call d_instantiate() with the
+ dentry and the newly created inode
+
+ lookup: called when the VFS needs to look up an inode in a parent
+ directory. The name to look for is found in the dentry. This
+ method must call d_add() to insert the found inode into the
+ dentry. The "i_count" field in the inode structure should be
+ incremented. If the named inode does not exist a NULL inode
+ should be inserted into the dentry (this is called a negative
+ dentry). Returning an error code from this routine must only
+ be done on a real error, otherwise creating inodes with system
+ calls like create(2), mknod(2), mkdir(2) and so on will fail.
+ If you wish to overload the dentry methods then you should
+ initialise the "d_dop" field in the dentry; this is a pointer
+ to a struct "dentry_operations".
+ This method is called with the directory inode semaphore held
+
+ link: called by the link(2) system call. Only required if you want
+ to support hard links. You will probably need to call
+ d_instantiate() just as you would in the create() method
+
+ unlink: called by the unlink(2) system call. Only required if you
+ want to support deleting inodes
+
+ symlink: called by the symlink(2) system call. Only required if you
+ want to support symlinks. You will probably need to call
+ d_instantiate() just as you would in the create() method
+
+ mkdir: called by the mkdir(2) system call. Only required if you want
+ to support creating subdirectories. You will probably need to
+ call d_instantiate() just as you would in the create() method
+
+ rmdir: called by the rmdir(2) system call. Only required if you want
+ to support deleting subdirectories
+
+ mknod: called by the mknod(2) system call to create a device (char,
+ block) inode or a named pipe (FIFO) or socket. Only required
+ if you want to support creating these types of inodes. You
+ will probably need to call d_instantiate() just as you would
+ in the create() method
+
+ rename: called by the rename(2) system call to rename the object to
+ have the parent and name given by the second inode and dentry.
+
+ readlink: called by the readlink(2) system call. Only required if
+ you want to support reading symbolic links
+
+ follow_link: called by the VFS to follow a symbolic link to the
+ inode it points to. Only required if you want to support
+ symbolic links. This method returns a void pointer cookie
+ that is passed to put_link().
+
+ put_link: called by the VFS to release resources allocated by
+ follow_link(). The cookie returned by follow_link() is passed
+ to this method as the last parameter. It is used by
+ filesystems such as NFS where page cache is not stable
+ (i.e. page that was installed when the symbolic link walk
+ started might not be in the page cache at the end of the
+ walk).
+
+ truncate: called by the VFS to change the size of a file. The
+ i_size field of the inode is set to the desired size by the
+ VFS before this method is called. This method is called by
+ the truncate(2) system call and related functionality.
+
+ permission: called by the VFS to check for access rights on a POSIX-like
+ filesystem.
+
+ setattr: called by the VFS to set attributes for a file. This method
+ is called by chmod(2) and related system calls.
+
+ getattr: called by the VFS to get attributes of a file. This method
+ is called by stat(2) and related system calls.
+
+ setxattr: called by the VFS to set an extended attribute for a file.
+ Extended attribute is a name:value pair associated with an
+ inode. This method is called by setxattr(2) system call.
+
+ getxattr: called by the VFS to retrieve the value of an extended
+ attribute name. This method is called by getxattr(2) function
+ call.
+
+ listxattr: called by the VFS to list all extended attributes for a
+ given file. This method is called by listxattr(2) system call.
+
+ removexattr: called by the VFS to remove an extended attribute from
+ a file. This method is called by removexattr(2) system call.
+
+ truncate_range: a method provided by the underlying filesystem to truncate a
+ range of blocks , i.e. punch a hole somewhere in a file.
+
+
+The Address Space Object
+========================
+
+The address space object is used to group and manage pages in the page
+cache. It can be used to keep track of the pages in a file (or
+anything else) and also track the mapping of sections of the file into
+process address spaces.
+
+There are a number of distinct yet related services that an
+address-space can provide. These include communicating memory
+pressure, page lookup by address, and keeping track of pages tagged as
+Dirty or Writeback.
+
+The first can be used independently to the others. The VM can try to
+either write dirty pages in order to clean them, or release clean
+pages in order to reuse them. To do this it can call the ->writepage
+method on dirty pages, and ->releasepage on clean pages with
+PagePrivate set. Clean pages without PagePrivate and with no external
+references will be released without notice being given to the
+address_space.
+
+To achieve this functionality, pages need to be placed on an LRU with
+lru_cache_add and mark_page_active needs to be called whenever the
+page is used.
+
+Pages are normally kept in a radix tree index by ->index. This tree
+maintains information about the PG_Dirty and PG_Writeback status of
+each page, so that pages with either of these flags can be found
+quickly.
+
+The Dirty tag is primarily used by mpage_writepages - the default
+->writepages method. It uses the tag to find dirty pages to call
+->writepage on. If mpage_writepages is not used (i.e. the address
+provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is
+almost unused. write_inode_now and sync_inode do use it (through
+__sync_single_inode) to check if ->writepages has been successful in
+writing out the whole address_space.
+
+The Writeback tag is used by filemap*wait* and sync_page* functions,
+via wait_on_page_writeback_range, to wait for all writeback to
+complete. While waiting ->sync_page (if defined) will be called on
+each page that is found to require writeback.
+
+An address_space handler may attach extra information to a page,
+typically using the 'private' field in the 'struct page'. If such
+information is attached, the PG_Private flag should be set. This will
+cause various VM routines to make extra calls into the address_space
+handler to deal with that data.
+
+An address space acts as an intermediate between storage and
+application. Data is read into the address space a whole page at a
+time, and provided to the application either by copying of the page,
+or by memory-mapping the page.
+Data is written into the address space by the application, and then
+written-back to storage typically in whole pages, however the
+address_space has finer control of write sizes.
+
+The read process essentially only requires 'readpage'. The write
+process is more complicated and uses write_begin/write_end or
+set_page_dirty to write data into the address_space, and writepage,
+sync_page, and writepages to writeback data to storage.
+
+Adding and removing pages to/from an address_space is protected by the
+inode's i_mutex.
+
+When data is written to a page, the PG_Dirty flag should be set. It
+typically remains set until writepage asks for it to be written. This
+should clear PG_Dirty and set PG_Writeback. It can be actually
+written at any point after PG_Dirty is clear. Once it is known to be
+safe, PG_Writeback is cleared.
+
+Writeback makes use of a writeback_control structure...
+
+struct address_space_operations
+-------------------------------
+
+This describes how the VFS can manipulate mapping of a file to page cache in
+your filesystem. As of kernel 2.6.22, the following members are defined:
+
+struct address_space_operations {
+ int (*writepage)(struct page *page, struct writeback_control *wbc);
+ int (*readpage)(struct file *, struct page *);
+ int (*sync_page)(struct page *);
+ int (*writepages)(struct address_space *, struct writeback_control *);
+ int (*set_page_dirty)(struct page *page);
+ int (*readpages)(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages);
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
+ sector_t (*bmap)(struct address_space *, sector_t);
+ int (*invalidatepage) (struct page *, unsigned long);
+ int (*releasepage) (struct page *, int);
+ ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
+ loff_t offset, unsigned long nr_segs);
+ struct page* (*get_xip_page)(struct address_space *, sector_t,
+ int);
+ /* migrate the contents of a page to the specified target */
+ int (*migratepage) (struct page *, struct page *);
+ int (*launder_page) (struct page *);
+};
+
+ writepage: called by the VM to write a dirty page to backing store.
+ This may happen for data integrity reasons (i.e. 'sync'), or
+ to free up memory (flush). The difference can be seen in
+ wbc->sync_mode.
+ The PG_Dirty flag has been cleared and PageLocked is true.
+ writepage should start writeout, should set PG_Writeback,
+ and should make sure the page is unlocked, either synchronously
+ or asynchronously when the write operation completes.
+
+ If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to
+ try too hard if there are problems, and may choose to write out
+ other pages from the mapping if that is easier (e.g. due to
+ internal dependencies). If it chooses not to start writeout, it
+ should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep
+ calling ->writepage on that page.
+
+ See the file "Locking" for more details.
+
+ readpage: called by the VM to read a page from backing store.
+ The page will be Locked when readpage is called, and should be
+ unlocked and marked uptodate once the read completes.
+ If ->readpage discovers that it needs to unlock the page for
+ some reason, it can do so, and then return AOP_TRUNCATED_PAGE.
+ In this case, the page will be relocated, relocked and if
+ that all succeeds, ->readpage will be called again.
+
+ sync_page: called by the VM to notify the backing store to perform all
+ queued I/O operations for a page. I/O operations for other pages
+ associated with this address_space object may also be performed.
+
+ This function is optional and is called only for pages with
+ PG_Writeback set while waiting for the writeback to complete.
+
+ writepages: called by the VM to write out pages associated with the
+ address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then
+ the writeback_control will specify a range of pages that must be
+ written out. If it is WBC_SYNC_NONE, then a nr_to_write is given
+ and that many pages should be written if possible.
+ If no ->writepages is given, then mpage_writepages is used
+ instead. This will choose pages from the address space that are
+ tagged as DIRTY and will pass them to ->writepage.
+
+ set_page_dirty: called by the VM to set a page dirty.
+ This is particularly needed if an address space attaches
+ private data to a page, and that data needs to be updated when
+ a page is dirtied. This is called, for example, when a memory
+ mapped page gets modified.
+ If defined, it should set the PageDirty flag, and the
+ PAGECACHE_TAG_DIRTY tag in the radix tree.
+
+ readpages: called by the VM to read pages associated with the address_space
+ object. This is essentially just a vector version of
+ readpage. Instead of just one page, several pages are
+ requested.
+ readpages is only used for read-ahead, so read errors are
+ ignored. If anything goes wrong, feel free to give up.
+
+ write_begin:
+ Called by the generic buffered write code to ask the filesystem to
+ prepare to write len bytes at the given offset in the file. The
+ address_space should check that the write will be able to complete,
+ by allocating space if necessary and doing any other internal
+ housekeeping. If the write will update parts of any basic-blocks on
+ storage, then those blocks should be pre-read (if they haven't been
+ read already) so that the updated blocks can be written out properly.
+
+ The filesystem must return the locked pagecache page for the specified
+ offset, in *pagep, for the caller to write into.
+
+ It must be able to cope with short writes (where the length passed to
+ write_begin is greater than the number of bytes copied into the page).
+
+ flags is a field for AOP_FLAG_xxx flags, described in
+ include/linux/fs.h.
+
+ A void * may be returned in fsdata, which then gets passed into
+ write_end.
+
+ Returns 0 on success; < 0 on failure (which is the error code), in
+ which case write_end is not called.
+
+ write_end: After a successful write_begin, and data copy, write_end must
+ be called. len is the original len passed to write_begin, and copied
+ is the amount that was able to be copied (copied == len is always true
+ if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).
+
+ The filesystem must take care of unlocking the page and releasing it
+ refcount, and updating i_size.
+
+ Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
+ that were able to be copied into pagecache.
+
+ bmap: called by the VFS to map a logical block offset within object to
+ physical block number. This method is used by the FIBMAP
+ ioctl and for working with swap-files. To be able to swap to
+ a file, the file must have a stable mapping to a block
+ device. The swap system does not go through the filesystem
+ but instead uses bmap to find out where the blocks in the file
+ are and uses those addresses directly.
+
+
+ invalidatepage: If a page has PagePrivate set, then invalidatepage
+ will be called when part or all of the page is to be removed
+ from the address space. This generally corresponds to either a
+ truncation or a complete invalidation of the address space
+ (in the latter case 'offset' will always be 0).
+ Any private data associated with the page should be updated
+ to reflect this truncation. If offset is 0, then
+ the private data should be released, because the page
+ must be able to be completely discarded. This may be done by
+ calling the ->releasepage function, but in this case the
+ release MUST succeed.
+
+ releasepage: releasepage is called on PagePrivate pages to indicate
+ that the page should be freed if possible. ->releasepage
+ should remove any private data from the page and clear the
+ PagePrivate flag. It may also remove the page from the
+ address_space. If this fails for some reason, it may indicate
+ failure with a 0 return value.
+ This is used in two distinct though related cases. The first
+ is when the VM finds a clean page with no active users and
+ wants to make it a free page. If ->releasepage succeeds, the
+ page will be removed from the address_space and become free.
+
+ The second case is when a request has been made to invalidate
+ some or all pages in an address_space. This can happen
+ through the fadvice(POSIX_FADV_DONTNEED) system call or by the
+ filesystem explicitly requesting it as nfs and 9fs do (when
+ they believe the cache may be out of date with storage) by
+ calling invalidate_inode_pages2().
+ If the filesystem makes such a call, and needs to be certain
+ that all pages are invalidated, then its releasepage will
+ need to ensure this. Possibly it can clear the PageUptodate
+ bit if it cannot free private data yet.
+
+ direct_IO: called by the generic read/write routines to perform
+ direct_IO - that is IO requests which bypass the page cache
+ and transfer data directly between the storage and the
+ application's address space.
+
+ get_xip_page: called by the VM to translate a block number to a page.
+ The page is valid until the corresponding filesystem is unmounted.
+ Filesystems that want to use execute-in-place (XIP) need to implement
+ it. An example implementation can be found in fs/ext2/xip.c.
+
+ migrate_page: This is used to compact the physical memory usage.
+ If the VM wants to relocate a page (maybe off a memory card
+ that is signalling imminent failure) it will pass a new page
+ and an old page to this function. migrate_page should
+ transfer any private data across and update any references
+ that it has to the page.
+
+ launder_page: Called before freeing a page - it writes back the dirty page. To
+ prevent redirtying the page, it is kept locked during the whole
+ operation.
+
+The File Object
+===============
+
+A file object represents a file opened by a process.
+
+
+struct file_operations
+----------------------
+
+This describes how the VFS can manipulate an open file. As of kernel
+2.6.22, the following members are defined:
+
+struct file_operations {
+ struct module *owner;
+ loff_t (*llseek) (struct file *, loff_t, int);
+ ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
+ ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
+ ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ int (*readdir) (struct file *, void *, filldir_t);
+ unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
+ long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+ long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+ int (*mmap) (struct file *, struct vm_area_struct *);
+ int (*open) (struct inode *, struct file *);
+ int (*flush) (struct file *);
+ int (*release) (struct inode *, struct file *);
+ int (*fsync) (struct file *, struct dentry *, int datasync);
+ int (*aio_fsync) (struct kiocb *, int datasync);
+ int (*fasync) (int, struct file *, int);
+ int (*lock) (struct file *, int, struct file_lock *);
+ ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
+ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
+ ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
+ ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
+ unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ int (*check_flags)(int);
+ int (*dir_notify)(struct file *filp, unsigned long arg);
+ int (*flock) (struct file *, int, struct file_lock *);
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
+ ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
+};
+
+Again, all methods are called without any locks being held, unless
+otherwise noted.
+
+ llseek: called when the VFS needs to move the file position index
+
+ read: called by read(2) and related system calls
+
+ aio_read: called by io_submit(2) and other asynchronous I/O operations
+
+ write: called by write(2) and related system calls
+
+ aio_write: called by io_submit(2) and other asynchronous I/O operations
+
+ readdir: called when the VFS needs to read the directory contents
+
+ poll: called by the VFS when a process wants to check if there is
+ activity on this file and (optionally) go to sleep until there
+ is activity. Called by the select(2) and poll(2) system calls
+
+ ioctl: called by the ioctl(2) system call
+
+ unlocked_ioctl: called by the ioctl(2) system call. Filesystems that do not
+ require the BKL should use this method instead of the ioctl() above.
+
+ compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
+ are used on 64 bit kernels.
+
+ mmap: called by the mmap(2) system call
+
+ open: called by the VFS when an inode should be opened. When the VFS
+ opens a file, it creates a new "struct file". It then calls the
+ open method for the newly allocated file structure. You might
+ think that the open method really belongs in
+ "struct inode_operations", and you may be right. I think it's
+ done the way it is because it makes filesystems simpler to
+ implement. The open() method is a good place to initialize the
+ "private_data" member in the file structure if you want to point
+ to a device structure
+
+ flush: called by the close(2) system call to flush a file
+
+ release: called when the last reference to an open file is closed
+
+ fsync: called by the fsync(2) system call
+
+ fasync: called by the fcntl(2) system call when asynchronous
+ (non-blocking) mode is enabled for a file
+
+ lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW
+ commands
+
+ readv: called by the readv(2) system call
+
+ writev: called by the writev(2) system call
+
+ sendfile: called by the sendfile(2) system call
+
+ get_unmapped_area: called by the mmap(2) system call
+
+ check_flags: called by the fcntl(2) system call for F_SETFL command
+
+ dir_notify: called by the fcntl(2) system call for F_NOTIFY command
+
+ flock: called by the flock(2) system call
+
+ splice_write: called by the VFS to splice data from a pipe to a file. This
+ method is used by the splice(2) system call
+
+ splice_read: called by the VFS to splice data from file to a pipe. This
+ method is used by the splice(2) system call
+
+Note that the file operations are implemented by the specific
+filesystem in which the inode resides. When opening a device node
+(character or block special) most filesystems will call special
+support routines in the VFS which will locate the required device
+driver information. These support routines replace the filesystem file
+operations with those for the device driver, and then proceed to call
+the new open() method for the file. This is how opening a device file
+in the filesystem eventually ends up calling the device driver open()
+method.
+
+
+Directory Entry Cache (dcache)
+==============================
+
+
+struct dentry_operations
+------------------------
+
+This describes how a filesystem can overload the standard dentry
+operations. Dentries and the dcache are the domain of the VFS and the
+individual filesystem implementations. Device drivers have no business
+here. These methods may be set to NULL, as they are either optional or
+the VFS uses a default. As of kernel 2.6.22, the following members are
+defined:
+
+struct dentry_operations {
+ int (*d_revalidate)(struct dentry *, struct nameidata *);
+ int (*d_hash) (struct dentry *, struct qstr *);
+ int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
+ int (*d_delete)(struct dentry *);
+ void (*d_release)(struct dentry *);
+ void (*d_iput)(struct dentry *, struct inode *);
+ char *(*d_dname)(struct dentry *, char *, int);
+};
+
+ d_revalidate: called when the VFS needs to revalidate a dentry. This
+ is called whenever a name look-up finds a dentry in the
+ dcache. Most filesystems leave this as NULL, because all their
+ dentries in the dcache are valid
+
+ d_hash: called when the VFS adds a dentry to the hash table
+
+ d_compare: called when a dentry should be compared with another
+
+ d_delete: called when the last reference to a dentry is
+ deleted. This means no-one is using the dentry, however it is
+ still valid and in the dcache
+
+ d_release: called when a dentry is really deallocated
+
+ d_iput: called when a dentry loses its inode (just prior to its
+ being deallocated). The default when this is NULL is that the
+ VFS calls iput(). If you define this method, you must call
+ iput() yourself
+
+ d_dname: called when the pathname of a dentry should be generated.
+ Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay
+ pathname generation. (Instead of doing it when dentry is created,
+ it's done only when the path is needed.). Real filesystems probably
+ dont want to use it, because their dentries are present in global
+ dcache hash, so their hash should be an invariant. As no lock is
+ held, d_dname() should not try to modify the dentry itself, unless
+ appropriate SMP safety is used. CAUTION : d_path() logic is quite
+ tricky. The correct way to return for example "Hello" is to put it
+ at the end of the buffer, and returns a pointer to the first char.
+ dynamic_dname() helper function is provided to take care of this.
+
+Example :
+
+static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
+{
+ return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+ dentry->d_inode->i_ino);
+}
+
+Each dentry has a pointer to its parent dentry, as well as a hash list
+of child dentries. Child dentries are basically like files in a
+directory.
+
+
+Directory Entry Cache API
+--------------------------
+
+There are a number of functions defined which permit a filesystem to
+manipulate dentries:
+
+ dget: open a new handle for an existing dentry (this just increments
+ the usage count)
+
+ dput: close a handle for a dentry (decrements the usage count). If
+ the usage count drops to 0, the "d_delete" method is called
+ and the dentry is placed on the unused list if the dentry is
+ still in its parents hash list. Putting the dentry on the
+ unused list just means that if the system needs some RAM, it
+ goes through the unused list of dentries and deallocates them.
+ If the dentry has already been unhashed and the usage count
+ drops to 0, in this case the dentry is deallocated after the
+ "d_delete" method is called
+
+ d_drop: this unhashes a dentry from its parents hash list. A
+ subsequent call to dput() will deallocate the dentry if its
+ usage count drops to 0
+
+ d_delete: delete a dentry. If there are no other open references to
+ the dentry then the dentry is turned into a negative dentry
+ (the d_iput() method is called). If there are other
+ references, then d_drop() is called instead
+
+ d_add: add a dentry to its parents hash list and then calls
+ d_instantiate()
+
+ d_instantiate: add a dentry to the alias hash list for the inode and
+ updates the "d_inode" member. The "i_count" member in the
+ inode structure should be set/incremented. If the inode
+ pointer is NULL, the dentry is called a "negative
+ dentry". This function is commonly called when an inode is
+ created for an existing negative dentry
+
+ d_lookup: look up a dentry given its parent and path name component
+ It looks up the child of that given name from the dcache
+ hash table. If it is found, the reference count is incremented
+ and the dentry is returned. The caller must use d_put()
+ to free the dentry when it finishes using it.
+
+For further information on dentry locking, please refer to the document
+Documentation/filesystems/dentry-locking.txt.
+
+Mount Options
+=============
+
+Parsing options
+---------------
+
+On mount and remount the filesystem is passed a string containing a
+comma separated list of mount options. The options can have either of
+these forms:
+
+ option
+ option=value
+
+The <linux/parser.h> header defines an API that helps parse these
+options. There are plenty of examples on how to use it in existing
+filesystems.
+
+Showing options
+---------------
+
+If a filesystem accepts mount options, it must define show_options()
+to show all the currently active options. The rules are:
+
+ - options MUST be shown which are not default or their values differ
+ from the default
+
+ - options MAY be shown which are enabled by default or have their
+ default value
+
+Options used only internally between a mount helper and the kernel
+(such as file descriptors), or which only have an effect during the
+mounting (such as ones controlling the creation of a journal) are exempt
+from the above rules.
+
+The underlying reason for the above rules is to make sure, that a
+mount can be accurately replicated (e.g. umounting and mounting again)
+based on the information found in /proc/mounts.
+
+A simple method of saving options at mount/remount time and showing
+them is provided with the save_mount_options() and
+generic_show_options() helper functions. Please note, that using
+these may have drawbacks. For more info see header comments for these
+functions in fs/namespace.c.
+
+Resources
+=========
+
+(Note some of these resources are not up-to-date with the latest kernel
+ version.)
+
+Creating Linux virtual filesystems. 2002
+ <http://lwn.net/Articles/13325/>
+
+The Linux Virtual File-system Layer by Neil Brown. 1999
+ <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html>
+
+A tour of the Linux VFS by Michael K. Johnson. 1996
+ <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html>
+
+A small trail through the Linux kernel by Andries Brouwer. 2001
+ <http://www.win.tue.nl/~aeb/linux/vfs/trail.html>
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
new file mode 100644
index 0000000..0a1668b
--- /dev/null
+++ b/Documentation/filesystems/xfs.txt
@@ -0,0 +1,261 @@
+
+The SGI XFS Filesystem
+======================
+
+XFS is a high performance journaling filesystem which originated
+on the SGI IRIX platform. It is completely multi-threaded, can
+support large files and large filesystems, extended attributes,
+variable block sizes, is extent based, and makes extensive use of
+Btrees (directories, extents, free space) to aid both performance
+and scalability.
+
+Refer to the documentation at http://oss.sgi.com/projects/xfs/
+for further details. This implementation is on-disk compatible
+with the IRIX version of XFS.
+
+
+Mount Options
+=============
+
+When mounting an XFS filesystem, the following options are accepted.
+
+ allocsize=size
+ Sets the buffered I/O end-of-file preallocation size when
+ doing delayed allocation writeout (default size is 64KiB).
+ Valid values for this option are page size (typically 4KiB)
+ through to 1GiB, inclusive, in power-of-2 increments.
+
+ attr2/noattr2
+ The options enable/disable (default is disabled for backward
+ compatibility on-disk) an "opportunistic" improvement to be
+ made in the way inline extended attributes are stored on-disk.
+ When the new form is used for the first time (by setting or
+ removing extended attributes) the on-disk superblock feature
+ bit field will be updated to reflect this format being in use.
+
+ barrier
+ Enables the use of block layer write barriers for writes into
+ the journal and unwritten extent conversion. This allows for
+ drive level write caching to be enabled, for devices that
+ support write barriers.
+
+ dmapi
+ Enable the DMAPI (Data Management API) event callouts.
+ Use with the "mtpt" option.
+
+ grpid/bsdgroups and nogrpid/sysvgroups
+ These options define what group ID a newly created file gets.
+ When grpid is set, it takes the group ID of the directory in
+ which it is created; otherwise (the default) it takes the fsgid
+ of the current process, unless the directory has the setgid bit
+ set, in which case it takes the gid from the parent directory,
+ and also gets the setgid bit set if it is a directory itself.
+
+ ihashsize=value
+ In memory inode hashes have been removed, so this option has
+ no function as of August 2007. Option is deprecated.
+
+ ikeep/noikeep
+ When ikeep is specified, XFS does not delete empty inode clusters
+ and keeps them around on disk. ikeep is the traditional XFS
+ behaviour. When noikeep is specified, empty inode clusters
+ are returned to the free space pool. The default is noikeep for
+ non-DMAPI mounts, while ikeep is the default when DMAPI is in use.
+
+ inode64
+ Indicates that XFS is allowed to create inodes at any location
+ in the filesystem, including those which will result in inode
+ numbers occupying more than 32 bits of significance. This is
+ provided for backwards compatibility, but causes problems for
+ backup applications that cannot handle large inode numbers.
+
+ largeio/nolargeio
+ If "nolargeio" is specified, the optimal I/O reported in
+ st_blksize by stat(2) will be as small as possible to allow user
+ applications to avoid inefficient read/modify/write I/O.
+ If "largeio" specified, a filesystem that has a "swidth" specified
+ will return the "swidth" value (in bytes) in st_blksize. If the
+ filesystem does not have a "swidth" specified but does specify
+ an "allocsize" then "allocsize" (in bytes) will be returned
+ instead.
+ If neither of these two options are specified, then filesystem
+ will behave as if "nolargeio" was specified.
+
+ logbufs=value
+ Set the number of in-memory log buffers. Valid numbers range
+ from 2-8 inclusive.
+ The default value is 8 buffers for filesystems with a
+ blocksize of 64KiB, 4 buffers for filesystems with a blocksize
+ of 32KiB, 3 buffers for filesystems with a blocksize of 16KiB
+ and 2 buffers for all other configurations. Increasing the
+ number of buffers may increase performance on some workloads
+ at the cost of the memory used for the additional log buffers
+ and their associated control structures.
+
+ logbsize=value
+ Set the size of each in-memory log buffer.
+ Size may be specified in bytes, or in kilobytes with a "k" suffix.
+ Valid sizes for version 1 and version 2 logs are 16384 (16k) and
+ 32768 (32k). Valid sizes for version 2 logs also include
+ 65536 (64k), 131072 (128k) and 262144 (256k).
+ The default value for machines with more than 32MiB of memory
+ is 32768, machines with less memory use 16384 by default.
+
+ logdev=device and rtdev=device
+ Use an external log (metadata journal) and/or real-time device.
+ An XFS filesystem has up to three parts: a data section, a log
+ section, and a real-time section. The real-time section is
+ optional, and the log section can be separate from the data
+ section or contained within it.
+
+ mtpt=mountpoint
+ Use with the "dmapi" option. The value specified here will be
+ included in the DMAPI mount event, and should be the path of
+ the actual mountpoint that is used.
+
+ noalign
+ Data allocations will not be aligned at stripe unit boundaries.
+
+ noatime
+ Access timestamps are not updated when a file is read.
+
+ norecovery
+ The filesystem will be mounted without running log recovery.
+ If the filesystem was not cleanly unmounted, it is likely to
+ be inconsistent when mounted in "norecovery" mode.
+ Some files or directories may not be accessible because of this.
+ Filesystems mounted "norecovery" must be mounted read-only or
+ the mount will fail.
+
+ nouuid
+ Don't check for double mounted file systems using the file system uuid.
+ This is useful to mount LVM snapshot volumes.
+
+ osyncisosync
+ Make O_SYNC writes implement true O_SYNC. WITHOUT this option,
+ Linux XFS behaves as if an "osyncisdsync" option is used,
+ which will make writes to files opened with the O_SYNC flag set
+ behave as if the O_DSYNC flag had been used instead.
+ This can result in better performance without compromising
+ data safety.
+ However if this option is not in effect, timestamp updates from
+ O_SYNC writes can be lost if the system crashes.
+ If timestamp updates are critical, use the osyncisosync option.
+
+ uquota/usrquota/uqnoenforce/quota
+ User disk quota accounting enabled, and limits (optionally)
+ enforced. Refer to xfs_quota(8) for further details.
+
+ gquota/grpquota/gqnoenforce
+ Group disk quota accounting enabled and limits (optionally)
+ enforced. Refer to xfs_quota(8) for further details.
+
+ pquota/prjquota/pqnoenforce
+ Project disk quota accounting enabled and limits (optionally)
+ enforced. Refer to xfs_quota(8) for further details.
+
+ sunit=value and swidth=value
+ Used to specify the stripe unit and width for a RAID device or
+ a stripe volume. "value" must be specified in 512-byte block
+ units.
+ If this option is not specified and the filesystem was made on
+ a stripe volume or the stripe width or unit were specified for
+ the RAID device at mkfs time, then the mount system call will
+ restore the value from the superblock. For filesystems that
+ are made directly on RAID devices, these options can be used
+ to override the information in the superblock if the underlying
+ disk layout changes after the filesystem has been created.
+ The "swidth" option is required if the "sunit" option has been
+ specified, and must be a multiple of the "sunit" value.
+
+ swalloc
+ Data allocations will be rounded up to stripe width boundaries
+ when the current end of file is being extended and the file
+ size is larger than the stripe width size.
+
+
+sysctls
+=======
+
+The following sysctls are available for the XFS filesystem:
+
+ fs.xfs.stats_clear (Min: 0 Default: 0 Max: 1)
+ Setting this to "1" clears accumulated XFS statistics
+ in /proc/fs/xfs/stat. It then immediately resets to "0".
+
+ fs.xfs.xfssyncd_centisecs (Min: 100 Default: 3000 Max: 720000)
+ The interval at which the xfssyncd thread flushes metadata
+ out to disk. This thread will flush log activity out, and
+ do some processing on unlinked inodes.
+
+ fs.xfs.xfsbufd_centisecs (Min: 50 Default: 100 Max: 3000)
+ The interval at which xfsbufd scans the dirty metadata buffers list.
+
+ fs.xfs.age_buffer_centisecs (Min: 100 Default: 1500 Max: 720000)
+ The age at which xfsbufd flushes dirty metadata buffers to disk.
+
+ fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
+ A volume knob for error reporting when internal errors occur.
+ This will generate detailed messages & backtraces for filesystem
+ shutdowns, for example. Current threshold values are:
+
+ XFS_ERRLEVEL_OFF: 0
+ XFS_ERRLEVEL_LOW: 1
+ XFS_ERRLEVEL_HIGH: 5
+
+ fs.xfs.panic_mask (Min: 0 Default: 0 Max: 127)
+ Causes certain error conditions to call BUG(). Value is a bitmask;
+ AND together the tags which represent errors which should cause panics:
+
+ XFS_NO_PTAG 0
+ XFS_PTAG_IFLUSH 0x00000001
+ XFS_PTAG_LOGRES 0x00000002
+ XFS_PTAG_AILDELETE 0x00000004
+ XFS_PTAG_ERROR_REPORT 0x00000008
+ XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
+ XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
+ XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
+
+ This option is intended for debugging only.
+
+ fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1)
+ Controls whether symlinks are created with mode 0777 (default)
+ or whether their mode is affected by the umask (irix mode).
+
+ fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1)
+ Controls files created in SGID directories.
+ If the group ID of the new file does not match the effective group
+ ID or one of the supplementary group IDs of the parent dir, the
+ ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
+ is set.
+
+ fs.xfs.restrict_chown (Min: 0 Default: 1 Max: 1)
+ Controls whether unprivileged users can use chown to "give away"
+ a file to another user.
+
+ fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "sync" flag set
+ by the xfs_io(8) chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.inherit_nodump (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "nodump" flag set
+ by the xfs_io(8) chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.inherit_noatime (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "noatime" flag set
+ by the xfs_io(8) chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.inherit_nosymlinks (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "nosymlinks" flag set
+ by the xfs_io(8) chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256)
+ In "inode32" allocation mode, this option determines how many
+ files the allocator attempts to allocate in the same allocation
+ group before moving to the next allocation group. The intent
+ is to control the rate at which the allocator moves between
+ allocation groups when allocating extents for new files.
diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt
new file mode 100644
index 0000000..0466ee5
--- /dev/null
+++ b/Documentation/filesystems/xip.txt
@@ -0,0 +1,68 @@
+Execute-in-place for file mappings
+----------------------------------
+
+Motivation
+----------
+File mappings are performed by mapping page cache pages to userspace. In
+addition, read&write type file operations also transfer data from/to the page
+cache.
+
+For memory backed storage devices that use the block device interface, the page
+cache pages are in fact copies of the original storage. Various approaches
+exist to work around the need for an extra copy. The ramdisk driver for example
+does read the data into the page cache, keeps a reference, and discards the
+original data behind later on.
+
+Execute-in-place solves this issue the other way around: instead of keeping
+data in the page cache, the need to have a page cache copy is eliminated
+completely. With execute-in-place, read&write type operations are performed
+directly from/to the memory backed storage device. For file mappings, the
+storage device itself is mapped directly into userspace.
+
+This implementation was initially written for shared memory segments between
+different virtual machines on s390 hardware to allow multiple machines to
+share the same binaries and libraries.
+
+Implementation
+--------------
+Execute-in-place is implemented in three steps: block device operation,
+address space operation, and file operations.
+
+A block device operation named direct_access is used to retrieve a
+reference (pointer) to a block on-disk. The reference is supposed to be
+cpu-addressable, physical address and remain valid until the release operation
+is performed. A struct block_device reference is used to address the device,
+and a sector_t argument is used to identify the individual block. As an
+alternative, memory technology devices can be used for this.
+
+The block device operation is optional, these block devices support it as of
+today:
+- dcssblk: s390 dcss block device driver
+
+An address space operation named get_xip_mem is used to retrieve references
+to a page frame number and a kernel address. To obtain these values a reference
+to an address_space is provided. This function assigns values to the kmem and
+pfn parameters. The third argument indicates whether the function should allocate
+blocks if needed.
+
+This address space operation is mutually exclusive with readpage&writepage that
+do page cache read/write operations.
+The following filesystems support it as of today:
+- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
+
+A set of file operations that do utilize get_xip_page can be found in
+mm/filemap_xip.c . The following file operation implementations are provided:
+- aio_read/aio_write
+- readv/writev
+- sendfile
+
+The generic file operations do_sync_read/do_sync_write can be used to implement
+classic synchronous IO calls.
+
+Shortcomings
+------------
+This implementation is limited to storage devices that are cpu addressable at
+all times (no highmem or such). It works well on rom/ram, but enhancements are
+needed to make it work with flash in read+write mode.
+Putting the Linux kernel and/or its modules on a xip filesystem does not mean
+they are not copied.
diff --git a/Documentation/firmware_class/README b/Documentation/firmware_class/README
new file mode 100644
index 0000000..c3480aa
--- /dev/null
+++ b/Documentation/firmware_class/README
@@ -0,0 +1,107 @@
+
+ request_firmware() hotplug interface:
+ ------------------------------------
+ Copyright (C) 2003 Manuel Estrada Sainz
+
+ Why:
+ ---
+
+ Today, the most extended way to use firmware in the Linux kernel is linking
+ it statically in a header file. Which has political and technical issues:
+
+ 1) Some firmware is not legal to redistribute.
+ 2) The firmware occupies memory permanently, even though it often is just
+ used once.
+ 3) Some people, like the Debian crowd, don't consider some firmware free
+ enough and remove entire drivers (e.g.: keyspan).
+
+ High level behavior (mixed):
+ ============================
+
+ kernel(driver): calls request_firmware(&fw_entry, $FIRMWARE, device)
+
+ userspace:
+ - /sys/class/firmware/xxx/{loading,data} appear.
+ - hotplug gets called with a firmware identifier in $FIRMWARE
+ and the usual hotplug environment.
+ - hotplug: echo 1 > /sys/class/firmware/xxx/loading
+
+ kernel: Discard any previous partial load.
+
+ userspace:
+ - hotplug: cat appropriate_firmware_image > \
+ /sys/class/firmware/xxx/data
+
+ kernel: grows a buffer in PAGE_SIZE increments to hold the image as it
+ comes in.
+
+ userspace:
+ - hotplug: echo 0 > /sys/class/firmware/xxx/loading
+
+ kernel: request_firmware() returns and the driver has the firmware
+ image in fw_entry->{data,size}. If something went wrong
+ request_firmware() returns non-zero and fw_entry is set to
+ NULL.
+
+ kernel(driver): Driver code calls release_firmware(fw_entry) releasing
+ the firmware image and any related resource.
+
+ High level behavior (driver code):
+ ==================================
+
+ if(request_firmware(&fw_entry, $FIRMWARE, device) == 0)
+ copy_fw_to_device(fw_entry->data, fw_entry->size);
+ release(fw_entry);
+
+ Sample/simple hotplug script:
+ ============================
+
+ # Both $DEVPATH and $FIRMWARE are already provided in the environment.
+
+ HOTPLUG_FW_DIR=/usr/lib/hotplug/firmware/
+
+ echo 1 > /sys/$DEVPATH/loading
+ cat $HOTPLUG_FW_DIR/$FIRMWARE > /sysfs/$DEVPATH/data
+ echo 0 > /sys/$DEVPATH/loading
+
+ Random notes:
+ ============
+
+ - "echo -1 > /sys/class/firmware/xxx/loading" will cancel the load at
+ once and make request_firmware() return with error.
+
+ - firmware_data_read() and firmware_loading_show() are just provided
+ for testing and completeness, they are not called in normal use.
+
+ - There is also /sys/class/firmware/timeout which holds a timeout in
+ seconds for the whole load operation.
+
+ - request_firmware_nowait() is also provided for convenience in
+ non-user contexts.
+
+
+ about in-kernel persistence:
+ ---------------------------
+ Under some circumstances, as explained below, it would be interesting to keep
+ firmware images in non-swappable kernel memory or even in the kernel image
+ (probably within initramfs).
+
+ Note that this functionality has not been implemented.
+
+ - Why OPTIONAL in-kernel persistence may be a good idea sometimes:
+
+ - If the device that needs the firmware is needed to access the
+ filesystem. When upon some error the device has to be reset and the
+ firmware reloaded, it won't be possible to get it from userspace.
+ e.g.:
+ - A diskless client with a network card that needs firmware.
+ - The filesystem is stored in a disk behind an scsi device
+ that needs firmware.
+ - Replacing buggy DSDT/SSDT ACPI tables on boot.
+ Note: this would require the persistent objects to be included
+ within the kernel image, probably within initramfs.
+
+ And the same device can be needed to access the filesystem or not depending
+ on the setup, so I think that the choice on what firmware to make
+ persistent should be left to userspace.
+
diff --git a/Documentation/firmware_class/hotplug-script b/Documentation/firmware_class/hotplug-script
new file mode 100644
index 0000000..1990130
--- /dev/null
+++ b/Documentation/firmware_class/hotplug-script
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+# Simple hotplug script sample:
+#
+# Both $DEVPATH and $FIRMWARE are already provided in the environment.
+
+HOTPLUG_FW_DIR=/usr/lib/hotplug/firmware/
+
+echo 1 > /sys/$DEVPATH/loading
+cat $HOTPLUG_FW_DIR/$FIRMWARE > /sys/$DEVPATH/data
+echo 0 > /sys/$DEVPATH/loading
+
+# To cancel the load in case of error:
+#
+# echo -1 > /sys/$DEVPATH/loading
+#
diff --git a/Documentation/frv/README.txt b/Documentation/frv/README.txt
new file mode 100644
index 0000000..a984faa
--- /dev/null
+++ b/Documentation/frv/README.txt
@@ -0,0 +1,51 @@
+ ================================
+ Fujitsu FR-V LINUX DOCUMENTATION
+ ================================
+
+This directory contains documentation for the Fujitsu FR-V CPU architecture
+port of Linux.
+
+The following documents are available:
+
+ (*) features.txt
+
+ A description of the basic features inherent in this architecture port.
+
+
+ (*) configuring.txt
+
+ A summary of the configuration options particular to this architecture.
+
+
+ (*) booting.txt
+
+ A description of how to boot the kernel image and a summary of the kernel
+ command line options.
+
+
+ (*) gdbstub.txt
+
+ A description of how to debug the kernel using GDB attached by serial
+ port, and a summary of the services available.
+
+
+ (*) mmu-layout.txt
+
+ A description of the virtual and physical memory layout used in the
+ MMU linux kernel, and the registers used to support it.
+
+
+ (*) gdbinit
+
+ An example .gdbinit file for use with GDB. It includes macros for viewing
+ MMU state on the FR451. See mmu-layout.txt for more information.
+
+
+ (*) clock.txt
+
+ A description of the CPU clock scaling interface.
+
+
+ (*) atomic-ops.txt
+
+ A description of how the FR-V kernel's atomic operations work.
diff --git a/Documentation/frv/atomic-ops.txt b/Documentation/frv/atomic-ops.txt
new file mode 100644
index 0000000..96638e9
--- /dev/null
+++ b/Documentation/frv/atomic-ops.txt
@@ -0,0 +1,134 @@
+ =====================================
+ FUJITSU FR-V KERNEL ATOMIC OPERATIONS
+ =====================================
+
+On the FR-V CPUs, there is only one atomic Read-Modify-Write operation: the SWAP/SWAPI
+instruction. Unfortunately, this alone can't be used to implement the following operations:
+
+ (*) Atomic add to memory
+
+ (*) Atomic subtract from memory
+
+ (*) Atomic bit modification (set, clear or invert)
+
+ (*) Atomic compare and exchange
+
+On such CPUs, the standard way of emulating such operations in uniprocessor mode is to disable
+interrupts, but on the FR-V CPUs, modifying the PSR takes a lot of clock cycles, and it has to be
+done twice. This means the CPU runs for a relatively long time with interrupts disabled,
+potentially having a great effect on interrupt latency.
+
+
+=============
+NEW ALGORITHM
+=============
+
+To get around this, the following algorithm has been implemented. It operates in a way similar to
+the LL/SC instruction pairs supported on a number of platforms.
+
+ (*) The CCCR.CC3 register is reserved within the kernel to act as an atomic modify abort flag.
+
+ (*) In the exception prologues run on kernel->kernel entry, CCCR.CC3 is set to 0 (Undefined
+ state).
+
+ (*) All atomic operations can then be broken down into the following algorithm:
+
+ (1) Set ICC3.Z to true and set CC3 to True (ORCC/CKEQ/ORCR).
+
+ (2) Load the value currently in the memory to be modified into a register.
+
+ (3) Make changes to the value.
+
+ (4) If CC3 is still True, simultaneously and atomically (by VLIW packing):
+
+ (a) Store the modified value back to memory.
+
+ (b) Set ICC3.Z to false (CORCC on GR29 is sufficient for this - GR29 holds the current
+ task pointer in the kernel, and so is guaranteed to be non-zero).
+
+ (5) If ICC3.Z is still true, go back to step (1).
+
+This works in a non-SMP environment because any interrupt or other exception that happens between
+steps (1) and (4) will set CC3 to the Undefined, thus aborting the store in (4a), and causing the
+condition in ICC3 to remain with the Z flag set, thus causing step (5) to loop back to step (1).
+
+
+This algorithm suffers from two problems:
+
+ (1) The condition CCCR.CC3 is cleared unconditionally by an exception, irrespective of whether or
+ not any changes were made to the target memory location during that exception.
+
+ (2) The branch from step (5) back to step (1) may have to happen more than once until the store
+ manages to take place. In theory, this loop could cycle forever because there are too many
+ interrupts coming in, but it's unlikely.
+
+
+=======
+EXAMPLE
+=======
+
+Taking an example from include/asm-frv/atomic.h:
+
+ static inline int atomic_add_return(int i, atomic_t *v)
+ {
+ unsigned long val;
+
+ asm("0: \n"
+
+It starts by setting ICC3.Z to true for later use, and also transforming that into CC3 being in the
+True state.
+
+ " orcc gr0,gr0,gr0,icc3 \n" <-- (1)
+ " ckeq icc3,cc7 \n" <-- (1)
+
+Then it does the load. Note that the final phase of step (1) is done at the same time as the
+load. The VLIW packing ensures they are done simultaneously. The ".p" on the load must not be
+removed without swapping the order of these two instructions.
+
+ " ld.p %M0,%1 \n" <-- (2)
+ " orcr cc7,cc7,cc3 \n" <-- (1)
+
+Then the proposed modification is generated. Note that the old value can be retained if required
+(such as in test_and_set_bit()).
+
+ " add%I2 %1,%2,%1 \n" <-- (3)
+
+Then it attempts to store the value back, contingent on no exception having cleared CC3 since it
+was set to True.
+
+ " cst.p %1,%M0 ,cc3,#1 \n" <-- (4a)
+
+It simultaneously records the success or failure of the store in ICC3.Z.
+
+ " corcc gr29,gr29,gr0 ,cc3,#1 \n" <-- (4b)
+
+Such that the branch can then be taken if the operation was aborted.
+
+ " beq icc3,#0,0b \n" <-- (5)
+ : "+U"(v->counter), "=&r"(val)
+ : "NPr"(i)
+ : "memory", "cc7", "cc3", "icc3"
+ );
+
+ return val;
+ }
+
+
+=============
+CONFIGURATION
+=============
+
+The atomic ops implementation can be made inline or out-of-line by changing the
+CONFIG_FRV_OUTOFLINE_ATOMIC_OPS configuration variable. Making it out-of-line has a number of
+advantages:
+
+ - The resulting kernel image may be smaller
+ - Debugging is easier as atomic ops can just be stepped over and they can be breakpointed
+
+Keeping it inline also has a number of advantages:
+
+ - The resulting kernel may be Faster
+ - no out-of-line function calls need to be made
+ - the compiler doesn't have half its registers clobbered by making a call
+
+The out-of-line implementations live in arch/frv/lib/atomic-ops.S.
diff --git a/Documentation/frv/booting.txt b/Documentation/frv/booting.txt
new file mode 100644
index 0000000..ace200b
--- /dev/null
+++ b/Documentation/frv/booting.txt
@@ -0,0 +1,181 @@
+ =========================
+ BOOTING FR-V LINUX KERNEL
+ =========================
+
+======================
+PROVIDING A FILESYSTEM
+======================
+
+First of all, a root filesystem must be made available. This can be done in
+one of two ways:
+
+ (1) NFS Export
+
+ A filesystem should be constructed in a directory on an NFS server that
+ the target board can reach. This directory should then be NFS exported
+ such that the target board can read and write into it as root.
+
+ (2) Flash Filesystem (JFFS2 Recommended)
+
+ In this case, the image must be stored or built up on flash before it
+ can be used. A complete image can be built using the mkfs.jffs2 or
+ similar program and then downloaded and stored into flash by RedBoot.
+
+
+========================
+LOADING THE KERNEL IMAGE
+========================
+
+The kernel will need to be loaded into RAM by RedBoot (or by some alternative
+boot loader) before it can be run. The kernel image (arch/frv/boot/Image) may
+be loaded in one of three ways:
+
+ (1) Load from Flash
+
+ This is the simplest. RedBoot can store an image in the flash (see the
+ RedBoot documentation) and then load it back into RAM. RedBoot keeps
+ track of the load address, entry point and size, so the command to do
+ this is simply:
+
+ fis load linux
+
+ The image is then ready to be executed.
+
+ (2) Load by TFTP
+
+ The following command will download a raw binary kernel image from the
+ default server (as negotiated by BOOTP) and store it into RAM:
+
+ load -b 0x00100000 -r /tftpboot/image.bin
+
+ The image is then ready to be executed.
+
+ (3) Load by Y-Modem
+
+ The following command will download a raw binary kernel image across the
+ serial port that RedBoot is currently using:
+
+ load -m ymodem -b 0x00100000 -r zImage
+
+ The serial client (such as minicom) must then be told to transmit the
+ program by Y-Modem.
+
+ When finished, the image will then be ready to be executed.
+
+
+==================
+BOOTING THE KERNEL
+==================
+
+Boot the image with the following RedBoot command:
+
+ exec -c "<CMDLINE>" 0x00100000
+
+For example:
+
+ exec -c "console=ttySM0,115200 ip=:::::dhcp root=/dev/mtdblock2 rw"
+
+This will start the kernel running. Note that if the GDB-stub is compiled in,
+then the kernel will immediately wait for GDB to connect over serial before
+doing anything else. See the section on kernel debugging with GDB.
+
+The kernel command line <CMDLINE> tells the kernel where its console is and
+how to find its root filesystem. This is made up of the following components,
+separated by spaces:
+
+ (*) console=ttyS<x>[,<baud>[<parity>[<bits>[<flow>]]]]
+
+ This specifies that the system console should output through on-chip
+ serial port <x> (which can be "0" or "1").
+
+ <baud> is a standard baud rate between 1200 and 115200 (default 9600).
+
+ <parity> is a parity setting of "N", "O", "E", "M" or "S" for None, Odd,
+ Even, Mark or Space. "None" is the default.
+
+ <stop> is "7" or "8" for the number of bits per character. "8" is the
+ default.
+
+ <flow> is "r" to use flow control (XCTS on serial port 2 only). The
+ default is to not use flow control.
+
+ For example:
+
+ console=ttyS0,115200
+
+ To use the first on-chip serial port at baud rate 115200, no parity, 8
+ bits, and no flow control.
+
+ (*) root=/dev/<xxxx>
+
+ This specifies the device upon which the root filesystem resides. For
+ example:
+
+ /dev/nfs NFS root filesystem
+ /dev/mtdblock3 Fourth RedBoot partition on the System Flash
+
+ (*) rw
+
+ Start with the root filesystem mounted Read/Write.
+
+ The remaining components are all optional:
+
+ (*) ip=<ip>::::<host>:<iface>:<cfg>
+
+ Configure the network interface. If <cfg> is "off" then <ip> should
+ specify the IP address for the network device <iface>. <host> provide
+ the hostname for the device.
+
+ If <cfg> is "bootp" or "dhcp", then all of these parameters will be
+ discovered by consulting a BOOTP or DHCP server.
+
+ For example, the following might be used:
+
+ ip=192.168.73.12::::frv:eth0:off
+
+ This sets the IP address on the VDK motherboard RTL8029 ethernet chipset
+ (eth0) to be 192.168.73.12, and sets the board's hostname to be "frv".
+
+ (*) nfsroot=<server>:<dir>[,v<vers>]
+
+ This is mandatory if "root=/dev/nfs" is given as an option. It tells the
+ kernel the IP address of the NFS server providing its root filesystem,
+ and the pathname on that server of the filesystem.
+
+ The NFS version to use can also be specified. v2 and v3 are supported by
+ Linux.
+
+ For example:
+
+ nfsroot=192.168.73.1:/nfsroot-frv
+
+ (*) profile=1
+
+ Turns on the kernel profiler (accessible through /proc/profile).
+
+ (*) console=gdb0
+
+ This can be used as an alternative to the "console=ttyS..." listed
+ above. I tells the kernel to pass the console output to GDB if the
+ gdbstub is compiled in to the kernel.
+
+ If this is used, then the gdbstub passes the text to GDB, which then
+ simply dumps it to its standard output.
+
+ (*) mem=<xxx>M
+
+ Normally the kernel will work out how much SDRAM it has by reading the
+ SDRAM controller registers. That can be overridden with this
+ option. This allows the kernel to be told that it has <xxx> megabytes of
+ memory available.
+
+ (*) init=<prog> [<arg> [<arg> [<arg> ...]]]
+
+ This tells the kernel what program to run initially. By default this is
+ /sbin/init, but /sbin/sash or /bin/sh are common alternatives.
+
+ (*) vdc=...
+
+ This option configures the MB93493 companion chip visual display
+ driver. Please see Documentation/frv/mb93493/vdc.txt for more
+ information.
diff --git a/Documentation/frv/clock.txt b/Documentation/frv/clock.txt
new file mode 100644
index 0000000..c72d350
--- /dev/null
+++ b/Documentation/frv/clock.txt
@@ -0,0 +1,65 @@
+Clock scaling
+-------------
+
+The kernel supports scaling of CLCK.CMODE, CLCK.CM and CLKC.P0 clock
+registers. If built with CONFIG_PM and CONFIG_SYSCTL options enabled, four
+extra files will appear in the directory /proc/sys/pm/. Reading these files
+will show:
+
+ p0 -- current value of the P0 bit in CLKC register.
+ cm -- current value of the CM bits in CLKC register.
+ cmode -- current value of the CMODE bits in CLKC register.
+
+On all boards, the 'p0' file should also be writable, and either '1' or '0'
+can be rewritten, to set or clear the CLKC_P0 bit respectively, hence
+controlling whether the resource bus rate clock is halved.
+
+The 'cm' file should also be available on all boards. '0' can be written to it
+to shift the board into High-Speed mode (normal), and '1' can be written to
+shift the board into Medium-Speed mode. Selecting Low-Speed mode is not
+supported by this interface, even though some CPUs do support it.
+
+On the boards with FR405 CPU (i.e. CB60 and CB70), the 'cmode' file is also
+writable, allowing the CPU core speed (and other clock speeds) to be
+controlled from userspace.
+
+
+Determining current and possible settings
+-----------------------------------------
+
+The current state and the available masks can be found in /proc/cpuinfo. For
+example, on the CB70:
+
+ # cat /proc/cpuinfo
+ CPU-Series: fr400
+ CPU-Core: fr405, gr0-31, BE, CCCR
+ CPU: mb93405
+ MMU: Prot
+ FP-Media: fr0-31, Media
+ System: mb93091-cb70, mb93090-mb00
+ PM-Controls: cmode=0xd31f, cm=0x3, p0=0x3, suspend=0x9
+ PM-Status: cmode=3, cm=0, p0=0
+ Clock-In: 50.00 MHz
+ Clock-Core: 300.00 MHz
+ Clock-SDRAM: 100.00 MHz
+ Clock-CBus: 100.00 MHz
+ Clock-Res: 50.00 MHz
+ Clock-Ext: 50.00 MHz
+ Clock-DSU: 25.00 MHz
+ BogoMips: 300.00
+
+And on the PDK, the PM lines look like the following:
+
+ PM-Controls: cm=0x3, p0=0x3, suspend=0x9
+ PM-Status: cmode=9, cm=0, p0=0
+
+The PM-Controls line, if present, will indicate which /proc/sys/pm files can
+be set to what values. The specification values are bitmasks; so, for example,
+"suspend=0x9" indicates that 0 and 3 can be written validly to
+/proc/sys/pm/suspend.
+
+The PM-Controls line will only be present if CONFIG_PM is configured to Y.
+
+The PM-Status line indicates which clock controls are set to which value. If
+the file can be read, then the suspend value must be 0, and so that's not
+included.
diff --git a/Documentation/frv/configuring.txt b/Documentation/frv/configuring.txt
new file mode 100644
index 0000000..36e76a2
--- /dev/null
+++ b/Documentation/frv/configuring.txt
@@ -0,0 +1,125 @@
+ =======================================
+ FUJITSU FR-V LINUX KERNEL CONFIGURATION
+ =======================================
+
+=====================
+CONFIGURATION OPTIONS
+=====================
+
+The most important setting is in the "MMU support options" tab (the first
+presented in the configuration tools available):
+
+ (*) "Kernel Type"
+
+ This options allows selection of normal, MMU-requiring linux, and uClinux
+ (which doesn't require an MMU and doesn't have inter-process protection).
+
+There are a number of settings in the "Processor type and features" section of
+the kernel configuration that need to be considered.
+
+ (*) "CPU"
+
+ The register and instruction sets at the core of the processor. This can
+ only be set to "FR40x/45x/55x" at the moment - but this permits usage of
+ the kernel with MB93091 CB10, CB11, CB30, CB41, CB60, CB70 and CB451
+ CPU boards, and with the MB93093 PDK board.
+
+ (*) "System"
+
+ This option allows a choice of basic system. This governs the peripherals
+ that are expected to be available.
+
+ (*) "Motherboard"
+
+ This specifies the type of motherboard being used, and the peripherals
+ upon it. Currently only "MB93090-MB00" can be set here.
+
+ (*) "Default cache-write mode"
+
+ This controls the initial data cache write management mode. By default
+ Write-Through is selected, but Write-Back (Copy-Back) can also be
+ selected. This can be changed dynamically once the kernel is running (see
+ features.txt).
+
+There are some architecture specific configuration options in the "General
+Setup" section of the kernel configuration too:
+
+ (*) "Reserve memory uncached for (PCI) DMA"
+
+ This requests that a uClinux kernel set aside some memory in an uncached
+ window for the use as consistent DMA memory (mainly for PCI). At least a
+ megabyte will be allocated in this way, possibly more. Any memory so
+ reserved will not be available for normal allocations.
+
+ (*) "Kernel support for ELF-FDPIC binaries"
+
+ This enables the binary-format driver for the new FDPIC ELF binaries that
+ this platform normally uses. These binaries are totally relocatable -
+ their separate sections can relocated independently, allowing them to be
+ shared on uClinux where possible. This should normally be enabled.
+
+ (*) "Kernel image protection"
+
+ This makes the protection register governing access to the core kernel
+ image prohibit access by userspace programs. This option is available on
+ uClinux only.
+
+There are also a number of settings in the "Kernel Hacking" section of the
+kernel configuration especially for debugging a kernel on this
+architecture. See the "gdbstub.txt" file for information about those.
+
+
+======================
+DEFAULT CONFIGURATIONS
+======================
+
+The kernel sources include a number of example default configurations:
+
+ (*) defconfig-mb93091
+
+ Default configuration for the MB93091-VDK with both CPU board and
+ MB93090-MB00 motherboard running uClinux.
+
+
+ (*) defconfig-mb93091-fb
+
+ Default configuration for the MB93091-VDK with CPU board,
+ MB93090-MB00 motherboard, and DAV board running uClinux.
+ Includes framebuffer driver.
+
+
+ (*) defconfig-mb93093
+
+ Default configuration for the MB93093-PDK board running uClinux.
+
+
+ (*) defconfig-cb70-standalone
+
+ Default configuration for the MB93091-VDK with only CB70 CPU board
+ running uClinux. This will use the CB70's DM9000 for network access.
+
+
+ (*) defconfig-mmu
+
+ Default configuration for the MB93091-VDK with both CB451 CPU board and
+ MB93090-MB00 motherboard running MMU linux.
+
+ (*) defconfig-mmu-audio
+
+ Default configuration for the MB93091-VDK with CB451 CPU board, DAV
+ board, and MB93090-MB00 motherboard running MMU linux. Includes
+ audio driver.
+
+ (*) defconfig-mmu-fb
+
+ Default configuration for the MB93091-VDK with CB451 CPU board, DAV
+ board, and MB93090-MB00 motherboard running MMU linux. Includes
+ framebuffer driver.
+
+ (*) defconfig-mmu-standalone
+
+ Default configuration for the MB93091-VDK with only CB451 CPU board
+ running MMU linux.
+
+
+
diff --git a/Documentation/frv/features.txt b/Documentation/frv/features.txt
new file mode 100644
index 0000000..fa20c0e
--- /dev/null
+++ b/Documentation/frv/features.txt
@@ -0,0 +1,310 @@
+ ===========================
+ FUJITSU FR-V LINUX FEATURES
+ ===========================
+
+This kernel port has a number of features of which the user should be aware:
+
+ (*) Linux and uClinux
+
+ The FR-V architecture port supports both normal MMU linux and uClinux out
+ of the same sources.
+
+
+ (*) CPU support
+
+ Support for the FR401, FR403, FR405, FR451 and FR555 CPUs should work with
+ the same uClinux kernel configuration.
+
+ In normal (MMU) Linux mode, only the FR451 CPU will work as that is the
+ only one with a suitably featured CPU.
+
+ The kernel is written and compiled with the assumption that only the
+ bottom 32 GR registers and no FR registers will be used by the kernel
+ itself, however all extra userspace registers will be saved on context
+ switch. Note that since most CPUs can't support lazy switching, no attempt
+ is made to do lazy register saving where that would be possible (FR555
+ only currently).
+
+
+ (*) Board support
+
+ The board on which the kernel will run can be configured on the "Processor
+ type and features" configuration tab.
+
+ Set the System to "MB93093-PDK" to boot from the MB93093 (FR403) PDK.
+
+ Set the System to "MB93091-VDK" to boot from the CB11, CB30, CB41, CB60,
+ CB70 or CB451 VDK boards. Set the Motherboard setting to "MB93090-MB00" to
+ boot with the standard ATA90590B VDK motherboard, and set it to "None" to
+ boot without any motherboard.
+
+
+ (*) Binary Formats
+
+ The only userspace binary format supported is FDPIC ELF. Normal ELF, FLAT
+ and AOUT binaries are not supported for this architecture.
+
+ FDPIC ELF supports shared library and program interpreter facilities.
+
+
+ (*) Scheduler Speed
+
+ The kernel scheduler runs at 100Hz irrespective of the clock speed on this
+ architecture. This value is set in asm/param.h (see the HZ macro defined
+ there).
+
+
+ (*) Normal (MMU) Linux Memory Layout.
+
+ See mmu-layout.txt in this directory for a description of the normal linux
+ memory layout
+
+ See include/asm-frv/mem-layout.h for constants pertaining to the memory
+ layout.
+
+ See include/asm-frv/mb-regs.h for the constants pertaining to the I/O bus
+ controller configuration.
+
+
+ (*) uClinux Memory Layout
+
+ The memory layout used by the uClinux kernel is as follows:
+
+ 0x00000000 - 0x00000FFF Null pointer catch page
+ 0x20000000 - 0x200FFFFF CS2# [PDK] FPGA
+ 0xC0000000 - 0xCFFFFFFF SDRAM
+ 0xC0000000 Base of Linux kernel image
+ 0xE0000000 - 0xEFFFFFFF CS2# [VDK] SLBUS/PCI window
+ 0xF0000000 - 0xF0FFFFFF CS5# MB93493 CSC area (DAV daughter board)
+ 0xF1000000 - 0xF1FFFFFF CS7# [CB70/CB451] CPU-card PCMCIA port space
+ 0xFC000000 - 0xFC0FFFFF CS1# [VDK] MB86943 config space
+ 0xFC100000 - 0xFC1FFFFF CS6# [CB70/CB451] CPU-card DM9000 NIC space
+ 0xFC100000 - 0xFC1FFFFF CS6# [PDK] AX88796 NIC space
+ 0xFC200000 - 0xFC2FFFFF CS3# MB93493 CSR area (DAV daughter board)
+ 0xFD000000 - 0xFDFFFFFF CS4# [CB70/CB451] CPU-card extra flash space
+ 0xFE000000 - 0xFEFFFFFF Internal CPU peripherals
+ 0xFF000000 - 0xFF1FFFFF CS0# Flash 1
+ 0xFF200000 - 0xFF3FFFFF CS0# Flash 2
+ 0xFFC00000 - 0xFFC0001F CS0# [VDK] FPGA
+
+ The kernel reads the size of the SDRAM from the memory bus controller
+ registers by default.
+
+ The kernel initialisation code (1) adjusts the SDRAM base addresses to
+ move the SDRAM to desired address, (2) moves the kernel image down to the
+ bottom of SDRAM, (3) adjusts the bus controller registers to move I/O
+ windows, and (4) rearranges the protection registers to protect all of
+ this.
+
+ The reasons for doing this are: (1) the page at address 0 should be
+ inaccessible so that NULL pointer errors can be caught; and (2) the bottom
+ three quarters are left unoccupied so that an FR-V CPU with an MMU can use
+ it for virtual userspace mappings.
+
+ See include/asm-frv/mem-layout.h for constants pertaining to the memory
+ layout.
+
+ See include/asm-frv/mb-regs.h for the constants pertaining to the I/O bus
+ controller configuration.
+
+
+ (*) uClinux Memory Protection
+
+ A DAMPR register is used to cover the entire region used for I/O
+ (0xE0000000 - 0xFFFFFFFF). This permits the kernel to make uncached
+ accesses to this region. Userspace is not permitted to access it.
+
+ The DAMPR/IAMPR protection registers not in use for any other purpose are
+ tiled over the top of the SDRAM such that:
+
+ (1) The core kernel image is covered by as small a tile as possible
+ granting only the kernel access to the underlying data, whilst
+ making sure no SDRAM is actually made unavailable by this approach.
+
+ (2) All other tiles are arranged to permit userspace access to the rest
+ of the SDRAM.
+
+ Barring point (1), there is nothing to protect kernel data against
+ userspace damage - but this is uClinux.
+
+
+ (*) Exceptions and Fixups
+
+ Since the FR40x and FR55x CPUs that do not have full MMUs generate
+ imprecise data error exceptions, there are currently no automatic fixup
+ services available in uClinux. This includes misaligned memory access
+ fixups.
+
+ Userspace EFAULT errors can be trapped by issuing a MEMBAR instruction and
+ forcing the fault to happen there.
+
+ On the FR451, however, data exceptions are mostly precise, and so
+ exception fixup handling is implemented as normal.
+
+
+ (*) Userspace Breakpoints
+
+ The ptrace() system call supports the following userspace debugging
+ features:
+
+ (1) Hardware assisted single step.
+
+ (2) Breakpoint via the FR-V "BREAK" instruction.
+
+ (3) Breakpoint via the FR-V "TIRA GR0, #1" instruction.
+
+ (4) Syscall entry/exit trap.
+
+ Each of the above generates a SIGTRAP.
+
+
+ (*) On-Chip Serial Ports
+
+ The FR-V on-chip serial ports are made available as ttyS0 and ttyS1. Note
+ that if the GDB stub is compiled in, ttyS1 will not actually be available
+ as it will be being used for the GDB stub.
+
+ These ports can be made by:
+
+ mknod /dev/ttyS0 c 4 64
+ mknod /dev/ttyS1 c 4 65
+
+
+ (*) Maskable Interrupts
+
+ Level 15 (Non-maskable) interrupts are dealt with by the GDB stub if
+ present, and cause a panic if not. If the GDB stub is present, ttyS1's
+ interrupts are rated at level 15.
+
+ All other interrupts are distributed over the set of available priorities
+ so that no IRQs are shared where possible. The arch interrupt handling
+ routines attempt to disentangle the various sources available through the
+ CPU's own multiplexor, and those on off-CPU peripherals.
+
+
+ (*) Accessing PCI Devices
+
+ Where PCI is available, care must be taken when dealing with drivers that
+ access PCI devices. PCI devices present their data in little-endian form,
+ but the CPU sees it in big-endian form. The macros in asm/io.h try to get
+ this right, but may not under all circumstances...
+
+
+ (*) Ax88796 Ethernet Driver
+
+ The MB93093 PDK board has an Ax88796 ethernet chipset (an NE2000 clone). A
+ driver has been written to deal specifically with this. The driver
+ provides MII services for the card.
+
+ The driver can be configured by running make xconfig, and going to:
+
+ (*) Network device support
+ - turn on "Network device support"
+ (*) Ethernet (10 or 100Mbit)
+ - turn on "Ethernet (10 or 100Mbit)"
+ - turn on "AX88796 NE2000 compatible chipset"
+
+ The driver can be found in:
+
+ drivers/net/ax88796.c
+ include/asm/ax88796.h
+
+
+ (*) WorkRAM Driver
+
+ This driver provides a character device that permits access to the WorkRAM
+ that can be found on the FR451 CPU. Each page is accessible through a
+ separate minor number, thereby permitting each page to have its own
+ filesystem permissions set on the device file.
+
+ The device files should be:
+
+ mknod /dev/frv/workram0 c 240 0
+ mknod /dev/frv/workram1 c 240 1
+ mknod /dev/frv/workram2 c 240 2
+ ...
+
+ The driver will not permit the opening of any device file that does not
+ correspond to at least a partial page of WorkRAM. So the first device file
+ is the only one available on the FR451. If any other CPU is detected, none
+ of the devices will be openable.
+
+ The devices can be accessed with read, write and llseek, and can also be
+ mmapped. If they're mmapped, they will only map at the appropriate
+ 0x7e8nnnnn address on linux and at the 0xfe8nnnnn address on uClinux. If
+ MAP_FIXED is not specified, the appropriate address will be chosen anyway.
+
+ The mappings must be MAP_SHARED not MAP_PRIVATE, and must not be
+ PROT_EXEC. They must also start at file offset 0, and must not be longer
+ than one page in size.
+
+ This driver can be configured by running make xconfig, and going to:
+
+ (*) Character devices
+ - turn on "Fujitsu FR-V CPU WorkRAM support"
+
+
+ (*) Dynamic data cache write mode changing
+
+ It is possible to view and to change the data cache's write mode through
+ the /proc/sys/frv/cache-mode file while the kernel is running. There are
+ two modes available:
+
+ NAME MEANING
+ ===== ==========================================
+ wthru Data cache is in Write-Through mode
+ wback Data cache is in Write-Back/Copy-Back mode
+
+ To read the cache mode:
+
+ # cat /proc/sys/frv/cache-mode
+ wthru
+
+ To change the cache mode:
+
+ # echo wback >/proc/sys/frv/cache-mode
+ # cat /proc/sys/frv/cache-mode
+ wback
+
+
+ (*) MMU Context IDs and Pinning
+
+ On MMU Linux the CPU supports the concept of a context ID in its MMU to
+ make it more efficient (TLB entries are labelled with a context ID to link
+ them to specific tasks).
+
+ Normally once a context ID is allocated, it will remain affixed to a task
+ or CLONE_VM'd group of tasks for as long as it exists. However, since the
+ kernel is capable of supporting more tasks than there are possible ID
+ numbers, the kernel will pass context IDs from one task to another if
+ there are insufficient available.
+
+ The context ID currently in use by a task can be viewed in /proc:
+
+ # grep CXNR /proc/1/status
+ CXNR: 1
+
+ Note that kernel threads do not have a userspace context, and so will not
+ show a CXNR entry in that file.
+
+ Under some circumstances, however, it is desirable to pin a context ID on
+ a process such that the kernel won't pass it on. This can be done by
+ writing the process ID of the target process to a special file:
+
+ # echo 17 >/proc/sys/frv/pin-cxnr
+
+ Reading from the file will then show the context ID pinned.
+
+ # cat /proc/sys/frv/pin-cxnr
+ 4
+
+ The context ID will remain pinned as long as any process is using that
+ context, i.e.: when the all the subscribing processes have exited or
+ exec'd; or when an unpinning request happens:
+
+ # echo 0 >/proc/sys/frv/pin-cxnr
+
+ When there isn't a pinned context, the file shows -1:
+
+ # cat /proc/sys/frv/pin-cxnr
+ -1
diff --git a/Documentation/frv/gdbinit b/Documentation/frv/gdbinit
new file mode 100644
index 0000000..51517b6
--- /dev/null
+++ b/Documentation/frv/gdbinit
@@ -0,0 +1,102 @@
+set remotebreak 1
+
+define _amr
+
+printf "AMRx DAMR IAMR \n"
+printf "==== ===================== =====================\n"
+printf "amr0 : L:%08lx P:%08lx : L:%08lx P:%08lx\n",__debug_mmu.damr[0x0].L,__debug_mmu.damr[0x0].P,__debug_mmu.iamr[0x0].L,__debug_mmu.iamr[0x0].P
+printf "amr1 : L:%08lx P:%08lx : L:%08lx P:%08lx\n",__debug_mmu.damr[0x1].L,__debug_mmu.damr[0x1].P,__debug_mmu.iamr[0x1].L,__debug_mmu.iamr[0x1].P
+printf "amr2 : L:%08lx P:%08lx : L:%08lx P:%08lx\n",__debug_mmu.damr[0x2].L,__debug_mmu.damr[0x2].P,__debug_mmu.iamr[0x2].L,__debug_mmu.iamr[0x2].P
+printf "amr3 : L:%08lx P:%08lx : L:%08lx P:%08lx\n",__debug_mmu.damr[0x3].L,__debug_mmu.damr[0x3].P,__debug_mmu.iamr[0x3].L,__debug_mmu.iamr[0x3].P
+printf "amr4 : L:%08lx P:%08lx : L:%08lx P:%08lx\n",__debug_mmu.damr[0x4].L,__debug_mmu.damr[0x4].P,__debug_mmu.iamr[0x4].L,__debug_mmu.iamr[0x4].P
+printf "amr5 : L:%08lx P:%08lx : L:%08lx P:%08lx\n",__debug_mmu.damr[0x5].L,__debug_mmu.damr[0x5].P,__debug_mmu.iamr[0x5].L,__debug_mmu.iamr[0x5].P
+printf "amr6 : L:%08lx P:%08lx : L:%08lx P:%08lx\n",__debug_mmu.damr[0x6].L,__debug_mmu.damr[0x6].P,__debug_mmu.iamr[0x6].L,__debug_mmu.iamr[0x6].P
+printf "amr7 : L:%08lx P:%08lx : L:%08lx P:%08lx\n",__debug_mmu.damr[0x7].L,__debug_mmu.damr[0x7].P,__debug_mmu.iamr[0x7].L,__debug_mmu.iamr[0x7].P
+
+printf "amr8 : L:%08lx P:%08lx\n",__debug_mmu.damr[0x8].L,__debug_mmu.damr[0x8].P
+printf "amr9 : L:%08lx P:%08lx\n",__debug_mmu.damr[0x9].L,__debug_mmu.damr[0x9].P
+printf "amr10: L:%08lx P:%08lx\n",__debug_mmu.damr[0xa].L,__debug_mmu.damr[0xa].P
+printf "amr11: L:%08lx P:%08lx\n",__debug_mmu.damr[0xb].L,__debug_mmu.damr[0xb].P
+
+end
+
+
+define _tlb
+printf "tlb[0x00]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x0].L,__debug_mmu.tlb[0x0].P,__debug_mmu.tlb[0x40+0x0].L,__debug_mmu.tlb[0x40+0x0].P
+printf "tlb[0x01]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x1].L,__debug_mmu.tlb[0x1].P,__debug_mmu.tlb[0x40+0x1].L,__debug_mmu.tlb[0x40+0x1].P
+printf "tlb[0x02]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x2].L,__debug_mmu.tlb[0x2].P,__debug_mmu.tlb[0x40+0x2].L,__debug_mmu.tlb[0x40+0x2].P
+printf "tlb[0x03]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x3].L,__debug_mmu.tlb[0x3].P,__debug_mmu.tlb[0x40+0x3].L,__debug_mmu.tlb[0x40+0x3].P
+printf "tlb[0x04]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x4].L,__debug_mmu.tlb[0x4].P,__debug_mmu.tlb[0x40+0x4].L,__debug_mmu.tlb[0x40+0x4].P
+printf "tlb[0x05]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x5].L,__debug_mmu.tlb[0x5].P,__debug_mmu.tlb[0x40+0x5].L,__debug_mmu.tlb[0x40+0x5].P
+printf "tlb[0x06]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x6].L,__debug_mmu.tlb[0x6].P,__debug_mmu.tlb[0x40+0x6].L,__debug_mmu.tlb[0x40+0x6].P
+printf "tlb[0x07]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x7].L,__debug_mmu.tlb[0x7].P,__debug_mmu.tlb[0x40+0x7].L,__debug_mmu.tlb[0x40+0x7].P
+printf "tlb[0x08]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x8].L,__debug_mmu.tlb[0x8].P,__debug_mmu.tlb[0x40+0x8].L,__debug_mmu.tlb[0x40+0x8].P
+printf "tlb[0x09]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x9].L,__debug_mmu.tlb[0x9].P,__debug_mmu.tlb[0x40+0x9].L,__debug_mmu.tlb[0x40+0x9].P
+printf "tlb[0x0a]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0xa].L,__debug_mmu.tlb[0xa].P,__debug_mmu.tlb[0x40+0xa].L,__debug_mmu.tlb[0x40+0xa].P
+printf "tlb[0x0b]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0xb].L,__debug_mmu.tlb[0xb].P,__debug_mmu.tlb[0x40+0xb].L,__debug_mmu.tlb[0x40+0xb].P
+printf "tlb[0x0c]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0xc].L,__debug_mmu.tlb[0xc].P,__debug_mmu.tlb[0x40+0xc].L,__debug_mmu.tlb[0x40+0xc].P
+printf "tlb[0x0d]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0xd].L,__debug_mmu.tlb[0xd].P,__debug_mmu.tlb[0x40+0xd].L,__debug_mmu.tlb[0x40+0xd].P
+printf "tlb[0x0e]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0xe].L,__debug_mmu.tlb[0xe].P,__debug_mmu.tlb[0x40+0xe].L,__debug_mmu.tlb[0x40+0xe].P
+printf "tlb[0x0f]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0xf].L,__debug_mmu.tlb[0xf].P,__debug_mmu.tlb[0x40+0xf].L,__debug_mmu.tlb[0x40+0xf].P
+printf "tlb[0x10]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x10].L,__debug_mmu.tlb[0x10].P,__debug_mmu.tlb[0x40+0x10].L,__debug_mmu.tlb[0x40+0x10].P
+printf "tlb[0x11]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x11].L,__debug_mmu.tlb[0x11].P,__debug_mmu.tlb[0x40+0x11].L,__debug_mmu.tlb[0x40+0x11].P
+printf "tlb[0x12]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x12].L,__debug_mmu.tlb[0x12].P,__debug_mmu.tlb[0x40+0x12].L,__debug_mmu.tlb[0x40+0x12].P
+printf "tlb[0x13]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x13].L,__debug_mmu.tlb[0x13].P,__debug_mmu.tlb[0x40+0x13].L,__debug_mmu.tlb[0x40+0x13].P
+printf "tlb[0x14]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x14].L,__debug_mmu.tlb[0x14].P,__debug_mmu.tlb[0x40+0x14].L,__debug_mmu.tlb[0x40+0x14].P
+printf "tlb[0x15]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x15].L,__debug_mmu.tlb[0x15].P,__debug_mmu.tlb[0x40+0x15].L,__debug_mmu.tlb[0x40+0x15].P
+printf "tlb[0x16]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x16].L,__debug_mmu.tlb[0x16].P,__debug_mmu.tlb[0x40+0x16].L,__debug_mmu.tlb[0x40+0x16].P
+printf "tlb[0x17]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x17].L,__debug_mmu.tlb[0x17].P,__debug_mmu.tlb[0x40+0x17].L,__debug_mmu.tlb[0x40+0x17].P
+printf "tlb[0x18]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x18].L,__debug_mmu.tlb[0x18].P,__debug_mmu.tlb[0x40+0x18].L,__debug_mmu.tlb[0x40+0x18].P
+printf "tlb[0x19]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x19].L,__debug_mmu.tlb[0x19].P,__debug_mmu.tlb[0x40+0x19].L,__debug_mmu.tlb[0x40+0x19].P
+printf "tlb[0x1a]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x1a].L,__debug_mmu.tlb[0x1a].P,__debug_mmu.tlb[0x40+0x1a].L,__debug_mmu.tlb[0x40+0x1a].P
+printf "tlb[0x1b]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x1b].L,__debug_mmu.tlb[0x1b].P,__debug_mmu.tlb[0x40+0x1b].L,__debug_mmu.tlb[0x40+0x1b].P
+printf "tlb[0x1c]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x1c].L,__debug_mmu.tlb[0x1c].P,__debug_mmu.tlb[0x40+0x1c].L,__debug_mmu.tlb[0x40+0x1c].P
+printf "tlb[0x1d]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x1d].L,__debug_mmu.tlb[0x1d].P,__debug_mmu.tlb[0x40+0x1d].L,__debug_mmu.tlb[0x40+0x1d].P
+printf "tlb[0x1e]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x1e].L,__debug_mmu.tlb[0x1e].P,__debug_mmu.tlb[0x40+0x1e].L,__debug_mmu.tlb[0x40+0x1e].P
+printf "tlb[0x1f]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x1f].L,__debug_mmu.tlb[0x1f].P,__debug_mmu.tlb[0x40+0x1f].L,__debug_mmu.tlb[0x40+0x1f].P
+printf "tlb[0x20]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x20].L,__debug_mmu.tlb[0x20].P,__debug_mmu.tlb[0x40+0x20].L,__debug_mmu.tlb[0x40+0x20].P
+printf "tlb[0x21]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x21].L,__debug_mmu.tlb[0x21].P,__debug_mmu.tlb[0x40+0x21].L,__debug_mmu.tlb[0x40+0x21].P
+printf "tlb[0x22]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x22].L,__debug_mmu.tlb[0x22].P,__debug_mmu.tlb[0x40+0x22].L,__debug_mmu.tlb[0x40+0x22].P
+printf "tlb[0x23]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x23].L,__debug_mmu.tlb[0x23].P,__debug_mmu.tlb[0x40+0x23].L,__debug_mmu.tlb[0x40+0x23].P
+printf "tlb[0x24]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x24].L,__debug_mmu.tlb[0x24].P,__debug_mmu.tlb[0x40+0x24].L,__debug_mmu.tlb[0x40+0x24].P
+printf "tlb[0x25]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x25].L,__debug_mmu.tlb[0x25].P,__debug_mmu.tlb[0x40+0x25].L,__debug_mmu.tlb[0x40+0x25].P
+printf "tlb[0x26]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x26].L,__debug_mmu.tlb[0x26].P,__debug_mmu.tlb[0x40+0x26].L,__debug_mmu.tlb[0x40+0x26].P
+printf "tlb[0x27]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x27].L,__debug_mmu.tlb[0x27].P,__debug_mmu.tlb[0x40+0x27].L,__debug_mmu.tlb[0x40+0x27].P
+printf "tlb[0x28]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x28].L,__debug_mmu.tlb[0x28].P,__debug_mmu.tlb[0x40+0x28].L,__debug_mmu.tlb[0x40+0x28].P
+printf "tlb[0x29]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x29].L,__debug_mmu.tlb[0x29].P,__debug_mmu.tlb[0x40+0x29].L,__debug_mmu.tlb[0x40+0x29].P
+printf "tlb[0x2a]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x2a].L,__debug_mmu.tlb[0x2a].P,__debug_mmu.tlb[0x40+0x2a].L,__debug_mmu.tlb[0x40+0x2a].P
+printf "tlb[0x2b]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x2b].L,__debug_mmu.tlb[0x2b].P,__debug_mmu.tlb[0x40+0x2b].L,__debug_mmu.tlb[0x40+0x2b].P
+printf "tlb[0x2c]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x2c].L,__debug_mmu.tlb[0x2c].P,__debug_mmu.tlb[0x40+0x2c].L,__debug_mmu.tlb[0x40+0x2c].P
+printf "tlb[0x2d]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x2d].L,__debug_mmu.tlb[0x2d].P,__debug_mmu.tlb[0x40+0x2d].L,__debug_mmu.tlb[0x40+0x2d].P
+printf "tlb[0x2e]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x2e].L,__debug_mmu.tlb[0x2e].P,__debug_mmu.tlb[0x40+0x2e].L,__debug_mmu.tlb[0x40+0x2e].P
+printf "tlb[0x2f]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x2f].L,__debug_mmu.tlb[0x2f].P,__debug_mmu.tlb[0x40+0x2f].L,__debug_mmu.tlb[0x40+0x2f].P
+printf "tlb[0x30]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x30].L,__debug_mmu.tlb[0x30].P,__debug_mmu.tlb[0x40+0x30].L,__debug_mmu.tlb[0x40+0x30].P
+printf "tlb[0x31]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x31].L,__debug_mmu.tlb[0x31].P,__debug_mmu.tlb[0x40+0x31].L,__debug_mmu.tlb[0x40+0x31].P
+printf "tlb[0x32]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x32].L,__debug_mmu.tlb[0x32].P,__debug_mmu.tlb[0x40+0x32].L,__debug_mmu.tlb[0x40+0x32].P
+printf "tlb[0x33]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x33].L,__debug_mmu.tlb[0x33].P,__debug_mmu.tlb[0x40+0x33].L,__debug_mmu.tlb[0x40+0x33].P
+printf "tlb[0x34]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x34].L,__debug_mmu.tlb[0x34].P,__debug_mmu.tlb[0x40+0x34].L,__debug_mmu.tlb[0x40+0x34].P
+printf "tlb[0x35]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x35].L,__debug_mmu.tlb[0x35].P,__debug_mmu.tlb[0x40+0x35].L,__debug_mmu.tlb[0x40+0x35].P
+printf "tlb[0x36]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x36].L,__debug_mmu.tlb[0x36].P,__debug_mmu.tlb[0x40+0x36].L,__debug_mmu.tlb[0x40+0x36].P
+printf "tlb[0x37]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x37].L,__debug_mmu.tlb[0x37].P,__debug_mmu.tlb[0x40+0x37].L,__debug_mmu.tlb[0x40+0x37].P
+printf "tlb[0x38]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x38].L,__debug_mmu.tlb[0x38].P,__debug_mmu.tlb[0x40+0x38].L,__debug_mmu.tlb[0x40+0x38].P
+printf "tlb[0x39]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x39].L,__debug_mmu.tlb[0x39].P,__debug_mmu.tlb[0x40+0x39].L,__debug_mmu.tlb[0x40+0x39].P
+printf "tlb[0x3a]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x3a].L,__debug_mmu.tlb[0x3a].P,__debug_mmu.tlb[0x40+0x3a].L,__debug_mmu.tlb[0x40+0x3a].P
+printf "tlb[0x3b]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x3b].L,__debug_mmu.tlb[0x3b].P,__debug_mmu.tlb[0x40+0x3b].L,__debug_mmu.tlb[0x40+0x3b].P
+printf "tlb[0x3c]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x3c].L,__debug_mmu.tlb[0x3c].P,__debug_mmu.tlb[0x40+0x3c].L,__debug_mmu.tlb[0x40+0x3c].P
+printf "tlb[0x3d]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x3d].L,__debug_mmu.tlb[0x3d].P,__debug_mmu.tlb[0x40+0x3d].L,__debug_mmu.tlb[0x40+0x3d].P
+printf "tlb[0x3e]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x3e].L,__debug_mmu.tlb[0x3e].P,__debug_mmu.tlb[0x40+0x3e].L,__debug_mmu.tlb[0x40+0x3e].P
+printf "tlb[0x3f]: %08lx %08lx %08lx %08lx\n",__debug_mmu.tlb[0x3f].L,__debug_mmu.tlb[0x3f].P,__debug_mmu.tlb[0x40+0x3f].L,__debug_mmu.tlb[0x40+0x3f].P
+end
+
+
+define _pgd
+p (pgd_t[0x40])*(pgd_t*)(__debug_mmu.damr[0x3].L)
+end
+
+define _ptd_i
+p (pte_t[0x1000])*(pte_t*)(__debug_mmu.damr[0x4].L)
+end
+
+define _ptd_d
+p (pte_t[0x1000])*(pte_t*)(__debug_mmu.damr[0x5].L)
+end
diff --git a/Documentation/frv/gdbstub.txt b/Documentation/frv/gdbstub.txt
new file mode 100644
index 0000000..b92bfd9
--- /dev/null
+++ b/Documentation/frv/gdbstub.txt
@@ -0,0 +1,130 @@
+ ====================
+ DEBUGGING FR-V LINUX
+ ====================
+
+
+The kernel contains a GDB stub that talks GDB remote protocol across a serial
+port. This permits GDB to single step through the kernel, set breakpoints and
+trap exceptions that happen in kernel space and interrupt execution. It also
+permits the NMI interrupt button or serial port events to jump the kernel into
+the debugger.
+
+On the CPUs that have on-chip UARTs (FR400, FR403, FR405, FR555), the
+GDB stub hijacks a serial port for its own purposes, and makes it
+generate level 15 interrupts (NMI). The kernel proper cannot see the serial
+port in question under these conditions.
+
+On the MB93091-VDK CPU boards, the GDB stub uses UART1, which would otherwise
+be /dev/ttyS1. On the MB93093-PDK, the GDB stub uses UART0. Therefore, on the
+PDK there is no externally accessible serial port and the serial port to
+which the touch screen is attached becomes /dev/ttyS0.
+
+Note that the GDB stub runs entirely within CPU debug mode, and so should not
+incur any exceptions or interrupts whilst it is active. In particular, note
+that the clock will lose time since it is implemented in software.
+
+
+==================
+KERNEL PREPARATION
+==================
+
+Firstly, a debuggable kernel must be built. To do this, unpack the kernel tree
+and copy the configuration that you wish to use to .config. Then reconfigure
+the following things on the "Kernel Hacking" tab:
+
+ (*) "Include debugging information"
+
+ Set this to "Y". This causes all C and Assembly files to be compiled
+ to include debugging information.
+
+ (*) "In-kernel GDB stub"
+
+ Set this to "Y". This causes the GDB stub to be compiled into the
+ kernel.
+
+ (*) "Immediate activation"
+
+ Set this to "Y" if you want the GDB stub to activate as soon as possible
+ and wait for GDB to connect. This allows you to start tracing right from
+ the beginning of start_kernel() in init/main.c.
+
+ (*) "Console through GDB stub"
+
+ Set this to "Y" if you wish to be able to use "console=gdb0" on the
+ command line. That tells the kernel to pass system console messages to
+ GDB (which then prints them on its standard output). This is useful when
+ debugging the serial drivers that'd otherwise be used to pass console
+ messages to the outside world.
+
+Then build as usual, download to the board and execute. Note that if
+"Immediate activation" was selected, then the kernel will wait for GDB to
+attach. If not, then the kernel will boot immediately and GDB will have to
+interrupt it or wait for an exception to occur before doing anything with
+the kernel.
+
+
+=========================
+KERNEL DEBUGGING WITH GDB
+=========================
+
+Set the serial port on the computer that's going to run GDB to the appropriate
+baud rate. Assuming the board's debug port is connected to ttyS0/COM1 on the
+computer doing the debugging:
+
+ stty -F /dev/ttyS0 115200
+
+Then start GDB in the base of the kernel tree:
+
+ frv-uclinux-gdb linux [uClinux]
+
+Or:
+
+ frv-uclinux-gdb vmlinux [MMU linux]
+
+When the prompt appears:
+
+ GNU gdb frv-031024
+ Copyright 2003 Free Software Foundation, Inc.
+ GDB is free software, covered by the GNU General Public License, and you are
+ welcome to change it and/or distribute copies of it under certain conditions.
+ Type "show copying" to see the conditions.
+ There is absolutely no warranty for GDB. Type "show warranty" for details.
+ This GDB was configured as "--host=i686-pc-linux-gnu --target=frv-uclinux"...
+ (gdb)
+
+Attach to the board like this:
+
+ (gdb) target remote /dev/ttyS0
+ Remote debugging using /dev/ttyS0
+ start_kernel () at init/main.c:395
+ (gdb)
+
+This should show the appropriate lines from the source too. The kernel can
+then be debugged almost as if it's any other program.
+
+
+===============================
+INTERRUPTING THE RUNNING KERNEL
+===============================
+
+The kernel can be interrupted whilst it is running, causing a jump back to the
+GDB stub and the debugger:
+
+ (*) Pressing Ctrl-C in GDB. This will cause GDB to try and interrupt the
+ kernel by sending an RS232 BREAK over the serial line to the GDB
+ stub. This will (mostly) immediately interrupt the kernel and return it
+ to the debugger.
+
+ (*) Pressing the NMI button on the board will also cause a jump into the
+ debugger.
+
+ (*) Setting a software breakpoint. This sets a break instruction at the
+ desired location which the GDB stub then traps the exception for.
+
+ (*) Setting a hardware breakpoint. The GDB stub is capable of using the IBAR
+ and DBAR registers to assist debugging.
+
+Furthermore, the GDB stub will intercept a number of exceptions automatically
+if they are caused by kernel execution. It will also intercept BUG() macro
+invocation.
+
diff --git a/Documentation/frv/kernel-ABI.txt b/Documentation/frv/kernel-ABI.txt
new file mode 100644
index 0000000..aaa1cec
--- /dev/null
+++ b/Documentation/frv/kernel-ABI.txt
@@ -0,0 +1,262 @@
+ =================================
+ INTERNAL KERNEL ABI FOR FR-V ARCH
+ =================================
+
+The internal FRV kernel ABI is not quite the same as the userspace ABI. A
+number of the registers are used for special purposed, and the ABI is not
+consistent between modules vs core, and MMU vs no-MMU.
+
+This partly stems from the fact that FRV CPUs do not have a separate
+supervisor stack pointer, and most of them do not have any scratch
+registers, thus requiring at least one general purpose register to be
+clobbered in such an event. Also, within the kernel core, it is possible to
+simply jump or call directly between functions using a relative offset.
+This cannot be extended to modules for the displacement is likely to be too
+far. Thus in modules the address of a function to call must be calculated
+in a register and then used, requiring two extra instructions.
+
+This document has the following sections:
+
+ (*) System call register ABI
+ (*) CPU operating modes
+ (*) Internal kernel-mode register ABI
+ (*) Internal debug-mode register ABI
+ (*) Virtual interrupt handling
+
+
+========================
+SYSTEM CALL REGISTER ABI
+========================
+
+When a system call is made, the following registers are effective:
+
+ REGISTERS CALL RETURN
+ =============== ======================= =======================
+ GR7 System call number Preserved
+ GR8 Syscall arg #1 Return value
+ GR9-GR13 Syscall arg #2-6 Preserved
+
+
+===================
+CPU OPERATING MODES
+===================
+
+The FR-V CPU has three basic operating modes. In order of increasing
+capability:
+
+ (1) User mode.
+
+ Basic userspace running mode.
+
+ (2) Kernel mode.
+
+ Normal kernel mode. There are many additional control registers
+ available that may be accessed in this mode, in addition to all the
+ stuff available to user mode. This has two submodes:
+
+ (a) Exceptions enabled (PSR.T == 1).
+
+ Exceptions will invoke the appropriate normal kernel mode
+ handler. On entry to the handler, the PSR.T bit will be cleared.
+
+ (b) Exceptions disabled (PSR.T == 0).
+
+ No exceptions or interrupts may happen. Any mandatory exceptions
+ will cause the CPU to halt unless the CPU is told to jump into
+ debug mode instead.
+
+ (3) Debug mode.
+
+ No exceptions may happen in this mode. Memory protection and
+ management exceptions will be flagged for later consideration, but
+ the exception handler won't be invoked. Debugging traps such as
+ hardware breakpoints and watchpoints will be ignored. This mode is
+ entered only by debugging events obtained from the other two modes.
+
+ All kernel mode registers may be accessed, plus a few extra debugging
+ specific registers.
+
+
+=================================
+INTERNAL KERNEL-MODE REGISTER ABI
+=================================
+
+There are a number of permanent register assignments that are set up by
+entry.S in the exception prologue. Note that there is a complete set of
+exception prologues for each of user->kernel transition and kernel->kernel
+transition. There are also user->debug and kernel->debug mode transition
+prologues.
+
+
+ REGISTER FLAVOUR USE
+ =============== ======= ==============================================
+ GR1 Supervisor stack pointer
+ GR15 Current thread info pointer
+ GR16 GP-Rel base register for small data
+ GR28 Current exception frame pointer (__frame)
+ GR29 Current task pointer (current)
+ GR30 Destroyed by kernel mode entry
+ GR31 NOMMU Destroyed by debug mode entry
+ GR31 MMU Destroyed by TLB miss kernel mode entry
+ CCR.ICC2 Virtual interrupt disablement tracking
+ CCCR.CC3 Cleared by exception prologue
+ (atomic op emulation)
+ SCR0 MMU See mmu-layout.txt.
+ SCR1 MMU See mmu-layout.txt.
+ SCR2 MMU Save for EAR0 (destroyed by icache insns
+ in debug mode)
+ SCR3 MMU Save for GR31 during debug exceptions
+ DAMR/IAMR NOMMU Fixed memory protection layout.
+ DAMR/IAMR MMU See mmu-layout.txt.
+
+
+Certain registers are also used or modified across function calls:
+
+ REGISTER CALL RETURN
+ =============== =============================== ======================
+ GR0 Fixed Zero -
+ GR2 Function call frame pointer
+ GR3 Special Preserved
+ GR3-GR7 - Clobbered
+ GR8 Function call arg #1 Return value
+ (or clobbered)
+ GR9 Function call arg #2 Return value MSW
+ (or clobbered)
+ GR10-GR13 Function call arg #3-#6 Clobbered
+ GR14 - Clobbered
+ GR15-GR16 Special Preserved
+ GR17-GR27 - Preserved
+ GR28-GR31 Special Only accessed
+ explicitly
+ LR Return address after CALL Clobbered
+ CCR/CCCR - Mostly Clobbered
+
+
+================================
+INTERNAL DEBUG-MODE REGISTER ABI
+================================
+
+This is the same as the kernel-mode register ABI for functions calls. The
+difference is that in debug-mode there's a different stack and a different
+exception frame. Almost all the global registers from kernel-mode
+(including the stack pointer) may be changed.
+
+ REGISTER FLAVOUR USE
+ =============== ======= ==============================================
+ GR1 Debug stack pointer
+ GR16 GP-Rel base register for small data
+ GR31 Current debug exception frame pointer
+ (__debug_frame)
+ SCR3 MMU Saved value of GR31
+
+
+Note that debug mode is able to interfere with the kernel's emulated atomic
+ops, so it must be exceedingly careful not to do any that would interact
+with the main kernel in this regard. Hence the debug mode code (gdbstub) is
+almost completely self-contained. The only external code used is the
+sprintf family of functions.
+
+Furthermore, break.S is so complicated because single-step mode does not
+switch off on entry to an exception. That means unless manually disabled,
+single-stepping will blithely go on stepping into things like interrupts.
+See gdbstub.txt for more information.
+
+
+==========================
+VIRTUAL INTERRUPT HANDLING
+==========================
+
+Because accesses to the PSR is so slow, and to disable interrupts we have
+to access it twice (once to read and once to write), we don't actually
+disable interrupts at all if we don't have to. What we do instead is use
+the ICC2 condition code flags to note virtual disablement, such that if we
+then do take an interrupt, we note the flag, really disable interrupts, set
+another flag and resume execution at the point the interrupt happened.
+Setting condition flags as a side effect of an arithmetic or logical
+instruction is really fast. This use of the ICC2 only occurs within the
+kernel - it does not affect userspace.
+
+The flags we use are:
+
+ (*) CCR.ICC2.Z [Zero flag]
+
+ Set to virtually disable interrupts, clear when interrupts are
+ virtually enabled. Can be modified by logical instructions without
+ affecting the Carry flag.
+
+ (*) CCR.ICC2.C [Carry flag]
+
+ Clear to indicate hardware interrupts are really disabled, set otherwise.
+
+
+What happens is this:
+
+ (1) Normal kernel-mode operation.
+
+ ICC2.Z is 0, ICC2.C is 1.
+
+ (2) An interrupt occurs. The exception prologue examines ICC2.Z and
+ determines that nothing needs doing. This is done simply with an
+ unlikely BEQ instruction.
+
+ (3) The interrupts are disabled (local_irq_disable)
+
+ ICC2.Z is set to 1.
+
+ (4) If interrupts were then re-enabled (local_irq_enable):
+
+ ICC2.Z would be set to 0.
+
+ A TIHI #2 instruction (trap #2 if condition HI - Z==0 && C==0) would
+ be used to trap if interrupts were now virtually enabled, but
+ physically disabled - which they're not, so the trap isn't taken. The
+ kernel would then be back to state (1).
+
+ (5) An interrupt occurs. The exception prologue examines ICC2.Z and
+ determines that the interrupt shouldn't actually have happened. It
+ jumps aside, and there disabled interrupts by setting PSR.PIL to 14
+ and then it clears ICC2.C.
+
+ (6) If interrupts were then saved and disabled again (local_irq_save):
+
+ ICC2.Z would be shifted into the save variable and masked off
+ (giving a 1).
+
+ ICC2.Z would then be set to 1 (thus unchanged), and ICC2.C would be
+ unaffected (ie: 0).
+
+ (7) If interrupts were then restored from state (6) (local_irq_restore):
+
+ ICC2.Z would be set to indicate the result of XOR'ing the saved
+ value (ie: 1) with 1, which gives a result of 0 - thus leaving
+ ICC2.Z set.
+
+ ICC2.C would remain unaffected (ie: 0).
+
+ A TIHI #2 instruction would be used to again assay the current state,
+ but this would do nothing as Z==1.
+
+ (8) If interrupts were then enabled (local_irq_enable):
+
+ ICC2.Z would be cleared. ICC2.C would be left unaffected. Both
+ flags would now be 0.
+
+ A TIHI #2 instruction again issued to assay the current state would
+ then trap as both Z==0 [interrupts virtually enabled] and C==0
+ [interrupts really disabled] would then be true.
+
+ (9) The trap #2 handler would simply enable hardware interrupts
+ (set PSR.PIL to 0), set ICC2.C to 1 and return.
+
+(10) Immediately upon returning, the pending interrupt would be taken.
+
+(11) The interrupt handler would take the path of actually processing the
+ interrupt (ICC2.Z is clear, BEQ fails as per step (2)).
+
+(12) The interrupt handler would then set ICC2.C to 1 since hardware
+ interrupts are definitely enabled - or else the kernel wouldn't be here.
+
+(13) On return from the interrupt handler, things would be back to state (1).
+
+This trap (#2) is only available in kernel mode. In user mode it will
+result in SIGILL.
diff --git a/Documentation/frv/mmu-layout.txt b/Documentation/frv/mmu-layout.txt
new file mode 100644
index 0000000..db10250
--- /dev/null
+++ b/Documentation/frv/mmu-layout.txt
@@ -0,0 +1,306 @@
+ =================================
+ FR451 MMU LINUX MEMORY MANAGEMENT
+ =================================
+
+============
+MMU HARDWARE
+============
+
+FR451 MMU Linux puts the MMU into EDAT mode whilst running. This means that it uses both the SAT
+registers and the DAT TLB to perform address translation.
+
+There are 8 IAMLR/IAMPR register pairs and 16 DAMLR/DAMPR register pairs for SAT mode.
+
+In DAT mode, there is also a TLB organised in cache format as 64 lines x 2 ways. Each line spans a
+16KB range of addresses, but can match a larger region.
+
+
+===========================
+MEMORY MANAGEMENT REGISTERS
+===========================
+
+Certain control registers are used by the kernel memory management routines:
+
+ REGISTERS USAGE
+ ====================== ==================================================
+ IAMR0, DAMR0 Kernel image and data mappings
+ IAMR1, DAMR1 First-chance TLB lookup mapping
+ DAMR2 Page attachment for cache flush by page
+ DAMR3 Current PGD mapping
+ SCR0, DAMR4 Instruction TLB PGE/PTD cache
+ SCR1, DAMR5 Data TLB PGE/PTD cache
+ DAMR6-10 kmap_atomic() mappings
+ DAMR11 I/O mapping
+ CXNR mm_struct context ID
+ TTBR Page directory (PGD) pointer (physical address)
+
+
+=====================
+GENERAL MEMORY LAYOUT
+=====================
+
+The physical memory layout is as follows:
+
+ PHYSICAL ADDRESS CONTROLLER DEVICE
+ =================== ============== =======================================
+ 00000000 - BFFFFFFF SDRAM SDRAM area
+ E0000000 - EFFFFFFF L-BUS CS2# VDK SLBUS/PCI window
+ F0000000 - F0FFFFFF L-BUS CS5# MB93493 CSC area (DAV daughter board)
+ F1000000 - F1FFFFFF L-BUS CS7# (CB70 CPU-card PCMCIA port I/O space)
+ FC000000 - FC0FFFFF L-BUS CS1# VDK MB86943 config space
+ FC100000 - FC1FFFFF L-BUS CS6# DM9000 NIC I/O space
+ FC200000 - FC2FFFFF L-BUS CS3# MB93493 CSR area (DAV daughter board)
+ FD000000 - FDFFFFFF L-BUS CS4# (CB70 CPU-card extra flash space)
+ FE000000 - FEFFFFFF Internal CPU peripherals
+ FF000000 - FF1FFFFF L-BUS CS0# Flash 1
+ FF200000 - FF3FFFFF L-BUS CS0# Flash 2
+ FFC00000 - FFC0001F L-BUS CS0# FPGA
+
+The virtual memory layout is:
+
+ VIRTUAL ADDRESS PHYSICAL TRANSLATOR FLAGS SIZE OCCUPATION
+ ================= ======== ============== ======= ======= ===================================
+ 00004000-BFFFFFFF various TLB,xAMR1 D-N-??V 3GB Userspace
+ C0000000-CFFFFFFF 00000000 xAMPR0 -L-S--V 256MB Kernel image and data
+ D0000000-D7FFFFFF various TLB,xAMR1 D-NS??V 128MB vmalloc area
+ D8000000-DBFFFFFF various TLB,xAMR1 D-NS??V 64MB kmap() area
+ DC000000-DCFFFFFF various TLB 1MB Secondary kmap_atomic() frame
+ DD000000-DD27FFFF various DAMR 160KB Primary kmap_atomic() frame
+ DD040000 DAMR2/IAMR2 -L-S--V page Page cache flush attachment point
+ DD080000 DAMR3 -L-SC-V page Page Directory (PGD)
+ DD0C0000 DAMR4 -L-SC-V page Cached insn TLB Page Table lookup
+ DD100000 DAMR5 -L-SC-V page Cached data TLB Page Table lookup
+ DD140000 DAMR6 -L-S--V page kmap_atomic(KM_BOUNCE_READ)
+ DD180000 DAMR7 -L-S--V page kmap_atomic(KM_SKB_SUNRPC_DATA)
+ DD1C0000 DAMR8 -L-S--V page kmap_atomic(KM_SKB_DATA_SOFTIRQ)
+ DD200000 DAMR9 -L-S--V page kmap_atomic(KM_USER0)
+ DD240000 DAMR10 -L-S--V page kmap_atomic(KM_USER1)
+ E0000000-FFFFFFFF E0000000 DAMR11 -L-SC-V 512MB I/O region
+
+IAMPR1 and DAMPR1 are used as an extension to the TLB.
+
+
+====================
+KMAP AND KMAP_ATOMIC
+====================
+
+To access pages in the page cache (which may not be directly accessible if highmem is available),
+the kernel calls kmap(), does the access and then calls kunmap(); or it calls kmap_atomic(), does
+the access and then calls kunmap_atomic().
+
+kmap() creates an attachment between an arbitrary inaccessible page and a range of virtual
+addresses by installing a PTE in a special page table. The kernel can then access this page as it
+wills. When it's finished, the kernel calls kunmap() to clear the PTE.
+
+kmap_atomic() does something slightly different. In the interests of speed, it chooses one of two
+strategies:
+
+ (1) If possible, kmap_atomic() attaches the requested page to one of DAMPR5 through DAMPR10
+ register pairs; and the matching kunmap_atomic() clears the DAMPR. This makes high memory
+ support really fast as there's no need to flush the TLB or modify the page tables. The DAMLR
+ registers being used for this are preset during boot and don't change over the lifetime of the
+ process. There's a direct mapping between the first few kmap_atomic() types, DAMR number and
+ virtual address slot.
+
+ However, there are more kmap_atomic() types defined than there are DAMR registers available,
+ so we fall back to:
+
+ (2) kmap_atomic() uses a slot in the secondary frame (determined by the type parameter), and then
+ locks an entry in the TLB to translate that slot to the specified page. The number of slots is
+ obviously limited, and their positions are controlled such that each slot is matched by a
+ different line in the TLB. kunmap() ejects the entry from the TLB.
+
+Note that the first three kmap atomic types are really just declared as placeholders. The DAMPR
+registers involved are actually modified directly.
+
+Also note that kmap() itself may sleep, kmap_atomic() may never sleep and both always succeed;
+furthermore, a driver using kmap() may sleep before calling kunmap(), but may not sleep before
+calling kunmap_atomic() if it had previously called kmap_atomic().
+
+
+===============================
+USING MORE THAN 256MB OF MEMORY
+===============================
+
+The kernel cannot access more than 256MB of memory directly. The physical layout, however, permits
+up to 3GB of SDRAM (possibly 3.25GB) to be made available. By using CONFIG_HIGHMEM, the kernel can
+allow userspace (by way of page tables) and itself (by way of kmap) to deal with the memory
+allocation.
+
+External devices can, of course, still DMA to and from all of the SDRAM, even if the kernel can't
+see it directly. The kernel translates page references into real addresses for communicating to the
+devices.
+
+
+===================
+PAGE TABLE TOPOLOGY
+===================
+
+The page tables are arranged in 2-layer format. There is a middle layer (PMD) that would be used in
+3-layer format tables but that is folded into the top layer (PGD) and so consumes no extra memory
+or processing power.
+
+ +------+ PGD PMD
+ | TTBR |--->+-------------------+
+ +------+ | | : STE |
+ | PGE0 | PME0 : STE |
+ | | : STE |
+ +-------------------+ Page Table
+ | | : STE -------------->+--------+ +0x0000
+ | PGE1 | PME0 : STE -----------+ | PTE0 |
+ | | : STE -------+ | +--------+
+ +-------------------+ | | | PTE63 |
+ | | : STE | | +-->+--------+ +0x0100
+ | PGE2 | PME0 : STE | | | PTE64 |
+ | | : STE | | +--------+
+ +-------------------+ | | PTE127 |
+ | | : STE | +------>+--------+ +0x0200
+ | PGE3 | PME0 : STE | | PTE128 |
+ | | : STE | +--------+
+ +-------------------+ | PTE191 |
+ +--------+ +0x0300
+
+Each Page Directory (PGD) is 16KB (page size) in size and is divided into 64 entries (PGEs). Each
+PGE contains one Page Mid Directory (PMD).
+
+Each PMD is 256 bytes in size and contains a single entry (PME). Each PME holds 64 FR451 MMU
+segment table entries of 4 bytes apiece. Each PME "points to" a page table. In practice, each STE
+points to a subset of the page table, the first to PT+0x0000, the second to PT+0x0100, the third to
+PT+0x200, and so on.
+
+Each PGE and PME covers 64MB of the total virtual address space.
+
+Each Page Table (PTD) is 16KB (page size) in size, and is divided into 4096 entries (PTEs). Each
+entry can point to one 16KB page. In practice, each Linux page table is subdivided into 64 FR451
+MMU page tables. But they are all grouped together to make management easier, in particular rmap
+support is then trivial.
+
+Grouping page tables in this fashion makes PGE caching in SCR0/SCR1 more efficient because the
+coverage of the cached item is greater.
+
+Page tables for the vmalloc area are allocated at boot time and shared between all mm_structs.
+
+
+=================
+USER SPACE LAYOUT
+=================
+
+For MMU capable Linux, the regions userspace code are allowed to access are kept entirely separate
+from those dedicated to the kernel:
+
+ VIRTUAL ADDRESS SIZE PURPOSE
+ ================= ===== ===================================
+ 00000000-00003fff 4KB NULL pointer access trap
+ 00004000-01ffffff ~32MB lower mmap space (grows up)
+ 02000000-021fffff 2MB Stack space (grows down from top)
+ 02200000-nnnnnnnn Executable mapping
+ nnnnnnnn- brk space (grows up)
+ -bfffffff upper mmap space (grows down)
+
+This is so arranged so as to make best use of the 16KB page tables and the way in which PGEs/PMEs
+are cached by the TLB handler. The lower mmap space is filled first, and then the upper mmap space
+is filled.
+
+
+===============================
+GDB-STUB MMU DEBUGGING SERVICES
+===============================
+
+The gdb-stub included in this kernel provides a number of services to aid in the debugging of MMU
+related kernel services:
+
+ (*) Every time the kernel stops, certain state information is dumped into __debug_mmu. This
+ variable is defined in arch/frv/kernel/gdb-stub.c. Note that the gdbinit file in this
+ directory has some useful macros for dealing with this.
+
+ (*) __debug_mmu.tlb[]
+
+ This receives the current TLB contents. This can be viewed with the _tlb GDB macro:
+
+ (gdb) _tlb
+ tlb[0x00]: 01000005 00718203 01000002 00718203
+ tlb[0x01]: 01004002 006d4201 01004005 006d4203
+ tlb[0x02]: 01008002 006d0201 01008006 00004200
+ tlb[0x03]: 0100c006 007f4202 0100c002 0064c202
+ tlb[0x04]: 01110005 00774201 01110002 00774201
+ tlb[0x05]: 01114005 00770201 01114002 00770201
+ tlb[0x06]: 01118002 0076c201 01118005 0076c201
+ ...
+ tlb[0x3d]: 010f4002 00790200 001f4002 0054ca02
+ tlb[0x3e]: 010f8005 0078c201 010f8002 0078c201
+ tlb[0x3f]: 001fc002 0056ca01 001fc005 00538a01
+
+ (*) __debug_mmu.iamr[]
+ (*) __debug_mmu.damr[]
+
+ These receive the current IAMR and DAMR contents. These can be viewed with the _amr
+ GDB macro:
+
+ (gdb) _amr
+ AMRx DAMR IAMR
+ ==== ===================== =====================
+ amr0 : L:c0000000 P:00000cb9 : L:c0000000 P:000004b9
+ amr1 : L:01070005 P:006f9203 : L:0102c005 P:006a1201
+ amr2 : L:d8d00000 P:00000000 : L:d8d00000 P:00000000
+ amr3 : L:d8d04000 P:00534c0d : L:00000000 P:00000000
+ amr4 : L:d8d08000 P:00554c0d : L:00000000 P:00000000
+ amr5 : L:d8d0c000 P:00554c0d : L:00000000 P:00000000
+ amr6 : L:d8d10000 P:00000000 : L:00000000 P:00000000
+ amr7 : L:d8d14000 P:00000000 : L:00000000 P:00000000
+ amr8 : L:d8d18000 P:00000000
+ amr9 : L:d8d1c000 P:00000000
+ amr10: L:d8d20000 P:00000000
+ amr11: L:e0000000 P:e0000ccd
+
+ (*) The current task's page directory is bound to DAMR3.
+
+ This can be viewed with the _pgd GDB macro:
+
+ (gdb) _pgd
+ $3 = {{pge = {{ste = {0x554001, 0x554101, 0x554201, 0x554301, 0x554401,
+ 0x554501, 0x554601, 0x554701, 0x554801, 0x554901, 0x554a01,
+ 0x554b01, 0x554c01, 0x554d01, 0x554e01, 0x554f01, 0x555001,
+ 0x555101, 0x555201, 0x555301, 0x555401, 0x555501, 0x555601,
+ 0x555701, 0x555801, 0x555901, 0x555a01, 0x555b01, 0x555c01,
+ 0x555d01, 0x555e01, 0x555f01, 0x556001, 0x556101, 0x556201,
+ 0x556301, 0x556401, 0x556501, 0x556601, 0x556701, 0x556801,
+ 0x556901, 0x556a01, 0x556b01, 0x556c01, 0x556d01, 0x556e01,
+ 0x556f01, 0x557001, 0x557101, 0x557201, 0x557301, 0x557401,
+ 0x557501, 0x557601, 0x557701, 0x557801, 0x557901, 0x557a01,
+ 0x557b01, 0x557c01, 0x557d01, 0x557e01, 0x557f01}}}}, {pge = {{
+ ste = {0x0 <repeats 64 times>}}}} <repeats 51 times>, {pge = {{ste = {
+ 0x248001, 0x248101, 0x248201, 0x248301, 0x248401, 0x248501,
+ 0x248601, 0x248701, 0x248801, 0x248901, 0x248a01, 0x248b01,
+ 0x248c01, 0x248d01, 0x248e01, 0x248f01, 0x249001, 0x249101,
+ 0x249201, 0x249301, 0x249401, 0x249501, 0x249601, 0x249701,
+ 0x249801, 0x249901, 0x249a01, 0x249b01, 0x249c01, 0x249d01,
+ 0x249e01, 0x249f01, 0x24a001, 0x24a101, 0x24a201, 0x24a301,
+ 0x24a401, 0x24a501, 0x24a601, 0x24a701, 0x24a801, 0x24a901,
+ 0x24aa01, 0x24ab01, 0x24ac01, 0x24ad01, 0x24ae01, 0x24af01,
+ 0x24b001, 0x24b101, 0x24b201, 0x24b301, 0x24b401, 0x24b501,
+ 0x24b601, 0x24b701, 0x24b801, 0x24b901, 0x24ba01, 0x24bb01,
+ 0x24bc01, 0x24bd01, 0x24be01, 0x24bf01}}}}, {pge = {{ste = {
+ 0x0 <repeats 64 times>}}}} <repeats 11 times>}
+
+ (*) The PTD last used by the instruction TLB miss handler is attached to DAMR4.
+ (*) The PTD last used by the data TLB miss handler is attached to DAMR5.
+
+ These can be viewed with the _ptd_i and _ptd_d GDB macros:
+
+ (gdb) _ptd_d
+ $5 = {{pte = 0x0} <repeats 127 times>, {pte = 0x539b01}, {
+ pte = 0x0} <repeats 896 times>, {pte = 0x719303}, {pte = 0x6d5303}, {
+ pte = 0x0}, {pte = 0x0}, {pte = 0x0}, {pte = 0x0}, {pte = 0x0}, {
+ pte = 0x0}, {pte = 0x0}, {pte = 0x0}, {pte = 0x0}, {pte = 0x6a1303}, {
+ pte = 0x0} <repeats 12 times>, {pte = 0x709303}, {pte = 0x0}, {pte = 0x0},
+ {pte = 0x6fd303}, {pte = 0x6f9303}, {pte = 0x6f5303}, {pte = 0x0}, {
+ pte = 0x6ed303}, {pte = 0x531b01}, {pte = 0x50db01}, {
+ pte = 0x0} <repeats 13 times>, {pte = 0x5303}, {pte = 0x7f5303}, {
+ pte = 0x509b01}, {pte = 0x505b01}, {pte = 0x7c9303}, {pte = 0x7b9303}, {
+ pte = 0x7b5303}, {pte = 0x7b1303}, {pte = 0x7ad303}, {pte = 0x0}, {
+ pte = 0x0}, {pte = 0x7a1303}, {pte = 0x0}, {pte = 0x795303}, {pte = 0x0}, {
+ pte = 0x78d303}, {pte = 0x0}, {pte = 0x0}, {pte = 0x0}, {pte = 0x0}, {
+ pte = 0x0}, {pte = 0x775303}, {pte = 0x771303}, {pte = 0x76d303}, {
+ pte = 0x0}, {pte = 0x765303}, {pte = 0x7c5303}, {pte = 0x501b01}, {
+ pte = 0x4f1b01}, {pte = 0x4edb01}, {pte = 0x0}, {pte = 0x4f9b01}, {
+ pte = 0x4fdb01}, {pte = 0x0} <repeats 2992 times>}
diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt
new file mode 100644
index 0000000..9cc4d68
--- /dev/null
+++ b/Documentation/ftrace.txt
@@ -0,0 +1,1339 @@
+ ftrace - Function Tracer
+ ========================
+
+Copyright 2008 Red Hat Inc.
+ Author: Steven Rostedt <srostedt@redhat.com>
+ License: The GNU Free Documentation License, Version 1.2
+ (dual licensed under the GPL v2)
+Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton,
+ John Kacur, and David Teigland.
+
+Written for: 2.6.28-rc2
+
+Introduction
+------------
+
+Ftrace is an internal tracer designed to help out developers and
+designers of systems to find what is going on inside the kernel.
+It can be used for debugging or analyzing latencies and performance
+issues that take place outside of user-space.
+
+Although ftrace is the function tracer, it also includes an
+infrastructure that allows for other types of tracing. Some of the
+tracers that are currently in ftrace include a tracer to trace
+context switches, the time it takes for a high priority task to
+run after it was woken up, the time interrupts are disabled, and
+more (ftrace allows for tracer plugins, which means that the list of
+tracers can always grow).
+
+
+The File System
+---------------
+
+Ftrace uses the debugfs file system to hold the control files as well
+as the files to display output.
+
+To mount the debugfs system:
+
+ # mkdir /debug
+ # mount -t debugfs nodev /debug
+
+(Note: it is more common to mount at /sys/kernel/debug, but for simplicity
+ this document will use /debug)
+
+That's it! (assuming that you have ftrace configured into your kernel)
+
+After mounting the debugfs, you can see a directory called
+"tracing". This directory contains the control and output files
+of ftrace. Here is a list of some of the key files:
+
+
+ Note: all time values are in microseconds.
+
+ current_tracer: This is used to set or display the current tracer
+ that is configured.
+
+ available_tracers: This holds the different types of tracers that
+ have been compiled into the kernel. The tracers
+ listed here can be configured by echoing their name
+ into current_tracer.
+
+ tracing_enabled: This sets or displays whether the current_tracer
+ is activated and tracing or not. Echo 0 into this
+ file to disable the tracer or 1 to enable it.
+
+ trace: This file holds the output of the trace in a human readable
+ format (described below).
+
+ latency_trace: This file shows the same trace but the information
+ is organized more to display possible latencies
+ in the system (described below).
+
+ trace_pipe: The output is the same as the "trace" file but this
+ file is meant to be streamed with live tracing.
+ Reads from this file will block until new data
+ is retrieved. Unlike the "trace" and "latency_trace"
+ files, this file is a consumer. This means reading
+ from this file causes sequential reads to display
+ more current data. Once data is read from this
+ file, it is consumed, and will not be read
+ again with a sequential read. The "trace" and
+ "latency_trace" files are static, and if the
+ tracer is not adding more data, they will display
+ the same information every time they are read.
+
+ iter_ctrl: This file lets the user control the amount of data
+ that is displayed in one of the above output
+ files.
+
+ trace_max_latency: Some of the tracers record the max latency.
+ For example, the time interrupts are disabled.
+ This time is saved in this file. The max trace
+ will also be stored, and displayed by either
+ "trace" or "latency_trace". A new max trace will
+ only be recorded if the latency is greater than
+ the value in this file. (in microseconds)
+
+ trace_entries: This sets or displays the number of bytes each CPU
+ buffer can hold. The tracer buffers are the same size
+ for each CPU. The displayed number is the size of the
+ CPU buffer and not total size of all buffers. The
+ trace buffers are allocated in pages (blocks of memory
+ that the kernel uses for allocation, usually 4 KB in size).
+ If the last page allocated has room for more bytes
+ than requested, the rest of the page will be used,
+ making the actual allocation bigger than requested.
+ (Note, the size may not be a multiple of the page size due
+ to buffer managment overhead.)
+
+ This can only be updated when the current_tracer
+ is set to "nop".
+
+ tracing_cpumask: This is a mask that lets the user only trace
+ on specified CPUS. The format is a hex string
+ representing the CPUS.
+
+ set_ftrace_filter: When dynamic ftrace is configured in (see the
+ section below "dynamic ftrace"), the code is dynamically
+ modified (code text rewrite) to disable calling of the
+ function profiler (mcount). This lets tracing be configured
+ in with practically no overhead in performance. This also
+ has a side effect of enabling or disabling specific functions
+ to be traced. Echoing names of functions into this file
+ will limit the trace to only those functions.
+
+ set_ftrace_notrace: This has an effect opposite to that of
+ set_ftrace_filter. Any function that is added here will not
+ be traced. If a function exists in both set_ftrace_filter
+ and set_ftrace_notrace, the function will _not_ be traced.
+
+ available_filter_functions: This lists the functions that ftrace
+ has processed and can trace. These are the function
+ names that you can pass to "set_ftrace_filter" or
+ "set_ftrace_notrace". (See the section "dynamic ftrace"
+ below for more details.)
+
+
+The Tracers
+-----------
+
+Here is the list of current tracers that may be configured.
+
+ function - function tracer that uses mcount to trace all functions.
+
+ sched_switch - traces the context switches between tasks.
+
+ irqsoff - traces the areas that disable interrupts and saves
+ the trace with the longest max latency.
+ See tracing_max_latency. When a new max is recorded,
+ it replaces the old trace. It is best to view this
+ trace via the latency_trace file.
+
+ preemptoff - Similar to irqsoff but traces and records the amount of
+ time for which preemption is disabled.
+
+ preemptirqsoff - Similar to irqsoff and preemptoff, but traces and
+ records the largest time for which irqs and/or preemption
+ is disabled.
+
+ wakeup - Traces and records the max latency that it takes for
+ the highest priority task to get scheduled after
+ it has been woken up.
+
+ nop - This is not a tracer. To remove all tracers from tracing
+ simply echo "nop" into current_tracer.
+
+
+Examples of using the tracer
+----------------------------
+
+Here are typical examples of using the tracers when controlling them only
+with the debugfs interface (without using any user-land utilities).
+
+Output format:
+--------------
+
+Here is an example of the output format of the file "trace"
+
+ --------
+# tracer: function
+#
+# TASK-PID CPU# TIMESTAMP FUNCTION
+# | | | | |
+ bash-4251 [01] 10152.583854: path_put <-path_walk
+ bash-4251 [01] 10152.583855: dput <-path_put
+ bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput
+ --------
+
+A header is printed with the tracer name that is represented by the trace.
+In this case the tracer is "function". Then a header showing the format. Task
+name "bash", the task PID "4251", the CPU that it was running on
+"01", the timestamp in <secs>.<usecs> format, the function name that was
+traced "path_put" and the parent function that called this function
+"path_walk". The timestamp is the time at which the function was
+entered.
+
+The sched_switch tracer also includes tracing of task wakeups and
+context switches.
+
+ ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S
+ ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S
+ ksoftirqd/1-7 [01] 1453.070013: 7:115:R ==> 10:115:R
+ events/1-10 [01] 1453.070013: 10:115:S ==> 2916:115:R
+ kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R
+ ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R
+
+Wake ups are represented by a "+" and the context switches are shown as
+"==>". The format is:
+
+ Context switches:
+
+ Previous task Next Task
+
+ <pid>:<prio>:<state> ==> <pid>:<prio>:<state>
+
+ Wake ups:
+
+ Current task Task waking up
+
+ <pid>:<prio>:<state> + <pid>:<prio>:<state>
+
+The prio is the internal kernel priority, which is the inverse of the
+priority that is usually displayed by user-space tools. Zero represents
+the highest priority (99). Prio 100 starts the "nice" priorities with
+100 being equal to nice -20 and 139 being nice 19. The prio "140" is
+reserved for the idle task which is the lowest priority thread (pid 0).
+
+
+Latency trace format
+--------------------
+
+For traces that display latency times, the latency_trace file gives
+somewhat more information to see why a latency happened. Here is a typical
+trace.
+
+# tracer: irqsoff
+#
+irqsoff latency trace v1.1.5 on 2.6.26-rc8
+--------------------------------------------------------------------
+ latency: 97 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
+ -----------------
+ => started at: apic_timer_interrupt
+ => ended at: do_softirq
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ <idle>-0 0d..1 0us+: trace_hardirqs_off_thunk (apic_timer_interrupt)
+ <idle>-0 0d.s. 97us : __do_softirq (do_softirq)
+ <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq)
+
+
+
+This shows that the current tracer is "irqsoff" tracing the time for which
+interrupts were disabled. It gives the trace version and the version
+of the kernel upon which this was executed on (2.6.26-rc8). Then it displays
+the max latency in microsecs (97 us). The number of trace entries displayed
+and the total number recorded (both are three: #3/3). The type of
+preemption that was used (PREEMPT). VP, KP, SP, and HP are always zero
+and are reserved for later use. #P is the number of online CPUS (#P:2).
+
+The task is the process that was running when the latency occurred.
+(swapper pid: 0).
+
+The start and stop (the functions in which the interrupts were disabled and
+enabled respectively) that caused the latencies:
+
+ apic_timer_interrupt is where the interrupts were disabled.
+ do_softirq is where they were enabled again.
+
+The next lines after the header are the trace itself. The header
+explains which is which.
+
+ cmd: The name of the process in the trace.
+
+ pid: The PID of that process.
+
+ CPU#: The CPU which the process was running on.
+
+ irqs-off: 'd' interrupts are disabled. '.' otherwise.
+ Note: If the architecture does not support a way to
+ read the irq flags variable, an 'X' will always
+ be printed here.
+
+ need-resched: 'N' task need_resched is set, '.' otherwise.
+
+ hardirq/softirq:
+ 'H' - hard irq occurred inside a softirq.
+ 'h' - hard irq is running
+ 's' - soft irq is running
+ '.' - normal context.
+
+ preempt-depth: The level of preempt_disabled
+
+The above is mostly meaningful for kernel developers.
+
+ time: This differs from the trace file output. The trace file output
+ includes an absolute timestamp. The timestamp used by the
+ latency_trace file is relative to the start of the trace.
+
+ delay: This is just to help catch your eye a bit better. And
+ needs to be fixed to be only relative to the same CPU.
+ The marks are determined by the difference between this
+ current trace and the next trace.
+ '!' - greater than preempt_mark_thresh (default 100)
+ '+' - greater than 1 microsecond
+ ' ' - less than or equal to 1 microsecond.
+
+ The rest is the same as the 'trace' file.
+
+
+iter_ctrl
+---------
+
+The iter_ctrl file is used to control what gets printed in the trace
+output. To see what is available, simply cat the file:
+
+ cat /debug/tracing/iter_ctrl
+ print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
+ noblock nostacktrace nosched-tree
+
+To disable one of the options, echo in the option prepended with "no".
+
+ echo noprint-parent > /debug/tracing/iter_ctrl
+
+To enable an option, leave off the "no".
+
+ echo sym-offset > /debug/tracing/iter_ctrl
+
+Here are the available options:
+
+ print-parent - On function traces, display the calling function
+ as well as the function being traced.
+
+ print-parent:
+ bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul
+
+ noprint-parent:
+ bash-4000 [01] 1477.606694: simple_strtoul
+
+
+ sym-offset - Display not only the function name, but also the offset
+ in the function. For example, instead of seeing just
+ "ktime_get", you will see "ktime_get+0xb/0x20".
+
+ sym-offset:
+ bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0
+
+ sym-addr - this will also display the function address as well as
+ the function name.
+
+ sym-addr:
+ bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
+
+ verbose - This deals with the latency_trace file.
+
+ bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
+ (+0.000ms): simple_strtoul (strict_strtoul)
+
+ raw - This will display raw numbers. This option is best for use with
+ user applications that can translate the raw numbers better than
+ having it done in the kernel.
+
+ hex - Similar to raw, but the numbers will be in a hexadecimal format.
+
+ bin - This will print out the formats in raw binary.
+
+ block - TBD (needs update)
+
+ stacktrace - This is one of the options that changes the trace itself.
+ When a trace is recorded, so is the stack of functions.
+ This allows for back traces of trace sites.
+
+ sched-tree - TBD (any users??)
+
+
+sched_switch
+------------
+
+This tracer simply records schedule switches. Here is an example
+of how to use it.
+
+ # echo sched_switch > /debug/tracing/current_tracer
+ # echo 1 > /debug/tracing/tracing_enabled
+ # sleep 1
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/trace
+
+# tracer: sched_switch
+#
+# TASK-PID CPU# TIMESTAMP FUNCTION
+# | | | | |
+ bash-3997 [01] 240.132281: 3997:120:R + 4055:120:R
+ bash-3997 [01] 240.132284: 3997:120:R ==> 4055:120:R
+ sleep-4055 [01] 240.132371: 4055:120:S ==> 3997:120:R
+ bash-3997 [01] 240.132454: 3997:120:R + 4055:120:S
+ bash-3997 [01] 240.132457: 3997:120:R ==> 4055:120:R
+ sleep-4055 [01] 240.132460: 4055:120:D ==> 3997:120:R
+ bash-3997 [01] 240.132463: 3997:120:R + 4055:120:D
+ bash-3997 [01] 240.132465: 3997:120:R ==> 4055:120:R
+ <idle>-0 [00] 240.132589: 0:140:R + 4:115:S
+ <idle>-0 [00] 240.132591: 0:140:R ==> 4:115:R
+ ksoftirqd/0-4 [00] 240.132595: 4:115:S ==> 0:140:R
+ <idle>-0 [00] 240.132598: 0:140:R + 4:115:S
+ <idle>-0 [00] 240.132599: 0:140:R ==> 4:115:R
+ ksoftirqd/0-4 [00] 240.132603: 4:115:S ==> 0:140:R
+ sleep-4055 [01] 240.133058: 4055:120:S ==> 3997:120:R
+ [...]
+
+
+As we have discussed previously about this format, the header shows
+the name of the trace and points to the options. The "FUNCTION"
+is a misnomer since here it represents the wake ups and context
+switches.
+
+The sched_switch file only lists the wake ups (represented with '+')
+and context switches ('==>') with the previous task or current task
+first followed by the next task or task waking up. The format for both
+of these is PID:KERNEL-PRIO:TASK-STATE. Remember that the KERNEL-PRIO
+is the inverse of the actual priority with zero (0) being the highest
+priority and the nice values starting at 100 (nice -20). Below is
+a quick chart to map the kernel priority to user land priorities.
+
+ Kernel priority: 0 to 99 ==> user RT priority 99 to 0
+ Kernel priority: 100 to 139 ==> user nice -20 to 19
+ Kernel priority: 140 ==> idle task priority
+
+The task states are:
+
+ R - running : wants to run, may not actually be running
+ S - sleep : process is waiting to be woken up (handles signals)
+ D - disk sleep (uninterruptible sleep) : process must be woken up
+ (ignores signals)
+ T - stopped : process suspended
+ t - traced : process is being traced (with something like gdb)
+ Z - zombie : process waiting to be cleaned up
+ X - unknown
+
+
+ftrace_enabled
+--------------
+
+The following tracers (listed below) give different output depending
+on whether or not the sysctl ftrace_enabled is set. To set ftrace_enabled,
+one can either use the sysctl function or set it via the proc
+file system interface.
+
+ sysctl kernel.ftrace_enabled=1
+
+ or
+
+ echo 1 > /proc/sys/kernel/ftrace_enabled
+
+To disable ftrace_enabled simply replace the '1' with '0' in
+the above commands.
+
+When ftrace_enabled is set the tracers will also record the functions
+that are within the trace. The descriptions of the tracers
+will also show an example with ftrace enabled.
+
+
+irqsoff
+-------
+
+When interrupts are disabled, the CPU can not react to any other
+external event (besides NMIs and SMIs). This prevents the timer
+interrupt from triggering or the mouse interrupt from letting the
+kernel know of a new mouse event. The result is a latency with the
+reaction time.
+
+The irqsoff tracer tracks the time for which interrupts are disabled.
+When a new maximum latency is hit, the tracer saves the trace leading up
+to that latency point so that every time a new maximum is reached, the old
+saved trace is discarded and the new trace is saved.
+
+To reset the maximum, echo 0 into tracing_max_latency. Here is an
+example:
+
+ # echo irqsoff > /debug/tracing/current_tracer
+ # echo 0 > /debug/tracing/tracing_max_latency
+ # echo 1 > /debug/tracing/tracing_enabled
+ # ls -ltr
+ [...]
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/latency_trace
+# tracer: irqsoff
+#
+irqsoff latency trace v1.1.5 on 2.6.26
+--------------------------------------------------------------------
+ latency: 12 us, #3/3, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: bash-3730 (uid:0 nice:0 policy:0 rt_prio:0)
+ -----------------
+ => started at: sys_setpgid
+ => ended at: sys_setpgid
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ bash-3730 1d... 0us : _write_lock_irq (sys_setpgid)
+ bash-3730 1d..1 1us+: _write_unlock_irq (sys_setpgid)
+ bash-3730 1d..2 14us : trace_hardirqs_on (sys_setpgid)
+
+
+Here we see that that we had a latency of 12 microsecs (which is
+very good). The _write_lock_irq in sys_setpgid disabled interrupts.
+The difference between the 12 and the displayed timestamp 14us occurred
+because the clock was incremented between the time of recording the max
+latency and the time of recording the function that had that latency.
+
+Note the above example had ftrace_enabled not set. If we set the
+ftrace_enabled, we get a much larger output:
+
+# tracer: irqsoff
+#
+irqsoff latency trace v1.1.5 on 2.6.26-rc8
+--------------------------------------------------------------------
+ latency: 50 us, #101/101, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: ls-4339 (uid:0 nice:0 policy:0 rt_prio:0)
+ -----------------
+ => started at: __alloc_pages_internal
+ => ended at: __alloc_pages_internal
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ ls-4339 0...1 0us+: get_page_from_freelist (__alloc_pages_internal)
+ ls-4339 0d..1 3us : rmqueue_bulk (get_page_from_freelist)
+ ls-4339 0d..1 3us : _spin_lock (rmqueue_bulk)
+ ls-4339 0d..1 4us : add_preempt_count (_spin_lock)
+ ls-4339 0d..2 4us : __rmqueue (rmqueue_bulk)
+ ls-4339 0d..2 5us : __rmqueue_smallest (__rmqueue)
+ ls-4339 0d..2 5us : __mod_zone_page_state (__rmqueue_smallest)
+ ls-4339 0d..2 6us : __rmqueue (rmqueue_bulk)
+ ls-4339 0d..2 6us : __rmqueue_smallest (__rmqueue)
+ ls-4339 0d..2 7us : __mod_zone_page_state (__rmqueue_smallest)
+ ls-4339 0d..2 7us : __rmqueue (rmqueue_bulk)
+ ls-4339 0d..2 8us : __rmqueue_smallest (__rmqueue)
+[...]
+ ls-4339 0d..2 46us : __rmqueue_smallest (__rmqueue)
+ ls-4339 0d..2 47us : __mod_zone_page_state (__rmqueue_smallest)
+ ls-4339 0d..2 47us : __rmqueue (rmqueue_bulk)
+ ls-4339 0d..2 48us : __rmqueue_smallest (__rmqueue)
+ ls-4339 0d..2 48us : __mod_zone_page_state (__rmqueue_smallest)
+ ls-4339 0d..2 49us : _spin_unlock (rmqueue_bulk)
+ ls-4339 0d..2 49us : sub_preempt_count (_spin_unlock)
+ ls-4339 0d..1 50us : get_page_from_freelist (__alloc_pages_internal)
+ ls-4339 0d..2 51us : trace_hardirqs_on (__alloc_pages_internal)
+
+
+
+Here we traced a 50 microsecond latency. But we also see all the
+functions that were called during that time. Note that by enabling
+function tracing, we incur an added overhead. This overhead may
+extend the latency times. But nevertheless, this trace has provided
+some very helpful debugging information.
+
+
+preemptoff
+----------
+
+When preemption is disabled, we may be able to receive interrupts but
+the task cannot be preempted and a higher priority task must wait
+for preemption to be enabled again before it can preempt a lower
+priority task.
+
+The preemptoff tracer traces the places that disable preemption.
+Like the irqsoff tracer, it records the maximum latency for which preemption
+was disabled. The control of preemptoff tracer is much like the irqsoff
+tracer.
+
+ # echo preemptoff > /debug/tracing/current_tracer
+ # echo 0 > /debug/tracing/tracing_max_latency
+ # echo 1 > /debug/tracing/tracing_enabled
+ # ls -ltr
+ [...]
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/latency_trace
+# tracer: preemptoff
+#
+preemptoff latency trace v1.1.5 on 2.6.26-rc8
+--------------------------------------------------------------------
+ latency: 29 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
+ -----------------
+ => started at: do_IRQ
+ => ended at: __do_softirq
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ sshd-4261 0d.h. 0us+: irq_enter (do_IRQ)
+ sshd-4261 0d.s. 29us : _local_bh_enable (__do_softirq)
+ sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq)
+
+
+This has some more changes. Preemption was disabled when an interrupt
+came in (notice the 'h'), and was enabled while doing a softirq.
+(notice the 's'). But we also see that interrupts have been disabled
+when entering the preempt off section and leaving it (the 'd').
+We do not know if interrupts were enabled in the mean time.
+
+# tracer: preemptoff
+#
+preemptoff latency trace v1.1.5 on 2.6.26-rc8
+--------------------------------------------------------------------
+ latency: 63 us, #87/87, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
+ -----------------
+ => started at: remove_wait_queue
+ => ended at: __do_softirq
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ sshd-4261 0d..1 0us : _spin_lock_irqsave (remove_wait_queue)
+ sshd-4261 0d..1 1us : _spin_unlock_irqrestore (remove_wait_queue)
+ sshd-4261 0d..1 2us : do_IRQ (common_interrupt)
+ sshd-4261 0d..1 2us : irq_enter (do_IRQ)
+ sshd-4261 0d..1 2us : idle_cpu (irq_enter)
+ sshd-4261 0d..1 3us : add_preempt_count (irq_enter)
+ sshd-4261 0d.h1 3us : idle_cpu (irq_enter)
+ sshd-4261 0d.h. 4us : handle_fasteoi_irq (do_IRQ)
+[...]
+ sshd-4261 0d.h. 12us : add_preempt_count (_spin_lock)
+ sshd-4261 0d.h1 12us : ack_ioapic_quirk_irq (handle_fasteoi_irq)
+ sshd-4261 0d.h1 13us : move_native_irq (ack_ioapic_quirk_irq)
+ sshd-4261 0d.h1 13us : _spin_unlock (handle_fasteoi_irq)
+ sshd-4261 0d.h1 14us : sub_preempt_count (_spin_unlock)
+ sshd-4261 0d.h1 14us : irq_exit (do_IRQ)
+ sshd-4261 0d.h1 15us : sub_preempt_count (irq_exit)
+ sshd-4261 0d..2 15us : do_softirq (irq_exit)
+ sshd-4261 0d... 15us : __do_softirq (do_softirq)
+ sshd-4261 0d... 16us : __local_bh_disable (__do_softirq)
+ sshd-4261 0d... 16us+: add_preempt_count (__local_bh_disable)
+ sshd-4261 0d.s4 20us : add_preempt_count (__local_bh_disable)
+ sshd-4261 0d.s4 21us : sub_preempt_count (local_bh_enable)
+ sshd-4261 0d.s5 21us : sub_preempt_count (local_bh_enable)
+[...]
+ sshd-4261 0d.s6 41us : add_preempt_count (__local_bh_disable)
+ sshd-4261 0d.s6 42us : sub_preempt_count (local_bh_enable)
+ sshd-4261 0d.s7 42us : sub_preempt_count (local_bh_enable)
+ sshd-4261 0d.s5 43us : add_preempt_count (__local_bh_disable)
+ sshd-4261 0d.s5 43us : sub_preempt_count (local_bh_enable_ip)
+ sshd-4261 0d.s6 44us : sub_preempt_count (local_bh_enable_ip)
+ sshd-4261 0d.s5 44us : add_preempt_count (__local_bh_disable)
+ sshd-4261 0d.s5 45us : sub_preempt_count (local_bh_enable)
+[...]
+ sshd-4261 0d.s. 63us : _local_bh_enable (__do_softirq)
+ sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq)
+
+
+The above is an example of the preemptoff trace with ftrace_enabled
+set. Here we see that interrupts were disabled the entire time.
+The irq_enter code lets us know that we entered an interrupt 'h'.
+Before that, the functions being traced still show that it is not
+in an interrupt, but we can see from the functions themselves that
+this is not the case.
+
+Notice that __do_softirq when called does not have a preempt_count.
+It may seem that we missed a preempt enabling. What really happened
+is that the preempt count is held on the thread's stack and we
+switched to the softirq stack (4K stacks in effect). The code
+does not copy the preempt count, but because interrupts are disabled,
+we do not need to worry about it. Having a tracer like this is good
+for letting people know what really happens inside the kernel.
+
+
+preemptirqsoff
+--------------
+
+Knowing the locations that have interrupts disabled or preemption
+disabled for the longest times is helpful. But sometimes we would
+like to know when either preemption and/or interrupts are disabled.
+
+Consider the following code:
+
+ local_irq_disable();
+ call_function_with_irqs_off();
+ preempt_disable();
+ call_function_with_irqs_and_preemption_off();
+ local_irq_enable();
+ call_function_with_preemption_off();
+ preempt_enable();
+
+The irqsoff tracer will record the total length of
+call_function_with_irqs_off() and
+call_function_with_irqs_and_preemption_off().
+
+The preemptoff tracer will record the total length of
+call_function_with_irqs_and_preemption_off() and
+call_function_with_preemption_off().
+
+But neither will trace the time that interrupts and/or preemption
+is disabled. This total time is the time that we can not schedule.
+To record this time, use the preemptirqsoff tracer.
+
+Again, using this trace is much like the irqsoff and preemptoff tracers.
+
+ # echo preemptirqsoff > /debug/tracing/current_tracer
+ # echo 0 > /debug/tracing/tracing_max_latency
+ # echo 1 > /debug/tracing/tracing_enabled
+ # ls -ltr
+ [...]
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/latency_trace
+# tracer: preemptirqsoff
+#
+preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
+--------------------------------------------------------------------
+ latency: 293 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: ls-4860 (uid:0 nice:0 policy:0 rt_prio:0)
+ -----------------
+ => started at: apic_timer_interrupt
+ => ended at: __do_softirq
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ ls-4860 0d... 0us!: trace_hardirqs_off_thunk (apic_timer_interrupt)
+ ls-4860 0d.s. 294us : _local_bh_enable (__do_softirq)
+ ls-4860 0d.s1 294us : trace_preempt_on (__do_softirq)
+
+
+
+The trace_hardirqs_off_thunk is called from assembly on x86 when
+interrupts are disabled in the assembly code. Without the function
+tracing, we do not know if interrupts were enabled within the preemption
+points. We do see that it started with preemption enabled.
+
+Here is a trace with ftrace_enabled set:
+
+
+# tracer: preemptirqsoff
+#
+preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
+--------------------------------------------------------------------
+ latency: 105 us, #183/183, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
+ -----------------
+ => started at: write_chan
+ => ended at: __do_softirq
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ ls-4473 0.N.. 0us : preempt_schedule (write_chan)
+ ls-4473 0dN.1 1us : _spin_lock (schedule)
+ ls-4473 0dN.1 2us : add_preempt_count (_spin_lock)
+ ls-4473 0d..2 2us : put_prev_task_fair (schedule)
+[...]
+ ls-4473 0d..2 13us : set_normalized_timespec (ktime_get_ts)
+ ls-4473 0d..2 13us : __switch_to (schedule)
+ sshd-4261 0d..2 14us : finish_task_switch (schedule)
+ sshd-4261 0d..2 14us : _spin_unlock_irq (finish_task_switch)
+ sshd-4261 0d..1 15us : add_preempt_count (_spin_lock_irqsave)
+ sshd-4261 0d..2 16us : _spin_unlock_irqrestore (hrtick_set)
+ sshd-4261 0d..2 16us : do_IRQ (common_interrupt)
+ sshd-4261 0d..2 17us : irq_enter (do_IRQ)
+ sshd-4261 0d..2 17us : idle_cpu (irq_enter)
+ sshd-4261 0d..2 18us : add_preempt_count (irq_enter)
+ sshd-4261 0d.h2 18us : idle_cpu (irq_enter)
+ sshd-4261 0d.h. 18us : handle_fasteoi_irq (do_IRQ)
+ sshd-4261 0d.h. 19us : _spin_lock (handle_fasteoi_irq)
+ sshd-4261 0d.h. 19us : add_preempt_count (_spin_lock)
+ sshd-4261 0d.h1 20us : _spin_unlock (handle_fasteoi_irq)
+ sshd-4261 0d.h1 20us : sub_preempt_count (_spin_unlock)
+[...]
+ sshd-4261 0d.h1 28us : _spin_unlock (handle_fasteoi_irq)
+ sshd-4261 0d.h1 29us : sub_preempt_count (_spin_unlock)
+ sshd-4261 0d.h2 29us : irq_exit (do_IRQ)
+ sshd-4261 0d.h2 29us : sub_preempt_count (irq_exit)
+ sshd-4261 0d..3 30us : do_softirq (irq_exit)
+ sshd-4261 0d... 30us : __do_softirq (do_softirq)
+ sshd-4261 0d... 31us : __local_bh_disable (__do_softirq)
+ sshd-4261 0d... 31us+: add_preempt_count (__local_bh_disable)
+ sshd-4261 0d.s4 34us : add_preempt_count (__local_bh_disable)
+[...]
+ sshd-4261 0d.s3 43us : sub_preempt_count (local_bh_enable_ip)
+ sshd-4261 0d.s4 44us : sub_preempt_count (local_bh_enable_ip)
+ sshd-4261 0d.s3 44us : smp_apic_timer_interrupt (apic_timer_interrupt)
+ sshd-4261 0d.s3 45us : irq_enter (smp_apic_timer_interrupt)
+ sshd-4261 0d.s3 45us : idle_cpu (irq_enter)
+ sshd-4261 0d.s3 46us : add_preempt_count (irq_enter)
+ sshd-4261 0d.H3 46us : idle_cpu (irq_enter)
+ sshd-4261 0d.H3 47us : hrtimer_interrupt (smp_apic_timer_interrupt)
+ sshd-4261 0d.H3 47us : ktime_get (hrtimer_interrupt)
+[...]
+ sshd-4261 0d.H3 81us : tick_program_event (hrtimer_interrupt)
+ sshd-4261 0d.H3 82us : ktime_get (tick_program_event)
+ sshd-4261 0d.H3 82us : ktime_get_ts (ktime_get)
+ sshd-4261 0d.H3 83us : getnstimeofday (ktime_get_ts)
+ sshd-4261 0d.H3 83us : set_normalized_timespec (ktime_get_ts)
+ sshd-4261 0d.H3 84us : clockevents_program_event (tick_program_event)
+ sshd-4261 0d.H3 84us : lapic_next_event (clockevents_program_event)
+ sshd-4261 0d.H3 85us : irq_exit (smp_apic_timer_interrupt)
+ sshd-4261 0d.H3 85us : sub_preempt_count (irq_exit)
+ sshd-4261 0d.s4 86us : sub_preempt_count (irq_exit)
+ sshd-4261 0d.s3 86us : add_preempt_count (__local_bh_disable)
+[...]
+ sshd-4261 0d.s1 98us : sub_preempt_count (net_rx_action)
+ sshd-4261 0d.s. 99us : add_preempt_count (_spin_lock_irq)
+ sshd-4261 0d.s1 99us+: _spin_unlock_irq (run_timer_softirq)
+ sshd-4261 0d.s. 104us : _local_bh_enable (__do_softirq)
+ sshd-4261 0d.s. 104us : sub_preempt_count (_local_bh_enable)
+ sshd-4261 0d.s. 105us : _local_bh_enable (__do_softirq)
+ sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq)
+
+
+This is a very interesting trace. It started with the preemption of
+the ls task. We see that the task had the "need_resched" bit set
+via the 'N' in the trace. Interrupts were disabled before the spin_lock
+at the beginning of the trace. We see that a schedule took place to run
+sshd. When the interrupts were enabled, we took an interrupt.
+On return from the interrupt handler, the softirq ran. We took another
+interrupt while running the softirq as we see from the capital 'H'.
+
+
+wakeup
+------
+
+In a Real-Time environment it is very important to know the wakeup
+time it takes for the highest priority task that is woken up to the
+time that it executes. This is also known as "schedule latency".
+I stress the point that this is about RT tasks. It is also important
+to know the scheduling latency of non-RT tasks, but the average
+schedule latency is better for non-RT tasks. Tools like
+LatencyTop are more appropriate for such measurements.
+
+Real-Time environments are interested in the worst case latency.
+That is the longest latency it takes for something to happen, and
+not the average. We can have a very fast scheduler that may only
+have a large latency once in a while, but that would not work well
+with Real-Time tasks. The wakeup tracer was designed to record
+the worst case wakeups of RT tasks. Non-RT tasks are not recorded
+because the tracer only records one worst case and tracing non-RT
+tasks that are unpredictable will overwrite the worst case latency
+of RT tasks.
+
+Since this tracer only deals with RT tasks, we will run this slightly
+differently than we did with the previous tracers. Instead of performing
+an 'ls', we will run 'sleep 1' under 'chrt' which changes the
+priority of the task.
+
+ # echo wakeup > /debug/tracing/current_tracer
+ # echo 0 > /debug/tracing/tracing_max_latency
+ # echo 1 > /debug/tracing/tracing_enabled
+ # chrt -f 5 sleep 1
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/latency_trace
+# tracer: wakeup
+#
+wakeup latency trace v1.1.5 on 2.6.26-rc8
+--------------------------------------------------------------------
+ latency: 4 us, #2/2, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: sleep-4901 (uid:0 nice:0 policy:1 rt_prio:5)
+ -----------------
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ <idle>-0 1d.h4 0us+: try_to_wake_up (wake_up_process)
+ <idle>-0 1d..4 4us : schedule (cpu_idle)
+
+
+
+Running this on an idle system, we see that it only took 4 microseconds
+to perform the task switch. Note, since the trace marker in the
+schedule is before the actual "switch", we stop the tracing when
+the recorded task is about to schedule in. This may change if
+we add a new marker at the end of the scheduler.
+
+Notice that the recorded task is 'sleep' with the PID of 4901 and it
+has an rt_prio of 5. This priority is user-space priority and not
+the internal kernel priority. The policy is 1 for SCHED_FIFO and 2
+for SCHED_RR.
+
+Doing the same with chrt -r 5 and ftrace_enabled set.
+
+# tracer: wakeup
+#
+wakeup latency trace v1.1.5 on 2.6.26-rc8
+--------------------------------------------------------------------
+ latency: 50 us, #60/60, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
+ -----------------
+ | task: sleep-4068 (uid:0 nice:0 policy:2 rt_prio:5)
+ -----------------
+
+# _------=> CPU#
+# / _-----=> irqs-off
+# | / _----=> need-resched
+# || / _---=> hardirq/softirq
+# ||| / _--=> preempt-depth
+# |||| /
+# ||||| delay
+# cmd pid ||||| time | caller
+# \ / ||||| \ | /
+ksoftirq-7 1d.H3 0us : try_to_wake_up (wake_up_process)
+ksoftirq-7 1d.H4 1us : sub_preempt_count (marker_probe_cb)
+ksoftirq-7 1d.H3 2us : check_preempt_wakeup (try_to_wake_up)
+ksoftirq-7 1d.H3 3us : update_curr (check_preempt_wakeup)
+ksoftirq-7 1d.H3 4us : calc_delta_mine (update_curr)
+ksoftirq-7 1d.H3 5us : __resched_task (check_preempt_wakeup)
+ksoftirq-7 1d.H3 6us : task_wake_up_rt (try_to_wake_up)
+ksoftirq-7 1d.H3 7us : _spin_unlock_irqrestore (try_to_wake_up)
+[...]
+ksoftirq-7 1d.H2 17us : irq_exit (smp_apic_timer_interrupt)
+ksoftirq-7 1d.H2 18us : sub_preempt_count (irq_exit)
+ksoftirq-7 1d.s3 19us : sub_preempt_count (irq_exit)
+ksoftirq-7 1..s2 20us : rcu_process_callbacks (__do_softirq)
+[...]
+ksoftirq-7 1..s2 26us : __rcu_process_callbacks (rcu_process_callbacks)
+ksoftirq-7 1d.s2 27us : _local_bh_enable (__do_softirq)
+ksoftirq-7 1d.s2 28us : sub_preempt_count (_local_bh_enable)
+ksoftirq-7 1.N.3 29us : sub_preempt_count (ksoftirqd)
+ksoftirq-7 1.N.2 30us : _cond_resched (ksoftirqd)
+ksoftirq-7 1.N.2 31us : __cond_resched (_cond_resched)
+ksoftirq-7 1.N.2 32us : add_preempt_count (__cond_resched)
+ksoftirq-7 1.N.2 33us : schedule (__cond_resched)
+ksoftirq-7 1.N.2 33us : add_preempt_count (schedule)
+ksoftirq-7 1.N.3 34us : hrtick_clear (schedule)
+ksoftirq-7 1dN.3 35us : _spin_lock (schedule)
+ksoftirq-7 1dN.3 36us : add_preempt_count (_spin_lock)
+ksoftirq-7 1d..4 37us : put_prev_task_fair (schedule)
+ksoftirq-7 1d..4 38us : update_curr (put_prev_task_fair)
+[...]
+ksoftirq-7 1d..5 47us : _spin_trylock (tracing_record_cmdline)
+ksoftirq-7 1d..5 48us : add_preempt_count (_spin_trylock)
+ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline)
+ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock)
+ksoftirq-7 1d..4 50us : schedule (__cond_resched)
+
+The interrupt went off while running ksoftirqd. This task runs at
+SCHED_OTHER. Why did not we see the 'N' set early? This may be
+a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K stacks
+configured, the interrupt and softirq run with their own stack.
+Some information is held on the top of the task's stack (need_resched
+and preempt_count are both stored there). The setting of the NEED_RESCHED
+bit is done directly to the task's stack, but the reading of the
+NEED_RESCHED is done by looking at the current stack, which in this case
+is the stack for the hard interrupt. This hides the fact that NEED_RESCHED
+has been set. We do not see the 'N' until we switch back to the task's
+assigned stack.
+
+function
+--------
+
+This tracer is the function tracer. Enabling the function tracer
+can be done from the debug file system. Make sure the ftrace_enabled is
+set; otherwise this tracer is a nop.
+
+ # sysctl kernel.ftrace_enabled=1
+ # echo function > /debug/tracing/current_tracer
+ # echo 1 > /debug/tracing/tracing_enabled
+ # usleep 1
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/trace
+# tracer: function
+#
+# TASK-PID CPU# TIMESTAMP FUNCTION
+# | | | | |
+ bash-4003 [00] 123.638713: finish_task_switch <-schedule
+ bash-4003 [00] 123.638714: _spin_unlock_irq <-finish_task_switch
+ bash-4003 [00] 123.638714: sub_preempt_count <-_spin_unlock_irq
+ bash-4003 [00] 123.638715: hrtick_set <-schedule
+ bash-4003 [00] 123.638715: _spin_lock_irqsave <-hrtick_set
+ bash-4003 [00] 123.638716: add_preempt_count <-_spin_lock_irqsave
+ bash-4003 [00] 123.638716: _spin_unlock_irqrestore <-hrtick_set
+ bash-4003 [00] 123.638717: sub_preempt_count <-_spin_unlock_irqrestore
+ bash-4003 [00] 123.638717: hrtick_clear <-hrtick_set
+ bash-4003 [00] 123.638718: sub_preempt_count <-schedule
+ bash-4003 [00] 123.638718: sub_preempt_count <-preempt_schedule
+ bash-4003 [00] 123.638719: wait_for_completion <-__stop_machine_run
+ bash-4003 [00] 123.638719: wait_for_common <-wait_for_completion
+ bash-4003 [00] 123.638720: _spin_lock_irq <-wait_for_common
+ bash-4003 [00] 123.638720: add_preempt_count <-_spin_lock_irq
+[...]
+
+
+Note: function tracer uses ring buffers to store the above entries.
+The newest data may overwrite the oldest data. Sometimes using echo to
+stop the trace is not sufficient because the tracing could have overwritten
+the data that you wanted to record. For this reason, it is sometimes better to
+disable tracing directly from a program. This allows you to stop the
+tracing at the point that you hit the part that you are interested in.
+To disable the tracing directly from a C program, something like following
+code snippet can be used:
+
+int trace_fd;
+[...]
+int main(int argc, char *argv[]) {
+ [...]
+ trace_fd = open("/debug/tracing/tracing_enabled", O_WRONLY);
+ [...]
+ if (condition_hit()) {
+ write(trace_fd, "0", 1);
+ }
+ [...]
+}
+
+Note: Here we hard coded the path name. The debugfs mount is not
+guaranteed to be at /debug (and is more commonly at /sys/kernel/debug).
+For simple one time traces, the above is sufficent. For anything else,
+a search through /proc/mounts may be needed to find where the debugfs
+file-system is mounted.
+
+dynamic ftrace
+--------------
+
+If CONFIG_DYNAMIC_FTRACE is set, the system will run with
+virtually no overhead when function tracing is disabled. The way
+this works is the mcount function call (placed at the start of
+every kernel function, produced by the -pg switch in gcc), starts
+of pointing to a simple return. (Enabling FTRACE will include the
+-pg switch in the compiling of the kernel.)
+
+At compile time every C file object is run through the
+recordmcount.pl script (located in the scripts directory). This
+script will process the C object using objdump to find all the
+locations in the .text section that call mcount. (Note, only
+the .text section is processed, since processing other sections
+like .init.text may cause races due to those sections being freed).
+
+A new section called "__mcount_loc" is created that holds references
+to all the mcount call sites in the .text section. This section is
+compiled back into the original object. The final linker will add
+all these references into a single table.
+
+On boot up, before SMP is initialized, the dynamic ftrace code
+scans this table and updates all the locations into nops. It also
+records the locations, which are added to the available_filter_functions
+list. Modules are processed as they are loaded and before they are
+executed. When a module is unloaded, it also removes its functions from
+the ftrace function list. This is automatic in the module unload
+code, and the module author does not need to worry about it.
+
+When tracing is enabled, kstop_machine is called to prevent races
+with the CPUS executing code being modified (which can cause the
+CPU to do undesireable things), and the nops are patched back
+to calls. But this time, they do not call mcount (which is just
+a function stub). They now call into the ftrace infrastructure.
+
+One special side-effect to the recording of the functions being
+traced is that we can now selectively choose which functions we
+wish to trace and which ones we want the mcount calls to remain as
+nops.
+
+Two files are used, one for enabling and one for disabling the tracing
+of specified functions. They are:
+
+ set_ftrace_filter
+
+and
+
+ set_ftrace_notrace
+
+A list of available functions that you can add to these files is listed
+in:
+
+ available_filter_functions
+
+ # cat /debug/tracing/available_filter_functions
+put_prev_task_idle
+kmem_cache_create
+pick_next_task_rt
+get_online_cpus
+pick_next_task_fair
+mutex_lock
+[...]
+
+If I am only interested in sys_nanosleep and hrtimer_interrupt:
+
+ # echo sys_nanosleep hrtimer_interrupt \
+ > /debug/tracing/set_ftrace_filter
+ # echo ftrace > /debug/tracing/current_tracer
+ # echo 1 > /debug/tracing/tracing_enabled
+ # usleep 1
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/trace
+# tracer: ftrace
+#
+# TASK-PID CPU# TIMESTAMP FUNCTION
+# | | | | |
+ usleep-4134 [00] 1317.070017: hrtimer_interrupt <-smp_apic_timer_interrupt
+ usleep-4134 [00] 1317.070111: sys_nanosleep <-syscall_call
+ <idle>-0 [00] 1317.070115: hrtimer_interrupt <-smp_apic_timer_interrupt
+
+To see which functions are being traced, you can cat the file:
+
+ # cat /debug/tracing/set_ftrace_filter
+hrtimer_interrupt
+sys_nanosleep
+
+
+Perhaps this is not enough. The filters also allow simple wild cards.
+Only the following are currently available
+
+ <match>* - will match functions that begin with <match>
+ *<match> - will match functions that end with <match>
+ *<match>* - will match functions that have <match> in it
+
+These are the only wild cards which are supported.
+
+ <match>*<match> will not work.
+
+ # echo hrtimer_* > /debug/tracing/set_ftrace_filter
+
+Produces:
+
+# tracer: ftrace
+#
+# TASK-PID CPU# TIMESTAMP FUNCTION
+# | | | | |
+ bash-4003 [00] 1480.611794: hrtimer_init <-copy_process
+ bash-4003 [00] 1480.611941: hrtimer_start <-hrtick_set
+ bash-4003 [00] 1480.611956: hrtimer_cancel <-hrtick_clear
+ bash-4003 [00] 1480.611956: hrtimer_try_to_cancel <-hrtimer_cancel
+ <idle>-0 [00] 1480.612019: hrtimer_get_next_event <-get_next_timer_interrupt
+ <idle>-0 [00] 1480.612025: hrtimer_get_next_event <-get_next_timer_interrupt
+ <idle>-0 [00] 1480.612032: hrtimer_get_next_event <-get_next_timer_interrupt
+ <idle>-0 [00] 1480.612037: hrtimer_get_next_event <-get_next_timer_interrupt
+ <idle>-0 [00] 1480.612382: hrtimer_get_next_event <-get_next_timer_interrupt
+
+
+Notice that we lost the sys_nanosleep.
+
+ # cat /debug/tracing/set_ftrace_filter
+hrtimer_run_queues
+hrtimer_run_pending
+hrtimer_init
+hrtimer_cancel
+hrtimer_try_to_cancel
+hrtimer_forward
+hrtimer_start
+hrtimer_reprogram
+hrtimer_force_reprogram
+hrtimer_get_next_event
+hrtimer_interrupt
+hrtimer_nanosleep
+hrtimer_wakeup
+hrtimer_get_remaining
+hrtimer_get_res
+hrtimer_init_sleeper
+
+
+This is because the '>' and '>>' act just like they do in bash.
+To rewrite the filters, use '>'
+To append to the filters, use '>>'
+
+To clear out a filter so that all functions will be recorded again:
+
+ # echo > /debug/tracing/set_ftrace_filter
+ # cat /debug/tracing/set_ftrace_filter
+ #
+
+Again, now we want to append.
+
+ # echo sys_nanosleep > /debug/tracing/set_ftrace_filter
+ # cat /debug/tracing/set_ftrace_filter
+sys_nanosleep
+ # echo hrtimer_* >> /debug/tracing/set_ftrace_filter
+ # cat /debug/tracing/set_ftrace_filter
+hrtimer_run_queues
+hrtimer_run_pending
+hrtimer_init
+hrtimer_cancel
+hrtimer_try_to_cancel
+hrtimer_forward
+hrtimer_start
+hrtimer_reprogram
+hrtimer_force_reprogram
+hrtimer_get_next_event
+hrtimer_interrupt
+sys_nanosleep
+hrtimer_nanosleep
+hrtimer_wakeup
+hrtimer_get_remaining
+hrtimer_get_res
+hrtimer_init_sleeper
+
+
+The set_ftrace_notrace prevents those functions from being traced.
+
+ # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace
+
+Produces:
+
+# tracer: ftrace
+#
+# TASK-PID CPU# TIMESTAMP FUNCTION
+# | | | | |
+ bash-4043 [01] 115.281644: finish_task_switch <-schedule
+ bash-4043 [01] 115.281645: hrtick_set <-schedule
+ bash-4043 [01] 115.281645: hrtick_clear <-hrtick_set
+ bash-4043 [01] 115.281646: wait_for_completion <-__stop_machine_run
+ bash-4043 [01] 115.281647: wait_for_common <-wait_for_completion
+ bash-4043 [01] 115.281647: kthread_stop <-stop_machine_run
+ bash-4043 [01] 115.281648: init_waitqueue_head <-kthread_stop
+ bash-4043 [01] 115.281648: wake_up_process <-kthread_stop
+ bash-4043 [01] 115.281649: try_to_wake_up <-wake_up_process
+
+We can see that there's no more lock or preempt tracing.
+
+trace_pipe
+----------
+
+The trace_pipe outputs the same content as the trace file, but the effect
+on the tracing is different. Every read from trace_pipe is consumed.
+This means that subsequent reads will be different. The trace
+is live.
+
+ # echo function > /debug/tracing/current_tracer
+ # cat /debug/tracing/trace_pipe > /tmp/trace.out &
+[1] 4153
+ # echo 1 > /debug/tracing/tracing_enabled
+ # usleep 1
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/trace
+# tracer: function
+#
+# TASK-PID CPU# TIMESTAMP FUNCTION
+# | | | | |
+
+ #
+ # cat /tmp/trace.out
+ bash-4043 [00] 41.267106: finish_task_switch <-schedule
+ bash-4043 [00] 41.267106: hrtick_set <-schedule
+ bash-4043 [00] 41.267107: hrtick_clear <-hrtick_set
+ bash-4043 [00] 41.267108: wait_for_completion <-__stop_machine_run
+ bash-4043 [00] 41.267108: wait_for_common <-wait_for_completion
+ bash-4043 [00] 41.267109: kthread_stop <-stop_machine_run
+ bash-4043 [00] 41.267109: init_waitqueue_head <-kthread_stop
+ bash-4043 [00] 41.267110: wake_up_process <-kthread_stop
+ bash-4043 [00] 41.267110: try_to_wake_up <-wake_up_process
+ bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up
+
+
+Note, reading the trace_pipe file will block until more input is added.
+By changing the tracer, trace_pipe will issue an EOF. We needed
+to set the function tracer _before_ we "cat" the trace_pipe file.
+
+
+trace entries
+-------------
+
+Having too much or not enough data can be troublesome in diagnosing
+an issue in the kernel. The file trace_entries is used to modify
+the size of the internal trace buffers. The number listed
+is the number of entries that can be recorded per CPU. To know
+the full size, multiply the number of possible CPUS with the
+number of entries.
+
+ # cat /debug/tracing/trace_entries
+65620
+
+Note, to modify this, you must have tracing completely disabled. To do that,
+echo "nop" into the current_tracer. If the current_tracer is not set
+to "nop", an EINVAL error will be returned.
+
+ # echo nop > /debug/tracing/current_tracer
+ # echo 100000 > /debug/tracing/trace_entries
+ # cat /debug/tracing/trace_entries
+100045
+
+
+Notice that we echoed in 100,000 but the size is 100,045. The entries
+are held in individual pages. It allocates the number of pages it takes
+to fulfill the request. If more entries may fit on the last page
+then they will be added.
+
+ # echo 1 > /debug/tracing/trace_entries
+ # cat /debug/tracing/trace_entries
+85
+
+This shows us that 85 entries can fit in a single page.
+
+The number of pages which will be allocated is limited to a percentage
+of available memory. Allocating too much will produce an error.
+
+ # echo 1000000000000 > /debug/tracing/trace_entries
+-bash: echo: write error: Cannot allocate memory
+ # cat /debug/tracing/trace_entries
+85
+
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
new file mode 100644
index 0000000..b1b9887
--- /dev/null
+++ b/Documentation/gpio.txt
@@ -0,0 +1,570 @@
+GPIO Interfaces
+
+This provides an overview of GPIO access conventions on Linux.
+
+These calls use the gpio_* naming prefix. No other calls should use that
+prefix, or the related __gpio_* prefix.
+
+
+What is a GPIO?
+===============
+A "General Purpose Input/Output" (GPIO) is a flexible software-controlled
+digital signal. They are provided from many kinds of chip, and are familiar
+to Linux developers working with embedded and custom hardware. Each GPIO
+represents a bit connected to a particular pin, or "ball" on Ball Grid Array
+(BGA) packages. Board schematics show which external hardware connects to
+which GPIOs. Drivers can be written generically, so that board setup code
+passes such pin configuration data to drivers.
+
+System-on-Chip (SOC) processors heavily rely on GPIOs. In some cases, every
+non-dedicated pin can be configured as a GPIO; and most chips have at least
+several dozen of them. Programmable logic devices (like FPGAs) can easily
+provide GPIOs; multifunction chips like power managers, and audio codecs
+often have a few such pins to help with pin scarcity on SOCs; and there are
+also "GPIO Expander" chips that connect using the I2C or SPI serial busses.
+Most PC southbridges have a few dozen GPIO-capable pins (with only the BIOS
+firmware knowing how they're used).
+
+The exact capabilities of GPIOs vary between systems. Common options:
+
+ - Output values are writable (high=1, low=0). Some chips also have
+ options about how that value is driven, so that for example only one
+ value might be driven ... supporting "wire-OR" and similar schemes
+ for the other value (notably, "open drain" signaling).
+
+ - Input values are likewise readable (1, 0). Some chips support readback
+ of pins configured as "output", which is very useful in such "wire-OR"
+ cases (to support bidirectional signaling). GPIO controllers may have
+ input de-glitch/debounce logic, sometimes with software controls.
+
+ - Inputs can often be used as IRQ signals, often edge triggered but
+ sometimes level triggered. Such IRQs may be configurable as system
+ wakeup events, to wake the system from a low power state.
+
+ - Usually a GPIO will be configurable as either input or output, as needed
+ by different product boards; single direction ones exist too.
+
+ - Most GPIOs can be accessed while holding spinlocks, but those accessed
+ through a serial bus normally can't. Some systems support both types.
+
+On a given board each GPIO is used for one specific purpose like monitoring
+MMC/SD card insertion/removal, detecting card writeprotect status, driving
+a LED, configuring a transceiver, bitbanging a serial bus, poking a hardware
+watchdog, sensing a switch, and so on.
+
+
+GPIO conventions
+================
+Note that this is called a "convention" because you don't need to do it this
+way, and it's no crime if you don't. There **are** cases where portability
+is not the main issue; GPIOs are often used for the kind of board-specific
+glue logic that may even change between board revisions, and can't ever be
+used on a board that's wired differently. Only least-common-denominator
+functionality can be very portable. Other features are platform-specific,
+and that can be critical for glue logic.
+
+Plus, this doesn't require any implementation framework, just an interface.
+One platform might implement it as simple inline functions accessing chip
+registers; another might implement it by delegating through abstractions
+used for several very different kinds of GPIO controller. (There is some
+optional code supporting such an implementation strategy, described later
+in this document, but drivers acting as clients to the GPIO interface must
+not care how it's implemented.)
+
+That said, if the convention is supported on their platform, drivers should
+use it when possible. Platforms must declare GENERIC_GPIO support in their
+Kconfig (boolean true), and provide an <asm/gpio.h> file. Drivers that can't
+work without standard GPIO calls should have Kconfig entries which depend
+on GENERIC_GPIO. The GPIO calls are available, either as "real code" or as
+optimized-away stubs, when drivers use the include file:
+
+ #include <linux/gpio.h>
+
+If you stick to this convention then it'll be easier for other developers to
+see what your code is doing, and help maintain it.
+
+Note that these operations include I/O barriers on platforms which need to
+use them; drivers don't need to add them explicitly.
+
+
+Identifying GPIOs
+-----------------
+GPIOs are identified by unsigned integers in the range 0..MAX_INT. That
+reserves "negative" numbers for other purposes like marking signals as
+"not available on this board", or indicating faults. Code that doesn't
+touch the underlying hardware treats these integers as opaque cookies.
+
+Platforms define how they use those integers, and usually #define symbols
+for the GPIO lines so that board-specific setup code directly corresponds
+to the relevant schematics. In contrast, drivers should only use GPIO
+numbers passed to them from that setup code, using platform_data to hold
+board-specific pin configuration data (along with other board specific
+data they need). That avoids portability problems.
+
+So for example one platform uses numbers 32-159 for GPIOs; while another
+uses numbers 0..63 with one set of GPIO controllers, 64-79 with another
+type of GPIO controller, and on one particular board 80-95 with an FPGA.
+The numbers need not be contiguous; either of those platforms could also
+use numbers 2000-2063 to identify GPIOs in a bank of I2C GPIO expanders.
+
+If you want to initialize a structure with an invalid GPIO number, use
+some negative number (perhaps "-EINVAL"); that will never be valid. To
+test if a number could reference a GPIO, you may use this predicate:
+
+ int gpio_is_valid(int number);
+
+A number that's not valid will be rejected by calls which may request
+or free GPIOs (see below). Other numbers may also be rejected; for
+example, a number might be valid but unused on a given board.
+
+Whether a platform supports multiple GPIO controllers is currently a
+platform-specific implementation issue.
+
+
+Using GPIOs
+-----------
+One of the first things to do with a GPIO, often in board setup code when
+setting up a platform_device using the GPIO, is mark its direction:
+
+ /* set as input or output, returning 0 or negative errno */
+ int gpio_direction_input(unsigned gpio);
+ int gpio_direction_output(unsigned gpio, int value);
+
+The return value is zero for success, else a negative errno. It should
+be checked, since the get/set calls don't have error returns and since
+misconfiguration is possible. You should normally issue these calls from
+a task context. However, for spinlock-safe GPIOs it's OK to use them
+before tasking is enabled, as part of early board setup.
+
+For output GPIOs, the value provided becomes the initial output value.
+This helps avoid signal glitching during system startup.
+
+For compatibility with legacy interfaces to GPIOs, setting the direction
+of a GPIO implicitly requests that GPIO (see below) if it has not been
+requested already. That compatibility may be removed in the future;
+explicitly requesting GPIOs is strongly preferred.
+
+Setting the direction can fail if the GPIO number is invalid, or when
+that particular GPIO can't be used in that mode. It's generally a bad
+idea to rely on boot firmware to have set the direction correctly, since
+it probably wasn't validated to do more than boot Linux. (Similarly,
+that board setup code probably needs to multiplex that pin as a GPIO,
+and configure pullups/pulldowns appropriately.)
+
+
+Spinlock-Safe GPIO access
+-------------------------
+Most GPIO controllers can be accessed with memory read/write instructions.
+That doesn't need to sleep, and can safely be done from inside IRQ handlers.
+(That includes hardirq contexts on RT kernels.)
+
+Use these calls to access such GPIOs:
+
+ /* GPIO INPUT: return zero or nonzero */
+ int gpio_get_value(unsigned gpio);
+
+ /* GPIO OUTPUT */
+ void gpio_set_value(unsigned gpio, int value);
+
+The values are boolean, zero for low, nonzero for high. When reading the
+value of an output pin, the value returned should be what's seen on the
+pin ... that won't always match the specified output value, because of
+issues including open-drain signaling and output latencies.
+
+The get/set calls have no error returns because "invalid GPIO" should have
+been reported earlier from gpio_direction_*(). However, note that not all
+platforms can read the value of output pins; those that can't should always
+return zero. Also, using these calls for GPIOs that can't safely be accessed
+without sleeping (see below) is an error.
+
+Platform-specific implementations are encouraged to optimize the two
+calls to access the GPIO value in cases where the GPIO number (and for
+output, value) are constant. It's normal for them to need only a couple
+of instructions in such cases (reading or writing a hardware register),
+and not to need spinlocks. Such optimized calls can make bitbanging
+applications a lot more efficient (in both space and time) than spending
+dozens of instructions on subroutine calls.
+
+
+GPIO access that may sleep
+--------------------------
+Some GPIO controllers must be accessed using message based busses like I2C
+or SPI. Commands to read or write those GPIO values require waiting to
+get to the head of a queue to transmit a command and get its response.
+This requires sleeping, which can't be done from inside IRQ handlers.
+
+Platforms that support this type of GPIO distinguish them from other GPIOs
+by returning nonzero from this call (which requires a valid GPIO number,
+either explicitly or implicitly requested):
+
+ int gpio_cansleep(unsigned gpio);
+
+To access such GPIOs, a different set of accessors is defined:
+
+ /* GPIO INPUT: return zero or nonzero, might sleep */
+ int gpio_get_value_cansleep(unsigned gpio);
+
+ /* GPIO OUTPUT, might sleep */
+ void gpio_set_value_cansleep(unsigned gpio, int value);
+
+Other than the fact that these calls might sleep, and will not be ignored
+for GPIOs that can't be accessed from IRQ handlers, these calls act the
+same as the spinlock-safe calls.
+
+
+Claiming and Releasing GPIOs (OPTIONAL)
+---------------------------------------
+To help catch system configuration errors, two calls are defined.
+However, many platforms don't currently support this mechanism.
+
+ /* request GPIO, returning 0 or negative errno.
+ * non-null labels may be useful for diagnostics.
+ */
+ int gpio_request(unsigned gpio, const char *label);
+
+ /* release previously-claimed GPIO */
+ void gpio_free(unsigned gpio);
+
+Passing invalid GPIO numbers to gpio_request() will fail, as will requesting
+GPIOs that have already been claimed with that call. The return value of
+gpio_request() must be checked. You should normally issue these calls from
+a task context. However, for spinlock-safe GPIOs it's OK to request GPIOs
+before tasking is enabled, as part of early board setup.
+
+These calls serve two basic purposes. One is marking the signals which
+are actually in use as GPIOs, for better diagnostics; systems may have
+several hundred potential GPIOs, but often only a dozen are used on any
+given board. Another is to catch conflicts, identifying errors when
+(a) two or more drivers wrongly think they have exclusive use of that
+signal, or (b) something wrongly believes it's safe to remove drivers
+needed to manage a signal that's in active use. That is, requesting a
+GPIO can serve as a kind of lock.
+
+Some platforms may also use knowledge about what GPIOs are active for
+power management, such as by powering down unused chip sectors and, more
+easily, gating off unused clocks.
+
+These two calls are optional because not not all current Linux platforms
+offer such functionality in their GPIO support; a valid implementation
+could return success for all gpio_request() calls. Unlike the other calls,
+the state they represent doesn't normally match anything from a hardware
+register; it's just a software bitmap which clearly is not necessary for
+correct operation of hardware or (bug free) drivers.
+
+Note that requesting a GPIO does NOT cause it to be configured in any
+way; it just marks that GPIO as in use. Separate code must handle any
+pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
+
+Also note that it's your responsibility to have stopped using a GPIO
+before you free it.
+
+
+GPIOs mapped to IRQs
+--------------------
+GPIO numbers are unsigned integers; so are IRQ numbers. These make up
+two logically distinct namespaces (GPIO 0 need not use IRQ 0). You can
+map between them using calls like:
+
+ /* map GPIO numbers to IRQ numbers */
+ int gpio_to_irq(unsigned gpio);
+
+ /* map IRQ numbers to GPIO numbers (avoid using this) */
+ int irq_to_gpio(unsigned irq);
+
+Those return either the corresponding number in the other namespace, or
+else a negative errno code if the mapping can't be done. (For example,
+some GPIOs can't be used as IRQs.) It is an unchecked error to use a GPIO
+number that wasn't set up as an input using gpio_direction_input(), or
+to use an IRQ number that didn't originally come from gpio_to_irq().
+
+These two mapping calls are expected to cost on the order of a single
+addition or subtraction. They're not allowed to sleep.
+
+Non-error values returned from gpio_to_irq() can be passed to request_irq()
+or free_irq(). They will often be stored into IRQ resources for platform
+devices, by the board-specific initialization code. Note that IRQ trigger
+options are part of the IRQ interface, e.g. IRQF_TRIGGER_FALLING, as are
+system wakeup capabilities.
+
+Non-error values returned from irq_to_gpio() would most commonly be used
+with gpio_get_value(), for example to initialize or update driver state
+when the IRQ is edge-triggered. Note that some platforms don't support
+this reverse mapping, so you should avoid using it.
+
+
+Emulating Open Drain Signals
+----------------------------
+Sometimes shared signals need to use "open drain" signaling, where only the
+low signal level is actually driven. (That term applies to CMOS transistors;
+"open collector" is used for TTL.) A pullup resistor causes the high signal
+level. This is sometimes called a "wire-AND"; or more practically, from the
+negative logic (low=true) perspective this is a "wire-OR".
+
+One common example of an open drain signal is a shared active-low IRQ line.
+Also, bidirectional data bus signals sometimes use open drain signals.
+
+Some GPIO controllers directly support open drain outputs; many don't. When
+you need open drain signaling but your hardware doesn't directly support it,
+there's a common idiom you can use to emulate it with any GPIO pin that can
+be used as either an input or an output:
+
+ LOW: gpio_direction_output(gpio, 0) ... this drives the signal
+ and overrides the pullup.
+
+ HIGH: gpio_direction_input(gpio) ... this turns off the output,
+ so the pullup (or some other device) controls the signal.
+
+If you are "driving" the signal high but gpio_get_value(gpio) reports a low
+value (after the appropriate rise time passes), you know some other component
+is driving the shared signal low. That's not necessarily an error. As one
+common example, that's how I2C clocks are stretched: a slave that needs a
+slower clock delays the rising edge of SCK, and the I2C master adjusts its
+signaling rate accordingly.
+
+
+What do these conventions omit?
+===============================
+One of the biggest things these conventions omit is pin multiplexing, since
+this is highly chip-specific and nonportable. One platform might not need
+explicit multiplexing; another might have just two options for use of any
+given pin; another might have eight options per pin; another might be able
+to route a given GPIO to any one of several pins. (Yes, those examples all
+come from systems that run Linux today.)
+
+Related to multiplexing is configuration and enabling of the pullups or
+pulldowns integrated on some platforms. Not all platforms support them,
+or support them in the same way; and any given board might use external
+pullups (or pulldowns) so that the on-chip ones should not be used.
+(When a circuit needs 5 kOhm, on-chip 100 kOhm resistors won't do.)
+Likewise drive strength (2 mA vs 20 mA) and voltage (1.8V vs 3.3V) is a
+platform-specific issue, as are models like (not) having a one-to-one
+correspondence between configurable pins and GPIOs.
+
+There are other system-specific mechanisms that are not specified here,
+like the aforementioned options for input de-glitching and wire-OR output.
+Hardware may support reading or writing GPIOs in gangs, but that's usually
+configuration dependent: for GPIOs sharing the same bank. (GPIOs are
+commonly grouped in banks of 16 or 32, with a given SOC having several such
+banks.) Some systems can trigger IRQs from output GPIOs, or read values
+from pins not managed as GPIOs. Code relying on such mechanisms will
+necessarily be nonportable.
+
+Dynamic definition of GPIOs is not currently standard; for example, as
+a side effect of configuring an add-on board with some GPIO expanders.
+
+
+GPIO implementor's framework (OPTIONAL)
+=======================================
+As noted earlier, there is an optional implementation framework making it
+easier for platforms to support different kinds of GPIO controller using
+the same programming interface. This framework is called "gpiolib".
+
+As a debugging aid, if debugfs is available a /sys/kernel/debug/gpio file
+will be found there. That will list all the controllers registered through
+this framework, and the state of the GPIOs currently in use.
+
+
+Controller Drivers: gpio_chip
+-----------------------------
+In this framework each GPIO controller is packaged as a "struct gpio_chip"
+with information common to each controller of that type:
+
+ - methods to establish GPIO direction
+ - methods used to access GPIO values
+ - flag saying whether calls to its methods may sleep
+ - optional debugfs dump method (showing extra state like pullup config)
+ - label for diagnostics
+
+There is also per-instance data, which may come from device.platform_data:
+the number of its first GPIO, and how many GPIOs it exposes.
+
+The code implementing a gpio_chip should support multiple instances of the
+controller, possibly using the driver model. That code will configure each
+gpio_chip and issue gpiochip_add(). Removing a GPIO controller should be
+rare; use gpiochip_remove() when it is unavoidable.
+
+Most often a gpio_chip is part of an instance-specific structure with state
+not exposed by the GPIO interfaces, such as addressing, power management,
+and more. Chips such as codecs will have complex non-GPIO state,
+
+Any debugfs dump method should normally ignore signals which haven't been
+requested as GPIOs. They can use gpiochip_is_requested(), which returns
+either NULL or the label associated with that GPIO when it was requested.
+
+
+Platform Support
+----------------
+To support this framework, a platform's Kconfig will "select" either
+ARCH_REQUIRE_GPIOLIB or ARCH_WANT_OPTIONAL_GPIOLIB
+and arrange that its <asm/gpio.h> includes <asm-generic/gpio.h> and defines
+three functions: gpio_get_value(), gpio_set_value(), and gpio_cansleep().
+They may also want to provide a custom value for ARCH_NR_GPIOS.
+
+ARCH_REQUIRE_GPIOLIB means that the gpio-lib code will always get compiled
+into the kernel on that architecture.
+
+ARCH_WANT_OPTIONAL_GPIOLIB means the gpio-lib code defaults to off and the user
+can enable it and build it into the kernel optionally.
+
+If neither of these options are selected, the platform does not support
+GPIOs through GPIO-lib and the code cannot be enabled by the user.
+
+Trivial implementations of those functions can directly use framework
+code, which always dispatches through the gpio_chip:
+
+ #define gpio_get_value __gpio_get_value
+ #define gpio_set_value __gpio_set_value
+ #define gpio_cansleep __gpio_cansleep
+
+Fancier implementations could instead define those as inline functions with
+logic optimizing access to specific SOC-based GPIOs. For example, if the
+referenced GPIO is the constant "12", getting or setting its value could
+cost as little as two or three instructions, never sleeping. When such an
+optimization is not possible those calls must delegate to the framework
+code, costing at least a few dozen instructions. For bitbanged I/O, such
+instruction savings can be significant.
+
+For SOCs, platform-specific code defines and registers gpio_chip instances
+for each bank of on-chip GPIOs. Those GPIOs should be numbered/labeled to
+match chip vendor documentation, and directly match board schematics. They
+may well start at zero and go up to a platform-specific limit. Such GPIOs
+are normally integrated into platform initialization to make them always be
+available, from arch_initcall() or earlier; they can often serve as IRQs.
+
+
+Board Support
+-------------
+For external GPIO controllers -- such as I2C or SPI expanders, ASICs, multi
+function devices, FPGAs or CPLDs -- most often board-specific code handles
+registering controller devices and ensures that their drivers know what GPIO
+numbers to use with gpiochip_add(). Their numbers often start right after
+platform-specific GPIOs.
+
+For example, board setup code could create structures identifying the range
+of GPIOs that chip will expose, and passes them to each GPIO expander chip
+using platform_data. Then the chip driver's probe() routine could pass that
+data to gpiochip_add().
+
+Initialization order can be important. For example, when a device relies on
+an I2C-based GPIO, its probe() routine should only be called after that GPIO
+becomes available. That may mean the device should not be registered until
+calls for that GPIO can work. One way to address such dependencies is for
+such gpio_chip controllers to provide setup() and teardown() callbacks to
+board specific code; those board specific callbacks would register devices
+once all the necessary resources are available, and remove them later when
+the GPIO controller device becomes unavailable.
+
+
+Sysfs Interface for Userspace (OPTIONAL)
+========================================
+Platforms which use the "gpiolib" implementors framework may choose to
+configure a sysfs user interface to GPIOs. This is different from the
+debugfs interface, since it provides control over GPIO direction and
+value instead of just showing a gpio state summary. Plus, it could be
+present on production systems without debugging support.
+
+Given approprate hardware documentation for the system, userspace could
+know for example that GPIO #23 controls the write protect line used to
+protect boot loader segments in flash memory. System upgrade procedures
+may need to temporarily remove that protection, first importing a GPIO,
+then changing its output state, then updating the code before re-enabling
+the write protection. In normal use, GPIO #23 would never be touched,
+and the kernel would have no need to know about it.
+
+Again depending on appropriate hardware documentation, on some systems
+userspace GPIO can be used to determine system configuration data that
+standard kernels won't know about. And for some tasks, simple userspace
+GPIO drivers could be all that the system really needs.
+
+Note that standard kernel drivers exist for common "LEDs and Buttons"
+GPIO tasks: "leds-gpio" and "gpio_keys", respectively. Use those
+instead of talking directly to the GPIOs; they integrate with kernel
+frameworks better than your userspace code could.
+
+
+Paths in Sysfs
+--------------
+There are three kinds of entry in /sys/class/gpio:
+
+ - Control interfaces used to get userspace control over GPIOs;
+
+ - GPIOs themselves; and
+
+ - GPIO controllers ("gpio_chip" instances).
+
+That's in addition to standard files including the "device" symlink.
+
+The control interfaces are write-only:
+
+ /sys/class/gpio/
+
+ "export" ... Userspace may ask the kernel to export control of
+ a GPIO to userspace by writing its number to this file.
+
+ Example: "echo 19 > export" will create a "gpio19" node
+ for GPIO #19, if that's not requested by kernel code.
+
+ "unexport" ... Reverses the effect of exporting to userspace.
+
+ Example: "echo 19 > unexport" will remove a "gpio19"
+ node exported using the "export" file.
+
+GPIO signals have paths like /sys/class/gpio/gpio42/ (for GPIO #42)
+and have the following read/write attributes:
+
+ /sys/class/gpio/gpioN/
+
+ "direction" ... reads as either "in" or "out". This value may
+ normally be written. Writing as "out" defaults to
+ initializing the value as low. To ensure glitch free
+ operation, values "low" and "high" may be written to
+ configure the GPIO as an output with that initial value.
+
+ Note that this attribute *will not exist* if the kernel
+ doesn't support changing the direction of a GPIO, or
+ it was exported by kernel code that didn't explicitly
+ allow userspace to reconfigure this GPIO's direction.
+
+ "value" ... reads as either 0 (low) or 1 (high). If the GPIO
+ is configured as an output, this value may be written;
+ any nonzero value is treated as high.
+
+GPIO controllers have paths like /sys/class/gpio/chipchip42/ (for the
+controller implementing GPIOs starting at #42) and have the following
+read-only attributes:
+
+ /sys/class/gpio/gpiochipN/
+
+ "base" ... same as N, the first GPIO managed by this chip
+
+ "label" ... provided for diagnostics (not always unique)
+
+ "ngpio" ... how many GPIOs this manges (N to N + ngpio - 1)
+
+Board documentation should in most cases cover what GPIOs are used for
+what purposes. However, those numbers are not always stable; GPIOs on
+a daughtercard might be different depending on the base board being used,
+or other cards in the stack. In such cases, you may need to use the
+gpiochip nodes (possibly in conjunction with schematics) to determine
+the correct GPIO number to use for a given signal.
+
+
+Exporting from Kernel code
+--------------------------
+Kernel code can explicitly manage exports of GPIOs which have already been
+requested using gpio_request():
+
+ /* export the GPIO to userspace */
+ int gpio_export(unsigned gpio, bool direction_may_change);
+
+ /* reverse gpio_export() */
+ void gpio_unexport();
+
+After a kernel driver requests a GPIO, it may only be made available in
+the sysfs interface by gpio_export(). The driver can control whether the
+signal direction may change. This helps drivers prevent userspace code
+from accidentally clobbering important system state.
+
+This explicit exporting can help with debugging (by making some kinds
+of experiments easier), or can provide an always-there interface that's
+suitable for documenting as part of a board support package.
diff --git a/Documentation/highuid.txt b/Documentation/highuid.txt
new file mode 100644
index 0000000..6bad6f1
--- /dev/null
+++ b/Documentation/highuid.txt
@@ -0,0 +1,77 @@
+Notes on the change from 16-bit UIDs to 32-bit UIDs:
+
+- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t
+ when communicating between user and kernel space in an ioctl or data
+ structure.
+
+- kernel code should use uid_t and gid_t in kernel-private structures and
+ code.
+
+What's left to be done for 32-bit UIDs on all Linux architectures:
+
+- Disk quotas have an interesting limitation that is not related to the
+ maximum UID/GID. They are limited by the maximum file size on the
+ underlying filesystem, because quota records are written at offsets
+ corresponding to the UID in question.
+ Further investigation is needed to see if the quota system can cope
+ properly with huge UIDs. If it can deal with 64-bit file offsets on all
+ architectures, this should not be a problem.
+
+- Decide whether or not to keep backwards compatibility with the system
+ accounting file, or if we should break it as the comments suggest
+ (currently, the old 16-bit UID and GID are still written to disk, and
+ part of the former pad space is used to store separate 32-bit UID and
+ GID)
+
+- Need to validate that OS emulation calls the 16-bit UID
+ compatibility syscalls, if the OS being emulated used 16-bit UIDs, or
+ uses the 32-bit UID system calls properly otherwise.
+
+ This affects at least:
+ iBCS on Intel
+
+ sparc32 emulation on sparc64
+ (need to support whatever new 32-bit UID system calls are added to
+ sparc32)
+
+- Validate that all filesystems behave properly.
+
+ At present, 32-bit UIDs _should_ work for:
+ ext2
+ ufs
+ isofs
+ nfs
+ coda
+ udf
+
+ Ioctl() fixups have been made for:
+ ncpfs
+ smbfs
+
+ Filesystems with simple fixups to prevent 16-bit UID wraparound:
+ minix
+ sysv
+ qnx4
+
+ Other filesystems have not been checked yet.
+
+- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in
+ all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but
+ more are needed. (as well as new user<->kernel data structures)
+
+- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k,
+ sh, and sparc32. Fixing this is probably not that important, but would
+ require adding a new ELF section.
+
+- The ioctl()s used to control the in-kernel NFS server only support
+ 16-bit UIDs on arm, i386, m68k, sh, and sparc32.
+
+- make sure that the UID mapping feature of AX25 networking works properly
+ (it should be safe because it's always used a 32-bit integer to
+ communicate between user and kernel)
+
+
+Chris Wing
+wingc@umich.edu
+
+last updated: January 11, 2000
diff --git a/Documentation/hw_random.txt b/Documentation/hw_random.txt
new file mode 100644
index 0000000..690f525
--- /dev/null
+++ b/Documentation/hw_random.txt
@@ -0,0 +1,90 @@
+Introduction:
+
+ The hw_random framework is software that makes use of a
+ special hardware feature on your CPU or motherboard,
+ a Random Number Generator (RNG). The software has two parts:
+ a core providing the /dev/hw_random character device and its
+ sysfs support, plus a hardware-specific driver that plugs
+ into that core.
+
+ To make the most effective use of these mechanisms, you
+ should download the support software as well. Download the
+ latest version of the "rng-tools" package from the
+ hw_random driver's official Web site:
+
+ http://sourceforge.net/projects/gkernel/
+
+ Those tools use /dev/hw_random to fill the kernel entropy pool,
+ which is used internally and exported by the /dev/urandom and
+ /dev/random special files.
+
+Theory of operation:
+
+ CHARACTER DEVICE. Using the standard open()
+ and read() system calls, you can read random data from
+ the hardware RNG device. This data is NOT CHECKED by any
+ fitness tests, and could potentially be bogus (if the
+ hardware is faulty or has been tampered with). Data is only
+ output if the hardware "has-data" flag is set, but nevertheless
+ a security-conscious person would run fitness tests on the
+ data before assuming it is truly random.
+
+ The rng-tools package uses such tests in "rngd", and lets you
+ run them by hand with a "rngtest" utility.
+
+ /dev/hw_random is char device major 10, minor 183.
+
+ CLASS DEVICE. There is a /sys/class/misc/hw_random node with
+ two unique attributes, "rng_available" and "rng_current". The
+ "rng_available" attribute lists the hardware-specific drivers
+ available, while "rng_current" lists the one which is currently
+ connected to /dev/hw_random. If your system has more than one
+ RNG available, you may change the one used by writing a name from
+ the list in "rng_available" into "rng_current".
+
+==========================================================================
+
+ Hardware driver for Intel/AMD/VIA Random Number Generators (RNG)
+ Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
+ Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
+
+
+About the Intel RNG hardware, from the firmware hub datasheet:
+
+ The Firmware Hub integrates a Random Number Generator (RNG)
+ using thermal noise generated from inherently random quantum
+ mechanical properties of silicon. When not generating new random
+ bits the RNG circuitry will enter a low power state. Intel will
+ provide a binary software driver to give third party software
+ access to our RNG for use as a security feature. At this time,
+ the RNG is only to be used with a system in an OS-present state.
+
+Intel RNG Driver notes:
+
+ * FIXME: support poll(2)
+
+ NOTE: request_mem_region was removed, for two reasons:
+ 1) Only one RNG is supported by this driver, 2) The location
+ used by the RNG is a fixed location in MMIO-addressable memory,
+ 3) users with properly working BIOS e820 handling will always
+ have the region in which the RNG is located reserved, so
+ request_mem_region calls always fail for proper setups.
+ However, for people who use mem=XX, BIOS e820 information is
+ -not- in /proc/iomem, and request_mem_region(RNG_ADDR) can
+ succeed.
+
+Driver details:
+
+ Based on:
+ Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet
+ May 1999 Order Number: 290658-002 R
+
+ Intel 82802 Firmware Hub: Random Number Generator
+ Programmer's Reference Manual
+ December 1999 Order Number: 298029-001 R
+
+ Intel 82802 Firmware HUB Random Number Generator Driver
+ Copyright (c) 2000 Matt Sottek <msottek@quiknet.com>
+
+ Special thanks to Matt Sottek. I did the "guts", he
+ did the "brains" and all the testing.
diff --git a/Documentation/hwmon/abituguru b/Documentation/hwmon/abituguru
new file mode 100644
index 0000000..87ffa0f
--- /dev/null
+++ b/Documentation/hwmon/abituguru
@@ -0,0 +1,92 @@
+Kernel driver abituguru
+=======================
+
+Supported chips:
+ * Abit uGuru revision 1 & 2 (Hardware Monitor part only)
+ Prefix: 'abituguru'
+ Addresses scanned: ISA 0x0E0
+ Datasheet: Not available, this driver is based on reverse engineering.
+ A "Datasheet" has been written based on the reverse engineering it
+ should be available in the same dir as this file under the name
+ abituguru-datasheet.
+ Note:
+ The uGuru is a microcontroller with onboard firmware which programs
+ it to behave as a hwmon IC. There are many different revisions of the
+ firmware and thus effectivly many different revisions of the uGuru.
+ Below is an incomplete list with which revisions are used for which
+ Motherboards:
+ uGuru 1.00 ~ 1.24 (AI7, KV8-MAX3, AN7) (1)
+ uGuru 2.0.0.0 ~ 2.0.4.2 (KV8-PRO)
+ uGuru 2.1.0.0 ~ 2.1.2.8 (AS8, AV8, AA8, AG8, AA8XE, AX8)
+ uGuru 2.2.0.0 ~ 2.2.0.6 (AA8 Fatal1ty)
+ uGuru 2.3.0.0 ~ 2.3.0.9 (AN8)
+ uGuru 3.0.0.0 ~ 3.0.x.x (AW8, AL8, AT8, NI8 SLI, AT8 32X, AN8 32X,
+ AW9D-MAX) (2)
+ 1) For revisions 2 and 3 uGuru's the driver can autodetect the
+ sensortype (Volt or Temp) for bank1 sensors, for revision 1 uGuru's
+ this doesnot always work. For these uGuru's the autodection can
+ be overriden with the bank1_types module param. For all 3 known
+ revison 1 motherboards the correct use of this param is:
+ bank1_types=1,1,0,0,0,0,0,2,0,0,0,0,2,0,0,1
+ You may also need to specify the fan_sensors option for these boards
+ fan_sensors=5
+ 2) There is a seperate abituguru3 driver for these motherboards,
+ the abituguru (without the 3 !) driver will not work on these
+ motherboards (and visa versa)!
+
+Authors:
+ Hans de Goede <j.w.r.degoede@hhs.nl>,
+ (Initial reverse engineering done by Olle Sandberg
+ <ollebull@gmail.com>)
+
+
+Module Parameters
+-----------------
+
+* force: bool Force detection. Note this parameter only causes the
+ detection to be skipped, and thus the insmod to
+ succeed. If the uGuru can't be read the actual hwmon
+ driver will not load and thus no hwmon device will get
+ registered.
+* bank1_types: int[] Bank1 sensortype autodetection override:
+ -1 autodetect (default)
+ 0 volt sensor
+ 1 temp sensor
+ 2 not connected
+* fan_sensors: int Tell the driver how many fan speed sensors there are
+ on your motherboard. Default: 0 (autodetect).
+* pwms: int Tell the driver how many fan speed controls (fan
+ pwms) your motherboard has. Default: 0 (autodetect).
+* verbose: int How verbose should the driver be? (0-3):
+ 0 normal output
+ 1 + verbose error reporting
+ 2 + sensors type probing info (default)
+ 3 + retryable error reporting
+ Default: 2 (the driver is still in the testing phase)
+
+Notice if you need any of the first three options above please insmod the
+driver with verbose set to 3 and mail me <j.w.r.degoede@hhs.nl> the output of:
+dmesg | grep abituguru
+
+
+Description
+-----------
+
+This driver supports the hardware monitoring features of the first and
+second revision of the Abit uGuru chip found on Abit uGuru featuring
+motherboards (most modern Abit motherboards).
+
+The first and second revision of the uGuru chip in reality is a Winbond
+W83L950D in disguise (despite Abit claiming it is "a new microprocessor
+designed by the ABIT Engineers"). Unfortunatly this doesn't help since the
+W83L950D is a generic microcontroller with a custom Abit application running
+on it.
+
+Despite Abit not releasing any information regarding the uGuru, Olle
+Sandberg <ollebull@gmail.com> has managed to reverse engineer the sensor part
+of the uGuru. Without his work this driver would not have been possible.
+
+Known Issues
+------------
+
+The voltage and frequency control parts of the Abit uGuru are not supported.
diff --git a/Documentation/hwmon/abituguru-datasheet b/Documentation/hwmon/abituguru-datasheet
new file mode 100644
index 0000000..aef5a9b
--- /dev/null
+++ b/Documentation/hwmon/abituguru-datasheet
@@ -0,0 +1,312 @@
+uGuru datasheet
+===============
+
+First of all, what I know about uGuru is no fact based on any help, hints or
+datasheet from Abit. The data I have got on uGuru have I assembled through
+my weak knowledge in "backwards engineering".
+And just for the record, you may have noticed uGuru isn't a chip developed by
+Abit, as they claim it to be. It's realy just an microprocessor (uC) created by
+Winbond (W83L950D). And no, reading the manual for this specific uC or
+mailing Windbond for help won't give any usefull data about uGuru, as it is
+the program inside the uC that is responding to calls.
+
+Olle Sandberg <ollebull@gmail.com>, 2005-05-25
+
+
+Original version by Olle Sandberg who did the heavy lifting of the initial
+reverse engineering. This version has been almost fully rewritten for clarity
+and extended with write support and info on more databanks, the write support
+is once again reverse engineered by Olle the additional databanks have been
+reverse engineered by me. I would like to express my thanks to Olle, this
+document and the Linux driver could not have been written without his efforts.
+
+Note: because of the lack of specs only the sensors part of the uGuru is
+described here and not the CPU / RAM / etc voltage & frequency control.
+
+Hans de Goede <j.w.r.degoede@hhs.nl>, 28-01-2006
+
+
+Detection
+=========
+
+As far as known the uGuru is always placed at and using the (ISA) I/O-ports
+0xE0 and 0xE4, so we don't have to scan any port-range, just check what the two
+ports are holding for detection. We will refer to 0xE0 as CMD (command-port)
+and 0xE4 as DATA because Abit refers to them with these names.
+
+If DATA holds 0x00 or 0x08 and CMD holds 0x00 or 0xAC an uGuru could be
+present. We have to check for two different values at data-port, because
+after a reboot uGuru will hold 0x00 here, but if the driver is removed and
+later on attached again data-port will hold 0x08, more about this later.
+
+After wider testing of the Linux kernel driver some variants of the uGuru have
+turned up which will hold 0x00 instead of 0xAC at the CMD port, thus we also
+have to test CMD for two different values. On these uGuru's DATA will initally
+hold 0x09 and will only hold 0x08 after reading CMD first, so CMD must be read
+first!
+
+To be really sure an uGuru is present a test read of one or more register
+sets should be done.
+
+
+Reading / Writing
+=================
+
+Addressing
+----------
+
+The uGuru has a number of different addressing levels. The first addressing
+level we will call banks. A bank holds data for one or more sensors. The data
+in a bank for a sensor is one or more bytes large.
+
+The number of bytes is fixed for a given bank, you should always read or write
+that many bytes, reading / writing more will fail, the results when writing
+less then the number of bytes for a given bank are undetermined.
+
+See below for all known bank addresses, numbers of sensors in that bank,
+number of bytes data per sensor and contents/meaning of those bytes.
+
+Although both this document and the kernel driver have kept the sensor
+terminoligy for the addressing within a bank this is not 100% correct, in
+bank 0x24 for example the addressing within the bank selects a PWM output not
+a sensor.
+
+Notice that some banks have both a read and a write address this is how the
+uGuru determines if a read from or a write to the bank is taking place, thus
+when reading you should always use the read address and when writing the
+write address. The write address is always one (1) more then the read address.
+
+
+uGuru ready
+-----------
+
+Before you can read from or write to the uGuru you must first put the uGuru
+in "ready" mode.
+
+To put the uGuru in ready mode first write 0x00 to DATA and then wait for DATA
+to hold 0x09, DATA should read 0x09 within 250 read cycles.
+
+Next CMD _must_ be read and should hold 0xAC, usually CMD will hold 0xAC the
+first read but sometimes it takes a while before CMD holds 0xAC and thus it
+has to be read a number of times (max 50).
+
+After reading CMD, DATA should hold 0x08 which means that the uGuru is ready
+for input. As above DATA will usually hold 0x08 the first read but not always.
+This step can be skipped, but it is undetermined what happens if the uGuru has
+not yet reported 0x08 at DATA and you proceed with writing a bank address.
+
+
+Sending bank and sensor addresses to the uGuru
+----------------------------------------------
+
+First the uGuru must be in "ready" mode as described above, DATA should hold
+0x08 indicating that the uGuru wants input, in this case the bank address.
+
+Next write the bank address to DATA. After the bank address has been written
+wait for to DATA to hold 0x08 again indicating that it wants / is ready for
+more input (max 250 reads).
+
+Once DATA holds 0x08 again write the sensor address to CMD.
+
+
+Reading
+-------
+
+First send the bank and sensor addresses as described above.
+Then for each byte of data you want to read wait for DATA to hold 0x01
+which indicates that the uGuru is ready to be read (max 250 reads) and once
+DATA holds 0x01 read the byte from CMD.
+
+Once all bytes have been read data will hold 0x09, but there is no reason to
+test for this. Notice that the number of bytes is bank address dependent see
+above and below.
+
+After completing a successfull read it is advised to put the uGuru back in
+ready mode, so that it is ready for the next read / write cycle. This way
+if your program / driver is unloaded and later loaded again the detection
+algorithm described above will still work.
+
+
+
+Writing
+-------
+
+First send the bank and sensor addresses as described above.
+Then for each byte of data you want to write wait for DATA to hold 0x00
+which indicates that the uGuru is ready to be written (max 250 reads) and
+once DATA holds 0x00 write the byte to CMD.
+
+Once all bytes have been written wait for DATA to hold 0x01 (max 250 reads)
+don't ask why this is the way it is.
+
+Once DATA holds 0x01 read CMD it should hold 0xAC now.
+
+After completing a successfull write it is advised to put the uGuru back in
+ready mode, so that it is ready for the next read / write cycle. This way
+if your program / driver is unloaded and later loaded again the detection
+algorithm described above will still work.
+
+
+Gotchas
+-------
+
+After wider testing of the Linux kernel driver some variants of the uGuru have
+turned up which do not hold 0x08 at DATA within 250 reads after writing the
+bank address. With these versions this happens quite frequent, using larger
+timeouts doesn't help, they just go offline for a second or 2, doing some
+internal callibration or whatever. Your code should be prepared to handle
+this and in case of no response in this specific case just goto sleep for a
+while and then retry.
+
+
+Address Map
+===========
+
+Bank 0x20 Alarms (R)
+--------------------
+This bank contains 0 sensors, iow the sensor address is ignored (but must be
+written) just use 0. Bank 0x20 contains 3 bytes:
+
+Byte 0:
+This byte holds the alarm flags for sensor 0-7 of Sensor Bank1, with bit 0
+corresponding to sensor 0, 1 to 1, etc.
+
+Byte 1:
+This byte holds the alarm flags for sensor 8-15 of Sensor Bank1, with bit 0
+corresponding to sensor 8, 1 to 9, etc.
+
+Byte 2:
+This byte holds the alarm flags for sensor 0-5 of Sensor Bank2, with bit 0
+corresponding to sensor 0, 1 to 1, etc.
+
+
+Bank 0x21 Sensor Bank1 Values / Readings (R)
+--------------------------------------------
+This bank contains 16 sensors, for each sensor it contains 1 byte.
+So far the following sensors are known to be available on all motherboards:
+Sensor 0 CPU temp
+Sensor 1 SYS temp
+Sensor 3 CPU core volt
+Sensor 4 DDR volt
+Sensor 10 DDR Vtt volt
+Sensor 15 PWM temp
+
+Byte 0:
+This byte holds the reading from the sensor. Sensors in Bank1 can be both
+volt and temp sensors, this is motherboard specific. The uGuru however does
+seem to know (be programmed with) what kindoff sensor is attached see Sensor
+Bank1 Settings description.
+
+Volt sensors use a linear scale, a reading 0 corresponds with 0 volt and a
+reading of 255 with 3494 mV. The sensors for higher voltages however are
+connected through a division circuit. The currently known division circuits
+in use result in ranges of: 0-4361mV, 0-6248mV or 0-14510mV. 3.3 volt sources
+use the 0-4361mV range, 5 volt the 0-6248mV and 12 volt the 0-14510mV .
+
+Temp sensors also use a linear scale, a reading of 0 corresponds with 0 degree
+Celsius and a reading of 255 with a reading of 255 degrees Celsius.
+
+
+Bank 0x22 Sensor Bank1 Settings (R)
+Bank 0x23 Sensor Bank1 Settings (W)
+-----------------------------------
+
+This bank contains 16 sensors, for each sensor it contains 3 bytes. Each
+set of 3 bytes contains the settings for the sensor with the same sensor
+address in Bank 0x21 .
+
+Byte 0:
+Alarm behaviour for the selected sensor. A 1 enables the described behaviour.
+Bit 0: Give an alarm if measured temp is over the warning threshold (RW) *
+Bit 1: Give an alarm if measured volt is over the max threshold (RW) **
+Bit 2: Give an alarm if measured volt is under the min threshold (RW) **
+Bit 3: Beep if alarm (RW)
+Bit 4: 1 if alarm cause measured temp is over the warning threshold (R)
+Bit 5: 1 if alarm cause measured volt is over the max threshold (R)
+Bit 6: 1 if alarm cause measured volt is under the min threshold (R)
+Bit 7: Volt sensor: Shutdown if alarm persist for more then 4 seconds (RW)
+ Temp sensor: Shutdown if temp is over the shutdown threshold (RW)
+
+* This bit is only honored/used by the uGuru if a temp sensor is connected
+** This bit is only honored/used by the uGuru if a volt sensor is connected
+Note with some trickery this can be used to find out what kinda sensor is
+detected see the Linux kernel driver for an example with many comments on
+how todo this.
+
+Byte 1:
+Temp sensor: warning threshold (scale as bank 0x21)
+Volt sensor: min threshold (scale as bank 0x21)
+
+Byte 2:
+Temp sensor: shutdown threshold (scale as bank 0x21)
+Volt sensor: max threshold (scale as bank 0x21)
+
+
+Bank 0x24 PWM outputs for FAN's (R)
+Bank 0x25 PWM outputs for FAN's (W)
+-----------------------------------
+
+This bank contains 3 "sensors", for each sensor it contains 5 bytes.
+Sensor 0 usually controls the CPU fan
+Sensor 1 usually controls the NB (or chipset for single chip) fan
+Sensor 2 usually controls the System fan
+
+Byte 0:
+Flag 0x80 to enable control, Fan runs at 100% when disabled.
+low nibble (temp)sensor address at bank 0x21 used for control.
+
+Byte 1:
+0-255 = 0-12v (linear), specify voltage at which fan will rotate when under
+low threshold temp (specified in byte 3)
+
+Byte 2:
+0-255 = 0-12v (linear), specify voltage at which fan will rotate when above
+high threshold temp (specified in byte 4)
+
+Byte 3:
+Low threshold temp (scale as bank 0x21)
+
+byte 4:
+High threshold temp (scale as bank 0x21)
+
+
+Bank 0x26 Sensors Bank2 Values / Readings (R)
+---------------------------------------------
+
+This bank contains 6 sensors (AFAIK), for each sensor it contains 1 byte.
+So far the following sensors are known to be available on all motherboards:
+Sensor 0: CPU fan speed
+Sensor 1: NB (or chipset for single chip) fan speed
+Sensor 2: SYS fan speed
+
+Byte 0:
+This byte holds the reading from the sensor. 0-255 = 0-15300 (linear)
+
+
+Bank 0x27 Sensors Bank2 Settings (R)
+Bank 0x28 Sensors Bank2 Settings (W)
+------------------------------------
+
+This bank contains 6 sensors (AFAIK), for each sensor it contains 2 bytes.
+
+Byte 0:
+Alarm behaviour for the selected sensor. A 1 enables the described behaviour.
+Bit 0: Give an alarm if measured rpm is under the min threshold (RW)
+Bit 3: Beep if alarm (RW)
+Bit 7: Shutdown if alarm persist for more then 4 seconds (RW)
+
+Byte 1:
+min threshold (scale as bank 0x26)
+
+
+Warning for the adventerous
+===========================
+
+A word of caution to those who want to experiment and see if they can figure
+the voltage / clock programming out, I tried reading and only reading banks
+0-0x30 with the reading code used for the sensor banks (0x20-0x28) and this
+resulted in a _permanent_ reprogramming of the voltages, luckily I had the
+sensors part configured so that it would shutdown my system on any out of spec
+voltages which proprably safed my computer (after a reboot I managed to
+immediatly enter the bios and reload the defaults). This probably means that
+the read/write cycle for the non sensor part is different from the sensor part.
diff --git a/Documentation/hwmon/abituguru3 b/Documentation/hwmon/abituguru3
new file mode 100644
index 0000000..fa598aa
--- /dev/null
+++ b/Documentation/hwmon/abituguru3
@@ -0,0 +1,65 @@
+Kernel driver abituguru3
+========================
+
+Supported chips:
+ * Abit uGuru revision 3 (Hardware Monitor part, reading only)
+ Prefix: 'abituguru3'
+ Addresses scanned: ISA 0x0E0
+ Datasheet: Not available, this driver is based on reverse engineering.
+ Note:
+ The uGuru is a microcontroller with onboard firmware which programs
+ it to behave as a hwmon IC. There are many different revisions of the
+ firmware and thus effectivly many different revisions of the uGuru.
+ Below is an incomplete list with which revisions are used for which
+ Motherboards:
+ uGuru 1.00 ~ 1.24 (AI7, KV8-MAX3, AN7)
+ uGuru 2.0.0.0 ~ 2.0.4.2 (KV8-PRO)
+ uGuru 2.1.0.0 ~ 2.1.2.8 (AS8, AV8, AA8, AG8, AA8XE, AX8)
+ uGuru 2.3.0.0 ~ 2.3.0.9 (AN8)
+ uGuru 3.0.0.0 ~ 3.0.x.x (AW8, AL8, AT8, NI8 SLI, AT8 32X, AN8 32X,
+ AW9D-MAX)
+ The abituguru3 driver is only for revison 3.0.x.x motherboards,
+ this driver will not work on older motherboards. For older
+ motherboards use the abituguru (without the 3 !) driver.
+
+Authors:
+ Hans de Goede <j.w.r.degoede@hhs.nl>,
+ (Initial reverse engineering done by Louis Kruger)
+
+
+Module Parameters
+-----------------
+
+* force: bool Force detection. Note this parameter only causes the
+ detection to be skipped, and thus the insmod to
+ succeed. If the uGuru can't be read the actual hwmon
+ driver will not load and thus no hwmon device will get
+ registered.
+* verbose: bool Should the driver be verbose?
+ 0/off/false normal output
+ 1/on/true + verbose error reporting (default)
+ Default: 1 (the driver is still in the testing phase)
+
+Description
+-----------
+
+This driver supports the hardware monitoring features of the third revision of
+the Abit uGuru chip, found on recent Abit uGuru featuring motherboards.
+
+The 3rd revision of the uGuru chip in reality is a Winbond W83L951G.
+Unfortunatly this doesn't help since the W83L951G is a generic microcontroller
+with a custom Abit application running on it.
+
+Despite Abit not releasing any information regarding the uGuru revision 3,
+Louis Kruger has managed to reverse engineer the sensor part of the uGuru.
+Without his work this driver would not have been possible.
+
+Known Issues
+------------
+
+The voltage and frequency control parts of the Abit uGuru are not supported,
+neither is writing any of the sensor settings and writing / reading the
+fanspeed control registers (FanEQ)
+
+If you encounter any problems please mail me <j.w.r.degoede@hhs.nl> and
+include the output of: "dmesg | grep abituguru"
diff --git a/Documentation/hwmon/adm1021 b/Documentation/hwmon/adm1021
new file mode 100644
index 0000000..03d02bf
--- /dev/null
+++ b/Documentation/hwmon/adm1021
@@ -0,0 +1,111 @@
+Kernel driver adm1021
+=====================
+
+Supported chips:
+ * Analog Devices ADM1021
+ Prefix: 'adm1021'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the Analog Devices website
+ * Analog Devices ADM1021A/ADM1023
+ Prefix: 'adm1023'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the Analog Devices website
+ * Genesys Logic GL523SM
+ Prefix: 'gl523sm'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet:
+ * Intel Xeon Processor
+ Prefix: - any other - may require 'force_adm1021' parameter
+ Addresses scanned: none
+ Datasheet: Publicly available at Intel website
+ * Maxim MAX1617
+ Prefix: 'max1617'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the Maxim website
+ * Maxim MAX1617A
+ Prefix: 'max1617a'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the Maxim website
+ * National Semiconductor LM84
+ Prefix: 'lm84'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the National Semiconductor website
+ * Philips NE1617
+ Prefix: 'max1617' (probably detected as a max1617)
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the Philips website
+ * Philips NE1617A
+ Prefix: 'max1617' (probably detected as a max1617)
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the Philips website
+ * TI THMC10
+ Prefix: 'thmc10'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the TI website
+ * Onsemi MC1066
+ Prefix: 'mc1066'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the Onsemi website
+
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>
+
+Module Parameters
+-----------------
+
+* read_only: int
+ Don't set any values, read only mode
+
+
+Description
+-----------
+
+The chips supported by this driver are very similar. The Maxim MAX1617 is
+the oldest; it has the problem that it is not very well detectable. The
+MAX1617A solves that. The ADM1021 is a straight clone of the MAX1617A.
+Ditto for the THMC10. From here on, we will refer to all these chips as
+ADM1021-clones.
+
+The ADM1021 and MAX1617A reports a die code, which is a sort of revision
+code. This can help us pinpoint problems; it is not very useful
+otherwise.
+
+ADM1021-clones implement two temperature sensors. One of them is internal,
+and measures the temperature of the chip itself; the other is external and
+is realised in the form of a transistor-like device. A special alarm
+indicates whether the remote sensor is connected.
+
+Each sensor has its own low and high limits. When they are crossed, the
+corresponding alarm is set and remains on as long as the temperature stays
+out of range. Temperatures are measured in degrees Celsius. Measurements
+are possible between -65 and +127 degrees, with a resolution of one degree.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may already
+have disappeared!
+
+This driver only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values. It is possible to make
+ADM1021-clones do faster measurements, but there is really no good reason
+for that.
+
+Xeon support
+------------
+
+Some Xeon processors have real max1617, adm1021, or compatible chips
+within them, with two temperature sensors.
+
+Other Xeons have chips with only one sensor.
+
+If you have a Xeon, and the adm1021 module loads, and both temperatures
+appear valid, then things are good.
+
+If the adm1021 module doesn't load, you should try this:
+ modprobe adm1021 force_adm1021=BUS,ADDRESS
+ ADDRESS can only be 0x18, 0x1a, 0x29, 0x2b, 0x4c, or 0x4e.
+
+If you have dual Xeons you may have appear to have two separate
+adm1021-compatible chips, or two single-temperature sensors, at distinct
+addresses.
diff --git a/Documentation/hwmon/adm1025 b/Documentation/hwmon/adm1025
new file mode 100644
index 0000000..39d2b78
--- /dev/null
+++ b/Documentation/hwmon/adm1025
@@ -0,0 +1,51 @@
+Kernel driver adm1025
+=====================
+
+Supported chips:
+ * Analog Devices ADM1025, ADM1025A
+ Prefix: 'adm1025'
+ Addresses scanned: I2C 0x2c - 0x2e
+ Datasheet: Publicly available at the Analog Devices website
+ * Philips NE1619
+ Prefix: 'ne1619'
+ Addresses scanned: I2C 0x2c - 0x2d
+ Datasheet: Publicly available at the Philips website
+
+The NE1619 presents some differences with the original ADM1025:
+ * Only two possible addresses (0x2c - 0x2d).
+ * No temperature offset register, but we don't use it anyway.
+ * No INT mode for pin 16. We don't play with it anyway.
+
+Authors:
+ Chen-Yuan Wu <gwu@esoft.com>,
+ Jean Delvare <khali@linux-fr.org>
+
+Description
+-----------
+
+(This is from Analog Devices.) The ADM1025 is a complete system hardware
+monitor for microprocessor-based systems, providing measurement and limit
+comparison of various system parameters. Five voltage measurement inputs
+are provided, for monitoring +2.5V, +3.3V, +5V and +12V power supplies and
+the processor core voltage. The ADM1025 can monitor a sixth power-supply
+voltage by measuring its own VCC. One input (two pins) is dedicated to a
+remote temperature-sensing diode and an on-chip temperature sensor allows
+ambient temperature to be monitored.
+
+One specificity of this chip is that the pin 11 can be hardwired in two
+different manners. It can act as the +12V power-supply voltage analog
+input, or as the a fifth digital entry for the VID reading (bit 4). It's
+kind of strange since both are useful, and the reason for designing the
+chip that way is obscure at least to me. The bit 5 of the configuration
+register can be used to define how the chip is hardwired. Please note that
+it is not a choice you have to make as the user. The choice was already
+made by your motherboard's maker. If the configuration bit isn't set
+properly, you'll have a wrong +12V reading or a wrong VID reading. The way
+the driver handles that is to preserve this bit through the initialization
+process, assuming that the BIOS set it up properly beforehand. If it turns
+out not to be true in some cases, we'll provide a module parameter to force
+modes.
+
+This driver also supports the ADM1025A, which differs from the ADM1025
+only in that it has "open-drain VID inputs while the ADM1025 has on-chip
+100k pull-ups on the VID inputs". It doesn't make any difference for us.
diff --git a/Documentation/hwmon/adm1026 b/Documentation/hwmon/adm1026
new file mode 100644
index 0000000..f4327db
--- /dev/null
+++ b/Documentation/hwmon/adm1026
@@ -0,0 +1,93 @@
+Kernel driver adm1026
+=====================
+
+Supported chips:
+ * Analog Devices ADM1026
+ Prefix: 'adm1026'
+ Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+ Datasheet: Publicly available at the Analog Devices website
+ http://www.analog.com/en/prod/0,,766_825_ADM1026,00.html
+
+Authors:
+ Philip Pokorny <ppokorny@penguincomputing.com> for Penguin Computing
+ Justin Thiessen <jthiessen@penguincomputing.com>
+
+Module Parameters
+-----------------
+
+* gpio_input: int array (min = 1, max = 17)
+ List of GPIO pins (0-16) to program as inputs
+* gpio_output: int array (min = 1, max = 17)
+ List of GPIO pins (0-16) to program as outputs
+* gpio_inverted: int array (min = 1, max = 17)
+ List of GPIO pins (0-16) to program as inverted
+* gpio_normal: int array (min = 1, max = 17)
+ List of GPIO pins (0-16) to program as normal/non-inverted
+* gpio_fan: int array (min = 1, max = 8)
+ List of GPIO pins (0-7) to program as fan tachs
+
+
+Description
+-----------
+
+This driver implements support for the Analog Devices ADM1026. Analog
+Devices calls it a "complete thermal system management controller."
+
+The ADM1026 implements three (3) temperature sensors, 17 voltage sensors,
+16 general purpose digital I/O lines, eight (8) fan speed sensors (8-bit),
+an analog output and a PWM output along with limit, alarm and mask bits for
+all of the above. There is even 8k bytes of EEPROM memory on chip.
+
+Temperatures are measured in degrees Celsius. There are two external
+sensor inputs and one internal sensor. Each sensor has a high and low
+limit. If the limit is exceeded, an interrupt (#SMBALERT) can be
+generated. The interrupts can be masked. In addition, there are over-temp
+limits for each sensor. If this limit is exceeded, the #THERM output will
+be asserted. The current temperature and limits have a resolution of 1
+degree.
+
+Fan rotation speeds are reported in RPM (rotations per minute) but measured
+in counts of a 22.5kHz internal clock. Each fan has a high limit which
+corresponds to a minimum fan speed. If the limit is exceeded, an interrupt
+can be generated. Each fan can be programmed to divide the reference clock
+by 1, 2, 4 or 8. Not all RPM values can accurately be represented, so some
+rounding is done. With a divider of 8, the slowest measurable speed of a
+two pulse per revolution fan is 661 RPM.
+
+There are 17 voltage sensors. An alarm is triggered if the voltage has
+crossed a programmable minimum or maximum limit. Note that minimum in this
+case always means 'closest to zero'; this is important for negative voltage
+measurements. Several inputs have integrated attenuators so they can measure
+higher voltages directly. 3.3V, 5V, 12V, -12V and battery voltage all have
+dedicated inputs. There are several inputs scaled to 0-3V full-scale range
+for SCSI terminator power. The remaining inputs are not scaled and have
+a 0-2.5V full-scale range. A 2.5V or 1.82V reference voltage is provided
+for negative voltage measurements.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may already
+have disappeared! Note that in the current implementation, all hardware
+registers are read whenever any data is read (unless it is less than 2.0
+seconds since the last update). This means that you can easily miss
+once-only alarms.
+
+The ADM1026 measures continuously. Analog inputs are measured about 4
+times a second. Fan speed measurement time depends on fan speed and
+divisor. It can take as long as 1.5 seconds to measure all fan speeds.
+
+The ADM1026 has the ability to automatically control fan speed based on the
+temperature sensor inputs. Both the PWM output and the DAC output can be
+used to control fan speed. Usually only one of these two outputs will be
+used. Write the minimum PWM or DAC value to the appropriate control
+register. Then set the low temperature limit in the tmin values for each
+temperature sensor. The range of control is fixed at 20 °C, and the
+largest difference between current and tmin of the temperature sensors sets
+the control output. See the datasheet for several example circuits for
+controlling fan speed with the PWM and DAC outputs. The fan speed sensors
+do not have PWM compensation, so it is probably best to control the fan
+voltage from the power lead rather than on the ground lead.
+
+The datasheet shows an example application with VID signals attached to
+GPIO lines. Unfortunately, the chip may not be connected to the VID lines
+in this way. The driver assumes that the chips *is* connected this way to
+get a VID voltage.
diff --git a/Documentation/hwmon/adm1031 b/Documentation/hwmon/adm1031
new file mode 100644
index 0000000..be92a77
--- /dev/null
+++ b/Documentation/hwmon/adm1031
@@ -0,0 +1,35 @@
+Kernel driver adm1031
+=====================
+
+Supported chips:
+ * Analog Devices ADM1030
+ Prefix: 'adm1030'
+ Addresses scanned: I2C 0x2c to 0x2e
+ Datasheet: Publicly available at the Analog Devices website
+ http://www.analog.com/en/prod/0%2C2877%2CADM1030%2C00.html
+
+ * Analog Devices ADM1031
+ Prefix: 'adm1031'
+ Addresses scanned: I2C 0x2c to 0x2e
+ Datasheet: Publicly available at the Analog Devices website
+ http://www.analog.com/en/prod/0%2C2877%2CADM1031%2C00.html
+
+Authors:
+ Alexandre d'Alton <alex@alexdalton.org>
+ Jean Delvare <khali@linux-fr.org>
+
+Description
+-----------
+
+The ADM1030 and ADM1031 are digital temperature sensors and fan controllers.
+They sense their own temperature as well as the temperature of up to one
+(ADM1030) or two (ADM1031) external diodes.
+
+All temperature values are given in degrees Celsius. Resolution is 0.5
+degree for the local temperature, 0.125 degree for the remote temperatures.
+
+Each temperature channel has its own high and low limits, plus a critical
+limit.
+
+The ADM1030 monitors a single fan speed, while the ADM1031 monitors up to
+two. Each fan channel has its own low speed limit.
diff --git a/Documentation/hwmon/adm9240 b/Documentation/hwmon/adm9240
new file mode 100644
index 0000000..2c6f1fe
--- /dev/null
+++ b/Documentation/hwmon/adm9240
@@ -0,0 +1,177 @@
+Kernel driver adm9240
+=====================
+
+Supported chips:
+ * Analog Devices ADM9240
+ Prefix: 'adm9240'
+ Addresses scanned: I2C 0x2c - 0x2f
+ Datasheet: Publicly available at the Analog Devices website
+ http://www.analog.com/UploadedFiles/Data_Sheets/79857778ADM9240_0.pdf
+
+ * Dallas Semiconductor DS1780
+ Prefix: 'ds1780'
+ Addresses scanned: I2C 0x2c - 0x2f
+ Datasheet: Publicly available at the Dallas Semiconductor (Maxim) website
+ http://pdfserv.maxim-ic.com/en/ds/DS1780.pdf
+
+ * National Semiconductor LM81
+ Prefix: 'lm81'
+ Addresses scanned: I2C 0x2c - 0x2f
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/ds.cgi/LM/LM81.pdf
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Michiel Rook <michiel@grendelproject.nl>,
+ Grant Coady <gcoady.lk@gmail.com> with guidance
+ from Jean Delvare <khali@linux-fr.org>
+
+Interface
+---------
+The I2C addresses listed above assume BIOS has not changed the
+chip MSB 5-bit address. Each chip reports a unique manufacturer
+identification code as well as the chip revision/stepping level.
+
+Description
+-----------
+[From ADM9240] The ADM9240 is a complete system hardware monitor for
+microprocessor-based systems, providing measurement and limit comparison
+of up to four power supplies and two processor core voltages, plus
+temperature, two fan speeds and chassis intrusion. Measured values can
+be read out via an I2C-compatible serial System Management Bus, and values
+for limit comparisons can be programmed in over the same serial bus. The
+high speed successive approximation ADC allows frequent sampling of all
+analog channels to ensure a fast interrupt response to any out-of-limit
+measurement.
+
+The ADM9240, DS1780 and LM81 are register compatible, the following
+details are common to the three chips. Chip differences are described
+after this section.
+
+
+Measurements
+------------
+The measurement cycle
+
+The adm9240 driver will take a measurement reading no faster than once
+each two seconds. User-space may read sysfs interface faster than the
+measurement update rate and will receive cached data from the most
+recent measurement.
+
+ADM9240 has a very fast 320us temperature and voltage measurement cycle
+with independent fan speed measurement cycles counting alternating rising
+edges of the fan tacho inputs.
+
+DS1780 measurement cycle is about once per second including fan speed.
+
+LM81 measurement cycle is about once per 400ms including fan speed.
+The LM81 12-bit extended temperature measurement mode is not supported.
+
+Temperature
+-----------
+On chip temperature is reported as degrees Celsius as 9-bit signed data
+with resolution of 0.5 degrees Celsius. High and low temperature limits
+are 8-bit signed data with resolution of one degree Celsius.
+
+Temperature alarm is asserted once the temperature exceeds the high limit,
+and is cleared when the temperature falls below the temp1_max_hyst value.
+
+Fan Speed
+---------
+Two fan tacho inputs are provided, the ADM9240 gates an internal 22.5kHz
+clock via a divider to an 8-bit counter. Fan speed (rpm) is calculated by:
+
+rpm = (22500 * 60) / (count * divider)
+
+Automatic fan clock divider
+
+ * User sets 0 to fan_min limit
+ - low speed alarm is disabled
+ - fan clock divider not changed
+ - auto fan clock adjuster enabled for valid fan speed reading
+
+ * User sets fan_min limit too low
+ - low speed alarm is enabled
+ - fan clock divider set to max
+ - fan_min set to register value 254 which corresponds
+ to 664 rpm on adm9240
+ - low speed alarm will be asserted if fan speed is
+ less than minimum measurable speed
+ - auto fan clock adjuster disabled
+
+ * User sets reasonable fan speed
+ - low speed alarm is enabled
+ - fan clock divider set to suit fan_min
+ - auto fan clock adjuster enabled: adjusts fan_min
+
+ * User sets unreasonably high low fan speed limit
+ - resolution of the low speed limit may be reduced
+ - alarm will be asserted
+ - auto fan clock adjuster enabled: adjusts fan_min
+
+ * fan speed may be displayed as zero until the auto fan clock divider
+ adjuster brings fan speed clock divider back into chip measurement
+ range, this will occur within a few measurement cycles.
+
+Analog Output
+-------------
+An analog output provides a 0 to 1.25 volt signal intended for an external
+fan speed amplifier circuit. The analog output is set to maximum value on
+power up or reset. This doesn't do much on the test Intel SE440BX-2.
+
+Voltage Monitor
+
+Voltage (IN) measurement is internally scaled:
+
+ nr label nominal maximum resolution
+ mV mV mV
+ 0 +2.5V 2500 3320 13.0
+ 1 Vccp1 2700 3600 14.1
+ 2 +3.3V 3300 4380 17.2
+ 3 +5V 5000 6640 26.0
+ 4 +12V 12000 15940 62.5
+ 5 Vccp2 2700 3600 14.1
+
+The reading is an unsigned 8-bit value, nominal voltage measurement is
+represented by a reading of 192, being 3/4 of the measurement range.
+
+An alarm is asserted for any voltage going below or above the set limits.
+
+The driver reports and accepts voltage limits scaled to the above table.
+
+VID Monitor
+-----------
+The chip has five inputs to read the 5-bit VID and reports the mV value
+based on detected CPU type.
+
+Chassis Intrusion
+-----------------
+An alarm is asserted when the CI pin goes active high. The ADM9240
+Datasheet has an example of an external temperature sensor driving
+this pin. On an Intel SE440BX-2 the Chassis Intrusion header is
+connected to a normally open switch.
+
+The ADM9240 provides an internal open drain on this line, and may output
+a 20 ms active low pulse to reset an external Chassis Intrusion latch.
+
+Clear the CI latch by writing value 1 to the sysfs chassis_clear file.
+
+Alarm flags reported as 16-bit word
+
+ bit label comment
+ --- ------------- --------------------------
+ 0 +2.5 V_Error high or low limit exceeded
+ 1 VCCP_Error high or low limit exceeded
+ 2 +3.3 V_Error high or low limit exceeded
+ 3 +5 V_Error high or low limit exceeded
+ 4 Temp_Error temperature error
+ 6 FAN1_Error fan low limit exceeded
+ 7 FAN2_Error fan low limit exceeded
+ 8 +12 V_Error high or low limit exceeded
+ 9 VCCP2_Error high or low limit exceeded
+ 12 Chassis_Error CI pin went high
+
+Remaining bits are reserved and thus undefined. It is important to note
+that alarm bits may be cleared on read, user-space may latch alarms and
+provide the end-user with a method to clear alarm memory.
diff --git a/Documentation/hwmon/ads7828 b/Documentation/hwmon/ads7828
new file mode 100644
index 0000000..75bc4be
--- /dev/null
+++ b/Documentation/hwmon/ads7828
@@ -0,0 +1,36 @@
+Kernel driver ads7828
+=====================
+
+Supported chips:
+ * Texas Instruments/Burr-Brown ADS7828
+ Prefix: 'ads7828'
+ Addresses scanned: I2C 0x48, 0x49, 0x4a, 0x4b
+ Datasheet: Publicly available at the Texas Instruments website :
+ http://focus.ti.com/lit/ds/symlink/ads7828.pdf
+
+Authors:
+ Steve Hardy <steve@linuxrealtime.co.uk>
+
+Module Parameters
+-----------------
+
+* se_input: bool (default Y)
+ Single ended operation - set to N for differential mode
+* int_vref: bool (default Y)
+ Operate with the internal 2.5V reference - set to N for external reference
+* vref_mv: int (default 2500)
+ If using an external reference, set this to the reference voltage in mV
+
+Description
+-----------
+
+This driver implements support for the Texas Instruments ADS7828.
+
+This device is a 12-bit 8-channel A-D converter.
+
+It can operate in single ended mode (8 +ve inputs) or in differential mode,
+where 4 differential pairs can be measured.
+
+The chip also has the facility to use an external voltage reference. This
+may be required if your hardware supplies the ADS7828 from a 5V supply, see
+the datasheet for more details.
diff --git a/Documentation/hwmon/adt7462 b/Documentation/hwmon/adt7462
new file mode 100644
index 0000000..ec660b3
--- /dev/null
+++ b/Documentation/hwmon/adt7462
@@ -0,0 +1,67 @@
+Kernel driver adt7462
+======================
+
+Supported chips:
+ * Analog Devices ADT7462
+ Prefix: 'adt7462'
+ Addresses scanned: I2C 0x58, 0x5C
+ Datasheet: Publicly available at the Analog Devices website
+
+Author: Darrick J. Wong
+
+Description
+-----------
+
+This driver implements support for the Analog Devices ADT7462 chip family.
+
+This chip is a bit of a beast. It has 8 counters for measuring fan speed. It
+can also measure 13 voltages or 4 temperatures, or various combinations of the
+two. See the chip documentation for more details about the exact set of
+configurations. This driver does not allow one to configure the chip; that is
+left to the system designer.
+
+A sophisticated control system for the PWM outputs is designed into the ADT7462
+that allows fan speed to be adjusted automatically based on any of the three
+temperature sensors. Each PWM output is individually adjustable and
+programmable. Once configured, the ADT7462 will adjust the PWM outputs in
+response to the measured temperatures without further host intervention. This
+feature can also be disabled for manual control of the PWM's.
+
+Each of the measured inputs (voltage, temperature, fan speed) has
+corresponding high/low limit values. The ADT7462 will signal an ALARM if
+any measured value exceeds either limit.
+
+The ADT7462 samples all inputs continuously. The driver will not read
+the registers more often than once every other second. Further,
+configuration data is only read once per minute.
+
+Special Features
+----------------
+
+The ADT7462 have a 10-bit ADC and can therefore measure temperatures
+with 0.25 degC resolution.
+
+The Analog Devices datasheet is very detailed and describes a procedure for
+determining an optimal configuration for the automatic PWM control.
+
+The driver will report sensor labels when it is able to determine that
+information from the configuration registers.
+
+Configuration Notes
+-------------------
+
+Besides standard interfaces driver adds the following:
+
+* PWM Control
+
+* pwm#_auto_point1_pwm and temp#_auto_point1_temp and
+* pwm#_auto_point2_pwm and temp#_auto_point2_temp -
+
+point1: Set the pwm speed at a lower temperature bound.
+point2: Set the pwm speed at a higher temperature bound.
+
+The ADT7462 will scale the pwm between the lower and higher pwm speed when
+the temperature is between the two temperature boundaries. PWM values range
+from 0 (off) to 255 (full speed). Fan speed will be set to maximum when the
+temperature sensor associated with the PWM control exceeds temp#_max.
+
diff --git a/Documentation/hwmon/adt7470 b/Documentation/hwmon/adt7470
new file mode 100644
index 0000000..75d13ca
--- /dev/null
+++ b/Documentation/hwmon/adt7470
@@ -0,0 +1,76 @@
+Kernel driver adt7470
+=====================
+
+Supported chips:
+ * Analog Devices ADT7470
+ Prefix: 'adt7470'
+ Addresses scanned: I2C 0x2C, 0x2E, 0x2F
+ Datasheet: Publicly available at the Analog Devices website
+
+Author: Darrick J. Wong
+
+Description
+-----------
+
+This driver implements support for the Analog Devices ADT7470 chip. There may
+be other chips that implement this interface.
+
+The ADT7470 uses the 2-wire interface compatible with the SMBus 2.0
+specification. Using an analog to digital converter it measures up to ten (10)
+external temperatures. It has four (4) 16-bit counters for measuring fan speed.
+There are four (4) PWM outputs that can be used to control fan speed.
+
+A sophisticated control system for the PWM outputs is designed into the ADT7470
+that allows fan speed to be adjusted automatically based on any of the ten
+temperature sensors. Each PWM output is individually adjustable and
+programmable. Once configured, the ADT7470 will adjust the PWM outputs in
+response to the measured temperatures with further host intervention. This
+feature can also be disabled for manual control of the PWM's.
+
+Each of the measured inputs (temperature, fan speed) has corresponding high/low
+limit values. The ADT7470 will signal an ALARM if any measured value exceeds
+either limit.
+
+The ADT7470 DOES NOT sample all inputs continuously. A single pin on the
+ADT7470 is connected to a multitude of thermal diodes, but the chip must be
+instructed explicitly to read the multitude of diodes. If you want to use
+automatic fan control mode, you must manually read any of the temperature
+sensors or the fan control algorithm will not run. The chip WILL NOT DO THIS
+AUTOMATICALLY; this must be done from userspace. This may be a bug in the chip
+design, given that many other AD chips take care of this. The driver will not
+read the registers more often than once every 5 seconds. Further,
+configuration data is only read once per minute.
+
+Special Features
+----------------
+
+The ADT7470 has a 8-bit ADC and is capable of measuring temperatures with 1
+degC resolution.
+
+The Analog Devices datasheet is very detailed and describes a procedure for
+determining an optimal configuration for the automatic PWM control.
+
+Configuration Notes
+-------------------
+
+Besides standard interfaces driver adds the following:
+
+* PWM Control
+
+* pwm#_auto_point1_pwm and pwm#_auto_point1_temp and
+* pwm#_auto_point2_pwm and pwm#_auto_point2_temp -
+
+point1: Set the pwm speed at a lower temperature bound.
+point2: Set the pwm speed at a higher temperature bound.
+
+The ADT7470 will scale the pwm between the lower and higher pwm speed when
+the temperature is between the two temperature boundaries. PWM values range
+from 0 (off) to 255 (full speed). Fan speed will be set to maximum when the
+temperature sensor associated with the PWM control exceeds
+pwm#_auto_point2_temp.
+
+Notes
+-----
+
+As stated above, the temperature inputs must be read periodically from
+userspace in order for the automatic pwm algorithm to run.
diff --git a/Documentation/hwmon/adt7473 b/Documentation/hwmon/adt7473
new file mode 100644
index 0000000..1cbf671
--- /dev/null
+++ b/Documentation/hwmon/adt7473
@@ -0,0 +1,72 @@
+Kernel driver adt7473
+======================
+
+Supported chips:
+ * Analog Devices ADT7473
+ Prefix: 'adt7473'
+ Addresses scanned: I2C 0x2C, 0x2D, 0x2E
+ Datasheet: Publicly available at the Analog Devices website
+
+Author: Darrick J. Wong
+
+Description
+-----------
+
+This driver implements support for the Analog Devices ADT7473 chip family.
+
+The ADT7473 uses the 2-wire interface compatible with the SMBUS 2.0
+specification. Using an analog to digital converter it measures three (3)
+temperatures and two (2) voltages. It has four (4) 16-bit counters for
+measuring fan speed. There are three (3) PWM outputs that can be used
+to control fan speed.
+
+A sophisticated control system for the PWM outputs is designed into the
+ADT7473 that allows fan speed to be adjusted automatically based on any of the
+three temperature sensors. Each PWM output is individually adjustable and
+programmable. Once configured, the ADT7473 will adjust the PWM outputs in
+response to the measured temperatures without further host intervention.
+This feature can also be disabled for manual control of the PWM's.
+
+Each of the measured inputs (voltage, temperature, fan speed) has
+corresponding high/low limit values. The ADT7473 will signal an ALARM if
+any measured value exceeds either limit.
+
+The ADT7473 samples all inputs continuously. The driver will not read
+the registers more often than once every other second. Further,
+configuration data is only read once per minute.
+
+Special Features
+----------------
+
+The ADT7473 have a 10-bit ADC and can therefore measure temperatures
+with 0.25 degC resolution. Temperature readings can be configured either
+for twos complement format or "Offset 64" format, wherein 63 is subtracted
+from the raw value to get the temperature value.
+
+The Analog Devices datasheet is very detailed and describes a procedure for
+determining an optimal configuration for the automatic PWM control.
+
+Configuration Notes
+-------------------
+
+Besides standard interfaces driver adds the following:
+
+* PWM Control
+
+* pwm#_auto_point1_pwm and temp#_auto_point1_temp and
+* pwm#_auto_point2_pwm and temp#_auto_point2_temp -
+
+point1: Set the pwm speed at a lower temperature bound.
+point2: Set the pwm speed at a higher temperature bound.
+
+The ADT7473 will scale the pwm between the lower and higher pwm speed when
+the temperature is between the two temperature boundaries. PWM values range
+from 0 (off) to 255 (full speed). Fan speed will be set to maximum when the
+temperature sensor associated with the PWM control exceeds temp#_max.
+
+Notes
+-----
+
+The NVIDIA binary driver presents an ADT7473 chip via an on-card i2c bus.
+Unfortunately, they fail to set the i2c adapter class, so this driver may
+fail to find the chip until the nvidia driver is patched.
diff --git a/Documentation/hwmon/asb100 b/Documentation/hwmon/asb100
new file mode 100644
index 0000000..ab7365e
--- /dev/null
+++ b/Documentation/hwmon/asb100
@@ -0,0 +1,72 @@
+Kernel driver asb100
+====================
+
+Supported Chips:
+ * Asus ASB100 and ASB100-A "Bach"
+ Prefix: 'asb100'
+ Addresses scanned: I2C 0x2d
+ Datasheet: none released
+
+Author: Mark M. Hoffman <mhoffman@lightlink.com>
+
+Description
+-----------
+
+This driver implements support for the Asus ASB100 and ASB100-A "Bach".
+These are custom ASICs available only on Asus mainboards. Asus refuses to
+supply a datasheet for these chips. Thanks go to many people who helped
+investigate their hardware, including:
+
+Vitaly V. Bursov
+Alexander van Kaam (author of MBM for Windows)
+Bertrik Sikken
+
+The ASB100 implements seven voltage sensors, three fan rotation speed
+sensors, four temperature sensors, VID lines and alarms. In addition to
+these, the ASB100-A also implements a single PWM controller for fans 2 and
+3 (i.e. one setting controls both.) If you have a plain ASB100, the PWM
+controller will simply not work (or maybe it will for you... it doesn't for
+me).
+
+Temperatures are measured and reported in degrees Celsius.
+
+Fan speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit.
+
+Voltage sensors (also known as IN sensors) report values in volts.
+
+The VID lines encode the core voltage value: the voltage level your
+processor should work with. This is hardcoded by the mainboard and/or
+processor itself. It is a value in volts.
+
+Alarms: (TODO question marks indicate may or may not work)
+
+0x0001 => in0 (?)
+0x0002 => in1 (?)
+0x0004 => in2
+0x0008 => in3
+0x0010 => temp1 (1)
+0x0020 => temp2
+0x0040 => fan1
+0x0080 => fan2
+0x0100 => in4
+0x0200 => in5 (?) (2)
+0x0400 => in6 (?) (2)
+0x0800 => fan3
+0x1000 => chassis switch
+0x2000 => temp3
+
+Alarm Notes:
+
+(1) This alarm will only trigger if the hysteresis value is 127C.
+I.e. it behaves the same as w83781d.
+
+(2) The min and max registers for these values appear to
+be read-only or otherwise stuck at 0x00.
+
+TODO:
+* Experiment with fan divisors > 8.
+* Experiment with temp. sensor types.
+* Are there really 13 voltage inputs? Probably not...
+* Cleanups, no doubt...
+
diff --git a/Documentation/hwmon/coretemp b/Documentation/hwmon/coretemp
new file mode 100644
index 0000000..dbbe6c7
--- /dev/null
+++ b/Documentation/hwmon/coretemp
@@ -0,0 +1,38 @@
+Kernel driver coretemp
+======================
+
+Supported chips:
+ * All Intel Core family
+ Prefix: 'coretemp'
+ CPUID: family 0x6, models 0xe, 0xf, 0x16, 0x17
+ Datasheet: Intel 64 and IA-32 Architectures Software Developer's Manual
+ Volume 3A: System Programming Guide
+ http://softwarecommunity.intel.com/Wiki/Mobility/720.htm
+
+Author: Rudolf Marek
+
+Description
+-----------
+
+This driver permits reading temperature sensor embedded inside Intel Core CPU.
+Temperature is measured in degrees Celsius and measurement resolution is
+1 degree C. Valid temperatures are from 0 to TjMax degrees C, because
+the actual value of temperature register is in fact a delta from TjMax.
+
+Temperature known as TjMax is the maximum junction temperature of processor.
+Intel defines this temperature as 85C or 100C. At this temperature, protection
+mechanism will perform actions to forcibly cool down the processor. Alarm
+may be raised, if the temperature grows enough (more than TjMax) to trigger
+the Out-Of-Spec bit. Following table summarizes the exported sysfs files:
+
+temp1_input - Core temperature (in millidegrees Celsius).
+temp1_max - All cooling devices should be turned on (on Core2).
+temp1_crit - Maximum junction temperature (in millidegrees Celsius).
+temp1_crit_alarm - Set when Out-of-spec bit is set, never clears.
+ Correct CPU operation is no longer guaranteed.
+temp1_label - Contains string "Core X", where X is processor
+ number.
+
+The TjMax temperature is set to 85 degrees C if undocumented model specific
+register (UMSR) 0xee has bit 30 set. If not the TjMax is 100 degrees C as
+(sometimes) documented in processor datasheet.
diff --git a/Documentation/hwmon/dme1737 b/Documentation/hwmon/dme1737
new file mode 100644
index 0000000..001d2e7
--- /dev/null
+++ b/Documentation/hwmon/dme1737
@@ -0,0 +1,295 @@
+Kernel driver dme1737
+=====================
+
+Supported chips:
+ * SMSC DME1737 and compatibles (like Asus A8000)
+ Prefix: 'dme1737'
+ Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+ Datasheet: Provided by SMSC upon request and under NDA
+ * SMSC SCH3112, SCH3114, SCH3116
+ Prefix: 'sch311x'
+ Addresses scanned: none, address read from Super-I/O config space
+ Datasheet: http://www.nuhorizons.com/FeaturedProducts/Volume1/SMSC/311x.pdf
+ * SMSC SCH5027
+ Prefix: 'sch5027'
+ Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+ Datasheet: Provided by SMSC upon request and under NDA
+
+Authors:
+ Juerg Haefliger <juergh@gmail.com>
+
+
+Module Parameters
+-----------------
+
+* force_start: bool Enables the monitoring of voltage, fan and temp inputs
+ and PWM output control functions. Using this parameter
+ shouldn't be required since the BIOS usually takes care
+ of this.
+* probe_all_addr: bool Include non-standard LPC addresses 0x162e and 0x164e
+ when probing for ISA devices. This is required for the
+ following boards:
+ - VIA EPIA SN18000
+
+
+Description
+-----------
+
+This driver implements support for the hardware monitoring capabilities of the
+SMSC DME1737 and Asus A8000 (which are the same), SMSC SCH5027, and SMSC
+SCH311x Super-I/O chips. These chips feature monitoring of 3 temp sensors
+temp[1-3] (2 remote diodes and 1 internal), 7 voltages in[0-6] (6 external and
+1 internal) and up to 6 fan speeds fan[1-6]. Additionally, the chips implement
+up to 5 PWM outputs pwm[1-3,5-6] for controlling fan speeds both manually and
+automatically.
+
+For the DME1737, A8000 and SCH5027, fan[1-2] and pwm[1-2] are always present.
+Fan[3-6] and pwm[3,5-6] are optional features and their availability depends on
+the configuration of the chip. The driver will detect which features are
+present during initialization and create the sysfs attributes accordingly.
+
+For the SCH311x, fan[1-3] and pwm[1-3] are always present and fan[4-6] and
+pwm[5-6] don't exist.
+
+The hardware monitoring features of the DME1737, A8000, and SCH5027 are only
+accessible via SMBus, while the SCH311x only provides access via the ISA bus.
+The driver will therefore register itself as an I2C client driver if it detects
+a DME1737, A8000, or SCH5027 and as a platform driver if it detects a SCH311x
+chip.
+
+
+Voltage Monitoring
+------------------
+
+The voltage inputs are sampled with 12-bit resolution and have internal
+scaling resistors. The values returned by the driver therefore reflect true
+millivolts and don't need scaling. The voltage inputs are mapped as follows
+(the last column indicates the input ranges):
+
+DME1737, A8000:
+ in0: +5VTR (+5V standby) 0V - 6.64V
+ in1: Vccp (processor core) 0V - 3V
+ in2: VCC (internal +3.3V) 0V - 4.38V
+ in3: +5V 0V - 6.64V
+ in4: +12V 0V - 16V
+ in5: VTR (+3.3V standby) 0V - 4.38V
+ in6: Vbat (+3.0V) 0V - 4.38V
+
+SCH311x:
+ in0: +2.5V 0V - 6.64V
+ in1: Vccp (processor core) 0V - 2V
+ in2: VCC (internal +3.3V) 0V - 4.38V
+ in3: +5V 0V - 6.64V
+ in4: +12V 0V - 16V
+ in5: VTR (+3.3V standby) 0V - 4.38V
+ in6: Vbat (+3.0V) 0V - 4.38V
+
+SCH5027:
+ in0: +5VTR (+5V standby) 0V - 6.64V
+ in1: Vccp (processor core) 0V - 3V
+ in2: VCC (internal +3.3V) 0V - 4.38V
+ in3: V2_IN 0V - 1.5V
+ in4: V1_IN 0V - 1.5V
+ in5: VTR (+3.3V standby) 0V - 4.38V
+ in6: Vbat (+3.0V) 0V - 4.38V
+
+Each voltage input has associated min and max limits which trigger an alarm
+when crossed.
+
+
+Temperature Monitoring
+----------------------
+
+Temperatures are measured with 12-bit resolution and reported in millidegree
+Celsius. The chip also features offsets for all 3 temperature inputs which -
+when programmed - get added to the input readings. The chip does all the
+scaling by itself and the driver therefore reports true temperatures that don't
+need any user-space adjustments. The temperature inputs are mapped as follows
+(the last column indicates the input ranges):
+
+ temp1: Remote diode 1 (3904 type) temperature -127C - +127C
+ temp2: DME1737 internal temperature -127C - +127C
+ temp3: Remote diode 2 (3904 type) temperature -127C - +127C
+
+Each temperature input has associated min and max limits which trigger an alarm
+when crossed. Additionally, each temperature input has a fault attribute that
+returns 1 when a faulty diode or an unconnected input is detected and 0
+otherwise.
+
+
+Fan Monitoring
+--------------
+
+Fan RPMs are measured with 16-bit resolution. The chip provides inputs for 6
+fan tachometers. All 6 inputs have an associated min limit which triggers an
+alarm when crossed. Fan inputs 1-4 provide type attributes that need to be set
+to the number of pulses per fan revolution that the connected tachometer
+generates. Supported values are 1, 2, and 4. Fan inputs 5-6 only support fans
+that generate 2 pulses per revolution. Fan inputs 5-6 also provide a max
+attribute that needs to be set to the maximum attainable RPM (fan at 100% duty-
+cycle) of the input. The chip adjusts the sampling rate based on this value.
+
+
+PWM Output Control
+------------------
+
+This chip features 5 PWM outputs. PWM outputs 1-3 are associated with fan
+inputs 1-3 and PWM outputs 5-6 are associated with fan inputs 5-6. PWM outputs
+1-3 can be configured to operate either in manual or automatic mode by setting
+the appropriate enable attribute accordingly. PWM outputs 5-6 can only operate
+in manual mode, their enable attributes are therefore read-only. When set to
+manual mode, the fan speed is set by writing the duty-cycle value to the
+appropriate PWM attribute. In automatic mode, the PWM attribute returns the
+current duty-cycle as set by the fan controller in the chip. All PWM outputs
+support the setting of the output frequency via the freq attribute.
+
+In automatic mode, the chip supports the setting of the PWM ramp rate which
+defines how fast the PWM output is adjusting to changes of the associated
+temperature input. Associating PWM outputs to temperature inputs is done via
+temperature zones. The chip features 3 zones whose assignments to temperature
+inputs is static and determined during initialization. These assignments can
+be retrieved via the zone[1-3]_auto_channels_temp attributes. Each PWM output
+is assigned to one (or hottest of multiple) temperature zone(s) through the
+pwm[1-3]_auto_channels_zone attributes. Each PWM output has 3 distinct output
+duty-cycles: full, low, and min. Full is internally hard-wired to 255 (100%)
+and low and min can be programmed via pwm[1-3]_auto_point1_pwm and
+pwm[1-3]_auto_pwm_min, respectively. The thermal thresholds of the zones are
+programmed via zone[1-3]_auto_point[1-3]_temp and
+zone[1-3]_auto_point1_temp_hyst:
+
+ pwm[1-3]_auto_point2_pwm full-speed duty-cycle (255, i.e., 100%)
+ pwm[1-3]_auto_point1_pwm low-speed duty-cycle
+ pwm[1-3]_auto_pwm_min min-speed duty-cycle
+
+ zone[1-3]_auto_point3_temp full-speed temp (all outputs)
+ zone[1-3]_auto_point2_temp full-speed temp
+ zone[1-3]_auto_point1_temp low-speed temp
+ zone[1-3]_auto_point1_temp_hyst min-speed temp
+
+The chip adjusts the output duty-cycle linearly in the range of auto_point1_pwm
+to auto_point2_pwm if the temperature of the associated zone is between
+auto_point1_temp and auto_point2_temp. If the temperature drops below the
+auto_point1_temp_hyst value, the output duty-cycle is set to the auto_pwm_min
+value which only supports two values: 0 or auto_point1_pwm. That means that the
+fan either turns completely off or keeps spinning with the low-speed
+duty-cycle. If any of the temperatures rise above the auto_point3_temp value,
+all PWM outputs are set to 100% duty-cycle.
+
+Following is another representation of how the chip sets the output duty-cycle
+based on the temperature of the associated thermal zone:
+
+ Duty-Cycle Duty-Cycle
+ Temperature Rising Temp Falling Temp
+ ----------- ----------- ------------
+ full-speed full-speed full-speed
+
+ < linearly adjusted duty-cycle >
+
+ low-speed low-speed low-speed
+ min-speed low-speed
+ min-speed min-speed min-speed
+ min-speed min-speed
+
+
+Sysfs Attributes
+----------------
+
+Following is a list of all sysfs attributes that the driver provides, their
+permissions and a short description:
+
+Name Perm Description
+---- ---- -----------
+cpu0_vid RO CPU core reference voltage in
+ millivolts.
+vrm RW Voltage regulator module version
+ number.
+
+in[0-6]_input RO Measured voltage in millivolts.
+in[0-6]_min RW Low limit for voltage input.
+in[0-6]_max RW High limit for voltage input.
+in[0-6]_alarm RO Voltage input alarm. Returns 1 if
+ voltage input is or went outside the
+ associated min-max range, 0 otherwise.
+
+temp[1-3]_input RO Measured temperature in millidegree
+ Celsius.
+temp[1-3]_min RW Low limit for temp input.
+temp[1-3]_max RW High limit for temp input.
+temp[1-3]_offset RW Offset for temp input. This value will
+ be added by the chip to the measured
+ temperature.
+temp[1-3]_alarm RO Alarm for temp input. Returns 1 if temp
+ input is or went outside the associated
+ min-max range, 0 otherwise.
+temp[1-3]_fault RO Temp input fault. Returns 1 if the chip
+ detects a faulty thermal diode or an
+ unconnected temp input, 0 otherwise.
+
+zone[1-3]_auto_channels_temp RO Temperature zone to temperature input
+ mapping. This attribute is a bitfield
+ and supports the following values:
+ 1: temp1
+ 2: temp2
+ 4: temp3
+zone[1-3]_auto_point1_temp_hyst RW Auto PWM temp point1 hysteresis. The
+ output of the corresponding PWM is set
+ to the pwm_auto_min value if the temp
+ falls below the auto_point1_temp_hyst
+ value.
+zone[1-3]_auto_point[1-3]_temp RW Auto PWM temp points. Auto_point1 is
+ the low-speed temp, auto_point2 is the
+ full-speed temp, and auto_point3 is the
+ temp at which all PWM outputs are set
+ to full-speed (100% duty-cycle).
+
+fan[1-6]_input RO Measured fan speed in RPM.
+fan[1-6]_min RW Low limit for fan input.
+fan[1-6]_alarm RO Alarm for fan input. Returns 1 if fan
+ input is or went below the associated
+ min value, 0 otherwise.
+fan[1-4]_type RW Type of attached fan. Expressed in
+ number of pulses per revolution that
+ the fan generates. Supported values are
+ 1, 2, and 4.
+fan[5-6]_max RW Max attainable RPM at 100% duty-cycle.
+ Required for chip to adjust the
+ sampling rate accordingly.
+
+pmw[1-3,5-6] RO/RW Duty-cycle of PWM output. Supported
+ values are 0-255 (0%-100%). Only
+ writeable if the associated PWM is in
+ manual mode.
+pwm[1-3]_enable RW Enable of PWM outputs 1-3. Supported
+ values are:
+ 0: turned off (output @ 100%)
+ 1: manual mode
+ 2: automatic mode
+pwm[5-6]_enable RO Enable of PWM outputs 5-6. Always
+ returns 1 since these 2 outputs are
+ hard-wired to manual mode.
+pmw[1-3,5-6]_freq RW Frequency of PWM output. Supported
+ values are in the range 11Hz-30000Hz
+ (default is 25000Hz).
+pmw[1-3]_ramp_rate RW Ramp rate of PWM output. Determines how
+ fast the PWM duty-cycle will change
+ when the PWM is in automatic mode.
+ Expressed in ms per PWM step. Supported
+ values are in the range 0ms-206ms
+ (default is 0, which means the duty-
+ cycle changes instantly).
+pwm[1-3]_auto_channels_zone RW PWM output to temperature zone mapping.
+ This attribute is a bitfield and
+ supports the following values:
+ 1: zone1
+ 2: zone2
+ 4: zone3
+ 6: highest of zone[2-3]
+ 7: highest of zone[1-3]
+pwm[1-3]_auto_pwm_min RW Auto PWM min pwm. Minimum PWM duty-
+ cycle. Supported values are 0 or
+ auto_point1_pwm.
+pwm[1-3]_auto_point1_pwm RW Auto PWM pwm point. Auto_point1 is the
+ low-speed duty-cycle.
+pwm[1-3]_auto_point2_pwm RO Auto PWM pwm point. Auto_point2 is the
+ full-speed duty-cycle which is hard-
+ wired to 255 (100% duty-cycle).
diff --git a/Documentation/hwmon/ds1621 b/Documentation/hwmon/ds1621
new file mode 100644
index 0000000..1fee6f1
--- /dev/null
+++ b/Documentation/hwmon/ds1621
@@ -0,0 +1,108 @@
+Kernel driver ds1621
+====================
+
+Supported chips:
+ * Dallas Semiconductor DS1621
+ Prefix: 'ds1621'
+ Addresses scanned: I2C 0x48 - 0x4f
+ Datasheet: Publicly available at the Dallas Semiconductor website
+ http://www.dalsemi.com/
+ * Dallas Semiconductor DS1625
+ Prefix: 'ds1621'
+ Addresses scanned: I2C 0x48 - 0x4f
+ Datasheet: Publicly available at the Dallas Semiconductor website
+ http://www.dalsemi.com/
+
+Authors:
+ Christian W. Zuckschwerdt <zany@triq.net>
+ valuable contributions by Jan M. Sendler <sendler@sendler.de>
+ ported to 2.6 by Aurelien Jarno <aurelien@aurel32.net>
+ with the help of Jean Delvare <khali@linux-fr.org>
+
+Module Parameters
+------------------
+
+* polarity int
+ Output's polarity: 0 = active high, 1 = active low
+
+Description
+-----------
+
+The DS1621 is a (one instance) digital thermometer and thermostat. It has
+both high and low temperature limits which can be user defined (i.e.
+programmed into non-volatile on-chip registers). Temperature range is -55
+degree Celsius to +125 in 0.5 increments. You may convert this into a
+Fahrenheit range of -67 to +257 degrees with 0.9 steps. If polarity
+parameter is not provided, original value is used.
+
+As for the thermostat, behavior can also be programmed using the polarity
+toggle. On the one hand ("heater"), the thermostat output of the chip,
+Tout, will trigger when the low limit temperature is met or underrun and
+stays high until the high limit is met or exceeded. On the other hand
+("cooler"), vice versa. That way "heater" equals "active low", whereas
+"conditioner" equals "active high". Please note that the DS1621 data sheet
+is somewhat misleading in this point since setting the polarity bit does
+not simply invert Tout.
+
+A second thing is that, during extensive testing, Tout showed a tolerance
+of up to +/- 0.5 degrees even when compared against precise temperature
+readings. Be sure to have a high vs. low temperature limit gap of al least
+1.0 degree Celsius to avoid Tout "bouncing", though!
+
+As for alarms, you can read the alarm status of the DS1621 via the 'alarms'
+/sys file interface. The result consists mainly of bit 6 and 5 of the
+configuration register of the chip; bit 6 (0x40 or 64) is the high alarm
+bit and bit 5 (0x20 or 32) the low one. These bits are set when the high or
+low limits are met or exceeded and are reset by the module as soon as the
+respective temperature ranges are left.
+
+The alarm registers are in no way suitable to find out about the actual
+status of Tout. They will only tell you about its history, whether or not
+any of the limits have ever been met or exceeded since last power-up or
+reset. Be aware: When testing, it showed that the status of Tout can change
+with neither of the alarms set.
+
+Temperature conversion of the DS1621 takes up to 1000ms; internal access to
+non-volatile registers may last for 10ms or below.
+
+High Accuracy Temperature Reading
+---------------------------------
+
+As said before, the temperature issued via the 9-bit i2c-bus data is
+somewhat arbitrary. Internally, the temperature conversion is of a
+different kind that is explained (not so...) well in the DS1621 data sheet.
+To cut the long story short: Inside the DS1621 there are two oscillators,
+both of them biassed by a temperature coefficient.
+
+Higher resolution of the temperature reading can be achieved using the
+internal projection, which means taking account of REG_COUNT and REG_SLOPE
+(the driver manages them):
+
+Taken from Dallas Semiconductors App Note 068: 'Increasing Temperature
+Resolution on the DS1620' and App Note 105: 'High Resolution Temperature
+Measurement with Dallas Direct-to-Digital Temperature Sensors'
+
+- Read the 9-bit temperature and strip the LSB (Truncate the .5 degs)
+- The resulting value is TEMP_READ.
+- Then, read REG_COUNT.
+- And then, REG_SLOPE.
+
+ TEMP = TEMP_READ - 0.25 + ((REG_SLOPE - REG_COUNT) / REG_SLOPE)
+
+Note that this is what the DONE bit in the DS1621 configuration register is
+good for: Internally, one temperature conversion takes up to 1000ms. Before
+that conversion is complete you will not be able to read valid things out
+of REG_COUNT and REG_SLOPE. The DONE bit, as you may have guessed by now,
+tells you whether the conversion is complete ("done", in plain English) and
+thus, whether the values you read are good or not.
+
+The DS1621 has two modes of operation: "Continuous" conversion, which can
+be understood as the default stand-alone mode where the chip gets the
+temperature and controls external devices via its Tout pin or tells other
+i2c's about it if they care. The other mode is called "1SHOT", that means
+that it only figures out about the temperature when it is explicitly told
+to do so; this can be seen as power saving mode.
+
+Now if you want to read REG_COUNT and REG_SLOPE, you have to either stop
+the continuous conversions until the contents of these registers are valid,
+or, in 1SHOT mode, you have to have one conversion made.
diff --git a/Documentation/hwmon/f71805f b/Documentation/hwmon/f71805f
new file mode 100644
index 0000000..f0d5597
--- /dev/null
+++ b/Documentation/hwmon/f71805f
@@ -0,0 +1,167 @@
+Kernel driver f71805f
+=====================
+
+Supported chips:
+ * Fintek F71805F/FG
+ Prefix: 'f71805f'
+ Addresses scanned: none, address read from Super I/O config space
+ Datasheet: Available from the Fintek website
+ * Fintek F71806F/FG
+ Prefix: 'f71872f'
+ Addresses scanned: none, address read from Super I/O config space
+ Datasheet: Available from the Fintek website
+ * Fintek F71872F/FG
+ Prefix: 'f71872f'
+ Addresses scanned: none, address read from Super I/O config space
+ Datasheet: Available from the Fintek website
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+Thanks to Denis Kieft from Barracuda Networks for the donation of a
+test system (custom Jetway K8M8MS motherboard, with CPU and RAM) and
+for providing initial documentation.
+
+Thanks to Kris Chen and Aaron Huang from Fintek for answering technical
+questions and providing additional documentation.
+
+Thanks to Chris Lin from Jetway for providing wiring schematics and
+answering technical questions.
+
+
+Description
+-----------
+
+The Fintek F71805F/FG Super I/O chip includes complete hardware monitoring
+capabilities. It can monitor up to 9 voltages (counting its own power
+source), 3 fans and 3 temperature sensors.
+
+This chip also has fan controlling features, using either DC or PWM, in
+three different modes (one manual, two automatic).
+
+The Fintek F71872F/FG Super I/O chip is almost the same, with two
+additional internal voltages monitored (VSB and battery). It also features
+6 VID inputs. The VID inputs are not yet supported by this driver.
+
+The Fintek F71806F/FG Super-I/O chip is essentially the same as the
+F71872F/FG, and is undistinguishable therefrom.
+
+The driver assumes that no more than one chip is present, which seems
+reasonable.
+
+
+Voltage Monitoring
+------------------
+
+Voltages are sampled by an 8-bit ADC with a LSB of 8 mV. The supported
+range is thus from 0 to 2.040 V. Voltage values outside of this range
+need external resistors. An exception is in0, which is used to monitor
+the chip's own power source (+3.3V), and is divided internally by a
+factor 2. For the F71872F/FG, in9 (VSB) and in10 (battery) are also
+divided internally by a factor 2.
+
+The two LSB of the voltage limit registers are not used (always 0), so
+you can only set the limits in steps of 32 mV (before scaling).
+
+The wirings and resistor values suggested by Fintek are as follow:
+
+ pin expected
+ name use R1 R2 divider raw val.
+
+in0 VCC VCC3.3V int. int. 2.00 1.65 V
+in1 VIN1 VTT1.2V 10K - 1.00 1.20 V
+in2 VIN2 VRAM 100K 100K 2.00 ~1.25 V (1)
+in3 VIN3 VCHIPSET 47K 100K 1.47 2.24 V (2)
+in4 VIN4 VCC5V 200K 47K 5.25 0.95 V
+in5 VIN5 +12V 200K 20K 11.00 1.05 V
+in6 VIN6 VCC1.5V 10K - 1.00 1.50 V
+in7 VIN7 VCORE 10K - 1.00 ~1.40 V (1)
+in8 VIN8 VSB5V 200K 47K 1.00 0.95 V
+in10 VSB VSB3.3V int. int. 2.00 1.65 V (3)
+in9 VBAT VBATTERY int. int. 2.00 1.50 V (3)
+
+(1) Depends on your hardware setup.
+(2) Obviously not correct, swapping R1 and R2 would make more sense.
+(3) F71872F/FG only.
+
+These values can be used as hints at best, as motherboard manufacturers
+are free to use a completely different setup. As a matter of fact, the
+Jetway K8M8MS uses a significantly different setup. You will have to
+find out documentation about your own motherboard, and edit sensors.conf
+accordingly.
+
+Each voltage measured has associated low and high limits, each of which
+triggers an alarm when crossed.
+
+
+Fan Monitoring
+--------------
+
+Fan rotation speeds are reported as 12-bit values from a gated clock
+signal. Speeds down to 366 RPM can be measured. There is no theoretical
+high limit, but values over 6000 RPM seem to cause problem. The effective
+resolution is much lower than you would expect, the step between different
+register values being 10 rather than 1.
+
+The chip assumes 2 pulse-per-revolution fans.
+
+An alarm is triggered if the rotation speed drops below a programmable
+limit or is too low to be measured.
+
+
+Temperature Monitoring
+----------------------
+
+Temperatures are reported in degrees Celsius. Each temperature measured
+has a high limit, those crossing triggers an alarm. There is an associated
+hysteresis value, below which the temperature has to drop before the
+alarm is cleared.
+
+All temperature channels are external, there is no embedded temperature
+sensor. Each channel can be used for connecting either a thermal diode
+or a thermistor. The driver reports the currently selected mode, but
+doesn't allow changing it. In theory, the BIOS should have configured
+everything properly.
+
+
+Fan Control
+-----------
+
+Both PWM (pulse-width modulation) and DC fan speed control methods are
+supported. The right one to use depends on external circuitry on the
+motherboard, so the driver assumes that the BIOS set the method
+properly. The driver will report the method, but won't let you change
+it.
+
+When the PWM method is used, you can select the operating frequency,
+from 187.5 kHz (default) to 31 Hz. The best frequency depends on the
+fan model. As a rule of thumb, lower frequencies seem to give better
+control, but may generate annoying high-pitch noise. So a frequency just
+above the audible range, such as 25 kHz, may be a good choice; if this
+doesn't give you good linear control, try reducing it. Fintek recommends
+not going below 1 kHz, as the fan tachometers get confused by lower
+frequencies as well.
+
+When the DC method is used, Fintek recommends not going below 5 V, which
+corresponds to a pwm value of 106 for the driver. The driver doesn't
+enforce this limit though.
+
+Three different fan control modes are supported; the mode number is written
+to the pwm<n>_enable file.
+
+* 1: Manual mode
+ You ask for a specific PWM duty cycle or DC voltage by writing to the
+ pwm<n> file.
+
+* 2: Temperature mode
+ You define 3 temperature/fan speed trip points using the
+ pwm<n>_auto_point<m>_temp and _fan files. These define a staircase
+ relationship between temperature and fan speed with two additional points
+ interpolated between the values that you define. When the temperature
+ is below auto_point1_temp the fan is switched off.
+
+* 3: Fan speed mode
+ You ask for a specific fan speed by writing to the fan<n>_target file.
+
+Both of the automatic modes require that pwm1 corresponds to fan1, pwm2 to
+fan2 and pwm3 to fan3. Temperature mode also requires that temp1 corresponds
+to pwm1 and fan1, etc.
diff --git a/Documentation/hwmon/fscher b/Documentation/hwmon/fscher
new file mode 100644
index 0000000..6403165
--- /dev/null
+++ b/Documentation/hwmon/fscher
@@ -0,0 +1,169 @@
+Kernel driver fscher
+====================
+
+Supported chips:
+ * Fujitsu-Siemens Hermes chip
+ Prefix: 'fscher'
+ Addresses scanned: I2C 0x73
+
+Authors:
+ Reinhard Nissl <rnissl@gmx.de> based on work
+ from Hermann Jung <hej@odn.de>,
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>
+
+Description
+-----------
+
+This driver implements support for the Fujitsu-Siemens Hermes chip. It is
+described in the 'Register Set Specification BMC Hermes based Systemboard'
+from Fujitsu-Siemens.
+
+The Hermes chip implements a hardware-based system management, e.g. for
+controlling fan speed and core voltage. There is also a watchdog counter on
+the chip which can trigger an alarm and even shut the system down.
+
+The chip provides three temperature values (CPU, motherboard and
+auxiliary), three voltage values (+12V, +5V and battery) and three fans
+(power supply, CPU and auxiliary).
+
+Temperatures are measured in degrees Celsius. The resolution is 1 degree.
+
+Fan rotation speeds are reported in RPM (rotations per minute). The value
+can be divided by a programmable divider (1, 2 or 4) which is stored on
+the chip.
+
+Voltage sensors (also known as "in" sensors) report their values in volts.
+
+All values are reported as final values from the driver. There is no need
+for further calculations.
+
+
+Detailed description
+--------------------
+
+Below you'll find a single line description of all the bit values. With
+this information, you're able to decode e. g. alarms, wdog, etc. To make
+use of the watchdog, you'll need to set the watchdog time and enable the
+watchdog. After that it is necessary to restart the watchdog time within
+the specified period of time, or a system reset will occur.
+
+* revision
+ READING & 0xff = 0x??: HERMES revision identification
+
+* alarms
+ READING & 0x80 = 0x80: CPU throttling active
+ READING & 0x80 = 0x00: CPU running at full speed
+
+ READING & 0x10 = 0x10: software event (see control:1)
+ READING & 0x10 = 0x00: no software event
+
+ READING & 0x08 = 0x08: watchdog event (see wdog:2)
+ READING & 0x08 = 0x00: no watchdog event
+
+ READING & 0x02 = 0x02: thermal event (see temp*:1)
+ READING & 0x02 = 0x00: no thermal event
+
+ READING & 0x01 = 0x01: fan event (see fan*:1)
+ READING & 0x01 = 0x00: no fan event
+
+ READING & 0x13 ! 0x00: ALERT LED is flashing
+
+* control
+ READING & 0x01 = 0x01: software event
+ READING & 0x01 = 0x00: no software event
+
+ WRITING & 0x01 = 0x01: set software event
+ WRITING & 0x01 = 0x00: clear software event
+
+* watchdog_control
+ READING & 0x80 = 0x80: power off on watchdog event while thermal event
+ READING & 0x80 = 0x00: watchdog power off disabled (just system reset enabled)
+
+ READING & 0x40 = 0x40: watchdog timebase 60 seconds (see also wdog:1)
+ READING & 0x40 = 0x00: watchdog timebase 2 seconds
+
+ READING & 0x10 = 0x10: watchdog enabled
+ READING & 0x10 = 0x00: watchdog disabled
+
+ WRITING & 0x80 = 0x80: enable "power off on watchdog event while thermal event"
+ WRITING & 0x80 = 0x00: disable "power off on watchdog event while thermal event"
+
+ WRITING & 0x40 = 0x40: set watchdog timebase to 60 seconds
+ WRITING & 0x40 = 0x00: set watchdog timebase to 2 seconds
+
+ WRITING & 0x20 = 0x20: disable watchdog
+
+ WRITING & 0x10 = 0x10: enable watchdog / restart watchdog time
+
+* watchdog_state
+ READING & 0x02 = 0x02: watchdog system reset occurred
+ READING & 0x02 = 0x00: no watchdog system reset occurred
+
+ WRITING & 0x02 = 0x02: clear watchdog event
+
+* watchdog_preset
+ READING & 0xff = 0x??: configured watch dog time in units (see wdog:3 0x40)
+
+ WRITING & 0xff = 0x??: configure watch dog time in units
+
+* in* (0: +5V, 1: +12V, 2: onboard 3V battery)
+ READING: actual voltage value
+
+* temp*_status (1: CPU sensor, 2: onboard sensor, 3: auxiliary sensor)
+ READING & 0x02 = 0x02: thermal event (overtemperature)
+ READING & 0x02 = 0x00: no thermal event
+
+ READING & 0x01 = 0x01: sensor is working
+ READING & 0x01 = 0x00: sensor is faulty
+
+ WRITING & 0x02 = 0x02: clear thermal event
+
+* temp*_input (1: CPU sensor, 2: onboard sensor, 3: auxiliary sensor)
+ READING: actual temperature value
+
+* fan*_status (1: power supply fan, 2: CPU fan, 3: auxiliary fan)
+ READING & 0x04 = 0x04: fan event (fan fault)
+ READING & 0x04 = 0x00: no fan event
+
+ WRITING & 0x04 = 0x04: clear fan event
+
+* fan*_div (1: power supply fan, 2: CPU fan, 3: auxiliary fan)
+ Divisors 2,4 and 8 are supported, both for reading and writing
+
+* fan*_pwm (1: power supply fan, 2: CPU fan, 3: auxiliary fan)
+ READING & 0xff = 0x00: fan may be switched off
+ READING & 0xff = 0x01: fan must run at least at minimum speed (supply: 6V)
+ READING & 0xff = 0xff: fan must run at maximum speed (supply: 12V)
+ READING & 0xff = 0x??: fan must run at least at given speed (supply: 6V..12V)
+
+ WRITING & 0xff = 0x00: fan may be switched off
+ WRITING & 0xff = 0x01: fan must run at least at minimum speed (supply: 6V)
+ WRITING & 0xff = 0xff: fan must run at maximum speed (supply: 12V)
+ WRITING & 0xff = 0x??: fan must run at least at given speed (supply: 6V..12V)
+
+* fan*_input (1: power supply fan, 2: CPU fan, 3: auxiliary fan)
+ READING: actual RPM value
+
+
+Limitations
+-----------
+
+* Measuring fan speed
+It seems that the chip counts "ripples" (typical fans produce 2 ripples per
+rotation while VERAX fans produce 18) in a 9-bit register. This register is
+read out every second, then the ripple prescaler (2, 4 or 8) is applied and
+the result is stored in the 8 bit output register. Due to the limitation of
+the counting register to 9 bits, it is impossible to measure a VERAX fan
+properly (even with a prescaler of 8). At its maximum speed of 3500 RPM the
+fan produces 1080 ripples per second which causes the counting register to
+overflow twice, leading to only 186 RPM.
+
+* Measuring input voltages
+in2 ("battery") reports the voltage of the onboard lithium battery and not
++3.3V from the power supply.
+
+* Undocumented features
+Fujitsu-Siemens Computers has not documented all features of the chip so
+far. Their software, System Guard, shows that there are a still some
+features which cannot be controlled by this implementation.
diff --git a/Documentation/hwmon/gl518sm b/Documentation/hwmon/gl518sm
new file mode 100644
index 0000000..229f8b7
--- /dev/null
+++ b/Documentation/hwmon/gl518sm
@@ -0,0 +1,74 @@
+Kernel driver gl518sm
+=====================
+
+Supported chips:
+ * Genesys Logic GL518SM release 0x00
+ Prefix: 'gl518sm'
+ Addresses scanned: I2C 0x2c and 0x2d
+ Datasheet: http://www.genesyslogic.com/pdf
+ * Genesys Logic GL518SM release 0x80
+ Prefix: 'gl518sm'
+ Addresses scanned: I2C 0x2c and 0x2d
+ Datasheet: http://www.genesyslogic.com/pdf
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Kyösti Mälkki <kmalkki@cc.hut.fi>
+ Hong-Gunn Chew <hglinux@gunnet.org>
+ Jean Delvare <khali@linux-fr.org>
+
+Description
+-----------
+
+IMPORTANT:
+
+For the revision 0x00 chip, the in0, in1, and in2 values (+5V, +3V,
+and +12V) CANNOT be read. This is a limitation of the chip, not the driver.
+
+This driver supports the Genesys Logic GL518SM chip. There are at least
+two revision of this chip, which we call revision 0x00 and 0x80. Revision
+0x80 chips support the reading of all voltages and revision 0x00 only
+for VIN3.
+
+The GL518SM implements one temperature sensor, two fan rotation speed
+sensors, and four voltage sensors. It can report alarms through the
+computer speakers.
+
+Temperatures are measured in degrees Celsius. An alarm goes off while the
+temperature is above the over temperature limit, and has not yet dropped
+below the hysteresis limit. The alarm always reflects the current
+situation. Measurements are guaranteed between -10 degrees and +110
+degrees, with a accuracy of +/-3 degrees.
+
+Rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. In
+case when you have selected to turn fan1 off, no fan1 alarm is triggered.
+
+Fan readings can be divided by a programmable divider (1, 2, 4 or 8) to
+give the readings more range or accuracy. Not all RPM values can
+accurately be represented, so some rounding is done. With a divider
+of 2, the lowest representable value is around 1900 RPM.
+
+Voltage sensors (also known as VIN sensors) report their values in volts.
+An alarm is triggered if the voltage has crossed a programmable minimum or
+maximum limit. Note that minimum in this case always means 'closest to
+zero'; this is important for negative voltage measurements. The VDD input
+measures voltages between 0.000 and 5.865 volt, with a resolution of 0.023
+volt. The other inputs measure voltages between 0.000 and 4.845 volt, with
+a resolution of 0.019 volt. Note that revision 0x00 chips do not support
+reading the current voltage of any input except for VIN3; limit setting and
+alarms work fine, though.
+
+When an alarm is triggered, you can be warned by a beeping signal through your
+computer speaker. It is possible to enable all beeping globally, or only the
+beeping for some alarms.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once (except for temperature alarms). This means that the
+cause for the alarm may already have disappeared! Note that in the current
+implementation, all hardware registers are read whenever any data is read
+(unless it is less than 1.5 seconds since the last update). This means that
+you can easily miss once-only alarms.
+
+The GL518SM only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
diff --git a/Documentation/hwmon/ibmaem b/Documentation/hwmon/ibmaem
new file mode 100644
index 0000000..e98bdfe
--- /dev/null
+++ b/Documentation/hwmon/ibmaem
@@ -0,0 +1,38 @@
+Kernel driver ibmaem
+======================
+
+This driver talks to the IBM Systems Director Active Energy Manager, known
+henceforth as AEM.
+
+Supported systems:
+ * Any recent IBM System X server with AEM support.
+ This includes the x3350, x3550, x3650, x3655, x3755, x3850 M2,
+ x3950 M2, and certain HS2x/LS2x/QS2x blades. The IPMI host interface
+ driver ("ipmi-si") needs to be loaded for this driver to do anything.
+ Prefix: 'ibmaem'
+ Datasheet: Not available
+
+Author: Darrick J. Wong
+
+Description
+-----------
+
+This driver implements sensor reading support for the energy and power meters
+available on various IBM System X hardware through the BMC. All sensor banks
+will be exported as platform devices; this driver can talk to both v1 and v2
+interfaces. This driver is completely separate from the older ibmpex driver.
+
+The v1 AEM interface has a simple set of features to monitor energy use. There
+is a register that displays an estimate of raw energy consumption since the
+last BMC reset, and a power sensor that returns average power use over a
+configurable interval.
+
+The v2 AEM interface is a bit more sophisticated, being able to present a wider
+range of energy and power use registers, the power cap as set by the AEM
+software, and temperature sensors.
+
+Special Features
+----------------
+
+The "power_cap" value displays the current system power cap, as set by the AEM
+software. Setting the power cap from the host is not currently supported.
diff --git a/Documentation/hwmon/it87 b/Documentation/hwmon/it87
new file mode 100644
index 0000000..042c041
--- /dev/null
+++ b/Documentation/hwmon/it87
@@ -0,0 +1,157 @@
+Kernel driver it87
+==================
+
+Supported chips:
+ * IT8705F
+ Prefix: 'it87'
+ Addresses scanned: from Super I/O config space (8 I/O ports)
+ Datasheet: Publicly available at the ITE website
+ http://www.ite.com.tw/product_info/file/pc/IT8705F_V.0.4.1.pdf
+ * IT8712F
+ Prefix: 'it8712'
+ Addresses scanned: from Super I/O config space (8 I/O ports)
+ Datasheet: Publicly available at the ITE website
+ http://www.ite.com.tw/product_info/file/pc/IT8712F_V0.9.1.pdf
+ http://www.ite.com.tw/product_info/file/pc/Errata%20V0.1%20for%20IT8712F%20V0.9.1.pdf
+ http://www.ite.com.tw/product_info/file/pc/IT8712F_V0.9.3.pdf
+ * IT8716F/IT8726F
+ Prefix: 'it8716'
+ Addresses scanned: from Super I/O config space (8 I/O ports)
+ Datasheet: Publicly available at the ITE website
+ http://www.ite.com.tw/product_info/file/pc/IT8716F_V0.3.ZIP
+ http://www.ite.com.tw/product_info/file/pc/IT8726F_V0.3.pdf
+ * IT8718F
+ Prefix: 'it8718'
+ Addresses scanned: from Super I/O config space (8 I/O ports)
+ Datasheet: Publicly available at the ITE website
+ http://www.ite.com.tw/product_info/file/pc/IT8718F_V0.2.zip
+ http://www.ite.com.tw/product_info/file/pc/IT8718F_V0%203_(for%20C%20version).zip
+ * SiS950 [clone of IT8705F]
+ Prefix: 'it87'
+ Addresses scanned: from Super I/O config space (8 I/O ports)
+ Datasheet: No longer be available
+
+Authors:
+ Christophe Gauthron
+ Jean Delvare <khali@linux-fr.org>
+
+
+Module Parameters
+-----------------
+
+* update_vbat: int
+
+ 0 if vbat should report power on value, 1 if vbat should be updated after
+ each read. Default is 0. On some boards the battery voltage is provided
+ by either the battery or the onboard power supply. Only the first reading
+ at power on will be the actual battery voltage (which the chip does
+ automatically). On other boards the battery voltage is always fed to
+ the chip so can be read at any time. Excessive reading may decrease
+ battery life but no information is given in the datasheet.
+
+* fix_pwm_polarity int
+
+ Force PWM polarity to active high (DANGEROUS). Some chips are
+ misconfigured by BIOS - PWM values would be inverted. This option tries
+ to fix this. Please contact your BIOS manufacturer and ask him for fix.
+
+
+Hardware Interfaces
+-------------------
+
+All the chips suported by this driver are LPC Super-I/O chips, accessed
+through the LPC bus (ISA-like I/O ports). The IT8712F additionally has an
+SMBus interface to the hardware monitoring functions. This driver no
+longer supports this interface though, as it is slower and less reliable
+than the ISA access, and was only available on a small number of
+motherboard models.
+
+
+Description
+-----------
+
+This driver implements support for the IT8705F, IT8712F, IT8716F,
+IT8718F, IT8726F and SiS950 chips.
+
+These chips are 'Super I/O chips', supporting floppy disks, infrared ports,
+joysticks and other miscellaneous stuff. For hardware monitoring, they
+include an 'environment controller' with 3 temperature sensors, 3 fan
+rotation speed sensors, 8 voltage sensors, and associated alarms.
+
+The IT8712F and IT8716F additionally feature VID inputs, used to report
+the Vcore voltage of the processor. The early IT8712F have 5 VID pins,
+the IT8716F and late IT8712F have 6. They are shared with other functions
+though, so the functionality may not be available on a given system.
+The driver dumbly assume it is there.
+
+The IT8718F also features VID inputs (up to 8 pins) but the value is
+stored in the Super-I/O configuration space. Due to technical limitations,
+this value can currently only be read once at initialization time, so
+the driver won't notice and report changes in the VID value. The two
+upper VID bits share their pins with voltage inputs (in5 and in6) so you
+can't have both on a given board.
+
+The IT8716F, IT8718F and later IT8712F revisions have support for
+2 additional fans. The additional fans are supported by the driver.
+
+The IT8716F and IT8718F, and late IT8712F and IT8705F also have optional
+16-bit tachometer counters for fans 1 to 3. This is better (no more fan
+clock divider mess) but not compatible with the older chips and
+revisions. The 16-bit tachometer mode is enabled by the driver when one
+of the above chips is detected.
+
+The IT8726F is just bit enhanced IT8716F with additional hardware
+for AMD power sequencing. Therefore the chip will appear as IT8716F
+to userspace applications.
+
+Temperatures are measured in degrees Celsius. An alarm is triggered once
+when the Overtemperature Shutdown limit is crossed.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. When
+16-bit tachometer counters aren't used, fan readings can be divided by
+a programmable divider (1, 2, 4 or 8) to give the readings more range or
+accuracy. With a divider of 2, the lowest representable value is around
+2600 RPM. Not all RPM values can accurately be represented, so some rounding
+is done.
+
+Voltage sensors (also known as IN sensors) report their values in volts. An
+alarm is triggered if the voltage has crossed a programmable minimum or
+maximum limit. Note that minimum in this case always means 'closest to
+zero'; this is important for negative voltage measurements. All voltage
+inputs can measure voltages between 0 and 4.08 volts, with a resolution of
+0.016 volt. The battery voltage in8 does not have limit registers.
+
+The VID lines (IT8712F/IT8716F/IT8718F) encode the core voltage value:
+the voltage level your processor should work with. This is hardcoded by
+the mainboard and/or processor itself. It is a value in volts.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may already
+have disappeared! Note that in the current implementation, all hardware
+registers are read whenever any data is read (unless it is less than 1.5
+seconds since the last update). This means that you can easily miss
+once-only alarms.
+
+The IT87xx only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
+
+To change sensor N to a thermistor, 'echo 4 > tempN_type' where N is 1, 2,
+or 3. To change sensor N to a thermal diode, 'echo 3 > tempN_type'.
+Give 0 for unused sensor. Any other value is invalid. To configure this at
+startup, consult lm_sensors's /etc/sensors.conf. (4 = thermistor;
+3 = thermal diode)
+
+
+Fan speed control
+-----------------
+
+The fan speed control features are limited to manual PWM mode. Automatic
+"Smart Guardian" mode control handling is not implemented. However
+if you want to go for "manual mode" just write 1 to pwmN_enable.
+
+If you are only able to control the fan speed with very small PWM values,
+try lowering the PWM base frequency (pwm1_freq). Depending on the fan,
+it may give you a somewhat greater control range. The same frequency is
+used to drive all fan outputs, which is why pwm2_freq and pwm3_freq are
+read-only.
diff --git a/Documentation/hwmon/k8temp b/Documentation/hwmon/k8temp
new file mode 100644
index 0000000..0005c71
--- /dev/null
+++ b/Documentation/hwmon/k8temp
@@ -0,0 +1,55 @@
+Kernel driver k8temp
+====================
+
+Supported chips:
+ * AMD Athlon64/FX or Opteron CPUs
+ Prefix: 'k8temp'
+ Addresses scanned: PCI space
+ Datasheet: http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+
+Author: Rudolf Marek
+Contact: Rudolf Marek <r.marek@assembler.cz>
+
+Description
+-----------
+
+This driver permits reading temperature sensor(s) embedded inside AMD K8
+family CPUs (Athlon64/FX, Opteron). Official documentation says that it works
+from revision F of K8 core, but in fact it seems to be implemented for all
+revisions of K8 except the first two revisions (SH-B0 and SH-B3).
+
+Please note that you will need at least lm-sensors 2.10.1 for proper userspace
+support.
+
+There can be up to four temperature sensors inside single CPU. The driver
+will auto-detect the sensors and will display only temperatures from
+implemented sensors.
+
+Mapping of /sys files is as follows:
+
+temp1_input - temperature of Core 0 and "place" 0
+temp2_input - temperature of Core 0 and "place" 1
+temp3_input - temperature of Core 1 and "place" 0
+temp4_input - temperature of Core 1 and "place" 1
+
+Temperatures are measured in degrees Celsius and measurement resolution is
+1 degree C. It is expected that future CPU will have better resolution. The
+temperature is updated once a second. Valid temperatures are from -49 to
+206 degrees C.
+
+Temperature known as TCaseMax was specified for processors up to revision E.
+This temperature is defined as temperature between heat-spreader and CPU
+case, so the internal CPU temperature supplied by this driver can be higher.
+There is no easy way how to measure the temperature which will correlate
+with TCaseMax temperature.
+
+For newer revisions of CPU (rev F, socket AM2) there is a mathematically
+computed temperature called TControl, which must be lower than TControlMax.
+
+The relationship is following:
+
+temp1_input - TjOffset*2 < TControlMax,
+
+TjOffset is not yet exported by the driver, TControlMax is usually
+70 degrees C. The rule of the thumb -> CPU temperature should not cross
+60 degrees C too much.
diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d
new file mode 100644
index 0000000..65dfb0c
--- /dev/null
+++ b/Documentation/hwmon/lis3lv02d
@@ -0,0 +1,49 @@
+Kernel driver lis3lv02d
+==================
+
+Supported chips:
+
+ * STMicroelectronics LIS3LV02DL and LIS3LV02DQ
+
+Author:
+ Yan Burman <burman.yan@gmail.com>
+ Eric Piel <eric.piel@tremplin-utc.net>
+
+
+Description
+-----------
+
+This driver provides support for the accelerometer found in various HP laptops
+sporting the feature officially called "HP Mobile Data Protection System 3D" or
+"HP 3D DriveGuard". It detect automatically laptops with this sensor. Known models
+(for now the HP 2133, nc6420, nc2510, nc8510, nc84x0, nw9440 and nx9420) will
+have their axis automatically oriented on standard way (eg: you can directly
+play neverball). The accelerometer data is readable via
+/sys/devices/platform/lis3lv02d.
+
+Sysfs attributes under /sys/devices/platform/lis3lv02d/:
+position - 3D position that the accelerometer reports. Format: "(x,y,z)"
+calibrate - read: values (x, y, z) that are used as the base for input class device operation.
+ write: forces the base to be recalibrated with the current position.
+rate - reports the sampling rate of the accelerometer device in HZ
+
+This driver also provides an absolute input class device, allowing
+the laptop to act as a pinball machine-esque joystick.
+
+Axes orientation
+----------------
+
+For better compatibility between the various laptops. The values reported by
+the accelerometer are converted into a "standard" organisation of the axes
+(aka "can play neverball out of the box"):
+ * When the laptop is horizontal the position reported is about 0 for X and Y
+and a positive value for Z
+ * If the left side is elevated, X increases (becomes positive)
+ * If the front side (where the touchpad is) is elevated, Y decreases (becomes negative)
+ * If the laptop is put upside-down, Z becomes negative
+
+If your laptop model is not recognized (cf "dmesg"), you can send an email to the
+authors to add it to the database. When reporting a new laptop, please include
+the output of "dmidecode" plus the value of /sys/devices/platform/lis3lv02d/position
+in these four cases.
+
diff --git a/Documentation/hwmon/lm63 b/Documentation/hwmon/lm63
new file mode 100644
index 0000000..31660bf
--- /dev/null
+++ b/Documentation/hwmon/lm63
@@ -0,0 +1,57 @@
+Kernel driver lm63
+==================
+
+Supported chips:
+ * National Semiconductor LM63
+ Prefix: 'lm63'
+ Addresses scanned: I2C 0x4c
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/pf/LM/LM63.html
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+Thanks go to Tyan and especially Alex Buckingham for setting up a remote
+access to their S4882 test platform for this driver.
+ http://www.tyan.com/
+
+Description
+-----------
+
+The LM63 is a digital temperature sensor with integrated fan monitoring
+and control.
+
+The LM63 is basically an LM86 with fan speed monitoring and control
+capabilities added. It misses some of the LM86 features though:
+ - No low limit for local temperature.
+ - No critical limit for local temperature.
+ - Critical limit for remote temperature can be changed only once. We
+ will consider that the critical limit is read-only.
+
+The datasheet isn't very clear about what the tachometer reading is.
+
+An explanation from National Semiconductor: The two lower bits of the read
+value have to be masked out. The value is still 16 bit in width.
+
+All temperature values are given in degrees Celsius. Resolution is 1.0
+degree for the local temperature, 0.125 degree for the remote temperature.
+
+The fan speed is measured using a tachometer. Contrary to most chips which
+store the value in an 8-bit register and have a selectable clock divider
+to make sure that the result will fit in the register, the LM63 uses 16-bit
+value for measuring the speed of the fan. It can measure fan speeds down to
+83 RPM, at least in theory.
+
+Note that the pin used for fan monitoring is shared with an alert out
+function. Depending on how the board designer wanted to use the chip, fan
+speed monitoring will or will not be possible. The proper chip configuration
+is left to the BIOS, and the driver will blindly trust it.
+
+A PWM output can be used to control the speed of the fan. The LM63 has two
+PWM modes: manual and automatic. Automatic mode is not fully implemented yet
+(you cannot define your custom PWM/temperature curve), and mode change isn't
+supported either.
+
+The lm63 driver will not update its values more frequently than every
+second; reading them more often will do no harm, but will return 'old'
+values.
+
diff --git a/Documentation/hwmon/lm70 b/Documentation/hwmon/lm70
new file mode 100644
index 0000000..2bdd3fe
--- /dev/null
+++ b/Documentation/hwmon/lm70
@@ -0,0 +1,31 @@
+Kernel driver lm70
+==================
+
+Supported chip:
+ * National Semiconductor LM70
+ Datasheet: http://www.national.com/pf/LM/LM70.html
+
+Author:
+ Kaiwan N Billimoria <kaiwan@designergraphix.com>
+
+Description
+-----------
+
+This driver implements support for the National Semiconductor LM70
+temperature sensor.
+
+The LM70 temperature sensor chip supports a single temperature sensor.
+It communicates with a host processor (or microcontroller) via an
+SPI/Microwire Bus interface.
+
+Communication with the LM70 is simple: when the temperature is to be sensed,
+the driver accesses the LM70 using SPI communication: 16 SCLK cycles
+comprise the MOSI/MISO loop. At the end of the transfer, the 11-bit 2's
+complement digital temperature (sent via the SIO line), is available in the
+driver for interpretation. This driver makes use of the kernel's in-core
+SPI support.
+
+Thanks to
+---------
+Jean Delvare <khali@linux-fr.org> for mentoring the hwmon-side driver
+development.
diff --git a/Documentation/hwmon/lm75 b/Documentation/hwmon/lm75
new file mode 100644
index 0000000..8e6356f
--- /dev/null
+++ b/Documentation/hwmon/lm75
@@ -0,0 +1,65 @@
+Kernel driver lm75
+==================
+
+Supported chips:
+ * National Semiconductor LM75
+ Prefix: 'lm75'
+ Addresses scanned: I2C 0x48 - 0x4f
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/
+ * Dallas Semiconductor DS75
+ Prefix: 'lm75'
+ Addresses scanned: I2C 0x48 - 0x4f
+ Datasheet: Publicly available at the Dallas Semiconductor website
+ http://www.maxim-ic.com/
+ * Dallas Semiconductor DS1775
+ Prefix: 'lm75'
+ Addresses scanned: I2C 0x48 - 0x4f
+ Datasheet: Publicly available at the Dallas Semiconductor website
+ http://www.maxim-ic.com/
+ * Maxim MAX6625, MAX6626
+ Prefix: 'lm75'
+ Addresses scanned: I2C 0x48 - 0x4b
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/
+ * Microchip (TelCom) TCN75
+ Prefix: 'lm75'
+ Addresses scanned: I2C 0x48 - 0x4f
+ Datasheet: Publicly available at the Microchip website
+ http://www.microchip.com/
+
+Author: Frodo Looijaard <frodol@dds.nl>
+
+Description
+-----------
+
+The LM75 implements one temperature sensor. Limits can be set through the
+Overtemperature Shutdown register and Hysteresis register. Each value can be
+set and read to half-degree accuracy.
+An alarm is issued (usually to a connected LM78) when the temperature
+gets higher then the Overtemperature Shutdown value; it stays on until
+the temperature falls below the Hysteresis value.
+All temperatures are in degrees Celsius, and are guaranteed within a
+range of -55 to +125 degrees.
+
+The LM75 only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
+
+The LM75 is usually used in combination with LM78-like chips, to measure
+the temperature of the processor(s).
+
+The DS75, DS1775, MAX6625, and MAX6626 are supported as well.
+They are not distinguished from an LM75. While most of these chips
+have three additional bits of accuracy (12 vs. 9 for the LM75),
+the additional bits are not supported. Not only that, but these chips will
+not be detected if not in 9-bit precision mode (use the force parameter if
+needed).
+
+The TCN75 is supported as well, and is not distinguished from an LM75.
+
+The LM75 is essentially an industry standard; there may be other
+LM75 clones not listed here, with or without various enhancements,
+that are supported.
+
+The LM77 is not supported, contrary to what we pretended for a long time.
+Both chips are simply not compatible, value encoding differs.
diff --git a/Documentation/hwmon/lm77 b/Documentation/hwmon/lm77
new file mode 100644
index 0000000..57c3a46
--- /dev/null
+++ b/Documentation/hwmon/lm77
@@ -0,0 +1,22 @@
+Kernel driver lm77
+==================
+
+Supported chips:
+ * National Semiconductor LM77
+ Prefix: 'lm77'
+ Addresses scanned: I2C 0x48 - 0x4b
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/
+
+Author: Andras BALI <drewie@freemail.hu>
+
+Description
+-----------
+
+The LM77 implements one temperature sensor. The temperature
+sensor incorporates a band-gap type temperature sensor,
+10-bit ADC, and a digital comparator with user-programmable upper
+and lower limit values.
+
+Limits can be set through the Overtemperature Shutdown register and
+Hysteresis register.
diff --git a/Documentation/hwmon/lm78 b/Documentation/hwmon/lm78
new file mode 100644
index 0000000..60932e2
--- /dev/null
+++ b/Documentation/hwmon/lm78
@@ -0,0 +1,67 @@
+Kernel driver lm78
+==================
+
+Supported chips:
+ * National Semiconductor LM78 / LM78-J
+ Prefix: 'lm78'
+ Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports)
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/
+ * National Semiconductor LM79
+ Prefix: 'lm79'
+ Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports)
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/
+
+Author: Frodo Looijaard <frodol@dds.nl>
+
+Description
+-----------
+
+This driver implements support for the National Semiconductor LM78, LM78-J
+and LM79. They are described as 'Microprocessor System Hardware Monitors'.
+
+There is almost no difference between the three supported chips. Functionally,
+the LM78 and LM78-J are exactly identical. The LM79 has one more VID line,
+which is used to report the lower voltages newer Pentium processors use.
+From here on, LM7* means either of these three types.
+
+The LM7* implements one temperature sensor, three fan rotation speed sensors,
+seven voltage sensors, VID lines, alarms, and some miscellaneous stuff.
+
+Temperatures are measured in degrees Celsius. An alarm is triggered once
+when the Overtemperature Shutdown limit is crossed; it is triggered again
+as soon as it drops below the Hysteresis value. A more useful behavior
+can be found by setting the Hysteresis value to +127 degrees Celsius; in
+this case, alarms are issued during all the time when the actual temperature
+is above the Overtemperature Shutdown value. Measurements are guaranteed
+between -55 and +125 degrees, with a resolution of 1 degree.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4 or 8) to give
+the readings more range or accuracy. Not all RPM values can accurately be
+represented, so some rounding is done. With a divider of 2, the lowest
+representable value is around 2600 RPM.
+
+Voltage sensors (also known as IN sensors) report their values in volts.
+An alarm is triggered if the voltage has crossed a programmable minimum
+or maximum limit. Note that minimum in this case always means 'closest to
+zero'; this is important for negative voltage measurements. All voltage
+inputs can measure voltages between 0 and 4.08 volts, with a resolution
+of 0.016 volt.
+
+The VID lines encode the core voltage value: the voltage level your processor
+should work with. This is hardcoded by the mainboard and/or processor itself.
+It is a value in volts. When it is unconnected, you will often find the
+value 3.50 V here.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may
+already have disappeared! Note that in the current implementation, all
+hardware registers are read whenever any data is read (unless it is less
+than 1.5 seconds since the last update). This means that you can easily
+miss once-only alarms.
+
+The LM7* only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
diff --git a/Documentation/hwmon/lm80 b/Documentation/hwmon/lm80
new file mode 100644
index 0000000..cb5b407
--- /dev/null
+++ b/Documentation/hwmon/lm80
@@ -0,0 +1,56 @@
+Kernel driver lm80
+==================
+
+Supported chips:
+ * National Semiconductor LM80
+ Prefix: 'lm80'
+ Addresses scanned: I2C 0x28 - 0x2f
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>
+
+Description
+-----------
+
+This driver implements support for the National Semiconductor LM80.
+It is described as a 'Serial Interface ACPI-Compatible Microprocessor
+System Hardware Monitor'.
+
+The LM80 implements one temperature sensor, two fan rotation speed sensors,
+seven voltage sensors, alarms, and some miscellaneous stuff.
+
+Temperatures are measured in degrees Celsius. There are two sets of limits
+which operate independently. When the HOT Temperature Limit is crossed,
+this will cause an alarm that will be reasserted until the temperature
+drops below the HOT Hysteresis. The Overtemperature Shutdown (OS) limits
+should work in the same way (but this must be checked; the datasheet
+is unclear about this). Measurements are guaranteed between -55 and
++125 degrees. The current temperature measurement has a resolution of
+0.0625 degrees; the limits have a resolution of 1 degree.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4 or 8) to give
+the readings more range or accuracy. Not all RPM values can accurately be
+represented, so some rounding is done. With a divider of 2, the lowest
+representable value is around 2600 RPM.
+
+Voltage sensors (also known as IN sensors) report their values in volts.
+An alarm is triggered if the voltage has crossed a programmable minimum
+or maximum limit. Note that minimum in this case always means 'closest to
+zero'; this is important for negative voltage measurements. All voltage
+inputs can measure voltages between 0 and 2.55 volts, with a resolution
+of 0.01 volt.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may
+already have disappeared! Note that in the current implementation, all
+hardware registers are read whenever any data is read (unless it is less
+than 2.0 seconds since the last update). This means that you can easily
+miss once-only alarms.
+
+The LM80 only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
diff --git a/Documentation/hwmon/lm83 b/Documentation/hwmon/lm83
new file mode 100644
index 0000000..a04d1fe
--- /dev/null
+++ b/Documentation/hwmon/lm83
@@ -0,0 +1,85 @@
+Kernel driver lm83
+==================
+
+Supported chips:
+ * National Semiconductor LM83
+ Prefix: 'lm83'
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/pf/LM/LM83.html
+ * National Semiconductor LM82
+ Addresses scanned: I2C 0x18 - 0x1a, 0x29 - 0x2b, 0x4c - 0x4e
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/pf/LM/LM82.html
+
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+Description
+-----------
+
+The LM83 is a digital temperature sensor. It senses its own temperature as
+well as the temperature of up to three external diodes. The LM82 is
+a stripped down version of the LM83 that only supports one external diode.
+Both are compatible with many other devices such as the LM84 and all
+other ADM1021 clones. The main difference between the LM83 and the LM84
+in that the later can only sense the temperature of one external diode.
+
+Using the adm1021 driver for a LM83 should work, but only two temperatures
+will be reported instead of four.
+
+The LM83 is only found on a handful of motherboards. Both a confirmed
+list and an unconfirmed list follow. If you can confirm or infirm the
+fact that any of these motherboards do actually have an LM83, please
+contact us. Note that the LM90 can easily be misdetected as a LM83.
+
+Confirmed motherboards:
+ SBS P014
+ SBS PSL09
+
+Unconfirmed motherboards:
+ Gigabyte GA-8IK1100
+ Iwill MPX2
+ Soltek SL-75DRV5
+
+The LM82 is confirmed to have been found on most AMD Geode reference
+designs and test platforms.
+
+The driver has been successfully tested by Magnus Forsström, who I'd
+like to thank here. More testers will be of course welcome.
+
+The fact that the LM83 is only scarcely used can be easily explained.
+Most motherboards come with more than just temperature sensors for
+health monitoring. They also have voltage and fan rotation speed
+sensors. This means that temperature-only chips are usually used as
+secondary chips coupled with another chip such as an IT8705F or similar
+chip, which provides more features. Since systems usually need three
+temperature sensors (motherboard, processor, power supply) and primary
+chips provide some temperature sensors, the secondary chip, if needed,
+won't have to handle more than two temperatures. Thus, ADM1021 clones
+are sufficient, and there is no need for a four temperatures sensor
+chip such as the LM83. The only case where using an LM83 would make
+sense is on SMP systems, such as the above-mentioned Iwill MPX2,
+because you want an additional temperature sensor for each additional
+CPU.
+
+On the SBS P014, this is different, since the LM83 is the only hardware
+monitoring chipset. One temperature sensor is used for the motherboard
+(actually measuring the LM83's own temperature), one is used for the
+CPU. The two other sensors must be used to measure the temperature of
+two other points of the motherboard. We suspect these points to be the
+north and south bridges, but this couldn't be confirmed.
+
+All temperature values are given in degrees Celsius. Local temperature
+is given within a range of 0 to +85 degrees. Remote temperatures are
+given within a range of 0 to +125 degrees. Resolution is 1.0 degree,
+accuracy is guaranteed to 3.0 degrees (see the datasheet for more
+details).
+
+Each sensor has its own high limit, but the critical limit is common to
+all four sensors. There is no hysteresis mechanism as found on most
+recent temperature sensors.
+
+The lm83 driver will not update its values more frequently than every
+other second; reading them more often will do no harm, but will return
+'old' values.
diff --git a/Documentation/hwmon/lm85 b/Documentation/hwmon/lm85
new file mode 100644
index 0000000..4006207
--- /dev/null
+++ b/Documentation/hwmon/lm85
@@ -0,0 +1,208 @@
+Kernel driver lm85
+==================
+
+Supported chips:
+ * National Semiconductor LM85 (B and C versions)
+ Prefix: 'lm85'
+ Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+ Datasheet: http://www.national.com/pf/LM/LM85.html
+ * Analog Devices ADM1027
+ Prefix: 'adm1027'
+ Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+ Datasheet: http://www.analog.com/en/prod/0,,766_825_ADM1027,00.html
+ * Analog Devices ADT7463
+ Prefix: 'adt7463'
+ Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+ Datasheet: http://www.analog.com/en/prod/0,,766_825_ADT7463,00.html
+ * SMSC EMC6D100, SMSC EMC6D101
+ Prefix: 'emc6d100'
+ Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+ Datasheet: http://www.smsc.com/main/tools/discontinued/6d100.pdf
+ * SMSC EMC6D102
+ Prefix: 'emc6d102'
+ Addresses scanned: I2C 0x2c, 0x2d, 0x2e
+ Datasheet: http://www.smsc.com/main/catalog/emc6d102.html
+
+Authors:
+ Philip Pokorny <ppokorny@penguincomputing.com>,
+ Frodo Looijaard <frodol@dds.nl>,
+ Richard Barrington <rich_b_nz@clear.net.nz>,
+ Margit Schubert-While <margitsw@t-online.de>,
+ Justin Thiessen <jthiessen@penguincomputing.com>
+
+Description
+-----------
+
+This driver implements support for the National Semiconductor LM85 and
+compatible chips including the Analog Devices ADM1027, ADT7463 and
+SMSC EMC6D10x chips family.
+
+The LM85 uses the 2-wire interface compatible with the SMBUS 2.0
+specification. Using an analog to digital converter it measures three (3)
+temperatures and five (5) voltages. It has four (4) 16-bit counters for
+measuring fan speed. Five (5) digital inputs are provided for sampling the
+VID signals from the processor to the VRM. Lastly, there are three (3) PWM
+outputs that can be used to control fan speed.
+
+The voltage inputs have internal scaling resistors so that the following
+voltage can be measured without external resistors:
+
+ 2.5V, 3.3V, 5V, 12V, and CPU core voltage (2.25V)
+
+The temperatures measured are one internal diode, and two remote diodes.
+Remote 1 is generally the CPU temperature. These inputs are designed to
+measure a thermal diode like the one in a Pentium 4 processor in a socket
+423 or socket 478 package. They can also measure temperature using a
+transistor like the 2N3904.
+
+A sophisticated control system for the PWM outputs is designed into the
+LM85 that allows fan speed to be adjusted automatically based on any of the
+three temperature sensors. Each PWM output is individually adjustable and
+programmable. Once configured, the LM85 will adjust the PWM outputs in
+response to the measured temperatures without further host intervention.
+This feature can also be disabled for manual control of the PWM's.
+
+Each of the measured inputs (voltage, temperature, fan speed) has
+corresponding high/low limit values. The LM85 will signal an ALARM if any
+measured value exceeds either limit.
+
+The LM85 samples all inputs continuously. The lm85 driver will not read
+the registers more often than once a second. Further, configuration data is
+only read once each 5 minutes. There is twice as much config data as
+measurements, so this would seem to be a worthwhile optimization.
+
+Special Features
+----------------
+
+The LM85 has four fan speed monitoring modes. The ADM1027 has only two.
+Both have special circuitry to compensate for PWM interactions with the
+TACH signal from the fans. The ADM1027 can be configured to measure the
+speed of a two wire fan, but the input conditioning circuitry is different
+for 3-wire and 2-wire mode. For this reason, the 2-wire fan modes are not
+exposed to user control. The BIOS should initialize them to the correct
+mode. If you've designed your own ADM1027, you'll have to modify the
+init_client function and add an insmod parameter to set this up.
+
+To smooth the response of fans to changes in temperature, the LM85 has an
+optional filter for smoothing temperatures. The ADM1027 has the same
+config option but uses it to rate limit the changes to fan speed instead.
+
+The ADM1027 and ADT7463 have a 10-bit ADC and can therefore measure
+temperatures with 0.25 degC resolution. They also provide an offset to the
+temperature readings that is automatically applied during measurement.
+This offset can be used to zero out any errors due to traces and placement.
+The documentation says that the offset is in 0.25 degC steps, but in
+initial testing of the ADM1027 it was 1.00 degC steps. Analog Devices has
+confirmed this "bug". The ADT7463 is reported to work as described in the
+documentation. The current lm85 driver does not show the offset register.
+
+See the vendor datasheets for more information. There is application note
+from National (AN-1260) with some additional information about the LM85.
+The Analog Devices datasheet is very detailed and describes a procedure for
+determining an optimal configuration for the automatic PWM control.
+
+The SMSC EMC6D100 & EMC6D101 monitor external voltages, temperatures, and
+fan speeds. They use this monitoring capability to alert the system to out
+of limit conditions and can automatically control the speeds of multiple
+fans in a PC or embedded system. The EMC6D101, available in a 24-pin SSOP
+package, and the EMC6D100, available in a 28-pin SSOP package, are designed
+to be register compatible. The EMC6D100 offers all the features of the
+EMC6D101 plus additional voltage monitoring and system control features.
+Unfortunately it is not possible to distinguish between the package
+versions on register level so these additional voltage inputs may read
+zero. The EMC6D102 features addtional ADC bits thus extending precision
+of voltage and temperature channels.
+
+
+Hardware Configurations
+-----------------------
+
+The LM85 can be jumpered for 3 different SMBus addresses. There are
+no other hardware configuration options for the LM85.
+
+The lm85 driver detects both LM85B and LM85C revisions of the chip. See the
+datasheet for a complete description of the differences. Other than
+identifying the chip, the driver behaves no differently with regard to
+these two chips. The LM85B is recommended for new designs.
+
+The ADM1027 and ADT7463 chips have an optional SMBALERT output that can be
+used to signal the chipset in case a limit is exceeded or the temperature
+sensors fail. Individual sensor interrupts can be masked so they won't
+trigger SMBALERT. The SMBALERT output if configured replaces one of the other
+functions (PWM2 or IN0). This functionality is not implemented in current
+driver.
+
+The ADT7463 also has an optional THERM output/input which can be connected
+to the processor PROC_HOT output. If available, the autofan control
+dynamic Tmin feature can be enabled to keep the system temperature within
+spec (just?!) with the least possible fan noise.
+
+Configuration Notes
+-------------------
+
+Besides standard interfaces driver adds following:
+
+* Temperatures and Zones
+
+Each temperature sensor is associated with a Zone. There are three
+sensors and therefore three zones (# 1, 2 and 3). Each zone has the following
+temperature configuration points:
+
+* temp#_auto_temp_off - temperature below which fans should be off or spinning very low.
+* temp#_auto_temp_min - temperature over which fans start to spin.
+* temp#_auto_temp_max - temperature when fans spin at full speed.
+* temp#_auto_temp_crit - temperature when all fans will run full speed.
+
+* PWM Control
+
+There are three PWM outputs. The LM85 datasheet suggests that the
+pwm3 output control both fan3 and fan4. Each PWM can be individually
+configured and assigned to a zone for it's control value. Each PWM can be
+configured individually according to the following options.
+
+* pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off
+ temperature. (PWM value from 0 to 255)
+
+* pwm#_auto_pwm_minctl - this flags selects for temp#_auto_temp_off temperature
+ the bahaviour of fans. Write 1 to let fans spinning at
+ pwm#_auto_pwm_min or write 0 to let them off.
+
+NOTE: It has been reported that there is a bug in the LM85 that causes the flag
+to be associated with the zones not the PWMs. This contradicts all the
+published documentation. Setting pwm#_min_ctl in this case actually affects all
+PWMs controlled by zone '#'.
+
+* PWM Controlling Zone selection
+
+* pwm#_auto_channels - controls zone that is associated with PWM
+
+Configuration choices:
+
+ Value Meaning
+ ------ ------------------------------------------------
+ 1 Controlled by Zone 1
+ 2 Controlled by Zone 2
+ 3 Controlled by Zone 3
+ 23 Controlled by higher temp of Zone 2 or 3
+ 123 Controlled by highest temp of Zone 1, 2 or 3
+ 0 PWM always 0% (off)
+ -1 PWM always 100% (full on)
+ -2 Manual control (write to 'pwm#' to set)
+
+The National LM85's have two vendor specific configuration
+features. Tach. mode and Spinup Control. For more details on these,
+see the LM85 datasheet or Application Note AN-1260. These features
+are not currently supported by the lm85 driver.
+
+The Analog Devices ADM1027 has several vendor specific enhancements.
+The number of pulses-per-rev of the fans can be set, Tach monitoring
+can be optimized for PWM operation, and an offset can be applied to
+the temperatures to compensate for systemic errors in the
+measurements. These features are not currently supported by the lm85
+driver.
+
+In addition to the ADM1027 features, the ADT7463 also has Tmin control
+and THERM asserted counts. Automatic Tmin control acts to adjust the
+Tmin value to maintain the measured temperature sensor at a specified
+temperature. There isn't much documentation on this feature in the
+ADT7463 data sheet. This is not supported by current driver.
diff --git a/Documentation/hwmon/lm87 b/Documentation/hwmon/lm87
new file mode 100644
index 0000000..6b47b67
--- /dev/null
+++ b/Documentation/hwmon/lm87
@@ -0,0 +1,77 @@
+Kernel driver lm87
+==================
+
+Supported chips:
+ * National Semiconductor LM87
+ Prefix: 'lm87'
+ Addresses scanned: I2C 0x2c - 0x2e
+ Datasheet: http://www.national.com/pf/LM/LM87.html
+ * Analog Devices ADM1024
+ Prefix: 'adm1024'
+ Addresses scanned: I2C 0x2c - 0x2e
+ Datasheet: http://www.analog.com/en/prod/0,2877,ADM1024,00.html
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Mark Studebaker <mdsxyz123@yahoo.com>,
+ Stephen Rousset <stephen.rousset@rocketlogix.com>,
+ Dan Eaton <dan.eaton@rocketlogix.com>,
+ Jean Delvare <khali@linux-fr.org>,
+ Original 2.6 port Jeff Oliver
+
+Description
+-----------
+
+This driver implements support for the National Semiconductor LM87
+and the Analog Devices ADM1024.
+
+The LM87 implements up to three temperature sensors, up to two fan
+rotation speed sensors, up to seven voltage sensors, alarms, and some
+miscellaneous stuff. The ADM1024 is fully compatible.
+
+Temperatures are measured in degrees Celsius. Each input has a high
+and low alarm settings. A high limit produces an alarm when the value
+goes above it, and an alarm is also produced when the value goes below
+the low limit.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4 or 8) to give
+the readings more range or accuracy. Not all RPM values can accurately be
+represented, so some rounding is done. With a divider of 2, the lowest
+representable value is around 2600 RPM.
+
+Voltage sensors (also known as IN sensors) report their values in
+volts. An alarm is triggered if the voltage has crossed a programmable
+minimum or maximum limit. Note that minimum in this case always means
+'closest to zero'; this is important for negative voltage measurements.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may
+already have disappeared! Note that in the current implementation, all
+hardware registers are read whenever any data is read (unless it is less
+than 1.0 seconds since the last update). This means that you can easily
+miss once-only alarms.
+
+The lm87 driver only updates its values each 1.0 seconds; reading it more
+often will do no harm, but will return 'old' values.
+
+
+Hardware Configurations
+-----------------------
+
+The LM87 has four pins which can serve one of two possible functions,
+depending on the hardware configuration.
+
+Some functions share pins, so not all functions are available at the same
+time. Which are depends on the hardware setup. This driver normally
+assumes that firmware configured the chip correctly. Where this is not
+the case, platform code must set the I2C client's platform_data to point
+to a u8 value to be written to the channel register.
+
+For reference, here is the list of exclusive functions:
+ - in0+in5 (default) or temp3
+ - fan1 (default) or in6
+ - fan2 (default) or in7
+ - VID lines (default) or IRQ lines (not handled by this driver)
diff --git a/Documentation/hwmon/lm90 b/Documentation/hwmon/lm90
new file mode 100644
index 0000000..0e84117
--- /dev/null
+++ b/Documentation/hwmon/lm90
@@ -0,0 +1,189 @@
+Kernel driver lm90
+==================
+
+Supported chips:
+ * National Semiconductor LM90
+ Prefix: 'lm90'
+ Addresses scanned: I2C 0x4c
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/pf/LM/LM90.html
+ * National Semiconductor LM89
+ Prefix: 'lm89' (no auto-detection)
+ Addresses scanned: I2C 0x4c and 0x4d
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/mpf/LM/LM89.html
+ * National Semiconductor LM99
+ Prefix: 'lm99'
+ Addresses scanned: I2C 0x4c and 0x4d
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/pf/LM/LM99.html
+ * National Semiconductor LM86
+ Prefix: 'lm86'
+ Addresses scanned: I2C 0x4c
+ Datasheet: Publicly available at the National Semiconductor website
+ http://www.national.com/mpf/LM/LM86.html
+ * Analog Devices ADM1032
+ Prefix: 'adm1032'
+ Addresses scanned: I2C 0x4c and 0x4d
+ Datasheet: Publicly available at the ON Semiconductor website
+ http://www.onsemi.com/PowerSolutions/product.do?id=ADM1032
+ * Analog Devices ADT7461
+ Prefix: 'adt7461'
+ Addresses scanned: I2C 0x4c and 0x4d
+ Datasheet: Publicly available at the ON Semiconductor website
+ http://www.onsemi.com/PowerSolutions/product.do?id=ADT7461
+ * Maxim MAX6646
+ Prefix: 'max6646'
+ Addresses scanned: I2C 0x4d
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3497
+ * Maxim MAX6647
+ Prefix: 'max6646'
+ Addresses scanned: I2C 0x4e
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3497
+ * Maxim MAX6649
+ Prefix: 'max6646'
+ Addresses scanned: I2C 0x4c
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3497
+ * Maxim MAX6657
+ Prefix: 'max6657'
+ Addresses scanned: I2C 0x4c
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/quick_view2.cfm/qv_pk/2578
+ * Maxim MAX6658
+ Prefix: 'max6657'
+ Addresses scanned: I2C 0x4c
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/quick_view2.cfm/qv_pk/2578
+ * Maxim MAX6659
+ Prefix: 'max6657'
+ Addresses scanned: I2C 0x4c, 0x4d (unsupported 0x4e)
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/quick_view2.cfm/qv_pk/2578
+ * Maxim MAX6680
+ Prefix: 'max6680'
+ Addresses scanned: I2C 0x18, 0x19, 0x1a, 0x29, 0x2a, 0x2b,
+ 0x4c, 0x4d and 0x4e
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3370
+ * Maxim MAX6681
+ Prefix: 'max6680'
+ Addresses scanned: I2C 0x18, 0x19, 0x1a, 0x29, 0x2a, 0x2b,
+ 0x4c, 0x4d and 0x4e
+ Datasheet: Publicly available at the Maxim website
+ http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3370
+
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+
+Description
+-----------
+
+The LM90 is a digital temperature sensor. It senses its own temperature as
+well as the temperature of up to one external diode. It is compatible
+with many other devices, many of which are supported by this driver.
+
+Note that there is no easy way to differentiate between the MAX6657,
+MAX6658 and MAX6659 variants. The extra address and features of the
+MAX6659 are not supported by this driver. The MAX6680 and MAX6681 only
+differ in their pinout, therefore they obviously can't (and don't need to)
+be distinguished.
+
+The specificity of this family of chipsets over the ADM1021/LM84
+family is that it features critical limits with hysteresis, and an
+increased resolution of the remote temperature measurement.
+
+The different chipsets of the family are not strictly identical, although
+very similar. For reference, here comes a non-exhaustive list of specific
+features:
+
+LM90:
+ * Filter and alert configuration register at 0xBF.
+ * ALERT is triggered by temperatures over critical limits.
+
+LM86 and LM89:
+ * Same as LM90
+ * Better external channel accuracy
+
+LM99:
+ * Same as LM89
+ * External temperature shifted by 16 degrees down
+
+ADM1032:
+ * Consecutive alert register at 0x22.
+ * Conversion averaging.
+ * Up to 64 conversions/s.
+ * ALERT is triggered by open remote sensor.
+ * SMBus PEC support for Write Byte and Receive Byte transactions.
+
+ADT7461:
+ * Extended temperature range (breaks compatibility)
+ * Lower resolution for remote temperature
+
+MAX6657 and MAX6658:
+ * Better local resolution
+ * Remote sensor type selection
+
+MAX6659:
+ * Better local resolution
+ * Selectable address
+ * Second critical temperature limit
+ * Remote sensor type selection
+
+MAX6680 and MAX6681:
+ * Selectable address
+ * Remote sensor type selection
+
+All temperature values are given in degrees Celsius. Resolution
+is 1.0 degree for the local temperature, 0.125 degree for the remote
+temperature, except for the MAX6657, MAX6658 and MAX6659 which have a
+resolution of 0.125 degree for both temperatures.
+
+Each sensor has its own high and low limits, plus a critical limit.
+Additionally, there is a relative hysteresis value common to both critical
+values. To make life easier to user-space applications, two absolute values
+are exported, one for each channel, but these values are of course linked.
+Only the local hysteresis can be set from user-space, and the same delta
+applies to the remote hysteresis.
+
+The lm90 driver will not update its values more frequently than every
+other second; reading them more often will do no harm, but will return
+'old' values.
+
+PEC Support
+-----------
+
+The ADM1032 is the only chip of the family which supports PEC. It does
+not support PEC on all transactions though, so some care must be taken.
+
+When reading a register value, the PEC byte is computed and sent by the
+ADM1032 chip. However, in the case of a combined transaction (SMBus Read
+Byte), the ADM1032 computes the CRC value over only the second half of
+the message rather than its entirety, because it thinks the first half
+of the message belongs to a different transaction. As a result, the CRC
+value differs from what the SMBus master expects, and all reads fail.
+
+For this reason, the lm90 driver will enable PEC for the ADM1032 only if
+the bus supports the SMBus Send Byte and Receive Byte transaction types.
+These transactions will be used to read register values, instead of
+SMBus Read Byte, and PEC will work properly.
+
+Additionally, the ADM1032 doesn't support SMBus Send Byte with PEC.
+Instead, it will try to write the PEC value to the register (because the
+SMBus Send Byte transaction with PEC is similar to a Write Byte transaction
+without PEC), which is not what we want. Thus, PEC is explicitly disabled
+on SMBus Send Byte transactions in the lm90 driver.
+
+PEC on byte data transactions represents a significant increase in bandwidth
+usage (+33% for writes, +25% for reads) in normal conditions. With the need
+to use two SMBus transaction for reads, this overhead jumps to +50%. Worse,
+two transactions will typically mean twice as much delay waiting for
+transaction completion, effectively doubling the register cache refresh time.
+I guess reliability comes at a price, but it's quite expensive this time.
+
+So, as not everyone might enjoy the slowdown, PEC can be disabled through
+sysfs. Just write 0 to the "pec" file and PEC will be disabled. Write 1
+to that file to enable PEC again.
diff --git a/Documentation/hwmon/lm92 b/Documentation/hwmon/lm92
new file mode 100644
index 0000000..7705bfa
--- /dev/null
+++ b/Documentation/hwmon/lm92
@@ -0,0 +1,37 @@
+Kernel driver lm92
+==================
+
+Supported chips:
+ * National Semiconductor LM92
+ Prefix: 'lm92'
+ Addresses scanned: I2C 0x48 - 0x4b
+ Datasheet: http://www.national.com/pf/LM/LM92.html
+ * National Semiconductor LM76
+ Prefix: 'lm92'
+ Addresses scanned: none, force parameter needed
+ Datasheet: http://www.national.com/pf/LM/LM76.html
+ * Maxim MAX6633/MAX6634/MAX6635
+ Prefix: 'lm92'
+ Addresses scanned: I2C 0x48 - 0x4b
+ MAX6633 with address in 0x40 - 0x47, 0x4c - 0x4f needs force parameter
+ and MAX6634 with address in 0x4c - 0x4f needs force parameter
+ Datasheet: http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3074
+
+Authors:
+ Abraham van der Merwe <abraham@2d3d.co.za>
+ Jean Delvare <khali@linux-fr.org>
+
+
+Description
+-----------
+
+This driver implements support for the National Semiconductor LM92
+temperature sensor.
+
+Each LM92 temperature sensor supports a single temperature sensor. There are
+alarms for high, low, and critical thresholds. There's also an hysteresis to
+control the thresholds for resetting alarms.
+
+Support was added later for the LM76 and Maxim MAX6633/MAX6634/MAX6635,
+which are mostly compatible. They have not all been tested, so you
+may need to use the force parameter.
diff --git a/Documentation/hwmon/lm93 b/Documentation/hwmon/lm93
new file mode 100644
index 0000000..ac711f3
--- /dev/null
+++ b/Documentation/hwmon/lm93
@@ -0,0 +1,302 @@
+Kernel driver lm93
+==================
+
+Supported chips:
+ * National Semiconductor LM93
+ Prefix 'lm93'
+ Addresses scanned: I2C 0x2c-0x2e
+ Datasheet: http://www.national.com/ds.cgi/LM/LM93.pdf
+
+Authors:
+ Mark M. Hoffman <mhoffman@lightlink.com>
+ Ported to 2.6 by Eric J. Bowersox <ericb@aspsys.com>
+ Adapted to 2.6.20 by Carsten Emde <ce@osadl.org>
+ Modified for mainline integration by Hans J. Koch <hjk@linutronix.de>
+
+Module Parameters
+-----------------
+
+* init: integer
+ Set to non-zero to force some initializations (default is 0).
+* disable_block: integer
+ A "0" allows SMBus block data transactions if the host supports them. A "1"
+ disables SMBus block data transactions. The default is 0.
+* vccp_limit_type: integer array (2)
+ Configures in7 and in8 limit type, where 0 means absolute and non-zero
+ means relative. "Relative" here refers to "Dynamic Vccp Monitoring using
+ VID" from the datasheet. It greatly simplifies the interface to allow
+ only one set of limits (absolute or relative) to be in operation at a
+ time (even though the hardware is capable of enabling both). There's
+ not a compelling use case for enabling both at once, anyway. The default
+ is "0,0".
+* vid_agtl: integer
+ A "0" configures the VID pins for V(ih) = 2.1V min, V(il) = 0.8V max.
+ A "1" configures the VID pins for V(ih) = 0.8V min, V(il) = 0.4V max.
+ (The latter setting is referred to as AGTL+ Compatible in the datasheet.)
+ I.e. this parameter controls the VID pin input thresholds; if your VID
+ inputs are not working, try changing this. The default value is "0".
+
+
+Hardware Description
+--------------------
+
+(from the datasheet)
+
+The LM93 hardware monitor has a two wire digital interface compatible with
+SMBus 2.0. Using an 8-bit ADC, the LM93 measures the temperature of two remote
+diode connected transistors as well as its own die and 16 power supply
+voltages. To set fan speed, the LM93 has two PWM outputs that are each
+controlled by up to four temperature zones. The fancontrol algorithm is lookup
+table based. The LM93 includes a digital filter that can be invoked to smooth
+temperature readings for better control of fan speed. The LM93 has four
+tachometer inputs to measure fan speed. Limit and status registers for all
+measured values are included. The LM93 builds upon the functionality of
+previous motherboard management ASICs and uses some of the LM85's features
+(i.e. smart tachometer mode). It also adds measurement and control support
+for dynamic Vccp monitoring and PROCHOT. It is designed to monitor a dual
+processor Xeon class motherboard with a minimum of external components.
+
+
+User Interface
+--------------
+
+#PROCHOT:
+
+The LM93 can monitor two #PROCHOT signals. The results are found in the
+sysfs files prochot1, prochot2, prochot1_avg, prochot2_avg, prochot1_max,
+and prochot2_max. prochot1_max and prochot2_max contain the user limits
+for #PROCHOT1 and #PROCHOT2, respectively. prochot1 and prochot2 contain
+the current readings for the most recent complete time interval. The
+value of prochot1_avg and prochot2_avg is something like a 2 period
+exponential moving average (but not quite - check the datasheet). Note
+that this third value is calculated by the chip itself. All values range
+from 0-255 where 0 indicates no throttling, and 255 indicates > 99.6%.
+
+The monitoring intervals for the two #PROCHOT signals is also configurable.
+These intervals can be found in the sysfs files prochot1_interval and
+prochot2_interval. The values in these files specify the intervals for
+#P1_PROCHOT and #P2_PROCHOT, respectively. Selecting a value not in this
+list will cause the driver to use the next largest interval. The available
+intervals are (in seconds):
+
+#PROCHOT intervals: 0.73, 1.46, 2.9, 5.8, 11.7, 23.3, 46.6, 93.2, 186, 372
+
+It is possible to configure the LM93 to logically short the two #PROCHOT
+signals. I.e. when #P1_PROCHOT is asserted, the LM93 will automatically
+assert #P2_PROCHOT, and vice-versa. This mode is enabled by writing a
+non-zero integer to the sysfs file prochot_short.
+
+The LM93 can also override the #PROCHOT pins by driving a PWM signal onto
+one or both of them. When overridden, the signal has a period of 3.56 ms,
+a minimum pulse width of 5 clocks (at 22.5kHz => 6.25% duty cycle), and
+a maximum pulse width of 80 clocks (at 22.5kHz => 99.88% duty cycle).
+
+The sysfs files prochot1_override and prochot2_override contain boolean
+integers which enable or disable the override function for #P1_PROCHOT and
+#P2_PROCHOT, respectively. The sysfs file prochot_override_duty_cycle
+contains a value controlling the duty cycle for the PWM signal used when
+the override function is enabled. This value ranges from 0 to 15, with 0
+indicating minimum duty cycle and 15 indicating maximum.
+
+#VRD_HOT:
+
+The LM93 can monitor two #VRD_HOT signals. The results are found in the
+sysfs files vrdhot1 and vrdhot2. There is one value per file: a boolean for
+which 1 indicates #VRD_HOT is asserted and 0 indicates it is negated. These
+files are read-only.
+
+Smart Tach Mode:
+
+(from the datasheet)
+
+ If a fan is driven using a low-side drive PWM, the tachometer
+ output of the fan is corrupted. The LM93 includes smart tachometer
+ circuitry that allows an accurate tachometer reading to be
+ achieved despite the signal corruption. In smart tach mode all
+ four signals are measured within 4 seconds.
+
+Smart tach mode is enabled by the driver by writing 1 or 2 (associating the
+the fan tachometer with a pwm) to the sysfs file fan<n>_smart_tach. A zero
+will disable the function for that fan. Note that Smart tach mode cannot be
+enabled if the PWM output frequency is 22500 Hz (see below).
+
+Manual PWM:
+
+The LM93 has a fixed or override mode for the two PWM outputs (although, there
+are still some conditions that will override even this mode - see section
+15.10.6 of the datasheet for details.) The sysfs files pwm1_override
+and pwm2_override are used to enable this mode; each is a boolean integer
+where 0 disables and 1 enables the manual control mode. The sysfs files pwm1
+and pwm2 are used to set the manual duty cycle; each is an integer (0-255)
+where 0 is 0% duty cycle, and 255 is 100%. Note that the duty cycle values
+are constrained by the hardware. Selecting a value which is not available
+will cause the driver to use the next largest value. Also note: when manual
+PWM mode is disabled, the value of pwm1 and pwm2 indicates the current duty
+cycle chosen by the h/w.
+
+PWM Output Frequency:
+
+The LM93 supports several different frequencies for the PWM output channels.
+The sysfs files pwm1_freq and pwm2_freq are used to select the frequency. The
+frequency values are constrained by the hardware. Selecting a value which is
+not available will cause the driver to use the next largest value. Also note
+that this parameter has implications for the Smart Tach Mode (see above).
+
+PWM Output Frequencies (in Hz): 12, 36, 48, 60, 72, 84, 96, 22500 (default)
+
+Automatic PWM:
+
+The LM93 is capable of complex automatic fan control, with many different
+points of configuration. To start, each PWM output can be bound to any
+combination of eight control sources. The final PWM is the largest of all
+individual control sources to which the PWM output is bound.
+
+The eight control sources are: temp1-temp4 (aka "zones" in the datasheet),
+#PROCHOT 1 & 2, and #VRDHOT 1 & 2. The bindings are expressed as a bitmask
+in the sysfs files pwm<n>_auto_channels, where a "1" enables the binding, and
+a "0" disables it. The h/w default is 0x0f (all temperatures bound).
+
+ 0x01 - Temp 1
+ 0x02 - Temp 2
+ 0x04 - Temp 3
+ 0x08 - Temp 4
+ 0x10 - #PROCHOT 1
+ 0x20 - #PROCHOT 2
+ 0x40 - #VRDHOT 1
+ 0x80 - #VRDHOT 2
+
+The function y = f(x) takes a source temperature x to a PWM output y. This
+function of the LM93 is derived from a base temperature and a table of 12
+temperature offsets. The base temperature is expressed in degrees C in the
+sysfs files temp<n>_auto_base. The offsets are expressed in cumulative
+degrees C, with the value of offset <i> for temperature value <n> being
+contained in the file temp<n>_auto_offset<i>. E.g. if the base temperature
+is 40C:
+
+ offset # temp<n>_auto_offset<i> range pwm
+ 1 0 - 25.00%
+ 2 0 - 28.57%
+ 3 1 40C - 41C 32.14%
+ 4 1 41C - 42C 35.71%
+ 5 2 42C - 44C 39.29%
+ 6 2 44C - 46C 42.86%
+ 7 2 48C - 50C 46.43%
+ 8 2 50C - 52C 50.00%
+ 9 2 52C - 54C 53.57%
+ 10 2 54C - 56C 57.14%
+ 11 2 56C - 58C 71.43%
+ 12 2 58C - 60C 85.71%
+ > 60C 100.00%
+
+Valid offsets are in the range 0C <= x <= 7.5C in 0.5C increments.
+
+There is an independent base temperature for each temperature channel. Note,
+however, there are only two tables of offsets: one each for temp[12] and
+temp[34]. Therefore, any change to e.g. temp1_auto_offset<i> will also
+affect temp2_auto_offset<i>.
+
+The LM93 can also apply hysteresis to the offset table, to prevent unwanted
+oscillation between two steps in the offsets table. These values are found in
+the sysfs files temp<n>_auto_offset_hyst. The value in this file has the
+same representation as in temp<n>_auto_offset<i>.
+
+If a temperature reading falls below the base value for that channel, the LM93
+will use the minimum PWM value. These values are found in the sysfs files
+temp<n>_auto_pwm_min. Note, there are only two minimums: one each for temp[12]
+and temp[34]. Therefore, any change to e.g. temp1_auto_pwm_min will also
+affect temp2_auto_pwm_min.
+
+PWM Spin-Up Cycle:
+
+A spin-up cycle occurs when a PWM output is commanded from 0% duty cycle to
+some value > 0%. The LM93 supports a minimum duty cycle during spin-up. These
+values are found in the sysfs files pwm<n>_auto_spinup_min. The value in this
+file has the same representation as other PWM duty cycle values. The
+duration of the spin-up cycle is also configurable. These values are found in
+the sysfs files pwm<n>_auto_spinup_time. The value in this file is
+the spin-up time in seconds. The available spin-up times are constrained by
+the hardware. Selecting a value which is not available will cause the driver
+to use the next largest value.
+
+Spin-up Durations: 0 (disabled, h/w default), 0.1, 0.25, 0.4, 0.7, 1.0,
+ 2.0, 4.0
+
+#PROCHOT and #VRDHOT PWM Ramping:
+
+If the #PROCHOT or #VRDHOT signals are asserted while bound to a PWM output
+channel, the LM93 will ramp the PWM output up to 100% duty cycle in discrete
+steps. The duration of each step is configurable. There are two files, with
+one value each in seconds: pwm_auto_prochot_ramp and pwm_auto_vrdhot_ramp.
+The available ramp times are constrained by the hardware. Selecting a value
+which is not available will cause the driver to use the next largest value.
+
+Ramp Times: 0 (disabled, h/w default) to 0.75 in 0.05 second intervals
+
+Fan Boost:
+
+For each temperature channel, there is a boost temperature: if the channel
+exceeds this limit, the LM93 will immediately drive both PWM outputs to 100%.
+This limit is expressed in degrees C in the sysfs files temp<n>_auto_boost.
+There is also a hysteresis temperature for this function: after the boost
+limit is reached, the temperature channel must drop below this value before
+the boost function is disabled. This temperature is also expressed in degrees
+C in the sysfs files temp<n>_auto_boost_hyst.
+
+GPIO Pins:
+
+The LM93 can monitor the logic level of four dedicated GPIO pins as well as the
+four tach input pins. GPIO0-GPIO3 correspond to (fan) tach 1-4, respectively.
+All eight GPIOs are read by reading the bitmask in the sysfs file gpio. The
+LSB is GPIO0, and the MSB is GPIO7.
+
+
+LM93 Unique sysfs Files
+-----------------------
+
+ file description
+ -------------------------------------------------------------
+
+ prochot<n> current #PROCHOT %
+
+ prochot<n>_avg moving average #PROCHOT %
+
+ prochot<n>_max limit #PROCHOT %
+
+ prochot_short enable or disable logical #PROCHOT pin short
+
+ prochot<n>_override force #PROCHOT assertion as PWM
+
+ prochot_override_duty_cycle
+ duty cycle for the PWM signal used when
+ #PROCHOT is overridden
+
+ prochot<n>_interval #PROCHOT PWM sampling interval
+
+ vrdhot<n> 0 means negated, 1 means asserted
+
+ fan<n>_smart_tach enable or disable smart tach mode
+
+ pwm<n>_auto_channels select control sources for PWM outputs
+
+ pwm<n>_auto_spinup_min minimum duty cycle during spin-up
+
+ pwm<n>_auto_spinup_time duration of spin-up
+
+ pwm_auto_prochot_ramp ramp time per step when #PROCHOT asserted
+
+ pwm_auto_vrdhot_ramp ramp time per step when #VRDHOT asserted
+
+ temp<n>_auto_base temperature channel base
+
+ temp<n>_auto_offset[1-12]
+ temperature channel offsets
+
+ temp<n>_auto_offset_hyst
+ temperature channel offset hysteresis
+
+ temp<n>_auto_boost temperature channel boost (PWMs to 100%) limit
+
+ temp<n>_auto_boost_hyst temperature channel boost hysteresis
+
+ gpio input state of 8 GPIO pins; read-only
+
diff --git a/Documentation/hwmon/max1619 b/Documentation/hwmon/max1619
new file mode 100644
index 0000000..d6f8d9c
--- /dev/null
+++ b/Documentation/hwmon/max1619
@@ -0,0 +1,29 @@
+Kernel driver max1619
+=====================
+
+Supported chips:
+ * Maxim MAX1619
+ Prefix: 'max1619'
+ Addresses scanned: I2C 0x18-0x1a, 0x29-0x2b, 0x4c-0x4e
+ Datasheet: Publicly available at the Maxim website
+ http://pdfserv.maxim-ic.com/en/ds/MAX1619.pdf
+
+Authors:
+ Alexey Fisher <fishor@mail.ru>,
+ Jean Delvare <khali@linux-fr.org>
+
+Description
+-----------
+
+The MAX1619 is a digital temperature sensor. It senses its own temperature as
+well as the temperature of up to one external diode.
+
+All temperature values are given in degrees Celsius. Resolution
+is 1.0 degree for the local temperature and for the remote temperature.
+
+Only the external sensor has high and low limits.
+
+The max1619 driver will not update its values more frequently than every
+other second; reading them more often will do no harm, but will return
+'old' values.
+
diff --git a/Documentation/hwmon/max6650 b/Documentation/hwmon/max6650
new file mode 100644
index 0000000..8be7beb
--- /dev/null
+++ b/Documentation/hwmon/max6650
@@ -0,0 +1,53 @@
+Kernel driver max6650
+=====================
+
+Supported chips:
+ * Maxim 6650 / 6651
+ Prefix: 'max6650'
+ Addresses scanned: I2C 0x1b, 0x1f, 0x48, 0x4b
+ Datasheet: http://pdfserv.maxim-ic.com/en/ds/MAX6650-MAX6651.pdf
+
+Authors:
+ Hans J. Koch <hjk@linutronix.de>
+ John Morris <john.morris@spirentcom.com>
+ Claus Gindhart <claus.gindhart@kontron.com>
+
+Description
+-----------
+
+This driver implements support for the Maxim 6650/6651
+
+The 2 devices are very similar, but the Maxim 6550 has a reduced feature
+set, e.g. only one fan-input, instead of 4 for the 6651.
+
+The driver is not able to distinguish between the 2 devices.
+
+The driver provides the following sensor accesses in sysfs:
+
+fan1_input ro fan tachometer speed in RPM
+fan2_input ro "
+fan3_input ro "
+fan4_input ro "
+fan1_target rw desired fan speed in RPM (closed loop mode only)
+pwm1_enable rw regulator mode, 0=full on, 1=open loop, 2=closed loop
+pwm1 rw relative speed (0-255), 255=max. speed.
+ Used in open loop mode only.
+fan1_div rw sets the speed range the inputs can handle. Legal
+ values are 1, 2, 4, and 8. Use lower values for
+ faster fans.
+
+Module parameters
+-----------------
+
+If your board has a BIOS that initializes the MAX6650/6651 correctly, you can
+simply load your module without parameters. It won't touch the configuration
+registers then. If your board BIOS doesn't initialize the chip, or you want
+different settings, you can set the following parameters:
+
+voltage_12V: 5=5V fan, 12=12V fan, 0=don't change
+prescaler: Possible values are 1,2,4,8,16, or 0 for don't change
+clock: The clock frequency in Hz of the chip the driver should assume [254000]
+
+Please have a look at the MAX6650/6651 data sheet and make sure that you fully
+understand the meaning of these parameters before you attempt to change them.
+
diff --git a/Documentation/hwmon/pc87360 b/Documentation/hwmon/pc87360
new file mode 100644
index 0000000..cbac32b
--- /dev/null
+++ b/Documentation/hwmon/pc87360
@@ -0,0 +1,184 @@
+Kernel driver pc87360
+=====================
+
+Supported chips:
+ * National Semiconductor PC87360, PC87363, PC87364, PC87365 and PC87366
+ Prefixes: 'pc87360', 'pc87363', 'pc87364', 'pc87365', 'pc87366'
+ Addresses scanned: none, address read from Super I/O config space
+ Datasheets: No longer available
+
+Authors: Jean Delvare <khali@linux-fr.org>
+
+Thanks to Sandeep Mehta, Tonko de Rooy and Daniel Ceregatti for testing.
+Thanks to Rudolf Marek for helping me investigate conversion issues.
+
+
+Module Parameters
+-----------------
+
+* init int
+ Chip initialization level:
+ 0: None
+ *1: Forcibly enable internal voltage and temperature channels, except in9
+ 2: Forcibly enable all voltage and temperature channels, except in9
+ 3: Forcibly enable all voltage and temperature channels, including in9
+
+Note that this parameter has no effect for the PC87360, PC87363 and PC87364
+chips.
+
+Also note that for the PC87366, initialization levels 2 and 3 don't enable
+all temperature channels, because some of them share pins with each other,
+so they can't be used at the same time.
+
+
+Description
+-----------
+
+The National Semiconductor PC87360 Super I/O chip contains monitoring and
+PWM control circuitry for two fans. The PC87363 chip is similar, and the
+PC87364 chip has monitoring and PWM control for a third fan.
+
+The National Semiconductor PC87365 and PC87366 Super I/O chips are complete
+hardware monitoring chipsets, not only controlling and monitoring three fans,
+but also monitoring eleven voltage inputs and two (PC87365) or up to four
+(PC87366) temperatures.
+
+ Chip #vin #fan #pwm #temp devid
+
+ PC87360 - 2 2 - 0xE1
+ PC87363 - 2 2 - 0xE8
+ PC87364 - 3 3 - 0xE4
+ PC87365 11 3 3 2 0xE5
+ PC87366 11 3 3 3-4 0xE9
+
+The driver assumes that no more than one chip is present, and one of the
+standard Super I/O addresses is used (0x2E/0x2F or 0x4E/0x4F)
+
+Fan Monitoring
+--------------
+
+Fan rotation speeds are reported in RPM (revolutions per minute). An alarm
+is triggered if the rotation speed has dropped below a programmable limit.
+A different alarm is triggered if the fan speed is too low to be measured.
+
+Fan readings are affected by a programmable clock divider, giving the
+readings more range or accuracy. Usually, users have to learn how it works,
+but this driver implements dynamic clock divider selection, so you don't
+have to care no more.
+
+For reference, here are a few values about clock dividers:
+
+ slowest accuracy highest
+ measurable around 3000 accurate
+ divider speed (RPM) RPM (RPM) speed (RPM)
+ 1 1882 18 6928
+ 2 941 37 4898
+ 4 470 74 3464
+ 8 235 150 2449
+
+For the curious, here is how the values above were computed:
+ * slowest measurable speed: clock/(255*divider)
+ * accuracy around 3000 RPM: 3000^2/clock
+ * highest accurate speed: sqrt(clock*100)
+The clock speed for the PC87360 family is 480 kHz. I arbitrarily chose 100
+RPM as the lowest acceptable accuracy.
+
+As mentioned above, you don't have to care about this no more.
+
+Note that not all RPM values can be represented, even when the best clock
+divider is selected. This is not only true for the measured speeds, but
+also for the programmable low limits, so don't be surprised if you try to
+set, say, fan1_min to 2900 and it finally reads 2909.
+
+
+Fan Control
+-----------
+
+PWM (pulse width modulation) values range from 0 to 255, with 0 meaning
+that the fan is stopped, and 255 meaning that the fan goes at full speed.
+
+Be extremely careful when changing PWM values. Low PWM values, even
+non-zero, can stop the fan, which may cause irreversible damage to your
+hardware if temperature increases too much. When changing PWM values, go
+step by step and keep an eye on temperatures.
+
+One user reported problems with PWM. Changing PWM values would break fan
+speed readings. No explanation nor fix could be found.
+
+
+Temperature Monitoring
+----------------------
+
+Temperatures are reported in degrees Celsius. Each temperature measured has
+associated low, high and overtemperature limits, each of which triggers an
+alarm when crossed.
+
+The first two temperature channels are external. The third one (PC87366
+only) is internal.
+
+The PC87366 has three additional temperature channels, based on
+thermistors (as opposed to thermal diodes for the first three temperature
+channels). For technical reasons, these channels are held by the VLM
+(voltage level monitor) logical device, not the TMS (temperature
+measurement) one. As a consequence, these temperatures are exported as
+voltages, and converted into temperatures in user-space.
+
+Note that these three additional channels share their pins with the
+external thermal diode channels, so you (physically) can't use them all at
+the same time. Although it should be possible to mix the two sensor types,
+the documents from National Semiconductor suggest that motherboard
+manufacturers should choose one type and stick to it. So you will more
+likely have either channels 1 to 3 (thermal diodes) or 3 to 6 (internal
+thermal diode, and thermistors).
+
+
+Voltage Monitoring
+------------------
+
+Voltages are reported relatively to a reference voltage, either internal or
+external. Some of them (in7:Vsb, in8:Vdd and in10:AVdd) are divided by two
+internally, you will have to compensate in sensors.conf. Others (in0 to in6)
+are likely to be divided externally. The meaning of each of these inputs as
+well as the values of the resistors used for division is left to the
+motherboard manufacturers, so you will have to document yourself and edit
+sensors.conf accordingly. National Semiconductor has a document with
+recommended resistor values for some voltages, but this still leaves much
+room for per motherboard specificities, unfortunately. Even worse,
+motherboard manufacturers don't seem to care about National Semiconductor's
+recommendations.
+
+Each voltage measured has associated low and high limits, each of which
+triggers an alarm when crossed.
+
+When available, VID inputs are used to provide the nominal CPU Core voltage.
+The driver will default to VRM 9.0, but this can be changed from user-space.
+The chipsets can handle two sets of VID inputs (on dual-CPU systems), but
+the driver will only export one for now. This may change later if there is
+a need.
+
+
+General Remarks
+---------------
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may already
+have disappeared! Note that all hardware registers are read whenever any
+data is read (unless it is less than 2 seconds since the last update, in
+which case cached values are returned instead). As a consequence, when
+a once-only alarm triggers, it may take 2 seconds for it to show, and 2
+more seconds for it to disappear.
+
+Monitoring of in9 isn't enabled at lower init levels (<3) because that
+channel measures the battery voltage (Vbat). It is a known fact that
+repeatedly sampling the battery voltage reduces its lifetime. National
+Semiconductor smartly designed their chipset so that in9 is sampled only
+once every 1024 sampling cycles (that is every 34 minutes at the default
+sampling rate), so the effect is attenuated, but still present.
+
+
+Limitations
+-----------
+
+The datasheets suggests that some values (fan mins, fan dividers)
+shouldn't be changed once the monitoring has started, but we ignore that
+recommendation. We'll reconsider if it actually causes trouble.
diff --git a/Documentation/hwmon/pc87427 b/Documentation/hwmon/pc87427
new file mode 100644
index 0000000..d1ebbe5
--- /dev/null
+++ b/Documentation/hwmon/pc87427
@@ -0,0 +1,38 @@
+Kernel driver pc87427
+=====================
+
+Supported chips:
+ * National Semiconductor PC87427
+ Prefix: 'pc87427'
+ Addresses scanned: none, address read from Super I/O config space
+ Datasheet: No longer available
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+Thanks to Amir Habibi at Candelis for setting up a test system, and to
+Michael Kress for testing several iterations of this driver.
+
+
+Description
+-----------
+
+The National Semiconductor Super I/O chip includes complete hardware
+monitoring capabilities. It can monitor up to 18 voltages, 8 fans and
+6 temperature sensors. Only the fans are supported at the moment.
+
+This chip also has fan controlling features, which are not yet supported
+by this driver either.
+
+The driver assumes that no more than one chip is present, which seems
+reasonable.
+
+
+Fan Monitoring
+--------------
+
+Fan rotation speeds are reported as 14-bit values from a gated clock
+signal. Speeds down to 83 RPM can be measured.
+
+An alarm is triggered if the rotation speed drops below a programmable
+limit. Another alarm is triggered if the speed is too low to to be measured
+(including stalled or missing fan).
diff --git a/Documentation/hwmon/pmbus b/Documentation/hwmon/pmbus
new file mode 100644
index 0000000..bc342af
--- /dev/null
+++ b/Documentation/hwmon/pmbus
@@ -0,0 +1,214 @@
+Kernel driver pmbus
+====================
+
+Supported chips:
+ * Ericsson BMR453, BMR454
+ Prefixes: 'bmr453', 'bmr454'
+ Addresses scanned: -
+ Datasheet:
+ http://archive.ericsson.net/service/internet/picov/get?DocNo=28701-EN/LZT146395
+ * ON Semiconductor ADP4000, NCP4200, NCP4208
+ Prefixes: 'adp4000', 'ncp4200', 'ncp4208'
+ Addresses scanned: -
+ Datasheets:
+ http://www.onsemi.com/pub_link/Collateral/ADP4000-D.PDF
+ http://www.onsemi.com/pub_link/Collateral/NCP4200-D.PDF
+ http://www.onsemi.com/pub_link/Collateral/JUNE%202009-%20REV.%200.PDF
+ * Lineage Power
+ Prefixes: 'mdt040', 'pdt003', 'pdt006', 'pdt012', 'udt020'
+ Addresses scanned: -
+ Datasheets:
+ http://www.lineagepower.com/oem/pdf/PDT003A0X.pdf
+ http://www.lineagepower.com/oem/pdf/PDT006A0X.pdf
+ http://www.lineagepower.com/oem/pdf/PDT012A0X.pdf
+ http://www.lineagepower.com/oem/pdf/UDT020A0X.pdf
+ http://www.lineagepower.com/oem/pdf/MDT040A0X.pdf
+ * Texas Instruments TPS40400, TPS40422
+ Prefixes: 'tps40400', 'tps40422'
+ Addresses scanned: -
+ Datasheets:
+ http://www.ti.com/lit/gpn/tps40400
+ http://www.ti.com/lit/gpn/tps40422
+ * Generic PMBus devices
+ Prefix: 'pmbus'
+ Addresses scanned: -
+ Datasheet: n.a.
+
+Author: Guenter Roeck <linux@roeck-us.net>
+
+
+Description
+-----------
+
+This driver supports hardware montoring for various PMBus compliant devices.
+It supports voltage, current, power, and temperature sensors as supported
+by the device.
+
+Each monitored channel has its own high and low limits, plus a critical
+limit.
+
+Fan support will be added in a later version of this driver.
+
+
+Usage Notes
+-----------
+
+This driver does not probe for PMBus devices, since there is no register
+which can be safely used to identify the chip (The MFG_ID register is not
+supported by all chips), and since there is no well defined address range for
+PMBus devices. You will have to instantiate the devices explicitly.
+
+Example: the following will load the driver for an LTC2978 at address 0x60
+on I2C bus #1:
+$ modprobe pmbus
+[KML: Not for the backport]
+$ echo ltc2978 0x60 > /sys/bus/i2c/devices/i2c-1/new_device
+
+
+Platform data support
+---------------------
+
+Support for additional PMBus chips can be added by defining chip parameters in
+a new chip specific driver file. For example, (untested) code to add support for
+Emerson DS1200 power modules might look as follows.
+
+static struct pmbus_driver_info ds1200_info = {
+ .pages = 1,
+ /* Note: All other sensors are in linear mode */
+ .direct[PSC_VOLTAGE_OUT] = true,
+ .direct[PSC_TEMPERATURE] = true,
+ .direct[PSC_CURRENT_OUT] = true,
+ .m[PSC_VOLTAGE_IN] = 1,
+ .b[PSC_VOLTAGE_IN] = 0,
+ .R[PSC_VOLTAGE_IN] = 3,
+ .m[PSC_VOLTAGE_OUT] = 1,
+ .b[PSC_VOLTAGE_OUT] = 0,
+ .R[PSC_VOLTAGE_OUT] = 3,
+ .m[PSC_TEMPERATURE] = 1,
+ .b[PSC_TEMPERATURE] = 0,
+ .R[PSC_TEMPERATURE] = 3,
+ .func[0] = PMBUS_HAVE_VIN | PMBUS_HAVE_IIN | PMBUS_HAVE_STATUS_INPUT
+ | PMBUS_HAVE_VOUT | PMBUS_HAVE_STATUS_VOUT
+ | PMBUS_HAVE_IOUT | PMBUS_HAVE_STATUS_IOUT
+ | PMBUS_HAVE_PIN | PMBUS_HAVE_POUT
+ | PMBUS_HAVE_TEMP | PMBUS_HAVE_STATUS_TEMP
+ | PMBUS_HAVE_FAN12 | PMBUS_HAVE_STATUS_FAN12,
+};
+
+static int ds1200_probe(struct i2c_client *client,
+ const struct i2c_device_id *id)
+{
+ return pmbus_do_probe(client, id, &ds1200_info);
+}
+
+static int ds1200_remove(struct i2c_client *client)
+{
+ return pmbus_do_remove(client);
+}
+
+static const struct i2c_device_id ds1200_id[] = {
+ {"ds1200", 0},
+ {}
+};
+
+MODULE_DEVICE_TABLE(i2c, ds1200_id);
+
+/* This is the driver that will be inserted */
+static struct i2c_driver ds1200_driver = {
+ .driver = {
+ .name = "ds1200",
+ },
+ .probe = ds1200_probe,
+ .remove = ds1200_remove,
+ .id_table = ds1200_id,
+};
+
+static int __init ds1200_init(void)
+{
+ return i2c_add_driver(&ds1200_driver);
+}
+
+static void __exit ds1200_exit(void)
+{
+ i2c_del_driver(&ds1200_driver);
+}
+
+
+Sysfs entries
+-------------
+
+When probing the chip, the driver identifies which PMBus registers are
+supported, and determines available sensors from this information.
+Attribute files only exist if respective sensors are supported by the chip.
+Labels are provided to inform the user about the sensor associated with
+a given sysfs entry.
+
+The following attributes are supported. Limits are read-write; all other
+attributes are read-only.
+
+inX_input Measured voltage. From READ_VIN or READ_VOUT register.
+inX_min Minimum Voltage.
+ From VIN_UV_WARN_LIMIT or VOUT_UV_WARN_LIMIT register.
+inX_max Maximum voltage.
+ From VIN_OV_WARN_LIMIT or VOUT_OV_WARN_LIMIT register.
+inX_lcrit Critical minimum Voltage.
+ From VIN_UV_FAULT_LIMIT or VOUT_UV_FAULT_LIMIT register.
+inX_crit Critical maximum voltage.
+ From VIN_OV_FAULT_LIMIT or VOUT_OV_FAULT_LIMIT register.
+inX_min_alarm Voltage low alarm. From VOLTAGE_UV_WARNING status.
+inX_max_alarm Voltage high alarm. From VOLTAGE_OV_WARNING status.
+inX_lcrit_alarm Voltage critical low alarm.
+ From VOLTAGE_UV_FAULT status.
+inX_crit_alarm Voltage critical high alarm.
+ From VOLTAGE_OV_FAULT status.
+inX_label "vin", "vcap", or "voutY"
+
+currX_input Measured current. From READ_IIN or READ_IOUT register.
+currX_max Maximum current.
+ From IIN_OC_WARN_LIMIT or IOUT_OC_WARN_LIMIT register.
+currX_lcrit Critical minimum output current.
+ From IOUT_UC_FAULT_LIMIT register.
+currX_crit Critical maximum current.
+ From IIN_OC_FAULT_LIMIT or IOUT_OC_FAULT_LIMIT register.
+currX_alarm Current high alarm.
+ From IIN_OC_WARNING or IOUT_OC_WARNING status.
+currX_max_alarm Current high alarm.
+ From IIN_OC_WARN_LIMIT or IOUT_OC_WARN_LIMIT status.
+currX_lcrit_alarm Output current critical low alarm.
+ From IOUT_UC_FAULT status.
+currX_crit_alarm Current critical high alarm.
+ From IIN_OC_FAULT or IOUT_OC_FAULT status.
+currX_label "iin" or "ioutY"
+
+powerX_input Measured power. From READ_PIN or READ_POUT register.
+powerX_cap Output power cap. From POUT_MAX register.
+powerX_max Power limit. From PIN_OP_WARN_LIMIT or
+ POUT_OP_WARN_LIMIT register.
+powerX_crit Critical output power limit.
+ From POUT_OP_FAULT_LIMIT register.
+powerX_alarm Power high alarm.
+ From PIN_OP_WARNING or POUT_OP_WARNING status.
+powerX_crit_alarm Output power critical high alarm.
+ From POUT_OP_FAULT status.
+powerX_label "pin" or "poutY"
+
+tempX_input Measured temperature.
+ From READ_TEMPERATURE_X register.
+tempX_min Mimimum temperature. From UT_WARN_LIMIT register.
+tempX_max Maximum temperature. From OT_WARN_LIMIT register.
+tempX_lcrit Critical low temperature.
+ From UT_FAULT_LIMIT register.
+tempX_crit Critical high temperature.
+ From OT_FAULT_LIMIT register.
+tempX_min_alarm Chip temperature low alarm. Set by comparing
+ READ_TEMPERATURE_X with UT_WARN_LIMIT if
+ TEMP_UT_WARNING status is set.
+tempX_max_alarm Chip temperature high alarm. Set by comparing
+ READ_TEMPERATURE_X with OT_WARN_LIMIT if
+ TEMP_OT_WARNING status is set.
+tempX_lcrit_alarm Chip temperature critical low alarm. Set by comparing
+ READ_TEMPERATURE_X with UT_FAULT_LIMIT if
+ TEMP_UT_FAULT status is set.
+tempX_crit_alarm Chip temperature critical high alarm. Set by comparing
+ READ_TEMPERATURE_X with OT_FAULT_LIMIT if
+ TEMP_OT_FAULT status is set.
diff --git a/Documentation/hwmon/pmbus-core b/Documentation/hwmon/pmbus-core
new file mode 100644
index 0000000..31e4720
--- /dev/null
+++ b/Documentation/hwmon/pmbus-core
@@ -0,0 +1,283 @@
+PMBus core driver and internal API
+==================================
+
+Introduction
+============
+
+[from pmbus.org] The Power Management Bus (PMBus) is an open standard
+power-management protocol with a fully defined command language that facilitates
+communication with power converters and other devices in a power system. The
+protocol is implemented over the industry-standard SMBus serial interface and
+enables programming, control, and real-time monitoring of compliant power
+conversion products. This flexible and highly versatile standard allows for
+communication between devices based on both analog and digital technologies, and
+provides true interoperability which will reduce design complexity and shorten
+time to market for power system designers. Pioneered by leading power supply and
+semiconductor companies, this open power system standard is maintained and
+promoted by the PMBus Implementers Forum (PMBus-IF), comprising 30+ adopters
+with the objective to provide support to, and facilitate adoption among, users.
+
+Unfortunately, while PMBus commands are standardized, there are no mandatory
+commands, and manufacturers can add as many non-standard commands as they like.
+Also, different PMBUs devices act differently if non-supported commands are
+executed. Some devices return an error, some devices return 0xff or 0xffff and
+set a status error flag, and some devices may simply hang up.
+
+Despite all those difficulties, a generic PMBus device driver is still useful
+and supported since kernel version 2.6.39. However, it was necessary to support
+device specific extensions in addition to the core PMBus driver, since it is
+simply unknown what new device specific functionality PMBus device developers
+come up with next.
+
+To make device specific extensions as scalable as possible, and to avoid having
+to modify the core PMBus driver repeatedly for new devices, the PMBus driver was
+split into core, generic, and device specific code. The core code (in
+pmbus_core.c) provides generic functionality. The generic code (in pmbus.c)
+provides support for generic PMBus devices. Device specific code is responsible
+for device specific initialization and, if needed, maps device specific
+functionality into generic functionality. This is to some degree comparable
+to PCI code, where generic code is augmented as needed with quirks for all kinds
+of devices.
+
+PMBus device capabilities auto-detection
+========================================
+
+For generic PMBus devices, code in pmbus.c attempts to auto-detect all supported
+PMBus commands. Auto-detection is somewhat limited, since there are simply too
+many variables to consider. For example, it is almost impossible to autodetect
+which PMBus commands are paged and which commands are replicated across all
+pages (see the PMBus specification for details on multi-page PMBus devices).
+
+For this reason, it often makes sense to provide a device specific driver if not
+all commands can be auto-detected. The data structures in this driver can be
+used to inform the core driver about functionality supported by individual
+chips.
+
+Some commands are always auto-detected. This applies to all limit commands
+(lcrit, min, max, and crit attributes) as well as associated alarm attributes.
+Limits and alarm attributes are auto-detected because there are simply too many
+possible combinations to provide a manual configuration interface.
+
+PMBus internal API
+==================
+
+The API between core and device specific PMBus code is defined in
+drivers/hwmon/pmbus/pmbus.h. In addition to the internal API, pmbus.h defines
+standard PMBus commands and virtual PMBus commands.
+
+Standard PMBus commands
+-----------------------
+
+Standard PMBus commands (commands values 0x00 to 0xff) are defined in the PMBUs
+specification.
+
+Virtual PMBus commands
+----------------------
+
+Virtual PMBus commands are provided to enable support for non-standard
+functionality which has been implemented by several chip vendors and is thus
+desirable to support.
+
+Virtual PMBus commands start with command value 0x100 and can thus easily be
+distinguished from standard PMBus commands (which can not have values larger
+than 0xff). Support for virtual PMBus commands is device specific and thus has
+to be implemented in device specific code.
+
+Virtual commands are named PMBUS_VIRT_xxx and start with PMBUS_VIRT_BASE. All
+virtual commands are word sized.
+
+There are currently two types of virtual commands.
+
+- READ commands are read-only; writes are either ignored or return an error.
+- RESET commands are read/write. Reading reset registers returns zero
+ (used for detection), writing any value causes the associated history to be
+ reset.
+
+Virtual commands have to be handled in device specific driver code. Chip driver
+code returns non-negative values if a virtual command is supported, or a
+negative error code if not. The chip driver may return -ENODATA or any other
+Linux error code in this case, though an error code other than -ENODATA is
+handled more efficiently and thus preferred. Either case, the calling PMBus
+core code will abort if the chip driver returns an error code when reading
+or writing virtual registers (in other words, the PMBus core code will never
+send a virtual command to a chip).
+
+PMBus driver information
+------------------------
+
+PMBus driver information, defined in struct pmbus_driver_info, is the main means
+for device specific drivers to pass information to the core PMBus driver.
+Specifically, it provides the following information.
+
+- For devices supporting its data in Direct Data Format, it provides coefficients
+ for converting register values into normalized data. This data is usually
+ provided by chip manufacturers in device datasheets.
+- Supported chip functionality can be provided to the core driver. This may be
+ necessary for chips which react badly if non-supported commands are executed,
+ and/or to speed up device detection and initialization.
+- Several function entry points are provided to support overriding and/or
+ augmenting generic command execution. This functionality can be used to map
+ non-standard PMBus commands to standard commands, or to augment standard
+ command return values with device specific information.
+
+ API functions
+ -------------
+
+ Functions provided by chip driver
+ ---------------------------------
+
+ All functions return the command return value (read) or zero (write) if
+ successful. A return value of -ENODATA indicates that there is no manufacturer
+ specific command, but that a standard PMBus command may exist. Any other
+ negative return value indicates that the commands does not exist for this
+ chip, and that no attempt should be made to read or write the standard
+ command.
+
+ As mentioned above, an exception to this rule applies to virtual commands,
+ which _must_ be handled in driver specific code. See "Virtual PMBus Commands"
+ above for more details.
+
+ Command execution in the core PMBus driver code is as follows.
+
+ if (chip_access_function) {
+ status = chip_access_function();
+ if (status != -ENODATA)
+ return status;
+ }
+ if (command >= PMBUS_VIRT_BASE) /* For word commands/registers only */
+ return -EINVAL;
+ return generic_access();
+
+ Chip drivers may provide pointers to the following functions in struct
+ pmbus_driver_info. All functions are optional.
+
+ int (*read_byte_data)(struct i2c_client *client, int page, int reg);
+
+ Read byte from page <page>, register <reg>.
+ <page> may be -1, which means "current page".
+
+ int (*read_word_data)(struct i2c_client *client, int page, int reg);
+
+ Read word from page <page>, register <reg>.
+
+ int (*write_word_data)(struct i2c_client *client, int page, int reg,
+ u16 word);
+
+ Write word to page <page>, register <reg>.
+
+ int (*write_byte)(struct i2c_client *client, int page, u8 value);
+
+ Write byte to page <page>, register <reg>.
+ <page> may be -1, which means "current page".
+
+ int (*identify)(struct i2c_client *client, struct pmbus_driver_info *info);
+
+ Determine supported PMBus functionality. This function is only necessary
+ if a chip driver supports multiple chips, and the chip functionality is not
+ pre-determined. It is currently only used by the generic pmbus driver
+ (pmbus.c).
+
+ Functions exported by core driver
+ ---------------------------------
+
+ Chip drivers are expected to use the following functions to read or write
+ PMBus registers. Chip drivers may also use direct I2C commands. If direct I2C
+ commands are used, the chip driver code must not directly modify the current
+ page, since the selected page is cached in the core driver and the core driver
+ will assume that it is selected. Using pmbus_set_page() to select a new page
+ is mandatory.
+
+ int pmbus_set_page(struct i2c_client *client, u8 page);
+
+ Set PMBus page register to <page> for subsequent commands.
+
+ int pmbus_read_word_data(struct i2c_client *client, u8 page, u8 reg);
+
+ Read word data from <page>, <reg>. Similar to i2c_smbus_read_word_data(), but
+ selects page first.
+
+ int pmbus_write_word_data(struct i2c_client *client, u8 page, u8 reg,
+ u16 word);
+
+ Write word data to <page>, <reg>. Similar to i2c_smbus_write_word_data(), but
+ selects page first.
+
+ int pmbus_read_byte_data(struct i2c_client *client, int page, u8 reg);
+
+ Read byte data from <page>, <reg>. Similar to i2c_smbus_read_byte_data(), but
+ selects page first. <page> may be -1, which means "current page".
+
+ int pmbus_write_byte(struct i2c_client *client, int page, u8 value);
+
+ Write byte data to <page>, <reg>. Similar to i2c_smbus_write_byte(), but
+ selects page first. <page> may be -1, which means "current page".
+
+ void pmbus_clear_faults(struct i2c_client *client);
+
+ Execute PMBus "Clear Fault" command on all chip pages.
+ This function calls the device specific write_byte function if defined.
+ Therefore, it must _not_ be called from that function.
+
+ bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
+
+ Check if byte register exists. Return true if the register exists, false
+ otherwise.
+ This function calls the device specific write_byte function if defined to
+ obtain the chip status. Therefore, it must _not_ be called from that function.
+
+ bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
+
+ Check if word register exists. Return true if the register exists, false
+ otherwise.
+ This function calls the device specific write_byte function if defined to
+ obtain the chip status. Therefore, it must _not_ be called from that function.
+
+ int pmbus_do_probe(struct i2c_client *client, const struct i2c_device_id *id,
+ struct pmbus_driver_info *info);
+
+ Execute probe function. Similar to standard probe function for other drivers,
+ with the pointer to struct pmbus_driver_info as additional argument. Calls
+ identify function if supported. Must only be called from device probe
+ function.
+
+ void pmbus_do_remove(struct i2c_client *client);
+
+ Execute driver remove function. Similar to standard driver remove function.
+
+ const struct pmbus_driver_info
+ *pmbus_get_driver_info(struct i2c_client *client);
+
+ Return pointer to struct pmbus_driver_info as passed to pmbus_do_probe().
+
+
+PMBus driver platform data
+==========================
+
+PMBus platform data is defined in include/linux/i2c/pmbus.h. Platform data
+currently only provides a flag field with a single bit used.
+
+#define PMBUS_SKIP_STATUS_CHECK (1 << 0)
+
+struct pmbus_platform_data {
+ u32 flags; /* Device specific flags */
+};
+
+
+Flags
+-----
+
+PMBUS_SKIP_STATUS_CHECK
+
+During register detection, skip checking the status register for
+communication or command errors.
+
+Some PMBus chips respond with valid data when trying to read an unsupported
+register. For such chips, checking the status register is mandatory when
+trying to determine if a chip register exists or not.
+Other PMBus chips don't support the STATUS_CML register, or report
+communication errors for no explicable reason. For such chips, checking the
+status register must be disabled.
+
+Some i2c controllers do not support single-byte commands (write commands with
+no data, i2c_smbus_write_byte()). With such controllers, clearing the status
+register is impossible, and the PMBUS_SKIP_STATUS_CHECK flag must be set.
diff --git a/Documentation/hwmon/sis5595 b/Documentation/hwmon/sis5595
new file mode 100644
index 0000000..4f8877a
--- /dev/null
+++ b/Documentation/hwmon/sis5595
@@ -0,0 +1,106 @@
+Kernel driver sis5595
+=====================
+
+Supported chips:
+ * Silicon Integrated Systems Corp. SiS5595 Southbridge Hardware Monitor
+ Prefix: 'sis5595'
+ Addresses scanned: ISA in PCI-space encoded address
+ Datasheet: Publicly available at the Silicon Integrated Systems Corp. site.
+
+Authors:
+ Kyösti Mälkki <kmalkki@cc.hut.fi>,
+ Mark D. Studebaker <mdsxyz123@yahoo.com>,
+ Aurelien Jarno <aurelien@aurel32.net> 2.6 port
+
+ SiS southbridge has a LM78-like chip integrated on the same IC.
+ This driver is a customized copy of lm78.c
+
+ Supports following revisions:
+ Version PCI ID PCI Revision
+ 1 1039/0008 AF or less
+ 2 1039/0008 B0 or greater
+
+ Note: these chips contain a 0008 device which is incompatible with the
+ 5595. We recognize these by the presence of the listed
+ "blacklist" PCI ID and refuse to load.
+
+ NOT SUPPORTED PCI ID BLACKLIST PCI ID
+ 540 0008 0540
+ 550 0008 0550
+ 5513 0008 5511
+ 5581 0008 5597
+ 5582 0008 5597
+ 5597 0008 5597
+ 630 0008 0630
+ 645 0008 0645
+ 730 0008 0730
+ 735 0008 0735
+
+
+Module Parameters
+-----------------
+force_addr=0xaddr Set the I/O base address. Useful for boards
+ that don't set the address in the BIOS. Does not do a
+ PCI force; the device must still be present in lspci.
+ Don't use this unless the driver complains that the
+ base address is not set.
+ Example: 'modprobe sis5595 force_addr=0x290'
+
+
+Description
+-----------
+
+The SiS5595 southbridge has integrated hardware monitor functions. It also
+has an I2C bus, but this driver only supports the hardware monitor. For the
+I2C bus driver see i2c-sis5595.
+
+The SiS5595 implements zero or one temperature sensor, two fan speed
+sensors, four or five voltage sensors, and alarms.
+
+On the first version of the chip, there are four voltage sensors and one
+temperature sensor.
+
+On the second version of the chip, the temperature sensor (temp) and the
+fifth voltage sensor (in4) share a pin which is configurable, but not
+through the driver. Sorry. The driver senses the configuration of the pin,
+which was hopefully set by the BIOS.
+
+Temperatures are measured in degrees Celsius. An alarm is triggered once
+when the max is crossed; it is also triggered when it drops below the min
+value. Measurements are guaranteed between -55 and +125 degrees, with a
+resolution of 1 degree.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4 or 8) to give
+the readings more range or accuracy. Not all RPM values can accurately be
+represented, so some rounding is done. With a divider of 2, the lowest
+representable value is around 2600 RPM.
+
+Voltage sensors (also known as IN sensors) report their values in volts. An
+alarm is triggered if the voltage has crossed a programmable minimum or
+maximum limit. Note that minimum in this case always means 'closest to
+zero'; this is important for negative voltage measurements. All voltage
+inputs can measure voltages between 0 and 4.08 volts, with a resolution of
+0.016 volt.
+
+In addition to the alarms described above, there is a BTI alarm, which gets
+triggered when an external chip has crossed its limits. Usually, this is
+connected to some LM75-like chip; if at least one crosses its limits, this
+bit gets set.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may already
+have disappeared! Note that in the current implementation, all hardware
+registers are read whenever any data is read (unless it is less than 1.5
+seconds since the last update). This means that you can easily miss
+once-only alarms.
+
+The SiS5595 only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
+
+Problems
+--------
+Some chips refuse to be enabled. We don't know why.
+The driver will recognize this and print a message in dmesg.
+
diff --git a/Documentation/hwmon/smsc47b397 b/Documentation/hwmon/smsc47b397
new file mode 100644
index 0000000..3a43b69
--- /dev/null
+++ b/Documentation/hwmon/smsc47b397
@@ -0,0 +1,163 @@
+Kernel driver smsc47b397
+========================
+
+Supported chips:
+ * SMSC LPC47B397-NC
+ * SMSC SCH5307-NS
+ * SMSC SCH5317
+ Prefix: 'smsc47b397'
+ Addresses scanned: none, address read from Super I/O config space
+ Datasheet: In this file
+
+Authors: Mark M. Hoffman <mhoffman@lightlink.com>
+ Utilitek Systems, Inc.
+
+November 23, 2004
+
+The following specification describes the SMSC LPC47B397-NC[1] sensor chip
+(for which there is no public datasheet available). This document was
+provided by Craig Kelly (In-Store Broadcast Network) and edited/corrected
+by Mark M. Hoffman <mhoffman@lightlink.com>.
+
+[1] And SMSC SCH5307-NS and SCH5317, which have different device IDs but are
+otherwise compatible.
+
+* * * * *
+
+Methods for detecting the HP SIO and reading the thermal data on a dc7100.
+
+The thermal information on the dc7100 is contained in the SIO Hardware Monitor
+(HWM). The information is accessed through an index/data pair. The index/data
+pair is located at the HWM Base Address + 0 and the HWM Base Address + 1. The
+HWM Base address can be obtained from Logical Device 8, registers 0x60 (MSB)
+and 0x61 (LSB). Currently we are using 0x480 for the HWM Base Address and
+0x480 and 0x481 for the index/data pair.
+
+Reading temperature information.
+The temperature information is located in the following registers:
+Temp1 0x25 (Currently, this reflects the CPU temp on all systems).
+Temp2 0x26
+Temp3 0x27
+Temp4 0x80
+
+Programming Example
+The following is an example of how to read the HWM temperature registers:
+MOV DX,480H
+MOV AX,25H
+OUT DX,AL
+MOV DX,481H
+IN AL,DX
+
+AL contains the data in hex, the temperature in Celsius is the decimal
+equivalent.
+
+Ex: If AL contains 0x2A, the temperature is 42 degrees C.
+
+Reading tach information.
+The fan speed information is located in the following registers:
+ LSB MSB
+Tach1 0x28 0x29 (Currently, this reflects the CPU
+ fan speed on all systems).
+Tach2 0x2A 0x2B
+Tach3 0x2C 0x2D
+Tach4 0x2E 0x2F
+
+Important!!!
+Reading the tach LSB locks the tach MSB.
+The LSB Must be read first.
+
+How to convert the tach reading to RPM.
+The tach reading (TCount) is given by: (Tach MSB * 256) + (Tach LSB)
+The SIO counts the number of 90kHz (11.111us) pulses per revolution.
+RPM = 60/(TCount * 11.111us)
+
+Example:
+Reg 0x28 = 0x9B
+Reg 0x29 = 0x08
+
+TCount = 0x89B = 2203
+
+RPM = 60 / (2203 * 11.11111 E-6) = 2451 RPM
+
+Obtaining the SIO version.
+
+CONFIGURATION SEQUENCE
+To program the configuration registers, the following sequence must be followed:
+1. Enter Configuration Mode
+2. Configure the Configuration Registers
+3. Exit Configuration Mode.
+
+Enter Configuration Mode
+To place the chip into the Configuration State The config key (0x55) is written
+to the CONFIG PORT (0x2E).
+
+Configuration Mode
+In configuration mode, the INDEX PORT is located at the CONFIG PORT address and
+the DATA PORT is at INDEX PORT address + 1.
+
+The desired configuration registers are accessed in two steps:
+a. Write the index of the Logical Device Number Configuration Register
+ (i.e., 0x07) to the INDEX PORT and then write the number of the
+ desired logical device to the DATA PORT.
+
+b. Write the address of the desired configuration register within the
+ logical device to the INDEX PORT and then write or read the config-
+ uration register through the DATA PORT.
+
+Note: If accessing the Global Configuration Registers, step (a) is not required.
+
+Exit Configuration Mode
+To exit the Configuration State the write 0xAA to the CONFIG PORT (0x2E).
+The chip returns to the RUN State. (This is important).
+
+Programming Example
+The following is an example of how to read the SIO Device ID located at 0x20
+
+; ENTER CONFIGURATION MODE
+MOV DX,02EH
+MOV AX,055H
+OUT DX,AL
+; GLOBAL CONFIGURATION REGISTER
+MOV DX,02EH
+MOV AL,20H
+OUT DX,AL
+; READ THE DATA
+MOV DX,02FH
+IN AL,DX
+; EXIT CONFIGURATION MODE
+MOV DX,02EH
+MOV AX,0AAH
+OUT DX,AL
+
+The registers of interest for identifying the SIO on the dc7100 are Device ID
+(0x20) and Device Rev (0x21).
+
+The Device ID will read 0x6F (0x81 for SCH5307-NS, and 0x85 for SCH5317)
+The Device Rev currently reads 0x01
+
+Obtaining the HWM Base Address.
+The following is an example of how to read the HWM Base Address located in
+Logical Device 8.
+
+; ENTER CONFIGURATION MODE
+MOV DX,02EH
+MOV AX,055H
+OUT DX,AL
+; CONFIGURE REGISTER CRE0,
+; LOGICAL DEVICE 8
+MOV DX,02EH
+MOV AL,07H
+OUT DX,AL ;Point to LD# Config Reg
+MOV DX,02FH
+MOV AL, 08H
+OUT DX,AL;Point to Logical Device 8
+;
+MOV DX,02EH
+MOV AL,60H
+OUT DX,AL ; Point to HWM Base Addr MSB
+MOV DX,02FH
+IN AL,DX ; Get MSB of HWM Base Addr
+; EXIT CONFIGURATION MODE
+MOV DX,02EH
+MOV AX,0AAH
+OUT DX,AL
diff --git a/Documentation/hwmon/smsc47m1 b/Documentation/hwmon/smsc47m1
new file mode 100644
index 0000000..42c8431
--- /dev/null
+++ b/Documentation/hwmon/smsc47m1
@@ -0,0 +1,66 @@
+Kernel driver smsc47m1
+======================
+
+Supported chips:
+ * SMSC LPC47B27x, LPC47M112, LPC47M10x, LPC47M13x, LPC47M14x,
+ LPC47M15x and LPC47M192
+ Addresses scanned: none, address read from Super I/O config space
+ Prefix: 'smsc47m1'
+ Datasheets:
+ http://www.smsc.com/main/datasheets/47b27x.pdf
+ http://www.smsc.com/main/datasheets/47m10x.pdf
+ http://www.smsc.com/main/datasheets/47m112.pdf
+ http://www.smsc.com/main/tools/discontinued/47m13x.pdf
+ http://www.smsc.com/main/datasheets/47m14x.pdf
+ http://www.smsc.com/main/tools/discontinued/47m15x.pdf
+ http://www.smsc.com/main/datasheets/47m192.pdf
+ * SMSC LPC47M292
+ Addresses scanned: none, address read from Super I/O config space
+ Prefix: 'smsc47m2'
+ Datasheet: Not public
+ * SMSC LPC47M997
+ Addresses scanned: none, address read from Super I/O config space
+ Prefix: 'smsc47m1'
+ Datasheet: none
+
+Authors:
+ Mark D. Studebaker <mdsxyz123@yahoo.com>,
+ With assistance from Bruce Allen <ballen@uwm.edu>, and his
+ fan.c program: http://www.lsc-group.phys.uwm.edu/%7Eballen/driver/
+ Gabriele Gorla <gorlik@yahoo.com>,
+ Jean Delvare <khali@linux-fr.org>
+
+Description
+-----------
+
+The Standard Microsystems Corporation (SMSC) 47M1xx Super I/O chips
+contain monitoring and PWM control circuitry for two fans.
+
+The LPC47M15x, LPC47M192 and LPC47M292 chips contain a full 'hardware
+monitoring block' in addition to the fan monitoring and control. The
+hardware monitoring block is not supported by this driver, use the
+smsc47m192 driver for that.
+
+No documentation is available for the 47M997, but it has the same device
+ID as the 47M15x and 47M192 chips and seems to be compatible.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4 or 8) to give
+the readings more range or accuracy. Not all RPM values can accurately be
+represented, so some rounding is done. With a divider of 2, the lowest
+representable value is around 2600 RPM.
+
+PWM values are from 0 to 255.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may
+already have disappeared! Note that in the current implementation, all
+hardware registers are read whenever any data is read (unless it is less
+than 1.5 seconds since the last update). This means that you can easily
+miss once-only alarms.
+
+
+**********************
+The lm_sensors project gratefully acknowledges the support of
+Intel in the development of this driver.
diff --git a/Documentation/hwmon/smsc47m192 b/Documentation/hwmon/smsc47m192
new file mode 100644
index 0000000..6d54ecb
--- /dev/null
+++ b/Documentation/hwmon/smsc47m192
@@ -0,0 +1,103 @@
+Kernel driver smsc47m192
+========================
+
+Supported chips:
+ * SMSC LPC47M192, LPC47M15x, LPC47M292 and LPC47M997
+ Prefix: 'smsc47m192'
+ Addresses scanned: I2C 0x2c - 0x2d
+ Datasheet: The datasheet for LPC47M192 is publicly available from
+ http://www.smsc.com/
+ The LPC47M15x, LPC47M292 and LPC47M997 are compatible for
+ hardware monitoring.
+
+Author: Hartmut Rick <linux@rick.claranet.de>
+ Special thanks to Jean Delvare for careful checking
+ of the code and many helpful comments and suggestions.
+
+
+Description
+-----------
+
+This driver implements support for the hardware sensor capabilities
+of the SMSC LPC47M192 and compatible Super-I/O chips.
+
+These chips support 3 temperature channels and 8 voltage inputs
+as well as CPU voltage VID input.
+
+They do also have fan monitoring and control capabilities, but the
+these features are accessed via ISA bus and are not supported by this
+driver. Use the 'smsc47m1' driver for fan monitoring and control.
+
+Voltages and temperatures are measured by an 8-bit ADC, the resolution
+of the temperatures is 1 bit per degree C.
+Voltages are scaled such that the nominal voltage corresponds to
+192 counts, i.e. 3/4 of the full range. Thus the available range for
+each voltage channel is 0V ... 255/192*(nominal voltage), the resolution
+is 1 bit per (nominal voltage)/192.
+Both voltage and temperature values are scaled by 1000, the sys files
+show voltages in mV and temperatures in units of 0.001 degC.
+
+The +12V analog voltage input channel (in4_input) is multiplexed with
+bit 4 of the encoded CPU voltage. This means that you either get
+a +12V voltage measurement or a 5 bit CPU VID, but not both.
+The default setting is to use the pin as 12V input, and use only 4 bit VID.
+This driver assumes that the information in the configuration register
+is correct, i.e. that the BIOS has updated the configuration if
+the motherboard has this input wired to VID4.
+
+The temperature and voltage readings are updated once every 1.5 seconds.
+Reading them more often repeats the same values.
+
+
+sysfs interface
+---------------
+
+in0_input - +2.5V voltage input
+in1_input - CPU voltage input (nominal 2.25V)
+in2_input - +3.3V voltage input
+in3_input - +5V voltage input
+in4_input - +12V voltage input (may be missing if used as VID4)
+in5_input - Vcc voltage input (nominal 3.3V)
+ This is the supply voltage of the sensor chip itself.
+in6_input - +1.5V voltage input
+in7_input - +1.8V voltage input
+
+in[0-7]_min,
+in[0-7]_max - lower and upper alarm thresholds for in[0-7]_input reading
+
+ All voltages are read and written in mV.
+
+in[0-7]_alarm - alarm flags for voltage inputs
+ These files read '1' in case of alarm, '0' otherwise.
+
+temp1_input - chip temperature measured by on-chip diode
+temp[2-3]_input - temperature measured by external diodes (one of these would
+ typically be wired to the diode inside the CPU)
+
+temp[1-3]_min,
+temp[1-3]_max - lower and upper alarm thresholds for temperatures
+
+temp[1-3]_offset - temperature offset registers
+ The chip adds the offsets stored in these registers to
+ the corresponding temperature readings.
+ Note that temp1 and temp2 offsets share the same register,
+ they cannot both be different from zero at the same time.
+ Writing a non-zero number to one of them will reset the other
+ offset to zero.
+
+ All temperatures and offsets are read and written in
+ units of 0.001 degC.
+
+temp[1-3]_alarm - alarm flags for temperature inputs, '1' in case of alarm,
+ '0' otherwise.
+temp[2-3]_input_fault - diode fault flags for temperature inputs 2 and 3.
+ A fault is detected if the two pins for the corresponding
+ sensor are open or shorted, or any of the two is shorted
+ to ground or Vcc. '1' indicates a diode fault.
+
+cpu0_vid - CPU voltage as received from the CPU
+
+vrm - CPU VID standard used for decoding CPU voltage
+
+ The *_min, *_max, *_offset and vrm files can be read and
+ written, all others are read-only.
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
new file mode 100644
index 0000000..6dbfd5e
--- /dev/null
+++ b/Documentation/hwmon/sysfs-interface
@@ -0,0 +1,505 @@
+Naming and data format standards for sysfs files
+------------------------------------------------
+
+The libsensors library offers an interface to the raw sensors data
+through the sysfs interface. Since lm-sensors 3.0.0, libsensors is
+completely chip-independent. It assumes that all the kernel drivers
+implement the standard sysfs interface described in this document.
+This makes adding or updating support for any given chip very easy, as
+libsensors, and applications using it, do not need to be modified.
+This is a major improvement compared to lm-sensors 2.
+
+Note that motherboards vary widely in the connections to sensor chips.
+There is no standard that ensures, for example, that the second
+temperature sensor is connected to the CPU, or that the second fan is on
+the CPU. Also, some values reported by the chips need some computation
+before they make full sense. For example, most chips can only measure
+voltages between 0 and +4V. Other voltages are scaled back into that
+range using external resistors. Since the values of these resistors
+can change from motherboard to motherboard, the conversions cannot be
+hard coded into the driver and have to be done in user space.
+
+For this reason, even if we aim at a chip-independent libsensors, it will
+still require a configuration file (e.g. /etc/sensors.conf) for proper
+values conversion, labeling of inputs and hiding of unused inputs.
+
+An alternative method that some programs use is to access the sysfs
+files directly. This document briefly describes the standards that the
+drivers follow, so that an application program can scan for entries and
+access this data in a simple and consistent way. That said, such programs
+will have to implement conversion, labeling and hiding of inputs. For
+this reason, it is still not recommended to bypass the library.
+
+Each chip gets its own directory in the sysfs /sys/devices tree. To
+find all sensor chips, it is easier to follow the device symlinks from
+/sys/class/hwmon/hwmon*.
+
+Up to lm-sensors 3.0.0, libsensors looks for hardware monitoring attributes
+in the "physical" device directory. Since lm-sensors 3.0.1, attributes found
+in the hwmon "class" device directory are also supported. Complex drivers
+(e.g. drivers for multifunction chips) may want to use this possibility to
+avoid namespace pollution. The only drawback will be that older versions of
+libsensors won't support the driver in question.
+
+All sysfs values are fixed point numbers.
+
+There is only one value per file, unlike the older /proc specification.
+The common scheme for files naming is: <type><number>_<item>. Usual
+types for sensor chips are "in" (voltage), "temp" (temperature) and
+"fan" (fan). Usual items are "input" (measured value), "max" (high
+threshold, "min" (low threshold). Numbering usually starts from 1,
+except for voltages which start from 0 (because most data sheets use
+this). A number is always used for elements that can be present more
+than once, even if there is a single element of the given type on the
+specific chip. Other files do not refer to a specific element, so
+they have a simple name, and no number.
+
+Alarms are direct indications read from the chips. The drivers do NOT
+make comparisons of readings to thresholds. This allows violations
+between readings to be caught and alarmed. The exact definition of an
+alarm (for example, whether a threshold must be met or must be exceeded
+to cause an alarm) is chip-dependent.
+
+When setting values of hwmon sysfs attributes, the string representation of
+the desired value must be written, note that strings which are not a number
+are interpreted as 0! For more on how written strings are interpreted see the
+"sysfs attribute writes interpretation" section at the end of this file.
+
+-------------------------------------------------------------------------
+
+[0-*] denotes any positive number starting from 0
+[1-*] denotes any positive number starting from 1
+RO read only value
+RW read/write value
+
+Read/write values may be read-only for some chips, depending on the
+hardware implementation.
+
+All entries (except name) are optional, and should only be created in a
+given driver if the chip has the feature.
+
+
+********
+* Name *
+********
+
+name The chip name.
+ This should be a short, lowercase string, not containing
+ spaces nor dashes, representing the chip name. This is
+ the only mandatory attribute.
+ I2C devices get this attribute created automatically.
+ RO
+
+
+************
+* Voltages *
+************
+
+in[0-*]_min Voltage min value.
+ Unit: millivolt
+ RW
+
+in[0-*]_max Voltage max value.
+ Unit: millivolt
+ RW
+
+in[0-*]_input Voltage input value.
+ Unit: millivolt
+ RO
+ Voltage measured on the chip pin.
+ Actual voltage depends on the scaling resistors on the
+ motherboard, as recommended in the chip datasheet.
+ This varies by chip and by motherboard.
+ Because of this variation, values are generally NOT scaled
+ by the chip driver, and must be done by the application.
+ However, some drivers (notably lm87 and via686a)
+ do scale, because of internal resistors built into a chip.
+ These drivers will output the actual voltage. Rule of
+ thumb: drivers should report the voltage values at the
+ "pins" of the chip.
+
+in[0-*]_label Suggested voltage channel label.
+ Text string
+ Should only be created if the driver has hints about what
+ this voltage channel is being used for, and user-space
+ doesn't. In all other cases, the label is provided by
+ user-space.
+ RO
+
+cpu[0-*]_vid CPU core reference voltage.
+ Unit: millivolt
+ RO
+ Not always correct.
+
+vrm Voltage Regulator Module version number.
+ RW (but changing it should no more be necessary)
+ Originally the VRM standard version multiplied by 10, but now
+ an arbitrary number, as not all standards have a version
+ number.
+ Affects the way the driver calculates the CPU core reference
+ voltage from the vid pins.
+
+Also see the Alarms section for status flags associated with voltages.
+
+
+********
+* Fans *
+********
+
+fan[1-*]_min Fan minimum value
+ Unit: revolution/min (RPM)
+ RW
+
+fan[1-*]_input Fan input value.
+ Unit: revolution/min (RPM)
+ RO
+
+fan[1-*]_div Fan divisor.
+ Integer value in powers of two (1, 2, 4, 8, 16, 32, 64, 128).
+ RW
+ Some chips only support values 1, 2, 4 and 8.
+ Note that this is actually an internal clock divisor, which
+ affects the measurable speed range, not the read value.
+
+fan[1-*]_target
+ Desired fan speed
+ Unit: revolution/min (RPM)
+ RW
+ Only makes sense if the chip supports closed-loop fan speed
+ control based on the measured fan speed.
+
+fan[1-*]_label Suggested fan channel label.
+ Text string
+ Should only be created if the driver has hints about what
+ this fan channel is being used for, and user-space doesn't.
+ In all other cases, the label is provided by user-space.
+ RO
+
+Also see the Alarms section for status flags associated with fans.
+
+
+*******
+* PWM *
+*******
+
+pwm[1-*] Pulse width modulation fan control.
+ Integer value in the range 0 to 255
+ RW
+ 255 is max or 100%.
+
+pwm[1-*]_enable
+ Fan speed control method:
+ 0: no fan speed control (i.e. fan at full speed)
+ 1: manual fan speed control enabled (using pwm[1-*])
+ 2+: automatic fan speed control enabled
+ Check individual chip documentation files for automatic mode
+ details.
+ RW
+
+pwm[1-*]_mode 0: DC mode (direct current)
+ 1: PWM mode (pulse-width modulation)
+ RW
+
+pwm[1-*]_freq Base PWM frequency in Hz.
+ Only possibly available when pwmN_mode is PWM, but not always
+ present even then.
+ RW
+
+pwm[1-*]_auto_channels_temp
+ Select which temperature channels affect this PWM output in
+ auto mode. Bitfield, 1 is temp1, 2 is temp2, 4 is temp3 etc...
+ Which values are possible depend on the chip used.
+ RW
+
+pwm[1-*]_auto_point[1-*]_pwm
+pwm[1-*]_auto_point[1-*]_temp
+pwm[1-*]_auto_point[1-*]_temp_hyst
+ Define the PWM vs temperature curve. Number of trip points is
+ chip-dependent. Use this for chips which associate trip points
+ to PWM output channels.
+ RW
+
+OR
+
+temp[1-*]_auto_point[1-*]_pwm
+temp[1-*]_auto_point[1-*]_temp
+temp[1-*]_auto_point[1-*]_temp_hyst
+ Define the PWM vs temperature curve. Number of trip points is
+ chip-dependent. Use this for chips which associate trip points
+ to temperature channels.
+ RW
+
+
+****************
+* Temperatures *
+****************
+
+temp[1-*]_type Sensor type selection.
+ Integers 1 to 6
+ RW
+ 1: PII/Celeron Diode
+ 2: 3904 transistor
+ 3: thermal diode
+ 4: thermistor
+ 5: AMD AMDSI
+ 6: Intel PECI
+ Not all types are supported by all chips
+
+temp[1-*]_max Temperature max value.
+ Unit: millidegree Celsius (or millivolt, see below)
+ RW
+
+temp[1-*]_min Temperature min value.
+ Unit: millidegree Celsius
+ RW
+
+temp[1-*]_max_hyst
+ Temperature hysteresis value for max limit.
+ Unit: millidegree Celsius
+ Must be reported as an absolute temperature, NOT a delta
+ from the max value.
+ RW
+
+temp[1-*]_input Temperature input value.
+ Unit: millidegree Celsius
+ RO
+
+temp[1-*]_crit Temperature critical value, typically greater than
+ corresponding temp_max values.
+ Unit: millidegree Celsius
+ RW
+
+temp[1-*]_crit_hyst
+ Temperature hysteresis value for critical limit.
+ Unit: millidegree Celsius
+ Must be reported as an absolute temperature, NOT a delta
+ from the critical value.
+ RW
+
+temp[1-*]_offset
+ Temperature offset which is added to the temperature reading
+ by the chip.
+ Unit: millidegree Celsius
+ Read/Write value.
+
+temp[1-*]_label Suggested temperature channel label.
+ Text string
+ Should only be created if the driver has hints about what
+ this temperature channel is being used for, and user-space
+ doesn't. In all other cases, the label is provided by
+ user-space.
+ RO
+
+Some chips measure temperature using external thermistors and an ADC, and
+report the temperature measurement as a voltage. Converting this voltage
+back to a temperature (or the other way around for limits) requires
+mathematical functions not available in the kernel, so the conversion
+must occur in user space. For these chips, all temp* files described
+above should contain values expressed in millivolt instead of millidegree
+Celsius. In other words, such temperature channels are handled as voltage
+channels by the driver.
+
+Also see the Alarms section for status flags associated with temperatures.
+
+
+************
+* Currents *
+************
+
+Note that no known chip provides current measurements as of writing,
+so this part is theoretical, so to say.
+
+curr[1-*]_max Current max value
+ Unit: milliampere
+ RW
+
+curr[1-*]_min Current min value.
+ Unit: milliampere
+ RW
+
+curr[1-*]_input Current input value
+ Unit: milliampere
+ RO
+
+*********
+* Power *
+*********
+
+power[1-*]_average Average power use
+ Unit: microWatt
+ RO
+
+power[1-*]_average_interval Power use averaging interval
+ Unit: milliseconds
+ RW
+
+power[1-*]_average_highest Historical average maximum power use
+ Unit: microWatt
+ RO
+
+power[1-*]_average_lowest Historical average minimum power use
+ Unit: microWatt
+ RO
+
+power[1-*]_input Instantaneous power use
+ Unit: microWatt
+ RO
+
+power[1-*]_input_highest Historical maximum power use
+ Unit: microWatt
+ RO
+
+power[1-*]_input_lowest Historical minimum power use
+ Unit: microWatt
+ RO
+
+power[1-*]_reset_history Reset input_highest, input_lowest,
+ average_highest and average_lowest.
+ WO
+
+**********
+* Energy *
+**********
+
+energy[1-*]_input Cumulative energy use
+ Unit: microJoule
+ RO
+
+**********
+* Alarms *
+**********
+
+Each channel or limit may have an associated alarm file, containing a
+boolean value. 1 means than an alarm condition exists, 0 means no alarm.
+
+Usually a given chip will either use channel-related alarms, or
+limit-related alarms, not both. The driver should just reflect the hardware
+implementation.
+
+in[0-*]_alarm
+fan[1-*]_alarm
+temp[1-*]_alarm
+ Channel alarm
+ 0: no alarm
+ 1: alarm
+ RO
+
+OR
+
+in[0-*]_min_alarm
+in[0-*]_max_alarm
+fan[1-*]_min_alarm
+temp[1-*]_min_alarm
+temp[1-*]_max_alarm
+temp[1-*]_crit_alarm
+ Limit alarm
+ 0: no alarm
+ 1: alarm
+ RO
+
+Each input channel may have an associated fault file. This can be used
+to notify open diodes, unconnected fans etc. where the hardware
+supports it. When this boolean has value 1, the measurement for that
+channel should not be trusted.
+
+in[0-*]_fault
+fan[1-*]_fault
+temp[1-*]_fault
+ Input fault condition
+ 0: no fault occured
+ 1: fault condition
+ RO
+
+Some chips also offer the possibility to get beeped when an alarm occurs:
+
+beep_enable Master beep enable
+ 0: no beeps
+ 1: beeps
+ RW
+
+in[0-*]_beep
+fan[1-*]_beep
+temp[1-*]_beep
+ Channel beep
+ 0: disable
+ 1: enable
+ RW
+
+In theory, a chip could provide per-limit beep masking, but no such chip
+was seen so far.
+
+Old drivers provided a different, non-standard interface to alarms and
+beeps. These interface files are deprecated, but will be kept around
+for compatibility reasons:
+
+alarms Alarm bitmask.
+ RO
+ Integer representation of one to four bytes.
+ A '1' bit means an alarm.
+ Chips should be programmed for 'comparator' mode so that
+ the alarm will 'come back' after you read the register
+ if it is still valid.
+ Generally a direct representation of a chip's internal
+ alarm registers; there is no standard for the position
+ of individual bits. For this reason, the use of this
+ interface file for new drivers is discouraged. Use
+ individual *_alarm and *_fault files instead.
+ Bits are defined in kernel/include/sensors.h.
+
+beep_mask Bitmask for beep.
+ Same format as 'alarms' with the same bit locations,
+ use discouraged for the same reason. Use individual
+ *_beep files instead.
+ RW
+
+
+sysfs attribute writes interpretation
+-------------------------------------
+
+hwmon sysfs attributes always contain numbers, so the first thing to do is to
+convert the input to a number, there are 2 ways todo this depending whether
+the number can be negative or not:
+unsigned long u = simple_strtoul(buf, NULL, 10);
+long s = simple_strtol(buf, NULL, 10);
+
+With buf being the buffer with the user input being passed by the kernel.
+Notice that we do not use the second argument of strto[u]l, and thus cannot
+tell when 0 is returned, if this was really 0 or is caused by invalid input.
+This is done deliberately as checking this everywhere would add a lot of
+code to the kernel.
+
+Notice that it is important to always store the converted value in an
+unsigned long or long, so that no wrap around can happen before any further
+checking.
+
+After the input string is converted to an (unsigned) long, the value should be
+checked if its acceptable. Be careful with further conversions on the value
+before checking it for validity, as these conversions could still cause a wrap
+around before the check. For example do not multiply the result, and only
+add/subtract if it has been divided before the add/subtract.
+
+What to do if a value is found to be invalid, depends on the type of the
+sysfs attribute that is being set. If it is a continuous setting like a
+tempX_max or inX_max attribute, then the value should be clamped to its
+limits using SENSORS_LIMIT(value, min_limit, max_limit). If it is not
+continuous like for example a tempX_type, then when an invalid value is
+written, -EINVAL should be returned.
+
+Example1, temp1_max, register is a signed 8 bit value (-128 - 127 degrees):
+
+ long v = simple_strtol(buf, NULL, 10) / 1000;
+ v = SENSORS_LIMIT(v, -128, 127);
+ /* write v to register */
+
+Example2, fan divider setting, valid values 2, 4 and 8:
+
+ unsigned long v = simple_strtoul(buf, NULL, 10);
+
+ switch (v) {
+ case 2: v = 1; break;
+ case 4: v = 2; break;
+ case 8: v = 3; break;
+ default:
+ return -EINVAL;
+ }
+ /* write v to register */
diff --git a/Documentation/hwmon/thmc50 b/Documentation/hwmon/thmc50
new file mode 100644
index 0000000..9639ca9
--- /dev/null
+++ b/Documentation/hwmon/thmc50
@@ -0,0 +1,74 @@
+Kernel driver thmc50
+=====================
+
+Supported chips:
+ * Analog Devices ADM1022
+ Prefix: 'adm1022'
+ Addresses scanned: I2C 0x2c - 0x2e
+ Datasheet: http://www.analog.com/en/prod/0,2877,ADM1022,00.html
+ * Texas Instruments THMC50
+ Prefix: 'thmc50'
+ Addresses scanned: I2C 0x2c - 0x2e
+ Datasheet: http://focus.ti.com/docs/prod/folders/print/thmc50.html
+
+Author: Krzysztof Helt <krzysztof.h1@wp.pl>
+
+This driver was derived from the 2.4 kernel thmc50.c source file.
+
+Credits:
+ thmc50.c (2.4 kernel):
+ Frodo Looijaard <frodol@dds.nl>
+ Philip Edelbrock <phil@netroedge.com>
+
+Module Parameters
+-----------------
+
+* adm1022_temp3: short array
+ List of adapter,address pairs to force chips into ADM1022 mode with
+ second remote temperature. This does not work for original THMC50 chips.
+
+Description
+-----------
+
+The THMC50 implements: an internal temperature sensor, support for an
+external diode-type temperature sensor (compatible w/ the diode sensor inside
+many processors), and a controllable fan/analog_out DAC. For the temperature
+sensors, limits can be set through the appropriate Overtemperature Shutdown
+register and Hysteresis register. Each value can be set and read to half-degree
+accuracy. An alarm is issued (usually to a connected LM78) when the
+temperature gets higher then the Overtemperature Shutdown value; it stays on
+until the temperature falls below the Hysteresis value. All temperatures are in
+degrees Celsius, and are guaranteed within a range of -55 to +125 degrees.
+
+The THMC50 only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
+
+The THMC50 is usually used in combination with LM78-like chips, to measure
+the temperature of the processor(s).
+
+The ADM1022 works the same as THMC50 but it is faster (5 Hz instead of
+1 Hz for THMC50). It can be also put in a new mode to handle additional
+remote temperature sensor. The driver use the mode set by BIOS by default.
+
+In case the BIOS is broken and the mode is set incorrectly, you can force
+the mode with additional remote temperature with adm1022_temp3 parameter.
+A typical symptom of wrong setting is a fan forced to full speed.
+
+Driver Features
+---------------
+
+The driver provides up to three temperatures:
+
+temp1 -- internal
+temp2 -- remote
+temp3 -- 2nd remote only for ADM1022
+
+pwm1 -- fan speed (0 = stop, 255 = full)
+pwm1_mode -- always 0 (DC mode)
+
+The value of 0 for pwm1 also forces FAN_OFF signal from the chip,
+so it stops fans even if the value 0 into the ANALOG_OUT register does not.
+
+The driver was tested on Compaq AP550 with two ADM1022 chips (one works
+in the temp3 mode), five temperature readings and two fans.
+
diff --git a/Documentation/hwmon/userspace-tools b/Documentation/hwmon/userspace-tools
new file mode 100644
index 0000000..9865aee
--- /dev/null
+++ b/Documentation/hwmon/userspace-tools
@@ -0,0 +1,40 @@
+Introduction
+------------
+
+Most mainboards have sensor chips to monitor system health (like temperatures,
+voltages, fans speed). They are often connected through an I2C bus, but some
+are also connected directly through the ISA bus.
+
+The kernel drivers make the data from the sensor chips available in the /sys
+virtual filesystem. Userspace tools are then used to display the measured
+values or configure the chips in a more friendly manner.
+
+Lm-sensors
+----------
+
+Core set of utilities that will allow you to obtain health information,
+setup monitoring limits etc. You can get them on their homepage
+http://www.lm-sensors.org/ or as a package from your Linux distribution.
+
+If from website:
+Get lm-sensors from project web site. Please note, you need only userspace
+part, so compile with "make user" and install with "make user_install".
+
+General hints to get things working:
+
+0) get lm-sensors userspace utils
+1) compile all drivers in I2C and Hardware Monitoring sections as modules
+ in your kernel
+2) run sensors-detect script, it will tell you what modules you need to load.
+3) load them and run "sensors" command, you should see some results.
+4) fix sensors.conf, labels, limits, fan divisors
+5) if any more problems consult FAQ, or documentation
+
+Other utilities
+---------------
+
+If you want some graphical indicators of system health look for applications
+like: gkrellm, ksensors, xsensors, wmtemp, wmsensors, wmgtemp, ksysguardd,
+hardware-monitor
+
+If you are server administrator you can try snmpd or mrtgutils.
diff --git a/Documentation/hwmon/via686a b/Documentation/hwmon/via686a
new file mode 100644
index 0000000..d651b25
--- /dev/null
+++ b/Documentation/hwmon/via686a
@@ -0,0 +1,78 @@
+Kernel driver via686a
+=====================
+
+Supported chips:
+ * Via VT82C686A, VT82C686B Southbridge Integrated Hardware Monitor
+ Prefix: 'via686a'
+ Addresses scanned: ISA in PCI-space encoded address
+ Datasheet: On request through web form (http://www.via.com.tw/en/support/datasheets/)
+
+Authors:
+ Kyösti Mälkki <kmalkki@cc.hut.fi>,
+ Mark D. Studebaker <mdsxyz123@yahoo.com>
+ Bob Dougherty <bobd@stanford.edu>
+ (Some conversion-factor data were contributed by
+ Jonathan Teh Soon Yew <j.teh@iname.com>
+ and Alex van Kaam <darkside@chello.nl>.)
+
+Module Parameters
+-----------------
+
+force_addr=0xaddr Set the I/O base address. Useful for boards that
+ don't set the address in the BIOS. Look for a BIOS
+ upgrade before resorting to this. Does not do a
+ PCI force; the via686a must still be present in lspci.
+ Don't use this unless the driver complains that the
+ base address is not set.
+ Example: 'modprobe via686a force_addr=0x6000'
+
+Description
+-----------
+
+The driver does not distinguish between the chips and reports
+all as a 686A.
+
+The Via 686a southbridge has integrated hardware monitor functionality.
+It also has an I2C bus, but this driver only supports the hardware monitor.
+For the I2C bus driver, see <file:Documentation/i2c/busses/i2c-viapro>
+
+The Via 686a implements three temperature sensors, two fan rotation speed
+sensors, five voltage sensors and alarms.
+
+Temperatures are measured in degrees Celsius. An alarm is triggered once
+when the Overtemperature Shutdown limit is crossed; it is triggered again
+as soon as it drops below the hysteresis value.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4 or 8) to give
+the readings more range or accuracy. Not all RPM values can accurately be
+represented, so some rounding is done. With a divider of 2, the lowest
+representable value is around 2600 RPM.
+
+Voltage sensors (also known as IN sensors) report their values in volts.
+An alarm is triggered if the voltage has crossed a programmable minimum
+or maximum limit. Voltages are internally scalled, so each voltage channel
+has a different resolution and range.
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may
+already have disappeared! Note that in the current implementation, all
+hardware registers are read whenever any data is read (unless it is less
+than 1.5 seconds since the last update). This means that you can easily
+miss once-only alarms.
+
+The driver only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
+
+Known Issues
+------------
+
+This driver handles sensors integrated in some VIA south bridges. It is
+possible that a motherboard maker used a VT82C686A/B chip as part of a
+product design but was not interested in its hardware monitoring features,
+in which case the sensor inputs will not be wired. This is the case of
+the Asus K7V, A7V and A7V133 motherboards, to name only a few of them.
+So, if you need the force_addr parameter, and end up with values which
+don't seem to make any sense, don't look any further: your chip is simply
+not wired for hardware monitoring.
diff --git a/Documentation/hwmon/vt1211 b/Documentation/hwmon/vt1211
new file mode 100644
index 0000000..77fa633
--- /dev/null
+++ b/Documentation/hwmon/vt1211
@@ -0,0 +1,206 @@
+Kernel driver vt1211
+====================
+
+Supported chips:
+ * VIA VT1211
+ Prefix: 'vt1211'
+ Addresses scanned: none, address read from Super-I/O config space
+ Datasheet: Provided by VIA upon request and under NDA
+
+Authors: Juerg Haefliger <juergh@gmail.com>
+
+This driver is based on the driver for kernel 2.4 by Mark D. Studebaker and
+its port to kernel 2.6 by Lars Ekman.
+
+Thanks to Joseph Chan and Fiona Gatt from VIA for providing documentation and
+technical support.
+
+
+Module Parameters
+-----------------
+
+* uch_config: int Override the BIOS default universal channel (UCH)
+ configuration for channels 1-5.
+ Legal values are in the range of 0-31. Bit 0 maps to
+ UCH1, bit 1 maps to UCH2 and so on. Setting a bit to 1
+ enables the thermal input of that particular UCH and
+ setting a bit to 0 enables the voltage input.
+
+* int_mode: int Override the BIOS default temperature interrupt mode.
+ The only possible value is 0 which forces interrupt
+ mode 0. In this mode, any pending interrupt is cleared
+ when the status register is read but is regenerated as
+ long as the temperature stays above the hysteresis
+ limit.
+
+Be aware that overriding BIOS defaults might cause some unwanted side effects!
+
+
+Description
+-----------
+
+The VIA VT1211 Super-I/O chip includes complete hardware monitoring
+capabilities. It monitors 2 dedicated temperature sensor inputs (temp1 and
+temp2), 1 dedicated voltage (in5) and 2 fans. Additionally, the chip
+implements 5 universal input channels (UCH1-5) that can be individually
+programmed to either monitor a voltage or a temperature.
+
+This chip also provides manual and automatic control of fan speeds (according
+to the datasheet). The driver only supports automatic control since the manual
+mode doesn't seem to work as advertised in the datasheet. In fact I couldn't
+get manual mode to work at all! Be aware that automatic mode hasn't been
+tested very well (due to the fact that my EPIA M10000 doesn't have the fans
+connected to the PWM outputs of the VT1211 :-().
+
+The following table shows the relationship between the vt1211 inputs and the
+sysfs nodes.
+
+Sensor Voltage Mode Temp Mode Default Use (from the datasheet)
+------ ------------ --------- --------------------------------
+Reading 1 temp1 Intel thermal diode
+Reading 3 temp2 Internal thermal diode
+UCH1/Reading2 in0 temp3 NTC type thermistor
+UCH2 in1 temp4 +2.5V
+UCH3 in2 temp5 VccP (processor core)
+UCH4 in3 temp6 +5V
+UCH5 in4 temp7 +12V
++3.3V in5 Internal VCC (+3.3V)
+
+
+Voltage Monitoring
+------------------
+
+Voltages are sampled by an 8-bit ADC with a LSB of ~10mV. The supported input
+range is thus from 0 to 2.60V. Voltage values outside of this range need
+external scaling resistors. This external scaling needs to be compensated for
+via compute lines in sensors.conf, like:
+
+compute inx @*(1+R1/R2), @/(1+R1/R2)
+
+The board level scaling resistors according to VIA's recommendation are as
+follows. And this is of course totally dependent on the actual board
+implementation :-) You will have to find documentation for your own
+motherboard and edit sensors.conf accordingly.
+
+ Expected
+Voltage R1 R2 Divider Raw Value
+-----------------------------------------------
++2.5V 2K 10K 1.2 2083 mV
+VccP --- --- 1.0 1400 mV (1)
++5V 14K 10K 2.4 2083 mV
++12V 47K 10K 5.7 2105 mV
++3.3V (int) 2K 3.4K 1.588 3300 mV (2)
++3.3V (ext) 6.8K 10K 1.68 1964 mV
+
+(1) Depending on the CPU (1.4V is for a VIA C3 Nehemiah).
+(2) R1 and R2 for 3.3V (int) are internal to the VT1211 chip and the driver
+ performs the scaling and returns the properly scaled voltage value.
+
+Each measured voltage has an associated low and high limit which triggers an
+alarm when crossed.
+
+
+Temperature Monitoring
+----------------------
+
+Temperatures are reported in millidegree Celsius. Each measured temperature
+has a high limit which triggers an alarm if crossed. There is an associated
+hysteresis value with each temperature below which the temperature has to drop
+before the alarm is cleared (this is only true for interrupt mode 0). The
+interrupt mode can be forced to 0 in case the BIOS doesn't do it
+automatically. See the 'Module Parameters' section for details.
+
+All temperature channels except temp2 are external. Temp2 is the VT1211
+internal thermal diode and the driver does all the scaling for temp2 and
+returns the temperature in millidegree Celsius. For the external channels
+temp1 and temp3-temp7, scaling depends on the board implementation and needs
+to be performed in userspace via sensors.conf.
+
+Temp1 is an Intel-type thermal diode which requires the following formula to
+convert between sysfs readings and real temperatures:
+
+compute temp1 (@-Offset)/Gain, (@*Gain)+Offset
+
+According to the VIA VT1211 BIOS porting guide, the following gain and offset
+values should be used:
+
+Diode Type Offset Gain
+---------- ------ ----
+Intel CPU 88.638 0.9528
+ 65.000 0.9686 *)
+VIA C3 Ezra 83.869 0.9528
+VIA C3 Ezra-T 73.869 0.9528
+
+*) This is the formula from the lm_sensors 2.10.0 sensors.conf file. I don't
+know where it comes from or how it was derived, it's just listed here for
+completeness.
+
+Temp3-temp7 support NTC thermistors. For these channels, the driver returns
+the voltages as seen at the individual pins of UCH1-UCH5. The voltage at the
+pin (Vpin) is formed by a voltage divider made of the thermistor (Rth) and a
+scaling resistor (Rs):
+
+Vpin = 2200 * Rth / (Rs + Rth) (2200 is the ADC max limit of 2200 mV)
+
+The equation for the thermistor is as follows (google it if you want to know
+more about it):
+
+Rth = Ro * exp(B * (1 / T - 1 / To)) (To is 298.15K (25C) and Ro is the
+ nominal resistance at 25C)
+
+Mingling the above two equations and assuming Rs = Ro and B = 3435 yields the
+following formula for sensors.conf:
+
+compute tempx 1 / (1 / 298.15 - (` (2200 / @ - 1)) / 3435) - 273.15,
+ 2200 / (1 + (^ (3435 / 298.15 - 3435 / (273.15 + @))))
+
+
+Fan Speed Control
+-----------------
+
+The VT1211 provides 2 programmable PWM outputs to control the speeds of 2
+fans. Writing a 2 to any of the two pwm[1-2]_enable sysfs nodes will put the
+PWM controller in automatic mode. There is only a single controller that
+controls both PWM outputs but each PWM output can be individually enabled and
+disabled.
+
+Each PWM has 4 associated distinct output duty-cycles: full, high, low and
+off. Full and off are internally hard-wired to 255 (100%) and 0 (0%),
+respectively. High and low can be programmed via
+pwm[1-2]_auto_point[2-3]_pwm. Each PWM output can be associated with a
+different thermal input but - and here's the weird part - only one set of
+thermal thresholds exist that controls both PWMs output duty-cycles. The
+thermal thresholds are accessible via pwm[1-2]_auto_point[1-4]_temp. Note
+that even though there are 2 sets of 4 auto points each, they map to the same
+registers in the VT1211 and programming one set is sufficient (actually only
+the first set pwm1_auto_point[1-4]_temp is writable, the second set is
+read-only).
+
+PWM Auto Point PWM Output Duty-Cycle
+------------------------------------------------
+pwm[1-2]_auto_point4_pwm full speed duty-cycle (hard-wired to 255)
+pwm[1-2]_auto_point3_pwm high speed duty-cycle
+pwm[1-2]_auto_point2_pwm low speed duty-cycle
+pwm[1-2]_auto_point1_pwm off duty-cycle (hard-wired to 0)
+
+Temp Auto Point Thermal Threshold
+---------------------------------------------
+pwm[1-2]_auto_point4_temp full speed temp
+pwm[1-2]_auto_point3_temp high speed temp
+pwm[1-2]_auto_point2_temp low speed temp
+pwm[1-2]_auto_point1_temp off temp
+
+Long story short, the controller implements the following algorithm to set the
+PWM output duty-cycle based on the input temperature:
+
+Thermal Threshold Output Duty-Cycle
+ (Rising Temp) (Falling Temp)
+----------------------------------------------------------
+ full speed duty-cycle full speed duty-cycle
+full speed temp
+ high speed duty-cycle full speed duty-cycle
+high speed temp
+ low speed duty-cycle high speed duty-cycle
+low speed temp
+ off duty-cycle low speed duty-cycle
+off temp
diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf
new file mode 100644
index 0000000..d6e1ae3
--- /dev/null
+++ b/Documentation/hwmon/w83627ehf
@@ -0,0 +1,126 @@
+Kernel driver w83627ehf
+=======================
+
+Supported chips:
+ * Winbond W83627EHF/EHG/DHG (ISA access ONLY)
+ Prefix: 'w83627ehf'
+ Addresses scanned: ISA address retrieved from Super I/O registers
+ Datasheet:
+ http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83627EHF_%20W83627EHGb.pdf
+ DHG datasheet confidential.
+
+Authors:
+ Jean Delvare <khali@linux-fr.org>
+ Yuan Mu (Winbond)
+ Rudolf Marek <r.marek@assembler.cz>
+ David Hubbard <david.c.hubbard@gmail.com>
+
+Description
+-----------
+
+This driver implements support for the Winbond W83627EHF, W83627EHG, and
+W83627DHG super I/O chips. We will refer to them collectively as Winbond chips.
+
+The chips implement three temperature sensors, five fan rotation
+speed sensors, ten analog voltage sensors (only nine for the 627DHG), one
+VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG), alarms with beep
+warnings (control unimplemented), and some automatic fan regulation
+strategies (plus manual fan control mode).
+
+Temperatures are measured in degrees Celsius and measurement resolution is 1
+degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when
+the temperature gets higher than high limit; it stays on until the temperature
+falls below the hysteresis value.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4, 8, 16, 32, 64 or
+128) to give the readings more range or accuracy. The driver sets the most
+suitable fan divisor itself. Some fans might not be present because they
+share pins with other functions.
+
+Voltage sensors (also known as IN sensors) report their values in millivolts.
+An alarm is triggered if the voltage has crossed a programmable minimum
+or maximum limit.
+
+The driver supports automatic fan control mode known as Thermal Cruise.
+In this mode, the chip attempts to keep the measured temperature in a
+predefined temperature range. If the temperature goes out of range, fan
+is driven slower/faster to reach the predefined range again.
+
+The mode works for fan1-fan4. Mapping of temperatures to pwm outputs is as
+follows:
+
+temp1 -> pwm1
+temp2 -> pwm2
+temp3 -> pwm3
+prog -> pwm4 (the programmable setting is not supported by the driver)
+
+/sys files
+----------
+
+name - this is a standard hwmon device entry. For the W83627EHF and W83627EHG,
+ it is set to "w83627ehf" and for the W83627DHG it is set to "w83627dhg"
+
+pwm[1-4] - this file stores PWM duty cycle or DC value (fan speed) in range:
+ 0 (stop) to 255 (full)
+
+pwm[1-4]_enable - this file controls mode of fan/temperature control:
+ * 1 Manual Mode, write to pwm file any value 0-255 (full speed)
+ * 2 Thermal Cruise
+
+Thermal Cruise mode
+-------------------
+
+If the temperature is in the range defined by:
+
+pwm[1-4]_target - set target temperature, unit millidegree Celsius
+ (range 0 - 127000)
+pwm[1-4]_tolerance - tolerance, unit millidegree Celsius (range 0 - 15000)
+
+there are no changes to fan speed. Once the temperature leaves the interval,
+fan speed increases (temp is higher) or decreases if lower than desired.
+There are defined steps and times, but not exported by the driver yet.
+
+pwm[1-4]_min_output - minimum fan speed (range 1 - 255), when the temperature
+ is below defined range.
+pwm[1-4]_stop_time - how many milliseconds [ms] must elapse to switch
+ corresponding fan off. (when the temperature was below
+ defined range).
+
+Note: last two functions are influenced by other control bits, not yet exported
+ by the driver, so a change might not have any effect.
+
+Implementation Details
+----------------------
+
+Future driver development should bear in mind that the following registers have
+different functions on the 627EHF and the 627DHG. Some registers also have
+different power-on default values, but BIOS should already be loading
+appropriate defaults. Note that bank selection must be performed as is currently
+done in the driver for all register addresses.
+
+0x49: only on DHG, selects temperature source for AUX fan, CPU fan0
+0x4a: not completely documented for the EHF and the DHG documentation assigns
+ different behavior to bits 7 and 6, including extending the temperature
+ input selection to SmartFan I, not just SmartFan III. Testing on the EHF
+ will reveal whether they are compatible or not.
+
+0x58: Chip ID: 0xa1=EHF 0xc1=DHG
+0x5e: only on DHG, has bits to enable "current mode" temperature detection and
+ critical temperature protection
+0x45b: only on EHF, bit 3, vin4 alarm (EHF supports 10 inputs, only 9 on DHG)
+0x552: only on EHF, vin4
+0x558: only on EHF, vin4 high limit
+0x559: only on EHF, vin4 low limit
+0x6b: only on DHG, SYS fan critical temperature
+0x6c: only on DHG, CPU fan0 critical temperature
+0x6d: only on DHG, AUX fan critical temperature
+0x6e: only on DHG, CPU fan1 critical temperature
+
+0x50-0x55 and 0x650-0x657 are marked "Test Register" for the EHF, but "Reserved
+ Register" for the DHG
+
+The DHG also supports PECI, where the DHG queries Intel CPU temperatures, and
+the ICH8 southbridge gets that data via PECI from the DHG, so that the
+southbridge drives the fans. And the DHG supports SST, a one-wire serial bus.
diff --git a/Documentation/hwmon/w83627hf b/Documentation/hwmon/w83627hf
new file mode 100644
index 0000000..6ee36db
--- /dev/null
+++ b/Documentation/hwmon/w83627hf
@@ -0,0 +1,72 @@
+Kernel driver w83627hf
+======================
+
+Supported chips:
+ * Winbond W83627HF (ISA accesses ONLY)
+ Prefix: 'w83627hf'
+ Addresses scanned: ISA address retrieved from Super I/O registers
+ Datasheet: http://www.winbond.com/PDF/sheet/w83627hf.pdf
+ * Winbond W83627THF
+ Prefix: 'w83627thf'
+ Addresses scanned: ISA address retrieved from Super I/O registers
+ Datasheet: http://www.winbond.com/PDF/sheet/w83627thf.pdf
+ * Winbond W83697HF
+ Prefix: 'w83697hf'
+ Addresses scanned: ISA address retrieved from Super I/O registers
+ Datasheet: http://www.winbond.com/PDF/sheet/697hf.pdf
+ * Winbond W83637HF
+ Prefix: 'w83637hf'
+ Addresses scanned: ISA address retrieved from Super I/O registers
+ Datasheet: http://www.winbond.com/PDF/sheet/w83637hf.pdf
+ * Winbond W83687THF
+ Prefix: 'w83687thf'
+ Addresses scanned: ISA address retrieved from Super I/O registers
+ Datasheet: Provided by Winbond on request
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Mark Studebaker <mdsxyz123@yahoo.com>,
+ Bernhard C. Schrenk <clemy@clemy.org>
+
+Module Parameters
+-----------------
+
+* force_addr: int
+ Initialize the ISA address of the sensors
+* force_i2c: int
+ Initialize the I2C address of the sensors
+* init: int
+ (default is 1)
+ Use 'init=0' to bypass initializing the chip.
+ Try this if your computer crashes when you load the module.
+
+Description
+-----------
+
+This driver implements support for ISA accesses *only* for
+the Winbond W83627HF, W83627THF, W83697HF and W83637HF Super I/O chips.
+We will refer to them collectively as Winbond chips.
+
+This driver supports ISA accesses, which should be more reliable
+than i2c accesses. Also, for Tyan boards which contain both a
+Super I/O chip and a second i2c-only Winbond chip (often a W83782D),
+using this driver will avoid i2c address conflicts and complex
+initialization that were required in the w83781d driver.
+
+If you really want i2c accesses for these Super I/O chips,
+use the w83781d driver. However this is not the preferred method
+now that this ISA driver has been developed.
+
+The w83627_HF_ uses pins 110-106 as VID0-VID4. The w83627_THF_ uses the
+same pins as GPIO[0:4]. Technically, the w83627_THF_ does not support a
+VID reading. However the two chips have the identical 128 pin package. So,
+it is possible or even likely for a w83627thf to have the VID signals routed
+to these pins despite their not being labeled for that purpose. Therefore,
+the w83627thf driver interprets these as VID. If the VID on your board
+doesn't work, first see doc/vid in the lm_sensors package[1]. If that still
+doesn't help, you may just ignore the bogus VID reading with no harm done.
+
+For further information on this driver see the w83781d driver documentation.
+
+[1] http://www.lm-sensors.org/browser/lm-sensors/trunk/doc/vid
diff --git a/Documentation/hwmon/w83781d b/Documentation/hwmon/w83781d
new file mode 100644
index 0000000..c91e0b6
--- /dev/null
+++ b/Documentation/hwmon/w83781d
@@ -0,0 +1,453 @@
+Kernel driver w83781d
+=====================
+
+Supported chips:
+ * Winbond W83781D
+ Prefix: 'w83781d'
+ Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports)
+ Datasheet: http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/w83781d.pdf
+ * Winbond W83782D
+ Prefix: 'w83782d'
+ Addresses scanned: I2C 0x28 - 0x2f, ISA 0x290 (8 I/O ports)
+ Datasheet: http://www.winbond.com/PDF/sheet/w83782d.pdf
+ * Winbond W83783S
+ Prefix: 'w83783s'
+ Addresses scanned: I2C 0x2d
+ Datasheet: http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/w83783s.pdf
+ * Asus AS99127F
+ Prefix: 'as99127f'
+ Addresses scanned: I2C 0x28 - 0x2f
+ Datasheet: Unavailable from Asus
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Mark Studebaker <mdsxyz123@yahoo.com>
+
+Module parameters
+-----------------
+
+* init int
+ (default 1)
+ Use 'init=0' to bypass initializing the chip.
+ Try this if your computer crashes when you load the module.
+
+* reset int
+ (default 0)
+ The driver used to reset the chip on load, but does no more. Use
+ 'reset=1' to restore the old behavior. Report if you need to do this.
+
+force_subclients=bus,caddr,saddr,saddr
+ This is used to force the i2c addresses for subclients of
+ a certain chip. Typical usage is `force_subclients=0,0x2d,0x4a,0x4b'
+ to force the subclients of chip 0x2d on bus 0 to i2c addresses
+ 0x4a and 0x4b. This parameter is useful for certain Tyan boards.
+
+Description
+-----------
+
+This driver implements support for the Winbond W83781D, W83782D, W83783S
+chips, and the Asus AS99127F chips. We will refer to them collectively as
+W8378* chips.
+
+There is quite some difference between these chips, but they are similar
+enough that it was sensible to put them together in one driver.
+The Asus chips are similar to an I2C-only W83782D.
+
+Chip #vin #fanin #pwm #temp wchipid vendid i2c ISA
+as99127f 7 3 0 3 0x31 0x12c3 yes no
+as99127f rev.2 (type_name = as99127f) 0x31 0x5ca3 yes no
+w83781d 7 3 0 3 0x10-1 0x5ca3 yes yes
+w83782d 9 3 2-4 3 0x30 0x5ca3 yes yes
+w83783s 5-6 3 2 1-2 0x40 0x5ca3 yes no
+
+Detection of these chips can sometimes be foiled because they can be in
+an internal state that allows no clean access. If you know the address
+of the chip, use a 'force' parameter; this will put them into a more
+well-behaved state first.
+
+The W8378* implements temperature sensors (three on the W83781D and W83782D,
+two on the W83783S), three fan rotation speed sensors, voltage sensors
+(seven on the W83781D, nine on the W83782D and six on the W83783S), VID
+lines, alarms with beep warnings, and some miscellaneous stuff.
+
+Temperatures are measured in degrees Celsius. There is always one main
+temperature sensor, and one (W83783S) or two (W83781D and W83782D) other
+sensors. An alarm is triggered for the main sensor once when the
+Overtemperature Shutdown limit is crossed; it is triggered again as soon as
+it drops below the Hysteresis value. A more useful behavior
+can be found by setting the Hysteresis value to +127 degrees Celsius; in
+this case, alarms are issued during all the time when the actual temperature
+is above the Overtemperature Shutdown value. The driver sets the
+hysteresis value for temp1 to 127 at initialization.
+
+For the other temperature sensor(s), an alarm is triggered when the
+temperature gets higher then the Overtemperature Shutdown value; it stays
+on until the temperature falls below the Hysteresis value. But on the
+W83781D, there is only one alarm that functions for both other sensors!
+Temperatures are guaranteed within a range of -55 to +125 degrees. The
+main temperature sensors has a resolution of 1 degree; the other sensor(s)
+of 0.5 degree.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4 or 8 for the
+W83781D; 1, 2, 4, 8, 16, 32, 64 or 128 for the others) to give
+the readings more range or accuracy. Not all RPM values can accurately
+be represented, so some rounding is done. With a divider of 2, the lowest
+representable value is around 2600 RPM.
+
+Voltage sensors (also known as IN sensors) report their values in volts.
+An alarm is triggered if the voltage has crossed a programmable minimum
+or maximum limit. Note that minimum in this case always means 'closest to
+zero'; this is important for negative voltage measurements. All voltage
+inputs can measure voltages between 0 and 4.08 volts, with a resolution
+of 0.016 volt.
+
+The VID lines encode the core voltage value: the voltage level your processor
+should work with. This is hardcoded by the mainboard and/or processor itself.
+It is a value in volts. When it is unconnected, you will often find the
+value 3.50 V here.
+
+The W83782D and W83783S temperature conversion machine understands about
+several kinds of temperature probes. You can program the so-called
+beta value in the sensor files. '1' is the PII/Celeron diode, '2' is the
+TN3904 transistor, and 3435 the default thermistor value. Other values
+are (not yet) supported.
+
+In addition to the alarms described above, there is a CHAS alarm on the
+chips which triggers if your computer case is open.
+
+When an alarm goes off, you can be warned by a beeping signal through
+your computer speaker. It is possible to enable all beeping globally,
+or only the beeping for some alarms.
+
+Individual alarm and beep bits:
+
+0x000001: in0
+0x000002: in1
+0x000004: in2
+0x000008: in3
+0x000010: temp1
+0x000020: temp2 (+temp3 on W83781D)
+0x000040: fan1
+0x000080: fan2
+0x000100: in4
+0x000200: in5
+0x000400: in6
+0x000800: fan3
+0x001000: chassis
+0x002000: temp3 (W83782D only)
+0x010000: in7 (W83782D only)
+0x020000: in8 (W83782D only)
+
+If an alarm triggers, it will remain triggered until the hardware register
+is read at least once. This means that the cause for the alarm may
+already have disappeared! Note that in the current implementation, all
+hardware registers are read whenever any data is read (unless it is less
+than 1.5 seconds since the last update). This means that you can easily
+miss once-only alarms.
+
+The chips only update values each 1.5 seconds; reading them more often
+will do no harm, but will return 'old' values.
+
+AS99127F PROBLEMS
+-----------------
+The as99127f support was developed without the benefit of a datasheet.
+In most cases it is treated as a w83781d (although revision 2 of the
+AS99127F looks more like a w83782d).
+This support will be BETA until a datasheet is released.
+One user has reported problems with fans stopping
+occasionally.
+
+Note that the individual beep bits are inverted from the other chips.
+The driver now takes care of this so that user-space applications
+don't have to know about it.
+
+Known problems:
+ - Problems with diode/thermistor settings (supported?)
+ - One user reports fans stopping under high server load.
+ - Revision 2 seems to have 2 PWM registers but we don't know
+ how to handle them. More details below.
+
+These will not be fixed unless we get a datasheet.
+If you have problems, please lobby Asus to release a datasheet.
+Unfortunately several others have without success.
+Please do not send mail to us asking for better as99127f support.
+We have done the best we can without a datasheet.
+Please do not send mail to the author or the sensors group asking for
+a datasheet or ideas on how to convince Asus. We can't help.
+
+
+NOTES:
+-----
+ 783s has no in1 so that in[2-6] are compatible with the 781d/782d.
+
+ 783s pin is programmable for -5V or temp1; defaults to -5V,
+ no control in driver so temp1 doesn't work.
+
+ 782d and 783s datasheets differ on which is pwm1 and which is pwm2.
+ We chose to follow 782d.
+
+ 782d and 783s pin is programmable for fan3 input or pwm2 output;
+ defaults to fan3 input.
+ If pwm2 is enabled (with echo 255 1 > pwm2), then
+ fan3 will report 0.
+
+ 782d has pwm1-2 for ISA, pwm1-4 for i2c. (pwm3-4 share pins with
+ the ISA pins)
+
+Data sheet updates:
+------------------
+ - PWM clock registers:
+
+ 000: master / 512
+ 001: master / 1024
+ 010: master / 2048
+ 011: master / 4096
+ 100: master / 8192
+
+
+Answers from Winbond tech support
+---------------------------------
+>
+> 1) In the W83781D data sheet section 7.2 last paragraph, it talks about
+> reprogramming the R-T table if the Beta of the thermistor is not
+> 3435K. The R-T table is described briefly in section 8.20.
+> What formulas do I use to program a new R-T table for a given Beta?
+>
+ We are sorry that the calculation for R-T table value is
+confidential. If you have another Beta value of thermistor, we can help
+to calculate the R-T table for you. But you should give us real R-T
+Table which can be gotten by thermistor vendor. Therefore we will calculate
+them and obtain 32-byte data, and you can fill the 32-byte data to the
+register in Bank0.CR51 of W83781D.
+
+
+> 2) In the W83782D data sheet, it mentions that pins 38, 39, and 40 are
+> programmable to be either thermistor or Pentium II diode inputs.
+> How do I program them for diode inputs? I can't find any register
+> to program these to be diode inputs.
+ --> You may program Bank0 CR[5Dh] and CR[59h] registers.
+
+ CR[5Dh] bit 1(VTIN1) bit 2(VTIN2) bit 3(VTIN3)
+
+ thermistor 0 0 0
+ diode 1 1 1
+
+
+(error) CR[59h] bit 4(VTIN1) bit 2(VTIN2) bit 3(VTIN3)
+(right) CR[59h] bit 4(VTIN1) bit 5(VTIN2) bit 6(VTIN3)
+
+ PII thermal diode 1 1 1
+ 2N3904 diode 0 0 0
+
+
+Asus Clones
+-----------
+
+We have no datasheets for the Asus clones (AS99127F and ASB100 Bach).
+Here are some very useful information that were given to us by Alex Van
+Kaam about how to detect these chips, and how to read their values. He
+also gives advice for another Asus chipset, the Mozart-2 (which we
+don't support yet). Thanks Alex!
+I reworded some parts and added personal comments.
+
+# Detection:
+
+AS99127F rev.1, AS99127F rev.2 and ASB100:
+- I2C address range: 0x29 - 0x2F
+- If register 0x58 holds 0x31 then we have an Asus (either ASB100 or
+ AS99127F)
+- Which one depends on register 0x4F (manufacturer ID):
+ 0x06 or 0x94: ASB100
+ 0x12 or 0xC3: AS99127F rev.1
+ 0x5C or 0xA3: AS99127F rev.2
+ Note that 0x5CA3 is Winbond's ID (WEC), which let us think Asus get their
+ AS99127F rev.2 direct from Winbond. The other codes mean ATT and DVC,
+ respectively. ATT could stand for Asustek something (although it would be
+ very badly chosen IMHO), I don't know what DVC could stand for. Maybe
+ these codes simply aren't meant to be decoded that way.
+
+Mozart-2:
+- I2C address: 0x77
+- If register 0x58 holds 0x56 or 0x10 then we have a Mozart-2
+- Of the Mozart there are 3 types:
+ 0x58=0x56, 0x4E=0x94, 0x4F=0x36: Asus ASM58 Mozart-2
+ 0x58=0x56, 0x4E=0x94, 0x4F=0x06: Asus AS2K129R Mozart-2
+ 0x58=0x10, 0x4E=0x5C, 0x4F=0xA3: Asus ??? Mozart-2
+ You can handle all 3 the exact same way :)
+
+# Temperature sensors:
+
+ASB100:
+- sensor 1: register 0x27
+- sensor 2 & 3 are the 2 LM75's on the SMBus
+- sensor 4: register 0x17
+Remark: I noticed that on Intel boards sensor 2 is used for the CPU
+ and 4 is ignored/stuck, on AMD boards sensor 4 is the CPU and sensor 2 is
+ either ignored or a socket temperature.
+
+AS99127F (rev.1 and 2 alike):
+- sensor 1: register 0x27
+- sensor 2 & 3 are the 2 LM75's on the SMBus
+Remark: Register 0x5b is suspected to be temperature type selector. Bit 1
+ would control temp1, bit 3 temp2 and bit 5 temp3.
+
+Mozart-2:
+- sensor 1: register 0x27
+- sensor 2: register 0x13
+
+# Fan sensors:
+
+ASB100, AS99127F (rev.1 and 2 alike):
+- 3 fans, identical to the W83781D
+
+Mozart-2:
+- 2 fans only, 1350000/RPM/div
+- fan 1: register 0x28, divisor on register 0xA1 (bits 4-5)
+- fan 2: register 0x29, divisor on register 0xA1 (bits 6-7)
+
+# Voltages:
+
+This is where there is a difference between AS99127F rev.1 and 2.
+Remark: The difference is similar to the difference between
+ W83781D and W83782D.
+
+ASB100:
+in0=r(0x20)*0.016
+in1=r(0x21)*0.016
+in2=r(0x22)*0.016
+in3=r(0x23)*0.016*1.68
+in4=r(0x24)*0.016*3.8
+in5=r(0x25)*(-0.016)*3.97
+in6=r(0x26)*(-0.016)*1.666
+
+AS99127F rev.1:
+in0=r(0x20)*0.016
+in1=r(0x21)*0.016
+in2=r(0x22)*0.016
+in3=r(0x23)*0.016*1.68
+in4=r(0x24)*0.016*3.8
+in5=r(0x25)*(-0.016)*3.97
+in6=r(0x26)*(-0.016)*1.503
+
+AS99127F rev.2:
+in0=r(0x20)*0.016
+in1=r(0x21)*0.016
+in2=r(0x22)*0.016
+in3=r(0x23)*0.016*1.68
+in4=r(0x24)*0.016*3.8
+in5=(r(0x25)*0.016-3.6)*5.14+3.6
+in6=(r(0x26)*0.016-3.6)*3.14+3.6
+
+Mozart-2:
+in0=r(0x20)*0.016
+in1=255
+in2=r(0x22)*0.016
+in3=r(0x23)*0.016*1.68
+in4=r(0x24)*0.016*4
+in5=255
+in6=255
+
+
+# PWM
+
+* Additional info about PWM on the AS99127F (may apply to other Asus
+chips as well) by Jean Delvare as of 2004-04-09:
+
+AS99127F revision 2 seems to have two PWM registers at 0x59 and 0x5A,
+and a temperature sensor type selector at 0x5B (which basically means
+that they swapped registers 0x59 and 0x5B when you compare with Winbond
+chips).
+Revision 1 of the chip also has the temperature sensor type selector at
+0x5B, but PWM registers have no effect.
+
+We don't know exactly how the temperature sensor type selection works.
+Looks like bits 1-0 are for temp1, bits 3-2 for temp2 and bits 5-4 for
+temp3, although it is possible that only the most significant bit matters
+each time. So far, values other than 0 always broke the readings.
+
+PWM registers seem to be split in two parts: bit 7 is a mode selector,
+while the other bits seem to define a value or threshold.
+
+When bit 7 is clear, bits 6-0 seem to hold a threshold value. If the value
+is below a given limit, the fan runs at low speed. If the value is above
+the limit, the fan runs at full speed. We have no clue as to what the limit
+represents. Note that there seem to be some inertia in this mode, speed
+changes may need some time to trigger. Also, an hysteresis mechanism is
+suspected since walking through all the values increasingly and then
+decreasingly led to slightly different limits.
+
+When bit 7 is set, bits 3-0 seem to hold a threshold value, while bits 6-4
+would not be significant. If the value is below a given limit, the fan runs
+at full speed, while if it is above the limit it runs at low speed (so this
+is the contrary of the other mode, in a way). Here again, we don't know
+what the limit is supposed to represent.
+
+One remarkable thing is that the fans would only have two or three
+different speeds (transitional states left apart), not a whole range as
+you usually get with PWM.
+
+As a conclusion, you can write 0x00 or 0x8F to the PWM registers to make
+fans run at low speed, and 0x7F or 0x80 to make them run at full speed.
+
+Please contact us if you can figure out how it is supposed to work. As
+long as we don't know more, the w83781d driver doesn't handle PWM on
+AS99127F chips at all.
+
+* Additional info about PWM on the AS99127F rev.1 by Hector Martin:
+
+I've been fiddling around with the (in)famous 0x59 register and
+found out the following values do work as a form of coarse pwm:
+
+0x80 - seems to turn fans off after some time(1-2 minutes)... might be
+some form of auto-fan-control based on temp? hmm (Qfan? this mobo is an
+old ASUS, it isn't marketed as Qfan. Maybe some beta pre-attemp at Qfan
+that was dropped at the BIOS)
+0x81 - off
+0x82 - slightly "on-ner" than off, but my fans do not get to move. I can
+hear the high-pitched PWM sound that motors give off at too-low-pwm.
+0x83 - now they do move. Estimate about 70% speed or so.
+0x84-0x8f - full on
+
+Changing the high nibble doesn't seem to do much except the high bit
+(0x80) must be set for PWM to work, else the current pwm doesn't seem to
+change.
+
+My mobo is an ASUS A7V266-E. This behavior is similar to what I got
+with speedfan under Windows, where 0-15% would be off, 15-2x% (can't
+remember the exact value) would be 70% and higher would be full on.
+
+* Additional info about PWM on the AS99127F rev.1 from lm-sensors
+ ticket #2350:
+
+I conducted some experiment on Asus P3B-F motherboard with AS99127F
+(Ver. 1).
+
+I confirm that 0x59 register control the CPU_Fan Header on this
+motherboard, and 0x5a register control PWR_Fan.
+
+In order to reduce the dependency of specific fan, the measurement is
+conducted with a digital scope without fan connected. I found out that
+P3B-F actually output variable DC voltage on fan header center pin,
+looks like PWM is filtered on this motherboard.
+
+Here are some of measurements:
+
+0x80 20 mV
+0x81 20 mV
+0x82 232 mV
+0x83 1.2 V
+0x84 2.31 V
+0x85 3.44 V
+0x86 4.62 V
+0x87 5.81 V
+0x88 7.01 V
+9x89 8.22 V
+0x8a 9.42 V
+0x8b 10.6 V
+0x8c 11.9 V
+0x8d 12.4 V
+0x8e 12.4 V
+0x8f 12.4 V
diff --git a/Documentation/hwmon/w83791d b/Documentation/hwmon/w83791d
new file mode 100644
index 0000000..5663e49
--- /dev/null
+++ b/Documentation/hwmon/w83791d
@@ -0,0 +1,161 @@
+Kernel driver w83791d
+=====================
+
+Supported chips:
+ * Winbond W83791D
+ Prefix: 'w83791d'
+ Addresses scanned: I2C 0x2c - 0x2f
+ Datasheet: http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83791D_W83791Gb.pdf
+
+Author: Charles Spirakis <bezaur@gmail.com>
+
+This driver was derived from the w83781d.c and w83792d.c source files.
+
+Credits:
+ w83781d.c:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ and Mark Studebaker <mdsxyz123@yahoo.com>
+ w83792d.c:
+ Chunhao Huang <DZShen@Winbond.com.tw>,
+ Rudolf Marek <r.marek@assembler.cz>
+
+Additional contributors:
+ Sven Anders <anders@anduras.de>
+ Marc Hulsman <m.hulsman@tudelft.nl>
+
+Module Parameters
+-----------------
+
+* init boolean
+ (default 0)
+ Use 'init=1' to have the driver do extra software initializations.
+ The default behavior is to do the minimum initialization possible
+ and depend on the BIOS to properly setup the chip. If you know you
+ have a w83791d and you're having problems, try init=1 before trying
+ reset=1.
+
+* reset boolean
+ (default 0)
+ Use 'reset=1' to reset the chip (via index 0x40, bit 7). The default
+ behavior is no chip reset to preserve BIOS settings.
+
+* force_subclients=bus,caddr,saddr,saddr
+ This is used to force the i2c addresses for subclients of
+ a certain chip. Example usage is `force_subclients=0,0x2f,0x4a,0x4b'
+ to force the subclients of chip 0x2f on bus 0 to i2c addresses
+ 0x4a and 0x4b.
+
+
+Description
+-----------
+
+This driver implements support for the Winbond W83791D chip. The W83791G
+chip appears to be the same as the W83791D but is lead free.
+
+Detection of the chip can sometimes be foiled because it can be in an
+internal state that allows no clean access (Bank with ID register is not
+currently selected). If you know the address of the chip, use a 'force'
+parameter; this will put it into a more well-behaved state first.
+
+The driver implements three temperature sensors, ten voltage sensors,
+five fan rotation speed sensors and manual PWM control of each fan.
+
+Temperatures are measured in degrees Celsius and measurement resolution is 1
+degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when
+the temperature gets higher than the Overtemperature Shutdown value; it stays
+on until the temperature falls below the Hysteresis value.
+
+Voltage sensors (also known as IN sensors) report their values in millivolts.
+An alarm is triggered if the voltage has crossed a programmable minimum
+or maximum limit.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4, 8, 16,
+32, 64 or 128 for all fans) to give the readings more range or accuracy.
+
+Each fan controlled is controlled by PWM. The PWM duty cycle can be read and
+set for each fan separately. Valid values range from 0 (stop) to 255 (full).
+PWM 1-3 support Thermal Cruise mode, in which the PWMs are automatically
+regulated to keep respectively temp 1-3 at a certain target temperature.
+See below for the description of the sysfs-interface.
+
+The w83791d has a global bit used to enable beeping from the speaker when an
+alarm is triggered as well as a bitmask to enable or disable the beep for
+specific alarms. You need both the global beep enable bit and the
+corresponding beep bit to be on for a triggered alarm to sound a beep.
+
+The sysfs interface to the global enable is via the sysfs beep_enable file.
+This file is used for both legacy and new code.
+
+The sysfs interface to the beep bitmask has migrated from the original legacy
+method of a single sysfs beep_mask file to a newer method using multiple
+*_beep files as described in .../Documentation/hwmon/sysfs-interface.
+
+A similar change has occured for the bitmap corresponding to the alarms. The
+original legacy method used a single sysfs alarms file containing a bitmap
+of triggered alarms. The newer method uses multiple sysfs *_alarm files
+(again following the pattern described in sysfs-interface).
+
+Since both methods read and write the underlying hardware, they can be used
+interchangeably and changes in one will automatically be reflected by
+the other. If you use the legacy bitmask method, your user-space code is
+responsible for handling the fact that the alarms and beep_mask bitmaps
+are not the same (see the table below).
+
+NOTE: All new code should be written to use the newer sysfs-interface
+specification as that avoids bitmap problems and is the preferred interface
+going forward.
+
+The driver reads the hardware chip values at most once every three seconds.
+User mode code requesting values more often will receive cached values.
+
+/sys files
+----------
+The sysfs-interface is documented in the 'sysfs-interface' file. Only
+chip-specific options are documented here.
+
+pwm[1-3]_enable - this file controls mode of fan/temperature control for
+ fan 1-3. Fan/PWM 4-5 only support manual mode.
+ * 1 Manual mode
+ * 2 Thermal Cruise mode
+ * 3 Fan Speed Cruise mode (no further support)
+
+temp[1-3]_target - defines the target temperature for Thermal Cruise mode.
+ Unit: millidegree Celsius
+ RW
+
+temp[1-3]_tolerance - temperature tolerance for Thermal Cruise mode.
+ Specifies an interval around the target temperature
+ in which the fan speed is not changed.
+ Unit: millidegree Celsius
+ RW
+
+Alarms bitmap vs. beep_mask bitmask
+------------------------------------
+For legacy code using the alarms and beep_mask files:
+
+in0 (VCORE) : alarms: 0x000001 beep_mask: 0x000001
+in1 (VINR0) : alarms: 0x000002 beep_mask: 0x002000 <== mismatch
+in2 (+3.3VIN): alarms: 0x000004 beep_mask: 0x000004
+in3 (5VDD) : alarms: 0x000008 beep_mask: 0x000008
+in4 (+12VIN) : alarms: 0x000100 beep_mask: 0x000100
+in5 (-12VIN) : alarms: 0x000200 beep_mask: 0x000200
+in6 (-5VIN) : alarms: 0x000400 beep_mask: 0x000400
+in7 (VSB) : alarms: 0x080000 beep_mask: 0x010000 <== mismatch
+in8 (VBAT) : alarms: 0x100000 beep_mask: 0x020000 <== mismatch
+in9 (VINR1) : alarms: 0x004000 beep_mask: 0x004000
+temp1 : alarms: 0x000010 beep_mask: 0x000010
+temp2 : alarms: 0x000020 beep_mask: 0x000020
+temp3 : alarms: 0x002000 beep_mask: 0x000002 <== mismatch
+fan1 : alarms: 0x000040 beep_mask: 0x000040
+fan2 : alarms: 0x000080 beep_mask: 0x000080
+fan3 : alarms: 0x000800 beep_mask: 0x000800
+fan4 : alarms: 0x200000 beep_mask: 0x200000
+fan5 : alarms: 0x400000 beep_mask: 0x400000
+tart1 : alarms: 0x010000 beep_mask: 0x040000 <== mismatch
+tart2 : alarms: 0x020000 beep_mask: 0x080000 <== mismatch
+tart3 : alarms: 0x040000 beep_mask: 0x100000 <== mismatch
+case_open : alarms: 0x001000 beep_mask: 0x001000
+global_enable: alarms: -------- beep_mask: 0x800000 (modified via beep_enable)
diff --git a/Documentation/hwmon/w83792d b/Documentation/hwmon/w83792d
new file mode 100644
index 0000000..14a668e
--- /dev/null
+++ b/Documentation/hwmon/w83792d
@@ -0,0 +1,174 @@
+Kernel driver w83792d
+=====================
+
+Supported chips:
+ * Winbond W83792D
+ Prefix: 'w83792d'
+ Addresses scanned: I2C 0x2c - 0x2f
+ Datasheet: http://www.winbond.com.tw/E-WINBONDHTM/partner/PDFresult.asp?Pname=1035
+
+Author: Chunhao Huang
+Contact: DZShen <DZShen@Winbond.com.tw>
+
+
+Module Parameters
+-----------------
+
+* init int
+ (default 1)
+ Use 'init=0' to bypass initializing the chip.
+ Try this if your computer crashes when you load the module.
+
+* force_subclients=bus,caddr,saddr,saddr
+ This is used to force the i2c addresses for subclients of
+ a certain chip. Example usage is `force_subclients=0,0x2f,0x4a,0x4b'
+ to force the subclients of chip 0x2f on bus 0 to i2c addresses
+ 0x4a and 0x4b.
+
+
+Description
+-----------
+
+This driver implements support for the Winbond W83792AD/D.
+
+Detection of the chip can sometimes be foiled because it can be in an
+internal state that allows no clean access (Bank with ID register is not
+currently selected). If you know the address of the chip, use a 'force'
+parameter; this will put it into a more well-behaved state first.
+
+The driver implements three temperature sensors, seven fan rotation speed
+sensors, nine voltage sensors, and two automatic fan regulation
+strategies called: Smart Fan I (Thermal Cruise mode) and Smart Fan II.
+Automatic fan control mode is possible only for fan1-fan3. Fan4-fan7 can run
+synchronized with selected fan (fan1-fan3). This functionality and manual PWM
+control for fan4-fan7 is not yet implemented.
+
+Temperatures are measured in degrees Celsius and measurement resolution is 1
+degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when
+the temperature gets higher than the Overtemperature Shutdown value; it stays
+on until the temperature falls below the Hysteresis value.
+
+Fan rotation speeds are reported in RPM (rotations per minute). An alarm is
+triggered if the rotation speed has dropped below a programmable limit. Fan
+readings can be divided by a programmable divider (1, 2, 4, 8, 16, 32, 64 or
+128) to give the readings more range or accuracy.
+
+Voltage sensors (also known as IN sensors) report their values in millivolts.
+An alarm is triggered if the voltage has crossed a programmable minimum
+or maximum limit.
+
+Alarms are provided as output from "realtime status register". Following bits
+are defined:
+
+bit - alarm on:
+0 - in0
+1 - in1
+2 - temp1
+3 - temp2
+4 - temp3
+5 - fan1
+6 - fan2
+7 - fan3
+8 - in2
+9 - in3
+10 - in4
+11 - in5
+12 - in6
+13 - VID change
+14 - chassis
+15 - fan7
+16 - tart1
+17 - tart2
+18 - tart3
+19 - in7
+20 - in8
+21 - fan4
+22 - fan5
+23 - fan6
+
+Tart will be asserted while target temperature cannot be achieved after 3 minutes
+of full speed rotation of corresponding fan.
+
+In addition to the alarms described above, there is a CHAS alarm on the chips
+which triggers if your computer case is open (This one is latched, contrary
+to realtime alarms).
+
+The chips only update values each 3 seconds; reading them more often will
+do no harm, but will return 'old' values.
+
+
+W83792D PROBLEMS
+----------------
+Known problems:
+ - This driver is only for Winbond W83792D C version device, there
+ are also some motherboards with B version W83792D device. The
+ calculation method to in6-in7(measured value, limits) is a little
+ different between C and B version. C or B version can be identified
+ by CR[0x49h].
+ - The function of vid and vrm has not been finished, because I'm NOT
+ very familiar with them. Adding support is welcome.
+  - The function of chassis open detection needs more tests.
+ - If you have ASUS server board and chip was not found: Then you will
+ need to upgrade to latest (or beta) BIOS. If it does not help please
+ contact us.
+
+Fan control
+-----------
+
+Manual mode
+-----------
+
+Works as expected. You just need to specify desired PWM/DC value (fan speed)
+in appropriate pwm# file.
+
+Thermal cruise
+--------------
+
+In this mode, W83792D provides the Smart Fan system to automatically control
+fan speed to keep the temperatures of CPU and the system within specific
+range. At first a wanted temperature and interval must be set. This is done
+via thermal_cruise# file. The tolerance# file serves to create T +- tolerance
+interval. The fan speed will be lowered as long as the current temperature
+remains below the thermal_cruise# +- tolerance# value. Once the temperature
+exceeds the high limit (T+tolerance), the fan will be turned on with a
+specific speed set by pwm# and automatically controlled its PWM duty cycle
+with the temperature varying. Three conditions may occur:
+
+(1) If the temperature still exceeds the high limit, PWM duty
+cycle will increase slowly.
+
+(2) If the temperature goes below the high limit, but still above the low
+limit (T-tolerance), the fan speed will be fixed at the current speed because
+the temperature is in the target range.
+
+(3) If the temperature goes below the low limit, PWM duty cycle will decrease
+slowly to 0 or a preset stop value until the temperature exceeds the low
+limit. (The preset stop value handling is not yet implemented in driver)
+
+Smart Fan II
+------------
+
+W83792D also provides a special mode for fan. Four temperature points are
+available. When related temperature sensors detects the temperature in preset
+temperature region (sf2_point@_fan# +- tolerance#) it will cause fans to run
+on programmed value from sf2_level@_fan#. You need to set four temperatures
+for each fan.
+
+
+/sys files
+----------
+
+pwm[1-3] - this file stores PWM duty cycle or DC value (fan speed) in range:
+ 0 (stop) to 255 (full)
+pwm[1-3]_enable - this file controls mode of fan/temperature control:
+ * 0 Disabled
+ * 1 Manual mode
+ * 2 Smart Fan II
+ * 3 Thermal Cruise
+pwm[1-3]_mode - Select PWM of DC mode
+ * 0 DC
+ * 1 PWM
+thermal_cruise[1-3] - Selects the desired temperature for cruise (degC)
+tolerance[1-3] - Value in degrees of Celsius (degC) for +- T
+sf2_point[1-4]_fan[1-3] - four temperature points for each fan for Smart Fan II
+sf2_level[1-3]_fan[1-3] - three PWM/DC levels for each fan for Smart Fan II
diff --git a/Documentation/hwmon/w83793 b/Documentation/hwmon/w83793
new file mode 100644
index 0000000..51171a8
--- /dev/null
+++ b/Documentation/hwmon/w83793
@@ -0,0 +1,106 @@
+Kernel driver w83793
+====================
+
+Supported chips:
+ * Winbond W83793G/W83793R
+ Prefix: 'w83793'
+ Addresses scanned: I2C 0x2c - 0x2f
+ Datasheet: Still not published
+
+Authors:
+ Yuan Mu (Winbond Electronics)
+ Rudolf Marek <r.marek@assembler.cz>
+
+
+Module parameters
+-----------------
+
+* reset int
+ (default 0)
+ This parameter is not recommended, it will lose motherboard specific
+ settings. Use 'reset=1' to reset the chip when loading this module.
+
+* force_subclients=bus,caddr,saddr1,saddr2
+ This is used to force the i2c addresses for subclients of
+ a certain chip. Typical usage is `force_subclients=0,0x2f,0x4a,0x4b'
+ to force the subclients of chip 0x2f on bus 0 to i2c addresses
+ 0x4a and 0x4b.
+
+
+Description
+-----------
+
+This driver implements support for Winbond W83793G/W83793R chips.
+
+* Exported features
+ This driver exports 10 voltage sensors, up to 12 fan tachometer inputs,
+ 6 remote temperatures, up to 8 sets of PWM fan controls, SmartFan
+ (automatic fan speed control) on all temperature/PWM combinations, 2
+ sets of 6-pin CPU VID input.
+
+* Sensor resolutions
+ If your motherboard maker used the reference design, the resolution of
+ voltage0-2 is 2mV, resolution of voltage3/4/5 is 16mV, 8mV for voltage6,
+ 24mV for voltage7/8. Temp1-4 have a 0.25 degree Celsius resolution,
+ temp5-6 have a 1 degree Celsiis resolution.
+
+* Temperature sensor types
+ Temp1-4 have 2 possible types. It can be read from (and written to)
+ temp[1-4]_type.
+ - If the value is 3, it starts monitoring using a remote termal diode
+ (default).
+ - If the value is 6, it starts monitoring using the temperature sensor
+ in Intel CPU and get result by PECI.
+ Temp5-6 can be connected to external thermistors (value of
+ temp[5-6]_type is 4).
+
+* Alarm mechanism
+ For voltage sensors, an alarm triggers if the measured value is below
+ the low voltage limit or over the high voltage limit.
+ For temperature sensors, an alarm triggers if the measured value goes
+ above the high temperature limit, and wears off only after the measured
+ value drops below the hysteresis value.
+ For fan sensors, an alarm triggers if the measured value is below the
+ low speed limit.
+
+* SmartFan/PWM control
+ If you want to set a pwm fan to manual mode, you just need to make sure it
+ is not controlled by any temp channel, for example, you want to set fan1
+ to manual mode, you need to check the value of temp[1-6]_fan_map, make
+ sure bit 0 is cleared in the 6 values. And then set the pwm1 value to
+ control the fan.
+
+ Each temperature channel can control all the 8 PWM outputs (by setting the
+ corresponding bit in tempX_fan_map), you can set the temperature channel
+ mode using temp[1-6]_pwm_enable, 2 is Thermal Cruise mode and 3
+ is the SmartFanII mode. Temperature channels will try to speed up or
+ slow down all controlled fans, this means one fan can receive different
+ PWM value requests from different temperature channels, but the chip
+ will always pick the safest (max) PWM value for each fan.
+
+ In Thermal Cruise mode, the chip attempts to keep the temperature at a
+ predefined value, within a tolerance margin. So if tempX_input >
+ thermal_cruiseX + toleranceX, the chip will increase the PWM value,
+ if tempX_input < thermal_cruiseX - toleranceX, the chip will decrease
+ the PWM value. If the temperature is within the tolerance range, the PWM
+ value is left unchanged.
+
+ SmartFanII works differently, you have to define up to 7 PWM, temperature
+ trip points, defining a PWM/temperature curve which the chip will follow.
+ While not fundamentally different from the Thermal Cruise mode, the
+ implementation is quite different, giving you a finer-grained control.
+
+* Chassis
+ If the case open alarm triggers, it will stay in this state unless cleared
+ by any write to the sysfs file "chassis".
+
+* VID and VRM
+ The VRM version is detected automatically, don't modify the it unless you
+ *do* know the cpu VRM version and it's not properly detected.
+
+
+Notes
+-----
+
+ Only Fan1-5 and PWM1-3 are guaranteed to always exist, other fan inputs and
+ PWM outputs may or may not exist depending on the chip pin configuration.
diff --git a/Documentation/hwmon/w83l785ts b/Documentation/hwmon/w83l785ts
new file mode 100644
index 0000000..bd1fa9d
--- /dev/null
+++ b/Documentation/hwmon/w83l785ts
@@ -0,0 +1,40 @@
+Kernel driver w83l785ts
+=======================
+
+Supported chips:
+ * Winbond W83L785TS-S
+ Prefix: 'w83l785ts'
+ Addresses scanned: I2C 0x2e
+ Datasheet: Publicly available at the Winbond USA website
+ http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83L785TS-S.pdf
+
+Authors:
+ Jean Delvare <khali@linux-fr.org>
+
+Description
+-----------
+
+The W83L785TS-S is a digital temperature sensor. It senses the
+temperature of a single external diode. The high limit is
+theoretically defined as 85 or 100 degrees C through a combination
+of external resistors, so the user cannot change it. Values seen so
+far suggest that the two possible limits are actually 95 and 110
+degrees C. The datasheet is rather poor and obviously inaccurate
+on several points including this one.
+
+All temperature values are given in degrees Celsius. Resolution
+is 1.0 degree. See the datasheet for details.
+
+The w83l785ts driver will not update its values more frequently than
+every other second; reading them more often will do no harm, but will
+return 'old' values.
+
+Known Issues
+------------
+
+On some systems (Asus), the BIOS is known to interfere with the driver
+and cause read errors. Or maybe the W83L785TS-S chip is simply unreliable,
+we don't really know. The driver will retry a given number of times
+(5 by default) and then give up, returning the old value (or 0 if
+there is no old value). It seems to work well enough so that you should
+not notice anything. Thanks to James Bolt for helping test this feature.
diff --git a/Documentation/hwmon/w83l786ng b/Documentation/hwmon/w83l786ng
new file mode 100644
index 0000000..d8f55d7
--- /dev/null
+++ b/Documentation/hwmon/w83l786ng
@@ -0,0 +1,54 @@
+Kernel driver w83l786ng
+=====================
+
+Supported chips:
+ * Winbond W83L786NG/W83L786NR
+ Prefix: 'w83l786ng'
+ Addresses scanned: I2C 0x2e - 0x2f
+ Datasheet: http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83L786NRNG09.pdf
+
+Author: Kevin Lo <kevlo@kevlo.org>
+
+
+Module Parameters
+-----------------
+
+* reset boolean
+ (default 0)
+ Use 'reset=1' to reset the chip (via index 0x40, bit 7). The default
+ behavior is no chip reset to preserve BIOS settings
+
+
+Description
+-----------
+
+This driver implements support for Winbond W83L786NG/W83L786NR chips.
+
+The driver implements two temperature sensors, two fan rotation speed
+sensors, and three voltage sensors.
+
+Temperatures are measured in degrees Celsius and measurement resolution is 1
+degC for temp1 and temp2.
+
+Fan rotation speeds are reported in RPM (rotations per minute). Fan readings
+readings can be divided by a programmable divider (1, 2, 4, 8, 16, 32, 64
+or 128 for fan 1/2) to give the readings more range or accuracy.
+
+Voltage sensors (also known as IN sensors) report their values in millivolts.
+An alarm is triggered if the voltage has crossed a programmable minimum
+or maximum limit.
+
+/sys files
+----------
+
+pwm[1-2] - this file stores PWM duty cycle or DC value (fan speed) in range:
+ 0 (stop) to 255 (full)
+pwm[1-2]_enable - this file controls mode of fan/temperature control:
+ * 0 Manual Mode
+ * 1 Thermal Cruise
+ * 2 Smart Fan II
+ * 4 FAN_SET
+pwm[1-2]_mode - Select PWM of DC mode
+ * 0 DC
+ * 1 PWM
+tolerance[1-2] - Value in degrees of Celsius (degC) for +- T
diff --git a/Documentation/i2c/busses/i2c-ali1535 b/Documentation/i2c/busses/i2c-ali1535
new file mode 100644
index 0000000..0db3b4c
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-ali1535
@@ -0,0 +1,42 @@
+Kernel driver i2c-ali1535
+
+Supported adapters:
+ * Acer Labs, Inc. ALI 1535 (south bridge)
+ Datasheet: Now under NDA
+ http://www.ali.com.tw/eng/support/datasheet_request.php
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Mark D. Studebaker <mdsxyz123@yahoo.com>,
+ Dan Eaton <dan.eaton@rocketlogix.com>,
+ Stephen Rousset<stephen.rousset@rocketlogix.com>
+
+Description
+-----------
+
+This is the driver for the SMB Host controller on Acer Labs Inc. (ALI)
+M1535 South Bridge.
+
+The M1535 is a South bridge for portable systems. It is very similar to the
+M15x3 South bridges also produced by Acer Labs Inc. Some of the registers
+within the part have moved and some have been redefined slightly.
+Additionally, the sequencing of the SMBus transactions has been modified to
+be more consistent with the sequencing recommended by the manufacturer and
+observed through testing. These changes are reflected in this driver and
+can be identified by comparing this driver to the i2c-ali15x3 driver. For
+an overview of these chips see http://www.acerlabs.com
+
+The SMB controller is part of the M7101 device, which is an ACPI-compliant
+Power Management Unit (PMU).
+
+The whole M7101 device has to be enabled for the SMB to work. You can't
+just enable the SMB alone. The SMB and the ACPI have separate I/O spaces.
+We make sure that the SMB is enabled. We leave the ACPI alone.
+
+
+Features
+--------
+
+This driver controls the SMB Host only. This driver does not use
+interrupts.
diff --git a/Documentation/i2c/busses/i2c-ali1563 b/Documentation/i2c/busses/i2c-ali1563
new file mode 100644
index 0000000..99ad4b9
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-ali1563
@@ -0,0 +1,27 @@
+Kernel driver i2c-ali1563
+
+Supported adapters:
+ * Acer Labs, Inc. ALI 1563 (south bridge)
+ Datasheet: Now under NDA
+ http://www.ali.com.tw/eng/support/datasheet_request.php
+
+Author: Patrick Mochel <mochel@digitalimplant.org>
+
+Description
+-----------
+
+This is the driver for the SMB Host controller on Acer Labs Inc. (ALI)
+M1563 South Bridge.
+
+For an overview of these chips see http://www.acerlabs.com
+
+The M1563 southbridge is deceptively similar to the M1533, with a few
+notable exceptions. One of those happens to be the fact they upgraded the
+i2c core to be SMBus 2.0 compliant, and happens to be almost identical to
+the i2c controller found in the Intel 801 south bridges.
+
+Features
+--------
+
+This driver controls the SMB Host only. This driver does not use
+interrupts.
diff --git a/Documentation/i2c/busses/i2c-ali15x3 b/Documentation/i2c/busses/i2c-ali15x3
new file mode 100644
index 0000000..ff28d38
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-ali15x3
@@ -0,0 +1,112 @@
+Kernel driver i2c-ali15x3
+
+Supported adapters:
+ * Acer Labs, Inc. ALI 1533 and 1543C (south bridge)
+ Datasheet: Now under NDA
+ http://www.ali.com.tw/eng/support/datasheet_request.php
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Mark D. Studebaker <mdsxyz123@yahoo.com>
+
+Module Parameters
+-----------------
+
+* force_addr: int
+ Initialize the base address of the i2c controller
+
+
+Notes
+-----
+
+The force_addr parameter is useful for boards that don't set the address in
+the BIOS. Does not do a PCI force; the device must still be present in
+lspci. Don't use this unless the driver complains that the base address is
+not set.
+
+Example: 'modprobe i2c-ali15x3 force_addr=0xe800'
+
+SMBus periodically hangs on ASUS P5A motherboards and can only be cleared
+by a power cycle. Cause unknown (see Issues below).
+
+
+Description
+-----------
+
+This is the driver for the SMB Host controller on Acer Labs Inc. (ALI)
+M1541 and M1543C South Bridges.
+
+The M1543C is a South bridge for desktop systems.
+The M1541 is a South bridge for portable systems.
+They are part of the following ALI chipsets:
+
+ * "Aladdin Pro 2" includes the M1621 Slot 1 North bridge with AGP and
+ 100MHz CPU Front Side bus
+ * "Aladdin V" includes the M1541 Socket 7 North bridge with AGP and 100MHz
+ CPU Front Side bus
+ Some Aladdin V motherboards:
+ Asus P5A
+ Atrend ATC-5220
+ BCM/GVC VP1541
+ Biostar M5ALA
+ Gigabyte GA-5AX (** Generally doesn't work because the BIOS doesn't
+ enable the 7101 device! **)
+ Iwill XA100 Plus
+ Micronics C200
+ Microstar (MSI) MS-5169
+
+ * "Aladdin IV" includes the M1541 Socket 7 North bridge
+ with host bus up to 83.3 MHz.
+
+For an overview of these chips see http://www.acerlabs.com. At this time the
+full data sheets on the web site are password protected, however if you
+contact the ALI office in San Jose they may give you the password.
+
+The M1533/M1543C devices appear as FOUR separate devices on the PCI bus. An
+output of lspci will show something similar to the following:
+
+ 00:02.0 USB Controller: Acer Laboratories Inc. M5237 (rev 03)
+ 00:03.0 Bridge: Acer Laboratories Inc. M7101 <= THIS IS THE ONE WE NEED
+ 00:07.0 ISA bridge: Acer Laboratories Inc. M1533 (rev c3)
+ 00:0f.0 IDE interface: Acer Laboratories Inc. M5229 (rev c1)
+
+** IMPORTANT **
+** If you have a M1533 or M1543C on the board and you get
+** "ali15x3: Error: Can't detect ali15x3!"
+** then run lspci.
+** If you see the 1533 and 5229 devices but NOT the 7101 device,
+** then you must enable ACPI, the PMU, SMB, or something similar
+** in the BIOS.
+** The driver won't work if it can't find the M7101 device.
+
+The SMB controller is part of the M7101 device, which is an ACPI-compliant
+Power Management Unit (PMU).
+
+The whole M7101 device has to be enabled for the SMB to work. You can't
+just enable the SMB alone. The SMB and the ACPI have separate I/O spaces.
+We make sure that the SMB is enabled. We leave the ACPI alone.
+
+Features
+--------
+
+This driver controls the SMB Host only. The SMB Slave
+controller on the M15X3 is not enabled. This driver does not use
+interrupts.
+
+
+Issues
+------
+
+This driver requests the I/O space for only the SMB
+registers. It doesn't use the ACPI region.
+
+On the ASUS P5A motherboard, there are several reports that
+the SMBus will hang and this can only be resolved by
+powering off the computer. It appears to be worse when the board
+gets hot, for example under heavy CPU load, or in the summer.
+There may be electrical problems on this board.
+On the P5A, the W83781D sensor chip is on both the ISA and
+SMBus. Therefore the SMBus hangs can generally be avoided
+by accessing the W83781D on the ISA bus only.
+
diff --git a/Documentation/i2c/busses/i2c-amd756 b/Documentation/i2c/busses/i2c-amd756
new file mode 100644
index 0000000..67f3087
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-amd756
@@ -0,0 +1,25 @@
+Kernel driver i2c-amd756
+
+Supported adapters:
+ * AMD 756
+ * AMD 766
+ * AMD 768
+ * AMD 8111
+ Datasheets: Publicly available on AMD website
+
+ * nVidia nForce
+ Datasheet: Unavailable
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>
+
+Description
+-----------
+
+This driver supports the AMD 756, 766, 768 and 8111 Peripheral Bus
+Controllers, and the nVidia nForce.
+
+Note that for the 8111, there are two SMBus adapters. The SMBus 1.0 adapter
+is supported by this driver, and the SMBus 2.0 adapter is supported by the
+i2c-amd8111 driver.
diff --git a/Documentation/i2c/busses/i2c-amd8111 b/Documentation/i2c/busses/i2c-amd8111
new file mode 100644
index 0000000..460dd66
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-amd8111
@@ -0,0 +1,41 @@
+Kernel driver i2c-adm8111
+
+Supported adapters:
+ * AMD-8111 SMBus 2.0 PCI interface
+
+Datasheets:
+ AMD datasheet not yet available, but almost everything can be found
+ in the publicly available ACPI 2.0 specification, which the adapter
+ follows.
+
+Author: Vojtech Pavlik <vojtech@suse.cz>
+
+Description
+-----------
+
+If you see something like this:
+
+00:07.2 SMBus: Advanced Micro Devices [AMD] AMD-8111 SMBus 2.0 (rev 02)
+ Subsystem: Advanced Micro Devices [AMD] AMD-8111 SMBus 2.0
+ Flags: medium devsel, IRQ 19
+ I/O ports at d400 [size=32]
+
+in your 'lspci -v', then this driver is for your chipset.
+
+Process Call Support
+--------------------
+
+Supported.
+
+SMBus 2.0 Support
+-----------------
+
+Supported. Both PEC and block process call support is implemented. Slave
+mode or host notification are not yet implemented.
+
+Notes
+-----
+
+Note that for the 8111, there are two SMBus adapters. The SMBus 2.0 adapter
+is supported by this driver, and the SMBus 1.0 adapter is supported by the
+i2c-amd756 driver.
diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801
new file mode 100644
index 0000000..81c0c59
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-i801
@@ -0,0 +1,134 @@
+Kernel driver i2c-i801
+
+Supported adapters:
+ * Intel 82801AA and 82801AB (ICH and ICH0 - part of the
+ '810' and '810E' chipsets)
+ * Intel 82801BA (ICH2 - part of the '815E' chipset)
+ * Intel 82801CA/CAM (ICH3)
+ * Intel 82801DB (ICH4) (HW PEC supported)
+ * Intel 82801EB/ER (ICH5) (HW PEC supported)
+ * Intel 6300ESB
+ * Intel 82801FB/FR/FW/FRW (ICH6)
+ * Intel 82801G (ICH7)
+ * Intel 631xESB/632xESB (ESB2)
+ * Intel 82801H (ICH8)
+ * Intel 82801I (ICH9)
+ * Intel EP80579 (Tolapai)
+ * Intel 82801JI (ICH10)
+ * Intel PCH
+ Datasheets: Publicly available at the Intel website
+
+Authors:
+ Mark Studebaker <mdsxyz123@yahoo.com>
+ Jean Delvare <khali@linux-fr.org>
+
+
+Module Parameters
+-----------------
+
+None.
+
+
+Description
+-----------
+
+The ICH (properly known as the 82801AA), ICH0 (82801AB), ICH2 (82801BA),
+ICH3 (82801CA/CAM) and later devices (PCH) are Intel chips that are a part of
+Intel's '810' chipset for Celeron-based PCs, '810E' chipset for
+Pentium-based PCs, '815E' chipset, and others.
+
+The ICH chips contain at least SEVEN separate PCI functions in TWO logical
+PCI devices. An output of lspci will show something similar to the
+following:
+
+ 00:1e.0 PCI bridge: Intel Corporation: Unknown device 2418 (rev 01)
+ 00:1f.0 ISA bridge: Intel Corporation: Unknown device 2410 (rev 01)
+ 00:1f.1 IDE interface: Intel Corporation: Unknown device 2411 (rev 01)
+ 00:1f.2 USB Controller: Intel Corporation: Unknown device 2412 (rev 01)
+ 00:1f.3 Unknown class [0c05]: Intel Corporation: Unknown device 2413 (rev 01)
+
+The SMBus controller is function 3 in device 1f. Class 0c05 is SMBus Serial
+Controller.
+
+The ICH chips are quite similar to Intel's PIIX4 chip, at least in the
+SMBus controller.
+
+
+Process Call Support
+--------------------
+
+Not supported.
+
+
+I2C Block Read Support
+----------------------
+
+I2C block read is supported on the 82801EB (ICH5) and later chips.
+
+
+SMBus 2.0 Support
+-----------------
+
+The 82801DB (ICH4) and later chips support several SMBus 2.0 features.
+
+
+Hidden ICH SMBus
+----------------
+
+If your system has an Intel ICH south bridge, but you do NOT see the
+SMBus device at 00:1f.3 in lspci, and you can't figure out any way in the
+BIOS to enable it, it means it has been hidden by the BIOS code. Asus is
+well known for first doing this on their P4B motherboard, and many other
+boards after that. Some vendor machines are affected as well.
+
+The first thing to try is the "i2c_ec" ACPI driver. It could be that the
+SMBus was hidden on purpose because it'll be driven by ACPI. If the
+i2c_ec driver works for you, just forget about the i2c-i801 driver and
+don't try to unhide the ICH SMBus. Even if i2c_ec doesn't work, you
+better make sure that the SMBus isn't used by the ACPI code. Try loading
+the "fan" and "thermal" drivers, and check in /proc/acpi/fan and
+/proc/acpi/thermal_zone. If you find anything there, it's likely that
+the ACPI is accessing the SMBus and it's safer not to unhide it. Only
+once you are certain that ACPI isn't using the SMBus, you can attempt
+to unhide it.
+
+In order to unhide the SMBus, we need to change the value of a PCI
+register before the kernel enumerates the PCI devices. This is done in
+drivers/pci/quirks.c, where all affected boards must be listed (see
+function asus_hides_smbus_hostbridge.) If the SMBus device is missing,
+and you think there's something interesting on the SMBus (e.g. a
+hardware monitoring chip), you need to add your board to the list.
+
+The motherboard is identified using the subvendor and subdevice IDs of the
+host bridge PCI device. Get yours with "lspci -n -v -s 00:00.0":
+
+00:00.0 Class 0600: 8086:2570 (rev 02)
+ Subsystem: 1043:80f2
+ Flags: bus master, fast devsel, latency 0
+ Memory at fc000000 (32-bit, prefetchable) [size=32M]
+ Capabilities: [e4] #09 [2106]
+ Capabilities: [a0] AGP version 3.0
+
+Here the host bridge ID is 2570 (82865G/PE/P), the subvendor ID is 1043
+(Asus) and the subdevice ID is 80f2 (P4P800-X). You can find the symbolic
+names for the bridge ID and the subvendor ID in include/linux/pci_ids.h,
+and then add a case for your subdevice ID at the right place in
+drivers/pci/quirks.c. Then please give it very good testing, to make sure
+that the unhidden SMBus doesn't conflict with e.g. ACPI.
+
+If it works, proves useful (i.e. there are usable chips on the SMBus)
+and seems safe, please submit a patch for inclusion into the kernel.
+
+Note: There's a useful script in lm_sensors 2.10.2 and later, named
+unhide_ICH_SMBus (in prog/hotplug), which uses the fakephp driver to
+temporarily unhide the SMBus without having to patch and recompile your
+kernel. It's very convenient if you just want to check if there's
+anything interesting on your hidden ICH SMBus.
+
+
+**********************
+The lm_sensors project gratefully acknowledges the support of Texas
+Instruments in the initial development of this driver.
+
+The lm_sensors project gratefully acknowledges the support of Intel in the
+development of SMBus 2.0 / ICH4 features of this driver.
diff --git a/Documentation/i2c/busses/i2c-nforce2 b/Documentation/i2c/busses/i2c-nforce2
new file mode 100644
index 0000000..fae3495
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-nforce2
@@ -0,0 +1,46 @@
+Kernel driver i2c-nforce2
+
+Supported adapters:
+ * nForce2 MCP 10de:0064
+ * nForce2 Ultra 400 MCP 10de:0084
+ * nForce3 Pro150 MCP 10de:00D4
+ * nForce3 250Gb MCP 10de:00E4
+ * nForce4 MCP 10de:0052
+ * nForce4 MCP-04 10de:0034
+ * nForce4 MCP51 10de:0264
+ * nForce4 MCP55 10de:0368
+ * nForce4 MCP61 10de:03EB
+ * nForce4 MCP65 10de:0446
+
+Datasheet: not publicly available, but seems to be similar to the
+ AMD-8111 SMBus 2.0 adapter.
+
+Authors:
+ Hans-Frieder Vogt <hfvogt@gmx.net>,
+ Thomas Leibold <thomas@plx.com>,
+ Patrick Dreker <patrick@dreker.de>
+
+Description
+-----------
+
+i2c-nforce2 is a driver for the SMBuses included in the nVidia nForce2 MCP.
+
+If your 'lspci -v' listing shows something like the following,
+
+00:01.1 SMBus: nVidia Corporation: Unknown device 0064 (rev a2)
+ Subsystem: Asustek Computer, Inc.: Unknown device 0c11
+ Flags: 66Mhz, fast devsel, IRQ 5
+ I/O ports at c000 [size=32]
+ Capabilities: <available only to root>
+
+then this driver should support the SMBuses of your motherboard.
+
+
+Notes
+-----
+
+The SMBus adapter in the nForce2 chipset seems to be very similar to the
+SMBus 2.0 adapter in the AMD-8111 south bridge. However, I could only get
+the driver to work with direct I/O access, which is different to the EC
+interface of the AMD-8111. Tested on Asus A7N8X. The ACPI DSDT table of the
+Asus A7N8X lists two SMBuses, both of which are supported by this driver.
diff --git a/Documentation/i2c/busses/i2c-ocores b/Documentation/i2c/busses/i2c-ocores
new file mode 100644
index 0000000..cfcebb1
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-ocores
@@ -0,0 +1,51 @@
+Kernel driver i2c-ocores
+
+Supported adapters:
+ * OpenCores.org I2C controller by Richard Herveille (see datasheet link)
+ Datasheet: http://www.opencores.org/projects.cgi/web/i2c/overview
+
+Author: Peter Korsgaard <jacmet@sunsite.dk>
+
+Description
+-----------
+
+i2c-ocores is an i2c bus driver for the OpenCores.org I2C controller
+IP core by Richard Herveille.
+
+Usage
+-----
+
+i2c-ocores uses the platform bus, so you need to provide a struct
+platform_device with the base address and interrupt number. The
+dev.platform_data of the device should also point to a struct
+ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the
+distance between registers and the input clock speed.
+
+E.G. something like:
+
+static struct resource ocores_resources[] = {
+ [0] = {
+ .start = MYI2C_BASEADDR,
+ .end = MYI2C_BASEADDR + 8,
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .start = MYI2C_IRQ,
+ .end = MYI2C_IRQ,
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+static struct ocores_i2c_platform_data myi2c_data = {
+ .regstep = 2, /* two bytes between registers */
+ .clock_khz = 50000, /* input clock of 50MHz */
+};
+
+static struct platform_device myi2c = {
+ .name = "ocores-i2c",
+ .dev = {
+ .platform_data = &myi2c_data,
+ },
+ .num_resources = ARRAY_SIZE(ocores_resources),
+ .resource = ocores_resources,
+};
diff --git a/Documentation/i2c/busses/i2c-parport b/Documentation/i2c/busses/i2c-parport
new file mode 100644
index 0000000..dceaba1
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-parport
@@ -0,0 +1,174 @@
+Kernel driver i2c-parport
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+This is a unified driver for several i2c-over-parallel-port adapters,
+such as the ones made by Philips, Velleman or ELV. This driver is
+meant as a replacement for the older, individual drivers:
+ * i2c-philips-par
+ * i2c-elv
+ * i2c-velleman
+ * video/i2c-parport (NOT the same as this one, dedicated to home brew
+ teletext adapters)
+
+It currently supports the following devices:
+ * (type=0) Philips adapter
+ * (type=1) home brew teletext adapter
+ * (type=2) Velleman K8000 adapter
+ * (type=3) ELV adapter
+ * (type=4) Analog Devices ADM1032 evaluation board
+ * (type=5) Analog Devices evaluation boards: ADM1025, ADM1030, ADM1031
+ * (type=6) Barco LPT->DVI (K5800236) adapter
+ * (type=7) One For All JP1 parallel port adapter
+
+These devices use different pinout configurations, so you have to tell
+the driver what you have, using the type module parameter. There is no
+way to autodetect the devices. Support for different pinout configurations
+can be easily added when needed.
+
+Earlier kernels defaulted to type=0 (Philips). But now, if the type
+parameter is missing, the driver will simply fail to initialize.
+
+
+Building your own adapter
+-------------------------
+
+If you want to build you own i2c-over-parallel-port adapter, here is
+a sample electronics schema (credits go to Sylvain Munaut):
+
+Device PC
+Side ___________________Vdd (+) Side
+ | | |
+ --- --- ---
+ | | | | | |
+ |R| |R| |R|
+ | | | | | |
+ --- --- ---
+ | | |
+ | | /| |
+SCL ----------x--------o |-----------x------------------- pin 2
+ | \| | |
+ | | |
+ | |\ | |
+SDA ----------x----x---| o---x--------------------------- pin 13
+ | |/ |
+ | |
+ | /| |
+ ---------o |----------------x-------------- pin 3
+ \| | |
+ | |
+ --- ---
+ | | | |
+ |R| |R|
+ | | | |
+ --- ---
+ | |
+ ### ###
+ GND GND
+
+Remarks:
+ - This is the exact pinout and electronics used on the Analog Devices
+ evaluation boards.
+ /|
+ - All inverters -o |- must be 74HC05, they must be open collector output.
+ \|
+ - All resitors are 10k.
+ - Pins 18-25 of the parallel port connected to GND.
+ - Pins 4-9 (D2-D7) could be used as VDD is the driver drives them high.
+ The ADM1032 evaluation board uses D4-D7. Beware that the amount of
+ current you can draw from the parallel port is limited. Also note that
+ all connected lines MUST BE driven at the same state, else you'll short
+ circuit the output buffers! So plugging the I2C adapter after loading
+ the i2c-parport module might be a good safety since data line state
+ prior to init may be unknown.
+ - This is 5V!
+ - Obviously you cannot read SCL (so it's not really standard-compliant).
+ Pretty easy to add, just copy the SDA part and use another input pin.
+ That would give (ELV compatible pinout):
+
+
+Device PC
+Side ______________________________Vdd (+) Side
+ | | | |
+ --- --- --- ---
+ | | | | | | | |
+ |R| |R| |R| |R|
+ | | | | | | | |
+ --- --- --- ---
+ | | | |
+ | | |\ | |
+SCL ----------x--------x--| o---x------------------------ pin 15
+ | | |/ |
+ | | |
+ | | /| |
+ | ---o |-------------x-------------- pin 2
+ | \| | |
+ | | |
+ | | |
+ | |\ | |
+SDA ---------------x---x--| o--------x------------------- pin 10
+ | |/ |
+ | |
+ | /| |
+ ---o |------------------x--------- pin 3
+ \| | |
+ | |
+ --- ---
+ | | | |
+ |R| |R|
+ | | | |
+ --- ---
+ | |
+ ### ###
+ GND GND
+
+
+If possible, you should use the same pinout configuration as existing
+adapters do, so you won't even have to change the code.
+
+
+Similar (but different) drivers
+-------------------------------
+
+This driver is NOT the same as the i2c-pport driver found in the i2c
+package. The i2c-pport driver makes use of modern parallel port features so
+that you don't need additional electronics. It has other restrictions
+however, and was not ported to Linux 2.6 (yet).
+
+This driver is also NOT the same as the i2c-pcf-epp driver found in the
+lm_sensors package. The i2c-pcf-epp driver doesn't use the parallel port as
+an I2C bus directly. Instead, it uses it to control an external I2C bus
+master. That driver was not ported to Linux 2.6 (yet) either.
+
+
+Legacy documentation for Velleman adapter
+-----------------------------------------
+
+Useful links:
+Velleman http://www.velleman.be/
+Velleman K8000 Howto http://howto.htlw16.ac.at/k8000-howto.html
+
+The project has lead to new libs for the Velleman K8000 and K8005:
+ LIBK8000 v1.99.1 and LIBK8005 v0.21
+With these libs, you can control the K8000 interface card and the K8005
+stepper motor card with the simple commands which are in the original
+Velleman software, like SetIOchannel, ReadADchannel, SendStepCCWFull and
+many more, using /dev/velleman.
+ http://home.wanadoo.nl/hihihi/libk8000.htm
+ http://home.wanadoo.nl/hihihi/libk8005.htm
+ http://struyve.mine.nu:8080/index.php?block=k8000
+ http://sourceforge.net/projects/libk8005/
+
+
+One For All JP1 parallel port adapter
+-------------------------------------
+
+The JP1 project revolves around a set of remote controls which expose
+the I2C bus their internal configuration EEPROM lives on via a 6 pin
+jumper in the battery compartment. More details can be found at:
+
+http://www.hifi-remote.com/jp1/
+
+Details of the simple parallel port hardware can be found at:
+
+http://www.hifi-remote.com/jp1/hardware.shtml
diff --git a/Documentation/i2c/busses/i2c-parport-light b/Documentation/i2c/busses/i2c-parport-light
new file mode 100644
index 0000000..2874364
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-parport-light
@@ -0,0 +1,11 @@
+Kernel driver i2c-parport-light
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+This driver is a light version of i2c-parport. It doesn't depend
+on the parport driver, and uses direct I/O access instead. This might be
+prefered on embedded systems where wasting memory for the clean but heavy
+parport handling is not an option. The drawback is a reduced portability
+and the impossibility to daisy-chain other parallel port devices.
+
+Please see i2c-parport for documentation.
diff --git a/Documentation/i2c/busses/i2c-pca-isa b/Documentation/i2c/busses/i2c-pca-isa
new file mode 100644
index 0000000..6fc8f4c
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-pca-isa
@@ -0,0 +1,23 @@
+Kernel driver i2c-pca-isa
+
+Supported adapters:
+This driver supports ISA boards using the Philips PCA 9564
+Parallel bus to I2C bus controller
+
+Author: Ian Campbell <icampbell@arcom.com>, Arcom Control Systems
+
+Module Parameters
+-----------------
+
+* base int
+ I/O base address
+* irq int
+ IRQ interrupt
+* clock int
+ Clock rate as described in table 1 of PCA9564 datasheet
+
+Description
+-----------
+
+This driver supports ISA boards using the Philips PCA 9564
+Parallel bus to I2C bus controller
diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4
new file mode 100644
index 0000000..ef1efa7
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-piix4
@@ -0,0 +1,98 @@
+Kernel driver i2c-piix4
+
+Supported adapters:
+ * Intel 82371AB PIIX4 and PIIX4E
+ * Intel 82443MX (440MX)
+ Datasheet: Publicly available at the Intel website
+ * ServerWorks OSB4, CSB5, CSB6 and HT-1000 southbridges
+ Datasheet: Only available via NDA from ServerWorks
+ * ATI IXP200, IXP300, IXP400, SB600, SB700 and SB800 southbridges
+ Datasheet: Not publicly available
+ * Standard Microsystems (SMSC) SLC90E66 (Victory66) southbridge
+ Datasheet: Publicly available at the SMSC website http://www.smsc.com
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>
+ Philip Edelbrock <phil@netroedge.com>
+
+
+Module Parameters
+-----------------
+
+* force: int
+ Forcibly enable the PIIX4. DANGEROUS!
+* force_addr: int
+ Forcibly enable the PIIX4 at the given address. EXTREMELY DANGEROUS!
+
+
+Description
+-----------
+
+The PIIX4 (properly known as the 82371AB) is an Intel chip with a lot of
+functionality. Among other things, it implements the PCI bus. One of its
+minor functions is implementing a System Management Bus. This is a true
+SMBus - you can not access it on I2C levels. The good news is that it
+natively understands SMBus commands and you do not have to worry about
+timing problems. The bad news is that non-SMBus devices connected to it can
+confuse it mightily. Yes, this is known to happen...
+
+Do 'lspci -v' and see whether it contains an entry like this:
+
+0000:00:02.3 Bridge: Intel Corp. 82371AB/EB/MB PIIX4 ACPI (rev 02)
+ Flags: medium devsel, IRQ 9
+
+Bus and device numbers may differ, but the function number must be
+identical (like many PCI devices, the PIIX4 incorporates a number of
+different 'functions', which can be considered as separate devices). If you
+find such an entry, you have a PIIX4 SMBus controller.
+
+On some computers (most notably, some Dells), the SMBus is disabled by
+default. If you use the insmod parameter 'force=1', the kernel module will
+try to enable it. THIS IS VERY DANGEROUS! If the BIOS did not set up a
+correct address for this module, you could get in big trouble (read:
+crashes, data corruption, etc.). Try this only as a last resort (try BIOS
+updates first, for example), and backup first! An even more dangerous
+option is 'force_addr=<IOPORT>'. This will not only enable the PIIX4 like
+'force' foes, but it will also set a new base I/O port address. The SMBus
+parts of the PIIX4 needs a range of 8 of these addresses to function
+correctly. If these addresses are already reserved by some other device,
+you will get into big trouble! DON'T USE THIS IF YOU ARE NOT VERY SURE
+ABOUT WHAT YOU ARE DOING!
+
+The PIIX4E is just an new version of the PIIX4; it is supported as well.
+The PIIX/PIIX3 does not implement an SMBus or I2C bus, so you can't use
+this driver on those mainboards.
+
+The ServerWorks Southbridges, the Intel 440MX, and the Victory66 are
+identical to the PIIX4 in I2C/SMBus support.
+
+If you own Force CPCI735 motherboard or other OSB4 based systems you may need
+to change the SMBus Interrupt Select register so the SMBus controller uses
+the SMI mode.
+
+1) Use lspci command and locate the PCI device with the SMBus controller:
+ 00:0f.0 ISA bridge: ServerWorks OSB4 South Bridge (rev 4f)
+ The line may vary for different chipsets. Please consult the driver source
+ for all possible PCI ids (and lspci -n to match them). Lets assume the
+ device is located at 00:0f.0.
+2) Now you just need to change the value in 0xD2 register. Get it first with
+ command: lspci -xxx -s 00:0f.0
+ If the value is 0x3 then you need to change it to 0x1
+ setpci -s 00:0f.0 d2.b=1
+
+Please note that you don't need to do that in all cases, just when the SMBus is
+not working properly.
+
+
+Hardware-specific issues
+------------------------
+
+This driver will refuse to load on IBM systems with an Intel PIIX4 SMBus.
+Some of these machines have an RFID EEPROM (24RF08) connected to the SMBus,
+which can easily get corrupted due to a state machine bug. These are mostly
+Thinkpad laptops, but desktop systems may also be affected. We have no list
+of all affected systems, so the only safe solution was to prevent access to
+the SMBus on all IBM systems (detected using DMI data.)
+
+For additional information, read:
+http://www.lm-sensors.org/browser/lm-sensors/trunk/README.thinkpad
diff --git a/Documentation/i2c/busses/i2c-sis5595 b/Documentation/i2c/busses/i2c-sis5595
new file mode 100644
index 0000000..cc47db7
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-sis5595
@@ -0,0 +1,59 @@
+Kernel driver i2c-sis5595
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Mark D. Studebaker <mdsxyz123@yahoo.com>,
+ Philip Edelbrock <phil@netroedge.com>
+
+Supported adapters:
+ * Silicon Integrated Systems Corp. SiS5595 Southbridge
+ Datasheet: Publicly available at the Silicon Integrated Systems Corp. site.
+
+Note: all have mfr. ID 0x1039.
+
+ SUPPORTED PCI ID
+ 5595 0008
+
+ Note: these chips contain a 0008 device which is incompatible with the
+ 5595. We recognize these by the presence of the listed
+ "blacklist" PCI ID and refuse to load.
+
+ NOT SUPPORTED PCI ID BLACKLIST PCI ID
+ 540 0008 0540
+ 550 0008 0550
+ 5513 0008 5511
+ 5581 0008 5597
+ 5582 0008 5597
+ 5597 0008 5597
+ 5598 0008 5597/5598
+ 630 0008 0630
+ 645 0008 0645
+ 646 0008 0646
+ 648 0008 0648
+ 650 0008 0650
+ 651 0008 0651
+ 730 0008 0730
+ 735 0008 0735
+ 745 0008 0745
+ 746 0008 0746
+
+Module Parameters
+-----------------
+
+* force_addr=0xaddr Set the I/O base address. Useful for boards
+ that don't set the address in the BIOS. Does not do a
+ PCI force; the device must still be present in lspci.
+ Don't use this unless the driver complains that the
+ base address is not set.
+
+Description
+-----------
+
+i2c-sis5595 is a true SMBus host driver for motherboards with the SiS5595
+southbridges.
+
+WARNING: If you are trying to access the integrated sensors on the SiS5595
+chip, you want the sis5595 driver for those, not this driver. This driver
+is a BUS driver, not a CHIP driver. A BUS driver is used by other CHIP
+drivers to access chips on the bus.
+
diff --git a/Documentation/i2c/busses/i2c-sis630 b/Documentation/i2c/busses/i2c-sis630
new file mode 100644
index 0000000..9aca688
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-sis630
@@ -0,0 +1,49 @@
+Kernel driver i2c-sis630
+
+Supported adapters:
+ * Silicon Integrated Systems Corp (SiS)
+ 630 chipset (Datasheet: available at http://amalysh.bei.t-online.de/docs/SIS/)
+ 730 chipset
+ * Possible other SiS chipsets ?
+
+Author: Alexander Malysh <amalysh@web.de>
+
+Module Parameters
+-----------------
+
+* force = [1|0] Forcibly enable the SIS630. DANGEROUS!
+ This can be interesting for chipsets not named
+ above to check if it works for you chipset, but DANGEROUS!
+
+* high_clock = [1|0] Forcibly set Host Master Clock to 56KHz (default,
+ what your BIOS use). DANGEROUS! This should be a bit
+ faster, but freeze some systems (i.e. my Laptop).
+
+
+Description
+-----------
+
+This SMBus only driver is known to work on motherboards with the above
+named chipsets.
+
+If you see something like this:
+
+00:00.0 Host bridge: Silicon Integrated Systems [SiS] 630 Host (rev 31)
+00:01.0 ISA bridge: Silicon Integrated Systems [SiS] 85C503/5513
+
+or like this:
+
+00:00.0 Host bridge: Silicon Integrated Systems [SiS] 730 Host (rev 02)
+00:01.0 ISA bridge: Silicon Integrated Systems [SiS] 85C503/5513
+
+in your 'lspci' output , then this driver is for your chipset.
+
+Thank You
+---------
+Philip Edelbrock <phil@netroedge.com>
+- testing SiS730 support
+Mark M. Hoffman <mhoffman@lightlink.com>
+- bug fixes
+
+To anyone else which I forgot here ;), thanks!
+
diff --git a/Documentation/i2c/busses/i2c-sis96x b/Documentation/i2c/busses/i2c-sis96x
new file mode 100644
index 0000000..70e6a0c
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-sis96x
@@ -0,0 +1,73 @@
+Kernel driver i2c-sis96x
+
+Replaces 2.4.x i2c-sis645
+
+Supported adapters:
+ * Silicon Integrated Systems Corp (SiS)
+ Any combination of these host bridges:
+ 645, 645DX (aka 646), 648, 650, 651, 655, 735, 745, 746
+ and these south bridges:
+ 961, 962, 963(L)
+
+Author: Mark M. Hoffman <mhoffman@lightlink.com>
+
+Description
+-----------
+
+This SMBus only driver is known to work on motherboards with the above
+named chipset combinations. The driver was developed without benefit of a
+proper datasheet from SiS. The SMBus registers are assumed compatible with
+those of the SiS630, although they are located in a completely different
+place. Thanks to Alexander Malysh <amalysh@web.de> for providing the
+SiS630 datasheet (and driver).
+
+The command "lspci" as root should produce something like these lines:
+
+00:00.0 Host bridge: Silicon Integrated Systems [SiS]: Unknown device 0645
+00:02.0 ISA bridge: Silicon Integrated Systems [SiS] 85C503/5513
+00:02.1 SMBus: Silicon Integrated Systems [SiS]: Unknown device 0016
+
+or perhaps this...
+
+00:00.0 Host bridge: Silicon Integrated Systems [SiS]: Unknown device 0645
+00:02.0 ISA bridge: Silicon Integrated Systems [SiS]: Unknown device 0961
+00:02.1 SMBus: Silicon Integrated Systems [SiS]: Unknown device 0016
+
+(kernel versions later than 2.4.18 may fill in the "Unknown"s)
+
+If you cant see it please look on quirk_sis_96x_smbus
+(drivers/pci/quirks.c) (also if southbridge detection fails)
+
+I suspect that this driver could be made to work for the following SiS
+chipsets as well: 635, and 635T. If anyone owns a board with those chips
+AND is willing to risk crashing & burning an otherwise well-behaved kernel
+in the name of progress... please contact me at <mhoffman@lightlink.com> or
+via the linux-i2c mailing list: <linux-i2c@vger.kernel.org>. Please send bug
+reports and/or success stories as well.
+
+
+TO DOs
+------
+
+* The driver does not support SMBus block reads/writes; I may add them if a
+scenario is found where they're needed.
+
+
+Thank You
+---------
+
+Mark D. Studebaker <mdsxyz123@yahoo.com>
+ - design hints and bug fixes
+Alexander Maylsh <amalysh@web.de>
+ - ditto, plus an important datasheet... almost the one I really wanted
+Hans-Günter Lütke Uphues <hg_lu@t-online.de>
+ - patch for SiS735
+Robert Zwerus <arzie@dds.nl>
+ - testing for SiS645DX
+Kianusch Sayah Karadji <kianusch@sk-tech.net>
+ - patch for SiS645DX/962
+Ken Healy
+ - patch for SiS655
+
+To anyone else who has written w/ feedback, thanks!
+
diff --git a/Documentation/i2c/busses/i2c-taos-evm b/Documentation/i2c/busses/i2c-taos-evm
new file mode 100644
index 0000000..9146e33
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-taos-evm
@@ -0,0 +1,46 @@
+Kernel driver i2c-taos-evm
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+This is a driver for the evaluation modules for TAOS I2C/SMBus chips.
+The modules include an SMBus master with limited capabilities, which can
+be controlled over the serial port. Virtually all evaluation modules
+are supported, but a few lines of code need to be added for each new
+module to instantiate the right I2C chip on the bus. Obviously, a driver
+for the chip in question is also needed.
+
+Currently supported devices are:
+
+* TAOS TSL2550 EVM
+
+For addtional information on TAOS products, please see
+ http://www.taosinc.com/
+
+
+Using this driver
+-----------------
+
+In order to use this driver, you'll need the serport driver, and the
+inputattach tool, which is part of the input-utils package. The following
+commands will tell the kernel that you have a TAOS EVM on the first
+serial port:
+
+# modprobe serport
+# inputattach --taos-evm /dev/ttyS0
+
+
+Technical details
+-----------------
+
+Only 4 SMBus transaction types are supported by the TAOS evaluation
+modules:
+* Receive Byte
+* Send Byte
+* Read Byte
+* Write Byte
+
+The communication protocol is text-based and pretty simple. It is
+described in a PDF document on the CD which comes with the evaluation
+module. The communication is rather slow, because the serial port has
+to operate at 1200 bps. However, I don't think this is a big concern in
+practice, as these modules are meant for evaluation and testing only.
diff --git a/Documentation/i2c/busses/i2c-via b/Documentation/i2c/busses/i2c-via
new file mode 100644
index 0000000..3438706
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-via
@@ -0,0 +1,34 @@
+Kernel driver i2c-via
+
+Supported adapters:
+ * VIA Technologies, InC. VT82C586B
+ Datasheet: Publicly available at the VIA website
+
+Author: Kyösti Mälkki <kmalkki@cc.hut.fi>
+
+Description
+-----------
+
+i2c-via is an i2c bus driver for motherboards with VIA chipset.
+
+The following VIA pci chipsets are supported:
+ - MVP3, VP3, VP2/97, VPX/97
+ - others with South bridge VT82C586B
+
+Your lspci listing must show this :
+
+ Bridge: VIA Technologies, Inc. VT82C586B ACPI (rev 10)
+
+ Problems?
+
+ Q: You have VT82C586B on the motherboard, but not in the listing.
+
+ A: Go to your BIOS setup, section PCI devices or similar.
+ Turn USB support on, and try again.
+
+ Q: No error messages, but still i2c doesn't seem to work.
+
+ A: This can happen. This driver uses the pins VIA recommends in their
+ datasheets, but there are several ways the motherboard manufacturer
+ can actually wire the lines.
+
diff --git a/Documentation/i2c/busses/i2c-viapro b/Documentation/i2c/busses/i2c-viapro
new file mode 100644
index 0000000..22efedf
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-viapro
@@ -0,0 +1,65 @@
+Kernel driver i2c-viapro
+
+Supported adapters:
+ * VIA Technologies, Inc. VT82C596A/B
+ Datasheet: Sometimes available at the VIA website
+
+ * VIA Technologies, Inc. VT82C686A/B
+ Datasheet: Sometimes available at the VIA website
+
+ * VIA Technologies, Inc. VT8231, VT8233, VT8233A
+ Datasheet: available on request from VIA
+
+ * VIA Technologies, Inc. VT8235, VT8237R, VT8237A, VT8237S, VT8251
+ Datasheet: available on request and under NDA from VIA
+
+ * VIA Technologies, Inc. CX700
+ Datasheet: available on request and under NDA from VIA
+
+ * VIA Technologies, Inc. VX800/VX820
+ Datasheet: available on http://linux.via.com.tw
+
+Authors:
+ Kyösti Mälkki <kmalkki@cc.hut.fi>,
+ Mark D. Studebaker <mdsxyz123@yahoo.com>,
+ Jean Delvare <khali@linux-fr.org>
+
+Module Parameters
+-----------------
+
+* force: int
+ Forcibly enable the SMBus controller. DANGEROUS!
+* force_addr: int
+ Forcibly enable the SMBus at the given address. EXTREMELY DANGEROUS!
+
+Description
+-----------
+
+i2c-viapro is a true SMBus host driver for motherboards with one of the
+supported VIA south bridges.
+
+Your lspci -n listing must show one of these :
+
+ device 1106:3050 (VT82C596A function 3)
+ device 1106:3051 (VT82C596B function 3)
+ device 1106:3057 (VT82C686 function 4)
+ device 1106:3074 (VT8233)
+ device 1106:3147 (VT8233A)
+ device 1106:8235 (VT8231 function 4)
+ device 1106:3177 (VT8235)
+ device 1106:3227 (VT8237R)
+ device 1106:3337 (VT8237A)
+ device 1106:3372 (VT8237S)
+ device 1106:3287 (VT8251)
+ device 1106:8324 (CX700)
+ device 1106:8353 (VX800/VX820)
+
+If none of these show up, you should look in the BIOS for settings like
+enable ACPI / SMBus or even USB.
+
+Except for the oldest chips (VT82C596A/B, VT82C686A and most probably
+VT8231), this driver supports I2C block transactions. Such transactions
+are mainly useful to read from and write to EEPROMs.
+
+The CX700/VX800/VX820 additionally appears to support SMBus PEC, although
+this driver doesn't implement it yet.
diff --git a/Documentation/i2c/busses/i2c-voodoo3 b/Documentation/i2c/busses/i2c-voodoo3
new file mode 100644
index 0000000..62d90a4
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-voodoo3
@@ -0,0 +1,62 @@
+Kernel driver i2c-voodoo3
+
+Supported adapters:
+ * 3dfx Voodoo3 based cards
+ * Voodoo Banshee based cards
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Ralph Metzler <rjkm@thp.uni-koeln.de>,
+ Mark D. Studebaker <mdsxyz123@yahoo.com>
+
+Main contact: Philip Edelbrock <phil@netroedge.com>
+
+The code is based upon Ralph's test code (he did the hard stuff ;')
+
+Description
+-----------
+
+The 3dfx Voodoo3 chip contains two I2C interfaces (aka a I2C 'master' or
+'host').
+
+The first interface is used for DDC (Data Display Channel) which is a
+serial channel through the VGA monitor connector to a DDC-compliant
+monitor. This interface is defined by the Video Electronics Standards
+Association (VESA). The standards are available for purchase at
+http://www.vesa.org .
+
+The second interface is a general-purpose I2C bus. The intent by 3dfx was
+to allow manufacturers to add extra chips to the video card such as a
+TV-out chip such as the BT869 or possibly even I2C based temperature
+sensors like the ADM1021 or LM75.
+
+Stability
+---------
+
+Seems to be stable on the test machine, but needs more testing on other
+machines. Simultaneous accesses of the DDC and I2C busses may cause errors.
+
+Supported Devices
+-----------------
+
+Specifically, this driver was written and tested on the '3dfx Voodoo3 AGP
+3000' which has a tv-out feature (s-video or composite). According to the
+docs and discussions, this code should work for any Voodoo3 based cards as
+well as Voodoo Banshee based cards. The DDC interface has been tested on a
+Voodoo Banshee card.
+
+Issues
+------
+
+Probably many, but it seems to work OK on my system. :')
+
+
+External Device Connection
+--------------------------
+
+The digital video input jumpers give availability to the I2C bus.
+Specifically, pins 13 and 25 (bottom row middle, and bottom right-end) are
+the I2C clock and I2C data lines, respectively. +5V and GND are probably
+also easily available making the addition of extra I2C/SMBus devices easy
+to implement.
diff --git a/Documentation/i2c/busses/scx200_acb b/Documentation/i2c/busses/scx200_acb
new file mode 100644
index 0000000..7c07883
--- /dev/null
+++ b/Documentation/i2c/busses/scx200_acb
@@ -0,0 +1,32 @@
+Kernel driver scx200_acb
+
+Author: Christer Weinigel <wingel@nano-system.com>
+
+The driver supersedes the older, never merged driver named i2c-nscacb.
+
+Module Parameters
+-----------------
+
+* base: up to 4 ints
+ Base addresses for the ACCESS.bus controllers on SCx200 and SC1100 devices
+
+ By default the driver uses two base addresses 0x820 and 0x840.
+ If you want only one base address, specify the second as 0 so as to
+ override this default.
+
+Description
+-----------
+
+Enable the use of the ACCESS.bus controller on the Geode SCx200 and
+SC1100 processors and the CS5535 and CS5536 Geode companion devices.
+
+Device-specific notes
+---------------------
+
+The SC1100 WRAP boards are known to use base addresses 0x810 and 0x820.
+If the scx200_acb driver is built into the kernel, add the following
+parameter to your boot command line:
+ scx200_acb.base=0x810,0x820
+If the scx200_acb driver is built as a module, add the following line to
+the file /etc/modprobe.conf instead:
+ options scx200_acb base=0x810,0x820
diff --git a/Documentation/i2c/chips/eeprom b/Documentation/i2c/chips/eeprom
new file mode 100644
index 0000000..f7e8104
--- /dev/null
+++ b/Documentation/i2c/chips/eeprom
@@ -0,0 +1,96 @@
+Kernel driver eeprom
+====================
+
+Supported chips:
+ * Any EEPROM chip in the designated address range
+ Prefix: 'eeprom'
+ Addresses scanned: I2C 0x50 - 0x57
+ Datasheets: Publicly available from:
+ Atmel (www.atmel.com),
+ Catalyst (www.catsemi.com),
+ Fairchild (www.fairchildsemi.com),
+ Microchip (www.microchip.com),
+ Philips (www.semiconductor.philips.com),
+ Rohm (www.rohm.com),
+ ST (www.st.com),
+ Xicor (www.xicor.com),
+ and others.
+
+ Chip Size (bits) Address
+ 24C01 1K 0x50 (shadows at 0x51 - 0x57)
+ 24C01A 1K 0x50 - 0x57 (Typical device on DIMMs)
+ 24C02 2K 0x50 - 0x57
+ 24C04 4K 0x50, 0x52, 0x54, 0x56
+ (additional data at 0x51, 0x53, 0x55, 0x57)
+ 24C08 8K 0x50, 0x54 (additional data at 0x51, 0x52,
+ 0x53, 0x55, 0x56, 0x57)
+ 24C16 16K 0x50 (additional data at 0x51 - 0x57)
+ Sony 2K 0x57
+
+ Atmel 34C02B 2K 0x50 - 0x57, SW write protect at 0x30-37
+ Catalyst 34FC02 2K 0x50 - 0x57, SW write protect at 0x30-37
+ Catalyst 34RC02 2K 0x50 - 0x57, SW write protect at 0x30-37
+ Fairchild 34W02 2K 0x50 - 0x57, SW write protect at 0x30-37
+ Microchip 24AA52 2K 0x50 - 0x57, SW write protect at 0x30-37
+ ST M34C02 2K 0x50 - 0x57, SW write protect at 0x30-37
+
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Jean Delvare <khali@linux-fr.org>,
+ Greg Kroah-Hartman <greg@kroah.com>,
+ IBM Corp.
+
+Description
+-----------
+
+This is a simple EEPROM module meant to enable reading the first 256 bytes
+of an EEPROM (on a SDRAM DIMM for example). However, it will access serial
+EEPROMs on any I2C adapter. The supported devices are generically called
+24Cxx, and are listed above; however the numbering for these
+industry-standard devices may vary by manufacturer.
+
+This module was a programming exercise to get used to the new project
+organization laid out by Frodo, but it should be at least completely
+effective for decoding the contents of EEPROMs on DIMMs.
+
+DIMMS will typically contain a 24C01A or 24C02, or the 34C02 variants.
+The other devices will not be found on a DIMM because they respond to more
+than one address.
+
+DDC Monitors may contain any device. Often a 24C01, which responds to all 8
+addresses, is found.
+
+Recent Sony Vaio laptops have an EEPROM at 0x57. We couldn't get the
+specification, so it is guess work and far from being complete.
+
+The Microchip 24AA52/24LCS52, ST M34C02, and others support an additional
+software write protect register at 0x30 - 0x37 (0x20 less than the memory
+location). The chip responds to "write quick" detection at this address but
+does not respond to byte reads. If this register is present, the lower 128
+bytes of the memory array are not write protected. Any byte data write to
+this address will write protect the memory array permanently, and the
+device will no longer respond at the 0x30-37 address. The eeprom driver
+does not support this register.
+
+Lacking functionality:
+
+* Full support for larger devices (24C04, 24C08, 24C16). These are not
+typically found on a PC. These devices will appear as separate devices at
+multiple addresses.
+
+* Support for really large devices (24C32, 24C64, 24C128, 24C256, 24C512).
+These devices require two-byte address fields and are not supported.
+
+* Enable Writing. Again, no technical reason why not, but making it easy
+to change the contents of the EEPROMs (on DIMMs anyway) also makes it easy
+to disable the DIMMs (potentially preventing the computer from booting)
+until the values are restored somehow.
+
+Use:
+
+After inserting the module (and any other required SMBus/i2c modules), you
+should have some EEPROM directories in /sys/bus/i2c/devices/* of names such
+as "0-0050". Inside each of these is a series of files, the eeprom file
+contains the binary data from EEPROM.
diff --git a/Documentation/i2c/chips/max6875 b/Documentation/i2c/chips/max6875
new file mode 100644
index 0000000..10ca43c
--- /dev/null
+++ b/Documentation/i2c/chips/max6875
@@ -0,0 +1,108 @@
+Kernel driver max6875
+=====================
+
+Supported chips:
+ * Maxim MAX6874, MAX6875
+ Prefix: 'max6875'
+ Addresses scanned: None (see below)
+ Datasheet:
+ http://pdfserv.maxim-ic.com/en/ds/MAX6874-MAX6875.pdf
+
+Author: Ben Gardner <bgardner@wabtec.com>
+
+
+Description
+-----------
+
+The Maxim MAX6875 is an EEPROM-programmable power-supply sequencer/supervisor.
+It provides timed outputs that can be used as a watchdog, if properly wired.
+It also provides 512 bytes of user EEPROM.
+
+At reset, the MAX6875 reads the configuration EEPROM into its configuration
+registers. The chip then begins to operate according to the values in the
+registers.
+
+The Maxim MAX6874 is a similar, mostly compatible device, with more intputs
+and outputs:
+ vin gpi vout
+MAX6874 6 4 8
+MAX6875 4 3 5
+
+See the datasheet for more information.
+
+
+Sysfs entries
+-------------
+
+eeprom - 512 bytes of user-defined EEPROM space.
+
+
+General Remarks
+---------------
+
+Valid addresses for the MAX6875 are 0x50 and 0x52.
+Valid addresses for the MAX6874 are 0x50, 0x52, 0x54 and 0x56.
+The driver does not probe any address, so you must force the address.
+
+Example:
+$ modprobe max6875 force=0,0x50
+
+The MAX6874/MAX6875 ignores address bit 0, so this driver attaches to multiple
+addresses. For example, for address 0x50, it also reserves 0x51.
+The even-address instance is called 'max6875', the odd one is 'dummy'.
+
+
+Programming the chip using i2c-dev
+----------------------------------
+
+Use the i2c-dev interface to access and program the chips.
+Reads and writes are performed differently depending on the address range.
+
+The configuration registers are at addresses 0x00 - 0x45.
+Use i2c_smbus_write_byte_data() to write a register and
+i2c_smbus_read_byte_data() to read a register.
+The command is the register number.
+
+Examples:
+To write a 1 to register 0x45:
+ i2c_smbus_write_byte_data(fd, 0x45, 1);
+
+To read register 0x45:
+ value = i2c_smbus_read_byte_data(fd, 0x45);
+
+
+The configuration EEPROM is at addresses 0x8000 - 0x8045.
+The user EEPROM is at addresses 0x8100 - 0x82ff.
+
+Use i2c_smbus_write_word_data() to write a byte to EEPROM.
+
+The command is the upper byte of the address: 0x80, 0x81, or 0x82.
+The data word is the lower part of the address or'd with data << 8.
+ cmd = address >> 8;
+ val = (address & 0xff) | (data << 8);
+
+Example:
+To write 0x5a to address 0x8003:
+ i2c_smbus_write_word_data(fd, 0x80, 0x5a03);
+
+
+Reading data from the EEPROM is a little more complicated.
+Use i2c_smbus_write_byte_data() to set the read address and then
+i2c_smbus_read_byte() or i2c_smbus_read_i2c_block_data() to read the data.
+
+Example:
+To read data starting at offset 0x8100, first set the address:
+ i2c_smbus_write_byte_data(fd, 0x81, 0x00);
+
+And then read the data
+ value = i2c_smbus_read_byte(fd);
+
+ or
+
+ count = i2c_smbus_read_i2c_block_data(fd, 0x84, 16, buffer);
+
+The block read should read 16 bytes.
+0x84 is the block read command.
+
+See the datasheet for more details.
+
diff --git a/Documentation/i2c/chips/pca9539 b/Documentation/i2c/chips/pca9539
new file mode 100644
index 0000000..6aff890
--- /dev/null
+++ b/Documentation/i2c/chips/pca9539
@@ -0,0 +1,58 @@
+Kernel driver pca9539
+=====================
+
+NOTE: this driver is deprecated and will be dropped soon, use
+drivers/gpio/pca9539.c instead.
+
+Supported chips:
+ * Philips PCA9539
+ Prefix: 'pca9539'
+ Addresses scanned: none
+ Datasheet:
+ http://www.semiconductors.philips.com/acrobat/datasheets/PCA9539_2.pdf
+
+Author: Ben Gardner <bgardner@wabtec.com>
+
+
+Description
+-----------
+
+The Philips PCA9539 is a 16 bit low power I/O device.
+All 16 lines can be individually configured as an input or output.
+The input sense can also be inverted.
+The 16 lines are split between two bytes.
+
+
+Detection
+---------
+
+The PCA9539 is difficult to detect and not commonly found in PC machines,
+so you have to pass the I2C bus and address of the installed PCA9539
+devices explicitly to the driver at load time via the force=... parameter.
+
+
+Sysfs entries
+-------------
+
+Each is a byte that maps to the 8 I/O bits.
+A '0' suffix is for bits 0-7, while '1' is for bits 8-15.
+
+input[01] - read the current value
+output[01] - sets the output value
+direction[01] - direction of each bit: 1=input, 0=output
+invert[01] - toggle the input bit sense
+
+input reads the actual state of the line and is always available.
+The direction defaults to input for all channels.
+
+
+General Remarks
+---------------
+
+Note that each output, direction, and invert entry controls 8 lines.
+You should use the read, modify, write sequence.
+For example. to set output bit 0 of 1.
+ val=$(cat output0)
+ val=$(( $val | 1 ))
+ echo $val > output0
+
diff --git a/Documentation/i2c/chips/pcf8574 b/Documentation/i2c/chips/pcf8574
new file mode 100644
index 0000000..235815c
--- /dev/null
+++ b/Documentation/i2c/chips/pcf8574
@@ -0,0 +1,65 @@
+Kernel driver pcf8574
+=====================
+
+Supported chips:
+ * Philips PCF8574
+ Prefix: 'pcf8574'
+ Addresses scanned: none
+ Datasheet: Publicly available at the Philips Semiconductors website
+ http://www.semiconductors.philips.com/pip/PCF8574P.html
+
+ * Philips PCF8574A
+ Prefix: 'pcf8574a'
+ Addresses scanned: none
+ Datasheet: Publicly available at the Philips Semiconductors website
+ http://www.semiconductors.philips.com/pip/PCF8574P.html
+
+Authors:
+ Frodo Looijaard <frodol@dds.nl>,
+ Philip Edelbrock <phil@netroedge.com>,
+ Dan Eaton <dan.eaton@rocketlogix.com>,
+ Aurelien Jarno <aurelien@aurel32.net>,
+ Jean Delvare <khali@linux-fr.org>,
+
+
+Description
+-----------
+The PCF8574(A) is an 8-bit I/O expander for the I2C bus produced by Philips
+Semiconductors. It is designed to provide a byte I2C interface to up to 16
+separate devices (8 x PCF8574 and 8 x PCF8574A).
+
+This device consists of a quasi-bidirectional port. Each of the eight I/Os
+can be independently used as an input or output. To setup an I/O as an
+input, you have to write a 1 to the corresponding output.
+
+For more informations see the datasheet.
+
+
+Accessing PCF8574(A) via /sys interface
+-------------------------------------
+
+The PCF8574(A) is plainly impossible to detect ! Stupid chip.
+So, you have to pass the I2C bus and address of the installed PCF857A
+and PCF8574A devices explicitly to the driver at load time via the
+force=... parameter.
+
+On detection (i.e. insmod, modprobe et al.), directories are being
+created for each detected PCF8574(A):
+
+/sys/bus/i2c/devices/<0>-<1>/
+where <0> is the bus the chip was detected on (e. g. i2c-0)
+and <1> the chip address ([20..27] or [38..3f]):
+
+(example: /sys/bus/i2c/devices/1-0020/)
+
+Inside these directories, there are two files each:
+read and write (and one file with chip name).
+
+The read file is read-only. Reading gives you the current I/O input
+if the corresponding output is set as 1, otherwise the current output
+value, that is to say 0.
+
+The write file is read/write. Writing a value outputs it on the I/O
+port. Reading returns the last written value. As it is not possible
+to read this value from the chip, you need to write at least once to
+this file before you can read back from it.
diff --git a/Documentation/i2c/chips/pcf8575 b/Documentation/i2c/chips/pcf8575
new file mode 100644
index 0000000..40b268e
--- /dev/null
+++ b/Documentation/i2c/chips/pcf8575
@@ -0,0 +1,69 @@
+About the PCF8575 chip and the pcf8575 kernel driver
+====================================================
+
+The PCF8575 chip is produced by the following manufacturers:
+
+ * Philips NXP
+ http://www.nxp.com/#/pip/cb=[type=product,path=50807/41735/41850,final=PCF8575_3]|pip=[pip=PCF8575_3][0]
+
+ * Texas Instruments
+ http://focus.ti.com/docs/prod/folders/print/pcf8575.html
+
+
+Some vendors sell small PCB's with the PCF8575 mounted on it. You can connect
+such a board to a Linux host via e.g. an USB to I2C interface. Examples of
+PCB boards with a PCF8575:
+
+ * SFE Breakout Board for PCF8575 I2C Expander by RobotShop
+ http://www.robotshop.ca/home/products/robot-parts/electronics/adapters-converters/sfe-pcf8575-i2c-expander-board.html
+
+ * Breakout Board for PCF8575 I2C Expander by Spark Fun Electronics
+ http://www.sparkfun.com/commerce/product_info.php?products_id=8130
+
+
+Description
+-----------
+The PCF8575 chip is a 16-bit I/O expander for the I2C bus. Up to eight of
+these chips can be connected to the same I2C bus. You can find this
+chip on some custom designed hardware, but you won't find it on PC
+motherboards.
+
+The PCF8575 chip consists of a 16-bit quasi-bidirectional port and an I2C-bus
+interface. Each of the sixteen I/O's can be independently used as an input or
+an output. To set up an I/O pin as an input, you have to write a 1 to the
+corresponding output.
+
+For more information please see the datasheet.
+
+
+Detection
+---------
+
+There is no method known to detect whether a chip on a given I2C address is
+a PCF8575 or whether it is any other I2C device, so you have to pass the I2C
+bus and address of the installed PCF8575 devices explicitly to the driver at
+load time via the force=... parameter.
+
+/sys interface
+--------------
+
+For each address on which a PCF8575 chip was found or forced the following
+files will be created under /sys:
+* /sys/bus/i2c/devices/<bus>-<address>/read
+* /sys/bus/i2c/devices/<bus>-<address>/write
+where bus is the I2C bus number (0, 1, ...) and address is the four-digit
+hexadecimal representation of the 7-bit I2C address of the PCF8575
+(0020 .. 0027).
+
+The read file is read-only. Reading it will trigger an I2C read and will hence
+report the current input state for the pins configured as inputs, and the
+current output value for the pins configured as outputs.
+
+The write file is read-write. Writing a value to it will configure all pins
+as output for which the corresponding bit is zero. Reading the write file will
+return the value last written, or -EAGAIN if no value has yet been written to
+the write file.
+
+On module initialization the configuration of the chip is not changed -- the
+chip is left in the state it was already configured in through either power-up
+or through previous I2C write actions.
diff --git a/Documentation/i2c/chips/pcf8591 b/Documentation/i2c/chips/pcf8591
new file mode 100644
index 0000000..5628fcf
--- /dev/null
+++ b/Documentation/i2c/chips/pcf8591
@@ -0,0 +1,90 @@
+Kernel driver pcf8591
+=====================
+
+Supported chips:
+ * Philips PCF8591
+ Prefix: 'pcf8591'
+ Addresses scanned: I2C 0x48 - 0x4f
+ Datasheet: Publicly available at the Philips Semiconductor website
+ http://www.semiconductors.philips.com/pip/PCF8591P.html
+
+Authors:
+ Aurelien Jarno <aurelien@aurel32.net>
+ valuable contributions by Jan M. Sendler <sendler@sendler.de>,
+ Jean Delvare <khali@linux-fr.org>
+
+
+Description
+-----------
+The PCF8591 is an 8-bit A/D and D/A converter (4 analog inputs and one
+analog output) for the I2C bus produced by Philips Semiconductors. It
+is designed to provide a byte I2C interface to up to 4 separate devices.
+
+The PCF8591 has 4 analog inputs programmable as single-ended or
+differential inputs :
+- mode 0 : four single ended inputs
+ Pins AIN0 to AIN3 are single ended inputs for channels 0 to 3
+
+- mode 1 : three differential inputs
+ Pins AIN3 is the common negative differential input
+ Pins AIN0 to AIN2 are positive differential inputs for channels 0 to 2
+
+- mode 2 : single ended and differential mixed
+ Pins AIN0 and AIN1 are single ended inputs for channels 0 and 1
+ Pins AIN2 is the positive differential input for channel 3
+ Pins AIN3 is the negative differential input for channel 3
+
+- mode 3 : two differential inputs
+ Pins AIN0 is the positive differential input for channel 0
+ Pins AIN1 is the negative differential input for channel 0
+ Pins AIN2 is the positive differential input for channel 1
+ Pins AIN3 is the negative differential input for channel 1
+
+See the datasheet for details.
+
+Module parameters
+-----------------
+
+* input_mode int
+
+ Analog input mode:
+ 0 = four single ended inputs
+ 1 = three differential inputs
+ 2 = single ended and differential mixed
+ 3 = two differential inputs
+
+
+Accessing PCF8591 via /sys interface
+-------------------------------------
+
+! Be careful !
+The PCF8591 is plainly impossible to detect ! Stupid chip.
+So every chip with address in the interval [48..4f] is
+detected as PCF8591. If you have other chips in this address
+range, the workaround is to load this module after the one
+for your others chips.
+
+On detection (i.e. insmod, modprobe et al.), directories are being
+created for each detected PCF8591:
+
+/sys/bus/devices/<0>-<1>/
+where <0> is the bus the chip was detected on (e. g. i2c-0)
+and <1> the chip address ([48..4f])
+
+Inside these directories, there are such files:
+in0, in1, in2, in3, out0_enable, out0_output, name
+
+Name contains chip name.
+
+The in0, in1, in2 and in3 files are RO. Reading gives the value of the
+corresponding channel. Depending on the current analog inputs configuration,
+files in2 and/or in3 do not exist. Values range are from 0 to 255 for single
+ended inputs and -128 to +127 for differential inputs (8-bit ADC).
+
+The out0_enable file is RW. Reading gives "1" for analog output enabled and
+"0" for analog output disabled. Writing accepts "0" and "1" accordingly.
+
+The out0_output file is RW. Writing a number between 0 and 255 (8-bit DAC), send
+the value to the digital-to-analog converter. Note that a voltage will
+only appears on AOUT pin if aout0_enable equals 1. Reading returns the last
+value written.
diff --git a/Documentation/i2c/dev-interface b/Documentation/i2c/dev-interface
new file mode 100644
index 0000000..3e742ba
--- /dev/null
+++ b/Documentation/i2c/dev-interface
@@ -0,0 +1,214 @@
+Usually, i2c devices are controlled by a kernel driver. But it is also
+possible to access all devices on an adapter from userspace, through
+the /dev interface. You need to load module i2c-dev for this.
+
+Each registered i2c adapter gets a number, counting from 0. You can
+examine /sys/class/i2c-dev/ to see what number corresponds to which adapter.
+Alternatively, you can run "i2cdetect -l" to obtain a formated list of all
+i2c adapters present on your system at a given time. i2cdetect is part of
+the i2c-tools package.
+
+I2C device files are character device files with major device number 89
+and a minor device number corresponding to the number assigned as
+explained above. They should be called "i2c-%d" (i2c-0, i2c-1, ...,
+i2c-10, ...). All 256 minor device numbers are reserved for i2c.
+
+
+C example
+=========
+
+So let's say you want to access an i2c adapter from a C program. The
+first thing to do is "#include <linux/i2c-dev.h>". Please note that
+there are two files named "i2c-dev.h" out there, one is distributed
+with the Linux kernel and is meant to be included from kernel
+driver code, the other one is distributed with i2c-tools and is
+meant to be included from user-space programs. You obviously want
+the second one here.
+
+Now, you have to decide which adapter you want to access. You should
+inspect /sys/class/i2c-dev/ or run "i2cdetect -l" to decide this.
+Adapter numbers are assigned somewhat dynamically, so you can not
+assume much about them. They can even change from one boot to the next.
+
+Next thing, open the device file, as follows:
+
+ int file;
+ int adapter_nr = 2; /* probably dynamically determined */
+ char filename[20];
+
+ snprintf(filename, 19, "/dev/i2c-%d", adapter_nr);
+ file = open(filename, O_RDWR);
+ if (file < 0) {
+ /* ERROR HANDLING; you can check errno to see what went wrong */
+ exit(1);
+ }
+
+When you have opened the device, you must specify with what device
+address you want to communicate:
+
+ int addr = 0x40; /* The I2C address */
+
+ if (ioctl(file, I2C_SLAVE, addr) < 0) {
+ /* ERROR HANDLING; you can check errno to see what went wrong */
+ exit(1);
+ }
+
+Well, you are all set up now. You can now use SMBus commands or plain
+I2C to communicate with your device. SMBus commands are preferred if
+the device supports them. Both are illustrated below.
+
+ __u8 register = 0x10; /* Device register to access */
+ __s32 res;
+ char buf[10];
+
+ /* Using SMBus commands */
+ res = i2c_smbus_read_word_data(file, register);
+ if (res < 0) {
+ /* ERROR HANDLING: i2c transaction failed */
+ } else {
+ /* res contains the read word */
+ }
+
+ /* Using I2C Write, equivalent of
+ i2c_smbus_write_word_data(file, register, 0x6543) */
+ buf[0] = register;
+ buf[1] = 0x43;
+ buf[2] = 0x65;
+ if (write(file, buf, 3) ! =3) {
+ /* ERROR HANDLING: i2c transaction failed */
+ }
+
+ /* Using I2C Read, equivalent of i2c_smbus_read_byte(file) */
+ if (read(file, buf, 1) != 1) {
+ /* ERROR HANDLING: i2c transaction failed */
+ } else {
+ /* buf[0] contains the read byte */
+ }
+
+Note that only a subset of the I2C and SMBus protocols can be achieved by
+the means of read() and write() calls. In particular, so-called combined
+transactions (mixing read and write messages in the same transaction)
+aren't supported. For this reason, this interface is almost never used by
+user-space programs.
+
+IMPORTANT: because of the use of inline functions, you *have* to use
+'-O' or some variation when you compile your program!
+
+
+Full interface description
+==========================
+
+The following IOCTLs are defined:
+
+ioctl(file, I2C_SLAVE, long addr)
+ Change slave address. The address is passed in the 7 lower bits of the
+ argument (except for 10 bit addresses, passed in the 10 lower bits in this
+ case).
+
+ioctl(file, I2C_TENBIT, long select)
+ Selects ten bit addresses if select not equals 0, selects normal 7 bit
+ addresses if select equals 0. Default 0. This request is only valid
+ if the adapter has I2C_FUNC_10BIT_ADDR.
+
+ioctl(file, I2C_PEC, long select)
+ Selects SMBus PEC (packet error checking) generation and verification
+ if select not equals 0, disables if select equals 0. Default 0.
+ Used only for SMBus transactions. This request only has an effect if the
+ the adapter has I2C_FUNC_SMBUS_PEC; it is still safe if not, it just
+ doesn't have any effect.
+
+ioctl(file, I2C_FUNCS, unsigned long *funcs)
+ Gets the adapter functionality and puts it in *funcs.
+
+ioctl(file, I2C_RDWR, struct i2c_rdwr_ioctl_data *msgset)
+ Do combined read/write transaction without stop in between.
+ Only valid if the adapter has I2C_FUNC_I2C. The argument is
+ a pointer to a
+
+ struct i2c_rdwr_ioctl_data {
+ struct i2c_msg *msgs; /* ptr to array of simple messages */
+ int nmsgs; /* number of messages to exchange */
+ }
+
+ The msgs[] themselves contain further pointers into data buffers.
+ The function will write or read data to or from that buffers depending
+ on whether the I2C_M_RD flag is set in a particular message or not.
+ The slave address and whether to use ten bit address mode has to be
+ set in each message, overriding the values set with the above ioctl's.
+
+ioctl(file, I2C_SMBUS, struct i2c_smbus_ioctl_data *args)
+ Not meant to be called directly; instead, use the access functions
+ below.
+
+You can do plain i2c transactions by using read(2) and write(2) calls.
+You do not need to pass the address byte; instead, set it through
+ioctl I2C_SLAVE before you try to access the device.
+
+You can do SMBus level transactions (see documentation file smbus-protocol
+for details) through the following functions:
+ __s32 i2c_smbus_write_quick(int file, __u8 value);
+ __s32 i2c_smbus_read_byte(int file);
+ __s32 i2c_smbus_write_byte(int file, __u8 value);
+ __s32 i2c_smbus_read_byte_data(int file, __u8 command);
+ __s32 i2c_smbus_write_byte_data(int file, __u8 command, __u8 value);
+ __s32 i2c_smbus_read_word_data(int file, __u8 command);
+ __s32 i2c_smbus_write_word_data(int file, __u8 command, __u16 value);
+ __s32 i2c_smbus_process_call(int file, __u8 command, __u16 value);
+ __s32 i2c_smbus_read_block_data(int file, __u8 command, __u8 *values);
+ __s32 i2c_smbus_write_block_data(int file, __u8 command, __u8 length,
+ __u8 *values);
+All these transactions return -1 on failure; you can read errno to see
+what happened. The 'write' transactions return 0 on success; the
+'read' transactions return the read value, except for read_block, which
+returns the number of values read. The block buffers need not be longer
+than 32 bytes.
+
+The above functions are all inline functions, that resolve to calls to
+the i2c_smbus_access function, that on its turn calls a specific ioctl
+with the data in a specific format. Read the source code if you
+want to know what happens behind the screens.
+
+
+Implementation details
+======================
+
+For the interested, here's the code flow which happens inside the kernel
+when you use the /dev interface to I2C:
+
+1* Your program opens /dev/i2c-N and calls ioctl() on it, as described in
+section "C example" above.
+
+2* These open() and ioctl() calls are handled by the i2c-dev kernel
+driver: see i2c-dev.c:i2cdev_open() and i2c-dev.c:i2cdev_ioctl(),
+respectively. You can think of i2c-dev as a generic I2C chip driver
+that can be programmed from user-space.
+
+3* Some ioctl() calls are for administrative tasks and are handled by
+i2c-dev directly. Examples include I2C_SLAVE (set the address of the
+device you want to access) and I2C_PEC (enable or disable SMBus error
+checking on future transactions.)
+
+4* Other ioctl() calls are converted to in-kernel function calls by
+i2c-dev. Examples include I2C_FUNCS, which queries the I2C adapter
+functionality using i2c.h:i2c_get_functionality(), and I2C_SMBUS, which
+performs an SMBus transaction using i2c-core.c:i2c_smbus_xfer().
+
+The i2c-dev driver is responsible for checking all the parameters that
+come from user-space for validity. After this point, there is no
+difference between these calls that came from user-space through i2c-dev
+and calls that would have been performed by kernel I2C chip drivers
+directly. This means that I2C bus drivers don't need to implement
+anything special to support access from user-space.
+
+5* These i2c-core.c/i2c.h functions are wrappers to the actual
+implementation of your I2C bus driver. Each adapter must declare
+callback functions implementing these standard calls.
+i2c.h:i2c_get_functionality() calls i2c_adapter.algo->functionality(),
+while i2c-core.c:i2c_smbus_xfer() calls either
+adapter.algo->smbus_xfer() if it is implemented, or if not,
+i2c-core.c:i2c_smbus_xfer_emulated() which in turn calls
+i2c_adapter.algo->master_xfer().
+
+After your I2C bus driver has processed these requests, execution runs
+up the call chain, with almost no processing done, except by i2c-dev to
+package the returned data, if any, in suitable format for the ioctl.
diff --git a/Documentation/i2c/fault-codes b/Documentation/i2c/fault-codes
new file mode 100644
index 0000000..045765c
--- /dev/null
+++ b/Documentation/i2c/fault-codes
@@ -0,0 +1,127 @@
+This is a summary of the most important conventions for use of fault
+codes in the I2C/SMBus stack.
+
+
+A "Fault" is not always an "Error"
+----------------------------------
+Not all fault reports imply errors; "page faults" should be a familiar
+example. Software often retries idempotent operations after transient
+faults. There may be fancier recovery schemes that are appropriate in
+some cases, such as re-initializing (and maybe resetting). After such
+recovery, triggered by a fault report, there is no error.
+
+In a similar way, sometimes a "fault" code just reports one defined
+result for an operation ... it doesn't indicate that anything is wrong
+at all, just that the outcome wasn't on the "golden path".
+
+In short, your I2C driver code may need to know these codes in order
+to respond correctly. Other code may need to rely on YOUR code reporting
+the right fault code, so that it can (in turn) behave correctly.
+
+
+I2C and SMBus fault codes
+-------------------------
+These are returned as negative numbers from most calls, with zero or
+some positive number indicating a non-fault return. The specific
+numbers associated with these symbols differ between architectures,
+though most Linux systems use <asm-generic/errno*.h> numbering.
+
+Note that the descriptions here are not exhaustive. There are other
+codes that may be returned, and other cases where these codes should
+be returned. However, drivers should not return other codes for these
+cases (unless the hardware doesn't provide unique fault reports).
+
+Also, codes returned by adapter probe methods follow rules which are
+specific to their host bus (such as PCI, or the platform bus).
+
+
+EAGAIN
+ Returned by I2C adapters when they lose arbitration in master
+ transmit mode: some other master was transmitting different
+ data at the same time.
+
+ Also returned when trying to invoke an I2C operation in an
+ atomic context, when some task is already using that I2C bus
+ to execute some other operation.
+
+EBADMSG
+ Returned by SMBus logic when an invalid Packet Error Code byte
+ is received. This code is a CRC covering all bytes in the
+ transaction, and is sent before the terminating STOP. This
+ fault is only reported on read transactions; the SMBus slave
+ may have a way to report PEC mismatches on writes from the
+ host. Note that even if PECs are in use, you should not rely
+ on these as the only way to detect incorrect data transfers.
+
+EBUSY
+ Returned by SMBus adapters when the bus was busy for longer
+ than allowed. This usually indicates some device (maybe the
+ SMBus adapter) needs some fault recovery (such as resetting),
+ or that the reset was attempted but failed.
+
+EINVAL
+ This rather vague error means an invalid parameter has been
+ detected before any I/O operation was started. Use a more
+ specific fault code when you can.
+
+ One example would be a driver trying an SMBus Block Write
+ with block size outside the range of 1-32 bytes.
+
+EIO
+ This rather vague error means something went wrong when
+ performing an I/O operation. Use a more specific fault
+ code when you can.
+
+ENODEV
+ Returned by driver probe() methods. This is a bit more
+ specific than ENXIO, implying the problem isn't with the
+ address, but with the device found there. Driver probes
+ may verify the device returns *correct* responses, and
+ return this as appropriate. (The driver core will warn
+ about probe faults other than ENXIO and ENODEV.)
+
+ENOMEM
+ Returned by any component that can't allocate memory when
+ it needs to do so.
+
+ENXIO
+ Returned by I2C adapters to indicate that the address phase
+ of a transfer didn't get an ACK. While it might just mean
+ an I2C device was temporarily not responding, usually it
+ means there's nothing listening at that address.
+
+ Returned by driver probe() methods to indicate that they
+ found no device to bind to. (ENODEV may also be used.)
+
+EOPNOTSUPP
+ Returned by an adapter when asked to perform an operation
+ that it doesn't, or can't, support.
+
+ For example, this would be returned when an adapter that
+ doesn't support SMBus block transfers is asked to execute
+ one. In that case, the driver making that request should
+ have verified that functionality was supported before it
+ made that block transfer request.
+
+ Similarly, if an I2C adapter can't execute all legal I2C
+ messages, it should return this when asked to perform a
+ transaction it can't. (These limitations can't be seen in
+ the adapter's functionality mask, since the assumption is
+ that if an adapter supports I2C it supports all of I2C.)
+
+EPROTO
+ Returned when slave does not conform to the relevant I2C
+ or SMBus (or chip-specific) protocol specifications. One
+ case is when the length of an SMBus block data response
+ (from the SMBus slave) is outside the range 1-32 bytes.
+
+ETIMEDOUT
+ This is returned by drivers when an operation took too much
+ time, and was aborted before it completed.
+
+ SMBus adapters may return it when an operation took more
+ time than allowed by the SMBus specification; for example,
+ when a slave stretches clocks too far. I2C has no such
+ timeouts, but it's normal for I2C adapters to impose some
+ arbitrary limits (much longer than SMBus!) too.
+
diff --git a/Documentation/i2c/functionality b/Documentation/i2c/functionality
new file mode 100644
index 0000000..42c17c1
--- /dev/null
+++ b/Documentation/i2c/functionality
@@ -0,0 +1,145 @@
+INTRODUCTION
+------------
+
+Because not every I2C or SMBus adapter implements everything in the
+I2C specifications, a client can not trust that everything it needs
+is implemented when it is given the option to attach to an adapter:
+the client needs some way to check whether an adapter has the needed
+functionality.
+
+
+FUNCTIONALITY CONSTANTS
+-----------------------
+
+For the most up-to-date list of functionality constants, please check
+<linux/i2c.h>!
+
+ I2C_FUNC_I2C Plain i2c-level commands (Pure SMBus
+ adapters typically can not do these)
+ I2C_FUNC_10BIT_ADDR Handles the 10-bit address extensions
+ I2C_FUNC_PROTOCOL_MANGLING Knows about the I2C_M_IGNORE_NAK,
+ I2C_M_REV_DIR_ADDR, I2C_M_NOSTART and
+ I2C_M_NO_RD_ACK flags (which modify the
+ I2C protocol!)
+ I2C_FUNC_SMBUS_QUICK Handles the SMBus write_quick command
+ I2C_FUNC_SMBUS_READ_BYTE Handles the SMBus read_byte command
+ I2C_FUNC_SMBUS_WRITE_BYTE Handles the SMBus write_byte command
+ I2C_FUNC_SMBUS_READ_BYTE_DATA Handles the SMBus read_byte_data command
+ I2C_FUNC_SMBUS_WRITE_BYTE_DATA Handles the SMBus write_byte_data command
+ I2C_FUNC_SMBUS_READ_WORD_DATA Handles the SMBus read_word_data command
+ I2C_FUNC_SMBUS_WRITE_WORD_DATA Handles the SMBus write_byte_data command
+ I2C_FUNC_SMBUS_PROC_CALL Handles the SMBus process_call command
+ I2C_FUNC_SMBUS_READ_BLOCK_DATA Handles the SMBus read_block_data command
+ I2C_FUNC_SMBUS_WRITE_BLOCK_DATA Handles the SMBus write_block_data command
+ I2C_FUNC_SMBUS_READ_I2C_BLOCK Handles the SMBus read_i2c_block_data command
+ I2C_FUNC_SMBUS_WRITE_I2C_BLOCK Handles the SMBus write_i2c_block_data command
+
+A few combinations of the above flags are also defined for your convenience:
+
+ I2C_FUNC_SMBUS_BYTE Handles the SMBus read_byte
+ and write_byte commands
+ I2C_FUNC_SMBUS_BYTE_DATA Handles the SMBus read_byte_data
+ and write_byte_data commands
+ I2C_FUNC_SMBUS_WORD_DATA Handles the SMBus read_word_data
+ and write_word_data commands
+ I2C_FUNC_SMBUS_BLOCK_DATA Handles the SMBus read_block_data
+ and write_block_data commands
+ I2C_FUNC_SMBUS_I2C_BLOCK Handles the SMBus read_i2c_block_data
+ and write_i2c_block_data commands
+ I2C_FUNC_SMBUS_EMUL Handles all SMBus commands than can be
+ emulated by a real I2C adapter (using
+ the transparent emulation layer)
+
+
+ADAPTER IMPLEMENTATION
+----------------------
+
+When you write a new adapter driver, you will have to implement a
+function callback `functionality'. Typical implementations are given
+below.
+
+A typical SMBus-only adapter would list all the SMBus transactions it
+supports. This example comes from the i2c-piix4 driver:
+
+ static u32 piix4_func(struct i2c_adapter *adapter)
+ {
+ return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
+ I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA |
+ I2C_FUNC_SMBUS_BLOCK_DATA;
+ }
+
+A typical full-I2C adapter would use the following (from the i2c-pxa
+driver):
+
+ static u32 i2c_pxa_functionality(struct i2c_adapter *adap)
+ {
+ return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
+ }
+
+I2C_FUNC_SMBUS_EMUL includes all the SMBus transactions (with the
+addition of I2C block transactions) which i2c-core can emulate using
+I2C_FUNC_I2C without any help from the adapter driver. The idea is
+to let the client drivers check for the support of SMBus functions
+without having to care whether the said functions are implemented in
+hardware by the adapter, or emulated in software by i2c-core on top
+of an I2C adapter.
+
+
+CLIENT CHECKING
+---------------
+
+Before a client tries to attach to an adapter, or even do tests to check
+whether one of the devices it supports is present on an adapter, it should
+check whether the needed functionality is present. The typical way to do
+this is (from the lm75 driver):
+
+ static int lm75_detect(...)
+ {
+ (...)
+ if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA |
+ I2C_FUNC_SMBUS_WORD_DATA))
+ goto exit;
+ (...)
+ }
+
+Here, the lm75 driver checks if the adapter can do both SMBus byte data
+and SMBus word data transactions. If not, then the driver won't work on
+this adapter and there's no point in going on. If the check above is
+successful, then the driver knows that it can call the following
+functions: i2c_smbus_read_byte_data(), i2c_smbus_write_byte_data(),
+i2c_smbus_read_word_data() and i2c_smbus_write_word_data(). As a rule of
+thumb, the functionality constants you test for with
+i2c_check_functionality() should match exactly the i2c_smbus_* functions
+which you driver is calling.
+
+Note that the check above doesn't tell whether the functionalities are
+implemented in hardware by the underlying adapter or emulated in
+software by i2c-core. Client drivers don't have to care about this, as
+i2c-core will transparently implement SMBus transactions on top of I2C
+adapters.
+
+
+CHECKING THROUGH /DEV
+---------------------
+
+If you try to access an adapter from a userspace program, you will have
+to use the /dev interface. You will still have to check whether the
+functionality you need is supported, of course. This is done using
+the I2C_FUNCS ioctl. An example, adapted from the i2cdetect program, is
+below:
+
+ int file;
+ if (file = open("/dev/i2c-0", O_RDWR) < 0) {
+ /* Some kind of error handling */
+ exit(1);
+ }
+ if (ioctl(file, I2C_FUNCS, &funcs) < 0) {
+ /* Some kind of error handling */
+ exit(1);
+ }
+ if (!(funcs & I2C_FUNC_SMBUS_QUICK)) {
+ /* Oops, the needed functionality (SMBus write_quick function) is
+ not available! */
+ exit(1);
+ }
+ /* Now it is safe to use the SMBus write_quick command */
diff --git a/Documentation/i2c/i2c-protocol b/Documentation/i2c/i2c-protocol
new file mode 100644
index 0000000..10518dd
--- /dev/null
+++ b/Documentation/i2c/i2c-protocol
@@ -0,0 +1,76 @@
+This document describes the i2c protocol. Or will, when it is finished :-)
+
+Key to symbols
+==============
+
+S (1 bit) : Start bit
+P (1 bit) : Stop bit
+Rd/Wr (1 bit) : Read/Write bit. Rd equals 1, Wr equals 0.
+A, NA (1 bit) : Accept and reverse accept bit.
+Addr (7 bits): I2C 7 bit address. Note that this can be expanded as usual to
+ get a 10 bit I2C address.
+Comm (8 bits): Command byte, a data byte which often selects a register on
+ the device.
+Data (8 bits): A plain data byte. Sometimes, I write DataLow, DataHigh
+ for 16 bit data.
+Count (8 bits): A data byte containing the length of a block operation.
+
+[..]: Data sent by I2C device, as opposed to data sent by the host adapter.
+
+
+Simple send transaction
+======================
+
+This corresponds to i2c_master_send.
+
+ S Addr Wr [A] Data [A] Data [A] ... [A] Data [A] P
+
+
+Simple receive transaction
+===========================
+
+This corresponds to i2c_master_recv
+
+ S Addr Rd [A] [Data] A [Data] A ... A [Data] NA P
+
+
+Combined transactions
+====================
+
+This corresponds to i2c_transfer
+
+They are just like the above transactions, but instead of a stop bit P
+a start bit S is sent and the transaction continues. An example of
+a byte read, followed by a byte write:
+
+ S Addr Rd [A] [Data] NA S Addr Wr [A] Data [A] P
+
+
+Modified transactions
+=====================
+
+We have found some I2C devices that needs the following modifications:
+
+ Flag I2C_M_NOSTART:
+ In a combined transaction, no 'S Addr Wr/Rd [A]' is generated at some
+ point. For example, setting I2C_M_NOSTART on the second partial message
+ generates something like:
+ S Addr Rd [A] [Data] NA Data [A] P
+ If you set the I2C_M_NOSTART variable for the first partial message,
+ we do not generate Addr, but we do generate the startbit S. This will
+ probably confuse all other clients on your bus, so don't try this.
+
+ Flags I2C_M_REV_DIR_ADDR
+ This toggles the Rd/Wr flag. That is, if you want to do a write, but
+ need to emit an Rd instead of a Wr, or vice versa, you set this
+ flag. For example:
+ S Addr Rd [A] Data [A] Data [A] ... [A] Data [A] P
+
+ Flags I2C_M_IGNORE_NAK
+ Normally message is interrupted immediately if there is [NA] from the
+ client. Setting this flag treats any [NA] as [A], and all of
+ message is sent.
+ These messages may still fail to SCL lo->hi timeout.
+
+ Flags I2C_M_NO_RD_ACK
+ In a read message, master A/NA bit is skipped.
diff --git a/Documentation/i2c/i2c-stub b/Documentation/i2c/i2c-stub
new file mode 100644
index 0000000..0d8be1c
--- /dev/null
+++ b/Documentation/i2c/i2c-stub
@@ -0,0 +1,47 @@
+MODULE: i2c-stub
+
+DESCRIPTION:
+
+This module is a very simple fake I2C/SMBus driver. It implements four
+types of SMBus commands: write quick, (r/w) byte, (r/w) byte data, and
+(r/w) word data.
+
+You need to provide chip addresses as a module parameter when loading this
+driver, which will then only react to SMBus commands to these addresses.
+
+No hardware is needed nor associated with this module. It will accept write
+quick commands to the specified addresses; it will respond to the other
+commands (also to the specified addresses) by reading from or writing to
+arrays in memory. It will also spam the kernel logs for every command it
+handles.
+
+A pointer register with auto-increment is implemented for all byte
+operations. This allows for continuous byte reads like those supported by
+EEPROMs, among others.
+
+The typical use-case is like this:
+ 1. load this module
+ 2. use i2cset (from lm_sensors project) to pre-load some data
+ 3. load the target sensors chip driver module
+ 4. observe its behavior in the kernel log
+
+There's a script named i2c-stub-from-dump in the i2c-tools package which
+can load register values automatically from a chip dump.
+
+PARAMETERS:
+
+int chip_addr[10]:
+ The SMBus addresses to emulate chips at.
+
+CAVEATS:
+
+If your target driver polls some byte or word waiting for it to change, the
+stub could lock it up. Use i2cset to unlock it.
+
+If the hardware for your driver has banked registers (e.g. Winbond sensors
+chips) this module will not work well - although it could be extended to
+support that pretty easily.
+
+If you spam it hard enough, printk can be lossy. This module really wants
+something like relayfs.
+
diff --git a/Documentation/i2c/smbus-protocol b/Documentation/i2c/smbus-protocol
new file mode 100644
index 0000000..9df4744
--- /dev/null
+++ b/Documentation/i2c/smbus-protocol
@@ -0,0 +1,227 @@
+SMBus Protocol Summary
+======================
+
+The following is a summary of the SMBus protocol. It applies to
+all revisions of the protocol (1.0, 1.1, and 2.0).
+Certain protocol features which are not supported by
+this package are briefly described at the end of this document.
+
+Some adapters understand only the SMBus (System Management Bus) protocol,
+which is a subset from the I2C protocol. Fortunately, many devices use
+only the same subset, which makes it possible to put them on an SMBus.
+
+If you write a driver for some I2C device, please try to use the SMBus
+commands if at all possible (if the device uses only that subset of the
+I2C protocol). This makes it possible to use the device driver on both
+SMBus adapters and I2C adapters (the SMBus command set is automatically
+translated to I2C on I2C adapters, but plain I2C commands can not be
+handled at all on most pure SMBus adapters).
+
+Below is a list of SMBus protocol operations, and the functions executing
+them. Note that the names used in the SMBus protocol specifications usually
+don't match these function names. For some of the operations which pass a
+single data byte, the functions using SMBus protocol operation names execute
+a different protocol operation entirely.
+
+
+Key to symbols
+==============
+
+S (1 bit) : Start bit
+P (1 bit) : Stop bit
+Rd/Wr (1 bit) : Read/Write bit. Rd equals 1, Wr equals 0.
+A, NA (1 bit) : Accept and reverse accept bit.
+Addr (7 bits): I2C 7 bit address. Note that this can be expanded as usual to
+ get a 10 bit I2C address.
+Comm (8 bits): Command byte, a data byte which often selects a register on
+ the device.
+Data (8 bits): A plain data byte. Sometimes, I write DataLow, DataHigh
+ for 16 bit data.
+Count (8 bits): A data byte containing the length of a block operation.
+
+[..]: Data sent by I2C device, as opposed to data sent by the host adapter.
+
+
+SMBus Quick Command
+===================
+
+This sends a single bit to the device, at the place of the Rd/Wr bit.
+
+A Addr Rd/Wr [A] P
+
+
+SMBus Receive Byte: i2c_smbus_read_byte()
+==========================================
+
+This reads a single byte from a device, without specifying a device
+register. Some devices are so simple that this interface is enough; for
+others, it is a shorthand if you want to read the same register as in
+the previous SMBus command.
+
+S Addr Rd [A] [Data] NA P
+
+
+SMBus Send Byte: i2c_smbus_write_byte()
+========================================
+
+This operation is the reverse of Receive Byte: it sends a single byte
+to a device. See Receive Byte for more information.
+
+S Addr Wr [A] Data [A] P
+
+
+SMBus Read Byte: i2c_smbus_read_byte_data()
+============================================
+
+This reads a single byte from a device, from a designated register.
+The register is specified through the Comm byte.
+
+S Addr Wr [A] Comm [A] S Addr Rd [A] [Data] NA P
+
+
+SMBus Read Word: i2c_smbus_read_word_data()
+============================================
+
+This operation is very like Read Byte; again, data is read from a
+device, from a designated register that is specified through the Comm
+byte. But this time, the data is a complete word (16 bits).
+
+S Addr Wr [A] Comm [A] S Addr Rd [A] [DataLow] A [DataHigh] NA P
+
+
+SMBus Write Byte: i2c_smbus_write_byte_data()
+==============================================
+
+This writes a single byte to a device, to a designated register. The
+register is specified through the Comm byte. This is the opposite of
+the Read Byte operation.
+
+S Addr Wr [A] Comm [A] Data [A] P
+
+
+SMBus Write Word: i2c_smbus_write_word_data()
+==============================================
+
+This is the opposite of the Read Word operation. 16 bits
+of data is written to a device, to the designated register that is
+specified through the Comm byte.
+
+S Addr Wr [A] Comm [A] DataLow [A] DataHigh [A] P
+
+
+SMBus Process Call: i2c_smbus_process_call()
+=============================================
+
+This command selects a device register (through the Comm byte), sends
+16 bits of data to it, and reads 16 bits of data in return.
+
+S Addr Wr [A] Comm [A] DataLow [A] DataHigh [A]
+ S Addr Rd [A] [DataLow] A [DataHigh] NA P
+
+
+SMBus Block Read: i2c_smbus_read_block_data()
+==============================================
+
+This command reads a block of up to 32 bytes from a device, from a
+designated register that is specified through the Comm byte. The amount
+of data is specified by the device in the Count byte.
+
+S Addr Wr [A] Comm [A]
+ S Addr Rd [A] [Count] A [Data] A [Data] A ... A [Data] NA P
+
+
+SMBus Block Write: i2c_smbus_write_block_data()
+================================================
+
+The opposite of the Block Read command, this writes up to 32 bytes to
+a device, to a designated register that is specified through the
+Comm byte. The amount of data is specified in the Count byte.
+
+S Addr Wr [A] Comm [A] Count [A] Data [A] Data [A] ... [A] Data [A] P
+
+
+SMBus Block Write - Block Read Process Call
+===========================================
+
+SMBus Block Write - Block Read Process Call was introduced in
+Revision 2.0 of the specification.
+
+This command selects a device register (through the Comm byte), sends
+1 to 31 bytes of data to it, and reads 1 to 31 bytes of data in return.
+
+S Addr Wr [A] Comm [A] Count [A] Data [A] ...
+ S Addr Rd [A] [Count] A [Data] ... A P
+
+
+SMBus Host Notify
+=================
+
+This command is sent from a SMBus device acting as a master to the
+SMBus host acting as a slave.
+It is the same form as Write Word, with the command code replaced by the
+alerting device's address.
+
+[S] [HostAddr] [Wr] A [DevAddr] A [DataLow] A [DataHigh] A [P]
+
+
+Packet Error Checking (PEC)
+===========================
+
+Packet Error Checking was introduced in Revision 1.1 of the specification.
+
+PEC adds a CRC-8 error-checking byte to transfers using it, immediately
+before the terminating STOP.
+
+
+Address Resolution Protocol (ARP)
+=================================
+
+The Address Resolution Protocol was introduced in Revision 2.0 of
+the specification. It is a higher-layer protocol which uses the
+messages above.
+
+ARP adds device enumeration and dynamic address assignment to
+the protocol. All ARP communications use slave address 0x61 and
+require PEC checksums.
+
+
+I2C Block Transactions
+======================
+
+The following I2C block transactions are supported by the
+SMBus layer and are described here for completeness.
+They are *NOT* defined by the SMBus specification.
+
+I2C block transactions do not limit the number of bytes transferred
+but the SMBus layer places a limit of 32 bytes.
+
+
+I2C Block Read: i2c_smbus_read_i2c_block_data()
+================================================
+
+This command reads a block of bytes from a device, from a
+designated register that is specified through the Comm byte.
+
+S Addr Wr [A] Comm [A]
+ S Addr Rd [A] [Data] A [Data] A ... A [Data] NA P
+
+
+I2C Block Read (2 Comm bytes)
+=============================
+
+This command reads a block of bytes from a device, from a
+designated register that is specified through the two Comm bytes.
+
+S Addr Wr [A] Comm1 [A] Comm2 [A]
+ S Addr Rd [A] [Data] A [Data] A ... A [Data] NA P
+
+
+I2C Block Write: i2c_smbus_write_i2c_block_data()
+==================================================
+
+The opposite of the Block Read command, this writes bytes to
+a device, to a designated register that is specified through the
+Comm byte. Note that command lengths of 0, 2, or more bytes are
+supported as they are indistinguishable from data.
+
+S Addr Wr [A] Comm [A] Data [A] Data [A] ... [A] Data [A] P
diff --git a/Documentation/i2c/summary b/Documentation/i2c/summary
new file mode 100644
index 0000000..13ab076
--- /dev/null
+++ b/Documentation/i2c/summary
@@ -0,0 +1,47 @@
+I2C and SMBus
+=============
+
+I2C (pronounce: I squared C) is a protocol developed by Philips. It is a
+slow two-wire protocol (variable speed, up to 400 kHz), with a high speed
+extension (3.4 MHz). It provides an inexpensive bus for connecting many
+types of devices with infrequent or low bandwidth communications needs.
+I2C is widely used with embedded systems. Some systems use variants that
+don't meet branding requirements, and so are not advertised as being I2C.
+
+SMBus (System Management Bus) is based on the I2C protocol, and is mostly
+a subset of I2C protocols and signaling. Many I2C devices will work on an
+SMBus, but some SMBus protocols add semantics beyond what is required to
+achieve I2C branding. Modern PC mainboards rely on SMBus. The most common
+devices connected through SMBus are RAM modules configured using I2C EEPROMs,
+and hardware monitoring chips.
+
+Because the SMBus is mostly a subset of the generalized I2C bus, we can
+use its protocols on many I2C systems. However, there are systems that don't
+meet both SMBus and I2C electrical constraints; and others which can't
+implement all the common SMBus protocol semantics or messages.
+
+
+Terminology
+===========
+
+When we talk about I2C, we use the following terms:
+ Bus -> Algorithm
+ Adapter
+ Device -> Driver
+ Client
+
+An Algorithm driver contains general code that can be used for a whole class
+of I2C adapters. Each specific adapter driver either depends on one algorithm
+driver, or includes its own implementation.
+
+A Driver driver (yes, this sounds ridiculous, sorry) contains the general
+code to access some type of device. Each detected device gets its own
+data in the Client structure. Usually, Driver and Client are more closely
+integrated than Algorithm and Adapter.
+
+For a given configuration, you will need a driver for your I2C bus, and
+drivers for your I2C devices (usually one driver for each device).
+
+At this time, Linux only operates I2C (or SMBus) in master mode; you can't
+use these APIs to make a Linux system behave as a slave/device, either to
+speak a custom protocol or to emulate some other device.
diff --git a/Documentation/i2c/ten-bit-addresses b/Documentation/i2c/ten-bit-addresses
new file mode 100644
index 0000000..200074f
--- /dev/null
+++ b/Documentation/i2c/ten-bit-addresses
@@ -0,0 +1,22 @@
+The I2C protocol knows about two kinds of device addresses: normal 7 bit
+addresses, and an extended set of 10 bit addresses. The sets of addresses
+do not intersect: the 7 bit address 0x10 is not the same as the 10 bit
+address 0x10 (though a single device could respond to both of them). You
+select a 10 bit address by adding an extra byte after the address
+byte:
+ S Addr7 Rd/Wr ....
+becomes
+ S 11110 Addr10 Rd/Wr
+S is the start bit, Rd/Wr the read/write bit, and if you count the number
+of bits, you will see the there are 8 after the S bit for 7 bit addresses,
+and 16 after the S bit for 10 bit addresses.
+
+WARNING! The current 10 bit address support is EXPERIMENTAL. There are
+several places in the code that will cause SEVERE PROBLEMS with 10 bit
+addresses, even though there is some basic handling and hooks. Also,
+almost no supported adapter handles the 10 bit addresses correctly.
+
+As soon as a real 10 bit address device is spotted 'in the wild', we
+can and will add proper support. Right now, 10 bit address devices
+are defined by the I2C protocol, but we have never seen a single device
+which supports them.
diff --git a/Documentation/i2c/upgrading-clients b/Documentation/i2c/upgrading-clients
new file mode 100644
index 0000000..9a45f9b
--- /dev/null
+++ b/Documentation/i2c/upgrading-clients
@@ -0,0 +1,281 @@
+Upgrading I2C Drivers to the new 2.6 Driver Model
+=================================================
+
+Ben Dooks <ben-linux@fluff.org>
+
+Introduction
+------------
+
+This guide outlines how to alter existing Linux 2.6 client drivers from
+the old to the new new binding methods.
+
+
+Example old-style driver
+------------------------
+
+
+struct example_state {
+ struct i2c_client client;
+ ....
+};
+
+static struct i2c_driver example_driver;
+
+static unsigned short ignore[] = { I2C_CLIENT_END };
+static unsigned short normal_addr[] = { OUR_ADDR, I2C_CLIENT_END };
+
+I2C_CLIENT_INSMOD;
+
+static int example_attach(struct i2c_adapter *adap, int addr, int kind)
+{
+ struct example_state *state;
+ struct device *dev = &adap->dev; /* to use for dev_ reports */
+ int ret;
+
+ state = kzalloc(sizeof(struct example_state), GFP_KERNEL);
+ if (state == NULL) {
+ dev_err(dev, "failed to create our state\n");
+ return -ENOMEM;
+ }
+
+ example->client.addr = addr;
+ example->client.flags = 0;
+ example->client.adapter = adap;
+
+ i2c_set_clientdata(&state->i2c_client, state);
+ strlcpy(client->i2c_client.name, "example", I2C_NAME_SIZE);
+
+ ret = i2c_attach_client(&state->i2c_client);
+ if (ret < 0) {
+ dev_err(dev, "failed to attach client\n");
+ kfree(state);
+ return ret;
+ }
+
+ dev = &state->i2c_client.dev;
+
+ /* rest of the initialisation goes here. */
+
+ dev_info(dev, "example client created\n");
+
+ return 0;
+}
+
+static int __devexit example_detach(struct i2c_client *client)
+{
+ struct example_state *state = i2c_get_clientdata(client);
+
+ i2c_detach_client(client);
+ kfree(state);
+ return 0;
+}
+
+static int example_attach_adapter(struct i2c_adapter *adap)
+{
+ return i2c_probe(adap, &addr_data, example_attach);
+}
+
+static struct i2c_driver example_driver = {
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "example",
+ },
+ .attach_adapter = example_attach_adapter,
+ .detach_client = __devexit_p(example_detach),
+ .suspend = example_suspend,
+ .resume = example_resume,
+};
+
+
+Updating the client
+-------------------
+
+The new style binding model will check against a list of supported
+devices and their associated address supplied by the code registering
+the busses. This means that the driver .attach_adapter and
+.detach_adapter methods can be removed, along with the addr_data,
+as follows:
+
+- static struct i2c_driver example_driver;
+
+- static unsigned short ignore[] = { I2C_CLIENT_END };
+- static unsigned short normal_addr[] = { OUR_ADDR, I2C_CLIENT_END };
+
+- I2C_CLIENT_INSMOD;
+
+- static int example_attach_adapter(struct i2c_adapter *adap)
+- {
+- return i2c_probe(adap, &addr_data, example_attach);
+- }
+
+ static struct i2c_driver example_driver = {
+- .attach_adapter = example_attach_adapter,
+- .detach_client = __devexit_p(example_detach),
+ }
+
+Add the probe and remove methods to the i2c_driver, as so:
+
+ static struct i2c_driver example_driver = {
++ .probe = example_probe,
++ .remove = __devexit_p(example_remove),
+ }
+
+Change the example_attach method to accept the new parameters
+which include the i2c_client that it will be working with:
+
+- static int example_attach(struct i2c_adapter *adap, int addr, int kind)
++ static int example_probe(struct i2c_client *client,
++ const struct i2c_device_id *id)
+
+Change the name of example_attach to example_probe to align it with the
+i2c_driver entry names. The rest of the probe routine will now need to be
+changed as the i2c_client has already been setup for use.
+
+The necessary client fields have already been setup before
+the probe function is called, so the following client setup
+can be removed:
+
+- example->client.addr = addr;
+- example->client.flags = 0;
+- example->client.adapter = adap;
+-
+- strlcpy(client->i2c_client.name, "example", I2C_NAME_SIZE);
+
+The i2c_set_clientdata is now:
+
+- i2c_set_clientdata(&state->client, state);
++ i2c_set_clientdata(client, state);
+
+The call to i2c_attach_client is no longer needed, if the probe
+routine exits successfully, then the driver will be automatically
+attached by the core. Change the probe routine as so:
+
+- ret = i2c_attach_client(&state->i2c_client);
+- if (ret < 0) {
+- dev_err(dev, "failed to attach client\n");
+- kfree(state);
+- return ret;
+- }
+
+
+Remove the storage of 'struct i2c_client' from the 'struct example_state'
+as we are provided with the i2c_client in our example_probe. Instead we
+store a pointer to it for when it is needed.
+
+struct example_state {
+- struct i2c_client client;
++ struct i2c_client *client;
+
+the new i2c client as so:
+
+- struct device *dev = &adap->dev; /* to use for dev_ reports */
++ struct device *dev = &i2c_client->dev; /* to use for dev_ reports */
+
+And remove the change after our client is attached, as the driver no
+longer needs to register a new client structure with the core:
+
+- dev = &state->i2c_client.dev;
+
+In the probe routine, ensure that the new state has the client stored
+in it:
+
+static int example_probe(struct i2c_client *i2c_client,
+ const struct i2c_device_id *id)
+{
+ struct example_state *state;
+ struct device *dev = &i2c_client->dev;
+ int ret;
+
+ state = kzalloc(sizeof(struct example_state), GFP_KERNEL);
+ if (state == NULL) {
+ dev_err(dev, "failed to create our state\n");
+ return -ENOMEM;
+ }
+
++ state->client = i2c_client;
+
+Update the detach method, by changing the name to _remove and
+to delete the i2c_detach_client call. It is possible that you
+can also remove the ret variable as it is not not needed for
+any of the core functions.
+
+- static int __devexit example_detach(struct i2c_client *client)
++ static int __devexit example_remove(struct i2c_client *client)
+{
+ struct example_state *state = i2c_get_clientdata(client);
+
+- i2c_detach_client(client);
+
+And finally ensure that we have the correct ID table for the i2c-core
+and other utilities:
+
++ struct i2c_device_id example_idtable[] = {
++ { "example", 0 },
++ { }
++};
++
++MODULE_DEVICE_TABLE(i2c, example_idtable);
+
+static struct i2c_driver example_driver = {
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "example",
+ },
++ .id_table = example_ids,
+
+
+Our driver should now look like this:
+
+struct example_state {
+ struct i2c_client *client;
+ ....
+};
+
+static int example_probe(struct i2c_client *client,
+ const struct i2c_device_id *id)
+{
+ struct example_state *state;
+ struct device *dev = &client->dev;
+
+ state = kzalloc(sizeof(struct example_state), GFP_KERNEL);
+ if (state == NULL) {
+ dev_err(dev, "failed to create our state\n");
+ return -ENOMEM;
+ }
+
+ state->client = client;
+ i2c_set_clientdata(client, state);
+
+ /* rest of the initialisation goes here. */
+
+ dev_info(dev, "example client created\n");
+
+ return 0;
+}
+
+static int __devexit example_remove(struct i2c_client *client)
+{
+ struct example_state *state = i2c_get_clientdata(client);
+
+ kfree(state);
+ return 0;
+}
+
+static struct i2c_device_id example_idtable[] = {
+ { "example", 0 },
+ { }
+};
+
+MODULE_DEVICE_TABLE(i2c, example_idtable);
+
+static struct i2c_driver example_driver = {
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "example",
+ },
+ .id_table = example_idtable,
+ .probe = example_probe,
+ .remove = __devexit_p(example_remove),
+ .suspend = example_suspend,
+ .resume = example_resume,
+};
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
new file mode 100644
index 0000000..6b9af7d
--- /dev/null
+++ b/Documentation/i2c/writing-clients
@@ -0,0 +1,389 @@
+This is a small guide for those who want to write kernel drivers for I2C
+or SMBus devices, using Linux as the protocol host/master (not slave).
+
+To set up a driver, you need to do several things. Some are optional, and
+some things can be done slightly or completely different. Use this as a
+guide, not as a rule book!
+
+
+General remarks
+===============
+
+Try to keep the kernel namespace as clean as possible. The best way to
+do this is to use a unique prefix for all global symbols. This is
+especially important for exported symbols, but it is a good idea to do
+it for non-exported symbols too. We will use the prefix `foo_' in this
+tutorial.
+
+
+The driver structure
+====================
+
+Usually, you will implement a single driver structure, and instantiate
+all clients from it. Remember, a driver structure contains general access
+routines, and should be zero-initialized except for fields with data you
+provide. A client structure holds device-specific information like the
+driver model device node, and its I2C address.
+
+static struct i2c_device_id foo_idtable[] = {
+ { "foo", my_id_for_foo },
+ { "bar", my_id_for_bar },
+ { }
+};
+
+MODULE_DEVICE_TABLE(i2c, foo_idtable);
+
+static struct i2c_driver foo_driver = {
+ .driver = {
+ .name = "foo",
+ },
+
+ .id_table = foo_ids,
+ .probe = foo_probe,
+ .remove = foo_remove,
+ /* if device autodetection is needed: */
+ .class = I2C_CLASS_SOMETHING,
+ .detect = foo_detect,
+ .address_data = &addr_data,
+
+ .shutdown = foo_shutdown, /* optional */
+ .suspend = foo_suspend, /* optional */
+ .resume = foo_resume, /* optional */
+ .command = foo_command, /* optional, deprecated */
+}
+
+The name field is the driver name, and must not contain spaces. It
+should match the module name (if the driver can be compiled as a module),
+although you can use MODULE_ALIAS (passing "foo" in this example) to add
+another name for the module. If the driver name doesn't match the module
+name, the module won't be automatically loaded (hotplug/coldplug).
+
+All other fields are for call-back functions which will be explained
+below.
+
+
+Extra client data
+=================
+
+Each client structure has a special `data' field that can point to any
+structure at all. You should use this to keep device-specific data.
+
+ /* store the value */
+ void i2c_set_clientdata(struct i2c_client *client, void *data);
+
+ /* retrieve the value */
+ void *i2c_get_clientdata(const struct i2c_client *client);
+
+
+Accessing the client
+====================
+
+Let's say we have a valid client structure. At some time, we will need
+to gather information from the client, or write new information to the
+client.
+
+I have found it useful to define foo_read and foo_write functions for this.
+For some cases, it will be easier to call the i2c functions directly,
+but many chips have some kind of register-value idea that can easily
+be encapsulated.
+
+The below functions are simple examples, and should not be copied
+literally.
+
+int foo_read_value(struct i2c_client *client, u8 reg)
+{
+ if (reg < 0x10) /* byte-sized register */
+ return i2c_smbus_read_byte_data(client, reg);
+ else /* word-sized register */
+ return i2c_smbus_read_word_data(client, reg);
+}
+
+int foo_write_value(struct i2c_client *client, u8 reg, u16 value)
+{
+ if (reg == 0x10) /* Impossible to write - driver error! */
+ return -EINVAL;
+ else if (reg < 0x10) /* byte-sized register */
+ return i2c_smbus_write_byte_data(client, reg, value);
+ else /* word-sized register */
+ return i2c_smbus_write_word_data(client, reg, value);
+}
+
+
+Probing and attaching
+=====================
+
+The Linux I2C stack was originally written to support access to hardware
+monitoring chips on PC motherboards, and thus used to embed some assumptions
+that were more appropriate to SMBus (and PCs) than to I2C. One of these
+assumptions was that most adapters and devices drivers support the SMBUS_QUICK
+protocol to probe device presence. Another was that devices and their drivers
+can be sufficiently configured using only such probe primitives.
+
+As Linux and its I2C stack became more widely used in embedded systems
+and complex components such as DVB adapters, those assumptions became more
+problematic. Drivers for I2C devices that issue interrupts need more (and
+different) configuration information, as do drivers handling chip variants
+that can't be distinguished by protocol probing, or which need some board
+specific information to operate correctly.
+
+Accordingly, the I2C stack now has two models for associating I2C devices
+with their drivers: the original "legacy" model, and a newer one that's
+fully compatible with the Linux 2.6 driver model. These models do not mix,
+since the "legacy" model requires drivers to create "i2c_client" device
+objects after SMBus style probing, while the Linux driver model expects
+drivers to be given such device objects in their probe() routines.
+
+The legacy model is deprecated now and will soon be removed, so we no
+longer document it here.
+
+
+Standard Driver Model Binding ("New Style")
+-------------------------------------------
+
+System infrastructure, typically board-specific initialization code or
+boot firmware, reports what I2C devices exist. For example, there may be
+a table, in the kernel or from the boot loader, identifying I2C devices
+and linking them to board-specific configuration information about IRQs
+and other wiring artifacts, chip type, and so on. That could be used to
+create i2c_client objects for each I2C device.
+
+I2C device drivers using this binding model work just like any other
+kind of driver in Linux: they provide a probe() method to bind to
+those devices, and a remove() method to unbind.
+
+ static int foo_probe(struct i2c_client *client,
+ const struct i2c_device_id *id);
+ static int foo_remove(struct i2c_client *client);
+
+Remember that the i2c_driver does not create those client handles. The
+handle may be used during foo_probe(). If foo_probe() reports success
+(zero not a negative status code) it may save the handle and use it until
+foo_remove() returns. That binding model is used by most Linux drivers.
+
+The probe function is called when an entry in the id_table name field
+matches the device's name. It is passed the entry that was matched so
+the driver knows which one in the table matched.
+
+
+Device Creation
+---------------
+
+If you know for a fact that an I2C device is connected to a given I2C bus,
+you can instantiate that device by simply filling an i2c_board_info
+structure with the device address and driver name, and calling
+i2c_new_device(). This will create the device, then the driver core will
+take care of finding the right driver and will call its probe() method.
+If a driver supports different device types, you can specify the type you
+want using the type field. You can also specify an IRQ and platform data
+if needed.
+
+Sometimes you know that a device is connected to a given I2C bus, but you
+don't know the exact address it uses. This happens on TV adapters for
+example, where the same driver supports dozens of slightly different
+models, and I2C device addresses change from one model to the next. In
+that case, you can use the i2c_new_probed_device() variant, which is
+similar to i2c_new_device(), except that it takes an additional list of
+possible I2C addresses to probe. A device is created for the first
+responsive address in the list. If you expect more than one device to be
+present in the address range, simply call i2c_new_probed_device() that
+many times.
+
+The call to i2c_new_device() or i2c_new_probed_device() typically happens
+in the I2C bus driver. You may want to save the returned i2c_client
+reference for later use.
+
+
+Device Detection
+----------------
+
+Sometimes you do not know in advance which I2C devices are connected to
+a given I2C bus. This is for example the case of hardware monitoring
+devices on a PC's SMBus. In that case, you may want to let your driver
+detect supported devices automatically. This is how the legacy model
+was working, and is now available as an extension to the standard
+driver model (so that we can finally get rid of the legacy model.)
+
+You simply have to define a detect callback which will attempt to
+identify supported devices (returning 0 for supported ones and -ENODEV
+for unsupported ones), a list of addresses to probe, and a device type
+(or class) so that only I2C buses which may have that type of device
+connected (and not otherwise enumerated) will be probed. The i2c
+core will then call you back as needed and will instantiate a device
+for you for every successful detection.
+
+Note that this mechanism is purely optional and not suitable for all
+devices. You need some reliable way to identify the supported devices
+(typically using device-specific, dedicated identification registers),
+otherwise misdetections are likely to occur and things can get wrong
+quickly.
+
+
+Device Deletion
+---------------
+
+Each I2C device which has been created using i2c_new_device() or
+i2c_new_probed_device() can be unregistered by calling
+i2c_unregister_device(). If you don't call it explicitly, it will be
+called automatically before the underlying I2C bus itself is removed, as a
+device can't survive its parent in the device driver model.
+
+
+Initializing the driver
+=======================
+
+When the kernel is booted, or when your foo driver module is inserted,
+you have to do some initializing. Fortunately, just registering the
+driver module is usually enough.
+
+static int __init foo_init(void)
+{
+ return i2c_add_driver(&foo_driver);
+}
+
+static void __exit foo_cleanup(void)
+{
+ i2c_del_driver(&foo_driver);
+}
+
+/* Substitute your own name and email address */
+MODULE_AUTHOR("Frodo Looijaard <frodol@dds.nl>"
+MODULE_DESCRIPTION("Driver for Barf Inc. Foo I2C devices");
+
+/* a few non-GPL license types are also allowed */
+MODULE_LICENSE("GPL");
+
+module_init(foo_init);
+module_exit(foo_cleanup);
+
+Note that some functions are marked by `__init'. These functions can
+be removed after kernel booting (or module loading) is completed.
+Likewise, functions marked by `__exit' are dropped by the compiler when
+the code is built into the kernel, as they would never be called.
+
+
+Power Management
+================
+
+If your I2C device needs special handling when entering a system low
+power state -- like putting a transceiver into a low power mode, or
+activating a system wakeup mechanism -- do that in the suspend() method.
+The resume() method should reverse what the suspend() method does.
+
+These are standard driver model calls, and they work just like they
+would for any other driver stack. The calls can sleep, and can use
+I2C messaging to the device being suspended or resumed (since their
+parent I2C adapter is active when these calls are issued, and IRQs
+are still enabled).
+
+
+System Shutdown
+===============
+
+If your I2C device needs special handling when the system shuts down
+or reboots (including kexec) -- like turning something off -- use a
+shutdown() method.
+
+Again, this is a standard driver model call, working just like it
+would for any other driver stack: the calls can sleep, and can use
+I2C messaging.
+
+
+Command function
+================
+
+A generic ioctl-like function call back is supported. You will seldom
+need this, and its use is deprecated anyway, so newer design should not
+use it.
+
+
+Sending and receiving
+=====================
+
+If you want to communicate with your device, there are several functions
+to do this. You can find all of them in <linux/i2c.h>.
+
+If you can choose between plain I2C communication and SMBus level
+communication, please use the latter. All adapters understand SMBus level
+commands, but only some of them understand plain I2C!
+
+
+Plain I2C communication
+-----------------------
+
+ int i2c_master_send(struct i2c_client *client, const char *buf,
+ int count);
+ int i2c_master_recv(struct i2c_client *client, char *buf, int count);
+
+These routines read and write some bytes from/to a client. The client
+contains the i2c address, so you do not have to include it. The second
+parameter contains the bytes to read/write, the third the number of bytes
+to read/write (must be less than the length of the buffer.) Returned is
+the actual number of bytes read/written.
+
+ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msg,
+ int num);
+
+This sends a series of messages. Each message can be a read or write,
+and they can be mixed in any way. The transactions are combined: no
+stop bit is sent between transaction. The i2c_msg structure contains
+for each message the client address, the number of bytes of the message
+and the message data itself.
+
+You can read the file `i2c-protocol' for more information about the
+actual I2C protocol.
+
+
+SMBus communication
+-------------------
+
+ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
+ unsigned short flags, char read_write, u8 command,
+ int size, union i2c_smbus_data *data);
+
+This is the generic SMBus function. All functions below are implemented
+in terms of it. Never use this function directly!
+
+ s32 i2c_smbus_read_byte(struct i2c_client *client);
+ s32 i2c_smbus_write_byte(struct i2c_client *client, u8 value);
+ s32 i2c_smbus_read_byte_data(struct i2c_client *client, u8 command);
+ s32 i2c_smbus_write_byte_data(struct i2c_client *client,
+ u8 command, u8 value);
+ s32 i2c_smbus_read_word_data(struct i2c_client *client, u8 command);
+ s32 i2c_smbus_write_word_data(struct i2c_client *client,
+ u8 command, u16 value);
+ s32 i2c_smbus_process_call(struct i2c_client *client,
+ u8 command, u16 value);
+ s32 i2c_smbus_read_block_data(struct i2c_client *client,
+ u8 command, u8 *values);
+ s32 i2c_smbus_write_block_data(struct i2c_client *client,
+ u8 command, u8 length, const u8 *values);
+ s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client,
+ u8 command, u8 length, u8 *values);
+ s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client,
+ u8 command, u8 length,
+ const u8 *values);
+
+These ones were removed from i2c-core because they had no users, but could
+be added back later if needed:
+
+ s32 i2c_smbus_write_quick(struct i2c_client *client, u8 value);
+ s32 i2c_smbus_block_process_call(struct i2c_client *client,
+ u8 command, u8 length, u8 *values);
+
+All these transactions return a negative errno value on failure. The 'write'
+transactions return 0 on success; the 'read' transactions return the read
+value, except for block transactions, which return the number of values
+read. The block buffers need not be longer than 32 bytes.
+
+You can read the file `smbus-protocol' for more information about the
+actual SMBus protocol.
+
+
+General purpose routines
+========================
+
+Below all general purpose routines are listed, that were not mentioned
+before.
+
+ /* Return the adapter number for a specific adapter */
+ int i2c_adapter_id(struct i2c_adapter *adap);
diff --git a/Documentation/i2o/README b/Documentation/i2o/README
new file mode 100644
index 0000000..0ebf58c
--- /dev/null
+++ b/Documentation/i2o/README
@@ -0,0 +1,63 @@
+
+ Linux I2O Support (c) Copyright 1999 Red Hat Software
+ and others.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License
+ as published by the Free Software Foundation; either version
+ 2 of the License, or (at your option) any later version.
+
+AUTHORS (so far)
+
+Alan Cox, Building Number Three Ltd.
+ Core code, SCSI and Block OSMs
+
+Steve Ralston, LSI Logic Corp.
+ Debugging SCSI and Block OSM
+
+Deepak Saxena, Intel Corp.
+ Various core/block extensions
+ /proc interface, bug fixes
+ Ioctl interfaces for control
+ Debugging LAN OSM
+
+Philip Rumpf
+ Fixed assorted dumb SMP locking bugs
+
+Juha Sievanen, University of Helsinki Finland
+ LAN OSM code
+ /proc interface to LAN class
+ Bug fixes
+ Core code extensions
+
+Auvo Häkkinen, University of Helsinki Finland
+ LAN OSM code
+ /Proc interface to LAN class
+ Bug fixes
+ Core code extensions
+
+Taneli Vähäkangas, University of Helsinki Finland
+ Fixes to i2o_config
+
+CREDITS
+
+ This work was made possible by
+
+Red Hat Software
+ Funding for the Building #3 part of the project
+
+Symbios Logic (Now LSI)
+ Host adapters, hints, known to work platforms when I hit
+ compatibility problems
+
+BoxHill Corporation
+ Loan of initial FibreChannel disk array used for development work.
+
+European Comission
+ Funding the work done by the University of Helsinki
+
+SysKonnect
+ Loan of FDDI and Gigabit Ethernet cards
+
+ASUSTeK
+ Loan of I2O motherboard
diff --git a/Documentation/i2o/ioctl b/Documentation/i2o/ioctl
new file mode 100644
index 0000000..1e77fac
--- /dev/null
+++ b/Documentation/i2o/ioctl
@@ -0,0 +1,394 @@
+
+Linux I2O User Space Interface
+rev 0.3 - 04/20/99
+
+=============================================================================
+Originally written by Deepak Saxena(deepak@plexity.net)
+Currently maintained by Deepak Saxena(deepak@plexity.net)
+=============================================================================
+
+I. Introduction
+
+The Linux I2O subsystem provides a set of ioctl() commands that can be
+utilized by user space applications to communicate with IOPs and devices
+on individual IOPs. This document defines the specific ioctl() commands
+that are available to the user and provides examples of their uses.
+
+This document assumes the reader is familiar with or has access to the
+I2O specification as no I2O message parameters are outlined. For information
+on the specification, see http://www.i2osig.org
+
+This document and the I2O user space interface are currently maintained
+by Deepak Saxena. Please send all comments, errata, and bug fixes to
+deepak@csociety.purdue.edu
+
+II. IOP Access
+
+Access to the I2O subsystem is provided through the device file named
+/dev/i2o/ctl. This file is a character file with major number 10 and minor
+number 166. It can be created through the following command:
+
+ mknod /dev/i2o/ctl c 10 166
+
+III. Determining the IOP Count
+
+ SYNOPSIS
+
+ ioctl(fd, I2OGETIOPS, int *count);
+
+ u8 count[MAX_I2O_CONTROLLERS];
+
+ DESCRIPTION
+
+ This function returns the system's active IOP table. count should
+ point to a buffer containing MAX_I2O_CONTROLLERS entries. Upon
+ returning, each entry will contain a non-zero value if the given
+ IOP unit is active, and NULL if it is inactive or non-existent.
+
+ RETURN VALUE.
+
+ Returns 0 if no errors occur, and -1 otherwise. If an error occurs,
+ errno is set appropriately:
+
+ EFAULT Invalid user space pointer was passed
+
+IV. Getting Hardware Resource Table
+
+ SYNOPSIS
+
+ ioctl(fd, I2OHRTGET, struct i2o_cmd_hrt *hrt);
+
+ struct i2o_cmd_hrtlct
+ {
+ u32 iop; /* IOP unit number */
+ void *resbuf; /* Buffer for result */
+ u32 *reslen; /* Buffer length in bytes */
+ };
+
+ DESCRIPTION
+
+ This function returns the Hardware Resource Table of the IOP specified
+ by hrt->iop in the buffer pointed to by hrt->resbuf. The actual size of
+ the data is written into *(hrt->reslen).
+
+ RETURNS
+
+ This function returns 0 if no errors occur. If an error occurs, -1
+ is returned and errno is set appropriately:
+
+ EFAULT Invalid user space pointer was passed
+ ENXIO Invalid IOP number
+ ENOBUFS Buffer not large enough. If this occurs, the required
+ buffer length is written into *(hrt->reslen)
+
+V. Getting Logical Configuration Table
+
+ SYNOPSIS
+
+ ioctl(fd, I2OLCTGET, struct i2o_cmd_lct *lct);
+
+ struct i2o_cmd_hrtlct
+ {
+ u32 iop; /* IOP unit number */
+ void *resbuf; /* Buffer for result */
+ u32 *reslen; /* Buffer length in bytes */
+ };
+
+ DESCRIPTION
+
+ This function returns the Logical Configuration Table of the IOP specified
+ by lct->iop in the buffer pointed to by lct->resbuf. The actual size of
+ the data is written into *(lct->reslen).
+
+ RETURNS
+
+ This function returns 0 if no errors occur. If an error occurs, -1
+ is returned and errno is set appropriately:
+
+ EFAULT Invalid user space pointer was passed
+ ENXIO Invalid IOP number
+ ENOBUFS Buffer not large enough. If this occurs, the required
+ buffer length is written into *(lct->reslen)
+
+VI. Settting Parameters
+
+ SYNOPSIS
+
+ ioctl(fd, I2OPARMSET, struct i2o_parm_setget *ops);
+
+ struct i2o_cmd_psetget
+ {
+ u32 iop; /* IOP unit number */
+ u32 tid; /* Target device TID */
+ void *opbuf; /* Operation List buffer */
+ u32 oplen; /* Operation List buffer length in bytes */
+ void *resbuf; /* Result List buffer */
+ u32 *reslen; /* Result List buffer length in bytes */
+ };
+
+ DESCRIPTION
+
+ This function posts a UtilParamsSet message to the device identified
+ by ops->iop and ops->tid. The operation list for the message is
+ sent through the ops->opbuf buffer, and the result list is written
+ into the buffer pointed to by ops->resbuf. The number of bytes
+ written is placed into *(ops->reslen).
+
+ RETURNS
+
+ The return value is the size in bytes of the data written into
+ ops->resbuf if no errors occur. If an error occurs, -1 is returned
+ and errno is set appropriatly:
+
+ EFAULT Invalid user space pointer was passed
+ ENXIO Invalid IOP number
+ ENOBUFS Buffer not large enough. If this occurs, the required
+ buffer length is written into *(ops->reslen)
+ ETIMEDOUT Timeout waiting for reply message
+ ENOMEM Kernel memory allocation error
+
+ A return value of 0 does not mean that the value was actually
+ changed properly on the IOP. The user should check the result
+ list to determine the specific status of the transaction.
+
+VII. Getting Parameters
+
+ SYNOPSIS
+
+ ioctl(fd, I2OPARMGET, struct i2o_parm_setget *ops);
+
+ struct i2o_parm_setget
+ {
+ u32 iop; /* IOP unit number */
+ u32 tid; /* Target device TID */
+ void *opbuf; /* Operation List buffer */
+ u32 oplen; /* Operation List buffer length in bytes */
+ void *resbuf; /* Result List buffer */
+ u32 *reslen; /* Result List buffer length in bytes */
+ };
+
+ DESCRIPTION
+
+ This function posts a UtilParamsGet message to the device identified
+ by ops->iop and ops->tid. The operation list for the message is
+ sent through the ops->opbuf buffer, and the result list is written
+ into the buffer pointed to by ops->resbuf. The actual size of data
+ written is placed into *(ops->reslen).
+
+ RETURNS
+
+ EFAULT Invalid user space pointer was passed
+ ENXIO Invalid IOP number
+ ENOBUFS Buffer not large enough. If this occurs, the required
+ buffer length is written into *(ops->reslen)
+ ETIMEDOUT Timeout waiting for reply message
+ ENOMEM Kernel memory allocation error
+
+ A return value of 0 does not mean that the value was actually
+ properly retrieved. The user should check the result list
+ to determine the specific status of the transaction.
+
+VIII. Downloading Software
+
+ SYNOPSIS
+
+ ioctl(fd, I2OSWDL, struct i2o_sw_xfer *sw);
+
+ struct i2o_sw_xfer
+ {
+ u32 iop; /* IOP unit number */
+ u8 flags; /* DownloadFlags field */
+ u8 sw_type; /* Software type */
+ u32 sw_id; /* Software ID */
+ void *buf; /* Pointer to software buffer */
+ u32 *swlen; /* Length of software buffer */
+ u32 *maxfrag; /* Number of fragments */
+ u32 *curfrag; /* Current fragment number */
+ };
+
+ DESCRIPTION
+
+ This function downloads a software fragment pointed by sw->buf
+ to the iop identified by sw->iop. The DownloadFlags, SwID, SwType
+ and SwSize fields of the ExecSwDownload message are filled in with
+ the values of sw->flags, sw->sw_id, sw->sw_type and *(sw->swlen).
+
+ The fragments _must_ be sent in order and be 8K in size. The last
+ fragment _may_ be shorter, however. The kernel will compute its
+ size based on information in the sw->swlen field.
+
+ Please note that SW transfers can take a long time.
+
+ RETURNS
+
+ This function returns 0 no errors occur. If an error occurs, -1
+ is returned and errno is set appropriatly:
+
+ EFAULT Invalid user space pointer was passed
+ ENXIO Invalid IOP number
+ ETIMEDOUT Timeout waiting for reply message
+ ENOMEM Kernel memory allocation error
+
+IX. Uploading Software
+
+ SYNOPSIS
+
+ ioctl(fd, I2OSWUL, struct i2o_sw_xfer *sw);
+
+ struct i2o_sw_xfer
+ {
+ u32 iop; /* IOP unit number */
+ u8 flags; /* UploadFlags */
+ u8 sw_type; /* Software type */
+ u32 sw_id; /* Software ID */
+ void *buf; /* Pointer to software buffer */
+ u32 *swlen; /* Length of software buffer */
+ u32 *maxfrag; /* Number of fragments */
+ u32 *curfrag; /* Current fragment number */
+ };
+
+ DESCRIPTION
+
+ This function uploads a software fragment from the IOP identified
+ by sw->iop, sw->sw_type, sw->sw_id and optionally sw->swlen fields.
+ The UploadFlags, SwID, SwType and SwSize fields of the ExecSwUpload
+ message are filled in with the values of sw->flags, sw->sw_id,
+ sw->sw_type and *(sw->swlen).
+
+ The fragments _must_ be requested in order and be 8K in size. The
+ user is responsible for allocating memory pointed by sw->buf. The
+ last fragment _may_ be shorter.
+
+ Please note that SW transfers can take a long time.
+
+ RETURNS
+
+ This function returns 0 if no errors occur. If an error occurs, -1
+ is returned and errno is set appropriatly:
+
+ EFAULT Invalid user space pointer was passed
+ ENXIO Invalid IOP number
+ ETIMEDOUT Timeout waiting for reply message
+ ENOMEM Kernel memory allocation error
+
+X. Removing Software
+
+ SYNOPSIS
+
+ ioctl(fd, I2OSWDEL, struct i2o_sw_xfer *sw);
+
+ struct i2o_sw_xfer
+ {
+ u32 iop; /* IOP unit number */
+ u8 flags; /* RemoveFlags */
+ u8 sw_type; /* Software type */
+ u32 sw_id; /* Software ID */
+ void *buf; /* Unused */
+ u32 *swlen; /* Length of the software data */
+ u32 *maxfrag; /* Unused */
+ u32 *curfrag; /* Unused */
+ };
+
+ DESCRIPTION
+
+ This function removes software from the IOP identified by sw->iop.
+ The RemoveFlags, SwID, SwType and SwSize fields of the ExecSwRemove message
+ are filled in with the values of sw->flags, sw->sw_id, sw->sw_type and
+ *(sw->swlen). Give zero in *(sw->len) if the value is unknown. IOP uses
+ *(sw->swlen) value to verify correct identication of the module to remove.
+ The actual size of the module is written into *(sw->swlen).
+
+ RETURNS
+
+ This function returns 0 if no errors occur. If an error occurs, -1
+ is returned and errno is set appropriatly:
+
+ EFAULT Invalid user space pointer was passed
+ ENXIO Invalid IOP number
+ ETIMEDOUT Timeout waiting for reply message
+ ENOMEM Kernel memory allocation error
+
+X. Validating Configuration
+
+ SYNOPSIS
+
+ ioctl(fd, I2OVALIDATE, int *iop);
+ u32 iop;
+
+ DESCRIPTION
+
+ This function posts an ExecConfigValidate message to the controller
+ identified by iop. This message indicates that the current
+ configuration is accepted. The iop changes the status of suspect drivers
+ to valid and may delete old drivers from its store.
+
+ RETURNS
+
+ This function returns 0 if no erro occur. If an error occurs, -1 is
+ returned and errno is set appropriatly:
+
+ ETIMEDOUT Timeout waiting for reply message
+ ENXIO Invalid IOP number
+
+XI. Configuration Dialog
+
+ SYNOPSIS
+
+ ioctl(fd, I2OHTML, struct i2o_html *htquery);
+ struct i2o_html
+ {
+ u32 iop; /* IOP unit number */
+ u32 tid; /* Target device ID */
+ u32 page; /* HTML page */
+ void *resbuf; /* Buffer for reply HTML page */
+ u32 *reslen; /* Length in bytes of reply buffer */
+ void *qbuf; /* Pointer to HTTP query string */
+ u32 qlen; /* Length in bytes of query string buffer */
+ };
+
+ DESCRIPTION
+
+ This function posts an UtilConfigDialog message to the device identified
+ by htquery->iop and htquery->tid. The requested HTML page number is
+ provided by the htquery->page field, and the resultant data is stored
+ in the buffer pointed to by htquery->resbuf. If there is an HTTP query
+ string that is to be sent to the device, it should be sent in the buffer
+ pointed to by htquery->qbuf. If there is no query string, this field
+ should be set to NULL. The actual size of the reply received is written
+ into *(htquery->reslen).
+
+ RETURNS
+
+ This function returns 0 if no error occur. If an error occurs, -1
+ is returned and errno is set appropriatly:
+
+ EFAULT Invalid user space pointer was passed
+ ENXIO Invalid IOP number
+ ENOBUFS Buffer not large enough. If this occurs, the required
+ buffer length is written into *(ops->reslen)
+ ETIMEDOUT Timeout waiting for reply message
+ ENOMEM Kernel memory allocation error
+
+XII. Events
+
+ In the process of determining this. Current idea is to have use
+ the select() interface to allow user apps to periodically poll
+ the /dev/i2o/ctl device for events. When select() notifies the user
+ that an event is available, the user would call read() to retrieve
+ a list of all the events that are pending for the specific device.
+
+=============================================================================
+Revision History
+=============================================================================
+
+Rev 0.1 - 04/01/99
+- Initial revision
+
+Rev 0.2 - 04/06/99
+- Changed return values to match UNIX ioctl() standard. Only return values
+ are 0 and -1. All errors are reported through errno.
+- Added summary of proposed possible event interfaces
+
+Rev 0.3 - 04/20/99
+- Changed all ioctls() to use pointers to user data instead of actual data
+- Updated error values to match the code
diff --git a/Documentation/ia64/.gitignore b/Documentation/ia64/.gitignore
new file mode 100644
index 0000000..ab806ed
--- /dev/null
+++ b/Documentation/ia64/.gitignore
@@ -0,0 +1 @@
+aliasing-test
diff --git a/Documentation/ia64/IRQ-redir.txt b/Documentation/ia64/IRQ-redir.txt
new file mode 100644
index 0000000..f7bd722
--- /dev/null
+++ b/Documentation/ia64/IRQ-redir.txt
@@ -0,0 +1,69 @@
+IRQ affinity on IA64 platforms
+------------------------------
+ 07.01.2002, Erich Focht <efocht@ess.nec.de>
+
+
+By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be
+controlled. The behavior on IA64 platforms is slightly different from
+that described in Documentation/IRQ-affinity.txt for i386 systems.
+
+Because of the usage of SAPIC mode and physical destination mode the
+IRQ target is one particular CPU and cannot be a mask of several
+CPUs. Only the first non-zero bit is taken into account.
+
+
+Usage examples:
+
+The target CPU has to be specified as a hexadecimal CPU mask. The
+first non-zero bit is the selected CPU. This format has been kept for
+compatibility reasons with i386.
+
+Set the delivery mode of interrupt 41 to fixed and route the
+interrupts to CPU #3 (logical CPU number) (2^3=0x08):
+ echo "8" >/proc/irq/41/smp_affinity
+
+Set the default route for IRQ number 41 to CPU 6 in lowest priority
+delivery mode (redirectable):
+ echo "r 40" >/proc/irq/41/smp_affinity
+
+The output of the command
+ cat /proc/irq/IRQ#/smp_affinity
+gives the target CPU mask for the specified interrupt vector. If the CPU
+mask is preceded by the character "r", the interrupt is redirectable
+(i.e. lowest priority mode routing is used), otherwise its route is
+fixed.
+
+
+
+Initialization and default behavior:
+
+If the platform features IRQ redirection (info provided by SAL) all
+IO-SAPIC interrupts are initialized with CPU#0 as their default target
+and the routing is the so called "lowest priority mode" (actually
+fixed SAPIC mode with hint). The XTP chipset registers are used as hints
+for the IRQ routing. Currently in Linux XTP registers can have three
+values:
+ - minimal for an idle task,
+ - normal if any other task runs,
+ - maximal if the CPU is going to be switched off.
+The IRQ is routed to the CPU with lowest XTP register value, the
+search begins at the default CPU. Therefore most of the interrupts
+will be handled by CPU #0.
+
+If the platform doesn't feature interrupt redirection IOSAPIC fixed
+routing is used. The target CPUs are distributed in a round robin
+manner. IRQs will be routed only to the selected target CPUs. Check
+with
+ cat /proc/interrupts
+
+
+
+Comments:
+
+On large (multi-node) systems it is recommended to route the IRQs to
+the node to which the corresponding device is connected.
+For systems like the NEC AzusA we get IRQ node-affinity for free. This
+is because usually the chipsets on each node redirect the interrupts
+only to their own CPUs (as they cannot see the XTP registers on the
+other nodes).
+
diff --git a/Documentation/ia64/Makefile b/Documentation/ia64/Makefile
new file mode 100644
index 0000000..b75db69
--- /dev/null
+++ b/Documentation/ia64/Makefile
@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := aliasing-test
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/Documentation/ia64/README b/Documentation/ia64/README
new file mode 100644
index 0000000..aa17f21
--- /dev/null
+++ b/Documentation/ia64/README
@@ -0,0 +1,43 @@
+ Linux kernel release 2.4.xx for the IA-64 Platform
+
+ These are the release notes for Linux version 2.4 for IA-64
+ platform. This document provides information specific to IA-64
+ ONLY, to get additional information about the Linux kernel also
+ read the original Linux README provided with the kernel.
+
+INSTALLING the kernel:
+
+ - IA-64 kernel installation is the same as the other platforms, see
+ original README for details.
+
+
+SOFTWARE REQUIREMENTS
+
+ Compiling and running this kernel requires an IA-64 compliant GCC
+ compiler. And various software packages also compiled with an
+ IA-64 compliant GCC compiler.
+
+
+CONFIGURING the kernel:
+
+ Configuration is the same, see original README for details.
+
+
+COMPILING the kernel:
+
+ - Compiling this kernel doesn't differ from other platform so read
+ the original README for details BUT make sure you have an IA-64
+ compliant GCC compiler.
+
+IA-64 SPECIFICS
+
+ - General issues:
+
+ o Hardly any performance tuning has been done. Obvious targets
+ include the library routines (IP checksum, etc.). Less
+ obvious targets include making sure we don't flush the TLB
+ needlessly, etc.
+
+ o SMP locks cleanup/optimization
+
+ o IA32 support. Currently experimental. It mostly works.
diff --git a/Documentation/ia64/aliasing-test.c b/Documentation/ia64/aliasing-test.c
new file mode 100644
index 0000000..d23610f
--- /dev/null
+++ b/Documentation/ia64/aliasing-test.c
@@ -0,0 +1,262 @@
+/*
+ * Exercise /dev/mem mmap cases that have been troublesome in the past
+ *
+ * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
+ * Bjorn Helgaas <bjorn.helgaas@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <linux/pci.h>
+
+int sum;
+
+int map_mem(char *path, off_t offset, size_t length, int touch)
+{
+ int fd, rc;
+ void *addr;
+ int *c;
+
+ fd = open(path, O_RDWR);
+ if (fd == -1) {
+ perror(path);
+ return -1;
+ }
+
+ if (fnmatch("/proc/bus/pci/*", path, 0) == 0) {
+ rc = ioctl(fd, PCIIOC_MMAP_IS_MEM);
+ if (rc == -1)
+ perror("PCIIOC_MMAP_IS_MEM ioctl");
+ }
+
+ addr = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset);
+ if (addr == MAP_FAILED)
+ return 1;
+
+ if (touch) {
+ c = (int *) addr;
+ while (c < (int *) (addr + length))
+ sum += *c++;
+ }
+
+ rc = munmap(addr, length);
+ if (rc == -1) {
+ perror("munmap");
+ return -1;
+ }
+
+ close(fd);
+ return 0;
+}
+
+int scan_tree(char *path, char *file, off_t offset, size_t length, int touch)
+{
+ struct dirent **namelist;
+ char *name, *path2;
+ int i, n, r, rc = 0, result = 0;
+ struct stat buf;
+
+ n = scandir(path, &namelist, 0, alphasort);
+ if (n < 0) {
+ perror("scandir");
+ return -1;
+ }
+
+ for (i = 0; i < n; i++) {
+ name = namelist[i]->d_name;
+
+ if (fnmatch(".", name, 0) == 0)
+ goto skip;
+ if (fnmatch("..", name, 0) == 0)
+ goto skip;
+
+ path2 = malloc(strlen(path) + strlen(name) + 3);
+ strcpy(path2, path);
+ strcat(path2, "/");
+ strcat(path2, name);
+
+ if (fnmatch(file, name, 0) == 0) {
+ rc = map_mem(path2, offset, length, touch);
+ if (rc == 0)
+ fprintf(stderr, "PASS: %s 0x%lx-0x%lx is %s\n", path2, offset, offset + length, touch ? "readable" : "mappable");
+ else if (rc > 0)
+ fprintf(stderr, "PASS: %s 0x%lx-0x%lx not mappable\n", path2, offset, offset + length);
+ else {
+ fprintf(stderr, "FAIL: %s 0x%lx-0x%lx not accessible\n", path2, offset, offset + length);
+ return rc;
+ }
+ } else {
+ r = lstat(path2, &buf);
+ if (r == 0 && S_ISDIR(buf.st_mode)) {
+ rc = scan_tree(path2, file, offset, length, touch);
+ if (rc < 0)
+ return rc;
+ }
+ }
+
+ result |= rc;
+ free(path2);
+
+skip:
+ free(namelist[i]);
+ }
+ free(namelist);
+ return result;
+}
+
+char buf[1024];
+
+int read_rom(char *path)
+{
+ int fd, rc;
+ size_t size = 0;
+
+ fd = open(path, O_RDWR);
+ if (fd == -1) {
+ perror(path);
+ return -1;
+ }
+
+ rc = write(fd, "1", 2);
+ if (rc <= 0) {
+ perror("write");
+ return -1;
+ }
+
+ do {
+ rc = read(fd, buf, sizeof(buf));
+ if (rc > 0)
+ size += rc;
+ } while (rc > 0);
+
+ close(fd);
+ return size;
+}
+
+int scan_rom(char *path, char *file)
+{
+ struct dirent **namelist;
+ char *name, *path2;
+ int i, n, r, rc = 0, result = 0;
+ struct stat buf;
+
+ n = scandir(path, &namelist, 0, alphasort);
+ if (n < 0) {
+ perror("scandir");
+ return -1;
+ }
+
+ for (i = 0; i < n; i++) {
+ name = namelist[i]->d_name;
+
+ if (fnmatch(".", name, 0) == 0)
+ goto skip;
+ if (fnmatch("..", name, 0) == 0)
+ goto skip;
+
+ path2 = malloc(strlen(path) + strlen(name) + 3);
+ strcpy(path2, path);
+ strcat(path2, "/");
+ strcat(path2, name);
+
+ if (fnmatch(file, name, 0) == 0) {
+ rc = read_rom(path2);
+
+ /*
+ * It's OK if the ROM is unreadable. Maybe there
+ * is no ROM, or some other error ocurred. The
+ * important thing is that no MCA happened.
+ */
+ if (rc > 0)
+ fprintf(stderr, "PASS: %s read %d bytes\n", path2, rc);
+ else {
+ fprintf(stderr, "PASS: %s not readable\n", path2);
+ return rc;
+ }
+ } else {
+ r = lstat(path2, &buf);
+ if (r == 0 && S_ISDIR(buf.st_mode)) {
+ rc = scan_rom(path2, file);
+ if (rc < 0)
+ return rc;
+ }
+ }
+
+ result |= rc;
+ free(path2);
+
+skip:
+ free(namelist[i]);
+ }
+ free(namelist);
+ return result;
+}
+
+int main(void)
+{
+ int rc;
+
+ if (map_mem("/dev/mem", 0, 0xA0000, 1) == 0)
+ fprintf(stderr, "PASS: /dev/mem 0x0-0xa0000 is readable\n");
+ else
+ fprintf(stderr, "FAIL: /dev/mem 0x0-0xa0000 not accessible\n");
+
+ /*
+ * It's not safe to blindly read the VGA frame buffer. If you know
+ * how to poke the card the right way, it should respond, but it's
+ * not safe in general. Many machines, e.g., Intel chipsets, cover
+ * up a non-responding card by just returning -1, but others will
+ * report the failure as a machine check.
+ */
+ if (map_mem("/dev/mem", 0xA0000, 0x20000, 0) == 0)
+ fprintf(stderr, "PASS: /dev/mem 0xa0000-0xc0000 is mappable\n");
+ else
+ fprintf(stderr, "FAIL: /dev/mem 0xa0000-0xc0000 not accessible\n");
+
+ if (map_mem("/dev/mem", 0xC0000, 0x40000, 1) == 0)
+ fprintf(stderr, "PASS: /dev/mem 0xc0000-0x100000 is readable\n");
+ else
+ fprintf(stderr, "FAIL: /dev/mem 0xc0000-0x100000 not accessible\n");
+
+ /*
+ * Often you can map all the individual pieces above (0-0xA0000,
+ * 0xA0000-0xC0000, and 0xC0000-0x100000), but can't map the whole
+ * thing at once. This is because the individual pieces use different
+ * attributes, and there's no single attribute supported over the
+ * whole region.
+ */
+ rc = map_mem("/dev/mem", 0, 1024*1024, 0);
+ if (rc == 0)
+ fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 is mappable\n");
+ else if (rc > 0)
+ fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 not mappable\n");
+ else
+ fprintf(stderr, "FAIL: /dev/mem 0x0-0x100000 not accessible\n");
+
+ scan_tree("/sys/class/pci_bus", "legacy_mem", 0, 0xA0000, 1);
+ scan_tree("/sys/class/pci_bus", "legacy_mem", 0xA0000, 0x20000, 0);
+ scan_tree("/sys/class/pci_bus", "legacy_mem", 0xC0000, 0x40000, 1);
+ scan_tree("/sys/class/pci_bus", "legacy_mem", 0, 1024*1024, 0);
+
+ scan_rom("/sys/devices", "rom");
+
+ scan_tree("/proc/bus/pci", "??.?", 0, 0xA0000, 1);
+ scan_tree("/proc/bus/pci", "??.?", 0xA0000, 0x20000, 0);
+ scan_tree("/proc/bus/pci", "??.?", 0xC0000, 0x40000, 1);
+ scan_tree("/proc/bus/pci", "??.?", 0, 1024*1024, 0);
+
+ return rc;
+}
diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt
new file mode 100644
index 0000000..aa3e953
--- /dev/null
+++ b/Documentation/ia64/aliasing.txt
@@ -0,0 +1,223 @@
+ MEMORY ATTRIBUTE ALIASING ON IA-64
+
+ Bjorn Helgaas
+ <bjorn.helgaas@hp.com>
+ May 4, 2006
+
+
+MEMORY ATTRIBUTES
+
+ Itanium supports several attributes for virtual memory references.
+ The attribute is part of the virtual translation, i.e., it is
+ contained in the TLB entry. The ones of most interest to the Linux
+ kernel are:
+
+ WB Write-back (cacheable)
+ UC Uncacheable
+ WC Write-coalescing
+
+ System memory typically uses the WB attribute. The UC attribute is
+ used for memory-mapped I/O devices. The WC attribute is uncacheable
+ like UC is, but writes may be delayed and combined to increase
+ performance for things like frame buffers.
+
+ The Itanium architecture requires that we avoid accessing the same
+ page with both a cacheable mapping and an uncacheable mapping[1].
+
+ The design of the chipset determines which attributes are supported
+ on which regions of the address space. For example, some chipsets
+ support either WB or UC access to main memory, while others support
+ only WB access.
+
+MEMORY MAP
+
+ Platform firmware describes the physical memory map and the
+ supported attributes for each region. At boot-time, the kernel uses
+ the EFI GetMemoryMap() interface. ACPI can also describe memory
+ devices and the attributes they support, but Linux/ia64 currently
+ doesn't use this information.
+
+ The kernel uses the efi_memmap table returned from GetMemoryMap() to
+ learn the attributes supported by each region of physical address
+ space. Unfortunately, this table does not completely describe the
+ address space because some machines omit some or all of the MMIO
+ regions from the map.
+
+ The kernel maintains another table, kern_memmap, which describes the
+ memory Linux is actually using and the attribute for each region.
+ This contains only system memory; it does not contain MMIO space.
+
+ The kern_memmap table typically contains only a subset of the system
+ memory described by the efi_memmap. Linux/ia64 can't use all memory
+ in the system because of constraints imposed by the identity mapping
+ scheme.
+
+ The efi_memmap table is preserved unmodified because the original
+ boot-time information is required for kexec.
+
+KERNEL IDENTITY MAPPINGS
+
+ Linux/ia64 identity mappings are done with large pages, currently
+ either 16MB or 64MB, referred to as "granules." Cacheable mappings
+ are speculative[2], so the processor can read any location in the
+ page at any time, independent of the programmer's intentions. This
+ means that to avoid attribute aliasing, Linux can create a cacheable
+ identity mapping only when the entire granule supports cacheable
+ access.
+
+ Therefore, kern_memmap contains only full granule-sized regions that
+ can referenced safely by an identity mapping.
+
+ Uncacheable mappings are not speculative, so the processor will
+ generate UC accesses only to locations explicitly referenced by
+ software. This allows UC identity mappings to cover granules that
+ are only partially populated, or populated with a combination of UC
+ and WB regions.
+
+USER MAPPINGS
+
+ User mappings are typically done with 16K or 64K pages. The smaller
+ page size allows more flexibility because only 16K or 64K has to be
+ homogeneous with respect to memory attributes.
+
+POTENTIAL ATTRIBUTE ALIASING CASES
+
+ There are several ways the kernel creates new mappings:
+
+ mmap of /dev/mem
+
+ This uses remap_pfn_range(), which creates user mappings. These
+ mappings may be either WB or UC. If the region being mapped
+ happens to be in kern_memmap, meaning that it may also be mapped
+ by a kernel identity mapping, the user mapping must use the same
+ attribute as the kernel mapping.
+
+ If the region is not in kern_memmap, the user mapping should use
+ an attribute reported as being supported in the EFI memory map.
+
+ Since the EFI memory map does not describe MMIO on some
+ machines, this should use an uncacheable mapping as a fallback.
+
+ mmap of /sys/class/pci_bus/.../legacy_mem
+
+ This is very similar to mmap of /dev/mem, except that legacy_mem
+ only allows mmap of the one megabyte "legacy MMIO" area for a
+ specific PCI bus. Typically this is the first megabyte of
+ physical address space, but it may be different on machines with
+ several VGA devices.
+
+ "X" uses this to access VGA frame buffers. Using legacy_mem
+ rather than /dev/mem allows multiple instances of X to talk to
+ different VGA cards.
+
+ The /dev/mem mmap constraints apply.
+
+ mmap of /proc/bus/pci/.../??.?
+
+ This is an MMIO mmap of PCI functions, which additionally may or
+ may not be requested as using the WC attribute.
+
+ If WC is requested, and the region in kern_memmap is either WC
+ or UC, and the EFI memory map designates the region as WC, then
+ the WC mapping is allowed.
+
+ Otherwise, the user mapping must use the same attribute as the
+ kernel mapping.
+
+ read/write of /dev/mem
+
+ This uses copy_from_user(), which implicitly uses a kernel
+ identity mapping. This is obviously safe for things in
+ kern_memmap.
+
+ There may be corner cases of things that are not in kern_memmap,
+ but could be accessed this way. For example, registers in MMIO
+ space are not in kern_memmap, but could be accessed with a UC
+ mapping. This would not cause attribute aliasing. But
+ registers typically can be accessed only with four-byte or
+ eight-byte accesses, and the copy_from_user() path doesn't allow
+ any control over the access size, so this would be dangerous.
+
+ ioremap()
+
+ This returns a mapping for use inside the kernel.
+
+ If the region is in kern_memmap, we should use the attribute
+ specified there.
+
+ If the EFI memory map reports that the entire granule supports
+ WB, we should use that (granules that are partially reserved
+ or occupied by firmware do not appear in kern_memmap).
+
+ If the granule contains non-WB memory, but we can cover the
+ region safely with kernel page table mappings, we can use
+ ioremap_page_range() as most other architectures do.
+
+ Failing all of the above, we have to fall back to a UC mapping.
+
+PAST PROBLEM CASES
+
+ mmap of various MMIO regions from /dev/mem by "X" on Intel platforms
+
+ The EFI memory map may not report these MMIO regions.
+
+ These must be allowed so that X will work. This means that
+ when the EFI memory map is incomplete, every /dev/mem mmap must
+ succeed. It may create either WB or UC user mappings, depending
+ on whether the region is in kern_memmap or the EFI memory map.
+
+ mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
+
+ See https://bugzilla.novell.com/show_bug.cgi?id=140858.
+
+ The EFI memory map reports the following attributes:
+ 0x00000-0x9FFFF WB only
+ 0xA0000-0xBFFFF UC only (VGA frame buffer)
+ 0xC0000-0xFFFFF WB only
+
+ This mmap is done with user pages, not kernel identity mappings,
+ so it is safe to use WB mappings.
+
+ The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
+ which uses a granule-sized UC mapping. This granule will cover some
+ WB-only memory, but since UC is non-speculative, the processor will
+ never generate an uncacheable reference to the WB-only areas unless
+ the driver explicitly touches them.
+
+ mmap of 0x0-0xFFFFF legacy_mem by "X"
+
+ If the EFI memory map reports that the entire range supports the
+ same attributes, we can allow the mmap (and we will prefer WB if
+ supported, as is the case with HP sx[12]000 machines with VGA
+ disabled).
+
+ If EFI reports the range as partly WB and partly UC (as on sx[12]000
+ machines with VGA enabled), we must fail the mmap because there's no
+ safe attribute to use.
+
+ If EFI reports some of the range but not all (as on Intel firmware
+ that doesn't report the VGA frame buffer at all), we should fail the
+ mmap and force the user to map just the specific region of interest.
+
+ mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
+
+ The EFI memory map reports the following attributes:
+ 0x00000-0xFFFFF WB only (no VGA MMIO hole)
+
+ This is a special case of the previous case, and the mmap should
+ fail for the same reason as above.
+
+ read of /sys/devices/.../rom
+
+ For VGA devices, this may cause an ioremap() of 0xC0000. This
+ used to be done with a UC mapping, because the VGA frame buffer
+ at 0xA0000 prevents use of a WB granule. The UC mapping causes
+ an MCA on HP sx[12]000 chipsets.
+
+ We should use WB page table mappings to avoid covering the VGA
+ frame buffer.
+
+NOTES
+
+ [1] SDM rev 2.2, vol 2, sec 4.4.1.
+ [2] SDM rev 2.2, vol 2, sec 4.4.6.
diff --git a/Documentation/ia64/efirtc.txt b/Documentation/ia64/efirtc.txt
new file mode 100644
index 0000000..057e6be
--- /dev/null
+++ b/Documentation/ia64/efirtc.txt
@@ -0,0 +1,128 @@
+EFI Real Time Clock driver
+-------------------------------
+S. Eranian <eranian@hpl.hp.com>
+March 2000
+
+I/ Introduction
+
+This document describes the efirtc.c driver has provided for
+the IA-64 platform.
+
+The purpose of this driver is to supply an API for kernel and user applications
+to get access to the Time Service offered by EFI version 0.92.
+
+EFI provides 4 calls one can make once the OS is booted: GetTime(),
+SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this
+driver. We describe those calls as well the design of the driver in the
+following sections.
+
+II/ Design Decisions
+
+The original ideas was to provide a very simple driver to get access to,
+at first, the time of day service. This is required in order to access, in a
+portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock
+to initialize the system view of the time during boot.
+
+Because we wanted to minimize the impact on existing user-level apps using
+the CMOS clock, we decided to expose an API that was very similar to the one
+used today with the legacy RTC driver (driver/char/rtc.c). However, because
+EFI provides a simpler services, not all ioctl() are available. Also
+new ioctl()s have been introduced for things that EFI provides but not the
+legacy.
+
+EFI uses a slightly different way of representing the time, noticeably
+the reference date is different. Year is the using the full 4-digit format.
+The Epoch is January 1st 1998. For backward compatibility reasons we don't
+expose this new way of representing time. Instead we use something very
+similar to the struct tm, i.e. struct rtc_time, as used by hwclock.
+One of the reasons for doing it this way is to allow for EFI to still evolve
+without necessarily impacting any of the user applications. The decoupling
+enables flexibility and permits writing wrapper code is ncase things change.
+
+The driver exposes two interfaces, one via the device file and a set of
+ioctl()s. The other is read-only via the /proc filesystem.
+
+As of today we don't offer a /proc/sys interface.
+
+To allow for a uniform interface between the legacy RTC and EFI time service,
+we have created the include/linux/rtc.h header file to contain only the
+"public" API of the two drivers. The specifics of the legacy RTC are still
+in include/linux/mc146818rtc.h.
+
+
+III/ Time of day service
+
+The part of the driver gives access to the time of day service of EFI.
+Two ioctl()s, compatible with the legacy RTC calls:
+
+ Read the CMOS clock: ioctl(d, RTC_RD_TIME, &rtc);
+
+ Write the CMOS clock: ioctl(d, RTC_SET_TIME, &rtc);
+
+The rtc is a pointer to a data structure defined in rtc.h which is close
+to a struct tm:
+
+struct rtc_time {
+ int tm_sec;
+ int tm_min;
+ int tm_hour;
+ int tm_mday;
+ int tm_mon;
+ int tm_year;
+ int tm_wday;
+ int tm_yday;
+ int tm_isdst;
+};
+
+The driver takes care of converting back an forth between the EFI time and
+this format.
+
+Those two ioctl()s can be exercised with the hwclock command:
+
+For reading:
+# /sbin/hwclock --show
+Mon Mar 6 15:32:32 2000 -0.910248 seconds
+
+For setting:
+# /sbin/hwclock --systohc
+
+Root privileges are required to be able to set the time of day.
+
+IV/ Wakeup Alarm service
+
+EFI provides an API by which one can program when a machine should wakeup,
+i.e. reboot. This is very different from the alarm provided by the legacy
+RTC which is some kind of interval timer alarm. For this reason we don't use
+the same ioctl()s to get access to the service. Instead we have
+introduced 2 news ioctl()s to the interface of an RTC.
+
+We have added 2 new ioctl()s that are specific to the EFI driver:
+
+ Read the current state of the alarm
+ ioctl(d, RTC_WKLAM_RD, &wkt)
+
+ Set the alarm or change its status
+ ioctl(d, RTC_WKALM_SET, &wkt)
+
+The wkt structure encapsulates a struct rtc_time + 2 extra fields to get
+status information:
+
+struct rtc_wkalrm {
+
+ unsigned char enabled; /* =1 if alarm is enabled */
+ unsigned char pending; /* =1 if alarm is pending */
+
+ struct rtc_time time;
+}
+
+As of today, none of the existing user-level apps supports this feature.
+However writing such a program should be hard by simply using those two
+ioctl().
+
+Root privileges are required to be able to set the alarm.
+
+V/ References.
+
+Checkout the following Web site for more information on EFI:
+
+http://developer.intel.com/technology/efi/
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt
new file mode 100644
index 0000000..223e4f0
--- /dev/null
+++ b/Documentation/ia64/err_inject.txt
@@ -0,0 +1,1068 @@
+
+IPF Machine Check (MC) error inject tool
+========================================
+
+IPF Machine Check (MC) error inject tool is used to inject MC
+errors from Linux. The tool is a test bed for IPF MC work flow including
+hardware correctable error handling, OS recoverable error handling, MC
+event logging, etc.
+
+The tool includes two parts: a kernel driver and a user application
+sample. The driver provides interface to PAL to inject error
+and query error injection capabilities. The driver code is in
+arch/ia64/kernel/err_inject.c. The application sample (shown below)
+provides a combination of various errors and calls the driver's interface
+(sysfs interface) to inject errors or query error injection capabilities.
+
+The tool can be used to test Intel IPF machine MC handling capabilities.
+It's especially useful for people who can not access hardware MC injection
+tool to inject error. It's also very useful to integrate with other
+software test suits to do stressful testing on IPF.
+
+Below is a sample application as part of the whole tool. The sample
+can be used as a working test tool. Or it can be expanded to include
+more features. It also can be a integrated into a library or other user
+application to have more thorough test.
+
+The sample application takes err.conf as error configuration input. GCC
+compiles the code. After you install err_inject driver, you can run
+this sample application to inject errors.
+
+Errata: Itanium 2 Processors Specification Update lists some errata against
+the pal_mc_error_inject PAL procedure. The following err.conf has been tested
+on latest Montecito PAL.
+
+err.conf:
+
+#This is configuration file for err_inject_tool.
+#The format of the each line is:
+#cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer
+#where
+# cpu: logical cpu number the error will be inject in.
+# loop: times the error will be injected.
+# interval: In second. every so often one error is injected.
+# err_type_info, err_struct_info: PAL parameters.
+#
+#Note: All values are hex w/o or w/ 0x prefix.
+
+
+#On cpu2, inject only total 0x10 errors, interval 5 seconds
+#corrected, data cache, hier-2, physical addr(assigned by tool code).
+#working on Montecito latest PAL.
+2, 10, 5, 4101, 95
+
+#On cpu4, inject and consume total 0x10 errors, interval 5 seconds
+#corrected, data cache, hier-2, physical addr(assigned by tool code).
+#working on Montecito latest PAL.
+4, 10, 5, 4109, 95
+
+#On cpu15, inject and consume total 0x10 errors, interval 5 seconds
+#recoverable, DTR0, hier-2.
+#working on Montecito latest PAL.
+0xf, 0x10, 5, 4249, 15
+
+The sample application source code:
+
+err_injection_tool.c:
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Copyright (C) 2006 Intel Co
+ * Fenghua Yu <fenghua.yu@intel.com>
+ *
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/shm.h>
+
+#define MAX_FN_SIZE 256
+#define MAX_BUF_SIZE 256
+#define DATA_BUF_SIZE 256
+#define NR_CPUS 512
+#define MAX_TASK_NUM 2048
+#define MIN_INTERVAL 5 // seconds
+#define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte.
+#define PARA_FIELD_NUM 5
+#define MASK_SIZE (NR_CPUS/64)
+#define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/"
+
+int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask);
+
+int verbose;
+#define vbprintf if (verbose) printf
+
+int log_info(int cpu, const char *fmt, ...)
+{
+ FILE *log;
+ char fn[MAX_FN_SIZE];
+ char buf[MAX_BUF_SIZE];
+ va_list args;
+
+ sprintf(fn, "%d.log", cpu);
+ log=fopen(fn, "a+");
+ if (log==NULL) {
+ perror("Error open:");
+ return -1;
+ }
+
+ va_start(args, fmt);
+ vprintf(fmt, args);
+ memset(buf, 0, MAX_BUF_SIZE);
+ vsprintf(buf, fmt, args);
+ va_end(args);
+
+ fwrite(buf, sizeof(buf), 1, log);
+ fclose(log);
+
+ return 0;
+}
+
+typedef unsigned long u64;
+typedef unsigned int u32;
+
+typedef union err_type_info_u {
+ struct {
+ u64 mode : 3, /* 0-2 */
+ err_inj : 3, /* 3-5 */
+ err_sev : 2, /* 6-7 */
+ err_struct : 5, /* 8-12 */
+ struct_hier : 3, /* 13-15 */
+ reserved : 48; /* 16-63 */
+ } err_type_info_u;
+ u64 err_type_info;
+} err_type_info_t;
+
+typedef union err_struct_info_u {
+ struct {
+ u64 siv : 1, /* 0 */
+ c_t : 2, /* 1-2 */
+ cl_p : 3, /* 3-5 */
+ cl_id : 3, /* 6-8 */
+ cl_dp : 1, /* 9 */
+ reserved1 : 22, /* 10-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_cache;
+ struct {
+ u64 siv : 1, /* 0 */
+ tt : 2, /* 1-2 */
+ tc_tr : 2, /* 3-4 */
+ tr_slot : 8, /* 5-12 */
+ reserved1 : 19, /* 13-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_tlb;
+ struct {
+ u64 siv : 1, /* 0 */
+ regfile_id : 4, /* 1-4 */
+ reg_num : 7, /* 5-11 */
+ reserved1 : 20, /* 12-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_register;
+ struct {
+ u64 reserved;
+ } err_struct_info_bus_processor_interconnect;
+ u64 err_struct_info;
+} err_struct_info_t;
+
+typedef union err_data_buffer_u {
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ u64 inj_addr; /* 64-127 */
+ u64 way : 5, /* 128-132 */
+ index : 20, /* 133-152 */
+ : 39; /* 153-191 */
+ } err_data_buffer_cache;
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ u64 inj_addr; /* 64-127 */
+ u64 way : 5, /* 128-132 */
+ index : 20, /* 133-152 */
+ reserved : 39; /* 153-191 */
+ } err_data_buffer_tlb;
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ } err_data_buffer_register;
+ struct {
+ u64 reserved; /* 0-63 */
+ } err_data_buffer_bus_processor_interconnect;
+ u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
+} err_data_buffer_t;
+
+typedef union capabilities_u {
+ struct {
+ u64 i : 1,
+ d : 1,
+ rv : 1,
+ tag : 1,
+ data : 1,
+ mesi : 1,
+ dp : 1,
+ reserved1 : 3,
+ pa : 1,
+ va : 1,
+ wi : 1,
+ reserved2 : 20,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved3 : 30;
+ } capabilities_cache;
+ struct {
+ u64 d : 1,
+ i : 1,
+ rv : 1,
+ tc : 1,
+ tr : 1,
+ reserved1 : 27,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved2 : 30;
+ } capabilities_tlb;
+ struct {
+ u64 gr_b0 : 1,
+ gr_b1 : 1,
+ fr : 1,
+ br : 1,
+ pr : 1,
+ ar : 1,
+ cr : 1,
+ rr : 1,
+ pkr : 1,
+ dbr : 1,
+ ibr : 1,
+ pmc : 1,
+ pmd : 1,
+ reserved1 : 3,
+ regnum : 1,
+ reserved2 : 15,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved3 : 30;
+ } capabilities_register;
+ struct {
+ u64 reserved;
+ } capabilities_bus_processor_interconnect;
+} capabilities_t;
+
+typedef struct resources_s {
+ u64 ibr0 : 1,
+ ibr2 : 1,
+ ibr4 : 1,
+ ibr6 : 1,
+ dbr0 : 1,
+ dbr2 : 1,
+ dbr4 : 1,
+ dbr6 : 1,
+ reserved : 48;
+} resources_t;
+
+
+long get_page_size(void)
+{
+ long page_size=sysconf(_SC_PAGESIZE);
+ return page_size;
+}
+
+#define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size())
+#define SHM_SIZE (2*PAGE_SIZE*NR_CPUS)
+#define SHM_VA 0x2000000100000000
+
+int shmid;
+void *shmaddr;
+
+int create_shm(void)
+{
+ key_t key;
+ char fn[MAX_FN_SIZE];
+
+ /* cpu0 is always existing */
+ sprintf(fn, PATH_FORMAT, 0);
+ if ((key = ftok(fn, 's')) == -1) {
+ perror("ftok");
+ return -1;
+ }
+
+ shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT);
+ if (shmid == -1) {
+ if (errno==EEXIST) {
+ shmid = shmget(key, SHM_SIZE, 0);
+ if (shmid == -1) {
+ perror("shmget");
+ return -1;
+ }
+ }
+ else {
+ perror("shmget");
+ return -1;
+ }
+ }
+ vbprintf("shmid=%d", shmid);
+
+ /* connect to the segment: */
+ shmaddr = shmat(shmid, (void *)SHM_VA, 0);
+ if (shmaddr == (void*)-1) {
+ perror("shmat");
+ return -1;
+ }
+
+ memset(shmaddr, 0, SHM_SIZE);
+ mlock(shmaddr, SHM_SIZE);
+
+ return 0;
+}
+
+int free_shm()
+{
+ munlock(shmaddr, SHM_SIZE);
+ shmdt(shmaddr);
+ semctl(shmid, 0, IPC_RMID);
+
+ return 0;
+}
+
+#ifdef _SEM_SEMUN_UNDEFINED
+union semun
+{
+ int val;
+ struct semid_ds *buf;
+ unsigned short int *array;
+ struct seminfo *__buf;
+};
+#endif
+
+u32 mode=1; /* 1: physical mode; 2: virtual mode. */
+int one_lock=1;
+key_t key[NR_CPUS];
+int semid[NR_CPUS];
+
+int create_sem(int cpu)
+{
+ union semun arg;
+ char fn[MAX_FN_SIZE];
+ int sid;
+
+ sprintf(fn, PATH_FORMAT, cpu);
+ sprintf(fn, "%s/%s", fn, "err_type_info");
+ if ((key[cpu] = ftok(fn, 'e')) == -1) {
+ perror("ftok");
+ return -1;
+ }
+
+ if (semid[cpu]!=0)
+ return 0;
+
+ /* clear old semaphore */
+ if ((sid = semget(key[cpu], 1, 0)) != -1)
+ semctl(sid, 0, IPC_RMID);
+
+ /* get one semaphore */
+ if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) {
+ perror("semget");
+ printf("Please remove semaphore with key=0x%lx, then run the tool.\n",
+ (u64)key[cpu]);
+ return -1;
+ }
+
+ vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu,
+ (u64)key[cpu]);
+ /* initialize the semaphore to 1: */
+ arg.val = 1;
+ if (semctl(semid[cpu], 0, SETVAL, arg) == -1) {
+ perror("semctl");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int lock(int cpu)
+{
+ struct sembuf lock;
+
+ lock.sem_num = cpu;
+ lock.sem_op = 1;
+ semop(semid[cpu], &lock, 1);
+
+ return 0;
+}
+
+static int unlock(int cpu)
+{
+ struct sembuf unlock;
+
+ unlock.sem_num = cpu;
+ unlock.sem_op = -1;
+ semop(semid[cpu], &unlock, 1);
+
+ return 0;
+}
+
+void free_sem(int cpu)
+{
+ semctl(semid[cpu], 0, IPC_RMID);
+}
+
+int wr_multi(char *fn, unsigned long *data, int size)
+{
+ int fd;
+ char buf[MAX_BUF_SIZE];
+ int ret;
+
+ if (size==1)
+ sprintf(buf, "%lx", *data);
+ else if (size==3)
+ sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]);
+ else {
+ fprintf(stderr,"write to file with wrong size!\n");
+ return -1;
+ }
+
+ fd=open(fn, O_RDWR);
+ if (!fd) {
+ perror("Error:");
+ return -1;
+ }
+ ret=write(fd, buf, sizeof(buf));
+ close(fd);
+ return ret;
+}
+
+int wr(char *fn, unsigned long data)
+{
+ return wr_multi(fn, &data, 1);
+}
+
+int rd(char *fn, unsigned long *data)
+{
+ int fd;
+ char buf[MAX_BUF_SIZE];
+
+ fd=open(fn, O_RDONLY);
+ if (fd<0) {
+ perror("Error:");
+ return -1;
+ }
+ read(fd, buf, MAX_BUF_SIZE);
+ *data=strtoul(buf, NULL, 16);
+ close(fd);
+ return 0;
+}
+
+int rd_status(char *path, int *status)
+{
+ char fn[MAX_FN_SIZE];
+ sprintf(fn, "%s/status", path);
+ if (rd(fn, (u64*)status)<0) {
+ perror("status reading error.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+int rd_capabilities(char *path, u64 *capabilities)
+{
+ char fn[MAX_FN_SIZE];
+ sprintf(fn, "%s/capabilities", path);
+ if (rd(fn, capabilities)<0) {
+ perror("capabilities reading error.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+int rd_all(char *path)
+{
+ unsigned long err_type_info, err_struct_info, err_data_buffer;
+ int status;
+ unsigned long capabilities, resources;
+ char fn[MAX_FN_SIZE];
+
+ sprintf(fn, "%s/err_type_info", path);
+ if (rd(fn, &err_type_info)<0) {
+ perror("err_type_info reading error.\n");
+ return -1;
+ }
+ printf("err_type_info=%lx\n", err_type_info);
+
+ sprintf(fn, "%s/err_struct_info", path);
+ if (rd(fn, &err_struct_info)<0) {
+ perror("err_struct_info reading error.\n");
+ return -1;
+ }
+ printf("err_struct_info=%lx\n", err_struct_info);
+
+ sprintf(fn, "%s/err_data_buffer", path);
+ if (rd(fn, &err_data_buffer)<0) {
+ perror("err_data_buffer reading error.\n");
+ return -1;
+ }
+ printf("err_data_buffer=%lx\n", err_data_buffer);
+
+ sprintf(fn, "%s/status", path);
+ if (rd("status", (u64*)&status)<0) {
+ perror("status reading error.\n");
+ return -1;
+ }
+ printf("status=%d\n", status);
+
+ sprintf(fn, "%s/capabilities", path);
+ if (rd(fn,&capabilities)<0) {
+ perror("capabilities reading error.\n");
+ return -1;
+ }
+ printf("capabilities=%lx\n", capabilities);
+
+ sprintf(fn, "%s/resources", path);
+ if (rd(fn, &resources)<0) {
+ perror("resources reading error.\n");
+ return -1;
+ }
+ printf("resources=%lx\n", resources);
+
+ return 0;
+}
+
+int query_capabilities(char *path, err_type_info_t err_type_info,
+ u64 *capabilities)
+{
+ char fn[MAX_FN_SIZE];
+ err_struct_info_t err_struct_info;
+ err_data_buffer_t err_data_buffer;
+
+ err_struct_info.err_struct_info=0;
+ memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8);
+
+ sprintf(fn, "%s/err_type_info", path);
+ wr(fn, err_type_info.err_type_info);
+ sprintf(fn, "%s/err_struct_info", path);
+ wr(fn, 0x0);
+ sprintf(fn, "%s/err_data_buffer", path);
+ wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
+
+ // Fire pal_mc_error_inject procedure.
+ sprintf(fn, "%s/call_start", path);
+ wr(fn, mode);
+
+ if (rd_capabilities(path, capabilities)<0)
+ return -1;
+
+ return 0;
+}
+
+int query_all_capabilities()
+{
+ int status;
+ err_type_info_t err_type_info;
+ int err_sev, err_struct, struct_hier;
+ int cap=0;
+ u64 capabilities;
+ char path[MAX_FN_SIZE];
+
+ err_type_info.err_type_info=0; // Initial
+ err_type_info.err_type_info_u.mode=0; // Query mode;
+ err_type_info.err_type_info_u.err_inj=0;
+
+ printf("All capabilities implemented in pal_mc_error_inject:\n");
+ sprintf(path, PATH_FORMAT ,0);
+ for (err_sev=0;err_sev<3;err_sev++)
+ for (err_struct=0;err_struct<5;err_struct++)
+ for (struct_hier=0;struct_hier<5;struct_hier++)
+ {
+ status=-1;
+ capabilities=0;
+ err_type_info.err_type_info_u.err_sev=err_sev;
+ err_type_info.err_type_info_u.err_struct=err_struct;
+ err_type_info.err_type_info_u.struct_hier=struct_hier;
+
+ if (query_capabilities(path, err_type_info, &capabilities)<0)
+ continue;
+
+ if (rd_status(path, &status)<0)
+ continue;
+
+ if (status==0) {
+ cap=1;
+ printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ",
+ err_sev, err_struct, struct_hier);
+ printf("capabilities 0x%lx\n", capabilities);
+ }
+ }
+ if (!cap) {
+ printf("No capabilities supported.\n");
+ return 0;
+ }
+
+ return 0;
+}
+
+int err_inject(int cpu, char *path, err_type_info_t err_type_info,
+ err_struct_info_t err_struct_info,
+ err_data_buffer_t err_data_buffer)
+{
+ int status;
+ char fn[MAX_FN_SIZE];
+
+ log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ",
+ err_type_info.err_type_info,
+ err_struct_info.err_struct_info);
+ log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n",
+ err_data_buffer.err_data_buffer[0],
+ err_data_buffer.err_data_buffer[1],
+ err_data_buffer.err_data_buffer[2]);
+ sprintf(fn, "%s/err_type_info", path);
+ wr(fn, err_type_info.err_type_info);
+ sprintf(fn, "%s/err_struct_info", path);
+ wr(fn, err_struct_info.err_struct_info);
+ sprintf(fn, "%s/err_data_buffer", path);
+ wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
+
+ // Fire pal_mc_error_inject procedure.
+ sprintf(fn, "%s/call_start", path);
+ wr(fn,mode);
+
+ if (rd_status(path, &status)<0) {
+ vbprintf("fail: read status\n");
+ return -100;
+ }
+
+ if (status!=0) {
+ log_info(cpu, "fail: status=%d\n", status);
+ return status;
+ }
+
+ return status;
+}
+
+static int construct_data_buf(char *path, err_type_info_t err_type_info,
+ err_struct_info_t err_struct_info,
+ err_data_buffer_t *err_data_buffer,
+ void *va1)
+{
+ char fn[MAX_FN_SIZE];
+ u64 virt_addr=0, phys_addr=0;
+
+ vbprintf("va1=%lx\n", (u64)va1);
+ memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8);
+
+ switch (err_type_info.err_type_info_u.err_struct) {
+ case 1: // Cache
+ switch (err_struct_info.err_struct_info_cache.cl_id) {
+ case 1: //Virtual addr
+ err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1;
+ break;
+ case 2: //Phys addr
+ sprintf(fn, "%s/virtual_to_phys", path);
+ virt_addr=(u64)va1;
+ if (wr(fn,virt_addr)<0)
+ return -1;
+ rd(fn, &phys_addr);
+ err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr;
+ break;
+ default:
+ printf("Not supported cl_id\n");
+ break;
+ }
+ break;
+ case 2: // TLB
+ break;
+ case 3: // Register file
+ break;
+ case 4: // Bus/system interconnect
+ default:
+ printf("Not supported err_struct\n");
+ break;
+ }
+
+ return 0;
+}
+
+typedef struct {
+ u64 cpu;
+ u64 loop;
+ u64 interval;
+ u64 err_type_info;
+ u64 err_struct_info;
+ u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
+} parameters_t;
+
+parameters_t line_para;
+int para;
+
+static int empty_data_buffer(u64 *err_data_buffer)
+{
+ int empty=1;
+ int i;
+
+ for (i=0;i<ERR_DATA_BUFFER_SIZE; i++)
+ if (err_data_buffer[i]!=-1)
+ empty=0;
+
+ return empty;
+}
+
+int err_inj()
+{
+ err_type_info_t err_type_info;
+ err_struct_info_t err_struct_info;
+ err_data_buffer_t err_data_buffer;
+ int count;
+ FILE *fp;
+ unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf;
+ u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE];
+ int num;
+ int i;
+ char path[MAX_FN_SIZE];
+ parameters_t parameters[MAX_TASK_NUM]={};
+ pid_t child_pid[MAX_TASK_NUM];
+ time_t current_time;
+ int status;
+
+ if (!para) {
+ fp=fopen("err.conf", "r");
+ if (fp==NULL) {
+ perror("Error open err.conf");
+ return -1;
+ }
+
+ num=0;
+ while (!feof(fp)) {
+ char buf[256];
+ memset(buf,0,256);
+ fgets(buf, 256, fp);
+ count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
+ &cpu, &loop, &interval,&err_type_info_conf,
+ &err_struct_info_conf,
+ &err_data_buffer_conf[0],
+ &err_data_buffer_conf[1],
+ &err_data_buffer_conf[2]);
+ if (count!=PARA_FIELD_NUM+3) {
+ err_data_buffer_conf[0]=-1;
+ err_data_buffer_conf[1]=-1;
+ err_data_buffer_conf[2]=-1;
+ count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n",
+ &cpu, &loop, &interval,&err_type_info_conf,
+ &err_struct_info_conf);
+ if (count!=PARA_FIELD_NUM)
+ continue;
+ }
+
+ parameters[num].cpu=cpu;
+ parameters[num].loop=loop;
+ parameters[num].interval= interval>MIN_INTERVAL
+ ?interval:MIN_INTERVAL;
+ parameters[num].err_type_info=err_type_info_conf;
+ parameters[num].err_struct_info=err_struct_info_conf;
+ memcpy(parameters[num++].err_data_buffer,
+ err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ;
+
+ if (num>=MAX_TASK_NUM)
+ break;
+ }
+ }
+ else {
+ parameters[0].cpu=line_para.cpu;
+ parameters[0].loop=line_para.loop;
+ parameters[0].interval= line_para.interval>MIN_INTERVAL
+ ?line_para.interval:MIN_INTERVAL;
+ parameters[0].err_type_info=line_para.err_type_info;
+ parameters[0].err_struct_info=line_para.err_struct_info;
+ memcpy(parameters[0].err_data_buffer,
+ line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ;
+
+ num=1;
+ }
+
+ /* Create semaphore: If one_lock, one semaphore for all processors.
+ Otherwise, one semaphore for each processor. */
+ if (one_lock) {
+ if (create_sem(0)) {
+ printf("Can not create semaphore...exit\n");
+ free_sem(0);
+ return -1;
+ }
+ }
+ else {
+ for (i=0;i<num;i++) {
+ if (create_sem(parameters[i].cpu)) {
+ printf("Can not create semaphore for cpu%d...exit\n",i);
+ free_sem(parameters[num].cpu);
+ return -1;
+ }
+ }
+ }
+
+ /* Create a shm segment which will be used to inject/consume errors on.*/
+ if (create_shm()==-1) {
+ printf("Error to create shm...exit\n");
+ return -1;
+ }
+
+ for (i=0;i<num;i++) {
+ pid_t pid;
+
+ current_time=time(NULL);
+ log_info(parameters[i].cpu, "\nBegine at %s", ctime(&current_time));
+ log_info(parameters[i].cpu, "Configurations:\n");
+ log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)",
+ parameters[i].cpu,
+ parameters[i].loop,
+ parameters[i].interval);
+ log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n",
+ parameters[i].err_type_info,
+ parameters[i].err_struct_info);
+
+ sprintf(path, PATH_FORMAT, (int)parameters[i].cpu);
+ err_type_info.err_type_info=parameters[i].err_type_info;
+ err_struct_info.err_struct_info=parameters[i].err_struct_info;
+ memcpy(err_data_buffer.err_data_buffer,
+ parameters[i].err_data_buffer,
+ ERR_DATA_BUFFER_SIZE*8);
+
+ pid=fork();
+ if (pid==0) {
+ unsigned long mask[MASK_SIZE];
+ int j, k;
+
+ void *va1, *va2;
+
+ /* Allocate two memory areas va1 and va2 in shm */
+ va1=shmaddr+parameters[i].cpu*PAGE_SIZE;
+ va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE;
+
+ vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2);
+ memset(va1, 0x1, PAGE_SIZE);
+ memset(va2, 0x2, PAGE_SIZE);
+
+ if (empty_data_buffer(err_data_buffer.err_data_buffer))
+ /* If not specified yet, construct data buffer
+ * with va1
+ */
+ construct_data_buf(path, err_type_info,
+ err_struct_info, &err_data_buffer,va1);
+
+ for (j=0;j<MASK_SIZE;j++)
+ mask[j]=0;
+
+ cpu=parameters[i].cpu;
+ k = cpu%64;
+ j = cpu/64;
+ mask[j]=1<<k;
+
+ if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) {
+ perror("Error sched_setaffinity:");
+ return -1;
+ }
+
+ for (j=0; j<parameters[i].loop; j++) {
+ log_info(parameters[i].cpu,"Injection ");
+ log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ",
+
+ parameters[i].cpu,j+1, parameters[i].loop);
+
+ /* Hold the lock */
+ if (one_lock)
+ lock(0);
+ else
+ /* Hold lock on this cpu */
+ lock(parameters[i].cpu);
+
+ if ((status=err_inject(parameters[i].cpu,
+ path, err_type_info,
+ err_struct_info, err_data_buffer))
+ ==0) {
+ /* consume the error for "inject only"*/
+ memcpy(va2, va1, PAGE_SIZE);
+ memcpy(va1, va2, PAGE_SIZE);
+ log_info(parameters[i].cpu,
+ "successful\n");
+ }
+ else {
+ log_info(parameters[i].cpu,"fail:");
+ log_info(parameters[i].cpu,
+ "status=%d\n", status);
+ unlock(parameters[i].cpu);
+ break;
+ }
+ if (one_lock)
+ /* Release the lock */
+ unlock(0);
+ /* Release lock on this cpu */
+ else
+ unlock(parameters[i].cpu);
+
+ if (j < parameters[i].loop-1)
+ sleep(parameters[i].interval);
+ }
+ current_time=time(NULL);
+ log_info(parameters[i].cpu, "Done at %s", ctime(&current_time));
+ return 0;
+ }
+ else if (pid<0) {
+ perror("Error fork:");
+ continue;
+ }
+ child_pid[i]=pid;
+ }
+ for (i=0;i<num;i++)
+ waitpid(child_pid[i], NULL, 0);
+
+ if (one_lock)
+ free_sem(0);
+ else
+ for (i=0;i<num;i++)
+ free_sem(parameters[i].cpu);
+
+ printf("All done.\n");
+
+ return 0;
+}
+
+void help()
+{
+ printf("err_inject_tool:\n");
+ printf("\t-q: query all capabilities. default: off\n");
+ printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n");
+ printf("\t-i: inject errors. default: off\n");
+ printf("\t-l: one lock per cpu. default: one lock for all\n");
+ printf("\t-e: error parameters:\n");
+ printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n");
+ printf("\t\t cpu: logical cpu number the error will be inject in.\n");
+ printf("\t\t loop: times the error will be injected.\n");
+ printf("\t\t interval: In second. every so often one error is injected.\n");
+ printf("\t\t err_type_info, err_struct_info: PAL parameters.\n");
+ printf("\t\t err_data_buffer: PAL parameter. Optional. If not present,\n");
+ printf("\t\t it's constructed by tool automatically. Be\n");
+ printf("\t\t careful to provide err_data_buffer and make\n");
+ printf("\t\t sure it's working with the environment.\n");
+ printf("\t Note:no space between error parameters.\n");
+ printf("\t default: Take error parameters from err.conf instead of command line.\n");
+ printf("\t-v: verbose. default: off\n");
+ printf("\t-h: help\n\n");
+ printf("The tool will take err.conf file as ");
+ printf("input to inject single or multiple errors ");
+ printf("on one or multiple cpus in parallel.\n");
+}
+
+int main(int argc, char **argv)
+{
+ char c;
+ int do_err_inj=0;
+ int do_query_all=0;
+ int count;
+ u32 m;
+
+ /* Default one lock for all cpu's */
+ one_lock=1;
+ while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF)
+ switch (c) {
+ case 'm': /* Procedure mode. 1: phys 2: virt */
+ count=sscanf(optarg, "%x", &m);
+ if (count!=1 || (m!=1 && m!=2)) {
+ printf("Wrong mode number.\n");
+ help();
+ return -1;
+ }
+ mode=m;
+ break;
+ case 'i': /* Inject errors */
+ do_err_inj=1;
+ break;
+ case 'q': /* Query */
+ do_query_all=1;
+ break;
+ case 'v': /* Verbose */
+ verbose=1;
+ break;
+ case 'l': /* One lock per cpu */
+ one_lock=0;
+ break;
+ case 'e': /* error arguments */
+ /* Take parameters:
+ * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer]
+ * err_data_buffer is optional. Recommend not to specify
+ * err_data_buffer. Better to use tool to generate it.
+ */
+ count=sscanf(optarg,
+ "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
+ &line_para.cpu,
+ &line_para.loop,
+ &line_para.interval,
+ &line_para.err_type_info,
+ &line_para.err_struct_info,
+ &line_para.err_data_buffer[0],
+ &line_para.err_data_buffer[1],
+ &line_para.err_data_buffer[2]);
+ if (count!=PARA_FIELD_NUM+3) {
+ line_para.err_data_buffer[0]=-1,
+ line_para.err_data_buffer[1]=-1,
+ line_para.err_data_buffer[2]=-1;
+ count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n",
+ &line_para.cpu,
+ &line_para.loop,
+ &line_para.interval,
+ &line_para.err_type_info,
+ &line_para.err_struct_info);
+ if (count!=PARA_FIELD_NUM) {
+ printf("Wrong error arguments.\n");
+ help();
+ return -1;
+ }
+ }
+ para=1;
+ break;
+ continue;
+ break;
+ case 'h':
+ help();
+ return 0;
+ default:
+ break;
+ }
+
+ if (do_query_all)
+ query_all_capabilities();
+ if (do_err_inj)
+ err_inj();
+
+ if (!do_query_all && !do_err_inj)
+ help();
+
+ return 0;
+}
+
diff --git a/Documentation/ia64/fsys.txt b/Documentation/ia64/fsys.txt
new file mode 100644
index 0000000..59dd689
--- /dev/null
+++ b/Documentation/ia64/fsys.txt
@@ -0,0 +1,286 @@
+-*-Mode: outline-*-
+
+ Light-weight System Calls for IA-64
+ -----------------------------------
+
+ Started: 13-Jan-2003
+ Last update: 27-Sep-2003
+
+ David Mosberger-Tang
+ <davidm@hpl.hp.com>
+
+Using the "epc" instruction effectively introduces a new mode of
+execution to the ia64 linux kernel. We call this mode the
+"fsys-mode". To recap, the normal states of execution are:
+
+ - kernel mode:
+ Both the register stack and the memory stack have been
+ switched over to kernel memory. The user-level state is saved
+ in a pt-regs structure at the top of the kernel memory stack.
+
+ - user mode:
+ Both the register stack and the kernel stack are in
+ user memory. The user-level state is contained in the
+ CPU registers.
+
+ - bank 0 interruption-handling mode:
+ This is the non-interruptible state which all
+ interruption-handlers start execution in. The user-level
+ state remains in the CPU registers and some kernel state may
+ be stored in bank 0 of registers r16-r31.
+
+In contrast, fsys-mode has the following special properties:
+
+ - execution is at privilege level 0 (most-privileged)
+
+ - CPU registers may contain a mixture of user-level and kernel-level
+ state (it is the responsibility of the kernel to ensure that no
+ security-sensitive kernel-level state is leaked back to
+ user-level)
+
+ - execution is interruptible and preemptible (an fsys-mode handler
+ can disable interrupts and avoid all other interruption-sources
+ to avoid preemption)
+
+ - neither the memory-stack nor the register-stack can be trusted while
+ in fsys-mode (they point to the user-level stacks, which may
+ be invalid, or completely bogus addresses)
+
+In summary, fsys-mode is much more similar to running in user-mode
+than it is to running in kernel-mode. Of course, given that the
+privilege level is at level 0, this means that fsys-mode requires some
+care (see below).
+
+
+* How to tell fsys-mode
+
+Linux operates in fsys-mode when (a) the privilege level is 0 (most
+privileged) and (b) the stacks have NOT been switched to kernel memory
+yet. For convenience, the header file <asm-ia64/ptrace.h> provides
+three macros:
+
+ user_mode(regs)
+ user_stack(task,regs)
+ fsys_mode(task,regs)
+
+The "regs" argument is a pointer to a pt_regs structure. The "task"
+argument is a pointer to the task structure to which the "regs"
+pointer belongs to. user_mode() returns TRUE if the CPU state pointed
+to by "regs" was executing in user mode (privilege level 3).
+user_stack() returns TRUE if the state pointed to by "regs" was
+executing on the user-level stack(s). Finally, fsys_mode() returns
+TRUE if the CPU state pointed to by "regs" was executing in fsys-mode.
+The fsys_mode() macro is equivalent to the expression:
+
+ !user_mode(regs) && user_stack(task,regs)
+
+* How to write an fsyscall handler
+
+The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers
+(fsyscall_table). This table contains one entry for each system call.
+By default, a system call is handled by fsys_fallback_syscall(). This
+routine takes care of entering (full) kernel mode and calling the
+normal Linux system call handler. For performance-critical system
+calls, it is possible to write a hand-tuned fsyscall_handler. For
+example, fsys.S contains fsys_getpid(), which is a hand-tuned version
+of the getpid() system call.
+
+The entry and exit-state of an fsyscall handler is as follows:
+
+** Machine state on entry to fsyscall handler:
+
+ - r10 = 0
+ - r11 = saved ar.pfs (a user-level value)
+ - r15 = system call number
+ - r16 = "current" task pointer (in normal kernel-mode, this is in r13)
+ - r32-r39 = system call arguments
+ - b6 = return address (a user-level value)
+ - ar.pfs = previous frame-state (a user-level value)
+ - PSR.be = cleared to zero (i.e., little-endian byte order is in effect)
+ - all other registers may contain values passed in from user-mode
+
+** Required machine state on exit to fsyscall handler:
+
+ - r11 = saved ar.pfs (as passed into the fsyscall handler)
+ - r15 = system call number (as passed into the fsyscall handler)
+ - r32-r39 = system call arguments (as passed into the fsyscall handler)
+ - b6 = return address (as passed into the fsyscall handler)
+ - ar.pfs = previous frame-state (as passed into the fsyscall handler)
+
+Fsyscall handlers can execute with very little overhead, but with that
+speed comes a set of restrictions:
+
+ o Fsyscall-handlers MUST check for any pending work in the flags
+ member of the thread-info structure and if any of the
+ TIF_ALLWORK_MASK flags are set, the handler needs to fall back on
+ doing a full system call (by calling fsys_fallback_syscall).
+
+ o Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11,
+ r15, b6, and ar.pfs) because they will be needed in case of a
+ system call restart. Of course, all "preserved" registers also
+ must be preserved, in accordance to the normal calling conventions.
+
+ o Fsyscall-handlers MUST check argument registers for containing a
+ NaT value before using them in any way that could trigger a
+ NaT-consumption fault. If a system call argument is found to
+ contain a NaT value, an fsyscall-handler may return immediately
+ with r8=EINVAL, r10=-1.
+
+ o Fsyscall-handlers MUST NOT use the "alloc" instruction or perform
+ any other operation that would trigger mandatory RSE
+ (register-stack engine) traffic.
+
+ o Fsyscall-handlers MUST NOT write to any stacked registers because
+ it is not safe to assume that user-level called a handler with the
+ proper number of arguments.
+
+ o Fsyscall-handlers need to be careful when accessing per-CPU variables:
+ unless proper safe-guards are taken (e.g., interruptions are avoided),
+ execution may be pre-empted and resumed on another CPU at any given
+ time.
+
+ o Fsyscall-handlers must be careful not to leak sensitive kernel'
+ information back to user-level. In particular, before returning to
+ user-level, care needs to be taken to clear any scratch registers
+ that could contain sensitive information (note that the current
+ task pointer is not considered sensitive: it's already exposed
+ through ar.k6).
+
+ o Fsyscall-handlers MUST NOT access user-memory without first
+ validating access-permission (this can be done typically via
+ probe.r.fault and/or probe.w.fault) and without guarding against
+ memory access exceptions (this can be done with the EX() macros
+ defined by asmmacro.h).
+
+The above restrictions may seem draconian, but remember that it's
+possible to trade off some of the restrictions by paying a slightly
+higher overhead. For example, if an fsyscall-handler could benefit
+from the shadow register bank, it could temporarily disable PSR.i and
+PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as
+needed. In other words, following the above rules yields extremely
+fast system call execution (while fully preserving system call
+semantics), but there is also a lot of flexibility in handling more
+complicated cases.
+
+* Signal handling
+
+The delivery of (asynchronous) signals must be delayed until fsys-mode
+is exited. This is accomplished with the help of the lower-privilege
+transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user()
+checks whether the interrupted task was in fsys-mode and, if so, sets
+PSR.lp and returns immediately. When fsys-mode is exited via the
+"br.ret" instruction that lowers the privilege level, a trap will
+occur. The trap handler clears PSR.lp again and returns immediately.
+The kernel exit path then checks for and delivers any pending signals.
+
+* PSR Handling
+
+The "epc" instruction doesn't change the contents of PSR at all. This
+is in contrast to a regular interruption, which clears almost all
+bits. Because of that, some care needs to be taken to ensure things
+work as expected. The following discussion describes how each PSR bit
+is handled.
+
+PSR.be Cleared when entering fsys-mode. A srlz.d instruction is used
+ to ensure the CPU is in little-endian mode before the first
+ load/store instruction is executed. PSR.be is normally NOT
+ restored upon return from an fsys-mode handler. In other
+ words, user-level code must not rely on PSR.be being preserved
+ across a system call.
+PSR.up Unchanged.
+PSR.ac Unchanged.
+PSR.mfl Unchanged. Note: fsys-mode handlers must not write-registers!
+PSR.mfh Unchanged. Note: fsys-mode handlers must not write-registers!
+PSR.ic Unchanged. Note: fsys-mode handlers can clear the bit, if needed.
+PSR.i Unchanged. Note: fsys-mode handlers can clear the bit, if needed.
+PSR.pk Unchanged.
+PSR.dt Unchanged.
+PSR.dfl Unchanged. Note: fsys-mode handlers must not write-registers!
+PSR.dfh Unchanged. Note: fsys-mode handlers must not write-registers!
+PSR.sp Unchanged.
+PSR.pp Unchanged.
+PSR.di Unchanged.
+PSR.si Unchanged.
+PSR.db Unchanged. The kernel prevents user-level from setting a hardware
+ breakpoint that triggers at any privilege level other than 3 (user-mode).
+PSR.lp Unchanged.
+PSR.tb Lazy redirect. If a taken-branch trap occurs while in
+ fsys-mode, the trap-handler modifies the saved machine state
+ such that execution resumes in the gate page at
+ syscall_via_break(), with privilege level 3. Note: the
+ taken branch would occur on the branch invoking the
+ fsyscall-handler, at which point, by definition, a syscall
+ restart is still safe. If the system call number is invalid,
+ the fsys-mode handler will return directly to user-level. This
+ return will trigger a taken-branch trap, but since the trap is
+ taken _after_ restoring the privilege level, the CPU has already
+ left fsys-mode, so no special treatment is needed.
+PSR.rt Unchanged.
+PSR.cpl Cleared to 0.
+PSR.is Unchanged (guaranteed to be 0 on entry to the gate page).
+PSR.mc Unchanged.
+PSR.it Unchanged (guaranteed to be 1).
+PSR.id Unchanged. Note: the ia64 linux kernel never sets this bit.
+PSR.da Unchanged. Note: the ia64 linux kernel never sets this bit.
+PSR.dd Unchanged. Note: the ia64 linux kernel never sets this bit.
+PSR.ss Lazy redirect. If set, "epc" will cause a Single Step Trap to
+ be taken. The trap handler then modifies the saved machine
+ state such that execution resumes in the gate page at
+ syscall_via_break(), with privilege level 3.
+PSR.ri Unchanged.
+PSR.ed Unchanged. Note: This bit could only have an effect if an fsys-mode
+ handler performed a speculative load that gets NaTted. If so, this
+ would be the normal & expected behavior, so no special treatment is
+ needed.
+PSR.bn Unchanged. Note: fsys-mode handlers may clear the bit, if needed.
+ Doing so requires clearing PSR.i and PSR.ic as well.
+PSR.ia Unchanged. Note: the ia64 linux kernel never sets this bit.
+
+* Using fast system calls
+
+To use fast system calls, userspace applications need simply call
+__kernel_syscall_via_epc(). For example
+
+-- example fgettimeofday() call --
+-- fgettimeofday.S --
+
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(fgettimeofday)
+.prologue
+.save ar.pfs, r11
+mov r11 = ar.pfs
+.body
+
+mov r2 = 0xa000000000020660;; // gate address
+ // found by inspection of System.map for the
+ // __kernel_syscall_via_epc() function. See
+ // below for how to do this for real.
+
+mov b7 = r2
+mov r15 = 1087 // gettimeofday syscall
+;;
+br.call.sptk.many b6 = b7
+;;
+
+.restore sp
+
+mov ar.pfs = r11
+br.ret.sptk.many rp;; // return to caller
+END(fgettimeofday)
+
+-- end fgettimeofday.S --
+
+In reality, getting the gate address is accomplished by two extra
+values passed via the ELF auxiliary vector (include/asm-ia64/elf.h)
+
+ o AT_SYSINFO : is the address of __kernel_syscall_via_epc()
+ o AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO
+
+The ELF DSO is a pre-linked library that is mapped in by the kernel at
+the gate page. It is a proper ELF shared object so, with a dynamic
+loader that recognises the library, you should be able to make calls to
+the exported functions within it as with any other shared library.
+AT_SYSINFO points into the kernel DSO at the
+__kernel_syscall_via_epc() function for historical reasons (it was
+used before the kernel DSO) and as a convenience.
diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt
new file mode 100644
index 0000000..84f7cb3
--- /dev/null
+++ b/Documentation/ia64/kvm.txt
@@ -0,0 +1,83 @@
+Currently, kvm module is in EXPERIMENTAL stage on IA64. This means that
+interfaces are not stable enough to use. So, please don't run critical
+applications in virtual machine.
+We will try our best to improve it in future versions!
+
+ Guide: How to boot up guests on kvm/ia64
+
+This guide is to describe how to enable kvm support for IA-64 systems.
+
+1. Get the kvm source from git.kernel.org.
+ Userspace source:
+ git clone git://git.kernel.org/pub/scm/virt/kvm/kvm-userspace.git
+ Kernel Source:
+ git clone git://git.kernel.org/pub/scm/linux/kernel/git/xiantao/kvm-ia64.git
+
+2. Compile the source code.
+ 2.1 Compile userspace code:
+ (1)cd ./kvm-userspace
+ (2)./configure
+ (3)cd kernel
+ (4)make sync LINUX= $kernel_dir (kernel_dir is the directory of kernel source.)
+ (5)cd ..
+ (6)make qemu
+ (7)cd qemu; make install
+
+ 2.2 Compile kernel source code:
+ (1) cd ./$kernel_dir
+ (2) Make menuconfig
+ (3) Enter into virtualization option, and choose kvm.
+ (4) make
+ (5) Once (4) done, make modules_install
+ (6) Make initrd, and use new kernel to reboot up host machine.
+ (7) Once (6) done, cd $kernel_dir/arch/ia64/kvm
+ (8) insmod kvm.ko; insmod kvm-intel.ko
+
+Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qemu, otherwise, may fail.
+
+3. Get Guest Firmware named as Flash.fd, and put it under right place:
+ (1) If you have the guest firmware (binary) released by Intel Corp for Xen, use it directly.
+
+ (2) If you have no firmware at hand, Please download its source from
+ hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg
+ you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries.
+
+ (3) Rename the firware you owned to Flash.fd, and copy it to /usr/local/share/qemu
+
+4. Boot up Linux or Windows guests:
+ 4.1 Create or install a image for guest boot. If you have xen experience, it should be easy.
+
+ 4.2 Boot up guests use the following command.
+ /usr/local/bin/qemu-system-ia64 -smp xx -m 512 -hda $your_image
+ (xx is the number of virtual processors for the guest, now the maximum value is 4)
+
+5. Known possible issue on some platforms with old Firmware.
+
+In the event of strange host crash issues, try to solve it through either of the following ways:
+
+(1): Upgrade your Firmware to the latest one.
+
+(2): Applying the below patch to kernel source.
+diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S
+index 0b53344..f02b0f7 100644
+--- a/arch/ia64/kernel/pal.S
++++ b/arch/ia64/kernel/pal.S
+@@ -84,7 +84,8 @@ GLOBAL_ENTRY(ia64_pal_call_static)
+ mov ar.pfs = loc1
+ mov rp = loc0
+ ;;
+- srlz.d // serialize restoration of psr.l
++ srlz.i // serialize restoration of psr.l
++ ;;
+ br.ret.sptk.many b0
+ END(ia64_pal_call_static)
+
+6. Bug report:
+ If you found any issues when use kvm/ia64, Please post the bug info to kvm-ia64-devel mailing list.
+ https://lists.sourceforge.net/lists/listinfo/kvm-ia64-devel/
+
+Thanks for your interest! Let's work together, and make kvm/ia64 stronger and stronger!
+
+
+ Xiantao Zhang <xiantao.zhang@intel.com>
+ 2008.3.10
diff --git a/Documentation/ia64/mca.txt b/Documentation/ia64/mca.txt
new file mode 100644
index 0000000..f097c60
--- /dev/null
+++ b/Documentation/ia64/mca.txt
@@ -0,0 +1,194 @@
+An ad-hoc collection of notes on IA64 MCA and INIT processing. Feel
+free to update it with notes about any area that is not clear.
+
+---
+
+MCA/INIT are completely asynchronous. They can occur at any time, when
+the OS is in any state. Including when one of the cpus is already
+holding a spinlock. Trying to get any lock from MCA/INIT state is
+asking for deadlock. Also the state of structures that are protected
+by locks is indeterminate, including linked lists.
+
+---
+
+The complicated ia64 MCA process. All of this is mandated by Intel's
+specification for ia64 SAL, error recovery and unwind, it is not as
+if we have a choice here.
+
+* MCA occurs on one cpu, usually due to a double bit memory error.
+ This is the monarch cpu.
+
+* SAL sends an MCA rendezvous interrupt (which is a normal interrupt)
+ to all the other cpus, the slaves.
+
+* Slave cpus that receive the MCA interrupt call down into SAL, they
+ end up spinning disabled while the MCA is being serviced.
+
+* If any slave cpu was already spinning disabled when the MCA occurred
+ then it cannot service the MCA interrupt. SAL waits ~20 seconds then
+ sends an unmaskable INIT event to the slave cpus that have not
+ already rendezvoused.
+
+* Because MCA/INIT can be delivered at any time, including when the cpu
+ is down in PAL in physical mode, the registers at the time of the
+ event are _completely_ undefined. In particular the MCA/INIT
+ handlers cannot rely on the thread pointer, PAL physical mode can
+ (and does) modify TP. It is allowed to do that as long as it resets
+ TP on return. However MCA/INIT events expose us to these PAL
+ internal TP changes. Hence curr_task().
+
+* If an MCA/INIT event occurs while the kernel was running (not user
+ space) and the kernel has called PAL then the MCA/INIT handler cannot
+ assume that the kernel stack is in a fit state to be used. Mainly
+ because PAL may or may not maintain the stack pointer internally.
+ Because the MCA/INIT handlers cannot trust the kernel stack, they
+ have to use their own, per-cpu stacks. The MCA/INIT stacks are
+ preformatted with just enough task state to let the relevant handlers
+ do their job.
+
+* Unlike most other architectures, the ia64 struct task is embedded in
+ the kernel stack[1]. So switching to a new kernel stack means that
+ we switch to a new task as well. Because various bits of the kernel
+ assume that current points into the struct task, switching to a new
+ stack also means a new value for current.
+
+* Once all slaves have rendezvoused and are spinning disabled, the
+ monarch is entered. The monarch now tries to diagnose the problem
+ and decide if it can recover or not.
+
+* Part of the monarch's job is to look at the state of all the other
+ tasks. The only way to do that on ia64 is to call the unwinder,
+ as mandated by Intel.
+
+* The starting point for the unwind depends on whether a task is
+ running or not. That is, whether it is on a cpu or is blocked. The
+ monarch has to determine whether or not a task is on a cpu before it
+ knows how to start unwinding it. The tasks that received an MCA or
+ INIT event are no longer running, they have been converted to blocked
+ tasks. But (and its a big but), the cpus that received the MCA
+ rendezvous interrupt are still running on their normal kernel stacks!
+
+* To distinguish between these two cases, the monarch must know which
+ tasks are on a cpu and which are not. Hence each slave cpu that
+ switches to an MCA/INIT stack, registers its new stack using
+ set_curr_task(), so the monarch can tell that the _original_ task is
+ no longer running on that cpu. That gives us a decent chance of
+ getting a valid backtrace of the _original_ task.
+
+* MCA/INIT can be nested, to a depth of 2 on any cpu. In the case of a
+ nested error, we want diagnostics on the MCA/INIT handler that
+ failed, not on the task that was originally running. Again this
+ requires set_curr_task() so the MCA/INIT handlers can register their
+ own stack as running on that cpu. Then a recursive error gets a
+ trace of the failing handler's "task".
+
+[1] My (Keith Owens) original design called for ia64 to separate its
+ struct task and the kernel stacks. Then the MCA/INIT data would be
+ chained stacks like i386 interrupt stacks. But that required
+ radical surgery on the rest of ia64, plus extra hard wired TLB
+ entries with its associated performance degradation. David
+ Mosberger vetoed that approach. Which meant that separate kernel
+ stacks meant separate "tasks" for the MCA/INIT handlers.
+
+---
+
+INIT is less complicated than MCA. Pressing the nmi button or using
+the equivalent command on the management console sends INIT to all
+cpus. SAL picks one of the cpus as the monarch and the rest are
+slaves. All the OS INIT handlers are entered at approximately the same
+time. The OS monarch prints the state of all tasks and returns, after
+which the slaves return and the system resumes.
+
+At least that is what is supposed to happen. Alas there are broken
+versions of SAL out there. Some drive all the cpus as monarchs. Some
+drive them all as slaves. Some drive one cpu as monarch, wait for that
+cpu to return from the OS then drive the rest as slaves. Some versions
+of SAL cannot even cope with returning from the OS, they spin inside
+SAL on resume. The OS INIT code has workarounds for some of these
+broken SAL symptoms, but some simply cannot be fixed from the OS side.
+
+---
+
+The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer
+violations. Unfortunately MCA/INIT start off as massive layer
+violations (can occur at _any_ time) and they build from there.
+
+At least ia64 makes an attempt at recovering from hardware errors, but
+it is a difficult problem because of the asynchronous nature of these
+errors. When processing an unmaskable interrupt we sometimes need
+special code to cope with our inability to take any locks.
+
+---
+
+How is ia64 MCA/INIT different from x86 NMI?
+
+* x86 NMI typically gets delivered to one cpu. MCA/INIT gets sent to
+ all cpus.
+
+* x86 NMI cannot be nested. MCA/INIT can be nested, to a depth of 2
+ per cpu.
+
+* x86 has a separate struct task which points to one of multiple kernel
+ stacks. ia64 has the struct task embedded in the single kernel
+ stack, so switching stack means switching task.
+
+* x86 does not call the BIOS so the NMI handler does not have to worry
+ about any registers having changed. MCA/INIT can occur while the cpu
+ is in PAL in physical mode, with undefined registers and an undefined
+ kernel stack.
+
+* i386 backtrace is not very sensitive to whether a process is running
+ or not. ia64 unwind is very, very sensitive to whether a process is
+ running or not.
+
+---
+
+What happens when MCA/INIT is delivered what a cpu is running user
+space code?
+
+The user mode registers are stored in the RSE area of the MCA/INIT on
+entry to the OS and are restored from there on return to SAL, so user
+mode registers are preserved across a recoverable MCA/INIT. Since the
+OS has no idea what unwind data is available for the user space stack,
+MCA/INIT never tries to backtrace user space. Which means that the OS
+does not bother making the user space process look like a blocked task,
+i.e. the OS does not copy pt_regs and switch_stack to the user space
+stack. Also the OS has no idea how big the user space RSE and memory
+stacks are, which makes it too risky to copy the saved state to a user
+mode stack.
+
+---
+
+How do we get a backtrace on the tasks that were running when MCA/INIT
+was delivered?
+
+mca.c:::ia64_mca_modify_original_stack(). That identifies and
+verifies the original kernel stack, copies the dirty registers from
+the MCA/INIT stack's RSE to the original stack's RSE, copies the
+skeleton struct pt_regs and switch_stack to the original stack, fills
+in the skeleton structures from the PAL minstate area and updates the
+original stack's thread.ksp. That makes the original stack look
+exactly like any other blocked task, i.e. it now appears to be
+sleeping. To get a backtrace, just start with thread.ksp for the
+original task and unwind like any other sleeping task.
+
+---
+
+How do we identify the tasks that were running when MCA/INIT was
+delivered?
+
+If the previous task has been verified and converted to a blocked
+state, then sos->prev_task on the MCA/INIT stack is updated to point to
+the previous task. You can look at that field in dumps or debuggers.
+To help distinguish between the handler and the original tasks,
+handlers have _TIF_MCA_INIT set in thread_info.flags.
+
+The sos data is always in the MCA/INIT handler stack, at offset
+MCA_SOS_OFFSET. You can get that value from mca_asm.h or calculate it
+as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct
+ia64_sal_os_state), with 16 byte alignment for all structures.
+
+Also the comm field of the MCA/INIT task is modified to include the pid
+of the original task, for humans to use. For example, a comm field of
+'MCA 12159' means that pid 12159 was running when the MCA was
+delivered.
diff --git a/Documentation/ia64/paravirt_ops.txt b/Documentation/ia64/paravirt_ops.txt
new file mode 100644
index 0000000..39ded02
--- /dev/null
+++ b/Documentation/ia64/paravirt_ops.txt
@@ -0,0 +1,137 @@
+Paravirt_ops on IA64
+====================
+ 21 May 2008, Isaku Yamahata <yamahata@valinux.co.jp>
+
+
+Introduction
+------------
+The aim of this documentation is to help with maintainability and/or to
+encourage people to use paravirt_ops/IA64.
+
+paravirt_ops (pv_ops in short) is a way for virtualization support of
+Linux kernel on x86. Several ways for virtualization support were
+proposed, paravirt_ops is the winner.
+On the other hand, now there are also several IA64 virtualization
+technologies like kvm/IA64, xen/IA64 and many other academic IA64
+hypervisors so that it is good to add generic virtualization
+infrastructure on Linux/IA64.
+
+
+What is paravirt_ops?
+---------------------
+It has been developed on x86 as virtualization support via API, not ABI.
+It allows each hypervisor to override operations which are important for
+hypervisors at API level. And it allows a single kernel binary to run on
+all supported execution environments including native machine.
+Essentially paravirt_ops is a set of function pointers which represent
+operations corresponding to low level sensitive instructions and high
+level functionalities in various area. But one significant difference
+from usual function pointer table is that it allows optimization with
+binary patch. It is because some of these operations are very
+performance sensitive and indirect call overhead is not negligible.
+With binary patch, indirect C function call can be transformed into
+direct C function call or in-place execution to eliminate the overhead.
+
+Thus, operations of paravirt_ops are classified into three categories.
+- simple indirect call
+ These operations correspond to high level functionality so that the
+ overhead of indirect call isn't very important.
+
+- indirect call which allows optimization with binary patch
+ Usually these operations correspond to low level instructions. They
+ are called frequently and performance critical. So the overhead is
+ very important.
+
+- a set of macros for hand written assembly code
+ Hand written assembly codes (.S files) also need paravirtualization
+ because they include sensitive instructions or some of code paths in
+ them are very performance critical.
+
+
+The relation to the IA64 machine vector
+---------------------------------------
+Linux/IA64 has the IA64 machine vector functionality which allows the
+kernel to switch implementations (e.g. initialization, ipi, dma api...)
+depending on executing platform.
+We can replace some implementations very easily defining a new machine
+vector. Thus another approach for virtualization support would be
+enhancing the machine vector functionality.
+But paravirt_ops approach was taken because
+- virtualization support needs wider support than machine vector does.
+ e.g. low level instruction paravirtualization. It must be
+ initialized very early before platform detection.
+
+- virtualization support needs more functionality like binary patch.
+ Probably the calling overhead might not be very large compared to the
+ emulation overhead of virtualization. However in the native case, the
+ overhead should be eliminated completely.
+ A single kernel binary should run on each environment including native,
+ and the overhead of paravirt_ops on native environment should be as
+ small as possible.
+
+- for full virtualization technology, e.g. KVM/IA64 or
+ Xen/IA64 HVM domain, the result would be
+ (the emulated platform machine vector. probably dig) + (pv_ops).
+ This means that the virtualization support layer should be under
+ the machine vector layer.
+
+Possibly it might be better to move some function pointers from
+paravirt_ops to machine vector. In fact, Xen domU case utilizes both
+pv_ops and machine vector.
+
+
+IA64 paravirt_ops
+-----------------
+In this section, the concrete paravirt_ops will be discussed.
+Because of the architecture difference between ia64 and x86, the
+resulting set of functions is very different from x86 pv_ops.
+
+- C function pointer tables
+They are not very performance critical so that simple C indirect
+function call is acceptable. The following structures are defined at
+this moment. For details see linux/include/asm-ia64/paravirt.h
+ - struct pv_info
+ This structure describes the execution environment.
+ - struct pv_init_ops
+ This structure describes the various initialization hooks.
+ - struct pv_iosapic_ops
+ This structure describes hooks to iosapic operations.
+ - struct pv_irq_ops
+ This structure describes hooks to irq related operations
+ - struct pv_time_op
+ This structure describes hooks to steal time accounting.
+
+- a set of indirect calls which need optimization
+Currently this class of functions correspond to a subset of IA64
+intrinsics. At this moment the optimization with binary patch isn't
+implemented yet.
+struct pv_cpu_op is defined. For details see
+linux/include/asm-ia64/paravirt_privop.h
+Mostly they correspond to ia64 intrinsics 1-to-1.
+Caveat: Now they are defined as C indirect function pointers, but in
+order to support binary patch optimization, they will be changed
+using GCC extended inline assembly code.
+
+- a set of macros for hand written assembly code (.S files)
+For maintenance purpose, the taken approach for .S files is single
+source code and compile multiple times with different macros definitions.
+Each pv_ops instance must define those macros to compile.
+The important thing here is that sensitive, but non-privileged
+instructions must be paravirtualized and that some privileged
+instructions also need paravirtualization for reasonable performance.
+Developers who modify .S files must be aware of that. At this moment
+an easy checker is implemented to detect paravirtualization breakage.
+But it doesn't cover all the cases.
+
+Sometimes this set of macros is called pv_cpu_asm_op. But there is no
+corresponding structure in the source code.
+Those macros mostly 1:1 correspond to a subset of privileged
+instructions. See linux/include/asm-ia64/native/inst.h.
+And some functions written in assembly also need to be overrided so
+that each pv_ops instance have to define some macros. Again see
+linux/include/asm-ia64/native/inst.h.
+
+
+Those structures must be initialized very early before start_kernel.
+Probably initialized in head.S using multi entry point or some other trick.
+For native case implementation see linux/arch/ia64/kernel/paravirt.c.
diff --git a/Documentation/ia64/serial.txt b/Documentation/ia64/serial.txt
new file mode 100644
index 0000000..040b977
--- /dev/null
+++ b/Documentation/ia64/serial.txt
@@ -0,0 +1,151 @@
+SERIAL DEVICE NAMING
+
+ As of 2.6.10, serial devices on ia64 are named based on the
+ order of ACPI and PCI enumeration. The first device in the
+ ACPI namespace (if any) becomes /dev/ttyS0, the second becomes
+ /dev/ttyS1, etc., and PCI devices are named sequentially
+ starting after the ACPI devices.
+
+ Prior to 2.6.10, there were confusing exceptions to this:
+
+ - Firmware on some machines (mostly from HP) provides an HCDP
+ table[1] that tells the kernel about devices that can be used
+ as a serial console. If the user specified "console=ttyS0"
+ or the EFI ConOut path contained only UART devices, the
+ kernel registered the device described by the HCDP as
+ /dev/ttyS0.
+
+ - If there was no HCDP, we assumed there were UARTs at the
+ legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so
+ the kernel registered those as /dev/ttyS0 and /dev/ttyS1.
+
+ Any additional ACPI or PCI devices were registered sequentially
+ after /dev/ttyS0 as they were discovered.
+
+ With an HCDP, device names changed depending on EFI configuration
+ and "console=" arguments. Without an HCDP, device names didn't
+ change, but we registered devices that might not really exist.
+
+ For example, an HP rx1600 with a single built-in serial port
+ (described in the ACPI namespace) plus an MP[2] (a PCI device) has
+ these ports:
+
+ pre-2.6.10 pre-2.6.10
+ MMIO (EFI console (EFI console
+ address on builtin) on MP port) 2.6.10
+ ========== ========== ========== ======
+ builtin 0xff5e0000 ttyS0 ttyS1 ttyS0
+ MP UPS 0xf8031000 ttyS1 ttyS2 ttyS1
+ MP Console 0xf8030000 ttyS2 ttyS0 ttyS2
+ MP 2 0xf8030010 ttyS3 ttyS3 ttyS3
+ MP 3 0xf8030038 ttyS4 ttyS4 ttyS4
+
+CONSOLE SELECTION
+
+ EFI knows what your console devices are, but it doesn't tell the
+ kernel quite enough to actually locate them. The DIG64 HCDP
+ table[1] does tell the kernel where potential serial console
+ devices are, but not all firmware supplies it. Also, EFI supports
+ multiple simultaneous consoles and doesn't tell the kernel which
+ should be the "primary" one.
+
+ So how do you tell Linux which console device to use?
+
+ - If your firmware supplies the HCDP, it is simplest to
+ configure EFI with a single device (either a UART or a VGA
+ card) as the console. Then you don't need to tell Linux
+ anything; the kernel will automatically use the EFI console.
+
+ (This works only in 2.6.6 or later; prior to that you had
+ to specify "console=ttyS0" to get a serial console.)
+
+ - Without an HCDP, Linux defaults to a VGA console unless you
+ specify a "console=" argument.
+
+ NOTE: Don't assume that a serial console device will be /dev/ttyS0.
+ It might be ttyS1, ttyS2, etc. Make sure you have the appropriate
+ entries in /etc/inittab (for getty) and /etc/securetty (to allow
+ root login).
+
+EARLY SERIAL CONSOLE
+
+ The kernel can't start using a serial console until it knows where
+ the device lives. Normally this happens when the driver enumerates
+ all the serial devices, which can happen a minute or more after the
+ kernel starts booting.
+
+ 2.6.10 and later kernels have an "early uart" driver that works
+ very early in the boot process. The kernel will automatically use
+ this if the user supplies an argument like "console=uart,io,0x3f8",
+ or if the EFI console path contains only a UART device and the
+ firmware supplies an HCDP.
+
+TROUBLESHOOTING SERIAL CONSOLE PROBLEMS
+
+ No kernel output after elilo prints "Uncompressing Linux... done":
+
+ - You specified "console=ttyS0" but Linux changed the device
+ to which ttyS0 refers. Configure exactly one EFI console
+ device[3] and remove the "console=" option.
+
+ - The EFI console path contains both a VGA device and a UART.
+ EFI and elilo use both, but Linux defaults to VGA. Remove
+ the VGA device from the EFI console path[3].
+
+ - Multiple UARTs selected as EFI console devices. EFI and
+ elilo use all selected devices, but Linux uses only one.
+ Make sure only one UART is selected in the EFI console
+ path[3].
+
+ - You're connected to an HP MP port[2] but have a non-MP UART
+ selected as EFI console device. EFI uses the MP as a
+ console device even when it isn't explicitly selected.
+ Either move the console cable to the non-MP UART, or change
+ the EFI console path[3] to the MP UART.
+
+ Long pause (60+ seconds) between "Uncompressing Linux... done" and
+ start of kernel output:
+
+ - No early console because you used "console=ttyS<n>". Remove
+ the "console=" option if your firmware supplies an HCDP.
+
+ - If you don't have an HCDP, the kernel doesn't know where
+ your console lives until the driver discovers serial
+ devices. Use "console=uart, io,0x3f8" (or appropriate
+ address for your machine).
+
+ Kernel and init script output works fine, but no "login:" prompt:
+
+ - Add getty entry to /etc/inittab for console tty. Look for
+ the "Adding console on ttyS<n>" message that tells you which
+ device is the console.
+
+ "login:" prompt, but can't login as root:
+
+ - Add entry to /etc/securetty for console tty.
+
+ No ACPI serial devices found in 2.6.17 or later:
+
+ - Turn on CONFIG_PNP and CONFIG_PNPACPI. Prior to 2.6.17, ACPI
+ serial devices were discovered by 8250_acpi. In 2.6.17,
+ 8250_acpi was replaced by the combination of 8250_pnp and
+ CONFIG_PNPACPI.
+
+
+
+[1] http://www.dig64.org/specifications/DIG64_PCDPv20.pdf
+ The table was originally defined as the "HCDP" for "Headless
+ Console/Debug Port." The current version is the "PCDP" for
+ "Primary Console and Debug Port Devices."
+
+[2] The HP MP (management processor) is a PCI device that provides
+ several UARTs. One of the UARTs is often used as a console; the
+ EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart".
+ The external connection is usually a 25-pin connector, and a
+ special dongle converts that to three 9-pin connectors, one of
+ which is labelled "Console."
+
+[3] EFI console devices are configured using the EFI Boot Manager
+ "Boot option maintenance" menu. You may have to interrupt the
+ boot sequence to use this menu, and you will have to reset the
+ box after changing console configuration.
diff --git a/Documentation/ia64/xen.txt b/Documentation/ia64/xen.txt
new file mode 100644
index 0000000..c61a99f
--- /dev/null
+++ b/Documentation/ia64/xen.txt
@@ -0,0 +1,183 @@
+ Recipe for getting/building/running Xen/ia64 with pv_ops
+ --------------------------------------------------------
+
+This recipe describes how to get xen-ia64 source and build it,
+and run domU with pv_ops.
+
+============
+Requirements
+============
+
+ - python
+ - mercurial
+ it (aka "hg") is an open-source source code
+ management software. See the below.
+ http://www.selenic.com/mercurial/wiki/
+ - git
+ - bridge-utils
+
+=================================
+Getting and Building Xen and Dom0
+=================================
+
+ My environment is;
+ Machine : Tiger4
+ Domain0 OS : RHEL5
+ DomainU OS : RHEL5
+
+ 1. Download source
+ # hg clone http://xenbits.xensource.com/ext/ia64/xen-unstable.hg
+ # cd xen-unstable.hg
+ # hg clone http://xenbits.xensource.com/ext/ia64/linux-2.6.18-xen.hg
+
+ 2. # make world
+
+ 3. # make install-tools
+
+ 4. copy kernels and xen
+ # cp xen/xen.gz /boot/efi/efi/redhat/
+ # cp build-linux-2.6.18-xen_ia64/vmlinux.gz \
+ /boot/efi/efi/redhat/vmlinuz-2.6.18.8-xen
+
+ 5. make initrd for Dom0/DomU
+ # make -C linux-2.6.18-xen.hg ARCH=ia64 modules_install \
+ O=$(/bin/pwd)/build-linux-2.6.18-xen_ia64
+ # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6.18.8-xen.img \
+ 2.6.18.8-xen --builtin mptspi --builtin mptbase \
+ --builtin mptscsih --builtin uhci-hcd --builtin ohci-hcd \
+ --builtin ehci-hcd
+
+================================
+Making a disk image for guest OS
+================================
+
+ 1. make file
+ # dd if=/dev/zero of=/root/rhel5.img bs=1M seek=4096 count=0
+ # mke2fs -F -j /root/rhel5.img
+ # mount -o loop /root/rhel5.img /mnt
+ # cp -ax /{dev,var,etc,usr,bin,sbin,lib} /mnt
+ # mkdir /mnt/{root,proc,sys,home,tmp}
+
+ Note: You may miss some device files. If so, please create them
+ with mknod. Or you can use tar instead of cp.
+
+ 2. modify DomU's fstab
+ # vi /mnt/etc/fstab
+ /dev/xvda1 / ext3 defaults 1 1
+ none /dev/pts devpts gid=5,mode=620 0 0
+ none /dev/shm tmpfs defaults 0 0
+ none /proc proc defaults 0 0
+ none /sys sysfs defaults 0 0
+
+ 3. modify inittab
+ set runlevel to 3 to avoid X trying to start
+ # vi /mnt/etc/inittab
+ id:3:initdefault:
+ Start a getty on the hvc0 console
+ X0:2345:respawn:/sbin/mingetty hvc0
+ tty1-6 mingetty can be commented out
+
+ 4. add hvc0 into /etc/securetty
+ # vi /mnt/etc/securetty (add hvc0)
+
+ 5. umount
+ # umount /mnt
+
+FYI, virt-manager can also make a disk image for guest OS.
+It's GUI tools and easy to make it.
+
+==================
+Boot Xen & Domain0
+==================
+
+ 1. replace elilo
+ elilo of RHEL5 can boot Xen and Dom0.
+ If you use old elilo (e.g RHEL4), please download from the below
+ http://elilo.sourceforge.net/cgi-bin/blosxom
+ and copy into /boot/efi/efi/redhat/
+ # cp elilo-3.6-ia64.efi /boot/efi/efi/redhat/elilo.efi
+
+ 2. modify elilo.conf (like the below)
+ # vi /boot/efi/efi/redhat/elilo.conf
+ prompt
+ timeout=20
+ default=xen
+ relocatable
+
+ image=vmlinuz-2.6.18.8-xen
+ label=xen
+ vmm=xen.gz
+ initrd=initrd-2.6.18.8-xen.img
+ read-only
+ append=" -- rhgb root=/dev/sda2"
+
+The append options before "--" are for xen hypervisor,
+the options after "--" are for dom0.
+
+FYI, your machine may need console options like
+"com1=19200,8n1 console=vga,com1". For example,
+append="com1=19200,8n1 console=vga,com1 -- rhgb console=tty0 \
+console=ttyS0 root=/dev/sda2"
+
+=====================================
+Getting and Building domU with pv_ops
+=====================================
+
+ 1. get pv_ops tree
+ # git clone http://people.valinux.co.jp/~yamahata/xen-ia64/linux-2.6-xen-ia64.git/
+
+ 2. git branch (if necessary)
+ # cd linux-2.6-xen-ia64/
+ # git checkout -b your_branch origin/xen-ia64-domu-minimal-2008may19
+ (Note: The current branch is xen-ia64-domu-minimal-2008may19.
+ But you would find the new branch. You can see with
+ "git branch -r" to get the branch lists.
+ http://people.valinux.co.jp/~yamahata/xen-ia64/for_eagl/linux-2.6-ia64-pv-ops.git/
+ is also available. The tree is based on
+ git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6 test)
+
+
+ 3. copy .config for pv_ops of domU
+ # cp arch/ia64/configs/xen_domu_wip_defconfig .config
+
+ 4. make kernel with pv_ops
+ # make oldconfig
+ # make
+
+ 5. install the kernel and initrd
+ # cp vmlinux.gz /boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU
+ # make modules_install
+ # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img \
+ 2.6.26-rc3xen-ia64-08941-g1b12161 --builtin mptspi \
+ --builtin mptbase --builtin mptscsih --builtin uhci-hcd \
+ --builtin ohci-hcd --builtin ehci-hcd
+
+========================
+Boot DomainU with pv_ops
+========================
+
+ 1. make config of DomU
+ # vi /etc/xen/rhel5
+ kernel = "/boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU"
+ ramdisk = "/boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img"
+ vcpus = 1
+ memory = 512
+ name = "rhel5"
+ disk = [ 'file:/root/rhel5.img,xvda1,w' ]
+ root = "/dev/xvda1 ro"
+ extra= "rhgb console=hvc0"
+
+ 2. After boot xen and dom0, start xend
+ # /etc/init.d/xend start
+ ( In the debugging case, # XEND_DEBUG=1 xend trace_start )
+
+ 3. start domU
+ # xm create -c rhel5
+
+=========
+Reference
+=========
+- Wiki of Xen/IA64 upstream merge
+ http://wiki.xensource.com/xenwiki/XenIA64/UpstreamMerge
+
+Written by Akio Takebe <takebe_akio@jp.fujitsu.com> on 28 May 2008
diff --git a/Documentation/ics932s401 b/Documentation/ics932s401
new file mode 100644
index 0000000..07a739f
--- /dev/null
+++ b/Documentation/ics932s401
@@ -0,0 +1,31 @@
+Kernel driver ics932s401
+======================
+
+Supported chips:
+ * IDT ICS932S401
+ Prefix: 'ics932s401'
+ Addresses scanned: I2C 0x69
+ Datasheet: Publically available at the IDT website
+
+Author: Darrick J. Wong
+
+Description
+-----------
+
+This driver implements support for the IDT ICS932S401 chip family.
+
+This chip has 4 clock outputs--a base clock for the CPU (which is likely
+multiplied to get the real CPU clock), a system clock, a PCI clock, a USB
+clock, and a reference clock. The driver reports selected and actual
+frequency. If spread spectrum mode is enabled, the driver also reports by what
+percent the clock signal is being spread, which should be between 0 and -0.5%.
+All frequencies are reported in KHz.
+
+The ICS932S401 monitors all inputs continuously. The driver will not read
+the registers more often than once every other second.
+
+Special Features
+----------------
+
+The clocks could be reprogrammed to increase system speed. I will not help you
+do this, as you risk damaging your system!
diff --git a/Documentation/ide/00-INDEX b/Documentation/ide/00-INDEX
new file mode 100644
index 0000000..d6b7788
--- /dev/null
+++ b/Documentation/ide/00-INDEX
@@ -0,0 +1,12 @@
+00-INDEX
+ - this file
+ChangeLog.ide-cd.1994-2004
+ - ide-cd changelog
+ChangeLog.ide-floppy.1996-2002
+ - ide-floppy changelog
+ChangeLog.ide-tape.1995-2002
+ - ide-tape changelog
+ide-tape.txt
+ - info on the IDE ATAPI streaming tape driver
+ide.txt
+ - important info for users of ATA devices (IDE/EIDE disks and CD-ROMS).
diff --git a/Documentation/ide/ChangeLog.ide-cd.1994-2004 b/Documentation/ide/ChangeLog.ide-cd.1994-2004
new file mode 100644
index 0000000..190d17b
--- /dev/null
+++ b/Documentation/ide/ChangeLog.ide-cd.1994-2004
@@ -0,0 +1,268 @@
+/*
+ * 1.00 Oct 31, 1994 -- Initial version.
+ * 1.01 Nov 2, 1994 -- Fixed problem with starting request in
+ * cdrom_check_status.
+ * 1.03 Nov 25, 1994 -- leaving unmask_intr[] as a user-setting (as for disks)
+ * (from mlord) -- minor changes to cdrom_setup()
+ * -- renamed ide_dev_s to ide_drive_t, enable irq on command
+ * 2.00 Nov 27, 1994 -- Generalize packet command interface;
+ * add audio ioctls.
+ * 2.01 Dec 3, 1994 -- Rework packet command interface to handle devices
+ * which send an interrupt when ready for a command.
+ * 2.02 Dec 11, 1994 -- Cache the TOC in the driver.
+ * Don't use SCMD_PLAYAUDIO_TI; it's not included
+ * in the current version of ATAPI.
+ * Try to use LBA instead of track or MSF addressing
+ * when possible.
+ * Don't wait for READY_STAT.
+ * 2.03 Jan 10, 1995 -- Rewrite block read routines to handle block sizes
+ * other than 2k and to move multiple sectors in a
+ * single transaction.
+ * 2.04 Apr 21, 1995 -- Add work-around for Creative Labs CD220E drives.
+ * Thanks to Nick Saw <cwsaw@pts7.pts.mot.com> for
+ * help in figuring this out. Ditto for Acer and
+ * Aztech drives, which seem to have the same problem.
+ * 2.04b May 30, 1995 -- Fix to match changes in ide.c version 3.16 -ml
+ * 2.05 Jun 8, 1995 -- Don't attempt to retry after an illegal request
+ * or data protect error.
+ * Use HWIF and DEV_HWIF macros as in ide.c.
+ * Always try to do a request_sense after
+ * a failed command.
+ * Include an option to give textual descriptions
+ * of ATAPI errors.
+ * Fix a bug in handling the sector cache which
+ * showed up if the drive returned data in 512 byte
+ * blocks (like Pioneer drives). Thanks to
+ * Richard Hirst <srh@gpt.co.uk> for diagnosing this.
+ * Properly supply the page number field in the
+ * MODE_SELECT command.
+ * PLAYAUDIO12 is broken on the Aztech; work around it.
+ * 2.05x Aug 11, 1995 -- lots of data structure renaming/restructuring in ide.c
+ * (my apologies to Scott, but now ide-cd.c is independent)
+ * 3.00 Aug 22, 1995 -- Implement CDROMMULTISESSION ioctl.
+ * Implement CDROMREADAUDIO ioctl (UNTESTED).
+ * Use input_ide_data() and output_ide_data().
+ * Add door locking.
+ * Fix usage count leak in cdrom_open, which happened
+ * when a read-write mount was attempted.
+ * Try to load the disk on open.
+ * Implement CDROMEJECT_SW ioctl (off by default).
+ * Read total cdrom capacity during open.
+ * Rearrange logic in cdrom_decode_status. Issue
+ * request sense commands for failed packet commands
+ * from here instead of from cdrom_queue_packet_command.
+ * Fix a race condition in retrieving error information.
+ * Suppress printing normal unit attention errors and
+ * some drive not ready errors.
+ * Implement CDROMVOLREAD ioctl.
+ * Implement CDROMREADMODE1/2 ioctls.
+ * Fix race condition in setting up interrupt handlers
+ * when the `serialize' option is used.
+ * 3.01 Sep 2, 1995 -- Fix ordering of reenabling interrupts in
+ * cdrom_queue_request.
+ * Another try at using ide_[input,output]_data.
+ * 3.02 Sep 16, 1995 -- Stick total disk capacity in partition table as well.
+ * Make VERBOSE_IDE_CD_ERRORS dump failed command again.
+ * Dump out more information for ILLEGAL REQUEST errs.
+ * Fix handling of errors occurring before the
+ * packet command is transferred.
+ * Fix transfers with odd bytelengths.
+ * 3.03 Oct 27, 1995 -- Some Creative drives have an id of just `CD'.
+ * `DCI-2S10' drives are broken too.
+ * 3.04 Nov 20, 1995 -- So are Vertos drives.
+ * 3.05 Dec 1, 1995 -- Changes to go with overhaul of ide.c and ide-tape.c
+ * 3.06 Dec 16, 1995 -- Add support needed for partitions.
+ * More workarounds for Vertos bugs (based on patches
+ * from Holger Dietze <dietze@aix520.informatik.uni-leipzig.de>).
+ * Try to eliminate byteorder assumptions.
+ * Use atapi_cdrom_subchnl struct definition.
+ * Add STANDARD_ATAPI compilation option.
+ * 3.07 Jan 29, 1996 -- More twiddling for broken drives: Sony 55D,
+ * Vertos 300.
+ * Add NO_DOOR_LOCKING configuration option.
+ * Handle drive_cmd requests w/NULL args (for hdparm -t).
+ * Work around sporadic Sony55e audio play problem.
+ * 3.07a Feb 11, 1996 -- check drive->id for NULL before dereferencing, to fix
+ * problem with "hde=cdrom" with no drive present. -ml
+ * 3.08 Mar 6, 1996 -- More Vertos workarounds.
+ * 3.09 Apr 5, 1996 -- Add CDROMCLOSETRAY ioctl.
+ * Switch to using MSF addressing for audio commands.
+ * Reformat to match kernel tabbing style.
+ * Add CDROM_GET_UPC ioctl.
+ * 3.10 Apr 10, 1996 -- Fix compilation error with STANDARD_ATAPI.
+ * 3.11 Apr 29, 1996 -- Patch from Heiko Eißfeldt <heiko@colossus.escape.de>
+ * to remove redundant verify_area calls.
+ * 3.12 May 7, 1996 -- Rudimentary changer support. Based on patches
+ * from Gerhard Zuber <zuber@berlin.snafu.de>.
+ * Let open succeed even if there's no loaded disc.
+ * 3.13 May 19, 1996 -- Fixes for changer code.
+ * 3.14 May 29, 1996 -- Add work-around for Vertos 600.
+ * (From Hennus Bergman <hennus@sky.ow.nl>.)
+ * 3.15 July 2, 1996 -- Added support for Sanyo 3 CD changers
+ * from Ben Galliart <bgallia@luc.edu> with
+ * special help from Jeff Lightfoot
+ * <jeffml@pobox.com>
+ * 3.15a July 9, 1996 -- Improved Sanyo 3 CD changer identification
+ * 3.16 Jul 28, 1996 -- Fix from Gadi to reduce kernel stack usage for ioctl.
+ * 3.17 Sep 17, 1996 -- Tweak audio reads for some drives.
+ * Start changing CDROMLOADFROMSLOT to CDROM_SELECT_DISC.
+ * 3.18 Oct 31, 1996 -- Added module and DMA support.
+ *
+ * 4.00 Nov 5, 1996 -- New ide-cd maintainer,
+ * Erik B. Andersen <andersee@debian.org>
+ * -- Newer Creative drives don't always set the error
+ * register correctly. Make sure we see media changes
+ * regardless.
+ * -- Integrate with generic cdrom driver.
+ * -- CDROMGETSPINDOWN and CDROMSETSPINDOWN ioctls, based on
+ * a patch from Ciro Cattuto <>.
+ * -- Call set_device_ro.
+ * -- Implement CDROMMECHANISMSTATUS and CDROMSLOTTABLE
+ * ioctls, based on patch by Erik Andersen
+ * -- Add some probes of drive capability during setup.
+ *
+ * 4.01 Nov 11, 1996 -- Split into ide-cd.c and ide-cd.h
+ * -- Removed CDROMMECHANISMSTATUS and CDROMSLOTTABLE
+ * ioctls in favor of a generalized approach
+ * using the generic cdrom driver.
+ * -- Fully integrated with the 2.1.X kernel.
+ * -- Other stuff that I forgot (lots of changes)
+ *
+ * 4.02 Dec 01, 1996 -- Applied patch from Gadi Oxman <gadio@netvision.net.il>
+ * to fix the drive door locking problems.
+ *
+ * 4.03 Dec 04, 1996 -- Added DSC overlap support.
+ * 4.04 Dec 29, 1996 -- Added CDROMREADRAW ioclt based on patch
+ * by Ales Makarov (xmakarov@sun.felk.cvut.cz)
+ *
+ * 4.05 Nov 20, 1997 -- Modified to print more drive info on init
+ * Minor other changes
+ * Fix errors on CDROMSTOP (If you have a "Dolphin",
+ * you must define IHAVEADOLPHIN)
+ * Added identifier so new Sanyo CD-changer works
+ * Better detection if door locking isn't supported
+ *
+ * 4.06 Dec 17, 1997 -- fixed endless "tray open" messages -ml
+ * 4.07 Dec 17, 1997 -- fallback to set pc->stat on "tray open"
+ * 4.08 Dec 18, 1997 -- spew less noise when tray is empty
+ * -- fix speed display for ACER 24X, 18X
+ * 4.09 Jan 04, 1998 -- fix handling of the last block so we return
+ * an end of file instead of an I/O error (Gadi)
+ * 4.10 Jan 24, 1998 -- fixed a bug so now changers can change to a new
+ * slot when there is no disc in the current slot.
+ * -- Fixed a memory leak where info->changer_info was
+ * malloc'ed but never free'd when closing the device.
+ * -- Cleaned up the global namespace a bit by making more
+ * functions static that should already have been.
+ * 4.11 Mar 12, 1998 -- Added support for the CDROM_SELECT_SPEED ioctl
+ * based on a patch for 2.0.33 by Jelle Foks
+ * <jelle@scintilla.utwente.nl>, a patch for 2.0.33
+ * by Toni Giorgino <toni@pcape2.pi.infn.it>, the SCSI
+ * version, and my own efforts. -erik
+ * -- Fixed a stupid bug which egcs was kind enough to
+ * inform me of where "Illegal mode for this track"
+ * was never returned due to a comparison on data
+ * types of limited range.
+ * 4.12 Mar 29, 1998 -- Fixed bug in CDROM_SELECT_SPEED so write speed is
+ * now set ionly for CD-R and CD-RW drives. I had
+ * removed this support because it produced errors.
+ * It produced errors _only_ for non-writers. duh.
+ * 4.13 May 05, 1998 -- Suppress useless "in progress of becoming ready"
+ * messages, since this is not an error.
+ * -- Change error messages to be const
+ * -- Remove a "\t" which looks ugly in the syslogs
+ * 4.14 July 17, 1998 -- Change to pointing to .ps version of ATAPI spec
+ * since the .pdf version doesn't seem to work...
+ * -- Updated the TODO list to something more current.
+ *
+ * 4.15 Aug 25, 1998 -- Updated ide-cd.h to respect mechine endianess,
+ * patch thanks to "Eddie C. Dost" <ecd@skynet.be>
+ *
+ * 4.50 Oct 19, 1998 -- New maintainers!
+ * Jens Axboe <axboe@image.dk>
+ * Chris Zwilling <chris@cloudnet.com>
+ *
+ * 4.51 Dec 23, 1998 -- Jens Axboe <axboe@image.dk>
+ * - ide_cdrom_reset enabled since the ide subsystem
+ * handles resets fine now. <axboe@image.dk>
+ * - Transfer size fix for Samsung CD-ROMs, thanks to
+ * "Ville Hallik" <ville.hallik@mail.ee>.
+ * - other minor stuff.
+ *
+ * 4.52 Jan 19, 1999 -- Jens Axboe <axboe@image.dk>
+ * - Detect DVD-ROM/RAM drives
+ *
+ * 4.53 Feb 22, 1999 - Include other model Samsung and one Goldstar
+ * drive in transfer size limit.
+ * - Fix the I/O error when doing eject without a medium
+ * loaded on some drives.
+ * - CDROMREADMODE2 is now implemented through
+ * CDROMREADRAW, since many drives don't support
+ * MODE2 (even though ATAPI 2.6 says they must).
+ * - Added ignore parameter to ide-cd (as a module), eg
+ * insmod ide-cd ignore='hda hdb'
+ * Useful when using ide-cd in conjunction with
+ * ide-scsi. TODO: non-modular way of doing the
+ * same.
+ *
+ * 4.54 Aug 5, 1999 - Support for MMC2 class commands through the generic
+ * packet interface to cdrom.c.
+ * - Unified audio ioctl support, most of it.
+ * - cleaned up various deprecated verify_area().
+ * - Added ide_cdrom_packet() as the interface for
+ * the Uniform generic_packet().
+ * - bunch of other stuff, will fill in logs later.
+ * - report 1 slot for non-changers, like the other
+ * cd-rom drivers. don't report select disc for
+ * non-changers as well.
+ * - mask out audio playing, if the device can't do it.
+ *
+ * 4.55 Sep 1, 1999 - Eliminated the rest of the audio ioctls, except
+ * for CDROMREADTOC[ENTRY|HEADER]. Some of the drivers
+ * use this independently of the actual audio handling.
+ * They will disappear later when I get the time to
+ * do it cleanly.
+ * - Minimize the TOC reading - only do it when we
+ * know a media change has occurred.
+ * - Moved all the CDROMREADx ioctls to the Uniform layer.
+ * - Heiko Eißfeldt <heiko@colossus.escape.de> supplied
+ * some fixes for CDI.
+ * - CD-ROM leaving door locked fix from Andries
+ * Brouwer <Andries.Brouwer@cwi.nl>
+ * - Erik Andersen <andersen@xmission.com> unified
+ * commands across the various drivers and how
+ * sense errors are handled.
+ *
+ * 4.56 Sep 12, 1999 - Removed changer support - it is now in the
+ * Uniform layer.
+ * - Added partition based multisession handling.
+ * - Mode sense and mode select moved to the
+ * Uniform layer.
+ * - Fixed a problem with WPI CDS-32X drive - it
+ * failed the capabilities
+ *
+ * 4.57 Apr 7, 2000 - Fixed sense reporting.
+ * - Fixed possible oops in ide_cdrom_get_last_session()
+ * - Fix locking mania and make ide_cdrom_reset relock
+ * - Stop spewing errors to log when magicdev polls with
+ * TEST_UNIT_READY on some drives.
+ * - Various fixes from Tobias Ringstrom:
+ * tray if it was locked prior to the reset.
+ * - cdrom_read_capacity returns one frame too little.
+ * - Fix real capacity reporting.
+ *
+ * 4.58 May 1, 2000 - Clean up ACER50 stuff.
+ * - Fix small problem with ide_cdrom_capacity
+ *
+ * 4.59 Aug 11, 2000 - Fix changer problem in cdrom_read_toc, we weren't
+ * correctly sensing a disc change.
+ * - Rearranged some code
+ * - Use extended sense on drives that support it for
+ * correctly reporting tray status -- from
+ * Michael D Johnson <johnsom@orst.edu>
+ * 4.60 Dec 17, 2003 - Add mt rainier support
+ * - Bump timeout for packet commands, matches sr
+ * - Odd stuff
+ * 4.61 Jan 22, 2004 - support hardware sector sizes other than 2kB,
+ * Pascal Schmidt <der.eremit@email.de>
+ */
diff --git a/Documentation/ide/ChangeLog.ide-floppy.1996-2002 b/Documentation/ide/ChangeLog.ide-floppy.1996-2002
new file mode 100644
index 0000000..46c19ef
--- /dev/null
+++ b/Documentation/ide/ChangeLog.ide-floppy.1996-2002
@@ -0,0 +1,63 @@
+/*
+ * Many thanks to Lode Leroy <Lode.Leroy@www.ibase.be>, who tested so many
+ * ALPHA patches to this driver on an EASYSTOR LS-120 ATAPI floppy drive.
+ *
+ * Ver 0.1 Oct 17 96 Initial test version, mostly based on ide-tape.c.
+ * Ver 0.2 Oct 31 96 Minor changes.
+ * Ver 0.3 Dec 2 96 Fixed error recovery bug.
+ * Ver 0.4 Jan 26 97 Add support for the HDIO_GETGEO ioctl.
+ * Ver 0.5 Feb 21 97 Add partitions support.
+ * Use the minimum of the LBA and CHS capacities.
+ * Avoid hwgroup->rq == NULL on the last irq.
+ * Fix potential null dereferencing with DEBUG_LOG.
+ * Ver 0.8 Dec 7 97 Increase irq timeout from 10 to 50 seconds.
+ * Add media write-protect detection.
+ * Issue START command only if TEST UNIT READY fails.
+ * Add work-around for IOMEGA ZIP revision 21.D.
+ * Remove idefloppy_get_capabilities().
+ * Ver 0.9 Jul 4 99 Fix a bug which might have caused the number of
+ * bytes requested on each interrupt to be zero.
+ * Thanks to <shanos@es.co.nz> for pointing this out.
+ * Ver 0.9.sv Jan 6 01 Sam Varshavchik <mrsam@courier-mta.com>
+ * Implement low level formatting. Reimplemented
+ * IDEFLOPPY_CAPABILITIES_PAGE, since we need the srfp
+ * bit. My LS-120 drive barfs on
+ * IDEFLOPPY_CAPABILITIES_PAGE, but maybe it's just me.
+ * Compromise by not reporting a failure to get this
+ * mode page. Implemented four IOCTLs in order to
+ * implement formatting. IOCTls begin with 0x4600,
+ * 0x46 is 'F' as in Format.
+ * Jan 9 01 Userland option to select format verify.
+ * Added PC_SUPPRESS_ERROR flag - some idefloppy drives
+ * do not implement IDEFLOPPY_CAPABILITIES_PAGE, and
+ * return a sense error. Suppress error reporting in
+ * this particular case in order to avoid spurious
+ * errors in syslog. The culprit is
+ * idefloppy_get_capability_page(), so move it to
+ * idefloppy_begin_format() so that it's not used
+ * unless absolutely necessary.
+ * If drive does not support format progress indication
+ * monitor the dsc bit in the status register.
+ * Also, O_NDELAY on open will allow the device to be
+ * opened without a disk available. This can be used to
+ * open an unformatted disk, or get the device capacity.
+ * Ver 0.91 Dec 11 99 Added IOMEGA Clik! drive support by
+ * <paul@paulbristow.net>
+ * Ver 0.92 Oct 22 00 Paul Bristow became official maintainer for this
+ * driver. Included Powerbook internal zip kludge.
+ * Ver 0.93 Oct 24 00 Fixed bugs for Clik! drive
+ * no disk on insert and disk change now works
+ * Ver 0.94 Oct 27 00 Tidied up to remove strstr(Clik) everywhere
+ * Ver 0.95 Nov 7 00 Brought across to kernel 2.4
+ * Ver 0.96 Jan 7 01 Actually in line with release version of 2.4.0
+ * including set_bit patch from Rusty Russell
+ * Ver 0.97 Jul 22 01 Merge 0.91-0.96 onto 0.9.sv for ac series
+ * Ver 0.97.sv Aug 3 01 Backported from 2.4.7-ac3
+ * Ver 0.98 Oct 26 01 Split idefloppy_transfer_pc into two pieces to
+ * fix a lost interrupt problem. It appears the busy
+ * bit was being deasserted by my IOMEGA ATAPI ZIP 100
+ * drive before the drive was actually ready.
+ * Ver 0.98a Oct 29 01 Expose delay value so we can play.
+ * Ver 0.99 Feb 24 02 Remove duplicate code, modify clik! detection code
+ * to support new PocketZip drives
+ */
diff --git a/Documentation/ide/ChangeLog.ide-tape.1995-2002 b/Documentation/ide/ChangeLog.ide-tape.1995-2002
new file mode 100644
index 0000000..877fac8
--- /dev/null
+++ b/Documentation/ide/ChangeLog.ide-tape.1995-2002
@@ -0,0 +1,257 @@
+/*
+ * Ver 0.1 Nov 1 95 Pre-working code :-)
+ * Ver 0.2 Nov 23 95 A short backup (few megabytes) and restore procedure
+ * was successful ! (Using tar cvf ... on the block
+ * device interface).
+ * A longer backup resulted in major swapping, bad
+ * overall Linux performance and eventually failed as
+ * we received non serial read-ahead requests from the
+ * buffer cache.
+ * Ver 0.3 Nov 28 95 Long backups are now possible, thanks to the
+ * character device interface. Linux's responsiveness
+ * and performance doesn't seem to be much affected
+ * from the background backup procedure.
+ * Some general mtio.h magnetic tape operations are
+ * now supported by our character device. As a result,
+ * popular tape utilities are starting to work with
+ * ide tapes :-)
+ * The following configurations were tested:
+ * 1. An IDE ATAPI TAPE shares the same interface
+ * and irq with an IDE ATAPI CDROM.
+ * 2. An IDE ATAPI TAPE shares the same interface
+ * and irq with a normal IDE disk.
+ * Both configurations seemed to work just fine !
+ * However, to be on the safe side, it is meanwhile
+ * recommended to give the IDE TAPE its own interface
+ * and irq.
+ * The one thing which needs to be done here is to
+ * add a "request postpone" feature to ide.c,
+ * so that we won't have to wait for the tape to finish
+ * performing a long media access (DSC) request (such
+ * as a rewind) before we can access the other device
+ * on the same interface. This effect doesn't disturb
+ * normal operation most of the time because read/write
+ * requests are relatively fast, and once we are
+ * performing one tape r/w request, a lot of requests
+ * from the other device can be queued and ide.c will
+ * service all of them after this single tape request.
+ * Ver 1.0 Dec 11 95 Integrated into Linux 1.3.46 development tree.
+ * On each read / write request, we now ask the drive
+ * if we can transfer a constant number of bytes
+ * (a parameter of the drive) only to its buffers,
+ * without causing actual media access. If we can't,
+ * we just wait until we can by polling the DSC bit.
+ * This ensures that while we are not transferring
+ * more bytes than the constant referred to above, the
+ * interrupt latency will not become too high and
+ * we won't cause an interrupt timeout, as happened
+ * occasionally in the previous version.
+ * While polling for DSC, the current request is
+ * postponed and ide.c is free to handle requests from
+ * the other device. This is handled transparently to
+ * ide.c. The hwgroup locking method which was used
+ * in the previous version was removed.
+ * Use of new general features which are provided by
+ * ide.c for use with atapi devices.
+ * (Programming done by Mark Lord)
+ * Few potential bug fixes (Again, suggested by Mark)
+ * Single character device data transfers are now
+ * not limited in size, as they were before.
+ * We are asking the tape about its recommended
+ * transfer unit and send a larger data transfer
+ * as several transfers of the above size.
+ * For best results, use an integral number of this
+ * basic unit (which is shown during driver
+ * initialization). I will soon add an ioctl to get
+ * this important parameter.
+ * Our data transfer buffer is allocated on startup,
+ * rather than before each data transfer. This should
+ * ensure that we will indeed have a data buffer.
+ * Ver 1.1 Dec 14 95 Fixed random problems which occurred when the tape
+ * shared an interface with another device.
+ * (poll_for_dsc was a complete mess).
+ * Removed some old (non-active) code which had
+ * to do with supporting buffer cache originated
+ * requests.
+ * The block device interface can now be opened, so
+ * that general ide driver features like the unmask
+ * interrupts flag can be selected with an ioctl.
+ * This is the only use of the block device interface.
+ * New fast pipelined operation mode (currently only on
+ * writes). When using the pipelined mode, the
+ * throughput can potentially reach the maximum
+ * tape supported throughput, regardless of the
+ * user backup program. On my tape drive, it sometimes
+ * boosted performance by a factor of 2. Pipelined
+ * mode is enabled by default, but since it has a few
+ * downfalls as well, you may want to disable it.
+ * A short explanation of the pipelined operation mode
+ * is available below.
+ * Ver 1.2 Jan 1 96 Eliminated pipelined mode race condition.
+ * Added pipeline read mode. As a result, restores
+ * are now as fast as backups.
+ * Optimized shared interface behavior. The new behavior
+ * typically results in better IDE bus efficiency and
+ * higher tape throughput.
+ * Pre-calculation of the expected read/write request
+ * service time, based on the tape's parameters. In
+ * the pipelined operation mode, this allows us to
+ * adjust our polling frequency to a much lower value,
+ * and thus to dramatically reduce our load on Linux,
+ * without any decrease in performance.
+ * Implemented additional mtio.h operations.
+ * The recommended user block size is returned by
+ * the MTIOCGET ioctl.
+ * Additional minor changes.
+ * Ver 1.3 Feb 9 96 Fixed pipelined read mode bug which prevented the
+ * use of some block sizes during a restore procedure.
+ * The character device interface will now present a
+ * continuous view of the media - any mix of block sizes
+ * during a backup/restore procedure is supported. The
+ * driver will buffer the requests internally and
+ * convert them to the tape's recommended transfer
+ * unit, making performance almost independent of the
+ * chosen user block size.
+ * Some improvements in error recovery.
+ * By cooperating with ide-dma.c, bus mastering DMA can
+ * now sometimes be used with IDE tape drives as well.
+ * Bus mastering DMA has the potential to dramatically
+ * reduce the CPU's overhead when accessing the device,
+ * and can be enabled by using hdparm -d1 on the tape's
+ * block device interface. For more info, read the
+ * comments in ide-dma.c.
+ * Ver 1.4 Mar 13 96 Fixed serialize support.
+ * Ver 1.5 Apr 12 96 Fixed shared interface operation, broken in 1.3.85.
+ * Fixed pipelined read mode inefficiency.
+ * Fixed nasty null dereferencing bug.
+ * Ver 1.6 Aug 16 96 Fixed FPU usage in the driver.
+ * Fixed end of media bug.
+ * Ver 1.7 Sep 10 96 Minor changes for the CONNER CTT8000-A model.
+ * Ver 1.8 Sep 26 96 Attempt to find a better balance between good
+ * interactive response and high system throughput.
+ * Ver 1.9 Nov 5 96 Automatically cross encountered filemarks rather
+ * than requiring an explicit FSF command.
+ * Abort pending requests at end of media.
+ * MTTELL was sometimes returning incorrect results.
+ * Return the real block size in the MTIOCGET ioctl.
+ * Some error recovery bug fixes.
+ * Ver 1.10 Nov 5 96 Major reorganization.
+ * Reduced CPU overhead a bit by eliminating internal
+ * bounce buffers.
+ * Added module support.
+ * Added multiple tape drives support.
+ * Added partition support.
+ * Rewrote DSC handling.
+ * Some portability fixes.
+ * Removed ide-tape.h.
+ * Additional minor changes.
+ * Ver 1.11 Dec 2 96 Bug fix in previous DSC timeout handling.
+ * Use ide_stall_queue() for DSC overlap.
+ * Use the maximum speed rather than the current speed
+ * to compute the request service time.
+ * Ver 1.12 Dec 7 97 Fix random memory overwriting and/or last block data
+ * corruption, which could occur if the total number
+ * of bytes written to the tape was not an integral
+ * number of tape blocks.
+ * Add support for INTERRUPT DRQ devices.
+ * Ver 1.13 Jan 2 98 Add "speed == 0" work-around for HP COLORADO 5GB
+ * Ver 1.14 Dec 30 98 Partial fixes for the Sony/AIWA tape drives.
+ * Replace cli()/sti() with hwgroup spinlocks.
+ * Ver 1.15 Mar 25 99 Fix SMP race condition by replacing hwgroup
+ * spinlock with private per-tape spinlock.
+ * Ver 1.16 Sep 1 99 Add OnStream tape support.
+ * Abort read pipeline on EOD.
+ * Wait for the tape to become ready in case it returns
+ * "in the process of becoming ready" on open().
+ * Fix zero padding of the last written block in
+ * case the tape block size is larger than PAGE_SIZE.
+ * Decrease the default disconnection time to tn.
+ * Ver 1.16e Oct 3 99 Minor fixes.
+ * Ver 1.16e1 Oct 13 99 Patches by Arnold Niessen,
+ * niessen@iae.nl / arnold.niessen@philips.com
+ * GO-1) Undefined code in idetape_read_position
+ * according to Gadi's email
+ * AJN-1) Minor fix asc == 11 should be asc == 0x11
+ * in idetape_issue_packet_command (did effect
+ * debugging output only)
+ * AJN-2) Added more debugging output, and
+ * added ide-tape: where missing. I would also
+ * like to add tape->name where possible
+ * AJN-3) Added different debug_level's
+ * via /proc/ide/hdc/settings
+ * "debug_level" determines amount of debugging output;
+ * can be changed using /proc/ide/hdx/settings
+ * 0 : almost no debugging output
+ * 1 : 0+output errors only
+ * 2 : 1+output all sensekey/asc
+ * 3 : 2+follow all chrdev related procedures
+ * 4 : 3+follow all procedures
+ * 5 : 4+include pc_stack rq_stack info
+ * 6 : 5+USE_COUNT updates
+ * AJN-4) Fixed timeout for retension in idetape_queue_pc_tail
+ * from 5 to 10 minutes
+ * AJN-5) Changed maximum number of blocks to skip when
+ * reading tapes with multiple consecutive write
+ * errors from 100 to 1000 in idetape_get_logical_blk
+ * Proposed changes to code:
+ * 1) output "logical_blk_num" via /proc
+ * 2) output "current_operation" via /proc
+ * 3) Either solve or document the fact that `mt rewind' is
+ * required after reading from /dev/nhtx to be
+ * able to rmmod the idetape module;
+ * Also, sometimes an application finishes but the
+ * device remains `busy' for some time. Same cause ?
+ * Proposed changes to release-notes:
+ * 4) write a simple `quickstart' section in the
+ * release notes; I volunteer if you don't want to
+ * 5) include a pointer to video4linux in the doc
+ * to stimulate video applications
+ * 6) release notes lines 331 and 362: explain what happens
+ * if the application data rate is higher than 1100 KB/s;
+ * similar approach to lower-than-500 kB/s ?
+ * 7) 6.6 Comparison; wouldn't it be better to allow different
+ * strategies for read and write ?
+ * Wouldn't it be better to control the tape buffer
+ * contents instead of the bandwidth ?
+ * 8) line 536: replace will by would (if I understand
+ * this section correctly, a hypothetical and unwanted situation
+ * is being described)
+ * Ver 1.16f Dec 15 99 Change place of the secondary OnStream header frames.
+ * Ver 1.17 Nov 2000 / Jan 2001 Marcel Mol, marcel@mesa.nl
+ * - Add idetape_onstream_mode_sense_tape_parameter_page
+ * function to get tape capacity in frames: tape->capacity.
+ * - Add support for DI-50 drives( or any DI- drive).
+ * - 'workaround' for read error/blank block around block 3000.
+ * - Implement Early warning for end of media for Onstream.
+ * - Cosmetic code changes for readability.
+ * - Idetape_position_tape should not use SKIP bit during
+ * Onstream read recovery.
+ * - Add capacity, logical_blk_num and first/last_frame_position
+ * to /proc/ide/hd?/settings.
+ * - Module use count was gone in the Linux 2.4 driver.
+ * Ver 1.17a Apr 2001 Willem Riede osst@riede.org
+ * - Get drive's actual block size from mode sense block descriptor
+ * - Limit size of pipeline
+ * Ver 1.17b Oct 2002 Alan Stern <stern@rowland.harvard.edu>
+ * Changed IDETAPE_MIN_PIPELINE_STAGES to 1 and actually used
+ * it in the code!
+ * Actually removed aborted stages in idetape_abort_pipeline
+ * instead of just changing the command code.
+ * Made the transfer byte count for Request Sense equal to the
+ * actual length of the data transfer.
+ * Changed handling of partial data transfers: they do not
+ * cause DMA errors.
+ * Moved initiation of DMA transfers to the correct place.
+ * Removed reference to unallocated memory.
+ * Made __idetape_discard_read_pipeline return the number of
+ * sectors skipped, not the number of stages.
+ * Replaced errant kfree() calls with __idetape_kfree_stage().
+ * Fixed off-by-one error in testing the pipeline length.
+ * Fixed handling of filemarks in the read pipeline.
+ * Small code optimization for MTBSF and MTBSFM ioctls.
+ * Don't try to unlock the door during device close if is
+ * already unlocked!
+ * Cosmetic fixes to miscellaneous debugging output messages.
+ * Set the minimum /proc/ide/hd?/settings values for "pipeline",
+ * "pipeline_min", and "pipeline_max" to 1.
+ */
diff --git a/Documentation/ide/ide-tape.txt b/Documentation/ide/ide-tape.txt
new file mode 100644
index 0000000..3f348a0
--- /dev/null
+++ b/Documentation/ide/ide-tape.txt
@@ -0,0 +1,65 @@
+IDE ATAPI streaming tape driver.
+
+This driver is a part of the Linux ide driver.
+
+The driver, in co-operation with ide.c, basically traverses the
+request-list for the block device interface. The character device
+interface, on the other hand, creates new requests, adds them
+to the request-list of the block device, and waits for their completion.
+
+The block device major and minor numbers are determined from the
+tape's relative position in the ide interfaces, as explained in ide.c.
+
+The character device interface consists of the following devices:
+
+ht0 major 37, minor 0 first IDE tape, rewind on close.
+ht1 major 37, minor 1 second IDE tape, rewind on close.
+...
+nht0 major 37, minor 128 first IDE tape, no rewind on close.
+nht1 major 37, minor 129 second IDE tape, no rewind on close.
+...
+
+The general magnetic tape commands compatible interface, as defined by
+include/linux/mtio.h, is accessible through the character device.
+
+General ide driver configuration options, such as the interrupt-unmask
+flag, can be configured by issuing an ioctl to the block device interface,
+as any other ide device.
+
+Our own ide-tape ioctl's can be issued to either the block device or
+the character device interface.
+
+Maximal throughput with minimal bus load will usually be achieved in the
+following scenario:
+
+ 1. ide-tape is operating in the pipelined operation mode.
+ 2. No buffering is performed by the user backup program.
+
+Testing was done with a 2 GB CONNER CTMA 4000 IDE ATAPI Streaming Tape Drive.
+
+Here are some words from the first releases of hd.c, which are quoted
+in ide.c and apply here as well:
+
+| Special care is recommended. Have Fun!
+
+Possible improvements:
+
+1. Support for the ATAPI overlap protocol.
+
+In order to maximize bus throughput, we currently use the DSC
+overlap method which enables ide.c to service requests from the
+other device while the tape is busy executing a command. The
+DSC overlap method involves polling the tape's status register
+for the DSC bit, and servicing the other device while the tape
+isn't ready.
+
+In the current QIC development standard (December 1995),
+it is recommended that new tape drives will *in addition*
+implement the ATAPI overlap protocol, which is used for the
+same purpose - efficient use of the IDE bus, but is interrupt
+driven and thus has much less CPU overhead.
+
+ATAPI overlap is likely to be supported in most new ATAPI
+devices, including new ATAPI cdroms, and thus provides us
+a method by which we can achieve higher throughput when
+sharing a (fast) ATA-2 disk with any (slow) new ATAPI device.
diff --git a/Documentation/ide/ide.txt b/Documentation/ide/ide.txt
new file mode 100644
index 0000000..0c78f4b
--- /dev/null
+++ b/Documentation/ide/ide.txt
@@ -0,0 +1,254 @@
+
+ Information regarding the Enhanced IDE drive in Linux 2.6
+
+==============================================================================
+
+
+ The hdparm utility can be used to control various IDE features on a
+ running system. It is packaged separately. Please Look for it on popular
+ linux FTP sites.
+
+
+
+*** IMPORTANT NOTICES: BUGGY IDE CHIPSETS CAN CORRUPT DATA!!
+*** =================
+*** PCI versions of the CMD640 and RZ1000 interfaces are now detected
+*** automatically at startup when PCI BIOS support is configured.
+***
+*** Linux disables the "prefetch" ("readahead") mode of the RZ1000
+*** to prevent data corruption possible due to hardware design flaws.
+***
+*** For the CMD640, linux disables "IRQ unmasking" (hdparm -u1) on any
+*** drive for which the "prefetch" mode of the CMD640 is turned on.
+*** If "prefetch" is disabled (hdparm -p8), then "IRQ unmasking" can be
+*** used again.
+***
+*** For the CMD640, linux disables "32bit I/O" (hdparm -c1) on any drive
+*** for which the "prefetch" mode of the CMD640 is turned off.
+*** If "prefetch" is enabled (hdparm -p9), then "32bit I/O" can be
+*** used again.
+***
+*** The CMD640 is also used on some Vesa Local Bus (VLB) cards, and is *NOT*
+*** automatically detected by Linux. For safe, reliable operation with such
+*** interfaces, one *MUST* use the "cmd640.probe_vlb" kernel option.
+***
+*** Use of the "serialize" option is no longer necessary.
+
+================================================================================
+Common pitfalls:
+
+- 40-conductor IDE cables are capable of transferring data in DMA modes up to
+ udma2, but no faster.
+
+- If possible devices should be attached to separate channels if they are
+ available. Typically the disk on the first and CD-ROM on the second.
+
+- If you mix devices on the same cable, please consider using similar devices
+ in respect of the data transfer mode they support.
+
+- Even better try to stick to the same vendor and device type on the same
+ cable.
+
+================================================================================
+
+This is the multiple IDE interface driver, as evolved from hd.c.
+
+It supports up to 9 IDE interfaces per default, on one or more IRQs (usually
+14 & 15). There can be up to two drives per interface, as per the ATA-6 spec.
+
+Primary: ide0, port 0x1f0; major=3; hda is minor=0; hdb is minor=64
+Secondary: ide1, port 0x170; major=22; hdc is minor=0; hdd is minor=64
+Tertiary: ide2, port 0x1e8; major=33; hde is minor=0; hdf is minor=64
+Quaternary: ide3, port 0x168; major=34; hdg is minor=0; hdh is minor=64
+fifth.. ide4, usually PCI, probed
+sixth.. ide5, usually PCI, probed
+
+To access devices on interfaces > ide0, device entries please make sure that
+device files for them are present in /dev. If not, please create such
+entries, by using /dev/MAKEDEV.
+
+This driver automatically probes for most IDE interfaces (including all PCI
+ones), for the drives/geometries attached to those interfaces, and for the IRQ
+lines being used by the interfaces (normally 14, 15 for ide0/ide1).
+
+Any number of interfaces may share a single IRQ if necessary, at a slight
+performance penalty, whether on separate cards or a single VLB card.
+The IDE driver automatically detects and handles this. However, this may
+or may not be harmful to your hardware.. two or more cards driving the same IRQ
+can potentially burn each other's bus driver, though in practice this
+seldom occurs. Be careful, and if in doubt, don't do it!
+
+Drives are normally found by auto-probing and/or examining the CMOS/BIOS data.
+For really weird situations, the apparent (fdisk) geometry can also be specified
+on the kernel "command line" using LILO. The format of such lines is:
+
+ ide_core.chs=[interface_number.device_number]:cyls,heads,sects
+or ide_core.cdrom=[interface_number.device_number]
+
+For example:
+
+ ide_core.chs=1.0:1050,32,64 ide_core.cdrom=1.1
+
+The results of successful auto-probing may override the physical geometry/irq
+specified, though the "original" geometry may be retained as the "logical"
+geometry for partitioning purposes (fdisk).
+
+If the auto-probing during boot time confuses a drive (ie. the drive works
+with hd.c but not with ide.c), then an command line option may be specified
+for each drive for which you'd like the drive to skip the hardware
+probe/identification sequence. For example:
+
+ ide_core.noprobe=0.1
+or
+ ide_core.chs=1.0:768,16,32
+ ide_core.noprobe=1.0
+
+Note that when only one IDE device is attached to an interface, it should be
+jumpered as "single" or "master", *not* "slave". Many folks have had
+"trouble" with cdroms because of this requirement, so the driver now probes
+for both units, though success is more likely when the drive is jumpered
+correctly.
+
+Courtesy of Scott Snyder and others, the driver supports ATAPI cdrom drives
+such as the NEC-260 and the new MITSUMI triple/quad speed drives.
+Such drives will be identified at boot time, just like a hard disk.
+
+If for some reason your cdrom drive is *not* found at boot time, you can force
+the probe to look harder by supplying a kernel command line parameter
+via LILO, such as:
+
+ ide_core.cdrom=1.0 /* "master" on second interface (hdc) */
+or
+ ide_core.cdrom=1.1 /* "slave" on second interface (hdd) */
+
+For example, a GW2000 system might have a hard drive on the primary
+interface (/dev/hda) and an IDE cdrom drive on the secondary interface
+(/dev/hdc). To mount a CD in the cdrom drive, one would use something like:
+
+ ln -sf /dev/hdc /dev/cdrom
+ mkdir /mnt/cdrom
+ mount /dev/cdrom /mnt/cdrom -t iso9660 -o ro
+
+If, after doing all of the above, mount doesn't work and you see
+errors from the driver (with dmesg) complaining about `status=0xff',
+this means that the hardware is not responding to the driver's attempts
+to read it. One of the following is probably the problem:
+
+ - Your hardware is broken.
+
+ - You are using the wrong address for the device, or you have the
+ drive jumpered wrong. Review the configuration instructions above.
+
+ - Your IDE controller requires some nonstandard initialization sequence
+ before it will work properly. If this is the case, there will often
+ be a separate MS-DOS driver just for the controller. IDE interfaces
+ on sound cards usually fall into this category. Such configurations
+ can often be made to work by first booting MS-DOS, loading the
+ appropriate drivers, and then warm-booting linux (without powering
+ off). This can be automated using loadlin in the MS-DOS autoexec.
+
+If you always get timeout errors, interrupts from the drive are probably
+not making it to the host. Check how you have the hardware jumpered
+and make sure it matches what the driver expects (see the configuration
+instructions above). If you have a PCI system, also check the BIOS
+setup; I've had one report of a system which was shipped with IRQ 15
+disabled by the BIOS.
+
+The kernel is able to execute binaries directly off of the cdrom,
+provided it is mounted with the default block size of 1024 (as above).
+
+Please pass on any feedback on any of this stuff to the maintainer,
+whose address can be found in linux/MAINTAINERS.
+
+The IDE driver is modularized. The high level disk/CD-ROM/tape/floppy
+drivers can always be compiled as loadable modules, the chipset drivers
+can only be compiled into the kernel, and the core code (ide.c) can be
+compiled as a loadable module provided no chipset support is needed.
+
+When using ide.c as a module in combination with kmod, add:
+
+ alias block-major-3 ide-probe
+
+to /etc/modprobe.conf.
+
+When ide.c is used as a module, you can pass command line parameters to the
+driver using the "options=" keyword to insmod, while replacing any ',' with
+';'.
+
+
+================================================================================
+
+Summary of ide driver parameters for kernel command line
+--------------------------------------------------------
+
+For legacy IDE VLB host drivers (ali14xx/dtc2278/ht6560b/qd65xx/umc8672)
+you need to explicitly enable probing by using "probe" kernel parameter,
+i.e. to enable probing for ALI M14xx chipsets (ali14xx host driver) use:
+
+* "ali14xx.probe" boot option when ali14xx driver is built-in the kernel
+
+* "probe" module parameter when ali14xx driver is compiled as module
+ ("modprobe ali14xx probe")
+
+Also for legacy CMD640 host driver (cmd640) you need to use "probe_vlb"
+kernel paremeter to enable probing for VLB version of the chipset (PCI ones
+are detected automatically).
+
+You also need to use "probe" kernel parameter for ide-4drives driver
+(support for IDE generic chipset with four drives on one port).
+
+To enable support for IDE doublers on Amiga use "doubler" kernel parameter
+for gayle host driver (i.e. "gayle.doubler" if the driver is built-in).
+
+To force ignoring cable detection (this should be needed only if you're using
+short 40-wires cable which cannot be automatically detected - if this is not
+a case please report it as a bug instead) use "ignore_cable" kernel parameter:
+
+* "ide_core.ignore_cable=[interface_number]" boot option if IDE is built-in
+ (i.e. "ide_core.ignore_cable=1" to force ignoring cable for "ide1")
+
+* "ignore_cable=[interface_number]" module parameter (for ide_core module)
+ if IDE is compiled as module
+
+Other kernel parameters for ide_core are:
+
+* "nodma=[interface_number.device_number]" to disallow DMA for a device
+
+* "noflush=[interface_number.device_number]" to disable flush requests
+
+* "noprobe=[interface_number.device_number]" to skip probing
+
+* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit
+
+* "cdrom=[interface_number.device_number]" to force device as a CD-ROM
+
+* "chs=[interface_number.device_number]" to force device as a disk (using CHS)
+
+================================================================================
+
+Some Terminology
+----------------
+IDE = Integrated Drive Electronics, meaning that each drive has a built-in
+controller, which is why an "IDE interface card" is not a "controller card".
+
+ATA = AT (the old IBM 286 computer) Attachment Interface, a draft American
+National Standard for connecting hard drives to PCs. This is the official
+name for "IDE".
+
+The latest standards define some enhancements, known as the ATA-6 spec,
+which grew out of vendor-specific "Enhanced IDE" (EIDE) implementations.
+
+ATAPI = ATA Packet Interface, a new protocol for controlling the drives,
+similar to SCSI protocols, created at the same time as the ATA2 standard.
+ATAPI is currently used for controlling CDROM, TAPE and FLOPPY (ZIP or
+LS120/240) devices, removable R/W cartridges, and for high capacity hard disk
+drives.
+
+mlord@pobox.com
+--
+
+Wed Apr 17 22:52:44 CEST 2002 edited by Marcin Dalecki, the current
+maintainer.
+
+Wed Aug 20 22:31:29 CEST 2003 updated ide boot options to current ide.c
+comments at 2.6.0-test4 time. Maciej Soltysiak <solt@dns.toxicfilms.tv>
diff --git a/Documentation/ide/warm-plug-howto.txt b/Documentation/ide/warm-plug-howto.txt
new file mode 100644
index 0000000..d588546
--- /dev/null
+++ b/Documentation/ide/warm-plug-howto.txt
@@ -0,0 +1,13 @@
+
+IDE warm-plug HOWTO
+===================
+
+To warm-plug devices on a port 'idex':
+
+# echo -n "1" > /sys/class/ide_port/idex/delete_devices
+
+unplug old device(s) and plug new device(s)
+
+# echo -n "1" > /sys/class/ide_port/idex/scan
+
+done
diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.txt
new file mode 100644
index 0000000..e167854
--- /dev/null
+++ b/Documentation/infiniband/core_locking.txt
@@ -0,0 +1,114 @@
+INFINIBAND MIDLAYER LOCKING
+
+ This guide is an attempt to make explicit the locking assumptions
+ made by the InfiniBand midlayer. It describes the requirements on
+ both low-level drivers that sit below the midlayer and upper level
+ protocols that use the midlayer.
+
+Sleeping and interrupt context
+
+ With the following exceptions, a low-level driver implementation of
+ all of the methods in struct ib_device may sleep. The exceptions
+ are any methods from the list:
+
+ create_ah
+ modify_ah
+ query_ah
+ destroy_ah
+ bind_mw
+ post_send
+ post_recv
+ poll_cq
+ req_notify_cq
+ map_phys_fmr
+
+ which may not sleep and must be callable from any context.
+
+ The corresponding functions exported to upper level protocol
+ consumers:
+
+ ib_create_ah
+ ib_modify_ah
+ ib_query_ah
+ ib_destroy_ah
+ ib_bind_mw
+ ib_post_send
+ ib_post_recv
+ ib_req_notify_cq
+ ib_map_phys_fmr
+
+ are therefore safe to call from any context.
+
+ In addition, the function
+
+ ib_dispatch_event
+
+ used by low-level drivers to dispatch asynchronous events through
+ the midlayer is also safe to call from any context.
+
+Reentrancy
+
+ All of the methods in struct ib_device exported by a low-level
+ driver must be fully reentrant. The low-level driver is required to
+ perform all synchronization necessary to maintain consistency, even
+ if multiple function calls using the same object are run
+ simultaneously.
+
+ The IB midlayer does not perform any serialization of function calls.
+
+ Because low-level drivers are reentrant, upper level protocol
+ consumers are not required to perform any serialization. However,
+ some serialization may be required to get sensible results. For
+ example, a consumer may safely call ib_poll_cq() on multiple CPUs
+ simultaneously. However, the ordering of the work completion
+ information between different calls of ib_poll_cq() is not defined.
+
+Callbacks
+
+ A low-level driver must not perform a callback directly from the
+ same callchain as an ib_device method call. For example, it is not
+ allowed for a low-level driver to call a consumer's completion event
+ handler directly from its post_send method. Instead, the low-level
+ driver should defer this callback by, for example, scheduling a
+ tasklet to perform the callback.
+
+ The low-level driver is responsible for ensuring that multiple
+ completion event handlers for the same CQ are not called
+ simultaneously. The driver must guarantee that only one CQ event
+ handler for a given CQ is running at a time. In other words, the
+ following situation is not allowed:
+
+ CPU1 CPU2
+
+ low-level driver ->
+ consumer CQ event callback:
+ /* ... */
+ ib_req_notify_cq(cq, ...);
+ low-level driver ->
+ /* ... */ consumer CQ event callback:
+ /* ... */
+ return from CQ event handler
+
+ The context in which completion event and asynchronous event
+ callbacks run is not defined. Depending on the low-level driver, it
+ may be process context, softirq context, or interrupt context.
+ Upper level protocol consumers may not sleep in a callback.
+
+Hot-plug
+
+ A low-level driver announces that a device is ready for use by
+ consumers when it calls ib_register_device(), all initialization
+ must be complete before this call. The device must remain usable
+ until the driver's call to ib_unregister_device() has returned.
+
+ A low-level driver must call ib_register_device() and
+ ib_unregister_device() from process context. It must not hold any
+ semaphores that could cause deadlock if a consumer calls back into
+ the driver across these calls.
+
+ An upper level protocol consumer may begin using an IB device as
+ soon as the add method of its struct ib_client is called for that
+ device. A consumer must finish all cleanup and free all resources
+ relating to a device before returning from the remove method.
+
+ A consumer is permitted to sleep in its add and remove methods.
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
new file mode 100644
index 0000000..864ff32
--- /dev/null
+++ b/Documentation/infiniband/ipoib.txt
@@ -0,0 +1,57 @@
+IP OVER INFINIBAND
+
+ The ib_ipoib driver is an implementation of the IP over InfiniBand
+ protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib
+ working group. It is a "native" implementation in the sense of
+ setting the interface type to ARPHRD_INFINIBAND and the hardware
+ address length to 20 (earlier proprietary implementations
+ masqueraded to the kernel as ethernet interfaces).
+
+Partitions and P_Keys
+
+ When the IPoIB driver is loaded, it creates one interface for each
+ port using the P_Key at index 0. To create an interface with a
+ different P_Key, write the desired P_Key into the main interface's
+ /sys/class/net/<intf name>/create_child file. For example:
+
+ echo 0x8001 > /sys/class/net/ib0/create_child
+
+ This will create an interface named ib0.8001 with P_Key 0x8001. To
+ remove a subinterface, use the "delete_child" file:
+
+ echo 0x8001 > /sys/class/net/ib0/delete_child
+
+ The P_Key for any interface is given by the "pkey" file, and the
+ main interface for a subinterface is in "parent."
+
+Debugging Information
+
+ By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
+ to 'y', tracing messages are compiled into the driver. They are
+ turned on by setting the module parameters debug_level and
+ mcast_debug_level to 1. These parameters can be controlled at
+ runtime through files in /sys/module/ib_ipoib/.
+
+ CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs
+ virtual filesystem. By mounting this filesystem, for example with
+
+ mount -t debugfs none /sys/kernel/debug
+
+ it is possible to get statistics about multicast groups from the
+ files /sys/kernel/debug/ipoib/ib0_mcg and so on.
+
+ The performance impact of this option is negligible, so it
+ is safe to enable this option with debug_level set to 0 for normal
+ operation.
+
+ CONFIG_INFINIBAND_IPOIB_DEBUG_DATA enables even more debug output in
+ the data path when data_debug_level is set to 1. However, even with
+ the output disabled, enabling this configuration option will affect
+ performance, because it adds tests to the fast path.
+
+References
+
+ Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
+ http://ietf.org/rfc/rfc4391.txt
+ IP over InfiniBand (IPoIB) Architecture (RFC 4392)
+ http://ietf.org/rfc/rfc4392.txt
diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt
new file mode 100644
index 0000000..ddd519b
--- /dev/null
+++ b/Documentation/infiniband/sysfs.txt
@@ -0,0 +1,66 @@
+SYSFS FILES
+
+ For each InfiniBand device, the InfiniBand drivers create the
+ following files under /sys/class/infiniband/<device name>:
+
+ node_type - Node type (CA, switch or router)
+ node_guid - Node GUID
+ sys_image_guid - System image GUID
+
+ In addition, there is a "ports" subdirectory, with one subdirectory
+ for each port. For example, if mthca0 is a 2-port HCA, there will
+ be two directories:
+
+ /sys/class/infiniband/mthca0/ports/1
+ /sys/class/infiniband/mthca0/ports/2
+
+ (A switch will only have a single "0" subdirectory for switch port
+ 0; no subdirectory is created for normal switch ports)
+
+ In each port subdirectory, the following files are created:
+
+ cap_mask - Port capability mask
+ lid - Port LID
+ lid_mask_count - Port LID mask count
+ rate - Port data rate (active width * active speed)
+ sm_lid - Subnet manager LID for port's subnet
+ sm_sl - Subnet manager SL for port's subnet
+ state - Port state (DOWN, INIT, ARMED, ACTIVE or ACTIVE_DEFER)
+ phys_state - Port physical state (Sleep, Polling, LinkUp, etc)
+
+ There is also a "counters" subdirectory, with files
+
+ VL15_dropped
+ excessive_buffer_overrun_errors
+ link_downed
+ link_error_recovery
+ local_link_integrity_errors
+ port_rcv_constraint_errors
+ port_rcv_data
+ port_rcv_errors
+ port_rcv_packets
+ port_rcv_remote_physical_errors
+ port_rcv_switch_relay_errors
+ port_xmit_constraint_errors
+ port_xmit_data
+ port_xmit_discards
+ port_xmit_packets
+ symbol_error
+
+ Each of these files contains the corresponding value from the port's
+ Performance Management PortCounters attribute, as described in
+ section 16.1.3.5 of the InfiniBand Architecture Specification.
+
+ The "pkeys" and "gids" subdirectories contain one file for each
+ entry in the port's P_Key or GID table respectively. For example,
+ ports/1/pkeys/10 contains the value at index 10 in port 1's P_Key
+ table.
+
+MTHCA
+
+ The Mellanox HCA driver also creates the files:
+
+ hw_rev - Hardware revision number
+ fw_ver - Firmware version
+ hca_type - HCA type: "MT23108", "MT25208 (MT23108 compat mode)",
+ or "MT25208"
diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.txt
new file mode 100644
index 0000000..744687d
--- /dev/null
+++ b/Documentation/infiniband/user_mad.txt
@@ -0,0 +1,148 @@
+USERSPACE MAD ACCESS
+
+Device files
+
+ Each port of each InfiniBand device has a "umad" device and an
+ "issm" device attached. For example, a two-port HCA will have two
+ umad devices and two issm devices, while a switch will have one
+ device of each type (for switch port 0).
+
+Creating MAD agents
+
+ A MAD agent can be created by filling in a struct ib_user_mad_reg_req
+ and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
+ descriptor for the appropriate device file. If the registration
+ request succeeds, a 32-bit id will be returned in the structure.
+ For example:
+
+ struct ib_user_mad_reg_req req = { /* ... */ };
+ ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
+ if (!ret)
+ my_agent = req.id;
+ else
+ perror("agent register");
+
+ Agents can be unregistered with the IB_USER_MAD_UNREGISTER_AGENT
+ ioctl. Also, all agents registered through a file descriptor will
+ be unregistered when the descriptor is closed.
+
+Receiving MADs
+
+ MADs are received using read(). The receive side now supports
+ RMPP. The buffer passed to read() must be at least one
+ struct ib_user_mad + 256 bytes. For example:
+
+ If the buffer passed is not large enough to hold the received
+ MAD (RMPP), the errno is set to ENOSPC and the length of the
+ buffer needed is set in mad.length.
+
+ Example for normal MAD (non RMPP) reads:
+ struct ib_user_mad *mad;
+ mad = malloc(sizeof *mad + 256);
+ ret = read(fd, mad, sizeof *mad + 256);
+ if (ret != sizeof mad + 256) {
+ perror("read");
+ free(mad);
+ }
+
+ Example for RMPP reads:
+ struct ib_user_mad *mad;
+ mad = malloc(sizeof *mad + 256);
+ ret = read(fd, mad, sizeof *mad + 256);
+ if (ret == -ENOSPC)) {
+ length = mad.length;
+ free(mad);
+ mad = malloc(sizeof *mad + length);
+ ret = read(fd, mad, sizeof *mad + length);
+ }
+ if (ret < 0) {
+ perror("read");
+ free(mad);
+ }
+
+ In addition to the actual MAD contents, the other struct ib_user_mad
+ fields will be filled in with information on the received MAD. For
+ example, the remote LID will be in mad.lid.
+
+ If a send times out, a receive will be generated with mad.status set
+ to ETIMEDOUT. Otherwise when a MAD has been successfully received,
+ mad.status will be 0.
+
+ poll()/select() may be used to wait until a MAD can be read.
+
+Sending MADs
+
+ MADs are sent using write(). The agent ID for sending should be
+ filled into the id field of the MAD, the destination LID should be
+ filled into the lid field, and so on. The send side does support
+ RMPP so arbitrary length MAD can be sent. For example:
+
+ struct ib_user_mad *mad;
+
+ mad = malloc(sizeof *mad + mad_length);
+
+ /* fill in mad->data */
+
+ mad->hdr.id = my_agent; /* req.id from agent registration */
+ mad->hdr.lid = my_dest; /* in network byte order... */
+ /* etc. */
+
+ ret = write(fd, &mad, sizeof *mad + mad_length);
+ if (ret != sizeof *mad + mad_length)
+ perror("write");
+
+Transaction IDs
+
+ Users of the umad devices can use the lower 32 bits of the
+ transaction ID field (that is, the least significant half of the
+ field in network byte order) in MADs being sent to match
+ request/response pairs. The upper 32 bits are reserved for use by
+ the kernel and will be overwritten before a MAD is sent.
+
+P_Key Index Handling
+
+ The old ib_umad interface did not allow setting the P_Key index for
+ MADs that are sent and did not provide a way for obtaining the P_Key
+ index of received MADs. A new layout for struct ib_user_mad_hdr
+ with a pkey_index member has been defined; however, to preserve
+ binary compatibility with older applications, this new layout will
+ not be used unless the IB_USER_MAD_ENABLE_PKEY ioctl is called
+ before a file descriptor is used for anything else.
+
+ In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
+ to 6, the new layout of struct ib_user_mad_hdr will be used by
+ default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
+
+Setting IsSM Capability Bit
+
+ To set the IsSM capability bit for a port, simply open the
+ corresponding issm device file. If the IsSM bit is already set,
+ then the open call will block until the bit is cleared (or return
+ immediately with errno set to EAGAIN if the O_NONBLOCK flag is
+ passed to open()). The IsSM bit will be cleared when the issm file
+ is closed. No read, write or other operations can be performed on
+ the issm file.
+
+/dev files
+
+ To create the appropriate character device files automatically with
+ udev, a rule like
+
+ KERNEL="umad*", NAME="infiniband/%k"
+ KERNEL="issm*", NAME="infiniband/%k"
+
+ can be used. This will create device nodes named
+
+ /dev/infiniband/umad0
+ /dev/infiniband/issm0
+
+ for the first port, and so on. The InfiniBand device and port
+ associated with these devices can be determined from the files
+
+ /sys/class/infiniband_mad/umad0/ibdev
+ /sys/class/infiniband_mad/umad0/port
+
+ and
+
+ /sys/class/infiniband_mad/issm0/ibdev
+ /sys/class/infiniband_mad/issm0/port
diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.txt
new file mode 100644
index 0000000..f847501
--- /dev/null
+++ b/Documentation/infiniband/user_verbs.txt
@@ -0,0 +1,69 @@
+USERSPACE VERBS ACCESS
+
+ The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS,
+ enables direct userspace access to IB hardware via "verbs," as
+ described in chapter 11 of the InfiniBand Architecture Specification.
+
+ To use the verbs, the libibverbs library, available from
+ <http://openib.org/>, is required. libibverbs contains a
+ device-independent API for using the ib_uverbs interface.
+ libibverbs also requires appropriate device-dependent kernel and
+ userspace driver for your InfiniBand hardware. For example, to use
+ a Mellanox HCA, you will need the ib_mthca kernel module and the
+ libmthca userspace driver be installed.
+
+User-kernel communication
+
+ Userspace communicates with the kernel for slow path, resource
+ management operations via the /dev/infiniband/uverbsN character
+ devices. Fast path operations are typically performed by writing
+ directly to hardware registers mmap()ed into userspace, with no
+ system call or context switch into the kernel.
+
+ Commands are sent to the kernel via write()s on these device files.
+ The ABI is defined in drivers/infiniband/include/ib_user_verbs.h.
+ The structs for commands that require a response from the kernel
+ contain a 64-bit field used to pass a pointer to an output buffer.
+ Status is returned to userspace as the return value of the write()
+ system call.
+
+Resource management
+
+ Since creation and destruction of all IB resources is done by
+ commands passed through a file descriptor, the kernel can keep track
+ of which resources are attached to a given userspace context. The
+ ib_uverbs module maintains idr tables that are used to translate
+ between kernel pointers and opaque userspace handles, so that kernel
+ pointers are never exposed to userspace and userspace cannot trick
+ the kernel into following a bogus pointer.
+
+ This also allows the kernel to clean up when a process exits and
+ prevent one process from touching another process's resources.
+
+Memory pinning
+
+ Direct userspace I/O requires that memory regions that are potential
+ I/O targets be kept resident at the same physical address. The
+ ib_uverbs module manages pinning and unpinning memory regions via
+ get_user_pages() and put_page() calls. It also accounts for the
+ amount of memory pinned in the process's locked_vm, and checks that
+ unprivileged processes do not exceed their RLIMIT_MEMLOCK limit.
+
+ Pages that are pinned multiple times are counted each time they are
+ pinned, so the value of locked_vm may be an overestimate of the
+ number of pages pinned by a process.
+
+/dev files
+
+ To create the appropriate character device files automatically with
+ udev, a rule like
+
+ KERNEL="uverbs*", NAME="infiniband/%k"
+
+ can be used. This will create device nodes named
+
+ /dev/infiniband/uverbs0
+
+ and so on. Since the InfiniBand userspace verbs should be safe for
+ use by non-privileged processes, it may be useful to add an
+ appropriate MODE or GROUP to the udev rule.
diff --git a/Documentation/initrd.txt b/Documentation/initrd.txt
new file mode 100644
index 0000000..1ba84f3
--- /dev/null
+++ b/Documentation/initrd.txt
@@ -0,0 +1,366 @@
+Using the initial RAM disk (initrd)
+===================================
+
+Written 1996,2000 by Werner Almesberger <werner.almesberger@epfl.ch> and
+ Hans Lermen <lermen@fgan.de>
+
+
+initrd provides the capability to load a RAM disk by the boot loader.
+This RAM disk can then be mounted as the root file system and programs
+can be run from it. Afterwards, a new root file system can be mounted
+from a different device. The previous root (from initrd) is then moved
+to a directory and can be subsequently unmounted.
+
+initrd is mainly designed to allow system startup to occur in two phases,
+where the kernel comes up with a minimum set of compiled-in drivers, and
+where additional modules are loaded from initrd.
+
+This document gives a brief overview of the use of initrd. A more detailed
+discussion of the boot process can be found in [1].
+
+
+Operation
+---------
+
+When using initrd, the system typically boots as follows:
+
+ 1) the boot loader loads the kernel and the initial RAM disk
+ 2) the kernel converts initrd into a "normal" RAM disk and
+ frees the memory used by initrd
+ 3) if the root device is not /dev/ram0, the old (deprecated)
+ change_root procedure is followed. see the "Obsolete root change
+ mechanism" section below.
+ 4) root device is mounted. if it is /dev/ram0, the initrd image is
+ then mounted as root
+ 5) /sbin/init is executed (this can be any valid executable, including
+ shell scripts; it is run with uid 0 and can do basically everything
+ init can do).
+ 6) init mounts the "real" root file system
+ 7) init places the root file system at the root directory using the
+ pivot_root system call
+ 8) init execs the /sbin/init on the new root filesystem, performing
+ the usual boot sequence
+ 9) the initrd file system is removed
+
+Note that changing the root directory does not involve unmounting it.
+It is therefore possible to leave processes running on initrd during that
+procedure. Also note that file systems mounted under initrd continue to
+be accessible.
+
+
+Boot command-line options
+-------------------------
+
+initrd adds the following new options:
+
+ initrd=<path> (e.g. LOADLIN)
+
+ Loads the specified file as the initial RAM disk. When using LILO, you
+ have to specify the RAM disk image file in /etc/lilo.conf, using the
+ INITRD configuration variable.
+
+ noinitrd
+
+ initrd data is preserved but it is not converted to a RAM disk and
+ the "normal" root file system is mounted. initrd data can be read
+ from /dev/initrd. Note that the data in initrd can have any structure
+ in this case and doesn't necessarily have to be a file system image.
+ This option is used mainly for debugging.
+
+ Note: /dev/initrd is read-only and it can only be used once. As soon
+ as the last process has closed it, all data is freed and /dev/initrd
+ can't be opened anymore.
+
+ root=/dev/ram0
+
+ initrd is mounted as root, and the normal boot procedure is followed,
+ with the RAM disk mounted as root.
+
+Compressed cpio images
+----------------------
+
+Recent kernels have support for populating a ramdisk from a compressed cpio
+archive. On such systems, the creation of a ramdisk image doesn't need to
+involve special block devices or loopbacks; you merely create a directory on
+disk with the desired initrd content, cd to that directory, and run (as an
+example):
+
+find . | cpio --quiet -H newc -o | gzip -9 -n > /boot/imagefile.img
+
+Examining the contents of an existing image file is just as simple:
+
+mkdir /tmp/imagefile
+cd /tmp/imagefile
+gzip -cd /boot/imagefile.img | cpio -imd --quiet
+
+Installation
+------------
+
+First, a directory for the initrd file system has to be created on the
+"normal" root file system, e.g.
+
+# mkdir /initrd
+
+The name is not relevant. More details can be found on the pivot_root(2)
+man page.
+
+If the root file system is created during the boot procedure (i.e. if
+you're building an install floppy), the root file system creation
+procedure should create the /initrd directory.
+
+If initrd will not be mounted in some cases, its content is still
+accessible if the following device has been created:
+
+# mknod /dev/initrd b 1 250
+# chmod 400 /dev/initrd
+
+Second, the kernel has to be compiled with RAM disk support and with
+support for the initial RAM disk enabled. Also, at least all components
+needed to execute programs from initrd (e.g. executable format and file
+system) must be compiled into the kernel.
+
+Third, you have to create the RAM disk image. This is done by creating a
+file system on a block device, copying files to it as needed, and then
+copying the content of the block device to the initrd file. With recent
+kernels, at least three types of devices are suitable for that:
+
+ - a floppy disk (works everywhere but it's painfully slow)
+ - a RAM disk (fast, but allocates physical memory)
+ - a loopback device (the most elegant solution)
+
+We'll describe the loopback device method:
+
+ 1) make sure loopback block devices are configured into the kernel
+ 2) create an empty file system of the appropriate size, e.g.
+ # dd if=/dev/zero of=initrd bs=300k count=1
+ # mke2fs -F -m0 initrd
+ (if space is critical, you may want to use the Minix FS instead of Ext2)
+ 3) mount the file system, e.g.
+ # mount -t ext2 -o loop initrd /mnt
+ 4) create the console device:
+ # mkdir /mnt/dev
+ # mknod /mnt/dev/console c 5 1
+ 5) copy all the files that are needed to properly use the initrd
+ environment. Don't forget the most important file, /sbin/init
+ Note that /sbin/init's permissions must include "x" (execute).
+ 6) correct operation the initrd environment can frequently be tested
+ even without rebooting with the command
+ # chroot /mnt /sbin/init
+ This is of course limited to initrds that do not interfere with the
+ general system state (e.g. by reconfiguring network interfaces,
+ overwriting mounted devices, trying to start already running demons,
+ etc. Note however that it is usually possible to use pivot_root in
+ such a chroot'ed initrd environment.)
+ 7) unmount the file system
+ # umount /mnt
+ 8) the initrd is now in the file "initrd". Optionally, it can now be
+ compressed
+ # gzip -9 initrd
+
+For experimenting with initrd, you may want to take a rescue floppy and
+only add a symbolic link from /sbin/init to /bin/sh. Alternatively, you
+can try the experimental newlib environment [2] to create a small
+initrd.
+
+Finally, you have to boot the kernel and load initrd. Almost all Linux
+boot loaders support initrd. Since the boot process is still compatible
+with an older mechanism, the following boot command line parameters
+have to be given:
+
+ root=/dev/ram0 rw
+
+(rw is only necessary if writing to the initrd file system.)
+
+With LOADLIN, you simply execute
+
+ LOADLIN <kernel> initrd=<disk_image>
+e.g. LOADLIN C:\LINUX\BZIMAGE initrd=C:\LINUX\INITRD.GZ root=/dev/ram0 rw
+
+With LILO, you add the option INITRD=<path> to either the global section
+or to the section of the respective kernel in /etc/lilo.conf, and pass
+the options using APPEND, e.g.
+
+ image = /bzImage
+ initrd = /boot/initrd.gz
+ append = "root=/dev/ram0 rw"
+
+and run /sbin/lilo
+
+For other boot loaders, please refer to the respective documentation.
+
+Now you can boot and enjoy using initrd.
+
+
+Changing the root device
+------------------------
+
+When finished with its duties, init typically changes the root device
+and proceeds with starting the Linux system on the "real" root device.
+
+The procedure involves the following steps:
+ - mounting the new root file system
+ - turning it into the root file system
+ - removing all accesses to the old (initrd) root file system
+ - unmounting the initrd file system and de-allocating the RAM disk
+
+Mounting the new root file system is easy: it just needs to be mounted on
+a directory under the current root. Example:
+
+# mkdir /new-root
+# mount -o ro /dev/hda1 /new-root
+
+The root change is accomplished with the pivot_root system call, which
+is also available via the pivot_root utility (see pivot_root(8) man
+page; pivot_root is distributed with util-linux version 2.10h or higher
+[3]). pivot_root moves the current root to a directory under the new
+root, and puts the new root at its place. The directory for the old root
+must exist before calling pivot_root. Example:
+
+# cd /new-root
+# mkdir initrd
+# pivot_root . initrd
+
+Now, the init process may still access the old root via its
+executable, shared libraries, standard input/output/error, and its
+current root directory. All these references are dropped by the
+following command:
+
+# exec chroot . what-follows <dev/console >dev/console 2>&1
+
+Where what-follows is a program under the new root, e.g. /sbin/init
+If the new root file system will be used with udev and has no valid
+/dev directory, udev must be initialized before invoking chroot in order
+to provide /dev/console.
+
+Note: implementation details of pivot_root may change with time. In order
+to ensure compatibility, the following points should be observed:
+
+ - before calling pivot_root, the current directory of the invoking
+ process should point to the new root directory
+ - use . as the first argument, and the _relative_ path of the directory
+ for the old root as the second argument
+ - a chroot program must be available under the old and the new root
+ - chroot to the new root afterwards
+ - use relative paths for dev/console in the exec command
+
+Now, the initrd can be unmounted and the memory allocated by the RAM
+disk can be freed:
+
+# umount /initrd
+# blockdev --flushbufs /dev/ram0
+
+It is also possible to use initrd with an NFS-mounted root, see the
+pivot_root(8) man page for details.
+
+
+Usage scenarios
+---------------
+
+The main motivation for implementing initrd was to allow for modular
+kernel configuration at system installation. The procedure would work
+as follows:
+
+ 1) system boots from floppy or other media with a minimal kernel
+ (e.g. support for RAM disks, initrd, a.out, and the Ext2 FS) and
+ loads initrd
+ 2) /sbin/init determines what is needed to (1) mount the "real" root FS
+ (i.e. device type, device drivers, file system) and (2) the
+ distribution media (e.g. CD-ROM, network, tape, ...). This can be
+ done by asking the user, by auto-probing, or by using a hybrid
+ approach.
+ 3) /sbin/init loads the necessary kernel modules
+ 4) /sbin/init creates and populates the root file system (this doesn't
+ have to be a very usable system yet)
+ 5) /sbin/init invokes pivot_root to change the root file system and
+ execs - via chroot - a program that continues the installation
+ 6) the boot loader is installed
+ 7) the boot loader is configured to load an initrd with the set of
+ modules that was used to bring up the system (e.g. /initrd can be
+ modified, then unmounted, and finally, the image is written from
+ /dev/ram0 or /dev/rd/0 to a file)
+ 8) now the system is bootable and additional installation tasks can be
+ performed
+
+The key role of initrd here is to re-use the configuration data during
+normal system operation without requiring the use of a bloated "generic"
+kernel or re-compiling or re-linking the kernel.
+
+A second scenario is for installations where Linux runs on systems with
+different hardware configurations in a single administrative domain. In
+such cases, it is desirable to generate only a small set of kernels
+(ideally only one) and to keep the system-specific part of configuration
+information as small as possible. In this case, a common initrd could be
+generated with all the necessary modules. Then, only /sbin/init or a file
+read by it would have to be different.
+
+A third scenario is more convenient recovery disks, because information
+like the location of the root FS partition doesn't have to be provided at
+boot time, but the system loaded from initrd can invoke a user-friendly
+dialog and it can also perform some sanity checks (or even some form of
+auto-detection).
+
+Last not least, CD-ROM distributors may use it for better installation
+from CD, e.g. by using a boot floppy and bootstrapping a bigger RAM disk
+via initrd from CD; or by booting via a loader like LOADLIN or directly
+from the CD-ROM, and loading the RAM disk from CD without need of
+floppies.
+
+
+Obsolete root change mechanism
+------------------------------
+
+The following mechanism was used before the introduction of pivot_root.
+Current kernels still support it, but you should _not_ rely on its
+continued availability.
+
+It works by mounting the "real" root device (i.e. the one set with rdev
+in the kernel image or with root=... at the boot command line) as the
+root file system when linuxrc exits. The initrd file system is then
+unmounted, or, if it is still busy, moved to a directory /initrd, if
+such a directory exists on the new root file system.
+
+In order to use this mechanism, you do not have to specify the boot
+command options root, init, or rw. (If specified, they will affect
+the real root file system, not the initrd environment.)
+
+If /proc is mounted, the "real" root device can be changed from within
+linuxrc by writing the number of the new root FS device to the special
+file /proc/sys/kernel/real-root-dev, e.g.
+
+ # echo 0x301 >/proc/sys/kernel/real-root-dev
+
+Note that the mechanism is incompatible with NFS and similar file
+systems.
+
+This old, deprecated mechanism is commonly called "change_root", while
+the new, supported mechanism is called "pivot_root".
+
+
+Mixed change_root and pivot_root mechanism
+------------------------------------------
+
+In case you did not want to use root=/dev/ram0 to trigger the pivot_root
+mechanism, you may create both /linuxrc and /sbin/init in your initrd image.
+
+/linuxrc would contain only the following:
+
+#! /bin/sh
+mount -n -t proc proc /proc
+echo 0x0100 >/proc/sys/kernel/real-root-dev
+umount -n /proc
+
+Once linuxrc exited, the kernel would mount again your initrd as root,
+this time executing /sbin/init. Again, it would be the duty of this init
+to build the right environment (maybe using the root= device passed on
+the cmdline) before the final execution of the real /sbin/init.
+
+
+Resources
+---------
+
+[1] Almesberger, Werner; "Booting Linux: The History and the Future"
+ http://www.almesberger.net/cv/papers/ols2k-9.ps.gz
+[2] newlib package (experimental), with initrd example
+ http://sources.redhat.com/newlib/
+[3] Brouwer, Andries; "util-linux: Miscellaneous utilities for Linux"
+ ftp://ftp.win.tue.nl/pub/linux-local/utils/util-linux/
diff --git a/Documentation/input/amijoy.txt b/Documentation/input/amijoy.txt
new file mode 100644
index 0000000..7dc4f17
--- /dev/null
+++ b/Documentation/input/amijoy.txt
@@ -0,0 +1,184 @@
+Amiga 4-joystick parport extension
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Parallel port pins:
+
+ (2) - Up1 (6) - Up2
+ (3) - Down1 (7) - Down2
+ (4) - Left1 (8) - Left2
+ (5) - Right1 (9) - Right2
+(13) - Fire1 (11) - Fire2
+(18) - Gnd1 (18) - Gnd2
+
+Amiga digital joystick pinout
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+(1) - Up
+(2) - Down
+(3) - Left
+(4) - Right
+(5) - n/c
+(6) - Fire button
+(7) - +5V (50mA)
+(8) - Gnd
+(9) - Thumb button
+
+Amiga mouse pinout
+~~~~~~~~~~~~~~~~~~
+(1) - V-pulse
+(2) - H-pulse
+(3) - VQ-pulse
+(4) - HQ-pulse
+(5) - Middle button
+(6) - Left button
+(7) - +5V (50mA)
+(8) - Gnd
+(9) - Right button
+
+Amiga analog joystick pinout
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+(1) - Top button
+(2) - Top2 button
+(3) - Trigger button
+(4) - Thumb button
+(5) - Analog X
+(6) - n/c
+(7) - +5V (50mA)
+(8) - Gnd
+(9) - Analog Y
+
+Amiga lightpen pinout
+~~~~~~~~~~~~~~~~~~~~~
+(1) - n/c
+(2) - n/c
+(3) - n/c
+(4) - n/c
+(5) - Touch button
+(6) - /Beamtrigger
+(7) - +5V (50mA)
+(8) - Gnd
+(9) - Stylus button
+
+-------------------------------------------------------------------------------
+
+NAME rev ADDR type chip Description
+JOY0DAT 00A R Denise Joystick-mouse 0 data (left vert, horiz)
+JOY1DAT 00C R Denise Joystick-mouse 1 data (right vert,horiz)
+
+ These addresses each read a 16 bit register. These in turn
+ are loaded from the MDAT serial stream and are clocked in on
+ the rising edge of SCLK. MLD output is used to parallel load
+ the external parallel-to-serial converter.This in turn is
+ loaded with the 4 quadrature inputs from each of two game
+ controller ports (8 total) plus 8 miscellaneous control bits
+ which are new for LISA and can be read in upper 8 bits of
+ LISAID.
+ Register bits are as follows:
+ Mouse counter usage (pins 1,3 =Yclock, pins 2,4 =Xclock)
+
+ BIT# 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+JOY0DAT Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 X7 X6 X5 X4 X3 X2 X1 X0
+JOY1DAT Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 X7 X6 X5 X4 X3 X2 X1 X0
+
+ 0=LEFT CONTROLLER PAIR, 1=RIGHT CONTROLLER PAIR.
+ (4 counters total). The bit usage for both left and right
+ addresses is shown below. Each 6 bit counter (Y7-Y2,X7-X2) is
+ clocked by 2 of the signals input from the mouse serial
+ stream. Starting with first bit received:
+
+ +-------------------+-----------------------------------------+
+ | Serial | Bit Name | Description |
+ +--------+----------+-----------------------------------------+
+ | 0 | M0H | JOY0DAT Horizontal Clock |
+ | 1 | M0HQ | JOY0DAT Horizontal Clock (quadrature) |
+ | 2 | M0V | JOY0DAT Vertical Clock |
+ | 3 | M0VQ | JOY0DAT Vertical Clock (quadrature) |
+ | 4 | M1V | JOY1DAT Horizontal Clock |
+ | 5 | M1VQ | JOY1DAT Horizontal Clock (quadrature) |
+ | 6 | M1V | JOY1DAT Vertical Clock |
+ | 7 | M1VQ | JOY1DAT Vertical Clock (quadrature) |
+ +--------+----------+-----------------------------------------+
+
+ Bits 1 and 0 of each counter (Y1-Y0,X1-X0) may be
+ read to determine the state of the related input signal pair.
+ This allows these pins to double as joystick switch inputs.
+ Joystick switch closures can be deciphered as follows:
+
+ +------------+------+---------------------------------+
+ | Directions | Pin# | Counter bits |
+ +------------+------+---------------------------------+
+ | Forward | 1 | Y1 xor Y0 (BIT#09 xor BIT#08) |
+ | Left | 3 | Y1 |
+ | Back | 2 | X1 xor X0 (BIT#01 xor BIT#00) |
+ | Right | 4 | X1 |
+ +------------+------+---------------------------------+
+
+-------------------------------------------------------------------------------
+
+NAME rev ADDR type chip Description
+JOYTEST 036 W Denise Write to all 4 joystick-mouse counters at once.
+
+ Mouse counter write test data:
+ BIT# 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+ JOYxDAT Y7 Y6 Y5 Y4 Y3 Y2 xx xx X7 X6 X5 X4 X3 X2 xx xx
+ JOYxDAT Y7 Y6 Y5 Y4 Y3 Y2 xx xx X7 X6 X5 X4 X3 X2 xx xx
+
+-------------------------------------------------------------------------------
+
+NAME rev ADDR type chip Description
+POT0DAT h 012 R Paula Pot counter data left pair (vert, horiz)
+POT1DAT h 014 R Paula Pot counter data right pair (vert,horiz)
+
+ These addresses each read a pair of 8 bit pot counters.
+ (4 counters total). The bit assignment for both
+ addresses is shown below. The counters are stopped by signals
+ from 2 controller connectors (left-right) with 2 pins each.
+
+ BIT# 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+ RIGHT Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 X7 X6 X5 X4 X3 X2 X1 X0
+ LEFT Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 X7 X6 X5 X4 X3 X2 X1 X0
+
+ +--------------------------+-------+
+ | CONNECTORS | PAULA |
+ +-------+------+-----+-----+-------+
+ | Loc. | Dir. | Sym | pin | pin |
+ +-------+------+-----+-----+-------+
+ | RIGHT | Y | RX | 9 | 33 |
+ | RIGHT | X | RX | 5 | 32 |
+ | LEFT | Y | LY | 9 | 36 |
+ | LEFT | X | LX | 5 | 35 |
+ +-------+------+-----+-----+-------+
+
+ With normal (NTSC or PAL) horiz. line rate, the pots will
+ give a full scale (FF) reading with about 500kohms in one
+ frame time. With proportionally faster horiz line times,
+ the counters will count proportionally faster.
+ This should be noted when doing variable beam displays.
+
+-------------------------------------------------------------------------------
+
+NAME rev ADDR type chip Description
+POTGO 034 W Paula Pot port (4 bit) bi-direction and data, and pot counter start.
+
+-------------------------------------------------------------------------------
+
+NAME rev ADDR type chip Description
+POTINP 016 R Paula Pot pin data read
+
+ This register controls a 4 bit bi-direction I/O port
+ that shares the same 4 pins as the 4 pot counters above.
+
+ +-------+----------+---------------------------------------------+
+ | BIT# | FUNCTION | DESCRIPTION |
+ +-------+----------+---------------------------------------------+
+ | 15 | OUTRY | Output enable for Paula pin 33 |
+ | 14 | DATRY | I/O data Paula pin 33 |
+ | 13 | OUTRX | Output enable for Paula pin 32 |
+ | 12 | DATRX | I/O data Paula pin 32 |
+ | 11 | OUTLY | Out put enable for Paula pin 36 |
+ | 10 | DATLY | I/O data Paula pin 36 |
+ | 09 | OUTLX | Output enable for Paula pin 35 |
+ | 08 | DATLX | I/O data Paula pin 35 |
+ | 07-01 | X | Not used |
+ | 00 | START | Start pots (dump capacitors,start counters) |
+ +-------+----------+---------------------------------------------+
+
+-------------------------------------------------------------------------------
diff --git a/Documentation/input/appletouch.txt b/Documentation/input/appletouch.txt
new file mode 100644
index 0000000..4f7c633
--- /dev/null
+++ b/Documentation/input/appletouch.txt
@@ -0,0 +1,85 @@
+Apple Touchpad Driver (appletouch)
+----------------------------------
+ Copyright (C) 2005 Stelian Pop <stelian@popies.net>
+
+appletouch is a Linux kernel driver for the USB touchpad found on post
+February 2005 and October 2005 Apple Aluminium Powerbooks.
+
+This driver is derived from Johannes Berg's appletrackpad driver[1], but it has
+been improved in some areas:
+ * appletouch is a full kernel driver, no userspace program is necessary
+ * appletouch can be interfaced with the synaptics X11 driver, in order
+ to have touchpad acceleration, scrolling, etc.
+
+Credits go to Johannes Berg for reverse-engineering the touchpad protocol,
+Frank Arnold for further improvements, and Alex Harper for some additional
+information about the inner workings of the touchpad sensors. Michael
+Hanselmann added support for the October 2005 models.
+
+Usage:
+------
+
+In order to use the touchpad in the basic mode, compile the driver and load
+the module. A new input device will be detected and you will be able to read
+the mouse data from /dev/input/mice (using gpm, or X11).
+
+In X11, you can configure the touchpad to use the synaptics X11 driver, which
+will give additional functionalities, like acceleration, scrolling, 2 finger
+tap for middle button mouse emulation, 3 finger tap for right button mouse
+emulation, etc. In order to do this, make sure you're using a recent version of
+the synaptics driver (tested with 0.14.2, available from [2]), and configure a
+new input device in your X11 configuration file (take a look below for an
+example). For additional configuration, see the synaptics driver documentation.
+
+ Section "InputDevice"
+ Identifier "Synaptics Touchpad"
+ Driver "synaptics"
+ Option "SendCoreEvents" "true"
+ Option "Device" "/dev/input/mice"
+ Option "Protocol" "auto-dev"
+ Option "LeftEdge" "0"
+ Option "RightEdge" "850"
+ Option "TopEdge" "0"
+ Option "BottomEdge" "645"
+ Option "MinSpeed" "0.4"
+ Option "MaxSpeed" "1"
+ Option "AccelFactor" "0.02"
+ Option "FingerLow" "0"
+ Option "FingerHigh" "30"
+ Option "MaxTapMove" "20"
+ Option "MaxTapTime" "100"
+ Option "HorizScrollDelta" "0"
+ Option "VertScrollDelta" "30"
+ Option "SHMConfig" "on"
+ EndSection
+
+ Section "ServerLayout"
+ ...
+ InputDevice "Mouse"
+ InputDevice "Synaptics Touchpad"
+ ...
+ EndSection
+
+Fuzz problems:
+--------------
+
+The touchpad sensors are very sensitive to heat, and will generate a lot of
+noise when the temperature changes. This is especially true when you power-on
+the laptop for the first time.
+
+The appletouch driver tries to handle this noise and auto adapt itself, but it
+is not perfect. If finger movements are not recognized anymore, try reloading
+the driver.
+
+You can activate debugging using the 'debug' module parameter. A value of 0
+deactivates any debugging, 1 activates tracing of invalid samples, 2 activates
+full tracing (each sample is being traced):
+ modprobe appletouch debug=1
+ or
+ echo "1" > /sys/module/appletouch/parameters/debug
+
+Links:
+------
+
+[1]: http://johannes.sipsolutions.net/PowerBook/touchpad/
+[2]: http://web.telia.com/~u89404340/touchpad/index.html
diff --git a/Documentation/input/atarikbd.txt b/Documentation/input/atarikbd.txt
new file mode 100644
index 0000000..f3a3ba8
--- /dev/null
+++ b/Documentation/input/atarikbd.txt
@@ -0,0 +1,709 @@
+Intelligent Keyboard (ikbd) Protocol
+
+
+1. Introduction
+
+The Atari Corp. Intelligent Keyboard (ikbd) is a general purpose keyboard
+controller that is flexible enough that it can be used in a variety of
+products without modification. The keyboard, with its microcontroller,
+provides a convenient connection point for a mouse and switch-type joysticks.
+The ikbd processor also maintains a time-of-day clock with one second
+resolution.
+The ikbd has been designed to be general enough that it can be used with a
+variety of new computer products. Product variations in a number of
+keyswitches, mouse resolution, etc. can be accommodated.
+The ikbd communicates with the main processor over a high speed bi-directional
+serial interface. It can function in a variety of modes to facilitate
+different applications of the keyboard, joysticks, or mouse. Limited use of
+the controller is possible in applications in which only a unidirectional
+communications medium is available by carefully designing the default modes.
+
+3. Keyboard
+
+The keyboard always returns key make/break scan codes. The ikbd generates
+keyboard scan codes for each key press and release. The key scan make (key
+closure) codes start at 1, and are defined in Appendix A. For example, the
+ISO key position in the scan code table should exist even if no keyswitch
+exists in that position on a particular keyboard. The break code for each key
+is obtained by ORing 0x80 with the make code.
+
+The special codes 0xF6 through 0xFF are reserved for use as follows:
+ 0xF6 status report
+ 0xF7 absolute mouse position record
+ 0xF8-0xFB relative mouse position records (lsbs determined by
+ mouse button states)
+ 0xFC time-of-day
+ 0xFD joystick report (both sticks)
+ 0xFE joystick 0 event
+ 0xFF joystick 1 event
+
+The two shift keys return different scan codes in this mode. The ENTER key
+and the RETurn key are also distinct.
+
+4. Mouse
+
+The mouse port should be capable of supporting a mouse with resolution of
+approximately 200 counts (phase changes or 'clicks') per inch of travel. The
+mouse should be scanned at a rate that will permit accurate tracking at
+velocities up to 10 inches per second.
+The ikbd can report mouse motion in three distinctly different ways. It can
+report relative motion, absolute motion in a coordinate system maintained
+within the ikbd, or by converting mouse motion into keyboard cursor control
+key equivalents.
+The mouse buttons can be treated as part of the mouse or as additional
+keyboard keys.
+
+4.1 Relative Position Reporting
+
+In relative position mode, the ikbd will return relative mouse position
+records whenever a mouse event occurs. A mouse event consists of a mouse
+button being pressed or released, or motion in either axis exceeding a
+settable threshold of motion. Regardless of the threshold, all bits of
+resolution are returned to the host computer.
+Note that the ikbd may return mouse relative position reports with
+significantly more than the threshold delta x or y. This may happen since no
+relative mouse motion events will be generated: (a) while the keyboard has
+been 'paused' ( the event will be stored until keyboard communications is
+resumed) (b) while any event is being transmitted.
+
+The relative mouse position record is a three byte record of the form
+(regardless of keyboard mode):
+ %111110xy ; mouse position record flag
+ ; where y is the right button state
+ ; and x is the left button state
+ X ; delta x as twos complement integer
+ Y ; delta y as twos complement integer
+
+Note that the value of the button state bits should be valid even if the
+MOUSE BUTTON ACTION has set the buttons to act like part of the keyboard.
+If the accumulated motion before the report packet is generated exceeds the
++127...-128 range, the motion is broken into multiple packets.
+Note that the sign of the delta y reported is a function of the Y origin
+selected.
+
+4.2 Absolute Position reporting
+
+The ikbd can also maintain absolute mouse position. Commands exist for
+resetting the mouse position, setting X/Y scaling, and interrogating the
+current mouse position.
+
+4.3 Mouse Cursor Key Mode
+
+The ikbd can translate mouse motion into the equivalent cursor keystrokes.
+The number of mouse clicks per keystroke is independently programmable in
+each axis. The ikbd internally maintains mouse motion information to the
+highest resolution available, and merely generates a pair of cursor key events
+for each multiple of the scale factor.
+Mouse motion produces the cursor key make code immediately followed by the
+break code for the appropriate cursor key. The mouse buttons produce scan
+codes above those normally assigned for the largest envisioned keyboard (i.e.
+LEFT=0x74 & RIGHT=0x75).
+
+5. Joystick
+
+5.1 Joystick Event Reporting
+
+In this mode, the ikbd generates a record whenever the joystick position is
+changed (i.e. for each opening or closing of a joystick switch or trigger).
+
+The joystick event record is two bytes of the form:
+ %1111111x ; Joystick event marker
+ ; where x is Joystick 0 or 1
+ %x000yyyy ; where yyyy is the stick position
+ ; and x is the trigger
+
+5.2 Joystick Interrogation
+
+The current state of the joystick ports may be interrogated at any time in
+this mode by sending an 'Interrogate Joystick' command to the ikbd.
+
+The ikbd response to joystick interrogation is a three byte report of the form
+ 0xFD ; joystick report header
+ %x000yyyy ; Joystick 0
+ %x000yyyy ; Joystick 1
+ ; where x is the trigger
+ ; and yyy is the stick position
+
+5.3 Joystick Monitoring
+
+A mode is available that devotes nearly all of the keyboard communications
+time to reporting the state of the joystick ports at a user specifiable rate.
+It remains in this mode until reset or commanded into another mode. The PAUSE
+command in this mode not only stop the output but also temporarily stops
+scanning the joysticks (samples are not queued).
+
+5.4 Fire Button Monitoring
+
+A mode is provided to permit monitoring a single input bit at a high rate. In
+this mode the ikbd monitors the state of the Joystick 1 fire button at the
+maximum rate permitted by the serial communication channel. The data is packed
+8 bits per byte for transmission to the host. The ikbd remains in this mode
+until reset or commanded into another mode. The PAUSE command in this mode not
+only stops the output but also temporarily stops scanning the button (samples
+are not queued).
+
+5.5 Joystick Key Code Mode
+
+The ikbd may be commanded to translate the use of either joystick into the
+equivalent cursor control keystroke(s). The ikbd provides a single breakpoint
+velocity joystick cursor.
+Joystick events produce the make code, immediately followed by the break code
+for the appropriate cursor motion keys. The trigger or fire buttons of the
+joysticks produce pseudo key scan codes above those used by the largest key
+matrix envisioned (i.e. JOYSTICK0=0x74, JOYSTICK1=0x75).
+
+6. Time-of-Day Clock
+
+The ikbd also maintains a time-of-day clock for the system. Commands are
+available to set and interrogate the timer-of-day clock. Time-keeping is
+maintained down to a resolution of one second.
+
+7. Status Inquiries
+
+The current state of ikbd modes and parameters may be found by sending status
+inquiry commands that correspond to the ikbd set commands.
+
+8. Power-Up Mode
+
+The keyboard controller will perform a simple self-test on power-up to detect
+major controller faults (ROM checksum and RAM test) and such things as stuck
+keys. Any keys down at power-up are presumed to be stuck, and their BREAK
+(sic) code is returned (which without the preceding MAKE code is a flag for a
+keyboard error). If the controller self-test completes without error, the code
+0xF0 is returned. (This code will be used to indicate the version/release of
+the ikbd controller. The first release of the ikbd is version 0xF0, should
+there be a second release it will be 0xF1, and so on.)
+The ikbd defaults to a mouse position reporting with threshold of 1 unit in
+either axis and the Y=0 origin at the top of the screen, and joystick event
+reporting mode for joystick 1, with both buttons being logically assigned to
+the mouse. After any joystick command, the ikbd assumes that joysticks are
+connected to both Joystick0 and Joystick1. Any mouse command (except MOUSE
+DISABLE) then causes port 0 to again be scanned as if it were a mouse, and
+both buttons are logically connected to it. If a mouse disable command is
+received while port 0 is presumed to be a mouse, the button is logically
+assigned to Joystick1 (until the mouse is reenabled by another mouse command).
+
+9. ikbd Command Set
+
+This section contains a list of commands that can be sent to the ikbd. Command
+codes (such as 0x00) which are not specified should perform no operation
+(NOPs).
+
+9.1 RESET
+
+ 0x80
+ 0x01
+
+N.B. The RESET command is the only two byte command understood by the ikbd.
+Any byte following an 0x80 command byte other than 0x01 is ignored (and causes
+the 0x80 to be ignored).
+A reset may also be caused by sending a break lasting at least 200mS to the
+ikbd.
+Executing the RESET command returns the keyboard to its default (power-up)
+mode and parameter settings. It does not affect the time-of-day clock.
+The RESET command or function causes the ikbd to perform a simple self-test.
+If the test is successful, the ikbd will send the code of 0xF0 within 300mS
+of receipt of the RESET command (or the end of the break, or power-up). The
+ikbd will then scan the key matrix for any stuck (closed) keys. Any keys found
+closed will cause the break scan code to be generated (the break code arriving
+without being preceded by the make code is a flag for a key matrix error).
+
+9.2. SET MOUSE BUTTON ACTION
+
+ 0x07
+ %00000mss ; mouse button action
+ ; (m is presumed = 1 when in MOUSE KEYCODE mode)
+ ; mss=0xy, mouse button press or release causes mouse
+ ; position report
+ ; where y=1, mouse key press causes absolute report
+ ; and x=1, mouse key release causes absolute report
+ ; mss=100, mouse buttons act like keys
+
+This command sets how the ikbd should treat the buttons on the mouse. The
+default mouse button action mode is %00000000, the buttons are treated as part
+of the mouse logically.
+When buttons act like keys, LEFT=0x74 & RIGHT=0x75.
+
+9.3 SET RELATIVE MOUSE POSITION REPORTING
+
+ 0x08
+
+Set relative mouse position reporting. (DEFAULT) Mouse position packets are
+generated asynchronously by the ikbd whenever motion exceeds the setable
+threshold in either axis (see SET MOUSE THRESHOLD). Depending upon the mouse
+key mode, mouse position reports may also be generated when either mouse
+button is pressed or released. Otherwise the mouse buttons behave as if they
+were keyboard keys.
+
+9.4 SET ABSOLUTE MOUSE POSITIONING
+
+ 0x09
+ XMSB ; X maximum (in scaled mouse clicks)
+ XLSB
+ YMSB ; Y maximum (in scaled mouse clicks)
+ YLSB
+
+Set absolute mouse position maintenance. Resets the ikbd maintained X and Y
+coordinates.
+In this mode, the value of the internally maintained coordinates does NOT wrap
+between 0 and large positive numbers. Excess motion below 0 is ignored. The
+command sets the maximum positive value that can be attained in the scaled
+coordinate system. Motion beyond that value is also ignored.
+
+9.5 SET MOUSE KEYCODE MOSE
+
+ 0x0A
+ deltax ; distance in X clicks to return (LEFT) or (RIGHT)
+ deltay ; distance in Y clicks to return (UP) or (DOWN)
+
+Set mouse monitoring routines to return cursor motion keycodes instead of
+either RELATIVE or ABSOLUTE motion records. The ikbd returns the appropriate
+cursor keycode after mouse travel exceeding the user specified deltas in
+either axis. When the keyboard is in key scan code mode, mouse motion will
+cause the make code immediately followed by the break code. Note that this
+command is not affected by the mouse motion origin.
+
+9..6 SET MOUSE THRESHOLD
+
+ 0x0B
+ X ; x threshold in mouse ticks (positive integers)
+ Y ; y threshold in mouse ticks (positive integers)
+
+This command sets the threshold before a mouse event is generated. Note that
+it does NOT affect the resolution of the data returned to the host. This
+command is valid only in RELATIVE MOUSE POSITIONING mode. The thresholds
+default to 1 at RESET (or power-up).
+
+9.7 SET MOUSE SCALE
+
+ 0x0C
+ X ; horizontal mouse ticks per internal X
+ Y ; vertical mouse ticks per internal Y
+
+This command sets the scale factor for the ABSOLUTE MOUSE POSITIONING mode.
+In this mode, the specified number of mouse phase changes ('clicks') must
+occur before the internally maintained coordinate is changed by one
+(independently scaled for each axis). Remember that the mouse position
+information is available only by interrogating the ikbd in the ABSOLUTE MOUSE
+POSITIONING mode unless the ikbd has been commanded to report on button press
+or release (see SET MOSE BUTTON ACTION).
+
+9.8 INTERROGATE MOUSE POSITION
+
+ 0x0D
+ Returns:
+ 0xF7 ; absolute mouse position header
+ BUTTONS
+ 0000dcba ; where a is right button down since last interrogation
+ ; b is right button up since last
+ ; c is left button down since last
+ ; d is left button up since last
+ XMSB ; X coordinate
+ XLSB
+ YMSB ; Y coordinate
+ YLSB
+
+The INTERROGATE MOUSE POSITION command is valid when in the ABSOLUTE MOUSE
+POSITIONING mode, regardless of the setting of the MOUSE BUTTON ACTION.
+
+9.9 LOAD MOUSE POSITION
+
+ 0x0E
+ 0x00 ; filler
+ XMSB ; X coordinate
+ XLSB ; (in scaled coordinate system)
+ YMSB ; Y coordinate
+ YLSB
+
+This command allows the user to preset the internally maintained absolute
+mouse position.
+
+9.10 SET Y=0 AT BOTTOM
+
+ 0x0F
+
+This command makes the origin of the Y axis to be at the bottom of the
+logical coordinate system internal to the ikbd for all relative or absolute
+mouse motion. This causes mouse motion toward the user to be negative in sign
+and away from the user to be positive.
+
+9.11 SET Y=0 AT TOP
+
+ 0x10
+
+Makes the origin of the Y axis to be at the top of the logical coordinate
+system within the ikbd for all relative or absolute mouse motion. (DEFAULT)
+This causes mouse motion toward the user to be positive in sign and away from
+the user to be negative.
+
+9.12 RESUME
+
+ 0x11
+
+Resume sending data to the host. Since any command received by the ikbd after
+its output has been paused also causes an implicit RESUME this command can be
+thought of as a NO OPERATION command. If this command is received by the ikbd
+and it is not PAUSED, it is simply ignored.
+
+9.13 DISABLE MOUSE
+
+ 0x12
+
+All mouse event reporting is disabled (and scanning may be internally
+disabled). Any valid mouse mode command resumes mouse motion monitoring. (The
+valid mouse mode commands are SET RELATIVE MOUSE POSITION REPORTING, SET
+ABSOLUTE MOUSE POSITIONING, and SET MOUSE KEYCODE MODE. )
+N.B. If the mouse buttons have been commanded to act like keyboard keys, this
+command DOES affect their actions.
+
+9.14 PAUSE OUTPUT
+
+ 0x13
+
+Stop sending data to the host until another valid command is received. Key
+matrix activity is still monitored and scan codes or ASCII characters enqueued
+(up to the maximum supported by the microcontroller) to be sent when the host
+allows the output to be resumed. If in the JOYSTICK EVENT REPORTING mode,
+joystick events are also queued.
+Mouse motion should be accumulated while the output is paused. If the ikbd is
+in RELATIVE MOUSE POSITIONING REPORTING mode, motion is accumulated beyond the
+normal threshold limits to produce the minimum number of packets necessary for
+transmission when output is resumed. Pressing or releasing either mouse button
+causes any accumulated motion to be immediately queued as packets, if the
+mouse is in RELATIVE MOUSE POSITION REPORTING mode.
+Because of the limitations of the microcontroller memory this command should
+be used sparingly, and the output should not be shut of for more than <tbd>
+milliseconds at a time.
+The output is stopped only at the end of the current 'even'. If the PAUSE
+OUTPUT command is received in the middle of a multiple byte report, the packet
+will still be transmitted to conclusion and then the PAUSE will take effect.
+When the ikbd is in either the JOYSTICK MONITORING mode or the FIRE BUTTON
+MONITORING mode, the PAUSE OUTPUT command also temporarily stops the
+monitoring process (i.e. the samples are not enqueued for transmission).
+
+0.15 SET JOYSTICK EVENT REPORTING
+
+ 0x14
+
+Enter JOYSTICK EVENT REPORTING mode (DEFAULT). Each opening or closure of a
+joystick switch or trigger causes a joystick event record to be generated.
+
+9.16 SET JOYSTICK INTERROGATION MODE
+
+ 0x15
+
+Disables JOYSTICK EVENT REPORTING. Host must send individual JOYSTICK
+INTERROGATE commands to sense joystick state.
+
+9.17 JOYSTICK INTERROGATE
+
+ 0x16
+
+Return a record indicating the current state of the joysticks. This command
+is valid in either the JOYSTICK EVENT REPORTING mode or the JOYSTICK
+INTERROGATION MODE.
+
+9.18 SET JOYSTICK MONITORING
+
+ 0x17
+ rate ; time between samples in hundredths of a second
+ Returns: (in packets of two as long as in mode)
+ %000000xy ; where y is JOYSTICK1 Fire button
+ ; and x is JOYSTICK0 Fire button
+ %nnnnmmmm ; where m is JOYSTICK1 state
+ ; and n is JOYSTICK0 state
+
+Sets the ikbd to do nothing but monitor the serial command line, maintain the
+time-of-day clock, and monitor the joystick. The rate sets the interval
+between joystick samples.
+N.B. The user should not set the rate higher than the serial communications
+channel will allow the 2 bytes packets to be transmitted.
+
+9.19 SET FIRE BUTTON MONITORING
+
+ 0x18
+ Returns: (as long as in mode)
+ %bbbbbbbb ; state of the JOYSTICK1 fire button packed
+ ; 8 bits per byte, the first sample if the MSB
+
+Set the ikbd to do nothing but monitor the serial command line, maintain the
+time-of-day clock, and monitor the fire button on Joystick 1. The fire button
+is scanned at a rate that causes 8 samples to be made in the time it takes for
+the previous byte to be sent to the host (i.e. scan rate = 8/10 * baud rate).
+The sample interval should be as constant as possible.
+
+9.20 SET JOYSTICK KEYCODE MODE
+
+ 0x19
+ RX ; length of time (in tenths of seconds) until
+ ; horizontal velocity breakpoint is reached
+ RY ; length of time (in tenths of seconds) until
+ ; vertical velocity breakpoint is reached
+ TX ; length (in tenths of seconds) of joystick closure
+ ; until horizontal cursor key is generated before RX
+ ; has elapsed
+ TY ; length (in tenths of seconds) of joystick closure
+ ; until vertical cursor key is generated before RY
+ ; has elapsed
+ VX ; length (in tenths of seconds) of joystick closure
+ ; until horizontal cursor keystrokes are generated
+ ; after RX has elapsed
+ VY ; length (in tenths of seconds) of joystick closure
+ ; until vertical cursor keystrokes are generated
+ ; after RY has elapsed
+
+In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes.
+On initial closure, a keystroke pair (make/break) is generated. Then up to Rn
+tenths of seconds later, keystroke pairs are generated every Tn tenths of
+seconds. After the Rn breakpoint is reached, keystroke pairs are generated
+every Vn tenths of seconds. This provides a velocity (auto-repeat) breakpoint
+feature.
+Note that by setting RX and/or Ry to zero, the velocity feature can be
+disabled. The values of TX and TY then become meaningless, and the generation
+of cursor 'keystrokes' is set by VX and VY.
+
+9.21 DISABLE JOYSTICKS
+
+ 0x1A
+
+Disable the generation of any joystick events (and scanning may be internally
+disabled). Any valid joystick mode command resumes joystick monitoring. (The
+joystick mode commands are SET JOYSTICK EVENT REPORTING, SET JOYSTICK
+INTERROGATION MODE, SET JOYSTICK MONITORING, SET FIRE BUTTON MONITORING, and
+SET JOYSTICK KEYCODE MODE.)
+
+9.22 TIME-OF-DAY CLOCK SET
+
+ 0x1B
+ YY ; year (2 least significant digits)
+ MM ; month
+ DD ; day
+ hh ; hour
+ mm ; minute
+ ss ; second
+
+All time-of-day data should be sent to the ikbd in packed BCD format.
+Any digit that is not a valid BCD digit should be treated as a 'don't care'
+and not alter that particular field of the date or time. This permits setting
+only some subfields of the time-of-day clock.
+
+9.23 INTERROGATE TIME-OF-DAT CLOCK
+
+ 0x1C
+ Returns:
+ 0xFC ; time-of-day event header
+ YY ; year (2 least significant digits)
+ MM ; month
+ DD ; day
+ hh ; hour
+ mm ; minute
+ ss ; second
+
+ All time-of-day is sent in packed BCD format.
+
+9.24 MEMORY LOAD
+
+ 0x20
+ ADRMSB ; address in controller
+ ADRLSB ; memory to be loaded
+ NUM ; number of bytes (0-128)
+ { data }
+
+This command permits the host to load arbitrary values into the ikbd
+controller memory. The time between data bytes must be less than 20ms.
+
+9.25 MEMORY READ
+
+ 0x21
+ ADRMSB ; address in controller
+ ADRLSB ; memory to be read
+ Returns:
+ 0xF6 ; status header
+ 0x20 ; memory access
+ { data } ; 6 data bytes starting at ADR
+
+This command permits the host to read from the ikbd controller memory.
+
+9.26 CONTROLLER EXECUTE
+
+ 0x22
+ ADRMSB ; address of subroutine in
+ ADRLSB ; controller memory to be called
+
+This command allows the host to command the execution of a subroutine in the
+ikbd controller memory.
+
+9.27 STATUS INQUIRIES
+
+ Status commands are formed by inclusively ORing 0x80 with the
+ relevant SET command.
+
+ Example:
+ 0x88 (or 0x89 or 0x8A) ; request mouse mode
+ Returns:
+ 0xF6 ; status response header
+ mode ; 0x08 is RELATIVE
+ ; 0x09 is ABSOLUTE
+ ; 0x0A is KEYCODE
+ param1 ; 0 is RELATIVE
+ ; XMSB maximum if ABSOLUTE
+ ; DELTA X is KEYCODE
+ param2 ; 0 is RELATIVE
+ ; YMSB maximum if ABSOLUTE
+ ; DELTA Y is KEYCODE
+ param3 ; 0 if RELATIVE
+ ; or KEYCODE
+ ; YMSB is ABSOLUTE
+ param4 ; 0 if RELATIVE
+ ; or KEYCODE
+ ; YLSB is ABSOLUTE
+ 0 ; pad
+ 0
+
+The STATUS INQUIRY commands request the ikbd to return either the current mode
+or the parameters associated with a given command. All status reports are
+padded to form 8 byte long return packets. The responses to the status
+requests are designed so that the host may store them away (after stripping
+off the status report header byte) and later send them back as commands to
+ikbd to restore its state. The 0 pad bytes will be treated as NOPs by the
+ikbd.
+
+ Valid STATUS INQUIRY commands are:
+
+ 0x87 mouse button action
+ 0x88 mouse mode
+ 0x89
+ 0x8A
+ 0x8B mnouse threshold
+ 0x8C mouse scale
+ 0x8F mouse vertical coordinates
+ 0x90 ( returns 0x0F Y=0 at bottom
+ 0x10 Y=0 at top )
+ 0x92 mouse enable/disable
+ ( returns 0x00 enabled)
+ 0x12 disabled )
+ 0x94 joystick mode
+ 0x95
+ 0x96
+ 0x9A joystick enable/disable
+ ( returns 0x00 enabled
+ 0x1A disabled )
+
+It is the (host) programmer's responsibility to have only one unanswered
+inquiry in process at a time.
+STATUS INQUIRY commands are not valid if the ikbd is in JOYSTICK MONITORING
+mode or FIRE BUTTON MONITORING mode.
+
+
+10. SCAN CODES
+
+The key scan codes returned by the ikbd are chosen to simplify the
+implementation of GSX.
+
+GSX Standard Keyboard Mapping.
+
+Hex Keytop
+01 Esc
+02 1
+03 2
+04 3
+05 4
+06 5
+07 6
+08 7
+09 8
+0A 9
+0B 0
+0C -
+0D ==
+0E BS
+0F TAB
+10 Q
+11 W
+12 E
+13 R
+14 T
+15 Y
+16 U
+17 I
+18 O
+19 P
+1A [
+1B ]
+1C RET
+1D CTRL
+1E A
+1F S
+20 D
+21 F
+22 G
+23 H
+24 J
+25 K
+26 L
+27 ;
+28 '
+29 `
+2A (LEFT) SHIFT
+2B \
+2C Z
+2D X
+2E C
+2F V
+30 B
+31 N
+32 M
+33 ,
+34 .
+35 /
+36 (RIGHT) SHIFT
+37 { NOT USED }
+38 ALT
+39 SPACE BAR
+3A CAPS LOCK
+3B F1
+3C F2
+3D F3
+3E F4
+3F F5
+40 F6
+41 F7
+42 F8
+43 F9
+44 F10
+45 { NOT USED }
+46 { NOT USED }
+47 HOME
+48 UP ARROW
+49 { NOT USED }
+4A KEYPAD -
+4B LEFT ARROW
+4C { NOT USED }
+4D RIGHT ARROW
+4E KEYPAD +
+4F { NOT USED }
+50 DOWN ARROW
+51 { NOT USED }
+52 INSERT
+53 DEL
+54 { NOT USED }
+5F { NOT USED }
+60 ISO KEY
+61 UNDO
+62 HELP
+63 KEYPAD (
+64 KEYPAD /
+65 KEYPAD *
+66 KEYPAD *
+67 KEYPAD 7
+68 KEYPAD 8
+69 KEYPAD 9
+6A KEYPAD 4
+6B KEYPAD 5
+6C KEYPAD 6
+6D KEYPAD 1
+6E KEYPAD 2
+6F KEYPAD 3
+70 KEYPAD 0
+71 KEYPAD .
+72 KEYPAD ENTER
diff --git a/Documentation/input/cd32.txt b/Documentation/input/cd32.txt
new file mode 100644
index 0000000..a003d9b
--- /dev/null
+++ b/Documentation/input/cd32.txt
@@ -0,0 +1,19 @@
+I have written a small patch that let's me use my Amiga CD32
+joypad connected to the parallel port. Thought I'd share it with you so
+you can add it to the list of supported joysticks (hopefully someone will
+find it useful).
+
+It needs the following wiring:
+
+CD32 pad | Parallel port
+----------------------------
+1 (Up) | 2 (D0)
+2 (Down) | 3 (D1)
+3 (Left) | 4 (D2)
+4 (Right) | 5 (D3)
+5 (Fire3) | 14 (AUTOFD)
+6 (Fire1) | 17 (SELIN)
+7 (+5V) | 1 (STROBE)
+8 (Gnd) | 18 (Gnd)
+9 (Fire2) | 7 (D5)
+
diff --git a/Documentation/input/cs461x.txt b/Documentation/input/cs461x.txt
new file mode 100644
index 0000000..202e9db
--- /dev/null
+++ b/Documentation/input/cs461x.txt
@@ -0,0 +1,45 @@
+Preface.
+
+This is a new low-level driver to support analog joystick attached to
+Crystal SoundFusion CS4610/CS4612/CS4615. This code is based upon
+Vortex/Solo drivers as an example of decoration style, and ALSA
+0.5.8a kernel drivers as an chipset documentation and samples.
+
+This version does not have cooked mode support; the basic code
+is present here, but have not tested completely. The button analysis
+is completed in this mode, but the axis movement is not.
+
+Raw mode works fine with analog joystick front-end driver and cs461x
+driver as a backend. I've tested this driver with CS4610, 4-axis and
+4-button joystick; I mean the jstest utility. Also I've tried to
+play in xracer game using joystick, and the result is better than
+keyboard only mode.
+
+The sensitivity and calibrate quality have not been tested; the two
+reasons are performed: the same hardware cannot work under Win95 (blue
+screen in VJOYD); I have no documentation on my chip; and the existing
+behavior in my case was not raised the requirement of joystick calibration.
+So the driver have no code to perform hardware related calibration.
+
+The patch contains minor changes of Config.in and Makefile files. All
+needed code have been moved to one separate file cs461x.c like ns558.c
+This driver have the basic support for PCI devices only; there is no
+ISA or PnP ISA cards supported. AFAIK the ns558 have support for Crystal
+ISA and PnP ISA series.
+
+The driver works with ALSA drivers simultaneously. For example, the xracer
+uses joystick as input device and PCM device as sound output in one time.
+There are no sound or input collisions detected. The source code have
+comments about them; but I've found the joystick can be initialized
+separately of ALSA modules. So, you can use only one joystick driver
+without ALSA drivers. The ALSA drivers are not needed to compile or
+run this driver.
+
+There are no debug information print have been placed in source, and no
+specific options required to work this driver. The found chipset parameters
+are printed via printk(KERN_INFO "..."), see the /var/log/messages to
+inspect cs461x: prefixed messages to determine possible card detection
+errors.
+
+Regards,
+Viktor
diff --git a/Documentation/input/elantech.txt b/Documentation/input/elantech.txt
new file mode 100644
index 0000000..a10c3b6
--- /dev/null
+++ b/Documentation/input/elantech.txt
@@ -0,0 +1,405 @@
+Elantech Touchpad Driver
+========================
+
+ Copyright (C) 2007-2008 Arjan Opmeer <arjan@opmeer.net>
+
+ Extra information for hardware version 1 found and
+ provided by Steve Havelka
+
+ Version 2 (EeePC) hardware support based on patches
+ received from Woody at Xandros and forwarded to me
+ by user StewieGriffin at the eeeuser.com forum
+
+
+Contents
+~~~~~~~~
+
+ 1. Introduction
+ 2. Extra knobs
+ 3. Hardware version 1
+ 3.1 Registers
+ 3.2 Native relative mode 4 byte packet format
+ 3.3 Native absolute mode 4 byte packet format
+ 4. Hardware version 2
+ 4.1 Registers
+ 4.2 Native absolute mode 6 byte packet format
+ 4.2.1 One finger touch
+ 4.2.2 Two finger touch
+
+
+
+1. Introduction
+ ~~~~~~~~~~~~
+
+Currently the Linux Elantech touchpad driver is aware of two different
+hardware versions unimaginatively called version 1 and version 2. Version 1
+is found in "older" laptops and uses 4 bytes per packet. Version 2 seems to
+be introduced with the EeePC and uses 6 bytes per packet.
+
+The driver tries to support both hardware versions and should be compatible
+with the Xorg Synaptics touchpad driver and its graphical configuration
+utilities.
+
+Additionally the operation of the touchpad can be altered by adjusting the
+contents of some of its internal registers. These registers are represented
+by the driver as sysfs entries under /sys/bus/serio/drivers/psmouse/serio?
+that can be read from and written to.
+
+Currently only the registers for hardware version 1 are somewhat understood.
+Hardware version 2 seems to use some of the same registers but it is not
+known whether the bits in the registers represent the same thing or might
+have changed their meaning.
+
+On top of that, some register settings have effect only when the touchpad is
+in relative mode and not in absolute mode. As the Linux Elantech touchpad
+driver always puts the hardware into absolute mode not all information
+mentioned below can be used immediately. But because there is no freely
+available Elantech documentation the information is provided here anyway for
+completeness sake.
+
+
+/////////////////////////////////////////////////////////////////////////////
+
+
+2. Extra knobs
+ ~~~~~~~~~~~
+
+Currently the Linux Elantech touchpad driver provides two extra knobs under
+/sys/bus/serio/drivers/psmouse/serio? for the user.
+
+* debug
+
+ Turn different levels of debugging ON or OFF.
+
+ By echoing "0" to this file all debugging will be turned OFF.
+
+ Currently a value of "1" will turn on some basic debugging and a value of
+ "2" will turn on packet debugging. For hardware version 1 the default is
+ OFF. For version 2 the default is "1".
+
+ Turning packet debugging on will make the driver dump every packet
+ received to the syslog before processing it. Be warned that this can
+ generate quite a lot of data!
+
+* paritycheck
+
+ Turns parity checking ON or OFF.
+
+ By echoing "0" to this file parity checking will be turned OFF. Any
+ non-zero value will turn it ON. For hardware version 1 the default is ON.
+ For version 2 the default it is OFF.
+
+ Hardware version 1 provides basic data integrity verification by
+ calculating a parity bit for the last 3 bytes of each packet. The driver
+ can check these bits and reject any packet that appears corrupted. Using
+ this knob you can bypass that check.
+
+ It is not known yet whether hardware version 2 provides the same parity
+ bits. Hence checking is disabled by default. Currently even turning it on
+ will do nothing.
+
+
+/////////////////////////////////////////////////////////////////////////////
+
+
+3. Hardware version 1
+ ==================
+
+3.1 Registers
+ ~~~~~~~~~
+
+By echoing a hexadecimal value to a register it contents can be altered.
+
+For example:
+
+ echo -n 0x16 > reg_10
+
+* reg_10
+
+ bit 7 6 5 4 3 2 1 0
+ B C T D L A S E
+
+ E: 1 = enable smart edges unconditionally
+ S: 1 = enable smart edges only when dragging
+ A: 1 = absolute mode (needs 4 byte packets, see reg_11)
+ L: 1 = enable drag lock (see reg_22)
+ D: 1 = disable dynamic resolution
+ T: 1 = disable tapping
+ C: 1 = enable corner tap
+ B: 1 = swap left and right button
+
+* reg_11
+
+ bit 7 6 5 4 3 2 1 0
+ 1 0 0 H V 1 F P
+
+ P: 1 = enable parity checking for relative mode
+ F: 1 = enable native 4 byte packet mode
+ V: 1 = enable vertical scroll area
+ H: 1 = enable horizontal scroll area
+
+* reg_20
+
+ single finger width?
+
+* reg_21
+
+ scroll area width (small: 0x40 ... wide: 0xff)
+
+* reg_22
+
+ drag lock time out (short: 0x14 ... long: 0xfe;
+ 0xff = tap again to release)
+
+* reg_23
+
+ tap make timeout?
+
+* reg_24
+
+ tap release timeout?
+
+* reg_25
+
+ smart edge cursor speed (0x02 = slow, 0x03 = medium, 0x04 = fast)
+
+* reg_26
+
+ smart edge activation area width?
+
+
+3.2 Native relative mode 4 byte packet format
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+byte 0:
+ bit 7 6 5 4 3 2 1 0
+ c c p2 p1 1 M R L
+
+ L, R, M = 1 when Left, Right, Middle mouse button pressed
+ some models have M as byte 3 odd parity bit
+ when parity checking is enabled (reg_11, P = 1):
+ p1..p2 = byte 1 and 2 odd parity bit
+ c = 1 when corner tap detected
+
+byte 1:
+ bit 7 6 5 4 3 2 1 0
+ dx7 dx6 dx5 dx4 dx3 dx2 dx1 dx0
+
+ dx7..dx0 = x movement; positive = right, negative = left
+ byte 1 = 0xf0 when corner tap detected
+
+byte 2:
+ bit 7 6 5 4 3 2 1 0
+ dy7 dy6 dy5 dy4 dy3 dy2 dy1 dy0
+
+ dy7..dy0 = y movement; positive = up, negative = down
+
+byte 3:
+ parity checking enabled (reg_11, P = 1):
+
+ bit 7 6 5 4 3 2 1 0
+ w h n1 n0 ds3 ds2 ds1 ds0
+
+ normally:
+ ds3..ds0 = scroll wheel amount and direction
+ positive = down or left
+ negative = up or right
+ when corner tap detected:
+ ds0 = 1 when top right corner tapped
+ ds1 = 1 when bottom right corner tapped
+ ds2 = 1 when bottom left corner tapped
+ ds3 = 1 when top left corner tapped
+ n1..n0 = number of fingers on touchpad
+ only models with firmware 2.x report this, models with
+ firmware 1.x seem to map one, two and three finger taps
+ directly to L, M and R mouse buttons
+ h = 1 when horizontal scroll action
+ w = 1 when wide finger touch?
+
+ otherwise (reg_11, P = 0):
+
+ bit 7 6 5 4 3 2 1 0
+ ds7 ds6 ds5 ds4 ds3 ds2 ds1 ds0
+
+ ds7..ds0 = vertical scroll amount and direction
+ negative = up
+ positive = down
+
+
+3.3 Native absolute mode 4 byte packet format
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+byte 0:
+ firmware version 1.x:
+
+ bit 7 6 5 4 3 2 1 0
+ D U p1 p2 1 p3 R L
+
+ L, R = 1 when Left, Right mouse button pressed
+ p1..p3 = byte 1..3 odd parity bit
+ D, U = 1 when rocker switch pressed Up, Down
+
+ firmware version 2.x:
+
+ bit 7 6 5 4 3 2 1 0
+ n1 n0 p2 p1 1 p3 R L
+
+ L, R = 1 when Left, Right mouse button pressed
+ p1..p3 = byte 1..3 odd parity bit
+ n1..n0 = number of fingers on touchpad
+
+byte 1:
+ firmware version 1.x:
+
+ bit 7 6 5 4 3 2 1 0
+ f 0 th tw x9 x8 y9 y8
+
+ tw = 1 when two finger touch
+ th = 1 when three finger touch
+ f = 1 when finger touch
+
+ firmware version 2.x:
+
+ bit 7 6 5 4 3 2 1 0
+ . . . . x9 x8 y9 y8
+
+byte 2:
+ bit 7 6 5 4 3 2 1 0
+ x7 x6 x5 x4 x3 x2 x1 x0
+
+ x9..x0 = absolute x value (horizontal)
+
+byte 3:
+ bit 7 6 5 4 3 2 1 0
+ y7 y6 y5 y4 y3 y2 y1 y0
+
+ y9..y0 = absolute y value (vertical)
+
+
+/////////////////////////////////////////////////////////////////////////////
+
+
+4. Hardware version 2
+ ==================
+
+
+4.1 Registers
+ ~~~~~~~~~
+
+By echoing a hexadecimal value to a register it contents can be altered.
+
+For example:
+
+ echo -n 0x56 > reg_10
+
+* reg_10
+
+ bit 7 6 5 4 3 2 1 0
+ 0 1 0 1 0 1 D 0
+
+ D: 1 = enable drag and drop
+
+* reg_11
+
+ bit 7 6 5 4 3 2 1 0
+ 1 0 0 0 S 0 1 0
+
+ S: 1 = enable vertical scroll
+
+* reg_21
+
+ unknown (0x00)
+
+* reg_22
+
+ drag and drop release time out (short: 0x70 ... long 0x7e;
+ 0x7f = never i.e. tap again to release)
+
+
+4.2 Native absolute mode 6 byte packet format
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+4.2.1 One finger touch
+ ~~~~~~~~~~~~~~~~
+
+byte 0:
+
+ bit 7 6 5 4 3 2 1 0
+ n1 n0 . . . . R L
+
+ L, R = 1 when Left, Right mouse button pressed
+ n1..n0 = numbers of fingers on touchpad
+
+byte 1:
+
+ bit 7 6 5 4 3 2 1 0
+ x15 x14 x13 x12 x11 x10 x9 x8
+
+byte 2:
+
+ bit 7 6 5 4 3 2 1 0
+ x7 x6 x5 x4 x4 x2 x1 x0
+
+ x15..x0 = absolute x value (horizontal)
+
+byte 3:
+
+ bit 7 6 5 4 3 2 1 0
+ . . . . . . . .
+
+byte 4:
+
+ bit 7 6 5 4 3 2 1 0
+ y15 y14 y13 y12 y11 y10 y8 y8
+
+byte 5:
+
+ bit 7 6 5 4 3 2 1 0
+ y7 y6 y5 y4 y3 y2 y1 y0
+
+ y15..y0 = absolute y value (vertical)
+
+
+4.2.2 Two finger touch
+ ~~~~~~~~~~~~~~~~
+
+byte 0:
+
+ bit 7 6 5 4 3 2 1 0
+ n1 n0 ay8 ax8 . . R L
+
+ L, R = 1 when Left, Right mouse button pressed
+ n1..n0 = numbers of fingers on touchpad
+
+byte 1:
+
+ bit 7 6 5 4 3 2 1 0
+ ax7 ax6 ax5 ax4 ax3 ax2 ax1 ax0
+
+ ax8..ax0 = first finger absolute x value
+
+byte 2:
+
+ bit 7 6 5 4 3 2 1 0
+ ay7 ay6 ay5 ay4 ay3 ay2 ay1 ay0
+
+ ay8..ay0 = first finger absolute y value
+
+byte 3:
+
+ bit 7 6 5 4 3 2 1 0
+ . . by8 bx8 . . . .
+
+byte 4:
+
+ bit 7 6 5 4 3 2 1 0
+ bx7 bx6 bx5 bx4 bx3 bx2 bx1 bx0
+
+ bx8..bx0 = second finger absolute x value
+
+byte 5:
+
+ bit 7 6 5 4 3 2 1 0
+ by7 by8 by5 by4 by3 by2 by1 by0
+
+ by8..by0 = second finger absolute y value
diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt
new file mode 100644
index 0000000..ded4d5f
--- /dev/null
+++ b/Documentation/input/ff.txt
@@ -0,0 +1,219 @@
+Force feedback for Linux.
+By Johann Deneux <johann.deneux@gmail.com> on 2001/04/22.
+Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09.
+You may redistribute this file. Please remember to include shape.fig and
+interactive.fig as well.
+----------------------------------------------------------------------------
+
+1. Introduction
+~~~~~~~~~~~~~~~
+This document describes how to use force feedback devices under Linux. The
+goal is not to support these devices as if they were simple input-only devices
+(as it is already the case), but to really enable the rendering of force
+effects.
+This document only describes the force feedback part of the Linux input
+interface. Please read joystick.txt and input.txt before reading further this
+document.
+
+2. Instructions to the user
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To enable force feedback, you have to:
+
+1. have your kernel configured with evdev and a driver that supports your
+ device.
+2. make sure evdev module is loaded and /dev/input/event* device files are
+ created.
+
+Before you start, let me WARN you that some devices shake violently during the
+initialisation phase. This happens for example with my "AVB Top Shot Pegasus".
+To stop this annoying behaviour, move you joystick to its limits. Anyway, you
+should keep a hand on your device, in order to avoid it to break down if
+something goes wrong.
+
+If you have a serial iforce device, you need to start inputattach. See
+joystick.txt for details.
+
+2.1 Does it work ?
+~~~~~~~~~~~~~~~~~~
+There is an utility called fftest that will allow you to test the driver.
+% fftest /dev/input/eventXX
+
+3. Instructions to the developer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+All interactions are done using the event API. That is, you can use ioctl()
+and write() on /dev/input/eventXX.
+This information is subject to change.
+
+3.1 Querying device capabilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#include <linux/input.h>
+#include <sys/ioctl.h>
+
+unsigned long features[1 + FF_MAX/sizeof(unsigned long)];
+int ioctl(int file_descriptor, int request, unsigned long *features);
+
+"request" must be EVIOCGBIT(EV_FF, size of features array in bytes )
+
+Returns the features supported by the device. features is a bitfield with the
+following bits:
+- FF_CONSTANT can render constant force effects
+- FF_PERIODIC can render periodic effects with the following waveforms:
+ - FF_SQUARE square waveform
+ - FF_TRIANGLE triangle waveform
+ - FF_SINE sine waveform
+ - FF_SAW_UP sawtooth up waveform
+ - FF_SAW_DOWN sawtooth down waveform
+ - FF_CUSTOM custom waveform
+- FF_RAMP can render ramp effects
+- FF_SPRING can simulate the presence of a spring
+- FF_FRICTION can simulate friction
+- FF_DAMPER can simulate damper effects
+- FF_RUMBLE rumble effects
+- FF_INERTIA can simulate inertia
+- FF_GAIN gain is adjustable
+- FF_AUTOCENTER autocenter is adjustable
+
+Note: In most cases you should use FF_PERIODIC instead of FF_RUMBLE. All
+ devices that support FF_RUMBLE support FF_PERIODIC (square, triangle,
+ sine) and the other way around.
+
+Note: The exact syntax FF_CUSTOM is undefined for the time being as no driver
+ supports it yet.
+
+
+int ioctl(int fd, EVIOCGEFFECTS, int *n);
+
+Returns the number of effects the device can keep in its memory.
+
+3.2 Uploading effects to the device
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#include <linux/input.h>
+#include <sys/ioctl.h>
+
+int ioctl(int file_descriptor, int request, struct ff_effect *effect);
+
+"request" must be EVIOCSFF.
+
+"effect" points to a structure describing the effect to upload. The effect is
+uploaded, but not played.
+The content of effect may be modified. In particular, its field "id" is set
+to the unique id assigned by the driver. This data is required for performing
+some operations (removing an effect, controlling the playback).
+This if field must be set to -1 by the user in order to tell the driver to
+allocate a new effect.
+
+Effects are file descriptor specific.
+
+See <linux/input.h> for a description of the ff_effect struct. You should also
+find help in a few sketches, contained in files shape.fig and interactive.fig.
+You need xfig to visualize these files.
+
+3.3 Removing an effect from the device
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+int ioctl(int fd, EVIOCRMFF, effect.id);
+
+This makes room for new effects in the device's memory. Note that this also
+stops the effect if it was playing.
+
+3.4 Controlling the playback of effects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Control of playing is done with write(). Below is an example:
+
+#include <linux/input.h>
+#include <unistd.h>
+
+ struct input_event play;
+ struct input_event stop;
+ struct ff_effect effect;
+ int fd;
+...
+ fd = open("/dev/input/eventXX", O_RDWR);
+...
+ /* Play three times */
+ play.type = EV_FF;
+ play.code = effect.id;
+ play.value = 3;
+
+ write(fd, (const void*) &play, sizeof(play));
+...
+ /* Stop an effect */
+ stop.type = EV_FF;
+ stop.code = effect.id;
+ stop.value = 0;
+
+ write(fd, (const void*) &play, sizeof(stop));
+
+3.5 Setting the gain
+~~~~~~~~~~~~~~~~~~~~
+Not all devices have the same strength. Therefore, users should set a gain
+factor depending on how strong they want effects to be. This setting is
+persistent across access to the driver.
+
+/* Set the gain of the device
+int gain; /* between 0 and 100 */
+struct input_event ie; /* structure used to communicate with the driver */
+
+ie.type = EV_FF;
+ie.code = FF_GAIN;
+ie.value = 0xFFFFUL * gain / 100;
+
+if (write(fd, &ie, sizeof(ie)) == -1)
+ perror("set gain");
+
+3.6 Enabling/Disabling autocenter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The autocenter feature quite disturbs the rendering of effects in my opinion,
+and I think it should be an effect, which computation depends on the game
+type. But you can enable it if you want.
+
+int autocenter; /* between 0 and 100 */
+struct input_event ie;
+
+ie.type = EV_FF;
+ie.code = FF_AUTOCENTER;
+ie.value = 0xFFFFUL * autocenter / 100;
+
+if (write(fd, &ie, sizeof(ie)) == -1)
+ perror("set auto-center");
+
+A value of 0 means "no auto-center".
+
+3.7 Dynamic update of an effect
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Proceed as if you wanted to upload a new effect, except that instead of
+setting the id field to -1, you set it to the wanted effect id.
+Normally, the effect is not stopped and restarted. However, depending on the
+type of device, not all parameters can be dynamically updated. For example,
+the direction of an effect cannot be updated with iforce devices. In this
+case, the driver stops the effect, up-load it, and restart it.
+
+Therefore it is recommended to dynamically change direction while the effect
+is playing only when it is ok to restart the effect with a replay count of 1.
+
+3.8 Information about the status of effects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Every time the status of an effect is changed, an event is sent. The values
+and meanings of the fields of the event are as follows:
+
+struct input_event {
+/* When the status of the effect changed */
+ struct timeval time;
+
+/* Set to EV_FF_STATUS */
+ unsigned short type;
+
+/* Contains the id of the effect */
+ unsigned short code;
+
+/* Indicates the status */
+ unsigned int value;
+};
+
+FF_STATUS_STOPPED The effect stopped playing
+FF_STATUS_PLAYING The effect started to play
+
+NOTE: Status feedback is only supported by iforce driver. If you have
+ a really good reason to use this, please contact
+ linux-joystick@atrey.karlin.mff.cuni.cz or anssi.hannula@gmail.com
+ so that support for it can be added to the rest of the drivers.
+
diff --git a/Documentation/input/gameport-programming.txt b/Documentation/input/gameport-programming.txt
new file mode 100644
index 0000000..03a74fc
--- /dev/null
+++ b/Documentation/input/gameport-programming.txt
@@ -0,0 +1,187 @@
+Programming gameport drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. A basic classic gameport
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If the gameport doesn't provide more than the inb()/outb() functionality,
+the code needed to register it with the joystick drivers is simple:
+
+ struct gameport gameport;
+
+ gameport.io = MY_IO_ADDRESS;
+ gameport_register_port(&gameport);
+
+Make sure struct gameport is initialized to 0 in all other fields. The
+gameport generic code will take care of the rest.
+
+If your hardware supports more than one io address, and your driver can
+choose which one to program the hardware to, starting from the more exotic
+addresses is preferred, because the likelihood of clashing with the standard
+0x201 address is smaller.
+
+Eg. if your driver supports addresses 0x200, 0x208, 0x210 and 0x218, then
+0x218 would be the address of first choice.
+
+If your hardware supports a gameport address that is not mapped to ISA io
+space (is above 0x1000), use that one, and don't map the ISA mirror.
+
+Also, always request_region() on the whole io space occupied by the
+gameport. Although only one ioport is really used, the gameport usually
+occupies from one to sixteen addresses in the io space.
+
+Please also consider enabling the gameport on the card in the ->open()
+callback if the io is mapped to ISA space - this way it'll occupy the io
+space only when something really is using it. Disable it again in the
+->close() callback. You also can select the io address in the ->open()
+callback, so that it doesn't fail if some of the possible addresses are
+already occupied by other gameports.
+
+2. Memory mapped gameport
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When a gameport can be accessed through MMIO, this way is preferred, because
+it is faster, allowing more reads per second. Registering such a gameport
+isn't as easy as a basic IO one, but not so much complex:
+
+ struct gameport gameport;
+
+ void my_trigger(struct gameport *gameport)
+ {
+ my_mmio = 0xff;
+ }
+
+ unsigned char my_read(struct gameport *gameport)
+ {
+ return my_mmio;
+ }
+
+ gameport.read = my_read;
+ gameport.trigger = my_trigger;
+ gameport_register_port(&gameport);
+
+3. Cooked mode gameport
+~~~~~~~~~~~~~~~~~~~~~~~
+
+There are gameports that can report the axis values as numbers, that means
+the driver doesn't have to measure them the old way - an ADC is built into
+the gameport. To register a cooked gameport:
+
+ struct gameport gameport;
+
+ int my_cooked_read(struct gameport *gameport, int *axes, int *buttons)
+ {
+ int i;
+
+ for (i = 0; i < 4; i++)
+ axes[i] = my_mmio[i];
+ buttons[i] = my_mmio[4];
+ }
+
+ int my_open(struct gameport *gameport, int mode)
+ {
+ return -(mode != GAMEPORT_MODE_COOKED);
+ }
+
+ gameport.cooked_read = my_cooked_read;
+ gameport.open = my_open;
+ gameport.fuzz = 8;
+ gameport_register_port(&gameport);
+
+The only confusing thing here is the fuzz value. Best determined by
+experimentation, it is the amount of noise in the ADC data. Perfect
+gameports can set this to zero, most common have fuzz between 8 and 32.
+See analog.c and input.c for handling of fuzz - the fuzz value determines
+the size of a gaussian filter window that is used to eliminate the noise
+in the data.
+
+4. More complex gameports
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Gameports can support both raw and cooked modes. In that case combine either
+examples 1+2 or 1+3. Gameports can support internal calibration - see below,
+and also lightning.c and analog.c on how that works. If your driver supports
+more than one gameport instance simultaneously, use the ->private member of
+the gameport struct to point to your data.
+
+5. Unregistering a gameport
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Simple:
+
+gameport_unregister_port(&gameport);
+
+6. The gameport structure
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+struct gameport {
+
+ void *private;
+
+A private pointer for free use in the gameport driver. (Not the joystick
+driver!)
+
+ int number;
+
+Number assigned to the gameport when registered. Informational purpose only.
+
+ int io;
+
+I/O address for use with raw mode. You have to either set this, or ->read()
+to some value if your gameport supports raw mode.
+
+ int speed;
+
+Raw mode speed of the gameport reads in thousands of reads per second.
+
+ int fuzz;
+
+If the gameport supports cooked mode, this should be set to a value that
+represents the amount of noise in the data. See section 3.
+
+ void (*trigger)(struct gameport *);
+
+Trigger. This function should trigger the ns558 oneshots. If set to NULL,
+outb(0xff, io) will be used.
+
+ unsigned char (*read)(struct gameport *);
+
+Read the buttons and ns558 oneshot bits. If set to NULL, inb(io) will be
+used instead.
+
+ int (*cooked_read)(struct gameport *, int *axes, int *buttons);
+
+If the gameport supports cooked mode, it should point this to its cooked
+read function. It should fill axes[0..3] with four values of the joystick axes
+and buttons[0] with four bits representing the buttons.
+
+ int (*calibrate)(struct gameport *, int *axes, int *max);
+
+Function for calibrating the ADC hardware. When called, axes[0..3] should be
+pre-filled by cooked data by the caller, max[0..3] should be pre-filled with
+expected maximums for each axis. The calibrate() function should set the
+sensitivity of the ADC hardware so that the maximums fit in its range and
+recompute the axes[] values to match the new sensitivity or re-read them from
+the hardware so that they give valid values.
+
+ int (*open)(struct gameport *, int mode);
+
+Open() serves two purposes. First a driver either opens the port in raw or
+in cooked mode, the open() callback can decide which modes are supported.
+Second, resource allocation can happen here. The port can also be enabled
+here. Prior to this call, other fields of the gameport struct (namely the io
+member) need not to be valid.
+
+ void (*close)(struct gameport *);
+
+Close() should free the resources allocated by open, possibly disabling the
+gameport.
+
+ struct gameport_dev *dev;
+ struct gameport *next;
+
+For internal use by the gameport layer.
+
+};
+
+Enjoy!
diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt
new file mode 100644
index 0000000..3ac9241
--- /dev/null
+++ b/Documentation/input/iforce-protocol.txt
@@ -0,0 +1,258 @@
+** Introduction
+This document describes what I managed to discover about the protocol used to
+specify force effects to I-Force 2.0 devices. None of this information comes
+from Immerse. That's why you should not trust what is written in this
+document. This document is intended to help understanding the protocol.
+This is not a reference. Comments and corrections are welcome. To contact me,
+send an email to: johann.deneux@gmail.com
+
+** WARNING **
+I shall not be held responsible for any damage or harm caused if you try to
+send data to your I-Force device based on what you read in this document.
+
+** Preliminary Notes:
+All values are hexadecimal with big-endian encoding (msb on the left). Beware,
+values inside packets are encoded using little-endian. Bytes whose roles are
+unknown are marked ??? Information that needs deeper inspection is marked (?)
+
+** General form of a packet **
+This is how packets look when the device uses the rs232 to communicate.
+2B OP LEN DATA CS
+CS is the checksum. It is equal to the exclusive or of all bytes.
+
+When using USB:
+OP DATA
+The 2B, LEN and CS fields have disappeared, probably because USB handles frames and
+data corruption is handled or unsignificant.
+
+First, I describe effects that are sent by the device to the computer
+
+** Device input state
+This packet is used to indicate the state of each button and the value of each
+axis
+OP= 01 for a joystick, 03 for a wheel
+LEN= Varies from device to device
+00 X-Axis lsb
+01 X-Axis msb
+02 Y-Axis lsb, or gas pedal for a wheel
+03 Y-Axis msb, or brake pedal for a wheel
+04 Throttle
+05 Buttons
+06 Lower 4 bits: Buttons
+ Upper 4 bits: Hat
+07 Rudder
+
+** Device effects states
+OP= 02
+LEN= Varies
+00 ? Bit 1 (Value 2) is the value of the deadman switch
+01 Bit 8 is set if the effect is playing. Bits 0 to 7 are the effect id.
+02 ??
+03 Address of parameter block changed (lsb)
+04 Address of parameter block changed (msb)
+05 Address of second parameter block changed (lsb)
+... depending on the number of parameter blocks updated
+
+** Force effect **
+OP= 01
+LEN= 0e
+00 Channel (when playing several effects at the same time, each must be assigned a channel)
+01 Wave form
+ Val 00 Constant
+ Val 20 Square
+ Val 21 Triangle
+ Val 22 Sine
+ Val 23 Sawtooth up
+ Val 24 Sawtooth down
+ Val 40 Spring (Force = f(pos))
+ Val 41 Friction (Force = f(velocity)) and Inertia (Force = f(acceleration))
+
+
+02 Axes affected and trigger
+ Bits 4-7: Val 2 = effect along one axis. Byte 05 indicates direction
+ Val 4 = X axis only. Byte 05 must contain 5a
+ Val 8 = Y axis only. Byte 05 must contain b4
+ Val c = X and Y axes. Bytes 05 must contain 60
+ Bits 0-3: Val 0 = No trigger
+ Val x+1 = Button x triggers the effect
+ When the whole byte is 0, cancel the previously set trigger
+
+03-04 Duration of effect (little endian encoding, in ms)
+
+05 Direction of effect, if applicable. Else, see 02 for value to assign.
+
+06-07 Minimum time between triggering.
+
+08-09 Address of periodicity or magnitude parameters
+0a-0b Address of attack and fade parameters, or ffff if none.
+*or*
+08-09 Address of interactive parameters for X-axis, or ffff if not applicable
+0a-0b Address of interactive parameters for Y-axis, or ffff if not applicable
+
+0c-0d Delay before execution of effect (little endian encoding, in ms)
+
+
+** Time based parameters **
+
+*** Attack and fade ***
+OP= 02
+LEN= 08
+00-01 Address where to store the parameteres
+02-03 Duration of attack (little endian encoding, in ms)
+04 Level at end of attack. Signed byte.
+05-06 Duration of fade.
+07 Level at end of fade.
+
+*** Magnitude ***
+OP= 03
+LEN= 03
+00-01 Address
+02 Level. Signed byte.
+
+*** Periodicity ***
+OP= 04
+LEN= 07
+00-01 Address
+02 Magnitude. Signed byte.
+03 Offset. Signed byte.
+04 Phase. Val 00 = 0 deg, Val 40 = 90 degs.
+05-06 Period (little endian encoding, in ms)
+
+** Interactive parameters **
+OP= 05
+LEN= 0a
+00-01 Address
+02 Positive Coeff
+03 Negative Coeff
+04+05 Offset (center)
+06+07 Dead band (Val 01F4 = 5000 (decimal))
+08 Positive saturation (Val 0a = 1000 (decimal) Val 64 = 10000 (decimal))
+09 Negative saturation
+
+The encoding is a bit funny here: For coeffs, these are signed values. The
+maximum value is 64 (100 decimal), the min is 9c.
+For the offset, the minimum value is FE0C, the maximum value is 01F4.
+For the deadband, the minimum value is 0, the max is 03E8.
+
+** Controls **
+OP= 41
+LEN= 03
+00 Channel
+01 Start/Stop
+ Val 00: Stop
+ Val 01: Start and play once.
+ Val 41: Start and play n times (See byte 02 below)
+02 Number of iterations n.
+
+** Init **
+
+*** Querying features ***
+OP= ff
+Query command. Length varies according to the query type.
+The general format of this packet is:
+ff 01 QUERY [INDEX] CHECKSUM
+responses are of the same form:
+FF LEN QUERY VALUE_QUERIED CHECKSUM2
+where LEN = 1 + length(VALUE_QUERIED)
+
+**** Query ram size ****
+QUERY = 42 ('B'uffer size)
+The device should reply with the same packet plus two additional bytes
+containing the size of the memory:
+ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available.
+
+**** Query number of effects ****
+QUERY = 4e ('N'umber of effects)
+The device should respond by sending the number of effects that can be played
+at the same time (one byte)
+ff 02 4e 14 CS would stand for 20 effects.
+
+**** Vendor's id ****
+QUERY = 4d ('M'anufacturer)
+Query the vendors'id (2 bytes)
+
+**** Product id *****
+QUERY = 50 ('P'roduct)
+Query the product id (2 bytes)
+
+**** Open device ****
+QUERY = 4f ('O'pen)
+No data returned.
+
+**** Close device *****
+QUERY = 43 ('C')lose
+No data returned.
+
+**** Query effect ****
+QUERY = 45 ('E')
+Send effect type.
+Returns nonzero if supported (2 bytes)
+
+**** Firmware Version ****
+QUERY = 56 ('V'ersion)
+Sends back 3 bytes - major, minor, subminor
+
+*** Initialisation of the device ***
+
+**** Set Control ****
+!!! Device dependent, can be different on different models !!!
+OP= 40 <idx> <val> [<val>]
+LEN= 2 or 3
+00 Idx
+ Idx 00 Set dead zone (0..2048)
+ Idx 01 Ignore Deadman sensor (0..1)
+ Idx 02 Enable comm watchdog (0..1)
+ Idx 03 Set the strength of the spring (0..100)
+ Idx 04 Enable or disable the spring (0/1)
+ Idx 05 Set axis saturation threshold (0..2048)
+
+**** Set Effect State ****
+OP= 42 <val>
+LEN= 1
+00 State
+ Bit 3 Pause force feedback
+ Bit 2 Enable force feedback
+ Bit 0 Stop all effects
+
+**** Set overall gain ****
+OP= 43 <val>
+LEN= 1
+00 Gain
+ Val 00 = 0%
+ Val 40 = 50%
+ Val 80 = 100%
+
+** Parameter memory **
+
+Each device has a certain amount of memory to store parameters of effects.
+The amount of RAM may vary, I encountered values from 200 to 1000 bytes. Below
+is the amount of memory apparently needed for every set of parameters:
+ - period : 0c
+ - magnitude : 02
+ - attack and fade : 0e
+ - interactive : 08
+
+** Appendix: How to study the protocol ? **
+
+1. Generate effects using the force editor provided with the DirectX SDK, or
+use Immersion Studio (freely available at their web site in the developer section:
+www.immersion.com)
+2. Start a soft spying RS232 or USB (depending on where you connected your
+joystick/wheel). I used ComPortSpy from fCoder (alpha version!)
+3. Play the effect, and watch what happens on the spy screen.
+
+A few words about ComPortSpy:
+At first glance, this software seems, hum, well... buggy. In fact, data appear with a
+few seconds latency. Personally, I restart it every time I play an effect.
+Remember it's free (as in free beer) and alpha!
+
+** URLS **
+Check www.immerse.com for Immersion Studio, and www.fcoder.com for ComPortSpy.
+
+** Author of this document **
+Johann Deneux <johann.deneux@gmail.com>
+Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/
+
+Additions by Vojtech Pavlik.
+
+I-Force is trademark of Immersion Corp.
diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt
new file mode 100644
index 0000000..7f8b9d9
--- /dev/null
+++ b/Documentation/input/input-programming.txt
@@ -0,0 +1,302 @@
+Programming input drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. Creating an input device driver
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1.0 The simplest example
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here comes a very simple example of an input device driver. The device has
+just one button and the button is accessible at i/o port BUTTON_PORT. When
+pressed or released a BUTTON_IRQ happens. The driver could look like:
+
+#include <linux/input.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/irq.h>
+#include <asm/io.h>
+
+static struct input_dev *button_dev;
+
+static irqreturn_t button_interrupt(int irq, void *dummy)
+{
+ input_report_key(button_dev, BTN_0, inb(BUTTON_PORT) & 1);
+ input_sync(button_dev);
+ return IRQ_HANDLED;
+}
+
+static int __init button_init(void)
+{
+ int error;
+
+ if (request_irq(BUTTON_IRQ, button_interrupt, 0, "button", NULL)) {
+ printk(KERN_ERR "button.c: Can't allocate irq %d\n", button_irq);
+ return -EBUSY;
+ }
+
+ button_dev = input_allocate_device();
+ if (!button_dev) {
+ printk(KERN_ERR "button.c: Not enough memory\n");
+ error = -ENOMEM;
+ goto err_free_irq;
+ }
+
+ button_dev->evbit[0] = BIT_MASK(EV_KEY);
+ button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0);
+
+ error = input_register_device(button_dev);
+ if (error) {
+ printk(KERN_ERR "button.c: Failed to register device\n");
+ goto err_free_dev;
+ }
+
+ return 0;
+
+ err_free_dev:
+ input_free_device(button_dev);
+ err_free_irq:
+ free_irq(BUTTON_IRQ, button_interrupt);
+ return error;
+}
+
+static void __exit button_exit(void)
+{
+ input_unregister_device(button_dev);
+ free_irq(BUTTON_IRQ, button_interrupt);
+}
+
+module_init(button_init);
+module_exit(button_exit);
+
+1.1 What the example does
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+First it has to include the <linux/input.h> file, which interfaces to the
+input subsystem. This provides all the definitions needed.
+
+In the _init function, which is called either upon module load or when
+booting the kernel, it grabs the required resources (it should also check
+for the presence of the device).
+
+Then it allocates a new input device structure with input_allocate_device()
+and sets up input bitfields. This way the device driver tells the other
+parts of the input systems what it is - what events can be generated or
+accepted by this input device. Our example device can only generate EV_KEY
+type events, and from those only BTN_0 event code. Thus we only set these
+two bits. We could have used
+
+ set_bit(EV_KEY, button_dev.evbit);
+ set_bit(BTN_0, button_dev.keybit);
+
+as well, but with more than single bits the first approach tends to be
+shorter.
+
+Then the example driver registers the input device structure by calling
+
+ input_register_device(&button_dev);
+
+This adds the button_dev structure to linked lists of the input driver and
+calls device handler modules _connect functions to tell them a new input
+device has appeared. input_register_device() may sleep and therefore must
+not be called from an interrupt or with a spinlock held.
+
+While in use, the only used function of the driver is
+
+ button_interrupt()
+
+which upon every interrupt from the button checks its state and reports it
+via the
+
+ input_report_key()
+
+call to the input system. There is no need to check whether the interrupt
+routine isn't reporting two same value events (press, press for example) to
+the input system, because the input_report_* functions check that
+themselves.
+
+Then there is the
+
+ input_sync()
+
+call to tell those who receive the events that we've sent a complete report.
+This doesn't seem important in the one button case, but is quite important
+for for example mouse movement, where you don't want the X and Y values
+to be interpreted separately, because that'd result in a different movement.
+
+1.2 dev->open() and dev->close()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In case the driver has to repeatedly poll the device, because it doesn't
+have an interrupt coming from it and the polling is too expensive to be done
+all the time, or if the device uses a valuable resource (eg. interrupt), it
+can use the open and close callback to know when it can stop polling or
+release the interrupt and when it must resume polling or grab the interrupt
+again. To do that, we would add this to our example driver:
+
+static int button_open(struct input_dev *dev)
+{
+ if (request_irq(BUTTON_IRQ, button_interrupt, 0, "button", NULL)) {
+ printk(KERN_ERR "button.c: Can't allocate irq %d\n", button_irq);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void button_close(struct input_dev *dev)
+{
+ free_irq(IRQ_AMIGA_VERTB, button_interrupt);
+}
+
+static int __init button_init(void)
+{
+ ...
+ button_dev->open = button_open;
+ button_dev->close = button_close;
+ ...
+}
+
+Note that input core keeps track of number of users for the device and
+makes sure that dev->open() is called only when the first user connects
+to the device and that dev->close() is called when the very last user
+disconnects. Calls to both callbacks are serialized.
+
+The open() callback should return a 0 in case of success or any nonzero value
+in case of failure. The close() callback (which is void) must always succeed.
+
+1.3 Basic event types
+~~~~~~~~~~~~~~~~~~~~~
+
+The most simple event type is EV_KEY, which is used for keys and buttons.
+It's reported to the input system via:
+
+ input_report_key(struct input_dev *dev, int code, int value)
+
+See linux/input.h for the allowable values of code (from 0 to KEY_MAX).
+Value is interpreted as a truth value, ie any nonzero value means key
+pressed, zero value means key released. The input code generates events only
+in case the value is different from before.
+
+In addition to EV_KEY, there are two more basic event types: EV_REL and
+EV_ABS. They are used for relative and absolute values supplied by the
+device. A relative value may be for example a mouse movement in the X axis.
+The mouse reports it as a relative difference from the last position,
+because it doesn't have any absolute coordinate system to work in. Absolute
+events are namely for joysticks and digitizers - devices that do work in an
+absolute coordinate systems.
+
+Having the device report EV_REL buttons is as simple as with EV_KEY, simply
+set the corresponding bits and call the
+
+ input_report_rel(struct input_dev *dev, int code, int value)
+
+function. Events are generated only for nonzero value.
+
+However EV_ABS requires a little special care. Before calling
+input_register_device, you have to fill additional fields in the input_dev
+struct for each absolute axis your device has. If our button device had also
+the ABS_X axis:
+
+ button_dev.absmin[ABS_X] = 0;
+ button_dev.absmax[ABS_X] = 255;
+ button_dev.absfuzz[ABS_X] = 4;
+ button_dev.absflat[ABS_X] = 8;
+
+Or, you can just say:
+
+ input_set_abs_params(button_dev, ABS_X, 0, 255, 4, 8);
+
+This setting would be appropriate for a joystick X axis, with the minimum of
+0, maximum of 255 (which the joystick *must* be able to reach, no problem if
+it sometimes reports more, but it must be able to always reach the min and
+max values), with noise in the data up to +- 4, and with a center flat
+position of size 8.
+
+If you don't need absfuzz and absflat, you can set them to zero, which mean
+that the thing is precise and always returns to exactly the center position
+(if it has any).
+
+1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK()
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These three macros from bitops.h help some bitfield computations:
+
+ BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for
+ x bits
+ BIT_WORD(x) - returns the index in the array in longs for bit x
+ BIT_MASK(x) - returns the index in a long for bit x
+
+1.5 The id* and name fields
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The dev->name should be set before registering the input device by the input
+device driver. It's a string like 'Generic button device' containing a
+user friendly name of the device.
+
+The id* fields contain the bus ID (PCI, USB, ...), vendor ID and device ID
+of the device. The bus IDs are defined in input.h. The vendor and device ids
+are defined in pci_ids.h, usb_ids.h and similar include files. These fields
+should be set by the input device driver before registering it.
+
+The idtype field can be used for specific information for the input device
+driver.
+
+The id and name fields can be passed to userland via the evdev interface.
+
+1.6 The keycode, keycodemax, keycodesize fields
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These three fields should be used by input devices that have dense keymaps.
+The keycode is an array used to map from scancodes to input system keycodes.
+The keycode max should contain the size of the array and keycodesize the
+size of each entry in it (in bytes).
+
+Userspace can query and alter current scancode to keycode mappings using
+EVIOCGKEYCODE and EVIOCSKEYCODE ioctls on corresponding evdev interface.
+When a device has all 3 aforementioned fields filled in, the driver may
+rely on kernel's default implementation of setting and querying keycode
+mappings.
+
+1.7 dev->getkeycode() and dev->setkeycode()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+getkeycode() and setkeycode() callbacks allow drivers to override default
+keycode/keycodesize/keycodemax mapping mechanism provided by input core
+and implement sparse keycode maps.
+
+1.8 Key autorepeat
+~~~~~~~~~~~~~~~~~~
+
+... is simple. It is handled by the input.c module. Hardware autorepeat is
+not used, because it's not present in many devices and even where it is
+present, it is broken sometimes (at keyboards: Toshiba notebooks). To enable
+autorepeat for your device, just set EV_REP in dev->evbit. All will be
+handled by the input system.
+
+1.9 Other event types, handling output events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The other event types up to now are:
+
+EV_LED - used for the keyboard LEDs.
+EV_SND - used for keyboard beeps.
+
+They are very similar to for example key events, but they go in the other
+direction - from the system to the input device driver. If your input device
+driver can handle these events, it has to set the respective bits in evbit,
+*and* also the callback routine:
+
+ button_dev->event = button_event;
+
+int button_event(struct input_dev *dev, unsigned int type, unsigned int code, int value);
+{
+ if (type == EV_SND && code == SND_BELL) {
+ outb(value, BUTTON_BELL);
+ return 0;
+ }
+ return -1;
+}
+
+This callback routine can be called from an interrupt or a BH (although that
+isn't a rule), and thus must not sleep, and must not take too long to finish.
diff --git a/Documentation/input/input.txt b/Documentation/input/input.txt
new file mode 100644
index 0000000..686ee99
--- /dev/null
+++ b/Documentation/input/input.txt
@@ -0,0 +1,290 @@
+ Linux Input drivers v1.0
+ (c) 1999-2001 Vojtech Pavlik <vojtech@ucw.cz>
+ Sponsored by SuSE
+----------------------------------------------------------------------------
+
+0. Disclaimer
+~~~~~~~~~~~~~
+ This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+ This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+ You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Should you need to contact me, the author, you can do so either by e-mail
+- mail your message to <vojtech@ucw.cz>, or by paper mail: Vojtech Pavlik,
+Simunkova 1594, Prague 8, 182 00 Czech Republic
+
+ For your convenience, the GNU General Public License version 2 is included
+in the package: See the file COPYING.
+
+1. Introduction
+~~~~~~~~~~~~~~~
+ This is a collection of drivers that is designed to support all input
+devices under Linux. While it is currently used only on for USB input
+devices, future use (say 2.5/2.6) is expected to expand to replace
+most of the existing input system, which is why it lives in
+drivers/input/ instead of drivers/usb/.
+
+ The centre of the input drivers is the input module, which must be
+loaded before any other of the input modules - it serves as a way of
+communication between two groups of modules:
+
+1.1 Device drivers
+~~~~~~~~~~~~~~~~~~
+ These modules talk to the hardware (for example via USB), and provide
+events (keystrokes, mouse movements) to the input module.
+
+1.2 Event handlers
+~~~~~~~~~~~~~~~~~~
+ These modules get events from input and pass them where needed via
+various interfaces - keystrokes to the kernel, mouse movements via a
+simulated PS/2 interface to GPM and X and so on.
+
+2. Simple Usage
+~~~~~~~~~~~~~~~
+ For the most usual configuration, with one USB mouse and one USB keyboard,
+you'll have to load the following modules (or have them built in to the
+kernel):
+
+ input
+ mousedev
+ keybdev
+ usbcore
+ uhci_hcd or ohci_hcd or ehci_hcd
+ usbhid
+
+ After this, the USB keyboard will work straight away, and the USB mouse
+will be available as a character device on major 13, minor 63:
+
+ crw-r--r-- 1 root root 13, 63 Mar 28 22:45 mice
+
+ This device has to be created.
+ The commands to create it by hand are:
+
+ cd /dev
+ mkdir input
+ mknod input/mice c 13 63
+
+ After that you have to point GPM (the textmode mouse cut&paste tool) and
+XFree to this device to use it - GPM should be called like:
+
+ gpm -t ps2 -m /dev/input/mice
+
+ And in X:
+
+ Section "Pointer"
+ Protocol "ImPS/2"
+ Device "/dev/input/mice"
+ ZAxisMapping 4 5
+ EndSection
+
+ When you do all of the above, you can use your USB mouse and keyboard.
+
+3. Detailed Description
+~~~~~~~~~~~~~~~~~~~~~~~
+3.1 Device drivers
+~~~~~~~~~~~~~~~~~~
+ Device drivers are the modules that generate events. The events are
+however not useful without being handled, so you also will need to use some
+of the modules from section 3.2.
+
+3.1.1 usbhid
+~~~~~~~~~~~~
+ usbhid is the largest and most complex driver of the whole suite. It
+handles all HID devices, and because there is a very wide variety of them,
+and because the USB HID specification isn't simple, it needs to be this big.
+
+ Currently, it handles USB mice, joysticks, gamepads, steering wheels
+keyboards, trackballs and digitizers.
+
+ However, USB uses HID also for monitor controls, speaker controls, UPSs,
+LCDs and many other purposes.
+
+ The monitor and speaker controls should be easy to add to the hid/input
+interface, but for the UPSs and LCDs it doesn't make much sense. For this,
+the hiddev interface was designed. See Documentation/usb/hiddev.txt
+for more information about it.
+
+ The usage of the usbhid module is very simple, it takes no parameters,
+detects everything automatically and when a HID device is inserted, it
+detects it appropriately.
+
+ However, because the devices vary wildly, you might happen to have a
+device that doesn't work well. In that case #define DEBUG at the beginning
+of hid-core.c and send me the syslog traces.
+
+3.1.2 usbmouse
+~~~~~~~~~~~~~~
+ For embedded systems, for mice with broken HID descriptors and just any
+other use when the big usbhid wouldn't be a good choice, there is the
+usbmouse driver. It handles USB mice only. It uses a simpler HIDBP
+protocol. This also means the mice must support this simpler protocol. Not
+all do. If you don't have any strong reason to use this module, use usbhid
+instead.
+
+3.1.3 usbkbd
+~~~~~~~~~~~~
+ Much like usbmouse, this module talks to keyboards with a simplified
+HIDBP protocol. It's smaller, but doesn't support any extra special keys.
+Use usbhid instead if there isn't any special reason to use this.
+
+3.1.4 wacom
+~~~~~~~~~~~
+ This is a driver for Wacom Graphire and Intuos tablets. Not for Wacom
+PenPartner, that one is handled by the HID driver. Although the Intuos and
+Graphire tablets claim that they are HID tablets as well, they are not and
+thus need this specific driver.
+
+3.1.5 iforce
+~~~~~~~~~~~~
+ A driver for I-Force joysticks and wheels, both over USB and RS232.
+It includes ForceFeedback support now, even though Immersion
+Corp. considers the protocol a trade secret and won't disclose a word
+about it.
+
+3.2 Event handlers
+~~~~~~~~~~~~~~~~~~
+ Event handlers distribute the events from the devices to userland and
+kernel, as needed.
+
+3.2.1 keybdev
+~~~~~~~~~~~~~
+ keybdev is currently a rather ugly hack that translates the input
+events into architecture-specific keyboard raw mode (Xlated AT Set2 on
+x86), and passes them into the handle_scancode function of the
+keyboard.c module. This works well enough on all architectures that
+keybdev can generate rawmode on, other architectures can be added to
+it.
+
+ The right way would be to pass the events to keyboard.c directly,
+best if keyboard.c would itself be an event handler. This is done in
+the input patch, available on the webpage mentioned below.
+
+3.2.2 mousedev
+~~~~~~~~~~~~~~
+ mousedev is also a hack to make programs that use mouse input
+work. It takes events from either mice or digitizers/tablets and makes
+a PS/2-style (a la /dev/psaux) mouse device available to the
+userland. Ideally, the programs could use a more reasonable interface,
+for example evdev
+
+ Mousedev devices in /dev/input (as shown above) are:
+
+ crw-r--r-- 1 root root 13, 32 Mar 28 22:45 mouse0
+ crw-r--r-- 1 root root 13, 33 Mar 29 00:41 mouse1
+ crw-r--r-- 1 root root 13, 34 Mar 29 00:41 mouse2
+ crw-r--r-- 1 root root 13, 35 Apr 1 10:50 mouse3
+ ...
+ ...
+ crw-r--r-- 1 root root 13, 62 Apr 1 10:50 mouse30
+ crw-r--r-- 1 root root 13, 63 Apr 1 10:50 mice
+
+Each 'mouse' device is assigned to a single mouse or digitizer, except
+the last one - 'mice'. This single character device is shared by all
+mice and digitizers, and even if none are connected, the device is
+present. This is useful for hotplugging USB mice, so that programs
+can open the device even when no mice are present.
+
+ CONFIG_INPUT_MOUSEDEV_SCREEN_[XY] in the kernel configuration are
+the size of your screen (in pixels) in XFree86. This is needed if you
+want to use your digitizer in X, because its movement is sent to X
+via a virtual PS/2 mouse and thus needs to be scaled
+accordingly. These values won't be used if you use a mouse only.
+
+ Mousedev will generate either PS/2, ImPS/2 (Microsoft IntelliMouse) or
+ExplorerPS/2 (IntelliMouse Explorer) protocols, depending on what the
+program reading the data wishes. You can set GPM and X to any of
+these. You'll need ImPS/2 if you want to make use of a wheel on a USB
+mouse and ExplorerPS/2 if you want to use extra (up to 5) buttons.
+
+3.2.3 joydev
+~~~~~~~~~~~~
+ Joydev implements v0.x and v1.x Linux joystick api, much like
+drivers/char/joystick/joystick.c used to in earlier versions. See
+joystick-api.txt in the Documentation subdirectory for details. As
+soon as any joystick is connected, it can be accessed in /dev/input
+on:
+
+ crw-r--r-- 1 root root 13, 0 Apr 1 10:50 js0
+ crw-r--r-- 1 root root 13, 1 Apr 1 10:50 js1
+ crw-r--r-- 1 root root 13, 2 Apr 1 10:50 js2
+ crw-r--r-- 1 root root 13, 3 Apr 1 10:50 js3
+ ...
+
+And so on up to js31.
+
+3.2.4 evdev
+~~~~~~~~~~~
+ evdev is the generic input event interface. It passes the events
+generated in the kernel straight to the program, with timestamps. The
+API is still evolving, but should be useable now. It's described in
+section 5.
+
+ This should be the way for GPM and X to get keyboard and mouse
+events. It allows for multihead in X without any specific multihead
+kernel support. The event codes are the same on all architectures and
+are hardware independent.
+
+ The devices are in /dev/input:
+
+ crw-r--r-- 1 root root 13, 64 Apr 1 10:49 event0
+ crw-r--r-- 1 root root 13, 65 Apr 1 10:50 event1
+ crw-r--r-- 1 root root 13, 66 Apr 1 10:50 event2
+ crw-r--r-- 1 root root 13, 67 Apr 1 10:50 event3
+ ...
+
+And so on up to event31.
+
+4. Verifying if it works
+~~~~~~~~~~~~~~~~~~~~~~~~
+ Typing a couple keys on the keyboard should be enough to check that
+a USB keyboard works and is correctly connected to the kernel keyboard
+driver.
+
+ Doing a cat /dev/input/mouse0 (c, 13, 32) will verify that a mouse
+is also emulated, characters should appear if you move it.
+
+ You can test the joystick emulation with the 'jstest' utility,
+available in the joystick package (see Documentation/input/joystick.txt).
+
+ You can test the event devices with the 'evtest' utility available
+in the LinuxConsole project CVS archive (see the URL below).
+
+5. Event interface
+~~~~~~~~~~~~~~~~~~
+ Should you want to add event device support into any application (X, gpm,
+svgalib ...) I <vojtech@ucw.cz> will be happy to provide you any help I
+can. Here goes a description of the current state of things, which is going
+to be extended, but not changed incompatibly as time goes:
+
+ You can use blocking and nonblocking reads, also select() on the
+/dev/input/eventX devices, and you'll always get a whole number of input
+events on a read. Their layout is:
+
+struct input_event {
+ struct timeval time;
+ unsigned short type;
+ unsigned short code;
+ unsigned int value;
+};
+
+ 'time' is the timestamp, it returns the time at which the event happened.
+Type is for example EV_REL for relative moment, REL_KEY for a keypress or
+release. More types are defined in include/linux/input.h.
+
+ 'code' is event code, for example REL_X or KEY_BACKSPACE, again a complete
+list is in include/linux/input.h.
+
+ 'value' is the value the event carries. Either a relative change for
+EV_REL, absolute new value for EV_ABS (joysticks ...), or 0 for EV_KEY for
+release, 1 for keypress and 2 for autorepeat.
+
diff --git a/Documentation/input/interactive.fig b/Documentation/input/interactive.fig
new file mode 100644
index 0000000..1e7de38
--- /dev/null
+++ b/Documentation/input/interactive.fig
@@ -0,0 +1,42 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter
+100.00
+Single
+-2
+1200 2
+2 1 0 2 0 7 50 0 -1 6.000 0 0 -1 0 0 6
+ 1200 3600 1800 3600 2400 4800 3000 4800 4200 5700 4800 5700
+2 2 0 1 0 7 50 0 -1 4.000 0 0 -1 0 0 5
+ 1200 3150 4800 3150 4800 6300 1200 6300 1200 3150
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 1200 4800 4800 4800
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 4
+ 2400 4800 2400 6525 1950 7125 1950 7800
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 4
+ 3000 4800 3000 6525 3600 7125 3600 7800
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 1 3
+ 0 0 1.00 60.00 120.00
+ 3825 5400 4125 5100 5400 5100
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 1 3
+ 0 0 1.00 60.00 120.00
+ 2100 4200 2400 3900 5400 3900
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 4800 5700 5400 5700
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 1800 3600 5400 3600
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 1 3
+ 0 0 1.00 60.00 120.00
+ 2700 4800 2700 4425 5400 4425
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 60.00 120.00
+ 0 0 1.00 60.00 120.00
+ 1950 7800 3600 7800
+4 1 0 50 0 0 12 0.0000 4 135 810 2775 7725 Dead band\001
+4 0 0 50 0 0 12 0.0000 4 180 1155 5400 5700 right saturation\001
+4 0 0 50 0 0 12 0.0000 4 135 1065 5400 3600 left saturation\001
+4 0 0 50 0 0 12 0.0000 4 180 2505 5400 3900 left coeff ( positive in that case )\001
+4 0 0 50 0 0 12 0.0000 4 180 2640 5475 5100 right coeff ( negative in that case )\001
+4 0 0 50 0 0 12 0.0000 4 105 480 5400 4425 center\001
diff --git a/Documentation/input/joystick-api.txt b/Documentation/input/joystick-api.txt
new file mode 100644
index 0000000..c507330
--- /dev/null
+++ b/Documentation/input/joystick-api.txt
@@ -0,0 +1,314 @@
+ Joystick API Documentation -*-Text-*-
+
+ Ragnar Hojland Espinosa
+ <ragnar@macula.net>
+
+ 7 Aug 1998
+
+1. Initialization
+~~~~~~~~~~~~~~~~~
+
+Open the joystick device following the usual semantics (that is, with open).
+Since the driver now reports events instead of polling for changes,
+immediately after the open it will issue a series of synthetic events
+(JS_EVENT_INIT) that you can read to check the initial state of the
+joystick.
+
+By default, the device is opened in blocking mode.
+
+ int fd = open ("/dev/js0", O_RDONLY);
+
+
+2. Event Reading
+~~~~~~~~~~~~~~~~
+
+ struct js_event e;
+ read (fd, &e, sizeof(struct js_event));
+
+where js_event is defined as
+
+ struct js_event {
+ __u32 time; /* event timestamp in milliseconds */
+ __s16 value; /* value */
+ __u8 type; /* event type */
+ __u8 number; /* axis/button number */
+ };
+
+If the read is successful, it will return sizeof(struct js_event), unless
+you wanted to read more than one event per read as described in section 3.1.
+
+
+2.1 js_event.type
+~~~~~~~~~~~~~~~~~
+
+The possible values of ``type'' are
+
+ #define JS_EVENT_BUTTON 0x01 /* button pressed/released */
+ #define JS_EVENT_AXIS 0x02 /* joystick moved */
+ #define JS_EVENT_INIT 0x80 /* initial state of device */
+
+As mentioned above, the driver will issue synthetic JS_EVENT_INIT ORed
+events on open. That is, if it's issuing a INIT BUTTON event, the
+current type value will be
+
+ int type = JS_EVENT_BUTTON | JS_EVENT_INIT; /* 0x81 */
+
+If you choose not to differentiate between synthetic or real events
+you can turn off the JS_EVENT_INIT bits
+
+ type &= ~JS_EVENT_INIT; /* 0x01 */
+
+
+2.2 js_event.number
+~~~~~~~~~~~~~~~~~~~
+
+The values of ``number'' correspond to the axis or button that
+generated the event. Note that they carry separate numeration (that
+is, you have both an axis 0 and a button 0). Generally,
+
+ number
+ 1st Axis X 0
+ 1st Axis Y 1
+ 2nd Axis X 2
+ 2nd Axis Y 3
+ ...and so on
+
+Hats vary from one joystick type to another. Some can be moved in 8
+directions, some only in 4, The driver, however, always reports a hat as two
+independent axis, even if the hardware doesn't allow independent movement.
+
+
+2.3 js_event.value
+~~~~~~~~~~~~~~~~~~
+
+For an axis, ``value'' is a signed integer between -32767 and +32767
+representing the position of the joystick along that axis. If you
+don't read a 0 when the joystick is `dead', or if it doesn't span the
+full range, you should recalibrate it (with, for example, jscal).
+
+For a button, ``value'' for a press button event is 1 and for a release
+button event is 0.
+
+Though this
+
+ if (js_event.type == JS_EVENT_BUTTON) {
+ buttons_state ^= (1 << js_event.number);
+ }
+
+may work well if you handle JS_EVENT_INIT events separately,
+
+ if ((js_event.type & ~JS_EVENT_INIT) == JS_EVENT_BUTTON) {
+ if (js_event.value)
+ buttons_state |= (1 << js_event.number);
+ else
+ buttons_state &= ~(1 << js_event.number);
+ }
+
+is much safer since it can't lose sync with the driver. As you would
+have to write a separate handler for JS_EVENT_INIT events in the first
+snippet, this ends up being shorter.
+
+
+2.4 js_event.time
+~~~~~~~~~~~~~~~~~
+
+The time an event was generated is stored in ``js_event.time''. It's a time
+in milliseconds since ... well, since sometime in the past. This eases the
+task of detecting double clicks, figuring out if movement of axis and button
+presses happened at the same time, and similar.
+
+
+3. Reading
+~~~~~~~~~~
+
+If you open the device in blocking mode, a read will block (that is,
+wait) forever until an event is generated and effectively read. There
+are two alternatives if you can't afford to wait forever (which is,
+admittedly, a long time;)
+
+ a) use select to wait until there's data to be read on fd, or
+ until it timeouts. There's a good example on the select(2)
+ man page.
+
+ b) open the device in non-blocking mode (O_NONBLOCK)
+
+
+3.1 O_NONBLOCK
+~~~~~~~~~~~~~~
+
+If read returns -1 when reading in O_NONBLOCK mode, this isn't
+necessarily a "real" error (check errno(3)); it can just mean there
+are no events pending to be read on the driver queue. You should read
+all events on the queue (that is, until you get a -1).
+
+For example,
+
+ while (1) {
+ while (read (fd, &e, sizeof(struct js_event)) > 0) {
+ process_event (e);
+ }
+ /* EAGAIN is returned when the queue is empty */
+ if (errno != EAGAIN) {
+ /* error */
+ }
+ /* do something interesting with processed events */
+ }
+
+One reason for emptying the queue is that if it gets full you'll start
+missing events since the queue is finite, and older events will get
+overwritten.
+
+The other reason is that you want to know all what happened, and not
+delay the processing till later.
+
+Why can get the queue full? Because you don't empty the queue as
+mentioned, or because too much time elapses from one read to another
+and too many events to store in the queue get generated. Note that
+high system load may contribute to space those reads even more.
+
+If time between reads is enough to fill the queue and lose an event,
+the driver will switch to startup mode and next time you read it,
+synthetic events (JS_EVENT_INIT) will be generated to inform you of
+the actual state of the joystick.
+
+[As for version 1.2.8, the queue is circular and able to hold 64
+ events. You can increment this size bumping up JS_BUFF_SIZE in
+ joystick.h and recompiling the driver.]
+
+
+In the above code, you might as well want to read more than one event
+at a time using the typical read(2) functionality. For that, you would
+replace the read above with something like
+
+ struct js_event mybuffer[0xff];
+ int i = read (fd, mybuffer, sizeof(struct mybuffer));
+
+In this case, read would return -1 if the queue was empty, or some
+other value in which the number of events read would be i /
+sizeof(js_event) Again, if the buffer was full, it's a good idea to
+process the events and keep reading it until you empty the driver queue.
+
+
+4. IOCTLs
+~~~~~~~~~
+
+The joystick driver defines the following ioctl(2) operations.
+
+ /* function 3rd arg */
+ #define JSIOCGAXES /* get number of axes char */
+ #define JSIOCGBUTTONS /* get number of buttons char */
+ #define JSIOCGVERSION /* get driver version int */
+ #define JSIOCGNAME(len) /* get identifier string char */
+ #define JSIOCSCORR /* set correction values &js_corr */
+ #define JSIOCGCORR /* get correction values &js_corr */
+
+For example, to read the number of axes
+
+ char number_of_axes;
+ ioctl (fd, JSIOCGAXES, &number_of_axes);
+
+
+4.1 JSIOGCVERSION
+~~~~~~~~~~~~~~~~~
+
+JSIOGCVERSION is a good way to check in run-time whether the running
+driver is 1.0+ and supports the event interface. If it is not, the
+IOCTL will fail. For a compile-time decision, you can test the
+JS_VERSION symbol
+
+ #ifdef JS_VERSION
+ #if JS_VERSION > 0xsomething
+
+
+4.2 JSIOCGNAME
+~~~~~~~~~~~~~~
+
+JSIOCGNAME(len) allows you to get the name string of the joystick - the same
+as is being printed at boot time. The 'len' argument is the length of the
+buffer provided by the application asking for the name. It is used to avoid
+possible overrun should the name be too long.
+
+ char name[128];
+ if (ioctl(fd, JSIOCGNAME(sizeof(name)), name) < 0)
+ strncpy(name, "Unknown", sizeof(name));
+ printf("Name: %s\n", name);
+
+
+4.3 JSIOC[SG]CORR
+~~~~~~~~~~~~~~~~~
+
+For usage on JSIOC[SG]CORR I suggest you to look into jscal.c They are
+not needed in a normal program, only in joystick calibration software
+such as jscal or kcmjoy. These IOCTLs and data types aren't considered
+to be in the stable part of the API, and therefore may change without
+warning in following releases of the driver.
+
+Both JSIOCSCORR and JSIOCGCORR expect &js_corr to be able to hold
+information for all axis. That is, struct js_corr corr[MAX_AXIS];
+
+struct js_corr is defined as
+
+ struct js_corr {
+ __s32 coef[8];
+ __u16 prec;
+ __u16 type;
+ };
+
+and ``type''
+
+ #define JS_CORR_NONE 0x00 /* returns raw values */
+ #define JS_CORR_BROKEN 0x01 /* broken line */
+
+
+5. Backward compatibility
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The 0.x joystick driver API is quite limited and its usage is deprecated.
+The driver offers backward compatibility, though. Here's a quick summary:
+
+ struct JS_DATA_TYPE js;
+ while (1) {
+ if (read (fd, &js, JS_RETURN) != JS_RETURN) {
+ /* error */
+ }
+ usleep (1000);
+ }
+
+As you can figure out from the example, the read returns immediately,
+with the actual state of the joystick.
+
+ struct JS_DATA_TYPE {
+ int buttons; /* immediate button state */
+ int x; /* immediate x axis value */
+ int y; /* immediate y axis value */
+ };
+
+and JS_RETURN is defined as
+
+ #define JS_RETURN sizeof(struct JS_DATA_TYPE)
+
+To test the state of the buttons,
+
+ first_button_state = js.buttons & 1;
+ second_button_state = js.buttons & 2;
+
+The axis values do not have a defined range in the original 0.x driver,
+except for that the values are non-negative. The 1.2.8+ drivers use a
+fixed range for reporting the values, 1 being the minimum, 128 the
+center, and 255 maximum value.
+
+The v0.8.0.2 driver also had an interface for 'digital joysticks', (now
+called Multisystem joysticks in this driver), under /dev/djsX. This driver
+doesn't try to be compatible with that interface.
+
+
+6. Final Notes
+~~~~~~~~~~~~~~
+
+____/| Comments, additions, and specially corrections are welcome.
+\ o.O| Documentation valid for at least version 1.2.8 of the joystick
+ =(_)= driver and as usual, the ultimate source for documentation is
+ U to "Use The Source Luke" or, at your convenience, Vojtech ;)
+
+ - Ragnar
+EOF
diff --git a/Documentation/input/joystick-parport.txt b/Documentation/input/joystick-parport.txt
new file mode 100644
index 0000000..1c856f3
--- /dev/null
+++ b/Documentation/input/joystick-parport.txt
@@ -0,0 +1,542 @@
+ Linux Joystick parport drivers v2.0
+ (c) 1998-2000 Vojtech Pavlik <vojtech@ucw.cz>
+ (c) 1998 Andree Borrmann <a.borrmann@tu-bs.de>
+ Sponsored by SuSE
+----------------------------------------------------------------------------
+
+0. Disclaimer
+~~~~~~~~~~~~~
+ Any information in this file is provided as-is, without any guarantee that
+it will be true. So, use it at your own risk. The possible damages that can
+happen include burning your parallel port, and/or the sticks and joystick
+and maybe even more. Like when a lightning kills you it is not our problem.
+
+1. Intro
+~~~~~~~~
+ The joystick parport drivers are used for joysticks and gamepads not
+originally designed for PCs and other computers Linux runs on. Because of
+that, PCs usually lack the right ports to connect these devices to. Parallel
+port, because of its ability to change single bits at will, and providing
+both output and input bits is the most suitable port on the PC for
+connecting such devices.
+
+2. Devices supported
+~~~~~~~~~~~~~~~~~~~~
+ Many console and 8-bit computer gamepads and joysticks are supported. The
+following subsections discuss usage of each.
+
+2.1 NES and SNES
+~~~~~~~~~~~~~~~~
+ The Nintendo Entertainment System and Super Nintendo Entertainment System
+gamepads are widely available, and easy to get. Also, they are quite easy to
+connect to a PC, and don't need much processing speed (108 us for NES and
+165 us for SNES, compared to about 1000 us for PC gamepads) to communicate
+with them.
+
+ All NES and SNES use the same synchronous serial protocol, clocked from
+the computer's side (and thus timing insensitive). To allow up to 5 NES
+and/or SNES gamepads and/or SNES mice connected to the parallel port at once,
+the output lines of the parallel port are shared, while one of 5 available
+input lines is assigned to each gamepad.
+
+ This protocol is handled by the gamecon.c driver, so that's the one
+you'll use for NES, SNES gamepads and SNES mice.
+
+ The main problem with PC parallel ports is that they don't have +5V power
+source on any of their pins. So, if you want a reliable source of power
+for your pads, use either keyboard or joystick port, and make a pass-through
+cable. You can also pull the power directly from the power supply (the red
+wire is +5V).
+
+ If you want to use the parallel port only, you can take the power is from
+some data pin. For most gamepad and parport implementations only one pin is
+needed, and I'd recommend pin 9 for that, the highest data bit. On the other
+hand, if you are not planning to use anything else than NES / SNES on the
+port, anything between and including pin 4 and pin 9 will work.
+
+(pin 9) -----> Power
+
+ Unfortunately, there are pads that need a lot more of power, and parallel
+ports that can't give much current through the data pins. If this is your
+case, you'll need to use diodes (as a prevention of destroying your parallel
+port), and combine the currents of two or more data bits together.
+
+ Diodes
+(pin 9) ----|>|-------+------> Power
+ |
+(pin 8) ----|>|-------+
+ |
+(pin 7) ----|>|-------+
+ |
+ <and so on> :
+ |
+(pin 4) ----|>|-------+
+
+ Ground is quite easy. On PC's parallel port the ground is on any of the
+pins from pin 18 to pin 25. So use any pin of these you like for the ground.
+
+(pin 18) -----> Ground
+
+ NES and SNES pads have two input bits, Clock and Latch, which drive the
+serial transfer. These are connected to pins 2 and 3 of the parallel port,
+respectively.
+
+(pin 2) -----> Clock
+(pin 3) -----> Latch
+
+ And the last thing is the NES / SNES data wire. Only that isn't shared and
+each pad needs its own data pin. The parallel port pins are:
+
+(pin 10) -----> Pad 1 data
+(pin 11) -----> Pad 2 data
+(pin 12) -----> Pad 3 data
+(pin 13) -----> Pad 4 data
+(pin 15) -----> Pad 5 data
+
+ Note that pin 14 is not used, since it is not an input pin on the parallel
+port.
+
+ This is everything you need on the PC's side of the connection, now on to
+the gamepads side. The NES and SNES have different connectors. Also, there
+are quite a lot of NES clones, and because Nintendo used proprietary
+connectors for their machines, the cloners couldn't and used standard D-Cannon
+connectors. Anyway, if you've got a gamepad, and it has buttons A, B, Turbo
+A, Turbo B, Select and Start, and is connected through 5 wires, then it is
+either a NES or NES clone and will work with this connection. SNES gamepads
+also use 5 wires, but have more buttons. They will work as well, of course.
+
+Pinout for NES gamepads Pinout for SNES gamepads and mice
+
+ +----> Power +-----------------------\
+ | 7 | o o o o | x x o | 1
+ 5 +---------+ 7 +-----------------------/
+ | x x o \ | | | | |
+ | o o o o | | | | | +-> Ground
+ 4 +------------+ 1 | | | +------------> Data
+ | | | | | | +---------------> Latch
+ | | | +-> Ground | +------------------> Clock
+ | | +----> Clock +---------------------> Power
+ | +-------> Latch
+ +----------> Data
+
+Pinout for NES clone (db9) gamepads Pinout for NES clone (db15) gamepads
+
+ +---------> Clock +-----------------> Data
+ | +-------> Latch | +---> Ground
+ | | +-----> Data | |
+ | | | ___________________
+ _____________ 8 \ o x x x x x x o / 1
+ 5 \ x o o o x / 1 \ o x x o x x o /
+ \ x o x o / 15 `~~~~~~~~~~~~~' 9
+ 9 `~~~~~~~' 6 | | |
+ | | | | +----> Clock
+ | +----> Power | +----------> Latch
+ +--------> Ground +----------------> Power
+
+2.2 Multisystem joysticks
+~~~~~~~~~~~~~~~~~~~~~~~~~
+ In the era of 8-bit machines, there was something like de-facto standard
+for joystick ports. They were all digital, and all used D-Cannon 9 pin
+connectors (db9). Because of that, a single joystick could be used without
+hassle on Atari (130, 800XE, 800XL, 2600, 7200), Amiga, Commodore C64,
+Amstrad CPC, Sinclair ZX Spectrum and many other machines. That's why these
+joysticks are called "Multisystem".
+
+ Now their pinout:
+
+ +---------> Right
+ | +-------> Left
+ | | +-----> Down
+ | | | +---> Up
+ | | | |
+ _____________
+5 \ x o o o o / 1
+ \ x o x o /
+ 9 `~~~~~~~' 6
+ | |
+ | +----> Button
+ +--------> Ground
+
+ However, as time passed, extensions to this standard developed, and these
+were not compatible with each other:
+
+
+ Atari 130, 800/XL/XE MSX
+
+ +-----------> Power
+ +---------> Right | +---------> Right
+ | +-------> Left | | +-------> Left
+ | | +-----> Down | | | +-----> Down
+ | | | +---> Up | | | | +---> Up
+ | | | | | | | | |
+ _____________ _____________
+5 \ x o o o o / 1 5 \ o o o o o / 1
+ \ x o o o / \ o o o o /
+ 9 `~~~~~~~' 6 9 `~~~~~~~' 6
+ | | | | | | |
+ | | +----> Button | | | +----> Button 1
+ | +------> Power | | +------> Button 2
+ +--------> Ground | +--------> Output 3
+ +----------> Ground
+
+ Amstrad CPC Commodore C64
+
+ +-----------> Analog Y
+ +---------> Right | +---------> Right
+ | +-------> Left | | +-------> Left
+ | | +-----> Down | | | +-----> Down
+ | | | +---> Up | | | | +---> Up
+ | | | | | | | | |
+ _____________ _____________
+5 \ x o o o o / 1 5 \ o o o o o / 1
+ \ x o o o / \ o o o o /
+ 9 `~~~~~~~' 6 9 `~~~~~~~' 6
+ | | | | | | |
+ | | +----> Button 1 | | | +----> Button
+ | +------> Button 2 | | +------> Power
+ +--------> Ground | +--------> Ground
+ +----------> Analog X
+
+ Sinclair Spectrum +2A/+3 Amiga 1200
+
+ +-----------> Up +-----------> Button 3
+ | +---------> Fire | +---------> Right
+ | | | | +-------> Left
+ | | +-----> Ground | | | +-----> Down
+ | | | | | | | +---> Up
+ | | | | | | | |
+ _____________ _____________
+5 \ o o x o x / 1 5 \ o o o o o / 1
+ \ o o o o / \ o o o o /
+ 9 `~~~~~~~' 6 9 `~~~~~~~' 6
+ | | | | | | | |
+ | | | +----> Right | | | +----> Button 1
+ | | +------> Left | | +------> Power
+ | +--------> Ground | +--------> Ground
+ +----------> Down +----------> Button 2
+
+ And there were many others.
+
+2.2.1 Multisystem joysticks using db9.c
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ For the Multisystem joysticks, and their derivatives, the db9.c driver
+was written. It allows only one joystick / gamepad per parallel port, but
+the interface is easy to build and works with almost anything.
+
+ For the basic 1-button Multisystem joystick you connect its wires to the
+parallel port like this:
+
+(pin 1) -----> Power
+(pin 18) -----> Ground
+
+(pin 2) -----> Up
+(pin 3) -----> Down
+(pin 4) -----> Left
+(pin 5) -----> Right
+(pin 6) -----> Button 1
+
+ However, if the joystick is switch based (eg. clicks when you move it),
+you might or might not, depending on your parallel port, need 10 kOhm pullup
+resistors on each of the direction and button signals, like this:
+
+(pin 2) ------------+------> Up
+ Resistor |
+(pin 1) --[10kOhm]--+
+
+ Try without, and if it doesn't work, add them. For TTL based joysticks /
+gamepads the pullups are not needed.
+
+ For joysticks with two buttons you connect the second button to pin 7 on
+the parallel port.
+
+(pin 7) -----> Button 2
+
+ And that's it.
+
+ On a side note, if you have already built a different adapter for use with
+the digital joystick driver 0.8.0.2, this is also supported by the db9.c
+driver, as device type 8. (See section 3.2)
+
+2.2.2 Multisystem joysticks using gamecon.c
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ For some people just one joystick per parallel port is not enough, and/or
+want to use them on one parallel port together with NES/SNES/PSX pads. This is
+possible using the gamecon.c. It supports up to 5 devices of the above types,
+including 1 and 2 buttons Multisystem joysticks.
+
+ However, there is nothing for free. To allow more sticks to be used at
+once, you need the sticks to be purely switch based (that is non-TTL), and
+not to need power. Just a plain simple six switches inside. If your
+joystick can do more (eg. turbofire) you'll need to disable it totally first
+if you want to use gamecon.c.
+
+ Also, the connection is a bit more complex. You'll need a bunch of diodes,
+and one pullup resistor. First, you connect the Directions and the button
+the same as for db9, however with the diodes inbetween.
+
+ Diodes
+(pin 2) -----|<|----> Up
+(pin 3) -----|<|----> Down
+(pin 4) -----|<|----> Left
+(pin 5) -----|<|----> Right
+(pin 6) -----|<|----> Button 1
+
+ For two button sticks you also connect the other button.
+
+(pin 7) -----|<|----> Button 2
+
+ And finally, you connect the Ground wire of the joystick, like done in
+this little schematic to Power and Data on the parallel port, as described
+for the NES / SNES pads in section 2.1 of this file - that is, one data pin
+for each joystick. The power source is shared.
+
+Data ------------+-----> Ground
+ Resistor |
+Power --[10kOhm]--+
+
+ And that's all, here we go!
+
+2.2.3 Multisystem joysticks using turbografx.c
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The TurboGraFX interface, designed by
+
+ Steffen Schwenke <schwenke@burg-halle.de>
+
+ allows up to 7 Multisystem joysticks connected to the parallel port. In
+Steffen's version, there is support for up to 5 buttons per joystick. However,
+since this doesn't work reliably on all parallel ports, the turbografx.c driver
+supports only one button per joystick. For more information on how to build the
+interface, see
+
+ http://www2.burg-halle.de/~schwenke/parport.html
+
+2.3 Sony Playstation
+~~~~~~~~~~~~~~~~~~~~
+
+ The PSX controller is supported by the gamecon.c. Pinout of the PSX
+controller (compatible with DirectPadPro):
+
+ +---------+---------+---------+
+9 | o o o | o o o | o o o | 1 parallel
+ \________|_________|________/ port pins
+ | | | | | |
+ | | | | | +--------> Clock --- (4)
+ | | | | +------------> Select --- (3)
+ | | | +---------------> Power --- (5-9)
+ | | +------------------> Ground --- (18-25)
+ | +-------------------------> Command --- (2)
+ +----------------------------> Data --- (one of 10,11,12,13,15)
+
+ The driver supports these controllers:
+
+ * Standard PSX Pad
+ * NegCon PSX Pad
+ * Analog PSX Pad (red mode)
+ * Analog PSX Pad (green mode)
+ * PSX Rumble Pad
+ * PSX DDR Pad
+
+2.4 Sega
+~~~~~~~~
+ All the Sega controllers are more or less based on the standard 2-button
+Multisystem joystick. However, since they don't use switches and use TTL
+logic, the only driver usable with them is the db9.c driver.
+
+2.4.1 Sega Master System
+~~~~~~~~~~~~~~~~~~~~~~~~
+ The SMS gamepads are almost exactly the same as normal 2-button
+Multisystem joysticks. Set the driver to Multi2 mode, use the corresponding
+parallel port pins, and the following schematic:
+
+ +-----------> Power
+ | +---------> Right
+ | | +-------> Left
+ | | | +-----> Down
+ | | | | +---> Up
+ | | | | |
+ _____________
+5 \ o o o o o / 1
+ \ o o x o /
+ 9 `~~~~~~~' 6
+ | | |
+ | | +----> Button 1
+ | +--------> Ground
+ +----------> Button 2
+
+2.4.2 Sega Genesis aka MegaDrive
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The Sega Genesis (in Europe sold as Sega MegaDrive) pads are an extension
+to the Sega Master System pads. They use more buttons (3+1, 5+1, 6+1). Use
+the following schematic:
+
+ +-----------> Power
+ | +---------> Right
+ | | +-------> Left
+ | | | +-----> Down
+ | | | | +---> Up
+ | | | | |
+ _____________
+5 \ o o o o o / 1
+ \ o o o o /
+ 9 `~~~~~~~' 6
+ | | | |
+ | | | +----> Button 1
+ | | +------> Select
+ | +--------> Ground
+ +----------> Button 2
+
+ The Select pin goes to pin 14 on the parallel port.
+
+(pin 14) -----> Select
+
+ The rest is the same as for Multi2 joysticks using db9.c
+
+2.4.3 Sega Saturn
+~~~~~~~~~~~~~~~~~
+ Sega Saturn has eight buttons, and to transfer that, without hacks like
+Genesis 6 pads use, it needs one more select pin. Anyway, it is still
+handled by the db9.c driver. Its pinout is very different from anything
+else. Use this schematic:
+
+ +-----------> Select 1
+ | +---------> Power
+ | | +-------> Up
+ | | | +-----> Down
+ | | | | +---> Ground
+ | | | | |
+ _____________
+5 \ o o o o o / 1
+ \ o o o o /
+ 9 `~~~~~~~' 6
+ | | | |
+ | | | +----> Select 2
+ | | +------> Right
+ | +--------> Left
+ +----------> Power
+
+ Select 1 is pin 14 on the parallel port, Select 2 is pin 16 on the
+parallel port.
+
+(pin 14) -----> Select 1
+(pin 16) -----> Select 2
+
+ The other pins (Up, Down, Right, Left, Power, Ground) are the same as for
+Multi joysticks using db9.c
+
+3. The drivers
+~~~~~~~~~~~~~~
+ There are three drivers for the parallel port interfaces. Each, as
+described above, allows to connect a different group of joysticks and pads.
+Here are described their command lines:
+
+3.1 gamecon.c
+~~~~~~~~~~~~~
+ Using gamecon.c you can connect up to five devices to one parallel port. It
+uses the following kernel/module command line:
+
+ gamecon.map=port,pad1,pad2,pad3,pad4,pad5
+
+ Where 'port' the number of the parport interface (eg. 0 for parport0).
+
+ And 'pad1' to 'pad5' are pad types connected to different data input pins
+(10,11,12,13,15), as described in section 2.1 of this file.
+
+ The types are:
+
+ Type | Joystick/Pad
+ --------------------
+ 0 | None
+ 1 | SNES pad
+ 2 | NES pad
+ 4 | Multisystem 1-button joystick
+ 5 | Multisystem 2-button joystick
+ 6 | N64 pad
+ 7 | Sony PSX controller
+ 8 | Sony PSX DDR controller
+ 9 | SNES mouse
+
+ The exact type of the PSX controller type is autoprobed when used, so
+hot swapping should work (but is not recommended).
+
+ Should you want to use more than one of parallel ports at once, you can use
+gamecon.map2 and gamecon.map3 as additional command line parameters for two
+more parallel ports.
+
+ There are two options specific to PSX driver portion. gamecon.psx_delay sets
+the command delay when talking to the controllers. The default of 25 should
+work but you can try lowering it for better performance. If your pads don't
+respond try raising it until they work. Setting the type to 8 allows the
+driver to be used with Dance Dance Revolution or similar games. Arrow keys are
+registered as key presses instead of X and Y axes.
+
+3.2 db9.c
+~~~~~~~~~
+ Apart from making an interface, there is nothing difficult on using the
+db9.c driver. It uses the following kernel/module command line:
+
+ db9.dev=port,type
+
+ Where 'port' is the number of the parport interface (eg. 0 for parport0).
+
+ Caveat here: This driver only works on bidirectional parallel ports. If
+your parallel port is recent enough, you should have no trouble with this.
+Old parallel ports may not have this feature.
+
+ 'Type' is the type of joystick or pad attached:
+
+ Type | Joystick/Pad
+ --------------------
+ 0 | None
+ 1 | Multisystem 1-button joystick
+ 2 | Multisystem 2-button joystick
+ 3 | Genesis pad (3+1 buttons)
+ 5 | Genesis pad (5+1 buttons)
+ 6 | Genesis pad (6+2 buttons)
+ 7 | Saturn pad (8 buttons)
+ 8 | Multisystem 1-button joystick (v0.8.0.2 pin-out)
+ 9 | Two Multisystem 1-button joysticks (v0.8.0.2 pin-out)
+ 10 | Amiga CD32 pad
+
+ Should you want to use more than one of these joysticks/pads at once, you
+can use db9.dev2 and db9.dev3 as additional command line parameters for two
+more joysticks/pads.
+
+3.3 turbografx.c
+~~~~~~~~~~~~~~~~
+ The turbografx.c driver uses a very simple kernel/module command line:
+
+ turbografx.map=port,js1,js2,js3,js4,js5,js6,js7
+
+ Where 'port' is the number of the parport interface (eg. 0 for parport0).
+
+ 'jsX' is the number of buttons the Multisystem joysticks connected to the
+interface ports 1-7 have. For a standard multisystem joystick, this is 1.
+
+ Should you want to use more than one of these interfaces at once, you can
+use turbografx.map2 and turbografx.map3 as additional command line parameters
+for two more interfaces.
+
+3.4 PC parallel port pinout
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ .----------------------------------------.
+ At the PC: \ 13 12 11 10 9 8 7 6 5 4 3 2 1 /
+ \ 25 24 23 22 21 20 19 18 17 16 15 14 /
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Pin | Name | Description
+ ~~~~~~|~~~~~~~~~|~~~~~~~~~~
+ 1 | /STROBE | Strobe
+ 2-9 | D0-D7 | Data Bit 0-7
+ 10 | /ACK | Acknowledge
+ 11 | BUSY | Busy
+ 12 | PE | Paper End
+ 13 | SELIN | Select In
+ 14 | /AUTOFD | Autofeed
+ 15 | /ERROR | Error
+ 16 | /INIT | Initialize
+ 17 | /SEL | Select
+ 18-25 | GND | Signal Ground
+
+3.5 End
+~~~~~~~
+ That's all, folks! Have fun!
diff --git a/Documentation/input/joystick.txt b/Documentation/input/joystick.txt
new file mode 100644
index 0000000..154d767
--- /dev/null
+++ b/Documentation/input/joystick.txt
@@ -0,0 +1,586 @@
+ Linux Joystick driver v2.0.0
+ (c) 1996-2000 Vojtech Pavlik <vojtech@ucw.cz>
+ Sponsored by SuSE
+----------------------------------------------------------------------------
+
+0. Disclaimer
+~~~~~~~~~~~~~
+ This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+ This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+ You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Should you need to contact me, the author, you can do so either by e-mail
+- mail your message to <vojtech@ucw.cz>, or by paper mail: Vojtech Pavlik,
+Simunkova 1594, Prague 8, 182 00 Czech Republic
+
+ For your convenience, the GNU General Public License version 2 is included
+in the package: See the file COPYING.
+
+1. Intro
+~~~~~~~~
+ The joystick driver for Linux provides support for a variety of joysticks
+and similar devices. It is based on a larger project aiming to support all
+input devices in Linux.
+
+ Should you encounter any problems while using the driver, or joysticks
+this driver can't make complete use of, I'm very interested in hearing about
+them. Bug reports and success stories are also welcome.
+
+ The input project website is at:
+
+ http://atrey.karlin.mff.cuni.cz/~vojtech/input/
+
+ There is also a mailing list for the driver at:
+
+ listproc@atrey.karlin.mff.cuni.cz
+
+send "subscribe linux-joystick Your Name" to subscribe to it.
+
+2. Usage
+~~~~~~~~
+ For basic usage you just choose the right options in kernel config and
+you should be set.
+
+2.1 inpututils
+~~~~~~~~~~~~~~
+For testing and other purposes (for example serial devices), a set of
+utilities is available at the abovementioned website. I suggest you download
+and install it before going on.
+
+2.2 Device nodes
+~~~~~~~~~~~~~~~~
+For applications to be able to use the joysticks,
+you'll have to manually create these nodes in /dev:
+
+cd /dev
+rm js*
+mkdir input
+mknod input/js0 c 13 0
+mknod input/js1 c 13 1
+mknod input/js2 c 13 2
+mknod input/js3 c 13 3
+ln -s input/js0 js0
+ln -s input/js1 js1
+ln -s input/js2 js2
+ln -s input/js3 js3
+
+For testing with inpututils it's also convenient to create these:
+
+mknod input/event0 c 13 64
+mknod input/event1 c 13 65
+mknod input/event2 c 13 66
+mknod input/event3 c 13 67
+
+2.4 Modules needed
+~~~~~~~~~~~~~~~~~~
+ For all joystick drivers to function, you'll need the userland interface
+module in kernel, either loaded or compiled in:
+
+ modprobe joydev
+
+ For gameport joysticks, you'll have to load the gameport driver as well;
+
+ modprobe ns558
+
+ And for serial port joysticks, you'll need the serial input line
+discipline module loaded and the inputattach utility started:
+
+ modprobe serport
+ inputattach -xxx /dev/tts/X &
+
+ In addition to that, you'll need the joystick driver module itself, most
+usually you'll have an analog joystick:
+
+ modprobe analog
+
+ For automatic module loading, something like this might work - tailor to
+your needs:
+
+ alias tty-ldisc-2 serport
+ alias char-major-13 input
+ above input joydev ns558 analog
+ options analog map=gamepad,none,2btn
+
+2.5 Verifying that it works
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ For testing the joystick driver functionality, there is the jstest
+program in the utilities package. You run it by typing:
+
+ jstest /dev/js0
+
+ And it should show a line with the joystick values, which update as you
+move the stick, and press its buttons. The axes should all be zero when the
+joystick is in the center position. They should not jitter by themselves to
+other close values, and they also should be steady in any other position of
+the stick. They should have the full range from -32767 to 32767. If all this
+is met, then it's all fine, and you can play the games. :)
+
+ If it's not, then there might be a problem. Try to calibrate the joystick,
+and if it still doesn't work, read the drivers section of this file, the
+troubleshooting section, and the FAQ.
+
+2.6. Calibration
+~~~~~~~~~~~~~~~~
+ For most joysticks you won't need any manual calibration, since the
+joystick should be autocalibrated by the driver automagically. However, with
+some analog joysticks, that either do not use linear resistors, or if you
+want better precision, you can use the jscal program
+
+ jscal -c /dev/js0
+
+ included in the joystick package to set better correction coefficients than
+what the driver would choose itself.
+
+ After calibrating the joystick you can verify if you like the new
+calibration using the jstest command, and if you do, you then can save the
+correction coefficients into a file
+
+ jscal -p /dev/js0 > /etc/joystick.cal
+
+ And add a line to your rc script executing that file
+
+ source /etc/joystick.cal
+
+ This way, after the next reboot your joystick will remain calibrated. You
+can also add the jscal -p line to your shutdown script.
+
+
+3. HW specific driver information
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+In this section each of the separate hardware specific drivers is described.
+
+3.1 Analog joysticks
+~~~~~~~~~~~~~~~~~~~~
+ The analog.c uses the standard analog inputs of the gameport, and thus
+supports all standard joysticks and gamepads. It uses a very advanced
+routine for this, allowing for data precision that can't be found on any
+other system.
+
+ It also supports extensions like additional hats and buttons compatible
+with CH Flightstick Pro, ThrustMaster FCS or 6 and 8 button gamepads. Saitek
+Cyborg 'digital' joysticks are also supported by this driver, because
+they're basically souped up CHF sticks.
+
+ However the only types that can be autodetected are:
+
+* 2-axis, 4-button joystick
+* 3-axis, 4-button joystick
+* 4-axis, 4-button joystick
+* Saitek Cyborg 'digital' joysticks
+
+ For other joystick types (more/less axes, hats, and buttons) support
+you'll need to specify the types either on the kernel command line or on the
+module command line, when inserting analog into the kernel. The
+parameters are:
+
+ analog.map=<type1>,<type2>,<type3>,....
+
+ 'type' is type of the joystick from the table below, defining joysticks
+present on gameports in the system, starting with gameport0, second 'type'
+entry defining joystick on gameport1 and so on.
+
+ Type | Meaning
+ -----------------------------------
+ none | No analog joystick on that port
+ auto | Autodetect joystick
+ 2btn | 2-button n-axis joystick
+ y-joy | Two 2-button 2-axis joysticks on an Y-cable
+ y-pad | Two 2-button 2-axis gamepads on an Y-cable
+ fcs | Thrustmaster FCS compatible joystick
+ chf | Joystick with a CH Flightstick compatible hat
+ fullchf | CH Flightstick compatible with two hats and 6 buttons
+ gamepad | 4/6-button n-axis gamepad
+ gamepad8 | 8-button 2-axis gamepad
+
+ In case your joystick doesn't fit in any of the above categories, you can
+specify the type as a number by combining the bits in the table below. This
+is not recommended unless you really know what are you doing. It's not
+dangerous, but not simple either.
+
+ Bit | Meaning
+ --------------------------
+ 0 | Axis X1
+ 1 | Axis Y1
+ 2 | Axis X2
+ 3 | Axis Y2
+ 4 | Button A
+ 5 | Button B
+ 6 | Button C
+ 7 | Button D
+ 8 | CHF Buttons X and Y
+ 9 | CHF Hat 1
+ 10 | CHF Hat 2
+ 11 | FCS Hat
+ 12 | Pad Button X
+ 13 | Pad Button Y
+ 14 | Pad Button U
+ 15 | Pad Button V
+ 16 | Saitek F1-F4 Buttons
+ 17 | Saitek Digital Mode
+ 19 | GamePad
+ 20 | Joy2 Axis X1
+ 21 | Joy2 Axis Y1
+ 22 | Joy2 Axis X2
+ 23 | Joy2 Axis Y2
+ 24 | Joy2 Button A
+ 25 | Joy2 Button B
+ 26 | Joy2 Button C
+ 27 | Joy2 Button D
+ 31 | Joy2 GamePad
+
+3.2 Microsoft SideWinder joysticks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Microsoft 'Digital Overdrive' protocol is supported by the sidewinder.c
+module. All currently supported joysticks:
+
+* Microsoft SideWinder 3D Pro
+* Microsoft SideWinder Force Feedback Pro
+* Microsoft SideWinder Force Feedback Wheel
+* Microsoft SideWinder FreeStyle Pro
+* Microsoft SideWinder GamePad (up to four, chained)
+* Microsoft SideWinder Precision Pro
+* Microsoft SideWinder Precision Pro USB
+
+ are autodetected, and thus no module parameters are needed.
+
+ There is one caveat with the 3D Pro. There are 9 buttons reported,
+although the joystick has only 8. The 9th button is the mode switch on the
+rear side of the joystick. However, moving it, you'll reset the joystick,
+and make it unresponsive for about a one third of a second. Furthermore, the
+joystick will also re-center itself, taking the position it was in during
+this time as a new center position. Use it if you want, but think first.
+
+ The SideWinder Standard is not a digital joystick, and thus is supported
+by the analog driver described above.
+
+3.3 Logitech ADI devices
+~~~~~~~~~~~~~~~~~~~~~~~~
+ Logitech ADI protocol is supported by the adi.c module. It should support
+any Logitech device using this protocol. This includes, but is not limited
+to:
+
+* Logitech CyberMan 2
+* Logitech ThunderPad Digital
+* Logitech WingMan Extreme Digital
+* Logitech WingMan Formula
+* Logitech WingMan Interceptor
+* Logitech WingMan GamePad
+* Logitech WingMan GamePad USB
+* Logitech WingMan GamePad Extreme
+* Logitech WingMan Extreme Digital 3D
+
+ ADI devices are autodetected, and the driver supports up to two (any
+combination of) devices on a single gameport, using an Y-cable or chained
+together.
+
+ Logitech WingMan Joystick, Logitech WingMan Attack, Logitech WingMan
+Extreme and Logitech WingMan ThunderPad are not digital joysticks and are
+handled by the analog driver described above. Logitech WingMan Warrior and
+Logitech Magellan are supported by serial drivers described below. Logitech
+WingMan Force and Logitech WingMan Formula Force are supported by the
+I-Force driver described below. Logitech CyberMan is not supported yet.
+
+3.4 Gravis GrIP
+~~~~~~~~~~~~~~~
+ Gravis GrIP protocol is supported by the grip.c module. It currently
+supports:
+
+* Gravis GamePad Pro
+* Gravis BlackHawk Digital
+* Gravis Xterminator
+* Gravis Xterminator DualControl
+
+ All these devices are autodetected, and you can even use any combination
+of up to two of these pads either chained together or using an Y-cable on a
+single gameport.
+
+GrIP MultiPort isn't supported yet. Gravis Stinger is a serial device and is
+supported by the stinger driver. Other Gravis joysticks are supported by the
+analog driver.
+
+3.5 FPGaming A3D and MadCatz A3D
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The Assassin 3D protocol created by FPGaming, is used both by FPGaming
+themselves and is licensed to MadCatz. A3D devices are supported by the
+a3d.c module. It currently supports:
+
+* FPGaming Assassin 3D
+* MadCatz Panther
+* MadCatz Panther XL
+
+ All these devices are autodetected. Because the Assassin 3D and the Panther
+allow connecting analog joysticks to them, you'll need to load the analog
+driver as well to handle the attached joysticks.
+
+ The trackball should work with USB mousedev module as a normal mouse. See
+the USB documentation for how to setup an USB mouse.
+
+3.6 ThrustMaster DirectConnect (BSP)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The TM DirectConnect (BSP) protocol is supported by the tmdc.c
+module. This includes, but is not limited to:
+
+* ThrustMaster Millenium 3D Inceptor
+* ThrustMaster 3D Rage Pad
+* ThrustMaster Fusion Digital Game Pad
+
+ Devices not directly supported, but hopefully working are:
+
+* ThrustMaster FragMaster
+* ThrustMaster Attack Throttle
+
+ If you have one of these, contact me.
+
+ TMDC devices are autodetected, and thus no parameters to the module
+are needed. Up to two TMDC devices can be connected to one gameport, using
+an Y-cable.
+
+3.7 Creative Labs Blaster
+~~~~~~~~~~~~~~~~~~~~~~~~~
+ The Blaster protocol is supported by the cobra.c module. It supports only
+the:
+
+* Creative Blaster GamePad Cobra
+
+ Up to two of these can be used on a single gameport, using an Y-cable.
+
+3.8 Genius Digital joysticks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The Genius digitally communicating joysticks are supported by the gf2k.c
+module. This includes:
+
+* Genius Flight2000 F-23 joystick
+* Genius Flight2000 F-31 joystick
+* Genius G-09D gamepad
+
+ Other Genius digital joysticks are not supported yet, but support can be
+added fairly easily.
+
+3.9 InterAct Digital joysticks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The InterAct digitally communicating joysticks are supported by the
+interact.c module. This includes:
+
+* InterAct HammerHead/FX gamepad
+* InterAct ProPad8 gamepad
+
+ Other InterAct digital joysticks are not supported yet, but support can be
+added fairly easily.
+
+3.10 PDPI Lightning 4 gamecards
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ PDPI Lightning 4 gamecards are supported by the lightning.c module.
+Once the module is loaded, the analog driver can be used to handle the
+joysticks. Digitally communicating joystick will work only on port 0, while
+using Y-cables, you can connect up to 8 analog joysticks to a single L4
+card, 16 in case you have two in your system.
+
+3.11 Trident 4DWave / Aureal Vortex
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Soundcards with a Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipsets
+provide an "Enhanced Game Port" mode where the soundcard handles polling the
+joystick. This mode is supported by the pcigame.c module. Once loaded the
+analog driver can use the enhanced features of these gameports..
+
+3.13 Crystal SoundFusion
+~~~~~~~~~~~~~~~~~~~~~~~~
+ Soundcards with Crystal SoundFusion chipsets provide an "Enhanced Game
+Port", much like the 4DWave or Vortex above. This, and also the normal mode
+for the port of the SoundFusion is supported by the cs461x.c module.
+
+3.14 SoundBlaster Live!
+~~~~~~~~~~~~~~~~~~~~~~~~
+ The Live! has a special PCI gameport, which, although it doesn't provide
+any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than
+it's ISA counterparts. It also requires special support, hence the
+emu10k1-gp.c module for it instead of the normal ns558.c one.
+
+3.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ These PCI soundcards have specific gameports. They are handled by the
+sound drivers themselves. Make sure you select gameport support in the
+joystick menu and sound card support in the sound menu for your appropriate
+card.
+
+3.16 Amiga
+~~~~~~~~~~
+ Amiga joysticks, connected to an Amiga, are supported by the amijoy.c
+driver. Since they can't be autodetected, the driver has a command line.
+
+ amijoy.map=<a>,<b>
+
+ a and b define the joysticks connected to the JOY0DAT and JOY1DAT ports of
+the Amiga.
+
+ Value | Joystick type
+ ---------------------
+ 0 | None
+ 1 | 1-button digital joystick
+
+ No more joystick types are supported now, but that should change in the
+future if I get an Amiga in the reach of my fingers.
+
+3.17 Game console and 8-bit pads and joysticks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+See joystick-parport.txt for more info.
+
+3.18 SpaceTec/LabTec devices
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ SpaceTec serial devices communicate using the SpaceWare protocol. It is
+supported by the spaceorb.c and spaceball.c drivers. The devices currently
+supported by spaceorb.c are:
+
+* SpaceTec SpaceBall Avenger
+* SpaceTec SpaceOrb 360
+
+Devices currently supported by spaceball.c are:
+
+* SpaceTec SpaceBall 4000 FLX
+
+ In addition to having the spaceorb/spaceball and serport modules in the
+kernel, you also need to attach a serial port to it. to do that, run the
+inputattach program:
+
+ inputattach --spaceorb /dev/tts/x &
+or
+ inputattach --spaceball /dev/tts/x &
+
+where /dev/tts/x is the serial port which the device is connected to. After
+doing this, the device will be reported and will start working.
+
+ There is one caveat with the SpaceOrb. The button #6, the on the bottom
+side of the orb, although reported as an ordinary button, causes internal
+recentering of the spaceorb, moving the zero point to the position in which
+the ball is at the moment of pressing the button. So, think first before
+you bind it to some other function.
+
+SpaceTec SpaceBall 2003 FLX and 3003 FLX are not supported yet.
+
+3.19 Logitech SWIFT devices
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The SWIFT serial protocol is supported by the warrior.c module. It
+currently supports only the:
+
+* Logitech WingMan Warrior
+
+but in the future, Logitech CyberMan (the original one, not CM2) could be
+supported as well. To use the module, you need to run inputattach after you
+insert/compile the module into your kernel:
+
+ inputattach --warrior /dev/tts/x &
+
+/dev/tts/x is the serial port your Warrior is attached to.
+
+3.20 Magellan / Space Mouse
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The Magellan (or Space Mouse), manufactured by LogiCad3d (formerly Space
+Systems), for many other companies (Logitech, HP, ...) is supported by the
+joy-magellan module. It currently supports only the:
+
+* Magellan 3D
+* Space Mouse
+
+models, the additional buttons on the 'Plus' versions are not supported yet.
+
+ To use it, you need to attach the serial port to the driver using the
+
+ inputattach --magellan /dev/tts/x &
+
+command. After that the Magellan will be detected, initialized, will beep,
+and the /dev/input/jsX device should become usable.
+
+3.21 I-Force devices
+~~~~~~~~~~~~~~~~~~~~
+ All I-Force devices are supported by the iforce module. This includes:
+
+* AVB Mag Turbo Force
+* AVB Top Shot Pegasus
+* AVB Top Shot Force Feedback Racing Wheel
+* Logitech WingMan Force
+* Logitech WingMan Force Wheel
+* Guillemot Race Leader Force Feedback
+* Guillemot Force Feedback Racing Wheel
+* Thrustmaster Motor Sport GT
+
+ To use it, you need to attach the serial port to the driver using the
+
+ inputattach --iforce /dev/tts/x &
+
+command. After that the I-Force device will be detected, and the
+/dev/input/jsX device should become usable.
+
+ In case you're using the device via the USB port, the inputattach command
+isn't needed.
+
+ The I-Force driver now supports force feedback via the event interface.
+
+ Please note that Logitech WingMan *3D devices are _not_ supported by this
+module, rather by hid. Force feedback is not supported for those devices.
+Logitech gamepads are also hid devices.
+
+3.22 Gravis Stinger gamepad
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The Gravis Stinger serial port gamepad, designed for use with laptop
+computers, is supported by the stinger.c module. To use it, attach the
+serial port to the driver using:
+
+ inputattach --stinger /dev/tty/x &
+
+where x is the number of the serial port.
+
+4. Troubleshooting
+~~~~~~~~~~~~~~~~~~
+ There is quite a high probability that you run into some problems. For
+testing whether the driver works, if in doubt, use the jstest utility in
+some of its modes. The most useful modes are "normal" - for the 1.x
+interface, and "old" for the "0.x" interface. You run it by typing:
+
+ jstest --normal /dev/input/js0
+ jstest --old /dev/input/js0
+
+ Additionally you can do a test with the evtest utility:
+
+ evtest /dev/input/event0
+
+ Oh, and read the FAQ! :)
+
+5. FAQ
+~~~~~~
+Q: Running 'jstest /dev/js0' results in "File not found" error. What's the
+ cause?
+A: The device files don't exist. Create them (see section 2.2).
+
+Q: Is it possible to connect my old Atari/Commodore/Amiga/console joystick
+ or pad that uses a 9-pin D-type cannon connector to the serial port of my
+ PC?
+A: Yes, it is possible, but it'll burn your serial port or the pad. It
+ won't work, of course.
+
+Q: My joystick doesn't work with Quake / Quake 2. What's the cause?
+A: Quake / Quake 2 don't support joystick. Use joy2key to simulate keypresses
+ for them.
+
+6. Programming Interface
+~~~~~~~~~~~~~~~~~~~~~~~~
+ The 1.0 driver uses a new, event based approach to the joystick driver.
+Instead of the user program polling for the joystick values, the joystick
+driver now reports only any changes of its state. See joystick-api.txt,
+joystick.h and jstest.c included in the joystick package for more
+information. The joystick device can be used in either blocking or
+nonblocking mode and supports select() calls.
+
+ For backward compatibility the old (v0.x) interface is still included.
+Any call to the joystick driver using the old interface will return values
+that are compatible to the old interface. This interface is still limited
+to 2 axes, and applications using it usually decode only 2 buttons, although
+the driver provides up to 32.
diff --git a/Documentation/input/notifier.txt b/Documentation/input/notifier.txt
new file mode 100644
index 0000000..95172ca
--- /dev/null
+++ b/Documentation/input/notifier.txt
@@ -0,0 +1,52 @@
+Keyboard notifier
+
+One can use register_keyboard_notifier to get called back on keyboard
+events (see kbd_keycode() function for details). The passed structure is
+keyboard_notifier_param:
+
+- 'vc' always provide the VC for which the keyboard event applies;
+- 'down' is 1 for a key press event, 0 for a key release;
+- 'shift' is the current modifier state, mask bit indexes are KG_*;
+- 'value' depends on the type of event.
+
+- KBD_KEYCODE events are always sent before other events, value is the keycode.
+- KBD_UNBOUND_KEYCODE events are sent if the keycode is not bound to a keysym.
+ value is the keycode.
+- KBD_UNICODE events are sent if the keycode -> keysym translation produced a
+ unicode character. value is the unicode value.
+- KBD_KEYSYM events are sent if the keycode -> keysym translation produced a
+ non-unicode character. value is the keysym.
+- KBD_POST_KEYSYM events are sent after the treatment of non-unicode keysyms.
+ That permits one to inspect the resulting LEDs for instance.
+
+For each kind of event but the last, the callback may return NOTIFY_STOP in
+order to "eat" the event: the notify loop is stopped and the keyboard event is
+dropped.
+
+In a rough C snippet, we have:
+
+kbd_keycode(keycode) {
+ ...
+ params.value = keycode;
+ if (notifier_call_chain(KBD_KEYCODE,&params) == NOTIFY_STOP)
+ || !bound) {
+ notifier_call_chain(KBD_UNBOUND_KEYCODE,&params);
+ return;
+ }
+
+ if (unicode) {
+ param.value = unicode;
+ if (notifier_call_chain(KBD_UNICODE,&params) == NOTIFY_STOP)
+ return;
+ emit unicode;
+ return;
+ }
+
+ params.value = keysym;
+ if (notifier_call_chain(KBD_KEYSYM,&params) == NOTIFY_STOP)
+ return;
+ apply keysym;
+ notifier_call_chain(KBD_POST_KEYSYM,&params);
+}
+
+NOTE: This notifier is usually called from interrupt context.
diff --git a/Documentation/input/shape.fig b/Documentation/input/shape.fig
new file mode 100644
index 0000000..c22bff8
--- /dev/null
+++ b/Documentation/input/shape.fig
@@ -0,0 +1,65 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter
+100.00
+Single
+-2
+1200 2
+2 1 0 2 0 7 50 0 -1 0.000 0 0 -1 0 0 6
+ 4200 3600 4200 3075 4950 2325 7425 2325 8250 3150 8250 3600
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 4200 3675 4200 5400
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 8250 3675 8250 5400
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 3675 3600 8700 3600
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 8775 3600 10200 3600
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 8325 3150 9075 3150
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 7500 2325 10200 2325
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 3600 3600 3000 3600
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 4125 3075 3000 3075
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 60.00 120.00
+ 0 0 1.00 60.00 120.00
+ 4200 5400 8175 5400
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 60.00 120.00
+ 0 0 1.00 60.00 120.00
+ 10125 2325 10125 3600
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 60.00 120.00
+ 0 0 1.00 60.00 120.00
+ 3000 3150 3000 3600
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 60.00 120.00
+ 0 0 1.00 60.00 120.00
+ 9075 3150 9075 3600
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 4950 2325 4950 1200
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
+ 7425 2325 7425 1200
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 4
+ 4200 3075 4200 2400 3600 1800 3600 1200
+2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 4
+ 8250 3150 8250 2475 8775 1950 8775 1200
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 60.00 120.00
+ 0 0 1.00 60.00 120.00
+ 3600 1275 4950 1275
+2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 60.00 120.00
+ 0 0 1.00 60.00 120.00
+ 7425 1275 8700 1275
+4 1 0 50 0 0 12 0.0000 4 135 1140 6075 5325 Effect duration\001
+4 0 0 50 0 0 12 0.0000 4 180 1305 10200 3000 Effect magnitude\001
+4 0 0 50 0 0 12 0.0000 4 135 780 9150 3450 Fade level\001
+4 1 0 50 0 0 12 0.0000 4 180 1035 4275 1200 Attack length\001
+4 1 0 50 0 0 12 0.0000 4 180 885 8175 1200 Fade length\001
+4 2 0 50 0 0 12 0.0000 4 135 930 2925 3375 Attack level\001
diff --git a/Documentation/input/xpad.txt b/Documentation/input/xpad.txt
new file mode 100644
index 0000000..aae0d40
--- /dev/null
+++ b/Documentation/input/xpad.txt
@@ -0,0 +1,183 @@
+xpad - Linux USB driver for X-Box gamepads
+
+This is the very first release of a driver for X-Box gamepads.
+Basically, this was hacked away in just a few hours, so don't expect
+miracles.
+
+In particular, there is currently NO support for the rumble pack.
+You won't find many ff-aware linux applications anyway.
+
+
+0. Notes
+--------
+
+Driver updated for kernel 2.6.17.11. (Based on a patch for 2.6.11.4.)
+
+The number of buttons/axes reported varies based on 3 things:
+- if you are using a known controller
+- if you are using a known dance pad
+- if using an unknown device (one not listed below), what you set in the
+ module configuration for "Map D-PAD to buttons rather than axes for unknown
+ pads" (module option dpad_to_buttons)
+
+If you set dpad_to_buttons to 0 and you are using an unknown device (one
+not listed below), the driver will map the directional pad to axes (X/Y),
+if you said N it will map the d-pad to buttons, which is needed for dance
+style games to function correctly. The default is Y.
+
+dpad_to_buttons has no effect for known pads.
+
+0.1 Normal Controllers
+----------------------
+With a normal controller, the directional pad is mapped to its own X/Y axes.
+The jstest-program from joystick-1.2.15 (jstest-version 2.1.0) will report 8
+axes and 10 buttons.
+
+All 8 axes work, though they all have the same range (-32768..32767)
+and the zero-setting is not correct for the triggers (I don't know if that
+is some limitation of jstest, since the input device setup should be fine. I
+didn't have a look at jstest itself yet).
+
+All of the 10 buttons work (in digital mode). The six buttons on the
+right side (A, B, X, Y, black, white) are said to be "analog" and
+report their values as 8 bit unsigned, not sure what this is good for.
+
+I tested the controller with quake3, and configuration and
+in game functionality were OK. However, I find it rather difficult to
+play first person shooters with a pad. Your mileage may vary.
+
+
+0.2 Xbox Dance Pads
+-------------------
+When using a known dance pad, jstest will report 6 axes and 14 buttons.
+
+For dance style pads (like the redoctane pad) several changes
+have been made. The old driver would map the d-pad to axes, resulting
+in the driver being unable to report when the user was pressing both
+left+right or up+down, making DDR style games unplayable.
+
+Known dance pads automatically map the d-pad to buttons and will work
+correctly out of the box.
+
+If your dance pad is recognized by the driver but is using axes instead
+of buttons, see section 0.3 - Unknown Controllers
+
+I've tested this with Stepmania, and it works quite well.
+
+
+0.3 Unknown Controllers
+----------------------
+If you have an unknown xbox controller, it should work just fine with
+the default settings.
+
+HOWEVER if you have an unknown dance pad not listed below, it will not
+work UNLESS you set "dpad_to_buttons" to 1 in the module configuration.
+
+PLEASE, if you have an unknown controller, email Dom <binary1230@yahoo.com> with
+a dump from /proc/bus/usb and a description of the pad (manufacturer, country,
+whether it is a dance pad or normal controller) so that we can add your pad
+to the list of supported devices, ensuring that it will work out of the
+box in the future.
+
+
+1. USB adapter
+--------------
+
+Before you can actually use the driver, you need to get yourself an
+adapter cable to connect the X-Box controller to your Linux-Box. You
+can buy these online fairly cheap, or build your own.
+
+Such a cable is pretty easy to build. The Controller itself is a USB
+compound device (a hub with three ports for two expansion slots and
+the controller device) with the only difference in a nonstandard connector
+(5 pins vs. 4 on standard USB connector).
+
+You just need to solder a USB connector onto the cable and keep the
+yellow wire unconnected. The other pins have the same order on both
+connectors so there is no magic to it. Detailed info on these matters
+can be found on the net ([1], [2], [3]).
+
+Thanks to the trip splitter found on the cable you don't even need to cut the
+original one. You can buy an extension cable and cut that instead. That way,
+you can still use the controller with your X-Box, if you have one ;)
+
+
+2. Driver Installation
+----------------------
+
+Once you have the adapter cable and the controller is connected, you need
+to load your USB subsystem and should cat /proc/bus/usb/devices.
+There should be an entry like the one at the end [4].
+
+Currently (as of version 0.0.6), the following devices are included:
+ original Microsoft XBOX controller (US), vendor=0x045e, product=0x0202
+ smaller Microsoft XBOX controller (US), vendor=0x045e, product=0x0289
+ original Microsoft XBOX controller (Japan), vendor=0x045e, product=0x0285
+ InterAct PowerPad Pro (Germany), vendor=0x05fd, product=0x107a
+ RedOctane Xbox Dance Pad (US), vendor=0x0c12, product=0x8809
+
+The driver should work with xbox pads not listed above as well, however
+you will need to do something extra for dance pads to work.
+
+If you have a controller not listed above, see 0.3 - Unknown Controllers
+
+If you compiled and installed the driver, test the functionality:
+> modprobe xpad
+> modprobe joydev
+> jstest /dev/js0
+
+If you're using a normal controller, there should be a single line showing
+18 inputs (8 axes, 10 buttons), and its values should change if you move
+the sticks and push the buttons. If you're using a dance pad, it should
+show 20 inputs (6 axes, 14 buttons).
+
+It works? Voila, you're done ;)
+
+
+3. Thanks
+---------
+
+I have to thank ITO Takayuki for the detailed info on his site
+ http://euc.jp/periphs/xbox-controller.ja.html.
+
+His useful info and both the usb-skeleton as well as the iforce input driver
+(Greg Kroah-Hartmann; Vojtech Pavlik) helped a lot in rapid prototyping
+the basic functionality.
+
+
+4. References
+-------------
+
+1. http://euc.jp/periphs/xbox-controller.ja.html (ITO Takayuki)
+2. http://xpad.xbox-scene.com/
+3. http://www.xboxhackz.com/Hackz-Reference.htm
+
+4. /proc/bus/usb/devices - dump from InterAct PowerPad Pro (Germany):
+
+T: Bus=01 Lev=03 Prnt=04 Port=00 Cnt=01 Dev#= 5 Spd=12 MxCh= 0
+D: Ver= 1.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=32 #Cfgs= 1
+P: Vendor=05fd ProdID=107a Rev= 1.00
+C:* #Ifs= 1 Cfg#= 1 Atr=80 MxPwr=100mA
+I: If#= 0 Alt= 0 #EPs= 2 Cls=58(unk. ) Sub=42 Prot=00 Driver=(none)
+E: Ad=81(I) Atr=03(Int.) MxPS= 32 Ivl= 10ms
+E: Ad=02(O) Atr=03(Int.) MxPS= 32 Ivl= 10ms
+
+5. /proc/bus/usb/devices - dump from Redoctane Xbox Dance Pad (US):
+
+T: Bus=01 Lev=02 Prnt=09 Port=00 Cnt=01 Dev#= 10 Spd=12 MxCh= 0
+D: Ver= 1.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+P: Vendor=0c12 ProdID=8809 Rev= 0.01
+S: Product=XBOX DDR
+C:* #Ifs= 1 Cfg#= 1 Atr=80 MxPwr=100mA
+I: If#= 0 Alt= 0 #EPs= 2 Cls=58(unk. ) Sub=42 Prot=00 Driver=xpad
+E: Ad=82(I) Atr=03(Int.) MxPS= 32 Ivl=4ms
+E: Ad=02(O) Atr=03(Int.) MxPS= 32 Ivl=4ms
+
+--
+Marko Friedemann <mfr@bmx-chemnitz.de>
+2002-07-16
+ - original doc
+
+Dominic Cerquetti <binary1230@yahoo.com>
+2005-03-19
+ - added stuff for dance pads, new d-pad->axes mappings
diff --git a/Documentation/input/yealink.txt b/Documentation/input/yealink.txt
new file mode 100644
index 0000000..5360e43
--- /dev/null
+++ b/Documentation/input/yealink.txt
@@ -0,0 +1,216 @@
+Driver documentation for yealink usb-p1k phones
+
+0. Status
+~~~~~~~~~
+The p1k is a relatively cheap usb 1.1 phone with:
+ - keyboard full support, yealink.ko / input event API
+ - LCD full support, yealink.ko / sysfs API
+ - LED full support, yealink.ko / sysfs API
+ - dialtone full support, yealink.ko / sysfs API
+ - ringtone full support, yealink.ko / sysfs API
+ - audio playback full support, snd_usb_audio.ko / alsa API
+ - audio record full support, snd_usb_audio.ko / alsa API
+
+For vendor documentation see http://www.yealink.com
+
+
+1. Compilation (stand alone version)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Currently only kernel 2.6.x.y versions are supported.
+In order to build the yealink.ko module do
+
+ make
+
+If you encounter problems please check if in the MAKE_OPTS variable in
+the Makefile is pointing to the location where your kernel sources
+are located, default /usr/src/linux.
+
+
+1.1 Troubleshooting
+~~~~~~~~~~~~~~~~~~~
+Q: Module yealink compiled and installed without any problem but phone
+ is not initialized and does not react to any actions.
+A: If you see something like:
+ hiddev0: USB HID v1.00 Device [Yealink Network Technology Ltd. VOIP USB Phone
+ in dmesg, it means that the hid driver has grabbed the device first. Try to
+ load module yealink before any other usb hid driver. Please see the
+ instructions provided by your distribution on module configuration.
+
+Q: Phone is working now (displays version and accepts keypad input) but I can't
+ find the sysfs files.
+A: The sysfs files are located on the particular usb endpoint. On most
+ distributions you can do: "find /sys/ -name get_icons" for a hint.
+
+
+2. keyboard features
+~~~~~~~~~~~~~~~~~~~~
+The current mapping in the kernel is provided by the map_p1k_to_key
+function:
+
+ Physical USB-P1K button layout input events
+
+
+ up up
+ IN OUT left, right
+ down down
+
+ pickup C hangup enter, backspace, escape
+ 1 2 3 1, 2, 3
+ 4 5 6 4, 5, 6,
+ 7 8 9 7, 8, 9,
+ * 0 # *, 0, #,
+
+ The "up" and "down" keys, are symbolised by arrows on the button.
+ The "pickup" and "hangup" keys are symbolised by a green and red phone
+ on the button.
+
+
+3. LCD features
+~~~~~~~~~~~~~~~
+The LCD is divided and organised as a 3 line display:
+
+ |[] [][] [][] [][] in |[][]
+ |[] M [][] D [][] : [][] out |[][]
+ store
+
+ NEW REP SU MO TU WE TH FR SA
+
+ [] [] [] [] [] [] [] [] [] [] [] []
+ [] [] [] [] [] [] [] [] [] [] [] []
+
+
+Line 1 Format (see below) : 18.e8.M8.88...188
+ Icon names : M D : IN OUT STORE
+Line 2 Format : .........
+ Icon name : NEW REP SU MO TU WE TH FR SA
+Line 3 Format : 888888888888
+
+
+Format description:
+ From a userspace perspective the world is separated into "digits" and "icons".
+ A digit can have a character set, an icon can only be ON or OFF.
+
+ Format specifier
+ '8' : Generic 7 segment digit with individual addressable segments
+
+ Reduced capability 7 segm digit, when segments are hard wired together.
+ '1' : 2 segments digit only able to produce a 1.
+ 'e' : Most significant day of the month digit,
+ able to produce at least 1 2 3.
+ 'M' : Most significant minute digit,
+ able to produce at least 0 1 2 3 4 5.
+
+ Icons or pictograms:
+ '.' : For example like AM, PM, SU, a 'dot' .. or other single segment
+ elements.
+
+
+4. Driver usage
+~~~~~~~~~~~~~~~
+For userland the following interfaces are available using the sysfs interface:
+ /sys/.../
+ line1 Read/Write, lcd line1
+ line2 Read/Write, lcd line2
+ line3 Read/Write, lcd line3
+
+ get_icons Read, returns a set of available icons.
+ hide_icon Write, hide the element by writing the icon name.
+ show_icon Write, display the element by writing the icon name.
+
+ map_seg7 Read/Write, the 7 segments char set, common for all
+ yealink phones. (see map_to_7segment.h)
+
+ ringtone Write, upload binary representation of a ringtone,
+ see yealink.c. status EXPERIMENTAL due to potential
+ races between async. and sync usb calls.
+
+
+4.1 lineX
+~~~~~~~~~
+Reading /sys/../lineX will return the format string with its current value:
+
+ Example:
+ cat ./line3
+ 888888888888
+ Linux Rocks!
+
+Writing to /sys/../lineX will set the corresponding LCD line.
+ - Excess characters are ignored.
+ - If less characters are written than allowed, the remaining digits are
+ unchanged.
+ - The tab '\t'and '\n' char does not overwrite the original content.
+ - Writing a space to an icon will always hide its content.
+
+ Example:
+ date +"%m.%e.%k:%M" | sed 's/^0/ /' > ./line1
+
+ Will update the LCD with the current date & time.
+
+
+4.2 get_icons
+~~~~~~~~~~~~~
+Reading will return all available icon names and its current settings:
+
+ cat ./get_icons
+ on M
+ on D
+ on :
+ IN
+ OUT
+ STORE
+ NEW
+ REP
+ SU
+ MO
+ TU
+ WE
+ TH
+ FR
+ SA
+ LED
+ DIALTONE
+ RINGTONE
+
+
+4.3 show/hide icons
+~~~~~~~~~~~~~~~~~~~
+Writing to these files will update the state of the icon.
+Only one icon at a time can be updated.
+
+If an icon is also on a ./lineX the corresponding value is
+updated with the first letter of the icon.
+
+ Example - light up the store icon:
+ echo -n "STORE" > ./show_icon
+
+ cat ./line1
+ 18.e8.M8.88...188
+ S
+
+ Example - sound the ringtone for 10 seconds:
+ echo -n RINGTONE > /sys/..../show_icon
+ sleep 10
+ echo -n RINGTONE > /sys/..../hide_icon
+
+
+5. Sound features
+~~~~~~~~~~~~~~~~~
+Sound is supported by the ALSA driver: snd_usb_audio
+
+One 16-bit channel with sample and playback rates of 8000 Hz is the practical
+limit of the device.
+
+ Example - recording test:
+ arecord -v -d 10 -r 8000 -f S16_LE -t wav foobar.wav
+
+ Example - playback test:
+ aplay foobar.wav
+
+
+6. Credits & Acknowledgments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ - Olivier Vandorpe, for starting the usbb2k-api project doing much of
+ the reverse engineering.
+ - Martin Diehl, for pointing out how to handle USB memory allocation.
+ - Dmitry Torokhov, for the numerous code reviews and suggestions.
+
diff --git a/Documentation/io-mapping.txt b/Documentation/io-mapping.txt
new file mode 100644
index 0000000..473e43b
--- /dev/null
+++ b/Documentation/io-mapping.txt
@@ -0,0 +1,82 @@
+The io_mapping functions in linux/io-mapping.h provide an abstraction for
+efficiently mapping small regions of an I/O device to the CPU. The initial
+usage is to support the large graphics aperture on 32-bit processors where
+ioremap_wc cannot be used to statically map the entire aperture to the CPU
+as it would consume too much of the kernel address space.
+
+A mapping object is created during driver initialization using
+
+ struct io_mapping *io_mapping_create_wc(unsigned long base,
+ unsigned long size)
+
+ 'base' is the bus address of the region to be made
+ mappable, while 'size' indicates how large a mapping region to
+ enable. Both are in bytes.
+
+ This _wc variant provides a mapping which may only be used
+ with the io_mapping_map_atomic_wc or io_mapping_map_wc.
+
+With this mapping object, individual pages can be mapped either atomically
+or not, depending on the necessary scheduling environment. Of course, atomic
+maps are more efficient:
+
+ void *io_mapping_map_atomic_wc(struct io_mapping *mapping,
+ unsigned long offset)
+
+ 'offset' is the offset within the defined mapping region.
+ Accessing addresses beyond the region specified in the
+ creation function yields undefined results. Using an offset
+ which is not page aligned yields an undefined result. The
+ return value points to a single page in CPU address space.
+
+ This _wc variant returns a write-combining map to the
+ page and may only be used with mappings created by
+ io_mapping_create_wc
+
+ Note that the task may not sleep while holding this page
+ mapped.
+
+ void io_mapping_unmap_atomic(void *vaddr)
+
+ 'vaddr' must be the the value returned by the last
+ io_mapping_map_atomic_wc call. This unmaps the specified
+ page and allows the task to sleep once again.
+
+If you need to sleep while holding the lock, you can use the non-atomic
+variant, although they may be significantly slower.
+
+ void *io_mapping_map_wc(struct io_mapping *mapping,
+ unsigned long offset)
+
+ This works like io_mapping_map_atomic_wc except it allows
+ the task to sleep while holding the page mapped.
+
+ void io_mapping_unmap(void *vaddr)
+
+ This works like io_mapping_unmap_atomic, except it is used
+ for pages mapped with io_mapping_map_wc.
+
+At driver close time, the io_mapping object must be freed:
+
+ void io_mapping_free(struct io_mapping *mapping)
+
+Current Implementation:
+
+The initial implementation of these functions uses existing mapping
+mechanisms and so provides only an abstraction layer and no new
+functionality.
+
+On 64-bit processors, io_mapping_create_wc calls ioremap_wc for the whole
+range, creating a permanent kernel-visible mapping to the resource. The
+map_atomic and map functions add the requested offset to the base of the
+virtual address returned by ioremap_wc.
+
+On 32-bit processors with HIGHMEM defined, io_mapping_map_atomic_wc uses
+kmap_atomic_pfn to map the specified page in an atomic fashion;
+kmap_atomic_pfn isn't really supposed to be used with device pages, but it
+provides an efficient mapping for this usage.
+
+On 32-bit processors without HIGHMEM defined, io_mapping_map_atomic_wc and
+io_mapping_map_wc both use ioremap_wc, a terribly inefficient function which
+performs an IPI to inform all processors about the new mapping. This results
+in a significant performance penalty.
diff --git a/Documentation/io_ordering.txt b/Documentation/io_ordering.txt
new file mode 100644
index 0000000..9faae6f
--- /dev/null
+++ b/Documentation/io_ordering.txt
@@ -0,0 +1,47 @@
+On some platforms, so-called memory-mapped I/O is weakly ordered. On such
+platforms, driver writers are responsible for ensuring that I/O writes to
+memory-mapped addresses on their device arrive in the order intended. This is
+typically done by reading a 'safe' device or bridge register, causing the I/O
+chipset to flush pending writes to the device before any reads are posted. A
+driver would usually use this technique immediately prior to the exit of a
+critical section of code protected by spinlocks. This would ensure that
+subsequent writes to I/O space arrived only after all prior writes (much like a
+memory barrier op, mb(), only with respect to I/O).
+
+A more concrete example from a hypothetical device driver:
+
+ ...
+CPU A: spin_lock_irqsave(&dev_lock, flags)
+CPU A: val = readl(my_status);
+CPU A: ...
+CPU A: writel(newval, ring_ptr);
+CPU A: spin_unlock_irqrestore(&dev_lock, flags)
+ ...
+CPU B: spin_lock_irqsave(&dev_lock, flags)
+CPU B: val = readl(my_status);
+CPU B: ...
+CPU B: writel(newval2, ring_ptr);
+CPU B: spin_unlock_irqrestore(&dev_lock, flags)
+ ...
+
+In the case above, the device may receive newval2 before it receives newval,
+which could cause problems. Fixing it is easy enough though:
+
+ ...
+CPU A: spin_lock_irqsave(&dev_lock, flags)
+CPU A: val = readl(my_status);
+CPU A: ...
+CPU A: writel(newval, ring_ptr);
+CPU A: (void)readl(safe_register); /* maybe a config register? */
+CPU A: spin_unlock_irqrestore(&dev_lock, flags)
+ ...
+CPU B: spin_lock_irqsave(&dev_lock, flags)
+CPU B: val = readl(my_status);
+CPU B: ...
+CPU B: writel(newval2, ring_ptr);
+CPU B: (void)readl(safe_register); /* maybe a config register? */
+CPU B: spin_unlock_irqrestore(&dev_lock, flags)
+
+Here, the reads from safe_register will cause the I/O chipset to flush any
+pending writes before actually posting the read to the chipset, preventing
+possible data corruption.
diff --git a/Documentation/ioctl/00-INDEX b/Documentation/ioctl/00-INDEX
new file mode 100644
index 0000000..d2fe4d4
--- /dev/null
+++ b/Documentation/ioctl/00-INDEX
@@ -0,0 +1,10 @@
+00-INDEX
+ - this file
+cdrom.txt
+ - summary of CDROM ioctl calls
+hdio.txt
+ - summary of HDIO_ ioctl calls
+ioctl-decoding.txt
+ - how to decode the bits of an IOCTL code
+ioctl-number.txt
+ - how to implement and register device/driver ioctl calls
diff --git a/Documentation/ioctl/cdrom.txt b/Documentation/ioctl/cdrom.txt
new file mode 100644
index 0000000..59df81c
--- /dev/null
+++ b/Documentation/ioctl/cdrom.txt
@@ -0,0 +1,966 @@
+ Summary of CDROM ioctl calls.
+ ============================
+
+ Edward A. Falk <efalk@google.com>
+
+ November, 2004
+
+This document attempts to describe the ioctl(2) calls supported by
+the CDROM layer. These are by-and-large implemented (as of Linux 2.6)
+in drivers/cdrom/cdrom.c and drivers/block/scsi_ioctl.c
+
+ioctl values are listed in <linux/cdrom.h>. As of this writing, they
+are as follows:
+
+ CDROMPAUSE Pause Audio Operation
+ CDROMRESUME Resume paused Audio Operation
+ CDROMPLAYMSF Play Audio MSF (struct cdrom_msf)
+ CDROMPLAYTRKIND Play Audio Track/index (struct cdrom_ti)
+ CDROMREADTOCHDR Read TOC header (struct cdrom_tochdr)
+ CDROMREADTOCENTRY Read TOC entry (struct cdrom_tocentry)
+ CDROMSTOP Stop the cdrom drive
+ CDROMSTART Start the cdrom drive
+ CDROMEJECT Ejects the cdrom media
+ CDROMVOLCTRL Control output volume (struct cdrom_volctrl)
+ CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl)
+ CDROMREADMODE2 Read CDROM mode 2 data (2336 Bytes)
+ (struct cdrom_read)
+ CDROMREADMODE1 Read CDROM mode 1 data (2048 Bytes)
+ (struct cdrom_read)
+ CDROMREADAUDIO (struct cdrom_read_audio)
+ CDROMEJECT_SW enable(1)/disable(0) auto-ejecting
+ CDROMMULTISESSION Obtain the start-of-last-session
+ address of multi session disks
+ (struct cdrom_multisession)
+ CDROM_GET_MCN Obtain the "Universal Product Code"
+ if available (struct cdrom_mcn)
+ CDROM_GET_UPC Deprecated, use CDROM_GET_MCN instead.
+ CDROMRESET hard-reset the drive
+ CDROMVOLREAD Get the drive's volume setting
+ (struct cdrom_volctrl)
+ CDROMREADRAW read data in raw mode (2352 Bytes)
+ (struct cdrom_read)
+ CDROMREADCOOKED read data in cooked mode
+ CDROMSEEK seek msf address
+ CDROMPLAYBLK scsi-cd only, (struct cdrom_blk)
+ CDROMREADALL read all 2646 bytes
+ CDROMGETSPINDOWN return 4-bit spindown value
+ CDROMSETSPINDOWN set 4-bit spindown value
+ CDROMCLOSETRAY pendant of CDROMEJECT
+ CDROM_SET_OPTIONS Set behavior options
+ CDROM_CLEAR_OPTIONS Clear behavior options
+ CDROM_SELECT_SPEED Set the CD-ROM speed
+ CDROM_SELECT_DISC Select disc (for juke-boxes)
+ CDROM_MEDIA_CHANGED Check is media changed
+ CDROM_DRIVE_STATUS Get tray position, etc.
+ CDROM_DISC_STATUS Get disc type, etc.
+ CDROM_CHANGER_NSLOTS Get number of slots
+ CDROM_LOCKDOOR lock or unlock door
+ CDROM_DEBUG Turn debug messages on/off
+ CDROM_GET_CAPABILITY get capabilities
+ CDROMAUDIOBUFSIZ set the audio buffer size
+ DVD_READ_STRUCT Read structure
+ DVD_WRITE_STRUCT Write structure
+ DVD_AUTH Authentication
+ CDROM_SEND_PACKET send a packet to the drive
+ CDROM_NEXT_WRITABLE get next writable block
+ CDROM_LAST_WRITTEN get last block written on disc
+
+
+The information that follows was determined from reading kernel source
+code. It is likely that some corrections will be made over time.
+
+
+
+
+
+
+
+General:
+
+ Unless otherwise specified, all ioctl calls return 0 on success
+ and -1 with errno set to an appropriate value on error. (Some
+ ioctls return non-negative data values.)
+
+ Unless otherwise specified, all ioctl calls return -1 and set
+ errno to EFAULT on a failed attempt to copy data to or from user
+ address space.
+
+ Individual drivers may return error codes not listed here.
+
+ Unless otherwise specified, all data structures and constants
+ are defined in <linux/cdrom.h>
+
+
+
+
+CDROMPAUSE Pause Audio Operation
+
+ usage:
+
+ ioctl(fd, CDROMPAUSE, 0);
+
+ inputs: none
+
+ outputs: none
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+
+CDROMRESUME Resume paused Audio Operation
+
+ usage:
+
+ ioctl(fd, CDROMRESUME, 0);
+
+ inputs: none
+
+ outputs: none
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+
+CDROMPLAYMSF Play Audio MSF (struct cdrom_msf)
+
+ usage:
+
+ struct cdrom_msf msf;
+ ioctl(fd, CDROMPLAYMSF, &msf);
+
+ inputs:
+ cdrom_msf structure, describing a segment of music to play
+
+ outputs: none
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+ notes:
+ MSF stands for minutes-seconds-frames
+ LBA stands for logical block address
+
+ Segment is described as start and end times, where each time
+ is described as minutes:seconds:frames. A frame is 1/75 of
+ a second.
+
+
+CDROMPLAYTRKIND Play Audio Track/index (struct cdrom_ti)
+
+ usage:
+
+ struct cdrom_ti ti;
+ ioctl(fd, CDROMPLAYTRKIND, &ti);
+
+ inputs:
+ cdrom_ti structure, describing a segment of music to play
+
+ outputs: none
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+ notes:
+ Segment is described as start and end times, where each time
+ is described as a track and an index.
+
+
+
+CDROMREADTOCHDR Read TOC header (struct cdrom_tochdr)
+
+ usage:
+
+ cdrom_tochdr header;
+ ioctl(fd, CDROMREADTOCHDR, &header);
+
+ inputs:
+ cdrom_tochdr structure
+
+ outputs:
+ cdrom_tochdr structure
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+
+
+CDROMREADTOCENTRY Read TOC entry (struct cdrom_tocentry)
+
+ usage:
+
+ struct cdrom_tocentry entry;
+ ioctl(fd, CDROMREADTOCENTRY, &entry);
+
+ inputs:
+ cdrom_tocentry structure
+
+ outputs:
+ cdrom_tocentry structure
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+ EINVAL entry.cdte_format not CDROM_MSF or CDROM_LBA
+ EINVAL requested track out of bounds
+ EIO I/O error reading TOC
+
+ notes:
+ TOC stands for Table Of Contents
+ MSF stands for minutes-seconds-frames
+ LBA stands for logical block address
+
+
+
+CDROMSTOP Stop the cdrom drive
+
+ usage:
+
+ ioctl(fd, CDROMSTOP, 0);
+
+ inputs: none
+
+ outputs: none
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+ notes:
+ Exact interpretation of this ioctl depends on the device,
+ but most seem to spin the drive down.
+
+
+CDROMSTART Start the cdrom drive
+
+ usage:
+
+ ioctl(fd, CDROMSTART, 0);
+
+ inputs: none
+
+ outputs: none
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+ notes:
+ Exact interpretation of this ioctl depends on the device,
+ but most seem to spin the drive up and/or close the tray.
+ Other devices ignore the ioctl completely.
+
+
+CDROMEJECT Ejects the cdrom media
+
+ usage:
+
+ ioctl(fd, CDROMEJECT, 0);
+
+ inputs: none
+
+ outputs: none
+
+ error returns:
+ ENOSYS cd drive not capable of ejecting
+ EBUSY other processes are accessing drive, or door is locked
+
+ notes:
+ See CDROM_LOCKDOOR, below.
+
+
+
+CDROMCLOSETRAY pendant of CDROMEJECT
+
+ usage:
+
+ ioctl(fd, CDROMCLOSETRAY, 0);
+
+ inputs: none
+
+ outputs: none
+
+ error returns:
+ ENOSYS cd drive not capable of closing the tray
+ EBUSY other processes are accessing drive, or door is locked
+
+ notes:
+ See CDROM_LOCKDOOR, below.
+
+
+
+CDROMVOLCTRL Control output volume (struct cdrom_volctrl)
+
+ usage:
+
+ struct cdrom_volctrl volume;
+ ioctl(fd, CDROMVOLCTRL, &volume);
+
+ inputs:
+ cdrom_volctrl structure containing volumes for up to 4
+ channels.
+
+ outputs: none
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+
+
+CDROMVOLREAD Get the drive's volume setting
+ (struct cdrom_volctrl)
+
+ usage:
+
+ struct cdrom_volctrl volume;
+ ioctl(fd, CDROMVOLREAD, &volume);
+
+ inputs: none
+
+ outputs:
+ The current volume settings.
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+
+
+
+CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl)
+
+ usage:
+
+ struct cdrom_subchnl q;
+ ioctl(fd, CDROMSUBCHNL, &q);
+
+ inputs:
+ cdrom_subchnl structure
+
+ outputs:
+ cdrom_subchnl structure
+
+ error return:
+ ENOSYS cd drive not audio-capable.
+ EINVAL format not CDROM_MSF or CDROM_LBA
+
+ notes:
+ Format is converted to CDROM_MSF on return
+
+
+
+CDROMREADRAW read data in raw mode (2352 Bytes)
+ (struct cdrom_read)
+
+ usage:
+
+ union {
+ struct cdrom_msf msf; /* input */
+ char buffer[CD_FRAMESIZE_RAW]; /* return */
+ } arg;
+ ioctl(fd, CDROMREADRAW, &arg);
+
+ inputs:
+ cdrom_msf structure indicating an address to read.
+ Only the start values are significant.
+
+ outputs:
+ Data written to address provided by user.
+
+ error return:
+ EINVAL address less than 0, or msf less than 0:2:0
+ ENOMEM out of memory
+
+ notes:
+ As of 2.6.8.1, comments in <linux/cdrom.h> indicate that this
+ ioctl accepts a cdrom_read structure, but actual source code
+ reads a cdrom_msf structure and writes a buffer of data to
+ the same address.
+
+ MSF values are converted to LBA values via this formula:
+
+ lba = (((m * CD_SECS) + s) * CD_FRAMES + f) - CD_MSF_OFFSET;
+
+
+
+
+CDROMREADMODE1 Read CDROM mode 1 data (2048 Bytes)
+ (struct cdrom_read)
+
+ notes:
+ Identical to CDROMREADRAW except that block size is
+ CD_FRAMESIZE (2048) bytes
+
+
+
+CDROMREADMODE2 Read CDROM mode 2 data (2336 Bytes)
+ (struct cdrom_read)
+
+ notes:
+ Identical to CDROMREADRAW except that block size is
+ CD_FRAMESIZE_RAW0 (2336) bytes
+
+
+
+CDROMREADAUDIO (struct cdrom_read_audio)
+
+ usage:
+
+ struct cdrom_read_audio ra;
+ ioctl(fd, CDROMREADAUDIO, &ra);
+
+ inputs:
+ cdrom_read_audio structure containing read start
+ point and length
+
+ outputs:
+ audio data, returned to buffer indicated by ra
+
+ error return:
+ EINVAL format not CDROM_MSF or CDROM_LBA
+ EINVAL nframes not in range [1 75]
+ ENXIO drive has no queue (probably means invalid fd)
+ ENOMEM out of memory
+
+
+CDROMEJECT_SW enable(1)/disable(0) auto-ejecting
+
+ usage:
+
+ int val;
+ ioctl(fd, CDROMEJECT_SW, val);
+
+ inputs:
+ Flag specifying auto-eject flag.
+
+ outputs: none
+
+ error return:
+ ENOSYS Drive is not capable of ejecting.
+ EBUSY Door is locked
+
+
+
+
+CDROMMULTISESSION Obtain the start-of-last-session
+ address of multi session disks
+ (struct cdrom_multisession)
+ usage:
+
+ struct cdrom_multisession ms_info;
+ ioctl(fd, CDROMMULTISESSION, &ms_info);
+
+ inputs:
+ cdrom_multisession structure containing desired
+ format.
+
+ outputs:
+ cdrom_multisession structure is filled with last_session
+ information.
+
+ error return:
+ EINVAL format not CDROM_MSF or CDROM_LBA
+
+
+CDROM_GET_MCN Obtain the "Universal Product Code"
+ if available (struct cdrom_mcn)
+
+ usage:
+
+ struct cdrom_mcn mcn;
+ ioctl(fd, CDROM_GET_MCN, &mcn);
+
+ inputs: none
+
+ outputs:
+ Universal Product Code
+
+ error return:
+ ENOSYS Drive is not capable of reading MCN data.
+
+ notes:
+ Source code comments state:
+
+ The following function is implemented, although very few
+ audio discs give Universal Product Code information, which
+ should just be the Medium Catalog Number on the box. Note,
+ that the way the code is written on the CD is /not/ uniform
+ across all discs!
+
+
+
+
+CDROM_GET_UPC CDROM_GET_MCN (deprecated)
+
+ Not implemented, as of 2.6.8.1
+
+
+
+CDROMRESET hard-reset the drive
+
+ usage:
+
+ ioctl(fd, CDROMRESET, 0);
+
+ inputs: none
+
+ outputs: none
+
+ error return:
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ ENOSYS Drive is not capable of resetting.
+
+
+
+
+CDROMREADCOOKED read data in cooked mode
+
+ usage:
+
+ u8 buffer[CD_FRAMESIZE]
+ ioctl(fd, CDROMREADCOOKED, buffer);
+
+ inputs: none
+
+ outputs:
+ 2048 bytes of data, "cooked" mode.
+
+ notes:
+ Not implemented on all drives.
+
+
+
+
+CDROMREADALL read all 2646 bytes
+
+ Same as CDROMREADCOOKED, but reads 2646 bytes.
+
+
+
+CDROMSEEK seek msf address
+
+ usage:
+
+ struct cdrom_msf msf;
+ ioctl(fd, CDROMSEEK, &msf);
+
+ inputs:
+ MSF address to seek to.
+
+ outputs: none
+
+
+
+CDROMPLAYBLK scsi-cd only, (struct cdrom_blk)
+
+ usage:
+
+ struct cdrom_blk blk;
+ ioctl(fd, CDROMPLAYBLK, &blk);
+
+ inputs:
+ Region to play
+
+ outputs: none
+
+
+
+CDROMGETSPINDOWN
+
+ usage:
+
+ char spindown;
+ ioctl(fd, CDROMGETSPINDOWN, &spindown);
+
+ inputs: none
+
+ outputs:
+ The value of the current 4-bit spindown value.
+
+
+
+
+CDROMSETSPINDOWN
+
+ usage:
+
+ char spindown
+ ioctl(fd, CDROMSETSPINDOWN, &spindown);
+
+ inputs:
+ 4-bit value used to control spindown (TODO: more detail here)
+
+ outputs: none
+
+
+
+
+
+CDROM_SET_OPTIONS Set behavior options
+
+ usage:
+
+ int options;
+ ioctl(fd, CDROM_SET_OPTIONS, options);
+
+ inputs:
+ New values for drive options. The logical 'or' of:
+ CDO_AUTO_CLOSE close tray on first open(2)
+ CDO_AUTO_EJECT open tray on last release
+ CDO_USE_FFLAGS use O_NONBLOCK information on open
+ CDO_LOCK lock tray on open files
+ CDO_CHECK_TYPE check type on open for data
+
+ outputs:
+ Returns the resulting options settings in the
+ ioctl return value. Returns -1 on error.
+
+ error return:
+ ENOSYS selected option(s) not supported by drive.
+
+
+
+
+CDROM_CLEAR_OPTIONS Clear behavior options
+
+ Same as CDROM_SET_OPTIONS, except that selected options are
+ turned off.
+
+
+
+CDROM_SELECT_SPEED Set the CD-ROM speed
+
+ usage:
+
+ int speed;
+ ioctl(fd, CDROM_SELECT_SPEED, speed);
+
+ inputs:
+ New drive speed.
+
+ outputs: none
+
+ error return:
+ ENOSYS speed selection not supported by drive.
+
+
+
+CDROM_SELECT_DISC Select disc (for juke-boxes)
+
+ usage:
+
+ int disk;
+ ioctl(fd, CDROM_SELECT_DISC, disk);
+
+ inputs:
+ Disk to load into drive.
+
+ outputs: none
+
+ error return:
+ EINVAL Disk number beyond capacity of drive
+
+
+
+CDROM_MEDIA_CHANGED Check is media changed
+
+ usage:
+
+ int slot;
+ ioctl(fd, CDROM_MEDIA_CHANGED, slot);
+
+ inputs:
+ Slot number to be tested, always zero except for jukeboxes.
+ May also be special values CDSL_NONE or CDSL_CURRENT
+
+ outputs:
+ Ioctl return value is 0 or 1 depending on whether the media
+ has been changed, or -1 on error.
+
+ error returns:
+ ENOSYS Drive can't detect media change
+ EINVAL Slot number beyond capacity of drive
+ ENOMEM Out of memory
+
+
+
+CDROM_DRIVE_STATUS Get tray position, etc.
+
+ usage:
+
+ int slot;
+ ioctl(fd, CDROM_DRIVE_STATUS, slot);
+
+ inputs:
+ Slot number to be tested, always zero except for jukeboxes.
+ May also be special values CDSL_NONE or CDSL_CURRENT
+
+ outputs:
+ Ioctl return value will be one of the following values
+ from <linux/cdrom.h>:
+
+ CDS_NO_INFO Information not available.
+ CDS_NO_DISC
+ CDS_TRAY_OPEN
+ CDS_DRIVE_NOT_READY
+ CDS_DISC_OK
+ -1 error
+
+ error returns:
+ ENOSYS Drive can't detect drive status
+ EINVAL Slot number beyond capacity of drive
+ ENOMEM Out of memory
+
+
+
+
+CDROM_DISC_STATUS Get disc type, etc.
+
+ usage:
+
+ ioctl(fd, CDROM_DISC_STATUS, 0);
+
+ inputs: none
+
+ outputs:
+ Ioctl return value will be one of the following values
+ from <linux/cdrom.h>:
+ CDS_NO_INFO
+ CDS_AUDIO
+ CDS_MIXED
+ CDS_XA_2_2
+ CDS_XA_2_1
+ CDS_DATA_1
+
+ error returns: none at present
+
+ notes:
+ Source code comments state:
+
+ Ok, this is where problems start. The current interface for
+ the CDROM_DISC_STATUS ioctl is flawed. It makes the false
+ assumption that CDs are all CDS_DATA_1 or all CDS_AUDIO, etc.
+ Unfortunately, while this is often the case, it is also
+ very common for CDs to have some tracks with data, and some
+ tracks with audio. Just because I feel like it, I declare
+ the following to be the best way to cope. If the CD has
+ ANY data tracks on it, it will be returned as a data CD.
+ If it has any XA tracks, I will return it as that. Now I
+ could simplify this interface by combining these returns with
+ the above, but this more clearly demonstrates the problem
+ with the current interface. Too bad this wasn't designed
+ to use bitmasks... -Erik
+
+ Well, now we have the option CDS_MIXED: a mixed-type CD.
+ User level programmers might feel the ioctl is not very
+ useful.
+ ---david
+
+
+
+
+CDROM_CHANGER_NSLOTS Get number of slots
+
+ usage:
+
+ ioctl(fd, CDROM_CHANGER_NSLOTS, 0);
+
+ inputs: none
+
+ outputs:
+ The ioctl return value will be the number of slots in a
+ CD changer. Typically 1 for non-multi-disk devices.
+
+ error returns: none
+
+
+
+CDROM_LOCKDOOR lock or unlock door
+
+ usage:
+
+ int lock;
+ ioctl(fd, CDROM_LOCKDOOR, lock);
+
+ inputs:
+ Door lock flag, 1=lock, 0=unlock
+
+ outputs: none
+
+ error returns:
+ EDRIVE_CANT_DO_THIS Door lock function not supported.
+ EBUSY Attempt to unlock when multiple users
+ have the drive open and not CAP_SYS_ADMIN
+
+ notes:
+ As of 2.6.8.1, the lock flag is a global lock, meaning that
+ all CD drives will be locked or unlocked together. This is
+ probably a bug.
+
+ The EDRIVE_CANT_DO_THIS value is defined in <linux/cdrom.h>
+ and is currently (2.6.8.1) the same as EOPNOTSUPP
+
+
+
+CDROM_DEBUG Turn debug messages on/off
+
+ usage:
+
+ int debug;
+ ioctl(fd, CDROM_DEBUG, debug);
+
+ inputs:
+ Cdrom debug flag, 0=disable, 1=enable
+
+ outputs:
+ The ioctl return value will be the new debug flag.
+
+ error return:
+ EACCES Access denied: requires CAP_SYS_ADMIN
+
+
+
+CDROM_GET_CAPABILITY get capabilities
+
+ usage:
+
+ ioctl(fd, CDROM_GET_CAPABILITY, 0);
+
+ inputs: none
+
+ outputs:
+ The ioctl return value is the current device capability
+ flags. See CDC_CLOSE_TRAY, CDC_OPEN_TRAY, etc.
+
+
+
+CDROMAUDIOBUFSIZ set the audio buffer size
+
+ usage:
+
+ int arg;
+ ioctl(fd, CDROMAUDIOBUFSIZ, val);
+
+ inputs:
+ New audio buffer size
+
+ outputs:
+ The ioctl return value is the new audio buffer size, or -1
+ on error.
+
+ error return:
+ ENOSYS Not supported by this driver.
+
+ notes:
+ Not supported by all drivers.
+
+
+
+DVD_READ_STRUCT Read structure
+
+ usage:
+
+ dvd_struct s;
+ ioctl(fd, DVD_READ_STRUCT, &s);
+
+ inputs:
+ dvd_struct structure, containing:
+ type specifies the information desired, one of
+ DVD_STRUCT_PHYSICAL, DVD_STRUCT_COPYRIGHT,
+ DVD_STRUCT_DISCKEY, DVD_STRUCT_BCA,
+ DVD_STRUCT_MANUFACT
+ physical.layer_num desired layer, indexed from 0
+ copyright.layer_num desired layer, indexed from 0
+ disckey.agid
+
+ outputs:
+ dvd_struct structure, containing:
+ physical for type == DVD_STRUCT_PHYSICAL
+ copyright for type == DVD_STRUCT_COPYRIGHT
+ disckey.value for type == DVD_STRUCT_DISCKEY
+ bca.{len,value} for type == DVD_STRUCT_BCA
+ manufact.{len,valu} for type == DVD_STRUCT_MANUFACT
+
+ error returns:
+ EINVAL physical.layer_num exceeds number of layers
+ EIO Received invalid response from drive
+
+
+
+DVD_WRITE_STRUCT Write structure
+
+ Not implemented, as of 2.6.8.1
+
+
+
+DVD_AUTH Authentication
+
+ usage:
+
+ dvd_authinfo ai;
+ ioctl(fd, DVD_AUTH, &ai);
+
+ inputs:
+ dvd_authinfo structure. See <linux/cdrom.h>
+
+ outputs:
+ dvd_authinfo structure.
+
+ error return:
+ ENOTTY ai.type not recognized.
+
+
+
+CDROM_SEND_PACKET send a packet to the drive
+
+ usage:
+
+ struct cdrom_generic_command cgc;
+ ioctl(fd, CDROM_SEND_PACKET, &cgc);
+
+ inputs:
+ cdrom_generic_command structure containing the packet to send.
+
+ outputs: none
+ cdrom_generic_command structure containing results.
+
+ error return:
+ EIO command failed.
+ EPERM Operation not permitted, either because a
+ write command was attempted on a drive which
+ is opened read-only, or because the command
+ requires CAP_SYS_RAWIO
+ EINVAL cgc.data_direction not set
+
+
+
+CDROM_NEXT_WRITABLE get next writable block
+
+ usage:
+
+ long next;
+ ioctl(fd, CDROM_NEXT_WRITABLE, &next);
+
+ inputs: none
+
+ outputs:
+ The next writable block.
+
+ notes:
+ If the device does not support this ioctl directly, the
+ ioctl will return CDROM_LAST_WRITTEN + 7.
+
+
+
+CDROM_LAST_WRITTEN get last block written on disc
+
+ usage:
+
+ long last;
+ ioctl(fd, CDROM_LAST_WRITTEN, &last);
+
+ inputs: none
+
+ outputs:
+ The last block written on disc
+
+ notes:
+ If the device does not support this ioctl directly, the
+ result is derived from the disc's table of contents. If the
+ table of contents can't be read, this ioctl returns an
+ error.
diff --git a/Documentation/ioctl/hdio.txt b/Documentation/ioctl/hdio.txt
new file mode 100644
index 0000000..91a6ecb
--- /dev/null
+++ b/Documentation/ioctl/hdio.txt
@@ -0,0 +1,1071 @@
+ Summary of HDIO_ ioctl calls.
+ ============================
+
+ Edward A. Falk <efalk@google.com>
+
+ November, 2004
+
+This document attempts to describe the ioctl(2) calls supported by
+the HD/IDE layer. These are by-and-large implemented (as of Linux 2.6)
+in drivers/ide/ide.c and drivers/block/scsi_ioctl.c
+
+ioctl values are listed in <linux/hdreg.h>. As of this writing, they
+are as follows:
+
+ ioctls that pass argument pointers to user space:
+
+ HDIO_GETGEO get device geometry
+ HDIO_GET_UNMASKINTR get current unmask setting
+ HDIO_GET_MULTCOUNT get current IDE blockmode setting
+ HDIO_GET_QDMA get use-qdma flag
+ HDIO_SET_XFER set transfer rate via proc
+ HDIO_OBSOLETE_IDENTITY OBSOLETE, DO NOT USE
+ HDIO_GET_KEEPSETTINGS get keep-settings-on-reset flag
+ HDIO_GET_32BIT get current io_32bit setting
+ HDIO_GET_NOWERR get ignore-write-error flag
+ HDIO_GET_DMA get use-dma flag
+ HDIO_GET_NICE get nice flags
+ HDIO_GET_IDENTITY get IDE identification info
+ HDIO_GET_WCACHE get write cache mode on|off
+ HDIO_GET_ACOUSTIC get acoustic value
+ HDIO_GET_ADDRESS get sector addressing mode
+ HDIO_GET_BUSSTATE get the bus state of the hwif
+ HDIO_TRISTATE_HWIF execute a channel tristate
+ HDIO_DRIVE_RESET execute a device reset
+ HDIO_DRIVE_TASKFILE execute raw taskfile
+ HDIO_DRIVE_TASK execute task and special drive command
+ HDIO_DRIVE_CMD execute a special drive command
+ HDIO_DRIVE_CMD_AEB HDIO_DRIVE_TASK
+
+ ioctls that pass non-pointer values:
+
+ HDIO_SET_MULTCOUNT change IDE blockmode
+ HDIO_SET_UNMASKINTR permit other irqs during I/O
+ HDIO_SET_KEEPSETTINGS keep ioctl settings on reset
+ HDIO_SET_32BIT change io_32bit flags
+ HDIO_SET_NOWERR change ignore-write-error flag
+ HDIO_SET_DMA change use-dma flag
+ HDIO_SET_PIO_MODE reconfig interface to new speed
+ HDIO_SCAN_HWIF register and (re)scan interface
+ HDIO_SET_NICE set nice flags
+ HDIO_UNREGISTER_HWIF unregister interface
+ HDIO_SET_WCACHE change write cache enable-disable
+ HDIO_SET_ACOUSTIC change acoustic behavior
+ HDIO_SET_BUSSTATE set the bus state of the hwif
+ HDIO_SET_QDMA change use-qdma flag
+ HDIO_SET_ADDRESS change lba addressing modes
+
+ HDIO_SET_IDE_SCSI Set scsi emulation mode on/off
+ HDIO_SET_SCSI_IDE not implemented yet
+
+
+The information that follows was determined from reading kernel source
+code. It is likely that some corrections will be made over time.
+
+
+
+
+
+
+
+General:
+
+ Unless otherwise specified, all ioctl calls return 0 on success
+ and -1 with errno set to an appropriate value on error.
+
+ Unless otherwise specified, all ioctl calls return -1 and set
+ errno to EFAULT on a failed attempt to copy data to or from user
+ address space.
+
+ Unless otherwise specified, all data structures and constants
+ are defined in <linux/hdreg.h>
+
+
+
+HDIO_GETGEO get device geometry
+
+ usage:
+
+ struct hd_geometry geom;
+ ioctl(fd, HDIO_GETGEO, &geom);
+
+
+ inputs: none
+
+ outputs:
+
+ hd_geometry structure containing:
+
+ heads number of heads
+ sectors number of sectors/track
+ cylinders number of cylinders, mod 65536
+ start starting sector of this partition.
+
+
+ error returns:
+ EINVAL if the device is not a disk drive or floppy drive,
+ or if the user passes a null pointer
+
+
+ notes:
+
+ Not particularly useful with modern disk drives, whose geometry
+ is a polite fiction anyway. Modern drives are addressed
+ purely by sector number nowadays (lba addressing), and the
+ drive geometry is an abstraction which is actually subject
+ to change. Currently (as of Nov 2004), the geometry values
+ are the "bios" values -- presumably the values the drive had
+ when Linux first booted.
+
+ In addition, the cylinders field of the hd_geometry is an
+ unsigned short, meaning that on most architectures, this
+ ioctl will not return a meaningful value on drives with more
+ than 65535 tracks.
+
+ The start field is unsigned long, meaning that it will not
+ contain a meaningful value for disks over 219 Gb in size.
+
+
+
+
+HDIO_GET_UNMASKINTR get current unmask setting
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_UNMASKINTR, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the drive's current unmask setting
+
+
+
+HDIO_SET_UNMASKINTR permit other irqs during I/O
+
+ usage:
+
+ unsigned long val;
+ ioctl(fd, HDIO_SET_UNMASKINTR, val);
+
+ inputs:
+ New value for unmask flag
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 1]
+ EBUSY Controller busy
+
+
+
+
+HDIO_GET_MULTCOUNT get current IDE blockmode setting
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_MULTCOUNT, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the current IDE block mode setting. This
+ controls how many sectors the drive will transfer per
+ interrupt.
+
+
+
+HDIO_SET_MULTCOUNT change IDE blockmode
+
+ usage:
+
+ int val;
+ ioctl(fd, HDIO_SET_MULTCOUNT, val);
+
+ inputs:
+ New value for IDE block mode setting. This controls how many
+ sectors the drive will transfer per interrupt.
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range supported by disk.
+ EBUSY Controller busy or blockmode already set.
+ EIO Drive did not accept new block mode.
+
+ notes:
+
+ Source code comments read:
+
+ This is tightly woven into the driver->do_special cannot
+ touch. DON'T do it again until a total personality rewrite
+ is committed.
+
+ If blockmode has already been set, this ioctl will fail with
+ EBUSY
+
+
+
+HDIO_GET_QDMA get use-qdma flag
+
+ Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_XFER set transfer rate via proc
+
+ Not implemented, as of 2.6.8.1
+
+
+
+HDIO_OBSOLETE_IDENTITY OBSOLETE, DO NOT USE
+
+ Same as HDIO_GET_IDENTITY (see below), except that it only
+ returns the first 142 bytes of drive identity information.
+
+
+
+HDIO_GET_IDENTITY get IDE identification info
+
+ usage:
+
+ unsigned char identity[512];
+ ioctl(fd, HDIO_GET_IDENTITY, identity);
+
+ inputs: none
+
+ outputs:
+
+ ATA drive identity information. For full description, see
+ the IDENTIFY DEVICE and IDENTIFY PACKET DEVICE commands in
+ the ATA specification.
+
+ error returns:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ ENOMSG IDENTIFY DEVICE information not available
+
+ notes:
+
+ Returns information that was obtained when the drive was
+ probed. Some of this information is subject to change, and
+ this ioctl does not re-probe the drive to update the
+ information.
+
+ This information is also available from /proc/ide/hdX/identify
+
+
+
+HDIO_GET_KEEPSETTINGS get keep-settings-on-reset flag
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_KEEPSETTINGS, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the current "keep settings" flag
+
+ notes:
+
+ When set, indicates that kernel should restore settings
+ after a drive reset.
+
+
+
+HDIO_SET_KEEPSETTINGS keep ioctl settings on reset
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_SET_KEEPSETTINGS, val);
+
+ inputs:
+ New value for keep_settings flag
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 1]
+ EBUSY Controller busy
+
+
+
+HDIO_GET_32BIT get current io_32bit setting
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_32BIT, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the current io_32bit setting
+
+ notes:
+
+ 0=16-bit, 1=32-bit, 2,3 = 32bit+sync
+
+
+
+HDIO_GET_NOWERR get ignore-write-error flag
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_NOWERR, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the current ignore-write-error flag
+
+
+
+HDIO_GET_DMA get use-dma flag
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_DMA, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the current use-dma flag
+
+
+
+HDIO_GET_NICE get nice flags
+
+ usage:
+
+ long nice;
+ ioctl(fd, HDIO_GET_NICE, &nice);
+
+ inputs: none
+
+ outputs:
+
+ The drive's "nice" values.
+
+ notes:
+
+ Per-drive flags which determine when the system will give more
+ bandwidth to other devices sharing the same IDE bus.
+ See <linux/hdreg.h>, near symbol IDE_NICE_DSC_OVERLAP.
+
+
+
+
+HDIO_SET_NICE set nice flags
+
+ usage:
+
+ unsigned long nice;
+ ...
+ ioctl(fd, HDIO_SET_NICE, nice);
+
+ inputs:
+ bitmask of nice flags.
+
+ outputs: none
+
+ error returns:
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EPERM Flags other than DSC_OVERLAP and NICE_1 set.
+ EPERM DSC_OVERLAP specified but not supported by drive
+
+ notes:
+
+ This ioctl sets the DSC_OVERLAP and NICE_1 flags from values
+ provided by the user.
+
+ Nice flags are listed in <linux/hdreg.h>, starting with
+ IDE_NICE_DSC_OVERLAP. These values represent shifts.
+
+
+
+
+
+HDIO_GET_WCACHE get write cache mode on|off
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_WCACHE, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the current write cache mode
+
+
+
+HDIO_GET_ACOUSTIC get acoustic value
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_ACOUSTIC, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the current acoustic settings
+
+ notes:
+
+ See HDIO_SET_ACOUSTIC
+
+
+
+HDIO_GET_ADDRESS
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_GET_ADDRESS, &val);
+
+ inputs: none
+
+ outputs:
+ The value of the current addressing mode:
+ 0 = 28-bit
+ 1 = 48-bit
+ 2 = 48-bit doing 28-bit
+ 3 = 64-bit
+
+
+
+HDIO_GET_BUSSTATE get the bus state of the hwif
+
+ usage:
+
+ long state;
+ ioctl(fd, HDIO_SCAN_HWIF, &state);
+
+ inputs: none
+
+ outputs:
+ Current power state of the IDE bus. One of BUSSTATE_OFF,
+ BUSSTATE_ON, or BUSSTATE_TRISTATE
+
+ error returns:
+ EACCES Access denied: requires CAP_SYS_ADMIN
+
+
+
+
+HDIO_SET_BUSSTATE set the bus state of the hwif
+
+ usage:
+
+ int state;
+ ...
+ ioctl(fd, HDIO_SCAN_HWIF, state);
+
+ inputs:
+ Desired IDE power state. One of BUSSTATE_OFF, BUSSTATE_ON,
+ or BUSSTATE_TRISTATE
+
+ outputs: none
+
+ error returns:
+ EACCES Access denied: requires CAP_SYS_RAWIO
+ EOPNOTSUPP Hardware interface does not support bus power control
+
+
+
+
+HDIO_TRISTATE_HWIF execute a channel tristate
+
+ Not implemented, as of 2.6.8.1. See HDIO_SET_BUSSTATE
+
+
+
+HDIO_DRIVE_RESET execute a device reset
+
+ usage:
+
+ int args[3]
+ ...
+ ioctl(fd, HDIO_DRIVE_RESET, args);
+
+ inputs: none
+
+ outputs: none
+
+ error returns:
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ ENXIO No such device: phy dead or ctl_addr == 0
+ EIO I/O error: reset timed out or hardware error
+
+ notes:
+
+ Execute a reset on the device as soon as the current IO
+ operation has completed.
+
+ Executes an ATAPI soft reset if applicable, otherwise
+ executes an ATA soft reset on the controller.
+
+
+
+HDIO_DRIVE_TASKFILE execute raw taskfile
+
+ Note: If you don't have a copy of the ANSI ATA specification
+ handy, you should probably ignore this ioctl.
+
+ Execute an ATA disk command directly by writing the "taskfile"
+ registers of the drive. Requires ADMIN and RAWIO access
+ privileges.
+
+ usage:
+
+ struct {
+ ide_task_request_t req_task;
+ u8 outbuf[OUTPUT_SIZE];
+ u8 inbuf[INPUT_SIZE];
+ } task;
+ memset(&task.req_task, 0, sizeof(task.req_task));
+ task.req_task.out_size = sizeof(task.outbuf);
+ task.req_task.in_size = sizeof(task.inbuf);
+ ...
+ ioctl(fd, HDIO_DRIVE_TASKFILE, &task);
+ ...
+
+ inputs:
+
+ (See below for details on memory area passed to ioctl.)
+
+ io_ports[8] values to be written to taskfile registers
+ hob_ports[8] high-order bytes, for extended commands.
+ out_flags flags indicating which registers are valid
+ in_flags flags indicating which registers should be returned
+ data_phase see below
+ req_cmd command type to be executed
+ out_size size of output buffer
+ outbuf buffer of data to be transmitted to disk
+ inbuf buffer of data to be received from disk (see [1])
+
+ outputs:
+
+ io_ports[] values returned in the taskfile registers
+ hob_ports[] high-order bytes, for extended commands.
+ out_flags flags indicating which registers are valid (see [2])
+ in_flags flags indicating which registers should be returned
+ outbuf buffer of data to be transmitted to disk (see [1])
+ inbuf buffer of data to be received from disk
+
+ error returns:
+ EACCES CAP_SYS_ADMIN or CAP_SYS_RAWIO privilege not set.
+ ENOMSG Device is not a disk drive.
+ ENOMEM Unable to allocate memory for task
+ EFAULT req_cmd == TASKFILE_IN_OUT (not implemented as of 2.6.8)
+ EPERM req_cmd == TASKFILE_MULTI_OUT and drive
+ multi-count not yet set.
+ EIO Drive failed the command.
+
+ notes:
+
+ [1] READ THE FOLLOWING NOTES *CAREFULLY*. THIS IOCTL IS
+ FULL OF GOTCHAS. Extreme caution should be used with using
+ this ioctl. A mistake can easily corrupt data or hang the
+ system.
+
+ [2] Both the input and output buffers are copied from the
+ user and written back to the user, even when not used.
+
+ [3] If one or more bits are set in out_flags and in_flags is
+ zero, the following values are used for in_flags.all and
+ written back into in_flags on completion.
+
+ * IDE_TASKFILE_STD_IN_FLAGS | (IDE_HOB_STD_IN_FLAGS << 8)
+ if LBA48 addressing is enabled for the drive
+ * IDE_TASKFILE_STD_IN_FLAGS
+ if CHS/LBA28
+
+ The association between in_flags.all and each enable
+ bitfield flips depending on endianess; fortunately, TASKFILE
+ only uses inflags.b.data bit and ignores all other bits.
+ The end result is that, on any endian machines, it has no
+ effect other than modifying in_flags on completion.
+
+ [4] The default value of SELECT is (0xa0|DEV_bit|LBA_bit)
+ except for four drives per port chipsets. For four drives
+ per port chipsets, it's (0xa0|DEV_bit|LBA_bit) for the first
+ pair and (0x80|DEV_bit|LBA_bit) for the second pair.
+
+ [5] The argument to the ioctl is a pointer to a region of
+ memory containing a ide_task_request_t structure, followed
+ by an optional buffer of data to be transmitted to the
+ drive, followed by an optional buffer to receive data from
+ the drive.
+
+ Command is passed to the disk drive via the ide_task_request_t
+ structure, which contains these fields:
+
+ io_ports[8] values for the taskfile registers
+ hob_ports[8] high-order bytes, for extended commands
+ out_flags flags indicating which entries in the
+ io_ports[] and hob_ports[] arrays
+ contain valid values. Type ide_reg_valid_t.
+ in_flags flags indicating which entries in the
+ io_ports[] and hob_ports[] arrays
+ are expected to contain valid values
+ on return.
+ data_phase See below
+ req_cmd Command type, see below
+ out_size output (user->drive) buffer size, bytes
+ in_size input (drive->user) buffer size, bytes
+
+ When out_flags is zero, the following registers are loaded.
+
+ HOB_FEATURE If the drive supports LBA48
+ HOB_NSECTOR If the drive supports LBA48
+ HOB_SECTOR If the drive supports LBA48
+ HOB_LCYL If the drive supports LBA48
+ HOB_HCYL If the drive supports LBA48
+ FEATURE
+ NSECTOR
+ SECTOR
+ LCYL
+ HCYL
+ SELECT First, masked with 0xE0 if LBA48, 0xEF
+ otherwise; then, or'ed with the default
+ value of SELECT.
+
+ If any bit in out_flags is set, the following registers are loaded.
+
+ HOB_DATA If out_flags.b.data is set. HOB_DATA will
+ travel on DD8-DD15 on little endian machines
+ and on DD0-DD7 on big endian machines.
+ DATA If out_flags.b.data is set. DATA will
+ travel on DD0-DD7 on little endian machines
+ and on DD8-DD15 on big endian machines.
+ HOB_NSECTOR If out_flags.b.nsector_hob is set
+ HOB_SECTOR If out_flags.b.sector_hob is set
+ HOB_LCYL If out_flags.b.lcyl_hob is set
+ HOB_HCYL If out_flags.b.hcyl_hob is set
+ FEATURE If out_flags.b.feature is set
+ NSECTOR If out_flags.b.nsector is set
+ SECTOR If out_flags.b.sector is set
+ LCYL If out_flags.b.lcyl is set
+ HCYL If out_flags.b.hcyl is set
+ SELECT Or'ed with the default value of SELECT and
+ loaded regardless of out_flags.b.select.
+
+ Taskfile registers are read back from the drive into
+ {io|hob}_ports[] after the command completes iff one of the
+ following conditions is met; otherwise, the original values
+ will be written back, unchanged.
+
+ 1. The drive fails the command (EIO).
+ 2. One or more than one bits are set in out_flags.
+ 3. The requested data_phase is TASKFILE_NO_DATA.
+
+ HOB_DATA If in_flags.b.data is set. It will contain
+ DD8-DD15 on little endian machines and
+ DD0-DD7 on big endian machines.
+ DATA If in_flags.b.data is set. It will contain
+ DD0-DD7 on little endian machines and
+ DD8-DD15 on big endian machines.
+ HOB_FEATURE If the drive supports LBA48
+ HOB_NSECTOR If the drive supports LBA48
+ HOB_SECTOR If the drive supports LBA48
+ HOB_LCYL If the drive supports LBA48
+ HOB_HCYL If the drive supports LBA48
+ NSECTOR
+ SECTOR
+ LCYL
+ HCYL
+
+ The data_phase field describes the data transfer to be
+ performed. Value is one of:
+
+ TASKFILE_IN
+ TASKFILE_MULTI_IN
+ TASKFILE_OUT
+ TASKFILE_MULTI_OUT
+ TASKFILE_IN_OUT
+ TASKFILE_IN_DMA
+ TASKFILE_IN_DMAQ == IN_DMA (queueing not supported)
+ TASKFILE_OUT_DMA
+ TASKFILE_OUT_DMAQ == OUT_DMA (queueing not supported)
+ TASKFILE_P_IN unimplemented
+ TASKFILE_P_IN_DMA unimplemented
+ TASKFILE_P_IN_DMAQ unimplemented
+ TASKFILE_P_OUT unimplemented
+ TASKFILE_P_OUT_DMA unimplemented
+ TASKFILE_P_OUT_DMAQ unimplemented
+
+ The req_cmd field classifies the command type. It may be
+ one of:
+
+ IDE_DRIVE_TASK_NO_DATA
+ IDE_DRIVE_TASK_SET_XFER unimplemented
+ IDE_DRIVE_TASK_IN
+ IDE_DRIVE_TASK_OUT unimplemented
+ IDE_DRIVE_TASK_RAW_WRITE
+
+ [6] Do not access {in|out}_flags->all except for resetting
+ all the bits. Always access individual bit fields. ->all
+ value will flip depending on endianess. For the same
+ reason, do not use IDE_{TASKFILE|HOB}_STD_{OUT|IN}_FLAGS
+ constants defined in hdreg.h.
+
+
+
+HDIO_DRIVE_CMD execute a special drive command
+
+ Note: If you don't have a copy of the ANSI ATA specification
+ handy, you should probably ignore this ioctl.
+
+ usage:
+
+ u8 args[4+XFER_SIZE];
+ ...
+ ioctl(fd, HDIO_DRIVE_CMD, args);
+
+ inputs:
+
+ Commands other than WIN_SMART
+ args[0] COMMAND
+ args[1] NSECTOR
+ args[2] FEATURE
+ args[3] NSECTOR
+
+ WIN_SMART
+ args[0] COMMAND
+ args[1] SECTOR
+ args[2] FEATURE
+ args[3] NSECTOR
+
+ outputs:
+
+ args[] buffer is filled with register values followed by any
+ data returned by the disk.
+ args[0] status
+ args[1] error
+ args[2] NSECTOR
+ args[3] undefined
+ args[4+] NSECTOR * 512 bytes of data returned by the command.
+
+ error returns:
+ EACCES Access denied: requires CAP_SYS_RAWIO
+ ENOMEM Unable to allocate memory for task
+ EIO Drive reports error
+
+ notes:
+
+ [1] For commands other than WIN_SMART, args[1] should equal
+ args[3]. SECTOR, LCYL and HCYL are undefined. For
+ WIN_SMART, 0x4f and 0xc2 are loaded into LCYL and HCYL
+ respectively. In both cases SELECT will contain the default
+ value for the drive. Please refer to HDIO_DRIVE_TASKFILE
+ notes for the default value of SELECT.
+
+ [2] If NSECTOR value is greater than zero and the drive sets
+ DRQ when interrupting for the command, NSECTOR * 512 bytes
+ are read from the device into the area following NSECTOR.
+ In the above example, the area would be
+ args[4..4+XFER_SIZE]. 16bit PIO is used regardless of
+ HDIO_SET_32BIT setting.
+
+ [3] If COMMAND == WIN_SETFEATURES && FEATURE == SETFEATURES_XFER
+ && NSECTOR >= XFER_SW_DMA_0 && the drive supports any DMA
+ mode, IDE driver will try to tune the transfer mode of the
+ drive accordingly.
+
+
+
+HDIO_DRIVE_TASK execute task and special drive command
+
+ Note: If you don't have a copy of the ANSI ATA specification
+ handy, you should probably ignore this ioctl.
+
+ usage:
+
+ u8 args[7];
+ ...
+ ioctl(fd, HDIO_DRIVE_TASK, args);
+
+ inputs:
+
+ Taskfile register values:
+ args[0] COMMAND
+ args[1] FEATURE
+ args[2] NSECTOR
+ args[3] SECTOR
+ args[4] LCYL
+ args[5] HCYL
+ args[6] SELECT
+
+ outputs:
+
+ Taskfile register values:
+ args[0] status
+ args[1] error
+ args[2] NSECTOR
+ args[3] SECTOR
+ args[4] LCYL
+ args[5] HCYL
+ args[6] SELECT
+
+ error returns:
+ EACCES Access denied: requires CAP_SYS_RAWIO
+ ENOMEM Unable to allocate memory for task
+ ENOMSG Device is not a disk drive.
+ EIO Drive failed the command.
+
+ notes:
+
+ [1] DEV bit (0x10) of SELECT register is ignored and the
+ appropriate value for the drive is used. All other bits
+ are used unaltered.
+
+
+
+HDIO_DRIVE_CMD_AEB HDIO_DRIVE_TASK
+
+ Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_32BIT change io_32bit flags
+
+ usage:
+
+ int val;
+ ioctl(fd, HDIO_SET_32BIT, val);
+
+ inputs:
+ New value for io_32bit flag
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 3]
+ EBUSY Controller busy
+
+
+
+
+HDIO_SET_NOWERR change ignore-write-error flag
+
+ usage:
+
+ int val;
+ ioctl(fd, HDIO_SET_NOWERR, val);
+
+ inputs:
+ New value for ignore-write-error flag. Used for ignoring
+ WRERR_STAT
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 1]
+ EBUSY Controller busy
+
+
+
+HDIO_SET_DMA change use-dma flag
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_SET_DMA, val);
+
+ inputs:
+ New value for use-dma flag
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 1]
+ EBUSY Controller busy
+
+
+
+HDIO_SET_PIO_MODE reconfig interface to new speed
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_SET_PIO_MODE, val);
+
+ inputs:
+ New interface speed.
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 255]
+ EBUSY Controller busy
+
+
+
+HDIO_SCAN_HWIF register and (re)scan interface
+
+ usage:
+
+ int args[3]
+ ...
+ ioctl(fd, HDIO_SCAN_HWIF, args);
+
+ inputs:
+ args[0] io address to probe
+ args[1] control address to probe
+ args[2] irq number
+
+ outputs: none
+
+ error returns:
+ EACCES Access denied: requires CAP_SYS_RAWIO
+ EIO Probe failed.
+
+ notes:
+
+ This ioctl initializes the addresses and irq for a disk
+ controller, probes for drives, and creates /proc/ide
+ interfaces as appropriate.
+
+
+
+HDIO_UNREGISTER_HWIF unregister interface
+
+ usage:
+
+ int index;
+ ioctl(fd, HDIO_UNREGISTER_HWIF, index);
+
+ inputs:
+ index index of hardware interface to unregister
+
+ outputs: none
+
+ error returns:
+ EACCES Access denied: requires CAP_SYS_RAWIO
+
+ notes:
+
+ This ioctl removes a hardware interface from the kernel.
+
+ Currently (2.6.8) this ioctl silently fails if any drive on
+ the interface is busy.
+
+
+
+HDIO_SET_WCACHE change write cache enable-disable
+
+ usage:
+
+ int val;
+ ioctl(fd, HDIO_SET_WCACHE, val);
+
+ inputs:
+ New value for write cache enable
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 1]
+ EBUSY Controller busy
+
+
+
+HDIO_SET_ACOUSTIC change acoustic behavior
+
+ usage:
+
+ int val;
+ ioctl(fd, HDIO_SET_ACOUSTIC, val);
+
+ inputs:
+ New value for drive acoustic settings
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 254]
+ EBUSY Controller busy
+
+
+
+HDIO_SET_QDMA change use-qdma flag
+
+ Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_ADDRESS change lba addressing modes
+
+ usage:
+
+ int val;
+ ioctl(fd, HDIO_SET_ADDRESS, val);
+
+ inputs:
+ New value for addressing mode
+ 0 = 28-bit
+ 1 = 48-bit
+ 2 = 48-bit doing 28-bit
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 2]
+ EBUSY Controller busy
+ EIO Drive does not support lba48 mode.
+
+
+HDIO_SET_IDE_SCSI
+
+ usage:
+
+ long val;
+ ioctl(fd, HDIO_SET_IDE_SCSI, val);
+
+ inputs:
+ New value for scsi emulation mode (?)
+
+ outputs: none
+
+ error return:
+ EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ EACCES Access denied: requires CAP_SYS_ADMIN
+ EINVAL value out of range [0 1]
+ EBUSY Controller busy
+
+
+
+HDIO_SET_SCSI_IDE
+
+ Not implemented, as of 2.6.8.1
+
+
diff --git a/Documentation/ioctl/ioctl-decoding.txt b/Documentation/ioctl/ioctl-decoding.txt
new file mode 100644
index 0000000..e35efb0
--- /dev/null
+++ b/Documentation/ioctl/ioctl-decoding.txt
@@ -0,0 +1,24 @@
+To decode a hex IOCTL code:
+
+Most architectures use this generic format, but check
+include/ARCH/ioctl.h for specifics, e.g. powerpc
+uses 3 bits to encode read/write and 13 bits for size.
+
+ bits meaning
+ 31-30 00 - no parameters: uses _IO macro
+ 10 - read: _IOR
+ 01 - write: _IOW
+ 11 - read/write: _IOWR
+
+ 29-16 size of arguments
+
+ 15-8 ascii character supposedly
+ unique to each driver
+
+ 7-0 function #
+
+
+So for example 0x82187201 is a read with arg length of 0x218,
+character 'r' function 1. Grepping the source reveals this is:
+
+#define VFAT_IOCTL_READDIR_BOTH _IOR('r', 1, struct dirent [2])
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
new file mode 100644
index 0000000..b880ce5
--- /dev/null
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -0,0 +1,201 @@
+Ioctl Numbers
+19 October 1999
+Michael Elizabeth Chastain
+<mec@shout.net>
+
+If you are adding new ioctl's to the kernel, you should use the _IO
+macros defined in <linux/ioctl.h>:
+
+ _IO an ioctl with no parameters
+ _IOW an ioctl with write parameters (copy_from_user)
+ _IOR an ioctl with read parameters (copy_to_user)
+ _IOWR an ioctl with both write and read parameters.
+
+'Write' and 'read' are from the user's point of view, just like the
+system calls 'write' and 'read'. For example, a SET_FOO ioctl would
+be _IOW, although the kernel would actually read data from user space;
+a GET_FOO ioctl would be _IOR, although the kernel would actually write
+data to user space.
+
+The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter
+or number from the table below. Because of the large number of drivers,
+many drivers share a partial letter with other drivers.
+
+If you are writing a driver for a new device and need a letter, pick an
+unused block with enough room for expansion: 32 to 256 ioctl commands.
+You can register the block by patching this file and submitting the
+patch to Linus Torvalds. Or you can e-mail me at <mec@shout.net> and
+I'll register one for you.
+
+The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number
+to distinguish ioctls from each other. The third argument to _IOW,
+_IOR, or _IOWR is the type of the data going into the kernel or coming
+out of the kernel (e.g. 'int' or 'struct foo'). NOTE! Do NOT use
+sizeof(arg) as the third argument as this results in your ioctl thinking
+it passes an argument of type size_t.
+
+Some devices use their major number as the identifier; this is OK, as
+long as it is unique. Some devices are irregular and don't follow any
+convention at all.
+
+Following this convention is good because:
+
+(1) Keeping the ioctl's globally unique helps error checking:
+ if a program calls an ioctl on the wrong device, it will get an
+ error rather than some unexpected behaviour.
+
+(2) The 'strace' build procedure automatically finds ioctl numbers
+ defined with _IO, _IOW, _IOR, or _IOWR.
+
+(3) 'strace' can decode numbers back into useful names when the
+ numbers are unique.
+
+(4) People looking for ioctls can grep for them more easily when
+ this convention is used to define the ioctl numbers.
+
+(5) When following the convention, the driver code can use generic
+ code to copy the parameters between user and kernel space.
+
+This table lists ioctls visible from user land for Linux/i386. It contains
+most drivers up to 2.3.14, but I know I am missing some.
+
+Code Seq# Include File Comments
+========================================================
+0x00 00-1F linux/fs.h conflict!
+0x00 00-1F scsi/scsi_ioctl.h conflict!
+0x00 00-1F linux/fb.h conflict!
+0x00 00-1F linux/wavefront.h conflict!
+0x02 all linux/fd.h
+0x03 all linux/hdreg.h
+0x04 D2-DC linux/umsdos_fs.h Dead since 2.6.11, but don't reuse these.
+0x06 all linux/lp.h
+0x09 all linux/md.h
+0x12 all linux/fs.h
+ linux/blkpg.h
+0x1b all InfiniBand Subsystem <http://www.openib.org/>
+0x20 all drivers/cdrom/cm206.h
+0x22 all scsi/sg.h
+'#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem
+'1' 00-1F <linux/timepps.h> PPS kit from Ulrich Windl
+ <ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
+'8' all SNP8023 advanced NIC card
+ <mailto:mcr@solidum.com>
+'A' 00-1F linux/apm_bios.h
+'B' C0-FF advanced bbus
+ <mailto:maassen@uni-freiburg.de>
+'C' all linux/soundcard.h
+'D' all asm-s390/dasd.h
+'E' all linux/input.h
+'F' all linux/fb.h
+'H' all linux/hiddev.h
+'I' all linux/isdn.h
+'J' 00-1F drivers/scsi/gdth_ioctl.h
+'K' all linux/kd.h
+'L' 00-1F linux/loop.h
+'L' 20-2F driver/usb/misc/vstusb.h
+'L' E0-FF linux/ppdd.h encrypted disk device driver
+ <http://linux01.gwdg.de/~alatham/ppdd.html>
+'M' all linux/soundcard.h
+'N' 00-1F drivers/usb/scanner.h
+'P' all linux/soundcard.h
+'Q' all linux/soundcard.h
+'R' 00-1F linux/random.h
+'S' all linux/cdrom.h conflict!
+'S' 80-81 scsi/scsi_ioctl.h conflict!
+'S' 82-FF scsi/scsi.h conflict!
+'T' all linux/soundcard.h conflict!
+'T' all asm-i386/ioctls.h conflict!
+'U' 00-EF linux/drivers/usb/usb.h
+'V' all linux/vt.h
+'W' 00-1F linux/watchdog.h conflict!
+'W' 00-1F linux/wanrouter.h conflict!
+'X' all linux/xfs_fs.h
+'Y' all linux/cyclades.h
+'[' 00-07 linux/usb/usbtmc.h USB Test and Measurement Devices
+ <mailto:gregkh@suse.de>
+'a' all ATM on linux
+ <http://lrcwww.epfl.ch/linux-atm/magic.html>
+'b' 00-FF bit3 vme host bridge
+ <mailto:natalia@nikhefk.nikhef.nl>
+'c' 00-7F linux/comstats.h conflict!
+'c' 00-7F linux/coda.h conflict!
+'c' 80-9F asm-s390/chsc.h
+'d' 00-FF linux/char/drm/drm/h conflict!
+'d' 00-DF linux/video_decoder.h conflict!
+'d' F0-FF linux/digi1.h
+'e' all linux/digi1.h conflict!
+'e' 00-1F linux/video_encoder.h conflict!
+'e' 00-1F net/irda/irtty.h conflict!
+'f' 00-1F linux/ext2_fs.h
+'h' 00-7F Charon filesystem
+ <mailto:zapman@interlan.net>
+'i' 00-3F linux/i2o.h
+'j' 00-3F linux/joystick.h
+'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system
+ <http://mikonos.dia.unisa.it/tcfs>
+'l' 40-7F linux/udf_fs_i.h in development:
+ <http://sourceforge.net/projects/linux-udf/>
+'m' all linux/mtio.h conflict!
+'m' all linux/soundcard.h conflict!
+'m' all linux/synclink.h conflict!
+'m' 00-1F net/irda/irmod.h conflict!
+'n' 00-7F linux/ncp_fs.h
+'n' E0-FF video/matrox.h matroxfb
+'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2
+'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this)
+'p' 00-3F linux/mc146818rtc.h conflict!
+'p' 40-7F linux/nvram.h
+'p' 80-9F user-space parport
+ <mailto:tim@cyberelk.net>
+'q' 00-1F linux/serio.h
+'q' 80-FF Internet PhoneJACK, Internet LineJACK
+ <http://www.quicknet.net>
+'r' 00-1F linux/msdos_fs.h
+'s' all linux/cdk.h
+'t' 00-7F linux/if_ppp.h
+'t' 80-8F linux/isdn_ppp.h
+'u' 00-1F linux/smb_fs.h
+'v' 00-1F linux/ext2_fs.h conflict!
+'v' all linux/videodev.h conflict!
+'w' all CERN SCI driver
+'y' 00-1F packet based user level communications
+ <mailto:zapman@interlan.net>
+'z' 00-3F CAN bus card
+ <mailto:hdstich@connectu.ulm.circular.de>
+'z' 40-7F CAN bus card
+ <mailto:oe@port.de>
+0x80 00-1F linux/fb.h
+0x81 00-1F linux/videotext.h
+0x89 00-06 asm-i386/sockios.h
+0x89 0B-DF linux/sockios.h
+0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range
+0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range
+0x8B all linux/wireless.h
+0x8C 00-3F WiNRADiO driver
+ <http://www.proximity.com.au/~brian/winradio/>
+0x90 00 drivers/cdrom/sbpcd.h
+0x93 60-7F linux/auto_fs.h
+0x99 00-0F 537-Addinboard driver
+ <mailto:buk@buks.ipn.de>
+0xA0 all linux/sdp/sdp.h Industrial Device Project
+ <mailto:kenji@bitgate.com>
+0xA3 80-8F Port ACL in development:
+ <mailto:tlewis@mindspring.com>
+0xA3 90-9F linux/dtlk.h
+0xAB 00-1F linux/nbd.h
+0xAC 00-1F linux/raw.h
+0xAD 00 Netfilter device in development:
+ <mailto:rusty@rustcorp.com.au>
+0xAE all linux/kvm.h Kernel-based Virtual Machine
+ <mailto:kvm-devel@lists.sourceforge.net>
+0xB0 all RATIO devices in development:
+ <mailto:vgo@ratio.de>
+0xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca>
+0xCB 00-1F CBM serial IEC bus in development:
+ <mailto:michael.klein@puffin.lb.shuttle.de>
+0xDD 00-3F ZFCP device driver see drivers/s390/scsi/
+ <mailto:aherrman@de.ibm.com>
+0xF3 00-3F video/sisfb.h sisfb (in development)
+ <mailto:thomas@winischhofer.net>
+0xF4 00-1F video/mbxfb.h mbxfb
+ <mailto:raph@8d.com>
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
new file mode 100644
index 0000000..59a69ec
--- /dev/null
+++ b/Documentation/iostats.txt
@@ -0,0 +1,163 @@
+I/O statistics fields
+---------------
+
+Last modified Sep 30, 2003
+
+Since 2.4.20 (and some versions before, with patches), and 2.5.45,
+more extensive disk statistics have been introduced to help measure disk
+activity. Tools such as sar and iostat typically interpret these and do
+the work for you, but in case you are interested in creating your own
+tools, the fields are explained here.
+
+In 2.4 now, the information is found as additional fields in
+/proc/partitions. In 2.6, the same information is found in two
+places: one is in the file /proc/diskstats, and the other is within
+the sysfs file system, which must be mounted in order to obtain
+the information. Throughout this document we'll assume that sysfs
+is mounted on /sys, although of course it may be mounted anywhere.
+Both /proc/diskstats and sysfs use the same source for the information
+and so should not differ.
+
+Here are examples of these different formats:
+
+2.4:
+ 3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+ 3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030
+
+
+2.6 sysfs:
+ 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+ 35486 38030 38030 38030
+
+2.6 diskstats:
+ 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
+ 3 1 hda1 35486 38030 38030 38030
+
+On 2.4 you might execute "grep 'hda ' /proc/partitions". On 2.6, you have
+a choice of "cat /sys/block/hda/stat" or "grep 'hda ' /proc/diskstats".
+The advantage of one over the other is that the sysfs choice works well
+if you are watching a known, small set of disks. /proc/diskstats may
+be a better choice if you are watching a large number of disks because
+you'll avoid the overhead of 50, 100, or 500 or more opens/closes with
+each snapshot of your disk statistics.
+
+In 2.4, the statistics fields are those after the device name. In
+the above example, the first field of statistics would be 446216.
+By contrast, in 2.6 if you look at /sys/block/hda/stat, you'll
+find just the eleven fields, beginning with 446216. If you look at
+/proc/diskstats, the eleven fields will be preceded by the major and
+minor device numbers, and device name. Each of these formats provide
+eleven fields of statistics, each meaning exactly the same things.
+All fields except field 9 are cumulative since boot. Field 9 should
+go to zero as I/Os complete; all others only increase. Yes, these are
+32 bit unsigned numbers, and on a very busy or long-lived system they
+may wrap. Applications should be prepared to deal with that; unless
+your observations are measured in large numbers of minutes or hours,
+they should not wrap twice before you notice them.
+
+Each set of stats only applies to the indicated device; if you want
+system-wide stats you'll have to find all the devices and sum them all up.
+
+Field 1 -- # of reads completed
+ This is the total number of reads completed successfully.
+Field 2 -- # of reads merged, field 6 -- # of writes merged
+ Reads and writes which are adjacent to each other may be merged for
+ efficiency. Thus two 4K reads may become one 8K read before it is
+ ultimately handed to the disk, and so it will be counted (and queued)
+ as only one I/O. This field lets you know how often this was done.
+Field 3 -- # of sectors read
+ This is the total number of sectors read successfully.
+Field 4 -- # of milliseconds spent reading
+ This is the total number of milliseconds spent by all reads (as
+ measured from __make_request() to end_that_request_last()).
+Field 5 -- # of writes completed
+ This is the total number of writes completed successfully.
+Field 7 -- # of sectors written
+ This is the total number of sectors written successfully.
+Field 8 -- # of milliseconds spent writing
+ This is the total number of milliseconds spent by all writes (as
+ measured from __make_request() to end_that_request_last()).
+Field 9 -- # of I/Os currently in progress
+ The only field that should go to zero. Incremented as requests are
+ given to appropriate struct request_queue and decremented as they finish.
+Field 10 -- # of milliseconds spent doing I/Os
+ This field is increases so long as field 9 is nonzero.
+Field 11 -- weighted # of milliseconds spent doing I/Os
+ This field is incremented at each I/O start, I/O completion, I/O
+ merge, or read of these stats by the number of I/Os in progress
+ (field 9) times the number of milliseconds spent doing I/O since the
+ last update of this field. This can provide an easy measure of both
+ I/O completion time and the backlog that may be accumulating.
+
+
+To avoid introducing performance bottlenecks, no locks are held while
+modifying these counters. This implies that minor inaccuracies may be
+introduced when changes collide, so (for instance) adding up all the
+read I/Os issued per partition should equal those made to the disks ...
+but due to the lack of locking it may only be very close.
+
+In 2.6, there are counters for each cpu, which made the lack of locking
+almost a non-issue. When the statistics are read, the per-cpu counters
+are summed (possibly overflowing the unsigned 32-bit variable they are
+summed to) and the result given to the user. There is no convenient
+user interface for accessing the per-cpu counters themselves.
+
+Disks vs Partitions
+-------------------
+
+There were significant changes between 2.4 and 2.6 in the I/O subsystem.
+As a result, some statistic information disappeared. The translation from
+a disk address relative to a partition to the disk address relative to
+the host disk happens much earlier. All merges and timings now happen
+at the disk level rather than at both the disk and partition level as
+in 2.4. Consequently, you'll see a different statistics output on 2.6 for
+partitions from that for disks. There are only *four* fields available
+for partitions on 2.6 machines. This is reflected in the examples above.
+
+Field 1 -- # of reads issued
+ This is the total number of reads issued to this partition.
+Field 2 -- # of sectors read
+ This is the total number of sectors requested to be read from this
+ partition.
+Field 3 -- # of writes issued
+ This is the total number of writes issued to this partition.
+Field 4 -- # of sectors written
+ This is the total number of sectors requested to be written to
+ this partition.
+
+Note that since the address is translated to a disk-relative one, and no
+record of the partition-relative address is kept, the subsequent success
+or failure of the read cannot be attributed to the partition. In other
+words, the number of reads for partitions is counted slightly before time
+of queuing for partitions, and at completion for whole disks. This is
+a subtle distinction that is probably uninteresting for most cases.
+
+More significant is the error induced by counting the numbers of
+reads/writes before merges for partitions and after for disks. Since a
+typical workload usually contains a lot of successive and adjacent requests,
+the number of reads/writes issued can be several times higher than the
+number of reads/writes completed.
+
+In 2.6.25, the full statistic set is again available for partitions and
+disk and partition statistics are consistent again. Since we still don't
+keep record of the partition-relative address, an operation is attributed to
+the partition which contains the first sector of the request after the
+eventual merges. As requests can be merged across partition, this could lead
+to some (probably insignificant) inaccuracy.
+
+Additional notes
+----------------
+
+In 2.6, sysfs is not mounted by default. If your distribution of
+Linux hasn't added it already, here's the line you'll want to add to
+your /etc/fstab:
+
+none /sys sysfs defaults 0 0
+
+
+In 2.6, all disk statistics were removed from /proc/stat. In 2.4, they
+appear in both /proc/partitions and /proc/stat, although the ones in
+/proc/stat take a very different format from those in /proc/partitions
+(see proc(5), if your system has it.)
+
+-- ricklind@us.ibm.com
diff --git a/Documentation/irqflags-tracing.txt b/Documentation/irqflags-tracing.txt
new file mode 100644
index 0000000..6a44487
--- /dev/null
+++ b/Documentation/irqflags-tracing.txt
@@ -0,0 +1,57 @@
+IRQ-flags state tracing
+
+started by Ingo Molnar <mingo@redhat.com>
+
+the "irq-flags tracing" feature "traces" hardirq and softirq state, in
+that it gives interested subsystems an opportunity to be notified of
+every hardirqs-off/hardirqs-on, softirqs-off/softirqs-on event that
+happens in the kernel.
+
+CONFIG_TRACE_IRQFLAGS_SUPPORT is needed for CONFIG_PROVE_SPIN_LOCKING
+and CONFIG_PROVE_RW_LOCKING to be offered by the generic lock debugging
+code. Otherwise only CONFIG_PROVE_MUTEX_LOCKING and
+CONFIG_PROVE_RWSEM_LOCKING will be offered on an architecture - these
+are locking APIs that are not used in IRQ context. (the one exception
+for rwsems is worked around)
+
+architecture support for this is certainly not in the "trivial"
+category, because lots of lowlevel assembly code deal with irq-flags
+state changes. But an architecture can be irq-flags-tracing enabled in a
+rather straightforward and risk-free manner.
+
+Architectures that want to support this need to do a couple of
+code-organizational changes first:
+
+- move their irq-flags manipulation code from their asm/system.h header
+ to asm/irqflags.h
+
+- rename local_irq_disable()/etc to raw_local_irq_disable()/etc. so that
+ the linux/irqflags.h code can inject callbacks and can construct the
+ real local_irq_disable()/etc APIs.
+
+- add and enable TRACE_IRQFLAGS_SUPPORT in their arch level Kconfig file
+
+and then a couple of functional changes are needed as well to implement
+irq-flags-tracing support:
+
+- in lowlevel entry code add (build-conditional) calls to the
+ trace_hardirqs_off()/trace_hardirqs_on() functions. The lock validator
+ closely guards whether the 'real' irq-flags matches the 'virtual'
+ irq-flags state, and complains loudly (and turns itself off) if the
+ two do not match. Usually most of the time for arch support for
+ irq-flags-tracing is spent in this state: look at the lockdep
+ complaint, try to figure out the assembly code we did not cover yet,
+ fix and repeat. Once the system has booted up and works without a
+ lockdep complaint in the irq-flags-tracing functions arch support is
+ complete.
+- if the architecture has non-maskable interrupts then those need to be
+ excluded from the irq-tracing [and lock validation] mechanism via
+ lockdep_off()/lockdep_on().
+
+in general there is no risk from having an incomplete irq-flags-tracing
+implementation in an architecture: lockdep will detect that and will
+turn itself off. I.e. the lock validator will still be reliable. There
+should be no crashes due to irq-tracing bugs. (except if the assembly
+changes break other code by modifying conditions or registers that
+shouldnt be)
+
diff --git a/Documentation/isapnp.txt b/Documentation/isapnp.txt
new file mode 100644
index 0000000..400d1b5
--- /dev/null
+++ b/Documentation/isapnp.txt
@@ -0,0 +1,14 @@
+ISA Plug & Play support by Jaroslav Kysela <perex@suse.cz>
+==========================================================
+
+Interface /proc/isapnp
+======================
+
+The interface has been removed. See pnp.txt for more details.
+
+Interface /proc/bus/isapnp
+==========================
+
+This directory allows access to ISA PnP cards and logical devices.
+The regular files contain the contents of ISA PnP registers for
+a logical device.
diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX
new file mode 100644
index 0000000..9fee5f2
--- /dev/null
+++ b/Documentation/isdn/00-INDEX
@@ -0,0 +1,43 @@
+00-INDEX
+ - this file (info on ISDN implementation for Linux)
+CREDITS
+ - list of the kind folks that brought you this stuff.
+INTERFACE
+ - description of Linklevel and Hardwarelevel ISDN interface.
+README
+ - general info on what you need and what to do for Linux ISDN.
+README.FAQ
+ - general info for FAQ.
+README.audio
+ - info for running audio over ISDN.
+README.fax
+ - info for using Fax over ISDN.
+README.icn
+ - info on the ICN-ISDN-card and its driver.
+README.HiSax
+ - info on the HiSax driver which replaces the old teles.
+README.hfc-pci
+ - info on hfc-pci based cards.
+README.pcbit
+ - info on the PCBIT-D ISDN adapter and driver.
+README.syncppp
+ - info on running Sync PPP over ISDN.
+syncPPP.FAQ
+ - frequently asked questions about running PPP over ISDN.
+README.avmb1
+ - info on driver for AVM-B1 ISDN card.
+README.act2000
+ - info on driver for IBM ACT-2000 card.
+README.eicon
+ - info on driver for Eicon active cards.
+README.concap
+ - info on "CONCAP" encapsulation protocol interface used for X.25.
+README.diversion
+ - info on module for isdn diversion services.
+README.sc
+ - info on driver for Spellcaster cards.
+README.x25
+ _ info for running X.25 over ISDN.
+README.hysdn
+ - info on driver for Hypercope active HYSDN cards
+
diff --git a/Documentation/isdn/CREDITS b/Documentation/isdn/CREDITS
new file mode 100644
index 0000000..c1679e9
--- /dev/null
+++ b/Documentation/isdn/CREDITS
@@ -0,0 +1,70 @@
+
+I want to thank all who contributed to this project and especially to:
+(in alphabetical order)
+
+Thomas Bogendörfer (tsbogend@bigbug.franken.de)
+ Tester, lots of bugfixes and hints.
+
+Alan Cox (alan@lxorguk.ukuu.org.uk)
+ For help getting into standard-kernel.
+
+Henner Eisen (eis@baty.hanse.de)
+ For X.25 implementation.
+
+Volker Götz (volker@oops.franken.de)
+ For contribution of man-pages, the imontty-tool and a perfect
+ maintaining of the mailing-list at hub-wue.
+
+Matthias Hessler (hessler@isdn4linux.de)
+ For creating and maintaining the FAQ.
+
+Bernhard Hailer (Bernhard.Hailer@lrz.uni-muenchen.de)
+ For creating the FAQ, and the leafsite HOWTO.
+
+Michael 'Ghandi' Herold (michael@abadonna.franken.de)
+ For contribution of the vbox answering machine.
+
+Michael Hipp (Michael.Hipp@student.uni-tuebingen.de)
+ For his Sync-PPP-code.
+
+Karsten Keil (keil@isdn4linux.de)
+ For adding 1TR6-support to the Teles-driver.
+ For the HiSax-driver.
+
+Michael Knigge (knick@cove.han.de)
+ For contributing the imon-tool
+
+Andreas Kool (akool@Kool.f.EUnet.de)
+ For contribution of the isdnlog/isdnrep-tool
+
+Pedro Roque Marques (roque@di.fc.ul.pt)
+ For lot of new ideas and the pcbit driver.
+
+Eberhard Mönkeberg (emoenke@gwdg.de)
+ For testing and help to get into kernel.
+
+Thomas Neumann (tn@ruhr.de)
+ For help with Cisco-SLARP and keepalive
+
+Jan den Ouden (denouden@groovin.xs4all.nl)
+ For contribution of the original teles-driver
+
+Carsten Paeth (calle@calle.in-berlin.de)
+ For the AVM-B1-CAPI2.0 driver
+
+Thomas Pfeiffer (pfeiffer@pds.de)
+ For V.110, extended T.70 and Hylafax extensions in isdn_tty.c
+
+Max Riegel (riegel@max.franken.de)
+ For making the ICN hardware-documentation and test-equipment available.
+
+Armin Schindler (mac@melware.de)
+ For the eicon active card driver.
+
+Gerhard 'Fido' Schneider (fido@wuff.mayn.de)
+ For heavy-duty-beta-testing with his BBS ;)
+
+Thomas Uhl (uhl@think.de)
+ For distributing the cards.
+ For pushing me to work ;-)
+
diff --git a/Documentation/isdn/HiSax.cert b/Documentation/isdn/HiSax.cert
new file mode 100644
index 0000000..f2a6fcb
--- /dev/null
+++ b/Documentation/isdn/HiSax.cert
@@ -0,0 +1,96 @@
+-----BEGIN PGP SIGNED MESSAGE-----
+
+First:
+
+ HiSax is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+However, if you wish to modify the HiSax sources, please note the following:
+
+HiSax has passed the ITU approval test suite with ELSA Quickstep ISDN cards
+and Eicon Technology Diva 2.01 PCI card.
+The certification is only valid for the combination of the tested software
+version and the tested hardware. Any changes to the HiSax source code may
+therefore affect the certification.
+
+Additional ITU approval tests have been carried out for all generic cards
+using Colognechip single chip solutions HFC-S PCI A for PCI cards as well
+as HFC-S USB based USB ISDN ta adapters.
+These tests included all layers 1-3 and as well all functional tests for
+the layer 1. Because all hardware based on these chips are complete ISDN
+solutions in one chip all cards and USB-TAs using these chips are to be
+regarded as approved for those tests. Some additional electrical tests
+of the layer 1 which are independent of the driver and related to a
+special hardware used will be regarded as approved if at least one
+solution has been tested including those electrical tests. So if cards
+or tas have been completely approved for any other os, the approval
+for those electrical tests is valid for linux, too.
+Please send any questions regarding this drivers or approval abouts to
+werner@isdn-development.de
+Additional information and the type approval documents will be found
+shortly on the Colognechip website www.colognechip.com
+
+If you change the main files of the HiSax ISDN stack, the certification will
+become invalid. Because in most countries it is illegal to connect
+unapproved ISDN equipment to the public network, I have to guarantee that
+changes in HiSax do not affect the certification.
+
+In order to make a valid certification apparent to the user, I have built in
+some validation checks that are made during the make process. The HiSax main
+files are protected by md5 checksums and the md5sum file is pgp signed by
+myself:
+
+KeyID 1024/FF992F6D 1997/01/16 Karsten Keil <kkeil@suse.de>
+Key fingerprint = 92 6B F7 58 EE 86 28 C8 C4 1A E6 DC 39 89 F2 AA
+
+Only if the checksums are OK, and the signature of the file
+"drivers/isdn/hisax/md5sums.asc" match, is the certification valid; a
+message confirming this is then displayed during the hisax init process.
+
+The affected files are:
+
+drivers/isdn/hisax/isac.c
+drivers/isdn/hisax/isdnl1.c
+drivers/isdn/hisax/isdnl2.c
+drivers/isdn/hisax/isdnl3.c
+drivers/isdn/hisax/tei.c
+drivers/isdn/hisax/callc.c
+drivers/isdn/hisax/l3dss1.c
+drivers/isdn/hisax/l3_1tr6.c
+drivers/isdn/hisax/cert.c
+drivers/isdn/hisax/elsa.c
+drivers/isdn/hisax/diva.c
+drivers/isdn/hisax/hfc_pci.c
+
+Please send any changes, bugfixes and patches to me rather than implementing
+them directly into the HiSax sources.
+
+This does not reduce your rights granted by the GNU General Public License.
+If you wish to change the sources, go ahead; but note that then the
+certification is invalid even if you use one of the approved cards.
+
+Here are the certification registration numbers for ELSA Quickstep cards:
+German D133361J CETECOM ICT Services GmbH 0682
+European D133362J CETECOM ICT Services GmbH 0682
+
+
+Karsten Keil
+keil@isdn4linux.de
+
+-----BEGIN PGP SIGNATURE-----
+Version: 2.6.3i
+Charset: noconv
+
+iQCVAwUBOFAwqTpxHvX/mS9tAQFI2QP9GLDK2iy/KBhwReE3F7LeO+tVhffTVZ3a
+20q5/z/WcIg/pnH0uTkl2UgDXBFXYl45zJyDGNpAposIFmT+Edd14o7Vj1w/BBdn
+Y+5rBmJf+gyBu61da5d6bv0lpymwRa/um+ri+ilYnZ/XPfg5JKhdjGSBCJuJAElM
+d2jFbTrsMYw=
+=LNf9
+-----END PGP SIGNATURE-----
diff --git a/Documentation/isdn/INTERFACE b/Documentation/isdn/INTERFACE
new file mode 100644
index 0000000..5df17e5
--- /dev/null
+++ b/Documentation/isdn/INTERFACE
@@ -0,0 +1,759 @@
+$Id: INTERFACE,v 1.15.8.2 2001/03/13 16:17:07 kai Exp $
+
+Description of the Interface between Linklevel and Hardwarelevel
+ of isdn4linux:
+
+
+ The Communication between Linklevel (LL) and Hardwarelevel (HL)
+ is based on the struct isdn_if (defined in isdnif.h).
+
+ An HL-driver can register itself at LL by calling the function
+ register_isdn() with a pointer to that struct. Prior to that, it has
+ to preset some of the fields of isdn_if. The LL sets the rest of
+ the fields. All further communication is done via callbacks using
+ the function-pointers defined in isdn_if.
+
+ Changes/Version numbering:
+
+ During development of the ISDN subsystem, several changes have been
+ made to the interface. Before it went into kernel, the package
+ had a unique version number. The last version, distributed separately
+ was 0.7.4. When the subsystem went into kernel, every functional unit
+ got a separate version number. These numbers are shown at initialization,
+ separated by slashes:
+
+ c.c/t.t/n.n/p.p/a.a/v.v
+
+ where
+
+ c.c is the revision of the common code.
+ t.t is the revision of the tty related code.
+ n.n is the revision of the network related code.
+ p.p is the revision of the ppp related code.
+ a.a is the revision of the audio related code.
+ v.v is the revision of the V.110 related code.
+
+ Changes in this document are marked with '***CHANGEx' where x representing
+ the version number. If that number starts with 0, it refers to the old,
+ separately distributed package. If it starts with one of the letters
+ above, it refers to the revision of the corresponding module.
+ ***CHANGEIx refers to the revision number of the isdnif.h
+
+1. Description of the fields of isdn_if:
+
+ int channels;
+
+ This field has to be set by the HL-driver to the number of channels
+ supported prior to calling register_isdn(). Upon return of the call,
+ the LL puts an id there, which has to be used by the HL-driver when
+ invoking the other callbacks.
+
+ int maxbufsize;
+
+ ***CHANGE0.6: New since this version.
+
+ Also to be preset by the HL-driver. With this value the HL-driver
+ tells the LL the maximum size of a data-packet it will accept.
+
+ unsigned long features;
+
+ To be preset by the HL-driver. Using this field, the HL-driver
+ announces the features supported. At the moment this is limited to
+ report the supported layer2 and layer3-protocols. For setting this
+ field the constants ISDN_FEATURE..., declared in isdnif.h have to be
+ used.
+
+ ***CHANGE0.7.1: The line type (1TR6, EDSS1) has to be set.
+
+ unsigned short hl_hdrlen;
+
+ ***CHANGE0.7.4: New field.
+
+ To be preset by the HL-driver, if it supports sk_buff's. The driver
+ should put here the amount of additional space needed in sk_buff's for
+ its internal purposes. Drivers not supporting sk_buff's should
+ initialize this field to 0.
+
+ void (*rcvcallb_skb)(int, int, struct sk_buff *)
+
+ ***CHANGE0.7.4: New field.
+
+ This field will be set by LL. The HL-driver delivers received data-
+ packets by calling this function. Upon calling, the HL-driver must
+ already have its private data pulled off the head of the sk_buff.
+
+ Parameter:
+ int driver-Id
+ int Channel-number locally to the driver. (starting with 0)
+ struct sk_buff * Pointer to sk_buff, containing received data.
+
+ int (*statcallb)(isdn_ctrl*);
+
+ This field will be set by LL. This function has to be called by the
+ HL-driver for signaling status-changes or other events to the LL.
+
+ Parameter:
+ isdn_ctrl*
+
+ The struct isdn_ctrl also defined in isdn_if. The exact meanings of its
+ fields are described together with the descriptions of the possible
+ events. Here is only a short description of the fields:
+
+ driver = driver Id.
+ command = event-type. (one of the constants ISDN_STAT_...)
+ arg = depends on event-type.
+ num = depends on event-type.
+
+ Returnvalue:
+ 0 on success, else -1
+
+ int (*command)(isdn_ctrl*);
+
+ This field has to be preset by the HL-driver. It points to a function,
+ to be called by LL to perform functions like dialing, B-channel
+ setup, etc. The exact meaning of the parameters is described with the
+ descriptions of the possible commands.
+
+ Parameter:
+ isdn_ctrl*
+ driver = driver-Id
+ command = command to perform. (one of the constants ISDN_CMD_...)
+ arg = depends on command.
+ num = depends on command.
+
+ Returnvalue:
+ >=0 on success, else error-code (-ENODEV etc.)
+
+ int (*writebuf_skb)(int, int, int, struct sk_buff *)
+
+ ***CHANGE0.7.4: New field.
+ ***CHANGEI.1.21: New field.
+
+ This field has to be preset by the HL-driver. The given function will
+ be called by the LL for delivering data to be send via B-Channel.
+
+
+ Parameter:
+ int driver-Id ***CHANGE0.7.4: New parameter.
+ int channel-number locally to the HL-driver. (starts with 0)
+ int ack ***ChangeI1.21: New parameter
+ If this is !0, the driver has to signal the delivery
+ by sending an ISDN_STAT_BSENT. If this is 0, the driver
+ MUST NOT send an ISDN_STAT_BSENT.
+ struct sk_buff * Pointer to sk_buff containing data to be send via
+ B-channel.
+
+ Returnvalue:
+ Length of data accepted on success, else error-code (-EINVAL on
+ oversized packets etc.)
+
+ int (*writecmd)(u_char*, int, int, int, int);
+
+ This field has to be preset by the HL-driver. The given function will be
+ called to perform write-requests on /dev/isdnctrl (i.e. sending commands
+ to the card) The data-format is hardware-specific. This function is
+ intended for debugging only. It is not necessary for normal operation
+ and never will be called by the tty-emulation- or network-code. If
+ this function is not supported, the driver has to set NULL here.
+
+ Parameter:
+ u_char* pointer to data.
+ int length of data.
+ int flag: 0 = call from within kernel-space. (HL-driver must use
+ memcpy, may NOT use schedule())
+ 1 = call from user-space. (HL-driver must use
+ memcpy_fromfs, use of schedule() allowed)
+ int driver-Id.
+ int channel-number locally to the HL-driver. (starts with 0)
+
+***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
+
+ Returnvalue:
+ Length of data accepted on success, else error-code (-EINVAL etc.)
+
+ int (*readstat)(u_char*, int, int, int, int);
+
+ This field has to be preset by the HL-driver. The given function will be
+ called to perform read-requests on /dev/isdnctrl (i.e. reading replies
+ from the card) The data-format is hardware-specific. This function is
+ intended for debugging only. It is not necessary for normal operation
+ and never will be called by the tty-emulation- or network-code. If
+ this function is not supported, the driver has to set NULL here.
+
+ Parameter:
+ u_char* pointer to data.
+ int length of data.
+ int flag: 0 = call from within kernel-space. (HL-driver must use
+ memcpy, may NOT use schedule())
+ 1 = call from user-space. (HL-driver must use
+ memcpy_fromfs, use of schedule() allowed)
+ int driver-Id.
+ int channel-number locally to the HL-driver. (starts with 0)
+
+***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
+
+ Returnvalue:
+ Length of data on success, else error-code (-EINVAL etc.)
+
+ char id[20];
+ ***CHANGE0.7: New since this version.
+
+ This string has to be preset by the HL-driver. Its purpose is for
+ identification of the driver by the user. Eg.: it is shown in the
+ status-info of /dev/isdninfo. Furthermore it is used as Id for binding
+ net-interfaces to a specific channel. If a string of length zero is
+ given, upon return, isdn4linux will replace it by a generic name. (line0,
+ line1 etc.) It is recommended to make this string configurable during
+ module-load-time. (copy a global variable to this string.) For doing that,
+ modules 1.2.8 or newer are necessary.
+
+2. Description of the commands, a HL-driver has to support:
+
+ All commands will be performed by calling the function command() described
+ above from within the LL. The field command of the struct-parameter will
+ contain the desired command, the field driver is always set to the
+ appropriate driver-Id.
+
+ Until now, the following commands are defined:
+
+***CHANGEI1.34: The parameter "num" has been replaced by a union "parm" containing
+ the old "num" and a new setup_type struct used for ISDN_CMD_DIAL
+ and ISDN_STAT_ICALL callback.
+
+ ISDN_CMD_IOCTL:
+
+ This command is intended for performing ioctl-calls for configuring
+ hardware or similar purposes (setting port-addresses, loading firmware
+ etc.) For this purpose, in the LL all ioctl-calls with an argument
+ >= IIOCDRVCTL (0x100) will be handed transparently to this
+ function after subtracting 0x100 and placing the result in arg.
+ Example:
+ If a userlevel-program calls ioctl(0x101,...) the function gets
+ called with the field command set to 1.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_IOCTL
+ arg = Original ioctl-cmd - IIOCDRVCTL
+ parm.num = first bytes filled with (unsigned long)arg
+
+ Returnvalue:
+ Depending on driver.
+
+
+ ISDN_CMD_DIAL:
+
+ This command is used to tell the HL-driver it should dial a given
+ number.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_DIAL
+ arg = channel-number locally to the driver. (starting with 0)
+
+ parm.setup.phone = An ASCII-String containing the number to dial.
+ parm.setup.eazmsn = An ASCII-Sting containing the own EAZ or MSN.
+ parm.setup.si1 = The Service-Indicator.
+ parm.setup.si2 = Additional Service-Indicator.
+
+ If the Line has been designed as SPV (a special german
+ feature, meaning semi-leased-line) the phone has to
+ start with an "S".
+ ***CHANGE0.6: In previous versions the EAZ has been given in the
+ highbyte of arg.
+ ***CHANGE0.7.1: New since this version: ServiceIndicator and AddInfo.
+
+ ISDN_CMD_ACCEPTD:
+
+ With this command, the HL-driver is told to accept a D-Channel-setup.
+ (Response to an incoming call)
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_ACCEPTD
+ arg = channel-number locally to the driver. (starting with 0)
+ parm = unused.
+
+ ISDN_CMD_ACCEPTB:
+
+ With this command, the HL-driver is told to perform a B-Channel-setup.
+ (after establishing D-Channel-Connection)
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_ACCEPTB
+ arg = channel-number locally to the driver. (starting with 0)
+ parm = unused.
+
+ ISDN_CMD_HANGUP:
+
+ With this command, the HL-driver is told to hangup (B-Channel if
+ established first, then D-Channel). This command is also used for
+ actively rejecting an incoming call.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_HANGUP
+ arg = channel-number locally to the driver. (starting with 0)
+ parm = unused.
+
+ ISDN_CMD_CLREAZ:
+
+ With this command, the HL-driver is told not to signal incoming
+ calls to the LL.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_CLREAZ
+ arg = channel-number locally to the driver. (starting with 0)
+ parm = unused.
+
+ ISDN_CMD_SETEAZ:
+
+ With this command, the HL-driver is told to signal incoming calls for
+ the given EAZs/MSNs to the LL.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_SETEAZ
+ arg = channel-number locally to the driver. (starting with 0)
+ parm.num = ASCII-String, containing the desired EAZ's/MSN's
+ (comma-separated). If an empty String is given, the
+ HL-driver should respond to ALL incoming calls,
+ regardless of the destination-address.
+ ***CHANGE0.6: New since this version the "empty-string"-feature.
+
+ ISDN_CMD_GETEAZ: (currently unused)
+
+ With this command, the HL-driver is told to report the current setting
+ given with ISDN_CMD_SETEAZ.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_GETEAZ
+ arg = channel-number locally to the driver. (starting with 0)
+ parm.num = ASCII-String, containing the current EAZ's/MSN's
+
+ ISDN_CMD_SETSIL: (currently unused)
+
+ With this command, the HL-driver is told to signal only incoming
+ calls with the given Service-Indicators.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_SETSIL
+ arg = channel-number locally to the driver. (starting with 0)
+ parm.num = ASCII-String, containing the desired Service-Indicators.
+
+ ISDN_CMD_GETSIL: (currently unused)
+
+ With this command, the HL-driver is told to return the current
+ Service-Indicators it will respond to.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_SETSIL
+ arg = channel-number locally to the driver. (starting with 0)
+ parm.num = ASCII-String, containing the current Service-Indicators.
+
+ ISDN_CMD_SETL2:
+
+ With this command, the HL-driver is told to select the given Layer-2-
+ protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
+ ISDN_CMD_ACCEPTD.
+
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_SETL2
+ arg = channel-number locally to the driver. (starting with 0)
+ logical or'ed with (protocol-Id << 8)
+ protocol-Id is one of the constants ISDN_PROTO_L2...
+ parm = unused.
+
+ ISDN_CMD_GETL2: (currently unused)
+
+ With this command, the HL-driver is told to return the current
+ setting of the Layer-2-protocol.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_GETL2
+ arg = channel-number locally to the driver. (starting with 0)
+ parm = unused.
+ Returnvalue:
+ current protocol-Id (one of the constants ISDN_L2_PROTO)
+
+ ISDN_CMD_SETL3:
+
+ With this command, the HL-driver is told to select the given Layer-3-
+ protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
+ ISDN_CMD_ACCEPTD.
+
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_SETL3
+ arg = channel-number locally to the driver. (starting with 0)
+ logical or'ed with (protocol-Id << 8)
+ protocol-Id is one of the constants ISDN_PROTO_L3...
+ parm.fax = Pointer to T30_s fax struct. (fax usage only)
+
+ ISDN_CMD_GETL2: (currently unused)
+
+ With this command, the HL-driver is told to return the current
+ setting of the Layer-3-protocol.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_GETL3
+ arg = channel-number locally to the driver. (starting with 0)
+ parm = unused.
+ Returnvalue:
+ current protocol-Id (one of the constants ISDN_L3_PROTO)
+
+ ISDN_CMD_PROCEED:
+
+ With this command, the HL-driver is told to proceed with a incoming call.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_PROCEED
+ arg = channel-number locally to the driver. (starting with 0)
+ setup.eazmsn= empty string or string send as uus1 in DSS1 with
+ PROCEED message
+
+ ISDN_CMD_ALERT:
+
+ With this command, the HL-driver is told to alert a proceeding call.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_ALERT
+ arg = channel-number locally to the driver. (starting with 0)
+ setup.eazmsn= empty string or string send as uus1 in DSS1 with
+ ALERT message
+
+ ISDN_CMD_REDIR:
+
+ With this command, the HL-driver is told to redirect a call in proceeding
+ or alerting state.
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_REDIR
+ arg = channel-number locally to the driver. (starting with 0)
+ setup.eazmsn= empty string or string send as uus1 in DSS1 protocol
+ setup.screen= screening indicator
+ setup.phone = redirected to party number
+
+ ISDN_CMD_PROT_IO:
+
+ With this call, the LL-driver invokes protocol specific features through
+ the LL.
+ The call is not implicitely bound to a connection.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_CMD_PROT_IO
+ arg = The lower 8 Bits define the addressed protocol as defined
+ in ISDN_PTYPE..., the upper bits are used to differentiate
+ the protocol specific CMD.
+
+ para = protocol and function specific. See isdnif.h for detail.
+
+
+ ISDN_CMD_FAXCMD:
+
+ With this command the HL-driver receives a fax sub-command.
+ For details refer to INTERFACE.fax
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_CMD_FAXCMD
+ arg = channel-number locally to the driver. (starting with 0)
+ parm = unused.
+
+
+3. Description of the events to be signaled by the HL-driver to the LL.
+
+ All status-changes are signaled via calling the previously described
+ function statcallb(). The field command of the struct isdn_cmd has
+ to be set by the HL-driver with the appropriate Status-Id (event-number).
+ The field arg has to be set to the channel-number (locally to the driver,
+ starting with 0) to which this event applies. (Exception: STAVAIL-event)
+
+ Until now, the following Status-Ids are defined:
+
+ ISDN_STAT_AVAIL:
+
+ With this call, the HL-driver signals the availability of new data
+ for readstat(). Used only for debugging-purposes, see description
+ of readstat().
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_STAVAIL
+ arg = length of available data.
+ parm = unused.
+
+ ISDN_STAT_ICALL:
+ ISDN_STAT_ICALLW:
+
+ With this call, the HL-driver signals an incoming call to the LL.
+ If ICALLW is signalled the incoming call is a waiting call without
+ a available B-chan.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_ICALL
+ arg = channel-number, locally to the driver. (starting with 0)
+ para.setup.phone = Callernumber.
+ para.setup.eazmsn = CalledNumber.
+ para.setup.si1 = Service Indicator.
+ para.setup.si2 = Additional Service Indicator.
+ para.setup.plan = octet 3 from Calling party number Information Element.
+ para.setup.screen = octet 3a from Calling party number Information Element.
+
+ Return:
+ 0 = No device matching this call.
+ 1 = At least one device matching this call (RING on ttyI).
+ HL-driver may send ALERTING on the D-channel in this case.
+ 2 = Call will be rejected.
+ 3 = Incoming called party number is currently incomplete.
+ Additional digits are required.
+ Used for signalling with PtP connections.
+ 4 = Call will be held in a proceeding state
+ (HL driver sends PROCEEDING)
+ Used when a user space prog needs time to interpret a call
+ para.setup.eazmsn may be filled with an uus1 message of
+ 30 octets maximum. Empty string if no uus.
+ 5 = Call will be actively deflected to another party
+ Only available in DSS1/EURO protocol
+ para.setup.phone must be set to destination party number
+ para.setup.eazmsn may be filled with an uus1 message of
+ 30 octets maximum. Empty string if no uus.
+ -1 = An error happened. (Invalid parameters for example.)
+ The keypad support now is included in the dial command.
+
+
+ ISDN_STAT_RUN:
+
+ With this call, the HL-driver signals availability of the ISDN-card.
+ (after initializing, loading firmware)
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_RUN
+ arg = unused.
+ parm = unused.
+
+ ISDN_STAT_STOP:
+
+ With this call, the HL-driver signals unavailability of the ISDN-card.
+ (before unloading, while resetting/reconfiguring the card)
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_STOP
+ arg = unused.
+ parm = unused.
+
+ ISDN_STAT_DCONN:
+
+ With this call, the HL-driver signals the successful establishment of
+ a D-Channel-connection. (Response to ISDN_CMD_ACCEPTD or ISDN_CMD_DIAL)
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_DCONN
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm = unused.
+
+ ISDN_STAT_BCONN:
+
+ With this call, the HL-driver signals the successful establishment of
+ a B-Channel-connection. (Response to ISDN_CMD_ACCEPTB or because the
+ remote-station has initiated establishment)
+
+ The HL driver should call this when the logical l2/l3 protocol
+ connection on top of the physical B-channel is established.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_BCONN
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm.num = ASCII-String, containing type of connection (for analog
+ modem only). This will be appended to the CONNECT message
+ e.g. 14400/V.32bis
+
+ ISDN_STAT_DHUP:
+
+ With this call, the HL-driver signals the shutdown of a
+ D-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
+ or caused by a remote-hangup or if the remote-station has actively
+ rejected a call.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_DHUP
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm = unused.
+
+ ISDN_STAT_BHUP:
+
+ With this call, the HL-driver signals the shutdown of a
+ B-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
+ or caused by a remote-hangup.
+
+ The HL driver should call this as soon as the logical l2/l3 protocol
+ connection on top of the physical B-channel is released.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_BHUP
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm = unused.
+
+ ISDN_STAT_CINF:
+
+ With this call, the HL-driver delivers charge-unit information to the
+ LL.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_CINF
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm.num = ASCII string containing charge-units (digits only).
+
+ ISDN_STAT_LOAD: (currently unused)
+
+ ISDN_STAT_UNLOAD:
+
+ With this call, the HL-driver signals that it will be unloaded now. This
+ tells the LL to release all corresponding data-structures.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_UNLOAD
+ arg = unused.
+ parm = unused.
+
+ ISDN_STAT_BSENT:
+
+ With this call the HL-driver signals the delivery of a data-packet.
+ This callback is used by the network-interfaces only, tty-Emulation
+ does not need this call.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_BSENT
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm.length = ***CHANGEI.1.21: New field.
+ the driver has to set this to the original length
+ of the skb at the time of receiving it from the linklevel.
+
+ ISDN_STAT_NODCH:
+
+ With this call, the driver has to respond to a prior ISDN_CMD_DIAL, if
+ no D-Channel is available.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_NODCH
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm = unused.
+
+ ISDN_STAT_ADDCH:
+
+ This call is for HL-drivers, which are unable to check card-type
+ or numbers of supported channels before they have loaded any firmware
+ using ioctl. Those HL-driver simply set the channel-parameter to a
+ minimum channel-number when registering, and later if they know
+ the real amount, perform this call, allocating additional channels.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_ADDCH
+ arg = number of channels to be added.
+ parm = unused.
+
+ ISDN_STAT_CAUSE:
+
+ With this call, the HL-driver delivers CAUSE-messages to the LL.
+ Currently the LL does not use this messages. Their contents is simply
+ logged via kernel-messages. Therefore, currently the format of the
+ messages is completely free. However they should be printable.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_NODCH
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm.num = ASCII string containing CAUSE-message.
+
+ ISDN_STAT_DISPLAY:
+
+ With this call, the HL-driver delivers DISPLAY-messages to the LL.
+ Currently the LL does not use this messages.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_DISPLAY
+ arg = channel-number, locally to the driver. (starting with 0)
+ para.display= string containing DISPLAY-message.
+
+ ISDN_STAT_PROT:
+
+ With this call, the HL-driver delivers protocol specific infos to the LL.
+ The call is not implicitely bound to a connection.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_PROT
+ arg = The lower 8 Bits define the addressed protocol as defined
+ in ISDN_PTYPE..., the upper bits are used to differentiate
+ the protocol specific STAT.
+
+ para = protocol and function specific. See isdnif.h for detail.
+
+ ISDN_STAT_DISCH:
+
+ With this call, the HL-driver signals the LL to disable or enable the
+ use of supplied channel and driver.
+ The call may be used to reduce the available number of B-channels after
+ loading the driver. The LL has to ignore a disabled channel when searching
+ for free channels. The HL driver itself never delivers STAT callbacks for
+ disabled channels.
+ The LL returns a nonzero code if the operation was not successful or the
+ selected channel is actually regarded as busy.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_DISCH
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm.num[0] = 0 if channel shall be disabled, else enabled.
+
+ ISDN_STAT_L1ERR:
+
+ ***CHANGEI1.21 new status message.
+ A signal can be sent to the linklevel if an Layer1-error results in
+ packet-loss on receive or send. The field errcode of the cmd.parm
+ union describes the error more precisely.
+
+ Parameter:
+ driver = driver-Id
+ command = ISDN_STAT_L1ERR
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm.errcode= ISDN_STAT_L1ERR_SEND: Packet lost while sending.
+ ISDN_STAT_L1ERR_RECV: Packet lost while receiving.
+ ISDN_STAT_FAXIND:
+
+ With this call the HL-driver signals a fax sub-command to the LL.
+ For details refer to INTERFACE.fax
+
+ Parameter:
+ driver = driver-Id.
+ command = ISDN_STAT_FAXIND
+ arg = channel-number, locally to the driver. (starting with 0)
+ parm = unused.
+
diff --git a/Documentation/isdn/INTERFACE.fax b/Documentation/isdn/INTERFACE.fax
new file mode 100644
index 0000000..9c8c6d9
--- /dev/null
+++ b/Documentation/isdn/INTERFACE.fax
@@ -0,0 +1,163 @@
+$Id: INTERFACE.fax,v 1.2 2000/08/06 09:22:50 armin Exp $
+
+
+Description of the fax-subinterface between linklevel and hardwarelevel of
+ isdn4linux.
+
+ The communication between linklevel (LL) and hardwarelevel (HL) for fax
+ is based on the struct T30_s (defined in isdnif.h).
+ This struct is allocated in the LL.
+ In order to use fax, the LL provides the pointer to this struct with the
+ command ISDN_CMD_SETL3 (parm.fax). This pointer expires in case of hangup
+ and when a new channel to a new connection is assigned.
+
+
+Data handling:
+ In send-mode the HL-driver has to handle the <DLE> codes and the bit-order
+ conversion by itself.
+ In receive-mode the LL-driver takes care of the bit-order conversion
+ (specified by +FBOR)
+
+Structure T30_s description:
+
+ This structure stores the values (set by AT-commands), the remote-
+ capability-values and the command-codes between LL and HL.
+
+ If the HL-driver receives ISDN_CMD_FAXCMD, all needed information
+ is in this struct set by the LL.
+ To signal information to the LL, the HL-driver has to set the
+ parameters and use ISDN_STAT_FAXIND.
+ (Please refer to INTERFACE)
+
+Structure T30_s:
+
+ All members are 8-bit unsigned (__u8)
+
+ - resolution
+ - rate
+ - width
+ - length
+ - compression
+ - ecm
+ - binary
+ - scantime
+ - id[]
+ Local faxmachine's parameters, set by +FDIS, +FDCS, +FLID, ...
+
+ - r_resolution
+ - r_rate
+ - r_width
+ - r_length
+ - r_compression
+ - r_ecm
+ - r_binary
+ - r_scantime
+ - r_id[]
+ Remote faxmachine's parameters. To be set by HL-driver.
+
+ - phase
+ Defines the actual state of fax connection. Set by HL or LL
+ depending on progress and type of connection.
+ If the phase changes because of an AT command, the LL driver
+ changes this value. Otherwise the HL-driver takes care of it, but
+ only necessary on call establishment (from IDLE to PHASE_A).
+ (one of the constants ISDN_FAX_PHASE_[IDLE,A,B,C,D,E])
+
+ - direction
+ Defines outgoing/send or incoming/receive connection.
+ (ISDN_TTY_FAX_CONN_[IN,OUT])
+
+ - code
+ Commands from LL to HL; possible constants :
+ ISDN_TTY_FAX_DR signals +FDR command to HL
+
+ ISDN_TTY_FAX_DT signals +FDT command to HL
+
+ ISDN_TTY_FAX_ET signals +FET command to HL
+
+
+ Other than that the "code" is set with the hangup-code value at
+ the end of connection for the +FHNG message.
+
+ - r_code
+ Commands from HL to LL; possible constants :
+ ISDN_TTY_FAX_CFR output of +FCFR message.
+
+ ISDN_TTY_FAX_RID output of remote ID set in r_id[]
+ (+FCSI/+FTSI on send/receive)
+
+ ISDN_TTY_FAX_DCS output of +FDCS and CONNECT message,
+ switching to phase C.
+
+ ISDN_TTY_FAX_ET signals end of data,
+ switching to phase D.
+
+ ISDN_TTY_FAX_FCON signals the established, outgoing connection,
+ switching to phase B.
+
+ ISDN_TTY_FAX_FCON_I signals the established, incoming connection,
+ switching to phase B.
+
+ ISDN_TTY_FAX_DIS output of +FDIS message and values.
+
+ ISDN_TTY_FAX_SENT signals that all data has been sent
+ and <DLE><ETX> is acknowledged,
+ OK message will be sent.
+
+ ISDN_TTY_FAX_PTS signals a msg-confirmation (page sent successful),
+ depending on fet value:
+ 0: output OK message (more pages follow)
+ 1: switching to phase B (next document)
+
+ ISDN_TTY_FAX_TRAIN_OK output of +FDCS and OK message (for receive mode).
+
+ ISDN_TTY_FAX_EOP signals end of data in receive mode,
+ switching to phase D.
+
+ ISDN_TTY_FAX_HNG output of the +FHNG and value set by code and
+ OK message, switching to phase E.
+
+
+ - badlin
+ Value of +FBADLIN
+
+ - badmul
+ Value of +FBADMUL
+
+ - bor
+ Value of +FBOR
+
+ - fet
+ Value of +FET command in send-mode.
+ Set by HL in receive-mode for +FET message.
+
+ - pollid[]
+ ID-string, set by +FCIG
+
+ - cq
+ Value of +FCQ
+
+ - cr
+ Value of +FCR
+
+ - ctcrty
+ Value of +FCTCRTY
+
+ - minsp
+ Value of +FMINSP
+
+ - phcto
+ Value of +FPHCTO
+
+ - rel
+ Value of +FREL
+
+ - nbc
+ Value of +FNBC (0,1)
+ (+FNBC is not a known class 2 fax command, I added this to change the
+ automatic "best capabilities" connection in the eicon HL-driver)
+
+
+Armin
+mac@melware.de
+
diff --git a/Documentation/isdn/README b/Documentation/isdn/README
new file mode 100644
index 0000000..6783437
--- /dev/null
+++ b/Documentation/isdn/README
@@ -0,0 +1,599 @@
+README for the ISDN-subsystem
+
+1. Preface
+
+ 1.1 Introduction
+
+ This README describes how to set up and how to use the different parts
+ of the ISDN-subsystem.
+
+ For using the ISDN-subsystem, some additional userlevel programs are
+ necessary. Those programs and some contributed utilities are available
+ at
+
+ ftp.isdn4linux.de
+
+ /pub/isdn4linux/isdn4k-utils-<VersionNumber>.tar.gz
+
+
+ We also have set up a mailing-list:
+
+ The isdn4linux-project originates in Germany, and therefore by historical
+ reasons, the mailing-list's primary language is german. However mails
+ written in english have been welcome all the time.
+
+ to subscribe: write a email to majordomo@listserv.isdn4linux.de,
+ Subject irrelevant, in the message body:
+ subscribe isdn4linux <your_email_address>
+
+ To write to the mailing-list, write to isdn4linux@listserv.isdn4linux.de
+
+ This mailinglist is bidirectionally gated to the newsgroup
+
+ de.alt.comm.isdn4linux
+
+ There is also a well maintained FAQ in English available at
+ http://www.mhessler.de/i4lfaq/
+ It can be viewed online, or downloaded in sgml/text/html format.
+ The FAQ can also be viewed online at
+ http://www.isdn4inux.de/faq/
+ or downloaded from
+ ftp://ftp.isdn4linux.de/pub/isdn4linux/FAQ/
+
+ 1.1 Technical details
+
+ In the following Text, the terms MSN and EAZ are used.
+
+ MSN is the abbreviation for (M)ultiple(S)ubscriber(N)umber, and applies
+ to Euro(EDSS1)-type lines. Usually it is simply the phone number.
+
+ EAZ is the abbreviation of (E)ndgeraete(A)uswahl(Z)iffer and
+ applies to German 1TR6-type lines. This is a one-digit string,
+ simply appended to the base phone number
+
+ The internal handling is nearly identical, so replace the appropriate
+ term to that one, which applies to your local ISDN-environment.
+
+ When the link-level-module isdn.o is loaded, it supports up to 16
+ low-level-modules with up to 64 channels. (The number 64 is arbitrarily
+ chosen and can be configured at compile-time --ISDN_MAX in isdn.h).
+ A low-level-driver can register itself through an interface (which is
+ defined in isdnif.h) and gets assigned a slot.
+ The following char-devices are made available for each channel:
+
+ A raw-control-device with the following functions:
+ write: raw D-channel-messages (format: depends on driver).
+ read: raw D-channel-messages (format: depends on driver).
+ ioctl: depends on driver, i.e. for the ICN-driver, the base-address of
+ the ports and the shared memory on the card can be set and read
+ also the boot-code and the protocol software can be loaded into
+ the card.
+
+ O N L Y !!! for debugging (no locking against other devices):
+ One raw-data-device with the following functions:
+ write: data to B-channel.
+ read: data from B-channel.
+
+ In addition the following devices are made available:
+
+ 128 tty-devices (64 cuix and 64 ttyIx) with integrated modem-emulator:
+ The functionality is almost the same as that of a serial device
+ (the line-discs are handled by the kernel), which lets you run
+ SLIP, CSLIP and asynchronous PPP through the devices. We have tested
+ Seyon, minicom, CSLIP (uri-dip) PPP, mgetty, XCept and Hylafax.
+
+ The modem-emulation supports the following:
+ 1.3.1 Commands:
+
+ ATA Answer incoming call.
+ ATD<No.> Dial, the number may contain:
+ [0-9] and [,#.*WPT-S]
+ the latter are ignored until 'S'.
+ The 'S' must precede the number, if
+ the line is a SPV (German 1TR6).
+ ATE0 Echo off.
+ ATE1 Echo on (default).
+ ATH Hang-up.
+ ATH1 Off hook (ignored).
+ ATH0 Hang-up.
+ ATI Return "ISDN for Linux...".
+ ATI0 "
+ ATI1 "
+ ATI2 Report of last connection.
+ ATO On line (data mode).
+ ATQ0 Enable result codes (default).
+ ATQ1 Disable result codes (default).
+ ATSx=y Set register x to y.
+ ATSx? Show contents of register x.
+ ATV0 Numeric responses.
+ ATV1 English responses (default).
+ ATZ Load registers and EAZ/MSN from Profile.
+ AT&Bx Set Send-Packet-size to x (max. 4000)
+ The real packet-size may be limited by the
+ low-level-driver used. e.g. the HiSax-Module-
+ limit is 2000. You will get NO Error-Message,
+ if you set it to higher values, because at the
+ time of giving this command the corresponding
+ driver may not be selected (see "Automatic
+ Assignment") however the size of outgoing packets
+ will be limited correctly.
+ AT&D0 Ignore DTR
+ AT&D2 DTR-low-edge: Hang up and return to
+ command mode (default).
+ AT&D3 Same as AT&D2 but also resets all registers.
+ AT&Ex Set the EAZ/MSN for this channel to x.
+ AT&F Reset all registers and profile to "factory-defaults"
+ AT&Lx Set list of phone numbers to listen on. x is a
+ list of wildcard patterns separated by semicolon.
+ If this is set, it has precedence over the MSN set
+ by AT&E.
+ AT&Rx Select V.110 bitrate adaption.
+ This command enables V.110 protocol with 9600 baud
+ (x=9600), 19200 baud (x=19200) or 38400 baud
+ (x=38400). A value of x=0 disables V.110 switching
+ back to default X.75. This command sets the following
+ Registers:
+ Reg 14 (Layer-2 protocol):
+ x = 0: 0
+ x = 9600: 7
+ x = 19200: 8
+ x = 38400: 9
+ Reg 18.2 = 1
+ Reg 19 (Additional Service Indicator):
+ x = 0: 0
+ x = 9600: 197
+ x = 19200: 199
+ x = 38400: 198
+ Note on value in Reg 19:
+ There is _NO_ common convention for 38400 baud.
+ The value 198 is chosen arbitrarily. Users
+ _MUST_ negotiate this value before establishing
+ a connection.
+ AT&Sx Set window-size (x = 1..8) (not yet implemented)
+ AT&V Show all settings.
+ AT&W0 Write registers and EAZ/MSN to profile. See also
+ iprofd (5.c in this README).
+ AT&X0 BTX-mode and T.70-mode off (default)
+ AT&X1 BTX-mode on. (S13.1=1, S13.5=0 S14=0, S16=7, S18=7, S19=0)
+ AT&X2 T.70-mode on. (S13.1=1, S13.5=1, S14=0, S16=7, S18=7, S19=0)
+ AT+Rx Resume a suspended call with CallID x (x = 1,2,3...)
+ AT+Sx Suspend a call with CallID x (x = 1,2,3...)
+
+ For voice-mode commands refer to README.audio
+
+ 1.3.2 Escape sequence:
+ During a connection, the emulation reacts just like
+ a normal modem to the escape sequence <DELAY>+++<DELAY>.
+ (The escape character - default '+' - can be set in the
+ register 2).
+ The DELAY must at least be 1.5 seconds long and delay
+ between the escape characters must not exceed 0.5 seconds.
+
+ 1.3.3 Registers:
+
+ Nr. Default Description
+ 0 0 Answer on ring number.
+ (no auto-answer if S0=0).
+ 1 0 Count of rings.
+ 2 43 Escape character.
+ (a value >= 128 disables the escape sequence).
+ 3 13 Carriage return character (ASCII).
+ 4 10 Line feed character (ASCII).
+ 5 8 Backspace character (ASCII).
+ 6 3 Delay in seconds before dialing.
+ 7 60 Wait for carrier.
+ 8 2 Pause time for comma (ignored)
+ 9 6 Carrier detect time (ignored)
+ 10 7 Carrier loss to disconnect time (ignored).
+ 11 70 Touch tone timing (ignored).
+ 12 69 Bit coded register:
+ Bit 0: 0 = Suppress response messages.
+ 1 = Show response messages.
+ Bit 1: 0 = English response messages.
+ 1 = Numeric response messages.
+ Bit 2: 0 = Echo off.
+ 1 = Echo on.
+ Bit 3 0 = DCD always on.
+ 1 = DCD follows carrier.
+ Bit 4 0 = CTS follows RTS
+ 1 = Ignore RTS, CTS always on.
+ Bit 5 0 = return to command mode on DTR low.
+ 1 = Same as 0 but also resets all
+ registers.
+ See also register 13, bit 2
+ Bit 6 0 = DSR always on.
+ 1 = DSR only on if channel is available.
+ Bit 7 0 = Cisco-PPP-flag-hack off (default).
+ 1 = Cisco-PPP-flag-hack on.
+ 13 0 Bit coded register:
+ Bit 0: 0 = Use delayed tty-send-algorithm
+ 1 = Direct tty-send.
+ Bit 1: 0 = T.70 protocol (Only for BTX!) off
+ 1 = T.70 protocol (Only for BTX!) on
+ Bit 2: 0 = Don't hangup on DTR low.
+ 1 = Hangup on DTR low.
+ Bit 3: 0 = Standard response messages
+ 1 = Extended response messages
+ Bit 4: 0 = CALLER NUMBER before every RING.
+ 1 = CALLER NUMBER after first RING.
+ Bit 5: 0 = T.70 extended protocol off
+ 1 = T.70 extended protocol on
+ Bit 6: 0 = Special RUNG Message off
+ 1 = Special RUNG Message on
+ "RUNG" is delivered on a ttyI, if
+ an incoming call happened (RING) and
+ the remote party hung up before any
+ local ATA was given.
+ Bit 7: 0 = Don't show display messages from net
+ 1 = Show display messages from net
+ (S12 Bit 1 must be 0 too)
+ 14 0 Layer-2 protocol:
+ 0 = X75/LAPB with I-frames
+ 1 = X75/LAPB with UI-frames
+ 2 = X75/LAPB with BUI-frames
+ 3 = HDLC
+ 4 = Transparent (audio)
+ 7 = V.110, 9600 baud
+ 8 = V.110, 19200 baud
+ 9 = V.110, 38400 baud
+ 10 = Analog Modem (only if hardware supports this)
+ 11 = Fax G3 (only if hardware supports this)
+ 15 0 Layer-3 protocol:
+ 0 = transparent
+ 1 = transparent with audio features (e.g. DSP)
+ 2 = Fax G3 Class 2 commands (S14 has to be set to 11)
+ 3 = Fax G3 Class 1 commands (S14 has to be set to 11)
+ 16 250 Send-Packet-size/16
+ 17 8 Window-size (not yet implemented)
+ 18 4 Bit coded register, Service-Octet-1 to accept,
+ or to be used on dialout:
+ Bit 0: Service 1 (audio) when set.
+ Bit 1: Service 5 (BTX) when set.
+ Bit 2: Service 7 (data) when set.
+ Note: It is possible to set more than one
+ bit. In this case, on incoming calls
+ the selected services are accepted,
+ and if the service is "audio", the
+ Layer-2-protocol is automatically
+ changed to 4 regardless of the setting
+ of register 14. On outgoing calls,
+ the most significant 1-bit is chosen to
+ select the outgoing service octet.
+ 19 0 Service-Octet-2
+ 20 0 Bit coded register (readonly)
+ Service-Octet-1 of last call.
+ Bit mapping is the same as register 18
+ 21 0 Bit coded register (readonly)
+ Set on incoming call (during RING) to
+ octet 3 of calling party number IE (Numbering plan)
+ See section 4.5.10 of ITU Q.931
+ 22 0 Bit coded register (readonly)
+ Set on incoming call (during RING) to
+ octet 3a of calling party number IE (Screening info)
+ See section 4.5.10 of ITU Q.931
+ 23 0 Bit coded register:
+ Bit 0: 0 = Add CPN to RING message off
+ 1 = Add CPN to RING message on
+ Bit 1: 0 = Add CPN to FCON message off
+ 1 = Add CPN to FCON message on
+ Bit 2: 0 = Add CDN to RING/FCON message off
+ 1 = Add CDN to RING/FCON message on
+
+ Last but not least a (at the moment fairly primitive) device to request
+ the line-status (/dev/isdninfo) is made available.
+
+ Automatic assignment of devices to lines:
+
+ All inactive physical lines are listening to all EAZs for incoming
+ calls and are NOT assigned to a specific tty or network interface.
+ When an incoming call is detected, the driver looks first for a network
+ interface and then for an opened tty which:
+
+ 1. is configured for the same EAZ.
+ 2. has the same protocol settings for the B-channel.
+ 3. (only for network interfaces if the security flag is set)
+ contains the caller number in its access list.
+ 4. Either the channel is not bound exclusively to another Net-interface, or
+ it is bound AND the other checks apply to exactly this interface.
+ (For usage of the bind-features, refer to the isdnctrl-man-page)
+
+ Only when a matching interface or tty is found is the call accepted
+ and the "connection" between the low-level-layer and the link-level-layer
+ is established and kept until the end of the connection.
+ In all other cases no connection is established. Isdn4linux can be
+ configured to either do NOTHING in this case (which is useful, if
+ other, external devices with the same EAZ/MSN are connected to the bus)
+ or to reject the call actively. (isdnctrl busreject ...)
+
+ For an outgoing call, the inactive physical lines are searched.
+ The call is placed on the first physical line, which supports the
+ requested protocols for the B-channel. If a net-interface, however
+ is pre-bound to a channel, this channel is used directly.
+
+ This makes it possible to configure several network interfaces and ttys
+ for one EAZ, if the network interfaces are set to secure operation.
+ If an incoming call matches one network interface, it gets connected to it.
+ If another incoming call for the same EAZ arrives, which does not match
+ a network interface, the first tty gets a "RING" and so on.
+
+2 System prerequisites:
+
+ ATTENTION!
+
+ Always use the latest module utilities. The current version is
+ named in Documentation/Changes. Some old versions of insmod
+ are not capable of setting the driver-Ids correctly.
+
+3. Lowlevel-driver configuration.
+
+ Configuration depends on how the drivers are built. See the
+ README.<yourDriver> for information on driver-specific setup.
+
+4. Device-inodes
+
+ The major and minor numbers and their names are described in
+ Documentation/devices.txt. The major numbers are:
+
+ 43 for the ISDN-tty's.
+ 44 for the ISDN-callout-tty's.
+ 45 for control/info/debug devices.
+
+5. Application
+
+ a) For some card-types, firmware has to be loaded into the cards, before
+ proceeding with device-independent setup. See README.<yourDriver>
+ for how to do that.
+
+ b) If you only intend to use ttys, you are nearly ready now.
+
+ c) If you want to have really permanent "Modem"-settings on disk, you
+ can start the daemon iprofd. Give it a path to a file at the command-
+ line. It will store the profile-settings in this file every time
+ an AT&W0 is performed on any ISDN-tty. If the file already exists,
+ all profiles are initialized from this file. If you want to unload
+ any of the modules, kill iprofd first.
+
+ d) For networking, continue: Create an interface:
+ isdnctrl addif isdn0
+
+ e) Set the EAZ (or MSN for Euro-ISDN):
+ isdnctrl eaz isdn0 2
+
+ (For 1TR6 a single digit is allowed, for Euro-ISDN the number is your
+ real MSN e.g.: Phone-Number)
+
+ f) Set the number for outgoing calls on the interface:
+ isdnctrl addphone isdn0 out 1234567
+ ... (this can be executed more than once, all assigned numbers are
+ tried in order)
+ and the number(s) for incoming calls:
+ isdnctrl addphone isdn0 in 1234567
+
+ g) Set the timeout for hang-up:
+ isdnctrl huptimeout isdn0 <timeout_in_seconds>
+
+ h) additionally you may activate charge-hang-up (= Hang up before
+ next charge-info, this only works, if your isdn-provider transmits
+ the charge-info during and after the connection):
+ isdnctrl chargehup isdn0 on
+
+ i) Set the dial mode of the interface:
+ isdnctrl dialmode isdn0 auto
+ "off" means that you (or the system) cannot make any connection
+ (neither incoming or outgoing connections are possible). Use
+ this if you want to be sure that no connections will be made.
+ "auto" means that the interface is in auto-dial mode, and will
+ attempt to make a connection whenever a network data packet needs
+ the interface's link. Note that this can cause unexpected dialouts,
+ and lead to a high phone bill! Some daemons or other pc's that use
+ this interface can cause this.
+ Incoming connections are also possible.
+ "manual" is a dial mode created to prevent the unexpected dialouts.
+ In this mode, the interface will never make any connections on its
+ own. You must explicitly initiate a connection with "isdnctrl dial
+ isdn0". However, after an idle time of no traffic as configured for
+ the huptimeout value with isdnctrl, the connection _will_ be ended.
+ If you don't want any automatic hangup, set the huptimeout value to 0.
+ "manual" is the default.
+
+ j) Setup the interface with ifconfig as usual, and set a route to it.
+
+ k) (optional) If you run X11 and have Tcl/Tk-wish version 4.0, you can use
+ the script tools/tcltk/isdnmon. You can add actions for line-status
+ changes. See the comments at the beginning of the script for how to
+ do that. There are other tty-based tools in the tools-subdirectory
+ contributed by Michael Knigge (imon), Volker Götz (imontty) and
+ Andreas Kool (isdnmon).
+
+ l) For initial testing, you can set the verbose-level to 2 (default: 0).
+ Then all incoming calls are logged, even if they are not addressed
+ to one of the configured net-interfaces:
+ isdnctrl verbose 2
+
+ Now you are ready! A ping to the set address should now result in an
+ automatic dial-out (look at syslog kernel-messages).
+ The phone numbers and EAZs can be assigned at any time with isdnctrl.
+ You can add as many interfaces as you like with addif following the
+ directions above. Of course, there may be some limitations. But we have
+ tested as many as 20 interfaces without any problem. However, if you
+ don't give an interface name to addif, the kernel will assign a name
+ which starts with "eth". The number of "eth"-interfaces is limited by
+ the kernel.
+
+5. Additional options for isdnctrl:
+
+ "isdnctrl secure <InterfaceName> on"
+ Only incoming calls, for which the caller-id is listed in the access
+ list of the interface are accepted. You can add caller-id's With the
+ command "isdnctrl addphone <InterfaceName> in <caller-id>"
+ Euro-ISDN does not transmit the leading '0' of the caller-id for an
+ incoming call, therefore you should configure it accordingly.
+ If the real number for the dialout e.g. is "09311234567" the number
+ to configure here is "9311234567". The pattern-match function
+ works similar to the shell mechanism.
+
+ ? one arbitrary digit
+ * zero or arbitrary many digits
+ [123] one of the digits in the list
+ [1-5] one digit between '1' and '5'
+ a '^' as the first character in a list inverts the list
+
+
+ "isdnctrl secure <InterfaceName> off"
+ Switch off secure operation (default).
+
+ "isdnctrl ihup <InterfaceName> [on|off]"
+ Switch the hang-up-timer for incoming calls on or off.
+
+ "isdnctrl eaz <InterfaceName>"
+ Returns the EAZ of an interface.
+
+ "isdnctrl delphone <InterfaceName> in|out <number>"
+ Deletes a number from one of the access-lists of the interface.
+
+ "isdnctrl delif <InterfaceName>"
+ Removes the interface (and possible slaves) from the kernel.
+ (You have to unregister it with "ifconfig <InterfaceName> down" before).
+
+ "isdnctrl callback <InterfaceName> [on|off]"
+ Switches an interface to callback-mode. In this mode, an incoming call
+ will be rejected and after this the remote-station will be called. If
+ you test this feature by using ping, some routers will re-dial very
+ quickly, so that the callback from isdn4linux may not be recognized.
+ In this case use ping with the option -i <sec> to increase the interval
+ between echo-packets.
+
+ "isdnctrl cbdelay <InterfaceName> [seconds]"
+ Sets the delay (default 5 sec) between an incoming call and start of
+ dialing when callback is enabled.
+
+ "isdnctrl cbhup <InterfaceName> [on|off]"
+ This enables (default) or disables an active hangup (reject) when getting an
+ incoming call for an interface which is configured for callback.
+
+ "isdnctrl encap <InterfaceName> <EncapType>"
+ Selects the type of packet-encapsulation. The encapsulation can be changed
+ only while an interface is down.
+
+ At the moment the following values are supported:
+
+ rawip (Default) Selects raw-IP-encapsulation. This means, MAC-headers
+ are stripped off.
+ ip IP with type-field. Same as IP but the type-field of the MAC-header
+ is preserved.
+ x25iface X.25 interface encapsulation (first byte semantics as defined in
+ ../networking/x25-iface.txt). Use this for running the linux
+ X.25 network protocol stack (AF_X25 sockets) on top of isdn.
+ cisco-h A special-mode for communicating with a Cisco, which is configured
+ to do "hdlc"
+ ethernet No stripping. Packets are sent with full MAC-header.
+ The Ethernet-address of the interface is faked, from its
+ IP-address: fc:fc:i1:i2:i3:i4, where i1-4 are the IP-addr.-values.
+ syncppp Synchronous PPP
+
+ uihdlc HDLC with UI-frame-header (for use with DOS ISPA, option -h1)
+
+
+ NOTE: x25iface encapsulation is currently experimental. Please
+ read README.x25 for further details
+
+
+ Watching packets, using standard-tcpdump will fail for all encapsulations
+ except ethernet because tcpdump does not know how to handle packets
+ without MAC-header. A patch for tcpdump is included in the utility-package
+ mentioned above.
+
+ "isdnctrl l2_prot <InterfaceName> <L2-ProtocolName>"
+ Selects a layer-2-protocol.
+ (With the ICN-driver and the HiSax-driver, "x75i" and "hdlc" is available.
+ With other drivers, "x75ui", "x75bui", "x25dte", "x25dce" may be
+ possible too. See README.x25 for x25 related l2 protocols.)
+
+ isdnctrl l3_prot <InterfaceName> <L3-ProtocolName>
+ The same for layer-3. (At the moment only "trans" is allowed)
+
+ "isdnctrl list <InterfaceName>"
+ Shows all parameters of an interface and the charge-info.
+ Try "all" as the interface name.
+
+ "isdnctrl hangup <InterfaceName>"
+ Forces hangup of an interface.
+
+ "isdnctrl bind <InterfaceName> <DriverId>,<ChannelNumber> [exclusive]"
+ If you are using more than one ISDN card, it is sometimes necessary to
+ dial out using a specific card or even preserve a specific channel for
+ dialout of a specific net-interface. This can be done with the above
+ command. Replace <DriverId> by whatever you assigned while loading the
+ module. The <ChannelNumber> is counted from zero. The upper limit
+ depends on the card used. At the moment no card supports more than
+ 2 channels, so the upper limit is one.
+
+ "isdnctrl unbind <InterfaceName>"
+ unbinds a previously bound interface.
+
+ "isdnctrl busreject <DriverId> on|off"
+ If switched on, isdn4linux replies a REJECT to incoming calls, it
+ cannot match to any configured interface.
+ If switched off, nothing happens in this case.
+ You normally should NOT enable this feature, if the ISDN adapter is not
+ the only device connected to the S0-bus. Otherwise it could happen that
+ isdn4linux rejects an incoming call, which belongs to another device on
+ the bus.
+
+ "isdnctrl addslave <InterfaceName> <SlaveName>
+ Creates a slave interface for channel-bundling. Slave interfaces are
+ not seen by the kernel, but their ISDN-part can be configured with
+ isdnctrl as usual. (Phone numbers, EAZ/MSN, timeouts etc.) If more
+ than two channels are to be bundled, feel free to create as many as you
+ want. InterfaceName must be a real interface, NOT a slave. Slave interfaces
+ start dialing, if the master interface resp. the previous slave interface
+ has a load of more than 7000 cps. They hangup if the load goes under 7000
+ cps, according to their "huptimeout"-parameter.
+
+ "isdnctrl sdelay <InterfaceName> secs."
+ This sets the minimum time an Interface has to be fully loaded, until
+ it sends a dial-request to its slave.
+
+ "isdnctrl dial <InterfaceName>"
+ Forces an interface to start dialing even if no packets are to be
+ transferred.
+
+ "isdnctrl mapping <DriverId> MSN0,MSN1,MSN2,...MSN9"
+ This installs a mapping table for EAZ<->MSN-mapping for a single line.
+ Missing MSN's have to be given as "-" or can be omitted, if at the end
+ of the commandline.
+ With this command, it's now possible to have an interface listening to
+ mixed 1TR6- and Euro-Type lines. In this case, the interface has to be
+ configured to a 1TR6-type EAZ (one digit). The mapping is also valid
+ for tty-emulation. Seen from the interface/tty-level the mapping
+ CAN be used, however it's possible to use single tty's/interfaces with
+ real MSN's (more digits) also, in which case the mapping will be ignored.
+ Here is an example:
+
+ You have a 1TR6-type line with base-nr. 1234567 and a Euro-line with
+ MSN's 987654, 987655 and 987656. The DriverId for the Euro-line is "EURO".
+
+ isdnctrl mapping EURO -,987654,987655,987656,-,987655
+ ...
+ isdnctrl eaz isdn0 1 # listen on 12345671(1tr6) and 987654(euro)
+ ...
+ isdnctrl eaz isdn1 4 # listen on 12345674(1tr6) only.
+ ...
+ isdnctrl eaz isdn2 987654 # listen on 987654(euro) only.
+
+ Same scheme is used with AT&E... at the tty's.
+
+6. If you want to write a new low-level-driver, you are welcome.
+ The interface to the link-level-module is described in the file INTERFACE.
+ If the interface should be expanded for any reason, don't do it
+ on your own, send me a mail containing the proposed changes and
+ some reasoning about them.
+ If other drivers will not be affected, I will include the changes
+ in the next release.
+ For developers only, there is a second mailing-list. Write to me
+ (fritz@isdn4linux.de), if you want to join that list.
+
+Have fun!
+
+ -Fritz
+
diff --git a/Documentation/isdn/README.FAQ b/Documentation/isdn/README.FAQ
new file mode 100644
index 0000000..356f794
--- /dev/null
+++ b/Documentation/isdn/README.FAQ
@@ -0,0 +1,26 @@
+
+The FAQ for isdn4linux
+======================
+
+Please note that there is a big FAQ available in the isdn4k-utils.
+You find it in:
+ isdn4k-utils/FAQ/i4lfaq.sgml
+
+In case you just want to see the FAQ online, or download the newest version,
+you can have a look at my website:
+http://www.mhessler.de/i4lfaq/ (view + download)
+or:
+http://www.isdn4linux.de/faq/ (view)
+
+As the extension tells, the FAQ is in SGML format, and you can convert it
+into text/html/... format by using the sgml2txt/sgml2html/... tools.
+Alternatively, you can also do a 'configure; make all' in the FAQ directory.
+
+
+Please have a look at the FAQ before posting anything in the Mailinglist,
+or the newsgroup!
+
+
+Matthias Hessler
+hessler@isdn4linux.de
+
diff --git a/Documentation/isdn/README.HiSax b/Documentation/isdn/README.HiSax
new file mode 100644
index 0000000..031c8d8
--- /dev/null
+++ b/Documentation/isdn/README.HiSax
@@ -0,0 +1,659 @@
+HiSax is a Linux hardware-level driver for passive ISDN cards with Siemens
+chipset (ISAC_S 2085/2086/2186, HSCX SAB 82525). It is based on the Teles
+driver from Jan den Ouden.
+It is meant to be used with isdn4linux, an ISDN link-level module for Linux
+written by Fritz Elfert.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+Supported cards
+---------------
+
+Teles 8.0/16.0/16.3 and compatible ones
+Teles 16.3c
+Teles S0/PCMCIA
+Teles PCI
+Teles S0Box
+Creatix S0Box
+Creatix PnP S0
+Compaq ISDN S0 ISA card
+AVM A1 (Fritz, Teledat 150)
+AVM Fritz PCMCIA
+AVM Fritz PnP
+AVM Fritz PCI
+ELSA Microlink PCC-16, PCF, PCF-Pro, PCC-8
+ELSA Quickstep 1000
+ELSA Quickstep 1000PCI
+ELSA Quickstep 3000 (same settings as QS1000)
+ELSA Quickstep 3000PCI
+ELSA PCMCIA
+ITK ix1-micro Rev.2
+Eicon Diva 2.0 ISA and PCI (S0 and U interface, no PRO version)
+Eicon Diva 2.01 ISA and PCI
+Eicon Diva 2.02 PCI
+Eicon Diva Piccola
+ASUSCOM NETWORK INC. ISDNLink 128K PC adapter (order code I-IN100-ST-D)
+Dynalink IS64PH (OEM version of ASUSCOM NETWORK INC. ISDNLink 128K adapter)
+PCBIT-DP (OEM version of ASUSCOM NETWORK INC. ISDNLink)
+HFC-2BS0 based cards (TeleInt SA1)
+Sedlbauer Speed Card (Speed Win, Teledat 100, PCI, Fax+)
+Sedlbauer Speed Star/Speed Star2 (PCMCIA)
+Sedlbauer ISDN-Controller PC/104
+USR Sportster internal TA (compatible Stollmann tina-pp V3)
+USR internal TA PCI
+ith Kommunikationstechnik GmbH MIC 16 ISA card
+Traverse Technologie NETjet PCI S0 card and NETspider U card
+Ovislink ISDN sc100-p card (NETjet driver)
+Dr. Neuhaus Niccy PnP/PCI
+Siemens I-Surf 1.0
+Siemens I-Surf 2.0 (with IPAC, try type 12 asuscom)
+ACER P10
+HST Saphir
+Berkom Telekom A4T
+Scitel Quadro
+Gazel ISDN cards
+HFC-PCI based cards
+Winbond W6692 based cards
+HFC-S+, HFC-SP/PCMCIA cards
+formula-n enternow
+Gerdes Power ISDN
+
+Note: PCF, PCF-Pro: up to now, only the ISDN part is supported
+ PCC-8: not tested yet
+ Eicon.Diehl Diva U interface not tested
+
+If you know other passive cards with the Siemens chipset, please let me know.
+You can combine any card, if there is no conflict between the resources
+(io, mem, irq).
+
+
+Configuring the driver
+----------------------
+
+The HiSax driver can either be built directly into the kernel or as a module.
+It can be configured using the command line feature while loading the kernel
+with LILO or LOADLIN or, if built as a module, using insmod/modprobe with
+parameters.
+There is also some config needed before you compile the kernel and/or
+modules. It is included in the normal "make [menu]config" target at the
+kernel. Don't forget it, especially to select the right D-channel protocol.
+
+Please note: In older versions of the HiSax driver, all PnP cards
+needed to be configured with isapnp and worked only with the HiSax
+driver used as a module.
+
+In the current version, HiSax will automatically use the in-kernel
+ISAPnP support, provided you selected it during kernel configuration
+(CONFIG_ISAPNP), if you don't give the io=, irq= command line parameters.
+
+The affected card types are: 4,7,12,14,19,27-30
+
+a) when built as a module
+-------------------------
+
+insmod/modprobe hisax.o \
+ io=iobase irq=IRQ mem=membase type=card_type \
+ protocol=D_channel_protocol id=idstring
+
+or, if several cards are installed:
+
+insmod/modprobe hisax.o \
+ io=iobase1,iobase2,... irq=IRQ1,IRQ2,... mem=membase1,membase2,... \
+ type=card_type1,card_type2,... \
+ protocol=D_channel_protocol1,D_channel_protocol2,... \
+ id=idstring1%idstring2 ...
+
+where "iobaseN" represents the I/O base address of the Nth card, "membaseN"
+the memory base address of the Nth card, etc.
+
+The reason for the delimiter "%" being used in the idstrings is that ","
+won't work with the current modules package.
+
+The parameters may be specified in any order. For example, the "io"
+parameter may precede the "irq" parameter, or vice versa. If several
+cards are installed, the ordering within the comma separated parameter
+lists must of course be consistent.
+
+Only parameters applicable to the card type need to be specified. For
+example, the Teles 16.3 card is not memory-mapped, so the "mem"
+parameter may be omitted for this card. Sometimes it may be necessary
+to specify a dummy parameter, however. This is the case when there is
+a card of a different type later in the list that needs a parameter
+which the preceding card does not. For instance, if a Teles 16.0 card
+is listed after a Teles 16.3 card, a dummy memory base parameter of 0
+must be specified for the 16.3. Instead of a dummy value, the parameter
+can also be skipped by simply omitting the value. For example:
+mem=,0xd0000. See example 6 below.
+
+The parameter for the D-Channel protocol may be omitted if you selected the
+correct one during kernel config. Valid values are "1" for German 1TR6,
+"2" for EDSS1 (Euro ISDN), "3" for leased lines (no D-Channel) and "4"
+for US NI1.
+With US NI1 you have to include your SPID into the MSN setting in the form
+<MSN>:<SPID> for example (your phonenumber is 1234 your SPID 5678):
+AT&E1234:5678 on ttyI interfaces
+isdnctrl eaz ippp0 1234:5678 on network devices
+
+The Creatix/Teles PnP cards use io1= and io2= instead of io= for specifying
+the I/O addresses of the ISAC and HSCX chips, respectively.
+
+Card types:
+
+ Type Required parameters (in addition to type and protocol)
+
+ 1 Teles 16.0 irq, mem, io
+ 2 Teles 8.0 irq, mem
+ 3 Teles 16.3 (non PnP) irq, io
+ 4 Creatix/Teles PnP irq, io0 (ISAC), io1 (HSCX)
+ 5 AVM A1 (Fritz) irq, io
+ 6 ELSA PCC/PCF cards io or nothing for autodetect (the iobase is
+ required only if you have more than one ELSA
+ card in your PC)
+ 7 ELSA Quickstep 1000 irq, io (from isapnp setup)
+ 8 Teles 16.3 PCMCIA irq, io
+ 9 ITK ix1-micro Rev.2 irq, io
+ 10 ELSA PCMCIA irq, io (set with card manager)
+ 11 Eicon.Diehl Diva ISA PnP irq, io
+ 11 Eicon.Diehl Diva PCI no parameter
+ 12 ASUS COM ISDNLink irq, io (from isapnp setup)
+ 13 HFC-2BS0 based cards irq, io
+ 14 Teles 16.3c PnP irq, io
+ 15 Sedlbauer Speed Card irq, io
+ 15 Sedlbauer PC/104 irq, io
+ 15 Sedlbauer Speed PCI no parameter
+ 16 USR Sportster internal irq, io
+ 17 MIC card irq, io
+ 18 ELSA Quickstep 1000PCI no parameter
+ 19 Compaq ISDN S0 ISA card irq, io0, io1, io (from isapnp setup io=IO2)
+ 20 NETjet PCI card no parameter
+ 21 Teles PCI no parameter
+ 22 Sedlbauer Speed Star (PCMCIA) irq, io (set with card manager)
+ 24 Dr. Neuhaus Niccy PnP irq, io0, io1 (from isapnp setup)
+ 24 Dr. Neuhaus Niccy PCI no parameter
+ 25 Teles S0Box irq, io (of the used lpt port)
+ 26 AVM A1 PCMCIA (Fritz!) irq, io (set with card manager)
+ 27 AVM PnP (Fritz!PnP) irq, io (from isapnp setup)
+ 27 AVM PCI (Fritz!PCI) no parameter
+ 28 Sedlbauer Speed Fax+ irq, io (from isapnp setup)
+ 29 Siemens I-Surf 1.0 irq, io, memory (from isapnp setup)
+ 30 ACER P10 irq, io (from isapnp setup)
+ 31 HST Saphir irq, io
+ 32 Telekom A4T none
+ 33 Scitel Quadro subcontroller (4*S0, subctrl 1...4)
+ 34 Gazel ISDN cards (ISA) irq,io
+ 34 Gazel ISDN cards (PCI) none
+ 35 HFC 2BDS0 PCI none
+ 36 W6692 based PCI cards none
+ 37 HFC 2BDS0 S+, SP irq,io
+ 38 NETspider U PCI card none
+ 39 HFC 2BDS0 SP/PCMCIA irq,io (set with cardmgr)
+ 40 hotplug interface
+ 41 Formula-n enter:now PCI none
+
+At the moment IRQ sharing is only possible with PCI cards. Please make sure
+that your IRQ is free and enabled for ISA use.
+
+
+Examples for module loading
+
+1. Teles 16.3, Euro ISDN, I/O base 280 hex, IRQ 10
+ modprobe hisax type=3 protocol=2 io=0x280 irq=10
+
+2. Teles 16.0, 1TR6 ISDN, I/O base d80 hex, IRQ 5, Memory d0000 hex
+ modprobe hisax protocol=1 type=1 io=0xd80 mem=0xd0000 irq=5
+
+3. Fritzcard, Euro ISDN, I/O base 340 hex, IRQ 10 and ELSA PCF, Euro ISDN
+ modprobe hisax type=5,6 protocol=2,2 io=0x340 irq=10 id=Fritz%Elsa
+
+4. Any ELSA PCC/PCF card, Euro ISDN
+ modprobe hisax type=6 protocol=2
+
+5. Teles 16.3 PnP, Euro ISDN, with isapnp configured
+ isapnp config: (INT 0 (IRQ 10 (MODE +E)))
+ (IO 0 (BASE 0x0580))
+ (IO 1 (BASE 0x0180))
+ modprobe hisax type=4 protocol=2 irq=10 io0=0x580 io1=0x180
+
+ In the current version of HiSax, you can instead simply use
+
+ modprobe hisax type=4 protocol=2
+
+ if you configured your kernel for ISAPnP. Don't run isapnp in
+ this case!
+
+6. Teles 16.3, Euro ISDN, I/O base 280 hex, IRQ 12 and
+ Teles 16.0, 1TR6, IRQ 5, Memory d0000 hex
+ modprobe hisax type=3,1 protocol=2,1 io=0x280 mem=0,0xd0000
+
+ Please note the dummy 0 memory address for the Teles 16.3, used as a
+ placeholder as described above, in the last example.
+
+7. Teles PCMCIA, Euro ISDN, I/O base 180 hex, IRQ 15 (default values)
+ modprobe hisax type=8 protocol=2 io=0x180 irq=15
+
+
+b) using LILO/LOADLIN, with the driver compiled directly into the kernel
+------------------------------------------------------------------------
+
+hisax=typ1,dp1,pa_1,pb_1,pc_1[,typ2,dp2,pa_2 ... \
+ typn,dpn,pa_n,pb_n,pc_n][,idstring1[,idstring2,...,idstringn]]
+
+where
+ typ1 = type of 1st card (default depends on kernel settings)
+ dp1 = D-Channel protocol of 1st card. 1=1TR6, 2=EDSS1, 3=leased
+ pa_1 = 1st parameter (depending on the type of the card)
+ pb_1 = 2nd parameter ( " " " " " " " )
+ pc_1 = 3rd parameter ( " " " " " " " )
+
+ typ2,dp2,pa_2,pb_2,pc_2 = Parameters of the second card (defaults: none)
+ typn,dpn,pa_n,pb_n,pc_n = Parameters of the n'th card (up to 16 cards are
+ supported)
+
+ idstring = Driver ID for accessing the particular card with utility
+ programs and for identification when using a line monitor
+ (default: "HiSax")
+
+ Note: the ID string must start with an alphabetical character!
+
+Card types:
+
+type
+ 1 Teles 16.0 pa=irq pb=membase pc=iobase
+ 2 Teles 8.0 pa=irq pb=membase
+ 3 Teles 16.3 pa=irq pb=iobase
+ 4 Creatix/Teles PNP ONLY WORKS AS A MODULE !
+ 5 AVM A1 (Fritz) pa=irq pb=iobase
+ 6 ELSA PCC/PCF cards pa=iobase or nothing for autodetect
+ 7 ELSA Quickstep 1000 ONLY WORKS AS A MODULE !
+ 8 Teles S0 PCMCIA pa=irq pb=iobase
+ 9 ITK ix1-micro Rev.2 pa=irq pb=iobase
+ 10 ELSA PCMCIA pa=irq, pb=io (set with card manager)
+ 11 Eicon.Diehl Diva ISAPnP ONLY WORKS AS A MODULE !
+ 11 Eicon.Diehl Diva PCI no parameter
+ 12 ASUS COM ISDNLink ONLY WORKS AS A MODULE !
+ 13 HFC-2BS0 based cards pa=irq pb=io
+ 14 Teles 16.3c PnP ONLY WORKS AS A MODULE !
+ 15 Sedlbauer Speed Card pa=irq pb=io (Speed Win only as module !)
+ 15 Sedlbauer PC/104 pa=irq pb=io
+ 15 Sedlbauer Speed PCI no parameter
+ 16 USR Sportster internal pa=irq pb=io
+ 17 MIC card pa=irq pb=io
+ 18 ELSA Quickstep 1000PCI no parameter
+ 19 Compaq ISDN S0 ISA card ONLY WORKS AS A MODULE !
+ 20 NETjet PCI card no parameter
+ 21 Teles PCI no parameter
+ 22 Sedlbauer Speed Star (PCMCIA) pa=irq, pb=io (set with card manager)
+ 24 Dr. Neuhaus Niccy PnP ONLY WORKS AS A MODULE !
+ 24 Dr. Neuhaus Niccy PCI no parameter
+ 25 Teles S0Box pa=irq, pb=io (of the used lpt port)
+ 26 AVM A1 PCMCIA (Fritz!) pa=irq, pb=io (set with card manager)
+ 27 AVM PnP (Fritz!PnP) ONLY WORKS AS A MODULE !
+ 27 AVM PCI (Fritz!PCI) no parameter
+ 28 Sedlbauer Speed Fax+ ONLY WORKS AS A MODULE !
+ 29 Siemens I-Surf 1.0 ONLY WORKS AS A MODULE !
+ 30 ACER P10 ONLY WORKS AS A MODULE !
+ 31 HST Saphir pa=irq, pb=io
+ 32 Telekom A4T no parameter
+ 33 Scitel Quadro subcontroller (4*S0, subctrl 1...4)
+ 34 Gazel ISDN cards (ISA) pa=irq, pb=io
+ 34 Gazel ISDN cards (PCI) no parameter
+ 35 HFC 2BDS0 PCI no parameter
+ 36 W6692 based PCI cards none
+ 37 HFC 2BDS0 S+,SP/PCMCIA ONLY WORKS AS A MODULE !
+ 38 NETspider U PCI card none
+ 39 HFC 2BDS0 SP/PCMCIA ONLY WORKS AS A MODULE !
+ 40 hotplug interface ONLY WORKS AS A MODULE !
+ 41 Formula-n enter:now PCI none
+
+Running the driver
+------------------
+
+When you insmod isdn.o and hisax.o (or with the in-kernel version, during
+boot time), a few lines should appear in your syslog. Look for something like:
+
+Apr 13 21:01:59 kke01 kernel: HiSax: Driver for Siemens chip set ISDN cards
+Apr 13 21:01:59 kke01 kernel: HiSax: Version 2.9
+Apr 13 21:01:59 kke01 kernel: HiSax: Revisions 1.14/1.9/1.10/1.25/1.8
+Apr 13 21:01:59 kke01 kernel: HiSax: Total 1 card defined
+Apr 13 21:01:59 kke01 kernel: HiSax: Card 1 Protocol EDSS1 Id=HiSax1 (0)
+Apr 13 21:01:59 kke01 kernel: HiSax: Elsa driver Rev. 1.13
+...
+Apr 13 21:01:59 kke01 kernel: Elsa: PCF-Pro found at 0x360 Rev.:C IRQ 10
+Apr 13 21:01:59 kke01 kernel: Elsa: timer OK; resetting card
+Apr 13 21:01:59 kke01 kernel: Elsa: HSCX version A: V2.1 B: V2.1
+Apr 13 21:01:59 kke01 kernel: Elsa: ISAC 2086/2186 V1.1
+...
+Apr 13 21:01:59 kke01 kernel: HiSax: DSS1 Rev. 1.14
+Apr 13 21:01:59 kke01 kernel: HiSax: 2 channels added
+
+This means that the card is ready for use.
+Cabling problems or line-downs are not detected, and only some ELSA cards can
+detect the S0 power.
+
+Remember that, according to the new strategy for accessing low-level drivers
+from within isdn4linux, you should also define a driver ID while doing
+insmod: Simply append hisax_id=<SomeString> to the insmod command line. This
+string MUST NOT start with a digit or a small 'x'!
+
+At this point you can run a 'cat /dev/isdnctrl0' and view debugging messages.
+
+At the moment, debugging messages are enabled with the hisaxctrl tool:
+
+ hisaxctrl <DriverId> DebugCmd <debugging_flags>
+
+<DriverId> default is HiSax, if you didn't specify one.
+
+DebugCmd is 1 for generic debugging
+ 11 for layer 1 development debugging
+ 13 for layer 3 development debugging
+
+where <debugging_flags> is the integer sum of the following debugging
+options you wish enabled:
+
+With DebugCmd set to 1:
+
+ 0x0001 Link-level <--> hardware-level communication
+ 0x0002 Top state machine
+ 0x0004 D-Channel Frames for isdnlog
+ 0x0008 D-Channel Q.921
+ 0x0010 B-Channel X.75
+ 0x0020 D-Channel l2
+ 0x0040 B-Channel l2
+ 0x0080 D-Channel link state debugging
+ 0x0100 B-Channel link state debugging
+ 0x0200 TEI debug
+ 0x0400 LOCK debug in callc.c
+ 0x0800 More paranoid debug in callc.c (not for normal use)
+ 0x1000 D-Channel l1 state debugging
+ 0x2000 B-Channel l1 state debugging
+
+With DebugCmd set to 11:
+
+ 0x0001 Warnings (default: on)
+ 0x0002 IRQ status
+ 0x0004 ISAC
+ 0x0008 ISAC FIFO
+ 0x0010 HSCX
+ 0x0020 HSCX FIFO (attention: full B-Channel output!)
+ 0x0040 D-Channel LAPD frame types
+ 0x0080 IPAC debug
+ 0x0100 HFC receive debug
+ 0x0200 ISAC monitor debug
+ 0x0400 D-Channel frames for isdnlog (set with 1 0x4 too)
+ 0x0800 D-Channel message verbose
+
+With DebugCmd set to 13:
+
+ 1 Warnings (default: on)
+ 2 l3 protocol descriptor errors
+ 4 l3 state machine
+ 8 charge info debugging (1TR6)
+
+For example, 'hisaxctrl HiSax 1 0x3ff' enables full generic debugging.
+
+Because of some obscure problems with some switch equipment, the delay
+between the CONNECT message and sending the first data on the B-channel is now
+configurable with
+
+hisaxctrl <DriverId> 2 <delay>
+<delay> in ms Value between 50 and 800 ms is recommended.
+
+Downloading Firmware
+--------------------
+At the moment, the Sedlbauer speed fax+ is the only card, which
+needs to download firmware.
+The firmware is downloaded with the hisaxctrl tool:
+
+ hisaxctrl <DriverId> 9 <firmware_filename>
+
+<DriverId> default is HiSax, if you didn't specify one,
+
+where <firmware_filename> is the filename of the firmware file.
+
+For example, 'hisaxctrl HiSax 9 ISAR.BIN' downloads the firmware for
+ISAR based cards (like the Sedlbauer speed fax+).
+
+Warning
+-------
+HiSax is a work in progress and may crash your machine.
+For certification look at HiSax.cert file.
+
+Limitations
+-----------
+At this time, HiSax only works on Euro ISDN lines and German 1TR6 lines.
+For leased lines see appendix.
+
+Bugs
+----
+If you find any, please let me know.
+
+
+Thanks
+------
+Special thanks to:
+
+ Emil Stephan for the name HiSax which is a mix of HSCX and ISAC.
+
+ Fritz Elfert, Jan den Ouden, Michael Hipp, Michael Wein,
+ Andreas Kool, Pekka Sarnila, Sim Yskes, Johan Myrre'en,
+ Klaus-Peter Nischke (ITK AG), Christof Petig, Werner Fehn (ELSA GmbH),
+ Volker Schmidt
+ Edgar Toernig and Marcus Niemann for the Sedlbauer driver
+ Stephan von Krawczynski
+ Juergen Quade for the Leased Line part
+ Klaus Lichtenwalder (Klaus.Lichtenwalder@WebForum.DE), for ELSA PCMCIA support
+ Enrik Berkhan (enrik@starfleet.inka.de) for S0BOX specific stuff
+ Ton van Rosmalen for Teles PCI
+ Petr Novak <petr.novak@i.cz> for Winbond W6692 support
+ Werner Cornelius <werner@isdn4linux.de> for HFC-PCI, HFC-S(+/P) and supplementary services support
+ and more people who are hunting bugs. (If I forgot somebody, please
+ send me a mail).
+
+ Firma ELSA GmbH
+ Firma Eicon.Diehl GmbH
+ Firma Dynalink NL
+ Firma ASUSCOM NETWORK INC. Taiwan
+ Firma S.u.S.E
+ Firma ith Kommunikationstechnik GmbH
+ Firma Traverse Technologie Australia
+ Firma Medusa GmbH (www.medusa.de).
+ Firma Quant-X Austria for sponsoring a DEC Alpha board+CPU
+ Firma Cologne Chip Designs GmbH
+
+ My girl friend and partner in life Ute for her patience with me.
+
+
+Enjoy,
+
+Karsten Keil
+keil@isdn4linux.de
+
+
+Appendix: Teles PCMCIA driver
+-----------------------------
+
+See
+ http://www.stud.uni-wuppertal.de/~ea0141/pcmcia.html
+for instructions.
+
+Appendix: Linux and ISDN-leased lines
+-------------------------------------
+
+Original from Juergen Quade, new version KKe.
+
+Attention NEW VERSION, the old leased line syntax won't work !!!
+
+You can use HiSax to connect your Linux-Box via an ISDN leased line
+to e.g. the Internet:
+
+1. Build a kernel which includes the HiSax driver either as a module
+ or as part of the kernel.
+ cd /usr/src/linux
+ make menuconfig
+ <ISDN subsystem - ISDN support -- HiSax>
+ make clean; make zImage; make modules; make modules_install
+2. Install the new kernel
+ cp /usr/src/linux/arch/i386/boot/zImage /etc/kernel/linux.isdn
+ vi /etc/lilo.conf
+ <add new kernel in the bootable image section>
+ lilo
+3. in case the hisax driver is a "fixed" part of the kernel, configure
+ the driver with lilo:
+ vi /etc/lilo.conf
+ <add HiSax driver parameter in the global section (see below)>
+ lilo
+ Your lilo.conf _might_ look like the following:
+
+ # LILO configuration-file
+ # global section
+ # teles 16.0 on IRQ=5, MEM=0xd8000, PORT=0xd80
+ append="hisax=1,3,5,0xd8000,0xd80,HiSax"
+ # teles 16.3 (non pnp) on IRQ=15, PORT=0xd80
+ # append="hisax=3,3,5,0xd8000,0xd80,HiSax"
+ boot=/dev/sda
+ compact # faster, but won't work on all systems.
+ linear
+ read-only
+ prompt
+ timeout=100
+ vga = normal # force sane state
+ # Linux bootable partition config begins
+ image = /etc/kernel/linux.isdn
+ root = /dev/sda1
+ label = linux.isdn
+ #
+ image = /etc/kernel/linux-2.0.30
+ root = /dev/sda1
+ label = linux.secure
+
+ In the line starting with "append" you have to adapt the parameters
+ according to your card (see above in this file)
+
+3. boot the new linux.isdn kernel
+4. start the ISDN subsystem:
+ a) load - if necessary - the modules (depends, whether you compiled
+ the ISDN driver as module or not)
+ According to the type of card you have to specify the necessary
+ driver parameter (irq, io, mem, type, protocol).
+ For the leased line the protocol is "3". See the table above for
+ the parameters, which you have to specify depending on your card.
+ b) configure i4l
+ /sbin/isdnctrl addif isdn0
+ # EAZ 1 -- B1 channel 2 --B2 channel
+ /sbin/isdnctrl eaz isdn0 1
+ /sbin/isdnctrl secure isdn0 on
+ /sbin/isdnctrl huptimeout isdn0 0
+ /sbin/isdnctrl l2_prot isdn0 hdlc
+ # Attention you must not set an outgoing number !!! This won't work !!!
+ # The incoming number is LEASED0 for the first card, LEASED1 for the
+ # second and so on.
+ /sbin/isdnctrl addphone isdn0 in LEASED0
+ # Here is no need to bind the channel.
+ c) in case the remote partner is a CISCO:
+ /sbin/isdnctrl encap isdn0 cisco-h
+ d) configure the interface
+ /sbin/ifconfig isdn0 ${LOCAL_IP} pointopoint ${REMOTE_IP}
+ e) set the routes
+ /sbin/route add -host ${REMOTE_IP} isdn0
+ /sbin/route add default gw ${REMOTE_IP}
+ f) switch the card into leased mode for each used B-channel
+ /sbin/hisaxctrl HiSax 5 1
+
+Remarks:
+a) Use state of the art isdn4k-utils
+
+Here an example script:
+#!/bin/sh
+# Start/Stop ISDN leased line connection
+
+I4L_AS_MODULE=yes
+I4L_REMOTE_IS_CISCO=no
+I4L_MODULE_PARAMS="type=16 io=0x268 irq=7 "
+I4L_DEBUG=no
+I4L_LEASED_128K=yes
+LOCAL_IP=192.168.1.1
+REMOTE_IP=192.168.2.1
+
+case "$1" in
+ start)
+ echo "Starting ISDN ..."
+ if [ ${I4L_AS_MODULE} = "yes" ]; then
+ echo "loading modules..."
+ /sbin/modprobe hisax ${I4L_MODULE_PARAMS}
+ fi
+ # configure interface
+ /sbin/isdnctrl addif isdn0
+ /sbin/isdnctrl secure isdn0 on
+ if [ ${I4L_DEBUG} = "yes" ]; then
+ /sbin/isdnctrl verbose 7
+ /sbin/hisaxctrl HiSax 1 0xffff
+ /sbin/hisaxctrl HiSax 11 0xff
+ cat /dev/isdnctrl >/tmp/lea.log &
+ fi
+ if [ ${I4L_REMOTE_IS_CISCO} = "yes" ]; then
+ /sbin/isdnctrl encap isdn0 cisco-h
+ fi
+ /sbin/isdnctrl huptimeout isdn0 0
+ # B-CHANNEL 1
+ /sbin/isdnctrl eaz isdn0 1
+ /sbin/isdnctrl l2_prot isdn0 hdlc
+ # 1. card
+ /sbin/isdnctrl addphone isdn0 in LEASED0
+ if [ ${I4L_LEASED_128K} = "yes" ]; then
+ /sbin/isdnctrl addslave isdn0 isdn0s
+ /sbin/isdnctrl secure isdn0s on
+ /sbin/isdnctrl huptimeout isdn0s 0
+ # B-CHANNEL 2
+ /sbin/isdnctrl eaz isdn0s 2
+ /sbin/isdnctrl l2_prot isdn0s hdlc
+ # 1. card
+ /sbin/isdnctrl addphone isdn0s in LEASED0
+ if [ ${I4L_REMOTE_IS_CISCO} = "yes" ]; then
+ /sbin/isdnctrl encap isdn0s cisco-h
+ fi
+ fi
+ /sbin/isdnctrl dialmode isdn0 manual
+ # configure tcp/ip
+ /sbin/ifconfig isdn0 ${LOCAL_IP} pointopoint ${REMOTE_IP}
+ /sbin/route add -host ${REMOTE_IP} isdn0
+ /sbin/route add default gw ${REMOTE_IP}
+ # switch to leased mode
+ # B-CHANNEL 1
+ /sbin/hisaxctrl HiSax 5 1
+ if [ ${I4L_LEASED_128K} = "yes" ]; then
+ # B-CHANNEL 2
+ sleep 10; /* Wait for master */
+ /sbin/hisaxctrl HiSax 5 2
+ fi
+ ;;
+ stop)
+ /sbin/ifconfig isdn0 down
+ /sbin/isdnctrl delif isdn0
+ if [ ${I4L_DEBUG} = "yes" ]; then
+ killall cat
+ fi
+ if [ ${I4L_AS_MODULE} = "yes" ]; then
+ /sbin/rmmod hisax
+ /sbin/rmmod isdn
+ /sbin/rmmod ppp
+ /sbin/rmmod slhc
+ fi
+ ;;
+ *)
+ echo "Usage: $0 {start|stop}"
+ exit 1
+esac
+exit 0
diff --git a/Documentation/isdn/README.act2000 b/Documentation/isdn/README.act2000
new file mode 100644
index 0000000..ce7115e
--- /dev/null
+++ b/Documentation/isdn/README.act2000
@@ -0,0 +1,104 @@
+$Id: README.act2000,v 1.3 2000/08/06 09:22:51 armin Exp $
+
+This document describes the ACT2000 driver for the
+IBM Active 2000 ISDN card.
+
+There are 3 Types of this card available. A ISA-, MCA-, and PCMCIA-Bus
+Version. Currently, only the ISA-Bus version of the card is supported.
+However MCA and PCMCIA will follow soon.
+
+The ISA-Bus Version uses 8 IO-ports. The base port address has to be set
+manually using the DIP switches.
+
+Setting up the DIP switches for the IBM Active 2000 ISDN card:
+
+ Note: S5 and S6 always set off!
+
+ S1 S2 S3 S4 Base-port
+ on on on on 0x0200 (Factory default)
+ off on on on 0x0240
+ on off on on 0x0280
+ off off on on 0x02c0
+ on on off on 0x0300
+ off on off on 0x0340
+ on off off on 0x0380
+ on on on off 0xcfe0
+ off on on off 0xcfa0
+ on off on off 0xcf60
+ off off on off 0xcf20
+ on on off off 0xcee0
+ off on off off 0xcea0
+ on off off off 0xce60
+ off off off off Card disabled
+
+IRQ is configured by software. Possible values are:
+
+ 3, 5, 7, 10, 11, 12, 15 and none (polled mode)
+
+
+The ACT2000 driver may either be built into the kernel or as a module.
+Initialization depends on how the driver is built:
+
+Driver built into the kernel:
+
+ The ACT2000 driver can be configured using the commandline-feature while
+ loading the kernel with LILO or LOADLIN. It accepts the following syntax:
+
+ act2000=b,p,i[,idstring]
+
+ where
+
+ b = Bus-Type (1=ISA, 2=MCA, 3=PCMCIA)
+ p = portbase (-1 means autoprobe)
+ i = Interrupt (-1 means use next free IRQ, 0 means polled mode)
+
+ The idstring is an arbitrary string used for referencing the card
+ by the actctrl tool later.
+
+ Defaults used, when no parameters given at all:
+
+ 1,-1,-1,""
+
+ which means: Autoprobe for an ISA card, use next free IRQ, let the
+ ISDN linklevel fill the IdString (usually "line0" for the first card).
+
+ If you like to use more than one card, you can use the program
+ "actctrl" from the utility-package to configure additional cards.
+
+ Using the "actctrl"-utility, portbase and irq can also be changed
+ during runtime. The D-channel protocol is configured by the "dproto"
+ option of the "actctrl"-utility after loading the firmware into the
+ card's memory using the "actctrl"-utility.
+
+Driver built as module:
+
+ The module act2000.o can be configured during modprobe (insmod) by
+ appending its parameters to the modprobe resp. insmod commandline.
+ The following syntax is accepted:
+
+ act_bus=b act_port=p act_irq=i act_id=idstring
+
+ where b, p, i and idstring have the same meanings as the parameters
+ described for the builtin version above.
+
+ Using the "actctrl"-utility, the same features apply to the modularized
+ version as to the kernel-builtin one. (i.e. loading of firmware and
+ configuring the D-channel protocol)
+
+Loading the firmware into the card:
+
+ The firmware is supplied together with the isdn4k-utils package. It
+ can be found in the subdirectory act2000/firmware/
+
+ Assuming you have installed the utility-package correctly, the firmware
+ will be downloaded into the card using the following command:
+
+ actctrl -d idstring load /etc/isdn/bip11.btl
+
+ where idstring is the Name of the card, given during insmod-time or
+ (for kernel-builtin driver) on the kernel commandline. If only one
+ ISDN card is used, the -d isdstrin may be omitted.
+
+ For further documentation (adding more IBM Active 2000 cards), refer to
+ the manpage actctrl.8 which is included in the isdn4k-utils package.
+
diff --git a/Documentation/isdn/README.audio b/Documentation/isdn/README.audio
new file mode 100644
index 0000000..8ebca19
--- /dev/null
+++ b/Documentation/isdn/README.audio
@@ -0,0 +1,138 @@
+$Id: README.audio,v 1.8 1999/07/11 17:17:29 armin Exp $
+
+ISDN subsystem for Linux.
+ Description of audio mode.
+
+When enabled during kernel configuration, the tty emulator of the ISDN
+subsystem is capable of a reduced set of commands to support audio.
+This document describes the commands supported and the format of
+audio data.
+
+Commands for enabling/disabling audio mode:
+
+ AT+FCLASS=8 Enable audio mode.
+ This affects the following registers:
+ S18: Bits 0 and 2 are set.
+ S16: Set to 48 and any further change to
+ larger values is blocked.
+ AT+FCLASS=0 Disable audio mode.
+ Register 18 is set to 4.
+ AT+FCLASS=? Show possible modes.
+ AT+FCLASS? Report current mode (0 or 8).
+
+Commands supported in audio mode:
+
+All audio mode commands have one of the following forms:
+
+ AT+Vxx? Show current setting.
+ AT+Vxx=? Show possible settings.
+ AT+Vxx=v Set simple parameter.
+ AT+Vxx=v,v ... Set complex parameter.
+
+where xx is a two-character code and v are alphanumerical parameters.
+The following commands are supported:
+
+ AT+VNH=x Auto hangup setting. NO EFFECT, supported
+ for compatibility only.
+ AT+VNH? Always reporting "1"
+ AT+VNH=? Always reporting "1"
+
+ AT+VIP Reset all audio parameters.
+
+ AT+VLS=x Line select. x is one of the following:
+ 0 = No device.
+ 2 = Phone line.
+ AT+VLS=? Always reporting "0,2"
+ AT+VLS? Show current line.
+
+ AT+VRX Start recording. Emulator responds with
+ CONNECT and starts sending audio data to
+ the application. See below for data format
+
+ AT+VSD=x,y Set silence-detection parameters.
+ Possible parameters:
+ x = 0 ... 31 sensitivity threshold level.
+ (default 0 , deactivated)
+ y = 0 ... 255 range of interval in units
+ of 0.1 second. (default 70)
+ AT+VSD=? Report possible parameters.
+ AT+VSD? Show current parameters.
+
+ AT+VDD=x,y Set DTMF-detection parameters.
+ Only possible if online and during this connection.
+ Possible parameters:
+ x = 0 ... 15 sensitivity threshold level.
+ (default 0 , I4L soft-decode)
+ (1-15 soft-decode off, hardware on)
+ y = 0 ... 255 tone duration in units of 5ms.
+ Not for I4L soft decode (default 8, 40ms)
+ AT+VDD=? Report possible parameters.
+ AT+VDD? Show current parameters.
+
+ AT+VSM=x Select audio data format.
+ Possible parameters:
+ 2 = ADPCM-2
+ 3 = ADPCM-3
+ 4 = ADPCM-4
+ 5 = aLAW
+ 6 = uLAW
+ AT+VSM=? Show possible audio formats.
+
+ AT+VTX Start audio playback. Emulator responds
+ with CONNECT and starts sending audio data
+ received from the application via phone line.
+General behavior and description of data formats/protocol.
+ when a connection is made:
+
+ On incoming calls, if the application responds to a RING
+ with ATA, depending on the calling service, the emulator
+ responds with either CONNECT (data call) or VCON (voice call).
+
+ On outgoing voice calls, the emulator responds with VCON
+ upon connection setup.
+
+ Audio recording.
+
+ When receiving audio data, a kind of bisync protocol is used.
+ Upon AT+VRX command, the emulator responds with CONNECT, and
+ starts sending audio data to the application. There are several
+ escape sequences defined, all using DLE (0x10) as Escape char:
+
+ <DLE><ETX> End of audio data. (i.e. caused by a
+ hangup of the remote side) Emulator stops
+ recording, responding with VCON.
+ <DLE><DC4> Abort recording, (send by appl.) Emulator
+ stops recording, sends DLE,ETX.
+ <DLE><DLE> Escape sequence for DLE in data stream.
+ <DLE>0 Touchtone "0" received.
+ ...
+ <DLE>9 Touchtone "9" received.
+ <DLE># Touchtone "#" received.
+ <DLE>* Touchtone "*" received.
+ <DLE>A Touchtone "A" received.
+ <DLE>B Touchtone "B" received.
+ <DLE>C Touchtone "C" received.
+ <DLE>D Touchtone "D" received.
+
+ <DLE>q quiet. Silence detected after non-silence.
+ <DLE>s silence. Silence detected from the
+ start of recording.
+
+ Currently unsupported DLE sequences:
+
+ <DLE>c FAX calling tone received.
+ <DLE>b busy tone received.
+
+ Audio playback.
+
+ When sending audio data, upon AT+VTX command, emulator responds with
+ CONNECT, and starts transferring data from application to the phone line.
+ The same DLE sequences apply to this mode.
+
+ Full-Duplex-Audio:
+
+ When _both_ commands for recording and playback are given in _one_
+ AT-command-line (i.e.: "AT+VTX+VRX"), full-duplex-mode is selected.
+ In this mode, the only way to stop recording is sending <DLE><DC4>
+ and the only way to stop playback is to send <DLE><ETX>.
+
diff --git a/Documentation/isdn/README.avmb1 b/Documentation/isdn/README.avmb1
new file mode 100644
index 0000000..9e07548
--- /dev/null
+++ b/Documentation/isdn/README.avmb1
@@ -0,0 +1,187 @@
+Driver for active AVM Controller.
+
+The driver provides a kernel capi2.0 Interface (kernelcapi) and
+on top of this a User-Level-CAPI2.0-interface (capi)
+and a driver to connect isdn4linux with CAPI2.0 (capidrv).
+The lowlevel interface can be used to implement a CAPI2.0
+also for passive cards since July 1999.
+
+The author can be reached at calle@calle.in-berlin.de.
+The command avmcapictrl is part of the isdn4k-utils.
+t4-files can be found at ftp://ftp.avm.de/cardware/b1/linux/firmware
+
+Currently supported cards:
+ B1 ISA (all versions)
+ B1 PCI
+ T1/T1B (HEMA card)
+ M1
+ M2
+ B1 PCMCIA
+
+Installing
+----------
+
+You need at least /dev/capi20 to load the firmware.
+
+mknod /dev/capi20 c 68 0
+mknod /dev/capi20.00 c 68 1
+mknod /dev/capi20.01 c 68 2
+.
+.
+.
+mknod /dev/capi20.19 c 68 20
+
+Running
+-------
+
+To use the card you need the t4-files to download the firmware.
+AVM GmbH provides several t4-files for the different D-channel
+protocols (b1.t4 for Euro-ISDN). Install these file in /lib/isdn.
+
+if you configure as modules load the modules this way:
+
+insmod /lib/modules/current/misc/capiutil.o
+insmod /lib/modules/current/misc/b1.o
+insmod /lib/modules/current/misc/kernelcapi.o
+insmod /lib/modules/current/misc/capidrv.o
+insmod /lib/modules/current/misc/capi.o
+
+if you have an B1-PCI card load the module b1pci.o
+insmod /lib/modules/current/misc/b1pci.o
+and load the firmware with
+avmcapictrl load /lib/isdn/b1.t4 1
+
+if you have an B1-ISA card load the module b1isa.o
+and add the card by calling
+avmcapictrl add 0x150 15
+and load the firmware by calling
+avmcapictrl load /lib/isdn/b1.t4 1
+
+if you have an T1-ISA card load the module t1isa.o
+and add the card by calling
+avmcapictrl add 0x450 15 T1 0
+and load the firmware by calling
+avmcapictrl load /lib/isdn/t1.t4 1
+
+if you have an PCMCIA card (B1/M1/M2) load the module b1pcmcia.o
+before you insert the card.
+
+Leased Lines with B1
+--------------------
+Init card and load firmware.
+For an D64S use "FV: 1" as phone number
+For an D64S2 use "FV: 1" and "FV: 2" for multilink
+or "FV: 1,2" to use CAPI channel bundling.
+
+/proc-Interface
+-----------------
+
+/proc/capi:
+ dr-xr-xr-x 2 root root 0 Jul 1 14:03 .
+ dr-xr-xr-x 82 root root 0 Jun 30 19:08 ..
+ -r--r--r-- 1 root root 0 Jul 1 14:03 applications
+ -r--r--r-- 1 root root 0 Jul 1 14:03 applstats
+ -r--r--r-- 1 root root 0 Jul 1 14:03 capi20
+ -r--r--r-- 1 root root 0 Jul 1 14:03 capidrv
+ -r--r--r-- 1 root root 0 Jul 1 14:03 controller
+ -r--r--r-- 1 root root 0 Jul 1 14:03 contrstats
+ -r--r--r-- 1 root root 0 Jul 1 14:03 driver
+ -r--r--r-- 1 root root 0 Jul 1 14:03 ncci
+ -r--r--r-- 1 root root 0 Jul 1 14:03 users
+
+/proc/capi/applications:
+ applid level3cnt datablkcnt datablklen ncci-cnt recvqueuelen
+ level3cnt: capi_register parameter
+ datablkcnt: capi_register parameter
+ ncci-cnt: current number of nccis (connections)
+ recvqueuelen: number of messages on receive queue
+ for example:
+1 -2 16 2048 1 0
+2 2 7 2048 1 0
+
+/proc/capi/applstats:
+ applid recvctlmsg nrecvdatamsg nsentctlmsg nsentdatamsg
+ recvctlmsg: capi messages received without DATA_B3_IND
+ recvdatamsg: capi DATA_B3_IND received
+ sentctlmsg: capi messages sent without DATA_B3_REQ
+ sentdatamsg: capi DATA_B3_REQ sent
+ for example:
+1 2057 1699 1721 1699
+
+/proc/capi/capi20: statistics of capi.o (/dev/capi20)
+ minor nopen nrecvdropmsg nrecvctlmsg nrecvdatamsg sentctlmsg sentdatamsg
+ minor: minor device number of capi device
+ nopen: number of calls to devices open
+ nrecvdropmsg: capi messages dropped (messages in recvqueue in close)
+ nrecvctlmsg: capi messages received without DATA_B3_IND
+ nrecvdatamsg: capi DATA_B3_IND received
+ nsentctlmsg: capi messages sent without DATA_B3_REQ
+ nsentdatamsg: capi DATA_B3_REQ sent
+
+ for example:
+1 2 18 0 16 2
+
+/proc/capi/capidrv: statistics of capidrv.o (capi messages)
+ nrecvctlmsg nrecvdatamsg sentctlmsg sentdatamsg
+ nrecvctlmsg: capi messages received without DATA_B3_IND
+ nrecvdatamsg: capi DATA_B3_IND received
+ nsentctlmsg: capi messages sent without DATA_B3_REQ
+ nsentdatamsg: capi DATA_B3_REQ sent
+ for example:
+2780 2226 2256 2226
+
+/proc/capi/controller:
+ controller drivername state cardname controllerinfo
+ for example:
+1 b1pci running b1pci-e000 B1 3.07-01 0xe000 19
+2 t1isa running t1isa-450 B1 3.07-01 0x450 11 0
+3 b1pcmcia running m2-150 B1 3.07-01 0x150 5
+
+/proc/capi/contrstats:
+ controller nrecvctlmsg nrecvdatamsg sentctlmsg sentdatamsg
+ nrecvctlmsg: capi messages received without DATA_B3_IND
+ nrecvdatamsg: capi DATA_B3_IND received
+ nsentctlmsg: capi messages sent without DATA_B3_REQ
+ nsentdatamsg: capi DATA_B3_REQ sent
+ for example:
+1 2845 2272 2310 2274
+2 2 0 2 0
+3 2 0 2 0
+
+/proc/capi/driver:
+ drivername ncontroller
+ for example:
+b1pci 1
+t1isa 1
+b1pcmcia 1
+b1isa 0
+
+/proc/capi/ncci:
+ apllid ncci winsize sendwindow
+ for example:
+1 0x10101 8 0
+
+/proc/capi/users: kernelmodules that use the kernelcapi.
+ name
+ for example:
+capidrv
+capi20
+
+Questions
+---------
+Check out the FAQ (ftp.isdn4linux.de) or subscribe to the
+linux-avmb1@calle.in-berlin.de mailing list by sending
+a mail to majordomo@calle.in-berlin.de with
+subscribe linux-avmb1
+in the body.
+
+German documentation and several scripts can be found at
+ftp://ftp.avm.de/cardware/b1/linux/
+
+Bugs
+----
+If you find any please let me know.
+
+Enjoy,
+
+Carsten Paeth (calle@calle.in-berlin.de)
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
new file mode 100644
index 0000000..a76d748
--- /dev/null
+++ b/Documentation/isdn/README.concap
@@ -0,0 +1,259 @@
+Description of the "concap" encapsulation protocol interface
+============================================================
+
+The "concap" interface is intended to be used by network device
+drivers that need to process an encapsulation protocol.
+It is assumed that the protocol interacts with a linux network device by
+- data transmission
+- connection control (establish, release)
+Thus, the mnemonic: "CONnection CONtrolling eNCAPsulation Protocol".
+
+This is currently only used inside the isdn subsystem. But it might
+also be useful to other kinds of network devices. Thus, if you want
+to suggest changes that improve usability or performance of the
+interface, please let me know. I'm willing to include them in future
+releases (even if I needed to adapt the current isdn code to the
+changed interface).
+
+
+Why is this useful?
+===================
+
+The encapsulation protocol used on top of WAN connections or permanent
+point-to-point links are frequently chosen upon bilateral agreement.
+Thus, a device driver for a certain type of hardware must support
+several different encapsulation protocols at once.
+
+The isdn device driver did already support several different
+encapsulation protocols. The encapsulation protocol is configured by a
+user space utility (isdnctrl). The isdn network interface code then
+uses several case statements which select appropriate actions
+depending on the currently configured encapsulation protocol.
+
+In contrast, LAN network interfaces always used a single encapsulation
+protocol which is unique to the hardware type of the interface. The LAN
+encapsulation is usually done by just sticking a header on the data. Thus,
+traditional linux network device drivers used to process the
+encapsulation protocol directly (usually by just providing a hard_header()
+method in the device structure) using some hardware type specific support
+functions. This is simple, direct and efficient. But it doesn't fit all
+the requirements for complex WAN encapsulations.
+
+
+ The configurability of the encapsulation protocol to be used
+ makes isdn network interfaces more flexible, but also much more
+ complex than traditional lan network interfaces.
+
+
+Many Encapsulation protocols used on top of WAN connections will not just
+stick a header on the data. They also might need to set up or release
+the WAN connection. They also might want to send other data for their
+private purpose over the wire, e.g. ppp does a lot of link level
+negotiation before the first piece of user data can be transmitted.
+Such encapsulation protocols for WAN devices are typically more complex
+than encapsulation protocols for lan devices. Thus, network interface
+code for typical WAN devices also tends to be more complex.
+
+
+In order to support Linux' x25 PLP implementation on top of
+isdn network interfaces I could have introduced yet another branch to
+the various case statements inside drivers/isdn/isdn_net.c.
+This eventually made isdn_net.c even more complex. In addition, it made
+isdn_net.c harder to maintain. Thus, by identifying an abstract
+interface between the network interface code and the encapsulation
+protocol, complexity could be reduced and maintainability could be
+increased.
+
+
+Likewise, a similar encapsulation protocol will frequently be needed by
+several different interfaces of even different hardware type, e.g. the
+synchronous ppp implementation used by the isdn driver and the
+asynchronous ppp implementation used by the ppp driver have a lot of
+similar code in them. By cleanly separating the encapsulation protocol
+from the hardware specific interface stuff such code could be shared
+better in future.
+
+
+When operating over dial-up-connections (e.g. telephone lines via modem,
+non-permanent virtual circuits of wide area networks, ISDN) many
+encapsulation protocols will need to control the connection. Therefore,
+some basic connection control primitives are supported. The type and
+semantics of the connection (i.e the ISO layer where connection service
+is provided) is outside our scope and might be different depending on
+the encapsulation protocol used, e.g. for a ppp module using our service
+on top of a modem connection a connect_request will result in dialing
+a (somewhere else configured) remote phone number. For an X25-interface
+module (LAPB semantics, as defined in Documentation/networking/x25-iface.txt)
+a connect_request will ask for establishing a reliable lapb
+datalink connection.
+
+
+The encapsulation protocol currently provides the following
+service primitives to the network device.
+
+- create a new encapsulation protocol instance
+- delete encapsulation protocol instance and free all its resources
+- initialize (open) the encapsulation protocol instance for use.
+- deactivate (close) an encapsulation protocol instance.
+- process (xmit) data handed down by upper protocol layer
+- receive data from lower (hardware) layer
+- process connect indication from lower (hardware) layer
+- process disconnect indication from lower (hardware) layer
+
+
+The network interface driver accesses those primitives via callbacks
+provided by the encapsulation protocol instance within a
+struct concap_proto_ops.
+
+struct concap_proto_ops{
+
+ /* create a new encapsulation protocol instance of same type */
+ struct concap_proto * (*proto_new) (void);
+
+ /* delete encapsulation protocol instance and free all its resources.
+ cprot may no longer be referenced after calling this */
+ void (*proto_del)(struct concap_proto *cprot);
+
+ /* initialize the protocol's data. To be called at interface startup
+ or when the device driver resets the interface. All services of the
+ encapsulation protocol may be used after this*/
+ int (*restart)(struct concap_proto *cprot,
+ struct net_device *ndev,
+ struct concap_device_ops *dops);
+
+ /* deactivate an encapsulation protocol instance. The encapsulation
+ protocol may not call any *dops methods after this. */
+ int (*close)(struct concap_proto *cprot);
+
+ /* process a frame handed down to us by upper layer */
+ int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
+
+ /* to be called for each data entity received from lower layer*/
+ int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb);
+
+ /* to be called when a connection was set up/down.
+ Protocols that don't process these primitives might fill in
+ dummy methods here */
+ int (*connect_ind)(struct concap_proto *cprot);
+ int (*disconn_ind)(struct concap_proto *cprot);
+};
+
+
+The data structures are defined in the header file include/linux/concap.h.
+
+
+A Network interface using encapsulation protocols must also provide
+some service primitives to the encapsulation protocol:
+
+- request data being submitted by lower layer (device hardware)
+- request a connection being set up by lower layer
+- request a connection being released by lower layer
+
+The encapsulation protocol accesses those primitives via callbacks
+provided by the network interface within a struct concap_device_ops.
+
+struct concap_device_ops{
+
+ /* to request data be submitted by device */
+ int (*data_req)(struct concap_proto *, struct sk_buff *);
+
+ /* Control methods must be set to NULL by devices which do not
+ support connection control. */
+ /* to request a connection be set up */
+ int (*connect_req)(struct concap_proto *);
+
+ /* to request a connection be released */
+ int (*disconn_req)(struct concap_proto *);
+};
+
+The network interface does not explicitly provide a receive service
+because the encapsulation protocol directly calls netif_rx().
+
+
+
+
+An encapsulation protocol itself is actually the
+struct concap_proto{
+ struct net_device *net_dev; /* net device using our service */
+ struct concap_device_ops *dops; /* callbacks provided by device */
+ struct concap_proto_ops *pops; /* callbacks provided by us */
+ int flags;
+ void *proto_data; /* protocol specific private data, to
+ be accessed via *pops methods only*/
+ /*
+ :
+ whatever
+ :
+ */
+};
+
+Most of this is filled in when the device requests the protocol to
+be reset (opend). The network interface must provide the net_dev and
+dops pointers. Other concap_proto members should be considered private
+data that are only accessed by the pops callback functions. Likewise,
+a concap proto should access the network device's private data
+only by means of the callbacks referred to by the dops pointer.
+
+
+A possible extended device structure which uses the connection controlling
+encapsulation services could look like this:
+
+struct concap_device{
+ struct net_device net_dev;
+ struct my_priv /* device->local stuff */
+ /* the my_priv struct might contain a
+ struct concap_device_ops *dops;
+ to provide the device specific callbacks
+ */
+ struct concap_proto *cprot; /* callbacks provided by protocol */
+};
+
+
+
+Misc Thoughts
+=============
+
+The concept of the concap proto might help to reuse protocol code and
+reduce the complexity of certain network interface implementations.
+The trade off is that it introduces yet another procedure call layer
+when processing the protocol. This has of course some impact on
+performance. However, typically the concap interface will be used by
+devices attached to slow lines (like telephone, isdn, leased synchronous
+lines). For such slow lines, the overhead is probably negligible.
+This might no longer hold for certain high speed WAN links (like
+ATM).
+
+
+If general linux network interfaces explicitly supported concap
+protocols (e.g. by a member struct concap_proto* in struct net_device)
+then the interface of the service function could be changed
+by passing a pointer of type (struct net_device*) instead of
+type (struct concap_proto*). Doing so would make many of the service
+functions compatible to network device support functions.
+
+e.g. instead of the concap protocol's service function
+
+ int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
+
+we could have
+
+ int (*encap_and_xmit)(struct net_device *ndev, struct sk_buff *skb);
+
+As this is compatible to the dev->hard_start_xmit() method, the device
+driver could directly register the concap protocol's encap_and_xmit()
+function as its hard_start_xmit() method. This would eliminate one
+procedure call layer.
+
+
+The device's data request function could also be defined as
+
+ int (*data_req)(struct net_device *ndev, struct sk_buff *skb);
+
+This might even allow for some protocol stacking. And the network
+interface might even register the same data_req() function directly
+as its hard_start_xmit() method when a zero layer encapsulation
+protocol is configured. Thus, eliminating the performance penalty
+of the concap interface when a trivial concap protocol is used.
+Nevertheless, the device remains able to support encapsulation
+protocol configuration.
+
diff --git a/Documentation/isdn/README.diversion b/Documentation/isdn/README.diversion
new file mode 100644
index 0000000..bddcd5f
--- /dev/null
+++ b/Documentation/isdn/README.diversion
@@ -0,0 +1,127 @@
+The isdn diversion services are a supporting module working together with
+the isdn4linux and the HiSax module for passive cards.
+Active cards, TAs and cards using a own or other driver than the HiSax
+module need to be adapted to the HL<->LL interface described in a separate
+document. The diversion services may be used with all cards supported by
+the HiSax driver.
+The diversion kernel interface and controlling tool divertctrl were written
+by Werner Cornelius (werner@isdn4linux.de or werner@titro.de) under the
+GNU General Public License.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+Table of contents
+=================
+
+1. Features of the i4l diversion services
+ (Or what can the i4l diversion services do for me)
+
+2. Required hard- and software
+
+3. Compiling, installing and loading/unloading the module
+ Tracing calling and diversion information
+
+4. Tracing calling and diversion information
+
+5. Format of the divert device ASCII output
+
+
+1. Features of the i4l diversion services
+ (Or what can the i4l diversion services do for me)
+
+ The i4l diversion services offers call forwarding and logging normally
+ only supported by isdn phones. Incoming calls may be diverted
+ unconditionally (CFU), when not reachable (CFNR) or on busy condition
+ (CFB).
+ The diversions may be invoked statically in the providers exchange
+ as normally done by isdn phones. In this case all incoming calls
+ with a special (or all) service identifiers are forwarded if the
+ forwarding reason is met. Activated static services may also be
+ interrogated (queried).
+ The i4l diversion services additionally offers a dynamic version of
+ call forwarding which is not preprogrammed inside the providers exchange
+ but dynamically activated by i4l.
+ In this case all incoming calls are checked by rules that may be
+ compared to the mechanism of ipfwadm or ipchains. If a given rule matches
+ the checking process is finished and the rule matching will be applied
+ to the call.
+ The rules include primary and secondary service identifiers, called
+ number and subaddress, callers number and subaddress and whether the rule
+ matches to all filtered calls or only those when all B-channel resources
+ are exhausted.
+ Actions that may be invoked by a rule are ignore, proceed, reject,
+ direct divert or delayed divert of a call.
+ All incoming calls matching a rule except the ignore rule a reported and
+ logged as ASCII via the proc filesystem (/proc/net/isdn/divert). If proceed
+ is selected the call will be held in a proceeding state (without ringing)
+ for a certain amount of time to let an external program or client decide
+ how to handle the call.
+
+
+2. Required hard- and software
+
+ For using the i4l diversion services the isdn line must be of a EURO/DSS1
+ type. Additionally the i4l services only work together with the HiSax
+ driver for passive isdn cards. All HiSax supported cards may be used for
+ the diversion purposes.
+ The static diversion services require the provider having static services
+ CFU, CFNR, CFB activated on an MSN-line. The static services may not be
+ used on a point-to-point connection. Further the static services are only
+ available in some countries (for example germany). Countries requiring the
+ keypad protocol for activating static diversions (like the netherlands) are
+ not supported but may use the tty devices for this purpose.
+ The dynamic diversion services may be used in all countries if the provider
+ enables the feature CF (call forwarding). This should work on both MSN- and
+ point-to-point lines.
+ To add and delete rules the additional divertctrl program is needed. This
+ program is part of the isdn4kutils package.
+
+3. Compiling, installing and loading/unloading the module
+ Tracing calling and diversion information
+
+
+ To compile the i4l code with diversion support you need to say yes to the
+ DSS1 diversion services when selecting the i4l options in the kernel
+ config (menuconfig or config).
+ After having properly activated a make modules and make modules_install all
+ required modules will be correctly installed in the needed modules dirs.
+ As the diversion services are currently not included in the scripts of most
+ standard distributions you will have to add a "insmod dss1_divert" after
+ having loaded the global isdn module.
+ The module can be loaded without any command line parameters.
+ If the module is actually loaded and active may be checked with a
+ "cat /proc/modules" or "ls /proc/net/isdn/divert". The divert file is
+ dynamically created by the diversion module and removed when the module is
+ unloaded.
+
+
+4. Tracing calling and diversion information
+
+ You also may put a "cat /proc/net/isdn/divert" in the background with the
+ output redirected to a file. Then all actions of the module are logged.
+ The divert file in the proc system may be opened more than once, so in
+ conjunction with inetd and a small remote client on other machines inside
+ your network incoming calls and reactions by the module may be shown on
+ every listening machine.
+ If a call is reported as proceeding an external program or client may
+ specify during a certain amount of time (normally 4 to 10 seconds) what
+ to do with that call.
+ To unload the module all open files to the device in the proc system must
+ be closed. Otherwise the module (and isdn.o) may not be unloaded.
+
+5. Format of the divert device ASCII output
+
+ To be done later
+
diff --git a/Documentation/isdn/README.fax b/Documentation/isdn/README.fax
new file mode 100644
index 0000000..5314958
--- /dev/null
+++ b/Documentation/isdn/README.fax
@@ -0,0 +1,45 @@
+
+Fax with isdn4linux
+===================
+
+When enabled during kernel configuration, the tty emulator
+of the ISDN subsystem is capable of the Fax Class 2 commands.
+
+This only makes sense under the following conditions :
+
+- You need the commands as dummy, because you are using
+ hylafax (with patch) for AVM capi.
+- You want to use the fax capabilities of your isdn-card.
+ (supported cards are listed below)
+
+
+NOTE: This implementation does *not* support fax with passive
+ ISDN-cards (known as softfax). The low-level driver of
+ the ISDN-card and/or the card itself must support this.
+
+
+Supported ISDN-Cards
+--------------------
+
+Eicon DIVA Server BRI/PCI
+ - full support with both B-channels.
+
+Eicon DIVA Server 4BRI/PCI
+ - full support with all B-channels.
+
+Eicon DIVA Server PRI/PCI
+ - full support on amount of B-channels
+ depending on DSPs on board.
+
+
+
+The command set is known as Class 2 (not Class 2.0) and
+can be activated by AT+FCLASS=2
+
+
+The interface between the link-level-module and the hardware-level driver
+is described in the files INTERFACE.fax and INTERFACE.
+
+Armin
+mac@melware.de
+
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset
new file mode 100644
index 0000000..55b2852
--- /dev/null
+++ b/Documentation/isdn/README.gigaset
@@ -0,0 +1,318 @@
+GigaSet 307x Device Driver
+==========================
+
+1. Requirements
+ ------------
+1.1. Hardware
+ --------
+ This release supports the connection of the Gigaset 307x/417x family of
+ ISDN DECT bases via Gigaset M101 Data, Gigaset M105 Data or direct USB
+ connection. The following devices are reported to be compatible:
+
+ Bases:
+ Siemens Gigaset 3070/3075 isdn
+ Siemens Gigaset 4170/4175 isdn
+ Siemens Gigaset SX205/255
+ Siemens Gigaset SX353
+ T-Com Sinus 45 [AB] isdn
+ T-Com Sinus 721X[A] [SE]
+ Vox Chicago 390 ISDN (KPN Telecom)
+
+ RS232 data boxes:
+ Siemens Gigaset M101 Data
+ T-Com Sinus 45 Data 1
+
+ USB data boxes:
+ Siemens Gigaset M105 Data
+ Siemens Gigaset USB Adapter DECT
+ T-Com Sinus 45 Data 2
+ T-Com Sinus 721 data
+ Chicago 390 USB (KPN)
+
+ See also http://www.erbze.info/sinus_gigaset.htm and
+ http://gigaset307x.sourceforge.net/
+
+ We had also reports from users of Gigaset M105 who could use the drivers
+ with SX 100 and CX 100 ISDN bases (only in unimodem mode, see section 2.4.)
+ If you have another device that works with our driver, please let us know.
+
+ Chances of getting an USB device to work are good if the output of
+ lsusb
+ at the command line contains one of the following:
+ ID 0681:0001
+ ID 0681:0002
+ ID 0681:0009
+ ID 0681:0021
+ ID 0681:0022
+
+1.2. Software
+ --------
+ The driver works with ISDN4linux and so can be used with any software
+ which is able to use ISDN4linux for ISDN connections (voice or data).
+ CAPI4Linux support is planned but not yet available.
+
+ There are some user space tools available at
+ http://sourceforge.net/projects/gigaset307x/
+ which provide access to additional device specific functions like SMS,
+ phonebook or call journal.
+
+
+2. How to use the driver
+ ---------------------
+2.1. Modules
+ -------
+ To get the device working, you have to load the proper kernel module. You
+ can do this using
+ modprobe modulename
+ where modulename is ser_gigaset (M101), usb_gigaset (M105), or
+ bas_gigaset (direct USB connection to the base).
+
+ The module ser_gigaset provides a serial line discipline N_GIGASET_M101
+ which drives the device through the regular serial line driver. To use it,
+ run the Gigaset M101 daemon "gigasetm101d" (also available from
+ http://sourceforge.net/projects/gigaset307x/) with the device file of the
+ RS232 port to the M101 as an argument, for example:
+ gigasetm101d /dev/ttyS1
+ This will open the device file, set its line discipline to N_GIGASET_M101,
+ and then sleep in the background, keeping the device open so that the
+ line discipline remains active. To deactivate it, kill the daemon, for
+ example with
+ killall gigasetm101d
+ before disconnecting the device.
+
+2.2. Device nodes for user space programs
+ ------------------------------------
+ The device can be accessed from user space (eg. by the user space tools
+ mentioned in 1.2.) through the device nodes:
+
+ - /dev/ttyGS0 for M101 (RS232 data boxes)
+ - /dev/ttyGU0 for M105 (USB data boxes)
+ - /dev/ttyGB0 for the base driver (direct USB connection)
+
+ You can also select a "default device" which is used by the frontends when
+ no device node is given as parameter, by creating a symlink /dev/ttyG to
+ one of them, eg.:
+
+ ln -s /dev/ttyGB0 /dev/ttyG
+
+2.3. ISDN4linux
+ ----------
+ This is the "normal" mode of operation. After loading the module you can
+ set up the ISDN system just as you'd do with any ISDN card.
+ Your distribution should provide some configuration utility.
+ If not, you can use some HOWTOs like
+ http://www.linuxhaven.de/dlhp/HOWTO/DE-ISDN-HOWTO-5.html
+ If this doesn't work, because you have some recent device like SX100 where
+ debug output (see section 3.2.) shows something like this when dialing
+ CMD Received: ERROR
+ Available Params: 0
+ Connection State: 0, Response: -1
+ gigaset_process_response: resp_code -1 in ConState 0 !
+ Timeout occurred
+ you might need to use unimodem mode:
+
+2.4. Unimodem mode
+ -------------
+ This is needed for some devices [e.g. SX100] as they have problems with
+ the "normal" commands.
+
+ If you have installed the command line tool gigacontr, you can enter
+ unimodem mode using
+ gigacontr --mode unimodem
+ You can switch back using
+ gigacontr --mode isdn
+
+ You can also load the driver using e.g.
+ modprobe usb_gigaset startmode=0
+ to prevent the driver from starting in "isdn4linux mode".
+
+ In this mode the device works like a modem connected to a serial port
+ (the /dev/ttyGU0, ... mentioned above) which understands the commands
+ ATZ init, reset
+ => OK or ERROR
+ ATD
+ ATDT dial
+ => OK, CONNECT,
+ BUSY,
+ NO DIAL TONE,
+ NO CARRIER,
+ NO ANSWER
+ <pause>+++<pause> change to command mode when connected
+ ATH hangup
+
+ You can use some configuration tool of your distribution to configure this
+ "modem" or configure pppd/wvdial manually. There are some example ppp
+ configuration files and chat scripts in the gigaset-VERSION/ppp directory
+ in the driver packages from http://sourceforge.net/projects/gigaset307x/.
+ Please note that the USB drivers are not able to change the state of the
+ control lines (the M105 driver can be configured to use some undocumented
+ control requests, if you really need the control lines, though). This means
+ you must use "Stupid Mode" if you are using wvdial or you should use the
+ nocrtscts option of pppd.
+ You must also assure that the ppp_async module is loaded with the parameter
+ flag_time=0. You can do this e.g. by adding a line like
+
+ options ppp_async flag_time=0
+
+ to /etc/modprobe.conf. If your distribution has some local module
+ configuration file like /etc/modprobe.conf.local,
+ using that should be preferred.
+
+2.5. Call-ID (CID) mode
+ ------------------
+ Call-IDs are numbers used to tag commands to, and responses from, the
+ Gigaset base in order to support the simultaneous handling of multiple
+ ISDN calls. Their use can be enabled ("CID mode") or disabled ("Unimodem
+ mode"). Without Call-IDs (in Unimodem mode), only a very limited set of
+ functions is available. It allows outgoing data connections only, but
+ does not signal incoming calls or other base events.
+
+ DECT cordless data devices (M10x) permanently occupy the cordless
+ connection to the base while Call-IDs are activated. As the Gigaset
+ bases only support one DECT data connection at a time, this prevents
+ other DECT cordless data devices from accessing the base.
+
+ During active operation, the driver switches to the necessary mode
+ automatically. However, for the reasons above, the mode chosen when
+ the device is not in use (idle) can be selected by the user.
+ - If you want to receive incoming calls, you can use the default
+ settings (CID mode).
+ - If you have several DECT data devices (M10x) which you want to use
+ in turn, select Unimodem mode by passing the parameter "cidmode=0" to
+ the driver ("modprobe usb_gigaset cidmode=0" or modprobe.conf).
+
+ If you want both of these at once, you are out of luck.
+
+ You can also use /sys/class/tty/ttyGxy/cidmode for changing the CID mode
+ setting (ttyGxy is ttyGU0 or ttyGB0).
+
+2.6. M105 Undocumented USB Requests
+ ------------------------------
+
+ The Gigaset M105 USB data box understands a couple of useful, but
+ undocumented USB commands. These requests are not used in normal
+ operation (for wireless access to the base), but are needed for access
+ to the M105's own configuration mode (registration to the base, baudrate
+ and line format settings, device status queries) via the gigacontr
+ utility. Their use is disabled in the driver by default for safety
+ reasons but can be enabled by setting the kernel configuration option
+ "Support for undocumented USB requests" (GIGASET_UNDOCREQ) to "Y" and
+ recompiling.
+
+
+3. Troubleshooting
+ ---------------
+3.1. Solutions to frequently reported problems
+ -----------------------------------------
+ Problem:
+ You have a slow provider and isdn4linux gives up dialing too early.
+ Solution:
+ Load the isdn module using the dialtimeout option. You can do this e.g.
+ by adding a line like
+
+ options isdn dialtimeout=15
+
+ to /etc/modprobe.conf. If your distribution has some local module
+ configuration file like /etc/modprobe.conf.local,
+ using that should be preferred.
+
+ Problem:
+ Your isdn script aborts with a message about isdnlog.
+ Solution:
+ Try deactivating (or commenting out) isdnlog. This driver does not
+ support it.
+
+ Problem:
+ You have two or more DECT data adapters (M101/M105) and only the
+ first one you turn on works.
+ Solution:
+ Select Unimodem mode for all DECT data adapters. (see section 2.4.)
+
+3.2. Telling the driver to provide more information
+ ----------------------------------------------
+ Building the driver with the "Gigaset debugging" kernel configuration
+ option (CONFIG_GIGASET_DEBUG) gives it the ability to produce additional
+ information useful for debugging.
+
+ You can control the amount of debugging information the driver produces by
+ writing an appropriate value to /sys/module/gigaset/parameters/debug, e.g.
+ echo 0 > /sys/module/gigaset/parameters/debug
+ switches off debugging output completely,
+ echo 0x10a020 > /sys/module/gigaset/parameters/debug
+ enables the standard set of debugging output messages. These values are
+ bit patterns where every bit controls a certain type of debugging output.
+ See the constants DEBUG_* in the source file gigaset.h for details.
+
+ The initial value can be set using the debug parameter when loading the
+ module "gigaset", e.g. by adding a line
+ options gigaset debug=0
+ to /etc/modprobe.conf, ...
+
+ Generated debugging information can be found
+ - as output of the command
+ dmesg
+ - in system log files written by your syslog daemon, usually
+ in /var/log/, e.g. /var/log/messages.
+
+3.3. Reporting problems and bugs
+ ---------------------------
+ If you can't solve problems with the driver on your own, feel free to
+ use one of the forums, bug trackers, or mailing lists on
+ http://sourceforge.net/projects/gigaset307x
+ or write an electronic mail to the maintainers.
+
+ Try to provide as much information as possible, such as
+ - distribution
+ - kernel version (uname -r)
+ - gcc version (gcc --version)
+ - hardware architecture (uname -m, ...)
+ - type and firmware version of your device (base and wireless module,
+ if any)
+ - output of "lsusb -v" (if using an USB device)
+ - error messages
+ - relevant system log messages (it would help if you activate debug
+ output as described in 3.2.)
+
+ For help with general configuration problems not specific to our driver,
+ such as isdn4linux and network configuration issues, please refer to the
+ appropriate forums and newsgroups.
+
+3.4. Reporting problem solutions
+ ---------------------------
+ If you solved a problem with our drivers, wrote startup scripts for your
+ distribution, ... feel free to contact us (using one of the places
+ mentioned in 3.3.). We'd like to add scripts, hints, documentation
+ to the driver and/or the project web page.
+
+
+4. Links, other software
+ ---------------------
+ - Sourceforge project developing this driver and associated tools
+ http://sourceforge.net/projects/gigaset307x
+ - Yahoo! Group on the Siemens Gigaset family of devices
+ http://de.groups.yahoo.com/group/Siemens-Gigaset
+ - Siemens Gigaset/T-Sinus compatibility table
+ http://www.erbze.info/sinus_gigaset.htm
+
+
+5. Credits
+ -------
+ Thanks to
+
+ Karsten Keil
+ for his help with isdn4linux
+ Deti Fliegl
+ for his base driver code
+ Dennis Dietrich
+ for his kernel 2.6 patches
+ Andreas Rummel
+ for his work and logs to get unimodem mode working
+ Andreas Degert
+ for his logs and patches to get cx 100 working
+ Dietrich Feist
+ for his generous donation of one M105 and two M101 cordless adapters
+ Christoph Schweers
+ for his generous donation of a M34 device
+
+ and all the other people who sent logs and other information.
+
diff --git a/Documentation/isdn/README.hfc-pci b/Documentation/isdn/README.hfc-pci
new file mode 100644
index 0000000..e8a4ef0
--- /dev/null
+++ b/Documentation/isdn/README.hfc-pci
@@ -0,0 +1,41 @@
+The driver for the HFC-PCI and HFC-PCI-A chips from CCD may be used
+for many OEM cards using this chips.
+Additionally the driver has a special feature which makes it possible
+to read the echo-channel of the isdn bus. So all frames in both directions
+may be logged.
+When the echo logging feature is used the number of available B-channels
+for a HFC-PCI card is reduced to 1. Of course this is only relevant to
+the card, not to the isdn line.
+To activate the echo mode the following ioctls must be entered:
+
+hisaxctrl <driver/cardname> 10 1
+
+This reduces the available channels to 1. There must not be open connections
+through this card when entering the command.
+And then:
+
+hisaxctrl <driver/cardname> 12 1
+
+This enables the echo mode. If Hex logging is activated the isdnctrlx
+devices show a output with a line beginning of HEX: for the providers
+exchange and ECHO: for isdn devices sending to the provider.
+
+If more than one HFC-PCI cards are installed, a specific card may be selected
+at the hisax module load command line. Supply the load command with the desired
+IO-address of the desired card.
+Example:
+There tree cards installed in your machine at IO-base addresses 0xd000, 0xd400
+and 0xdc00
+If you want to use the card at 0xd400 standalone you should supply the insmod
+or depmod with type=35 io=0xd400.
+If you want to use all three cards, but the order needs to be at 0xdc00,0xd400,
+0xd000 you may give the parameters type=35,35,35 io=0xdc00,0xd400,0xd00
+Then the desired card will be the initialised in the desired order.
+If the io parameter is used the io addresses of all used cards should be
+supplied else the parameter is assumed 0 and a auto search for a free card is
+invoked which may not give the wanted result.
+
+Comments and reports to werner@isdn4linux.de or werner@isdn-development.de
+
+
+
diff --git a/Documentation/isdn/README.hysdn b/Documentation/isdn/README.hysdn
new file mode 100644
index 0000000..eeca11f
--- /dev/null
+++ b/Documentation/isdn/README.hysdn
@@ -0,0 +1,195 @@
+$Id: README.hysdn,v 1.3.6.1 2001/02/10 14:41:19 kai Exp $
+The hysdn driver has been written by
+Werner Cornelius (werner@isdn4linux.de or werner@titro.de)
+for Hypercope GmbH Aachen Germany. Hypercope agreed to publish this driver
+under the GNU General Public License.
+
+The CAPI 2.0-support was added by Ulrich Albrecht (ualbrecht@hypercope.de)
+for Hypercope GmbH Aachen, Germany.
+
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+Table of contents
+=================
+
+1. About the driver
+
+2. Loading/Unloading the driver
+
+3. Entries in the /proc filesystem
+
+4. The /proc/net/hysdn/cardconfX file
+
+5. The /proc/net/hysdn/cardlogX file
+
+6. Where to get additional info and help
+
+
+1. About the driver
+
+ The drivers/isdn/hysdn subdir contains a driver for HYPERCOPEs active
+ PCI isdn cards Champ, Ergo and Metro. To enable support for this cards
+ enable ISDN support in the kernel config and support for HYSDN cards in
+ the active cards submenu. The driver may only be compiled and used if
+ support for loadable modules and the process filesystem have been enabled.
+
+ These cards provide two different interfaces to the kernel. Without the
+ optional CAPI 2.0 support, they register as ethernet card. IP-routing
+ to a ISDN-destination is performed on the card itself. All necessary
+ handlers for various protocols like ppp and others as well as config info
+ and firmware may be fetched from Hypercopes WWW-Site www.hypercope.de.
+
+ With CAPI 2.0 support enabled, the card can also be used as a CAPI 2.0
+ compliant devices with either CAPI 2.0 applications
+ (check isdn4k-utils) or -using the capidrv module- as a regular
+ isdn4linux device. This is done via the same mechanism as with the
+ active AVM cards and in fact uses the same module.
+
+
+2. Loading/Unloading the driver
+
+ The module has no command line parameters and auto detects up to 10 cards
+ in the id-range 0-9.
+ If a loaded driver shall be unloaded all open files in the /proc/net/hysdn
+ subdir need to be closed and all ethernet interfaces allocated by this
+ driver must be shut down. Otherwise the module counter will avoid a module
+ unload.
+
+ If you are using the CAPI 2.0-interface, make sure to load/modprobe the
+ kernelcapi-module first.
+
+ If you plan to use the capidrv-link to isdn4linux, make sure to load
+ capidrv.o after all modules using this driver (i.e. after hysdn and
+ any avm-specific modules).
+
+3. Entries in the /proc filesystem
+
+ When the module has been loaded it adds the directory hysdn in the
+ /proc/net tree. This directory contains exactly 2 file entries for each
+ card. One is called cardconfX and the other cardlogX, where X is the
+ card id number from 0 to 9.
+ The cards are numbered in the order found in the PCI config data.
+
+4. The /proc/net/hysdn/cardconfX file
+
+ This file may be read to get by everyone to get info about the cards type,
+ actual state, available features and used resources.
+ The first 3 entries (id, bus and slot) are PCI info fields, the following
+ type field gives the information about the cards type:
+
+ 4 -> Ergo card (server card with 2 b-chans)
+ 5 -> Metro card (server card with 4 or 8 b-chans)
+ 6 -> Champ card (client card with 2 b-chans)
+
+ The following 3 fields show the hardware assignments for irq, iobase and the
+ dual ported memory (dp-mem).
+ The fields b-chans and fax-chans announce the available card resources of
+ this types for the user.
+ The state variable indicates the actual drivers state for this card with the
+ following assignments.
+
+ 0 -> card has not been booted since driver load
+ 1 -> card booting is actually in progess
+ 2 -> card is in an error state due to a previous boot failure
+ 3 -> card is booted and active
+
+ And the last field (device) shows the name of the ethernet device assigned
+ to this card. Up to the first successful boot this field only shows a -
+ to tell that no net device has been allocated up to now. Once a net device
+ has been allocated it remains assigned to this card, even if a card is
+ rebooted and an boot error occurs.
+
+ Writing to the cardconfX file boots the card or transfers config lines to
+ the cards firmware. The type of data is automatically detected when the
+ first data is written. Only root has write access to this file.
+ The firmware boot files are normally called hyclient.pof for client cards
+ and hyserver.pof for server cards.
+ After successfully writing the boot file, complete config files or single
+ config lines may be copied to this file.
+ If an error occurs the return value given to the writing process has the
+ following additional codes (decimal):
+
+ 1000 Another process is currently bootng the card
+ 1001 Invalid firmware header
+ 1002 Boards dual-port RAM test failed
+ 1003 Internal firmware handler error
+ 1004 Boot image size invalid
+ 1005 First boot stage (bootstrap loader) failed
+ 1006 Second boot stage failure
+ 1007 Timeout waiting for card ready during boot
+ 1008 Operation only allowed in booted state
+ 1009 Config line too long
+ 1010 Invalid channel number
+ 1011 Timeout sending config data
+
+ Additional info about error reasons may be fetched from the log output.
+
+5. The /proc/net/hysdn/cardlogX file
+
+ The cardlogX file entry may be opened multiple for reading by everyone to
+ get the cards and drivers log data. Card messages always start with the
+ keyword LOG. All other lines are output from the driver.
+ The driver log data may be redirected to the syslog by selecting the
+ appropriate bitmask. The cards log messages will always be send to this
+ interface but never to the syslog.
+
+ A root user may write a decimal or hex (with 0x) value t this file to select
+ desired output options. As mentioned above the cards log dat is always
+ written to the cardlog file independent of the following options only used
+ to check and debug the driver itself:
+
+ For example:
+ echo "0x34560078" > /proc/net/hysdn/cardlog0
+ to output the hex log mask 34560078 for card 0.
+
+ The written value is regarded as an unsigned 32-Bit value, bit ored for
+ desired output. The following bits are already assigned:
+
+ 0x80000000 All driver log data is alternatively via syslog
+ 0x00000001 Log memory allocation errors
+ 0x00000010 Firmware load start and close are logged
+ 0x00000020 Log firmware record parser
+ 0x00000040 Log every firmware write actions
+ 0x00000080 Log all card related boot messages
+ 0x00000100 Output all config data sent for debugging purposes
+ 0x00000200 Only non comment config lines are shown wth channel
+ 0x00000400 Additional conf log output
+ 0x00001000 Log the asynchronous scheduler actions (config and log)
+ 0x00100000 Log all open and close actions to /proc/net/hysdn/card files
+ 0x00200000 Log all actions from /proc file entries
+ 0x00010000 Log network interface init and deinit
+
+6. Where to get additional info and help
+
+ If you have any problems concerning the driver or configuration contact
+ the Hypercope support team (support@hypercope.de) and or the authors
+ Werner Cornelius (werner@isdn4linux or cornelius@titro.de) or
+ Ulrich Albrecht (ualbrecht@hypercope.de).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/Documentation/isdn/README.icn b/Documentation/isdn/README.icn
new file mode 100644
index 0000000..13f833d
--- /dev/null
+++ b/Documentation/isdn/README.icn
@@ -0,0 +1,148 @@
+$Id: README.icn,v 1.7 2000/08/06 09:22:51 armin Exp $
+
+You can get the ICN-ISDN-card from:
+
+Thinking Objects Software GmbH
+Versbacher Röthe 159
+97078 Würzburg
+Tel: +49 931 2877950
+Fax: +49 931 2877951
+
+email info@think.de
+WWW http:/www.think.de
+
+
+The card communicates with the PC by two interfaces:
+ 1. A range of 4 successive port-addresses, whose base address can be
+ configured with the switches.
+ 2. A memory window with 16KB-256KB size, which can be setup in 16k steps
+ over the whole range of 16MB. Isdn4linux only uses a 16k window.
+ The base address of the window can be configured when loading
+ the lowlevel-module (see README). If using more than one card,
+ all cards are mapped to the same window and activated as needed.
+
+Setting up the IO-address dipswitches for the ICN-ISDN-card:
+
+ Two types of cards exist, one with dip-switches and one with
+ hook-switches.
+
+ 1. Setting for the card with hook-switches:
+
+ (0 = switch closed, 1 = switch open)
+
+ S3 S2 S1 Base-address
+ 0 0 0 0x300
+ 0 0 1 0x310
+ 0 1 0 0x320 (Default for isdn4linux)
+ 0 1 1 0x330
+ 1 0 0 0x340
+ 1 0 1 0x350
+ 1 1 0 0x360
+ 1 1 1 NOT ALLOWED!
+
+ 2. Setting for the card with dip-switches:
+
+ (0 = switch closed, 1 = switch open)
+
+ S1 S2 S3 S4 Base-Address
+ 0 0 0 0 0x300
+ 0 0 0 1 0x310
+ 0 0 1 0 0x320 (Default for isdn4linux)
+ 0 0 1 1 0x330
+ 0 1 0 0 0x340
+ 0 1 0 1 0x350
+ 0 1 1 0 0x360
+ 0 1 1 1 NOT ALLOWED!
+ 1 0 0 0 0x308
+ 1 0 0 1 0x318
+ 1 0 1 0 0x328
+ 1 0 1 1 0x338
+ 1 1 0 0 0x348
+ 1 1 0 1 0x358
+ 1 1 1 0 0x368
+ 1 1 1 1 NOT ALLOWED!
+
+The ICN driver may be built into the kernel or as a module. Initialization
+depends on how the driver is built:
+
+Driver built into the kernel:
+
+ The ICN driver can be configured using the commandline-feature while
+ loading the kernel with LILO or LOADLIN. It accepts the following syntax:
+
+ icn=p,m[,idstring1[,idstring2]]
+
+ where
+
+ p = portbase (default: 0x320)
+ m = shared memory (default: 0xd0000)
+
+ When using the ICN double card (4B), you MUST define TWO idstrings.
+ idstring must start with a character! There is no way for the driver
+ to distinguish between a 2B and 4B type card. Therefore, by supplying
+ TWO idstrings, you tell the driver that you have a 4B installed.
+
+ If you like to use more than one card, you can use the program
+ "icnctrl" from the utility-package to configure additional cards.
+ You need to configure shared memory only once, since the icn-driver
+ maps all cards into the same address-space.
+
+ Using the "icnctrl"-utility, portbase and shared memory can also be
+ changed during runtime.
+
+ The D-channel protocol is configured by loading different firmware
+ into the card's memory using the "icnctrl"-utility.
+
+
+Driver built as module:
+
+ The module icn.o can be configured during "insmod'ing" it by
+ appending its parameters to the insmod-commandline. The following
+ syntax is accepted:
+
+ portbase=p membase=m icn_id=idstring [icn_id2=idstring2]
+
+ where p, m, idstring1 and idstring2 have the same meanings as the
+ parameters described for the kernel-version above.
+
+ When using the ICN double card (4B), you MUST define TWO idstrings.
+ idstring must start with a character! There is no way for the driver
+ to distinguish between a 2B and 4B type card. Therefore, by supplying
+ TWO idstrings, you tell the driver that you have a 4B installed.
+
+ Using the "icnctrl"-utility, the same features apply to the modularized
+ version like to the kernel-builtin one.
+
+ The D-channel protocol is configured by loading different firmware
+ into the card's memory using the "icnctrl"-utility.
+
+Loading the firmware into the card:
+
+ The firmware is supplied together with the isdn4k-utils package. It
+ can be found in the subdirectory icnctrl/firmware/
+
+ There are 3 files:
+
+ loadpg.bin - Image of the bootstrap loader.
+ pc_1t_ca.bin - Image of firmware for german 1TR6 protocol.
+ pc_eu_ca.bin - Image if firmware for EDSS1 (Euro-ISDN) protocol.
+
+ Assuming you have installed the utility-package correctly, the firmware
+ will be downloaded into the 2B-card using the following command:
+
+ icnctrl -d Idstring load /etc/isdn/loadpg.bin /etc/isdn/pc_XX_ca.bin
+
+ where XX is either "1t" or "eu", depending on the D-Channel protocol
+ used on your S0-bus and Idstring is the Name of the card, given during
+ insmod-time or (for kernel-builtin driver) on the kernel commandline.
+
+ To load a 4B-card, the same command is used, except a second firmware
+ file is appended to the commandline of icnctrl.
+
+ -> After downloading firmware, the two LEDs at the back cover of the card
+ (ICN-4B: 4 LEDs) must be blinking intermittently now. If a connection
+ is up, the corresponding led is lit continuously.
+
+ For further documentation (adding more ICN-cards), refer to the manpage
+ icnctrl.8 which is included in the isdn4k-utils package.
+
diff --git a/Documentation/isdn/README.mISDN b/Documentation/isdn/README.mISDN
new file mode 100644
index 0000000..cd8bf92
--- /dev/null
+++ b/Documentation/isdn/README.mISDN
@@ -0,0 +1,6 @@
+mISDN is a new modular ISDN driver, in the long term it should replace
+the old I4L driver architecture for passiv ISDN cards.
+It was designed to allow a broad range of applications and interfaces
+but only have the basic function in kernel, the interface to the user
+space is based on sockets with a own address family AF_ISDN.
+
diff --git a/Documentation/isdn/README.pcbit b/Documentation/isdn/README.pcbit
new file mode 100644
index 0000000..5125002
--- /dev/null
+++ b/Documentation/isdn/README.pcbit
@@ -0,0 +1,40 @@
+------------------------------------------------------------------------------
+ README file for the PCBIT-D Device Driver.
+------------------------------------------------------------------------------
+
+The PCBIT is a Euro ISDN adapter manufactured in Portugal by Octal and
+developed in cooperation with Portugal Telecom and Inesc.
+The driver interfaces with the standard kernel isdn facilities
+originally developed by Fritz Elfert in the isdn4linux project.
+
+The common versions of the pcbit board require a firmware that is
+distributed (and copyrighted) by the manufacturer. To load this
+firmware you need "pcbitctl" available on the standard isdn4k-utils
+package or in the pcbit package available in:
+
+ftp://ftp.di.fc.ul.pt/pub/systems/Linux/isdn
+
+Known Limitations:
+
+- The board reset procedure is at the moment incorrect and will only
+allow you to load the firmware after a hard reset.
+
+- Only HDLC in B-channels is supported at the moment. There is no
+current support for X.25 in B or D channels nor LAPD in B
+channels. The main reason is that these two other protocol modes have,
+to my knowledge, very little use. If you want to see them implemented
+*do* send me a mail.
+
+- The driver often triggers errors in the board that I and the
+manufacturer believe to be caused by bugs in the firmware. The current
+version includes several procedures for error recovery that should
+allow normal operation. Plans for the future include cooperation with
+the manufacturer in order to solve this problem.
+
+Information/hints/help can be obtained in the linux isdn
+mailing list (isdn4linux@listserv.isdn4linux.de) or directly from me.
+
+regards,
+ Pedro.
+
+<roque@di.fc.ul.pt>
diff --git a/Documentation/isdn/README.sc b/Documentation/isdn/README.sc
new file mode 100644
index 0000000..1153cd9
--- /dev/null
+++ b/Documentation/isdn/README.sc
@@ -0,0 +1,281 @@
+Welcome to Beta Release 2 of the combination ISDN driver for SpellCaster's
+ISA ISDN adapters. Please note this release 2 includes support for the
+DataCommute/BRI and TeleCommute/BRI adapters only and any other use is
+guaranteed to fail. If you have a DataCommute/PRI installed in the test
+computer, we recommend removing it as it will be detected but will not
+be usable. To see what we have done to Beta Release 2, see section 3.
+
+Speaking of guarantees, THIS IS BETA SOFTWARE and as such contains
+bugs and defects either known or unknown. Use this software at your own
+risk. There is NO SUPPORT for this software. Some help may be available
+through the web site or the mailing list but such support is totally at
+our own option and without warranty. If you choose to assume all and
+total risk by using this driver, we encourage you to join the beta
+mailing list.
+
+To join the Linux beta mailing list, send a message to:
+majordomo@spellcast.com with the words "subscribe linux-beta" as the only
+contents of the message. Do not include a signature. If you choose to
+remove yourself from this list at a later date, send another message to
+the same address with the words "unsubscribe linux-beta" as its only
+contents.
+
+TABLE OF CONTENTS
+-----------------
+ 1. Introduction
+ 1.1 What is ISDN4Linux?
+ 1.2 What is different between this driver and previous drivers?
+ 1.3 How do I setup my system with the correct software to use
+ this driver release?
+
+ 2. Basic Operations
+ 2.1 Unpacking and installing the driver
+ 2.2 Read the man pages!!!
+ 2.3 Installing the driver
+ 2.4 Removing the driver
+ 2.5 What to do if it doesn't load
+ 2.6 How to setup ISDN4Linux with the driver
+
+ 3. Beta Change Summaries and Miscellaneous Notes
+
+1. Introduction
+---------------
+
+The revision 2 Linux driver for SpellCaster ISA ISDN adapters is built
+upon ISDN4Linux available separately or as included in Linux 2.0 and later.
+The driver will support a maximum of 4 adapters in any one system of any
+type including DataCommute/BRI, DataCommute/PRI and TeleCommute/BRI for a
+maximum of 92 channels for host. The driver is supplied as a module in
+source form and needs to be complied before it can be used. It has been
+tested on Linux 2.0.20.
+
+1.1 What Is ISDN4Linux
+
+ISDN4Linux is a driver and set of tools used to access and use ISDN devices
+on a Linux platform in a common and standard way. It supports HDLC and PPP
+protocols and offers channel bundling and MLPPP support. To use ISDN4Linux
+you need to configure your kernel for ISDN support and get the ISDN4Linux
+tool kit from our web site.
+
+ISDN4Linux creates a channel pool from all of the available ISDN channels
+and therefore can function across adapters. When an ISDN4Linux compliant
+driver (such as ours) is loaded, all of the channels go into a pool and
+are used on a first-come first-served basis. In addition, individual
+channels can be specifically bound to particular interfaces.
+
+1.2 What is different between this driver and previous drivers?
+
+The revision 2 driver besides adopting the ISDN4Linux architecture has many
+subtle and not so subtle functional differences from previous releases. These
+include:
+ - More efficient shared memory management combined with a simpler
+ configuration. All adapters now use only 16Kbytes of shared RAM
+ versus between 16K and 64K. New methods for using the shared RAM
+ allow us to utilize all of the available RAM on the adapter through
+ only one 16K page.
+ - Better detection of available upper memory. The probing routines
+ have been improved to better detect available shared RAM pages and
+ used pages are now locked.
+ - Decreased loading time and a wider range of I/O ports probed.
+ We have significantly reduced the amount of time it takes to load
+ the driver and at the same time doubled the number of I/O ports
+ probed increasing the likelihood of finding an adapter.
+ - We now support all ISA adapter models with a single driver instead
+ of separate drivers for each model. The revision 2 driver supports
+ the DataCommute/BRI, DataCommute/PRI and TeleCommute/BRI in any
+ combination up to a maximum of four adapters per system.
+ - On board PPP protocol support has been removed in favour of the
+ sync-PPP support used in ISDN4Linux. This means more control of
+ the protocol parameters, faster negotiation time and a more
+ familiar interface.
+
+1.3 How do I setup my system with the correct software to use
+ this driver release?
+
+Before you can compile, install and use the SpellCaster ISA ISDN driver, you
+must ensure that the following software is installed, configured and running:
+
+ - Linux kernel 2.0.20 or later with the required init and ps
+ versions. Please see your distribution vendor for the correct
+ utility packages. The latest kernel is available from
+ ftp://sunsite.unc.edu/pub/Linux/kernel/v2.0/
+
+ - The latest modules package (modules-2.0.0.tar.gz) from
+ ftp://sunsite.unc.edu/pub/Linux/kernel/modules-2.0.0.tar.gz
+
+ - The ISDN4Linux tools available from
+ ftp://ftp.franken.de/pub/isdn4linux/v2.0/isdn4k-utils-2.0.tar.gz
+ This package may fail to compile for you so you can alternatively
+ get a pre-compiled version from
+ ftp://ftp.spellcast.com/pub/drivers/isdn4linux/isdn4k-bin-2.0.tar.gz
+
+
+2. Basic Operations
+-------------------
+
+2.1 Unpacking and installing the driver
+
+ 1. As root, create a directory in a convenient place. We suggest
+ /usr/src/spellcaster.
+
+ 2. Unpack the archive with :
+ tar xzf sc-n.nn.tar.gz -C /usr/src/spellcaster
+
+ 3. Change directory to /usr/src/spellcaster
+
+ 4. Read the README and RELNOTES files.
+
+ 5. Run 'make' and if all goes well, run 'make install'.
+
+2.2 Read the man pages!!!
+
+Make sure you read the scctrl(8) and sc(4) manual pages before continuing
+any further. Type 'man 8 scctrl' and 'man 4 sc'.
+
+2.3 Installing the driver
+
+To install the driver, type '/sbin/insmod sc' as root. sc(4) details options
+you can specify but you shouldn't need to use any unless this doesn't work.
+
+Make sure the driver loaded and detected all of the adapters by typing
+'dmesg'.
+
+The driver can be configured so that it is loaded upon startup. To do this,
+edit the file "/etc/modules/'uname -f'/'uname -v'" and insert the driver name
+"sc" into this file.
+
+2.4 Removing the driver
+
+To remove the driver, delete any interfaces that may exist (see isdnctrl(8)
+for more on this) and then type '/sbin/rmmod sc'.
+
+2.5 What to do if it doesn't load
+
+If, when you try to install the driver, you get a message mentioning
+'register_isdn' then you do not have the ISDN4Linux system installed. Please
+make sure that ISDN support is configured in the kernel.
+
+If you get a message that says 'initialization of sc failed', then the
+driver failed to detect an adapter or failed to find resources needed such
+as a free IRQ line or shared memory segment. If you are sure there are free
+resources available, use the insmod options detailed in sc(4) to override
+the probing function.
+
+Upon testing, the following problem was noted, the driver would load without
+problems, but the board would not respond beyond that point. When a check was
+done with 'cat /proc/interrupts' the interrupt count for sc was 0. In the event
+of this problem, change the BIOS settings so that the interrupts in question are
+reserved for ISA use only.
+
+
+2.6 How to setup ISDN4Linux with the driver
+
+There are three main configurations which you can use with the driver:
+
+A) Basic HDLC connection
+B) PPP connection
+C) MLPPP connection
+
+It should be mentioned here that you may also use a tty connection if you
+desire. The Documentation directory of the isdn4linux subsystem offers good
+documentation on this feature.
+
+A) 10 steps to the establishment of a basic HDLC connection
+-----------------------------------------------------------
+
+- please open the isdn-hdlc file in the examples directory and follow along...
+
+ This file is a script used to configure a BRI ISDN TA to establish a
+ basic HDLC connection between its two channels. Two network
+ interfaces are created and two routes added between the channels.
+
+ i) using the isdnctrl utility, add an interface with "addif" and
+ name it "isdn0"
+ ii) add the outgoing and inbound telephone numbers
+ iii) set the Layer 2 protocol to hdlc
+ iv) set the eaz of the interface to be the phone number of that
+ specific channel
+ v) to turn the callback features off, set the callback to "off" and
+ the callback delay (cbdelay) to 0.
+ vi) the hangup timeout can be set to a specified number of seconds
+ vii) the hangup upon incoming call can be set on or off
+ viii) use the ifconfig command to bring up the network interface with
+ a specific IP address and point to point address
+ ix) add a route to the IP address through the isdn0 interface
+ x) a ping should result in the establishment of the connection
+
+
+B) Establishment of a PPP connection
+------------------------------------
+
+- please open the isdn-ppp file in the examples directory and follow along...
+
+ This file is a script used to configure a BRI ISDN TA to establish a
+ PPP connection between the two channels. The file is almost
+ identical to the HDLC connection example except that the packet
+ encapsulation type has to be set.
+
+ use the same procedure as in the HDLC connection from steps i) to
+ iii) then, after the Layer 2 protocol is set, set the encapsulation
+ "encap" to syncppp. With this done, the rest of the steps, iv) to x)
+ can be followed from above.
+
+ Then, the ipppd (ippp daemon) must be setup:
+
+ xi) use the ipppd function found in /sbin/ipppd to set the following:
+ xii) take out (minus) VJ compression and bsd compression
+ xiii) set the mru size to 2000
+ xiv) link the two /dev interfaces to the daemon
+
+NOTE: A "*" in the inbound telephone number specifies that a call can be
+accepted on any number.
+
+C) Establishment of a MLPPP connection
+--------------------------------------
+
+- please open the isdn-mppp file in the examples directory and follow along...
+
+ This file is a script used to configure a BRI ISDN TA to accept a
+ Multi Link PPP connection.
+
+ i) using the isdnctrl utility, add an interface with "addif" and
+ name it "ippp0"
+ ii) add the inbound telephone number
+ iii) set the Layer 2 protocol to hdlc and the Layer 3 protocol to
+ trans (transparent)
+ iv) set the packet encapsulation to syncppp
+ v) set the eaz of the interface to be the phone number of that
+ specific channel
+ vi) to turn the callback features off, set the callback to "off" and
+ the callback delay (cbdelay) to 0.
+ vi) the hangup timeout can be set to a specified number of seconds
+ vii) the hangup upon incoming call can be set on or off
+ viii) add a slave interface and name it "ippp32" for example
+ ix) set the similar parameters for the ippp32 interface
+ x) use the ifconfig command to bring-up the ippp0 interface with a
+ specific IP address and point to point address
+ xi) add a route to the IP address through the ippp0 interface
+ xii) use the ipppd function found in /sbin/ipppd to set the following:
+ xiii) take out (minus) bsd compression
+ xiv) set the mru size to 2000
+ xv) add (+) the multi-link function "+mp"
+ xvi) link the two /dev interfaces to the daemon
+
+NOTE: To use the MLPPP connection to dial OUT to a MLPPP connection, change
+the inbound telephone numbers to the outgoing telephone numbers of the MLPPP
+host.
+
+
+3. Beta Change Summaries and Miscellaneous Notes
+------------------------------------------------
+When using the "scctrl" utility to upload firmware revisions on the board,
+please note that the byte count displayed at the end of the operation may be
+different from the total number of bytes in the "dcbfwn.nn.sr" file. Please
+disregard the displayed byte count.
+
+It was noted that in Beta Release 1, the module would fail to load and result
+in a segmentation fault when 'insmod'ed. This problem was created when one of
+the isdn4linux parameters, (isdn_ctrl, data field) was filled in. In some
+cases, this data field was NULL, and was left unchecked, so when it was
+referenced... segv. The bug has been fixed around line 63-68 of event.c.
+
diff --git a/Documentation/isdn/README.syncppp b/Documentation/isdn/README.syncppp
new file mode 100644
index 0000000..27d2600
--- /dev/null
+++ b/Documentation/isdn/README.syncppp
@@ -0,0 +1,58 @@
+Some additional information for setting up a syncPPP
+connection using network interfaces.
+---------------------------------------------------------------
+
+You need one thing beside the isdn4linux package:
+
+ a patched pppd .. (I called it ipppd to show the difference)
+
+Compiling isdn4linux with sync PPP:
+-----------------------------------
+To compile isdn4linux with the sync PPP part, you have
+to answer the appropriate question when doing a "make config"
+Don't forget to load the slhc.o
+module before the isdn.o module, if VJ-compression support
+is not compiled into your kernel. (e.g if you have no PPP or
+CSLIP in the kernel)
+
+Using isdn4linux with sync PPP:
+-------------------------------
+Sync PPP is just another encapsulation for isdn4linux. The
+name to enable sync PPP encapsulation is 'syncppp' .. e.g:
+
+ /sbin/isdnctrl encap ippp0 syncppp
+
+The name of the interface is here 'ippp0'. You need
+one interface with the name 'ippp0' to saturate the
+ipppd, which checks the ppp version via this interface.
+Currently, all devices must have the name ipppX where
+'X' is a decimal value.
+
+To set up a PPP connection you need the ipppd .. You must start
+the ipppd once after installing the modules. The ipppd
+communicates with the isdn4linux link-level driver using the
+/dev/ippp0 to /dev/ippp15 devices. One ipppd can handle
+all devices at once. If you want to use two PPP connections
+at the same time, you have to connect the ipppd to two
+devices .. and so on.
+I've implemented one additional option for the ipppd:
+ 'useifip' will get (if set to not 0.0.0.0) the IP address
+ for the negotiation from the attached network-interface.
+(also: ipppd will try to negotiate pointopoint IP as remote IP)
+You must disable BSD-compression, this implementation can't
+handle compressed packets.
+
+Check the etc/rc.isdn.syncppp in the isdn4kernel-util package
+for an example setup script.
+
+To use the MPPP stuff, you must configure a slave device
+with isdn4linux. Now call the ipppd with the '+mp' option.
+To increase the number of links, you must use the
+'addlink' option of the isdnctrl tool. (rc.isdn.syncppp.MPPP is
+an example script)
+
+enjoy it,
+ michael
+
+
+
diff --git a/Documentation/isdn/README.x25 b/Documentation/isdn/README.x25
new file mode 100644
index 0000000..e561a77
--- /dev/null
+++ b/Documentation/isdn/README.x25
@@ -0,0 +1,184 @@
+
+X.25 support within isdn4linux
+==============================
+
+This is alpha/beta test code. Use it completely at your own risk.
+As new versions appear, the stuff described here might suddenly change
+or become invalid without notice.
+
+Keep in mind:
+
+You are using several new parts of the 2.2.x kernel series which
+have not been tested in a large scale. Therefore, you might encounter
+more bugs as usual.
+
+- If you connect to an X.25 neighbour not operated by yourself, ASK the
+ other side first. Be prepared that bugs in the protocol implementation
+ might result in problems.
+
+- This implementation has never wiped out my whole hard disk yet. But as
+ this is experimental code, don't blame me if that happened to you.
+ Backing up important data will never harm.
+
+- Monitor your isdn connections while using this software. This should
+ prevent you from undesired phone bills in case of driver problems.
+
+
+
+
+How to configure the kernel
+===========================
+
+The ITU-T (former CCITT) X.25 network protocol layer has been implemented
+in the Linux source tree since version 2.1.16. The isdn subsystem might be
+useful to run X.25 on top of ISDN. If you want to try it, select
+
+ "CCITT X.25 Packet Layer"
+
+from the networking options as well as
+
+ "ISDN Support" and "X.25 PLP on Top of ISDN"
+
+from the ISDN subsystem options when you configure your kernel for
+compilation. You currently also need to enable
+"Prompt for development and/or incomplete code/drivers" from the
+"Code maturity level options" menu. For the x25trace utility to work
+you also need to enable "Packet socket".
+
+For local testing it is also recommended to enable the isdnloop driver
+from the isdn subsystem's configuration menu.
+
+For testing, it is recommended that all isdn drivers and the X.25 PLP
+protocol are compiled as loadable modules. Like this, you can recover
+from certain errors by simply unloading and reloading the modules.
+
+
+
+What's it for? How to use it?
+=============================
+
+X.25 on top of isdn might be useful with two different scenarios:
+
+- You might want to access a public X.25 data network from your Linux box.
+ You can use i4l if you were physically connected to the X.25 switch
+ by an ISDN B-channel (leased line as well as dial up connection should
+ work).
+
+ This corresponds to ITU-T recommendation X.31 Case A (circuit-mode
+ access to PSPDN [packet switched public data network]).
+
+ NOTE: X.31 also covers a Case B (access to PSPDN via virtual
+ circuit / packet mode service). The latter mode (which in theory
+ also allows using the D-channel) is not supported by isdn4linux.
+ It should however be possible to establish such packet mode connections
+ with certain active isdn cards provided that the firmware supports X.31
+ and the driver exports this functionality to the user. Currently,
+ the AVM B1 driver is the only driver which does so. (It should be
+ possible to access D-channel X.31 with active AVM cards using the
+ CAPI interface of the AVM-B1 driver).
+
+- Or you might want to operate certain ISDN teleservices on your linux
+ box. A lot of those teleservices run on top of the ISO-8208
+ (DTE-DTE mode) network layer protocol. ISO-8208 is essentially the
+ same as ITU-T X.25.
+
+ Popular candidates of such teleservices are EUROfile transfer or any
+ teleservice applying ITU-T recommendation T.90.
+
+To use the X.25 protocol on top of isdn, just create an isdn network
+interface as usual, configure your own and/or peer's ISDN numbers,
+and choose x25iface encapsulation by
+
+ isdnctrl encap <iface-name> x25iface.
+
+Once encap is set like this, the device can be used by the X.25 packet layer.
+
+All the stuff needed for X.25 is implemented inside the isdn link
+level (mainly isdn_net.c and some new source files). Thus, it should
+work with every existing HL driver. I was able to successfully open X.25
+connections on top of the isdnloop driver and the hisax driver.
+"x25iface"-encapsulation bypasses demand dialing. Dialing will be
+initiated when the upper (X.25 packet) layer requests the lapb datalink to
+be established. But hangup timeout is still active. Whenever a hangup
+occurs, all existing X.25 connections on that link will be cleared
+It is recommended to use sufficiently large hangup-timeouts for the
+isdn interfaces.
+
+
+In order to set up a conforming protocol stack you also need to
+specify the proper l2_prot parameter:
+
+To operate in ISO-8208 X.25 DTE-DTE mode, use
+
+ isdnctrl l2_prot <iface-name> x75i
+
+To access an X.25 network switch via isdn (your linux box is the DTE), use
+
+ isdnctrl l2_prot <iface-name> x25dte
+
+To mimic an X.25 network switch (DCE side of the connection), use
+
+ isdnctrl l2_prot <iface-name> x25dce
+
+However, x25dte or x25dce is currently not supported by any real HL
+level driver. The main difference between x75i and x25dte/dce is that
+x25d[tc]e uses fixed lap_b addresses. With x75i, the side which
+initiates the isdn connection uses the DTE's lap_b address while the
+called side used the DCE's lap_b address. Thus, l2_prot x75i might
+probably work if you access a public X.25 network as long as the
+corresponding isdn connection is set up by you. At least one test
+was successful to connect via isdn4linux to an X.25 switch using this
+trick. At the switch side, a terminal adapter X.21 was used to connect
+it to the isdn.
+
+
+How to set up a test installation?
+==================================
+
+To test X.25 on top of isdn, you need to get
+
+- a recent version of the "isdnctrl" program that supports setting the new
+ X.25 specific parameters.
+
+- the x25-utils-2.X package from
+ ftp://ftp.hes.iki.fi/pub/ham/linux/ax25/x25utils-*
+ (don't confuse the x25-utils with the ax25-utils)
+
+- an application program that uses linux PF_X25 sockets (some are
+ contained in the x25-util package).
+
+Before compiling the user level utilities make sure that the compiler/
+preprocessor will fetch the proper kernel header files of this kernel
+source tree. Either make /usr/include/linux a symbolic link pointing to
+this kernel's include/linux directory or set the appropriate compiler flags.
+
+When all drivers and interfaces are loaded and configured you need to
+ifconfig the network interfaces up and add X.25-routes to them. Use
+the usual ifconfig tool.
+
+ifconfig <iface-name> up
+
+But a special x25route tool (distributed with the x25-util package)
+is needed to set up X.25 routes. I.e.
+
+x25route add 01 <iface-name>
+
+will cause all x.25 connections to the destination X.25-address
+"01" to be routed to your created isdn network interface.
+
+There are currently no real X.25 applications available. However, for
+tests, the x25-utils package contains a modified version of telnet
+and telnetd that uses X.25 sockets instead of tcp/ip sockets. You can
+use those for your first tests. Furthermore, you might check
+ftp://ftp.hamburg.pop.de/pub/LOCAL/linux/i4l-eft/ which contains some
+alpha-test implementation ("eftp4linux") of the EUROfile transfer
+protocol.
+
+The scripts distributed with the eftp4linux test releases might also
+provide useful examples for setting up X.25 on top of isdn.
+
+The x25-utility package also contains an x25trace tool that can be
+used to monitor X.25 packets received by the network interfaces.
+The /proc/net/x25* files also contain useful information.
+
+- Henner
diff --git a/Documentation/isdn/syncPPP.FAQ b/Documentation/isdn/syncPPP.FAQ
new file mode 100644
index 0000000..3257a4b
--- /dev/null
+++ b/Documentation/isdn/syncPPP.FAQ
@@ -0,0 +1,224 @@
+simple isdn4linux PPP FAQ .. to be continued .. not 'debugged'
+-------------------------------------------------------------------
+
+Q01: what's pppd, ipppd, syncPPP, asyncPPP ??
+Q02: error message "this system lacks PPP support"
+Q03: strange information using 'ifconfig'
+Q04: MPPP?? What's that and how can I use it ...
+Q05: I tried MPPP but it doesn't work
+Q06: can I use asynchronous PPP encapsulation with network devices
+Q07: A SunISDN machine can't connect to my i4l system
+Q08: I wanna talk to several machines, which need different configs
+Q09: Starting the ipppd, I get only error messages from i4l
+Q10: I wanna use dynamic IP address assignment
+Q11: I can't connect. How can I check where the problem is.
+Q12: How can I reduce login delay?
+
+-------------------------------------------------------------------
+
+Q01: pppd, ipppd, syncPPP, asyncPPP .. what is that ?
+ what should I use?
+A: The pppd is for asynchronous PPP .. asynchronous means
+ here, the framing is character based. (e.g when
+ using ttyI* or tty* devices)
+
+ The ipppd handles PPP packets coming in HDLC
+ frames (bit based protocol) ... The PPP driver
+ in isdn4linux pushes all IP packets direct
+ to the network layer and all PPP protocol
+ frames to the /dev/ippp* device.
+ So, the ipppd is a simple external network
+ protocol handler.
+
+ If you login into a remote machine using the
+ /dev/ttyI* devices and then enable PPP on the
+ remote terminal server -> use the 'old' pppd
+
+ If your remote side immediately starts to send
+ frames ... you probably connect to a
+ syncPPP machine .. use the network device part
+ of isdn4linux with the 'syncppp' encapsulation
+ and make sure, that the ipppd is running and
+ connected to at least one /dev/ippp*. Check the
+ isdn4linux manual on how to configure a network device.
+
+--
+
+Q02: when I start the ipppd .. I only get the
+ error message "this system lacks PPP support"
+A: check that at least the device 'ippp0' exists.
+ (you can check this e.g with the program 'ifconfig')
+ The ipppd NEEDS this device under THIS name ..
+ If this device doesn't exists, use:
+ isdnctrl addif ippp0
+ isdnctrl encap ippp0 syncppp
+ ... (see isdn4linux doc for more) ...
+A: Maybe you have compiled the ipppd with another
+ kernel source tree than the kernel you currently
+ run ...
+
+--
+
+Q03: when I list the netdevices with ifconfig I see, that
+ my ISDN interface has a HWaddr and IRQ=0 and Base
+ address = 0
+A: The device is a fake ethernet device .. ignore IRQ and baseaddr
+ You need the HWaddr only for ethernet encapsulation.
+
+--
+
+Q04: MPPP?? What's that and how can I use it ...
+
+A: MPPP or MP or MPP (Warning: MP is also an
+ acronym for 'Multi Processor') stands for
+ Multi Point to Point and means bundling
+ of several channels to one logical stream.
+ To enable MPPP negotiation you must call the
+ ipppd with the '+mp' option.
+ You must also configure a slave device for
+ every additional channel. (see the i4l manual
+ for more)
+ To use channel bundling you must first activate
+ the 'master' or initial call. Now you can add
+ the slave channels with the command:
+ isdnctrl addlink <device>
+ e.g:
+ isdnctrl addlink ippp0
+ This is different from other encapsulations of
+ isdn4linux! With syncPPP, there is no automatic
+ activation of slave devices.
+
+--
+
+Q05: I tried MPPP but it doesn't work .. the ipppd
+ writes in the debug log something like:
+ .. rcvd [0][proto=0x3d] c0 00 00 00 80 fd 01 01 00 0a ...
+ .. sent [0][LCP ProtRej id=0x2 00 3d c0 00 00 00 80 fd 01 ...
+
+A: you forgot to compile MPPP/RFC1717 support into the
+ ISDN Subsystem. Recompile with this option enabled.
+
+--
+
+Q06: can I use asynchronous PPP encapsulation
+ over the network interface of isdn4linux ..
+
+A: No .. that's not possible .. Use the standard
+ PPP package over the /dev/ttyI* devices. You
+ must not use the ipppd for this.
+
+--
+
+Q07: A SunISDN machine tries to connect my i4l system,
+ which doesn't work.
+ Checking the debug log I just saw garbage like:
+!![ ... fill in the line ... ]!!
+
+A: The Sun tries to talk asynchronous PPP ... i4l
+ can't understand this ... try to use the ttyI*
+ devices with the standard PPP/pppd package
+
+A: (from Alexanter Strauss: )
+!![ ... fill in mail ]!!
+
+--
+
+Q08: I wanna talk to remote machines, which need
+ a different configuration. The only way
+ I found to do this is to kill the ipppd and
+ start a new one with another config to connect
+ to the second machine.
+
+A: you must bind a network interface explicitly to
+ an ippp device, where you can connect a (for this
+ interface) individually configured ipppd.
+
+--
+
+Q09: When I start the ipppd I only get error messages
+ from the i4l driver ..
+
+A: When starting, the ipppd calls functions which may
+ trigger a network packet. (e.g gethostbyname()).
+ Without the ipppd (at this moment, it is not
+ fully started) we can't handle this network request.
+ Try to configure hostnames necessary for the ipppd
+ in your local /etc/hosts file or in a way, that
+ your system can resolve it without using an
+ isdn/ippp network-interface.
+
+--
+
+Q10: I wanna use dynamic IP address assignment ... How
+ must I configure the network device.
+
+A: At least you must have a route which forwards
+ a packet to the ippp network-interface to trigger
+ the dial-on-demand.
+ A default route to the ippp-interface will work.
+ Now you must choose a dummy IP address for your
+ interface.
+ If for some reason you can't set the default
+ route to the ippp interface, you may take any
+ address of the subnet from which you expect your
+ dynamic IP number and set a 'network route' for
+ this subnet to the ippp interface.
+ To allow overriding of the dummy address you
+ must call the ipppd with the 'ipcp-accept-local' option.
+
+A: You must know, how the ipppd gets the addresses it wanna
+ configure. If you don't give any option, the ipppd
+ tries to negotiate the local host address!
+ With the option 'noipdefault' it requests an address
+ from the remote machine. With 'useifip' it gets the
+ addresses from the net interface. Or you set the address
+ on the option line with the <a.b.c.d:e.f.g.h> option.
+ Note: the IP address of the remote machine must be configured
+ locally or the remote machine must send it in an IPCP request.
+ If your side doesn't know the IP address after negotiation, it
+ closes the connection!
+ You must allow overriding of address with the 'ipcp-accept-*'
+ options, if you have set your own or the remote address
+ explicitly.
+
+A: Maybe you try these options .. e.g:
+
+ /sbin/ipppd :$REMOTE noipdefault /dev/ippp0
+
+ where REMOTE must be the address of the remote machine (the
+ machine, which gives you your address)
+
+--
+
+Q11: I can't connect. How can I check where the problem is.
+
+A: A good help log is the debug output from the ipppd...
+ Check whether you can find there:
+ - only a few LCP-conf-req SENT messages (less then 10)
+ and then a Term-REQ:
+ -> check whether your ISDN card is well configured
+ it seems, that your machine doesn't dial
+ (IRQ,IO,Proto, etc problems)
+ Configure your ISDN card to print debug messages and
+ check the /dev/isdnctrl output next time. There
+ you can see, whether there is activity on the card/line.
+ - there are at least a few RECV messages in the log:
+ -> fine: your card is dialing and your remote machine
+ tries to talk with you. Maybe only a missing
+ authentication. Check your ipppd configuration again.
+ - the ipppd exits for some reason:
+ -> not good ... check /var/adm/syslog and /var/adm/daemon.
+ Could be a bug in the ipppd.
+
+--
+
+Q12: How can I reduce login delay?
+
+A: Log a login session ('debug' log) and check which options
+ your remote side rejects. Next time configure your ipppd
+ to not negotiate these options. Another 'side effect' is, that
+ this increases redundancy. (e.g your remote side is buggy and
+ rejects options in a wrong way).
+
+
+
diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO
new file mode 100644
index 0000000..5547698
--- /dev/null
+++ b/Documentation/ja_JP/HOWTO
@@ -0,0 +1,687 @@
+NOTE:
+This is a version of Documentation/HOWTO translated into Japanese.
+This document is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com>
+and the JF Project team <www.linux.or.jp/JF>.
+If you find any difference between this document and the original file
+or a problem with the translation,
+please contact the maintainer of this file or JF project.
+
+Please also note that the purpose of this file is to be easier to read
+for non English (read: Japanese) speakers and is not intended as a
+fork. So if you have any comments or updates for this file, please try
+to update the original English file first.
+
+Last Updated: 2008/10/24
+==================================
+これは、
+linux-2.6.28/Documentation/HOWTO
+の和訳です。
+
+翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
+翻訳日: 2008/10/24
+翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com>
+校正者: 松倉さん <nbh--mats at nifty dot com>
+ 小林 雅典さん (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp>
+ 武井伸光さん、<takei at webmasters dot gr dot jp>
+ かねこさん (Seiji Kaneko) <skaneko at a2 dot mbn dot or dot jp>
+ 野口さん (Kenji Noguchi) <tokyo246 at gmail dot com>
+ 河内さん (Takayoshi Kochi) <t-kochi at bq dot jp dot nec dot com>
+ 岩本さん (iwamoto) <iwamoto.kn at ncos dot nec dot co dot jp>
+ 内田さん (Satoshi Uchida) <s-uchida at ap dot jp dot nec dot com>
+==================================
+
+Linux カーネル開発のやり方
+-------------------------------
+
+これは上のトピック( Linux カーネル開発のやり方)の重要な事柄を網羅した
+ドキュメントです。ここには Linux カーネル開発者になるための方法と
+Linux カーネル開発コミュニティと共に活動するやり方を学ぶ方法が含まれて
+います。カーネルプログラミングに関する技術的な項目に関することは何も含
+めないようにしていますが、カーネル開発者となるための正しい方向に向かう
+手助けになります。
+
+もし、このドキュメントのどこかが古くなっていた場合には、このドキュメン
+トの最後にリストしたメンテナにパッチを送ってください。
+
+はじめに
+---------
+
+あなたは Linux カーネルの開発者になる方法を学びたいのでしょうか? そ
+れともあなたは上司から「このデバイスの Linux ドライバを書くように」と
+言われているのでしょうか? 
+この文書の目的は、あなたが踏むべき手順と、コミュニティと一緒にうまく働
+くヒントを書き下すことで、あなたが知るべき全てのことを教えることです。
+また、このコミュニティがなぜ今うまくまわっているのかという理由の一部も
+説明しようと試みています。
+
+
+カーネルは 少量のアーキテクチャ依存部分がアセンブリ言語で書かれている
+以外は大部分は C 言語で書かれています。C言語をよく理解していることはカー
+ネル開発者には必要です。アーキテクチャ向けの低レベル部分の開発をするの
+でなければ、(どんなアーキテクチャでも)アセンブリ(訳注: 言語)は必要あり
+ません。以下の本は、C 言語の十分な知識や何年もの経験に取って代わるもの
+ではありませんが、少なくともリファレンスとしては良い本です。
+ - "The C Programming Language" by Kernighan and Ritchie [Prentice Hall]
+ -『プログラミング言語C第2版』(B.W. カーニハン/D.M. リッチー著 石田晴久訳) [共立出版]
+ - "Practical C Programming" by Steve Oualline [O'Reilly]
+ - 『C実践プログラミング第3版』(Steve Oualline著 望月康司監訳 谷口功訳) [オライリージャパン]
+ - "C: A Reference Manual" by Harbison and Steele [Prentice Hall]
+ - 『新・詳説 C 言語 H&S リファレンス』
+ (サミュエル P ハービソン/ガイ L スティール共著 斉藤 信男監訳)[ソフトバンク]
+
+カーネルは GNU C と GNU ツールチェインを使って書かれています。カーネル
+は ISO C89 仕様に準拠して書く一方で、標準には無い言語拡張を多く使って
+います。カーネルは標準 C ライブラリとは関係がないといった、C 言語フリー
+スタンディング環境です。そのため、C の標準で使えないものもあります。任
+意の long long の除算や浮動小数点は使えません。
+ときどき、カーネルがツールチェインや C 言語拡張に置いている前提がどう
+なっているのかわかりにくいことがあり、また、残念なことに決定的なリファ
+レンスは存在しません。情報を得るには、gcc の info ページ( info gcc )を
+見てください。
+
+あなたは既存の開発コミュニティと一緒に作業する方法を学ぼうとしているこ
+とに留意してください。そのコミュニティは、コーディング、スタイル、
+開発手順について高度な標準を持つ、多様な人の集まりです。
+地理的に分散した大規模なチームに対してもっともうまくいくとわかったこと
+をベースにしながら、これらの標準は長い時間をかけて築かれてきました。
+これらはきちんと文書化されていますから、事前にこれらの標準についてでき
+るだけたくさん学んでください。また皆があなたやあなたの会社のやり方に合わ
+せてくれると思わないでください。
+
+法的問題
+------------
+
+Linux カーネルのソースコードは GPL ライセンスの下でリリースされていま
+す。ライセンスの詳細については、ソースツリーのメインディレクトリに存在
+する、COPYING のファイルを見てください。もしライセンスについてさらに質
+問があれば、Linux Kernel メーリングリストに質問するのではなく、どうぞ
+法律家に相談してください。メーリングリストの人達は法律家ではなく、法的
+問題については彼らの声明はあてにするべきではありません。
+
+GPL に関する共通の質問や回答については、以下を参照してください。
+ http://www.gnu.org/licenses/gpl-faq.html
+
+ドキュメント
+------------
+
+Linux カーネルソースツリーは幅広い範囲のドキュメントを含んでおり、それ
+らはカーネルコミュニティと会話する方法を学ぶのに非常に貴重なものです。
+新しい機能がカーネルに追加される場合、その機能の使い方について説明した
+新しいドキュメントファイルも追加することを勧めます。
+カーネルの変更が、カーネルがユーザ空間に公開しているインターフェイスの
+変更を引き起こす場合、その変更を説明するマニュアルページのパッチや情報
+をマニュアルページのメンテナ mtk.manpages@gmail.com に送り、CC を
+linux-api@ver.kernel.org に送ることを勧めます。
+
+以下はカーネルソースツリーに含まれている読んでおくべきファイルの一覧で
+す-
+
+ README
+ このファイルは Linuxカーネルの簡単な背景とカーネルを設定(訳注
+ configure )し、生成(訳注 build )するために必要なことは何かが書かれ
+ ています。カーネルに関して初めての人はここからスタートすると良いで
+ しょう。
+
+ Documentation/Changes
+ このファイルはカーネルをうまく生成(訳注 build )し、走らせるのに最
+ 小限のレベルで必要な数々のソフトウェアパッケージの一覧を示してい
+ ます。
+
+ Documentation/CodingStyle
+ これは Linux カーネルのコーディングスタイルと背景にある理由を記述
+ しています。全ての新しいコードはこのドキュメントにあるガイドライン
+ に従っていることを期待されています。大部分のメンテナはこれらのルー
+ ルに従っているものだけを受け付け、多くの人は正しいスタイルのコード
+ だけをレビューします。
+
+ Documentation/SubmittingPatches
+ Documentation/SubmittingDrivers
+ これらのファイルには、どうやってうまくパッチを作って投稿するかに
+ ついて非常に詳しく書かれており、以下を含みます(これだけに限らない
+ けれども)
+ - Email に含むこと
+ - Email の形式
+ - だれに送るか
+ これらのルールに従えばうまくいくことを保証することではありません
+ が (すべてのパッチは内容とスタイルについて精査を受けるので)、
+ ルールに従わなければ間違いなくうまくいかないでしょう。
+
+ この他にパッチを作る方法についてのよくできた記述は-
+
+ "The Perfect Patch"
+ http://userweb.kernel.org/~akpm/stuff/tpp.txt
+ "Linux kernel patch submission format"
+ http://linux.yyz.us/patch-format.html
+
+ Documentation/stable_api_nonsense.txt
+ このファイルはカーネルの中に不変のAPIを持たないことにした意識的な
+ 決断の背景にある理由について書かれています。以下のようなことを含
+ んでいます-
+ - サブシステムとの間に層を作ること(コンパチビリティのため?)
+ - オペレーティングシステム間のドライバの移植性
+ - カーネルソースツリーの素早い変更を遅らせる(もしくは素早い変更
+ を妨げる)
+ このドキュメントは Linux 開発の思想を理解するのに非常に重要です。
+ そして、他のOSでの開発者が Linux に移る時にとても重要です。
+
+ Documentation/SecurityBugs
+ もし Linux カーネルでセキュリティ問題を発見したように思ったら、こ
+ のドキュメントのステップに従ってカーネル開発者に連絡し、問題解決を
+ 支援してください。
+
+ Documentation/ManagementStyle
+ このドキュメントは Linux カーネルのメンテナ達がどう行動するか、
+ 彼らの手法の背景にある共有されている精神について記述しています。こ
+ れはカーネル開発の初心者なら(もしくは、単に興味があるだけの人でも)
+ 重要です。なぜならこのドキュメントは、カーネルメンテナ達の独特な
+ 行動についての多くの誤解や混乱を解消するからです。
+
+ Documentation/stable_kernel_rules.txt
+ このファイルはどのように stable カーネルのリリースが行われるかのルー
+ ルが記述されています。そしてこれらのリリースの中のどこかで変更を取
+ り入れてもらいたい場合に何をすれば良いかが示されています。
+
+ Documentation/kernel-docs.txt
+  カーネル開発に付随する外部ドキュメントのリストです。もしあなたが
+ 探しているものがカーネル内のドキュメントでみつからなかった場合、
+ このリストをあたってみてください。
+
+ Documentation/applying-patches.txt
+ パッチとはなにか、パッチをどうやって様々なカーネルの開発ブランチに
+ 適用するのかについて正確に記述した良い入門書です。
+
+カーネルはソースコードから自動的に生成可能な多数のドキュメントを自分自
+身でもっています。これにはカーネル内 API のすべての記述や、どう正しく
+ロックをかけるかの規則が含まれます。このドキュメントは
+Documentation/DocBook/ ディレクトリに作られ、以下のように
+ make pdfdocs
+ make psdocs
+ make htmldocs
+ make mandocs
+コマンドを実行するとメインカーネルのソースディレクトリから
+それぞれ、PDF, Postscript, HTML, man page の形式で生成されます。
+
+カーネル開発者になるには
+---------------------------
+
+もしあなたが、Linux カーネル開発について何も知らないならば、
+KernelNewbies プロジェクトを見るべきです
+ http://kernelnewbies.org
+
+このサイトには役に立つメーリングリストがあり、基本的なカーネル開発に関
+するほとんどどんな種類の質問もできます (既に回答されているようなことを
+聞く前にまずはアーカイブを調べてください)。
+またここには、リアルタイムで質問を聞くことができる IRC チャネルや、Linux
+カーネルの開発に関して学ぶのに便利なたくさんの役に立つドキュメントがあ
+ります。
+
+web サイトには、コードの構成、サブシステム、現在存在するプロジェクト(ツ
+リーにあるもの無いものの両方)の基本的な管理情報があります。
+ここには、また、カーネルのコンパイルのやり方やパッチの当て方などの間接
+的な基本情報も記述されています。
+
+あなたがどこからスタートして良いかわからないが、Linux カーネル開発コミュ
+ニティに参加して何かすることをさがしている場合には、Linux kernel
+Janitor's プロジェクトにいけば良いでしょう -
+ http://janitor.kernelnewbies.org/
+ここはそのようなスタートをするのにうってつけの場所です。ここには、
+Linux カーネルソースツリーの中に含まれる、きれいにし、修正しなければな
+らない、単純な問題のリストが記述されています。このプロジェクトに関わる
+開発者と一緒に作業することで、あなたのパッチを Linuxカーネルツリーに入
+れるための基礎を学ぶことができ、そしてもしあなたがまだアイディアを持っ
+ていない場合には、次にやる仕事の方向性が見えてくるかもしれません。
+
+もしあなたが、すでにひとまとまりコードを書いていて、カーネルツリーに入
+れたいと思っていたり、それに関する適切な支援を求めたい場合、カーネル
+メンターズプロジェクトはそのような皆さんを助けるためにできました。
+ここにはメーリングリストがあり、以下から参照できます
+ http://selenic.com/mailman/listinfo/kernel-mentors
+
+実際に Linux カーネルのコードについて修正を加える前に、どうやってその
+コードが動作するのかを理解することが必要です。そのためには、特別なツー
+ルの助けを借りてでも、それを直接よく読むことが最良の方法です(ほとんど
+のトリッキーな部分は十分にコメントしてありますから)。そういうツールで
+特におすすめなのは、Linux クロスリファレンスプロジェクトです。これは、
+自己参照方式で、索引がついた web 形式で、ソースコードを参照することが
+できます。この最新の素晴しいカーネルコードのリポジトリは以下で見つかり
+ます-
+ http://sosdg.org/~qiyong/lxr/
+
+開発プロセス
+-----------------------
+
+Linux カーネルの開発プロセスは現在幾つかの異なるメインカーネル「ブラン
+チ」と多数のサブシステム毎のカーネルブランチから構成されます。
+これらのブランチとは-
+ - メインの 2.6.x カーネルツリー
+ - 2.6.x.y -stable カーネルツリー
+ - 2.6.x -git カーネルパッチ
+ - 2.6.x -mm カーネルパッチ
+ - サブシステム毎のカーネルツリーとパッチ
+
+2.6.x カーネルツリー
+-----------------
+
+2.6.x カーネルは Linus Torvalds によってメンテナンスされ、kernel.org
+の pub/linux/kernel/v2.6/ ディレクトリに存在します。この開発プロセスは
+以下のとおり-
+
+ - 新しいカーネルがリリースされた直後に、2週間の特別期間が設けられ、
+ この期間中に、メンテナ達は Linus に大きな差分を送ることができます。
+ このような差分は通常 -mm カーネルに数週間含まれてきたパッチです。
+ 大きな変更は git(カーネルのソース管理ツール、詳細は
+ http://git.or.cz/ 参照) を使って送るのが好ましいやり方ですが、パッ
+ チファイルの形式のまま送るのでも十分です。
+
+ - 2週間後、-rc1 カーネルがリリースされ、この後にはカーネル全体の安定
+ 性に影響をあたえるような新機能は含まない類のパッチしか取り込むこと
+ はできません。新しいドライバ(もしくはファイルシステム)のパッチは
+ -rc1 の後で受け付けられることもあることを覚えておいてください。な
+ ぜなら、変更が独立していて、追加されたコードの外の領域に影響を与え
+ ない限り、退行のリスクは無いからです。-rc1 がリリースされた後、
+ Linus へパッチを送付するのに git を使うこともできますが、パッチは
+ レビューのために、パブリックなメーリングリストへも同時に送る必要が
+ あります。
+
+ - 新しい -rc は Linus が、最新の git ツリーがテスト目的であれば十分
+ に安定した状態にあると判断したときにリリースされます。目標は毎週新
+ しい -rc カーネルをリリースすることです。
+
+ - このプロセスはカーネルが 「準備ができた」と考えられるまで継続しま
+ す。このプロセスはだいたい 6週間継続します。
+
+ - 各リリースでの既知の後戻り問題(regression: このリリースの中で新規
+ に作り込まれた問題を指す) はその都度 Linux-kernel メーリングリスト
+ に投稿されます。ゴールとしては、カーネルが 「準備ができた」と宣言
+ する前にこのリストの長さをゼロに減らすことですが、現実には、数個の
+ 後戻り問題がリリース時にたびたび残ってしまいます。
+
+Andrew Morton が Linux-kernel メーリングリストにカーネルリリースについ
+て書いたことをここで言っておくことは価値があります-
+ 「カーネルがいつリリースされるかは誰も知りません。なぜなら、これは現
+ 実に認識されたバグの状況によりリリースされるのであり、前もって決めら
+ れた計画によってリリースされるものではないからです。」
+
+2.6.x.y -stable カーネルツリー
+---------------------------
+
+バージョン番号が4つの数字に分かれているカーネルは -stable カーネルです。
+これには、2.6.x カーネルで見つかったセキュリティ問題や重大な後戻りに対
+する比較的小さい重要な修正が含まれます。
+
+これは、開発/実験的バージョンのテストに協力することに興味が無く、
+最新の安定したカーネルを使いたいユーザに推奨するブランチです。
+
+もし、2.6.x.y カーネルが存在しない場合には、番号が一番大きい 2.6.x が
+最新の安定版カーネルです。
+
+2.6.x.y は "stable" チーム <stable@kernel.org> でメンテされており、必
+要に応じてリリースされます。通常のリリース期間は 2週間毎ですが、差し迫っ
+た問題がなければもう少し長くなることもあります。セキュリティ関連の問題
+の場合はこれに対してだいたいの場合、すぐにリリースがされます。
+
+カーネルツリーに入っている、Documentation/stable_kernel_rules.txt ファ
+イルにはどのような種類の変更が -stable ツリーに受け入れ可能か、またリ
+リースプロセスがどう動くかが記述されています。
+
+2.6.x -git パッチ
+------------------
+
+git リポジトリで管理されているLinus のカーネルツリーの毎日のスナップ
+ショットがあります。(だから -git という名前がついています)。これらのパッ
+チはおおむね毎日リリースされており、Linus のツリーの現状を表します。こ
+れは -rc カーネルと比べて、パッチが大丈夫かどうかも確認しないで自動的
+に生成されるので、より実験的です。
+
+2.6.x -mm カーネルパッチ
+------------------------
+
+Andrew Morton によってリリースされる実験的なカーネルパッチ群です。
+Andrew は個別のサブシステムカーネルツリーとパッチを全て集めてきて
+linux-kernel メーリングリストで収集された多数のパッチと同時に一つにま
+とめます。
+このツリーは新機能とパッチが検証される場となります。ある期間の間パッチ
+が -mm に入って価値を証明されたら、Andrew やサブシステムメンテナが、
+メインラインへ入れるように Linus にプッシュします。
+
+メインカーネルツリーに含めるために Linus に送る前に、すべての新しいパッ
+チが -mm ツリーでテストされることが強く推奨されています。マージウィン
+ドウが開く前に -mm ツリーに現れなかったパッチはメインラインにマージさ
+れることは困難になります。
+
+これらのカーネルは安定して動作すべきシステムとして使うのには適切ではあ
+りませんし、カーネルブランチの中でももっとも動作にリスクが高いものです。
+
+もしあなたが、カーネル開発プロセスの支援をしたいと思っているのであれば、
+どうぞこれらのカーネルリリースをテストに使ってみて、そしてもし問題があ
+れば、またもし全てが正しく動作したとしても、linux-kernel メーリングリ
+ストにフィードバックを提供してください。
+
+すべての他の実験的パッチに加えて、これらのカーネルは通常リリース時点で
+メインラインの -git カーネルに含まれる全ての変更も含んでいます。
+
+-mm カーネルは決まったスケジュールではリリースされません、しかし通常幾
+つかの -mm カーネル (1 から 3 が普通)が各-rc カーネルの間にリリースさ
+れます。
+
+サブシステム毎のカーネルツリーとパッチ
+-------------------------------------------
+
+カーネルの様々な領域で何が起きているかを見られるようにするため、多くの
+カーネルサブシステム開発者は彼らの開発ツリーを公開しています。これらの
+ツリーは説明したように -mm カーネルリリースに入れ込まれます。
+
+以下はさまざまなカーネルツリーの中のいくつかのリスト-
+
+ git ツリー-
+ - Kbuild の開発ツリー、Sam Ravnborg <sam@ravnborg.org>
+ git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
+
+ - ACPI の開発ツリー、 Len Brown <len.brown@intel.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
+
+ - Block の開発ツリー、Jens Axboe <axboe@suse.de>
+ git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
+
+ - DRM の開発ツリー、Dave Airlie <airlied@linux.ie>
+ git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
+
+ - ia64 の開発ツリー、Tony Luck <tony.luck@intel.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
+
+ - infiniband, Roland Dreier <rolandd@cisco.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
+
+ - libata, Jeff Garzik <jgarzik@pobox.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
+
+ - ネットワークドライバ, Jeff Garzik <jgarzik@pobox.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
+
+ - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net>
+ git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
+
+ - SCSI, James Bottomley <James.Bottomley@hansenpartnership.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
+
+ - x86, Ingo Molnar <mingo@elte.hu>
+ git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git
+
+ quilt ツリー-
+ - USB, ドライバコアと I2C, Greg Kroah-Hartman <gregkh@suse.de>
+ kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+
+ その他のカーネルツリーは http://git.kernel.org/ と MAINTAINERS ファ
+ イルに一覧表があります。
+
+バグレポート
+-------------
+
+bugzilla.kernel.org は Linux カーネル開発者がカーネルのバグを追跡する
+場所です。ユーザは見つけたバグの全てをこのツールで報告すべきです。
+どう kernel bugzilla を使うかの詳細は、以下を参照してください-
+ http://bugzilla.kernel.org/page.cgi?id=faq.html
+メインカーネルソースディレクトリにあるファイル REPORTING-BUGS はカーネ
+ルバグらしいものについてどうレポートするかの良いテンプレートであり、問
+題の追跡を助けるためにカーネル開発者にとってどんな情報が必要なのかの詳
+細が書かれています。
+
+バグレポートの管理
+-------------------
+
+あなたのハッキングのスキルを訓練する最高の方法のひとつに、他人がレポー
+トしたバグを修正することがあります。あなたがカーネルをより安定化させる
+こに寄与するということだけでなく、あなたは 現実の問題を修正することを
+学び、自分のスキルも強化でき、また他の開発者があなたの存在に気がつき
+ます。バグを修正することは、多くの開発者の中から自分が功績をあげる最善
+の道です、なぜなら多くの人は他人のバグの修正に時間を浪費することを好ま
+ないからです。
+
+すでにレポートされたバグのために仕事をするためには、
+http://bugzilla.kernel.org に行ってください。もし今後のバグレポートに
+ついてアドバイスを受けたいのであれば、bugme-new メーリングリスト(新し
+いバグレポートだけがここにメールされる) または bugme-janitor メーリン
+グリスト(bugzilla の変更毎にここにメールされる)を購読できます。
+
+ http://lists.linux-foundation.org/mailman/listinfo/bugme-new
+ http://lists.linux-foundation.org/mailman/listinfo/bugme-janitors
+
+メーリングリスト
+-------------
+
+上のいくつかのドキュメントで述べていますが、コアカーネル開発者の大部分
+は Linux kernel メーリングリストに参加しています。このリストの登録/脱
+退の方法については以下を参照してください-
+ http://vger.kernel.org/vger-lists.html#linux-kernel
+
+このメーリングリストのアーカイブは web 上の多数の場所に存在します。こ
+れらのアーカイブを探すにはサーチエンジンを使いましょう。例えば-
+ http://dir.gmane.org/gmane.linux.kernel
+
+リストに投稿する前にすでにその話題がアーカイブに存在するかどうかを検索
+することを是非やってください。多数の事がすでに詳細に渡って議論されて
+おり、アーカイブにのみ記録されています。
+
+大部分のカーネルサブシステムも自分の個別の開発を実施するメーリングリス
+トを持っています。個々のグループがどんなリストを持っているかは、
+MAINTAINERS ファイルにリストがありますので参照してください。
+
+多くのリストは kernel.org でホストされています。これらの情報は以下にあ
+ります-
+ http://vger.kernel.org/vger-lists.html
+
+メーリングリストを使う場合、良い行動習慣に従うようにしましょう。
+少し安っぽいが、以下の URL は上のリスト(や他のリスト)で会話する場合の
+シンプルなガイドラインを示しています-
+ http://www.albion.com/netiquette/
+
+もし複数の人があなたのメールに返事をした場合、CC: で受ける人のリストは
+だいぶ多くなるでしょう。良い理由がない場合、CC: リストから誰かを削除を
+しないように、また、メーリングリストのアドレスだけにリプライすることの
+ないようにしましょう。1つは送信者から、もう1つはリストからのように、メー
+ルを2回受けることになってもそれに慣れ、しゃれたメールヘッダーを追加し
+てこの状態を変えようとしないように。人々はそのようなことは好みません。
+
+今までのメールでのやりとりとその間のあなたの発言はそのまま残し、
+"John Kernlehacker wrote ...:" の行をあなたのリプライの先頭行にして、
+メールの先頭でなく、各引用行の間にあなたの言いたいことを追加するべきで
+す。
+
+もしパッチをメールに付ける場合は、Documentaion/SubmittingPatches に提
+示されているように、それは プレーンな可読テキストにすることを忘れない
+ようにしましょう。カーネル開発者は 添付や圧縮したパッチを扱いたがりま
+せん-
+彼らはあなたのパッチの行毎にコメントを入れたいので、そのためにはそうす
+るしかありません。あなたのメールプログラムが空白やタブを圧縮しないよう
+に確認した方が良いです。最初の良いテストとしては、自分にメールを送って
+みて、そのパッチを自分で当ててみることです。もしそれがうまく行かないな
+ら、あなたのメールプログラムを直してもらうか、正しく動くように変えるべ
+きです。
+
+とりわけ、他の登録者に対する尊敬を表すようにすることを覚えておいてくだ
+さい。
+
+コミュニティと共に働くこと
+--------------------------
+
+カーネルコミュニティのゴールは可能なかぎり最高のカーネルを提供すること
+です。あなたがパッチを受け入れてもらうために投稿した場合、それは、技術
+的メリットだけがレビューされます。その際、あなたは何を予想すべきでしょ
+うか?
+ - 批判
+ - コメント
+ - 変更の要求
+ - パッチの正当性の証明要求
+ - 沈黙
+
+思い出してください、ここはあなたのパッチをカーネルに入れる話です。あ
+なたは、あなたのパッチに対する批判とコメントを受け入れるべきで、それら
+を技術的レベルで評価して、パッチを再作成するか、なぜそれらの変更をすべ
+きでないかを明確で簡潔な理由の説明を提供してください。
+もし、あなたのパッチに何も反応がない場合、たまにはメールの山に埋もれて
+見逃され、あなたの投稿が忘れられてしまうこともあるので、数日待って再度
+投稿してください。
+
+あなたがやるべきでないものは?
+ - 質問なしにあなたのパッチが受け入れられると想像すること
+ - 守りに入ること
+ - コメントを無視すること
+ - 要求された変更を何もしないでパッチを出し直すこと
+
+可能な限り最高の技術的解決を求めているコミュニティでは、パッチがどのく
+らい有益なのかについては常に異なる意見があります。あなたは協調的である
+べきですし、また、あなたのアイディアをカーネルに対してうまく合わせるよ
+うにすることが望まれています。もしくは、最低限あなたのアイディアがそれ
+だけの価値があるとすすんで証明するようにしなければなりません。
+正しい解決に向かって進もうという意志がある限り、間違うことがあっても許
+容されることを忘れないでください。
+
+あなたの最初のパッチに単に 1ダースもの修正を求めるリストの返答になるこ
+とも普通のことです。これはあなたのパッチが受け入れられないということで
+は *ありません*、そしてあなた自身に反対することを意味するのでも *ありま
+せん*。単に自分のパッチに対して指摘された問題を全て修正して再送すれば
+良いのです。
+
+
+カーネルコミュニティと企業組織のちがい
+-----------------------------------------------------------------
+
+カーネルコミュニティは大部分の伝統的な会社の開発環境とは異ったやり方で
+動いています。以下は問題を避けるためにできると良いことのリストです-
+
+ あなたの提案する変更について言うときのうまい言い方:
+
+ - "これは複数の問題を解決します"
+ - "これは2000行のコードを削除します"
+ - "以下のパッチは、私が言おうとしていることを説明するものです"
+ - "私はこれを5つの異なるアーキテクチャでテストしたのですが..."
+ - "以下は一連の小さなパッチ群ですが..."
+ - "これは典型的なマシンでの性能を向上させます.."
+
+ やめた方が良い悪い言い方:
+
+ - このやり方で AIX/ptx/Solaris ではできたので、できるはずだ
+ - 私はこれを20年もの間やってきた、だから
+ - これは、私の会社が金儲けをするために必要だ
+ - これは我々のエンタープライズ向け商品ラインのためである
+ - これは 私が自分のアイディアを記述した、1000ページの設計資料である
+ - 私はこれについて、6ケ月作業している。
+ - 以下は ... に関する5000行のパッチです
+ - 私は現在のぐちゃぐちゃを全部書き直した、それが以下です...
+ - 私は〆切がある、そのためこのパッチは今すぐ適用される必要がある
+
+カーネルコミュニティが大部分の伝統的なソフトウェアエンジニアリングの労
+働環境と異なるもう一つの点は、やりとりに顔を合わせないということです。
+email と irc を第一のコミュニケーションの形とする一つの利点は、性別や
+民族の差別がないことです。Linux カーネルの職場環境は女性や少数民族を受
+容します。なぜなら、email アドレスによってのみあなたが認識されるからで
+す。
+国際的な側面からも活動領域を均等にするようにします。なぜならば、あなた
+は人の名前で性別を想像できないからです。ある男性が アンドレアという名
+前で、女性の名前は パット かもしれません (訳注 Andrea は米国では女性、
+それ以外(欧州など)では男性名として使われることが多い。同様に、Pat は
+Patricia (主に女性名)や Patrick (主に男性名)の略称)。
+Linux カーネルの活動をして、意見を表明したことがある大部分の女性は、前
+向きな経験をもっています。
+
+言葉の壁は英語が得意でない一部の人には問題になります。
+メーリングリストの中できちんとアイディアを交換するには、相当うまく英語
+を操れる必要があることもあります。そのため、あなたは自分のメール
+を送る前に英語で意味が通じているかをチェックすることをお薦めします。
+
+変更を分割する
+---------------------
+
+Linux カーネルコミュニティは、一度に大量のコードの塊を喜んで受容するこ
+とはありません。変更は正確に説明される必要があり、議論され、小さい、個
+別の部分に分割する必要があります。これはこれまで多くの会社がやり慣れて
+きたことと全く正反対のことです。あなたのプロポーザルは、開発プロセスのと
+ても早い段階から紹介されるべきです。そうすれば あなたは自分のやってい
+ることにフィードバックを得られます。これは、コミュニティからみれば、あ
+なたが彼らと一緒にやっているように感じられ、単にあなたの提案する機能の
+ゴミ捨て場として使っているのではない、と感じられるでしょう。
+しかし、一度に 50 もの email をメーリングリストに送りつけるようなことは
+やってはいけません、あなたのパッチ群はいつもどんな時でもそれよりは小さ
+くなければなりません。
+
+パッチを分割する理由は以下です-
+
+1) 小さいパッチはあなたのパッチが適用される見込みを大きくします、カー
+ ネルの人達はパッチが正しいかどうかを確認する時間や労力をかけないか
+ らです。5行のパッチはメンテナがたった1秒見るだけで適用できます。
+ しかし、500行のパッチは、正しいことをレビューするのに数時間かかるか
+ もしれません(時間はパッチのサイズなどにより指数関数に比例してかかり
+ ます)
+
+ 小さいパッチは何かあったときにデバッグもとても簡単になります。パッ
+ チを1個1個取り除くのは、とても大きなパッチを当てた後に(かつ、何かお
+ かしくなった後で)解剖するのに比べればとても簡単です。
+
+2) 小さいパッチを送るだけでなく、送るまえに、書き直して、シンプルにす
+ る(もしくは、単に順番を変えるだけでも)ことも、とても重要です。
+
+以下はカーネル開発者の Al Viro のたとえ話です:
+
+ "生徒の数学の宿題を採点する先生のことを考えてみてください、先
+ 生は生徒が解に到達するまでの試行錯誤を見たいとは思わないでしょ
+ う。先生は簡潔な最高の解を見たいのです。良い生徒はこれを知って
+ おり、そして最終解の前の中間作業を提出することは決してないので
+ す"
+
+ カーネル開発でもこれは同じです。メンテナ達とレビューア達は、
+ 問題を解決する解の背後になる思考プロセスを見たいとは思いません。
+ 彼らは単純であざやかな解決方法を見たいのです。
+
+あざやかな解を説明するのと、コミュニティと共に仕事をし、未解決の仕事を
+議論することのバランスをキープするのは難しいかもしれません。
+ですから、開発プロセスの早期段階で改善のためのフィードバックをもらうよ
+うにするのも良いですが、変更点を小さい部分に分割して全体ではまだ完成し
+ていない仕事を(部分的に)取り込んでもらえるようにすることも良いことです。
+
+また、でき上がっていないものや、"将来直す" ようなパッチを、本流に含め
+てもらうように送っても、それは受け付けられないことを理解してください。
+
+あなたの変更を正当化する
+-------------------
+
+あなたのパッチを分割するのと同時に、なぜその変更を追加しなければならな
+いかを Linux コミュニティに知らせることはとても重要です。新機能は必要
+性と有用性で正当化されなければなりません。
+
+あなたの変更の説明
+--------------------
+
+あなたのパッチを送付する場合には、メールの中のテキストで何を言うかにつ
+いて、特別に注意を払ってください。この情報はパッチの ChangeLog に使わ
+れ、いつも皆がみられるように保管されます。これは次のような項目を含め、
+パッチを完全に記述するべきです-
+
+ - なぜ変更が必要か
+ - パッチ全体の設計アプローチ
+ - 実装の詳細
+ - テスト結果
+
+これについて全てがどのようにあるべきかについての詳細は、以下のドキュメ
+ントの ChangeLog セクションを見てください-
+ "The Perfect Patch"
+ http://userweb.kernel.org/~akpm/stuff/tpp.txt
+
+これらのどれもが、時にはとても困難です。これらの慣例を完璧に実施するに
+は数年かかるかもしれません。これは継続的な改善のプロセスであり、そのた
+めには多数の忍耐と決意を必要とするものです。でも、諦めないで、これは可
+能なことです。多数の人がすでにできていますし、彼らも皆最初はあなたと同
+じところからスタートしたのですから。
+
+Paolo Ciarrocchi に感謝、彼は彼の書いた "Development Process"
+(http://linux.tar.bz/articles/2.6-development_process)セクショ
+ンをこのテキストの原型にすることを許可してくれました。
+Rundy Dunlap と Gerrit Huizenga はメーリングリストでやるべきこととやっ
+てはいけないことのリストを提供してくれました。
+以下の人々のレビュー、コメント、貢献に感謝。
+Pat Mochel, Hanna Linder, Randy Dunlap, Kay Sievers,
+Vojtech Pavlik, Jan Kara, Josh Boyer, Kees Cook, Andrew Morton, Andi
+Kleen, Vadim Lobanov, Jesper Juhl, Adrian Bunk, Keri Harris, Frans Pop,
+David A. Wheeler, Junio Hamano, Michael Kerrisk, と Alex Shepard
+彼らの支援なしでは、このドキュメントはできなかったでしょう。
+
+Maintainer: Greg Kroah-Hartman <greg@kroah.com>
diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist
new file mode 100644
index 0000000..6c42e07
--- /dev/null
+++ b/Documentation/ja_JP/SubmitChecklist
@@ -0,0 +1,111 @@
+NOTE:
+This is a version of Documentation/SubmitChecklist into Japanese.
+This document is maintained by Takenori Nagano <t-nagano@ah.jp.nec.com>
+and the JF Project team <http://www.linux.or.jp/JF/>.
+If you find any difference between this document and the original file
+or a problem with the translation,
+please contact the maintainer of this file or JF project.
+
+Please also note that the purpose of this file is to be easier to read
+for non English (read: Japanese) speakers and is not intended as a
+fork. So if you have any comments or updates of this file, please try
+to update the original English file first.
+
+Last Updated: 2008/07/14
+==================================
+これは、
+linux-2.6.26/Documentation/SubmitChecklist の和訳です。
+
+翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
+翻訳日: 2008/07/14
+翻訳者: Takenori Nagano <t-nagano at ah dot jp dot nec dot com>
+校正者: Masanori Kobayashi さん <zap03216 at nifty dot ne dot jp>
+==================================
+
+
+Linux カーネルパッチ投稿者向けチェックリスト
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+本書では、パッチをより素早く取り込んでもらいたい開発者が実践すべき基本的な事柄
+をいくつか紹介します。ここにある全ての事柄は、Documentation/SubmittingPatches
+などのLinuxカーネルパッチ投稿に際しての心得を補足するものです。
+
+ 1: 妥当なCONFIGオプションや変更されたCONFIGオプション、つまり =y, =m, =n
+ 全てで正しくビルドできることを確認してください。その際、gcc及びリンカが
+ warningやerrorを出していないことも確認してください。
+
+ 2: allnoconfig, allmodconfig オプションを用いて正しくビルドできることを
+ 確認してください。
+
+ 3: 手許のクロスコンパイルツールやOSDLのPLMのようなものを用いて、複数の
+ アーキテクチャにおいても正しくビルドできることを確認してください。
+
+ 4: 64bit長の'unsigned long'を使用しているppc64は、クロスコンパイルでの
+ チェックに適当なアーキテクチャです。
+
+ 5: カーネルコーディングスタイルに準拠しているかどうか確認してください(!)
+
+ 6: CONFIGオプションの追加・変更をした場合には、CONFIGメニューが壊れていない
+ ことを確認してください。
+
+ 7: 新しくKconfigのオプションを追加する際には、必ずそのhelpも記述してください。
+
+ 8: 適切なKconfigの依存関係を考えながら慎重にチェックしてください。
+ ただし、この作業はマシンを使ったテストできちんと行うのがとても困難です。
+ うまくやるには、自分の頭で考えることです。
+
+ 9: sparseを利用してちゃんとしたコードチェックをしてください。
+
+10: 'make checkstack' と 'make namespacecheck' を利用し、問題が発見されたら
+ 修正してください。'make checkstack' は明示的に問題を示しませんが、どれか
+ 1つの関数が512バイトより大きいスタックを使っていれば、修正すべき候補と
+ なります。
+
+11: グローバルなkernel API を説明する kernel-doc をソースの中に含めてください。
+ ( staticな関数においては必須ではありませんが、含めてもらっても結構です )
+ そして、'make htmldocs' もしくは 'make mandocs' を利用して追記した
+ ドキュメントのチェックを行い、問題が見つかった場合には修正を行ってください。
+
+12: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB,
+ CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK,
+ CONFIG_DEBUG_SPINLOCK_SLEEP これら全てを同時に有効にして動作確認を
+ 行ってください。
+
+13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で
+ ビルドした上、動作確認を行ってください。
+
+14: もしパッチがディスクのI/O性能などに影響を与えるようであれば、
+ 'CONFIG_LBD'オプションを有効にした場合と無効にした場合の両方で
+ テストを実施してみてください。
+
+15: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。
+
+16: /proc に新しいエントリを追加した場合には、Documentation/ 配下に
+ 必ずドキュメントを追加してください。
+
+17: 新しいブートパラメータを追加した場合には、
+ 必ずDocumentation/kernel-parameters.txt に説明を追加してください。
+
+18: 新しくmoduleにパラメータを追加した場合には、MODULE_PARM_DESC()を
+ 利用して必ずその説明を記述してください。
+
+19: 新しいuserspaceインタフェースを作成した場合には、Documentation/ABI/ に
+ Documentation/ABI/README を参考にして必ずドキュメントを追加してください。
+
+20: 'make headers_check'を実行して全く問題がないことを確認してください。
+
+21: 少なくともslabアロケーションとpageアロケーションに失敗した場合の
+ 挙動について、fault-injectionを利用して確認してください。
+ Documentation/fault-injection/ を参照してください。
+
+ 追加したコードがかなりの量であったならば、サブシステム特有の
+ fault-injectionを追加したほうが良いかもしれません。
+
+22: 新たに追加したコードは、`gcc -W'でコンパイルしてください。
+ このオプションは大量の不要なメッセージを出力しますが、
+ "warning: comparison between signed and unsigned" のようなメッセージは、
+ バグを見つけるのに役に立ちます。
+
+23: 投稿したパッチが -mm パッチセットにマージされた後、全ての既存のパッチや
+ VM, VFS およびその他のサブシステムに関する様々な変更と、現時点でも共存
+ できることを確認するテストを行ってください。
diff --git a/Documentation/ja_JP/SubmittingPatches b/Documentation/ja_JP/SubmittingPatches
new file mode 100644
index 0000000..a9dc124
--- /dev/null
+++ b/Documentation/ja_JP/SubmittingPatches
@@ -0,0 +1,556 @@
+NOTE:
+This is a version of Documentation/SubmittingPatches into Japanese.
+This document is maintained by Keiichi KII <k-keiichi@bx.jp.nec.com>
+and the JF Project team <http://www.linux.or.jp/JF/>.
+If you find any difference between this document and the original file
+or a problem with the translation,
+please contact the maintainer of this file or JF project.
+
+Please also note that the purpose of this file is to be easier to read
+for non English (read: Japanese) speakers and is not intended as a
+fork. So if you have any comments or updates of this file, please try
+to update the original English file first.
+
+Last Updated: 2007/10/24
+==================================
+これは、
+linux-2.6.23/Documentation/SubmittingPatches の和訳
+です。
+翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
+翻訳日: 2007/10/17
+翻訳者: Keiichi Kii <k-keiichi at bx dot jp dot nec dot com>
+校正者: Masanari Kobayashi さん <zap03216 at nifty dot ne dot jp>
+ Matsukura さん <nbh--mats at nifty dot com>
+==================================
+
+ Linux カーネルに変更を加えるための Howto
+ 又は
+ かの Linus Torvalds の取り扱い説明書
+
+Linux カーネルに変更を加えたいと思っている個人又は会社にとって、パッ
+チの投稿に関連した仕組みに慣れていなければ、その過程は時々みなさんを
+おじけづかせることもあります。この文章はあなたの変更を大いに受け入れ
+てもらえやすくする提案を集めたものです。
+
+コードを投稿する前に、Documentation/SubmitChecklist の項目リストに目
+を通してチェックしてください。もしあなたがドライバーを投稿しようとし
+ているなら、Documentation/SubmittingDrivers にも目を通してください。
+
+--------------------------------------------
+セクション1 パッチの作り方と送り方
+--------------------------------------------
+
+1) 「 diff -up 」
+------------
+
+パッチの作成には「 diff -up 」又は「 diff -uprN 」を使ってください。
+
+Linux カーネルに対する全ての変更は diff(1) コマンドによるパッチの形式で
+生成してください。パッチを作成するときには、diff(1) コマンドに「 -u 」引
+数を指定して、unified 形式のパッチを作成することを確認してください。また、
+変更がどの C 関数で行われたのかを表示する「 -p 」引数を使ってください。
+この引数は生成した差分をずっと読みやすくしてくれます。パッチは Linux
+カーネルソースの中のサブディレクトリではなく Linux カーネルソースのルート
+ディレクトリを基準にしないといけません。
+
+1個のファイルについてのパッチを作成するためには、ほとんどの場合、
+以下の作業を行えば十分です。
+
+ SRCTREE= linux-2.6
+ MYFILE= drivers/net/mydriver.c
+
+ cd $SRCTREE
+ cp $MYFILE $MYFILE.orig
+ vi $MYFILE # make your change
+ cd ..
+ diff -up $SRCTREE/$MYFILE{.orig,} > /tmp/patch
+
+複数のファイルについてのパッチを作成するためには、素の( vanilla )、す
+なわち変更を加えてない Linux カーネルを展開し、自分の Linux カーネル
+ソースとの差分を生成しないといけません。例えば、
+
+ MYSRC= /devel/linux-2.6
+
+ tar xvfz linux-2.6.12.tar.gz
+ mv linux-2.6.12 linux-2.6.12-vanilla
+ diff -uprN -X linux-2.6.12-vanilla/Documentation/dontdiff \
+ linux-2.6.12-vanilla $MYSRC > /tmp/patch
+
+dontdiff ファイルには Linux カーネルのビルドプロセスの過程で生成された
+ファイルの一覧がのっています。そして、それらはパッチを生成する diff(1)
+コマンドで無視されるべきです。dontdiff ファイルは 2.6.12 以後のバージョ
+ンの Linux カーネルソースツリーに含まれています。それより前のバージョン
+の Linux カーネルソースツリーに対する dontdiff ファイルは、
+<http://www.xenotime.net/linux/doc/dontdiff>から取得することができます。
+
+投稿するパッチの中に関係のない余分なファイルが含まれていないことを確
+認してください。diff(1) コマンドで生成したパッチがあなたの意図したとお
+りのものであることを確認してください。
+
+もしあなたのパッチが多くの差分を生み出すのであれば、あなたはパッチ
+を意味のあるひとまとまりごとに分けたいと思うかもしれません。
+これは他のカーネル開発者にとってレビューしやすくなるので、あなたの
+パッチを受け入れてもらうためにはとても重要なことです。これを補助でき
+る多くのスクリプトがあります。
+
+Quilt:
+http://savannah.nongnu.org/projects/quilt
+
+Andrew Morton's patch scripts:
+http://www.zip.com.au/~akpm/linux/patches/
+このリンクの先のスクリプトの代わりとして、quilt がパッチマネジメント
+ツールとして推奨されています(上のリンクを見てください)。
+
+2) パッチに対する説明
+
+パッチの中の変更点に対する技術的な詳細について説明してください。
+
+説明はできる限り具体的に。もっとも悪い説明は「ドライバー X を更新」、
+「ドライバー X に対するバグフィックス」あるいは「このパッチはサブシス
+テム X に対する更新を含んでいます。どうか取り入れてください。」などです。
+
+説明が長くなりだしたのであれば、おそらくそれはパッチを分ける必要がある
+という兆候です。次の #3 を見てください。
+
+3) パッチの分割
+
+意味のあるひとまとまりごとに変更を個々のパッチファイルに分けてください。
+
+例えば、もし1つのドライバーに対するバグフィックスとパフォーマンス強
+化の両方の変更を含んでいるのであれば、その変更を2つ以上のパッチに分
+けてください。もし変更箇所に API の更新と、その新しい API を使う新たな
+ドライバーが含まれているなら、2つのパッチに分けてください。
+
+一方で、もしあなたが多数のファイルに対して意味的に同じ1つの変更を加え
+るのであれば、その変更を1つのパッチにまとめてください。言いかえると、
+意味的に同じ1つの変更は1つのパッチの中に含まれます。
+
+あるパッチが変更を完結させるために他のパッチに依存していたとしても、
+それは問題ありません。パッチの説明の中で「このパッチはパッチ X に依存
+している」と簡単に注意書きをつけてください。
+
+もしパッチをより小さなパッチの集合に凝縮することができないなら、まずは
+15かそこらのパッチを送り、そのレビューと統合を待って下さい。
+
+4) パッチのスタイルチェック
+
+あなたのパッチが基本的な( Linux カーネルの)コーディングスタイルに違反し
+ていないかをチェックして下さい。その詳細を Documentation/CodingStyle で
+見つけることができます。コーディングスタイルの違反はレビューする人の
+時間を無駄にするだけなので、恐らくあなたのパッチは読まれることすらなく
+拒否されるでしょう。
+
+あなたはパッチを投稿する前に最低限パッチスタイルチェッカー
+( scripts/patchcheck.pl )を利用してパッチをチェックすべきです。
+もしパッチに違反がのこっているならば、それらの全てについてあなたは正当な
+理由を示せるようにしておく必要があります。
+
+5) 電子メールの宛先の選び方
+
+MAINTAINERS ファイルとソースコードに目を通してください。そして、その変
+更がメンテナのいる特定のサブシステムに加えられるものであることが分か
+れば、その人に電子メールを送ってください。
+
+もし、メンテナが載っていなかったり、メンテナからの応答がないなら、
+LKML ( linux-kernel@vger.kernel.org )へパッチを送ってください。ほとんど
+のカーネル開発者はこのメーリングリストに目を通しており、変更に対して
+コメントを得ることができます。
+
+15個より多くのパッチを同時に vger.kernel.org のメーリングリストへ送らな
+いでください!!!
+
+Linus Torvalds は Linux カーネルに入る全ての変更に対する最終的な意思決定者
+です。電子メールアドレスは torvalds@linux-foundation.org になります。彼は
+多くの電子メールを受け取っているため、できる限り彼に電子メールを送るのは
+避けるべきです。
+
+バグフィックスであったり、自明な変更であったり、話し合いをほとんど
+必要としないパッチは Linus へ電子メールを送るか CC しなければなりません。
+話し合いを必要としたり、明確なアドバンテージがないパッチは、通常まず
+は LKML へ送られるべきです。パッチが議論された後にだけ、そのパッチを
+Linus へ送るべきです。
+
+6) CC (カーボンコピー)先の選び方
+
+特に理由がないなら、LKML にも CC してください。
+
+Linus 以外のカーネル開発者は変更に気づく必要があり、その結果、彼らはそ
+の変更に対してコメントをくれたり、コードに対してレビューや提案をくれ
+るかもしれません。LKML とは Linux カーネル開発者にとって一番中心的なメー
+リングリストです。USB やフレームバッファデバイスや VFS や SCSI サブシステ
+ムなどの特定のサブシステムに関するメーリングリストもあります。あなた
+の変更に、はっきりと関連のあるメーリングリストについて知りたければ
+MAINTAINERS ファイルを参照してください。
+
+VGER.KERNEL.ORG でホスティングされているメーリングリストの一覧が下記の
+サイトに載っています。
+<http://vger.kernel.org/vger-lists.html>
+
+もし、変更がユーザランドのカーネルインタフェースに影響を与え
+るのであれば、MAN-PAGES のメンテナ( MAINTAINERS ファイルに一覧
+があります)に man ページのパッチを送ってください。少なくとも
+情報がマニュアルページの中に入ってくるように、変更が起きたという
+通知を送ってください。
+
+たとえ、メンテナが #4 で反応がなかったとしても、メンテナのコードに変更を
+加えたときには、いつもメンテナに CC するのを忘れないようにしてください。
+
+小さなパッチであれば、Adrian Bunk が管理している Trivial Patch Monkey
+(ちょっとしたパッチを集めている)<trivial@kernel.org>に CC してもいい
+です。ちょっとしたパッチとは以下のルールのどれか1つを満たしていなけ
+ればなりません。
+ ・ドキュメントのスペルミスの修正
+ ・grep(1) コマンドによる検索を困難にしているスペルの修正
+ ・コンパイル時の警告の修正(無駄な警告が散乱することは好ましくないた
+ めです)
+ ・コンパイル問題の修正(それらの修正が本当に正しい場合に限る)
+ ・実行時の問題の修正(それらの修正が本当に問題を修正している場合に限る)
+ ・廃止予定の関数やマクロを使用しているコードの除去(例 check_region )
+ ・問い合わせ先やドキュメントの修正
+ ・移植性のないコードから移植性のあるコードへの置き換え(小さい範囲で
+ あればアーキテクチャ特有のことでも他の人がコピーできます)
+ ・作者やメンテナによる修正(すなわち patch monkey の再転送モード)
+URL: <http://www.kernel.org/pub/linux/kernel/people/bunk/trivial/>
+
+7) MIME やリンクや圧縮ファイルや添付ファイルではなくプレインテキストのみ
+
+Linus や他のカーネル開発者はあなたが投稿した変更を読んで、コメントでき
+る必要があります。カーネル開発者にとって、あなたが書いたコードの特定の
+部分にコメントをするために、標準的な電子メールクライアントで変更が引用
+できることは重要です。
+
+上記の理由で、すべてのパッチは文中に含める形式の電子メールで投稿さ
+れるべきです。警告:あなたがパッチをコピー&ペーストする際には、パッ
+チを改悪するエディターの折り返し機能に注意してください。
+
+パッチを圧縮の有無に関わらず MIME 形式で添付しないでください。多くのポ
+ピュラーな電子メールクライアントは MIME 形式の添付ファイルをプレーンテ
+キストとして送信するとは限らないでしょう。そうなると、電子メールクラ
+イアントがコードに対するコメントを付けることをできなくします。また、
+MIME 形式の添付ファイルは Linus に手間を取らせることになり、その変更を
+受け入れてもらう可能性が低くなってしまいます。
+
+例外:お使いの電子メールクライアントがパッチをめちゃくちゃにするので
+あれば、誰かが MIME 形式のパッチを再送するよう求めるかもしれません。
+
+警告: Mozilla のような特定の電子メールクライアントは電子メールの
+ヘッダに以下のものを付加して送ります。
+---- message header ----
+Content-Type: text/plain; charset=us-ascii; format=flowed
+---- message header ----
+問題は、「 format=flowed 」が付いた電子メールを特定の受信側の電子メール
+クライアントがタブをスペースに置き換えるというような変更をすることです。
+したがって送られてきたパッチは壊れているように見えるでしょう。
+
+これを修正するには、mozilla の defaults/pref/mailnews.js ファイルを
+以下のように修正します。
+pref("mailnews.send_plaintext_flowed", false); // RFC 2646=======
+pref("mailnews.display.disable_format_flowed_support", true);
+
+8) 電子メールのサイズ
+
+パッチを Linus へ送るときは常に #7 の手順に従ってください。
+
+大きなパッチはメーリングリストやメンテナにとって不親切です。パッチが
+未圧縮で 40KB を超えるようであるなら、インターネット上のアクセス可能な
+サーバに保存し、保存場所を示す URL を伝えるほうが適切です。
+
+9) カーネルバージョンの明記
+
+パッチが対象とするカーネルのバージョンをパッチの概要か電子メールの
+サブジェクトに付けることが重要です。
+
+パッチが最新バージョンのカーネルに正しく適用できなければ、Linus は
+そのパッチを採用しないでしょう。
+
+10) がっかりせず再投稿
+
+パッチを投稿した後は、辛抱強く待っていてください。Linus があなたのパッ
+チを気に入って採用すれば、Linus がリリースする次のバージョンのカーネル
+の中で姿を見せるでしょう。
+
+しかし、パッチが次のバージョンのカーネルに入っていないなら、いくつもの
+理由があるのでしょう。その原因を絞り込み、間違っているものを正し、更新
+したパッチを投稿するのはあなたの仕事です。
+
+Linus があなたのパッチに対して何のコメントもなく不採用にすることは極め
+て普通のことです。それは自然な姿です。もし、Linus があなたのパッチを受
+け取っていないのであれば、以下の理由が考えられます。
+* パッチが最新バージョンの Linux カーネルにきちんと適用できなかった
+* パッチが LKML で十分に議論されていなかった
+* スタイルの問題(セクション2を参照)
+* 電子メールフォーマットの問題(このセクションを参照)
+* パッチに対する技術的な問題
+* Linus はたくさんの電子メールを受け取っているので、どさくさに紛れて見
+ 失った
+* 不愉快にさせている
+
+判断できない場合は、LKML にコメントを頼んでください。
+
+11) サブジェクトに「 PATCH 」
+
+Linus や LKML への大量の電子メールのために、サブジェクトのプレフィックスに
+「 [PATCH] 」を付けることが慣習となっています。これによって Linus や他の
+カーネル開発者がパッチであるのか、又は、他の議論に関する電子メールであるの
+かをより簡単に識別できます。
+
+12) パッチへの署名
+
+誰が何をしたのかを追いかけやすくするために (特に、パッチが何人かの
+メンテナを経て最終的に Linux カーネルに取り込まれる場合のために)、電子
+メールでやり取りされるパッチに対して「 sign-off 」という手続きを導入し
+ました。
+
+「 sign-off 」とは、パッチがあなたの書いたものであるか、あるいは、
+あなたがそのパッチをオープンソースとして提供する権利を保持している、
+という証明をパッチの説明の末尾に一行記載するというものです。
+ルールはとても単純です。以下の項目を確認して下さい。
+
+ 原作者の証明書( DCO ) 1.1
+
+ このプロジェクトに寄与するものとして、以下のことを証明する。
+
+ (a) 本寄与は私が全体又は一部作成したものであり、私がそのファイ
+ ル中に明示されたオープンソースライセンスの下で公開する権利
+ を持っている。もしくは、
+
+ (b) 本寄与は、私が知る限り、適切なオープンソースライセンスでカバ
+ ーされている既存の作品を元にしている。同時に、私はそのライセ
+ ンスの下で、私が全体又は一部作成した修正物を、ファイル中で示
+ される同一のオープンソースライセンスで(異なるライセンスの下で
+ 投稿することが許可されている場合を除いて)投稿する権利を持って
+ いる。もしくは、
+
+ (c) 本寄与は(a)、(b)、(c)を証明する第3者から私へ直接提供された
+ ものであり、私はそれに変更を加えていない。
+
+ (d) 私はこのプロジェクトと本寄与が公のものであることに理解及び同意す
+ る。同時に、関与した記録(投稿の際の全ての個人情報と sign-off を
+ 含む)が無期限に保全されることと、当該プロジェクト又は関連する
+ オープンソースライセンスに沿った形で再配布されることに理解及び
+ 同意する。
+
+もしこれに同意できるなら、以下のような1行を追加してください。
+
+ Signed-off-by: Random J Developer <random@developer.example.org>
+
+実名を使ってください。(残念ですが、偽名や匿名による寄与はできません。)
+
+人によっては sign-off の近くに追加のタグを付加しています。それらは今のところ
+無視されますが、あなたはそのタグを社内の手続きに利用したり、sign-off に特別
+な情報を示したりすることができます。
+
+13) いつ Acked-by: を使うのか
+
+「 Signed-off-by: 」タグはその署名者がパッチの開発に関わっていたことやパッチ
+の伝播パスにいたことを示しています。
+
+ある人が直接パッチの準備や作成に関わっていないけれど、その人のパッチに対す
+る承認を記録し、示したいとします。その場合、その人を示すのに Acked-by: が使
+えます。Acked-by: はパッチのチェンジログにも追加されます。
+
+パッチの影響を受けるコードのメンテナがパッチに関わっていなかったり、パッチ
+の伝播パスにいなかった時にも、メンテナは Acked-by: をしばしば利用します。
+
+Acked-by: は Signed-off-by: のように公式なタグではありません。それはメンテナが
+少なくともパッチをレビューし、同意を示しているという記録です。そのような
+ことからパッチの統合者がメンテナの「うん、良いと思うよ」という発言を
+Acked-by: へ置き換えることがあります。
+
+Acked-by: が必ずしもパッチ全体の承認を示しているわけではありません。例えば、
+あるパッチが複数のサブシステムへ影響を与えており、その中の1つのサブシステム
+のメンテナからの Acked-by: を持っているとします。その場合、Acked-by: は通常
+そのメンテナのコードに影響を与える一部分だけに対する承認を示しています。
+この点は、ご自分で判断してください。(その Acked-by: が)疑わしい場合は、
+メーリングリストアーカイブの中の大元の議論を参照すべきです。
+
+14) 標準的なパッチのフォーマット
+
+標準的なパッチのサブジェクトは以下のとおりです。
+
+ Subject: [PATCH 001/123] subsystem: summary phrase
+
+標準的なパッチの、電子メールのボディは以下の項目を含んでいます。
+
+ - パッチの作成者を明記する「 from 」行
+
+ - 空行
+
+ - 説明本体。これはこのパッチを説明するために無期限のチェンジログ
+ (変更履歴)にコピーされます。
+
+ - 上述した「 Signed-off-by: 」行。これも説明本体と同じくチェン
+ ジログ内にコピーされます。
+
+ - マーカー行は単純に「 --- 」です。
+
+ - 余計なコメントは、チェンジログには不適切です。
+
+ - 実際のパッチ(差分出力)
+
+サブジェクト行のフォーマットは、アルファベット順で電子メールをとても
+ソートしやすいものになっています。(ほとんどの電子メールクライアント
+はソートをサポートしています)パッチのサブジェクトの連番は0詰めであ
+るため、数字でのソートとアルファベットでのソートは同じ結果になります。
+
+電子メールのサブジェクト内のサブシステム表記は、パッチが適用される
+分野またはサブシステムを識別できるようにすべきです。
+
+電子メールのサブジェクトの「概要の言い回し」はそのパッチの概要を正確
+に表現しなければなりません。「概要の言い回し」をファイル名にしてはい
+けません。一連のパッチ中でそれぞれのパッチは同じ「概要の言い回し」を
+使ってはいけません(「一連のパッチ」とは順序付けられた関連のある複数の
+パッチ群です)。
+
+あなたの電子メールの「概要の言い回し」がそのパッチにとって世界で唯
+一の識別子になるように心がけてください。「概要の言い回し」は git の
+チェンジログの中へずっと伝播していきます。「概要の言い回し」は、開
+発者が後でパッチを参照するために議論の中で利用するかもしれません。
+人々はそのパッチに関連した議論を読むために「概要の言い回し」を使って
+google で検索したがるでしょう。
+
+サブジェクトの例を二つ
+
+ Subject: [patch 2/5] ext2: improve scalability of bitmap searching
+ Subject: [PATCHv2 001/207] x86: fix eflags tracking
+
+「 from 」行は電子メールのボディの一番最初の行でなければなりません。
+その形式は以下のとおりです。
+
+ From: Original Author <author@example.com>
+
+「 from 」行はチェンジログの中で、そのパッチの作成者としてクレジットされ
+ている人を特定するものです。「 from 」行がかけていると、電子メールのヘッ
+ダーの「 From: 」が、チェンジログの中でパッチの作成者を決定するために使わ
+れるでしょう。
+
+説明本体は無期限のソースのチェンジログにコミットされます。なので、説明
+本体はそのパッチに至った議論の詳細を忘れているある程度の技量を持っている人
+がその詳細を思い出すことができるものでなければなりません。
+
+「 --- 」マーカー行はパッチ処理ツールに対して、チェンジログメッセージの終端
+部分を認識させるという重要な役目を果たします。
+
+「 --- 」マーカー行の後の追加コメントの良い使用方法の1つに diffstat コマンド
+があります。diffstat コマンドとは何のファイルが変更され、1ファイル当たり何行
+追加され何行消されたかを示すものです。diffstat コマンドは特に大きなパッチに
+おいて役立ちます。その時点でだけ又はメンテナにとってのみ関係のあるコメント
+は無期限に保存されるチェンジログにとって適切ではありません。そのため、この
+ようなコメントもマーカー行の後に書かれるべきです。ファイル名はカーネルソー
+スツリーのトップディレクトリからの表記でリストされるため、横方向のスペース
+をとり過ぎないように、diffstat コマンドにオプション「 -p 1 -w 70 」を指定し
+てください(インデントを含めてちょうど80列に合うでしょう)。
+
+適切なパッチのフォーマットの詳細についてはセクション3の参考文献を参照して
+ください。
+
+------------------------------------
+セクション2 - ヒントとTIPSと小技
+------------------------------------
+
+このセクションは Linux カーネルに変更を適用することに関係のある一般的な
+「お約束」の多くを載せています。物事には例外というものがあります。しか
+し例外を適用するには、本当に妥当な理由が不可欠です。あなたは恐らくこの
+セクションを Linus のコンピュータ・サイエンス101と呼ぶでしょう。
+
+1) Documentation/CodingStyleを参照
+
+言うまでもなく、あなたのコードがこのコーディングスタイルからあまりに
+も逸脱していると、レビューやコメントなしに受け取ってもらえないかもし
+れません。
+
+唯一の特筆すべき例外は、コードをあるファイルから別のファイルに移動
+するときです。この場合、コードを移動するパッチでは、移動されるコード
+に関して移動以外の変更を一切加えるべきではありません。これにより、
+コードの移動とあなたが行ったコードの修正を明確に区別できるようにな
+ります。これは実際に何が変更されたかをレビューする際の大きな助けに
+なるとともに、ツールにコードの履歴を追跡させることも容易になります。
+
+投稿するより前にパッチのスタイルチェッカー( scripts/checkpatch.pl )で
+あなたのパッチをチェックしてください。このスタイルチェッカーは最終結
+論としてではなく、指標としてみるべきです。もし、あなたのコードが違反
+はしているが修正するより良く見えるのであれば、おそらくそのままにする
+のがベストです。
+
+スタイルチェッカーによる3段階のレポート:
+ - エラー: 間違っている可能性が高い
+ - 警告:注意してレビューする必要がある
+ - チェック:考慮する必要がある
+
+あなたはパッチに残っている全ての違反について、それがなぜ必要なのか正当な
+理由を示せるようにしておく必要があります。
+
+2) #ifdefは見苦しい
+
+ifdef が散乱したコードは、読むのもメンテナンスするのも面倒です。コードの中
+で ifdef を使わないでください。代わりに、ヘッダファイルの中に ifdef を入れて、
+条件付きで、コードの中で使われる関数を「 static inline 」関数かマクロで定義し
+てください。後はコンパイラが、何もしない箇所を最適化して取り去ってくれるで
+しょう。
+
+まずいコードの簡単な例
+
+ dev = alloc_etherdev (sizeof(struct funky_private));
+ if (!dev)
+ return -ENODEV;
+ #ifdef CONFIG_NET_FUNKINESS
+ init_funky_net(dev);
+ #endif
+
+クリーンアップしたコードの例
+
+(in header)
+ #ifndef CONFIG_NET_FUNKINESS
+ static inline void init_funky_net (struct net_device *d) {}
+ #endif
+
+(in the code itself)
+ dev = alloc_etherdev (sizeof(struct funky_private));
+ if (!dev)
+ return -ENODEV;
+ init_funky_net(dev);
+
+3) マクロより「 static inline 」を推奨
+
+「 static inline 」関数はマクロよりもずっと推奨されています。それらは、
+型安全性があり、長さにも制限が無く、フォーマットの制限もありません。
+gcc においては、マクロと同じくらい軽いです。
+
+マクロは「 static inline 」が明らかに不適切であると分かる場所(高速化パスの
+いくつかの特定のケース)や「 static inline 」関数を使うことができないような
+場所(マクロの引数の文字列連結のような)にだけ使われるべきです。
+
+「 static inline 」は「 static __inline__ 」や「 extern inline 」や
+「 extern __inline__ 」よりも適切です。
+
+4) 設計に凝りすぎるな
+
+それが有用になるかどうか分からないような不明瞭な将来を見越した設計
+をしないでください。「できる限り簡単に、そして、それ以上簡単になら
+ないような設計をしてください。」
+
+----------------------
+セクション3 参考文献
+----------------------
+
+Andrew Morton, "The perfect patch" (tpp).
+ <http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt>
+
+Jeff Garzik, "Linux kernel patch submission format".
+ <http://linux.yyz.us/patch-format.html>
+
+Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer".
+ <http://www.kroah.com/log/2005/03/31/>
+ <http://www.kroah.com/log/2005/07/08/>
+ <http://www.kroah.com/log/2005/10/19/>
+ <http://www.kroah.com/log/2006/01/11/>
+
+NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people!
+ <http://marc.theaimsgroup.com/?l=linux-kernel&m=112112749912944&w=2>
+
+Kernel Documentation/CodingStyle:
+ <http://users.sosdg.org/~qiyong/lxr/source/Documentation/CodingStyle>
+
+Linus Torvalds's mail on the canonical patch format:
+ <http://lkml.org/lkml/2005/4/7/183>
+--
diff --git a/Documentation/ja_JP/stable_api_nonsense.txt b/Documentation/ja_JP/stable_api_nonsense.txt
new file mode 100644
index 0000000..7653b5c
--- /dev/null
+++ b/Documentation/ja_JP/stable_api_nonsense.txt
@@ -0,0 +1,263 @@
+NOTE:
+This is a version of Documentation/stable_api_nonsense.txt into Japanese.
+This document is maintained by IKEDA, Munehiro <m-ikeda@ds.jp.nec.com>
+and the JF Project team <http://www.linux.or.jp/JF/>.
+If you find any difference between this document and the original file
+or a problem with the translation,
+please contact the maintainer of this file or JF project.
+
+Please also note that the purpose of this file is to be easier to read
+for non English (read: Japanese) speakers and is not intended as a
+fork. So if you have any comments or updates of this file, please try
+to update the original English file first.
+
+Last Updated: 2007/07/18
+==================================
+これは、
+linux-2.6.22-rc4/Documentation/stable_api_nonsense.txt の和訳
+です。
+翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
+翻訳日 : 2007/06/11
+原著作者: Greg Kroah-Hartman < greg at kroah dot com >
+翻訳者 : 池田 宗広 < m-ikeda at ds dot jp dot nec dot com >
+校正者 : Masanori Kobayashi さん < zap03216 at nifty dot ne dot jp >
+ Seiji Kaneko さん < skaneko at a2 dot mbn dot or dot jp >
+==================================
+
+
+
+Linux カーネルのドライバインターフェース
+(あなたの質問すべてに対する回答とその他諸々)
+
+Greg Kroah-Hartman <greg at kroah dot com>
+
+
+この文書は、なぜ Linux ではバイナリカーネルインターフェースが定義
+されていないのか、またはなぜ不変のカーネルインターフェースを持たな
+いのか、ということを説明するために書かれた。ここでの話題は「カーネ
+ル内部の」インターフェースについてであり、ユーザー空間とのインター
+フェースではないことを理解してほしい。カーネルとユーザー空間とのイ
+ンターフェースとはアプリケーションプログラムが使用するものであり、
+つまりシステムコールのインターフェースがこれに当たる。これは今まで
+長きに渡り、かつ今後も「まさしく」不変である。私は確か 0.9 か何か
+より前のカーネルを使ってビルドした古いプログラムを持っているが、そ
+れは最新の 2.6 カーネルでもきちんと動作する。ユーザー空間とのイン
+ターフェースは、ユーザーとアプリケーションプログラマが不変性を信頼
+してよいものの一つである。
+
+
+要旨
+----
+
+あなたは不変のカーネルインターフェースが必要だと考えているかもしれ
+ないが、実際のところはそうではない。あなたは必要としているものが分
+かっていない。あなたが必要としているものは安定して動作するドライバ
+であり、それはドライバがメインのカーネルツリーに含まれる場合のみ得
+ることができる。ドライバがメインのカーネルツリーに含まれていると、
+他にも多くの良いことがある。それは、Linux をより強固で、安定な、成
+熟したオペレーティングシステムにすることができるということだ。これ
+こそ、そもそもあなたが Linux を使う理由のはずだ。
+
+
+はじめに
+--------
+
+カーネル内部のインターフェース変更を心配しなければならないドライバ
+を書きたいなどというのは、変わり者だけだ。この世界のほとんどの人は、
+そのようなドライバがどんなインターフェースを使っているかなど知らな
+いし、そんなドライバのことなど全く気にもかけていない。
+
+
+まず初めに、クローズソースとか、ソースコードの隠蔽とか、バイナリの
+みが配布される使い物にならない代物[訳注(1)]とか、実体はバイナリ
+コードでそれを読み込むためのラッパー部分のみソースコードが公開され
+ているとか、その他用語は何であれ GPL の下にソースコードがリリース
+されていないカーネルドライバに関する法的な問題について、私は「いか
+なる議論も」行うつもりがない。法的な疑問があるのならば、プログラマ
+である私ではなく、弁護士に相談して欲しい。ここでは単に、技術的な問
+題について述べることにする。(法的な問題を軽視しているわけではない。
+それらは実際に存在するし、あなたはそれをいつも気にかけておく必要が
+ある)
+
+訳注(1)
+「使い物にならない代物」の原文は "blob"
+
+
+さてここでは、バイナリカーネルインターフェースについてと、ソースレ
+ベルでのインターフェースの不変性について、という二つの話題を取り上
+げる。この二つは互いに依存する関係にあるが、まずはバイナリインター
+フェースについて議論を行いやっつけてしまおう。
+
+
+バイナリカーネルインターフェース
+--------------------------------
+
+もしソースレベルでのインターフェースが不変ならば、バイナリインター
+フェースも当然のように不変である、というのは正しいだろうか?正しく
+ない。Linux カーネルに関する以下の事実を考えてみてほしい。
+ - あなたが使用するCコンパイラのバージョンによって、カーネル内部
+ の構造体の配置構造は異なったものになる。また、関数は異なった方
+ 法でカーネルに含まれることになるかもしれない(例えばインライン
+ 関数として扱われたり、扱われなかったりする)。個々の関数がどの
+ ようにコンパイルされるかはそれほど重要ではないが、構造体のパデ
+ ィングが異なるというのは非常に重要である。
+ - あなたがカーネルのビルドオプションをどのように設定するかによっ
+ て、カーネルには広い範囲で異なった事態が起こり得る。
+ - データ構造は異なるデータフィールドを持つかもしれない
+ - いくつかの関数は全く実装されていない状態になり得る
+ (例:SMP向けではないビルドでは、いくつかのロックは中身が
+ カラにコンパイルされる)
+ - カーネル内のメモリは、異なった方法で配置され得る。これはビ
+ ルドオプションに依存している。
+ - Linux は様々な異なるプロセッサアーキテクチャ上で動作する。
+ あるアーキテクチャ用のバイナリドライバを、他のアーキテクチャで
+ 正常に動作させる方法はない。
+
+
+ある特定のカーネル設定を使用し、カーネルをビルドしたのと正確に同じ
+Cコンパイラを使用して単にカーネルモジュールをコンパイルするだけで
+も、あなたはこれらいくつもの問題に直面することになる。ある特定の
+Linux ディストリビューションの、ある特定のリリースバージョン用にモ
+ジュールを提供しようと思っただけでも、これらの問題を引き起こすには
+十分である。にも関わらず Linux ディストリビューションの数と、サ
+ポートするディストリビューションのリリース数を掛け算し、それら一つ
+一つについてビルドを行ったとしたら、今度はリリースごとのビルドオプ
+ションの違いという悪夢にすぐさま悩まされることになる。また、ディス
+トリビューションの各リリースバージョンには、異なるハードウェア(プ
+ロセッサタイプや種々のオプション)に対応するため、何種類かのカーネ
+ルが含まれているということも理解して欲しい。従って、ある一つのリ
+リースバージョンだけのためにモジュールを作成する場合でも、あなたは
+何バージョンものモジュールを用意しなければならない。
+
+
+信じて欲しい。このような方法でサポートを続けようとするなら、あなた
+はいずれ正気を失うだろう。遠い昔、私はそれがいかに困難なことか、身
+をもって学んだのだ・・・
+
+
+不変のカーネルソースレベルインターフェース
+------------------------------------------
+
+メインカーネルツリーに含まれていない Linux カーネルドライバを継続
+してサポートしていこうとしている人たちとの議論においては、これは極
+めて「引火性の高い」話題である。[訳注(2)]
+
+訳注(2)
+「引火性の高い」の原文は "volatile"。
+volatile には「揮発性の」「爆発しやすい」という意味の他、「変わり
+やすい」「移り気な」という意味がある。
+「(この話題は)爆発的に激しい論争を巻き起こしかねない」ということ
+を、「(カーネルのソースレベルインターフェースは)移ろい行くもので
+ある」ということを連想させる "volatile" という単語で表現している。
+
+
+Linux カーネルの開発は継続的に速いペースで行われ、決して歩みを緩め
+ることがない。その中でカーネル開発者達は、現状のインターフェースに
+あるバグを見つけ、より良い方法を考え出す。彼らはやがて、現状のイン
+ターフェースがより正しく動作するように修正を行う。その過程で関数の
+名前は変更されるかもしれず、構造体は大きく、または小さくなるかもし
+れず、関数の引数は検討しなおされるかもしれない。そのような場合、引
+き続き全てが正常に動作するよう、カーネル内でこれらのインターフェー
+スを使用している個所も全て同時に修正される。
+
+
+具体的な例として、カーネル内の USB インターフェースを挙げる。USB
+サブシステムはこれまでに少なくとも3回の書き直しが行われ、その結果
+インターフェースが変更された。これらの書き直しはいくつかの異なった
+問題を修正するために行われた。
+ - 同期的データストリームが非同期に変更された。これにより多数のド
+ ライバを単純化でき、全てのドライバのスループットが向上した。今
+ やほとんど全ての USB デバイスは、考えられる最高の速度で動作し
+ ている。
+ - USB ドライバが USB サブシステムのコアから行う、データパケット
+ 用のメモリ確保方法が変更された。これに伴い、いくつもの文書化さ
+ れたデッドロック条件を回避するため、全ての USB ドライバはより
+ 多くの情報を USB コアに提供しなければならないようになっている。
+
+
+このできごとは、数多く存在するクローズソースのオペレーティングシス
+テムとは全く対照的だ。それらは長期に渡り古い USB インターフェース
+をメンテナンスしなければならない。古いインターフェースが残ることで、
+新たな開発者が偶然古いインターフェースを使い、正しくない方法で開発
+を行ってしまう可能性が生じる。これによりシステムの安定性は危険にさ
+らされることになる。
+
+
+上に挙げたどちらの例においても、開発者達はその変更が重要かつ必要で
+あることに合意し、比較的楽にそれを実行した。もし Linux がソースレ
+ベルでインターフェースの不変性を保証しなければならないとしたら、新
+しいインターフェースを作ると同時に、古い、問題のある方を今後ともメ
+ンテナンスするという余計な仕事を USB の開発者にさせなければならな
+い。Linux の USB 開発者は、自分の時間を使って仕事をしている。よっ
+て、価値のない余計な仕事を報酬もなしに実行しろと言うことはできない。
+
+
+セキュリティ問題も、Linux にとっては非常に重要である。ひとたびセキ
+ュリティに関する問題が発見されれば、それは極めて短期間のうちに修正
+される。セキュリティ問題の発生を防ぐための修正は、カーネルの内部イ
+ンターフェースの変更を何度も引き起こしてきた。その際同時に、変更さ
+れたインターフェースを使用する全てのドライバもまた変更された。これ
+により問題が解消し、将来偶然に問題が再発してしまわないことが保証さ
+れる。もし内部インターフェースの変更が許されないとしたら、このよう
+にセキュリティ問題を修正し、将来再発しないことを保証することなど不
+可能なのだ。
+
+
+カーネルのインターフェースは時が経つにつれクリーンナップを受ける。
+誰も使っていないインターフェースは削除される。これにより、可能な限
+りカーネルが小さく保たれ、現役の全てのインターフェースが可能な限り
+テストされることを保証しているのだ。(使われていないインターフェー
+スの妥当性をテストすることは不可能と言っていいだろう)
+
+
+
+これから何をすべきか
+-----------------------
+
+では、もしメインのカーネルツリーに含まれない Linux カーネルドライ
+バがあったとして、あなたは、つまり開発者は何をするべきだろうか?全
+てのディストリビューションの全てのカーネルバージョン向けにバイナリ
+のドライバを供給することは悪夢であり、カーネルインターフェースの変
+更を追いかけ続けることもまた過酷な仕事だ。
+
+
+答えは簡単。そのドライバをメインのカーネルツリーに入れてしまえばよ
+い。(ここで言及しているのは、GPL に従って公開されるドライバのこと
+だということに注意してほしい。あなたのコードがそれに該当しないなら
+ば、さよなら。幸運を祈ります。ご自分で何とかしてください。Andrew
+と Linus からのコメント<Andrew と Linus のコメントへのリンクをこ
+こに置く>をどうぞ)ドライバがメインツリーに入れば、カーネルのイン
+ターフェースが変更された場合、変更を行った開発者によってドライバも
+修正されることになるだろう。あなたはほとんど労力を払うことなしに、
+常にビルド可能できちんと動作するドライバを手に入れることができる。
+
+
+ドライバをメインのカーネルツリーに入れると、非常に好ましい以下の効
+果がある。
+ - ドライバの品質が向上する一方で、(元の開発者にとっての)メンテ
+ ナンスコストは下がる。
+ - あなたのドライバに他の開発者が機能を追加してくれる。
+ - 誰かがあなたのドライバにあるバグを見つけ、修正してくれる。
+ - 誰かがあなたのドライバにある改善点を見つけてくれる。
+ - 外部インターフェースが変更されドライバの更新が必要になった場合、
+ 誰かがあなたの代わりに更新してくれる。
+ - ドライバを入れてくれとディストロに頼まなくても、そのドライバは
+ 全ての Linux ディストリビューションに自動的に含まれてリリース
+ される。
+
+
+Linux では、他のどのオペレーティングシステムよりも数多くのデバイス
+が「そのまま」使用できるようになった。また Linux は、どのオペレー
+ティングシステムよりも数多くのプロセッサアーキテクチャ上でそれらの
+デバイスを使用することができるようにもなった。このように、Linux の
+開発モデルは実証されており、今後も間違いなく正しい方向へと進んでい
+くだろう。:)
+
+
+
+------
+
+この文書の初期の草稿に対し、Randy Dunlap, Andrew Morton, David
+Brownell, Hanna Linder, Robert Love, Nishanth Aravamudan から査読
+と助言を頂きました。感謝申し上げます。
+
diff --git a/Documentation/ja_JP/stable_kernel_rules.txt b/Documentation/ja_JP/stable_kernel_rules.txt
new file mode 100644
index 0000000..b3ffe87
--- /dev/null
+++ b/Documentation/ja_JP/stable_kernel_rules.txt
@@ -0,0 +1,79 @@
+NOTE:
+This is Japanese translated version of "Documentation/stable_kernel_rules.txt".
+This one is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com>
+and JF Project team <www.linux.or.jp/JF>.
+If you find difference with original file or problem in translation,
+please contact maintainer of this file or JF project.
+
+Please also note that purpose of this file is easier to read for non
+English natives and do no intended to fork. So, if you have any
+comment or update of this file, please try to update Original(English)
+file at first.
+
+==================================
+これは、
+linux-2.6.24/Documentation/stable_kernel_rules.txt
+の和訳です。
+
+翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
+翻訳日: 2007/12/30
+翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com>
+校正者: 武井伸光さん、<takei at webmasters dot gr dot jp>
+ かねこさん (Seiji Kaneko) <skaneko at a2 dot mbn dot or dot jp>
+ 小林 雅典さん (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp>
+ 野口さん (Kenji Noguchi) <tokyo246 at gmail dot com>
+ 神宮信太郎さん <jin at libjingu dot jp>
+==================================
+
+ずっと知りたかった Linux 2.6 -stable リリースの全て
+
+"-stable" ツリーにどのような種類のパッチが受け入れられるか、どのような
+ものが受け入れられないか、についての規則-
+
+ - 明らかに正しく、テストされているものでなければならない。
+ - 文脈(変更行の前後)を含めて 100 行より大きくてはいけない。
+ - ただ一個のことだけを修正しているべき。
+ - 皆を悩ませている本物のバグを修正しなければならない。("これはバグで
+ あるかもしれないが..." のようなものではない)
+ - ビルドエラー(CONFIG_BROKENになっているものを除く), oops, ハング、デー
+ タ破壊、現実のセキュリティ問題、その他 "ああ、これはダメだね"という
+ ようなものを修正しなければならない。短く言えば、重大な問題。
+ - どのように競合状態が発生するかの説明も一緒に書かれていない限り、
+ "理論的には競合状態になる"ようなものは不可。
+ - いかなる些細な修正も含めることはできない。(スペルの修正、空白のクリー
+ ンアップなど)
+ - 対応するサブシステムメンテナが受け入れたものでなければならない。
+ - Documentation/SubmittingPatches の規則に従ったものでなければならない。
+
+-stable ツリーにパッチを送付する手続き-
+
+ - 上記の規則に従っているかを確認した後に、stable@kernel.org にパッチ
+ を送る。
+ - 送信者はパッチがキューに受け付けられた際には ACK を、却下された場合
+ には NAK を受け取る。この反応は開発者たちのスケジュールによって、数
+ 日かかる場合がある。
+ - もし受け取られたら、パッチは他の開発者たちのレビューのために
+ -stable キューに追加される。
+ - セキュリティパッチはこのエイリアス (stable@kernel.org) に送られるべ
+ きではなく、代わりに security@kernel.org のアドレスに送られる。
+
+レビューサイクル-
+
+ - -stable メンテナがレビューサイクルを決めるとき、パッチはレビュー委
+ 員会とパッチが影響する領域のメンテナ(提供者がその領域のメンテナで無
+ い限り)に送られ、linux-kernel メーリングリストにCCされる。
+ - レビュー委員会は 48時間の間に ACK か NAK を出す。
+ - もしパッチが委員会のメンバから却下されるか、メンテナ達やメンバが気付
+ かなかった問題が持ちあがり、linux-kernel メンバがパッチに異議を唱え
+ た場合には、パッチはキューから削除される。
+ - レビューサイクルの最後に、ACK を受けたパッチは最新の -stable リリー
+ スに追加され、その後に新しい -stable リリースが行われる。
+ - セキュリティパッチは、通常のレビューサイクルを通らず、セキュリティ
+ カーネルチームから直接 -stable ツリーに受け付けられる。
+ この手続きの詳細については kernel security チームに問い合わせること。
+
+レビュー委員会-
+
+ - この委員会は、このタスクについて活動する多くのボランティアと、少数の
+ 非ボランティアのカーネル開発者達で構成されている。
+
diff --git a/Documentation/java.txt b/Documentation/java.txt
new file mode 100644
index 0000000..e6a7232
--- /dev/null
+++ b/Documentation/java.txt
@@ -0,0 +1,396 @@
+ Java(tm) Binary Kernel Support for Linux v1.03
+ ----------------------------------------------
+
+Linux beats them ALL! While all other OS's are TALKING about direct
+support of Java Binaries in the OS, Linux is doing it!
+
+You can execute Java applications and Java Applets just like any
+other program after you have done the following:
+
+1) You MUST FIRST install the Java Developers Kit for Linux.
+ The Java on Linux HOWTO gives the details on getting and
+ installing this. This HOWTO can be found at:
+
+ ftp://sunsite.unc.edu/pub/Linux/docs/HOWTO/Java-HOWTO
+
+ You should also set up a reasonable CLASSPATH environment
+ variable to use Java applications that make use of any
+ nonstandard classes (not included in the same directory
+ as the application itself).
+
+2) You have to compile BINFMT_MISC either as a module or into
+ the kernel (CONFIG_BINFMT_MISC) and set it up properly.
+ If you choose to compile it as a module, you will have
+ to insert it manually with modprobe/insmod, as kmod
+ cannot easily be supported with binfmt_misc.
+ Read the file 'binfmt_misc.txt' in this directory to know
+ more about the configuration process.
+
+3) Add the following configuration items to binfmt_misc
+ (you should really have read binfmt_misc.txt now):
+ support for Java applications:
+ ':Java:M::\xca\xfe\xba\xbe::/usr/local/bin/javawrapper:'
+ support for executable Jar files:
+ ':ExecutableJAR:E::jar::/usr/local/bin/jarwrapper:'
+ support for Java Applets:
+ ':Applet:E::html::/usr/bin/appletviewer:'
+ or the following, if you want to be more selective:
+ ':Applet:M::<!--applet::/usr/bin/appletviewer:'
+
+ Of course you have to fix the path names. The path/file names given in this
+ document match the Debian 2.1 system. (i.e. jdk installed in /usr,
+ custom wrappers from this document in /usr/local)
+
+ Note, that for the more selective applet support you have to modify
+ existing html-files to contain <!--applet--> in the first line
+ ('<' has to be the first character!) to let this work!
+
+ For the compiled Java programs you need a wrapper script like the
+ following (this is because Java is broken in case of the filename
+ handling), again fix the path names, both in the script and in the
+ above given configuration string.
+
+ You, too, need the little program after the script. Compile like
+ gcc -O2 -o javaclassname javaclassname.c
+ and stick it to /usr/local/bin.
+
+ Both the javawrapper shellscript and the javaclassname program
+ were supplied by Colin J. Watson <cjw44@cam.ac.uk>.
+
+====================== Cut here ===================
+#!/bin/bash
+# /usr/local/bin/javawrapper - the wrapper for binfmt_misc/java
+
+if [ -z "$1" ]; then
+ exec 1>&2
+ echo Usage: $0 class-file
+ exit 1
+fi
+
+CLASS=$1
+FQCLASS=`/usr/local/bin/javaclassname $1`
+FQCLASSN=`echo $FQCLASS | sed -e 's/^.*\.\([^.]*\)$/\1/'`
+FQCLASSP=`echo $FQCLASS | sed -e 's-\.-/-g' -e 's-^[^/]*$--' -e 's-/[^/]*$--'`
+
+# for example:
+# CLASS=Test.class
+# FQCLASS=foo.bar.Test
+# FQCLASSN=Test
+# FQCLASSP=foo/bar
+
+unset CLASSBASE
+
+declare -i LINKLEVEL=0
+
+while :; do
+ if [ "`basename $CLASS .class`" == "$FQCLASSN" ]; then
+ # See if this directory works straight off
+ cd -L `dirname $CLASS`
+ CLASSDIR=$PWD
+ cd $OLDPWD
+ if echo $CLASSDIR | grep -q "$FQCLASSP$"; then
+ CLASSBASE=`echo $CLASSDIR | sed -e "s.$FQCLASSP$.."`
+ break;
+ fi
+ # Try dereferencing the directory name
+ cd -P `dirname $CLASS`
+ CLASSDIR=$PWD
+ cd $OLDPWD
+ if echo $CLASSDIR | grep -q "$FQCLASSP$"; then
+ CLASSBASE=`echo $CLASSDIR | sed -e "s.$FQCLASSP$.."`
+ break;
+ fi
+ # If no other possible filename exists
+ if [ ! -L $CLASS ]; then
+ exec 1>&2
+ echo $0:
+ echo " $CLASS should be in a" \
+ "directory tree called $FQCLASSP"
+ exit 1
+ fi
+ fi
+ if [ ! -L $CLASS ]; then break; fi
+ # Go down one more level of symbolic links
+ let LINKLEVEL+=1
+ if [ $LINKLEVEL -gt 5 ]; then
+ exec 1>&2
+ echo $0:
+ echo " Too many symbolic links encountered"
+ exit 1
+ fi
+ CLASS=`ls --color=no -l $CLASS | sed -e 's/^.* \([^ ]*\)$/\1/'`
+done
+
+if [ -z "$CLASSBASE" ]; then
+ if [ -z "$FQCLASSP" ]; then
+ GOODNAME=$FQCLASSN.class
+ else
+ GOODNAME=$FQCLASSP/$FQCLASSN.class
+ fi
+ exec 1>&2
+ echo $0:
+ echo " $FQCLASS should be in a file called $GOODNAME"
+ exit 1
+fi
+
+if ! echo $CLASSPATH | grep -q "^\(.*:\)*$CLASSBASE\(:.*\)*"; then
+ # class is not in CLASSPATH, so prepend dir of class to CLASSPATH
+ if [ -z "${CLASSPATH}" ] ; then
+ export CLASSPATH=$CLASSBASE
+ else
+ export CLASSPATH=$CLASSBASE:$CLASSPATH
+ fi
+fi
+
+shift
+/usr/bin/java $FQCLASS "$@"
+====================== Cut here ===================
+
+
+====================== Cut here ===================
+/* javaclassname.c
+ *
+ * Extracts the class name from a Java class file; intended for use in a Java
+ * wrapper of the type supported by the binfmt_misc option in the Linux kernel.
+ *
+ * Copyright (C) 1999 Colin J. Watson <cjw44@cam.ac.uk>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <sys/types.h>
+
+/* From Sun's Java VM Specification, as tag entries in the constant pool. */
+
+#define CP_UTF8 1
+#define CP_INTEGER 3
+#define CP_FLOAT 4
+#define CP_LONG 5
+#define CP_DOUBLE 6
+#define CP_CLASS 7
+#define CP_STRING 8
+#define CP_FIELDREF 9
+#define CP_METHODREF 10
+#define CP_INTERFACEMETHODREF 11
+#define CP_NAMEANDTYPE 12
+
+/* Define some commonly used error messages */
+
+#define seek_error() error("%s: Cannot seek\n", program)
+#define corrupt_error() error("%s: Class file corrupt\n", program)
+#define eof_error() error("%s: Unexpected end of file\n", program)
+#define utf8_error() error("%s: Only ASCII 1-255 supported\n", program);
+
+char *program;
+
+long *pool;
+
+u_int8_t read_8(FILE *classfile);
+u_int16_t read_16(FILE *classfile);
+void skip_constant(FILE *classfile, u_int16_t *cur);
+void error(const char *format, ...);
+int main(int argc, char **argv);
+
+/* Reads in an unsigned 8-bit integer. */
+u_int8_t read_8(FILE *classfile)
+{
+ int b = fgetc(classfile);
+ if(b == EOF)
+ eof_error();
+ return (u_int8_t)b;
+}
+
+/* Reads in an unsigned 16-bit integer. */
+u_int16_t read_16(FILE *classfile)
+{
+ int b1, b2;
+ b1 = fgetc(classfile);
+ if(b1 == EOF)
+ eof_error();
+ b2 = fgetc(classfile);
+ if(b2 == EOF)
+ eof_error();
+ return (u_int16_t)((b1 << 8) | b2);
+}
+
+/* Reads in a value from the constant pool. */
+void skip_constant(FILE *classfile, u_int16_t *cur)
+{
+ u_int16_t len;
+ int seekerr = 1;
+ pool[*cur] = ftell(classfile);
+ switch(read_8(classfile))
+ {
+ case CP_UTF8:
+ len = read_16(classfile);
+ seekerr = fseek(classfile, len, SEEK_CUR);
+ break;
+ case CP_CLASS:
+ case CP_STRING:
+ seekerr = fseek(classfile, 2, SEEK_CUR);
+ break;
+ case CP_INTEGER:
+ case CP_FLOAT:
+ case CP_FIELDREF:
+ case CP_METHODREF:
+ case CP_INTERFACEMETHODREF:
+ case CP_NAMEANDTYPE:
+ seekerr = fseek(classfile, 4, SEEK_CUR);
+ break;
+ case CP_LONG:
+ case CP_DOUBLE:
+ seekerr = fseek(classfile, 8, SEEK_CUR);
+ ++(*cur);
+ break;
+ default:
+ corrupt_error();
+ }
+ if(seekerr)
+ seek_error();
+}
+
+void error(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ exit(1);
+}
+
+int main(int argc, char **argv)
+{
+ FILE *classfile;
+ u_int16_t cp_count, i, this_class, classinfo_ptr;
+ u_int8_t length;
+
+ program = argv[0];
+
+ if(!argv[1])
+ error("%s: Missing input file\n", program);
+ classfile = fopen(argv[1], "rb");
+ if(!classfile)
+ error("%s: Error opening %s\n", program, argv[1]);
+
+ if(fseek(classfile, 8, SEEK_SET)) /* skip magic and version numbers */
+ seek_error();
+ cp_count = read_16(classfile);
+ pool = calloc(cp_count, sizeof(long));
+ if(!pool)
+ error("%s: Out of memory for constant pool\n", program);
+
+ for(i = 1; i < cp_count; ++i)
+ skip_constant(classfile, &i);
+ if(fseek(classfile, 2, SEEK_CUR)) /* skip access flags */
+ seek_error();
+
+ this_class = read_16(classfile);
+ if(this_class < 1 || this_class >= cp_count)
+ corrupt_error();
+ if(!pool[this_class] || pool[this_class] == -1)
+ corrupt_error();
+ if(fseek(classfile, pool[this_class] + 1, SEEK_SET))
+ seek_error();
+
+ classinfo_ptr = read_16(classfile);
+ if(classinfo_ptr < 1 || classinfo_ptr >= cp_count)
+ corrupt_error();
+ if(!pool[classinfo_ptr] || pool[classinfo_ptr] == -1)
+ corrupt_error();
+ if(fseek(classfile, pool[classinfo_ptr] + 1, SEEK_SET))
+ seek_error();
+
+ length = read_16(classfile);
+ for(i = 0; i < length; ++i)
+ {
+ u_int8_t x = read_8(classfile);
+ if((x & 0x80) || !x)
+ {
+ if((x & 0xE0) == 0xC0)
+ {
+ u_int8_t y = read_8(classfile);
+ if((y & 0xC0) == 0x80)
+ {
+ int c = ((x & 0x1f) << 6) + (y & 0x3f);
+ if(c) putchar(c);
+ else utf8_error();
+ }
+ else utf8_error();
+ }
+ else utf8_error();
+ }
+ else if(x == '/') putchar('.');
+ else putchar(x);
+ }
+ putchar('\n');
+ free(pool);
+ fclose(classfile);
+ return 0;
+}
+====================== Cut here ===================
+
+
+====================== Cut here ===================
+#!/bin/bash
+# /usr/local/java/bin/jarwrapper - the wrapper for binfmt_misc/jar
+
+java -jar $1
+====================== Cut here ===================
+
+
+Now simply chmod +x the .class, .jar and/or .html files you want to execute.
+To add a Java program to your path best put a symbolic link to the main
+.class file into /usr/bin (or another place you like) omitting the .class
+extension. The directory containing the original .class file will be
+added to your CLASSPATH during execution.
+
+
+To test your new setup, enter in the following simple Java app, and name
+it "HelloWorld.java":
+
+ class HelloWorld {
+ public static void main(String args[]) {
+ System.out.println("Hello World!");
+ }
+ }
+
+Now compile the application with:
+ javac HelloWorld.java
+
+Set the executable permissions of the binary file, with:
+ chmod 755 HelloWorld.class
+
+And then execute it:
+ ./HelloWorld.class
+
+
+To execute Java Jar files, simple chmod the *.jar files to include
+the execution bit, then just do
+ ./Application.jar
+
+
+To execute Java Applets, simple chmod the *.html files to include
+the execution bit, then just do
+ ./Applet.html
+
+
+originally by Brian A. Lantz, brian@lantz.com
+heavily edited for binfmt_misc by Richard Günther
+new scripts by Colin J. Watson <cjw44@cam.ac.uk>
+added executable Jar file support by Kurt Huwig <kurt@iku-netz.de>
+
diff --git a/Documentation/kbuild/00-INDEX b/Documentation/kbuild/00-INDEX
new file mode 100644
index 0000000..1146442
--- /dev/null
+++ b/Documentation/kbuild/00-INDEX
@@ -0,0 +1,8 @@
+00-INDEX
+ - this file: info on the kernel build process
+kconfig-language.txt
+ - specification of Config Language, the language in Kconfig files
+makefiles.txt
+ - developer information for linux kernel makefiles
+modules.txt
+ - how to build modules and to install them
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
new file mode 100644
index 0000000..c412c24
--- /dev/null
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -0,0 +1,379 @@
+Introduction
+------------
+
+The configuration database is a collection of configuration options
+organized in a tree structure:
+
+ +- Code maturity level options
+ | +- Prompt for development and/or incomplete code/drivers
+ +- General setup
+ | +- Networking support
+ | +- System V IPC
+ | +- BSD Process Accounting
+ | +- Sysctl support
+ +- Loadable module support
+ | +- Enable loadable module support
+ | +- Set version information on all module symbols
+ | +- Kernel module loader
+ +- ...
+
+Every entry has its own dependencies. These dependencies are used
+to determine the visibility of an entry. Any child entry is only
+visible if its parent entry is also visible.
+
+Menu entries
+------------
+
+Most entries define a config option; all other entries help to organize
+them. A single configuration option is defined like this:
+
+config MODVERSIONS
+ bool "Set version information on all module symbols"
+ depends on MODULES
+ help
+ Usually, modules have to be recompiled whenever you switch to a new
+ kernel. ...
+
+Every line starts with a key word and can be followed by multiple
+arguments. "config" starts a new config entry. The following lines
+define attributes for this config option. Attributes can be the type of
+the config option, input prompt, dependencies, help text and default
+values. A config option can be defined multiple times with the same
+name, but every definition can have only a single input prompt and the
+type must not conflict.
+
+Menu attributes
+---------------
+
+A menu entry can have a number of attributes. Not all of them are
+applicable everywhere (see syntax).
+
+- type definition: "bool"/"tristate"/"string"/"hex"/"int"
+ Every config option must have a type. There are only two basic types:
+ tristate and string; the other types are based on these two. The type
+ definition optionally accepts an input prompt, so these two examples
+ are equivalent:
+
+ bool "Networking support"
+ and
+ bool
+ prompt "Networking support"
+
+- input prompt: "prompt" <prompt> ["if" <expr>]
+ Every menu entry can have at most one prompt, which is used to display
+ to the user. Optionally dependencies only for this prompt can be added
+ with "if".
+
+- default value: "default" <expr> ["if" <expr>]
+ A config option can have any number of default values. If multiple
+ default values are visible, only the first defined one is active.
+ Default values are not limited to the menu entry where they are
+ defined. This means the default can be defined somewhere else or be
+ overridden by an earlier definition.
+ The default value is only assigned to the config symbol if no other
+ value was set by the user (via the input prompt above). If an input
+ prompt is visible the default value is presented to the user and can
+ be overridden by him.
+ Optionally, dependencies only for this default value can be added with
+ "if".
+
+- type definition + default value:
+ "def_bool"/"def_tristate" <expr> ["if" <expr>]
+ This is a shorthand notation for a type definition plus a value.
+ Optionally dependencies for this default value can be added with "if".
+
+- dependencies: "depends on" <expr>
+ This defines a dependency for this menu entry. If multiple
+ dependencies are defined, they are connected with '&&'. Dependencies
+ are applied to all other options within this menu entry (which also
+ accept an "if" expression), so these two examples are equivalent:
+
+ bool "foo" if BAR
+ default y if BAR
+ and
+ depends on BAR
+ bool "foo"
+ default y
+
+- reverse dependencies: "select" <symbol> ["if" <expr>]
+ While normal dependencies reduce the upper limit of a symbol (see
+ below), reverse dependencies can be used to force a lower limit of
+ another symbol. The value of the current menu symbol is used as the
+ minimal value <symbol> can be set to. If <symbol> is selected multiple
+ times, the limit is set to the largest selection.
+ Reverse dependencies can only be used with boolean or tristate
+ symbols.
+ Note:
+ select should be used with care. select will force
+ a symbol to a value without visiting the dependencies.
+ By abusing select you are able to select a symbol FOO even
+ if FOO depends on BAR that is not set.
+ In general use select only for non-visible symbols
+ (no prompts anywhere) and for symbols with no dependencies.
+ That will limit the usefulness but on the other hand avoid
+ the illegal configurations all over.
+ kconfig should one day warn about such things.
+
+- numerical ranges: "range" <symbol> <symbol> ["if" <expr>]
+ This allows to limit the range of possible input values for int
+ and hex symbols. The user can only input a value which is larger than
+ or equal to the first symbol and smaller than or equal to the second
+ symbol.
+
+- help text: "help" or "---help---"
+ This defines a help text. The end of the help text is determined by
+ the indentation level, this means it ends at the first line which has
+ a smaller indentation than the first line of the help text.
+ "---help---" and "help" do not differ in behaviour, "---help---" is
+ used to help visually separate configuration logic from help within
+ the file as an aid to developers.
+
+- misc options: "option" <symbol>[=<value>]
+ Various less common options can be defined via this option syntax,
+ which can modify the behaviour of the menu entry and its config
+ symbol. These options are currently possible:
+
+ - "defconfig_list"
+ This declares a list of default entries which can be used when
+ looking for the default configuration (which is used when the main
+ .config doesn't exists yet.)
+
+ - "modules"
+ This declares the symbol to be used as the MODULES symbol, which
+ enables the third modular state for all config symbols.
+
+ - "env"=<value>
+ This imports the environment variable into Kconfig. It behaves like
+ a default, except that the value comes from the environment, this
+ also means that the behaviour when mixing it with normal defaults is
+ undefined at this point. The symbol is currently not exported back
+ to the build environment (if this is desired, it can be done via
+ another symbol).
+
+Menu dependencies
+-----------------
+
+Dependencies define the visibility of a menu entry and can also reduce
+the input range of tristate symbols. The tristate logic used in the
+expressions uses one more state than normal boolean logic to express the
+module state. Dependency expressions have the following syntax:
+
+<expr> ::= <symbol> (1)
+ <symbol> '=' <symbol> (2)
+ <symbol> '!=' <symbol> (3)
+ '(' <expr> ')' (4)
+ '!' <expr> (5)
+ <expr> '&&' <expr> (6)
+ <expr> '||' <expr> (7)
+
+Expressions are listed in decreasing order of precedence.
+
+(1) Convert the symbol into an expression. Boolean and tristate symbols
+ are simply converted into the respective expression values. All
+ other symbol types result in 'n'.
+(2) If the values of both symbols are equal, it returns 'y',
+ otherwise 'n'.
+(3) If the values of both symbols are equal, it returns 'n',
+ otherwise 'y'.
+(4) Returns the value of the expression. Used to override precedence.
+(5) Returns the result of (2-/expr/).
+(6) Returns the result of min(/expr/, /expr/).
+(7) Returns the result of max(/expr/, /expr/).
+
+An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2
+respectively for calculations). A menu entry becomes visible when it's
+expression evaluates to 'm' or 'y'.
+
+There are two types of symbols: constant and non-constant symbols.
+Non-constant symbols are the most common ones and are defined with the
+'config' statement. Non-constant symbols consist entirely of alphanumeric
+characters or underscores.
+Constant symbols are only part of expressions. Constant symbols are
+always surrounded by single or double quotes. Within the quote, any
+other character is allowed and the quotes can be escaped using '\'.
+
+Menu structure
+--------------
+
+The position of a menu entry in the tree is determined in two ways. First
+it can be specified explicitly:
+
+menu "Network device support"
+ depends on NET
+
+config NETDEVICES
+ ...
+
+endmenu
+
+All entries within the "menu" ... "endmenu" block become a submenu of
+"Network device support". All subentries inherit the dependencies from
+the menu entry, e.g. this means the dependency "NET" is added to the
+dependency list of the config option NETDEVICES.
+
+The other way to generate the menu structure is done by analyzing the
+dependencies. If a menu entry somehow depends on the previous entry, it
+can be made a submenu of it. First, the previous (parent) symbol must
+be part of the dependency list and then one of these two conditions
+must be true:
+- the child entry must become invisible, if the parent is set to 'n'
+- the child entry must only be visible, if the parent is visible
+
+config MODULES
+ bool "Enable loadable module support"
+
+config MODVERSIONS
+ bool "Set version information on all module symbols"
+ depends on MODULES
+
+comment "module support disabled"
+ depends on !MODULES
+
+MODVERSIONS directly depends on MODULES, this means it's only visible if
+MODULES is different from 'n'. The comment on the other hand is always
+visible when MODULES is visible (the (empty) dependency of MODULES is
+also part of the comment dependencies).
+
+
+Kconfig syntax
+--------------
+
+The configuration file describes a series of menu entries, where every
+line starts with a keyword (except help texts). The following keywords
+end a menu entry:
+- config
+- menuconfig
+- choice/endchoice
+- comment
+- menu/endmenu
+- if/endif
+- source
+The first five also start the definition of a menu entry.
+
+config:
+
+ "config" <symbol>
+ <config options>
+
+This defines a config symbol <symbol> and accepts any of above
+attributes as options.
+
+menuconfig:
+ "menuconfig" <symbol>
+ <config options>
+
+This is similar to the simple config entry above, but it also gives a
+hint to front ends, that all suboptions should be displayed as a
+separate list of options.
+
+choices:
+
+ "choice"
+ <choice options>
+ <choice block>
+ "endchoice"
+
+This defines a choice group and accepts any of the above attributes as
+options. A choice can only be of type bool or tristate, while a boolean
+choice only allows a single config entry to be selected, a tristate
+choice also allows any number of config entries to be set to 'm'. This
+can be used if multiple drivers for a single hardware exists and only a
+single driver can be compiled/loaded into the kernel, but all drivers
+can be compiled as modules.
+A choice accepts another option "optional", which allows to set the
+choice to 'n' and no entry needs to be selected.
+
+comment:
+
+ "comment" <prompt>
+ <comment options>
+
+This defines a comment which is displayed to the user during the
+configuration process and is also echoed to the output files. The only
+possible options are dependencies.
+
+menu:
+
+ "menu" <prompt>
+ <menu options>
+ <menu block>
+ "endmenu"
+
+This defines a menu block, see "Menu structure" above for more
+information. The only possible options are dependencies.
+
+if:
+
+ "if" <expr>
+ <if block>
+ "endif"
+
+This defines an if block. The dependency expression <expr> is appended
+to all enclosed menu entries.
+
+source:
+
+ "source" <prompt>
+
+This reads the specified configuration file. This file is always parsed.
+
+mainmenu:
+
+ "mainmenu" <prompt>
+
+This sets the config program's title bar if the config program chooses
+to use it.
+
+
+Kconfig hints
+-------------
+This is a collection of Kconfig tips, most of which aren't obvious at
+first glance and most of which have become idioms in several Kconfig
+files.
+
+Adding common features and make the usage configurable
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+It is a common idiom to implement a feature/functionality that are
+relevant for some architectures but not all.
+The recommended way to do so is to use a config variable named HAVE_*
+that is defined in a common Kconfig file and selected by the relevant
+architectures.
+An example is the generic IOMAP functionality.
+
+We would in lib/Kconfig see:
+
+# Generic IOMAP is used to ...
+config HAVE_GENERIC_IOMAP
+
+config GENERIC_IOMAP
+ depends on HAVE_GENERIC_IOMAP && FOO
+
+And in lib/Makefile we would see:
+obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
+
+For each architecture using the generic IOMAP functionality we would see:
+
+config X86
+ select ...
+ select HAVE_GENERIC_IOMAP
+ select ...
+
+Note: we use the existing config option and avoid creating a new
+config variable to select HAVE_GENERIC_IOMAP.
+
+Note: the use of the internal config variable HAVE_GENERIC_IOMAP, it is
+introduced to overcome the limitation of select which will force a
+config option to 'y' no matter the dependencies.
+The dependencies are moved to the symbol GENERIC_IOMAP and we avoid the
+situation where select forces a symbol equals to 'y'.
+
+Build as module only
+~~~~~~~~~~~~~~~~~~~~
+To restrict a component build to module-only, qualify its config symbol
+with "depends on m". E.g.:
+
+config FOO
+ depends on BAR && m
+
+limits FOO to module (=m) or disabled (=n).
+
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
new file mode 100644
index 0000000..7a77533
--- /dev/null
+++ b/Documentation/kbuild/makefiles.txt
@@ -0,0 +1,1228 @@
+Linux Kernel Makefiles
+
+This document describes the Linux kernel Makefiles.
+
+=== Table of Contents
+
+ === 1 Overview
+ === 2 Who does what
+ === 3 The kbuild files
+ --- 3.1 Goal definitions
+ --- 3.2 Built-in object goals - obj-y
+ --- 3.3 Loadable module goals - obj-m
+ --- 3.4 Objects which export symbols
+ --- 3.5 Library file goals - lib-y
+ --- 3.6 Descending down in directories
+ --- 3.7 Compilation flags
+ --- 3.8 Command line dependency
+ --- 3.9 Dependency tracking
+ --- 3.10 Special Rules
+ --- 3.11 $(CC) support functions
+
+ === 4 Host Program support
+ --- 4.1 Simple Host Program
+ --- 4.2 Composite Host Programs
+ --- 4.3 Defining shared libraries
+ --- 4.4 Using C++ for host programs
+ --- 4.5 Controlling compiler options for host programs
+ --- 4.6 When host programs are actually built
+ --- 4.7 Using hostprogs-$(CONFIG_FOO)
+
+ === 5 Kbuild clean infrastructure
+
+ === 6 Architecture Makefiles
+ --- 6.1 Set variables to tweak the build to the architecture
+ --- 6.2 Add prerequisites to archprepare:
+ --- 6.3 List directories to visit when descending
+ --- 6.4 Architecture-specific boot images
+ --- 6.5 Building non-kbuild targets
+ --- 6.6 Commands useful for building a boot image
+ --- 6.7 Custom kbuild commands
+ --- 6.8 Preprocessing linker scripts
+
+ === 7 Kbuild Variables
+ === 8 Makefile language
+ === 9 Credits
+ === 10 TODO
+
+=== 1 Overview
+
+The Makefiles have five parts:
+
+ Makefile the top Makefile.
+ .config the kernel configuration file.
+ arch/$(ARCH)/Makefile the arch Makefile.
+ scripts/Makefile.* common rules etc. for all kbuild Makefiles.
+ kbuild Makefiles there are about 500 of these.
+
+The top Makefile reads the .config file, which comes from the kernel
+configuration process.
+
+The top Makefile is responsible for building two major products: vmlinux
+(the resident kernel image) and modules (any module files).
+It builds these goals by recursively descending into the subdirectories of
+the kernel source tree.
+The list of subdirectories which are visited depends upon the kernel
+configuration. The top Makefile textually includes an arch Makefile
+with the name arch/$(ARCH)/Makefile. The arch Makefile supplies
+architecture-specific information to the top Makefile.
+
+Each subdirectory has a kbuild Makefile which carries out the commands
+passed down from above. The kbuild Makefile uses information from the
+.config file to construct various file lists used by kbuild to build
+any built-in or modular targets.
+
+scripts/Makefile.* contains all the definitions/rules etc. that
+are used to build the kernel based on the kbuild makefiles.
+
+
+=== 2 Who does what
+
+People have four different relationships with the kernel Makefiles.
+
+*Users* are people who build kernels. These people type commands such as
+"make menuconfig" or "make". They usually do not read or edit
+any kernel Makefiles (or any other source files).
+
+*Normal developers* are people who work on features such as device
+drivers, file systems, and network protocols. These people need to
+maintain the kbuild Makefiles for the subsystem they are
+working on. In order to do this effectively, they need some overall
+knowledge about the kernel Makefiles, plus detailed knowledge about the
+public interface for kbuild.
+
+*Arch developers* are people who work on an entire architecture, such
+as sparc or ia64. Arch developers need to know about the arch Makefile
+as well as kbuild Makefiles.
+
+*Kbuild developers* are people who work on the kernel build system itself.
+These people need to know about all aspects of the kernel Makefiles.
+
+This document is aimed towards normal developers and arch developers.
+
+
+=== 3 The kbuild files
+
+Most Makefiles within the kernel are kbuild Makefiles that use the
+kbuild infrastructure. This chapter introduces the syntax used in the
+kbuild makefiles.
+The preferred name for the kbuild files are 'Makefile' but 'Kbuild' can
+be used and if both a 'Makefile' and a 'Kbuild' file exists, then the 'Kbuild'
+file will be used.
+
+Section 3.1 "Goal definitions" is a quick intro, further chapters provide
+more details, with real examples.
+
+--- 3.1 Goal definitions
+
+ Goal definitions are the main part (heart) of the kbuild Makefile.
+ These lines define the files to be built, any special compilation
+ options, and any subdirectories to be entered recursively.
+
+ The most simple kbuild makefile contains one line:
+
+ Example:
+ obj-y += foo.o
+
+ This tells kbuild that there is one object in that directory, named
+ foo.o. foo.o will be built from foo.c or foo.S.
+
+ If foo.o shall be built as a module, the variable obj-m is used.
+ Therefore the following pattern is often used:
+
+ Example:
+ obj-$(CONFIG_FOO) += foo.o
+
+ $(CONFIG_FOO) evaluates to either y (for built-in) or m (for module).
+ If CONFIG_FOO is neither y nor m, then the file will not be compiled
+ nor linked.
+
+--- 3.2 Built-in object goals - obj-y
+
+ The kbuild Makefile specifies object files for vmlinux
+ in the $(obj-y) lists. These lists depend on the kernel
+ configuration.
+
+ Kbuild compiles all the $(obj-y) files. It then calls
+ "$(LD) -r" to merge these files into one built-in.o file.
+ built-in.o is later linked into vmlinux by the parent Makefile.
+
+ The order of files in $(obj-y) is significant. Duplicates in
+ the lists are allowed: the first instance will be linked into
+ built-in.o and succeeding instances will be ignored.
+
+ Link order is significant, because certain functions
+ (module_init() / __initcall) will be called during boot in the
+ order they appear. So keep in mind that changing the link
+ order may e.g. change the order in which your SCSI
+ controllers are detected, and thus your disks are renumbered.
+
+ Example:
+ #drivers/isdn/i4l/Makefile
+ # Makefile for the kernel ISDN subsystem and device drivers.
+ # Each configuration option enables a list of files.
+ obj-$(CONFIG_ISDN) += isdn.o
+ obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o
+
+--- 3.3 Loadable module goals - obj-m
+
+ $(obj-m) specify object files which are built as loadable
+ kernel modules.
+
+ A module may be built from one source file or several source
+ files. In the case of one source file, the kbuild makefile
+ simply adds the file to $(obj-m).
+
+ Example:
+ #drivers/isdn/i4l/Makefile
+ obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o
+
+ Note: In this example $(CONFIG_ISDN_PPP_BSDCOMP) evaluates to 'm'
+
+ If a kernel module is built from several source files, you specify
+ that you want to build a module in the same way as above.
+
+ Kbuild needs to know which the parts that you want to build your
+ module from, so you have to tell it by setting an
+ $(<module_name>-objs) variable.
+
+ Example:
+ #drivers/isdn/i4l/Makefile
+ obj-$(CONFIG_ISDN) += isdn.o
+ isdn-objs := isdn_net_lib.o isdn_v110.o isdn_common.o
+
+ In this example, the module name will be isdn.o. Kbuild will
+ compile the objects listed in $(isdn-objs) and then run
+ "$(LD) -r" on the list of these files to generate isdn.o.
+
+ Kbuild recognises objects used for composite objects by the suffix
+ -objs, and the suffix -y. This allows the Makefiles to use
+ the value of a CONFIG_ symbol to determine if an object is part
+ of a composite object.
+
+ Example:
+ #fs/ext2/Makefile
+ obj-$(CONFIG_EXT2_FS) += ext2.o
+ ext2-y := balloc.o bitmap.o
+ ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o
+
+ In this example, xattr.o is only part of the composite object
+ ext2.o if $(CONFIG_EXT2_FS_XATTR) evaluates to 'y'.
+
+ Note: Of course, when you are building objects into the kernel,
+ the syntax above will also work. So, if you have CONFIG_EXT2_FS=y,
+ kbuild will build an ext2.o file for you out of the individual
+ parts and then link this into built-in.o, as you would expect.
+
+--- 3.4 Objects which export symbols
+
+ No special notation is required in the makefiles for
+ modules exporting symbols.
+
+--- 3.5 Library file goals - lib-y
+
+ Objects listed with obj-* are used for modules, or
+ combined in a built-in.o for that specific directory.
+ There is also the possibility to list objects that will
+ be included in a library, lib.a.
+ All objects listed with lib-y are combined in a single
+ library for that directory.
+ Objects that are listed in obj-y and additionally listed in
+ lib-y will not be included in the library, since they will
+ be accessible anyway.
+ For consistency, objects listed in lib-m will be included in lib.a.
+
+ Note that the same kbuild makefile may list files to be built-in
+ and to be part of a library. Therefore the same directory
+ may contain both a built-in.o and a lib.a file.
+
+ Example:
+ #arch/i386/lib/Makefile
+ lib-y := checksum.o delay.o
+
+ This will create a library lib.a based on checksum.o and delay.o.
+ For kbuild to actually recognize that there is a lib.a being built,
+ the directory shall be listed in libs-y.
+ See also "6.3 List directories to visit when descending".
+
+ Use of lib-y is normally restricted to lib/ and arch/*/lib.
+
+--- 3.6 Descending down in directories
+
+ A Makefile is only responsible for building objects in its own
+ directory. Files in subdirectories should be taken care of by
+ Makefiles in these subdirs. The build system will automatically
+ invoke make recursively in subdirectories, provided you let it know of
+ them.
+
+ To do so, obj-y and obj-m are used.
+ ext2 lives in a separate directory, and the Makefile present in fs/
+ tells kbuild to descend down using the following assignment.
+
+ Example:
+ #fs/Makefile
+ obj-$(CONFIG_EXT2_FS) += ext2/
+
+ If CONFIG_EXT2_FS is set to either 'y' (built-in) or 'm' (modular)
+ the corresponding obj- variable will be set, and kbuild will descend
+ down in the ext2 directory.
+ Kbuild only uses this information to decide that it needs to visit
+ the directory, it is the Makefile in the subdirectory that
+ specifies what is modules and what is built-in.
+
+ It is good practice to use a CONFIG_ variable when assigning directory
+ names. This allows kbuild to totally skip the directory if the
+ corresponding CONFIG_ option is neither 'y' nor 'm'.
+
+--- 3.7 Compilation flags
+
+ ccflags-y, asflags-y and ldflags-y
+ The three flags listed above applies only to the kbuild makefile
+ where they are assigned. They are used for all the normal
+ cc, as and ld invocation happenign during a recursive build.
+ Note: Flags with the same behaviour were previously named:
+ EXTRA_CFLAGS, EXTRA_AFLAGS and EXTRA_LDFLAGS.
+ They are yet supported but their use are deprecated.
+
+ ccflags-y specifies options for compiling C files with $(CC).
+
+ Example:
+ # drivers/sound/emu10k1/Makefile
+ ccflags-y += -I$(obj)
+ ccflags-$(DEBUG) += -DEMU10K1_DEBUG
+
+
+ This variable is necessary because the top Makefile owns the
+ variable $(KBUILD_CFLAGS) and uses it for compilation flags for the
+ entire tree.
+
+ asflags-y is a similar string for per-directory options
+ when compiling assembly language source.
+
+ Example:
+ #arch/x86_64/kernel/Makefile
+ asflags-y := -traditional
+
+
+ ldflags-y is a string for per-directory options to $(LD).
+
+ Example:
+ #arch/m68k/fpsp040/Makefile
+ ldflags-y := -x
+
+ CFLAGS_$@, AFLAGS_$@
+
+ CFLAGS_$@ and AFLAGS_$@ only apply to commands in current
+ kbuild makefile.
+
+ $(CFLAGS_$@) specifies per-file options for $(CC). The $@
+ part has a literal value which specifies the file that it is for.
+
+ Example:
+ # drivers/scsi/Makefile
+ CFLAGS_aha152x.o = -DAHA152X_STAT -DAUTOCONF
+ CFLAGS_gdth.o = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ \
+ -DGDTH_STATISTICS
+ CFLAGS_seagate.o = -DARBITRATE -DPARITY -DSEAGATE_USE_ASM
+
+ These three lines specify compilation flags for aha152x.o,
+ gdth.o, and seagate.o
+
+ $(AFLAGS_$@) is a similar feature for source files in assembly
+ languages.
+
+ Example:
+ # arch/arm/kernel/Makefile
+ AFLAGS_head-armv.o := -DTEXTADDR=$(TEXTADDR) -traditional
+ AFLAGS_head-armo.o := -DTEXTADDR=$(TEXTADDR) -traditional
+
+--- 3.9 Dependency tracking
+
+ Kbuild tracks dependencies on the following:
+ 1) All prerequisite files (both *.c and *.h)
+ 2) CONFIG_ options used in all prerequisite files
+ 3) Command-line used to compile target
+
+ Thus, if you change an option to $(CC) all affected files will
+ be re-compiled.
+
+--- 3.10 Special Rules
+
+ Special rules are used when the kbuild infrastructure does
+ not provide the required support. A typical example is
+ header files generated during the build process.
+ Another example are the architecture-specific Makefiles which
+ need special rules to prepare boot images etc.
+
+ Special rules are written as normal Make rules.
+ Kbuild is not executing in the directory where the Makefile is
+ located, so all special rules shall provide a relative
+ path to prerequisite files and target files.
+
+ Two variables are used when defining special rules:
+
+ $(src)
+ $(src) is a relative path which points to the directory
+ where the Makefile is located. Always use $(src) when
+ referring to files located in the src tree.
+
+ $(obj)
+ $(obj) is a relative path which points to the directory
+ where the target is saved. Always use $(obj) when
+ referring to generated files.
+
+ Example:
+ #drivers/scsi/Makefile
+ $(obj)/53c8xx_d.h: $(src)/53c7,8xx.scr $(src)/script_asm.pl
+ $(CPP) -DCHIP=810 - < $< | ... $(src)/script_asm.pl
+
+ This is a special rule, following the normal syntax
+ required by make.
+ The target file depends on two prerequisite files. References
+ to the target file are prefixed with $(obj), references
+ to prerequisites are referenced with $(src) (because they are not
+ generated files).
+
+--- 3.11 $(CC) support functions
+
+ The kernel may be built with several different versions of
+ $(CC), each supporting a unique set of features and options.
+ kbuild provide basic support to check for valid options for $(CC).
+ $(CC) is usually the gcc compiler, but other alternatives are
+ available.
+
+ as-option
+ as-option is used to check if $(CC) -- when used to compile
+ assembler (*.S) files -- supports the given option. An optional
+ second option may be specified if the first option is not supported.
+
+ Example:
+ #arch/sh/Makefile
+ cflags-y += $(call as-option,-Wa$(comma)-isa=$(isa-y),)
+
+ In the above example, cflags-y will be assigned the option
+ -Wa$(comma)-isa=$(isa-y) if it is supported by $(CC).
+ The second argument is optional, and if supplied will be used
+ if first argument is not supported.
+
+ ld-option
+ ld-option is used to check if $(CC) when used to link object files
+ supports the given option. An optional second option may be
+ specified if first option are not supported.
+
+ Example:
+ #arch/i386/kernel/Makefile
+ vsyscall-flags += $(call ld-option, -Wl$(comma)--hash-style=sysv)
+
+ In the above example, vsyscall-flags will be assigned the option
+ -Wl$(comma)--hash-style=sysv if it is supported by $(CC).
+ The second argument is optional, and if supplied will be used
+ if first argument is not supported.
+
+ as-instr
+ as-instr checks if the assembler reports a specific instruction
+ and then outputs either option1 or option2
+ C escapes are supported in the test instruction
+ Note: as-instr-option uses KBUILD_AFLAGS for $(AS) options
+
+ cc-option
+ cc-option is used to check if $(CC) supports a given option, and not
+ supported to use an optional second option.
+
+ Example:
+ #arch/i386/Makefile
+ cflags-y += $(call cc-option,-march=pentium-mmx,-march=i586)
+
+ In the above example, cflags-y will be assigned the option
+ -march=pentium-mmx if supported by $(CC), otherwise -march=i586.
+ The second argument to cc-option is optional, and if omitted,
+ cflags-y will be assigned no value if first option is not supported.
+ Note: cc-option uses KBUILD_CFLAGS for $(CC) options
+
+ cc-option-yn
+ cc-option-yn is used to check if gcc supports a given option
+ and return 'y' if supported, otherwise 'n'.
+
+ Example:
+ #arch/ppc/Makefile
+ biarch := $(call cc-option-yn, -m32)
+ aflags-$(biarch) += -a32
+ cflags-$(biarch) += -m32
+
+ In the above example, $(biarch) is set to y if $(CC) supports the -m32
+ option. When $(biarch) equals 'y', the expanded variables $(aflags-y)
+ and $(cflags-y) will be assigned the values -a32 and -m32,
+ respectively.
+ Note: cc-option-yn uses KBUILD_CFLAGS for $(CC) options
+
+ cc-option-align
+ gcc versions >= 3.0 changed the type of options used to specify
+ alignment of functions, loops etc. $(cc-option-align), when used
+ as prefix to the align options, will select the right prefix:
+ gcc < 3.00
+ cc-option-align = -malign
+ gcc >= 3.00
+ cc-option-align = -falign
+
+ Example:
+ KBUILD_CFLAGS += $(cc-option-align)-functions=4
+
+ In the above example, the option -falign-functions=4 is used for
+ gcc >= 3.00. For gcc < 3.00, -malign-functions=4 is used.
+ Note: cc-option-align uses KBUILD_CFLAGS for $(CC) options
+
+ cc-version
+ cc-version returns a numerical version of the $(CC) compiler version.
+ The format is <major><minor> where both are two digits. So for example
+ gcc 3.41 would return 0341.
+ cc-version is useful when a specific $(CC) version is faulty in one
+ area, for example -mregparm=3 was broken in some gcc versions
+ even though the option was accepted by gcc.
+
+ Example:
+ #arch/i386/Makefile
+ cflags-y += $(shell \
+ if [ $(call cc-version) -ge 0300 ] ; then \
+ echo "-mregparm=3"; fi ;)
+
+ In the above example, -mregparm=3 is only used for gcc version greater
+ than or equal to gcc 3.0.
+
+ cc-ifversion
+ cc-ifversion tests the version of $(CC) and equals last argument if
+ version expression is true.
+
+ Example:
+ #fs/reiserfs/Makefile
+ ccflags-y := $(call cc-ifversion, -lt, 0402, -O1)
+
+ In this example, ccflags-y will be assigned the value -O1 if the
+ $(CC) version is less than 4.2.
+ cc-ifversion takes all the shell operators:
+ -eq, -ne, -lt, -le, -gt, and -ge
+ The third parameter may be a text as in this example, but it may also
+ be an expanded variable or a macro.
+
+ cc-fullversion
+ cc-fullversion is useful when the exact version of gcc is needed.
+ One typical use-case is when a specific GCC version is broken.
+ cc-fullversion points out a more specific version than cc-version does.
+
+ Example:
+ #arch/powerpc/Makefile
+ $(Q)if test "$(call cc-fullversion)" = "040200" ; then \
+ echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \
+ false ; \
+ fi
+
+ In this example for a specific GCC version the build will error out explaining
+ to the user why it stops.
+
+ cc-cross-prefix
+ cc-cross-prefix is used to check if there exists a $(CC) in path with
+ one of the listed prefixes. The first prefix where there exist a
+ prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found
+ then nothing is returned.
+ Additional prefixes are separated by a single space in the
+ call of cc-cross-prefix.
+ This functionality is useful for architecture Makefiles that try
+ to set CROSS_COMPILE to well-known values but may have several
+ values to select between.
+ It is recommended only to try to set CROSS_COMPILE if it is a cross
+ build (host arch is different from target arch). And if CROSS_COMPILE
+ is already set then leave it with the old value.
+
+ Example:
+ #arch/m68k/Makefile
+ ifneq ($(SUBARCH),$(ARCH))
+ ifeq ($(CROSS_COMPILE),)
+ CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-)
+ endif
+ endif
+
+=== 4 Host Program support
+
+Kbuild supports building executables on the host for use during the
+compilation stage.
+Two steps are required in order to use a host executable.
+
+The first step is to tell kbuild that a host program exists. This is
+done utilising the variable hostprogs-y.
+
+The second step is to add an explicit dependency to the executable.
+This can be done in two ways. Either add the dependency in a rule,
+or utilise the variable $(always).
+Both possibilities are described in the following.
+
+--- 4.1 Simple Host Program
+
+ In some cases there is a need to compile and run a program on the
+ computer where the build is running.
+ The following line tells kbuild that the program bin2hex shall be
+ built on the build host.
+
+ Example:
+ hostprogs-y := bin2hex
+
+ Kbuild assumes in the above example that bin2hex is made from a single
+ c-source file named bin2hex.c located in the same directory as
+ the Makefile.
+
+--- 4.2 Composite Host Programs
+
+ Host programs can be made up based on composite objects.
+ The syntax used to define composite objects for host programs is
+ similar to the syntax used for kernel objects.
+ $(<executable>-objs) lists all objects used to link the final
+ executable.
+
+ Example:
+ #scripts/lxdialog/Makefile
+ hostprogs-y := lxdialog
+ lxdialog-objs := checklist.o lxdialog.o
+
+ Objects with extension .o are compiled from the corresponding .c
+ files. In the above example, checklist.c is compiled to checklist.o
+ and lxdialog.c is compiled to lxdialog.o.
+ Finally, the two .o files are linked to the executable, lxdialog.
+ Note: The syntax <executable>-y is not permitted for host-programs.
+
+--- 4.3 Defining shared libraries
+
+ Objects with extension .so are considered shared libraries, and
+ will be compiled as position independent objects.
+ Kbuild provides support for shared libraries, but the usage
+ shall be restricted.
+ In the following example the libkconfig.so shared library is used
+ to link the executable conf.
+
+ Example:
+ #scripts/kconfig/Makefile
+ hostprogs-y := conf
+ conf-objs := conf.o libkconfig.so
+ libkconfig-objs := expr.o type.o
+
+ Shared libraries always require a corresponding -objs line, and
+ in the example above the shared library libkconfig is composed by
+ the two objects expr.o and type.o.
+ expr.o and type.o will be built as position independent code and
+ linked as a shared library libkconfig.so. C++ is not supported for
+ shared libraries.
+
+--- 4.4 Using C++ for host programs
+
+ kbuild offers support for host programs written in C++. This was
+ introduced solely to support kconfig, and is not recommended
+ for general use.
+
+ Example:
+ #scripts/kconfig/Makefile
+ hostprogs-y := qconf
+ qconf-cxxobjs := qconf.o
+
+ In the example above the executable is composed of the C++ file
+ qconf.cc - identified by $(qconf-cxxobjs).
+
+ If qconf is composed by a mixture of .c and .cc files, then an
+ additional line can be used to identify this.
+
+ Example:
+ #scripts/kconfig/Makefile
+ hostprogs-y := qconf
+ qconf-cxxobjs := qconf.o
+ qconf-objs := check.o
+
+--- 4.5 Controlling compiler options for host programs
+
+ When compiling host programs, it is possible to set specific flags.
+ The programs will always be compiled utilising $(HOSTCC) passed
+ the options specified in $(HOSTCFLAGS).
+ To set flags that will take effect for all host programs created
+ in that Makefile, use the variable HOST_EXTRACFLAGS.
+
+ Example:
+ #scripts/lxdialog/Makefile
+ HOST_EXTRACFLAGS += -I/usr/include/ncurses
+
+ To set specific flags for a single file the following construction
+ is used:
+
+ Example:
+ #arch/ppc64/boot/Makefile
+ HOSTCFLAGS_piggyback.o := -DKERNELBASE=$(KERNELBASE)
+
+ It is also possible to specify additional options to the linker.
+
+ Example:
+ #scripts/kconfig/Makefile
+ HOSTLOADLIBES_qconf := -L$(QTDIR)/lib
+
+ When linking qconf, it will be passed the extra option
+ "-L$(QTDIR)/lib".
+
+--- 4.6 When host programs are actually built
+
+ Kbuild will only build host-programs when they are referenced
+ as a prerequisite.
+ This is possible in two ways:
+
+ (1) List the prerequisite explicitly in a special rule.
+
+ Example:
+ #drivers/pci/Makefile
+ hostprogs-y := gen-devlist
+ $(obj)/devlist.h: $(src)/pci.ids $(obj)/gen-devlist
+ ( cd $(obj); ./gen-devlist ) < $<
+
+ The target $(obj)/devlist.h will not be built before
+ $(obj)/gen-devlist is updated. Note that references to
+ the host programs in special rules must be prefixed with $(obj).
+
+ (2) Use $(always)
+ When there is no suitable special rule, and the host program
+ shall be built when a makefile is entered, the $(always)
+ variable shall be used.
+
+ Example:
+ #scripts/lxdialog/Makefile
+ hostprogs-y := lxdialog
+ always := $(hostprogs-y)
+
+ This will tell kbuild to build lxdialog even if not referenced in
+ any rule.
+
+--- 4.7 Using hostprogs-$(CONFIG_FOO)
+
+ A typical pattern in a Kbuild file looks like this:
+
+ Example:
+ #scripts/Makefile
+ hostprogs-$(CONFIG_KALLSYMS) += kallsyms
+
+ Kbuild knows about both 'y' for built-in and 'm' for module.
+ So if a config symbol evaluate to 'm', kbuild will still build
+ the binary. In other words, Kbuild handles hostprogs-m exactly
+ like hostprogs-y. But only hostprogs-y is recommended to be used
+ when no CONFIG symbols are involved.
+
+=== 5 Kbuild clean infrastructure
+
+"make clean" deletes most generated files in the obj tree where the kernel
+is compiled. This includes generated files such as host programs.
+Kbuild knows targets listed in $(hostprogs-y), $(hostprogs-m), $(always),
+$(extra-y) and $(targets). They are all deleted during "make clean".
+Files matching the patterns "*.[oas]", "*.ko", plus some additional files
+generated by kbuild are deleted all over the kernel src tree when
+"make clean" is executed.
+
+Additional files can be specified in kbuild makefiles by use of $(clean-files).
+
+ Example:
+ #drivers/pci/Makefile
+ clean-files := devlist.h classlist.h
+
+When executing "make clean", the two files "devlist.h classlist.h" will
+be deleted. Kbuild will assume files to be in same relative directory as the
+Makefile except if an absolute path is specified (path starting with '/').
+
+To delete a directory hierarchy use:
+
+ Example:
+ #scripts/package/Makefile
+ clean-dirs := $(objtree)/debian/
+
+This will delete the directory debian, including all subdirectories.
+Kbuild will assume the directories to be in the same relative path as the
+Makefile if no absolute path is specified (path does not start with '/').
+
+Usually kbuild descends down in subdirectories due to "obj-* := dir/",
+but in the architecture makefiles where the kbuild infrastructure
+is not sufficient this sometimes needs to be explicit.
+
+ Example:
+ #arch/i386/boot/Makefile
+ subdir- := compressed/
+
+The above assignment instructs kbuild to descend down in the
+directory compressed/ when "make clean" is executed.
+
+To support the clean infrastructure in the Makefiles that builds the
+final bootimage there is an optional target named archclean:
+
+ Example:
+ #arch/i386/Makefile
+ archclean:
+ $(Q)$(MAKE) $(clean)=arch/i386/boot
+
+When "make clean" is executed, make will descend down in arch/i386/boot,
+and clean as usual. The Makefile located in arch/i386/boot/ may use
+the subdir- trick to descend further down.
+
+Note 1: arch/$(ARCH)/Makefile cannot use "subdir-", because that file is
+included in the top level makefile, and the kbuild infrastructure
+is not operational at that point.
+
+Note 2: All directories listed in core-y, libs-y, drivers-y and net-y will
+be visited during "make clean".
+
+=== 6 Architecture Makefiles
+
+The top level Makefile sets up the environment and does the preparation,
+before starting to descend down in the individual directories.
+The top level makefile contains the generic part, whereas
+arch/$(ARCH)/Makefile contains what is required to set up kbuild
+for said architecture.
+To do so, arch/$(ARCH)/Makefile sets up a number of variables and defines
+a few targets.
+
+When kbuild executes, the following steps are followed (roughly):
+1) Configuration of the kernel => produce .config
+2) Store kernel version in include/linux/version.h
+3) Symlink include/asm to include/asm-$(ARCH)
+4) Updating all other prerequisites to the target prepare:
+ - Additional prerequisites are specified in arch/$(ARCH)/Makefile
+5) Recursively descend down in all directories listed in
+ init-* core* drivers-* net-* libs-* and build all targets.
+ - The values of the above variables are expanded in arch/$(ARCH)/Makefile.
+6) All object files are then linked and the resulting file vmlinux is
+ located at the root of the obj tree.
+ The very first objects linked are listed in head-y, assigned by
+ arch/$(ARCH)/Makefile.
+7) Finally, the architecture-specific part does any required post processing
+ and builds the final bootimage.
+ - This includes building boot records
+ - Preparing initrd images and the like
+
+
+--- 6.1 Set variables to tweak the build to the architecture
+
+ LDFLAGS Generic $(LD) options
+
+ Flags used for all invocations of the linker.
+ Often specifying the emulation is sufficient.
+
+ Example:
+ #arch/s390/Makefile
+ LDFLAGS := -m elf_s390
+ Note: ldflags-y can be used to further customise
+ the flags used. See chapter 3.7.
+
+ LDFLAGS_MODULE Options for $(LD) when linking modules
+
+ LDFLAGS_MODULE is used to set specific flags for $(LD) when
+ linking the .ko files used for modules.
+ Default is "-r", for relocatable output.
+
+ LDFLAGS_vmlinux Options for $(LD) when linking vmlinux
+
+ LDFLAGS_vmlinux is used to specify additional flags to pass to
+ the linker when linking the final vmlinux image.
+ LDFLAGS_vmlinux uses the LDFLAGS_$@ support.
+
+ Example:
+ #arch/i386/Makefile
+ LDFLAGS_vmlinux := -e stext
+
+ OBJCOPYFLAGS objcopy flags
+
+ When $(call if_changed,objcopy) is used to translate a .o file,
+ the flags specified in OBJCOPYFLAGS will be used.
+ $(call if_changed,objcopy) is often used to generate raw binaries on
+ vmlinux.
+
+ Example:
+ #arch/s390/Makefile
+ OBJCOPYFLAGS := -O binary
+
+ #arch/s390/boot/Makefile
+ $(obj)/image: vmlinux FORCE
+ $(call if_changed,objcopy)
+
+ In this example, the binary $(obj)/image is a binary version of
+ vmlinux. The usage of $(call if_changed,xxx) will be described later.
+
+ KBUILD_AFLAGS $(AS) assembler flags
+
+ Default value - see top level Makefile
+ Append or modify as required per architecture.
+
+ Example:
+ #arch/sparc64/Makefile
+ KBUILD_AFLAGS += -m64 -mcpu=ultrasparc
+
+ KBUILD_CFLAGS $(CC) compiler flags
+
+ Default value - see top level Makefile
+ Append or modify as required per architecture.
+
+ Often, the KBUILD_CFLAGS variable depends on the configuration.
+
+ Example:
+ #arch/i386/Makefile
+ cflags-$(CONFIG_M386) += -march=i386
+ KBUILD_CFLAGS += $(cflags-y)
+
+ Many arch Makefiles dynamically run the target C compiler to
+ probe supported options:
+
+ #arch/i386/Makefile
+
+ ...
+ cflags-$(CONFIG_MPENTIUMII) += $(call cc-option,\
+ -march=pentium2,-march=i686)
+ ...
+ # Disable unit-at-a-time mode ...
+ KBUILD_CFLAGS += $(call cc-option,-fno-unit-at-a-time)
+ ...
+
+
+ The first example utilises the trick that a config option expands
+ to 'y' when selected.
+
+ CFLAGS_KERNEL $(CC) options specific for built-in
+
+ $(CFLAGS_KERNEL) contains extra C compiler flags used to compile
+ resident kernel code.
+
+ CFLAGS_MODULE $(CC) options specific for modules
+
+ $(CFLAGS_MODULE) contains extra C compiler flags used to compile code
+ for loadable kernel modules.
+
+
+--- 6.2 Add prerequisites to archprepare:
+
+ The archprepare: rule is used to list prerequisites that need to be
+ built before starting to descend down in the subdirectories.
+ This is usually used for header files containing assembler constants.
+
+ Example:
+ #arch/arm/Makefile
+ archprepare: maketools
+
+ In this example, the file target maketools will be processed
+ before descending down in the subdirectories.
+ See also chapter XXX-TODO that describe how kbuild supports
+ generating offset header files.
+
+
+--- 6.3 List directories to visit when descending
+
+ An arch Makefile cooperates with the top Makefile to define variables
+ which specify how to build the vmlinux file. Note that there is no
+ corresponding arch-specific section for modules; the module-building
+ machinery is all architecture-independent.
+
+
+ head-y, init-y, core-y, libs-y, drivers-y, net-y
+
+ $(head-y) lists objects to be linked first in vmlinux.
+ $(libs-y) lists directories where a lib.a archive can be located.
+ The rest list directories where a built-in.o object file can be
+ located.
+
+ $(init-y) objects will be located after $(head-y).
+ Then the rest follows in this order:
+ $(core-y), $(libs-y), $(drivers-y) and $(net-y).
+
+ The top level Makefile defines values for all generic directories,
+ and arch/$(ARCH)/Makefile only adds architecture-specific directories.
+
+ Example:
+ #arch/sparc64/Makefile
+ core-y += arch/sparc64/kernel/
+ libs-y += arch/sparc64/prom/ arch/sparc64/lib/
+ drivers-$(CONFIG_OPROFILE) += arch/sparc64/oprofile/
+
+
+--- 6.4 Architecture-specific boot images
+
+ An arch Makefile specifies goals that take the vmlinux file, compress
+ it, wrap it in bootstrapping code, and copy the resulting files
+ somewhere. This includes various kinds of installation commands.
+ The actual goals are not standardized across architectures.
+
+ It is common to locate any additional processing in a boot/
+ directory below arch/$(ARCH)/.
+
+ Kbuild does not provide any smart way to support building a
+ target specified in boot/. Therefore arch/$(ARCH)/Makefile shall
+ call make manually to build a target in boot/.
+
+ The recommended approach is to include shortcuts in
+ arch/$(ARCH)/Makefile, and use the full path when calling down
+ into the arch/$(ARCH)/boot/Makefile.
+
+ Example:
+ #arch/i386/Makefile
+ boot := arch/i386/boot
+ bzImage: vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
+
+ "$(Q)$(MAKE) $(build)=<dir>" is the recommended way to invoke
+ make in a subdirectory.
+
+ There are no rules for naming architecture-specific targets,
+ but executing "make help" will list all relevant targets.
+ To support this, $(archhelp) must be defined.
+
+ Example:
+ #arch/i386/Makefile
+ define archhelp
+ echo '* bzImage - Image (arch/$(ARCH)/boot/bzImage)'
+ endif
+
+ When make is executed without arguments, the first goal encountered
+ will be built. In the top level Makefile the first goal present
+ is all:.
+ An architecture shall always, per default, build a bootable image.
+ In "make help", the default goal is highlighted with a '*'.
+ Add a new prerequisite to all: to select a default goal different
+ from vmlinux.
+
+ Example:
+ #arch/i386/Makefile
+ all: bzImage
+
+ When "make" is executed without arguments, bzImage will be built.
+
+--- 6.5 Building non-kbuild targets
+
+ extra-y
+
+ extra-y specify additional targets created in the current
+ directory, in addition to any targets specified by obj-*.
+
+ Listing all targets in extra-y is required for two purposes:
+ 1) Enable kbuild to check changes in command lines
+ - When $(call if_changed,xxx) is used
+ 2) kbuild knows what files to delete during "make clean"
+
+ Example:
+ #arch/i386/kernel/Makefile
+ extra-y := head.o init_task.o
+
+ In this example, extra-y is used to list object files that
+ shall be built, but shall not be linked as part of built-in.o.
+
+
+--- 6.6 Commands useful for building a boot image
+
+ Kbuild provides a few macros that are useful when building a
+ boot image.
+
+ if_changed
+
+ if_changed is the infrastructure used for the following commands.
+
+ Usage:
+ target: source(s) FORCE
+ $(call if_changed,ld/objcopy/gzip)
+
+ When the rule is evaluated, it is checked to see if any files
+ need an update, or the command line has changed since the last
+ invocation. The latter will force a rebuild if any options
+ to the executable have changed.
+ Any target that utilises if_changed must be listed in $(targets),
+ otherwise the command line check will fail, and the target will
+ always be built.
+ Assignments to $(targets) are without $(obj)/ prefix.
+ if_changed may be used in conjunction with custom commands as
+ defined in 6.7 "Custom kbuild commands".
+
+ Note: It is a typical mistake to forget the FORCE prerequisite.
+ Another common pitfall is that whitespace is sometimes
+ significant; for instance, the below will fail (note the extra space
+ after the comma):
+ target: source(s) FORCE
+ #WRONG!# $(call if_changed, ld/objcopy/gzip)
+
+ ld
+ Link target. Often, LDFLAGS_$@ is used to set specific options to ld.
+
+ objcopy
+ Copy binary. Uses OBJCOPYFLAGS usually specified in
+ arch/$(ARCH)/Makefile.
+ OBJCOPYFLAGS_$@ may be used to set additional options.
+
+ gzip
+ Compress target. Use maximum compression to compress target.
+
+ Example:
+ #arch/i386/boot/Makefile
+ LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary
+ LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext
+
+ targets += setup setup.o bootsect bootsect.o
+ $(obj)/setup $(obj)/bootsect: %: %.o FORCE
+ $(call if_changed,ld)
+
+ In this example, there are two possible targets, requiring different
+ options to the linker. The linker options are specified using the
+ LDFLAGS_$@ syntax - one for each potential target.
+ $(targets) are assigned all potential targets, by which kbuild knows
+ the targets and will:
+ 1) check for commandline changes
+ 2) delete target during make clean
+
+ The ": %: %.o" part of the prerequisite is a shorthand that
+ free us from listing the setup.o and bootsect.o files.
+ Note: It is a common mistake to forget the "target :=" assignment,
+ resulting in the target file being recompiled for no
+ obvious reason.
+
+
+--- 6.7 Custom kbuild commands
+
+ When kbuild is executing with KBUILD_VERBOSE=0, then only a shorthand
+ of a command is normally displayed.
+ To enable this behaviour for custom commands kbuild requires
+ two variables to be set:
+ quiet_cmd_<command> - what shall be echoed
+ cmd_<command> - the command to execute
+
+ Example:
+ #
+ quiet_cmd_image = BUILD $@
+ cmd_image = $(obj)/tools/build $(BUILDFLAGS) \
+ $(obj)/vmlinux.bin > $@
+
+ targets += bzImage
+ $(obj)/bzImage: $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+ $(call if_changed,image)
+ @echo 'Kernel: $@ is ready'
+
+ When updating the $(obj)/bzImage target, the line
+
+ BUILD arch/i386/boot/bzImage
+
+ will be displayed with "make KBUILD_VERBOSE=0".
+
+
+--- 6.8 Preprocessing linker scripts
+
+ When the vmlinux image is built, the linker script
+ arch/$(ARCH)/kernel/vmlinux.lds is used.
+ The script is a preprocessed variant of the file vmlinux.lds.S
+ located in the same directory.
+ kbuild knows .lds files and includes a rule *lds.S -> *lds.
+
+ Example:
+ #arch/i386/kernel/Makefile
+ always := vmlinux.lds
+
+ #Makefile
+ export CPPFLAGS_vmlinux.lds += -P -C -U$(ARCH)
+
+ The assignment to $(always) is used to tell kbuild to build the
+ target vmlinux.lds.
+ The assignment to $(CPPFLAGS_vmlinux.lds) tells kbuild to use the
+ specified options when building the target vmlinux.lds.
+
+ When building the *.lds target, kbuild uses the variables:
+ KBUILD_CPPFLAGS : Set in top-level Makefile
+ cppflags-y : May be set in the kbuild makefile
+ CPPFLAGS_$(@F) : Target specific flags.
+ Note that the full filename is used in this
+ assignment.
+
+ The kbuild infrastructure for *lds file are used in several
+ architecture-specific files.
+
+
+=== 7 Kbuild Variables
+
+The top Makefile exports the following variables:
+
+ VERSION, PATCHLEVEL, SUBLEVEL, EXTRAVERSION
+
+ These variables define the current kernel version. A few arch
+ Makefiles actually use these values directly; they should use
+ $(KERNELRELEASE) instead.
+
+ $(VERSION), $(PATCHLEVEL), and $(SUBLEVEL) define the basic
+ three-part version number, such as "2", "4", and "0". These three
+ values are always numeric.
+
+ $(EXTRAVERSION) defines an even tinier sublevel for pre-patches
+ or additional patches. It is usually some non-numeric string
+ such as "-pre4", and is often blank.
+
+ KERNELRELEASE
+
+ $(KERNELRELEASE) is a single string such as "2.4.0-pre4", suitable
+ for constructing installation directory names or showing in
+ version strings. Some arch Makefiles use it for this purpose.
+
+ ARCH
+
+ This variable defines the target architecture, such as "i386",
+ "arm", or "sparc". Some kbuild Makefiles test $(ARCH) to
+ determine which files to compile.
+
+ By default, the top Makefile sets $(ARCH) to be the same as the
+ host system architecture. For a cross build, a user may
+ override the value of $(ARCH) on the command line:
+
+ make ARCH=m68k ...
+
+
+ INSTALL_PATH
+
+ This variable defines a place for the arch Makefiles to install
+ the resident kernel image and System.map file.
+ Use this for architecture-specific install targets.
+
+ INSTALL_MOD_PATH, MODLIB
+
+ $(INSTALL_MOD_PATH) specifies a prefix to $(MODLIB) for module
+ installation. This variable is not defined in the Makefile but
+ may be passed in by the user if desired.
+
+ $(MODLIB) specifies the directory for module installation.
+ The top Makefile defines $(MODLIB) to
+ $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE). The user may
+ override this value on the command line if desired.
+
+ INSTALL_MOD_STRIP
+
+ If this variable is specified, will cause modules to be stripped
+ after they are installed. If INSTALL_MOD_STRIP is '1', then the
+ default option --strip-debug will be used. Otherwise,
+ INSTALL_MOD_STRIP will used as the option(s) to the strip command.
+
+
+=== 8 Makefile language
+
+The kernel Makefiles are designed to be run with GNU Make. The Makefiles
+use only the documented features of GNU Make, but they do use many
+GNU extensions.
+
+GNU Make supports elementary list-processing functions. The kernel
+Makefiles use a novel style of list building and manipulation with few
+"if" statements.
+
+GNU Make has two assignment operators, ":=" and "=". ":=" performs
+immediate evaluation of the right-hand side and stores an actual string
+into the left-hand side. "=" is like a formula definition; it stores the
+right-hand side in an unevaluated form and then evaluates this form each
+time the left-hand side is used.
+
+There are some cases where "=" is appropriate. Usually, though, ":="
+is the right choice.
+
+=== 9 Credits
+
+Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net>
+Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de>
+Updates by Sam Ravnborg <sam@ravnborg.org>
+Language QA by Jan Engelhardt <jengelh@gmx.de>
+
+=== 10 TODO
+
+- Describe how kbuild supports shipped files with _shipped.
+- Generating offset header files.
+- Add more variables to section 7?
+
+
+
diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.txt
new file mode 100644
index 0000000..1821c07
--- /dev/null
+++ b/Documentation/kbuild/modules.txt
@@ -0,0 +1,552 @@
+
+In this document you will find information about:
+- how to build external modules
+- how to make your module use the kbuild infrastructure
+- how kbuild will install a kernel
+- how to install modules in a non-standard location
+
+=== Table of Contents
+
+ === 1 Introduction
+ === 2 How to build external modules
+ --- 2.1 Building external modules
+ --- 2.2 Available targets
+ --- 2.3 Available options
+ --- 2.4 Preparing the kernel tree for module build
+ --- 2.5 Building separate files for a module
+ === 3. Example commands
+ === 4. Creating a kbuild file for an external module
+ === 5. Include files
+ --- 5.1 How to include files from the kernel include dir
+ --- 5.2 External modules using an include/ dir
+ --- 5.3 External modules using several directories
+ === 6. Module installation
+ --- 6.1 INSTALL_MOD_PATH
+ --- 6.2 INSTALL_MOD_DIR
+ === 7. Module versioning & Module.symvers
+ --- 7.1 Symbols from the kernel (vmlinux + modules)
+ --- 7.2 Symbols and external modules
+ --- 7.3 Symbols from another external module
+ === 8. Tips & Tricks
+ --- 8.1 Testing for CONFIG_FOO_BAR
+
+
+
+=== 1. Introduction
+
+kbuild includes functionality for building modules both
+within the kernel source tree and outside the kernel source tree.
+The latter is usually referred to as external or "out-of-tree"
+modules and is used both during development and for modules that
+are not planned to be included in the kernel tree.
+
+What is covered within this file is mainly information to authors
+of modules. The author of an external module should supply
+a makefile that hides most of the complexity, so one only has to type
+'make' to build the module. A complete example will be presented in
+chapter 4, "Creating a kbuild file for an external module".
+
+
+=== 2. How to build external modules
+
+kbuild offers functionality to build external modules, with the
+prerequisite that there is a pre-built kernel available with full source.
+A subset of the targets available when building the kernel is available
+when building an external module.
+
+--- 2.1 Building external modules
+
+ Use the following command to build an external module:
+
+ make -C <path-to-kernel> M=`pwd`
+
+ For the running kernel use:
+
+ make -C /lib/modules/`uname -r`/build M=`pwd`
+
+ For the above command to succeed, the kernel must have been
+ built with modules enabled.
+
+ To install the modules that were just built:
+
+ make -C <path-to-kernel> M=`pwd` modules_install
+
+ More complex examples will be shown later, the above should
+ be enough to get you started.
+
+--- 2.2 Available targets
+
+ $KDIR refers to the path to the kernel source top-level directory
+
+ make -C $KDIR M=`pwd`
+ Will build the module(s) located in current directory.
+ All output files will be located in the same directory
+ as the module source.
+ No attempts are made to update the kernel source, and it is
+ a precondition that a successful make has been executed
+ for the kernel.
+
+ make -C $KDIR M=`pwd` modules
+ The modules target is implied when no target is given.
+ Same functionality as if no target was specified.
+ See description above.
+
+ make -C $KDIR M=`pwd` modules_install
+ Install the external module(s).
+ Installation default is in /lib/modules/<kernel-version>/extra,
+ but may be prefixed with INSTALL_MOD_PATH - see separate
+ chapter.
+
+ make -C $KDIR M=`pwd` clean
+ Remove all generated files for the module - the kernel
+ source directory is not modified.
+
+ make -C $KDIR M=`pwd` help
+ help will list the available target when building external
+ modules.
+
+--- 2.3 Available options:
+
+ $KDIR refers to the path to the kernel source top-level directory
+
+ make -C $KDIR
+ Used to specify where to find the kernel source.
+ '$KDIR' represent the directory where the kernel source is.
+ Make will actually change directory to the specified directory
+ when executed but change back when finished.
+
+ make -C $KDIR M=`pwd`
+ M= is used to tell kbuild that an external module is
+ being built.
+ The option given to M= is the directory where the external
+ module (kbuild file) is located.
+ When an external module is being built only a subset of the
+ usual targets are available.
+
+ make -C $KDIR SUBDIRS=`pwd`
+ Same as M=. The SUBDIRS= syntax is kept for backwards
+ compatibility.
+
+--- 2.4 Preparing the kernel tree for module build
+
+ To make sure the kernel contains the information required to
+ build external modules the target 'modules_prepare' must be used.
+ 'modules_prepare' exists solely as a simple way to prepare
+ a kernel source tree for building external modules.
+ Note: modules_prepare will not build Module.symvers even if
+ CONFIG_MODVERSIONS is set. Therefore a full kernel build
+ needs to be executed to make module versioning work.
+
+--- 2.5 Building separate files for a module
+ It is possible to build single files which are part of a module.
+ This works equally well for the kernel, a module and even for
+ external modules.
+ Examples (module foo.ko, consist of bar.o, baz.o):
+ make -C $KDIR M=`pwd` bar.lst
+ make -C $KDIR M=`pwd` bar.o
+ make -C $KDIR M=`pwd` foo.ko
+ make -C $KDIR M=`pwd` /
+
+
+=== 3. Example commands
+
+This example shows the actual commands to be executed when building
+an external module for the currently running kernel.
+In the example below, the distribution is supposed to use the
+facility to locate output files for a kernel compile in a different
+directory than the kernel source - but the examples will also work
+when the source and the output files are mixed in the same directory.
+
+# Kernel source
+/lib/modules/<kernel-version>/source -> /usr/src/linux-<version>
+
+# Output from kernel compile
+/lib/modules/<kernel-version>/build -> /usr/src/linux-<version>-up
+
+Change to the directory where the kbuild file is located and execute
+the following commands to build the module:
+
+ cd /home/user/src/module
+ make -C /usr/src/`uname -r`/source \
+ O=/lib/modules/`uname-r`/build \
+ M=`pwd`
+
+Then, to install the module use the following command:
+
+ make -C /usr/src/`uname -r`/source \
+ O=/lib/modules/`uname-r`/build \
+ M=`pwd` \
+ modules_install
+
+If you look closely you will see that this is the same command as
+listed before - with the directories spelled out.
+
+The above are rather long commands, and the following chapter
+lists a few tricks to make it all easier.
+
+
+=== 4. Creating a kbuild file for an external module
+
+kbuild is the build system for the kernel, and external modules
+must use kbuild to stay compatible with changes in the build system
+and to pick up the right flags to gcc etc.
+
+The kbuild file used as input shall follow the syntax described
+in Documentation/kbuild/makefiles.txt. This chapter will introduce a few
+more tricks to be used when dealing with external modules.
+
+In the following a Makefile will be created for a module with the
+following files:
+ 8123_if.c
+ 8123_if.h
+ 8123_pci.c
+ 8123_bin.o_shipped <= Binary blob
+
+--- 4.1 Shared Makefile for module and kernel
+
+ An external module always includes a wrapper Makefile supporting
+ building the module using 'make' with no arguments.
+ The Makefile provided will most likely include additional
+ functionality such as test targets etc. and this part shall
+ be filtered away from kbuild since it may impact kbuild if
+ name clashes occurs.
+
+ Example 1:
+ --> filename: Makefile
+ ifneq ($(KERNELRELEASE),)
+ # kbuild part of makefile
+ obj-m := 8123.o
+ 8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+ else
+ # Normal Makefile
+
+ KERNELDIR := /lib/modules/`uname -r`/build
+ all::
+ $(MAKE) -C $(KERNELDIR) M=`pwd` $@
+
+ # Module specific targets
+ genbin:
+ echo "X" > 8123_bin.o_shipped
+
+ endif
+
+ In example 1, the check for KERNELRELEASE is used to separate
+ the two parts of the Makefile. kbuild will only see the two
+ assignments whereas make will see everything except the two
+ kbuild assignments.
+
+ In recent versions of the kernel, kbuild will look for a file named
+ Kbuild and as second option look for a file named Makefile.
+ Utilising the Kbuild file makes us split up the Makefile in example 1
+ into two files as shown in example 2:
+
+ Example 2:
+ --> filename: Kbuild
+ obj-m := 8123.o
+ 8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+ --> filename: Makefile
+ KERNELDIR := /lib/modules/`uname -r`/build
+ all::
+ $(MAKE) -C $(KERNELDIR) M=`pwd` $@
+
+ # Module specific targets
+ genbin:
+ echo "X" > 8123_bin_shipped
+
+
+ In example 2, we are down to two fairly simple files and for simple
+ files as used in this example the split is questionable. But some
+ external modules use Makefiles of several hundred lines and here it
+ really pays off to separate the kbuild part from the rest.
+ Example 3 shows a backward compatible version.
+
+ Example 3:
+ --> filename: Kbuild
+ obj-m := 8123.o
+ 8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+ --> filename: Makefile
+ ifneq ($(KERNELRELEASE),)
+ include Kbuild
+ else
+ # Normal Makefile
+
+ KERNELDIR := /lib/modules/`uname -r`/build
+ all::
+ $(MAKE) -C $KERNELDIR M=`pwd` $@
+
+ # Module specific targets
+ genbin:
+ echo "X" > 8123_bin_shipped
+
+ endif
+
+ The trick here is to include the Kbuild file from Makefile, so
+ if an older version of kbuild picks up the Makefile, the Kbuild
+ file will be included.
+
+--- 4.2 Binary blobs included in a module
+
+ Some external modules needs to include a .o as a blob. kbuild
+ has support for this, but requires the blob file to be named
+ <filename>_shipped. In our example the blob is named
+ 8123_bin.o_shipped and when the kbuild rules kick in the file
+ 8123_bin.o is created as a simple copy off the 8213_bin.o_shipped file
+ with the _shipped part stripped of the filename.
+ This allows the 8123_bin.o filename to be used in the assignment to
+ the module.
+
+ Example 4:
+ obj-m := 8123.o
+ 8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+ In example 4, there is no distinction between the ordinary .c/.h files
+ and the binary file. But kbuild will pick up different rules to create
+ the .o file.
+
+
+=== 5. Include files
+
+Include files are a necessity when a .c file uses something from other .c
+files (not strictly in the sense of C, but if good programming practice is
+used). Any module that consists of more than one .c file will have a .h file
+for one of the .c files.
+
+- If the .h file only describes a module internal interface, then the .h file
+ shall be placed in the same directory as the .c files.
+- If the .h files describe an interface used by other parts of the kernel
+ located in different directories, the .h files shall be located in
+ include/linux/ or other include/ directories as appropriate.
+
+One exception for this rule is larger subsystems that have their own directory
+under include/ such as include/scsi. Another exception is arch-specific
+.h files which are located under include/asm-$(ARCH)/*.
+
+External modules have a tendency to locate include files in a separate include/
+directory and therefore need to deal with this in their kbuild file.
+
+--- 5.1 How to include files from the kernel include dir
+
+ When a module needs to include a file from include/linux/, then one
+ just uses:
+
+ #include <linux/modules.h>
+
+ kbuild will make sure to add options to gcc so the relevant
+ directories are searched.
+ Likewise for .h files placed in the same directory as the .c file.
+
+ #include "8123_if.h"
+
+ will do the job.
+
+--- 5.2 External modules using an include/ dir
+
+ External modules often locate their .h files in a separate include/
+ directory although this is not usual kernel style. When an external
+ module uses an include/ dir then kbuild needs to be told so.
+ The trick here is to use either EXTRA_CFLAGS (take effect for all .c
+ files) or CFLAGS_$F.o (take effect only for a single file).
+
+ In our example, if we move 8123_if.h to a subdirectory named include/
+ the resulting Kbuild file would look like:
+
+ --> filename: Kbuild
+ obj-m := 8123.o
+
+ EXTRA_CFLAGS := -Iinclude
+ 8123-y := 8123_if.o 8123_pci.o 8123_bin.o
+
+ Note that in the assignment there is no space between -I and the path.
+ This is a kbuild limitation: there must be no space present.
+
+--- 5.3 External modules using several directories
+
+ If an external module does not follow the usual kernel style, but
+ decides to spread files over several directories, then kbuild can
+ handle this too.
+
+ Consider the following example:
+
+ |
+ +- src/complex_main.c
+ | +- hal/hardwareif.c
+ | +- hal/include/hardwareif.h
+ +- include/complex.h
+
+ To build a single module named complex.ko, we then need the following
+ kbuild file:
+
+ Kbuild:
+ obj-m := complex.o
+ complex-y := src/complex_main.o
+ complex-y += src/hal/hardwareif.o
+
+ EXTRA_CFLAGS := -I$(src)/include
+ EXTRA_CFLAGS += -I$(src)src/hal/include
+
+
+ kbuild knows how to handle .o files located in another directory -
+ although this is NOT recommended practice. The syntax is to specify
+ the directory relative to the directory where the Kbuild file is
+ located.
+
+ To find the .h files, we have to explicitly tell kbuild where to look
+ for the .h files. When kbuild executes, the current directory is always
+ the root of the kernel tree (argument to -C) and therefore we have to
+ tell kbuild how to find the .h files using absolute paths.
+ $(src) will specify the absolute path to the directory where the
+ Kbuild file are located when being build as an external module.
+ Therefore -I$(src)/ is used to point out the directory of the Kbuild
+ file and any additional path are just appended.
+
+=== 6. Module installation
+
+Modules which are included in the kernel are installed in the directory:
+
+ /lib/modules/$(KERNELRELEASE)/kernel
+
+External modules are installed in the directory:
+
+ /lib/modules/$(KERNELRELEASE)/extra
+
+--- 6.1 INSTALL_MOD_PATH
+
+ Above are the default directories, but as always, some level of
+ customization is possible. One can prefix the path using the variable
+ INSTALL_MOD_PATH:
+
+ $ make INSTALL_MOD_PATH=/frodo modules_install
+ => Install dir: /frodo/lib/modules/$(KERNELRELEASE)/kernel
+
+ INSTALL_MOD_PATH may be set as an ordinary shell variable or as in the
+ example above, can be specified on the command line when calling make.
+ INSTALL_MOD_PATH has effect both when installing modules included in
+ the kernel as well as when installing external modules.
+
+--- 6.2 INSTALL_MOD_DIR
+
+ When installing external modules they are by default installed to a
+ directory under /lib/modules/$(KERNELRELEASE)/extra, but one may wish
+ to locate modules for a specific functionality in a separate
+ directory. For this purpose, one can use INSTALL_MOD_DIR to specify an
+ alternative name to 'extra'.
+
+ $ make INSTALL_MOD_DIR=gandalf -C KERNELDIR \
+ M=`pwd` modules_install
+ => Install dir: /lib/modules/$(KERNELRELEASE)/gandalf
+
+
+=== 7. Module versioning & Module.symvers
+
+Module versioning is enabled by the CONFIG_MODVERSIONS tag.
+
+Module versioning is used as a simple ABI consistency check. The Module
+versioning creates a CRC value of the full prototype for an exported symbol and
+when a module is loaded/used then the CRC values contained in the kernel are
+compared with similar values in the module. If they are not equal, then the
+kernel refuses to load the module.
+
+Module.symvers contains a list of all exported symbols from a kernel build.
+
+--- 7.1 Symbols from the kernel (vmlinux + modules)
+
+ During a kernel build, a file named Module.symvers will be generated.
+ Module.symvers contains all exported symbols from the kernel and
+ compiled modules. For each symbols, the corresponding CRC value
+ is stored too.
+
+ The syntax of the Module.symvers file is:
+ <CRC> <Symbol> <module>
+ Sample:
+ 0x2d036834 scsi_remove_host drivers/scsi/scsi_mod
+
+ For a kernel build without CONFIG_MODVERSIONS enabled, the crc
+ would read: 0x00000000
+
+ Module.symvers serves two purposes:
+ 1) It lists all exported symbols both from vmlinux and all modules
+ 2) It lists the CRC if CONFIG_MODVERSIONS is enabled
+
+--- 7.2 Symbols and external modules
+
+ When building an external module, the build system needs access to
+ the symbols from the kernel to check if all external symbols are
+ defined. This is done in the MODPOST step and to obtain all
+ symbols, modpost reads Module.symvers from the kernel.
+ If a Module.symvers file is present in the directory where
+ the external module is being built, this file will be read too.
+ During the MODPOST step, a new Module.symvers file will be written
+ containing all exported symbols that were not defined in the kernel.
+
+--- 7.3 Symbols from another external module
+
+ Sometimes, an external module uses exported symbols from another
+ external module. Kbuild needs to have full knowledge on all symbols
+ to avoid spitting out warnings about undefined symbols.
+ Three solutions exist to let kbuild know all symbols of more than
+ one external module.
+ The method with a top-level kbuild file is recommended but may be
+ impractical in certain situations.
+
+ Use a top-level Kbuild file
+ If you have two modules: 'foo' and 'bar', and 'foo' needs
+ symbols from 'bar', then one can use a common top-level kbuild
+ file so both modules are compiled in same build.
+
+ Consider following directory layout:
+ ./foo/ <= contains the foo module
+ ./bar/ <= contains the bar module
+ The top-level Kbuild file would then look like:
+
+ #./Kbuild: (this file may also be named Makefile)
+ obj-y := foo/ bar/
+
+ Executing:
+ make -C $KDIR M=`pwd`
+
+ will then do the expected and compile both modules with full
+ knowledge on symbols from both modules.
+
+ Use an extra Module.symvers file
+ When an external module is built, a Module.symvers file is
+ generated containing all exported symbols which are not
+ defined in the kernel.
+ To get access to symbols from module 'bar', one can copy the
+ Module.symvers file from the compilation of the 'bar' module
+ to the directory where the 'foo' module is built.
+ During the module build, kbuild will read the Module.symvers
+ file in the directory of the external module and when the
+ build is finished, a new Module.symvers file is created
+ containing the sum of all symbols defined and not part of the
+ kernel.
+
+ Use make variable KBUILD_EXTRA_SYMBOLS in the Makefile
+ If it is impractical to copy Module.symvers from another
+ module, you can assign a space separated list of files to
+ KBUILD_EXTRA_SYMBOLS in your Makfile. These files will be
+ loaded by modpost during the initialisation of its symbol
+ tables.
+
+=== 8. Tips & Tricks
+
+--- 8.1 Testing for CONFIG_FOO_BAR
+
+ Modules often need to check for certain CONFIG_ options to decide if
+ a specific feature shall be included in the module. When kbuild is used
+ this is done by referencing the CONFIG_ variable directly.
+
+ #fs/ext2/Makefile
+ obj-$(CONFIG_EXT2_FS) += ext2.o
+
+ ext2-y := balloc.o bitmap.o dir.o
+ ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o
+
+ External modules have traditionally used grep to check for specific
+ CONFIG_ settings directly in .config. This usage is broken.
+ As introduced before, external modules shall use kbuild when building
+ and therefore can use the same methods as in-kernel modules when
+ testing for CONFIG_ definitions.
+
diff --git a/Documentation/kdump/gdbmacros.txt b/Documentation/kdump/gdbmacros.txt
new file mode 100644
index 0000000..9b9b454
--- /dev/null
+++ b/Documentation/kdump/gdbmacros.txt
@@ -0,0 +1,201 @@
+#
+# This file contains a few gdb macros (user defined commands) to extract
+# useful information from kernel crashdump (kdump) like stack traces of
+# all the processes or a particular process and trapinfo.
+#
+# These macros can be used by copying this file in .gdbinit (put in home
+# directory or current directory) or by invoking gdb command with
+# --command=<command-file-name> option
+#
+# Credits:
+# Alexander Nyberg <alexn@telia.com>
+# V Srivatsa <vatsa@in.ibm.com>
+# Maneesh Soni <maneesh@in.ibm.com>
+#
+
+define bttnobp
+ set $tasks_off=((size_t)&((struct task_struct *)0)->tasks)
+ set $pid_off=((size_t)&((struct task_struct *)0)->pids[1].pid_list.next)
+ set $init_t=&init_task
+ set $next_t=(((char *)($init_t->tasks).next) - $tasks_off)
+ while ($next_t != $init_t)
+ set $next_t=(struct task_struct *)$next_t
+ printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm
+ printf "===================\n"
+ set var $stackp = $next_t.thread.esp
+ set var $stack_top = ($stackp & ~4095) + 4096
+
+ while ($stackp < $stack_top)
+ if (*($stackp) > _stext && *($stackp) < _sinittext)
+ info symbol *($stackp)
+ end
+ set $stackp += 4
+ end
+ set $next_th=(((char *)$next_t->pids[1].pid_list.next) - $pid_off)
+ while ($next_th != $next_t)
+ set $next_th=(struct task_struct *)$next_th
+ printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm
+ printf "===================\n"
+ set var $stackp = $next_t.thread.esp
+ set var $stack_top = ($stackp & ~4095) + 4096
+
+ while ($stackp < $stack_top)
+ if (*($stackp) > _stext && *($stackp) < _sinittext)
+ info symbol *($stackp)
+ end
+ set $stackp += 4
+ end
+ set $next_th=(((char *)$next_th->pids[1].pid_list.next) - $pid_off)
+ end
+ set $next_t=(char *)($next_t->tasks.next) - $tasks_off
+ end
+end
+document bttnobp
+ dump all thread stack traces on a kernel compiled with !CONFIG_FRAME_POINTER
+end
+
+define btt
+ set $tasks_off=((size_t)&((struct task_struct *)0)->tasks)
+ set $pid_off=((size_t)&((struct task_struct *)0)->pids[1].pid_list.next)
+ set $init_t=&init_task
+ set $next_t=(((char *)($init_t->tasks).next) - $tasks_off)
+ while ($next_t != $init_t)
+ set $next_t=(struct task_struct *)$next_t
+ printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm
+ printf "===================\n"
+ set var $stackp = $next_t.thread.esp
+ set var $stack_top = ($stackp & ~4095) + 4096
+ set var $stack_bot = ($stackp & ~4095)
+
+ set $stackp = *($stackp)
+ while (($stackp < $stack_top) && ($stackp > $stack_bot))
+ set var $addr = *($stackp + 4)
+ info symbol $addr
+ set $stackp = *($stackp)
+ end
+
+ set $next_th=(((char *)$next_t->pids[1].pid_list.next) - $pid_off)
+ while ($next_th != $next_t)
+ set $next_th=(struct task_struct *)$next_th
+ printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm
+ printf "===================\n"
+ set var $stackp = $next_t.thread.esp
+ set var $stack_top = ($stackp & ~4095) + 4096
+ set var $stack_bot = ($stackp & ~4095)
+
+ set $stackp = *($stackp)
+ while (($stackp < $stack_top) && ($stackp > $stack_bot))
+ set var $addr = *($stackp + 4)
+ info symbol $addr
+ set $stackp = *($stackp)
+ end
+ set $next_th=(((char *)$next_th->pids[1].pid_list.next) - $pid_off)
+ end
+ set $next_t=(char *)($next_t->tasks.next) - $tasks_off
+ end
+end
+document btt
+ dump all thread stack traces on a kernel compiled with CONFIG_FRAME_POINTER
+end
+
+define btpid
+ set var $pid = $arg0
+ set $tasks_off=((size_t)&((struct task_struct *)0)->tasks)
+ set $pid_off=((size_t)&((struct task_struct *)0)->pids[1].pid_list.next)
+ set $init_t=&init_task
+ set $next_t=(((char *)($init_t->tasks).next) - $tasks_off)
+ set var $pid_task = 0
+
+ while ($next_t != $init_t)
+ set $next_t=(struct task_struct *)$next_t
+
+ if ($next_t.pid == $pid)
+ set $pid_task = $next_t
+ end
+
+ set $next_th=(((char *)$next_t->pids[1].pid_list.next) - $pid_off)
+ while ($next_th != $next_t)
+ set $next_th=(struct task_struct *)$next_th
+ if ($next_th.pid == $pid)
+ set $pid_task = $next_th
+ end
+ set $next_th=(((char *)$next_th->pids[1].pid_list.next) - $pid_off)
+ end
+ set $next_t=(char *)($next_t->tasks.next) - $tasks_off
+ end
+
+ printf "\npid %d; comm %s:\n", $pid_task.pid, $pid_task.comm
+ printf "===================\n"
+ set var $stackp = $pid_task.thread.esp
+ set var $stack_top = ($stackp & ~4095) + 4096
+ set var $stack_bot = ($stackp & ~4095)
+
+ set $stackp = *($stackp)
+ while (($stackp < $stack_top) && ($stackp > $stack_bot))
+ set var $addr = *($stackp + 4)
+ info symbol $addr
+ set $stackp = *($stackp)
+ end
+end
+document btpid
+ backtrace of pid
+end
+
+
+define trapinfo
+ set var $pid = $arg0
+ set $tasks_off=((size_t)&((struct task_struct *)0)->tasks)
+ set $pid_off=((size_t)&((struct task_struct *)0)->pids[1].pid_list.next)
+ set $init_t=&init_task
+ set $next_t=(((char *)($init_t->tasks).next) - $tasks_off)
+ set var $pid_task = 0
+
+ while ($next_t != $init_t)
+ set $next_t=(struct task_struct *)$next_t
+
+ if ($next_t.pid == $pid)
+ set $pid_task = $next_t
+ end
+
+ set $next_th=(((char *)$next_t->pids[1].pid_list.next) - $pid_off)
+ while ($next_th != $next_t)
+ set $next_th=(struct task_struct *)$next_th
+ if ($next_th.pid == $pid)
+ set $pid_task = $next_th
+ end
+ set $next_th=(((char *)$next_th->pids[1].pid_list.next) - $pid_off)
+ end
+ set $next_t=(char *)($next_t->tasks.next) - $tasks_off
+ end
+
+ printf "Trapno %ld, cr2 0x%lx, error_code %ld\n", $pid_task.thread.trap_no, \
+ $pid_task.thread.cr2, $pid_task.thread.error_code
+
+end
+document trapinfo
+ Run info threads and lookup pid of thread #1
+ 'trapinfo <pid>' will tell you by which trap & possibly
+ address the kernel panicked.
+end
+
+
+define dmesg
+ set $i = 0
+ set $end_idx = (log_end - 1) & (log_buf_len - 1)
+
+ while ($i < logged_chars)
+ set $idx = (log_end - 1 - logged_chars + $i) & (log_buf_len - 1)
+
+ if ($idx + 100 <= $end_idx) || \
+ ($end_idx <= $idx && $idx + 100 < log_buf_len)
+ printf "%.100s", &log_buf[$idx]
+ set $i = $i + 100
+ else
+ printf "%c", log_buf[$idx]
+ set $i = $i + 1
+ end
+ end
+end
+document dmesg
+ print the kernel ring buffer
+end
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
new file mode 100644
index 0000000..3f4bc84
--- /dev/null
+++ b/Documentation/kdump/kdump.txt
@@ -0,0 +1,444 @@
+================================================================
+Documentation for Kdump - The kexec-based Crash Dumping Solution
+================================================================
+
+This document includes overview, setup and installation, and analysis
+information.
+
+Overview
+========
+
+Kdump uses kexec to quickly boot to a dump-capture kernel whenever a
+dump of the system kernel's memory needs to be taken (for example, when
+the system panics). The system kernel's memory image is preserved across
+the reboot and is accessible to the dump-capture kernel.
+
+You can use common commands, such as cp and scp, to copy the
+memory image to a dump file on the local disk, or across the network to
+a remote system.
+
+Kdump and kexec are currently supported on the x86, x86_64, ppc64 and ia64
+architectures.
+
+When the system kernel boots, it reserves a small section of memory for
+the dump-capture kernel. This ensures that ongoing Direct Memory Access
+(DMA) from the system kernel does not corrupt the dump-capture kernel.
+The kexec -p command loads the dump-capture kernel into this reserved
+memory.
+
+On x86 machines, the first 640 KB of physical memory is needed to boot,
+regardless of where the kernel loads. Therefore, kexec backs up this
+region just before rebooting into the dump-capture kernel.
+
+Similarly on PPC64 machines first 32KB of physical memory is needed for
+booting regardless of where the kernel is loaded and to support 64K page
+size kexec backs up the first 64KB memory.
+
+All of the necessary information about the system kernel's core image is
+encoded in the ELF format, and stored in a reserved area of memory
+before a crash. The physical address of the start of the ELF header is
+passed to the dump-capture kernel through the elfcorehdr= boot
+parameter.
+
+With the dump-capture kernel, you can access the memory image, or "old
+memory," in two ways:
+
+- Through a /dev/oldmem device interface. A capture utility can read the
+ device file and write out the memory in raw format. This is a raw dump
+ of memory. Analysis and capture tools must be intelligent enough to
+ determine where to look for the right information.
+
+- Through /proc/vmcore. This exports the dump as an ELF-format file that
+ you can write out using file copy commands such as cp or scp. Further,
+ you can use analysis tools such as the GNU Debugger (GDB) and the Crash
+ tool to debug the dump file. This method ensures that the dump pages are
+ correctly ordered.
+
+
+Setup and Installation
+======================
+
+Install kexec-tools
+-------------------
+
+1) Login as the root user.
+
+2) Download the kexec-tools user-space package from the following URL:
+
+http://www.kernel.org/pub/linux/kernel/people/horms/kexec-tools/kexec-tools.tar.gz
+
+This is a symlink to the latest version.
+
+The latest kexec-tools git tree is available at:
+
+git://git.kernel.org/pub/scm/linux/kernel/git/horms/kexec-tools.git
+or
+http://www.kernel.org/git/?p=linux/kernel/git/horms/kexec-tools.git
+
+More information about kexec-tools can be found at
+http://www.kernel.org/pub/linux/kernel/people/horms/kexec-tools/README.html
+
+3) Unpack the tarball with the tar command, as follows:
+
+ tar xvpzf kexec-tools.tar.gz
+
+4) Change to the kexec-tools directory, as follows:
+
+ cd kexec-tools-VERSION
+
+5) Configure the package, as follows:
+
+ ./configure
+
+6) Compile the package, as follows:
+
+ make
+
+7) Install the package, as follows:
+
+ make install
+
+
+Build the system and dump-capture kernels
+-----------------------------------------
+There are two possible methods of using Kdump.
+
+1) Build a separate custom dump-capture kernel for capturing the
+ kernel core dump.
+
+2) Or use the system kernel binary itself as dump-capture kernel and there is
+ no need to build a separate dump-capture kernel. This is possible
+ only with the architecutres which support a relocatable kernel. As
+ of today, i386, x86_64, ppc64 and ia64 architectures support relocatable
+ kernel.
+
+Building a relocatable kernel is advantageous from the point of view that
+one does not have to build a second kernel for capturing the dump. But
+at the same time one might want to build a custom dump capture kernel
+suitable to his needs.
+
+Following are the configuration setting required for system and
+dump-capture kernels for enabling kdump support.
+
+System kernel config options
+----------------------------
+
+1) Enable "kexec system call" in "Processor type and features."
+
+ CONFIG_KEXEC=y
+
+2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo
+ filesystems." This is usually enabled by default.
+
+ CONFIG_SYSFS=y
+
+ Note that "sysfs file system support" might not appear in the "Pseudo
+ filesystems" menu if "Configure standard kernel features (for small
+ systems)" is not enabled in "General Setup." In this case, check the
+ .config file itself to ensure that sysfs is turned on, as follows:
+
+ grep 'CONFIG_SYSFS' .config
+
+3) Enable "Compile the kernel with debug info" in "Kernel hacking."
+
+ CONFIG_DEBUG_INFO=Y
+
+ This causes the kernel to be built with debug symbols. The dump
+ analysis tools require a vmlinux with debug symbols in order to read
+ and analyze a dump file.
+
+Dump-capture kernel config options (Arch Independent)
+-----------------------------------------------------
+
+1) Enable "kernel crash dumps" support under "Processor type and
+ features":
+
+ CONFIG_CRASH_DUMP=y
+
+2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems".
+
+ CONFIG_PROC_VMCORE=y
+ (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.)
+
+Dump-capture kernel config options (Arch Dependent, i386 and x86_64)
+--------------------------------------------------------------------
+
+1) On i386, enable high memory support under "Processor type and
+ features":
+
+ CONFIG_HIGHMEM64G=y
+ or
+ CONFIG_HIGHMEM4G
+
+2) On i386 and x86_64, disable symmetric multi-processing support
+ under "Processor type and features":
+
+ CONFIG_SMP=n
+
+ (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line
+ when loading the dump-capture kernel, see section "Load the Dump-capture
+ Kernel".)
+
+3) If one wants to build and use a relocatable kernel,
+ Enable "Build a relocatable kernel" support under "Processor type and
+ features"
+
+ CONFIG_RELOCATABLE=y
+
+4) Use a suitable value for "Physical address where the kernel is
+ loaded" (under "Processor type and features"). This only appears when
+ "kernel crash dumps" is enabled. A suitable value depends upon
+ whether kernel is relocatable or not.
+
+ If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000
+ This will compile the kernel for physical address 1MB, but given the fact
+ kernel is relocatable, it can be run from any physical address hence
+ kexec boot loader will load it in memory region reserved for dump-capture
+ kernel.
+
+ Otherwise it should be the start of memory region reserved for
+ second kernel using boot parameter "crashkernel=Y@X". Here X is
+ start of memory region reserved for dump-capture kernel.
+ Generally X is 16MB (0x1000000). So you can set
+ CONFIG_PHYSICAL_START=0x1000000
+
+5) Make and install the kernel and its modules. DO NOT add this kernel
+ to the boot loader configuration files.
+
+Dump-capture kernel config options (Arch Dependent, ppc64)
+----------------------------------------------------------
+
+1) Enable "Build a kdump crash kernel" support under "Kernel" options:
+
+ CONFIG_CRASH_DUMP=y
+
+2) Enable "Build a relocatable kernel" support
+
+ CONFIG_RELOCATABLE=y
+
+ Make and install the kernel and its modules.
+
+Dump-capture kernel config options (Arch Dependent, ia64)
+----------------------------------------------------------
+
+- No specific options are required to create a dump-capture kernel
+ for ia64, other than those specified in the arch idependent section
+ above. This means that it is possible to use the system kernel
+ as a dump-capture kernel if desired.
+
+ The crashkernel region can be automatically placed by the system
+ kernel at run time. This is done by specifying the base address as 0,
+ or omitting it all together.
+
+ crashkernel=256M@0
+ or
+ crashkernel=256M
+
+ If the start address is specified, note that the start address of the
+ kernel will be aligned to 64Mb, so if the start address is not then
+ any space below the alignment point will be wasted.
+
+
+Extended crashkernel syntax
+===========================
+
+While the "crashkernel=size[@offset]" syntax is sufficient for most
+configurations, sometimes it's handy to have the reserved memory dependent
+on the value of System RAM -- that's mostly for distributors that pre-setup
+the kernel command line to avoid a unbootable system after some memory has
+been removed from the machine.
+
+The syntax is:
+
+ crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
+ range=start-[end]
+
+ 'start' is inclusive and 'end' is exclusive.
+
+For example:
+
+ crashkernel=512M-2G:64M,2G-:128M
+
+This would mean:
+
+ 1) if the RAM is smaller than 512M, then don't reserve anything
+ (this is the "rescue" case)
+ 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M
+ 3) if the RAM size is larger than 2G, then reserve 128M
+
+
+
+Boot into System Kernel
+=======================
+
+1) Update the boot loader (such as grub, yaboot, or lilo) configuration
+ files as necessary.
+
+2) Boot the system kernel with the boot parameter "crashkernel=Y@X",
+ where Y specifies how much memory to reserve for the dump-capture kernel
+ and X specifies the beginning of this reserved memory. For example,
+ "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory
+ starting at physical address 0x01000000 (16MB) for the dump-capture kernel.
+
+ On x86 and x86_64, use "crashkernel=64M@16M".
+
+ On ppc64, use "crashkernel=128M@32M".
+
+ On ia64, 256M@256M is a generous value that typically works.
+ The region may be automatically placed on ia64, see the
+ dump-capture kernel config option notes above.
+
+Load the Dump-capture Kernel
+============================
+
+After booting to the system kernel, dump-capture kernel needs to be
+loaded.
+
+Based on the architecture and type of image (relocatable or not), one
+can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz
+of dump-capture kernel. Following is the summary.
+
+For i386 and x86_64:
+ - Use vmlinux if kernel is not relocatable.
+ - Use bzImage/vmlinuz if kernel is relocatable.
+For ppc64:
+ - Use vmlinux
+For ia64:
+ - Use vmlinux or vmlinuz.gz
+
+
+If you are using a uncompressed vmlinux image then use following command
+to load dump-capture kernel.
+
+ kexec -p <dump-capture-kernel-vmlinux-image> \
+ --initrd=<initrd-for-dump-capture-kernel> --args-linux \
+ --append="root=<root-dev> <arch-specific-options>"
+
+If you are using a compressed bzImage/vmlinuz, then use following command
+to load dump-capture kernel.
+
+ kexec -p <dump-capture-kernel-bzImage> \
+ --initrd=<initrd-for-dump-capture-kernel> \
+ --append="root=<root-dev> <arch-specific-options>"
+
+Please note, that --args-linux does not need to be specified for ia64.
+It is planned to make this a no-op on that architecture, but for now
+it should be omitted
+
+Following are the arch specific command line options to be used while
+loading dump-capture kernel.
+
+For i386, x86_64 and ia64:
+ "1 irqpoll maxcpus=1 reset_devices"
+
+For ppc64:
+ "1 maxcpus=1 noirqdistrib reset_devices"
+
+
+Notes on loading the dump-capture kernel:
+
+* By default, the ELF headers are stored in ELF64 format to support
+ systems with more than 4GB memory. On i386, kexec automatically checks if
+ the physical RAM size exceeds the 4 GB limit and if not, uses ELF32.
+ So, on non-PAE systems, ELF32 is always used.
+
+ The --elf32-core-headers option can be used to force the generation of ELF32
+ headers. This is necessary because GDB currently cannot open vmcore files
+ with ELF64 headers on 32-bit systems.
+
+* The "irqpoll" boot parameter reduces driver initialization failures
+ due to shared interrupts in the dump-capture kernel.
+
+* You must specify <root-dev> in the format corresponding to the root
+ device name in the output of mount command.
+
+* Boot parameter "1" boots the dump-capture kernel into single-user
+ mode without networking. If you want networking, use "3".
+
+* We generally don' have to bring up a SMP kernel just to capture the
+ dump. Hence generally it is useful either to build a UP dump-capture
+ kernel or specify maxcpus=1 option while loading dump-capture kernel.
+
+Kernel Panic
+============
+
+After successfully loading the dump-capture kernel as previously
+described, the system will reboot into the dump-capture kernel if a
+system crash is triggered. Trigger points are located in panic(),
+die(), die_nmi() and in the sysrq handler (ALT-SysRq-c).
+
+The following conditions will execute a crash trigger point:
+
+If a hard lockup is detected and "NMI watchdog" is configured, the system
+will boot into the dump-capture kernel ( die_nmi() ).
+
+If die() is called, and it happens to be a thread with pid 0 or 1, or die()
+is called inside interrupt context or die() is called and panic_on_oops is set,
+the system will boot into the dump-capture kernel.
+
+On powerpc systems when a soft-reset is generated, die() is called by all cpus
+and the system will boot into the dump-capture kernel.
+
+For testing purposes, you can trigger a crash by using "ALT-SysRq-c",
+"echo c > /proc/sysrq-trigger" or write a module to force the panic.
+
+Write Out the Dump File
+=======================
+
+After the dump-capture kernel is booted, write out the dump file with
+the following command:
+
+ cp /proc/vmcore <dump-file>
+
+You can also access dumped memory as a /dev/oldmem device for a linear
+and raw view. To create the device, use the following command:
+
+ mknod /dev/oldmem c 1 12
+
+Use the dd command with suitable options for count, bs, and skip to
+access specific portions of the dump.
+
+To see the entire memory, use the following command:
+
+ dd if=/dev/oldmem of=oldmem.001
+
+
+Analysis
+========
+
+Before analyzing the dump image, you should reboot into a stable kernel.
+
+You can do limited analysis using GDB on the dump file copied out of
+/proc/vmcore. Use the debug vmlinux built with -g and run the following
+command:
+
+ gdb vmlinux <dump-file>
+
+Stack trace for the task on processor 0, register display, and memory
+display work fine.
+
+Note: GDB cannot analyze core files generated in ELF64 format for x86.
+On systems with a maximum of 4GB of memory, you can generate
+ELF32-format headers using the --elf32-core-headers kernel option on the
+dump kernel.
+
+You can also use the Crash utility to analyze dump files in Kdump
+format. Crash is available on Dave Anderson's site at the following URL:
+
+ http://people.redhat.com/~anderson/
+
+
+To Do
+=====
+
+1) Provide relocatable kernels for all architectures to help in maintaining
+ multiple kernels for crash_dump, and the same kernel as the system kernel
+ can be used to capture the dump.
+
+
+Contact
+=======
+
+Vivek Goyal (vgoyal@in.ibm.com)
+Maneesh Soni (maneesh@in.ibm.com)
+
diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt
new file mode 100644
index 0000000..c6841ee
--- /dev/null
+++ b/Documentation/kernel-doc-nano-HOWTO.txt
@@ -0,0 +1,307 @@
+kernel-doc nano-HOWTO
+=====================
+
+How to format kernel-doc comments
+---------------------------------
+
+In order to provide embedded, 'C' friendly, easy to maintain,
+but consistent and extractable documentation of the functions and
+data structures in the Linux kernel, the Linux kernel has adopted
+a consistent style for documenting functions and their parameters,
+and structures and their members.
+
+The format for this documentation is called the kernel-doc format.
+It is documented in this Documentation/kernel-doc-nano-HOWTO.txt file.
+
+This style embeds the documentation within the source files, using
+a few simple conventions. The scripts/kernel-doc perl script, some
+SGML templates in Documentation/DocBook, and other tools understand
+these conventions, and are used to extract this embedded documentation
+into various documents.
+
+In order to provide good documentation of kernel functions and data
+structures, please use the following conventions to format your
+kernel-doc comments in Linux kernel source.
+
+We definitely need kernel-doc formatted documentation for functions
+that are exported to loadable modules using EXPORT_SYMBOL.
+
+We also look to provide kernel-doc formatted documentation for
+functions externally visible to other kernel files (not marked
+"static").
+
+We also recommend providing kernel-doc formatted documentation
+for private (file "static") routines, for consistency of kernel
+source code layout. But this is lower priority and at the
+discretion of the MAINTAINER of that kernel source file.
+
+Data structures visible in kernel include files should also be
+documented using kernel-doc formatted comments.
+
+The opening comment mark "/**" is reserved for kernel-doc comments.
+Only comments so marked will be considered by the kernel-doc scripts,
+and any comment so marked must be in kernel-doc format. Do not use
+"/**" to be begin a comment block unless the comment block contains
+kernel-doc formatted comments. The closing comment marker for
+kernel-doc comments can be either "*/" or "**/".
+
+Kernel-doc comments should be placed just before the function
+or data structure being described.
+
+Example kernel-doc function comment:
+
+/**
+ * foobar() - short function description of foobar
+ * @arg1: Describe the first argument to foobar.
+ * @arg2: Describe the second argument to foobar.
+ * One can provide multiple line descriptions
+ * for arguments.
+ *
+ * A longer description, with more discussion of the function foobar()
+ * that might be useful to those using or modifying it. Begins with
+ * empty comment line, and may include additional embedded empty
+ * comment lines.
+ *
+ * The longer description can have multiple paragraphs.
+ **/
+
+The first line, with the short description, must be on a single line.
+
+The @argument descriptions must begin on the very next line following
+this opening short function description line, with no intervening
+empty comment lines.
+
+Example kernel-doc data structure comment.
+
+/**
+ * struct blah - the basic blah structure
+ * @mem1: describe the first member of struct blah
+ * @mem2: describe the second member of struct blah,
+ * perhaps with more lines and words.
+ *
+ * Longer description of this structure.
+ **/
+
+The kernel-doc function comments describe each parameter to the
+function, in order, with the @name lines.
+
+The kernel-doc data structure comments describe each structure member
+in the data structure, with the @name lines.
+
+The longer description formatting is "reflowed", losing your line
+breaks. So presenting carefully formatted lists within these
+descriptions won't work so well; derived documentation will lose
+the formatting.
+
+See the section below "How to add extractable documentation to your
+source files" for more details and notes on how to format kernel-doc
+comments.
+
+Components of the kernel-doc system
+-----------------------------------
+
+Many places in the source tree have extractable documentation in the
+form of block comments above functions. The components of this system
+are:
+
+- scripts/kernel-doc
+
+ This is a perl script that hunts for the block comments and can mark
+ them up directly into DocBook, man, text, and HTML. (No, not
+ texinfo.)
+
+- Documentation/DocBook/*.tmpl
+
+ These are SGML template files, which are normal SGML files with
+ special place-holders for where the extracted documentation should
+ go.
+
+- scripts/basic/docproc.c
+
+ This is a program for converting SGML template files into SGML
+ files. When a file is referenced it is searched for symbols
+ exported (EXPORT_SYMBOL), to be able to distinguish between internal
+ and external functions.
+ It invokes kernel-doc, giving it the list of functions that
+ are to be documented.
+ Additionally it is used to scan the SGML template files to locate
+ all the files referenced herein. This is used to generate dependency
+ information as used by make.
+
+- Makefile
+
+ The targets 'sgmldocs', 'psdocs', 'pdfdocs', and 'htmldocs' are used
+ to build DocBook files, PostScript files, PDF files, and html files
+ in Documentation/DocBook.
+
+- Documentation/DocBook/Makefile
+
+ This is where C files are associated with SGML templates.
+
+
+How to extract the documentation
+--------------------------------
+
+If you just want to read the ready-made books on the various
+subsystems (see Documentation/DocBook/*.tmpl), just type 'make
+psdocs', or 'make pdfdocs', or 'make htmldocs', depending on your
+preference. If you would rather read a different format, you can type
+'make sgmldocs' and then use DocBook tools to convert
+Documentation/DocBook/*.sgml to a format of your choice (for example,
+'db2html ...' if 'make htmldocs' was not defined).
+
+If you want to see man pages instead, you can do this:
+
+$ cd linux
+$ scripts/kernel-doc -man $(find -name '*.c') | split-man.pl /tmp/man
+$ scripts/kernel-doc -man $(find -name '*.h') | split-man.pl /tmp/man
+
+Here is split-man.pl:
+
+-->
+#!/usr/bin/perl
+
+if ($#ARGV < 0) {
+ die "where do I put the results?\n";
+}
+
+mkdir $ARGV[0],0777;
+$state = 0;
+while (<STDIN>) {
+ if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
+ if ($state == 1) { close OUT }
+ $state = 1;
+ $fn = "$ARGV[0]/$1.9";
+ print STDERR "Creating $fn\n";
+ open OUT, ">$fn" or die "can't open $fn: $!\n";
+ print OUT $_;
+ } elsif ($state != 0) {
+ print OUT $_;
+ }
+}
+
+close OUT;
+<--
+
+If you just want to view the documentation for one function in one
+file, you can do this:
+
+$ scripts/kernel-doc -man -function fn file | nroff -man | less
+
+or this:
+
+$ scripts/kernel-doc -text -function fn file
+
+
+How to add extractable documentation to your source files
+---------------------------------------------------------
+
+The format of the block comment is like this:
+
+/**
+ * function_name(:)? (- short description)?
+(* @parameterx(space)*: (description of parameter x)?)*
+(* a blank line)?
+ * (Description:)? (Description of function)?
+ * (section header: (section description)? )*
+(*)?*/
+
+The short function description ***cannot be multiline***, but the other
+descriptions can be (and they can contain blank lines). If you continue
+that initial short description onto a second line, that second line will
+appear further down at the beginning of the description section, which is
+almost certainly not what you had in mind.
+
+Avoid putting a spurious blank line after the function name, or else the
+description will be repeated!
+
+All descriptive text is further processed, scanning for the following special
+patterns, which are highlighted appropriately.
+
+'funcname()' - function
+'$ENVVAR' - environment variable
+'&struct_name' - name of a structure (up to two words including 'struct')
+'@parameter' - name of a parameter
+'%CONST' - name of a constant.
+
+NOTE 1: The multi-line descriptive text you provide does *not* recognize
+line breaks, so if you try to format some text nicely, as in:
+
+ Return codes
+ 0 - cool
+ 1 - invalid arg
+ 2 - out of memory
+
+this will all run together and produce:
+
+ Return codes 0 - cool 1 - invalid arg 2 - out of memory
+
+NOTE 2: If the descriptive text you provide has lines that begin with
+some phrase followed by a colon, each of those phrases will be taken as
+a new section heading, which means you should similarly try to avoid text
+like:
+
+ Return codes:
+ 0: cool
+ 1: invalid arg
+ 2: out of memory
+
+every line of which would start a new section. Again, probably not
+what you were after.
+
+Take a look around the source tree for examples.
+
+
+kernel-doc for structs, unions, enums, and typedefs
+---------------------------------------------------
+
+Beside functions you can also write documentation for structs, unions,
+enums and typedefs. Instead of the function name you must write the name
+of the declaration; the struct/union/enum/typedef must always precede
+the name. Nesting of declarations is not supported.
+Use the argument mechanism to document members or constants.
+
+Inside a struct description, you can use the "private:" and "public:"
+comment tags. Structure fields that are inside a "private:" area
+are not listed in the generated output documentation.
+
+Example:
+
+/**
+ * struct my_struct - short description
+ * @a: first member
+ * @b: second member
+ *
+ * Longer description
+ */
+struct my_struct {
+ int a;
+ int b;
+/* private: */
+ int c;
+};
+
+
+How to make new SGML template files
+-----------------------------------
+
+SGML template files (*.tmpl) are like normal SGML files, except that
+they can contain escape sequences where extracted documentation should
+be inserted.
+
+!E<filename> is replaced by the documentation, in <filename>, for
+functions that are exported using EXPORT_SYMBOL: the function list is
+collected from files listed in Documentation/DocBook/Makefile.
+
+!I<filename> is replaced by the documentation for functions that are
+_not_ exported using EXPORT_SYMBOL.
+
+!D<filename> is used to name additional files to search for functions
+exported using EXPORT_SYMBOL.
+
+!F<filename> <function [functions...]> is replaced by the
+documentation, in <filename>, for the functions listed.
+
+
+Tim.
+*/ <twaugh@redhat.com>
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
new file mode 100644
index 0000000..28cdc2a
--- /dev/null
+++ b/Documentation/kernel-docs.txt
@@ -0,0 +1,761 @@
+
+ Index of Documentation for People Interested in Writing and/or
+
+ Understanding the Linux Kernel.
+
+ Juan-Mariano de Goyeneche <jmseyas@dit.upm.es>
+
+/*
+ * The latest version of this document may be found at:
+ * http://www.dit.upm.es/~jmseyas/linux/kernel/hackers-docs.html
+ */
+
+ The need for a document like this one became apparent in the
+ linux-kernel mailing list as the same questions, asking for pointers
+ to information, appeared again and again.
+
+ Fortunately, as more and more people get to GNU/Linux, more and more
+ get interested in the Kernel. But reading the sources is not always
+ enough. It is easy to understand the code, but miss the concepts, the
+ philosophy and design decisions behind this code.
+
+ Unfortunately, not many documents are available for beginners to
+ start. And, even if they exist, there was no "well-known" place which
+ kept track of them. These lines try to cover this lack. All documents
+ available on line known by the author are listed, while some reference
+ books are also mentioned.
+
+ PLEASE, if you know any paper not listed here or write a new document,
+ send me an e-mail, and I'll include a reference to it here. Any
+ corrections, ideas or comments are also welcomed.
+
+ The papers that follow are listed in no particular order. All are
+ cataloged with the following fields: the document's "Title", the
+ "Author"/s, the "URL" where they can be found, some "Keywords" helpful
+ when searching for specific topics, and a brief "Description" of the
+ Document.
+
+ Enjoy!
+
+ ON-LINE DOCS:
+
+ * Title: "Linux Device Drivers, Third Edition"
+ Author: Jonathan Corbet, Alessandro Rubini, Greg Kroah-Hartman
+ URL: http://lwn.net/Kernel/LDD3/
+ Description: A 600-page book covering the (2.6.10) driver
+ programming API and kernel hacking in general. Available under the
+ Creative Commons Attribution-ShareAlike 2.0 license.
+
+ * Title: "The Linux Kernel"
+ Author: David A. Rusling.
+ URL: http://www.tldp.org/LDP/tlk/tlk.html
+ Keywords: everything!, book.
+ Description: On line, 200 pages book describing most aspects of
+ the Linux Kernel. Probably, the first reference for beginners.
+ Lots of illustrations explaining data structures use and
+ relationships in the purest Richard W. Stevens' style. Contents:
+ "1.-Hardware Basics, 2.-Software Basics, 3.-Memory Management,
+ 4.-Processes, 5.-Interprocess Communication Mechanisms, 6.-PCI,
+ 7.-Interrupts and Interrupt Handling, 8.-Device Drivers, 9.-The
+ File system, 10.-Networks, 11.-Kernel Mechanisms, 12.-Modules,
+ 13.-The Linux Kernel Sources, A.-Linux Data Structures, B.-The
+ Alpha AXP Processor, C.-Useful Web and FTP Sites, D.-The GNU
+ General Public License, Glossary". In short: a must have.
+
+ * Title: "Linux Device Drivers, 2nd Edition"
+ Author: Alessandro Rubini and Jonathan Corbet.
+ URL: http://www.xml.com/ldd/chapter/book/index.html
+ Keywords: device drivers, modules, debugging, memory, hardware,
+ interrupt handling, char drivers, block drivers, kmod, mmap, DMA,
+ buses.
+ Description: O'Reilly's popular book, now also on-line under the
+ GNU Free Documentation License.
+ Notes: You can also buy it in paper-form from O'Reilly. See below
+ under BOOKS (Not on-line).
+
+ * Title: "Conceptual Architecture of the Linux Kernel"
+ Author: Ivan T. Bowman.
+ URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html
+ Keywords: conceptual software architecture, extracted design,
+ reverse engineering, system structure.
+ Description: Conceptual software architecture of the Linux kernel,
+ automatically extracted from the source code. Very detailed. Good
+ figures. Gives good overall kernel understanding.
+
+ * Title: "Concrete Architecture of the Linux Kernel"
+ Author: Ivan T. Bowman, Saheem Siddiqi, and Meyer C. Tanuan.
+ URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a2.html
+ Keywords: concrete architecture, extracted design, reverse
+ engineering, system structure, dependencies.
+ Description: Concrete architecture of the Linux kernel,
+ automatically extracted from the source code. Very detailed. Good
+ figures. Gives good overall kernel understanding. This papers
+ focus on lower details than its predecessor (files, variables...).
+
+ * Title: "Linux as a Case Study: Its Extracted Software
+ Architecture"
+ Author: Ivan T. Bowman, Richard C. Holt and Neil V. Brewster.
+ URL: http://plg.uwaterloo.ca/~itbowman/papers/linuxcase.html
+ Keywords: software architecture, architecture recovery,
+ redocumentation.
+ Description: Paper appeared at ICSE'99, Los Angeles, May 16-22,
+ 1999. A mixture of the previous two documents from the same
+ author.
+
+ * Title: "Overview of the Virtual File System"
+ Author: Richard Gooch.
+ URL: http://www.atnf.csiro.au/~rgooch/linux/vfs.txt
+ Keywords: VFS, File System, mounting filesystems, opening files,
+ dentries, dcache.
+ Description: Brief introduction to the Linux Virtual File System.
+ What is it, how it works, operations taken when opening a file or
+ mounting a file system and description of important data
+ structures explaining the purpose of each of their entries.
+
+ * Title: "The Linux RAID-1, 4, 5 Code"
+ Author: Ingo Molnar, Gadi Oxman and Miguel de Icaza.
+ URL: http://www.linuxjournal.com/article.php?sid=2391
+ Keywords: RAID, MD driver.
+ Description: Linux Journal Kernel Korner article. Here is it's
+ abstract: "A description of the implementation of the RAID-1,
+ RAID-4 and RAID-5 personalities of the MD device driver in the
+ Linux kernel, providing users with high performance and reliable,
+ secondary-storage capability using software".
+
+ * Title: "Dynamic Kernels: Modularized Device Drivers"
+ Author: Alessandro Rubini.
+ URL: http://www.linuxjournal.com/article.php?sid=1219
+ Keywords: device driver, module, loading/unloading modules,
+ allocating resources.
+ Description: Linux Journal Kernel Korner article. Here is it's
+ abstract: "This is the first of a series of four articles
+ co-authored by Alessandro Rubini and Georg Zezchwitz which present
+ a practical approach to writing Linux device drivers as kernel
+ loadable modules. This installment presents an introduction to the
+ topic, preparing the reader to understand next month's
+ installment".
+
+ * Title: "Dynamic Kernels: Discovery"
+ Author: Alessandro Rubini.
+ URL: http://www.linuxjournal.com/article.php?sid=1220
+ Keywords: character driver, init_module, clean_up module,
+ autodetection, mayor number, minor number, file operations,
+ open(), close().
+ Description: Linux Journal Kernel Korner article. Here is it's
+ abstract: "This article, the second of four, introduces part of
+ the actual code to create custom module implementing a character
+ device driver. It describes the code for module initialization and
+ cleanup, as well as the open() and close() system calls".
+
+ * Title: "The Devil's in the Details"
+ Author: Georg v. Zezschwitz and Alessandro Rubini.
+ URL: http://www.linuxjournal.com/article.php?sid=1221
+ Keywords: read(), write(), select(), ioctl(), blocking/non
+ blocking mode, interrupt handler.
+ Description: Linux Journal Kernel Korner article. Here is it's
+ abstract: "This article, the third of four on writing character
+ device drivers, introduces concepts of reading, writing, and using
+ ioctl-calls".
+
+ * Title: "Dissecting Interrupts and Browsing DMA"
+ Author: Alessandro Rubini and Georg v. Zezschwitz.
+ URL: http://www.linuxjournal.com/article.php?sid=1222
+ Keywords: interrupts, irqs, DMA, bottom halves, task queues.
+ Description: Linux Journal Kernel Korner article. Here is it's
+ abstract: "This is the fourth in a series of articles about
+ writing character device drivers as loadable kernel modules. This
+ month, we further investigate the field of interrupt handling.
+ Though it is conceptually simple, practical limitations and
+ constraints make this an ``interesting'' part of device driver
+ writing, and several different facilities have been provided for
+ different situations. We also investigate the complex topic of
+ DMA".
+
+ * Title: "Device Drivers Concluded"
+ Author: Georg v. Zezschwitz.
+ URL: http://www.linuxjournal.com/article.php?sid=1287
+ Keywords: address spaces, pages, pagination, page management,
+ demand loading, swapping, memory protection, memory mapping, mmap,
+ virtual memory areas (VMAs), vremap, PCI.
+ Description: Finally, the above turned out into a five articles
+ series. This latest one's introduction reads: "This is the last of
+ five articles about character device drivers. In this final
+ section, Georg deals with memory mapping devices, beginning with
+ an overall description of the Linux memory management concepts".
+
+ * Title: "Network Buffers And Memory Management"
+ Author: Alan Cox.
+ URL: http://www.linuxjournal.com/article.php?sid=1312
+ Keywords: sk_buffs, network devices, protocol/link layer
+ variables, network devices flags, transmit, receive,
+ configuration, multicast.
+ Description: Linux Journal Kernel Korner. Here is the abstract:
+ "Writing a network device driver for Linux is fundamentally
+ simple---most of the complexity (other than talking to the
+ hardware) involves managing network packets in memory".
+
+ * Title: "Writing Linux Device Drivers"
+ Author: Michael K. Johnson.
+ URL: http://users.evitech.fi/~tk/rtos/writing_linux_device_d.html
+ Keywords: files, VFS, file operations, kernel interface, character
+ vs block devices, I/O access, hardware interrupts, DMA, access to
+ user memory, memory allocation, timers.
+ Description: Introductory 50-minutes (sic) tutorial on writing
+ device drivers. 12 pages written by the same author of the "Kernel
+ Hackers' Guide" which give a very good overview of the topic.
+
+ * Title: "The Venus kernel interface"
+ Author: Peter J. Braam.
+ URL:
+ http://www.coda.cs.cmu.edu/doc/html/kernel-venus-protocol.html
+ Keywords: coda, filesystem, venus, cache manager.
+ Description: "This document describes the communication between
+ Venus and kernel level file system code needed for the operation
+ of the Coda filesystem. This version document is meant to describe
+ the current interface (version 1.0) as well as improvements we
+ envisage".
+
+ * Title: "Programming PCI-Devices under Linux"
+ Author: Claus Schroeter.
+ URL:
+ ftp://ftp.llp.fu-berlin.de/pub/linux/LINUX-LAB/whitepapers/pcip.ps.gz
+ Keywords: PCI, device, busmastering.
+ Description: 6 pages tutorial on PCI programming under Linux.
+ Gives the basic concepts on the architecture of the PCI subsystem,
+ as long as basic functions and macros to read/write the devices
+ and perform busmastering.
+
+ * Title: "Writing Character Device Driver for Linux"
+ Author: R. Baruch and C. Schroeter.
+ URL:
+ ftp://ftp.llp.fu-berlin.de/pub/linux/LINUX-LAB/whitepapers/drivers.ps.gz
+ Keywords: character device drivers, I/O, signals, DMA, accessing
+ ports in user space, kernel environment.
+ Description: 68 pages paper on writing character drivers. A little
+ bit old (1.993, 1.994) although still useful.
+
+ * Title: "Design and Implementation of the Second Extended
+ Filesystem"
+ Author: Rémy Card, Theodore Ts'o, Stephen Tweedie.
+ URL: http://web.mit.edu/tytso/www/linux/ext2intro.html
+ Keywords: ext2, linux fs history, inode, directory, link, devices,
+ VFS, physical structure, performance, benchmarks, ext2fs library,
+ ext2fs tools, e2fsck.
+ Description: Paper written by three of the top ext2 hackers.
+ Covers Linux filesystems history, ext2 motivation, ext2 features,
+ design, physical structure on disk, performance, benchmarks,
+ e2fsck's passes description... A must read!
+ Notes: This paper was first published in the Proceedings of the
+ First Dutch International Symposium on Linux, ISBN 90-367-0385-9.
+
+ * Title: "Analysis of the Ext2fs structure"
+ Author: Louis-Dominique Dubeau.
+ URL: http://www.nondot.org/sabre/os/files/FileSystems/ext2fs/
+ Keywords: ext2, filesystem, ext2fs.
+ Description: Description of ext2's blocks, directories, inodes,
+ bitmaps, invariants...
+
+ * Title: "Journaling the Linux ext2fs Filesystem"
+ Author: Stephen C. Tweedie.
+ URL:
+ ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz
+ Keywords: ext3, journaling.
+ Description: Excellent 8-pages paper explaining the journaling
+ capabilities added to ext2 by the author, showing different
+ problems faced and the alternatives chosen.
+
+ * Title: "Kernel API changes from 2.0 to 2.2"
+ Author: Richard Gooch.
+ URL:
+ http://www.atnf.csiro.au/~rgooch/linux/docs/porting-to-2.2.html
+ Keywords: 2.2, changes.
+ Description: Kernel functions/structures/variables which changed
+ from 2.0.x to 2.2.x.
+
+ * Title: "Kernel API changes from 2.2 to 2.4"
+ Author: Richard Gooch.
+ URL:
+ http://www.atnf.csiro.au/~rgooch/linux/docs/porting-to-2.4.html
+ Keywords: 2.4, changes.
+ Description: Kernel functions/structures/variables which changed
+ from 2.2.x to 2.4.x.
+
+ * Title: "Linux Kernel Module Programming Guide"
+ Author: Ori Pomerantz.
+ URL: http://tldp.org/LDP/lkmpg/2.6/html/index.html
+ Keywords: modules, GPL book, /proc, ioctls, system calls,
+ interrupt handlers .
+ Description: Very nice 92 pages GPL book on the topic of modules
+ programming. Lots of examples.
+
+ * Title: "I/O Event Handling Under Linux"
+ Author: Richard Gooch.
+ URL: http://www.atnf.csiro.au/~rgooch/linux/docs/io-events.html
+ Keywords: IO, I/O, select(2), poll(2), FDs, aio_read(2), readiness
+ event queues.
+ Description: From the Introduction: "I/O Event handling is about
+ how your Operating System allows you to manage a large number of
+ open files (file descriptors in UNIX/POSIX, or FDs) in your
+ application. You want the OS to notify you when FDs become active
+ (have data ready to be read or are ready for writing). Ideally you
+ want a mechanism that is scalable. This means a large number of
+ inactive FDs cost very little in memory and CPU time to manage".
+
+ * Title: "The Kernel Hacking HOWTO"
+ Author: Various Talented People, and Rusty.
+ Location: in kernel tree, Documentation/DocBook/kernel-hacking/
+ (must be built as "make {htmldocs | psdocs | pdfdocs})
+ Keywords: HOWTO, kernel contexts, deadlock, locking, modules,
+ symbols, return conventions.
+ Description: From the Introduction: "Please understand that I
+ never wanted to write this document, being grossly underqualified,
+ but I always wanted to read it, and this was the only way. I
+ simply explain some best practices, and give reading entry-points
+ into the kernel sources. I avoid implementation details: that's
+ what the code is for, and I ignore whole tracts of useful
+ routines. This document assumes familiarity with C, and an
+ understanding of what the kernel is, and how it is used. It was
+ originally written for the 2.3 kernels, but nearly all of it
+ applies to 2.2 too; 2.0 is slightly different".
+
+ * Title: "Writing an ALSA Driver"
+ Author: Takashi Iwai <tiwai@suse.de>
+ URL: http://www.alsa-project.org/~iwai/writing-an-alsa-driver/index.html
+ Keywords: ALSA, sound, soundcard, driver, lowlevel, hardware.
+ Description: Advanced Linux Sound Architecture for developers,
+ both at kernel and user-level sides. ALSA is the Linux kernel
+ sound architecture in the 2.6 kernel version.
+
+ * Title: "Programming Guide for Linux USB Device Drivers"
+ Author: Detlef Fliegl.
+ URL: http://usb.in.tum.de/usbdoc/
+ Keywords: USB, universal serial bus.
+ Description: A must-read. From the Preface: "This document should
+ give detailed information about the current state of the USB
+ subsystem and its API for USB device drivers. The first section
+ will deal with the basics of USB devices. You will learn about
+ different types of devices and their properties. Going into detail
+ you will see how USB devices communicate on the bus. The second
+ section gives an overview of the Linux USB subsystem [2] and the
+ device driver framework. Then the API and its data structures will
+ be explained step by step. The last section of this document
+ contains a reference of all API calls and their return codes".
+ Notes: Beware: the main page states: "This document may not be
+ published, printed or used in excerpts without explicit permission
+ of the author". Fortunately, it may still be read...
+
+ * Title: "Linux Kernel Mailing List Glossary"
+ Author: various
+ URL: http://kernelnewbies.org/glossary/
+ Keywords: glossary, terms, linux-kernel.
+ Description: From the introduction: "This glossary is intended as
+ a brief description of some of the acronyms and terms you may hear
+ during discussion of the Linux kernel".
+
+ * Title: "Linux Kernel Locking HOWTO"
+ Author: Various Talented People, and Rusty.
+ Location: in kernel tree, Documentation/DocBook/kernel-locking/
+ (must be built as "make {htmldocs | psdocs | pdfdocs})
+ Keywords: locks, locking, spinlock, semaphore, atomic, race
+ condition, bottom halves, tasklets, softirqs.
+ Description: The title says it all: document describing the
+ locking system in the Linux Kernel either in uniprocessor or SMP
+ systems.
+ Notes: "It was originally written for the later (>2.3.47) 2.3
+ kernels, but most of it applies to 2.2 too; 2.0 is slightly
+ different". Freely redistributable under the conditions of the GNU
+ General Public License.
+
+ * Title: "Global spinlock list and usage"
+ Author: Rick Lindsley.
+ URL: http://lse.sourceforge.net/lockhier/global-spin-lock
+ Keywords: spinlock.
+ Description: This is an attempt to document both the existence and
+ usage of the spinlocks in the Linux 2.4.5 kernel. Comprehensive
+ list of spinlocks showing when they are used, which functions
+ access them, how each lock is acquired, under what conditions it
+ is held, whether interrupts can occur or not while it is held...
+
+ * Title: "Porting Linux 2.0 Drivers To Linux 2.2: Changes and New
+ Features "
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/1999-05/gear_01.html
+ Keywords: ports, porting.
+ Description: Article from Linux Magazine on porting from 2.0 to
+ 2.2 kernels.
+
+ * Title: "Porting Device Drivers To Linux 2.2: part II"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/1999-06/gear_01.html
+ Keywords: ports, porting.
+ Description: Second part on porting from 2.0 to 2.2 kernels.
+
+ * Title: "How To Make Sure Your Driver Will Work On The Power
+ Macintosh"
+ Author: Paul Mackerras.
+ URL: http://www.linux-mag.com/1999-07/gear_01.html
+ Keywords: Mac, Power Macintosh, porting, drivers, compatibility.
+ Description: The title says it all.
+
+ * Title: "An Introduction to SCSI Drivers"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/1999-08/gear_01.html
+ Keywords: SCSI, device, driver.
+ Description: The title says it all.
+
+ * Title: "Advanced SCSI Drivers And Other Tales"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/1999-09/gear_01.html
+ Keywords: SCSI, device, driver, advanced.
+ Description: The title says it all.
+
+ * Title: "Writing Linux Mouse Drivers"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/1999-10/gear_01.html
+ Keywords: mouse, driver, gpm.
+ Description: The title says it all.
+
+ * Title: "More on Mouse Drivers"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/1999-11/gear_01.html
+ Keywords: mouse, driver, gpm, races, asynchronous I/O.
+ Description: The title still says it all.
+
+ * Title: "Writing Video4linux Radio Driver"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/1999-12/gear_01.html
+ Keywords: video4linux, driver, radio, radio devices.
+ Description: The title says it all.
+
+ * Title: "Video4linux Drivers, Part 1: Video-Capture Device"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/2000-01/gear_01.html
+ Keywords: video4linux, driver, video capture, capture devices,
+ camera driver.
+ Description: The title says it all.
+
+ * Title: "Video4linux Drivers, Part 2: Video-capture Devices"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/2000-02/gear_01.html
+ Keywords: video4linux, driver, video capture, capture devices,
+ camera driver, control, query capabilities, capability, facility.
+ Description: The title says it all.
+
+ * Title: "PCI Management in Linux 2.2"
+ Author: Alan Cox.
+ URL: http://www.linux-mag.com/2000-03/gear_01.html
+ Keywords: PCI, bus, bus-mastering.
+ Description: The title says it all.
+
+ * Title: "Linux 2.4 Kernel Internals"
+ Author: Tigran Aivazian and Christoph Hellwig.
+ URL: http://www.moses.uklinux.net/patches/lki.html
+ Keywords: Linux, kernel, booting, SMB boot, VFS, page cache.
+ Description: A little book used for a short training course.
+ Covers building the kernel image, booting (including SMP bootup),
+ process management, VFS and more.
+
+ * Title: "Linux IP Networking. A Guide to the Implementation and
+ Modification of the Linux Protocol Stack."
+ Author: Glenn Herrin.
+ URL: http://www.cs.unh.edu/cnrg/gherrin
+ Keywords: network, networking, protocol, IP, UDP, TCP, connection,
+ socket, receiving, transmitting, forwarding, routing, packets,
+ modules, /proc, sk_buff, FIB, tags.
+ Description: Excellent paper devoted to the Linux IP Networking,
+ explaining anything from the kernel's to the user space
+ configuration tools' code. Very good to get a general overview of
+ the kernel networking implementation and understand all steps
+ packets follow from the time they are received at the network
+ device till they are delivered to applications. The studied kernel
+ code is from 2.2.14 version. Provides code for a working packet
+ dropper example.
+
+ * Title: "Get those boards talking under Linux."
+ Author: Alex Ivchenko.
+ URL: http://www.edn.com/article/CA46968.html
+ Keywords: data-acquisition boards, drivers, modules, interrupts,
+ memory allocation.
+ Description: Article written for people wishing to make their data
+ acquisition boards work on their GNU/Linux machines. Gives a basic
+ overview on writing drivers, from the naming of functions to
+ interrupt handling.
+ Notes: Two-parts article. Part II is at
+ URL: http://www.edn.com/article/CA46998.html
+
+ * Title: "Linux PCMCIA Programmer's Guide"
+ Author: David Hinds.
+ URL: http://pcmcia-cs.sourceforge.net/ftp/doc/PCMCIA-PROG.html
+ Keywords: PCMCIA.
+ Description: "This document describes how to write kernel device
+ drivers for the Linux PCMCIA Card Services interface. It also
+ describes how to write user-mode utilities for communicating with
+ Card Services.
+
+ * Title: "The Linux Kernel NFSD Implementation"
+ Author: Neil Brown.
+ URL:
+ http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/nfsd.html
+ Keywords: knfsd, nfsd, NFS, RPC, lockd, mountd, statd.
+ Description: The title says it all.
+ Notes: Covers knfsd's version 1.4.7 (patch against 2.2.7 kernel).
+
+ * Title: "A Linux vm README"
+ Author: Kanoj Sarcar.
+ URL: http://reality.sgi.com/kanoj_engr/vm229.html
+ Keywords: virtual memory, mm, pgd, vma, page, page flags, page
+ cache, swap cache, kswapd.
+ Description: Telegraphic, short descriptions and definitions
+ relating the Linux virtual memory implementation.
+
+ * Title: "(nearly) Complete Linux Loadable Kernel Modules. The
+ definitive guide for hackers, virus coders and system
+ administrators."
+ Author: pragmatic/THC.
+ URL: http://packetstormsecurity.org/docs/hack/LKM_HACKING.html
+ Keywords: syscalls, intercept, hide, abuse, symbol table.
+ Description: Interesting paper on how to abuse the Linux kernel in
+ order to intercept and modify syscalls, make
+ files/directories/processes invisible, become root, hijack ttys,
+ write kernel modules based virus... and solutions for admins to
+ avoid all those abuses.
+ Notes: For 2.0.x kernels. Gives guidances to port it to 2.2.x
+ kernels.
+
+ BOOKS: (Not on-line)
+
+ * Title: "Linux Device Drivers"
+ Author: Alessandro Rubini.
+ Publisher: O'Reilly & Associates.
+ Date: 1998.
+ Pages: 439.
+ ISBN: 1-56592-292-1
+
+ * Title: "Linux Device Drivers, 2nd Edition"
+ Author: Alessandro Rubini and Jonathan Corbet.
+ Publisher: O'Reilly & Associates.
+ Date: 2001.
+ Pages: 586.
+ ISBN: 0-59600-008-1
+ Notes: Further information in
+ http://www.oreilly.com/catalog/linuxdrive2/
+
+ * Title: "Linux Device Drivers, 3nd Edition"
+ Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman
+ Publisher: O'Reilly & Associates.
+ Date: 2005.
+ Pages: 636.
+ ISBN: 0-596-00590-3
+ Notes: Further information in
+ http://www.oreilly.com/catalog/linuxdrive3/
+ PDF format, URL: http://lwn.net/Kernel/LDD3/
+
+ * Title: "Linux Kernel Internals"
+ Author: Michael Beck.
+ Publisher: Addison-Wesley.
+ Date: 1997.
+ ISBN: 0-201-33143-8 (second edition)
+
+ * Title: "The Design of the UNIX Operating System"
+ Author: Maurice J. Bach.
+ Publisher: Prentice Hall.
+ Date: 1986.
+ Pages: 471.
+ ISBN: 0-13-201757-1
+
+ * Title: "The Design and Implementation of the 4.3 BSD UNIX
+ Operating System"
+ Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
+ Karels, John S. Quarterman.
+ Publisher: Addison-Wesley.
+ Date: 1989 (reprinted with corrections on October, 1990).
+ ISBN: 0-201-06196-1
+
+ * Title: "The Design and Implementation of the 4.4 BSD UNIX
+ Operating System"
+ Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
+ John S. Quarterman.
+ Publisher: Addison-Wesley.
+ Date: 1996.
+ ISBN: 0-201-54979-4
+
+ * Title: "Programmation Linux 2.0 API systeme et fonctionnement du
+ noyau"
+ Author: Remy Card, Eric Dumas, Franck Mevel.
+ Publisher: Eyrolles.
+ Date: 1997.
+ Pages: 520.
+ ISBN: 2-212-08932-5
+ Notes: French.
+
+ * Title: "Unix internals -- the new frontiers"
+ Author: Uresh Vahalia.
+ Publisher: Prentice Hall.
+ Date: 1996.
+ Pages: 600.
+ ISBN: 0-13-101908-2
+
+ * Title: "The Design and Implementation of the 4.4 BSD UNIX
+ Operating System"
+ Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
+ John S. Quarterman.
+ Publisher: Addison-Wesley.
+ Date: 1996.
+ ISBN: 0-201-54979-4
+
+ * Title: "Programming for the real world - POSIX.4"
+ Author: Bill O. Gallmeister.
+ Publisher: O'Reilly & Associates, Inc..
+ Date: 1995.
+ Pages: ???.
+ ISBN: I-56592-074-0
+ Notes: Though not being directly about Linux, Linux aims to be
+ POSIX. Good reference.
+
+ * Title: "UNIX Systems for Modern Architectures: Symmetric
+ Multiprocesssing and Caching for Kernel Programmers"
+ Author: Curt Schimmel.
+ Publisher: Addison Wesley.
+ Date: June, 1994.
+ Pages: 432.
+ ISBN: 0-201-63338-8
+
+ * Title: "The Design and Implementation of the 4.3 BSD UNIX
+ Operating System"
+ Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
+ Karels, John S. Quarterman.
+ Publisher: Addison-Wesley.
+ Date: 1989 (reprinted with corrections on October, 1990).
+ ISBN: 0-201-06196-1
+
+ * Title: "The Design of the UNIX Operating System"
+ Author: Maurice J. Bach.
+ Publisher: Prentice Hall.
+ Date: 1986.
+ Pages: 471.
+ ISBN: 0-13-201757-1
+
+ MISCELLANEOUS:
+
+ * Name: linux/Documentation
+ Author: Many.
+ URL: Just look inside your kernel sources.
+ Keywords: anything, DocBook.
+ Description: Documentation that comes with the kernel sources,
+ inside the Documentation directory. Some pages from this document
+ (including this document itself) have been moved there, and might
+ be more up to date than the web version.
+
+ * Name: "Linux Source Driver"
+ URL: http://lsd.linux.cz
+ Keywords: Browsing source code.
+ Description: "Linux Source Driver (LSD) is an application, which
+ can make browsing source codes of Linux kernel easier than you can
+ imagine. You can select between multiple versions of kernel (e.g.
+ 0.01, 1.0.0, 2.0.33, 2.0.34pre13, 2.0.0, 2.1.101 etc.). With LSD
+ you can search Linux kernel (fulltext, macros, types, functions
+ and variables) and LSD can generate patches for you on the fly
+ (files, directories or kernel)".
+
+ * Name: "Linux Kernel Source Reference"
+ Author: Thomas Graichen.
+ URL: http://innominate.org/~graichen/projects/lksr/
+ Keywords: CVS, web, cvsweb, browsing source code.
+ Description: Web interface to a CVS server with the kernel
+ sources. "Here you can have a look at any file of the Linux kernel
+ sources of any version starting from 1.0 up to the (daily updated)
+ current version available. Also you can check the differences
+ between two versions of a file".
+
+ * Name: "Cross-Referencing Linux"
+ URL: http://lxr.linux.no/source/
+ Keywords: Browsing source code.
+ Description: Another web-based Linux kernel source code browser.
+ Lots of cross references to variables and functions. You can see
+ where they are defined and where they are used.
+
+ * Name: "Linux Weekly News"
+ URL: http://lwn.net
+ Keywords: latest kernel news.
+ Description: The title says it all. There's a fixed kernel section
+ summarizing developers' work, bug fixes, new features and versions
+ produced during the week. Published every Thursday.
+
+ * Name: "Kernel Traffic"
+ URL: http://kt.zork.net/kernel-traffic/
+ Keywords: linux-kernel mailing list, weekly kernel news.
+ Description: Weekly newsletter covering the most relevant
+ discussions of the linux-kernel mailing list.
+
+ * Name: "CuTTiNG.eDGe.LiNuX"
+ URL: http://edge.kernelnotes.org
+ Keywords: changelist.
+ Description: Site which provides the changelist for every kernel
+ release. What's new, what's better, what's changed. Myrdraal reads
+ the patches and describes them. Pointers to the patches are there,
+ too.
+
+ * Name: "New linux-kernel Mailing List FAQ"
+ URL: http://www.tux.org/lkml/
+ Keywords: linux-kernel mailing list FAQ.
+ Description: linux-kernel is a mailing list for developers to
+ communicate. This FAQ builds on the previous linux-kernel mailing
+ list FAQ maintained by Frohwalt Egerer, who no longer maintains
+ it. Read it to see how to join the mailing list. Dozens of
+ interesting questions regarding the list, Linux, developers (who
+ is ...?), terms (what is...?) are answered here too. Just read it.
+
+ * Name: "Linux Virtual File System"
+ Author: Peter J. Braam.
+ URL: http://www.coda.cs.cmu.edu/doc/talks/linuxvfs/
+ Keywords: slides, VFS, inode, superblock, dentry, dcache.
+ Description: Set of slides, presumably from a presentation on the
+ Linux VFS layer. Covers version 2.1.x, with dentries and the
+ dcache.
+
+ * Name: "Gary's Encyclopedia - The Linux Kernel"
+ Author: Gary (I suppose...).
+ URL: http://slencyclopedia.berlios.de/index.html
+ Keywords: linux, community, everything!
+ Description: Gary's Encyclopedia exists to allow the rapid finding
+ of documentation and other information of interest to GNU/Linux
+ users. It has about 4000 links to external pages in 150 major
+ categories. This link is for kernel-specific links, documents,
+ sites... This list is now hosted by developer.Berlios.de,
+ but seems not to have been updated since sometime in 1999.
+
+ * Name: "The home page of Linux-MM"
+ Author: The Linux-MM team.
+ URL: http://linux-mm.org/
+ Keywords: memory management, Linux-MM, mm patches, TODO, docs,
+ mailing list.
+ Description: Site devoted to Linux Memory Management development.
+ Memory related patches, HOWTOs, links, mm developers... Don't miss
+ it if you are interested in memory management development!
+
+ * Name: "Kernel Newbies IRC Channel"
+ URL: http://www.kernelnewbies.org
+ Keywords: IRC, newbies, channel, asking doubts.
+ Description: #kernelnewbies on irc.openprojects.net. From the web
+ page: "#kernelnewbies is an IRC network dedicated to the 'newbie'
+ kernel hacker. The audience mostly consists of people who are
+ learning about the kernel, working on kernel projects or
+ professional kernel hackers that want to help less seasoned kernel
+ people. [...] #kernelnewbies is on the Open Projects IRC Network,
+ try irc.openprojects.net or irc.<country>.openprojects.net as your
+ server and then /join #kernelnewbies". It also hosts articles,
+ documents, FAQs...
+
+ * Name: "linux-kernel mailing list archives and search engines"
+ URL: http://vger.kernel.org/vger-lists.html
+ URL: http://www.uwsg.indiana.edu/hypermail/linux/kernel/index.html
+ URL: http://marc.theaimsgroup.com/?l=linux-kernel
+ URL: http://groups.google.com/group/mlist.linux.kernel
+ URL: http://www.cs.helsinki.fi/linux/linux-kernel/
+ URL: http://www.lib.uaa.alaska.edu/linux-kernel/
+ Keywords: linux-kernel, archives, search.
+ Description: Some of the linux-kernel mailing list archivers. If
+ you have a better/another one, please let me know.
+ _________________________________________________________________
+
+ Document last updated on Sat 2005-NOV-19
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
new file mode 100644
index 0000000..c9115c1
--- /dev/null
+++ b/Documentation/kernel-parameters.txt
@@ -0,0 +1,2385 @@
+ Kernel Parameters
+ ~~~~~~~~~~~~~~~~~
+
+The following is a consolidated list of the kernel parameters as implemented
+(mostly) by the __setup() macro and sorted into English Dictionary order
+(defined as ignoring all punctuation and sorting digits before letters in a
+case insensitive manner), and with descriptions where known.
+
+Module parameters for loadable modules are specified only as the
+parameter name with optional '=' and value as appropriate, such as:
+
+ modprobe usbcore blinkenlights=1
+
+Module parameters for modules that are built into the kernel image
+are specified on the kernel command line with the module name plus
+'.' plus parameter name, with '=' and value if appropriate, such as:
+
+ usbcore.blinkenlights=1
+
+This document may not be entirely up to date and comprehensive. The command
+"modinfo -p ${modulename}" shows a current list of all parameters of a loadable
+module. Loadable modules, after being loaded into the running kernel, also
+reveal their parameters in /sys/module/${modulename}/parameters/. Some of these
+parameters may be changed at runtime by the command
+"echo -n ${value} > /sys/module/${modulename}/parameters/${parm}".
+
+The parameters listed below are only valid if certain kernel build options were
+enabled and if respective hardware is present. The text in square brackets at
+the beginning of each description states the restrictions within which a
+parameter is applicable:
+
+ ACPI ACPI support is enabled.
+ AGP AGP (Accelerated Graphics Port) is enabled.
+ ALSA ALSA sound support is enabled.
+ APIC APIC support is enabled.
+ APM Advanced Power Management support is enabled.
+ AVR32 AVR32 architecture is enabled.
+ AX25 Appropriate AX.25 support is enabled.
+ BLACKFIN Blackfin architecture is enabled.
+ DRM Direct Rendering Management support is enabled.
+ EDD BIOS Enhanced Disk Drive Services (EDD) is enabled
+ EFI EFI Partitioning (GPT) is enabled
+ EIDE EIDE/ATAPI support is enabled.
+ FB The frame buffer device is enabled.
+ HW Appropriate hardware is enabled.
+ IA-64 IA-64 architecture is enabled.
+ IOSCHED More than one I/O scheduler is enabled.
+ IP_PNP IP DHCP, BOOTP, or RARP is enabled.
+ ISAPNP ISA PnP code is enabled.
+ ISDN Appropriate ISDN support is enabled.
+ JOY Appropriate joystick support is enabled.
+ LIBATA Libata driver is enabled
+ LP Printer support is enabled.
+ LOOP Loopback device support is enabled.
+ M68k M68k architecture is enabled.
+ These options have more detailed description inside of
+ Documentation/m68k/kernel-options.txt.
+ MCA MCA bus support is enabled.
+ MDA MDA console support is enabled.
+ MOUSE Appropriate mouse support is enabled.
+ MSI Message Signaled Interrupts (PCI).
+ MTD MTD (Memory Technology Device) support is enabled.
+ NET Appropriate network support is enabled.
+ NUMA NUMA support is enabled.
+ GENERIC_TIME The generic timeofday code is enabled.
+ NFS Appropriate NFS support is enabled.
+ OSS OSS sound support is enabled.
+ PV_OPS A paravirtualized kernel is enabled.
+ PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
+ PARISC The PA-RISC architecture is enabled.
+ PCI PCI bus support is enabled.
+ PCIE PCI Express support is enabled.
+ PCMCIA The PCMCIA subsystem is enabled.
+ PNP Plug & Play support is enabled.
+ PPC PowerPC architecture is enabled.
+ PPT Parallel port support is enabled.
+ PS2 Appropriate PS/2 support is enabled.
+ RAM RAM disk support is enabled.
+ ROOTPLUG The example Root Plug LSM is enabled.
+ S390 S390 architecture is enabled.
+ SCSI Appropriate SCSI support is enabled.
+ A lot of drivers has their options described inside of
+ Documentation/scsi/.
+ SECURITY Different security models are enabled.
+ SELINUX SELinux support is enabled.
+ SERIAL Serial support is enabled.
+ SH SuperH architecture is enabled.
+ SMP The kernel is an SMP kernel.
+ SPARC Sparc architecture is enabled.
+ SWSUSP Software suspend (hibernation) is enabled.
+ SUSPEND System suspend states are enabled.
+ TS Appropriate touchscreen support is enabled.
+ USB USB support is enabled.
+ USBHID USB Human Interface Device support is enabled.
+ V4L Video For Linux support is enabled.
+ VGA The VGA console has been enabled.
+ VT Virtual terminal support is enabled.
+ WDT Watchdog support is enabled.
+ XT IBM PC/XT MFM hard disk support is enabled.
+ X86-32 X86-32, aka i386 architecture is enabled.
+ X86-64 X86-64 architecture is enabled.
+ More X86-64 boot options can be found in
+ Documentation/x86/x86_64/boot-options.txt .
+ X86 Either 32bit or 64bit x86 (same as X86-32+X86-64)
+
+In addition, the following text indicates that the option:
+
+ BUGS= Relates to possible processor bugs on the said processor.
+ KNL Is a kernel start-up parameter.
+ BOOT Is a boot loader parameter.
+
+Parameters denoted with BOOT are actually interpreted by the boot
+loader, and have no meaning to the kernel directly.
+Do not modify the syntax of boot loader parameters without extreme
+need or coordination with <Documentation/x86/i386/boot.txt>.
+
+There are also arch-specific kernel-parameters not documented here.
+See for example <Documentation/x86/x86_64/boot-options.txt>.
+
+Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
+a trailing = on the name of any parameter states that that parameter will
+be entered as an environment variable, whereas its absence indicates that
+it will appear as a kernel argument readable via /proc/cmdline by programs
+running once the system is up.
+
+The number of kernel parameters is not limited, but the length of the
+complete command line (parameters including spaces etc.) is limited to
+a fixed number of characters. This limit depends on the architecture
+and is between 256 and 4096 characters. It is defined in the file
+./include/asm/setup.h as COMMAND_LINE_SIZE.
+
+
+ acpi= [HW,ACPI,X86-64,i386]
+ Advanced Configuration and Power Interface
+ Format: { force | off | ht | strict | noirq }
+ force -- enable ACPI if default was off
+ off -- disable ACPI if default was on
+ noirq -- do not use ACPI for IRQ routing
+ ht -- run only enough ACPI to enable Hyper Threading
+ strict -- Be less tolerant of platforms that are not
+ strictly ACPI specification compliant.
+
+ See also Documentation/power/pm.txt, pci=noacpi
+
+ acpi_apic_instance= [ACPI, IOAPIC]
+ Format: <int>
+ 2: use 2nd APIC table, if available
+ 1,0: use 1st APIC table
+ default: 0
+
+ acpi_sleep= [HW,ACPI] Sleep options
+ Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, old_ordering }
+ See Documentation/power/video.txt for s3_bios and s3_mode.
+ s3_beep is for debugging; it makes the PC's speaker beep
+ as soon as the kernel's real-mode entry point is called.
+ s4_nohwsig prevents ACPI hardware signature from being
+ used during resume from hibernation.
+ old_ordering causes the ACPI 1.0 ordering of the _PTS
+ control method, wrt putting devices into low power
+ states, to be enforced (the ACPI 2.0 ordering of _PTS is
+ used by default).
+
+ acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
+ Format: { level | edge | high | low }
+
+ acpi_irq_balance [HW,ACPI]
+ ACPI will balance active IRQs
+ default in APIC mode
+
+ acpi_irq_nobalance [HW,ACPI]
+ ACPI will not move active IRQs (default)
+ default in PIC mode
+
+ acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for
+ use by PCI
+ Format: <irq>,<irq>...
+
+ acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA
+ Format: <irq>,<irq>...
+
+ acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
+
+ acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
+ Format: To spoof as Windows 98: ="Microsoft Windows"
+
+ acpi_osi= [HW,ACPI] Modify list of supported OS interface strings
+ acpi_osi="string1" # add string1 -- only one string
+ acpi_osi="!string2" # remove built-in string2
+ acpi_osi= # disable all strings
+
+ acpi_serialize [HW,ACPI] force serialization of AML methods
+
+ acpi_skip_timer_override [HW,ACPI]
+ Recognize and ignore IRQ0/pin2 Interrupt Override.
+ For broken nForce2 BIOS resulting in XT-PIC timer.
+ acpi_use_timer_override [HW,ACPI}
+ Use timer override. For some broken Nvidia NF5 boards
+ that require a timer override, but don't have
+ HPET
+
+ acpi_backlight= [HW,ACPI]
+ acpi_backlight=vendor
+ acpi_backlight=video
+ If set to vendor, prefer vendor specific driver
+ (e.g. thinkpad_acpi, sony_acpi, etc.) instead
+ of the ACPI video.ko driver.
+
+ acpi_display_output= [HW,ACPI]
+ acpi_display_output=vendor
+ acpi_display_output=video
+ See above.
+
+ acpi.debug_layer= [HW,ACPI,ACPI_DEBUG]
+ acpi.debug_level= [HW,ACPI,ACPI_DEBUG]
+ Format: <int>
+ CONFIG_ACPI_DEBUG must be enabled to produce any ACPI
+ debug output. Bits in debug_layer correspond to a
+ _COMPONENT in an ACPI source file, e.g.,
+ #define _COMPONENT ACPI_PCI_COMPONENT
+ Bits in debug_level correspond to a level in
+ ACPI_DEBUG_PRINT statements, e.g.,
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO, ...
+ The debug_level mask defaults to "info". See
+ Documentation/acpi/debug.txt for more information about
+ debug layers and levels.
+
+ Enable processor driver info messages:
+ acpi.debug_layer=0x20000000
+ Enable PCI/PCI interrupt routing info messages:
+ acpi.debug_layer=0x400000
+ Enable AML "Debug" output, i.e., stores to the Debug
+ object while interpreting AML:
+ acpi.debug_layer=0xffffffff acpi.debug_level=0x2
+ Enable all messages related to ACPI hardware:
+ acpi.debug_layer=0x2 acpi.debug_level=0xffffffff
+
+ Some values produce so much output that the system is
+ unusable. The "log_buf_len" parameter may be useful
+ if you need to capture more output.
+
+ acpi.power_nocheck= [HW,ACPI]
+ Format: 1/0 enable/disable the check of power state.
+ On some bogus BIOS the _PSC object/_STA object of
+ power resource can't return the correct device power
+ state. In such case it is unneccessary to check its
+ power state again in power transition.
+ 1 : disable the power state check
+
+ acpi_pm_good [X86-32,X86-64]
+ Override the pmtimer bug detection: force the kernel
+ to assume that this machine's pmtimer latches its value
+ and always returns good values.
+
+ agp= [AGP]
+ { off | try_unsupported }
+ off: disable AGP support
+ try_unsupported: try to drive unsupported chipsets
+ (may crash computer or cause data corruption)
+
+ enable_timer_pin_1 [i386,x86-64]
+ Enable PIN 1 of APIC timer
+ Can be useful to work around chipset bugs
+ (in particular on some ATI chipsets).
+ The kernel tries to set a reasonable default.
+
+ disable_timer_pin_1 [i386,x86-64]
+ Disable PIN 1 of APIC timer
+ Can be useful to work around chipset bugs.
+
+ ad1848= [HW,OSS]
+ Format: <io>,<irq>,<dma>,<dma2>,<type>
+
+ advansys= [HW,SCSI]
+ See header of drivers/scsi/advansys.c.
+
+ advwdt= [HW,WDT] Advantech WDT
+ Format: <iostart>,<iostop>
+
+ aedsp16= [HW,OSS] Audio Excel DSP 16
+ Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
+ See also header of sound/oss/aedsp16.c.
+
+ aha152x= [HW,SCSI]
+ See Documentation/scsi/aha152x.txt.
+
+ aha1542= [HW,SCSI]
+ Format: <portbase>[,<buson>,<busoff>[,<dmaspeed>]]
+
+ aic7xxx= [HW,SCSI]
+ See Documentation/scsi/aic7xxx.txt.
+
+ aic79xx= [HW,SCSI]
+ See Documentation/scsi/aic79xx.txt.
+
+ amd_iommu= [HW,X86-84]
+ Pass parameters to the AMD IOMMU driver in the system.
+ Possible values are:
+ isolate - enable device isolation (each device, as far
+ as possible, will get its own protection
+ domain) [default]
+ share - put every device behind one IOMMU into the
+ same protection domain
+ fullflush - enable flushing of IO/TLB entries when
+ they are unmapped. Otherwise they are
+ flushed before they will be reused, which
+ is a lot of faster
+
+ amd_iommu_size= [HW,X86-64]
+ Define the size of the aperture for the AMD IOMMU
+ driver. Possible values are:
+ '32M', '64M' (default), '128M', '256M', '512M', '1G'
+
+ amijoy.map= [HW,JOY] Amiga joystick support
+ Map of devices attached to JOY0DAT and JOY1DAT
+ Format: <a>,<b>
+ See also Documentation/kernel/input/joystick.txt
+
+ analog.map= [HW,JOY] Analog joystick and gamepad support
+ Specifies type or capabilities of an analog joystick
+ connected to one of 16 gameports
+ Format: <type1>,<type2>,..<type16>
+
+ apc= [HW,SPARC]
+ Power management functions (SPARCstation-4/5 + deriv.)
+ Format: noidle
+ Disable APC CPU standby support. SPARCstation-Fox does
+ not play well with APC CPU idle - disable it if you have
+ APC and your system crashes randomly.
+
+ apic= [APIC,i386] Advanced Programmable Interrupt Controller
+ Change the output verbosity whilst booting
+ Format: { quiet (default) | verbose | debug }
+ Change the amount of debugging information output
+ when initialising the APIC and IO-APIC components.
+
+ apm= [APM] Advanced Power Management
+ See header of arch/x86/kernel/apm_32.c.
+
+ arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
+ Format: <io>,<irq>,<nodeID>
+
+ ataflop= [HW,M68k]
+
+ atarimouse= [HW,MOUSE] Atari Mouse
+
+ atascsi= [HW,SCSI] Atari SCSI
+
+ atkbd.extra= [HW] Enable extra LEDs and keys on IBM RapidAccess,
+ EzKey and similar keyboards
+
+ atkbd.reset= [HW] Reset keyboard during initialization
+
+ atkbd.set= [HW] Select keyboard code set
+ Format: <int> (2 = AT (default), 3 = PS/2)
+
+ atkbd.scroll= [HW] Enable scroll wheel on MS Office and similar
+ keyboards
+
+ atkbd.softraw= [HW] Choose between synthetic and real raw mode
+ Format: <bool> (0 = real, 1 = synthetic (default))
+
+ atkbd.softrepeat= [HW]
+ Use software keyboard repeat
+
+ autotest [IA64]
+
+ baycom_epp= [HW,AX25]
+ Format: <io>,<mode>
+
+ baycom_par= [HW,AX25] BayCom Parallel Port AX.25 Modem
+ Format: <io>,<mode>
+ See header of drivers/net/hamradio/baycom_par.c.
+
+ baycom_ser_fdx= [HW,AX25]
+ BayCom Serial Port AX.25 Modem (Full Duplex Mode)
+ Format: <io>,<irq>,<mode>[,<baud>]
+ See header of drivers/net/hamradio/baycom_ser_fdx.c.
+
+ baycom_ser_hdx= [HW,AX25]
+ BayCom Serial Port AX.25 Modem (Half Duplex Mode)
+ Format: <io>,<irq>,<mode>
+ See header of drivers/net/hamradio/baycom_ser_hdx.c.
+
+ boot_delay= Milliseconds to delay each printk during boot.
+ Values larger than 10 seconds (10000) are changed to
+ no delay (0).
+ Format: integer
+
+ bootmem_debug [KNL] Enable bootmem allocator debug messages.
+
+ bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards)
+ bttv.radio= Most important insmod options are available as
+ kernel args too.
+ bttv.pll= See Documentation/video4linux/bttv/Insmod-options
+ bttv.tuner= and Documentation/video4linux/bttv/CARDLIST
+
+ BusLogic= [HW,SCSI]
+ See drivers/scsi/BusLogic.c, comment before function
+ BusLogic_ParseDriverOptions().
+
+ c101= [NET] Moxa C101 synchronous serial card
+
+ cachesize= [BUGS=X86-32] Override level 2 CPU cache size detection.
+ Sometimes CPU hardware bugs make them report the cache
+ size incorrectly. The kernel will attempt work arounds
+ to fix known problems, but for some CPUs it is not
+ possible to determine what the correct size should be.
+ This option provides an override for these situations.
+
+ security= [SECURITY] Choose a security module to enable at boot.
+ If this boot parameter is not specified, only the first
+ security module asking for security registration will be
+ loaded. An invalid security module name will be treated
+ as if no module has been chosen.
+
+ capability.disable=
+ [SECURITY] Disable capabilities. This would normally
+ be used only if an alternative security model is to be
+ configured. Potentially dangerous and should only be
+ used if you are entirely sure of the consequences.
+
+ ccw_timeout_log [S390]
+ See Documentation/s390/CommonIO for details.
+
+ cgroup_disable= [KNL] Disable a particular controller
+ Format: {name of the controller(s) to disable}
+ {Currently supported controllers - "memory"}
+
+ checkreqprot [SELINUX] Set initial checkreqprot flag value.
+ Format: { "0" | "1" }
+ See security/selinux/Kconfig help text.
+ 0 -- check protection applied by kernel (includes
+ any implied execute protection).
+ 1 -- check protection requested by application.
+ Default value is set via a kernel config option.
+ Value can be changed at runtime via
+ /selinux/checkreqprot.
+
+ cio_ignore= [S390]
+ See Documentation/s390/CommonIO for details.
+
+ clock= [BUGS=X86-32, HW] gettimeofday clocksource override.
+ [Deprecated]
+ Forces specified clocksource (if available) to be used
+ when calculating gettimeofday(). If specified
+ clocksource is not available, it defaults to PIT.
+ Format: { pit | tsc | cyclone | pmtmr }
+
+ clocksource= [GENERIC_TIME] Override the default clocksource
+ Format: <string>
+ Override the default clocksource and use the clocksource
+ with the name specified.
+ Some clocksource names to choose from, depending on
+ the platform:
+ [all] jiffies (this is the base, fallback clocksource)
+ [ACPI] acpi_pm
+ [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
+ pxa_timer,timer3,32k_counter,timer0_1
+ [AVR32] avr32
+ [X86-32] pit,hpet,tsc,vmi-timer;
+ scx200_hrt on Geode; cyclone on IBM x440
+ [MIPS] MIPS
+ [PARISC] cr16
+ [S390] tod
+ [SH] SuperH
+ [SPARC64] tick
+ [X86-64] hpet,tsc
+
+ clearcpuid=BITNUM [X86]
+ Disable CPUID feature X for the kernel. See
+ include/asm-x86/cpufeature.h for the valid bit numbers.
+ Note the Linux specific bits are not necessarily
+ stable over kernel options, but the vendor specific
+ ones should be.
+ Also note that user programs calling CPUID directly
+ or using the feature without checking anything
+ will still see it. This just prevents it from
+ being used by the kernel or shown in /proc/cpuinfo.
+ Also note the kernel might malfunction if you disable
+ some critical bits.
+
+ code_bytes [IA32/X86_64] How many bytes of object code to print
+ in an oops report.
+ Range: 0 - 8192
+ Default: 64
+
+ hpet= [X86-32,HPET] option to control HPET usage
+ Format: { enable (default) | disable | force }
+ disable: disable HPET and use PIT instead
+ force: allow force enabled of undocumented chips (ICH4,
+ VIA, nVidia)
+
+ com20020= [HW,NET] ARCnet - COM20020 chipset
+ Format:
+ <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
+
+ com90io= [HW,NET] ARCnet - COM90xx chipset (IO-mapped buffers)
+ Format: <io>[,<irq>]
+
+ com90xx= [HW,NET]
+ ARCnet - COM90xx chipset (memory-mapped buffers)
+ Format: <io>[,<irq>[,<memstart>]]
+
+ condev= [HW,S390] console device
+ conmode=
+
+ console= [KNL] Output console device and options.
+
+ tty<n> Use the virtual console device <n>.
+
+ ttyS<n>[,options]
+ ttyUSB0[,options]
+ Use the specified serial port. The options are of
+ the form "bbbbpnf", where "bbbb" is the baud rate,
+ "p" is parity ("n", "o", or "e"), "n" is number of
+ bits, and "f" is flow control ("r" for RTS or
+ omit it). Default is "9600n8".
+
+ See Documentation/serial-console.txt for more
+ information. See
+ Documentation/networking/netconsole.txt for an
+ alternative.
+
+ uart[8250],io,<addr>[,options]
+ uart[8250],mmio,<addr>[,options]
+ Start an early, polled-mode console on the 8250/16550
+ UART at the specified I/O port or MMIO address,
+ switching to the matching ttyS device later. The
+ options are the same as for ttyS, above.
+
+ If the device connected to the port is not a TTY but a braille
+ device, prepend "brl," before the device type, for instance
+ console=brl,ttyS0
+ For now, only VisioBraille is supported.
+
+ earlycon= [KNL] Output early console device and options.
+ uart[8250],io,<addr>[,options]
+ uart[8250],mmio,<addr>[,options]
+ Start an early, polled-mode console on the 8250/16550
+ UART at the specified I/O port or MMIO address.
+ The options are the same as for ttyS, above.
+
+ no_console_suspend
+ [HW] Never suspend the console
+ Disable suspending of consoles during suspend and
+ hibernate operations. Once disabled, debugging
+ messages can reach various consoles while the rest
+ of the system is being put to sleep (ie, while
+ debugging driver suspend/resume hooks). This may
+ not work reliably with all consoles, but is known
+ to work with serial and VGA consoles.
+
+ cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
+ Format:
+ <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
+
+ crashkernel=nn[KMG]@ss[KMG]
+ [KNL] Reserve a chunk of physical memory to
+ hold a kernel to switch to with kexec on panic.
+
+ crashkernel=range1:size1[,range2:size2,...][@offset]
+ [KNL] Same as above, but depends on the memory
+ in the running system. The syntax of range is
+ start-[end] where start and end are both
+ a memory unit (amount[KMG]). See also
+ Documentation/kdump/kdump.txt for a example.
+
+ cs4232= [HW,OSS]
+ Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq>
+
+ cs89x0_dma= [HW,NET]
+ Format: <dma>
+
+ cs89x0_media= [HW,NET]
+ Format: { rj45 | aui | bnc }
+
+ dasd= [HW,NET]
+ See header of drivers/s390/block/dasd_devmap.c.
+
+ db9.dev[2|3]= [HW,JOY] Multisystem joystick support via parallel port
+ (one device per port)
+ Format: <port#>,<type>
+ See also Documentation/input/joystick-parport.txt
+
+ debug [KNL] Enable kernel debugging (events log level).
+
+ debug_locks_verbose=
+ [KNL] verbose self-tests
+ Format=<0|1>
+ Print debugging info while doing the locking API
+ self-tests.
+ We default to 0 (no extra messages), setting it to
+ 1 will print _a lot_ more information - normally
+ only useful to kernel developers.
+
+ debug_objects [KNL] Enable object debugging
+
+ debugpat [X86] Enable PAT debugging
+
+ decnet.addr= [HW,NET]
+ Format: <area>[,<node>]
+ See also Documentation/networking/decnet.txt.
+
+ vt.default_blu= [VT]
+ Format: <blue0>,<blue1>,<blue2>,...,<blue15>
+ Change the default blue palette of the console.
+ This is a 16-member array composed of values
+ ranging from 0-255.
+
+ vt.default_grn= [VT]
+ Format: <green0>,<green1>,<green2>,...,<green15>
+ Change the default green palette of the console.
+ This is a 16-member array composed of values
+ ranging from 0-255.
+
+ vt.default_red= [VT]
+ Format: <red0>,<red1>,<red2>,...,<red15>
+ Change the default red palette of the console.
+ This is a 16-member array composed of values
+ ranging from 0-255.
+
+ vt.default_utf8=
+ [VT]
+ Format=<0|1>
+ Set system-wide default UTF-8 mode for all tty's.
+ Default is 1, i.e. UTF-8 mode is enabled for all
+ newly opened terminals.
+
+ dhash_entries= [KNL]
+ Set number of hash buckets for dentry cache.
+
+ digi= [HW,SERIAL]
+ IO parameters + enable/disable command.
+
+ digiepca= [HW,SERIAL]
+ See drivers/char/README.epca and
+ Documentation/serial/digiepca.txt.
+
+ disable_mtrr_cleanup [X86]
+ enable_mtrr_cleanup [X86]
+ The kernel tries to adjust MTRR layout from continuous
+ to discrete, to make X server driver able to add WB
+ entry later. This parameter enables/disables that.
+
+ mtrr_chunk_size=nn[KMG] [X86]
+ used for mtrr cleanup. It is largest continous chunk
+ that could hold holes aka. UC entries.
+
+ mtrr_gran_size=nn[KMG] [X86]
+ Used for mtrr cleanup. It is granularity of mtrr block.
+ Default is 1.
+ Large value could prevent small alignment from
+ using up MTRRs.
+
+ mtrr_spare_reg_nr=n [X86]
+ Format: <integer>
+ Range: 0,7 : spare reg number
+ Default : 1
+ Used for mtrr cleanup. It is spare mtrr entries number.
+ Set to 2 or more if your graphical card needs more.
+
+ disable_mtrr_trim [X86, Intel and AMD only]
+ By default the kernel will trim any uncacheable
+ memory out of your available memory pool based on
+ MTRR settings. This parameter disables that behavior,
+ possibly causing your machine to run very slowly.
+
+ dmasound= [HW,OSS] Sound subsystem buffers
+
+ dscc4.setup= [NET]
+
+ dtc3181e= [HW,SCSI]
+
+ earlyprintk= [X86-32,X86-64,SH,BLACKFIN]
+ earlyprintk=vga
+ earlyprintk=serial[,ttySn[,baudrate]]
+ earlyprintk=dbgp
+
+ Append ",keep" to not disable it when the real console
+ takes over.
+
+ Only vga or serial or usb debug port at a time.
+
+ Currently only ttyS0 and ttyS1 are supported.
+
+ Interaction with the standard serial driver is not
+ very good.
+
+ The VGA output is eventually overwritten by the real
+ console.
+
+ eata= [HW,SCSI]
+
+ edd= [EDD]
+ Format: {"off" | "on" | "skip[mbr]"}
+
+ eisa_irq_edge= [PARISC,HW]
+ See header of drivers/parisc/eisa.c.
+
+ elanfreq= [X86-32]
+ See comment before function elanfreq_setup() in
+ arch/x86/kernel/cpu/cpufreq/elanfreq.c.
+
+ elevator= [IOSCHED]
+ Format: {"anticipatory" | "cfq" | "deadline" | "noop"}
+ See Documentation/block/as-iosched.txt and
+ Documentation/block/deadline-iosched.txt for details.
+
+ elfcorehdr= [IA64,PPC,SH,X86-32,X86_64]
+ Specifies physical address of start of kernel core
+ image elf header. Generally kexec loader will
+ pass this option to capture kernel.
+ See Documentation/kdump/kdump.txt for details.
+
+ enforcing [SELINUX] Set initial enforcing status.
+ Format: {"0" | "1"}
+ See security/selinux/Kconfig help text.
+ 0 -- permissive (log only, no denials).
+ 1 -- enforcing (deny and log).
+ Default value is 0.
+ Value can be changed at runtime via /selinux/enforce.
+
+ es1371= [HW,OSS]
+ Format: <spdif>,[<nomix>,[<amplifier>]]
+ See also header of sound/oss/es1371.c.
+
+ ether= [HW,NET] Ethernet cards parameters
+ This option is obsoleted by the "netdev=" option, which
+ has equivalent usage. See its documentation for details.
+
+ eurwdt= [HW,WDT] Eurotech CPU-1220/1410 onboard watchdog.
+ Format: <io>[,<irq>]
+
+ failslab=
+ fail_page_alloc=
+ fail_make_request=[KNL]
+ General fault injection mechanism.
+ Format: <interval>,<probability>,<space>,<times>
+ See also /Documentation/fault-injection/.
+
+ fd_mcs= [HW,SCSI]
+ See header of drivers/scsi/fd_mcs.c.
+
+ fdomain= [HW,SCSI]
+ See header of drivers/scsi/fdomain.c.
+
+ floppy= [HW]
+ See Documentation/blockdev/floppy.txt.
+
+ force_pal_cache_flush
+ [IA-64] Avoid check_sal_cache_flush which may hang on
+ buggy SAL_CACHE_FLUSH implementations. Using this
+ parameter will force ia64_sal_cache_flush to call
+ ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
+
+ gamecon.map[2|3]=
+ [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
+ support via parallel port (up to 5 devices per port)
+ Format: <port#>,<pad1>,<pad2>,<pad3>,<pad4>,<pad5>
+ See also Documentation/input/joystick-parport.txt
+
+ gamma= [HW,DRM]
+
+ gart_fix_e820= [X86_64] disable the fix e820 for K8 GART
+ Format: off | on
+ default: on
+
+ gdth= [HW,SCSI]
+ See header of drivers/scsi/gdth.c.
+
+ gpt [EFI] Forces disk with valid GPT signature but
+ invalid Protective MBR to be treated as GPT.
+
+ gvp11= [HW,SCSI]
+
+ hashdist= [KNL,NUMA] Large hashes allocated during boot
+ are distributed across NUMA nodes. Defaults on
+ for IA-64, off otherwise.
+ Format: 0 | 1 (for off | on)
+
+ hcl= [IA-64] SGI's Hardware Graph compatibility layer
+
+ hd= [EIDE] (E)IDE hard drive subsystem geometry
+ Format: <cyl>,<head>,<sect>
+
+ highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact
+ size of <nn>. This works even on boxes that have no
+ highmem otherwise. This also works to reduce highmem
+ size on bigger boxes.
+
+ highres= [KNL] Enable/disable high resolution timer mode.
+ Valid parameters: "on", "off"
+ Default: "on"
+
+ hisax= [HW,ISDN]
+ See Documentation/isdn/README.HiSax.
+
+ hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
+ hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
+ On x86-64 and powerpc, this option can be specified
+ multiple times interleaved with hugepages= to reserve
+ huge pages of different sizes. Valid pages sizes on
+ x86-64 are 2M (when the CPU supports "pse") and 1G
+ (when the CPU supports the "pdpe1gb" cpuinfo flag)
+ Note that 1GB pages can only be allocated at boot time
+ using hugepages= and not freed afterwards.
+ default_hugepagesz=
+ [same as hugepagesz=] The size of the default
+ HugeTLB page size. This is the size represented by
+ the legacy /proc/ hugepages APIs, used for SHM, and
+ default size when mounting hugetlbfs filesystems.
+ Defaults to the default architecture's huge page size
+ if not specified.
+
+ hlt [BUGS=ARM,SH]
+
+ i8042.debug [HW] Toggle i8042 debug mode
+ i8042.direct [HW] Put keyboard port into non-translated mode
+ i8042.dumbkbd [HW] Pretend that controller can only read data from
+ keyboard and cannot control its state
+ (Don't attempt to blink the leds)
+ i8042.noaux [HW] Don't check for auxiliary (== mouse) port
+ i8042.nokbd [HW] Don't check/create keyboard port
+ i8042.noloop [HW] Disable the AUX Loopback command while probing
+ for the AUX port
+ i8042.nomux [HW] Don't check presence of an active multiplexing
+ controller
+ i8042.nopnp [HW] Don't use ACPIPnP / PnPBIOS to discover KBD/AUX
+ controllers
+ i8042.panicblink=
+ [HW] Frequency with which keyboard LEDs should blink
+ when kernel panics (default is 0.5 sec)
+ i8042.reset [HW] Reset the controller during init and cleanup
+ i8042.unlock [HW] Unlock (ignore) the keylock
+
+ i810= [HW,DRM]
+
+ i8k.ignore_dmi [HW] Continue probing hardware even if DMI data
+ indicates that the driver is running on unsupported
+ hardware.
+ i8k.force [HW] Activate i8k driver even if SMM BIOS signature
+ does not match list of supported models.
+ i8k.power_status
+ [HW] Report power status in /proc/i8k
+ (disabled by default)
+ i8k.restricted [HW] Allow controlling fans only if SYS_ADMIN
+ capability is set.
+
+ ibmmcascsi= [HW,MCA,SCSI] IBM MicroChannel SCSI adapter
+ See Documentation/mca.txt.
+
+ icn= [HW,ISDN]
+ Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
+
+ ide= [HW] (E)IDE subsystem
+ Format: ide=nodma or ide=doubler
+ See Documentation/ide/ide.txt.
+
+ idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed
+ See Documentation/ide/ide.txt.
+
+ idle= [X86]
+ Format: idle=poll or idle=mwait, idle=halt, idle=nomwait
+ Poll forces a polling idle loop that can slightly improves the performance
+ of waking up a idle CPU, but will use a lot of power and make the system
+ run hot. Not recommended.
+ idle=mwait. On systems which support MONITOR/MWAIT but the kernel chose
+ to not use it because it doesn't save as much power as a normal idle
+ loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same
+ as idle=poll.
+ idle=halt. Halt is forced to be used for CPU idle.
+ In such case C2/C3 won't be used again.
+ idle=nomwait. Disable mwait for CPU C-states
+
+ ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
+ Claim all unknown PCI IDE storage controllers.
+
+ ignore_loglevel [KNL]
+ Ignore loglevel setting - this will print /all/
+ kernel messages to the console. Useful for debugging.
+
+ ihash_entries= [KNL]
+ Set number of hash buckets for inode cache.
+
+ in2000= [HW,SCSI]
+ See header of drivers/scsi/in2000.c.
+
+ init= [KNL]
+ Format: <full_path>
+ Run specified binary instead of /sbin/init as init
+ process.
+
+ initcall_debug [KNL] Trace initcalls as they are executed. Useful
+ for working out where the kernel is dying during
+ startup.
+
+ initrd= [BOOT] Specify the location of the initial ramdisk
+
+ inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
+ Format: <irq>
+
+ inttest= [IA64]
+
+ iommu= [x86]
+ off
+ force
+ noforce
+ biomerge
+ panic
+ nopanic
+ merge
+ nomerge
+ forcesac
+ soft
+
+
+ intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
+ off
+ Disable intel iommu driver.
+ igfx_off [Default Off]
+ By default, gfx is mapped as normal device. If a gfx
+ device has a dedicated DMAR unit, the DMAR unit is
+ bypassed by not enabling DMAR with this option. In
+ this case, gfx device will use physical address for
+ DMA.
+ forcedac [x86_64]
+ With this option iommu will not optimize to look
+ for io virtual address below 32 bit forcing dual
+ address cycle on pci bus for cards supporting greater
+ than 32 bit addressing. The default is to look
+ for translation below 32 bit and if not available
+ then look in the higher range.
+ strict [Default Off]
+ With this option on every unmap_single operation will
+ result in a hardware IOTLB flush operation as opposed
+ to batching them for performance.
+
+ io_delay= [X86-32,X86-64] I/O delay method
+ 0x80
+ Standard port 0x80 based delay
+ 0xed
+ Alternate port 0xed based delay (needed on some systems)
+ udelay
+ Simple two microseconds delay
+ none
+ No delay
+
+ io7= [HW] IO7 for Marvel based alpha systems
+ See comment before marvel_specify_io7 in
+ arch/alpha/kernel/core_marvel.c.
+
+ ip= [IP_PNP]
+ See Documentation/filesystems/nfsroot.txt.
+
+ ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards
+ See comment before ip2_setup() in
+ drivers/char/ip2/ip2base.c.
+
+ ips= [HW,SCSI] Adaptec / IBM ServeRAID controller
+ See header of drivers/scsi/ips.c.
+
+ ports= [IP_VS_FTP] IPVS ftp helper module
+ Default is 21.
+ Up to 8 (IP_VS_APP_MAX_PORTS) ports
+ may be specified.
+ Format: <port>,<port>....
+
+ irqfixup [HW]
+ When an interrupt is not handled search all handlers
+ for it. Intended to get systems with badly broken
+ firmware running.
+
+ irqpoll [HW]
+ When an interrupt is not handled search all handlers
+ for it. Also check all handlers each timer
+ interrupt. Intended to get systems with badly broken
+ firmware running.
+
+ isapnp= [ISAPNP]
+ Format: <RDP>,<reset>,<pci_scan>,<verbosity>
+
+ isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler.
+ Format:
+ <cpu number>,...,<cpu number>
+ or
+ <cpu number>-<cpu number>
+ (must be a positive range in ascending order)
+ or a mixture
+ <cpu number>,...,<cpu number>-<cpu number>
+
+ This option can be used to specify one or more CPUs
+ to isolate from the general SMP balancing and scheduling
+ algorithms. You can move a process onto or off an
+ "isolated" CPU via the CPU affinity syscalls or cpuset.
+ <cpu number> begins at 0 and the maximum value is
+ "number of CPUs in system - 1".
+
+ This option is the preferred way to isolate CPUs. The
+ alternative -- manually setting the CPU mask of all
+ tasks in the system -- can cause problems and
+ suboptimal load balancer performance.
+
+ iucv= [HW,NET]
+
+ js= [HW,JOY] Analog joystick
+ See Documentation/input/joystick.txt.
+
+ kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
+ specifies the amount of memory usable by the kernel
+ for non-movable allocations. The requested amount is
+ spread evenly throughout all nodes in the system. The
+ remaining memory in each node is used for Movable
+ pages. In the event, a node is too small to have both
+ kernelcore and Movable pages, kernelcore pages will
+ take priority and other nodes will have a larger number
+ of kernelcore pages. The Movable zone is used for the
+ allocation of pages that may be reclaimed or moved
+ by the page migration subsystem. This means that
+ HugeTLB pages may not be allocated from this zone.
+ Note that allocations like PTEs-from-HighMem still
+ use the HighMem zone if it exists, and the Normal
+ zone if it does not.
+
+ movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
+ is similar to kernelcore except it specifies the
+ amount of memory used for migratable allocations.
+ If both kernelcore and movablecore is specified,
+ then kernelcore will be at *least* the specified
+ value but may be more. If movablecore on its own
+ is specified, the administrator must be careful
+ that the amount of memory usable for all allocations
+ is not too small.
+
+ keepinitrd [HW,ARM]
+
+ kstack=N [X86-32,X86-64] Print N words from the kernel stack
+ in oops dumps.
+
+ kgdboc= [HW] kgdb over consoles.
+ Requires a tty driver that supports console polling.
+ (only serial suported for now)
+ Format: <serial_device>[,baud]
+
+ kmac= [MIPS] korina ethernet MAC address.
+ Configure the RouterBoard 532 series on-chip
+ Ethernet adapter MAC address.
+
+ l2cr= [PPC]
+
+ l3cr= [PPC]
+
+ lapic [X86-32,APIC] Enable the local APIC even if BIOS
+ disabled it.
+
+ lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in
+ C2 power state.
+
+ libata.dma= [LIBATA] DMA control
+ libata.dma=0 Disable all PATA and SATA DMA
+ libata.dma=1 PATA and SATA Disk DMA only
+ libata.dma=2 ATAPI (CDROM) DMA only
+ libata.dma=4 Compact Flash DMA only
+ Combinations also work, so libata.dma=3 enables DMA
+ for disks and CDROMs, but not CFs.
+
+ libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume
+ when set.
+ Format: <int>
+
+ libata.force= [LIBATA] Force configurations. The format is comma
+ separated list of "[ID:]VAL" where ID is
+ PORT[:DEVICE]. PORT and DEVICE are decimal numbers
+ matching port, link or device. Basically, it matches
+ the ATA ID string printed on console by libata. If
+ the whole ID part is omitted, the last PORT and DEVICE
+ values are used. If ID hasn't been specified yet, the
+ configuration applies to all ports, links and devices.
+
+ If only DEVICE is omitted, the parameter applies to
+ the port and all links and devices behind it. DEVICE
+ number of 0 either selects the first device or the
+ first fan-out link behind PMP device. It does not
+ select the host link. DEVICE number of 15 selects the
+ host link and device attached to it.
+
+ The VAL specifies the configuration to force. As long
+ as there's no ambiguity shortcut notation is allowed.
+ For example, both 1.5 and 1.5G would work for 1.5Gbps.
+ The following configurations can be forced.
+
+ * Cable type: 40c, 80c, short40c, unk, ign or sata.
+ Any ID with matching PORT is used.
+
+ * SATA link speed limit: 1.5Gbps or 3.0Gbps.
+
+ * Transfer mode: pio[0-7], mwdma[0-4] and udma[0-7].
+ udma[/][16,25,33,44,66,100,133] notation is also
+ allowed.
+
+ * [no]ncq: Turn on or off NCQ.
+
+ * nohrst, nosrst, norst: suppress hard, soft
+ and both resets.
+
+ If there are multiple matching configurations changing
+ the same attribute, the last one is used.
+
+ load_ramdisk= [RAM] List of ramdisks to load from floppy
+ See Documentation/blockdev/ramdisk.txt.
+
+ lockd.nlm_grace_period=P [NFS] Assign grace period.
+ Format: <integer>
+
+ lockd.nlm_tcpport=N [NFS] Assign TCP port.
+ Format: <integer>
+
+ lockd.nlm_timeout=T [NFS] Assign timeout value.
+ Format: <integer>
+
+ lockd.nlm_udpport=M [NFS] Assign UDP port.
+ Format: <integer>
+
+ logibm.irq= [HW,MOUSE] Logitech Bus Mouse Driver
+ Format: <irq>
+
+ loglevel= All Kernel Messages with a loglevel smaller than the
+ console loglevel will be printed to the console. It can
+ also be changed with klogd or other programs. The
+ loglevels are defined as follows:
+
+ 0 (KERN_EMERG) system is unusable
+ 1 (KERN_ALERT) action must be taken immediately
+ 2 (KERN_CRIT) critical conditions
+ 3 (KERN_ERR) error conditions
+ 4 (KERN_WARNING) warning conditions
+ 5 (KERN_NOTICE) normal but significant condition
+ 6 (KERN_INFO) informational
+ 7 (KERN_DEBUG) debug-level messages
+
+ log_buf_len=n Sets the size of the printk ring buffer, in bytes.
+ Format: { n | nk | nM }
+ n must be a power of two. The default size
+ is set in the kernel config file.
+
+ logo.nologo [FB] Disables display of the built-in Linux logo.
+ This may be used to provide more screen space for
+ kernel log messages and is useful when debugging
+ kernel boot problems.
+
+ lp=0 [LP] Specify parallel ports to use, e.g,
+ lp=port[,port...] lp=none,parport0 (lp0 not configured, lp1 uses
+ lp=reset first parallel port). 'lp=0' disables the
+ lp=auto printer driver. 'lp=reset' (which can be
+ specified in addition to the ports) causes
+ attached printers to be reset. Using
+ lp=port1,port2,... specifies the parallel ports
+ to associate lp devices with, starting with
+ lp0. A port specification may be 'none' to skip
+ that lp device, or a parport name such as
+ 'parport0'. Specifying 'lp=auto' instead of a
+ port specification list means that device IDs
+ from each port should be examined, to see if
+ an IEEE 1284-compliant printer is attached; if
+ so, the driver will manage that printer.
+ See also header of drivers/char/lp.c.
+
+ lpj=n [KNL]
+ Sets loops_per_jiffy to given constant, thus avoiding
+ time-consuming boot-time autodetection (up to 250 ms per
+ CPU). 0 enables autodetection (default). To determine
+ the correct value for your kernel, boot with normal
+ autodetection and see what value is printed. Note that
+ on SMP systems the preset will be applied to all CPUs,
+ which is likely to cause problems if your CPUs need
+ significantly divergent settings. An incorrect value
+ will cause delays in the kernel to be wrong, leading to
+ unpredictable I/O errors and other breakage. Although
+ unlikely, in the extreme case this might damage your
+ hardware.
+
+ ltpc= [NET]
+ Format: <io>,<irq>,<dma>
+
+ mac5380= [HW,SCSI] Format:
+ <can_queue>,<cmd_per_lun>,<sg_tablesize>,<hostid>,<use_tags>
+
+ machvec= [IA64] Force the use of a particular machine-vector
+ (machvec) in a generic kernel.
+ Example: machvec=hpzx1_swiotlb
+
+ max_loop= [LOOP] Maximum number of loopback devices that can
+ be mounted
+ Format: <1-256>
+
+ maxcpus= [SMP] Maximum number of processors that an SMP kernel
+ should make use of. maxcpus=n : n >= 0 limits the
+ kernel to using 'n' processors. n=0 is a special case,
+ it is equivalent to "nosmp", which also disables
+ the IO APIC.
+
+ max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater than
+ or equal to this physical address is ignored.
+
+ max_luns= [SCSI] Maximum number of LUNs to probe.
+ Should be between 1 and 2^32-1.
+
+ max_report_luns=
+ [SCSI] Maximum number of LUNs received.
+ Should be between 1 and 16384.
+
+ mcatest= [IA-64]
+
+ mce [X86-32] Machine Check Exception
+
+ mce=option [X86-64] See Documentation/x86/x86_64/boot-options.txt
+
+ md= [HW] RAID subsystems devices and level
+ See Documentation/md.txt.
+
+ mdacon= [MDA]
+ Format: <first>,<last>
+ Specifies range of consoles to be captured by the MDA.
+
+ mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
+ Amount of memory to be used when the kernel is not able
+ to see the whole system memory or for test.
+ [X86-32] Use together with memmap= to avoid physical
+ address space collisions. Without memmap= PCI devices
+ could be placed at addresses belonging to unused RAM.
+
+ mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
+ memory.
+
+ memchunk=nn[KMG]
+ [KNL,SH] Allow user to override the default size for
+ per-device physically contiguous DMA buffers.
+
+ memmap=exactmap [KNL,X86-32,X86_64] Enable setting of an exact
+ E820 memory map, as specified by the user.
+ Such memmap=exactmap lines can be constructed based on
+ BIOS output or other requirements. See the memmap=nn@ss
+ option description.
+
+ memmap=nn[KMG]@ss[KMG]
+ [KNL] Force usage of a specific region of memory
+ Region of memory to be used, from ss to ss+nn.
+
+ memmap=nn[KMG]#ss[KMG]
+ [KNL,ACPI] Mark specific memory as ACPI data.
+ Region of memory to be used, from ss to ss+nn.
+
+ memmap=nn[KMG]$ss[KMG]
+ [KNL,ACPI] Mark specific memory as reserved.
+ Region of memory to be used, from ss to ss+nn.
+ Example: Exclude memory from 0x18690000-0x1869ffff
+ memmap=64K$0x18690000
+ or
+ memmap=0x10000$0x18690000
+
+ memory_corruption_check=0/1 [X86]
+ Some BIOSes seem to corrupt the first 64k of
+ memory when doing things like suspend/resume.
+ Setting this option will scan the memory
+ looking for corruption. Enabling this will
+ both detect corruption and prevent the kernel
+ from using the memory being corrupted.
+ However, its intended as a diagnostic tool; if
+ repeatable BIOS-originated corruption always
+ affects the same memory, you can use memmap=
+ to prevent the kernel from using that memory.
+
+ memory_corruption_check_size=size [X86]
+ By default it checks for corruption in the low
+ 64k, making this memory unavailable for normal
+ use. Use this parameter to scan for
+ corruption in more or less memory.
+
+ memory_corruption_check_period=seconds [X86]
+ By default it checks for corruption every 60
+ seconds. Use this parameter to check at some
+ other rate. 0 disables periodic checking.
+
+ memtest= [KNL,X86] Enable memtest
+ Format: <integer>
+ range: 0,4 : pattern number
+ default : 0 <disable>
+
+ meye.*= [HW] Set MotionEye Camera parameters
+ See Documentation/video4linux/meye.txt.
+
+ mfgpt_irq= [IA-32] Specify the IRQ to use for the
+ Multi-Function General Purpose Timers on AMD Geode
+ platforms.
+
+ mfgptfix [X86-32] Fix MFGPT timers on AMD Geode platforms when
+ the BIOS has incorrectly applied a workaround. TinyBIOS
+ version 0.98 is known to be affected, 0.99 fixes the
+ problem by letting the user disable the workaround.
+
+ mga= [HW,DRM]
+
+ min_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory below this
+ physical address is ignored.
+
+ mminit_loglevel=
+ [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
+ parameter allows control of the logging verbosity for
+ the additional memory initialisation checks. A value
+ of 0 disables mminit logging and a level of 4 will
+ log everything. Information is printed at KERN_DEBUG
+ so loglevel=8 may also need to be specified.
+
+ mousedev.tap_time=
+ [MOUSE] Maximum time between finger touching and
+ leaving touchpad surface for touch to be considered
+ a tap and be reported as a left button click (for
+ touchpads working in absolute mode only).
+ Format: <msecs>
+ mousedev.xres= [MOUSE] Horizontal screen resolution, used for devices
+ reporting absolute coordinates, such as tablets
+ mousedev.yres= [MOUSE] Vertical screen resolution, used for devices
+ reporting absolute coordinates, such as tablets
+
+ mpu401= [HW,OSS]
+ Format: <io>,<irq>
+
+ MTD_Partition= [MTD]
+ Format: <name>,<region-number>,<size>,<offset>
+
+ MTD_Region= [MTD] Format:
+ <name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>]
+
+ mtdparts= [MTD]
+ See drivers/mtd/cmdlinepart.c.
+
+ mtdset= [ARM]
+ ARM/S3C2412 JIVE boot control
+
+ See arch/arm/mach-s3c2412/mach-jive.c
+
+ mtouchusb.raw_coordinates=
+ [HW] Make the MicroTouch USB driver use raw coordinates
+ ('y', default) or cooked coordinates ('n')
+
+ n2= [NET] SDL Inc. RISCom/N2 synchronous serial card
+
+ NCR_D700= [HW,SCSI]
+ See header of drivers/scsi/NCR_D700.c.
+
+ ncr5380= [HW,SCSI]
+
+ ncr53c400= [HW,SCSI]
+
+ ncr53c400a= [HW,SCSI]
+
+ ncr53c406a= [HW,SCSI]
+
+ ncr53c8xx= [HW,SCSI]
+
+ netdev= [NET] Network devices parameters
+ Format: <irq>,<io>,<mem_start>,<mem_end>,<name>
+ Note that mem_start is often overloaded to mean
+ something different and driver-specific.
+ This usage is only documented in each driver source
+ file if at all.
+
+ nf_conntrack.acct=
+ [NETFILTER] Enable connection tracking flow accounting
+ 0 to disable accounting
+ 1 to enable accounting
+ Default value depends on CONFIG_NF_CT_ACCT that is
+ going to be removed in 2.6.29.
+
+ nfsaddrs= [NFS]
+ See Documentation/filesystems/nfsroot.txt.
+
+ nfsroot= [NFS] nfs root filesystem for disk-less boxes.
+ See Documentation/filesystems/nfsroot.txt.
+
+ nfs.callback_tcpport=
+ [NFS] set the TCP port on which the NFSv4 callback
+ channel should listen.
+
+ nfs.idmap_cache_timeout=
+ [NFS] set the maximum lifetime for idmapper cache
+ entries.
+
+ nfs.enable_ino64=
+ [NFS] enable 64-bit inode numbers.
+ If zero, the NFS client will fake up a 32-bit inode
+ number for the readdir() and stat() syscalls instead
+ of returning the full 64-bit number.
+ The default is to return 64-bit inode numbers.
+
+ nmi_debug= [KNL,AVR32] Specify one or more actions to take
+ when a NMI is triggered.
+ Format: [state][,regs][,debounce][,die]
+
+ nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels
+
+ no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
+ emulation library even if a 387 maths coprocessor
+ is present.
+
+ noaliencache [MM, NUMA, SLAB] Disables the allocation of alien
+ caches in the slab allocator. Saves per-node memory,
+ but will impact performance.
+
+ noalign [KNL,ARM]
+
+ noapic [SMP,APIC] Tells the kernel to not make use of any
+ IOAPICs that may be present in the system.
+
+ nobats [PPC] Do not use BATs for mapping kernel lowmem
+ on "Classic" PPC cores.
+
+ nocache [ARM]
+
+ nodelayacct [KNL] Disable per-task delay accounting
+
+ nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects.
+
+ nodsp [SH] Disable hardware DSP at boot time.
+
+ noefi [X86-32,X86-64] Disable EFI runtime services support.
+
+ noexec [IA-64]
+
+ noexec [X86-32,X86-64]
+ On X86-32 available only on PAE configured kernels.
+ noexec=on: enable non-executable mappings (default)
+ noexec=off: disable non-executable mappings
+
+ noexec32 [X86-64]
+ This affects only 32-bit executables.
+ noexec32=on: enable non-executable mappings (default)
+ read doesn't imply executable mappings
+ noexec32=off: disable non-executable mappings
+ read implies executable mappings
+
+ nofpu [SH] Disable hardware FPU at boot time.
+
+ nofxsr [BUGS=X86-32] Disables x86 floating point extended
+ register save and restore. The kernel will only save
+ legacy floating-point registers on task switch.
+
+ noclflush [BUGS=X86] Don't use the CLFLUSH instruction
+
+ nohlt [BUGS=ARM,SH]
+
+ no-hlt [BUGS=X86-32] Tells the kernel that the hlt
+ instruction doesn't work correctly and not to
+ use it.
+
+ nohalt [IA-64] Tells the kernel not to use the power saving
+ function PAL_HALT_LIGHT when idle. This increases
+ power-consumption. On the positive side, it reduces
+ interrupt wake-up latency, which may improve performance
+ in certain environments such as networked servers or
+ real-time systems.
+
+ nohz= [KNL] Boottime enable/disable dynamic ticks
+ Valid arguments: on, off
+ Default: on
+
+ noirqdebug [X86-32] Disables the code which attempts to detect and
+ disable unhandled interrupt sources.
+
+ no_timer_check [X86-32,X86_64,APIC] Disables the code which tests for
+ broken timer IRQ sources.
+
+ noisapnp [ISAPNP] Disables ISA PnP code.
+
+ noinitrd [RAM] Tells the kernel not to load any configured
+ initial RAM disk.
+
+ nointroute [IA-64]
+
+ nojitter [IA64] Disables jitter checking for ITC timers.
+
+ nolapic [X86-32,APIC] Do not enable or use the local APIC.
+
+ nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
+
+ nox2apic [X86-64,APIC] Do not enable x2APIC mode.
+
+ x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
+ default x2apic cluster mode on platforms
+ supporting x2apic.
+
+ noltlbs [PPC] Do not use large page/tlb entries for kernel
+ lowmem mapping on PPC40x.
+
+ nomca [IA-64] Disable machine check abort handling
+
+ nomce [X86-32] Machine Check Exception
+
+ nomfgpt [X86-32] Disable Multi-Function General Purpose
+ Timer usage (for AMD Geode machines).
+
+ noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops
+
+ noreplace-smp [X86-32,SMP] Don't replace SMP instructions
+ with UP alternatives
+
+ noresidual [PPC] Don't use residual data on PReP machines.
+
+ noresume [SWSUSP] Disables resume and restores original swap
+ space.
+
+ no-scroll [VGA] Disables scrollback.
+ This is required for the Braillex ib80-piezo Braille
+ reader made by F.H. Papenmeier (Germany).
+
+ nosbagart [IA-64]
+
+ nosep [BUGS=X86-32] Disables x86 SYSENTER/SYSEXIT support.
+
+ nosmp [SMP] Tells an SMP kernel to act as a UP kernel,
+ and disable the IO APIC. legacy for "maxcpus=0".
+
+ nosoftlockup [KNL] Disable the soft-lockup detector.
+
+ nosync [HW,M68K] Disables sync negotiation for all devices.
+
+ notsc [BUGS=X86-32] Disable Time Stamp Counter
+
+ nousb [USB] Disable the USB subsystem
+
+ nowb [ARM]
+
+ nptcg= [IA64] Override max number of concurrent global TLB
+ purges which is reported from either PAL_VM_SUMMARY or
+ SAL PALO.
+
+ numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
+ one of ['zone', 'node', 'default'] can be specified
+ This can be set from sysctl after boot.
+ See Documentation/sysctl/vm.txt for details.
+
+ nr_uarts= [SERIAL] maximum number of UARTs to be registered.
+
+ olpc_ec_timeout= [OLPC] ms delay when issuing EC commands
+ Rather than timing out after 20 ms if an EC
+ command is not properly ACKed, override the length
+ of the timeout. We have interrupts disabled while
+ waiting for the ACK, so if this is set too high
+ interrupts *may* be lost!
+
+ opl3= [HW,OSS]
+ Format: <io>
+
+ oprofile.timer= [HW]
+ Use timer interrupt instead of performance counters
+
+ osst= [HW,SCSI] SCSI Tape Driver
+ Format: <buffer_size>,<write_threshold>
+ See also Documentation/scsi/st.txt.
+
+ panic= [KNL] Kernel behaviour on panic
+ Format: <timeout>
+
+ parkbd.port= [HW] Parallel port number the keyboard adapter is
+ connected to, default is 0.
+ Format: <parport#>
+ parkbd.mode= [HW] Parallel port keyboard adapter mode of operation,
+ 0 for XT, 1 for AT (default is AT).
+ Format: <mode>
+
+ parport= [HW,PPT] Specify parallel ports. 0 disables.
+ Format: { 0 | auto | 0xBBB[,IRQ[,DMA]] }
+ Use 'auto' to force the driver to use any
+ IRQ/DMA settings detected (the default is to
+ ignore detected IRQ/DMA settings because of
+ possible conflicts). You can specify the base
+ address, IRQ, and DMA settings; IRQ and DMA
+ should be numbers, or 'auto' (for using detected
+ settings on that particular port), or 'nofifo'
+ (to avoid using a FIFO even if it is detected).
+ Parallel ports are assigned in the order they
+ are specified on the command line, starting
+ with parport0.
+
+ parport_init_mode= [HW,PPT]
+ Configure VIA parallel port to operate in
+ a specific mode. This is necessary on Pegasos
+ computer where firmware has no options for setting
+ up parallel port mode and sets it to spp.
+ Currently this function knows 686a and 8231 chips.
+ Format: [spp|ps2|epp|ecp|ecpepp]
+
+ pas2= [HW,OSS] Format:
+ <io>,<irq>,<dma>,<dma16>,<sb_io>,<sb_irq>,<sb_dma>,<sb_dma16>
+
+ pas16= [HW,SCSI]
+ See header of drivers/scsi/pas16.c.
+
+ pause_on_oops=
+ Halt all CPUs after the first oops has been printed for
+ the specified number of seconds. This is to be used if
+ your oopses keep scrolling off the screen.
+
+ pcbit= [HW,ISDN]
+
+ pcd. [PARIDE]
+ See header of drivers/block/paride/pcd.c.
+ See also Documentation/blockdev/paride.txt.
+
+ pci=option[,option...] [PCI] various PCI subsystem options:
+ off [X86] don't probe for the PCI bus
+ bios [X86-32] force use of PCI BIOS, don't access
+ the hardware directly. Use this if your machine
+ has a non-standard PCI host bridge.
+ nobios [X86-32] disallow use of PCI BIOS, only direct
+ hardware access methods are allowed. Use this
+ if you experience crashes upon bootup and you
+ suspect they are caused by the BIOS.
+ conf1 [X86] Force use of PCI Configuration
+ Mechanism 1.
+ conf2 [X86] Force use of PCI Configuration
+ Mechanism 2.
+ noaer [PCIE] If the PCIEAER kernel config parameter is
+ enabled, this kernel boot option can be used to
+ disable the use of PCIE advanced error reporting.
+ nodomains [PCI] Disable support for multiple PCI
+ root domains (aka PCI segments, in ACPI-speak).
+ nommconf [X86-32,X86_64] Disable use of MMCONFIG for PCI
+ Configuration
+ nomsi [MSI] If the PCI_MSI kernel config parameter is
+ enabled, this kernel boot option can be used to
+ disable the use of MSI interrupts system-wide.
+ biosirq [X86-32] Use PCI BIOS calls to get the interrupt
+ routing table. These calls are known to be buggy
+ on several machines and they hang the machine
+ when used, but on other computers it's the only
+ way to get the interrupt routing table. Try
+ this option if the kernel is unable to allocate
+ IRQs or discover secondary PCI buses on your
+ motherboard.
+ rom [X86] Assign address space to expansion ROMs.
+ Use with caution as certain devices share
+ address decoders between ROMs and other
+ resources.
+ norom [X86] Do not assign address space to
+ expansion ROMs that do not already have
+ BIOS assigned address ranges.
+ irqmask=0xMMMM [X86] Set a bit mask of IRQs allowed to be
+ assigned automatically to PCI devices. You can
+ make the kernel exclude IRQs of your ISA cards
+ this way.
+ pirqaddr=0xAAAAA [X86] Specify the physical address
+ of the PIRQ table (normally generated
+ by the BIOS) if it is outside the
+ F0000h-100000h range.
+ lastbus=N [X86] Scan all buses thru bus #N. Can be
+ useful if the kernel is unable to find your
+ secondary buses and you want to tell it
+ explicitly which ones they are.
+ assign-busses [X86] Always assign all PCI bus
+ numbers ourselves, overriding
+ whatever the firmware may have done.
+ usepirqmask [X86] Honor the possible IRQ mask stored
+ in the BIOS $PIR table. This is needed on
+ some systems with broken BIOSes, notably
+ some HP Pavilion N5400 and Omnibook XE3
+ notebooks. This will have no effect if ACPI
+ IRQ routing is enabled.
+ noacpi [X86] Do not use ACPI for IRQ routing
+ or for PCI scanning.
+ use_crs [X86] Use _CRS for PCI resource
+ allocation.
+ routeirq Do IRQ routing for all PCI devices.
+ This is normally done in pci_enable_device(),
+ so this option is a temporary workaround
+ for broken drivers that don't call it.
+ skip_isa_align [X86] do not align io start addr, so can
+ handle more pci cards
+ firmware [ARM] Do not re-enumerate the bus but instead
+ just use the configuration from the
+ bootloader. This is currently used on
+ IXP2000 systems where the bus has to be
+ configured a certain way for adjunct CPUs.
+ noearly [X86] Don't do any early type 1 scanning.
+ This might help on some broken boards which
+ machine check when some devices' config space
+ is read. But various workarounds are disabled
+ and some IOMMU drivers will not work.
+ bfsort Sort PCI devices into breadth-first order.
+ This sorting is done to get a device
+ order compatible with older (<= 2.4) kernels.
+ nobfsort Don't sort PCI devices into breadth-first order.
+ cbiosize=nn[KMG] The fixed amount of bus space which is
+ reserved for the CardBus bridge's IO window.
+ The default value is 256 bytes.
+ cbmemsize=nn[KMG] The fixed amount of bus space which is
+ reserved for the CardBus bridge's memory
+ window. The default value is 64 megabytes.
+
+ pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
+ Management.
+ off Disable ASPM.
+ force Enable ASPM even on devices that claim not to support it.
+ WARNING: Forcing ASPM on may cause system lockups.
+
+ pcmv= [HW,PCMCIA] BadgePAD 4
+
+ pd. [PARIDE]
+ See Documentation/blockdev/paride.txt.
+
+ pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at
+ boot time.
+ Format: { 0 | 1 }
+ See arch/parisc/kernel/pdc_chassis.c
+
+ pf. [PARIDE]
+ See Documentation/blockdev/paride.txt.
+
+ pg. [PARIDE]
+ See Documentation/blockdev/paride.txt.
+
+ pirq= [SMP,APIC] Manual mp-table setup
+ See Documentation/x86/i386/IO-APIC.txt.
+
+ plip= [PPT,NET] Parallel port network link
+ Format: { parport<nr> | timid | 0 }
+ See also Documentation/parport.txt.
+
+ pmtmr= [X86] Manual setup of pmtmr I/O Port.
+ Override pmtimer IOPort with a hex value.
+ e.g. pmtmr=0x508
+
+ pnp.debug [PNP]
+ Enable PNP debug messages. This depends on the
+ CONFIG_PNP_DEBUG_MESSAGES option.
+
+ pnpacpi= [ACPI]
+ { off }
+
+ pnpbios= [ISAPNP]
+ { on | off | curr | res | no-curr | no-res }
+
+ pnp_reserve_irq=
+ [ISAPNP] Exclude IRQs for the autoconfiguration
+
+ pnp_reserve_dma=
+ [ISAPNP] Exclude DMAs for the autoconfiguration
+
+ pnp_reserve_io= [ISAPNP] Exclude I/O ports for the autoconfiguration
+ Ranges are in pairs (I/O port base and size).
+
+ pnp_reserve_mem=
+ [ISAPNP] Exclude memory regions for the
+ autoconfiguration.
+ Ranges are in pairs (memory base and size).
+
+ dynamic_printk
+ Enables pr_debug()/dev_dbg() calls if
+ CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. These can also
+ be switched on/off via <debugfs>/dynamic_printk/modules
+
+ print-fatal-signals=
+ [KNL] debug: print fatal signals
+ print-fatal-signals=1: print segfault info to
+ the kernel console.
+ default: off.
+
+ printk.time= Show timing data prefixed to each printk message line
+ Format: <bool> (1/Y/y=enable, 0/N/n=disable)
+
+ profile= [KNL] Enable kernel profiling via /proc/profile
+ Format: [schedule,]<number>
+ Param: "schedule" - profile schedule points.
+ Param: <number> - step/bucket size as a power of 2 for
+ statistical time based profiling.
+ Param: "sleep" - profile D-state sleeping (millisecs).
+ Requires CONFIG_SCHEDSTATS
+ Param: "kvm" - profile VM exits.
+
+ processor.max_cstate= [HW,ACPI]
+ Limit processor to maximum C-state
+ max_cstate=9 overrides any DMI blacklist limit.
+
+ processor.nocst [HW,ACPI]
+ Ignore the _CST method to determine C-states,
+ instead using the legacy FADT method
+
+ prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk
+ before loading.
+ See Documentation/blockdev/ramdisk.txt.
+
+ psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to
+ probe for; one of (bare|imps|exps|lifebook|any).
+ psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports
+ per second.
+ psmouse.resetafter= [HW,MOUSE]
+ Try to reset the device after so many bad packets
+ (0 = never).
+ psmouse.resolution=
+ [HW,MOUSE] Set desired mouse resolution, in dpi.
+ psmouse.smartscroll=
+ [HW,MOUSE] Controls Logitech smartscroll autorepeat.
+ 0 = disabled, 1 = enabled (default).
+
+ pss= [HW,OSS] Personal Sound System (ECHO ESC614)
+ Format:
+ <io>,<mss_io>,<mss_irq>,<mss_dma>,<mpu_io>,<mpu_irq>
+
+ pt. [PARIDE]
+ See Documentation/blockdev/paride.txt.
+
+ pty.legacy_count=
+ [KNL] Number of legacy pty's. Overwrites compiled-in
+ default number.
+
+ quiet [KNL] Disable most log messages
+
+ r128= [HW,DRM]
+
+ raid= [HW,RAID]
+ See Documentation/md.txt.
+
+ ramdisk_blocksize= [RAM]
+ See Documentation/blockdev/ramdisk.txt.
+
+ ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
+ See Documentation/blockdev/ramdisk.txt.
+
+ rcupdate.blimit= [KNL,BOOT]
+ Set maximum number of finished RCU callbacks to process
+ in one batch.
+
+ rcupdate.qhimark= [KNL,BOOT]
+ Set threshold of queued
+ RCU callbacks over which batch limiting is disabled.
+
+ rcupdate.qlowmark= [KNL,BOOT]
+ Set threshold of queued RCU callbacks below which
+ batch limiting is re-enabled.
+
+ rdinit= [KNL]
+ Format: <full_path>
+ Run specified binary instead of /init from the ramdisk,
+ used for early userspace startup. See initrd.
+
+ reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
+ Format: <reboot_mode>[,<reboot_mode2>[,...]]
+ See arch/*/kernel/reboot.c or arch/*/kernel/process.c
+
+ relax_domain_level=
+ [KNL, SMP] Set scheduler's default relax_domain_level.
+ See Documentation/cpusets.txt.
+
+ reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
+
+ reservetop= [X86-32]
+ Format: nn[KMG]
+ Reserves a hole at the top of the kernel virtual
+ address space.
+
+ reset_devices [KNL] Force drivers to reset the underlying device
+ during initialization.
+
+ resume= [SWSUSP]
+ Specify the partition device for software suspend
+
+ resume_offset= [SWSUSP]
+ Specify the offset from the beginning of the partition
+ given by "resume=" at which the swap header is located,
+ in <PAGE_SIZE> units (needed only for swap files).
+ See Documentation/power/swsusp-and-swap-files.txt
+
+ retain_initrd [RAM] Keep initrd memory after extraction
+
+ rhash_entries= [KNL,NET]
+ Set number of hash buckets for route cache
+
+ riscom8= [HW,SERIAL]
+ Format: <io_board1>[,<io_board2>[,...<io_boardN>]]
+
+ ro [KNL] Mount root device read-only on boot
+
+ root= [KNL] Root filesystem
+
+ rootdelay= [KNL] Delay (in seconds) to pause before attempting to
+ mount the root filesystem
+
+ rootflags= [KNL] Set root filesystem mount option string
+
+ rootfstype= [KNL] Set root filesystem type
+
+ rootwait [KNL] Wait (indefinitely) for root device to show up.
+ Useful for devices that are detected asynchronously
+ (e.g. USB and MMC devices).
+
+ root_plug.vendor_id=
+ [ROOTPLUG] Override the default vendor ID
+
+ root_plug.product_id=
+ [ROOTPLUG] Override the default product ID
+
+ root_plug.debug=
+ [ROOTPLUG] Enable debugging output
+
+ rw [KNL] Mount root device read-write on boot
+
+ S [KNL] Run init in single mode
+
+ sa1100ir [NET]
+ See drivers/net/irda/sa1100_ir.c.
+
+ sbni= [NET] Granch SBNI12 leased line adapter
+
+ sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
+ Format: <io>[,<timeout>[,<isapnp>]]
+
+ scsi_debug_*= [SCSI]
+ See drivers/scsi/scsi_debug.c.
+
+ scsi_default_dev_flags=
+ [SCSI] SCSI default device flags
+ Format: <integer>
+
+ scsi_dev_flags= [SCSI] Black/white list entry for vendor and model
+ Format: <vendor>:<model>:<flags>
+ (flags are integer value)
+
+ scsi_logging_level= [SCSI] a bit mask of logging levels
+ See drivers/scsi/scsi_logging.h for bits. Also
+ settable via sysctl at dev.scsi.logging_level
+ (/proc/sys/dev/scsi/logging_level).
+ There is also a nice 'scsi_logging_level' script in the
+ S390-tools package, available for download at
+ http://www-128.ibm.com/developerworks/linux/linux390/s390-tools-1.5.4.html
+
+ scsi_mod.scan= [SCSI] sync (default) scans SCSI busses as they are
+ discovered. async scans them in kernel threads,
+ allowing boot to proceed. none ignores them, expecting
+ user space to do the scan.
+
+ selinux [SELINUX] Disable or enable SELinux at boot time.
+ Format: { "0" | "1" }
+ See security/selinux/Kconfig help text.
+ 0 -- disable.
+ 1 -- enable.
+ Default value is set via kernel config option.
+ If enabled at boot time, /selinux/disable can be used
+ later to disable prior to initial policy load.
+
+ selinux_compat_net =
+ [SELINUX] Set initial selinux_compat_net flag value.
+ Format: { "0" | "1" }
+ 0 -- use new secmark-based packet controls
+ 1 -- use legacy packet controls
+ Default value is 0 (preferred).
+ Value can be changed at runtime via
+ /selinux/compat_net.
+
+ serialnumber [BUGS=X86-32]
+
+ shapers= [NET]
+ Maximal number of shapers.
+
+ show_msr= [x86] show boot-time MSR settings
+ Format: { <integer> }
+ Show boot-time (BIOS-initialized) MSR settings.
+ The parameter means the number of CPUs to show,
+ for example 1 means boot CPU only.
+
+ sim710= [SCSI,HW]
+ See header of drivers/scsi/sim710.c.
+
+ simeth= [IA-64]
+ simscsi=
+
+ slram= [HW,MTD]
+
+ slub_debug[=options[,slabs]] [MM, SLUB]
+ Enabling slub_debug allows one to determine the
+ culprit if slab objects become corrupted. Enabling
+ slub_debug can create guard zones around objects and
+ may poison objects when not in use. Also tracks the
+ last alloc / free. For more information see
+ Documentation/vm/slub.txt.
+
+ slub_max_order= [MM, SLUB]
+ Determines the maximum allowed order for slabs.
+ A high setting may cause OOMs due to memory
+ fragmentation. For more information see
+ Documentation/vm/slub.txt.
+
+ slub_min_objects= [MM, SLUB]
+ The minimum number of objects per slab. SLUB will
+ increase the slab order up to slub_max_order to
+ generate a sufficiently large slab able to contain
+ the number of objects indicated. The higher the number
+ of objects the smaller the overhead of tracking slabs
+ and the less frequently locks need to be acquired.
+ For more information see Documentation/vm/slub.txt.
+
+ slub_min_order= [MM, SLUB]
+ Determines the mininum page order for slabs. Must be
+ lower than slub_max_order.
+ For more information see Documentation/vm/slub.txt.
+
+ slub_nomerge [MM, SLUB]
+ Disable merging of slabs with similar size. May be
+ necessary if there is some reason to distinguish
+ allocs to different slabs. Debug options disable
+ merging on their own.
+ For more information see Documentation/vm/slub.txt.
+
+ smart2= [HW]
+ Format: <io1>[,<io2>[,...,<io8>]]
+
+ smp-alt-once [X86-32,SMP] On a hotplug CPU system, only
+ attempt to substitute SMP alternatives once at boot.
+
+ smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices
+ smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port
+ smsc-ircc2.ircc_sir= [HW] SIR base I/O port
+ smsc-ircc2.ircc_fir= [HW] FIR base I/O port
+ smsc-ircc2.ircc_irq= [HW] IRQ line
+ smsc-ircc2.ircc_dma= [HW] DMA channel
+ smsc-ircc2.ircc_transceiver= [HW] Transceiver type:
+ 0: Toshiba Satellite 1800 (GP data pin select)
+ 1: Fast pin select (default)
+ 2: ATC IRMode
+
+ snd-ad1816a= [HW,ALSA]
+
+ snd-ad1848= [HW,ALSA]
+
+ snd-ali5451= [HW,ALSA]
+
+ snd-als100= [HW,ALSA]
+
+ snd-als4000= [HW,ALSA]
+
+ snd-azt2320= [HW,ALSA]
+
+ snd-cmi8330= [HW,ALSA]
+
+ snd-cmipci= [HW,ALSA]
+
+ snd-cs4231= [HW,ALSA]
+
+ snd-cs4232= [HW,ALSA]
+
+ snd-cs4236= [HW,ALSA]
+
+ snd-cs4281= [HW,ALSA]
+
+ snd-cs46xx= [HW,ALSA]
+
+ snd-dt019x= [HW,ALSA]
+
+ snd-dummy= [HW,ALSA]
+
+ snd-emu10k1= [HW,ALSA]
+
+ snd-ens1370= [HW,ALSA]
+
+ snd-ens1371= [HW,ALSA]
+
+ snd-es968= [HW,ALSA]
+
+ snd-es1688= [HW,ALSA]
+
+ snd-es18xx= [HW,ALSA]
+
+ snd-es1938= [HW,ALSA]
+
+ snd-es1968= [HW,ALSA]
+
+ snd-fm801= [HW,ALSA]
+
+ snd-gusclassic= [HW,ALSA]
+
+ snd-gusextreme= [HW,ALSA]
+
+ snd-gusmax= [HW,ALSA]
+
+ snd-hdsp= [HW,ALSA]
+
+ snd-ice1712= [HW,ALSA]
+
+ snd-intel8x0= [HW,ALSA]
+
+ snd-interwave= [HW,ALSA]
+
+ snd-interwave-stb=
+ [HW,ALSA]
+
+ snd-korg1212= [HW,ALSA]
+
+ snd-maestro3= [HW,ALSA]
+
+ snd-mpu401= [HW,ALSA]
+
+ snd-mtpav= [HW,ALSA]
+
+ snd-nm256= [HW,ALSA]
+
+ snd-opl3sa2= [HW,ALSA]
+
+ snd-opti92x-ad1848=
+ [HW,ALSA]
+
+ snd-opti92x-cs4231=
+ [HW,ALSA]
+
+ snd-opti93x= [HW,ALSA]
+
+ snd-pmac= [HW,ALSA]
+
+ snd-rme32= [HW,ALSA]
+
+ snd-rme96= [HW,ALSA]
+
+ snd-rme9652= [HW,ALSA]
+
+ snd-sb8= [HW,ALSA]
+
+ snd-sb16= [HW,ALSA]
+
+ snd-sbawe= [HW,ALSA]
+
+ snd-serial= [HW,ALSA]
+
+ snd-sgalaxy= [HW,ALSA]
+
+ snd-sonicvibes= [HW,ALSA]
+
+ snd-sun-amd7930=
+ [HW,ALSA]
+
+ snd-sun-cs4231= [HW,ALSA]
+
+ snd-trident= [HW,ALSA]
+
+ snd-usb-audio= [HW,ALSA,USB]
+
+ snd-via82xx= [HW,ALSA]
+
+ snd-virmidi= [HW,ALSA]
+
+ snd-wavefront= [HW,ALSA]
+
+ snd-ymfpci= [HW,ALSA]
+
+ softlockup_panic=
+ [KNL] Should the soft-lockup detector generate panics.
+
+ sonypi.*= [HW] Sony Programmable I/O Control Device driver
+ See Documentation/sonypi.txt
+
+ specialix= [HW,SERIAL] Specialix multi-serial port adapter
+ See Documentation/serial/specialix.txt.
+
+ spia_io_base= [HW,MTD]
+ spia_fio_base=
+ spia_pedr=
+ spia_peddr=
+
+ sscape= [HW,OSS]
+ Format: <io>,<irq>,<dma>,<mpu_io>,<mpu_irq>
+
+ st= [HW,SCSI] SCSI tape parameters (buffers, etc.)
+ See Documentation/scsi/st.txt.
+
+ sti= [PARISC,HW]
+ Format: <num>
+ Set the STI (builtin display/keyboard on the HP-PARISC
+ machines) console (graphic card) which should be used
+ as the initial boot-console.
+ See also comment in drivers/video/console/sticore.c.
+
+ sti_font= [HW]
+ See comment in drivers/video/console/sticore.c.
+
+ stifb= [HW]
+ Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
+
+ sunrpc.pool_mode=
+ [NFS]
+ Control how the NFS server code allocates CPUs to
+ service thread pools. Depending on how many NICs
+ you have and where their interrupts are bound, this
+ option will affect which CPUs will do NFS serving.
+ Note: this parameter cannot be changed while the
+ NFS server is running.
+
+ auto the server chooses an appropriate mode
+ automatically using heuristics
+ global a single global pool contains all CPUs
+ percpu one pool for each CPU
+ pernode one pool for each NUMA node (equivalent
+ to global on non-NUMA machines)
+
+ swiotlb= [IA-64] Number of I/O TLB slabs
+
+ switches= [HW,M68k]
+
+ sym53c416= [HW,SCSI]
+ See header of drivers/scsi/sym53c416.c.
+
+ sysrq_always_enabled
+ [KNL]
+ Ignore sysrq setting - this boot parameter will
+ neutralize any effect of /proc/sys/kernel/sysrq.
+ Useful for debugging.
+
+ t128= [HW,SCSI]
+ See header of drivers/scsi/t128.c.
+
+ tdfx= [HW,DRM]
+
+ test_suspend= [SUSPEND]
+ Specify "mem" (for Suspend-to-RAM) or "standby" (for
+ standby suspend) as the system sleep state to briefly
+ enter during system startup. The system is woken from
+ this state using a wakeup-capable RTC alarm.
+
+ thash_entries= [KNL,NET]
+ Set number of hash buckets for TCP connection
+
+ thermal.act= [HW,ACPI]
+ -1: disable all active trip points in all thermal zones
+ <degrees C>: override all lowest active trip points
+
+ thermal.crt= [HW,ACPI]
+ -1: disable all critical trip points in all thermal zones
+ <degrees C>: override all critical trip points
+
+ thermal.nocrt= [HW,ACPI]
+ Set to disable actions on ACPI thermal zone
+ critical and hot trip points.
+
+ thermal.off= [HW,ACPI]
+ 1: disable ACPI thermal control
+
+ thermal.psv= [HW,ACPI]
+ -1: disable all passive trip points
+ <degrees C>: override all passive trip points to this value
+
+ thermal.tzp= [HW,ACPI]
+ Specify global default ACPI thermal zone polling rate
+ <deci-seconds>: poll all this frequency
+ 0: no polling (default)
+
+ tmscsim= [HW,SCSI]
+ See comment before function dc390_setup() in
+ drivers/scsi/tmscsim.c.
+
+ tp720= [HW,PS2]
+
+ trix= [HW,OSS] MediaTrix AudioTrix Pro
+ Format:
+ <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
+
+ turbografx.map[2|3]= [HW,JOY]
+ TurboGraFX parallel port interface
+ Format:
+ <port#>,<js1>,<js2>,<js3>,<js4>,<js5>,<js6>,<js7>
+ See also Documentation/input/joystick-parport.txt
+
+ u14-34f= [HW,SCSI] UltraStor 14F/34F SCSI host adapter
+ See header of drivers/scsi/u14-34f.c.
+
+ uart401= [HW,OSS]
+ Format: <io>,<irq>
+
+ uart6850= [HW,OSS]
+ Format: <io>,<irq>
+
+ uhci-hcd.ignore_oc=
+ [USB] Ignore overcurrent events (default N).
+ Some badly-designed motherboards generate lots of
+ bogus events, for ports that aren't wired to
+ anything. Set this parameter to avoid log spamming.
+ Note that genuine overcurrent events won't be
+ reported either.
+
+ unknown_nmi_panic
+ [X86-32,X86-64]
+ Set unknown_nmi_panic=1 early on boot.
+
+ usbcore.autosuspend=
+ [USB] The autosuspend time delay (in seconds) used
+ for newly-detected USB devices (default 2). This
+ is the time required before an idle device will be
+ autosuspended. Devices for which the delay is set
+ to a negative value won't be autosuspended at all.
+
+ usbcore.usbfs_snoop=
+ [USB] Set to log all usbfs traffic (default 0 = off).
+
+ usbcore.blinkenlights=
+ [USB] Set to cycle leds on hubs (default 0 = off).
+
+ usbcore.old_scheme_first=
+ [USB] Start with the old device initialization
+ scheme (default 0 = off).
+
+ usbcore.use_both_schemes=
+ [USB] Try the other device initialization scheme
+ if the first one fails (default 1 = enabled).
+
+ usbcore.initial_descriptor_timeout=
+ [USB] Specifies timeout for the initial 64-byte
+ USB_REQ_GET_DESCRIPTOR request in milliseconds
+ (default 5000 = 5.0 seconds).
+
+ usbhid.mousepoll=
+ [USBHID] The interval which mice are to be polled at.
+
+ add_efi_memmap [EFI; x86-32,X86-64] Include EFI memory map in
+ kernel's map of available physical RAM.
+
+ vdso= [X86-32,SH,x86-64]
+ vdso=2: enable compat VDSO (default with COMPAT_VDSO)
+ vdso=1: enable VDSO (default)
+ vdso=0: disable VDSO mapping
+
+ vdso32= [X86-32,X86-64]
+ vdso32=2: enable compat VDSO (default with COMPAT_VDSO)
+ vdso32=1: enable 32-bit VDSO (default)
+ vdso32=0: disable 32-bit VDSO mapping
+
+ vector= [IA-64,SMP]
+ vector=percpu: enable percpu vector domain
+
+ video= [FB] Frame buffer configuration
+ See Documentation/fb/modedb.txt.
+
+ vga= [BOOT,X86-32] Select a particular video mode
+ See Documentation/x86/i386/boot.txt and
+ Documentation/svga.txt.
+ Use vga=ask for menu.
+ This is actually a boot loader parameter; the value is
+ passed to the kernel using a special protocol.
+
+ vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact
+ size of <nn>. This can be used to increase the
+ minimum size (128MB on x86). It can also be used to
+ decrease the size and leave more room for directly
+ mapped kernel RAM.
+
+ vmhalt= [KNL,S390] Perform z/VM CP command after system halt.
+ Format: <command>
+
+ vmpanic= [KNL,S390] Perform z/VM CP command after kernel panic.
+ Format: <command>
+
+ vmpoff= [KNL,S390] Perform z/VM CP command after power off.
+ Format: <command>
+
+ waveartist= [HW,OSS]
+ Format: <io>,<irq>,<dma>,<dma2>
+
+ wd33c93= [HW,SCSI]
+ See header of drivers/scsi/wd33c93.c.
+
+ wd7000= [HW,SCSI]
+ See header of drivers/scsi/wd7000.c.
+
+ wdt= [WDT] Watchdog
+ See Documentation/watchdog/wdt.txt.
+
+ xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
+ xd_geo= See header of drivers/block/xd.c.
+
+ xirc2ps_cs= [NET,PCMCIA]
+ Format:
+ <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
+
+ norandmaps Don't use address space randomization
+ Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space
+
+______________________________________________________________________
+
+TODO:
+
+ Add documentation for ALSA options.
+ Add more DRM drivers.
diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt
new file mode 100644
index 0000000..09b55e4
--- /dev/null
+++ b/Documentation/keys-request-key.txt
@@ -0,0 +1,201 @@
+ ===================
+ KEY REQUEST SERVICE
+ ===================
+
+The key request service is part of the key retention service (refer to
+Documentation/keys.txt). This document explains more fully how the requesting
+algorithm works.
+
+The process starts by either the kernel requesting a service by calling
+request_key*():
+
+ struct key *request_key(const struct key_type *type,
+ const char *description,
+ const char *callout_info);
+
+or:
+
+ struct key *request_key_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_info,
+ size_t callout_len,
+ void *aux);
+
+or:
+
+ struct key *request_key_async(const struct key_type *type,
+ const char *description,
+ const char *callout_info,
+ size_t callout_len);
+
+or:
+
+ struct key *request_key_async_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_info,
+ size_t callout_len,
+ void *aux);
+
+Or by userspace invoking the request_key system call:
+
+ key_serial_t request_key(const char *type,
+ const char *description,
+ const char *callout_info,
+ key_serial_t dest_keyring);
+
+The main difference between the access points is that the in-kernel interface
+does not need to link the key to a keyring to prevent it from being immediately
+destroyed. The kernel interface returns a pointer directly to the key, and
+it's up to the caller to destroy the key.
+
+The request_key*_with_auxdata() calls are like the in-kernel request_key*()
+calls, except that they permit auxiliary data to be passed to the upcaller (the
+default is NULL). This is only useful for those key types that define their
+own upcall mechanism rather than using /sbin/request-key.
+
+The two async in-kernel calls may return keys that are still in the process of
+being constructed. The two non-async ones will wait for construction to
+complete first.
+
+The userspace interface links the key to a keyring associated with the process
+to prevent the key from going away, and returns the serial number of the key to
+the caller.
+
+
+The following example assumes that the key types involved don't define their
+own upcall mechanisms. If they do, then those should be substituted for the
+forking and execution of /sbin/request-key.
+
+
+===========
+THE PROCESS
+===========
+
+A request proceeds in the following manner:
+
+ (1) Process A calls request_key() [the userspace syscall calls the kernel
+ interface].
+
+ (2) request_key() searches the process's subscribed keyrings to see if there's
+ a suitable key there. If there is, it returns the key. If there isn't,
+ and callout_info is not set, an error is returned. Otherwise the process
+ proceeds to the next step.
+
+ (3) request_key() sees that A doesn't have the desired key yet, so it creates
+ two things:
+
+ (a) An uninstantiated key U of requested type and description.
+
+ (b) An authorisation key V that refers to key U and notes that process A
+ is the context in which key U should be instantiated and secured, and
+ from which associated key requests may be satisfied.
+
+ (4) request_key() then forks and executes /sbin/request-key with a new session
+ keyring that contains a link to auth key V.
+
+ (5) /sbin/request-key assumes the authority associated with key U.
+
+ (6) /sbin/request-key execs an appropriate program to perform the actual
+ instantiation.
+
+ (7) The program may want to access another key from A's context (say a
+ Kerberos TGT key). It just requests the appropriate key, and the keyring
+ search notes that the session keyring has auth key V in its bottom level.
+
+ This will permit it to then search the keyrings of process A with the
+ UID, GID, groups and security info of process A as if it was process A,
+ and come up with key W.
+
+ (8) The program then does what it must to get the data with which to
+ instantiate key U, using key W as a reference (perhaps it contacts a
+ Kerberos server using the TGT) and then instantiates key U.
+
+ (9) Upon instantiating key U, auth key V is automatically revoked so that it
+ may not be used again.
+
+(10) The program then exits 0 and request_key() deletes key V and returns key
+ U to the caller.
+
+This also extends further. If key W (step 7 above) didn't exist, key W would
+be created uninstantiated, another auth key (X) would be created (as per step
+3) and another copy of /sbin/request-key spawned (as per step 4); but the
+context specified by auth key X will still be process A, as it was in auth key
+V.
+
+This is because process A's keyrings can't simply be attached to
+/sbin/request-key at the appropriate places because (a) execve will discard two
+of them, and (b) it requires the same UID/GID/Groups all the way through.
+
+
+======================
+NEGATIVE INSTANTIATION
+======================
+
+Rather than instantiating a key, it is possible for the possessor of an
+authorisation key to negatively instantiate a key that's under construction.
+This is a short duration placeholder that causes any attempt at re-requesting
+the key whilst it exists to fail with error ENOKEY.
+
+This is provided to prevent excessive repeated spawning of /sbin/request-key
+processes for a key that will never be obtainable.
+
+Should the /sbin/request-key process exit anything other than 0 or die on a
+signal, the key under construction will be automatically negatively
+instantiated for a short amount of time.
+
+
+====================
+THE SEARCH ALGORITHM
+====================
+
+A search of any particular keyring proceeds in the following fashion:
+
+ (1) When the key management code searches for a key (keyring_search_aux) it
+ firstly calls key_permission(SEARCH) on the keyring it's starting with,
+ if this denies permission, it doesn't search further.
+
+ (2) It considers all the non-keyring keys within that keyring and, if any key
+ matches the criteria specified, calls key_permission(SEARCH) on it to see
+ if the key is allowed to be found. If it is, that key is returned; if
+ not, the search continues, and the error code is retained if of higher
+ priority than the one currently set.
+
+ (3) It then considers all the keyring-type keys in the keyring it's currently
+ searching. It calls key_permission(SEARCH) on each keyring, and if this
+ grants permission, it recurses, executing steps (2) and (3) on that
+ keyring.
+
+The process stops immediately a valid key is found with permission granted to
+use it. Any error from a previous match attempt is discarded and the key is
+returned.
+
+When search_process_keyrings() is invoked, it performs the following searches
+until one succeeds:
+
+ (1) If extant, the process's thread keyring is searched.
+
+ (2) If extant, the process's process keyring is searched.
+
+ (3) The process's session keyring is searched.
+
+ (4) If the process has assumed the authority associated with a request_key()
+ authorisation key then:
+
+ (a) If extant, the calling process's thread keyring is searched.
+
+ (b) If extant, the calling process's process keyring is searched.
+
+ (c) The calling process's session keyring is searched.
+
+The moment one succeeds, all pending errors are discarded and the found key is
+returned.
+
+Only if all these fail does the whole thing fail with the highest priority
+error. Note that several errors may have come from LSM.
+
+The error priority is:
+
+ EKEYREVOKED > EKEYEXPIRED > ENOKEY
+
+EACCES/EPERM are only returned on a direct search of a specific keyring where
+the basal keyring does not grant Search permission.
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
new file mode 100644
index 0000000..b56aacc
--- /dev/null
+++ b/Documentation/keys.txt
@@ -0,0 +1,1233 @@
+ ============================
+ KERNEL KEY RETENTION SERVICE
+ ============================
+
+This service allows cryptographic keys, authentication tokens, cross-domain
+user mappings, and similar to be cached in the kernel for the use of
+filesystems and other kernel services.
+
+Keyrings are permitted; these are a special type of key that can hold links to
+other keys. Processes each have three standard keyring subscriptions that a
+kernel service can search for relevant keys.
+
+The key service can be configured on by enabling:
+
+ "Security options"/"Enable access key retention support" (CONFIG_KEYS)
+
+This document has the following sections:
+
+ - Key overview
+ - Key service overview
+ - Key access permissions
+ - SELinux support
+ - New procfs files
+ - Userspace system call interface
+ - Kernel services
+ - Notes on accessing payload contents
+ - Defining a key type
+ - Request-key callback service
+ - Key access filesystem
+
+
+============
+KEY OVERVIEW
+============
+
+In this context, keys represent units of cryptographic data, authentication
+tokens, keyrings, etc.. These are represented in the kernel by struct key.
+
+Each key has a number of attributes:
+
+ - A serial number.
+ - A type.
+ - A description (for matching a key in a search).
+ - Access control information.
+ - An expiry time.
+ - A payload.
+ - State.
+
+
+ (*) Each key is issued a serial number of type key_serial_t that is unique for
+ the lifetime of that key. All serial numbers are positive non-zero 32-bit
+ integers.
+
+ Userspace programs can use a key's serial numbers as a way to gain access
+ to it, subject to permission checking.
+
+ (*) Each key is of a defined "type". Types must be registered inside the
+ kernel by a kernel service (such as a filesystem) before keys of that type
+ can be added or used. Userspace programs cannot define new types directly.
+
+ Key types are represented in the kernel by struct key_type. This defines a
+ number of operations that can be performed on a key of that type.
+
+ Should a type be removed from the system, all the keys of that type will
+ be invalidated.
+
+ (*) Each key has a description. This should be a printable string. The key
+ type provides an operation to perform a match between the description on a
+ key and a criterion string.
+
+ (*) Each key has an owner user ID, a group ID and a permissions mask. These
+ are used to control what a process may do to a key from userspace, and
+ whether a kernel service will be able to find the key.
+
+ (*) Each key can be set to expire at a specific time by the key type's
+ instantiation function. Keys can also be immortal.
+
+ (*) Each key can have a payload. This is a quantity of data that represent the
+ actual "key". In the case of a keyring, this is a list of keys to which
+ the keyring links; in the case of a user-defined key, it's an arbitrary
+ blob of data.
+
+ Having a payload is not required; and the payload can, in fact, just be a
+ value stored in the struct key itself.
+
+ When a key is instantiated, the key type's instantiation function is
+ called with a blob of data, and that then creates the key's payload in
+ some way.
+
+ Similarly, when userspace wants to read back the contents of the key, if
+ permitted, another key type operation will be called to convert the key's
+ attached payload back into a blob of data.
+
+ (*) Each key can be in one of a number of basic states:
+
+ (*) Uninstantiated. The key exists, but does not have any data attached.
+ Keys being requested from userspace will be in this state.
+
+ (*) Instantiated. This is the normal state. The key is fully formed, and
+ has data attached.
+
+ (*) Negative. This is a relatively short-lived state. The key acts as a
+ note saying that a previous call out to userspace failed, and acts as
+ a throttle on key lookups. A negative key can be updated to a normal
+ state.
+
+ (*) Expired. Keys can have lifetimes set. If their lifetime is exceeded,
+ they traverse to this state. An expired key can be updated back to a
+ normal state.
+
+ (*) Revoked. A key is put in this state by userspace action. It can't be
+ found or operated upon (apart from by unlinking it).
+
+ (*) Dead. The key's type was unregistered, and so the key is now useless.
+
+
+====================
+KEY SERVICE OVERVIEW
+====================
+
+The key service provides a number of features besides keys:
+
+ (*) The key service defines two special key types:
+
+ (+) "keyring"
+
+ Keyrings are special keys that contain a list of other keys. Keyring
+ lists can be modified using various system calls. Keyrings should not
+ be given a payload when created.
+
+ (+) "user"
+
+ A key of this type has a description and a payload that are arbitrary
+ blobs of data. These can be created, updated and read by userspace,
+ and aren't intended for use by kernel services.
+
+ (*) Each process subscribes to three keyrings: a thread-specific keyring, a
+ process-specific keyring, and a session-specific keyring.
+
+ The thread-specific keyring is discarded from the child when any sort of
+ clone, fork, vfork or execve occurs. A new keyring is created only when
+ required.
+
+ The process-specific keyring is replaced with an empty one in the child on
+ clone, fork, vfork unless CLONE_THREAD is supplied, in which case it is
+ shared. execve also discards the process's process keyring and creates a
+ new one.
+
+ The session-specific keyring is persistent across clone, fork, vfork and
+ execve, even when the latter executes a set-UID or set-GID binary. A
+ process can, however, replace its current session keyring with a new one
+ by using PR_JOIN_SESSION_KEYRING. It is permitted to request an anonymous
+ new one, or to attempt to create or join one of a specific name.
+
+ The ownership of the thread keyring changes when the real UID and GID of
+ the thread changes.
+
+ (*) Each user ID resident in the system holds two special keyrings: a user
+ specific keyring and a default user session keyring. The default session
+ keyring is initialised with a link to the user-specific keyring.
+
+ When a process changes its real UID, if it used to have no session key, it
+ will be subscribed to the default session key for the new UID.
+
+ If a process attempts to access its session key when it doesn't have one,
+ it will be subscribed to the default for its current UID.
+
+ (*) Each user has two quotas against which the keys they own are tracked. One
+ limits the total number of keys and keyrings, the other limits the total
+ amount of description and payload space that can be consumed.
+
+ The user can view information on this and other statistics through procfs
+ files. The root user may also alter the quota limits through sysctl files
+ (see the section "New procfs files").
+
+ Process-specific and thread-specific keyrings are not counted towards a
+ user's quota.
+
+ If a system call that modifies a key or keyring in some way would put the
+ user over quota, the operation is refused and error EDQUOT is returned.
+
+ (*) There's a system call interface by which userspace programs can create and
+ manipulate keys and keyrings.
+
+ (*) There's a kernel interface by which services can register types and search
+ for keys.
+
+ (*) There's a way for the a search done from the kernel to call back to
+ userspace to request a key that can't be found in a process's keyrings.
+
+ (*) An optional filesystem is available through which the key database can be
+ viewed and manipulated.
+
+
+======================
+KEY ACCESS PERMISSIONS
+======================
+
+Keys have an owner user ID, a group access ID, and a permissions mask. The mask
+has up to eight bits each for possessor, user, group and other access. Only
+six of each set of eight bits are defined. These permissions granted are:
+
+ (*) View
+
+ This permits a key or keyring's attributes to be viewed - including key
+ type and description.
+
+ (*) Read
+
+ This permits a key's payload to be viewed or a keyring's list of linked
+ keys.
+
+ (*) Write
+
+ This permits a key's payload to be instantiated or updated, or it allows a
+ link to be added to or removed from a keyring.
+
+ (*) Search
+
+ This permits keyrings to be searched and keys to be found. Searches can
+ only recurse into nested keyrings that have search permission set.
+
+ (*) Link
+
+ This permits a key or keyring to be linked to. To create a link from a
+ keyring to a key, a process must have Write permission on the keyring and
+ Link permission on the key.
+
+ (*) Set Attribute
+
+ This permits a key's UID, GID and permissions mask to be changed.
+
+For changing the ownership, group ID or permissions mask, being the owner of
+the key or having the sysadmin capability is sufficient.
+
+
+===============
+SELINUX SUPPORT
+===============
+
+The security class "key" has been added to SELinux so that mandatory access
+controls can be applied to keys created within various contexts. This support
+is preliminary, and is likely to change quite significantly in the near future.
+Currently, all of the basic permissions explained above are provided in SELinux
+as well; SELinux is simply invoked after all basic permission checks have been
+performed.
+
+The value of the file /proc/self/attr/keycreate influences the labeling of
+newly-created keys. If the contents of that file correspond to an SELinux
+security context, then the key will be assigned that context. Otherwise, the
+key will be assigned the current context of the task that invoked the key
+creation request. Tasks must be granted explicit permission to assign a
+particular context to newly-created keys, using the "create" permission in the
+key security class.
+
+The default keyrings associated with users will be labeled with the default
+context of the user if and only if the login programs have been instrumented to
+properly initialize keycreate during the login process. Otherwise, they will
+be labeled with the context of the login program itself.
+
+Note, however, that the default keyrings associated with the root user are
+labeled with the default kernel context, since they are created early in the
+boot process, before root has a chance to log in.
+
+The keyrings associated with new threads are each labeled with the context of
+their associated thread, and both session and process keyrings are handled
+similarly.
+
+
+================
+NEW PROCFS FILES
+================
+
+Two files have been added to procfs by which an administrator can find out
+about the status of the key service:
+
+ (*) /proc/keys
+
+ This lists the keys that are currently viewable by the task reading the
+ file, giving information about their type, description and permissions.
+ It is not possible to view the payload of the key this way, though some
+ information about it may be given.
+
+ The only keys included in the list are those that grant View permission to
+ the reading process whether or not it possesses them. Note that LSM
+ security checks are still performed, and may further filter out keys that
+ the current process is not authorised to view.
+
+ The contents of the file look like this:
+
+ SERIAL FLAGS USAGE EXPY PERM UID GID TYPE DESCRIPTION: SUMMARY
+ 00000001 I----- 39 perm 1f3f0000 0 0 keyring _uid_ses.0: 1/4
+ 00000002 I----- 2 perm 1f3f0000 0 0 keyring _uid.0: empty
+ 00000007 I----- 1 perm 1f3f0000 0 0 keyring _pid.1: empty
+ 0000018d I----- 1 perm 1f3f0000 0 0 keyring _pid.412: empty
+ 000004d2 I--Q-- 1 perm 1f3f0000 32 -1 keyring _uid.32: 1/4
+ 000004d3 I--Q-- 3 perm 1f3f0000 32 -1 keyring _uid_ses.32: empty
+ 00000892 I--QU- 1 perm 1f000000 0 0 user metal:copper: 0
+ 00000893 I--Q-N 1 35s 1f3f0000 0 0 user metal:silver: 0
+ 00000894 I--Q-- 1 10h 003f0000 0 0 user metal:gold: 0
+
+ The flags are:
+
+ I Instantiated
+ R Revoked
+ D Dead
+ Q Contributes to user's quota
+ U Under construction by callback to userspace
+ N Negative key
+
+ This file must be enabled at kernel configuration time as it allows anyone
+ to list the keys database.
+
+ (*) /proc/key-users
+
+ This file lists the tracking data for each user that has at least one key
+ on the system. Such data includes quota information and statistics:
+
+ [root@andromeda root]# cat /proc/key-users
+ 0: 46 45/45 1/100 13/10000
+ 29: 2 2/2 2/100 40/10000
+ 32: 2 2/2 2/100 40/10000
+ 38: 2 2/2 2/100 40/10000
+
+ The format of each line is
+ <UID>: User ID to which this applies
+ <usage> Structure refcount
+ <inst>/<keys> Total number of keys and number instantiated
+ <keys>/<max> Key count quota
+ <bytes>/<max> Key size quota
+
+
+Four new sysctl files have been added also for the purpose of controlling the
+quota limits on keys:
+
+ (*) /proc/sys/kernel/keys/root_maxkeys
+ /proc/sys/kernel/keys/root_maxbytes
+
+ These files hold the maximum number of keys that root may have and the
+ maximum total number of bytes of data that root may have stored in those
+ keys.
+
+ (*) /proc/sys/kernel/keys/maxkeys
+ /proc/sys/kernel/keys/maxbytes
+
+ These files hold the maximum number of keys that each non-root user may
+ have and the maximum total number of bytes of data that each of those
+ users may have stored in their keys.
+
+Root may alter these by writing each new limit as a decimal number string to
+the appropriate file.
+
+
+===============================
+USERSPACE SYSTEM CALL INTERFACE
+===============================
+
+Userspace can manipulate keys directly through three new syscalls: add_key,
+request_key and keyctl. The latter provides a number of functions for
+manipulating keys.
+
+When referring to a key directly, userspace programs should use the key's
+serial number (a positive 32-bit integer). However, there are some special
+values available for referring to special keys and keyrings that relate to the
+process making the call:
+
+ CONSTANT VALUE KEY REFERENCED
+ ============================== ====== ===========================
+ KEY_SPEC_THREAD_KEYRING -1 thread-specific keyring
+ KEY_SPEC_PROCESS_KEYRING -2 process-specific keyring
+ KEY_SPEC_SESSION_KEYRING -3 session-specific keyring
+ KEY_SPEC_USER_KEYRING -4 UID-specific keyring
+ KEY_SPEC_USER_SESSION_KEYRING -5 UID-session keyring
+ KEY_SPEC_GROUP_KEYRING -6 GID-specific keyring
+ KEY_SPEC_REQKEY_AUTH_KEY -7 assumed request_key()
+ authorisation key
+
+
+The main syscalls are:
+
+ (*) Create a new key of given type, description and payload and add it to the
+ nominated keyring:
+
+ key_serial_t add_key(const char *type, const char *desc,
+ const void *payload, size_t plen,
+ key_serial_t keyring);
+
+ If a key of the same type and description as that proposed already exists
+ in the keyring, this will try to update it with the given payload, or it
+ will return error EEXIST if that function is not supported by the key
+ type. The process must also have permission to write to the key to be able
+ to update it. The new key will have all user permissions granted and no
+ group or third party permissions.
+
+ Otherwise, this will attempt to create a new key of the specified type and
+ description, and to instantiate it with the supplied payload and attach it
+ to the keyring. In this case, an error will be generated if the process
+ does not have permission to write to the keyring.
+
+ The payload is optional, and the pointer can be NULL if not required by
+ the type. The payload is plen in size, and plen can be zero for an empty
+ payload.
+
+ A new keyring can be generated by setting type "keyring", the keyring name
+ as the description (or NULL) and setting the payload to NULL.
+
+ User defined keys can be created by specifying type "user". It is
+ recommended that a user defined key's description by prefixed with a type
+ ID and a colon, such as "krb5tgt:" for a Kerberos 5 ticket granting
+ ticket.
+
+ Any other type must have been registered with the kernel in advance by a
+ kernel service such as a filesystem.
+
+ The ID of the new or updated key is returned if successful.
+
+
+ (*) Search the process's keyrings for a key, potentially calling out to
+ userspace to create it.
+
+ key_serial_t request_key(const char *type, const char *description,
+ const char *callout_info,
+ key_serial_t dest_keyring);
+
+ This function searches all the process's keyrings in the order thread,
+ process, session for a matching key. This works very much like
+ KEYCTL_SEARCH, including the optional attachment of the discovered key to
+ a keyring.
+
+ If a key cannot be found, and if callout_info is not NULL, then
+ /sbin/request-key will be invoked in an attempt to obtain a key. The
+ callout_info string will be passed as an argument to the program.
+
+ See also Documentation/keys-request-key.txt.
+
+
+The keyctl syscall functions are:
+
+ (*) Map a special key ID to a real key ID for this process:
+
+ key_serial_t keyctl(KEYCTL_GET_KEYRING_ID, key_serial_t id,
+ int create);
+
+ The special key specified by "id" is looked up (with the key being created
+ if necessary) and the ID of the key or keyring thus found is returned if
+ it exists.
+
+ If the key does not yet exist, the key will be created if "create" is
+ non-zero; and the error ENOKEY will be returned if "create" is zero.
+
+
+ (*) Replace the session keyring this process subscribes to with a new one:
+
+ key_serial_t keyctl(KEYCTL_JOIN_SESSION_KEYRING, const char *name);
+
+ If name is NULL, an anonymous keyring is created attached to the process
+ as its session keyring, displacing the old session keyring.
+
+ If name is not NULL, if a keyring of that name exists, the process
+ attempts to attach it as the session keyring, returning an error if that
+ is not permitted; otherwise a new keyring of that name is created and
+ attached as the session keyring.
+
+ To attach to a named keyring, the keyring must have search permission for
+ the process's ownership.
+
+ The ID of the new session keyring is returned if successful.
+
+
+ (*) Update the specified key:
+
+ long keyctl(KEYCTL_UPDATE, key_serial_t key, const void *payload,
+ size_t plen);
+
+ This will try to update the specified key with the given payload, or it
+ will return error EOPNOTSUPP if that function is not supported by the key
+ type. The process must also have permission to write to the key to be able
+ to update it.
+
+ The payload is of length plen, and may be absent or empty as for
+ add_key().
+
+
+ (*) Revoke a key:
+
+ long keyctl(KEYCTL_REVOKE, key_serial_t key);
+
+ This makes a key unavailable for further operations. Further attempts to
+ use the key will be met with error EKEYREVOKED, and the key will no longer
+ be findable.
+
+
+ (*) Change the ownership of a key:
+
+ long keyctl(KEYCTL_CHOWN, key_serial_t key, uid_t uid, gid_t gid);
+
+ This function permits a key's owner and group ID to be changed. Either one
+ of uid or gid can be set to -1 to suppress that change.
+
+ Only the superuser can change a key's owner to something other than the
+ key's current owner. Similarly, only the superuser can change a key's
+ group ID to something other than the calling process's group ID or one of
+ its group list members.
+
+
+ (*) Change the permissions mask on a key:
+
+ long keyctl(KEYCTL_SETPERM, key_serial_t key, key_perm_t perm);
+
+ This function permits the owner of a key or the superuser to change the
+ permissions mask on a key.
+
+ Only bits the available bits are permitted; if any other bits are set,
+ error EINVAL will be returned.
+
+
+ (*) Describe a key:
+
+ long keyctl(KEYCTL_DESCRIBE, key_serial_t key, char *buffer,
+ size_t buflen);
+
+ This function returns a summary of the key's attributes (but not its
+ payload data) as a string in the buffer provided.
+
+ Unless there's an error, it always returns the amount of data it could
+ produce, even if that's too big for the buffer, but it won't copy more
+ than requested to userspace. If the buffer pointer is NULL then no copy
+ will take place.
+
+ A process must have view permission on the key for this function to be
+ successful.
+
+ If successful, a string is placed in the buffer in the following format:
+
+ <type>;<uid>;<gid>;<perm>;<description>
+
+ Where type and description are strings, uid and gid are decimal, and perm
+ is hexadecimal. A NUL character is included at the end of the string if
+ the buffer is sufficiently big.
+
+ This can be parsed with
+
+ sscanf(buffer, "%[^;];%d;%d;%o;%s", type, &uid, &gid, &mode, desc);
+
+
+ (*) Clear out a keyring:
+
+ long keyctl(KEYCTL_CLEAR, key_serial_t keyring);
+
+ This function clears the list of keys attached to a keyring. The calling
+ process must have write permission on the keyring, and it must be a
+ keyring (or else error ENOTDIR will result).
+
+
+ (*) Link a key into a keyring:
+
+ long keyctl(KEYCTL_LINK, key_serial_t keyring, key_serial_t key);
+
+ This function creates a link from the keyring to the key. The process must
+ have write permission on the keyring and must have link permission on the
+ key.
+
+ Should the keyring not be a keyring, error ENOTDIR will result; and if the
+ keyring is full, error ENFILE will result.
+
+ The link procedure checks the nesting of the keyrings, returning ELOOP if
+ it appears too deep or EDEADLK if the link would introduce a cycle.
+
+ Any links within the keyring to keys that match the new key in terms of
+ type and description will be discarded from the keyring as the new one is
+ added.
+
+
+ (*) Unlink a key or keyring from another keyring:
+
+ long keyctl(KEYCTL_UNLINK, key_serial_t keyring, key_serial_t key);
+
+ This function looks through the keyring for the first link to the
+ specified key, and removes it if found. Subsequent links to that key are
+ ignored. The process must have write permission on the keyring.
+
+ If the keyring is not a keyring, error ENOTDIR will result; and if the key
+ is not present, error ENOENT will be the result.
+
+
+ (*) Search a keyring tree for a key:
+
+ key_serial_t keyctl(KEYCTL_SEARCH, key_serial_t keyring,
+ const char *type, const char *description,
+ key_serial_t dest_keyring);
+
+ This searches the keyring tree headed by the specified keyring until a key
+ is found that matches the type and description criteria. Each keyring is
+ checked for keys before recursion into its children occurs.
+
+ The process must have search permission on the top level keyring, or else
+ error EACCES will result. Only keyrings that the process has search
+ permission on will be recursed into, and only keys and keyrings for which
+ a process has search permission can be matched. If the specified keyring
+ is not a keyring, ENOTDIR will result.
+
+ If the search succeeds, the function will attempt to link the found key
+ into the destination keyring if one is supplied (non-zero ID). All the
+ constraints applicable to KEYCTL_LINK apply in this case too.
+
+ Error ENOKEY, EKEYREVOKED or EKEYEXPIRED will be returned if the search
+ fails. On success, the resulting key ID will be returned.
+
+
+ (*) Read the payload data from a key:
+
+ long keyctl(KEYCTL_READ, key_serial_t keyring, char *buffer,
+ size_t buflen);
+
+ This function attempts to read the payload data from the specified key
+ into the buffer. The process must have read permission on the key to
+ succeed.
+
+ The returned data will be processed for presentation by the key type. For
+ instance, a keyring will return an array of key_serial_t entries
+ representing the IDs of all the keys to which it is subscribed. The user
+ defined key type will return its data as is. If a key type does not
+ implement this function, error EOPNOTSUPP will result.
+
+ As much of the data as can be fitted into the buffer will be copied to
+ userspace if the buffer pointer is not NULL.
+
+ On a successful return, the function will always return the amount of data
+ available rather than the amount copied.
+
+
+ (*) Instantiate a partially constructed key.
+
+ long keyctl(KEYCTL_INSTANTIATE, key_serial_t key,
+ const void *payload, size_t plen,
+ key_serial_t keyring);
+
+ If the kernel calls back to userspace to complete the instantiation of a
+ key, userspace should use this call to supply data for the key before the
+ invoked process returns, or else the key will be marked negative
+ automatically.
+
+ The process must have write access on the key to be able to instantiate
+ it, and the key must be uninstantiated.
+
+ If a keyring is specified (non-zero), the key will also be linked into
+ that keyring, however all the constraints applying in KEYCTL_LINK apply in
+ this case too.
+
+ The payload and plen arguments describe the payload data as for add_key().
+
+
+ (*) Negatively instantiate a partially constructed key.
+
+ long keyctl(KEYCTL_NEGATE, key_serial_t key,
+ unsigned timeout, key_serial_t keyring);
+
+ If the kernel calls back to userspace to complete the instantiation of a
+ key, userspace should use this call mark the key as negative before the
+ invoked process returns if it is unable to fulfil the request.
+
+ The process must have write access on the key to be able to instantiate
+ it, and the key must be uninstantiated.
+
+ If a keyring is specified (non-zero), the key will also be linked into
+ that keyring, however all the constraints applying in KEYCTL_LINK apply in
+ this case too.
+
+
+ (*) Set the default request-key destination keyring.
+
+ long keyctl(KEYCTL_SET_REQKEY_KEYRING, int reqkey_defl);
+
+ This sets the default keyring to which implicitly requested keys will be
+ attached for this thread. reqkey_defl should be one of these constants:
+
+ CONSTANT VALUE NEW DEFAULT KEYRING
+ ====================================== ====== =======================
+ KEY_REQKEY_DEFL_NO_CHANGE -1 No change
+ KEY_REQKEY_DEFL_DEFAULT 0 Default[1]
+ KEY_REQKEY_DEFL_THREAD_KEYRING 1 Thread keyring
+ KEY_REQKEY_DEFL_PROCESS_KEYRING 2 Process keyring
+ KEY_REQKEY_DEFL_SESSION_KEYRING 3 Session keyring
+ KEY_REQKEY_DEFL_USER_KEYRING 4 User keyring
+ KEY_REQKEY_DEFL_USER_SESSION_KEYRING 5 User session keyring
+ KEY_REQKEY_DEFL_GROUP_KEYRING 6 Group keyring
+
+ The old default will be returned if successful and error EINVAL will be
+ returned if reqkey_defl is not one of the above values.
+
+ The default keyring can be overridden by the keyring indicated to the
+ request_key() system call.
+
+ Note that this setting is inherited across fork/exec.
+
+ [1] The default is: the thread keyring if there is one, otherwise
+ the process keyring if there is one, otherwise the session keyring if
+ there is one, otherwise the user default session keyring.
+
+
+ (*) Set the timeout on a key.
+
+ long keyctl(KEYCTL_SET_TIMEOUT, key_serial_t key, unsigned timeout);
+
+ This sets or clears the timeout on a key. The timeout can be 0 to clear
+ the timeout or a number of seconds to set the expiry time that far into
+ the future.
+
+ The process must have attribute modification access on a key to set its
+ timeout. Timeouts may not be set with this function on negative, revoked
+ or expired keys.
+
+
+ (*) Assume the authority granted to instantiate a key
+
+ long keyctl(KEYCTL_ASSUME_AUTHORITY, key_serial_t key);
+
+ This assumes or divests the authority required to instantiate the
+ specified key. Authority can only be assumed if the thread has the
+ authorisation key associated with the specified key in its keyrings
+ somewhere.
+
+ Once authority is assumed, searches for keys will also search the
+ requester's keyrings using the requester's security label, UID, GID and
+ groups.
+
+ If the requested authority is unavailable, error EPERM will be returned,
+ likewise if the authority has been revoked because the target key is
+ already instantiated.
+
+ If the specified key is 0, then any assumed authority will be divested.
+
+ The assumed authoritative key is inherited across fork and exec.
+
+
+ (*) Get the LSM security context attached to a key.
+
+ long keyctl(KEYCTL_GET_SECURITY, key_serial_t key, char *buffer,
+ size_t buflen)
+
+ This function returns a string that represents the LSM security context
+ attached to a key in the buffer provided.
+
+ Unless there's an error, it always returns the amount of data it could
+ produce, even if that's too big for the buffer, but it won't copy more
+ than requested to userspace. If the buffer pointer is NULL then no copy
+ will take place.
+
+ A NUL character is included at the end of the string if the buffer is
+ sufficiently big. This is included in the returned count. If no LSM is
+ in force then an empty string will be returned.
+
+ A process must have view permission on the key for this function to be
+ successful.
+
+
+===============
+KERNEL SERVICES
+===============
+
+The kernel services for key management are fairly simple to deal with. They can
+be broken down into two areas: keys and key types.
+
+Dealing with keys is fairly straightforward. Firstly, the kernel service
+registers its type, then it searches for a key of that type. It should retain
+the key as long as it has need of it, and then it should release it. For a
+filesystem or device file, a search would probably be performed during the open
+call, and the key released upon close. How to deal with conflicting keys due to
+two different users opening the same file is left to the filesystem author to
+solve.
+
+To access the key manager, the following header must be #included:
+
+ <linux/key.h>
+
+Specific key types should have a header file under include/keys/ that should be
+used to access that type. For keys of type "user", for example, that would be:
+
+ <keys/user-type.h>
+
+Note that there are two different types of pointers to keys that may be
+encountered:
+
+ (*) struct key *
+
+ This simply points to the key structure itself. Key structures will be at
+ least four-byte aligned.
+
+ (*) key_ref_t
+
+ This is equivalent to a struct key *, but the least significant bit is set
+ if the caller "possesses" the key. By "possession" it is meant that the
+ calling processes has a searchable link to the key from one of its
+ keyrings. There are three functions for dealing with these:
+
+ key_ref_t make_key_ref(const struct key *key,
+ unsigned long possession);
+
+ struct key *key_ref_to_ptr(const key_ref_t key_ref);
+
+ unsigned long is_key_possessed(const key_ref_t key_ref);
+
+ The first function constructs a key reference from a key pointer and
+ possession information (which must be 0 or 1 and not any other value).
+
+ The second function retrieves the key pointer from a reference and the
+ third retrieves the possession flag.
+
+When accessing a key's payload contents, certain precautions must be taken to
+prevent access vs modification races. See the section "Notes on accessing
+payload contents" for more information.
+
+(*) To search for a key, call:
+
+ struct key *request_key(const struct key_type *type,
+ const char *description,
+ const char *callout_info);
+
+ This is used to request a key or keyring with a description that matches
+ the description specified according to the key type's match function. This
+ permits approximate matching to occur. If callout_string is not NULL, then
+ /sbin/request-key will be invoked in an attempt to obtain the key from
+ userspace. In that case, callout_string will be passed as an argument to
+ the program.
+
+ Should the function fail error ENOKEY, EKEYEXPIRED or EKEYREVOKED will be
+ returned.
+
+ If successful, the key will have been attached to the default keyring for
+ implicitly obtained request-key keys, as set by KEYCTL_SET_REQKEY_KEYRING.
+
+ See also Documentation/keys-request-key.txt.
+
+
+(*) To search for a key, passing auxiliary data to the upcaller, call:
+
+ struct key *request_key_with_auxdata(const struct key_type *type,
+ const char *description,
+ const void *callout_info,
+ size_t callout_len,
+ void *aux);
+
+ This is identical to request_key(), except that the auxiliary data is
+ passed to the key_type->request_key() op if it exists, and the callout_info
+ is a blob of length callout_len, if given (the length may be 0).
+
+
+(*) A key can be requested asynchronously by calling one of:
+
+ struct key *request_key_async(const struct key_type *type,
+ const char *description,
+ const void *callout_info,
+ size_t callout_len);
+
+ or:
+
+ struct key *request_key_async_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_info,
+ size_t callout_len,
+ void *aux);
+
+ which are asynchronous equivalents of request_key() and
+ request_key_with_auxdata() respectively.
+
+ These two functions return with the key potentially still under
+ construction. To wait for construction completion, the following should be
+ called:
+
+ int wait_for_key_construction(struct key *key, bool intr);
+
+ The function will wait for the key to finish being constructed and then
+ invokes key_validate() to return an appropriate value to indicate the state
+ of the key (0 indicates the key is usable).
+
+ If intr is true, then the wait can be interrupted by a signal, in which
+ case error ERESTARTSYS will be returned.
+
+
+(*) When it is no longer required, the key should be released using:
+
+ void key_put(struct key *key);
+
+ Or:
+
+ void key_ref_put(key_ref_t key_ref);
+
+ These can be called from interrupt context. If CONFIG_KEYS is not set then
+ the argument will not be parsed.
+
+
+(*) Extra references can be made to a key by calling the following function:
+
+ struct key *key_get(struct key *key);
+
+ These need to be disposed of by calling key_put() when they've been
+ finished with. The key pointer passed in will be returned. If the pointer
+ is NULL or CONFIG_KEYS is not set then the key will not be dereferenced and
+ no increment will take place.
+
+
+(*) A key's serial number can be obtained by calling:
+
+ key_serial_t key_serial(struct key *key);
+
+ If key is NULL or if CONFIG_KEYS is not set then 0 will be returned (in the
+ latter case without parsing the argument).
+
+
+(*) If a keyring was found in the search, this can be further searched by:
+
+ key_ref_t keyring_search(key_ref_t keyring_ref,
+ const struct key_type *type,
+ const char *description)
+
+ This searches the keyring tree specified for a matching key. Error ENOKEY
+ is returned upon failure (use IS_ERR/PTR_ERR to determine). If successful,
+ the returned key will need to be released.
+
+ The possession attribute from the keyring reference is used to control
+ access through the permissions mask and is propagated to the returned key
+ reference pointer if successful.
+
+
+(*) To check the validity of a key, this function can be called:
+
+ int validate_key(struct key *key);
+
+ This checks that the key in question hasn't expired or and hasn't been
+ revoked. Should the key be invalid, error EKEYEXPIRED or EKEYREVOKED will
+ be returned. If the key is NULL or if CONFIG_KEYS is not set then 0 will be
+ returned (in the latter case without parsing the argument).
+
+
+(*) To register a key type, the following function should be called:
+
+ int register_key_type(struct key_type *type);
+
+ This will return error EEXIST if a type of the same name is already
+ present.
+
+
+(*) To unregister a key type, call:
+
+ void unregister_key_type(struct key_type *type);
+
+
+Under some circumstances, it may be desirable to deal with a bundle of keys.
+The facility provides access to the keyring type for managing such a bundle:
+
+ struct key_type key_type_keyring;
+
+This can be used with a function such as request_key() to find a specific
+keyring in a process's keyrings. A keyring thus found can then be searched
+with keyring_search(). Note that it is not possible to use request_key() to
+search a specific keyring, so using keyrings in this way is of limited utility.
+
+
+===================================
+NOTES ON ACCESSING PAYLOAD CONTENTS
+===================================
+
+The simplest payload is just a number in key->payload.value. In this case,
+there's no need to indulge in RCU or locking when accessing the payload.
+
+More complex payload contents must be allocated and a pointer to them set in
+key->payload.data. One of the following ways must be selected to access the
+data:
+
+ (1) Unmodifiable key type.
+
+ If the key type does not have a modify method, then the key's payload can
+ be accessed without any form of locking, provided that it's known to be
+ instantiated (uninstantiated keys cannot be "found").
+
+ (2) The key's semaphore.
+
+ The semaphore could be used to govern access to the payload and to control
+ the payload pointer. It must be write-locked for modifications and would
+ have to be read-locked for general access. The disadvantage of doing this
+ is that the accessor may be required to sleep.
+
+ (3) RCU.
+
+ RCU must be used when the semaphore isn't already held; if the semaphore
+ is held then the contents can't change under you unexpectedly as the
+ semaphore must still be used to serialise modifications to the key. The
+ key management code takes care of this for the key type.
+
+ However, this means using:
+
+ rcu_read_lock() ... rcu_dereference() ... rcu_read_unlock()
+
+ to read the pointer, and:
+
+ rcu_dereference() ... rcu_assign_pointer() ... call_rcu()
+
+ to set the pointer and dispose of the old contents after a grace period.
+ Note that only the key type should ever modify a key's payload.
+
+ Furthermore, an RCU controlled payload must hold a struct rcu_head for the
+ use of call_rcu() and, if the payload is of variable size, the length of
+ the payload. key->datalen cannot be relied upon to be consistent with the
+ payload just dereferenced if the key's semaphore is not held.
+
+
+===================
+DEFINING A KEY TYPE
+===================
+
+A kernel service may want to define its own key type. For instance, an AFS
+filesystem might want to define a Kerberos 5 ticket key type. To do this, it
+author fills in a key_type struct and registers it with the system.
+
+Source files that implement key types should include the following header file:
+
+ <linux/key-type.h>
+
+The structure has a number of fields, some of which are mandatory:
+
+ (*) const char *name
+
+ The name of the key type. This is used to translate a key type name
+ supplied by userspace into a pointer to the structure.
+
+
+ (*) size_t def_datalen
+
+ This is optional - it supplies the default payload data length as
+ contributed to the quota. If the key type's payload is always or almost
+ always the same size, then this is a more efficient way to do things.
+
+ The data length (and quota) on a particular key can always be changed
+ during instantiation or update by calling:
+
+ int key_payload_reserve(struct key *key, size_t datalen);
+
+ With the revised data length. Error EDQUOT will be returned if this is not
+ viable.
+
+
+ (*) int (*instantiate)(struct key *key, const void *data, size_t datalen);
+
+ This method is called to attach a payload to a key during construction.
+ The payload attached need not bear any relation to the data passed to this
+ function.
+
+ If the amount of data attached to the key differs from the size in
+ keytype->def_datalen, then key_payload_reserve() should be called.
+
+ This method does not have to lock the key in order to attach a payload.
+ The fact that KEY_FLAG_INSTANTIATED is not set in key->flags prevents
+ anything else from gaining access to the key.
+
+ It is safe to sleep in this method.
+
+
+ (*) int (*update)(struct key *key, const void *data, size_t datalen);
+
+ If this type of key can be updated, then this method should be provided.
+ It is called to update a key's payload from the blob of data provided.
+
+ key_payload_reserve() should be called if the data length might change
+ before any changes are actually made. Note that if this succeeds, the type
+ is committed to changing the key because it's already been altered, so all
+ memory allocation must be done first.
+
+ The key will have its semaphore write-locked before this method is called,
+ but this only deters other writers; any changes to the key's payload must
+ be made under RCU conditions, and call_rcu() must be used to dispose of
+ the old payload.
+
+ key_payload_reserve() should be called before the changes are made, but
+ after all allocations and other potentially failing function calls are
+ made.
+
+ It is safe to sleep in this method.
+
+
+ (*) int (*match)(const struct key *key, const void *desc);
+
+ This method is called to match a key against a description. It should
+ return non-zero if the two match, zero if they don't.
+
+ This method should not need to lock the key in any way. The type and
+ description can be considered invariant, and the payload should not be
+ accessed (the key may not yet be instantiated).
+
+ It is not safe to sleep in this method; the caller may hold spinlocks.
+
+
+ (*) void (*revoke)(struct key *key);
+
+ This method is optional. It is called to discard part of the payload
+ data upon a key being revoked. The caller will have the key semaphore
+ write-locked.
+
+ It is safe to sleep in this method, though care should be taken to avoid
+ a deadlock against the key semaphore.
+
+
+ (*) void (*destroy)(struct key *key);
+
+ This method is optional. It is called to discard the payload data on a key
+ when it is being destroyed.
+
+ This method does not need to lock the key to access the payload; it can
+ consider the key as being inaccessible at this time. Note that the key's
+ type may have been changed before this function is called.
+
+ It is not safe to sleep in this method; the caller may hold spinlocks.
+
+
+ (*) void (*describe)(const struct key *key, struct seq_file *p);
+
+ This method is optional. It is called during /proc/keys reading to
+ summarise a key's description and payload in text form.
+
+ This method will be called with the RCU read lock held. rcu_dereference()
+ should be used to read the payload pointer if the payload is to be
+ accessed. key->datalen cannot be trusted to stay consistent with the
+ contents of the payload.
+
+ The description will not change, though the key's state may.
+
+ It is not safe to sleep in this method; the RCU read lock is held by the
+ caller.
+
+
+ (*) long (*read)(const struct key *key, char __user *buffer, size_t buflen);
+
+ This method is optional. It is called by KEYCTL_READ to translate the
+ key's payload into something a blob of data for userspace to deal with.
+ Ideally, the blob should be in the same format as that passed in to the
+ instantiate and update methods.
+
+ If successful, the blob size that could be produced should be returned
+ rather than the size copied.
+
+ This method will be called with the key's semaphore read-locked. This will
+ prevent the key's payload changing. It is not necessary to use RCU locking
+ when accessing the key's payload. It is safe to sleep in this method, such
+ as might happen when the userspace buffer is accessed.
+
+
+ (*) int (*request_key)(struct key_construction *cons, const char *op,
+ void *aux);
+
+ This method is optional. If provided, request_key() and friends will
+ invoke this function rather than upcalling to /sbin/request-key to operate
+ upon a key of this type.
+
+ The aux parameter is as passed to request_key_async_with_auxdata() and
+ similar or is NULL otherwise. Also passed are the construction record for
+ the key to be operated upon and the operation type (currently only
+ "create").
+
+ This method is permitted to return before the upcall is complete, but the
+ following function must be called under all circumstances to complete the
+ instantiation process, whether or not it succeeds, whether or not there's
+ an error:
+
+ void complete_request_key(struct key_construction *cons, int error);
+
+ The error parameter should be 0 on success, -ve on error. The
+ construction record is destroyed by this action and the authorisation key
+ will be revoked. If an error is indicated, the key under construction
+ will be negatively instantiated if it wasn't already instantiated.
+
+ If this method returns an error, that error will be returned to the
+ caller of request_key*(). complete_request_key() must be called prior to
+ returning.
+
+ The key under construction and the authorisation key can be found in the
+ key_construction struct pointed to by cons:
+
+ (*) struct key *key;
+
+ The key under construction.
+
+ (*) struct key *authkey;
+
+ The authorisation key.
+
+
+============================
+REQUEST-KEY CALLBACK SERVICE
+============================
+
+To create a new key, the kernel will attempt to execute the following command
+line:
+
+ /sbin/request-key create <key> <uid> <gid> \
+ <threadring> <processring> <sessionring> <callout_info>
+
+<key> is the key being constructed, and the three keyrings are the process
+keyrings from the process that caused the search to be issued. These are
+included for two reasons:
+
+ (1) There may be an authentication token in one of the keyrings that is
+ required to obtain the key, eg: a Kerberos Ticket-Granting Ticket.
+
+ (2) The new key should probably be cached in one of these rings.
+
+This program should set it UID and GID to those specified before attempting to
+access any more keys. It may then look around for a user specific process to
+hand the request off to (perhaps a path held in placed in another key by, for
+example, the KDE desktop manager).
+
+The program (or whatever it calls) should finish construction of the key by
+calling KEYCTL_INSTANTIATE, which also permits it to cache the key in one of
+the keyrings (probably the session ring) before returning. Alternatively, the
+key can be marked as negative with KEYCTL_NEGATE; this also permits the key to
+be cached in one of the keyrings.
+
+If it returns with the key remaining in the unconstructed state, the key will
+be marked as being negative, it will be added to the session keyring, and an
+error will be returned to the key requestor.
+
+Supplementary information may be provided from whoever or whatever invoked this
+service. This will be passed as the <callout_info> parameter. If no such
+information was made available, then "-" will be passed as this parameter
+instead.
+
+
+Similarly, the kernel may attempt to update an expired or a soon to expire key
+by executing:
+
+ /sbin/request-key update <key> <uid> <gid> \
+ <threadring> <processring> <sessionring>
+
+In this case, the program isn't required to actually attach the key to a ring;
+the rings are provided for reference.
diff --git a/Documentation/ko_KR/HOWTO b/Documentation/ko_KR/HOWTO
new file mode 100644
index 0000000..029fca9
--- /dev/null
+++ b/Documentation/ko_KR/HOWTO
@@ -0,0 +1,625 @@
+NOTE:
+This is a version of Documentation/HOWTO translated into korean
+This document is maintained by minchan Kim <minchan.kim@gmail.com>
+If you find any difference between this document and the original file or
+a problem with the translation, please contact the maintainer of this file.
+
+Please also note that the purpose of this file is to be easier to
+read for non English (read: korean) speakers and is not intended as
+a fork. So if you have any comments or updates for this file please
+try to update the original English file first.
+
+==================================
+이 문서는
+Documentation/HOWTO
+의 한글 번역입니다.
+
+역자: 김민찬 <minchan.kim@gmail.com>
+감수: 이제이미 <jamee.lee@samsung.com>
+==================================
+
+어떻게 리눅스 커널 개발을 하는가
+---------------------------------
+
+이 문서는 커널 개발에 있어 가장 중요한 문서이다. 이 문서는
+리눅스 커널 개발자가 되는 법과 리눅스 커널 개발 커뮤니티와 일하는
+법을 담고있다. 커널 프로그래밍의 기술적인 측면과 관련된 내용들은
+포함하지 않으려고 하였지만 올바른 길로 여러분을 안내하는 데는 도움이
+될 것이다.
+
+이 문서에서 오래된 것을 발견하면 문서의 아래쪽에 나열된 메인테이너에게
+패치를 보내달라.
+
+
+소개
+----
+
+자, 여러분은 리눅스 커널 개발자가 되는 법을 배우고 싶은가? 아니면
+상사로부터"이 장치를 위한 리눅스 드라이버를 작성하시오"라는 말을
+들었는가? 이 문서의 목적은 여러분이 겪게 될 과정과 커뮤니티와 협력하는
+법을 조언하여 여러분의 목적을 달성하기 위해 필요한 것 모두를 알려주기
+위함이다.
+
+커널은 대부분은 C로 작성되어 있고 몇몇 아키텍쳐의 의존적인 부분은
+어셈블리로 작성되어 있다. 커널 개발을 위해 C를 잘 이해하고 있어야 한다.
+여러분이 특정 아키텍쳐의 low-level 개발을 할 것이 아니라면
+어셈블리(특정 아키텍쳐)는 잘 알아야 할 필요는 없다.
+다음의 참고서적들은 기본에 충실한 C 교육이나 수년간의 경험에 견주지는
+못하지만 적어도 참고 용도로는 좋을 것이다
+ - "The C Programming Language" by Kernighan and Ritchie [Prentice Hall]
+ - "Practical C Programming" by Steve Oualline [O'Reilly]
+ - "C: A Reference Manual" by Harbison and Steele [Prentice Hall]
+
+커널은 GNU C와 GNU 툴체인을 사용하여 작성되었다. 이 툴들은 ISO C89 표준을
+따르는 반면 표준에 있지 않은 많은 확장기능도 가지고 있다. 커널은 표준 C
+라이브러리와는 관계없이 freestanding C 환경이어서 C 표준의 일부는
+지원되지 않는다. 임의의 long long 나누기나 floating point는 지원되지 않는다.
+때론 이런 이유로 커널이 그런 확장 기능을 가진 툴체인을 가지고 만들어졌다는
+것이 이해하기 어려울 수도 있고 게다가 불행하게도 그런 것을 정확하게 설명하는
+어떤 참고문서도 있지 않다. 정보를 얻기 위해서는 gcc info (`info gcc`)페이지를
+살펴보라.
+
+여러분은 기존의 개발 커뮤니티와 협력하는 법을 배우려고 하고 있다는 것을
+기억하라. 코딩, 스타일, 함수에 관한 훌륭한 표준을 가진 사람들이 모인
+다양한 그룹이 있다. 이 표준들은 오랜동안 크고 지역적으로 분산된 팀들에
+의해 가장 좋은 방법으로 일하기 위하여 찾은 것을 기초로 만들어져 왔다.
+그 표준들은 문서화가 잘 되어있기 때문에 가능한한 미리 많은 표준들에
+관하여 배우려고 시도하라. 다른 사람들은 여러분이나 여러분의 회사가
+일하는 방식에 적응하는 것을 원하지는 않는다.
+
+
+법적 문제
+---------
+
+리눅스 커널 소스 코드는 GPL로 배포(release)되었다. 소스트리의 메인
+디렉토리에 있는 라이센스에 관하여 상세하게 쓰여 있는 COPYING이라는
+파일을 봐라. 여러분이 라이센스에 관한 더 깊은 문제를 가지고 있다면
+리눅스 커널 메일링 리스트에 묻지말고 변호사와 연락하라. 메일링
+리스트들에 있는 사람들은 변호사가 아니기 때문에 법적 문제에 관하여
+그들의 말에 의지해서는 안된다.
+
+GPL에 관한 잦은 질문들과 답변들은 다음을 참조하라.
+ http://www.gnu.org/licenses/gpl-faq.html
+
+
+문서
+----
+
+리눅스 커널 소스 트리는 커널 커뮤니티와 협력하는 법을 배우기위해 훌륭한
+다양한 문서들을 가지고 있다. 새로운 기능들이 커널에 들어가게 될 때,
+그 기능을 어떻게 사용하는지에 관한 설명을 위하여 새로운 문서 파일을
+추가하는 것을 권장한다. 커널이 유저스페이스로 노출하는 인터페이스를
+변경하게 되면 변경을 설명하는 메뉴얼 페이지들에 대한 패치나 정보를
+mtk.manpages@gmail.com의 메인테이너에게 보낼 것을 권장한다.
+
+다음은 커널 소스 트리에 있는 읽어야 할 파일들의 리스트이다.
+ README
+ 이 파일은 리눅스 커널에 관하여 간단한 배경 설명과 커널을 설정하고
+ 빌드하기 위해 필요한 것을 설명한다. 커널에 입문하는 사람들은 여기서
+ 시작해야 한다.
+
+ Documentation/Changes
+ 이 파일은 커널을 성공적으로 빌드하고 실행시키기 위해 필요한 다양한
+ 소프트웨어 패키지들의 최소 버젼을 나열한다.
+
+ Documentation/CodingStyle
+ 이 문서는 리눅스 커널 코딩 스타일과 그렇게 한 몇몇 이유를 설명한다.
+ 모든 새로운 코드는 이 문서에 가이드라인들을 따라야 한다. 대부분의
+ 메인테이너들은 이 규칙을 따르는 패치들만을 받아들일 것이고 많은 사람들이
+ 그 패치가 올바른 스타일일 경우만 코드를 검토할 것이다.
+
+ Documentation/SubmittingPatches
+ Documentation/SubmittingDrivers
+ 이 파일들은 성공적으로 패치를 만들고 보내는 법을 다음의 내용들로
+ 굉장히 상세히 설명하고 있다(그러나 다음으로 한정되진 않는다).
+ - Email 내용들
+ - Email 양식
+ - 그것을 누구에게 보낼지
+ 이러한 규칙들을 따르는 것이 성공(역자주: 패치가 받아들여 지는 것)을
+ 보장하진 않는다(왜냐하면 모든 패치들은 내용과 스타일에 관하여
+ 면밀히 검토되기 때문이다). 그러나 규칙을 따르지 않는다면 거의
+ 성공하지도 못할 것이다.
+
+ 올바른 패치들을 만드는 법에 관한 훌륭한 다른 문서들이 있다.
+ "The Perfect Patch"
+ http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt
+ "Linux kernel patch submission format"
+ http://linux.yyz.us/patch-format.html
+
+ Documentation/stable_api_nonsense.txt
+ 이 문서는 의도적으로 커널이 불변하는 API를 갖지 않도록 결정한
+ 이유를 설명하며 다음과 같은 것들을 포함한다.
+ - 서브시스템 shim-layer(호환성을 위해?)
+ - 운영체제들간의 드라이버 이식성
+ - 커널 소스 트리내에 빠른 변화를 늦추는 것(또는 빠른 변화를 막는 것)
+ 이 문서는 리눅스 개발 철학을 이해하는데 필수적이며 다른 운영체제에서
+ 리눅스로 전향하는 사람들에게는 매우 중요하다.
+
+
+ Documentation/SecurityBugs
+ 여러분들이 리눅스 커널의 보안 문제를 발견했다고 생각한다면 이 문서에
+ 나온 단계에 따라서 커널 개발자들에게 알리고 그 문제를 해결할 수 있도록
+ 도와 달라.
+
+ Documentation/ManagementStyle
+ 이 문서는 리눅스 커널 메인테이너들이 그들의 방법론에 녹아 있는
+ 정신을 어떻게 공유하고 운영하는지를 설명한다. 이것은 커널 개발에 입문하는
+ 모든 사람들(또는 커널 개발에 작은 호기심이라도 있는 사람들)이
+ 읽어야 할 중요한 문서이다. 왜냐하면 이 문서는 커널 메인테이너들의
+ 독특한 행동에 관하여 흔히 있는 오해들과 혼란들을 해소하고 있기
+ 때문이다.
+
+ Documentation/stable_kernel_rules.txt
+ 이 문서는 안정적인 커널 배포가 이루어지는 규칙을 설명하고 있으며
+ 여러분들이 이러한 배포들 중 하나에 변경을 하길 원한다면
+ 무엇을 해야 하는지를 설명한다.
+
+ Documentation/kernel-docs.txt
+ 커널 개발에 관계된 외부 문서의 리스트이다. 커널 내의 포함된 문서들
+ 중에 여러분이 찾고 싶은 문서를 발견하지 못할 경우 이 리스트를
+ 살펴보라.
+
+ Documentation/applying-patches.txt
+ 패치가 무엇이며 그것을 커널의 다른 개발 브랜치들에 어떻게
+ 적용하는지에 관하여 자세히 설명하고 있는 좋은 입문서이다.
+
+커널은 소스 코드 그 자체에서 자동적으로 만들어질 수 있는 많은 문서들을
+가지고 있다. 이것은 커널 내의 API에 대한 모든 설명, 그리고 락킹을
+올바르게 처리하는 법에 관한 규칙을 포함하고 있다. 이 문서는
+Documentation/DocBook/ 디렉토리 내에서 만들어지며 PDF, Postscript, HTML,
+그리고 man 페이지들로 다음과 같이 실행하여 만들어 진다.
+ make pdfdocs
+ make psdocs
+ make htmldocs
+ make mandocs
+각각의 명령을 메인 커널 소스 디렉토리로부터 실행한다.
+
+
+커널 개발자가 되는 것
+---------------------
+
+여러분이 리눅스 커널 개발에 관하여 아무것도 모른다면 Linux KernelNewbies
+프로젝트를 봐야 한다.
+ http://kernelnewbies.org
+그곳은 거의 모든 종류의 기본적인 커널 개발 질문들(질문하기 전에 먼저
+아카이브를 찾아봐라. 과거에 이미 답변되었을 수도 있다)을 할수있는 도움이
+될만한 메일링 리스트가 있다. 또한 실시간으로 질문 할수 있는 IRC 채널도
+가지고 있으며 리눅스 커널 개발을 배우는 데 유용한 문서들을 보유하고 있다.
+
+웹사이트는 코드구성, 서브시스템들, 그리고 현재 프로젝트들
+(트리 내, 외부에 존재하는)에 관한 기본적인 정보들을 가지고 있다. 또한
+그곳은 커널 컴파일이나 패치를 하는 법과 같은 기본적인 것들을 설명한다.
+
+여러분이 어디서 시작해야 할진 모르지만 커널 개발 커뮤니티에 참여할 수
+있는 일들을 찾길 원한다면 리눅스 커널 Janitor 프로젝트를 살펴봐라.
+ http://janitor.kernelnewbies.org/
+그곳은 시작하기에 훌륭한 장소이다. 그곳은 리눅스 커널 소스 트리내에
+간단히 정리되고 수정될 수 있는 문제들에 관하여 설명한다. 여러분은 이
+프로젝트를 대표하는 개발자들과 일하면서 자신의 패치를 리눅스 커널 트리에
+반영하기 위한 기본적인 것들을 배우게 될것이며 여러분이 아직 아이디어를
+가지고 있지 않다면 다음에 무엇을 해야할지에 관한 방향을 배울 수 있을
+것이다.
+
+여러분들이 이미 커널 트리에 반영하길 원하는 코드 묶음을 가지고 있지만
+올바른 포맷으로 포장하는데 도움이 필요하다면 그러한 문제를 돕기 위해
+만들어진 kernel-mentors 프로젝트가 있다. 그곳은 메일링 리스트이며
+다음에서 참조할 수 있다.
+ http://selenic.com/mailman/listinfo/kernel-mentors
+
+리눅스 커널 코드에 실제 변경을 하기 전에 반드시 그 코드가 어떻게
+동작하는지 이해하고 있어야 한다. 코드를 분석하기 위하여 특정한 툴의
+도움을 빌려서라도 코드를 직접 읽는 것보다 좋은 것은 없다(대부분의
+자잘한 부분들은 잘 코멘트되어 있다). 그런 툴들 중에 특히 추천할만한
+것은 Linux Cross-Reference project이며 그것은 자기 참조 방식이며
+소스코드를 인덱스된 웹 페이지들의 형태로 보여준다. 최신의 멋진 커널
+코드 저장소는 다음을 통하여 참조할 수 있다.
+ http://users.sosdg.org/~qiyong/lxr/
+
+
+개발 프로세스
+-------------
+
+리눅스 커널 개발 프로세스는 현재 몇몇 다른 메인 커널 "브랜치들"과
+서브시스템에 특화된 커널 브랜치들로 구성된다. 몇몇 다른 메인
+브랜치들은 다음과 같다.
+ - main 2.6.x 커널 트리
+ - 2.6.x.y - 안정된 커널 트리
+ - 2.6.x -git 커널 패치들
+ - 2.6.x -mm 커널 패치들
+ - 서브시스템을 위한 커널 트리들과 패치들
+
+2.6.x 커널 트리
+---------------
+
+2.6.x 커널들은 Linux Torvalds가 관리하며 kernel.org의 pub/linux/kernel/v2.6/
+디렉토리에서 참조될 수 있다.개발 프로세스는 다음과 같다.
+ - 새로운 커널이 배포되자마자 2주의 시간이 주어진다. 이 기간동은
+ 메인테이너들은 큰 diff들을 Linus에게 제출할 수 있다. 대개 이 패치들은
+ 몇 주 동안 -mm 커널내에 이미 있었던 것들이다. 큰 변경들을 제출하는 데
+ 선호되는 방법은 git(커널의 소스 관리 툴, 더 많은 정보들은 http://git.or.cz/
+ 에서 참조할 수 있다)를 사용하는 것이지만 순수한 패치파일의 형식으로 보내는
+ 것도 무관하다.
+ - 2주 후에 -rc1 커널이 배포되며 지금부터는 전체 커널의 안정성에 영향을
+ 미칠수 있는 새로운 기능들을 포함하지 않는 패치들만이 추가될 수 있다.
+ 완전히 새로운 드라이버(혹은 파일시스템)는 -rc1 이후에만 받아들여진다는
+ 것을 기억해라. 왜냐하면 변경이 자체내에서만 발생하고 추가된 코드가
+ 드라이버 외부의 다른 부분에는 영향을 주지 않으므로 그런 변경은
+ 회귀(역자주: 이전에는 존재하지 않았지만 새로운 기능추가나 변경으로 인해
+ 생겨난 버그)를 일으킬 만한 위험을 가지고 있지 않기 때문이다. -rc1이
+ 배포된 이후에 git를 사용하여 패치들을 Linus에게 보낼수 있지만 패치들은
+ 공식적인 메일링 리스트로 보내서 검토를 받을 필요가 있다.
+ - 새로운 -rc는 Linus가 현재 git tree가 테스트 하기에 충분히 안정된 상태에
+ 있다고 판단될 때마다 배포된다. 목표는 새로운 -rc 커널을 매주 배포하는
+ 것이다.
+ - 이러한 프로세스는 커널이 "준비(ready)"되었다고 여겨질때까지 계속된다.
+ 프로세스는 대체로 6주간 지속된다.
+ - 각 -rc 배포에 있는 알려진 회귀의 목록들은 다음 URI에 남겨진다.
+ http://kernelnewbies.org/known_regressions
+
+커널 배포에 있어서 언급할만한 가치가 있는 리눅스 커널 메일링 리스트의
+Andrew Morton의 글이 있다.
+ "커널이 언제 배포될지는 아무도 모른다. 왜냐하면 배포는 알려진
+ 버그의 상황에 따라 배포되는 것이지 미리정해 놓은 시간에 따라
+ 배포되는 것은 아니기 때문이다."
+
+2.6.x.y - 안정 커널 트리
+------------------------
+
+4 자리 숫자로 이루어진 버젼의 커널들은 -stable 커널들이다. 그것들은 2.6.x
+커널에서 발견된 큰 회귀들이나 보안 문제들 중 비교적 작고 중요한 수정들을
+포함한다.
+
+이것은 가장 최근의 안정적인 커널을 원하는 사용자에게 추천되는 브랜치이며,
+개발/실험적 버젼을 테스트하는 것을 돕고자 하는 사용자들과는 별로 관련이 없다.
+
+어떤 2.6.x.y 커널도 사용할 수 없다면 그때는 가장 높은 숫자의 2.6.x
+커널이 현재의 안정 커널이다.
+
+2.6.x.y는 "stable" 팀<stable@kernel.org>에 의해 관리되며 거의 매번 격주로
+배포된다.
+
+커널 트리 문서들 내에 Documentation/stable_kernel_rules.txt 파일은 어떤
+종류의 변경들이 -stable 트리로 들어왔는지와 배포 프로세스가 어떻게
+진행되는지를 설명한다.
+
+
+2.6.x -git 패치들
+------------------
+git 저장소(그러므로 -git이라는 이름이 붙음)에는 날마다 관리되는 Linus의
+커널 트리의 snapshot 들이 있다. 이 패치들은 일반적으로 날마다 배포되며
+Linus의 트리의 현재 상태를 나타낸다. 이 패치들은 정상적인지 조금도
+살펴보지 않고 자동적으로 생성된 것이므로 -rc 커널들 보다도 더 실험적이다.
+
+2.6.x -mm 커널 패치들
+---------------------
+Andrew Morton에 의해 배포된 실험적인 커널 패치들이다. Andrew는 모든 다른
+서브시스템 커널 트리와 패치들을 가져와서 리눅스 커널 메일링 리스트로
+온 많은 패치들과 한데 묶는다. 이 트리는 새로운 기능들과 패치들을 위한
+장소를 제공하는 역할을 한다. 하나의 패치가 -mm에 한동안 있으면서 그 가치가
+증명되게 되면 Andrew나 서브시스템 메인테이너는 그것을 메인라인에 포함시키기
+위하여 Linus에게 보낸다.
+
+커널 트리에 포함하고 싶은 모든 새로운 패치들은 Linus에게 보내지기 전에
+-mm 트리에서 테스트를 하는 것을 적극 추천한다.
+
+이 커널들은 안정되게 사용할 시스템에서에 실행하는 것은 적합하지 않으며
+다른 브랜치들의 어떤 것들보다 위험하다.
+
+여러분이 커널 개발 프로세스를 돕길 원한다면 이 커널 배포들을 사용하고
+테스트한 후 어떤 문제를 발견하거나 또는 모든 것이 잘 동작한다면 리눅스
+커널 메일링 리스트로 피드백을 해달라.
+
+이 커널들은 일반적으로 모든 다른 실험적인 패치들과 배포될 당시의
+사용가능한 메인라인 -git 커널들의 몇몇 변경을 포함한다.
+
+-mm 커널들은 정해진 일정대로 배포되지 않는다. 하지만 대개 몇몇 -mm 커널들은
+각 -rc 커널(1부터 3이 흔함) 사이에서 배포된다.
+
+서브시스템 커널 트리들과 패치들
+-------------------------------
+많은 다른 커널 서브시스템 개발자들은 커널의 다른 부분들에서 무슨 일이
+일어나고 있는지를 볼수 있도록 그들의 개발 트리를 공개한다. 이 트리들은
+위에서 설명하였던 것 처럼 -mm 커널 배포들로 합쳐진다.
+
+다음은 활용가능한 커널 트리들을 나열한다.
+ git trees:
+ - Kbuild development tree, Sam Ravnborg < sam@ravnborg.org>
+ git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
+
+ - ACPI development tree, Len Brown <len.brown@intel.com >
+ git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
+
+ - Block development tree, Jens Axboe <jens.axboe@oracle.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
+
+ - DRM development tree, Dave Airlie <airlied@linux.ie>
+ git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
+
+ - ia64 development tree, Tony Luck < tony.luck@intel.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
+
+ - infiniband, Roland Dreier <rolandd@cisco.com >
+ git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
+
+ - libata, Jeff Garzik <jgarzik@pobox.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
+
+ - network drivers, Jeff Garzik <jgarzik@pobox.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
+
+ - pcmcia, Dominik Brodowski < linux@dominikbrodowski.net>
+ git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
+
+ - SCSI, James Bottomley < James.Bottomley@SteelEye.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
+
+ quilt trees:
+ - USB, PCI, Driver Core, and I2C, Greg Kroah-Hartman < gregkh@suse.de>
+ kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+ - x86-64, partly i386, Andi Kleen < ak@suse.de>
+ ftp.firstfloor.org:/pub/ak/x86_64/quilt/
+
+ 다른 커널 트리들은 http://kernel.org/git와 MAINTAINERS 파일에서 참조할 수
+ 있다.
+
+버그 보고
+---------
+bugzilla.kernel.org는 리눅스 커널 개발자들이 커널의 버그를 추적하는 곳이다.
+사용자들은 발견한 모든 버그들을 보고하기 위하여 이 툴을 사용할 것을 권장한다.
+kernel bugzilla를 사용하는 자세한 방법은 다음을 참조하라.
+ http://test.kernel.org/bugzilla/faq.html
+
+메인 커널 소스 디렉토리에 있는 REPORTING-BUGS 파일은 커널 버그라고 생각되는
+것을 보고하는 방법에 관한 좋은 템플릿이며 문제를 추적하기 위해서 커널
+개발자들이 필요로 하는 정보가 무엇들인지를 상세히 설명하고 있다.
+
+
+버그 리포트들의 관리
+--------------------
+
+여러분의 해킹 기술을 연습하는 가장 좋은 방법 중의 하는 다른 사람들이
+보고한 버그들을 수정하는 것이다. 여러분은 커널을 더욱 안정화시키는데
+도움을 줄 뿐만이 아니라 실제있는 문제들을 수정하는 법을 배우게 되고
+그와 함께 여러분들의 기술은 향상될 것이며 다른 개발자들이 여러분의
+존재에 대해 알게 될 것이다. 버그를 수정하는 것은 개발자들 사이에서
+점수를 얻을 수 있는 가장 좋은 방법중의 하나이다. 왜냐하면 많은 사람들은
+다른 사람들의 버그들을 수정하기 위하여 시간을 낭비하지 않기 때문이다.
+
+이미 보고된 버그 리포트들을 가지고 작업하기 위해서 http://bugzilla.kernel.org를
+참조하라. 여러분이 앞으로 생겨날 버그 리포트들의 조언자가 되길 원한다면
+bugme-new 메일링 리스트나(새로운 버그 리포트들만이 이곳에서 메일로 전해진다)
+bugme-janitor 메일링 리스트(bugzilla에 모든 변화들이 여기서 메일로 전해진다)
+에 등록하면 된다.
+
+ http://lists.osdl.org/mailman/listinfo/bugme-new
+ http://lists.osdl.org/mailman/listinfo/bugme-janitors
+
+
+
+메일링 리스트들
+---------------
+
+위의 몇몇 문서들이 설명하였지만 핵심 커널 개발자들의 대다수는
+리눅스 커널 메일링 리스트에 참여하고 있다. 리스트에 등록하고 해지하는
+방법에 관한 자세한 사항은 다음에서 참조할 수 있다.
+ http://vger.kernel.org/vger-lists.html#linux-kernel
+웹상의 많은 다른 곳에도 메일링 리스트의 아카이브들이 있다.
+이러한 아카이브들을 찾으려면 검색 엔진을 사용하라. 예를 들어:
+ http://dir.gmane.org/gmane.linux.kernel
+여러분이 새로운 문제에 관해 리스트에 올리기 전에 말하고 싶은 주제에 관한
+것을 아카이브에서 먼저 찾아보기를 강력히 권장한다. 이미 상세하게 토론된 많은
+것들이 메일링 리스트의 아카이브에 기록되어 있다.
+
+각각의 커널 서브시스템들의 대부분은 자신들의 개발에 관한 노력들로 이루어진
+분리된 메일링 리스트를 따로 가지고 있다. 다른 그룹들이 무슨 리스트를 가지고
+있는지는 MAINTAINERS 파일을 참조하라.
+
+많은 리스트들은 kernel.org에서 호스트되고 있다. 그 정보들은 다음에서 참조될 수 있다.
+ http://vger.kernel.org/vger-lists.html
+
+리스트들을 사용할 때는 올바른 예절을 따를 것을 유념해라.
+대단하진 않지만 다음 URL은 리스트(혹은 모든 리스트)와 대화하는 몇몇 간단한
+가이드라인을 가지고 있다.
+ http://www.albion.com/netiquette/
+
+여러 사람들이 여러분의 메일에 응답한다면 CC: 즉 수신 리스트는 꽤 커지게
+될 것이다. 아무 이유없이 CC에서 어떤 사람도 제거하거나 리스트 주소로만
+회신하지 마라. 메일을 보낸 사람으로서 하나를 받고 리스트로부터 또
+하나를 받아 두번 받는 것에 익숙하여 있으니 mail-header를 조작하려고 하지
+말아라. 사람들은 그런 것을 좋아하지 않을 것이다.
+
+여러분의 회신의 문맥을 원래대로 유지해야 한다. 여러분들의 회신의 윗부분에
+"John 커널해커는 작성했다...."를 유지하며 여러분들의 의견을 그 메일의 윗부분에
+작성하지 말고 각 인용한 단락들 사이에 넣어라.
+
+여러분들이 패치들을 메일에 넣는다면 그것들은 Documentation/SubmittingPatches에
+나와있는데로 명백히(plain) 읽을 수 있는 텍스트여야 한다. 커널 개발자들은
+첨부파일이나 압축된 패치들을 원하지 않는다. 그들은 여러분들의 패치의
+각 라인 단위로 코멘트를 하길 원하며 압축하거나 첨부하지 않고 보내는 것이
+그렇게 할 수 있는 유일한 방법이다. 여러분들이 사용하는 메일 프로그램이
+스페이스나 탭 문자들을 조작하지 않는지 확인하라. 가장 좋은 첫 테스트는
+메일을 자신에게 보내보고 스스로 그 패치를 적용해보라. 그것이 동작하지
+않는다면 여러분의 메일 프로그램을 고치던가 제대로 동작하는 프로그램으로
+바꾸어라.
+
+무엇보다도 메일링 리스트의 다른 구독자들에게 보여주려 한다는 것을 기억하라.
+
+
+커뮤니티와 협력하는 법
+--------------------
+
+커널 커뮤니티의 목적은 가능한한 가장 좋은 커널을 제공하는 것이다. 여러분이
+받아들여질 패치를 제출하게 되면 그 패치의 기술적인 이점으로 검토될 것이다.
+그럼 여러분들은 무엇을 기대하고 있어야 하는가?
+ - 비판
+ - 의견
+ - 변경을 위한 요구
+ - 당위성을 위한 요구
+ - 고요
+
+기억하라. 이것들은 여러분의 패치가 커널로 들어가기 위한 과정이다. 여러분의
+패치들은 비판과 다른 의견을 받을 수 있고 그것들을 기술적인 레벨로 평가하고
+재작업하거나 또는 왜 수정하면 안되는지에 관하여 명료하고 간결한 이유를
+말할 수 있어야 한다. 여러분이 제출한 것에 어떤 응답도 있지 않다면 몇 일을
+기다려보고 다시 시도해라. 때론 너무 많은 메일들 속에 묻혀버리기도 한다.
+
+여러분은 무엇을 해서는 안되는가?
+ - 여러분의 패치가 아무 질문 없이 받아들여지기를 기대하는 것
+ - 방어적이 되는 것
+ - 의견을 무시하는 것
+ - 요청된 변경을 하지 않고 패치를 다시 제출하는 것
+
+가능한한 가장 좋은 기술적인 해답을 찾고 있는 커뮤니티에서는 항상
+어떤 패치가 얼마나 좋은지에 관하여 다른 의견들이 있을 수 있다. 여러분은
+협조적이어야 하고 기꺼이 여러분의 생각을 커널 내에 맞추어야 한다. 아니면
+적어도 여러분의 것이 가치있다는 것을 중명하여야 한다. 잘못된 것도 여러분이
+올바른 방향의 해결책으로 이끌어갈 의지가 있다면 받아들여질 것이라는 점을
+기억하라.
+
+여러분의 첫 패치에 여러분이 수정해야하는 십여개 정도의 회신이 오는
+경우도 흔하다. 이것은 여러분의 패치가 받아들여지지 않을 것이라는 것을
+의미하는 것이 아니고 개인적으로 여러분에게 감정이 있어서 그러는 것도
+아니다. 간단히 여러분의 패치에 제기된 문제들을 수정하고 그것을 다시
+보내라.
+
+
+커널 커뮤니티와 기업 조직간의 차이점
+-----------------------------------------------------------------
+커널 커뮤니티는 가장 전통적인 회사의 개발 환경과는 다르다. 여기에 여러분들의
+문제를 피하기 위한 목록이 있다.
+ 여러분들이 제안한 변경들에 관하여 말할 때 좋은 것들 :
+ - "이것은 여러 문제들을 해겹합니다."
+ - "이것은 2000 라인의 코드를 제거합니다."
+ - "이것은 내가 말하려는 것에 관해 설명하는 패치입니다."
+ - "나는 5개의 다른 아키텍쳐에서 그것을 테스트했슴으로..."
+ - "여기에 일련의 작은 패치들이 있슴음로..."
+ - "이것은 일반적인 머신에서 성능을 향상시킴으로..."
+
+ 여러분들이 말할 때 피해야 할 좋지 않은 것들 :
+ - "우리를 그것을 AIT/ptx/Solaris에서 이러한 방법으로 했다. 그러므로 그것은 좋은 것임에 틀립없다..."
+ - "나는 20년동안 이것을 해왔다. 그러므로..."
+ - "이것은 돈을 벌기위해 나의 회사가 필요로 하는 것이다."
+ - "이것은 우리의 엔터프라이즈 상품 라인을 위한 것이다."
+ - "여기에 나의 생각을 말하고 있는 1000 페이지 설계 문서가 있다."
+ - "나는 6달동안 이것을 했으니..."
+ - "여기에 5000라인 짜리 패치가 있으니..."
+ - "나는 현재 뒤죽박죽인 것을 재작성했다. 그리고 여기에..."
+ - "나는 마감시한을 가지고 있으므로 이 패치는 지금 적용될 필요가 있다."
+
+커널 커뮤니티가 전통적인 소프트웨어 엔지니어링 개발 환경들과
+또 다른 점은 얼굴을 보지 않고 일한다는 점이다. 이메일과 irc를 대화의
+주요수단으로 사용하는 것의 한가지 장점은 성별이나 인종의 차별이
+없다는 것이다. 리눅스 커널의 작업 환경에서는 단지 이메일 주소만
+알수 있기 때문에 여성과 소수 민족들도 모두 받아들여진다. 국제적으로
+일하게 되는 측면은 사람의 이름에 근거하여 성별을 추측할 수 없게
+하기때문에 차별을 없애는 데 도움을 준다. Andrea라는 이름을 가진 남자와
+Pat이라는 이름을 가진 여자가 있을 수도 있는 것이다. 리눅스 커널에서
+작업하며 생각을 표현해왔던 대부분의 여성들은 긍정적인 경험을 가지고
+있다.
+
+언어 장벽은 영어에 익숙하지 않은 몇몇 사람들에게 문제가 될 수도 있다.
+언어의 훌륭한 구사는 메일링 리스트에서 올바르게 자신의 생각을
+표현하기 위하여 필요하다. 그래서 여러분은 이메일을 보내기 전에
+영어를 올바르게 사용하고 있는지를 체크하는 것이 바람직하다.
+
+
+여러분의 변경을 나누어라
+------------------------
+
+리눅스 커널 커뮤니티는 한꺼번에 굉장히 큰 코드의 묶음(chunk)을 쉽게
+받아들이지 않는다. 변경은 적절하게 소개되고, 검토되고, 각각의
+부분으로 작게 나누어져야 한다. 이것은 회사에서 하는 것과는 정확히
+반대되는 것이다. 여러분들의 제안은 개발 초기에 일찍이 소개되야 한다.
+그래서 여러분들은 자신이 하고 있는 것에 관하여 피드백을 받을 수 있게
+된다. 커뮤니티가 여러분들이 커뮤니티와 함께 일하고 있다는 것을
+느끼도록 만들고 커뮤니티가 여러분의 기능을 위한 쓰레기 장으로써
+사용되지 않고 있다는 것을 느끼게 하자. 그러나 메일링 리스트에 한번에
+50개의 이메일을 보내지는 말아라. 여러분들의 일련의 패치들은 항상
+더 작아야 한다.
+
+패치를 나누는 이유는 다음과 같다.
+
+1) 작은 패치들은 여러분의 패치들이 적용될 수 있는 확률을 높여준다.
+ 왜냐하면 다른 사람들은 정확성을 검증하기 위하여 많은 시간과 노력을
+ 들이기를 원하지 않는다. 5줄의 패치는 메인테이너가 거의 몇 초간 힐끗
+ 보면 적용될 수 있다. 그러나 500 줄의 패치는 정확성을 검토하기 위하여
+ 몇시간이 걸릴 수도 있다(걸리는 시간은 패치의 크기 혹은 다른 것에
+ 비례하여 기하급수적으로 늘어난다).
+
+ 패치를 작게 만드는 것은 무엇인가 잘못되었을 때 디버그하는 것을
+ 쉽게 만든다. 즉, 그렇게 만드는 것은 매우 큰 패치를 적용한 후에
+ 조사하는 것 보다 작은 패치를 적용한 후에 (그리고 몇몇의 것이
+ 깨졌을 때) 하나씩 패치들을 제거해가며 디버그 하기 쉽도록 만들어 준다.
+
+2) 작은 패치들을 보내는 것뿐만 아니라 패치들을 제출하기전에 재작성하고
+ 간단하게(혹은 간단한게 재배치하여) 하는 것도 중요하다.
+
+여기에 커널 개발자 Al Viro의 이야기가 있다.
+ "학생의 수학 숙제를 채점하는 선생님을 생각해보라. 선생님은 학생들이
+ 답을 얻을때까지 겪은 시행착오를 보길 원하지 않는다. 선생님들은
+ 간결하고 가장 뛰어난 답을 보길 원한다. 훌륭한 학생은 이것을 알고
+ 마지막으로 답을 얻기 전 중간 과정들을 제출하진 않는다.
+
+ 커널 개발도 마찬가지이다. 메인테이너들과 검토하는 사람들은 문제를
+ 풀어나가는 과정속에 숨겨진 과정을 보길 원하진 않는다. 그들은
+ 간결하고 멋진 답을 보길 원한다."
+
+커뮤니티와 협력하며 뛰어난 답을 찾는 것과 여러분들의 끝마치지 못한 작업들
+사이에 균형을 유지해야 하는 것은 어려울지도 모른다. 그러므로 프로세스의
+초반에 여러분의 작업을 향상시키기위한 피드백을 얻는 것 뿐만 아니라
+여러분들의 변경들을 작은 묶음으로 유지해서 심지어는 여러분의 작업의
+모든 부분이 지금은 포함될 준비가 되어있지 않지만 작은 부분은 벌써
+받아들여질 수 있도록 유지하는 것이 바람직하다.
+
+또한 완성되지 않았고 "나중에 수정될 것이다." 와 같은 것들을 포함하는
+패치들은 받아들여지지 않을 것이라는 점을 유념하라.
+
+변경을 정당화해라
+-----------------
+
+여러분들의 나누어진 패치들을 리눅스 커뮤니티가 왜 반영해야 하는지를
+알도록 하는 것은 매우 중요하다. 새로운 기능들이 필요하고 유용하다는
+것은 반드시 그에 합당한 이유가 있어야 한다.
+
+
+변경을 문서화해라
+-----------------
+
+여러분이 패치를 보내려 할때는 여러분이 무엇을 말하려고 하는지를 충분히
+생각하여 이메일을 작성해야 한다. 이 정보는 패치를 위한 ChangeLog가 될
+것이다. 그리고 항상 그 내용을 보길 원하는 모든 사람들을 위해 보존될
+것이다. 패치는 완벽하게 다음과 같은 내용들을 포함하여 설명해야 한다.
+ - 변경이 왜 필요한지
+ - 패치에 관한 전체 설계 접근(approach)
+ - 구현 상세들
+ - 테스트 결과들
+
+이것이 무엇인지 더 자세한 것을 알고 싶다면 다음 문서의 ChageLog 항을 봐라.
+ "The Perfect Patch"
+ http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt
+
+
+
+
+이 모든 것을 하는 것은 매우 어려운 일이다. 완벽히 소화하는 데는 적어도 몇년이
+걸릴 수도 있다. 많은 인내와 결심이 필요한 계속되는 개선의 과정이다. 그러나
+가능한한 포기하지 말라. 많은 사람들은 이전부터 해왔던 것이고 그 사람들도
+정확하게 여러분들이 지금 서 있는 그 곳부터 시작했었다.
+
+
+
+
+----------
+"개발 프로세스"(http://linux.tar.gz/articles/2.6-development_process) 섹션을
+작성하는데 있어 참고할 문서를 사용하도록 허락해준 Paolo Ciarrocchi에게
+감사한다. 여러분들이 말해야 할 것과 말해서는 안되는 것의 목록 중 일부를 제공해준
+Randy Dunlap과 Gerrit Huizenga에게 감사한다. 또한 검토와 의견 그리고
+공헌을 아끼지 않은 Pat Mochel, Hanna Linder, Randy Dunlap, Kay Sievers,
+Vojtech Pavlik, Jan Kara, Josh Boyer, Kees Cook, Andrew Morton, Andi Kleen,
+Vadim Lobanov, Jesper Juhl, Adrian Bunk, Keri Harris, Frans Pop,
+David A. Wheeler, Junio Hamano, Michael Kerrisk, and Alex Shepard에게도 감사를 전한다.
+그들의 도움이 없었다면 이 문서는 존재하지 않았을 것이다.
+
+
+
+메인테이너: Greg Kroah-Hartman <greg@kroah.com>
diff --git a/Documentation/ko_KR/stable_api_nonsense.txt b/Documentation/ko_KR/stable_api_nonsense.txt
new file mode 100644
index 0000000..8f2b0e1
--- /dev/null
+++ b/Documentation/ko_KR/stable_api_nonsense.txt
@@ -0,0 +1,195 @@
+NOTE:
+This is a version of Documentation/stable_api_nonsense.txt translated
+into korean
+This document is maintained by barrios <minchan.kim@gmail.com>
+If you find any difference between this document and the original file or
+a problem with the translation, please contact the maintainer of this file.
+
+Please also note that the purpose of this file is to be easier to
+read for non English (read: korean) speakers and is not intended as
+a fork. So if you have any comments or updates for this file please
+try to update the original English file first.
+
+==================================
+이 문서는
+Documentation/stable_api_nonsense.txt
+의 한글 번역입니다.
+
+역자: 김민찬 <minchan.kim@gmail.com>
+감수: 이제이미 <jamee.lee@samsung.com>
+==================================
+
+리눅스 커널 드라이버 인터페이스
+(여러분들의 모든 질문에 대한 답 그리고 다른 몇가지)
+
+Greg Kroah-Hartman <greg@kroah.com>
+
+이 문서는 리눅스가 왜 바이너리 커널 인터페이스를 갖지 않는지, 왜 변하지
+않는(stable) 커널 인터페이스를 갖지 않는지를 설명하기 위해 쓰여졌다.
+이 문서는 커널과 유저공간 사이의 인터페이스가 아니라 커널 내부의
+인터페이스들을 설명하고 있다는 것을 유념하라. 커널과 유저공간 사이의
+인터페이스는 응용프로그램이 사용하는 syscall 인터페이스이다. 그 인터페이스는
+오랫동안 거의 변하지 않았고 앞으로도 변하지 않을 것이다. 나는 pre 0.9에서
+만들어졌지만 최신의 2.6 커널 배포에서도 잘 동작하는 프로그램을 가지고
+있다. 이 인터페이스는 사용자와 응용프로그램 개발자들이 변하지 않을 것이라고
+여길수 있는 것이다.
+
+
+초록
+----
+여러분은 변하지 않는 커널 인터페이스를 원한다고 생각하지만 실제로는
+그렇지 않으며 심지어는 그것을 알아채지 못한다. 여러분이 원하는 것은
+안정되게 실행되는 드라이버이며 드라이버가 메인 커널 트리에 있을 때
+그런 안정적인 드라이버를 얻을 수 있게 된다. 또한 여러분의 드라이버가
+메인 커널 트리에 있다면 다른 많은 좋은 이점들을 얻게 된다. 그러한 것들이
+리눅스를 강건하고, 안정적이며, 성숙한 운영체제로 만들어 놓음으로써
+여러분들로 하여금 바로 리눅스를 사용하게 만드는 이유이다.
+
+
+소개
+----
+
+커널 내부의 인터페이스가 바뀌는 것을 걱정하며 커널 드라이버를 작성하고
+싶어하는 사람은 정말 이상한 사람이다. 세상의 대다수의 사람들은 이 인터페이스를
+보지못할 것이며 전혀 걱정하지도 않는다.
+
+먼저, 나는 closed 소스, hidden 소스, binary blobs, 소스 wrappers, 또는 GPL로
+배포되었지만 소스 코드를 갖고 있지 않은 커널 드라이버들을 설명하는 어떤 다른
+용어들에 관한 어떤 법적인 문제에 관해서는 언급하지 않을 것이다. 어떤 법적인
+질문들을 가지고 있다면 변호사와 연락하라. 나는 프로그래머이므로 여기서 기술적인
+문제들만을 설명하려고 한다. (법적인 문제를 경시하는 것은 아니다. 그런 문제들은
+엄연히 현실에 있고 여러분들은 항상 그 문제들을 인식하고 있을 필요는 있다.)
+
+자, 두가지의 주요 주제가 있다. 바이너리 커널 인터페이스들과 변하지 않는
+커널 소스 인터페이들. 그것들은 서로 의존성을 가지고 있지만 바이너리
+문제를 먼저 풀고 넘어갈 것이다.
+
+
+
+바이너리 커널 인터페이스
+------------------------
+우리가 변하지 않는 커널 소스 인터페이스를 가지고 있다고 가정하자. 그러면
+바이너리 인터페이스 또한 자연적으로 변하지 않을까? 틀렸다. 리눅스 커널에
+관한 다음 사실들을 생각해보라.
+ - 여러분들이 사용하는 C 컴파일러의 버젼에 따라 다른 커널 자료 구조들은
+ 다른 alignmnet들을 갖게 될것이고 다른 방법으로(함수들을 inline으로
+ 했느냐, 아니냐) 다른 함수들을 포함하는 것도 가능한다. 중요한 것은
+ 개별적인 함수 구성이 아니라 자료 구조 패딩이 달라진다는 점이다.
+ - 여러분이 선택한 커널 빌드 옵션에 따라서 커널은 다양한 것들을 가정할
+ 수 있다.
+ - 다른 구조체들은 다른 필드들을 포함할 수 있다.
+ - 몇몇 함수들은 전혀 구현되지 않을 수도 있다(즉, 몇몇 lock들은
+ non-SMP 빌드에서는 사라져 버릴수도 있다).
+ - 커널내에 메모리는 build optoin들에 따라 다른 방법으로 align될수
+ 있다.
+ - 리눅스는 많은 다양한 프로세서 아키텍쳐에서 실행된다. 한 아키텍쳐의
+ 바이너리 드라이버를 다른 아키텍쳐에서 정상적으로 실행시킬 방법은
+ 없다.
+
+커널을 빌드했던 C 컴파일러와 정확하게 같은 것을 사용하고 정확하게 같은
+커널 구성(configuration)을 사용하여 여러분들의 모듈을 빌드하면 간단히
+많은 문제들을 해결할 수 있다. 이렇게 하는 것은 여러분들이 하나의 리눅스
+배포판의 하나의 배포 버젼을 위한 모듈만을 제공한다면 별일 아닐 것이다.
+그러나 각기 다른 리눅스 배포판마다 한번씩 빌드하는 수를 각 리눅스 배포판마다
+제공하는 다른 릴리즈의 수와 곱하게 되면 이번에는 각 릴리즈들의 다른 빌드
+옵션의 악몽과 마주하게 것이다. 또한 각 리눅스 배포판들은 다른 하드웨어
+종류에(다른 프로세서 타입과 다른 옵션들) 맞춰져 있는 많은 다른 커널들을
+배포한다. 그러므로 한번의 배포에서조차 여러분들의 모듈은 여러 버젼을
+만들 필요가 있다.
+
+나를 믿어라. 여러분들은 이러한 종류의 배포를 지원하려고 시도한다면 시간이
+지나면 미칠지경이 될 것이다. 난 이러한 것을 오래전에 아주 어렵게 배웠다...
+
+
+
+변하지않는 커널 소스 인터페이스들
+---------------------------------
+
+리눅스 커널 드라이버를 계속해서 메인 커널 트리에 반영하지 않고
+유지보수하려고 하는 사름들과 이 문제를 논의하게 되면 훨씬 더
+"논란의 여지가 많은" 주제가 될 것이다.
+
+리눅스 커널 개발은 끊임없이 빠른 속도로 이루어지고 있으며 결코
+느슨해진 적이 없다. 커널 개발자들이 현재 인터페이스들에서 버그를
+발견하거나 무엇인가 할수 있는 더 좋은 방법을 찾게 되었다고 하자.
+그들이 발견한 것을 실행한다면 아마도 더 잘 동작하도록 현재 인터페이스들을
+수정하게 될 것이다. 그들이 그런 일을 하게되면 함수 이름들은 변하게 되고,
+구조체들은 늘어나거나 줄어들게 되고, 함수 파라미터들은 재작업될 것이다.
+이러한 일이 발생되면 커널 내에 이 인터페이스를 사용했던 인스턴스들이 동시에
+수정될 것이며 이러한 과정은 모든 것이 계속해서 올바르게 동작할 것이라는
+것을 보장한다.
+
+이러한 것의 한 예로써, 커널 내부의 USB 인터페이스들은 이 서브시스템이
+생긴 이후로 적어도 3번의 다른 재작업을 겪었다. 이 재작업들은 많은 다른
+문제들을 풀었다.
+ - 데이터 스트림들의 동기적인 모델에서 비동기적인 모델로의 변화. 이것은
+ 많은 드라이버들의 복잡성을 줄이고 처리량을 향상시켜 현재는 거의 모든
+ USB 장치들의 거의 최대 속도로 실행되고 있다.
+ - USB 드라이버가 USB 코어로부터 데이터 패킷들을 할당받로록 한 변경으로
+ 인해서 지금의 모든 드라이버들은 많은 문서화된 데드락을 수정하기 위하여
+ USB 코어에게 더 많은 정보를 제공해야만 한다.
+
+이것은 오랫동안 자신의 오래된 USB 인터페이스들을 유지해야 하는 closed 운영체제들과는
+완전히 반대되는 것이다. closed된 운영체제들은 새로운 개발자들에게 우연히 낡은
+인터페이스를 사용하게 할 기회를 주게되며, 적절하지 못한 방법으로 처리하게 되어
+운영체제의 안정성을 해치는 문제를 야기하게 된다.
+
+이 두가지의 예들 모두, 모든 개발자들은 꼭 이루어져야 하는 중요한 변화들이라고
+동의를 하였고 비교적 적은 고통으로 변경되어졌다. 리눅스가 변하지 않는 소스
+인터페이스를 고집한다면, 새로운 인터페이스가 만들어지게 되며 반면 기존의 오래된
+것들, 그리고 깨진 것들은 계속해서 유지되어야 하며 이러한 일들은 USB 개발자들에게
+또 다른 일거리를 주게 된다. 모든 리눅스 USB 개발자들에게 자신의 그들의 업무를
+마친 후 시간을 투자하여 아무 득도 없는 무료 봉사를 해달라고 하는 것은 가능성이
+희박한 일이다.
+
+보안 문제 역시 리눅스에게는 매우 중요하다. 보안 문제가 발견되면 그것은
+매우 짧은 시간 안에 수정된다. 보안 문제는 그 문제를 해결하기 위하여
+여러번 내부 커널 인터페이스들을 재작업하게 만들었다. 이러한 문제가
+발생하였을 때 그 인터페이스들을 사용하는 모든 드라이버들도 동시에
+수정되어 보안 문제가 앞으로 갑작스럽게 생기지는 않을 것이라는 것을
+보장한다. 내부 인터페이스들의 변경이 허락되지 않으면 이러한 종류의 보안
+문제를 수정하고 그것이 다시 발생하지 않을 것이라고 보장하는 것은 가능하지
+않을 것이다.
+
+커널 인터페이스들은 계속해서 정리되고 있다. 현재 인터페이스를 사용하는
+사람이 한명도 없다면 그것은 삭제된다. 이것은 커널이 가능한한 가장 작게
+유지되며 존재하는 모든 가능성이 있는 인터페이스들이 테스트된다는 것을
+보장한다(사용되지 않는 인터페이스들은 유효성 검증을 하기가 거의 불가능하다).
+
+
+무엇을 해야 하나
+---------------
+자, 여러분이 메인 커널 트리에 있지 않은 리눅스 커널 드라이버를 가지고
+있다면 여러분은 즉, 개발자는 무엇을 해야 하나? 모든 배포판마다 다른
+커널 버젼을 위한 바이너리 드라이버를 배포하는 것은 악몽이며 계속해서
+변하고 있는 커널 인터페이스들의 맞처 유지보수하려고 시도하는 것은 힘든
+일이다.
+
+간단하다. 여러분의 커널 드라이버를 메인 커널 트리에 반영하라(우리는 여기서
+GPL을 따르는 배포 드라이버에 관해 얘기하고 있다는 것을 상기하라. 여러분의
+코드가 이러한 분류에 해당되지 않는다면 행운을 빈다. 여러분 스스로 어떻게든
+해야만 한다). 여러분의 드라이버가 트리에 있게되면 커널 인터페이스가
+변경되더라도 가장 먼저 커널에 변경을 가했던 사람에 의해서 수정될 것이다.
+이것은 여러분의 드라이버가 여러분의 별다른 노력없이 항상 빌드가 가능하며
+동작하는 것을 보장한다.
+
+메인 커널 트리에 여러분의 드라이버를 반영하면 얻게 되는 장점들은 다음과 같다.
+ - 관리의 드는 비용(원래 개발자의)은 줄어줄면서 드라이버의 질은 향상될 것이다.
+ - 다른 개발자들이 여러분의 드라이버에 기능들을 추가 할 것이다.
+ - 다른 사람들은 여러분의 드라이버에 버그를 발견하고 수정할 것이다.
+ - 다른 사람들은 여러분의 드라이버의 개선점을 찾을 줄 것이다.
+ - 외부 인터페이스 변경으로 인해 여러분의 드라이버의 수정이 필요하다면 다른
+ 사람들이 드라이버를 업데이트할 것이다.
+ - 여러분의 드라이버는 별다른 노력 없이 모든 리눅스 배포판에 자동적으로
+ 추가될 것이다.
+
+리눅스는 다른 운영 체제보다 "쉽게 쓸수 있는(out of the box)" 많은 다른 장치들을
+지원하고 어떤 다른 운영 체제보다 다양한 아키텍쳐위에서 이러한 장치들을 지원하기 때문에
+이러한 증명된 개발 모델은 틀림없이 바로 가고 있는 것이다.
+
+
+
+------
+
+이 문서의 초안을 검토해주고 코멘트 해준 Randy Dunlap, Andrew Morton, David Brownell,
+Hanna Linder, Robert Love, 그리고 Nishanth Aravamudan에게 감사한다.
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
new file mode 100644
index 0000000..f5d2aad
--- /dev/null
+++ b/Documentation/kobject.txt
@@ -0,0 +1,390 @@
+Everything you never wanted to know about kobjects, ksets, and ktypes
+
+Greg Kroah-Hartman <gregkh@suse.de>
+
+Based on an original article by Jon Corbet for lwn.net written October 1,
+2003 and located at http://lwn.net/Articles/51437/
+
+Last updated December 19, 2007
+
+
+Part of the difficulty in understanding the driver model - and the kobject
+abstraction upon which it is built - is that there is no obvious starting
+place. Dealing with kobjects requires understanding a few different types,
+all of which make reference to each other. In an attempt to make things
+easier, we'll take a multi-pass approach, starting with vague terms and
+adding detail as we go. To that end, here are some quick definitions of
+some terms we will be working with.
+
+ - A kobject is an object of type struct kobject. Kobjects have a name
+ and a reference count. A kobject also has a parent pointer (allowing
+ objects to be arranged into hierarchies), a specific type, and,
+ usually, a representation in the sysfs virtual filesystem.
+
+ Kobjects are generally not interesting on their own; instead, they are
+ usually embedded within some other structure which contains the stuff
+ the code is really interested in.
+
+ No structure should EVER have more than one kobject embedded within it.
+ If it does, the reference counting for the object is sure to be messed
+ up and incorrect, and your code will be buggy. So do not do this.
+
+ - A ktype is the type of object that embeds a kobject. Every structure
+ that embeds a kobject needs a corresponding ktype. The ktype controls
+ what happens to the kobject when it is created and destroyed.
+
+ - A kset is a group of kobjects. These kobjects can be of the same ktype
+ or belong to different ktypes. The kset is the basic container type for
+ collections of kobjects. Ksets contain their own kobjects, but you can
+ safely ignore that implementation detail as the kset core code handles
+ this kobject automatically.
+
+ When you see a sysfs directory full of other directories, generally each
+ of those directories corresponds to a kobject in the same kset.
+
+We'll look at how to create and manipulate all of these types. A bottom-up
+approach will be taken, so we'll go back to kobjects.
+
+
+Embedding kobjects
+
+It is rare for kernel code to create a standalone kobject, with one major
+exception explained below. Instead, kobjects are used to control access to
+a larger, domain-specific object. To this end, kobjects will be found
+embedded in other structures. If you are used to thinking of things in
+object-oriented terms, kobjects can be seen as a top-level, abstract class
+from which other classes are derived. A kobject implements a set of
+capabilities which are not particularly useful by themselves, but which are
+nice to have in other objects. The C language does not allow for the
+direct expression of inheritance, so other techniques - such as structure
+embedding - must be used.
+
+So, for example, the UIO code has a structure that defines the memory
+region associated with a uio device:
+
+struct uio_mem {
+ struct kobject kobj;
+ unsigned long addr;
+ unsigned long size;
+ int memtype;
+ void __iomem *internal_addr;
+};
+
+If you have a struct uio_mem structure, finding its embedded kobject is
+just a matter of using the kobj member. Code that works with kobjects will
+often have the opposite problem, however: given a struct kobject pointer,
+what is the pointer to the containing structure? You must avoid tricks
+(such as assuming that the kobject is at the beginning of the structure)
+and, instead, use the container_of() macro, found in <linux/kernel.h>:
+
+ container_of(pointer, type, member)
+
+where pointer is the pointer to the embedded kobject, type is the type of
+the containing structure, and member is the name of the structure field to
+which pointer points. The return value from container_of() is a pointer to
+the given type. So, for example, a pointer "kp" to a struct kobject
+embedded within a struct uio_mem could be converted to a pointer to the
+containing uio_mem structure with:
+
+ struct uio_mem *u_mem = container_of(kp, struct uio_mem, kobj);
+
+Programmers often define a simple macro for "back-casting" kobject pointers
+to the containing type.
+
+
+Initialization of kobjects
+
+Code which creates a kobject must, of course, initialize that object. Some
+of the internal fields are setup with a (mandatory) call to kobject_init():
+
+ void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
+
+The ktype is required for a kobject to be created properly, as every kobject
+must have an associated kobj_type. After calling kobject_init(), to
+register the kobject with sysfs, the function kobject_add() must be called:
+
+ int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...);
+
+This sets up the parent of the kobject and the name for the kobject
+properly. If the kobject is to be associated with a specific kset,
+kobj->kset must be assigned before calling kobject_add(). If a kset is
+associated with a kobject, then the parent for the kobject can be set to
+NULL in the call to kobject_add() and then the kobject's parent will be the
+kset itself.
+
+As the name of the kobject is set when it is added to the kernel, the name
+of the kobject should never be manipulated directly. If you must change
+the name of the kobject, call kobject_rename():
+
+ int kobject_rename(struct kobject *kobj, const char *new_name);
+
+Note kobject_rename does perform any locking or have a solid notion of
+what names are valid so the provide must provide their own sanity checking
+and serialization.
+
+There is a function called kobject_set_name() but that is legacy cruft and
+is being removed. If your code needs to call this function, it is
+incorrect and needs to be fixed.
+
+To properly access the name of the kobject, use the function
+kobject_name():
+
+ const char *kobject_name(const struct kobject * kobj);
+
+There is a helper function to both initialize and add the kobject to the
+kernel at the same time, called supprisingly enough kobject_init_and_add():
+
+ int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
+ struct kobject *parent, const char *fmt, ...);
+
+The arguments are the same as the individual kobject_init() and
+kobject_add() functions described above.
+
+
+Uevents
+
+After a kobject has been registered with the kobject core, you need to
+announce to the world that it has been created. This can be done with a
+call to kobject_uevent():
+
+ int kobject_uevent(struct kobject *kobj, enum kobject_action action);
+
+Use the KOBJ_ADD action for when the kobject is first added to the kernel.
+This should be done only after any attributes or children of the kobject
+have been initialized properly, as userspace will instantly start to look
+for them when this call happens.
+
+When the kobject is removed from the kernel (details on how to do that is
+below), the uevent for KOBJ_REMOVE will be automatically created by the
+kobject core, so the caller does not have to worry about doing that by
+hand.
+
+
+Reference counts
+
+One of the key functions of a kobject is to serve as a reference counter
+for the object in which it is embedded. As long as references to the object
+exist, the object (and the code which supports it) must continue to exist.
+The low-level functions for manipulating a kobject's reference counts are:
+
+ struct kobject *kobject_get(struct kobject *kobj);
+ void kobject_put(struct kobject *kobj);
+
+A successful call to kobject_get() will increment the kobject's reference
+counter and return the pointer to the kobject.
+
+When a reference is released, the call to kobject_put() will decrement the
+reference count and, possibly, free the object. Note that kobject_init()
+sets the reference count to one, so the code which sets up the kobject will
+need to do a kobject_put() eventually to release that reference.
+
+Because kobjects are dynamic, they must not be declared statically or on
+the stack, but instead, always allocated dynamically. Future versions of
+the kernel will contain a run-time check for kobjects that are created
+statically and will warn the developer of this improper usage.
+
+If all that you want to use a kobject for is to provide a reference counter
+for your structure, please use the struct kref instead; a kobject would be
+overkill. For more information on how to use struct kref, please see the
+file Documentation/kref.txt in the Linux kernel source tree.
+
+
+Creating "simple" kobjects
+
+Sometimes all that a developer wants is a way to create a simple directory
+in the sysfs hierarchy, and not have to mess with the whole complication of
+ksets, show and store functions, and other details. This is the one
+exception where a single kobject should be created. To create such an
+entry, use the function:
+
+ struct kobject *kobject_create_and_add(char *name, struct kobject *parent);
+
+This function will create a kobject and place it in sysfs in the location
+underneath the specified parent kobject. To create simple attributes
+associated with this kobject, use:
+
+ int sysfs_create_file(struct kobject *kobj, struct attribute *attr);
+or
+ int sysfs_create_group(struct kobject *kobj, struct attribute_group *grp);
+
+Both types of attributes used here, with a kobject that has been created
+with the kobject_create_and_add(), can be of type kobj_attribute, so no
+special custom attribute is needed to be created.
+
+See the example module, samples/kobject/kobject-example.c for an
+implementation of a simple kobject and attributes.
+
+
+
+ktypes and release methods
+
+One important thing still missing from the discussion is what happens to a
+kobject when its reference count reaches zero. The code which created the
+kobject generally does not know when that will happen; if it did, there
+would be little point in using a kobject in the first place. Even
+predictable object lifecycles become more complicated when sysfs is brought
+in as other portions of the kernel can get a reference on any kobject that
+is registered in the system.
+
+The end result is that a structure protected by a kobject cannot be freed
+before its reference count goes to zero. The reference count is not under
+the direct control of the code which created the kobject. So that code must
+be notified asynchronously whenever the last reference to one of its
+kobjects goes away.
+
+Once you registered your kobject via kobject_add(), you must never use
+kfree() to free it directly. The only safe way is to use kobject_put(). It
+is good practice to always use kobject_put() after kobject_init() to avoid
+errors creeping in.
+
+This notification is done through a kobject's release() method. Usually
+such a method has a form like:
+
+ void my_object_release(struct kobject *kobj)
+ {
+ struct my_object *mine = container_of(kobj, struct my_object, kobj);
+
+ /* Perform any additional cleanup on this object, then... */
+ kfree(mine);
+ }
+
+One important point cannot be overstated: every kobject must have a
+release() method, and the kobject must persist (in a consistent state)
+until that method is called. If these constraints are not met, the code is
+flawed. Note that the kernel will warn you if you forget to provide a
+release() method. Do not try to get rid of this warning by providing an
+"empty" release function; you will be mocked mercilessly by the kobject
+maintainer if you attempt this.
+
+Note, the name of the kobject is available in the release function, but it
+must NOT be changed within this callback. Otherwise there will be a memory
+leak in the kobject core, which makes people unhappy.
+
+Interestingly, the release() method is not stored in the kobject itself;
+instead, it is associated with the ktype. So let us introduce struct
+kobj_type:
+
+ struct kobj_type {
+ void (*release)(struct kobject *);
+ struct sysfs_ops *sysfs_ops;
+ struct attribute **default_attrs;
+ };
+
+This structure is used to describe a particular type of kobject (or, more
+correctly, of containing object). Every kobject needs to have an associated
+kobj_type structure; a pointer to that structure must be specified when you
+call kobject_init() or kobject_init_and_add().
+
+The release field in struct kobj_type is, of course, a pointer to the
+release() method for this type of kobject. The other two fields (sysfs_ops
+and default_attrs) control how objects of this type are represented in
+sysfs; they are beyond the scope of this document.
+
+The default_attrs pointer is a list of default attributes that will be
+automatically created for any kobject that is registered with this ktype.
+
+
+ksets
+
+A kset is merely a collection of kobjects that want to be associated with
+each other. There is no restriction that they be of the same ktype, but be
+very careful if they are not.
+
+A kset serves these functions:
+
+ - It serves as a bag containing a group of objects. A kset can be used by
+ the kernel to track "all block devices" or "all PCI device drivers."
+
+ - A kset is also a subdirectory in sysfs, where the associated kobjects
+ with the kset can show up. Every kset contains a kobject which can be
+ set up to be the parent of other kobjects; the top-level directories of
+ the sysfs hierarchy are constructed in this way.
+
+ - Ksets can support the "hotplugging" of kobjects and influence how
+ uevent events are reported to user space.
+
+In object-oriented terms, "kset" is the top-level container class; ksets
+contain their own kobject, but that kobject is managed by the kset code and
+should not be manipulated by any other user.
+
+A kset keeps its children in a standard kernel linked list. Kobjects point
+back to their containing kset via their kset field. In almost all cases,
+the kobjects belonging to a kset have that kset (or, strictly, its embedded
+kobject) in their parent.
+
+As a kset contains a kobject within it, it should always be dynamically
+created and never declared statically or on the stack. To create a new
+kset use:
+ struct kset *kset_create_and_add(const char *name,
+ struct kset_uevent_ops *u,
+ struct kobject *parent);
+
+When you are finished with the kset, call:
+ void kset_unregister(struct kset *kset);
+to destroy it.
+
+An example of using a kset can be seen in the
+samples/kobject/kset-example.c file in the kernel tree.
+
+If a kset wishes to control the uevent operations of the kobjects
+associated with it, it can use the struct kset_uevent_ops to handle it:
+
+struct kset_uevent_ops {
+ int (*filter)(struct kset *kset, struct kobject *kobj);
+ const char *(*name)(struct kset *kset, struct kobject *kobj);
+ int (*uevent)(struct kset *kset, struct kobject *kobj,
+ struct kobj_uevent_env *env);
+};
+
+
+The filter function allows a kset to prevent a uevent from being emitted to
+userspace for a specific kobject. If the function returns 0, the uevent
+will not be emitted.
+
+The name function will be called to override the default name of the kset
+that the uevent sends to userspace. By default, the name will be the same
+as the kset itself, but this function, if present, can override that name.
+
+The uevent function will be called when the uevent is about to be sent to
+userspace to allow more environment variables to be added to the uevent.
+
+One might ask how, exactly, a kobject is added to a kset, given that no
+functions which perform that function have been presented. The answer is
+that this task is handled by kobject_add(). When a kobject is passed to
+kobject_add(), its kset member should point to the kset to which the
+kobject will belong. kobject_add() will handle the rest.
+
+If the kobject belonging to a kset has no parent kobject set, it will be
+added to the kset's directory. Not all members of a kset do necessarily
+live in the kset directory. If an explicit parent kobject is assigned
+before the kobject is added, the kobject is registered with the kset, but
+added below the parent kobject.
+
+
+Kobject removal
+
+After a kobject has been registered with the kobject core successfully, it
+must be cleaned up when the code is finished with it. To do that, call
+kobject_put(). By doing this, the kobject core will automatically clean up
+all of the memory allocated by this kobject. If a KOBJ_ADD uevent has been
+sent for the object, a corresponding KOBJ_REMOVE uevent will be sent, and
+any other sysfs housekeeping will be handled for the caller properly.
+
+If you need to do a two-stage delete of the kobject (say you are not
+allowed to sleep when you need to destroy the object), then call
+kobject_del() which will unregister the kobject from sysfs. This makes the
+kobject "invisible", but it is not cleaned up, and the reference count of
+the object is still the same. At a later time call kobject_put() to finish
+the cleanup of the memory associated with the kobject.
+
+kobject_del() can be used to drop the reference to the parent object, if
+circular references are constructed. It is valid in some cases, that a
+parent objects references a child. Circular references _must_ be broken
+with an explicit call to kobject_del(), so that a release functions will be
+called, and the objects in the former circle release each other.
+
+
+Example code to copy from
+
+For a more complete example of using ksets and kobjects properly, see the
+sample/kobject/kset-example.c code.
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
new file mode 100644
index 0000000..a79633d
--- /dev/null
+++ b/Documentation/kprobes.txt
@@ -0,0 +1,506 @@
+Title : Kernel Probes (Kprobes)
+Authors : Jim Keniston <jkenisto@us.ibm.com>
+ : Prasanna S Panchamukhi <prasanna@in.ibm.com>
+
+CONTENTS
+
+1. Concepts: Kprobes, Jprobes, Return Probes
+2. Architectures Supported
+3. Configuring Kprobes
+4. API Reference
+5. Kprobes Features and Limitations
+6. Probe Overhead
+7. TODO
+8. Kprobes Example
+9. Jprobes Example
+10. Kretprobes Example
+Appendix A: The kprobes debugfs interface
+
+1. Concepts: Kprobes, Jprobes, Return Probes
+
+Kprobes enables you to dynamically break into any kernel routine and
+collect debugging and performance information non-disruptively. You
+can trap at almost any kernel code address, specifying a handler
+routine to be invoked when the breakpoint is hit.
+
+There are currently three types of probes: kprobes, jprobes, and
+kretprobes (also called return probes). A kprobe can be inserted
+on virtually any instruction in the kernel. A jprobe is inserted at
+the entry to a kernel function, and provides convenient access to the
+function's arguments. A return probe fires when a specified function
+returns.
+
+In the typical case, Kprobes-based instrumentation is packaged as
+a kernel module. The module's init function installs ("registers")
+one or more probes, and the exit function unregisters them. A
+registration function such as register_kprobe() specifies where
+the probe is to be inserted and what handler is to be called when
+the probe is hit.
+
+There are also register_/unregister_*probes() functions for batch
+registration/unregistration of a group of *probes. These functions
+can speed up unregistration process when you have to unregister
+a lot of probes at once.
+
+The next three subsections explain how the different types of
+probes work. They explain certain things that you'll need to
+know in order to make the best use of Kprobes -- e.g., the
+difference between a pre_handler and a post_handler, and how
+to use the maxactive and nmissed fields of a kretprobe. But
+if you're in a hurry to start using Kprobes, you can skip ahead
+to section 2.
+
+1.1 How Does a Kprobe Work?
+
+When a kprobe is registered, Kprobes makes a copy of the probed
+instruction and replaces the first byte(s) of the probed instruction
+with a breakpoint instruction (e.g., int3 on i386 and x86_64).
+
+When a CPU hits the breakpoint instruction, a trap occurs, the CPU's
+registers are saved, and control passes to Kprobes via the
+notifier_call_chain mechanism. Kprobes executes the "pre_handler"
+associated with the kprobe, passing the handler the addresses of the
+kprobe struct and the saved registers.
+
+Next, Kprobes single-steps its copy of the probed instruction.
+(It would be simpler to single-step the actual instruction in place,
+but then Kprobes would have to temporarily remove the breakpoint
+instruction. This would open a small time window when another CPU
+could sail right past the probepoint.)
+
+After the instruction is single-stepped, Kprobes executes the
+"post_handler," if any, that is associated with the kprobe.
+Execution then continues with the instruction following the probepoint.
+
+1.2 How Does a Jprobe Work?
+
+A jprobe is implemented using a kprobe that is placed on a function's
+entry point. It employs a simple mirroring principle to allow
+seamless access to the probed function's arguments. The jprobe
+handler routine should have the same signature (arg list and return
+type) as the function being probed, and must always end by calling
+the Kprobes function jprobe_return().
+
+Here's how it works. When the probe is hit, Kprobes makes a copy of
+the saved registers and a generous portion of the stack (see below).
+Kprobes then points the saved instruction pointer at the jprobe's
+handler routine, and returns from the trap. As a result, control
+passes to the handler, which is presented with the same register and
+stack contents as the probed function. When it is done, the handler
+calls jprobe_return(), which traps again to restore the original stack
+contents and processor state and switch to the probed function.
+
+By convention, the callee owns its arguments, so gcc may produce code
+that unexpectedly modifies that portion of the stack. This is why
+Kprobes saves a copy of the stack and restores it after the jprobe
+handler has run. Up to MAX_STACK_SIZE bytes are copied -- e.g.,
+64 bytes on i386.
+
+Note that the probed function's args may be passed on the stack
+or in registers. The jprobe will work in either case, so long as the
+handler's prototype matches that of the probed function.
+
+1.3 Return Probes
+
+1.3.1 How Does a Return Probe Work?
+
+When you call register_kretprobe(), Kprobes establishes a kprobe at
+the entry to the function. When the probed function is called and this
+probe is hit, Kprobes saves a copy of the return address, and replaces
+the return address with the address of a "trampoline." The trampoline
+is an arbitrary piece of code -- typically just a nop instruction.
+At boot time, Kprobes registers a kprobe at the trampoline.
+
+When the probed function executes its return instruction, control
+passes to the trampoline and that probe is hit. Kprobes' trampoline
+handler calls the user-specified return handler associated with the
+kretprobe, then sets the saved instruction pointer to the saved return
+address, and that's where execution resumes upon return from the trap.
+
+While the probed function is executing, its return address is
+stored in an object of type kretprobe_instance. Before calling
+register_kretprobe(), the user sets the maxactive field of the
+kretprobe struct to specify how many instances of the specified
+function can be probed simultaneously. register_kretprobe()
+pre-allocates the indicated number of kretprobe_instance objects.
+
+For example, if the function is non-recursive and is called with a
+spinlock held, maxactive = 1 should be enough. If the function is
+non-recursive and can never relinquish the CPU (e.g., via a semaphore
+or preemption), NR_CPUS should be enough. If maxactive <= 0, it is
+set to a default value. If CONFIG_PREEMPT is enabled, the default
+is max(10, 2*NR_CPUS). Otherwise, the default is NR_CPUS.
+
+It's not a disaster if you set maxactive too low; you'll just miss
+some probes. In the kretprobe struct, the nmissed field is set to
+zero when the return probe is registered, and is incremented every
+time the probed function is entered but there is no kretprobe_instance
+object available for establishing the return probe.
+
+1.3.2 Kretprobe entry-handler
+
+Kretprobes also provides an optional user-specified handler which runs
+on function entry. This handler is specified by setting the entry_handler
+field of the kretprobe struct. Whenever the kprobe placed by kretprobe at the
+function entry is hit, the user-defined entry_handler, if any, is invoked.
+If the entry_handler returns 0 (success) then a corresponding return handler
+is guaranteed to be called upon function return. If the entry_handler
+returns a non-zero error then Kprobes leaves the return address as is, and
+the kretprobe has no further effect for that particular function instance.
+
+Multiple entry and return handler invocations are matched using the unique
+kretprobe_instance object associated with them. Additionally, a user
+may also specify per return-instance private data to be part of each
+kretprobe_instance object. This is especially useful when sharing private
+data between corresponding user entry and return handlers. The size of each
+private data object can be specified at kretprobe registration time by
+setting the data_size field of the kretprobe struct. This data can be
+accessed through the data field of each kretprobe_instance object.
+
+In case probed function is entered but there is no kretprobe_instance
+object available, then in addition to incrementing the nmissed count,
+the user entry_handler invocation is also skipped.
+
+2. Architectures Supported
+
+Kprobes, jprobes, and return probes are implemented on the following
+architectures:
+
+- i386
+- x86_64 (AMD-64, EM64T)
+- ppc64
+- ia64 (Does not support probes on instruction slot1.)
+- sparc64 (Return probes not yet implemented.)
+- arm
+- ppc
+
+3. Configuring Kprobes
+
+When configuring the kernel using make menuconfig/xconfig/oldconfig,
+ensure that CONFIG_KPROBES is set to "y". Under "Instrumentation
+Support", look for "Kprobes".
+
+So that you can load and unload Kprobes-based instrumentation modules,
+make sure "Loadable module support" (CONFIG_MODULES) and "Module
+unloading" (CONFIG_MODULE_UNLOAD) are set to "y".
+
+Also make sure that CONFIG_KALLSYMS and perhaps even CONFIG_KALLSYMS_ALL
+are set to "y", since kallsyms_lookup_name() is used by the in-kernel
+kprobe address resolution code.
+
+If you need to insert a probe in the middle of a function, you may find
+it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO),
+so you can use "objdump -d -l vmlinux" to see the source-to-object
+code mapping.
+
+4. API Reference
+
+The Kprobes API includes a "register" function and an "unregister"
+function for each type of probe. The API also includes "register_*probes"
+and "unregister_*probes" functions for (un)registering arrays of probes.
+Here are terse, mini-man-page specifications for these functions and
+the associated probe handlers that you'll write. See the files in the
+samples/kprobes/ sub-directory for examples.
+
+4.1 register_kprobe
+
+#include <linux/kprobes.h>
+int register_kprobe(struct kprobe *kp);
+
+Sets a breakpoint at the address kp->addr. When the breakpoint is
+hit, Kprobes calls kp->pre_handler. After the probed instruction
+is single-stepped, Kprobe calls kp->post_handler. If a fault
+occurs during execution of kp->pre_handler or kp->post_handler,
+or during single-stepping of the probed instruction, Kprobes calls
+kp->fault_handler. Any or all handlers can be NULL.
+
+NOTE:
+1. With the introduction of the "symbol_name" field to struct kprobe,
+the probepoint address resolution will now be taken care of by the kernel.
+The following will now work:
+
+ kp.symbol_name = "symbol_name";
+
+(64-bit powerpc intricacies such as function descriptors are handled
+transparently)
+
+2. Use the "offset" field of struct kprobe if the offset into the symbol
+to install a probepoint is known. This field is used to calculate the
+probepoint.
+
+3. Specify either the kprobe "symbol_name" OR the "addr". If both are
+specified, kprobe registration will fail with -EINVAL.
+
+4. With CISC architectures (such as i386 and x86_64), the kprobes code
+does not validate if the kprobe.addr is at an instruction boundary.
+Use "offset" with caution.
+
+register_kprobe() returns 0 on success, or a negative errno otherwise.
+
+User's pre-handler (kp->pre_handler):
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+int pre_handler(struct kprobe *p, struct pt_regs *regs);
+
+Called with p pointing to the kprobe associated with the breakpoint,
+and regs pointing to the struct containing the registers saved when
+the breakpoint was hit. Return 0 here unless you're a Kprobes geek.
+
+User's post-handler (kp->post_handler):
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+void post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags);
+
+p and regs are as described for the pre_handler. flags always seems
+to be zero.
+
+User's fault-handler (kp->fault_handler):
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+int fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr);
+
+p and regs are as described for the pre_handler. trapnr is the
+architecture-specific trap number associated with the fault (e.g.,
+on i386, 13 for a general protection fault or 14 for a page fault).
+Returns 1 if it successfully handled the exception.
+
+4.2 register_jprobe
+
+#include <linux/kprobes.h>
+int register_jprobe(struct jprobe *jp)
+
+Sets a breakpoint at the address jp->kp.addr, which must be the address
+of the first instruction of a function. When the breakpoint is hit,
+Kprobes runs the handler whose address is jp->entry.
+
+The handler should have the same arg list and return type as the probed
+function; and just before it returns, it must call jprobe_return().
+(The handler never actually returns, since jprobe_return() returns
+control to Kprobes.) If the probed function is declared asmlinkage
+or anything else that affects how args are passed, the handler's
+declaration must match.
+
+register_jprobe() returns 0 on success, or a negative errno otherwise.
+
+4.3 register_kretprobe
+
+#include <linux/kprobes.h>
+int register_kretprobe(struct kretprobe *rp);
+
+Establishes a return probe for the function whose address is
+rp->kp.addr. When that function returns, Kprobes calls rp->handler.
+You must set rp->maxactive appropriately before you call
+register_kretprobe(); see "How Does a Return Probe Work?" for details.
+
+register_kretprobe() returns 0 on success, or a negative errno
+otherwise.
+
+User's return-probe handler (rp->handler):
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+int kretprobe_handler(struct kretprobe_instance *ri, struct pt_regs *regs);
+
+regs is as described for kprobe.pre_handler. ri points to the
+kretprobe_instance object, of which the following fields may be
+of interest:
+- ret_addr: the return address
+- rp: points to the corresponding kretprobe object
+- task: points to the corresponding task struct
+- data: points to per return-instance private data; see "Kretprobe
+ entry-handler" for details.
+
+The regs_return_value(regs) macro provides a simple abstraction to
+extract the return value from the appropriate register as defined by
+the architecture's ABI.
+
+The handler's return value is currently ignored.
+
+4.4 unregister_*probe
+
+#include <linux/kprobes.h>
+void unregister_kprobe(struct kprobe *kp);
+void unregister_jprobe(struct jprobe *jp);
+void unregister_kretprobe(struct kretprobe *rp);
+
+Removes the specified probe. The unregister function can be called
+at any time after the probe has been registered.
+
+NOTE:
+If the functions find an incorrect probe (ex. an unregistered probe),
+they clear the addr field of the probe.
+
+4.5 register_*probes
+
+#include <linux/kprobes.h>
+int register_kprobes(struct kprobe **kps, int num);
+int register_kretprobes(struct kretprobe **rps, int num);
+int register_jprobes(struct jprobe **jps, int num);
+
+Registers each of the num probes in the specified array. If any
+error occurs during registration, all probes in the array, up to
+the bad probe, are safely unregistered before the register_*probes
+function returns.
+- kps/rps/jps: an array of pointers to *probe data structures
+- num: the number of the array entries.
+
+NOTE:
+You have to allocate(or define) an array of pointers and set all
+of the array entries before using these functions.
+
+4.6 unregister_*probes
+
+#include <linux/kprobes.h>
+void unregister_kprobes(struct kprobe **kps, int num);
+void unregister_kretprobes(struct kretprobe **rps, int num);
+void unregister_jprobes(struct jprobe **jps, int num);
+
+Removes each of the num probes in the specified array at once.
+
+NOTE:
+If the functions find some incorrect probes (ex. unregistered
+probes) in the specified array, they clear the addr field of those
+incorrect probes. However, other probes in the array are
+unregistered correctly.
+
+5. Kprobes Features and Limitations
+
+Kprobes allows multiple probes at the same address. Currently,
+however, there cannot be multiple jprobes on the same function at
+the same time.
+
+In general, you can install a probe anywhere in the kernel.
+In particular, you can probe interrupt handlers. Known exceptions
+are discussed in this section.
+
+The register_*probe functions will return -EINVAL if you attempt
+to install a probe in the code that implements Kprobes (mostly
+kernel/kprobes.c and arch/*/kernel/kprobes.c, but also functions such
+as do_page_fault and notifier_call_chain).
+
+If you install a probe in an inline-able function, Kprobes makes
+no attempt to chase down all inline instances of the function and
+install probes there. gcc may inline a function without being asked,
+so keep this in mind if you're not seeing the probe hits you expect.
+
+A probe handler can modify the environment of the probed function
+-- e.g., by modifying kernel data structures, or by modifying the
+contents of the pt_regs struct (which are restored to the registers
+upon return from the breakpoint). So Kprobes can be used, for example,
+to install a bug fix or to inject faults for testing. Kprobes, of
+course, has no way to distinguish the deliberately injected faults
+from the accidental ones. Don't drink and probe.
+
+Kprobes makes no attempt to prevent probe handlers from stepping on
+each other -- e.g., probing printk() and then calling printk() from a
+probe handler. If a probe handler hits a probe, that second probe's
+handlers won't be run in that instance, and the kprobe.nmissed member
+of the second probe will be incremented.
+
+As of Linux v2.6.15-rc1, multiple handlers (or multiple instances of
+the same handler) may run concurrently on different CPUs.
+
+Kprobes does not use mutexes or allocate memory except during
+registration and unregistration.
+
+Probe handlers are run with preemption disabled. Depending on the
+architecture, handlers may also run with interrupts disabled. In any
+case, your handler should not yield the CPU (e.g., by attempting to
+acquire a semaphore).
+
+Since a return probe is implemented by replacing the return
+address with the trampoline's address, stack backtraces and calls
+to __builtin_return_address() will typically yield the trampoline's
+address instead of the real return address for kretprobed functions.
+(As far as we can tell, __builtin_return_address() is used only
+for instrumentation and error reporting.)
+
+If the number of times a function is called does not match the number
+of times it returns, registering a return probe on that function may
+produce undesirable results. In such a case, a line:
+kretprobe BUG!: Processing kretprobe d000000000041aa8 @ c00000000004f48c
+gets printed. With this information, one will be able to correlate the
+exact instance of the kretprobe that caused the problem. We have the
+do_exit() case covered. do_execve() and do_fork() are not an issue.
+We're unaware of other specific cases where this could be a problem.
+
+If, upon entry to or exit from a function, the CPU is running on
+a stack other than that of the current task, registering a return
+probe on that function may produce undesirable results. For this
+reason, Kprobes doesn't support return probes (or kprobes or jprobes)
+on the x86_64 version of __switch_to(); the registration functions
+return -EINVAL.
+
+6. Probe Overhead
+
+On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0
+microseconds to process. Specifically, a benchmark that hits the same
+probepoint repeatedly, firing a simple handler each time, reports 1-2
+million hits per second, depending on the architecture. A jprobe or
+return-probe hit typically takes 50-75% longer than a kprobe hit.
+When you have a return probe set on a function, adding a kprobe at
+the entry to that function adds essentially no overhead.
+
+Here are sample overhead figures (in usec) for different architectures.
+k = kprobe; j = jprobe; r = return probe; kr = kprobe + return probe
+on same function; jr = jprobe + return probe on same function
+
+i386: Intel Pentium M, 1495 MHz, 2957.31 bogomips
+k = 0.57 usec; j = 1.00; r = 0.92; kr = 0.99; jr = 1.40
+
+x86_64: AMD Opteron 246, 1994 MHz, 3971.48 bogomips
+k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07
+
+ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU)
+k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99
+
+7. TODO
+
+a. SystemTap (http://sourceware.org/systemtap): Provides a simplified
+programming interface for probe-based instrumentation. Try it out.
+b. Kernel return probes for sparc64.
+c. Support for other architectures.
+d. User-space probes.
+e. Watchpoint probes (which fire on data references).
+
+8. Kprobes Example
+
+See samples/kprobes/kprobe_example.c
+
+9. Jprobes Example
+
+See samples/kprobes/jprobe_example.c
+
+10. Kretprobes Example
+
+See samples/kprobes/kretprobe_example.c
+
+For additional information on Kprobes, refer to the following URLs:
+http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe
+http://www.redhat.com/magazine/005mar05/features/kprobes/
+http://www-users.cs.umn.edu/~boutcher/kprobes/
+http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115)
+
+
+Appendix A: The kprobes debugfs interface
+
+With recent kernels (> 2.6.20) the list of registered kprobes is visible
+under the /debug/kprobes/ directory (assuming debugfs is mounted at /debug).
+
+/debug/kprobes/list: Lists all registered probes on the system
+
+c015d71a k vfs_read+0x0
+c011a316 j do_fork+0x0
+c03dedc5 r tcp_v4_rcv+0x0
+
+The first column provides the kernel address where the probe is inserted.
+The second column identifies the type of probe (k - kprobe, r - kretprobe
+and j - jprobe), while the third column specifies the symbol+offset of
+the probe. If the probed function belongs to a module, the module name
+is also specified.
+
+/debug/kprobes/enabled: Turn kprobes ON/OFF
+
+Provides a knob to globally turn registered kprobes ON or OFF. By default,
+all kprobes are enabled. By echoing "0" to this file, all registered probes
+will be disarmed, till such time a "1" is echoed to this file.
diff --git a/Documentation/kref.txt b/Documentation/kref.txt
new file mode 100644
index 0000000..130b6e8
--- /dev/null
+++ b/Documentation/kref.txt
@@ -0,0 +1,216 @@
+
+krefs allow you to add reference counters to your objects. If you
+have objects that are used in multiple places and passed around, and
+you don't have refcounts, your code is almost certainly broken. If
+you want refcounts, krefs are the way to go.
+
+To use a kref, add one to your data structures like:
+
+struct my_data
+{
+ .
+ .
+ struct kref refcount;
+ .
+ .
+};
+
+The kref can occur anywhere within the data structure.
+
+You must initialize the kref after you allocate it. To do this, call
+kref_init as so:
+
+ struct my_data *data;
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ kref_init(&data->refcount);
+
+This sets the refcount in the kref to 1.
+
+Once you have an initialized kref, you must follow the following
+rules:
+
+1) If you make a non-temporary copy of a pointer, especially if
+ it can be passed to another thread of execution, you must
+ increment the refcount with kref_get() before passing it off:
+ kref_get(&data->refcount);
+ If you already have a valid pointer to a kref-ed structure (the
+ refcount cannot go to zero) you may do this without a lock.
+
+2) When you are done with a pointer, you must call kref_put():
+ kref_put(&data->refcount, data_release);
+ If this is the last reference to the pointer, the release
+ routine will be called. If the code never tries to get
+ a valid pointer to a kref-ed structure without already
+ holding a valid pointer, it is safe to do this without
+ a lock.
+
+3) If the code attempts to gain a reference to a kref-ed structure
+ without already holding a valid pointer, it must serialize access
+ where a kref_put() cannot occur during the kref_get(), and the
+ structure must remain valid during the kref_get().
+
+For example, if you allocate some data and then pass it to another
+thread to process:
+
+void data_release(struct kref *ref)
+{
+ struct my_data *data = container_of(ref, struct my_data, refcount);
+ kfree(data);
+}
+
+void more_data_handling(void *cb_data)
+{
+ struct my_data *data = cb_data;
+ .
+ . do stuff with data here
+ .
+ kref_put(&data->refcount, data_release);
+}
+
+int my_data_handler(void)
+{
+ int rv = 0;
+ struct my_data *data;
+ struct task_struct *task;
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ kref_init(&data->refcount);
+
+ kref_get(&data->refcount);
+ task = kthread_run(more_data_handling, data, "more_data_handling");
+ if (task == ERR_PTR(-ENOMEM)) {
+ rv = -ENOMEM;
+ kref_put(&data->refcount, data_release);
+ goto out;
+ }
+
+ .
+ . do stuff with data here
+ .
+ out:
+ kref_put(&data->refcount, data_release);
+ return rv;
+}
+
+This way, it doesn't matter what order the two threads handle the
+data, the kref_put() handles knowing when the data is not referenced
+any more and releasing it. The kref_get() does not require a lock,
+since we already have a valid pointer that we own a refcount for. The
+put needs no lock because nothing tries to get the data without
+already holding a pointer.
+
+Note that the "before" in rule 1 is very important. You should never
+do something like:
+
+ task = kthread_run(more_data_handling, data, "more_data_handling");
+ if (task == ERR_PTR(-ENOMEM)) {
+ rv = -ENOMEM;
+ goto out;
+ } else
+ /* BAD BAD BAD - get is after the handoff */
+ kref_get(&data->refcount);
+
+Don't assume you know what you are doing and use the above construct.
+First of all, you may not know what you are doing. Second, you may
+know what you are doing (there are some situations where locking is
+involved where the above may be legal) but someone else who doesn't
+know what they are doing may change the code or copy the code. It's
+bad style. Don't do it.
+
+There are some situations where you can optimize the gets and puts.
+For instance, if you are done with an object and enqueuing it for
+something else or passing it off to something else, there is no reason
+to do a get then a put:
+
+ /* Silly extra get and put */
+ kref_get(&obj->ref);
+ enqueue(obj);
+ kref_put(&obj->ref, obj_cleanup);
+
+Just do the enqueue. A comment about this is always welcome:
+
+ enqueue(obj);
+ /* We are done with obj, so we pass our refcount off
+ to the queue. DON'T TOUCH obj AFTER HERE! */
+
+The last rule (rule 3) is the nastiest one to handle. Say, for
+instance, you have a list of items that are each kref-ed, and you wish
+to get the first one. You can't just pull the first item off the list
+and kref_get() it. That violates rule 3 because you are not already
+holding a valid pointer. You must add a mutex (or some other lock).
+For instance:
+
+static DEFINE_MUTEX(mutex);
+static LIST_HEAD(q);
+struct my_data
+{
+ struct kref refcount;
+ struct list_head link;
+};
+
+static struct my_data *get_entry()
+{
+ struct my_data *entry = NULL;
+ mutex_lock(&mutex);
+ if (!list_empty(&q)) {
+ entry = container_of(q.next, struct my_q_entry, link);
+ kref_get(&entry->refcount);
+ }
+ mutex_unlock(&mutex);
+ return entry;
+}
+
+static void release_entry(struct kref *ref)
+{
+ struct my_data *entry = container_of(ref, struct my_data, refcount);
+
+ list_del(&entry->link);
+ kfree(entry);
+}
+
+static void put_entry(struct my_data *entry)
+{
+ mutex_lock(&mutex);
+ kref_put(&entry->refcount, release_entry);
+ mutex_unlock(&mutex);
+}
+
+The kref_put() return value is useful if you do not want to hold the
+lock during the whole release operation. Say you didn't want to call
+kfree() with the lock held in the example above (since it is kind of
+pointless to do so). You could use kref_put() as follows:
+
+static void release_entry(struct kref *ref)
+{
+ /* All work is done after the return from kref_put(). */
+}
+
+static void put_entry(struct my_data *entry)
+{
+ mutex_lock(&mutex);
+ if (kref_put(&entry->refcount, release_entry)) {
+ list_del(&entry->link);
+ mutex_unlock(&mutex);
+ kfree(entry);
+ } else
+ mutex_unlock(&mutex);
+}
+
+This is really more useful if you have to call other routines as part
+of the free operations that could take a long time or might claim the
+same lock. Note that doing everything in the release routine is still
+preferred as it is a little neater.
+
+
+Corey Minyard <minyard@acm.org>
+
+A lot of this was lifted from Greg Kroah-Hartman's 2004 OLS paper and
+presentation on krefs, which can be found at:
+ http://www.kroah.com/linux/talks/ols_2004_kref_paper/Reprint-Kroah-Hartman-OLS2004.pdf
+and:
+ http://www.kroah.com/linux/talks/ols_2004_kref_talk/
+
diff --git a/Documentation/laptops/00-INDEX b/Documentation/laptops/00-INDEX
new file mode 100644
index 0000000..ee5692b
--- /dev/null
+++ b/Documentation/laptops/00-INDEX
@@ -0,0 +1,12 @@
+00-INDEX
+ - This file
+acer-wmi.txt
+ - information on the Acer Laptop WMI Extras driver.
+laptop-mode.txt
+ - how to conserve battery power using laptop-mode.
+sony-laptop.txt
+ - Sony Notebook Control Driver (SNC) Readme.
+sonypi.txt
+ - info on Linux Sony Programmable I/O Device support.
+thinkpad-acpi.txt
+ - information on the (IBM and Lenovo) ThinkPad ACPI Extras driver.
diff --git a/Documentation/laptops/acer-wmi.txt b/Documentation/laptops/acer-wmi.txt
new file mode 100644
index 0000000..2b3a6b5
--- /dev/null
+++ b/Documentation/laptops/acer-wmi.txt
@@ -0,0 +1,180 @@
+Acer Laptop WMI Extras Driver
+http://code.google.com/p/aceracpi
+Version 0.2
+18th August 2008
+
+Copyright 2007-2008 Carlos Corbacho <carlos@strangeworlds.co.uk>
+
+acer-wmi is a driver to allow you to control various parts of your Acer laptop
+hardware under Linux which are exposed via ACPI-WMI.
+
+This driver completely replaces the old out-of-tree acer_acpi, which I am
+currently maintaining for bug fixes only on pre-2.6.25 kernels. All development
+work is now focused solely on acer-wmi.
+
+Disclaimer
+**********
+
+Acer and Wistron have provided nothing towards the development acer_acpi or
+acer-wmi. All information we have has been through the efforts of the developers
+and the users to discover as much as possible about the hardware.
+
+As such, I do warn that this could break your hardware - this is extremely
+unlikely of course, but please bear this in mind.
+
+Background
+**********
+
+acer-wmi is derived from acer_acpi, originally developed by Mark
+Smith in 2005, then taken over by Carlos Corbacho in 2007, in order to activate
+the wireless LAN card under a 64-bit version of Linux, as acerhk[1] (the
+previous solution to the problem) relied on making 32 bit BIOS calls which are
+not possible in kernel space from a 64 bit OS.
+
+[1] acerhk: http://www.cakey.de/acerhk/
+
+Supported Hardware
+******************
+
+Please see the website for the current list of known working hardare:
+
+http://code.google.com/p/aceracpi/wiki/SupportedHardware
+
+If your laptop is not listed, or listed as unknown, and works with acer-wmi,
+please contact me with a copy of the DSDT.
+
+If your Acer laptop doesn't work with acer-wmi, I would also like to see the
+DSDT.
+
+To send me the DSDT, as root/sudo:
+
+cat /sys/firmware/acpi/tables/DSDT > dsdt
+
+And send me the resulting 'dsdt' file.
+
+Usage
+*****
+
+On Acer laptops, acer-wmi should already be autoloaded based on DMI matching.
+For non-Acer laptops, until WMI based autoloading support is added, you will
+need to manually load acer-wmi.
+
+acer-wmi creates /sys/devices/platform/acer-wmi, and fills it with various
+files whose usage is detailed below, which enables you to control some of the
+following (varies between models):
+
+* the wireless LAN card radio
+* inbuilt Bluetooth adapter
+* inbuilt 3G card
+* mail LED of your laptop
+* brightness of the LCD panel
+
+Wireless
+********
+
+With regards to wireless, all acer-wmi does is enable the radio on the card. It
+is not responsible for the wireless LED - once the radio is enabled, this is
+down to the wireless driver for your card. So the behaviour of the wireless LED,
+once you enable the radio, will depend on your hardware and driver combination.
+
+e.g. With the BCM4318 on the Acer Aspire 5020 series:
+
+ndiswrapper: Light blinks on when transmitting
+b43: Solid light, blinks off when transmitting
+
+Wireless radio control is unconditionally enabled - all Acer laptops that support
+acer-wmi come with built-in wireless. However, should you feel so inclined to
+ever wish to remove the card, or swap it out at some point, please get in touch
+with me, as we may well be able to gain some data on wireless card detection.
+
+The wireless radio is exposed through rfkill.
+
+Bluetooth
+*********
+
+For bluetooth, this is an internal USB dongle, so once enabled, you will get
+a USB device connection event, and a new USB device appears. When you disable
+bluetooth, you get the reverse - a USB device disconnect event, followed by the
+device disappearing again.
+
+Bluetooth is autodetected by acer-wmi, so if you do not have a bluetooth module
+installed in your laptop, this file won't exist (please be aware that it is
+quite common for Acer not to fit bluetooth to their laptops - so just because
+you have a bluetooth button on the laptop, doesn't mean that bluetooth is
+installed).
+
+For the adventurously minded - if you want to buy an internal bluetooth
+module off the internet that is compatible with your laptop and fit it, then
+it will work just fine with acer-wmi.
+
+Bluetooth is exposed through rfkill.
+
+3G
+**
+
+3G is currently not autodetected, so the 'threeg' file is always created under
+sysfs. So far, no-one in possession of an Acer laptop with 3G built-in appears to
+have tried Linux, or reported back, so we don't have any information on this.
+
+If you have an Acer laptop that does have a 3G card in, please contact me so we
+can properly detect these, and find out a bit more about them.
+
+To read the status of the 3G card (0=off, 1=on):
+cat /sys/devices/platform/acer-wmi/threeg
+
+To enable the 3G card:
+echo 1 > /sys/devices/platform/acer-wmi/threeg
+
+To disable the 3G card:
+echo 0 > /sys/devices/platform/acer-wmi/threeg
+
+To set the state of the 3G card when loading acer-wmi, pass:
+threeg=X (where X is 0 or 1)
+
+Mail LED
+********
+
+This can be found in most older Acer laptops supported by acer-wmi, and many
+newer ones - it is built into the 'mail' button, and blinks when active.
+
+On newer (WMID) laptops though, we have no way of detecting the mail LED. If
+your laptop identifies itself in dmesg as a WMID model, then please try loading
+acer_acpi with:
+
+force_series=2490
+
+This will use a known alternative method of reading/ writing the mail LED. If
+it works, please report back to me with the DMI data from your laptop so this
+can be added to acer-wmi.
+
+The LED is exposed through the LED subsystem, and can be found in:
+
+/sys/devices/platform/acer-wmi/leds/acer-wmi::mail/
+
+The mail LED is autodetected, so if you don't have one, the LED device won't
+be registered.
+
+Backlight
+*********
+
+The backlight brightness control is available on all acer-wmi supported
+hardware. The maximum brightness level is usually 15, but on some newer laptops
+it's 10 (this is again autodetected).
+
+The backlight is exposed through the backlight subsystem, and can be found in:
+
+/sys/devices/platform/acer-wmi/backlight/acer-wmi/
+
+Credits
+*******
+
+Olaf Tauber, who did the real hard work when he developed acerhk
+http://www.informatik.hu-berlin.de/~tauber/acerhk
+All the authors of laptop ACPI modules in the kernel, whose work
+was an inspiration in the early days of acer_acpi
+Mathieu Segaud, who solved the problem with having to modprobe the driver
+twice in acer_acpi 0.2.
+Jim Ramsay, who added support for the WMID interface
+Mark Smith, who started the original acer_acpi
+
+And the many people who have used both acer_acpi and acer-wmi.
diff --git a/Documentation/laptops/disk-shock-protection.txt b/Documentation/laptops/disk-shock-protection.txt
new file mode 100644
index 0000000..0e6ba26
--- /dev/null
+++ b/Documentation/laptops/disk-shock-protection.txt
@@ -0,0 +1,149 @@
+Hard disk shock protection
+==========================
+
+Author: Elias Oltmanns <eo@nebensachen.de>
+Last modified: 2008-10-03
+
+
+0. Contents
+-----------
+
+1. Intro
+2. The interface
+3. References
+4. CREDITS
+
+
+1. Intro
+--------
+
+ATA/ATAPI-7 specifies the IDLE IMMEDIATE command with unload feature.
+Issuing this command should cause the drive to switch to idle mode and
+unload disk heads. This feature is being used in modern laptops in
+conjunction with accelerometers and appropriate software to implement
+a shock protection facility. The idea is to stop all I/O operations on
+the internal hard drive and park its heads on the ramp when critical
+situations are anticipated. The desire to have such a feature
+available on GNU/Linux systems has been the original motivation to
+implement a generic disk head parking interface in the Linux kernel.
+Please note, however, that other components have to be set up on your
+system in order to get disk shock protection working (see
+section 3. References below for pointers to more information about
+that).
+
+
+2. The interface
+----------------
+
+For each ATA device, the kernel exports the file
+block/*/device/unload_heads in sysfs (here assumed to be mounted under
+/sys). Access to /sys/block/*/device/unload_heads is denied with
+-EOPNOTSUPP if the device does not support the unload feature.
+Otherwise, writing an integer value to this file will take the heads
+of the respective drive off the platter and block all I/O operations
+for the specified number of milliseconds. When the timeout expires and
+no further disk head park request has been issued in the meantime,
+normal operation will be resumed. The maximal value accepted for a
+timeout is 30000 milliseconds. Exceeding this limit will return
+-EOVERFLOW, but heads will be parked anyway and the timeout will be
+set to 30 seconds. However, you can always change a timeout to any
+value between 0 and 30000 by issuing a subsequent head park request
+before the timeout of the previous one has expired. In particular, the
+total timeout can exceed 30 seconds and, more importantly, you can
+cancel a previously set timeout and resume normal operation
+immediately by specifying a timeout of 0. Values below -2 are rejected
+with -EINVAL (see below for the special meaning of -1 and -2). If the
+timeout specified for a recent head park request has not yet expired,
+reading from /sys/block/*/device/unload_heads will report the number
+of milliseconds remaining until normal operation will be resumed;
+otherwise, reading the unload_heads attribute will return 0.
+
+For example, do the following in order to park the heads of drive
+/dev/sda and stop all I/O operations for five seconds:
+
+# echo 5000 > /sys/block/sda/device/unload_heads
+
+A simple
+
+# cat /sys/block/sda/device/unload_heads
+
+will show you how many milliseconds are left before normal operation
+will be resumed.
+
+A word of caution: The fact that the interface operates on a basis of
+milliseconds may raise expectations that cannot be satisfied in
+reality. In fact, the ATA specs clearly state that the time for an
+unload operation to complete is vendor specific. The hint in ATA-7
+that this will typically be within 500 milliseconds apparently has
+been dropped in ATA-8.
+
+There is a technical detail of this implementation that may cause some
+confusion and should be discussed here. When a head park request has
+been issued to a device successfully, all I/O operations on the
+controller port this device is attached to will be deferred. That is
+to say, any other device that may be connected to the same port will
+be affected too. The only exception is that a subsequent head unload
+request to that other device will be executed immediately. Further
+operations on that port will be deferred until the timeout specified
+for either device on the port has expired. As far as PATA (old style
+IDE) configurations are concerned, there can only be two devices
+attached to any single port. In SATA world we have port multipliers
+which means that a user-issued head parking request to one device may
+actually result in stopping I/O to a whole bunch of devices. However,
+since this feature is supposed to be used on laptops and does not seem
+to be very useful in any other environment, there will be mostly one
+device per port. Even if the CD/DVD writer happens to be connected to
+the same port as the hard drive, it generally *should* recover just
+fine from the occasional buffer under-run incurred by a head park
+request to the HD. Actually, when you are using an ide driver rather
+than its libata counterpart (i.e. your disk is called /dev/hda
+instead of /dev/sda), then parking the heads of one drive (drive X)
+will generally not affect the mode of operation of another drive
+(drive Y) on the same port as described above. It is only when a port
+reset is required to recover from an exception on drive Y that further
+I/O operations on that drive (and the reset itself) will be delayed
+until drive X is no longer in the parked state.
+
+Finally, there are some hard drives that only comply with an earlier
+version of the ATA standard than ATA-7, but do support the unload
+feature nonetheless. Unfortunately, there is no safe way Linux can
+detect these devices, so you won't be able to write to the
+unload_heads attribute. If you know that your device really does
+support the unload feature (for instance, because the vendor of your
+laptop or the hard drive itself told you so), then you can tell the
+kernel to enable the usage of this feature for that drive by writing
+the special value -1 to the unload_heads attribute:
+
+# echo -1 > /sys/block/sda/device/unload_heads
+
+will enable the feature for /dev/sda, and giving -2 instead of -1 will
+disable it again.
+
+
+3. References
+-------------
+
+There are several laptops from different vendors featuring shock
+protection capabilities. As manufacturers have refused to support open
+source development of the required software components so far, Linux
+support for shock protection varies considerably between different
+hardware implementations. Ideally, this section should contain a list
+of pointers at different projects aiming at an implementation of shock
+protection on different systems. Unfortunately, I only know of a
+single project which, although still considered experimental, is fit
+for use. Please feel free to add projects that have been the victims
+of my ignorance.
+
+- http://www.thinkwiki.org/wiki/HDAPS
+ See this page for information about Linux support of the hard disk
+ active protection system as implemented in IBM/Lenovo Thinkpads.
+
+
+4. CREDITS
+----------
+
+This implementation of disk head parking has been inspired by a patch
+originally published by Jon Escombe <lists@dresco.co.uk>. My efforts
+to develop an implementation of this feature that is fit to be merged
+into mainline have been aided by various kernel developers, in
+particular by Tejun Heo and Bartlomiej Zolnierkiewicz.
diff --git a/Documentation/laptops/laptop-mode.txt b/Documentation/laptops/laptop-mode.txt
new file mode 100644
index 0000000..eeedee1
--- /dev/null
+++ b/Documentation/laptops/laptop-mode.txt
@@ -0,0 +1,950 @@
+How to conserve battery power using laptop-mode
+-----------------------------------------------
+
+Document Author: Bart Samwel (bart@samwel.tk)
+Date created: January 2, 2004
+Last modified: December 06, 2004
+
+Introduction
+------------
+
+Laptop mode is used to minimize the time that the hard disk needs to be spun up,
+to conserve battery power on laptops. It has been reported to cause significant
+power savings.
+
+Contents
+--------
+
+* Introduction
+* Installation
+* Caveats
+* The Details
+* Tips & Tricks
+* Control script
+* ACPI integration
+* Monitoring tool
+
+
+Installation
+------------
+
+To use laptop mode, you don't need to set any kernel configuration options
+or anything. Simply install all the files included in this document, and
+laptop mode will automatically be started when you're on battery. For
+your convenience, a tarball containing an installer can be downloaded at:
+
+http://www.samwel.tk/laptop_mode/laptop_mode/
+
+To configure laptop mode, you need to edit the configuration file, which is
+located in /etc/default/laptop-mode on Debian-based systems, or in
+/etc/sysconfig/laptop-mode on other systems.
+
+Unfortunately, automatic enabling of laptop mode does not work for
+laptops that don't have ACPI. On those laptops, you need to start laptop
+mode manually. To start laptop mode, run "laptop_mode start", and to
+stop it, run "laptop_mode stop". (Note: The laptop mode tools package now
+has experimental support for APM, you might want to try that first.)
+
+
+Caveats
+-------
+
+* The downside of laptop mode is that you have a chance of losing up to 10
+ minutes of work. If you cannot afford this, don't use it! The supplied ACPI
+ scripts automatically turn off laptop mode when the battery almost runs out,
+ so that you won't lose any data at the end of your battery life.
+
+* Most desktop hard drives have a very limited lifetime measured in spindown
+ cycles, typically about 50.000 times (it's usually listed on the spec sheet).
+ Check your drive's rating, and don't wear down your drive's lifetime if you
+ don't need to.
+
+* If you mount some of your ext3/reiserfs filesystems with the -n option, then
+ the control script will not be able to remount them correctly. You must set
+ DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
+ wrong options -- or it will fail because it cannot write to /etc/mtab.
+
+* If you have your filesystems listed as type "auto" in fstab, like I did, then
+ the control script will not recognize them as filesystems that need remounting.
+ You must list the filesystems with their true type instead.
+
+* It has been reported that some versions of the mutt mail client use file access
+ times to determine whether a folder contains new mail. If you use mutt and
+ experience this, you must disable the noatime remounting by setting the option
+ DO_REMOUNT_NOATIME to 0 in the configuration file.
+
+
+The Details
+-----------
+
+Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is
+present for all kernels that have the laptop mode patch, regardless of any
+configuration options. When the knob is set, any physical disk I/O (that might
+have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The
+result of this is that after a disk has spun down, it will not be spun up
+anymore to write dirty blocks, because those blocks had already been written
+immediately after the most recent read operation. The value of the laptop_mode
+knob determines the time between the occurrence of disk I/O and when the flush
+is triggered. A sensible value for the knob is 5 seconds. Setting the knob to
+0 disables laptop mode.
+
+To increase the effectiveness of the laptop_mode strategy, the laptop_mode
+control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
+/proc/sys/vm to about 10 minutes (by default), which means that pages that are
+dirtied are not forced to be written to disk as often. The control script also
+changes the dirty background ratio, so that background writeback of dirty pages
+is not done anymore. Combined with a higher commit value (also 10 minutes) for
+ext3 or ReiserFS filesystems (also done automatically by the control script),
+this results in concentration of disk activity in a small time interval which
+occurs only once every 10 minutes, or whenever the disk is forced to spin up by
+a cache miss. The disk can then be spun down in the periods of inactivity.
+
+If you want to find out which process caused the disk to spin up, you can
+gather information by setting the flag /proc/sys/vm/block_dump. When this flag
+is set, Linux reports all disk read and write operations that take place, and
+all block dirtyings done to files. This makes it possible to debug why a disk
+needs to spin up, and to increase battery life even more. The output of
+block_dump is written to the kernel output, and it can be retrieved using
+"dmesg". When you use block_dump and your kernel logging level also includes
+kernel debugging messages, you probably want to turn off klogd, otherwise
+the output of block_dump will be logged, causing disk activity that is not
+normally there.
+
+
+Configuration
+-------------
+
+The laptop mode configuration file is located in /etc/default/laptop-mode on
+Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It
+contains the following options:
+
+MAX_AGE:
+
+Maximum time, in seconds, of hard drive spindown time that you are
+comfortable with. Worst case, it's possible that you could lose this
+amount of work if your battery fails while you're in laptop mode.
+
+MINIMUM_BATTERY_MINUTES:
+
+Automatically disable laptop mode if the remaining number of minutes of
+battery power is less than this value. Default is 10 minutes.
+
+AC_HD/BATT_HD:
+
+The idle timeout that should be set on your hard drive when laptop mode
+is active (BATT_HD) and when it is not active (AC_HD). The defaults are
+20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. The
+possible values are those listed in the manual page for "hdparm" for the
+"-S" option.
+
+HD:
+
+The devices for which the spindown timeout should be adjusted by laptop mode.
+Default is /dev/hda. If you specify multiple devices, separate them by a space.
+
+READAHEAD:
+
+Disk readahead, in 512-byte sectors, while laptop mode is active. A large
+readahead can prevent disk accesses for things like executable pages (which are
+loaded on demand while the application executes) and sequentially accessed data
+(MP3s).
+
+DO_REMOUNTS:
+
+The control script automatically remounts any mounted journaled filesystems
+with appropriate commit interval options. When this option is set to 0, this
+feature is disabled.
+
+DO_REMOUNT_NOATIME:
+
+When remounting, should the filesystems be remounted with the noatime option?
+Normally, this is set to "1" (enabled), but there may be programs that require
+access time recording.
+
+DIRTY_RATIO:
+
+The percentage of memory that is allowed to contain "dirty" or unsaved data
+before a writeback is forced, while laptop mode is active. Corresponds to
+the /proc/sys/vm/dirty_ratio sysctl.
+
+DIRTY_BACKGROUND_RATIO:
+
+The percentage of memory that is allowed to contain "dirty" or unsaved data
+after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set
+this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio
+sysctl.
+
+Note that the behaviour of dirty_background_ratio is quite different
+when laptop mode is active and when it isn't. When laptop mode is inactive,
+dirty_background_ratio is the threshold percentage at which background writeouts
+start taking place. When laptop mode is active, however, background writeouts
+are disabled, and the dirty_background_ratio only determines how much writeback
+is done when dirty_ratio is reached.
+
+DO_CPU:
+
+Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup.
+See Documentation/cpu-freq/user-guide.txt for more info. Disabled by default.)
+
+CPU_MAXFREQ:
+
+When on battery, what is the maximum CPU speed that the system should use? Legal
+values are "slowest" for the slowest speed that your CPU is able to operate at,
+or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies.
+
+
+Tips & Tricks
+-------------
+
+* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
+ of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1).
+
+* You can spin down the disk while playing MP3, by setting disk readahead
+ to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at
+ once, and will then spin down while the MP3 is playing. (Thanks to Bartek
+ Kania.)
+
+* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
+ of colours that my display uses it consumes less battery power. I've seen
+ this on powerbooks too. I hope that this is a piece of information that
+ might be useful to the Laptop Mode patch or it's users."
+
+* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the
+ file after every logging. When you're using laptop-mode and your disk doesn't
+ spin down, this is a likely culprit.
+
+* Richard Atterer observed that laptop mode does not work well with noflushd
+ (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
+ from doing its thing.
+
+* If you're worried about your data, you might want to consider using a USB
+ memory stick or something like that as a "working area". (Be aware though
+ that flash memory can only handle a limited number of writes, and overuse
+ may wear out your memory stick pretty quickly. Do _not_ use journalling
+ filesystems on flash memory sticks.)
+
+
+Configuration file for control and ACPI battery scripts
+-------------------------------------------------------
+
+This allows the tunables to be changed for the scripts via an external
+configuration file
+
+It should be installed as /etc/default/laptop-mode on Debian, and as
+/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes.
+
+--------------------CONFIG FILE BEGIN-------------------------------------------
+# Maximum time, in seconds, of hard drive spindown time that you are
+# comfortable with. Worst case, it's possible that you could lose this
+# amount of work if your battery fails you while in laptop mode.
+#MAX_AGE=600
+
+# Automatically disable laptop mode when the number of minutes of battery
+# that you have left goes below this threshold.
+MINIMUM_BATTERY_MINUTES=10
+
+# Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG
+# by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk
+# will read a complete MP3 at once, and will then spin down while the MP3/OGG is
+# playing.
+#READAHEAD=4096
+
+# Shall we remount journaled fs. with appropriate commit interval? (1=yes)
+#DO_REMOUNTS=1
+
+# And shall we add the "noatime" option to that as well? (1=yes)
+#DO_REMOUNT_NOATIME=1
+
+# Dirty synchronous ratio. At this percentage of dirty pages the process
+# which
+# calls write() does its own writeback
+#DIRTY_RATIO=40
+
+#
+# Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been
+# exceeded, the kernel will wake pdflush which will then reduce the amount
+# of dirty memory to dirty_background_ratio. Set this nice and low, so once
+# some writeout has commenced, we do a lot of it.
+#
+#DIRTY_BACKGROUND_RATIO=5
+
+# kernel default dirty buffer age
+#DEF_AGE=30
+#DEF_UPDATE=5
+#DEF_DIRTY_BACKGROUND_RATIO=10
+#DEF_DIRTY_RATIO=40
+#DEF_XFS_AGE_BUFFER=15
+#DEF_XFS_SYNC_INTERVAL=30
+#DEF_XFS_BUFD_INTERVAL=1
+
+# This must be adjusted manually to the value of HZ in the running kernel
+# on 2.4, until the XFS people change their 2.4 external interfaces to work in
+# centisecs. This can be automated, but it's a work in progress that still
+# needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for
+# external interfaces, and that is currently always set to 100. So you don't
+# need to change this on 2.6.
+#XFS_HZ=100
+
+# Should the maximum CPU frequency be adjusted down while on battery?
+# Requires CPUFreq to be setup.
+# See Documentation/cpu-freq/user-guide.txt for more info
+#DO_CPU=0
+
+# When on battery what is the maximum CPU speed that the system should
+# use? Legal values are "slowest" for the slowest speed that your
+# CPU is able to operate at, or a value listed in:
+# /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
+# Only applicable if DO_CPU=1.
+#CPU_MAXFREQ=slowest
+
+# Idle timeout for your hard drive (man hdparm for valid values, -S option)
+# Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4).
+#AC_HD=244
+#BATT_HD=4
+
+# The drives for which to adjust the idle timeout. Separate them by a space,
+# e.g. HD="/dev/hda /dev/hdb".
+#HD="/dev/hda"
+
+# Set the spindown timeout on a hard drive?
+#DO_HD=1
+
+--------------------CONFIG FILE END---------------------------------------------
+
+
+Control script
+--------------
+
+Please note that this control script works for the Linux 2.4 and 2.6 series (thanks
+to Kiko Piris).
+
+--------------------CONTROL SCRIPT BEGIN----------------------------------------
+#!/bin/bash
+
+# start or stop laptop_mode, best run by a power management daemon when
+# ac gets connected/disconnected from a laptop
+#
+# install as /sbin/laptop_mode
+#
+# Contributors to this script: Kiko Piris
+# Bart Samwel
+# Micha Feigin
+# Andrew Morton
+# Herve Eychenne
+# Dax Kelson
+#
+# Original Linux 2.4 version by: Jens Axboe
+
+#############################################################################
+
+# Source config
+if [ -f /etc/default/laptop-mode ] ; then
+ # Debian
+ . /etc/default/laptop-mode
+elif [ -f /etc/sysconfig/laptop-mode ] ; then
+ # Others
+ . /etc/sysconfig/laptop-mode
+fi
+
+# Don't raise an error if the config file is incomplete
+# set defaults instead:
+
+# Maximum time, in seconds, of hard drive spindown time that you are
+# comfortable with. Worst case, it's possible that you could lose this
+# amount of work if your battery fails you while in laptop mode.
+MAX_AGE=${MAX_AGE:-'600'}
+
+# Read-ahead, in kilobytes
+READAHEAD=${READAHEAD:-'4096'}
+
+# Shall we remount journaled fs. with appropriate commit interval? (1=yes)
+DO_REMOUNTS=${DO_REMOUNTS:-'1'}
+
+# And shall we add the "noatime" option to that as well? (1=yes)
+DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'}
+
+# Shall we adjust the idle timeout on a hard drive?
+DO_HD=${DO_HD:-'1'}
+
+# Adjust idle timeout on which hard drive?
+HD="${HD:-'/dev/hda'}"
+
+# spindown time for HD (hdparm -S values)
+AC_HD=${AC_HD:-'244'}
+BATT_HD=${BATT_HD:-'4'}
+
+# Dirty synchronous ratio. At this percentage of dirty pages the process which
+# calls write() does its own writeback
+DIRTY_RATIO=${DIRTY_RATIO:-'40'}
+
+# cpu frequency scaling
+# See Documentation/cpu-freq/user-guide.txt for more info
+DO_CPU=${CPU_MANAGE:-'0'}
+CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'}
+
+#
+# Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been
+# exceeded, the kernel will wake pdflush which will then reduce the amount
+# of dirty memory to dirty_background_ratio. Set this nice and low, so once
+# some writeout has commenced, we do a lot of it.
+#
+DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'}
+
+# kernel default dirty buffer age
+DEF_AGE=${DEF_AGE:-'30'}
+DEF_UPDATE=${DEF_UPDATE:-'5'}
+DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'}
+DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'}
+DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'}
+DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'}
+DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'}
+
+# This must be adjusted manually to the value of HZ in the running kernel
+# on 2.4, until the XFS people change their 2.4 external interfaces to work in
+# centisecs. This can be automated, but it's a work in progress that still needs
+# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external
+# interfaces, and that is currently always set to 100. So you don't need to
+# change this on 2.6.
+XFS_HZ=${XFS_HZ:-'100'}
+
+#############################################################################
+
+KLEVEL="$(uname -r |
+ {
+ IFS='.' read a b c
+ echo $a.$b
+ }
+)"
+case "$KLEVEL" in
+ "2.4"|"2.6")
+ ;;
+ *)
+ echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2
+ exit 1
+ ;;
+esac
+
+if [ ! -e /proc/sys/vm/laptop_mode ] ; then
+ echo "Kernel is not patched with laptop_mode patch." >&2
+ exit 1
+fi
+
+if [ ! -w /proc/sys/vm/laptop_mode ] ; then
+ echo "You do not have enough privileges to enable laptop_mode." >&2
+ exit 1
+fi
+
+# Remove an option (the first parameter) of the form option=<number> from
+# a mount options string (the rest of the parameters).
+parse_mount_opts () {
+ OPT="$1"
+ shift
+ echo ",$*," | sed \
+ -e 's/,'"$OPT"'=[0-9]*,/,/g' \
+ -e 's/,,*/,/g' \
+ -e 's/^,//' \
+ -e 's/,$//'
+}
+
+# Remove an option (the first parameter) without any arguments from
+# a mount option string (the rest of the parameters).
+parse_nonumber_mount_opts () {
+ OPT="$1"
+ shift
+ echo ",$*," | sed \
+ -e 's/,'"$OPT"',/,/g' \
+ -e 's/,,*/,/g' \
+ -e 's/^,//' \
+ -e 's/,$//'
+}
+
+# Find out the state of a yes/no option (e.g. "atime"/"noatime") in
+# fstab for a given filesystem, and use this state to replace the
+# value of the option in another mount options string. The device
+# is the first argument, the option name the second, and the default
+# value the third. The remainder is the mount options string.
+#
+# Example:
+# parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
+#
+# If fstab contains, say, "rw" for this filesystem, then the result
+# will be "defaults,atime".
+parse_yesno_opts_wfstab () {
+ L_DEV="$1"
+ OPT="$2"
+ DEF_OPT="$3"
+ shift 3
+ L_OPTS="$*"
+ PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
+ PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
+ # Watch for a default atime in fstab
+ FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
+ if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then
+ # option specified in fstab: extract the value and use it
+ if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then
+ echo "$PARSEDOPTS1,no$OPT"
+ else
+ # no$OPT not found -- so we must have $OPT.
+ echo "$PARSEDOPTS1,$OPT"
+ fi
+ else
+ # option not specified in fstab -- choose the default.
+ echo "$PARSEDOPTS1,$DEF_OPT"
+ fi
+}
+
+# Find out the state of a numbered option (e.g. "commit=NNN") in
+# fstab for a given filesystem, and use this state to replace the
+# value of the option in another mount options string. The device
+# is the first argument, and the option name the second. The
+# remainder is the mount options string in which the replacement
+# must be done.
+#
+# Example:
+# parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
+#
+# If fstab contains, say, "commit=3,rw" for this filesystem, then the
+# result will be "rw,commit=3".
+parse_mount_opts_wfstab () {
+ L_DEV="$1"
+ OPT="$2"
+ shift 2
+ L_OPTS="$*"
+ PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
+ # Watch for a default commit in fstab
+ FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
+ if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then
+ # option specified in fstab: extract the value, and use it
+ echo -n "$PARSEDOPTS1,$OPT="
+ echo ",$FSTAB_OPTS," | sed \
+ -e 's/.*,'"$OPT"'=//' \
+ -e 's/,.*//'
+ else
+ # option not specified in fstab: set it to 0
+ echo "$PARSEDOPTS1,$OPT=0"
+ fi
+}
+
+deduce_fstype () {
+ MP="$1"
+ # My root filesystem unfortunately has
+ # type "unknown" in /etc/mtab. If we encounter
+ # "unknown", we try to get the type from fstab.
+ cat /etc/fstab |
+ grep -v '^#' |
+ while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do
+ if [ "$FSTAB_MP" = "$MP" ]; then
+ echo $FSTAB_FST
+ exit 0
+ fi
+ done
+}
+
+if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then
+ NOATIME_OPT=",noatime"
+fi
+
+case "$1" in
+ start)
+ AGE=$((100*$MAX_AGE))
+ XFS_AGE=$(($XFS_HZ*$MAX_AGE))
+ echo -n "Starting laptop_mode"
+
+ if [ -d /proc/sys/vm/pagebuf ] ; then
+ # (For 2.4 and early 2.6.)
+ # This only needs to be set, not reset -- it is only used when
+ # laptop mode is enabled.
+ echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
+ echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+ elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+ # (A couple of early 2.6 laptop mode patches had these.)
+ # The same goes for these.
+ echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
+ echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+ elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
+ # (2.6.6)
+ # But not for these -- they are also used in normal
+ # operation.
+ echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
+ echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
+ elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
+ # (2.6.7 upwards)
+ # And not for these either. These are in centisecs,
+ # not USER_HZ, so we have to use $AGE, not $XFS_AGE.
+ echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs
+ echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs
+ echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs
+ fi
+
+ case "$KLEVEL" in
+ "2.4")
+ echo 1 > /proc/sys/vm/laptop_mode
+ echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush
+ ;;
+ "2.6")
+ echo 5 > /proc/sys/vm/laptop_mode
+ echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs
+ echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs
+ echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio
+ echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio
+ ;;
+ esac
+ if [ $DO_REMOUNTS -eq 1 ]; then
+ cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+ PARSEDOPTS="$(parse_mount_opts "$OPTS")"
+ if [ "$FST" = 'unknown' ]; then
+ FST=$(deduce_fstype $MP)
+ fi
+ case "$FST" in
+ "ext3"|"reiserfs")
+ PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
+ mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT
+ ;;
+ "xfs")
+ mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT
+ ;;
+ esac
+ if [ -b $DEV ] ; then
+ blockdev --setra $(($READAHEAD * 2)) $DEV
+ fi
+ done
+ fi
+ if [ $DO_HD -eq 1 ] ; then
+ for THISHD in $HD ; do
+ /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1
+ /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1
+ done
+ fi
+ if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
+ if [ $CPU_MAXFREQ = 'slowest' ]; then
+ CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq`
+ fi
+ echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
+ fi
+ echo "."
+ ;;
+ stop)
+ U_AGE=$((100*$DEF_UPDATE))
+ B_AGE=$((100*$DEF_AGE))
+ echo -n "Stopping laptop_mode"
+ echo 0 > /proc/sys/vm/laptop_mode
+ if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+ # These need to be restored, if there are no lm_*.
+ echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer
+ echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval
+ elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
+ # These need to be restored as well.
+ echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs
+ echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs
+ echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs
+ fi
+ case "$KLEVEL" in
+ "2.4")
+ echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush
+ ;;
+ "2.6")
+ echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs
+ echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs
+ echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio
+ echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio
+ ;;
+ esac
+ if [ $DO_REMOUNTS -eq 1 ] ; then
+ cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+ # Reset commit and atime options to defaults.
+ if [ "$FST" = 'unknown' ]; then
+ FST=$(deduce_fstype $MP)
+ fi
+ case "$FST" in
+ "ext3"|"reiserfs")
+ PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
+ PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
+ mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+ ;;
+ "xfs")
+ PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
+ mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+ ;;
+ esac
+ if [ -b $DEV ] ; then
+ blockdev --setra 256 $DEV
+ fi
+ done
+ fi
+ if [ $DO_HD -eq 1 ] ; then
+ for THISHD in $HD ; do
+ /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1
+ /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1
+ done
+ fi
+ if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
+ echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
+ fi
+ echo "."
+ ;;
+ *)
+ echo "Usage: $0 {start|stop}" 2>&1
+ exit 1
+ ;;
+
+esac
+
+exit 0
+--------------------CONTROL SCRIPT END------------------------------------------
+
+
+ACPI integration
+----------------
+
+Dax Kelson submitted this so that the ACPI acpid daemon will
+kick off the laptop_mode script and run hdparm. The part that
+automatically disables laptop mode when the battery is low was
+written by Jan Topinski.
+
+-----------------/etc/acpi/events/ac_adapter BEGIN------------------------------
+event=ac_adapter
+action=/etc/acpi/actions/ac.sh %e
+----------------/etc/acpi/events/ac_adapter END---------------------------------
+
+
+-----------------/etc/acpi/events/battery BEGIN---------------------------------
+event=battery.*
+action=/etc/acpi/actions/battery.sh %e
+----------------/etc/acpi/events/battery END------------------------------------
+
+
+----------------/etc/acpi/actions/ac.sh BEGIN-----------------------------------
+#!/bin/bash
+
+# ac on/offline event handler
+
+status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state`
+
+case $status in
+ "on-line")
+ /sbin/laptop_mode stop
+ exit 0
+ ;;
+ "off-line")
+ /sbin/laptop_mode start
+ exit 0
+ ;;
+esac
+---------------------------/etc/acpi/actions/ac.sh END--------------------------
+
+
+---------------------------/etc/acpi/actions/battery.sh BEGIN-------------------
+#! /bin/bash
+
+# Automatically disable laptop mode when the battery almost runs out.
+
+BATT_INFO=/proc/acpi/battery/$2/state
+
+if [[ -f /proc/sys/vm/laptop_mode ]]
+then
+ LM=`cat /proc/sys/vm/laptop_mode`
+ if [[ $LM -gt 0 ]]
+ then
+ if [[ -f $BATT_INFO ]]
+ then
+ # Source the config file only now that we know we need
+ if [ -f /etc/default/laptop-mode ] ; then
+ # Debian
+ . /etc/default/laptop-mode
+ elif [ -f /etc/sysconfig/laptop-mode ] ; then
+ # Others
+ . /etc/sysconfig/laptop-mode
+ fi
+ MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'}
+
+ ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`"
+ if [[ ACTION -eq "discharging" ]]
+ then
+ PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" `
+ REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" `
+ fi
+ if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES))
+ then
+ /sbin/laptop_mode stop
+ fi
+ else
+ logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path."
+ fi
+ fi
+fi
+---------------------------/etc/acpi/actions/battery.sh END--------------------
+
+
+Monitoring tool
+---------------
+
+Bartek Kania submitted this, it can be used to measure how much time your disk
+spends spun up/down.
+
+---------------------------dslm.c BEGIN-----------------------------------------
+/*
+ * Simple Disk Sleep Monitor
+ * by Bartek Kania
+ * Licenced under the GPL
+ */
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <linux/hdreg.h>
+
+#ifdef DEBUG
+#define D(x) x
+#else
+#define D(x)
+#endif
+
+int endit = 0;
+
+/* Check if the disk is in powersave-mode
+ * Most of the code is stolen from hdparm.
+ * 1 = active, 0 = standby/sleep, -1 = unknown */
+int check_powermode(int fd)
+{
+ unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0};
+ int state;
+
+ if (ioctl(fd, HDIO_DRIVE_CMD, &args)
+ && (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */
+ && ioctl(fd, HDIO_DRIVE_CMD, &args)) {
+ if (errno != EIO || args[0] != 0 || args[1] != 0) {
+ state = -1; /* "unknown"; */
+ } else
+ state = 0; /* "sleeping"; */
+ } else {
+ state = (args[2] == 255) ? 1 : 0;
+ }
+ D(printf(" drive state is: %d\n", state));
+
+ return state;
+}
+
+char *state_name(int i)
+{
+ if (i == -1) return "unknown";
+ if (i == 0) return "sleeping";
+ if (i == 1) return "active";
+
+ return "internal error";
+}
+
+char *myctime(time_t time)
+{
+ char *ts = ctime(&time);
+ ts[strlen(ts) - 1] = 0;
+
+ return ts;
+}
+
+void measure(int fd)
+{
+ time_t start_time;
+ int last_state;
+ time_t last_time;
+ int curr_state;
+ time_t curr_time = 0;
+ time_t time_diff;
+ time_t active_time = 0;
+ time_t sleep_time = 0;
+ time_t unknown_time = 0;
+ time_t total_time = 0;
+ int changes = 0;
+ float tmp;
+
+ printf("Starting measurements\n");
+
+ last_state = check_powermode(fd);
+ start_time = last_time = time(0);
+ printf(" System is in state %s\n\n", state_name(last_state));
+
+ while(!endit) {
+ sleep(1);
+ curr_state = check_powermode(fd);
+
+ if (curr_state != last_state || endit) {
+ changes++;
+ curr_time = time(0);
+ time_diff = curr_time - last_time;
+
+ if (last_state == 1) active_time += time_diff;
+ else if (last_state == 0) sleep_time += time_diff;
+ else unknown_time += time_diff;
+
+ last_state = curr_state;
+ last_time = curr_time;
+
+ printf("%s: State-change to %s\n", myctime(curr_time),
+ state_name(curr_state));
+ }
+ }
+ changes--; /* Compensate for SIGINT */
+
+ total_time = time(0) - start_time;
+ printf("\nTotal running time: %lus\n", curr_time - start_time);
+ printf(" State changed %d times\n", changes);
+
+ tmp = (float)sleep_time / (float)total_time * 100;
+ printf(" Time in sleep state: %lus (%.2f%%)\n", sleep_time, tmp);
+ tmp = (float)active_time / (float)total_time * 100;
+ printf(" Time in active state: %lus (%.2f%%)\n", active_time, tmp);
+ tmp = (float)unknown_time / (float)total_time * 100;
+ printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp);
+}
+
+void ender(int s)
+{
+ endit = 1;
+}
+
+void usage()
+{
+ puts("usage: dslm [-w <time>] <disk>");
+ exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ int fd;
+ char *disk = 0;
+ int settle_time = 60;
+
+ /* Parse the simple command-line */
+ if (argc == 2)
+ disk = argv[1];
+ else if (argc == 4) {
+ settle_time = atoi(argv[2]);
+ disk = argv[3];
+ } else
+ usage();
+
+ if (!(fd = open(disk, O_RDONLY|O_NONBLOCK))) {
+ printf("Can't open %s, because: %s\n", disk, strerror(errno));
+ exit(-1);
+ }
+
+ if (settle_time) {
+ printf("Waiting %d seconds for the system to settle down to "
+ "'normal'\n", settle_time);
+ sleep(settle_time);
+ } else
+ puts("Not waiting for system to settle down");
+
+ signal(SIGINT, ender);
+
+ measure(fd);
+
+ close(fd);
+
+ return 0;
+}
+---------------------------dslm.c END-------------------------------------------
diff --git a/Documentation/laptops/sony-laptop.txt b/Documentation/laptops/sony-laptop.txt
new file mode 100644
index 0000000..8b2bc15
--- /dev/null
+++ b/Documentation/laptops/sony-laptop.txt
@@ -0,0 +1,116 @@
+Sony Notebook Control Driver (SNC) Readme
+-----------------------------------------
+ Copyright (C) 2004- 2005 Stelian Pop <stelian@popies.net>
+ Copyright (C) 2007 Mattia Dongili <malattia@linux.it>
+
+This mini-driver drives the SNC and SPIC device present in the ACPI BIOS of the
+Sony Vaio laptops. This driver mixes both devices functions under the same
+(hopefully consistent) interface. This also means that the sonypi driver is
+obsoleted by sony-laptop now.
+
+Fn keys (hotkeys):
+------------------
+Some models report hotkeys through the SNC or SPIC devices, such events are
+reported both through the ACPI subsystem as acpi events and through the INPUT
+subsystem. See the logs of acpid or /proc/acpi/event and
+/proc/bus/input/devices to find out what those events are and which input
+devices are created by the driver.
+
+Backlight control:
+------------------
+If your laptop model supports it, you will find sysfs files in the
+/sys/class/backlight/sony/
+directory. You will be able to query and set the current screen
+brightness:
+ brightness get/set screen brightness (an iteger
+ between 0 and 7)
+ actual_brightness reading from this file will query the HW
+ to get real brightness value
+ max_brightness the maximum brightness value
+
+
+Platform specific:
+------------------
+Loading the sony-laptop module will create a
+/sys/devices/platform/sony-laptop/
+directory populated with some files.
+
+You then read/write integer values from/to those files by using
+standard UNIX tools.
+
+The files are:
+ brightness_default screen brightness which will be set
+ when the laptop will be rebooted
+ cdpower power on/off the internal CD drive
+ audiopower power on/off the internal sound card
+ lanpower power on/off the internal ethernet card
+ (only in debug mode)
+ bluetoothpower power on/off the internal bluetooth device
+ fanspeed get/set the fan speed
+
+Note that some files may be missing if they are not supported
+by your particular laptop model.
+
+Example usage:
+ # echo "1" > /sys/devices/platform/sony-laptop/brightness_default
+sets the lowest screen brightness for the next and later reboots,
+ # echo "8" > /sys/devices/platform/sony-laptop/brightness_default
+sets the highest screen brightness for the next and later reboots,
+ # cat /sys/devices/platform/sony-laptop/brightness_default
+retrieves the value.
+
+ # echo "0" > /sys/devices/platform/sony-laptop/audiopower
+powers off the sound card,
+ # echo "1" > /sys/devices/platform/sony-laptop/audiopower
+powers on the sound card.
+
+Development:
+------------
+
+If you want to help with the development of this driver (and
+you are not afraid of any side effects doing strange things with
+your ACPI BIOS could have on your laptop), load the driver and
+pass the option 'debug=1'.
+
+REPEAT: DON'T DO THIS IF YOU DON'T LIKE RISKY BUSINESS.
+
+In your kernel logs you will find the list of all ACPI methods
+the SNC device has on your laptop. You can see the GCDP/GCDP methods
+used to pwer on/off the CD drive, but there are others.
+
+I HAVE NO IDEA WHAT THOSE METHODS DO.
+
+The sony-laptop driver creates, for some of those methods (the most
+current ones found on several Vaio models), an entry under
+/sys/devices/platform/sony-laptop, just like the 'cdpower' one.
+You can create other entries corresponding to your own laptop methods by
+further editing the source (see the 'sony_nc_values' table, and add a new
+entry to this table with your get/set method names using the
+SNC_HANDLE_NAMES macro).
+
+Your mission, should you accept it, is to try finding out what
+those entries are for, by reading/writing random values from/to those
+files and find out what is the impact on your laptop.
+
+Should you find anything interesting, please report it back to me,
+I will not disavow all knowledge of your actions :)
+
+See also http://www.linux.it/~malattia/wiki/index.php/Sony_drivers for other
+useful info.
+
+Bugs/Limitations:
+-----------------
+
+* This driver is not based on official documentation from Sony
+ (because there is none), so there is no guarantee this driver
+ will work at all, or do the right thing. Although this hasn't
+ happened to me, this driver could do very bad things to your
+ laptop, including permanent damage.
+
+* The sony-laptop and sonypi drivers do not interact at all. In the
+ future, sonypi could use sony-laptop to do (part of) its business.
+
+* spicctrl, which is the userspace tool used to communicate with the
+ sonypi driver (through /dev/sonypi) does not try to use the
+ sony-laptop driver. In the future, spicctrl could try sonypi first,
+ and if it isn't present, try sony-laptop instead.
diff --git a/Documentation/laptops/sonypi.txt b/Documentation/laptops/sonypi.txt
new file mode 100644
index 0000000..4857acf
--- /dev/null
+++ b/Documentation/laptops/sonypi.txt
@@ -0,0 +1,152 @@
+Sony Programmable I/O Control Device Driver Readme
+--------------------------------------------------
+ Copyright (C) 2001-2004 Stelian Pop <stelian@popies.net>
+ Copyright (C) 2001-2002 Alcôve <www.alcove.com>
+ Copyright (C) 2001 Michael Ashley <m.ashley@unsw.edu.au>
+ Copyright (C) 2001 Junichi Morita <jun1m@mars.dti.ne.jp>
+ Copyright (C) 2000 Takaya Kinjo <t-kinjo@tc4.so-net.ne.jp>
+ Copyright (C) 2000 Andrew Tridgell <tridge@samba.org>
+
+This driver enables access to the Sony Programmable I/O Control Device which
+can be found in many Sony Vaio laptops. Some newer Sony laptops (seems to be
+limited to new FX series laptops, at least the FX501 and the FX702) lack a
+sonypi device and are not supported at all by this driver.
+
+It will give access (through a user space utility) to some events those laptops
+generate, like:
+ - jogdial events (the small wheel on the side of Vaios)
+ - capture button events (only on Vaio Picturebook series)
+ - Fn keys
+ - bluetooth button (only on C1VR model)
+ - programmable keys, back, help, zoom, thumbphrase buttons, etc.
+ (when available)
+
+Those events (see linux/sonypi.h) can be polled using the character device node
+/dev/sonypi (major 10, minor auto allocated or specified as a option).
+A simple daemon which translates the jogdial movements into mouse wheel events
+can be downloaded at: <http://popies.net/sonypi/>
+
+Another option to intercept the events is to get them directly through the
+input layer.
+
+This driver supports also some ioctl commands for setting the LCD screen
+brightness and querying the batteries charge information (some more
+commands may be added in the future).
+
+This driver can also be used to set the camera controls on Picturebook series
+(brightness, contrast etc), and is used by the video4linux driver for the
+Motion Eye camera.
+
+Please note that this driver was created by reverse engineering the Windows
+driver and the ACPI BIOS, because Sony doesn't agree to release any programming
+specs for its laptops. If someone convinces them to do so, drop me a note.
+
+Driver options:
+---------------
+
+Several options can be passed to the sonypi driver using the standard
+module argument syntax (<param>=<value> when passing the option to the
+module or sonypi.<param>=<value> on the kernel boot line when sonypi is
+statically linked into the kernel). Those options are:
+
+ minor: minor number of the misc device /dev/sonypi,
+ default is -1 (automatic allocation, see /proc/misc
+ or kernel logs)
+
+ camera: if you have a PictureBook series Vaio (with the
+ integrated MotionEye camera), set this parameter to 1
+ in order to let the driver access to the camera
+
+ fnkeyinit: on some Vaios (C1VE, C1VR etc), the Fn key events don't
+ get enabled unless you set this parameter to 1.
+ Do not use this option unless it's actually necessary,
+ some Vaio models don't deal well with this option.
+ This option is available only if the kernel is
+ compiled without ACPI support (since it conflicts
+ with it and it shouldn't be required anyway if
+ ACPI is already enabled).
+
+ verbose: set to 1 to print unknown events received from the
+ sonypi device.
+ set to 2 to print all events received from the
+ sonypi device.
+
+ compat: uses some compatibility code for enabling the sonypi
+ events. If the driver worked for you in the past
+ (prior to version 1.5) and does not work anymore,
+ add this option and report to the author.
+
+ mask: event mask telling the driver what events will be
+ reported to the user. This parameter is required for
+ some Vaio models where the hardware reuses values
+ used in other Vaio models (like the FX series who does
+ not have a jogdial but reuses the jogdial events for
+ programmable keys events). The default event mask is
+ set to 0xffffffff, meaning that all possible events
+ will be tried. You can use the following bits to
+ construct your own event mask (from
+ drivers/char/sonypi.h):
+ SONYPI_JOGGER_MASK 0x0001
+ SONYPI_CAPTURE_MASK 0x0002
+ SONYPI_FNKEY_MASK 0x0004
+ SONYPI_BLUETOOTH_MASK 0x0008
+ SONYPI_PKEY_MASK 0x0010
+ SONYPI_BACK_MASK 0x0020
+ SONYPI_HELP_MASK 0x0040
+ SONYPI_LID_MASK 0x0080
+ SONYPI_ZOOM_MASK 0x0100
+ SONYPI_THUMBPHRASE_MASK 0x0200
+ SONYPI_MEYE_MASK 0x0400
+ SONYPI_MEMORYSTICK_MASK 0x0800
+ SONYPI_BATTERY_MASK 0x1000
+ SONYPI_WIRELESS_MASK 0x2000
+
+ useinput: if set (which is the default) two input devices are
+ created, one which interprets the jogdial events as
+ mouse events, the other one which acts like a
+ keyboard reporting the pressing of the special keys.
+
+Module use:
+-----------
+
+In order to automatically load the sonypi module on use, you can put those
+lines in your /etc/modprobe.conf file:
+
+ alias char-major-10-250 sonypi
+ options sonypi minor=250
+
+This supposes the use of minor 250 for the sonypi device:
+
+ # mknod /dev/sonypi c 10 250
+
+Bugs:
+-----
+
+ - several users reported that this driver disables the BIOS-managed
+ Fn-keys which put the laptop in sleeping state, or switch the
+ external monitor on/off. There is no workaround yet, since this
+ driver disables all APM management for those keys, by enabling the
+ ACPI management (and the ACPI core stuff is not complete yet). If
+ you have one of those laptops with working Fn keys and want to
+ continue to use them, don't use this driver.
+
+ - some users reported that the laptop speed is lower (dhrystone
+ tested) when using the driver with the fnkeyinit parameter. I cannot
+ reproduce it on my laptop and not all users have this problem.
+ This happens because the fnkeyinit parameter enables the ACPI
+ mode (but without additional ACPI control, like processor
+ speed handling etc). Use ACPI instead of APM if it works on your
+ laptop.
+
+ - sonypi lacks the ability to distinguish between certain key
+ events on some models.
+
+ - some models with the nvidia card (geforce go 6200 tc) uses a
+ different way to adjust the backlighting of the screen. There
+ is a userspace utility to adjust the brightness on those models,
+ which can be downloaded from
+ http://www.acc.umu.se/~erikw/program/smartdimmer-0.1.tar.bz2
+
+ - since all development was done by reverse engineering, there is
+ _absolutely no guarantee_ that this driver will not crash your
+ laptop. Permanently.
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
new file mode 100644
index 0000000..71f0fe1
--- /dev/null
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -0,0 +1,1488 @@
+ ThinkPad ACPI Extras Driver
+
+ Version 0.21
+ May 29th, 2008
+
+ Borislav Deianov <borislav@users.sf.net>
+ Henrique de Moraes Holschuh <hmh@hmh.eng.br>
+ http://ibm-acpi.sf.net/
+
+
+This is a Linux driver for the IBM and Lenovo ThinkPad laptops. It
+supports various features of these laptops which are accessible
+through the ACPI and ACPI EC framework, but not otherwise fully
+supported by the generic Linux ACPI drivers.
+
+This driver used to be named ibm-acpi until kernel 2.6.21 and release
+0.13-20070314. It used to be in the drivers/acpi tree, but it was
+moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel
+2.6.22, and release 0.14.
+
+The driver is named "thinkpad-acpi". In some places, like module
+names, "thinkpad_acpi" is used because of userspace issues.
+
+"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too
+long due to length limitations on some Linux kernel versions.
+
+Status
+------
+
+The features currently supported are the following (see below for
+detailed description):
+
+ - Fn key combinations
+ - Bluetooth enable and disable
+ - video output switching, expansion control
+ - ThinkLight on and off
+ - limited docking and undocking
+ - UltraBay eject
+ - CMOS control
+ - LED control
+ - ACPI sounds
+ - temperature sensors
+ - Experimental: embedded controller register dump
+ - LCD brightness control
+ - Volume control
+ - Fan control and monitoring: fan speed, fan enable/disable
+ - WAN enable and disable
+
+A compatibility table by model and feature is maintained on the web
+site, http://ibm-acpi.sf.net/. I appreciate any success or failure
+reports, especially if they add to or correct the compatibility table.
+Please include the following information in your report:
+
+ - ThinkPad model name
+ - a copy of your DSDT, from /proc/acpi/dsdt
+ - a copy of the output of dmidecode, with serial numbers
+ and UUIDs masked off
+ - which driver features work and which don't
+ - the observed behavior of non-working features
+
+Any other comments or patches are also more than welcome.
+
+
+Installation
+------------
+
+If you are compiling this driver as included in the Linux kernel
+sources, simply enable the CONFIG_THINKPAD_ACPI option, and optionally
+enable the CONFIG_THINKPAD_ACPI_BAY option if you want the
+thinkpad-specific bay functionality.
+
+Features
+--------
+
+The driver exports two different interfaces to userspace, which can be
+used to access the features it provides. One is a legacy procfs-based
+interface, which will be removed at some time in the distant future.
+The other is a new sysfs-based interface which is not complete yet.
+
+The procfs interface creates the /proc/acpi/ibm directory. There is a
+file under that directory for each feature it supports. The procfs
+interface is mostly frozen, and will change very little if at all: it
+will not be extended to add any new functionality in the driver, instead
+all new functionality will be implemented on the sysfs interface.
+
+The sysfs interface tries to blend in the generic Linux sysfs subsystems
+and classes as much as possible. Since some of these subsystems are not
+yet ready or stabilized, it is expected that this interface will change,
+and any and all userspace programs must deal with it.
+
+
+Notes about the sysfs interface:
+
+Unlike what was done with the procfs interface, correctness when talking
+to the sysfs interfaces will be enforced, as will correctness in the
+thinkpad-acpi's implementation of sysfs interfaces.
+
+Also, any bugs in the thinkpad-acpi sysfs driver code or in the
+thinkpad-acpi's implementation of the sysfs interfaces will be fixed for
+maximum correctness, even if that means changing an interface in
+non-compatible ways. As these interfaces mature both in the kernel and
+in thinkpad-acpi, such changes should become quite rare.
+
+Applications interfacing to the thinkpad-acpi sysfs interfaces must
+follow all sysfs guidelines and correctly process all errors (the sysfs
+interface makes extensive use of errors). File descriptors and open /
+close operations to the sysfs inodes must also be properly implemented.
+
+The version of thinkpad-acpi's sysfs interface is exported by the driver
+as a driver attribute (see below).
+
+Sysfs driver attributes are on the driver's sysfs attribute space,
+for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and
+/sys/bus/platform/drivers/thinkpad_hwmon/
+
+Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
+space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/.
+
+Sysfs device attributes for the sensors and fan are on the
+thinkpad_hwmon device's sysfs attribute space, but you should locate it
+looking for a hwmon device with the name attribute of "thinkpad".
+
+Driver version
+--------------
+
+procfs: /proc/acpi/ibm/driver
+sysfs driver attribute: version
+
+The driver name and version. No commands can be written to this file.
+
+Sysfs interface version
+-----------------------
+
+sysfs driver attribute: interface_version
+
+Version of the thinkpad-acpi sysfs interface, as an unsigned long
+(output in hex format: 0xAAAABBCC), where:
+ AAAA - major revision
+ BB - minor revision
+ CC - bugfix revision
+
+The sysfs interface version changelog for the driver can be found at the
+end of this document. Changes to the sysfs interface done by the kernel
+subsystems are not documented here, nor are they tracked by this
+attribute.
+
+Changes to the thinkpad-acpi sysfs interface are only considered
+non-experimental when they are submitted to Linux mainline, at which
+point the changes in this interface are documented and interface_version
+may be updated. If you are using any thinkpad-acpi features not yet
+sent to mainline for merging, you do so on your own risk: these features
+may disappear, or be implemented in a different and incompatible way by
+the time they are merged in Linux mainline.
+
+Changes that are backwards-compatible by nature (e.g. the addition of
+attributes that do not change the way the other attributes work) do not
+always warrant an update of interface_version. Therefore, one must
+expect that an attribute might not be there, and deal with it properly
+(an attribute not being there *is* a valid way to make it clear that a
+feature is not available in sysfs).
+
+Hot keys
+--------
+
+procfs: /proc/acpi/ibm/hotkey
+sysfs device attribute: hotkey_*
+
+In a ThinkPad, the ACPI HKEY handler is responsible for communicating
+some important events and also keyboard hot key presses to the operating
+system. Enabling the hotkey functionality of thinkpad-acpi signals the
+firmware that such a driver is present, and modifies how the ThinkPad
+firmware will behave in many situations.
+
+The driver enables the hot key feature automatically when loaded. The
+feature can later be disabled and enabled back at runtime. The driver
+will also restore the hot key feature to its previous state and mask
+when it is unloaded.
+
+When the hotkey feature is enabled and the hot key mask is set (see
+below), the driver will report HKEY events in the following format:
+
+ ibm/hotkey HKEY 00000080 0000xxxx
+
+Some of these events refer to hot key presses, but not all.
+
+The driver will generate events over the input layer for hot keys and
+radio switches, and over the ACPI netlink layer for other events. The
+input layer support accepts the standard IOCTLs to remap the keycodes
+assigned to each hot key.
+
+The hot key bit mask allows some control over which hot keys generate
+events. If a key is "masked" (bit set to 0 in the mask), the firmware
+will handle it. If it is "unmasked", it signals the firmware that
+thinkpad-acpi would prefer to handle it, if the firmware would be so
+kind to allow it (and it often doesn't!).
+
+Not all bits in the mask can be modified. Not all bits that can be
+modified do anything. Not all hot keys can be individually controlled
+by the mask. Some models do not support the mask at all, and in those
+models, hot keys cannot be controlled individually. The behaviour of
+the mask is, therefore, highly dependent on the ThinkPad model.
+
+Note that unmasking some keys prevents their default behavior. For
+example, if Fn+F5 is unmasked, that key will no longer enable/disable
+Bluetooth by itself.
+
+Note also that not all Fn key combinations are supported through ACPI.
+For example, on the X40, the brightness, volume and "Access IBM" buttons
+do not generate ACPI events even with this driver. They *can* be used
+through the "ThinkPad Buttons" utility, see http://www.nongnu.org/tpb/
+
+procfs notes:
+
+The following commands can be written to the /proc/acpi/ibm/hotkey file:
+
+ echo enable > /proc/acpi/ibm/hotkey -- enable the hot keys feature
+ echo disable > /proc/acpi/ibm/hotkey -- disable the hot keys feature
+ echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys
+ echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys
+ ... any other 8-hex-digit mask ...
+ echo reset > /proc/acpi/ibm/hotkey -- restore the original mask
+
+The procfs interface does not support NVRAM polling control. So as to
+maintain maximum bug-to-bug compatibility, it does not report any masks,
+nor does it allow one to manipulate the hot key mask when the firmware
+does not support masks at all, even if NVRAM polling is in use.
+
+sysfs notes:
+
+ hotkey_bios_enabled:
+ Returns the status of the hot keys feature when
+ thinkpad-acpi was loaded. Upon module unload, the hot
+ key feature status will be restored to this value.
+
+ 0: hot keys were disabled
+ 1: hot keys were enabled (unusual)
+
+ hotkey_bios_mask:
+ Returns the hot keys mask when thinkpad-acpi was loaded.
+ Upon module unload, the hot keys mask will be restored
+ to this value.
+
+ hotkey_enable:
+ Enables/disables the hot keys feature in the ACPI
+ firmware, and reports current status of the hot keys
+ feature. Has no effect on the NVRAM hot key polling
+ functionality.
+
+ 0: disables the hot keys feature / feature disabled
+ 1: enables the hot keys feature / feature enabled
+
+ hotkey_mask:
+ bit mask to enable driver-handling (and depending on
+ the firmware, ACPI event generation) for each hot key
+ (see above). Returns the current status of the hot keys
+ mask, and allows one to modify it.
+
+ Note: when NVRAM polling is active, the firmware mask
+ will be different from the value returned by
+ hotkey_mask. The driver will retain enabled bits for
+ hotkeys that are under NVRAM polling even if the
+ firmware refuses them, and will not set these bits on
+ the firmware hot key mask.
+
+ hotkey_all_mask:
+ bit mask that should enable event reporting for all
+ supported hot keys, when echoed to hotkey_mask above.
+ Unless you know which events need to be handled
+ passively (because the firmware *will* handle them
+ anyway), do *not* use hotkey_all_mask. Use
+ hotkey_recommended_mask, instead. You have been warned.
+
+ hotkey_recommended_mask:
+ bit mask that should enable event reporting for all
+ supported hot keys, except those which are always
+ handled by the firmware anyway. Echo it to
+ hotkey_mask above, to use.
+
+ hotkey_source_mask:
+ bit mask that selects which hot keys will the driver
+ poll the NVRAM for. This is auto-detected by the driver
+ based on the capabilities reported by the ACPI firmware,
+ but it can be overridden at runtime.
+
+ Hot keys whose bits are set in both hotkey_source_mask
+ and also on hotkey_mask are polled for in NVRAM. Only a
+ few hot keys are available through CMOS NVRAM polling.
+
+ Warning: when in NVRAM mode, the volume up/down/mute
+ keys are synthesized according to changes in the mixer,
+ so you have to use volume up or volume down to unmute,
+ as per the ThinkPad volume mixer user interface. When
+ in ACPI event mode, volume up/down/mute are reported as
+ separate events, but this behaviour may be corrected in
+ future releases of this driver, in which case the
+ ThinkPad volume mixer user interface semantics will be
+ enforced.
+
+ hotkey_poll_freq:
+ frequency in Hz for hot key polling. It must be between
+ 0 and 25 Hz. Polling is only carried out when strictly
+ needed.
+
+ Setting hotkey_poll_freq to zero disables polling, and
+ will cause hot key presses that require NVRAM polling
+ to never be reported.
+
+ Setting hotkey_poll_freq too low will cause repeated
+ pressings of the same hot key to be misreported as a
+ single key press, or to not even be detected at all.
+ The recommended polling frequency is 10Hz.
+
+ hotkey_radio_sw:
+ If the ThinkPad has a hardware radio switch, this
+ attribute will read 0 if the switch is in the "radios
+ disabled" position, and 1 if the switch is in the
+ "radios enabled" position.
+
+ This attribute has poll()/select() support.
+
+ hotkey_tablet_mode:
+ If the ThinkPad has tablet capabilities, this attribute
+ will read 0 if the ThinkPad is in normal mode, and
+ 1 if the ThinkPad is in tablet mode.
+
+ This attribute has poll()/select() support.
+
+ hotkey_report_mode:
+ Returns the state of the procfs ACPI event report mode
+ filter for hot keys. If it is set to 1 (the default),
+ all hot key presses are reported both through the input
+ layer and also as ACPI events through procfs (but not
+ through netlink). If it is set to 2, hot key presses
+ are reported only through the input layer.
+
+ This attribute is read-only in kernels 2.6.23 or later,
+ and read-write on earlier kernels.
+
+ May return -EPERM (write access locked out by module
+ parameter) or -EACCES (read-only).
+
+ wakeup_reason:
+ Set to 1 if the system is waking up because the user
+ requested a bay ejection. Set to 2 if the system is
+ waking up because the user requested the system to
+ undock. Set to zero for normal wake-ups or wake-ups
+ due to unknown reasons.
+
+ This attribute has poll()/select() support.
+
+ wakeup_hotunplug_complete:
+ Set to 1 if the system was waken up because of an
+ undock or bay ejection request, and that request
+ was successfully completed. At this point, it might
+ be useful to send the system back to sleep, at the
+ user's choice. Refer to HKEY events 0x4003 and
+ 0x3003, below.
+
+ This attribute has poll()/select() support.
+
+input layer notes:
+
+A Hot key is mapped to a single input layer EV_KEY event, possibly
+followed by an EV_MSC MSC_SCAN event that shall contain that key's scan
+code. An EV_SYN event will always be generated to mark the end of the
+event block.
+
+Do not use the EV_MSC MSC_SCAN events to process keys. They are to be
+used as a helper to remap keys, only. They are particularly useful when
+remapping KEY_UNKNOWN keys.
+
+The events are available in an input device, with the following id:
+
+ Bus: BUS_HOST
+ vendor: 0x1014 (PCI_VENDOR_ID_IBM) or
+ 0x17aa (PCI_VENDOR_ID_LENOVO)
+ product: 0x5054 ("TP")
+ version: 0x4101
+
+The version will have its LSB incremented if the keymap changes in a
+backwards-compatible way. The MSB shall always be 0x41 for this input
+device. If the MSB is not 0x41, do not use the device as described in
+this section, as it is either something else (e.g. another input device
+exported by a thinkpad driver, such as HDAPS) or its functionality has
+been changed in a non-backwards compatible way.
+
+Adding other event types for other functionalities shall be considered a
+backwards-compatible change for this input device.
+
+Thinkpad-acpi Hot Key event map (version 0x4101):
+
+ACPI Scan
+event code Key Notes
+
+0x1001 0x00 FN+F1 -
+0x1002 0x01 FN+F2 IBM: battery (rare)
+ Lenovo: Screen lock
+
+0x1003 0x02 FN+F3 Many IBM models always report
+ this hot key, even with hot keys
+ disabled or with Fn+F3 masked
+ off
+ IBM: screen lock
+ Lenovo: battery
+
+0x1004 0x03 FN+F4 Sleep button (ACPI sleep button
+ semantics, i.e. sleep-to-RAM).
+ It is always generate some kind
+ of event, either the hot key
+ event or a ACPI sleep button
+ event. The firmware may
+ refuse to generate further FN+F4
+ key presses until a S3 or S4 ACPI
+ sleep cycle is performed or some
+ time passes.
+
+0x1005 0x04 FN+F5 Radio. Enables/disables
+ the internal Bluetooth hardware
+ and W-WAN card if left in control
+ of the firmware. Does not affect
+ the WLAN card.
+ Should be used to turn on/off all
+ radios (Bluetooth+W-WAN+WLAN),
+ really.
+
+0x1006 0x05 FN+F6 -
+
+0x1007 0x06 FN+F7 Video output cycle.
+ Do you feel lucky today?
+
+0x1008 0x07 FN+F8 IBM: toggle screen expand
+ Lenovo: configure UltraNav
+
+0x1009 0x08 FN+F9 -
+ .. .. ..
+0x100B 0x0A FN+F11 -
+
+0x100C 0x0B FN+F12 Sleep to disk. You are always
+ supposed to handle it yourself,
+ either through the ACPI event,
+ or through a hotkey event.
+ The firmware may refuse to
+ generate further FN+F4 key
+ press events until a S3 or S4
+ ACPI sleep cycle is performed,
+ or some time passes.
+
+0x100D 0x0C FN+BACKSPACE -
+0x100E 0x0D FN+INSERT -
+0x100F 0x0E FN+DELETE -
+
+0x1010 0x0F FN+HOME Brightness up. This key is
+ always handled by the firmware
+ in IBM ThinkPads, even when
+ unmasked. Just leave it alone.
+ For Lenovo ThinkPads with a new
+ BIOS, it has to be handled either
+ by the ACPI OSI, or by userspace.
+0x1011 0x10 FN+END Brightness down. See brightness
+ up for details.
+
+0x1012 0x11 FN+PGUP ThinkLight toggle. This key is
+ always handled by the firmware,
+ even when unmasked.
+
+0x1013 0x12 FN+PGDOWN -
+
+0x1014 0x13 FN+SPACE Zoom key
+
+0x1015 0x14 VOLUME UP Internal mixer volume up. This
+ key is always handled by the
+ firmware, even when unmasked.
+ NOTE: Lenovo seems to be changing
+ this.
+0x1016 0x15 VOLUME DOWN Internal mixer volume up. This
+ key is always handled by the
+ firmware, even when unmasked.
+ NOTE: Lenovo seems to be changing
+ this.
+0x1017 0x16 MUTE Mute internal mixer. This
+ key is always handled by the
+ firmware, even when unmasked.
+
+0x1018 0x17 THINKPAD ThinkPad/Access IBM/Lenovo key
+
+0x1019 0x18 unknown
+.. .. ..
+0x1020 0x1F unknown
+
+The ThinkPad firmware does not allow one to differentiate when most hot
+keys are pressed or released (either that, or we don't know how to, yet).
+For these keys, the driver generates a set of events for a key press and
+immediately issues the same set of events for a key release. It is
+unknown by the driver if the ThinkPad firmware triggered these events on
+hot key press or release, but the firmware will do it for either one, not
+both.
+
+If a key is mapped to KEY_RESERVED, it generates no input events at all.
+If a key is mapped to KEY_UNKNOWN, it generates an input event that
+includes an scan code. If a key is mapped to anything else, it will
+generate input device EV_KEY events.
+
+In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW
+events for switches:
+
+SW_RFKILL_ALL T60 and later hardare rfkill rocker switch
+SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A
+
+Non hot-key ACPI HKEY event map:
+0x5001 Lid closed
+0x5002 Lid opened
+0x5009 Tablet swivel: switched to tablet mode
+0x500A Tablet swivel: switched to normal mode
+0x7000 Radio Switch may have changed state
+
+The above events are not propagated by the driver, except for legacy
+compatibility purposes when hotkey_report_mode is set to 1.
+
+0x2304 System is waking up from suspend to undock
+0x2305 System is waking up from suspend to eject bay
+0x2404 System is waking up from hibernation to undock
+0x2405 System is waking up from hibernation to eject bay
+
+The above events are never propagated by the driver.
+
+0x3003 Bay ejection (see 0x2x05) complete, can sleep again
+0x4003 Undocked (see 0x2x04), can sleep again
+0x500B Tablet pen inserted into its storage bay
+0x500C Tablet pen removed from its storage bay
+0x5010 Brightness level changed (newer Lenovo BIOSes)
+
+The above events are propagated by the driver.
+
+Compatibility notes:
+
+ibm-acpi and thinkpad-acpi 0.15 (mainline kernels before 2.6.23) never
+supported the input layer, and sent events over the procfs ACPI event
+interface.
+
+To avoid sending duplicate events over the input layer and the ACPI
+event interface, thinkpad-acpi 0.16 implements a module parameter
+(hotkey_report_mode), and also a sysfs device attribute with the same
+name.
+
+Make no mistake here: userspace is expected to switch to using the input
+layer interface of thinkpad-acpi, together with the ACPI netlink event
+interface in kernels 2.6.23 and later, or with the ACPI procfs event
+interface in kernels 2.6.22 and earlier.
+
+If no hotkey_report_mode module parameter is specified (or it is set to
+zero), the driver defaults to mode 1 (see below), and on kernels 2.6.22
+and earlier, also allows one to change the hotkey_report_mode through
+sysfs. In kernels 2.6.23 and later, where the netlink ACPI event
+interface is available, hotkey_report_mode cannot be changed through
+sysfs (it is read-only).
+
+If the hotkey_report_mode module parameter is set to 1 or 2, it cannot
+be changed later through sysfs (any writes will return -EPERM to signal
+that hotkey_report_mode was locked. On 2.6.23 and later, where
+hotkey_report_mode cannot be changed at all, writes will return -EACCES).
+
+hotkey_report_mode set to 1 makes the driver export through the procfs
+ACPI event interface all hot key presses (which are *also* sent to the
+input layer). This is a legacy compatibility behaviour, and it is also
+the default mode of operation for the driver.
+
+hotkey_report_mode set to 2 makes the driver filter out the hot key
+presses from the procfs ACPI event interface, so these events will only
+be sent through the input layer. Userspace that has been updated to use
+the thinkpad-acpi input layer interface should set hotkey_report_mode to
+2.
+
+Hot key press events are never sent to the ACPI netlink event interface.
+Really up-to-date userspace under kernel 2.6.23 and later is to use the
+netlink interface and the input layer interface, and don't bother at all
+with hotkey_report_mode.
+
+
+Brightness hotkey notes:
+
+These are the current sane choices for brightness key mapping in
+thinkpad-acpi:
+
+For IBM and Lenovo models *without* ACPI backlight control (the ones on
+which thinkpad-acpi will autoload its backlight interface by default,
+and on which ACPI video does not export a backlight interface):
+
+1. Don't enable or map the brightness hotkeys in thinkpad-acpi, as
+ these older firmware versions unfortunately won't respect the hotkey
+ mask for brightness keys anyway, and always reacts to them. This
+ usually work fine, unless X.org drivers are doing something to block
+ the BIOS. In that case, use (3) below. This is the default mode of
+ operation.
+
+2. Enable the hotkeys, but map them to something else that is NOT
+ KEY_BRIGHTNESS_UP/DOWN or any other keycode that would cause
+ userspace to try to change the backlight level, and use that as an
+ on-screen-display hint.
+
+3. IF AND ONLY IF X.org drivers find a way to block the firmware from
+ automatically changing the brightness, enable the hotkeys and map
+ them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN, and feed that to
+ something that calls xbacklight. thinkpad-acpi will not be able to
+ change brightness in that case either, so you should disable its
+ backlight interface.
+
+For Lenovo models *with* ACPI backlight control:
+
+1. Load up ACPI video and use that. ACPI video will report ACPI
+ events for brightness change keys. Do not mess with thinkpad-acpi
+ defaults in this case. thinkpad-acpi should not have anything to do
+ with backlight events in a scenario where ACPI video is loaded:
+ brightness hotkeys must be disabled, and the backlight interface is
+ to be kept disabled as well. This is the default mode of operation.
+
+2. Do *NOT* load up ACPI video, enable the hotkeys in thinkpad-acpi,
+ and map them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN. Process
+ these keys on userspace somehow (e.g. by calling xbacklight).
+
+Bluetooth
+---------
+
+procfs: /proc/acpi/ibm/bluetooth
+sysfs device attribute: bluetooth_enable (deprecated)
+sysfs rfkill class: switch "tpacpi_bluetooth_sw"
+
+This feature shows the presence and current state of a ThinkPad
+Bluetooth device in the internal ThinkPad CDC slot.
+
+Procfs notes:
+
+If Bluetooth is installed, the following commands can be used:
+
+ echo enable > /proc/acpi/ibm/bluetooth
+ echo disable > /proc/acpi/ibm/bluetooth
+
+Sysfs notes:
+
+ If the Bluetooth CDC card is installed, it can be enabled /
+ disabled through the "bluetooth_enable" thinkpad-acpi device
+ attribute, and its current status can also be queried.
+
+ enable:
+ 0: disables Bluetooth / Bluetooth is disabled
+ 1: enables Bluetooth / Bluetooth is enabled.
+
+ Note: this interface has been superseded by the generic rfkill
+ class. It has been deprecated, and it will be removed in year
+ 2010.
+
+ rfkill controller switch "tpacpi_bluetooth_sw": refer to
+ Documentation/rfkill.txt for details.
+
+Video output control -- /proc/acpi/ibm/video
+--------------------------------------------
+
+This feature allows control over the devices used for video output -
+LCD, CRT or DVI (if available). The following commands are available:
+
+ echo lcd_enable > /proc/acpi/ibm/video
+ echo lcd_disable > /proc/acpi/ibm/video
+ echo crt_enable > /proc/acpi/ibm/video
+ echo crt_disable > /proc/acpi/ibm/video
+ echo dvi_enable > /proc/acpi/ibm/video
+ echo dvi_disable > /proc/acpi/ibm/video
+ echo auto_enable > /proc/acpi/ibm/video
+ echo auto_disable > /proc/acpi/ibm/video
+ echo expand_toggle > /proc/acpi/ibm/video
+ echo video_switch > /proc/acpi/ibm/video
+
+Each video output device can be enabled or disabled individually.
+Reading /proc/acpi/ibm/video shows the status of each device.
+
+Automatic video switching can be enabled or disabled. When automatic
+video switching is enabled, certain events (e.g. opening the lid,
+docking or undocking) cause the video output device to change
+automatically. While this can be useful, it also causes flickering
+and, on the X40, video corruption. By disabling automatic switching,
+the flickering or video corruption can be avoided.
+
+The video_switch command cycles through the available video outputs
+(it simulates the behavior of Fn-F7).
+
+Video expansion can be toggled through this feature. This controls
+whether the display is expanded to fill the entire LCD screen when a
+mode with less than full resolution is used. Note that the current
+video expansion status cannot be determined through this feature.
+
+Note that on many models (particularly those using Radeon graphics
+chips) the X driver configures the video card in a way which prevents
+Fn-F7 from working. This also disables the video output switching
+features of this driver, as it uses the same ACPI methods as
+Fn-F7. Video switching on the console should still work.
+
+UPDATE: There's now a patch for the X.org Radeon driver which
+addresses this issue. Some people are reporting success with the patch
+while others are still having problems. For more information:
+
+https://bugs.freedesktop.org/show_bug.cgi?id=2000
+
+ThinkLight control
+------------------
+
+procfs: /proc/acpi/ibm/light
+sysfs attributes: as per LED class, for the "tpacpi::thinklight" LED
+
+procfs notes:
+
+The ThinkLight status can be read and set through the procfs interface. A
+few models which do not make the status available will show the ThinkLight
+status as "unknown". The available commands are:
+
+ echo on > /proc/acpi/ibm/light
+ echo off > /proc/acpi/ibm/light
+
+sysfs notes:
+
+The ThinkLight sysfs interface is documented by the LED class
+documentation, in Documentation/leds-class.txt. The ThinkLight LED name
+is "tpacpi::thinklight".
+
+Due to limitations in the sysfs LED class, if the status of the thinklight
+cannot be read or if it is unknown, thinkpad-acpi will report it as "off".
+It is impossible to know if the status returned through sysfs is valid.
+
+Docking / undocking -- /proc/acpi/ibm/dock
+------------------------------------------
+
+Docking and undocking (e.g. with the X4 UltraBase) requires some
+actions to be taken by the operating system to safely make or break
+the electrical connections with the dock.
+
+The docking feature of this driver generates the following ACPI events:
+
+ ibm/dock GDCK 00000003 00000001 -- eject request
+ ibm/dock GDCK 00000003 00000002 -- undocked
+ ibm/dock GDCK 00000000 00000003 -- docked
+
+NOTE: These events will only be generated if the laptop was docked
+when originally booted. This is due to the current lack of support for
+hot plugging of devices in the Linux ACPI framework. If the laptop was
+booted while not in the dock, the following message is shown in the
+logs:
+
+ Mar 17 01:42:34 aero kernel: thinkpad_acpi: dock device not present
+
+In this case, no dock-related events are generated but the dock and
+undock commands described below still work. They can be executed
+manually or triggered by Fn key combinations (see the example acpid
+configuration files included in the driver tarball package available
+on the web site).
+
+When the eject request button on the dock is pressed, the first event
+above is generated. The handler for this event should issue the
+following command:
+
+ echo undock > /proc/acpi/ibm/dock
+
+After the LED on the dock goes off, it is safe to eject the laptop.
+Note: if you pressed this key by mistake, go ahead and eject the
+laptop, then dock it back in. Otherwise, the dock may not function as
+expected.
+
+When the laptop is docked, the third event above is generated. The
+handler for this event should issue the following command to fully
+enable the dock:
+
+ echo dock > /proc/acpi/ibm/dock
+
+The contents of the /proc/acpi/ibm/dock file shows the current status
+of the dock, as provided by the ACPI framework.
+
+The docking support in this driver does not take care of enabling or
+disabling any other devices you may have attached to the dock. For
+example, a CD drive plugged into the UltraBase needs to be disabled or
+enabled separately. See the provided example acpid configuration files
+for how this can be accomplished.
+
+There is no support yet for PCI devices that may be attached to a
+docking station, e.g. in the ThinkPad Dock II. The driver currently
+does not recognize, enable or disable such devices. This means that
+the only docking stations currently supported are the X-series
+UltraBase docks and "dumb" port replicators like the Mini Dock (the
+latter don't need any ACPI support, actually).
+
+UltraBay eject -- /proc/acpi/ibm/bay
+------------------------------------
+
+Inserting or ejecting an UltraBay device requires some actions to be
+taken by the operating system to safely make or break the electrical
+connections with the device.
+
+This feature generates the following ACPI events:
+
+ ibm/bay MSTR 00000003 00000000 -- eject request
+ ibm/bay MSTR 00000001 00000000 -- eject lever inserted
+
+NOTE: These events will only be generated if the UltraBay was present
+when the laptop was originally booted (on the X series, the UltraBay
+is in the dock, so it may not be present if the laptop was undocked).
+This is due to the current lack of support for hot plugging of devices
+in the Linux ACPI framework. If the laptop was booted without the
+UltraBay, the following message is shown in the logs:
+
+ Mar 17 01:42:34 aero kernel: thinkpad_acpi: bay device not present
+
+In this case, no bay-related events are generated but the eject
+command described below still works. It can be executed manually or
+triggered by a hot key combination.
+
+Sliding the eject lever generates the first event shown above. The
+handler for this event should take whatever actions are necessary to
+shut down the device in the UltraBay (e.g. call idectl), then issue
+the following command:
+
+ echo eject > /proc/acpi/ibm/bay
+
+After the LED on the UltraBay goes off, it is safe to pull out the
+device.
+
+When the eject lever is inserted, the second event above is
+generated. The handler for this event should take whatever actions are
+necessary to enable the UltraBay device (e.g. call idectl).
+
+The contents of the /proc/acpi/ibm/bay file shows the current status
+of the UltraBay, as provided by the ACPI framework.
+
+EXPERIMENTAL warm eject support on the 600e/x, A22p and A3x (To use
+this feature, you need to supply the experimental=1 parameter when
+loading the module):
+
+These models do not have a button near the UltraBay device to request
+a hot eject but rather require the laptop to be put to sleep
+(suspend-to-ram) before the bay device is ejected or inserted).
+The sequence of steps to eject the device is as follows:
+
+ echo eject > /proc/acpi/ibm/bay
+ put the ThinkPad to sleep
+ remove the drive
+ resume from sleep
+ cat /proc/acpi/ibm/bay should show that the drive was removed
+
+On the A3x, both the UltraBay 2000 and UltraBay Plus devices are
+supported. Use "eject2" instead of "eject" for the second bay.
+
+Note: the UltraBay eject support on the 600e/x, A22p and A3x is
+EXPERIMENTAL and may not work as expected. USE WITH CAUTION!
+
+CMOS control
+------------
+
+procfs: /proc/acpi/ibm/cmos
+sysfs device attribute: cmos_command
+
+This feature is mostly used internally by the ACPI firmware to keep the legacy
+CMOS NVRAM bits in sync with the current machine state, and to record this
+state so that the ThinkPad will retain such settings across reboots.
+
+Some of these commands actually perform actions in some ThinkPad models, but
+this is expected to disappear more and more in newer models. As an example, in
+a T43 and in a X40, commands 12 and 13 still control the ThinkLight state for
+real, but commands 0 to 2 don't control the mixer anymore (they have been
+phased out) and just update the NVRAM.
+
+The range of valid cmos command numbers is 0 to 21, but not all have an
+effect and the behavior varies from model to model. Here is the behavior
+on the X40 (tpb is the ThinkPad Buttons utility):
+
+ 0 - Related to "Volume down" key press
+ 1 - Related to "Volume up" key press
+ 2 - Related to "Mute on" key press
+ 3 - Related to "Access IBM" key press
+ 4 - Related to "LCD brightness up" key press
+ 5 - Related to "LCD brightness down" key press
+ 11 - Related to "toggle screen expansion" key press/function
+ 12 - Related to "ThinkLight on"
+ 13 - Related to "ThinkLight off"
+ 14 - Related to "ThinkLight" key press (toggle ThinkLight)
+
+The cmos command interface is prone to firmware split-brain problems, as
+in newer ThinkPads it is just a compatibility layer. Do not use it, it is
+exported just as a debug tool.
+
+LED control
+-----------
+
+procfs: /proc/acpi/ibm/led
+sysfs attributes: as per LED class, see below for names
+
+Some of the LED indicators can be controlled through this feature. On
+some older ThinkPad models, it is possible to query the status of the
+LED indicators as well. Newer ThinkPads cannot query the real status
+of the LED indicators.
+
+procfs notes:
+
+The available commands are:
+
+ echo '<LED number> on' >/proc/acpi/ibm/led
+ echo '<LED number> off' >/proc/acpi/ibm/led
+ echo '<LED number> blink' >/proc/acpi/ibm/led
+
+The <LED number> range is 0 to 7. The set of LEDs that can be
+controlled varies from model to model. Here is the common ThinkPad
+mapping:
+
+ 0 - power
+ 1 - battery (orange)
+ 2 - battery (green)
+ 3 - UltraBase/dock
+ 4 - UltraBay
+ 5 - UltraBase battery slot
+ 6 - (unknown)
+ 7 - standby
+
+All of the above can be turned on and off and can be made to blink.
+
+sysfs notes:
+
+The ThinkPad LED sysfs interface is described in detail by the LED class
+documentation, in Documentation/leds-class.txt.
+
+The leds are named (in LED ID order, from 0 to 7):
+"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt",
+"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt",
+"tpacpi::unknown_led", "tpacpi::standby".
+
+Due to limitations in the sysfs LED class, if the status of the LED
+indicators cannot be read due to an error, thinkpad-acpi will report it as
+a brightness of zero (same as LED off).
+
+If the thinkpad firmware doesn't support reading the current status,
+trying to read the current LED brightness will just return whatever
+brightness was last written to that attribute.
+
+These LEDs can blink using hardware acceleration. To request that a
+ThinkPad indicator LED should blink in hardware accelerated mode, use the
+"timer" trigger, and leave the delay_on and delay_off parameters set to
+zero (to request hardware acceleration autodetection).
+
+ACPI sounds -- /proc/acpi/ibm/beep
+----------------------------------
+
+The BEEP method is used internally by the ACPI firmware to provide
+audible alerts in various situations. This feature allows the same
+sounds to be triggered manually.
+
+The commands are non-negative integer numbers:
+
+ echo <number> >/proc/acpi/ibm/beep
+
+The valid <number> range is 0 to 17. Not all numbers trigger sounds
+and the sounds vary from model to model. Here is the behavior on the
+X40:
+
+ 0 - stop a sound in progress (but use 17 to stop 16)
+ 2 - two beeps, pause, third beep ("low battery")
+ 3 - single beep
+ 4 - high, followed by low-pitched beep ("unable")
+ 5 - single beep
+ 6 - very high, followed by high-pitched beep ("AC/DC")
+ 7 - high-pitched beep
+ 9 - three short beeps
+ 10 - very long beep
+ 12 - low-pitched beep
+ 15 - three high-pitched beeps repeating constantly, stop with 0
+ 16 - one medium-pitched beep repeating constantly, stop with 17
+ 17 - stop 16
+
+Temperature sensors
+-------------------
+
+procfs: /proc/acpi/ibm/thermal
+sysfs device attributes: (hwmon "thinkpad") temp*_input
+
+Most ThinkPads include six or more separate temperature sensors but only
+expose the CPU temperature through the standard ACPI methods. This
+feature shows readings from up to eight different sensors on older
+ThinkPads, and up to sixteen different sensors on newer ThinkPads.
+
+For example, on the X40, a typical output may be:
+temperatures: 42 42 45 41 36 -128 33 -128
+
+On the T43/p, a typical output may be:
+temperatures: 48 48 36 52 38 -128 31 -128 48 52 48 -128 -128 -128 -128 -128
+
+The mapping of thermal sensors to physical locations varies depending on
+system-board model (and thus, on ThinkPad model).
+
+http://thinkwiki.org/wiki/Thermal_Sensors is a public wiki page that
+tries to track down these locations for various models.
+
+Most (newer?) models seem to follow this pattern:
+
+1: CPU
+2: (depends on model)
+3: (depends on model)
+4: GPU
+5: Main battery: main sensor
+6: Bay battery: main sensor
+7: Main battery: secondary sensor
+8: Bay battery: secondary sensor
+9-15: (depends on model)
+
+For the R51 (source: Thomas Gruber):
+2: Mini-PCI
+3: Internal HDD
+
+For the T43, T43/p (source: Shmidoax/Thinkwiki.org)
+http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_T43.2C_T43p
+2: System board, left side (near PCMCIA slot), reported as HDAPS temp
+3: PCMCIA slot
+9: MCH (northbridge) to DRAM Bus
+10: Clock-generator, mini-pci card and ICH (southbridge), under Mini-PCI
+ card, under touchpad
+11: Power regulator, underside of system board, below F2 key
+
+The A31 has a very atypical layout for the thermal sensors
+(source: Milos Popovic, http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_A31)
+1: CPU
+2: Main Battery: main sensor
+3: Power Converter
+4: Bay Battery: main sensor
+5: MCH (northbridge)
+6: PCMCIA/ambient
+7: Main Battery: secondary sensor
+8: Bay Battery: secondary sensor
+
+
+Procfs notes:
+ Readings from sensors that are not available return -128.
+ No commands can be written to this file.
+
+Sysfs notes:
+ Sensors that are not available return the ENXIO error. This
+ status may change at runtime, as there are hotplug thermal
+ sensors, like those inside the batteries and docks.
+
+ thinkpad-acpi thermal sensors are reported through the hwmon
+ subsystem, and follow all of the hwmon guidelines at
+ Documentation/hwmon.
+
+
+EXPERIMENTAL: Embedded controller register dump -- /proc/acpi/ibm/ecdump
+------------------------------------------------------------------------
+
+This feature is marked EXPERIMENTAL because the implementation
+directly accesses hardware registers and may not work as expected. USE
+WITH CAUTION! To use this feature, you need to supply the
+experimental=1 parameter when loading the module.
+
+This feature dumps the values of 256 embedded controller
+registers. Values which have changed since the last time the registers
+were dumped are marked with a star:
+
+[root@x40 ibm-acpi]# cat /proc/acpi/ibm/ecdump
+EC +00 +01 +02 +03 +04 +05 +06 +07 +08 +09 +0a +0b +0c +0d +0e +0f
+EC 0x00: a7 47 87 01 fe 96 00 08 01 00 cb 00 00 00 40 00
+EC 0x10: 00 00 ff ff f4 3c 87 09 01 ff 42 01 ff ff 0d 00
+EC 0x20: 00 00 00 00 00 00 00 00 00 00 00 03 43 00 00 80
+EC 0x30: 01 07 1a 00 30 04 00 00 *85 00 00 10 00 50 00 00
+EC 0x40: 00 00 00 00 00 00 14 01 00 04 00 00 00 00 00 00
+EC 0x50: 00 c0 02 0d 00 01 01 02 02 03 03 03 03 *bc *02 *bc
+EC 0x60: *02 *bc *02 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0x70: 00 00 00 00 00 12 30 40 *24 *26 *2c *27 *20 80 *1f 80
+EC 0x80: 00 00 00 06 *37 *0e 03 00 00 00 0e 07 00 00 00 00
+EC 0x90: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0xa0: *ff 09 ff 09 ff ff *64 00 *00 *00 *a2 41 *ff *ff *e0 00
+EC 0xb0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0xc0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0xd0: 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0xe0: 00 00 00 00 00 00 00 00 11 20 49 04 24 06 55 03
+EC 0xf0: 31 55 48 54 35 38 57 57 08 2f 45 73 07 65 6c 1a
+
+This feature can be used to determine the register holding the fan
+speed on some models. To do that, do the following:
+
+ - make sure the battery is fully charged
+ - make sure the fan is running
+ - run 'cat /proc/acpi/ibm/ecdump' several times, once per second or so
+
+The first step makes sure various charging-related values don't
+vary. The second ensures that the fan-related values do vary, since
+the fan speed fluctuates a bit. The third will (hopefully) mark the
+fan register with a star:
+
+[root@x40 ibm-acpi]# cat /proc/acpi/ibm/ecdump
+EC +00 +01 +02 +03 +04 +05 +06 +07 +08 +09 +0a +0b +0c +0d +0e +0f
+EC 0x00: a7 47 87 01 fe 96 00 08 01 00 cb 00 00 00 40 00
+EC 0x10: 00 00 ff ff f4 3c 87 09 01 ff 42 01 ff ff 0d 00
+EC 0x20: 00 00 00 00 00 00 00 00 00 00 00 03 43 00 00 80
+EC 0x30: 01 07 1a 00 30 04 00 00 85 00 00 10 00 50 00 00
+EC 0x40: 00 00 00 00 00 00 14 01 00 04 00 00 00 00 00 00
+EC 0x50: 00 c0 02 0d 00 01 01 02 02 03 03 03 03 bc 02 bc
+EC 0x60: 02 bc 02 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0x70: 00 00 00 00 00 12 30 40 24 27 2c 27 21 80 1f 80
+EC 0x80: 00 00 00 06 *be 0d 03 00 00 00 0e 07 00 00 00 00
+EC 0x90: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0xa0: ff 09 ff 09 ff ff 64 00 00 00 a2 41 ff ff e0 00
+EC 0xb0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0xc0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0xd0: 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+EC 0xe0: 00 00 00 00 00 00 00 00 11 20 49 04 24 06 55 03
+EC 0xf0: 31 55 48 54 35 38 57 57 08 2f 45 73 07 65 6c 1a
+
+Another set of values that varies often is the temperature
+readings. Since temperatures don't change vary fast, you can take
+several quick dumps to eliminate them.
+
+You can use a similar method to figure out the meaning of other
+embedded controller registers - e.g. make sure nothing else changes
+except the charging or discharging battery to determine which
+registers contain the current battery capacity, etc. If you experiment
+with this, do send me your results (including some complete dumps with
+a description of the conditions when they were taken.)
+
+LCD brightness control
+----------------------
+
+procfs: /proc/acpi/ibm/brightness
+sysfs backlight device "thinkpad_screen"
+
+This feature allows software control of the LCD brightness on ThinkPad
+models which don't have a hardware brightness slider.
+
+It has some limitations: the LCD backlight cannot be actually turned on or
+off by this interface, and in many ThinkPad models, the "dim while on
+battery" functionality will be enabled by the BIOS when this interface is
+used, and cannot be controlled.
+
+On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control
+has eight brightness levels, ranging from 0 to 7. Some of the levels
+may not be distinct. Later Lenovo models that implement the ACPI
+display backlight brightness control methods have 16 levels, ranging
+from 0 to 15.
+
+There are two interfaces to the firmware for direct brightness control,
+EC and CMOS. To select which one should be used, use the
+brightness_mode module parameter: brightness_mode=1 selects EC mode,
+brightness_mode=2 selects CMOS mode, brightness_mode=3 selects both EC
+and CMOS. The driver tries to auto-detect which interface to use.
+
+When display backlight brightness controls are available through the
+standard ACPI interface, it is best to use it instead of this direct
+ThinkPad-specific interface. The driver will disable its native
+backlight brightness control interface if it detects that the standard
+ACPI interface is available in the ThinkPad.
+
+The brightness_enable module parameter can be used to control whether
+the LCD brightness control feature will be enabled when available.
+brightness_enable=0 forces it to be disabled. brightness_enable=1
+forces it to be enabled when available, even if the standard ACPI
+interface is also available.
+
+Procfs notes:
+
+ The available commands are:
+
+ echo up >/proc/acpi/ibm/brightness
+ echo down >/proc/acpi/ibm/brightness
+ echo 'level <level>' >/proc/acpi/ibm/brightness
+
+Sysfs notes:
+
+The interface is implemented through the backlight sysfs class, which is
+poorly documented at this time.
+
+Locate the thinkpad_screen device under /sys/class/backlight, and inside
+it there will be the following attributes:
+
+ max_brightness:
+ Reads the maximum brightness the hardware can be set to.
+ The minimum is always zero.
+
+ actual_brightness:
+ Reads what brightness the screen is set to at this instant.
+
+ brightness:
+ Writes request the driver to change brightness to the
+ given value. Reads will tell you what brightness the
+ driver is trying to set the display to when "power" is set
+ to zero and the display has not been dimmed by a kernel
+ power management event.
+
+ power:
+ power management mode, where 0 is "display on", and 1 to 3
+ will dim the display backlight to brightness level 0
+ because thinkpad-acpi cannot really turn the backlight
+ off. Kernel power management events can temporarily
+ increase the current power management level, i.e. they can
+ dim the display.
+
+
+WARNING:
+
+ Whatever you do, do NOT ever call thinkpad-acpi backlight-level change
+ interface and the ACPI-based backlight level change interface
+ (available on newer BIOSes, and driven by the Linux ACPI video driver)
+ at the same time. The two will interact in bad ways, do funny things,
+ and maybe reduce the life of the backlight lamps by needlessly kicking
+ its level up and down at every change.
+
+Volume control -- /proc/acpi/ibm/volume
+---------------------------------------
+
+This feature allows volume control on ThinkPad models which don't have
+a hardware volume knob. The available commands are:
+
+ echo up >/proc/acpi/ibm/volume
+ echo down >/proc/acpi/ibm/volume
+ echo mute >/proc/acpi/ibm/volume
+ echo 'level <level>' >/proc/acpi/ibm/volume
+
+The <level> number range is 0 to 15 although not all of them may be
+distinct. The unmute the volume after the mute command, use either the
+up or down command (the level command will not unmute the volume).
+The current volume level and mute state is shown in the file.
+
+Fan control and monitoring: fan speed, fan enable/disable
+---------------------------------------------------------
+
+procfs: /proc/acpi/ibm/fan
+sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
+ pwm1_enable
+sysfs hwmon driver attributes: fan_watchdog
+
+NOTE NOTE NOTE: fan control operations are disabled by default for
+safety reasons. To enable them, the module parameter "fan_control=1"
+must be given to thinkpad-acpi.
+
+This feature attempts to show the current fan speed, control mode and
+other fan data that might be available. The speed is read directly
+from the hardware registers of the embedded controller. This is known
+to work on later R, T, X and Z series ThinkPads but may show a bogus
+value on other models.
+
+Fan levels:
+
+Most ThinkPad fans work in "levels" at the firmware interface. Level 0
+stops the fan. The higher the level, the higher the fan speed, although
+adjacent levels often map to the same fan speed. 7 is the highest
+level, where the fan reaches the maximum recommended speed.
+
+Level "auto" means the EC changes the fan level according to some
+internal algorithm, usually based on readings from the thermal sensors.
+
+There is also a "full-speed" level, also known as "disengaged" level.
+In this level, the EC disables the speed-locked closed-loop fan control,
+and drives the fan as fast as it can go, which might exceed hardware
+limits, so use this level with caution.
+
+The fan usually ramps up or down slowly from one speed to another, and
+it is normal for the EC to take several seconds to react to fan
+commands. The full-speed level may take up to two minutes to ramp up to
+maximum speed, and in some ThinkPads, the tachometer readings go stale
+while the EC is transitioning to the full-speed level.
+
+WARNING WARNING WARNING: do not leave the fan disabled unless you are
+monitoring all of the temperature sensor readings and you are ready to
+enable it if necessary to avoid overheating.
+
+An enabled fan in level "auto" may stop spinning if the EC decides the
+ThinkPad is cool enough and doesn't need the extra airflow. This is
+normal, and the EC will spin the fan up if the various thermal readings
+rise too much.
+
+On the X40, this seems to depend on the CPU and HDD temperatures.
+Specifically, the fan is turned on when either the CPU temperature
+climbs to 56 degrees or the HDD temperature climbs to 46 degrees. The
+fan is turned off when the CPU temperature drops to 49 degrees and the
+HDD temperature drops to 41 degrees. These thresholds cannot
+currently be controlled.
+
+The ThinkPad's ACPI DSDT code will reprogram the fan on its own when
+certain conditions are met. It will override any fan programming done
+through thinkpad-acpi.
+
+The thinkpad-acpi kernel driver can be programmed to revert the fan
+level to a safe setting if userspace does not issue one of the procfs
+fan commands: "enable", "disable", "level" or "watchdog", or if there
+are no writes to pwm1_enable (or to pwm1 *if and only if* pwm1_enable is
+set to 1, manual mode) within a configurable amount of time of up to
+120 seconds. This functionality is called fan safety watchdog.
+
+Note that the watchdog timer stops after it enables the fan. It will be
+rearmed again automatically (using the same interval) when one of the
+above mentioned fan commands is received. The fan watchdog is,
+therefore, not suitable to protect against fan mode changes made through
+means other than the "enable", "disable", and "level" procfs fan
+commands, or the hwmon fan control sysfs interface.
+
+Procfs notes:
+
+The fan may be enabled or disabled with the following commands:
+
+ echo enable >/proc/acpi/ibm/fan
+ echo disable >/proc/acpi/ibm/fan
+
+Placing a fan on level 0 is the same as disabling it. Enabling a fan
+will try to place it in a safe level if it is too slow or disabled.
+
+The fan level can be controlled with the command:
+
+ echo 'level <level>' > /proc/acpi/ibm/fan
+
+Where <level> is an integer from 0 to 7, or one of the words "auto" or
+"full-speed" (without the quotes). Not all ThinkPads support the "auto"
+and "full-speed" levels. The driver accepts "disengaged" as an alias for
+"full-speed", and reports it as "disengaged" for backwards
+compatibility.
+
+On the X31 and X40 (and ONLY on those models), the fan speed can be
+controlled to a certain degree. Once the fan is running, it can be
+forced to run faster or slower with the following command:
+
+ echo 'speed <speed>' > /proc/acpi/ibm/fan
+
+The sustainable range of fan speeds on the X40 appears to be from about
+3700 to about 7350. Values outside this range either do not have any
+effect or the fan speed eventually settles somewhere in that range. The
+fan cannot be stopped or started with this command. This functionality
+is incomplete, and not available through the sysfs interface.
+
+To program the safety watchdog, use the "watchdog" command.
+
+ echo 'watchdog <interval in seconds>' > /proc/acpi/ibm/fan
+
+If you want to disable the watchdog, use 0 as the interval.
+
+Sysfs notes:
+
+The sysfs interface follows the hwmon subsystem guidelines for the most
+part, and the exception is the fan safety watchdog.
+
+Writes to any of the sysfs attributes may return the EINVAL error if
+that operation is not supported in a given ThinkPad or if the parameter
+is out-of-bounds, and EPERM if it is forbidden. They may also return
+EINTR (interrupted system call), and EIO (I/O error while trying to talk
+to the firmware).
+
+Features not yet implemented by the driver return ENOSYS.
+
+hwmon device attribute pwm1_enable:
+ 0: PWM offline (fan is set to full-speed mode)
+ 1: Manual PWM control (use pwm1 to set fan level)
+ 2: Hardware PWM control (EC "auto" mode)
+ 3: reserved (Software PWM control, not implemented yet)
+
+ Modes 0 and 2 are not supported by all ThinkPads, and the
+ driver is not always able to detect this. If it does know a
+ mode is unsupported, it will return -EINVAL.
+
+hwmon device attribute pwm1:
+ Fan level, scaled from the firmware values of 0-7 to the hwmon
+ scale of 0-255. 0 means fan stopped, 255 means highest normal
+ speed (level 7).
+
+ This attribute only commands the fan if pmw1_enable is set to 1
+ (manual PWM control).
+
+hwmon device attribute fan1_input:
+ Fan tachometer reading, in RPM. May go stale on certain
+ ThinkPads while the EC transitions the PWM to offline mode,
+ which can take up to two minutes. May return rubbish on older
+ ThinkPads.
+
+hwmon driver attribute fan_watchdog:
+ Fan safety watchdog timer interval, in seconds. Minimum is
+ 1 second, maximum is 120 seconds. 0 disables the watchdog.
+
+To stop the fan: set pwm1 to zero, and pwm1_enable to 1.
+
+To start the fan in a safe mode: set pwm1_enable to 2. If that fails
+with EINVAL, try to set pwm1_enable to 1 and pwm1 to at least 128 (255
+would be the safest choice, though).
+
+
+WAN
+---
+
+procfs: /proc/acpi/ibm/wan
+sysfs device attribute: wwan_enable (deprecated)
+sysfs rfkill class: switch "tpacpi_wwan_sw"
+
+This feature shows the presence and current state of a W-WAN (Sierra
+Wireless EV-DO) device.
+
+It was tested on a Lenovo ThinkPad X60. It should probably work on other
+ThinkPad models which come with this module installed.
+
+Procfs notes:
+
+If the W-WAN card is installed, the following commands can be used:
+
+ echo enable > /proc/acpi/ibm/wan
+ echo disable > /proc/acpi/ibm/wan
+
+Sysfs notes:
+
+ If the W-WAN card is installed, it can be enabled /
+ disabled through the "wwan_enable" thinkpad-acpi device
+ attribute, and its current status can also be queried.
+
+ enable:
+ 0: disables WWAN card / WWAN card is disabled
+ 1: enables WWAN card / WWAN card is enabled.
+
+ Note: this interface has been superseded by the generic rfkill
+ class. It has been deprecated, and it will be removed in year
+ 2010.
+
+ rfkill controller switch "tpacpi_wwan_sw": refer to
+ Documentation/rfkill.txt for details.
+
+Multiple Commands, Module Parameters
+------------------------------------
+
+Multiple commands can be written to the proc files in one shot by
+separating them with commas, for example:
+
+ echo enable,0xffff > /proc/acpi/ibm/hotkey
+ echo lcd_disable,crt_enable > /proc/acpi/ibm/video
+
+Commands can also be specified when loading the thinkpad-acpi module,
+for example:
+
+ modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable
+
+Enabling debugging output
+-------------------------
+
+The module takes a debug parameter which can be used to selectively
+enable various classes of debugging output, for example:
+
+ modprobe thinkpad_acpi debug=0xffff
+
+will enable all debugging output classes. It takes a bitmask, so
+to enable more than one output class, just add their values.
+
+ Debug bitmask Description
+ 0x0001 Initialization and probing
+ 0x0002 Removal
+
+There is also a kernel build option to enable more debugging
+information, which may be necessary to debug driver problems.
+
+The level of debugging information output by the driver can be changed
+at runtime through sysfs, using the driver attribute debug_level. The
+attribute takes the same bitmask as the debug module parameter above.
+
+Force loading of module
+-----------------------
+
+If thinkpad-acpi refuses to detect your ThinkPad, you can try to specify
+the module parameter force_load=1. Regardless of whether this works or
+not, please contact ibm-acpi-devel@lists.sourceforge.net with a report.
+
+
+Sysfs interface changelog:
+
+0x000100: Initial sysfs support, as a single platform driver and
+ device.
+0x000200: Hot key support for 32 hot keys, and radio slider switch
+ support.
+0x010000: Hot keys are now handled by default over the input
+ layer, the radio switch generates input event EV_RADIO,
+ and the driver enables hot key handling by default in
+ the firmware.
+
+0x020000: ABI fix: added a separate hwmon platform device and
+ driver, which must be located by name (thinkpad)
+ and the hwmon class for libsensors4 (lm-sensors 3)
+ compatibility. Moved all hwmon attributes to this
+ new platform device.
+
+0x020100: Marker for thinkpad-acpi with hot key NVRAM polling
+ support. If you must, use it to know you should not
+ start an userspace NVRAM poller (allows to detect when
+ NVRAM is compiled out by the user because it is
+ unneeded/undesired in the first place).
+0x020101: Marker for thinkpad-acpi with hot key NVRAM polling
+ and proper hotkey_mask semantics (version 8 of the
+ NVRAM polling patch). Some development snapshots of
+ 0.18 had an earlier version that did strange things
+ to hotkey_mask.
+
+0x020200: Add poll()/select() support to the following attributes:
+ hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
diff --git a/Documentation/ldm.txt b/Documentation/ldm.txt
new file mode 100644
index 0000000..718085b
--- /dev/null
+++ b/Documentation/ldm.txt
@@ -0,0 +1,109 @@
+
+ LDM - Logical Disk Manager (Dynamic Disks)
+ ------------------------------------------
+
+Originally Written by FlatCap - Richard Russon <ldm@flatcap.org>.
+Last Updated by Anton Altaparmakov on 30 March 2007 for Windows Vista.
+
+Overview
+--------
+
+Windows 2000, XP, and Vista use a new partitioning scheme. It is a complete
+replacement for the MSDOS style partitions. It stores its information in a
+1MiB journalled database at the end of the physical disk. The size of
+partitions is limited only by disk space. The maximum number of partitions is
+nearly 2000.
+
+Any partitions created under the LDM are called "Dynamic Disks". There are no
+longer any primary or extended partitions. Normal MSDOS style partitions are
+now known as Basic Disks.
+
+If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use
+Dynamic Disks. The journalling allows Windows to make changes to these
+partitions and filesystems without the need to reboot.
+
+Once the LDM driver has divided up the disk, you can use the MD driver to
+assemble any multi-partition volumes, e.g. Stripes, RAID5.
+
+To prevent legacy applications from repartitioning the disk, the LDM creates a
+dummy MSDOS partition containing one disk-sized partition. This is what is
+supported with the Linux LDM driver.
+
+A newer approach that has been implemented with Vista is to put LDM on top of a
+GPT label disk. This is not supported by the Linux LDM driver yet.
+
+
+Example
+-------
+
+Below we have a 50MiB disk, divided into seven partitions.
+N.B. The missing 1MiB at the end of the disk is where the LDM database is
+ stored.
+
+ Device | Offset Bytes Sectors MiB | Size Bytes Sectors MiB
+ -------+----------------------------+---------------------------
+ hda | 0 0 0 | 52428800 102400 50
+ hda1 | 51380224 100352 49 | 1048576 2048 1
+ hda2 | 16384 32 0 | 6979584 13632 6
+ hda3 | 6995968 13664 6 | 10485760 20480 10
+ hda4 | 17481728 34144 16 | 4194304 8192 4
+ hda5 | 21676032 42336 20 | 5242880 10240 5
+ hda6 | 26918912 52576 25 | 10485760 20480 10
+ hda7 | 37404672 73056 35 | 13959168 27264 13
+
+The LDM Database may not store the partitions in the order that they appear on
+disk, but the driver will sort them.
+
+When Linux boots, you will see something like:
+
+ hda: 102400 sectors w/32KiB Cache, CHS=50/64/32
+ hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7
+
+
+Compiling LDM Support
+---------------------
+
+To enable LDM, choose the following two options:
+
+ "Advanced partition selection" CONFIG_PARTITION_ADVANCED
+ "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION
+
+If you believe the driver isn't working as it should, you can enable the extra
+debugging code. This will produce a LOT of output. The option is:
+
+ "Windows LDM extra logging" CONFIG_LDM_DEBUG
+
+N.B. The partition code cannot be compiled as a module.
+
+As with all the partition code, if the driver doesn't see signs of its type of
+partition, it will pass control to another driver, so there is no harm in
+enabling it.
+
+If you have Dynamic Disks but don't enable the driver, then all you will see
+is a dummy MSDOS partition filling the whole disk. You won't be able to mount
+any of the volumes on the disk.
+
+
+Booting
+-------
+
+If you enable LDM support, then lilo is capable of booting from any of the
+discovered partitions. However, grub does not understand the LDM partitioning
+and cannot boot from a Dynamic Disk.
+
+
+More Documentation
+------------------
+
+There is an Overview of the LDM together with complete Technical Documentation.
+It is available for download.
+
+ http://www.linux-ntfs.org/content/view/19/37/
+
+If you have any LDM questions that aren't answered in the documentation, email
+me.
+
+Cheers,
+ FlatCap - Richard Russon
+ ldm@flatcap.org
+
diff --git a/Documentation/leds-class.txt b/Documentation/leds-class.txt
new file mode 100644
index 0000000..6399557
--- /dev/null
+++ b/Documentation/leds-class.txt
@@ -0,0 +1,94 @@
+LED handling under Linux
+========================
+
+If you're reading this and thinking about keyboard leds, these are
+handled by the input subsystem and the led class is *not* needed.
+
+In its simplest form, the LED class just allows control of LEDs from
+userspace. LEDs appear in /sys/class/leds/. The brightness file will
+set the brightness of the LED (taking a value 0-255). Most LEDs don't
+have hardware brightness support so will just be turned on for non-zero
+brightness settings.
+
+The class also introduces the optional concept of an LED trigger. A trigger
+is a kernel based source of led events. Triggers can either be simple or
+complex. A simple trigger isn't configurable and is designed to slot into
+existing subsystems with minimal additional code. Examples are the ide-disk,
+nand-disk and sharpsl-charge triggers. With led triggers disabled, the code
+optimises away.
+
+Complex triggers whilst available to all LEDs have LED specific
+parameters and work on a per LED basis. The timer trigger is an example.
+The timer trigger will periodically change the LED brightness between
+LED_OFF and the current brightness setting. The "on" and "off" time can
+be specified via /sys/class/leds/<device>/delay_{on,off} in milliseconds.
+You can change the brightness value of a LED independently of the timer
+trigger. However, if you set the brightness value to LED_OFF it will
+also disable the timer trigger.
+
+You can change triggers in a similar manner to the way an IO scheduler
+is chosen (via /sys/class/leds/<device>/trigger). Trigger specific
+parameters can appear in /sys/class/leds/<device> once a given trigger is
+selected.
+
+
+Design Philosophy
+=================
+
+The underlying design philosophy is simplicity. LEDs are simple devices
+and the aim is to keep a small amount of code giving as much functionality
+as possible. Please keep this in mind when suggesting enhancements.
+
+
+LED Device Naming
+=================
+
+Is currently of the form:
+
+"devicename:colour:function"
+
+There have been calls for LED properties such as colour to be exported as
+individual led class attributes. As a solution which doesn't incur as much
+overhead, I suggest these become part of the device name. The naming scheme
+above leaves scope for further attributes should they be needed. If sections
+of the name don't apply, just leave that section blank.
+
+
+Hardware accelerated blink of LEDs
+==================================
+
+Some LEDs can be programmed to blink without any CPU interaction. To
+support this feature, a LED driver can optionally implement the
+blink_set() function (see <linux/leds.h>). If implemented, triggers can
+attempt to use it before falling back to software timers. The blink_set()
+function should return 0 if the blink setting is supported, or -EINVAL
+otherwise, which means that LED blinking will be handled by software.
+
+The blink_set() function should choose a user friendly blinking
+value if it is called with *delay_on==0 && *delay_off==0 parameters. In
+this case the driver should give back the chosen value through delay_on
+and delay_off parameters to the leds subsystem.
+
+Setting the brightness to zero with brightness_set() callback function
+should completely turn off the LED and cancel the previously programmed
+hardware blinking function, if any.
+
+
+Known Issues
+============
+
+The LED Trigger core cannot be a module as the simple trigger functions
+would cause nightmare dependency issues. I see this as a minor issue
+compared to the benefits the simple trigger functionality brings. The
+rest of the LED subsystem can be modular.
+
+
+Future Development
+==================
+
+At the moment, a trigger can't be created specifically for a single LED.
+There are a number of cases where a trigger might only be mappable to a
+particular LED (ACPI?). The addition of triggers provided by the LED driver
+should cover this option and be possible to add without breaking the
+current interface.
+
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
new file mode 100644
index 0000000..725eef8
--- /dev/null
+++ b/Documentation/lguest/Makefile
@@ -0,0 +1,8 @@
+# This creates the demonstration utility "lguest" which runs a Linux guest.
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include
+LDLIBS:=-lz
+
+all: lguest
+
+clean:
+ rm -f lguest
diff --git a/Documentation/lguest/extract b/Documentation/lguest/extract
new file mode 100644
index 0000000..7730bb6
--- /dev/null
+++ b/Documentation/lguest/extract
@@ -0,0 +1,58 @@
+#! /bin/sh
+
+set -e
+
+PREFIX=$1
+shift
+
+trap 'rm -r $TMPDIR' 0
+TMPDIR=`mktemp -d`
+
+exec 3>/dev/null
+for f; do
+ while IFS="
+" read -r LINE; do
+ case "$LINE" in
+ *$PREFIX:[0-9]*:\**)
+ NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
+ if [ -f $TMPDIR/$NUM ]; then
+ echo "$TMPDIR/$NUM already exits prior to $f"
+ exit 1
+ fi
+ exec 3>>$TMPDIR/$NUM
+ echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
+ /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
+ ;;
+ *$PREFIX:[0-9]*)
+ NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
+ if [ -f $TMPDIR/$NUM ]; then
+ echo "$TMPDIR/$NUM already exits prior to $f"
+ exit 1
+ fi
+ exec 3>>$TMPDIR/$NUM
+ echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
+ /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
+ ;;
+ *:\**)
+ /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
+ echo >&3
+ exec 3>/dev/null
+ ;;
+ *)
+ /bin/echo "$LINE" >&3
+ ;;
+ esac
+ done < $f
+ echo >&3
+ exec 3>/dev/null
+done
+
+LASTFILE=""
+for f in $TMPDIR/*; do
+ if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
+ LASTFILE=$(cat $TMPDIR/.$(basename $f) )
+ echo "[ $LASTFILE ]"
+ fi
+ cat $f
+done
+
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
new file mode 100644
index 0000000..8045206
--- /dev/null
+++ b/Documentation/lguest/lguest.c
@@ -0,0 +1,2088 @@
+/*P:100 This is the Launcher code, a simple program which lays out the
+ * "physical" memory for the new Guest by mapping the kernel image and
+ * the virtual devices, then opens /dev/lguest to tell the kernel
+ * about the Guest and control it. :*/
+#define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <err.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <elf.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <ctype.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <netinet/in.h>
+#include <net/if.h>
+#include <linux/sockios.h>
+#include <linux/if_tun.h>
+#include <sys/uio.h>
+#include <termios.h>
+#include <getopt.h>
+#include <zlib.h>
+#include <assert.h>
+#include <sched.h>
+#include <limits.h>
+#include <stddef.h>
+#include <signal.h>
+#include "linux/lguest_launcher.h"
+#include "linux/virtio_config.h"
+#include "linux/virtio_net.h"
+#include "linux/virtio_blk.h"
+#include "linux/virtio_console.h"
+#include "linux/virtio_rng.h"
+#include "linux/virtio_ring.h"
+#include "asm/bootparam.h"
+/*L:110 We can ignore the 39 include files we need for this program, but I do
+ * want to draw attention to the use of kernel-style types.
+ *
+ * As Linus said, "C is a Spartan language, and so should your naming be." I
+ * like these abbreviations, so we define them here. Note that u64 is always
+ * unsigned long long, which works on all Linux systems: this means that we can
+ * use %llu in printf for any u64. */
+typedef unsigned long long u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+/*:*/
+
+#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
+#define NET_PEERNUM 1
+#define BRIDGE_PFX "bridge:"
+#ifndef SIOCBRADDIF
+#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
+#endif
+/* We can have up to 256 pages for devices. */
+#define DEVICE_PAGES 256
+/* This will occupy 3 pages: it must be a power of 2. */
+#define VIRTQUEUE_NUM 256
+
+/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
+ * this, and although I wouldn't recommend it, it works quite nicely here. */
+static bool verbose;
+#define verbose(args...) \
+ do { if (verbose) printf(args); } while(0)
+/*:*/
+
+/* File descriptors for the Waker. */
+struct {
+ int pipe[2];
+ int lguest_fd;
+} waker_fds;
+
+/* The pointer to the start of guest memory. */
+static void *guest_base;
+/* The maximum guest physical address allowed, and maximum possible. */
+static unsigned long guest_limit, guest_max;
+/* The pipe for signal hander to write to. */
+static int timeoutpipe[2];
+static unsigned int timeout_usec = 500;
+
+/* a per-cpu variable indicating whose vcpu is currently running */
+static unsigned int __thread cpu_id;
+
+/* This is our list of devices. */
+struct device_list
+{
+ /* Summary information about the devices in our list: ready to pass to
+ * select() to ask which need servicing.*/
+ fd_set infds;
+ int max_infd;
+
+ /* Counter to assign interrupt numbers. */
+ unsigned int next_irq;
+
+ /* Counter to print out convenient device numbers. */
+ unsigned int device_num;
+
+ /* The descriptor page for the devices. */
+ u8 *descpage;
+
+ /* A single linked list of devices. */
+ struct device *dev;
+ /* And a pointer to the last device for easy append and also for
+ * configuration appending. */
+ struct device *lastdev;
+};
+
+/* The list of Guest devices, based on command line arguments. */
+static struct device_list devices;
+
+/* The device structure describes a single device. */
+struct device
+{
+ /* The linked-list pointer. */
+ struct device *next;
+
+ /* The this device's descriptor, as mapped into the Guest. */
+ struct lguest_device_desc *desc;
+
+ /* The name of this device, for --verbose. */
+ const char *name;
+
+ /* If handle_input is set, it wants to be called when this file
+ * descriptor is ready. */
+ int fd;
+ bool (*handle_input)(int fd, struct device *me);
+
+ /* Any queues attached to this device */
+ struct virtqueue *vq;
+
+ /* Handle status being finalized (ie. feature bits stable). */
+ void (*ready)(struct device *me);
+
+ /* Device-specific data. */
+ void *priv;
+};
+
+/* The virtqueue structure describes a queue attached to a device. */
+struct virtqueue
+{
+ struct virtqueue *next;
+
+ /* Which device owns me. */
+ struct device *dev;
+
+ /* The configuration for this queue. */
+ struct lguest_vqconfig config;
+
+ /* The actual ring of buffers. */
+ struct vring vring;
+
+ /* Last available index we saw. */
+ u16 last_avail_idx;
+
+ /* The routine to call when the Guest pings us, or timeout. */
+ void (*handle_output)(int fd, struct virtqueue *me, bool timeout);
+
+ /* Outstanding buffers */
+ unsigned int inflight;
+
+ /* Is this blocked awaiting a timer? */
+ bool blocked;
+};
+
+/* Remember the arguments to the program so we can "reboot" */
+static char **main_args;
+
+/* Since guest is UP and we don't run at the same time, we don't need barriers.
+ * But I include them in the code in case others copy it. */
+#define wmb()
+
+/* Convert an iovec element to the given type.
+ *
+ * This is a fairly ugly trick: we need to know the size of the type and
+ * alignment requirement to check the pointer is kosher. It's also nice to
+ * have the name of the type in case we report failure.
+ *
+ * Typing those three things all the time is cumbersome and error prone, so we
+ * have a macro which sets them all up and passes to the real function. */
+#define convert(iov, type) \
+ ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
+
+static void *_convert(struct iovec *iov, size_t size, size_t align,
+ const char *name)
+{
+ if (iov->iov_len != size)
+ errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
+ if ((unsigned long)iov->iov_base % align != 0)
+ errx(1, "Bad alignment %p for %s", iov->iov_base, name);
+ return iov->iov_base;
+}
+
+/* Wrapper for the last available index. Makes it easier to change. */
+#define lg_last_avail(vq) ((vq)->last_avail_idx)
+
+/* The virtio configuration space is defined to be little-endian. x86 is
+ * little-endian too, but it's nice to be explicit so we have these helpers. */
+#define cpu_to_le16(v16) (v16)
+#define cpu_to_le32(v32) (v32)
+#define cpu_to_le64(v64) (v64)
+#define le16_to_cpu(v16) (v16)
+#define le32_to_cpu(v32) (v32)
+#define le64_to_cpu(v64) (v64)
+
+/* Is this iovec empty? */
+static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_iov; i++)
+ if (iov[i].iov_len)
+ return false;
+ return true;
+}
+
+/* Take len bytes from the front of this iovec. */
+static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_iov; i++) {
+ unsigned int used;
+
+ used = iov[i].iov_len < len ? iov[i].iov_len : len;
+ iov[i].iov_base += used;
+ iov[i].iov_len -= used;
+ len -= used;
+ }
+ assert(len == 0);
+}
+
+/* The device virtqueue descriptors are followed by feature bitmasks. */
+static u8 *get_feature_bits(struct device *dev)
+{
+ return (u8 *)(dev->desc + 1)
+ + dev->desc->num_vq * sizeof(struct lguest_vqconfig);
+}
+
+/*L:100 The Launcher code itself takes us out into userspace, that scary place
+ * where pointers run wild and free! Unfortunately, like most userspace
+ * programs, it's quite boring (which is why everyone likes to hack on the
+ * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
+ * will get you through this section. Or, maybe not.
+ *
+ * The Launcher sets up a big chunk of memory to be the Guest's "physical"
+ * memory and stores it in "guest_base". In other words, Guest physical ==
+ * Launcher virtual with an offset.
+ *
+ * This can be tough to get your head around, but usually it just means that we
+ * use these trivial conversion functions when the Guest gives us it's
+ * "physical" addresses: */
+static void *from_guest_phys(unsigned long addr)
+{
+ return guest_base + addr;
+}
+
+static unsigned long to_guest_phys(const void *addr)
+{
+ return (addr - guest_base);
+}
+
+/*L:130
+ * Loading the Kernel.
+ *
+ * We start with couple of simple helper routines. open_or_die() avoids
+ * error-checking code cluttering the callers: */
+static int open_or_die(const char *name, int flags)
+{
+ int fd = open(name, flags);
+ if (fd < 0)
+ err(1, "Failed to open %s", name);
+ return fd;
+}
+
+/* map_zeroed_pages() takes a number of pages. */
+static void *map_zeroed_pages(unsigned int num)
+{
+ int fd = open_or_die("/dev/zero", O_RDONLY);
+ void *addr;
+
+ /* We use a private mapping (ie. if we write to the page, it will be
+ * copied). */
+ addr = mmap(NULL, getpagesize() * num,
+ PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
+ if (addr == MAP_FAILED)
+ err(1, "Mmaping %u pages of /dev/zero", num);
+ close(fd);
+
+ return addr;
+}
+
+/* Get some more pages for a device. */
+static void *get_pages(unsigned int num)
+{
+ void *addr = from_guest_phys(guest_limit);
+
+ guest_limit += num * getpagesize();
+ if (guest_limit > guest_max)
+ errx(1, "Not enough memory for devices");
+ return addr;
+}
+
+/* This routine is used to load the kernel or initrd. It tries mmap, but if
+ * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
+ * it falls back to reading the memory in. */
+static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
+{
+ ssize_t r;
+
+ /* We map writable even though for some segments are marked read-only.
+ * The kernel really wants to be writable: it patches its own
+ * instructions.
+ *
+ * MAP_PRIVATE means that the page won't be copied until a write is
+ * done to it. This allows us to share untouched memory between
+ * Guests. */
+ if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
+ return;
+
+ /* pread does a seek and a read in one shot: saves a few lines. */
+ r = pread(fd, addr, len, offset);
+ if (r != len)
+ err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
+}
+
+/* This routine takes an open vmlinux image, which is in ELF, and maps it into
+ * the Guest memory. ELF = Embedded Linking Format, which is the format used
+ * by all modern binaries on Linux including the kernel.
+ *
+ * The ELF headers give *two* addresses: a physical address, and a virtual
+ * address. We use the physical address; the Guest will map itself to the
+ * virtual address.
+ *
+ * We return the starting address. */
+static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
+{
+ Elf32_Phdr phdr[ehdr->e_phnum];
+ unsigned int i;
+
+ /* Sanity checks on the main ELF header: an x86 executable with a
+ * reasonable number of correctly-sized program headers. */
+ if (ehdr->e_type != ET_EXEC
+ || ehdr->e_machine != EM_386
+ || ehdr->e_phentsize != sizeof(Elf32_Phdr)
+ || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+ errx(1, "Malformed elf header");
+
+ /* An ELF executable contains an ELF header and a number of "program"
+ * headers which indicate which parts ("segments") of the program to
+ * load where. */
+
+ /* We read in all the program headers at once: */
+ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
+ err(1, "Seeking to program headers");
+ if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
+ err(1, "Reading program headers");
+
+ /* Try all the headers: there are usually only three. A read-only one,
+ * a read-write one, and a "note" section which we don't load. */
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ /* If this isn't a loadable segment, we ignore it */
+ if (phdr[i].p_type != PT_LOAD)
+ continue;
+
+ verbose("Section %i: size %i addr %p\n",
+ i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+
+ /* We map this section of the file at its physical address. */
+ map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
+ phdr[i].p_offset, phdr[i].p_filesz);
+ }
+
+ /* The entry point is given in the ELF header. */
+ return ehdr->e_entry;
+}
+
+/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
+ * supposed to jump into it and it will unpack itself. We used to have to
+ * perform some hairy magic because the unpacking code scared me.
+ *
+ * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
+ * a small patch to jump over the tricky bits in the Guest, so now we just read
+ * the funky header so we know where in the file to load, and away we go! */
+static unsigned long load_bzimage(int fd)
+{
+ struct boot_params boot;
+ int r;
+ /* Modern bzImages get loaded at 1M. */
+ void *p = from_guest_phys(0x100000);
+
+ /* Go back to the start of the file and read the header. It should be
+ * a Linux boot header (see Documentation/x86/i386/boot.txt) */
+ lseek(fd, 0, SEEK_SET);
+ read(fd, &boot, sizeof(boot));
+
+ /* Inside the setup_hdr, we expect the magic "HdrS" */
+ if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
+ errx(1, "This doesn't look like a bzImage to me");
+
+ /* Skip over the extra sectors of the header. */
+ lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
+
+ /* Now read everything into memory. in nice big chunks. */
+ while ((r = read(fd, p, 65536)) > 0)
+ p += r;
+
+ /* Finally, code32_start tells us where to enter the kernel. */
+ return boot.hdr.code32_start;
+}
+
+/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
+ * come wrapped up in the self-decompressing "bzImage" format. With a little
+ * work, we can load those, too. */
+static unsigned long load_kernel(int fd)
+{
+ Elf32_Ehdr hdr;
+
+ /* Read in the first few bytes. */
+ if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+ err(1, "Reading kernel");
+
+ /* If it's an ELF file, it starts with "\177ELF" */
+ if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
+ return map_elf(fd, &hdr);
+
+ /* Otherwise we assume it's a bzImage, and try to load it. */
+ return load_bzimage(fd);
+}
+
+/* This is a trivial little helper to align pages. Andi Kleen hated it because
+ * it calls getpagesize() twice: "it's dumb code."
+ *
+ * Kernel guys get really het up about optimization, even when it's not
+ * necessary. I leave this code as a reaction against that. */
+static inline unsigned long page_align(unsigned long addr)
+{
+ /* Add upwards and truncate downwards. */
+ return ((addr + getpagesize()-1) & ~(getpagesize()-1));
+}
+
+/*L:180 An "initial ram disk" is a disk image loaded into memory along with
+ * the kernel which the kernel can use to boot from without needing any
+ * drivers. Most distributions now use this as standard: the initrd contains
+ * the code to load the appropriate driver modules for the current machine.
+ *
+ * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
+ * kernels. He sent me this (and tells me when I break it). */
+static unsigned long load_initrd(const char *name, unsigned long mem)
+{
+ int ifd;
+ struct stat st;
+ unsigned long len;
+
+ ifd = open_or_die(name, O_RDONLY);
+ /* fstat() is needed to get the file size. */
+ if (fstat(ifd, &st) < 0)
+ err(1, "fstat() on initrd '%s'", name);
+
+ /* We map the initrd at the top of memory, but mmap wants it to be
+ * page-aligned, so we round the size up for that. */
+ len = page_align(st.st_size);
+ map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
+ /* Once a file is mapped, you can close the file descriptor. It's a
+ * little odd, but quite useful. */
+ close(ifd);
+ verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
+
+ /* We return the initrd size. */
+ return len;
+}
+
+/* Once we know how much memory we have we can construct simple linear page
+ * tables which set virtual == physical which will get the Guest far enough
+ * into the boot to create its own.
+ *
+ * We lay them out of the way, just below the initrd (which is why we need to
+ * know its size here). */
+static unsigned long setup_pagetables(unsigned long mem,
+ unsigned long initrd_size)
+{
+ unsigned long *pgdir, *linear;
+ unsigned int mapped_pages, i, linear_pages;
+ unsigned int ptes_per_page = getpagesize()/sizeof(void *);
+
+ mapped_pages = mem/getpagesize();
+
+ /* Each PTE page can map ptes_per_page pages: how many do we need? */
+ linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
+
+ /* We put the toplevel page directory page at the top of memory. */
+ pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
+
+ /* Now we use the next linear_pages pages as pte pages */
+ linear = (void *)pgdir - linear_pages*getpagesize();
+
+ /* Linear mapping is easy: put every page's address into the mapping in
+ * order. PAGE_PRESENT contains the flags Present, Writable and
+ * Executable. */
+ for (i = 0; i < mapped_pages; i++)
+ linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
+
+ /* The top level points to the linear page table pages above. */
+ for (i = 0; i < mapped_pages; i += ptes_per_page) {
+ pgdir[i/ptes_per_page]
+ = ((to_guest_phys(linear) + i*sizeof(void *))
+ | PAGE_PRESENT);
+ }
+
+ verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
+ mapped_pages, linear_pages, to_guest_phys(linear));
+
+ /* We return the top level (guest-physical) address: the kernel needs
+ * to know where it is. */
+ return to_guest_phys(pgdir);
+}
+/*:*/
+
+/* Simple routine to roll all the commandline arguments together with spaces
+ * between them. */
+static void concat(char *dst, char *args[])
+{
+ unsigned int i, len = 0;
+
+ for (i = 0; args[i]; i++) {
+ if (i) {
+ strcat(dst+len, " ");
+ len++;
+ }
+ strcpy(dst+len, args[i]);
+ len += strlen(args[i]);
+ }
+ /* In case it's empty. */
+ dst[len] = '\0';
+}
+
+/*L:185 This is where we actually tell the kernel to initialize the Guest. We
+ * saw the arguments it expects when we looked at initialize() in lguest_user.c:
+ * the base of Guest "physical" memory, the top physical page to allow, the
+ * top level pagetable and the entry point for the Guest. */
+static int tell_kernel(unsigned long pgdir, unsigned long start)
+{
+ unsigned long args[] = { LHREQ_INITIALIZE,
+ (unsigned long)guest_base,
+ guest_limit / getpagesize(), pgdir, start };
+ int fd;
+
+ verbose("Guest: %p - %p (%#lx)\n",
+ guest_base, guest_base + guest_limit, guest_limit);
+ fd = open_or_die("/dev/lguest", O_RDWR);
+ if (write(fd, args, sizeof(args)) < 0)
+ err(1, "Writing to /dev/lguest");
+
+ /* We return the /dev/lguest file descriptor to control this Guest */
+ return fd;
+}
+/*:*/
+
+static void add_device_fd(int fd)
+{
+ FD_SET(fd, &devices.infds);
+ if (fd > devices.max_infd)
+ devices.max_infd = fd;
+}
+
+/*L:200
+ * The Waker.
+ *
+ * With console, block and network devices, we can have lots of input which we
+ * need to process. We could try to tell the kernel what file descriptors to
+ * watch, but handing a file descriptor mask through to the kernel is fairly
+ * icky.
+ *
+ * Instead, we clone off a thread which watches the file descriptors and writes
+ * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
+ * stop running the Guest. This causes the Launcher to return from the
+ * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
+ * the LHREQ_BREAK and wake us up again.
+ *
+ * This, of course, is merely a different *kind* of icky.
+ *
+ * Given my well-known antipathy to threads, I'd prefer to use processes. But
+ * it's easier to share Guest memory with threads, and trivial to share the
+ * devices.infds as the Launcher changes it.
+ */
+static int waker(void *unused)
+{
+ /* Close the write end of the pipe: only the Launcher has it open. */
+ close(waker_fds.pipe[1]);
+
+ for (;;) {
+ fd_set rfds = devices.infds;
+ unsigned long args[] = { LHREQ_BREAK, 1 };
+ unsigned int maxfd = devices.max_infd;
+
+ /* We also listen to the pipe from the Launcher. */
+ FD_SET(waker_fds.pipe[0], &rfds);
+ if (waker_fds.pipe[0] > maxfd)
+ maxfd = waker_fds.pipe[0];
+
+ /* Wait until input is ready from one of the devices. */
+ select(maxfd+1, &rfds, NULL, NULL, NULL);
+
+ /* Message from Launcher? */
+ if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
+ char c;
+ /* If this fails, then assume Launcher has exited.
+ * Don't do anything on exit: we're just a thread! */
+ if (read(waker_fds.pipe[0], &c, 1) != 1)
+ _exit(0);
+ continue;
+ }
+
+ /* Send LHREQ_BREAK command to snap the Launcher out of it. */
+ pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id);
+ }
+ return 0;
+}
+
+/* This routine just sets up a pipe to the Waker process. */
+static void setup_waker(int lguest_fd)
+{
+ /* This pipe is closed when Launcher dies, telling Waker. */
+ if (pipe(waker_fds.pipe) != 0)
+ err(1, "Creating pipe for Waker");
+
+ /* Waker also needs to know the lguest fd */
+ waker_fds.lguest_fd = lguest_fd;
+
+ if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
+ err(1, "Creating Waker");
+}
+
+/*
+ * Device Handling.
+ *
+ * When the Guest gives us a buffer, it sends an array of addresses and sizes.
+ * We need to make sure it's not trying to reach into the Launcher itself, so
+ * we have a convenient routine which checks it and exits with an error message
+ * if something funny is going on:
+ */
+static void *_check_pointer(unsigned long addr, unsigned int size,
+ unsigned int line)
+{
+ /* We have to separately check addr and addr+size, because size could
+ * be huge and addr + size might wrap around. */
+ if (addr >= guest_limit || addr + size >= guest_limit)
+ errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
+ /* We return a pointer for the caller's convenience, now we know it's
+ * safe to use. */
+ return from_guest_phys(addr);
+}
+/* A macro which transparently hands the line number to the real function. */
+#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
+
+/* Each buffer in the virtqueues is actually a chain of descriptors. This
+ * function returns the next descriptor in the chain, or vq->vring.num if we're
+ * at the end. */
+static unsigned next_desc(struct virtqueue *vq, unsigned int i)
+{
+ unsigned int next;
+
+ /* If this descriptor says it doesn't chain, we're done. */
+ if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
+ return vq->vring.num;
+
+ /* Check they're not leading us off end of descriptors. */
+ next = vq->vring.desc[i].next;
+ /* Make sure compiler knows to grab that: we don't want it changing! */
+ wmb();
+
+ if (next >= vq->vring.num)
+ errx(1, "Desc next is %u", next);
+
+ return next;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->vring.num (which
+ * is never a valid descriptor number) if none was found. */
+static unsigned get_vq_desc(struct virtqueue *vq,
+ struct iovec iov[],
+ unsigned int *out_num, unsigned int *in_num)
+{
+ unsigned int i, head;
+ u16 last_avail;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail = lg_last_avail(vq);
+ if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
+ errx(1, "Guest moved used index from %u to %u",
+ last_avail, vq->vring.avail->idx);
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->vring.avail->idx == last_avail)
+ return vq->vring.num;
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ head = vq->vring.avail->ring[last_avail % vq->vring.num];
+ lg_last_avail(vq)++;
+
+ /* If their number is silly, that's a fatal mistake. */
+ if (head >= vq->vring.num)
+ errx(1, "Guest says index %u is available", head);
+
+ /* When we start there are none of either input nor output. */
+ *out_num = *in_num = 0;
+
+ i = head;
+ do {
+ /* Grab the first descriptor, and check it's OK. */
+ iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
+ iov[*out_num + *in_num].iov_base
+ = check_pointer(vq->vring.desc[i].addr,
+ vq->vring.desc[i].len);
+ /* If this is an input descriptor, increment that count. */
+ if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+ (*in_num)++;
+ else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (*in_num)
+ errx(1, "Descriptor has out after in");
+ (*out_num)++;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (*out_num + *in_num > vq->vring.num)
+ errx(1, "Looped descriptor");
+ } while ((i = next_desc(vq, i)) != vq->vring.num);
+
+ vq->inflight++;
+ return head;
+}
+
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to send them an interrupt, using trigger_irq(). */
+static void add_used(struct virtqueue *vq, unsigned int head, int len)
+{
+ struct vring_used_elem *used;
+
+ /* The virtqueue contains a ring of used buffers. Get a pointer to the
+ * next entry in that used ring. */
+ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
+ used->id = head;
+ used->len = len;
+ /* Make sure buffer is written before we update index. */
+ wmb();
+ vq->vring.used->idx++;
+ vq->inflight--;
+}
+
+/* This actually sends the interrupt for this virtqueue */
+static void trigger_irq(int fd, struct virtqueue *vq)
+{
+ unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
+
+ /* If they don't want an interrupt, don't send one, unless empty. */
+ if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && vq->inflight)
+ return;
+
+ /* Send the Guest an interrupt tell them we used something up. */
+ if (write(fd, buf, sizeof(buf)) != 0)
+ err(1, "Triggering irq %i", vq->config.irq);
+}
+
+/* And here's the combo meal deal. Supersize me! */
+static void add_used_and_trigger(int fd, struct virtqueue *vq,
+ unsigned int head, int len)
+{
+ add_used(vq, head, len);
+ trigger_irq(fd, vq);
+}
+
+/*
+ * The Console
+ *
+ * Here is the input terminal setting we save, and the routine to restore them
+ * on exit so the user gets their terminal back. */
+static struct termios orig_term;
+static void restore_term(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
+}
+
+/* We associate some data with the console for our exit hack. */
+struct console_abort
+{
+ /* How many times have they hit ^C? */
+ int count;
+ /* When did they start? */
+ struct timeval start;
+};
+
+/* This is the routine which handles console input (ie. stdin). */
+static bool handle_console_input(int fd, struct device *dev)
+{
+ int len;
+ unsigned int head, in_num, out_num;
+ struct iovec iov[dev->vq->vring.num];
+ struct console_abort *abort = dev->priv;
+
+ /* First we need a console buffer from the Guests's input virtqueue. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+
+ /* If they're not ready for input, stop listening to this file
+ * descriptor. We'll start again once they add an input buffer. */
+ if (head == dev->vq->vring.num)
+ return false;
+
+ if (out_num)
+ errx(1, "Output buffers in console in queue?");
+
+ /* This is why we convert to iovecs: the readv() call uses them, and so
+ * it reads straight into the Guest's buffer. */
+ len = readv(dev->fd, iov, in_num);
+ if (len <= 0) {
+ /* This implies that the console is closed, is /dev/null, or
+ * something went terribly wrong. */
+ warnx("Failed to get console input, ignoring console.");
+ /* Put the input terminal back. */
+ restore_term();
+ /* Remove callback from input vq, so it doesn't restart us. */
+ dev->vq->handle_output = NULL;
+ /* Stop listening to this fd: don't call us again. */
+ return false;
+ }
+
+ /* Tell the Guest about the new input. */
+ add_used_and_trigger(fd, dev->vq, head, len);
+
+ /* Three ^C within one second? Exit.
+ *
+ * This is such a hack, but works surprisingly well. Each ^C has to be
+ * in a buffer by itself, so they can't be too fast. But we check that
+ * we get three within about a second, so they can't be too slow. */
+ if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
+ if (!abort->count++)
+ gettimeofday(&abort->start, NULL);
+ else if (abort->count == 3) {
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ if (now.tv_sec <= abort->start.tv_sec+1) {
+ unsigned long args[] = { LHREQ_BREAK, 0 };
+ /* Close the fd so Waker will know it has to
+ * exit. */
+ close(waker_fds.pipe[1]);
+ /* Just in case Waker is blocked in BREAK, send
+ * unbreak now. */
+ write(fd, args, sizeof(args));
+ exit(2);
+ }
+ abort->count = 0;
+ }
+ } else
+ /* Any other key resets the abort counter. */
+ abort->count = 0;
+
+ /* Everything went OK! */
+ return true;
+}
+
+/* Handling output for console is simple: we just get all the output buffers
+ * and write them to stdout. */
+static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
+{
+ unsigned int head, out, in;
+ int len;
+ struct iovec iov[vq->vring.num];
+
+ /* Keep getting output buffers from the Guest until we run out. */
+ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+ if (in)
+ errx(1, "Input buffers in output queue?");
+ len = writev(STDOUT_FILENO, iov, out);
+ add_used_and_trigger(fd, vq, head, len);
+ }
+}
+
+/* This is called when we no longer want to hear about Guest changes to a
+ * virtqueue. This is more efficient in high-traffic cases, but it means we
+ * have to set a timer to check if any more changes have occurred. */
+static void block_vq(struct virtqueue *vq)
+{
+ struct itimerval itm;
+
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ vq->blocked = true;
+
+ itm.it_interval.tv_sec = 0;
+ itm.it_interval.tv_usec = 0;
+ itm.it_value.tv_sec = 0;
+ itm.it_value.tv_usec = timeout_usec;
+
+ setitimer(ITIMER_REAL, &itm, NULL);
+}
+
+/*
+ * The Network
+ *
+ * Handling output for network is also simple: we get all the output buffers
+ * and write them (ignoring the first element) to this device's file descriptor
+ * (/dev/net/tun).
+ */
+static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
+{
+ unsigned int head, out, in, num = 0;
+ int len;
+ struct iovec iov[vq->vring.num];
+ static int last_timeout_num;
+
+ /* Keep getting output buffers from the Guest until we run out. */
+ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+ if (in)
+ errx(1, "Input buffers in output queue?");
+ len = writev(vq->dev->fd, iov, out);
+ if (len < 0)
+ err(1, "Writing network packet to tun");
+ add_used_and_trigger(fd, vq, head, len);
+ num++;
+ }
+
+ /* Block further kicks and set up a timer if we saw anything. */
+ if (!timeout && num)
+ block_vq(vq);
+
+ /* We never quite know how long should we wait before we check the
+ * queue again for more packets. We start at 500 microseconds, and if
+ * we get fewer packets than last time, we assume we made the timeout
+ * too small and increase it by 10 microseconds. Otherwise, we drop it
+ * by one microsecond every time. It seems to work well enough. */
+ if (timeout) {
+ if (num < last_timeout_num)
+ timeout_usec += 10;
+ else if (timeout_usec > 1)
+ timeout_usec--;
+ last_timeout_num = num;
+ }
+}
+
+/* This is where we handle a packet coming in from the tun device to our
+ * Guest. */
+static bool handle_tun_input(int fd, struct device *dev)
+{
+ unsigned int head, in_num, out_num;
+ int len;
+ struct iovec iov[dev->vq->vring.num];
+
+ /* First we need a network buffer from the Guests's recv virtqueue. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+ if (head == dev->vq->vring.num) {
+ /* Now, it's expected that if we try to send a packet too
+ * early, the Guest won't be ready yet. Wait until the device
+ * status says it's ready. */
+ /* FIXME: Actually want DRIVER_ACTIVE here. */
+
+ /* Now tell it we want to know if new things appear. */
+ dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ wmb();
+
+ /* We'll turn this back on if input buffers are registered. */
+ return false;
+ } else if (out_num)
+ errx(1, "Output buffers in network recv queue?");
+
+ /* Read the packet from the device directly into the Guest's buffer. */
+ len = readv(dev->fd, iov, in_num);
+ if (len <= 0)
+ err(1, "reading network");
+
+ /* Tell the Guest about the new packet. */
+ add_used_and_trigger(fd, dev->vq, head, len);
+
+ verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
+ ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
+ head != dev->vq->vring.num ? "sent" : "discarded");
+
+ /* All good. */
+ return true;
+}
+
+/*L:215 This is the callback attached to the network and console input
+ * virtqueues: it ensures we try again, in case we stopped console or net
+ * delivery because Guest didn't have any buffers. */
+static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
+{
+ add_device_fd(vq->dev->fd);
+ /* Snap the Waker out of its select loop. */
+ write(waker_fds.pipe[1], "", 1);
+}
+
+static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout)
+{
+ /* We don't need to know again when Guest refills receive buffer. */
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ enable_fd(fd, vq, timeout);
+}
+
+/* When the Guest tells us they updated the status field, we handle it. */
+static void update_device_status(struct device *dev)
+{
+ struct virtqueue *vq;
+
+ /* This is a reset. */
+ if (dev->desc->status == 0) {
+ verbose("Resetting device %s\n", dev->name);
+
+ /* Clear any features they've acked. */
+ memset(get_feature_bits(dev) + dev->desc->feature_len, 0,
+ dev->desc->feature_len);
+
+ /* Zero out the virtqueues. */
+ for (vq = dev->vq; vq; vq = vq->next) {
+ memset(vq->vring.desc, 0,
+ vring_size(vq->config.num, getpagesize()));
+ lg_last_avail(vq) = 0;
+ }
+ } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
+ warnx("Device %s configuration FAILED", dev->name);
+ } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ unsigned int i;
+
+ verbose("Device %s OK: offered", dev->name);
+ for (i = 0; i < dev->desc->feature_len; i++)
+ verbose(" %02x", get_feature_bits(dev)[i]);
+ verbose(", accepted");
+ for (i = 0; i < dev->desc->feature_len; i++)
+ verbose(" %02x", get_feature_bits(dev)
+ [dev->desc->feature_len+i]);
+
+ if (dev->ready)
+ dev->ready(dev);
+ }
+}
+
+/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
+static void handle_output(int fd, unsigned long addr)
+{
+ struct device *i;
+ struct virtqueue *vq;
+
+ /* Check each device and virtqueue. */
+ for (i = devices.dev; i; i = i->next) {
+ /* Notifications to device descriptors update device status. */
+ if (from_guest_phys(addr) == i->desc) {
+ update_device_status(i);
+ return;
+ }
+
+ /* Notifications to virtqueues mean output has occurred. */
+ for (vq = i->vq; vq; vq = vq->next) {
+ if (vq->config.pfn != addr/getpagesize())
+ continue;
+
+ /* Guest should acknowledge (and set features!) before
+ * using the device. */
+ if (i->desc->status == 0) {
+ warnx("%s gave early output", i->name);
+ return;
+ }
+
+ if (strcmp(vq->dev->name, "console") != 0)
+ verbose("Output to %s\n", vq->dev->name);
+ if (vq->handle_output)
+ vq->handle_output(fd, vq, false);
+ return;
+ }
+ }
+
+ /* Early console write is done using notify on a nul-terminated string
+ * in Guest memory. */
+ if (addr >= guest_limit)
+ errx(1, "Bad NOTIFY %#lx", addr);
+
+ write(STDOUT_FILENO, from_guest_phys(addr),
+ strnlen(from_guest_phys(addr), guest_limit - addr));
+}
+
+static void handle_timeout(int fd)
+{
+ char buf[32];
+ struct device *i;
+ struct virtqueue *vq;
+
+ /* Clear the pipe */
+ read(timeoutpipe[0], buf, sizeof(buf));
+
+ /* Check each device and virtqueue: flush blocked ones. */
+ for (i = devices.dev; i; i = i->next) {
+ for (vq = i->vq; vq; vq = vq->next) {
+ if (!vq->blocked)
+ continue;
+
+ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ vq->blocked = false;
+ if (vq->handle_output)
+ vq->handle_output(fd, vq, true);
+ }
+ }
+}
+
+/* This is called when the Waker wakes us up: check for incoming file
+ * descriptors. */
+static void handle_input(int fd)
+{
+ /* select() wants a zeroed timeval to mean "don't wait". */
+ struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
+
+ for (;;) {
+ struct device *i;
+ fd_set fds = devices.infds;
+ int num;
+
+ num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
+ /* Could get interrupted */
+ if (num < 0)
+ continue;
+ /* If nothing is ready, we're done. */
+ if (num == 0)
+ break;
+
+ /* Otherwise, call the device(s) which have readable file
+ * descriptors and a method of handling them. */
+ for (i = devices.dev; i; i = i->next) {
+ if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+ if (i->handle_input(fd, i))
+ continue;
+
+ /* If handle_input() returns false, it means we
+ * should no longer service it. Networking and
+ * console do this when there's no input
+ * buffers to deliver into. Console also uses
+ * it when it discovers that stdin is closed. */
+ FD_CLR(i->fd, &devices.infds);
+ }
+ }
+
+ /* Is this the timeout fd? */
+ if (FD_ISSET(timeoutpipe[0], &fds))
+ handle_timeout(fd);
+ }
+}
+
+/*L:190
+ * Device Setup
+ *
+ * All devices need a descriptor so the Guest knows it exists, and a "struct
+ * device" so the Launcher can keep track of it. We have common helper
+ * routines to allocate and manage them.
+ */
+
+/* The layout of the device page is a "struct lguest_device_desc" followed by a
+ * number of virtqueue descriptors, then two sets of feature bits, then an
+ * array of configuration bytes. This routine returns the configuration
+ * pointer. */
+static u8 *device_config(const struct device *dev)
+{
+ return (void *)(dev->desc + 1)
+ + dev->desc->num_vq * sizeof(struct lguest_vqconfig)
+ + dev->desc->feature_len * 2;
+}
+
+/* This routine allocates a new "struct lguest_device_desc" from descriptor
+ * table page just above the Guest's normal memory. It returns a pointer to
+ * that descriptor. */
+static struct lguest_device_desc *new_dev_desc(u16 type)
+{
+ struct lguest_device_desc d = { .type = type };
+ void *p;
+
+ /* Figure out where the next device config is, based on the last one. */
+ if (devices.lastdev)
+ p = device_config(devices.lastdev)
+ + devices.lastdev->desc->config_len;
+ else
+ p = devices.descpage;
+
+ /* We only have one page for all the descriptors. */
+ if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
+ errx(1, "Too many devices");
+
+ /* p might not be aligned, so we memcpy in. */
+ return memcpy(p, &d, sizeof(d));
+}
+
+/* Each device descriptor is followed by the description of its virtqueues. We
+ * specify how many descriptors the virtqueue is to have. */
+static void add_virtqueue(struct device *dev, unsigned int num_descs,
+ void (*handle_output)(int, struct virtqueue *, bool))
+{
+ unsigned int pages;
+ struct virtqueue **i, *vq = malloc(sizeof(*vq));
+ void *p;
+
+ /* First we need some memory for this virtqueue. */
+ pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1)
+ / getpagesize();
+ p = get_pages(pages);
+
+ /* Initialize the virtqueue */
+ vq->next = NULL;
+ vq->last_avail_idx = 0;
+ vq->dev = dev;
+ vq->inflight = 0;
+ vq->blocked = false;
+
+ /* Initialize the configuration. */
+ vq->config.num = num_descs;
+ vq->config.irq = devices.next_irq++;
+ vq->config.pfn = to_guest_phys(p) / getpagesize();
+
+ /* Initialize the vring. */
+ vring_init(&vq->vring, num_descs, p, getpagesize());
+
+ /* Append virtqueue to this device's descriptor. We use
+ * device_config() to get the end of the device's current virtqueues;
+ * we check that we haven't added any config or feature information
+ * yet, otherwise we'd be overwriting them. */
+ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
+ memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+ dev->desc->num_vq++;
+
+ verbose("Virtqueue page %#lx\n", to_guest_phys(p));
+
+ /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
+ * second. */
+ for (i = &dev->vq; *i; i = &(*i)->next);
+ *i = vq;
+
+ /* Set the routine to call when the Guest does something to this
+ * virtqueue. */
+ vq->handle_output = handle_output;
+
+ /* As an optimization, set the advisory "Don't Notify Me" flag if we
+ * don't have a handler */
+ if (!handle_output)
+ vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
+}
+
+/* The first half of the feature bitmask is for us to advertise features. The
+ * second half is for the Guest to accept features. */
+static void add_feature(struct device *dev, unsigned bit)
+{
+ u8 *features = get_feature_bits(dev);
+
+ /* We can't extend the feature bits once we've added config bytes */
+ if (dev->desc->feature_len <= bit / CHAR_BIT) {
+ assert(dev->desc->config_len == 0);
+ dev->desc->feature_len = (bit / CHAR_BIT) + 1;
+ }
+
+ features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
+}
+
+/* This routine sets the configuration fields for an existing device's
+ * descriptor. It only works for the last device, but that's OK because that's
+ * how we use it. */
+static void set_config(struct device *dev, unsigned len, const void *conf)
+{
+ /* Check we haven't overflowed our single page. */
+ if (device_config(dev) + len > devices.descpage + getpagesize())
+ errx(1, "Too many devices");
+
+ /* Copy in the config information, and store the length. */
+ memcpy(device_config(dev), conf, len);
+ dev->desc->config_len = len;
+}
+
+/* This routine does all the creation and setup of a new device, including
+ * calling new_dev_desc() to allocate the descriptor and device memory.
+ *
+ * See what I mean about userspace being boring? */
+static struct device *new_device(const char *name, u16 type, int fd,
+ bool (*handle_input)(int, struct device *))
+{
+ struct device *dev = malloc(sizeof(*dev));
+
+ /* Now we populate the fields one at a time. */
+ dev->fd = fd;
+ /* If we have an input handler for this file descriptor, then we add it
+ * to the device_list's fdset and maxfd. */
+ if (handle_input)
+ add_device_fd(dev->fd);
+ dev->desc = new_dev_desc(type);
+ dev->handle_input = handle_input;
+ dev->name = name;
+ dev->vq = NULL;
+ dev->ready = NULL;
+
+ /* Append to device list. Prepending to a single-linked list is
+ * easier, but the user expects the devices to be arranged on the bus
+ * in command-line order. The first network device on the command line
+ * is eth0, the first block device /dev/vda, etc. */
+ if (devices.lastdev)
+ devices.lastdev->next = dev;
+ else
+ devices.dev = dev;
+ devices.lastdev = dev;
+
+ return dev;
+}
+
+/* Our first setup routine is the console. It's a fairly simple device, but
+ * UNIX tty handling makes it uglier than it could be. */
+static void setup_console(void)
+{
+ struct device *dev;
+
+ /* If we can save the initial standard input settings... */
+ if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
+ struct termios term = orig_term;
+ /* Then we turn off echo, line buffering and ^C etc. We want a
+ * raw input stream to the Guest. */
+ term.c_lflag &= ~(ISIG|ICANON|ECHO);
+ tcsetattr(STDIN_FILENO, TCSANOW, &term);
+ /* If we exit gracefully, the original settings will be
+ * restored so the user can see what they're typing. */
+ atexit(restore_term);
+ }
+
+ dev = new_device("console", VIRTIO_ID_CONSOLE,
+ STDIN_FILENO, handle_console_input);
+ /* We store the console state in dev->priv, and initialize it. */
+ dev->priv = malloc(sizeof(struct console_abort));
+ ((struct console_abort *)dev->priv)->count = 0;
+
+ /* The console needs two virtqueues: the input then the output. When
+ * they put something the input queue, we make sure we're listening to
+ * stdin. When they put something in the output queue, we write it to
+ * stdout. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
+
+ verbose("device %u: console\n", devices.device_num++);
+}
+/*:*/
+
+static void timeout_alarm(int sig)
+{
+ write(timeoutpipe[1], "", 1);
+}
+
+static void setup_timeout(void)
+{
+ if (pipe(timeoutpipe) != 0)
+ err(1, "Creating timeout pipe");
+
+ if (fcntl(timeoutpipe[1], F_SETFL,
+ fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
+ err(1, "Making timeout pipe nonblocking");
+
+ add_device_fd(timeoutpipe[0]);
+ signal(SIGALRM, timeout_alarm);
+}
+
+/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
+ * --sharenet=<name> option which opens or creates a named pipe. This can be
+ * used to send packets to another guest in a 1:1 manner.
+ *
+ * More sopisticated is to use one of the tools developed for project like UML
+ * to do networking.
+ *
+ * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
+ * completely generic ("here's my vring, attach to your vring") and would work
+ * for any traffic. Of course, namespace and permissions issues need to be
+ * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
+ * multiple inter-guest channels behind one interface, although it would
+ * require some manner of hotplugging new virtio channels.
+ *
+ * Finally, we could implement a virtio network switch in the kernel. :*/
+
+static u32 str2ip(const char *ipaddr)
+{
+ unsigned int b[4];
+
+ if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
+ errx(1, "Failed to parse IP address '%s'", ipaddr);
+ return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
+}
+
+static void str2mac(const char *macaddr, unsigned char mac[6])
+{
+ unsigned int m[6];
+ if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
+ &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
+ errx(1, "Failed to parse mac address '%s'", macaddr);
+ mac[0] = m[0];
+ mac[1] = m[1];
+ mac[2] = m[2];
+ mac[3] = m[3];
+ mac[4] = m[4];
+ mac[5] = m[5];
+}
+
+/* This code is "adapted" from libbridge: it attaches the Host end of the
+ * network device to the bridge device specified by the command line.
+ *
+ * This is yet another James Morris contribution (I'm an IP-level guy, so I
+ * dislike bridging), and I just try not to break it. */
+static void add_to_bridge(int fd, const char *if_name, const char *br_name)
+{
+ int ifidx;
+ struct ifreq ifr;
+
+ if (!*br_name)
+ errx(1, "must specify bridge name");
+
+ ifidx = if_nametoindex(if_name);
+ if (!ifidx)
+ errx(1, "interface %s does not exist!", if_name);
+
+ strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+ ifr.ifr_ifindex = ifidx;
+ if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
+ err(1, "can't add %s to bridge %s", if_name, br_name);
+}
+
+/* This sets up the Host end of the network device with an IP address, brings
+ * it up so packets will flow, the copies the MAC address into the hwaddr
+ * pointer. */
+static void configure_device(int fd, const char *tapif, u32 ipaddr)
+{
+ struct ifreq ifr;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+
+ memset(&ifr, 0, sizeof(ifr));
+ strcpy(ifr.ifr_name, tapif);
+
+ /* Don't read these incantations. Just cut & paste them like I did! */
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = htonl(ipaddr);
+ if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
+ err(1, "Setting %s interface address", tapif);
+ ifr.ifr_flags = IFF_UP;
+ if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
+ err(1, "Bringing interface %s up", tapif);
+}
+
+static int get_tun_device(char tapif[IFNAMSIZ])
+{
+ struct ifreq ifr;
+ int netfd;
+
+ /* Start with this zeroed. Messy but sure. */
+ memset(&ifr, 0, sizeof(ifr));
+
+ /* We open the /dev/net/tun device and tell it we want a tap device. A
+ * tap device is like a tun device, only somehow different. To tell
+ * the truth, I completely blundered my way through this code, but it
+ * works now! */
+ netfd = open_or_die("/dev/net/tun", O_RDWR);
+ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+ strcpy(ifr.ifr_name, "tap%d");
+ if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
+ err(1, "configuring /dev/net/tun");
+
+ if (ioctl(netfd, TUNSETOFFLOAD,
+ TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
+ err(1, "Could not set features for tun device");
+
+ /* We don't need checksums calculated for packets coming in this
+ * device: trust us! */
+ ioctl(netfd, TUNSETNOCSUM, 1);
+
+ memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
+ return netfd;
+}
+
+/*L:195 Our network is a Host<->Guest network. This can either use bridging or
+ * routing, but the principle is the same: it uses the "tun" device to inject
+ * packets into the Host as if they came in from a normal network card. We
+ * just shunt packets between the Guest and the tun device. */
+static void setup_tun_net(char *arg)
+{
+ struct device *dev;
+ int netfd, ipfd;
+ u32 ip = INADDR_ANY;
+ bool bridging = false;
+ char tapif[IFNAMSIZ], *p;
+ struct virtio_net_config conf;
+
+ netfd = get_tun_device(tapif);
+
+ /* First we create a new network device. */
+ dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+
+ /* Network devices need a receive and a send queue, just like
+ * console. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+
+ /* We need a socket to perform the magic network ioctls to bring up the
+ * tap interface, connect to the bridge etc. Any socket will do! */
+ ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+ if (ipfd < 0)
+ err(1, "opening IP socket");
+
+ /* If the command line was --tunnet=bridge:<name> do bridging. */
+ if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
+ arg += strlen(BRIDGE_PFX);
+ bridging = true;
+ }
+
+ /* A mac address may follow the bridge name or IP address */
+ p = strchr(arg, ':');
+ if (p) {
+ str2mac(p+1, conf.mac);
+ add_feature(dev, VIRTIO_NET_F_MAC);
+ *p = '\0';
+ }
+
+ /* arg is now either an IP address or a bridge name */
+ if (bridging)
+ add_to_bridge(ipfd, tapif, arg);
+ else
+ ip = str2ip(arg);
+
+ /* Set up the tun device. */
+ configure_device(ipfd, tapif, ip);
+
+ add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY);
+ /* Expect Guest to handle everything except UFO */
+ add_feature(dev, VIRTIO_NET_F_CSUM);
+ add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
+ add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
+ add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
+ add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
+ add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
+ add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
+ add_feature(dev, VIRTIO_NET_F_HOST_ECN);
+ set_config(dev, sizeof(conf), &conf);
+
+ /* We don't need the socket any more; setup is done. */
+ close(ipfd);
+
+ devices.device_num++;
+
+ if (bridging)
+ verbose("device %u: tun %s attached to bridge: %s\n",
+ devices.device_num, tapif, arg);
+ else
+ verbose("device %u: tun %s: %s\n",
+ devices.device_num, tapif, arg);
+}
+
+/* Our block (disk) device should be really simple: the Guest asks for a block
+ * number and we read or write that position in the file. Unfortunately, that
+ * was amazingly slow: the Guest waits until the read is finished before
+ * running anything else, even if it could have been doing useful work.
+ *
+ * We could use async I/O, except it's reputed to suck so hard that characters
+ * actually go missing from your code when you try to use it.
+ *
+ * So we farm the I/O out to thread, and communicate with it via a pipe. */
+
+/* This hangs off device->priv. */
+struct vblk_info
+{
+ /* The size of the file. */
+ off64_t len;
+
+ /* The file descriptor for the file. */
+ int fd;
+
+ /* IO thread listens on this file descriptor [0]. */
+ int workpipe[2];
+
+ /* IO thread writes to this file descriptor to mark it done, then
+ * Launcher triggers interrupt to Guest. */
+ int done_fd;
+};
+
+/*L:210
+ * The Disk
+ *
+ * Remember that the block device is handled by a separate I/O thread. We head
+ * straight into the core of that thread here:
+ */
+static bool service_io(struct device *dev)
+{
+ struct vblk_info *vblk = dev->priv;
+ unsigned int head, out_num, in_num, wlen;
+ int ret;
+ u8 *in;
+ struct virtio_blk_outhdr *out;
+ struct iovec iov[dev->vq->vring.num];
+ off64_t off;
+
+ /* See if there's a request waiting. If not, nothing to do. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+ if (head == dev->vq->vring.num)
+ return false;
+
+ /* Every block request should contain at least one output buffer
+ * (detailing the location on disk and the type of request) and one
+ * input buffer (to hold the result). */
+ if (out_num == 0 || in_num == 0)
+ errx(1, "Bad virtblk cmd %u out=%u in=%u",
+ head, out_num, in_num);
+
+ out = convert(&iov[0], struct virtio_blk_outhdr);
+ in = convert(&iov[out_num+in_num-1], u8);
+ off = out->sector * 512;
+
+ /* The block device implements "barriers", where the Guest indicates
+ * that it wants all previous writes to occur before this write. We
+ * don't have a way of asking our kernel to do a barrier, so we just
+ * synchronize all the data in the file. Pretty poor, no? */
+ if (out->type & VIRTIO_BLK_T_BARRIER)
+ fdatasync(vblk->fd);
+
+ /* In general the virtio block driver is allowed to try SCSI commands.
+ * It'd be nice if we supported eject, for example, but we don't. */
+ if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
+ fprintf(stderr, "Scsi commands unsupported\n");
+ *in = VIRTIO_BLK_S_UNSUPP;
+ wlen = sizeof(*in);
+ } else if (out->type & VIRTIO_BLK_T_OUT) {
+ /* Write */
+
+ /* Move to the right location in the block file. This can fail
+ * if they try to write past end. */
+ if (lseek64(vblk->fd, off, SEEK_SET) != off)
+ err(1, "Bad seek to sector %llu", out->sector);
+
+ ret = writev(vblk->fd, iov+1, out_num-1);
+ verbose("WRITE to sector %llu: %i\n", out->sector, ret);
+
+ /* Grr... Now we know how long the descriptor they sent was, we
+ * make sure they didn't try to write over the end of the block
+ * file (possibly extending it). */
+ if (ret > 0 && off + ret > vblk->len) {
+ /* Trim it back to the correct length */
+ ftruncate64(vblk->fd, vblk->len);
+ /* Die, bad Guest, die. */
+ errx(1, "Write past end %llu+%u", off, ret);
+ }
+ wlen = sizeof(*in);
+ *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+ } else {
+ /* Read */
+
+ /* Move to the right location in the block file. This can fail
+ * if they try to read past end. */
+ if (lseek64(vblk->fd, off, SEEK_SET) != off)
+ err(1, "Bad seek to sector %llu", out->sector);
+
+ ret = readv(vblk->fd, iov+1, in_num-1);
+ verbose("READ from sector %llu: %i\n", out->sector, ret);
+ if (ret >= 0) {
+ wlen = sizeof(*in) + ret;
+ *in = VIRTIO_BLK_S_OK;
+ } else {
+ wlen = sizeof(*in);
+ *in = VIRTIO_BLK_S_IOERR;
+ }
+ }
+
+ /* We can't trigger an IRQ, because we're not the Launcher. It does
+ * that when we tell it we're done. */
+ add_used(dev->vq, head, wlen);
+ return true;
+}
+
+/* This is the thread which actually services the I/O. */
+static int io_thread(void *_dev)
+{
+ struct device *dev = _dev;
+ struct vblk_info *vblk = dev->priv;
+ char c;
+
+ /* Close other side of workpipe so we get 0 read when main dies. */
+ close(vblk->workpipe[1]);
+ /* Close the other side of the done_fd pipe. */
+ close(dev->fd);
+
+ /* When this read fails, it means Launcher died, so we follow. */
+ while (read(vblk->workpipe[0], &c, 1) == 1) {
+ /* We acknowledge each request immediately to reduce latency,
+ * rather than waiting until we've done them all. I haven't
+ * measured to see if it makes any difference.
+ *
+ * That would be an interesting test, wouldn't it? You could
+ * also try having more than one I/O thread. */
+ while (service_io(dev))
+ write(vblk->done_fd, &c, 1);
+ }
+ return 0;
+}
+
+/* Now we've seen the I/O thread, we return to the Launcher to see what happens
+ * when that thread tells us it's completed some I/O. */
+static bool handle_io_finish(int fd, struct device *dev)
+{
+ char c;
+
+ /* If the I/O thread died, presumably it printed the error, so we
+ * simply exit. */
+ if (read(dev->fd, &c, 1) != 1)
+ exit(1);
+
+ /* It did some work, so trigger the irq. */
+ trigger_irq(fd, dev->vq);
+ return true;
+}
+
+/* When the Guest submits some I/O, we just need to wake the I/O thread. */
+static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
+{
+ struct vblk_info *vblk = vq->dev->priv;
+ char c = 0;
+
+ /* Wake up I/O thread and tell it to go to work! */
+ if (write(vblk->workpipe[1], &c, 1) != 1)
+ /* Presumably it indicated why it died. */
+ exit(1);
+}
+
+/*L:198 This actually sets up a virtual block device. */
+static void setup_block_file(const char *filename)
+{
+ int p[2];
+ struct device *dev;
+ struct vblk_info *vblk;
+ void *stack;
+ struct virtio_blk_config conf;
+
+ /* This is the pipe the I/O thread will use to tell us I/O is done. */
+ pipe(p);
+
+ /* The device responds to return from I/O thread. */
+ dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
+
+ /* The device has one virtqueue, where the Guest places requests. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
+
+ /* Allocate the room for our own bookkeeping */
+ vblk = dev->priv = malloc(sizeof(*vblk));
+
+ /* First we open the file and store the length. */
+ vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
+ vblk->len = lseek64(vblk->fd, 0, SEEK_END);
+
+ /* We support barriers. */
+ add_feature(dev, VIRTIO_BLK_F_BARRIER);
+
+ /* Tell Guest how many sectors this device has. */
+ conf.capacity = cpu_to_le64(vblk->len / 512);
+
+ /* Tell Guest not to put in too many descriptors at once: two are used
+ * for the in and out elements. */
+ add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
+ conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
+
+ set_config(dev, sizeof(conf), &conf);
+
+ /* The I/O thread writes to this end of the pipe when done. */
+ vblk->done_fd = p[1];
+
+ /* This is the second pipe, which is how we tell the I/O thread about
+ * more work. */
+ pipe(vblk->workpipe);
+
+ /* Create stack for thread and run it. Since stack grows upwards, we
+ * point the stack pointer to the end of this region. */
+ stack = malloc(32768);
+ /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
+ * becoming a zombie. */
+ if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
+ err(1, "Creating clone");
+
+ /* We don't need to keep the I/O thread's end of the pipes open. */
+ close(vblk->done_fd);
+ close(vblk->workpipe[0]);
+
+ verbose("device %u: virtblock %llu sectors\n",
+ devices.device_num, le64_to_cpu(conf.capacity));
+}
+
+/* Our random number generator device reads from /dev/random into the Guest's
+ * input buffers. The usual case is that the Guest doesn't want random numbers
+ * and so has no buffers although /dev/random is still readable, whereas
+ * console is the reverse.
+ *
+ * The same logic applies, however. */
+static bool handle_rng_input(int fd, struct device *dev)
+{
+ int len;
+ unsigned int head, in_num, out_num, totlen = 0;
+ struct iovec iov[dev->vq->vring.num];
+
+ /* First we need a buffer from the Guests's virtqueue. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+
+ /* If they're not ready for input, stop listening to this file
+ * descriptor. We'll start again once they add an input buffer. */
+ if (head == dev->vq->vring.num)
+ return false;
+
+ if (out_num)
+ errx(1, "Output buffers in rng?");
+
+ /* This is why we convert to iovecs: the readv() call uses them, and so
+ * it reads straight into the Guest's buffer. We loop to make sure we
+ * fill it. */
+ while (!iov_empty(iov, in_num)) {
+ len = readv(dev->fd, iov, in_num);
+ if (len <= 0)
+ err(1, "Read from /dev/random gave %i", len);
+ iov_consume(iov, in_num, len);
+ totlen += len;
+ }
+
+ /* Tell the Guest about the new input. */
+ add_used_and_trigger(fd, dev->vq, head, totlen);
+
+ /* Everything went OK! */
+ return true;
+}
+
+/* And this creates a "hardware" random number device for the Guest. */
+static void setup_rng(void)
+{
+ struct device *dev;
+ int fd;
+
+ fd = open_or_die("/dev/random", O_RDONLY);
+
+ /* The device responds to return from I/O thread. */
+ dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input);
+
+ /* The device has one virtqueue, where the Guest places inbufs. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+
+ verbose("device %u: rng\n", devices.device_num++);
+}
+/* That's the end of device setup. */
+
+/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
+static void __attribute__((noreturn)) restart_guest(void)
+{
+ unsigned int i;
+
+ /* Since we don't track all open fds, we simply close everything beyond
+ * stderr. */
+ for (i = 3; i < FD_SETSIZE; i++)
+ close(i);
+
+ /* The exec automatically gets rid of the I/O and Waker threads. */
+ execv(main_args[0], main_args);
+ err(1, "Could not exec %s", main_args[0]);
+}
+
+/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
+ * its input and output, and finally, lays it to rest. */
+static void __attribute__((noreturn)) run_guest(int lguest_fd)
+{
+ for (;;) {
+ unsigned long args[] = { LHREQ_BREAK, 0 };
+ unsigned long notify_addr;
+ int readval;
+
+ /* We read from the /dev/lguest device to run the Guest. */
+ readval = pread(lguest_fd, &notify_addr,
+ sizeof(notify_addr), cpu_id);
+
+ /* One unsigned long means the Guest did HCALL_NOTIFY */
+ if (readval == sizeof(notify_addr)) {
+ verbose("Notify on address %#lx\n", notify_addr);
+ handle_output(lguest_fd, notify_addr);
+ continue;
+ /* ENOENT means the Guest died. Reading tells us why. */
+ } else if (errno == ENOENT) {
+ char reason[1024] = { 0 };
+ pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
+ errx(1, "%s", reason);
+ /* ERESTART means that we need to reboot the guest */
+ } else if (errno == ERESTART) {
+ restart_guest();
+ /* EAGAIN means a signal (timeout).
+ * Anything else means a bug or incompatible change. */
+ } else if (errno != EAGAIN)
+ err(1, "Running guest failed");
+
+ /* Only service input on thread for CPU 0. */
+ if (cpu_id != 0)
+ continue;
+
+ /* Service input, then unset the BREAK to release the Waker. */
+ handle_input(lguest_fd);
+ if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
+ err(1, "Resetting break");
+ }
+}
+/*L:240
+ * This is the end of the Launcher. The good news: we are over halfway
+ * through! The bad news: the most fiendish part of the code still lies ahead
+ * of us.
+ *
+ * Are you ready? Take a deep breath and join me in the core of the Host, in
+ * "make Host".
+ :*/
+
+static struct option opts[] = {
+ { "verbose", 0, NULL, 'v' },
+ { "tunnet", 1, NULL, 't' },
+ { "block", 1, NULL, 'b' },
+ { "rng", 0, NULL, 'r' },
+ { "initrd", 1, NULL, 'i' },
+ { NULL },
+};
+static void usage(void)
+{
+ errx(1, "Usage: lguest [--verbose] "
+ "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
+ "|--block=<filename>|--initrd=<filename>]...\n"
+ "<mem-in-mb> vmlinux [args...]");
+}
+
+/*L:105 The main routine is where the real work begins: */
+int main(int argc, char *argv[])
+{
+ /* Memory, top-level pagetable, code startpoint and size of the
+ * (optional) initrd. */
+ unsigned long mem = 0, pgdir, start, initrd_size = 0;
+ /* Two temporaries and the /dev/lguest file descriptor. */
+ int i, c, lguest_fd;
+ /* The boot information for the Guest. */
+ struct boot_params *boot;
+ /* If they specify an initrd file to load. */
+ const char *initrd_name = NULL;
+
+ /* Save the args: we "reboot" by execing ourselves again. */
+ main_args = argv;
+ /* We don't "wait" for the children, so prevent them from becoming
+ * zombies. */
+ signal(SIGCHLD, SIG_IGN);
+
+ /* First we initialize the device list. Since console and network
+ * device receive input from a file descriptor, we keep an fdset
+ * (infds) and the maximum fd number (max_infd) with the head of the
+ * list. We also keep a pointer to the last device. Finally, we keep
+ * the next interrupt number to use for devices (1: remember that 0 is
+ * used by the timer). */
+ FD_ZERO(&devices.infds);
+ devices.max_infd = -1;
+ devices.lastdev = NULL;
+ devices.next_irq = 1;
+
+ cpu_id = 0;
+ /* We need to know how much memory so we can set up the device
+ * descriptor and memory pages for the devices as we parse the command
+ * line. So we quickly look through the arguments to find the amount
+ * of memory now. */
+ for (i = 1; i < argc; i++) {
+ if (argv[i][0] != '-') {
+ mem = atoi(argv[i]) * 1024 * 1024;
+ /* We start by mapping anonymous pages over all of
+ * guest-physical memory range. This fills it with 0,
+ * and ensures that the Guest won't be killed when it
+ * tries to access it. */
+ guest_base = map_zeroed_pages(mem / getpagesize()
+ + DEVICE_PAGES);
+ guest_limit = mem;
+ guest_max = mem + DEVICE_PAGES*getpagesize();
+ devices.descpage = get_pages(1);
+ break;
+ }
+ }
+
+ /* The options are fairly straight-forward */
+ while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
+ switch (c) {
+ case 'v':
+ verbose = true;
+ break;
+ case 't':
+ setup_tun_net(optarg);
+ break;
+ case 'b':
+ setup_block_file(optarg);
+ break;
+ case 'r':
+ setup_rng();
+ break;
+ case 'i':
+ initrd_name = optarg;
+ break;
+ default:
+ warnx("Unknown argument %s", argv[optind]);
+ usage();
+ }
+ }
+ /* After the other arguments we expect memory and kernel image name,
+ * followed by command line arguments for the kernel. */
+ if (optind + 2 > argc)
+ usage();
+
+ verbose("Guest base is at %p\n", guest_base);
+
+ /* We always have a console device */
+ setup_console();
+
+ /* We can timeout waiting for Guest network transmit. */
+ setup_timeout();
+
+ /* Now we load the kernel */
+ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
+
+ /* Boot information is stashed at physical address 0 */
+ boot = from_guest_phys(0);
+
+ /* Map the initrd image if requested (at top of physical memory) */
+ if (initrd_name) {
+ initrd_size = load_initrd(initrd_name, mem);
+ /* These are the location in the Linux boot header where the
+ * start and size of the initrd are expected to be found. */
+ boot->hdr.ramdisk_image = mem - initrd_size;
+ boot->hdr.ramdisk_size = initrd_size;
+ /* The bootloader type 0xFF means "unknown"; that's OK. */
+ boot->hdr.type_of_loader = 0xFF;
+ }
+
+ /* Set up the initial linear pagetables, starting below the initrd. */
+ pgdir = setup_pagetables(mem, initrd_size);
+
+ /* The Linux boot header contains an "E820" memory map: ours is a
+ * simple, single region. */
+ boot->e820_entries = 1;
+ boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
+ /* The boot header contains a command line pointer: we put the command
+ * line after the boot header. */
+ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
+ /* We use a simple helper to copy the arguments separated by spaces. */
+ concat((char *)(boot + 1), argv+optind+2);
+
+ /* Boot protocol version: 2.07 supports the fields for lguest. */
+ boot->hdr.version = 0x207;
+
+ /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
+ boot->hdr.hardware_subarch = 1;
+
+ /* Tell the entry path not to try to reload segment registers. */
+ boot->hdr.loadflags |= KEEP_SEGMENTS;
+
+ /* We tell the kernel to initialize the Guest: this returns the open
+ * /dev/lguest file descriptor. */
+ lguest_fd = tell_kernel(pgdir, start);
+
+ /* We clone off a thread, which wakes the Launcher whenever one of the
+ * input file descriptors needs attention. We call this the Waker, and
+ * we'll cover it in a moment. */
+ setup_waker(lguest_fd);
+
+ /* Finally, run the Guest. This doesn't return. */
+ run_guest(lguest_fd);
+}
+/*:*/
+
+/*M:999
+ * Mastery is done: you now know everything I do.
+ *
+ * But surely you have seen code, features and bugs in your wanderings which
+ * you now yearn to attack? That is the real game, and I look forward to you
+ * patching and forking lguest into the Your-Name-Here-visor.
+ *
+ * Farewell, and good coding!
+ * Rusty Russell.
+ */
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
new file mode 100644
index 0000000..29510dc
--- /dev/null
+++ b/Documentation/lguest/lguest.txt
@@ -0,0 +1,120 @@
+ __
+ (___()'`; Rusty's Remarkably Unreliable Guide to Lguest
+ /, /` - or, A Young Coder's Illustrated Hypervisor
+ \\"--\\ http://lguest.ozlabs.org
+
+Lguest is designed to be a minimal hypervisor for the Linux kernel, for
+Linux developers and users to experiment with virtualization with the
+minimum of complexity. Nonetheless, it should have sufficient
+features to make it useful for specific tasks, and, of course, you are
+encouraged to fork and enhance it (see drivers/lguest/README).
+
+Features:
+
+- Kernel module which runs in a normal kernel.
+- Simple I/O model for communication.
+- Simple program to create new guests.
+- Logo contains cute puppies: http://lguest.ozlabs.org
+
+Developer features:
+
+- Fun to hack on.
+- No ABI: being tied to a specific kernel anyway, you can change anything.
+- Many opportunities for improvement or feature implementation.
+
+Running Lguest:
+
+- The easiest way to run lguest is to use same kernel as guest and host.
+ You can configure them differently, but usually it's easiest not to.
+
+ You will need to configure your kernel with the following options:
+
+ "General setup":
+ "Prompt for development and/or incomplete code/drivers" = Y
+ (CONFIG_EXPERIMENTAL=y)
+
+ "Processor type and features":
+ "Paravirtualized guest support" = Y
+ "Lguest guest support" = Y
+ "High Memory Support" = off/4GB
+ "Alignment value to which kernel should be aligned" = 0x100000
+ (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
+ CONFIG_PHYSICAL_ALIGN=0x100000)
+
+ "Device Drivers":
+ "Block devices"
+ "Virtio block driver (EXPERIMENTAL)" = M/Y
+ "Network device support"
+ "Universal TUN/TAP device driver support" = M/Y
+ "Virtio network driver (EXPERIMENTAL)" = M/Y
+ (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
+
+ "Virtualization"
+ "Linux hypervisor example code" = M/Y
+ (CONFIG_LGUEST=m)
+
+- A tool called "lguest" is available in this directory: type "make"
+ to build it. If you didn't build your kernel in-tree, use "make
+ O=<builddir>".
+
+- Create or find a root disk image. There are several useful ones
+ around, such as the xm-test tiny root image at
+ http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
+
+ For more serious work, I usually use a distribution ISO image and
+ install it under qemu, then make multiple copies:
+
+ dd if=/dev/zero of=rootfile bs=1M count=2048
+ qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
+
+ Make sure that you install a getty on /dev/hvc0 if you want to log in on the
+ console!
+
+- "modprobe lg" if you built it as a module.
+
+- Run an lguest as root:
+
+ Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
+
+ Explanation:
+ 64: the amount of memory to use, in MB.
+
+ vmlinux: the kernel image found in the top of your build directory. You
+ can also use a standard bzImage.
+
+ --tunnet=192.168.19.1: configures a "tap" device for networking with this
+ IP address.
+
+ --block=rootfile: a file or block device which becomes /dev/vda
+ inside the guest.
+
+ root=/dev/vda: this (and anything else on the command line) are
+ kernel boot parameters.
+
+- Configuring networking. I usually have the host masquerade, using
+ "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
+ /proc/sys/net/ipv4/ip_forward". In this example, I would configure
+ eth0 inside the guest at 192.168.19.2.
+
+ Another method is to bridge the tap device to an external interface
+ using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
+ to obtain an IP address. The bridge needs to be configured first:
+ this option simply adds the tap interface to it.
+
+ A simple example on my system:
+
+ ifconfig eth0 0.0.0.0
+ brctl addbr lg0
+ ifconfig lg0 up
+ brctl addif lg0 eth0
+ dhclient lg0
+
+ Then use --tunnet=bridge:lg0 when launching the guest.
+
+ See http://linux-net.osdl.org/index.php/Bridge for general information
+ on how to get bridging working.
+
+There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
+
+Good luck!
+Rusty Russell rusty@rustcorp.com.au.
diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt
new file mode 100644
index 0000000..23045b8
--- /dev/null
+++ b/Documentation/local_ops.txt
@@ -0,0 +1,186 @@
+ Semantics and Behavior of Local Atomic Operations
+
+ Mathieu Desnoyers
+
+
+ This document explains the purpose of the local atomic operations, how
+to implement them for any given architecture and shows how they can be used
+properly. It also stresses on the precautions that must be taken when reading
+those local variables across CPUs when the order of memory writes matters.
+
+
+
+* Purpose of local atomic operations
+
+Local atomic operations are meant to provide fast and highly reentrant per CPU
+counters. They minimize the performance cost of standard atomic operations by
+removing the LOCK prefix and memory barriers normally required to synchronize
+across CPUs.
+
+Having fast per CPU atomic counters is interesting in many cases : it does not
+require disabling interrupts to protect from interrupt handlers and it permits
+coherent counters in NMI handlers. It is especially useful for tracing purposes
+and for various performance monitoring counters.
+
+Local atomic operations only guarantee variable modification atomicity wrt the
+CPU which owns the data. Therefore, care must taken to make sure that only one
+CPU writes to the local_t data. This is done by using per cpu data and making
+sure that we modify it from within a preemption safe context. It is however
+permitted to read local_t data from any CPU : it will then appear to be written
+out of order wrt other memory writes by the owner CPU.
+
+
+* Implementation for a given architecture
+
+It can be done by slightly modifying the standard atomic operations : only
+their UP variant must be kept. It typically means removing LOCK prefix (on
+i386 and x86_64) and any SMP sychronization barrier. If the architecture does
+not have a different behavior between SMP and UP, including asm-generic/local.h
+in your architecture's local.h is sufficient.
+
+The local_t type is defined as an opaque signed long by embedding an
+atomic_long_t inside a structure. This is made so a cast from this type to a
+long fails. The definition looks like :
+
+typedef struct { atomic_long_t a; } local_t;
+
+
+* Rules to follow when using local atomic operations
+
+- Variables touched by local ops must be per cpu variables.
+- _Only_ the CPU owner of these variables must write to them.
+- This CPU can use local ops from any context (process, irq, softirq, nmi, ...)
+ to update its local_t variables.
+- Preemption (or interrupts) must be disabled when using local ops in
+ process context to make sure the process won't be migrated to a
+ different CPU between getting the per-cpu variable and doing the
+ actual local op.
+- When using local ops in interrupt context, no special care must be
+ taken on a mainline kernel, since they will run on the local CPU with
+ preemption already disabled. I suggest, however, to explicitly
+ disable preemption anyway to make sure it will still work correctly on
+ -rt kernels.
+- Reading the local cpu variable will provide the current copy of the
+ variable.
+- Reads of these variables can be done from any CPU, because updates to
+ "long", aligned, variables are always atomic. Since no memory
+ synchronization is done by the writer CPU, an outdated copy of the
+ variable can be read when reading some _other_ cpu's variables.
+
+
+* How to use local atomic operations
+
+#include <linux/percpu.h>
+#include <asm/local.h>
+
+static DEFINE_PER_CPU(local_t, counters) = LOCAL_INIT(0);
+
+
+* Counting
+
+Counting is done on all the bits of a signed long.
+
+In preemptible context, use get_cpu_var() and put_cpu_var() around local atomic
+operations : it makes sure that preemption is disabled around write access to
+the per cpu variable. For instance :
+
+ local_inc(&get_cpu_var(counters));
+ put_cpu_var(counters);
+
+If you are already in a preemption-safe context, you can directly use
+__get_cpu_var() instead.
+
+ local_inc(&__get_cpu_var(counters));
+
+
+
+* Reading the counters
+
+Those local counters can be read from foreign CPUs to sum the count. Note that
+the data seen by local_read across CPUs must be considered to be out of order
+relatively to other memory writes happening on the CPU that owns the data.
+
+ long sum = 0;
+ for_each_online_cpu(cpu)
+ sum += local_read(&per_cpu(counters, cpu));
+
+If you want to use a remote local_read to synchronize access to a resource
+between CPUs, explicit smp_wmb() and smp_rmb() memory barriers must be used
+respectively on the writer and the reader CPUs. It would be the case if you use
+the local_t variable as a counter of bytes written in a buffer : there should
+be a smp_wmb() between the buffer write and the counter increment and also a
+smp_rmb() between the counter read and the buffer read.
+
+
+Here is a sample module which implements a basic per cpu counter using local.h.
+
+--- BEGIN ---
+/* test-local.c
+ *
+ * Sample module for local.h usage.
+ */
+
+
+#include <asm/local.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+
+static DEFINE_PER_CPU(local_t, counters) = LOCAL_INIT(0);
+
+static struct timer_list test_timer;
+
+/* IPI called on each CPU. */
+static void test_each(void *info)
+{
+ /* Increment the counter from a non preemptible context */
+ printk("Increment on cpu %d\n", smp_processor_id());
+ local_inc(&__get_cpu_var(counters));
+
+ /* This is what incrementing the variable would look like within a
+ * preemptible context (it disables preemption) :
+ *
+ * local_inc(&get_cpu_var(counters));
+ * put_cpu_var(counters);
+ */
+}
+
+static void do_test_timer(unsigned long data)
+{
+ int cpu;
+
+ /* Increment the counters */
+ on_each_cpu(test_each, NULL, 1);
+ /* Read all the counters */
+ printk("Counters read from CPU %d\n", smp_processor_id());
+ for_each_online_cpu(cpu) {
+ printk("Read : CPU %d, count %ld\n", cpu,
+ local_read(&per_cpu(counters, cpu)));
+ }
+ del_timer(&test_timer);
+ test_timer.expires = jiffies + 1000;
+ add_timer(&test_timer);
+}
+
+static int __init test_init(void)
+{
+ /* initialize the timer that will increment the counter */
+ init_timer(&test_timer);
+ test_timer.function = do_test_timer;
+ test_timer.expires = jiffies + 1;
+ add_timer(&test_timer);
+
+ return 0;
+}
+
+static void __exit test_exit(void)
+{
+ del_timer_sync(&test_timer);
+}
+
+module_init(test_init);
+module_exit(test_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Local Atomic Ops");
+--- END ---
diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
new file mode 100644
index 0000000..4887730
--- /dev/null
+++ b/Documentation/lockdep-design.txt
@@ -0,0 +1,219 @@
+Runtime locking correctness validator
+=====================================
+
+started by Ingo Molnar <mingo@redhat.com>
+additions by Arjan van de Ven <arjan@linux.intel.com>
+
+Lock-class
+----------
+
+The basic object the validator operates upon is a 'class' of locks.
+
+A class of locks is a group of locks that are logically the same with
+respect to locking rules, even if the locks may have multiple (possibly
+tens of thousands of) instantiations. For example a lock in the inode
+struct is one class, while each inode has its own instantiation of that
+lock class.
+
+The validator tracks the 'state' of lock-classes, and it tracks
+dependencies between different lock-classes. The validator maintains a
+rolling proof that the state and the dependencies are correct.
+
+Unlike an lock instantiation, the lock-class itself never goes away: when
+a lock-class is used for the first time after bootup it gets registered,
+and all subsequent uses of that lock-class will be attached to this
+lock-class.
+
+State
+-----
+
+The validator tracks lock-class usage history into 5 separate state bits:
+
+- 'ever held in hardirq context' [ == hardirq-safe ]
+- 'ever held in softirq context' [ == softirq-safe ]
+- 'ever held with hardirqs enabled' [ == hardirq-unsafe ]
+- 'ever held with softirqs and hardirqs enabled' [ == softirq-unsafe ]
+
+- 'ever used' [ == !unused ]
+
+When locking rules are violated, these 4 state bits are presented in the
+locking error messages, inside curlies. A contrived example:
+
+ modprobe/2287 is trying to acquire lock:
+ (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+ but task is already holding lock:
+ (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+
+The bit position indicates hardirq, softirq, hardirq-read,
+softirq-read respectively, and the character displayed in each
+indicates:
+
+ '.' acquired while irqs disabled
+ '+' acquired in irq context
+ '-' acquired with irqs enabled
+ '?' read acquired in irq context with irqs enabled.
+
+Unused mutexes cannot be part of the cause of an error.
+
+
+Single-lock state rules:
+------------------------
+
+A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
+following states are exclusive, and only one of them is allowed to be
+set for any lock-class:
+
+ <hardirq-safe> and <hardirq-unsafe>
+ <softirq-safe> and <softirq-unsafe>
+
+The validator detects and reports lock usage that violate these
+single-lock state rules.
+
+Multi-lock dependency rules:
+----------------------------
+
+The same lock-class must not be acquired twice, because this could lead
+to lock recursion deadlocks.
+
+Furthermore, two locks may not be taken in different order:
+
+ <L1> -> <L2>
+ <L2> -> <L1>
+
+because this could lead to lock inversion deadlocks. (The validator
+finds such dependencies in arbitrary complexity, i.e. there can be any
+other locking sequence between the acquire-lock operations, the
+validator will still track all dependencies between locks.)
+
+Furthermore, the following usage based lock dependencies are not allowed
+between any two lock-classes:
+
+ <hardirq-safe> -> <hardirq-unsafe>
+ <softirq-safe> -> <softirq-unsafe>
+
+The first rule comes from the fact the a hardirq-safe lock could be
+taken by a hardirq context, interrupting a hardirq-unsafe lock - and
+thus could result in a lock inversion deadlock. Likewise, a softirq-safe
+lock could be taken by an softirq context, interrupting a softirq-unsafe
+lock.
+
+The above rules are enforced for any locking sequence that occurs in the
+kernel: when acquiring a new lock, the validator checks whether there is
+any rule violation between the new lock and any of the held locks.
+
+When a lock-class changes its state, the following aspects of the above
+dependency rules are enforced:
+
+- if a new hardirq-safe lock is discovered, we check whether it
+ took any hardirq-unsafe lock in the past.
+
+- if a new softirq-safe lock is discovered, we check whether it took
+ any softirq-unsafe lock in the past.
+
+- if a new hardirq-unsafe lock is discovered, we check whether any
+ hardirq-safe lock took it in the past.
+
+- if a new softirq-unsafe lock is discovered, we check whether any
+ softirq-safe lock took it in the past.
+
+(Again, we do these checks too on the basis that an interrupt context
+could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
+could lead to a lock inversion deadlock - even if that lock scenario did
+not trigger in practice yet.)
+
+Exception: Nested data dependencies leading to nested locking
+-------------------------------------------------------------
+
+There are a few cases where the Linux kernel acquires more than one
+instance of the same lock-class. Such cases typically happen when there
+is some sort of hierarchy within objects of the same type. In these
+cases there is an inherent "natural" ordering between the two objects
+(defined by the properties of the hierarchy), and the kernel grabs the
+locks in this fixed order on each of the objects.
+
+An example of such an object hierarchy that results in "nested locking"
+is that of a "whole disk" block-dev object and a "partition" block-dev
+object; the partition is "part of" the whole device and as long as one
+always takes the whole disk lock as a higher lock than the partition
+lock, the lock ordering is fully correct. The validator does not
+automatically detect this natural ordering, as the locking rule behind
+the ordering is not static.
+
+In order to teach the validator about this correct usage model, new
+versions of the various locking primitives were added that allow you to
+specify a "nesting level". An example call, for the block device mutex,
+looks like this:
+
+enum bdev_bd_mutex_lock_class
+{
+ BD_MUTEX_NORMAL,
+ BD_MUTEX_WHOLE,
+ BD_MUTEX_PARTITION
+};
+
+ mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
+
+In this case the locking is done on a bdev object that is known to be a
+partition.
+
+The validator treats a lock that is taken in such a nested fashion as a
+separate (sub)class for the purposes of validation.
+
+Note: When changing code to use the _nested() primitives, be careful and
+check really thoroughly that the hierarchy is correctly mapped; otherwise
+you can get false positives or false negatives.
+
+Proof of 100% correctness:
+--------------------------
+
+The validator achieves perfect, mathematical 'closure' (proof of locking
+correctness) in the sense that for every simple, standalone single-task
+locking sequence that occurred at least once during the lifetime of the
+kernel, the validator proves it with a 100% certainty that no
+combination and timing of these locking sequences can cause any class of
+lock related deadlock. [*]
+
+I.e. complex multi-CPU and multi-task locking scenarios do not have to
+occur in practice to prove a deadlock: only the simple 'component'
+locking chains have to occur at least once (anytime, in any
+task/context) for the validator to be able to prove correctness. (For
+example, complex deadlocks that would normally need more than 3 CPUs and
+a very unlikely constellation of tasks, irq-contexts and timings to
+occur, can be detected on a plain, lightly loaded single-CPU system as
+well!)
+
+This radically decreases the complexity of locking related QA of the
+kernel: what has to be done during QA is to trigger as many "simple"
+single-task locking dependencies in the kernel as possible, at least
+once, to prove locking correctness - instead of having to trigger every
+possible combination of locking interaction between CPUs, combined with
+every possible hardirq and softirq nesting scenario (which is impossible
+to do in practice).
+
+[*] assuming that the validator itself is 100% correct, and no other
+ part of the system corrupts the state of the validator in any way.
+ We also assume that all NMI/SMM paths [which could interrupt
+ even hardirq-disabled codepaths] are correct and do not interfere
+ with the validator. We also assume that the 64-bit 'chain hash'
+ value is unique for every lock-chain in the system. Also, lock
+ recursion must not be higher than 20.
+
+Performance:
+------------
+
+The above rules require _massive_ amounts of runtime checking. If we did
+that for every lock taken and for every irqs-enable event, it would
+render the system practically unusably slow. The complexity of checking
+is O(N^2), so even with just a few hundred lock-classes we'd have to do
+tens of thousands of checks for every event.
+
+This problem is solved by checking any given 'locking scenario' (unique
+sequence of locks taken after each other) only once. A simple stack of
+held locks is maintained, and a lightweight 64-bit hash value is
+calculated, which hash is unique for every lock chain. The hash value,
+when the chain is validated for the first time, is then put into a hash
+table, which hash-table can be checked in a lockfree manner. If the
+locking chain occurs again later on, the hash table tells us that we
+dont have to validate the chain again.
diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
new file mode 100644
index 0000000..4ba4664
--- /dev/null
+++ b/Documentation/lockstat.txt
@@ -0,0 +1,120 @@
+
+LOCK STATISTICS
+
+- WHAT
+
+As the name suggests, it provides statistics on locks.
+
+- WHY
+
+Because things like lock contention can severely impact performance.
+
+- HOW
+
+Lockdep already has hooks in the lock functions and maps lock instances to
+lock classes. We build on that. The graph below shows the relation between
+the lock functions and the various hooks therein.
+
+ __acquire
+ |
+ lock _____
+ | \
+ | __contended
+ | |
+ | <wait>
+ | _______/
+ |/
+ |
+ __acquired
+ |
+ .
+ <hold>
+ .
+ |
+ __release
+ |
+ unlock
+
+lock, unlock - the regular lock functions
+__* - the hooks
+<> - states
+
+With these hooks we provide the following statistics:
+
+ con-bounces - number of lock contention that involved x-cpu data
+ contentions - number of lock acquisitions that had to wait
+ wait time min - shortest (non-0) time we ever had to wait for a lock
+ max - longest time we ever had to wait for a lock
+ total - total time we spend waiting on this lock
+ acq-bounces - number of lock acquisitions that involved x-cpu data
+ acquisitions - number of times we took the lock
+ hold time min - shortest (non-0) time we ever held the lock
+ max - longest time we ever held the lock
+ total - total time this lock was held
+
+From these number various other statistics can be derived, such as:
+
+ hold time average = hold time total / acquisitions
+
+These numbers are gathered per lock class, per read/write state (when
+applicable).
+
+It also tracks 4 contention points per class. A contention point is a call site
+that had to wait on lock acquisition.
+
+ - USAGE
+
+Look at the current lock statistics:
+
+( line numbers not part of actual output, done for clarity in the explanation
+ below )
+
+# less /proc/lock_stat
+
+01 lock_stat version 0.2
+02 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+03 class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
+04 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+05
+06 &inode->i_data.tree_lock-W: 15 21657 0.18 1093295.30 11547131054.85 58 10415 0.16 87.51 6387.60
+07 &inode->i_data.tree_lock-R: 0 0 0.00 0.00 0.00 23302 231198 0.25 8.45 98023.38
+08 --------------------------
+09 &inode->i_data.tree_lock 0 [<ffffffff8027c08f>] add_to_page_cache+0x5f/0x190
+10
+11 ...............................................................................................................................................................................................
+12
+13 dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24
+14 -----------
+15 dcache_lock 180 [<ffffffff802c0d7e>] sys_getcwd+0x11e/0x230
+16 dcache_lock 165 [<ffffffff802c002a>] d_alloc+0x15a/0x210
+17 dcache_lock 33 [<ffffffff8035818d>] _atomic_dec_and_lock+0x4d/0x70
+18 dcache_lock 1 [<ffffffff802beef8>] shrink_dcache_parent+0x18/0x130
+
+This excerpt shows the first two lock class statistics. Line 01 shows the
+output version - each time the format changes this will be updated. Line 02-04
+show the header with column descriptions. Lines 05-10 and 13-18 show the actual
+statistics. These statistics come in two parts; the actual stats separated by a
+short separator (line 08, 14) from the contention points.
+
+The first lock (05-10) is a read/write lock, and shows two lines above the
+short separator. The contention points don't match the column descriptors,
+they have two: contentions and [<IP>] symbol.
+
+
+View the top contending locks:
+
+# grep : /proc/lock_stat | head
+ &inode->i_data.tree_lock-W: 15 21657 0.18 1093295.30 11547131054.85 58 10415 0.16 87.51 6387.60
+ &inode->i_data.tree_lock-R: 0 0 0.00 0.00 0.00 23302 231198 0.25 8.45 98023.38
+ dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24
+ &inode->i_mutex: 161 286 18446744073709 62882.54 1244614.55 3653 20598 18446744073709 62318.60 1693822.74
+ &zone->lru_lock: 94 94 0.53 7.33 92.10 4366 32690 0.29 59.81 16350.06
+ &inode->i_data.i_mmap_lock: 79 79 0.40 3.77 53.03 11779 87755 0.28 116.93 29898.44
+ &q->__queue_lock: 48 50 0.52 31.62 86.31 774 13131 0.17 113.08 12277.52
+ &rq->rq_lock_key: 43 47 0.74 68.50 170.63 3706 33929 0.22 107.99 17460.62
+ &rq->rq_lock_key#2: 39 46 0.75 6.68 49.03 2979 32292 0.17 125.17 17137.63
+ tasklist_lock-W: 15 15 1.45 10.87 32.70 1201 7390 0.58 62.55 13648.47
+
+Clear the statistics:
+
+# echo 0 > /proc/lock_stat
diff --git a/Documentation/logo.gif b/Documentation/logo.gif
new file mode 100644
index 0000000..2eae75f
--- /dev/null
+++ b/Documentation/logo.gif
Binary files differ
diff --git a/Documentation/logo.txt b/Documentation/logo.txt
new file mode 100644
index 0000000..296f0f7
--- /dev/null
+++ b/Documentation/logo.txt
@@ -0,0 +1,13 @@
+This is the full-colour version of the currently unofficial Linux logo
+("currently unofficial" just means that there has been no paperwork and
+that I have not really announced it yet). It was created by Larry Ewing,
+and is freely usable as long as you acknowledge Larry as the original
+artist.
+
+Note that there are black-and-white versions of this available that
+scale down to smaller sizes and are better for letterheads or whatever
+you want to use it for: for the full range of logos take a look at
+Larry's web-page:
+
+ http://www.isc.tamu.edu/~lewing/linux/
+
diff --git a/Documentation/m68k/00-INDEX b/Documentation/m68k/00-INDEX
new file mode 100644
index 0000000..a014e9f
--- /dev/null
+++ b/Documentation/m68k/00-INDEX
@@ -0,0 +1,5 @@
+00-INDEX
+ - this file
+kernel-options.txt
+ - command line options for Linux/m68k
+
diff --git a/Documentation/m68k/README.buddha b/Documentation/m68k/README.buddha
new file mode 100644
index 0000000..3ea9827
--- /dev/null
+++ b/Documentation/m68k/README.buddha
@@ -0,0 +1,210 @@
+
+The Amiga Buddha and Catweasel IDE Driver (part of ide.c) was written by
+Geert Uytterhoeven based on the following specifications:
+
+------------------------------------------------------------------------
+
+Register map of the Buddha IDE controller and the
+Buddha-part of the Catweasel Zorro-II version
+
+The Autoconfiguration has been implemented just as Commodore
+described in their manuals, no tricks have been used (for
+example leaving some address lines out of the equations...).
+If you want to configure the board yourself (for example let
+a Linux kernel configure the card), look at the Commodore
+Docs. Reading the nibbles should give this information:
+
+Vendor number: 4626 ($1212)
+product number: 0 (42 for Catweasel Z-II)
+Serial number: 0
+Rom-vector: $1000
+
+The card should be a Z-II board, size 64K, not for freemem
+list, Rom-Vektor is valid, no second Autoconfig-board on the
+same card, no space preference, supports "Shutup_forever".
+
+Setting the base address should be done in two steps, just
+as the Amiga Kickstart does: The lower nibble of the 8-Bit
+address is written to $4a, then the whole Byte is written to
+$48, while it doesn't matter how often you're writing to $4a
+as long as $48 is not touched. After $48 has been written,
+the whole card disappears from $e8 and is mapped to the new
+address just written. Make sure $4a is written before $48,
+otherwise your chance is only 1:16 to find the board :-).
+
+The local memory-map is even active when mapped to $e8:
+
+$0-$7e Autokonfig-space, see Z-II docs.
+
+$80-$7fd reserved
+
+$7fe Speed-select Register: Read & Write
+ (description see further down)
+
+$800-$8ff IDE-Select 0 (Port 0, Register set 0)
+
+$900-$9ff IDE-Select 1 (Port 0, Register set 1)
+
+$a00-$aff IDE-Select 2 (Port 1, Register set 0)
+
+$b00-$bff IDE-Select 3 (Port 1, Register set 1)
+
+$c00-$cff IDE-Select 4 (Port 2, Register set 0,
+ Catweasel only!)
+
+$d00-$dff IDE-Select 5 (Port 3, Register set 1,
+ Catweasel only!)
+
+$e00-$eff local expansion port, on Catweasel Z-II the
+ Catweasel registers are also mapped here.
+ Never touch, use multidisk.device!
+
+$f00 read only, Byte-access: Bit 7 shows the
+ level of the IRQ-line of IDE port 0.
+
+$f01-$f3f mirror of $f00
+
+$f40 read only, Byte-access: Bit 7 shows the
+ level of the IRQ-line of IDE port 1.
+
+$f41-$f7f mirror of $f40
+
+$f80 read only, Byte-access: Bit 7 shows the
+ level of the IRQ-line of IDE port 2.
+ (Catweasel only!)
+
+$f81-$fbf mirror of $f80
+
+$fc0 write-only: Writing any value to this
+ register enables IRQs to be passed from the
+ IDE ports to the Zorro bus. This mechanism
+ has been implemented to be compatible with
+ harddisks that are either defective or have
+ a buggy firmware and pull the IRQ line up
+ while starting up. If interrupts would
+ always be passed to the bus, the computer
+ might not start up. Once enabled, this flag
+ can not be disabled again. The level of the
+ flag can not be determined by software
+ (what for? Write to me if it's necessary!).
+
+$fc1-$fff mirror of $fc0
+
+$1000-$ffff Buddha-Rom with offset $1000 in the rom
+ chip. The addresses $0 to $fff of the rom
+ chip cannot be read. Rom is Byte-wide and
+ mapped to even addresses.
+
+The IDE ports issue an INT2. You can read the level of the
+IRQ-lines of the IDE-ports by reading from the three (two
+for Buddha-only) registers $f00, $f40 and $f80. This way
+more than one I/O request can be handled and you can easily
+determine what driver has to serve the INT2. Buddha and
+Catweasel expansion boards can issue an INT6. A separate
+memory map is available for the I/O module and the sysop's
+I/O module.
+
+The IDE ports are fed by the address lines A2 to A4, just as
+the Amiga 1200 and Amiga 4000 IDE ports are. This way
+existing drivers can be easily ported to Buddha. A move.l
+polls two words out of the same address of IDE port since
+every word is mirrored once. movem is not possible, but
+it's not necessary either, because you can only speedup
+68000 systems with this technique. A 68020 system with
+fastmem is faster with move.l.
+
+If you're using the mirrored registers of the IDE-ports with
+A6=1, the Buddha doesn't care about the speed that you have
+selected in the speed register (see further down). With
+A6=1 (for example $840 for port 0, register set 0), a 780ns
+access is being made. These registers should be used for a
+command access to the harddisk/CD-Rom, since command
+accesses are Byte-wide and have to be made slower according
+to the ATA-X3T9 manual.
+
+Now for the speed-register: The register is byte-wide, and
+only the upper three bits are used (Bits 7 to 5). Bit 4
+must always be set to 1 to be compatible with later Buddha
+versions (if I'll ever update this one). I presume that
+I'll never use the lower four bits, but they have to be set
+to 1 by definition.
+ The values in this table have to be shifted 5 bits to the
+left and or'd with $1f (this sets the lower 5 bits).
+
+All the timings have in common: Select and IOR/IOW rise at
+the same time. IOR and IOW have a propagation delay of
+about 30ns to the clocks on the Zorro bus, that's why the
+values are no multiple of 71. One clock-cycle is 71ns long
+(exactly 70,5 at 14,18 Mhz on PAL systems).
+
+value 0 (Default after reset)
+
+497ns Select (7 clock cycles) , IOR/IOW after 172ns (2 clock cycles)
+(same timing as the Amiga 1200 does on it's IDE port without
+accelerator card)
+
+value 1
+
+639ns Select (9 clock cycles), IOR/IOW after 243ns (3 clock cycles)
+
+value 2
+
+781ns Select (11 clock cycles), IOR/IOW after 314ns (4 clock cycles)
+
+value 3
+
+355ns Select (5 clock cycles), IOR/IOW after 101ns (1 clock cycle)
+
+value 4
+
+355ns Select (5 clock cycles), IOR/IOW after 172ns (2 clock cycles)
+
+value 5
+
+355ns Select (5 clock cycles), IOR/IOW after 243ns (3 clock cycles)
+
+value 6
+
+1065ns Select (15 clock cycles), IOR/IOW after 314ns (4 clock cycles)
+
+value 7
+
+355ns Select, (5 clock cycles), IOR/IOW after 101ns (1 clock cycle)
+
+When accessing IDE registers with A6=1 (for example $84x),
+the timing will always be mode 0 8-bit compatible, no matter
+what you have selected in the speed register:
+
+781ns select, IOR/IOW after 4 clock cycles (=314ns) aktive.
+
+All the timings with a very short select-signal (the 355ns
+fast accesses) depend on the accelerator card used in the
+system: Sometimes two more clock cycles are inserted by the
+bus interface, making the whole access 497ns long. This
+doesn't affect the reliability of the controller nor the
+performance of the card, since this doesn't happen very
+often.
+
+All the timings are calculated and only confirmed by
+measurements that allowed me to count the clock cycles. If
+the system is clocked by an oscillator other than 28,37516
+Mhz (for example the NTSC-frequency 28,63636 Mhz), each
+clock cycle is shortened to a bit less than 70ns (not worth
+mentioning). You could think of a small performance boost
+by overclocking the system, but you would either need a
+multisync monitor, or a graphics card, and your internal
+diskdrive would go crazy, that's why you shouldn't tune your
+Amiga this way.
+
+Giving you the possibility to write software that is
+compatible with both the Buddha and the Catweasel Z-II, The
+Buddha acts just like a Catweasel Z-II with no device
+connected to the third IDE-port. The IRQ-register $f80
+always shows a "no IRQ here" on the Buddha, and accesses to
+the third IDE port are going into data's Nirwana on the
+Buddha.
+
+ Jens Schönfeld february 19th, 1997
+ updated may 27th, 1997
+ eMail: sysop@nostlgic.tng.oche.de
+
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
new file mode 100644
index 0000000..c93bed6
--- /dev/null
+++ b/Documentation/m68k/kernel-options.txt
@@ -0,0 +1,872 @@
+
+
+ Command Line Options for Linux/m68k
+ ===================================
+
+Last Update: 2 May 1999
+Linux/m68k version: 2.2.6
+Author: Roman.Hodek@informatik.uni-erlangen.de (Roman Hodek)
+Update: jds@kom.auc.dk (Jes Sorensen) and faq@linux-m68k.org (Chris Lawrence)
+
+0) Introduction
+===============
+
+ Often I've been asked which command line options the Linux/m68k
+kernel understands, or how the exact syntax for the ... option is, or
+... about the option ... . I hope, this document supplies all the
+answers...
+
+ Note that some options might be outdated, their descriptions being
+incomplete or missing. Please update the information and send in the
+patches.
+
+
+1) Overview of the Kernel's Option Processing
+=============================================
+
+The kernel knows three kinds of options on its command line:
+
+ 1) kernel options
+ 2) environment settings
+ 3) arguments for init
+
+To which of these classes an argument belongs is determined as
+follows: If the option is known to the kernel itself, i.e. if the name
+(the part before the '=') or, in some cases, the whole argument string
+is known to the kernel, it belongs to class 1. Otherwise, if the
+argument contains an '=', it is of class 2, and the definition is put
+into init's environment. All other arguments are passed to init as
+command line options.
+
+ This document describes the valid kernel options for Linux/m68k in
+the version mentioned at the start of this file. Later revisions may
+add new such options, and some may be missing in older versions.
+
+ In general, the value (the part after the '=') of an option is a
+list of values separated by commas. The interpretation of these values
+is up to the driver that "owns" the option. This association of
+options with drivers is also the reason that some are further
+subdivided.
+
+
+2) General Kernel Options
+=========================
+
+2.1) root=
+----------
+
+Syntax: root=/dev/<device>
+ or: root=<hex_number>
+
+This tells the kernel which device it should mount as the root
+filesystem. The device must be a block device with a valid filesystem
+on it.
+
+ The first syntax gives the device by name. These names are converted
+into a major/minor number internally in the kernel in an unusual way.
+Normally, this "conversion" is done by the device files in /dev, but
+this isn't possible here, because the root filesystem (with /dev)
+isn't mounted yet... So the kernel parses the name itself, with some
+hardcoded name to number mappings. The name must always be a
+combination of two or three letters, followed by a decimal number.
+Valid names are:
+
+ /dev/ram: -> 0x0100 (initial ramdisk)
+ /dev/hda: -> 0x0300 (first IDE disk)
+ /dev/hdb: -> 0x0340 (second IDE disk)
+ /dev/sda: -> 0x0800 (first SCSI disk)
+ /dev/sdb: -> 0x0810 (second SCSI disk)
+ /dev/sdc: -> 0x0820 (third SCSI disk)
+ /dev/sdd: -> 0x0830 (forth SCSI disk)
+ /dev/sde: -> 0x0840 (fifth SCSI disk)
+ /dev/fd : -> 0x0200 (floppy disk)
+ /dev/xda: -> 0x0c00 (first XT disk, unused in Linux/m68k)
+ /dev/xdb: -> 0x0c40 (second XT disk, unused in Linux/m68k)
+
+ The name must be followed by a decimal number, that stands for the
+partition number. Internally, the value of the number is just
+added to the device number mentioned in the table above. The
+exceptions are /dev/ram and /dev/fd, where /dev/ram refers to an
+initial ramdisk loaded by your bootstrap program (please consult the
+instructions for your bootstrap program to find out how to load an
+initial ramdisk). As of kernel version 2.0.18 you must specify
+/dev/ram as the root device if you want to boot from an initial
+ramdisk. For the floppy devices, /dev/fd, the number stands for the
+floppy drive number (there are no partitions on floppy disks). I.e.,
+/dev/fd0 stands for the first drive, /dev/fd1 for the second, and so
+on. Since the number is just added, you can also force the disk format
+by adding a number greater than 3. If you look into your /dev
+directory, use can see the /dev/fd0D720 has major 2 and minor 16. You
+can specify this device for the root FS by writing "root=/dev/fd16" on
+the kernel command line.
+
+[Strange and maybe uninteresting stuff ON]
+
+ This unusual translation of device names has some strange
+consequences: If, for example, you have a symbolic link from /dev/fd
+to /dev/fd0D720 as an abbreviation for floppy driver #0 in DD format,
+you cannot use this name for specifying the root device, because the
+kernel cannot see this symlink before mounting the root FS and it
+isn't in the table above. If you use it, the root device will not be
+set at all, without an error message. Another example: You cannot use a
+partition on e.g. the sixth SCSI disk as the root filesystem, if you
+want to specify it by name. This is, because only the devices up to
+/dev/sde are in the table above, but not /dev/sdf. Although, you can
+use the sixth SCSI disk for the root FS, but you have to specify the
+device by number... (see below). Or, even more strange, you can use the
+fact that there is no range checking of the partition number, and your
+knowledge that each disk uses 16 minors, and write "root=/dev/sde17"
+(for /dev/sdf1).
+
+[Strange and maybe uninteresting stuff OFF]
+
+ If the device containing your root partition isn't in the table
+above, you can also specify it by major and minor numbers. These are
+written in hex, with no prefix and no separator between. E.g., if you
+have a CD with contents appropriate as a root filesystem in the first
+SCSI CD-ROM drive, you boot from it by "root=0b00". Here, hex "0b" =
+decimal 11 is the major of SCSI CD-ROMs, and the minor 0 stands for
+the first of these. You can find out all valid major numbers by
+looking into include/linux/major.h.
+
+
+2.2) ro, rw
+-----------
+
+Syntax: ro
+ or: rw
+
+These two options tell the kernel whether it should mount the root
+filesystem read-only or read-write. The default is read-only, except
+for ramdisks, which default to read-write.
+
+
+2.3) debug
+----------
+
+Syntax: debug
+
+This raises the kernel log level to 10 (the default is 7). This is the
+same level as set by the "dmesg" command, just that the maximum level
+selectable by dmesg is 8.
+
+
+2.4) debug=
+-----------
+
+Syntax: debug=<device>
+
+This option causes certain kernel messages be printed to the selected
+debugging device. This can aid debugging the kernel, since the
+messages can be captured and analyzed on some other machine. Which
+devices are possible depends on the machine type. There are no checks
+for the validity of the device name. If the device isn't implemented,
+nothing happens.
+
+ Messages logged this way are in general stack dumps after kernel
+memory faults or bad kernel traps, and kernel panics. To be exact: all
+messages of level 0 (panic messages) and all messages printed while
+the log level is 8 or more (their level doesn't matter). Before stack
+dumps, the kernel sets the log level to 10 automatically. A level of
+at least 8 can also be set by the "debug" command line option (see
+2.3) and at run time with "dmesg -n 8".
+
+Devices possible for Amiga:
+
+ - "ser": built-in serial port; parameters: 9600bps, 8N1
+ - "mem": Save the messages to a reserved area in chip mem. After
+ rebooting, they can be read under AmigaOS with the tool
+ 'dmesg'.
+
+Devices possible for Atari:
+
+ - "ser1": ST-MFP serial port ("Modem1"); parameters: 9600bps, 8N1
+ - "ser2": SCC channel B serial port ("Modem2"); parameters: 9600bps, 8N1
+ - "ser" : default serial port
+ This is "ser2" for a Falcon, and "ser1" for any other machine
+ - "midi": The MIDI port; parameters: 31250bps, 8N1
+ - "par" : parallel port
+ The printing routine for this implements a timeout for the
+ case there's no printer connected (else the kernel would
+ lock up). The timeout is not exact, but usually a few
+ seconds.
+
+
+2.6) ramdisk_size=
+-------------
+
+Syntax: ramdisk_size=<size>
+
+ This option instructs the kernel to set up a ramdisk of the given
+size in KBytes. Do not use this option if the ramdisk contents are
+passed by bootstrap! In this case, the size is selected automatically
+and should not be overwritten.
+
+ The only application is for root filesystems on floppy disks, that
+should be loaded into memory. To do that, select the corresponding
+size of the disk as ramdisk size, and set the root device to the disk
+drive (with "root=").
+
+
+2.7) swap=
+2.8) buff=
+-----------
+
+ I can't find any sign of these options in 2.2.6.
+
+
+3) General Device Options (Amiga and Atari)
+===========================================
+
+3.1) ether=
+-----------
+
+Syntax: ether=[<irq>[,<base_addr>[,<mem_start>[,<mem_end>]]]],<dev-name>
+
+ <dev-name> is the name of a net driver, as specified in
+drivers/net/Space.c in the Linux source. Most prominent are eth0, ...
+eth3, sl0, ... sl3, ppp0, ..., ppp3, dummy, and lo.
+
+ The non-ethernet drivers (sl, ppp, dummy, lo) obviously ignore the
+settings by this options. Also, the existing ethernet drivers for
+Linux/m68k (ariadne, a2065, hydra) don't use them because Zorro boards
+are really Plug-'n-Play, so the "ether=" option is useless altogether
+for Linux/m68k.
+
+
+3.2) hd=
+--------
+
+Syntax: hd=<cylinders>,<heads>,<sectors>
+
+ This option sets the disk geometry of an IDE disk. The first hd=
+option is for the first IDE disk, the second for the second one.
+(I.e., you can give this option twice.) In most cases, you won't have
+to use this option, since the kernel can obtain the geometry data
+itself. It exists just for the case that this fails for one of your
+disks.
+
+
+3.3) max_scsi_luns=
+-------------------
+
+Syntax: max_scsi_luns=<n>
+
+ Sets the maximum number of LUNs (logical units) of SCSI devices to
+be scanned. Valid values for <n> are between 1 and 8. Default is 8 if
+"Probe all LUNs on each SCSI device" was selected during the kernel
+configuration, else 1.
+
+
+3.4) st=
+--------
+
+Syntax: st=<buffer_size>,[<write_thres>,[<max_buffers>]]
+
+ Sets several parameters of the SCSI tape driver. <buffer_size> is
+the number of 512-byte buffers reserved for tape operations for each
+device. <write_thres> sets the number of blocks which must be filled
+to start an actual write operation to the tape. Maximum value is the
+total number of buffers. <max_buffer> limits the total number of
+buffers allocated for all tape devices.
+
+
+3.5) dmasound=
+--------------
+
+Syntax: dmasound=[<buffers>,<buffer-size>[,<catch-radius>]]
+
+ This option controls some configurations of the Linux/m68k DMA sound
+driver (Amiga and Atari): <buffers> is the number of buffers you want
+to use (minimum 4, default 4), <buffer-size> is the size of each
+buffer in kilobytes (minimum 4, default 32) and <catch-radius> says
+how much percent of error will be tolerated when setting a frequency
+(maximum 10, default 0). For example with 3% you can play 8000Hz
+AU-Files on the Falcon with its hardware frequency of 8195Hz and thus
+don't need to expand the sound.
+
+
+
+4) Options for Atari Only
+=========================
+
+4.1) video=
+-----------
+
+Syntax: video=<fbname>:<sub-options...>
+
+The <fbname> parameter specifies the name of the frame buffer,
+eg. most atari users will want to specify `atafb' here. The
+<sub-options> is a comma-separated list of the sub-options listed
+below.
+
+NB: Please notice that this option was renamed from `atavideo' to
+ `video' during the development of the 1.3.x kernels, thus you
+ might need to update your boot-scripts if upgrading to 2.x from
+ an 1.2.x kernel.
+
+NBB: The behavior of video= was changed in 2.1.57 so the recommended
+option is to specify the name of the frame buffer.
+
+4.1.1) Video Mode
+-----------------
+
+This sub-option may be any of the predefined video modes, as listed
+in atari/atafb.c in the Linux/m68k source tree. The kernel will
+activate the given video mode at boot time and make it the default
+mode, if the hardware allows. Currently defined names are:
+
+ - stlow : 320x200x4
+ - stmid, default5 : 640x200x2
+ - sthigh, default4: 640x400x1
+ - ttlow : 320x480x8, TT only
+ - ttmid, default1 : 640x480x4, TT only
+ - tthigh, default2: 1280x960x1, TT only
+ - vga2 : 640x480x1, Falcon only
+ - vga4 : 640x480x2, Falcon only
+ - vga16, default3 : 640x480x4, Falcon only
+ - vga256 : 640x480x8, Falcon only
+ - falh2 : 896x608x1, Falcon only
+ - falh16 : 896x608x4, Falcon only
+
+ If no video mode is given on the command line, the kernel tries the
+modes names "default<n>" in turn, until one is possible with the
+hardware in use.
+
+ A video mode setting doesn't make sense, if the external driver is
+activated by a "external:" sub-option.
+
+4.1.2) inverse
+--------------
+
+Invert the display. This affects both, text (consoles) and graphics
+(X) display. Usually, the background is chosen to be black. With this
+option, you can make the background white.
+
+4.1.3) font
+-----------
+
+Syntax: font:<fontname>
+
+Specify the font to use in text modes. Currently you can choose only
+between `VGA8x8', `VGA8x16' and `PEARL8x8'. `VGA8x8' is default, if the
+vertical size of the display is less than 400 pixel rows. Otherwise, the
+`VGA8x16' font is the default.
+
+4.1.4) hwscroll_
+----------------
+
+Syntax: hwscroll_<n>
+
+The number of additional lines of video memory to reserve for
+speeding up the scrolling ("hardware scrolling"). Hardware scrolling
+is possible only if the kernel can set the video base address in steps
+fine enough. This is true for STE, MegaSTE, TT, and Falcon. It is not
+possible with plain STs and graphics cards (The former because the
+base address must be on a 256 byte boundary there, the latter because
+the kernel doesn't know how to set the base address at all.)
+
+ By default, <n> is set to the number of visible text lines on the
+display. Thus, the amount of video memory is doubled, compared to no
+hardware scrolling. You can turn off the hardware scrolling altogether
+by setting <n> to 0.
+
+4.1.5) internal:
+----------------
+
+Syntax: internal:<xres>;<yres>[;<xres_max>;<yres_max>;<offset>]
+
+This option specifies the capabilities of some extended internal video
+hardware, like e.g. OverScan. <xres> and <yres> give the (extended)
+dimensions of the screen.
+
+ If your OverScan needs a black border, you have to write the last
+three arguments of the "internal:". <xres_max> is the maximum line
+length the hardware allows, <yres_max> the maximum number of lines.
+<offset> is the offset of the visible part of the screen memory to its
+physical start, in bytes.
+
+ Often, extended interval video hardware has to be activated somehow.
+For this, see the "sw_*" options below.
+
+4.1.6) external:
+----------------
+
+Syntax:
+ external:<xres>;<yres>;<depth>;<org>;<scrmem>[;<scrlen>[;<vgabase>\
+ [;<colw>[;<coltype>[;<xres_virtual>]]]]]
+
+[I had to break this line...]
+
+ This is probably the most complicated parameter... It specifies that
+you have some external video hardware (a graphics board), and how to
+use it under Linux/m68k. The kernel cannot know more about the hardware
+than you tell it here! The kernel also is unable to set or change any
+video modes, since it doesn't know about any board internal. So, you
+have to switch to that video mode before you start Linux, and cannot
+switch to another mode once Linux has started.
+
+ The first 3 parameters of this sub-option should be obvious: <xres>,
+<yres> and <depth> give the dimensions of the screen and the number of
+planes (depth). The depth is the logarithm to base 2 of the number
+of colors possible. (Or, the other way round: The number of colors is
+2^depth).
+
+ You have to tell the kernel furthermore how the video memory is
+organized. This is done by a letter as <org> parameter:
+
+ 'n': "normal planes", i.e. one whole plane after another
+ 'i': "interleaved planes", i.e. 16 bit of the first plane, than 16 bit
+ of the next, and so on... This mode is used only with the
+ built-in Atari video modes, I think there is no card that
+ supports this mode.
+ 'p': "packed pixels", i.e. <depth> consecutive bits stand for all
+ planes of one pixel; this is the most common mode for 8 planes
+ (256 colors) on graphic cards
+ 't': "true color" (more or less packed pixels, but without a color
+ lookup table); usually depth is 24
+
+For monochrome modes (i.e., <depth> is 1), the <org> letter has a
+different meaning:
+
+ 'n': normal colors, i.e. 0=white, 1=black
+ 'i': inverted colors, i.e. 0=black, 1=white
+
+ The next important information about the video hardware is the base
+address of the video memory. That is given in the <scrmem> parameter,
+as a hexadecimal number with a "0x" prefix. You have to find out this
+address in the documentation of your hardware.
+
+ The next parameter, <scrlen>, tells the kernel about the size of the
+video memory. If it's missing, the size is calculated from <xres>,
+<yres>, and <depth>. For now, it is not useful to write a value here.
+It would be used only for hardware scrolling (which isn't possible
+with the external driver, because the kernel cannot set the video base
+address), or for virtual resolutions under X (which the X server
+doesn't support yet). So, it's currently best to leave this field
+empty, either by ending the "external:" after the video address or by
+writing two consecutive semicolons, if you want to give a <vgabase>
+(it is allowed to leave this parameter empty).
+
+ The <vgabase> parameter is optional. If it is not given, the kernel
+cannot read or write any color registers of the video hardware, and
+thus you have to set appropriate colors before you start Linux. But if
+your card is somehow VGA compatible, you can tell the kernel the base
+address of the VGA register set, so it can change the color lookup
+table. You have to look up this address in your board's documentation.
+To avoid misunderstandings: <vgabase> is the _base_ address, i.e. a 4k
+aligned address. For read/writing the color registers, the kernel
+uses the addresses vgabase+0x3c7...vgabase+0x3c9. The <vgabase>
+parameter is written in hexadecimal with a "0x" prefix, just as
+<scrmem>.
+
+ <colw> is meaningful only if <vgabase> is specified. It tells the
+kernel how wide each of the color register is, i.e. the number of bits
+per single color (red/green/blue). Default is 6, another quite usual
+value is 8.
+
+ Also <coltype> is used together with <vgabase>. It tells the kernel
+about the color register model of your gfx board. Currently, the types
+"vga" (which is also the default) and "mv300" (SANG MV300) are
+implemented.
+
+ Parameter <xres_virtual> is required for ProMST or ET4000 cards where
+the physical linelength differs from the visible length. With ProMST,
+xres_virtual must be set to 2048. For ET4000, xres_virtual depends on the
+initialisation of the video-card.
+If you're missing a corresponding yres_virtual: the external part is legacy,
+therefore we don't support hardware-dependent functions like hardware-scroll,
+panning or blanking.
+
+4.1.7) eclock:
+--------------
+
+The external pixel clock attached to the Falcon VIDEL shifter. This
+currently works only with the ScreenWonder!
+
+4.1.8) monitorcap:
+-------------------
+
+Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
+
+This describes the capabilities of a multisync monitor. Don't use it
+with a fixed-frequency monitor! For now, only the Falcon frame buffer
+uses the settings of "monitorcap:".
+
+ <vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
+your monitor can work with, in Hz. <hmin> and <hmax> are the same for
+the horizontal frequency, in kHz.
+
+ The defaults are 58;62;31;32 (VGA compatible).
+
+ The defaults for TV/SC1224/SC1435 cover both PAL and NTSC standards.
+
+4.1.9) keep
+------------
+
+If this option is given, the framebuffer device doesn't do any video
+mode calculations and settings on its own. The only Atari fb device
+that does this currently is the Falcon.
+
+ What you reach with this: Settings for unknown video extensions
+aren't overridden by the driver, so you can still use the mode found
+when booting, when the driver doesn't know to set this mode itself.
+But this also means, that you can't switch video modes anymore...
+
+ An example where you may want to use "keep" is the ScreenBlaster for
+the Falcon.
+
+
+4.2) atamouse=
+--------------
+
+Syntax: atamouse=<x-threshold>,[<y-threshold>]
+
+ With this option, you can set the mouse movement reporting threshold.
+This is the number of pixels of mouse movement that have to accumulate
+before the IKBD sends a new mouse packet to the kernel. Higher values
+reduce the mouse interrupt load and thus reduce the chance of keyboard
+overruns. Lower values give a slightly faster mouse responses and
+slightly better mouse tracking.
+
+ You can set the threshold in x and y separately, but usually this is
+of little practical use. If there's just one number in the option, it
+is used for both dimensions. The default value is 2 for both
+thresholds.
+
+
+4.3) ataflop=
+-------------
+
+Syntax: ataflop=<drive type>[,<trackbuffering>[,<steprateA>[,<steprateB>]]]
+
+ The drive type may be 0, 1, or 2, for DD, HD, and ED, resp. This
+ setting affects how many buffers are reserved and which formats are
+ probed (see also below). The default is 1 (HD). Only one drive type
+ can be selected. If you have two disk drives, select the "better"
+ type.
+
+ The second parameter <trackbuffer> tells the kernel whether to use
+ track buffering (1) or not (0). The default is machine-dependent:
+ no for the Medusa and yes for all others.
+
+ With the two following parameters, you can change the default
+ steprate used for drive A and B, resp.
+
+
+4.4) atascsi=
+-------------
+
+Syntax: atascsi=<can_queue>[,<cmd_per_lun>[,<scat-gat>[,<host-id>[,<tagged>]]]]
+
+ This option sets some parameters for the Atari native SCSI driver.
+Generally, any number of arguments can be omitted from the end. And
+for each of the numbers, a negative value means "use default". The
+defaults depend on whether TT-style or Falcon-style SCSI is used.
+Below, defaults are noted as n/m, where the first value refers to
+TT-SCSI and the latter to Falcon-SCSI. If an illegal value is given
+for one parameter, an error message is printed and that one setting is
+ignored (others aren't affected).
+
+ <can_queue>:
+ This is the maximum number of SCSI commands queued internally to the
+ Atari SCSI driver. A value of 1 effectively turns off the driver
+ internal multitasking (if it causes problems). Legal values are >=
+ 1. <can_queue> can be as high as you like, but values greater than
+ <cmd_per_lun> times the number of SCSI targets (LUNs) you have
+ don't make sense. Default: 16/8.
+
+ <cmd_per_lun>:
+ Maximum number of SCSI commands issued to the driver for one
+ logical unit (LUN, usually one SCSI target). Legal values start
+ from 1. If tagged queuing (see below) is not used, values greater
+ than 2 don't make sense, but waste memory. Otherwise, the maximum
+ is the number of command tags available to the driver (currently
+ 32). Default: 8/1. (Note: Values > 1 seem to cause problems on a
+ Falcon, cause not yet known.)
+
+ The <cmd_per_lun> value at a great part determines the amount of
+ memory SCSI reserves for itself. The formula is rather
+ complicated, but I can give you some hints:
+ no scatter-gather : cmd_per_lun * 232 bytes
+ full scatter-gather: cmd_per_lun * approx. 17 Kbytes
+
+ <scat-gat>:
+ Size of the scatter-gather table, i.e. the number of requests
+ consecutive on the disk that can be merged into one SCSI command.
+ Legal values are between 0 and 255. Default: 255/0. Note: This
+ value is forced to 0 on a Falcon, since scatter-gather isn't
+ possible with the ST-DMA. Not using scatter-gather hurts
+ performance significantly.
+
+ <host-id>:
+ The SCSI ID to be used by the initiator (your Atari). This is
+ usually 7, the highest possible ID. Every ID on the SCSI bus must
+ be unique. Default: determined at run time: If the NV-RAM checksum
+ is valid, and bit 7 in byte 30 of the NV-RAM is set, the lower 3
+ bits of this byte are used as the host ID. (This method is defined
+ by Atari and also used by some TOS HD drivers.) If the above
+ isn't given, the default ID is 7. (both, TT and Falcon).
+
+ <tagged>:
+ 0 means turn off tagged queuing support, all other values > 0 mean
+ use tagged queuing for targets that support it. Default: currently
+ off, but this may change when tagged queuing handling has been
+ proved to be reliable.
+
+ Tagged queuing means that more than one command can be issued to
+ one LUN, and the SCSI device itself orders the requests so they
+ can be performed in optimal order. Not all SCSI devices support
+ tagged queuing (:-().
+
+4.5 switches=
+-------------
+
+Syntax: switches=<list of switches>
+
+ With this option you can switch some hardware lines that are often
+used to enable/disable certain hardware extensions. Examples are
+OverScan, overclocking, ...
+
+ The <list of switches> is a comma-separated list of the following
+items:
+
+ ikbd: set RTS of the keyboard ACIA high
+ midi: set RTS of the MIDI ACIA high
+ snd6: set bit 6 of the PSG port A
+ snd7: set bit 6 of the PSG port A
+
+It doesn't make sense to mention a switch more than once (no
+difference to only once), but you can give as many switches as you
+want to enable different features. The switch lines are set as early
+as possible during kernel initialization (even before determining the
+present hardware.)
+
+ All of the items can also be prefixed with "ov_", i.e. "ov_ikbd",
+"ov_midi", ... These options are meant for switching on an OverScan
+video extension. The difference to the bare option is that the
+switch-on is done after video initialization, and somehow synchronized
+to the HBLANK. A speciality is that ov_ikbd and ov_midi are switched
+off before rebooting, so that OverScan is disabled and TOS boots
+correctly.
+
+ If you give an option both, with and without the "ov_" prefix, the
+earlier initialization ("ov_"-less) takes precedence. But the
+switching-off on reset still happens in this case.
+
+5) Options for Amiga Only:
+==========================
+
+5.1) video=
+-----------
+
+Syntax: video=<fbname>:<sub-options...>
+
+The <fbname> parameter specifies the name of the frame buffer, valid
+options are `amifb', `cyber', 'virge', `retz3' and `clgen', provided
+that the respective frame buffer devices have been compiled into the
+kernel (or compiled as loadable modules). The behavior of the <fbname>
+option was changed in 2.1.57 so it is now recommended to specify this
+option.
+
+The <sub-options> is a comma-separated list of the sub-options listed
+below. This option is organized similar to the Atari version of the
+"video"-option (4.1), but knows fewer sub-options.
+
+5.1.1) video mode
+-----------------
+
+Again, similar to the video mode for the Atari (see 4.1.1). Predefined
+modes depend on the used frame buffer device.
+
+OCS, ECS and AGA machines all use the color frame buffer. The following
+predefined video modes are available:
+
+NTSC modes:
+ - ntsc : 640x200, 15 kHz, 60 Hz
+ - ntsc-lace : 640x400, 15 kHz, 60 Hz interlaced
+PAL modes:
+ - pal : 640x256, 15 kHz, 50 Hz
+ - pal-lace : 640x512, 15 kHz, 50 Hz interlaced
+ECS modes:
+ - multiscan : 640x480, 29 kHz, 57 Hz
+ - multiscan-lace : 640x960, 29 kHz, 57 Hz interlaced
+ - euro36 : 640x200, 15 kHz, 72 Hz
+ - euro36-lace : 640x400, 15 kHz, 72 Hz interlaced
+ - euro72 : 640x400, 29 kHz, 68 Hz
+ - euro72-lace : 640x800, 29 kHz, 68 Hz interlaced
+ - super72 : 800x300, 23 kHz, 70 Hz
+ - super72-lace : 800x600, 23 kHz, 70 Hz interlaced
+ - dblntsc-ff : 640x400, 27 kHz, 57 Hz
+ - dblntsc-lace : 640x800, 27 kHz, 57 Hz interlaced
+ - dblpal-ff : 640x512, 27 kHz, 47 Hz
+ - dblpal-lace : 640x1024, 27 kHz, 47 Hz interlaced
+ - dblntsc : 640x200, 27 kHz, 57 Hz doublescan
+ - dblpal : 640x256, 27 kHz, 47 Hz doublescan
+VGA modes:
+ - vga : 640x480, 31 kHz, 60 Hz
+ - vga70 : 640x400, 31 kHz, 70 Hz
+
+Please notice that the ECS and VGA modes require either an ECS or AGA
+chipset, and that these modes are limited to 2-bit color for the ECS
+chipset and 8-bit color for the AGA chipset.
+
+5.1.2) depth
+------------
+
+Syntax: depth:<nr. of bit-planes>
+
+Specify the number of bit-planes for the selected video-mode.
+
+5.1.3) inverse
+--------------
+
+Use inverted display (black on white). Functionally the same as the
+"inverse" sub-option for the Atari.
+
+5.1.4) font
+-----------
+
+Syntax: font:<fontname>
+
+Specify the font to use in text modes. Functionally the same as the
+"font" sub-option for the Atari, except that `PEARL8x8' is used instead
+of `VGA8x8' if the vertical size of the display is less than 400 pixel
+rows.
+
+5.1.5) monitorcap:
+-------------------
+
+Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
+
+This describes the capabilities of a multisync monitor. For now, only
+the color frame buffer uses the settings of "monitorcap:".
+
+ <vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
+your monitor can work with, in Hz. <hmin> and <hmax> are the same for
+the horizontal frequency, in kHz.
+
+ The defaults are 50;90;15;38 (Generic Amiga multisync monitor).
+
+
+5.2) fd_def_df0=
+----------------
+
+Syntax: fd_def_df0=<value>
+
+Sets the df0 value for "silent" floppy drives. The value should be in
+hexadecimal with "0x" prefix.
+
+
+5.3) wd33c93=
+-------------
+
+Syntax: wd33c93=<sub-options...>
+
+These options affect the A590/A2091, A3000 and GVP Series II SCSI
+controllers.
+
+The <sub-options> is a comma-separated list of the sub-options listed
+below.
+
+5.3.1) nosync
+-------------
+
+Syntax: nosync:bitmask
+
+ bitmask is a byte where the 1st 7 bits correspond with the 7
+possible SCSI devices. Set a bit to prevent sync negotiation on that
+device. To maintain backwards compatibility, a command-line such as
+"wd33c93=255" will be automatically translated to
+"wd33c93=nosync:0xff". The default is to disable sync negotiation for
+all devices, eg. nosync:0xff.
+
+5.3.2) period
+-------------
+
+Syntax: period:ns
+
+ `ns' is the minimum # of nanoseconds in a SCSI data transfer
+period. Default is 500; acceptable values are 250 - 1000.
+
+5.3.3) disconnect
+-----------------
+
+Syntax: disconnect:x
+
+ Specify x = 0 to never allow disconnects, 2 to always allow them.
+x = 1 does 'adaptive' disconnects, which is the default and generally
+the best choice.
+
+5.3.4) debug
+------------
+
+Syntax: debug:x
+
+ If `DEBUGGING_ON' is defined, x is a bit mask that causes various
+types of debug output to printed - see the DB_xxx defines in
+wd33c93.h.
+
+5.3.5) clock
+------------
+
+Syntax: clock:x
+
+ x = clock input in MHz for WD33c93 chip. Normal values would be from
+8 through 20. The default value depends on your hostadapter(s),
+default for the A3000 internal controller is 14, for the A2091 it's 8
+and for the GVP hostadapters it's either 8 or 14, depending on the
+hostadapter and the SCSI-clock jumper present on some GVP
+hostadapters.
+
+5.3.6) next
+-----------
+
+ No argument. Used to separate blocks of keywords when there's more
+than one wd33c93-based host adapter in the system.
+
+5.3.7) nodma
+------------
+
+Syntax: nodma:x
+
+ If x is 1 (or if the option is just written as "nodma"), the WD33c93
+controller will not use DMA (= direct memory access) to access the
+Amiga's memory. This is useful for some systems (like A3000's and
+A4000's with the A3640 accelerator, revision 3.0) that have problems
+using DMA to chip memory. The default is 0, i.e. to use DMA if
+possible.
+
+
+5.4) gvp11=
+-----------
+
+Syntax: gvp11=<addr-mask>
+
+ The earlier versions of the GVP driver did not handle DMA
+address-mask settings correctly which made it necessary for some
+people to use this option, in order to get their GVP controller
+running under Linux. These problems have hopefully been solved and the
+use of this option is now highly unrecommended!
+
+ Incorrect use can lead to unpredictable behavior, so please only use
+this option if you *know* what you are doing and have a reason to do
+so. In any case if you experience problems and need to use this
+option, please inform us about it by mailing to the Linux/68k kernel
+mailing list.
+
+ The address mask set by this option specifies which addresses are
+valid for DMA with the GVP Series II SCSI controller. An address is
+valid, if no bits are set except the bits that are set in the mask,
+too.
+
+ Some versions of the GVP can only DMA into a 24 bit address range,
+some can address a 25 bit address range while others can use the whole
+32 bit address range for DMA. The correct setting depends on your
+controller and should be autodetected by the driver. An example is the
+24 bit region which is specified by a mask of 0x00fffffe.
+
+
+/* Local Variables: */
+/* mode: text */
+/* End: */
diff --git a/Documentation/magic-number.txt b/Documentation/magic-number.txt
new file mode 100644
index 0000000..9507002
--- /dev/null
+++ b/Documentation/magic-number.txt
@@ -0,0 +1,172 @@
+This file is a registry of magic numbers which are in use. When you
+add a magic number to a structure, you should also add it to this
+file, since it is best if the magic numbers used by various structures
+are unique.
+
+It is a *very* good idea to protect kernel data structures with magic
+numbers. This allows you to check at run time whether (a) a structure
+has been clobbered, or (b) you've passed the wrong structure to a
+routine. This last is especially useful --- particularly when you are
+passing pointers to structures via a void * pointer. The tty code,
+for example, does this frequently to pass driver-specific and line
+discipline-specific structures back and forth.
+
+The way to use magic numbers is to declare then at the beginning of
+the structure, like so:
+
+struct tty_ldisc {
+ int magic;
+ ...
+};
+
+Please follow this discipline when you are adding future enhancements
+to the kernel! It has saved me countless hours of debugging,
+especially in the screwy cases where an array has been overrun and
+structures following the array have been overwritten. Using this
+discipline, these cases get detected quickly and safely.
+
+ Theodore Ts'o
+ 31 Mar 94
+
+The magic table is current to Linux 2.1.55.
+
+ Michael Chastain
+ <mailto:mec@shout.net>
+ 22 Sep 1997
+
+Now it should be up to date with Linux 2.1.112. Because
+we are in feature freeze time it is very unlikely that
+something will change before 2.2.x. The entries are
+sorted by number field.
+
+ Krzysztof G. Baranowski
+ <mailto: kgb@knm.org.pl>
+ 29 Jul 1998
+
+Updated the magic table to Linux 2.5.45. Right over the feature freeze,
+but it is possible that some new magic numbers will sneak into the
+kernel before 2.6.x yet.
+
+ Petr Baudis
+ <pasky@ucw.cz>
+ 03 Nov 2002
+
+Updated the magic table to Linux 2.5.74.
+
+ Fabian Frederick
+ <ffrederick@users.sourceforge.net>
+ 09 Jul 2003
+
+
+Magic Name Number Structure File
+===========================================================================
+PG_MAGIC 'P' pg_{read,write}_hdr include/linux/pg.h
+CMAGIC 0x0111 user include/linux/a.out.h
+MKISS_DRIVER_MAGIC 0x04bf mkiss_channel drivers/net/mkiss.h
+RISCOM8_MAGIC 0x0907 riscom_port drivers/char/riscom8.h
+SPECIALIX_MAGIC 0x0907 specialix_port drivers/char/specialix_io8.h
+HDLC_MAGIC 0x239e n_hdlc drivers/char/n_hdlc.c
+APM_BIOS_MAGIC 0x4101 apm_user arch/i386/kernel/apm.c
+CYCLADES_MAGIC 0x4359 cyclades_port include/linux/cyclades.h
+DB_MAGIC 0x4442 fc_info drivers/net/iph5526_novram.c
+DL_MAGIC 0x444d fc_info drivers/net/iph5526_novram.c
+FASYNC_MAGIC 0x4601 fasync_struct include/linux/fs.h
+FF_MAGIC 0x4646 fc_info drivers/net/iph5526_novram.c
+ISICOM_MAGIC 0x4d54 isi_port include/linux/isicom.h
+PTY_MAGIC 0x5001 drivers/char/pty.c
+PPP_MAGIC 0x5002 ppp include/linux/if_pppvar.h
+SERIAL_MAGIC 0x5301 async_struct include/linux/serial.h
+SSTATE_MAGIC 0x5302 serial_state include/linux/serial.h
+SLIP_MAGIC 0x5302 slip drivers/net/slip.h
+STRIP_MAGIC 0x5303 strip drivers/net/strip.c
+X25_ASY_MAGIC 0x5303 x25_asy drivers/net/x25_asy.h
+SIXPACK_MAGIC 0x5304 sixpack drivers/net/hamradio/6pack.h
+AX25_MAGIC 0x5316 ax_disp drivers/net/mkiss.h
+ESP_MAGIC 0x53ee esp_struct drivers/char/esp.h
+TTY_MAGIC 0x5401 tty_struct include/linux/tty.h
+MGSL_MAGIC 0x5401 mgsl_info drivers/char/synclink.c
+TTY_DRIVER_MAGIC 0x5402 tty_driver include/linux/tty_driver.h
+MGSLPC_MAGIC 0x5402 mgslpc_info drivers/char/pcmcia/synclink_cs.c
+TTY_LDISC_MAGIC 0x5403 tty_ldisc include/linux/tty_ldisc.h
+USB_SERIAL_MAGIC 0x6702 usb_serial drivers/usb/serial/usb-serial.h
+FULL_DUPLEX_MAGIC 0x6969 drivers/net/tulip/de2104x.c
+USB_BLUETOOTH_MAGIC 0x6d02 usb_bluetooth drivers/usb/class/bluetty.c
+RFCOMM_TTY_MAGIC 0x6d02 net/bluetooth/rfcomm/tty.c
+USB_SERIAL_PORT_MAGIC 0x7301 usb_serial_port drivers/usb/serial/usb-serial.h
+CG_MAGIC 0x00090255 ufs_cylinder_group include/linux/ufs_fs.h
+A2232_MAGIC 0x000a2232 gs_port drivers/char/ser_a2232.h
+RPORT_MAGIC 0x00525001 r_port drivers/char/rocket_int.h
+LSEMAGIC 0x05091998 lse drivers/fc4/fc.c
+GDTIOCTL_MAGIC 0x06030f07 gdth_iowr_str drivers/scsi/gdth_ioctl.h
+RIEBL_MAGIC 0x09051990 drivers/net/atarilance.c
+RIO_MAGIC 0x12345678 gs_port drivers/char/rio/rio_linux.c
+SX_MAGIC 0x12345678 gs_port drivers/char/sx.h
+NBD_REQUEST_MAGIC 0x12560953 nbd_request include/linux/nbd.h
+RED_MAGIC2 0x170fc2a5 (any) mm/slab.c
+BAYCOM_MAGIC 0x19730510 baycom_state drivers/net/baycom_epp.c
+ISDN_X25IFACE_MAGIC 0x1e75a2b9 isdn_x25iface_proto_data
+ drivers/isdn/isdn_x25iface.h
+ECP_MAGIC 0x21504345 cdkecpsig include/linux/cdk.h
+LSOMAGIC 0x27091997 lso drivers/fc4/fc.c
+LSMAGIC 0x2a3b4d2a ls drivers/fc4/fc.c
+WANPIPE_MAGIC 0x414C4453 sdla_{dump,exec} include/linux/wanpipe.h
+CS_CARD_MAGIC 0x43525553 cs_card sound/oss/cs46xx.c
+LABELCL_MAGIC 0x4857434c labelcl_info_s include/asm/ia64/sn/labelcl.h
+ISDN_ASYNC_MAGIC 0x49344C01 modem_info include/linux/isdn.h
+CTC_ASYNC_MAGIC 0x49344C01 ctc_tty_info drivers/s390/net/ctctty.c
+ISDN_NET_MAGIC 0x49344C02 isdn_net_local_s drivers/isdn/i4l/isdn_net_lib.h
+SAVEKMSG_MAGIC2 0x4B4D5347 savekmsg arch/*/amiga/config.c
+STLI_BOARDMAGIC 0x4bc6c825 stlibrd include/linux/istallion.h
+CS_STATE_MAGIC 0x4c4f4749 cs_state sound/oss/cs46xx.c
+SLAB_C_MAGIC 0x4f17a36d kmem_cache mm/slab.c
+COW_MAGIC 0x4f4f4f4d cow_header_v1 arch/um/drivers/ubd_user.c
+I810_CARD_MAGIC 0x5072696E i810_card sound/oss/i810_audio.c
+TRIDENT_CARD_MAGIC 0x5072696E trident_card sound/oss/trident.c
+ROUTER_MAGIC 0x524d4157 wan_device include/linux/wanrouter.h
+SCC_MAGIC 0x52696368 gs_port drivers/char/scc.h
+SAVEKMSG_MAGIC1 0x53415645 savekmsg arch/*/amiga/config.c
+GDA_MAGIC 0x58464552 gda include/asm-mips64/sn/gda.h
+RED_MAGIC1 0x5a2cf071 (any) mm/slab.c
+STL_PORTMAGIC 0x5a7182c9 stlport include/linux/stallion.h
+EEPROM_MAGIC_VALUE 0x5ab478d2 lanai_dev drivers/atm/lanai.c
+HDLCDRV_MAGIC 0x5ac6e778 hdlcdrv_state include/linux/hdlcdrv.h
+EPCA_MAGIC 0x5c6df104 channel include/linux/epca.h
+PCXX_MAGIC 0x5c6df104 channel drivers/char/pcxx.h
+KV_MAGIC 0x5f4b565f kernel_vars_s include/asm-mips64/sn/klkernvars.h
+I810_STATE_MAGIC 0x63657373 i810_state sound/oss/i810_audio.c
+TRIDENT_STATE_MAGIC 0x63657373 trient_state sound/oss/trident.c
+M3_CARD_MAGIC 0x646e6f50 m3_card sound/oss/maestro3.c
+FW_HEADER_MAGIC 0x65726F66 fw_header drivers/atm/fore200e.h
+SLOT_MAGIC 0x67267321 slot drivers/hotplug/cpqphp.h
+SLOT_MAGIC 0x67267322 slot drivers/hotplug/acpiphp.h
+LO_MAGIC 0x68797548 nbd_device include/linux/nbd.h
+OPROFILE_MAGIC 0x6f70726f super_block drivers/oprofile/oprofilefs.h
+M3_STATE_MAGIC 0x734d724d m3_state sound/oss/maestro3.c
+STL_PANELMAGIC 0x7ef621a1 stlpanel include/linux/stallion.h
+VMALLOC_MAGIC 0x87654320 snd_alloc_track sound/core/memory.c
+KMALLOC_MAGIC 0x87654321 snd_alloc_track sound/core/memory.c
+PWC_MAGIC 0x89DC10AB pwc_device drivers/usb/media/pwc.h
+NBD_REPLY_MAGIC 0x96744668 nbd_reply include/linux/nbd.h
+STL_BOARDMAGIC 0xa2267f52 stlbrd include/linux/stallion.h
+ENI155_MAGIC 0xa54b872d midway_eprom drivers/atm/eni.h
+SCI_MAGIC 0xbabeface gs_port drivers/char/sh-sci.h
+CODA_MAGIC 0xC0DAC0DA coda_file_info include/linux/coda_fs_i.h
+DPMEM_MAGIC 0xc0ffee11 gdt_pci_sram drivers/scsi/gdth.h
+STLI_PORTMAGIC 0xe671c7a1 stliport include/linux/istallion.h
+YAM_MAGIC 0xF10A7654 yam_port drivers/net/hamradio/yam.c
+CCB_MAGIC 0xf2691ad2 ccb drivers/scsi/ncr53c8xx.c
+QUEUE_MAGIC_FREE 0xf7e1c9a3 queue_entry drivers/scsi/arm/queue.c
+QUEUE_MAGIC_USED 0xf7e1cc33 queue_entry drivers/scsi/arm/queue.c
+HTB_CMAGIC 0xFEFAFEF1 htb_class net/sched/sch_htb.c
+NMI_MAGIC 0x48414d4d455201 nmi_s include/asm-mips64/sn/nmi.h
+
+Note that there are also defined special per-driver magic numbers in sound
+memory management. See include/sound/sndmagic.h for complete list of them. Many
+OSS sound drivers have their magic numbers constructed from the soundcard PCI
+ID - these are not listed here as well.
+
+IrDA subsystem also uses large number of own magic numbers, see
+include/net/irda/irda.h for a complete list of them.
+
+HFS is another larger user of magic numbers - you can find them in
+fs/hfs/hfs.h.
diff --git a/Documentation/make/headers_install.txt b/Documentation/make/headers_install.txt
new file mode 100644
index 0000000..f2481ca
--- /dev/null
+++ b/Documentation/make/headers_install.txt
@@ -0,0 +1,46 @@
+Exporting kernel headers for use by userspace
+=============================================
+
+The "make headers_install" command exports the kernel's header files in a
+form suitable for use by userspace programs.
+
+The linux kernel's exported header files describe the API for user space
+programs attempting to use kernel services. These kernel header files are
+used by the system's C library (such as glibc or uClibc) to define available
+system calls, as well as constants and structures to be used with these
+system calls. The C library's header files include the kernel header files
+from the "linux" subdirectory. The system's libc headers are usually
+installed at the default location /usr/include and the kernel headers in
+subdirectories under that (most notably /usr/include/linux and
+/usr/include/asm).
+
+Kernel headers are backwards compatible, but not forwards compatible. This
+means that a program built against a C library using older kernel headers
+should run on a newer kernel (although it may not have access to new
+features), but a program built against newer kernel headers may not work on an
+older kernel.
+
+The "make headers_install" command can be run in the top level directory of the
+kernel source code (or using a standard out-of-tree build). It takes two
+optional arguments:
+
+ make headers_install ARCH=i386 INSTALL_HDR_PATH=/usr/include
+
+ARCH indicates which architecture to produce headers for, and defaults to the
+current architecture. The linux/asm directory of the exported kernel headers
+is platform-specific, to see a complete list of supported architectures use
+the command:
+
+ ls -d include/asm-* | sed 's/.*-//'
+
+INSTALL_HDR_PATH indicates where to install the headers. It defaults to
+"./usr/include".
+
+The command "make headers_install_all" exports headers for all architectures
+simultaneously. (This is mostly of interest to distribution maintainers,
+who create an architecture-independent tarball from the resulting include
+directory.) Remember to provide the appropriate linux/asm directory via "mv"
+or "ln -s" before building a C library with headers exported this way.
+
+The kernel header export infrastructure is maintained by David Woodhouse
+<dwmw2@infradead.org>.
diff --git a/Documentation/markers.txt b/Documentation/markers.txt
new file mode 100644
index 0000000..089f613
--- /dev/null
+++ b/Documentation/markers.txt
@@ -0,0 +1,85 @@
+ Using the Linux Kernel Markers
+
+ Mathieu Desnoyers
+
+
+This document introduces Linux Kernel Markers and their use. It provides
+examples of how to insert markers in the kernel and connect probe functions to
+them and provides some examples of probe functions.
+
+
+* Purpose of markers
+
+A marker placed in code provides a hook to call a function (probe) that you can
+provide at runtime. A marker can be "on" (a probe is connected to it) or "off"
+(no probe is attached). When a marker is "off" it has no effect, except for
+adding a tiny time penalty (checking a condition for a branch) and space
+penalty (adding a few bytes for the function call at the end of the
+instrumented function and adds a data structure in a separate section). When a
+marker is "on", the function you provide is called each time the marker is
+executed, in the execution context of the caller. When the function provided
+ends its execution, it returns to the caller (continuing from the marker site).
+
+You can put markers at important locations in the code. Markers are
+lightweight hooks that can pass an arbitrary number of parameters,
+described in a printk-like format string, to the attached probe function.
+
+They can be used for tracing and performance accounting.
+
+
+* Usage
+
+In order to use the macro trace_mark, you should include linux/marker.h.
+
+#include <linux/marker.h>
+
+And,
+
+trace_mark(subsystem_event, "myint %d mystring %s", someint, somestring);
+Where :
+- subsystem_event is an identifier unique to your event
+ - subsystem is the name of your subsystem.
+ - event is the name of the event to mark.
+- "myint %d mystring %s" is the formatted string for the serializer. "myint" and
+ "mystring" are repectively the field names associated with the first and
+ second parameter.
+- someint is an integer.
+- somestring is a char pointer.
+
+Connecting a function (probe) to a marker is done by providing a probe (function
+to call) for the specific marker through marker_probe_register() and can be
+activated by calling marker_arm(). Marker deactivation can be done by calling
+marker_disarm() as many times as marker_arm() has been called. Removing a probe
+is done through marker_probe_unregister(); it will disarm the probe.
+marker_synchronize_unregister() must be called before the end of the module exit
+function to make sure there is no caller left using the probe. This, and the
+fact that preemption is disabled around the probe call, make sure that probe
+removal and module unload are safe. See the "Probe example" section below for a
+sample probe module.
+
+The marker mechanism supports inserting multiple instances of the same marker.
+Markers can be put in inline functions, inlined static functions, and
+unrolled loops as well as regular functions.
+
+The naming scheme "subsystem_event" is suggested here as a convention intended
+to limit collisions. Marker names are global to the kernel: they are considered
+as being the same whether they are in the core kernel image or in modules.
+Conflicting format strings for markers with the same name will cause the markers
+to be detected to have a different format string not to be armed and will output
+a printk warning which identifies the inconsistency:
+
+"Format mismatch for probe probe_name (format), marker (format)"
+
+
+* Probe / marker example
+
+See the example provided in samples/markers/src
+
+Compile them with your kernel.
+
+Run, as root :
+modprobe marker-example (insmod order is not important)
+modprobe probe-example
+cat /proc/marker-example (returns an expected error)
+rmmod marker-example probe-example
+dmesg
diff --git a/Documentation/mca.txt b/Documentation/mca.txt
new file mode 100644
index 0000000..510375d
--- /dev/null
+++ b/Documentation/mca.txt
@@ -0,0 +1,313 @@
+i386 Micro Channel Architecture Support
+=======================================
+
+MCA support is enabled using the CONFIG_MCA define. A machine with a MCA
+bus will have the kernel variable MCA_bus set, assuming the BIOS feature
+bits are set properly (see arch/i386/boot/setup.S for information on
+how this detection is done).
+
+Adapter Detection
+=================
+
+The ideal MCA adapter detection is done through the use of the
+Programmable Option Select registers. Generic functions for doing
+this have been added in include/linux/mca.h and arch/i386/kernel/mca.c.
+Everything needed to detect adapters and read (and write) configuration
+information is there. A number of MCA-specific drivers already use
+this. The typical probe code looks like the following:
+
+ #include <linux/mca.h>
+
+ unsigned char pos2, pos3, pos4, pos5;
+ struct net_device* dev;
+ int slot;
+
+ if( MCA_bus ) {
+ slot = mca_find_adapter( ADAPTER_ID, 0 );
+ if( slot == MCA_NOTFOUND ) {
+ return -ENODEV;
+ }
+ /* optional - see below */
+ mca_set_adapter_name( slot, "adapter name & description" );
+ mca_set_adapter_procfn( slot, dev_getinfo, dev );
+
+ /* read the POS registers. Most devices only use 2 and 3 */
+ pos2 = mca_read_stored_pos( slot, 2 );
+ pos3 = mca_read_stored_pos( slot, 3 );
+ pos4 = mca_read_stored_pos( slot, 4 );
+ pos5 = mca_read_stored_pos( slot, 5 );
+ } else {
+ return -ENODEV;
+ }
+
+ /* extract configuration from pos[2345] and set everything up */
+
+Loadable modules should modify this to test that the specified IRQ and
+IO ports (plus whatever other stuff) match. See 3c523.c for example
+code (actually, smc-mca.c has a slightly more complex example that can
+handle a list of adapter ids).
+
+Keep in mind that devices should never directly access the POS registers
+(via inb(), outb(), etc). While it's generally safe, there is a small
+potential for blowing up hardware when it's done at the wrong time.
+Furthermore, accessing a POS register disables a device temporarily.
+This is usually okay during startup, but do _you_ want to rely on it?
+During initial configuration, mca_init() reads all the POS registers
+into memory. mca_read_stored_pos() accesses that data. mca_read_pos()
+and mca_write_pos() are also available for (safer) direct POS access,
+but their use is _highly_ discouraged. mca_write_pos() is particularly
+dangerous, as it is possible for adapters to be put in inconsistent
+states (i.e. sharing IO address, etc) and may result in crashes, toasted
+hardware, and blindness.
+
+User level drivers (such as the AGX X server) can use /proc/mca/pos to
+find adapters (see below).
+
+Some MCA adapters can also be detected via the usual ISA-style device
+probing (many SCSI adapters, for example). This sort of thing is highly
+discouraged. Perfectly good information is available telling you what's
+there, so there's no excuse for messing with random IO ports. However,
+we MCA people still appreciate any ISA-style driver that will work with
+our hardware. You take what you can get...
+
+Level-Triggered Interrupts
+==========================
+
+Because MCA uses level-triggered interrupts, a few problems arise with
+what might best be described as the ISA mindset and its effects on
+drivers. These sorts of problems are expected to become less common as
+more people use shared IRQs on PCI machines.
+
+In general, an interrupt must be acknowledged not only at the ICU (which
+is done automagically by the kernel), but at the device level. In
+particular, IRQ 0 must be reset after a timer interrupt (now done in
+arch/i386/kernel/time.c) or the first timer interrupt hangs the system.
+There were also problems with the 1.3.x floppy drivers, but that seems
+to have been fixed.
+
+IRQs are also shareable, and most MCA-specific devices should be coded
+with shared IRQs in mind.
+
+/proc/mca
+=========
+
+/proc/mca is a directory containing various files for adapters and
+other stuff.
+
+ /proc/mca/pos Straight listing of POS registers
+ /proc/mca/slot[1-8] Information on adapter in specific slot
+ /proc/mca/video Same for integrated video
+ /proc/mca/scsi Same for integrated SCSI
+ /proc/mca/machine Machine information
+
+See Appendix A for a sample.
+
+Device drivers can easily add their own information function for
+specific slots (including integrated ones) via the
+mca_set_adapter_procfn() call. Drivers that support this are ESDI, IBM
+SCSI, and 3c523. If a device is also a module, make sure that the proc
+function is removed in the module cleanup. This will require storing
+the slot information in a private structure somewhere. See the 3c523
+driver for details.
+
+Your typical proc function will look something like this:
+
+ static int
+ dev_getinfo( char* buf, int slot, void* d ) {
+ struct net_device* dev = (struct net_device*) d;
+ int len = 0;
+
+ len += sprintf( buf+len, "Device: %s\n", dev->name );
+ len += sprintf( buf+len, "IRQ: %d\n", dev->irq );
+ len += sprintf( buf+len, "IO Port: %#lx-%#lx\n", ... );
+ ...
+
+ return len;
+ }
+
+Some of the standard MCA information will already be printed, so don't
+bother repeating it. Don't try putting in more than 3K of information.
+
+Enable this function with:
+ mca_set_adapter_procfn( slot, dev_getinfo, dev );
+
+Disable it with:
+ mca_set_adapter_procfn( slot, NULL, NULL );
+
+It is also recommended that, even if you don't write a proc function, to
+set the name of the adapter (i.e. "PS/2 ESDI Controller") via
+mca_set_adapter_name( int slot, char* name ).
+
+MCA Device Drivers
+==================
+
+Currently, there are a number of MCA-specific device drivers.
+
+1) PS/2 SCSI
+ drivers/scsi/ibmmca.c
+ drivers/scsi/ibmmca.h
+ The driver for the IBM SCSI subsystem. Includes both integrated
+ controllers and adapter cards. May require command-line arg
+ "ibmmcascsi=io_port" to force detection of an adapter. If you have a
+ machine with a front-panel display (i.e. model 95), you can use
+ "ibmmcascsi=display" to enable a drive activity indicator.
+
+2) 3c523
+ drivers/net/3c523.c
+ drivers/net/3c523.h
+ 3Com 3c523 Etherlink/MC ethernet driver.
+
+3) SMC Ultra/MCA and IBM Adapter/A
+ drivers/net/smc-mca.c
+ drivers/net/smc-mca.h
+ Driver for the MCA version of the SMC Ultra and various other
+ OEM'ed and work-alike cards (Elite, Adapter/A, etc).
+
+4) NE/2
+ driver/net/ne2.c
+ driver/net/ne2.h
+ The NE/2 is the MCA version of the NE2000. This may not work
+ with clones that have a different adapter id than the original
+ NE/2.
+
+5) Future Domain MCS-600/700, OEM'd IBM Fast SCSI Adapter/A and
+ Reply Sound Blaster/SCSI (SCSI part)
+ Better support for these cards than the driver for ISA.
+ Supports multiple cards with IRQ sharing.
+
+Also added boot time option of scsi-probe, which can do reordering of
+SCSI host adapters. This will direct the kernel on the order which
+SCSI adapter should be detected. Example:
+ scsi-probe=ibmmca,fd_mcs,adaptec1542,buslogic
+
+The serial drivers were modified to support the extended IO port range
+of the typical MCA system (also #ifdef CONFIG_MCA).
+
+The following devices work with existing drivers:
+1) Token-ring
+2) Future Domain SCSI (MCS-600, MCS-700, not MCS-350, OEM'ed IBM SCSI)
+3) Adaptec 1640 SCSI (using the aha1542 driver)
+4) Bustek/Buslogic SCSI (various)
+5) Probably all Arcnet cards.
+6) Some, possibly all, MCA IDE controllers.
+7) 3Com 3c529 (MCA version of 3c509) (patched)
+
+8) Intel EtherExpressMC (patched version)
+ You need to have CONFIG_MCA defined to have EtherExpressMC support.
+9) Reply Sound Blaster/SCSI (SB part) (patched version)
+
+Bugs & Other Weirdness
+======================
+
+NMIs tend to occur with MCA machines because of various hardware
+weirdness, bus timeouts, and many other non-critical things. Some basic
+code to handle them (inspired by the NetBSD MCA code) has been added to
+detect the guilty device, but it's pretty incomplete. If NMIs are a
+persistent problem (on some model 70 or 80s, they occur every couple
+shell commands), the CONFIG_IGNORE_NMI flag will take care of that.
+
+Various Pentium machines have had serious problems with the FPU test in
+bugs.h. Basically, the machine hangs after the HLT test. This occurs,
+as far as we know, on the Pentium-equipped 85s, 95s, and some PC Servers.
+The PCI/MCA PC 750s are fine as far as I can tell. The ``mca-pentium''
+boot-prompt flag will disable the FPU bug check if this is a problem
+with your machine.
+
+The model 80 has a raft of problems that are just too weird and unique
+to get into here. Some people have no trouble while others have nothing
+but problems. I'd suspect some problems are related to the age of the
+average 80 and accompanying hardware deterioration, although others
+are definitely design problems with the hardware. Among the problems
+include SCSI controller problems, ESDI controller problems, and serious
+screw-ups in the floppy controller. Oh, and the parallel port is also
+pretty flaky. There were about 5 or 6 different model 80 motherboards
+produced to fix various obscure problems. As far as I know, it's pretty
+much impossible to tell which bugs a particular model 80 has (other than
+triggering them, that is).
+
+Drivers are required for some MCA memory adapters. If you're suddenly
+short a few megs of RAM, this might be the reason. The (I think) Enhanced
+Memory Adapter commonly found on the model 70 is one. There's a very
+alpha driver floating around, but it's pretty ugly (disassembled from
+the DOS driver, actually). See the MCA Linux web page (URL below)
+for more current memory info.
+
+The Thinkpad 700 and 720 will work, but various components are either
+non-functional, flaky, or we don't know anything about them. The
+graphics controller is supposed to be some WD, but we can't get things
+working properly. The PCMCIA slots don't seem to work. Ditto for APM.
+The serial ports work, but detection seems to be flaky.
+
+Credits
+=======
+A whole pile of people have contributed to the MCA code. I'd include
+their names here, but I don't have a list handy. Check the MCA Linux
+home page (URL below) for a perpetually out-of-date list.
+
+=====================================================================
+MCA Linux Home Page: http://www.dgmicro.com/mca/
+
+Christophe Beauregard
+chrisb@truespectra.com
+cpbeaure@calum.csclub.uwaterloo.ca
+
+=====================================================================
+Appendix A: Sample /proc/mca
+
+This is from my model 8595. Slot 1 contains the standard IBM SCSI
+adapter, slot 3 is an Adaptec AHA-1640, slot 5 is a XGA-1 video adapter,
+and slot 7 is the 3c523 Etherlink/MC.
+
+/proc/mca/machine:
+Model Id: 0xf8
+Submodel Id: 0x14
+BIOS Revision: 0x5
+
+/proc/mca/pos:
+Slot 1: ff 8e f1 fc a0 ff ff ff IBM SCSI Adapter w/Cache
+Slot 2: ff ff ff ff ff ff ff ff
+Slot 3: 1f 0f 81 3b bf b6 ff ff
+Slot 4: ff ff ff ff ff ff ff ff
+Slot 5: db 8f 1d 5e fd c0 00 00
+Slot 6: ff ff ff ff ff ff ff ff
+Slot 7: 42 60 ff 08 ff ff ff ff 3Com 3c523 Etherlink/MC
+Slot 8: ff ff ff ff ff ff ff ff
+Video : ff ff ff ff ff ff ff ff
+SCSI : ff ff ff ff ff ff ff ff
+
+/proc/mca/slot1:
+Slot: 1
+Adapter Name: IBM SCSI Adapter w/Cache
+Id: 8eff
+Enabled: Yes
+POS: ff 8e f1 fc a0 ff ff ff
+Subsystem PUN: 7
+Detected at boot: Yes
+
+/proc/mca/slot3:
+Slot: 3
+Adapter Name: Unknown
+Id: 0f1f
+Enabled: Yes
+POS: 1f 0f 81 3b bf b6 ff ff
+
+/proc/mca/slot5:
+Slot: 5
+Adapter Name: Unknown
+Id: 8fdb
+Enabled: Yes
+POS: db 8f 1d 5e fd c0 00 00
+
+/proc/mca/slot7:
+Slot: 7
+Adapter Name: 3Com 3c523 Etherlink/MC
+Id: 6042
+Enabled: Yes
+POS: 42 60 ff 08 ff ff ff ff
+Revision: 0xe
+IRQ: 9
+IO Address: 0x3300-0x3308
+Memory: 0xd8000-0xdbfff
+Transceiver: External
+Device: eth0
+Hardware Address: 02 60 8c 45 c4 2a
diff --git a/Documentation/md.txt b/Documentation/md.txt
new file mode 100644
index 0000000..1da9d1b
--- /dev/null
+++ b/Documentation/md.txt
@@ -0,0 +1,486 @@
+Tools that manage md devices can be found at
+ http://www.<country>.kernel.org/pub/linux/utils/raid/....
+
+
+Boot time assembly of RAID arrays
+---------------------------------
+
+You can boot with your md device with the following kernel command
+lines:
+
+for old raid arrays without persistent superblocks:
+ md=<md device no.>,<raid level>,<chunk size factor>,<fault level>,dev0,dev1,...,devn
+
+for raid arrays with persistent superblocks
+ md=<md device no.>,dev0,dev1,...,devn
+or, to assemble a partitionable array:
+ md=d<md device no.>,dev0,dev1,...,devn
+
+md device no. = the number of the md device ...
+ 0 means md0,
+ 1 md1,
+ 2 md2,
+ 3 md3,
+ 4 md4
+
+raid level = -1 linear mode
+ 0 striped mode
+ other modes are only supported with persistent super blocks
+
+chunk size factor = (raid-0 and raid-1 only)
+ Set the chunk size as 4k << n.
+
+fault level = totally ignored
+
+dev0-devn: e.g. /dev/hda1,/dev/hdc1,/dev/sda1,/dev/sdb1
+
+A possible loadlin line (Harald Hoyer <HarryH@Royal.Net>) looks like this:
+
+e:\loadlin\loadlin e:\zimage root=/dev/md0 md=0,0,4,0,/dev/hdb2,/dev/hdc3 ro
+
+
+Boot time autodetection of RAID arrays
+--------------------------------------
+
+When md is compiled into the kernel (not as module), partitions of
+type 0xfd are scanned and automatically assembled into RAID arrays.
+This autodetection may be suppressed with the kernel parameter
+"raid=noautodetect". As of kernel 2.6.9, only drives with a type 0
+superblock can be autodetected and run at boot time.
+
+The kernel parameter "raid=partitionable" (or "raid=part") means
+that all auto-detected arrays are assembled as partitionable.
+
+Boot time assembly of degraded/dirty arrays
+-------------------------------------------
+
+If a raid5 or raid6 array is both dirty and degraded, it could have
+undetectable data corruption. This is because the fact that it is
+'dirty' means that the parity cannot be trusted, and the fact that it
+is degraded means that some datablocks are missing and cannot reliably
+be reconstructed (due to no parity).
+
+For this reason, md will normally refuse to start such an array. This
+requires the sysadmin to take action to explicitly start the array
+despite possible corruption. This is normally done with
+ mdadm --assemble --force ....
+
+This option is not really available if the array has the root
+filesystem on it. In order to support this booting from such an
+array, md supports a module parameter "start_dirty_degraded" which,
+when set to 1, bypassed the checks and will allows dirty degraded
+arrays to be started.
+
+So, to boot with a root filesystem of a dirty degraded raid[56], use
+
+ md-mod.start_dirty_degraded=1
+
+
+Superblock formats
+------------------
+
+The md driver can support a variety of different superblock formats.
+Currently, it supports superblock formats "0.90.0" and the "md-1" format
+introduced in the 2.5 development series.
+
+The kernel will autodetect which format superblock is being used.
+
+Superblock format '0' is treated differently to others for legacy
+reasons - it is the original superblock format.
+
+
+General Rules - apply for all superblock formats
+------------------------------------------------
+
+An array is 'created' by writing appropriate superblocks to all
+devices.
+
+It is 'assembled' by associating each of these devices with an
+particular md virtual device. Once it is completely assembled, it can
+be accessed.
+
+An array should be created by a user-space tool. This will write
+superblocks to all devices. It will usually mark the array as
+'unclean', or with some devices missing so that the kernel md driver
+can create appropriate redundancy (copying in raid1, parity
+calculation in raid4/5).
+
+When an array is assembled, it is first initialized with the
+SET_ARRAY_INFO ioctl. This contains, in particular, a major and minor
+version number. The major version number selects which superblock
+format is to be used. The minor number might be used to tune handling
+of the format, such as suggesting where on each device to look for the
+superblock.
+
+Then each device is added using the ADD_NEW_DISK ioctl. This
+provides, in particular, a major and minor number identifying the
+device to add.
+
+The array is started with the RUN_ARRAY ioctl.
+
+Once started, new devices can be added. They should have an
+appropriate superblock written to them, and then passed be in with
+ADD_NEW_DISK.
+
+Devices that have failed or are not yet active can be detached from an
+array using HOT_REMOVE_DISK.
+
+
+Specific Rules that apply to format-0 super block arrays, and
+ arrays with no superblock (non-persistent).
+-------------------------------------------------------------
+
+An array can be 'created' by describing the array (level, chunksize
+etc) in a SET_ARRAY_INFO ioctl. This must has major_version==0 and
+raid_disks != 0.
+
+Then uninitialized devices can be added with ADD_NEW_DISK. The
+structure passed to ADD_NEW_DISK must specify the state of the device
+and it's role in the array.
+
+Once started with RUN_ARRAY, uninitialized spares can be added with
+HOT_ADD_DISK.
+
+
+
+MD devices in sysfs
+-------------------
+md devices appear in sysfs (/sys) as regular block devices,
+e.g.
+ /sys/block/md0
+
+Each 'md' device will contain a subdirectory called 'md' which
+contains further md-specific information about the device.
+
+All md devices contain:
+ level
+ a text file indicating the 'raid level'. e.g. raid0, raid1,
+ raid5, linear, multipath, faulty.
+ If no raid level has been set yet (array is still being
+ assembled), the value will reflect whatever has been written
+ to it, which may be a name like the above, or may be a number
+ such as '0', '5', etc.
+
+ raid_disks
+ a text file with a simple number indicating the number of devices
+ in a fully functional array. If this is not yet known, the file
+ will be empty. If an array is being resized (not currently
+ possible) this will contain the larger of the old and new sizes.
+ Some raid level (RAID1) allow this value to be set while the
+ array is active. This will reconfigure the array. Otherwise
+ it can only be set while assembling an array.
+
+ chunk_size
+ This is the size if bytes for 'chunks' and is only relevant to
+ raid levels that involve striping (1,4,5,6,10). The address space
+ of the array is conceptually divided into chunks and consecutive
+ chunks are striped onto neighbouring devices.
+ The size should be at least PAGE_SIZE (4k) and should be a power
+ of 2. This can only be set while assembling an array
+
+ layout
+ The "layout" for the array for the particular level. This is
+ simply a number that is interpretted differently by different
+ levels. It can be written while assembling an array.
+
+ reshape_position
+ This is either "none" or a sector number within the devices of
+ the array where "reshape" is up to. If this is set, the three
+ attributes mentioned above (raid_disks, chunk_size, layout) can
+ potentially have 2 values, an old and a new value. If these
+ values differ, reading the attribute returns
+ new (old)
+ and writing will effect the 'new' value, leaving the 'old'
+ unchanged.
+
+ component_size
+ For arrays with data redundancy (i.e. not raid0, linear, faulty,
+ multipath), all components must be the same size - or at least
+ there must a size that they all provide space for. This is a key
+ part or the geometry of the array. It is measured in sectors
+ and can be read from here. Writing to this value may resize
+ the array if the personality supports it (raid1, raid5, raid6),
+ and if the component drives are large enough.
+
+ metadata_version
+ This indicates the format that is being used to record metadata
+ about the array. It can be 0.90 (traditional format), 1.0, 1.1,
+ 1.2 (newer format in varying locations) or "none" indicating that
+ the kernel isn't managing metadata at all.
+
+ resync_start
+ The point at which resync should start. If no resync is needed,
+ this will be a very large number. At array creation it will
+ default to 0, though starting the array as 'clean' will
+ set it much larger.
+
+ new_dev
+ This file can be written but not read. The value written should
+ be a block device number as major:minor. e.g. 8:0
+ This will cause that device to be attached to the array, if it is
+ available. It will then appear at md/dev-XXX (depending on the
+ name of the device) and further configuration is then possible.
+
+ safe_mode_delay
+ When an md array has seen no write requests for a certain period
+ of time, it will be marked as 'clean'. When another write
+ request arrives, the array is marked as 'dirty' before the write
+ commences. This is known as 'safe_mode'.
+ The 'certain period' is controlled by this file which stores the
+ period as a number of seconds. The default is 200msec (0.200).
+ Writing a value of 0 disables safemode.
+
+ array_state
+ This file contains a single word which describes the current
+ state of the array. In many cases, the state can be set by
+ writing the word for the desired state, however some states
+ cannot be explicitly set, and some transitions are not allowed.
+
+ Select/poll works on this file. All changes except between
+ active_idle and active (which can be frequent and are not
+ very interesting) are notified. active->active_idle is
+ reported if the metadata is externally managed.
+
+ clear
+ No devices, no size, no level
+ Writing is equivalent to STOP_ARRAY ioctl
+ inactive
+ May have some settings, but array is not active
+ all IO results in error
+ When written, doesn't tear down array, but just stops it
+ suspended (not supported yet)
+ All IO requests will block. The array can be reconfigured.
+ Writing this, if accepted, will block until array is quiessent
+ readonly
+ no resync can happen. no superblocks get written.
+ write requests fail
+ read-auto
+ like readonly, but behaves like 'clean' on a write request.
+
+ clean - no pending writes, but otherwise active.
+ When written to inactive array, starts without resync
+ If a write request arrives then
+ if metadata is known, mark 'dirty' and switch to 'active'.
+ if not known, block and switch to write-pending
+ If written to an active array that has pending writes, then fails.
+ active
+ fully active: IO and resync can be happening.
+ When written to inactive array, starts with resync
+
+ write-pending
+ clean, but writes are blocked waiting for 'active' to be written.
+
+ active-idle
+ like active, but no writes have been seen for a while (safe_mode_delay).
+
+
+As component devices are added to an md array, they appear in the 'md'
+directory as new directories named
+ dev-XXX
+where XXX is a name that the kernel knows for the device, e.g. hdb1.
+Each directory contains:
+
+ block
+ a symlink to the block device in /sys/block, e.g.
+ /sys/block/md0/md/dev-hdb1/block -> ../../../../block/hdb/hdb1
+
+ super
+ A file containing an image of the superblock read from, or
+ written to, that device.
+
+ state
+ A file recording the current state of the device in the array
+ which can be a comma separated list of
+ faulty - device has been kicked from active use due to
+ a detected fault
+ in_sync - device is a fully in-sync member of the array
+ writemostly - device will only be subject to read
+ requests if there are no other options.
+ This applies only to raid1 arrays.
+ blocked - device has failed, metadata is "external",
+ and the failure hasn't been acknowledged yet.
+ Writes that would write to this device if
+ it were not faulty are blocked.
+ spare - device is working, but not a full member.
+ This includes spares that are in the process
+ of being recovered to
+ This list may grow in future.
+ This can be written to.
+ Writing "faulty" simulates a failure on the device.
+ Writing "remove" removes the device from the array.
+ Writing "writemostly" sets the writemostly flag.
+ Writing "-writemostly" clears the writemostly flag.
+ Writing "blocked" sets the "blocked" flag.
+ Writing "-blocked" clear the "blocked" flag and allows writes
+ to complete.
+
+ This file responds to select/poll. Any change to 'faulty'
+ or 'blocked' causes an event.
+
+ errors
+ An approximate count of read errors that have been detected on
+ this device but have not caused the device to be evicted from
+ the array (either because they were corrected or because they
+ happened while the array was read-only). When using version-1
+ metadata, this value persists across restarts of the array.
+
+ This value can be written while assembling an array thus
+ providing an ongoing count for arrays with metadata managed by
+ userspace.
+
+ slot
+ This gives the role that the device has in the array. It will
+ either be 'none' if the device is not active in the array
+ (i.e. is a spare or has failed) or an integer less than the
+ 'raid_disks' number for the array indicating which position
+ it currently fills. This can only be set while assembling an
+ array. A device for which this is set is assumed to be working.
+
+ offset
+ This gives the location in the device (in sectors from the
+ start) where data from the array will be stored. Any part of
+ the device before this offset us not touched, unless it is
+ used for storing metadata (Formats 1.1 and 1.2).
+
+ size
+ The amount of the device, after the offset, that can be used
+ for storage of data. This will normally be the same as the
+ component_size. This can be written while assembling an
+ array. If a value less than the current component_size is
+ written, it will be rejected.
+
+
+An active md device will also contain and entry for each active device
+in the array. These are named
+
+ rdNN
+
+where 'NN' is the position in the array, starting from 0.
+So for a 3 drive array there will be rd0, rd1, rd2.
+These are symbolic links to the appropriate 'dev-XXX' entry.
+Thus, for example,
+ cat /sys/block/md*/md/rd*/state
+will show 'in_sync' on every line.
+
+
+
+Active md devices for levels that support data redundancy (1,4,5,6)
+also have
+
+ sync_action
+ a text file that can be used to monitor and control the rebuild
+ process. It contains one word which can be one of:
+ resync - redundancy is being recalculated after unclean
+ shutdown or creation
+ recover - a hot spare is being built to replace a
+ failed/missing device
+ idle - nothing is happening
+ check - A full check of redundancy was requested and is
+ happening. This reads all block and checks
+ them. A repair may also happen for some raid
+ levels.
+ repair - A full check and repair is happening. This is
+ similar to 'resync', but was requested by the
+ user, and the write-intent bitmap is NOT used to
+ optimise the process.
+
+ This file is writable, and each of the strings that could be
+ read are meaningful for writing.
+
+ 'idle' will stop an active resync/recovery etc. There is no
+ guarantee that another resync/recovery may not be automatically
+ started again, though some event will be needed to trigger
+ this.
+ 'resync' or 'recovery' can be used to restart the
+ corresponding operation if it was stopped with 'idle'.
+ 'check' and 'repair' will start the appropriate process
+ providing the current state is 'idle'.
+
+ This file responds to select/poll. Any important change in the value
+ triggers a poll event. Sometimes the value will briefly be
+ "recover" if a recovery seems to be needed, but cannot be
+ achieved. In that case, the transition to "recover" isn't
+ notified, but the transition away is.
+
+ degraded
+ This contains a count of the number of devices by which the
+ arrays is degraded. So an optimal array with show '0'. A
+ single failed/missing drive will show '1', etc.
+ This file responds to select/poll, any increase or decrease
+ in the count of missing devices will trigger an event.
+
+ mismatch_count
+ When performing 'check' and 'repair', and possibly when
+ performing 'resync', md will count the number of errors that are
+ found. The count in 'mismatch_cnt' is the number of sectors
+ that were re-written, or (for 'check') would have been
+ re-written. As most raid levels work in units of pages rather
+ than sectors, this my be larger than the number of actual errors
+ by a factor of the number of sectors in a page.
+
+ bitmap_set_bits
+ If the array has a write-intent bitmap, then writing to this
+ attribute can set bits in the bitmap, indicating that a resync
+ would need to check the corresponding blocks. Either individual
+ numbers or start-end pairs can be written. Multiple numbers
+ can be separated by a space.
+ Note that the numbers are 'bit' numbers, not 'block' numbers.
+ They should be scaled by the bitmap_chunksize.
+
+ sync_speed_min
+ sync_speed_max
+ This are similar to /proc/sys/dev/raid/speed_limit_{min,max}
+ however they only apply to the particular array.
+ If no value has been written to these, of if the word 'system'
+ is written, then the system-wide value is used. If a value,
+ in kibibytes-per-second is written, then it is used.
+ When the files are read, they show the currently active value
+ followed by "(local)" or "(system)" depending on whether it is
+ a locally set or system-wide value.
+
+ sync_completed
+ This shows the number of sectors that have been completed of
+ whatever the current sync_action is, followed by the number of
+ sectors in total that could need to be processed. The two
+ numbers are separated by a '/' thus effectively showing one
+ value, a fraction of the process that is complete.
+ A 'select' on this attribute will return when resync completes,
+ when it reaches the current sync_max (below) and possibly at
+ other times.
+
+ sync_max
+ This is a number of sectors at which point a resync/recovery
+ process will pause. When a resync is active, the value can
+ only ever be increased, never decreased. The value of 'max'
+ effectively disables the limit.
+
+
+ sync_speed
+ This shows the current actual speed, in K/sec, of the current
+ sync_action. It is averaged over the last 30 seconds.
+
+ suspend_lo
+ suspend_hi
+ The two values, given as numbers of sectors, indicate a range
+ within the array where IO will be blocked. This is currently
+ only supported for raid4/5/6.
+
+
+Each active md device may also have attributes specific to the
+personality module that manages it.
+These are specific to the implementation of the module and could
+change substantially if the implementation changes.
+
+These currently include
+
+ stripe_cache_size (currently raid5 only)
+ number of entries in the stripe cache. This is writable, but
+ there are upper and lower limits (32768, 16). Default is 128.
+ strip_cache_active (currently raid5 only)
+ number of active entries in the stripe cache
+ preread_bypass_threshold (currently raid5 only)
+ number of times a stripe requiring preread will be bypassed by
+ a stripe that does not require preread. For fairness defaults
+ to 1. Setting this to 0 disables bypass accounting and
+ requires preread stripes to wait until all full-width stripe-
+ writes are complete. Valid values are 0 to stripe_cache_size.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
new file mode 100644
index 0000000..f5b7127
--- /dev/null
+++ b/Documentation/memory-barriers.txt
@@ -0,0 +1,2156 @@
+ ============================
+ LINUX KERNEL MEMORY BARRIERS
+ ============================
+
+By: David Howells <dhowells@redhat.com>
+
+Contents:
+
+ (*) Abstract memory access model.
+
+ - Device operations.
+ - Guarantees.
+
+ (*) What are memory barriers?
+
+ - Varieties of memory barrier.
+ - What may not be assumed about memory barriers?
+ - Data dependency barriers.
+ - Control dependencies.
+ - SMP barrier pairing.
+ - Examples of memory barrier sequences.
+ - Read memory barriers vs load speculation.
+
+ (*) Explicit kernel barriers.
+
+ - Compiler barrier.
+ - CPU memory barriers.
+ - MMIO write barrier.
+
+ (*) Implicit kernel memory barriers.
+
+ - Locking functions.
+ - Interrupt disabling functions.
+ - Miscellaneous functions.
+
+ (*) Inter-CPU locking barrier effects.
+
+ - Locks vs memory accesses.
+ - Locks vs I/O accesses.
+
+ (*) Where are memory barriers needed?
+
+ - Interprocessor interaction.
+ - Atomic operations.
+ - Accessing devices.
+ - Interrupts.
+
+ (*) Kernel I/O barrier effects.
+
+ (*) Assumed minimum execution ordering model.
+
+ (*) The effects of the cpu cache.
+
+ - Cache coherency.
+ - Cache coherency vs DMA.
+ - Cache coherency vs MMIO.
+
+ (*) The things CPUs get up to.
+
+ - And then there's the Alpha.
+
+ (*) References.
+
+
+============================
+ABSTRACT MEMORY ACCESS MODEL
+============================
+
+Consider the following abstract model of the system:
+
+ : :
+ : :
+ : :
+ +-------+ : +--------+ : +-------+
+ | | : | | : | |
+ | | : | | : | |
+ | CPU 1 |<----->| Memory |<----->| CPU 2 |
+ | | : | | : | |
+ | | : | | : | |
+ +-------+ : +--------+ : +-------+
+ ^ : ^ : ^
+ | : | : |
+ | : | : |
+ | : v : |
+ | : +--------+ : |
+ | : | | : |
+ | : | | : |
+ +---------->| Device |<----------+
+ : | | :
+ : | | :
+ : +--------+ :
+ : :
+
+Each CPU executes a program that generates memory access operations. In the
+abstract CPU, memory operation ordering is very relaxed, and a CPU may actually
+perform the memory operations in any order it likes, provided program causality
+appears to be maintained. Similarly, the compiler may also arrange the
+instructions it emits in any order it likes, provided it doesn't affect the
+apparent operation of the program.
+
+So in the above diagram, the effects of the memory operations performed by a
+CPU are perceived by the rest of the system as the operations cross the
+interface between the CPU and rest of the system (the dotted lines).
+
+
+For example, consider the following sequence of events:
+
+ CPU 1 CPU 2
+ =============== ===============
+ { A == 1; B == 2 }
+ A = 3; x = A;
+ B = 4; y = B;
+
+The set of accesses as seen by the memory system in the middle can be arranged
+in 24 different combinations:
+
+ STORE A=3, STORE B=4, x=LOAD A->3, y=LOAD B->4
+ STORE A=3, STORE B=4, y=LOAD B->4, x=LOAD A->3
+ STORE A=3, x=LOAD A->3, STORE B=4, y=LOAD B->4
+ STORE A=3, x=LOAD A->3, y=LOAD B->2, STORE B=4
+ STORE A=3, y=LOAD B->2, STORE B=4, x=LOAD A->3
+ STORE A=3, y=LOAD B->2, x=LOAD A->3, STORE B=4
+ STORE B=4, STORE A=3, x=LOAD A->3, y=LOAD B->4
+ STORE B=4, ...
+ ...
+
+and can thus result in four different combinations of values:
+
+ x == 1, y == 2
+ x == 1, y == 4
+ x == 3, y == 2
+ x == 3, y == 4
+
+
+Furthermore, the stores committed by a CPU to the memory system may not be
+perceived by the loads made by another CPU in the same order as the stores were
+committed.
+
+
+As a further example, consider this sequence of events:
+
+ CPU 1 CPU 2
+ =============== ===============
+ { A == 1, B == 2, C = 3, P == &A, Q == &C }
+ B = 4; Q = P;
+ P = &B D = *Q;
+
+There is an obvious data dependency here, as the value loaded into D depends on
+the address retrieved from P by CPU 2. At the end of the sequence, any of the
+following results are possible:
+
+ (Q == &A) and (D == 1)
+ (Q == &B) and (D == 2)
+ (Q == &B) and (D == 4)
+
+Note that CPU 2 will never try and load C into D because the CPU will load P
+into Q before issuing the load of *Q.
+
+
+DEVICE OPERATIONS
+-----------------
+
+Some devices present their control interfaces as collections of memory
+locations, but the order in which the control registers are accessed is very
+important. For instance, imagine an ethernet card with a set of internal
+registers that are accessed through an address port register (A) and a data
+port register (D). To read internal register 5, the following code might then
+be used:
+
+ *A = 5;
+ x = *D;
+
+but this might show up as either of the following two sequences:
+
+ STORE *A = 5, x = LOAD *D
+ x = LOAD *D, STORE *A = 5
+
+the second of which will almost certainly result in a malfunction, since it set
+the address _after_ attempting to read the register.
+
+
+GUARANTEES
+----------
+
+There are some minimal guarantees that may be expected of a CPU:
+
+ (*) On any given CPU, dependent memory accesses will be issued in order, with
+ respect to itself. This means that for:
+
+ Q = P; D = *Q;
+
+ the CPU will issue the following memory operations:
+
+ Q = LOAD P, D = LOAD *Q
+
+ and always in that order.
+
+ (*) Overlapping loads and stores within a particular CPU will appear to be
+ ordered within that CPU. This means that for:
+
+ a = *X; *X = b;
+
+ the CPU will only issue the following sequence of memory operations:
+
+ a = LOAD *X, STORE *X = b
+
+ And for:
+
+ *X = c; d = *X;
+
+ the CPU will only issue:
+
+ STORE *X = c, d = LOAD *X
+
+ (Loads and stores overlap if they are targeted at overlapping pieces of
+ memory).
+
+And there are a number of things that _must_ or _must_not_ be assumed:
+
+ (*) It _must_not_ be assumed that independent loads and stores will be issued
+ in the order given. This means that for:
+
+ X = *A; Y = *B; *D = Z;
+
+ we may get any of the following sequences:
+
+ X = LOAD *A, Y = LOAD *B, STORE *D = Z
+ X = LOAD *A, STORE *D = Z, Y = LOAD *B
+ Y = LOAD *B, X = LOAD *A, STORE *D = Z
+ Y = LOAD *B, STORE *D = Z, X = LOAD *A
+ STORE *D = Z, X = LOAD *A, Y = LOAD *B
+ STORE *D = Z, Y = LOAD *B, X = LOAD *A
+
+ (*) It _must_ be assumed that overlapping memory accesses may be merged or
+ discarded. This means that for:
+
+ X = *A; Y = *(A + 4);
+
+ we may get any one of the following sequences:
+
+ X = LOAD *A; Y = LOAD *(A + 4);
+ Y = LOAD *(A + 4); X = LOAD *A;
+ {X, Y} = LOAD {*A, *(A + 4) };
+
+ And for:
+
+ *A = X; Y = *A;
+
+ we may get either of:
+
+ STORE *A = X; Y = LOAD *A;
+ STORE *A = Y = X;
+
+
+=========================
+WHAT ARE MEMORY BARRIERS?
+=========================
+
+As can be seen above, independent memory operations are effectively performed
+in random order, but this can be a problem for CPU-CPU interaction and for I/O.
+What is required is some way of intervening to instruct the compiler and the
+CPU to restrict the order.
+
+Memory barriers are such interventions. They impose a perceived partial
+ordering over the memory operations on either side of the barrier.
+
+Such enforcement is important because the CPUs and other devices in a system
+can use a variety of tricks to improve performance, including reordering,
+deferral and combination of memory operations; speculative loads; speculative
+branch prediction and various types of caching. Memory barriers are used to
+override or suppress these tricks, allowing the code to sanely control the
+interaction of multiple CPUs and/or devices.
+
+
+VARIETIES OF MEMORY BARRIER
+---------------------------
+
+Memory barriers come in four basic varieties:
+
+ (1) Write (or store) memory barriers.
+
+ A write memory barrier gives a guarantee that all the STORE operations
+ specified before the barrier will appear to happen before all the STORE
+ operations specified after the barrier with respect to the other
+ components of the system.
+
+ A write barrier is a partial ordering on stores only; it is not required
+ to have any effect on loads.
+
+ A CPU can be viewed as committing a sequence of store operations to the
+ memory system as time progresses. All stores before a write barrier will
+ occur in the sequence _before_ all the stores after the write barrier.
+
+ [!] Note that write barriers should normally be paired with read or data
+ dependency barriers; see the "SMP barrier pairing" subsection.
+
+
+ (2) Data dependency barriers.
+
+ A data dependency barrier is a weaker form of read barrier. In the case
+ where two loads are performed such that the second depends on the result
+ of the first (eg: the first load retrieves the address to which the second
+ load will be directed), a data dependency barrier would be required to
+ make sure that the target of the second load is updated before the address
+ obtained by the first load is accessed.
+
+ A data dependency barrier is a partial ordering on interdependent loads
+ only; it is not required to have any effect on stores, independent loads
+ or overlapping loads.
+
+ As mentioned in (1), the other CPUs in the system can be viewed as
+ committing sequences of stores to the memory system that the CPU being
+ considered can then perceive. A data dependency barrier issued by the CPU
+ under consideration guarantees that for any load preceding it, if that
+ load touches one of a sequence of stores from another CPU, then by the
+ time the barrier completes, the effects of all the stores prior to that
+ touched by the load will be perceptible to any loads issued after the data
+ dependency barrier.
+
+ See the "Examples of memory barrier sequences" subsection for diagrams
+ showing the ordering constraints.
+
+ [!] Note that the first load really has to have a _data_ dependency and
+ not a control dependency. If the address for the second load is dependent
+ on the first load, but the dependency is through a conditional rather than
+ actually loading the address itself, then it's a _control_ dependency and
+ a full read barrier or better is required. See the "Control dependencies"
+ subsection for more information.
+
+ [!] Note that data dependency barriers should normally be paired with
+ write barriers; see the "SMP barrier pairing" subsection.
+
+
+ (3) Read (or load) memory barriers.
+
+ A read barrier is a data dependency barrier plus a guarantee that all the
+ LOAD operations specified before the barrier will appear to happen before
+ all the LOAD operations specified after the barrier with respect to the
+ other components of the system.
+
+ A read barrier is a partial ordering on loads only; it is not required to
+ have any effect on stores.
+
+ Read memory barriers imply data dependency barriers, and so can substitute
+ for them.
+
+ [!] Note that read barriers should normally be paired with write barriers;
+ see the "SMP barrier pairing" subsection.
+
+
+ (4) General memory barriers.
+
+ A general memory barrier gives a guarantee that all the LOAD and STORE
+ operations specified before the barrier will appear to happen before all
+ the LOAD and STORE operations specified after the barrier with respect to
+ the other components of the system.
+
+ A general memory barrier is a partial ordering over both loads and stores.
+
+ General memory barriers imply both read and write memory barriers, and so
+ can substitute for either.
+
+
+And a couple of implicit varieties:
+
+ (5) LOCK operations.
+
+ This acts as a one-way permeable barrier. It guarantees that all memory
+ operations after the LOCK operation will appear to happen after the LOCK
+ operation with respect to the other components of the system.
+
+ Memory operations that occur before a LOCK operation may appear to happen
+ after it completes.
+
+ A LOCK operation should almost always be paired with an UNLOCK operation.
+
+
+ (6) UNLOCK operations.
+
+ This also acts as a one-way permeable barrier. It guarantees that all
+ memory operations before the UNLOCK operation will appear to happen before
+ the UNLOCK operation with respect to the other components of the system.
+
+ Memory operations that occur after an UNLOCK operation may appear to
+ happen before it completes.
+
+ LOCK and UNLOCK operations are guaranteed to appear with respect to each
+ other strictly in the order specified.
+
+ The use of LOCK and UNLOCK operations generally precludes the need for
+ other sorts of memory barrier (but note the exceptions mentioned in the
+ subsection "MMIO write barrier").
+
+
+Memory barriers are only required where there's a possibility of interaction
+between two CPUs or between a CPU and a device. If it can be guaranteed that
+there won't be any such interaction in any particular piece of code, then
+memory barriers are unnecessary in that piece of code.
+
+
+Note that these are the _minimum_ guarantees. Different architectures may give
+more substantial guarantees, but they may _not_ be relied upon outside of arch
+specific code.
+
+
+WHAT MAY NOT BE ASSUMED ABOUT MEMORY BARRIERS?
+----------------------------------------------
+
+There are certain things that the Linux kernel memory barriers do not guarantee:
+
+ (*) There is no guarantee that any of the memory accesses specified before a
+ memory barrier will be _complete_ by the completion of a memory barrier
+ instruction; the barrier can be considered to draw a line in that CPU's
+ access queue that accesses of the appropriate type may not cross.
+
+ (*) There is no guarantee that issuing a memory barrier on one CPU will have
+ any direct effect on another CPU or any other hardware in the system. The
+ indirect effect will be the order in which the second CPU sees the effects
+ of the first CPU's accesses occur, but see the next point:
+
+ (*) There is no guarantee that a CPU will see the correct order of effects
+ from a second CPU's accesses, even _if_ the second CPU uses a memory
+ barrier, unless the first CPU _also_ uses a matching memory barrier (see
+ the subsection on "SMP Barrier Pairing").
+
+ (*) There is no guarantee that some intervening piece of off-the-CPU
+ hardware[*] will not reorder the memory accesses. CPU cache coherency
+ mechanisms should propagate the indirect effects of a memory barrier
+ between CPUs, but might not do so in order.
+
+ [*] For information on bus mastering DMA and coherency please read:
+
+ Documentation/PCI/pci.txt
+ Documentation/PCI/PCI-DMA-mapping.txt
+ Documentation/DMA-API.txt
+
+
+DATA DEPENDENCY BARRIERS
+------------------------
+
+The usage requirements of data dependency barriers are a little subtle, and
+it's not always obvious that they're needed. To illustrate, consider the
+following sequence of events:
+
+ CPU 1 CPU 2
+ =============== ===============
+ { A == 1, B == 2, C = 3, P == &A, Q == &C }
+ B = 4;
+ <write barrier>
+ P = &B
+ Q = P;
+ D = *Q;
+
+There's a clear data dependency here, and it would seem that by the end of the
+sequence, Q must be either &A or &B, and that:
+
+ (Q == &A) implies (D == 1)
+ (Q == &B) implies (D == 4)
+
+But! CPU 2's perception of P may be updated _before_ its perception of B, thus
+leading to the following situation:
+
+ (Q == &B) and (D == 2) ????
+
+Whilst this may seem like a failure of coherency or causality maintenance, it
+isn't, and this behaviour can be observed on certain real CPUs (such as the DEC
+Alpha).
+
+To deal with this, a data dependency barrier or better must be inserted
+between the address load and the data load:
+
+ CPU 1 CPU 2
+ =============== ===============
+ { A == 1, B == 2, C = 3, P == &A, Q == &C }
+ B = 4;
+ <write barrier>
+ P = &B
+ Q = P;
+ <data dependency barrier>
+ D = *Q;
+
+This enforces the occurrence of one of the two implications, and prevents the
+third possibility from arising.
+
+[!] Note that this extremely counterintuitive situation arises most easily on
+machines with split caches, so that, for example, one cache bank processes
+even-numbered cache lines and the other bank processes odd-numbered cache
+lines. The pointer P might be stored in an odd-numbered cache line, and the
+variable B might be stored in an even-numbered cache line. Then, if the
+even-numbered bank of the reading CPU's cache is extremely busy while the
+odd-numbered bank is idle, one can see the new value of the pointer P (&B),
+but the old value of the variable B (2).
+
+
+Another example of where data dependency barriers might by required is where a
+number is read from memory and then used to calculate the index for an array
+access:
+
+ CPU 1 CPU 2
+ =============== ===============
+ { M[0] == 1, M[1] == 2, M[3] = 3, P == 0, Q == 3 }
+ M[1] = 4;
+ <write barrier>
+ P = 1
+ Q = P;
+ <data dependency barrier>
+ D = M[Q];
+
+
+The data dependency barrier is very important to the RCU system, for example.
+See rcu_dereference() in include/linux/rcupdate.h. This permits the current
+target of an RCU'd pointer to be replaced with a new modified target, without
+the replacement target appearing to be incompletely initialised.
+
+See also the subsection on "Cache Coherency" for a more thorough example.
+
+
+CONTROL DEPENDENCIES
+--------------------
+
+A control dependency requires a full read memory barrier, not simply a data
+dependency barrier to make it work correctly. Consider the following bit of
+code:
+
+ q = &a;
+ if (p)
+ q = &b;
+ <data dependency barrier>
+ x = *q;
+
+This will not have the desired effect because there is no actual data
+dependency, but rather a control dependency that the CPU may short-circuit by
+attempting to predict the outcome in advance. In such a case what's actually
+required is:
+
+ q = &a;
+ if (p)
+ q = &b;
+ <read barrier>
+ x = *q;
+
+
+SMP BARRIER PAIRING
+-------------------
+
+When dealing with CPU-CPU interactions, certain types of memory barrier should
+always be paired. A lack of appropriate pairing is almost certainly an error.
+
+A write barrier should always be paired with a data dependency barrier or read
+barrier, though a general barrier would also be viable. Similarly a read
+barrier or a data dependency barrier should always be paired with at least an
+write barrier, though, again, a general barrier is viable:
+
+ CPU 1 CPU 2
+ =============== ===============
+ a = 1;
+ <write barrier>
+ b = 2; x = b;
+ <read barrier>
+ y = a;
+
+Or:
+
+ CPU 1 CPU 2
+ =============== ===============================
+ a = 1;
+ <write barrier>
+ b = &a; x = b;
+ <data dependency barrier>
+ y = *x;
+
+Basically, the read barrier always has to be there, even though it can be of
+the "weaker" type.
+
+[!] Note that the stores before the write barrier would normally be expected to
+match the loads after the read barrier or the data dependency barrier, and vice
+versa:
+
+ CPU 1 CPU 2
+ =============== ===============
+ a = 1; }---- --->{ v = c
+ b = 2; } \ / { w = d
+ <write barrier> \ <read barrier>
+ c = 3; } / \ { x = a;
+ d = 4; }---- --->{ y = b;
+
+
+EXAMPLES OF MEMORY BARRIER SEQUENCES
+------------------------------------
+
+Firstly, write barriers act as partial orderings on store operations.
+Consider the following sequence of events:
+
+ CPU 1
+ =======================
+ STORE A = 1
+ STORE B = 2
+ STORE C = 3
+ <write barrier>
+ STORE D = 4
+ STORE E = 5
+
+This sequence of events is committed to the memory coherence system in an order
+that the rest of the system might perceive as the unordered set of { STORE A,
+STORE B, STORE C } all occurring before the unordered set of { STORE D, STORE E
+}:
+
+ +-------+ : :
+ | | +------+
+ | |------>| C=3 | } /\
+ | | : +------+ }----- \ -----> Events perceptible to
+ | | : | A=1 | } \/ the rest of the system
+ | | : +------+ }
+ | CPU 1 | : | B=2 | }
+ | | +------+ }
+ | | wwwwwwwwwwwwwwww } <--- At this point the write barrier
+ | | +------+ } requires all stores prior to the
+ | | : | E=5 | } barrier to be committed before
+ | | : +------+ } further stores may take place
+ | |------>| D=4 | }
+ | | +------+
+ +-------+ : :
+ |
+ | Sequence in which stores are committed to the
+ | memory system by CPU 1
+ V
+
+
+Secondly, data dependency barriers act as partial orderings on data-dependent
+loads. Consider the following sequence of events:
+
+ CPU 1 CPU 2
+ ======================= =======================
+ { B = 7; X = 9; Y = 8; C = &Y }
+ STORE A = 1
+ STORE B = 2
+ <write barrier>
+ STORE C = &B LOAD X
+ STORE D = 4 LOAD C (gets &B)
+ LOAD *C (reads B)
+
+Without intervention, CPU 2 may perceive the events on CPU 1 in some
+effectively random order, despite the write barrier issued by CPU 1:
+
+ +-------+ : : : :
+ | | +------+ +-------+ | Sequence of update
+ | |------>| B=2 |----- --->| Y->8 | | of perception on
+ | | : +------+ \ +-------+ | CPU 2
+ | CPU 1 | : | A=1 | \ --->| C->&Y | V
+ | | +------+ | +-------+
+ | | wwwwwwwwwwwwwwww | : :
+ | | +------+ | : :
+ | | : | C=&B |--- | : : +-------+
+ | | : +------+ \ | +-------+ | |
+ | |------>| D=4 | ----------->| C->&B |------>| |
+ | | +------+ | +-------+ | |
+ +-------+ : : | : : | |
+ | : : | |
+ | : : | CPU 2 |
+ | +-------+ | |
+ Apparently incorrect ---> | | B->7 |------>| |
+ perception of B (!) | +-------+ | |
+ | : : | |
+ | +-------+ | |
+ The load of X holds ---> \ | X->9 |------>| |
+ up the maintenance \ +-------+ | |
+ of coherence of B ----->| B->2 | +-------+
+ +-------+
+ : :
+
+
+In the above example, CPU 2 perceives that B is 7, despite the load of *C
+(which would be B) coming after the LOAD of C.
+
+If, however, a data dependency barrier were to be placed between the load of C
+and the load of *C (ie: B) on CPU 2:
+
+ CPU 1 CPU 2
+ ======================= =======================
+ { B = 7; X = 9; Y = 8; C = &Y }
+ STORE A = 1
+ STORE B = 2
+ <write barrier>
+ STORE C = &B LOAD X
+ STORE D = 4 LOAD C (gets &B)
+ <data dependency barrier>
+ LOAD *C (reads B)
+
+then the following will occur:
+
+ +-------+ : : : :
+ | | +------+ +-------+
+ | |------>| B=2 |----- --->| Y->8 |
+ | | : +------+ \ +-------+
+ | CPU 1 | : | A=1 | \ --->| C->&Y |
+ | | +------+ | +-------+
+ | | wwwwwwwwwwwwwwww | : :
+ | | +------+ | : :
+ | | : | C=&B |--- | : : +-------+
+ | | : +------+ \ | +-------+ | |
+ | |------>| D=4 | ----------->| C->&B |------>| |
+ | | +------+ | +-------+ | |
+ +-------+ : : | : : | |
+ | : : | |
+ | : : | CPU 2 |
+ | +-------+ | |
+ | | X->9 |------>| |
+ | +-------+ | |
+ Makes sure all effects ---> \ ddddddddddddddddd | |
+ prior to the store of C \ +-------+ | |
+ are perceptible to ----->| B->2 |------>| |
+ subsequent loads +-------+ | |
+ : : +-------+
+
+
+And thirdly, a read barrier acts as a partial order on loads. Consider the
+following sequence of events:
+
+ CPU 1 CPU 2
+ ======================= =======================
+ { A = 0, B = 9 }
+ STORE A=1
+ <write barrier>
+ STORE B=2
+ LOAD B
+ LOAD A
+
+Without intervention, CPU 2 may then choose to perceive the events on CPU 1 in
+some effectively random order, despite the write barrier issued by CPU 1:
+
+ +-------+ : : : :
+ | | +------+ +-------+
+ | |------>| A=1 |------ --->| A->0 |
+ | | +------+ \ +-------+
+ | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 |
+ | | +------+ | +-------+
+ | |------>| B=2 |--- | : :
+ | | +------+ \ | : : +-------+
+ +-------+ : : \ | +-------+ | |
+ ---------->| B->2 |------>| |
+ | +-------+ | CPU 2 |
+ | | A->0 |------>| |
+ | +-------+ | |
+ | : : +-------+
+ \ : :
+ \ +-------+
+ ---->| A->1 |
+ +-------+
+ : :
+
+
+If, however, a read barrier were to be placed between the load of B and the
+load of A on CPU 2:
+
+ CPU 1 CPU 2
+ ======================= =======================
+ { A = 0, B = 9 }
+ STORE A=1
+ <write barrier>
+ STORE B=2
+ LOAD B
+ <read barrier>
+ LOAD A
+
+then the partial ordering imposed by CPU 1 will be perceived correctly by CPU
+2:
+
+ +-------+ : : : :
+ | | +------+ +-------+
+ | |------>| A=1 |------ --->| A->0 |
+ | | +------+ \ +-------+
+ | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 |
+ | | +------+ | +-------+
+ | |------>| B=2 |--- | : :
+ | | +------+ \ | : : +-------+
+ +-------+ : : \ | +-------+ | |
+ ---------->| B->2 |------>| |
+ | +-------+ | CPU 2 |
+ | : : | |
+ | : : | |
+ At this point the read ----> \ rrrrrrrrrrrrrrrrr | |
+ barrier causes all effects \ +-------+ | |
+ prior to the storage of B ---->| A->1 |------>| |
+ to be perceptible to CPU 2 +-------+ | |
+ : : +-------+
+
+
+To illustrate this more completely, consider what could happen if the code
+contained a load of A either side of the read barrier:
+
+ CPU 1 CPU 2
+ ======================= =======================
+ { A = 0, B = 9 }
+ STORE A=1
+ <write barrier>
+ STORE B=2
+ LOAD B
+ LOAD A [first load of A]
+ <read barrier>
+ LOAD A [second load of A]
+
+Even though the two loads of A both occur after the load of B, they may both
+come up with different values:
+
+ +-------+ : : : :
+ | | +------+ +-------+
+ | |------>| A=1 |------ --->| A->0 |
+ | | +------+ \ +-------+
+ | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 |
+ | | +------+ | +-------+
+ | |------>| B=2 |--- | : :
+ | | +------+ \ | : : +-------+
+ +-------+ : : \ | +-------+ | |
+ ---------->| B->2 |------>| |
+ | +-------+ | CPU 2 |
+ | : : | |
+ | : : | |
+ | +-------+ | |
+ | | A->0 |------>| 1st |
+ | +-------+ | |
+ At this point the read ----> \ rrrrrrrrrrrrrrrrr | |
+ barrier causes all effects \ +-------+ | |
+ prior to the storage of B ---->| A->1 |------>| 2nd |
+ to be perceptible to CPU 2 +-------+ | |
+ : : +-------+
+
+
+But it may be that the update to A from CPU 1 becomes perceptible to CPU 2
+before the read barrier completes anyway:
+
+ +-------+ : : : :
+ | | +------+ +-------+
+ | |------>| A=1 |------ --->| A->0 |
+ | | +------+ \ +-------+
+ | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 |
+ | | +------+ | +-------+
+ | |------>| B=2 |--- | : :
+ | | +------+ \ | : : +-------+
+ +-------+ : : \ | +-------+ | |
+ ---------->| B->2 |------>| |
+ | +-------+ | CPU 2 |
+ | : : | |
+ \ : : | |
+ \ +-------+ | |
+ ---->| A->1 |------>| 1st |
+ +-------+ | |
+ rrrrrrrrrrrrrrrrr | |
+ +-------+ | |
+ | A->1 |------>| 2nd |
+ +-------+ | |
+ : : +-------+
+
+
+The guarantee is that the second load will always come up with A == 1 if the
+load of B came up with B == 2. No such guarantee exists for the first load of
+A; that may come up with either A == 0 or A == 1.
+
+
+READ MEMORY BARRIERS VS LOAD SPECULATION
+----------------------------------------
+
+Many CPUs speculate with loads: that is they see that they will need to load an
+item from memory, and they find a time where they're not using the bus for any
+other loads, and so do the load in advance - even though they haven't actually
+got to that point in the instruction execution flow yet. This permits the
+actual load instruction to potentially complete immediately because the CPU
+already has the value to hand.
+
+It may turn out that the CPU didn't actually need the value - perhaps because a
+branch circumvented the load - in which case it can discard the value or just
+cache it for later use.
+
+Consider:
+
+ CPU 1 CPU 2
+ ======================= =======================
+ LOAD B
+ DIVIDE } Divide instructions generally
+ DIVIDE } take a long time to perform
+ LOAD A
+
+Which might appear as this:
+
+ : : +-------+
+ +-------+ | |
+ --->| B->2 |------>| |
+ +-------+ | CPU 2 |
+ : :DIVIDE | |
+ +-------+ | |
+ The CPU being busy doing a ---> --->| A->0 |~~~~ | |
+ division speculates on the +-------+ ~ | |
+ LOAD of A : : ~ | |
+ : :DIVIDE | |
+ : : ~ | |
+ Once the divisions are complete --> : : ~-->| |
+ the CPU can then perform the : : | |
+ LOAD with immediate effect : : +-------+
+
+
+Placing a read barrier or a data dependency barrier just before the second
+load:
+
+ CPU 1 CPU 2
+ ======================= =======================
+ LOAD B
+ DIVIDE
+ DIVIDE
+ <read barrier>
+ LOAD A
+
+will force any value speculatively obtained to be reconsidered to an extent
+dependent on the type of barrier used. If there was no change made to the
+speculated memory location, then the speculated value will just be used:
+
+ : : +-------+
+ +-------+ | |
+ --->| B->2 |------>| |
+ +-------+ | CPU 2 |
+ : :DIVIDE | |
+ +-------+ | |
+ The CPU being busy doing a ---> --->| A->0 |~~~~ | |
+ division speculates on the +-------+ ~ | |
+ LOAD of A : : ~ | |
+ : :DIVIDE | |
+ : : ~ | |
+ : : ~ | |
+ rrrrrrrrrrrrrrrr~ | |
+ : : ~ | |
+ : : ~-->| |
+ : : | |
+ : : +-------+
+
+
+but if there was an update or an invalidation from another CPU pending, then
+the speculation will be cancelled and the value reloaded:
+
+ : : +-------+
+ +-------+ | |
+ --->| B->2 |------>| |
+ +-------+ | CPU 2 |
+ : :DIVIDE | |
+ +-------+ | |
+ The CPU being busy doing a ---> --->| A->0 |~~~~ | |
+ division speculates on the +-------+ ~ | |
+ LOAD of A : : ~ | |
+ : :DIVIDE | |
+ : : ~ | |
+ : : ~ | |
+ rrrrrrrrrrrrrrrrr | |
+ +-------+ | |
+ The speculation is discarded ---> --->| A->1 |------>| |
+ and an updated value is +-------+ | |
+ retrieved : : +-------+
+
+
+========================
+EXPLICIT KERNEL BARRIERS
+========================
+
+The Linux kernel has a variety of different barriers that act at different
+levels:
+
+ (*) Compiler barrier.
+
+ (*) CPU memory barriers.
+
+ (*) MMIO write barrier.
+
+
+COMPILER BARRIER
+----------------
+
+The Linux kernel has an explicit compiler barrier function that prevents the
+compiler from moving the memory accesses either side of it to the other side:
+
+ barrier();
+
+This is a general barrier - lesser varieties of compiler barrier do not exist.
+
+The compiler barrier has no direct effect on the CPU, which may then reorder
+things however it wishes.
+
+
+CPU MEMORY BARRIERS
+-------------------
+
+The Linux kernel has eight basic CPU memory barriers:
+
+ TYPE MANDATORY SMP CONDITIONAL
+ =============== ======================= ===========================
+ GENERAL mb() smp_mb()
+ WRITE wmb() smp_wmb()
+ READ rmb() smp_rmb()
+ DATA DEPENDENCY read_barrier_depends() smp_read_barrier_depends()
+
+
+All memory barriers except the data dependency barriers imply a compiler
+barrier. Data dependencies do not impose any additional compiler ordering.
+
+Aside: In the case of data dependencies, the compiler would be expected to
+issue the loads in the correct order (eg. `a[b]` would have to load the value
+of b before loading a[b]), however there is no guarantee in the C specification
+that the compiler may not speculate the value of b (eg. is equal to 1) and load
+a before b (eg. tmp = a[1]; if (b != 1) tmp = a[b]; ). There is also the
+problem of a compiler reloading b after having loaded a[b], thus having a newer
+copy of b than a[b]. A consensus has not yet been reached about these problems,
+however the ACCESS_ONCE macro is a good place to start looking.
+
+SMP memory barriers are reduced to compiler barriers on uniprocessor compiled
+systems because it is assumed that a CPU will appear to be self-consistent,
+and will order overlapping accesses correctly with respect to itself.
+
+[!] Note that SMP memory barriers _must_ be used to control the ordering of
+references to shared memory on SMP systems, though the use of locking instead
+is sufficient.
+
+Mandatory barriers should not be used to control SMP effects, since mandatory
+barriers unnecessarily impose overhead on UP systems. They may, however, be
+used to control MMIO effects on accesses through relaxed memory I/O windows.
+These are required even on non-SMP systems as they affect the order in which
+memory operations appear to a device by prohibiting both the compiler and the
+CPU from reordering them.
+
+
+There are some more advanced barrier functions:
+
+ (*) set_mb(var, value)
+
+ This assigns the value to the variable and then inserts a full memory
+ barrier after it, depending on the function. It isn't guaranteed to
+ insert anything more than a compiler barrier in a UP compilation.
+
+
+ (*) smp_mb__before_atomic_dec();
+ (*) smp_mb__after_atomic_dec();
+ (*) smp_mb__before_atomic_inc();
+ (*) smp_mb__after_atomic_inc();
+
+ These are for use with atomic add, subtract, increment and decrement
+ functions that don't return a value, especially when used for reference
+ counting. These functions do not imply memory barriers.
+
+ As an example, consider a piece of code that marks an object as being dead
+ and then decrements the object's reference count:
+
+ obj->dead = 1;
+ smp_mb__before_atomic_dec();
+ atomic_dec(&obj->ref_count);
+
+ This makes sure that the death mark on the object is perceived to be set
+ *before* the reference counter is decremented.
+
+ See Documentation/atomic_ops.txt for more information. See the "Atomic
+ operations" subsection for information on where to use these.
+
+
+ (*) smp_mb__before_clear_bit(void);
+ (*) smp_mb__after_clear_bit(void);
+
+ These are for use similar to the atomic inc/dec barriers. These are
+ typically used for bitwise unlocking operations, so care must be taken as
+ there are no implicit memory barriers here either.
+
+ Consider implementing an unlock operation of some nature by clearing a
+ locking bit. The clear_bit() would then need to be barriered like this:
+
+ smp_mb__before_clear_bit();
+ clear_bit( ... );
+
+ This prevents memory operations before the clear leaking to after it. See
+ the subsection on "Locking Functions" with reference to UNLOCK operation
+ implications.
+
+ See Documentation/atomic_ops.txt for more information. See the "Atomic
+ operations" subsection for information on where to use these.
+
+
+MMIO WRITE BARRIER
+------------------
+
+The Linux kernel also has a special barrier for use with memory-mapped I/O
+writes:
+
+ mmiowb();
+
+This is a variation on the mandatory write barrier that causes writes to weakly
+ordered I/O regions to be partially ordered. Its effects may go beyond the
+CPU->Hardware interface and actually affect the hardware at some level.
+
+See the subsection "Locks vs I/O accesses" for more information.
+
+
+===============================
+IMPLICIT KERNEL MEMORY BARRIERS
+===============================
+
+Some of the other functions in the linux kernel imply memory barriers, amongst
+which are locking and scheduling functions.
+
+This specification is a _minimum_ guarantee; any particular architecture may
+provide more substantial guarantees, but these may not be relied upon outside
+of arch specific code.
+
+
+LOCKING FUNCTIONS
+-----------------
+
+The Linux kernel has a number of locking constructs:
+
+ (*) spin locks
+ (*) R/W spin locks
+ (*) mutexes
+ (*) semaphores
+ (*) R/W semaphores
+ (*) RCU
+
+In all cases there are variants on "LOCK" operations and "UNLOCK" operations
+for each construct. These operations all imply certain barriers:
+
+ (1) LOCK operation implication:
+
+ Memory operations issued after the LOCK will be completed after the LOCK
+ operation has completed.
+
+ Memory operations issued before the LOCK may be completed after the LOCK
+ operation has completed.
+
+ (2) UNLOCK operation implication:
+
+ Memory operations issued before the UNLOCK will be completed before the
+ UNLOCK operation has completed.
+
+ Memory operations issued after the UNLOCK may be completed before the
+ UNLOCK operation has completed.
+
+ (3) LOCK vs LOCK implication:
+
+ All LOCK operations issued before another LOCK operation will be completed
+ before that LOCK operation.
+
+ (4) LOCK vs UNLOCK implication:
+
+ All LOCK operations issued before an UNLOCK operation will be completed
+ before the UNLOCK operation.
+
+ All UNLOCK operations issued before a LOCK operation will be completed
+ before the LOCK operation.
+
+ (5) Failed conditional LOCK implication:
+
+ Certain variants of the LOCK operation may fail, either due to being
+ unable to get the lock immediately, or due to receiving an unblocked
+ signal whilst asleep waiting for the lock to become available. Failed
+ locks do not imply any sort of barrier.
+
+Therefore, from (1), (2) and (4) an UNLOCK followed by an unconditional LOCK is
+equivalent to a full barrier, but a LOCK followed by an UNLOCK is not.
+
+[!] Note: one of the consequences of LOCKs and UNLOCKs being only one-way
+ barriers is that the effects of instructions outside of a critical section
+ may seep into the inside of the critical section.
+
+A LOCK followed by an UNLOCK may not be assumed to be full memory barrier
+because it is possible for an access preceding the LOCK to happen after the
+LOCK, and an access following the UNLOCK to happen before the UNLOCK, and the
+two accesses can themselves then cross:
+
+ *A = a;
+ LOCK
+ UNLOCK
+ *B = b;
+
+may occur as:
+
+ LOCK, STORE *B, STORE *A, UNLOCK
+
+Locks and semaphores may not provide any guarantee of ordering on UP compiled
+systems, and so cannot be counted on in such a situation to actually achieve
+anything at all - especially with respect to I/O accesses - unless combined
+with interrupt disabling operations.
+
+See also the section on "Inter-CPU locking barrier effects".
+
+
+As an example, consider the following:
+
+ *A = a;
+ *B = b;
+ LOCK
+ *C = c;
+ *D = d;
+ UNLOCK
+ *E = e;
+ *F = f;
+
+The following sequence of events is acceptable:
+
+ LOCK, {*F,*A}, *E, {*C,*D}, *B, UNLOCK
+
+ [+] Note that {*F,*A} indicates a combined access.
+
+But none of the following are:
+
+ {*F,*A}, *B, LOCK, *C, *D, UNLOCK, *E
+ *A, *B, *C, LOCK, *D, UNLOCK, *E, *F
+ *A, *B, LOCK, *C, UNLOCK, *D, *E, *F
+ *B, LOCK, *C, *D, UNLOCK, {*F,*A}, *E
+
+
+
+INTERRUPT DISABLING FUNCTIONS
+-----------------------------
+
+Functions that disable interrupts (LOCK equivalent) and enable interrupts
+(UNLOCK equivalent) will act as compiler barriers only. So if memory or I/O
+barriers are required in such a situation, they must be provided from some
+other means.
+
+
+MISCELLANEOUS FUNCTIONS
+-----------------------
+
+Other functions that imply barriers:
+
+ (*) schedule() and similar imply full memory barriers.
+
+
+=================================
+INTER-CPU LOCKING BARRIER EFFECTS
+=================================
+
+On SMP systems locking primitives give a more substantial form of barrier: one
+that does affect memory access ordering on other CPUs, within the context of
+conflict on any particular lock.
+
+
+LOCKS VS MEMORY ACCESSES
+------------------------
+
+Consider the following: the system has a pair of spinlocks (M) and (Q), and
+three CPUs; then should the following sequence of events occur:
+
+ CPU 1 CPU 2
+ =============================== ===============================
+ *A = a; *E = e;
+ LOCK M LOCK Q
+ *B = b; *F = f;
+ *C = c; *G = g;
+ UNLOCK M UNLOCK Q
+ *D = d; *H = h;
+
+Then there is no guarantee as to what order CPU 3 will see the accesses to *A
+through *H occur in, other than the constraints imposed by the separate locks
+on the separate CPUs. It might, for example, see:
+
+ *E, LOCK M, LOCK Q, *G, *C, *F, *A, *B, UNLOCK Q, *D, *H, UNLOCK M
+
+But it won't see any of:
+
+ *B, *C or *D preceding LOCK M
+ *A, *B or *C following UNLOCK M
+ *F, *G or *H preceding LOCK Q
+ *E, *F or *G following UNLOCK Q
+
+
+However, if the following occurs:
+
+ CPU 1 CPU 2
+ =============================== ===============================
+ *A = a;
+ LOCK M [1]
+ *B = b;
+ *C = c;
+ UNLOCK M [1]
+ *D = d; *E = e;
+ LOCK M [2]
+ *F = f;
+ *G = g;
+ UNLOCK M [2]
+ *H = h;
+
+CPU 3 might see:
+
+ *E, LOCK M [1], *C, *B, *A, UNLOCK M [1],
+ LOCK M [2], *H, *F, *G, UNLOCK M [2], *D
+
+But assuming CPU 1 gets the lock first, CPU 3 won't see any of:
+
+ *B, *C, *D, *F, *G or *H preceding LOCK M [1]
+ *A, *B or *C following UNLOCK M [1]
+ *F, *G or *H preceding LOCK M [2]
+ *A, *B, *C, *E, *F or *G following UNLOCK M [2]
+
+
+LOCKS VS I/O ACCESSES
+---------------------
+
+Under certain circumstances (especially involving NUMA), I/O accesses within
+two spinlocked sections on two different CPUs may be seen as interleaved by the
+PCI bridge, because the PCI bridge does not necessarily participate in the
+cache-coherence protocol, and is therefore incapable of issuing the required
+read memory barriers.
+
+For example:
+
+ CPU 1 CPU 2
+ =============================== ===============================
+ spin_lock(Q)
+ writel(0, ADDR)
+ writel(1, DATA);
+ spin_unlock(Q);
+ spin_lock(Q);
+ writel(4, ADDR);
+ writel(5, DATA);
+ spin_unlock(Q);
+
+may be seen by the PCI bridge as follows:
+
+ STORE *ADDR = 0, STORE *ADDR = 4, STORE *DATA = 1, STORE *DATA = 5
+
+which would probably cause the hardware to malfunction.
+
+
+What is necessary here is to intervene with an mmiowb() before dropping the
+spinlock, for example:
+
+ CPU 1 CPU 2
+ =============================== ===============================
+ spin_lock(Q)
+ writel(0, ADDR)
+ writel(1, DATA);
+ mmiowb();
+ spin_unlock(Q);
+ spin_lock(Q);
+ writel(4, ADDR);
+ writel(5, DATA);
+ mmiowb();
+ spin_unlock(Q);
+
+this will ensure that the two stores issued on CPU 1 appear at the PCI bridge
+before either of the stores issued on CPU 2.
+
+
+Furthermore, following a store by a load from the same device obviates the need
+for the mmiowb(), because the load forces the store to complete before the load
+is performed:
+
+ CPU 1 CPU 2
+ =============================== ===============================
+ spin_lock(Q)
+ writel(0, ADDR)
+ a = readl(DATA);
+ spin_unlock(Q);
+ spin_lock(Q);
+ writel(4, ADDR);
+ b = readl(DATA);
+ spin_unlock(Q);
+
+
+See Documentation/DocBook/deviceiobook.tmpl for more information.
+
+
+=================================
+WHERE ARE MEMORY BARRIERS NEEDED?
+=================================
+
+Under normal operation, memory operation reordering is generally not going to
+be a problem as a single-threaded linear piece of code will still appear to
+work correctly, even if it's in an SMP kernel. There are, however, three
+circumstances in which reordering definitely _could_ be a problem:
+
+ (*) Interprocessor interaction.
+
+ (*) Atomic operations.
+
+ (*) Accessing devices.
+
+ (*) Interrupts.
+
+
+INTERPROCESSOR INTERACTION
+--------------------------
+
+When there's a system with more than one processor, more than one CPU in the
+system may be working on the same data set at the same time. This can cause
+synchronisation problems, and the usual way of dealing with them is to use
+locks. Locks, however, are quite expensive, and so it may be preferable to
+operate without the use of a lock if at all possible. In such a case
+operations that affect both CPUs may have to be carefully ordered to prevent
+a malfunction.
+
+Consider, for example, the R/W semaphore slow path. Here a waiting process is
+queued on the semaphore, by virtue of it having a piece of its stack linked to
+the semaphore's list of waiting processes:
+
+ struct rw_semaphore {
+ ...
+ spinlock_t lock;
+ struct list_head waiters;
+ };
+
+ struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ };
+
+To wake up a particular waiter, the up_read() or up_write() functions have to:
+
+ (1) read the next pointer from this waiter's record to know as to where the
+ next waiter record is;
+
+ (2) read the pointer to the waiter's task structure;
+
+ (3) clear the task pointer to tell the waiter it has been given the semaphore;
+
+ (4) call wake_up_process() on the task; and
+
+ (5) release the reference held on the waiter's task struct.
+
+In other words, it has to perform this sequence of events:
+
+ LOAD waiter->list.next;
+ LOAD waiter->task;
+ STORE waiter->task;
+ CALL wakeup
+ RELEASE task
+
+and if any of these steps occur out of order, then the whole thing may
+malfunction.
+
+Once it has queued itself and dropped the semaphore lock, the waiter does not
+get the lock again; it instead just waits for its task pointer to be cleared
+before proceeding. Since the record is on the waiter's stack, this means that
+if the task pointer is cleared _before_ the next pointer in the list is read,
+another CPU might start processing the waiter and might clobber the waiter's
+stack before the up*() function has a chance to read the next pointer.
+
+Consider then what might happen to the above sequence of events:
+
+ CPU 1 CPU 2
+ =============================== ===============================
+ down_xxx()
+ Queue waiter
+ Sleep
+ up_yyy()
+ LOAD waiter->task;
+ STORE waiter->task;
+ Woken up by other event
+ <preempt>
+ Resume processing
+ down_xxx() returns
+ call foo()
+ foo() clobbers *waiter
+ </preempt>
+ LOAD waiter->list.next;
+ --- OOPS ---
+
+This could be dealt with using the semaphore lock, but then the down_xxx()
+function has to needlessly get the spinlock again after being woken up.
+
+The way to deal with this is to insert a general SMP memory barrier:
+
+ LOAD waiter->list.next;
+ LOAD waiter->task;
+ smp_mb();
+ STORE waiter->task;
+ CALL wakeup
+ RELEASE task
+
+In this case, the barrier makes a guarantee that all memory accesses before the
+barrier will appear to happen before all the memory accesses after the barrier
+with respect to the other CPUs on the system. It does _not_ guarantee that all
+the memory accesses before the barrier will be complete by the time the barrier
+instruction itself is complete.
+
+On a UP system - where this wouldn't be a problem - the smp_mb() is just a
+compiler barrier, thus making sure the compiler emits the instructions in the
+right order without actually intervening in the CPU. Since there's only one
+CPU, that CPU's dependency ordering logic will take care of everything else.
+
+
+ATOMIC OPERATIONS
+-----------------
+
+Whilst they are technically interprocessor interaction considerations, atomic
+operations are noted specially as some of them imply full memory barriers and
+some don't, but they're very heavily relied on as a group throughout the
+kernel.
+
+Any atomic operation that modifies some state in memory and returns information
+about the state (old or new) implies an SMP-conditional general memory barrier
+(smp_mb()) on each side of the actual operation (with the exception of
+explicit lock operations, described later). These include:
+
+ xchg();
+ cmpxchg();
+ atomic_cmpxchg();
+ atomic_inc_return();
+ atomic_dec_return();
+ atomic_add_return();
+ atomic_sub_return();
+ atomic_inc_and_test();
+ atomic_dec_and_test();
+ atomic_sub_and_test();
+ atomic_add_negative();
+ atomic_add_unless(); /* when succeeds (returns 1) */
+ test_and_set_bit();
+ test_and_clear_bit();
+ test_and_change_bit();
+
+These are used for such things as implementing LOCK-class and UNLOCK-class
+operations and adjusting reference counters towards object destruction, and as
+such the implicit memory barrier effects are necessary.
+
+
+The following operations are potential problems as they do _not_ imply memory
+barriers, but might be used for implementing such things as UNLOCK-class
+operations:
+
+ atomic_set();
+ set_bit();
+ clear_bit();
+ change_bit();
+
+With these the appropriate explicit memory barrier should be used if necessary
+(smp_mb__before_clear_bit() for instance).
+
+
+The following also do _not_ imply memory barriers, and so may require explicit
+memory barriers under some circumstances (smp_mb__before_atomic_dec() for
+instance):
+
+ atomic_add();
+ atomic_sub();
+ atomic_inc();
+ atomic_dec();
+
+If they're used for statistics generation, then they probably don't need memory
+barriers, unless there's a coupling between statistical data.
+
+If they're used for reference counting on an object to control its lifetime,
+they probably don't need memory barriers because either the reference count
+will be adjusted inside a locked section, or the caller will already hold
+sufficient references to make the lock, and thus a memory barrier unnecessary.
+
+If they're used for constructing a lock of some description, then they probably
+do need memory barriers as a lock primitive generally has to do things in a
+specific order.
+
+Basically, each usage case has to be carefully considered as to whether memory
+barriers are needed or not.
+
+The following operations are special locking primitives:
+
+ test_and_set_bit_lock();
+ clear_bit_unlock();
+ __clear_bit_unlock();
+
+These implement LOCK-class and UNLOCK-class operations. These should be used in
+preference to other operations when implementing locking primitives, because
+their implementations can be optimised on many architectures.
+
+[!] Note that special memory barrier primitives are available for these
+situations because on some CPUs the atomic instructions used imply full memory
+barriers, and so barrier instructions are superfluous in conjunction with them,
+and in such cases the special barrier primitives will be no-ops.
+
+See Documentation/atomic_ops.txt for more information.
+
+
+ACCESSING DEVICES
+-----------------
+
+Many devices can be memory mapped, and so appear to the CPU as if they're just
+a set of memory locations. To control such a device, the driver usually has to
+make the right memory accesses in exactly the right order.
+
+However, having a clever CPU or a clever compiler creates a potential problem
+in that the carefully sequenced accesses in the driver code won't reach the
+device in the requisite order if the CPU or the compiler thinks it is more
+efficient to reorder, combine or merge accesses - something that would cause
+the device to malfunction.
+
+Inside of the Linux kernel, I/O should be done through the appropriate accessor
+routines - such as inb() or writel() - which know how to make such accesses
+appropriately sequential. Whilst this, for the most part, renders the explicit
+use of memory barriers unnecessary, there are a couple of situations where they
+might be needed:
+
+ (1) On some systems, I/O stores are not strongly ordered across all CPUs, and
+ so for _all_ general drivers locks should be used and mmiowb() must be
+ issued prior to unlocking the critical section.
+
+ (2) If the accessor functions are used to refer to an I/O memory window with
+ relaxed memory access properties, then _mandatory_ memory barriers are
+ required to enforce ordering.
+
+See Documentation/DocBook/deviceiobook.tmpl for more information.
+
+
+INTERRUPTS
+----------
+
+A driver may be interrupted by its own interrupt service routine, and thus the
+two parts of the driver may interfere with each other's attempts to control or
+access the device.
+
+This may be alleviated - at least in part - by disabling local interrupts (a
+form of locking), such that the critical operations are all contained within
+the interrupt-disabled section in the driver. Whilst the driver's interrupt
+routine is executing, the driver's core may not run on the same CPU, and its
+interrupt is not permitted to happen again until the current interrupt has been
+handled, thus the interrupt handler does not need to lock against that.
+
+However, consider a driver that was talking to an ethernet card that sports an
+address register and a data register. If that driver's core talks to the card
+under interrupt-disablement and then the driver's interrupt handler is invoked:
+
+ LOCAL IRQ DISABLE
+ writew(ADDR, 3);
+ writew(DATA, y);
+ LOCAL IRQ ENABLE
+ <interrupt>
+ writew(ADDR, 4);
+ q = readw(DATA);
+ </interrupt>
+
+The store to the data register might happen after the second store to the
+address register if ordering rules are sufficiently relaxed:
+
+ STORE *ADDR = 3, STORE *ADDR = 4, STORE *DATA = y, q = LOAD *DATA
+
+
+If ordering rules are relaxed, it must be assumed that accesses done inside an
+interrupt disabled section may leak outside of it and may interleave with
+accesses performed in an interrupt - and vice versa - unless implicit or
+explicit barriers are used.
+
+Normally this won't be a problem because the I/O accesses done inside such
+sections will include synchronous load operations on strictly ordered I/O
+registers that form implicit I/O barriers. If this isn't sufficient then an
+mmiowb() may need to be used explicitly.
+
+
+A similar situation may occur between an interrupt routine and two routines
+running on separate CPUs that communicate with each other. If such a case is
+likely, then interrupt-disabling locks should be used to guarantee ordering.
+
+
+==========================
+KERNEL I/O BARRIER EFFECTS
+==========================
+
+When accessing I/O memory, drivers should use the appropriate accessor
+functions:
+
+ (*) inX(), outX():
+
+ These are intended to talk to I/O space rather than memory space, but
+ that's primarily a CPU-specific concept. The i386 and x86_64 processors do
+ indeed have special I/O space access cycles and instructions, but many
+ CPUs don't have such a concept.
+
+ The PCI bus, amongst others, defines an I/O space concept which - on such
+ CPUs as i386 and x86_64 - readily maps to the CPU's concept of I/O
+ space. However, it may also be mapped as a virtual I/O space in the CPU's
+ memory map, particularly on those CPUs that don't support alternate I/O
+ spaces.
+
+ Accesses to this space may be fully synchronous (as on i386), but
+ intermediary bridges (such as the PCI host bridge) may not fully honour
+ that.
+
+ They are guaranteed to be fully ordered with respect to each other.
+
+ They are not guaranteed to be fully ordered with respect to other types of
+ memory and I/O operation.
+
+ (*) readX(), writeX():
+
+ Whether these are guaranteed to be fully ordered and uncombined with
+ respect to each other on the issuing CPU depends on the characteristics
+ defined for the memory window through which they're accessing. On later
+ i386 architecture machines, for example, this is controlled by way of the
+ MTRR registers.
+
+ Ordinarily, these will be guaranteed to be fully ordered and uncombined,
+ provided they're not accessing a prefetchable device.
+
+ However, intermediary hardware (such as a PCI bridge) may indulge in
+ deferral if it so wishes; to flush a store, a load from the same location
+ is preferred[*], but a load from the same device or from configuration
+ space should suffice for PCI.
+
+ [*] NOTE! attempting to load from the same location as was written to may
+ cause a malfunction - consider the 16550 Rx/Tx serial registers for
+ example.
+
+ Used with prefetchable I/O memory, an mmiowb() barrier may be required to
+ force stores to be ordered.
+
+ Please refer to the PCI specification for more information on interactions
+ between PCI transactions.
+
+ (*) readX_relaxed()
+
+ These are similar to readX(), but are not guaranteed to be ordered in any
+ way. Be aware that there is no I/O read barrier available.
+
+ (*) ioreadX(), iowriteX()
+
+ These will perform appropriately for the type of access they're actually
+ doing, be it inX()/outX() or readX()/writeX().
+
+
+========================================
+ASSUMED MINIMUM EXECUTION ORDERING MODEL
+========================================
+
+It has to be assumed that the conceptual CPU is weakly-ordered but that it will
+maintain the appearance of program causality with respect to itself. Some CPUs
+(such as i386 or x86_64) are more constrained than others (such as powerpc or
+frv), and so the most relaxed case (namely DEC Alpha) must be assumed outside
+of arch-specific code.
+
+This means that it must be considered that the CPU will execute its instruction
+stream in any order it feels like - or even in parallel - provided that if an
+instruction in the stream depends on an earlier instruction, then that
+earlier instruction must be sufficiently complete[*] before the later
+instruction may proceed; in other words: provided that the appearance of
+causality is maintained.
+
+ [*] Some instructions have more than one effect - such as changing the
+ condition codes, changing registers or changing memory - and different
+ instructions may depend on different effects.
+
+A CPU may also discard any instruction sequence that winds up having no
+ultimate effect. For example, if two adjacent instructions both load an
+immediate value into the same register, the first may be discarded.
+
+
+Similarly, it has to be assumed that compiler might reorder the instruction
+stream in any way it sees fit, again provided the appearance of causality is
+maintained.
+
+
+============================
+THE EFFECTS OF THE CPU CACHE
+============================
+
+The way cached memory operations are perceived across the system is affected to
+a certain extent by the caches that lie between CPUs and memory, and by the
+memory coherence system that maintains the consistency of state in the system.
+
+As far as the way a CPU interacts with another part of the system through the
+caches goes, the memory system has to include the CPU's caches, and memory
+barriers for the most part act at the interface between the CPU and its cache
+(memory barriers logically act on the dotted line in the following diagram):
+
+ <--- CPU ---> : <----------- Memory ----------->
+ :
+ +--------+ +--------+ : +--------+ +-----------+
+ | | | | : | | | | +--------+
+ | CPU | | Memory | : | CPU | | | | |
+ | Core |--->| Access |----->| Cache |<-->| | | |
+ | | | Queue | : | | | |--->| Memory |
+ | | | | : | | | | | |
+ +--------+ +--------+ : +--------+ | | | |
+ : | Cache | +--------+
+ : | Coherency |
+ : | Mechanism | +--------+
+ +--------+ +--------+ : +--------+ | | | |
+ | | | | : | | | | | |
+ | CPU | | Memory | : | CPU | | |--->| Device |
+ | Core |--->| Access |----->| Cache |<-->| | | |
+ | | | Queue | : | | | | | |
+ | | | | : | | | | +--------+
+ +--------+ +--------+ : +--------+ +-----------+
+ :
+ :
+
+Although any particular load or store may not actually appear outside of the
+CPU that issued it since it may have been satisfied within the CPU's own cache,
+it will still appear as if the full memory access had taken place as far as the
+other CPUs are concerned since the cache coherency mechanisms will migrate the
+cacheline over to the accessing CPU and propagate the effects upon conflict.
+
+The CPU core may execute instructions in any order it deems fit, provided the
+expected program causality appears to be maintained. Some of the instructions
+generate load and store operations which then go into the queue of memory
+accesses to be performed. The core may place these in the queue in any order
+it wishes, and continue execution until it is forced to wait for an instruction
+to complete.
+
+What memory barriers are concerned with is controlling the order in which
+accesses cross from the CPU side of things to the memory side of things, and
+the order in which the effects are perceived to happen by the other observers
+in the system.
+
+[!] Memory barriers are _not_ needed within a given CPU, as CPUs always see
+their own loads and stores as if they had happened in program order.
+
+[!] MMIO or other device accesses may bypass the cache system. This depends on
+the properties of the memory window through which devices are accessed and/or
+the use of any special device communication instructions the CPU may have.
+
+
+CACHE COHERENCY
+---------------
+
+Life isn't quite as simple as it may appear above, however: for while the
+caches are expected to be coherent, there's no guarantee that that coherency
+will be ordered. This means that whilst changes made on one CPU will
+eventually become visible on all CPUs, there's no guarantee that they will
+become apparent in the same order on those other CPUs.
+
+
+Consider dealing with a system that has a pair of CPUs (1 & 2), each of which
+has a pair of parallel data caches (CPU 1 has A/B, and CPU 2 has C/D):
+
+ :
+ : +--------+
+ : +---------+ | |
+ +--------+ : +--->| Cache A |<------->| |
+ | | : | +---------+ | |
+ | CPU 1 |<---+ | |
+ | | : | +---------+ | |
+ +--------+ : +--->| Cache B |<------->| |
+ : +---------+ | |
+ : | Memory |
+ : +---------+ | System |
+ +--------+ : +--->| Cache C |<------->| |
+ | | : | +---------+ | |
+ | CPU 2 |<---+ | |
+ | | : | +---------+ | |
+ +--------+ : +--->| Cache D |<------->| |
+ : +---------+ | |
+ : +--------+
+ :
+
+Imagine the system has the following properties:
+
+ (*) an odd-numbered cache line may be in cache A, cache C or it may still be
+ resident in memory;
+
+ (*) an even-numbered cache line may be in cache B, cache D or it may still be
+ resident in memory;
+
+ (*) whilst the CPU core is interrogating one cache, the other cache may be
+ making use of the bus to access the rest of the system - perhaps to
+ displace a dirty cacheline or to do a speculative load;
+
+ (*) each cache has a queue of operations that need to be applied to that cache
+ to maintain coherency with the rest of the system;
+
+ (*) the coherency queue is not flushed by normal loads to lines already
+ present in the cache, even though the contents of the queue may
+ potentially affect those loads.
+
+Imagine, then, that two writes are made on the first CPU, with a write barrier
+between them to guarantee that they will appear to reach that CPU's caches in
+the requisite order:
+
+ CPU 1 CPU 2 COMMENT
+ =============== =============== =======================================
+ u == 0, v == 1 and p == &u, q == &u
+ v = 2;
+ smp_wmb(); Make sure change to v is visible before
+ change to p
+ <A:modify v=2> v is now in cache A exclusively
+ p = &v;
+ <B:modify p=&v> p is now in cache B exclusively
+
+The write memory barrier forces the other CPUs in the system to perceive that
+the local CPU's caches have apparently been updated in the correct order. But
+now imagine that the second CPU wants to read those values:
+
+ CPU 1 CPU 2 COMMENT
+ =============== =============== =======================================
+ ...
+ q = p;
+ x = *q;
+
+The above pair of reads may then fail to happen in the expected order, as the
+cacheline holding p may get updated in one of the second CPU's caches whilst
+the update to the cacheline holding v is delayed in the other of the second
+CPU's caches by some other cache event:
+
+ CPU 1 CPU 2 COMMENT
+ =============== =============== =======================================
+ u == 0, v == 1 and p == &u, q == &u
+ v = 2;
+ smp_wmb();
+ <A:modify v=2> <C:busy>
+ <C:queue v=2>
+ p = &v; q = p;
+ <D:request p>
+ <B:modify p=&v> <D:commit p=&v>
+ <D:read p>
+ x = *q;
+ <C:read *q> Reads from v before v updated in cache
+ <C:unbusy>
+ <C:commit v=2>
+
+Basically, whilst both cachelines will be updated on CPU 2 eventually, there's
+no guarantee that, without intervention, the order of update will be the same
+as that committed on CPU 1.
+
+
+To intervene, we need to interpolate a data dependency barrier or a read
+barrier between the loads. This will force the cache to commit its coherency
+queue before processing any further requests:
+
+ CPU 1 CPU 2 COMMENT
+ =============== =============== =======================================
+ u == 0, v == 1 and p == &u, q == &u
+ v = 2;
+ smp_wmb();
+ <A:modify v=2> <C:busy>
+ <C:queue v=2>
+ p = &v; q = p;
+ <D:request p>
+ <B:modify p=&v> <D:commit p=&v>
+ <D:read p>
+ smp_read_barrier_depends()
+ <C:unbusy>
+ <C:commit v=2>
+ x = *q;
+ <C:read *q> Reads from v after v updated in cache
+
+
+This sort of problem can be encountered on DEC Alpha processors as they have a
+split cache that improves performance by making better use of the data bus.
+Whilst most CPUs do imply a data dependency barrier on the read when a memory
+access depends on a read, not all do, so it may not be relied on.
+
+Other CPUs may also have split caches, but must coordinate between the various
+cachelets for normal memory accesses. The semantics of the Alpha removes the
+need for coordination in the absence of memory barriers.
+
+
+CACHE COHERENCY VS DMA
+----------------------
+
+Not all systems maintain cache coherency with respect to devices doing DMA. In
+such cases, a device attempting DMA may obtain stale data from RAM because
+dirty cache lines may be resident in the caches of various CPUs, and may not
+have been written back to RAM yet. To deal with this, the appropriate part of
+the kernel must flush the overlapping bits of cache on each CPU (and maybe
+invalidate them as well).
+
+In addition, the data DMA'd to RAM by a device may be overwritten by dirty
+cache lines being written back to RAM from a CPU's cache after the device has
+installed its own data, or cache lines present in the CPU's cache may simply
+obscure the fact that RAM has been updated, until at such time as the cacheline
+is discarded from the CPU's cache and reloaded. To deal with this, the
+appropriate part of the kernel must invalidate the overlapping bits of the
+cache on each CPU.
+
+See Documentation/cachetlb.txt for more information on cache management.
+
+
+CACHE COHERENCY VS MMIO
+-----------------------
+
+Memory mapped I/O usually takes place through memory locations that are part of
+a window in the CPU's memory space that has different properties assigned than
+the usual RAM directed window.
+
+Amongst these properties is usually the fact that such accesses bypass the
+caching entirely and go directly to the device buses. This means MMIO accesses
+may, in effect, overtake accesses to cached memory that were emitted earlier.
+A memory barrier isn't sufficient in such a case, but rather the cache must be
+flushed between the cached memory write and the MMIO access if the two are in
+any way dependent.
+
+
+=========================
+THE THINGS CPUS GET UP TO
+=========================
+
+A programmer might take it for granted that the CPU will perform memory
+operations in exactly the order specified, so that if the CPU is, for example,
+given the following piece of code to execute:
+
+ a = *A;
+ *B = b;
+ c = *C;
+ d = *D;
+ *E = e;
+
+they would then expect that the CPU will complete the memory operation for each
+instruction before moving on to the next one, leading to a definite sequence of
+operations as seen by external observers in the system:
+
+ LOAD *A, STORE *B, LOAD *C, LOAD *D, STORE *E.
+
+
+Reality is, of course, much messier. With many CPUs and compilers, the above
+assumption doesn't hold because:
+
+ (*) loads are more likely to need to be completed immediately to permit
+ execution progress, whereas stores can often be deferred without a
+ problem;
+
+ (*) loads may be done speculatively, and the result discarded should it prove
+ to have been unnecessary;
+
+ (*) loads may be done speculatively, leading to the result having been fetched
+ at the wrong time in the expected sequence of events;
+
+ (*) the order of the memory accesses may be rearranged to promote better use
+ of the CPU buses and caches;
+
+ (*) loads and stores may be combined to improve performance when talking to
+ memory or I/O hardware that can do batched accesses of adjacent locations,
+ thus cutting down on transaction setup costs (memory and PCI devices may
+ both be able to do this); and
+
+ (*) the CPU's data cache may affect the ordering, and whilst cache-coherency
+ mechanisms may alleviate this - once the store has actually hit the cache
+ - there's no guarantee that the coherency management will be propagated in
+ order to other CPUs.
+
+So what another CPU, say, might actually observe from the above piece of code
+is:
+
+ LOAD *A, ..., LOAD {*C,*D}, STORE *E, STORE *B
+
+ (Where "LOAD {*C,*D}" is a combined load)
+
+
+However, it is guaranteed that a CPU will be self-consistent: it will see its
+_own_ accesses appear to be correctly ordered, without the need for a memory
+barrier. For instance with the following code:
+
+ U = *A;
+ *A = V;
+ *A = W;
+ X = *A;
+ *A = Y;
+ Z = *A;
+
+and assuming no intervention by an external influence, it can be assumed that
+the final result will appear to be:
+
+ U == the original value of *A
+ X == W
+ Z == Y
+ *A == Y
+
+The code above may cause the CPU to generate the full sequence of memory
+accesses:
+
+ U=LOAD *A, STORE *A=V, STORE *A=W, X=LOAD *A, STORE *A=Y, Z=LOAD *A
+
+in that order, but, without intervention, the sequence may have almost any
+combination of elements combined or discarded, provided the program's view of
+the world remains consistent.
+
+The compiler may also combine, discard or defer elements of the sequence before
+the CPU even sees them.
+
+For instance:
+
+ *A = V;
+ *A = W;
+
+may be reduced to:
+
+ *A = W;
+
+since, without a write barrier, it can be assumed that the effect of the
+storage of V to *A is lost. Similarly:
+
+ *A = Y;
+ Z = *A;
+
+may, without a memory barrier, be reduced to:
+
+ *A = Y;
+ Z = Y;
+
+and the LOAD operation never appear outside of the CPU.
+
+
+AND THEN THERE'S THE ALPHA
+--------------------------
+
+The DEC Alpha CPU is one of the most relaxed CPUs there is. Not only that,
+some versions of the Alpha CPU have a split data cache, permitting them to have
+two semantically-related cache lines updated at separate times. This is where
+the data dependency barrier really becomes necessary as this synchronises both
+caches with the memory coherence system, thus making it seem like pointer
+changes vs new data occur in the right order.
+
+The Alpha defines the Linux kernel's memory barrier model.
+
+See the subsection on "Cache Coherency" above.
+
+
+==========
+REFERENCES
+==========
+
+Alpha AXP Architecture Reference Manual, Second Edition (Sites & Witek,
+Digital Press)
+ Chapter 5.2: Physical Address Space Characteristics
+ Chapter 5.4: Caches and Write Buffers
+ Chapter 5.5: Data Sharing
+ Chapter 5.6: Read/Write Ordering
+
+AMD64 Architecture Programmer's Manual Volume 2: System Programming
+ Chapter 7.1: Memory-Access Ordering
+ Chapter 7.4: Buffering and Combining Memory Writes
+
+IA-32 Intel Architecture Software Developer's Manual, Volume 3:
+System Programming Guide
+ Chapter 7.1: Locked Atomic Operations
+ Chapter 7.2: Memory Ordering
+ Chapter 7.4: Serializing Instructions
+
+The SPARC Architecture Manual, Version 9
+ Chapter 8: Memory Models
+ Appendix D: Formal Specification of the Memory Models
+ Appendix J: Programming with the Memory Models
+
+UltraSPARC Programmer Reference Manual
+ Chapter 5: Memory Accesses and Cacheability
+ Chapter 15: Sparc-V9 Memory Models
+
+UltraSPARC III Cu User's Manual
+ Chapter 9: Memory Models
+
+UltraSPARC IIIi Processor User's Manual
+ Chapter 8: Memory Models
+
+UltraSPARC Architecture 2005
+ Chapter 9: Memory
+ Appendix D: Formal Specifications of the Memory Models
+
+UltraSPARC T1 Supplement to the UltraSPARC Architecture 2005
+ Chapter 8: Memory Models
+ Appendix F: Caches and Cache Coherency
+
+Solaris Internals, Core Kernel Architecture, p63-68:
+ Chapter 3.3: Hardware Considerations for Locks and
+ Synchronization
+
+Unix Systems for Modern Architectures, Symmetric Multiprocessing and Caching
+for Kernel Programmers:
+ Chapter 13: Other Memory Models
+
+Intel Itanium Architecture Software Developer's Manual: Volume 1:
+ Section 2.6: Speculation
+ Section 4.4: Memory Access
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
new file mode 100644
index 0000000..168117b
--- /dev/null
+++ b/Documentation/memory-hotplug.txt
@@ -0,0 +1,374 @@
+==============
+Memory Hotplug
+==============
+
+Created: Jul 28 2007
+Add description of notifier of memory hotplug Oct 11 2007
+
+This document is about memory hotplug including how-to-use and current status.
+Because Memory Hotplug is still under development, contents of this text will
+be changed often.
+
+1. Introduction
+ 1.1 purpose of memory hotplug
+ 1.2. Phases of memory hotplug
+ 1.3. Unit of Memory online/offline operation
+2. Kernel Configuration
+3. sysfs files for memory hotplug
+4. Physical memory hot-add phase
+ 4.1 Hardware(Firmware) Support
+ 4.2 Notify memory hot-add event by hand
+5. Logical Memory hot-add phase
+ 5.1. State of memory
+ 5.2. How to online memory
+6. Logical memory remove
+ 6.1 Memory offline and ZONE_MOVABLE
+ 6.2. How to offline memory
+7. Physical memory remove
+8. Memory hotplug event notifier
+9. Future Work List
+
+Note(1): x86_64's has special implementation for memory hotplug.
+ This text does not describe it.
+Note(2): This text assumes that sysfs is mounted at /sys.
+
+
+---------------
+1. Introduction
+---------------
+
+1.1 purpose of memory hotplug
+------------
+Memory Hotplug allows users to increase/decrease the amount of memory.
+Generally, there are two purposes.
+
+(A) For changing the amount of memory.
+ This is to allow a feature like capacity on demand.
+(B) For installing/removing DIMMs or NUMA-nodes physically.
+ This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc.
+
+(A) is required by highly virtualized environments and (B) is required by
+hardware which supports memory power management.
+
+Linux memory hotplug is designed for both purpose.
+
+
+1.2. Phases of memory hotplug
+---------------
+There are 2 phases in Memory Hotplug.
+ 1) Physical Memory Hotplug phase
+ 2) Logical Memory Hotplug phase.
+
+The First phase is to communicate hardware/firmware and make/erase
+environment for hotplugged memory. Basically, this phase is necessary
+for the purpose (B), but this is good phase for communication between
+highly virtualized environments too.
+
+When memory is hotplugged, the kernel recognizes new memory, makes new memory
+management tables, and makes sysfs files for new memory's operation.
+
+If firmware supports notification of connection of new memory to OS,
+this phase is triggered automatically. ACPI can notify this event. If not,
+"probe" operation by system administration is used instead.
+(see Section 4.).
+
+Logical Memory Hotplug phase is to change memory state into
+avaiable/unavailable for users. Amount of memory from user's view is
+changed by this phase. The kernel makes all memory in it as free pages
+when a memory range is available.
+
+In this document, this phase is described as online/offline.
+
+Logical Memory Hotplug phase is triggred by write of sysfs file by system
+administrator. For the hot-add case, it must be executed after Physical Hotplug
+phase by hand.
+(However, if you writes udev's hotplug scripts for memory hotplug, these
+ phases can be execute in seamless way.)
+
+
+1.3. Unit of Memory online/offline operation
+------------
+Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory
+into chunks of the same size. The chunk is called a "section". The size of
+a section is architecture dependent. For example, power uses 16MiB, ia64 uses
+1GiB. The unit of online/offline operation is "one section". (see Section 3.)
+
+To determine the size of sections, please read this file:
+
+/sys/devices/system/memory/block_size_bytes
+
+This file shows the size of sections in byte.
+
+-----------------------
+2. Kernel Configuration
+-----------------------
+To use memory hotplug feature, kernel must be compiled with following
+config options.
+
+- For all memory hotplug
+ Memory model -> Sparse Memory (CONFIG_SPARSEMEM)
+ Allow for memory hot-add (CONFIG_MEMORY_HOTPLUG)
+
+- To enable memory removal, the followings are also necessary
+ Allow for memory hot remove (CONFIG_MEMORY_HOTREMOVE)
+ Page Migration (CONFIG_MIGRATION)
+
+- For ACPI memory hotplug, the followings are also necessary
+ Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY)
+ This option can be kernel module.
+
+- As a related configuration, if your box has a feature of NUMA-node hotplug
+ via ACPI, then this option is necessary too.
+ ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
+ (CONFIG_ACPI_CONTAINER).
+ This option can be kernel module too.
+
+--------------------------------
+3 sysfs files for memory hotplug
+--------------------------------
+All sections have their device information under /sys/devices/system/memory as
+
+/sys/devices/system/memory/memoryXXX
+(XXX is section id.)
+
+Now, XXX is defined as start_address_of_section / section_size.
+
+For example, assume 1GiB section size. A device for a memory starting at
+0x100000000 is /sys/device/system/memory/memory4
+(0x100000000 / 1Gib = 4)
+This device covers address range [0x100000000 ... 0x140000000)
+
+Under each section, you can see 3 files.
+
+/sys/devices/system/memory/memoryXXX/phys_index
+/sys/devices/system/memory/memoryXXX/phys_device
+/sys/devices/system/memory/memoryXXX/state
+
+'phys_index' : read-only and contains section id, same as XXX.
+'state' : read-write
+ at read: contains online/offline state of memory.
+ at write: user can specify "online", "offline" command
+'phys_device': read-only: designed to show the name of physical memory device.
+ This is not well implemented now.
+
+NOTE:
+ These directories/files appear after physical memory hotplug phase.
+
+
+--------------------------------
+4. Physical memory hot-add phase
+--------------------------------
+
+4.1 Hardware(Firmware) Support
+------------
+On x86_64/ia64 platform, memory hotplug by ACPI is supported.
+
+In general, the firmware (ACPI) which supports memory hotplug defines
+memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80,
+Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev
+script. This will be done automatically.
+
+But scripts for memory hotplug are not contained in generic udev package(now).
+You may have to write it by yourself or online/offline memory by hand.
+Please see "How to online memory", "How to offline memory" in this text.
+
+If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004",
+"PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler
+calls hotplug code for all of objects which are defined in it.
+If memory device is found, memory hotplug code will be called.
+
+
+4.2 Notify memory hot-add event by hand
+------------
+In some environments, especially virtualized environment, firmware will not
+notify memory hotplug event to the kernel. For such environment, "probe"
+interface is supported. This interface depends on CONFIG_ARCH_MEMORY_PROBE.
+
+Now, CONFIG_ARCH_MEMORY_PROBE is supported only by powerpc but it does not
+contain highly architecture codes. Please add config if you need "probe"
+interface.
+
+Probe interface is located at
+/sys/devices/system/memory/probe
+
+You can tell the physical address of new memory to the kernel by
+
+% echo start_address_of_new_memory > /sys/devices/system/memory/probe
+
+Then, [start_address_of_new_memory, start_address_of_new_memory + section_size)
+memory range is hot-added. In this case, hotplug script is not called (in
+current implementation). You'll have to online memory by yourself.
+Please see "How to online memory" in this text.
+
+
+
+------------------------------
+5. Logical Memory hot-add phase
+------------------------------
+
+5.1. State of memory
+------------
+To see (online/offline) state of memory section, read 'state' file.
+
+% cat /sys/device/system/memory/memoryXXX/state
+
+
+If the memory section is online, you'll read "online".
+If the memory section is offline, you'll read "offline".
+
+
+5.2. How to online memory
+------------
+Even if the memory is hot-added, it is not at ready-to-use state.
+For using newly added memory, you have to "online" the memory section.
+
+For onlining, you have to write "online" to the section's state file as:
+
+% echo online > /sys/devices/system/memory/memoryXXX/state
+
+After this, section memoryXXX's state will be 'online' and the amount of
+available memory will be increased.
+
+Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA).
+This may be changed in future.
+
+
+
+------------------------
+6. Logical memory remove
+------------------------
+
+6.1 Memory offline and ZONE_MOVABLE
+------------
+Memory offlining is more complicated than memory online. Because memory offline
+has to make the whole memory section be unused, memory offline can fail if
+the section includes memory which cannot be freed.
+
+In general, memory offline can use 2 techniques.
+
+(1) reclaim and free all memory in the section.
+(2) migrate all pages in the section.
+
+In the current implementation, Linux's memory offline uses method (2), freeing
+all pages in the section by page migration. But not all pages are
+migratable. Under current Linux, migratable pages are anonymous pages and
+page caches. For offlining a section by migration, the kernel has to guarantee
+that the section contains only migratable pages.
+
+Now, a boot option for making a section which consists of migratable pages is
+supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
+create ZONE_MOVABLE...a zone which is just used for movable pages.
+(See also Documentation/kernel-parameters.txt)
+
+Assume the system has "TOTAL" amount of memory at boot time, this boot option
+creates ZONE_MOVABLE as following.
+
+1) When kernelcore=YYYY boot option is used,
+ Size of memory not for movable pages (not for offline) is YYYY.
+ Size of memory for movable pages (for offline) is TOTAL-YYYY.
+
+2) When movablecore=ZZZZ boot option is used,
+ Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ.
+ Size of memory for movable pages (for offline) is ZZZZ.
+
+
+Note) Unfortunately, there is no information to show which section belongs
+to ZONE_MOVABLE. This is TBD.
+
+
+6.2. How to offline memory
+------------
+You can offline a section by using the same sysfs interface that was used in
+memory onlining.
+
+% echo offline > /sys/devices/system/memory/memoryXXX/state
+
+If offline succeeds, the state of the memory section is changed to be "offline".
+If it fails, some error core (like -EBUSY) will be returned by the kernel.
+Even if a section does not belong to ZONE_MOVABLE, you can try to offline it.
+If it doesn't contain 'unmovable' memory, you'll get success.
+
+A section under ZONE_MOVABLE is considered to be able to be offlined easily.
+But under some busy state, it may return -EBUSY. Even if a memory section
+cannot be offlined due to -EBUSY, you can retry offlining it and may be able to
+offline it (or not).
+(For example, a page is referred to by some kernel internal call and released
+ soon.)
+
+Consideration:
+Memory hotplug's design direction is to make the possibility of memory offlining
+higher and to guarantee unplugging memory under any situation. But it needs
+more work. Returning -EBUSY under some situation may be good because the user
+can decide to retry more or not by himself. Currently, memory offlining code
+does some amount of retry with 120 seconds timeout.
+
+-------------------------
+7. Physical memory remove
+-------------------------
+Need more implementation yet....
+ - Notification completion of remove works by OS to firmware.
+ - Guard from remove if not yet.
+
+--------------------------------
+8. Memory hotplug event notifier
+--------------------------------
+Memory hotplug has event notifer. There are 6 types of notification.
+
+MEMORY_GOING_ONLINE
+ Generated before new memory becomes available in order to be able to
+ prepare subsystems to handle memory. The page allocator is still unable
+ to allocate from the new memory.
+
+MEMORY_CANCEL_ONLINE
+ Generated if MEMORY_GOING_ONLINE fails.
+
+MEMORY_ONLINE
+ Generated when memory has succesfully brought online. The callback may
+ allocate pages from the new memory.
+
+MEMORY_GOING_OFFLINE
+ Generated to begin the process of offlining memory. Allocations are no
+ longer possible from the memory but some of the memory to be offlined
+ is still in use. The callback can be used to free memory known to a
+ subsystem from the indicated memory section.
+
+MEMORY_CANCEL_OFFLINE
+ Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
+ the section that we attempted to offline.
+
+MEMORY_OFFLINE
+ Generated after offlining memory is complete.
+
+A callback routine can be registered by
+ hotplug_memory_notifier(callback_func, priority)
+
+The second argument of callback function (action) is event types of above.
+The third argument is passed by pointer of struct memory_notify.
+
+struct memory_notify {
+ unsigned long start_pfn;
+ unsigned long nr_pages;
+ int status_cahnge_nid;
+}
+
+start_pfn is start_pfn of online/offline memory.
+nr_pages is # of pages of online/offline memory.
+status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
+set/clear. It means a new(memoryless) node gets new memory by online and a
+node loses all memory. If this is -1, then nodemask status is not changed.
+If status_changed_nid >= 0, callback should create/discard structures for the
+node if necessary.
+
+--------------
+9. Future Work
+--------------
+ - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
+ sysctl or new control file.
+ - showing memory section and physical device relationship.
+ - showing memory section and node relationship (maybe good for NUMA)
+ - showing memory section is under ZONE_MOVABLE or not
+ - test and make it better memory offlining.
+ - support HugeTLB page migration and offlining.
+ - memmap removing at memory offline.
+ - physical remove memory.
+
diff --git a/Documentation/memory.txt b/Documentation/memory.txt
new file mode 100644
index 0000000..2b3dedd
--- /dev/null
+++ b/Documentation/memory.txt
@@ -0,0 +1,60 @@
+There are several classic problems related to memory on Linux
+systems.
+
+ 1) There are some buggy motherboards which cannot properly
+ deal with the memory above 16MB. Consider exchanging
+ your motherboard.
+
+ 2) You cannot do DMA on the ISA bus to addresses above
+ 16M. Most device drivers under Linux allow the use
+ of bounce buffers which work around this problem. Drivers
+ that don't use bounce buffers will be unstable with
+ more than 16M installed. Drivers that use bounce buffers
+ will be OK, but may have slightly higher overhead.
+
+ 3) There are some motherboards that will not cache above
+ a certain quantity of memory. If you have one of these
+ motherboards, your system will be SLOWER, not faster
+ as you add more memory. Consider exchanging your
+ motherboard.
+
+All of these problems can be addressed with the "mem=XXXM" boot option
+(where XXX is the size of RAM to use in megabytes).
+It can also tell Linux to use less memory than is actually installed.
+If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid
+physical address space collisions.
+
+See the documentation of your boot loader (LILO, loadlin, etc.) about
+how to pass options to the kernel.
+
+There are other memory problems which Linux cannot deal with. Random
+corruption of memory is usually a sign of serious hardware trouble.
+Try:
+
+ * Reducing memory settings in the BIOS to the most conservative
+ timings.
+
+ * Adding a cooling fan.
+
+ * Not overclocking your CPU.
+
+ * Having the memory tested in a memory tester or exchanged
+ with the vendor. Consider testing it with memtest86 yourself.
+
+ * Exchanging your CPU, cache, or motherboard for one that works.
+
+ * Disabling the cache from the BIOS.
+
+ * Try passing the "mem=4M" option to the kernel to limit
+ Linux to using a very small amount of memory. Use "memmap="-option
+ together with "mem=" on systems with PCI to avoid physical address
+ space collisions.
+
+
+Other tricks:
+
+ * Try passing the "no-387" option to the kernel to ignore
+ a buggy FPU.
+
+ * Try passing the "no-hlt" option to disable the potentially
+ buggy HLT instruction in your CPU.
diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX
new file mode 100644
index 0000000..8ae9cff
--- /dev/null
+++ b/Documentation/mips/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+ - this file.
+AU1xxx_IDE.README
+ - README for MIPS AU1XXX IDE driver.
diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README
new file mode 100644
index 0000000..25a6ed1
--- /dev/null
+++ b/Documentation/mips/AU1xxx_IDE.README
@@ -0,0 +1,126 @@
+README for MIPS AU1XXX IDE driver - Released 2005-07-15
+
+ABOUT
+-----
+This file describes the 'drivers/ide/mips/au1xxx-ide.c', related files and the
+services they provide.
+
+If you are short in patience and just want to know how to add your hard disc to
+the white or black list, go to the 'ADD NEW HARD DISC TO WHITE OR BLACK LIST'
+section.
+
+
+LICENSE
+-------
+
+Copyright (c) 2003-2005 AMD, Personal Connectivity Solutions
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 2 of the License, or (at your option) any later
+version.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+675 Mass Ave, Cambridge, MA 02139, USA.
+
+Note: for more information, please refer "AMD Alchemy Au1200/Au1550 IDE
+ Interface and Linux Device Driver" Application Note.
+
+
+FILES, CONFIGS AND COMPATABILITY
+--------------------------------
+
+Two files are introduced:
+
+ a) 'include/asm-mips/mach-au1x00/au1xxx_ide.h'
+ containes : struct _auide_hwif
+ timing parameters for PIO mode 0/1/2/3/4
+ timing parameters for MWDMA 0/1/2
+
+ b) 'drivers/ide/mips/au1xxx-ide.c'
+ contains the functionality of the AU1XXX IDE driver
+
+Four configs variables are introduced:
+
+ CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode
+ CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode
+ CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA
+ controller
+ CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size
+ per descriptor
+
+
+SUPPORTED IDE MODES
+-------------------
+
+The AU1XXX IDE driver supported all PIO modes - PIO mode 0/1/2/3/4 - and all
+MWDMA modes - MWDMA 0/1/2 -. There is no support for SWDMA and UDMA mode.
+
+To change the PIO mode use the program hdparm with option -p, e.g.
+'hdparm -p0 [device]' for PIO mode 0. To enable the MWDMA mode use the option
+-X, e.g. 'hdparm -X32 [device]' for MWDMA mode 0.
+
+
+PERFORMANCE CONFIGURATIONS
+--------------------------
+
+If the used system doesn't need USB support enable the following kernel configs:
+
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_BLK_DEV_GENERIC=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+CONFIG_IDEDMA_PCI_AUTO=y
+CONFIG_BLK_DEV_IDE_AU1XXX=y
+CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA=y
+CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ=128
+CONFIG_BLK_DEV_IDEDMA=y
+CONFIG_IDEDMA_AUTO=y
+
+Also define 'IDE_AU1XXX_BURSTMODE' in 'drivers/ide/mips/au1xxx-ide.c' to enable
+the burst support on DBDMA controller.
+
+If the used system need the USB support enable the following kernel configs for
+high IDE to USB throughput.
+
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_BLK_DEV_GENERIC=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+CONFIG_IDEDMA_PCI_AUTO=y
+CONFIG_BLK_DEV_IDE_AU1XXX=y
+CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA=y
+CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ=128
+CONFIG_BLK_DEV_IDEDMA=y
+CONFIG_IDEDMA_AUTO=y
+
+Also undefine 'IDE_AU1XXX_BURSTMODE' in 'drivers/ide/mips/au1xxx-ide.c' to
+disable the burst support on DBDMA controller.
+
+
+ACKNOWLEDGMENTS
+---------------
+
+These drivers wouldn't have been done without the base of kernel 2.4.x AU1XXX
+IDE driver from AMD.
+
+Additional input also from:
+Matthias Lenk <matthias.lenk@amd.com>
+
+Happy hacking!
+Enrico Walther <enrico.walther@amd.com>
diff --git a/Documentation/mn10300/ABI.txt b/Documentation/mn10300/ABI.txt
new file mode 100644
index 0000000..1fef1f0
--- /dev/null
+++ b/Documentation/mn10300/ABI.txt
@@ -0,0 +1,149 @@
+ =========================
+ MN10300 FUNCTION CALL ABI
+ =========================
+
+=======
+GENERAL
+=======
+
+The MN10300/AM33 kernel runs in little-endian mode; big-endian mode is not
+supported.
+
+The stack grows downwards, and should always be 32-bit aligned. There are
+separate stack pointer registers for userspace and the kernel.
+
+
+================
+ARGUMENT PASSING
+================
+
+The first two arguments (assuming up to 32-bits per argument) to a function are
+passed in the D0 and D1 registers respectively; all other arguments are passed
+on the stack.
+
+If 64-bit arguments are being passed, then they are never split between
+registers and the stack. If the first argument is a 64-bit value, it will be
+passed in D0:D1. If the first argument is not a 64-bit value, but the second
+is, the second will be passed entirely on the stack and D1 will be unused.
+
+Arguments smaller than 32-bits are not coelesced within a register or a stack
+word. For example, two byte-sized arguments will always be passed in separate
+registers or word-sized stack slots.
+
+
+=================
+CALLING FUNCTIONS
+=================
+
+The caller must allocate twelve bytes on the stack for the callee's use before
+it inserts a CALL instruction. The CALL instruction will write into the TOS
+word, but won't actually modify the stack pointer; similarly, the RET
+instruction reads from the TOS word of the stack, but doesn't move the stack
+pointer beyond it.
+
+
+ Stack:
+ | |
+ | |
+ |---------------| SP+20
+ | 4th Arg |
+ |---------------| SP+16
+ | 3rd Arg |
+ |---------------| SP+12
+ | D1 Save Slot |
+ |---------------| SP+8
+ | D0 Save Slot |
+ |---------------| SP+4
+ | Return Addr |
+ |---------------| SP
+ | |
+ | |
+
+
+The caller must leave space on the stack (hence an allocation of twelve bytes)
+in which the callee may store the first two arguments.
+
+
+============
+RETURN VALUE
+============
+
+The return value is passed in D0 for an integer (or D0:D1 for a 64-bit value),
+or A0 for a pointer.
+
+If the return value is a value larger than 64-bits, or is a structure or an
+array, then a hidden first argument will be passed to the callee by the caller:
+this will point to a piece of memory large enough to hold the result of the
+function. In this case, the callee will return the value in that piece of
+memory, and no value will be returned in D0 or A0.
+
+
+===================
+REGISTER CLOBBERING
+===================
+
+The values in certain registers may be clobbered by the callee, and other
+values must be saved:
+
+ Clobber: D0-D1, A0-A1, E0-E3
+ Save: D2-D3, A2-A3, E4-E7, SP
+
+All other non-supervisor-only registers are clobberable (such as MDR, MCRL,
+MCRH).
+
+
+=================
+SPECIAL REGISTERS
+=================
+
+Certain ordinary registers may carry special usage for the compiler:
+
+ A3: Frame pointer
+ E2: TLS pointer
+
+
+==========
+KERNEL ABI
+==========
+
+The kernel may use a slightly different ABI internally.
+
+ (*) E2
+
+ If CONFIG_MN10300_CURRENT_IN_E2 is defined, then the current task pointer
+ will be kept in the E2 register, and that register will be marked
+ unavailable for the compiler to use as a scratch register.
+
+ Normally the kernel uses something like:
+
+ MOV SP,An
+ AND 0xFFFFE000,An
+ MOV (An),Rm // Rm holds current
+ MOV (yyy,Rm) // Access current->yyy
+
+ To find the address of current; but since this option permits current to
+ be carried globally in an register, it can use:
+
+ MOV (yyy,E2) // Access current->yyy
+
+ instead.
+
+
+===============
+SYSTEM CALL ABI
+===============
+
+System calls are called with the following convention:
+
+ REGISTER ENTRY EXIT
+ =============== ======================= =======================
+ D0 Syscall number Return value
+ A0 1st syscall argument Saved
+ D1 2nd syscall argument Saved
+ A3 3rd syscall argument Saved
+ A2 4th syscall argument Saved
+ D3 5th syscall argument Saved
+ D2 6th syscall argument Saved
+
+All other registers are saved. The layout is a consequence of the way the MOVM
+instruction stores registers onto the stack.
diff --git a/Documentation/mn10300/compartmentalisation.txt b/Documentation/mn10300/compartmentalisation.txt
new file mode 100644
index 0000000..8958b51
--- /dev/null
+++ b/Documentation/mn10300/compartmentalisation.txt
@@ -0,0 +1,60 @@
+ =========================================
+ PART-SPECIFIC SOURCE COMPARTMENTALISATION
+ =========================================
+
+The sources for various parts are compartmentalised at two different levels:
+
+ (1) Processor level
+
+ The "processor level" is a CPU core plus the other on-silicon
+ peripherals.
+
+ Processor-specific header files are divided among directories in a similar
+ way to the CPU level:
+
+ (*) include/asm-mn10300/proc-mn103e010/
+
+ Support for the AM33v2 CPU core.
+
+ The appropriate processor is selected by a CONFIG_MN10300_PROC_YYYY option
+ from the "Processor support" choice menu in the arch/mn10300/Kconfig file.
+
+
+ (2) Unit level
+
+ The "unit level" is a processor plus all the external peripherals
+ controlled by that processor.
+
+ Unit-specific header files are divided among directories in a similar way
+ to the CPU level; not only that, but specific sources may also be
+ segregated into separate directories under the arch directory:
+
+ (*) include/asm-mn10300/unit-asb2303/
+ (*) arch/mn10300/unit-asb2303/
+
+ Support for the ASB2303 board with an ASB2308 daughter board.
+
+ (*) include/asm-mn10300/unit-asb2305/
+ (*) arch/mn10300/unit-asb2305/
+
+ Support for the ASB2305 board.
+
+ The appropriate processor is selected by a CONFIG_MN10300_UNIT_ZZZZ option
+ from the "Unit type" choice menu in the arch/mn10300/Kconfig file.
+
+
+============
+COMPILE TIME
+============
+
+When the kernel is compiled, symbolic links will be made in the asm header file
+directory for this arch:
+
+ include/asm-mn10300/proc => include/asm-mn10300/proc-YYYY/
+ include/asm-mn10300/unit => include/asm-mn10300/unit-ZZZZ/
+
+So that the header files contained in those directories can be accessed without
+lots of #ifdef-age.
+
+The appropriate arch/mn10300/unit-ZZZZ directory will also be entered by the
+compilation process; all other unit-specific directories will be ignored.
diff --git a/Documentation/mono.txt b/Documentation/mono.txt
new file mode 100644
index 0000000..e8e1758
--- /dev/null
+++ b/Documentation/mono.txt
@@ -0,0 +1,66 @@
+ Mono(tm) Binary Kernel Support for Linux
+ -----------------------------------------
+
+To configure Linux to automatically execute Mono-based .NET binaries
+(in the form of .exe files) without the need to use the mono CLR
+wrapper, you can use the BINFMT_MISC kernel support.
+
+This will allow you to execute Mono-based .NET binaries just like any
+other program after you have done the following:
+
+1) You MUST FIRST install the Mono CLR support, either by downloading
+ a binary package, a source tarball or by installing from CVS. Binary
+ packages for several distributions can be found at:
+
+ http://go-mono.com/download.html
+
+ Instructions for compiling Mono can be found at:
+
+ http://www.go-mono.com/compiling.html
+
+ Once the Mono CLR support has been installed, just check that
+ /usr/bin/mono (which could be located elsewhere, for example
+ /usr/local/bin/mono) is working.
+
+2) You have to compile BINFMT_MISC either as a module or into
+ the kernel (CONFIG_BINFMT_MISC) and set it up properly.
+ If you choose to compile it as a module, you will have
+ to insert it manually with modprobe/insmod, as kmod
+ cannot be easily supported with binfmt_misc.
+ Read the file 'binfmt_misc.txt' in this directory to know
+ more about the configuration process.
+
+3) Add the following entries to /etc/rc.local or similar script
+ to be run at system startup:
+
+# Insert BINFMT_MISC module into the kernel
+if [ ! -e /proc/sys/fs/binfmt_misc/register ]; then
+ /sbin/modprobe binfmt_misc
+ # Some distributions, like Fedora Core, perform
+ # the following command automatically when the
+ # binfmt_misc module is loaded into the kernel.
+ # Thus, it is possible that the following line
+ # is not needed at all. Look at /etc/modprobe.conf
+ # to check whether this is applicable or not.
+ mount -t binfmt_misc none /proc/sys/fs/binfmt_misc
+fi
+
+# Register support for .NET CLR binaries
+if [ -e /proc/sys/fs/binfmt_misc/register ]; then
+ # Replace /usr/bin/mono with the correct pathname to
+ # the Mono CLR runtime (usually /usr/local/bin/mono
+ # when compiling from sources or CVS).
+ echo ':CLR:M::MZ::/usr/bin/mono:' > /proc/sys/fs/binfmt_misc/register
+else
+ echo "No binfmt_misc support"
+ exit 1
+fi
+
+4) Check that .exe binaries can be ran without the need of a
+ wrapper script, simply by launching the .exe file directly
+ from a command prompt, for example:
+
+ /usr/bin/xsd.exe
+
+ NOTE: If this fails with a permission denied error, check
+ that the .exe file has execute permissions.
diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt
new file mode 100644
index 0000000..bdf93b7
--- /dev/null
+++ b/Documentation/mtd/nand_ecc.txt
@@ -0,0 +1,714 @@
+Introduction
+============
+
+Having looked at the linux mtd/nand driver and more specific at nand_ecc.c
+I felt there was room for optimisation. I bashed the code for a few hours
+performing tricks like table lookup removing superfluous code etc.
+After that the speed was increased by 35-40%.
+Still I was not too happy as I felt there was additional room for improvement.
+
+Bad! I was hooked.
+I decided to annotate my steps in this file. Perhaps it is useful to someone
+or someone learns something from it.
+
+
+The problem
+===========
+
+NAND flash (at least SLC one) typically has sectors of 256 bytes.
+However NAND flash is not extremely reliable so some error detection
+(and sometimes correction) is needed.
+
+This is done by means of a Hamming code. I'll try to explain it in
+laymans terms (and apologies to all the pro's in the field in case I do
+not use the right terminology, my coding theory class was almost 30
+years ago, and I must admit it was not one of my favourites).
+
+As I said before the ecc calculation is performed on sectors of 256
+bytes. This is done by calculating several parity bits over the rows and
+columns. The parity used is even parity which means that the parity bit = 1
+if the data over which the parity is calculated is 1 and the parity bit = 0
+if the data over which the parity is calculated is 0. So the total
+number of bits over the data over which the parity is calculated + the
+parity bit is even. (see wikipedia if you can't follow this).
+Parity is often calculated by means of an exclusive or operation,
+sometimes also referred to as xor. In C the operator for xor is ^
+
+Back to ecc.
+Let's give a small figure:
+
+byte 0: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp4 ... rp14
+byte 1: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp2 rp4 ... rp14
+byte 2: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp4 ... rp14
+byte 3: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp4 ... rp14
+byte 4: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp5 ... rp14
+....
+byte 254: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp5 ... rp15
+byte 255: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp5 ... rp15
+ cp1 cp0 cp1 cp0 cp1 cp0 cp1 cp0
+ cp3 cp3 cp2 cp2 cp3 cp3 cp2 cp2
+ cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4
+
+This figure represents a sector of 256 bytes.
+cp is my abbreviaton for column parity, rp for row parity.
+
+Let's start to explain column parity.
+cp0 is the parity that belongs to all bit0, bit2, bit4, bit6.
+so the sum of all bit0, bit2, bit4 and bit6 values + cp0 itself is even.
+Similarly cp1 is the sum of all bit1, bit3, bit5 and bit7.
+cp2 is the parity over bit0, bit1, bit4 and bit5
+cp3 is the parity over bit2, bit3, bit6 and bit7.
+cp4 is the parity over bit0, bit1, bit2 and bit3.
+cp5 is the parity over bit4, bit5, bit6 and bit7.
+Note that each of cp0 .. cp5 is exactly one bit.
+
+Row parity actually works almost the same.
+rp0 is the parity of all even bytes (0, 2, 4, 6, ... 252, 254)
+rp1 is the parity of all odd bytes (1, 3, 5, 7, ..., 253, 255)
+rp2 is the parity of all bytes 0, 1, 4, 5, 8, 9, ...
+(so handle two bytes, then skip 2 bytes).
+rp3 is covers the half rp2 does not cover (bytes 2, 3, 6, 7, 10, 11, ...)
+for rp4 the rule is cover 4 bytes, skip 4 bytes, cover 4 bytes, skip 4 etc.
+so rp4 calculates parity over bytes 0, 1, 2, 3, 8, 9, 10, 11, 16, ...)
+and rp5 covers the other half, so bytes 4, 5, 6, 7, 12, 13, 14, 15, 20, ..
+The story now becomes quite boring. I guess you get the idea.
+rp6 covers 8 bytes then skips 8 etc
+rp7 skips 8 bytes then covers 8 etc
+rp8 covers 16 bytes then skips 16 etc
+rp9 skips 16 bytes then covers 16 etc
+rp10 covers 32 bytes then skips 32 etc
+rp11 skips 32 bytes then covers 32 etc
+rp12 covers 64 bytes then skips 64 etc
+rp13 skips 64 bytes then covers 64 etc
+rp14 covers 128 bytes then skips 128
+rp15 skips 128 bytes then covers 128
+
+In the end the parity bits are grouped together in three bytes as
+follows:
+ECC Bit 7 Bit 6 Bit 5 Bit 4 Bit 3 Bit 2 Bit 1 Bit 0
+ECC 0 rp07 rp06 rp05 rp04 rp03 rp02 rp01 rp00
+ECC 1 rp15 rp14 rp13 rp12 rp11 rp10 rp09 rp08
+ECC 2 cp5 cp4 cp3 cp2 cp1 cp0 1 1
+
+I detected after writing this that ST application note AN1823
+(http://www.st.com/stonline/books/pdf/docs/10123.pdf) gives a much
+nicer picture.(but they use line parity as term where I use row parity)
+Oh well, I'm graphically challenged, so suffer with me for a moment :-)
+And I could not reuse the ST picture anyway for copyright reasons.
+
+
+Attempt 0
+=========
+
+Implementing the parity calculation is pretty simple.
+In C pseudocode:
+for (i = 0; i < 256; i++)
+{
+ if (i & 0x01)
+ rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1;
+ else
+ rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1;
+ if (i & 0x02)
+ rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3;
+ else
+ rp2 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp2;
+ if (i & 0x04)
+ rp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp5;
+ else
+ rp4 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp4;
+ if (i & 0x08)
+ rp7 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp7;
+ else
+ rp6 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp6;
+ if (i & 0x10)
+ rp9 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp9;
+ else
+ rp8 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp8;
+ if (i & 0x20)
+ rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11;
+ else
+ rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10;
+ if (i & 0x40)
+ rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13;
+ else
+ rp12 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp12;
+ if (i & 0x80)
+ rp15 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp15;
+ else
+ rp14 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp14;
+ cp0 = bit6 ^ bit4 ^ bit2 ^ bit0 ^ cp0;
+ cp1 = bit7 ^ bit5 ^ bit3 ^ bit1 ^ cp1;
+ cp2 = bit5 ^ bit4 ^ bit1 ^ bit0 ^ cp2;
+ cp3 = bit7 ^ bit6 ^ bit3 ^ bit2 ^ cp3
+ cp4 = bit3 ^ bit2 ^ bit1 ^ bit0 ^ cp4
+ cp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ cp5
+}
+
+
+Analysis 0
+==========
+
+C does have bitwise operators but not really operators to do the above
+efficiently (and most hardware has no such instructions either).
+Therefore without implementing this it was clear that the code above was
+not going to bring me a Nobel prize :-)
+
+Fortunately the exclusive or operation is commutative, so we can combine
+the values in any order. So instead of calculating all the bits
+individually, let us try to rearrange things.
+For the column parity this is easy. We can just xor the bytes and in the
+end filter out the relevant bits. This is pretty nice as it will bring
+all cp calculation out of the if loop.
+
+Similarly we can first xor the bytes for the various rows.
+This leads to:
+
+
+Attempt 1
+=========
+
+const char parity[256] = {
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0
+};
+
+void ecc1(const unsigned char *buf, unsigned char *code)
+{
+ int i;
+ const unsigned char *bp = buf;
+ unsigned char cur;
+ unsigned char rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+ unsigned char rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
+ unsigned char par;
+
+ par = 0;
+ rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
+ rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
+ rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
+ rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
+
+ for (i = 0; i < 256; i++)
+ {
+ cur = *bp++;
+ par ^= cur;
+ if (i & 0x01) rp1 ^= cur; else rp0 ^= cur;
+ if (i & 0x02) rp3 ^= cur; else rp2 ^= cur;
+ if (i & 0x04) rp5 ^= cur; else rp4 ^= cur;
+ if (i & 0x08) rp7 ^= cur; else rp6 ^= cur;
+ if (i & 0x10) rp9 ^= cur; else rp8 ^= cur;
+ if (i & 0x20) rp11 ^= cur; else rp10 ^= cur;
+ if (i & 0x40) rp13 ^= cur; else rp12 ^= cur;
+ if (i & 0x80) rp15 ^= cur; else rp14 ^= cur;
+ }
+ code[0] =
+ (parity[rp7] << 7) |
+ (parity[rp6] << 6) |
+ (parity[rp5] << 5) |
+ (parity[rp4] << 4) |
+ (parity[rp3] << 3) |
+ (parity[rp2] << 2) |
+ (parity[rp1] << 1) |
+ (parity[rp0]);
+ code[1] =
+ (parity[rp15] << 7) |
+ (parity[rp14] << 6) |
+ (parity[rp13] << 5) |
+ (parity[rp12] << 4) |
+ (parity[rp11] << 3) |
+ (parity[rp10] << 2) |
+ (parity[rp9] << 1) |
+ (parity[rp8]);
+ code[2] =
+ (parity[par & 0xf0] << 7) |
+ (parity[par & 0x0f] << 6) |
+ (parity[par & 0xcc] << 5) |
+ (parity[par & 0x33] << 4) |
+ (parity[par & 0xaa] << 3) |
+ (parity[par & 0x55] << 2);
+ code[0] = ~code[0];
+ code[1] = ~code[1];
+ code[2] = ~code[2];
+}
+
+Still pretty straightforward. The last three invert statements are there to
+give a checksum of 0xff 0xff 0xff for an empty flash. In an empty flash
+all data is 0xff, so the checksum then matches.
+
+I also introduced the parity lookup. I expected this to be the fastest
+way to calculate the parity, but I will investigate alternatives later
+on.
+
+
+Analysis 1
+==========
+
+The code works, but is not terribly efficient. On my system it took
+almost 4 times as much time as the linux driver code. But hey, if it was
+*that* easy this would have been done long before.
+No pain. no gain.
+
+Fortunately there is plenty of room for improvement.
+
+In step 1 we moved from bit-wise calculation to byte-wise calculation.
+However in C we can also use the unsigned long data type and virtually
+every modern microprocessor supports 32 bit operations, so why not try
+to write our code in such a way that we process data in 32 bit chunks.
+
+Of course this means some modification as the row parity is byte by
+byte. A quick analysis:
+for the column parity we use the par variable. When extending to 32 bits
+we can in the end easily calculate p0 and p1 from it.
+(because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0
+respectively)
+also rp2 and rp3 can be easily retrieved from par as rp3 covers the
+first two bytes and rp2 the last two bytes.
+
+Note that of course now the loop is executed only 64 times (256/4).
+And note that care must taken wrt byte ordering. The way bytes are
+ordered in a long is machine dependent, and might affect us.
+Anyway, if there is an issue: this code is developed on x86 (to be
+precise: a DELL PC with a D920 Intel CPU)
+
+And of course the performance might depend on alignment, but I expect
+that the I/O buffers in the nand driver are aligned properly (and
+otherwise that should be fixed to get maximum performance).
+
+Let's give it a try...
+
+
+Attempt 2
+=========
+
+extern const char parity[256];
+
+void ecc2(const unsigned char *buf, unsigned char *code)
+{
+ int i;
+ const unsigned long *bp = (unsigned long *)buf;
+ unsigned long cur;
+ unsigned long rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+ unsigned long rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
+ unsigned long par;
+
+ par = 0;
+ rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
+ rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
+ rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
+ rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
+
+ for (i = 0; i < 64; i++)
+ {
+ cur = *bp++;
+ par ^= cur;
+ if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
+ if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
+ if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
+ if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
+ if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
+ if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
+ }
+ /*
+ we need to adapt the code generation for the fact that rp vars are now
+ long; also the column parity calculation needs to be changed.
+ we'll bring rp4 to 15 back to single byte entities by shifting and
+ xoring
+ */
+ rp4 ^= (rp4 >> 16); rp4 ^= (rp4 >> 8); rp4 &= 0xff;
+ rp5 ^= (rp5 >> 16); rp5 ^= (rp5 >> 8); rp5 &= 0xff;
+ rp6 ^= (rp6 >> 16); rp6 ^= (rp6 >> 8); rp6 &= 0xff;
+ rp7 ^= (rp7 >> 16); rp7 ^= (rp7 >> 8); rp7 &= 0xff;
+ rp8 ^= (rp8 >> 16); rp8 ^= (rp8 >> 8); rp8 &= 0xff;
+ rp9 ^= (rp9 >> 16); rp9 ^= (rp9 >> 8); rp9 &= 0xff;
+ rp10 ^= (rp10 >> 16); rp10 ^= (rp10 >> 8); rp10 &= 0xff;
+ rp11 ^= (rp11 >> 16); rp11 ^= (rp11 >> 8); rp11 &= 0xff;
+ rp12 ^= (rp12 >> 16); rp12 ^= (rp12 >> 8); rp12 &= 0xff;
+ rp13 ^= (rp13 >> 16); rp13 ^= (rp13 >> 8); rp13 &= 0xff;
+ rp14 ^= (rp14 >> 16); rp14 ^= (rp14 >> 8); rp14 &= 0xff;
+ rp15 ^= (rp15 >> 16); rp15 ^= (rp15 >> 8); rp15 &= 0xff;
+ rp3 = (par >> 16); rp3 ^= (rp3 >> 8); rp3 &= 0xff;
+ rp2 = par & 0xffff; rp2 ^= (rp2 >> 8); rp2 &= 0xff;
+ par ^= (par >> 16);
+ rp1 = (par >> 8); rp1 &= 0xff;
+ rp0 = (par & 0xff);
+ par ^= (par >> 8); par &= 0xff;
+
+ code[0] =
+ (parity[rp7] << 7) |
+ (parity[rp6] << 6) |
+ (parity[rp5] << 5) |
+ (parity[rp4] << 4) |
+ (parity[rp3] << 3) |
+ (parity[rp2] << 2) |
+ (parity[rp1] << 1) |
+ (parity[rp0]);
+ code[1] =
+ (parity[rp15] << 7) |
+ (parity[rp14] << 6) |
+ (parity[rp13] << 5) |
+ (parity[rp12] << 4) |
+ (parity[rp11] << 3) |
+ (parity[rp10] << 2) |
+ (parity[rp9] << 1) |
+ (parity[rp8]);
+ code[2] =
+ (parity[par & 0xf0] << 7) |
+ (parity[par & 0x0f] << 6) |
+ (parity[par & 0xcc] << 5) |
+ (parity[par & 0x33] << 4) |
+ (parity[par & 0xaa] << 3) |
+ (parity[par & 0x55] << 2);
+ code[0] = ~code[0];
+ code[1] = ~code[1];
+ code[2] = ~code[2];
+}
+
+The parity array is not shown any more. Note also that for these
+examples I kinda deviated from my regular programming style by allowing
+multiple statements on a line, not using { } in then and else blocks
+with only a single statement and by using operators like ^=
+
+
+Analysis 2
+==========
+
+The code (of course) works, and hurray: we are a little bit faster than
+the linux driver code (about 15%). But wait, don't cheer too quickly.
+THere is more to be gained.
+If we look at e.g. rp14 and rp15 we see that we either xor our data with
+rp14 or with rp15. However we also have par which goes over all data.
+This means there is no need to calculate rp14 as it can be calculated from
+rp15 through rp14 = par ^ rp15;
+(or if desired we can avoid calculating rp15 and calculate it from
+rp14). That is why some places refer to inverse parity.
+Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13.
+Effectively this means we can eliminate the else clause from the if
+statements. Also we can optimise the calculation in the end a little bit
+by going from long to byte first. Actually we can even avoid the table
+lookups
+
+Attempt 3
+=========
+
+Odd replaced:
+ if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
+ if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
+ if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
+ if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
+ if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
+ if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
+with
+ if (i & 0x01) rp5 ^= cur;
+ if (i & 0x02) rp7 ^= cur;
+ if (i & 0x04) rp9 ^= cur;
+ if (i & 0x08) rp11 ^= cur;
+ if (i & 0x10) rp13 ^= cur;
+ if (i & 0x20) rp15 ^= cur;
+
+ and outside the loop added:
+ rp4 = par ^ rp5;
+ rp6 = par ^ rp7;
+ rp8 = par ^ rp9;
+ rp10 = par ^ rp11;
+ rp12 = par ^ rp13;
+ rp14 = par ^ rp15;
+
+And after that the code takes about 30% more time, although the number of
+statements is reduced. This is also reflected in the assembly code.
+
+
+Analysis 3
+==========
+
+Very weird. Guess it has to do with caching or instruction parallellism
+or so. I also tried on an eeePC (Celeron, clocked at 900 Mhz). Interesting
+observation was that this one is only 30% slower (according to time)
+executing the code as my 3Ghz D920 processor.
+
+Well, it was expected not to be easy so maybe instead move to a
+different track: let's move back to the code from attempt2 and do some
+loop unrolling. This will eliminate a few if statements. I'll try
+different amounts of unrolling to see what works best.
+
+
+Attempt 4
+=========
+
+Unrolled the loop 1, 2, 3 and 4 times.
+For 4 the code starts with:
+
+ for (i = 0; i < 4; i++)
+ {
+ cur = *bp++;
+ par ^= cur;
+ rp4 ^= cur;
+ rp6 ^= cur;
+ rp8 ^= cur;
+ rp10 ^= cur;
+ if (i & 0x1) rp13 ^= cur; else rp12 ^= cur;
+ if (i & 0x2) rp15 ^= cur; else rp14 ^= cur;
+ cur = *bp++;
+ par ^= cur;
+ rp5 ^= cur;
+ rp6 ^= cur;
+ ...
+
+
+Analysis 4
+==========
+
+Unrolling once gains about 15%
+Unrolling twice keeps the gain at about 15%
+Unrolling three times gives a gain of 30% compared to attempt 2.
+Unrolling four times gives a marginal improvement compared to unrolling
+three times.
+
+I decided to proceed with a four time unrolled loop anyway. It was my gut
+feeling that in the next steps I would obtain additional gain from it.
+
+The next step was triggered by the fact that par contains the xor of all
+bytes and rp4 and rp5 each contain the xor of half of the bytes.
+So in effect par = rp4 ^ rp5. But as xor is commutative we can also say
+that rp5 = par ^ rp4. So no need to keep both rp4 and rp5 around. We can
+eliminate rp5 (or rp4, but I already foresaw another optimisation).
+The same holds for rp6/7, rp8/9, rp10/11 rp12/13 and rp14/15.
+
+
+Attempt 5
+=========
+
+Effectively so all odd digit rp assignments in the loop were removed.
+This included the else clause of the if statements.
+Of course after the loop we need to correct things by adding code like:
+ rp5 = par ^ rp4;
+Also the initial assignments (rp5 = 0; etc) could be removed.
+Along the line I also removed the initialisation of rp0/1/2/3.
+
+
+Analysis 5
+==========
+
+Measurements showed this was a good move. The run-time roughly halved
+compared with attempt 4 with 4 times unrolled, and we only require 1/3rd
+of the processor time compared to the current code in the linux kernel.
+
+However, still I thought there was more. I didn't like all the if
+statements. Why not keep a running parity and only keep the last if
+statement. Time for yet another version!
+
+
+Attempt 6
+=========
+
+THe code within the for loop was changed to:
+
+ for (i = 0; i < 4; i++)
+ {
+ cur = *bp++; tmppar = cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
+
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
+
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp8 ^= cur;
+
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur;
+
+ par ^= tmppar;
+ if ((i & 0x1) == 0) rp12 ^= tmppar;
+ if ((i & 0x2) == 0) rp14 ^= tmppar;
+ }
+
+As you can see tmppar is used to accumulate the parity within a for
+iteration. In the last 3 statements is is added to par and, if needed,
+to rp12 and rp14.
+
+While making the changes I also found that I could exploit that tmppar
+contains the running parity for this iteration. So instead of having:
+rp4 ^= cur; rp6 = cur;
+I removed the rp6 = cur; statement and did rp6 ^= tmppar; on next
+statement. A similar change was done for rp8 and rp10
+
+
+Analysis 6
+==========
+
+Measuring this code again showed big gain. When executing the original
+linux code 1 million times, this took about 1 second on my system.
+(using time to measure the performance). After this iteration I was back
+to 0.075 sec. Actually I had to decide to start measuring over 10
+million interations in order not to loose too much accuracy. This one
+definitely seemed to be the jackpot!
+
+There is a little bit more room for improvement though. There are three
+places with statements:
+rp4 ^= cur; rp6 ^= cur;
+It seems more efficient to also maintain a variable rp4_6 in the while
+loop; This eliminates 3 statements per loop. Of course after the loop we
+need to correct by adding:
+ rp4 ^= rp4_6;
+ rp6 ^= rp4_6
+Furthermore there are 4 sequential assingments to rp8. This can be
+encoded slightly more efficient by saving tmppar before those 4 lines
+and later do rp8 = rp8 ^ tmppar ^ notrp8;
+(where notrp8 is the value of rp8 before those 4 lines).
+Again a use of the commutative property of xor.
+Time for a new test!
+
+
+Attempt 7
+=========
+
+The new code now looks like:
+
+ for (i = 0; i < 4; i++)
+ {
+ cur = *bp++; tmppar = cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
+
+ cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
+
+ notrp8 = tmppar;
+ cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur;
+ rp8 = rp8 ^ tmppar ^ notrp8;
+
+ cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur;
+
+ par ^= tmppar;
+ if ((i & 0x1) == 0) rp12 ^= tmppar;
+ if ((i & 0x2) == 0) rp14 ^= tmppar;
+ }
+ rp4 ^= rp4_6;
+ rp6 ^= rp4_6;
+
+
+Not a big change, but every penny counts :-)
+
+
+Analysis 7
+==========
+
+Acutally this made things worse. Not very much, but I don't want to move
+into the wrong direction. Maybe something to investigate later. Could
+have to do with caching again.
+
+Guess that is what there is to win within the loop. Maybe unrolling one
+more time will help. I'll keep the optimisations from 7 for now.
+
+
+Attempt 8
+=========
+
+Unrolled the loop one more time.
+
+
+Analysis 8
+==========
+
+This makes things worse. Let's stick with attempt 6 and continue from there.
+Although it seems that the code within the loop cannot be optimised
+further there is still room to optimize the generation of the ecc codes.
+We can simply calcualate the total parity. If this is 0 then rp4 = rp5
+etc. If the parity is 1, then rp4 = !rp5;
+But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits
+in the result byte and then do something like
+ code[0] |= (code[0] << 1);
+Lets test this.
+
+
+Attempt 9
+=========
+
+Changed the code but again this slightly degrades performance. Tried all
+kind of other things, like having dedicated parity arrays to avoid the
+shift after parity[rp7] << 7; No gain.
+Change the lookup using the parity array by using shift operators (e.g.
+replace parity[rp7] << 7 with:
+rp7 ^= (rp7 << 4);
+rp7 ^= (rp7 << 2);
+rp7 ^= (rp7 << 1);
+rp7 &= 0x80;
+No gain.
+
+The only marginal change was inverting the parity bits, so we can remove
+the last three invert statements.
+
+Ah well, pity this does not deliver more. Then again 10 million
+iterations using the linux driver code takes between 13 and 13.5
+seconds, whereas my code now takes about 0.73 seconds for those 10
+million iterations. So basically I've improved the performance by a
+factor 18 on my system. Not that bad. Of course on different hardware
+you will get different results. No warranties!
+
+But of course there is no such thing as a free lunch. The codesize almost
+tripled (from 562 bytes to 1434 bytes). Then again, it is not that much.
+
+
+Correcting errors
+=================
+
+For correcting errors I again used the ST application note as a starter,
+but I also peeked at the existing code.
+The algorithm itself is pretty straightforward. Just xor the given and
+the calculated ecc. If all bytes are 0 there is no problem. If 11 bits
+are 1 we have one correctable bit error. If there is 1 bit 1, we have an
+error in the given ecc code.
+It proved to be fastest to do some table lookups. Performance gain
+introduced by this is about a factor 2 on my system when a repair had to
+be done, and 1% or so if no repair had to be done.
+Code size increased from 330 bytes to 686 bytes for this function.
+(gcc 4.2, -O3)
+
+
+Conclusion
+==========
+
+The gain when calculating the ecc is tremendous. Om my development hardware
+a speedup of a factor of 18 for ecc calculation was achieved. On a test on an
+embedded system with a MIPS core a factor 7 was obtained.
+On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor
+5 (big endian mode, gcc 4.1.2, -O3)
+For correction not much gain could be obtained (as bitflips are rare). Then
+again there are also much less cycles spent there.
+
+It seems there is not much more gain possible in this, at least when
+programmed in C. Of course it might be possible to squeeze something more
+out of it with an assembler program, but due to pipeline behaviour etc
+this is very tricky (at least for intel hw).
+
+Author: Frans Meulenbroeks
+Copyright (C) 2008 Koninklijke Philips Electronics NV.
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt
new file mode 100644
index 0000000..aa60d1f
--- /dev/null
+++ b/Documentation/mutex-design.txt
@@ -0,0 +1,138 @@
+Generic Mutex Subsystem
+
+started by Ingo Molnar <mingo@redhat.com>
+
+ "Why on earth do we need a new mutex subsystem, and what's wrong
+ with semaphores?"
+
+firstly, there's nothing wrong with semaphores. But if the simpler
+mutex semantics are sufficient for your code, then there are a couple
+of advantages of mutexes:
+
+ - 'struct mutex' is smaller on most architectures: .e.g on x86,
+ 'struct semaphore' is 20 bytes, 'struct mutex' is 16 bytes.
+ A smaller structure size means less RAM footprint, and better
+ CPU-cache utilization.
+
+ - tighter code. On x86 i get the following .text sizes when
+ switching all mutex-alike semaphores in the kernel to the mutex
+ subsystem:
+
+ text data bss dec hex filename
+ 3280380 868188 396860 4545428 455b94 vmlinux-semaphore
+ 3255329 865296 396732 4517357 44eded vmlinux-mutex
+
+ that's 25051 bytes of code saved, or a 0.76% win - off the hottest
+ codepaths of the kernel. (The .data savings are 2892 bytes, or 0.33%)
+ Smaller code means better icache footprint, which is one of the
+ major optimization goals in the Linux kernel currently.
+
+ - the mutex subsystem is slightly faster and has better scalability for
+ contended workloads. On an 8-way x86 system, running a mutex-based
+ kernel and testing creat+unlink+close (of separate, per-task files)
+ in /tmp with 16 parallel tasks, the average number of ops/sec is:
+
+ Semaphores: Mutexes:
+
+ $ ./test-mutex V 16 10 $ ./test-mutex V 16 10
+ 8 CPUs, running 16 tasks. 8 CPUs, running 16 tasks.
+ checking VFS performance. checking VFS performance.
+ avg loops/sec: 34713 avg loops/sec: 84153
+ CPU utilization: 63% CPU utilization: 22%
+
+ i.e. in this workload, the mutex based kernel was 2.4 times faster
+ than the semaphore based kernel, _and_ it also had 2.8 times less CPU
+ utilization. (In terms of 'ops per CPU cycle', the semaphore kernel
+ performed 551 ops/sec per 1% of CPU time used, while the mutex kernel
+ performed 3825 ops/sec per 1% of CPU time used - it was 6.9 times
+ more efficient.)
+
+ the scalability difference is visible even on a 2-way P4 HT box:
+
+ Semaphores: Mutexes:
+
+ $ ./test-mutex V 16 10 $ ./test-mutex V 16 10
+ 4 CPUs, running 16 tasks. 8 CPUs, running 16 tasks.
+ checking VFS performance. checking VFS performance.
+ avg loops/sec: 127659 avg loops/sec: 181082
+ CPU utilization: 100% CPU utilization: 34%
+
+ (the straight performance advantage of mutexes is 41%, the per-cycle
+ efficiency of mutexes is 4.1 times better.)
+
+ - there are no fastpath tradeoffs, the mutex fastpath is just as tight
+ as the semaphore fastpath. On x86, the locking fastpath is 2
+ instructions:
+
+ c0377ccb <mutex_lock>:
+ c0377ccb: f0 ff 08 lock decl (%eax)
+ c0377cce: 78 0e js c0377cde <.text.lock.mutex>
+ c0377cd0: c3 ret
+
+ the unlocking fastpath is equally tight:
+
+ c0377cd1 <mutex_unlock>:
+ c0377cd1: f0 ff 00 lock incl (%eax)
+ c0377cd4: 7e 0f jle c0377ce5 <.text.lock.mutex+0x7>
+ c0377cd6: c3 ret
+
+ - 'struct mutex' semantics are well-defined and are enforced if
+ CONFIG_DEBUG_MUTEXES is turned on. Semaphores on the other hand have
+ virtually no debugging code or instrumentation. The mutex subsystem
+ checks and enforces the following rules:
+
+ * - only one task can hold the mutex at a time
+ * - only the owner can unlock the mutex
+ * - multiple unlocks are not permitted
+ * - recursive locking is not permitted
+ * - a mutex object must be initialized via the API
+ * - a mutex object must not be initialized via memset or copying
+ * - task may not exit with mutex held
+ * - memory areas where held locks reside must not be freed
+ * - held mutexes must not be reinitialized
+ * - mutexes may not be used in hardware or software interrupt
+ * contexts such as tasklets and timers
+
+ furthermore, there are also convenience features in the debugging
+ code:
+
+ * - uses symbolic names of mutexes, whenever they are printed in debug output
+ * - point-of-acquire tracking, symbolic lookup of function names
+ * - list of all locks held in the system, printout of them
+ * - owner tracking
+ * - detects self-recursing locks and prints out all relevant info
+ * - detects multi-task circular deadlocks and prints out all affected
+ * locks and tasks (and only those tasks)
+
+Disadvantages
+-------------
+
+The stricter mutex API means you cannot use mutexes the same way you
+can use semaphores: e.g. they cannot be used from an interrupt context,
+nor can they be unlocked from a different context that which acquired
+it. [ I'm not aware of any other (e.g. performance) disadvantages from
+using mutexes at the moment, please let me know if you find any. ]
+
+Implementation of mutexes
+-------------------------
+
+'struct mutex' is the new mutex type, defined in include/linux/mutex.h
+and implemented in kernel/mutex.c. It is a counter-based mutex with a
+spinlock and a wait-list. The counter has 3 states: 1 for "unlocked",
+0 for "locked" and negative numbers (usually -1) for "locked, potential
+waiters queued".
+
+the APIs of 'struct mutex' have been streamlined:
+
+ DEFINE_MUTEX(name);
+
+ mutex_init(mutex);
+
+ void mutex_lock(struct mutex *lock);
+ int mutex_lock_interruptible(struct mutex *lock);
+ int mutex_trylock(struct mutex *lock);
+ void mutex_unlock(struct mutex *lock);
+ int mutex_is_locked(struct mutex *lock);
+ void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+ int mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass);
diff --git a/Documentation/namespaces/compatibility-list.txt b/Documentation/namespaces/compatibility-list.txt
new file mode 100644
index 0000000..defc558
--- /dev/null
+++ b/Documentation/namespaces/compatibility-list.txt
@@ -0,0 +1,39 @@
+ Namespaces compatibility list
+
+This document contains the information about the problems user
+may have when creating tasks living in different namespaces.
+
+Here's the summary. This matrix shows the known problems, that
+occur when tasks share some namespace (the columns) while living
+in different other namespaces (the rows):
+
+ UTS IPC VFS PID User Net
+UTS X
+IPC X 1
+VFS X
+PID 1 1 X
+User 2 2 X
+Net X
+
+1. Both the IPC and the PID namespaces provide IDs to address
+ object inside the kernel. E.g. semaphore with IPCID or
+ process group with pid.
+
+ In both cases, tasks shouldn't try exposing this ID to some
+ other task living in a different namespace via a shared filesystem
+ or IPC shmem/message. The fact is that this ID is only valid
+ within the namespace it was obtained in and may refer to some
+ other object in another namespace.
+
+2. Intentionally, two equal user IDs in different user namespaces
+ should not be equal from the VFS point of view. In other
+ words, user 10 in one user namespace shouldn't have the same
+ access permissions to files, belonging to user 10 in another
+ namespace.
+
+ The same is true for the IPC namespaces being shared - two users
+ from different user namespaces should not access the same IPC objects
+ even having equal UIDs.
+
+ But currently this is not so.
+
diff --git a/Documentation/netlabel/00-INDEX b/Documentation/netlabel/00-INDEX
new file mode 100644
index 0000000..837bf35
--- /dev/null
+++ b/Documentation/netlabel/00-INDEX
@@ -0,0 +1,10 @@
+00-INDEX
+ - this file.
+cipso_ipv4.txt
+ - documentation on the IPv4 CIPSO protocol engine.
+draft-ietf-cipso-ipsecurity-01.txt
+ - IETF draft of the CIPSO protocol, dated 16 July 1992.
+introduction.txt
+ - NetLabel introduction, READ THIS FIRST.
+lsm_interface.txt
+ - documentation on the NetLabel kernel security module API.
diff --git a/Documentation/netlabel/cipso_ipv4.txt b/Documentation/netlabel/cipso_ipv4.txt
new file mode 100644
index 0000000..93dacb1
--- /dev/null
+++ b/Documentation/netlabel/cipso_ipv4.txt
@@ -0,0 +1,48 @@
+NetLabel CIPSO/IPv4 Protocol Engine
+==============================================================================
+Paul Moore, paul.moore@hp.com
+
+May 17, 2006
+
+ * Overview
+
+The NetLabel CIPSO/IPv4 protocol engine is based on the IETF Commercial IP
+Security Option (CIPSO) draft from July 16, 1992. A copy of this draft can be
+found in this directory, consult '00-INDEX' for the filename. While the IETF
+draft never made it to an RFC standard it has become a de-facto standard for
+labeled networking and is used in many trusted operating systems.
+
+ * Outbound Packet Processing
+
+The CIPSO/IPv4 protocol engine applies the CIPSO IP option to packets by
+adding the CIPSO label to the socket. This causes all packets leaving the
+system through the socket to have the CIPSO IP option applied. The socket's
+CIPSO label can be changed at any point in time, however, it is recommended
+that it is set upon the socket's creation. The LSM can set the socket's CIPSO
+label by using the NetLabel security module API; if the NetLabel "domain" is
+configured to use CIPSO for packet labeling then a CIPSO IP option will be
+generated and attached to the socket.
+
+ * Inbound Packet Processing
+
+The CIPSO/IPv4 protocol engine validates every CIPSO IP option it finds at the
+IP layer without any special handling required by the LSM. However, in order
+to decode and translate the CIPSO label on the packet the LSM must use the
+NetLabel security module API to extract the security attributes of the packet.
+This is typically done at the socket layer using the 'socket_sock_rcv_skb()'
+LSM hook.
+
+ * Label Translation
+
+The CIPSO/IPv4 protocol engine contains a mechanism to translate CIPSO security
+attributes such as sensitivity level and category to values which are
+appropriate for the host. These mappings are defined as part of a CIPSO
+Domain Of Interpretation (DOI) definition and are configured through the
+NetLabel user space communication layer. Each DOI definition can have a
+different security attribute mapping table.
+
+ * Label Translation Cache
+
+The NetLabel system provides a framework for caching security attribute
+mappings from the network labels to the corresponding LSM identifiers. The
+CIPSO/IPv4 protocol engine supports this caching mechanism.
diff --git a/Documentation/netlabel/draft-ietf-cipso-ipsecurity-01.txt b/Documentation/netlabel/draft-ietf-cipso-ipsecurity-01.txt
new file mode 100644
index 0000000..256c2c9
--- /dev/null
+++ b/Documentation/netlabel/draft-ietf-cipso-ipsecurity-01.txt
@@ -0,0 +1,791 @@
+IETF CIPSO Working Group
+16 July, 1992
+
+
+
+ COMMERCIAL IP SECURITY OPTION (CIPSO 2.2)
+
+
+
+1. Status
+
+This Internet Draft provides the high level specification for a Commercial
+IP Security Option (CIPSO). This draft reflects the version as approved by
+the CIPSO IETF Working Group. Distribution of this memo is unlimited.
+
+This document is an Internet Draft. Internet Drafts are working documents
+of the Internet Engineering Task Force (IETF), its Areas, and its Working
+Groups. Note that other groups may also distribute working documents as
+Internet Drafts.
+
+Internet Drafts are draft documents valid for a maximum of six months.
+Internet Drafts may be updated, replaced, or obsoleted by other documents
+at any time. It is not appropriate to use Internet Drafts as reference
+material or to cite them other than as a "working draft" or "work in
+progress."
+
+Please check the I-D abstract listing contained in each Internet Draft
+directory to learn the current status of this or any other Internet Draft.
+
+
+
+
+2. Background
+
+Currently the Internet Protocol includes two security options. One of
+these options is the DoD Basic Security Option (BSO) (Type 130) which allows
+IP datagrams to be labeled with security classifications. This option
+provides sixteen security classifications and a variable number of handling
+restrictions. To handle additional security information, such as security
+categories or compartments, another security option (Type 133) exists and
+is referred to as the DoD Extended Security Option (ESO). The values for
+the fixed fields within these two options are administered by the Defense
+Information Systems Agency (DISA).
+
+Computer vendors are now building commercial operating systems with
+mandatory access controls and multi-level security. These systems are
+no longer built specifically for a particular group in the defense or
+intelligence communities. They are generally available commercial systems
+for use in a variety of government and civil sector environments.
+
+The small number of ESO format codes can not support all the possible
+applications of a commercial security option. The BSO and ESO were
+designed to only support the United States DoD. CIPSO has been designed
+to support multiple security policies. This Internet Draft provides the
+format and procedures required to support a Mandatory Access Control
+security policy. Support for additional security policies shall be
+defined in future RFCs.
+
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 1]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+
+3. CIPSO Format
+
+Option type: 134 (Class 0, Number 6, Copy on Fragmentation)
+Option length: Variable
+
+This option permits security related information to be passed between
+systems within a single Domain of Interpretation (DOI). A DOI is a
+collection of systems which agree on the meaning of particular values
+in the security option. An authority that has been assigned a DOI
+identifier will define a mapping between appropriate CIPSO field values
+and their human readable equivalent. This authority will distribute that
+mapping to hosts within the authority's domain. These mappings may be
+sensitive, therefore a DOI authority is not required to make these
+mappings available to anyone other than the systems that are included in
+the DOI.
+
+This option MUST be copied on fragmentation. This option appears at most
+once in a datagram. All multi-octet fields in the option are defined to be
+transmitted in network byte order. The format of this option is as follows:
+
++----------+----------+------//------+-----------//---------+
+| 10000110 | LLLLLLLL | DDDDDDDDDDDD | TTTTTTTTTTTTTTTTTTTT |
++----------+----------+------//------+-----------//---------+
+
+ TYPE=134 OPTION DOMAIN OF TAGS
+ LENGTH INTERPRETATION
+
+
+ Figure 1. CIPSO Format
+
+
+3.1 Type
+
+This field is 1 octet in length. Its value is 134.
+
+
+3.2 Length
+
+This field is 1 octet in length. It is the total length of the option
+including the type and length fields. With the current IP header length
+restriction of 40 octets the value of this field MUST not exceed 40.
+
+
+3.3 Domain of Interpretation Identifier
+
+This field is an unsigned 32 bit integer. The value 0 is reserved and MUST
+not appear as the DOI identifier in any CIPSO option. Implementations
+should assume that the DOI identifier field is not aligned on any particular
+byte boundary.
+
+To conserve space in the protocol, security levels and categories are
+represented by numbers rather than their ASCII equivalent. This requires
+a mapping table within CIPSO hosts to map these numbers to their
+corresponding ASCII representations. Non-related groups of systems may
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 2]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+have their own unique mappings. For example, one group of systems may
+use the number 5 to represent Unclassified while another group may use the
+number 1 to represent that same security level. The DOI identifier is used
+to identify which mapping was used for the values within the option.
+
+
+3.4 Tag Types
+
+A common format for passing security related information is necessary
+for interoperability. CIPSO uses sets of "tags" to contain the security
+information relevant to the data in the IP packet. Each tag begins with
+a tag type identifier followed by the length of the tag and ends with the
+actual security information to be passed. All multi-octet fields in a tag
+are defined to be transmitted in network byte order. Like the DOI
+identifier field in the CIPSO header, implementations should assume that
+all tags, as well as fields within a tag, are not aligned on any particular
+octet boundary. The tag types defined in this document contain alignment
+bytes to assist alignment of some information, however alignment can not
+be guaranteed if CIPSO is not the first IP option.
+
+CIPSO tag types 0 through 127 are reserved for defining standard tag
+formats. Their definitions will be published in RFCs. Tag types whose
+identifiers are greater than 127 are defined by the DOI authority and may
+only be meaningful in certain Domains of Interpretation. For these tag
+types, implementations will require the DOI identifier as well as the tag
+number to determine the security policy and the format associated with the
+tag. Use of tag types above 127 are restricted to closed networks where
+interoperability with other networks will not be an issue. Implementations
+that support a tag type greater than 127 MUST support at least one DOI that
+requires only tag types 1 to 127.
+
+Tag type 0 is reserved. Tag types 1, 2, and 5 are defined in this
+Internet Draft. Types 3 and 4 are reserved for work in progress.
+The standard format for all current and future CIPSO tags is shown below:
+
++----------+----------+--------//--------+
+| TTTTTTTT | LLLLLLLL | IIIIIIIIIIIIIIII |
++----------+----------+--------//--------+
+ TAG TAG TAG
+ TYPE LENGTH INFORMATION
+
+ Figure 2: Standard Tag Format
+
+In the three tag types described in this document, the length and count
+restrictions are based on the current IP limitation of 40 octets for all
+IP options. If the IP header is later expanded, then the length and count
+restrictions specified in this document may increase to use the full area
+provided for IP options.
+
+
+3.4.1 Tag Type Classes
+
+Tag classes consist of tag types that have common processing requirements
+and support the same security policy. The three tags defined in this
+Internet Draft belong to the Mandatory Access Control (MAC) Sensitivity
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 3]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+class and support the MAC Sensitivity security policy.
+
+
+3.4.2 Tag Type 1
+
+This is referred to as the "bit-mapped" tag type. Tag type 1 is included
+in the MAC Sensitivity tag type class. The format of this tag type is as
+follows:
+
++----------+----------+----------+----------+--------//---------+
+| 00000001 | LLLLLLLL | 00000000 | LLLLLLLL | CCCCCCCCCCCCCCCCC |
++----------+----------+----------+----------+--------//---------+
+
+ TAG TAG ALIGNMENT SENSITIVITY BIT MAP OF
+ TYPE LENGTH OCTET LEVEL CATEGORIES
+
+ Figure 3. Tag Type 1 Format
+
+
+3.4.2.1 Tag Type
+
+This field is 1 octet in length and has a value of 1.
+
+
+3.4.2.2 Tag Length
+
+This field is 1 octet in length. It is the total length of the tag type
+including the type and length fields. With the current IP header length
+restriction of 40 bytes the value within this field is between 4 and 34.
+
+
+3.4.2.3 Alignment Octet
+
+This field is 1 octet in length and always has the value of 0. Its purpose
+is to align the category bitmap field on an even octet boundary. This will
+speed many implementations including router implementations.
+
+
+3.4.2.4 Sensitivity Level
+
+This field is 1 octet in length. Its value is from 0 to 255. The values
+are ordered with 0 being the minimum value and 255 representing the maximum
+value.
+
+
+3.4.2.5 Bit Map of Categories
+
+The length of this field is variable and ranges from 0 to 30 octets. This
+provides representation of categories 0 to 239. The ordering of the bits
+is left to right or MSB to LSB. For example category 0 is represented by
+the most significant bit of the first byte and category 15 is represented
+by the least significant bit of the second byte. Figure 4 graphically
+shows this ordering. Bit N is binary 1 if category N is part of the label
+for the datagram, and bit N is binary 0 if category N is not part of the
+label. Except for the optimized tag 1 format described in the next section,
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 4]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+minimal encoding SHOULD be used resulting in no trailing zero octets in the
+category bitmap.
+
+ octet 0 octet 1 octet 2 octet 3 octet 4 octet 5
+ XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX . . .
+bit 01234567 89111111 11112222 22222233 33333333 44444444
+number 012345 67890123 45678901 23456789 01234567
+
+ Figure 4. Ordering of Bits in Tag 1 Bit Map
+
+
+3.4.2.6 Optimized Tag 1 Format
+
+Routers work most efficiently when processing fixed length fields. To
+support these routers there is an optimized form of tag type 1. The format
+does not change. The only change is to the category bitmap which is set to
+a constant length of 10 octets. Trailing octets required to fill out the 10
+octets are zero filled. Ten octets, allowing for 80 categories, was chosen
+because it makes the total length of the CIPSO option 20 octets. If CIPSO
+is the only option then the option will be full word aligned and additional
+filler octets will not be required.
+
+
+3.4.3 Tag Type 2
+
+This is referred to as the "enumerated" tag type. It is used to describe
+large but sparsely populated sets of categories. Tag type 2 is in the MAC
+Sensitivity tag type class. The format of this tag type is as follows:
+
++----------+----------+----------+----------+-------------//-------------+
+| 00000010 | LLLLLLLL | 00000000 | LLLLLLLL | CCCCCCCCCCCCCCCCCCCCCCCCCC |
++----------+----------+----------+----------+-------------//-------------+
+
+ TAG TAG ALIGNMENT SENSITIVITY ENUMERATED
+ TYPE LENGTH OCTET LEVEL CATEGORIES
+
+ Figure 5. Tag Type 2 Format
+
+
+3.4.3.1 Tag Type
+
+This field is one octet in length and has a value of 2.
+
+
+3.4.3.2 Tag Length
+
+This field is 1 octet in length. It is the total length of the tag type
+including the type and length fields. With the current IP header length
+restriction of 40 bytes the value within this field is between 4 and 34.
+
+
+3.4.3.3 Alignment Octet
+
+This field is 1 octet in length and always has the value of 0. Its purpose
+is to align the category field on an even octet boundary. This will
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 5]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+speed many implementations including router implementations.
+
+
+3.4.3.4 Sensitivity Level
+
+This field is 1 octet in length. Its value is from 0 to 255. The values
+are ordered with 0 being the minimum value and 255 representing the
+maximum value.
+
+
+3.4.3.5 Enumerated Categories
+
+In this tag, categories are represented by their actual value rather than
+by their position within a bit field. The length of each category is 2
+octets. Up to 15 categories may be represented by this tag. Valid values
+for categories are 0 to 65534. Category 65535 is not a valid category
+value. The categories MUST be listed in ascending order within the tag.
+
+
+3.4.4 Tag Type 5
+
+This is referred to as the "range" tag type. It is used to represent
+labels where all categories in a range, or set of ranges, are included
+in the sensitivity label. Tag type 5 is in the MAC Sensitivity tag type
+class. The format of this tag type is as follows:
+
++----------+----------+----------+----------+------------//-------------+
+| 00000101 | LLLLLLLL | 00000000 | LLLLLLLL | Top/Bottom | Top/Bottom |
++----------+----------+----------+----------+------------//-------------+
+
+ TAG TAG ALIGNMENT SENSITIVITY CATEGORY RANGES
+ TYPE LENGTH OCTET LEVEL
+
+ Figure 6. Tag Type 5 Format
+
+
+3.4.4.1 Tag Type
+
+This field is one octet in length and has a value of 5.
+
+
+3.4.4.2 Tag Length
+
+This field is 1 octet in length. It is the total length of the tag type
+including the type and length fields. With the current IP header length
+restriction of 40 bytes the value within this field is between 4 and 34.
+
+
+3.4.4.3 Alignment Octet
+
+This field is 1 octet in length and always has the value of 0. Its purpose
+is to align the category range field on an even octet boundary. This will
+speed many implementations including router implementations.
+
+
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 6]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+3.4.4.4 Sensitivity Level
+
+This field is 1 octet in length. Its value is from 0 to 255. The values
+are ordered with 0 being the minimum value and 255 representing the maximum
+value.
+
+
+3.4.4.5 Category Ranges
+
+A category range is a 4 octet field comprised of the 2 octet index of the
+highest numbered category followed by the 2 octet index of the lowest
+numbered category. These range endpoints are inclusive within the range of
+categories. All categories within a range are included in the sensitivity
+label. This tag may contain a maximum of 7 category pairs. The bottom
+category endpoint for the last pair in the tag MAY be omitted and SHOULD be
+assumed to be 0. The ranges MUST be non-overlapping and be listed in
+descending order. Valid values for categories are 0 to 65534. Category
+65535 is not a valid category value.
+
+
+3.4.5 Minimum Requirements
+
+A CIPSO implementation MUST be capable of generating at least tag type 1 in
+the non-optimized form. In addition, a CIPSO implementation MUST be able
+to receive any valid tag type 1 even those using the optimized tag type 1
+format.
+
+
+4. Configuration Parameters
+
+The configuration parameters defined below are required for all CIPSO hosts,
+gateways, and routers that support multiple sensitivity labels. A CIPSO
+host is defined to be the origination or destination system for an IP
+datagram. A CIPSO gateway provides IP routing services between two or more
+IP networks and may be required to perform label translations between
+networks. A CIPSO gateway may be an enhanced CIPSO host or it may just
+provide gateway services with no end system CIPSO capabilities. A CIPSO
+router is a dedicated IP router that routes IP datagrams between two or more
+IP networks.
+
+An implementation of CIPSO on a host MUST have the capability to reject a
+datagram for reasons that the information contained can not be adequately
+protected by the receiving host or if acceptance may result in violation of
+the host or network security policy. In addition, a CIPSO gateway or router
+MUST be able to reject datagrams going to networks that can not provide
+adequate protection or may violate the network's security policy. To
+provide this capability the following minimal set of configuration
+parameters are required for CIPSO implementations:
+
+HOST_LABEL_MAX - This parameter contains the maximum sensitivity label that
+a CIPSO host is authorized to handle. All datagrams that have a label
+greater than this maximum MUST be rejected by the CIPSO host. This
+parameter does not apply to CIPSO gateways or routers. This parameter need
+not be defined explicitly as it can be implicitly derived from the
+PORT_LABEL_MAX parameters for the associated interfaces.
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 7]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+
+HOST_LABEL_MIN - This parameter contains the minimum sensitivity label that
+a CIPSO host is authorized to handle. All datagrams that have a label less
+than this minimum MUST be rejected by the CIPSO host. This parameter does
+not apply to CIPSO gateways or routers. This parameter need not be defined
+explicitly as it can be implicitly derived from the PORT_LABEL_MIN
+parameters for the associated interfaces.
+
+PORT_LABEL_MAX - This parameter contains the maximum sensitivity label for
+all datagrams that may exit a particular network interface port. All
+outgoing datagrams that have a label greater than this maximum MUST be
+rejected by the CIPSO system. The label within this parameter MUST be
+less than or equal to the label within the HOST_LABEL_MAX parameter. This
+parameter does not apply to CIPSO hosts that support only one network port.
+
+PORT_LABEL_MIN - This parameter contains the minimum sensitivity label for
+all datagrams that may exit a particular network interface port. All
+outgoing datagrams that have a label less than this minimum MUST be
+rejected by the CIPSO system. The label within this parameter MUST be
+greater than or equal to the label within the HOST_LABEL_MIN parameter.
+This parameter does not apply to CIPSO hosts that support only one network
+port.
+
+PORT_DOI - This parameter is used to assign a DOI identifier value to a
+particular network interface port. All CIPSO labels within datagrams
+going out this port MUST use the specified DOI identifier. All CIPSO
+hosts and gateways MUST support either this parameter, the NET_DOI
+parameter, or the HOST_DOI parameter.
+
+NET_DOI - This parameter is used to assign a DOI identifier value to a
+particular IP network address. All CIPSO labels within datagrams destined
+for the particular IP network MUST use the specified DOI identifier. All
+CIPSO hosts and gateways MUST support either this parameter, the PORT_DOI
+parameter, or the HOST_DOI parameter.
+
+HOST_DOI - This parameter is used to assign a DOI identifier value to a
+particular IP host address. All CIPSO labels within datagrams destined for
+the particular IP host will use the specified DOI identifier. All CIPSO
+hosts and gateways MUST support either this parameter, the PORT_DOI
+parameter, or the NET_DOI parameter.
+
+This list represents the minimal set of configuration parameters required
+to be compliant. Implementors are encouraged to add to this list to
+provide enhanced functionality and control. For example, many security
+policies may require both incoming and outgoing datagrams be checked against
+the port and host label ranges.
+
+
+4.1 Port Range Parameters
+
+The labels represented by the PORT_LABEL_MAX and PORT_LABEL_MIN parameters
+MAY be in CIPSO or local format. Some CIPSO systems, such as routers, may
+want to have the range parameters expressed in CIPSO format so that incoming
+labels do not have to be converted to a local format before being compared
+against the range. If multiple DOIs are supported by one of these CIPSO
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 8]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+systems then multiple port range parameters would be needed, one set for
+each DOI supported on a particular port.
+
+The port range will usually represent the total set of labels that may
+exist on the logical network accessed through the corresponding network
+interface. It may, however, represent a subset of these labels that are
+allowed to enter the CIPSO system.
+
+
+4.2 Single Label CIPSO Hosts
+
+CIPSO implementations that support only one label are not required to
+support the parameters described above. These limited implementations are
+only required to support a NET_LABEL parameter. This parameter contains
+the CIPSO label that may be inserted in datagrams that exit the host. In
+addition, the host MUST reject any incoming datagram that has a label which
+is not equivalent to the NET_LABEL parameter.
+
+
+5. Handling Procedures
+
+This section describes the processing requirements for incoming and
+outgoing IP datagrams. Just providing the correct CIPSO label format
+is not enough. Assumptions will be made by one system on how a
+receiving system will handle the CIPSO label. Wrong assumptions may
+lead to non-interoperability or even a security incident. The
+requirements described below represent the minimal set needed for
+interoperability and that provide users some level of confidence.
+Many other requirements could be added to increase user confidence,
+however at the risk of restricting creativity and limiting vendor
+participation.
+
+
+5.1 Input Procedures
+
+All datagrams received through a network port MUST have a security label
+associated with them, either contained in the datagram or assigned to the
+receiving port. Without this label the host, gateway, or router will not
+have the information it needs to make security decisions. This security
+label will be obtained from the CIPSO if the option is present in the
+datagram. See section 4.1.2 for handling procedures for unlabeled
+datagrams. This label will be compared against the PORT (if appropriate)
+and HOST configuration parameters defined in section 3.
+
+If any field within the CIPSO option, such as the DOI identifier, is not
+recognized the IP datagram is discarded and an ICMP "parameter problem"
+(type 12) is generated and returned. The ICMP code field is set to "bad
+parameter" (code 0) and the pointer is set to the start of the CIPSO field
+that is unrecognized.
+
+If the contents of the CIPSO are valid but the security label is
+outside of the configured host or port label range, the datagram is
+discarded and an ICMP "destination unreachable" (type 3) is generated
+and returned. The code field of the ICMP is set to "communication with
+destination network administratively prohibited" (code 9) or to
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 9]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+"communication with destination host administratively prohibited"
+(code 10). The value of the code field used is dependent upon whether
+the originator of the ICMP message is acting as a CIPSO host or a CIPSO
+gateway. The recipient of the ICMP message MUST be able to handle either
+value. The same procedure is performed if a CIPSO can not be added to an
+IP packet because it is too large to fit in the IP options area.
+
+If the error is triggered by receipt of an ICMP message, the message
+is discarded and no response is permitted (consistent with general ICMP
+processing rules).
+
+
+5.1.1 Unrecognized tag types
+
+The default condition for any CIPSO implementation is that an
+unrecognized tag type MUST be treated as a "parameter problem" and
+handled as described in section 4.1. A CIPSO implementation MAY allow
+the system administrator to identify tag types that may safely be
+ignored. This capability is an allowable enhancement, not a
+requirement.
+
+
+5.1.2 Unlabeled Packets
+
+A network port may be configured to not require a CIPSO label for all
+incoming datagrams. For this configuration a CIPSO label must be
+assigned to that network port and associated with all unlabeled IP
+datagrams. This capability might be used for single level networks or
+networks that have CIPSO and non-CIPSO hosts and the non-CIPSO hosts
+all operate at the same label.
+
+If a CIPSO option is required and none is found, the datagram is
+discarded and an ICMP "parameter problem" (type 12) is generated and
+returned to the originator of the datagram. The code field of the ICMP
+is set to "option missing" (code 1) and the ICMP pointer is set to 134
+(the value of the option type for the missing CIPSO option).
+
+
+5.2 Output Procedures
+
+A CIPSO option MUST appear only once in a datagram. Only one tag type
+from the MAC Sensitivity class MAY be included in a CIPSO option. Given
+the current set of defined tag types, this means that CIPSO labels at
+first will contain only one tag.
+
+All datagrams leaving a CIPSO system MUST meet the following condition:
+
+ PORT_LABEL_MIN <= CIPSO label <= PORT_LABEL_MAX
+
+If this condition is not satisfied the datagram MUST be discarded.
+If the CIPSO system only supports one port, the HOST_LABEL_MIN and the
+HOST_LABEL_MAX parameters MAY be substituted for the PORT parameters in
+the above condition.
+
+The DOI identifier to be used for all outgoing datagrams is configured by
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 10]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+the administrator. If port level DOI identifier assignment is used, then
+the PORT_DOI configuration parameter MUST contain the DOI identifier to
+use. If network level DOI assignment is used, then the NET_DOI parameter
+MUST contain the DOI identifier to use. And if host level DOI assignment
+is employed, then the HOST_DOI parameter MUST contain the DOI identifier
+to use. A CIPSO implementation need only support one level of DOI
+assignment.
+
+
+5.3 DOI Processing Requirements
+
+A CIPSO implementation MUST support at least one DOI and SHOULD support
+multiple DOIs. System and network administrators are cautioned to
+ensure that at least one DOI is common within an IP network to allow for
+broadcasting of IP datagrams.
+
+CIPSO gateways MUST be capable of translating a CIPSO option from one
+DOI to another when forwarding datagrams between networks. For
+efficiency purposes this capability is only a desired feature for CIPSO
+routers.
+
+
+5.4 Label of ICMP Messages
+
+The CIPSO label to be used on all outgoing ICMP messages MUST be equivalent
+to the label of the datagram that caused the ICMP message. If the ICMP was
+generated due to a problem associated with the original CIPSO label then the
+following responses are allowed:
+
+ a. Use the CIPSO label of the original IP datagram
+ b. Drop the original datagram with no return message generated
+
+In most cases these options will have the same effect. If you can not
+interpret the label or if it is outside the label range of your host or
+interface then an ICMP message with the same label will probably not be
+able to exit the system.
+
+
+6. Assignment of DOI Identifier Numbers =
+
+Requests for assignment of a DOI identifier number should be addressed to
+the Internet Assigned Numbers Authority (IANA).
+
+
+7. Acknowledgements
+
+Much of the material in this RFC is based on (and copied from) work
+done by Gary Winiger of Sun Microsystems and published as Commercial
+IP Security Option at the INTEROP 89, Commercial IPSO Workshop.
+
+
+8. Author's Address
+
+To submit mail for distribution to members of the IETF CIPSO Working
+Group, send mail to: cipso@wdl1.wdl.loral.com.
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 11]
+
+
+
+CIPSO INTERNET DRAFT 16 July, 1992
+
+
+
+
+To be added to or deleted from this distribution, send mail to:
+cipso-request@wdl1.wdl.loral.com.
+
+
+9. References
+
+RFC 1038, "Draft Revised IP Security Option", M. St. Johns, IETF, January
+1988.
+
+RFC 1108, "U.S. Department of Defense Security Options
+for the Internet Protocol", Stephen Kent, IAB, 1 March, 1991.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Internet Draft, Expires 15 Jan 93 [PAGE 12]
+
+
+
diff --git a/Documentation/netlabel/introduction.txt b/Documentation/netlabel/introduction.txt
new file mode 100644
index 0000000..5ecd8d1
--- /dev/null
+++ b/Documentation/netlabel/introduction.txt
@@ -0,0 +1,46 @@
+NetLabel Introduction
+==============================================================================
+Paul Moore, paul.moore@hp.com
+
+August 2, 2006
+
+ * Overview
+
+NetLabel is a mechanism which can be used by kernel security modules to attach
+security attributes to outgoing network packets generated from user space
+applications and read security attributes from incoming network packets. It
+is composed of three main components, the protocol engines, the communication
+layer, and the kernel security module API.
+
+ * Protocol Engines
+
+The protocol engines are responsible for both applying and retrieving the
+network packet's security attributes. If any translation between the network
+security attributes and those on the host are required then the protocol
+engine will handle those tasks as well. Other kernel subsystems should
+refrain from calling the protocol engines directly, instead they should use
+the NetLabel kernel security module API described below.
+
+Detailed information about each NetLabel protocol engine can be found in this
+directory, consult '00-INDEX' for filenames.
+
+ * Communication Layer
+
+The communication layer exists to allow NetLabel configuration and monitoring
+from user space. The NetLabel communication layer uses a message based
+protocol built on top of the Generic NETLINK transport mechanism. The exact
+formatting of these NetLabel messages as well as the Generic NETLINK family
+names can be found in the 'net/netlabel/' directory as comments in the
+header files as well as in 'include/net/netlabel.h'.
+
+ * Security Module API
+
+The purpose of the NetLabel security module API is to provide a protocol
+independent interface to the underlying NetLabel protocol engines. In addition
+to protocol independence, the security module API is designed to be completely
+LSM independent which should allow multiple LSMs to leverage the same code
+base.
+
+Detailed information about the NetLabel security module API can be found in the
+'include/net/netlabel.h' header file as well as the 'lsm_interface.txt' file
+found in this directory.
diff --git a/Documentation/netlabel/lsm_interface.txt b/Documentation/netlabel/lsm_interface.txt
new file mode 100644
index 0000000..98dd9f7
--- /dev/null
+++ b/Documentation/netlabel/lsm_interface.txt
@@ -0,0 +1,47 @@
+NetLabel Linux Security Module Interface
+==============================================================================
+Paul Moore, paul.moore@hp.com
+
+May 17, 2006
+
+ * Overview
+
+NetLabel is a mechanism which can set and retrieve security attributes from
+network packets. It is intended to be used by LSM developers who want to make
+use of a common code base for several different packet labeling protocols.
+The NetLabel security module API is defined in 'include/net/netlabel.h' but a
+brief overview is given below.
+
+ * NetLabel Security Attributes
+
+Since NetLabel supports multiple different packet labeling protocols and LSMs
+it uses the concept of security attributes to refer to the packet's security
+labels. The NetLabel security attributes are defined by the
+'netlbl_lsm_secattr' structure in the NetLabel header file. Internally the
+NetLabel subsystem converts the security attributes to and from the correct
+low-level packet label depending on the NetLabel build time and run time
+configuration. It is up to the LSM developer to translate the NetLabel
+security attributes into whatever security identifiers are in use for their
+particular LSM.
+
+ * NetLabel LSM Protocol Operations
+
+These are the functions which allow the LSM developer to manipulate the labels
+on outgoing packets as well as read the labels on incoming packets. Functions
+exist to operate both on sockets as well as the sk_buffs directly. These high
+level functions are translated into low level protocol operations based on how
+the administrator has configured the NetLabel subsystem.
+
+ * NetLabel Label Mapping Cache Operations
+
+Depending on the exact configuration, translation between the network packet
+label and the internal LSM security identifier can be time consuming. The
+NetLabel label mapping cache is a caching mechanism which can be used to
+sidestep much of this overhead once a mapping has been established. Once the
+LSM has received a packet, used NetLabel to decode it's security attributes,
+and translated the security attributes into a LSM internal identifier the LSM
+can use the NetLabel caching functions to associate the LSM internal
+identifier with the network packet's label. This means that in the future
+when a incoming packet matches a cached value not only are the internal
+NetLabel translation mechanisms bypassed but the LSM translation mechanisms are
+bypassed as well which should result in a significant reduction in overhead.
diff --git a/Documentation/networking/.gitignore b/Documentation/networking/.gitignore
new file mode 100644
index 0000000..286a568
--- /dev/null
+++ b/Documentation/networking/.gitignore
@@ -0,0 +1 @@
+ifenslave
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
new file mode 100644
index 0000000..1634c6d
--- /dev/null
+++ b/Documentation/networking/00-INDEX
@@ -0,0 +1,110 @@
+00-INDEX
+ - this file
+3c505.txt
+ - information on the 3Com EtherLink Plus (3c505) driver.
+6pack.txt
+ - info on the 6pack protocol, an alternative to KISS for AX.25
+DLINK.txt
+ - info on the D-Link DE-600/DE-620 parallel port pocket adapters
+PLIP.txt
+ - PLIP: The Parallel Line Internet Protocol device driver
+README.sb1000
+ - info on General Instrument/NextLevel SURFboard1000 cable modem.
+alias.txt
+ - info on using alias network devices
+arcnet-hardware.txt
+ - tons of info on ARCnet, hubs, jumper settings for ARCnet cards, etc.
+arcnet.txt
+ - info on the using the ARCnet driver itself.
+atm.txt
+ - info on where to get ATM programs and support for Linux.
+ax25.txt
+ - info on using AX.25 and NET/ROM code for Linux
+baycom.txt
+ - info on the driver for Baycom style amateur radio modems
+bridge.txt
+ - where to get user space programs for ethernet bridging with Linux.
+can.txt
+ - documentation on CAN protocol family.
+cops.txt
+ - info on the COPS LocalTalk Linux driver
+cs89x0.txt
+ - the Crystal LAN (CS8900/20-based) Ethernet ISA adapter driver
+cxacru.txt
+ - Conexant AccessRunner USB ADSL Modem
+de4x5.txt
+ - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver
+decnet.txt
+ - info on using the DECnet networking layer in Linux.
+depca.txt
+ - the Digital DEPCA/EtherWORKS DE1?? and DE2?? LANCE Ethernet driver
+dgrs.txt
+ - the Digi International RightSwitch SE-X Ethernet driver
+dmfe.txt
+ - info on the Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver.
+e100.txt
+ - info on Intel's EtherExpress PRO/100 line of 10/100 boards
+e1000.txt
+ - info on Intel's E1000 line of gigabit ethernet boards
+eql.txt
+ - serial IP load balancing
+ethertap.txt
+ - the Ethertap user space packet reception and transmission driver
+ewrk3.txt
+ - the Digital EtherWORKS 3 DE203/4/5 Ethernet driver
+filter.txt
+ - Linux Socket Filtering
+fore200e.txt
+ - FORE Systems PCA-200E/SBA-200E ATM NIC driver info.
+framerelay.txt
+ - info on using Frame Relay/Data Link Connection Identifier (DLCI).
+generic_netlink.txt
+ - info on Generic Netlink
+ip-sysctl.txt
+ - /proc/sys/net/ipv4/* variables
+ip_dynaddr.txt
+ - IP dynamic address hack e.g. for auto-dialup links
+ipddp.txt
+ - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation
+iphase.txt
+ - Interphase PCI ATM (i)Chip IA Linux driver info.
+irda.txt
+ - where to get IrDA (infrared) utilities and info for Linux.
+lapb-module.txt
+ - programming information of the LAPB module.
+ltpc.txt
+ - the Apple or Farallon LocalTalk PC card driver
+multicast.txt
+ - Behaviour of cards under Multicast
+netdevices.txt
+ - info on network device driver functions exported to the kernel.
+olympic.txt
+ - IBM PCI Pit/Pit-Phy/Olympic Token Ring driver info.
+policy-routing.txt
+ - IP policy-based routing
+ray_cs.txt
+ - Raylink Wireless LAN card driver info.
+skfp.txt
+ - SysKonnect FDDI (SK-5xxx, Compaq Netelligent) driver info.
+smc9.txt
+ - the driver for SMC's 9000 series of Ethernet cards
+smctr.txt
+ - SMC TokenCard TokenRing Linux driver info.
+tcp.txt
+ - short blurb on how TCP output takes place.
+tlan.txt
+ - ThunderLAN (Compaq Netelligent 10/100, Olicom OC-2xxx) driver info.
+tms380tr.txt
+ - SysKonnect Token Ring ISA/PCI adapter driver info.
+tuntap.txt
+ - TUN/TAP device driver, allowing user space Rx/Tx of packets.
+vortex.txt
+ - info on using 3Com Vortex (3c590, 3c592, 3c595, 3c597) Ethernet cards.
+wavelan.txt
+ - AT&T GIS (nee NCR) WaveLAN card: An Ethernet-like radio transceiver
+x25.txt
+ - general info on X.25 development.
+x25-iface.txt
+ - description of the X.25 Packet Layer to LAPB device interface.
+z8530drv.txt
+ - info about Linux driver for Z8530 based HDLC cards for AX.25
diff --git a/Documentation/networking/3c359.txt b/Documentation/networking/3c359.txt
new file mode 100644
index 0000000..4af8071
--- /dev/null
+++ b/Documentation/networking/3c359.txt
@@ -0,0 +1,58 @@
+
+3COM PCI TOKEN LINK VELOCITY XL TOKEN RING CARDS README
+
+Release 0.9.0 - Release
+ Jul 17th 2000 Mike Phillips
+
+ 1.2.0 - Final
+ Feb 17th 2002 Mike Phillips
+ Updated for submission to the 2.4.x kernel.
+
+Thanks:
+ Terry Murphy from 3Com for tech docs and support,
+ Adam D. Ligas for testing the driver.
+
+Note:
+ This driver will NOT work with the 3C339 Token Ring cards, you need
+to use the tms380 driver instead.
+
+Options:
+
+The driver accepts three options: ringspeed, pkt_buf_sz and message_level.
+
+These options can be specified differently for each card found.
+
+ringspeed: Has one of three settings 0 (default), 4 or 16. 0 will
+make the card autosense the ringspeed and join at the appropriate speed,
+this will be the default option for most people. 4 or 16 allow you to
+explicitly force the card to operate at a certain speed. The card will fail
+if you try to insert it at the wrong speed. (Although some hubs will allow
+this so be *very* careful). The main purpose for explicitly setting the ring
+speed is for when the card is first on the ring. In autosense mode, if the card
+cannot detect any active monitors on the ring it will open at the same speed as
+its last opening. This can be hazardous if this speed does not match the speed
+you want the ring to operate at.
+
+pkt_buf_sz: This is this initial receive buffer allocation size. This will
+default to 4096 if no value is entered. You may increase performance of the
+driver by setting this to a value larger than the network packet size, although
+the driver now re-sizes buffers based on MTU settings as well.
+
+message_level: Controls level of messages created by the driver. Defaults to 0:
+which only displays start-up and critical messages. Presently any non-zero
+value will display all soft messages as well. NB This does not turn
+debugging messages on, that must be done by modified the source code.
+
+Variable MTU size:
+
+The driver can handle a MTU size upto either 4500 or 18000 depending upon
+ring speed. The driver also changes the size of the receive buffers as part
+of the mtu re-sizing, so if you set mtu = 18000, you will need to be able
+to allocate 16 * (sk_buff with 18000 buffer size) call it 18500 bytes per ring
+position = 296,000 bytes of memory space, plus of course anything
+necessary for the tx sk_buff's. Remember this is per card, so if you are
+building routers, gateway's etc, you could start to use a lot of memory
+real fast.
+
+2/17/02 Mike Phillips
+
diff --git a/Documentation/networking/3c505.txt b/Documentation/networking/3c505.txt
new file mode 100644
index 0000000..72f38b1
--- /dev/null
+++ b/Documentation/networking/3c505.txt
@@ -0,0 +1,45 @@
+The 3Com Etherlink Plus (3c505) driver.
+
+This driver now uses DMA. There is currently no support for PIO operation.
+The default DMA channel is 6; this is _not_ autoprobed, so you must
+make sure you configure it correctly. If loading the driver as a
+module, you can do this with "modprobe 3c505 dma=n". If the driver is
+linked statically into the kernel, you must either use an "ether="
+statement on the command line, or change the definition of ELP_DMA in 3c505.h.
+
+The driver will warn you if it has to fall back on the compiled in
+default DMA channel.
+
+If no base address is given at boot time, the driver will autoprobe
+ports 0x300, 0x280 and 0x310 (in that order). If no IRQ is given, the driver
+will try to probe for it.
+
+The driver can be used as a loadable module.
+
+Theoretically, one instance of the driver can now run multiple cards,
+in the standard way (when loading a module, say "modprobe 3c505
+io=0x300,0x340 irq=10,11 dma=6,7" or whatever). I have not tested
+this, though.
+
+The driver may now support revision 2 hardware; the dependency on
+being able to read the host control register has been removed. This
+is also untested, since I don't have a suitable card.
+
+Known problems:
+ I still see "DMA upload timed out" messages from time to time. These
+seem to be fairly non-fatal though.
+ The card is old and slow.
+
+To do:
+ Improve probe/setup code
+ Test multicast and promiscuous operation
+
+Authors:
+ The driver is mainly written by Craig Southeren, email
+ <craigs@ineluki.apana.org.au>.
+ Parts of the driver (adapting the driver to 1.1.4+ kernels,
+ IRQ/address detection, some changes) and this README by
+ Juha Laiho <jlaiho@ichaos.nullnet.fi>.
+ DMA mode, more fixes, etc, by Philip Blundell <pjb27@cam.ac.uk>
+ Multicard support, Software configurable DMA, etc., by
+ Christopher Collins <ccollins@pcug.org.au>
diff --git a/Documentation/networking/3c509.txt b/Documentation/networking/3c509.txt
new file mode 100644
index 0000000..0643e3b
--- /dev/null
+++ b/Documentation/networking/3c509.txt
@@ -0,0 +1,210 @@
+Linux and the 3Com EtherLink III Series Ethercards (driver v1.18c and higher)
+----------------------------------------------------------------------------
+
+This file contains the instructions and caveats for v1.18c and higher versions
+of the 3c509 driver. You should not use the driver without reading this file.
+
+release 1.0
+28 February 2002
+Current maintainer (corrections to):
+ David Ruggiero <jdr@farfalle.com>
+
+----------------------------------------------------------------------------
+
+(0) Introduction
+
+The following are notes and information on using the 3Com EtherLink III series
+ethercards in Linux. These cards are commonly known by the most widely-used
+card's 3Com model number, 3c509. They are all 10mb/s ISA-bus cards and shouldn't
+be (but sometimes are) confused with the similarly-numbered PCI-bus "3c905"
+(aka "Vortex" or "Boomerang") series. Kernel support for the 3c509 family is
+provided by the module 3c509.c, which has code to support all of the following
+models:
+
+ 3c509 (original ISA card)
+ 3c509B (later revision of the ISA card; supports full-duplex)
+ 3c589 (PCMCIA)
+ 3c589B (later revision of the 3c589; supports full-duplex)
+ 3c529 (MCA)
+ 3c579 (EISA)
+
+Large portions of this documentation were heavily borrowed from the guide
+written the original author of the 3c509 driver, Donald Becker. The master
+copy of that document, which contains notes on older versions of the driver,
+currently resides on Scyld web server: http://www.scyld.com/network/3c509.html.
+
+
+(1) Special Driver Features
+
+Overriding card settings
+
+The driver allows boot- or load-time overriding of the card's detected IOADDR,
+IRQ, and transceiver settings, although this capability shouldn't generally be
+needed except to enable full-duplex mode (see below). An example of the syntax
+for LILO parameters for doing this:
+
+ ether=10,0x310,3,0x3c509,eth0
+
+This configures the first found 3c509 card for IRQ 10, base I/O 0x310, and
+transceiver type 3 (10base2). The flag "0x3c509" must be set to avoid conflicts
+with other card types when overriding the I/O address. When the driver is
+loaded as a module, only the IRQ and transceiver setting may be overridden.
+For example, setting two cards to 10base2/IRQ10 and AUI/IRQ11 is done by using
+the xcvr and irq module options:
+
+ options 3c509 xcvr=3,1 irq=10,11
+
+
+(2) Full-duplex mode
+
+The v1.18c driver added support for the 3c509B's full-duplex capabilities.
+In order to enable and successfully use full-duplex mode, three conditions
+must be met:
+
+(a) You must have a Etherlink III card model whose hardware supports full-
+duplex operations. Currently, the only members of the 3c509 family that are
+positively known to support full-duplex are the 3c509B (ISA bus) and 3c589B
+(PCMCIA) cards. Cards without the "B" model designation do *not* support
+full-duplex mode; these include the original 3c509 (no "B"), the original
+3c589, the 3c529 (MCA bus), and the 3c579 (EISA bus).
+
+(b) You must be using your card's 10baseT transceiver (i.e., the RJ-45
+connector), not its AUI (thick-net) or 10base2 (thin-net/coax) interfaces.
+AUI and 10base2 network cabling is physically incapable of full-duplex
+operation.
+
+(c) Most importantly, your 3c509B must be connected to a link partner that is
+itself full-duplex capable. This is almost certainly one of two things: a full-
+duplex-capable Ethernet switch (*not* a hub), or a full-duplex-capable NIC on
+another system that's connected directly to the 3c509B via a crossover cable.
+
+/////Extremely important caution concerning full-duplex mode/////
+Understand that the 3c509B's hardware's full-duplex support is much more
+limited than that provide by more modern network interface cards. Although
+at the physical layer of the network it fully supports full-duplex operation,
+the card was designed before the current Ethernet auto-negotiation (N-way)
+spec was written. This means that the 3c509B family ***cannot and will not
+auto-negotiate a full-duplex connection with its link partner under any
+circumstances, no matter how it is initialized***. If the full-duplex mode
+of the 3c509B is enabled, its link partner will very likely need to be
+independently _forced_ into full-duplex mode as well; otherwise various nasty
+failures will occur - at the very least, you'll see massive numbers of packet
+collisions. This is one of very rare circumstances where disabling auto-
+negotiation and forcing the duplex mode of a network interface card or switch
+would ever be necessary or desirable.
+
+
+(3) Available Transceiver Types
+
+For versions of the driver v1.18c and above, the available transceiver types are:
+
+0 transceiver type from EEPROM config (normally 10baseT); force half-duplex
+1 AUI (thick-net / DB15 connector)
+2 (undefined)
+3 10base2 (thin-net == coax / BNC connector)
+4 10baseT (RJ-45 connector); force half-duplex mode
+8 transceiver type and duplex mode taken from card's EEPROM config settings
+12 10baseT (RJ-45 connector); force full-duplex mode
+
+Prior to driver version 1.18c, only transceiver codes 0-4 were supported. Note
+that the new transceiver codes 8 and 12 are the *only* ones that will enable
+full-duplex mode, no matter what the card's detected EEPROM settings might be.
+This insured that merely upgrading the driver from an earlier version would
+never automatically enable full-duplex mode in an existing installation;
+it must always be explicitly enabled via one of these code in order to be
+activated.
+
+
+(4a) Interpretation of error messages and common problems
+
+Error Messages
+
+eth0: Infinite loop in interrupt, status 2011.
+These are "mostly harmless" message indicating that the driver had too much
+work during that interrupt cycle. With a status of 0x2011 you are receiving
+packets faster than they can be removed from the card. This should be rare
+or impossible in normal operation. Possible causes of this error report are:
+
+ - a "green" mode enabled that slows the processor down when there is no
+ keyboard activity.
+
+ - some other device or device driver hogging the bus or disabling interrupts.
+ Check /proc/interrupts for excessive interrupt counts. The timer tick
+ interrupt should always be incrementing faster than the others.
+
+No received packets
+If a 3c509, 3c562 or 3c589 can successfully transmit packets, but never
+receives packets (as reported by /proc/net/dev or 'ifconfig') you likely
+have an interrupt line problem. Check /proc/interrupts to verify that the
+card is actually generating interrupts. If the interrupt count is not
+increasing you likely have a physical conflict with two devices trying to
+use the same ISA IRQ line. The common conflict is with a sound card on IRQ10
+or IRQ5, and the easiest solution is to move the 3c509 to a different
+interrupt line. If the device is receiving packets but 'ping' doesn't work,
+you have a routing problem.
+
+Tx Carrier Errors Reported in /proc/net/dev
+If an EtherLink III appears to transmit packets, but the "Tx carrier errors"
+field in /proc/net/dev increments as quickly as the Tx packet count, you
+likely have an unterminated network or the incorrect media transceiver selected.
+
+3c509B card is not detected on machines with an ISA PnP BIOS.
+While the updated driver works with most PnP BIOS programs, it does not work
+with all. This can be fixed by disabling PnP support using the 3Com-supplied
+setup program.
+
+3c509 card is not detected on overclocked machines
+Increase the delay time in id_read_eeprom() from the current value, 500,
+to an absurdly high value, such as 5000.
+
+
+(4b) Decoding Status and Error Messages
+
+The bits in the main status register are:
+
+value description
+0x01 Interrupt latch
+0x02 Tx overrun, or Rx underrun
+0x04 Tx complete
+0x08 Tx FIFO room available
+0x10 A complete Rx packet has arrived
+0x20 A Rx packet has started to arrive
+0x40 The driver has requested an interrupt
+0x80 Statistics counter nearly full
+
+The bits in the transmit (Tx) status word are:
+
+value description
+0x02 Out-of-window collision.
+0x04 Status stack overflow (normally impossible).
+0x08 16 collisions.
+0x10 Tx underrun (not enough PCI bus bandwidth).
+0x20 Tx jabber.
+0x40 Tx interrupt requested.
+0x80 Status is valid (this should always be set).
+
+
+When a transmit error occurs the driver produces a status message such as
+
+ eth0: Transmit error, Tx status register 82
+
+The two values typically seen here are:
+
+0x82
+Out of window collision. This typically occurs when some other Ethernet
+host is incorrectly set to full duplex on a half duplex network.
+
+0x88
+16 collisions. This typically occurs when the network is exceptionally busy
+or when another host doesn't correctly back off after a collision. If this
+error is mixed with 0x82 errors it is the result of a host incorrectly set
+to full duplex (see above).
+
+Both of these errors are the result of network problems that should be
+corrected. They do not represent driver malfunction.
+
+
+(5) Revision history (this file)
+
+28Feb02 v1.0 DR New; major portions based on Becker original 3c509 docs
+
diff --git a/Documentation/networking/6pack.txt b/Documentation/networking/6pack.txt
new file mode 100644
index 0000000..d0777a1
--- /dev/null
+++ b/Documentation/networking/6pack.txt
@@ -0,0 +1,175 @@
+This is the 6pack-mini-HOWTO, written by
+
+Andreas Könsgen DG3KQ
+Internet: ajk@iehk.rwth-aachen.de
+AMPR-net: dg3kq@db0pra.ampr.org
+AX.25: dg3kq@db0ach.#nrw.deu.eu
+
+Last update: April 7, 1998
+
+1. What is 6pack, and what are the advantages to KISS?
+
+6pack is a transmission protocol for data exchange between the PC and
+the TNC over a serial line. It can be used as an alternative to KISS.
+
+6pack has two major advantages:
+- The PC is given full control over the radio
+ channel. Special control data is exchanged between the PC and the TNC so
+ that the PC knows at any time if the TNC is receiving data, if a TNC
+ buffer underrun or overrun has occurred, if the PTT is
+ set and so on. This control data is processed at a higher priority than
+ normal data, so a data stream can be interrupted at any time to issue an
+ important event. This helps to improve the channel access and timing
+ algorithms as everything is computed in the PC. It would even be possible
+ to experiment with something completely different from the known CSMA and
+ DAMA channel access methods.
+ This kind of real-time control is especially important to supply several
+ TNCs that are connected between each other and the PC by a daisy chain
+ (however, this feature is not supported yet by the Linux 6pack driver).
+
+- Each packet transferred over the serial line is supplied with a checksum,
+ so it is easy to detect errors due to problems on the serial line.
+ Received packets that are corrupt are not passed on to the AX.25 layer.
+ Damaged packets that the TNC has received from the PC are not transmitted.
+
+More details about 6pack are described in the file 6pack.ps that is located
+in the doc directory of the AX.25 utilities package.
+
+2. Who has developed the 6pack protocol?
+
+The 6pack protocol has been developed by Ekki Plicht DF4OR, Henning Rech
+DF9IC and Gunter Jost DK7WJ. A driver for 6pack, written by Gunter Jost and
+Matthias Welwarsky DG2FEF, comes along with the PC version of FlexNet.
+They have also written a firmware for TNCs to perform the 6pack
+protocol (see section 4 below).
+
+3. Where can I get the latest version of 6pack for LinuX?
+
+At the moment, the 6pack stuff can obtained via anonymous ftp from
+db0bm.automation.fh-aachen.de. In the directory /incoming/dg3kq,
+there is a file named 6pack.tgz.
+
+4. Preparing the TNC for 6pack operation
+
+To be able to use 6pack, a special firmware for the TNC is needed. The EPROM
+of a newly bought TNC does not contain 6pack, so you will have to
+program an EPROM yourself. The image file for 6pack EPROMs should be
+available on any packet radio box where PC/FlexNet can be found. The name of
+the file is 6pack.bin. This file is copyrighted and maintained by the FlexNet
+team. It can be used under the terms of the license that comes along
+with PC/FlexNet. Please do not ask me about the internals of this file as I
+don't know anything about it. I used a textual description of the 6pack
+protocol to program the Linux driver.
+
+TNCs contain a 64kByte EPROM, the lower half of which is used for
+the firmware/KISS. The upper half is either empty or is sometimes
+programmed with software called TAPR. In the latter case, the TNC
+is supplied with a DIP switch so you can easily change between the
+two systems. When programming a new EPROM, one of the systems is replaced
+by 6pack. It is useful to replace TAPR, as this software is rarely used
+nowadays. If your TNC is not equipped with the switch mentioned above, you
+can build in one yourself that switches over the highest address pin
+of the EPROM between HIGH and LOW level. After having inserted the new EPROM
+and switched to 6pack, apply power to the TNC for a first test. The connect
+and the status LED are lit for about a second if the firmware initialises
+the TNC correctly.
+
+5. Building and installing the 6pack driver
+
+The driver has been tested with kernel version 2.1.90. Use with older
+kernels may lead to a compilation error because the interface to a kernel
+function has been changed in the 2.1.8x kernels.
+
+How to turn on 6pack support:
+
+- In the linux kernel configuration program, select the code maturity level
+ options menu and turn on the prompting for development drivers.
+
+- Select the amateur radio support menu and turn on the serial port 6pack
+ driver.
+
+- Compile and install the kernel and the modules.
+
+To use the driver, the kissattach program delivered with the AX.25 utilities
+has to be modified.
+
+- Do a cd to the directory that holds the kissattach sources. Edit the
+ kissattach.c file. At the top, insert the following lines:
+
+ #ifndef N_6PACK
+ #define N_6PACK (N_AX25+1)
+ #endif
+
+ Then find the line
+
+ int disc = N_AX25;
+
+ and replace N_AX25 by N_6PACK.
+
+- Recompile kissattach. Rename it to spattach to avoid confusions.
+
+Installing the driver:
+
+- Do an insmod 6pack. Look at your /var/log/messages file to check if the
+ module has printed its initialization message.
+
+- Do a spattach as you would launch kissattach when starting a KISS port.
+ Check if the kernel prints the message '6pack: TNC found'.
+
+- From here, everything should work as if you were setting up a KISS port.
+ The only difference is that the network device that represents
+ the 6pack port is called sp instead of sl or ax. So, sp0 would be the
+ first 6pack port.
+
+Although the driver has been tested on various platforms, I still declare it
+ALPHA. BE CAREFUL! Sync your disks before insmoding the 6pack module
+and spattaching. Watch out if your computer behaves strangely. Read section
+6 of this file about known problems.
+
+Note that the connect and status LEDs of the TNC are controlled in a
+different way than they are when the TNC is used with PC/FlexNet. When using
+FlexNet, the connect LED is on if there is a connection; the status LED is
+on if there is data in the buffer of the PC's AX.25 engine that has to be
+transmitted. Under Linux, the 6pack layer is beyond the AX.25 layer,
+so the 6pack driver doesn't know anything about connects or data that
+has not yet been transmitted. Therefore the LEDs are controlled
+as they are in KISS mode: The connect LED is turned on if data is transferred
+from the PC to the TNC over the serial line, the status LED if data is
+sent to the PC.
+
+6. Known problems
+
+When testing the driver with 2.0.3x kernels and
+operating with data rates on the radio channel of 9600 Baud or higher,
+the driver may, on certain systems, sometimes print the message '6pack:
+bad checksum', which is due to data loss if the other station sends two
+or more subsequent packets. I have been told that this is due to a problem
+with the serial driver of 2.0.3x kernels. I don't know yet if the problem
+still exists with 2.1.x kernels, as I have heard that the serial driver
+code has been changed with 2.1.x.
+
+When shutting down the sp interface with ifconfig, the kernel crashes if
+there is still an AX.25 connection left over which an IP connection was
+running, even if that IP connection is already closed. The problem does not
+occur when there is a bare AX.25 connection still running. I don't know if
+this is a problem of the 6pack driver or something else in the kernel.
+
+The driver has been tested as a module, not yet as a kernel-builtin driver.
+
+The 6pack protocol supports daisy-chaining of TNCs in a token ring, which is
+connected to one serial port of the PC. This feature is not implemented
+and at least at the moment I won't be able to do it because I do not have
+the opportunity to build a TNC daisy-chain and test it.
+
+Some of the comments in the source code are inaccurate. They are left from
+the SLIP/KISS driver, from which the 6pack driver has been derived.
+I haven't modified or removed them yet -- sorry! The code itself needs
+some cleaning and optimizing. This will be done in a later release.
+
+If you encounter a bug or if you have a question or suggestion concerning the
+driver, feel free to mail me, using the addresses given at the beginning of
+this file.
+
+Have fun!
+
+Andreas
diff --git a/Documentation/networking/DLINK.txt b/Documentation/networking/DLINK.txt
new file mode 100644
index 0000000..55d2443
--- /dev/null
+++ b/Documentation/networking/DLINK.txt
@@ -0,0 +1,203 @@
+Released 1994-06-13
+
+
+ CONTENTS:
+
+ 1. Introduction.
+ 2. License.
+ 3. Files in this release.
+ 4. Installation.
+ 5. Problems and tuning.
+ 6. Using the drivers with earlier releases.
+ 7. Acknowledgments.
+
+
+ 1. INTRODUCTION.
+
+ This is a set of Ethernet drivers for the D-Link DE-600/DE-620
+ pocket adapters, for the parallel port on a Linux based machine.
+ Some adapter "clones" will also work. Xircom is _not_ a clone...
+ These drivers _can_ be used as loadable modules,
+ and were developed for use on Linux 1.1.13 and above.
+ For use on Linux 1.0.X, or earlier releases, see below.
+
+ I have used these drivers for NFS, ftp, telnet and X-clients on
+ remote machines. Transmissions with ftp seems to work as
+ good as can be expected (i.e. > 80k bytes/sec) from a
+ parallel port...:-) Receive speeds will be about 60-80% of this.
+ Depending on your machine, somewhat higher speeds can be achieved.
+
+ All comments/fixes to Bjorn Ekwall (bj0rn@blox.se).
+
+
+ 2. LICENSE.
+
+ This program is free software; you can redistribute it
+ and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ PURPOSE. See the GNU General Public License for more
+ details.
+
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA
+ 02139, USA.
+
+
+ 3. FILES IN THIS RELEASE.
+
+ README.DLINK This file.
+ de600.c The Source (may it be with You :-) for the DE-600
+ de620.c ditto for the DE-620
+ de620.h Macros for de620.c
+
+ If you are upgrading from the d-link tar release, there will
+ also be a "dlink-patches" file that will patch Linux 1.1.18:
+ linux/drivers/net/Makefile
+ linux/drivers/net/CONFIG
+ linux/drivers/net/MODULES
+ linux/drivers/net/Space.c
+ linux/config.in
+ Apply the patch by:
+ "cd /usr/src; patch -p0 < linux/drivers/net/dlink-patches"
+ The old source, "linux/drivers/net/d_link.c", can be removed.
+
+
+ 4. INSTALLATION.
+
+ o Get the latest net binaries, according to current net.wisdom.
+
+ o Read the NET-2 and Ethernet HOWTOs and modify your setup.
+
+ o If your parallel port has a strange address or irq,
+ modify "linux/drivers/net/CONFIG" accordingly, or adjust
+ the parameters in the "tuning" section in the sources.
+
+ If you are going to use the drivers as loadable modules, do _not_
+ enable them while doing "make config", but instead make sure that
+ the drivers are included in "linux/drivers/net/MODULES".
+
+ If you are _not_ going to use the driver(s) as loadable modules,
+ but instead have them included in the kernel, remember to enable
+ the drivers while doing "make config".
+
+ o To include networking and DE600/DE620 support in your kernel:
+ # cd /linux
+ (as modules:)
+ # make config (answer yes on CONFIG_NET and CONFIG_INET)
+ (else included in the kernel:)
+ # make config (answer yes on CONFIG _NET, _INET and _DE600 or _DE620)
+ # make clean
+ # make zImage (or whatever magic you usually do)
+
+ o I use lilo to boot multiple kernels, so that I at least
+ can have one working kernel :-). If you do too, append
+ these lines to /etc/lilo/config:
+
+ image = /linux/zImage
+ label = newlinux
+ root = /dev/hda2 (or whatever YOU have...)
+
+ # /etc/lilo/install
+
+ o Do "sync" and reboot the new kernel with a D-Link
+ DE-600/DE-620 pocket adapter connected.
+
+ o The adapter can be configured with ifconfig eth?
+ where the actual number is decided by the kernel
+ when the drivers are initialized.
+
+
+ 5. "PROBLEMS" AND TUNING,
+
+ o If you see error messages from the driver, and if the traffic
+ stops on the adapter, try to do "ifconfig" and "route" once
+ more, just as in "rc.inet1". This should take care of most
+ problems, including effects from power loss, or adapters that
+ aren't connected to the printer port in some way or another.
+ You can somewhat change the behaviour by enabling/disabling
+ the macro SHUTDOWN_WHEN_LOST in the "tuning" section.
+ For the DE-600 there is another macro, CHECK_LOST_DE600,
+ that you might want to read about in the "tuning" section.
+
+ o Some machines have trouble handling the parallel port and
+ the adapter at high speed. If you experience problems:
+
+ DE-600:
+ - The adapter is not recognized at boot, i.e. an Ethernet
+ address of 00:80:c8:... is not shown, try to add another
+ "; SLOW_DOWN_IO"
+ at DE600_SLOW_DOWN in the "tuning" section. As a last resort,
+ uncomment: "#define REALLY_SLOW_IO" (see <asm/io.h> for hints).
+
+ - You experience "timeout" messages: first try to add another
+ "; SLOW_DOWN_IO"
+ at DE600_SLOW_DOWN in the "tuning" section, _then_ try to
+ increase the value (original value: 5) at
+ "if (tickssofar < 5)" near line 422.
+
+ DE-620:
+ - Your parallel port might be "sluggish". To cater for
+ this, there are the macros LOWSPEED and READ_DELAY/WRITE_DELAY
+ in the "tuning" section. Your first step should be to enable
+ LOWSPEED, and after that you can "tune" the XXX_DELAY values.
+
+ o If the adapter _is_ recognized at boot but you get messages
+ about "Network Unreachable", then the problem is probably
+ _not_ with the driver. Check your net configuration instead
+ (ifconfig and route) in "rc.inet1".
+
+ o There is some rudimentary support for debugging, look at
+ the source. Use "-DDE600_DEBUG=3" or "-DDE620_DEBUG=3"
+ when compiling, or include it in "linux/drivers/net/CONFIG".
+ IF YOU HAVE PROBLEMS YOU CAN'T SOLVE: PLEASE COMPILE THE DRIVER
+ WITH DEBUGGING ENABLED, AND SEND ME THE RESULTING OUTPUT!
+
+
+ 6. USING THE DRIVERS WITH EARLIER RELEASES.
+
+ The later 1.1.X releases of the Linux kernel include some
+ changes in the networking layer (a.k.a. NET3). This affects
+ these drivers in a few places. The hints that follow are
+ _not_ tested by me, since I don't have the disk space to keep
+ all releases on-line.
+ Known needed changes to date:
+ - release patchfile: some patches will fail, but they should
+ be easy to apply "by hand", since they are trivial.
+ (Space.c: d_link_init() is now called de600_probe())
+ - de600.c: change "mark_bh(NET_BH)" to "mark_bh(INET_BH)".
+ - de620.c: (maybe) change the code around "netif_rx(skb);" to be
+ similar to the code around "dev_rint(...)" in de600.c
+
+
+ 7. ACKNOWLEDGMENTS.
+
+ These drivers wouldn't have been done without the base
+ (and support) from Ross Biro, and D-Link Systems Inc.
+ The driver relies upon GPL-ed source from D-Link Systems Inc.
+ and from Russel Nelson at Crynwr Software <nelson@crynwr.com>.
+
+ Additional input also from:
+ Donald Becker <becker@super.org>, Alan Cox <A.Cox@swansea.ac.uk>
+ and Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+
+ DE-600 alpha release primary victim^H^H^H^H^H^Htester:
+ - Erik Proper <erikp@cs.kun.nl>.
+ Good input also from several users, most notably
+ - Mark Burton <markb@ordern.demon.co.uk>.
+
+ DE-620 alpha release victims^H^H^H^H^H^H^Htesters:
+ - J. Joshua Kopper <kopper@rtsg.mot.com>
+ - Olav Kvittem <Olav.Kvittem@uninett.no>
+ - Germano Caronni <caronni@nessie.cs.id.ethz.ch>
+ - Jeremy Fitzhardinge <jeremy@suite.sw.oz.au>
+
+
+ Happy hacking!
+
+ Bjorn Ekwall == bj0rn@blox.se
diff --git a/Documentation/networking/LICENSE.qla3xxx b/Documentation/networking/LICENSE.qla3xxx
new file mode 100644
index 0000000..2f2077e
--- /dev/null
+++ b/Documentation/networking/LICENSE.qla3xxx
@@ -0,0 +1,46 @@
+Copyright (c) 2003-2006 QLogic Corporation
+QLogic Linux Networking HBA Driver
+
+This program includes a device driver for Linux 2.6 that may be
+distributed with QLogic hardware specific firmware binary file.
+You may modify and redistribute the device driver code under the
+GNU General Public License as published by the Free Software
+Foundation (version 2 or a later version).
+
+You may redistribute the hardware specific firmware binary file
+under the following terms:
+
+ 1. Redistribution of source code (only if applicable),
+ must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+
+ 2. Redistribution in binary form must reproduce the above
+ copyright notice, this list of conditions and the
+ following disclaimer in the documentation and/or other
+ materials provided with the distribution.
+
+ 3. The name of QLogic Corporation may not be used to
+ endorse or promote products derived from this software
+ without specific prior written permission
+
+REGARDLESS OF WHAT LICENSING MECHANISM IS USED OR APPLICABLE,
+THIS PROGRAM IS PROVIDED BY QLOGIC CORPORATION "AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+USER ACKNOWLEDGES AND AGREES THAT USE OF THIS PROGRAM WILL NOT
+CREATE OR GIVE GROUNDS FOR A LICENSE BY IMPLICATION, ESTOPPEL, OR
+OTHERWISE IN ANY INTELLECTUAL PROPERTY RIGHTS (PATENT, COPYRIGHT,
+TRADE SECRET, MASK WORK, OR OTHER PROPRIETARY RIGHT) EMBODIED IN
+ANY OTHER QLOGIC HARDWARE OR SOFTWARE EITHER SOLELY OR IN
+COMBINATION WITH THIS PROGRAM.
+
diff --git a/Documentation/networking/LICENSE.qlge b/Documentation/networking/LICENSE.qlge
new file mode 100644
index 0000000..123b6ed
--- /dev/null
+++ b/Documentation/networking/LICENSE.qlge
@@ -0,0 +1,46 @@
+Copyright (c) 2003-2008 QLogic Corporation
+QLogic Linux Networking HBA Driver
+
+This program includes a device driver for Linux 2.6 that may be
+distributed with QLogic hardware specific firmware binary file.
+You may modify and redistribute the device driver code under the
+GNU General Public License as published by the Free Software
+Foundation (version 2 or a later version).
+
+You may redistribute the hardware specific firmware binary file
+under the following terms:
+
+ 1. Redistribution of source code (only if applicable),
+ must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+
+ 2. Redistribution in binary form must reproduce the above
+ copyright notice, this list of conditions and the
+ following disclaimer in the documentation and/or other
+ materials provided with the distribution.
+
+ 3. The name of QLogic Corporation may not be used to
+ endorse or promote products derived from this software
+ without specific prior written permission
+
+REGARDLESS OF WHAT LICENSING MECHANISM IS USED OR APPLICABLE,
+THIS PROGRAM IS PROVIDED BY QLOGIC CORPORATION "AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+USER ACKNOWLEDGES AND AGREES THAT USE OF THIS PROGRAM WILL NOT
+CREATE OR GIVE GROUNDS FOR A LICENSE BY IMPLICATION, ESTOPPEL, OR
+OTHERWISE IN ANY INTELLECTUAL PROPERTY RIGHTS (PATENT, COPYRIGHT,
+TRADE SECRET, MASK WORK, OR OTHER PROPRIETARY RIGHT) EMBODIED IN
+ANY OTHER QLOGIC HARDWARE OR SOFTWARE EITHER SOLELY OR IN
+COMBINATION WITH THIS PROGRAM.
+
diff --git a/Documentation/networking/Makefile b/Documentation/networking/Makefile
new file mode 100644
index 0000000..6d8af1a
--- /dev/null
+++ b/Documentation/networking/Makefile
@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := ifenslave
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/Documentation/networking/PLIP.txt b/Documentation/networking/PLIP.txt
new file mode 100644
index 0000000..ad7e3f7
--- /dev/null
+++ b/Documentation/networking/PLIP.txt
@@ -0,0 +1,215 @@
+PLIP: The Parallel Line Internet Protocol Device
+
+Donald Becker (becker@super.org)
+I.D.A. Supercomputing Research Center, Bowie MD 20715
+
+At some point T. Thorn will probably contribute text,
+Tommy Thorn (tthorn@daimi.aau.dk)
+
+PLIP Introduction
+-----------------
+
+This document describes the parallel port packet pusher for Net/LGX.
+This device interface allows a point-to-point connection between two
+parallel ports to appear as a IP network interface.
+
+What is PLIP?
+=============
+
+PLIP is Parallel Line IP, that is, the transportation of IP packages
+over a parallel port. In the case of a PC, the obvious choice is the
+printer port. PLIP is a non-standard, but [can use] uses the standard
+LapLink null-printer cable [can also work in turbo mode, with a PLIP
+cable]. [The protocol used to pack IP packages, is a simple one
+initiated by Crynwr.]
+
+Advantages of PLIP
+==================
+
+It's cheap, it's available everywhere, and it's easy.
+
+The PLIP cable is all that's needed to connect two Linux boxes, and it
+can be built for very few bucks.
+
+Connecting two Linux boxes takes only a second's decision and a few
+minutes' work, no need to search for a [supported] netcard. This might
+even be especially important in the case of notebooks, where netcards
+are not easily available.
+
+Not requiring a netcard also means that apart from connecting the
+cables, everything else is software configuration [which in principle
+could be made very easy.]
+
+Disadvantages of PLIP
+=====================
+
+Doesn't work over a modem, like SLIP and PPP. Limited range, 15 m.
+Can only be used to connect three (?) Linux boxes. Doesn't connect to
+an existing Ethernet. Isn't standard (not even de facto standard, like
+SLIP).
+
+Performance
+===========
+
+PLIP easily outperforms Ethernet cards....(ups, I was dreaming, but
+it *is* getting late. EOB)
+
+PLIP driver details
+-------------------
+
+The Linux PLIP driver is an implementation of the original Crynwr protocol,
+that uses the parallel port subsystem of the kernel in order to properly
+share parallel ports between PLIP and other services.
+
+IRQs and trigger timeouts
+=========================
+
+When a parallel port used for a PLIP driver has an IRQ configured to it, the
+PLIP driver is signaled whenever data is sent to it via the cable, such that
+when no data is available, the driver isn't being used.
+
+However, on some machines it is hard, if not impossible, to configure an IRQ
+to a certain parallel port, mainly because it is used by some other device.
+On these machines, the PLIP driver can be used in IRQ-less mode, where
+the PLIP driver would constantly poll the parallel port for data waiting,
+and if such data is available, process it. This mode is less efficient than
+the IRQ mode, because the driver has to check the parallel port many times
+per second, even when no data at all is sent. Some rough measurements
+indicate that there isn't a noticeable performance drop when using IRQ-less
+mode as compared to IRQ mode as far as the data transfer speed is involved.
+There is a performance drop on the machine hosting the driver.
+
+When the PLIP driver is used in IRQ mode, the timeout used for triggering a
+data transfer (the maximal time the PLIP driver would allow the other side
+before announcing a timeout, when trying to handshake a transfer of some
+data) is, by default, 500usec. As IRQ delivery is more or less immediate,
+this timeout is quite sufficient.
+
+When in IRQ-less mode, the PLIP driver polls the parallel port HZ times
+per second (where HZ is typically 100 on most platforms, and 1024 on an
+Alpha, as of this writing). Between two such polls, there are 10^6/HZ usecs.
+On an i386, for example, 10^6/100 = 10000usec. It is easy to see that it is
+quite possible for the trigger timeout to expire between two such polls, as
+the timeout is only 500usec long. As a result, it is required to change the
+trigger timeout on the *other* side of a PLIP connection, to about
+10^6/HZ usecs. If both sides of a PLIP connection are used in IRQ-less mode,
+this timeout is required on both sides.
+
+It appears that in practice, the trigger timeout can be shorter than in the
+above calculation. It isn't an important issue, unless the wire is faulty,
+in which case a long timeout would stall the machine when, for whatever
+reason, bits are dropped.
+
+A utility that can perform this change in Linux is plipconfig, which is part
+of the net-tools package (its location can be found in the
+Documentation/Changes file). An example command would be
+'plipconfig plipX trigger 10000', where plipX is the appropriate
+PLIP device.
+
+PLIP hardware interconnection
+-----------------------------
+
+PLIP uses several different data transfer methods. The first (and the
+only one implemented in the early version of the code) uses a standard
+printer "null" cable to transfer data four bits at a time using
+data bit outputs connected to status bit inputs.
+
+The second data transfer method relies on both machines having
+bi-directional parallel ports, rather than output-only ``printer''
+ports. This allows byte-wide transfers and avoids reconstructing
+nibbles into bytes, leading to much faster transfers.
+
+Parallel Transfer Mode 0 Cable
+==============================
+
+The cable for the first transfer mode is a standard
+printer "null" cable which transfers data four bits at a time using
+data bit outputs of the first port (machine T) connected to the
+status bit inputs of the second port (machine R). There are five
+status inputs, and they are used as four data inputs and a clock (data
+strobe) input, arranged so that the data input bits appear as contiguous
+bits with standard status register implementation.
+
+A cable that implements this protocol is available commercially as a
+"Null Printer" or "Turbo Laplink" cable. It can be constructed with
+two DB-25 male connectors symmetrically connected as follows:
+
+ STROBE output 1*
+ D0->ERROR 2 - 15 15 - 2
+ D1->SLCT 3 - 13 13 - 3
+ D2->PAPOUT 4 - 12 12 - 4
+ D3->ACK 5 - 10 10 - 5
+ D4->BUSY 6 - 11 11 - 6
+ D5,D6,D7 are 7*, 8*, 9*
+ AUTOFD output 14*
+ INIT output 16*
+ SLCTIN 17 - 17
+ extra grounds are 18*,19*,20*,21*,22*,23*,24*
+ GROUND 25 - 25
+* Do not connect these pins on either end
+
+If the cable you are using has a metallic shield it should be
+connected to the metallic DB-25 shell at one end only.
+
+Parallel Transfer Mode 1
+========================
+
+The second data transfer method relies on both machines having
+bi-directional parallel ports, rather than output-only ``printer''
+ports. This allows byte-wide transfers, and avoids reconstructing
+nibbles into bytes. This cable should not be used on unidirectional
+``printer'' (as opposed to ``parallel'') ports or when the machine
+isn't configured for PLIP, as it will result in output driver
+conflicts and the (unlikely) possibility of damage.
+
+The cable for this transfer mode should be constructed as follows:
+
+ STROBE->BUSY 1 - 11
+ D0->D0 2 - 2
+ D1->D1 3 - 3
+ D2->D2 4 - 4
+ D3->D3 5 - 5
+ D4->D4 6 - 6
+ D5->D5 7 - 7
+ D6->D6 8 - 8
+ D7->D7 9 - 9
+ INIT -> ACK 16 - 10
+ AUTOFD->PAPOUT 14 - 12
+ SLCT->SLCTIN 13 - 17
+ GND->ERROR 18 - 15
+ extra grounds are 19*,20*,21*,22*,23*,24*
+ GROUND 25 - 25
+* Do not connect these pins on either end
+
+Once again, if the cable you are using has a metallic shield it should
+be connected to the metallic DB-25 shell at one end only.
+
+PLIP Mode 0 transfer protocol
+=============================
+
+The PLIP driver is compatible with the "Crynwr" parallel port transfer
+standard in Mode 0. That standard specifies the following protocol:
+
+ send header nibble '0x8'
+ count-low octet
+ count-high octet
+ ... data octets
+ checksum octet
+
+Each octet is sent as
+ <wait for rx. '0x1?'> <send 0x10+(octet&0x0F)>
+ <wait for rx. '0x0?'> <send 0x00+((octet>>4)&0x0F)>
+
+To start a transfer the transmitting machine outputs a nibble 0x08.
+That raises the ACK line, triggering an interrupt in the receiving
+machine. The receiving machine disables interrupts and raises its own ACK
+line.
+
+Restated:
+
+(OUT is bit 0-4, OUT.j is bit j from OUT. IN likewise)
+Send_Byte:
+ OUT := low nibble, OUT.4 := 1
+ WAIT FOR IN.4 = 1
+ OUT := high nibble, OUT.4 := 0
+ WAIT FOR IN.4 = 0
diff --git a/Documentation/networking/README.ipw2100 b/Documentation/networking/README.ipw2100
new file mode 100644
index 0000000..f3fcaa4
--- /dev/null
+++ b/Documentation/networking/README.ipw2100
@@ -0,0 +1,294 @@
+
+Intel(R) PRO/Wireless 2100 Driver for Linux in support of:
+
+Intel(R) PRO/Wireless 2100 Network Connection
+
+Copyright (C) 2003-2006, Intel Corporation
+
+README.ipw2100
+
+Version: git-1.1.5
+Date : January 25, 2006
+
+Index
+-----------------------------------------------
+0. IMPORTANT INFORMATION BEFORE USING THIS DRIVER
+1. Introduction
+2. Release git-1.1.5 Current Features
+3. Command Line Parameters
+4. Sysfs Helper Files
+5. Radio Kill Switch
+6. Dynamic Firmware
+7. Power Management
+8. Support
+9. License
+
+
+0. IMPORTANT INFORMATION BEFORE USING THIS DRIVER
+-----------------------------------------------
+
+Important Notice FOR ALL USERS OR DISTRIBUTORS!!!!
+
+Intel wireless LAN adapters are engineered, manufactured, tested, and
+quality checked to ensure that they meet all necessary local and
+governmental regulatory agency requirements for the regions that they
+are designated and/or marked to ship into. Since wireless LANs are
+generally unlicensed devices that share spectrum with radars,
+satellites, and other licensed and unlicensed devices, it is sometimes
+necessary to dynamically detect, avoid, and limit usage to avoid
+interference with these devices. In many instances Intel is required to
+provide test data to prove regional and local compliance to regional and
+governmental regulations before certification or approval to use the
+product is granted. Intel's wireless LAN's EEPROM, firmware, and
+software driver are designed to carefully control parameters that affect
+radio operation and to ensure electromagnetic compliance (EMC). These
+parameters include, without limitation, RF power, spectrum usage,
+channel scanning, and human exposure.
+
+For these reasons Intel cannot permit any manipulation by third parties
+of the software provided in binary format with the wireless WLAN
+adapters (e.g., the EEPROM and firmware). Furthermore, if you use any
+patches, utilities, or code with the Intel wireless LAN adapters that
+have been manipulated by an unauthorized party (i.e., patches,
+utilities, or code (including open source code modifications) which have
+not been validated by Intel), (i) you will be solely responsible for
+ensuring the regulatory compliance of the products, (ii) Intel will bear
+no liability, under any theory of liability for any issues associated
+with the modified products, including without limitation, claims under
+the warranty and/or issues arising from regulatory non-compliance, and
+(iii) Intel will not provide or be required to assist in providing
+support to any third parties for such modified products.
+
+Note: Many regulatory agencies consider Wireless LAN adapters to be
+modules, and accordingly, condition system-level regulatory approval
+upon receipt and review of test data documenting that the antennas and
+system configuration do not cause the EMC and radio operation to be
+non-compliant.
+
+The drivers available for download from SourceForge are provided as a
+part of a development project. Conformance to local regulatory
+requirements is the responsibility of the individual developer. As
+such, if you are interested in deploying or shipping a driver as part of
+solution intended to be used for purposes other than development, please
+obtain a tested driver from Intel Customer Support at:
+
+http://support.intel.com/support/notebook/sb/CS-006408.htm
+
+
+1. Introduction
+-----------------------------------------------
+
+This document provides a brief overview of the features supported by the
+IPW2100 driver project. The main project website, where the latest
+development version of the driver can be found, is:
+
+ http://ipw2100.sourceforge.net
+
+There you can find the not only the latest releases, but also information about
+potential fixes and patches, as well as links to the development mailing list
+for the driver project.
+
+
+2. Release git-1.1.5 Current Supported Features
+-----------------------------------------------
+- Managed (BSS) and Ad-Hoc (IBSS)
+- WEP (shared key and open)
+- Wireless Tools support
+- 802.1x (tested with XSupplicant 1.0.1)
+
+Enabled (but not supported) features:
+- Monitor/RFMon mode
+- WPA/WPA2
+
+The distinction between officially supported and enabled is a reflection
+on the amount of validation and interoperability testing that has been
+performed on a given feature.
+
+
+3. Command Line Parameters
+-----------------------------------------------
+
+If the driver is built as a module, the following optional parameters are used
+by entering them on the command line with the modprobe command using this
+syntax:
+
+ modprobe ipw2100 [<option>=<VAL1><,VAL2>...]
+
+For example, to disable the radio on driver loading, enter:
+
+ modprobe ipw2100 disable=1
+
+The ipw2100 driver supports the following module parameters:
+
+Name Value Example:
+debug 0x0-0xffffffff debug=1024
+mode 0,1,2 mode=1 /* AdHoc */
+channel int channel=3 /* Only valid in AdHoc or Monitor */
+associate boolean associate=0 /* Do NOT auto associate */
+disable boolean disable=1 /* Do not power the HW */
+
+
+4. Sysfs Helper Files
+---------------------------
+-----------------------------------------------
+
+There are several ways to control the behavior of the driver. Many of the
+general capabilities are exposed through the Wireless Tools (iwconfig). There
+are a few capabilities that are exposed through entries in the Linux Sysfs.
+
+
+----- Driver Level ------
+For the driver level files, look in /sys/bus/pci/drivers/ipw2100/
+
+ debug_level
+
+ This controls the same global as the 'debug' module parameter. For
+ information on the various debugging levels available, run the 'dvals'
+ script found in the driver source directory.
+
+ NOTE: 'debug_level' is only enabled if CONFIG_IPW2100_DEBUG is turn
+ on.
+
+----- Device Level ------
+For the device level files look in
+
+ /sys/bus/pci/drivers/ipw2100/{PCI-ID}/
+
+For example:
+ /sys/bus/pci/drivers/ipw2100/0000:02:01.0
+
+For the device level files, see /sys/bus/pci/drivers/ipw2100:
+
+ rf_kill
+ read -
+ 0 = RF kill not enabled (radio on)
+ 1 = SW based RF kill active (radio off)
+ 2 = HW based RF kill active (radio off)
+ 3 = Both HW and SW RF kill active (radio off)
+ write -
+ 0 = If SW based RF kill active, turn the radio back on
+ 1 = If radio is on, activate SW based RF kill
+
+ NOTE: If you enable the SW based RF kill and then toggle the HW
+ based RF kill from ON -> OFF -> ON, the radio will NOT come back on
+
+
+5. Radio Kill Switch
+-----------------------------------------------
+Most laptops provide the ability for the user to physically disable the radio.
+Some vendors have implemented this as a physical switch that requires no
+software to turn the radio off and on. On other laptops, however, the switch
+is controlled through a button being pressed and a software driver then making
+calls to turn the radio off and on. This is referred to as a "software based
+RF kill switch"
+
+See the Sysfs helper file 'rf_kill' for determining the state of the RF switch
+on your system.
+
+
+6. Dynamic Firmware
+-----------------------------------------------
+As the firmware is licensed under a restricted use license, it can not be
+included within the kernel sources. To enable the IPW2100 you will need a
+firmware image to load into the wireless NIC's processors.
+
+You can obtain these images from <http://ipw2100.sf.net/firmware.php>.
+
+See INSTALL for instructions on installing the firmware.
+
+
+7. Power Management
+-----------------------------------------------
+The IPW2100 supports the configuration of the Power Save Protocol
+through a private wireless extension interface. The IPW2100 supports
+the following different modes:
+
+ off No power management. Radio is always on.
+ on Automatic power management
+ 1-5 Different levels of power management. The higher the
+ number the greater the power savings, but with an impact to
+ packet latencies.
+
+Power management works by powering down the radio after a certain
+interval of time has passed where no packets are passed through the
+radio. Once powered down, the radio remains in that state for a given
+period of time. For higher power savings, the interval between last
+packet processed to sleep is shorter and the sleep period is longer.
+
+When the radio is asleep, the access point sending data to the station
+must buffer packets at the AP until the station wakes up and requests
+any buffered packets. If you have an AP that does not correctly support
+the PSP protocol you may experience packet loss or very poor performance
+while power management is enabled. If this is the case, you will need
+to try and find a firmware update for your AP, or disable power
+management (via `iwconfig eth1 power off`)
+
+To configure the power level on the IPW2100 you use a combination of
+iwconfig and iwpriv. iwconfig is used to turn power management on, off,
+and set it to auto.
+
+ iwconfig eth1 power off Disables radio power down
+ iwconfig eth1 power on Enables radio power management to
+ last set level (defaults to AUTO)
+ iwpriv eth1 set_power 0 Sets power level to AUTO and enables
+ power management if not previously
+ enabled.
+ iwpriv eth1 set_power 1-5 Set the power level as specified,
+ enabling power management if not
+ previously enabled.
+
+You can view the current power level setting via:
+
+ iwpriv eth1 get_power
+
+It will return the current period or timeout that is configured as a string
+in the form of xxxx/yyyy (z) where xxxx is the timeout interval (amount of
+time after packet processing), yyyy is the period to sleep (amount of time to
+wait before powering the radio and querying the access point for buffered
+packets), and z is the 'power level'. If power management is turned off the
+xxxx/yyyy will be replaced with 'off' -- the level reported will be the active
+level if `iwconfig eth1 power on` is invoked.
+
+
+8. Support
+-----------------------------------------------
+
+For general development information and support,
+go to:
+
+ http://ipw2100.sf.net/
+
+The ipw2100 1.1.0 driver and firmware can be downloaded from:
+
+ http://support.intel.com
+
+For installation support on the ipw2100 1.1.0 driver on Linux kernels
+2.6.8 or greater, email support is available from:
+
+ http://supportmail.intel.com
+
+9. License
+-----------------------------------------------
+
+ Copyright(c) 2003 - 2006 Intel Corporation. All rights reserved.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License (version 2) as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ License Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
diff --git a/Documentation/networking/README.ipw2200 b/Documentation/networking/README.ipw2200
new file mode 100644
index 0000000..4f2a40f
--- /dev/null
+++ b/Documentation/networking/README.ipw2200
@@ -0,0 +1,472 @@
+
+Intel(R) PRO/Wireless 2915ABG Driver for Linux in support of:
+
+Intel(R) PRO/Wireless 2200BG Network Connection
+Intel(R) PRO/Wireless 2915ABG Network Connection
+
+Note: The Intel(R) PRO/Wireless 2915ABG Driver for Linux and Intel(R)
+PRO/Wireless 2200BG Driver for Linux is a unified driver that works on
+both hardware adapters listed above. In this document the Intel(R)
+PRO/Wireless 2915ABG Driver for Linux will be used to reference the
+unified driver.
+
+Copyright (C) 2004-2006, Intel Corporation
+
+README.ipw2200
+
+Version: 1.1.2
+Date : March 30, 2006
+
+
+Index
+-----------------------------------------------
+0. IMPORTANT INFORMATION BEFORE USING THIS DRIVER
+1. Introduction
+1.1. Overview of features
+1.2. Module parameters
+1.3. Wireless Extension Private Methods
+1.4. Sysfs Helper Files
+1.5. Supported channels
+2. Ad-Hoc Networking
+3. Interacting with Wireless Tools
+3.1. iwconfig mode
+3.2. iwconfig sens
+4. About the Version Numbers
+5. Firmware installation
+6. Support
+7. License
+
+
+0. IMPORTANT INFORMATION BEFORE USING THIS DRIVER
+-----------------------------------------------
+
+Important Notice FOR ALL USERS OR DISTRIBUTORS!!!!
+
+Intel wireless LAN adapters are engineered, manufactured, tested, and
+quality checked to ensure that they meet all necessary local and
+governmental regulatory agency requirements for the regions that they
+are designated and/or marked to ship into. Since wireless LANs are
+generally unlicensed devices that share spectrum with radars,
+satellites, and other licensed and unlicensed devices, it is sometimes
+necessary to dynamically detect, avoid, and limit usage to avoid
+interference with these devices. In many instances Intel is required to
+provide test data to prove regional and local compliance to regional and
+governmental regulations before certification or approval to use the
+product is granted. Intel's wireless LAN's EEPROM, firmware, and
+software driver are designed to carefully control parameters that affect
+radio operation and to ensure electromagnetic compliance (EMC). These
+parameters include, without limitation, RF power, spectrum usage,
+channel scanning, and human exposure.
+
+For these reasons Intel cannot permit any manipulation by third parties
+of the software provided in binary format with the wireless WLAN
+adapters (e.g., the EEPROM and firmware). Furthermore, if you use any
+patches, utilities, or code with the Intel wireless LAN adapters that
+have been manipulated by an unauthorized party (i.e., patches,
+utilities, or code (including open source code modifications) which have
+not been validated by Intel), (i) you will be solely responsible for
+ensuring the regulatory compliance of the products, (ii) Intel will bear
+no liability, under any theory of liability for any issues associated
+with the modified products, including without limitation, claims under
+the warranty and/or issues arising from regulatory non-compliance, and
+(iii) Intel will not provide or be required to assist in providing
+support to any third parties for such modified products.
+
+Note: Many regulatory agencies consider Wireless LAN adapters to be
+modules, and accordingly, condition system-level regulatory approval
+upon receipt and review of test data documenting that the antennas and
+system configuration do not cause the EMC and radio operation to be
+non-compliant.
+
+The drivers available for download from SourceForge are provided as a
+part of a development project. Conformance to local regulatory
+requirements is the responsibility of the individual developer. As
+such, if you are interested in deploying or shipping a driver as part of
+solution intended to be used for purposes other than development, please
+obtain a tested driver from Intel Customer Support at:
+
+http://support.intel.com/support/notebook/sb/CS-006408.htm
+
+
+1. Introduction
+-----------------------------------------------
+The following sections attempt to provide a brief introduction to using
+the Intel(R) PRO/Wireless 2915ABG Driver for Linux.
+
+This document is not meant to be a comprehensive manual on
+understanding or using wireless technologies, but should be sufficient
+to get you moving without wires on Linux.
+
+For information on building and installing the driver, see the INSTALL
+file.
+
+
+1.1. Overview of Features
+-----------------------------------------------
+The current release (1.1.2) supports the following features:
+
++ BSS mode (Infrastructure, Managed)
++ IBSS mode (Ad-Hoc)
++ WEP (OPEN and SHARED KEY mode)
++ 802.1x EAP via wpa_supplicant and xsupplicant
++ Wireless Extension support
++ Full B and G rate support (2200 and 2915)
++ Full A rate support (2915 only)
++ Transmit power control
++ S state support (ACPI suspend/resume)
+
+The following features are currently enabled, but not officially
+supported:
+
++ WPA
++ long/short preamble support
++ Monitor mode (aka RFMon)
+
+The distinction between officially supported and enabled is a reflection
+on the amount of validation and interoperability testing that has been
+performed on a given feature.
+
+
+
+1.2. Command Line Parameters
+-----------------------------------------------
+
+Like many modules used in the Linux kernel, the Intel(R) PRO/Wireless
+2915ABG Driver for Linux allows configuration options to be provided
+as module parameters. The most common way to specify a module parameter
+is via the command line.
+
+The general form is:
+
+% modprobe ipw2200 parameter=value
+
+Where the supported parameter are:
+
+ associate
+ Set to 0 to disable the auto scan-and-associate functionality of the
+ driver. If disabled, the driver will not attempt to scan
+ for and associate to a network until it has been configured with
+ one or more properties for the target network, for example configuring
+ the network SSID. Default is 1 (auto-associate)
+
+ Example: % modprobe ipw2200 associate=0
+
+ auto_create
+ Set to 0 to disable the auto creation of an Ad-Hoc network
+ matching the channel and network name parameters provided.
+ Default is 1.
+
+ channel
+ channel number for association. The normal method for setting
+ the channel would be to use the standard wireless tools
+ (i.e. `iwconfig eth1 channel 10`), but it is useful sometimes
+ to set this while debugging. Channel 0 means 'ANY'
+
+ debug
+ If using a debug build, this is used to control the amount of debug
+ info is logged. See the 'dvals' and 'load' script for more info on
+ how to use this (the dvals and load scripts are provided as part
+ of the ipw2200 development snapshot releases available from the
+ SourceForge project at http://ipw2200.sf.net)
+
+ led
+ Can be used to turn on experimental LED code.
+ 0 = Off, 1 = On. Default is 0.
+
+ mode
+ Can be used to set the default mode of the adapter.
+ 0 = Managed, 1 = Ad-Hoc, 2 = Monitor
+
+
+1.3. Wireless Extension Private Methods
+-----------------------------------------------
+
+As an interface designed to handle generic hardware, there are certain
+capabilities not exposed through the normal Wireless Tool interface. As
+such, a provision is provided for a driver to declare custom, or
+private, methods. The Intel(R) PRO/Wireless 2915ABG Driver for Linux
+defines several of these to configure various settings.
+
+The general form of using the private wireless methods is:
+
+ % iwpriv $IFNAME method parameters
+
+Where $IFNAME is the interface name the device is registered with
+(typically eth1, customized via one of the various network interface
+name managers, such as ifrename)
+
+The supported private methods are:
+
+ get_mode
+ Can be used to report out which IEEE mode the driver is
+ configured to support. Example:
+
+ % iwpriv eth1 get_mode
+ eth1 get_mode:802.11bg (6)
+
+ set_mode
+ Can be used to configure which IEEE mode the driver will
+ support.
+
+ Usage:
+ % iwpriv eth1 set_mode {mode}
+ Where {mode} is a number in the range 1-7:
+ 1 802.11a (2915 only)
+ 2 802.11b
+ 3 802.11ab (2915 only)
+ 4 802.11g
+ 5 802.11ag (2915 only)
+ 6 802.11bg
+ 7 802.11abg (2915 only)
+
+ get_preamble
+ Can be used to report configuration of preamble length.
+
+ set_preamble
+ Can be used to set the configuration of preamble length:
+
+ Usage:
+ % iwpriv eth1 set_preamble {mode}
+ Where {mode} is one of:
+ 1 Long preamble only
+ 0 Auto (long or short based on connection)
+
+
+1.4. Sysfs Helper Files:
+-----------------------------------------------
+
+The Linux kernel provides a pseudo file system that can be used to
+access various components of the operating system. The Intel(R)
+PRO/Wireless 2915ABG Driver for Linux exposes several configuration
+parameters through this mechanism.
+
+An entry in the sysfs can support reading and/or writing. You can
+typically query the contents of a sysfs entry through the use of cat,
+and can set the contents via echo. For example:
+
+% cat /sys/bus/pci/drivers/ipw2200/debug_level
+
+Will report the current debug level of the driver's logging subsystem
+(only available if CONFIG_IPW2200_DEBUG was configured when the driver
+was built).
+
+You can set the debug level via:
+
+% echo $VALUE > /sys/bus/pci/drivers/ipw2200/debug_level
+
+Where $VALUE would be a number in the case of this sysfs entry. The
+input to sysfs files does not have to be a number. For example, the
+firmware loader used by hotplug utilizes sysfs entries for transfering
+the firmware image from user space into the driver.
+
+The Intel(R) PRO/Wireless 2915ABG Driver for Linux exposes sysfs entries
+at two levels -- driver level, which apply to all instances of the driver
+(in the event that there are more than one device installed) and device
+level, which applies only to the single specific instance.
+
+
+1.4.1 Driver Level Sysfs Helper Files
+-----------------------------------------------
+
+For the driver level files, look in /sys/bus/pci/drivers/ipw2200/
+
+ debug_level
+
+ This controls the same global as the 'debug' module parameter
+
+
+
+1.4.2 Device Level Sysfs Helper Files
+-----------------------------------------------
+
+For the device level files, look in
+
+ /sys/bus/pci/drivers/ipw2200/{PCI-ID}/
+
+For example:
+ /sys/bus/pci/drivers/ipw2200/0000:02:01.0
+
+For the device level files, see /sys/bus/pci/drivers/ipw2200:
+
+ rf_kill
+ read -
+ 0 = RF kill not enabled (radio on)
+ 1 = SW based RF kill active (radio off)
+ 2 = HW based RF kill active (radio off)
+ 3 = Both HW and SW RF kill active (radio off)
+ write -
+ 0 = If SW based RF kill active, turn the radio back on
+ 1 = If radio is on, activate SW based RF kill
+
+ NOTE: If you enable the SW based RF kill and then toggle the HW
+ based RF kill from ON -> OFF -> ON, the radio will NOT come back on
+
+ ucode
+ read-only access to the ucode version number
+
+ led
+ read -
+ 0 = LED code disabled
+ 1 = LED code enabled
+ write -
+ 0 = Disable LED code
+ 1 = Enable LED code
+
+ NOTE: The LED code has been reported to hang some systems when
+ running ifconfig and is therefore disabled by default.
+
+
+1.5. Supported channels
+-----------------------------------------------
+
+Upon loading the Intel(R) PRO/Wireless 2915ABG Driver for Linux, a
+message stating the detected geography code and the number of 802.11
+channels supported by the card will be displayed in the log.
+
+The geography code corresponds to a regulatory domain as shown in the
+table below.
+
+ Supported channels
+Code Geography 802.11bg 802.11a
+
+--- Restricted 11 0
+ZZF Custom US/Canada 11 8
+ZZD Rest of World 13 0
+ZZA Custom USA & Europe & High 11 13
+ZZB Custom NA & Europe 11 13
+ZZC Custom Japan 11 4
+ZZM Custom 11 0
+ZZE Europe 13 19
+ZZJ Custom Japan 14 4
+ZZR Rest of World 14 0
+ZZH High Band 13 4
+ZZG Custom Europe 13 4
+ZZK Europe 13 24
+ZZL Europe 11 13
+
+
+2. Ad-Hoc Networking
+-----------------------------------------------
+
+When using a device in an Ad-Hoc network, it is useful to understand the
+sequence and requirements for the driver to be able to create, join, or
+merge networks.
+
+The following attempts to provide enough information so that you can
+have a consistent experience while using the driver as a member of an
+Ad-Hoc network.
+
+2.1. Joining an Ad-Hoc Network
+-----------------------------------------------
+
+The easiest way to get onto an Ad-Hoc network is to join one that
+already exists.
+
+2.2. Creating an Ad-Hoc Network
+-----------------------------------------------
+
+An Ad-Hoc networks is created using the syntax of the Wireless tool.
+
+For Example:
+iwconfig eth1 mode ad-hoc essid testing channel 2
+
+2.3. Merging Ad-Hoc Networks
+-----------------------------------------------
+
+
+3. Interaction with Wireless Tools
+-----------------------------------------------
+
+3.1 iwconfig mode
+-----------------------------------------------
+
+When configuring the mode of the adapter, all run-time configured parameters
+are reset to the value used when the module was loaded. This includes
+channels, rates, ESSID, etc.
+
+3.2 iwconfig sens
+-----------------------------------------------
+
+The 'iwconfig ethX sens XX' command will not set the signal sensitivity
+threshold, as described in iwconfig documentation, but rather the number
+of consecutive missed beacons that will trigger handover, i.e. roaming
+to another access point. At the same time, it will set the disassociation
+threshold to 3 times the given value.
+
+
+4. About the Version Numbers
+-----------------------------------------------
+
+Due to the nature of open source development projects, there are
+frequently changes being incorporated that have not gone through
+a complete validation process. These changes are incorporated into
+development snapshot releases.
+
+Releases are numbered with a three level scheme:
+
+ major.minor.development
+
+Any version where the 'development' portion is 0 (for example
+1.0.0, 1.1.0, etc.) indicates a stable version that will be made
+available for kernel inclusion.
+
+Any version where the 'development' portion is not a 0 (for
+example 1.0.1, 1.1.5, etc.) indicates a development version that is
+being made available for testing and cutting edge users. The stability
+and functionality of the development releases are not know. We make
+efforts to try and keep all snapshots reasonably stable, but due to the
+frequency of their release, and the desire to get those releases
+available as quickly as possible, unknown anomalies should be expected.
+
+The major version number will be incremented when significant changes
+are made to the driver. Currently, there are no major changes planned.
+
+5. Firmware installation
+----------------------------------------------
+
+The driver requires a firmware image, download it and extract the
+files under /lib/firmware (or wherever your hotplug's firmware.agent
+will look for firmware files)
+
+The firmware can be downloaded from the following URL:
+
+ http://ipw2200.sf.net/
+
+
+6. Support
+-----------------------------------------------
+
+For direct support of the 1.0.0 version, you can contact
+http://supportmail.intel.com, or you can use the open source project
+support.
+
+For general information and support, go to:
+
+ http://ipw2200.sf.net/
+
+
+7. License
+-----------------------------------------------
+
+ Copyright(c) 2003 - 2006 Intel Corporation. All rights reserved.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
diff --git a/Documentation/networking/README.sb1000 b/Documentation/networking/README.sb1000
new file mode 100644
index 0000000..f82d425
--- /dev/null
+++ b/Documentation/networking/README.sb1000
@@ -0,0 +1,207 @@
+sb1000 is a module network device driver for the General Instrument (also known
+as NextLevel) SURFboard1000 internal cable modem board. This is an ISA card
+which is used by a number of cable TV companies to provide cable modem access.
+It's a one-way downstream-only cable modem, meaning that your upstream net link
+is provided by your regular phone modem.
+
+This driver was written by Franco Venturi <fventuri@mediaone.net>. He deserves
+a great deal of thanks for this wonderful piece of code!
+
+-----------------------------------------------------------------------------
+
+Support for this device is now a part of the standard Linux kernel. The
+driver source code file is drivers/net/sb1000.c. In addition to this
+you will need:
+
+1.) The "cmconfig" program. This is a utility which supplements "ifconfig"
+to configure the cable modem and network interface (usually called "cm0");
+and
+
+2.) Several PPP scripts which live in /etc/ppp to make connecting via your
+cable modem easy.
+
+ These utilities can be obtained from:
+
+ http://www.jacksonville.net/~fventuri/
+
+ in Franco's original source code distribution .tar.gz file. Support for
+ the sb1000 driver can be found at:
+
+ http://home.adelphia.net/~siglercm/sb1000.html
+ http://linuxpower.cx/~cable/
+
+ along with these utilities.
+
+3.) The standard isapnp tools. These are necessary to configure your SB1000
+card at boot time (or afterwards by hand) since it's a PnP card.
+
+ If you don't have these installed as a standard part of your Linux
+ distribution, you can find them at:
+
+ http://www.roestock.demon.co.uk/isapnptools/
+
+ or check your Linux distribution binary CD or their web site. For help with
+ isapnp, pnpdump, or /etc/isapnp.conf, go to:
+
+ http://www.roestock.demon.co.uk/isapnptools/isapnpfaq.html
+
+-----------------------------------------------------------------------------
+
+To make the SB1000 card work, follow these steps:
+
+1.) Run `make config', or `make menuconfig', or `make xconfig', whichever
+you prefer, in the top kernel tree directory to set up your kernel
+configuration. Make sure to say "Y" to "Prompt for development drivers"
+and to say "M" to the sb1000 driver. Also say "Y" or "M" to all the standard
+networking questions to get TCP/IP and PPP networking support.
+
+2.) *BEFORE* you build the kernel, edit drivers/net/sb1000.c. Make sure
+to redefine the value of READ_DATA_PORT to match the I/O address used
+by isapnp to access your PnP cards. This is the value of READPORT in
+/etc/isapnp.conf or given by the output of pnpdump.
+
+3.) Build and install the kernel and modules as usual.
+
+4.) Boot your new kernel following the usual procedures.
+
+5.) Set up to configure the new SB1000 PnP card by capturing the output
+of "pnpdump" to a file and editing this file to set the correct I/O ports,
+IRQ, and DMA settings for all your PnP cards. Make sure none of the settings
+conflict with one another. Then test this configuration by running the
+"isapnp" command with your new config file as the input. Check for
+errors and fix as necessary. (As an aside, I use I/O ports 0x110 and
+0x310 and IRQ 11 for my SB1000 card and these work well for me. YMMV.)
+Then save the finished config file as /etc/isapnp.conf for proper configuration
+on subsequent reboots.
+
+6.) Download the original file sb1000-1.1.2.tar.gz from Franco's site or one of
+the others referenced above. As root, unpack it into a temporary directory and
+do a `make cmconfig' and then `install -c cmconfig /usr/local/sbin'. Don't do
+`make install' because it expects to find all the utilities built and ready for
+installation, not just cmconfig.
+
+7.) As root, copy all the files under the ppp/ subdirectory in Franco's
+tar file into /etc/ppp, being careful not to overwrite any files that are
+already in there. Then modify ppp@gi-on to set the correct login name,
+phone number, and frequency for the cable modem. Also edit pap-secrets
+to specify your login name and password and any site-specific information
+you need.
+
+8.) Be sure to modify /etc/ppp/firewall to use ipchains instead of
+the older ipfwadm commands from the 2.0.x kernels. There's a neat utility to
+convert ipfwadm commands to ipchains commands:
+
+ http://users.dhp.com/~whisper/ipfwadm2ipchains/
+
+You may also wish to modify the firewall script to implement a different
+firewalling scheme.
+
+9.) Start the PPP connection via the script /etc/ppp/ppp@gi-on. You must be
+root to do this. It's better to use a utility like sudo to execute
+frequently used commands like this with root permissions if possible. If you
+connect successfully the cable modem interface will come up and you'll see a
+driver message like this at the console:
+
+ cm0: sb1000 at (0x110,0x310), csn 1, S/N 0x2a0d16d8, IRQ 11.
+ sb1000.c:v1.1.2 6/01/98 (fventuri@mediaone.net)
+
+The "ifconfig" command should show two new interfaces, ppp0 and cm0.
+The command "cmconfig cm0" will give you information about the cable modem
+interface.
+
+10.) Try pinging a site via `ping -c 5 www.yahoo.com', for example. You should
+see packets received.
+
+11.) If you can't get site names (like www.yahoo.com) to resolve into
+IP addresses (like 204.71.200.67), be sure your /etc/resolv.conf file
+has no syntax errors and has the right nameserver IP addresses in it.
+If this doesn't help, try something like `ping -c 5 204.71.200.67' to
+see if the networking is running but the DNS resolution is where the
+problem lies.
+
+12.) If you still have problems, go to the support web sites mentioned above
+and read the information and documentation there.
+
+-----------------------------------------------------------------------------
+
+Common problems:
+
+1.) Packets go out on the ppp0 interface but don't come back on the cm0
+interface. It looks like I'm connected but I can't even ping any
+numerical IP addresses. (This happens predominantly on Debian systems due
+to a default boot-time configuration script.)
+
+Solution -- As root `echo 0 > /proc/sys/net/ipv4/conf/cm0/rp_filter' so it
+can share the same IP address as the ppp0 interface. Note that this
+command should probably be added to the /etc/ppp/cablemodem script
+*right*between* the "/sbin/ifconfig" and "/sbin/cmconfig" commands.
+You may need to do this to /proc/sys/net/ipv4/conf/ppp0/rp_filter as well.
+If you do this to /proc/sys/net/ipv4/conf/default/rp_filter on each reboot
+(in rc.local or some such) then any interfaces can share the same IP
+addresses.
+
+2.) I get "unresolved symbol" error messages on executing `insmod sb1000.o'.
+
+Solution -- You probably have a non-matching kernel source tree and
+/usr/include/linux and /usr/include/asm header files. Make sure you
+install the correct versions of the header files in these two directories.
+Then rebuild and reinstall the kernel.
+
+3.) When isapnp runs it reports an error, and my SB1000 card isn't working.
+
+Solution -- There's a problem with later versions of isapnp using the "(CHECK)"
+option in the lines that allocate the two I/O addresses for the SB1000 card.
+This first popped up on RH 6.0. Delete "(CHECK)" for the SB1000 I/O addresses.
+Make sure they don't conflict with any other pieces of hardware first! Then
+rerun isapnp and go from there.
+
+4.) I can't execute the /etc/ppp/ppp@gi-on file.
+
+Solution -- As root do `chmod ug+x /etc/ppp/ppp@gi-on'.
+
+5.) The firewall script isn't working (with 2.2.x and higher kernels).
+
+Solution -- Use the ipfwadm2ipchains script referenced above to convert the
+/etc/ppp/firewall script from the deprecated ipfwadm commands to ipchains.
+
+6.) I'm getting *tons* of firewall deny messages in the /var/kern.log,
+/var/messages, and/or /var/syslog files, and they're filling up my /var
+partition!!!
+
+Solution -- First, tell your ISP that you're receiving DoS (Denial of Service)
+and/or portscanning (UDP connection attempts) attacks! Look over the deny
+messages to figure out what the attack is and where it's coming from. Next,
+edit /etc/ppp/cablemodem and make sure the ",nobroadcast" option is turned on
+to the "cmconfig" command (uncomment that line). If you're not receiving these
+denied packets on your broadcast interface (IP address xxx.yyy.zzz.255
+typically), then someone is attacking your machine in particular. Be careful
+out there....
+
+7.) Everything seems to work fine but my computer locks up after a while
+(and typically during a lengthy download through the cable modem)!
+
+Solution -- You may need to add a short delay in the driver to 'slow down' the
+SURFboard because your PC might not be able to keep up with the transfer rate
+of the SB1000. To do this, it's probably best to download Franco's
+sb1000-1.1.2.tar.gz archive and build and install sb1000.o manually. You'll
+want to edit the 'Makefile' and look for the 'SB1000_DELAY'
+define. Uncomment those 'CFLAGS' lines (and comment out the default ones)
+and try setting the delay to something like 60 microseconds with:
+'-DSB1000_DELAY=60'. Then do `make' and as root `make install' and try
+it out. If it still doesn't work or you like playing with the driver, you may
+try other numbers. Remember though that the higher the delay, the slower the
+driver (which slows down the rest of the PC too when it is actively
+used). Thanks to Ed Daiga for this tip!
+
+-----------------------------------------------------------------------------
+
+Credits: This README came from Franco Venturi's original README file which is
+still supplied with his driver .tar.gz archive. I and all other sb1000 users
+owe Franco a tremendous "Thank you!" Additional thanks goes to Carl Patten
+and Ralph Bonnell who are now managing the Linux SB1000 web site, and to
+the SB1000 users who reported and helped debug the common problems listed
+above.
+
+
+ Clemmitt Sigler
+ csigler@vt.edu
diff --git a/Documentation/networking/alias.txt b/Documentation/networking/alias.txt
new file mode 100644
index 0000000..cd12c2f
--- /dev/null
+++ b/Documentation/networking/alias.txt
@@ -0,0 +1,53 @@
+
+IP-Aliasing:
+============
+
+IP-aliases are additional IP-addresses/masks hooked up to a base
+interface by adding a colon and a string when running ifconfig.
+This string is usually numeric, but this is not a must.
+
+IP-Aliases are avail if CONFIG_INET (`standard' IPv4 networking)
+is configured in the kernel.
+
+
+o Alias creation.
+ Alias creation is done by 'magic' interface naming: eg. to create a
+ 200.1.1.1 alias for eth0 ...
+
+ # ifconfig eth0:0 200.1.1.1 etc,etc....
+ ~~ -> request alias #0 creation (if not yet exists) for eth0
+
+ The corresponding route is also set up by this command.
+ Please note: The route always points to the base interface.
+
+
+o Alias deletion.
+ The alias is removed by shutting the alias down:
+
+ # ifconfig eth0:0 down
+ ~~~~~~~~~~ -> will delete alias
+
+
+o Alias (re-)configuring
+
+ Aliases are not real devices, but programs should be able to configure and
+ refer to them as usual (ifconfig, route, etc).
+
+
+o Relationship with main device
+
+ If the base device is shut down the added aliases will be deleted
+ too.
+
+
+Contact
+-------
+Please finger or e-mail me:
+ Juan Jose Ciarlante <jjciarla@raiz.uncu.edu.ar>
+
+Updated by Erik Schoenfelder <schoenfr@gaertner.DE>
+
+; local variables:
+; mode: indented-text
+; mode: auto-fill
+; end:
diff --git a/Documentation/networking/arcnet-hardware.txt b/Documentation/networking/arcnet-hardware.txt
new file mode 100644
index 0000000..731de41
--- /dev/null
+++ b/Documentation/networking/arcnet-hardware.txt
@@ -0,0 +1,3133 @@
+
+-----------------------------------------------------------------------------
+1) This file is a supplement to arcnet.txt. Please read that for general
+ driver configuration help.
+-----------------------------------------------------------------------------
+2) This file is no longer Linux-specific. It should probably be moved out of
+ the kernel sources. Ideas?
+-----------------------------------------------------------------------------
+
+Because so many people (myself included) seem to have obtained ARCnet cards
+without manuals, this file contains a quick introduction to ARCnet hardware,
+some cabling tips, and a listing of all jumper settings I can find. Please
+e-mail apenwarr@worldvisions.ca with any settings for your particular card,
+or any other information you have!
+
+
+INTRODUCTION TO ARCNET
+----------------------
+
+ARCnet is a network type which works in a way similar to popular Ethernet
+networks but which is also different in some very important ways.
+
+First of all, you can get ARCnet cards in at least two speeds: 2.5 Mbps
+(slower than Ethernet) and 100 Mbps (faster than normal Ethernet). In fact,
+there are others as well, but these are less common. The different hardware
+types, as far as I'm aware, are not compatible and so you cannot wire a
+100 Mbps card to a 2.5 Mbps card, and so on. From what I hear, my driver does
+work with 100 Mbps cards, but I haven't been able to verify this myself,
+since I only have the 2.5 Mbps variety. It is probably not going to saturate
+your 100 Mbps card. Stop complaining. :)
+
+You also cannot connect an ARCnet card to any kind of Ethernet card and
+expect it to work.
+
+There are two "types" of ARCnet - STAR topology and BUS topology. This
+refers to how the cards are meant to be wired together. According to most
+available documentation, you can only connect STAR cards to STAR cards and
+BUS cards to BUS cards. That makes sense, right? Well, it's not quite
+true; see below under "Cabling."
+
+Once you get past these little stumbling blocks, ARCnet is actually quite a
+well-designed standard. It uses something called "modified token passing"
+which makes it completely incompatible with so-called "Token Ring" cards,
+but which makes transfers much more reliable than Ethernet does. In fact,
+ARCnet will guarantee that a packet arrives safely at the destination, and
+even if it can't possibly be delivered properly (ie. because of a cable
+break, or because the destination computer does not exist) it will at least
+tell the sender about it.
+
+Because of the carefully defined action of the "token", it will always make
+a pass around the "ring" within a maximum length of time. This makes it
+useful for realtime networks.
+
+In addition, all known ARCnet cards have an (almost) identical programming
+interface. This means that with one ARCnet driver you can support any
+card, whereas with Ethernet each manufacturer uses what is sometimes a
+completely different programming interface, leading to a lot of different,
+sometimes very similar, Ethernet drivers. Of course, always using the same
+programming interface also means that when high-performance hardware
+facilities like PCI bus mastering DMA appear, it's hard to take advantage of
+them. Let's not go into that.
+
+One thing that makes ARCnet cards difficult to program for, however, is the
+limit on their packet sizes; standard ARCnet can only send packets that are
+up to 508 bytes in length. This is smaller than the Internet "bare minimum"
+of 576 bytes, let alone the Ethernet MTU of 1500. To compensate, an extra
+level of encapsulation is defined by RFC1201, which I call "packet
+splitting," that allows "virtual packets" to grow as large as 64K each,
+although they are generally kept down to the Ethernet-style 1500 bytes.
+
+For more information on the advantages and disadvantages (mostly the
+advantages) of ARCnet networks, you might try the "ARCnet Trade Association"
+WWW page:
+ http://www.arcnet.com
+
+
+CABLING ARCNET NETWORKS
+-----------------------
+
+This section was rewritten by
+ Vojtech Pavlik <vojtech@suse.cz>
+using information from several people, including:
+ Avery Pennraun <apenwarr@worldvisions.ca>
+ Stephen A. Wood <saw@hallc1.cebaf.gov>
+ John Paul Morrison <jmorriso@bogomips.ee.ubc.ca>
+ Joachim Koenig <jojo@repas.de>
+and Avery touched it up a bit, at Vojtech's request.
+
+ARCnet (the classic 2.5 Mbps version) can be connected by two different
+types of cabling: coax and twisted pair. The other ARCnet-type networks
+(100 Mbps TCNS and 320 kbps - 32 Mbps ARCnet Plus) use different types of
+cabling (Type1, Fiber, C1, C4, C5).
+
+For a coax network, you "should" use 93 Ohm RG-62 cable. But other cables
+also work fine, because ARCnet is a very stable network. I personally use 75
+Ohm TV antenna cable.
+
+Cards for coax cabling are shipped in two different variants: for BUS and
+STAR network topologies. They are mostly the same. The only difference
+lies in the hybrid chip installed. BUS cards use high impedance output,
+while STAR use low impedance. Low impedance card (STAR) is electrically
+equal to a high impedance one with a terminator installed.
+
+Usually, the ARCnet networks are built up from STAR cards and hubs. There
+are two types of hubs - active and passive. Passive hubs are small boxes
+with four BNC connectors containing four 47 Ohm resistors:
+
+ | | wires
+ R + junction
+-R-+-R- R 47 Ohm resistors
+ R
+ |
+
+The shielding is connected together. Active hubs are much more complicated;
+they are powered and contain electronics to amplify the signal and send it
+to other segments of the net. They usually have eight connectors. Active
+hubs come in two variants - dumb and smart. The dumb variant just
+amplifies, but the smart one decodes to digital and encodes back all packets
+coming through. This is much better if you have several hubs in the net,
+since many dumb active hubs may worsen the signal quality.
+
+And now to the cabling. What you can connect together:
+
+1. A card to a card. This is the simplest way of creating a 2-computer
+ network.
+
+2. A card to a passive hub. Remember that all unused connectors on the hub
+ must be properly terminated with 93 Ohm (or something else if you don't
+ have the right ones) terminators.
+ (Avery's note: oops, I didn't know that. Mine (TV cable) works
+ anyway, though.)
+
+3. A card to an active hub. Here is no need to terminate the unused
+ connectors except some kind of aesthetic feeling. But, there may not be
+ more than eleven active hubs between any two computers. That of course
+ doesn't limit the number of active hubs on the network.
+
+4. An active hub to another.
+
+5. An active hub to passive hub.
+
+Remember that you cannot connect two passive hubs together. The power loss
+implied by such a connection is too high for the net to operate reliably.
+
+An example of a typical ARCnet network:
+
+ R S - STAR type card
+ S------H--------A-------S R - Terminator
+ | | H - Hub
+ | | A - Active hub
+ | S----H----S
+ S |
+ |
+ S
+
+The BUS topology is very similar to the one used by Ethernet. The only
+difference is in cable and terminators: they should be 93 Ohm. Ethernet
+uses 50 Ohm impedance. You use T connectors to put the computers on a single
+line of cable, the bus. You have to put terminators at both ends of the
+cable. A typical BUS ARCnet network looks like:
+
+ RT----T------T------T------T------TR
+ B B B B B B
+
+ B - BUS type card
+ R - Terminator
+ T - T connector
+
+But that is not all! The two types can be connected together. According to
+the official documentation the only way of connecting them is using an active
+hub:
+
+ A------T------T------TR
+ | B B B
+ S---H---S
+ |
+ S
+
+The official docs also state that you can use STAR cards at the ends of
+BUS network in place of a BUS card and a terminator:
+
+ S------T------T------S
+ B B
+
+But, according to my own experiments, you can simply hang a BUS type card
+anywhere in middle of a cable in a STAR topology network. And more - you
+can use the bus card in place of any star card if you use a terminator. Then
+you can build very complicated networks fulfilling all your needs! An
+example:
+
+ S
+ |
+ RT------T-------T------H------S
+ B B B |
+ | R
+ S------A------T-------T-------A-------H------TR
+ | B B | | B
+ | S BT |
+ | | | S----A-----S
+ S------H---A----S | |
+ | | S------T----H---S |
+ S S B R S
+
+A basically different cabling scheme is used with Twisted Pair cabling. Each
+of the TP cards has two RJ (phone-cord style) connectors. The cards are
+then daisy-chained together using a cable connecting every two neighboring
+cards. The ends are terminated with RJ 93 Ohm terminators which plug into
+the empty connectors of cards on the ends of the chain. An example:
+
+ ___________ ___________
+ _R_|_ _|_|_ _|_R_
+ | | | | | |
+ |Card | |Card | |Card |
+ |_____| |_____| |_____|
+
+
+There are also hubs for the TP topology. There is nothing difficult
+involved in using them; you just connect a TP chain to a hub on any end or
+even at both. This way you can create almost any network configuration.
+The maximum of 11 hubs between any two computers on the net applies here as
+well. An example:
+
+ RP-------P--------P--------H-----P------P-----PR
+ |
+ RP-----H--------P--------H-----P------PR
+ | |
+ PR PR
+
+ R - RJ Terminator
+ P - TP Card
+ H - TP Hub
+
+Like any network, ARCnet has a limited cable length. These are the maximum
+cable lengths between two active ends (an active end being an active hub or
+a STAR card).
+
+ RG-62 93 Ohm up to 650 m
+ RG-59/U 75 Ohm up to 457 m
+ RG-11/U 75 Ohm up to 533 m
+ IBM Type 1 150 Ohm up to 200 m
+ IBM Type 3 100 Ohm up to 100 m
+
+The maximum length of all cables connected to a passive hub is limited to 65
+meters for RG-62 cabling; less for others. You can see that using passive
+hubs in a large network is a bad idea. The maximum length of a single "BUS
+Trunk" is about 300 meters for RG-62. The maximum distance between the two
+most distant points of the net is limited to 3000 meters. The maximum length
+of a TP cable between two cards/hubs is 650 meters.
+
+
+SETTING THE JUMPERS
+-------------------
+
+All ARCnet cards should have a total of four or five different settings:
+
+ - the I/O address: this is the "port" your ARCnet card is on. Probed
+ values in the Linux ARCnet driver are only from 0x200 through 0x3F0. (If
+ your card has additional ones, which is possible, please tell me.) This
+ should not be the same as any other device on your system. According to
+ a doc I got from Novell, MS Windows prefers values of 0x300 or more,
+ eating net connections on my system (at least) otherwise. My guess is
+ this may be because, if your card is at 0x2E0, probing for a serial port
+ at 0x2E8 will reset the card and probably mess things up royally.
+ - Avery's favourite: 0x300.
+
+ - the IRQ: on 8-bit cards, it might be 2 (9), 3, 4, 5, or 7.
+ on 16-bit cards, it might be 2 (9), 3, 4, 5, 7, or 10-15.
+
+ Make sure this is different from any other card on your system. Note
+ that IRQ2 is the same as IRQ9, as far as Linux is concerned. You can
+ "cat /proc/interrupts" for a somewhat complete list of which ones are in
+ use at any given time. Here is a list of common usages from Vojtech
+ Pavlik <vojtech@suse.cz>:
+ ("Not on bus" means there is no way for a card to generate this
+ interrupt)
+ IRQ 0 - Timer 0 (Not on bus)
+ IRQ 1 - Keyboard (Not on bus)
+ IRQ 2 - IRQ Controller 2 (Not on bus, nor does interrupt the CPU)
+ IRQ 3 - COM2
+ IRQ 4 - COM1
+ IRQ 5 - FREE (LPT2 if you have it; sometimes COM3; maybe PLIP)
+ IRQ 6 - Floppy disk controller
+ IRQ 7 - FREE (LPT1 if you don't use the polling driver; PLIP)
+ IRQ 8 - Realtime Clock Interrupt (Not on bus)
+ IRQ 9 - FREE (VGA vertical sync interrupt if enabled)
+ IRQ 10 - FREE
+ IRQ 11 - FREE
+ IRQ 12 - FREE
+ IRQ 13 - Numeric Coprocessor (Not on bus)
+ IRQ 14 - Fixed Disk Controller
+ IRQ 15 - FREE (Fixed Disk Controller 2 if you have it)
+
+ Note: IRQ 9 is used on some video cards for the "vertical retrace"
+ interrupt. This interrupt would have been handy for things like
+ video games, as it occurs exactly once per screen refresh, but
+ unfortunately IBM cancelled this feature starting with the original
+ VGA and thus many VGA/SVGA cards do not support it. For this
+ reason, no modern software uses this interrupt and it can almost
+ always be safely disabled, if your video card supports it at all.
+
+ If your card for some reason CANNOT disable this IRQ (usually there
+ is a jumper), one solution would be to clip the printed circuit
+ contact on the board: it's the fourth contact from the left on the
+ back side. I take no responsibility if you try this.
+
+ - Avery's favourite: IRQ2 (actually IRQ9). Watch that VGA, though.
+
+ - the memory address: Unlike most cards, ARCnets use "shared memory" for
+ copying buffers around. Make SURE it doesn't conflict with any other
+ used memory in your system!
+ A0000 - VGA graphics memory (ok if you don't have VGA)
+ B0000 - Monochrome text mode
+ C0000 \ One of these is your VGA BIOS - usually C0000.
+ E0000 /
+ F0000 - System BIOS
+
+ Anything less than 0xA0000 is, well, a BAD idea since it isn't above
+ 640k.
+ - Avery's favourite: 0xD0000
+
+ - the station address: Every ARCnet card has its own "unique" network
+ address from 0 to 255. Unlike Ethernet, you can set this address
+ yourself with a jumper or switch (or on some cards, with special
+ software). Since it's only 8 bits, you can only have 254 ARCnet cards
+ on a network. DON'T use 0 or 255, since these are reserved (although
+ neat stuff will probably happen if you DO use them). By the way, if you
+ haven't already guessed, don't set this the same as any other ARCnet on
+ your network!
+ - Avery's favourite: 3 and 4. Not that it matters.
+
+ - There may be ETS1 and ETS2 settings. These may or may not make a
+ difference on your card (many manuals call them "reserved"), but are
+ used to change the delays used when powering up a computer on the
+ network. This is only necessary when wiring VERY long range ARCnet
+ networks, on the order of 4km or so; in any case, the only real
+ requirement here is that all cards on the network with ETS1 and ETS2
+ jumpers have them in the same position. Chris Hindy <chrish@io.org>
+ sent in a chart with actual values for this:
+ ET1 ET2 Response Time Reconfiguration Time
+ --- --- ------------- --------------------
+ open open 74.7us 840us
+ open closed 283.4us 1680us
+ closed open 561.8us 1680us
+ closed closed 1118.6us 1680us
+
+ Make sure you set ETS1 and ETS2 to the SAME VALUE for all cards on your
+ network.
+
+Also, on many cards (not mine, though) there are red and green LED's.
+Vojtech Pavlik <vojtech@suse.cz> tells me this is what they mean:
+ GREEN RED Status
+ ----- --- ------
+ OFF OFF Power off
+ OFF Short flashes Cabling problems (broken cable or not
+ terminated)
+ OFF (short) ON Card init
+ ON ON Normal state - everything OK, nothing
+ happens
+ ON Long flashes Data transfer
+ ON OFF Never happens (maybe when wrong ID)
+
+
+The following is all the specific information people have sent me about
+their own particular ARCnet cards. It is officially a mess, and contains
+huge amounts of duplicated information. I have no time to fix it. If you
+want to, PLEASE DO! Just send me a 'diff -u' of all your changes.
+
+The model # is listed right above specifics for that card, so you should be
+able to use your text viewer's "search" function to find the entry you want.
+If you don't KNOW what kind of card you have, try looking through the
+various diagrams to see if you can tell.
+
+If your model isn't listed and/or has different settings, PLEASE PLEASE
+tell me. I had to figure mine out without the manual, and it WASN'T FUN!
+
+Even if your ARCnet model isn't listed, but has the same jumpers as another
+model that is, please e-mail me to say so.
+
+Cards Listed in this file (in this order, mostly):
+
+ Manufacturer Model # Bits
+ ------------ ------- ----
+ SMC PC100 8
+ SMC PC110 8
+ SMC PC120 8
+ SMC PC130 8
+ SMC PC270E 8
+ SMC PC500 16
+ SMC PC500Longboard 16
+ SMC PC550Longboard 16
+ SMC PC600 16
+ SMC PC710 8
+ SMC? LCS-8830(-T) 8/16
+ Puredata PDI507 8
+ CNet Tech CN120-Series 8
+ CNet Tech CN160-Series 16
+ Lantech? UM9065L chipset 8
+ Acer 5210-003 8
+ Datapoint? LAN-ARC-8 8
+ Topware TA-ARC/10 8
+ Thomas-Conrad 500-6242-0097 REV A 8
+ Waterloo? (C)1985 Waterloo Micro. 8
+ No Name -- 8/16
+ No Name Taiwan R.O.C? 8
+ No Name Model 9058 8
+ Tiara Tiara Lancard? 8
+
+
+** SMC = Standard Microsystems Corp.
+** CNet Tech = CNet Technology, Inc.
+
+
+Unclassified Stuff
+------------------
+ - Please send any other information you can find.
+
+ - And some other stuff (more info is welcome!):
+ From: root@ultraworld.xs4all.nl (Timo Hilbrink)
+ To: apenwarr@foxnet.net (Avery Pennarun)
+ Date: Wed, 26 Oct 1994 02:10:32 +0000 (GMT)
+ Reply-To: timoh@xs4all.nl
+
+ [...parts deleted...]
+
+ About the jumpers: On my PC130 there is one more jumper, located near the
+ cable-connector and it's for changing to star or bus topology;
+ closed: star - open: bus
+ On the PC500 are some more jumper-pins, one block labeled with RX,PDN,TXI
+ and another with ALE,LA17,LA18,LA19 these are undocumented..
+
+ [...more parts deleted...]
+
+ --- CUT ---
+
+
+** Standard Microsystems Corp (SMC) **
+PC100, PC110, PC120, PC130 (8-bit cards)
+PC500, PC600 (16-bit cards)
+---------------------------------
+ - mainly from Avery Pennarun <apenwarr@worldvisions.ca>. Values depicted
+ are from Avery's setup.
+ - special thanks to Timo Hilbrink <timoh@xs4all.nl> for noting that PC120,
+ 130, 500, and 600 all have the same switches as Avery's PC100.
+ PC500/600 have several extra, undocumented pins though. (?)
+ - PC110 settings were verified by Stephen A. Wood <saw@cebaf.gov>
+ - Also, the JP- and S-numbers probably don't match your card exactly. Try
+ to find jumpers/switches with the same number of settings - it's
+ probably more reliable.
+
+
+ JP5 [|] : : : :
+(IRQ Setting) IRQ2 IRQ3 IRQ4 IRQ5 IRQ7
+ Put exactly one jumper on exactly one set of pins.
+
+
+ 1 2 3 4 5 6 7 8 9 10
+ S1 /----------------------------------\
+(I/O and Memory | 1 1 * 0 0 0 0 * 1 1 0 1 |
+ addresses) \----------------------------------/
+ |--| |--------| |--------|
+ (a) (b) (m)
+
+ WARNING. It's very important when setting these which way
+ you're holding the card, and which way you think is '1'!
+
+ If you suspect that your settings are not being made
+ correctly, try reversing the direction or inverting the
+ switch positions.
+
+ a: The first digit of the I/O address.
+ Setting Value
+ ------- -----
+ 00 0
+ 01 1
+ 10 2
+ 11 3
+
+ b: The second digit of the I/O address.
+ Setting Value
+ ------- -----
+ 0000 0
+ 0001 1
+ 0010 2
+ ... ...
+ 1110 E
+ 1111 F
+
+ The I/O address is in the form ab0. For example, if
+ a is 0x2 and b is 0xE, the address will be 0x2E0.
+
+ DO NOT SET THIS LESS THAN 0x200!!!!!
+
+
+ m: The first digit of the memory address.
+ Setting Value
+ ------- -----
+ 0000 0
+ 0001 1
+ 0010 2
+ ... ...
+ 1110 E
+ 1111 F
+
+ The memory address is in the form m0000. For example, if
+ m is D, the address will be 0xD0000.
+
+ DO NOT SET THIS TO C0000, F0000, OR LESS THAN A0000!
+
+ 1 2 3 4 5 6 7 8
+ S2 /--------------------------\
+(Station Address) | 1 1 0 0 0 0 0 0 |
+ \--------------------------/
+
+ Setting Value
+ ------- -----
+ 00000000 00
+ 10000000 01
+ 01000000 02
+ ...
+ 01111111 FE
+ 11111111 FF
+
+ Note that this is binary with the digits reversed!
+
+ DO NOT SET THIS TO 0 OR 255 (0xFF)!
+
+
+*****************************************************************************
+
+** Standard Microsystems Corp (SMC) **
+PC130E/PC270E (8-bit cards)
+---------------------------
+ - from Juergen Seifert <seifert@htwm.de>
+
+
+STANDARD MICROSYSTEMS CORPORATION (SMC) ARCNET(R)-PC130E/PC270E
+===============================================================
+
+This description has been written by Juergen Seifert <seifert@htwm.de>
+using information from the following Original SMC Manual
+
+ "Configuration Guide for
+ ARCNET(R)-PC130E/PC270
+ Network Controller Boards
+ Pub. # 900.044A
+ June, 1989"
+
+ARCNET is a registered trademark of the Datapoint Corporation
+SMC is a registered trademark of the Standard Microsystems Corporation
+
+The PC130E is an enhanced version of the PC130 board, is equipped with a
+standard BNC female connector for connection to RG-62/U coax cable.
+Since this board is designed both for point-to-point connection in star
+networks and for connection to bus networks, it is downwardly compatible
+with all the other standard boards designed for coax networks (that is,
+the PC120, PC110 and PC100 star topology boards and the PC220, PC210 and
+PC200 bus topology boards).
+
+The PC270E is an enhanced version of the PC260 board, is equipped with two
+modular RJ11-type jacks for connection to twisted pair wiring.
+It can be used in a star or a daisy-chained network.
+
+
+ 8 7 6 5 4 3 2 1
+ ________________________________________________________________
+ | | S1 | |
+ | |_________________| |
+ | Offs|Base |I/O Addr |
+ | RAM Addr | ___|
+ | ___ ___ CR3 |___|
+ | | \/ | CR4 |___|
+ | | PROM | ___|
+ | | | N | | 8
+ | | SOCKET | o | | 7
+ | |________| d | | 6
+ | ___________________ e | | 5
+ | | | A | S | 4
+ | |oo| EXT2 | | d | 2 | 3
+ | |oo| EXT1 | SMC | d | | 2
+ | |oo| ROM | 90C63 | r |___| 1
+ | |oo| IRQ7 | | |o| _____|
+ | |oo| IRQ5 | | |o| | J1 |
+ | |oo| IRQ4 | | STAR |_____|
+ | |oo| IRQ3 | | | J2 |
+ | |oo| IRQ2 |___________________| |_____|
+ |___ ______________|
+ | |
+ |_____________________________________________|
+
+Legend:
+
+SMC 90C63 ARCNET Controller / Transceiver /Logic
+S1 1-3: I/O Base Address Select
+ 4-6: Memory Base Address Select
+ 7-8: RAM Offset Select
+S2 1-8: Node ID Select
+EXT Extended Timeout Select
+ROM ROM Enable Select
+STAR Selected - Star Topology (PC130E only)
+ Deselected - Bus Topology (PC130E only)
+CR3/CR4 Diagnostic LEDs
+J1 BNC RG62/U Connector (PC130E only)
+J1 6-position Telephone Jack (PC270E only)
+J2 6-position Telephone Jack (PC270E only)
+
+Setting one of the switches to Off/Open means "1", On/Closed means "0".
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in group S2 are used to set the node ID.
+These switches work in a way similar to the PC100-series cards; see that
+entry for more information.
+
+
+Setting the I/O Base Address
+----------------------------
+
+The first three switches in switch group S1 are used to select one
+of eight possible I/O Base addresses using the following table
+
+
+ Switch | Hex I/O
+ 1 2 3 | Address
+ -------|--------
+ 0 0 0 | 260
+ 0 0 1 | 290
+ 0 1 0 | 2E0 (Manufacturer's default)
+ 0 1 1 | 2F0
+ 1 0 0 | 300
+ 1 0 1 | 350
+ 1 1 0 | 380
+ 1 1 1 | 3E0
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The memory buffer requires 2K of a 16K block of RAM. The base of this
+16K block can be located in any of eight positions.
+Switches 4-6 of switch group S1 select the Base of the 16K block.
+Within that 16K address space, the buffer may be assigned any one of four
+positions, determined by the offset, switches 7 and 8 of group S1.
+
+ Switch | Hex RAM | Hex ROM
+ 4 5 6 7 8 | Address | Address *)
+ -----------|---------|-----------
+ 0 0 0 0 0 | C0000 | C2000
+ 0 0 0 0 1 | C0800 | C2000
+ 0 0 0 1 0 | C1000 | C2000
+ 0 0 0 1 1 | C1800 | C2000
+ | |
+ 0 0 1 0 0 | C4000 | C6000
+ 0 0 1 0 1 | C4800 | C6000
+ 0 0 1 1 0 | C5000 | C6000
+ 0 0 1 1 1 | C5800 | C6000
+ | |
+ 0 1 0 0 0 | CC000 | CE000
+ 0 1 0 0 1 | CC800 | CE000
+ 0 1 0 1 0 | CD000 | CE000
+ 0 1 0 1 1 | CD800 | CE000
+ | |
+ 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default)
+ 0 1 1 0 1 | D0800 | D2000
+ 0 1 1 1 0 | D1000 | D2000
+ 0 1 1 1 1 | D1800 | D2000
+ | |
+ 1 0 0 0 0 | D4000 | D6000
+ 1 0 0 0 1 | D4800 | D6000
+ 1 0 0 1 0 | D5000 | D6000
+ 1 0 0 1 1 | D5800 | D6000
+ | |
+ 1 0 1 0 0 | D8000 | DA000
+ 1 0 1 0 1 | D8800 | DA000
+ 1 0 1 1 0 | D9000 | DA000
+ 1 0 1 1 1 | D9800 | DA000
+ | |
+ 1 1 0 0 0 | DC000 | DE000
+ 1 1 0 0 1 | DC800 | DE000
+ 1 1 0 1 0 | DD000 | DE000
+ 1 1 0 1 1 | DD800 | DE000
+ | |
+ 1 1 1 0 0 | E0000 | E2000
+ 1 1 1 0 1 | E0800 | E2000
+ 1 1 1 1 0 | E1000 | E2000
+ 1 1 1 1 1 | E1800 | E2000
+
+*) To enable the 8K Boot PROM install the jumper ROM.
+ The default is jumper ROM not installed.
+
+
+Setting the Timeouts and Interrupt
+----------------------------------
+
+The jumpers labeled EXT1 and EXT2 are used to determine the timeout
+parameters. These two jumpers are normally left open.
+
+To select a hardware interrupt level set one (only one!) of the jumpers
+IRQ2, IRQ3, IRQ4, IRQ5, IRQ7. The Manufacturer's default is IRQ2.
+
+
+Configuring the PC130E for Star or Bus Topology
+-----------------------------------------------
+
+The single jumper labeled STAR is used to configure the PC130E board for
+star or bus topology.
+When the jumper is installed, the board may be used in a star network, when
+it is removed, the board can be used in a bus topology.
+
+
+Diagnostic LEDs
+---------------
+
+Two diagnostic LEDs are visible on the rear bracket of the board.
+The green LED monitors the network activity: the red one shows the
+board activity:
+
+ Green | Status Red | Status
+ -------|------------------- ---------|-------------------
+ on | normal activity flash/on | data transfer
+ blink | reconfiguration off | no data transfer;
+ off | defective board or | incorrect memory or
+ | node ID is zero | I/O address
+
+
+*****************************************************************************
+
+** Standard Microsystems Corp (SMC) **
+PC500/PC550 Longboard (16-bit cards)
+-------------------------------------
+ - from Juergen Seifert <seifert@htwm.de>
+
+
+STANDARD MICROSYSTEMS CORPORATION (SMC) ARCNET-PC500/PC550 Long Board
+=====================================================================
+
+Note: There is another Version of the PC500 called Short Version, which
+ is different in hard- and software! The most important differences
+ are:
+ - The long board has no Shared memory.
+ - On the long board the selection of the interrupt is done by binary
+ coded switch, on the short board directly by jumper.
+
+[Avery's note: pay special attention to that: the long board HAS NO SHARED
+MEMORY. This means the current Linux-ARCnet driver can't use these cards.
+I have obtained a PC500Longboard and will be doing some experiments on it in
+the future, but don't hold your breath. Thanks again to Juergen Seifert for
+his advice about this!]
+
+This description has been written by Juergen Seifert <seifert@htwm.de>
+using information from the following Original SMC Manual
+
+ "Configuration Guide for
+ SMC ARCNET-PC500/PC550
+ Series Network Controller Boards
+ Pub. # 900.033 Rev. A
+ November, 1989"
+
+ARCNET is a registered trademark of the Datapoint Corporation
+SMC is a registered trademark of the Standard Microsystems Corporation
+
+The PC500 is equipped with a standard BNC female connector for connection
+to RG-62/U coax cable.
+The board is designed both for point-to-point connection in star networks
+and for connection to bus networks.
+
+The PC550 is equipped with two modular RJ11-type jacks for connection
+to twisted pair wiring.
+It can be used in a star or a daisy-chained (BUS) network.
+
+ 1
+ 0 9 8 7 6 5 4 3 2 1 6 5 4 3 2 1
+ ____________________________________________________________________
+ < | SW1 | | SW2 | |
+ > |_____________________| |_____________| |
+ < IRQ |I/O Addr |
+ > ___|
+ < CR4 |___|
+ > CR3 |___|
+ < ___|
+ > N | | 8
+ < o | | 7
+ > d | S | 6
+ < e | W | 5
+ > A | 3 | 4
+ < d | | 3
+ > d | | 2
+ < r |___| 1
+ > |o| _____|
+ < |o| | J1 |
+ > 3 1 JP6 |_____|
+ < |o|o| JP2 | J2 |
+ > |o|o| |_____|
+ < 4 2__ ______________|
+ > | | |
+ <____| |_____________________________________________|
+
+Legend:
+
+SW1 1-6: I/O Base Address Select
+ 7-10: Interrupt Select
+SW2 1-6: Reserved for Future Use
+SW3 1-8: Node ID Select
+JP2 1-4: Extended Timeout Select
+JP6 Selected - Star Topology (PC500 only)
+ Deselected - Bus Topology (PC500 only)
+CR3 Green Monitors Network Activity
+CR4 Red Monitors Board Activity
+J1 BNC RG62/U Connector (PC500 only)
+J1 6-position Telephone Jack (PC550 only)
+J2 6-position Telephone Jack (PC550 only)
+
+Setting one of the switches to Off/Open means "1", On/Closed means "0".
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in group SW3 are used to set the node ID. Each node
+attached to the network must have an unique node ID which must be
+different from 0.
+Switch 1 serves as the least significant bit (LSB).
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Value
+ -------|-------
+ 1 | 1
+ 2 | 2
+ 3 | 4
+ 4 | 8
+ 5 | 16
+ 6 | 32
+ 7 | 64
+ 8 | 128
+
+Some Examples:
+
+ Switch | Hex | Decimal
+ 8 7 6 5 4 3 2 1 | Node ID | Node ID
+ ----------------|---------|---------
+ 0 0 0 0 0 0 0 0 | not allowed
+ 0 0 0 0 0 0 0 1 | 1 | 1
+ 0 0 0 0 0 0 1 0 | 2 | 2
+ 0 0 0 0 0 0 1 1 | 3 | 3
+ . . . | |
+ 0 1 0 1 0 1 0 1 | 55 | 85
+ . . . | |
+ 1 0 1 0 1 0 1 0 | AA | 170
+ . . . | |
+ 1 1 1 1 1 1 0 1 | FD | 253
+ 1 1 1 1 1 1 1 0 | FE | 254
+ 1 1 1 1 1 1 1 1 | FF | 255
+
+
+Setting the I/O Base Address
+----------------------------
+
+The first six switches in switch group SW1 are used to select one
+of 32 possible I/O Base addresses using the following table
+
+ Switch | Hex I/O
+ 6 5 4 3 2 1 | Address
+ -------------|--------
+ 0 1 0 0 0 0 | 200
+ 0 1 0 0 0 1 | 210
+ 0 1 0 0 1 0 | 220
+ 0 1 0 0 1 1 | 230
+ 0 1 0 1 0 0 | 240
+ 0 1 0 1 0 1 | 250
+ 0 1 0 1 1 0 | 260
+ 0 1 0 1 1 1 | 270
+ 0 1 1 0 0 0 | 280
+ 0 1 1 0 0 1 | 290
+ 0 1 1 0 1 0 | 2A0
+ 0 1 1 0 1 1 | 2B0
+ 0 1 1 1 0 0 | 2C0
+ 0 1 1 1 0 1 | 2D0
+ 0 1 1 1 1 0 | 2E0 (Manufacturer's default)
+ 0 1 1 1 1 1 | 2F0
+ 1 1 0 0 0 0 | 300
+ 1 1 0 0 0 1 | 310
+ 1 1 0 0 1 0 | 320
+ 1 1 0 0 1 1 | 330
+ 1 1 0 1 0 0 | 340
+ 1 1 0 1 0 1 | 350
+ 1 1 0 1 1 0 | 360
+ 1 1 0 1 1 1 | 370
+ 1 1 1 0 0 0 | 380
+ 1 1 1 0 0 1 | 390
+ 1 1 1 0 1 0 | 3A0
+ 1 1 1 0 1 1 | 3B0
+ 1 1 1 1 0 0 | 3C0
+ 1 1 1 1 0 1 | 3D0
+ 1 1 1 1 1 0 | 3E0
+ 1 1 1 1 1 1 | 3F0
+
+
+Setting the Interrupt
+---------------------
+
+Switches seven through ten of switch group SW1 are used to select the
+interrupt level. The interrupt level is binary coded, so selections
+from 0 to 15 would be possible, but only the following eight values will
+be supported: 3, 4, 5, 7, 9, 10, 11, 12.
+
+ Switch | IRQ
+ 10 9 8 7 |
+ ---------|--------
+ 0 0 1 1 | 3
+ 0 1 0 0 | 4
+ 0 1 0 1 | 5
+ 0 1 1 1 | 7
+ 1 0 0 1 | 9 (=2) (default)
+ 1 0 1 0 | 10
+ 1 0 1 1 | 11
+ 1 1 0 0 | 12
+
+
+Setting the Timeouts
+--------------------
+
+The two jumpers JP2 (1-4) are used to determine the timeout parameters.
+These two jumpers are normally left open.
+Refer to the COM9026 Data Sheet for alternate configurations.
+
+
+Configuring the PC500 for Star or Bus Topology
+----------------------------------------------
+
+The single jumper labeled JP6 is used to configure the PC500 board for
+star or bus topology.
+When the jumper is installed, the board may be used in a star network, when
+it is removed, the board can be used in a bus topology.
+
+
+Diagnostic LEDs
+---------------
+
+Two diagnostic LEDs are visible on the rear bracket of the board.
+The green LED monitors the network activity: the red one shows the
+board activity:
+
+ Green | Status Red | Status
+ -------|------------------- ---------|-------------------
+ on | normal activity flash/on | data transfer
+ blink | reconfiguration off | no data transfer;
+ off | defective board or | incorrect memory or
+ | node ID is zero | I/O address
+
+
+*****************************************************************************
+
+** SMC **
+PC710 (8-bit card)
+------------------
+ - from J.S. van Oosten <jvoosten@compiler.tdcnet.nl>
+
+Note: this data is gathered by experimenting and looking at info of other
+cards. However, I'm sure I got 99% of the settings right.
+
+The SMC710 card resembles the PC270 card, but is much more basic (i.e. no
+LEDs, RJ11 jacks, etc.) and 8 bit. Here's a little drawing:
+
+ _______________________________________
+ | +---------+ +---------+ |____
+ | | S2 | | S1 | |
+ | +---------+ +---------+ |
+ | |
+ | +===+ __ |
+ | | R | | | X-tal ###___
+ | | O | |__| ####__'|
+ | | M | || ###
+ | +===+ |
+ | |
+ | .. JP1 +----------+ |
+ | .. | big chip | |
+ | .. | 90C63 | |
+ | .. | | |
+ | .. +----------+ |
+ ------- -----------
+ |||||||||||||||||||||
+
+The row of jumpers at JP1 actually consists of 8 jumpers, (sometimes
+labelled) the same as on the PC270, from top to bottom: EXT2, EXT1, ROM,
+IRQ7, IRQ5, IRQ4, IRQ3, IRQ2 (gee, wonder what they would do? :-) )
+
+S1 and S2 perform the same function as on the PC270, only their numbers
+are swapped (S1 is the nodeaddress, S2 sets IO- and RAM-address).
+
+I know it works when connected to a PC110 type ARCnet board.
+
+
+*****************************************************************************
+
+** Possibly SMC **
+LCS-8830(-T) (8 and 16-bit cards)
+---------------------------------
+ - from Mathias Katzer <mkatzer@HRZ.Uni-Bielefeld.DE>
+ - Marek Michalkiewicz <marekm@i17linuxb.ists.pwr.wroc.pl> says the
+ LCS-8830 is slightly different from LCS-8830-T. These are 8 bit, BUS
+ only (the JP0 jumper is hardwired), and BNC only.
+
+This is a LCS-8830-T made by SMC, I think ('SMC' only appears on one PLCC,
+nowhere else, not even on the few Xeroxed sheets from the manual).
+
+SMC ARCnet Board Type LCS-8830-T
+
+ ------------------------------------
+ | |
+ | JP3 88 8 JP2 |
+ | ##### | \ |
+ | ##### ET1 ET2 ###|
+ | 8 ###|
+ | U3 SW 1 JP0 ###| Phone Jacks
+ | -- ###|
+ | | | |
+ | | | SW2 |
+ | | | |
+ | | | ##### |
+ | -- ##### #### BNC Connector
+ | ####
+ | 888888 JP1 |
+ | 234567 |
+ -- -------
+ |||||||||||||||||||||||||||
+ --------------------------
+
+
+SW1: DIP-Switches for Station Address
+SW2: DIP-Switches for Memory Base and I/O Base addresses
+
+JP0: If closed, internal termination on (default open)
+JP1: IRQ Jumpers
+JP2: Boot-ROM enabled if closed
+JP3: Jumpers for response timeout
+
+U3: Boot-ROM Socket
+
+
+ET1 ET2 Response Time Idle Time Reconfiguration Time
+
+ 78 86 840
+ X 285 316 1680
+ X 563 624 1680
+ X X 1130 1237 1680
+
+(X means closed jumper)
+
+(DIP-Switch downwards means "0")
+
+The station address is binary-coded with SW1.
+
+The I/O base address is coded with DIP-Switches 6,7 and 8 of SW2:
+
+Switches Base
+678 Address
+000 260-26f
+100 290-29f
+010 2e0-2ef
+110 2f0-2ff
+001 300-30f
+101 350-35f
+011 380-38f
+111 3e0-3ef
+
+
+DIP Switches 1-5 of SW2 encode the RAM and ROM Address Range:
+
+Switches RAM ROM
+12345 Address Range Address Range
+00000 C:0000-C:07ff C:2000-C:3fff
+10000 C:0800-C:0fff
+01000 C:1000-C:17ff
+11000 C:1800-C:1fff
+00100 C:4000-C:47ff C:6000-C:7fff
+10100 C:4800-C:4fff
+01100 C:5000-C:57ff
+11100 C:5800-C:5fff
+00010 C:C000-C:C7ff C:E000-C:ffff
+10010 C:C800-C:Cfff
+01010 C:D000-C:D7ff
+11010 C:D800-C:Dfff
+00110 D:0000-D:07ff D:2000-D:3fff
+10110 D:0800-D:0fff
+01110 D:1000-D:17ff
+11110 D:1800-D:1fff
+00001 D:4000-D:47ff D:6000-D:7fff
+10001 D:4800-D:4fff
+01001 D:5000-D:57ff
+11001 D:5800-D:5fff
+00101 D:8000-D:87ff D:A000-D:bfff
+10101 D:8800-D:8fff
+01101 D:9000-D:97ff
+11101 D:9800-D:9fff
+00011 D:C000-D:c7ff D:E000-D:ffff
+10011 D:C800-D:cfff
+01011 D:D000-D:d7ff
+11011 D:D800-D:dfff
+00111 E:0000-E:07ff E:2000-E:3fff
+10111 E:0800-E:0fff
+01111 E:1000-E:17ff
+11111 E:1800-E:1fff
+
+
+*****************************************************************************
+
+** PureData Corp **
+PDI507 (8-bit card)
+--------------------
+ - from Mark Rejhon <mdrejhon@magi.com> (slight modifications by Avery)
+ - Avery's note: I think PDI508 cards (but definitely NOT PDI508Plus cards)
+ are mostly the same as this. PDI508Plus cards appear to be mainly
+ software-configured.
+
+Jumpers:
+ There is a jumper array at the bottom of the card, near the edge
+ connector. This array is labelled J1. They control the IRQs and
+ something else. Put only one jumper on the IRQ pins.
+
+ ETS1, ETS2 are for timing on very long distance networks. See the
+ more general information near the top of this file.
+
+ There is a J2 jumper on two pins. A jumper should be put on them,
+ since it was already there when I got the card. I don't know what
+ this jumper is for though.
+
+ There is a two-jumper array for J3. I don't know what it is for,
+ but there were already two jumpers on it when I got the card. It's
+ a six pin grid in a two-by-three fashion. The jumpers were
+ configured as follows:
+
+ .-------.
+ o | o o |
+ :-------: ------> Accessible end of card with connectors
+ o | o o | in this direction ------->
+ `-------'
+
+Carl de Billy <CARL@carainfo.com> explains J3 and J4:
+
+ J3 Diagram:
+
+ .-------.
+ o | o o |
+ :-------: TWIST Technology
+ o | o o |
+ `-------'
+ .-------.
+ | o o | o
+ :-------: COAX Technology
+ | o o | o
+ `-------'
+
+ - If using coax cable in a bus topology the J4 jumper must be removed;
+ place it on one pin.
+
+ - If using bus topology with twisted pair wiring move the J3
+ jumpers so they connect the middle pin and the pins closest to the RJ11
+ Connectors. Also the J4 jumper must be removed; place it on one pin of
+ J4 jumper for storage.
+
+ - If using star topology with twisted pair wiring move the J3
+ jumpers so they connect the middle pin and the pins closest to the RJ11
+ connectors.
+
+
+DIP Switches:
+
+ The DIP switches accessible on the accessible end of the card while
+ it is installed, is used to set the ARCnet address. There are 8
+ switches. Use an address from 1 to 254.
+
+ Switch No.
+ 12345678 ARCnet address
+ -----------------------------------------
+ 00000000 FF (Don't use this!)
+ 00000001 FE
+ 00000010 FD
+ ....
+ 11111101 2
+ 11111110 1
+ 11111111 0 (Don't use this!)
+
+ There is another array of eight DIP switches at the top of the
+ card. There are five labelled MS0-MS4 which seem to control the
+ memory address, and another three labelled IO0-IO2 which seem to
+ control the base I/O address of the card.
+
+ This was difficult to test by trial and error, and the I/O addresses
+ are in a weird order. This was tested by setting the DIP switches,
+ rebooting the computer, and attempting to load ARCETHER at various
+ addresses (mostly between 0x200 and 0x400). The address that caused
+ the red transmit LED to blink, is the one that I thought works.
+
+ Also, the address 0x3D0 seem to have a special meaning, since the
+ ARCETHER packet driver loaded fine, but without the red LED
+ blinking. I don't know what 0x3D0 is for though. I recommend using
+ an address of 0x300 since Windows may not like addresses below
+ 0x300.
+
+ IO Switch No.
+ 210 I/O address
+ -------------------------------
+ 111 0x260
+ 110 0x290
+ 101 0x2E0
+ 100 0x2F0
+ 011 0x300
+ 010 0x350
+ 001 0x380
+ 000 0x3E0
+
+ The memory switches set a reserved address space of 0x1000 bytes
+ (0x100 segment units, or 4k). For example if I set an address of
+ 0xD000, it will use up addresses 0xD000 to 0xD100.
+
+ The memory switches were tested by booting using QEMM386 stealth,
+ and using LOADHI to see what address automatically became excluded
+ from the upper memory regions, and then attempting to load ARCETHER
+ using these addresses.
+
+ I recommend using an ARCnet memory address of 0xD000, and putting
+ the EMS page frame at 0xC000 while using QEMM stealth mode. That
+ way, you get contiguous high memory from 0xD100 almost all the way
+ the end of the megabyte.
+
+ Memory Switch 0 (MS0) didn't seem to work properly when set to OFF
+ on my card. It could be malfunctioning on my card. Experiment with
+ it ON first, and if it doesn't work, set it to OFF. (It may be a
+ modifier for the 0x200 bit?)
+
+ MS Switch No.
+ 43210 Memory address
+ --------------------------------
+ 00001 0xE100 (guessed - was not detected by QEMM)
+ 00011 0xE000 (guessed - was not detected by QEMM)
+ 00101 0xDD00
+ 00111 0xDC00
+ 01001 0xD900
+ 01011 0xD800
+ 01101 0xD500
+ 01111 0xD400
+ 10001 0xD100
+ 10011 0xD000
+ 10101 0xCD00
+ 10111 0xCC00
+ 11001 0xC900 (guessed - crashes tested system)
+ 11011 0xC800 (guessed - crashes tested system)
+ 11101 0xC500 (guessed - crashes tested system)
+ 11111 0xC400 (guessed - crashes tested system)
+
+
+*****************************************************************************
+
+** CNet Technology Inc. **
+120 Series (8-bit cards)
+------------------------
+ - from Juergen Seifert <seifert@htwm.de>
+
+
+CNET TECHNOLOGY INC. (CNet) ARCNET 120A SERIES
+==============================================
+
+This description has been written by Juergen Seifert <seifert@htwm.de>
+using information from the following Original CNet Manual
+
+ "ARCNET
+ USER'S MANUAL
+ for
+ CN120A
+ CN120AB
+ CN120TP
+ CN120ST
+ CN120SBT
+ P/N:12-01-0007
+ Revision 3.00"
+
+ARCNET is a registered trademark of the Datapoint Corporation
+
+P/N 120A ARCNET 8 bit XT/AT Star
+P/N 120AB ARCNET 8 bit XT/AT Bus
+P/N 120TP ARCNET 8 bit XT/AT Twisted Pair
+P/N 120ST ARCNET 8 bit XT/AT Star, Twisted Pair
+P/N 120SBT ARCNET 8 bit XT/AT Star, Bus, Twisted Pair
+
+ __________________________________________________________________
+ | |
+ | ___|
+ | LED |___|
+ | ___|
+ | N | | ID7
+ | o | | ID6
+ | d | S | ID5
+ | e | W | ID4
+ | ___________________ A | 2 | ID3
+ | | | d | | ID2
+ | | | 1 2 3 4 5 6 7 8 d | | ID1
+ | | | _________________ r |___| ID0
+ | | 90C65 || SW1 | ____|
+ | JP 8 7 | ||_________________| | |
+ | |o|o| JP1 | | | J2 |
+ | |o|o| |oo| | | JP 1 1 1 | |
+ | ______________ | | 0 1 2 |____|
+ | | PROM | |___________________| |o|o|o| _____|
+ | > SOCKET | JP 6 5 4 3 2 |o|o|o| | J1 |
+ | |______________| |o|o|o|o|o| |o|o|o| |_____|
+ |_____ |o|o|o|o|o| ______________|
+ | |
+ |_____________________________________________|
+
+Legend:
+
+90C65 ARCNET Probe
+S1 1-5: Base Memory Address Select
+ 6-8: Base I/O Address Select
+S2 1-8: Node ID Select (ID0-ID7)
+JP1 ROM Enable Select
+JP2 IRQ2
+JP3 IRQ3
+JP4 IRQ4
+JP5 IRQ5
+JP6 IRQ7
+JP7/JP8 ET1, ET2 Timeout Parameters
+JP10/JP11 Coax / Twisted Pair Select (CN120ST/SBT only)
+JP12 Terminator Select (CN120AB/ST/SBT only)
+J1 BNC RG62/U Connector (all except CN120TP)
+J2 Two 6-position Telephone Jack (CN120TP/ST/SBT only)
+
+Setting one of the switches to Off means "1", On means "0".
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in SW2 are used to set the node ID. Each node attached
+to the network must have an unique node ID which must be different from 0.
+Switch 1 (ID0) serves as the least significant bit (LSB).
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Label | Value
+ -------|-------|-------
+ 1 | ID0 | 1
+ 2 | ID1 | 2
+ 3 | ID2 | 4
+ 4 | ID3 | 8
+ 5 | ID4 | 16
+ 6 | ID5 | 32
+ 7 | ID6 | 64
+ 8 | ID7 | 128
+
+Some Examples:
+
+ Switch | Hex | Decimal
+ 8 7 6 5 4 3 2 1 | Node ID | Node ID
+ ----------------|---------|---------
+ 0 0 0 0 0 0 0 0 | not allowed
+ 0 0 0 0 0 0 0 1 | 1 | 1
+ 0 0 0 0 0 0 1 0 | 2 | 2
+ 0 0 0 0 0 0 1 1 | 3 | 3
+ . . . | |
+ 0 1 0 1 0 1 0 1 | 55 | 85
+ . . . | |
+ 1 0 1 0 1 0 1 0 | AA | 170
+ . . . | |
+ 1 1 1 1 1 1 0 1 | FD | 253
+ 1 1 1 1 1 1 1 0 | FE | 254
+ 1 1 1 1 1 1 1 1 | FF | 255
+
+
+Setting the I/O Base Address
+----------------------------
+
+The last three switches in switch block SW1 are used to select one
+of eight possible I/O Base addresses using the following table
+
+
+ Switch | Hex I/O
+ 6 7 8 | Address
+ ------------|--------
+ ON ON ON | 260
+ OFF ON ON | 290
+ ON OFF ON | 2E0 (Manufacturer's default)
+ OFF OFF ON | 2F0
+ ON ON OFF | 300
+ OFF ON OFF | 350
+ ON OFF OFF | 380
+ OFF OFF OFF | 3E0
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The memory buffer (RAM) requires 2K. The base of this buffer can be
+located in any of eight positions. The address of the Boot Prom is
+memory base + 8K or memory base + 0x2000.
+Switches 1-5 of switch block SW1 select the Memory Base address.
+
+ Switch | Hex RAM | Hex ROM
+ 1 2 3 4 5 | Address | Address *)
+ --------------------|---------|-----------
+ ON ON ON ON ON | C0000 | C2000
+ ON ON OFF ON ON | C4000 | C6000
+ ON ON ON OFF ON | CC000 | CE000
+ ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default)
+ ON ON ON ON OFF | D4000 | D6000
+ ON ON OFF ON OFF | D8000 | DA000
+ ON ON ON OFF OFF | DC000 | DE000
+ ON ON OFF OFF OFF | E0000 | E2000
+
+*) To enable the Boot ROM install the jumper JP1
+
+Note: Since the switches 1 and 2 are always set to ON it may be possible
+ that they can be used to add an offset of 2K, 4K or 6K to the base
+ address, but this feature is not documented in the manual and I
+ haven't tested it yet.
+
+
+Setting the Interrupt Line
+--------------------------
+
+To select a hardware interrupt level install one (only one!) of the jumpers
+JP2, JP3, JP4, JP5, JP6. JP2 is the default.
+
+ Jumper | IRQ
+ -------|-----
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 7
+
+
+Setting the Internal Terminator on CN120AB/TP/SBT
+--------------------------------------------------
+
+The jumper JP12 is used to enable the internal terminator.
+
+ -----
+ 0 | 0 |
+ ----- ON | | ON
+ | 0 | | 0 |
+ | | OFF ----- OFF
+ | 0 | 0
+ -----
+ Terminator Terminator
+ disabled enabled
+
+
+Selecting the Connector Type on CN120ST/SBT
+-------------------------------------------
+
+ JP10 JP11 JP10 JP11
+ ----- -----
+ 0 0 | 0 | | 0 |
+ ----- ----- | | | |
+ | 0 | | 0 | | 0 | | 0 |
+ | | | | ----- -----
+ | 0 | | 0 | 0 0
+ ----- -----
+ Coaxial Cable Twisted Pair Cable
+ (Default)
+
+
+Setting the Timeout Parameters
+------------------------------
+
+The jumpers labeled EXT1 and EXT2 are used to determine the timeout
+parameters. These two jumpers are normally left open.
+
+
+
+*****************************************************************************
+
+** CNet Technology Inc. **
+160 Series (16-bit cards)
+-------------------------
+ - from Juergen Seifert <seifert@htwm.de>
+
+CNET TECHNOLOGY INC. (CNet) ARCNET 160A SERIES
+==============================================
+
+This description has been written by Juergen Seifert <seifert@htwm.de>
+using information from the following Original CNet Manual
+
+ "ARCNET
+ USER'S MANUAL
+ for
+ CN160A
+ CN160AB
+ CN160TP
+ P/N:12-01-0006
+ Revision 3.00"
+
+ARCNET is a registered trademark of the Datapoint Corporation
+
+P/N 160A ARCNET 16 bit XT/AT Star
+P/N 160AB ARCNET 16 bit XT/AT Bus
+P/N 160TP ARCNET 16 bit XT/AT Twisted Pair
+
+ ___________________________________________________________________
+ < _________________________ ___|
+ > |oo| JP2 | | LED |___|
+ < |oo| JP1 | 9026 | LED |___|
+ > |_________________________| ___|
+ < N | | ID7
+ > 1 o | | ID6
+ < 1 2 3 4 5 6 7 8 9 0 d | S | ID5
+ > _______________ _____________________ e | W | ID4
+ < | PROM | | SW1 | A | 2 | ID3
+ > > SOCKET | |_____________________| d | | ID2
+ < |_______________| | IO-Base | MEM | d | | ID1
+ > r |___| ID0
+ < ____|
+ > | |
+ < | J1 |
+ > | |
+ < |____|
+ > 1 1 1 1 |
+ < 3 4 5 6 7 JP 8 9 0 1 2 3 |
+ > |o|o|o|o|o| |o|o|o|o|o|o| |
+ < |o|o|o|o|o| __ |o|o|o|o|o|o| ___________|
+ > | | |
+ <____________| |_______________________________________|
+
+Legend:
+
+9026 ARCNET Probe
+SW1 1-6: Base I/O Address Select
+ 7-10: Base Memory Address Select
+SW2 1-8: Node ID Select (ID0-ID7)
+JP1/JP2 ET1, ET2 Timeout Parameters
+JP3-JP13 Interrupt Select
+J1 BNC RG62/U Connector (CN160A/AB only)
+J1 Two 6-position Telephone Jack (CN160TP only)
+LED
+
+Setting one of the switches to Off means "1", On means "0".
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in SW2 are used to set the node ID. Each node attached
+to the network must have an unique node ID which must be different from 0.
+Switch 1 (ID0) serves as the least significant bit (LSB).
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Label | Value
+ -------|-------|-------
+ 1 | ID0 | 1
+ 2 | ID1 | 2
+ 3 | ID2 | 4
+ 4 | ID3 | 8
+ 5 | ID4 | 16
+ 6 | ID5 | 32
+ 7 | ID6 | 64
+ 8 | ID7 | 128
+
+Some Examples:
+
+ Switch | Hex | Decimal
+ 8 7 6 5 4 3 2 1 | Node ID | Node ID
+ ----------------|---------|---------
+ 0 0 0 0 0 0 0 0 | not allowed
+ 0 0 0 0 0 0 0 1 | 1 | 1
+ 0 0 0 0 0 0 1 0 | 2 | 2
+ 0 0 0 0 0 0 1 1 | 3 | 3
+ . . . | |
+ 0 1 0 1 0 1 0 1 | 55 | 85
+ . . . | |
+ 1 0 1 0 1 0 1 0 | AA | 170
+ . . . | |
+ 1 1 1 1 1 1 0 1 | FD | 253
+ 1 1 1 1 1 1 1 0 | FE | 254
+ 1 1 1 1 1 1 1 1 | FF | 255
+
+
+Setting the I/O Base Address
+----------------------------
+
+The first six switches in switch block SW1 are used to select the I/O Base
+address using the following table:
+
+ Switch | Hex I/O
+ 1 2 3 4 5 6 | Address
+ ------------------------|--------
+ OFF ON ON OFF OFF ON | 260
+ OFF ON OFF ON ON OFF | 290
+ OFF ON OFF OFF OFF ON | 2E0 (Manufacturer's default)
+ OFF ON OFF OFF OFF OFF | 2F0
+ OFF OFF ON ON ON ON | 300
+ OFF OFF ON OFF ON OFF | 350
+ OFF OFF OFF ON ON ON | 380
+ OFF OFF OFF OFF OFF ON | 3E0
+
+Note: Other IO-Base addresses seem to be selectable, but only the above
+ combinations are documented.
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The switches 7-10 of switch block SW1 are used to select the Memory
+Base address of the RAM (2K) and the PROM.
+
+ Switch | Hex RAM | Hex ROM
+ 7 8 9 10 | Address | Address
+ ----------------|---------|-----------
+ OFF OFF ON ON | C0000 | C8000
+ OFF OFF ON OFF | D0000 | D8000 (Default)
+ OFF OFF OFF ON | E0000 | E8000
+
+Note: Other MEM-Base addresses seem to be selectable, but only the above
+ combinations are documented.
+
+
+Setting the Interrupt Line
+--------------------------
+
+To select a hardware interrupt level install one (only one!) of the jumpers
+JP3 through JP13 using the following table:
+
+ Jumper | IRQ
+ -------|-----------------
+ 3 | 14
+ 4 | 15
+ 5 | 12
+ 6 | 11
+ 7 | 10
+ 8 | 3
+ 9 | 4
+ 10 | 5
+ 11 | 6
+ 12 | 7
+ 13 | 2 (=9) Default!
+
+Note: - Do not use JP11=IRQ6, it may conflict with your Floppy Disk
+ Controller
+ - Use JP3=IRQ14 only, if you don't have an IDE-, MFM-, or RLL-
+ Hard Disk, it may conflict with their controllers
+
+
+Setting the Timeout Parameters
+------------------------------
+
+The jumpers labeled JP1 and JP2 are used to determine the timeout
+parameters. These two jumpers are normally left open.
+
+
+*****************************************************************************
+
+** Lantech **
+8-bit card, unknown model
+-------------------------
+ - from Vlad Lungu <vlungu@ugal.ro> - his e-mail address seemed broken at
+ the time I tried to reach him. Sorry Vlad, if you didn't get my reply.
+
+ ________________________________________________________________
+ | 1 8 |
+ | ___________ __|
+ | | SW1 | LED |__|
+ | |__________| |
+ | ___|
+ | _____________________ |S | 8
+ | | | |W |
+ | | | |2 |
+ | | | |__| 1
+ | | UM9065L | |o| JP4 ____|____
+ | | | |o| | CN |
+ | | | |________|
+ | | | |
+ | |___________________| |
+ | |
+ | |
+ | _____________ |
+ | | | |
+ | | PROM | |ooooo| JP6 |
+ | |____________| |ooooo| |
+ |_____________ _ _|
+ |____________________________________________| |__|
+
+
+UM9065L : ARCnet Controller
+
+SW 1 : Shared Memory Address and I/O Base
+
+ ON=0
+
+ 12345|Memory Address
+ -----|--------------
+ 00001| D4000
+ 00010| CC000
+ 00110| D0000
+ 01110| D1000
+ 01101| D9000
+ 10010| CC800
+ 10011| DC800
+ 11110| D1800
+
+It seems that the bits are considered in reverse order. Also, you must
+observe that some of those addresses are unusual and I didn't probe them; I
+used a memory dump in DOS to identify them. For the 00000 configuration and
+some others that I didn't write here the card seems to conflict with the
+video card (an S3 GENDAC). I leave the full decoding of those addresses to
+you.
+
+ 678| I/O Address
+ ---|------------
+ 000| 260
+ 001| failed probe
+ 010| 2E0
+ 011| 380
+ 100| 290
+ 101| 350
+ 110| failed probe
+ 111| 3E0
+
+SW 2 : Node ID (binary coded)
+
+JP 4 : Boot PROM enable CLOSE - enabled
+ OPEN - disabled
+
+JP 6 : IRQ set (ONLY ONE jumper on 1-5 for IRQ 2-6)
+
+
+*****************************************************************************
+
+** Acer **
+8-bit card, Model 5210-003
+--------------------------
+ - from Vojtech Pavlik <vojtech@suse.cz> using portions of the existing
+ arcnet-hardware file.
+
+This is a 90C26 based card. Its configuration seems similar to the SMC
+PC100, but has some additional jumpers I don't know the meaning of.
+
+ __
+ | |
+ ___________|__|_________________________
+ | | | |
+ | | BNC | |
+ | |______| ___|
+ | _____________________ |___
+ | | | |
+ | | Hybrid IC | |
+ | | | o|o J1 |
+ | |_____________________| 8|8 |
+ | 8|8 J5 |
+ | o|o |
+ | 8|8 |
+ |__ 8|8 |
+ (|__| LED o|o |
+ | 8|8 |
+ | 8|8 J15 |
+ | |
+ | _____ |
+ | | | _____ |
+ | | | | | ___|
+ | | | | | |
+ | _____ | ROM | | UFS | |
+ | | | | | | | |
+ | | | ___ | | | | |
+ | | | | | |__.__| |__.__| |
+ | | NCR | |XTL| _____ _____ |
+ | | | |___| | | | | |
+ | |90C26| | | | | |
+ | | | | RAM | | UFS | |
+ | | | J17 o|o | | | | |
+ | | | J16 o|o | | | | |
+ | |__.__| |__.__| |__.__| |
+ | ___ |
+ | | |8 |
+ | |SW2| |
+ | | | |
+ | |___|1 |
+ | ___ |
+ | | |10 J18 o|o |
+ | | | o|o |
+ | |SW1| o|o |
+ | | | J21 o|o |
+ | |___|1 |
+ | |
+ |____________________________________|
+
+
+Legend:
+
+90C26 ARCNET Chip
+XTL 20 MHz Crystal
+SW1 1-6 Base I/O Address Select
+ 7-10 Memory Address Select
+SW2 1-8 Node ID Select (ID0-ID7)
+J1-J5 IRQ Select
+J6-J21 Unknown (Probably extra timeouts & ROM enable ...)
+LED1 Activity LED
+BNC Coax connector (STAR ARCnet)
+RAM 2k of SRAM
+ROM Boot ROM socket
+UFS Unidentified Flying Sockets
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in SW2 are used to set the node ID. Each node attached
+to the network must have an unique node ID which must not be 0.
+Switch 1 (ID0) serves as the least significant bit (LSB).
+
+Setting one of the switches to OFF means "1", ON means "0".
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Value
+ -------|-------
+ 1 | 1
+ 2 | 2
+ 3 | 4
+ 4 | 8
+ 5 | 16
+ 6 | 32
+ 7 | 64
+ 8 | 128
+
+Don't set this to 0 or 255; these values are reserved.
+
+
+Setting the I/O Base Address
+----------------------------
+
+The switches 1 to 6 of switch block SW1 are used to select one
+of 32 possible I/O Base addresses using the following tables
+
+ | Hex
+ Switch | Value
+ -------|-------
+ 1 | 200
+ 2 | 100
+ 3 | 80
+ 4 | 40
+ 5 | 20
+ 6 | 10
+
+The I/O address is sum of all switches set to "1". Remember that
+the I/O address space bellow 0x200 is RESERVED for mainboard, so
+switch 1 should be ALWAYS SET TO OFF.
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The memory buffer (RAM) requires 2K. The base of this buffer can be
+located in any of sixteen positions. However, the addresses below
+A0000 are likely to cause system hang because there's main RAM.
+
+Jumpers 7-10 of switch block SW1 select the Memory Base address.
+
+ Switch | Hex RAM
+ 7 8 9 10 | Address
+ ----------------|---------
+ OFF OFF OFF OFF | F0000 (conflicts with main BIOS)
+ OFF OFF OFF ON | E0000
+ OFF OFF ON OFF | D0000
+ OFF OFF ON ON | C0000 (conflicts with video BIOS)
+ OFF ON OFF OFF | B0000 (conflicts with mono video)
+ OFF ON OFF ON | A0000 (conflicts with graphics)
+
+
+Setting the Interrupt Line
+--------------------------
+
+Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means
+shorted, OFF means open.
+
+ Jumper | IRQ
+ 1 2 3 4 5 |
+ ----------------------------
+ ON OFF OFF OFF OFF | 7
+ OFF ON OFF OFF OFF | 5
+ OFF OFF ON OFF OFF | 4
+ OFF OFF OFF ON OFF | 3
+ OFF OFF OFF OFF ON | 2
+
+
+Unknown jumpers & sockets
+-------------------------
+
+I know nothing about these. I just guess that J16&J17 are timeout
+jumpers and maybe one of J18-J21 selects ROM. Also J6-J10 and
+J11-J15 are connecting IRQ2-7 to some pins on the UFSs. I can't
+guess the purpose.
+
+
+*****************************************************************************
+
+** Datapoint? **
+LAN-ARC-8, an 8-bit card
+------------------------
+ - from Vojtech Pavlik <vojtech@suse.cz>
+
+This is another SMC 90C65-based ARCnet card. I couldn't identify the
+manufacturer, but it might be DataPoint, because the card has the
+original arcNet logo in its upper right corner.
+
+ _______________________________________________________
+ | _________ |
+ | | SW2 | ON arcNet |
+ | |_________| OFF ___|
+ | _____________ 1 ______ 8 | | 8
+ | | | SW1 | XTAL | ____________ | S |
+ | > RAM (2k) | |______|| | | W |
+ | |_____________| | H | | 3 |
+ | _________|_____ y | |___| 1
+ | _________ | | |b | |
+ | |_________| | | |r | |
+ | | SMC | |i | |
+ | | 90C65| |d | |
+ | _________ | | | | |
+ | | SW1 | ON | | |I | |
+ | |_________| OFF |_________|_____/C | _____|
+ | 1 8 | | | |___
+ | ______________ | | | BNC |___|
+ | | | |____________| |_____|
+ | > EPROM SOCKET | _____________ |
+ | |______________| |_____________| |
+ | ______________|
+ | |
+ |________________________________________|
+
+Legend:
+
+90C65 ARCNET Chip
+SW1 1-5: Base Memory Address Select
+ 6-8: Base I/O Address Select
+SW2 1-8: Node ID Select
+SW3 1-5: IRQ Select
+ 6-7: Extra Timeout
+ 8 : ROM Enable
+BNC Coax connector
+XTAL 20 MHz Crystal
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in SW3 are used to set the node ID. Each node attached
+to the network must have an unique node ID which must not be 0.
+Switch 1 serves as the least significant bit (LSB).
+
+Setting one of the switches to Off means "1", On means "0".
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Value
+ -------|-------
+ 1 | 1
+ 2 | 2
+ 3 | 4
+ 4 | 8
+ 5 | 16
+ 6 | 32
+ 7 | 64
+ 8 | 128
+
+
+Setting the I/O Base Address
+----------------------------
+
+The last three switches in switch block SW1 are used to select one
+of eight possible I/O Base addresses using the following table
+
+
+ Switch | Hex I/O
+ 6 7 8 | Address
+ ------------|--------
+ ON ON ON | 260
+ OFF ON ON | 290
+ ON OFF ON | 2E0 (Manufacturer's default)
+ OFF OFF ON | 2F0
+ ON ON OFF | 300
+ OFF ON OFF | 350
+ ON OFF OFF | 380
+ OFF OFF OFF | 3E0
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The memory buffer (RAM) requires 2K. The base of this buffer can be
+located in any of eight positions. The address of the Boot Prom is
+memory base + 0x2000.
+Jumpers 3-5 of switch block SW1 select the Memory Base address.
+
+ Switch | Hex RAM | Hex ROM
+ 1 2 3 4 5 | Address | Address *)
+ --------------------|---------|-----------
+ ON ON ON ON ON | C0000 | C2000
+ ON ON OFF ON ON | C4000 | C6000
+ ON ON ON OFF ON | CC000 | CE000
+ ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default)
+ ON ON ON ON OFF | D4000 | D6000
+ ON ON OFF ON OFF | D8000 | DA000
+ ON ON ON OFF OFF | DC000 | DE000
+ ON ON OFF OFF OFF | E0000 | E2000
+
+*) To enable the Boot ROM set the switch 8 of switch block SW3 to position ON.
+
+The switches 1 and 2 probably add 0x0800 and 0x1000 to RAM base address.
+
+
+Setting the Interrupt Line
+--------------------------
+
+Switches 1-5 of the switch block SW3 control the IRQ level.
+
+ Jumper | IRQ
+ 1 2 3 4 5 |
+ ----------------------------
+ ON OFF OFF OFF OFF | 3
+ OFF ON OFF OFF OFF | 4
+ OFF OFF ON OFF OFF | 5
+ OFF OFF OFF ON OFF | 7
+ OFF OFF OFF OFF ON | 2
+
+
+Setting the Timeout Parameters
+------------------------------
+
+The switches 6-7 of the switch block SW3 are used to determine the timeout
+parameters. These two switches are normally left in the OFF position.
+
+
+*****************************************************************************
+
+** Topware **
+8-bit card, TA-ARC/10
+-------------------------
+ - from Vojtech Pavlik <vojtech@suse.cz>
+
+This is another very similar 90C65 card. Most of the switches and jumpers
+are the same as on other clones.
+
+ _____________________________________________________________________
+| ___________ | | ______ |
+| |SW2 NODE ID| | | | XTAL | |
+| |___________| | Hybrid IC | |______| |
+| ___________ | | __|
+| |SW1 MEM+I/O| |_________________________| LED1|__|)
+| |___________| 1 2 |
+| J3 |o|o| TIMEOUT ______|
+| ______________ |o|o| | |
+| | | ___________________ | RJ |
+| > EPROM SOCKET | | \ |------|
+|J2 |______________| | | | |
+||o| | | |______|
+||o| ROM ENABLE | SMC | _________ |
+| _____________ | 90C65 | |_________| _____|
+| | | | | | |___
+| > RAM (2k) | | | | BNC |___|
+| |_____________| | | |_____|
+| |____________________| |
+| ________ IRQ 2 3 4 5 7 ___________ |
+||________| |o|o|o|o|o| |___________| |
+|________ J1|o|o|o|o|o| ______________|
+ | |
+ |_____________________________________________|
+
+Legend:
+
+90C65 ARCNET Chip
+XTAL 20 MHz Crystal
+SW1 1-5 Base Memory Address Select
+ 6-8 Base I/O Address Select
+SW2 1-8 Node ID Select (ID0-ID7)
+J1 IRQ Select
+J2 ROM Enable
+J3 Extra Timeout
+LED1 Activity LED
+BNC Coax connector (BUS ARCnet)
+RJ Twisted Pair Connector (daisy chain)
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in SW2 are used to set the node ID. Each node attached to
+the network must have an unique node ID which must not be 0. Switch 1 (ID0)
+serves as the least significant bit (LSB).
+
+Setting one of the switches to Off means "1", On means "0".
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Label | Value
+ -------|-------|-------
+ 1 | ID0 | 1
+ 2 | ID1 | 2
+ 3 | ID2 | 4
+ 4 | ID3 | 8
+ 5 | ID4 | 16
+ 6 | ID5 | 32
+ 7 | ID6 | 64
+ 8 | ID7 | 128
+
+Setting the I/O Base Address
+----------------------------
+
+The last three switches in switch block SW1 are used to select one
+of eight possible I/O Base addresses using the following table:
+
+
+ Switch | Hex I/O
+ 6 7 8 | Address
+ ------------|--------
+ ON ON ON | 260 (Manufacturer's default)
+ OFF ON ON | 290
+ ON OFF ON | 2E0
+ OFF OFF ON | 2F0
+ ON ON OFF | 300
+ OFF ON OFF | 350
+ ON OFF OFF | 380
+ OFF OFF OFF | 3E0
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The memory buffer (RAM) requires 2K. The base of this buffer can be
+located in any of eight positions. The address of the Boot Prom is
+memory base + 0x2000.
+Jumpers 3-5 of switch block SW1 select the Memory Base address.
+
+ Switch | Hex RAM | Hex ROM
+ 1 2 3 4 5 | Address | Address *)
+ --------------------|---------|-----------
+ ON ON ON ON ON | C0000 | C2000
+ ON ON OFF ON ON | C4000 | C6000 (Manufacturer's default)
+ ON ON ON OFF ON | CC000 | CE000
+ ON ON OFF OFF ON | D0000 | D2000
+ ON ON ON ON OFF | D4000 | D6000
+ ON ON OFF ON OFF | D8000 | DA000
+ ON ON ON OFF OFF | DC000 | DE000
+ ON ON OFF OFF OFF | E0000 | E2000
+
+*) To enable the Boot ROM short the jumper J2.
+
+The jumpers 1 and 2 probably add 0x0800 and 0x1000 to RAM address.
+
+
+Setting the Interrupt Line
+--------------------------
+
+Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means
+shorted, OFF means open.
+
+ Jumper | IRQ
+ 1 2 3 4 5 |
+ ----------------------------
+ ON OFF OFF OFF OFF | 2
+ OFF ON OFF OFF OFF | 3
+ OFF OFF ON OFF OFF | 4
+ OFF OFF OFF ON OFF | 5
+ OFF OFF OFF OFF ON | 7
+
+
+Setting the Timeout Parameters
+------------------------------
+
+The jumpers J3 are used to set the timeout parameters. These two
+jumpers are normally left open.
+
+
+*****************************************************************************
+
+** Thomas-Conrad **
+Model #500-6242-0097 REV A (8-bit card)
+---------------------------------------
+ - from Lars Karlsson <100617.3473@compuserve.com>
+
+ ________________________________________________________
+ | ________ ________ |_____
+ | |........| |........| |
+ | |________| |________| ___|
+ | SW 3 SW 1 | |
+ | Base I/O Base Addr. Station | |
+ | address | |
+ | ______ switch | |
+ | | | | |
+ | | | |___|
+ | | | ______ |___._
+ | |______| |______| ____| BNC
+ | Jumper- _____| Connector
+ | Main chip block _ __| '
+ | | | | RJ Connector
+ | |_| | with 110 Ohm
+ | |__ Terminator
+ | ___________ __|
+ | |...........| | RJ-jack
+ | |...........| _____ | (unused)
+ | |___________| |_____| |__
+ | Boot PROM socket IRQ-jumpers |_ Diagnostic
+ |________ __ _| LED (red)
+ | | | | | | | | | | | | | | | | | | | | | |
+ | | | | | | | | | | | | | | | | | | | | |________|
+ |
+ |
+
+And here are the settings for some of the switches and jumpers on the cards.
+
+
+ I/O
+
+ 1 2 3 4 5 6 7 8
+
+2E0----- 0 0 0 1 0 0 0 1
+2F0----- 0 0 0 1 0 0 0 0
+300----- 0 0 0 0 1 1 1 1
+350----- 0 0 0 0 1 1 1 0
+
+"0" in the above example means switch is off "1" means that it is on.
+
+
+ ShMem address.
+
+ 1 2 3 4 5 6 7 8
+
+CX00--0 0 1 1 | | |
+DX00--0 0 1 0 |
+X000--------- 1 1 |
+X400--------- 1 0 |
+X800--------- 0 1 |
+XC00--------- 0 0
+ENHANCED----------- 1
+COMPATIBLE--------- 0
+
+
+ IRQ
+
+
+ 3 4 5 7 2
+ . . . . .
+ . . . . .
+
+
+There is a DIP-switch with 8 switches, used to set the shared memory address
+to be used. The first 6 switches set the address, the 7th doesn't have any
+function, and the 8th switch is used to select "compatible" or "enhanced".
+When I got my two cards, one of them had this switch set to "enhanced". That
+card didn't work at all, it wasn't even recognized by the driver. The other
+card had this switch set to "compatible" and it behaved absolutely normally. I
+guess that the switch on one of the cards, must have been changed accidentally
+when the card was taken out of its former host. The question remains
+unanswered, what is the purpose of the "enhanced" position?
+
+[Avery's note: "enhanced" probably either disables shared memory (use IO
+ports instead) or disables IO ports (use memory addresses instead). This
+varies by the type of card involved. I fail to see how either of these
+enhance anything. Send me more detailed information about this mode, or
+just use "compatible" mode instead.]
+
+
+*****************************************************************************
+
+** Waterloo Microsystems Inc. ?? **
+8-bit card (C) 1985
+-------------------
+ - from Robert Michael Best <rmb117@cs.usask.ca>
+
+[Avery's note: these don't work with my driver for some reason. These cards
+SEEM to have settings similar to the PDI508Plus, which is
+software-configured and doesn't work with my driver either. The "Waterloo
+chip" is a boot PROM, probably designed specifically for the University of
+Waterloo. If you have any further information about this card, please
+e-mail me.]
+
+The probe has not been able to detect the card on any of the J2 settings,
+and I tried them again with the "Waterloo" chip removed.
+
+ _____________________________________________________________________
+| \/ \/ ___ __ __ |
+| C4 C4 |^| | M || ^ ||^| |
+| -- -- |_| | 5 || || | C3 |
+| \/ \/ C10 |___|| ||_| |
+| C4 C4 _ _ | | ?? |
+| -- -- | \/ || | |
+| | || | |
+| | || C1 | |
+| | || | \/ _____|
+| | C6 || | C9 | |___
+| | || | -- | BNC |___|
+| | || | >C7| |_____|
+| | || | |
+| __ __ |____||_____| 1 2 3 6 |
+|| ^ | >C4| |o|o|o|o|o|o| J2 >C4| |
+|| | |o|o|o|o|o|o| |
+|| C2 | >C4| >C4| |
+|| | >C8| |
+|| | 2 3 4 5 6 7 IRQ >C4| |
+||_____| |o|o|o|o|o|o| J3 |
+|_______ |o|o|o|o|o|o| _______________|
+ | |
+ |_____________________________________________|
+
+C1 -- "COM9026
+ SMC 8638"
+ In a chip socket.
+
+C2 -- "@Copyright
+ Waterloo Microsystems Inc.
+ 1985"
+ In a chip Socket with info printed on a label covering a round window
+ showing the circuit inside. (The window indicates it is an EPROM chip.)
+
+C3 -- "COM9032
+ SMC 8643"
+ In a chip socket.
+
+C4 -- "74LS"
+ 9 total no sockets.
+
+M5 -- "50006-136
+ 20.000000 MHZ
+ MTQ-T1-S3
+ 0 M-TRON 86-40"
+ Metallic case with 4 pins, no socket.
+
+C6 -- "MOSTEK@TC8643
+ MK6116N-20
+ MALAYSIA"
+ No socket.
+
+C7 -- No stamp or label but in a 20 pin chip socket.
+
+C8 -- "PAL10L8CN
+ 8623"
+ In a 20 pin socket.
+
+C9 -- "PAl16R4A-2CN
+ 8641"
+ In a 20 pin socket.
+
+C10 -- "M8640
+ NMC
+ 9306N"
+ In an 8 pin socket.
+
+?? -- Some components on a smaller board and attached with 20 pins all
+ along the side closest to the BNC connector. The are coated in a dark
+ resin.
+
+On the board there are two jumper banks labeled J2 and J3. The
+manufacturer didn't put a J1 on the board. The two boards I have both
+came with a jumper box for each bank.
+
+J2 -- Numbered 1 2 3 4 5 6.
+ 4 and 5 are not stamped due to solder points.
+
+J3 -- IRQ 2 3 4 5 6 7
+
+The board itself has a maple leaf stamped just above the irq jumpers
+and "-2 46-86" beside C2. Between C1 and C6 "ASS 'Y 300163" and "@1986
+CORMAN CUSTOM ELECTRONICS CORP." stamped just below the BNC connector.
+Below that "MADE IN CANADA"
+
+
+*****************************************************************************
+
+** No Name **
+8-bit cards, 16-bit cards
+-------------------------
+ - from Juergen Seifert <seifert@htwm.de>
+
+NONAME 8-BIT ARCNET
+===================
+
+I have named this ARCnet card "NONAME", since there is no name of any
+manufacturer on the Installation manual nor on the shipping box. The only
+hint to the existence of a manufacturer at all is written in copper,
+it is "Made in Taiwan"
+
+This description has been written by Juergen Seifert <seifert@htwm.de>
+using information from the Original
+ "ARCnet Installation Manual"
+
+
+ ________________________________________________________________
+ | |STAR| BUS| T/P| |
+ | |____|____|____| |
+ | _____________________ |
+ | | | |
+ | | | |
+ | | | |
+ | | SMC | |
+ | | | |
+ | | COM90C65 | |
+ | | | |
+ | | | |
+ | |__________-__________| |
+ | _____|
+ | _______________ | CN |
+ | | PROM | |_____|
+ | > SOCKET | |
+ | |_______________| 1 2 3 4 5 6 7 8 1 2 3 4 5 6 7 8 |
+ | _______________ _______________ |
+ | |o|o|o|o|o|o|o|o| | SW1 || SW2 ||
+ | |o|o|o|o|o|o|o|o| |_______________||_______________||
+ |___ 2 3 4 5 7 E E R Node ID IOB__|__MEM____|
+ | \ IRQ / T T O |
+ |__________________1_2_M______________________|
+
+Legend:
+
+COM90C65: ARCnet Probe
+S1 1-8: Node ID Select
+S2 1-3: I/O Base Address Select
+ 4-6: Memory Base Address Select
+ 7-8: RAM Offset Select
+ET1, ET2 Extended Timeout Select
+ROM ROM Enable Select
+CN RG62 Coax Connector
+STAR| BUS | T/P Three fields for placing a sign (colored circle)
+ indicating the topology of the card
+
+Setting one of the switches to Off means "1", On means "0".
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in group SW1 are used to set the node ID.
+Each node attached to the network must have an unique node ID which
+must be different from 0.
+Switch 8 serves as the least significant bit (LSB).
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Value
+ -------|-------
+ 8 | 1
+ 7 | 2
+ 6 | 4
+ 5 | 8
+ 4 | 16
+ 3 | 32
+ 2 | 64
+ 1 | 128
+
+Some Examples:
+
+ Switch | Hex | Decimal
+ 1 2 3 4 5 6 7 8 | Node ID | Node ID
+ ----------------|---------|---------
+ 0 0 0 0 0 0 0 0 | not allowed
+ 0 0 0 0 0 0 0 1 | 1 | 1
+ 0 0 0 0 0 0 1 0 | 2 | 2
+ 0 0 0 0 0 0 1 1 | 3 | 3
+ . . . | |
+ 0 1 0 1 0 1 0 1 | 55 | 85
+ . . . | |
+ 1 0 1 0 1 0 1 0 | AA | 170
+ . . . | |
+ 1 1 1 1 1 1 0 1 | FD | 253
+ 1 1 1 1 1 1 1 0 | FE | 254
+ 1 1 1 1 1 1 1 1 | FF | 255
+
+
+Setting the I/O Base Address
+----------------------------
+
+The first three switches in switch group SW2 are used to select one
+of eight possible I/O Base addresses using the following table
+
+ Switch | Hex I/O
+ 1 2 3 | Address
+ ------------|--------
+ ON ON ON | 260
+ ON ON OFF | 290
+ ON OFF ON | 2E0 (Manufacturer's default)
+ ON OFF OFF | 2F0
+ OFF ON ON | 300
+ OFF ON OFF | 350
+ OFF OFF ON | 380
+ OFF OFF OFF | 3E0
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The memory buffer requires 2K of a 16K block of RAM. The base of this
+16K block can be located in any of eight positions.
+Switches 4-6 of switch group SW2 select the Base of the 16K block.
+Within that 16K address space, the buffer may be assigned any one of four
+positions, determined by the offset, switches 7 and 8 of group SW2.
+
+ Switch | Hex RAM | Hex ROM
+ 4 5 6 7 8 | Address | Address *)
+ -----------|---------|-----------
+ 0 0 0 0 0 | C0000 | C2000
+ 0 0 0 0 1 | C0800 | C2000
+ 0 0 0 1 0 | C1000 | C2000
+ 0 0 0 1 1 | C1800 | C2000
+ | |
+ 0 0 1 0 0 | C4000 | C6000
+ 0 0 1 0 1 | C4800 | C6000
+ 0 0 1 1 0 | C5000 | C6000
+ 0 0 1 1 1 | C5800 | C6000
+ | |
+ 0 1 0 0 0 | CC000 | CE000
+ 0 1 0 0 1 | CC800 | CE000
+ 0 1 0 1 0 | CD000 | CE000
+ 0 1 0 1 1 | CD800 | CE000
+ | |
+ 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default)
+ 0 1 1 0 1 | D0800 | D2000
+ 0 1 1 1 0 | D1000 | D2000
+ 0 1 1 1 1 | D1800 | D2000
+ | |
+ 1 0 0 0 0 | D4000 | D6000
+ 1 0 0 0 1 | D4800 | D6000
+ 1 0 0 1 0 | D5000 | D6000
+ 1 0 0 1 1 | D5800 | D6000
+ | |
+ 1 0 1 0 0 | D8000 | DA000
+ 1 0 1 0 1 | D8800 | DA000
+ 1 0 1 1 0 | D9000 | DA000
+ 1 0 1 1 1 | D9800 | DA000
+ | |
+ 1 1 0 0 0 | DC000 | DE000
+ 1 1 0 0 1 | DC800 | DE000
+ 1 1 0 1 0 | DD000 | DE000
+ 1 1 0 1 1 | DD800 | DE000
+ | |
+ 1 1 1 0 0 | E0000 | E2000
+ 1 1 1 0 1 | E0800 | E2000
+ 1 1 1 1 0 | E1000 | E2000
+ 1 1 1 1 1 | E1800 | E2000
+
+*) To enable the 8K Boot PROM install the jumper ROM.
+ The default is jumper ROM not installed.
+
+
+Setting Interrupt Request Lines (IRQ)
+-------------------------------------
+
+To select a hardware interrupt level set one (only one!) of the jumpers
+IRQ2, IRQ3, IRQ4, IRQ5 or IRQ7. The manufacturer's default is IRQ2.
+
+
+Setting the Timeouts
+--------------------
+
+The two jumpers labeled ET1 and ET2 are used to determine the timeout
+parameters (response and reconfiguration time). Every node in a network
+must be set to the same timeout values.
+
+ ET1 ET2 | Response Time (us) | Reconfiguration Time (ms)
+ --------|--------------------|--------------------------
+ Off Off | 78 | 840 (Default)
+ Off On | 285 | 1680
+ On Off | 563 | 1680
+ On On | 1130 | 1680
+
+On means jumper installed, Off means jumper not installed
+
+
+NONAME 16-BIT ARCNET
+====================
+
+The manual of my 8-Bit NONAME ARCnet Card contains another description
+of a 16-Bit Coax / Twisted Pair Card. This description is incomplete,
+because there are missing two pages in the manual booklet. (The table
+of contents reports pages ... 2-9, 2-11, 2-12, 3-1, ... but inside
+the booklet there is a different way of counting ... 2-9, 2-10, A-1,
+(empty page), 3-1, ..., 3-18, A-1 (again), A-2)
+Also the picture of the board layout is not as good as the picture of
+8-Bit card, because there isn't any letter like "SW1" written to the
+picture.
+Should somebody have such a board, please feel free to complete this
+description or to send a mail to me!
+
+This description has been written by Juergen Seifert <seifert@htwm.de>
+using information from the Original
+ "ARCnet Installation Manual"
+
+
+ ___________________________________________________________________
+ < _________________ _________________ |
+ > | SW? || SW? | |
+ < |_________________||_________________| |
+ > ____________________ |
+ < | | |
+ > | | |
+ < | | |
+ > | | |
+ < | | |
+ > | | |
+ < | | |
+ > |____________________| |
+ < ____|
+ > ____________________ | |
+ < | | | J1 |
+ > | < | |
+ < |____________________| ? ? ? ? ? ? |____|
+ > |o|o|o|o|o|o| |
+ < |o|o|o|o|o|o| |
+ > |
+ < __ ___________|
+ > | | |
+ <____________| |_______________________________________|
+
+
+Setting one of the switches to Off means "1", On means "0".
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in group SW2 are used to set the node ID.
+Each node attached to the network must have an unique node ID which
+must be different from 0.
+Switch 8 serves as the least significant bit (LSB).
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Value
+ -------|-------
+ 8 | 1
+ 7 | 2
+ 6 | 4
+ 5 | 8
+ 4 | 16
+ 3 | 32
+ 2 | 64
+ 1 | 128
+
+Some Examples:
+
+ Switch | Hex | Decimal
+ 1 2 3 4 5 6 7 8 | Node ID | Node ID
+ ----------------|---------|---------
+ 0 0 0 0 0 0 0 0 | not allowed
+ 0 0 0 0 0 0 0 1 | 1 | 1
+ 0 0 0 0 0 0 1 0 | 2 | 2
+ 0 0 0 0 0 0 1 1 | 3 | 3
+ . . . | |
+ 0 1 0 1 0 1 0 1 | 55 | 85
+ . . . | |
+ 1 0 1 0 1 0 1 0 | AA | 170
+ . . . | |
+ 1 1 1 1 1 1 0 1 | FD | 253
+ 1 1 1 1 1 1 1 0 | FE | 254
+ 1 1 1 1 1 1 1 1 | FF | 255
+
+
+Setting the I/O Base Address
+----------------------------
+
+The first three switches in switch group SW1 are used to select one
+of eight possible I/O Base addresses using the following table
+
+ Switch | Hex I/O
+ 3 2 1 | Address
+ ------------|--------
+ ON ON ON | 260
+ ON ON OFF | 290
+ ON OFF ON | 2E0 (Manufacturer's default)
+ ON OFF OFF | 2F0
+ OFF ON ON | 300
+ OFF ON OFF | 350
+ OFF OFF ON | 380
+ OFF OFF OFF | 3E0
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The memory buffer requires 2K of a 16K block of RAM. The base of this
+16K block can be located in any of eight positions.
+Switches 6-8 of switch group SW1 select the Base of the 16K block.
+Within that 16K address space, the buffer may be assigned any one of four
+positions, determined by the offset, switches 4 and 5 of group SW1.
+
+ Switch | Hex RAM | Hex ROM
+ 8 7 6 5 4 | Address | Address
+ -----------|---------|-----------
+ 0 0 0 0 0 | C0000 | C2000
+ 0 0 0 0 1 | C0800 | C2000
+ 0 0 0 1 0 | C1000 | C2000
+ 0 0 0 1 1 | C1800 | C2000
+ | |
+ 0 0 1 0 0 | C4000 | C6000
+ 0 0 1 0 1 | C4800 | C6000
+ 0 0 1 1 0 | C5000 | C6000
+ 0 0 1 1 1 | C5800 | C6000
+ | |
+ 0 1 0 0 0 | CC000 | CE000
+ 0 1 0 0 1 | CC800 | CE000
+ 0 1 0 1 0 | CD000 | CE000
+ 0 1 0 1 1 | CD800 | CE000
+ | |
+ 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default)
+ 0 1 1 0 1 | D0800 | D2000
+ 0 1 1 1 0 | D1000 | D2000
+ 0 1 1 1 1 | D1800 | D2000
+ | |
+ 1 0 0 0 0 | D4000 | D6000
+ 1 0 0 0 1 | D4800 | D6000
+ 1 0 0 1 0 | D5000 | D6000
+ 1 0 0 1 1 | D5800 | D6000
+ | |
+ 1 0 1 0 0 | D8000 | DA000
+ 1 0 1 0 1 | D8800 | DA000
+ 1 0 1 1 0 | D9000 | DA000
+ 1 0 1 1 1 | D9800 | DA000
+ | |
+ 1 1 0 0 0 | DC000 | DE000
+ 1 1 0 0 1 | DC800 | DE000
+ 1 1 0 1 0 | DD000 | DE000
+ 1 1 0 1 1 | DD800 | DE000
+ | |
+ 1 1 1 0 0 | E0000 | E2000
+ 1 1 1 0 1 | E0800 | E2000
+ 1 1 1 1 0 | E1000 | E2000
+ 1 1 1 1 1 | E1800 | E2000
+
+
+Setting Interrupt Request Lines (IRQ)
+-------------------------------------
+
+??????????????????????????????????????
+
+
+Setting the Timeouts
+--------------------
+
+??????????????????????????????????????
+
+
+*****************************************************************************
+
+** No Name **
+8-bit cards ("Made in Taiwan R.O.C.")
+-----------
+ - from Vojtech Pavlik <vojtech@suse.cz>
+
+I have named this ARCnet card "NONAME", since I got only the card with
+no manual at all and the only text identifying the manufacturer is
+"MADE IN TAIWAN R.O.C" printed on the card.
+
+ ____________________________________________________________
+ | 1 2 3 4 5 6 7 8 |
+ | |o|o| JP1 o|o|o|o|o|o|o|o| ON |
+ | + o|o|o|o|o|o|o|o| ___|
+ | _____________ o|o|o|o|o|o|o|o| OFF _____ | | ID7
+ | | | SW1 | | | | ID6
+ | > RAM (2k) | ____________________ | H | | S | ID5
+ | |_____________| | || y | | W | ID4
+ | | || b | | 2 | ID3
+ | | || r | | | ID2
+ | | || i | | | ID1
+ | | 90C65 || d | |___| ID0
+ | SW3 | || | |
+ | |o|o|o|o|o|o|o|o| ON | || I | |
+ | |o|o|o|o|o|o|o|o| | || C | |
+ | |o|o|o|o|o|o|o|o| OFF |____________________|| | _____|
+ | 1 2 3 4 5 6 7 8 | | | |___
+ | ______________ | | | BNC |___|
+ | | | |_____| |_____|
+ | > EPROM SOCKET | |
+ | |______________| |
+ | ______________|
+ | |
+ |_____________________________________________|
+
+Legend:
+
+90C65 ARCNET Chip
+SW1 1-5: Base Memory Address Select
+ 6-8: Base I/O Address Select
+SW2 1-8: Node ID Select (ID0-ID7)
+SW3 1-5: IRQ Select
+ 6-7: Extra Timeout
+ 8 : ROM Enable
+JP1 Led connector
+BNC Coax connector
+
+Although the jumpers SW1 and SW3 are marked SW, not JP, they are jumpers, not
+switches.
+
+Setting the jumpers to ON means connecting the upper two pins, off the bottom
+two - or - in case of IRQ setting, connecting none of them at all.
+
+Setting the Node ID
+-------------------
+
+The eight switches in SW2 are used to set the node ID. Each node attached
+to the network must have an unique node ID which must not be 0.
+Switch 1 (ID0) serves as the least significant bit (LSB).
+
+Setting one of the switches to Off means "1", On means "0".
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+
+ Switch | Label | Value
+ -------|-------|-------
+ 1 | ID0 | 1
+ 2 | ID1 | 2
+ 3 | ID2 | 4
+ 4 | ID3 | 8
+ 5 | ID4 | 16
+ 6 | ID5 | 32
+ 7 | ID6 | 64
+ 8 | ID7 | 128
+
+Some Examples:
+
+ Switch | Hex | Decimal
+ 8 7 6 5 4 3 2 1 | Node ID | Node ID
+ ----------------|---------|---------
+ 0 0 0 0 0 0 0 0 | not allowed
+ 0 0 0 0 0 0 0 1 | 1 | 1
+ 0 0 0 0 0 0 1 0 | 2 | 2
+ 0 0 0 0 0 0 1 1 | 3 | 3
+ . . . | |
+ 0 1 0 1 0 1 0 1 | 55 | 85
+ . . . | |
+ 1 0 1 0 1 0 1 0 | AA | 170
+ . . . | |
+ 1 1 1 1 1 1 0 1 | FD | 253
+ 1 1 1 1 1 1 1 0 | FE | 254
+ 1 1 1 1 1 1 1 1 | FF | 255
+
+
+Setting the I/O Base Address
+----------------------------
+
+The last three switches in switch block SW1 are used to select one
+of eight possible I/O Base addresses using the following table
+
+
+ Switch | Hex I/O
+ 6 7 8 | Address
+ ------------|--------
+ ON ON ON | 260
+ OFF ON ON | 290
+ ON OFF ON | 2E0 (Manufacturer's default)
+ OFF OFF ON | 2F0
+ ON ON OFF | 300
+ OFF ON OFF | 350
+ ON OFF OFF | 380
+ OFF OFF OFF | 3E0
+
+
+Setting the Base Memory (RAM) buffer Address
+--------------------------------------------
+
+The memory buffer (RAM) requires 2K. The base of this buffer can be
+located in any of eight positions. The address of the Boot Prom is
+memory base + 0x2000.
+Jumpers 3-5 of jumper block SW1 select the Memory Base address.
+
+ Switch | Hex RAM | Hex ROM
+ 1 2 3 4 5 | Address | Address *)
+ --------------------|---------|-----------
+ ON ON ON ON ON | C0000 | C2000
+ ON ON OFF ON ON | C4000 | C6000
+ ON ON ON OFF ON | CC000 | CE000
+ ON ON OFF OFF ON | D0000 | D2000 (Manufacturer's default)
+ ON ON ON ON OFF | D4000 | D6000
+ ON ON OFF ON OFF | D8000 | DA000
+ ON ON ON OFF OFF | DC000 | DE000
+ ON ON OFF OFF OFF | E0000 | E2000
+
+*) To enable the Boot ROM set the jumper 8 of jumper block SW3 to position ON.
+
+The jumpers 1 and 2 probably add 0x0800, 0x1000 and 0x1800 to RAM adders.
+
+Setting the Interrupt Line
+--------------------------
+
+Jumpers 1-5 of the jumper block SW3 control the IRQ level.
+
+ Jumper | IRQ
+ 1 2 3 4 5 |
+ ----------------------------
+ ON OFF OFF OFF OFF | 2
+ OFF ON OFF OFF OFF | 3
+ OFF OFF ON OFF OFF | 4
+ OFF OFF OFF ON OFF | 5
+ OFF OFF OFF OFF ON | 7
+
+
+Setting the Timeout Parameters
+------------------------------
+
+The jumpers 6-7 of the jumper block SW3 are used to determine the timeout
+parameters. These two jumpers are normally left in the OFF position.
+
+
+*****************************************************************************
+
+** No Name **
+(Generic Model 9058)
+--------------------
+ - from Andrew J. Kroll <ag784@freenet.buffalo.edu>
+ - Sorry this sat in my to-do box for so long, Andrew! (yikes - over a
+ year!)
+ _____
+ | <
+ | .---'
+ ________________________________________________________________ | |
+ | | SW2 | | |
+ | ___________ |_____________| | |
+ | | | 1 2 3 4 5 6 ___| |
+ | > 6116 RAM | _________ 8 | | |
+ | |___________| |20MHzXtal| 7 | | |
+ | |_________| __________ 6 | S | |
+ | 74LS373 | |- 5 | W | |
+ | _________ | E |- 4 | | |
+ | >_______| ______________|..... P |- 3 | 3 | |
+ | | | : O |- 2 | | |
+ | | | : X |- 1 |___| |
+ | ________________ | | : Y |- | |
+ | | SW1 | | SL90C65 | : |- | |
+ | |________________| | | : B |- | |
+ | 1 2 3 4 5 6 7 8 | | : O |- | |
+ | |_________o____|..../ A |- _______| |
+ | ____________________ | R |- | |------,
+ | | | | D |- | BNC | # |
+ | > 2764 PROM SOCKET | |__________|- |_______|------'
+ | |____________________| _________ | |
+ | >________| <- 74LS245 | |
+ | | |
+ |___ ______________| |
+ |H H H H H H H H H H H H H H H H H H H H H H H| | |
+ |U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U| | |
+ \|
+Legend:
+
+SL90C65 ARCNET Controller / Transceiver /Logic
+SW1 1-5: IRQ Select
+ 6: ET1
+ 7: ET2
+ 8: ROM ENABLE
+SW2 1-3: Memory Buffer/PROM Address
+ 3-6: I/O Address Map
+SW3 1-8: Node ID Select
+BNC BNC RG62/U Connection
+ *I* have had success using RG59B/U with *NO* terminators!
+ What gives?!
+
+SW1: Timeouts, Interrupt and ROM
+---------------------------------
+
+To select a hardware interrupt level set one (only one!) of the dip switches
+up (on) SW1...(switches 1-5)
+IRQ3, IRQ4, IRQ5, IRQ7, IRQ2. The Manufacturer's default is IRQ2.
+
+The switches on SW1 labeled EXT1 (switch 6) and EXT2 (switch 7)
+are used to determine the timeout parameters. These two dip switches
+are normally left off (down).
+
+ To enable the 8K Boot PROM position SW1 switch 8 on (UP) labeled ROM.
+ The default is jumper ROM not installed.
+
+
+Setting the I/O Base Address
+----------------------------
+
+The last three switches in switch group SW2 are used to select one
+of eight possible I/O Base addresses using the following table
+
+
+ Switch | Hex I/O
+ 4 5 6 | Address
+ -------|--------
+ 0 0 0 | 260
+ 0 0 1 | 290
+ 0 1 0 | 2E0 (Manufacturer's default)
+ 0 1 1 | 2F0
+ 1 0 0 | 300
+ 1 0 1 | 350
+ 1 1 0 | 380
+ 1 1 1 | 3E0
+
+
+Setting the Base Memory Address (RAM & ROM)
+-------------------------------------------
+
+The memory buffer requires 2K of a 16K block of RAM. The base of this
+16K block can be located in any of eight positions.
+Switches 1-3 of switch group SW2 select the Base of the 16K block.
+(0 = DOWN, 1 = UP)
+I could, however, only verify two settings...
+
+ Switch| Hex RAM | Hex ROM
+ 1 2 3 | Address | Address
+ ------|---------|-----------
+ 0 0 0 | E0000 | E2000
+ 0 0 1 | D0000 | D2000 (Manufacturer's default)
+ 0 1 0 | ????? | ?????
+ 0 1 1 | ????? | ?????
+ 1 0 0 | ????? | ?????
+ 1 0 1 | ????? | ?????
+ 1 1 0 | ????? | ?????
+ 1 1 1 | ????? | ?????
+
+
+Setting the Node ID
+-------------------
+
+The eight switches in group SW3 are used to set the node ID.
+Each node attached to the network must have an unique node ID which
+must be different from 0.
+Switch 1 serves as the least significant bit (LSB).
+switches in the DOWN position are OFF (0) and in the UP position are ON (1)
+
+The node ID is the sum of the values of all switches set to "1"
+These values are:
+ Switch | Value
+ -------|-------
+ 1 | 1
+ 2 | 2
+ 3 | 4
+ 4 | 8
+ 5 | 16
+ 6 | 32
+ 7 | 64
+ 8 | 128
+
+Some Examples:
+
+ Switch# | Hex | Decimal
+8 7 6 5 4 3 2 1 | Node ID | Node ID
+----------------|---------|---------
+0 0 0 0 0 0 0 0 | not allowed <-.
+0 0 0 0 0 0 0 1 | 1 | 1 |
+0 0 0 0 0 0 1 0 | 2 | 2 |
+0 0 0 0 0 0 1 1 | 3 | 3 |
+ . . . | | |
+0 1 0 1 0 1 0 1 | 55 | 85 |
+ . . . | | + Don't use 0 or 255!
+1 0 1 0 1 0 1 0 | AA | 170 |
+ . . . | | |
+1 1 1 1 1 1 0 1 | FD | 253 |
+1 1 1 1 1 1 1 0 | FE | 254 |
+1 1 1 1 1 1 1 1 | FF | 255 <-'
+
+
+*****************************************************************************
+
+** Tiara **
+(model unknown)
+-------------------------
+ - from Christoph Lameter <christoph@lameter.com>
+
+
+Here is information about my card as far as I could figure it out:
+----------------------------------------------- tiara
+Tiara LanCard of Tiara Computer Systems.
+
++----------------------------------------------+
+! ! Transmitter Unit ! !
+! +------------------+ -------
+! MEM Coax Connector
+! ROM 7654321 <- I/O -------
+! : : +--------+ !
+! : : ! 90C66LJ! +++
+! : : ! ! !D Switch to set
+! : : ! ! !I the Nodenumber
+! : : +--------+ !P
+! !++
+! 234567 <- IRQ !
++------------!!!!!!!!!!!!!!!!!!!!!!!!--------+
+ !!!!!!!!!!!!!!!!!!!!!!!!
+
+0 = Jumper Installed
+1 = Open
+
+Top Jumper line Bit 7 = ROM Enable 654=Memory location 321=I/O
+
+Settings for Memory Location (Top Jumper Line)
+456 Address selected
+000 C0000
+001 C4000
+010 CC000
+011 D0000
+100 D4000
+101 D8000
+110 DC000
+111 E0000
+
+Settings for I/O Address (Top Jumper Line)
+123 Port
+000 260
+001 290
+010 2E0
+011 2F0
+100 300
+101 350
+110 380
+111 3E0
+
+Settings for IRQ Selection (Lower Jumper Line)
+234567
+011111 IRQ 2
+101111 IRQ 3
+110111 IRQ 4
+111011 IRQ 5
+111110 IRQ 7
+
+*****************************************************************************
+
+
+Other Cards
+-----------
+
+I have no information on other models of ARCnet cards at the moment. Please
+send any and all info to:
+ apenwarr@worldvisions.ca
+
+Thanks.
diff --git a/Documentation/networking/arcnet.txt b/Documentation/networking/arcnet.txt
new file mode 100644
index 0000000..7960125
--- /dev/null
+++ b/Documentation/networking/arcnet.txt
@@ -0,0 +1,555 @@
+----------------------------------------------------------------------------
+NOTE: See also arcnet-hardware.txt in this directory for jumper-setting
+and cabling information if you're like many of us and didn't happen to get a
+manual with your ARCnet card.
+----------------------------------------------------------------------------
+
+Since no one seems to listen to me otherwise, perhaps a poem will get your
+attention:
+ This driver's getting fat and beefy,
+ But my cat is still named Fifi.
+
+Hmm, I think I'm allowed to call that a poem, even though it's only two
+lines. Hey, I'm in Computer Science, not English. Give me a break.
+
+The point is: I REALLY REALLY REALLY REALLY REALLY want to hear from you if
+you test this and get it working. Or if you don't. Or anything.
+
+ARCnet 0.32 ALPHA first made it into the Linux kernel 1.1.80 - this was
+nice, but after that even FEWER people started writing to me because they
+didn't even have to install the patch. <sigh>
+
+Come on, be a sport! Send me a success report!
+
+(hey, that was even better than my original poem... this is getting bad!)
+
+
+--------
+WARNING:
+--------
+
+If you don't e-mail me about your success/failure soon, I may be forced to
+start SINGING. And we don't want that, do we?
+
+(You know, it might be argued that I'm pushing this point a little too much.
+If you think so, why not flame me in a quick little e-mail? Please also
+include the type of card(s) you're using, software, size of network, and
+whether it's working or not.)
+
+My e-mail address is: apenwarr@worldvisions.ca
+
+
+---------------------------------------------------------------------------
+
+
+These are the ARCnet drivers for Linux.
+
+
+This new release (2.91) has been put together by David Woodhouse
+<dwmw2@infradead.org>, in an attempt to tidy up the driver after adding support
+for yet another chipset. Now the generic support has been separated from the
+individual chipset drivers, and the source files aren't quite so packed with
+#ifdefs! I've changed this file a bit, but kept it in the first person from
+Avery, because I didn't want to completely rewrite it.
+
+The previous release resulted from many months of on-and-off effort from me
+(Avery Pennarun), many bug reports/fixes and suggestions from others, and in
+particular a lot of input and coding from Tomasz Motylewski. Starting with
+ARCnet 2.10 ALPHA, Tomasz's all-new-and-improved RFC1051 support has been
+included and seems to be working fine!
+
+
+Where do I discuss these drivers?
+---------------------------------
+
+Tomasz has been so kind as to set up a new and improved mailing list.
+Subscribe by sending a message with the BODY "subscribe linux-arcnet YOUR
+REAL NAME" to listserv@tichy.ch.uj.edu.pl. Then, to submit messages to the
+list, mail to linux-arcnet@tichy.ch.uj.edu.pl.
+
+There are archives of the mailing list at:
+ http://tichy.ch.uj.edu.pl/lists/linux-arcnet
+
+The people on linux-net@vger.kernel.org have also been known to be very
+helpful, especially when we're talking about ALPHA Linux kernels that may or
+may not work right in the first place.
+
+
+Other Drivers and Info
+----------------------
+
+You can try my ARCNET page on the World Wide Web at:
+ http://www.worldvisions.ca/~apenwarr/arcnet/
+
+Also, SMC (one of the companies that makes ARCnet cards) has a WWW site you
+might be interested in, which includes several drivers for various cards
+including ARCnet. Try:
+ http://www.smc.com/
+
+Performance Technologies makes various network software that supports
+ARCnet:
+ http://www.perftech.com/ or ftp to ftp.perftech.com.
+
+Novell makes a networking stack for DOS which includes ARCnet drivers. Try
+FTPing to ftp.novell.com.
+
+You can get the Crynwr packet driver collection (including arcether.com, the
+one you'll want to use with ARCnet cards) from
+oak.oakland.edu:/simtel/msdos/pktdrvr. It won't work perfectly on a 386+
+without patches, though, and also doesn't like several cards. Fixed
+versions are available on my WWW page, or via e-mail if you don't have WWW
+access.
+
+
+Installing the Driver
+---------------------
+
+All you will need to do in order to install the driver is:
+ make config
+ (be sure to choose ARCnet in the network devices
+ and at least one chipset driver.)
+ make clean
+ make zImage
+
+If you obtained this ARCnet package as an upgrade to the ARCnet driver in
+your current kernel, you will need to first copy arcnet.c over the one in
+the linux/drivers/net directory.
+
+You will know the driver is installed properly if you get some ARCnet
+messages when you reboot into the new Linux kernel.
+
+There are four chipset options:
+
+ 1. Standard ARCnet COM90xx chipset.
+
+This is the normal ARCnet card, which you've probably got. This is the only
+chipset driver which will autoprobe if not told where the card is.
+It following options on the command line:
+ com90xx=[<io>[,<irq>[,<shmem>]]][,<name>] | <name>
+
+If you load the chipset support as a module, the options are:
+ io=<io> irq=<irq> shmem=<shmem> device=<name>
+
+To disable the autoprobe, just specify "com90xx=" on the kernel command line.
+To specify the name alone, but allow autoprobe, just put "com90xx=<name>"
+
+ 2. ARCnet COM20020 chipset.
+
+This is the new chipset from SMC with support for promiscuous mode (packet
+sniffing), extra diagnostic information, etc. Unfortunately, there is no
+sensible method of autoprobing for these cards. You must specify the I/O
+address on the kernel command line.
+The command line options are:
+ com20020=<io>[,<irq>[,<node_ID>[,backplane[,CKP[,timeout]]]]][,name]
+
+If you load the chipset support as a module, the options are:
+ io=<io> irq=<irq> node=<node_ID> backplane=<backplane> clock=<CKP>
+ timeout=<timeout> device=<name>
+
+The COM20020 chipset allows you to set the node ID in software, overriding the
+default which is still set in DIP switches on the card. If you don't have the
+COM20020 data sheets, and you don't know what the other three options refer
+to, then they won't interest you - forget them.
+
+ 3. ARCnet COM90xx chipset in IO-mapped mode.
+
+This will also work with the normal ARCnet cards, but doesn't use the shared
+memory. It performs less well than the above driver, but is provided in case
+you have a card which doesn't support shared memory, or (strangely) in case
+you have so many ARCnet cards in your machine that you run out of shmem slots.
+If you don't give the IO address on the kernel command line, then the driver
+will not find the card.
+The command line options are:
+ com90io=<io>[,<irq>][,<name>]
+
+If you load the chipset support as a module, the options are:
+ io=<io> irq=<irq> device=<name>
+
+ 4. ARCnet RIM I cards.
+
+These are COM90xx chips which are _completely_ memory mapped. The support for
+these is not tested. If you have one, please mail the author with a success
+report. All options must be specified, except the device name.
+Command line options:
+ arcrimi=<shmem>,<irq>,<node_ID>[,<name>]
+
+If you load the chipset support as a module, the options are:
+ shmem=<shmem> irq=<irq> node=<node_ID> device=<name>
+
+
+Loadable Module Support
+-----------------------
+
+Configure and rebuild Linux. When asked, answer 'm' to "Generic ARCnet
+support" and to support for your ARCnet chipset if you want to use the
+loadable module. You can also say 'y' to "Generic ARCnet support" and 'm'
+to the chipset support if you wish.
+
+ make config
+ make clean
+ make zImage
+ make modules
+
+If you're using a loadable module, you need to use insmod to load it, and
+you can specify various characteristics of your card on the command
+line. (In recent versions of the driver, autoprobing is much more reliable
+and works as a module, so most of this is now unnecessary.)
+
+For example:
+ cd /usr/src/linux/modules
+ insmod arcnet.o
+ insmod com90xx.o
+ insmod com20020.o io=0x2e0 device=eth1
+
+
+Using the Driver
+----------------
+
+If you build your kernel with ARCnet COM90xx support included, it should
+probe for your card automatically when you boot. If you use a different
+chipset driver complied into the kernel, you must give the necessary options
+on the kernel command line, as detailed above.
+
+Go read the NET-2-HOWTO and ETHERNET-HOWTO for Linux; they should be
+available where you picked up this driver. Think of your ARCnet as a
+souped-up (or down, as the case may be) Ethernet card.
+
+By the way, be sure to change all references from "eth0" to "arc0" in the
+HOWTOs. Remember that ARCnet isn't a "true" Ethernet, and the device name
+is DIFFERENT.
+
+
+Multiple Cards in One Computer
+------------------------------
+
+Linux has pretty good support for this now, but since I've been busy, the
+ARCnet driver has somewhat suffered in this respect. COM90xx support, if
+compiled into the kernel, will (try to) autodetect all the installed cards.
+
+If you have other cards, with support compiled into the kernel, then you can
+just repeat the options on the kernel command line, e.g.:
+LILO: linux com20020=0x2e0 com20020=0x380 com90io=0x260
+
+If you have the chipset support built as a loadable module, then you need to
+do something like this:
+ insmod -o arc0 com90xx
+ insmod -o arc1 com20020 io=0x2e0
+ insmod -o arc2 com90xx
+The ARCnet drivers will now sort out their names automatically.
+
+
+How do I get it to work with...?
+--------------------------------
+
+NFS: Should be fine linux->linux, just pretend you're using Ethernet cards.
+ oak.oakland.edu:/simtel/msdos/nfs has some nice DOS clients. There
+ is also a DOS-based NFS server called SOSS. It doesn't multitask
+ quite the way Linux does (actually, it doesn't multitask AT ALL) but
+ you never know what you might need.
+
+ With AmiTCP (and possibly others), you may need to set the following
+ options in your Amiga nfstab: MD 1024 MR 1024 MW 1024
+ (Thanks to Christian Gottschling <ferksy@indigo.tng.oche.de>
+ for this.)
+
+ Probably these refer to maximum NFS data/read/write block sizes. I
+ don't know why the defaults on the Amiga didn't work; write to me if
+ you know more.
+
+DOS: If you're using the freeware arcether.com, you might want to install
+ the driver patch from my web page. It helps with PC/TCP, and also
+ can get arcether to load if it timed out too quickly during
+ initialization. In fact, if you use it on a 386+ you REALLY need
+ the patch, really.
+
+Windows: See DOS :) Trumpet Winsock works fine with either the Novell or
+ Arcether client, assuming you remember to load winpkt of course.
+
+LAN Manager and Windows for Workgroups: These programs use protocols that
+ are incompatible with the Internet standard. They try to pretend
+ the cards are Ethernet, and confuse everyone else on the network.
+
+ However, v2.00 and higher of the Linux ARCnet driver supports this
+ protocol via the 'arc0e' device. See the section on "Multiprotocol
+ Support" for more information.
+
+ Using the freeware Samba server and clients for Linux, you can now
+ interface quite nicely with TCP/IP-based WfWg or Lan Manager
+ networks.
+
+Windows 95: Tools are included with Win95 that let you use either the LANMAN
+ style network drivers (NDIS) or Novell drivers (ODI) to handle your
+ ARCnet packets. If you use ODI, you'll need to use the 'arc0'
+ device with Linux. If you use NDIS, then try the 'arc0e' device.
+ See the "Multiprotocol Support" section below if you need arc0e,
+ you're completely insane, and/or you need to build some kind of
+ hybrid network that uses both encapsulation types.
+
+OS/2: I've been told it works under Warp Connect with an ARCnet driver from
+ SMC. You need to use the 'arc0e' interface for this. If you get
+ the SMC driver to work with the TCP/IP stuff included in the
+ "normal" Warp Bonus Pack, let me know.
+
+ ftp.microsoft.com also has a freeware "Lan Manager for OS/2" client
+ which should use the same protocol as WfWg does. I had no luck
+ installing it under Warp, however. Please mail me with any results.
+
+NetBSD/AmiTCP: These use an old version of the Internet standard ARCnet
+ protocol (RFC1051) which is compatible with the Linux driver v2.10
+ ALPHA and above using the arc0s device. (See "Multiprotocol ARCnet"
+ below.) ** Newer versions of NetBSD apparently support RFC1201.
+
+
+Using Multiprotocol ARCnet
+--------------------------
+
+The ARCnet driver v2.10 ALPHA supports three protocols, each on its own
+"virtual network device":
+
+ arc0 - RFC1201 protocol, the official Internet standard which just
+ happens to be 100% compatible with Novell's TRXNET driver.
+ Version 1.00 of the ARCnet driver supported _only_ this
+ protocol. arc0 is the fastest of the three protocols (for
+ whatever reason), and allows larger packets to be used
+ because it supports RFC1201 "packet splitting" operations.
+ Unless you have a specific need to use a different protocol,
+ I strongly suggest that you stick with this one.
+
+ arc0e - "Ethernet-Encapsulation" which sends packets over ARCnet
+ that are actually a lot like Ethernet packets, including the
+ 6-byte hardware addresses. This protocol is compatible with
+ Microsoft's NDIS ARCnet driver, like the one in WfWg and
+ LANMAN. Because the MTU of 493 is actually smaller than the
+ one "required" by TCP/IP (576), there is a chance that some
+ network operations will not function properly. The Linux
+ TCP/IP layer can compensate in most cases, however, by
+ automatically fragmenting the TCP/IP packets to make them
+ fit. arc0e also works slightly more slowly than arc0, for
+ reasons yet to be determined. (Probably it's the smaller
+ MTU that does it.)
+
+ arc0s - The "[s]imple" RFC1051 protocol is the "previous" Internet
+ standard that is completely incompatible with the new
+ standard. Some software today, however, continues to
+ support the old standard (and only the old standard)
+ including NetBSD and AmiTCP. RFC1051 also does not support
+ RFC1201's packet splitting, and the MTU of 507 is still
+ smaller than the Internet "requirement," so it's quite
+ possible that you may run into problems. It's also slower
+ than RFC1201 by about 25%, for the same reason as arc0e.
+
+ The arc0s support was contributed by Tomasz Motylewski
+ and modified somewhat by me. Bugs are probably my fault.
+
+You can choose not to compile arc0e and arc0s into the driver if you want -
+this will save you a bit of memory and avoid confusion when eg. trying to
+use the "NFS-root" stuff in recent Linux kernels.
+
+The arc0e and arc0s devices are created automatically when you first
+ifconfig the arc0 device. To actually use them, though, you need to also
+ifconfig the other virtual devices you need. There are a number of ways you
+can set up your network then:
+
+
+1. Single Protocol.
+
+ This is the simplest way to configure your network: use just one of the
+ two available protocols. As mentioned above, it's a good idea to use
+ only arc0 unless you have a good reason (like some other software, ie.
+ WfWg, that only works with arc0e).
+
+ If you need only arc0, then the following commands should get you going:
+ ifconfig arc0 MY.IP.ADD.RESS
+ route add MY.IP.ADD.RESS arc0
+ route add -net SUB.NET.ADD.RESS arc0
+ [add other local routes here]
+
+ If you need arc0e (and only arc0e), it's a little different:
+ ifconfig arc0 MY.IP.ADD.RESS
+ ifconfig arc0e MY.IP.ADD.RESS
+ route add MY.IP.ADD.RESS arc0e
+ route add -net SUB.NET.ADD.RESS arc0e
+
+ arc0s works much the same way as arc0e.
+
+
+2. More than one protocol on the same wire.
+
+ Now things start getting confusing. To even try it, you may need to be
+ partly crazy. Here's what *I* did. :) Note that I don't include arc0s in
+ my home network; I don't have any NetBSD or AmiTCP computers, so I only
+ use arc0s during limited testing.
+
+ I have three computers on my home network; two Linux boxes (which prefer
+ RFC1201 protocol, for reasons listed above), and one XT that can't run
+ Linux but runs the free Microsoft LANMAN Client instead.
+
+ Worse, one of the Linux computers (freedom) also has a modem and acts as
+ a router to my Internet provider. The other Linux box (insight) also has
+ its own IP address and needs to use freedom as its default gateway. The
+ XT (patience), however, does not have its own Internet IP address and so
+ I assigned it one on a "private subnet" (as defined by RFC1597).
+
+ To start with, take a simple network with just insight and freedom.
+ Insight needs to:
+ - talk to freedom via RFC1201 (arc0) protocol, because I like it
+ more and it's faster.
+ - use freedom as its Internet gateway.
+
+ That's pretty easy to do. Set up insight like this:
+ ifconfig arc0 insight
+ route add insight arc0
+ route add freedom arc0 /* I would use the subnet here (like I said
+ to to in "single protocol" above),
+ but the rest of the subnet
+ unfortunately lies across the PPP
+ link on freedom, which confuses
+ things. */
+ route add default gw freedom
+
+ And freedom gets configured like so:
+ ifconfig arc0 freedom
+ route add freedom arc0
+ route add insight arc0
+ /* and default gateway is configured by pppd */
+
+ Great, now insight talks to freedom directly on arc0, and sends packets
+ to the Internet through freedom. If you didn't know how to do the above,
+ you should probably stop reading this section now because it only gets
+ worse.
+
+ Now, how do I add patience into the network? It will be using LANMAN
+ Client, which means I need the arc0e device. It needs to be able to talk
+ to both insight and freedom, and also use freedom as a gateway to the
+ Internet. (Recall that patience has a "private IP address" which won't
+ work on the Internet; that's okay, I configured Linux IP masquerading on
+ freedom for this subnet).
+
+ So patience (necessarily; I don't have another IP number from my
+ provider) has an IP address on a different subnet than freedom and
+ insight, but needs to use freedom as an Internet gateway. Worse, most
+ DOS networking programs, including LANMAN, have braindead networking
+ schemes that rely completely on the netmask and a 'default gateway' to
+ determine how to route packets. This means that to get to freedom or
+ insight, patience WILL send through its default gateway, regardless of
+ the fact that both freedom and insight (courtesy of the arc0e device)
+ could understand a direct transmission.
+
+ I compensate by giving freedom an extra IP address - aliased 'gatekeeper'
+ - that is on my private subnet, the same subnet that patience is on. I
+ then define gatekeeper to be the default gateway for patience.
+
+ To configure freedom (in addition to the commands above):
+ ifconfig arc0e gatekeeper
+ route add gatekeeper arc0e
+ route add patience arc0e
+
+ This way, freedom will send all packets for patience through arc0e,
+ giving its IP address as gatekeeper (on the private subnet). When it
+ talks to insight or the Internet, it will use its "freedom" Internet IP
+ address.
+
+ You will notice that we haven't configured the arc0e device on insight.
+ This would work, but is not really necessary, and would require me to
+ assign insight another special IP number from my private subnet. Since
+ both insight and patience are using freedom as their default gateway, the
+ two can already talk to each other.
+
+ It's quite fortunate that I set things up like this the first time (cough
+ cough) because it's really handy when I boot insight into DOS. There, it
+ runs the Novell ODI protocol stack, which only works with RFC1201 ARCnet.
+ In this mode it would be impossible for insight to communicate directly
+ with patience, since the Novell stack is incompatible with Microsoft's
+ Ethernet-Encap. Without changing any settings on freedom or patience, I
+ simply set freedom as the default gateway for insight (now in DOS,
+ remember) and all the forwarding happens "automagically" between the two
+ hosts that would normally not be able to communicate at all.
+
+ For those who like diagrams, I have created two "virtual subnets" on the
+ same physical ARCnet wire. You can picture it like this:
+
+
+ [RFC1201 NETWORK] [ETHER-ENCAP NETWORK]
+ (registered Internet subnet) (RFC1597 private subnet)
+
+ (IP Masquerade)
+ /---------------\ * /---------------\
+ | | * | |
+ | +-Freedom-*-Gatekeeper-+ |
+ | | | * | |
+ \-------+-------/ | * \-------+-------/
+ | | |
+ Insight | Patience
+ (Internet)
+
+
+
+It works: what now?
+-------------------
+
+Send mail describing your setup, preferably including driver version, kernel
+version, ARCnet card model, CPU type, number of systems on your network, and
+list of software in use to me at the following address:
+ apenwarr@worldvisions.ca
+
+I do send (sometimes automated) replies to all messages I receive. My email
+can be weird (and also usually gets forwarded all over the place along the
+way to me), so if you don't get a reply within a reasonable time, please
+resend.
+
+
+It doesn't work: what now?
+--------------------------
+
+Do the same as above, but also include the output of the ifconfig and route
+commands, as well as any pertinent log entries (ie. anything that starts
+with "arcnet:" and has shown up since the last reboot) in your mail.
+
+If you want to try fixing it yourself (I strongly recommend that you mail me
+about the problem first, since it might already have been solved) you may
+want to try some of the debug levels available. For heavy testing on
+D_DURING or more, it would be a REALLY good idea to kill your klogd daemon
+first! D_DURING displays 4-5 lines for each packet sent or received. D_TX,
+D_RX, and D_SKB actually DISPLAY each packet as it is sent or received,
+which is obviously quite big.
+
+Starting with v2.40 ALPHA, the autoprobe routines have changed
+significantly. In particular, they won't tell you why the card was not
+found unless you turn on the D_INIT_REASONS debugging flag.
+
+Once the driver is running, you can run the arcdump shell script (available
+from me or in the full ARCnet package, if you have it) as root to list the
+contents of the arcnet buffers at any time. To make any sense at all out of
+this, you should grab the pertinent RFCs. (some are listed near the top of
+arcnet.c). arcdump assumes your card is at 0xD0000. If it isn't, edit the
+script.
+
+Buffers 0 and 1 are used for receiving, and Buffers 2 and 3 are for sending.
+Ping-pong buffers are implemented both ways.
+
+If your debug level includes D_DURING and you did NOT define SLOW_XMIT_COPY,
+the buffers are cleared to a constant value of 0x42 every time the card is
+reset (which should only happen when you do an ifconfig up, or when Linux
+decides that the driver is broken). During a transmit, unused parts of the
+buffer will be cleared to 0x42 as well. This is to make it easier to figure
+out which bytes are being used by a packet.
+
+You can change the debug level without recompiling the kernel by typing:
+ ifconfig arc0 down metric 1xxx
+ /etc/rc.d/rc.inet1
+where "xxx" is the debug level you want. For example, "metric 1015" would put
+you at debug level 15. Debug level 7 is currently the default.
+
+Note that the debug level is (starting with v1.90 ALPHA) a binary
+combination of different debug flags; so debug level 7 is really 1+2+4 or
+D_NORMAL+D_EXTRA+D_INIT. To include D_DURING, you would add 16 to this,
+resulting in debug level 23.
+
+If you don't understand that, you probably don't want to know anyway.
+E-mail me about your problem.
+
+
+I want to send money: what now?
+-------------------------------
+
+Go take a nap or something. You'll feel better in the morning.
diff --git a/Documentation/networking/atm.txt b/Documentation/networking/atm.txt
new file mode 100644
index 0000000..82921ce
--- /dev/null
+++ b/Documentation/networking/atm.txt
@@ -0,0 +1,8 @@
+In order to use anything but the most primitive functions of ATM,
+several user-mode programs are required to assist the kernel. These
+programs and related material can be found via the ATM on Linux Web
+page at http://linux-atm.sourceforge.net/
+
+If you encounter problems with ATM, please report them on the ATM
+on Linux mailing list. Subscription information, archives, etc.,
+can be found on http://linux-atm.sourceforge.net/
diff --git a/Documentation/networking/ax25.txt b/Documentation/networking/ax25.txt
new file mode 100644
index 0000000..8257dbf
--- /dev/null
+++ b/Documentation/networking/ax25.txt
@@ -0,0 +1,10 @@
+To use the amateur radio protocols within Linux you will need to get a
+suitable copy of the AX.25 Utilities. More detailed information about
+AX.25, NET/ROM and ROSE, associated programs and and utilities can be
+found on http://www.linux-ax25.org.
+
+There is an active mailing list for discussing Linux amateur radio matters
+called linux-hams@vger.kernel.org. To subscribe to it, send a message to
+majordomo@vger.kernel.org with the words "subscribe linux-hams" in the body
+of the message, the subject field is ignored. You don't need to be
+subscribed to post but of course that means you might miss an answer.
diff --git a/Documentation/networking/baycom.txt b/Documentation/networking/baycom.txt
new file mode 100644
index 0000000..4e68849
--- /dev/null
+++ b/Documentation/networking/baycom.txt
@@ -0,0 +1,158 @@
+ LINUX DRIVERS FOR BAYCOM MODEMS
+
+ Thomas M. Sailer, HB9JNX/AE4WA, <sailer@ife.ee.ethz.ch>
+
+!!NEW!! (04/98) The drivers for the baycom modems have been split into
+separate drivers as they did not share any code, and the driver
+and device names have changed.
+
+This document describes the Linux Kernel Drivers for simple Baycom style
+amateur radio modems.
+
+The following drivers are available:
+
+baycom_ser_fdx:
+ This driver supports the SER12 modems either full or half duplex.
+ Its baud rate may be changed via the `baud' module parameter,
+ therefore it supports just about every bit bang modem on a
+ serial port. Its devices are called bcsf0 through bcsf3.
+ This is the recommended driver for SER12 type modems,
+ however if you have a broken UART clone that does not have working
+ delta status bits, you may try baycom_ser_hdx.
+
+baycom_ser_hdx:
+ This is an alternative driver for SER12 type modems.
+ It only supports half duplex, and only 1200 baud. Its devices
+ are called bcsh0 through bcsh3. Use this driver only if baycom_ser_fdx
+ does not work with your UART.
+
+baycom_par:
+ This driver supports the par96 and picpar modems.
+ Its devices are called bcp0 through bcp3.
+
+baycom_epp:
+ This driver supports the EPP modem.
+ Its devices are called bce0 through bce3.
+ This driver is work-in-progress.
+
+The following modems are supported:
+
+ser12: This is a very simple 1200 baud AFSK modem. The modem consists only
+ of a modulator/demodulator chip, usually a TI TCM3105. The computer
+ is responsible for regenerating the receiver bit clock, as well as
+ for handling the HDLC protocol. The modem connects to a serial port,
+ hence the name. Since the serial port is not used as an async serial
+ port, the kernel driver for serial ports cannot be used, and this
+ driver only supports standard serial hardware (8250, 16450, 16550)
+
+par96: This is a modem for 9600 baud FSK compatible to the G3RUH standard.
+ The modem does all the filtering and regenerates the receiver clock.
+ Data is transferred from and to the PC via a shift register.
+ The shift register is filled with 16 bits and an interrupt is signalled.
+ The PC then empties the shift register in a burst. This modem connects
+ to the parallel port, hence the name. The modem leaves the
+ implementation of the HDLC protocol and the scrambler polynomial to
+ the PC.
+
+picpar: This is a redesign of the par96 modem by Henning Rech, DF9IC. The modem
+ is protocol compatible to par96, but uses only three low power ICs
+ and can therefore be fed from the parallel port and does not require
+ an additional power supply. Furthermore, it incorporates a carrier
+ detect circuitry.
+
+EPP: This is a high-speed modem adaptor that connects to an enhanced parallel port.
+ Its target audience is users working over a high speed hub (76.8kbit/s).
+
+eppfpga: This is a redesign of the EPP adaptor.
+
+
+
+All of the above modems only support half duplex communications. However,
+the driver supports the KISS (see below) fullduplex command. It then simply
+starts to send as soon as there's a packet to transmit and does not care
+about DCD, i.e. it starts to send even if there's someone else on the channel.
+This command is required by some implementations of the DAMA channel
+access protocol.
+
+
+The Interface of the drivers
+
+Unlike previous drivers, these drivers are no longer character devices,
+but they are now true kernel network interfaces. Installation is therefore
+simple. Once installed, four interfaces named bc{sf,sh,p,e}[0-3] are available.
+sethdlc from the ax25 utilities may be used to set driver states etc.
+Users of userland AX.25 stacks may use the net2kiss utility (also available
+in the ax25 utilities package) to convert packets of a network interface
+to a KISS stream on a pseudo tty. There's also a patch available from
+me for WAMPES which allows attaching a kernel network interface directly.
+
+
+Configuring the driver
+
+Every time a driver is inserted into the kernel, it has to know which
+modems it should access at which ports. This can be done with the setbaycom
+utility. If you are only using one modem, you can also configure the
+driver from the insmod command line (or by means of an option line in
+/etc/modprobe.conf).
+
+Examples:
+ modprobe baycom_ser_fdx mode="ser12*" iobase=0x3f8 irq=4
+ sethdlc -i bcsf0 -p mode "ser12*" io 0x3f8 irq 4
+
+Both lines configure the first port to drive a ser12 modem at the first
+serial port (COM1 under DOS). The * in the mode parameter instructs the driver to use
+the software DCD algorithm (see below).
+
+ insmod baycom_par mode="picpar" iobase=0x378
+ sethdlc -i bcp0 -p mode "picpar" io 0x378
+
+Both lines configure the first port to drive a picpar modem at the
+first parallel port (LPT1 under DOS). (Note: picpar implies
+hardware DCD, par96 implies software DCD).
+
+The channel access parameters can be set with sethdlc -a or kissparms.
+Note that both utilities interpret the values slightly differently.
+
+
+Hardware DCD versus Software DCD
+
+To avoid collisions on the air, the driver must know when the channel is
+busy. This is the task of the DCD circuitry/software. The driver may either
+utilise a software DCD algorithm (options=1) or use a DCD signal from
+the hardware (options=0).
+
+ser12: if software DCD is utilised, the radio's squelch should always be
+ open. It is highly recommended to use the software DCD algorithm,
+ as it is much faster than most hardware squelch circuitry. The
+ disadvantage is a slightly higher load on the system.
+
+par96: the software DCD algorithm for this type of modem is rather poor.
+ The modem simply does not provide enough information to implement
+ a reasonable DCD algorithm in software. Therefore, if your radio
+ feeds the DCD input of the PAR96 modem, the use of the hardware
+ DCD circuitry is recommended.
+
+picpar: the picpar modem features a builtin DCD hardware, which is highly
+ recommended.
+
+
+
+Compatibility with the rest of the Linux kernel
+
+The serial driver and the baycom serial drivers compete
+for the same hardware resources. Of course only one driver can access a given
+interface at a time. The serial driver grabs all interfaces it can find at
+startup time. Therefore the baycom drivers subsequently won't be able to
+access a serial port. You might therefore find it necessary to release
+a port owned by the serial driver with 'setserial /dev/ttyS# uart none', where
+# is the number of the interface. The baycom drivers do not reserve any
+ports at startup, unless one is specified on the 'insmod' command line. Another
+method to solve the problem is to compile all drivers as modules and
+leave it to kmod to load the correct driver depending on the application.
+
+The parallel port drivers (baycom_par, baycom_epp) now use the parport subsystem
+to arbitrate the ports between different client drivers.
+
+vy 73s de
+Tom Sailer, sailer@ife.ee.ethz.ch
+hb9jnx @ hb9w.ampr.org
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
new file mode 100644
index 0000000..688dfe1
--- /dev/null
+++ b/Documentation/networking/bonding.txt
@@ -0,0 +1,2346 @@
+
+ Linux Ethernet Bonding Driver HOWTO
+
+ Latest update: 12 November 2007
+
+Initial release : Thomas Davis <tadavis at lbl.gov>
+Corrections, HA extensions : 2000/10/03-15 :
+ - Willy Tarreau <willy at meta-x.org>
+ - Constantine Gavrilov <const-g at xpert.com>
+ - Chad N. Tindel <ctindel at ieee dot org>
+ - Janice Girouard <girouard at us dot ibm dot com>
+ - Jay Vosburgh <fubar at us dot ibm dot com>
+
+Reorganized and updated Feb 2005 by Jay Vosburgh
+Added Sysfs information: 2006/04/24
+ - Mitch Williams <mitch.a.williams at intel.com>
+
+Introduction
+============
+
+ The Linux bonding driver provides a method for aggregating
+multiple network interfaces into a single logical "bonded" interface.
+The behavior of the bonded interfaces depends upon the mode; generally
+speaking, modes provide either hot standby or load balancing services.
+Additionally, link integrity monitoring may be performed.
+
+ The bonding driver originally came from Donald Becker's
+beowulf patches for kernel 2.0. It has changed quite a bit since, and
+the original tools from extreme-linux and beowulf sites will not work
+with this version of the driver.
+
+ For new versions of the driver, updated userspace tools, and
+who to ask for help, please follow the links at the end of this file.
+
+Table of Contents
+=================
+
+1. Bonding Driver Installation
+
+2. Bonding Driver Options
+
+3. Configuring Bonding Devices
+3.1 Configuration with Sysconfig Support
+3.1.1 Using DHCP with Sysconfig
+3.1.2 Configuring Multiple Bonds with Sysconfig
+3.2 Configuration with Initscripts Support
+3.2.1 Using DHCP with Initscripts
+3.2.2 Configuring Multiple Bonds with Initscripts
+3.3 Configuring Bonding Manually with Ifenslave
+3.3.1 Configuring Multiple Bonds Manually
+3.4 Configuring Bonding Manually via Sysfs
+
+4. Querying Bonding Configuration
+4.1 Bonding Configuration
+4.2 Network Configuration
+
+5. Switch Configuration
+
+6. 802.1q VLAN Support
+
+7. Link Monitoring
+7.1 ARP Monitor Operation
+7.2 Configuring Multiple ARP Targets
+7.3 MII Monitor Operation
+
+8. Potential Trouble Sources
+8.1 Adventures in Routing
+8.2 Ethernet Device Renaming
+8.3 Painfully Slow Or No Failed Link Detection By Miimon
+
+9. SNMP agents
+
+10. Promiscuous mode
+
+11. Configuring Bonding for High Availability
+11.1 High Availability in a Single Switch Topology
+11.2 High Availability in a Multiple Switch Topology
+11.2.1 HA Bonding Mode Selection for Multiple Switch Topology
+11.2.2 HA Link Monitoring for Multiple Switch Topology
+
+12. Configuring Bonding for Maximum Throughput
+12.1 Maximum Throughput in a Single Switch Topology
+12.1.1 MT Bonding Mode Selection for Single Switch Topology
+12.1.2 MT Link Monitoring for Single Switch Topology
+12.2 Maximum Throughput in a Multiple Switch Topology
+12.2.1 MT Bonding Mode Selection for Multiple Switch Topology
+12.2.2 MT Link Monitoring for Multiple Switch Topology
+
+13. Switch Behavior Issues
+13.1 Link Establishment and Failover Delays
+13.2 Duplicated Incoming Packets
+
+14. Hardware Specific Considerations
+14.1 IBM BladeCenter
+
+15. Frequently Asked Questions
+
+16. Resources and Links
+
+
+1. Bonding Driver Installation
+==============================
+
+ Most popular distro kernels ship with the bonding driver
+already available as a module and the ifenslave user level control
+program installed and ready for use. If your distro does not, or you
+have need to compile bonding from source (e.g., configuring and
+installing a mainline kernel from kernel.org), you'll need to perform
+the following steps:
+
+1.1 Configure and build the kernel with bonding
+-----------------------------------------------
+
+ The current version of the bonding driver is available in the
+drivers/net/bonding subdirectory of the most recent kernel source
+(which is available on http://kernel.org). Most users "rolling their
+own" will want to use the most recent kernel from kernel.org.
+
+ Configure kernel with "make menuconfig" (or "make xconfig" or
+"make config"), then select "Bonding driver support" in the "Network
+device support" section. It is recommended that you configure the
+driver as module since it is currently the only way to pass parameters
+to the driver or configure more than one bonding device.
+
+ Build and install the new kernel and modules, then continue
+below to install ifenslave.
+
+1.2 Install ifenslave Control Utility
+-------------------------------------
+
+ The ifenslave user level control program is included in the
+kernel source tree, in the file Documentation/networking/ifenslave.c.
+It is generally recommended that you use the ifenslave that
+corresponds to the kernel that you are using (either from the same
+source tree or supplied with the distro), however, ifenslave
+executables from older kernels should function (but features newer
+than the ifenslave release are not supported). Running an ifenslave
+that is newer than the kernel is not supported, and may or may not
+work.
+
+ To install ifenslave, do the following:
+
+# gcc -Wall -O -I/usr/src/linux/include ifenslave.c -o ifenslave
+# cp ifenslave /sbin/ifenslave
+
+ If your kernel source is not in "/usr/src/linux," then replace
+"/usr/src/linux/include" in the above with the location of your kernel
+source include directory.
+
+ You may wish to back up any existing /sbin/ifenslave, or, for
+testing or informal use, tag the ifenslave to the kernel version
+(e.g., name the ifenslave executable /sbin/ifenslave-2.6.10).
+
+IMPORTANT NOTE:
+
+ If you omit the "-I" or specify an incorrect directory, you
+may end up with an ifenslave that is incompatible with the kernel
+you're trying to build it for. Some distros (e.g., Red Hat from 7.1
+onwards) do not have /usr/include/linux symbolically linked to the
+default kernel source include directory.
+
+SECOND IMPORTANT NOTE:
+ If you plan to configure bonding using sysfs, you do not need
+to use ifenslave.
+
+2. Bonding Driver Options
+=========================
+
+ Options for the bonding driver are supplied as parameters to the
+bonding module at load time, or are specified via sysfs.
+
+ Module options may be given as command line arguments to the
+insmod or modprobe command, but are usually specified in either the
+/etc/modules.conf or /etc/modprobe.conf configuration file, or in a
+distro-specific configuration file (some of which are detailed in the next
+section).
+
+ Details on bonding support for sysfs is provided in the
+"Configuring Bonding Manually via Sysfs" section, below.
+
+ The available bonding driver parameters are listed below. If a
+parameter is not specified the default value is used. When initially
+configuring a bond, it is recommended "tail -f /var/log/messages" be
+run in a separate window to watch for bonding driver error messages.
+
+ It is critical that either the miimon or arp_interval and
+arp_ip_target parameters be specified, otherwise serious network
+degradation will occur during link failures. Very few devices do not
+support at least miimon, so there is really no reason not to use it.
+
+ Options with textual values will accept either the text name
+or, for backwards compatibility, the option value. E.g.,
+"mode=802.3ad" and "mode=4" set the same mode.
+
+ The parameters are as follows:
+
+arp_interval
+
+ Specifies the ARP link monitoring frequency in milliseconds.
+
+ The ARP monitor works by periodically checking the slave
+ devices to determine whether they have sent or received
+ traffic recently (the precise criteria depends upon the
+ bonding mode, and the state of the slave). Regular traffic is
+ generated via ARP probes issued for the addresses specified by
+ the arp_ip_target option.
+
+ This behavior can be modified by the arp_validate option,
+ below.
+
+ If ARP monitoring is used in an etherchannel compatible mode
+ (modes 0 and 2), the switch should be configured in a mode
+ that evenly distributes packets across all links. If the
+ switch is configured to distribute the packets in an XOR
+ fashion, all replies from the ARP targets will be received on
+ the same link which could cause the other team members to
+ fail. ARP monitoring should not be used in conjunction with
+ miimon. A value of 0 disables ARP monitoring. The default
+ value is 0.
+
+arp_ip_target
+
+ Specifies the IP addresses to use as ARP monitoring peers when
+ arp_interval is > 0. These are the targets of the ARP request
+ sent to determine the health of the link to the targets.
+ Specify these values in ddd.ddd.ddd.ddd format. Multiple IP
+ addresses must be separated by a comma. At least one IP
+ address must be given for ARP monitoring to function. The
+ maximum number of targets that can be specified is 16. The
+ default value is no IP addresses.
+
+arp_validate
+
+ Specifies whether or not ARP probes and replies should be
+ validated in the active-backup mode. This causes the ARP
+ monitor to examine the incoming ARP requests and replies, and
+ only consider a slave to be up if it is receiving the
+ appropriate ARP traffic.
+
+ Possible values are:
+
+ none or 0
+
+ No validation is performed. This is the default.
+
+ active or 1
+
+ Validation is performed only for the active slave.
+
+ backup or 2
+
+ Validation is performed only for backup slaves.
+
+ all or 3
+
+ Validation is performed for all slaves.
+
+ For the active slave, the validation checks ARP replies to
+ confirm that they were generated by an arp_ip_target. Since
+ backup slaves do not typically receive these replies, the
+ validation performed for backup slaves is on the ARP request
+ sent out via the active slave. It is possible that some
+ switch or network configurations may result in situations
+ wherein the backup slaves do not receive the ARP requests; in
+ such a situation, validation of backup slaves must be
+ disabled.
+
+ This option is useful in network configurations in which
+ multiple bonding hosts are concurrently issuing ARPs to one or
+ more targets beyond a common switch. Should the link between
+ the switch and target fail (but not the switch itself), the
+ probe traffic generated by the multiple bonding instances will
+ fool the standard ARP monitor into considering the links as
+ still up. Use of the arp_validate option can resolve this, as
+ the ARP monitor will only consider ARP requests and replies
+ associated with its own instance of bonding.
+
+ This option was added in bonding version 3.1.0.
+
+downdelay
+
+ Specifies the time, in milliseconds, to wait before disabling
+ a slave after a link failure has been detected. This option
+ is only valid for the miimon link monitor. The downdelay
+ value should be a multiple of the miimon value; if not, it
+ will be rounded down to the nearest multiple. The default
+ value is 0.
+
+fail_over_mac
+
+ Specifies whether active-backup mode should set all slaves to
+ the same MAC address at enslavement (the traditional
+ behavior), or, when enabled, perform special handling of the
+ bond's MAC address in accordance with the selected policy.
+
+ Possible values are:
+
+ none or 0
+
+ This setting disables fail_over_mac, and causes
+ bonding to set all slaves of an active-backup bond to
+ the same MAC address at enslavement time. This is the
+ default.
+
+ active or 1
+
+ The "active" fail_over_mac policy indicates that the
+ MAC address of the bond should always be the MAC
+ address of the currently active slave. The MAC
+ address of the slaves is not changed; instead, the MAC
+ address of the bond changes during a failover.
+
+ This policy is useful for devices that cannot ever
+ alter their MAC address, or for devices that refuse
+ incoming broadcasts with their own source MAC (which
+ interferes with the ARP monitor).
+
+ The down side of this policy is that every device on
+ the network must be updated via gratuitous ARP,
+ vs. just updating a switch or set of switches (which
+ often takes place for any traffic, not just ARP
+ traffic, if the switch snoops incoming traffic to
+ update its tables) for the traditional method. If the
+ gratuitous ARP is lost, communication may be
+ disrupted.
+
+ When this policy is used in conjuction with the mii
+ monitor, devices which assert link up prior to being
+ able to actually transmit and receive are particularly
+ susecptible to loss of the gratuitous ARP, and an
+ appropriate updelay setting may be required.
+
+ follow or 2
+
+ The "follow" fail_over_mac policy causes the MAC
+ address of the bond to be selected normally (normally
+ the MAC address of the first slave added to the bond).
+ However, the second and subsequent slaves are not set
+ to this MAC address while they are in a backup role; a
+ slave is programmed with the bond's MAC address at
+ failover time (and the formerly active slave receives
+ the newly active slave's MAC address).
+
+ This policy is useful for multiport devices that
+ either become confused or incur a performance penalty
+ when multiple ports are programmed with the same MAC
+ address.
+
+
+ The default policy is none, unless the first slave cannot
+ change its MAC address, in which case the active policy is
+ selected by default.
+
+ This option may be modified via sysfs only when no slaves are
+ present in the bond.
+
+ This option was added in bonding version 3.2.0. The "follow"
+ policy was added in bonding version 3.3.0.
+
+lacp_rate
+
+ Option specifying the rate in which we'll ask our link partner
+ to transmit LACPDU packets in 802.3ad mode. Possible values
+ are:
+
+ slow or 0
+ Request partner to transmit LACPDUs every 30 seconds
+
+ fast or 1
+ Request partner to transmit LACPDUs every 1 second
+
+ The default is slow.
+
+max_bonds
+
+ Specifies the number of bonding devices to create for this
+ instance of the bonding driver. E.g., if max_bonds is 3, and
+ the bonding driver is not already loaded, then bond0, bond1
+ and bond2 will be created. The default value is 1. Specifying
+ a value of 0 will load bonding, but will not create any devices.
+
+miimon
+
+ Specifies the MII link monitoring frequency in milliseconds.
+ This determines how often the link state of each slave is
+ inspected for link failures. A value of zero disables MII
+ link monitoring. A value of 100 is a good starting point.
+ The use_carrier option, below, affects how the link state is
+ determined. See the High Availability section for additional
+ information. The default value is 0.
+
+mode
+
+ Specifies one of the bonding policies. The default is
+ balance-rr (round robin). Possible values are:
+
+ balance-rr or 0
+
+ Round-robin policy: Transmit packets in sequential
+ order from the first available slave through the
+ last. This mode provides load balancing and fault
+ tolerance.
+
+ active-backup or 1
+
+ Active-backup policy: Only one slave in the bond is
+ active. A different slave becomes active if, and only
+ if, the active slave fails. The bond's MAC address is
+ externally visible on only one port (network adapter)
+ to avoid confusing the switch.
+
+ In bonding version 2.6.2 or later, when a failover
+ occurs in active-backup mode, bonding will issue one
+ or more gratuitous ARPs on the newly active slave.
+ One gratuitous ARP is issued for the bonding master
+ interface and each VLAN interfaces configured above
+ it, provided that the interface has at least one IP
+ address configured. Gratuitous ARPs issued for VLAN
+ interfaces are tagged with the appropriate VLAN id.
+
+ This mode provides fault tolerance. The primary
+ option, documented below, affects the behavior of this
+ mode.
+
+ balance-xor or 2
+
+ XOR policy: Transmit based on the selected transmit
+ hash policy. The default policy is a simple [(source
+ MAC address XOR'd with destination MAC address) modulo
+ slave count]. Alternate transmit policies may be
+ selected via the xmit_hash_policy option, described
+ below.
+
+ This mode provides load balancing and fault tolerance.
+
+ broadcast or 3
+
+ Broadcast policy: transmits everything on all slave
+ interfaces. This mode provides fault tolerance.
+
+ 802.3ad or 4
+
+ IEEE 802.3ad Dynamic link aggregation. Creates
+ aggregation groups that share the same speed and
+ duplex settings. Utilizes all slaves in the active
+ aggregator according to the 802.3ad specification.
+
+ Slave selection for outgoing traffic is done according
+ to the transmit hash policy, which may be changed from
+ the default simple XOR policy via the xmit_hash_policy
+ option, documented below. Note that not all transmit
+ policies may be 802.3ad compliant, particularly in
+ regards to the packet mis-ordering requirements of
+ section 43.2.4 of the 802.3ad standard. Differing
+ peer implementations will have varying tolerances for
+ noncompliance.
+
+ Prerequisites:
+
+ 1. Ethtool support in the base drivers for retrieving
+ the speed and duplex of each slave.
+
+ 2. A switch that supports IEEE 802.3ad Dynamic link
+ aggregation.
+
+ Most switches will require some type of configuration
+ to enable 802.3ad mode.
+
+ balance-tlb or 5
+
+ Adaptive transmit load balancing: channel bonding that
+ does not require any special switch support. The
+ outgoing traffic is distributed according to the
+ current load (computed relative to the speed) on each
+ slave. Incoming traffic is received by the current
+ slave. If the receiving slave fails, another slave
+ takes over the MAC address of the failed receiving
+ slave.
+
+ Prerequisite:
+
+ Ethtool support in the base drivers for retrieving the
+ speed of each slave.
+
+ balance-alb or 6
+
+ Adaptive load balancing: includes balance-tlb plus
+ receive load balancing (rlb) for IPV4 traffic, and
+ does not require any special switch support. The
+ receive load balancing is achieved by ARP negotiation.
+ The bonding driver intercepts the ARP Replies sent by
+ the local system on their way out and overwrites the
+ source hardware address with the unique hardware
+ address of one of the slaves in the bond such that
+ different peers use different hardware addresses for
+ the server.
+
+ Receive traffic from connections created by the server
+ is also balanced. When the local system sends an ARP
+ Request the bonding driver copies and saves the peer's
+ IP information from the ARP packet. When the ARP
+ Reply arrives from the peer, its hardware address is
+ retrieved and the bonding driver initiates an ARP
+ reply to this peer assigning it to one of the slaves
+ in the bond. A problematic outcome of using ARP
+ negotiation for balancing is that each time that an
+ ARP request is broadcast it uses the hardware address
+ of the bond. Hence, peers learn the hardware address
+ of the bond and the balancing of receive traffic
+ collapses to the current slave. This is handled by
+ sending updates (ARP Replies) to all the peers with
+ their individually assigned hardware address such that
+ the traffic is redistributed. Receive traffic is also
+ redistributed when a new slave is added to the bond
+ and when an inactive slave is re-activated. The
+ receive load is distributed sequentially (round robin)
+ among the group of highest speed slaves in the bond.
+
+ When a link is reconnected or a new slave joins the
+ bond the receive traffic is redistributed among all
+ active slaves in the bond by initiating ARP Replies
+ with the selected MAC address to each of the
+ clients. The updelay parameter (detailed below) must
+ be set to a value equal or greater than the switch's
+ forwarding delay so that the ARP Replies sent to the
+ peers will not be blocked by the switch.
+
+ Prerequisites:
+
+ 1. Ethtool support in the base drivers for retrieving
+ the speed of each slave.
+
+ 2. Base driver support for setting the hardware
+ address of a device while it is open. This is
+ required so that there will always be one slave in the
+ team using the bond hardware address (the
+ curr_active_slave) while having a unique hardware
+ address for each slave in the bond. If the
+ curr_active_slave fails its hardware address is
+ swapped with the new curr_active_slave that was
+ chosen.
+
+num_grat_arp
+
+ Specifies the number of gratuitous ARPs to be issued after a
+ failover event. One gratuitous ARP is issued immediately after
+ the failover, subsequent ARPs are sent at a rate of one per link
+ monitor interval (arp_interval or miimon, whichever is active).
+
+ The valid range is 0 - 255; the default value is 1. This option
+ affects only the active-backup mode. This option was added for
+ bonding version 3.3.0.
+
+primary
+
+ A string (eth0, eth2, etc) specifying which slave is the
+ primary device. The specified device will always be the
+ active slave while it is available. Only when the primary is
+ off-line will alternate devices be used. This is useful when
+ one slave is preferred over another, e.g., when one slave has
+ higher throughput than another.
+
+ The primary option is only valid for active-backup mode.
+
+updelay
+
+ Specifies the time, in milliseconds, to wait before enabling a
+ slave after a link recovery has been detected. This option is
+ only valid for the miimon link monitor. The updelay value
+ should be a multiple of the miimon value; if not, it will be
+ rounded down to the nearest multiple. The default value is 0.
+
+use_carrier
+
+ Specifies whether or not miimon should use MII or ETHTOOL
+ ioctls vs. netif_carrier_ok() to determine the link
+ status. The MII or ETHTOOL ioctls are less efficient and
+ utilize a deprecated calling sequence within the kernel. The
+ netif_carrier_ok() relies on the device driver to maintain its
+ state with netif_carrier_on/off; at this writing, most, but
+ not all, device drivers support this facility.
+
+ If bonding insists that the link is up when it should not be,
+ it may be that your network device driver does not support
+ netif_carrier_on/off. The default state for netif_carrier is
+ "carrier on," so if a driver does not support netif_carrier,
+ it will appear as if the link is always up. In this case,
+ setting use_carrier to 0 will cause bonding to revert to the
+ MII / ETHTOOL ioctl method to determine the link state.
+
+ A value of 1 enables the use of netif_carrier_ok(), a value of
+ 0 will use the deprecated MII / ETHTOOL ioctls. The default
+ value is 1.
+
+xmit_hash_policy
+
+ Selects the transmit hash policy to use for slave selection in
+ balance-xor and 802.3ad modes. Possible values are:
+
+ layer2
+
+ Uses XOR of hardware MAC addresses to generate the
+ hash. The formula is
+
+ (source MAC XOR destination MAC) modulo slave count
+
+ This algorithm will place all traffic to a particular
+ network peer on the same slave.
+
+ This algorithm is 802.3ad compliant.
+
+ layer2+3
+
+ This policy uses a combination of layer2 and layer3
+ protocol information to generate the hash.
+
+ Uses XOR of hardware MAC addresses and IP addresses to
+ generate the hash. The formula is
+
+ (((source IP XOR dest IP) AND 0xffff) XOR
+ ( source MAC XOR destination MAC ))
+ modulo slave count
+
+ This algorithm will place all traffic to a particular
+ network peer on the same slave. For non-IP traffic,
+ the formula is the same as for the layer2 transmit
+ hash policy.
+
+ This policy is intended to provide a more balanced
+ distribution of traffic than layer2 alone, especially
+ in environments where a layer3 gateway device is
+ required to reach most destinations.
+
+ This algorithm is 802.3ad compliant.
+
+ layer3+4
+
+ This policy uses upper layer protocol information,
+ when available, to generate the hash. This allows for
+ traffic to a particular network peer to span multiple
+ slaves, although a single connection will not span
+ multiple slaves.
+
+ The formula for unfragmented TCP and UDP packets is
+
+ ((source port XOR dest port) XOR
+ ((source IP XOR dest IP) AND 0xffff)
+ modulo slave count
+
+ For fragmented TCP or UDP packets and all other IP
+ protocol traffic, the source and destination port
+ information is omitted. For non-IP traffic, the
+ formula is the same as for the layer2 transmit hash
+ policy.
+
+ This policy is intended to mimic the behavior of
+ certain switches, notably Cisco switches with PFC2 as
+ well as some Foundry and IBM products.
+
+ This algorithm is not fully 802.3ad compliant. A
+ single TCP or UDP conversation containing both
+ fragmented and unfragmented packets will see packets
+ striped across two interfaces. This may result in out
+ of order delivery. Most traffic types will not meet
+ this criteria, as TCP rarely fragments traffic, and
+ most UDP traffic is not involved in extended
+ conversations. Other implementations of 802.3ad may
+ or may not tolerate this noncompliance.
+
+ The default value is layer2. This option was added in bonding
+ version 2.6.3. In earlier versions of bonding, this parameter
+ does not exist, and the layer2 policy is the only policy. The
+ layer2+3 value was added for bonding version 3.2.2.
+
+
+3. Configuring Bonding Devices
+==============================
+
+ You can configure bonding using either your distro's network
+initialization scripts, or manually using either ifenslave or the
+sysfs interface. Distros generally use one of two packages for the
+network initialization scripts: initscripts or sysconfig. Recent
+versions of these packages have support for bonding, while older
+versions do not.
+
+ We will first describe the options for configuring bonding for
+distros using versions of initscripts and sysconfig with full or
+partial support for bonding, then provide information on enabling
+bonding without support from the network initialization scripts (i.e.,
+older versions of initscripts or sysconfig).
+
+ If you're unsure whether your distro uses sysconfig or
+initscripts, or don't know if it's new enough, have no fear.
+Determining this is fairly straightforward.
+
+ First, issue the command:
+
+$ rpm -qf /sbin/ifup
+
+ It will respond with a line of text starting with either
+"initscripts" or "sysconfig," followed by some numbers. This is the
+package that provides your network initialization scripts.
+
+ Next, to determine if your installation supports bonding,
+issue the command:
+
+$ grep ifenslave /sbin/ifup
+
+ If this returns any matches, then your initscripts or
+sysconfig has support for bonding.
+
+3.1 Configuration with Sysconfig Support
+----------------------------------------
+
+ This section applies to distros using a version of sysconfig
+with bonding support, for example, SuSE Linux Enterprise Server 9.
+
+ SuSE SLES 9's networking configuration system does support
+bonding, however, at this writing, the YaST system configuration
+front end does not provide any means to work with bonding devices.
+Bonding devices can be managed by hand, however, as follows.
+
+ First, if they have not already been configured, configure the
+slave devices. On SLES 9, this is most easily done by running the
+yast2 sysconfig configuration utility. The goal is for to create an
+ifcfg-id file for each slave device. The simplest way to accomplish
+this is to configure the devices for DHCP (this is only to get the
+file ifcfg-id file created; see below for some issues with DHCP). The
+name of the configuration file for each device will be of the form:
+
+ifcfg-id-xx:xx:xx:xx:xx:xx
+
+ Where the "xx" portion will be replaced with the digits from
+the device's permanent MAC address.
+
+ Once the set of ifcfg-id-xx:xx:xx:xx:xx:xx files has been
+created, it is necessary to edit the configuration files for the slave
+devices (the MAC addresses correspond to those of the slave devices).
+Before editing, the file will contain multiple lines, and will look
+something like this:
+
+BOOTPROTO='dhcp'
+STARTMODE='on'
+USERCTL='no'
+UNIQUE='XNzu.WeZGOGF+4wE'
+_nm_name='bus-pci-0001:61:01.0'
+
+ Change the BOOTPROTO and STARTMODE lines to the following:
+
+BOOTPROTO='none'
+STARTMODE='off'
+
+ Do not alter the UNIQUE or _nm_name lines. Remove any other
+lines (USERCTL, etc).
+
+ Once the ifcfg-id-xx:xx:xx:xx:xx:xx files have been modified,
+it's time to create the configuration file for the bonding device
+itself. This file is named ifcfg-bondX, where X is the number of the
+bonding device to create, starting at 0. The first such file is
+ifcfg-bond0, the second is ifcfg-bond1, and so on. The sysconfig
+network configuration system will correctly start multiple instances
+of bonding.
+
+ The contents of the ifcfg-bondX file is as follows:
+
+BOOTPROTO="static"
+BROADCAST="10.0.2.255"
+IPADDR="10.0.2.10"
+NETMASK="255.255.0.0"
+NETWORK="10.0.2.0"
+REMOTE_IPADDR=""
+STARTMODE="onboot"
+BONDING_MASTER="yes"
+BONDING_MODULE_OPTS="mode=active-backup miimon=100"
+BONDING_SLAVE0="eth0"
+BONDING_SLAVE1="bus-pci-0000:06:08.1"
+
+ Replace the sample BROADCAST, IPADDR, NETMASK and NETWORK
+values with the appropriate values for your network.
+
+ The STARTMODE specifies when the device is brought online.
+The possible values are:
+
+ onboot: The device is started at boot time. If you're not
+ sure, this is probably what you want.
+
+ manual: The device is started only when ifup is called
+ manually. Bonding devices may be configured this
+ way if you do not wish them to start automatically
+ at boot for some reason.
+
+ hotplug: The device is started by a hotplug event. This is not
+ a valid choice for a bonding device.
+
+ off or ignore: The device configuration is ignored.
+
+ The line BONDING_MASTER='yes' indicates that the device is a
+bonding master device. The only useful value is "yes."
+
+ The contents of BONDING_MODULE_OPTS are supplied to the
+instance of the bonding module for this device. Specify the options
+for the bonding mode, link monitoring, and so on here. Do not include
+the max_bonds bonding parameter; this will confuse the configuration
+system if you have multiple bonding devices.
+
+ Finally, supply one BONDING_SLAVEn="slave device" for each
+slave. where "n" is an increasing value, one for each slave. The
+"slave device" is either an interface name, e.g., "eth0", or a device
+specifier for the network device. The interface name is easier to
+find, but the ethN names are subject to change at boot time if, e.g.,
+a device early in the sequence has failed. The device specifiers
+(bus-pci-0000:06:08.1 in the example above) specify the physical
+network device, and will not change unless the device's bus location
+changes (for example, it is moved from one PCI slot to another). The
+example above uses one of each type for demonstration purposes; most
+configurations will choose one or the other for all slave devices.
+
+ When all configuration files have been modified or created,
+networking must be restarted for the configuration changes to take
+effect. This can be accomplished via the following:
+
+# /etc/init.d/network restart
+
+ Note that the network control script (/sbin/ifdown) will
+remove the bonding module as part of the network shutdown processing,
+so it is not necessary to remove the module by hand if, e.g., the
+module parameters have changed.
+
+ Also, at this writing, YaST/YaST2 will not manage bonding
+devices (they do not show bonding interfaces on its list of network
+devices). It is necessary to edit the configuration file by hand to
+change the bonding configuration.
+
+ Additional general options and details of the ifcfg file
+format can be found in an example ifcfg template file:
+
+/etc/sysconfig/network/ifcfg.template
+
+ Note that the template does not document the various BONDING_
+settings described above, but does describe many of the other options.
+
+3.1.1 Using DHCP with Sysconfig
+-------------------------------
+
+ Under sysconfig, configuring a device with BOOTPROTO='dhcp'
+will cause it to query DHCP for its IP address information. At this
+writing, this does not function for bonding devices; the scripts
+attempt to obtain the device address from DHCP prior to adding any of
+the slave devices. Without active slaves, the DHCP requests are not
+sent to the network.
+
+3.1.2 Configuring Multiple Bonds with Sysconfig
+-----------------------------------------------
+
+ The sysconfig network initialization system is capable of
+handling multiple bonding devices. All that is necessary is for each
+bonding instance to have an appropriately configured ifcfg-bondX file
+(as described above). Do not specify the "max_bonds" parameter to any
+instance of bonding, as this will confuse sysconfig. If you require
+multiple bonding devices with identical parameters, create multiple
+ifcfg-bondX files.
+
+ Because the sysconfig scripts supply the bonding module
+options in the ifcfg-bondX file, it is not necessary to add them to
+the system /etc/modules.conf or /etc/modprobe.conf configuration file.
+
+3.2 Configuration with Initscripts Support
+------------------------------------------
+
+ This section applies to distros using a recent version of
+initscripts with bonding support, for example, Red Hat Enterprise Linux
+version 3 or later, Fedora, etc. On these systems, the network
+initialization scripts have knowledge of bonding, and can be configured to
+control bonding devices. Note that older versions of the initscripts
+package have lower levels of support for bonding; this will be noted where
+applicable.
+
+ These distros will not automatically load the network adapter
+driver unless the ethX device is configured with an IP address.
+Because of this constraint, users must manually configure a
+network-script file for all physical adapters that will be members of
+a bondX link. Network script files are located in the directory:
+
+/etc/sysconfig/network-scripts
+
+ The file name must be prefixed with "ifcfg-eth" and suffixed
+with the adapter's physical adapter number. For example, the script
+for eth0 would be named /etc/sysconfig/network-scripts/ifcfg-eth0.
+Place the following text in the file:
+
+DEVICE=eth0
+USERCTL=no
+ONBOOT=yes
+MASTER=bond0
+SLAVE=yes
+BOOTPROTO=none
+
+ The DEVICE= line will be different for every ethX device and
+must correspond with the name of the file, i.e., ifcfg-eth1 must have
+a device line of DEVICE=eth1. The setting of the MASTER= line will
+also depend on the final bonding interface name chosen for your bond.
+As with other network devices, these typically start at 0, and go up
+one for each device, i.e., the first bonding instance is bond0, the
+second is bond1, and so on.
+
+ Next, create a bond network script. The file name for this
+script will be /etc/sysconfig/network-scripts/ifcfg-bondX where X is
+the number of the bond. For bond0 the file is named "ifcfg-bond0",
+for bond1 it is named "ifcfg-bond1", and so on. Within that file,
+place the following text:
+
+DEVICE=bond0
+IPADDR=192.168.1.1
+NETMASK=255.255.255.0
+NETWORK=192.168.1.0
+BROADCAST=192.168.1.255
+ONBOOT=yes
+BOOTPROTO=none
+USERCTL=no
+
+ Be sure to change the networking specific lines (IPADDR,
+NETMASK, NETWORK and BROADCAST) to match your network configuration.
+
+ For later versions of initscripts, such as that found with Fedora
+7 and Red Hat Enterprise Linux version 5 (or later), it is possible, and,
+indeed, preferable, to specify the bonding options in the ifcfg-bond0
+file, e.g. a line of the format:
+
+BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=+192.168.1.254"
+
+ will configure the bond with the specified options. The options
+specified in BONDING_OPTS are identical to the bonding module parameters
+except for the arp_ip_target field. Each target should be included as a
+separate option and should be preceded by a '+' to indicate it should be
+added to the list of queried targets, e.g.,
+
+ arp_ip_target=+192.168.1.1 arp_ip_target=+192.168.1.2
+
+ is the proper syntax to specify multiple targets. When specifying
+options via BONDING_OPTS, it is not necessary to edit /etc/modules.conf or
+/etc/modprobe.conf.
+
+ For older versions of initscripts that do not support
+BONDING_OPTS, it is necessary to edit /etc/modules.conf (or
+/etc/modprobe.conf, depending upon your distro) to load the bonding module
+with your desired options when the bond0 interface is brought up. The
+following lines in /etc/modules.conf (or modprobe.conf) will load the
+bonding module, and select its options:
+
+alias bond0 bonding
+options bond0 mode=balance-alb miimon=100
+
+ Replace the sample parameters with the appropriate set of
+options for your configuration.
+
+ Finally run "/etc/rc.d/init.d/network restart" as root. This
+will restart the networking subsystem and your bond link should be now
+up and running.
+
+3.2.1 Using DHCP with Initscripts
+---------------------------------
+
+ Recent versions of initscripts (the versions supplied with Fedora
+Core 3 and Red Hat Enterprise Linux 4, or later versions, are reported to
+work) have support for assigning IP information to bonding devices via
+DHCP.
+
+ To configure bonding for DHCP, configure it as described
+above, except replace the line "BOOTPROTO=none" with "BOOTPROTO=dhcp"
+and add a line consisting of "TYPE=Bonding". Note that the TYPE value
+is case sensitive.
+
+3.2.2 Configuring Multiple Bonds with Initscripts
+-------------------------------------------------
+
+ Initscripts packages that are included with Fedora 7 and Red Hat
+Enterprise Linux 5 support multiple bonding interfaces by simply
+specifying the appropriate BONDING_OPTS= in ifcfg-bondX where X is the
+number of the bond. This support requires sysfs support in the kernel,
+and a bonding driver of version 3.0.0 or later. Other configurations may
+not support this method for specifying multiple bonding interfaces; for
+those instances, see the "Configuring Multiple Bonds Manually" section,
+below.
+
+3.3 Configuring Bonding Manually with Ifenslave
+-----------------------------------------------
+
+ This section applies to distros whose network initialization
+scripts (the sysconfig or initscripts package) do not have specific
+knowledge of bonding. One such distro is SuSE Linux Enterprise Server
+version 8.
+
+ The general method for these systems is to place the bonding
+module parameters into /etc/modules.conf or /etc/modprobe.conf (as
+appropriate for the installed distro), then add modprobe and/or
+ifenslave commands to the system's global init script. The name of
+the global init script differs; for sysconfig, it is
+/etc/init.d/boot.local and for initscripts it is /etc/rc.d/rc.local.
+
+ For example, if you wanted to make a simple bond of two e100
+devices (presumed to be eth0 and eth1), and have it persist across
+reboots, edit the appropriate file (/etc/init.d/boot.local or
+/etc/rc.d/rc.local), and add the following:
+
+modprobe bonding mode=balance-alb miimon=100
+modprobe e100
+ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up
+ifenslave bond0 eth0
+ifenslave bond0 eth1
+
+ Replace the example bonding module parameters and bond0
+network configuration (IP address, netmask, etc) with the appropriate
+values for your configuration.
+
+ Unfortunately, this method will not provide support for the
+ifup and ifdown scripts on the bond devices. To reload the bonding
+configuration, it is necessary to run the initialization script, e.g.,
+
+# /etc/init.d/boot.local
+
+ or
+
+# /etc/rc.d/rc.local
+
+ It may be desirable in such a case to create a separate script
+which only initializes the bonding configuration, then call that
+separate script from within boot.local. This allows for bonding to be
+enabled without re-running the entire global init script.
+
+ To shut down the bonding devices, it is necessary to first
+mark the bonding device itself as being down, then remove the
+appropriate device driver modules. For our example above, you can do
+the following:
+
+# ifconfig bond0 down
+# rmmod bonding
+# rmmod e100
+
+ Again, for convenience, it may be desirable to create a script
+with these commands.
+
+
+3.3.1 Configuring Multiple Bonds Manually
+-----------------------------------------
+
+ This section contains information on configuring multiple
+bonding devices with differing options for those systems whose network
+initialization scripts lack support for configuring multiple bonds.
+
+ If you require multiple bonding devices, but all with the same
+options, you may wish to use the "max_bonds" module parameter,
+documented above.
+
+ To create multiple bonding devices with differing options, it is
+preferrable to use bonding parameters exported by sysfs, documented in the
+section below.
+
+ For versions of bonding without sysfs support, the only means to
+provide multiple instances of bonding with differing options is to load
+the bonding driver multiple times. Note that current versions of the
+sysconfig network initialization scripts handle this automatically; if
+your distro uses these scripts, no special action is needed. See the
+section Configuring Bonding Devices, above, if you're not sure about your
+network initialization scripts.
+
+ To load multiple instances of the module, it is necessary to
+specify a different name for each instance (the module loading system
+requires that every loaded module, even multiple instances of the same
+module, have a unique name). This is accomplished by supplying multiple
+sets of bonding options in /etc/modprobe.conf, for example:
+
+alias bond0 bonding
+options bond0 -o bond0 mode=balance-rr miimon=100
+
+alias bond1 bonding
+options bond1 -o bond1 mode=balance-alb miimon=50
+
+ will load the bonding module two times. The first instance is
+named "bond0" and creates the bond0 device in balance-rr mode with an
+miimon of 100. The second instance is named "bond1" and creates the
+bond1 device in balance-alb mode with an miimon of 50.
+
+ In some circumstances (typically with older distributions),
+the above does not work, and the second bonding instance never sees
+its options. In that case, the second options line can be substituted
+as follows:
+
+install bond1 /sbin/modprobe --ignore-install bonding -o bond1 \
+ mode=balance-alb miimon=50
+
+ This may be repeated any number of times, specifying a new and
+unique name in place of bond1 for each subsequent instance.
+
+ It has been observed that some Red Hat supplied kernels are unable
+to rename modules at load time (the "-o bond1" part). Attempts to pass
+that option to modprobe will produce an "Operation not permitted" error.
+This has been reported on some Fedora Core kernels, and has been seen on
+RHEL 4 as well. On kernels exhibiting this problem, it will be impossible
+to configure multiple bonds with differing parameters (as they are older
+kernels, and also lack sysfs support).
+
+3.4 Configuring Bonding Manually via Sysfs
+------------------------------------------
+
+ Starting with version 3.0.0, Channel Bonding may be configured
+via the sysfs interface. This interface allows dynamic configuration
+of all bonds in the system without unloading the module. It also
+allows for adding and removing bonds at runtime. Ifenslave is no
+longer required, though it is still supported.
+
+ Use of the sysfs interface allows you to use multiple bonds
+with different configurations without having to reload the module.
+It also allows you to use multiple, differently configured bonds when
+bonding is compiled into the kernel.
+
+ You must have the sysfs filesystem mounted to configure
+bonding this way. The examples in this document assume that you
+are using the standard mount point for sysfs, e.g. /sys. If your
+sysfs filesystem is mounted elsewhere, you will need to adjust the
+example paths accordingly.
+
+Creating and Destroying Bonds
+-----------------------------
+To add a new bond foo:
+# echo +foo > /sys/class/net/bonding_masters
+
+To remove an existing bond bar:
+# echo -bar > /sys/class/net/bonding_masters
+
+To show all existing bonds:
+# cat /sys/class/net/bonding_masters
+
+NOTE: due to 4K size limitation of sysfs files, this list may be
+truncated if you have more than a few hundred bonds. This is unlikely
+to occur under normal operating conditions.
+
+Adding and Removing Slaves
+--------------------------
+ Interfaces may be enslaved to a bond using the file
+/sys/class/net/<bond>/bonding/slaves. The semantics for this file
+are the same as for the bonding_masters file.
+
+To enslave interface eth0 to bond bond0:
+# ifconfig bond0 up
+# echo +eth0 > /sys/class/net/bond0/bonding/slaves
+
+To free slave eth0 from bond bond0:
+# echo -eth0 > /sys/class/net/bond0/bonding/slaves
+
+ When an interface is enslaved to a bond, symlinks between the
+two are created in the sysfs filesystem. In this case, you would get
+/sys/class/net/bond0/slave_eth0 pointing to /sys/class/net/eth0, and
+/sys/class/net/eth0/master pointing to /sys/class/net/bond0.
+
+ This means that you can tell quickly whether or not an
+interface is enslaved by looking for the master symlink. Thus:
+# echo -eth0 > /sys/class/net/eth0/master/bonding/slaves
+will free eth0 from whatever bond it is enslaved to, regardless of
+the name of the bond interface.
+
+Changing a Bond's Configuration
+-------------------------------
+ Each bond may be configured individually by manipulating the
+files located in /sys/class/net/<bond name>/bonding
+
+ The names of these files correspond directly with the command-
+line parameters described elsewhere in this file, and, with the
+exception of arp_ip_target, they accept the same values. To see the
+current setting, simply cat the appropriate file.
+
+ A few examples will be given here; for specific usage
+guidelines for each parameter, see the appropriate section in this
+document.
+
+To configure bond0 for balance-alb mode:
+# ifconfig bond0 down
+# echo 6 > /sys/class/net/bond0/bonding/mode
+ - or -
+# echo balance-alb > /sys/class/net/bond0/bonding/mode
+ NOTE: The bond interface must be down before the mode can be
+changed.
+
+To enable MII monitoring on bond0 with a 1 second interval:
+# echo 1000 > /sys/class/net/bond0/bonding/miimon
+ NOTE: If ARP monitoring is enabled, it will disabled when MII
+monitoring is enabled, and vice-versa.
+
+To add ARP targets:
+# echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target
+# echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target
+ NOTE: up to 10 target addresses may be specified.
+
+To remove an ARP target:
+# echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target
+
+Example Configuration
+---------------------
+ We begin with the same example that is shown in section 3.3,
+executed with sysfs, and without using ifenslave.
+
+ To make a simple bond of two e100 devices (presumed to be eth0
+and eth1), and have it persist across reboots, edit the appropriate
+file (/etc/init.d/boot.local or /etc/rc.d/rc.local), and add the
+following:
+
+modprobe bonding
+modprobe e100
+echo balance-alb > /sys/class/net/bond0/bonding/mode
+ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up
+echo 100 > /sys/class/net/bond0/bonding/miimon
+echo +eth0 > /sys/class/net/bond0/bonding/slaves
+echo +eth1 > /sys/class/net/bond0/bonding/slaves
+
+ To add a second bond, with two e1000 interfaces in
+active-backup mode, using ARP monitoring, add the following lines to
+your init script:
+
+modprobe e1000
+echo +bond1 > /sys/class/net/bonding_masters
+echo active-backup > /sys/class/net/bond1/bonding/mode
+ifconfig bond1 192.168.2.1 netmask 255.255.255.0 up
+echo +192.168.2.100 /sys/class/net/bond1/bonding/arp_ip_target
+echo 2000 > /sys/class/net/bond1/bonding/arp_interval
+echo +eth2 > /sys/class/net/bond1/bonding/slaves
+echo +eth3 > /sys/class/net/bond1/bonding/slaves
+
+
+4. Querying Bonding Configuration
+=================================
+
+4.1 Bonding Configuration
+-------------------------
+
+ Each bonding device has a read-only file residing in the
+/proc/net/bonding directory. The file contents include information
+about the bonding configuration, options and state of each slave.
+
+ For example, the contents of /proc/net/bonding/bond0 after the
+driver is loaded with parameters of mode=0 and miimon=1000 is
+generally as follows:
+
+ Ethernet Channel Bonding Driver: 2.6.1 (October 29, 2004)
+ Bonding Mode: load balancing (round-robin)
+ Currently Active Slave: eth0
+ MII Status: up
+ MII Polling Interval (ms): 1000
+ Up Delay (ms): 0
+ Down Delay (ms): 0
+
+ Slave Interface: eth1
+ MII Status: up
+ Link Failure Count: 1
+
+ Slave Interface: eth0
+ MII Status: up
+ Link Failure Count: 1
+
+ The precise format and contents will change depending upon the
+bonding configuration, state, and version of the bonding driver.
+
+4.2 Network configuration
+-------------------------
+
+ The network configuration can be inspected using the ifconfig
+command. Bonding devices will have the MASTER flag set; Bonding slave
+devices will have the SLAVE flag set. The ifconfig output does not
+contain information on which slaves are associated with which masters.
+
+ In the example below, the bond0 interface is the master
+(MASTER) while eth0 and eth1 are slaves (SLAVE). Notice all slaves of
+bond0 have the same MAC address (HWaddr) as bond0 for all modes except
+TLB and ALB that require a unique MAC address for each slave.
+
+# /sbin/ifconfig
+bond0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4
+ inet addr:XXX.XXX.XXX.YYY Bcast:XXX.XXX.XXX.255 Mask:255.255.252.0
+ UP BROADCAST RUNNING MASTER MULTICAST MTU:1500 Metric:1
+ RX packets:7224794 errors:0 dropped:0 overruns:0 frame:0
+ TX packets:3286647 errors:1 dropped:0 overruns:1 carrier:0
+ collisions:0 txqueuelen:0
+
+eth0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4
+ UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1
+ RX packets:3573025 errors:0 dropped:0 overruns:0 frame:0
+ TX packets:1643167 errors:1 dropped:0 overruns:1 carrier:0
+ collisions:0 txqueuelen:100
+ Interrupt:10 Base address:0x1080
+
+eth1 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4
+ UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1
+ RX packets:3651769 errors:0 dropped:0 overruns:0 frame:0
+ TX packets:1643480 errors:0 dropped:0 overruns:0 carrier:0
+ collisions:0 txqueuelen:100
+ Interrupt:9 Base address:0x1400
+
+5. Switch Configuration
+=======================
+
+ For this section, "switch" refers to whatever system the
+bonded devices are directly connected to (i.e., where the other end of
+the cable plugs into). This may be an actual dedicated switch device,
+or it may be another regular system (e.g., another computer running
+Linux),
+
+ The active-backup, balance-tlb and balance-alb modes do not
+require any specific configuration of the switch.
+
+ The 802.3ad mode requires that the switch have the appropriate
+ports configured as an 802.3ad aggregation. The precise method used
+to configure this varies from switch to switch, but, for example, a
+Cisco 3550 series switch requires that the appropriate ports first be
+grouped together in a single etherchannel instance, then that
+etherchannel is set to mode "lacp" to enable 802.3ad (instead of
+standard EtherChannel).
+
+ The balance-rr, balance-xor and broadcast modes generally
+require that the switch have the appropriate ports grouped together.
+The nomenclature for such a group differs between switches, it may be
+called an "etherchannel" (as in the Cisco example, above), a "trunk
+group" or some other similar variation. For these modes, each switch
+will also have its own configuration options for the switch's transmit
+policy to the bond. Typical choices include XOR of either the MAC or
+IP addresses. The transmit policy of the two peers does not need to
+match. For these three modes, the bonding mode really selects a
+transmit policy for an EtherChannel group; all three will interoperate
+with another EtherChannel group.
+
+
+6. 802.1q VLAN Support
+======================
+
+ It is possible to configure VLAN devices over a bond interface
+using the 8021q driver. However, only packets coming from the 8021q
+driver and passing through bonding will be tagged by default. Self
+generated packets, for example, bonding's learning packets or ARP
+packets generated by either ALB mode or the ARP monitor mechanism, are
+tagged internally by bonding itself. As a result, bonding must
+"learn" the VLAN IDs configured above it, and use those IDs to tag
+self generated packets.
+
+ For reasons of simplicity, and to support the use of adapters
+that can do VLAN hardware acceleration offloading, the bonding
+interface declares itself as fully hardware offloading capable, it gets
+the add_vid/kill_vid notifications to gather the necessary
+information, and it propagates those actions to the slaves. In case
+of mixed adapter types, hardware accelerated tagged packets that
+should go through an adapter that is not offloading capable are
+"un-accelerated" by the bonding driver so the VLAN tag sits in the
+regular location.
+
+ VLAN interfaces *must* be added on top of a bonding interface
+only after enslaving at least one slave. The bonding interface has a
+hardware address of 00:00:00:00:00:00 until the first slave is added.
+If the VLAN interface is created prior to the first enslavement, it
+would pick up the all-zeroes hardware address. Once the first slave
+is attached to the bond, the bond device itself will pick up the
+slave's hardware address, which is then available for the VLAN device.
+
+ Also, be aware that a similar problem can occur if all slaves
+are released from a bond that still has one or more VLAN interfaces on
+top of it. When a new slave is added, the bonding interface will
+obtain its hardware address from the first slave, which might not
+match the hardware address of the VLAN interfaces (which was
+ultimately copied from an earlier slave).
+
+ There are two methods to insure that the VLAN device operates
+with the correct hardware address if all slaves are removed from a
+bond interface:
+
+ 1. Remove all VLAN interfaces then recreate them
+
+ 2. Set the bonding interface's hardware address so that it
+matches the hardware address of the VLAN interfaces.
+
+ Note that changing a VLAN interface's HW address would set the
+underlying device -- i.e. the bonding interface -- to promiscuous
+mode, which might not be what you want.
+
+
+7. Link Monitoring
+==================
+
+ The bonding driver at present supports two schemes for
+monitoring a slave device's link state: the ARP monitor and the MII
+monitor.
+
+ At the present time, due to implementation restrictions in the
+bonding driver itself, it is not possible to enable both ARP and MII
+monitoring simultaneously.
+
+7.1 ARP Monitor Operation
+-------------------------
+
+ The ARP monitor operates as its name suggests: it sends ARP
+queries to one or more designated peer systems on the network, and
+uses the response as an indication that the link is operating. This
+gives some assurance that traffic is actually flowing to and from one
+or more peers on the local network.
+
+ The ARP monitor relies on the device driver itself to verify
+that traffic is flowing. In particular, the driver must keep up to
+date the last receive time, dev->last_rx, and transmit start time,
+dev->trans_start. If these are not updated by the driver, then the
+ARP monitor will immediately fail any slaves using that driver, and
+those slaves will stay down. If networking monitoring (tcpdump, etc)
+shows the ARP requests and replies on the network, then it may be that
+your device driver is not updating last_rx and trans_start.
+
+7.2 Configuring Multiple ARP Targets
+------------------------------------
+
+ While ARP monitoring can be done with just one target, it can
+be useful in a High Availability setup to have several targets to
+monitor. In the case of just one target, the target itself may go
+down or have a problem making it unresponsive to ARP requests. Having
+an additional target (or several) increases the reliability of the ARP
+monitoring.
+
+ Multiple ARP targets must be separated by commas as follows:
+
+# example options for ARP monitoring with three targets
+alias bond0 bonding
+options bond0 arp_interval=60 arp_ip_target=192.168.0.1,192.168.0.3,192.168.0.9
+
+ For just a single target the options would resemble:
+
+# example options for ARP monitoring with one target
+alias bond0 bonding
+options bond0 arp_interval=60 arp_ip_target=192.168.0.100
+
+
+7.3 MII Monitor Operation
+-------------------------
+
+ The MII monitor monitors only the carrier state of the local
+network interface. It accomplishes this in one of three ways: by
+depending upon the device driver to maintain its carrier state, by
+querying the device's MII registers, or by making an ethtool query to
+the device.
+
+ If the use_carrier module parameter is 1 (the default value),
+then the MII monitor will rely on the driver for carrier state
+information (via the netif_carrier subsystem). As explained in the
+use_carrier parameter information, above, if the MII monitor fails to
+detect carrier loss on the device (e.g., when the cable is physically
+disconnected), it may be that the driver does not support
+netif_carrier.
+
+ If use_carrier is 0, then the MII monitor will first query the
+device's (via ioctl) MII registers and check the link state. If that
+request fails (not just that it returns carrier down), then the MII
+monitor will make an ethtool ETHOOL_GLINK request to attempt to obtain
+the same information. If both methods fail (i.e., the driver either
+does not support or had some error in processing both the MII register
+and ethtool requests), then the MII monitor will assume the link is
+up.
+
+8. Potential Sources of Trouble
+===============================
+
+8.1 Adventures in Routing
+-------------------------
+
+ When bonding is configured, it is important that the slave
+devices not have routes that supersede routes of the master (or,
+generally, not have routes at all). For example, suppose the bonding
+device bond0 has two slaves, eth0 and eth1, and the routing table is
+as follows:
+
+Kernel IP routing table
+Destination Gateway Genmask Flags MSS Window irtt Iface
+10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth0
+10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth1
+10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 bond0
+127.0.0.0 0.0.0.0 255.0.0.0 U 40 0 0 lo
+
+ This routing configuration will likely still update the
+receive/transmit times in the driver (needed by the ARP monitor), but
+may bypass the bonding driver (because outgoing traffic to, in this
+case, another host on network 10 would use eth0 or eth1 before bond0).
+
+ The ARP monitor (and ARP itself) may become confused by this
+configuration, because ARP requests (generated by the ARP monitor)
+will be sent on one interface (bond0), but the corresponding reply
+will arrive on a different interface (eth0). This reply looks to ARP
+as an unsolicited ARP reply (because ARP matches replies on an
+interface basis), and is discarded. The MII monitor is not affected
+by the state of the routing table.
+
+ The solution here is simply to insure that slaves do not have
+routes of their own, and if for some reason they must, those routes do
+not supersede routes of their master. This should generally be the
+case, but unusual configurations or errant manual or automatic static
+route additions may cause trouble.
+
+8.2 Ethernet Device Renaming
+----------------------------
+
+ On systems with network configuration scripts that do not
+associate physical devices directly with network interface names (so
+that the same physical device always has the same "ethX" name), it may
+be necessary to add some special logic to either /etc/modules.conf or
+/etc/modprobe.conf (depending upon which is installed on the system).
+
+ For example, given a modules.conf containing the following:
+
+alias bond0 bonding
+options bond0 mode=some-mode miimon=50
+alias eth0 tg3
+alias eth1 tg3
+alias eth2 e1000
+alias eth3 e1000
+
+ If neither eth0 and eth1 are slaves to bond0, then when the
+bond0 interface comes up, the devices may end up reordered. This
+happens because bonding is loaded first, then its slave device's
+drivers are loaded next. Since no other drivers have been loaded,
+when the e1000 driver loads, it will receive eth0 and eth1 for its
+devices, but the bonding configuration tries to enslave eth2 and eth3
+(which may later be assigned to the tg3 devices).
+
+ Adding the following:
+
+add above bonding e1000 tg3
+
+ causes modprobe to load e1000 then tg3, in that order, when
+bonding is loaded. This command is fully documented in the
+modules.conf manual page.
+
+ On systems utilizing modprobe.conf (or modprobe.conf.local),
+an equivalent problem can occur. In this case, the following can be
+added to modprobe.conf (or modprobe.conf.local, as appropriate), as
+follows (all on one line; it has been split here for clarity):
+
+install bonding /sbin/modprobe tg3; /sbin/modprobe e1000;
+ /sbin/modprobe --ignore-install bonding
+
+ This will, when loading the bonding module, rather than
+performing the normal action, instead execute the provided command.
+This command loads the device drivers in the order needed, then calls
+modprobe with --ignore-install to cause the normal action to then take
+place. Full documentation on this can be found in the modprobe.conf
+and modprobe manual pages.
+
+8.3. Painfully Slow Or No Failed Link Detection By Miimon
+---------------------------------------------------------
+
+ By default, bonding enables the use_carrier option, which
+instructs bonding to trust the driver to maintain carrier state.
+
+ As discussed in the options section, above, some drivers do
+not support the netif_carrier_on/_off link state tracking system.
+With use_carrier enabled, bonding will always see these links as up,
+regardless of their actual state.
+
+ Additionally, other drivers do support netif_carrier, but do
+not maintain it in real time, e.g., only polling the link state at
+some fixed interval. In this case, miimon will detect failures, but
+only after some long period of time has expired. If it appears that
+miimon is very slow in detecting link failures, try specifying
+use_carrier=0 to see if that improves the failure detection time. If
+it does, then it may be that the driver checks the carrier state at a
+fixed interval, but does not cache the MII register values (so the
+use_carrier=0 method of querying the registers directly works). If
+use_carrier=0 does not improve the failover, then the driver may cache
+the registers, or the problem may be elsewhere.
+
+ Also, remember that miimon only checks for the device's
+carrier state. It has no way to determine the state of devices on or
+beyond other ports of a switch, or if a switch is refusing to pass
+traffic while still maintaining carrier on.
+
+9. SNMP agents
+===============
+
+ If running SNMP agents, the bonding driver should be loaded
+before any network drivers participating in a bond. This requirement
+is due to the interface index (ipAdEntIfIndex) being associated to
+the first interface found with a given IP address. That is, there is
+only one ipAdEntIfIndex for each IP address. For example, if eth0 and
+eth1 are slaves of bond0 and the driver for eth0 is loaded before the
+bonding driver, the interface for the IP address will be associated
+with the eth0 interface. This configuration is shown below, the IP
+address 192.168.1.1 has an interface index of 2 which indexes to eth0
+in the ifDescr table (ifDescr.2).
+
+ interfaces.ifTable.ifEntry.ifDescr.1 = lo
+ interfaces.ifTable.ifEntry.ifDescr.2 = eth0
+ interfaces.ifTable.ifEntry.ifDescr.3 = eth1
+ interfaces.ifTable.ifEntry.ifDescr.4 = eth2
+ interfaces.ifTable.ifEntry.ifDescr.5 = eth3
+ interfaces.ifTable.ifEntry.ifDescr.6 = bond0
+ ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.10.10.10 = 5
+ ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.192.168.1.1 = 2
+ ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.74.20.94 = 4
+ ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.127.0.0.1 = 1
+
+ This problem is avoided by loading the bonding driver before
+any network drivers participating in a bond. Below is an example of
+loading the bonding driver first, the IP address 192.168.1.1 is
+correctly associated with ifDescr.2.
+
+ interfaces.ifTable.ifEntry.ifDescr.1 = lo
+ interfaces.ifTable.ifEntry.ifDescr.2 = bond0
+ interfaces.ifTable.ifEntry.ifDescr.3 = eth0
+ interfaces.ifTable.ifEntry.ifDescr.4 = eth1
+ interfaces.ifTable.ifEntry.ifDescr.5 = eth2
+ interfaces.ifTable.ifEntry.ifDescr.6 = eth3
+ ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.10.10.10 = 6
+ ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.192.168.1.1 = 2
+ ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.74.20.94 = 5
+ ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.127.0.0.1 = 1
+
+ While some distributions may not report the interface name in
+ifDescr, the association between the IP address and IfIndex remains
+and SNMP functions such as Interface_Scan_Next will report that
+association.
+
+10. Promiscuous mode
+====================
+
+ When running network monitoring tools, e.g., tcpdump, it is
+common to enable promiscuous mode on the device, so that all traffic
+is seen (instead of seeing only traffic destined for the local host).
+The bonding driver handles promiscuous mode changes to the bonding
+master device (e.g., bond0), and propagates the setting to the slave
+devices.
+
+ For the balance-rr, balance-xor, broadcast, and 802.3ad modes,
+the promiscuous mode setting is propagated to all slaves.
+
+ For the active-backup, balance-tlb and balance-alb modes, the
+promiscuous mode setting is propagated only to the active slave.
+
+ For balance-tlb mode, the active slave is the slave currently
+receiving inbound traffic.
+
+ For balance-alb mode, the active slave is the slave used as a
+"primary." This slave is used for mode-specific control traffic, for
+sending to peers that are unassigned or if the load is unbalanced.
+
+ For the active-backup, balance-tlb and balance-alb modes, when
+the active slave changes (e.g., due to a link failure), the
+promiscuous setting will be propagated to the new active slave.
+
+11. Configuring Bonding for High Availability
+=============================================
+
+ High Availability refers to configurations that provide
+maximum network availability by having redundant or backup devices,
+links or switches between the host and the rest of the world. The
+goal is to provide the maximum availability of network connectivity
+(i.e., the network always works), even though other configurations
+could provide higher throughput.
+
+11.1 High Availability in a Single Switch Topology
+--------------------------------------------------
+
+ If two hosts (or a host and a single switch) are directly
+connected via multiple physical links, then there is no availability
+penalty to optimizing for maximum bandwidth. In this case, there is
+only one switch (or peer), so if it fails, there is no alternative
+access to fail over to. Additionally, the bonding load balance modes
+support link monitoring of their members, so if individual links fail,
+the load will be rebalanced across the remaining devices.
+
+ See Section 13, "Configuring Bonding for Maximum Throughput"
+for information on configuring bonding with one peer device.
+
+11.2 High Availability in a Multiple Switch Topology
+----------------------------------------------------
+
+ With multiple switches, the configuration of bonding and the
+network changes dramatically. In multiple switch topologies, there is
+a trade off between network availability and usable bandwidth.
+
+ Below is a sample network, configured to maximize the
+availability of the network:
+
+ | |
+ |port3 port3|
+ +-----+----+ +-----+----+
+ | |port2 ISL port2| |
+ | switch A +--------------------------+ switch B |
+ | | | |
+ +-----+----+ +-----++---+
+ |port1 port1|
+ | +-------+ |
+ +-------------+ host1 +---------------+
+ eth0 +-------+ eth1
+
+ In this configuration, there is a link between the two
+switches (ISL, or inter switch link), and multiple ports connecting to
+the outside world ("port3" on each switch). There is no technical
+reason that this could not be extended to a third switch.
+
+11.2.1 HA Bonding Mode Selection for Multiple Switch Topology
+-------------------------------------------------------------
+
+ In a topology such as the example above, the active-backup and
+broadcast modes are the only useful bonding modes when optimizing for
+availability; the other modes require all links to terminate on the
+same peer for them to behave rationally.
+
+active-backup: This is generally the preferred mode, particularly if
+ the switches have an ISL and play together well. If the
+ network configuration is such that one switch is specifically
+ a backup switch (e.g., has lower capacity, higher cost, etc),
+ then the primary option can be used to insure that the
+ preferred link is always used when it is available.
+
+broadcast: This mode is really a special purpose mode, and is suitable
+ only for very specific needs. For example, if the two
+ switches are not connected (no ISL), and the networks beyond
+ them are totally independent. In this case, if it is
+ necessary for some specific one-way traffic to reach both
+ independent networks, then the broadcast mode may be suitable.
+
+11.2.2 HA Link Monitoring Selection for Multiple Switch Topology
+----------------------------------------------------------------
+
+ The choice of link monitoring ultimately depends upon your
+switch. If the switch can reliably fail ports in response to other
+failures, then either the MII or ARP monitors should work. For
+example, in the above example, if the "port3" link fails at the remote
+end, the MII monitor has no direct means to detect this. The ARP
+monitor could be configured with a target at the remote end of port3,
+thus detecting that failure without switch support.
+
+ In general, however, in a multiple switch topology, the ARP
+monitor can provide a higher level of reliability in detecting end to
+end connectivity failures (which may be caused by the failure of any
+individual component to pass traffic for any reason). Additionally,
+the ARP monitor should be configured with multiple targets (at least
+one for each switch in the network). This will insure that,
+regardless of which switch is active, the ARP monitor has a suitable
+target to query.
+
+ Note, also, that of late many switches now support a functionality
+generally referred to as "trunk failover." This is a feature of the
+switch that causes the link state of a particular switch port to be set
+down (or up) when the state of another switch port goes down (or up).
+It's purpose is to propogate link failures from logically "exterior" ports
+to the logically "interior" ports that bonding is able to monitor via
+miimon. Availability and configuration for trunk failover varies by
+switch, but this can be a viable alternative to the ARP monitor when using
+suitable switches.
+
+12. Configuring Bonding for Maximum Throughput
+==============================================
+
+12.1 Maximizing Throughput in a Single Switch Topology
+------------------------------------------------------
+
+ In a single switch configuration, the best method to maximize
+throughput depends upon the application and network environment. The
+various load balancing modes each have strengths and weaknesses in
+different environments, as detailed below.
+
+ For this discussion, we will break down the topologies into
+two categories. Depending upon the destination of most traffic, we
+categorize them into either "gatewayed" or "local" configurations.
+
+ In a gatewayed configuration, the "switch" is acting primarily
+as a router, and the majority of traffic passes through this router to
+other networks. An example would be the following:
+
+
+ +----------+ +----------+
+ | |eth0 port1| | to other networks
+ | Host A +---------------------+ router +------------------->
+ | +---------------------+ | Hosts B and C are out
+ | |eth1 port2| | here somewhere
+ +----------+ +----------+
+
+ The router may be a dedicated router device, or another host
+acting as a gateway. For our discussion, the important point is that
+the majority of traffic from Host A will pass through the router to
+some other network before reaching its final destination.
+
+ In a gatewayed network configuration, although Host A may
+communicate with many other systems, all of its traffic will be sent
+and received via one other peer on the local network, the router.
+
+ Note that the case of two systems connected directly via
+multiple physical links is, for purposes of configuring bonding, the
+same as a gatewayed configuration. In that case, it happens that all
+traffic is destined for the "gateway" itself, not some other network
+beyond the gateway.
+
+ In a local configuration, the "switch" is acting primarily as
+a switch, and the majority of traffic passes through this switch to
+reach other stations on the same network. An example would be the
+following:
+
+ +----------+ +----------+ +--------+
+ | |eth0 port1| +-------+ Host B |
+ | Host A +------------+ switch |port3 +--------+
+ | +------------+ | +--------+
+ | |eth1 port2| +------------------+ Host C |
+ +----------+ +----------+port4 +--------+
+
+
+ Again, the switch may be a dedicated switch device, or another
+host acting as a gateway. For our discussion, the important point is
+that the majority of traffic from Host A is destined for other hosts
+on the same local network (Hosts B and C in the above example).
+
+ In summary, in a gatewayed configuration, traffic to and from
+the bonded device will be to the same MAC level peer on the network
+(the gateway itself, i.e., the router), regardless of its final
+destination. In a local configuration, traffic flows directly to and
+from the final destinations, thus, each destination (Host B, Host C)
+will be addressed directly by their individual MAC addresses.
+
+ This distinction between a gatewayed and a local network
+configuration is important because many of the load balancing modes
+available use the MAC addresses of the local network source and
+destination to make load balancing decisions. The behavior of each
+mode is described below.
+
+
+12.1.1 MT Bonding Mode Selection for Single Switch Topology
+-----------------------------------------------------------
+
+ This configuration is the easiest to set up and to understand,
+although you will have to decide which bonding mode best suits your
+needs. The trade offs for each mode are detailed below:
+
+balance-rr: This mode is the only mode that will permit a single
+ TCP/IP connection to stripe traffic across multiple
+ interfaces. It is therefore the only mode that will allow a
+ single TCP/IP stream to utilize more than one interface's
+ worth of throughput. This comes at a cost, however: the
+ striping generally results in peer systems receiving packets out
+ of order, causing TCP/IP's congestion control system to kick
+ in, often by retransmitting segments.
+
+ It is possible to adjust TCP/IP's congestion limits by
+ altering the net.ipv4.tcp_reordering sysctl parameter. The
+ usual default value is 3, and the maximum useful value is 127.
+ For a four interface balance-rr bond, expect that a single
+ TCP/IP stream will utilize no more than approximately 2.3
+ interface's worth of throughput, even after adjusting
+ tcp_reordering.
+
+ Note that the fraction of packets that will be delivered out of
+ order is highly variable, and is unlikely to be zero. The level
+ of reordering depends upon a variety of factors, including the
+ networking interfaces, the switch, and the topology of the
+ configuration. Speaking in general terms, higher speed network
+ cards produce more reordering (due to factors such as packet
+ coalescing), and a "many to many" topology will reorder at a
+ higher rate than a "many slow to one fast" configuration.
+
+ Many switches do not support any modes that stripe traffic
+ (instead choosing a port based upon IP or MAC level addresses);
+ for those devices, traffic for a particular connection flowing
+ through the switch to a balance-rr bond will not utilize greater
+ than one interface's worth of bandwidth.
+
+ If you are utilizing protocols other than TCP/IP, UDP for
+ example, and your application can tolerate out of order
+ delivery, then this mode can allow for single stream datagram
+ performance that scales near linearly as interfaces are added
+ to the bond.
+
+ This mode requires the switch to have the appropriate ports
+ configured for "etherchannel" or "trunking."
+
+active-backup: There is not much advantage in this network topology to
+ the active-backup mode, as the inactive backup devices are all
+ connected to the same peer as the primary. In this case, a
+ load balancing mode (with link monitoring) will provide the
+ same level of network availability, but with increased
+ available bandwidth. On the plus side, active-backup mode
+ does not require any configuration of the switch, so it may
+ have value if the hardware available does not support any of
+ the load balance modes.
+
+balance-xor: This mode will limit traffic such that packets destined
+ for specific peers will always be sent over the same
+ interface. Since the destination is determined by the MAC
+ addresses involved, this mode works best in a "local" network
+ configuration (as described above), with destinations all on
+ the same local network. This mode is likely to be suboptimal
+ if all your traffic is passed through a single router (i.e., a
+ "gatewayed" network configuration, as described above).
+
+ As with balance-rr, the switch ports need to be configured for
+ "etherchannel" or "trunking."
+
+broadcast: Like active-backup, there is not much advantage to this
+ mode in this type of network topology.
+
+802.3ad: This mode can be a good choice for this type of network
+ topology. The 802.3ad mode is an IEEE standard, so all peers
+ that implement 802.3ad should interoperate well. The 802.3ad
+ protocol includes automatic configuration of the aggregates,
+ so minimal manual configuration of the switch is needed
+ (typically only to designate that some set of devices is
+ available for 802.3ad). The 802.3ad standard also mandates
+ that frames be delivered in order (within certain limits), so
+ in general single connections will not see misordering of
+ packets. The 802.3ad mode does have some drawbacks: the
+ standard mandates that all devices in the aggregate operate at
+ the same speed and duplex. Also, as with all bonding load
+ balance modes other than balance-rr, no single connection will
+ be able to utilize more than a single interface's worth of
+ bandwidth.
+
+ Additionally, the linux bonding 802.3ad implementation
+ distributes traffic by peer (using an XOR of MAC addresses),
+ so in a "gatewayed" configuration, all outgoing traffic will
+ generally use the same device. Incoming traffic may also end
+ up on a single device, but that is dependent upon the
+ balancing policy of the peer's 8023.ad implementation. In a
+ "local" configuration, traffic will be distributed across the
+ devices in the bond.
+
+ Finally, the 802.3ad mode mandates the use of the MII monitor,
+ therefore, the ARP monitor is not available in this mode.
+
+balance-tlb: The balance-tlb mode balances outgoing traffic by peer.
+ Since the balancing is done according to MAC address, in a
+ "gatewayed" configuration (as described above), this mode will
+ send all traffic across a single device. However, in a
+ "local" network configuration, this mode balances multiple
+ local network peers across devices in a vaguely intelligent
+ manner (not a simple XOR as in balance-xor or 802.3ad mode),
+ so that mathematically unlucky MAC addresses (i.e., ones that
+ XOR to the same value) will not all "bunch up" on a single
+ interface.
+
+ Unlike 802.3ad, interfaces may be of differing speeds, and no
+ special switch configuration is required. On the down side,
+ in this mode all incoming traffic arrives over a single
+ interface, this mode requires certain ethtool support in the
+ network device driver of the slave interfaces, and the ARP
+ monitor is not available.
+
+balance-alb: This mode is everything that balance-tlb is, and more.
+ It has all of the features (and restrictions) of balance-tlb,
+ and will also balance incoming traffic from local network
+ peers (as described in the Bonding Module Options section,
+ above).
+
+ The only additional down side to this mode is that the network
+ device driver must support changing the hardware address while
+ the device is open.
+
+12.1.2 MT Link Monitoring for Single Switch Topology
+----------------------------------------------------
+
+ The choice of link monitoring may largely depend upon which
+mode you choose to use. The more advanced load balancing modes do not
+support the use of the ARP monitor, and are thus restricted to using
+the MII monitor (which does not provide as high a level of end to end
+assurance as the ARP monitor).
+
+12.2 Maximum Throughput in a Multiple Switch Topology
+-----------------------------------------------------
+
+ Multiple switches may be utilized to optimize for throughput
+when they are configured in parallel as part of an isolated network
+between two or more systems, for example:
+
+ +-----------+
+ | Host A |
+ +-+---+---+-+
+ | | |
+ +--------+ | +---------+
+ | | |
+ +------+---+ +-----+----+ +-----+----+
+ | Switch A | | Switch B | | Switch C |
+ +------+---+ +-----+----+ +-----+----+
+ | | |
+ +--------+ | +---------+
+ | | |
+ +-+---+---+-+
+ | Host B |
+ +-----------+
+
+ In this configuration, the switches are isolated from one
+another. One reason to employ a topology such as this is for an
+isolated network with many hosts (a cluster configured for high
+performance, for example), using multiple smaller switches can be more
+cost effective than a single larger switch, e.g., on a network with 24
+hosts, three 24 port switches can be significantly less expensive than
+a single 72 port switch.
+
+ If access beyond the network is required, an individual host
+can be equipped with an additional network device connected to an
+external network; this host then additionally acts as a gateway.
+
+12.2.1 MT Bonding Mode Selection for Multiple Switch Topology
+-------------------------------------------------------------
+
+ In actual practice, the bonding mode typically employed in
+configurations of this type is balance-rr. Historically, in this
+network configuration, the usual caveats about out of order packet
+delivery are mitigated by the use of network adapters that do not do
+any kind of packet coalescing (via the use of NAPI, or because the
+device itself does not generate interrupts until some number of
+packets has arrived). When employed in this fashion, the balance-rr
+mode allows individual connections between two hosts to effectively
+utilize greater than one interface's bandwidth.
+
+12.2.2 MT Link Monitoring for Multiple Switch Topology
+------------------------------------------------------
+
+ Again, in actual practice, the MII monitor is most often used
+in this configuration, as performance is given preference over
+availability. The ARP monitor will function in this topology, but its
+advantages over the MII monitor are mitigated by the volume of probes
+needed as the number of systems involved grows (remember that each
+host in the network is configured with bonding).
+
+13. Switch Behavior Issues
+==========================
+
+13.1 Link Establishment and Failover Delays
+-------------------------------------------
+
+ Some switches exhibit undesirable behavior with regard to the
+timing of link up and down reporting by the switch.
+
+ First, when a link comes up, some switches may indicate that
+the link is up (carrier available), but not pass traffic over the
+interface for some period of time. This delay is typically due to
+some type of autonegotiation or routing protocol, but may also occur
+during switch initialization (e.g., during recovery after a switch
+failure). If you find this to be a problem, specify an appropriate
+value to the updelay bonding module option to delay the use of the
+relevant interface(s).
+
+ Second, some switches may "bounce" the link state one or more
+times while a link is changing state. This occurs most commonly while
+the switch is initializing. Again, an appropriate updelay value may
+help.
+
+ Note that when a bonding interface has no active links, the
+driver will immediately reuse the first link that goes up, even if the
+updelay parameter has been specified (the updelay is ignored in this
+case). If there are slave interfaces waiting for the updelay timeout
+to expire, the interface that first went into that state will be
+immediately reused. This reduces down time of the network if the
+value of updelay has been overestimated, and since this occurs only in
+cases with no connectivity, there is no additional penalty for
+ignoring the updelay.
+
+ In addition to the concerns about switch timings, if your
+switches take a long time to go into backup mode, it may be desirable
+to not activate a backup interface immediately after a link goes down.
+Failover may be delayed via the downdelay bonding module option.
+
+13.2 Duplicated Incoming Packets
+--------------------------------
+
+ NOTE: Starting with version 3.0.2, the bonding driver has logic to
+suppress duplicate packets, which should largely eliminate this problem.
+The following description is kept for reference.
+
+ It is not uncommon to observe a short burst of duplicated
+traffic when the bonding device is first used, or after it has been
+idle for some period of time. This is most easily observed by issuing
+a "ping" to some other host on the network, and noticing that the
+output from ping flags duplicates (typically one per slave).
+
+ For example, on a bond in active-backup mode with five slaves
+all connected to one switch, the output may appear as follows:
+
+# ping -n 10.0.4.2
+PING 10.0.4.2 (10.0.4.2) from 10.0.3.10 : 56(84) bytes of data.
+64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.7 ms
+64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!)
+64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!)
+64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!)
+64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!)
+64 bytes from 10.0.4.2: icmp_seq=2 ttl=64 time=0.216 ms
+64 bytes from 10.0.4.2: icmp_seq=3 ttl=64 time=0.267 ms
+64 bytes from 10.0.4.2: icmp_seq=4 ttl=64 time=0.222 ms
+
+ This is not due to an error in the bonding driver, rather, it
+is a side effect of how many switches update their MAC forwarding
+tables. Initially, the switch does not associate the MAC address in
+the packet with a particular switch port, and so it may send the
+traffic to all ports until its MAC forwarding table is updated. Since
+the interfaces attached to the bond may occupy multiple ports on a
+single switch, when the switch (temporarily) floods the traffic to all
+ports, the bond device receives multiple copies of the same packet
+(one per slave device).
+
+ The duplicated packet behavior is switch dependent, some
+switches exhibit this, and some do not. On switches that display this
+behavior, it can be induced by clearing the MAC forwarding table (on
+most Cisco switches, the privileged command "clear mac address-table
+dynamic" will accomplish this).
+
+14. Hardware Specific Considerations
+====================================
+
+ This section contains additional information for configuring
+bonding on specific hardware platforms, or for interfacing bonding
+with particular switches or other devices.
+
+14.1 IBM BladeCenter
+--------------------
+
+ This applies to the JS20 and similar systems.
+
+ On the JS20 blades, the bonding driver supports only
+balance-rr, active-backup, balance-tlb and balance-alb modes. This is
+largely due to the network topology inside the BladeCenter, detailed
+below.
+
+JS20 network adapter information
+--------------------------------
+
+ All JS20s come with two Broadcom Gigabit Ethernet ports
+integrated on the planar (that's "motherboard" in IBM-speak). In the
+BladeCenter chassis, the eth0 port of all JS20 blades is hard wired to
+I/O Module #1; similarly, all eth1 ports are wired to I/O Module #2.
+An add-on Broadcom daughter card can be installed on a JS20 to provide
+two more Gigabit Ethernet ports. These ports, eth2 and eth3, are
+wired to I/O Modules 3 and 4, respectively.
+
+ Each I/O Module may contain either a switch or a passthrough
+module (which allows ports to be directly connected to an external
+switch). Some bonding modes require a specific BladeCenter internal
+network topology in order to function; these are detailed below.
+
+ Additional BladeCenter-specific networking information can be
+found in two IBM Redbooks (www.ibm.com/redbooks):
+
+"IBM eServer BladeCenter Networking Options"
+"IBM eServer BladeCenter Layer 2-7 Network Switching"
+
+BladeCenter networking configuration
+------------------------------------
+
+ Because a BladeCenter can be configured in a very large number
+of ways, this discussion will be confined to describing basic
+configurations.
+
+ Normally, Ethernet Switch Modules (ESMs) are used in I/O
+modules 1 and 2. In this configuration, the eth0 and eth1 ports of a
+JS20 will be connected to different internal switches (in the
+respective I/O modules).
+
+ A passthrough module (OPM or CPM, optical or copper,
+passthrough module) connects the I/O module directly to an external
+switch. By using PMs in I/O module #1 and #2, the eth0 and eth1
+interfaces of a JS20 can be redirected to the outside world and
+connected to a common external switch.
+
+ Depending upon the mix of ESMs and PMs, the network will
+appear to bonding as either a single switch topology (all PMs) or as a
+multiple switch topology (one or more ESMs, zero or more PMs). It is
+also possible to connect ESMs together, resulting in a configuration
+much like the example in "High Availability in a Multiple Switch
+Topology," above.
+
+Requirements for specific modes
+-------------------------------
+
+ The balance-rr mode requires the use of passthrough modules
+for devices in the bond, all connected to an common external switch.
+That switch must be configured for "etherchannel" or "trunking" on the
+appropriate ports, as is usual for balance-rr.
+
+ The balance-alb and balance-tlb modes will function with
+either switch modules or passthrough modules (or a mix). The only
+specific requirement for these modes is that all network interfaces
+must be able to reach all destinations for traffic sent over the
+bonding device (i.e., the network must converge at some point outside
+the BladeCenter).
+
+ The active-backup mode has no additional requirements.
+
+Link monitoring issues
+----------------------
+
+ When an Ethernet Switch Module is in place, only the ARP
+monitor will reliably detect link loss to an external switch. This is
+nothing unusual, but examination of the BladeCenter cabinet would
+suggest that the "external" network ports are the ethernet ports for
+the system, when it fact there is a switch between these "external"
+ports and the devices on the JS20 system itself. The MII monitor is
+only able to detect link failures between the ESM and the JS20 system.
+
+ When a passthrough module is in place, the MII monitor does
+detect failures to the "external" port, which is then directly
+connected to the JS20 system.
+
+Other concerns
+--------------
+
+ The Serial Over LAN (SoL) link is established over the primary
+ethernet (eth0) only, therefore, any loss of link to eth0 will result
+in losing your SoL connection. It will not fail over with other
+network traffic, as the SoL system is beyond the control of the
+bonding driver.
+
+ It may be desirable to disable spanning tree on the switch
+(either the internal Ethernet Switch Module, or an external switch) to
+avoid fail-over delay issues when using bonding.
+
+
+15. Frequently Asked Questions
+==============================
+
+1. Is it SMP safe?
+
+ Yes. The old 2.0.xx channel bonding patch was not SMP safe.
+The new driver was designed to be SMP safe from the start.
+
+2. What type of cards will work with it?
+
+ Any Ethernet type cards (you can even mix cards - a Intel
+EtherExpress PRO/100 and a 3com 3c905b, for example). For most modes,
+devices need not be of the same speed.
+
+ Starting with version 3.2.1, bonding also supports Infiniband
+slaves in active-backup mode.
+
+3. How many bonding devices can I have?
+
+ There is no limit.
+
+4. How many slaves can a bonding device have?
+
+ This is limited only by the number of network interfaces Linux
+supports and/or the number of network cards you can place in your
+system.
+
+5. What happens when a slave link dies?
+
+ If link monitoring is enabled, then the failing device will be
+disabled. The active-backup mode will fail over to a backup link, and
+other modes will ignore the failed link. The link will continue to be
+monitored, and should it recover, it will rejoin the bond (in whatever
+manner is appropriate for the mode). See the sections on High
+Availability and the documentation for each mode for additional
+information.
+
+ Link monitoring can be enabled via either the miimon or
+arp_interval parameters (described in the module parameters section,
+above). In general, miimon monitors the carrier state as sensed by
+the underlying network device, and the arp monitor (arp_interval)
+monitors connectivity to another host on the local network.
+
+ If no link monitoring is configured, the bonding driver will
+be unable to detect link failures, and will assume that all links are
+always available. This will likely result in lost packets, and a
+resulting degradation of performance. The precise performance loss
+depends upon the bonding mode and network configuration.
+
+6. Can bonding be used for High Availability?
+
+ Yes. See the section on High Availability for details.
+
+7. Which switches/systems does it work with?
+
+ The full answer to this depends upon the desired mode.
+
+ In the basic balance modes (balance-rr and balance-xor), it
+works with any system that supports etherchannel (also called
+trunking). Most managed switches currently available have such
+support, and many unmanaged switches as well.
+
+ The advanced balance modes (balance-tlb and balance-alb) do
+not have special switch requirements, but do need device drivers that
+support specific features (described in the appropriate section under
+module parameters, above).
+
+ In 802.3ad mode, it works with systems that support IEEE
+802.3ad Dynamic Link Aggregation. Most managed and many unmanaged
+switches currently available support 802.3ad.
+
+ The active-backup mode should work with any Layer-II switch.
+
+8. Where does a bonding device get its MAC address from?
+
+ When using slave devices that have fixed MAC addresses, or when
+the fail_over_mac option is enabled, the bonding device's MAC address is
+the MAC address of the active slave.
+
+ For other configurations, if not explicitly configured (with
+ifconfig or ip link), the MAC address of the bonding device is taken from
+its first slave device. This MAC address is then passed to all following
+slaves and remains persistent (even if the first slave is removed) until
+the bonding device is brought down or reconfigured.
+
+ If you wish to change the MAC address, you can set it with
+ifconfig or ip link:
+
+# ifconfig bond0 hw ether 00:11:22:33:44:55
+
+# ip link set bond0 address 66:77:88:99:aa:bb
+
+ The MAC address can be also changed by bringing down/up the
+device and then changing its slaves (or their order):
+
+# ifconfig bond0 down ; modprobe -r bonding
+# ifconfig bond0 .... up
+# ifenslave bond0 eth...
+
+ This method will automatically take the address from the next
+slave that is added.
+
+ To restore your slaves' MAC addresses, you need to detach them
+from the bond (`ifenslave -d bond0 eth0'). The bonding driver will
+then restore the MAC addresses that the slaves had before they were
+enslaved.
+
+16. Resources and Links
+=======================
+
+The latest version of the bonding driver can be found in the latest
+version of the linux kernel, found on http://kernel.org
+
+The latest version of this document can be found in either the latest
+kernel source (named Documentation/networking/bonding.txt), or on the
+bonding sourceforge site:
+
+http://www.sourceforge.net/projects/bonding
+
+Discussions regarding the bonding driver take place primarily on the
+bonding-devel mailing list, hosted at sourceforge.net. If you have
+questions or problems, post them to the list. The list address is:
+
+bonding-devel@lists.sourceforge.net
+
+ The administrative interface (to subscribe or unsubscribe) can
+be found at:
+
+https://lists.sourceforge.net/lists/listinfo/bonding-devel
+
+Donald Becker's Ethernet Drivers and diag programs may be found at :
+ - http://www.scyld.com/network/
+
+You will also find a lot of information regarding Ethernet, NWay, MII,
+etc. at www.scyld.com.
+
+-- END --
diff --git a/Documentation/networking/bridge.txt b/Documentation/networking/bridge.txt
new file mode 100644
index 0000000..bec69a8
--- /dev/null
+++ b/Documentation/networking/bridge.txt
@@ -0,0 +1,8 @@
+In order to use the Ethernet bridging functionality, you'll need the
+userspace tools. These programs and documentation are available
+at http://www.linux-foundation.org/en/Net:Bridge. The download page is
+http://prdownloads.sourceforge.net/bridge.
+
+If you still have questions, don't hesitate to post to the mailing list
+(more info http://lists.osdl.org/mailman/listinfo/bridge).
+
diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt
new file mode 100644
index 0000000..2035bc4
--- /dev/null
+++ b/Documentation/networking/can.txt
@@ -0,0 +1,665 @@
+============================================================================
+
+can.txt
+
+Readme file for the Controller Area Network Protocol Family (aka Socket CAN)
+
+This file contains
+
+ 1 Overview / What is Socket CAN
+
+ 2 Motivation / Why using the socket API
+
+ 3 Socket CAN concept
+ 3.1 receive lists
+ 3.2 local loopback of sent frames
+ 3.3 network security issues (capabilities)
+ 3.4 network problem notifications
+
+ 4 How to use Socket CAN
+ 4.1 RAW protocol sockets with can_filters (SOCK_RAW)
+ 4.1.1 RAW socket option CAN_RAW_FILTER
+ 4.1.2 RAW socket option CAN_RAW_ERR_FILTER
+ 4.1.3 RAW socket option CAN_RAW_LOOPBACK
+ 4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS
+ 4.2 Broadcast Manager protocol sockets (SOCK_DGRAM)
+ 4.3 connected transport protocols (SOCK_SEQPACKET)
+ 4.4 unconnected transport protocols (SOCK_DGRAM)
+
+ 5 Socket CAN core module
+ 5.1 can.ko module params
+ 5.2 procfs content
+ 5.3 writing own CAN protocol modules
+
+ 6 CAN network drivers
+ 6.1 general settings
+ 6.2 local loopback of sent frames
+ 6.3 CAN controller hardware filters
+ 6.4 The virtual CAN driver (vcan)
+ 6.5 currently supported CAN hardware
+ 6.6 todo
+
+ 7 Credits
+
+============================================================================
+
+1. Overview / What is Socket CAN
+--------------------------------
+
+The socketcan package is an implementation of CAN protocols
+(Controller Area Network) for Linux. CAN is a networking technology
+which has widespread use in automation, embedded devices, and
+automotive fields. While there have been other CAN implementations
+for Linux based on character devices, Socket CAN uses the Berkeley
+socket API, the Linux network stack and implements the CAN device
+drivers as network interfaces. The CAN socket API has been designed
+as similar as possible to the TCP/IP protocols to allow programmers,
+familiar with network programming, to easily learn how to use CAN
+sockets.
+
+2. Motivation / Why using the socket API
+----------------------------------------
+
+There have been CAN implementations for Linux before Socket CAN so the
+question arises, why we have started another project. Most existing
+implementations come as a device driver for some CAN hardware, they
+are based on character devices and provide comparatively little
+functionality. Usually, there is only a hardware-specific device
+driver which provides a character device interface to send and
+receive raw CAN frames, directly to/from the controller hardware.
+Queueing of frames and higher-level transport protocols like ISO-TP
+have to be implemented in user space applications. Also, most
+character-device implementations support only one single process to
+open the device at a time, similar to a serial interface. Exchanging
+the CAN controller requires employment of another device driver and
+often the need for adaption of large parts of the application to the
+new driver's API.
+
+Socket CAN was designed to overcome all of these limitations. A new
+protocol family has been implemented which provides a socket interface
+to user space applications and which builds upon the Linux network
+layer, so to use all of the provided queueing functionality. A device
+driver for CAN controller hardware registers itself with the Linux
+network layer as a network device, so that CAN frames from the
+controller can be passed up to the network layer and on to the CAN
+protocol family module and also vice-versa. Also, the protocol family
+module provides an API for transport protocol modules to register, so
+that any number of transport protocols can be loaded or unloaded
+dynamically. In fact, the can core module alone does not provide any
+protocol and cannot be used without loading at least one additional
+protocol module. Multiple sockets can be opened at the same time,
+on different or the same protocol module and they can listen/send
+frames on different or the same CAN IDs. Several sockets listening on
+the same interface for frames with the same CAN ID are all passed the
+same received matching CAN frames. An application wishing to
+communicate using a specific transport protocol, e.g. ISO-TP, just
+selects that protocol when opening the socket, and then can read and
+write application data byte streams, without having to deal with
+CAN-IDs, frames, etc.
+
+Similar functionality visible from user-space could be provided by a
+character device, too, but this would lead to a technically inelegant
+solution for a couple of reasons:
+
+* Intricate usage. Instead of passing a protocol argument to
+ socket(2) and using bind(2) to select a CAN interface and CAN ID, an
+ application would have to do all these operations using ioctl(2)s.
+
+* Code duplication. A character device cannot make use of the Linux
+ network queueing code, so all that code would have to be duplicated
+ for CAN networking.
+
+* Abstraction. In most existing character-device implementations, the
+ hardware-specific device driver for a CAN controller directly
+ provides the character device for the application to work with.
+ This is at least very unusual in Unix systems for both, char and
+ block devices. For example you don't have a character device for a
+ certain UART of a serial interface, a certain sound chip in your
+ computer, a SCSI or IDE controller providing access to your hard
+ disk or tape streamer device. Instead, you have abstraction layers
+ which provide a unified character or block device interface to the
+ application on the one hand, and a interface for hardware-specific
+ device drivers on the other hand. These abstractions are provided
+ by subsystems like the tty layer, the audio subsystem or the SCSI
+ and IDE subsystems for the devices mentioned above.
+
+ The easiest way to implement a CAN device driver is as a character
+ device without such a (complete) abstraction layer, as is done by most
+ existing drivers. The right way, however, would be to add such a
+ layer with all the functionality like registering for certain CAN
+ IDs, supporting several open file descriptors and (de)multiplexing
+ CAN frames between them, (sophisticated) queueing of CAN frames, and
+ providing an API for device drivers to register with. However, then
+ it would be no more difficult, or may be even easier, to use the
+ networking framework provided by the Linux kernel, and this is what
+ Socket CAN does.
+
+ The use of the networking framework of the Linux kernel is just the
+ natural and most appropriate way to implement CAN for Linux.
+
+3. Socket CAN concept
+---------------------
+
+ As described in chapter 2 it is the main goal of Socket CAN to
+ provide a socket interface to user space applications which builds
+ upon the Linux network layer. In contrast to the commonly known
+ TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!)
+ medium that has no MAC-layer addressing like ethernet. The CAN-identifier
+ (can_id) is used for arbitration on the CAN-bus. Therefore the CAN-IDs
+ have to be chosen uniquely on the bus. When designing a CAN-ECU
+ network the CAN-IDs are mapped to be sent by a specific ECU.
+ For this reason a CAN-ID can be treated best as a kind of source address.
+
+ 3.1 receive lists
+
+ The network transparent access of multiple applications leads to the
+ problem that different applications may be interested in the same
+ CAN-IDs from the same CAN network interface. The Socket CAN core
+ module - which implements the protocol family CAN - provides several
+ high efficient receive lists for this reason. If e.g. a user space
+ application opens a CAN RAW socket, the raw protocol module itself
+ requests the (range of) CAN-IDs from the Socket CAN core that are
+ requested by the user. The subscription and unsubscription of
+ CAN-IDs can be done for specific CAN interfaces or for all(!) known
+ CAN interfaces with the can_rx_(un)register() functions provided to
+ CAN protocol modules by the SocketCAN core (see chapter 5).
+ To optimize the CPU usage at runtime the receive lists are split up
+ into several specific lists per device that match the requested
+ filter complexity for a given use-case.
+
+ 3.2 local loopback of sent frames
+
+ As known from other networking concepts the data exchanging
+ applications may run on the same or different nodes without any
+ change (except for the according addressing information):
+
+ ___ ___ ___ _______ ___
+ | _ | | _ | | _ | | _ _ | | _ |
+ ||A|| ||B|| ||C|| ||A| |B|| ||C||
+ |___| |___| |___| |_______| |___|
+ | | | | |
+ -----------------(1)- CAN bus -(2)---------------
+
+ To ensure that application A receives the same information in the
+ example (2) as it would receive in example (1) there is need for
+ some kind of local loopback of the sent CAN frames on the appropriate
+ node.
+
+ The Linux network devices (by default) just can handle the
+ transmission and reception of media dependent frames. Due to the
+ arbitration on the CAN bus the transmission of a low prio CAN-ID
+ may be delayed by the reception of a high prio CAN frame. To
+ reflect the correct* traffic on the node the loopback of the sent
+ data has to be performed right after a successful transmission. If
+ the CAN network interface is not capable of performing the loopback for
+ some reason the SocketCAN core can do this task as a fallback solution.
+ See chapter 6.2 for details (recommended).
+
+ The loopback functionality is enabled by default to reflect standard
+ networking behaviour for CAN applications. Due to some requests from
+ the RT-SocketCAN group the loopback optionally may be disabled for each
+ separate socket. See sockopts from the CAN RAW sockets in chapter 4.1.
+
+ * = you really like to have this when you're running analyser tools
+ like 'candump' or 'cansniffer' on the (same) node.
+
+ 3.3 network security issues (capabilities)
+
+ The Controller Area Network is a local field bus transmitting only
+ broadcast messages without any routing and security concepts.
+ In the majority of cases the user application has to deal with
+ raw CAN frames. Therefore it might be reasonable NOT to restrict
+ the CAN access only to the user root, as known from other networks.
+ Since the currently implemented CAN_RAW and CAN_BCM sockets can only
+ send and receive frames to/from CAN interfaces it does not affect
+ security of others networks to allow all users to access the CAN.
+ To enable non-root users to access CAN_RAW and CAN_BCM protocol
+ sockets the Kconfig options CAN_RAW_USER and/or CAN_BCM_USER may be
+ selected at kernel compile time.
+
+ 3.4 network problem notifications
+
+ The use of the CAN bus may lead to several problems on the physical
+ and media access control layer. Detecting and logging of these lower
+ layer problems is a vital requirement for CAN users to identify
+ hardware issues on the physical transceiver layer as well as
+ arbitration problems and error frames caused by the different
+ ECUs. The occurrence of detected errors are important for diagnosis
+ and have to be logged together with the exact timestamp. For this
+ reason the CAN interface driver can generate so called Error Frames
+ that can optionally be passed to the user application in the same
+ way as other CAN frames. Whenever an error on the physical layer
+ or the MAC layer is detected (e.g. by the CAN controller) the driver
+ creates an appropriate error frame. Error frames can be requested by
+ the user application using the common CAN filter mechanisms. Inside
+ this filter definition the (interested) type of errors may be
+ selected. The reception of error frames is disabled by default.
+
+4. How to use Socket CAN
+------------------------
+
+ Like TCP/IP, you first need to open a socket for communicating over a
+ CAN network. Since Socket CAN implements a new protocol family, you
+ need to pass PF_CAN as the first argument to the socket(2) system
+ call. Currently, there are two CAN protocols to choose from, the raw
+ socket protocol and the broadcast manager (BCM). So to open a socket,
+ you would write
+
+ s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
+
+ and
+
+ s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
+
+ respectively. After the successful creation of the socket, you would
+ normally use the bind(2) system call to bind the socket to a CAN
+ interface (which is different from TCP/IP due to different addressing
+ - see chapter 3). After binding (CAN_RAW) or connecting (CAN_BCM)
+ the socket, you can read(2) and write(2) from/to the socket or use
+ send(2), sendto(2), sendmsg(2) and the recv* counterpart operations
+ on the socket as usual. There are also CAN specific socket options
+ described below.
+
+ The basic CAN frame structure and the sockaddr structure are defined
+ in include/linux/can.h:
+
+ struct can_frame {
+ canid_t can_id; /* 32 bit CAN_ID + EFF/RTR/ERR flags */
+ __u8 can_dlc; /* data length code: 0 .. 8 */
+ __u8 data[8] __attribute__((aligned(8)));
+ };
+
+ The alignment of the (linear) payload data[] to a 64bit boundary
+ allows the user to define own structs and unions to easily access the
+ CAN payload. There is no given byteorder on the CAN bus by
+ default. A read(2) system call on a CAN_RAW socket transfers a
+ struct can_frame to the user space.
+
+ The sockaddr_can structure has an interface index like the
+ PF_PACKET socket, that also binds to a specific interface:
+
+ struct sockaddr_can {
+ sa_family_t can_family;
+ int can_ifindex;
+ union {
+ /* transport protocol class address info (e.g. ISOTP) */
+ struct { canid_t rx_id, tx_id; } tp;
+
+ /* reserved for future CAN protocols address information */
+ } can_addr;
+ };
+
+ To determine the interface index an appropriate ioctl() has to
+ be used (example for CAN_RAW sockets without error checking):
+
+ int s;
+ struct sockaddr_can addr;
+ struct ifreq ifr;
+
+ s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
+
+ strcpy(ifr.ifr_name, "can0" );
+ ioctl(s, SIOCGIFINDEX, &ifr);
+
+ addr.can_family = AF_CAN;
+ addr.can_ifindex = ifr.ifr_ifindex;
+
+ bind(s, (struct sockaddr *)&addr, sizeof(addr));
+
+ (..)
+
+ To bind a socket to all(!) CAN interfaces the interface index must
+ be 0 (zero). In this case the socket receives CAN frames from every
+ enabled CAN interface. To determine the originating CAN interface
+ the system call recvfrom(2) may be used instead of read(2). To send
+ on a socket that is bound to 'any' interface sendto(2) is needed to
+ specify the outgoing interface.
+
+ Reading CAN frames from a bound CAN_RAW socket (see above) consists
+ of reading a struct can_frame:
+
+ struct can_frame frame;
+
+ nbytes = read(s, &frame, sizeof(struct can_frame));
+
+ if (nbytes < 0) {
+ perror("can raw socket read");
+ return 1;
+ }
+
+ /* paraniod check ... */
+ if (nbytes < sizeof(struct can_frame)) {
+ fprintf(stderr, "read: incomplete CAN frame\n");
+ return 1;
+ }
+
+ /* do something with the received CAN frame */
+
+ Writing CAN frames can be done similarly, with the write(2) system call:
+
+ nbytes = write(s, &frame, sizeof(struct can_frame));
+
+ When the CAN interface is bound to 'any' existing CAN interface
+ (addr.can_ifindex = 0) it is recommended to use recvfrom(2) if the
+ information about the originating CAN interface is needed:
+
+ struct sockaddr_can addr;
+ struct ifreq ifr;
+ socklen_t len = sizeof(addr);
+ struct can_frame frame;
+
+ nbytes = recvfrom(s, &frame, sizeof(struct can_frame),
+ 0, (struct sockaddr*)&addr, &len);
+
+ /* get interface name of the received CAN frame */
+ ifr.ifr_ifindex = addr.can_ifindex;
+ ioctl(s, SIOCGIFNAME, &ifr);
+ printf("Received a CAN frame from interface %s", ifr.ifr_name);
+
+ To write CAN frames on sockets bound to 'any' CAN interface the
+ outgoing interface has to be defined certainly.
+
+ strcpy(ifr.ifr_name, "can0");
+ ioctl(s, SIOCGIFINDEX, &ifr);
+ addr.can_ifindex = ifr.ifr_ifindex;
+ addr.can_family = AF_CAN;
+
+ nbytes = sendto(s, &frame, sizeof(struct can_frame),
+ 0, (struct sockaddr*)&addr, sizeof(addr));
+
+ 4.1 RAW protocol sockets with can_filters (SOCK_RAW)
+
+ Using CAN_RAW sockets is extensively comparable to the commonly
+ known access to CAN character devices. To meet the new possibilities
+ provided by the multi user SocketCAN approach, some reasonable
+ defaults are set at RAW socket binding time:
+
+ - The filters are set to exactly one filter receiving everything
+ - The socket only receives valid data frames (=> no error frames)
+ - The loopback of sent CAN frames is enabled (see chapter 3.2)
+ - The socket does not receive its own sent frames (in loopback mode)
+
+ These default settings may be changed before or after binding the socket.
+ To use the referenced definitions of the socket options for CAN_RAW
+ sockets, include <linux/can/raw.h>.
+
+ 4.1.1 RAW socket option CAN_RAW_FILTER
+
+ The reception of CAN frames using CAN_RAW sockets can be controlled
+ by defining 0 .. n filters with the CAN_RAW_FILTER socket option.
+
+ The CAN filter structure is defined in include/linux/can.h:
+
+ struct can_filter {
+ canid_t can_id;
+ canid_t can_mask;
+ };
+
+ A filter matches, when
+
+ <received_can_id> & mask == can_id & mask
+
+ which is analogous to known CAN controllers hardware filter semantics.
+ The filter can be inverted in this semantic, when the CAN_INV_FILTER
+ bit is set in can_id element of the can_filter structure. In
+ contrast to CAN controller hardware filters the user may set 0 .. n
+ receive filters for each open socket separately:
+
+ struct can_filter rfilter[2];
+
+ rfilter[0].can_id = 0x123;
+ rfilter[0].can_mask = CAN_SFF_MASK;
+ rfilter[1].can_id = 0x200;
+ rfilter[1].can_mask = 0x700;
+
+ setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
+
+ To disable the reception of CAN frames on the selected CAN_RAW socket:
+
+ setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0);
+
+ To set the filters to zero filters is quite obsolete as not read
+ data causes the raw socket to discard the received CAN frames. But
+ having this 'send only' use-case we may remove the receive list in the
+ Kernel to save a little (really a very little!) CPU usage.
+
+ 4.1.2 RAW socket option CAN_RAW_ERR_FILTER
+
+ As described in chapter 3.4 the CAN interface driver can generate so
+ called Error Frames that can optionally be passed to the user
+ application in the same way as other CAN frames. The possible
+ errors are divided into different error classes that may be filtered
+ using the appropriate error mask. To register for every possible
+ error condition CAN_ERR_MASK can be used as value for the error mask.
+ The values for the error mask are defined in linux/can/error.h .
+
+ can_err_mask_t err_mask = ( CAN_ERR_TX_TIMEOUT | CAN_ERR_BUSOFF );
+
+ setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER,
+ &err_mask, sizeof(err_mask));
+
+ 4.1.3 RAW socket option CAN_RAW_LOOPBACK
+
+ To meet multi user needs the local loopback is enabled by default
+ (see chapter 3.2 for details). But in some embedded use-cases
+ (e.g. when only one application uses the CAN bus) this loopback
+ functionality can be disabled (separately for each socket):
+
+ int loopback = 0; /* 0 = disabled, 1 = enabled (default) */
+
+ setsockopt(s, SOL_CAN_RAW, CAN_RAW_LOOPBACK, &loopback, sizeof(loopback));
+
+ 4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS
+
+ When the local loopback is enabled, all the sent CAN frames are
+ looped back to the open CAN sockets that registered for the CAN
+ frames' CAN-ID on this given interface to meet the multi user
+ needs. The reception of the CAN frames on the same socket that was
+ sending the CAN frame is assumed to be unwanted and therefore
+ disabled by default. This default behaviour may be changed on
+ demand:
+
+ int recv_own_msgs = 1; /* 0 = disabled (default), 1 = enabled */
+
+ setsockopt(s, SOL_CAN_RAW, CAN_RAW_RECV_OWN_MSGS,
+ &recv_own_msgs, sizeof(recv_own_msgs));
+
+ 4.2 Broadcast Manager protocol sockets (SOCK_DGRAM)
+ 4.3 connected transport protocols (SOCK_SEQPACKET)
+ 4.4 unconnected transport protocols (SOCK_DGRAM)
+
+
+5. Socket CAN core module
+-------------------------
+
+ The Socket CAN core module implements the protocol family
+ PF_CAN. CAN protocol modules are loaded by the core module at
+ runtime. The core module provides an interface for CAN protocol
+ modules to subscribe needed CAN IDs (see chapter 3.1).
+
+ 5.1 can.ko module params
+
+ - stats_timer: To calculate the Socket CAN core statistics
+ (e.g. current/maximum frames per second) this 1 second timer is
+ invoked at can.ko module start time by default. This timer can be
+ disabled by using stattimer=0 on the module commandline.
+
+ - debug: (removed since SocketCAN SVN r546)
+
+ 5.2 procfs content
+
+ As described in chapter 3.1 the Socket CAN core uses several filter
+ lists to deliver received CAN frames to CAN protocol modules. These
+ receive lists, their filters and the count of filter matches can be
+ checked in the appropriate receive list. All entries contain the
+ device and a protocol module identifier:
+
+ foo@bar:~$ cat /proc/net/can/rcvlist_all
+
+ receive list 'rx_all':
+ (vcan3: no entry)
+ (vcan2: no entry)
+ (vcan1: no entry)
+ device can_id can_mask function userdata matches ident
+ vcan0 000 00000000 f88e6370 f6c6f400 0 raw
+ (any: no entry)
+
+ In this example an application requests any CAN traffic from vcan0.
+
+ rcvlist_all - list for unfiltered entries (no filter operations)
+ rcvlist_eff - list for single extended frame (EFF) entries
+ rcvlist_err - list for error frames masks
+ rcvlist_fil - list for mask/value filters
+ rcvlist_inv - list for mask/value filters (inverse semantic)
+ rcvlist_sff - list for single standard frame (SFF) entries
+
+ Additional procfs files in /proc/net/can
+
+ stats - Socket CAN core statistics (rx/tx frames, match ratios, ...)
+ reset_stats - manual statistic reset
+ version - prints the Socket CAN core version and the ABI version
+
+ 5.3 writing own CAN protocol modules
+
+ To implement a new protocol in the protocol family PF_CAN a new
+ protocol has to be defined in include/linux/can.h .
+ The prototypes and definitions to use the Socket CAN core can be
+ accessed by including include/linux/can/core.h .
+ In addition to functions that register the CAN protocol and the
+ CAN device notifier chain there are functions to subscribe CAN
+ frames received by CAN interfaces and to send CAN frames:
+
+ can_rx_register - subscribe CAN frames from a specific interface
+ can_rx_unregister - unsubscribe CAN frames from a specific interface
+ can_send - transmit a CAN frame (optional with local loopback)
+
+ For details see the kerneldoc documentation in net/can/af_can.c or
+ the source code of net/can/raw.c or net/can/bcm.c .
+
+6. CAN network drivers
+----------------------
+
+ Writing a CAN network device driver is much easier than writing a
+ CAN character device driver. Similar to other known network device
+ drivers you mainly have to deal with:
+
+ - TX: Put the CAN frame from the socket buffer to the CAN controller.
+ - RX: Put the CAN frame from the CAN controller to the socket buffer.
+
+ See e.g. at Documentation/networking/netdevices.txt . The differences
+ for writing CAN network device driver are described below:
+
+ 6.1 general settings
+
+ dev->type = ARPHRD_CAN; /* the netdevice hardware type */
+ dev->flags = IFF_NOARP; /* CAN has no arp */
+
+ dev->mtu = sizeof(struct can_frame);
+
+ The struct can_frame is the payload of each socket buffer in the
+ protocol family PF_CAN.
+
+ 6.2 local loopback of sent frames
+
+ As described in chapter 3.2 the CAN network device driver should
+ support a local loopback functionality similar to the local echo
+ e.g. of tty devices. In this case the driver flag IFF_ECHO has to be
+ set to prevent the PF_CAN core from locally echoing sent frames
+ (aka loopback) as fallback solution:
+
+ dev->flags = (IFF_NOARP | IFF_ECHO);
+
+ 6.3 CAN controller hardware filters
+
+ To reduce the interrupt load on deep embedded systems some CAN
+ controllers support the filtering of CAN IDs or ranges of CAN IDs.
+ These hardware filter capabilities vary from controller to
+ controller and have to be identified as not feasible in a multi-user
+ networking approach. The use of the very controller specific
+ hardware filters could make sense in a very dedicated use-case, as a
+ filter on driver level would affect all users in the multi-user
+ system. The high efficient filter sets inside the PF_CAN core allow
+ to set different multiple filters for each socket separately.
+ Therefore the use of hardware filters goes to the category 'handmade
+ tuning on deep embedded systems'. The author is running a MPC603e
+ @133MHz with four SJA1000 CAN controllers from 2002 under heavy bus
+ load without any problems ...
+
+ 6.4 The virtual CAN driver (vcan)
+
+ Similar to the network loopback devices, vcan offers a virtual local
+ CAN interface. A full qualified address on CAN consists of
+
+ - a unique CAN Identifier (CAN ID)
+ - the CAN bus this CAN ID is transmitted on (e.g. can0)
+
+ so in common use cases more than one virtual CAN interface is needed.
+
+ The virtual CAN interfaces allow the transmission and reception of CAN
+ frames without real CAN controller hardware. Virtual CAN network
+ devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ...
+ When compiled as a module the virtual CAN driver module is called vcan.ko
+
+ Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel
+ netlink interface to create vcan network devices. The creation and
+ removal of vcan network devices can be managed with the ip(8) tool:
+
+ - Create a virtual CAN network interface:
+ ip link add type vcan
+
+ - Create a virtual CAN network interface with a specific name 'vcan42':
+ ip link add dev vcan42 type vcan
+
+ - Remove a (virtual CAN) network interface 'vcan42':
+ ip link del vcan42
+
+ The tool 'vcan' from the SocketCAN SVN repository on BerliOS is obsolete.
+
+ Virtual CAN network device creation in older Kernels:
+ In Linux Kernel versions < 2.6.24 the vcan driver creates 4 vcan
+ netdevices at module load time by default. This value can be changed
+ with the module parameter 'numdev'. E.g. 'modprobe vcan numdev=8'
+
+ 6.5 currently supported CAN hardware
+
+ On the project website http://developer.berlios.de/projects/socketcan
+ there are different drivers available:
+
+ vcan: Virtual CAN interface driver (if no real hardware is available)
+ sja1000: Philips SJA1000 CAN controller (recommended)
+ i82527: Intel i82527 CAN controller
+ mscan: Motorola/Freescale CAN controller (e.g. inside SOC MPC5200)
+ ccan: CCAN controller core (e.g. inside SOC h7202)
+ slcan: For a bunch of CAN adaptors that are attached via a
+ serial line ASCII protocol (for serial / USB adaptors)
+
+ Additionally the different CAN adaptors (ISA/PCI/PCMCIA/USB/Parport)
+ from PEAK Systemtechnik support the CAN netdevice driver model
+ since Linux driver v6.0: http://www.peak-system.com/linux/index.htm
+
+ Please check the Mailing Lists on the berlios OSS project website.
+
+ 6.6 todo
+
+ The configuration interface for CAN network drivers is still an open
+ issue that has not been finalized in the socketcan project. Also the
+ idea of having a library module (candev.ko) that holds functions
+ that are needed by all CAN netdevices is not ready to ship.
+ Your contribution is welcome.
+
+7. Credits
+----------
+
+ Oliver Hartkopp (PF_CAN core, filters, drivers, bcm)
+ Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan)
+ Jan Kizka (RT-SocketCAN core, Socket-API reconciliation)
+ Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews)
+ Robert Schwebel (design reviews, PTXdist integration)
+ Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers)
+ Benedikt Spranger (reviews)
+ Thomas Gleixner (LKML reviews, coding style, posting hints)
+ Andrey Volkov (kernel subtree structure, ioctls, mscan driver)
+ Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003)
+ Klaus Hitschler (PEAK driver integration)
+ Uwe Koppe (CAN netdevices with PF_PACKET approach)
+ Michael Schulze (driver layer loopback requirement, RT CAN drivers review)
diff --git a/Documentation/networking/cops.txt b/Documentation/networking/cops.txt
new file mode 100644
index 0000000..3e344b4
--- /dev/null
+++ b/Documentation/networking/cops.txt
@@ -0,0 +1,63 @@
+Text File for the COPS LocalTalk Linux driver (cops.c).
+ By Jay Schulist <jschlst@samba.org>
+
+This driver has two modes and they are: Dayna mode and Tangent mode.
+Each mode corresponds with the type of card. It has been found
+that there are 2 main types of cards and all other cards are
+the same and just have different names or only have minor differences
+such as more IO ports. As this driver is tested it will
+become more clear exactly what cards are supported.
+
+Right now these cards are known to work with the COPS driver. The
+LT-200 cards work in a somewhat more limited capacity than the
+DL200 cards, which work very well and are in use by many people.
+
+TANGENT driver mode:
+ Tangent ATB-II, Novell NL-1000, Daystar Digital LT-200
+DAYNA driver mode:
+ Dayna DL2000/DaynaTalk PC (Half Length), COPS LT-95,
+ Farallon PhoneNET PC III, Farallon PhoneNET PC II
+Other cards possibly supported mode unknown though:
+ Dayna DL2000 (Full length)
+
+The COPS driver defaults to using Dayna mode. To change the driver's
+mode if you built a driver with dual support use board_type=1 or
+board_type=2 for Dayna or Tangent with insmod.
+
+** Operation/loading of the driver.
+Use modprobe like this: /sbin/modprobe cops.o (IO #) (IRQ #)
+If you do not specify any options the driver will try and use the IO = 0x240,
+IRQ = 5. As of right now I would only use IRQ 5 for the card, if autoprobing.
+
+To load multiple COPS driver Localtalk cards you can do one of the following.
+
+insmod cops io=0x240 irq=5
+insmod -o cops2 cops io=0x260 irq=3
+
+Or in lilo.conf put something like this:
+ append="ether=5,0x240,lt0 ether=3,0x260,lt1"
+
+Then bring up the interface with ifconfig. It will look something like this:
+lt0 Link encap:UNSPEC HWaddr 00-00-00-00-00-00-00-F7-00-00-00-00-00-00-00-00
+ inet addr:192.168.1.2 Bcast:192.168.1.255 Mask:255.255.255.0
+ UP BROADCAST RUNNING NOARP MULTICAST MTU:600 Metric:1
+ RX packets:0 errors:0 dropped:0 overruns:0 frame:0
+ TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 coll:0
+
+** Netatalk Configuration
+You will need to configure atalkd with something like the following to make
+it work with the cops.c driver.
+
+* For single LTalk card use.
+dummy -seed -phase 2 -net 2000 -addr 2000.10 -zone "1033"
+lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033"
+
+* For multiple cards, Ethernet and LocalTalk.
+eth0 -seed -phase 2 -net 3000 -addr 3000.20 -zone "1033"
+lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033"
+
+* For multiple LocalTalk cards, and an Ethernet card.
+* Order seems to matter here, Ethernet last.
+lt0 -seed -phase 1 -net 1000 -addr 1000.10 -zone "LocalTalk1"
+lt1 -seed -phase 1 -net 2000 -addr 2000.20 -zone "LocalTalk2"
+eth0 -seed -phase 2 -net 3000 -addr 3000.30 -zone "EtherTalk"
diff --git a/Documentation/networking/cs89x0.txt b/Documentation/networking/cs89x0.txt
new file mode 100644
index 0000000..c725d33
--- /dev/null
+++ b/Documentation/networking/cs89x0.txt
@@ -0,0 +1,703 @@
+
+NOTE
+----
+
+This document was contributed by Cirrus Logic for kernel 2.2.5. This version
+has been updated for 2.3.48 by Andrew Morton.
+
+Cirrus make a copy of this driver available at their website, as
+described below. In general, you should use the driver version which
+comes with your Linux distribution.
+
+
+
+CIRRUS LOGIC LAN CS8900/CS8920 ETHERNET ADAPTERS
+Linux Network Interface Driver ver. 2.00 <kernel 2.3.48>
+===============================================================================
+
+
+TABLE OF CONTENTS
+
+1.0 CIRRUS LOGIC LAN CS8900/CS8920 ETHERNET ADAPTERS
+ 1.1 Product Overview
+ 1.2 Driver Description
+ 1.2.1 Driver Name
+ 1.2.2 File in the Driver Package
+ 1.3 System Requirements
+ 1.4 Licensing Information
+
+2.0 ADAPTER INSTALLATION and CONFIGURATION
+ 2.1 CS8900-based Adapter Configuration
+ 2.2 CS8920-based Adapter Configuration
+
+3.0 LOADING THE DRIVER AS A MODULE
+
+4.0 COMPILING THE DRIVER
+ 4.1 Compiling the Driver as a Loadable Module
+ 4.2 Compiling the driver to support memory mode
+ 4.3 Compiling the driver to support Rx DMA
+ 4.4 Compiling the Driver into the Kernel
+
+5.0 TESTING AND TROUBLESHOOTING
+ 5.1 Known Defects and Limitations
+ 5.2 Testing the Adapter
+ 5.2.1 Diagnostic Self-Test
+ 5.2.2 Diagnostic Network Test
+ 5.3 Using the Adapter's LEDs
+ 5.4 Resolving I/O Conflicts
+
+6.0 TECHNICAL SUPPORT
+ 6.1 Contacting Cirrus Logic's Technical Support
+ 6.2 Information Required Before Contacting Technical Support
+ 6.3 Obtaining the Latest Driver Version
+ 6.4 Current maintainer
+ 6.5 Kernel boot parameters
+
+
+1.0 CIRRUS LOGIC LAN CS8900/CS8920 ETHERNET ADAPTERS
+===============================================================================
+
+
+1.1 PRODUCT OVERVIEW
+
+The CS8900-based ISA Ethernet Adapters from Cirrus Logic follow
+IEEE 802.3 standards and support half or full-duplex operation in ISA bus
+computers on 10 Mbps Ethernet networks. The adapters are designed for operation
+in 16-bit ISA or EISA bus expansion slots and are available in
+10BaseT-only or 3-media configurations (10BaseT, 10Base2, and AUI for 10Base-5
+or fiber networks).
+
+CS8920-based adapters are similar to the CS8900-based adapter with additional
+features for Plug and Play (PnP) support and Wakeup Frame recognition. As
+such, the configuration procedures differ somewhat between the two types of
+adapters. Refer to the "Adapter Configuration" section for details on
+configuring both types of adapters.
+
+
+1.2 DRIVER DESCRIPTION
+
+The CS8900/CS8920 Ethernet Adapter driver for Linux supports the Linux
+v2.3.48 or greater kernel. It can be compiled directly into the kernel
+or loaded at run-time as a device driver module.
+
+1.2.1 Driver Name: cs89x0
+
+1.2.2 Files in the Driver Archive:
+
+The files in the driver at Cirrus' website include:
+
+ readme.txt - this file
+ build - batch file to compile cs89x0.c.
+ cs89x0.c - driver C code
+ cs89x0.h - driver header file
+ cs89x0.o - pre-compiled module (for v2.2.5 kernel)
+ config/Config.in - sample file to include cs89x0 driver in the kernel.
+ config/Makefile - sample file to include cs89x0 driver in the kernel.
+ config/Space.c - sample file to include cs89x0 driver in the kernel.
+
+
+
+1.3 SYSTEM REQUIREMENTS
+
+The following hardware is required:
+
+ * Cirrus Logic LAN (CS8900/20-based) Ethernet ISA Adapter
+
+ * IBM or IBM-compatible PC with:
+ * An 80386 or higher processor
+ * 16 bytes of contiguous IO space available between 210h - 370h
+ * One available IRQ (5,10,11,or 12 for the CS8900, 3-7,9-15 for CS8920).
+
+ * Appropriate cable (and connector for AUI, 10BASE-2) for your network
+ topology.
+
+The following software is required:
+
+* LINUX kernel version 2.3.48 or higher
+
+ * CS8900/20 Setup Utility (DOS-based)
+
+ * LINUX kernel sources for your kernel (if compiling into kernel)
+
+ * GNU Toolkit (gcc and make) v2.6 or above (if compiling into kernel
+ or a module)
+
+
+
+1.4 LICENSING INFORMATION
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, version 1.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+For a full copy of the GNU General Public License, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+2.0 ADAPTER INSTALLATION and CONFIGURATION
+===============================================================================
+
+Both the CS8900 and CS8920-based adapters can be configured using parameters
+stored in an on-board EEPROM. You must use the DOS-based CS8900/20 Setup
+Utility if you want to change the adapter's configuration in EEPROM.
+
+When loading the driver as a module, you can specify many of the adapter's
+configuration parameters on the command-line to override the EEPROM's settings
+or for interface configuration when an EEPROM is not used. (CS8920-based
+adapters must use an EEPROM.) See Section 3.0 LOADING THE DRIVER AS A MODULE.
+
+Since the CS8900/20 Setup Utility is a DOS-based application, you must install
+and configure the adapter in a DOS-based system using the CS8900/20 Setup
+Utility before installation in the target LINUX system. (Not required if
+installing a CS8900-based adapter and the default configuration is acceptable.)
+
+
+2.1 CS8900-BASED ADAPTER CONFIGURATION
+
+CS8900-based adapters shipped from Cirrus Logic have been configured
+with the following "default" settings:
+
+ Operation Mode: Memory Mode
+ IRQ: 10
+ Base I/O Address: 300
+ Memory Base Address: D0000
+ Optimization: DOS Client
+ Transmission Mode: Half-duplex
+ BootProm: None
+ Media Type: Autodetect (3-media cards) or
+ 10BASE-T (10BASE-T only adapter)
+
+You should only change the default configuration settings if conflicts with
+another adapter exists. To change the adapter's configuration, run the
+CS8900/20 Setup Utility.
+
+
+2.2 CS8920-BASED ADAPTER CONFIGURATION
+
+CS8920-based adapters are shipped from Cirrus Logic configured as Plug
+and Play (PnP) enabled. However, since the cs89x0 driver does NOT
+support PnP, you must install the CS8920 adapter in a DOS-based PC and
+run the CS8900/20 Setup Utility to disable PnP and configure the
+adapter before installation in the target Linux system. Failure to do
+this will leave the adapter inactive and the driver will be unable to
+communicate with the adapter.
+
+
+ ****************************************************************
+ * CS8920-BASED ADAPTERS: *
+ * *
+ * CS8920-BASED ADAPTERS ARE PLUG and PLAY ENABLED BY DEFAULT. *
+ * THE CS89X0 DRIVER DOES NOT SUPPORT PnP. THEREFORE, YOU MUST *
+ * RUN THE CS8900/20 SETUP UTILITY TO DISABLE PnP SUPPORT AND *
+ * TO ACTIVATE THE ADAPTER. *
+ ****************************************************************
+
+
+
+
+3.0 LOADING THE DRIVER AS A MODULE
+===============================================================================
+
+If the driver is compiled as a loadable module, you can load the driver module
+with the 'modprobe' command. Many of the adapter's configuration parameters can
+be specified as command-line arguments to the load command. This facility
+provides a means to override the EEPROM's settings or for interface
+configuration when an EEPROM is not used.
+
+Example:
+
+ insmod cs89x0.o io=0x200 irq=0xA media=aui
+
+This example loads the module and configures the adapter to use an IO port base
+address of 200h, interrupt 10, and use the AUI media connection. The following
+configuration options are available on the command line:
+
+* io=### - specify IO address (200h-360h)
+* irq=## - specify interrupt level
+* use_dma=1 - Enable DMA
+* dma=# - specify dma channel (Driver is compiled to support
+ Rx DMA only)
+* dmasize=# (16 or 64) - DMA size 16K or 64K. Default value is set to 16.
+* media=rj45 - specify media type
+ or media=bnc
+ or media=aui
+ or media=auto
+* duplex=full - specify forced half/full/autonegotiate duplex
+ or duplex=half
+ or duplex=auto
+* debug=# - debug level (only available if the driver was compiled
+ for debugging)
+
+NOTES:
+
+a) If an EEPROM is present, any specified command-line parameter
+ will override the corresponding configuration value stored in
+ EEPROM.
+
+b) The "io" parameter must be specified on the command-line.
+
+c) The driver's hardware probe routine is designed to avoid
+ writing to I/O space until it knows that there is a cs89x0
+ card at the written addresses. This could cause problems
+ with device probing. To avoid this behaviour, add one
+ to the `io=' module parameter. This doesn't actually change
+ the I/O address, but it is a flag to tell the driver
+ to partially initialise the hardware before trying to
+ identify the card. This could be dangerous if you are
+ not sure that there is a cs89x0 card at the provided address.
+
+ For example, to scan for an adapter located at IO base 0x300,
+ specify an IO address of 0x301.
+
+d) The "duplex=auto" parameter is only supported for the CS8920.
+
+e) The minimum command-line configuration required if an EEPROM is
+ not present is:
+
+ io
+ irq
+ media type (no autodetect)
+
+f) The following additional parameters are CS89XX defaults (values
+ used with no EEPROM or command-line argument).
+
+ * DMA Burst = enabled
+ * IOCHRDY Enabled = enabled
+ * UseSA = enabled
+ * CS8900 defaults to half-duplex if not specified on command-line
+ * CS8920 defaults to autoneg if not specified on command-line
+ * Use reset defaults for other config parameters
+ * dma_mode = 0
+
+g) You can use ifconfig to set the adapter's Ethernet address.
+
+h) Many Linux distributions use the 'modprobe' command to load
+ modules. This program uses the '/etc/conf.modules' file to
+ determine configuration information which is passed to a driver
+ module when it is loaded. All the configuration options which are
+ described above may be placed within /etc/conf.modules.
+
+ For example:
+
+ > cat /etc/conf.modules
+ ...
+ alias eth0 cs89x0
+ options cs89x0 io=0x0200 dma=5 use_dma=1
+ ...
+
+ In this example we are telling the module system that the
+ ethernet driver for this machine should use the cs89x0 driver. We
+ are asking 'modprobe' to pass the 'io', 'dma' and 'use_dma'
+ arguments to the driver when it is loaded.
+
+i) Cirrus recommend that the cs89x0 use the ISA DMA channels 5, 6 or
+ 7. You will probably find that other DMA channels will not work.
+
+j) The cs89x0 supports DMA for receiving only. DMA mode is
+ significantly more efficient. Flooding a 400 MHz Celeron machine
+ with large ping packets consumes 82% of its CPU capacity in non-DMA
+ mode. With DMA this is reduced to 45%.
+
+k) If your Linux kernel was compiled with inbuilt plug-and-play
+ support you will be able to find information about the cs89x0 card
+ with the command
+
+ cat /proc/isapnp
+
+l) If during DMA operation you find erratic behavior or network data
+ corruption you should use your PC's BIOS to slow the EISA bus clock.
+
+m) If the cs89x0 driver is compiled directly into the kernel
+ (non-modular) then its I/O address is automatically determined by
+ ISA bus probing. The IRQ number, media options, etc are determined
+ from the card's EEPROM.
+
+n) If the cs89x0 driver is compiled directly into the kernel, DMA
+ mode may be selected by providing the kernel with a boot option
+ 'cs89x0_dma=N' where 'N' is the desired DMA channel number (5, 6 or 7).
+
+ Kernel boot options may be provided on the LILO command line:
+
+ LILO boot: linux cs89x0_dma=5
+
+ or they may be placed in /etc/lilo.conf:
+
+ image=/boot/bzImage-2.3.48
+ append="cs89x0_dma=5"
+ label=linux
+ root=/dev/hda5
+ read-only
+
+ The DMA Rx buffer size is hardwired to 16 kbytes in this mode.
+ (64k mode is not available).
+
+
+4.0 COMPILING THE DRIVER
+===============================================================================
+
+The cs89x0 driver can be compiled directly into the kernel or compiled into
+a loadable device driver module.
+
+
+4.1 COMPILING THE DRIVER AS A LOADABLE MODULE
+
+To compile the driver into a loadable module, use the following command
+(single command line, without quotes):
+
+"gcc -D__KERNEL__ -I/usr/src/linux/include -I/usr/src/linux/net/inet -Wall
+-Wstrict-prototypes -O2 -fomit-frame-pointer -DMODULE -DCONFIG_MODVERSIONS
+-c cs89x0.c"
+
+4.2 COMPILING THE DRIVER TO SUPPORT MEMORY MODE
+
+Support for memory mode was not carried over into the 2.3 series kernels.
+
+4.3 COMPILING THE DRIVER TO SUPPORT Rx DMA
+
+The compile-time optionality for DMA was removed in the 2.3 kernel
+series. DMA support is now unconditionally part of the driver. It is
+enabled by the 'use_dma=1' module option.
+
+4.4 COMPILING THE DRIVER INTO THE KERNEL
+
+If your Linux distribution already has support for the cs89x0 driver
+then simply copy the source file to the /usr/src/linux/drivers/net
+directory to replace the original ones and run the make utility to
+rebuild the kernel. See Step 3 for rebuilding the kernel.
+
+If your Linux does not include the cs89x0 driver, you need to edit three
+configuration files, copy the source file to the /usr/src/linux/drivers/net
+directory, and then run the make utility to rebuild the kernel.
+
+1. Edit the following configuration files by adding the statements as
+indicated. (When possible, try to locate the added text to the section of the
+file containing similar statements).
+
+
+a.) In /usr/src/linux/drivers/net/Config.in, add:
+
+tristate 'CS89x0 support' CONFIG_CS89x0
+
+Example:
+
+ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ tristate 'ICL EtherTeam 16i/32 support' CONFIG_ETH16I
+ fi
+
+ tristate 'CS89x0 support' CONFIG_CS89x0
+
+ tristate 'NE2000/NE1000 support' CONFIG_NE2000
+ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ tristate 'NI5210 support' CONFIG_NI52
+
+
+b.) In /usr/src/linux/drivers/net/Makefile, add the following lines:
+
+ifeq ($(CONFIG_CS89x0),y)
+L_OBJS += cs89x0.o
+else
+ ifeq ($(CONFIG_CS89x0),m)
+ M_OBJS += cs89x0.o
+ endif
+endif
+
+
+c.) In /linux/drivers/net/Space.c file, add the line:
+
+extern int cs89x0_probe(struct device *dev);
+
+
+Example:
+
+ extern int ultra_probe(struct device *dev);
+ extern int wd_probe(struct device *dev);
+ extern int el2_probe(struct device *dev);
+
+ extern int cs89x0_probe(struct device *dev);
+
+ extern int ne_probe(struct device *dev);
+ extern int hp_probe(struct device *dev);
+ extern int hp_plus_probe(struct device *dev);
+
+
+Also add:
+
+ #ifdef CONFIG_CS89x0
+ { cs89x0_probe,0 },
+ #endif
+
+
+2.) Copy the driver source files (cs89x0.c and cs89x0.h)
+into the /usr/src/linux/drivers/net directory.
+
+
+3.) Go to /usr/src/linux directory and run 'make config' followed by 'make'
+(or make bzImage) to rebuild the kernel.
+
+4.) Use the DOS 'setup' utility to disable plug and play on the NIC.
+
+
+5.0 TESTING AND TROUBLESHOOTING
+===============================================================================
+
+5.1 KNOWN DEFECTS and LIMITATIONS
+
+Refer to the RELEASE.TXT file distributed as part of this archive for a list of
+known defects, driver limitations, and work arounds.
+
+
+5.2 TESTING THE ADAPTER
+
+Once the adapter has been installed and configured, the diagnostic option of
+the CS8900/20 Setup Utility can be used to test the functionality of the
+adapter and its network connection. Use the diagnostics 'Self Test' option to
+test the functionality of the adapter with the hardware configuration you have
+assigned. You can use the diagnostics 'Network Test' to test the ability of the
+adapter to communicate across the Ethernet with another PC equipped with a
+CS8900/20-based adapter card (it must also be running the CS8900/20 Setup
+Utility).
+
+ NOTE: The Setup Utility's diagnostics are designed to run in a
+ DOS-only operating system environment. DO NOT run the diagnostics
+ from a DOS or command prompt session under Windows 95, Windows NT,
+ OS/2, or other operating system.
+
+To run the diagnostics tests on the CS8900/20 adapter:
+
+ 1.) Boot DOS on the PC and start the CS8900/20 Setup Utility.
+
+ 2.) The adapter's current configuration is displayed. Hit the ENTER key to
+ get to the main menu.
+
+ 4.) Select 'Diagnostics' (ALT-G) from the main menu.
+ * Select 'Self-Test' to test the adapter's basic functionality.
+ * Select 'Network Test' to test the network connection and cabling.
+
+
+5.2.1 DIAGNOSTIC SELF-TEST
+
+The diagnostic self-test checks the adapter's basic functionality as well as
+its ability to communicate across the ISA bus based on the system resources
+assigned during hardware configuration. The following tests are performed:
+
+ * IO Register Read/Write Test
+ The IO Register Read/Write test insures that the CS8900/20 can be
+ accessed in IO mode, and that the IO base address is correct.
+
+ * Shared Memory Test
+ The Shared Memory test insures the CS8900/20 can be accessed in memory
+ mode and that the range of memory addresses assigned does not conflict
+ with other devices in the system.
+
+ * Interrupt Test
+ The Interrupt test insures there are no conflicts with the assigned IRQ
+ signal.
+
+ * EEPROM Test
+ The EEPROM test insures the EEPROM can be read.
+
+ * Chip RAM Test
+ The Chip RAM test insures the 4K of memory internal to the CS8900/20 is
+ working properly.
+
+ * Internal Loop-back Test
+ The Internal Loop Back test insures the adapter's transmitter and
+ receiver are operating properly. If this test fails, make sure the
+ adapter's cable is connected to the network (check for LED activity for
+ example).
+
+ * Boot PROM Test
+ The Boot PROM test insures the Boot PROM is present, and can be read.
+ Failure indicates the Boot PROM was not successfully read due to a
+ hardware problem or due to a conflicts on the Boot PROM address
+ assignment. (Test only applies if the adapter is configured to use the
+ Boot PROM option.)
+
+Failure of a test item indicates a possible system resource conflict with
+another device on the ISA bus. In this case, you should use the Manual Setup
+option to reconfigure the adapter by selecting a different value for the system
+resource that failed.
+
+
+5.2.2 DIAGNOSTIC NETWORK TEST
+
+The Diagnostic Network Test verifies a working network connection by
+transferring data between two CS8900/20 adapters installed in different PCs
+on the same network. (Note: the diagnostic network test should not be run
+between two nodes across a router.)
+
+This test requires that each of the two PCs have a CS8900/20-based adapter
+installed and have the CS8900/20 Setup Utility running. The first PC is
+configured as a Responder and the other PC is configured as an Initiator.
+Once the Initiator is started, it sends data frames to the Responder which
+returns the frames to the Initiator.
+
+The total number of frames received and transmitted are displayed on the
+Initiator's display, along with a count of the number of frames received and
+transmitted OK or in error. The test can be terminated anytime by the user at
+either PC.
+
+To setup the Diagnostic Network Test:
+
+ 1.) Select a PC with a CS8900/20-based adapter and a known working network
+ connection to act as the Responder. Run the CS8900/20 Setup Utility
+ and select 'Diagnostics -> Network Test -> Responder' from the main
+ menu. Hit ENTER to start the Responder.
+
+ 2.) Return to the PC with the CS8900/20-based adapter you want to test and
+ start the CS8900/20 Setup Utility.
+
+ 3.) From the main menu, Select 'Diagnostic -> Network Test -> Initiator'.
+ Hit ENTER to start the test.
+
+You may stop the test on the Initiator at any time while allowing the Responder
+to continue running. In this manner, you can move to additional PCs and test
+them by starting the Initiator on another PC without having to stop/start the
+Responder.
+
+
+
+5.3 USING THE ADAPTER'S LEDs
+
+The 2 and 3-media adapters have two LEDs visible on the back end of the board
+located near the 10Base-T connector.
+
+Link Integrity LED: A "steady" ON of the green LED indicates a valid 10Base-T
+connection. (Only applies to 10Base-T. The green LED has no significance for
+a 10Base-2 or AUI connection.)
+
+TX/RX LED: The yellow LED lights briefly each time the adapter transmits or
+receives data. (The yellow LED will appear to "flicker" on a typical network.)
+
+
+5.4 RESOLVING I/O CONFLICTS
+
+An IO conflict occurs when two or more adapter use the same ISA resource (IO
+address, memory address or IRQ). You can usually detect an IO conflict in one
+of four ways after installing and or configuring the CS8900/20-based adapter:
+
+ 1.) The system does not boot properly (or at all).
+
+ 2.) The driver cannot communicate with the adapter, reporting an "Adapter
+ not found" error message.
+
+ 3.) You cannot connect to the network or the driver will not load.
+
+ 4.) If you have configured the adapter to run in memory mode but the driver
+ reports it is using IO mode when loading, this is an indication of a
+ memory address conflict.
+
+If an IO conflict occurs, run the CS8900/20 Setup Utility and perform a
+diagnostic self-test. Normally, the ISA resource in conflict will fail the
+self-test. If so, reconfigure the adapter selecting another choice for the
+resource in conflict. Run the diagnostics again to check for further IO
+conflicts.
+
+In some cases, such as when the PC will not boot, it may be necessary to remove
+the adapter and reconfigure it by installing it in another PC to run the
+CS8900/20 Setup Utility. Once reinstalled in the target system, run the
+diagnostics self-test to ensure the new configuration is free of conflicts
+before loading the driver again.
+
+When manually configuring the adapter, keep in mind the typical ISA system
+resource usage as indicated in the tables below.
+
+I/O Address Device IRQ Device
+----------- -------- --- --------
+ 200-20F Game I/O adapter 3 COM2, Bus Mouse
+ 230-23F Bus Mouse 4 COM1
+ 270-27F LPT3: third parallel port 5 LPT2
+ 2F0-2FF COM2: second serial port 6 Floppy Disk controller
+ 320-32F Fixed disk controller 7 LPT1
+ 8 Real-time Clock
+ 9 EGA/VGA display adapter
+ 12 Mouse (PS/2)
+Memory Address Device 13 Math Coprocessor
+-------------- --------------------- 14 Hard Disk controller
+A000-BFFF EGA Graphics Adapter
+A000-C7FF VGA Graphics Adapter
+B000-BFFF Mono Graphics Adapter
+B800-BFFF Color Graphics Adapter
+E000-FFFF AT BIOS
+
+
+
+
+6.0 TECHNICAL SUPPORT
+===============================================================================
+
+6.1 CONTACTING CIRRUS LOGIC'S TECHNICAL SUPPORT
+
+Cirrus Logic's CS89XX Technical Application Support can be reached at:
+
+Telephone :(800) 888-5016 (from inside U.S. and Canada)
+ :(512) 442-7555 (from outside the U.S. and Canada)
+Fax :(512) 912-3871
+Email :ethernet@crystal.cirrus.com
+WWW :http://www.cirrus.com
+
+
+6.2 INFORMATION REQUIRED BEFORE CONTACTING TECHNICAL SUPPORT
+
+Before contacting Cirrus Logic for technical support, be prepared to provide as
+Much of the following information as possible.
+
+1.) Adapter type (CRD8900, CDB8900, CDB8920, etc.)
+
+2.) Adapter configuration
+
+ * IO Base, Memory Base, IO or memory mode enabled, IRQ, DMA channel
+ * Plug and Play enabled/disabled (CS8920-based adapters only)
+ * Configured for media auto-detect or specific media type (which type).
+
+3.) PC System's Configuration
+
+ * Plug and Play system (yes/no)
+ * BIOS (make and version)
+ * System make and model
+ * CPU (type and speed)
+ * System RAM
+ * SCSI Adapter
+
+4.) Software
+
+ * CS89XX driver and version
+ * Your network operating system and version
+ * Your system's OS version
+ * Version of all protocol support files
+
+5.) Any Error Message displayed.
+
+
+
+6.3 OBTAINING THE LATEST DRIVER VERSION
+
+You can obtain the latest CS89XX drivers and support software from Cirrus Logic's
+Web site. You can also contact Cirrus Logic's Technical Support (email:
+ethernet@crystal.cirrus.com) and request that you be registered for automatic
+software-update notification.
+
+Cirrus Logic maintains a web page at http://www.cirrus.com with the
+latest drivers and technical publications.
+
+
+6.4 Current maintainer
+
+In February 2000 the maintenance of this driver was assumed by Andrew
+Morton.
+
+6.5 Kernel module parameters
+
+For use in embedded environments with no cs89x0 EEPROM, the kernel boot
+parameter `cs89x0_media=' has been implemented. Usage is:
+
+ cs89x0_media=rj45 or
+ cs89x0_media=aui or
+ cs89x0_media=bnc
+
diff --git a/Documentation/networking/cxacru.txt b/Documentation/networking/cxacru.txt
new file mode 100644
index 0000000..b074681
--- /dev/null
+++ b/Documentation/networking/cxacru.txt
@@ -0,0 +1,84 @@
+Firmware is required for this device: http://accessrunner.sourceforge.net/
+
+While it is capable of managing/maintaining the ADSL connection without the
+module loaded, the device will sometimes stop responding after unloading the
+driver and it is necessary to unplug/remove power to the device to fix this.
+
+Detected devices will appear as ATM devices named "cxacru". In /sys/class/atm/
+these are directories named cxacruN where N is the device number. A symlink
+named device points to the USB interface device's directory which contains
+several sysfs attribute files for retrieving device statistics:
+
+* adsl_controller_version
+
+* adsl_headend
+* adsl_headend_environment
+ Information about the remote headend.
+
+* downstream_attenuation (dB)
+* downstream_bits_per_frame
+* downstream_rate (kbps)
+* downstream_snr_margin (dB)
+ Downstream stats.
+
+* upstream_attenuation (dB)
+* upstream_bits_per_frame
+* upstream_rate (kbps)
+* upstream_snr_margin (dB)
+* transmitter_power (dBm/Hz)
+ Upstream stats.
+
+* downstream_crc_errors
+* downstream_fec_errors
+* downstream_hec_errors
+* upstream_crc_errors
+* upstream_fec_errors
+* upstream_hec_errors
+ Error counts.
+
+* line_startable
+ Indicates that ADSL support on the device
+ is/can be enabled, see adsl_start.
+
+* line_status
+ "initialising"
+ "down"
+ "attempting to activate"
+ "training"
+ "channel analysis"
+ "exchange"
+ "waiting"
+ "up"
+
+ Changes between "down" and "attempting to activate"
+ if there is no signal.
+
+* link_status
+ "not connected"
+ "connected"
+ "lost"
+
+* mac_address
+
+* modulation
+ "ANSI T1.413"
+ "ITU-T G.992.1 (G.DMT)"
+ "ITU-T G.992.2 (G.LITE)"
+
+* startup_attempts
+ Count of total attempts to initialise ADSL.
+
+To enable/disable ADSL, the following can be written to the adsl_state file:
+ "start"
+ "stop
+ "restart" (stops, waits 1.5s, then starts)
+ "poll" (used to resume status polling if it was disabled due to failure)
+
+Changes in adsl/line state are reported via kernel log messages:
+ [4942145.150704] ATM dev 0: ADSL state: running
+ [4942243.663766] ATM dev 0: ADSL line: down
+ [4942249.665075] ATM dev 0: ADSL line: attempting to activate
+ [4942253.654954] ATM dev 0: ADSL line: training
+ [4942255.666387] ATM dev 0: ADSL line: channel analysis
+ [4942259.656262] ATM dev 0: ADSL line: exchange
+ [2635357.696901] ATM dev 0: ADSL line: up (8128 kb/s down | 832 kb/s up)
diff --git a/Documentation/networking/cxgb.txt b/Documentation/networking/cxgb.txt
new file mode 100644
index 0000000..20a8876
--- /dev/null
+++ b/Documentation/networking/cxgb.txt
@@ -0,0 +1,352 @@
+ Chelsio N210 10Gb Ethernet Network Controller
+
+ Driver Release Notes for Linux
+
+ Version 2.1.1
+
+ June 20, 2005
+
+CONTENTS
+========
+ INTRODUCTION
+ FEATURES
+ PERFORMANCE
+ DRIVER MESSAGES
+ KNOWN ISSUES
+ SUPPORT
+
+
+INTRODUCTION
+============
+
+ This document describes the Linux driver for Chelsio 10Gb Ethernet Network
+ Controller. This driver supports the Chelsio N210 NIC and is backward
+ compatible with the Chelsio N110 model 10Gb NICs.
+
+
+FEATURES
+========
+
+ Adaptive Interrupts (adaptive-rx)
+ ---------------------------------
+
+ This feature provides an adaptive algorithm that adjusts the interrupt
+ coalescing parameters, allowing the driver to dynamically adapt the latency
+ settings to achieve the highest performance during various types of network
+ load.
+
+ The interface used to control this feature is ethtool. Please see the
+ ethtool manpage for additional usage information.
+
+ By default, adaptive-rx is disabled.
+ To enable adaptive-rx:
+
+ ethtool -C <interface> adaptive-rx on
+
+ To disable adaptive-rx, use ethtool:
+
+ ethtool -C <interface> adaptive-rx off
+
+ After disabling adaptive-rx, the timer latency value will be set to 50us.
+ You may set the timer latency after disabling adaptive-rx:
+
+ ethtool -C <interface> rx-usecs <microseconds>
+
+ An example to set the timer latency value to 100us on eth0:
+
+ ethtool -C eth0 rx-usecs 100
+
+ You may also provide a timer latency value while disabling adaptive-rx:
+
+ ethtool -C <interface> adaptive-rx off rx-usecs <microseconds>
+
+ If adaptive-rx is disabled and a timer latency value is specified, the timer
+ will be set to the specified value until changed by the user or until
+ adaptive-rx is enabled.
+
+ To view the status of the adaptive-rx and timer latency values:
+
+ ethtool -c <interface>
+
+
+ TCP Segmentation Offloading (TSO) Support
+ -----------------------------------------
+
+ This feature, also known as "large send", enables a system's protocol stack
+ to offload portions of outbound TCP processing to a network interface card
+ thereby reducing system CPU utilization and enhancing performance.
+
+ The interface used to control this feature is ethtool version 1.8 or higher.
+ Please see the ethtool manpage for additional usage information.
+
+ By default, TSO is enabled.
+ To disable TSO:
+
+ ethtool -K <interface> tso off
+
+ To enable TSO:
+
+ ethtool -K <interface> tso on
+
+ To view the status of TSO:
+
+ ethtool -k <interface>
+
+
+PERFORMANCE
+===========
+
+ The following information is provided as an example of how to change system
+ parameters for "performance tuning" an what value to use. You may or may not
+ want to change these system parameters, depending on your server/workstation
+ application. Doing so is not warranted in any way by Chelsio Communications,
+ and is done at "YOUR OWN RISK". Chelsio will not be held responsible for loss
+ of data or damage to equipment.
+
+ Your distribution may have a different way of doing things, or you may prefer
+ a different method. These commands are shown only to provide an example of
+ what to do and are by no means definitive.
+
+ Making any of the following system changes will only last until you reboot
+ your system. You may want to write a script that runs at boot-up which
+ includes the optimal settings for your system.
+
+ Setting PCI Latency Timer:
+ setpci -d 1425:* 0x0c.l=0x0000F800
+
+ Disabling TCP timestamp:
+ sysctl -w net.ipv4.tcp_timestamps=0
+
+ Disabling SACK:
+ sysctl -w net.ipv4.tcp_sack=0
+
+ Setting large number of incoming connection requests:
+ sysctl -w net.ipv4.tcp_max_syn_backlog=3000
+
+ Setting maximum receive socket buffer size:
+ sysctl -w net.core.rmem_max=1024000
+
+ Setting maximum send socket buffer size:
+ sysctl -w net.core.wmem_max=1024000
+
+ Set smp_affinity (on a multiprocessor system) to a single CPU:
+ echo 1 > /proc/irq/<interrupt_number>/smp_affinity
+
+ Setting default receive socket buffer size:
+ sysctl -w net.core.rmem_default=524287
+
+ Setting default send socket buffer size:
+ sysctl -w net.core.wmem_default=524287
+
+ Setting maximum option memory buffers:
+ sysctl -w net.core.optmem_max=524287
+
+ Setting maximum backlog (# of unprocessed packets before kernel drops):
+ sysctl -w net.core.netdev_max_backlog=300000
+
+ Setting TCP read buffers (min/default/max):
+ sysctl -w net.ipv4.tcp_rmem="10000000 10000000 10000000"
+
+ Setting TCP write buffers (min/pressure/max):
+ sysctl -w net.ipv4.tcp_wmem="10000000 10000000 10000000"
+
+ Setting TCP buffer space (min/pressure/max):
+ sysctl -w net.ipv4.tcp_mem="10000000 10000000 10000000"
+
+ TCP window size for single connections:
+ The receive buffer (RX_WINDOW) size must be at least as large as the
+ Bandwidth-Delay Product of the communication link between the sender and
+ receiver. Due to the variations of RTT, you may want to increase the buffer
+ size up to 2 times the Bandwidth-Delay Product. Reference page 289 of
+ "TCP/IP Illustrated, Volume 1, The Protocols" by W. Richard Stevens.
+ At 10Gb speeds, use the following formula:
+ RX_WINDOW >= 1.25MBytes * RTT(in milliseconds)
+ Example for RTT with 100us: RX_WINDOW = (1,250,000 * 0.1) = 125,000
+ RX_WINDOW sizes of 256KB - 512KB should be sufficient.
+ Setting the min, max, and default receive buffer (RX_WINDOW) size:
+ sysctl -w net.ipv4.tcp_rmem="<min> <default> <max>"
+
+ TCP window size for multiple connections:
+ The receive buffer (RX_WINDOW) size may be calculated the same as single
+ connections, but should be divided by the number of connections. The
+ smaller window prevents congestion and facilitates better pacing,
+ especially if/when MAC level flow control does not work well or when it is
+ not supported on the machine. Experimentation may be necessary to attain
+ the correct value. This method is provided as a starting point for the
+ correct receive buffer size.
+ Setting the min, max, and default receive buffer (RX_WINDOW) size is
+ performed in the same manner as single connection.
+
+
+DRIVER MESSAGES
+===============
+
+ The following messages are the most common messages logged by syslog. These
+ may be found in /var/log/messages.
+
+ Driver up:
+ Chelsio Network Driver - version 2.1.1
+
+ NIC detected:
+ eth#: Chelsio N210 1x10GBaseX NIC (rev #), PCIX 133MHz/64-bit
+
+ Link up:
+ eth#: link is up at 10 Gbps, full duplex
+
+ Link down:
+ eth#: link is down
+
+
+KNOWN ISSUES
+============
+
+ These issues have been identified during testing. The following information
+ is provided as a workaround to the problem. In some cases, this problem is
+ inherent to Linux or to a particular Linux Distribution and/or hardware
+ platform.
+
+ 1. Large number of TCP retransmits on a multiprocessor (SMP) system.
+
+ On a system with multiple CPUs, the interrupt (IRQ) for the network
+ controller may be bound to more than one CPU. This will cause TCP
+ retransmits if the packet data were to be split across different CPUs
+ and re-assembled in a different order than expected.
+
+ To eliminate the TCP retransmits, set smp_affinity on the particular
+ interrupt to a single CPU. You can locate the interrupt (IRQ) used on
+ the N110/N210 by using ifconfig:
+ ifconfig <dev_name> | grep Interrupt
+ Set the smp_affinity to a single CPU:
+ echo 1 > /proc/irq/<interrupt_number>/smp_affinity
+
+ It is highly suggested that you do not run the irqbalance daemon on your
+ system, as this will change any smp_affinity setting you have applied.
+ The irqbalance daemon runs on a 10 second interval and binds interrupts
+ to the least loaded CPU determined by the daemon. To disable this daemon:
+ chkconfig --level 2345 irqbalance off
+
+ By default, some Linux distributions enable the kernel feature,
+ irqbalance, which performs the same function as the daemon. To disable
+ this feature, add the following line to your bootloader:
+ noirqbalance
+
+ Example using the Grub bootloader:
+ title Red Hat Enterprise Linux AS (2.4.21-27.ELsmp)
+ root (hd0,0)
+ kernel /vmlinuz-2.4.21-27.ELsmp ro root=/dev/hda3 noirqbalance
+ initrd /initrd-2.4.21-27.ELsmp.img
+
+ 2. After running insmod, the driver is loaded and the incorrect network
+ interface is brought up without running ifup.
+
+ When using 2.4.x kernels, including RHEL kernels, the Linux kernel
+ invokes a script named "hotplug". This script is primarily used to
+ automatically bring up USB devices when they are plugged in, however,
+ the script also attempts to automatically bring up a network interface
+ after loading the kernel module. The hotplug script does this by scanning
+ the ifcfg-eth# config files in /etc/sysconfig/network-scripts, looking
+ for HWADDR=<mac_address>.
+
+ If the hotplug script does not find the HWADDRR within any of the
+ ifcfg-eth# files, it will bring up the device with the next available
+ interface name. If this interface is already configured for a different
+ network card, your new interface will have incorrect IP address and
+ network settings.
+
+ To solve this issue, you can add the HWADDR=<mac_address> key to the
+ interface config file of your network controller.
+
+ To disable this "hotplug" feature, you may add the driver (module name)
+ to the "blacklist" file located in /etc/hotplug. It has been noted that
+ this does not work for network devices because the net.agent script
+ does not use the blacklist file. Simply remove, or rename, the net.agent
+ script located in /etc/hotplug to disable this feature.
+
+ 3. Transport Protocol (TP) hangs when running heavy multi-connection traffic
+ on an AMD Opteron system with HyperTransport PCI-X Tunnel chipset.
+
+ If your AMD Opteron system uses the AMD-8131 HyperTransport PCI-X Tunnel
+ chipset, you may experience the "133-Mhz Mode Split Completion Data
+ Corruption" bug identified by AMD while using a 133Mhz PCI-X card on the
+ bus PCI-X bus.
+
+ AMD states, "Under highly specific conditions, the AMD-8131 PCI-X Tunnel
+ can provide stale data via split completion cycles to a PCI-X card that
+ is operating at 133 Mhz", causing data corruption.
+
+ AMD's provides three workarounds for this problem, however, Chelsio
+ recommends the first option for best performance with this bug:
+
+ For 133Mhz secondary bus operation, limit the transaction length and
+ the number of outstanding transactions, via BIOS configuration
+ programming of the PCI-X card, to the following:
+
+ Data Length (bytes): 1k
+ Total allowed outstanding transactions: 2
+
+ Please refer to AMD 8131-HT/PCI-X Errata 26310 Rev 3.08 August 2004,
+ section 56, "133-MHz Mode Split Completion Data Corruption" for more
+ details with this bug and workarounds suggested by AMD.
+
+ It may be possible to work outside AMD's recommended PCI-X settings, try
+ increasing the Data Length to 2k bytes for increased performance. If you
+ have issues with these settings, please revert to the "safe" settings
+ and duplicate the problem before submitting a bug or asking for support.
+
+ NOTE: The default setting on most systems is 8 outstanding transactions
+ and 2k bytes data length.
+
+ 4. On multiprocessor systems, it has been noted that an application which
+ is handling 10Gb networking can switch between CPUs causing degraded
+ and/or unstable performance.
+
+ If running on an SMP system and taking performance measurements, it
+ is suggested you either run the latest netperf-2.4.0+ or use a binding
+ tool such as Tim Hockin's procstate utilities (runon)
+ <http://www.hockin.org/~thockin/procstate/>.
+
+ Binding netserver and netperf (or other applications) to particular
+ CPUs will have a significant difference in performance measurements.
+ You may need to experiment which CPU to bind the application to in
+ order to achieve the best performance for your system.
+
+ If you are developing an application designed for 10Gb networking,
+ please keep in mind you may want to look at kernel functions
+ sched_setaffinity & sched_getaffinity to bind your application.
+
+ If you are just running user-space applications such as ftp, telnet,
+ etc., you may want to try the runon tool provided by Tim Hockin's
+ procstate utility. You could also try binding the interface to a
+ particular CPU: runon 0 ifup eth0
+
+
+SUPPORT
+=======
+
+ If you have problems with the software or hardware, please contact our
+ customer support team via email at support@chelsio.com or check our website
+ at http://www.chelsio.com
+
+===============================================================================
+
+ Chelsio Communications
+ 370 San Aleso Ave.
+ Suite 100
+ Sunnyvale, CA 94085
+ http://www.chelsio.com
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License, version 2, as
+published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+ Copyright (c) 2003-2005 Chelsio Communications. All rights reserved.
+
+===============================================================================
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
new file mode 100644
index 0000000..39131a3
--- /dev/null
+++ b/Documentation/networking/dccp.txt
@@ -0,0 +1,156 @@
+DCCP protocol
+============
+
+
+Contents
+========
+
+- Introduction
+- Missing features
+- Socket options
+- Notes
+
+Introduction
+============
+
+Datagram Congestion Control Protocol (DCCP) is an unreliable, connection
+oriented protocol designed to solve issues present in UDP and TCP, particularly
+for real-time and multimedia (streaming) traffic.
+It divides into a base protocol (RFC 4340) and plugable congestion control
+modules called CCIDs. Like plugable TCP congestion control, at least one CCID
+needs to be enabled in order for the protocol to function properly. In the Linux
+implementation, this is the TCP-like CCID2 (RFC 4341). Additional CCIDs, such as
+the TCP-friendly CCID3 (RFC 4342), are optional.
+For a brief introduction to CCIDs and suggestions for choosing a CCID to match
+given applications, see section 10 of RFC 4340.
+
+It has a base protocol and pluggable congestion control IDs (CCIDs).
+
+DCCP is a Proposed Standard (RFC 2026), and the homepage for DCCP as a protocol
+is at http://www.ietf.org/html.charters/dccp-charter.html
+
+Missing features
+================
+
+The Linux DCCP implementation does not currently support all the features that are
+specified in RFCs 4340...42.
+
+The known bugs are at:
+ http://linux-net.osdl.org/index.php/TODO#DCCP
+
+For more up-to-date versions of the DCCP implementation, please consider using
+the experimental DCCP test tree; instructions for checking this out are on:
+http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree
+
+
+Socket options
+==============
+
+DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of
+service codes (RFC 4340, sec. 8.1.2); if this socket option is not set,
+the socket will fall back to 0 (which means that no meaningful service code
+is present). On active sockets this is set before connect(); specifying more
+than one code has no effect (all subsequent service codes are ignored). The
+case is different for passive sockets, where multiple service codes (up to 32)
+can be set before calling bind().
+
+DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet
+size (application payload size) in bytes, see RFC 4340, section 14.
+
+DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold
+timewait state when closing the connection (RFC 4340, 8.3). The usual case is
+that the closing server sends a CloseReq, whereupon the client holds timewait
+state. When this boolean socket option is on, the server sends a Close instead
+and will enter TIMEWAIT. This option must be set after accept() returns.
+
+DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the
+partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums
+always cover the entire packet and that only fully covered application data is
+accepted by the receiver. Hence, when using this feature on the sender, it must
+be enabled at the receiver, too with suitable choice of CsCov.
+
+DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the
+ range 0..15 are acceptable. The default setting is 0 (full coverage),
+ values between 1..15 indicate partial coverage.
+DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it
+ sets a threshold, where again values 0..15 are acceptable. The default
+ of 0 means that all packets with a partial coverage will be discarded.
+ Values in the range 1..15 indicate that packets with minimally such a
+ coverage value are also acceptable. The higher the number, the more
+ restrictive this setting (see [RFC 4340, sec. 9.2.1]). Partial coverage
+ settings are inherited to the child socket after accept().
+
+The following two options apply to CCID 3 exclusively and are getsockopt()-only.
+In either case, a TFRC info struct (defined in <linux/tfrc.h>) is returned.
+DCCP_SOCKOPT_CCID_RX_INFO
+ Returns a `struct tfrc_rx_info' in optval; the buffer for optval and
+ optlen must be set to at least sizeof(struct tfrc_rx_info).
+DCCP_SOCKOPT_CCID_TX_INFO
+ Returns a `struct tfrc_tx_info' in optval; the buffer for optval and
+ optlen must be set to at least sizeof(struct tfrc_tx_info).
+
+On unidirectional connections it is useful to close the unused half-connection
+via shutdown (SHUT_WR or SHUT_RD): this will reduce per-packet processing costs.
+
+Sysctl variables
+================
+Several DCCP default parameters can be managed by the following sysctls
+(sysctl net.dccp.default or /proc/sys/net/dccp/default):
+
+request_retries
+ The number of active connection initiation retries (the number of
+ Requests minus one) before timing out. In addition, it also governs
+ the behaviour of the other, passive side: this variable also sets
+ the number of times DCCP repeats sending a Response when the initial
+ handshake does not progress from RESPOND to OPEN (i.e. when no Ack
+ is received after the initial Request). This value should be greater
+ than 0, suggested is less than 10. Analogue of tcp_syn_retries.
+
+retries1
+ How often a DCCP Response is retransmitted until the listening DCCP
+ side considers its connecting peer dead. Analogue of tcp_retries1.
+
+retries2
+ The number of times a general DCCP packet is retransmitted. This has
+ importance for retransmitted acknowledgments and feature negotiation,
+ data packets are never retransmitted. Analogue of tcp_retries2.
+
+send_ndp = 1
+ Whether or not to send NDP count options (sec. 7.7.2).
+
+send_ackvec = 1
+ Whether or not to send Ack Vector options (sec. 11.5).
+
+ack_ratio = 2
+ The default Ack Ratio (sec. 11.3) to use.
+
+tx_ccid = 2
+ Default CCID for the sender-receiver half-connection.
+
+rx_ccid = 2
+ Default CCID for the receiver-sender half-connection.
+
+seq_window = 100
+ The initial sequence window (sec. 7.5.2).
+
+tx_qlen = 5
+ The size of the transmit buffer in packets. A value of 0 corresponds
+ to an unbounded transmit buffer.
+
+sync_ratelimit = 125 ms
+ The timeout between subsequent DCCP-Sync packets sent in response to
+ sequence-invalid packets on the same socket (RFC 4340, 7.5.4). The unit
+ of this parameter is milliseconds; a value of 0 disables rate-limiting.
+
+IOCTLS
+======
+FIONREAD
+ Works as in udp(7): returns in the `int' argument pointer the size of
+ the next pending datagram in bytes, or 0 when no datagram is pending.
+
+Notes
+=====
+
+DCCP does not travel through NAT successfully at present on many boxes. This is
+because the checksum covers the pseudo-header as per TCP and UDP. Linux NAT
+support for DCCP has been added.
diff --git a/Documentation/networking/de4x5.txt b/Documentation/networking/de4x5.txt
new file mode 100644
index 0000000..c8e4ca9
--- /dev/null
+++ b/Documentation/networking/de4x5.txt
@@ -0,0 +1,178 @@
+ Originally, this driver was written for the Digital Equipment
+ Corporation series of EtherWORKS Ethernet cards:
+
+ DE425 TP/COAX EISA
+ DE434 TP PCI
+ DE435 TP/COAX/AUI PCI
+ DE450 TP/COAX/AUI PCI
+ DE500 10/100 PCI Fasternet
+
+ but it will now attempt to support all cards which conform to the
+ Digital Semiconductor SROM Specification. The driver currently
+ recognises the following chips:
+
+ DC21040 (no SROM)
+ DC21041[A]
+ DC21140[A]
+ DC21142
+ DC21143
+
+ So far the driver is known to work with the following cards:
+
+ KINGSTON
+ Linksys
+ ZNYX342
+ SMC8432
+ SMC9332 (w/new SROM)
+ ZNYX31[45]
+ ZNYX346 10/100 4 port (can act as a 10/100 bridge!)
+
+ The driver has been tested on a relatively busy network using the DE425,
+ DE434, DE435 and DE500 cards and benchmarked with 'ttcp': it transferred
+ 16M of data to a DECstation 5000/200 as follows:
+
+ TCP UDP
+ TX RX TX RX
+ DE425 1030k 997k 1170k 1128k
+ DE434 1063k 995k 1170k 1125k
+ DE435 1063k 995k 1170k 1125k
+ DE500 1063k 998k 1170k 1125k in 10Mb/s mode
+
+ All values are typical (in kBytes/sec) from a sample of 4 for each
+ measurement. Their error is +/-20k on a quiet (private) network and also
+ depend on what load the CPU has.
+
+ =========================================================================
+
+ The ability to load this driver as a loadable module has been included
+ and used extensively during the driver development (to save those long
+ reboot sequences). Loadable module support under PCI and EISA has been
+ achieved by letting the driver autoprobe as if it were compiled into the
+ kernel. Do make sure you're not sharing interrupts with anything that
+ cannot accommodate interrupt sharing!
+
+ To utilise this ability, you have to do 8 things:
+
+ 0) have a copy of the loadable modules code installed on your system.
+ 1) copy de4x5.c from the /linux/drivers/net directory to your favourite
+ temporary directory.
+ 2) for fixed autoprobes (not recommended), edit the source code near
+ line 5594 to reflect the I/O address you're using, or assign these when
+ loading by:
+
+ insmod de4x5 io=0xghh where g = bus number
+ hh = device number
+
+ NB: autoprobing for modules is now supported by default. You may just
+ use:
+
+ insmod de4x5
+
+ to load all available boards. For a specific board, still use
+ the 'io=?' above.
+ 3) compile de4x5.c, but include -DMODULE in the command line to ensure
+ that the correct bits are compiled (see end of source code).
+ 4) if you are wanting to add a new card, goto 5. Otherwise, recompile a
+ kernel with the de4x5 configuration turned off and reboot.
+ 5) insmod de4x5 [io=0xghh]
+ 6) run the net startup bits for your new eth?? interface(s) manually
+ (usually /etc/rc.inet[12] at boot time).
+ 7) enjoy!
+
+ To unload a module, turn off the associated interface(s)
+ 'ifconfig eth?? down' then 'rmmod de4x5'.
+
+ Automedia detection is included so that in principle you can disconnect
+ from, e.g. TP, reconnect to BNC and things will still work (after a
+ pause whilst the driver figures out where its media went). My tests
+ using ping showed that it appears to work....
+
+ By default, the driver will now autodetect any DECchip based card.
+ Should you have a need to restrict the driver to DIGITAL only cards, you
+ can compile with a DEC_ONLY define, or if loading as a module, use the
+ 'dec_only=1' parameter.
+
+ I've changed the timing routines to use the kernel timer and scheduling
+ functions so that the hangs and other assorted problems that occurred
+ while autosensing the media should be gone. A bonus for the DC21040
+ auto media sense algorithm is that it can now use one that is more in
+ line with the rest (the DC21040 chip doesn't have a hardware timer).
+ The downside is the 1 'jiffies' (10ms) resolution.
+
+ IEEE 802.3u MII interface code has been added in anticipation that some
+ products may use it in the future.
+
+ The SMC9332 card has a non-compliant SROM which needs fixing - I have
+ patched this driver to detect it because the SROM format used complies
+ to a previous DEC-STD format.
+
+ I have removed the buffer copies needed for receive on Intels. I cannot
+ remove them for Alphas since the Tulip hardware only does longword
+ aligned DMA transfers and the Alphas get alignment traps with non
+ longword aligned data copies (which makes them really slow). No comment.
+
+ I have added SROM decoding routines to make this driver work with any
+ card that supports the Digital Semiconductor SROM spec. This will help
+ all cards running the dc2114x series chips in particular. Cards using
+ the dc2104x chips should run correctly with the basic driver. I'm in
+ debt to <mjacob@feral.com> for the testing and feedback that helped get
+ this feature working. So far we have tested KINGSTON, SMC8432, SMC9332
+ (with the latest SROM complying with the SROM spec V3: their first was
+ broken), ZNYX342 and LinkSys. ZNYX314 (dual 21041 MAC) and ZNYX 315
+ (quad 21041 MAC) cards also appear to work despite their incorrectly
+ wired IRQs.
+
+ I have added a temporary fix for interrupt problems when some SCSI cards
+ share the same interrupt as the DECchip based cards. The problem occurs
+ because the SCSI card wants to grab the interrupt as a fast interrupt
+ (runs the service routine with interrupts turned off) vs. this card
+ which really needs to run the service routine with interrupts turned on.
+ This driver will now add the interrupt service routine as a fast
+ interrupt if it is bounced from the slow interrupt. THIS IS NOT A
+ RECOMMENDED WAY TO RUN THE DRIVER and has been done for a limited time
+ until people sort out their compatibility issues and the kernel
+ interrupt service code is fixed. YOU SHOULD SEPARATE OUT THE FAST
+ INTERRUPT CARDS FROM THE SLOW INTERRUPT CARDS to ensure that they do not
+ run on the same interrupt. PCMCIA/CardBus is another can of worms...
+
+ Finally, I think I have really fixed the module loading problem with
+ more than one DECchip based card. As a side effect, I don't mess with
+ the device structure any more which means that if more than 1 card in
+ 2.0.x is installed (4 in 2.1.x), the user will have to edit
+ linux/drivers/net/Space.c to make room for them. Hence, module loading
+ is the preferred way to use this driver, since it doesn't have this
+ limitation.
+
+ Where SROM media detection is used and full duplex is specified in the
+ SROM, the feature is ignored unless lp->params.fdx is set at compile
+ time OR during a module load (insmod de4x5 args='eth??:fdx' [see
+ below]). This is because there is no way to automatically detect full
+ duplex links except through autonegotiation. When I include the
+ autonegotiation feature in the SROM autoconf code, this detection will
+ occur automatically for that case.
+
+ Command line arguments are now allowed, similar to passing arguments
+ through LILO. This will allow a per adapter board set up of full duplex
+ and media. The only lexical constraints are: the board name (dev->name)
+ appears in the list before its parameters. The list of parameters ends
+ either at the end of the parameter list or with another board name. The
+ following parameters are allowed:
+
+ fdx for full duplex
+ autosense to set the media/speed; with the following
+ sub-parameters:
+ TP, TP_NW, BNC, AUI, BNC_AUI, 100Mb, 10Mb, AUTO
+
+ Case sensitivity is important for the sub-parameters. They *must* be
+ upper case. Examples:
+
+ insmod de4x5 args='eth1:fdx autosense=BNC eth0:autosense=100Mb'.
+
+ For a compiled in driver, in linux/drivers/net/CONFIG, place e.g.
+ DE4X5_OPTS = -DDE4X5_PARM='"eth0:fdx autosense=AUI eth2:autosense=TP"'
+
+ Yes, I know full duplex isn't permissible on BNC or AUI; they're just
+ examples. By default, full duplex is turned off and AUTO is the default
+ autosense setting. In reality, I expect only the full duplex option to
+ be used. Note the use of single quotes in the two examples above and the
+ lack of commas to separate items.
diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt
new file mode 100644
index 0000000..d896895
--- /dev/null
+++ b/Documentation/networking/decnet.txt
@@ -0,0 +1,232 @@
+ Linux DECnet Networking Layer Information
+ ===========================================
+
+1) Other documentation....
+
+ o Project Home Pages
+ http://www.chygwyn.com/DECnet/ - Kernel info
+ http://linux-decnet.sourceforge.net/ - Userland tools
+ http://www.sourceforge.net/projects/linux-decnet/ - Status page
+
+2) Configuring the kernel
+
+Be sure to turn on the following options:
+
+ CONFIG_DECNET (obviously)
+ CONFIG_PROC_FS (to see what's going on)
+ CONFIG_SYSCTL (for easy configuration)
+
+if you want to try out router support (not properly debugged yet)
+you'll need the following options as well...
+
+ CONFIG_DECNET_ROUTER (to be able to add/delete routes)
+ CONFIG_NETFILTER (will be required for the DECnet routing daemon)
+
+ CONFIG_DECNET_ROUTE_FWMARK is optional
+
+Don't turn on SIOCGIFCONF support for DECnet unless you are really sure
+that you need it, in general you won't and it can cause ifconfig to
+malfunction.
+
+Run time configuration has changed slightly from the 2.4 system. If you
+want to configure an endnode, then the simplified procedure is as follows:
+
+ o Set the MAC address on your ethernet card before starting _any_ other
+ network protocols.
+
+As soon as your network card is brought into the UP state, DECnet should
+start working. If you need something more complicated or are unsure how
+to set the MAC address, see the next section. Also all configurations which
+worked with 2.4 will work under 2.5 with no change.
+
+3) Command line options
+
+You can set a DECnet address on the kernel command line for compatibility
+with the 2.4 configuration procedure, but in general it's not needed any more.
+If you do st a DECnet address on the command line, it has only one purpose
+which is that its added to the addresses on the loopback device.
+
+With 2.4 kernels, DECnet would only recognise addresses as local if they
+were added to the loopback device. In 2.5, any local interface address
+can be used to loop back to the local machine. Of course this does not
+prevent you adding further addresses to the loopback device if you
+want to.
+
+N.B. Since the address list of an interface determines the addresses for
+which "hello" messages are sent, if you don't set an address on the loopback
+interface then you won't see any entries in /proc/net/neigh for the local
+host until such time as you start a connection. This doesn't affect the
+operation of the local communications in any other way though.
+
+The kernel command line takes options looking like the following:
+
+ decnet.addr=1,2
+
+the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels
+and early 2.3.xx kernels, you must use a comma when specifying the
+DECnet address like this. For more recent 2.3.xx kernels, you may
+use almost any character except space, although a `.` would be the most
+obvious choice :-)
+
+There used to be a third number specifying the node type. This option
+has gone away in favour of a per interface node type. This is now set
+using /proc/sys/net/decnet/conf/<dev>/forwarding. This file can be
+set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router.
+
+There are also equivalent options for modules. The node address can
+also be set through the /proc/sys/net/decnet/ files, as can other system
+parameters.
+
+Currently the only supported devices are ethernet and ip_gre. The
+ethernet address of your ethernet card has to be set according to the DECnet
+address of the node in order for it to be autoconfigured (and then appear in
+/proc/net/decnet_dev). There is a utility available at the above
+FTP sites called dn2ethaddr which can compute the correct ethernet
+address to use. The address can be set by ifconfig either before or
+at the time the device is brought up. If you are using RedHat you can
+add the line:
+
+ MACADDR=AA:00:04:00:03:04
+
+or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or
+wherever your network card's configuration lives. Setting the MAC address
+of your ethernet card to an address starting with "hi-ord" will cause a
+DECnet address which matches to be added to the interface (which you can
+verify with iproute2).
+
+The default device for routing can be set through the /proc filesystem
+by setting /proc/sys/net/decnet/default_device to the
+device you want DECnet to route packets out of when no specific route
+is available. Usually this will be eth0, for example:
+
+ echo -n "eth0" >/proc/sys/net/decnet/default_device
+
+If you don't set the default device, then it will default to the first
+ethernet card which has been autoconfigured as described above. You can
+confirm that by looking in the default_device file of course.
+
+There is a list of what the other files under /proc/sys/net/decnet/ do
+on the kernel patch web site (shown above).
+
+4) Run time kernel configuration
+
+This is either done through the sysctl/proc interface (see the kernel web
+pages for details on what the various options do) or through the iproute2
+package in the same way as IPv4/6 configuration is performed.
+
+Documentation for iproute2 is included with the package, although there is
+as yet no specific section on DECnet, most of the features apply to both
+IP and DECnet, albeit with DECnet addresses instead of IP addresses and
+a reduced functionality.
+
+If you want to configure a DECnet router you'll need the iproute2 package
+since its the _only_ way to add and delete routes currently. Eventually
+there will be a routing daemon to send and receive routing messages for
+each interface and update the kernel routing tables accordingly. The
+routing daemon will use netfilter to listen to routing packets, and
+rtnetlink to update the kernels routing tables.
+
+The DECnet raw socket layer has been removed since it was there purely
+for use by the routing daemon which will now use netfilter (a much cleaner
+and more generic solution) instead.
+
+5) How can I tell if its working ?
+
+Here is a quick guide of what to look for in order to know if your DECnet
+kernel subsystem is working.
+
+ - Is the node address set (see /proc/sys/net/decnet/node_address)
+ - Is the node of the correct type
+ (see /proc/sys/net/decnet/conf/<dev>/forwarding)
+ - Is the Ethernet MAC address of each Ethernet card set to match
+ the DECnet address. If in doubt use the dn2ethaddr utility available
+ at the ftp archive.
+ - If the previous two steps are satisfied, and the Ethernet card is up,
+ you should find that it is listed in /proc/net/decnet_dev and also
+ that it appears as a directory in /proc/sys/net/decnet/conf/. The
+ loopback device (lo) should also appear and is required to communicate
+ within a node.
+ - If you have any DECnet routers on your network, they should appear
+ in /proc/net/decnet_neigh, otherwise this file will only contain the
+ entry for the node itself (if it doesn't check to see if lo is up).
+ - If you want to send to any node which is not listed in the
+ /proc/net/decnet_neigh file, you'll need to set the default device
+ to point to an Ethernet card with connection to a router. This is
+ again done with the /proc/sys/net/decnet/default_device file.
+ - Try starting a simple server and client, like the dnping/dnmirror
+ over the loopback interface. With luck they should communicate.
+ For this step and those after, you'll need the DECnet library
+ which can be obtained from the above ftp sites as well as the
+ actual utilities themselves.
+ - If this seems to work, then try talking to a node on your local
+ network, and see if you can obtain the same results.
+ - At this point you are on your own... :-)
+
+6) How to send a bug report
+
+If you've found a bug and want to report it, then there are several things
+you can do to help me work out exactly what it is that is wrong. Useful
+information (_most_ of which _is_ _essential_) includes:
+
+ - What kernel version are you running ?
+ - What version of the patch are you running ?
+ - How far though the above set of tests can you get ?
+ - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ?
+ - Which services are you running ?
+ - Which client caused the problem ?
+ - How much data was being transferred ?
+ - Was the network congested ?
+ - How can the problem be reproduced ?
+ - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of
+ tcpdump don't understand how to dump DECnet properly, so including
+ the hex listing of the packet contents is _essential_, usually the -x flag.
+ You may also need to increase the length grabbed with the -s flag. The
+ -e flag also provides very useful information (ethernet MAC addresses))
+
+7) MAC FAQ
+
+A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet
+interact and how to get the best performance from your hardware.
+
+Ethernet cards are designed to normally only pass received network frames
+to a host computer when they are addressed to it, or to the broadcast address.
+
+Linux has an interface which allows the setting of extra addresses for
+an ethernet card to listen to. If the ethernet card supports it, the
+filtering operation will be done in hardware, if not the extra unwanted packets
+received will be discarded by the host computer. In the latter case,
+significant processor time and bus bandwidth can be used up on a busy
+network (see the NAPI documentation for a longer explanation of these
+effects).
+
+DECnet makes use of this interface to allow running DECnet on an ethernet
+card which has already been configured using TCP/IP (presumably using the
+built in MAC address of the card, as usual) and/or to allow multiple DECnet
+addresses on each physical interface. If you do this, be aware that if your
+ethernet card doesn't support perfect hashing in its MAC address filter
+then your computer will be doing more work than required. Some cards
+will simply set themselves into promiscuous mode in order to receive
+packets from the DECnet specified addresses. So if you have one of these
+cards its better to set the MAC address of the card as described above
+to gain the best efficiency. Better still is to use a card which supports
+NAPI as well.
+
+
+8) Mailing list
+
+If you are keen to get involved in development, or want to ask questions
+about configuration, or even just report bugs, then there is a mailing
+list that you can join, details are at:
+
+http://sourceforge.net/mail/?group_id=4993
+
+9) Legal Info
+
+The Linux DECnet project team have placed their code under the GPL. The
+software is provided "as is" and without warranty express or implied.
+DECnet is a trademark of Compaq. This software is not a product of
+Compaq. We acknowledge the help of people at Compaq in providing extra
+documentation above and beyond what was previously publicly available.
+
+Steve Whitehouse <SteveW@ACM.org>
+
diff --git a/Documentation/networking/depca.txt b/Documentation/networking/depca.txt
new file mode 100644
index 0000000..24c6b26
--- /dev/null
+++ b/Documentation/networking/depca.txt
@@ -0,0 +1,92 @@
+
+DE10x
+=====
+
+Memory Addresses:
+
+ SW1 SW2 SW3 SW4
+64K on on on on d0000 dbfff
+ off on on on c0000 cbfff
+ off off on on e0000 ebfff
+
+32K on on off on d8000 dbfff
+ off on off on c8000 cbfff
+ off off off on e8000 ebfff
+
+DBR ROM on on dc000 dffff
+ off on cc000 cffff
+ off off ec000 effff
+
+Note that the 2K mode is set by SW3/SW4 on/off or off/off. Address
+assignment is through the RBSA register.
+
+I/O Address:
+ SW5
+0x300 on
+0x200 off
+
+Remote Boot:
+ SW6
+Disable on
+Enable off
+
+Remote Boot Timeout:
+ SW7
+2.5min on
+30s off
+
+IRQ:
+ SW8 SW9 SW10 SW11 SW12
+2 on off off off off
+3 off on off off off
+4 off off on off off
+5 off off off on off
+7 off off off off on
+
+DE20x
+=====
+
+Memory Size:
+
+ SW3 SW4
+64K on on
+32K off on
+2K on off
+2K off off
+
+Start Addresses:
+
+ SW1 SW2 SW3 SW4
+64K on on on on c0000 cffff
+ on off on on d0000 dffff
+ off on on on e0000 effff
+
+32K on on off off c8000 cffff
+ on off off off d8000 dffff
+ off on off off e8000 effff
+
+Illegal off off - - - -
+
+I/O Address:
+ SW5
+0x300 on
+0x200 off
+
+Remote Boot:
+ SW6
+Disable on
+Enable off
+
+Remote Boot Timeout:
+ SW7
+2.5min on
+30s off
+
+IRQ:
+ SW8 SW9 SW10 SW11 SW12
+5 on off off off off
+9 off on off off off
+10 off off on off off
+11 off off off on off
+15 off off off off on
+
diff --git a/Documentation/networking/dl2k.txt b/Documentation/networking/dl2k.txt
new file mode 100644
index 0000000..10e8490
--- /dev/null
+++ b/Documentation/networking/dl2k.txt
@@ -0,0 +1,281 @@
+
+ D-Link DL2000-based Gigabit Ethernet Adapter Installation
+ for Linux
+ May 23, 2002
+
+Contents
+========
+ - Compatibility List
+ - Quick Install
+ - Compiling the Driver
+ - Installing the Driver
+ - Option parameter
+ - Configuration Script Sample
+ - Troubleshooting
+
+
+Compatibility List
+=================
+Adapter Support:
+
+D-Link DGE-550T Gigabit Ethernet Adapter.
+D-Link DGE-550SX Gigabit Ethernet Adapter.
+D-Link DL2000-based Gigabit Ethernet Adapter.
+
+
+The driver support Linux kernel 2.4.7 later. We had tested it
+on the environments below.
+
+ . Red Hat v6.2 (update kernel to 2.4.7)
+ . Red Hat v7.0 (update kernel to 2.4.7)
+ . Red Hat v7.1 (kernel 2.4.7)
+ . Red Hat v7.2 (kernel 2.4.7-10)
+
+
+Quick Install
+=============
+Install linux driver as following command:
+
+1. make all
+2. insmod dl2k.ko
+3. ifconfig eth0 up 10.xxx.xxx.xxx netmask 255.0.0.0
+ ^^^^^^^^^^^^^^^\ ^^^^^^^^\
+ IP NETMASK
+Now eth0 should active, you can test it by "ping" or get more information by
+"ifconfig". If tested ok, continue the next step.
+
+4. cp dl2k.ko /lib/modules/`uname -r`/kernel/drivers/net
+5. Add the following line to /etc/modprobe.conf:
+ alias eth0 dl2k
+6. Run "netconfig" or "netconf" to create configuration script ifcfg-eth0
+ located at /etc/sysconfig/network-scripts or create it manually.
+ [see - Configuration Script Sample]
+7. Driver will automatically load and configure at next boot time.
+
+Compiling the Driver
+====================
+ In Linux, NIC drivers are most commonly configured as loadable modules.
+The approach of building a monolithic kernel has become obsolete. The driver
+can be compiled as part of a monolithic kernel, but is strongly discouraged.
+The remainder of this section assumes the driver is built as a loadable module.
+In the Linux environment, it is a good idea to rebuild the driver from the
+source instead of relying on a precompiled version. This approach provides
+better reliability since a precompiled driver might depend on libraries or
+kernel features that are not present in a given Linux installation.
+
+The 3 files necessary to build Linux device driver are dl2k.c, dl2k.h and
+Makefile. To compile, the Linux installation must include the gcc compiler,
+the kernel source, and the kernel headers. The Linux driver supports Linux
+Kernels 2.4.7. Copy the files to a directory and enter the following command
+to compile and link the driver:
+
+CD-ROM drive
+------------
+
+[root@XXX /] mkdir cdrom
+[root@XXX /] mount -r -t iso9660 -o conv=auto /dev/cdrom /cdrom
+[root@XXX /] cd root
+[root@XXX /root] mkdir dl2k
+[root@XXX /root] cd dl2k
+[root@XXX dl2k] cp /cdrom/linux/dl2k.tgz /root/dl2k
+[root@XXX dl2k] tar xfvz dl2k.tgz
+[root@XXX dl2k] make all
+
+Floppy disc drive
+-----------------
+
+[root@XXX /] cd root
+[root@XXX /root] mkdir dl2k
+[root@XXX /root] cd dl2k
+[root@XXX dl2k] mcopy a:/linux/dl2k.tgz /root/dl2k
+[root@XXX dl2k] tar xfvz dl2k.tgz
+[root@XXX dl2k] make all
+
+Installing the Driver
+=====================
+
+ Manual Installation
+ -------------------
+ Once the driver has been compiled, it must be loaded, enabled, and bound
+ to a protocol stack in order to establish network connectivity. To load a
+ module enter the command:
+
+ insmod dl2k.o
+
+ or
+
+ insmod dl2k.o <optional parameter> ; add parameter
+
+ ===============================================================
+ example: insmod dl2k.o media=100mbps_hd
+ or insmod dl2k.o media=3
+ or insmod dl2k.o media=3,2 ; for 2 cards
+ ===============================================================
+
+ Please reference the list of the command line parameters supported by
+ the Linux device driver below.
+
+ The insmod command only loads the driver and gives it a name of the form
+ eth0, eth1, etc. To bring the NIC into an operational state,
+ it is necessary to issue the following command:
+
+ ifconfig eth0 up
+
+ Finally, to bind the driver to the active protocol (e.g., TCP/IP with
+ Linux), enter the following command:
+
+ ifup eth0
+
+ Note that this is meaningful only if the system can find a configuration
+ script that contains the necessary network information. A sample will be
+ given in the next paragraph.
+
+ The commands to unload a driver are as follows:
+
+ ifdown eth0
+ ifconfig eth0 down
+ rmmod dl2k.o
+
+ The following are the commands to list the currently loaded modules and
+ to see the current network configuration.
+
+ lsmod
+ ifconfig
+
+
+ Automated Installation
+ ----------------------
+ This section describes how to install the driver such that it is
+ automatically loaded and configured at boot time. The following description
+ is based on a Red Hat 6.0/7.0 distribution, but it can easily be ported to
+ other distributions as well.
+
+ Red Hat v6.x/v7.x
+ -----------------
+ 1. Copy dl2k.o to the network modules directory, typically
+ /lib/modules/2.x.x-xx/net or /lib/modules/2.x.x/kernel/drivers/net.
+ 2. Locate the boot module configuration file, most commonly modprobe.conf
+ or modules.conf (for 2.4) in the /etc directory. Add the following lines:
+
+ alias ethx dl2k
+ options dl2k <optional parameters>
+
+ where ethx will be eth0 if the NIC is the only ethernet adapter, eth1 if
+ one other ethernet adapter is installed, etc. Refer to the table in the
+ previous section for the list of optional parameters.
+ 3. Locate the network configuration scripts, normally the
+ /etc/sysconfig/network-scripts directory, and create a configuration
+ script named ifcfg-ethx that contains network information.
+ 4. Note that for most Linux distributions, Red Hat included, a configuration
+ utility with a graphical user interface is provided to perform steps 2
+ and 3 above.
+
+
+Parameter Description
+=====================
+You can install this driver without any additional parameter. However, if you
+are going to have extensive functions then it is necessary to set extra
+parameter. Below is a list of the command line parameters supported by the
+Linux device
+driver.
+
+mtu=packet_size - Specifies the maximum packet size. default
+ is 1500.
+
+media=media_type - Specifies the media type the NIC operates at.
+ autosense Autosensing active media.
+ 10mbps_hd 10Mbps half duplex.
+ 10mbps_fd 10Mbps full duplex.
+ 100mbps_hd 100Mbps half duplex.
+ 100mbps_fd 100Mbps full duplex.
+ 1000mbps_fd 1000Mbps full duplex.
+ 1000mbps_hd 1000Mbps half duplex.
+ 0 Autosensing active media.
+ 1 10Mbps half duplex.
+ 2 10Mbps full duplex.
+ 3 100Mbps half duplex.
+ 4 100Mbps full duplex.
+ 5 1000Mbps half duplex.
+ 6 1000Mbps full duplex.
+
+ By default, the NIC operates at autosense.
+ 1000mbps_fd and 1000mbps_hd types are only
+ available for fiber adapter.
+
+vlan=n - Specifies the VLAN ID. If vlan=0, the
+ Virtual Local Area Network (VLAN) function is
+ disable.
+
+jumbo=[0|1] - Specifies the jumbo frame support. If jumbo=1,
+ the NIC accept jumbo frames. By default, this
+ function is disabled.
+ Jumbo frame usually improve the performance
+ int gigabit.
+ This feature need jumbo frame compatible
+ remote.
+
+rx_coalesce=m - Number of rx frame handled each interrupt.
+rx_timeout=n - Rx DMA wait time for an interrupt.
+ If set rx_coalesce > 0, hardware only assert
+ an interrupt for m frames. Hardware won't
+ assert rx interrupt until m frames received or
+ reach timeout of n * 640 nano seconds.
+ Set proper rx_coalesce and rx_timeout can
+ reduce congestion collapse and overload which
+ has been a bottleneck for high speed network.
+
+ For example, rx_coalesce=10 rx_timeout=800.
+ that is, hardware assert only 1 interrupt
+ for 10 frames received or timeout of 512 us.
+
+tx_coalesce=n - Number of tx frame handled each interrupt.
+ Set n > 1 can reduce the interrupts
+ congestion usually lower performance of
+ high speed network card. Default is 16.
+
+tx_flow=[1|0] - Specifies the Tx flow control. If tx_flow=0,
+ the Tx flow control disable else driver
+ autodetect.
+rx_flow=[1|0] - Specifies the Rx flow control. If rx_flow=0,
+ the Rx flow control enable else driver
+ autodetect.
+
+
+Configuration Script Sample
+===========================
+Here is a sample of a simple configuration script:
+
+DEVICE=eth0
+USERCTL=no
+ONBOOT=yes
+POOTPROTO=none
+BROADCAST=207.200.5.255
+NETWORK=207.200.5.0
+NETMASK=255.255.255.0
+IPADDR=207.200.5.2
+
+
+Troubleshooting
+===============
+Q1. Source files contain ^ M behind every line.
+ Make sure all files are Unix file format (no LF). Try the following
+ shell command to convert files.
+
+ cat dl2k.c | col -b > dl2k.tmp
+ mv dl2k.tmp dl2k.c
+
+ OR
+
+ cat dl2k.c | tr -d "\r" > dl2k.tmp
+ mv dl2k.tmp dl2k.c
+
+Q2: Could not find header files (*.h) ?
+ To compile the driver, you need kernel header files. After
+ installing the kernel source, the header files are usually located in
+ /usr/src/linux/include, which is the default include directory configured
+ in Makefile. For some distributions, there is a copy of header files in
+ /usr/src/include/linux and /usr/src/include/asm, that you can change the
+ INCLUDEDIR in Makefile to /usr/include without installing kernel source.
+ Note that RH 7.0 didn't provide correct header files in /usr/include,
+ including those files will make a wrong version driver.
+
diff --git a/Documentation/networking/dm9000.txt b/Documentation/networking/dm9000.txt
new file mode 100644
index 0000000..65df3de
--- /dev/null
+++ b/Documentation/networking/dm9000.txt
@@ -0,0 +1,167 @@
+DM9000 Network driver
+=====================
+
+Copyright 2008 Simtec Electronics,
+ Ben Dooks <ben@simtec.co.uk> <ben-linux@fluff.org>
+
+
+Introduction
+------------
+
+This file describes how to use the DM9000 platform-device based network driver
+that is contained in the files drivers/net/dm9000.c and drivers/net/dm9000.h.
+
+The driver supports three DM9000 variants, the DM9000E which is the first chip
+supported as well as the newer DM9000A and DM9000B devices. It is currently
+maintained and tested by Ben Dooks, who should be CC: to any patches for this
+driver.
+
+
+Defining the platform device
+----------------------------
+
+The minimum set of resources attached to the platform device are as follows:
+
+ 1) The physical address of the address register
+ 2) The physical address of the data register
+ 3) The IRQ line the device's interrupt pin is connected to.
+
+These resources should be specified in that order, as the ordering of the
+two address regions is important (the driver expects these to be address
+and then data).
+
+An example from arch/arm/mach-s3c2410/mach-bast.c is:
+
+static struct resource bast_dm9k_resource[] = {
+ [0] = {
+ .start = S3C2410_CS5 + BAST_PA_DM9000,
+ .end = S3C2410_CS5 + BAST_PA_DM9000 + 3,
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .start = S3C2410_CS5 + BAST_PA_DM9000 + 0x40,
+ .end = S3C2410_CS5 + BAST_PA_DM9000 + 0x40 + 0x3f,
+ .flags = IORESOURCE_MEM,
+ },
+ [2] = {
+ .start = IRQ_DM9000,
+ .end = IRQ_DM9000,
+ .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
+ }
+};
+
+static struct platform_device bast_device_dm9k = {
+ .name = "dm9000",
+ .id = 0,
+ .num_resources = ARRAY_SIZE(bast_dm9k_resource),
+ .resource = bast_dm9k_resource,
+};
+
+Note the setting of the IRQ trigger flag in bast_dm9k_resource[2].flags,
+as this will generate a warning if it is not present. The trigger from
+the flags field will be passed to request_irq() when registering the IRQ
+handler to ensure that the IRQ is setup correctly.
+
+This shows a typical platform device, without the optional configuration
+platform data supplied. The next example uses the same resources, but adds
+the optional platform data to pass extra configuration data:
+
+static struct dm9000_plat_data bast_dm9k_platdata = {
+ .flags = DM9000_PLATF_16BITONLY,
+};
+
+static struct platform_device bast_device_dm9k = {
+ .name = "dm9000",
+ .id = 0,
+ .num_resources = ARRAY_SIZE(bast_dm9k_resource),
+ .resource = bast_dm9k_resource,
+ .dev = {
+ .platform_data = &bast_dm9k_platdata,
+ }
+};
+
+The platform data is defined in include/linux/dm9000.h and described below.
+
+
+Platform data
+-------------
+
+Extra platform data for the DM9000 can describe the IO bus width to the
+device, whether or not an external PHY is attached to the device and
+the availability of an external configuration EEPROM.
+
+The flags for the platform data .flags field are as follows:
+
+DM9000_PLATF_8BITONLY
+
+ The IO should be done with 8bit operations.
+
+DM9000_PLATF_16BITONLY
+
+ The IO should be done with 16bit operations.
+
+DM9000_PLATF_32BITONLY
+
+ The IO should be done with 32bit operations.
+
+DM9000_PLATF_EXT_PHY
+
+ The chip is connected to an external PHY.
+
+DM9000_PLATF_NO_EEPROM
+
+ This can be used to signify that the board does not have an
+ EEPROM, or that the EEPROM should be hidden from the user.
+
+DM9000_PLATF_SIMPLE_PHY
+
+ Switch to using the simpler PHY polling method which does not
+ try and read the MII PHY state regularly. This is only available
+ when using the internal PHY. See the section on link state polling
+ for more information.
+
+ The config symbol DM9000_FORCE_SIMPLE_PHY_POLL, Kconfig entry
+ "Force simple NSR based PHY polling" allows this flag to be
+ forced on at build time.
+
+
+PHY Link state polling
+----------------------
+
+The driver keeps track of the link state and informs the network core
+about link (carrier) availablilty. This is managed by several methods
+depending on the version of the chip and on which PHY is being used.
+
+For the internal PHY, the original (and currently default) method is
+to read the MII state, either when the status changes if we have the
+necessary interrupt support in the chip or every two seconds via a
+periodic timer.
+
+To reduce the overhead for the internal PHY, there is now the option
+of using the DM9000_FORCE_SIMPLE_PHY_POLL config, or DM9000_PLATF_SIMPLE_PHY
+platform data option to read the summary information without the
+expensive MII accesses. This method is faster, but does not print
+as much information.
+
+When using an external PHY, the driver currently has to poll the MII
+link status as there is no method for getting an interrupt on link change.
+
+
+DM9000A / DM9000B
+-----------------
+
+These chips are functionally similar to the DM9000E and are supported easily
+by the same driver. The features are:
+
+ 1) Interrupt on internal PHY state change. This means that the periodic
+ polling of the PHY status may be disabled on these devices when using
+ the internal PHY.
+
+ 2) TCP/UDP checksum offloading, which the driver does not currently support.
+
+
+ethtool
+-------
+
+The driver supports the ethtool interface for access to the driver
+state information, the PHY state and the EEPROM.
diff --git a/Documentation/networking/dmfe.txt b/Documentation/networking/dmfe.txt
new file mode 100644
index 0000000..8006c22
--- /dev/null
+++ b/Documentation/networking/dmfe.txt
@@ -0,0 +1,65 @@
+Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver for Linux.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+
+This driver provides kernel support for Davicom DM9102(A)/DM9132/DM9801 ethernet cards ( CNET
+10/100 ethernet cards uses Davicom chipset too, so this driver supports CNET cards too ).If you
+didn't compile this driver as a module, it will automatically load itself on boot and print a
+line similar to :
+
+ dmfe: Davicom DM9xxx net driver, version 1.36.4 (2002-01-17)
+
+If you compiled this driver as a module, you have to load it on boot.You can load it with command :
+
+ insmod dmfe
+
+This way it will autodetect the device mode.This is the suggested way to load the module.Or you can pass
+a mode= setting to module while loading, like :
+
+ insmod dmfe mode=0 # Force 10M Half Duplex
+ insmod dmfe mode=1 # Force 100M Half Duplex
+ insmod dmfe mode=4 # Force 10M Full Duplex
+ insmod dmfe mode=5 # Force 100M Full Duplex
+
+Next you should configure your network interface with a command similar to :
+
+ ifconfig eth0 172.22.3.18
+ ^^^^^^^^^^^
+ Your IP Address
+
+Then you may have to modify the default routing table with command :
+
+ route add default eth0
+
+
+Now your ethernet card should be up and running.
+
+
+TODO:
+
+Implement pci_driver::suspend() and pci_driver::resume() power management methods.
+Check on 64 bit boxes.
+Check and fix on big endian boxes.
+Test and make sure PCI latency is now correct for all cases.
+
+
+Authors:
+
+Sten Wang <sten_wang@davicom.com.tw > : Original Author
+Tobias Ringstrom <tori@unhappy.mine.nu> : Current Maintainer
+
+Contributors:
+
+Marcelo Tosatti <marcelo@conectiva.com.br>
+Alan Cox <alan@lxorguk.ukuu.org.uk>
+Jeff Garzik <jgarzik@pobox.com>
+Vojtech Pavlik <vojtech@suse.cz>
diff --git a/Documentation/networking/driver.txt b/Documentation/networking/driver.txt
new file mode 100644
index 0000000..ea72d2e
--- /dev/null
+++ b/Documentation/networking/driver.txt
@@ -0,0 +1,94 @@
+Document about softnet driver issues
+
+Transmit path guidelines:
+
+1) The hard_start_xmit method must never return '1' under any
+ normal circumstances. It is considered a hard error unless
+ there is no way your device can tell ahead of time when it's
+ transmit function will become busy.
+
+ Instead it must maintain the queue properly. For example,
+ for a driver implementing scatter-gather this means:
+
+ static int drv_hard_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+ {
+ struct drv *dp = dev->priv;
+
+ lock_tx(dp);
+ ...
+ /* This is a hard error log it. */
+ if (TX_BUFFS_AVAIL(dp) <= (skb_shinfo(skb)->nr_frags + 1)) {
+ netif_stop_queue(dev);
+ unlock_tx(dp);
+ printk(KERN_ERR PFX "%s: BUG! Tx Ring full when queue awake!\n",
+ dev->name);
+ return 1;
+ }
+
+ ... queue packet to card ...
+ ... update tx consumer index ...
+
+ if (TX_BUFFS_AVAIL(dp) <= (MAX_SKB_FRAGS + 1))
+ netif_stop_queue(dev);
+
+ ...
+ unlock_tx(dp);
+ ...
+ }
+
+ And then at the end of your TX reclamation event handling:
+
+ if (netif_queue_stopped(dp->dev) &&
+ TX_BUFFS_AVAIL(dp) > (MAX_SKB_FRAGS + 1))
+ netif_wake_queue(dp->dev);
+
+ For a non-scatter-gather supporting card, the three tests simply become:
+
+ /* This is a hard error log it. */
+ if (TX_BUFFS_AVAIL(dp) <= 0)
+
+ and:
+
+ if (TX_BUFFS_AVAIL(dp) == 0)
+
+ and:
+
+ if (netif_queue_stopped(dp->dev) &&
+ TX_BUFFS_AVAIL(dp) > 0)
+ netif_wake_queue(dp->dev);
+
+2) Do not forget to update netdev->trans_start to jiffies after
+ each new tx packet is given to the hardware.
+
+3) A hard_start_xmit method must not modify the shared parts of a
+ cloned SKB.
+
+4) Do not forget that once you return 0 from your hard_start_xmit
+ method, it is your driver's responsibility to free up the SKB
+ and in some finite amount of time.
+
+ For example, this means that it is not allowed for your TX
+ mitigation scheme to let TX packets "hang out" in the TX
+ ring unreclaimed forever if no new TX packets are sent.
+ This error can deadlock sockets waiting for send buffer room
+ to be freed up.
+
+ If you return 1 from the hard_start_xmit method, you must not keep
+ any reference to that SKB and you must not attempt to free it up.
+
+Probing guidelines:
+
+1) Any hardware layer address you obtain for your device should
+ be verified. For example, for ethernet check it with
+ linux/etherdevice.h:is_valid_ether_addr()
+
+Close/stop guidelines:
+
+1) After the dev->stop routine has been called, the hardware must
+ not receive or transmit any data. All in flight packets must
+ be aborted. If necessary, poll or wait for completion of
+ any reset commands.
+
+2) The dev->stop routine will be called by unregister_netdevice
+ if device is still UP.
diff --git a/Documentation/networking/e100.txt b/Documentation/networking/e100.txt
new file mode 100644
index 0000000..944aa55
--- /dev/null
+++ b/Documentation/networking/e100.txt
@@ -0,0 +1,206 @@
+Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters
+==============================================================
+
+November 15, 2005
+
+Contents
+========
+
+- In This Release
+- Identifying Your Adapter
+- Building and Installation
+- Driver Configuration Parameters
+- Additional Configurations
+- Known Issues
+- Support
+
+
+In This Release
+===============
+
+This file describes the Linux* Base Driver for the Intel(R) PRO/100 Family of
+Adapters. This driver includes support for Itanium(R)2-based systems.
+
+For questions related to hardware requirements, refer to the documentation
+supplied with your Intel PRO/100 adapter.
+
+The following features are now available in supported kernels:
+ - Native VLANs
+ - Channel Bonding (teaming)
+ - SNMP
+
+Channel Bonding documentation can be found in the Linux kernel source:
+/Documentation/networking/bonding.txt
+
+
+Identifying Your Adapter
+========================
+
+For more information on how to identify your adapter, go to the Adapter &
+Driver ID Guide at:
+
+ http://support.intel.com/support/network/adapter/pro100/21397.htm
+
+For the latest Intel network drivers for Linux, refer to the following
+website. In the search field, enter your adapter name or type, or use the
+networking link on the left to search for your adapter:
+
+ http://downloadfinder.intel.com/scripts-df/support_intel.asp
+
+Driver Configuration Parameters
+===============================
+
+The default value for each parameter is generally the recommended setting,
+unless otherwise noted.
+
+Rx Descriptors: Number of receive descriptors. A receive descriptor is a data
+ structure that describes a receive buffer and its attributes to the network
+ controller. The data in the descriptor is used by the controller to write
+ data from the controller to host memory. In the 3.x.x driver the valid range
+ for this parameter is 64-256. The default value is 64. This parameter can be
+ changed using the command:
+
+ ethtool -G eth? rx n, where n is the number of desired rx descriptors.
+
+Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a data
+ structure that describes a transmit buffer and its attributes to the network
+ controller. The data in the descriptor is used by the controller to read
+ data from the host memory to the controller. In the 3.x.x driver the valid
+ range for this parameter is 64-256. The default value is 64. This parameter
+ can be changed using the command:
+
+ ethtool -G eth? tx n, where n is the number of desired tx descriptors.
+
+Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by
+ default. Ethtool can be used as follows to force speed/duplex.
+
+ ethtool -s eth? autoneg off speed {10|100} duplex {full|half}
+
+ NOTE: setting the speed/duplex to incorrect values will cause the link to
+ fail.
+
+Event Log Message Level: The driver uses the message level flag to log events
+ to syslog. The message level can be set at driver load time. It can also be
+ set using the command:
+
+ ethtool -s eth? msglvl n
+
+
+Additional Configurations
+=========================
+
+ Configuring the Driver on Different Distributions
+ -------------------------------------------------
+
+ Configuring a network driver to load properly when the system is started is
+ distribution dependent. Typically, the configuration process involves adding
+ an alias line to /etc/modules.conf or /etc/modprobe.conf as well as editing
+ other system startup scripts and/or configuration files. Many popular Linux
+ distributions ship with tools to make these changes for you. To learn the
+ proper way to configure a network device for your system, refer to your
+ distribution documentation. If during this process you are asked for the
+ driver or module name, the name for the Linux Base Driver for the Intel
+ PRO/100 Family of Adapters is e100.
+
+ As an example, if you install the e100 driver for two PRO/100 adapters
+ (eth0 and eth1), add the following to modules.conf or modprobe.conf:
+
+ alias eth0 e100
+ alias eth1 e100
+
+ Viewing Link Messages
+ ---------------------
+ In order to see link messages and other Intel driver information on your
+ console, you must set the dmesg level up to six. This can be done by
+ entering the following on the command line before loading the e100 driver:
+
+ dmesg -n 8
+
+ If you wish to see all messages issued by the driver, including debug
+ messages, set the dmesg level to eight.
+
+ NOTE: This setting is not saved across reboots.
+
+
+ Ethtool
+ -------
+
+ The driver utilizes the ethtool interface for driver configuration and
+ diagnostics, as well as displaying statistical information. Ethtool
+ version 1.6 or later is required for this functionality.
+
+ The latest release of ethtool can be found from
+ http://sourceforge.net/projects/gkernel.
+
+ NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support
+ for a more complete ethtool feature set can be enabled by upgrading
+ ethtool to ethtool-1.8.1.
+
+
+ Enabling Wake on LAN* (WoL)
+ ---------------------------
+ WoL is provided through the Ethtool* utility. Ethtool is included with Red
+ Hat* 8.0. For other Linux distributions, download and install Ethtool from
+ the following website: http://sourceforge.net/projects/gkernel.
+
+ For instructions on enabling WoL with Ethtool, refer to the Ethtool man page.
+
+ WoL will be enabled on the system during the next shut down or reboot. For
+ this driver version, in order to enable WoL, the e100 driver must be
+ loaded when shutting down or rebooting the system.
+
+
+ NAPI
+ ----
+
+ NAPI (Rx polling mode) is supported in the e100 driver.
+
+ See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
+
+ Multiple Interfaces on Same Ethernet Broadcast Network
+ ------------------------------------------------------
+
+ Due to the default ARP behavior on Linux, it is not possible to have
+ one system on two IP networks in the same Ethernet broadcast domain
+ (non-partitioned switch) behave as expected. All Ethernet interfaces
+ will respond to IP traffic for any IP address assigned to the system.
+ This results in unbalanced receive traffic.
+
+ If you have multiple interfaces in a server, either turn on ARP
+ filtering by
+
+ (1) entering: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
+ (this only works if your kernel's version is higher than 2.4.5), or
+
+ (2) installing the interfaces in separate broadcast domains (either
+ in different switches or in a switch partitioned to VLANs).
+
+
+Support
+=======
+
+For general information, go to the Intel support website at:
+
+ http://support.intel.com
+
+ or the Intel Wired Networking project hosted by Sourceforge at:
+
+ http://sourceforge.net/projects/e1000
+
+If an issue is identified with the released source code on the supported
+kernel with a supported adapter, email the specific information related to the
+issue to e1000-devel@lists.sourceforge.net.
+
+
+License
+=======
+
+This software program is released under the terms of a license agreement
+between you ('Licensee') and Intel. Do not use or load this software or any
+associated materials (collectively, the 'Software') until you have carefully
+read the full terms and conditions of the file COPYING located in this software
+package. By loading or using the Software, you agree to the terms of this
+Agreement. If you do not agree with the terms of this Agreement, do not install
+or use the Software.
+
+* Other names and brands may be claimed as the property of others.
diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.txt
new file mode 100644
index 0000000..2df7186
--- /dev/null
+++ b/Documentation/networking/e1000.txt
@@ -0,0 +1,642 @@
+Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters
+===============================================================
+
+September 26, 2006
+
+
+Contents
+========
+
+- In This Release
+- Identifying Your Adapter
+- Building and Installation
+- Command Line Parameters
+- Speed and Duplex Configuration
+- Additional Configurations
+- Known Issues
+- Support
+
+
+In This Release
+===============
+
+This file describes the Linux* Base Driver for the Intel(R) PRO/1000 Family
+of Adapters. This driver includes support for Itanium(R)2-based systems.
+
+For questions related to hardware requirements, refer to the documentation
+supplied with your Intel PRO/1000 adapter. All hardware requirements listed
+apply to use with Linux.
+
+The following features are now available in supported kernels:
+ - Native VLANs
+ - Channel Bonding (teaming)
+ - SNMP
+
+Channel Bonding documentation can be found in the Linux kernel source:
+/Documentation/networking/bonding.txt
+
+The driver information previously displayed in the /proc filesystem is not
+supported in this release. Alternatively, you can use ethtool (version 1.6
+or later), lspci, and ifconfig to obtain the same information.
+
+Instructions on updating ethtool can be found in the section "Additional
+Configurations" later in this document.
+
+NOTE: The Intel(R) 82562v 10/100 Network Connection only provides 10/100
+support.
+
+
+Identifying Your Adapter
+========================
+
+For more information on how to identify your adapter, go to the Adapter &
+Driver ID Guide at:
+
+ http://support.intel.com/support/network/adapter/pro100/21397.htm
+
+For the latest Intel network drivers for Linux, refer to the following
+website. In the search field, enter your adapter name or type, or use the
+networking link on the left to search for your adapter:
+
+ http://downloadfinder.intel.com/scripts-df/support_intel.asp
+
+
+Command Line Parameters
+=======================
+
+If the driver is built as a module, the following optional parameters
+are used by entering them on the command line with the modprobe command
+using this syntax:
+
+ modprobe e1000 [<option>=<VAL1>,<VAL2>,...]
+
+For example, with two PRO/1000 PCI adapters, entering:
+
+ modprobe e1000 TxDescriptors=80,128
+
+loads the e1000 driver with 80 TX descriptors for the first adapter and
+128 TX descriptors for the second adapter.
+
+The default value for each parameter is generally the recommended setting,
+unless otherwise noted.
+
+NOTES: For more information about the AutoNeg, Duplex, and Speed
+ parameters, see the "Speed and Duplex Configuration" section in
+ this document.
+
+ For more information about the InterruptThrottleRate,
+ RxIntDelay, TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay
+ parameters, see the application note at:
+ http://www.intel.com/design/network/applnots/ap450.htm
+
+ A descriptor describes a data buffer and attributes related to
+ the data buffer. This information is accessed by the hardware.
+
+
+AutoNeg
+-------
+(Supported only on adapters with copper connections)
+Valid Range: 0x01-0x0F, 0x20-0x2F
+Default Value: 0x2F
+
+This parameter is a bit-mask that specifies the speed and duplex settings
+advertised by the adapter. When this parameter is used, the Speed and
+Duplex parameters must not be specified.
+
+NOTE: Refer to the Speed and Duplex section of this readme for more
+ information on the AutoNeg parameter.
+
+
+Duplex
+------
+(Supported only on adapters with copper connections)
+Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full)
+Default Value: 0
+
+This defines the direction in which data is allowed to flow. Can be
+either one or two-directional. If both Duplex and the link partner are
+set to auto-negotiate, the board auto-detects the correct duplex. If the
+link partner is forced (either full or half), Duplex defaults to half-
+duplex.
+
+
+FlowControl
+-----------
+Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx)
+Default Value: Reads flow control settings from the EEPROM
+
+This parameter controls the automatic generation(Tx) and response(Rx)
+to Ethernet PAUSE frames.
+
+
+InterruptThrottleRate
+---------------------
+(not supported on Intel(R) 82542, 82543 or 82544-based adapters)
+Valid Range: 0,1,3,100-100000 (0=off, 1=dynamic, 3=dynamic conservative)
+Default Value: 3
+
+The driver can limit the amount of interrupts per second that the adapter
+will generate for incoming packets. It does this by writing a value to the
+adapter that is based on the maximum amount of interrupts that the adapter
+will generate per second.
+
+Setting InterruptThrottleRate to a value greater or equal to 100
+will program the adapter to send out a maximum of that many interrupts
+per second, even if more packets have come in. This reduces interrupt
+load on the system and can lower CPU utilization under heavy load,
+but will increase latency as packets are not processed as quickly.
+
+The default behaviour of the driver previously assumed a static
+InterruptThrottleRate value of 8000, providing a good fallback value for
+all traffic types,but lacking in small packet performance and latency.
+The hardware can handle many more small packets per second however, and
+for this reason an adaptive interrupt moderation algorithm was implemented.
+
+Since 7.3.x, the driver has two adaptive modes (setting 1 or 3) in which
+it dynamically adjusts the InterruptThrottleRate value based on the traffic
+that it receives. After determining the type of incoming traffic in the last
+timeframe, it will adjust the InterruptThrottleRate to an appropriate value
+for that traffic.
+
+The algorithm classifies the incoming traffic every interval into
+classes. Once the class is determined, the InterruptThrottleRate value is
+adjusted to suit that traffic type the best. There are three classes defined:
+"Bulk traffic", for large amounts of packets of normal size; "Low latency",
+for small amounts of traffic and/or a significant percentage of small
+packets; and "Lowest latency", for almost completely small packets or
+minimal traffic.
+
+In dynamic conservative mode, the InterruptThrottleRate value is set to 4000
+for traffic that falls in class "Bulk traffic". If traffic falls in the "Low
+latency" or "Lowest latency" class, the InterruptThrottleRate is increased
+stepwise to 20000. This default mode is suitable for most applications.
+
+For situations where low latency is vital such as cluster or
+grid computing, the algorithm can reduce latency even more when
+InterruptThrottleRate is set to mode 1. In this mode, which operates
+the same as mode 3, the InterruptThrottleRate will be increased stepwise to
+70000 for traffic in class "Lowest latency".
+
+Setting InterruptThrottleRate to 0 turns off any interrupt moderation
+and may improve small packet latency, but is generally not suitable
+for bulk throughput traffic.
+
+NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and
+ RxAbsIntDelay parameters. In other words, minimizing the receive
+ and/or transmit absolute delays does not force the controller to
+ generate more interrupts than what the Interrupt Throttle Rate
+ allows.
+
+CAUTION: If you are using the Intel(R) PRO/1000 CT Network Connection
+ (controller 82547), setting InterruptThrottleRate to a value
+ greater than 75,000, may hang (stop transmitting) adapters
+ under certain network conditions. If this occurs a NETDEV
+ WATCHDOG message is logged in the system event log. In
+ addition, the controller is automatically reset, restoring
+ the network connection. To eliminate the potential for the
+ hang, ensure that InterruptThrottleRate is set no greater
+ than 75,000 and is not set to 0.
+
+NOTE: When e1000 is loaded with default settings and multiple adapters
+ are in use simultaneously, the CPU utilization may increase non-
+ linearly. In order to limit the CPU utilization without impacting
+ the overall throughput, we recommend that you load the driver as
+ follows:
+
+ modprobe e1000 InterruptThrottleRate=3000,3000,3000
+
+ This sets the InterruptThrottleRate to 3000 interrupts/sec for
+ the first, second, and third instances of the driver. The range
+ of 2000 to 3000 interrupts per second works on a majority of
+ systems and is a good starting point, but the optimal value will
+ be platform-specific. If CPU utilization is not a concern, use
+ RX_POLLING (NAPI) and default driver settings.
+
+
+
+RxDescriptors
+-------------
+Valid Range: 80-256 for 82542 and 82543-based adapters
+ 80-4096 for all other supported adapters
+Default Value: 256
+
+This value specifies the number of receive buffer descriptors allocated
+by the driver. Increasing this value allows the driver to buffer more
+incoming packets, at the expense of increased system memory utilization.
+
+Each descriptor is 16 bytes. A receive buffer is also allocated for each
+descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending
+on the MTU setting. The maximum MTU size is 16110.
+
+NOTE: MTU designates the frame size. It only needs to be set for Jumbo
+ Frames. Depending on the available system resources, the request
+ for a higher number of receive descriptors may be denied. In this
+ case, use a lower number.
+
+
+RxIntDelay
+----------
+Valid Range: 0-65535 (0=off)
+Default Value: 0
+
+This value delays the generation of receive interrupts in units of 1.024
+microseconds. Receive interrupt reduction can improve CPU efficiency if
+properly tuned for specific network traffic. Increasing this value adds
+extra latency to frame reception and can end up decreasing the throughput
+of TCP traffic. If the system is reporting dropped receives, this value
+may be set too high, causing the driver to run out of available receive
+descriptors.
+
+CAUTION: When setting RxIntDelay to a value other than 0, adapters may
+ hang (stop transmitting) under certain network conditions. If
+ this occurs a NETDEV WATCHDOG message is logged in the system
+ event log. In addition, the controller is automatically reset,
+ restoring the network connection. To eliminate the potential
+ for the hang ensure that RxIntDelay is set to 0.
+
+
+RxAbsIntDelay
+-------------
+(This parameter is supported only on 82540, 82545 and later adapters.)
+Valid Range: 0-65535 (0=off)
+Default Value: 128
+
+This value, in units of 1.024 microseconds, limits the delay in which a
+receive interrupt is generated. Useful only if RxIntDelay is non-zero,
+this value ensures that an interrupt is generated after the initial
+packet is received within the set amount of time. Proper tuning,
+along with RxIntDelay, may improve traffic throughput in specific network
+conditions.
+
+
+Speed
+-----
+(This parameter is supported only on adapters with copper connections.)
+Valid Settings: 0, 10, 100, 1000
+Default Value: 0 (auto-negotiate at all supported speeds)
+
+Speed forces the line speed to the specified value in megabits per second
+(Mbps). If this parameter is not specified or is set to 0 and the link
+partner is set to auto-negotiate, the board will auto-detect the correct
+speed. Duplex should also be set when Speed is set to either 10 or 100.
+
+
+TxDescriptors
+-------------
+Valid Range: 80-256 for 82542 and 82543-based adapters
+ 80-4096 for all other supported adapters
+Default Value: 256
+
+This value is the number of transmit descriptors allocated by the driver.
+Increasing this value allows the driver to queue more transmits. Each
+descriptor is 16 bytes.
+
+NOTE: Depending on the available system resources, the request for a
+ higher number of transmit descriptors may be denied. In this case,
+ use a lower number.
+
+
+TxIntDelay
+----------
+Valid Range: 0-65535 (0=off)
+Default Value: 64
+
+This value delays the generation of transmit interrupts in units of
+1.024 microseconds. Transmit interrupt reduction can improve CPU
+efficiency if properly tuned for specific network traffic. If the
+system is reporting dropped transmits, this value may be set too high
+causing the driver to run out of available transmit descriptors.
+
+
+TxAbsIntDelay
+-------------
+(This parameter is supported only on 82540, 82545 and later adapters.)
+Valid Range: 0-65535 (0=off)
+Default Value: 64
+
+This value, in units of 1.024 microseconds, limits the delay in which a
+transmit interrupt is generated. Useful only if TxIntDelay is non-zero,
+this value ensures that an interrupt is generated after the initial
+packet is sent on the wire within the set amount of time. Proper tuning,
+along with TxIntDelay, may improve traffic throughput in specific
+network conditions.
+
+XsumRX
+------
+(This parameter is NOT supported on the 82542-based adapter.)
+Valid Range: 0-1
+Default Value: 1
+
+A value of '1' indicates that the driver should enable IP checksum
+offload for received packets (both UDP and TCP) to the adapter hardware.
+
+
+Speed and Duplex Configuration
+==============================
+
+Three keywords are used to control the speed and duplex configuration.
+These keywords are Speed, Duplex, and AutoNeg.
+
+If the board uses a fiber interface, these keywords are ignored, and the
+fiber interface board only links at 1000 Mbps full-duplex.
+
+For copper-based boards, the keywords interact as follows:
+
+ The default operation is auto-negotiate. The board advertises all
+ supported speed and duplex combinations, and it links at the highest
+ common speed and duplex mode IF the link partner is set to auto-negotiate.
+
+ If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps
+ is advertised (The 1000BaseT spec requires auto-negotiation.)
+
+ If Speed = 10 or 100, then both Speed and Duplex should be set. Auto-
+ negotiation is disabled, and the AutoNeg parameter is ignored. Partner
+ SHOULD also be forced.
+
+The AutoNeg parameter is used when more control is required over the
+auto-negotiation process. It should be used when you wish to control which
+speed and duplex combinations are advertised during the auto-negotiation
+process.
+
+The parameter may be specified as either a decimal or hexadecimal value as
+determined by the bitmap below.
+
+Bit position 7 6 5 4 3 2 1 0
+Decimal Value 128 64 32 16 8 4 2 1
+Hex value 80 40 20 10 8 4 2 1
+Speed (Mbps) N/A N/A 1000 N/A 100 100 10 10
+Duplex Full Full Half Full Half
+
+Some examples of using AutoNeg:
+
+ modprobe e1000 AutoNeg=0x01 (Restricts autonegotiation to 10 Half)
+ modprobe e1000 AutoNeg=1 (Same as above)
+ modprobe e1000 AutoNeg=0x02 (Restricts autonegotiation to 10 Full)
+ modprobe e1000 AutoNeg=0x03 (Restricts autonegotiation to 10 Half or 10 Full)
+ modprobe e1000 AutoNeg=0x04 (Restricts autonegotiation to 100 Half)
+ modprobe e1000 AutoNeg=0x05 (Restricts autonegotiation to 10 Half or 100
+ Half)
+ modprobe e1000 AutoNeg=0x020 (Restricts autonegotiation to 1000 Full)
+ modprobe e1000 AutoNeg=32 (Same as above)
+
+Note that when this parameter is used, Speed and Duplex must not be specified.
+
+If the link partner is forced to a specific speed and duplex, then this
+parameter should not be used. Instead, use the Speed and Duplex parameters
+previously mentioned to force the adapter to the same speed and duplex.
+
+
+Additional Configurations
+=========================
+
+ Configuring the Driver on Different Distributions
+ -------------------------------------------------
+ Configuring a network driver to load properly when the system is started
+ is distribution dependent. Typically, the configuration process involves
+ adding an alias line to /etc/modules.conf or /etc/modprobe.conf as well
+ as editing other system startup scripts and/or configuration files. Many
+ popular Linux distributions ship with tools to make these changes for you.
+ To learn the proper way to configure a network device for your system,
+ refer to your distribution documentation. If during this process you are
+ asked for the driver or module name, the name for the Linux Base Driver
+ for the Intel(R) PRO/1000 Family of Adapters is e1000.
+
+ As an example, if you install the e1000 driver for two PRO/1000 adapters
+ (eth0 and eth1) and set the speed and duplex to 10full and 100half, add
+ the following to modules.conf or or modprobe.conf:
+
+ alias eth0 e1000
+ alias eth1 e1000
+ options e1000 Speed=10,100 Duplex=2,1
+
+ Viewing Link Messages
+ ---------------------
+ Link messages will not be displayed to the console if the distribution is
+ restricting system messages. In order to see network driver link messages
+ on your console, set dmesg to eight by entering the following:
+
+ dmesg -n 8
+
+ NOTE: This setting is not saved across reboots.
+
+ Jumbo Frames
+ ------------
+ Jumbo Frames support is enabled by changing the MTU to a value larger than
+ the default of 1500. Use the ifconfig command to increase the MTU size.
+ For example:
+
+ ifconfig eth<x> mtu 9000 up
+
+ This setting is not saved across reboots. It can be made permanent if
+ you add:
+
+ MTU=9000
+
+ to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>. This example
+ applies to the Red Hat distributions; other distributions may store this
+ setting in a different location.
+
+ Notes:
+
+ - To enable Jumbo Frames, increase the MTU size on the interface beyond
+ 1500.
+
+ - The maximum MTU setting for Jumbo Frames is 16110. This value coincides
+ with the maximum Jumbo Frames size of 16128.
+
+ - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or
+ loss of link.
+
+ - Some Intel gigabit adapters that support Jumbo Frames have a frame size
+ limit of 9238 bytes, with a corresponding MTU size limit of 9216 bytes.
+ The adapters with this limitation are based on the Intel(R) 82571EB,
+ 82572EI, 82573L and 80003ES2LAN controller. These correspond to the
+ following product names:
+ Intel(R) PRO/1000 PT Server Adapter
+ Intel(R) PRO/1000 PT Desktop Adapter
+ Intel(R) PRO/1000 PT Network Connection
+ Intel(R) PRO/1000 PT Dual Port Server Adapter
+ Intel(R) PRO/1000 PT Dual Port Network Connection
+ Intel(R) PRO/1000 PF Server Adapter
+ Intel(R) PRO/1000 PF Network Connection
+ Intel(R) PRO/1000 PF Dual Port Server Adapter
+ Intel(R) PRO/1000 PB Server Connection
+ Intel(R) PRO/1000 PL Network Connection
+ Intel(R) PRO/1000 EB Network Connection with I/O Acceleration
+ Intel(R) PRO/1000 EB Backplane Connection with I/O Acceleration
+ Intel(R) PRO/1000 PT Quad Port Server Adapter
+
+ - Adapters based on the Intel(R) 82542 and 82573V/E controller do not
+ support Jumbo Frames. These correspond to the following product names:
+ Intel(R) PRO/1000 Gigabit Server Adapter
+ Intel(R) PRO/1000 PM Network Connection
+
+ - The following adapters do not support Jumbo Frames:
+ Intel(R) 82562V 10/100 Network Connection
+ Intel(R) 82566DM Gigabit Network Connection
+ Intel(R) 82566DC Gigabit Network Connection
+ Intel(R) 82566MM Gigabit Network Connection
+ Intel(R) 82566MC Gigabit Network Connection
+ Intel(R) 82562GT 10/100 Network Connection
+ Intel(R) 82562G 10/100 Network Connection
+
+
+ Ethtool
+ -------
+ The driver utilizes the ethtool interface for driver configuration and
+ diagnostics, as well as displaying statistical information. Ethtool
+ version 1.6 or later is required for this functionality.
+
+ The latest release of ethtool can be found from
+ http://sourceforge.net/projects/gkernel.
+
+ NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support
+ for a more complete ethtool feature set can be enabled by upgrading
+ ethtool to ethtool-1.8.1.
+
+ Enabling Wake on LAN* (WoL)
+ ---------------------------
+ WoL is configured through the Ethtool* utility. Ethtool is included with
+ all versions of Red Hat after Red Hat 7.2. For other Linux distributions,
+ download and install Ethtool from the following website:
+ http://sourceforge.net/projects/gkernel.
+
+ For instructions on enabling WoL with Ethtool, refer to the website listed
+ above.
+
+ WoL will be enabled on the system during the next shut down or reboot.
+ For this driver version, in order to enable WoL, the e1000 driver must be
+ loaded when shutting down or rebooting the system.
+
+ Wake On LAN is only supported on port A for the following devices:
+ Intel(R) PRO/1000 PT Dual Port Network Connection
+ Intel(R) PRO/1000 PT Dual Port Server Connection
+ Intel(R) PRO/1000 PT Dual Port Server Adapter
+ Intel(R) PRO/1000 PF Dual Port Server Adapter
+ Intel(R) PRO/1000 PT Quad Port Server Adapter
+
+ NAPI
+ ----
+ NAPI (Rx polling mode) is enabled in the e1000 driver.
+
+ See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
+
+
+Known Issues
+============
+
+Dropped Receive Packets on Half-duplex 10/100 Networks
+------------------------------------------------------
+If you have an Intel PCI Express adapter running at 10mbps or 100mbps, half-
+duplex, you may observe occasional dropped receive packets. There are no
+workarounds for this problem in this network configuration. The network must
+be updated to operate in full-duplex, and/or 1000mbps only.
+
+Jumbo Frames System Requirement
+-------------------------------
+Memory allocation failures have been observed on Linux systems with 64 MB
+of RAM or less that are running Jumbo Frames. If you are using Jumbo
+Frames, your system may require more than the advertised minimum
+requirement of 64 MB of system memory.
+
+Performance Degradation with Jumbo Frames
+-----------------------------------------
+Degradation in throughput performance may be observed in some Jumbo frames
+environments. If this is observed, increasing the application's socket
+buffer size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values
+may help. See the specific application manual and
+/usr/src/linux*/Documentation/
+networking/ip-sysctl.txt for more details.
+
+Jumbo Frames on Foundry BigIron 8000 switch
+-------------------------------------------
+There is a known issue using Jumbo frames when connected to a Foundry
+BigIron 8000 switch. This is a 3rd party limitation. If you experience
+loss of packets, lower the MTU size.
+
+Allocating Rx Buffers when Using Jumbo Frames
+---------------------------------------------
+Allocating Rx buffers when using Jumbo Frames on 2.6.x kernels may fail if
+the available memory is heavily fragmented. This issue may be seen with PCI-X
+adapters or with packet split disabled. This can be reduced or eliminated
+by changing the amount of available memory for receive buffer allocation, by
+increasing /proc/sys/vm/min_free_kbytes.
+
+Multiple Interfaces on Same Ethernet Broadcast Network
+------------------------------------------------------
+Due to the default ARP behavior on Linux, it is not possible to have
+one system on two IP networks in the same Ethernet broadcast domain
+(non-partitioned switch) behave as expected. All Ethernet interfaces
+will respond to IP traffic for any IP address assigned to the system.
+This results in unbalanced receive traffic.
+
+If you have multiple interfaces in a server, either turn on ARP
+filtering by entering:
+
+ echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
+(this only works if your kernel's version is higher than 2.4.5),
+
+NOTE: This setting is not saved across reboots. The configuration
+change can be made permanent by adding the line:
+ net.ipv4.conf.all.arp_filter = 1
+to the file /etc/sysctl.conf
+
+ or,
+
+install the interfaces in separate broadcast domains (either in
+different switches or in a switch partitioned to VLANs).
+
+82541/82547 can't link or are slow to link with some link partners
+-----------------------------------------------------------------
+There is a known compatibility issue with 82541/82547 and some
+low-end switches where the link will not be established, or will
+be slow to establish. In particular, these switches are known to
+be incompatible with 82541/82547:
+
+ Planex FXG-08TE
+ I-O Data ETG-SH8
+
+To workaround this issue, the driver can be compiled with an override
+of the PHY's master/slave setting. Forcing master or forcing slave
+mode will improve time-to-link.
+
+ # make CFLAGS_EXTRA=-DE1000_MASTER_SLAVE=<n>
+
+Where <n> is:
+
+ 0 = Hardware default
+ 1 = Master mode
+ 2 = Slave mode
+ 3 = Auto master/slave
+
+Disable rx flow control with ethtool
+------------------------------------
+In order to disable receive flow control using ethtool, you must turn
+off auto-negotiation on the same command line.
+
+For example:
+
+ ethtool -A eth? autoneg off rx off
+
+Unplugging network cable while ethtool -p is running
+----------------------------------------------------
+In kernel versions 2.5.50 and later (including 2.6 kernel), unplugging
+the network cable while ethtool -p is running will cause the system to
+become unresponsive to keyboard commands, except for control-alt-delete.
+Restarting the system appears to be the only remedy.
+
+
+Support
+=======
+
+For general information, go to the Intel support website at:
+
+ http://support.intel.com
+
+or the Intel Wired Networking project hosted by Sourceforge at:
+
+ http://sourceforge.net/projects/e1000
+
+If an issue is identified with the released source code on the supported
+kernel with a supported adapter, email the specific information related
+to the issue to e1000-devel@lists.sf.net
diff --git a/Documentation/networking/eql.txt b/Documentation/networking/eql.txt
new file mode 100644
index 0000000..0f15501
--- /dev/null
+++ b/Documentation/networking/eql.txt
@@ -0,0 +1,528 @@
+ EQL Driver: Serial IP Load Balancing HOWTO
+ Simon "Guru Aleph-Null" Janes, simon@ncm.com
+ v1.1, February 27, 1995
+
+ This is the manual for the EQL device driver. EQL is a software device
+ that lets you load-balance IP serial links (SLIP or uncompressed PPP)
+ to increase your bandwidth. It will not reduce your latency (i.e. ping
+ times) except in the case where you already have lots of traffic on
+ your link, in which it will help them out. This driver has been tested
+ with the 1.1.75 kernel, and is known to have patched cleanly with
+ 1.1.86. Some testing with 1.1.92 has been done with the v1.1 patch
+ which was only created to patch cleanly in the very latest kernel
+ source trees. (Yes, it worked fine.)
+
+ 1. Introduction
+
+ Which is worse? A huge fee for a 56K leased line or two phone lines?
+ It's probably the former. If you find yourself craving more bandwidth,
+ and have a ISP that is flexible, it is now possible to bind modems
+ together to work as one point-to-point link to increase your
+ bandwidth. All without having to have a special black box on either
+ side.
+
+
+ The eql driver has only been tested with the Livingston PortMaster-2e
+ terminal server. I do not know if other terminal servers support load-
+ balancing, but I do know that the PortMaster does it, and does it
+ almost as well as the eql driver seems to do it (-- Unfortunately, in
+ my testing so far, the Livingston PortMaster 2e's load-balancing is a
+ good 1 to 2 KB/s slower than the test machine working with a 28.8 Kbps
+ and 14.4 Kbps connection. However, I am not sure that it really is
+ the PortMaster, or if it's Linux's TCP drivers. I'm told that Linux's
+ TCP implementation is pretty fast though.--)
+
+
+ I suggest to ISPs out there that it would probably be fair to charge
+ a load-balancing client 75% of the cost of the second line and 50% of
+ the cost of the third line etc...
+
+
+ Hey, we can all dream you know...
+
+
+ 2. Kernel Configuration
+
+ Here I describe the general steps of getting a kernel up and working
+ with the eql driver. From patching, building, to installing.
+
+
+ 2.1. Patching The Kernel
+
+ If you do not have or cannot get a copy of the kernel with the eql
+ driver folded into it, get your copy of the driver from
+ ftp://slaughter.ncm.com/pub/Linux/LOAD_BALANCING/eql-1.1.tar.gz.
+ Unpack this archive someplace obvious like /usr/local/src/. It will
+ create the following files:
+
+
+
+ ______________________________________________________________________
+ -rw-r--r-- guru/ncm 198 Jan 19 18:53 1995 eql-1.1/NO-WARRANTY
+ -rw-r--r-- guru/ncm 30620 Feb 27 21:40 1995 eql-1.1/eql-1.1.patch
+ -rwxr-xr-x guru/ncm 16111 Jan 12 22:29 1995 eql-1.1/eql_enslave
+ -rw-r--r-- guru/ncm 2195 Jan 10 21:48 1995 eql-1.1/eql_enslave.c
+ ______________________________________________________________________
+
+ Unpack a recent kernel (something after 1.1.92) someplace convenient
+ like say /usr/src/linux-1.1.92.eql. Use symbolic links to point
+ /usr/src/linux to this development directory.
+
+
+ Apply the patch by running the commands:
+
+
+ ______________________________________________________________________
+ cd /usr/src
+ patch </usr/local/src/eql-1.1/eql-1.1.patch
+ ______________________________________________________________________
+
+
+
+
+
+ 2.2. Building The Kernel
+
+ After patching the kernel, run make config and configure the kernel
+ for your hardware.
+
+
+ After configuration, make and install according to your habit.
+
+
+ 3. Network Configuration
+
+ So far, I have only used the eql device with the DSLIP SLIP connection
+ manager by Matt Dillon (-- "The man who sold his soul to code so much
+ so quickly."--) . How you configure it for other "connection"
+ managers is up to you. Most other connection managers that I've seen
+ don't do a very good job when it comes to handling more than one
+ connection.
+
+
+ 3.1. /etc/rc.d/rc.inet1
+
+ In rc.inet1, ifconfig the eql device to the IP address you usually use
+ for your machine, and the MTU you prefer for your SLIP lines. One
+ could argue that MTU should be roughly half the usual size for two
+ modems, one-third for three, one-fourth for four, etc... But going
+ too far below 296 is probably overkill. Here is an example ifconfig
+ command that sets up the eql device:
+
+
+
+ ______________________________________________________________________
+ ifconfig eql 198.67.33.239 mtu 1006
+ ______________________________________________________________________
+
+
+
+
+
+ Once the eql device is up and running, add a static default route to
+ it in the routing table using the cool new route syntax that makes
+ life so much easier:
+
+
+
+ ______________________________________________________________________
+ route add default eql
+ ______________________________________________________________________
+
+
+ 3.2. Enslaving Devices By Hand
+
+ Enslaving devices by hand requires two utility programs: eql_enslave
+ and eql_emancipate (-- eql_emancipate hasn't been written because when
+ an enslaved device "dies", it is automatically taken out of the queue.
+ I haven't found a good reason to write it yet... other than for
+ completeness, but that isn't a good motivator is it?--)
+
+
+ The syntax for enslaving a device is "eql_enslave <master-name>
+ <slave-name> <estimated-bps>". Here are some example enslavings:
+
+
+
+ ______________________________________________________________________
+ eql_enslave eql sl0 28800
+ eql_enslave eql ppp0 14400
+ eql_enslave eql sl1 57600
+ ______________________________________________________________________
+
+
+
+
+
+ When you want to free a device from its life of slavery, you can
+ either down the device with ifconfig (eql will automatically bury the
+ dead slave and remove it from its queue) or use eql_emancipate to free
+ it. (-- Or just ifconfig it down, and the eql driver will take it out
+ for you.--)
+
+
+
+ ______________________________________________________________________
+ eql_emancipate eql sl0
+ eql_emancipate eql ppp0
+ eql_emancipate eql sl1
+ ______________________________________________________________________
+
+
+
+
+
+ 3.3. DSLIP Configuration for the eql Device
+
+ The general idea is to bring up and keep up as many SLIP connections
+ as you need, automatically.
+
+
+ 3.3.1. /etc/slip/runslip.conf
+
+ Here is an example runslip.conf:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ______________________________________________________________________
+ name sl-line-1
+ enabled
+ baud 38400
+ mtu 576
+ ducmd -e /etc/slip/dialout/cua2-288.xp -t 9
+ command eql_enslave eql $interface 28800
+ address 198.67.33.239
+ line /dev/cua2
+
+ name sl-line-2
+ enabled
+ baud 38400
+ mtu 576
+ ducmd -e /etc/slip/dialout/cua3-288.xp -t 9
+ command eql_enslave eql $interface 28800
+ address 198.67.33.239
+ line /dev/cua3
+ ______________________________________________________________________
+
+
+
+
+
+ 3.4. Using PPP and the eql Device
+
+ I have not yet done any load-balancing testing for PPP devices, mainly
+ because I don't have a PPP-connection manager like SLIP has with
+ DSLIP. I did find a good tip from LinuxNET:Billy for PPP performance:
+ make sure you have asyncmap set to something so that control
+ characters are not escaped.
+
+
+ I tried to fix up a PPP script/system for redialing lost PPP
+ connections for use with the eql driver the weekend of Feb 25-26 '95
+ (Hereafter known as the 8-hour PPP Hate Festival). Perhaps later this
+ year.
+
+
+ 4. About the Slave Scheduler Algorithm
+
+ The slave scheduler probably could be replaced with a dozen other
+ things and push traffic much faster. The formula in the current set
+ up of the driver was tuned to handle slaves with wildly different
+ bits-per-second "priorities".
+
+
+ All testing I have done was with two 28.8 V.FC modems, one connecting
+ at 28800 bps or slower, and the other connecting at 14400 bps all the
+ time.
+
+
+ One version of the scheduler was able to push 5.3 K/s through the
+ 28800 and 14400 connections, but when the priorities on the links were
+ very wide apart (57600 vs. 14400) the "faster" modem received all
+ traffic and the "slower" modem starved.
+
+
+ 5. Testers' Reports
+
+ Some people have experimented with the eql device with newer
+ kernels (than 1.1.75). I have since updated the driver to patch
+ cleanly in newer kernels because of the removal of the old "slave-
+ balancing" driver config option.
+
+
+ o icee from LinuxNET patched 1.1.86 without any rejects and was able
+ to boot the kernel and enslave a couple of ISDN PPP links.
+
+ 5.1. Randolph Bentson's Test Report
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From bentson@grieg.seaslug.org Wed Feb 8 19:08:09 1995
+ Date: Tue, 7 Feb 95 22:57 PST
+ From: Randolph Bentson <bentson@grieg.seaslug.org>
+ To: guru@ncm.com
+ Subject: EQL driver tests
+
+
+ I have been checking out your eql driver. (Nice work, that!)
+ Although you may already done this performance testing, here
+ are some data I've discovered.
+
+ Randolph Bentson
+ bentson@grieg.seaslug.org
+
+ ---------------------------------------------------------
+
+
+ A pseudo-device driver, EQL, written by Simon Janes, can be used
+ to bundle multiple SLIP connections into what appears to be a
+ single connection. This allows one to improve dial-up network
+ connectivity gradually, without having to buy expensive DSU/CSU
+ hardware and services.
+
+ I have done some testing of this software, with two goals in
+ mind: first, to ensure it actually works as described and
+ second, as a method of exercising my device driver.
+
+ The following performance measurements were derived from a set
+ of SLIP connections run between two Linux systems (1.1.84) using
+ a 486DX2/66 with a Cyclom-8Ys and a 486SLC/40 with a Cyclom-16Y.
+ (Ports 0,1,2,3 were used. A later configuration will distribute
+ port selection across the different Cirrus chips on the boards.)
+ Once a link was established, I timed a binary ftp transfer of
+ 289284 bytes of data. If there were no overhead (packet headers,
+ inter-character and inter-packet delays, etc.) the transfers
+ would take the following times:
+
+ bits/sec seconds
+ 345600 8.3
+ 234600 12.3
+ 172800 16.7
+ 153600 18.8
+ 76800 37.6
+ 57600 50.2
+ 38400 75.3
+ 28800 100.4
+ 19200 150.6
+ 9600 301.3
+
+ A single line running at the lower speeds and with large packets
+ comes to within 2% of this. Performance is limited for the higher
+ speeds (as predicted by the Cirrus databook) to an aggregate of
+ about 160 kbits/sec. The next round of testing will distribute
+ the load across two or more Cirrus chips.
+
+ The good news is that one gets nearly the full advantage of the
+ second, third, and fourth line's bandwidth. (The bad news is
+ that the connection establishment seemed fragile for the higher
+ speeds. Once established, the connection seemed robust enough.)
+
+ #lines speed mtu seconds theory actual %of
+ kbit/sec duration speed speed max
+ 3 115200 900 _ 345600
+ 3 115200 400 18.1 345600 159825 46
+ 2 115200 900 _ 230400
+ 2 115200 600 18.1 230400 159825 69
+ 2 115200 400 19.3 230400 149888 65
+ 4 57600 900 _ 234600
+ 4 57600 600 _ 234600
+ 4 57600 400 _ 234600
+ 3 57600 600 20.9 172800 138413 80
+ 3 57600 900 21.2 172800 136455 78
+ 3 115200 600 21.7 345600 133311 38
+ 3 57600 400 22.5 172800 128571 74
+ 4 38400 900 25.2 153600 114795 74
+ 4 38400 600 26.4 153600 109577 71
+ 4 38400 400 27.3 153600 105965 68
+ 2 57600 900 29.1 115200 99410.3 86
+ 1 115200 900 30.7 115200 94229.3 81
+ 2 57600 600 30.2 115200 95789.4 83
+ 3 38400 900 30.3 115200 95473.3 82
+ 3 38400 600 31.2 115200 92719.2 80
+ 1 115200 600 31.3 115200 92423 80
+ 2 57600 400 32.3 115200 89561.6 77
+ 1 115200 400 32.8 115200 88196.3 76
+ 3 38400 400 33.5 115200 86353.4 74
+ 2 38400 900 43.7 76800 66197.7 86
+ 2 38400 600 44 76800 65746.4 85
+ 2 38400 400 47.2 76800 61289 79
+ 4 19200 900 50.8 76800 56945.7 74
+ 4 19200 400 53.2 76800 54376.7 70
+ 4 19200 600 53.7 76800 53870.4 70
+ 1 57600 900 54.6 57600 52982.4 91
+ 1 57600 600 56.2 57600 51474 89
+ 3 19200 900 60.5 57600 47815.5 83
+ 1 57600 400 60.2 57600 48053.8 83
+ 3 19200 600 62 57600 46658.7 81
+ 3 19200 400 64.7 57600 44711.6 77
+ 1 38400 900 79.4 38400 36433.8 94
+ 1 38400 600 82.4 38400 35107.3 91
+ 2 19200 900 84.4 38400 34275.4 89
+ 1 38400 400 86.8 38400 33327.6 86
+ 2 19200 600 87.6 38400 33023.3 85
+ 2 19200 400 91.2 38400 31719.7 82
+ 4 9600 900 94.7 38400 30547.4 79
+ 4 9600 400 106 38400 27290.9 71
+ 4 9600 600 110 38400 26298.5 68
+ 3 9600 900 118 28800 24515.6 85
+ 3 9600 600 120 28800 24107 83
+ 3 9600 400 131 28800 22082.7 76
+ 1 19200 900 155 19200 18663.5 97
+ 1 19200 600 161 19200 17968 93
+ 1 19200 400 170 19200 17016.7 88
+ 2 9600 600 176 19200 16436.6 85
+ 2 9600 900 180 19200 16071.3 83
+ 2 9600 400 181 19200 15982.5 83
+ 1 9600 900 305 9600 9484.72 98
+ 1 9600 600 314 9600 9212.87 95
+ 1 9600 400 332 9600 8713.37 90
+
+
+
+
+
+ 5.2. Anthony Healy's Report
+
+
+
+
+
+
+
+ Date: Mon, 13 Feb 1995 16:17:29 +1100 (EST)
+ From: Antony Healey <ahealey@st.nepean.uws.edu.au>
+ To: Simon Janes <guru@ncm.com>
+ Subject: Re: Load Balancing
+
+ Hi Simon,
+ I've installed your patch and it works great. I have trialed
+ it over twin SL/IP lines, just over null modems, but I was
+ able to data at over 48Kb/s [ISDN link -Simon]. I managed a
+ transfer of up to 7.5 Kbyte/s on one go, but averaged around
+ 6.4 Kbyte/s, which I think is pretty cool. :)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/Documentation/networking/ewrk3.txt b/Documentation/networking/ewrk3.txt
new file mode 100644
index 0000000..90e9e5f
--- /dev/null
+++ b/Documentation/networking/ewrk3.txt
@@ -0,0 +1,46 @@
+The EtherWORKS 3 driver in this distribution is designed to work with all
+kernels > 1.1.33 (approx) and includes tools in the 'ewrk3tools'
+subdirectory to allow set up of the card, similar to the MSDOS
+'NICSETUP.EXE' tools provided on the DOS drivers disk (type 'make' in that
+subdirectory to make the tools).
+
+The supported cards are DE203, DE204 and DE205. All other cards are NOT
+supported - refer to 'depca.c' for running the LANCE based network cards and
+'de4x5.c' for the DIGITAL Semiconductor PCI chip based adapters from
+Digital.
+
+The ability to load this driver as a loadable module has been included and
+used extensively during the driver development (to save those long reboot
+sequences). To utilise this ability, you have to do 8 things:
+
+ 0) have a copy of the loadable modules code installed on your system.
+ 1) copy ewrk3.c from the /linux/drivers/net directory to your favourite
+ temporary directory.
+ 2) edit the source code near line 1898 to reflect the I/O address and
+ IRQ you're using.
+ 3) compile ewrk3.c, but include -DMODULE in the command line to ensure
+ that the correct bits are compiled (see end of source code).
+ 4) if you are wanting to add a new card, goto 5. Otherwise, recompile a
+ kernel with the ewrk3 configuration turned off and reboot.
+ 5) insmod ewrk3.o
+ [Alan Cox: Changed this so you can insmod ewrk3.o irq=x io=y]
+ [Adam Kropelin: Multiple cards now supported by irq=x1,x2 io=y1,y2]
+ 6) run the net startup bits for your new eth?? interface manually
+ (usually /etc/rc.inet[12] at boot time).
+ 7) enjoy!
+
+ Note that autoprobing is not allowed in loadable modules - the system is
+ already up and running and you're messing with interrupts.
+
+ To unload a module, turn off the associated interface
+ 'ifconfig eth?? down' then 'rmmod ewrk3'.
+
+The performance we've achieved so far has been measured through the 'ttcp'
+tool at 975kB/s. This measures the total TCP stack performance which
+includes the card, so don't expect to get much nearer the 1.25MB/s
+theoretical Ethernet rate.
+
+
+Enjoy!
+
+Dave
diff --git a/Documentation/networking/fib_trie.txt b/Documentation/networking/fib_trie.txt
new file mode 100644
index 0000000..0723db7
--- /dev/null
+++ b/Documentation/networking/fib_trie.txt
@@ -0,0 +1,145 @@
+ LC-trie implementation notes.
+
+Node types
+----------
+leaf
+ An end node with data. This has a copy of the relevant key, along
+ with 'hlist' with routing table entries sorted by prefix length.
+ See struct leaf and struct leaf_info.
+
+trie node or tnode
+ An internal node, holding an array of child (leaf or tnode) pointers,
+ indexed through a subset of the key. See Level Compression.
+
+A few concepts explained
+------------------------
+Bits (tnode)
+ The number of bits in the key segment used for indexing into the
+ child array - the "child index". See Level Compression.
+
+Pos (tnode)
+ The position (in the key) of the key segment used for indexing into
+ the child array. See Path Compression.
+
+Path Compression / skipped bits
+ Any given tnode is linked to from the child array of its parent, using
+ a segment of the key specified by the parent's "pos" and "bits"
+ In certain cases, this tnode's own "pos" will not be immediately
+ adjacent to the parent (pos+bits), but there will be some bits
+ in the key skipped over because they represent a single path with no
+ deviations. These "skipped bits" constitute Path Compression.
+ Note that the search algorithm will simply skip over these bits when
+ searching, making it necessary to save the keys in the leaves to
+ verify that they actually do match the key we are searching for.
+
+Level Compression / child arrays
+ the trie is kept level balanced moving, under certain conditions, the
+ children of a full child (see "full_children") up one level, so that
+ instead of a pure binary tree, each internal node ("tnode") may
+ contain an arbitrarily large array of links to several children.
+ Conversely, a tnode with a mostly empty child array (see empty_children)
+ may be "halved", having some of its children moved downwards one level,
+ in order to avoid ever-increasing child arrays.
+
+empty_children
+ the number of positions in the child array of a given tnode that are
+ NULL.
+
+full_children
+ the number of children of a given tnode that aren't path compressed.
+ (in other words, they aren't NULL or leaves and their "pos" is equal
+ to this tnode's "pos"+"bits").
+
+ (The word "full" here is used more in the sense of "complete" than
+ as the opposite of "empty", which might be a tad confusing.)
+
+Comments
+---------
+
+We have tried to keep the structure of the code as close to fib_hash as
+possible to allow verification and help up reviewing.
+
+fib_find_node()
+ A good start for understanding this code. This function implements a
+ straightforward trie lookup.
+
+fib_insert_node()
+ Inserts a new leaf node in the trie. This is bit more complicated than
+ fib_find_node(). Inserting a new node means we might have to run the
+ level compression algorithm on part of the trie.
+
+trie_leaf_remove()
+ Looks up a key, deletes it and runs the level compression algorithm.
+
+trie_rebalance()
+ The key function for the dynamic trie after any change in the trie
+ it is run to optimize and reorganize. Tt will walk the trie upwards
+ towards the root from a given tnode, doing a resize() at each step
+ to implement level compression.
+
+resize()
+ Analyzes a tnode and optimizes the child array size by either inflating
+ or shrinking it repeatedly until it fulfills the criteria for optimal
+ level compression. This part follows the original paper pretty closely
+ and there may be some room for experimentation here.
+
+inflate()
+ Doubles the size of the child array within a tnode. Used by resize().
+
+halve()
+ Halves the size of the child array within a tnode - the inverse of
+ inflate(). Used by resize();
+
+fn_trie_insert(), fn_trie_delete(), fn_trie_select_default()
+ The route manipulation functions. Should conform pretty closely to the
+ corresponding functions in fib_hash.
+
+fn_trie_flush()
+ This walks the full trie (using nextleaf()) and searches for empty
+ leaves which have to be removed.
+
+fn_trie_dump()
+ Dumps the routing table ordered by prefix length. This is somewhat
+ slower than the corresponding fib_hash function, as we have to walk the
+ entire trie for each prefix length. In comparison, fib_hash is organized
+ as one "zone"/hash per prefix length.
+
+Locking
+-------
+
+fib_lock is used for an RW-lock in the same way that this is done in fib_hash.
+However, the functions are somewhat separated for other possible locking
+scenarios. It might conceivably be possible to run trie_rebalance via RCU
+to avoid read_lock in the fn_trie_lookup() function.
+
+Main lookup mechanism
+---------------------
+fn_trie_lookup() is the main lookup function.
+
+The lookup is in its simplest form just like fib_find_node(). We descend the
+trie, key segment by key segment, until we find a leaf. check_leaf() does
+the fib_semantic_match in the leaf's sorted prefix hlist.
+
+If we find a match, we are done.
+
+If we don't find a match, we enter prefix matching mode. The prefix length,
+starting out at the same as the key length, is reduced one step at a time,
+and we backtrack upwards through the trie trying to find a longest matching
+prefix. The goal is always to reach a leaf and get a positive result from the
+fib_semantic_match mechanism.
+
+Inside each tnode, the search for longest matching prefix consists of searching
+through the child array, chopping off (zeroing) the least significant "1" of
+the child index until we find a match or the child index consists of nothing but
+zeros.
+
+At this point we backtrack (t->stats.backtrack++) up the trie, continuing to
+chop off part of the key in order to find the longest matching prefix.
+
+At this point we will repeatedly descend subtries to look for a match, and there
+are some optimizations available that can provide us with "shortcuts" to avoid
+descending into dead ends. Look for "HL_OPTIMIZE" sections in the code.
+
+To alleviate any doubts about the correctness of the route selection process,
+a new netlink operation has been added. Look for NETLINK_FIB_LOOKUP, which
+gives userland access to fib_lookup().
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
new file mode 100644
index 0000000..bbf2005
--- /dev/null
+++ b/Documentation/networking/filter.txt
@@ -0,0 +1,42 @@
+filter.txt: Linux Socket Filtering
+Written by: Jay Schulist <jschlst@samba.org>
+
+Introduction
+============
+
+ Linux Socket Filtering is derived from the Berkeley
+Packet Filter. There are some distinct differences between
+the BSD and Linux Kernel Filtering.
+
+Linux Socket Filtering (LSF) allows a user-space program to
+attach a filter onto any socket and allow or disallow certain
+types of data to come through the socket. LSF follows exactly
+the same filter code structure as the BSD Berkeley Packet Filter
+(BPF), so referring to the BSD bpf.4 manpage is very helpful in
+creating filters.
+
+LSF is much simpler than BPF. One does not have to worry about
+devices or anything like that. You simply create your filter
+code, send it to the kernel via the SO_ATTACH_FILTER ioctl and
+if your filter code passes the kernel check on it, you then
+immediately begin filtering data on that socket.
+
+You can also detach filters from your socket via the
+SO_DETACH_FILTER ioctl. This will probably not be used much
+since when you close a socket that has a filter on it the
+filter is automagically removed. The other less common case
+may be adding a different filter on the same socket where you had another
+filter that is still running: the kernel takes care of removing
+the old one and placing your new one in its place, assuming your
+filter has passed the checks, otherwise if it fails the old filter
+will remain on that socket.
+
+Examples
+========
+
+Ioctls-
+setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &Filter, sizeof(Filter));
+setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &value, sizeof(value));
+
+See the BSD bpf.4 manpage and the BSD Packet Filter paper written by
+Steven McCanne and Van Jacobson of Lawrence Berkeley Laboratory.
diff --git a/Documentation/networking/fore200e.txt b/Documentation/networking/fore200e.txt
new file mode 100644
index 0000000..b1f337f
--- /dev/null
+++ b/Documentation/networking/fore200e.txt
@@ -0,0 +1,66 @@
+
+FORE Systems PCA-200E/SBA-200E ATM NIC driver
+---------------------------------------------
+
+This driver adds support for the FORE Systems 200E-series ATM adapters
+to the Linux operating system. It is based on the earlier PCA-200E driver
+written by Uwe Dannowski.
+
+The driver simultaneously supports PCA-200E and SBA-200E adapters on
+i386, alpha (untested), powerpc, sparc and sparc64 archs.
+
+The intent is to enable the use of different models of FORE adapters at the
+same time, by hosts that have several bus interfaces (such as PCI+SBUS,
+PCI+MCA or PCI+EISA).
+
+Only PCI and SBUS devices are currently supported by the driver, but support
+for other bus interfaces such as EISA should not be too hard to add (this may
+be more tricky for the MCA bus, though, as FORE made some MCA-specific
+modifications to the adapter's AALI interface).
+
+
+Firmware Copyright Notice
+-------------------------
+
+Please read the fore200e_firmware_copyright file present
+in the linux/drivers/atm directory for details and restrictions.
+
+
+Firmware Updates
+----------------
+
+The FORE Systems 200E-series driver is shipped with firmware data being
+uploaded to the ATM adapters at system boot time or at module loading time.
+The supplied firmware images should work with all adapters.
+
+However, if you encounter problems (the firmware doesn't start or the driver
+is unable to read the PROM data), you may consider trying another firmware
+version. Alternative binary firmware images can be found somewhere on the
+ForeThought CD-ROM supplied with your adapter by FORE Systems.
+
+You can also get the latest firmware images from FORE Systems at
+http://www.fore.com. Register TACTics Online and go to
+the 'software updates' pages. The firmware binaries are part of
+the various ForeThought software distributions.
+
+Notice that different versions of the PCA-200E firmware exist, depending
+on the endianess of the host architecture. The driver is shipped with
+both little and big endian PCA firmware images.
+
+Name and location of the new firmware images can be set at kernel
+configuration time:
+
+1. Copy the new firmware binary files (with .bin, .bin1 or .bin2 suffix)
+ to some directory, such as linux/drivers/atm.
+
+2. Reconfigure your kernel to set the new firmware name and location.
+ Expected pathnames are absolute or relative to the drivers/atm directory.
+
+3. Rebuild and re-install your kernel or your module.
+
+
+Feedback
+--------
+
+Feedback is welcome. Please send success stories/bug reports/
+patches/improvement/comments/flames to <lizzi@cnam.fr>.
diff --git a/Documentation/networking/framerelay.txt b/Documentation/networking/framerelay.txt
new file mode 100644
index 0000000..1a0b720
--- /dev/null
+++ b/Documentation/networking/framerelay.txt
@@ -0,0 +1,39 @@
+Frame Relay (FR) support for linux is built into a two tiered system of device
+drivers. The upper layer implements RFC1490 FR specification, and uses the
+Data Link Connection Identifier (DLCI) as its hardware address. Usually these
+are assigned by your network supplier, they give you the number/numbers of
+the Virtual Connections (VC) assigned to you.
+
+Each DLCI is a point-to-point link between your machine and a remote one.
+As such, a separate device is needed to accommodate the routing. Within the
+net-tools archives is 'dlcicfg'. This program will communicate with the
+base "DLCI" device, and create new net devices named 'dlci00', 'dlci01'...
+The configuration script will ask you how many DLCIs you need, as well as
+how many DLCIs you want to assign to each Frame Relay Access Device (FRAD).
+
+The DLCI uses a number of function calls to communicate with the FRAD, all
+of which are stored in the FRAD's private data area. assoc/deassoc,
+activate/deactivate and dlci_config. The DLCI supplies a receive function
+to the FRAD to accept incoming packets.
+
+With this initial offering, only 1 FRAD driver is available. With many thanks
+to Sangoma Technologies, David Mandelstam & Gene Kozin, the S502A, S502E &
+S508 are supported. This driver is currently set up for only FR, but as
+Sangoma makes more firmware modules available, it can be updated to provide
+them as well.
+
+Configuration of the FRAD makes use of another net-tools program, 'fradcfg'.
+This program makes use of a configuration file (which dlcicfg can also read)
+to specify the types of boards to be configured as FRADs, as well as perform
+any board specific configuration. The Sangoma module of fradcfg loads the
+FR firmware into the card, sets the irq/port/memory information, and provides
+an initial configuration.
+
+Additional FRAD device drivers can be added as hardware is available.
+
+At this time, the dlcicfg and fradcfg programs have not been incorporated into
+the net-tools distribution. They can be found at ftp.invlogic.com, in
+/pub/linux. Note that with OS/2 FTPD, you end up in /pub by default, so just
+use 'cd linux'. v0.10 is for use on pre-2.0.3 and earlier, v0.15 is for
+pre-2.0.4 and later.
+
diff --git a/Documentation/networking/gen_stats.txt b/Documentation/networking/gen_stats.txt
new file mode 100644
index 0000000..70e6275
--- /dev/null
+++ b/Documentation/networking/gen_stats.txt
@@ -0,0 +1,117 @@
+Generic networking statistics for netlink users
+======================================================================
+
+Statistic counters are grouped into structs:
+
+Struct TLV type Description
+----------------------------------------------------------------------
+gnet_stats_basic TCA_STATS_BASIC Basic statistics
+gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator
+gnet_stats_queue TCA_STATS_QUEUE Queue statistics
+none TCA_STATS_APP Application specific
+
+
+Collecting:
+-----------
+
+Declare the statistic structs you need:
+struct mystruct {
+ struct gnet_stats_basic bstats;
+ struct gnet_stats_queue qstats;
+ ...
+};
+
+Update statistics:
+mystruct->tstats.packet++;
+mystruct->qstats.backlog += skb->pkt_len;
+
+
+Export to userspace (Dump):
+---------------------------
+
+my_dumping_routine(struct sk_buff *skb, ...)
+{
+ struct gnet_dump dump;
+
+ if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump) < 0)
+ goto rtattr_failure;
+
+ if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 ||
+ gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 ||
+ gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0)
+ goto rtattr_failure;
+
+ if (gnet_stats_finish_copy(&dump) < 0)
+ goto rtattr_failure;
+ ...
+}
+
+TCA_STATS/TCA_XSTATS backward compatibility:
+--------------------------------------------
+
+Prior users of struct tc_stats and xstats can maintain backward
+compatibility by calling the compat wrappers to keep providing the
+existing TLV types.
+
+my_dumping_routine(struct sk_buff *skb, ...)
+{
+ if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
+ TCA_XSTATS, &mystruct->lock, &dump) < 0)
+ goto rtattr_failure;
+ ...
+}
+
+A struct tc_stats will be filled out during gnet_stats_copy_* calls
+and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app
+was called.
+
+
+Locking:
+--------
+
+Locks are taken before writing and released once all statistics have
+been written. Locks are always released in case of an error. You
+are responsible for making sure that the lock is initialized.
+
+
+Rate Estimator:
+--------------
+
+0) Prepare an estimator attribute. Most likely this would be in user
+ space. The value of this TLV should contain a tc_estimator structure.
+ As usual, such a TLV needs to be 32 bit aligned and therefore the
+ length needs to be appropriately set, etc. The estimator interval
+ and ewma log need to be converted to the appropriate values.
+ tc_estimator.c::tc_setup_estimator() is advisable to be used as the
+ conversion routine. It does a few clever things. It takes a time
+ interval in microsecs, a time constant also in microsecs and a struct
+ tc_estimator to be populated. The returned tc_estimator can be
+ transported to the kernel. Transfer such a structure in a TLV of type
+ TCA_RATE to your code in the kernel.
+
+In the kernel when setting up:
+1) make sure you have basic stats and rate stats setup first.
+2) make sure you have initialized stats lock that is used to setup such
+ stats.
+3) Now initialize a new estimator:
+
+ int ret = gen_new_estimator(my_basicstats,my_rate_est_stats,
+ mystats_lock, attr_with_tcestimator_struct);
+
+ if ret == 0
+ success
+ else
+ failed
+
+From now on, every time you dump my_rate_est_stats it will contain
+up-to-date info.
+
+Once you are done, call gen_kill_estimator(my_basicstats,
+my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats
+are still valid (i.e still exist) at the time of making this call.
+
+
+Authors:
+--------
+Thomas Graf <tgraf@suug.ch>
+Jamal Hadi Salim <hadi@cyberus.ca>
diff --git a/Documentation/networking/generic-hdlc.txt b/Documentation/networking/generic-hdlc.txt
new file mode 100644
index 0000000..31bc8b7
--- /dev/null
+++ b/Documentation/networking/generic-hdlc.txt
@@ -0,0 +1,132 @@
+Generic HDLC layer
+Krzysztof Halasa <khc@pm.waw.pl>
+
+
+Generic HDLC layer currently supports:
+1. Frame Relay (ANSI, CCITT, Cisco and no LMI).
+ - Normal (routed) and Ethernet-bridged (Ethernet device emulation)
+ interfaces can share a single PVC.
+ - ARP support (no InARP support in the kernel - there is an
+ experimental InARP user-space daemon available on:
+ http://www.kernel.org/pub/linux/utils/net/hdlc/).
+2. raw HDLC - either IP (IPv4) interface or Ethernet device emulation.
+3. Cisco HDLC.
+4. PPP (uses syncppp.c).
+5. X.25 (uses X.25 routines).
+
+Generic HDLC is a protocol driver only - it needs a low-level driver
+for your particular hardware.
+
+Ethernet device emulation (using HDLC or Frame-Relay PVC) is compatible
+with IEEE 802.1Q (VLANs) and 802.1D (Ethernet bridging).
+
+
+Make sure the hdlc.o and the hardware driver are loaded. It should
+create a number of "hdlc" (hdlc0 etc) network devices, one for each
+WAN port. You'll need the "sethdlc" utility, get it from:
+ http://www.kernel.org/pub/linux/utils/net/hdlc/
+
+Compile sethdlc.c utility:
+ gcc -O2 -Wall -o sethdlc sethdlc.c
+Make sure you're using a correct version of sethdlc for your kernel.
+
+Use sethdlc to set physical interface, clock rate, HDLC mode used,
+and add any required PVCs if using Frame Relay.
+Usually you want something like:
+
+ sethdlc hdlc0 clock int rate 128000
+ sethdlc hdlc0 cisco interval 10 timeout 25
+or
+ sethdlc hdlc0 rs232 clock ext
+ sethdlc hdlc0 fr lmi ansi
+ sethdlc hdlc0 create 99
+ ifconfig hdlc0 up
+ ifconfig pvc0 localIP pointopoint remoteIP
+
+In Frame Relay mode, ifconfig master hdlc device up (without assigning
+any IP address to it) before using pvc devices.
+
+
+Setting interface:
+
+* v35 | rs232 | x21 | t1 | e1 - sets physical interface for a given port
+ if the card has software-selectable interfaces
+ loopback - activate hardware loopback (for testing only)
+* clock ext - both RX clock and TX clock external
+* clock int - both RX clock and TX clock internal
+* clock txint - RX clock external, TX clock internal
+* clock txfromrx - RX clock external, TX clock derived from RX clock
+* rate - sets clock rate in bps (for "int" or "txint" clock only)
+
+
+Setting protocol:
+
+* hdlc - sets raw HDLC (IP-only) mode
+ nrz / nrzi / fm-mark / fm-space / manchester - sets transmission code
+ no-parity / crc16 / crc16-pr0 (CRC16 with preset zeros) / crc32-itu
+ crc16-itu (CRC16 with ITU-T polynomial) / crc16-itu-pr0 - sets parity
+
+* hdlc-eth - Ethernet device emulation using HDLC. Parity and encoding
+ as above.
+
+* cisco - sets Cisco HDLC mode (IP, IPv6 and IPX supported)
+ interval - time in seconds between keepalive packets
+ timeout - time in seconds after last received keepalive packet before
+ we assume the link is down
+
+* ppp - sets synchronous PPP mode
+
+* x25 - sets X.25 mode
+
+* fr - Frame Relay mode
+ lmi ansi / ccitt / cisco / none - LMI (link management) type
+ dce - Frame Relay DCE (network) side LMI instead of default DTE (user).
+ It has nothing to do with clocks!
+ t391 - link integrity verification polling timer (in seconds) - user
+ t392 - polling verification timer (in seconds) - network
+ n391 - full status polling counter - user
+ n392 - error threshold - both user and network
+ n393 - monitored events count - both user and network
+
+Frame-Relay only:
+* create n | delete n - adds / deletes PVC interface with DLCI #n.
+ Newly created interface will be named pvc0, pvc1 etc.
+
+* create ether n | delete ether n - adds a device for Ethernet-bridged
+ frames. The device will be named pvceth0, pvceth1 etc.
+
+
+
+
+Board-specific issues
+---------------------
+
+n2.o and c101.o need parameters to work:
+
+ insmod n2 hw=io,irq,ram,ports[:io,irq,...]
+example:
+ insmod n2 hw=0x300,10,0xD0000,01
+
+or
+ insmod c101 hw=irq,ram[:irq,...]
+example:
+ insmod c101 hw=9,0xdc000
+
+If built into the kernel, these drivers need kernel (command line) parameters:
+ n2.hw=io,irq,ram,ports:...
+or
+ c101.hw=irq,ram:...
+
+
+
+If you have a problem with N2, C101 or PLX200SYN card, you can issue the
+"private" command to see port's packet descriptor rings (in kernel logs):
+
+ sethdlc hdlc0 private
+
+The hardware driver has to be build with #define DEBUG_RINGS.
+Attaching this info to bug reports would be helpful. Anyway, let me know
+if you have problems using this.
+
+For patches and other info look at:
+<http://www.kernel.org/pub/linux/utils/net/hdlc/>.
diff --git a/Documentation/networking/generic_netlink.txt b/Documentation/networking/generic_netlink.txt
new file mode 100644
index 0000000..d4f8b8b
--- /dev/null
+++ b/Documentation/networking/generic_netlink.txt
@@ -0,0 +1,3 @@
+A wiki document on how to use Generic Netlink can be found here:
+
+ * http://linux-net.osdl.org/index.php/Generic_Netlink_HOWTO
diff --git a/Documentation/networking/gianfar.txt b/Documentation/networking/gianfar.txt
new file mode 100644
index 0000000..ad474ea
--- /dev/null
+++ b/Documentation/networking/gianfar.txt
@@ -0,0 +1,72 @@
+The Gianfar Ethernet Driver
+Sysfs File description
+
+Author: Andy Fleming <afleming@freescale.com>
+Updated: 2005-07-28
+
+SYSFS
+
+Several of the features of the gianfar driver are controlled
+through sysfs files. These are:
+
+bd_stash:
+To stash RX Buffer Descriptors in the L2, echo 'on' or '1' to
+bd_stash, echo 'off' or '0' to disable
+
+rx_stash_len:
+To stash the first n bytes of the packet in L2, echo the number
+of bytes to buf_stash_len. echo 0 to disable.
+
+WARNING: You could really screw these up if you set them too low or high!
+fifo_threshold:
+To change the number of bytes the controller needs in the
+fifo before it starts transmission, echo the number of bytes to
+fifo_thresh. Range should be 0-511.
+
+fifo_starve:
+When the FIFO has less than this many bytes during a transmit, it
+enters starve mode, and increases the priority of TX memory
+transactions. To change, echo the number of bytes to
+fifo_starve. Range should be 0-511.
+
+fifo_starve_off:
+Once in starve mode, the FIFO remains there until it has this
+many bytes. To change, echo the number of bytes to
+fifo_starve_off. Range should be 0-511.
+
+CHECKSUM OFFLOADING
+
+The eTSEC controller (first included in parts from late 2005 like
+the 8548) has the ability to perform TCP, UDP, and IP checksums
+in hardware. The Linux kernel only offloads the TCP and UDP
+checksums (and always performs the pseudo header checksums), so
+the driver only supports checksumming for TCP/IP and UDP/IP
+packets. Use ethtool to enable or disable this feature for RX
+and TX.
+
+VLAN
+
+In order to use VLAN, please consult Linux documentation on
+configuring VLANs. The gianfar driver supports hardware insertion and
+extraction of VLAN headers, but not filtering. Filtering will be
+done by the kernel.
+
+MULTICASTING
+
+The gianfar driver supports using the group hash table on the
+TSEC (and the extended hash table on the eTSEC) for multicast
+filtering. On the eTSEC, the exact-match MAC registers are used
+before the hash tables. See Linux documentation on how to join
+multicast groups.
+
+PADDING
+
+The gianfar driver supports padding received frames with 2 bytes
+to align the IP header to a 16-byte boundary, when supported by
+hardware.
+
+ETHTOOL
+
+The gianfar driver supports the use of ethtool for many
+configuration options. You must run ethtool only on currently
+open interfaces. See ethtool documentation for details.
diff --git a/Documentation/networking/ifenslave.c b/Documentation/networking/ifenslave.c
new file mode 100644
index 0000000..1b96ccd
--- /dev/null
+++ b/Documentation/networking/ifenslave.c
@@ -0,0 +1,1103 @@
+/* Mode: C;
+ * ifenslave.c: Configure network interfaces for parallel routing.
+ *
+ * This program controls the Linux implementation of running multiple
+ * network interfaces in parallel.
+ *
+ * Author: Donald Becker <becker@cesdis.gsfc.nasa.gov>
+ * Copyright 1994-1996 Donald Becker
+ *
+ * This program is free software; you can redistribute it
+ * and/or modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * The author may be reached as becker@CESDIS.gsfc.nasa.gov, or C/O
+ * Center of Excellence in Space Data and Information Sciences
+ * Code 930.5, Goddard Space Flight Center, Greenbelt MD 20771
+ *
+ * Changes :
+ * - 2000/10/02 Willy Tarreau <willy at meta-x.org> :
+ * - few fixes. Master's MAC address is now correctly taken from
+ * the first device when not previously set ;
+ * - detach support : call BOND_RELEASE to detach an enslaved interface.
+ * - give a mini-howto from command-line help : # ifenslave -h
+ *
+ * - 2001/02/16 Chad N. Tindel <ctindel at ieee dot org> :
+ * - Master is now brought down before setting the MAC address. In
+ * the 2.4 kernel you can't change the MAC address while the device is
+ * up because you get EBUSY.
+ *
+ * - 2001/09/13 Takao Indoh <indou dot takao at jp dot fujitsu dot com>
+ * - Added the ability to change the active interface on a mode 1 bond
+ * at runtime.
+ *
+ * - 2001/10/23 Chad N. Tindel <ctindel at ieee dot org> :
+ * - No longer set the MAC address of the master. The bond device will
+ * take care of this itself
+ * - Try the SIOC*** versions of the bonding ioctls before using the
+ * old versions
+ * - 2002/02/18 Erik Habbinga <erik_habbinga @ hp dot com> :
+ * - ifr2.ifr_flags was not initialized in the hwaddr_notset case,
+ * SIOCGIFFLAGS now called before hwaddr_notset test
+ *
+ * - 2002/10/31 Tony Cureington <tony.cureington * hp_com> :
+ * - If the master does not have a hardware address when the first slave
+ * is enslaved, the master is assigned the hardware address of that
+ * slave - there is a comment in bonding.c stating "ifenslave takes
+ * care of this now." This corrects the problem of slaves having
+ * different hardware addresses in active-backup mode when
+ * multiple interfaces are specified on a single ifenslave command
+ * (ifenslave bond0 eth0 eth1).
+ *
+ * - 2003/03/18 - Tsippy Mendelson <tsippy.mendelson at intel dot com> and
+ * Shmulik Hen <shmulik.hen at intel dot com>
+ * - Moved setting the slave's mac address and openning it, from
+ * the application to the driver. This enables support of modes
+ * that need to use the unique mac address of each slave.
+ * The driver also takes care of closing the slave and restoring its
+ * original mac address upon release.
+ * In addition, block possibility of enslaving before the master is up.
+ * This prevents putting the system in an undefined state.
+ *
+ * - 2003/05/01 - Amir Noam <amir.noam at intel dot com>
+ * - Added ABI version control to restore compatibility between
+ * new/old ifenslave and new/old bonding.
+ * - Prevent adding an adapter that is already a slave.
+ * Fixes the problem of stalling the transmission and leaving
+ * the slave in a down state.
+ *
+ * - 2003/05/01 - Shmulik Hen <shmulik.hen at intel dot com>
+ * - Prevent enslaving if the bond device is down.
+ * Fixes the problem of leaving the system in unstable state and
+ * halting when trying to remove the module.
+ * - Close socket on all abnormal exists.
+ * - Add versioning scheme that follows that of the bonding driver.
+ * current version is 1.0.0 as a base line.
+ *
+ * - 2003/05/22 - Jay Vosburgh <fubar at us dot ibm dot com>
+ * - ifenslave -c was broken; it's now fixed
+ * - Fixed problem with routes vanishing from master during enslave
+ * processing.
+ *
+ * - 2003/05/27 - Amir Noam <amir.noam at intel dot com>
+ * - Fix backward compatibility issues:
+ * For drivers not using ABI versions, slave was set down while
+ * it should be left up before enslaving.
+ * Also, master was not set down and the default set_mac_address()
+ * would fail and generate an error message in the system log.
+ * - For opt_c: slave should not be set to the master's setting
+ * while it is running. It was already set during enslave. To
+ * simplify things, it is now handled separately.
+ *
+ * - 2003/12/01 - Shmulik Hen <shmulik.hen at intel dot com>
+ * - Code cleanup and style changes
+ * set version to 1.1.0
+ */
+
+#define APP_VERSION "1.1.0"
+#define APP_RELDATE "December 1, 2003"
+#define APP_NAME "ifenslave"
+
+static char *version =
+APP_NAME ".c:v" APP_VERSION " (" APP_RELDATE ")\n"
+"o Donald Becker (becker@cesdis.gsfc.nasa.gov).\n"
+"o Detach support added on 2000/10/02 by Willy Tarreau (willy at meta-x.org).\n"
+"o 2.4 kernel support added on 2001/02/16 by Chad N. Tindel\n"
+" (ctindel at ieee dot org).\n";
+
+static const char *usage_msg =
+"Usage: ifenslave [-f] <master-if> <slave-if> [<slave-if>...]\n"
+" ifenslave -d <master-if> <slave-if> [<slave-if>...]\n"
+" ifenslave -c <master-if> <slave-if>\n"
+" ifenslave --help\n";
+
+static const char *help_msg =
+"\n"
+" To create a bond device, simply follow these three steps :\n"
+" - ensure that the required drivers are properly loaded :\n"
+" # modprobe bonding ; modprobe <3c59x|eepro100|pcnet32|tulip|...>\n"
+" - assign an IP address to the bond device :\n"
+" # ifconfig bond0 <addr> netmask <mask> broadcast <bcast>\n"
+" - attach all the interfaces you need to the bond device :\n"
+" # ifenslave [{-f|--force}] bond0 eth0 [eth1 [eth2]...]\n"
+" If bond0 didn't have a MAC address, it will take eth0's. Then, all\n"
+" interfaces attached AFTER this assignment will get the same MAC addr.\n"
+" (except for ALB/TLB modes)\n"
+"\n"
+" To set the bond device down and automatically release all the slaves :\n"
+" # ifconfig bond0 down\n"
+"\n"
+" To detach a dead interface without setting the bond device down :\n"
+" # ifenslave {-d|--detach} bond0 eth0 [eth1 [eth2]...]\n"
+"\n"
+" To change active slave :\n"
+" # ifenslave {-c|--change-active} bond0 eth0\n"
+"\n"
+" To show master interface info\n"
+" # ifenslave bond0\n"
+"\n"
+" To show all interfaces info\n"
+" # ifenslave {-a|--all-interfaces}\n"
+"\n"
+" To be more verbose\n"
+" # ifenslave {-v|--verbose} ...\n"
+"\n"
+" # ifenslave {-u|--usage} Show usage\n"
+" # ifenslave {-V|--version} Show version\n"
+" # ifenslave {-h|--help} This message\n"
+"\n";
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <linux/if.h>
+#include <net/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_bonding.h>
+#include <linux/sockios.h>
+
+typedef unsigned long long u64; /* hack, so we may include kernel's ethtool.h */
+typedef __uint32_t u32; /* ditto */
+typedef __uint16_t u16; /* ditto */
+typedef __uint8_t u8; /* ditto */
+#include <linux/ethtool.h>
+
+struct option longopts[] = {
+ /* { name has_arg *flag val } */
+ {"all-interfaces", 0, 0, 'a'}, /* Show all interfaces. */
+ {"change-active", 0, 0, 'c'}, /* Change the active slave. */
+ {"detach", 0, 0, 'd'}, /* Detach a slave interface. */
+ {"force", 0, 0, 'f'}, /* Force the operation. */
+ {"help", 0, 0, 'h'}, /* Give help */
+ {"usage", 0, 0, 'u'}, /* Give usage */
+ {"verbose", 0, 0, 'v'}, /* Report each action taken. */
+ {"version", 0, 0, 'V'}, /* Emit version information. */
+ { 0, 0, 0, 0}
+};
+
+/* Command-line flags. */
+unsigned int
+opt_a = 0, /* Show-all-interfaces flag. */
+opt_c = 0, /* Change-active-slave flag. */
+opt_d = 0, /* Detach a slave interface. */
+opt_f = 0, /* Force the operation. */
+opt_h = 0, /* Help */
+opt_u = 0, /* Usage */
+opt_v = 0, /* Verbose flag. */
+opt_V = 0; /* Version */
+
+int skfd = -1; /* AF_INET socket for ioctl() calls.*/
+int abi_ver = 0; /* userland - kernel ABI version */
+int hwaddr_set = 0; /* Master's hwaddr is set */
+int saved_errno;
+
+struct ifreq master_mtu, master_flags, master_hwaddr;
+struct ifreq slave_mtu, slave_flags, slave_hwaddr;
+
+struct dev_ifr {
+ struct ifreq *req_ifr;
+ char *req_name;
+ int req_type;
+};
+
+struct dev_ifr master_ifra[] = {
+ {&master_mtu, "SIOCGIFMTU", SIOCGIFMTU},
+ {&master_flags, "SIOCGIFFLAGS", SIOCGIFFLAGS},
+ {&master_hwaddr, "SIOCGIFHWADDR", SIOCGIFHWADDR},
+ {NULL, "", 0}
+};
+
+struct dev_ifr slave_ifra[] = {
+ {&slave_mtu, "SIOCGIFMTU", SIOCGIFMTU},
+ {&slave_flags, "SIOCGIFFLAGS", SIOCGIFFLAGS},
+ {&slave_hwaddr, "SIOCGIFHWADDR", SIOCGIFHWADDR},
+ {NULL, "", 0}
+};
+
+static void if_print(char *ifname);
+static int get_drv_info(char *master_ifname);
+static int get_if_settings(char *ifname, struct dev_ifr ifra[]);
+static int get_slave_flags(char *slave_ifname);
+static int set_master_hwaddr(char *master_ifname, struct sockaddr *hwaddr);
+static int set_slave_hwaddr(char *slave_ifname, struct sockaddr *hwaddr);
+static int set_slave_mtu(char *slave_ifname, int mtu);
+static int set_if_flags(char *ifname, short flags);
+static int set_if_up(char *ifname, short flags);
+static int set_if_down(char *ifname, short flags);
+static int clear_if_addr(char *ifname);
+static int set_if_addr(char *master_ifname, char *slave_ifname);
+static int change_active(char *master_ifname, char *slave_ifname);
+static int enslave(char *master_ifname, char *slave_ifname);
+static int release(char *master_ifname, char *slave_ifname);
+#define v_print(fmt, args...) \
+ if (opt_v) \
+ fprintf(stderr, fmt, ## args )
+
+int main(int argc, char *argv[])
+{
+ char **spp, *master_ifname, *slave_ifname;
+ int c, i, rv;
+ int res = 0;
+ int exclusive = 0;
+
+ while ((c = getopt_long(argc, argv, "acdfhuvV", longopts, 0)) != EOF) {
+ switch (c) {
+ case 'a': opt_a++; exclusive++; break;
+ case 'c': opt_c++; exclusive++; break;
+ case 'd': opt_d++; exclusive++; break;
+ case 'f': opt_f++; exclusive++; break;
+ case 'h': opt_h++; exclusive++; break;
+ case 'u': opt_u++; exclusive++; break;
+ case 'v': opt_v++; break;
+ case 'V': opt_V++; exclusive++; break;
+
+ case '?':
+ fprintf(stderr, usage_msg);
+ res = 2;
+ goto out;
+ }
+ }
+
+ /* options check */
+ if (exclusive > 1) {
+ fprintf(stderr, usage_msg);
+ res = 2;
+ goto out;
+ }
+
+ if (opt_v || opt_V) {
+ printf(version);
+ if (opt_V) {
+ res = 0;
+ goto out;
+ }
+ }
+
+ if (opt_u) {
+ printf(usage_msg);
+ res = 0;
+ goto out;
+ }
+
+ if (opt_h) {
+ printf(usage_msg);
+ printf(help_msg);
+ res = 0;
+ goto out;
+ }
+
+ /* Open a basic socket */
+ if ((skfd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
+ perror("socket");
+ res = 1;
+ goto out;
+ }
+
+ if (opt_a) {
+ if (optind == argc) {
+ /* No remaining args */
+ /* show all interfaces */
+ if_print((char *)NULL);
+ goto out;
+ } else {
+ /* Just show usage */
+ fprintf(stderr, usage_msg);
+ res = 2;
+ goto out;
+ }
+ }
+
+ /* Copy the interface name */
+ spp = argv + optind;
+ master_ifname = *spp++;
+
+ if (master_ifname == NULL) {
+ fprintf(stderr, usage_msg);
+ res = 2;
+ goto out;
+ }
+
+ /* exchange abi version with bonding module */
+ res = get_drv_info(master_ifname);
+ if (res) {
+ fprintf(stderr,
+ "Master '%s': Error: handshake with driver failed. "
+ "Aborting\n",
+ master_ifname);
+ goto out;
+ }
+
+ slave_ifname = *spp++;
+
+ if (slave_ifname == NULL) {
+ if (opt_d || opt_c) {
+ fprintf(stderr, usage_msg);
+ res = 2;
+ goto out;
+ }
+
+ /* A single arg means show the
+ * configuration for this interface
+ */
+ if_print(master_ifname);
+ goto out;
+ }
+
+ res = get_if_settings(master_ifname, master_ifra);
+ if (res) {
+ /* Probably a good reason not to go on */
+ fprintf(stderr,
+ "Master '%s': Error: get settings failed: %s. "
+ "Aborting\n",
+ master_ifname, strerror(res));
+ goto out;
+ }
+
+ /* check if master is indeed a master;
+ * if not then fail any operation
+ */
+ if (!(master_flags.ifr_flags & IFF_MASTER)) {
+ fprintf(stderr,
+ "Illegal operation; the specified interface '%s' "
+ "is not a master. Aborting\n",
+ master_ifname);
+ res = 1;
+ goto out;
+ }
+
+ /* check if master is up; if not then fail any operation */
+ if (!(master_flags.ifr_flags & IFF_UP)) {
+ fprintf(stderr,
+ "Illegal operation; the specified master interface "
+ "'%s' is not up.\n",
+ master_ifname);
+ res = 1;
+ goto out;
+ }
+
+ /* Only for enslaving */
+ if (!opt_c && !opt_d) {
+ sa_family_t master_family = master_hwaddr.ifr_hwaddr.sa_family;
+ unsigned char *hwaddr =
+ (unsigned char *)master_hwaddr.ifr_hwaddr.sa_data;
+
+ /* The family '1' is ARPHRD_ETHER for ethernet. */
+ if (master_family != 1 && !opt_f) {
+ fprintf(stderr,
+ "Illegal operation: The specified master "
+ "interface '%s' is not ethernet-like.\n "
+ "This program is designed to work with "
+ "ethernet-like network interfaces.\n "
+ "Use the '-f' option to force the "
+ "operation.\n",
+ master_ifname);
+ res = 1;
+ goto out;
+ }
+
+ /* Check master's hw addr */
+ for (i = 0; i < 6; i++) {
+ if (hwaddr[i] != 0) {
+ hwaddr_set = 1;
+ break;
+ }
+ }
+
+ if (hwaddr_set) {
+ v_print("current hardware address of master '%s' "
+ "is %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, "
+ "type %d\n",
+ master_ifname,
+ hwaddr[0], hwaddr[1],
+ hwaddr[2], hwaddr[3],
+ hwaddr[4], hwaddr[5],
+ master_family);
+ }
+ }
+
+ /* Accepts only one slave */
+ if (opt_c) {
+ /* change active slave */
+ res = get_slave_flags(slave_ifname);
+ if (res) {
+ fprintf(stderr,
+ "Slave '%s': Error: get flags failed. "
+ "Aborting\n",
+ slave_ifname);
+ goto out;
+ }
+ res = change_active(master_ifname, slave_ifname);
+ if (res) {
+ fprintf(stderr,
+ "Master '%s', Slave '%s': Error: "
+ "Change active failed\n",
+ master_ifname, slave_ifname);
+ }
+ } else {
+ /* Accept multiple slaves */
+ do {
+ if (opt_d) {
+ /* detach a slave interface from the master */
+ rv = get_slave_flags(slave_ifname);
+ if (rv) {
+ /* Can't work with this slave. */
+ /* remember the error and skip it*/
+ fprintf(stderr,
+ "Slave '%s': Error: get flags "
+ "failed. Skipping\n",
+ slave_ifname);
+ res = rv;
+ continue;
+ }
+ rv = release(master_ifname, slave_ifname);
+ if (rv) {
+ fprintf(stderr,
+ "Master '%s', Slave '%s': Error: "
+ "Release failed\n",
+ master_ifname, slave_ifname);
+ res = rv;
+ }
+ } else {
+ /* attach a slave interface to the master */
+ rv = get_if_settings(slave_ifname, slave_ifra);
+ if (rv) {
+ /* Can't work with this slave. */
+ /* remember the error and skip it*/
+ fprintf(stderr,
+ "Slave '%s': Error: get "
+ "settings failed: %s. "
+ "Skipping\n",
+ slave_ifname, strerror(rv));
+ res = rv;
+ continue;
+ }
+ rv = enslave(master_ifname, slave_ifname);
+ if (rv) {
+ fprintf(stderr,
+ "Master '%s', Slave '%s': Error: "
+ "Enslave failed\n",
+ master_ifname, slave_ifname);
+ res = rv;
+ }
+ }
+ } while ((slave_ifname = *spp++) != NULL);
+ }
+
+out:
+ if (skfd >= 0) {
+ close(skfd);
+ }
+
+ return res;
+}
+
+static short mif_flags;
+
+/* Get the inteface configuration from the kernel. */
+static int if_getconfig(char *ifname)
+{
+ struct ifreq ifr;
+ int metric, mtu; /* Parameters of the master interface. */
+ struct sockaddr dstaddr, broadaddr, netmask;
+ unsigned char *hwaddr;
+
+ strcpy(ifr.ifr_name, ifname);
+ if (ioctl(skfd, SIOCGIFFLAGS, &ifr) < 0)
+ return -1;
+ mif_flags = ifr.ifr_flags;
+ printf("The result of SIOCGIFFLAGS on %s is %x.\n",
+ ifname, ifr.ifr_flags);
+
+ strcpy(ifr.ifr_name, ifname);
+ if (ioctl(skfd, SIOCGIFADDR, &ifr) < 0)
+ return -1;
+ printf("The result of SIOCGIFADDR is %2.2x.%2.2x.%2.2x.%2.2x.\n",
+ ifr.ifr_addr.sa_data[0], ifr.ifr_addr.sa_data[1],
+ ifr.ifr_addr.sa_data[2], ifr.ifr_addr.sa_data[3]);
+
+ strcpy(ifr.ifr_name, ifname);
+ if (ioctl(skfd, SIOCGIFHWADDR, &ifr) < 0)
+ return -1;
+
+ /* Gotta convert from 'char' to unsigned for printf(). */
+ hwaddr = (unsigned char *)ifr.ifr_hwaddr.sa_data;
+ printf("The result of SIOCGIFHWADDR is type %d "
+ "%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x.\n",
+ ifr.ifr_hwaddr.sa_family, hwaddr[0], hwaddr[1],
+ hwaddr[2], hwaddr[3], hwaddr[4], hwaddr[5]);
+
+ strcpy(ifr.ifr_name, ifname);
+ if (ioctl(skfd, SIOCGIFMETRIC, &ifr) < 0) {
+ metric = 0;
+ } else
+ metric = ifr.ifr_metric;
+
+ strcpy(ifr.ifr_name, ifname);
+ if (ioctl(skfd, SIOCGIFMTU, &ifr) < 0)
+ mtu = 0;
+ else
+ mtu = ifr.ifr_mtu;
+
+ strcpy(ifr.ifr_name, ifname);
+ if (ioctl(skfd, SIOCGIFDSTADDR, &ifr) < 0) {
+ memset(&dstaddr, 0, sizeof(struct sockaddr));
+ } else
+ dstaddr = ifr.ifr_dstaddr;
+
+ strcpy(ifr.ifr_name, ifname);
+ if (ioctl(skfd, SIOCGIFBRDADDR, &ifr) < 0) {
+ memset(&broadaddr, 0, sizeof(struct sockaddr));
+ } else
+ broadaddr = ifr.ifr_broadaddr;
+
+ strcpy(ifr.ifr_name, ifname);
+ if (ioctl(skfd, SIOCGIFNETMASK, &ifr) < 0) {
+ memset(&netmask, 0, sizeof(struct sockaddr));
+ } else
+ netmask = ifr.ifr_netmask;
+
+ return 0;
+}
+
+static void if_print(char *ifname)
+{
+ char buff[1024];
+ struct ifconf ifc;
+ struct ifreq *ifr;
+ int i;
+
+ if (ifname == (char *)NULL) {
+ ifc.ifc_len = sizeof(buff);
+ ifc.ifc_buf = buff;
+ if (ioctl(skfd, SIOCGIFCONF, &ifc) < 0) {
+ perror("SIOCGIFCONF failed");
+ return;
+ }
+
+ ifr = ifc.ifc_req;
+ for (i = ifc.ifc_len / sizeof(struct ifreq); --i >= 0; ifr++) {
+ if (if_getconfig(ifr->ifr_name) < 0) {
+ fprintf(stderr,
+ "%s: unknown interface.\n",
+ ifr->ifr_name);
+ continue;
+ }
+
+ if (((mif_flags & IFF_UP) == 0) && !opt_a) continue;
+ /*ife_print(&ife);*/
+ }
+ } else {
+ if (if_getconfig(ifname) < 0) {
+ fprintf(stderr,
+ "%s: unknown interface.\n", ifname);
+ }
+ }
+}
+
+static int get_drv_info(char *master_ifname)
+{
+ struct ifreq ifr;
+ struct ethtool_drvinfo info;
+ char *endptr;
+
+ memset(&ifr, 0, sizeof(ifr));
+ strncpy(ifr.ifr_name, master_ifname, IFNAMSIZ);
+ ifr.ifr_data = (caddr_t)&info;
+
+ info.cmd = ETHTOOL_GDRVINFO;
+ strncpy(info.driver, "ifenslave", 32);
+ snprintf(info.fw_version, 32, "%d", BOND_ABI_VERSION);
+
+ if (ioctl(skfd, SIOCETHTOOL, &ifr) < 0) {
+ if (errno == EOPNOTSUPP) {
+ goto out;
+ }
+
+ saved_errno = errno;
+ v_print("Master '%s': Error: get bonding info failed %s\n",
+ master_ifname, strerror(saved_errno));
+ return 1;
+ }
+
+ abi_ver = strtoul(info.fw_version, &endptr, 0);
+ if (*endptr) {
+ v_print("Master '%s': Error: got invalid string as an ABI "
+ "version from the bonding module\n",
+ master_ifname);
+ return 1;
+ }
+
+out:
+ v_print("ABI ver is %d\n", abi_ver);
+
+ return 0;
+}
+
+static int change_active(char *master_ifname, char *slave_ifname)
+{
+ struct ifreq ifr;
+ int res = 0;
+
+ if (!(slave_flags.ifr_flags & IFF_SLAVE)) {
+ fprintf(stderr,
+ "Illegal operation: The specified slave interface "
+ "'%s' is not a slave\n",
+ slave_ifname);
+ return 1;
+ }
+
+ strncpy(ifr.ifr_name, master_ifname, IFNAMSIZ);
+ strncpy(ifr.ifr_slave, slave_ifname, IFNAMSIZ);
+ if ((ioctl(skfd, SIOCBONDCHANGEACTIVE, &ifr) < 0) &&
+ (ioctl(skfd, BOND_CHANGE_ACTIVE_OLD, &ifr) < 0)) {
+ saved_errno = errno;
+ v_print("Master '%s': Error: SIOCBONDCHANGEACTIVE failed: "
+ "%s\n",
+ master_ifname, strerror(saved_errno));
+ res = 1;
+ }
+
+ return res;
+}
+
+static int enslave(char *master_ifname, char *slave_ifname)
+{
+ struct ifreq ifr;
+ int res = 0;
+
+ if (slave_flags.ifr_flags & IFF_SLAVE) {
+ fprintf(stderr,
+ "Illegal operation: The specified slave interface "
+ "'%s' is already a slave\n",
+ slave_ifname);
+ return 1;
+ }
+
+ res = set_if_down(slave_ifname, slave_flags.ifr_flags);
+ if (res) {
+ fprintf(stderr,
+ "Slave '%s': Error: bring interface down failed\n",
+ slave_ifname);
+ return res;
+ }
+
+ if (abi_ver < 2) {
+ /* Older bonding versions would panic if the slave has no IP
+ * address, so get the IP setting from the master.
+ */
+ set_if_addr(master_ifname, slave_ifname);
+ } else {
+ res = clear_if_addr(slave_ifname);
+ if (res) {
+ fprintf(stderr,
+ "Slave '%s': Error: clear address failed\n",
+ slave_ifname);
+ return res;
+ }
+ }
+
+ if (master_mtu.ifr_mtu != slave_mtu.ifr_mtu) {
+ res = set_slave_mtu(slave_ifname, master_mtu.ifr_mtu);
+ if (res) {
+ fprintf(stderr,
+ "Slave '%s': Error: set MTU failed\n",
+ slave_ifname);
+ return res;
+ }
+ }
+
+ if (hwaddr_set) {
+ /* Master already has an hwaddr
+ * so set it's hwaddr to the slave
+ */
+ if (abi_ver < 1) {
+ /* The driver is using an old ABI, so
+ * the application sets the slave's
+ * hwaddr
+ */
+ res = set_slave_hwaddr(slave_ifname,
+ &(master_hwaddr.ifr_hwaddr));
+ if (res) {
+ fprintf(stderr,
+ "Slave '%s': Error: set hw address "
+ "failed\n",
+ slave_ifname);
+ goto undo_mtu;
+ }
+
+ /* For old ABI the application needs to bring the
+ * slave back up
+ */
+ res = set_if_up(slave_ifname, slave_flags.ifr_flags);
+ if (res) {
+ fprintf(stderr,
+ "Slave '%s': Error: bring interface "
+ "down failed\n",
+ slave_ifname);
+ goto undo_slave_mac;
+ }
+ }
+ /* The driver is using a new ABI,
+ * so the driver takes care of setting
+ * the slave's hwaddr and bringing
+ * it up again
+ */
+ } else {
+ /* No hwaddr for master yet, so
+ * set the slave's hwaddr to it
+ */
+ if (abi_ver < 1) {
+ /* For old ABI, the master needs to be
+ * down before setting it's hwaddr
+ */
+ res = set_if_down(master_ifname, master_flags.ifr_flags);
+ if (res) {
+ fprintf(stderr,
+ "Master '%s': Error: bring interface "
+ "down failed\n",
+ master_ifname);
+ goto undo_mtu;
+ }
+ }
+
+ res = set_master_hwaddr(master_ifname,
+ &(slave_hwaddr.ifr_hwaddr));
+ if (res) {
+ fprintf(stderr,
+ "Master '%s': Error: set hw address "
+ "failed\n",
+ master_ifname);
+ goto undo_mtu;
+ }
+
+ if (abi_ver < 1) {
+ /* For old ABI, bring the master
+ * back up
+ */
+ res = set_if_up(master_ifname, master_flags.ifr_flags);
+ if (res) {
+ fprintf(stderr,
+ "Master '%s': Error: bring interface "
+ "up failed\n",
+ master_ifname);
+ goto undo_master_mac;
+ }
+ }
+
+ hwaddr_set = 1;
+ }
+
+ /* Do the real thing */
+ strncpy(ifr.ifr_name, master_ifname, IFNAMSIZ);
+ strncpy(ifr.ifr_slave, slave_ifname, IFNAMSIZ);
+ if ((ioctl(skfd, SIOCBONDENSLAVE, &ifr) < 0) &&
+ (ioctl(skfd, BOND_ENSLAVE_OLD, &ifr) < 0)) {
+ saved_errno = errno;
+ v_print("Master '%s': Error: SIOCBONDENSLAVE failed: %s\n",
+ master_ifname, strerror(saved_errno));
+ res = 1;
+ }
+
+ if (res) {
+ goto undo_master_mac;
+ }
+
+ return 0;
+
+/* rollback (best effort) */
+undo_master_mac:
+ set_master_hwaddr(master_ifname, &(master_hwaddr.ifr_hwaddr));
+ hwaddr_set = 0;
+ goto undo_mtu;
+undo_slave_mac:
+ set_slave_hwaddr(slave_ifname, &(slave_hwaddr.ifr_hwaddr));
+undo_mtu:
+ set_slave_mtu(slave_ifname, slave_mtu.ifr_mtu);
+ return res;
+}
+
+static int release(char *master_ifname, char *slave_ifname)
+{
+ struct ifreq ifr;
+ int res = 0;
+
+ if (!(slave_flags.ifr_flags & IFF_SLAVE)) {
+ fprintf(stderr,
+ "Illegal operation: The specified slave interface "
+ "'%s' is not a slave\n",
+ slave_ifname);
+ return 1;
+ }
+
+ strncpy(ifr.ifr_name, master_ifname, IFNAMSIZ);
+ strncpy(ifr.ifr_slave, slave_ifname, IFNAMSIZ);
+ if ((ioctl(skfd, SIOCBONDRELEASE, &ifr) < 0) &&
+ (ioctl(skfd, BOND_RELEASE_OLD, &ifr) < 0)) {
+ saved_errno = errno;
+ v_print("Master '%s': Error: SIOCBONDRELEASE failed: %s\n",
+ master_ifname, strerror(saved_errno));
+ return 1;
+ } else if (abi_ver < 1) {
+ /* The driver is using an old ABI, so we'll set the interface
+ * down to avoid any conflicts due to same MAC/IP
+ */
+ res = set_if_down(slave_ifname, slave_flags.ifr_flags);
+ if (res) {
+ fprintf(stderr,
+ "Slave '%s': Error: bring interface "
+ "down failed\n",
+ slave_ifname);
+ }
+ }
+
+ /* set to default mtu */
+ set_slave_mtu(slave_ifname, 1500);
+
+ return res;
+}
+
+static int get_if_settings(char *ifname, struct dev_ifr ifra[])
+{
+ int i;
+ int res = 0;
+
+ for (i = 0; ifra[i].req_ifr; i++) {
+ strncpy(ifra[i].req_ifr->ifr_name, ifname, IFNAMSIZ);
+ res = ioctl(skfd, ifra[i].req_type, ifra[i].req_ifr);
+ if (res < 0) {
+ saved_errno = errno;
+ v_print("Interface '%s': Error: %s failed: %s\n",
+ ifname, ifra[i].req_name,
+ strerror(saved_errno));
+
+ return saved_errno;
+ }
+ }
+
+ return 0;
+}
+
+static int get_slave_flags(char *slave_ifname)
+{
+ int res = 0;
+
+ strncpy(slave_flags.ifr_name, slave_ifname, IFNAMSIZ);
+ res = ioctl(skfd, SIOCGIFFLAGS, &slave_flags);
+ if (res < 0) {
+ saved_errno = errno;
+ v_print("Slave '%s': Error: SIOCGIFFLAGS failed: %s\n",
+ slave_ifname, strerror(saved_errno));
+ } else {
+ v_print("Slave %s: flags %04X.\n",
+ slave_ifname, slave_flags.ifr_flags);
+ }
+
+ return res;
+}
+
+static int set_master_hwaddr(char *master_ifname, struct sockaddr *hwaddr)
+{
+ unsigned char *addr = (unsigned char *)hwaddr->sa_data;
+ struct ifreq ifr;
+ int res = 0;
+
+ strncpy(ifr.ifr_name, master_ifname, IFNAMSIZ);
+ memcpy(&(ifr.ifr_hwaddr), hwaddr, sizeof(struct sockaddr));
+ res = ioctl(skfd, SIOCSIFHWADDR, &ifr);
+ if (res < 0) {
+ saved_errno = errno;
+ v_print("Master '%s': Error: SIOCSIFHWADDR failed: %s\n",
+ master_ifname, strerror(saved_errno));
+ return res;
+ } else {
+ v_print("Master '%s': hardware address set to "
+ "%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x.\n",
+ master_ifname, addr[0], addr[1], addr[2],
+ addr[3], addr[4], addr[5]);
+ }
+
+ return res;
+}
+
+static int set_slave_hwaddr(char *slave_ifname, struct sockaddr *hwaddr)
+{
+ unsigned char *addr = (unsigned char *)hwaddr->sa_data;
+ struct ifreq ifr;
+ int res = 0;
+
+ strncpy(ifr.ifr_name, slave_ifname, IFNAMSIZ);
+ memcpy(&(ifr.ifr_hwaddr), hwaddr, sizeof(struct sockaddr));
+ res = ioctl(skfd, SIOCSIFHWADDR, &ifr);
+ if (res < 0) {
+ saved_errno = errno;
+
+ v_print("Slave '%s': Error: SIOCSIFHWADDR failed: %s\n",
+ slave_ifname, strerror(saved_errno));
+
+ if (saved_errno == EBUSY) {
+ v_print(" The device is busy: it must be idle "
+ "before running this command.\n");
+ } else if (saved_errno == EOPNOTSUPP) {
+ v_print(" The device does not support setting "
+ "the MAC address.\n"
+ " Your kernel likely does not support slave "
+ "devices.\n");
+ } else if (saved_errno == EINVAL) {
+ v_print(" The device's address type does not match "
+ "the master's address type.\n");
+ }
+ return res;
+ } else {
+ v_print("Slave '%s': hardware address set to "
+ "%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x.\n",
+ slave_ifname, addr[0], addr[1], addr[2],
+ addr[3], addr[4], addr[5]);
+ }
+
+ return res;
+}
+
+static int set_slave_mtu(char *slave_ifname, int mtu)
+{
+ struct ifreq ifr;
+ int res = 0;
+
+ ifr.ifr_mtu = mtu;
+ strncpy(ifr.ifr_name, slave_ifname, IFNAMSIZ);
+
+ res = ioctl(skfd, SIOCSIFMTU, &ifr);
+ if (res < 0) {
+ saved_errno = errno;
+ v_print("Slave '%s': Error: SIOCSIFMTU failed: %s\n",
+ slave_ifname, strerror(saved_errno));
+ } else {
+ v_print("Slave '%s': MTU set to %d.\n", slave_ifname, mtu);
+ }
+
+ return res;
+}
+
+static int set_if_flags(char *ifname, short flags)
+{
+ struct ifreq ifr;
+ int res = 0;
+
+ ifr.ifr_flags = flags;
+ strncpy(ifr.ifr_name, ifname, IFNAMSIZ);
+
+ res = ioctl(skfd, SIOCSIFFLAGS, &ifr);
+ if (res < 0) {
+ saved_errno = errno;
+ v_print("Interface '%s': Error: SIOCSIFFLAGS failed: %s\n",
+ ifname, strerror(saved_errno));
+ } else {
+ v_print("Interface '%s': flags set to %04X.\n", ifname, flags);
+ }
+
+ return res;
+}
+
+static int set_if_up(char *ifname, short flags)
+{
+ return set_if_flags(ifname, flags | IFF_UP);
+}
+
+static int set_if_down(char *ifname, short flags)
+{
+ return set_if_flags(ifname, flags & ~IFF_UP);
+}
+
+static int clear_if_addr(char *ifname)
+{
+ struct ifreq ifr;
+ int res = 0;
+
+ strncpy(ifr.ifr_name, ifname, IFNAMSIZ);
+ ifr.ifr_addr.sa_family = AF_INET;
+ memset(ifr.ifr_addr.sa_data, 0, sizeof(ifr.ifr_addr.sa_data));
+
+ res = ioctl(skfd, SIOCSIFADDR, &ifr);
+ if (res < 0) {
+ saved_errno = errno;
+ v_print("Interface '%s': Error: SIOCSIFADDR failed: %s\n",
+ ifname, strerror(saved_errno));
+ } else {
+ v_print("Interface '%s': address cleared\n", ifname);
+ }
+
+ return res;
+}
+
+static int set_if_addr(char *master_ifname, char *slave_ifname)
+{
+ struct ifreq ifr;
+ int res;
+ unsigned char *ipaddr;
+ int i;
+ struct {
+ char *req_name;
+ char *desc;
+ int g_ioctl;
+ int s_ioctl;
+ } ifra[] = {
+ {"IFADDR", "addr", SIOCGIFADDR, SIOCSIFADDR},
+ {"DSTADDR", "destination addr", SIOCGIFDSTADDR, SIOCSIFDSTADDR},
+ {"BRDADDR", "broadcast addr", SIOCGIFBRDADDR, SIOCSIFBRDADDR},
+ {"NETMASK", "netmask", SIOCGIFNETMASK, SIOCSIFNETMASK},
+ {NULL, NULL, 0, 0},
+ };
+
+ for (i = 0; ifra[i].req_name; i++) {
+ strncpy(ifr.ifr_name, master_ifname, IFNAMSIZ);
+ res = ioctl(skfd, ifra[i].g_ioctl, &ifr);
+ if (res < 0) {
+ int saved_errno = errno;
+
+ v_print("Interface '%s': Error: SIOCG%s failed: %s\n",
+ master_ifname, ifra[i].req_name,
+ strerror(saved_errno));
+
+ ifr.ifr_addr.sa_family = AF_INET;
+ memset(ifr.ifr_addr.sa_data, 0,
+ sizeof(ifr.ifr_addr.sa_data));
+ }
+
+ strncpy(ifr.ifr_name, slave_ifname, IFNAMSIZ);
+ res = ioctl(skfd, ifra[i].s_ioctl, &ifr);
+ if (res < 0) {
+ int saved_errno = errno;
+
+ v_print("Interface '%s': Error: SIOCS%s failed: %s\n",
+ slave_ifname, ifra[i].req_name,
+ strerror(saved_errno));
+
+ }
+
+ ipaddr = (unsigned char *)ifr.ifr_addr.sa_data;
+ v_print("Interface '%s': set IP %s to %d.%d.%d.%d\n",
+ slave_ifname, ifra[i].desc,
+ ipaddr[0], ipaddr[1], ipaddr[2], ipaddr[3]);
+ }
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * version-control: t
+ * kept-new-versions: 5
+ * c-indent-level: 4
+ * c-basic-offset: 4
+ * tab-width: 4
+ * compile-command: "gcc -Wall -Wstrict-prototypes -O -I/usr/src/linux/include ifenslave.c -o ifenslave"
+ * End:
+ */
+
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
new file mode 100644
index 0000000..d849326
--- /dev/null
+++ b/Documentation/networking/ip-sysctl.txt
@@ -0,0 +1,1268 @@
+/proc/sys/net/ipv4/* Variables:
+
+ip_forward - BOOLEAN
+ 0 - disabled (default)
+ not 0 - enabled
+
+ Forward Packets between interfaces.
+
+ This variable is special, its change resets all configuration
+ parameters to their default state (RFC1122 for hosts, RFC1812
+ for routers)
+
+ip_default_ttl - INTEGER
+ default 64
+
+ip_no_pmtu_disc - BOOLEAN
+ Disable Path MTU Discovery.
+ default FALSE
+
+min_pmtu - INTEGER
+ default 562 - minimum discovered Path MTU
+
+mtu_expires - INTEGER
+ Time, in seconds, that cached PMTU information is kept.
+
+min_adv_mss - INTEGER
+ The advertised MSS depends on the first hop route MTU, but will
+ never be lower than this setting.
+
+IP Fragmentation:
+
+ipfrag_high_thresh - INTEGER
+ Maximum memory used to reassemble IP fragments. When
+ ipfrag_high_thresh bytes of memory is allocated for this purpose,
+ the fragment handler will toss packets until ipfrag_low_thresh
+ is reached.
+
+ipfrag_low_thresh - INTEGER
+ See ipfrag_high_thresh
+
+ipfrag_time - INTEGER
+ Time in seconds to keep an IP fragment in memory.
+
+ipfrag_secret_interval - INTEGER
+ Regeneration interval (in seconds) of the hash secret (or lifetime
+ for the hash secret) for IP fragments.
+ Default: 600
+
+ipfrag_max_dist - INTEGER
+ ipfrag_max_dist is a non-negative integer value which defines the
+ maximum "disorder" which is allowed among fragments which share a
+ common IP source address. Note that reordering of packets is
+ not unusual, but if a large number of fragments arrive from a source
+ IP address while a particular fragment queue remains incomplete, it
+ probably indicates that one or more fragments belonging to that queue
+ have been lost. When ipfrag_max_dist is positive, an additional check
+ is done on fragments before they are added to a reassembly queue - if
+ ipfrag_max_dist (or more) fragments have arrived from a particular IP
+ address between additions to any IP fragment queue using that source
+ address, it's presumed that one or more fragments in the queue are
+ lost. The existing fragment queue will be dropped, and a new one
+ started. An ipfrag_max_dist value of zero disables this check.
+
+ Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
+ result in unnecessarily dropping fragment queues when normal
+ reordering of packets occurs, which could lead to poor application
+ performance. Using a very large value, e.g. 50000, increases the
+ likelihood of incorrectly reassembling IP fragments that originate
+ from different IP datagrams, which could result in data corruption.
+ Default: 64
+
+INET peer storage:
+
+inet_peer_threshold - INTEGER
+ The approximate size of the storage. Starting from this threshold
+ entries will be thrown aggressively. This threshold also determines
+ entries' time-to-live and time intervals between garbage collection
+ passes. More entries, less time-to-live, less GC interval.
+
+inet_peer_minttl - INTEGER
+ Minimum time-to-live of entries. Should be enough to cover fragment
+ time-to-live on the reassembling side. This minimum time-to-live is
+ guaranteed if the pool size is less than inet_peer_threshold.
+ Measured in seconds.
+
+inet_peer_maxttl - INTEGER
+ Maximum time-to-live of entries. Unused entries will expire after
+ this period of time if there is no memory pressure on the pool (i.e.
+ when the number of entries in the pool is very small).
+ Measured in seconds.
+
+inet_peer_gc_mintime - INTEGER
+ Minimum interval between garbage collection passes. This interval is
+ in effect under high memory pressure on the pool.
+ Measured in seconds.
+
+inet_peer_gc_maxtime - INTEGER
+ Minimum interval between garbage collection passes. This interval is
+ in effect under low (or absent) memory pressure on the pool.
+ Measured in seconds.
+
+TCP variables:
+
+somaxconn - INTEGER
+ Limit of socket listen() backlog, known in userspace as SOMAXCONN.
+ Defaults to 128. See also tcp_max_syn_backlog for additional tuning
+ for TCP sockets.
+
+tcp_abc - INTEGER
+ Controls Appropriate Byte Count (ABC) defined in RFC3465.
+ ABC is a way of increasing congestion window (cwnd) more slowly
+ in response to partial acknowledgments.
+ Possible values are:
+ 0 increase cwnd once per acknowledgment (no ABC)
+ 1 increase cwnd once per acknowledgment of full sized segment
+ 2 allow increase cwnd by two if acknowledgment is
+ of two segments to compensate for delayed acknowledgments.
+ Default: 0 (off)
+
+tcp_abort_on_overflow - BOOLEAN
+ If listening service is too slow to accept new connections,
+ reset them. Default state is FALSE. It means that if overflow
+ occurred due to a burst, connection will recover. Enable this
+ option _only_ if you are really sure that listening daemon
+ cannot be tuned to accept connections faster. Enabling this
+ option can harm clients of your server.
+
+tcp_adv_win_scale - INTEGER
+ Count buffering overhead as bytes/2^tcp_adv_win_scale
+ (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
+ if it is <= 0.
+ Default: 2
+
+tcp_allowed_congestion_control - STRING
+ Show/set the congestion control choices available to non-privileged
+ processes. The list is a subset of those listed in
+ tcp_available_congestion_control.
+ Default is "reno" and the default setting (tcp_congestion_control).
+
+tcp_app_win - INTEGER
+ Reserve max(window/2^tcp_app_win, mss) of window for application
+ buffer. Value 0 is special, it means that nothing is reserved.
+ Default: 31
+
+tcp_available_congestion_control - STRING
+ Shows the available congestion control choices that are registered.
+ More congestion control algorithms may be available as modules,
+ but not loaded.
+
+tcp_base_mss - INTEGER
+ The initial value of search_low to be used by the packetization layer
+ Path MTU discovery (MTU probing). If MTU probing is enabled,
+ this is the initial MSS used by the connection.
+
+tcp_congestion_control - STRING
+ Set the congestion control algorithm to be used for new
+ connections. The algorithm "reno" is always available, but
+ additional choices may be available based on kernel configuration.
+ Default is set as part of kernel configuration.
+
+tcp_dsack - BOOLEAN
+ Allows TCP to send "duplicate" SACKs.
+
+tcp_ecn - BOOLEAN
+ Enable Explicit Congestion Notification in TCP.
+
+tcp_fack - BOOLEAN
+ Enable FACK congestion avoidance and fast retransmission.
+ The value is not used, if tcp_sack is not enabled.
+
+tcp_fin_timeout - INTEGER
+ Time to hold socket in state FIN-WAIT-2, if it was closed
+ by our side. Peer can be broken and never close its side,
+ or even died unexpectedly. Default value is 60sec.
+ Usual value used in 2.2 was 180 seconds, you may restore
+ it, but remember that if your machine is even underloaded WEB server,
+ you risk to overflow memory with kilotons of dead sockets,
+ FIN-WAIT-2 sockets are less dangerous than FIN-WAIT-1,
+ because they eat maximum 1.5K of memory, but they tend
+ to live longer. Cf. tcp_max_orphans.
+
+tcp_frto - INTEGER
+ Enables Forward RTO-Recovery (F-RTO) defined in RFC4138.
+ F-RTO is an enhanced recovery algorithm for TCP retransmission
+ timeouts. It is particularly beneficial in wireless environments
+ where packet loss is typically due to random radio interference
+ rather than intermediate router congestion. F-RTO is sender-side
+ only modification. Therefore it does not require any support from
+ the peer.
+
+ If set to 1, basic version is enabled. 2 enables SACK enhanced
+ F-RTO if flow uses SACK. The basic version can be used also when
+ SACK is in use though scenario(s) with it exists where F-RTO
+ interacts badly with the packet counting of the SACK enabled TCP
+ flow.
+
+tcp_frto_response - INTEGER
+ When F-RTO has detected that a TCP retransmission timeout was
+ spurious (i.e, the timeout would have been avoided had TCP set a
+ longer retransmission timeout), TCP has several options what to do
+ next. Possible values are:
+ 0 Rate halving based; a smooth and conservative response,
+ results in halved cwnd and ssthresh after one RTT
+ 1 Very conservative response; not recommended because even
+ though being valid, it interacts poorly with the rest of
+ Linux TCP, halves cwnd and ssthresh immediately
+ 2 Aggressive response; undoes congestion control measures
+ that are now known to be unnecessary (ignoring the
+ possibility of a lost retransmission that would require
+ TCP to be more cautious), cwnd and ssthresh are restored
+ to the values prior timeout
+ Default: 0 (rate halving based)
+
+tcp_keepalive_time - INTEGER
+ How often TCP sends out keepalive messages when keepalive is enabled.
+ Default: 2hours.
+
+tcp_keepalive_probes - INTEGER
+ How many keepalive probes TCP sends out, until it decides that the
+ connection is broken. Default value: 9.
+
+tcp_keepalive_intvl - INTEGER
+ How frequently the probes are send out. Multiplied by
+ tcp_keepalive_probes it is time to kill not responding connection,
+ after probes started. Default value: 75sec i.e. connection
+ will be aborted after ~11 minutes of retries.
+
+tcp_low_latency - BOOLEAN
+ If set, the TCP stack makes decisions that prefer lower
+ latency as opposed to higher throughput. By default, this
+ option is not set meaning that higher throughput is preferred.
+ An example of an application where this default should be
+ changed would be a Beowulf compute cluster.
+ Default: 0
+
+tcp_max_orphans - INTEGER
+ Maximal number of TCP sockets not attached to any user file handle,
+ held by system. If this number is exceeded orphaned connections are
+ reset immediately and warning is printed. This limit exists
+ only to prevent simple DoS attacks, you _must_ not rely on this
+ or lower the limit artificially, but rather increase it
+ (probably, after increasing installed memory),
+ if network conditions require more than default value,
+ and tune network services to linger and kill such states
+ more aggressively. Let me to remind again: each orphan eats
+ up to ~64K of unswappable memory.
+
+tcp_max_syn_backlog - INTEGER
+ Maximal number of remembered connection requests, which are
+ still did not receive an acknowledgment from connecting client.
+ Default value is 1024 for systems with more than 128Mb of memory,
+ and 128 for low memory machines. If server suffers of overload,
+ try to increase this number.
+
+tcp_max_tw_buckets - INTEGER
+ Maximal number of timewait sockets held by system simultaneously.
+ If this number is exceeded time-wait socket is immediately destroyed
+ and warning is printed. This limit exists only to prevent
+ simple DoS attacks, you _must_ not lower the limit artificially,
+ but rather increase it (probably, after increasing installed memory),
+ if network conditions require more than default value.
+
+tcp_mem - vector of 3 INTEGERs: min, pressure, max
+ min: below this number of pages TCP is not bothered about its
+ memory appetite.
+
+ pressure: when amount of memory allocated by TCP exceeds this number
+ of pages, TCP moderates its memory consumption and enters memory
+ pressure mode, which is exited when memory consumption falls
+ under "min".
+
+ max: number of pages allowed for queueing by all TCP sockets.
+
+ Defaults are calculated at boot time from amount of available
+ memory.
+
+tcp_moderate_rcvbuf - BOOLEAN
+ If set, TCP performs receive buffer auto-tuning, attempting to
+ automatically size the buffer (no greater than tcp_rmem[2]) to
+ match the size required by the path for full throughput. Enabled by
+ default.
+
+tcp_mtu_probing - INTEGER
+ Controls TCP Packetization-Layer Path MTU Discovery. Takes three
+ values:
+ 0 - Disabled
+ 1 - Disabled by default, enabled when an ICMP black hole detected
+ 2 - Always enabled, use initial MSS of tcp_base_mss.
+
+tcp_no_metrics_save - BOOLEAN
+ By default, TCP saves various connection metrics in the route cache
+ when the connection closes, so that connections established in the
+ near future can use these to set initial conditions. Usually, this
+ increases overall performance, but may sometimes cause performance
+ degradation. If set, TCP will not cache metrics on closing
+ connections.
+
+tcp_orphan_retries - INTEGER
+ How may times to retry before killing TCP connection, closed
+ by our side. Default value 7 corresponds to ~50sec-16min
+ depending on RTO. If you machine is loaded WEB server,
+ you should think about lowering this value, such sockets
+ may consume significant resources. Cf. tcp_max_orphans.
+
+tcp_reordering - INTEGER
+ Maximal reordering of packets in a TCP stream.
+ Default: 3
+
+tcp_retrans_collapse - BOOLEAN
+ Bug-to-bug compatibility with some broken printers.
+ On retransmit try to send bigger packets to work around bugs in
+ certain TCP stacks.
+
+tcp_retries1 - INTEGER
+ How many times to retry before deciding that something is wrong
+ and it is necessary to report this suspicion to network layer.
+ Minimal RFC value is 3, it is default, which corresponds
+ to ~3sec-8min depending on RTO.
+
+tcp_retries2 - INTEGER
+ How may times to retry before killing alive TCP connection.
+ RFC1122 says that the limit should be longer than 100 sec.
+ It is too small number. Default value 15 corresponds to ~13-30min
+ depending on RTO.
+
+tcp_rfc1337 - BOOLEAN
+ If set, the TCP stack behaves conforming to RFC1337. If unset,
+ we are not conforming to RFC, but prevent TCP TIME_WAIT
+ assassination.
+ Default: 0
+
+tcp_rmem - vector of 3 INTEGERs: min, default, max
+ min: Minimal size of receive buffer used by TCP sockets.
+ It is guaranteed to each TCP socket, even under moderate memory
+ pressure.
+ Default: 8K
+
+ default: initial size of receive buffer used by TCP sockets.
+ This value overrides net.core.rmem_default used by other protocols.
+ Default: 87380 bytes. This value results in window of 65535 with
+ default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit
+ less for default tcp_app_win. See below about these variables.
+
+ max: maximal size of receive buffer allowed for automatically
+ selected receiver buffers for TCP socket. This value does not override
+ net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables
+ automatic tuning of that socket's receive buffer size, in which
+ case this value is ignored.
+ Default: between 87380B and 4MB, depending on RAM size.
+
+tcp_sack - BOOLEAN
+ Enable select acknowledgments (SACKS).
+
+tcp_slow_start_after_idle - BOOLEAN
+ If set, provide RFC2861 behavior and time out the congestion
+ window after an idle period. An idle period is defined at
+ the current RTO. If unset, the congestion window will not
+ be timed out after an idle period.
+ Default: 1
+
+tcp_stdurg - BOOLEAN
+ Use the Host requirements interpretation of the TCP urgent pointer field.
+ Most hosts use the older BSD interpretation, so if you turn this on
+ Linux might not communicate correctly with them.
+ Default: FALSE
+
+tcp_synack_retries - INTEGER
+ Number of times SYNACKs for a passive TCP connection attempt will
+ be retransmitted. Should not be higher than 255. Default value
+ is 5, which corresponds to ~180seconds.
+
+tcp_syncookies - BOOLEAN
+ Only valid when the kernel was compiled with CONFIG_SYNCOOKIES
+ Send out syncookies when the syn backlog queue of a socket
+ overflows. This is to prevent against the common 'SYN flood attack'
+ Default: FALSE
+
+ Note, that syncookies is fallback facility.
+ It MUST NOT be used to help highly loaded servers to stand
+ against legal connection rate. If you see SYN flood warnings
+ in your logs, but investigation shows that they occur
+ because of overload with legal connections, you should tune
+ another parameters until this warning disappear.
+ See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow.
+
+ syncookies seriously violate TCP protocol, do not allow
+ to use TCP extensions, can result in serious degradation
+ of some services (f.e. SMTP relaying), visible not by you,
+ but your clients and relays, contacting you. While you see
+ SYN flood warnings in logs not being really flooded, your server
+ is seriously misconfigured.
+
+tcp_syn_retries - INTEGER
+ Number of times initial SYNs for an active TCP connection attempt
+ will be retransmitted. Should not be higher than 255. Default value
+ is 5, which corresponds to ~180seconds.
+
+tcp_timestamps - BOOLEAN
+ Enable timestamps as defined in RFC1323.
+
+tcp_tso_win_divisor - INTEGER
+ This allows control over what percentage of the congestion window
+ can be consumed by a single TSO frame.
+ The setting of this parameter is a choice between burstiness and
+ building larger TSO frames.
+ Default: 3
+
+tcp_tw_recycle - BOOLEAN
+ Enable fast recycling TIME-WAIT sockets. Default value is 0.
+ It should not be changed without advice/request of technical
+ experts.
+
+tcp_tw_reuse - BOOLEAN
+ Allow to reuse TIME-WAIT sockets for new connections when it is
+ safe from protocol viewpoint. Default value is 0.
+ It should not be changed without advice/request of technical
+ experts.
+
+tcp_window_scaling - BOOLEAN
+ Enable window scaling as defined in RFC1323.
+
+tcp_wmem - vector of 3 INTEGERs: min, default, max
+ min: Amount of memory reserved for send buffers for TCP sockets.
+ Each TCP socket has rights to use it due to fact of its birth.
+ Default: 4K
+
+ default: initial size of send buffer used by TCP sockets. This
+ value overrides net.core.wmem_default used by other protocols.
+ It is usually lower than net.core.wmem_default.
+ Default: 16K
+
+ max: Maximal amount of memory allowed for automatically tuned
+ send buffers for TCP sockets. This value does not override
+ net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables
+ automatic tuning of that socket's send buffer size, in which case
+ this value is ignored.
+ Default: between 64K and 4MB, depending on RAM size.
+
+tcp_workaround_signed_windows - BOOLEAN
+ If set, assume no receipt of a window scaling option means the
+ remote TCP is broken and treats the window as a signed quantity.
+ If unset, assume the remote TCP is not broken even if we do
+ not receive a window scaling option from them.
+ Default: 0
+
+tcp_dma_copybreak - INTEGER
+ Lower limit, in bytes, of the size of socket reads that will be
+ offloaded to a DMA copy engine, if one is present in the system
+ and CONFIG_NET_DMA is enabled.
+ Default: 4096
+
+UDP variables:
+
+udp_mem - vector of 3 INTEGERs: min, pressure, max
+ Number of pages allowed for queueing by all UDP sockets.
+
+ min: Below this number of pages UDP is not bothered about its
+ memory appetite. When amount of memory allocated by UDP exceeds
+ this number, UDP starts to moderate memory usage.
+
+ pressure: This value was introduced to follow format of tcp_mem.
+
+ max: Number of pages allowed for queueing by all UDP sockets.
+
+ Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+ Minimal size of receive buffer used by UDP sockets in moderation.
+ Each UDP socket is able to use the size for receiving data, even if
+ total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
+ Default: 4096
+
+udp_wmem_min - INTEGER
+ Minimal size of send buffer used by UDP sockets in moderation.
+ Each UDP socket is able to use the size for sending data, even if
+ total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
+ Default: 4096
+
+CIPSOv4 Variables:
+
+cipso_cache_enable - BOOLEAN
+ If set, enable additions to and lookups from the CIPSO label mapping
+ cache. If unset, additions are ignored and lookups always result in a
+ miss. However, regardless of the setting the cache is still
+ invalidated when required when means you can safely toggle this on and
+ off and the cache will always be "safe".
+ Default: 1
+
+cipso_cache_bucket_size - INTEGER
+ The CIPSO label cache consists of a fixed size hash table with each
+ hash bucket containing a number of cache entries. This variable limits
+ the number of entries in each hash bucket; the larger the value the
+ more CIPSO label mappings that can be cached. When the number of
+ entries in a given hash bucket reaches this limit adding new entries
+ causes the oldest entry in the bucket to be removed to make room.
+ Default: 10
+
+cipso_rbm_optfmt - BOOLEAN
+ Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of
+ the CIPSO draft specification (see Documentation/netlabel for details).
+ This means that when set the CIPSO tag will be padded with empty
+ categories in order to make the packet data 32-bit aligned.
+ Default: 0
+
+cipso_rbm_structvalid - BOOLEAN
+ If set, do a very strict check of the CIPSO option when
+ ip_options_compile() is called. If unset, relax the checks done during
+ ip_options_compile(). Either way is "safe" as errors are caught else
+ where in the CIPSO processing code but setting this to 0 (False) should
+ result in less work (i.e. it should be faster) but could cause problems
+ with other implementations that require strict checking.
+ Default: 0
+
+IP Variables:
+
+ip_local_port_range - 2 INTEGERS
+ Defines the local port range that is used by TCP and UDP to
+ choose the local port. The first number is the first, the
+ second the last local port number. Default value depends on
+ amount of memory available on the system:
+ > 128Mb 32768-61000
+ < 128Mb 1024-4999 or even less.
+ This number defines number of active connections, which this
+ system can issue simultaneously to systems not supporting
+ TCP extensions (timestamps). With tcp_tw_recycle enabled
+ (i.e. by default) range 1024-4999 is enough to issue up to
+ 2000 connections per second to systems supporting timestamps.
+
+ip_nonlocal_bind - BOOLEAN
+ If set, allows processes to bind() to non-local IP addresses,
+ which can be quite useful - but may break some applications.
+ Default: 0
+
+ip_dynaddr - BOOLEAN
+ If set non-zero, enables support for dynamic addresses.
+ If set to a non-zero value larger than 1, a kernel log
+ message will be printed when dynamic address rewriting
+ occurs.
+ Default: 0
+
+icmp_echo_ignore_all - BOOLEAN
+ If set non-zero, then the kernel will ignore all ICMP ECHO
+ requests sent to it.
+ Default: 0
+
+icmp_echo_ignore_broadcasts - BOOLEAN
+ If set non-zero, then the kernel will ignore all ICMP ECHO and
+ TIMESTAMP requests sent to it via broadcast/multicast.
+ Default: 1
+
+icmp_ratelimit - INTEGER
+ Limit the maximal rates for sending ICMP packets whose type matches
+ icmp_ratemask (see below) to specific targets.
+ 0 to disable any limiting,
+ otherwise the minimal space between responses in milliseconds.
+ Default: 1000
+
+icmp_ratemask - INTEGER
+ Mask made of ICMP types for which rates are being limited.
+ Significant bits: IHGFEDCBA9876543210
+ Default mask: 0000001100000011000 (6168)
+
+ Bit definitions (see include/linux/icmp.h):
+ 0 Echo Reply
+ 3 Destination Unreachable *
+ 4 Source Quench *
+ 5 Redirect
+ 8 Echo Request
+ B Time Exceeded *
+ C Parameter Problem *
+ D Timestamp Request
+ E Timestamp Reply
+ F Info Request
+ G Info Reply
+ H Address Mask Request
+ I Address Mask Reply
+
+ * These are rate limited by default (see default mask above)
+
+icmp_ignore_bogus_error_responses - BOOLEAN
+ Some routers violate RFC1122 by sending bogus responses to broadcast
+ frames. Such violations are normally logged via a kernel warning.
+ If this is set to TRUE, the kernel will not give such warnings, which
+ will avoid log file clutter.
+ Default: FALSE
+
+icmp_errors_use_inbound_ifaddr - BOOLEAN
+
+ If zero, icmp error messages are sent with the primary address of
+ the exiting interface.
+
+ If non-zero, the message will be sent with the primary address of
+ the interface that received the packet that caused the icmp error.
+ This is the behaviour network many administrators will expect from
+ a router. And it can make debugging complicated network layouts
+ much easier.
+
+ Note that if no primary address exists for the interface selected,
+ then the primary address of the first non-loopback interface that
+ has one will be used regardless of this setting.
+
+ Default: 0
+
+igmp_max_memberships - INTEGER
+ Change the maximum number of multicast groups we can subscribe to.
+ Default: 20
+
+conf/interface/* changes special settings per interface (where "interface" is
+ the name of your network interface)
+conf/all/* is special, changes the settings for all interfaces
+
+
+log_martians - BOOLEAN
+ Log packets with impossible addresses to kernel log.
+ log_martians for the interface will be enabled if at least one of
+ conf/{all,interface}/log_martians is set to TRUE,
+ it will be disabled otherwise
+
+accept_redirects - BOOLEAN
+ Accept ICMP redirect messages.
+ accept_redirects for the interface will be enabled if:
+ - both conf/{all,interface}/accept_redirects are TRUE in the case forwarding
+ for the interface is enabled
+ or
+ - at least one of conf/{all,interface}/accept_redirects is TRUE in the case
+ forwarding for the interface is disabled
+ accept_redirects for the interface will be disabled otherwise
+ default TRUE (host)
+ FALSE (router)
+
+forwarding - BOOLEAN
+ Enable IP forwarding on this interface.
+
+mc_forwarding - BOOLEAN
+ Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
+ and a multicast routing daemon is required.
+ conf/all/mc_forwarding must also be set to TRUE to enable multicast routing
+ for the interface
+
+medium_id - INTEGER
+ Integer value used to differentiate the devices by the medium they
+ are attached to. Two devices can have different id values when
+ the broadcast packets are received only on one of them.
+ The default value 0 means that the device is the only interface
+ to its medium, value of -1 means that medium is not known.
+
+ Currently, it is used to change the proxy_arp behavior:
+ the proxy_arp feature is enabled for packets forwarded between
+ two devices attached to different media.
+
+proxy_arp - BOOLEAN
+ Do proxy arp.
+ proxy_arp for the interface will be enabled if at least one of
+ conf/{all,interface}/proxy_arp is set to TRUE,
+ it will be disabled otherwise
+
+shared_media - BOOLEAN
+ Send(router) or accept(host) RFC1620 shared media redirects.
+ Overrides ip_secure_redirects.
+ shared_media for the interface will be enabled if at least one of
+ conf/{all,interface}/shared_media is set to TRUE,
+ it will be disabled otherwise
+ default TRUE
+
+secure_redirects - BOOLEAN
+ Accept ICMP redirect messages only for gateways,
+ listed in default gateway list.
+ secure_redirects for the interface will be enabled if at least one of
+ conf/{all,interface}/secure_redirects is set to TRUE,
+ it will be disabled otherwise
+ default TRUE
+
+send_redirects - BOOLEAN
+ Send redirects, if router.
+ send_redirects for the interface will be enabled if at least one of
+ conf/{all,interface}/send_redirects is set to TRUE,
+ it will be disabled otherwise
+ Default: TRUE
+
+bootp_relay - BOOLEAN
+ Accept packets with source address 0.b.c.d destined
+ not to this host as local ones. It is supposed, that
+ BOOTP relay daemon will catch and forward such packets.
+ conf/all/bootp_relay must also be set to TRUE to enable BOOTP relay
+ for the interface
+ default FALSE
+ Not Implemented Yet.
+
+accept_source_route - BOOLEAN
+ Accept packets with SRR option.
+ conf/all/accept_source_route must also be set to TRUE to accept packets
+ with SRR option on the interface
+ default TRUE (router)
+ FALSE (host)
+
+rp_filter - BOOLEAN
+ 1 - do source validation by reversed path, as specified in RFC1812
+ Recommended option for single homed hosts and stub network
+ routers. Could cause troubles for complicated (not loop free)
+ networks running a slow unreliable protocol (sort of RIP),
+ or using static routes.
+
+ 0 - No source validation.
+
+ conf/all/rp_filter must also be set to TRUE to do source validation
+ on the interface
+
+ Default value is 0. Note that some distributions enable it
+ in startup scripts.
+
+arp_filter - BOOLEAN
+ 1 - Allows you to have multiple network interfaces on the same
+ subnet, and have the ARPs for each interface be answered
+ based on whether or not the kernel would route a packet from
+ the ARP'd IP out that interface (therefore you must use source
+ based routing for this to work). In other words it allows control
+ of which cards (usually 1) will respond to an arp request.
+
+ 0 - (default) The kernel can respond to arp requests with addresses
+ from other interfaces. This may seem wrong but it usually makes
+ sense, because it increases the chance of successful communication.
+ IP addresses are owned by the complete host on Linux, not by
+ particular interfaces. Only for more complex setups like load-
+ balancing, does this behaviour cause problems.
+
+ arp_filter for the interface will be enabled if at least one of
+ conf/{all,interface}/arp_filter is set to TRUE,
+ it will be disabled otherwise
+
+arp_announce - INTEGER
+ Define different restriction levels for announcing the local
+ source IP address from IP packets in ARP requests sent on
+ interface:
+ 0 - (default) Use any local address, configured on any interface
+ 1 - Try to avoid local addresses that are not in the target's
+ subnet for this interface. This mode is useful when target
+ hosts reachable via this interface require the source IP
+ address in ARP requests to be part of their logical network
+ configured on the receiving interface. When we generate the
+ request we will check all our subnets that include the
+ target IP and will preserve the source address if it is from
+ such subnet. If there is no such subnet we select source
+ address according to the rules for level 2.
+ 2 - Always use the best local address for this target.
+ In this mode we ignore the source address in the IP packet
+ and try to select local address that we prefer for talks with
+ the target host. Such local address is selected by looking
+ for primary IP addresses on all our subnets on the outgoing
+ interface that include the target IP address. If no suitable
+ local address is found we select the first local address
+ we have on the outgoing interface or on all other interfaces,
+ with the hope we will receive reply for our request and
+ even sometimes no matter the source IP address we announce.
+
+ The max value from conf/{all,interface}/arp_announce is used.
+
+ Increasing the restriction level gives more chance for
+ receiving answer from the resolved target while decreasing
+ the level announces more valid sender's information.
+
+arp_ignore - INTEGER
+ Define different modes for sending replies in response to
+ received ARP requests that resolve local target IP addresses:
+ 0 - (default): reply for any local target IP address, configured
+ on any interface
+ 1 - reply only if the target IP address is local address
+ configured on the incoming interface
+ 2 - reply only if the target IP address is local address
+ configured on the incoming interface and both with the
+ sender's IP address are part from same subnet on this interface
+ 3 - do not reply for local addresses configured with scope host,
+ only resolutions for global and link addresses are replied
+ 4-7 - reserved
+ 8 - do not reply for all local addresses
+
+ The max value from conf/{all,interface}/arp_ignore is used
+ when ARP request is received on the {interface}
+
+arp_accept - BOOLEAN
+ Define behavior when gratuitous arp replies are received:
+ 0 - drop gratuitous arp frames
+ 1 - accept gratuitous arp frames
+
+app_solicit - INTEGER
+ The maximum number of probes to send to the user space ARP daemon
+ via netlink before dropping back to multicast probes (see
+ mcast_solicit). Defaults to 0.
+
+disable_policy - BOOLEAN
+ Disable IPSEC policy (SPD) for this interface
+
+disable_xfrm - BOOLEAN
+ Disable IPSEC encryption on this interface, whatever the policy
+
+
+
+tag - INTEGER
+ Allows you to write a number, which can be used as required.
+ Default value is 0.
+
+Alexey Kuznetsov.
+kuznet@ms2.inr.ac.ru
+
+Updated by:
+Andi Kleen
+ak@muc.de
+Nicolas Delon
+delon.nicolas@wanadoo.fr
+
+
+
+
+/proc/sys/net/ipv6/* Variables:
+
+IPv6 has no global variables such as tcp_*. tcp_* settings under ipv4/ also
+apply to IPv6 [XXX?].
+
+bindv6only - BOOLEAN
+ Default value for IPV6_V6ONLY socket option,
+ which restricts use of the IPv6 socket to IPv6 communication
+ only.
+ TRUE: disable IPv4-mapped address feature
+ FALSE: enable IPv4-mapped address feature
+
+ Default: FALSE (as specified in RFC2553bis)
+
+IPv6 Fragmentation:
+
+ip6frag_high_thresh - INTEGER
+ Maximum memory used to reassemble IPv6 fragments. When
+ ip6frag_high_thresh bytes of memory is allocated for this purpose,
+ the fragment handler will toss packets until ip6frag_low_thresh
+ is reached.
+
+ip6frag_low_thresh - INTEGER
+ See ip6frag_high_thresh
+
+ip6frag_time - INTEGER
+ Time in seconds to keep an IPv6 fragment in memory.
+
+ip6frag_secret_interval - INTEGER
+ Regeneration interval (in seconds) of the hash secret (or lifetime
+ for the hash secret) for IPv6 fragments.
+ Default: 600
+
+conf/default/*:
+ Change the interface-specific default settings.
+
+
+conf/all/*:
+ Change all the interface-specific settings.
+
+ [XXX: Other special features than forwarding?]
+
+conf/all/forwarding - BOOLEAN
+ Enable global IPv6 forwarding between all interfaces.
+
+ IPv4 and IPv6 work differently here; e.g. netfilter must be used
+ to control which interfaces may forward packets and which not.
+
+ This also sets all interfaces' Host/Router setting
+ 'forwarding' to the specified value. See below for details.
+
+ This referred to as global forwarding.
+
+proxy_ndp - BOOLEAN
+ Do proxy ndp.
+
+conf/interface/*:
+ Change special settings per interface.
+
+ The functional behaviour for certain settings is different
+ depending on whether local forwarding is enabled or not.
+
+accept_ra - BOOLEAN
+ Accept Router Advertisements; autoconfigure using them.
+
+ Functional default: enabled if local forwarding is disabled.
+ disabled if local forwarding is enabled.
+
+accept_ra_defrtr - BOOLEAN
+ Learn default router in Router Advertisement.
+
+ Functional default: enabled if accept_ra is enabled.
+ disabled if accept_ra is disabled.
+
+accept_ra_pinfo - BOOLEAN
+ Learn Prefix Information in Router Advertisement.
+
+ Functional default: enabled if accept_ra is enabled.
+ disabled if accept_ra is disabled.
+
+accept_ra_rt_info_max_plen - INTEGER
+ Maximum prefix length of Route Information in RA.
+
+ Route Information w/ prefix larger than or equal to this
+ variable shall be ignored.
+
+ Functional default: 0 if accept_ra_rtr_pref is enabled.
+ -1 if accept_ra_rtr_pref is disabled.
+
+accept_ra_rtr_pref - BOOLEAN
+ Accept Router Preference in RA.
+
+ Functional default: enabled if accept_ra is enabled.
+ disabled if accept_ra is disabled.
+
+accept_redirects - BOOLEAN
+ Accept Redirects.
+
+ Functional default: enabled if local forwarding is disabled.
+ disabled if local forwarding is enabled.
+
+accept_source_route - INTEGER
+ Accept source routing (routing extension header).
+
+ >= 0: Accept only routing header type 2.
+ < 0: Do not accept routing header.
+
+ Default: 0
+
+autoconf - BOOLEAN
+ Autoconfigure addresses using Prefix Information in Router
+ Advertisements.
+
+ Functional default: enabled if accept_ra_pinfo is enabled.
+ disabled if accept_ra_pinfo is disabled.
+
+dad_transmits - INTEGER
+ The amount of Duplicate Address Detection probes to send.
+ Default: 1
+
+forwarding - BOOLEAN
+ Configure interface-specific Host/Router behaviour.
+
+ Note: It is recommended to have the same setting on all
+ interfaces; mixed router/host scenarios are rather uncommon.
+
+ FALSE:
+
+ By default, Host behaviour is assumed. This means:
+
+ 1. IsRouter flag is not set in Neighbour Advertisements.
+ 2. Router Solicitations are being sent when necessary.
+ 3. If accept_ra is TRUE (default), accept Router
+ Advertisements (and do autoconfiguration).
+ 4. If accept_redirects is TRUE (default), accept Redirects.
+
+ TRUE:
+
+ If local forwarding is enabled, Router behaviour is assumed.
+ This means exactly the reverse from the above:
+
+ 1. IsRouter flag is set in Neighbour Advertisements.
+ 2. Router Solicitations are not sent.
+ 3. Router Advertisements are ignored.
+ 4. Redirects are ignored.
+
+ Default: FALSE if global forwarding is disabled (default),
+ otherwise TRUE.
+
+hop_limit - INTEGER
+ Default Hop Limit to set.
+ Default: 64
+
+mtu - INTEGER
+ Default Maximum Transfer Unit
+ Default: 1280 (IPv6 required minimum)
+
+router_probe_interval - INTEGER
+ Minimum interval (in seconds) between Router Probing described
+ in RFC4191.
+
+ Default: 60
+
+router_solicitation_delay - INTEGER
+ Number of seconds to wait after interface is brought up
+ before sending Router Solicitations.
+ Default: 1
+
+router_solicitation_interval - INTEGER
+ Number of seconds to wait between Router Solicitations.
+ Default: 4
+
+router_solicitations - INTEGER
+ Number of Router Solicitations to send until assuming no
+ routers are present.
+ Default: 3
+
+use_tempaddr - INTEGER
+ Preference for Privacy Extensions (RFC3041).
+ <= 0 : disable Privacy Extensions
+ == 1 : enable Privacy Extensions, but prefer public
+ addresses over temporary addresses.
+ > 1 : enable Privacy Extensions and prefer temporary
+ addresses over public addresses.
+ Default: 0 (for most devices)
+ -1 (for point-to-point devices and loopback devices)
+
+temp_valid_lft - INTEGER
+ valid lifetime (in seconds) for temporary addresses.
+ Default: 604800 (7 days)
+
+temp_prefered_lft - INTEGER
+ Preferred lifetime (in seconds) for temporary addresses.
+ Default: 86400 (1 day)
+
+max_desync_factor - INTEGER
+ Maximum value for DESYNC_FACTOR, which is a random value
+ that ensures that clients don't synchronize with each
+ other and generate new addresses at exactly the same time.
+ value is in seconds.
+ Default: 600
+
+regen_max_retry - INTEGER
+ Number of attempts before give up attempting to generate
+ valid temporary addresses.
+ Default: 5
+
+max_addresses - INTEGER
+ Number of maximum addresses per interface. 0 disables limitation.
+ It is recommended not set too large value (or 0) because it would
+ be too easy way to crash kernel to allow to create too much of
+ autoconfigured addresses.
+ Default: 16
+
+disable_ipv6 - BOOLEAN
+ Disable IPv6 operation.
+ Default: FALSE (enable IPv6 operation)
+
+accept_dad - INTEGER
+ Whether to accept DAD (Duplicate Address Detection).
+ 0: Disable DAD
+ 1: Enable DAD (default)
+ 2: Enable DAD, and disable IPv6 operation if MAC-based duplicate
+ link-local address has been found.
+
+icmp/*:
+ratelimit - INTEGER
+ Limit the maximal rates for sending ICMPv6 packets.
+ 0 to disable any limiting,
+ otherwise the minimal space between responses in milliseconds.
+ Default: 1000
+
+
+IPv6 Update by:
+Pekka Savola <pekkas@netcore.fi>
+YOSHIFUJI Hideaki / USAGI Project <yoshfuji@linux-ipv6.org>
+
+
+/proc/sys/net/bridge/* Variables:
+
+bridge-nf-call-arptables - BOOLEAN
+ 1 : pass bridged ARP traffic to arptables' FORWARD chain.
+ 0 : disable this.
+ Default: 1
+
+bridge-nf-call-iptables - BOOLEAN
+ 1 : pass bridged IPv4 traffic to iptables' chains.
+ 0 : disable this.
+ Default: 1
+
+bridge-nf-call-ip6tables - BOOLEAN
+ 1 : pass bridged IPv6 traffic to ip6tables' chains.
+ 0 : disable this.
+ Default: 1
+
+bridge-nf-filter-vlan-tagged - BOOLEAN
+ 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables.
+ 0 : disable this.
+ Default: 1
+
+bridge-nf-filter-pppoe-tagged - BOOLEAN
+ 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables.
+ 0 : disable this.
+ Default: 1
+
+
+proc/sys/net/sctp/* Variables:
+
+addip_enable - BOOLEAN
+ Enable or disable extension of Dynamic Address Reconfiguration
+ (ADD-IP) functionality specified in RFC5061. This extension provides
+ the ability to dynamically add and remove new addresses for the SCTP
+ associations.
+
+ 1: Enable extension.
+
+ 0: Disable extension.
+
+ Default: 0
+
+addip_noauth_enable - BOOLEAN
+ Dynamic Address Reconfiguration (ADD-IP) requires the use of
+ authentication to protect the operations of adding or removing new
+ addresses. This requirement is mandated so that unauthorized hosts
+ would not be able to hijack associations. However, older
+ implementations may not have implemented this requirement while
+ allowing the ADD-IP extension. For reasons of interoperability,
+ we provide this variable to control the enforcement of the
+ authentication requirement.
+
+ 1: Allow ADD-IP extension to be used without authentication. This
+ should only be set in a closed environment for interoperability
+ with older implementations.
+
+ 0: Enforce the authentication requirement
+
+ Default: 0
+
+auth_enable - BOOLEAN
+ Enable or disable Authenticated Chunks extension. This extension
+ provides the ability to send and receive authenticated chunks and is
+ required for secure operation of Dynamic Address Reconfiguration
+ (ADD-IP) extension.
+
+ 1: Enable this extension.
+ 0: Disable this extension.
+
+ Default: 0
+
+prsctp_enable - BOOLEAN
+ Enable or disable the Partial Reliability extension (RFC3758) which
+ is used to notify peers that a given DATA should no longer be expected.
+
+ 1: Enable extension
+ 0: Disable
+
+ Default: 1
+
+max_burst - INTEGER
+ The limit of the number of new packets that can be initially sent. It
+ controls how bursty the generated traffic can be.
+
+ Default: 4
+
+association_max_retrans - INTEGER
+ Set the maximum number for retransmissions that an association can
+ attempt deciding that the remote end is unreachable. If this value
+ is exceeded, the association is terminated.
+
+ Default: 10
+
+max_init_retransmits - INTEGER
+ The maximum number of retransmissions of INIT and COOKIE-ECHO chunks
+ that an association will attempt before declaring the destination
+ unreachable and terminating.
+
+ Default: 8
+
+path_max_retrans - INTEGER
+ The maximum number of retransmissions that will be attempted on a given
+ path. Once this threshold is exceeded, the path is considered
+ unreachable, and new traffic will use a different path when the
+ association is multihomed.
+
+ Default: 5
+
+rto_initial - INTEGER
+ The initial round trip timeout value in milliseconds that will be used
+ in calculating round trip times. This is the initial time interval
+ for retransmissions.
+
+ Default: 3000
+
+rto_max - INTEGER
+ The maximum value (in milliseconds) of the round trip timeout. This
+ is the largest time interval that can elapse between retransmissions.
+
+ Default: 60000
+
+rto_min - INTEGER
+ The minimum value (in milliseconds) of the round trip timeout. This
+ is the smallest time interval the can elapse between retransmissions.
+
+ Default: 1000
+
+hb_interval - INTEGER
+ The interval (in milliseconds) between HEARTBEAT chunks. These chunks
+ are sent at the specified interval on idle paths to probe the state of
+ a given path between 2 associations.
+
+ Default: 30000
+
+sack_timeout - INTEGER
+ The amount of time (in milliseconds) that the implementation will wait
+ to send a SACK.
+
+ Default: 200
+
+valid_cookie_life - INTEGER
+ The default lifetime of the SCTP cookie (in milliseconds). The cookie
+ is used during association establishment.
+
+ Default: 60000
+
+cookie_preserve_enable - BOOLEAN
+ Enable or disable the ability to extend the lifetime of the SCTP cookie
+ that is used during the establishment phase of SCTP association
+
+ 1: Enable cookie lifetime extension.
+ 0: Disable
+
+ Default: 1
+
+rcvbuf_policy - INTEGER
+ Determines if the receive buffer is attributed to the socket or to
+ association. SCTP supports the capability to create multiple
+ associations on a single socket. When using this capability, it is
+ possible that a single stalled association that's buffering a lot
+ of data may block other associations from delivering their data by
+ consuming all of the receive buffer space. To work around this,
+ the rcvbuf_policy could be set to attribute the receiver buffer space
+ to each association instead of the socket. This prevents the described
+ blocking.
+
+ 1: rcvbuf space is per association
+ 0: recbuf space is per socket
+
+ Default: 0
+
+sndbuf_policy - INTEGER
+ Similar to rcvbuf_policy above, this applies to send buffer space.
+
+ 1: Send buffer is tracked per association
+ 0: Send buffer is tracked per socket.
+
+ Default: 0
+
+sctp_mem - vector of 3 INTEGERs: min, pressure, max
+ Number of pages allowed for queueing by all SCTP sockets.
+
+ min: Below this number of pages SCTP is not bothered about its
+ memory appetite. When amount of memory allocated by SCTP exceeds
+ this number, SCTP starts to moderate memory usage.
+
+ pressure: This value was introduced to follow format of tcp_mem.
+
+ max: Number of pages allowed for queueing by all SCTP sockets.
+
+ Default is calculated at boot time from amount of available memory.
+
+sctp_rmem - vector of 3 INTEGERs: min, default, max
+ See tcp_rmem for a description.
+
+sctp_wmem - vector of 3 INTEGERs: min, default, max
+ See tcp_wmem for a description.
+
+UNDOCUMENTED:
+
+/proc/sys/net/core/*
+ dev_weight FIXME
+
+/proc/sys/net/unix/*
+ max_dgram_qlen FIXME
+
+/proc/sys/net/irda/*
+ fast_poll_increase FIXME
+ warn_noreply_time FIXME
+ discovery_slots FIXME
+ slot_timeout FIXME
+ max_baud_rate FIXME
+ discovery_timeout FIXME
+ lap_keepalive_time FIXME
+ max_noreply_time FIXME
+ max_tx_data_size FIXME
+ max_tx_window FIXME
+ min_tx_turn_time FIXME
diff --git a/Documentation/networking/ip_dynaddr.txt b/Documentation/networking/ip_dynaddr.txt
new file mode 100644
index 0000000..45f3c12
--- /dev/null
+++ b/Documentation/networking/ip_dynaddr.txt
@@ -0,0 +1,29 @@
+IP dynamic address hack-port v0.03
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This stuff allows diald ONESHOT connections to get established by
+dynamically changing packet source address (and socket's if local procs).
+It is implemented for TCP diald-box connections(1) and IP_MASQuerading(2).
+
+If enabled[*] and forwarding interface has changed:
+ 1) Socket (and packet) source address is rewritten ON RETRANSMISSIONS
+ while in SYN_SENT state (diald-box processes).
+ 2) Out-bounded MASQueraded source address changes ON OUTPUT (when
+ internal host does retransmission) until a packet from outside is
+ received by the tunnel.
+
+This is specially helpful for auto dialup links (diald), where the
+``actual'' outgoing address is unknown at the moment the link is
+going up. So, the *same* (local AND masqueraded) connections requests that
+bring the link up will be able to get established.
+
+[*] At boot, by default no address rewriting is attempted.
+ To enable:
+ # echo 1 > /proc/sys/net/ipv4/ip_dynaddr
+ To enable verbose mode:
+ # echo 2 > /proc/sys/net/ipv4/ip_dynaddr
+ To disable (default)
+ # echo 0 > /proc/sys/net/ipv4/ip_dynaddr
+
+Enjoy!
+
+-- Juanjo <jjciarla@raiz.uncu.edu.ar>
diff --git a/Documentation/networking/ipddp.txt b/Documentation/networking/ipddp.txt
new file mode 100644
index 0000000..661a555
--- /dev/null
+++ b/Documentation/networking/ipddp.txt
@@ -0,0 +1,78 @@
+Text file for ipddp.c:
+ AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation
+
+This text file is written by Jay Schulist <jschlst@samba.org>
+
+Introduction
+------------
+
+AppleTalk-IP (IPDDP) is the method computers connected to AppleTalk
+networks can use to communicate via IP. AppleTalk-IP is simply IP datagrams
+inside AppleTalk packets.
+
+Through this driver you can either allow your Linux box to communicate
+IP over an AppleTalk network or you can provide IP gatewaying functions
+for your AppleTalk users.
+
+You can currently encapsulate or decapsulate AppleTalk-IP on LocalTalk,
+EtherTalk and PPPTalk. The only limit on the protocol is that of what
+kernel AppleTalk layer and drivers are available.
+
+Each mode requires its own user space software.
+
+Compiling AppleTalk-IP Decapsulation/Encapsulation
+=================================================
+
+AppleTalk-IP decapsulation needs to be compiled into your kernel. You
+will need to turn on AppleTalk-IP driver support. Then you will need to
+select ONE of the two options; IP to AppleTalk-IP encapsulation support or
+AppleTalk-IP to IP decapsulation support. If you compile the driver
+statically you will only be able to use the driver for the function you have
+enabled in the kernel. If you compile the driver as a module you can
+select what mode you want it to run in via a module loading param.
+ipddp_mode=1 for AppleTalk-IP encapsulation and ipddp_mode=2 for
+AppleTalk-IP to IP decapsulation.
+
+Basic instructions for user space tools
+=======================================
+
+To enable AppleTalk-IP decapsulation/encapsulation you will need the
+proper tools. You can get the tools for decapsulation from
+http://spacs1.spacs.k12.wi.us/~jschlst/index.html and for encapsulation
+from http://www.maths.unm.edu/~bradford/ltpc.html
+
+I will briefly describe the operation of the tools, but you will
+need to consult the supporting documentation for each set of tools.
+
+Decapsulation - You will need to download a software package called
+MacGate. In this distribution there will be a tool called MacRoute
+which enables you to add routes to the kernel for your Macs by hand.
+Also the tool MacRegGateWay is included to register the
+proper IP Gateway and IP addresses for your machine. Included in this
+distribution is a patch to netatalk-1.4b2+asun2.0a17.2 (available from
+ftp.u.washington.edu/pub/user-supported/asun/) this patch is optional
+but it allows automatic adding and deleting of routes for Macs. (Handy
+for locations with large Mac installations)
+
+Encapsulation - You will need to download a software daemon called ipddpd.
+This software expects there to be an AppleTalk-IP gateway on the network.
+You will also need to add the proper routes to route your Linux box's IP
+traffic out the ipddp interface.
+
+Common Uses of ipddp.c
+----------------------
+Of course AppleTalk-IP decapsulation and encapsulation, but specifically
+decapsulation is being used most for connecting LocalTalk networks to
+IP networks. Although it has been used on EtherTalk networks to allow
+Macs that are only able to tunnel IP over EtherTalk.
+
+Encapsulation has been used to allow a Linux box stuck on a LocalTalk
+network to use IP. It should work equally well if you are stuck on an
+EtherTalk only network.
+
+Further Assistance
+-------------------
+You can contact me (Jay Schulist <jschlst@samba.org>) with any
+questions regarding decapsulation or encapsulation. Bradford W. Johnson
+<johns393@maroon.tc.umn.edu> originally wrote the ipddp.c driver for IP
+encapsulation in AppleTalk.
diff --git a/Documentation/networking/iphase.txt b/Documentation/networking/iphase.txt
new file mode 100644
index 0000000..55eac4a
--- /dev/null
+++ b/Documentation/networking/iphase.txt
@@ -0,0 +1,158 @@
+
+ READ ME FISRT
+ ATM (i)Chip IA Linux Driver Source
+--------------------------------------------------------------------------------
+ Read This Before You Begin!
+--------------------------------------------------------------------------------
+
+Description
+-----------
+
+This is the README file for the Interphase PCI ATM (i)Chip IA Linux driver
+source release.
+
+The features and limitations of this driver are as follows:
+ - A single VPI (VPI value of 0) is supported.
+ - Supports 4K VCs for the server board (with 512K control memory) and 1K
+ VCs for the client board (with 128K control memory).
+ - UBR, ABR and CBR service categories are supported.
+ - Only AAL5 is supported.
+ - Supports setting of PCR on the VCs.
+ - Multiple adapters in a system are supported.
+ - All variants of Interphase ATM PCI (i)Chip adapter cards are supported,
+ including x575 (OC3, control memory 128K , 512K and packet memory 128K,
+ 512K and 1M), x525 (UTP25) and x531 (DS3 and E3). See
+ http://www.iphase.com/site/iphase-web/?epi_menuItemID=e196f04b4b3b40502f150882e21046a0
+ for details.
+ - Only x86 platforms are supported.
+ - SMP is supported.
+
+
+Before You Start
+----------------
+
+
+Installation
+------------
+
+1. Installing the adapters in the system
+ To install the ATM adapters in the system, follow the steps below.
+ a. Login as root.
+ b. Shut down the system and power off the system.
+ c. Install one or more ATM adapters in the system.
+ d. Connect each adapter to a port on an ATM switch. The green 'Link'
+ LED on the front panel of the adapter will be on if the adapter is
+ connected to the switch properly when the system is powered up.
+ e. Power on and boot the system.
+
+2. [ Removed ]
+
+3. Rebuild kernel with ABR support
+ [ a. and b. removed ]
+ c. Reconfigure the kernel, choose the Interphase ia driver through "make
+ menuconfig" or "make xconfig".
+ d. Rebuild the kernel, loadable modules and the atm tools.
+ e. Install the new built kernel and modules and reboot.
+
+4. Load the adapter hardware driver (ia driver) if it is built as a module
+ a. Login as root.
+ b. Change directory to /lib/modules/<kernel-version>/atm.
+ c. Run "insmod suni.o;insmod iphase.o"
+ The yellow 'status' LED on the front panel of the adapter will blink
+ while the driver is loaded in the system.
+ d. To verify that the 'ia' driver is loaded successfully, run the
+ following command:
+
+ cat /proc/atm/devices
+
+ If the driver is loaded successfully, the output of the command will
+ be similar to the following lines:
+
+ Itf Type ESI/"MAC"addr AAL(TX,err,RX,err,drop) ...
+ 0 ia xxxxxxxxx 0 ( 0 0 0 0 0 ) 5 ( 0 0 0 0 0 )
+
+ You can also check the system log file /var/log/messages for messages
+ related to the ATM driver.
+
+5. Ia Driver Configuration
+
+5.1 Configuration of adapter buffers
+ The (i)Chip boards have 3 different packet RAM size variants: 128K, 512K and
+ 1M. The RAM size decides the number of buffers and buffer size. The default
+ size and number of buffers are set as following:
+
+ Total Rx RAM Tx RAM Rx Buf Tx Buf Rx buf Tx buf
+ RAM size size size size size cnt cnt
+ -------- ------ ------ ------ ------ ------ ------
+ 128K 64K 64K 10K 10K 6 6
+ 512K 256K 256K 10K 10K 25 25
+ 1M 512K 512K 10K 10K 51 51
+
+ These setting should work well in most environments, but can be
+ changed by typing the following command:
+
+ insmod <IA_DIR>/ia.o IA_RX_BUF=<RX_CNT> IA_RX_BUF_SZ=<RX_SIZE> \
+ IA_TX_BUF=<TX_CNT> IA_TX_BUF_SZ=<TX_SIZE>
+ Where:
+ RX_CNT = number of receive buffers in the range (1-128)
+ RX_SIZE = size of receive buffers in the range (48-64K)
+ TX_CNT = number of transmit buffers in the range (1-128)
+ TX_SIZE = size of transmit buffers in the range (48-64K)
+
+ 1. Transmit and receive buffer size must be a multiple of 4.
+ 2. Care should be taken so that the memory required for the
+ transmit and receive buffers is less than or equal to the
+ total adapter packet memory.
+
+5.2 Turn on ia debug trace
+
+ When the ia driver is built with the CONFIG_ATM_IA_DEBUG flag, the driver
+ can provide more debug trace if needed. There is a bit mask variable,
+ IADebugFlag, which controls the output of the traces. You can find the bit
+ map of the IADebugFlag in iphase.h.
+ The debug trace can be turn on through the insmod command line option, for
+ example, "insmod iphase.o IADebugFlag=0xffffffff" can turn on all the debug
+ traces together with loading the driver.
+
+6. Ia Driver Test Using ttcp_atm and PVC
+
+ For the PVC setup, the test machines can either be connected back-to-back or
+ through a switch. If connected through the switch, the switch must be
+ configured for the PVC(s).
+
+ a. For UBR test:
+ At the test machine intended to receive data, type:
+ ttcp_atm -r -a -s 0.100
+ At the other test machine, type:
+ ttcp_atm -t -a -s 0.100 -n 10000
+ Run "ttcp_atm -h" to display more options of the ttcp_atm tool.
+ b. For ABR test:
+ It is the same as the UBR testing, but with an extra command option:
+ -Pabr:max_pcr=<xxx>
+ where:
+ xxx = the maximum peak cell rate, from 170 - 353207.
+ This option must be set on both the machines.
+ c. For CBR test:
+ It is the same as the UBR testing, but with an extra command option:
+ -Pcbr:max_pcr=<xxx>
+ where:
+ xxx = the maximum peak cell rate, from 170 - 353207.
+ This option may only be set on the transmit machine.
+
+
+OUTSTANDING ISSUES
+------------------
+
+
+
+Contact Information
+-------------------
+
+ Customer Support:
+ United States: Telephone: (214) 654-5555
+ Fax: (214) 654-5500
+ E-Mail: intouch@iphase.com
+ Europe: Telephone: 33 (0)1 41 15 44 00
+ Fax: 33 (0)1 41 15 12 13
+ World Wide Web: http://www.iphase.com
+ Anonymous FTP: ftp.iphase.com
diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt
new file mode 100644
index 0000000..4ccdbca
--- /dev/null
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -0,0 +1,143 @@
+/proc/sys/net/ipv4/vs/* Variables:
+
+am_droprate - INTEGER
+ default 10
+
+ It sets the always mode drop rate, which is used in the mode 3
+ of the drop_rate defense.
+
+amemthresh - INTEGER
+ default 1024
+
+ It sets the available memory threshold (in pages), which is
+ used in the automatic modes of defense. When there is no
+ enough available memory, the respective strategy will be
+ enabled and the variable is automatically set to 2, otherwise
+ the strategy is disabled and the variable is set to 1.
+
+cache_bypass - BOOLEAN
+ 0 - disabled (default)
+ not 0 - enabled
+
+ If it is enabled, forward packets to the original destination
+ directly when no cache server is available and destination
+ address is not local (iph->daddr is RTN_UNICAST). It is mostly
+ used in transparent web cache cluster.
+
+debug_level - INTEGER
+ 0 - transmission error messages (default)
+ 1 - non-fatal error messages
+ 2 - configuration
+ 3 - destination trash
+ 4 - drop entry
+ 5 - service lookup
+ 6 - scheduling
+ 7 - connection new/expire, lookup and synchronization
+ 8 - state transition
+ 9 - binding destination, template checks and applications
+ 10 - IPVS packet transmission
+ 11 - IPVS packet handling (ip_vs_in/ip_vs_out)
+ 12 or more - packet traversal
+
+ Only available when IPVS is compiled with the CONFIG_IPVS_DEBUG
+
+ Higher debugging levels include the messages for lower debugging
+ levels, so setting debug level 2, includes level 0, 1 and 2
+ messages. Thus, logging becomes more and more verbose the higher
+ the level.
+
+drop_entry - INTEGER
+ 0 - disabled (default)
+
+ The drop_entry defense is to randomly drop entries in the
+ connection hash table, just in order to collect back some
+ memory for new connections. In the current code, the
+ drop_entry procedure can be activated every second, then it
+ randomly scans 1/32 of the whole and drops entries that are in
+ the SYN-RECV/SYNACK state, which should be effective against
+ syn-flooding attack.
+
+ The valid values of drop_entry are from 0 to 3, where 0 means
+ that this strategy is always disabled, 1 and 2 mean automatic
+ modes (when there is no enough available memory, the strategy
+ is enabled and the variable is automatically set to 2,
+ otherwise the strategy is disabled and the variable is set to
+ 1), and 3 means that that the strategy is always enabled.
+
+drop_packet - INTEGER
+ 0 - disabled (default)
+
+ The drop_packet defense is designed to drop 1/rate packets
+ before forwarding them to real servers. If the rate is 1, then
+ drop all the incoming packets.
+
+ The value definition is the same as that of the drop_entry. In
+ the automatic mode, the rate is determined by the follow
+ formula: rate = amemthresh / (amemthresh - available_memory)
+ when available memory is less than the available memory
+ threshold. When the mode 3 is set, the always mode drop rate
+ is controlled by the /proc/sys/net/ipv4/vs/am_droprate.
+
+expire_nodest_conn - BOOLEAN
+ 0 - disabled (default)
+ not 0 - enabled
+
+ The default value is 0, the load balancer will silently drop
+ packets when its destination server is not available. It may
+ be useful, when user-space monitoring program deletes the
+ destination server (because of server overload or wrong
+ detection) and add back the server later, and the connections
+ to the server can continue.
+
+ If this feature is enabled, the load balancer will expire the
+ connection immediately when a packet arrives and its
+ destination server is not available, then the client program
+ will be notified that the connection is closed. This is
+ equivalent to the feature some people requires to flush
+ connections when its destination is not available.
+
+expire_quiescent_template - BOOLEAN
+ 0 - disabled (default)
+ not 0 - enabled
+
+ When set to a non-zero value, the load balancer will expire
+ persistent templates when the destination server is quiescent.
+ This may be useful, when a user makes a destination server
+ quiescent by setting its weight to 0 and it is desired that
+ subsequent otherwise persistent connections are sent to a
+ different destination server. By default new persistent
+ connections are allowed to quiescent destination servers.
+
+ If this feature is enabled, the load balancer will expire the
+ persistence template if it is to be used to schedule a new
+ connection and the destination server is quiescent.
+
+nat_icmp_send - BOOLEAN
+ 0 - disabled (default)
+ not 0 - enabled
+
+ It controls sending icmp error messages (ICMP_DEST_UNREACH)
+ for VS/NAT when the load balancer receives packets from real
+ servers but the connection entries don't exist.
+
+secure_tcp - INTEGER
+ 0 - disabled (default)
+
+ The secure_tcp defense is to use a more complicated state
+ transition table and some possible short timeouts of each
+ state. In the VS/NAT, it delays the entering the ESTABLISHED
+ until the real server starts to send data and ACK packet
+ (after 3-way handshake).
+
+ The value definition is the same as that of drop_entry or
+ drop_packet.
+
+sync_threshold - INTEGER
+ default 3
+
+ It sets synchronization threshold, which is the minimum number
+ of incoming packets that a connection needs to receive before
+ the connection will be synchronized. A connection will be
+ synchronized, every time the number of its incoming packets
+ modulus 50 equals the threshold. The range of the threshold is
+ from 0 to 49.
diff --git a/Documentation/networking/irda.txt b/Documentation/networking/irda.txt
new file mode 100644
index 0000000..bff26c1
--- /dev/null
+++ b/Documentation/networking/irda.txt
@@ -0,0 +1,10 @@
+To use the IrDA protocols within Linux you will need to get a suitable copy
+of the IrDA Utilities. More detailed information about these and associated
+programs can be found on http://irda.sourceforge.net/
+
+For more information about how to use the IrDA protocol stack, see the
+Linux Infrared HOWTO by Werner Heuser <wehe@tuxmobil.org>:
+<http://www.tuxmobil.org/Infrared-HOWTO/Infrared-HOWTO.html>
+
+There is an active mailing list for discussing Linux-IrDA matters called
+ irda-users@lists.sourceforge.net
diff --git a/Documentation/networking/ixgb.txt b/Documentation/networking/ixgb.txt
new file mode 100644
index 0000000..a0d0ffb
--- /dev/null
+++ b/Documentation/networking/ixgb.txt
@@ -0,0 +1,433 @@
+Linux Base Driver for 10 Gigabit Intel(R) Network Connection
+=============================================================
+
+October 9, 2007
+
+
+Contents
+========
+
+- In This Release
+- Identifying Your Adapter
+- Building and Installation
+- Command Line Parameters
+- Improving Performance
+- Additional Configurations
+- Known Issues/Troubleshooting
+- Support
+
+
+
+In This Release
+===============
+
+This file describes the ixgb Linux Base Driver for the 10 Gigabit Intel(R)
+Network Connection. This driver includes support for Itanium(R)2-based
+systems.
+
+For questions related to hardware requirements, refer to the documentation
+supplied with your 10 Gigabit adapter. All hardware requirements listed apply
+to use with Linux.
+
+The following features are available in this kernel:
+ - Native VLANs
+ - Channel Bonding (teaming)
+ - SNMP
+
+Channel Bonding documentation can be found in the Linux kernel source:
+/Documentation/networking/bonding.txt
+
+The driver information previously displayed in the /proc filesystem is not
+supported in this release. Alternatively, you can use ethtool (version 1.6
+or later), lspci, and ifconfig to obtain the same information.
+
+Instructions on updating ethtool can be found in the section "Additional
+Configurations" later in this document.
+
+
+Identifying Your Adapter
+========================
+
+The following Intel network adapters are compatible with the drivers in this
+release:
+
+Controller Adapter Name Physical Layer
+---------- ------------ --------------
+82597EX Intel(R) PRO/10GbE LR/SR/CX4 10G Base-LR (1310 nm optical fiber)
+ Server Adapters 10G Base-SR (850 nm optical fiber)
+ 10G Base-CX4(twin-axial copper cabling)
+
+For more information on how to identify your adapter, go to the Adapter &
+Driver ID Guide at:
+
+ http://support.intel.com/support/network/sb/CS-012904.htm
+
+
+Building and Installation
+=========================
+
+select m for "Intel(R) PRO/10GbE support" located at:
+ Location:
+ -> Device Drivers
+ -> Network device support (NETDEVICES [=y])
+ -> Ethernet (10000 Mbit) (NETDEV_10000 [=y])
+1. make modules && make modules_install
+
+2. Load the module:
+
+    modprobe ixgb <parameter>=<value>
+
+ The insmod command can be used if the full
+ path to the driver module is specified. For example:
+
+ insmod /lib/modules/<KERNEL VERSION>/kernel/drivers/net/ixgb/ixgb.ko
+
+ With 2.6 based kernels also make sure that older ixgb drivers are
+ removed from the kernel, before loading the new module:
+
+ rmmod ixgb; modprobe ixgb
+
+3. Assign an IP address to the interface by entering the following, where
+ x is the interface number:
+
+ ifconfig ethx <IP_address>
+
+4. Verify that the interface works. Enter the following, where <IP_address>
+ is the IP address for another machine on the same subnet as the interface
+ that is being tested:
+
+ ping <IP_address>
+
+
+Command Line Parameters
+=======================
+
+If the driver is built as a module, the following optional parameters are
+used by entering them on the command line with the modprobe command using
+this syntax:
+
+ modprobe ixgb [<option>=<VAL1>,<VAL2>,...]
+
+For example, with two 10GbE PCI adapters, entering:
+
+ modprobe ixgb TxDescriptors=80,128
+
+loads the ixgb driver with 80 TX resources for the first adapter and 128 TX
+resources for the second adapter.
+
+The default value for each parameter is generally the recommended setting,
+unless otherwise noted.
+
+FlowControl
+Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx)
+Default: Read from the EEPROM
+ If EEPROM is not detected, default is 1
+ This parameter controls the automatic generation(Tx) and response(Rx) to
+ Ethernet PAUSE frames. There are hardware bugs associated with enabling
+ Tx flow control so beware.
+
+RxDescriptors
+Valid Range: 64-512
+Default Value: 512
+ This value is the number of receive descriptors allocated by the driver.
+ Increasing this value allows the driver to buffer more incoming packets.
+ Each descriptor is 16 bytes. A receive buffer is also allocated for
+ each descriptor and can be either 2048, 4056, 8192, or 16384 bytes,
+ depending on the MTU setting. When the MTU size is 1500 or less, the
+ receive buffer size is 2048 bytes. When the MTU is greater than 1500 the
+ receive buffer size will be either 4056, 8192, or 16384 bytes. The
+ maximum MTU size is 16114.
+
+RxIntDelay
+Valid Range: 0-65535 (0=off)
+Default Value: 72
+ This value delays the generation of receive interrupts in units of
+ 0.8192 microseconds. Receive interrupt reduction can improve CPU
+ efficiency if properly tuned for specific network traffic. Increasing
+ this value adds extra latency to frame reception and can end up
+ decreasing the throughput of TCP traffic. If the system is reporting
+ dropped receives, this value may be set too high, causing the driver to
+ run out of available receive descriptors.
+
+TxDescriptors
+Valid Range: 64-4096
+Default Value: 256
+ This value is the number of transmit descriptors allocated by the driver.
+ Increasing this value allows the driver to queue more transmits. Each
+ descriptor is 16 bytes.
+
+XsumRX
+Valid Range: 0-1
+Default Value: 1
+ A value of '1' indicates that the driver should enable IP checksum
+ offload for received packets (both UDP and TCP) to the adapter hardware.
+
+
+Improving Performance
+=====================
+
+With the 10 Gigabit server adapters, the default Linux configuration will
+very likely limit the total available throughput artificially. There is a set
+of configuration changes that, when applied together, will increase the ability
+of Linux to transmit and receive data. The following enhancements were
+originally acquired from settings published at http://www.spec.org/web99/ for
+various submitted results using Linux.
+
+NOTE: These changes are only suggestions, and serve as a starting point for
+ tuning your network performance.
+
+The changes are made in three major ways, listed in order of greatest effect:
+- Use ifconfig to modify the mtu (maximum transmission unit) and the txqueuelen
+ parameter.
+- Use sysctl to modify /proc parameters (essentially kernel tuning)
+- Use setpci to modify the MMRBC field in PCI-X configuration space to increase
+ transmit burst lengths on the bus.
+
+NOTE: setpci modifies the adapter's configuration registers to allow it to read
+up to 4k bytes at a time (for transmits). However, for some systems the
+behavior after modifying this register may be undefined (possibly errors of
+some kind). A power-cycle, hard reset or explicitly setting the e6 register
+back to 22 (setpci -d 8086:1a48 e6.b=22) may be required to get back to a
+stable configuration.
+
+- COPY these lines and paste them into ixgb_perf.sh:
+#!/bin/bash
+echo "configuring network performance , edit this file to change the interface
+or device ID of 10GbE card"
+# set mmrbc to 4k reads, modify only Intel 10GbE device IDs
+# replace 1a48 with appropriate 10GbE device's ID installed on the system,
+# if needed.
+setpci -d 8086:1a48 e6.b=2e
+# set the MTU (max transmission unit) - it requires your switch and clients
+# to change as well.
+# set the txqueuelen
+# your ixgb adapter should be loaded as eth1 for this to work, change if needed
+ifconfig eth1 mtu 9000 txqueuelen 1000 up
+# call the sysctl utility to modify /proc/sys entries
+sysctl -p ./sysctl_ixgb.conf
+- END ixgb_perf.sh
+
+- COPY these lines and paste them into sysctl_ixgb.conf:
+# some of the defaults may be different for your kernel
+# call this file with sysctl -p <this file>
+# these are just suggested values that worked well to increase throughput in
+# several network benchmark tests, your mileage may vary
+
+### IPV4 specific settings
+# turn TCP timestamp support off, default 1, reduces CPU use
+net.ipv4.tcp_timestamps = 0
+# turn SACK support off, default on
+# on systems with a VERY fast bus -> memory interface this is the big gainer
+net.ipv4.tcp_sack = 0
+# set min/default/max TCP read buffer, default 4096 87380 174760
+net.ipv4.tcp_rmem = 10000000 10000000 10000000
+# set min/pressure/max TCP write buffer, default 4096 16384 131072
+net.ipv4.tcp_wmem = 10000000 10000000 10000000
+# set min/pressure/max TCP buffer space, default 31744 32256 32768
+net.ipv4.tcp_mem = 10000000 10000000 10000000
+
+### CORE settings (mostly for socket and UDP effect)
+# set maximum receive socket buffer size, default 131071
+net.core.rmem_max = 524287
+# set maximum send socket buffer size, default 131071
+net.core.wmem_max = 524287
+# set default receive socket buffer size, default 65535
+net.core.rmem_default = 524287
+# set default send socket buffer size, default 65535
+net.core.wmem_default = 524287
+# set maximum amount of option memory buffers, default 10240
+net.core.optmem_max = 524287
+# set number of unprocessed input packets before kernel starts dropping them; default 300
+net.core.netdev_max_backlog = 300000
+- END sysctl_ixgb.conf
+
+Edit the ixgb_perf.sh script if necessary to change eth1 to whatever interface
+your ixgb driver is using and/or replace '1a48' with appropriate 10GbE device's
+ID installed on the system.
+
+NOTE: Unless these scripts are added to the boot process, these changes will
+ only last only until the next system reboot.
+
+
+Resolving Slow UDP Traffic
+--------------------------
+If your server does not seem to be able to receive UDP traffic as fast as it
+can receive TCP traffic, it could be because Linux, by default, does not set
+the network stack buffers as large as they need to be to support high UDP
+transfer rates. One way to alleviate this problem is to allow more memory to
+be used by the IP stack to store incoming data.
+
+For instance, use the commands:
+ sysctl -w net.core.rmem_max=262143
+and
+ sysctl -w net.core.rmem_default=262143
+to increase the read buffer memory max and default to 262143 (256k - 1) from
+defaults of max=131071 (128k - 1) and default=65535 (64k - 1). These variables
+will increase the amount of memory used by the network stack for receives, and
+can be increased significantly more if necessary for your application.
+
+
+Additional Configurations
+=========================
+
+ Configuring the Driver on Different Distributions
+ -------------------------------------------------
+ Configuring a network driver to load properly when the system is started is
+ distribution dependent. Typically, the configuration process involves adding
+ an alias line to /etc/modprobe.conf as well as editing other system startup
+ scripts and/or configuration files. Many popular Linux distributions ship
+ with tools to make these changes for you. To learn the proper way to
+ configure a network device for your system, refer to your distribution
+ documentation. If during this process you are asked for the driver or module
+ name, the name for the Linux Base Driver for the Intel 10GbE Family of
+ Adapters is ixgb.
+
+ Viewing Link Messages
+ ---------------------
+ Link messages will not be displayed to the console if the distribution is
+ restricting system messages. In order to see network driver link messages on
+ your console, set dmesg to eight by entering the following:
+
+ dmesg -n 8
+
+ NOTE: This setting is not saved across reboots.
+
+
+ Jumbo Frames
+ ------------
+ The driver supports Jumbo Frames for all adapters. Jumbo Frames support is
+ enabled by changing the MTU to a value larger than the default of 1500.
+ The maximum value for the MTU is 16114. Use the ifconfig command to
+ increase the MTU size. For example:
+
+ ifconfig ethx mtu 9000 up
+
+ The maximum MTU setting for Jumbo Frames is 16114. This value coincides
+ with the maximum Jumbo Frames size of 16128.
+
+
+ Ethtool
+ -------
+ The driver utilizes the ethtool interface for driver configuration and
+ diagnostics, as well as displaying statistical information. Ethtool
+ version 1.6 or later is required for this functionality.
+
+ The latest release of ethtool can be found from
+ http://sourceforge.net/projects/gkernel
+
+ NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support
+ for a more complete ethtool feature set can be enabled by upgrading
+ to the latest version.
+
+
+ NAPI
+ ----
+
+ NAPI (Rx polling mode) is supported in the ixgb driver. NAPI is enabled
+ or disabled based on the configuration of the kernel. see CONFIG_IXGB_NAPI
+
+ See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
+
+
+Known Issues/Troubleshooting
+============================
+
+ NOTE: After installing the driver, if your Intel Network Connection is not
+ working, verify in the "In This Release" section of the readme that you have
+ installed the correct driver.
+
+ Intel(R) PRO/10GbE CX4 Server Adapter Cable Interoperability Issue with
+ Fujitsu XENPAK Module in SmartBits Chassis
+ ---------------------------------------------------------------------
+ Excessive CRC errors may be observed if the Intel(R) PRO/10GbE CX4
+ Server adapter is connected to a Fujitsu XENPAK CX4 module in a SmartBits
+ chassis using 15 m/24AWG cable assemblies manufactured by Fujitsu or Leoni.
+ The CRC errors may be received either by the Intel(R) PRO/10GbE CX4
+ Server adapter or the SmartBits. If this situation occurs using a different
+ cable assembly may resolve the issue.
+
+ CX4 Server Adapter Cable Interoperability Issues with HP Procurve 3400cl
+ Switch Port
+ ------------------------------------------------------------------------
+ Excessive CRC errors may be observed if the Intel(R) PRO/10GbE CX4 Server
+ adapter is connected to an HP Procurve 3400cl switch port using short cables
+ (1 m or shorter). If this situation occurs, using a longer cable may resolve
+ the issue.
+
+ Excessive CRC errors may be observed using Fujitsu 24AWG cable assemblies that
+ Are 10 m or longer or where using a Leoni 15 m/24AWG cable assembly. The CRC
+ errors may be received either by the CX4 Server adapter or at the switch. If
+ this situation occurs, using a different cable assembly may resolve the issue.
+
+
+ Jumbo Frames System Requirement
+ -------------------------------
+ Memory allocation failures have been observed on Linux systems with 64 MB
+ of RAM or less that are running Jumbo Frames. If you are using Jumbo
+ Frames, your system may require more than the advertised minimum
+ requirement of 64 MB of system memory.
+
+
+ Performance Degradation with Jumbo Frames
+ -----------------------------------------
+ Degradation in throughput performance may be observed in some Jumbo frames
+ environments. If this is observed, increasing the application's socket buffer
+ size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values may help.
+ See the specific application manual and /usr/src/linux*/Documentation/
+ networking/ip-sysctl.txt for more details.
+
+
+ Allocating Rx Buffers when Using Jumbo Frames
+ ---------------------------------------------
+ Allocating Rx buffers when using Jumbo Frames on 2.6.x kernels may fail if
+ the available memory is heavily fragmented. This issue may be seen with PCI-X
+ adapters or with packet split disabled. This can be reduced or eliminated
+ by changing the amount of available memory for receive buffer allocation, by
+ increasing /proc/sys/vm/min_free_kbytes.
+
+
+ Multiple Interfaces on Same Ethernet Broadcast Network
+ ------------------------------------------------------
+ Due to the default ARP behavior on Linux, it is not possible to have
+ one system on two IP networks in the same Ethernet broadcast domain
+ (non-partitioned switch) behave as expected. All Ethernet interfaces
+ will respond to IP traffic for any IP address assigned to the system.
+ This results in unbalanced receive traffic.
+
+ If you have multiple interfaces in a server, do either of the following:
+
+ - Turn on ARP filtering by entering:
+ echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
+
+ - Install the interfaces in separate broadcast domains - either in
+ different switches or in a switch partitioned to VLANs.
+
+
+ UDP Stress Test Dropped Packet Issue
+ --------------------------------------
+ Under small packets UDP stress test with 10GbE driver, the Linux system
+ may drop UDP packets due to the fullness of socket buffers. You may want
+ to change the driver's Flow Control variables to the minimum value for
+ controlling packet reception.
+
+
+ Tx Hangs Possible Under Stress
+ ------------------------------
+ Under stress conditions, if TX hangs occur, turning off TSO
+ "ethtool -K eth0 tso off" may resolve the problem.
+
+
+Support
+=======
+
+For general information, go to the Intel support website at:
+
+ http://support.intel.com
+
+or the Intel Wired Networking project hosted by Sourceforge at:
+
+ http://sourceforge.net/projects/e1000
+
+If an issue is identified with the released source code on the supported
+kernel with a supported adapter, email the specific information related
+to the issue to e1000-devel@lists.sf.net
diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt
new file mode 100644
index 0000000..2451f55
--- /dev/null
+++ b/Documentation/networking/l2tp.txt
@@ -0,0 +1,169 @@
+This brief document describes how to use the kernel's PPPoL2TP driver
+to provide L2TP functionality. L2TP is a protocol that tunnels one or
+more PPP sessions over a UDP tunnel. It is commonly used for VPNs
+(L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP
+network infrastructure.
+
+Design
+======
+
+The PPPoL2TP driver, drivers/net/pppol2tp.c, provides a mechanism by
+which PPP frames carried through an L2TP session are passed through
+the kernel's PPP subsystem. The standard PPP daemon, pppd, handles all
+PPP interaction with the peer. PPP network interfaces are created for
+each local PPP endpoint.
+
+The L2TP protocol http://www.faqs.org/rfcs/rfc2661.html defines L2TP
+control and data frames. L2TP control frames carry messages between
+L2TP clients/servers and are used to setup / teardown tunnels and
+sessions. An L2TP client or server is implemented in userspace and
+will use a regular UDP socket per tunnel. L2TP data frames carry PPP
+frames, which may be PPP control or PPP data. The kernel's PPP
+subsystem arranges for PPP control frames to be delivered to pppd,
+while data frames are forwarded as usual.
+
+Each tunnel and session within a tunnel is assigned a unique tunnel_id
+and session_id. These ids are carried in the L2TP header of every
+control and data packet. The pppol2tp driver uses them to lookup
+internal tunnel and/or session contexts. Zero tunnel / session ids are
+treated specially - zero ids are never assigned to tunnels or sessions
+in the network. In the driver, the tunnel context keeps a pointer to
+the tunnel UDP socket. The session context keeps a pointer to the
+PPPoL2TP socket, as well as other data that lets the driver interface
+to the kernel PPP subsystem.
+
+Note that the pppol2tp kernel driver handles only L2TP data frames;
+L2TP control frames are simply passed up to userspace in the UDP
+tunnel socket. The kernel handles all datapath aspects of the
+protocol, including data packet resequencing (if enabled).
+
+There are a number of requirements on the userspace L2TP daemon in
+order to use the pppol2tp driver.
+
+1. Use a UDP socket per tunnel.
+
+2. Create a single PPPoL2TP socket per tunnel bound to a special null
+ session id. This is used only for communicating with the driver but
+ must remain open while the tunnel is active. Opening this tunnel
+ management socket causes the driver to mark the tunnel socket as an
+ L2TP UDP encapsulation socket and flags it for use by the
+ referenced tunnel id. This hooks up the UDP receive path via
+ udp_encap_rcv() in net/ipv4/udp.c. PPP data frames are never passed
+ in this special PPPoX socket.
+
+3. Create a PPPoL2TP socket per L2TP session. This is typically done
+ by starting pppd with the pppol2tp plugin and appropriate
+ arguments. A PPPoL2TP tunnel management socket (Step 2) must be
+ created before the first PPPoL2TP session socket is created.
+
+When creating PPPoL2TP sockets, the application provides information
+to the driver about the socket in a socket connect() call. Source and
+destination tunnel and session ids are provided, as well as the file
+descriptor of a UDP socket. See struct pppol2tp_addr in
+include/linux/if_ppp.h. Note that zero tunnel / session ids are
+treated specially. When creating the per-tunnel PPPoL2TP management
+socket in Step 2 above, zero source and destination session ids are
+specified, which tells the driver to prepare the supplied UDP file
+descriptor for use as an L2TP tunnel socket.
+
+Userspace may control behavior of the tunnel or session using
+setsockopt and ioctl on the PPPoX socket. The following socket
+options are supported:-
+
+DEBUG - bitmask of debug message categories. See below.
+SENDSEQ - 0 => don't send packets with sequence numbers
+ 1 => send packets with sequence numbers
+RECVSEQ - 0 => receive packet sequence numbers are optional
+ 1 => drop receive packets without sequence numbers
+LNSMODE - 0 => act as LAC.
+ 1 => act as LNS.
+REORDERTO - reorder timeout (in millisecs). If 0, don't try to reorder.
+
+Only the DEBUG option is supported by the special tunnel management
+PPPoX socket.
+
+In addition to the standard PPP ioctls, a PPPIOCGL2TPSTATS is provided
+to retrieve tunnel and session statistics from the kernel using the
+PPPoX socket of the appropriate tunnel or session.
+
+Debugging
+=========
+
+The driver supports a flexible debug scheme where kernel trace
+messages may be optionally enabled per tunnel and per session. Care is
+needed when debugging a live system since the messages are not
+rate-limited and a busy system could be swamped. Userspace uses
+setsockopt on the PPPoX socket to set a debug mask.
+
+The following debug mask bits are available:
+
+PPPOL2TP_MSG_DEBUG verbose debug (if compiled in)
+PPPOL2TP_MSG_CONTROL userspace - kernel interface
+PPPOL2TP_MSG_SEQ sequence numbers handling
+PPPOL2TP_MSG_DATA data packets
+
+Sample Userspace Code
+=====================
+
+1. Create tunnel management PPPoX socket
+
+ kernel_fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP);
+ if (kernel_fd >= 0) {
+ struct sockaddr_pppol2tp sax;
+ struct sockaddr_in const *peer_addr;
+
+ peer_addr = l2tp_tunnel_get_peer_addr(tunnel);
+ memset(&sax, 0, sizeof(sax));
+ sax.sa_family = AF_PPPOX;
+ sax.sa_protocol = PX_PROTO_OL2TP;
+ sax.pppol2tp.fd = udp_fd; /* fd of tunnel UDP socket */
+ sax.pppol2tp.addr.sin_addr.s_addr = peer_addr->sin_addr.s_addr;
+ sax.pppol2tp.addr.sin_port = peer_addr->sin_port;
+ sax.pppol2tp.addr.sin_family = AF_INET;
+ sax.pppol2tp.s_tunnel = tunnel_id;
+ sax.pppol2tp.s_session = 0; /* special case: mgmt socket */
+ sax.pppol2tp.d_tunnel = 0;
+ sax.pppol2tp.d_session = 0; /* special case: mgmt socket */
+
+ if(connect(kernel_fd, (struct sockaddr *)&sax, sizeof(sax) ) < 0 ) {
+ perror("connect failed");
+ result = -errno;
+ goto err;
+ }
+ }
+
+2. Create session PPPoX data socket
+
+ struct sockaddr_pppol2tp sax;
+ int fd;
+
+ /* Note, the target socket must be bound already, else it will not be ready */
+ sax.sa_family = AF_PPPOX;
+ sax.sa_protocol = PX_PROTO_OL2TP;
+ sax.pppol2tp.fd = tunnel_fd;
+ sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr;
+ sax.pppol2tp.addr.sin_port = addr->sin_port;
+ sax.pppol2tp.addr.sin_family = AF_INET;
+ sax.pppol2tp.s_tunnel = tunnel_id;
+ sax.pppol2tp.s_session = session_id;
+ sax.pppol2tp.d_tunnel = peer_tunnel_id;
+ sax.pppol2tp.d_session = peer_session_id;
+
+ /* session_fd is the fd of the session's PPPoL2TP socket.
+ * tunnel_fd is the fd of the tunnel UDP socket.
+ */
+ fd = connect(session_fd, (struct sockaddr *)&sax, sizeof(sax));
+ if (fd < 0 ) {
+ return -errno;
+ }
+ return 0;
+
+Miscellanous
+============
+
+The PPPoL2TP driver was developed as part of the OpenL2TP project by
+Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server,
+designed from the ground up to have the L2TP datapath in the
+kernel. The project also implemented the pppol2tp plugin for pppd
+which allows pppd to use the kernel driver. Details can be found at
+http://openl2tp.sourceforge.net.
diff --git a/Documentation/networking/lapb-module.txt b/Documentation/networking/lapb-module.txt
new file mode 100644
index 0000000..d4fc8f2
--- /dev/null
+++ b/Documentation/networking/lapb-module.txt
@@ -0,0 +1,263 @@
+ The Linux LAPB Module Interface 1.3
+
+ Jonathan Naylor 29.12.96
+
+Changed (Henner Eisen, 2000-10-29): int return value for data_indication()
+
+The LAPB module will be a separately compiled module for use by any parts of
+the Linux operating system that require a LAPB service. This document
+defines the interfaces to, and the services provided by this module. The
+term module in this context does not imply that the LAPB module is a
+separately loadable module, although it may be. The term module is used in
+its more standard meaning.
+
+The interface to the LAPB module consists of functions to the module,
+callbacks from the module to indicate important state changes, and
+structures for getting and setting information about the module.
+
+Structures
+----------
+
+Probably the most important structure is the skbuff structure for holding
+received and transmitted data, however it is beyond the scope of this
+document.
+
+The two LAPB specific structures are the LAPB initialisation structure and
+the LAPB parameter structure. These will be defined in a standard header
+file, <linux/lapb.h>. The header file <net/lapb.h> is internal to the LAPB
+module and is not for use.
+
+LAPB Initialisation Structure
+-----------------------------
+
+This structure is used only once, in the call to lapb_register (see below).
+It contains information about the device driver that requires the services
+of the LAPB module.
+
+struct lapb_register_struct {
+ void (*connect_confirmation)(int token, int reason);
+ void (*connect_indication)(int token, int reason);
+ void (*disconnect_confirmation)(int token, int reason);
+ void (*disconnect_indication)(int token, int reason);
+ int (*data_indication)(int token, struct sk_buff *skb);
+ void (*data_transmit)(int token, struct sk_buff *skb);
+};
+
+Each member of this structure corresponds to a function in the device driver
+that is called when a particular event in the LAPB module occurs. These will
+be described in detail below. If a callback is not required (!!) then a NULL
+may be substituted.
+
+
+LAPB Parameter Structure
+------------------------
+
+This structure is used with the lapb_getparms and lapb_setparms functions
+(see below). They are used to allow the device driver to get and set the
+operational parameters of the LAPB implementation for a given connection.
+
+struct lapb_parms_struct {
+ unsigned int t1;
+ unsigned int t1timer;
+ unsigned int t2;
+ unsigned int t2timer;
+ unsigned int n2;
+ unsigned int n2count;
+ unsigned int window;
+ unsigned int state;
+ unsigned int mode;
+};
+
+T1 and T2 are protocol timing parameters and are given in units of 100ms. N2
+is the maximum number of tries on the link before it is declared a failure.
+The window size is the maximum number of outstanding data packets allowed to
+be unacknowledged by the remote end, the value of the window is between 1
+and 7 for a standard LAPB link, and between 1 and 127 for an extended LAPB
+link.
+
+The mode variable is a bit field used for setting (at present) three values.
+The bit fields have the following meanings:
+
+Bit Meaning
+0 LAPB operation (0=LAPB_STANDARD 1=LAPB_EXTENDED).
+1 [SM]LP operation (0=LAPB_SLP 1=LAPB=MLP).
+2 DTE/DCE operation (0=LAPB_DTE 1=LAPB_DCE)
+3-31 Reserved, must be 0.
+
+Extended LAPB operation indicates the use of extended sequence numbers and
+consequently larger window sizes, the default is standard LAPB operation.
+MLP operation is the same as SLP operation except that the addresses used by
+LAPB are different to indicate the mode of operation, the default is Single
+Link Procedure. The difference between DCE and DTE operation is (i) the
+addresses used for commands and responses, and (ii) when the DCE is not
+connected, it sends DM without polls set, every T1. The upper case constant
+names will be defined in the public LAPB header file.
+
+
+Functions
+---------
+
+The LAPB module provides a number of function entry points.
+
+
+int lapb_register(void *token, struct lapb_register_struct);
+
+This must be called before the LAPB module may be used. If the call is
+successful then LAPB_OK is returned. The token must be a unique identifier
+generated by the device driver to allow for the unique identification of the
+instance of the LAPB link. It is returned by the LAPB module in all of the
+callbacks, and is used by the device driver in all calls to the LAPB module.
+For multiple LAPB links in a single device driver, multiple calls to
+lapb_register must be made. The format of the lapb_register_struct is given
+above. The return values are:
+
+LAPB_OK LAPB registered successfully.
+LAPB_BADTOKEN Token is already registered.
+LAPB_NOMEM Out of memory
+
+
+int lapb_unregister(void *token);
+
+This releases all the resources associated with a LAPB link. Any current
+LAPB link will be abandoned without further messages being passed. After
+this call, the value of token is no longer valid for any calls to the LAPB
+function. The valid return values are:
+
+LAPB_OK LAPB unregistered successfully.
+LAPB_BADTOKEN Invalid/unknown LAPB token.
+
+
+int lapb_getparms(void *token, struct lapb_parms_struct *parms);
+
+This allows the device driver to get the values of the current LAPB
+variables, the lapb_parms_struct is described above. The valid return values
+are:
+
+LAPB_OK LAPB getparms was successful.
+LAPB_BADTOKEN Invalid/unknown LAPB token.
+
+
+int lapb_setparms(void *token, struct lapb_parms_struct *parms);
+
+This allows the device driver to set the values of the current LAPB
+variables, the lapb_parms_struct is described above. The values of t1timer,
+t2timer and n2count are ignored, likewise changing the mode bits when
+connected will be ignored. An error implies that none of the values have
+been changed. The valid return values are:
+
+LAPB_OK LAPB getparms was successful.
+LAPB_BADTOKEN Invalid/unknown LAPB token.
+LAPB_INVALUE One of the values was out of its allowable range.
+
+
+int lapb_connect_request(void *token);
+
+Initiate a connect using the current parameter settings. The valid return
+values are:
+
+LAPB_OK LAPB is starting to connect.
+LAPB_BADTOKEN Invalid/unknown LAPB token.
+LAPB_CONNECTED LAPB module is already connected.
+
+
+int lapb_disconnect_request(void *token);
+
+Initiate a disconnect. The valid return values are:
+
+LAPB_OK LAPB is starting to disconnect.
+LAPB_BADTOKEN Invalid/unknown LAPB token.
+LAPB_NOTCONNECTED LAPB module is not connected.
+
+
+int lapb_data_request(void *token, struct sk_buff *skb);
+
+Queue data with the LAPB module for transmitting over the link. If the call
+is successful then the skbuff is owned by the LAPB module and may not be
+used by the device driver again. The valid return values are:
+
+LAPB_OK LAPB has accepted the data.
+LAPB_BADTOKEN Invalid/unknown LAPB token.
+LAPB_NOTCONNECTED LAPB module is not connected.
+
+
+int lapb_data_received(void *token, struct sk_buff *skb);
+
+Queue data with the LAPB module which has been received from the device. It
+is expected that the data passed to the LAPB module has skb->data pointing
+to the beginning of the LAPB data. If the call is successful then the skbuff
+is owned by the LAPB module and may not be used by the device driver again.
+The valid return values are:
+
+LAPB_OK LAPB has accepted the data.
+LAPB_BADTOKEN Invalid/unknown LAPB token.
+
+
+Callbacks
+---------
+
+These callbacks are functions provided by the device driver for the LAPB
+module to call when an event occurs. They are registered with the LAPB
+module with lapb_register (see above) in the structure lapb_register_struct
+(see above).
+
+
+void (*connect_confirmation)(void *token, int reason);
+
+This is called by the LAPB module when a connection is established after
+being requested by a call to lapb_connect_request (see above). The reason is
+always LAPB_OK.
+
+
+void (*connect_indication)(void *token, int reason);
+
+This is called by the LAPB module when the link is established by the remote
+system. The value of reason is always LAPB_OK.
+
+
+void (*disconnect_confirmation)(void *token, int reason);
+
+This is called by the LAPB module when an event occurs after the device
+driver has called lapb_disconnect_request (see above). The reason indicates
+what has happened. In all cases the LAPB link can be regarded as being
+terminated. The values for reason are:
+
+LAPB_OK The LAPB link was terminated normally.
+LAPB_NOTCONNECTED The remote system was not connected.
+LAPB_TIMEDOUT No response was received in N2 tries from the remote
+ system.
+
+
+void (*disconnect_indication)(void *token, int reason);
+
+This is called by the LAPB module when the link is terminated by the remote
+system or another event has occurred to terminate the link. This may be
+returned in response to a lapb_connect_request (see above) if the remote
+system refused the request. The values for reason are:
+
+LAPB_OK The LAPB link was terminated normally by the remote
+ system.
+LAPB_REFUSED The remote system refused the connect request.
+LAPB_NOTCONNECTED The remote system was not connected.
+LAPB_TIMEDOUT No response was received in N2 tries from the remote
+ system.
+
+
+int (*data_indication)(void *token, struct sk_buff *skb);
+
+This is called by the LAPB module when data has been received from the
+remote system that should be passed onto the next layer in the protocol
+stack. The skbuff becomes the property of the device driver and the LAPB
+module will not perform any more actions on it. The skb->data pointer will
+be pointing to the first byte of data after the LAPB header.
+
+This method should return NET_RX_DROP (as defined in the header
+file include/linux/netdevice.h) if and only if the frame was dropped
+before it could be delivered to the upper layer.
+
+
+void (*data_transmit)(void *token, struct sk_buff *skb);
+
+This is called by the LAPB module when data is to be transmitted to the
+remote system by the device driver. The skbuff becomes the property of the
+device driver and the LAPB module will not perform any more actions on it.
+The skb->data pointer will be pointing to the first byte of the LAPB header.
diff --git a/Documentation/networking/ltpc.txt b/Documentation/networking/ltpc.txt
new file mode 100644
index 0000000..fe2a912
--- /dev/null
+++ b/Documentation/networking/ltpc.txt
@@ -0,0 +1,131 @@
+This is the ALPHA version of the ltpc driver.
+
+In order to use it, you will need at least version 1.3.3 of the
+netatalk package, and the Apple or Farallon LocalTalk PC card.
+There are a number of different LocalTalk cards for the PC; this
+driver applies only to the one with the 65c02 processor chip on it.
+
+To include it in the kernel, select the CONFIG_LTPC switch in the
+configuration dialog. You can also compile it as a module.
+
+While the driver will attempt to autoprobe the I/O port address, IRQ
+line, and DMA channel of the card, this does not always work. For
+this reason, you should be prepared to supply these parameters
+yourself. (see "Card Configuration" below for how to determine or
+change the settings on your card)
+
+When the driver is compiled into the kernel, you can add a line such
+as the following to your /etc/lilo.conf:
+
+ append="ltpc=0x240,9,1"
+
+where the parameters (in order) are the port address, IRQ, and DMA
+channel. The second and third values can be omitted, in which case
+the driver will try to determine them itself.
+
+If you load the driver as a module, you can pass the parameters "io=",
+"irq=", and "dma=" on the command line with insmod or modprobe, or add
+them as options in /etc/modprobe.conf:
+
+ alias lt0 ltpc # autoload the module when the interface is configured
+ options ltpc io=0x240 irq=9 dma=1
+
+Before starting up the netatalk demons (perhaps in rc.local), you
+need to add a line such as:
+
+ /sbin/ifconfig lt0 127.0.0.42
+
+The address is unimportant - however, the card needs to be configured
+with ifconfig so that Netatalk can find it.
+
+The appropriate netatalk configuration depends on whether you are
+attached to a network that includes AppleTalk routers or not. If,
+like me, you are simply connecting to your home Macintoshes and
+printers, you need to set up netatalk to "seed". The way I do this
+is to have the lines
+
+ dummy -seed -phase 2 -net 2000 -addr 2000.26 -zone "1033"
+ lt0 -seed -phase 1 -net 1033 -addr 1033.27 -zone "1033"
+
+in my atalkd.conf. What is going on here is that I need to fool
+netatalk into thinking that there are two AppleTalk interfaces
+present; otherwise, it refuses to seed. This is a hack, and a more
+permanent solution would be to alter the netatalk code. Also, make
+sure you have the correct name for the dummy interface - If it's
+compiled as a module, you will need to refer to it as "dummy0" or some
+such.
+
+If you are attached to an extended AppleTalk network, with routers on
+it, then you don't need to fool around with this -- the appropriate
+line in atalkd.conf is
+
+ lt0 -phase 1
+
+--------------------------------------
+
+Card Configuration:
+
+The interrupts and so forth are configured via the dipswitch on the
+board. Set the switches so as not to conflict with other hardware.
+
+ Interrupts -- set at most one. If none are set, the driver uses
+ polled mode. Because the card was developed in the XT era, the
+ original documentation refers to IRQ2. Since you'll be running
+ this on an AT (or later) class machine, that really means IRQ9.
+
+ SW1 IRQ 4
+ SW2 IRQ 3
+ SW3 IRQ 9 (2 in original card documentation only applies to XT)
+
+
+ DMA -- choose DMA 1 or 3, and set both corresponding switches.
+
+ SW4 DMA 3
+ SW5 DMA 1
+ SW6 DMA 3
+ SW7 DMA 1
+
+
+ I/O address -- choose one.
+
+ SW8 220 / 240
+
+--------------------------------------
+
+IP:
+
+Yes, it is possible to do IP over LocalTalk. However, you can't just
+treat the LocalTalk device like an ordinary Ethernet device, even if
+that's what it looks like to Netatalk.
+
+Instead, you follow the same procedure as for doing IP in EtherTalk.
+See Documentation/networking/ipddp.txt for more information about the
+kernel driver and userspace tools needed.
+
+--------------------------------------
+
+BUGS:
+
+IRQ autoprobing often doesn't work on a cold boot. To get around
+this, either compile the driver as a module, or pass the parameters
+for the card to the kernel as described above.
+
+Also, as usual, autoprobing is not recommended when you use the driver
+as a module. (though it usually works at boot time, at least)
+
+Polled mode is *really* slow sometimes, but this seems to depend on
+the configuration of the network.
+
+It may theoretically be possible to use two LTPC cards in the same
+machine, but this is unsupported, so if you really want to do this,
+you'll probably have to hack the initialization code a bit.
+
+______________________________________
+
+THANKS:
+ Thanks to Alan Cox for helpful discussions early on in this
+work, and to Denis Hainsworth for doing the bleeding-edge testing.
+
+-- Bradford Johnson <bradford@math.umn.edu>
+
+-- Updated 11/09/1998 by David Huggins-Daines <dhd@debian.org>
diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt
new file mode 100644
index 0000000..84906ef
--- /dev/null
+++ b/Documentation/networking/mac80211-injection.txt
@@ -0,0 +1,79 @@
+How to use packet injection with mac80211
+=========================================
+
+mac80211 now allows arbitrary packets to be injected down any Monitor Mode
+interface from userland. The packet you inject needs to be composed in the
+following format:
+
+ [ radiotap header ]
+ [ ieee80211 header ]
+ [ payload ]
+
+The radiotap format is discussed in
+./Documentation/networking/radiotap-headers.txt.
+
+Despite 13 radiotap argument types are currently defined, most only make sense
+to appear on received packets. The following information is parsed from the
+radiotap headers and used to control injection:
+
+ * IEEE80211_RADIOTAP_RATE
+
+ rate in 500kbps units, automatic if invalid or not present
+
+
+ * IEEE80211_RADIOTAP_ANTENNA
+
+ antenna to use, automatic if not present
+
+
+ * IEEE80211_RADIOTAP_DBM_TX_POWER
+
+ transmit power in dBm, automatic if not present
+
+
+ * IEEE80211_RADIOTAP_FLAGS
+
+ IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated
+ IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available
+ IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the
+ current fragmentation threshold. Note that
+ this flag is only reliable when software
+ fragmentation is enabled)
+
+The injection code can also skip all other currently defined radiotap fields
+facilitating replay of captured radiotap headers directly.
+
+Here is an example valid radiotap header defining these three parameters
+
+ 0x00, 0x00, // <-- radiotap version
+ 0x0b, 0x00, // <- radiotap header length
+ 0x04, 0x0c, 0x00, 0x00, // <-- bitmap
+ 0x6c, // <-- rate
+ 0x0c, //<-- tx power
+ 0x01 //<-- antenna
+
+The ieee80211 header follows immediately afterwards, looking for example like
+this:
+
+ 0x08, 0x01, 0x00, 0x00,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x13, 0x22, 0x33, 0x44, 0x55, 0x66,
+ 0x13, 0x22, 0x33, 0x44, 0x55, 0x66,
+ 0x10, 0x86
+
+Then lastly there is the payload.
+
+After composing the packet contents, it is sent by send()-ing it to a logical
+mac80211 interface that is in Monitor mode. Libpcap can also be used,
+(which is easier than doing the work to bind the socket to the right
+interface), along the following lines:
+
+ ppcap = pcap_open_live(szInterfaceName, 800, 1, 20, szErrbuf);
+...
+ r = pcap_inject(ppcap, u8aSendBuffer, nLength);
+
+You can also find sources for a complete inject test applet here:
+
+http://penumbra.warmcat.com/_twk/tiki-index.php?page=packetspammer
+
+Andy Green <andy@warmcat.com>
diff --git a/Documentation/networking/mac80211_hwsim/README b/Documentation/networking/mac80211_hwsim/README
new file mode 100644
index 0000000..2ff8ccb
--- /dev/null
+++ b/Documentation/networking/mac80211_hwsim/README
@@ -0,0 +1,67 @@
+mac80211_hwsim - software simulator of 802.11 radio(s) for mac80211
+Copyright (c) 2008, Jouni Malinen <j@w1.fi>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 2 as
+published by the Free Software Foundation.
+
+
+Introduction
+
+mac80211_hwsim is a Linux kernel module that can be used to simulate
+arbitrary number of IEEE 802.11 radios for mac80211. It can be used to
+test most of the mac80211 functionality and user space tools (e.g.,
+hostapd and wpa_supplicant) in a way that matches very closely with
+the normal case of using real WLAN hardware. From the mac80211 view
+point, mac80211_hwsim is yet another hardware driver, i.e., no changes
+to mac80211 are needed to use this testing tool.
+
+The main goal for mac80211_hwsim is to make it easier for developers
+to test their code and work with new features to mac80211, hostapd,
+and wpa_supplicant. The simulated radios do not have the limitations
+of real hardware, so it is easy to generate an arbitrary test setup
+and always reproduce the same setup for future tests. In addition,
+since all radio operation is simulated, any channel can be used in
+tests regardless of regulatory rules.
+
+mac80211_hwsim kernel module has a parameter 'radios' that can be used
+to select how many radios are simulated (default 2). This allows
+configuration of both very simply setups (e.g., just a single access
+point and a station) or large scale tests (multiple access points with
+hundreds of stations).
+
+mac80211_hwsim works by tracking the current channel of each virtual
+radio and copying all transmitted frames to all other radios that are
+currently enabled and on the same channel as the transmitting
+radio. Software encryption in mac80211 is used so that the frames are
+actually encrypted over the virtual air interface to allow more
+complete testing of encryption.
+
+A global monitoring netdev, hwsim#, is created independent of
+mac80211. This interface can be used to monitor all transmitted frames
+regardless of channel.
+
+
+Simple example
+
+This example shows how to use mac80211_hwsim to simulate two radios:
+one to act as an access point and the other as a station that
+associates with the AP. hostapd and wpa_supplicant are used to take
+care of WPA2-PSK authentication. In addition, hostapd is also
+processing access point side of association.
+
+Please note that the current Linux kernel does not enable AP mode, so a
+simple patch is needed to enable AP mode selection:
+http://johannes.sipsolutions.net/patches/kernel/all/LATEST/006-allow-ap-vlan-modes.patch
+
+
+# Build mac80211_hwsim as part of kernel configuration
+
+# Load the module
+modprobe mac80211_hwsim
+
+# Run hostapd (AP) for wlan0
+hostapd hostapd.conf
+
+# Run wpa_supplicant (station) for wlan1
+wpa_supplicant -Dwext -iwlan1 -c wpa_supplicant.conf
diff --git a/Documentation/networking/mac80211_hwsim/hostapd.conf b/Documentation/networking/mac80211_hwsim/hostapd.conf
new file mode 100644
index 0000000..08cde7e
--- /dev/null
+++ b/Documentation/networking/mac80211_hwsim/hostapd.conf
@@ -0,0 +1,11 @@
+interface=wlan0
+driver=nl80211
+
+hw_mode=g
+channel=1
+ssid=mac80211 test
+
+wpa=2
+wpa_key_mgmt=WPA-PSK
+wpa_pairwise=CCMP
+wpa_passphrase=12345678
diff --git a/Documentation/networking/mac80211_hwsim/wpa_supplicant.conf b/Documentation/networking/mac80211_hwsim/wpa_supplicant.conf
new file mode 100644
index 0000000..299128c
--- /dev/null
+++ b/Documentation/networking/mac80211_hwsim/wpa_supplicant.conf
@@ -0,0 +1,10 @@
+ctrl_interface=/var/run/wpa_supplicant
+
+network={
+ ssid="mac80211 test"
+ psk="12345678"
+ key_mgmt=WPA-PSK
+ proto=WPA2
+ pairwise=CCMP
+ group=CCMP
+}
diff --git a/Documentation/networking/multicast.txt b/Documentation/networking/multicast.txt
new file mode 100644
index 0000000..b06c8c6
--- /dev/null
+++ b/Documentation/networking/multicast.txt
@@ -0,0 +1,63 @@
+Behaviour of Cards Under Multicast
+==================================
+
+This is how they currently behave, not what the hardware can do--for example,
+the Lance driver doesn't use its filter, even though the code for loading
+it is in the DEC Lance-based driver.
+
+The following are requirements for multicasting
+-----------------------------------------------
+AppleTalk Multicast hardware filtering not important but
+ avoid cards only doing promisc
+IP-Multicast Multicast hardware filters really help
+IP-MRoute AllMulti hardware filters are of no help
+
+
+Board Multicast AllMulti Promisc Filter
+------------------------------------------------------------------------
+3c501 YES YES YES Software
+3c503 YES YES YES Hardware
+3c505 YES NO YES Hardware
+3c507 NO NO NO N/A
+3c509 YES YES YES Software
+3c59x YES YES YES Software
+ac3200 YES YES YES Hardware
+apricot YES PROMISC YES Hardware
+arcnet NO NO NO N/A
+at1700 PROMISC PROMISC YES Software
+atp PROMISC PROMISC YES Software
+cs89x0 YES YES YES Software
+de4x5 YES YES YES Hardware
+de600 NO NO NO N/A
+de620 PROMISC PROMISC YES Software
+depca YES PROMISC YES Hardware
+dmfe YES YES YES Software(*)
+e2100 YES YES YES Hardware
+eepro YES PROMISC YES Hardware
+eexpress NO NO NO N/A
+ewrk3 YES PROMISC YES Hardware
+hp-plus YES YES YES Hardware
+hp YES YES YES Hardware
+hp100 YES YES YES Hardware
+ibmtr NO NO NO N/A
+ioc3-eth YES YES YES Hardware
+lance YES YES YES Software(#)
+ne YES YES YES Hardware
+ni52 <------------------ Buggy ------------------>
+ni65 YES YES YES Software(#)
+seeq NO NO NO N/A
+sgiseek <------------------ Buggy ------------------>
+smc-ultra YES YES YES Hardware
+sunlance YES YES YES Hardware
+tulip YES YES YES Hardware
+wavelan YES PROMISC YES Hardware
+wd YES YES YES Hardware
+xirc2ps_cs YES YES YES Hardware
+znet YES YES YES Software
+
+
+PROMISC = This multicast mode is in fact promiscuous mode. Avoid using
+cards who go PROMISC on any multicast in a multicast kernel.
+
+(#) = Hardware multicast support is not used yet.
+(*) = Hardware support for Davicom 9132 chipset only.
diff --git a/Documentation/networking/multiqueue.txt b/Documentation/networking/multiqueue.txt
new file mode 100644
index 0000000..4caa0e3
--- /dev/null
+++ b/Documentation/networking/multiqueue.txt
@@ -0,0 +1,79 @@
+
+ HOWTO for multiqueue network device support
+ ===========================================
+
+Section 1: Base driver requirements for implementing multiqueue support
+
+Intro: Kernel support for multiqueue devices
+---------------------------------------------------------
+
+Kernel support for multiqueue devices is always present.
+
+Section 1: Base driver requirements for implementing multiqueue support
+-----------------------------------------------------------------------
+
+Base drivers are required to use the new alloc_etherdev_mq() or
+alloc_netdev_mq() functions to allocate the subqueues for the device. The
+underlying kernel API will take care of the allocation and deallocation of
+the subqueue memory, as well as netdev configuration of where the queues
+exist in memory.
+
+The base driver will also need to manage the queues as it does the global
+netdev->queue_lock today. Therefore base drivers should use the
+netif_{start|stop|wake}_subqueue() functions to manage each queue while the
+device is still operational. netdev->queue_lock is still used when the device
+comes online or when it's completely shut down (unregister_netdev(), etc.).
+
+
+Section 2: Qdisc support for multiqueue devices
+
+-----------------------------------------------
+
+Currently two qdiscs are optimized for multiqueue devices. The first is the
+default pfifo_fast qdisc. This qdisc supports one qdisc per hardware queue.
+A new round-robin qdisc, sch_multiq also supports multiple hardware queues. The
+qdisc is responsible for classifying the skb's and then directing the skb's to
+bands and queues based on the value in skb->queue_mapping. Use this field in
+the base driver to determine which queue to send the skb to.
+
+sch_multiq has been added for hardware that wishes to avoid head-of-line
+blocking. It will cycle though the bands and verify that the hardware queue
+associated with the band is not stopped prior to dequeuing a packet.
+
+On qdisc load, the number of bands is based on the number of queues on the
+hardware. Once the association is made, any skb with skb->queue_mapping set,
+will be queued to the band associated with the hardware queue.
+
+
+Section 3: Brief howto using MULTIQ for multiqueue devices
+---------------------------------------------------------------
+
+The userspace command 'tc,' part of the iproute2 package, is used to configure
+qdiscs. To add the MULTIQ qdisc to your network device, assuming the device
+is called eth0, run the following command:
+
+# tc qdisc add dev eth0 root handle 1: multiq
+
+The qdisc will allocate the number of bands to equal the number of queues that
+the device reports, and bring the qdisc online. Assuming eth0 has 4 Tx
+queues, the band mapping would look like:
+
+band 0 => queue 0
+band 1 => queue 1
+band 2 => queue 2
+band 3 => queue 3
+
+Traffic will begin flowing through each queue based on either the simple_tx_hash
+function or based on netdev->select_queue() if you have it defined.
+
+The behavior of tc filters remains the same. However a new tc action,
+skbedit, has been added. Assuming you wanted to route all traffic to a
+specific host, for example 192.168.0.3, through a specific queue you could use
+this action and establish a filter such as:
+
+tc filter add dev eth0 parent 1: protocol ip prio 1 u32 \
+ match ip dst 192.168.0.3 \
+ action skbedit queue_mapping 3
+
+Author: Alexander Duyck <alexander.h.duyck@intel.com>
+Original Author: Peter P. Waskiewicz Jr. <peter.p.waskiewicz.jr@intel.com>
diff --git a/Documentation/networking/netconsole.txt b/Documentation/networking/netconsole.txt
new file mode 100644
index 0000000..3c2f2b3
--- /dev/null
+++ b/Documentation/networking/netconsole.txt
@@ -0,0 +1,156 @@
+
+started by Ingo Molnar <mingo@redhat.com>, 2001.09.17
+2.6 port and netpoll api by Matt Mackall <mpm@selenic.com>, Sep 9 2003
+
+Please send bug reports to Matt Mackall <mpm@selenic.com>
+and Satyam Sharma <satyam.sharma@gmail.com>
+
+Introduction:
+=============
+
+This module logs kernel printk messages over UDP allowing debugging of
+problem where disk logging fails and serial consoles are impractical.
+
+It can be used either built-in or as a module. As a built-in,
+netconsole initializes immediately after NIC cards and will bring up
+the specified interface as soon as possible. While this doesn't allow
+capture of early kernel panics, it does capture most of the boot
+process.
+
+Sender and receiver configuration:
+==================================
+
+It takes a string configuration parameter "netconsole" in the
+following format:
+
+ netconsole=[src-port]@[src-ip]/[<dev>],[tgt-port]@<tgt-ip>/[tgt-macaddr]
+
+ where
+ src-port source for UDP packets (defaults to 6665)
+ src-ip source IP to use (interface address)
+ dev network interface (eth0)
+ tgt-port port for logging agent (6666)
+ tgt-ip IP address for logging agent
+ tgt-macaddr ethernet MAC address for logging agent (broadcast)
+
+Examples:
+
+ linux netconsole=4444@10.0.0.1/eth1,9353@10.0.0.2/12:34:56:78:9a:bc
+
+ or
+
+ insmod netconsole netconsole=@/,@10.0.0.2/
+
+It also supports logging to multiple remote agents by specifying
+parameters for the multiple agents separated by semicolons and the
+complete string enclosed in "quotes", thusly:
+
+ modprobe netconsole netconsole="@/,@10.0.0.2/;@/eth1,6892@10.0.0.3/"
+
+Built-in netconsole starts immediately after the TCP stack is
+initialized and attempts to bring up the supplied dev at the supplied
+address.
+
+The remote host can run either 'netcat -u -l -p <port>' or syslogd.
+
+Dynamic reconfiguration:
+========================
+
+Dynamic reconfigurability is a useful addition to netconsole that enables
+remote logging targets to be dynamically added, removed, or have their
+parameters reconfigured at runtime from a configfs-based userspace interface.
+[ Note that the parameters of netconsole targets that were specified/created
+from the boot/module option are not exposed via this interface, and hence
+cannot be modified dynamically. ]
+
+To include this feature, select CONFIG_NETCONSOLE_DYNAMIC when building the
+netconsole module (or kernel, if netconsole is built-in).
+
+Some examples follow (where configfs is mounted at the /sys/kernel/config
+mountpoint).
+
+To add a remote logging target (target names can be arbitrary):
+
+ cd /sys/kernel/config/netconsole/
+ mkdir target1
+
+Note that newly created targets have default parameter values (as mentioned
+above) and are disabled by default -- they must first be enabled by writing
+"1" to the "enabled" attribute (usually after setting parameters accordingly)
+as described below.
+
+To remove a target:
+
+ rmdir /sys/kernel/config/netconsole/othertarget/
+
+The interface exposes these parameters of a netconsole target to userspace:
+
+ enabled Is this target currently enabled? (read-write)
+ dev_name Local network interface name (read-write)
+ local_port Source UDP port to use (read-write)
+ remote_port Remote agent's UDP port (read-write)
+ local_ip Source IP address to use (read-write)
+ remote_ip Remote agent's IP address (read-write)
+ local_mac Local interface's MAC address (read-only)
+ remote_mac Remote agent's MAC address (read-write)
+
+The "enabled" attribute is also used to control whether the parameters of
+a target can be updated or not -- you can modify the parameters of only
+disabled targets (i.e. if "enabled" is 0).
+
+To update a target's parameters:
+
+ cat enabled # check if enabled is 1
+ echo 0 > enabled # disable the target (if required)
+ echo eth2 > dev_name # set local interface
+ echo 10.0.0.4 > remote_ip # update some parameter
+ echo cb:a9:87:65:43:21 > remote_mac # update more parameters
+ echo 1 > enabled # enable target again
+
+You can also update the local interface dynamically. This is especially
+useful if you want to use interfaces that have newly come up (and may not
+have existed when netconsole was loaded / initialized).
+
+Miscellaneous notes:
+====================
+
+WARNING: the default target ethernet setting uses the broadcast
+ethernet address to send packets, which can cause increased load on
+other systems on the same ethernet segment.
+
+TIP: some LAN switches may be configured to suppress ethernet broadcasts
+so it is advised to explicitly specify the remote agents' MAC addresses
+from the config parameters passed to netconsole.
+
+TIP: to find out the MAC address of, say, 10.0.0.2, you may try using:
+
+ ping -c 1 10.0.0.2 ; /sbin/arp -n | grep 10.0.0.2
+
+TIP: in case the remote logging agent is on a separate LAN subnet than
+the sender, it is suggested to try specifying the MAC address of the
+default gateway (you may use /sbin/route -n to find it out) as the
+remote MAC address instead.
+
+NOTE: the network device (eth1 in the above case) can run any kind
+of other network traffic, netconsole is not intrusive. Netconsole
+might cause slight delays in other traffic if the volume of kernel
+messages is high, but should have no other impact.
+
+NOTE: if you find that the remote logging agent is not receiving or
+printing all messages from the sender, it is likely that you have set
+the "console_loglevel" parameter (on the sender) to only send high
+priority messages to the console. You can change this at runtime using:
+
+ dmesg -n 8
+
+or by specifying "debug" on the kernel command line at boot, to send
+all kernel messages to the console. A specific value for this parameter
+can also be set using the "loglevel" kernel boot option. See the
+dmesg(8) man page and Documentation/kernel-parameters.txt for details.
+
+Netconsole was designed to be as instantaneous as possible, to
+enable the logging of even the most critical kernel bugs. It works
+from IRQ contexts as well, and does not enable interrupts while
+sending packets. Due to these unique needs, configuration cannot
+be more automatic, and some fundamental limitations will remain:
+only IP networks, UDP packets and ethernet devices are supported.
diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
new file mode 100644
index 0000000..d0f71fc
--- /dev/null
+++ b/Documentation/networking/netdevices.txt
@@ -0,0 +1,108 @@
+
+Network Devices, the Kernel, and You!
+
+
+Introduction
+============
+The following is a random collection of documentation regarding
+network devices.
+
+struct net_device allocation rules
+==================================
+Network device structures need to persist even after module is unloaded and
+must be allocated with kmalloc. If device has registered successfully,
+it will be freed on last use by free_netdev. This is required to handle the
+pathologic case cleanly (example: rmmod mydriver </sys/class/net/myeth/mtu )
+
+There are routines in net_init.c to handle the common cases of
+alloc_etherdev, alloc_netdev. These reserve extra space for driver
+private data which gets freed when the network device is freed. If
+separately allocated data is attached to the network device
+(dev->priv) then it is up to the module exit handler to free that.
+
+MTU
+===
+Each network device has a Maximum Transfer Unit. The MTU does not
+include any link layer protocol overhead. Upper layer protocols must
+not pass a socket buffer (skb) to a device to transmit with more data
+than the mtu. The MTU does not include link layer header overhead, so
+for example on Ethernet if the standard MTU is 1500 bytes used, the
+actual skb will contain up to 1514 bytes because of the Ethernet
+header. Devices should allow for the 4 byte VLAN header as well.
+
+Segmentation Offload (GSO, TSO) is an exception to this rule. The
+upper layer protocol may pass a large socket buffer to the device
+transmit routine, and the device will break that up into separate
+packets based on the current MTU.
+
+MTU is symmetrical and applies both to receive and transmit. A device
+must be able to receive at least the maximum size packet allowed by
+the MTU. A network device may use the MTU as mechanism to size receive
+buffers, but the device should allow packets with VLAN header. With
+standard Ethernet mtu of 1500 bytes, the device should allow up to
+1518 byte packets (1500 + 14 header + 4 tag). The device may either:
+drop, truncate, or pass up oversize packets, but dropping oversize
+packets is preferred.
+
+
+struct net_device synchronization rules
+=======================================
+dev->open:
+ Synchronization: rtnl_lock() semaphore.
+ Context: process
+
+dev->stop:
+ Synchronization: rtnl_lock() semaphore.
+ Context: process
+ Note1: netif_running() is guaranteed false
+ Note2: dev->poll() is guaranteed to be stopped
+
+dev->do_ioctl:
+ Synchronization: rtnl_lock() semaphore.
+ Context: process
+
+dev->get_stats:
+ Synchronization: dev_base_lock rwlock.
+ Context: nominally process, but don't sleep inside an rwlock
+
+dev->hard_start_xmit:
+ Synchronization: netif_tx_lock spinlock.
+
+ When the driver sets NETIF_F_LLTX in dev->features this will be
+ called without holding netif_tx_lock. In this case the driver
+ has to lock by itself when needed. It is recommended to use a try lock
+ for this and return NETDEV_TX_LOCKED when the spin lock fails.
+ The locking there should also properly protect against
+ set_multicast_list. Note that the use of NETIF_F_LLTX is deprecated.
+ Dont use it for new drivers.
+
+ Context: Process with BHs disabled or BH (timer),
+ will be called with interrupts disabled by netconsole.
+
+ Return codes:
+ o NETDEV_TX_OK everything ok.
+ o NETDEV_TX_BUSY Cannot transmit packet, try later
+ Usually a bug, means queue start/stop flow control is broken in
+ the driver. Note: the driver must NOT put the skb in its DMA ring.
+ o NETDEV_TX_LOCKED Locking failed, please retry quickly.
+ Only valid when NETIF_F_LLTX is set.
+
+dev->tx_timeout:
+ Synchronization: netif_tx_lock spinlock.
+ Context: BHs disabled
+ Notes: netif_queue_stopped() is guaranteed true
+
+dev->set_multicast_list:
+ Synchronization: netif_tx_lock spinlock.
+ Context: BHs disabled
+
+struct napi_struct synchronization rules
+========================================
+napi->poll:
+ Synchronization: NAPI_STATE_SCHED bit in napi->state. Device
+ driver's dev->close method will invoke napi_disable() on
+ all NAPI instances which will do a sleeping poll on the
+ NAPI_STATE_SCHED napi->state bit, waiting for all pending
+ NAPI activity to cease.
+ Context: softirq
+ will be called with interrupts disabled by netconsole.
diff --git a/Documentation/networking/netif-msg.txt b/Documentation/networking/netif-msg.txt
new file mode 100644
index 0000000..c967ddb
--- /dev/null
+++ b/Documentation/networking/netif-msg.txt
@@ -0,0 +1,79 @@
+
+________________
+NETIF Msg Level
+
+The design of the network interface message level setting.
+
+History
+
+ The design of the debugging message interface was guided and
+ constrained by backwards compatibility previous practice. It is useful
+ to understand the history and evolution in order to understand current
+ practice and relate it to older driver source code.
+
+ From the beginning of Linux, each network device driver has had a local
+ integer variable that controls the debug message level. The message
+ level ranged from 0 to 7, and monotonically increased in verbosity.
+
+ The message level was not precisely defined past level 3, but were
+ always implemented within +-1 of the specified level. Drivers tended
+ to shed the more verbose level messages as they matured.
+ 0 Minimal messages, only essential information on fatal errors.
+ 1 Standard messages, initialization status. No run-time messages
+ 2 Special media selection messages, generally timer-driver.
+ 3 Interface starts and stops, including normal status messages
+ 4 Tx and Rx frame error messages, and abnormal driver operation
+ 5 Tx packet queue information, interrupt events.
+ 6 Status on each completed Tx packet and received Rx packets
+ 7 Initial contents of Tx and Rx packets
+
+ Initially this message level variable was uniquely named in each driver
+ e.g. "lance_debug", so that a kernel symbolic debugger could locate and
+ modify the setting. When kernel modules became common, the variables
+ were consistently renamed to "debug" and allowed to be set as a module
+ parameter.
+
+ This approach worked well. However there is always a demand for
+ additional features. Over the years the following emerged as
+ reasonable and easily implemented enhancements
+ Using an ioctl() call to modify the level.
+ Per-interface rather than per-driver message level setting.
+ More selective control over the type of messages emitted.
+
+ The netif_msg recommendation adds these features with only a minor
+ complexity and code size increase.
+
+ The recommendation is the following points
+ Retaining the per-driver integer variable "debug" as a module
+ parameter with a default level of '1'.
+
+ Adding a per-interface private variable named "msg_enable". The
+ variable is a bit map rather than a level, and is initialized as
+ 1 << debug
+ Or more precisely
+ debug < 0 ? 0 : 1 << min(sizeof(int)-1, debug)
+
+ Messages should changes from
+ if (debug > 1)
+ printk(MSG_DEBUG "%s: ...
+ to
+ if (np->msg_enable & NETIF_MSG_LINK)
+ printk(MSG_DEBUG "%s: ...
+
+
+The set of message levels is named
+ Old level Name Bit position
+ 0 NETIF_MSG_DRV 0x0001
+ 1 NETIF_MSG_PROBE 0x0002
+ 2 NETIF_MSG_LINK 0x0004
+ 2 NETIF_MSG_TIMER 0x0004
+ 3 NETIF_MSG_IFDOWN 0x0008
+ 3 NETIF_MSG_IFUP 0x0008
+ 4 NETIF_MSG_RX_ERR 0x0010
+ 4 NETIF_MSG_TX_ERR 0x0010
+ 5 NETIF_MSG_TX_QUEUED 0x0020
+ 5 NETIF_MSG_INTR 0x0020
+ 6 NETIF_MSG_TX_DONE 0x0040
+ 6 NETIF_MSG_RX_STATUS 0x0040
+ 7 NETIF_MSG_PKTDATA 0x0080
+
diff --git a/Documentation/networking/olympic.txt b/Documentation/networking/olympic.txt
new file mode 100644
index 0000000..c65a940
--- /dev/null
+++ b/Documentation/networking/olympic.txt
@@ -0,0 +1,79 @@
+
+IBM PCI Pit/Pit-Phy/Olympic CHIPSET BASED TOKEN RING CARDS README
+
+Release 0.2.0 - Release
+ June 8th 1999 Peter De Schrijver & Mike Phillips
+Release 0.9.C - Release
+ April 18th 2001 Mike Phillips
+
+Thanks:
+Erik De Cock, Adrian Bridgett and Frank Fiene for their
+patience and testing.
+Donald Champion for the cardbus support
+Kyle Lucke for the dma api changes.
+Jonathon Bitner for hardware support.
+Everybody on linux-tr for their continued support.
+
+Options:
+
+The driver accepts four options: ringspeed, pkt_buf_sz,
+message_level and network_monitor.
+
+These options can be specified differently for each card found.
+
+ringspeed: Has one of three settings 0 (default), 4 or 16. 0 will
+make the card autosense the ringspeed and join at the appropriate speed,
+this will be the default option for most people. 4 or 16 allow you to
+explicitly force the card to operate at a certain speed. The card will fail
+if you try to insert it at the wrong speed. (Although some hubs will allow
+this so be *very* careful). The main purpose for explicitly setting the ring
+speed is for when the card is first on the ring. In autosense mode, if the card
+cannot detect any active monitors on the ring it will not open, so you must
+re-init the card at the appropriate speed. Unfortunately at present the only
+way of doing this is rmmod and insmod which is a bit tough if it is compiled
+in the kernel.
+
+pkt_buf_sz: This is this initial receive buffer allocation size. This will
+default to 4096 if no value is entered. You may increase performance of the
+driver by setting this to a value larger than the network packet size, although
+the driver now re-sizes buffers based on MTU settings as well.
+
+message_level: Controls level of messages created by the driver. Defaults to 0:
+which only displays start-up and critical messages. Presently any non-zero
+value will display all soft messages as well. NB This does not turn
+debugging messages on, that must be done by modified the source code.
+
+network_monitor: Any non-zero value will provide a quasi network monitoring
+mode. All unexpected MAC frames (beaconing etc.) will be received
+by the driver and the source and destination addresses printed.
+Also an entry will be added in /proc/net called olympic_tr%d, where tr%d
+is the registered device name, i.e tr0, tr1, etc. This displays low
+level information about the configuration of the ring and the adapter.
+This feature has been designed for network administrators to assist in
+the diagnosis of network / ring problems. (This used to OLYMPIC_NETWORK_MONITOR,
+but has now changed to allow each adapter to be configured differently and
+to alleviate the necessity to re-compile olympic to turn the option on).
+
+Multi-card:
+
+The driver will detect multiple cards and will work with shared interrupts,
+each card is assigned the next token ring device, i.e. tr0 , tr1, tr2. The
+driver should also happily reside in the system with other drivers. It has
+been tested with ibmtr.c running, and I personally have had one Olicom PCI
+card and two IBM olympic cards (all on the same interrupt), all running
+together.
+
+Variable MTU size:
+
+The driver can handle a MTU size upto either 4500 or 18000 depending upon
+ring speed. The driver also changes the size of the receive buffers as part
+of the mtu re-sizing, so if you set mtu = 18000, you will need to be able
+to allocate 16 * (sk_buff with 18000 buffer size) call it 18500 bytes per ring
+position = 296,000 bytes of memory space, plus of course anything
+necessary for the tx sk_buff's. Remember this is per card, so if you are
+building routers, gateway's etc, you could start to use a lot of memory
+real fast.
+
+
+6/8/99 Peter De Schrijver and Mike Phillips
+
diff --git a/Documentation/networking/operstates.txt b/Documentation/networking/operstates.txt
new file mode 100644
index 0000000..c9074f9
--- /dev/null
+++ b/Documentation/networking/operstates.txt
@@ -0,0 +1,161 @@
+
+1. Introduction
+
+Linux distinguishes between administrative and operational state of an
+interface. Administrative state is the result of "ip link set dev
+<dev> up or down" and reflects whether the administrator wants to use
+the device for traffic.
+
+However, an interface is not usable just because the admin enabled it
+- ethernet requires to be plugged into the switch and, depending on
+a site's networking policy and configuration, an 802.1X authentication
+to be performed before user data can be transferred. Operational state
+shows the ability of an interface to transmit this user data.
+
+Thanks to 802.1X, userspace must be granted the possibility to
+influence operational state. To accommodate this, operational state is
+split into two parts: Two flags that can be set by the driver only, and
+a RFC2863 compatible state that is derived from these flags, a policy,
+and changeable from userspace under certain rules.
+
+
+2. Querying from userspace
+
+Both admin and operational state can be queried via the netlink
+operation RTM_GETLINK. It is also possible to subscribe to RTMGRP_LINK
+to be notified of updates. This is important for setting from userspace.
+
+These values contain interface state:
+
+ifinfomsg::if_flags & IFF_UP:
+ Interface is admin up
+ifinfomsg::if_flags & IFF_RUNNING:
+ Interface is in RFC2863 operational state UP or UNKNOWN. This is for
+ backward compatibility, routing daemons, dhcp clients can use this
+ flag to determine whether they should use the interface.
+ifinfomsg::if_flags & IFF_LOWER_UP:
+ Driver has signaled netif_carrier_on()
+ifinfomsg::if_flags & IFF_DORMANT:
+ Driver has signaled netif_dormant_on()
+
+These interface flags can also be queried without netlink using the
+SIOCGIFFLAGS ioctl.
+
+TLV IFLA_OPERSTATE
+
+contains RFC2863 state of the interface in numeric representation:
+
+IF_OPER_UNKNOWN (0):
+ Interface is in unknown state, neither driver nor userspace has set
+ operational state. Interface must be considered for user data as
+ setting operational state has not been implemented in every driver.
+IF_OPER_NOTPRESENT (1):
+ Unused in current kernel (notpresent interfaces normally disappear),
+ just a numerical placeholder.
+IF_OPER_DOWN (2):
+ Interface is unable to transfer data on L1, f.e. ethernet is not
+ plugged or interface is ADMIN down.
+IF_OPER_LOWERLAYERDOWN (3):
+ Interfaces stacked on an interface that is IF_OPER_DOWN show this
+ state (f.e. VLAN).
+IF_OPER_TESTING (4):
+ Unused in current kernel.
+IF_OPER_DORMANT (5):
+ Interface is L1 up, but waiting for an external event, f.e. for a
+ protocol to establish. (802.1X)
+IF_OPER_UP (6):
+ Interface is operational up and can be used.
+
+This TLV can also be queried via sysfs.
+
+TLV IFLA_LINKMODE
+
+contains link policy. This is needed for userspace interaction
+described below.
+
+This TLV can also be queried via sysfs.
+
+
+3. Kernel driver API
+
+Kernel drivers have access to two flags that map to IFF_LOWER_UP and
+IFF_DORMANT. These flags can be set from everywhere, even from
+interrupts. It is guaranteed that only the driver has write access,
+however, if different layers of the driver manipulate the same flag,
+the driver has to provide the synchronisation needed.
+
+__LINK_STATE_NOCARRIER, maps to !IFF_LOWER_UP:
+
+The driver uses netif_carrier_on() to clear and netif_carrier_off() to
+set this flag. On netif_carrier_off(), the scheduler stops sending
+packets. The name 'carrier' and the inversion are historical, think of
+it as lower layer.
+
+netif_carrier_ok() can be used to query that bit.
+
+__LINK_STATE_DORMANT, maps to IFF_DORMANT:
+
+Set by the driver to express that the device cannot yet be used
+because some driver controlled protocol establishment has to
+complete. Corresponding functions are netif_dormant_on() to set the
+flag, netif_dormant_off() to clear it and netif_dormant() to query.
+
+On device allocation, networking core sets the flags equivalent to
+netif_carrier_ok() and !netif_dormant().
+
+
+Whenever the driver CHANGES one of these flags, a workqueue event is
+scheduled to translate the flag combination to IFLA_OPERSTATE as
+follows:
+
+!netif_carrier_ok():
+ IF_OPER_LOWERLAYERDOWN if the interface is stacked, IF_OPER_DOWN
+ otherwise. Kernel can recognise stacked interfaces because their
+ ifindex != iflink.
+
+netif_carrier_ok() && netif_dormant():
+ IF_OPER_DORMANT
+
+netif_carrier_ok() && !netif_dormant():
+ IF_OPER_UP if userspace interaction is disabled. Otherwise
+ IF_OPER_DORMANT with the possibility for userspace to initiate the
+ IF_OPER_UP transition afterwards.
+
+
+4. Setting from userspace
+
+Applications have to use the netlink interface to influence the
+RFC2863 operational state of an interface. Setting IFLA_LINKMODE to 1
+via RTM_SETLINK instructs the kernel that an interface should go to
+IF_OPER_DORMANT instead of IF_OPER_UP when the combination
+netif_carrier_ok() && !netif_dormant() is set by the
+driver. Afterwards, the userspace application can set IFLA_OPERSTATE
+to IF_OPER_DORMANT or IF_OPER_UP as long as the driver does not set
+netif_carrier_off() or netif_dormant_on(). Changes made by userspace
+are multicasted on the netlink group RTMGRP_LINK.
+
+So basically a 802.1X supplicant interacts with the kernel like this:
+
+-subscribe to RTMGRP_LINK
+-set IFLA_LINKMODE to 1 via RTM_SETLINK
+-query RTM_GETLINK once to get initial state
+-if initial flags are not (IFF_LOWER_UP && !IFF_DORMANT), wait until
+ netlink multicast signals this state
+-do 802.1X, eventually abort if flags go down again
+-send RTM_SETLINK to set operstate to IF_OPER_UP if authentication
+ succeeds, IF_OPER_DORMANT otherwise
+-see how operstate and IFF_RUNNING is echoed via netlink multicast
+-set interface back to IF_OPER_DORMANT if 802.1X reauthentication
+ fails
+-restart if kernel changes IFF_LOWER_UP or IFF_DORMANT flag
+
+if supplicant goes down, bring back IFLA_LINKMODE to 0 and
+IFLA_OPERSTATE to a sane value.
+
+A routing daemon or dhcp client just needs to care for IFF_RUNNING or
+waiting for operstate to go IF_OPER_UP/IF_OPER_UNKNOWN before
+considering the interface / querying a DHCP address.
+
+
+For technical questions and/or comments please e-mail to Stefan Rompf
+(stefan at loplof.de).
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
new file mode 100644
index 0000000..07c53d5
--- /dev/null
+++ b/Documentation/networking/packet_mmap.txt
@@ -0,0 +1,399 @@
+--------------------------------------------------------------------------------
++ ABSTRACT
+--------------------------------------------------------------------------------
+
+This file documents the CONFIG_PACKET_MMAP option available with the PACKET
+socket interface on 2.4 and 2.6 kernels. This type of sockets is used for
+capture network traffic with utilities like tcpdump or any other that uses
+the libpcap library.
+
+You can find the latest version of this document at
+
+ http://pusa.uv.es/~ulisses/packet_mmap/
+
+Please send me your comments to
+
+ Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es>
+
+-------------------------------------------------------------------------------
++ Why use PACKET_MMAP
+--------------------------------------------------------------------------------
+
+In Linux 2.4/2.6 if PACKET_MMAP is not enabled, the capture process is very
+inefficient. It uses very limited buffers and requires one system call
+to capture each packet, it requires two if you want to get packet's
+timestamp (like libpcap always does).
+
+In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size
+configurable circular buffer mapped in user space. This way reading packets just
+needs to wait for them, most of the time there is no need to issue a single
+system call. By using a shared buffer between the kernel and the user
+also has the benefit of minimizing packet copies.
+
+It's fine to use PACKET_MMAP to improve the performance of the capture process,
+but it isn't everything. At least, if you are capturing at high speeds (this
+is relative to the cpu speed), you should check if the device driver of your
+network interface card supports some sort of interrupt load mitigation or
+(even better) if it supports NAPI, also make sure it is enabled.
+
+--------------------------------------------------------------------------------
++ How to use CONFIG_PACKET_MMAP
+--------------------------------------------------------------------------------
+
+From the user standpoint, you should use the higher level libpcap library, which
+is a de facto standard, portable across nearly all operating systems
+including Win32.
+
+Said that, at time of this writing, official libpcap 0.8.1 is out and doesn't include
+support for PACKET_MMAP, and also probably the libpcap included in your distribution.
+
+I'm aware of two implementations of PACKET_MMAP in libpcap:
+
+ http://pusa.uv.es/~ulisses/packet_mmap/ (by Simon Patarin, based on libpcap 0.6.2)
+ http://public.lanl.gov/cpw/ (by Phil Wood, based on lastest libpcap)
+
+The rest of this document is intended for people who want to understand
+the low level details or want to improve libpcap by including PACKET_MMAP
+support.
+
+--------------------------------------------------------------------------------
++ How to use CONFIG_PACKET_MMAP directly
+--------------------------------------------------------------------------------
+
+From the system calls stand point, the use of PACKET_MMAP involves
+the following process:
+
+
+[setup] socket() -------> creation of the capture socket
+ setsockopt() ---> allocation of the circular buffer (ring)
+ mmap() ---------> mapping of the allocated buffer to the
+ user process
+
+[capture] poll() ---------> to wait for incoming packets
+
+[shutdown] close() --------> destruction of the capture socket and
+ deallocation of all associated
+ resources.
+
+
+socket creation and destruction is straight forward, and is done
+the same way with or without PACKET_MMAP:
+
+int fd;
+
+fd= socket(PF_PACKET, mode, htons(ETH_P_ALL))
+
+where mode is SOCK_RAW for the raw interface were link level
+information can be captured or SOCK_DGRAM for the cooked
+interface where link level information capture is not
+supported and a link level pseudo-header is provided
+by the kernel.
+
+The destruction of the socket and all associated resources
+is done by a simple call to close(fd).
+
+Next I will describe PACKET_MMAP settings and it's constraints,
+also the mapping of the circular buffer in the user process and
+the use of this buffer.
+
+--------------------------------------------------------------------------------
++ PACKET_MMAP settings
+--------------------------------------------------------------------------------
+
+
+To setup PACKET_MMAP from user level code is done with a call like
+
+ setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))
+
+The most significant argument in the previous call is the req parameter,
+this parameter must to have the following structure:
+
+ struct tpacket_req
+ {
+ unsigned int tp_block_size; /* Minimal size of contiguous block */
+ unsigned int tp_block_nr; /* Number of blocks */
+ unsigned int tp_frame_size; /* Size of frame */
+ unsigned int tp_frame_nr; /* Total number of frames */
+ };
+
+This structure is defined in /usr/include/linux/if_packet.h and establishes a
+circular buffer (ring) of unswappable memory mapped in the capture process.
+Being mapped in the capture process allows reading the captured frames and
+related meta-information like timestamps without requiring a system call.
+
+Captured frames are grouped in blocks. Each block is a physically contiguous
+region of memory and holds tp_block_size/tp_frame_size frames. The total number
+of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
+
+ frames_per_block = tp_block_size/tp_frame_size
+
+indeed, packet_set_ring checks that the following condition is true
+
+ frames_per_block * tp_block_nr == tp_frame_nr
+
+
+Lets see an example, with the following values:
+
+ tp_block_size= 4096
+ tp_frame_size= 2048
+ tp_block_nr = 4
+ tp_frame_nr = 8
+
+we will get the following buffer structure:
+
+ block #1 block #2
++---------+---------+ +---------+---------+
+| frame 1 | frame 2 | | frame 3 | frame 4 |
++---------+---------+ +---------+---------+
+
+ block #3 block #4
++---------+---------+ +---------+---------+
+| frame 5 | frame 6 | | frame 7 | frame 8 |
++---------+---------+ +---------+---------+
+
+A frame can be of any size with the only condition it can fit in a block. A block
+can only hold an integer number of frames, or in other words, a frame cannot
+be spawned accross two blocks, so there are some details you have to take into
+account when choosing the frame_size. See "Mapping and use of the circular
+buffer (ring)".
+
+
+--------------------------------------------------------------------------------
++ PACKET_MMAP setting constraints
+--------------------------------------------------------------------------------
+
+In kernel versions prior to 2.4.26 (for the 2.4 branch) and 2.6.5 (2.6 branch),
+the PACKET_MMAP buffer could hold only 32768 frames in a 32 bit architecture or
+16384 in a 64 bit architecture. For information on these kernel versions
+see http://pusa.uv.es/~ulisses/packet_mmap/packet_mmap.pre-2.4.26_2.6.5.txt
+
+ Block size limit
+------------------
+
+As stated earlier, each block is a contiguous physical region of memory. These
+memory regions are allocated with calls to the __get_free_pages() function. As
+the name indicates, this function allocates pages of memory, and the second
+argument is "order" or a power of two number of pages, that is
+(for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes,
+order=2 ==> 16384 bytes, etc. The maximum size of a
+region allocated by __get_free_pages is determined by the MAX_ORDER macro. More
+precisely the limit can be calculated as:
+
+ PAGE_SIZE << MAX_ORDER
+
+ In a i386 architecture PAGE_SIZE is 4096 bytes
+ In a 2.4/i386 kernel MAX_ORDER is 10
+ In a 2.6/i386 kernel MAX_ORDER is 11
+
+So get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel
+respectively, with an i386 architecture.
+
+User space programs can include /usr/include/sys/user.h and
+/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_ORDER declarations.
+
+The pagesize can also be determined dynamically with the getpagesize (2)
+system call.
+
+
+ Block number limit
+--------------------
+
+To understand the constraints of PACKET_MMAP, we have to see the structure
+used to hold the pointers to each block.
+
+Currently, this structure is a dynamically allocated vector with kmalloc
+called pg_vec, its size limits the number of blocks that can be allocated.
+
+ +---+---+---+---+
+ | x | x | x | x |
+ +---+---+---+---+
+ | | | |
+ | | | v
+ | | v block #4
+ | v block #3
+ v block #2
+ block #1
+
+
+kmalloc allocates any number of bytes of physically contiguous memory from
+a pool of pre-determined sizes. This pool of memory is maintained by the slab
+allocator which is at the end the responsible for doing the allocation and
+hence which imposes the maximum memory that kmalloc can allocate.
+
+In a 2.4/2.6 kernel and the i386 architecture, the limit is 131072 bytes. The
+predetermined sizes that kmalloc uses can be checked in the "size-<bytes>"
+entries of /proc/slabinfo
+
+In a 32 bit architecture, pointers are 4 bytes long, so the total number of
+pointers to blocks is
+
+ 131072/4 = 32768 blocks
+
+
+ PACKET_MMAP buffer size calculator
+------------------------------------
+
+Definitions:
+
+<size-max> : is the maximum size of allocable with kmalloc (see /proc/slabinfo)
+<pointer size>: depends on the architecture -- sizeof(void *)
+<page size> : depends on the architecture -- PAGE_SIZE or getpagesize (2)
+<max-order> : is the value defined with MAX_ORDER
+<frame size> : it's an upper bound of frame's capture size (more on this later)
+
+from these definitions we will derive
+
+ <block number> = <size-max>/<pointer size>
+ <block size> = <pagesize> << <max-order>
+
+so, the max buffer size is
+
+ <block number> * <block size>
+
+and, the number of frames be
+
+ <block number> * <block size> / <frame size>
+
+Suppose the following parameters, which apply for 2.6 kernel and an
+i386 architecture:
+
+ <size-max> = 131072 bytes
+ <pointer size> = 4 bytes
+ <pagesize> = 4096 bytes
+ <max-order> = 11
+
+and a value for <frame size> of 2048 bytes. These parameters will yield
+
+ <block number> = 131072/4 = 32768 blocks
+ <block size> = 4096 << 11 = 8 MiB.
+
+and hence the buffer will have a 262144 MiB size. So it can hold
+262144 MiB / 2048 bytes = 134217728 frames
+
+
+Actually, this buffer size is not possible with an i386 architecture.
+Remember that the memory is allocated in kernel space, in the case of
+an i386 kernel's memory size is limited to 1GiB.
+
+All memory allocations are not freed until the socket is closed. The memory
+allocations are done with GFP_KERNEL priority, this basically means that
+the allocation can wait and swap other process' memory in order to allocate
+the necessary memory, so normally limits can be reached.
+
+ Other constraints
+-------------------
+
+If you check the source code you will see that what I draw here as a frame
+is not only the link level frame. At the beginning of each frame there is a
+header called struct tpacket_hdr used in PACKET_MMAP to hold link level's frame
+meta information like timestamp. So what we draw here a frame it's really
+the following (from include/linux/if_packet.h):
+
+/*
+ Frame structure:
+
+ - Start. Frame must be aligned to TPACKET_ALIGNMENT=16
+ - struct tpacket_hdr
+ - pad to TPACKET_ALIGNMENT=16
+ - struct sockaddr_ll
+ - Gap, chosen so that packet data (Start+tp_net) aligns to
+ TPACKET_ALIGNMENT=16
+ - Start+tp_mac: [ Optional MAC header ]
+ - Start+tp_net: Packet data, aligned to TPACKET_ALIGNMENT=16.
+ - Pad to align to TPACKET_ALIGNMENT=16
+ */
+
+
+ The following are conditions that are checked in packet_set_ring
+
+ tp_block_size must be a multiple of PAGE_SIZE (1)
+ tp_frame_size must be greater than TPACKET_HDRLEN (obvious)
+ tp_frame_size must be a multiple of TPACKET_ALIGNMENT
+ tp_frame_nr must be exactly frames_per_block*tp_block_nr
+
+Note that tp_block_size should be chosen to be a power of two or there will
+be a waste of memory.
+
+--------------------------------------------------------------------------------
++ Mapping and use of the circular buffer (ring)
+--------------------------------------------------------------------------------
+
+The mapping of the buffer in the user process is done with the conventional
+mmap function. Even the circular buffer is compound of several physically
+discontiguous blocks of memory, they are contiguous to the user space, hence
+just one call to mmap is needed:
+
+ mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+
+If tp_frame_size is a divisor of tp_block_size frames will be
+contiguously spaced by tp_frame_size bytes. If not, each
+tp_block_size/tp_frame_size frames there will be a gap between
+the frames. This is because a frame cannot be spawn across two
+blocks.
+
+At the beginning of each frame there is an status field (see
+struct tpacket_hdr). If this field is 0 means that the frame is ready
+to be used for the kernel, If not, there is a frame the user can read
+and the following flags apply:
+
+ from include/linux/if_packet.h
+
+ #define TP_STATUS_COPY 2
+ #define TP_STATUS_LOSING 4
+ #define TP_STATUS_CSUMNOTREADY 8
+
+
+TP_STATUS_COPY : This flag indicates that the frame (and associated
+ meta information) has been truncated because it's
+ larger than tp_frame_size. This packet can be
+ read entirely with recvfrom().
+
+ In order to make this work it must to be
+ enabled previously with setsockopt() and
+ the PACKET_COPY_THRESH option.
+
+ The number of frames than can be buffered to
+ be read with recvfrom is limited like a normal socket.
+ See the SO_RCVBUF option in the socket (7) man page.
+
+TP_STATUS_LOSING : indicates there were packet drops from last time
+ statistics where checked with getsockopt() and
+ the PACKET_STATISTICS option.
+
+TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which
+ it's checksum will be done in hardware. So while
+ reading the packet we should not try to check the
+ checksum.
+
+for convenience there are also the following defines:
+
+ #define TP_STATUS_KERNEL 0
+ #define TP_STATUS_USER 1
+
+The kernel initializes all frames to TP_STATUS_KERNEL, when the kernel
+receives a packet it puts in the buffer and updates the status with
+at least the TP_STATUS_USER flag. Then the user can read the packet,
+once the packet is read the user must zero the status field, so the kernel
+can use again that frame buffer.
+
+The user can use poll (any other variant should apply too) to check if new
+packets are in the ring:
+
+ struct pollfd pfd;
+
+ pfd.fd = fd;
+ pfd.revents = 0;
+ pfd.events = POLLIN|POLLRDNORM|POLLERR;
+
+ if (status == TP_STATUS_KERNEL)
+ retval = poll(&pfd, 1, timeout);
+
+It doesn't incur in a race condition to first check the status value and
+then poll for frames.
+
+--------------------------------------------------------------------------------
++ THANKS
+--------------------------------------------------------------------------------
+
+ Jesse Brandeburg, for fixing my grammathical/spelling errors
+
diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt
new file mode 100644
index 0000000..6a07e45
--- /dev/null
+++ b/Documentation/networking/phonet.txt
@@ -0,0 +1,175 @@
+Linux Phonet protocol family
+============================
+
+Introduction
+------------
+
+Phonet is a packet protocol used by Nokia cellular modems for both IPC
+and RPC. With the Linux Phonet socket family, Linux host processes can
+receive and send messages from/to the modem, or any other external
+device attached to the modem. The modem takes care of routing.
+
+Phonet packets can be exchanged through various hardware connections
+depending on the device, such as:
+ - USB with the CDC Phonet interface,
+ - infrared,
+ - Bluetooth,
+ - an RS232 serial port (with a dedicated "FBUS" line discipline),
+ - the SSI bus with some TI OMAP processors.
+
+
+Packets format
+--------------
+
+Phonet packets have a common header as follows:
+
+ struct phonethdr {
+ uint8_t pn_media; /* Media type (link-layer identifier) */
+ uint8_t pn_rdev; /* Receiver device ID */
+ uint8_t pn_sdev; /* Sender device ID */
+ uint8_t pn_res; /* Resource ID or function */
+ uint16_t pn_length; /* Big-endian message byte length (minus 6) */
+ uint8_t pn_robj; /* Receiver object ID */
+ uint8_t pn_sobj; /* Sender object ID */
+ };
+
+On Linux, the link-layer header includes the pn_media byte (see below).
+The next 7 bytes are part of the network-layer header.
+
+The device ID is split: the 6 higher-order bits consitute the device
+address, while the 2 lower-order bits are used for multiplexing, as are
+the 8-bit object identifiers. As such, Phonet can be considered as a
+network layer with 6 bits of address space and 10 bits for transport
+protocol (much like port numbers in IP world).
+
+The modem always has address number zero. All other device have a their
+own 6-bit address.
+
+
+Link layer
+----------
+
+Phonet links are always point-to-point links. The link layer header
+consists of a single Phonet media type byte. It uniquely identifies the
+link through which the packet is transmitted, from the modem's
+perspective. Each Phonet network device shall prepend and set the media
+type byte as appropriate. For convenience, a common phonet_header_ops
+link-layer header operations structure is provided. It sets the
+media type according to the network device hardware address.
+
+Linux Phonet network interfaces support a dedicated link layer packets
+type (ETH_P_PHONET) which is out of the Ethernet type range. They can
+only send and receive Phonet packets.
+
+The virtual TUN tunnel device driver can also be used for Phonet. This
+requires IFF_TUN mode, _without_ the IFF_NO_PI flag. In this case,
+there is no link-layer header, so there is no Phonet media type byte.
+
+Note that Phonet interfaces are not allowed to re-order packets, so
+only the (default) Linux FIFO qdisc should be used with them.
+
+
+Network layer
+-------------
+
+The Phonet socket address family maps the Phonet packet header:
+
+ struct sockaddr_pn {
+ sa_family_t spn_family; /* AF_PHONET */
+ uint8_t spn_obj; /* Object ID */
+ uint8_t spn_dev; /* Device ID */
+ uint8_t spn_resource; /* Resource or function */
+ uint8_t spn_zero[...]; /* Padding */
+ };
+
+The resource field is only used when sending and receiving;
+It is ignored by bind() and getsockname().
+
+
+Low-level datagram protocol
+---------------------------
+
+Applications can send Phonet messages using the Phonet datagram socket
+protocol from the PF_PHONET family. Each socket is bound to one of the
+2^10 object IDs available, and can send and receive packets with any
+other peer.
+
+ struct sockaddr_pn addr = { .spn_family = AF_PHONET, };
+ ssize_t len;
+ socklen_t addrlen = sizeof(addr);
+ int fd;
+
+ fd = socket(PF_PHONET, SOCK_DGRAM, 0);
+ bind(fd, (struct sockaddr *)&addr, sizeof(addr));
+ /* ... */
+
+ sendto(fd, msg, msglen, 0, (struct sockaddr *)&addr, sizeof(addr));
+ len = recvfrom(fd, buf, sizeof(buf), 0,
+ (struct sockaddr *)&addr, &addrlen);
+
+This protocol follows the SOCK_DGRAM connection-less semantics.
+However, connect() and getpeername() are not supported, as they did
+not seem useful with Phonet usages (could be added easily).
+
+
+Phonet Pipe protocol
+--------------------
+
+The Phonet Pipe protocol is a simple sequenced packets protocol
+with end-to-end congestion control. It uses the passive listening
+socket paradigm. The listening socket is bound to an unique free object
+ID. Each listening socket can handle up to 255 simultaneous
+connections, one per accept()'d socket.
+
+ int lfd, cfd;
+
+ lfd = socket(PF_PHONET, SOCK_SEQPACKET, PN_PROTO_PIPE);
+ listen (lfd, INT_MAX);
+
+ /* ... */
+ cfd = accept(lfd, NULL, NULL);
+ for (;;)
+ {
+ char buf[...];
+ ssize_t len = read(cfd, buf, sizeof(buf));
+
+ /* ... */
+
+ write(cfd, msg, msglen);
+ }
+
+Connections are established between two endpoints by a "third party"
+application. This means that both endpoints are passive; so connect()
+is not possible.
+
+WARNING:
+When polling a connected pipe socket for writability, there is an
+intrinsic race condition whereby writability might be lost between the
+polling and the writing system calls. In this case, the socket will
+block until write becomes possible again, unless non-blocking mode
+is enabled.
+
+
+The pipe protocol provides two socket options at the SOL_PNPIPE level:
+
+ PNPIPE_ENCAP accepts one integer value (int) of:
+
+ PNPIPE_ENCAP_NONE: The socket operates normally (default).
+
+ PNPIPE_ENCAP_IP: The socket is used as a backend for a virtual IP
+ interface. This requires CAP_NET_ADMIN capability. GPRS data
+ support on Nokia modems can use this. Note that the socket cannot
+ be reliably poll()'d or read() from while in this mode.
+
+ PNPIPE_IFINDEX is a read-only integer value. It contains the
+ interface index of the network interface created by PNPIPE_ENCAP,
+ or zero if encapsulation is off.
+
+
+Authors
+-------
+
+Linux Phonet was initially written by Sakari Ailus.
+Other contributors include Mikä Liljeberg, Andras Domokos,
+Carlos Chinea and Rémi Denis-Courmont.
+Copyright (C) 2008 Nokia Corporation.
diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt
new file mode 100644
index 0000000..88bb71b
--- /dev/null
+++ b/Documentation/networking/phy.txt
@@ -0,0 +1,329 @@
+
+-------
+PHY Abstraction Layer
+(Updated 2008-04-08)
+
+Purpose
+
+ Most network devices consist of set of registers which provide an interface
+ to a MAC layer, which communicates with the physical connection through a
+ PHY. The PHY concerns itself with negotiating link parameters with the link
+ partner on the other side of the network connection (typically, an ethernet
+ cable), and provides a register interface to allow drivers to determine what
+ settings were chosen, and to configure what settings are allowed.
+
+ While these devices are distinct from the network devices, and conform to a
+ standard layout for the registers, it has been common practice to integrate
+ the PHY management code with the network driver. This has resulted in large
+ amounts of redundant code. Also, on embedded systems with multiple (and
+ sometimes quite different) ethernet controllers connected to the same
+ management bus, it is difficult to ensure safe use of the bus.
+
+ Since the PHYs are devices, and the management busses through which they are
+ accessed are, in fact, busses, the PHY Abstraction Layer treats them as such.
+ In doing so, it has these goals:
+
+ 1) Increase code-reuse
+ 2) Increase overall code-maintainability
+ 3) Speed development time for new network drivers, and for new systems
+
+ Basically, this layer is meant to provide an interface to PHY devices which
+ allows network driver writers to write as little code as possible, while
+ still providing a full feature set.
+
+The MDIO bus
+
+ Most network devices are connected to a PHY by means of a management bus.
+ Different devices use different busses (though some share common interfaces).
+ In order to take advantage of the PAL, each bus interface needs to be
+ registered as a distinct device.
+
+ 1) read and write functions must be implemented. Their prototypes are:
+
+ int write(struct mii_bus *bus, int mii_id, int regnum, u16 value);
+ int read(struct mii_bus *bus, int mii_id, int regnum);
+
+ mii_id is the address on the bus for the PHY, and regnum is the register
+ number. These functions are guaranteed not to be called from interrupt
+ time, so it is safe for them to block, waiting for an interrupt to signal
+ the operation is complete
+
+ 2) A reset function is necessary. This is used to return the bus to an
+ initialized state.
+
+ 3) A probe function is needed. This function should set up anything the bus
+ driver needs, setup the mii_bus structure, and register with the PAL using
+ mdiobus_register. Similarly, there's a remove function to undo all of
+ that (use mdiobus_unregister).
+
+ 4) Like any driver, the device_driver structure must be configured, and init
+ exit functions are used to register the driver.
+
+ 5) The bus must also be declared somewhere as a device, and registered.
+
+ As an example for how one driver implemented an mdio bus driver, see
+ drivers/net/gianfar_mii.c and arch/ppc/syslib/mpc85xx_devices.c
+
+Connecting to a PHY
+
+ Sometime during startup, the network driver needs to establish a connection
+ between the PHY device, and the network device. At this time, the PHY's bus
+ and drivers need to all have been loaded, so it is ready for the connection.
+ At this point, there are several ways to connect to the PHY:
+
+ 1) The PAL handles everything, and only calls the network driver when
+ the link state changes, so it can react.
+
+ 2) The PAL handles everything except interrupts (usually because the
+ controller has the interrupt registers).
+
+ 3) The PAL handles everything, but checks in with the driver every second,
+ allowing the network driver to react first to any changes before the PAL
+ does.
+
+ 4) The PAL serves only as a library of functions, with the network device
+ manually calling functions to update status, and configure the PHY
+
+
+Letting the PHY Abstraction Layer do Everything
+
+ If you choose option 1 (The hope is that every driver can, but to still be
+ useful to drivers that can't), connecting to the PHY is simple:
+
+ First, you need a function to react to changes in the link state. This
+ function follows this protocol:
+
+ static void adjust_link(struct net_device *dev);
+
+ Next, you need to know the device name of the PHY connected to this device.
+ The name will look something like, "0:00", where the first number is the
+ bus id, and the second is the PHY's address on that bus. Typically,
+ the bus is responsible for making its ID unique.
+
+ Now, to connect, just call this function:
+
+ phydev = phy_connect(dev, phy_name, &adjust_link, flags, interface);
+
+ phydev is a pointer to the phy_device structure which represents the PHY. If
+ phy_connect is successful, it will return the pointer. dev, here, is the
+ pointer to your net_device. Once done, this function will have started the
+ PHY's software state machine, and registered for the PHY's interrupt, if it
+ has one. The phydev structure will be populated with information about the
+ current state, though the PHY will not yet be truly operational at this
+ point.
+
+ flags is a u32 which can optionally contain phy-specific flags.
+ This is useful if the system has put hardware restrictions on
+ the PHY/controller, of which the PHY needs to be aware.
+
+ interface is a u32 which specifies the connection type used
+ between the controller and the PHY. Examples are GMII, MII,
+ RGMII, and SGMII. For a full list, see include/linux/phy.h
+
+ Now just make sure that phydev->supported and phydev->advertising have any
+ values pruned from them which don't make sense for your controller (a 10/100
+ controller may be connected to a gigabit capable PHY, so you would need to
+ mask off SUPPORTED_1000baseT*). See include/linux/ethtool.h for definitions
+ for these bitfields. Note that you should not SET any bits, or the PHY may
+ get put into an unsupported state.
+
+ Lastly, once the controller is ready to handle network traffic, you call
+ phy_start(phydev). This tells the PAL that you are ready, and configures the
+ PHY to connect to the network. If you want to handle your own interrupts,
+ just set phydev->irq to PHY_IGNORE_INTERRUPT before you call phy_start.
+ Similarly, if you don't want to use interrupts, set phydev->irq to PHY_POLL.
+
+ When you want to disconnect from the network (even if just briefly), you call
+ phy_stop(phydev).
+
+Keeping Close Tabs on the PAL
+
+ It is possible that the PAL's built-in state machine needs a little help to
+ keep your network device and the PHY properly in sync. If so, you can
+ register a helper function when connecting to the PHY, which will be called
+ every second before the state machine reacts to any changes. To do this, you
+ need to manually call phy_attach() and phy_prepare_link(), and then call
+ phy_start_machine() with the second argument set to point to your special
+ handler.
+
+ Currently there are no examples of how to use this functionality, and testing
+ on it has been limited because the author does not have any drivers which use
+ it (they all use option 1). So Caveat Emptor.
+
+Doing it all yourself
+
+ There's a remote chance that the PAL's built-in state machine cannot track
+ the complex interactions between the PHY and your network device. If this is
+ so, you can simply call phy_attach(), and not call phy_start_machine or
+ phy_prepare_link(). This will mean that phydev->state is entirely yours to
+ handle (phy_start and phy_stop toggle between some of the states, so you
+ might need to avoid them).
+
+ An effort has been made to make sure that useful functionality can be
+ accessed without the state-machine running, and most of these functions are
+ descended from functions which did not interact with a complex state-machine.
+ However, again, no effort has been made so far to test running without the
+ state machine, so tryer beware.
+
+ Here is a brief rundown of the functions:
+
+ int phy_read(struct phy_device *phydev, u16 regnum);
+ int phy_write(struct phy_device *phydev, u16 regnum, u16 val);
+
+ Simple read/write primitives. They invoke the bus's read/write function
+ pointers.
+
+ void phy_print_status(struct phy_device *phydev);
+
+ A convenience function to print out the PHY status neatly.
+
+ int phy_clear_interrupt(struct phy_device *phydev);
+ int phy_config_interrupt(struct phy_device *phydev, u32 interrupts);
+
+ Clear the PHY's interrupt, and configure which ones are allowed,
+ respectively. Currently only supports all on, or all off.
+
+ int phy_enable_interrupts(struct phy_device *phydev);
+ int phy_disable_interrupts(struct phy_device *phydev);
+
+ Functions which enable/disable PHY interrupts, clearing them
+ before and after, respectively.
+
+ int phy_start_interrupts(struct phy_device *phydev);
+ int phy_stop_interrupts(struct phy_device *phydev);
+
+ Requests the IRQ for the PHY interrupts, then enables them for
+ start, or disables then frees them for stop.
+
+ struct phy_device * phy_attach(struct net_device *dev, const char *phy_id,
+ u32 flags, phy_interface_t interface);
+
+ Attaches a network device to a particular PHY, binding the PHY to a generic
+ driver if none was found during bus initialization. Passes in
+ any phy-specific flags as needed.
+
+ int phy_start_aneg(struct phy_device *phydev);
+
+ Using variables inside the phydev structure, either configures advertising
+ and resets autonegotiation, or disables autonegotiation, and configures
+ forced settings.
+
+ static inline int phy_read_status(struct phy_device *phydev);
+
+ Fills the phydev structure with up-to-date information about the current
+ settings in the PHY.
+
+ void phy_sanitize_settings(struct phy_device *phydev)
+
+ Resolves differences between currently desired settings, and
+ supported settings for the given PHY device. Does not make
+ the changes in the hardware, though.
+
+ int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
+ int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd);
+
+ Ethtool convenience functions.
+
+ int phy_mii_ioctl(struct phy_device *phydev,
+ struct mii_ioctl_data *mii_data, int cmd);
+
+ The MII ioctl. Note that this function will completely screw up the state
+ machine if you write registers like BMCR, BMSR, ADVERTISE, etc. Best to
+ use this only to write registers which are not standard, and don't set off
+ a renegotiation.
+
+
+PHY Device Drivers
+
+ With the PHY Abstraction Layer, adding support for new PHYs is
+ quite easy. In some cases, no work is required at all! However,
+ many PHYs require a little hand-holding to get up-and-running.
+
+Generic PHY driver
+
+ If the desired PHY doesn't have any errata, quirks, or special
+ features you want to support, then it may be best to not add
+ support, and let the PHY Abstraction Layer's Generic PHY Driver
+ do all of the work.
+
+Writing a PHY driver
+
+ If you do need to write a PHY driver, the first thing to do is
+ make sure it can be matched with an appropriate PHY device.
+ This is done during bus initialization by reading the device's
+ UID (stored in registers 2 and 3), then comparing it to each
+ driver's phy_id field by ANDing it with each driver's
+ phy_id_mask field. Also, it needs a name. Here's an example:
+
+ static struct phy_driver dm9161_driver = {
+ .phy_id = 0x0181b880,
+ .name = "Davicom DM9161E",
+ .phy_id_mask = 0x0ffffff0,
+ ...
+ }
+
+ Next, you need to specify what features (speed, duplex, autoneg,
+ etc) your PHY device and driver support. Most PHYs support
+ PHY_BASIC_FEATURES, but you can look in include/mii.h for other
+ features.
+
+ Each driver consists of a number of function pointers:
+
+ config_init: configures PHY into a sane state after a reset.
+ For instance, a Davicom PHY requires descrambling disabled.
+ probe: Does any setup needed by the driver
+ suspend/resume: power management
+ config_aneg: Changes the speed/duplex/negotiation settings
+ read_status: Reads the current speed/duplex/negotiation settings
+ ack_interrupt: Clear a pending interrupt
+ config_intr: Enable or disable interrupts
+ remove: Does any driver take-down
+
+ Of these, only config_aneg and read_status are required to be
+ assigned by the driver code. The rest are optional. Also, it is
+ preferred to use the generic phy driver's versions of these two
+ functions if at all possible: genphy_read_status and
+ genphy_config_aneg. If this is not possible, it is likely that
+ you only need to perform some actions before and after invoking
+ these functions, and so your functions will wrap the generic
+ ones.
+
+ Feel free to look at the Marvell, Cicada, and Davicom drivers in
+ drivers/net/phy/ for examples (the lxt and qsemi drivers have
+ not been tested as of this writing)
+
+Board Fixups
+
+ Sometimes the specific interaction between the platform and the PHY requires
+ special handling. For instance, to change where the PHY's clock input is,
+ or to add a delay to account for latency issues in the data path. In order
+ to support such contingencies, the PHY Layer allows platform code to register
+ fixups to be run when the PHY is brought up (or subsequently reset).
+
+ When the PHY Layer brings up a PHY it checks to see if there are any fixups
+ registered for it, matching based on UID (contained in the PHY device's phy_id
+ field) and the bus identifier (contained in phydev->dev.bus_id). Both must
+ match, however two constants, PHY_ANY_ID and PHY_ANY_UID, are provided as
+ wildcards for the bus ID and UID, respectively.
+
+ When a match is found, the PHY layer will invoke the run function associated
+ with the fixup. This function is passed a pointer to the phy_device of
+ interest. It should therefore only operate on that PHY.
+
+ The platform code can either register the fixup using phy_register_fixup():
+
+ int phy_register_fixup(const char *phy_id,
+ u32 phy_uid, u32 phy_uid_mask,
+ int (*run)(struct phy_device *));
+
+ Or using one of the two stubs, phy_register_fixup_for_uid() and
+ phy_register_fixup_for_id():
+
+ int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask,
+ int (*run)(struct phy_device *));
+ int phy_register_fixup_for_id(const char *phy_id,
+ int (*run)(struct phy_device *));
+
+ The stubs set one of the two matching criteria, and set the other one to
+ match anything.
+
diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt
new file mode 100644
index 0000000..c6cf4a3
--- /dev/null
+++ b/Documentation/networking/pktgen.txt
@@ -0,0 +1,248 @@
+
+
+ HOWTO for the linux packet generator
+ ------------------------------------
+
+Date: 041221
+
+Enable CONFIG_NET_PKTGEN to compile and build pktgen.o either in kernel
+or as module. Module is preferred. insmod pktgen if needed. Once running
+pktgen creates a thread on each CPU where each thread has affinity to its CPU.
+Monitoring and controlling is done via /proc. Easiest to select a suitable
+a sample script and configure.
+
+On a dual CPU:
+
+ps aux | grep pkt
+root 129 0.3 0.0 0 0 ? SW 2003 523:20 [pktgen/0]
+root 130 0.3 0.0 0 0 ? SW 2003 509:50 [pktgen/1]
+
+
+For monitoring and control pktgen creates:
+ /proc/net/pktgen/pgctrl
+ /proc/net/pktgen/kpktgend_X
+ /proc/net/pktgen/ethX
+
+
+Viewing threads
+===============
+/proc/net/pktgen/kpktgend_0
+Name: kpktgend_0 max_before_softirq: 10000
+Running:
+Stopped: eth1
+Result: OK: max_before_softirq=10000
+
+Most important the devices assigned to thread. Note! A device can only belong
+to one thread.
+
+
+Viewing devices
+===============
+
+Parm section holds configured info. Current hold running stats.
+Result is printed after run or after interruption. Example:
+
+/proc/net/pktgen/eth1
+
+Params: count 10000000 min_pkt_size: 60 max_pkt_size: 60
+ frags: 0 delay: 0 clone_skb: 1000000 ifname: eth1
+ flows: 0 flowlen: 0
+ dst_min: 10.10.11.2 dst_max:
+ src_min: src_max:
+ src_mac: 00:00:00:00:00:00 dst_mac: 00:04:23:AC:FD:82
+ udp_src_min: 9 udp_src_max: 9 udp_dst_min: 9 udp_dst_max: 9
+ src_mac_count: 0 dst_mac_count: 0
+ Flags:
+Current:
+ pkts-sofar: 10000000 errors: 39664
+ started: 1103053986245187us stopped: 1103053999346329us idle: 880401us
+ seq_num: 10000011 cur_dst_mac_offset: 0 cur_src_mac_offset: 0
+ cur_saddr: 0x10a0a0a cur_daddr: 0x20b0a0a
+ cur_udp_dst: 9 cur_udp_src: 9
+ flows: 0
+Result: OK: 13101142(c12220741+d880401) usec, 10000000 (60byte,0frags)
+ 763292pps 390Mb/sec (390805504bps) errors: 39664
+
+Configuring threads and devices
+================================
+This is done via the /proc interface easiest done via pgset in the scripts
+
+Examples:
+
+ pgset "clone_skb 1" sets the number of copies of the same packet
+ pgset "clone_skb 0" use single SKB for all transmits
+ pgset "pkt_size 9014" sets packet size to 9014
+ pgset "frags 5" packet will consist of 5 fragments
+ pgset "count 200000" sets number of packets to send, set to zero
+ for continuous sends until explicitly stopped.
+
+ pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds
+
+ pgset "dst 10.0.0.1" sets IP destination address
+ (BEWARE! This generator is very aggressive!)
+
+ pgset "dst_min 10.0.0.1" Same as dst
+ pgset "dst_max 10.0.0.254" Set the maximum destination IP.
+ pgset "src_min 10.0.0.1" Set the minimum (or only) source IP.
+ pgset "src_max 10.0.0.254" Set the maximum source IP.
+ pgset "dst6 fec0::1" IPV6 destination address
+ pgset "src6 fec0::2" IPV6 source address
+ pgset "dstmac 00:00:00:00:00:00" sets MAC destination address
+ pgset "srcmac 00:00:00:00:00:00" sets MAC source address
+
+ pgset "src_mac_count 1" Sets the number of MACs we'll range through.
+ The 'minimum' MAC is what you set with srcmac.
+
+ pgset "dst_mac_count 1" Sets the number of MACs we'll range through.
+ The 'minimum' MAC is what you set with dstmac.
+
+ pgset "flag [name]" Set a flag to determine behaviour. Current flags
+ are: IPSRC_RND #IP Source is random (between min/max),
+ IPDST_RND, UDPSRC_RND,
+ UDPDST_RND, MACSRC_RND, MACDST_RND
+ MPLS_RND, VID_RND, SVID_RND
+
+ pgset "udp_src_min 9" set UDP source port min, If < udp_src_max, then
+ cycle through the port range.
+
+ pgset "udp_src_max 9" set UDP source port max.
+ pgset "udp_dst_min 9" set UDP destination port min, If < udp_dst_max, then
+ cycle through the port range.
+ pgset "udp_dst_max 9" set UDP destination port max.
+
+ pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example
+ outer label=16,middle label=32,
+ inner label=0 (IPv4 NULL)) Note that
+ there must be no spaces between the
+ arguments. Leading zeros are required.
+ Do not set the bottom of stack bit,
+ that's done automatically. If you do
+ set the bottom of stack bit, that
+ indicates that you want to randomly
+ generate that address and the flag
+ MPLS_RND will be turned on. You
+ can have any mix of random and fixed
+ labels in the label stack.
+
+ pgset "mpls 0" turn off mpls (or any invalid argument works too!)
+
+ pgset "vlan_id 77" set VLAN ID 0-4095
+ pgset "vlan_p 3" set priority bit 0-7 (default 0)
+ pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0)
+
+ pgset "svlan_id 22" set SVLAN ID 0-4095
+ pgset "svlan_p 3" set priority bit 0-7 (default 0)
+ pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0)
+
+ pgset "vlan_id 9999" > 4095 remove vlan and svlan tags
+ pgset "svlan 9999" > 4095 remove svlan tag
+
+
+ pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00)
+ pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00)
+
+ pgset stop aborts injection. Also, ^C aborts generator.
+
+
+Example scripts
+===============
+
+A collection of small tutorial scripts for pktgen is in examples dir.
+
+pktgen.conf-1-1 # 1 CPU 1 dev
+pktgen.conf-1-2 # 1 CPU 2 dev
+pktgen.conf-2-1 # 2 CPU's 1 dev
+pktgen.conf-2-2 # 2 CPU's 2 dev
+pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS
+pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6
+pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS
+pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows.
+
+Run in shell: ./pktgen.conf-X-Y It does all the setup including sending.
+
+
+Interrupt affinity
+===================
+Note when adding devices to a specific CPU there good idea to also assign
+/proc/irq/XX/smp_affinity so the TX-interrupts gets bound to the same CPU.
+as this reduces cache bouncing when freeing skb's.
+
+
+Current commands and configuration options
+==========================================
+
+** Pgcontrol commands:
+
+start
+stop
+
+** Thread commands:
+
+add_device
+rem_device_all
+max_before_softirq
+
+
+** Device commands:
+
+count
+clone_skb
+debug
+
+frags
+delay
+
+src_mac_count
+dst_mac_count
+
+pkt_size
+min_pkt_size
+max_pkt_size
+
+mpls
+
+udp_src_min
+udp_src_max
+
+udp_dst_min
+udp_dst_max
+
+flag
+ IPSRC_RND
+ TXSIZE_RND
+ IPDST_RND
+ UDPSRC_RND
+ UDPDST_RND
+ MACSRC_RND
+ MACDST_RND
+
+dst_min
+dst_max
+
+src_min
+src_max
+
+dst_mac
+src_mac
+
+clear_counters
+
+dst6
+src6
+
+flows
+flowlen
+
+References:
+ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/
+ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/
+
+Paper from Linux-Kongress in Erlangen 2004.
+ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf
+
+Thanks to:
+Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek
+Stephen Hemminger, Andi Kleen, Dave Miller and many others.
+
+
+Good luck with the linux net-development.
diff --git a/Documentation/networking/policy-routing.txt b/Documentation/networking/policy-routing.txt
new file mode 100644
index 0000000..36f6936
--- /dev/null
+++ b/Documentation/networking/policy-routing.txt
@@ -0,0 +1,150 @@
+Classes
+-------
+
+ "Class" is a complete routing table in common sense.
+ I.e. it is tree of nodes (destination prefix, tos, metric)
+ with attached information: gateway, device etc.
+ This tree is looked up as specified in RFC1812 5.2.4.3
+ 1. Basic match
+ 2. Longest match
+ 3. Weak TOS.
+ 4. Metric. (should not be in kernel space, but they are)
+ 5. Additional pruning rules. (not in kernel space).
+
+ We have two special type of nodes:
+ REJECT - abort route lookup and return an error value.
+ THROW - abort route lookup in this class.
+
+
+ Currently the number of classes is limited to 255
+ (0 is reserved for "not specified class")
+
+ Three classes are builtin:
+
+ RT_CLASS_LOCAL=255 - local interface addresses,
+ broadcasts, nat addresses.
+
+ RT_CLASS_MAIN=254 - all normal routes are put there
+ by default.
+
+ RT_CLASS_DEFAULT=253 - if ip_fib_model==1, then
+ normal default routes are put there, if ip_fib_model==2
+ all gateway routes are put there.
+
+
+Rules
+-----
+ Rule is a record of (src prefix, src interface, tos, dst prefix)
+ with attached information.
+
+ Rule types:
+ RTP_ROUTE - lookup in attached class
+ RTP_NAT - lookup in attached class and if a match is found,
+ translate packet source address.
+ RTP_MASQUERADE - lookup in attached class and if a match is found,
+ masquerade packet as sourced by us.
+ RTP_DROP - silently drop the packet.
+ RTP_REJECT - drop the packet and send ICMP NET UNREACHABLE.
+ RTP_PROHIBIT - drop the packet and send ICMP COMM. ADM. PROHIBITED.
+
+ Rule flags:
+ RTRF_LOG - log route creations.
+ RTRF_VALVE - One way route (used with masquerading)
+
+Default setup:
+
+root@amber:/pub/ip-routing # iproute -r
+Kernel routing policy rules
+Pref Source Destination TOS Iface Cl
+ 0 default default 00 * 255
+ 254 default default 00 * 254
+ 255 default default 00 * 253
+
+
+Lookup algorithm
+----------------
+
+ We scan rules list, and if a rule is matched, apply it.
+ If a route is found, return it.
+ If it is not found or a THROW node was matched, continue
+ to scan rules.
+
+Applications
+------------
+
+1. Just ignore classes. All the routes are put into MAIN class
+ (and/or into DEFAULT class).
+
+ HOWTO: iproute add PREFIX [ tos TOS ] [ gw GW ] [ dev DEV ]
+ [ metric METRIC ] [ reject ] ... (look at iproute utility)
+
+ or use route utility from current net-tools.
+
+2. Opposite case. Just forget all that you know about routing
+ tables. Every rule is supplied with its own gateway, device
+ info. record. This approach is not appropriate for automated
+ route maintenance, but it is ideal for manual configuration.
+
+ HOWTO: iproute addrule [ from PREFIX ] [ to PREFIX ] [ tos TOS ]
+ [ dev INPUTDEV] [ pref PREFERENCE ] route [ gw GATEWAY ]
+ [ dev OUTDEV ] .....
+
+ Warning: As of now the size of the routing table in this
+ approach is limited to 256. If someone likes this model, I'll
+ relax this limitation.
+
+3. OSPF classes (see RFC1583, RFC1812 E.3.3)
+ Very clean, stable and robust algorithm for OSPF routing
+ domains. Unfortunately, it is not widely used in the Internet.
+
+ Proposed setup:
+ 255 local addresses
+ 254 interface routes
+ 253 ASE routes with external metric
+ 252 ASE routes with internal metric
+ 251 inter-area routes
+ 250 intra-area routes for 1st area
+ 249 intra-area routes for 2nd area
+ etc.
+
+ Rules:
+ iproute addrule class 253
+ iproute addrule class 252
+ iproute addrule class 251
+ iproute addrule to a-prefix-for-1st-area class 250
+ iproute addrule to another-prefix-for-1st-area class 250
+ ...
+ iproute addrule to a-prefix-for-2nd-area class 249
+ ...
+
+ Area classes must be terminated with reject record.
+ iproute add default reject class 250
+ iproute add default reject class 249
+ ...
+
+4. The Variant Router Requirements Algorithm (RFC1812 E.3.2)
+ Create 16 classes for different TOS values.
+ It is a funny, but pretty useless algorithm.
+ I listed it just to show the power of new routing code.
+
+5. All the variety of combinations......
+
+
+GATED
+-----
+
+ Gated does not understand classes, but it will work
+ happily in MAIN+DEFAULT. All policy routes can be set
+ and maintained manually.
+
+IMPORTANT NOTE
+--------------
+ route.c has a compilation time switch CONFIG_IP_LOCAL_RT_POLICY.
+ If it is set, locally originated packets are routed
+ using all the policy list. This is not very convenient and
+ pretty ambiguous when used with NAT and masquerading.
+ I set it to FALSE by default.
+
+
+Alexey Kuznetov
+kuznet@ms2.inr.ac.ru
diff --git a/Documentation/networking/ppp_generic.txt b/Documentation/networking/ppp_generic.txt
new file mode 100644
index 0000000..15b5172
--- /dev/null
+++ b/Documentation/networking/ppp_generic.txt
@@ -0,0 +1,432 @@
+ PPP Generic Driver and Channel Interface
+ ----------------------------------------
+
+ Paul Mackerras
+ paulus@samba.org
+ 7 Feb 2002
+
+The generic PPP driver in linux-2.4 provides an implementation of the
+functionality which is of use in any PPP implementation, including:
+
+* the network interface unit (ppp0 etc.)
+* the interface to the networking code
+* PPP multilink: splitting datagrams between multiple links, and
+ ordering and combining received fragments
+* the interface to pppd, via a /dev/ppp character device
+* packet compression and decompression
+* TCP/IP header compression and decompression
+* detecting network traffic for demand dialling and for idle timeouts
+* simple packet filtering
+
+For sending and receiving PPP frames, the generic PPP driver calls on
+the services of PPP `channels'. A PPP channel encapsulates a
+mechanism for transporting PPP frames from one machine to another. A
+PPP channel implementation can be arbitrarily complex internally but
+has a very simple interface with the generic PPP code: it merely has
+to be able to send PPP frames, receive PPP frames, and optionally
+handle ioctl requests. Currently there are PPP channel
+implementations for asynchronous serial ports, synchronous serial
+ports, and for PPP over ethernet.
+
+This architecture makes it possible to implement PPP multilink in a
+natural and straightforward way, by allowing more than one channel to
+be linked to each ppp network interface unit. The generic layer is
+responsible for splitting datagrams on transmit and recombining them
+on receive.
+
+
+PPP channel API
+---------------
+
+See include/linux/ppp_channel.h for the declaration of the types and
+functions used to communicate between the generic PPP layer and PPP
+channels.
+
+Each channel has to provide two functions to the generic PPP layer,
+via the ppp_channel.ops pointer:
+
+* start_xmit() is called by the generic layer when it has a frame to
+ send. The channel has the option of rejecting the frame for
+ flow-control reasons. In this case, start_xmit() should return 0
+ and the channel should call the ppp_output_wakeup() function at a
+ later time when it can accept frames again, and the generic layer
+ will then attempt to retransmit the rejected frame(s). If the frame
+ is accepted, the start_xmit() function should return 1.
+
+* ioctl() provides an interface which can be used by a user-space
+ program to control aspects of the channel's behaviour. This
+ procedure will be called when a user-space program does an ioctl
+ system call on an instance of /dev/ppp which is bound to the
+ channel. (Usually it would only be pppd which would do this.)
+
+The generic PPP layer provides seven functions to channels:
+
+* ppp_register_channel() is called when a channel has been created, to
+ notify the PPP generic layer of its presence. For example, setting
+ a serial port to the PPPDISC line discipline causes the ppp_async
+ channel code to call this function.
+
+* ppp_unregister_channel() is called when a channel is to be
+ destroyed. For example, the ppp_async channel code calls this when
+ a hangup is detected on the serial port.
+
+* ppp_output_wakeup() is called by a channel when it has previously
+ rejected a call to its start_xmit function, and can now accept more
+ packets.
+
+* ppp_input() is called by a channel when it has received a complete
+ PPP frame.
+
+* ppp_input_error() is called by a channel when it has detected that a
+ frame has been lost or dropped (for example, because of a FCS (frame
+ check sequence) error).
+
+* ppp_channel_index() returns the channel index assigned by the PPP
+ generic layer to this channel. The channel should provide some way
+ (e.g. an ioctl) to transmit this back to user-space, as user-space
+ will need it to attach an instance of /dev/ppp to this channel.
+
+* ppp_unit_number() returns the unit number of the ppp network
+ interface to which this channel is connected, or -1 if the channel
+ is not connected.
+
+Connecting a channel to the ppp generic layer is initiated from the
+channel code, rather than from the generic layer. The channel is
+expected to have some way for a user-level process to control it
+independently of the ppp generic layer. For example, with the
+ppp_async channel, this is provided by the file descriptor to the
+serial port.
+
+Generally a user-level process will initialize the underlying
+communications medium and prepare it to do PPP. For example, with an
+async tty, this can involve setting the tty speed and modes, issuing
+modem commands, and then going through some sort of dialog with the
+remote system to invoke PPP service there. We refer to this process
+as `discovery'. Then the user-level process tells the medium to
+become a PPP channel and register itself with the generic PPP layer.
+The channel then has to report the channel number assigned to it back
+to the user-level process. From that point, the PPP negotiation code
+in the PPP daemon (pppd) can take over and perform the PPP
+negotiation, accessing the channel through the /dev/ppp interface.
+
+At the interface to the PPP generic layer, PPP frames are stored in
+skbuff structures and start with the two-byte PPP protocol number.
+The frame does *not* include the 0xff `address' byte or the 0x03
+`control' byte that are optionally used in async PPP. Nor is there
+any escaping of control characters, nor are there any FCS or framing
+characters included. That is all the responsibility of the channel
+code, if it is needed for the particular medium. That is, the skbuffs
+presented to the start_xmit() function contain only the 2-byte
+protocol number and the data, and the skbuffs presented to ppp_input()
+must be in the same format.
+
+The channel must provide an instance of a ppp_channel struct to
+represent the channel. The channel is free to use the `private' field
+however it wishes. The channel should initialize the `mtu' and
+`hdrlen' fields before calling ppp_register_channel() and not change
+them until after ppp_unregister_channel() returns. The `mtu' field
+represents the maximum size of the data part of the PPP frames, that
+is, it does not include the 2-byte protocol number.
+
+If the channel needs some headroom in the skbuffs presented to it for
+transmission (i.e., some space free in the skbuff data area before the
+start of the PPP frame), it should set the `hdrlen' field of the
+ppp_channel struct to the amount of headroom required. The generic
+PPP layer will attempt to provide that much headroom but the channel
+should still check if there is sufficient headroom and copy the skbuff
+if there isn't.
+
+On the input side, channels should ideally provide at least 2 bytes of
+headroom in the skbuffs presented to ppp_input(). The generic PPP
+code does not require this but will be more efficient if this is done.
+
+
+Buffering and flow control
+--------------------------
+
+The generic PPP layer has been designed to minimize the amount of data
+that it buffers in the transmit direction. It maintains a queue of
+transmit packets for the PPP unit (network interface device) plus a
+queue of transmit packets for each attached channel. Normally the
+transmit queue for the unit will contain at most one packet; the
+exceptions are when pppd sends packets by writing to /dev/ppp, and
+when the core networking code calls the generic layer's start_xmit()
+function with the queue stopped, i.e. when the generic layer has
+called netif_stop_queue(), which only happens on a transmit timeout.
+The start_xmit function always accepts and queues the packet which it
+is asked to transmit.
+
+Transmit packets are dequeued from the PPP unit transmit queue and
+then subjected to TCP/IP header compression and packet compression
+(Deflate or BSD-Compress compression), as appropriate. After this
+point the packets can no longer be reordered, as the decompression
+algorithms rely on receiving compressed packets in the same order that
+they were generated.
+
+If multilink is not in use, this packet is then passed to the attached
+channel's start_xmit() function. If the channel refuses to take
+the packet, the generic layer saves it for later transmission. The
+generic layer will call the channel's start_xmit() function again
+when the channel calls ppp_output_wakeup() or when the core
+networking code calls the generic layer's start_xmit() function
+again. The generic layer contains no timeout and retransmission
+logic; it relies on the core networking code for that.
+
+If multilink is in use, the generic layer divides the packet into one
+or more fragments and puts a multilink header on each fragment. It
+decides how many fragments to use based on the length of the packet
+and the number of channels which are potentially able to accept a
+fragment at the moment. A channel is potentially able to accept a
+fragment if it doesn't have any fragments currently queued up for it
+to transmit. The channel may still refuse a fragment; in this case
+the fragment is queued up for the channel to transmit later. This
+scheme has the effect that more fragments are given to higher-
+bandwidth channels. It also means that under light load, the generic
+layer will tend to fragment large packets across all the channels,
+thus reducing latency, while under heavy load, packets will tend to be
+transmitted as single fragments, thus reducing the overhead of
+fragmentation.
+
+
+SMP safety
+----------
+
+The PPP generic layer has been designed to be SMP-safe. Locks are
+used around accesses to the internal data structures where necessary
+to ensure their integrity. As part of this, the generic layer
+requires that the channels adhere to certain requirements and in turn
+provides certain guarantees to the channels. Essentially the channels
+are required to provide the appropriate locking on the ppp_channel
+structures that form the basis of the communication between the
+channel and the generic layer. This is because the channel provides
+the storage for the ppp_channel structure, and so the channel is
+required to provide the guarantee that this storage exists and is
+valid at the appropriate times.
+
+The generic layer requires these guarantees from the channel:
+
+* The ppp_channel object must exist from the time that
+ ppp_register_channel() is called until after the call to
+ ppp_unregister_channel() returns.
+
+* No thread may be in a call to any of ppp_input(), ppp_input_error(),
+ ppp_output_wakeup(), ppp_channel_index() or ppp_unit_number() for a
+ channel at the time that ppp_unregister_channel() is called for that
+ channel.
+
+* ppp_register_channel() and ppp_unregister_channel() must be called
+ from process context, not interrupt or softirq/BH context.
+
+* The remaining generic layer functions may be called at softirq/BH
+ level but must not be called from a hardware interrupt handler.
+
+* The generic layer may call the channel start_xmit() function at
+ softirq/BH level but will not call it at interrupt level. Thus the
+ start_xmit() function may not block.
+
+* The generic layer will only call the channel ioctl() function in
+ process context.
+
+The generic layer provides these guarantees to the channels:
+
+* The generic layer will not call the start_xmit() function for a
+ channel while any thread is already executing in that function for
+ that channel.
+
+* The generic layer will not call the ioctl() function for a channel
+ while any thread is already executing in that function for that
+ channel.
+
+* By the time a call to ppp_unregister_channel() returns, no thread
+ will be executing in a call from the generic layer to that channel's
+ start_xmit() or ioctl() function, and the generic layer will not
+ call either of those functions subsequently.
+
+
+Interface to pppd
+-----------------
+
+The PPP generic layer exports a character device interface called
+/dev/ppp. This is used by pppd to control PPP interface units and
+channels. Although there is only one /dev/ppp, each open instance of
+/dev/ppp acts independently and can be attached either to a PPP unit
+or a PPP channel. This is achieved using the file->private_data field
+to point to a separate object for each open instance of /dev/ppp. In
+this way an effect similar to Solaris' clone open is obtained,
+allowing us to control an arbitrary number of PPP interfaces and
+channels without having to fill up /dev with hundreds of device names.
+
+When /dev/ppp is opened, a new instance is created which is initially
+unattached. Using an ioctl call, it can then be attached to an
+existing unit, attached to a newly-created unit, or attached to an
+existing channel. An instance attached to a unit can be used to send
+and receive PPP control frames, using the read() and write() system
+calls, along with poll() if necessary. Similarly, an instance
+attached to a channel can be used to send and receive PPP frames on
+that channel.
+
+In multilink terms, the unit represents the bundle, while the channels
+represent the individual physical links. Thus, a PPP frame sent by a
+write to the unit (i.e., to an instance of /dev/ppp attached to the
+unit) will be subject to bundle-level compression and to fragmentation
+across the individual links (if multilink is in use). In contrast, a
+PPP frame sent by a write to the channel will be sent as-is on that
+channel, without any multilink header.
+
+A channel is not initially attached to any unit. In this state it can
+be used for PPP negotiation but not for the transfer of data packets.
+It can then be connected to a PPP unit with an ioctl call, which
+makes it available to send and receive data packets for that unit.
+
+The ioctl calls which are available on an instance of /dev/ppp depend
+on whether it is unattached, attached to a PPP interface, or attached
+to a PPP channel. The ioctl calls which are available on an
+unattached instance are:
+
+* PPPIOCNEWUNIT creates a new PPP interface and makes this /dev/ppp
+ instance the "owner" of the interface. The argument should point to
+ an int which is the desired unit number if >= 0, or -1 to assign the
+ lowest unused unit number. Being the owner of the interface means
+ that the interface will be shut down if this instance of /dev/ppp is
+ closed.
+
+* PPPIOCATTACH attaches this instance to an existing PPP interface.
+ The argument should point to an int containing the unit number.
+ This does not make this instance the owner of the PPP interface.
+
+* PPPIOCATTCHAN attaches this instance to an existing PPP channel.
+ The argument should point to an int containing the channel number.
+
+The ioctl calls available on an instance of /dev/ppp attached to a
+channel are:
+
+* PPPIOCDETACH detaches the instance from the channel. This ioctl is
+ deprecated since the same effect can be achieved by closing the
+ instance. In order to prevent possible races this ioctl will fail
+ with an EINVAL error if more than one file descriptor refers to this
+ instance (i.e. as a result of dup(), dup2() or fork()).
+
+* PPPIOCCONNECT connects this channel to a PPP interface. The
+ argument should point to an int containing the interface unit
+ number. It will return an EINVAL error if the channel is already
+ connected to an interface, or ENXIO if the requested interface does
+ not exist.
+
+* PPPIOCDISCONN disconnects this channel from the PPP interface that
+ it is connected to. It will return an EINVAL error if the channel
+ is not connected to an interface.
+
+* All other ioctl commands are passed to the channel ioctl() function.
+
+The ioctl calls that are available on an instance that is attached to
+an interface unit are:
+
+* PPPIOCSMRU sets the MRU (maximum receive unit) for the interface.
+ The argument should point to an int containing the new MRU value.
+
+* PPPIOCSFLAGS sets flags which control the operation of the
+ interface. The argument should be a pointer to an int containing
+ the new flags value. The bits in the flags value that can be set
+ are:
+ SC_COMP_TCP enable transmit TCP header compression
+ SC_NO_TCP_CCID disable connection-id compression for
+ TCP header compression
+ SC_REJ_COMP_TCP disable receive TCP header decompression
+ SC_CCP_OPEN Compression Control Protocol (CCP) is
+ open, so inspect CCP packets
+ SC_CCP_UP CCP is up, may (de)compress packets
+ SC_LOOP_TRAFFIC send IP traffic to pppd
+ SC_MULTILINK enable PPP multilink fragmentation on
+ transmitted packets
+ SC_MP_SHORTSEQ expect short multilink sequence
+ numbers on received multilink fragments
+ SC_MP_XSHORTSEQ transmit short multilink sequence nos.
+
+ The values of these flags are defined in <linux/if_ppp.h>. Note
+ that the values of the SC_MULTILINK, SC_MP_SHORTSEQ and
+ SC_MP_XSHORTSEQ bits are ignored if the CONFIG_PPP_MULTILINK option
+ is not selected.
+
+* PPPIOCGFLAGS returns the value of the status/control flags for the
+ interface unit. The argument should point to an int where the ioctl
+ will store the flags value. As well as the values listed above for
+ PPPIOCSFLAGS, the following bits may be set in the returned value:
+ SC_COMP_RUN CCP compressor is running
+ SC_DECOMP_RUN CCP decompressor is running
+ SC_DC_ERROR CCP decompressor detected non-fatal error
+ SC_DC_FERROR CCP decompressor detected fatal error
+
+* PPPIOCSCOMPRESS sets the parameters for packet compression or
+ decompression. The argument should point to a ppp_option_data
+ structure (defined in <linux/if_ppp.h>), which contains a
+ pointer/length pair which should describe a block of memory
+ containing a CCP option specifying a compression method and its
+ parameters. The ppp_option_data struct also contains a `transmit'
+ field. If this is 0, the ioctl will affect the receive path,
+ otherwise the transmit path.
+
+* PPPIOCGUNIT returns, in the int pointed to by the argument, the unit
+ number of this interface unit.
+
+* PPPIOCSDEBUG sets the debug flags for the interface to the value in
+ the int pointed to by the argument. Only the least significant bit
+ is used; if this is 1 the generic layer will print some debug
+ messages during its operation. This is only intended for debugging
+ the generic PPP layer code; it is generally not helpful for working
+ out why a PPP connection is failing.
+
+* PPPIOCGDEBUG returns the debug flags for the interface in the int
+ pointed to by the argument.
+
+* PPPIOCGIDLE returns the time, in seconds, since the last data
+ packets were sent and received. The argument should point to a
+ ppp_idle structure (defined in <linux/ppp_defs.h>). If the
+ CONFIG_PPP_FILTER option is enabled, the set of packets which reset
+ the transmit and receive idle timers is restricted to those which
+ pass the `active' packet filter.
+
+* PPPIOCSMAXCID sets the maximum connection-ID parameter (and thus the
+ number of connection slots) for the TCP header compressor and
+ decompressor. The lower 16 bits of the int pointed to by the
+ argument specify the maximum connection-ID for the compressor. If
+ the upper 16 bits of that int are non-zero, they specify the maximum
+ connection-ID for the decompressor, otherwise the decompressor's
+ maximum connection-ID is set to 15.
+
+* PPPIOCSNPMODE sets the network-protocol mode for a given network
+ protocol. The argument should point to an npioctl struct (defined
+ in <linux/if_ppp.h>). The `protocol' field gives the PPP protocol
+ number for the protocol to be affected, and the `mode' field
+ specifies what to do with packets for that protocol:
+
+ NPMODE_PASS normal operation, transmit and receive packets
+ NPMODE_DROP silently drop packets for this protocol
+ NPMODE_ERROR drop packets and return an error on transmit
+ NPMODE_QUEUE queue up packets for transmit, drop received
+ packets
+
+ At present NPMODE_ERROR and NPMODE_QUEUE have the same effect as
+ NPMODE_DROP.
+
+* PPPIOCGNPMODE returns the network-protocol mode for a given
+ protocol. The argument should point to an npioctl struct with the
+ `protocol' field set to the PPP protocol number for the protocol of
+ interest. On return the `mode' field will be set to the network-
+ protocol mode for that protocol.
+
+* PPPIOCSPASS and PPPIOCSACTIVE set the `pass' and `active' packet
+ filters. These ioctls are only available if the CONFIG_PPP_FILTER
+ option is selected. The argument should point to a sock_fprog
+ structure (defined in <linux/filter.h>) containing the compiled BPF
+ instructions for the filter. Packets are dropped if they fail the
+ `pass' filter; otherwise, if they fail the `active' filter they are
+ passed but they do not reset the transmit or receive idle timer.
+
+* PPPIOCSMRRU enables or disables multilink processing for received
+ packets and sets the multilink MRRU (maximum reconstructed receive
+ unit). The argument should point to an int containing the new MRRU
+ value. If the MRRU value is 0, processing of received multilink
+ fragments is disabled. This ioctl is only available if the
+ CONFIG_PPP_MULTILINK option is selected.
+
+Last modified: 7-feb-2002
diff --git a/Documentation/networking/proc_net_tcp.txt b/Documentation/networking/proc_net_tcp.txt
new file mode 100644
index 0000000..4a79209
--- /dev/null
+++ b/Documentation/networking/proc_net_tcp.txt
@@ -0,0 +1,48 @@
+This document describes the interfaces /proc/net/tcp and /proc/net/tcp6.
+Note that these interfaces are deprecated in favor of tcp_diag.
+
+These /proc interfaces provide information about currently active TCP
+connections, and are implemented by tcp4_seq_show() in net/ipv4/tcp_ipv4.c
+and tcp6_seq_show() in net/ipv6/tcp_ipv6.c, respectively.
+
+It will first list all listening TCP sockets, and next list all established
+TCP connections. A typical entry of /proc/net/tcp would look like this (split
+up into 3 parts because of the length of the line):
+
+ 46: 010310AC:9C4C 030310AC:1770 01
+ | | | | | |--> connection state
+ | | | | |------> remote TCP port number
+ | | | |-------------> remote IPv4 address
+ | | |--------------------> local TCP port number
+ | |---------------------------> local IPv4 address
+ |----------------------------------> number of entry
+
+ 00000150:00000000 01:00000019 00000000
+ | | | | |--> number of unrecovered RTO timeouts
+ | | | |----------> number of jiffies until timer expires
+ | | |----------------> timer_active (see below)
+ | |----------------------> receive-queue
+ |-------------------------------> transmit-queue
+
+ 1000 0 54165785 4 cd1e6040 25 4 27 3 -1
+ | | | | | | | | | |--> slow start size threshold,
+ | | | | | | | | | or -1 if the threshold
+ | | | | | | | | | is >= 0xFFFF
+ | | | | | | | | |----> sending congestion window
+ | | | | | | | |-------> (ack.quick<<1)|ack.pingpong
+ | | | | | | |---------> Predicted tick of soft clock
+ | | | | | | (delayed ACK control data)
+ | | | | | |------------> retransmit timeout
+ | | | | |------------------> location of socket in memory
+ | | | |-----------------------> socket reference count
+ | | |-----------------------------> inode
+ | |----------------------------------> unanswered 0-window probes
+ |---------------------------------------------> uid
+
+timer_active:
+ 0 no timer is pending
+ 1 retransmit-timer is pending
+ 2 another timer (e.g. delayed ack or keepalive) is pending
+ 3 this is a socket in TIME_WAIT state. Not all fields will contain
+ data (or even exist)
+ 4 zero window probe timer is pending
diff --git a/Documentation/networking/radiotap-headers.txt b/Documentation/networking/radiotap-headers.txt
new file mode 100644
index 0000000..953331c
--- /dev/null
+++ b/Documentation/networking/radiotap-headers.txt
@@ -0,0 +1,152 @@
+How to use radiotap headers
+===========================
+
+Pointer to the radiotap include file
+------------------------------------
+
+Radiotap headers are variable-length and extensible, you can get most of the
+information you need to know on them from:
+
+./include/net/ieee80211_radiotap.h
+
+This document gives an overview and warns on some corner cases.
+
+
+Structure of the header
+-----------------------
+
+There is a fixed portion at the start which contains a u32 bitmap that defines
+if the possible argument associated with that bit is present or not. So if b0
+of the it_present member of ieee80211_radiotap_header is set, it means that
+the header for argument index 0 (IEEE80211_RADIOTAP_TSFT) is present in the
+argument area.
+
+ < 8-byte ieee80211_radiotap_header >
+ [ <possible argument bitmap extensions ... > ]
+ [ <argument> ... ]
+
+At the moment there are only 13 possible argument indexes defined, but in case
+we run out of space in the u32 it_present member, it is defined that b31 set
+indicates that there is another u32 bitmap following (shown as "possible
+argument bitmap extensions..." above), and the start of the arguments is moved
+forward 4 bytes each time.
+
+Note also that the it_len member __le16 is set to the total number of bytes
+covered by the ieee80211_radiotap_header and any arguments following.
+
+
+Requirements for arguments
+--------------------------
+
+After the fixed part of the header, the arguments follow for each argument
+index whose matching bit is set in the it_present member of
+ieee80211_radiotap_header.
+
+ - the arguments are all stored little-endian!
+
+ - the argument payload for a given argument index has a fixed size. So
+ IEEE80211_RADIOTAP_TSFT being present always indicates an 8-byte argument is
+ present. See the comments in ./include/net/ieee80211_radiotap.h for a nice
+ breakdown of all the argument sizes
+
+ - the arguments must be aligned to a boundary of the argument size using
+ padding. So a u16 argument must start on the next u16 boundary if it isn't
+ already on one, a u32 must start on the next u32 boundary and so on.
+
+ - "alignment" is relative to the start of the ieee80211_radiotap_header, ie,
+ the first byte of the radiotap header. The absolute alignment of that first
+ byte isn't defined. So even if the whole radiotap header is starting at, eg,
+ address 0x00000003, still the first byte of the radiotap header is treated as
+ 0 for alignment purposes.
+
+ - the above point that there may be no absolute alignment for multibyte
+ entities in the fixed radiotap header or the argument region means that you
+ have to take special evasive action when trying to access these multibyte
+ entities. Some arches like Blackfin cannot deal with an attempt to
+ dereference, eg, a u16 pointer that is pointing to an odd address. Instead
+ you have to use a kernel API get_unaligned() to dereference the pointer,
+ which will do it bytewise on the arches that require that.
+
+ - The arguments for a given argument index can be a compound of multiple types
+ together. For example IEEE80211_RADIOTAP_CHANNEL has an argument payload
+ consisting of two u16s of total length 4. When this happens, the padding
+ rule is applied dealing with a u16, NOT dealing with a 4-byte single entity.
+
+
+Example valid radiotap header
+-----------------------------
+
+ 0x00, 0x00, // <-- radiotap version + pad byte
+ 0x0b, 0x00, // <- radiotap header length
+ 0x04, 0x0c, 0x00, 0x00, // <-- bitmap
+ 0x6c, // <-- rate (in 500kHz units)
+ 0x0c, //<-- tx power
+ 0x01 //<-- antenna
+
+
+Using the Radiotap Parser
+-------------------------
+
+If you are having to parse a radiotap struct, you can radically simplify the
+job by using the radiotap parser that lives in net/wireless/radiotap.c and has
+its prototypes available in include/net/cfg80211.h. You use it like this:
+
+#include <net/cfg80211.h>
+
+/* buf points to the start of the radiotap header part */
+
+int MyFunction(u8 * buf, int buflen)
+{
+ int pkt_rate_100kHz = 0, antenna = 0, pwr = 0;
+ struct ieee80211_radiotap_iterator iterator;
+ int ret = ieee80211_radiotap_iterator_init(&iterator, buf, buflen);
+
+ while (!ret) {
+
+ ret = ieee80211_radiotap_iterator_next(&iterator);
+
+ if (ret)
+ continue;
+
+ /* see if this argument is something we can use */
+
+ switch (iterator.this_arg_index) {
+ /*
+ * You must take care when dereferencing iterator.this_arg
+ * for multibyte types... the pointer is not aligned. Use
+ * get_unaligned((type *)iterator.this_arg) to dereference
+ * iterator.this_arg for type "type" safely on all arches.
+ */
+ case IEEE80211_RADIOTAP_RATE:
+ /* radiotap "rate" u8 is in
+ * 500kbps units, eg, 0x02=1Mbps
+ */
+ pkt_rate_100kHz = (*iterator.this_arg) * 5;
+ break;
+
+ case IEEE80211_RADIOTAP_ANTENNA:
+ /* radiotap uses 0 for 1st ant */
+ antenna = *iterator.this_arg);
+ break;
+
+ case IEEE80211_RADIOTAP_DBM_TX_POWER:
+ pwr = *iterator.this_arg;
+ break;
+
+ default:
+ break;
+ }
+ } /* while more rt headers */
+
+ if (ret != -ENOENT)
+ return TXRX_DROP;
+
+ /* discard the radiotap header part */
+ buf += iterator.max_length;
+ buflen -= iterator.max_length;
+
+ ...
+
+}
+
+Andy Green <andy@warmcat.com>
diff --git a/Documentation/networking/ray_cs.txt b/Documentation/networking/ray_cs.txt
new file mode 100644
index 0000000..145d27a
--- /dev/null
+++ b/Documentation/networking/ray_cs.txt
@@ -0,0 +1,150 @@
+September 21, 1999
+
+Copyright (c) 1998 Corey Thomas (corey@world.std.com)
+
+This file is the documentation for the Raylink Wireless LAN card driver for
+Linux. The Raylink wireless LAN card is a PCMCIA card which provides IEEE
+802.11 compatible wireless network connectivity at 1 and 2 megabits/second.
+See http://www.raytheon.com/micro/raylink/ for more information on the Raylink
+card. This driver is in early development and does have bugs. See the known
+bugs and limitations at the end of this document for more information.
+This driver also works with WebGear's Aviator 2.4 and Aviator Pro
+wireless LAN cards.
+
+As of kernel 2.3.18, the ray_cs driver is part of the Linux kernel
+source. My web page for the development of ray_cs is at
+http://world.std.com/~corey/raylink.html and I can be emailed at
+corey@world.std.com
+
+The kernel driver is based on ray_cs-1.62.tgz
+
+The driver at my web page is intended to be used as an add on to
+David Hinds pcmcia package. All the command line parameters are
+available when compiled as a module. When built into the kernel, only
+the essid= string parameter is available via the kernel command line.
+This will change after the method of sorting out parameters for all
+the PCMCIA drivers is agreed upon. If you must have a built in driver
+with nondefault parameters, they can be edited in
+/usr/src/linux/drivers/net/pcmcia/ray_cs.c. Searching for module_param
+will find them all.
+
+Information on card services is available at:
+ http://pcmcia-cs.sourceforge.net/
+
+
+Card services user programs are still required for PCMCIA devices.
+pcmcia-cs-3.1.1 or greater is required for the kernel version of
+the driver.
+
+Currently, ray_cs is not part of David Hinds card services package,
+so the following magic is required.
+
+At the end of the /etc/pcmcia/config.opts file, add the line:
+source ./ray_cs.opts
+This will make card services read the ray_cs.opts file
+when starting. Create the file /etc/pcmcia/ray_cs.opts containing the
+following:
+
+#### start of /etc/pcmcia/ray_cs.opts ###################
+# Configuration options for Raylink Wireless LAN PCMCIA card
+device "ray_cs"
+ class "network" module "misc/ray_cs"
+
+card "RayLink PC Card WLAN Adapter"
+ manfid 0x01a6, 0x0000
+ bind "ray_cs"
+
+module "misc/ray_cs" opts ""
+#### end of /etc/pcmcia/ray_cs.opts #####################
+
+
+To join an existing network with
+different parameters, contact the network administrator for the
+configuration information, and edit /etc/pcmcia/ray_cs.opts.
+Add the parameters below between the empty quotes.
+
+Parameters for ray_cs driver which may be specified in ray_cs.opts:
+
+bc integer 0 = normal mode (802.11 timing)
+ 1 = slow down inter frame timing to allow
+ operation with older breezecom access
+ points.
+
+beacon_period integer beacon period in Kilo-microseconds
+ legal values = must be integer multiple
+ of hop dwell
+ default = 256
+
+country integer 1 = USA (default)
+ 2 = Europe
+ 3 = Japan
+ 4 = Korea
+ 5 = Spain
+ 6 = France
+ 7 = Israel
+ 8 = Australia
+
+essid string ESS ID - network name to join
+ string with maximum length of 32 chars
+ default value = "ADHOC_ESSID"
+
+hop_dwell integer hop dwell time in Kilo-microseconds
+ legal values = 16,32,64,128(default),256
+
+irq_mask integer linux standard 16 bit value 1bit/IRQ
+ lsb is IRQ 0, bit 1 is IRQ 1 etc.
+ Used to restrict choice of IRQ's to use.
+ Recommended method for controlling
+ interrupts is in /etc/pcmcia/config.opts
+
+net_type integer 0 (default) = adhoc network,
+ 1 = infrastructure
+
+phy_addr string string containing new MAC address in
+ hex, must start with x eg
+ x00008f123456
+
+psm integer 0 = continuously active
+ 1 = power save mode (not useful yet)
+
+pc_debug integer (0-5) larger values for more verbose
+ logging. Replaces ray_debug.
+
+ray_debug integer Replaced with pc_debug
+
+ray_mem_speed integer defaults to 500
+
+sniffer integer 0 = not sniffer (default)
+ 1 = sniffer which can be used to record all
+ network traffic using tcpdump or similar,
+ but no normal network use is allowed.
+
+translate integer 0 = no translation (encapsulate frames)
+ 1 = translation (RFC1042/802.1)
+
+
+More on sniffer mode:
+
+tcpdump does not understand 802.11 headers, so it can't
+interpret the contents, but it can record to a file. This is only
+useful for debugging 802.11 lowlevel protocols that are not visible to
+linux. If you want to watch ftp xfers, or do similar things, you
+don't need to use sniffer mode. Also, some packet types are never
+sent up by the card, so you will never see them (ack, rts, cts, probe
+etc.) There is a simple program (showcap) included in the ray_cs
+package which parses the 802.11 headers.
+
+Known Problems and missing features
+
+ Does not work with non x86
+
+ Does not work with SMP
+
+ Support for defragmenting frames is not yet debugged, and in
+ fact is known to not work. I have never encountered a net set
+ up to fragment, but still, it should be fixed.
+
+ The ioctl support is incomplete. The hardware address cannot be set
+ using ifconfig yet. If a different hardware address is needed, it may
+ be set using the phy_addr parameter in ray_cs.opts. This requires
+ a card insertion to take effect.
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt
new file mode 100644
index 0000000..a96989a
--- /dev/null
+++ b/Documentation/networking/regulatory.txt
@@ -0,0 +1,194 @@
+Linux wireless regulatory documentation
+---------------------------------------
+
+This document gives a brief review over how the Linux wireless
+regulatory infrastructure works.
+
+More up to date information can be obtained at the project's web page:
+
+http://wireless.kernel.org/en/developers/Regulatory
+
+Keeping regulatory domains in userspace
+---------------------------------------
+
+Due to the dynamic nature of regulatory domains we keep them
+in userspace and provide a framework for userspace to upload
+to the kernel one regulatory domain to be used as the central
+core regulatory domain all wireless devices should adhere to.
+
+How to get regulatory domains to the kernel
+-------------------------------------------
+
+Userspace gets a regulatory domain in the kernel by having
+a userspace agent build it and send it via nl80211. Only
+expected regulatory domains will be respected by the kernel.
+
+A currently available userspace agent which can accomplish this
+is CRDA - central regulatory domain agent. Its documented here:
+
+http://wireless.kernel.org/en/developers/Regulatory/CRDA
+
+Essentially the kernel will send a udev event when it knows
+it needs a new regulatory domain. A udev rule can be put in place
+to trigger crda to send the respective regulatory domain for a
+specific ISO/IEC 3166 alpha2.
+
+Below is an example udev rule which can be used:
+
+# Example file, should be put in /etc/udev/rules.d/regulatory.rules
+KERNEL=="regulatory*", ACTION=="change", SUBSYSTEM=="platform", RUN+="/sbin/crda"
+
+The alpha2 is passed as an environment variable under the variable COUNTRY.
+
+Who asks for regulatory domains?
+--------------------------------
+
+* Users
+
+Users can use iw:
+
+http://wireless.kernel.org/en/users/Documentation/iw
+
+An example:
+
+ # set regulatory domain to "Costa Rica"
+ iw reg set CR
+
+This will request the kernel to set the regulatory domain to
+the specificied alpha2. The kernel in turn will then ask userspace
+to provide a regulatory domain for the alpha2 specified by the user
+by sending a uevent.
+
+* Wireless subsystems for Country Information elements
+
+The kernel will send a uevent to inform userspace a new
+regulatory domain is required. More on this to be added
+as its integration is added.
+
+* Drivers
+
+If drivers determine they need a specific regulatory domain
+set they can inform the wireless core using regulatory_hint().
+They have two options -- they either provide an alpha2 so that
+crda can provide back a regulatory domain for that country or
+they can build their own regulatory domain based on internal
+custom knowledge so the wireless core can respect it.
+
+*Most* drivers will rely on the first mechanism of providing a
+regulatory hint with an alpha2. For these drivers there is an additional
+check that can be used to ensure compliance based on custom EEPROM
+regulatory data. This additional check can be used by drivers by
+registering on its struct wiphy a reg_notifier() callback. This notifier
+is called when the core's regulatory domain has been changed. The driver
+can use this to review the changes made and also review who made them
+(driver, user, country IE) and determine what to allow based on its
+internal EEPROM data. Devices drivers wishing to be capable of world
+roaming should use this callback. More on world roaming will be
+added to this document when its support is enabled.
+
+Device drivers who provide their own built regulatory domain
+do not need a callback as the channels registered by them are
+the only ones that will be allowed and therefore *additional*
+cannels cannot be enabled.
+
+Example code - drivers hinting an alpha2:
+------------------------------------------
+
+This example comes from the zd1211rw device driver. You can start
+by having a mapping of your device's EEPROM country/regulatory
+domain value to to a specific alpha2 as follows:
+
+static struct zd_reg_alpha2_map reg_alpha2_map[] = {
+ { ZD_REGDOMAIN_FCC, "US" },
+ { ZD_REGDOMAIN_IC, "CA" },
+ { ZD_REGDOMAIN_ETSI, "DE" }, /* Generic ETSI, use most restrictive */
+ { ZD_REGDOMAIN_JAPAN, "JP" },
+ { ZD_REGDOMAIN_JAPAN_ADD, "JP" },
+ { ZD_REGDOMAIN_SPAIN, "ES" },
+ { ZD_REGDOMAIN_FRANCE, "FR" },
+
+Then you can define a routine to map your read EEPROM value to an alpha2,
+as follows:
+
+static int zd_reg2alpha2(u8 regdomain, char *alpha2)
+{
+ unsigned int i;
+ struct zd_reg_alpha2_map *reg_map;
+ for (i = 0; i < ARRAY_SIZE(reg_alpha2_map); i++) {
+ reg_map = &reg_alpha2_map[i];
+ if (regdomain == reg_map->reg) {
+ alpha2[0] = reg_map->alpha2[0];
+ alpha2[1] = reg_map->alpha2[1];
+ return 0;
+ }
+ }
+ return 1;
+}
+
+Lastly, you can then hint to the core of your discovered alpha2, if a match
+was found. You need to do this after you have registered your wiphy. You
+are expected to do this during initialization.
+
+ r = zd_reg2alpha2(mac->regdomain, alpha2);
+ if (!r)
+ regulatory_hint(hw->wiphy, alpha2, NULL);
+
+Example code - drivers providing a built in regulatory domain:
+--------------------------------------------------------------
+
+If you have regulatory information you can obtain from your
+driver and you *need* to use this we let you build a regulatory domain
+structure and pass it to the wireless core. To do this you should
+kmalloc() a structure big enough to hold your regulatory domain
+structure and you should then fill it with your data. Finally you simply
+call regulatory_hint() with the regulatory domain structure in it.
+
+Bellow is a simple example, with a regulatory domain cached using the stack.
+Your implementation may vary (read EEPROM cache instead, for example).
+
+Example cache of some regulatory domain
+
+struct ieee80211_regdomain mydriver_jp_regdom = {
+ .n_reg_rules = 3,
+ .alpha2 = "JP",
+ //.alpha2 = "99", /* If I have no alpha2 to map it to */
+ .reg_rules = {
+ /* IEEE 802.11b/g, channels 1..14 */
+ REG_RULE(2412-20, 2484+20, 40, 6, 20, 0),
+ /* IEEE 802.11a, channels 34..48 */
+ REG_RULE(5170-20, 5240+20, 40, 6, 20,
+ NL80211_RRF_PASSIVE_SCAN),
+ /* IEEE 802.11a, channels 52..64 */
+ REG_RULE(5260-20, 5320+20, 40, 6, 20,
+ NL80211_RRF_NO_IBSS |
+ NL80211_RRF_DFS),
+ }
+};
+
+Then in some part of your code after your wiphy has been registered:
+
+ int r;
+ struct ieee80211_regdomain *rd;
+ int size_of_regd;
+ int num_rules = mydriver_jp_regdom.n_reg_rules;
+ unsigned int i;
+
+ size_of_regd = sizeof(struct ieee80211_regdomain) +
+ (num_rules * sizeof(struct ieee80211_reg_rule));
+
+ rd = kzalloc(size_of_regd, GFP_KERNEL);
+ if (!rd)
+ return -ENOMEM;
+
+ memcpy(rd, &mydriver_jp_regdom, sizeof(struct ieee80211_regdomain));
+
+ for (i=0; i < num_rules; i++) {
+ memcpy(&rd->reg_rules[i], &mydriver_jp_regdom.reg_rules[i],
+ sizeof(struct ieee80211_reg_rule));
+ }
+ r = regulatory_hint(hw->wiphy, NULL, rd);
+ if (r) {
+ kfree(rd);
+ return r;
+ }
+
diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt
new file mode 100644
index 0000000..c3669a3
--- /dev/null
+++ b/Documentation/networking/rxrpc.txt
@@ -0,0 +1,866 @@
+ ======================
+ RxRPC NETWORK PROTOCOL
+ ======================
+
+The RxRPC protocol driver provides a reliable two-phase transport on top of UDP
+that can be used to perform RxRPC remote operations. This is done over sockets
+of AF_RXRPC family, using sendmsg() and recvmsg() with control data to send and
+receive data, aborts and errors.
+
+Contents of this document:
+
+ (*) Overview.
+
+ (*) RxRPC protocol summary.
+
+ (*) AF_RXRPC driver model.
+
+ (*) Control messages.
+
+ (*) Socket options.
+
+ (*) Security.
+
+ (*) Example client usage.
+
+ (*) Example server usage.
+
+ (*) AF_RXRPC kernel interface.
+
+
+========
+OVERVIEW
+========
+
+RxRPC is a two-layer protocol. There is a session layer which provides
+reliable virtual connections using UDP over IPv4 (or IPv6) as the transport
+layer, but implements a real network protocol; and there's the presentation
+layer which renders structured data to binary blobs and back again using XDR
+(as does SunRPC):
+
+ +-------------+
+ | Application |
+ +-------------+
+ | XDR | Presentation
+ +-------------+
+ | RxRPC | Session
+ +-------------+
+ | UDP | Transport
+ +-------------+
+
+
+AF_RXRPC provides:
+
+ (1) Part of an RxRPC facility for both kernel and userspace applications by
+ making the session part of it a Linux network protocol (AF_RXRPC).
+
+ (2) A two-phase protocol. The client transmits a blob (the request) and then
+ receives a blob (the reply), and the server receives the request and then
+ transmits the reply.
+
+ (3) Retention of the reusable bits of the transport system set up for one call
+ to speed up subsequent calls.
+
+ (4) A secure protocol, using the Linux kernel's key retention facility to
+ manage security on the client end. The server end must of necessity be
+ more active in security negotiations.
+
+AF_RXRPC does not provide XDR marshalling/presentation facilities. That is
+left to the application. AF_RXRPC only deals in blobs. Even the operation ID
+is just the first four bytes of the request blob, and as such is beyond the
+kernel's interest.
+
+
+Sockets of AF_RXRPC family are:
+
+ (1) created as type SOCK_DGRAM;
+
+ (2) provided with a protocol of the type of underlying transport they're going
+ to use - currently only PF_INET is supported.
+
+
+The Andrew File System (AFS) is an example of an application that uses this and
+that has both kernel (filesystem) and userspace (utility) components.
+
+
+======================
+RXRPC PROTOCOL SUMMARY
+======================
+
+An overview of the RxRPC protocol:
+
+ (*) RxRPC sits on top of another networking protocol (UDP is the only option
+ currently), and uses this to provide network transport. UDP ports, for
+ example, provide transport endpoints.
+
+ (*) RxRPC supports multiple virtual "connections" from any given transport
+ endpoint, thus allowing the endpoints to be shared, even to the same
+ remote endpoint.
+
+ (*) Each connection goes to a particular "service". A connection may not go
+ to multiple services. A service may be considered the RxRPC equivalent of
+ a port number. AF_RXRPC permits multiple services to share an endpoint.
+
+ (*) Client-originating packets are marked, thus a transport endpoint can be
+ shared between client and server connections (connections have a
+ direction).
+
+ (*) Up to a billion connections may be supported concurrently between one
+ local transport endpoint and one service on one remote endpoint. An RxRPC
+ connection is described by seven numbers:
+
+ Local address }
+ Local port } Transport (UDP) address
+ Remote address }
+ Remote port }
+ Direction
+ Connection ID
+ Service ID
+
+ (*) Each RxRPC operation is a "call". A connection may make up to four
+ billion calls, but only up to four calls may be in progress on a
+ connection at any one time.
+
+ (*) Calls are two-phase and asymmetric: the client sends its request data,
+ which the service receives; then the service transmits the reply data
+ which the client receives.
+
+ (*) The data blobs are of indefinite size, the end of a phase is marked with a
+ flag in the packet. The number of packets of data making up one blob may
+ not exceed 4 billion, however, as this would cause the sequence number to
+ wrap.
+
+ (*) The first four bytes of the request data are the service operation ID.
+
+ (*) Security is negotiated on a per-connection basis. The connection is
+ initiated by the first data packet on it arriving. If security is
+ requested, the server then issues a "challenge" and then the client
+ replies with a "response". If the response is successful, the security is
+ set for the lifetime of that connection, and all subsequent calls made
+ upon it use that same security. In the event that the server lets a
+ connection lapse before the client, the security will be renegotiated if
+ the client uses the connection again.
+
+ (*) Calls use ACK packets to handle reliability. Data packets are also
+ explicitly sequenced per call.
+
+ (*) There are two types of positive acknowledgement: hard-ACKs and soft-ACKs.
+ A hard-ACK indicates to the far side that all the data received to a point
+ has been received and processed; a soft-ACK indicates that the data has
+ been received but may yet be discarded and re-requested. The sender may
+ not discard any transmittable packets until they've been hard-ACK'd.
+
+ (*) Reception of a reply data packet implicitly hard-ACK's all the data
+ packets that make up the request.
+
+ (*) An call is complete when the request has been sent, the reply has been
+ received and the final hard-ACK on the last packet of the reply has
+ reached the server.
+
+ (*) An call may be aborted by either end at any time up to its completion.
+
+
+=====================
+AF_RXRPC DRIVER MODEL
+=====================
+
+About the AF_RXRPC driver:
+
+ (*) The AF_RXRPC protocol transparently uses internal sockets of the transport
+ protocol to represent transport endpoints.
+
+ (*) AF_RXRPC sockets map onto RxRPC connection bundles. Actual RxRPC
+ connections are handled transparently. One client socket may be used to
+ make multiple simultaneous calls to the same service. One server socket
+ may handle calls from many clients.
+
+ (*) Additional parallel client connections will be initiated to support extra
+ concurrent calls, up to a tunable limit.
+
+ (*) Each connection is retained for a certain amount of time [tunable] after
+ the last call currently using it has completed in case a new call is made
+ that could reuse it.
+
+ (*) Each internal UDP socket is retained [tunable] for a certain amount of
+ time [tunable] after the last connection using it discarded, in case a new
+ connection is made that could use it.
+
+ (*) A client-side connection is only shared between calls if they have have
+ the same key struct describing their security (and assuming the calls
+ would otherwise share the connection). Non-secured calls would also be
+ able to share connections with each other.
+
+ (*) A server-side connection is shared if the client says it is.
+
+ (*) ACK'ing is handled by the protocol driver automatically, including ping
+ replying.
+
+ (*) SO_KEEPALIVE automatically pings the other side to keep the connection
+ alive [TODO].
+
+ (*) If an ICMP error is received, all calls affected by that error will be
+ aborted with an appropriate network error passed through recvmsg().
+
+
+Interaction with the user of the RxRPC socket:
+
+ (*) A socket is made into a server socket by binding an address with a
+ non-zero service ID.
+
+ (*) In the client, sending a request is achieved with one or more sendmsgs,
+ followed by the reply being received with one or more recvmsgs.
+
+ (*) The first sendmsg for a request to be sent from a client contains a tag to
+ be used in all other sendmsgs or recvmsgs associated with that call. The
+ tag is carried in the control data.
+
+ (*) connect() is used to supply a default destination address for a client
+ socket. This may be overridden by supplying an alternate address to the
+ first sendmsg() of a call (struct msghdr::msg_name).
+
+ (*) If connect() is called on an unbound client, a random local port will
+ bound before the operation takes place.
+
+ (*) A server socket may also be used to make client calls. To do this, the
+ first sendmsg() of the call must specify the target address. The server's
+ transport endpoint is used to send the packets.
+
+ (*) Once the application has received the last message associated with a call,
+ the tag is guaranteed not to be seen again, and so it can be used to pin
+ client resources. A new call can then be initiated with the same tag
+ without fear of interference.
+
+ (*) In the server, a request is received with one or more recvmsgs, then the
+ the reply is transmitted with one or more sendmsgs, and then the final ACK
+ is received with a last recvmsg.
+
+ (*) When sending data for a call, sendmsg is given MSG_MORE if there's more
+ data to come on that call.
+
+ (*) When receiving data for a call, recvmsg flags MSG_MORE if there's more
+ data to come for that call.
+
+ (*) When receiving data or messages for a call, MSG_EOR is flagged by recvmsg
+ to indicate the terminal message for that call.
+
+ (*) A call may be aborted by adding an abort control message to the control
+ data. Issuing an abort terminates the kernel's use of that call's tag.
+ Any messages waiting in the receive queue for that call will be discarded.
+
+ (*) Aborts, busy notifications and challenge packets are delivered by recvmsg,
+ and control data messages will be set to indicate the context. Receiving
+ an abort or a busy message terminates the kernel's use of that call's tag.
+
+ (*) The control data part of the msghdr struct is used for a number of things:
+
+ (*) The tag of the intended or affected call.
+
+ (*) Sending or receiving errors, aborts and busy notifications.
+
+ (*) Notifications of incoming calls.
+
+ (*) Sending debug requests and receiving debug replies [TODO].
+
+ (*) When the kernel has received and set up an incoming call, it sends a
+ message to server application to let it know there's a new call awaiting
+ its acceptance [recvmsg reports a special control message]. The server
+ application then uses sendmsg to assign a tag to the new call. Once that
+ is done, the first part of the request data will be delivered by recvmsg.
+
+ (*) The server application has to provide the server socket with a keyring of
+ secret keys corresponding to the security types it permits. When a secure
+ connection is being set up, the kernel looks up the appropriate secret key
+ in the keyring and then sends a challenge packet to the client and
+ receives a response packet. The kernel then checks the authorisation of
+ the packet and either aborts the connection or sets up the security.
+
+ (*) The name of the key a client will use to secure its communications is
+ nominated by a socket option.
+
+
+Notes on recvmsg:
+
+ (*) If there's a sequence of data messages belonging to a particular call on
+ the receive queue, then recvmsg will keep working through them until:
+
+ (a) it meets the end of that call's received data,
+
+ (b) it meets a non-data message,
+
+ (c) it meets a message belonging to a different call, or
+
+ (d) it fills the user buffer.
+
+ If recvmsg is called in blocking mode, it will keep sleeping, awaiting the
+ reception of further data, until one of the above four conditions is met.
+
+ (2) MSG_PEEK operates similarly, but will return immediately if it has put any
+ data in the buffer rather than sleeping until it can fill the buffer.
+
+ (3) If a data message is only partially consumed in filling a user buffer,
+ then the remainder of that message will be left on the front of the queue
+ for the next taker. MSG_TRUNC will never be flagged.
+
+ (4) If there is more data to be had on a call (it hasn't copied the last byte
+ of the last data message in that phase yet), then MSG_MORE will be
+ flagged.
+
+
+================
+CONTROL MESSAGES
+================
+
+AF_RXRPC makes use of control messages in sendmsg() and recvmsg() to multiplex
+calls, to invoke certain actions and to report certain conditions. These are:
+
+ MESSAGE ID SRT DATA MEANING
+ ======================= === =========== ===============================
+ RXRPC_USER_CALL_ID sr- User ID App's call specifier
+ RXRPC_ABORT srt Abort code Abort code to issue/received
+ RXRPC_ACK -rt n/a Final ACK received
+ RXRPC_NET_ERROR -rt error num Network error on call
+ RXRPC_BUSY -rt n/a Call rejected (server busy)
+ RXRPC_LOCAL_ERROR -rt error num Local error encountered
+ RXRPC_NEW_CALL -r- n/a New call received
+ RXRPC_ACCEPT s-- n/a Accept new call
+
+ (SRT = usable in Sendmsg / delivered by Recvmsg / Terminal message)
+
+ (*) RXRPC_USER_CALL_ID
+
+ This is used to indicate the application's call ID. It's an unsigned long
+ that the app specifies in the client by attaching it to the first data
+ message or in the server by passing it in association with an RXRPC_ACCEPT
+ message. recvmsg() passes it in conjunction with all messages except
+ those of the RXRPC_NEW_CALL message.
+
+ (*) RXRPC_ABORT
+
+ This is can be used by an application to abort a call by passing it to
+ sendmsg, or it can be delivered by recvmsg to indicate a remote abort was
+ received. Either way, it must be associated with an RXRPC_USER_CALL_ID to
+ specify the call affected. If an abort is being sent, then error EBADSLT
+ will be returned if there is no call with that user ID.
+
+ (*) RXRPC_ACK
+
+ This is delivered to a server application to indicate that the final ACK
+ of a call was received from the client. It will be associated with an
+ RXRPC_USER_CALL_ID to indicate the call that's now complete.
+
+ (*) RXRPC_NET_ERROR
+
+ This is delivered to an application to indicate that an ICMP error message
+ was encountered in the process of trying to talk to the peer. An
+ errno-class integer value will be included in the control message data
+ indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call
+ affected.
+
+ (*) RXRPC_BUSY
+
+ This is delivered to a client application to indicate that a call was
+ rejected by the server due to the server being busy. It will be
+ associated with an RXRPC_USER_CALL_ID to indicate the rejected call.
+
+ (*) RXRPC_LOCAL_ERROR
+
+ This is delivered to an application to indicate that a local error was
+ encountered and that a call has been aborted because of it. An
+ errno-class integer value will be included in the control message data
+ indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call
+ affected.
+
+ (*) RXRPC_NEW_CALL
+
+ This is delivered to indicate to a server application that a new call has
+ arrived and is awaiting acceptance. No user ID is associated with this,
+ as a user ID must subsequently be assigned by doing an RXRPC_ACCEPT.
+
+ (*) RXRPC_ACCEPT
+
+ This is used by a server application to attempt to accept a call and
+ assign it a user ID. It should be associated with an RXRPC_USER_CALL_ID
+ to indicate the user ID to be assigned. If there is no call to be
+ accepted (it may have timed out, been aborted, etc.), then sendmsg will
+ return error ENODATA. If the user ID is already in use by another call,
+ then error EBADSLT will be returned.
+
+
+==============
+SOCKET OPTIONS
+==============
+
+AF_RXRPC sockets support a few socket options at the SOL_RXRPC level:
+
+ (*) RXRPC_SECURITY_KEY
+
+ This is used to specify the description of the key to be used. The key is
+ extracted from the calling process's keyrings with request_key() and
+ should be of "rxrpc" type.
+
+ The optval pointer points to the description string, and optlen indicates
+ how long the string is, without the NUL terminator.
+
+ (*) RXRPC_SECURITY_KEYRING
+
+ Similar to above but specifies a keyring of server secret keys to use (key
+ type "keyring"). See the "Security" section.
+
+ (*) RXRPC_EXCLUSIVE_CONNECTION
+
+ This is used to request that new connections should be used for each call
+ made subsequently on this socket. optval should be NULL and optlen 0.
+
+ (*) RXRPC_MIN_SECURITY_LEVEL
+
+ This is used to specify the minimum security level required for calls on
+ this socket. optval must point to an int containing one of the following
+ values:
+
+ (a) RXRPC_SECURITY_PLAIN
+
+ Encrypted checksum only.
+
+ (b) RXRPC_SECURITY_AUTH
+
+ Encrypted checksum plus packet padded and first eight bytes of packet
+ encrypted - which includes the actual packet length.
+
+ (c) RXRPC_SECURITY_ENCRYPTED
+
+ Encrypted checksum plus entire packet padded and encrypted, including
+ actual packet length.
+
+
+========
+SECURITY
+========
+
+Currently, only the kerberos 4 equivalent protocol has been implemented
+(security index 2 - rxkad). This requires the rxkad module to be loaded and,
+on the client, tickets of the appropriate type to be obtained from the AFS
+kaserver or the kerberos server and installed as "rxrpc" type keys. This is
+normally done using the klog program. An example simple klog program can be
+found at:
+
+ http://people.redhat.com/~dhowells/rxrpc/klog.c
+
+The payload provided to add_key() on the client should be of the following
+form:
+
+ struct rxrpc_key_sec2_v1 {
+ uint16_t security_index; /* 2 */
+ uint16_t ticket_length; /* length of ticket[] */
+ uint32_t expiry; /* time at which expires */
+ uint8_t kvno; /* key version number */
+ uint8_t __pad[3];
+ uint8_t session_key[8]; /* DES session key */
+ uint8_t ticket[0]; /* the encrypted ticket */
+ };
+
+Where the ticket blob is just appended to the above structure.
+
+
+For the server, keys of type "rxrpc_s" must be made available to the server.
+They have a description of "<serviceID>:<securityIndex>" (eg: "52:2" for an
+rxkad key for the AFS VL service). When such a key is created, it should be
+given the server's secret key as the instantiation data (see the example
+below).
+
+ add_key("rxrpc_s", "52:2", secret_key, 8, keyring);
+
+A keyring is passed to the server socket by naming it in a sockopt. The server
+socket then looks the server secret keys up in this keyring when secure
+incoming connections are made. This can be seen in an example program that can
+be found at:
+
+ http://people.redhat.com/~dhowells/rxrpc/listen.c
+
+
+====================
+EXAMPLE CLIENT USAGE
+====================
+
+A client would issue an operation by:
+
+ (1) An RxRPC socket is set up by:
+
+ client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
+
+ Where the third parameter indicates the protocol family of the transport
+ socket used - usually IPv4 but it can also be IPv6 [TODO].
+
+ (2) A local address can optionally be bound:
+
+ struct sockaddr_rxrpc srx = {
+ .srx_family = AF_RXRPC,
+ .srx_service = 0, /* we're a client */
+ .transport_type = SOCK_DGRAM, /* type of transport socket */
+ .transport.sin_family = AF_INET,
+ .transport.sin_port = htons(7000), /* AFS callback */
+ .transport.sin_address = 0, /* all local interfaces */
+ };
+ bind(client, &srx, sizeof(srx));
+
+ This specifies the local UDP port to be used. If not given, a random
+ non-privileged port will be used. A UDP port may be shared between
+ several unrelated RxRPC sockets. Security is handled on a basis of
+ per-RxRPC virtual connection.
+
+ (3) The security is set:
+
+ const char *key = "AFS:cambridge.redhat.com";
+ setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY, key, strlen(key));
+
+ This issues a request_key() to get the key representing the security
+ context. The minimum security level can be set:
+
+ unsigned int sec = RXRPC_SECURITY_ENCRYPTED;
+ setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL,
+ &sec, sizeof(sec));
+
+ (4) The server to be contacted can then be specified (alternatively this can
+ be done through sendmsg):
+
+ struct sockaddr_rxrpc srx = {
+ .srx_family = AF_RXRPC,
+ .srx_service = VL_SERVICE_ID,
+ .transport_type = SOCK_DGRAM, /* type of transport socket */
+ .transport.sin_family = AF_INET,
+ .transport.sin_port = htons(7005), /* AFS volume manager */
+ .transport.sin_address = ...,
+ };
+ connect(client, &srx, sizeof(srx));
+
+ (5) The request data should then be posted to the server socket using a series
+ of sendmsg() calls, each with the following control message attached:
+
+ RXRPC_USER_CALL_ID - specifies the user ID for this call
+
+ MSG_MORE should be set in msghdr::msg_flags on all but the last part of
+ the request. Multiple requests may be made simultaneously.
+
+ If a call is intended to go to a destination other then the default
+ specified through connect(), then msghdr::msg_name should be set on the
+ first request message of that call.
+
+ (6) The reply data will then be posted to the server socket for recvmsg() to
+ pick up. MSG_MORE will be flagged by recvmsg() if there's more reply data
+ for a particular call to be read. MSG_EOR will be set on the terminal
+ read for a call.
+
+ All data will be delivered with the following control message attached:
+
+ RXRPC_USER_CALL_ID - specifies the user ID for this call
+
+ If an abort or error occurred, this will be returned in the control data
+ buffer instead, and MSG_EOR will be flagged to indicate the end of that
+ call.
+
+
+====================
+EXAMPLE SERVER USAGE
+====================
+
+A server would be set up to accept operations in the following manner:
+
+ (1) An RxRPC socket is created by:
+
+ server = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
+
+ Where the third parameter indicates the address type of the transport
+ socket used - usually IPv4.
+
+ (2) Security is set up if desired by giving the socket a keyring with server
+ secret keys in it:
+
+ keyring = add_key("keyring", "AFSkeys", NULL, 0,
+ KEY_SPEC_PROCESS_KEYRING);
+
+ const char secret_key[8] = {
+ 0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 };
+ add_key("rxrpc_s", "52:2", secret_key, 8, keyring);
+
+ setsockopt(server, SOL_RXRPC, RXRPC_SECURITY_KEYRING, "AFSkeys", 7);
+
+ The keyring can be manipulated after it has been given to the socket. This
+ permits the server to add more keys, replace keys, etc. whilst it is live.
+
+ (2) A local address must then be bound:
+
+ struct sockaddr_rxrpc srx = {
+ .srx_family = AF_RXRPC,
+ .srx_service = VL_SERVICE_ID, /* RxRPC service ID */
+ .transport_type = SOCK_DGRAM, /* type of transport socket */
+ .transport.sin_family = AF_INET,
+ .transport.sin_port = htons(7000), /* AFS callback */
+ .transport.sin_address = 0, /* all local interfaces */
+ };
+ bind(server, &srx, sizeof(srx));
+
+ (3) The server is then set to listen out for incoming calls:
+
+ listen(server, 100);
+
+ (4) The kernel notifies the server of pending incoming connections by sending
+ it a message for each. This is received with recvmsg() on the server
+ socket. It has no data, and has a single dataless control message
+ attached:
+
+ RXRPC_NEW_CALL
+
+ The address that can be passed back by recvmsg() at this point should be
+ ignored since the call for which the message was posted may have gone by
+ the time it is accepted - in which case the first call still on the queue
+ will be accepted.
+
+ (5) The server then accepts the new call by issuing a sendmsg() with two
+ pieces of control data and no actual data:
+
+ RXRPC_ACCEPT - indicate connection acceptance
+ RXRPC_USER_CALL_ID - specify user ID for this call
+
+ (6) The first request data packet will then be posted to the server socket for
+ recvmsg() to pick up. At that point, the RxRPC address for the call can
+ be read from the address fields in the msghdr struct.
+
+ Subsequent request data will be posted to the server socket for recvmsg()
+ to collect as it arrives. All but the last piece of the request data will
+ be delivered with MSG_MORE flagged.
+
+ All data will be delivered with the following control message attached:
+
+ RXRPC_USER_CALL_ID - specifies the user ID for this call
+
+ (8) The reply data should then be posted to the server socket using a series
+ of sendmsg() calls, each with the following control messages attached:
+
+ RXRPC_USER_CALL_ID - specifies the user ID for this call
+
+ MSG_MORE should be set in msghdr::msg_flags on all but the last message
+ for a particular call.
+
+ (9) The final ACK from the client will be posted for retrieval by recvmsg()
+ when it is received. It will take the form of a dataless message with two
+ control messages attached:
+
+ RXRPC_USER_CALL_ID - specifies the user ID for this call
+ RXRPC_ACK - indicates final ACK (no data)
+
+ MSG_EOR will be flagged to indicate that this is the final message for
+ this call.
+
+(10) Up to the point the final packet of reply data is sent, the call can be
+ aborted by calling sendmsg() with a dataless message with the following
+ control messages attached:
+
+ RXRPC_USER_CALL_ID - specifies the user ID for this call
+ RXRPC_ABORT - indicates abort code (4 byte data)
+
+ Any packets waiting in the socket's receive queue will be discarded if
+ this is issued.
+
+Note that all the communications for a particular service take place through
+the one server socket, using control messages on sendmsg() and recvmsg() to
+determine the call affected.
+
+
+=========================
+AF_RXRPC KERNEL INTERFACE
+=========================
+
+The AF_RXRPC module also provides an interface for use by in-kernel utilities
+such as the AFS filesystem. This permits such a utility to:
+
+ (1) Use different keys directly on individual client calls on one socket
+ rather than having to open a whole slew of sockets, one for each key it
+ might want to use.
+
+ (2) Avoid having RxRPC call request_key() at the point of issue of a call or
+ opening of a socket. Instead the utility is responsible for requesting a
+ key at the appropriate point. AFS, for instance, would do this during VFS
+ operations such as open() or unlink(). The key is then handed through
+ when the call is initiated.
+
+ (3) Request the use of something other than GFP_KERNEL to allocate memory.
+
+ (4) Avoid the overhead of using the recvmsg() call. RxRPC messages can be
+ intercepted before they get put into the socket Rx queue and the socket
+ buffers manipulated directly.
+
+To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket,
+bind an address as appropriate and listen if it's to be a server socket, but
+then it passes this to the kernel interface functions.
+
+The kernel interface functions are as follows:
+
+ (*) Begin a new client call.
+
+ struct rxrpc_call *
+ rxrpc_kernel_begin_call(struct socket *sock,
+ struct sockaddr_rxrpc *srx,
+ struct key *key,
+ unsigned long user_call_ID,
+ gfp_t gfp);
+
+ This allocates the infrastructure to make a new RxRPC call and assigns
+ call and connection numbers. The call will be made on the UDP port that
+ the socket is bound to. The call will go to the destination address of a
+ connected client socket unless an alternative is supplied (srx is
+ non-NULL).
+
+ If a key is supplied then this will be used to secure the call instead of
+ the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls
+ secured in this way will still share connections if at all possible.
+
+ The user_call_ID is equivalent to that supplied to sendmsg() in the
+ control data buffer. It is entirely feasible to use this to point to a
+ kernel data structure.
+
+ If this function is successful, an opaque reference to the RxRPC call is
+ returned. The caller now holds a reference on this and it must be
+ properly ended.
+
+ (*) End a client call.
+
+ void rxrpc_kernel_end_call(struct rxrpc_call *call);
+
+ This is used to end a previously begun call. The user_call_ID is expunged
+ from AF_RXRPC's knowledge and will not be seen again in association with
+ the specified call.
+
+ (*) Send data through a call.
+
+ int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg,
+ size_t len);
+
+ This is used to supply either the request part of a client call or the
+ reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the
+ data buffers to be used. msg_iov may not be NULL and must point
+ exclusively to in-kernel virtual addresses. msg.msg_flags may be given
+ MSG_MORE if there will be subsequent data sends for this call.
+
+ The msg must not specify a destination address, control data or any flags
+ other than MSG_MORE. len is the total amount of data to transmit.
+
+ (*) Abort a call.
+
+ void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code);
+
+ This is used to abort a call if it's still in an abortable state. The
+ abort code specified will be placed in the ABORT message sent.
+
+ (*) Intercept received RxRPC messages.
+
+ typedef void (*rxrpc_interceptor_t)(struct sock *sk,
+ unsigned long user_call_ID,
+ struct sk_buff *skb);
+
+ void
+ rxrpc_kernel_intercept_rx_messages(struct socket *sock,
+ rxrpc_interceptor_t interceptor);
+
+ This installs an interceptor function on the specified AF_RXRPC socket.
+ All messages that would otherwise wind up in the socket's Rx queue are
+ then diverted to this function. Note that care must be taken to process
+ the messages in the right order to maintain DATA message sequentiality.
+
+ The interceptor function itself is provided with the address of the socket
+ and handling the incoming message, the ID assigned by the kernel utility
+ to the call and the socket buffer containing the message.
+
+ The skb->mark field indicates the type of message:
+
+ MARK MEANING
+ =============================== =======================================
+ RXRPC_SKB_MARK_DATA Data message
+ RXRPC_SKB_MARK_FINAL_ACK Final ACK received for an incoming call
+ RXRPC_SKB_MARK_BUSY Client call rejected as server busy
+ RXRPC_SKB_MARK_REMOTE_ABORT Call aborted by peer
+ RXRPC_SKB_MARK_NET_ERROR Network error detected
+ RXRPC_SKB_MARK_LOCAL_ERROR Local error encountered
+ RXRPC_SKB_MARK_NEW_CALL New incoming call awaiting acceptance
+
+ The remote abort message can be probed with rxrpc_kernel_get_abort_code().
+ The two error messages can be probed with rxrpc_kernel_get_error_number().
+ A new call can be accepted with rxrpc_kernel_accept_call().
+
+ Data messages can have their contents extracted with the usual bunch of
+ socket buffer manipulation functions. A data message can be determined to
+ be the last one in a sequence with rxrpc_kernel_is_data_last(). When a
+ data message has been used up, rxrpc_kernel_data_delivered() should be
+ called on it..
+
+ Non-data messages should be handled to rxrpc_kernel_free_skb() to dispose
+ of. It is possible to get extra refs on all types of message for later
+ freeing, but this may pin the state of a call until the message is finally
+ freed.
+
+ (*) Accept an incoming call.
+
+ struct rxrpc_call *
+ rxrpc_kernel_accept_call(struct socket *sock,
+ unsigned long user_call_ID);
+
+ This is used to accept an incoming call and to assign it a call ID. This
+ function is similar to rxrpc_kernel_begin_call() and calls accepted must
+ be ended in the same way.
+
+ If this function is successful, an opaque reference to the RxRPC call is
+ returned. The caller now holds a reference on this and it must be
+ properly ended.
+
+ (*) Reject an incoming call.
+
+ int rxrpc_kernel_reject_call(struct socket *sock);
+
+ This is used to reject the first incoming call on the socket's queue with
+ a BUSY message. -ENODATA is returned if there were no incoming calls.
+ Other errors may be returned if the call had been aborted (-ECONNABORTED)
+ or had timed out (-ETIME).
+
+ (*) Record the delivery of a data message and free it.
+
+ void rxrpc_kernel_data_delivered(struct sk_buff *skb);
+
+ This is used to record a data message as having been delivered and to
+ update the ACK state for the call. The socket buffer will be freed.
+
+ (*) Free a message.
+
+ void rxrpc_kernel_free_skb(struct sk_buff *skb);
+
+ This is used to free a non-DATA socket buffer intercepted from an AF_RXRPC
+ socket.
+
+ (*) Determine if a data message is the last one on a call.
+
+ bool rxrpc_kernel_is_data_last(struct sk_buff *skb);
+
+ This is used to determine if a socket buffer holds the last data message
+ to be received for a call (true will be returned if it does, false
+ if not).
+
+ The data message will be part of the reply on a client call and the
+ request on an incoming call. In the latter case there will be more
+ messages, but in the former case there will not.
+
+ (*) Get the abort code from an abort message.
+
+ u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb);
+
+ This is used to extract the abort code from a remote abort message.
+
+ (*) Get the error number from a local or network error message.
+
+ int rxrpc_kernel_get_error_number(struct sk_buff *skb);
+
+ This is used to extract the error number from a message indicating either
+ a local error occurred or a network error occurred.
+
+ (*) Allocate a null key for doing anonymous security.
+
+ struct key *rxrpc_get_null_key(const char *keyname);
+
+ This is used to allocate a null RxRPC key that can be used to indicate
+ anonymous security for a particular domain.
diff --git a/Documentation/networking/s2io.txt b/Documentation/networking/s2io.txt
new file mode 100644
index 0000000..c3d6b4d
--- /dev/null
+++ b/Documentation/networking/s2io.txt
@@ -0,0 +1,150 @@
+Release notes for Neterion's (Formerly S2io) Xframe I/II PCI-X 10GbE driver.
+
+Contents
+=======
+- 1. Introduction
+- 2. Identifying the adapter/interface
+- 3. Features supported
+- 4. Command line parameters
+- 5. Performance suggestions
+- 6. Available Downloads
+
+
+1. Introduction:
+This Linux driver supports Neterion's Xframe I PCI-X 1.0 and
+Xframe II PCI-X 2.0 adapters. It supports several features
+such as jumbo frames, MSI/MSI-X, checksum offloads, TSO, UFO and so on.
+See below for complete list of features.
+All features are supported for both IPv4 and IPv6.
+
+2. Identifying the adapter/interface:
+a. Insert the adapter(s) in your system.
+b. Build and load driver
+# insmod s2io.ko
+c. View log messages
+# dmesg | tail -40
+You will see messages similar to:
+eth3: Neterion Xframe I 10GbE adapter (rev 3), Version 2.0.9.1, Intr type INTA
+eth4: Neterion Xframe II 10GbE adapter (rev 2), Version 2.0.9.1, Intr type INTA
+eth4: Device is on 64 bit 133MHz PCIX(M1) bus
+
+The above messages identify the adapter type(Xframe I/II), adapter revision,
+driver version, interface name(eth3, eth4), Interrupt type(INTA, MSI, MSI-X).
+In case of Xframe II, the PCI/PCI-X bus width and frequency are displayed
+as well.
+
+To associate an interface with a physical adapter use "ethtool -p <ethX>".
+The corresponding adapter's LED will blink multiple times.
+
+3. Features supported:
+a. Jumbo frames. Xframe I/II supports MTU upto 9600 bytes,
+modifiable using ifconfig command.
+
+b. Offloads. Supports checksum offload(TCP/UDP/IP) on transmit
+and receive, TSO.
+
+c. Multi-buffer receive mode. Scattering of packet across multiple
+buffers. Currently driver supports 2-buffer mode which yields
+significant performance improvement on certain platforms(SGI Altix,
+IBM xSeries).
+
+d. MSI/MSI-X. Can be enabled on platforms which support this feature
+(IA64, Xeon) resulting in noticeable performance improvement(upto 7%
+on certain platforms).
+
+e. Statistics. Comprehensive MAC-level and software statistics displayed
+using "ethtool -S" option.
+
+f. Multi-FIFO/Ring. Supports up to 8 transmit queues and receive rings,
+with multiple steering options.
+
+4. Command line parameters
+a. tx_fifo_num
+Number of transmit queues
+Valid range: 1-8
+Default: 1
+
+b. rx_ring_num
+Number of receive rings
+Valid range: 1-8
+Default: 1
+
+c. tx_fifo_len
+Size of each transmit queue
+Valid range: Total length of all queues should not exceed 8192
+Default: 4096
+
+d. rx_ring_sz
+Size of each receive ring(in 4K blocks)
+Valid range: Limited by memory on system
+Default: 30
+
+e. intr_type
+Specifies interrupt type. Possible values 0(INTA), 2(MSI-X)
+Valid values: 0, 2
+Default: 2
+
+5. Performance suggestions
+General:
+a. Set MTU to maximum(9000 for switch setup, 9600 in back-to-back configuration)
+b. Set TCP windows size to optimal value.
+For instance, for MTU=1500 a value of 210K has been observed to result in
+good performance.
+# sysctl -w net.ipv4.tcp_rmem="210000 210000 210000"
+# sysctl -w net.ipv4.tcp_wmem="210000 210000 210000"
+For MTU=9000, TCP window size of 10 MB is recommended.
+# sysctl -w net.ipv4.tcp_rmem="10000000 10000000 10000000"
+# sysctl -w net.ipv4.tcp_wmem="10000000 10000000 10000000"
+
+Transmit performance:
+a. By default, the driver respects BIOS settings for PCI bus parameters.
+However, you may want to experiment with PCI bus parameters
+max-split-transactions(MOST) and MMRBC (use setpci command).
+A MOST value of 2 has been found optimal for Opterons and 3 for Itanium.
+It could be different for your hardware.
+Set MMRBC to 4K**.
+
+For example you can set
+For opteron
+#setpci -d 17d5:* 62=1d
+For Itanium
+#setpci -d 17d5:* 62=3d
+
+For detailed description of the PCI registers, please see Xframe User Guide.
+
+b. Ensure Transmit Checksum offload is enabled. Use ethtool to set/verify this
+parameter.
+c. Turn on TSO(using "ethtool -K")
+# ethtool -K <ethX> tso on
+
+Receive performance:
+a. By default, the driver respects BIOS settings for PCI bus parameters.
+However, you may want to set PCI latency timer to 248.
+#setpci -d 17d5:* LATENCY_TIMER=f8
+For detailed description of the PCI registers, please see Xframe User Guide.
+b. Use 2-buffer mode. This results in large performance boost on
+certain platforms(eg. SGI Altix, IBM xSeries).
+c. Ensure Receive Checksum offload is enabled. Use "ethtool -K ethX" command to
+set/verify this option.
+d. Enable NAPI feature(in kernel configuration Device Drivers ---> Network
+device support ---> Ethernet (10000 Mbit) ---> S2IO 10Gbe Xframe NIC) to
+bring down CPU utilization.
+
+** For AMD opteron platforms with 8131 chipset, MMRBC=1 and MOST=1 are
+recommended as safe parameters.
+For more information, please review the AMD8131 errata at
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26310.pdf
+
+6. Available Downloads
+Neterion "s2io" driver in Red Hat and Suse 2.6-based distributions is kept up
+to date, also the latest "s2io" code (including support for 2.4 kernels) is
+available via "Support" link on the Neterion site: http://www.neterion.com.
+
+For Xframe User Guide (Programming manual), visit ftp site ns1.s2io.com,
+user: linuxdocs password: HALdocs
+
+7. Support
+For further support please contact either your 10GbE Xframe NIC vendor (IBM,
+HP, SGI etc.) or click on the "Support" link on the Neterion site:
+http://www.neterion.com.
+
diff --git a/Documentation/networking/sctp.txt b/Documentation/networking/sctp.txt
new file mode 100644
index 0000000..0c790a7
--- /dev/null
+++ b/Documentation/networking/sctp.txt
@@ -0,0 +1,38 @@
+Linux Kernel SCTP
+
+This is the current BETA release of the Linux Kernel SCTP reference
+implementation.
+
+SCTP (Stream Control Transmission Protocol) is a IP based, message oriented,
+reliable transport protocol, with congestion control, support for
+transparent multi-homing, and multiple ordered streams of messages.
+RFC2960 defines the core protocol. The IETF SIGTRAN working group originally
+developed the SCTP protocol and later handed the protocol over to the
+Transport Area (TSVWG) working group for the continued evolvement of SCTP as a
+general purpose transport.
+
+See the IETF website (http://www.ietf.org) for further documents on SCTP.
+See http://www.ietf.org/rfc/rfc2960.txt
+
+The initial project goal is to create an Linux kernel reference implementation
+of SCTP that is RFC 2960 compliant and provides an programming interface
+referred to as the UDP-style API of the Sockets Extensions for SCTP, as
+proposed in IETF Internet-Drafts.
+
+
+Caveats:
+
+-lksctp can be built as statically or as a module. However, be aware that
+module removal of lksctp is not yet a safe activity.
+
+-There is tentative support for IPv6, but most work has gone towards
+implementation and testing lksctp on IPv4.
+
+
+For more information, please visit the lksctp project website:
+ http://www.sf.net/projects/lksctp
+
+Or contact the lksctp developers through the mailing list:
+ <lksctp-developers@lists.sourceforge.net>
+
+
diff --git a/Documentation/networking/secid.txt b/Documentation/networking/secid.txt
new file mode 100644
index 0000000..95ea067
--- /dev/null
+++ b/Documentation/networking/secid.txt
@@ -0,0 +1,14 @@
+flowi structure:
+
+The secid member in the flow structure is used in LSMs (e.g. SELinux) to indicate
+the label of the flow. This label of the flow is currently used in selecting
+matching labeled xfrm(s).
+
+If this is an outbound flow, the label is derived from the socket, if any, or
+the incoming packet this flow is being generated as a response to (e.g. tcp
+resets, timewait ack, etc.). It is also conceivable that the label could be
+derived from other sources such as process context, device, etc., in special
+cases, as may be appropriate.
+
+If this is an inbound flow, the label is derived from the IPSec security
+associations, if any, used by the packet.
diff --git a/Documentation/networking/skfp.txt b/Documentation/networking/skfp.txt
new file mode 100644
index 0000000..abfddf8
--- /dev/null
+++ b/Documentation/networking/skfp.txt
@@ -0,0 +1,220 @@
+(C)Copyright 1998-2000 SysKonnect,
+===========================================================================
+
+skfp.txt created 11-May-2000
+
+Readme File for skfp.o v2.06
+
+
+This file contains
+(1) OVERVIEW
+(2) SUPPORTED ADAPTERS
+(3) GENERAL INFORMATION
+(4) INSTALLATION
+(5) INCLUSION OF THE ADAPTER IN SYSTEM START
+(6) TROUBLESHOOTING
+(7) FUNCTION OF THE ADAPTER LEDS
+(8) HISTORY
+
+===========================================================================
+
+
+
+(1) OVERVIEW
+============
+
+This README explains how to use the driver 'skfp' for Linux with your
+network adapter.
+
+Chapter 2: Contains a list of all network adapters that are supported by
+ this driver.
+
+Chapter 3: Gives some general information.
+
+Chapter 4: Describes common problems and solutions.
+
+Chapter 5: Shows the changed functionality of the adapter LEDs.
+
+Chapter 6: History of development.
+
+***
+
+
+(2) SUPPORTED ADAPTERS
+======================
+
+The network driver 'skfp' supports the following network adapters:
+SysKonnect adapters:
+ - SK-5521 (SK-NET FDDI-UP)
+ - SK-5522 (SK-NET FDDI-UP DAS)
+ - SK-5541 (SK-NET FDDI-FP)
+ - SK-5543 (SK-NET FDDI-LP)
+ - SK-5544 (SK-NET FDDI-LP DAS)
+ - SK-5821 (SK-NET FDDI-UP64)
+ - SK-5822 (SK-NET FDDI-UP64 DAS)
+ - SK-5841 (SK-NET FDDI-FP64)
+ - SK-5843 (SK-NET FDDI-LP64)
+ - SK-5844 (SK-NET FDDI-LP64 DAS)
+Compaq adapters (not tested):
+ - Netelligent 100 FDDI DAS Fibre SC
+ - Netelligent 100 FDDI SAS Fibre SC
+ - Netelligent 100 FDDI DAS UTP
+ - Netelligent 100 FDDI SAS UTP
+ - Netelligent 100 FDDI SAS Fibre MIC
+***
+
+
+(3) GENERAL INFORMATION
+=======================
+
+From v2.01 on, the driver is integrated in the linux kernel sources.
+Therefor, the installation is the same as for any other adapter
+supported by the kernel.
+Refer to the manual of your distribution about the installation
+of network adapters.
+Makes my life much easier :-)
+***
+
+
+(4) TROUBLESHOOTING
+===================
+
+If you run into problems during installation, check those items:
+
+Problem: The FDDI adapter cannot be found by the driver.
+Reason: Look in /proc/pci for the following entry:
+ 'FDDI network controller: SysKonnect SK-FDDI-PCI ...'
+ If this entry exists, then the FDDI adapter has been
+ found by the system and should be able to be used.
+ If this entry does not exist or if the file '/proc/pci'
+ is not there, then you may have a hardware problem or PCI
+ support may not be enabled in your kernel.
+ The adapter can be checked using the diagnostic program
+ which is available from the SysKonnect web site:
+ www.syskonnect.de
+ Some COMPAQ machines have a problem with PCI under
+ Linux. This is described in the 'PCI howto' document
+ (included in some distributions or available from the
+ www, e.g. at 'www.linux.org') and no workaround is available.
+
+Problem: You want to use your computer as a router between
+ multiple IP subnetworks (using multiple adapters), but
+ you cannot reach computers in other subnetworks.
+Reason: Either the router's kernel is not configured for IP
+ forwarding or there is a problem with the routing table
+ and gateway configuration in at least one of the
+ computers.
+
+If your problem is not listed here, please contact our
+technical support for help.
+You can send email to:
+ linux@syskonnect.de
+When contacting our technical support,
+please ensure that the following information is available:
+- System Manufacturer and Model
+- Boards in your system
+- Distribution
+- Kernel version
+
+***
+
+
+(5) FUNCTION OF THE ADAPTER LEDS
+================================
+
+ The functionality of the LED's on the FDDI network adapters was
+ changed in SMT version v2.82. With this new SMT version, the yellow
+ LED works as a ring operational indicator. An active yellow LED
+ indicates that the ring is down. The green LED on the adapter now
+ works as a link indicator where an active GREEN LED indicates that
+ the respective port has a physical connection.
+
+ With versions of SMT prior to v2.82 a ring up was indicated if the
+ yellow LED was off while the green LED(s) showed the connection
+ status of the adapter. During a ring down the green LED was off and
+ the yellow LED was on.
+
+ All implementations indicate that a driver is not loaded if
+ all LEDs are off.
+
+***
+
+
+(6) HISTORY
+===========
+
+v2.06 (20000511) (In-Kernel version)
+ New features:
+ - 64 bit support
+ - new pci dma interface
+ - in kernel 2.3.99
+
+v2.05 (20000217) (In-Kernel version)
+ New features:
+ - Changes for 2.3.45 kernel
+
+v2.04 (20000207) (Standalone version)
+ New features:
+ - Added rx/tx byte counter
+
+v2.03 (20000111) (Standalone version)
+ Problems fixed:
+ - Fixed printk statements from v2.02
+
+v2.02 (991215) (Standalone version)
+ Problems fixed:
+ - Removed unnecessary output
+ - Fixed path for "printver.sh" in makefile
+
+v2.01 (991122) (In-Kernel version)
+ New features:
+ - Integration in Linux kernel sources
+ - Support for memory mapped I/O.
+
+v2.00 (991112)
+ New features:
+ - Full source released under GPL
+
+v1.05 (991023)
+ Problems fixed:
+ - Compilation with kernel version 2.2.13 failed
+
+v1.04 (990427)
+ Changes:
+ - New SMT module included, changing LED functionality
+ Problems fixed:
+ - Synchronization on SMP machines was buggy
+
+v1.03 (990325)
+ Problems fixed:
+ - Interrupt routing on SMP machines could be incorrect
+
+v1.02 (990310)
+ New features:
+ - Support for kernel versions 2.2.x added
+ - Kernel patch instead of private duplicate of kernel functions
+
+v1.01 (980812)
+ Problems fixed:
+ Connection hangup with telnet
+ Slow telnet connection
+
+v1.00 beta 01 (980507)
+ New features:
+ None.
+ Problems fixed:
+ None.
+ Known limitations:
+ - tar archive instead of standard package format (rpm).
+ - FDDI statistic is empty.
+ - not tested with 2.1.xx kernels
+ - integration in kernel not tested
+ - not tested simultaneously with FDDI adapters from other vendors.
+ - only X86 processors supported.
+ - SBA (Synchronous Bandwidth Allocator) parameters can
+ not be configured.
+ - does not work on some COMPAQ machines. See the PCI howto
+ document for details about this problem.
+ - data corruption with kernel versions below 2.0.33.
+
+*** End of information file ***
diff --git a/Documentation/networking/smc9.txt b/Documentation/networking/smc9.txt
new file mode 100644
index 0000000..d1e1507
--- /dev/null
+++ b/Documentation/networking/smc9.txt
@@ -0,0 +1,42 @@
+
+SMC 9xxxx Driver
+Revision 0.12
+3/5/96
+Copyright 1996 Erik Stahlman
+Released under terms of the GNU General Public License.
+
+This file contains the instructions and caveats for my SMC9xxx driver. You
+should not be using the driver without reading this file.
+
+Things to note about installation:
+
+ 1. The driver should work on all kernels from 1.2.13 until 1.3.71.
+ (A kernel patch is supplied for 1.3.71 )
+
+ 2. If you include this into the kernel, you might need to change some
+ options, such as for forcing IRQ.
+
+
+ 3. To compile as a module, run 'make' .
+ Make will give you the appropriate options for various kernel support.
+
+ 4. Loading the driver as a module :
+
+ use: insmod smc9194.o
+ optional parameters:
+ io=xxxx : your base address
+ irq=xx : your irq
+ ifport=x : 0 for whatever is default
+ 1 for twisted pair
+ 2 for AUI ( or BNC on some cards )
+
+How to obtain the latest version?
+
+FTP:
+ ftp://fenris.campus.vt.edu/smc9/smc9-12.tar.gz
+ ftp://sfbox.vt.edu/filebox/F/fenris/smc9/smc9-12.tar.gz
+
+
+Contacting me:
+ erik@mail.vt.edu
+
diff --git a/Documentation/networking/smctr.txt b/Documentation/networking/smctr.txt
new file mode 100644
index 0000000..9af25b8
--- /dev/null
+++ b/Documentation/networking/smctr.txt
@@ -0,0 +1,66 @@
+Text File for the SMC TokenCard TokenRing Linux driver (smctr.c).
+ By Jay Schulist <jschlst@samba.org>
+
+The Linux SMC Token Ring driver works with the SMC TokenCard Elite (8115T)
+ISA and SMC TokenCard Elite/A (8115T/A) MCA adapters.
+
+Latest information on this driver can be obtained on the Linux-SNA WWW site.
+Please point your browser to: http://www.linux-sna.org
+
+This driver is rather simple to use. Select Y to Token Ring adapter support
+in the kernel configuration. A choice for SMC Token Ring adapters will
+appear. This drives supports all SMC ISA/MCA adapters. Choose this
+option. I personally recommend compiling the driver as a module (M), but if you
+you would like to compile it statically answer Y instead.
+
+This driver supports multiple adapters without the need to load multiple copies
+of the driver. You should be able to load up to 7 adapters without any kernel
+modifications, if you are in need of more please contact the maintainer of this
+driver.
+
+Load the driver either by lilo/loadlin or as a module. When a module using the
+following command will suffice for most:
+
+# modprobe smctr
+smctr.c: v1.00 12/6/99 by jschlst@samba.org
+tr0: SMC TokenCard 8115T at Io 0x300, Irq 10, Rom 0xd8000, Ram 0xcc000.
+
+Now just setup the device via ifconfig and set and routes you may have. After
+this you are ready to start sending some tokens.
+
+Errata:
+1). For anyone wondering where to pick up the SMC adapters please browse
+ to http://www.smc.com
+
+2). If you are the first/only Token Ring Client on a Token Ring LAN, please
+ specify the ringspeed with the ringspeed=[4/16] module option. If no
+ ringspeed is specified the driver will attempt to autodetect the ring
+ speed and/or if the adapter is the first/only station on the ring take
+ the appropriate actions.
+
+ NOTE: Default ring speed is 16MB UTP.
+
+3). PnP support for this adapter sucks. I recommend hard setting the
+ IO/MEM/IRQ by the jumpers on the adapter. If this is not possible
+ load the module with the following io=[ioaddr] mem=[mem_addr]
+ irq=[irq_num].
+
+ The following IRQ, IO, and MEM settings are supported.
+
+ IO ports:
+ 0x200, 0x220, 0x240, 0x260, 0x280, 0x2A0, 0x2C0, 0x2E0, 0x300,
+ 0x320, 0x340, 0x360, 0x380.
+
+ IRQs:
+ 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ Memory addresses:
+ 0xA0000, 0xA4000, 0xA8000, 0xAC000, 0xB0000, 0xB4000,
+ 0xB8000, 0xBC000, 0xC0000, 0xC4000, 0xC8000, 0xCC000,
+ 0xD0000, 0xD4000, 0xD8000, 0xDC000, 0xE0000, 0xE4000,
+ 0xE8000, 0xEC000, 0xF0000, 0xF4000, 0xF8000, 0xFC000
+
+This driver is under the GNU General Public License. Its Firmware image is
+included as an initialized C-array and is licensed by SMC to the Linux
+users of this driver. However no warranty about its fitness is expressed or
+implied by SMC.
diff --git a/Documentation/networking/spider_net.txt b/Documentation/networking/spider_net.txt
new file mode 100644
index 0000000..4b4adb8
--- /dev/null
+++ b/Documentation/networking/spider_net.txt
@@ -0,0 +1,204 @@
+
+ The Spidernet Device Driver
+ ===========================
+
+Written by Linas Vepstas <linas@austin.ibm.com>
+
+Version of 7 June 2007
+
+Abstract
+========
+This document sketches the structure of portions of the spidernet
+device driver in the Linux kernel tree. The spidernet is a gigabit
+ethernet device built into the Toshiba southbridge commonly used
+in the SONY Playstation 3 and the IBM QS20 Cell blade.
+
+The Structure of the RX Ring.
+=============================
+The receive (RX) ring is a circular linked list of RX descriptors,
+together with three pointers into the ring that are used to manage its
+contents.
+
+The elements of the ring are called "descriptors" or "descrs"; they
+describe the received data. This includes a pointer to a buffer
+containing the received data, the buffer size, and various status bits.
+
+There are three primary states that a descriptor can be in: "empty",
+"full" and "not-in-use". An "empty" or "ready" descriptor is ready
+to receive data from the hardware. A "full" descriptor has data in it,
+and is waiting to be emptied and processed by the OS. A "not-in-use"
+descriptor is neither empty or full; it is simply not ready. It may
+not even have a data buffer in it, or is otherwise unusable.
+
+During normal operation, on device startup, the OS (specifically, the
+spidernet device driver) allocates a set of RX descriptors and RX
+buffers. These are all marked "empty", ready to receive data. This
+ring is handed off to the hardware, which sequentially fills in the
+buffers, and marks them "full". The OS follows up, taking the full
+buffers, processing them, and re-marking them empty.
+
+This filling and emptying is managed by three pointers, the "head"
+and "tail" pointers, managed by the OS, and a hardware current
+descriptor pointer (GDACTDPA). The GDACTDPA points at the descr
+currently being filled. When this descr is filled, the hardware
+marks it full, and advances the GDACTDPA by one. Thus, when there is
+flowing RX traffic, every descr behind it should be marked "full",
+and everything in front of it should be "empty". If the hardware
+discovers that the current descr is not empty, it will signal an
+interrupt, and halt processing.
+
+The tail pointer tails or trails the hardware pointer. When the
+hardware is ahead, the tail pointer will be pointing at a "full"
+descr. The OS will process this descr, and then mark it "not-in-use",
+and advance the tail pointer. Thus, when there is flowing RX traffic,
+all of the descrs in front of the tail pointer should be "full", and
+all of those behind it should be "not-in-use". When RX traffic is not
+flowing, then the tail pointer can catch up to the hardware pointer.
+The OS will then note that the current tail is "empty", and halt
+processing.
+
+The head pointer (somewhat mis-named) follows after the tail pointer.
+When traffic is flowing, then the head pointer will be pointing at
+a "not-in-use" descr. The OS will perform various housekeeping duties
+on this descr. This includes allocating a new data buffer and
+dma-mapping it so as to make it visible to the hardware. The OS will
+then mark the descr as "empty", ready to receive data. Thus, when there
+is flowing RX traffic, everything in front of the head pointer should
+be "not-in-use", and everything behind it should be "empty". If no
+RX traffic is flowing, then the head pointer can catch up to the tail
+pointer, at which point the OS will notice that the head descr is
+"empty", and it will halt processing.
+
+Thus, in an idle system, the GDACTDPA, tail and head pointers will
+all be pointing at the same descr, which should be "empty". All of the
+other descrs in the ring should be "empty" as well.
+
+The show_rx_chain() routine will print out the the locations of the
+GDACTDPA, tail and head pointers. It will also summarize the contents
+of the ring, starting at the tail pointer, and listing the status
+of the descrs that follow.
+
+A typical example of the output, for a nearly idle system, might be
+
+net eth1: Total number of descrs=256
+net eth1: Chain tail located at descr=20
+net eth1: Chain head is at 20
+net eth1: HW curr desc (GDACTDPA) is at 21
+net eth1: Have 1 descrs with stat=x40800101
+net eth1: HW next desc (GDACNEXTDA) is at 22
+net eth1: Last 255 descrs with stat=xa0800000
+
+In the above, the hardware has filled in one descr, number 20. Both
+head and tail are pointing at 20, because it has not yet been emptied.
+Meanwhile, hw is pointing at 21, which is free.
+
+The "Have nnn decrs" refers to the descr starting at the tail: in this
+case, nnn=1 descr, starting at descr 20. The "Last nnn descrs" refers
+to all of the rest of the descrs, from the last status change. The "nnn"
+is a count of how many descrs have exactly the same status.
+
+The status x4... corresponds to "full" and status xa... corresponds
+to "empty". The actual value printed is RXCOMST_A.
+
+In the device driver source code, a different set of names are
+used for these same concepts, so that
+
+"empty" == SPIDER_NET_DESCR_CARDOWNED == 0xa
+"full" == SPIDER_NET_DESCR_FRAME_END == 0x4
+"not in use" == SPIDER_NET_DESCR_NOT_IN_USE == 0xf
+
+
+The RX RAM full bug/feature
+===========================
+
+As long as the OS can empty out the RX buffers at a rate faster than
+the hardware can fill them, there is no problem. If, for some reason,
+the OS fails to empty the RX ring fast enough, the hardware GDACTDPA
+pointer will catch up to the head, notice the not-empty condition,
+ad stop. However, RX packets may still continue arriving on the wire.
+The spidernet chip can save some limited number of these in local RAM.
+When this local ram fills up, the spider chip will issue an interrupt
+indicating this (GHIINT0STS will show ERRINT, and the GRMFLLINT bit
+will be set in GHIINT1STS). When the RX ram full condition occurs,
+a certain bug/feature is triggered that has to be specially handled.
+This section describes the special handling for this condition.
+
+When the OS finally has a chance to run, it will empty out the RX ring.
+In particular, it will clear the descriptor on which the hardware had
+stopped. However, once the hardware has decided that a certain
+descriptor is invalid, it will not restart at that descriptor; instead
+it will restart at the next descr. This potentially will lead to a
+deadlock condition, as the tail pointer will be pointing at this descr,
+which, from the OS point of view, is empty; the OS will be waiting for
+this descr to be filled. However, the hardware has skipped this descr,
+and is filling the next descrs. Since the OS doesn't see this, there
+is a potential deadlock, with the OS waiting for one descr to fill,
+while the hardware is waiting for a different set of descrs to become
+empty.
+
+A call to show_rx_chain() at this point indicates the nature of the
+problem. A typical print when the network is hung shows the following:
+
+net eth1: Spider RX RAM full, incoming packets might be discarded!
+net eth1: Total number of descrs=256
+net eth1: Chain tail located at descr=255
+net eth1: Chain head is at 255
+net eth1: HW curr desc (GDACTDPA) is at 0
+net eth1: Have 1 descrs with stat=xa0800000
+net eth1: HW next desc (GDACNEXTDA) is at 1
+net eth1: Have 127 descrs with stat=x40800101
+net eth1: Have 1 descrs with stat=x40800001
+net eth1: Have 126 descrs with stat=x40800101
+net eth1: Last 1 descrs with stat=xa0800000
+
+Both the tail and head pointers are pointing at descr 255, which is
+marked xa... which is "empty". Thus, from the OS point of view, there
+is nothing to be done. In particular, there is the implicit assumption
+that everything in front of the "empty" descr must surely also be empty,
+as explained in the last section. The OS is waiting for descr 255 to
+become non-empty, which, in this case, will never happen.
+
+The HW pointer is at descr 0. This descr is marked 0x4.. or "full".
+Since its already full, the hardware can do nothing more, and thus has
+halted processing. Notice that descrs 0 through 254 are all marked
+"full", while descr 254 and 255 are empty. (The "Last 1 descrs" is
+descr 254, since tail was at 255.) Thus, the system is deadlocked,
+and there can be no forward progress; the OS thinks there's nothing
+to do, and the hardware has nowhere to put incoming data.
+
+This bug/feature is worked around with the spider_net_resync_head_ptr()
+routine. When the driver receives RX interrupts, but an examination
+of the RX chain seems to show it is empty, then it is probable that
+the hardware has skipped a descr or two (sometimes dozens under heavy
+network conditions). The spider_net_resync_head_ptr() subroutine will
+search the ring for the next full descr, and the driver will resume
+operations there. Since this will leave "holes" in the ring, there
+is also a spider_net_resync_tail_ptr() that will skip over such holes.
+
+As of this writing, the spider_net_resync() strategy seems to work very
+well, even under heavy network loads.
+
+
+The TX ring
+===========
+The TX ring uses a low-watermark interrupt scheme to make sure that
+the TX queue is appropriately serviced for large packet sizes.
+
+For packet sizes greater than about 1KBytes, the kernel can fill
+the TX ring quicker than the device can drain it. Once the ring
+is full, the netdev is stopped. When there is room in the ring,
+the netdev needs to be reawakened, so that more TX packets are placed
+in the ring. The hardware can empty the ring about four times per jiffy,
+so its not appropriate to wait for the poll routine to refill, since
+the poll routine runs only once per jiffy. The low-watermark mechanism
+marks a descr about 1/4th of the way from the bottom of the queue, so
+that an interrupt is generated when the descr is processed. This
+interrupt wakes up the netdev, which can then refill the queue.
+For large packets, this mechanism generates a relatively small number
+of interrupts, about 1K/sec. For smaller packets, this will drop to zero
+interrupts, as the hardware can empty the queue faster than the kernel
+can fill it.
+
+
+ ======= END OF DOCUMENT ========
+
diff --git a/Documentation/networking/tc-actions-env-rules.txt b/Documentation/networking/tc-actions-env-rules.txt
new file mode 100644
index 0000000..dcadf6f
--- /dev/null
+++ b/Documentation/networking/tc-actions-env-rules.txt
@@ -0,0 +1,30 @@
+
+The "enviromental" rules for authors of any new tc actions are:
+
+1) If you stealeth or borroweth any packet thou shalt be branching
+from the righteous path and thou shalt cloneth.
+
+For example if your action queues a packet to be processed later,
+or intentionally branches by redirecting a packet, then you need to
+clone the packet.
+
+There are certain fields in the skb tc_verd that need to be reset so we
+avoid loops, etc. A few are generic enough that skb_act_clone()
+resets them for you, so invoke skb_act_clone() rather than skb_clone().
+
+2) If you munge any packet thou shalt call pskb_expand_head in the case
+someone else is referencing the skb. After that you "own" the skb.
+You must also tell us if it is ok to munge the packet (TC_OK2MUNGE),
+this way any action downstream can stomp on the packet.
+
+3) Dropping packets you don't own is a no-no. You simply return
+TC_ACT_SHOT to the caller and they will drop it.
+
+The "enviromental" rules for callers of actions (qdiscs etc) are:
+
+*) Thou art responsible for freeing anything returned as being
+TC_ACT_SHOT/STOLEN/QUEUED. If none of TC_ACT_SHOT/STOLEN/QUEUED is
+returned, then all is great and you don't need to do anything.
+
+Post on netdev if something is unclear.
+
diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt
new file mode 100644
index 0000000..7d11bb5
--- /dev/null
+++ b/Documentation/networking/tcp.txt
@@ -0,0 +1,106 @@
+TCP protocol
+============
+
+Last updated: 9 February 2008
+
+Contents
+========
+
+- Congestion control
+- How the new TCP output machine [nyi] works
+
+Congestion control
+==================
+
+The following variables are used in the tcp_sock for congestion control:
+snd_cwnd The size of the congestion window
+snd_ssthresh Slow start threshold. We are in slow start if
+ snd_cwnd is less than this.
+snd_cwnd_cnt A counter used to slow down the rate of increase
+ once we exceed slow start threshold.
+snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to.
+snd_cwnd_stamp Timestamp for when congestion window last validated.
+snd_cwnd_used Used as a highwater mark for how much of the
+ congestion window is in use. It is used to adjust
+ snd_cwnd down when the link is limited by the
+ application rather than the network.
+
+As of 2.6.13, Linux supports pluggable congestion control algorithms.
+A congestion control mechanism can be registered through functions in
+tcp_cong.c. The functions used by the congestion control mechanism are
+registered via passing a tcp_congestion_ops struct to
+tcp_register_congestion_control. As a minimum name, ssthresh,
+cong_avoid, min_cwnd must be valid.
+
+Private data for a congestion control mechanism is stored in tp->ca_priv.
+tcp_ca(tp) returns a pointer to this space. This is preallocated space - it
+is important to check the size of your private data will fit this space, or
+alternatively space could be allocated elsewhere and a pointer to it could
+be stored here.
+
+There are three kinds of congestion control algorithms currently: The
+simplest ones are derived from TCP reno (highspeed, scalable) and just
+provide an alternative the congestion window calculation. More complex
+ones like BIC try to look at other events to provide better
+heuristics. There are also round trip time based algorithms like
+Vegas and Westwood+.
+
+Good TCP congestion control is a complex problem because the algorithm
+needs to maintain fairness and performance. Please review current
+research and RFC's before developing new modules.
+
+The method that is used to determine which congestion control mechanism is
+determined by the setting of the sysctl net.ipv4.tcp_congestion_control.
+The default congestion control will be the last one registered (LIFO);
+so if you built everything as modules, the default will be reno. If you
+build with the defaults from Kconfig, then CUBIC will be builtin (not a
+module) and it will end up the default.
+
+If you really want a particular default value then you will need
+to set it with the sysctl. If you use a sysctl, the module will be autoloaded
+if needed and you will get the expected protocol. If you ask for an
+unknown congestion method, then the sysctl attempt will fail.
+
+If you remove a tcp congestion control module, then you will get the next
+available one. Since reno cannot be built as a module, and cannot be
+deleted, it will always be available.
+
+How the new TCP output machine [nyi] works.
+===========================================
+
+Data is kept on a single queue. The skb->users flag tells us if the frame is
+one that has been queued already. To add a frame we throw it on the end. Ack
+walks down the list from the start.
+
+We keep a set of control flags
+
+
+ sk->tcp_pend_event
+
+ TCP_PEND_ACK Ack needed
+ TCP_ACK_NOW Needed now
+ TCP_WINDOW Window update check
+ TCP_WINZERO Zero probing
+
+
+ sk->transmit_queue The transmission frame begin
+ sk->transmit_new First new frame pointer
+ sk->transmit_end Where to add frames
+
+ sk->tcp_last_tx_ack Last ack seen
+ sk->tcp_dup_ack Dup ack count for fast retransmit
+
+
+Frames are queued for output by tcp_write. We do our best to send the frames
+off immediately if possible, but otherwise queue and compute the body
+checksum in the copy.
+
+When a write is done we try to clear any pending events and piggy back them.
+If the window is full we queue full sized frames. On the first timeout in
+zero window we split this.
+
+On a timer we walk the retransmit list to send any retransmits, update the
+backoff timers etc. A change of route table stamp causes a change of header
+and recompute. We add any new tcp level headers and refinish the checksum
+before sending.
+
diff --git a/Documentation/networking/tlan.txt b/Documentation/networking/tlan.txt
new file mode 100644
index 0000000..7e6aa5b
--- /dev/null
+++ b/Documentation/networking/tlan.txt
@@ -0,0 +1,117 @@
+(C) 1997-1998 Caldera, Inc.
+(C) 1998 James Banks
+(C) 1999-2001 Torben Mathiasen <tmm@image.dk, torben.mathiasen@compaq.com>
+
+For driver information/updates visit http://opensource.compaq.com
+
+
+TLAN driver for Linux, version 1.14a
+README
+
+
+I. Supported Devices.
+
+ Only PCI devices will work with this driver.
+
+ Supported:
+ Vendor ID Device ID Name
+ 0e11 ae32 Compaq Netelligent 10/100 TX PCI UTP
+ 0e11 ae34 Compaq Netelligent 10 T PCI UTP
+ 0e11 ae35 Compaq Integrated NetFlex 3/P
+ 0e11 ae40 Compaq Netelligent Dual 10/100 TX PCI UTP
+ 0e11 ae43 Compaq Netelligent Integrated 10/100 TX UTP
+ 0e11 b011 Compaq Netelligent 10/100 TX Embedded UTP
+ 0e11 b012 Compaq Netelligent 10 T/2 PCI UTP/Coax
+ 0e11 b030 Compaq Netelligent 10/100 TX UTP
+ 0e11 f130 Compaq NetFlex 3/P
+ 0e11 f150 Compaq NetFlex 3/P
+ 108d 0012 Olicom OC-2325
+ 108d 0013 Olicom OC-2183
+ 108d 0014 Olicom OC-2326
+
+
+ Caveats:
+
+ I am not sure if 100BaseTX daughterboards (for those cards which
+ support such things) will work. I haven't had any solid evidence
+ either way.
+
+ However, if a card supports 100BaseTx without requiring an add
+ on daughterboard, it should work with 100BaseTx.
+
+ The "Netelligent 10 T/2 PCI UTP/Coax" (b012) device is untested,
+ but I do not expect any problems.
+
+
+II. Driver Options
+ 1. You can append debug=x to the end of the insmod line to get
+ debug messages, where x is a bit field where the bits mean
+ the following:
+
+ 0x01 Turn on general debugging messages.
+ 0x02 Turn on receive debugging messages.
+ 0x04 Turn on transmit debugging messages.
+ 0x08 Turn on list debugging messages.
+
+ 2. You can append aui=1 to the end of the insmod line to cause
+ the adapter to use the AUI interface instead of the 10 Base T
+ interface. This is also what to do if you want to use the BNC
+ connector on a TLAN based device. (Setting this option on a
+ device that does not have an AUI/BNC connector will probably
+ cause it to not function correctly.)
+
+ 3. You can set duplex=1 to force half duplex, and duplex=2 to
+ force full duplex.
+
+ 4. You can set speed=10 to force 10Mbs operation, and speed=100
+ to force 100Mbs operation. (I'm not sure what will happen
+ if a card which only supports 10Mbs is forced into 100Mbs
+ mode.)
+
+ 5. You have to use speed=X duplex=Y together now. If you just
+ do "insmod tlan.o speed=100" the driver will do Auto-Neg.
+ To force a 10Mbps Half-Duplex link do "insmod tlan.o speed=10
+ duplex=1".
+
+ 6. If the driver is built into the kernel, you can use the 3rd
+ and 4th parameters to set aui and debug respectively. For
+ example:
+
+ ether=0,0,0x1,0x7,eth0
+
+ This sets aui to 0x1 and debug to 0x7, assuming eth0 is a
+ supported TLAN device.
+
+ The bits in the third byte are assigned as follows:
+
+ 0x01 = aui
+ 0x02 = use half duplex
+ 0x04 = use full duplex
+ 0x08 = use 10BaseT
+ 0x10 = use 100BaseTx
+
+ You also need to set both speed and duplex settings when forcing
+ speeds with kernel-parameters.
+ ether=0,0,0x12,0,eth0 will force link to 100Mbps Half-Duplex.
+
+ 7. If you have more than one tlan adapter in your system, you can
+ use the above options on a per adapter basis. To force a 100Mbit/HD
+ link with your eth1 adapter use:
+
+ insmod tlan speed=0,100 duplex=0,1
+
+ Now eth0 will use auto-neg and eth1 will be forced to 100Mbit/HD.
+ Note that the tlan driver supports a maximum of 8 adapters.
+
+
+III. Things to try if you have problems.
+ 1. Make sure your card's PCI id is among those listed in
+ section I, above.
+ 2. Make sure routing is correct.
+ 3. Try forcing different speed/duplex settings
+
+
+There is also a tlan mailing list which you can join by sending "subscribe tlan"
+in the body of an email to majordomo@vuser.vu.union.edu.
+There is also a tlan website at http://opensource.compaq.com
+
diff --git a/Documentation/networking/tms380tr.txt b/Documentation/networking/tms380tr.txt
new file mode 100644
index 0000000..1f73e13
--- /dev/null
+++ b/Documentation/networking/tms380tr.txt
@@ -0,0 +1,147 @@
+Text file for the Linux SysKonnect Token Ring ISA/PCI Adapter Driver.
+ Text file by: Jay Schulist <jschlst@samba.org>
+
+The Linux SysKonnect Token Ring driver works with the SysKonnect TR4/16(+) ISA,
+SysKonnect TR4/16(+) PCI, SysKonnect TR4/16 PCI, and older revisions of the
+SK NET TR4/16 ISA card.
+
+Latest information on this driver can be obtained on the Linux-SNA WWW site.
+Please point your browser to:
+http://www.linux-sna.org
+
+Many thanks to Christoph Goos for his excellent work on this driver and
+SysKonnect for donating the adapters to Linux-SNA for the testing and
+maintenance of this device driver.
+
+Important information to be noted:
+1. Adapters can be slow to open (~20 secs) and close (~5 secs), please be
+ patient.
+2. This driver works very well when autoprobing for adapters. Why even
+ think about those nasty io/int/dma settings of modprobe when the driver
+ will do it all for you!
+
+This driver is rather simple to use. Select Y to Token Ring adapter support
+in the kernel configuration. A choice for SysKonnect Token Ring adapters will
+appear. This drives supports all SysKonnect ISA and PCI adapters. Choose this
+option. I personally recommend compiling the driver as a module (M), but if you
+you would like to compile it statically answer Y instead.
+
+This driver supports multiple adapters without the need to load multiple copies
+of the driver. You should be able to load up to 7 adapters without any kernel
+modifications, if you are in need of more please contact the maintainer of this
+driver.
+
+Load the driver either by lilo/loadlin or as a module. When a module using the
+following command will suffice for most:
+
+# modprobe sktr
+
+This will produce output similar to the following: (Output is user specific)
+
+sktr.c: v1.01 08/29/97 by Christoph Goos
+tr0: SK NET TR 4/16 PCI found at 0x6100, using IRQ 17.
+tr1: SK NET TR 4/16 PCI found at 0x6200, using IRQ 16.
+tr2: SK NET TR 4/16 ISA found at 0xa20, using IRQ 10 and DMA 5.
+
+Now just setup the device via ifconfig and set and routes you may have. After
+this you are ready to start sending some tokens.
+
+Errata:
+For anyone wondering where to pick up the SysKonnect adapters please browse
+to http://www.syskonnect.com
+
+This driver is under the GNU General Public License. Its Firmware image is
+included as an initialized C-array and is licensed by SysKonnect to the Linux
+users of this driver. However no warranty about its fitness is expressed or
+implied by SysKonnect.
+
+Below find attached the setting for the SK NET TR 4/16 ISA adapters
+-------------------------------------------------------------------
+
+ ***************************
+ *** C O N T E N T S ***
+ ***************************
+
+ 1) Location of DIP-Switch W1
+ 2) Default settings
+ 3) DIP-Switch W1 description
+
+
+ ==============================================================
+ CHAPTER 1 LOCATION OF DIP-SWITCH
+ ==============================================================
+
+UÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ¿
+þUÄÄÄÄÄÄ¿ UÄÄÄÄÄ¿ UÄÄÄ¿ þ
+þAÄÄÄÄÄÄU W1 AÄÄÄÄÄU UÄÄÄÄ¿ þ þ þ
+þUÄÄÄÄÄÄ¿ þ þ þ þ UÄÄÅ¿
+þAÄÄÄÄÄÄU UÄÄÄÄÄÄÄÄÄÄÄ¿ AÄÄÄÄU þ þ þ þþ
+þUÄÄÄÄÄÄ¿ þ þ UÄÄÄ¿ AÄÄÄU AÄÄÅU
+þAÄÄÄÄÄÄU þ TMS380C26 þ þ þ þ
+þUÄÄÄÄÄÄ¿ þ þ AÄÄÄU AÄ¿
+þAÄÄÄÄÄÄU þ þ þ þ
+þ AÄÄÄÄÄÄÄÄÄÄÄU þ þ
+þ þ þ
+þ AÄU
+þ þ
+þ þ
+þ þ
+þ þ
+AÄÄÄÄÄÄÄÄÄÄÄÄAÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄAÄÄAÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄAÄÄÄÄÄÄÄÄÄU
+ AÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄU AÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄU
+
+ ==============================================================
+ CHAPTER 2 DEFAULT SETTINGS
+ ==============================================================
+
+ W1 1 2 3 4 5 6 7 8
+ +------------------------------+
+ | ON X |
+ | OFF X X X X X X X |
+ +------------------------------+
+
+ W1.1 = ON Adapter drives address lines SA17..19
+ W1.2 - 1.5 = OFF BootROM disabled
+ W1.6 - 1.8 = OFF I/O address 0A20h
+
+ ==============================================================
+ CHAPTER 3 DIP SWITCH W1 DESCRIPTION
+ ==============================================================
+
+ UÄÄÄAÄÄÄAÄÄÄAÄÄÄAÄÄÄAÄÄÄAÄÄÄAÄÄÄ¿ ON
+ þ 1 þ 2 þ 3 þ 4 þ 5 þ 6 þ 7 þ 8 þ
+ AÄÄÄAÄÄÄAÄÄÄAÄÄÄAÄÄÄAÄÄÄAÄÄÄAÄÄÄU OFF
+ |AD | BootROM Addr. | I/O |
+ +-+-+-------+-------+-----+-----+
+ | | |
+ | | +------ 6 7 8
+ | | ON ON ON 1900h
+ | | ON ON OFF 0900h
+ | | ON OFF ON 1980h
+ | | ON OFF OFF 0980h
+ | | OFF ON ON 1b20h
+ | | OFF ON OFF 0b20h
+ | | OFF OFF ON 1a20h
+ | | OFF OFF OFF 0a20h (+)
+ | |
+ | |
+ | +-------- 2 3 4 5
+ | OFF x x x disabled (+)
+ | ON ON ON ON C0000
+ | ON ON ON OFF C4000
+ | ON ON OFF ON C8000
+ | ON ON OFF OFF CC000
+ | ON OFF ON ON D0000
+ | ON OFF ON OFF D4000
+ | ON OFF OFF ON D8000
+ | ON OFF OFF OFF DC000
+ |
+ |
+ +----- 1
+ OFF adapter does NOT drive SA<17..19>
+ ON adapter drives SA<17..19> (+)
+
+
+ (+) means default setting
+
+ ********************************
diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt
new file mode 100644
index 0000000..7b5996d
--- /dev/null
+++ b/Documentation/networking/tproxy.txt
@@ -0,0 +1,85 @@
+Transparent proxy support
+=========================
+
+This feature adds Linux 2.2-like transparent proxy support to current kernels.
+To use it, enable NETFILTER_TPROXY, the socket match and the TPROXY target in
+your kernel config. You will need policy routing too, so be sure to enable that
+as well.
+
+
+1. Making non-local sockets work
+================================
+
+The idea is that you identify packets with destination address matching a local
+socket on your box, set the packet mark to a certain value, and then match on that
+value using policy routing to have those packets delivered locally:
+
+# iptables -t mangle -N DIVERT
+# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT
+# iptables -t mangle -A DIVERT -j MARK --set-mark 1
+# iptables -t mangle -A DIVERT -j ACCEPT
+
+# ip rule add fwmark 1 lookup 100
+# ip route add local 0.0.0.0/0 dev lo table 100
+
+Because of certain restrictions in the IPv4 routing output code you'll have to
+modify your application to allow it to send datagrams _from_ non-local IP
+addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket
+option before calling bind:
+
+fd = socket(AF_INET, SOCK_STREAM, 0);
+/* - 8< -*/
+int value = 1;
+setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value));
+/* - 8< -*/
+name.sin_family = AF_INET;
+name.sin_port = htons(0xCAFE);
+name.sin_addr.s_addr = htonl(0xDEADBEEF);
+bind(fd, &name, sizeof(name));
+
+A trivial patch for netcat is available here:
+http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch
+
+
+2. Redirecting traffic
+======================
+
+Transparent proxying often involves "intercepting" traffic on a router. This is
+usually done with the iptables REDIRECT target; however, there are serious
+limitations of that method. One of the major issues is that it actually
+modifies the packets to change the destination address -- which might not be
+acceptable in certain situations. (Think of proxying UDP for example: you won't
+be able to find out the original destination address. Even in case of TCP
+getting the original destination address is racy.)
+
+The 'TPROXY' target provides similar functionality without relying on NAT. Simply
+add rules like this to the iptables ruleset above:
+
+# iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \
+ --tproxy-mark 0x1/0x1 --on-port 50080
+
+Note that for this to work you'll have to modify the proxy to enable (SOL_IP,
+IP_TRANSPARENT) for the listening socket.
+
+
+3. Iptables extensions
+======================
+
+To use tproxy you'll need to have the 'socket' and 'TPROXY' modules
+compiled for iptables. A patched version of iptables is available
+here: http://git.balabit.hu/?p=bazsi/iptables-tproxy.git
+
+
+4. Application support
+======================
+
+4.1. Squid
+----------
+
+Squid 3.HEAD has support built-in. To use it, pass
+'--enable-linux-netfilter' to configure and set the 'tproxy' option on
+the HTTP listener you redirect traffic to with the TPROXY iptables
+target.
+
+For more information please consult the following page on the Squid
+wiki: http://wiki.squid-cache.org/Features/Tproxy4
diff --git a/Documentation/networking/tuntap.txt b/Documentation/networking/tuntap.txt
new file mode 100644
index 0000000..839cbb7
--- /dev/null
+++ b/Documentation/networking/tuntap.txt
@@ -0,0 +1,150 @@
+Universal TUN/TAP device driver.
+Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com>
+
+ Linux, Solaris drivers
+ Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com>
+
+ FreeBSD TAP driver
+ Copyright (c) 1999-2000 Maksim Yevmenkin <m_evmenkin@yahoo.com>
+
+ Revision of this document 2002 by Florian Thiel <florian.thiel@gmx.net>
+
+1. Description
+ TUN/TAP provides packet reception and transmission for user space programs.
+ It can be seen as a simple Point-to-Point or Ethernet device, which,
+ instead of receiving packets from physical media, receives them from
+ user space program and instead of sending packets via physical media
+ writes them to the user space program.
+
+ In order to use the driver a program has to open /dev/net/tun and issue a
+ corresponding ioctl() to register a network device with the kernel. A network
+ device will appear as tunXX or tapXX, depending on the options chosen. When
+ the program closes the file descriptor, the network device and all
+ corresponding routes will disappear.
+
+ Depending on the type of device chosen the userspace program has to read/write
+ IP packets (with tun) or ethernet frames (with tap). Which one is being used
+ depends on the flags given with the ioctl().
+
+ The package from http://vtun.sourceforge.net/tun contains two simple examples
+ for how to use tun and tap devices. Both programs work like a bridge between
+ two network interfaces.
+ br_select.c - bridge based on select system call.
+ br_sigio.c - bridge based on async io and SIGIO signal.
+ However, the best example is VTun http://vtun.sourceforge.net :))
+
+2. Configuration
+ Create device node:
+ mkdir /dev/net (if it doesn't exist already)
+ mknod /dev/net/tun c 10 200
+
+ Set permissions:
+ e.g. chmod 0666 /dev/net/tun
+ There's no harm in allowing the device to be accessible by non-root users,
+ since CAP_NET_ADMIN is required for creating network devices or for
+ connecting to network devices which aren't owned by the user in question.
+ If you want to create persistent devices and give ownership of them to
+ unprivileged users, then you need the /dev/net/tun device to be usable by
+ those users.
+
+ Driver module autoloading
+
+ Make sure that "Kernel module loader" - module auto-loading
+ support is enabled in your kernel. The kernel should load it on
+ first access.
+
+ Manual loading
+ insert the module by hand:
+ modprobe tun
+
+ If you do it the latter way, you have to load the module every time you
+ need it, if you do it the other way it will be automatically loaded when
+ /dev/net/tun is being opened.
+
+3. Program interface
+ 3.1 Network device allocation:
+
+ char *dev should be the name of the device with a format string (e.g.
+ "tun%d"), but (as far as I can see) this can be any valid network device name.
+ Note that the character pointer becomes overwritten with the real device name
+ (e.g. "tun0")
+
+ #include <linux/if.h>
+ #include <linux/if_tun.h>
+
+ int tun_alloc(char *dev)
+ {
+ struct ifreq ifr;
+ int fd, err;
+
+ if( (fd = open("/dev/net/tun", O_RDWR)) < 0 )
+ return tun_alloc_old(dev);
+
+ memset(&ifr, 0, sizeof(ifr));
+
+ /* Flags: IFF_TUN - TUN device (no Ethernet headers)
+ * IFF_TAP - TAP device
+ *
+ * IFF_NO_PI - Do not provide packet information
+ */
+ ifr.ifr_flags = IFF_TUN;
+ if( *dev )
+ strncpy(ifr.ifr_name, dev, IFNAMSIZ);
+
+ if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ){
+ close(fd);
+ return err;
+ }
+ strcpy(dev, ifr.ifr_name);
+ return fd;
+ }
+
+ 3.2 Frame format:
+ If flag IFF_NO_PI is not set each frame format is:
+ Flags [2 bytes]
+ Proto [2 bytes]
+ Raw protocol(IP, IPv6, etc) frame.
+
+Universal TUN/TAP device driver Frequently Asked Question.
+
+1. What platforms are supported by TUN/TAP driver ?
+Currently driver has been written for 3 Unices:
+ Linux kernels 2.2.x, 2.4.x
+ FreeBSD 3.x, 4.x, 5.x
+ Solaris 2.6, 7.0, 8.0
+
+2. What is TUN/TAP driver used for?
+As mentioned above, main purpose of TUN/TAP driver is tunneling.
+It is used by VTun (http://vtun.sourceforge.net).
+
+Another interesting application using TUN/TAP is pipsecd
+(http://perso.enst.fr/~beyssac/pipsec/), an userspace IPSec
+implementation that can use complete kernel routing (unlike FreeS/WAN).
+
+3. How does Virtual network device actually work ?
+Virtual network device can be viewed as a simple Point-to-Point or
+Ethernet device, which instead of receiving packets from a physical
+media, receives them from user space program and instead of sending
+packets via physical media sends them to the user space program.
+
+Let's say that you configured IPX on the tap0, then whenever
+the kernel sends an IPX packet to tap0, it is passed to the application
+(VTun for example). The application encrypts, compresses and sends it to
+the other side over TCP or UDP. The application on the other side decompresses
+and decrypts the data received and writes the packet to the TAP device,
+the kernel handles the packet like it came from real physical device.
+
+4. What is the difference between TUN driver and TAP driver?
+TUN works with IP frames. TAP works with Ethernet frames.
+
+This means that you have to read/write IP packets when you are using tun and
+ethernet frames when using tap.
+
+5. What is the difference between BPF and TUN/TAP driver?
+BPF is an advanced packet filter. It can be attached to existing
+network interface. It does not provide a virtual network interface.
+A TUN/TAP driver does provide a virtual network interface and it is possible
+to attach BPF to this interface.
+
+6. Does TAP driver support kernel Ethernet bridging?
+Yes. Linux and FreeBSD drivers support Ethernet bridging.
diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt
new file mode 100644
index 0000000..855d8da
--- /dev/null
+++ b/Documentation/networking/udplite.txt
@@ -0,0 +1,281 @@
+ ===========================================================================
+ The UDP-Lite protocol (RFC 3828)
+ ===========================================================================
+
+
+ UDP-Lite is a Standards-Track IETF transport protocol whose characteristic
+ is a variable-length checksum. This has advantages for transport of multimedia
+ (video, VoIP) over wireless networks, as partly damaged packets can still be
+ fed into the codec instead of being discarded due to a failed checksum test.
+
+ This file briefly describes the existing kernel support and the socket API.
+ For in-depth information, you can consult:
+
+ o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/
+ From here you can also download some example application source code.
+
+ o The UDP-Lite HOWTO on
+ http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt
+
+ o The Wireshark UDP-Lite WiKi (with capture files):
+ http://wiki.wireshark.org/Lightweight_User_Datagram_Protocol
+
+ o The Protocol Spec, RFC 3828, http://www.ietf.org/rfc/rfc3828.txt
+
+
+ I) APPLICATIONS
+
+ Several applications have been ported successfully to UDP-Lite. Ethereal
+ (now called wireshark) has UDP-Litev4/v6 support by default. The tarball on
+
+ http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/udplite_linux.tar.gz
+
+ has source code for several v4/v6 client-server and network testing examples.
+
+ Porting applications to UDP-Lite is straightforward: only socket level and
+ IPPROTO need to be changed; senders additionally set the checksum coverage
+ length (default = header length = 8). Details are in the next section.
+
+
+ II) PROGRAMMING API
+
+ UDP-Lite provides a connectionless, unreliable datagram service and hence
+ uses the same socket type as UDP. In fact, porting from UDP to UDP-Lite is
+ very easy: simply add `IPPROTO_UDPLITE' as the last argument of the socket(2)
+ call so that the statement looks like:
+
+ s = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE);
+
+ or, respectively,
+
+ s = socket(PF_INET6, SOCK_DGRAM, IPPROTO_UDPLITE);
+
+ With just the above change you are able to run UDP-Lite services or connect
+ to UDP-Lite servers. The kernel will assume that you are not interested in
+ using partial checksum coverage and so emulate UDP mode (full coverage).
+
+ To make use of the partial checksum coverage facilities requires setting a
+ single socket option, which takes an integer specifying the coverage length:
+
+ * Sender checksum coverage: UDPLITE_SEND_CSCOV
+
+ For example,
+
+ int val = 20;
+ setsockopt(s, SOL_UDPLITE, UDPLITE_SEND_CSCOV, &val, sizeof(int));
+
+ sets the checksum coverage length to 20 bytes (12b data + 8b header).
+ Of each packet only the first 20 bytes (plus the pseudo-header) will be
+ checksummed. This is useful for RTP applications which have a 12-byte
+ base header.
+
+
+ * Receiver checksum coverage: UDPLITE_RECV_CSCOV
+
+ This option is the receiver-side analogue. It is truly optional, i.e. not
+ required to enable traffic with partial checksum coverage. Its function is
+ that of a traffic filter: when enabled, it instructs the kernel to drop
+ all packets which have a coverage _less_ than this value. For example, if
+ RTP and UDP headers are to be protected, a receiver can enforce that only
+ packets with a minimum coverage of 20 are admitted:
+
+ int min = 20;
+ setsockopt(s, SOL_UDPLITE, UDPLITE_RECV_CSCOV, &min, sizeof(int));
+
+ The calls to getsockopt(2) are analogous. Being an extension and not a stand-
+ alone protocol, all socket options known from UDP can be used in exactly the
+ same manner as before, e.g. UDP_CORK or UDP_ENCAP.
+
+ A detailed discussion of UDP-Lite checksum coverage options is in section IV.
+
+
+ III) HEADER FILES
+
+ The socket API requires support through header files in /usr/include:
+
+ * /usr/include/netinet/in.h
+ to define IPPROTO_UDPLITE
+
+ * /usr/include/netinet/udplite.h
+ for UDP-Lite header fields and protocol constants
+
+ For testing purposes, the following can serve as a `mini' header file:
+
+ #define IPPROTO_UDPLITE 136
+ #define SOL_UDPLITE 136
+ #define UDPLITE_SEND_CSCOV 10
+ #define UDPLITE_RECV_CSCOV 11
+
+ Ready-made header files for various distros are in the UDP-Lite tarball.
+
+
+ IV) KERNEL BEHAVIOUR WITH REGARD TO THE VARIOUS SOCKET OPTIONS
+
+ To enable debugging messages, the log level need to be set to 8, as most
+ messages use the KERN_DEBUG level (7).
+
+ 1) Sender Socket Options
+
+ If the sender specifies a value of 0 as coverage length, the module
+ assumes full coverage, transmits a packet with coverage length of 0
+ and according checksum. If the sender specifies a coverage < 8 and
+ different from 0, the kernel assumes 8 as default value. Finally,
+ if the specified coverage length exceeds the packet length, the packet
+ length is used instead as coverage length.
+
+ 2) Receiver Socket Options
+
+ The receiver specifies the minimum value of the coverage length it
+ is willing to accept. A value of 0 here indicates that the receiver
+ always wants the whole of the packet covered. In this case, all
+ partially covered packets are dropped and an error is logged.
+
+ It is not possible to specify illegal values (<0 and <8); in these
+ cases the default of 8 is assumed.
+
+ All packets arriving with a coverage value less than the specified
+ threshold are discarded, these events are also logged.
+
+ 3) Disabling the Checksum Computation
+
+ On both sender and receiver, checksumming will always be performed
+ and cannot be disabled using SO_NO_CHECK. Thus
+
+ setsockopt(sockfd, SOL_SOCKET, SO_NO_CHECK, ... );
+
+ will always will be ignored, while the value of
+
+ getsockopt(sockfd, SOL_SOCKET, SO_NO_CHECK, &value, ...);
+
+ is meaningless (as in TCP). Packets with a zero checksum field are
+ illegal (cf. RFC 3828, sec. 3.1) and will be silently discarded.
+
+ 4) Fragmentation
+
+ The checksum computation respects both buffersize and MTU. The size
+ of UDP-Lite packets is determined by the size of the send buffer. The
+ minimum size of the send buffer is 2048 (defined as SOCK_MIN_SNDBUF
+ in include/net/sock.h), the default value is configurable as
+ net.core.wmem_default or via setting the SO_SNDBUF socket(7)
+ option. The maximum upper bound for the send buffer is determined
+ by net.core.wmem_max.
+
+ Given a payload size larger than the send buffer size, UDP-Lite will
+ split the payload into several individual packets, filling up the
+ send buffer size in each case.
+
+ The precise value also depends on the interface MTU. The interface MTU,
+ in turn, may trigger IP fragmentation. In this case, the generated
+ UDP-Lite packet is split into several IP packets, of which only the
+ first one contains the L4 header.
+
+ The send buffer size has implications on the checksum coverage length.
+ Consider the following example:
+
+ Payload: 1536 bytes Send Buffer: 1024 bytes
+ MTU: 1500 bytes Coverage Length: 856 bytes
+
+ UDP-Lite will ship the 1536 bytes in two separate packets:
+
+ Packet 1: 1024 payload + 8 byte header + 20 byte IP header = 1052 bytes
+ Packet 2: 512 payload + 8 byte header + 20 byte IP header = 540 bytes
+
+ The coverage packet covers the UDP-Lite header and 848 bytes of the
+ payload in the first packet, the second packet is fully covered. Note
+ that for the second packet, the coverage length exceeds the packet
+ length. The kernel always re-adjusts the coverage length to the packet
+ length in such cases.
+
+ As an example of what happens when one UDP-Lite packet is split into
+ several tiny fragments, consider the following example.
+
+ Payload: 1024 bytes Send buffer size: 1024 bytes
+ MTU: 300 bytes Coverage length: 575 bytes
+
+ +-+-----------+--------------+--------------+--------------+
+ |8| 272 | 280 | 280 | 280 |
+ +-+-----------+--------------+--------------+--------------+
+ 280 560 840 1032
+ ^
+ *****checksum coverage*************
+
+ The UDP-Lite module generates one 1032 byte packet (1024 + 8 byte
+ header). According to the interface MTU, these are split into 4 IP
+ packets (280 byte IP payload + 20 byte IP header). The kernel module
+ sums the contents of the entire first two packets, plus 15 bytes of
+ the last packet before releasing the fragments to the IP module.
+
+ To see the analogous case for IPv6 fragmentation, consider a link
+ MTU of 1280 bytes and a write buffer of 3356 bytes. If the checksum
+ coverage is less than 1232 bytes (MTU minus IPv6/fragment header
+ lengths), only the first fragment needs to be considered. When using
+ larger checksum coverage lengths, each eligible fragment needs to be
+ checksummed. Suppose we have a checksum coverage of 3062. The buffer
+ of 3356 bytes will be split into the following fragments:
+
+ Fragment 1: 1280 bytes carrying 1232 bytes of UDP-Lite data
+ Fragment 2: 1280 bytes carrying 1232 bytes of UDP-Lite data
+ Fragment 3: 948 bytes carrying 900 bytes of UDP-Lite data
+
+ The first two fragments have to be checksummed in full, of the last
+ fragment only 598 (= 3062 - 2*1232) bytes are checksummed.
+
+ While it is important that such cases are dealt with correctly, they
+ are (annoyingly) rare: UDP-Lite is designed for optimising multimedia
+ performance over wireless (or generally noisy) links and thus smaller
+ coverage lengths are likely to be expected.
+
+
+ V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING
+
+ Exceptional and error conditions are logged to syslog at the KERN_DEBUG
+ level. Live statistics about UDP-Lite are available in /proc/net/snmp
+ and can (with newer versions of netstat) be viewed using
+
+ netstat -svu
+
+ This displays UDP-Lite statistics variables, whose meaning is as follows.
+
+ InDatagrams: The total number of datagrams delivered to users.
+
+ NoPorts: Number of packets received to an unknown port.
+ These cases are counted separately (not as InErrors).
+
+ InErrors: Number of erroneous UDP-Lite packets. Errors include:
+ * internal socket queue receive errors
+ * packet too short (less than 8 bytes or stated
+ coverage length exceeds received length)
+ * xfrm4_policy_check() returned with error
+ * application has specified larger min. coverage
+ length than that of incoming packet
+ * checksum coverage violated
+ * bad checksum
+
+ OutDatagrams: Total number of sent datagrams.
+
+ These statistics derive from the UDP MIB (RFC 2013).
+
+
+ VI) IPTABLES
+
+ There is packet match support for UDP-Lite as well as support for the LOG target.
+ If you copy and paste the following line into /etc/protocols,
+
+ udplite 136 UDP-Lite # UDP-Lite [RFC 3828]
+
+ then
+ iptables -A INPUT -p udplite -j LOG
+
+ will produce logging output to syslog. Dropping and rejecting packets also works.
+
+
+ VII) MAINTAINER ADDRESS
+
+ The UDP-Lite patch was developed at
+ University of Aberdeen
+ Electronics Research Group
+ Department of Engineering
+ Fraser Noble Building
+ Aberdeen AB24 3UE; UK
+ The current maintainer is Gerrit Renker, <gerrit@erg.abdn.ac.uk>. Initial
+ code was developed by William Stanislaus, <william@erg.abdn.ac.uk>.
diff --git a/Documentation/networking/vortex.txt b/Documentation/networking/vortex.txt
new file mode 100644
index 0000000..bd70976
--- /dev/null
+++ b/Documentation/networking/vortex.txt
@@ -0,0 +1,448 @@
+Documentation/networking/vortex.txt
+Andrew Morton
+30 April 2000
+
+
+This document describes the usage and errata of the 3Com "Vortex" device
+driver for Linux, 3c59x.c.
+
+The driver was written by Donald Becker <becker@scyld.com>
+
+Don is no longer the prime maintainer of this version of the driver.
+Please report problems to one or more of:
+
+ Andrew Morton
+ Netdev mailing list <netdev@vger.kernel.org>
+ Linux kernel mailing list <linux-kernel@vger.kernel.org>
+
+Please note the 'Reporting and Diagnosing Problems' section at the end
+of this file.
+
+
+Since kernel 2.3.99-pre6, this driver incorporates the support for the
+3c575-series Cardbus cards which used to be handled by 3c575_cb.c.
+
+This driver supports the following hardware:
+
+ 3c590 Vortex 10Mbps
+ 3c592 EISA 10Mbps Demon/Vortex
+ 3c597 EISA Fast Demon/Vortex
+ 3c595 Vortex 100baseTx
+ 3c595 Vortex 100baseT4
+ 3c595 Vortex 100base-MII
+ 3c900 Boomerang 10baseT
+ 3c900 Boomerang 10Mbps Combo
+ 3c900 Cyclone 10Mbps TPO
+ 3c900 Cyclone 10Mbps Combo
+ 3c900 Cyclone 10Mbps TPC
+ 3c900B-FL Cyclone 10base-FL
+ 3c905 Boomerang 100baseTx
+ 3c905 Boomerang 100baseT4
+ 3c905B Cyclone 100baseTx
+ 3c905B Cyclone 10/100/BNC
+ 3c905B-FX Cyclone 100baseFx
+ 3c905C Tornado
+ 3c920B-EMB-WNM (ATI Radeon 9100 IGP)
+ 3c980 Cyclone
+ 3c980C Python-T
+ 3cSOHO100-TX Hurricane
+ 3c555 Laptop Hurricane
+ 3c556 Laptop Tornado
+ 3c556B Laptop Hurricane
+ 3c575 [Megahertz] 10/100 LAN CardBus
+ 3c575 Boomerang CardBus
+ 3CCFE575BT Cyclone CardBus
+ 3CCFE575CT Tornado CardBus
+ 3CCFE656 Cyclone CardBus
+ 3CCFEM656B Cyclone+Winmodem CardBus
+ 3CXFEM656C Tornado+Winmodem CardBus
+ 3c450 HomePNA Tornado
+ 3c920 Tornado
+ 3c982 Hydra Dual Port A
+ 3c982 Hydra Dual Port B
+ 3c905B-T4
+ 3c920B-EMB-WNM Tornado
+
+Module parameters
+=================
+
+There are several parameters which may be provided to the driver when
+its module is loaded. These are usually placed in /etc/modprobe.conf
+(/etc/modules.conf in 2.4). Example:
+
+options 3c59x debug=3 rx_copybreak=300
+
+If you are using the PCMCIA tools (cardmgr) then the options may be
+placed in /etc/pcmcia/config.opts:
+
+module "3c59x" opts "debug=3 rx_copybreak=300"
+
+
+The supported parameters are:
+
+debug=N
+
+ Where N is a number from 0 to 7. Anything above 3 produces a lot
+ of output in your system logs. debug=1 is default.
+
+options=N1,N2,N3,...
+
+ Each number in the list provides an option to the corresponding
+ network card. So if you have two 3c905's and you wish to provide
+ them with option 0x204 you would use:
+
+ options=0x204,0x204
+
+ The individual options are composed of a number of bitfields which
+ have the following meanings:
+
+ Possible media type settings
+ 0 10baseT
+ 1 10Mbs AUI
+ 2 undefined
+ 3 10base2 (BNC)
+ 4 100base-TX
+ 5 100base-FX
+ 6 MII (Media Independent Interface)
+ 7 Use default setting from EEPROM
+ 8 Autonegotiate
+ 9 External MII
+ 10 Use default setting from EEPROM
+
+ When generating a value for the 'options' setting, the above media
+ selection values may be OR'ed (or added to) the following:
+
+ 0x8000 Set driver debugging level to 7
+ 0x4000 Set driver debugging level to 2
+ 0x0400 Enable Wake-on-LAN
+ 0x0200 Force full duplex mode.
+ 0x0010 Bus-master enable bit (Old Vortex cards only)
+
+ For example:
+
+ insmod 3c59x options=0x204
+
+ will force full-duplex 100base-TX, rather than allowing the usual
+ autonegotiation.
+
+global_options=N
+
+ Sets the `options' parameter for all 3c59x NICs in the machine.
+ Entries in the `options' array above will override any setting of
+ this.
+
+full_duplex=N1,N2,N3...
+
+ Similar to bit 9 of 'options'. Forces the corresponding card into
+ full-duplex mode. Please use this in preference to the `options'
+ parameter.
+
+ In fact, please don't use this at all! You're better off getting
+ autonegotiation working properly.
+
+global_full_duplex=N1
+
+ Sets full duplex mode for all 3c59x NICs in the machine. Entries
+ in the `full_duplex' array above will override any setting of this.
+
+flow_ctrl=N1,N2,N3...
+
+ Use 802.3x MAC-layer flow control. The 3com cards only support the
+ PAUSE command, which means that they will stop sending packets for a
+ short period if they receive a PAUSE frame from the link partner.
+
+ The driver only allows flow control on a link which is operating in
+ full duplex mode.
+
+ This feature does not appear to work on the 3c905 - only 3c905B and
+ 3c905C have been tested.
+
+ The 3com cards appear to only respond to PAUSE frames which are
+ sent to the reserved destination address of 01:80:c2:00:00:01. They
+ do not honour PAUSE frames which are sent to the station MAC address.
+
+rx_copybreak=M
+
+ The driver preallocates 32 full-sized (1536 byte) network buffers
+ for receiving. When a packet arrives, the driver has to decide
+ whether to leave the packet in its full-sized buffer, or to allocate
+ a smaller buffer and copy the packet across into it.
+
+ This is a speed/space tradeoff.
+
+ The value of rx_copybreak is used to decide when to make the copy.
+ If the packet size is less than rx_copybreak, the packet is copied.
+ The default value for rx_copybreak is 200 bytes.
+
+max_interrupt_work=N
+
+ The driver's interrupt service routine can handle many receive and
+ transmit packets in a single invocation. It does this in a loop.
+ The value of max_interrupt_work governs how mnay times the interrupt
+ service routine will loop. The default value is 32 loops. If this
+ is exceeded the interrupt service routine gives up and generates a
+ warning message "eth0: Too much work in interrupt".
+
+hw_checksums=N1,N2,N3,...
+
+ Recent 3com NICs are able to generate IPv4, TCP and UDP checksums
+ in hardware. Linux has used the Rx checksumming for a long time.
+ The "zero copy" patch which is planned for the 2.4 kernel series
+ allows you to make use of the NIC's DMA scatter/gather and transmit
+ checksumming as well.
+
+ The driver is set up so that, when the zerocopy patch is applied,
+ all Tornado and Cyclone devices will use S/G and Tx checksums.
+
+ This module parameter has been provided so you can override this
+ decision. If you think that Tx checksums are causing a problem, you
+ may disable the feature with `hw_checksums=0'.
+
+ If you think your NIC should be performing Tx checksumming and the
+ driver isn't enabling it, you can force the use of hardware Tx
+ checksumming with `hw_checksums=1'.
+
+ The driver drops a message in the logfiles to indicate whether or
+ not it is using hardware scatter/gather and hardware Tx checksums.
+
+ Scatter/gather and hardware checksums provide considerable
+ performance improvement for the sendfile() system call, but a small
+ decrease in throughput for send(). There is no effect upon receive
+ efficiency.
+
+compaq_ioaddr=N
+compaq_irq=N
+compaq_device_id=N
+
+ "Variables to work-around the Compaq PCI BIOS32 problem"....
+
+watchdog=N
+
+ Sets the time duration (in milliseconds) after which the kernel
+ decides that the transmitter has become stuck and needs to be reset.
+ This is mainly for debugging purposes, although it may be advantageous
+ to increase this value on LANs which have very high collision rates.
+ The default value is 5000 (5.0 seconds).
+
+enable_wol=N1,N2,N3,...
+
+ Enable Wake-on-LAN support for the relevant interface. Donald
+ Becker's `ether-wake' application may be used to wake suspended
+ machines.
+
+ Also enables the NIC's power management support.
+
+global_enable_wol=N
+
+ Sets enable_wol mode for all 3c59x NICs in the machine. Entries in
+ the `enable_wol' array above will override any setting of this.
+
+Media selection
+---------------
+
+A number of the older NICs such as the 3c590 and 3c900 series have
+10base2 and AUI interfaces.
+
+Prior to January, 2001 this driver would autoeselect the 10base2 or AUI
+port if it didn't detect activity on the 10baseT port. It would then
+get stuck on the 10base2 port and a driver reload was necessary to
+switch back to 10baseT. This behaviour could not be prevented with a
+module option override.
+
+Later (current) versions of the driver _do_ support locking of the
+media type. So if you load the driver module with
+
+ modprobe 3c59x options=0
+
+it will permanently select the 10baseT port. Automatic selection of
+other media types does not occur.
+
+
+Transmit error, Tx status register 82
+-------------------------------------
+
+This is a common error which is almost always caused by another host on
+the same network being in full-duplex mode, while this host is in
+half-duplex mode. You need to find that other host and make it run in
+half-duplex mode or fix this host to run in full-duplex mode.
+
+As a last resort, you can force the 3c59x driver into full-duplex mode
+with
+
+ options 3c59x full_duplex=1
+
+but this has to be viewed as a workaround for broken network gear and
+should only really be used for equipment which cannot autonegotiate.
+
+
+Additional resources
+--------------------
+
+Details of the device driver implementation are at the top of the source file.
+
+Additional documentation is available at Don Becker's Linux Drivers site:
+
+ http://www.scyld.com/vortex.html
+
+Donald Becker's driver development site:
+
+ http://www.scyld.com/network.html
+
+Donald's vortex-diag program is useful for inspecting the NIC's state:
+
+ http://www.scyld.com/ethercard_diag.html
+
+Donald's mii-diag program may be used for inspecting and manipulating
+the NIC's Media Independent Interface subsystem:
+
+ http://www.scyld.com/ethercard_diag.html#mii-diag
+
+Donald's wake-on-LAN page:
+
+ http://www.scyld.com/wakeonlan.html
+
+3Com's DOS-based application for setting up the NICs EEPROMs:
+
+ ftp://ftp.3com.com/pub/nic/3c90x/3c90xx2.exe
+
+
+Autonegotiation notes
+---------------------
+
+ The driver uses a one-minute heartbeat for adapting to changes in
+ the external LAN environment if link is up and 5 seconds if link is down.
+ This means that when, for example, a machine is unplugged from a hubbed
+ 10baseT LAN plugged into a switched 100baseT LAN, the throughput
+ will be quite dreadful for up to sixty seconds. Be patient.
+
+ Cisco interoperability note from Walter Wong <wcw+@CMU.EDU>:
+
+ On a side note, adding HAS_NWAY seems to share a problem with the
+ Cisco 6509 switch. Specifically, you need to change the spanning
+ tree parameter for the port the machine is plugged into to 'portfast'
+ mode. Otherwise, the negotiation fails. This has been an issue
+ we've noticed for a while but haven't had the time to track down.
+
+ Cisco switches (Jeff Busch <jbusch@deja.com>)
+
+ My "standard config" for ports to which PC's/servers connect directly:
+
+ interface FastEthernet0/N
+ description machinename
+ load-interval 30
+ spanning-tree portfast
+
+ If autonegotiation is a problem, you may need to specify "speed
+ 100" and "duplex full" as well (or "speed 10" and "duplex half").
+
+ WARNING: DO NOT hook up hubs/switches/bridges to these
+ specially-configured ports! The switch will become very confused.
+
+
+Reporting and diagnosing problems
+---------------------------------
+
+Maintainers find that accurate and complete problem reports are
+invaluable in resolving driver problems. We are frequently not able to
+reproduce problems and must rely on your patience and efforts to get to
+the bottom of the problem.
+
+If you believe you have a driver problem here are some of the
+steps you should take:
+
+- Is it really a driver problem?
+
+ Eliminate some variables: try different cards, different
+ computers, different cables, different ports on the switch/hub,
+ different versions of the kernel or of the driver, etc.
+
+- OK, it's a driver problem.
+
+ You need to generate a report. Typically this is an email to the
+ maintainer and/or linux-net@vger.kernel.org. The maintainer's
+ email address will be in the driver source or in the MAINTAINERS file.
+
+- The contents of your report will vary a lot depending upon the
+ problem. If it's a kernel crash then you should refer to the
+ REPORTING-BUGS file.
+
+ But for most problems it is useful to provide the following:
+
+ o Kernel version, driver version
+
+ o A copy of the banner message which the driver generates when
+ it is initialised. For example:
+
+ eth0: 3Com PCI 3c905C Tornado at 0xa400, 00:50:da:6a:88:f0, IRQ 19
+ 8K byte-wide RAM 5:3 Rx:Tx split, autoselect/Autonegotiate interface.
+ MII transceiver found at address 24, status 782d.
+ Enabling bus-master transmits and whole-frame receives.
+
+ NOTE: You must provide the `debug=2' modprobe option to generate
+ a full detection message. Please do this:
+
+ modprobe 3c59x debug=2
+
+ o If it is a PCI device, the relevant output from 'lspci -vx', eg:
+
+ 00:09.0 Ethernet controller: 3Com Corporation 3c905C-TX [Fast Etherlink] (rev 74)
+ Subsystem: 3Com Corporation: Unknown device 9200
+ Flags: bus master, medium devsel, latency 32, IRQ 19
+ I/O ports at a400 [size=128]
+ Memory at db000000 (32-bit, non-prefetchable) [size=128]
+ Expansion ROM at <unassigned> [disabled] [size=128K]
+ Capabilities: [dc] Power Management version 2
+ 00: b7 10 00 92 07 00 10 02 74 00 00 02 08 20 00 00
+ 10: 01 a4 00 00 00 00 00 db 00 00 00 00 00 00 00 00
+ 20: 00 00 00 00 00 00 00 00 00 00 00 00 b7 10 00 10
+ 30: 00 00 00 00 dc 00 00 00 00 00 00 00 05 01 0a 0a
+
+ o A description of the environment: 10baseT? 100baseT?
+ full/half duplex? switched or hubbed?
+
+ o Any additional module parameters which you may be providing to the driver.
+
+ o Any kernel logs which are produced. The more the merrier.
+ If this is a large file and you are sending your report to a
+ mailing list, mention that you have the logfile, but don't send
+ it. If you're reporting direct to the maintainer then just send
+ it.
+
+ To ensure that all kernel logs are available, add the
+ following line to /etc/syslog.conf:
+
+ kern.* /var/log/messages
+
+ Then restart syslogd with:
+
+ /etc/rc.d/init.d/syslog restart
+
+ (The above may vary, depending upon which Linux distribution you use).
+
+ o If your problem is reproducible then that's great. Try the
+ following:
+
+ 1) Increase the debug level. Usually this is done via:
+
+ a) modprobe driver debug=7
+ b) In /etc/modprobe.conf (or /etc/modules.conf for 2.4):
+ options driver debug=7
+
+ 2) Recreate the problem with the higher debug level,
+ send all logs to the maintainer.
+
+ 3) Download you card's diagnostic tool from Donald
+ Becker's website <http://www.scyld.com/ethercard_diag.html>.
+ Download mii-diag.c as well. Build these.
+
+ a) Run 'vortex-diag -aaee' and 'mii-diag -v' when the card is
+ working correctly. Save the output.
+
+ b) Run the above commands when the card is malfunctioning. Send
+ both sets of output.
+
+Finally, please be patient and be prepared to do some work. You may
+end up working on this problem for a week or more as the maintainer
+asks more questions, asks for more tests, asks for patches to be
+applied, etc. At the end of it all, the problem may even remain
+unresolved.
diff --git a/Documentation/networking/wavelan.txt b/Documentation/networking/wavelan.txt
new file mode 100644
index 0000000..afa6e52
--- /dev/null
+++ b/Documentation/networking/wavelan.txt
@@ -0,0 +1,73 @@
+ The Wavelan drivers saga
+ ------------------------
+
+ By Jean Tourrilhes <jt@hpl.hp.com>
+
+ The Wavelan is a Radio network adapter designed by
+Lucent. Under this generic name is hidden quite a variety of hardware,
+and many Linux driver to support it.
+ The get the full story on Wireless LANs, please consult :
+ http://www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/
+
+"wavelan" driver (old ISA Wavelan)
+----------------
+ o Config : Network device -> Wireless LAN -> AT&T WaveLAN
+ o Location : .../drivers/net/wireless/wavelan*
+ o in-line doc : .../drivers/net/wireless/wavelan.p.h
+ o on-line doc :
+ http://www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Wavelan.html
+
+ This is the driver for the ISA version of the first generation
+of the Wavelan, now discontinued. The device is 2 Mb/s, composed of a
+Intel 82586 controller and a Lucent Modem, and is NOT 802.11 compliant.
+ The driver has been tested with the following hardware :
+ o Wavelan ISA 915 MHz (full length ISA card)
+ o Wavelan ISA 915 MHz 2.0 (half length ISA card)
+ o Wavelan ISA 2.4 GHz (full length ISA card, fixed frequency)
+ o Wavelan ISA 2.4 GHz 2.0 (half length ISA card, frequency selectable)
+ o Above cards with the optional DES encryption feature
+
+"wavelan_cs" driver (old Pcmcia Wavelan)
+-------------------
+ o Config : Network device -> PCMCIA network ->
+ Pcmcia Wireless LAN -> AT&T/Lucent WaveLAN
+ o Location : .../drivers/net/pcmcia/wavelan*
+ o in-line doc : .../drivers/net/pcmcia/wavelan_cs.h
+ o on-line doc :
+ http://www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Wavelan.html
+
+ This is the driver for the PCMCIA version of the first
+generation of the Wavelan, now discontinued. The device is 2 Mb/s,
+composed of a Intel 82593 controller (totally different from the 82586)
+and a Lucent Modem, and NOT 802.11 compatible.
+ The driver has been tested with the following hardware :
+ o Wavelan Pcmcia 915 MHz 2.0 (Pcmcia card + separate
+ modem/antenna block)
+ o Wavelan Pcmcia 2.4 GHz 2.0 (Pcmcia card + separate
+ modem/antenna block)
+
+"wvlan_cs" driver (Wavelan IEEE, GPL)
+-----------------
+ o Config : Not yet in kernel
+ o Location : Pcmcia package 3.1.10+
+ o on-line doc : http://www.fasta.fh-dortmund.de/users/andy/wvlan/
+
+ This is the driver for the current generation of Wavelan IEEE,
+which is 802.11 compatible. Depending on version, it is 2 Mb/s or 11
+Mb/s, with or without encryption, all implemented in Lucent specific
+DSP (the Hermes).
+ This is a GPL full source PCMCIA driver (ISA is just a Pcmcia
+card with ISA-Pcmcia bridge).
+
+"wavelan2_cs" driver (Wavelan IEEE, binary)
+--------------------
+ o Config : Not yet in kernel
+ o Location : ftp://sourceforge.org/pcmcia/contrib/
+
+ This driver support exactly the same hardware as the previous
+driver, the main difference is that it is based on a binary library
+and supported by Lucent.
+
+ I hope it clears the confusion ;-)
+
+ Jean
diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt
new file mode 100644
index 0000000..975cc87
--- /dev/null
+++ b/Documentation/networking/x25-iface.txt
@@ -0,0 +1,123 @@
+ X.25 Device Driver Interface 1.1
+
+ Jonathan Naylor 26.12.96
+
+This is a description of the messages to be passed between the X.25 Packet
+Layer and the X.25 device driver. They are designed to allow for the easy
+setting of the LAPB mode from within the Packet Layer.
+
+The X.25 device driver will be coded normally as per the Linux device driver
+standards. Most X.25 device drivers will be moderately similar to the
+already existing Ethernet device drivers. However unlike those drivers, the
+X.25 device driver has a state associated with it, and this information
+needs to be passed to and from the Packet Layer for proper operation.
+
+All messages are held in sk_buff's just like real data to be transmitted
+over the LAPB link. The first byte of the skbuff indicates the meaning of
+the rest of the skbuff, if any more information does exist.
+
+
+Packet Layer to Device Driver
+-----------------------------
+
+First Byte = 0x00
+
+This indicates that the rest of the skbuff contains data to be transmitted
+over the LAPB link. The LAPB link should already exist before any data is
+passed down.
+
+First Byte = 0x01
+
+Establish the LAPB link. If the link is already established then the connect
+confirmation message should be returned as soon as possible.
+
+First Byte = 0x02
+
+Terminate the LAPB link. If it is already disconnected then the disconnect
+confirmation message should be returned as soon as possible.
+
+First Byte = 0x03
+
+LAPB parameters. To be defined.
+
+
+Device Driver to Packet Layer
+-----------------------------
+
+First Byte = 0x00
+
+This indicates that the rest of the skbuff contains data that has been
+received over the LAPB link.
+
+First Byte = 0x01
+
+LAPB link has been established. The same message is used for both a LAPB
+link connect_confirmation and a connect_indication.
+
+First Byte = 0x02
+
+LAPB link has been terminated. This same message is used for both a LAPB
+link disconnect_confirmation and a disconnect_indication.
+
+First Byte = 0x03
+
+LAPB parameters. To be defined.
+
+
+
+Possible Problems
+=================
+
+(Henner Eisen, 2000-10-28)
+
+The X.25 packet layer protocol depends on a reliable datalink service.
+The LAPB protocol provides such reliable service. But this reliability
+is not preserved by the Linux network device driver interface:
+
+- With Linux 2.4.x (and above) SMP kernels, packet ordering is not
+ preserved. Even if a device driver calls netif_rx(skb1) and later
+ netif_rx(skb2), skb2 might be delivered to the network layer
+ earlier that skb1.
+- Data passed upstream by means of netif_rx() might be dropped by the
+ kernel if the backlog queue is congested.
+
+The X.25 packet layer protocol will detect this and reset the virtual
+call in question. But many upper layer protocols are not designed to
+handle such N-Reset events gracefully. And frequent N-Reset events
+will always degrade performance.
+
+Thus, driver authors should make netif_rx() as reliable as possible:
+
+SMP re-ordering will not occur if the driver's interrupt handler is
+always executed on the same CPU. Thus,
+
+- Driver authors should use irq affinity for the interrupt handler.
+
+The probability of packet loss due to backlog congestion can be
+reduced by the following measures or a combination thereof:
+
+(1) Drivers for kernel versions 2.4.x and above should always check the
+ return value of netif_rx(). If it returns NET_RX_DROP, the
+ driver's LAPB protocol must not confirm reception of the frame
+ to the peer.
+ This will reliably suppress packet loss. The LAPB protocol will
+ automatically cause the peer to re-transmit the dropped packet
+ later.
+ The lapb module interface was modified to support this. Its
+ data_indication() method should now transparently pass the
+ netif_rx() return value to the (lapb mopdule) caller.
+(2) Drivers for kernel versions 2.2.x should always check the global
+ variable netdev_dropping when a new frame is received. The driver
+ should only call netif_rx() if netdev_dropping is zero. Otherwise
+ the driver should not confirm delivery of the frame and drop it.
+ Alternatively, the driver can queue the frame internally and call
+ netif_rx() later when netif_dropping is 0 again. In that case, delivery
+ confirmation should also be deferred such that the internal queue
+ cannot grow to much.
+ This will not reliably avoid packet loss, but the probability
+ of packet loss in netif_rx() path will be significantly reduced.
+(3) Additionally, driver authors might consider to support
+ CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up
+ when a previously congested backlog queue becomes empty again.
+ The driver could uses this for flow-controlling the peer by means
+ of the LAPB protocol's flow-control service.
diff --git a/Documentation/networking/x25.txt b/Documentation/networking/x25.txt
new file mode 100644
index 0000000..c91c6d7
--- /dev/null
+++ b/Documentation/networking/x25.txt
@@ -0,0 +1,44 @@
+Linux X.25 Project
+
+As my third year dissertation at University I have taken it upon myself to
+write an X.25 implementation for Linux. My aim is to provide a complete X.25
+Packet Layer and a LAPB module to allow for "normal" X.25 to be run using
+Linux. There are two sorts of X.25 cards available, intelligent ones that
+implement LAPB on the card itself, and unintelligent ones that simply do
+framing, bit-stuffing and checksumming. These both need to be handled by the
+system.
+
+I therefore decided to write the implementation such that as far as the
+Packet Layer is concerned, the link layer was being performed by a lower
+layer of the Linux kernel and therefore it did not concern itself with
+implementation of LAPB. Therefore the LAPB modules would be called by
+unintelligent X.25 card drivers and not by intelligent ones, this would
+provide a uniform device driver interface, and simplify configuration.
+
+To confuse matters a little, an 802.2 LLC implementation for Linux is being
+written which will allow X.25 to be run over an Ethernet (or Token Ring) and
+conform with the JNT "Pink Book", this will have a different interface to
+the Packet Layer but there will be no confusion since the class of device
+being served by the LLC will be completely separate from LAPB. The LLC
+implementation is being done as part of another protocol project (SNA) and
+by a different author.
+
+Just when you thought that it could not become more confusing, another
+option appeared, XOT. This allows X.25 Packet Layer frames to operate over
+the Internet using TCP/IP as a reliable link layer. RFC1613 specifies the
+format and behaviour of the protocol. If time permits this option will also
+be actively considered.
+
+A linux-x25 mailing list has been created at vger.kernel.org to support the
+development and use of Linux X.25. It is early days yet, but interested
+parties are welcome to subscribe to it. Just send a message to
+majordomo@vger.kernel.org with the following in the message body:
+
+subscribe linux-x25
+end
+
+The contents of the Subject line are ignored.
+
+Jonathan
+
+g4klx@g4klx.demon.co.uk
diff --git a/Documentation/networking/xfrm_proc.txt b/Documentation/networking/xfrm_proc.txt
new file mode 100644
index 0000000..d0d8baf
--- /dev/null
+++ b/Documentation/networking/xfrm_proc.txt
@@ -0,0 +1,74 @@
+XFRM proc - /proc/net/xfrm_* files
+==================================
+Masahide NAKAMURA <nakam@linux-ipv6.org>
+
+
+Transformation Statistics
+-------------------------
+xfrm_proc is a statistics shown factor dropped by transformation
+for developer.
+It is a counter designed from current transformation source code
+and defined like linux private MIB.
+
+Inbound statistics
+~~~~~~~~~~~~~~~~~~
+XfrmInError:
+ All errors which is not matched others
+XfrmInBufferError:
+ No buffer is left
+XfrmInHdrError:
+ Header error
+XfrmInNoStates:
+ No state is found
+ i.e. Either inbound SPI, address, or IPsec protocol at SA is wrong
+XfrmInStateProtoError:
+ Transformation protocol specific error
+ e.g. SA key is wrong
+XfrmInStateModeError:
+ Transformation mode specific error
+XfrmInStateSeqError:
+ Sequence error
+ i.e. Sequence number is out of window
+XfrmInStateExpired:
+ State is expired
+XfrmInStateMismatch:
+ State has mismatch option
+ e.g. UDP encapsulation type is mismatch
+XfrmInStateInvalid:
+ State is invalid
+XfrmInTmplMismatch:
+ No matching template for states
+ e.g. Inbound SAs are correct but SP rule is wrong
+XfrmInNoPols:
+ No policy is found for states
+ e.g. Inbound SAs are correct but no SP is found
+XfrmInPolBlock:
+ Policy discards
+XfrmInPolError:
+ Policy error
+
+Outbound errors
+~~~~~~~~~~~~~~~
+XfrmOutError:
+ All errors which is not matched others
+XfrmOutBundleGenError:
+ Bundle generation error
+XfrmOutBundleCheckError:
+ Bundle check error
+XfrmOutNoStates:
+ No state is found
+XfrmOutStateProtoError:
+ Transformation protocol specific error
+XfrmOutStateModeError:
+ Transformation mode specific error
+XfrmOutStateSeqError:
+ Sequence error
+ i.e. Sequence number overflow
+XfrmOutStateExpired:
+ State is expired
+XfrmOutPolBlock:
+ Policy discards
+XfrmOutPolDead:
+ Policy is dead
+XfrmOutPolError:
+ Policy error
diff --git a/Documentation/networking/xfrm_sync.txt b/Documentation/networking/xfrm_sync.txt
new file mode 100644
index 0000000..d7aac9d
--- /dev/null
+++ b/Documentation/networking/xfrm_sync.txt
@@ -0,0 +1,169 @@
+
+The sync patches work is based on initial patches from
+Krisztian <hidden@balabit.hu> and others and additional patches
+from Jamal <hadi@cyberus.ca>.
+
+The end goal for syncing is to be able to insert attributes + generate
+events so that the an SA can be safely moved from one machine to another
+for HA purposes.
+The idea is to synchronize the SA so that the takeover machine can do
+the processing of the SA as accurate as possible if it has access to it.
+
+We already have the ability to generate SA add/del/upd events.
+These patches add ability to sync and have accurate lifetime byte (to
+ensure proper decay of SAs) and replay counters to avoid replay attacks
+with as minimal loss at failover time.
+This way a backup stays as closely uptodate as an active member.
+
+Because the above items change for every packet the SA receives,
+it is possible for a lot of the events to be generated.
+For this reason, we also add a nagle-like algorithm to restrict
+the events. i.e we are going to set thresholds to say "let me
+know if the replay sequence threshold is reached or 10 secs have passed"
+These thresholds are set system-wide via sysctls or can be updated
+per SA.
+
+The identified items that need to be synchronized are:
+- the lifetime byte counter
+note that: lifetime time limit is not important if you assume the failover
+machine is known ahead of time since the decay of the time countdown
+is not driven by packet arrival.
+- the replay sequence for both inbound and outbound
+
+1) Message Structure
+----------------------
+
+nlmsghdr:aevent_id:optional-TLVs.
+
+The netlink message types are:
+
+XFRM_MSG_NEWAE and XFRM_MSG_GETAE.
+
+A XFRM_MSG_GETAE does not have TLVs.
+A XFRM_MSG_NEWAE will have at least two TLVs (as is
+discussed further below).
+
+aevent_id structure looks like:
+
+ struct xfrm_aevent_id {
+ struct xfrm_usersa_id sa_id;
+ xfrm_address_t saddr;
+ __u32 flags;
+ __u32 reqid;
+ };
+
+The unique SA is identified by the combination of xfrm_usersa_id,
+reqid and saddr.
+
+flags are used to indicate different things. The possible
+flags are:
+ XFRM_AE_RTHR=1, /* replay threshold*/
+ XFRM_AE_RVAL=2, /* replay value */
+ XFRM_AE_LVAL=4, /* lifetime value */
+ XFRM_AE_ETHR=8, /* expiry timer threshold */
+ XFRM_AE_CR=16, /* Event cause is replay update */
+ XFRM_AE_CE=32, /* Event cause is timer expiry */
+ XFRM_AE_CU=64, /* Event cause is policy update */
+
+How these flags are used is dependent on the direction of the
+message (kernel<->user) as well the cause (config, query or event).
+This is described below in the different messages.
+
+The pid will be set appropriately in netlink to recognize direction
+(0 to the kernel and pid = processid that created the event
+when going from kernel to user space)
+
+A program needs to subscribe to multicast group XFRMNLGRP_AEVENTS
+to get notified of these events.
+
+2) TLVS reflect the different parameters:
+-----------------------------------------
+
+a) byte value (XFRMA_LTIME_VAL)
+This TLV carries the running/current counter for byte lifetime since
+last event.
+
+b)replay value (XFRMA_REPLAY_VAL)
+This TLV carries the running/current counter for replay sequence since
+last event.
+
+c)replay threshold (XFRMA_REPLAY_THRESH)
+This TLV carries the threshold being used by the kernel to trigger events
+when the replay sequence is exceeded.
+
+d) expiry timer (XFRMA_ETIMER_THRESH)
+This is a timer value in milliseconds which is used as the nagle
+value to rate limit the events.
+
+3) Default configurations for the parameters:
+----------------------------------------------
+
+By default these events should be turned off unless there is
+at least one listener registered to listen to the multicast
+group XFRMNLGRP_AEVENTS.
+
+Programs installing SAs will need to specify the two thresholds, however,
+in order to not change existing applications such as racoon
+we also provide default threshold values for these different parameters
+in case they are not specified.
+
+the two sysctls/proc entries are:
+a) /proc/sys/net/core/sysctl_xfrm_aevent_etime
+used to provide default values for the XFRMA_ETIMER_THRESH in incremental
+units of time of 100ms. The default is 10 (1 second)
+
+b) /proc/sys/net/core/sysctl_xfrm_aevent_rseqth
+used to provide default values for XFRMA_REPLAY_THRESH parameter
+in incremental packet count. The default is two packets.
+
+4) Message types
+----------------
+
+a) XFRM_MSG_GETAE issued by user-->kernel.
+XFRM_MSG_GETAE does not carry any TLVs.
+The response is a XFRM_MSG_NEWAE which is formatted based on what
+XFRM_MSG_GETAE queried for.
+The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
+*if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved
+*if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved
+
+b) XFRM_MSG_NEWAE is issued by either user space to configure
+or kernel to announce events or respond to a XFRM_MSG_GETAE.
+
+i) user --> kernel to configure a specific SA.
+any of the values or threshold parameters can be updated by passing the
+appropriate TLV.
+A response is issued back to the sender in user space to indicate success
+or failure.
+In the case of success, additionally an event with
+XFRM_MSG_NEWAE is also issued to any listeners as described in iii).
+
+ii) kernel->user direction as a response to XFRM_MSG_GETAE
+The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
+The threshold TLVs will be included if explicitly requested in
+the XFRM_MSG_GETAE message.
+
+iii) kernel->user to report as event if someone sets any values or
+thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above).
+In such a case XFRM_AE_CU flag is set to inform the user that
+the change happened as a result of an update.
+The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
+
+iv) kernel->user to report event when replay threshold or a timeout
+is exceeded.
+In such a case either XFRM_AE_CR (replay exceeded) or XFRM_AE_CE (timeout
+happened) is set to inform the user what happened.
+Note the two flags are mutually exclusive.
+The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
+
+Exceptions to threshold settings
+--------------------------------
+
+If you have an SA that is getting hit by traffic in bursts such that
+there is a period where the timer threshold expires with no packets
+seen, then an odd behavior is seen as follows:
+The first packet arrival after a timer expiry will trigger a timeout
+aevent; i.e we dont wait for a timeout period or a packet threshold
+to be reached. This is done for simplicity and efficiency reasons.
+
+-JHS
diff --git a/Documentation/networking/xfrm_sysctl.txt b/Documentation/networking/xfrm_sysctl.txt
new file mode 100644
index 0000000..5bbd167
--- /dev/null
+++ b/Documentation/networking/xfrm_sysctl.txt
@@ -0,0 +1,4 @@
+/proc/sys/net/core/xfrm_* Variables:
+
+xfrm_acq_expires - INTEGER
+ default 30 - hard timeout in seconds for acquire requests
diff --git a/Documentation/networking/z8530drv.txt b/Documentation/networking/z8530drv.txt
new file mode 100644
index 0000000..2206abb
--- /dev/null
+++ b/Documentation/networking/z8530drv.txt
@@ -0,0 +1,657 @@
+This is a subset of the documentation. To use this driver you MUST have the
+full package from:
+
+Internet:
+=========
+
+1. ftp://ftp.ccac.rwth-aachen.de/pub/jr/z8530drv-utils_3.0-3.tar.gz
+
+2. ftp://ftp.pspt.fi/pub/ham/linux/ax25/z8530drv-utils_3.0-3.tar.gz
+
+Please note that the information in this document may be hopelessly outdated.
+A new version of the documentation, along with links to other important
+Linux Kernel AX.25 documentation and programs, is available on
+http://yaina.de/jreuter
+
+-----------------------------------------------------------------------------
+
+
+ SCC.C - Linux driver for Z8530 based HDLC cards for AX.25
+
+ ********************************************************************
+
+ (c) 1993,2000 by Joerg Reuter DL1BKE <jreuter@yaina.de>
+
+ portions (c) 1993 Guido ten Dolle PE1NNZ
+
+ for the complete copyright notice see >> Copying.Z8530DRV <<
+
+ ********************************************************************
+
+
+1. Initialization of the driver
+===============================
+
+To use the driver, 3 steps must be performed:
+
+ 1. if compiled as module: loading the module
+ 2. Setup of hardware, MODEM and KISS parameters with sccinit
+ 3. Attach each channel to the Linux kernel AX.25 with "ifconfig"
+
+Unlike the versions below 2.4 this driver is a real network device
+driver. If you want to run xNOS instead of our fine kernel AX.25
+use a 2.x version (available from above sites) or read the
+AX.25-HOWTO on how to emulate a KISS TNC on network device drivers.
+
+
+1.1 Loading the module
+======================
+
+(If you're going to compile the driver as a part of the kernel image,
+ skip this chapter and continue with 1.2)
+
+Before you can use a module, you'll have to load it with
+
+ insmod scc.o
+
+please read 'man insmod' that comes with module-init-tools.
+
+You should include the insmod in one of the /etc/rc.d/rc.* files,
+and don't forget to insert a call of sccinit after that. It
+will read your /etc/z8530drv.conf.
+
+1.2. /etc/z8530drv.conf
+=======================
+
+To setup all parameters you must run /sbin/sccinit from one
+of your rc.*-files. This has to be done BEFORE you can
+"ifconfig" an interface. Sccinit reads the file /etc/z8530drv.conf
+and sets the hardware, MODEM and KISS parameters. A sample file is
+delivered with this package. Change it to your needs.
+
+The file itself consists of two main sections.
+
+1.2.1 configuration of hardware parameters
+==========================================
+
+The hardware setup section defines the following parameters for each
+Z8530:
+
+chip 1
+data_a 0x300 # data port A
+ctrl_a 0x304 # control port A
+data_b 0x301 # data port B
+ctrl_b 0x305 # control port B
+irq 5 # IRQ No. 5
+pclock 4915200 # clock
+board BAYCOM # hardware type
+escc no # enhanced SCC chip? (8580/85180/85280)
+vector 0 # latch for interrupt vector
+special no # address of special function register
+option 0 # option to set via sfr
+
+
+chip - this is just a delimiter to make sccinit a bit simpler to
+ program. A parameter has no effect.
+
+data_a - the address of the data port A of this Z8530 (needed)
+ctrl_a - the address of the control port A (needed)
+data_b - the address of the data port B (needed)
+ctrl_b - the address of the control port B (needed)
+
+irq - the used IRQ for this chip. Different chips can use different
+ IRQs or the same. If they share an interrupt, it needs to be
+ specified within one chip-definition only.
+
+pclock - the clock at the PCLK pin of the Z8530 (option, 4915200 is
+ default), measured in Hertz
+
+board - the "type" of the board:
+
+ SCC type value
+ ---------------------------------
+ PA0HZP SCC card PA0HZP
+ EAGLE card EAGLE
+ PC100 card PC100
+ PRIMUS-PC (DG9BL) card PRIMUS
+ BayCom (U)SCC card BAYCOM
+
+escc - if you want support for ESCC chips (8580, 85180, 85280), set
+ this to "yes" (option, defaults to "no")
+
+vector - address of the vector latch (aka "intack port") for PA0HZP
+ cards. There can be only one vector latch for all chips!
+ (option, defaults to 0)
+
+special - address of the special function register on several cards.
+ (option, defaults to 0)
+
+option - The value you write into that register (option, default is 0)
+
+You can specify up to four chips (8 channels). If this is not enough,
+just change
+
+ #define MAXSCC 4
+
+to a higher value.
+
+Example for the BAYCOM USCC:
+----------------------------
+
+chip 1
+data_a 0x300 # data port A
+ctrl_a 0x304 # control port A
+data_b 0x301 # data port B
+ctrl_b 0x305 # control port B
+irq 5 # IRQ No. 5 (#)
+board BAYCOM # hardware type (*)
+#
+# SCC chip 2
+#
+chip 2
+data_a 0x302
+ctrl_a 0x306
+data_b 0x303
+ctrl_b 0x307
+board BAYCOM
+
+An example for a PA0HZP card:
+-----------------------------
+
+chip 1
+data_a 0x153
+data_b 0x151
+ctrl_a 0x152
+ctrl_b 0x150
+irq 9
+pclock 4915200
+board PA0HZP
+vector 0x168
+escc no
+#
+#
+#
+chip 2
+data_a 0x157
+data_b 0x155
+ctrl_a 0x156
+ctrl_b 0x154
+irq 9
+pclock 4915200
+board PA0HZP
+vector 0x168
+escc no
+
+A DRSI would should probably work with this:
+--------------------------------------------
+(actually: two DRSI cards...)
+
+chip 1
+data_a 0x303
+data_b 0x301
+ctrl_a 0x302
+ctrl_b 0x300
+irq 7
+pclock 4915200
+board DRSI
+escc no
+#
+#
+#
+chip 2
+data_a 0x313
+data_b 0x311
+ctrl_a 0x312
+ctrl_b 0x310
+irq 7
+pclock 4915200
+board DRSI
+escc no
+
+Note that you cannot use the on-board baudrate generator off DRSI
+cards. Use "mode dpll" for clock source (see below).
+
+This is based on information provided by Mike Bilow (and verified
+by Paul Helay)
+
+The utility "gencfg"
+--------------------
+
+If you only know the parameters for the PE1CHL driver for DOS,
+run gencfg. It will generate the correct port addresses (I hope).
+Its parameters are exactly the same as the ones you use with
+the "attach scc" command in net, except that the string "init" must
+not appear. Example:
+
+gencfg 2 0x150 4 2 0 1 0x168 9 4915200
+
+will print a skeleton z8530drv.conf for the OptoSCC to stdout.
+
+gencfg 2 0x300 2 4 5 -4 0 7 4915200 0x10
+
+does the same for the BAYCOM USCC card. In my opinion it is much easier
+to edit scc_config.h...
+
+
+1.2.2 channel configuration
+===========================
+
+The channel definition is divided into three sub sections for each
+channel:
+
+An example for scc0:
+
+# DEVICE
+
+device scc0 # the device for the following params
+
+# MODEM / BUFFERS
+
+speed 1200 # the default baudrate
+clock dpll # clock source:
+ # dpll = normal half duplex operation
+ # external = MODEM provides own Rx/Tx clock
+ # divider = use full duplex divider if
+ # installed (1)
+mode nrzi # HDLC encoding mode
+ # nrzi = 1k2 MODEM, G3RUH 9k6 MODEM
+ # nrz = DF9IC 9k6 MODEM
+ #
+bufsize 384 # size of buffers. Note that this must include
+ # the AX.25 header, not only the data field!
+ # (optional, defaults to 384)
+
+# KISS (Layer 1)
+
+txdelay 36 # (see chapter 1.4)
+persist 64
+slot 8
+tail 8
+fulldup 0
+wait 12
+min 3
+maxkey 7
+idle 3
+maxdef 120
+group 0
+txoff off
+softdcd on
+slip off
+
+The order WITHIN these sections is unimportant. The order OF these
+sections IS important. The MODEM parameters are set with the first
+recognized KISS parameter...
+
+Please note that you can initialize the board only once after boot
+(or insmod). You can change all parameters but "mode" and "clock"
+later with the Sccparam program or through KISS. Just to avoid
+security holes...
+
+(1) this divider is usually mounted on the SCC-PBC (PA0HZP) or not
+ present at all (BayCom). It feeds back the output of the DPLL
+ (digital pll) as transmit clock. Using this mode without a divider
+ installed will normally result in keying the transceiver until
+ maxkey expires --- of course without sending anything (useful).
+
+2. Attachment of a channel by your AX.25 software
+=================================================
+
+2.1 Kernel AX.25
+================
+
+To set up an AX.25 device you can simply type:
+
+ ifconfig scc0 44.128.1.1 hw ax25 dl0tha-7
+
+This will create a network interface with the IP number 44.128.20.107
+and the callsign "dl0tha". If you do not have any IP number (yet) you
+can use any of the 44.128.0.0 network. Note that you do not need
+axattach. The purpose of axattach (like slattach) is to create a KISS
+network device linked to a TTY. Please read the documentation of the
+ax25-utils and the AX.25-HOWTO to learn how to set the parameters of
+the kernel AX.25.
+
+2.2 NOS, NET and TFKISS
+=======================
+
+Since the TTY driver (aka KISS TNC emulation) is gone you need
+to emulate the old behaviour. The cost of using these programs is
+that you probably need to compile the kernel AX.25, regardless of whether
+you actually use it or not. First setup your /etc/ax25/axports,
+for example:
+
+ 9k6 dl0tha-9 9600 255 4 9600 baud port (scc3)
+ axlink dl0tha-15 38400 255 4 Link to NOS
+
+Now "ifconfig" the scc device:
+
+ ifconfig scc3 44.128.1.1 hw ax25 dl0tha-9
+
+You can now axattach a pseudo-TTY:
+
+ axattach /dev/ptys0 axlink
+
+and start your NOS and attach /dev/ptys0 there. The problem is that
+NOS is reachable only via digipeating through the kernel AX.25
+(disastrous on a DAMA controlled channel). To solve this problem,
+configure "rxecho" to echo the incoming frames from "9k6" to "axlink"
+and outgoing frames from "axlink" to "9k6" and start:
+
+ rxecho
+
+Or simply use "kissbridge" coming with z8530drv-utils:
+
+ ifconfig scc3 hw ax25 dl0tha-9
+ kissbridge scc3 /dev/ptys0
+
+
+3. Adjustment and Display of parameters
+=======================================
+
+3.1 Displaying SCC Parameters:
+==============================
+
+Once a SCC channel has been attached, the parameter settings and
+some statistic information can be shown using the param program:
+
+dl1bke-u:~$ sccstat scc0
+
+Parameters:
+
+speed : 1200 baud
+txdelay : 36
+persist : 255
+slottime : 0
+txtail : 8
+fulldup : 1
+waittime : 12
+mintime : 3 sec
+maxkeyup : 7 sec
+idletime : 3 sec
+maxdefer : 120 sec
+group : 0x00
+txoff : off
+softdcd : on
+SLIP : off
+
+Status:
+
+HDLC Z8530 Interrupts Buffers
+-----------------------------------------------------------------------
+Sent : 273 RxOver : 0 RxInts : 125074 Size : 384
+Received : 1095 TxUnder: 0 TxInts : 4684 NoSpace : 0
+RxErrors : 1591 ExInts : 11776
+TxErrors : 0 SpInts : 1503
+Tx State : idle
+
+
+The status info shown is:
+
+Sent - number of frames transmitted
+Received - number of frames received
+RxErrors - number of receive errors (CRC, ABORT)
+TxErrors - number of discarded Tx frames (due to various reasons)
+Tx State - status of the Tx interrupt handler: idle/busy/active/tail (2)
+RxOver - number of receiver overruns
+TxUnder - number of transmitter underruns
+RxInts - number of receiver interrupts
+TxInts - number of transmitter interrupts
+EpInts - number of receiver special condition interrupts
+SpInts - number of external/status interrupts
+Size - maximum size of an AX.25 frame (*with* AX.25 headers!)
+NoSpace - number of times a buffer could not get allocated
+
+An overrun is abnormal. If lots of these occur, the product of
+baudrate and number of interfaces is too high for the processing
+power of your computer. NoSpace errors are unlikely to be caused by the
+driver or the kernel AX.25.
+
+
+3.2 Setting Parameters
+======================
+
+
+The setting of parameters of the emulated KISS TNC is done in the
+same way in the SCC driver. You can change parameters by using
+the kissparms program from the ax25-utils package or use the program
+"sccparam":
+
+ sccparam <device> <paramname> <decimal-|hexadecimal value>
+
+You can change the following parameters:
+
+param : value
+------------------------
+speed : 1200
+txdelay : 36
+persist : 255
+slottime : 0
+txtail : 8
+fulldup : 1
+waittime : 12
+mintime : 3
+maxkeyup : 7
+idletime : 3
+maxdefer : 120
+group : 0x00
+txoff : off
+softdcd : on
+SLIP : off
+
+
+The parameters have the following meaning:
+
+speed:
+ The baudrate on this channel in bits/sec
+
+ Example: sccparam /dev/scc3 speed 9600
+
+txdelay:
+ The delay (in units of 10 ms) after keying of the
+ transmitter, until the first byte is sent. This is usually
+ called "TXDELAY" in a TNC. When 0 is specified, the driver
+ will just wait until the CTS signal is asserted. This
+ assumes the presence of a timer or other circuitry in the
+ MODEM and/or transmitter, that asserts CTS when the
+ transmitter is ready for data.
+ A normal value of this parameter is 30-36.
+
+ Example: sccparam /dev/scc0 txd 20
+
+persist:
+ This is the probability that the transmitter will be keyed
+ when the channel is found to be free. It is a value from 0
+ to 255, and the probability is (value+1)/256. The value
+ should be somewhere near 50-60, and should be lowered when
+ the channel is used more heavily.
+
+ Example: sccparam /dev/scc2 persist 20
+
+slottime:
+ This is the time between samples of the channel. It is
+ expressed in units of 10 ms. About 200-300 ms (value 20-30)
+ seems to be a good value.
+
+ Example: sccparam /dev/scc0 slot 20
+
+tail:
+ The time the transmitter will remain keyed after the last
+ byte of a packet has been transferred to the SCC. This is
+ necessary because the CRC and a flag still have to leave the
+ SCC before the transmitter is keyed down. The value depends
+ on the baudrate selected. A few character times should be
+ sufficient, e.g. 40ms at 1200 baud. (value 4)
+ The value of this parameter is in 10 ms units.
+
+ Example: sccparam /dev/scc2 4
+
+full:
+ The full-duplex mode switch. This can be one of the following
+ values:
+
+ 0: The interface will operate in CSMA mode (the normal
+ half-duplex packet radio operation)
+ 1: Fullduplex mode, i.e. the transmitter will be keyed at
+ any time, without checking the received carrier. It
+ will be unkeyed when there are no packets to be sent.
+ 2: Like 1, but the transmitter will remain keyed, also
+ when there are no packets to be sent. Flags will be
+ sent in that case, until a timeout (parameter 10)
+ occurs.
+
+ Example: sccparam /dev/scc0 fulldup off
+
+wait:
+ The initial waittime before any transmit attempt, after the
+ frame has been queue for transmit. This is the length of
+ the first slot in CSMA mode. In full duplex modes it is
+ set to 0 for maximum performance.
+ The value of this parameter is in 10 ms units.
+
+ Example: sccparam /dev/scc1 wait 4
+
+maxkey:
+ The maximal time the transmitter will be keyed to send
+ packets, in seconds. This can be useful on busy CSMA
+ channels, to avoid "getting a bad reputation" when you are
+ generating a lot of traffic. After the specified time has
+ elapsed, no new frame will be started. Instead, the trans-
+ mitter will be switched off for a specified time (parameter
+ min), and then the selected algorithm for keyup will be
+ started again.
+ The value 0 as well as "off" will disable this feature,
+ and allow infinite transmission time.
+
+ Example: sccparam /dev/scc0 maxk 20
+
+min:
+ This is the time the transmitter will be switched off when
+ the maximum transmission time is exceeded.
+
+ Example: sccparam /dev/scc3 min 10
+
+idle
+ This parameter specifies the maximum idle time in full duplex
+ 2 mode, in seconds. When no frames have been sent for this
+ time, the transmitter will be keyed down. A value of 0 is
+ has same result as the fullduplex mode 1. This parameter
+ can be disabled.
+
+ Example: sccparam /dev/scc2 idle off # transmit forever
+
+maxdefer
+ This is the maximum time (in seconds) to wait for a free channel
+ to send. When this timer expires the transmitter will be keyed
+ IMMEDIATELY. If you love to get trouble with other users you
+ should set this to a very low value ;-)
+
+ Example: sccparam /dev/scc0 maxdefer 240 # 2 minutes
+
+
+txoff:
+ When this parameter has the value 0, the transmission of packets
+ is enable. Otherwise it is disabled.
+
+ Example: sccparam /dev/scc2 txoff on
+
+group:
+ It is possible to build special radio equipment to use more than
+ one frequency on the same band, e.g. using several receivers and
+ only one transmitter that can be switched between frequencies.
+ Also, you can connect several radios that are active on the same
+ band. In these cases, it is not possible, or not a good idea, to
+ transmit on more than one frequency. The SCC driver provides a
+ method to lock transmitters on different interfaces, using the
+ "param <interface> group <x>" command. This will only work when
+ you are using CSMA mode (parameter full = 0).
+ The number <x> must be 0 if you want no group restrictions, and
+ can be computed as follows to create restricted groups:
+ <x> is the sum of some OCTAL numbers:
+
+ 200 This transmitter will only be keyed when all other
+ transmitters in the group are off.
+ 100 This transmitter will only be keyed when the carrier
+ detect of all other interfaces in the group is off.
+ 0xx A byte that can be used to define different groups.
+ Interfaces are in the same group, when the logical AND
+ between their xx values is nonzero.
+
+ Examples:
+ When 2 interfaces use group 201, their transmitters will never be
+ keyed at the same time.
+ When 2 interfaces use group 101, the transmitters will only key
+ when both channels are clear at the same time. When group 301,
+ the transmitters will not be keyed at the same time.
+
+ Don't forget to convert the octal numbers into decimal before
+ you set the parameter.
+
+ Example: (to be written)
+
+softdcd:
+ use a software dcd instead of the real one... Useful for a very
+ slow squelch.
+
+ Example: sccparam /dev/scc0 soft on
+
+
+4. Problems
+===========
+
+If you have tx-problems with your BayCom USCC card please check
+the manufacturer of the 8530. SGS chips have a slightly
+different timing. Try Zilog... A solution is to write to register 8
+instead to the data port, but this won't work with the ESCC chips.
+*SIGH!*
+
+A very common problem is that the PTT locks until the maxkeyup timer
+expires, although interrupts and clock source are correct. In most
+cases compiling the driver with CONFIG_SCC_DELAY (set with
+make config) solves the problems. For more hints read the (pseudo) FAQ
+and the documentation coming with z8530drv-utils.
+
+I got reports that the driver has problems on some 386-based systems.
+(i.e. Amstrad) Those systems have a bogus AT bus timing which will
+lead to delayed answers on interrupts. You can recognize these
+problems by looking at the output of Sccstat for the suspected
+port. If it shows under- and overruns you own such a system.
+
+Delayed processing of received data: This depends on
+
+- the kernel version
+
+- kernel profiling compiled or not
+
+- a high interrupt load
+
+- a high load of the machine --- running X, Xmorph, XV and Povray,
+ while compiling the kernel... hmm ... even with 32 MB RAM ... ;-)
+ Or running a named for the whole .ampr.org domain on an 8 MB
+ box...
+
+- using information from rxecho or kissbridge.
+
+Kernel panics: please read /linux/README and find out if it
+really occurred within the scc driver.
+
+If you cannot solve a problem, send me
+
+- a description of the problem,
+- information on your hardware (computer system, scc board, modem)
+- your kernel version
+- the output of cat /proc/net/z8530
+
+4. Thor RLC100
+==============
+
+Mysteriously this board seems not to work with the driver. Anyone
+got it up-and-running?
+
+
+Many thanks to Linus Torvalds and Alan Cox for including the driver
+in the Linux standard distribution and their support.
+
+Joerg Reuter ampr-net: dl1bke@db0pra.ampr.org
+ AX-25 : DL1BKE @ DB0ABH.#BAY.DEU.EU
+ Internet: jreuter@yaina.de
+ WWW : http://yaina.de/jreuter
diff --git a/Documentation/nmi_watchdog.txt b/Documentation/nmi_watchdog.txt
new file mode 100644
index 0000000..90aa453
--- /dev/null
+++ b/Documentation/nmi_watchdog.txt
@@ -0,0 +1,78 @@
+
+[NMI watchdog is available for x86 and x86-64 architectures]
+
+Is your system locking up unpredictably? No keyboard activity, just
+a frustrating complete hard lockup? Do you want to help us debugging
+such lockups? If all yes then this document is definitely for you.
+
+On many x86/x86-64 type hardware there is a feature that enables
+us to generate 'watchdog NMI interrupts'. (NMI: Non Maskable Interrupt
+which get executed even if the system is otherwise locked up hard).
+This can be used to debug hard kernel lockups. By executing periodic
+NMI interrupts, the kernel can monitor whether any CPU has locked up,
+and print out debugging messages if so.
+
+In order to use the NMI watchdog, you need to have APIC support in your
+kernel. For SMP kernels, APIC support gets compiled in automatically. For
+UP, enable either CONFIG_X86_UP_APIC (Processor type and features -> Local
+APIC support on uniprocessors) or CONFIG_X86_UP_IOAPIC (Processor type and
+features -> IO-APIC support on uniprocessors) in your kernel config.
+CONFIG_X86_UP_APIC is for uniprocessor machines without an IO-APIC.
+CONFIG_X86_UP_IOAPIC is for uniprocessor with an IO-APIC. [Note: certain
+kernel debugging options, such as Kernel Stack Meter or Kernel Tracer,
+may implicitly disable the NMI watchdog.]
+
+For x86-64, the needed APIC is always compiled in.
+
+Using local APIC (nmi_watchdog=2) needs the first performance register, so
+you can't use it for other purposes (such as high precision performance
+profiling.) However, at least oprofile and the perfctr driver disable the
+local APIC NMI watchdog automatically.
+
+To actually enable the NMI watchdog, use the 'nmi_watchdog=N' boot
+parameter. Eg. the relevant lilo.conf entry:
+
+ append="nmi_watchdog=1"
+
+For SMP machines and UP machines with an IO-APIC use nmi_watchdog=1.
+For UP machines without an IO-APIC use nmi_watchdog=2, this only works
+for some processor types. If in doubt, boot with nmi_watchdog=1 and
+check the NMI count in /proc/interrupts; if the count is zero then
+reboot with nmi_watchdog=2 and check the NMI count. If it is still
+zero then log a problem, you probably have a processor that needs to be
+added to the nmi code.
+
+A 'lockup' is the following scenario: if any CPU in the system does not
+execute the period local timer interrupt for more than 5 seconds, then
+the NMI handler generates an oops and kills the process. This
+'controlled crash' (and the resulting kernel messages) can be used to
+debug the lockup. Thus whenever the lockup happens, wait 5 seconds and
+the oops will show up automatically. If the kernel produces no messages
+then the system has crashed so hard (eg. hardware-wise) that either it
+cannot even accept NMI interrupts, or the crash has made the kernel
+unable to print messages.
+
+Be aware that when using local APIC, the frequency of NMI interrupts
+it generates, depends on the system load. The local APIC NMI watchdog,
+lacking a better source, uses the "cycles unhalted" event. As you may
+guess it doesn't tick when the CPU is in the halted state (which happens
+when the system is idle), but if your system locks up on anything but the
+"hlt" processor instruction, the watchdog will trigger very soon as the
+"cycles unhalted" event will happen every clock tick. If it locks up on
+"hlt", then you are out of luck -- the event will not happen at all and the
+watchdog won't trigger. This is a shortcoming of the local APIC watchdog
+-- unfortunately there is no "clock ticks" event that would work all the
+time. The I/O APIC watchdog is driven externally and has no such shortcoming.
+But its NMI frequency is much higher, resulting in a more significant hit
+to the overall system performance.
+
+On x86 nmi_watchdog is disabled by default so you have to enable it with
+a boot time parameter.
+
+NOTE: In kernels prior to 2.4.2-ac18 the NMI-oopser is enabled unconditionally
+on x86 SMP boxes.
+
+[ feel free to send bug reports, suggestions and patches to
+ Ingo Molnar <mingo@redhat.com> or the Linux SMP mailing
+ list at <linux-smp@vger.kernel.org> ]
+
diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt
new file mode 100644
index 0000000..7714f57
--- /dev/null
+++ b/Documentation/nommu-mmap.txt
@@ -0,0 +1,244 @@
+ =============================
+ NO-MMU MEMORY MAPPING SUPPORT
+ =============================
+
+The kernel has limited support for memory mapping under no-MMU conditions, such
+as are used in uClinux environments. From the userspace point of view, memory
+mapping is made use of in conjunction with the mmap() system call, the shmat()
+call and the execve() system call. From the kernel's point of view, execve()
+mapping is actually performed by the binfmt drivers, which call back into the
+mmap() routines to do the actual work.
+
+Memory mapping behaviour also involves the way fork(), vfork(), clone() and
+ptrace() work. Under uClinux there is no fork(), and clone() must be supplied
+the CLONE_VM flag.
+
+The behaviour is similar between the MMU and no-MMU cases, but not identical;
+and it's also much more restricted in the latter case:
+
+ (*) Anonymous mapping, MAP_PRIVATE
+
+ In the MMU case: VM regions backed by arbitrary pages; copy-on-write
+ across fork.
+
+ In the no-MMU case: VM regions backed by arbitrary contiguous runs of
+ pages.
+
+ (*) Anonymous mapping, MAP_SHARED
+
+ These behave very much like private mappings, except that they're
+ shared across fork() or clone() without CLONE_VM in the MMU case. Since
+ the no-MMU case doesn't support these, behaviour is identical to
+ MAP_PRIVATE there.
+
+ (*) File, MAP_PRIVATE, PROT_READ / PROT_EXEC, !PROT_WRITE
+
+ In the MMU case: VM regions backed by pages read from file; changes to
+ the underlying file are reflected in the mapping; copied across fork.
+
+ In the no-MMU case:
+
+ - If one exists, the kernel will re-use an existing mapping to the
+ same segment of the same file if that has compatible permissions,
+ even if this was created by another process.
+
+ - If possible, the file mapping will be directly on the backing device
+ if the backing device has the BDI_CAP_MAP_DIRECT capability and
+ appropriate mapping protection capabilities. Ramfs, romfs, cramfs
+ and mtd might all permit this.
+
+ - If the backing device device can't or won't permit direct sharing,
+ but does have the BDI_CAP_MAP_COPY capability, then a copy of the
+ appropriate bit of the file will be read into a contiguous bit of
+ memory and any extraneous space beyond the EOF will be cleared
+
+ - Writes to the file do not affect the mapping; writes to the mapping
+ are visible in other processes (no MMU protection), but should not
+ happen.
+
+ (*) File, MAP_PRIVATE, PROT_READ / PROT_EXEC, PROT_WRITE
+
+ In the MMU case: like the non-PROT_WRITE case, except that the pages in
+ question get copied before the write actually happens. From that point
+ on writes to the file underneath that page no longer get reflected into
+ the mapping's backing pages. The page is then backed by swap instead.
+
+ In the no-MMU case: works much like the non-PROT_WRITE case, except
+ that a copy is always taken and never shared.
+
+ (*) Regular file / blockdev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE
+
+ In the MMU case: VM regions backed by pages read from file; changes to
+ pages written back to file; writes to file reflected into pages backing
+ mapping; shared across fork.
+
+ In the no-MMU case: not supported.
+
+ (*) Memory backed regular file, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE
+
+ In the MMU case: As for ordinary regular files.
+
+ In the no-MMU case: The filesystem providing the memory-backed file
+ (such as ramfs or tmpfs) may choose to honour an open, truncate, mmap
+ sequence by providing a contiguous sequence of pages to map. In that
+ case, a shared-writable memory mapping will be possible. It will work
+ as for the MMU case. If the filesystem does not provide any such
+ support, then the mapping request will be denied.
+
+ (*) Memory backed blockdev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE
+
+ In the MMU case: As for ordinary regular files.
+
+ In the no-MMU case: As for memory backed regular files, but the
+ blockdev must be able to provide a contiguous run of pages without
+ truncate being called. The ramdisk driver could do this if it allocated
+ all its memory as a contiguous array upfront.
+
+ (*) Memory backed chardev, MAP_SHARED, PROT_READ / PROT_EXEC / PROT_WRITE
+
+ In the MMU case: As for ordinary regular files.
+
+ In the no-MMU case: The character device driver may choose to honour
+ the mmap() by providing direct access to the underlying device if it
+ provides memory or quasi-memory that can be accessed directly. Examples
+ of such are frame buffers and flash devices. If the driver does not
+ provide any such support, then the mapping request will be denied.
+
+
+============================
+FURTHER NOTES ON NO-MMU MMAP
+============================
+
+ (*) A request for a private mapping of less than a page in size may not return
+ a page-aligned buffer. This is because the kernel calls kmalloc() to
+ allocate the buffer, not get_free_page().
+
+ (*) A list of all the mappings on the system is visible through /proc/maps in
+ no-MMU mode.
+
+ (*) A list of all the mappings in use by a process is visible through
+ /proc/<pid>/maps in no-MMU mode.
+
+ (*) Supplying MAP_FIXED or a requesting a particular mapping address will
+ result in an error.
+
+ (*) Files mapped privately usually have to have a read method provided by the
+ driver or filesystem so that the contents can be read into the memory
+ allocated if mmap() chooses not to map the backing device directly. An
+ error will result if they don't. This is most likely to be encountered
+ with character device files, pipes, fifos and sockets.
+
+
+==========================
+INTERPROCESS SHARED MEMORY
+==========================
+
+Both SYSV IPC SHM shared memory and POSIX shared memory is supported in NOMMU
+mode. The former through the usual mechanism, the latter through files created
+on ramfs or tmpfs mounts.
+
+
+=======
+FUTEXES
+=======
+
+Futexes are supported in NOMMU mode if the arch supports them. An error will
+be given if an address passed to the futex system call lies outside the
+mappings made by a process or if the mapping in which the address lies does not
+support futexes (such as an I/O chardev mapping).
+
+
+=============
+NO-MMU MREMAP
+=============
+
+The mremap() function is partially supported. It may change the size of a
+mapping, and may move it[*] if MREMAP_MAYMOVE is specified and if the new size
+of the mapping exceeds the size of the slab object currently occupied by the
+memory to which the mapping refers, or if a smaller slab object could be used.
+
+MREMAP_FIXED is not supported, though it is ignored if there's no change of
+address and the object does not need to be moved.
+
+Shared mappings may not be moved. Shareable mappings may not be moved either,
+even if they are not currently shared.
+
+The mremap() function must be given an exact match for base address and size of
+a previously mapped object. It may not be used to create holes in existing
+mappings, move parts of existing mappings or resize parts of mappings. It must
+act on a complete mapping.
+
+[*] Not currently supported.
+
+
+============================================
+PROVIDING SHAREABLE CHARACTER DEVICE SUPPORT
+============================================
+
+To provide shareable character device support, a driver must provide a
+file->f_op->get_unmapped_area() operation. The mmap() routines will call this
+to get a proposed address for the mapping. This may return an error if it
+doesn't wish to honour the mapping because it's too long, at a weird offset,
+under some unsupported combination of flags or whatever.
+
+The driver should also provide backing device information with capabilities set
+to indicate the permitted types of mapping on such devices. The default is
+assumed to be readable and writable, not executable, and only shareable
+directly (can't be copied).
+
+The file->f_op->mmap() operation will be called to actually inaugurate the
+mapping. It can be rejected at that point. Returning the ENOSYS error will
+cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified.
+
+The vm_ops->close() routine will be invoked when the last mapping on a chardev
+is removed. An existing mapping will be shared, partially or not, if possible
+without notifying the driver.
+
+It is permitted also for the file->f_op->get_unmapped_area() operation to
+return -ENOSYS. This will be taken to mean that this operation just doesn't
+want to handle it, despite the fact it's got an operation. For instance, it
+might try directing the call to a secondary driver which turns out not to
+implement it. Such is the case for the framebuffer driver which attempts to
+direct the call to the device-specific driver. Under such circumstances, the
+mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a
+copy mapped otherwise.
+
+IMPORTANT NOTE:
+
+ Some types of device may present a different appearance to anyone
+ looking at them in certain modes. Flash chips can be like this; for
+ instance if they're in programming or erase mode, you might see the
+ status reflected in the mapping, instead of the data.
+
+ In such a case, care must be taken lest userspace see a shared or a
+ private mapping showing such information when the driver is busy
+ controlling the device. Remember especially: private executable
+ mappings may still be mapped directly off the device under some
+ circumstances!
+
+
+==============================================
+PROVIDING SHAREABLE MEMORY-BACKED FILE SUPPORT
+==============================================
+
+Provision of shared mappings on memory backed files is similar to the provision
+of support for shared mapped character devices. The main difference is that the
+filesystem providing the service will probably allocate a contiguous collection
+of pages and permit mappings to be made on that.
+
+It is recommended that a truncate operation applied to such a file that
+increases the file size, if that file is empty, be taken as a request to gather
+enough pages to honour a mapping. This is required to support POSIX shared
+memory.
+
+Memory backed devices are indicated by the mapping's backing device info having
+the memory_backed flag set.
+
+
+========================================
+PROVIDING SHAREABLE BLOCK DEVICE SUPPORT
+========================================
+
+Provision of shared mappings on block device files is exactly the same as for
+character devices. If there isn't a real device underneath, then the driver
+should allocate sufficient contiguous memory to honour any supported mapping.
diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt
new file mode 100644
index 0000000..80133ac
--- /dev/null
+++ b/Documentation/numastat.txt
@@ -0,0 +1,22 @@
+
+Numa policy hit/miss statistics
+
+/sys/devices/system/node/node*/numastat
+
+All units are pages. Hugepages have separate counters.
+
+numa_hit A process wanted to allocate memory from this node,
+ and succeeded.
+numa_miss A process wanted to allocate memory from this node,
+ but ended up with memory from another.
+numa_foreign A process wanted to allocate on another node,
+ but ended up with memory from this one.
+local_node A process ran on this node and got memory from it.
+other_node A process ran on this node and got memory from another node.
+interleave_hit Interleaving wanted to allocate from this node
+ and succeeded.
+
+For easier reading you can use the numastat utility from the numactl package
+(ftp://ftp.suse.com/pub/people/ak/numa/numactl*). Note that it only works
+well right now on machines with a small number of CPUs.
+
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt
new file mode 100644
index 0000000..b152e81
--- /dev/null
+++ b/Documentation/oops-tracing.txt
@@ -0,0 +1,264 @@
+NOTE: ksymoops is useless on 2.6. Please use the Oops in its original format
+(from dmesg, etc). Ignore any references in this or other docs to "decoding
+the Oops" or "running it through ksymoops". If you post an Oops from 2.6 that
+has been run through ksymoops, people will just tell you to repost it.
+
+Quick Summary
+-------------
+
+Find the Oops and send it to the maintainer of the kernel area that seems to be
+involved with the problem. Don't worry too much about getting the wrong person.
+If you are unsure send it to the person responsible for the code relevant to
+what you were doing. If it occurs repeatably try and describe how to recreate
+it. That's worth even more than the oops.
+
+If you are totally stumped as to whom to send the report, send it to
+linux-kernel@vger.kernel.org. Thanks for your help in making Linux as
+stable as humanly possible.
+
+Where is the Oops?
+----------------------
+
+Normally the Oops text is read from the kernel buffers by klogd and
+handed to syslogd which writes it to a syslog file, typically
+/var/log/messages (depends on /etc/syslog.conf). Sometimes klogd dies,
+in which case you can run dmesg > file to read the data from the kernel
+buffers and save it. Or you can cat /proc/kmsg > file, however you
+have to break in to stop the transfer, kmsg is a "never ending file".
+If the machine has crashed so badly that you cannot enter commands or
+the disk is not available then you have three options :-
+
+(1) Hand copy the text from the screen and type it in after the machine
+ has restarted. Messy but it is the only option if you have not
+ planned for a crash. Alternatively, you can take a picture of
+ the screen with a digital camera - not nice, but better than
+ nothing. If the messages scroll off the top of the console, you
+ may find that booting with a higher resolution (eg, vga=791)
+ will allow you to read more of the text. (Caveat: This needs vesafb,
+ so won't help for 'early' oopses)
+
+(2) Boot with a serial console (see Documentation/serial-console.txt),
+ run a null modem to a second machine and capture the output there
+ using your favourite communication program. Minicom works well.
+
+(3) Use Kdump (see Documentation/kdump/kdump.txt),
+ extract the kernel ring buffer from old memory with using dmesg
+ gdbmacro in Documentation/kdump/gdbmacros.txt.
+
+
+Full Information
+----------------
+
+NOTE: the message from Linus below applies to 2.4 kernel. I have preserved it
+for historical reasons, and because some of the information in it still
+applies. Especially, please ignore any references to ksymoops.
+
+From: Linus Torvalds <torvalds@osdl.org>
+
+How to track down an Oops.. [originally a mail to linux-kernel]
+
+The main trick is having 5 years of experience with those pesky oops
+messages ;-)
+
+Actually, there are things you can do that make this easier. I have two
+separate approaches:
+
+ gdb /usr/src/linux/vmlinux
+ gdb> disassemble <offending_function>
+
+That's the easy way to find the problem, at least if the bug-report is
+well made (like this one was - run through ksymoops to get the
+information of which function and the offset in the function that it
+happened in).
+
+Oh, it helps if the report happens on a kernel that is compiled with the
+same compiler and similar setups.
+
+The other thing to do is disassemble the "Code:" part of the bug report:
+ksymoops will do this too with the correct tools, but if you don't have
+the tools you can just do a silly program:
+
+ char str[] = "\xXX\xXX\xXX...";
+ main(){}
+
+and compile it with gcc -g and then do "disassemble str" (where the "XX"
+stuff are the values reported by the Oops - you can just cut-and-paste
+and do a replace of spaces to "\x" - that's what I do, as I'm too lazy
+to write a program to automate this all).
+
+Alternatively, you can use the shell script in scripts/decodecode.
+Its usage is: decodecode < oops.txt
+
+The hex bytes that follow "Code:" may (in some architectures) have a series
+of bytes that precede the current instruction pointer as well as bytes at and
+following the current instruction pointer. In some cases, one instruction
+byte or word is surrounded by <> or (), as in "<86>" or "(f00d)". These
+<> or () markings indicate the current instruction pointer. Example from
+i386, split into multiple lines for readability:
+
+Code: f9 0f 8d f9 00 00 00 8d 42 0c e8 dd 26 11 c7 a1 60 ea 2b f9 8b 50 08 a1
+64 ea 2b f9 8d 34 82 8b 1e 85 db 74 6d 8b 15 60 ea 2b f9 <8b> 43 04 39 42 54
+7e 04 40 89 42 54 8b 43 04 3b 05 00 f6 52 c0
+
+Finally, if you want to see where the code comes from, you can do
+
+ cd /usr/src/linux
+ make fs/buffer.s # or whatever file the bug happened in
+
+and then you get a better idea of what happens than with the gdb
+disassembly.
+
+Now, the trick is just then to combine all the data you have: the C
+sources (and general knowledge of what it _should_ do), the assembly
+listing and the code disassembly (and additionally the register dump you
+also get from the "oops" message - that can be useful to see _what_ the
+corrupted pointers were, and when you have the assembler listing you can
+also match the other registers to whatever C expressions they were used
+for).
+
+Essentially, you just look at what doesn't match (in this case it was the
+"Code" disassembly that didn't match with what the compiler generated).
+Then you need to find out _why_ they don't match. Often it's simple - you
+see that the code uses a NULL pointer and then you look at the code and
+wonder how the NULL pointer got there, and if it's a valid thing to do
+you just check against it..
+
+Now, if somebody gets the idea that this is time-consuming and requires
+some small amount of concentration, you're right. Which is why I will
+mostly just ignore any panic reports that don't have the symbol table
+info etc looked up: it simply gets too hard to look it up (I have some
+programs to search for specific patterns in the kernel code segment, and
+sometimes I have been able to look up those kinds of panics too, but
+that really requires pretty good knowledge of the kernel just to be able
+to pick out the right sequences etc..)
+
+_Sometimes_ it happens that I just see the disassembled code sequence
+from the panic, and I know immediately where it's coming from. That's when
+I get worried that I've been doing this for too long ;-)
+
+ Linus
+
+
+---------------------------------------------------------------------------
+Notes on Oops tracing with klogd:
+
+In order to help Linus and the other kernel developers there has been
+substantial support incorporated into klogd for processing protection
+faults. In order to have full support for address resolution at least
+version 1.3-pl3 of the sysklogd package should be used.
+
+When a protection fault occurs the klogd daemon automatically
+translates important addresses in the kernel log messages to their
+symbolic equivalents. This translated kernel message is then
+forwarded through whatever reporting mechanism klogd is using. The
+protection fault message can be simply cut out of the message files
+and forwarded to the kernel developers.
+
+Two types of address resolution are performed by klogd. The first is
+static translation and the second is dynamic translation. Static
+translation uses the System.map file in much the same manner that
+ksymoops does. In order to do static translation the klogd daemon
+must be able to find a system map file at daemon initialization time.
+See the klogd man page for information on how klogd searches for map
+files.
+
+Dynamic address translation is important when kernel loadable modules
+are being used. Since memory for kernel modules is allocated from the
+kernel's dynamic memory pools there are no fixed locations for either
+the start of the module or for functions and symbols in the module.
+
+The kernel supports system calls which allow a program to determine
+which modules are loaded and their location in memory. Using these
+system calls the klogd daemon builds a symbol table which can be used
+to debug a protection fault which occurs in a loadable kernel module.
+
+At the very minimum klogd will provide the name of the module which
+generated the protection fault. There may be additional symbolic
+information available if the developer of the loadable module chose to
+export symbol information from the module.
+
+Since the kernel module environment can be dynamic there must be a
+mechanism for notifying the klogd daemon when a change in module
+environment occurs. There are command line options available which
+allow klogd to signal the currently executing daemon that symbol
+information should be refreshed. See the klogd manual page for more
+information.
+
+A patch is included with the sysklogd distribution which modifies the
+modules-2.0.0 package to automatically signal klogd whenever a module
+is loaded or unloaded. Applying this patch provides essentially
+seamless support for debugging protection faults which occur with
+kernel loadable modules.
+
+The following is an example of a protection fault in a loadable module
+processed by klogd:
+---------------------------------------------------------------------------
+Aug 29 09:51:01 blizard kernel: Unable to handle kernel paging request at virtual address f15e97cc
+Aug 29 09:51:01 blizard kernel: current->tss.cr3 = 0062d000, %cr3 = 0062d000
+Aug 29 09:51:01 blizard kernel: *pde = 00000000
+Aug 29 09:51:01 blizard kernel: Oops: 0002
+Aug 29 09:51:01 blizard kernel: CPU: 0
+Aug 29 09:51:01 blizard kernel: EIP: 0010:[oops:_oops+16/3868]
+Aug 29 09:51:01 blizard kernel: EFLAGS: 00010212
+Aug 29 09:51:01 blizard kernel: eax: 315e97cc ebx: 003a6f80 ecx: 001be77b edx: 00237c0c
+Aug 29 09:51:01 blizard kernel: esi: 00000000 edi: bffffdb3 ebp: 00589f90 esp: 00589f8c
+Aug 29 09:51:01 blizard kernel: ds: 0018 es: 0018 fs: 002b gs: 002b ss: 0018
+Aug 29 09:51:01 blizard kernel: Process oops_test (pid: 3374, process nr: 21, stackpage=00589000)
+Aug 29 09:51:01 blizard kernel: Stack: 315e97cc 00589f98 0100b0b4 bffffed4 0012e38e 00240c64 003a6f80 00000001
+Aug 29 09:51:01 blizard kernel: 00000000 00237810 bfffff00 0010a7fa 00000003 00000001 00000000 bfffff00
+Aug 29 09:51:01 blizard kernel: bffffdb3 bffffed4 ffffffda 0000002b 0007002b 0000002b 0000002b 00000036
+Aug 29 09:51:01 blizard kernel: Call Trace: [oops:_oops_ioctl+48/80] [_sys_ioctl+254/272] [_system_call+82/128]
+Aug 29 09:51:01 blizard kernel: Code: c7 00 05 00 00 00 eb 08 90 90 90 90 90 90 90 90 89 ec 5d c3
+---------------------------------------------------------------------------
+
+Dr. G.W. Wettstein Oncology Research Div. Computing Facility
+Roger Maris Cancer Center INTERNET: greg@wind.rmcc.com
+820 4th St. N.
+Fargo, ND 58122
+Phone: 701-234-7556
+
+
+---------------------------------------------------------------------------
+Tainted kernels:
+
+Some oops reports contain the string 'Tainted: ' after the program
+counter. This indicates that the kernel has been tainted by some
+mechanism. The string is followed by a series of position-sensitive
+characters, each representing a particular tainted value.
+
+ 1: 'G' if all modules loaded have a GPL or compatible license, 'P' if
+ any proprietary module has been loaded. Modules without a
+ MODULE_LICENSE or with a MODULE_LICENSE that is not recognised by
+ insmod as GPL compatible are assumed to be proprietary.
+
+ 2: 'F' if any module was force loaded by "insmod -f", ' ' if all
+ modules were loaded normally.
+
+ 3: 'S' if the oops occurred on an SMP kernel running on hardware that
+ hasn't been certified as safe to run multiprocessor.
+ Currently this occurs only on various Athlons that are not
+ SMP capable.
+
+ 4: 'R' if a module was force unloaded by "rmmod -f", ' ' if all
+ modules were unloaded normally.
+
+ 5: 'M' if any processor has reported a Machine Check Exception,
+ ' ' if no Machine Check Exceptions have occurred.
+
+ 6: 'B' if a page-release function has found a bad page reference or
+ some unexpected page flags.
+
+ 7: 'U' if a user or user application specifically requested that the
+ Tainted flag be set, ' ' otherwise.
+
+ 8: 'D' if the kernel has died recently, i.e. there was an OOPS or BUG.
+
+ 9: 'A' if the ACPI table has been overridden.
+
+ 10: 'W' if a warning has previously been issued by the kernel.
+
+The primary reason for the 'Tainted: ' string is to tell kernel
+debuggers if this is a clean kernel or if anything unusual has
+occurred. Tainting is permanent: even if an offending module is
+unloaded, the tainted value remains to indicate that the kernel is not
+trustworthy.
diff --git a/Documentation/parisc/00-INDEX b/Documentation/parisc/00-INDEX
new file mode 100644
index 0000000..cbd0609
--- /dev/null
+++ b/Documentation/parisc/00-INDEX
@@ -0,0 +1,6 @@
+00-INDEX
+ - this file.
+debugging
+ - some debugging hints for real-mode code
+registers
+ - current/planned usage of registers
diff --git a/Documentation/parisc/debugging b/Documentation/parisc/debugging
new file mode 100644
index 0000000..d728594
--- /dev/null
+++ b/Documentation/parisc/debugging
@@ -0,0 +1,39 @@
+okay, here are some hints for debugging the lower-level parts of
+linux/parisc.
+
+
+1. Absolute addresses
+
+A lot of the assembly code currently runs in real mode, which means
+absolute addresses are used instead of virtual addresses as in the
+rest of the kernel. To translate an absolute address to a virtual
+address you can lookup in System.map, add __PAGE_OFFSET (0x10000000
+currently).
+
+
+2. HPMCs
+
+When real-mode code tries to access non-existent memory, you'll get
+an HPMC instead of a kernel oops. To debug an HPMC, try to find
+the System Responder/Requestor addresses. The System Requestor
+address should match (one of the) processor HPAs (high addresses in
+the I/O range); the System Responder address is the address real-mode
+code tried to access.
+
+Typical values for the System Responder address are addresses larger
+than __PAGE_OFFSET (0x10000000) which mean a virtual address didn't
+get translated to a physical address before real-mode code tried to
+access it.
+
+
+3. Q bit fun
+
+Certain, very critical code has to clear the Q bit in the PSW. What
+happens when the Q bit is cleared is the CPU does not update the
+registers interruption handlers read to find out where the machine
+was interrupted - so if you get an interruption between the instruction
+that clears the Q bit and the RFI that sets it again you don't know
+where exactly it happened. If you're lucky the IAOQ will point to the
+instrucion that cleared the Q bit, if you're not it points anywhere
+at all. Usually Q bit problems will show themselves in unexplainable
+system hangs or running off the end of physical memory.
diff --git a/Documentation/parisc/registers b/Documentation/parisc/registers
new file mode 100644
index 0000000..dd3cadd
--- /dev/null
+++ b/Documentation/parisc/registers
@@ -0,0 +1,121 @@
+Register Usage for Linux/PA-RISC
+
+[ an asterisk is used for planned usage which is currently unimplemented ]
+
+ General Registers as specified by ABI
+
+ Control Registers
+
+CR 0 (Recovery Counter) used for ptrace
+CR 1-CR 7(undefined) unused
+CR 8 (Protection ID) per-process value*
+CR 9, 12, 13 (PIDS) unused
+CR10 (CCR) lazy FPU saving*
+CR11 as specified by ABI (SAR)
+CR14 (interruption vector) initialized to fault_vector
+CR15 (EIEM) initialized to all ones*
+CR16 (Interval Timer) read for cycle count/write starts Interval Tmr
+CR17-CR22 interruption parameters
+CR19 Interrupt Instruction Register
+CR20 Interrupt Space Register
+CR21 Interrupt Offset Register
+CR22 Interrupt PSW
+CR23 (EIRR) read for pending interrupts/write clears bits
+CR24 (TR 0) Kernel Space Page Directory Pointer
+CR25 (TR 1) User Space Page Directory Pointer
+CR26 (TR 2) not used
+CR27 (TR 3) Thread descriptor pointer
+CR28 (TR 4) not used
+CR29 (TR 5) not used
+CR30 (TR 6) current / 0
+CR31 (TR 7) Temporary register, used in various places
+
+ Space Registers (kernel mode)
+
+SR0 temporary space register
+SR4-SR7 set to 0
+SR1 temporary space register
+SR2 kernel should not clobber this
+SR3 used for userspace accesses (current process)
+
+ Space Registers (user mode)
+
+SR0 temporary space register
+SR1 temporary space register
+SR2 holds space of linux gateway page
+SR3 holds user address space value while in kernel
+SR4-SR7 Defines short address space for user/kernel
+
+
+ Processor Status Word
+
+W (64-bit addresses) 0
+E (Little-endian) 0
+S (Secure Interval Timer) 0
+T (Taken Branch Trap) 0
+H (Higher-privilege trap) 0
+L (Lower-privilege trap) 0
+N (Nullify next instruction) used by C code
+X (Data memory break disable) 0
+B (Taken Branch) used by C code
+C (code address translation) 1, 0 while executing real-mode code
+V (divide step correction) used by C code
+M (HPMC mask) 0, 1 while executing HPMC handler*
+C/B (carry/borrow bits) used by C code
+O (ordered references) 1*
+F (performance monitor) 0
+R (Recovery Counter trap) 0
+Q (collect interruption state) 1 (0 in code directly preceding an rfi)
+P (Protection Identifiers) 1*
+D (Data address translation) 1, 0 while executing real-mode code
+I (external interrupt mask) used by cli()/sti() macros
+
+ "Invisible" Registers
+
+PSW default W value 0
+PSW default E value 0
+Shadow Registers used by interruption handler code
+TOC enable bit 1
+
+=========================================================================
+Register usage notes, originally from John Marvin, with some additional
+notes from Randolph Chung.
+
+For the general registers:
+
+r1,r2,r19-r26,r28,r29 & r31 can be used without saving them first. And of
+course, you need to save them if you care about them, before calling
+another procedure. Some of the above registers do have special meanings
+that you should be aware of:
+
+ r1: The addil instruction is hardwired to place its result in r1,
+ so if you use that instruction be aware of that.
+
+ r2: This is the return pointer. In general you don't want to
+ use this, since you need the pointer to get back to your
+ caller. However, it is grouped with this set of registers
+ since the caller can't rely on the value being the same
+ when you return, i.e. you can copy r2 to another register
+ and return through that register after trashing r2, and
+ that should not cause a problem for the calling routine.
+
+ r19-r22: these are generally regarded as temporary registers.
+ Note that in 64 bit they are arg7-arg4.
+
+ r23-r26: these are arg3-arg0, i.e. you can use them if you
+ don't care about the values that were passed in anymore.
+
+ r28,r29: are ret0 and ret1. They are what you pass return values
+ in. r28 is the primary return. When returning small structures
+ r29 may also be used to pass data back to the caller.
+
+ r30: stack pointer
+
+ r31: the ble instruction puts the return pointer in here.
+
+
+r3-r18,r27,r30 need to be saved and restored. r3-r18 are just
+ general purpose registers. r27 is the data pointer, and is
+ used to make references to global variables easier. r30 is
+ the stack pointer.
+
diff --git a/Documentation/parport-lowlevel.txt b/Documentation/parport-lowlevel.txt
new file mode 100644
index 0000000..120eb20
--- /dev/null
+++ b/Documentation/parport-lowlevel.txt
@@ -0,0 +1,1471 @@
+PARPORT interface documentation
+-------------------------------
+
+Time-stamp: <2000-02-24 13:30:20 twaugh>
+
+Described here are the following functions:
+
+Global functions:
+ parport_register_driver
+ parport_unregister_driver
+ parport_enumerate
+ parport_register_device
+ parport_unregister_device
+ parport_claim
+ parport_claim_or_block
+ parport_release
+ parport_yield
+ parport_yield_blocking
+ parport_wait_peripheral
+ parport_poll_peripheral
+ parport_wait_event
+ parport_negotiate
+ parport_read
+ parport_write
+ parport_open
+ parport_close
+ parport_device_id
+ parport_device_coords
+ parport_find_class
+ parport_find_device
+ parport_set_timeout
+
+Port functions (can be overridden by low-level drivers):
+ SPP:
+ port->ops->read_data
+ port->ops->write_data
+ port->ops->read_status
+ port->ops->read_control
+ port->ops->write_control
+ port->ops->frob_control
+ port->ops->enable_irq
+ port->ops->disable_irq
+ port->ops->data_forward
+ port->ops->data_reverse
+
+ EPP:
+ port->ops->epp_write_data
+ port->ops->epp_read_data
+ port->ops->epp_write_addr
+ port->ops->epp_read_addr
+
+ ECP:
+ port->ops->ecp_write_data
+ port->ops->ecp_read_data
+ port->ops->ecp_write_addr
+
+ Other:
+ port->ops->nibble_read_data
+ port->ops->byte_read_data
+ port->ops->compat_write_data
+
+The parport subsystem comprises 'parport' (the core port-sharing
+code), and a variety of low-level drivers that actually do the port
+accesses. Each low-level driver handles a particular style of port
+(PC, Amiga, and so on).
+
+The parport interface to the device driver author can be broken down
+into global functions and port functions.
+
+The global functions are mostly for communicating between the device
+driver and the parport subsystem: acquiring a list of available ports,
+claiming a port for exclusive use, and so on. They also include
+'generic' functions for doing standard things that will work on any
+IEEE 1284-capable architecture.
+
+The port functions are provided by the low-level drivers, although the
+core parport module provides generic 'defaults' for some routines.
+The port functions can be split into three groups: SPP, EPP, and ECP.
+
+SPP (Standard Parallel Port) functions modify so-called 'SPP'
+registers: data, status, and control. The hardware may not actually
+have registers exactly like that, but the PC does and this interface is
+modelled after common PC implementations. Other low-level drivers may
+be able to emulate most of the functionality.
+
+EPP (Enhanced Parallel Port) functions are provided for reading and
+writing in IEEE 1284 EPP mode, and ECP (Extended Capabilities Port)
+functions are used for IEEE 1284 ECP mode. (What about BECP? Does
+anyone care?)
+
+Hardware assistance for EPP and/or ECP transfers may or may not be
+available, and if it is available it may or may not be used. If
+hardware is not used, the transfer will be software-driven. In order
+to cope with peripherals that only tenuously support IEEE 1284, a
+low-level driver specific function is provided, for altering 'fudge
+factors'.
+
+GLOBAL FUNCTIONS
+----------------
+
+parport_register_driver - register a device driver with parport
+-----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_driver {
+ const char *name;
+ void (*attach) (struct parport *);
+ void (*detach) (struct parport *);
+ struct parport_driver *next;
+};
+int parport_register_driver (struct parport_driver *driver);
+
+DESCRIPTION
+
+In order to be notified about parallel ports when they are detected,
+parport_register_driver should be called. Your driver will
+immediately be notified of all ports that have already been detected,
+and of each new port as low-level drivers are loaded.
+
+A 'struct parport_driver' contains the textual name of your driver,
+a pointer to a function to handle new ports, and a pointer to a
+function to handle ports going away due to a low-level driver
+unloading. Ports will only be detached if they are not being used
+(i.e. there are no devices registered on them).
+
+The visible parts of the 'struct parport *' argument given to
+attach/detach are:
+
+struct parport
+{
+ struct parport *next; /* next parport in list */
+ const char *name; /* port's name */
+ unsigned int modes; /* bitfield of hardware modes */
+ struct parport_device_info probe_info;
+ /* IEEE1284 info */
+ int number; /* parport index */
+ struct parport_operations *ops;
+ ...
+};
+
+There are other members of the structure, but they should not be
+touched.
+
+The 'modes' member summarises the capabilities of the underlying
+hardware. It consists of flags which may be bitwise-ored together:
+
+ PARPORT_MODE_PCSPP IBM PC registers are available,
+ i.e. functions that act on data,
+ control and status registers are
+ probably writing directly to the
+ hardware.
+ PARPORT_MODE_TRISTATE The data drivers may be turned off.
+ This allows the data lines to be used
+ for reverse (peripheral to host)
+ transfers.
+ PARPORT_MODE_COMPAT The hardware can assist with
+ compatibility-mode (printer)
+ transfers, i.e. compat_write_block.
+ PARPORT_MODE_EPP The hardware can assist with EPP
+ transfers.
+ PARPORT_MODE_ECP The hardware can assist with ECP
+ transfers.
+ PARPORT_MODE_DMA The hardware can use DMA, so you might
+ want to pass ISA DMA-able memory
+ (i.e. memory allocated using the
+ GFP_DMA flag with kmalloc) to the
+ low-level driver in order to take
+ advantage of it.
+
+There may be other flags in 'modes' as well.
+
+The contents of 'modes' is advisory only. For example, if the
+hardware is capable of DMA, and PARPORT_MODE_DMA is in 'modes', it
+doesn't necessarily mean that DMA will always be used when possible.
+Similarly, hardware that is capable of assisting ECP transfers won't
+necessarily be used.
+
+RETURN VALUE
+
+Zero on success, otherwise an error code.
+
+ERRORS
+
+None. (Can it fail? Why return int?)
+
+EXAMPLE
+
+static void lp_attach (struct parport *port)
+{
+ ...
+ private = kmalloc (...);
+ dev[count++] = parport_register_device (...);
+ ...
+}
+
+static void lp_detach (struct parport *port)
+{
+ ...
+}
+
+static struct parport_driver lp_driver = {
+ "lp",
+ lp_attach,
+ lp_detach,
+ NULL /* always put NULL here */
+};
+
+int lp_init (void)
+{
+ ...
+ if (parport_register_driver (&lp_driver)) {
+ /* Failed; nothing we can do. */
+ return -EIO;
+ }
+ ...
+}
+
+SEE ALSO
+
+parport_unregister_driver, parport_register_device, parport_enumerate
+
+parport_unregister_driver - tell parport to forget about this driver
+-------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_driver {
+ const char *name;
+ void (*attach) (struct parport *);
+ void (*detach) (struct parport *);
+ struct parport_driver *next;
+};
+void parport_unregister_driver (struct parport_driver *driver);
+
+DESCRIPTION
+
+This tells parport not to notify the device driver of new ports or of
+ports going away. Registered devices belonging to that driver are NOT
+unregistered: parport_unregister_device must be used for each one.
+
+EXAMPLE
+
+void cleanup_module (void)
+{
+ ...
+ /* Stop notifications. */
+ parport_unregister_driver (&lp_driver);
+
+ /* Unregister devices. */
+ for (i = 0; i < NUM_DEVS; i++)
+ parport_unregister_device (dev[i]);
+ ...
+}
+
+SEE ALSO
+
+parport_register_driver, parport_enumerate
+
+parport_enumerate - retrieve a list of parallel ports (DEPRECATED)
+-----------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport *parport_enumerate (void);
+
+DESCRIPTION
+
+Retrieve the first of a list of valid parallel ports for this machine.
+Successive parallel ports can be found using the 'struct parport
+*next' element of the 'struct parport *' that is returned. If 'next'
+is NULL, there are no more parallel ports in the list. The number of
+ports in the list will not exceed PARPORT_MAX.
+
+RETURN VALUE
+
+A 'struct parport *' describing a valid parallel port for the machine,
+or NULL if there are none.
+
+ERRORS
+
+This function can return NULL to indicate that there are no parallel
+ports to use.
+
+EXAMPLE
+
+int detect_device (void)
+{
+ struct parport *port;
+
+ for (port = parport_enumerate ();
+ port != NULL;
+ port = port->next) {
+ /* Try to detect a device on the port... */
+ ...
+ }
+ }
+
+ ...
+}
+
+NOTES
+
+parport_enumerate is deprecated; parport_register_driver should be
+used instead.
+
+SEE ALSO
+
+parport_register_driver, parport_unregister_driver
+
+parport_register_device - register to use a port
+-----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+typedef int (*preempt_func) (void *handle);
+typedef void (*wakeup_func) (void *handle);
+typedef int (*irq_func) (int irq, void *handle, struct pt_regs *);
+
+struct pardevice *parport_register_device(struct parport *port,
+ const char *name,
+ preempt_func preempt,
+ wakeup_func wakeup,
+ irq_func irq,
+ int flags,
+ void *handle);
+
+DESCRIPTION
+
+Use this function to register your device driver on a parallel port
+('port'). Once you have done that, you will be able to use
+parport_claim and parport_release in order to use the port.
+
+The ('name') argument is the name of the device that appears in /proc
+filesystem. The string must be valid for the whole lifetime of the
+device (until parport_unregister_device is called).
+
+This function will register three callbacks into your driver:
+'preempt', 'wakeup' and 'irq'. Each of these may be NULL in order to
+indicate that you do not want a callback.
+
+When the 'preempt' function is called, it is because another driver
+wishes to use the parallel port. The 'preempt' function should return
+non-zero if the parallel port cannot be released yet -- if zero is
+returned, the port is lost to another driver and the port must be
+re-claimed before use.
+
+The 'wakeup' function is called once another driver has released the
+port and no other driver has yet claimed it. You can claim the
+parallel port from within the 'wakeup' function (in which case the
+claim is guaranteed to succeed), or choose not to if you don't need it
+now.
+
+If an interrupt occurs on the parallel port your driver has claimed,
+the 'irq' function will be called. (Write something about shared
+interrupts here.)
+
+The 'handle' is a pointer to driver-specific data, and is passed to
+the callback functions.
+
+'flags' may be a bitwise combination of the following flags:
+
+ Flag Meaning
+ PARPORT_DEV_EXCL The device cannot share the parallel port at all.
+ Use this only when absolutely necessary.
+
+The typedefs are not actually defined -- they are only shown in order
+to make the function prototype more readable.
+
+The visible parts of the returned 'struct pardevice' are:
+
+struct pardevice {
+ struct parport *port; /* Associated port */
+ void *private; /* Device driver's 'handle' */
+ ...
+};
+
+RETURN VALUE
+
+A 'struct pardevice *': a handle to the registered parallel port
+device that can be used for parport_claim, parport_release, etc.
+
+ERRORS
+
+A return value of NULL indicates that there was a problem registering
+a device on that port.
+
+EXAMPLE
+
+static int preempt (void *handle)
+{
+ if (busy_right_now)
+ return 1;
+
+ must_reclaim_port = 1;
+ return 0;
+}
+
+static void wakeup (void *handle)
+{
+ struct toaster *private = handle;
+ struct pardevice *dev = private->dev;
+ if (!dev) return; /* avoid races */
+
+ if (want_port)
+ parport_claim (dev);
+}
+
+static int toaster_detect (struct toaster *private, struct parport *port)
+{
+ private->dev = parport_register_device (port, "toaster", preempt,
+ wakeup, NULL, 0,
+ private);
+ if (!private->dev)
+ /* Couldn't register with parport. */
+ return -EIO;
+
+ must_reclaim_port = 0;
+ busy_right_now = 1;
+ parport_claim_or_block (private->dev);
+ ...
+ /* Don't need the port while the toaster warms up. */
+ busy_right_now = 0;
+ ...
+ busy_right_now = 1;
+ if (must_reclaim_port) {
+ parport_claim_or_block (private->dev);
+ must_reclaim_port = 0;
+ }
+ ...
+}
+
+SEE ALSO
+
+parport_unregister_device, parport_claim
+
+parport_unregister_device - finish using a port
+-------------------------
+
+SYNPOPSIS
+
+#include <linux/parport.h>
+
+void parport_unregister_device (struct pardevice *dev);
+
+DESCRIPTION
+
+This function is the opposite of parport_register_device. After using
+parport_unregister_device, 'dev' is no longer a valid device handle.
+
+You should not unregister a device that is currently claimed, although
+if you do it will be released automatically.
+
+EXAMPLE
+
+ ...
+ kfree (dev->private); /* before we lose the pointer */
+ parport_unregister_device (dev);
+ ...
+
+SEE ALSO
+
+parport_unregister_driver
+
+parport_claim, parport_claim_or_block - claim the parallel port for a device
+-------------------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+int parport_claim (struct pardevice *dev);
+int parport_claim_or_block (struct pardevice *dev);
+
+DESCRIPTION
+
+These functions attempt to gain control of the parallel port on which
+'dev' is registered. 'parport_claim' does not block, but
+'parport_claim_or_block' may do. (Put something here about blocking
+interruptibly or non-interruptibly.)
+
+You should not try to claim a port that you have already claimed.
+
+RETURN VALUE
+
+A return value of zero indicates that the port was successfully
+claimed, and the caller now has possession of the parallel port.
+
+If 'parport_claim_or_block' blocks before returning successfully, the
+return value is positive.
+
+ERRORS
+
+ -EAGAIN The port is unavailable at the moment, but another attempt
+ to claim it may succeed.
+
+SEE ALSO
+
+parport_release
+
+parport_release - release the parallel port
+---------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+void parport_release (struct pardevice *dev);
+
+DESCRIPTION
+
+Once a parallel port device has been claimed, it can be released using
+'parport_release'. It cannot fail, but you should not release a
+device that you do not have possession of.
+
+EXAMPLE
+
+static size_t write (struct pardevice *dev, const void *buf,
+ size_t len)
+{
+ ...
+ written = dev->port->ops->write_ecp_data (dev->port, buf,
+ len);
+ parport_release (dev);
+ ...
+}
+
+
+SEE ALSO
+
+change_mode, parport_claim, parport_claim_or_block, parport_yield
+
+parport_yield, parport_yield_blocking - temporarily release a parallel port
+-------------------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+int parport_yield (struct pardevice *dev)
+int parport_yield_blocking (struct pardevice *dev);
+
+DESCRIPTION
+
+When a driver has control of a parallel port, it may allow another
+driver to temporarily 'borrow' it. 'parport_yield' does not block;
+'parport_yield_blocking' may do.
+
+RETURN VALUE
+
+A return value of zero indicates that the caller still owns the port
+and the call did not block.
+
+A positive return value from 'parport_yield_blocking' indicates that
+the caller still owns the port and the call blocked.
+
+A return value of -EAGAIN indicates that the caller no longer owns the
+port, and it must be re-claimed before use.
+
+ERRORS
+
+ -EAGAIN Ownership of the parallel port was given away.
+
+SEE ALSO
+
+parport_release
+
+parport_wait_peripheral - wait for status lines, up to 35ms
+-----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+int parport_wait_peripheral (struct parport *port,
+ unsigned char mask,
+ unsigned char val);
+
+DESCRIPTION
+
+Wait for the status lines in mask to match the values in val.
+
+RETURN VALUE
+
+ -EINTR a signal is pending
+ 0 the status lines in mask have values in val
+ 1 timed out while waiting (35ms elapsed)
+
+SEE ALSO
+
+parport_poll_peripheral
+
+parport_poll_peripheral - wait for status lines, in usec
+-----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+int parport_poll_peripheral (struct parport *port,
+ unsigned char mask,
+ unsigned char val,
+ int usec);
+
+DESCRIPTION
+
+Wait for the status lines in mask to match the values in val.
+
+RETURN VALUE
+
+ -EINTR a signal is pending
+ 0 the status lines in mask have values in val
+ 1 timed out while waiting (usec microseconds have elapsed)
+
+SEE ALSO
+
+parport_wait_peripheral
+
+parport_wait_event - wait for an event on a port
+------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+int parport_wait_event (struct parport *port, signed long timeout)
+
+DESCRIPTION
+
+Wait for an event (e.g. interrupt) on a port. The timeout is in
+jiffies.
+
+RETURN VALUE
+
+ 0 success
+ <0 error (exit as soon as possible)
+ >0 timed out
+
+parport_negotiate - perform IEEE 1284 negotiation
+-----------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+int parport_negotiate (struct parport *, int mode);
+
+DESCRIPTION
+
+Perform IEEE 1284 negotiation.
+
+RETURN VALUE
+
+ 0 handshake OK; IEEE 1284 peripheral and mode available
+ -1 handshake failed; peripheral not compliant (or none present)
+ 1 handshake OK; IEEE 1284 peripheral present but mode not
+ available
+
+SEE ALSO
+
+parport_read, parport_write
+
+parport_read - read data from device
+------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+ssize_t parport_read (struct parport *, void *buf, size_t len);
+
+DESCRIPTION
+
+Read data from device in current IEEE 1284 transfer mode. This only
+works for modes that support reverse data transfer.
+
+RETURN VALUE
+
+If negative, an error code; otherwise the number of bytes transferred.
+
+SEE ALSO
+
+parport_write, parport_negotiate
+
+parport_write - write data to device
+-------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+ssize_t parport_write (struct parport *, const void *buf, size_t len);
+
+DESCRIPTION
+
+Write data to device in current IEEE 1284 transfer mode. This only
+works for modes that support forward data transfer.
+
+RETURN VALUE
+
+If negative, an error code; otherwise the number of bytes transferred.
+
+SEE ALSO
+
+parport_read, parport_negotiate
+
+parport_open - register device for particular device number
+------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct pardevice *parport_open (int devnum, const char *name,
+ int (*pf) (void *),
+ void (*kf) (void *),
+ void (*irqf) (int, void *,
+ struct pt_regs *),
+ int flags, void *handle);
+
+DESCRIPTION
+
+This is like parport_register_device but takes a device number instead
+of a pointer to a struct parport.
+
+RETURN VALUE
+
+See parport_register_device. If no device is associated with devnum,
+NULL is returned.
+
+SEE ALSO
+
+parport_register_device
+
+parport_close - unregister device for particular device number
+-------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+void parport_close (struct pardevice *dev);
+
+DESCRIPTION
+
+This is the equivalent of parport_unregister_device for parport_open.
+
+SEE ALSO
+
+parport_unregister_device, parport_open
+
+parport_device_id - obtain IEEE 1284 Device ID
+-----------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+ssize_t parport_device_id (int devnum, char *buffer, size_t len);
+
+DESCRIPTION
+
+Obtains the IEEE 1284 Device ID associated with a given device.
+
+RETURN VALUE
+
+If negative, an error code; otherwise, the number of bytes of buffer
+that contain the device ID. The format of the device ID is as
+follows:
+
+[length][ID]
+
+The first two bytes indicate the inclusive length of the entire Device
+ID, and are in big-endian order. The ID is a sequence of pairs of the
+form:
+
+key:value;
+
+NOTES
+
+Many devices have ill-formed IEEE 1284 Device IDs.
+
+SEE ALSO
+
+parport_find_class, parport_find_device
+
+parport_device_coords - convert device number to device coordinates
+------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+int parport_device_coords (int devnum, int *parport, int *mux,
+ int *daisy);
+
+DESCRIPTION
+
+Convert between device number (zero-based) and device coordinates
+(port, multiplexor, daisy chain address).
+
+RETURN VALUE
+
+Zero on success, in which case the coordinates are (*parport, *mux,
+*daisy).
+
+SEE ALSO
+
+parport_open, parport_device_id
+
+parport_find_class - find a device by its class
+------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+typedef enum {
+ PARPORT_CLASS_LEGACY = 0, /* Non-IEEE1284 device */
+ PARPORT_CLASS_PRINTER,
+ PARPORT_CLASS_MODEM,
+ PARPORT_CLASS_NET,
+ PARPORT_CLASS_HDC, /* Hard disk controller */
+ PARPORT_CLASS_PCMCIA,
+ PARPORT_CLASS_MEDIA, /* Multimedia device */
+ PARPORT_CLASS_FDC, /* Floppy disk controller */
+ PARPORT_CLASS_PORTS,
+ PARPORT_CLASS_SCANNER,
+ PARPORT_CLASS_DIGCAM,
+ PARPORT_CLASS_OTHER, /* Anything else */
+ PARPORT_CLASS_UNSPEC, /* No CLS field in ID */
+ PARPORT_CLASS_SCSIADAPTER
+} parport_device_class;
+
+int parport_find_class (parport_device_class cls, int from);
+
+DESCRIPTION
+
+Find a device by class. The search starts from device number from+1.
+
+RETURN VALUE
+
+The device number of the next device in that class, or -1 if no such
+device exists.
+
+NOTES
+
+Example usage:
+
+int devnum = -1;
+while ((devnum = parport_find_class (PARPORT_CLASS_DIGCAM, devnum)) != -1) {
+ struct pardevice *dev = parport_open (devnum, ...);
+ ...
+}
+
+SEE ALSO
+
+parport_find_device, parport_open, parport_device_id
+
+parport_find_device - find a device by its class
+------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+int parport_find_device (const char *mfg, const char *mdl, int from);
+
+DESCRIPTION
+
+Find a device by vendor and model. The search starts from device
+number from+1.
+
+RETURN VALUE
+
+The device number of the next device matching the specifications, or
+-1 if no such device exists.
+
+NOTES
+
+Example usage:
+
+int devnum = -1;
+while ((devnum = parport_find_device ("IOMEGA", "ZIP+", devnum)) != -1) {
+ struct pardevice *dev = parport_open (devnum, ...);
+ ...
+}
+
+SEE ALSO
+
+parport_find_class, parport_open, parport_device_id
+
+parport_set_timeout - set the inactivity timeout
+-------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+long parport_set_timeout (struct pardevice *dev, long inactivity);
+
+DESCRIPTION
+
+Set the inactivity timeout, in jiffies, for a registered device. The
+previous timeout is returned.
+
+RETURN VALUE
+
+The previous timeout, in jiffies.
+
+NOTES
+
+Some of the port->ops functions for a parport may take time, owing to
+delays at the peripheral. After the peripheral has not responded for
+'inactivity' jiffies, a timeout will occur and the blocking function
+will return.
+
+A timeout of 0 jiffies is a special case: the function must do as much
+as it can without blocking or leaving the hardware in an unknown
+state. If port operations are performed from within an interrupt
+handler, for instance, a timeout of 0 jiffies should be used.
+
+Once set for a registered device, the timeout will remain at the set
+value until set again.
+
+SEE ALSO
+
+port->ops->xxx_read/write_yyy
+
+PORT FUNCTIONS
+--------------
+
+The functions in the port->ops structure (struct parport_operations)
+are provided by the low-level driver responsible for that port.
+
+port->ops->read_data - read the data register
+--------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ unsigned char (*read_data) (struct parport *port);
+ ...
+};
+
+DESCRIPTION
+
+If port->modes contains the PARPORT_MODE_TRISTATE flag and the
+PARPORT_CONTROL_DIRECTION bit in the control register is set, this
+returns the value on the data pins. If port->modes contains the
+PARPORT_MODE_TRISTATE flag and the PARPORT_CONTROL_DIRECTION bit is
+not set, the return value _may_ be the last value written to the data
+register. Otherwise the return value is undefined.
+
+SEE ALSO
+
+write_data, read_status, write_control
+
+port->ops->write_data - write the data register
+---------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ void (*write_data) (struct parport *port, unsigned char d);
+ ...
+};
+
+DESCRIPTION
+
+Writes to the data register. May have side-effects (a STROBE pulse,
+for instance).
+
+SEE ALSO
+
+read_data, read_status, write_control
+
+port->ops->read_status - read the status register
+----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ unsigned char (*read_status) (struct parport *port);
+ ...
+};
+
+DESCRIPTION
+
+Reads from the status register. This is a bitmask:
+
+- PARPORT_STATUS_ERROR (printer fault, "nFault")
+- PARPORT_STATUS_SELECT (on-line, "Select")
+- PARPORT_STATUS_PAPEROUT (no paper, "PError")
+- PARPORT_STATUS_ACK (handshake, "nAck")
+- PARPORT_STATUS_BUSY (busy, "Busy")
+
+There may be other bits set.
+
+SEE ALSO
+
+read_data, write_data, write_control
+
+port->ops->read_control - read the control register
+-----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ unsigned char (*read_control) (struct parport *port);
+ ...
+};
+
+DESCRIPTION
+
+Returns the last value written to the control register (either from
+write_control or frob_control). No port access is performed.
+
+SEE ALSO
+
+read_data, write_data, read_status, write_control
+
+port->ops->write_control - write the control register
+------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ void (*write_control) (struct parport *port, unsigned char s);
+ ...
+};
+
+DESCRIPTION
+
+Writes to the control register. This is a bitmask:
+ _______
+- PARPORT_CONTROL_STROBE (nStrobe)
+ _______
+- PARPORT_CONTROL_AUTOFD (nAutoFd)
+ _____
+- PARPORT_CONTROL_INIT (nInit)
+ _________
+- PARPORT_CONTROL_SELECT (nSelectIn)
+
+SEE ALSO
+
+read_data, write_data, read_status, frob_control
+
+port->ops->frob_control - write control register bits
+-----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ unsigned char (*frob_control) (struct parport *port,
+ unsigned char mask,
+ unsigned char val);
+ ...
+};
+
+DESCRIPTION
+
+This is equivalent to reading from the control register, masking out
+the bits in mask, exclusive-or'ing with the bits in val, and writing
+the result to the control register.
+
+As some ports don't allow reads from the control port, a software copy
+of its contents is maintained, so frob_control is in fact only one
+port access.
+
+SEE ALSO
+
+read_data, write_data, read_status, write_control
+
+port->ops->enable_irq - enable interrupt generation
+---------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ void (*enable_irq) (struct parport *port);
+ ...
+};
+
+DESCRIPTION
+
+The parallel port hardware is instructed to generate interrupts at
+appropriate moments, although those moments are
+architecture-specific. For the PC architecture, interrupts are
+commonly generated on the rising edge of nAck.
+
+SEE ALSO
+
+disable_irq
+
+port->ops->disable_irq - disable interrupt generation
+----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ void (*disable_irq) (struct parport *port);
+ ...
+};
+
+DESCRIPTION
+
+The parallel port hardware is instructed not to generate interrupts.
+The interrupt itself is not masked.
+
+SEE ALSO
+
+enable_irq
+
+port->ops->data_forward - enable data drivers
+-----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ void (*data_forward) (struct parport *port);
+ ...
+};
+
+DESCRIPTION
+
+Enables the data line drivers, for 8-bit host-to-peripheral
+communications.
+
+SEE ALSO
+
+data_reverse
+
+port->ops->data_reverse - tristate the buffer
+-----------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ void (*data_reverse) (struct parport *port);
+ ...
+};
+
+DESCRIPTION
+
+Places the data bus in a high impedance state, if port->modes has the
+PARPORT_MODE_TRISTATE bit set.
+
+SEE ALSO
+
+data_forward
+
+port->ops->epp_write_data - write EPP data
+-------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*epp_write_data) (struct parport *port, const void *buf,
+ size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Writes data in EPP mode, and returns the number of bytes written.
+
+The 'flags' parameter may be one or more of the following,
+bitwise-or'ed together:
+
+PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and
+ 32-bit registers. However, if a transfer
+ times out, the return value may be unreliable.
+
+SEE ALSO
+
+epp_read_data, epp_write_addr, epp_read_addr
+
+port->ops->epp_read_data - read EPP data
+------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*epp_read_data) (struct parport *port, void *buf,
+ size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Reads data in EPP mode, and returns the number of bytes read.
+
+The 'flags' parameter may be one or more of the following,
+bitwise-or'ed together:
+
+PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and
+ 32-bit registers. However, if a transfer
+ times out, the return value may be unreliable.
+
+SEE ALSO
+
+epp_write_data, epp_write_addr, epp_read_addr
+
+port->ops->epp_write_addr - write EPP address
+-------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*epp_write_addr) (struct parport *port,
+ const void *buf, size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Writes EPP addresses (8 bits each), and returns the number written.
+
+The 'flags' parameter may be one or more of the following,
+bitwise-or'ed together:
+
+PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and
+ 32-bit registers. However, if a transfer
+ times out, the return value may be unreliable.
+
+(Does PARPORT_EPP_FAST make sense for this function?)
+
+SEE ALSO
+
+epp_write_data, epp_read_data, epp_read_addr
+
+port->ops->epp_read_addr - read EPP address
+------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*epp_read_addr) (struct parport *port, void *buf,
+ size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Reads EPP addresses (8 bits each), and returns the number read.
+
+The 'flags' parameter may be one or more of the following,
+bitwise-or'ed together:
+
+PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and
+ 32-bit registers. However, if a transfer
+ times out, the return value may be unreliable.
+
+(Does PARPORT_EPP_FAST make sense for this function?)
+
+SEE ALSO
+
+epp_write_data, epp_read_data, epp_write_addr
+
+port->ops->ecp_write_data - write a block of ECP data
+-------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*ecp_write_data) (struct parport *port,
+ const void *buf, size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Writes a block of ECP data. The 'flags' parameter is ignored.
+
+RETURN VALUE
+
+The number of bytes written.
+
+SEE ALSO
+
+ecp_read_data, ecp_write_addr
+
+port->ops->ecp_read_data - read a block of ECP data
+------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*ecp_read_data) (struct parport *port,
+ void *buf, size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Reads a block of ECP data. The 'flags' parameter is ignored.
+
+RETURN VALUE
+
+The number of bytes read. NB. There may be more unread data in a
+FIFO. Is there a way of stunning the FIFO to prevent this?
+
+SEE ALSO
+
+ecp_write_block, ecp_write_addr
+
+port->ops->ecp_write_addr - write a block of ECP addresses
+-------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*ecp_write_addr) (struct parport *port,
+ const void *buf, size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Writes a block of ECP addresses. The 'flags' parameter is ignored.
+
+RETURN VALUE
+
+The number of bytes written.
+
+NOTES
+
+This may use a FIFO, and if so shall not return until the FIFO is empty.
+
+SEE ALSO
+
+ecp_read_data, ecp_write_data
+
+port->ops->nibble_read_data - read a block of data in nibble mode
+---------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*nibble_read_data) (struct parport *port,
+ void *buf, size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Reads a block of data in nibble mode. The 'flags' parameter is ignored.
+
+RETURN VALUE
+
+The number of whole bytes read.
+
+SEE ALSO
+
+byte_read_data, compat_write_data
+
+port->ops->byte_read_data - read a block of data in byte mode
+-------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*byte_read_data) (struct parport *port,
+ void *buf, size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Reads a block of data in byte mode. The 'flags' parameter is ignored.
+
+RETURN VALUE
+
+The number of bytes read.
+
+SEE ALSO
+
+nibble_read_data, compat_write_data
+
+port->ops->compat_write_data - write a block of data in compatibility mode
+----------------------------
+
+SYNOPSIS
+
+#include <linux/parport.h>
+
+struct parport_operations {
+ ...
+ size_t (*compat_write_data) (struct parport *port,
+ const void *buf, size_t len, int flags);
+ ...
+};
+
+DESCRIPTION
+
+Writes a block of data in compatibility mode. The 'flags' parameter
+is ignored.
+
+RETURN VALUE
+
+The number of bytes written.
+
+SEE ALSO
+
+nibble_read_data, byte_read_data
diff --git a/Documentation/parport.txt b/Documentation/parport.txt
new file mode 100644
index 0000000..93a7cee
--- /dev/null
+++ b/Documentation/parport.txt
@@ -0,0 +1,268 @@
+The `parport' code provides parallel-port support under Linux. This
+includes the ability to share one port between multiple device
+drivers.
+
+You can pass parameters to the parport code to override its automatic
+detection of your hardware. This is particularly useful if you want
+to use IRQs, since in general these can't be autoprobed successfully.
+By default IRQs are not used even if they _can_ be probed. This is
+because there are a lot of people using the same IRQ for their
+parallel port and a sound card or network card.
+
+The parport code is split into two parts: generic (which deals with
+port-sharing) and architecture-dependent (which deals with actually
+using the port).
+
+
+Parport as modules
+==================
+
+If you load the parport code as a module, say
+
+ # insmod parport
+
+to load the generic parport code. You then must load the
+architecture-dependent code with (for example):
+
+ # insmod parport_pc io=0x3bc,0x378,0x278 irq=none,7,auto
+
+to tell the parport code that you want three PC-style ports, one at
+0x3bc with no IRQ, one at 0x378 using IRQ 7, and one at 0x278 with an
+auto-detected IRQ. Currently, PC-style (parport_pc), Sun `bpp',
+Amiga, Atari, and MFC3 hardware is supported.
+
+PCI parallel I/O card support comes from parport_pc. Base I/O
+addresses should not be specified for supported PCI cards since they
+are automatically detected.
+
+
+KMod
+----
+
+If you use kmod, you will find it useful to edit /etc/modprobe.conf.
+Here is an example of the lines that need to be added:
+
+ alias parport_lowlevel parport_pc
+ options parport_pc io=0x378,0x278 irq=7,auto
+
+KMod will then automatically load parport_pc (with the options
+"io=0x378,0x278 irq=7,auto") whenever a parallel port device driver
+(such as lp) is loaded.
+
+Note that these are example lines only! You shouldn't in general need
+to specify any options to parport_pc in order to be able to use a
+parallel port.
+
+
+Parport probe [optional]
+-------------
+
+In 2.2 kernels there was a module called parport_probe, which was used
+for collecting IEEE 1284 device ID information. This has now been
+enhanced and now lives with the IEEE 1284 support. When a parallel
+port is detected, the devices that are connected to it are analysed,
+and information is logged like this:
+
+ parport0: Printer, BJC-210 (Canon)
+
+The probe information is available from files in /proc/sys/dev/parport/.
+
+
+Parport linked into the kernel statically
+=========================================
+
+If you compile the parport code into the kernel, then you can use
+kernel boot parameters to get the same effect. Add something like the
+following to your LILO command line:
+
+ parport=0x3bc parport=0x378,7 parport=0x278,auto,nofifo
+
+You can have many `parport=...' statements, one for each port you want
+to add. Adding `parport=0' to the kernel command-line will disable
+parport support entirely. Adding `parport=auto' to the kernel
+command-line will make parport use any IRQ lines or DMA channels that
+it auto-detects.
+
+
+Files in /proc
+==============
+
+If you have configured the /proc filesystem into your kernel, you will
+see a new directory entry: /proc/sys/dev/parport. In there will be a
+directory entry for each parallel port for which parport is
+configured. In each of those directories are a collection of files
+describing that parallel port.
+
+The /proc/sys/dev/parport directory tree looks like:
+
+parport
+|-- default
+| |-- spintime
+| `-- timeslice
+|-- parport0
+| |-- autoprobe
+| |-- autoprobe0
+| |-- autoprobe1
+| |-- autoprobe2
+| |-- autoprobe3
+| |-- devices
+| | |-- active
+| | `-- lp
+| | `-- timeslice
+| |-- base-addr
+| |-- irq
+| |-- dma
+| |-- modes
+| `-- spintime
+`-- parport1
+ |-- autoprobe
+ |-- autoprobe0
+ |-- autoprobe1
+ |-- autoprobe2
+ |-- autoprobe3
+ |-- devices
+ | |-- active
+ | `-- ppa
+ | `-- timeslice
+ |-- base-addr
+ |-- irq
+ |-- dma
+ |-- modes
+ `-- spintime
+
+
+File: Contents:
+
+devices/active A list of the device drivers using that port. A "+"
+ will appear by the name of the device currently using
+ the port (it might not appear against any). The
+ string "none" means that there are no device drivers
+ using that port.
+
+base-addr Parallel port's base address, or addresses if the port
+ has more than one in which case they are separated
+ with tabs. These values might not have any sensible
+ meaning for some ports.
+
+irq Parallel port's IRQ, or -1 if none is being used.
+
+dma Parallel port's DMA channel, or -1 if none is being
+ used.
+
+modes Parallel port's hardware modes, comma-separated,
+ meaning:
+
+ PCSPP PC-style SPP registers are available.
+ TRISTATE Port is bidirectional.
+ COMPAT Hardware acceleration for printers is
+ available and will be used.
+ EPP Hardware acceleration for EPP protocol
+ is available and will be used.
+ ECP Hardware acceleration for ECP protocol
+ is available and will be used.
+ DMA DMA is available and will be used.
+
+ Note that the current implementation will only take
+ advantage of COMPAT and ECP modes if it has an IRQ
+ line to use.
+
+autoprobe Any IEEE-1284 device ID information that has been
+ acquired from the (non-IEEE 1284.3) device.
+
+autoprobe[0-3] IEEE 1284 device ID information retrieved from
+ daisy-chain devices that conform to IEEE 1284.3.
+
+spintime The number of microseconds to busy-loop while waiting
+ for the peripheral to respond. You might find that
+ adjusting this improves performance, depending on your
+ peripherals. This is a port-wide setting, i.e. it
+ applies to all devices on a particular port.
+
+timeslice The number of milliseconds that a device driver is
+ allowed to keep a port claimed for. This is advisory,
+ and driver can ignore it if it must.
+
+default/* The defaults for spintime and timeslice. When a new
+ port is registered, it picks up the default spintime.
+ When a new device is registered, it picks up the
+ default timeslice.
+
+Device drivers
+==============
+
+Once the parport code is initialised, you can attach device drivers to
+specific ports. Normally this happens automatically; if the lp driver
+is loaded it will create one lp device for each port found. You can
+override this, though, by using parameters either when you load the lp
+driver:
+
+ # insmod lp parport=0,2
+
+or on the LILO command line:
+
+ lp=parport0 lp=parport2
+
+Both the above examples would inform lp that you want /dev/lp0 to be
+the first parallel port, and /dev/lp1 to be the _third_ parallel port,
+with no lp device associated with the second port (parport1). Note
+that this is different to the way older kernels worked; there used to
+be a static association between the I/O port address and the device
+name, so /dev/lp0 was always the port at 0x3bc. This is no longer the
+case - if you only have one port, it will default to being /dev/lp0,
+regardless of base address.
+
+Also:
+
+ * If you selected the IEEE 1284 support at compile time, you can say
+ `lp=auto' on the kernel command line, and lp will create devices
+ only for those ports that seem to have printers attached.
+
+ * If you give PLIP the `timid' parameter, either with `plip=timid' on
+ the command line, or with `insmod plip timid=1' when using modules,
+ it will avoid any ports that seem to be in use by other devices.
+
+ * IRQ autoprobing works only for a few port types at the moment.
+
+Reporting printer problems with parport
+=======================================
+
+If you are having problems printing, please go through these steps to
+try to narrow down where the problem area is.
+
+When reporting problems with parport, really you need to give all of
+the messages that parport_pc spits out when it initialises. There are
+several code paths:
+
+o polling
+o interrupt-driven, protocol in software
+o interrupt-driven, protocol in hardware using PIO
+o interrupt-driven, protocol in hardware using DMA
+
+The kernel messages that parport_pc logs give an indication of which
+code path is being used. (They could be a lot better actually..)
+
+For normal printer protocol, having IEEE 1284 modes enabled or not
+should not make a difference.
+
+To turn off the 'protocol in hardware' code paths, disable
+CONFIG_PARPORT_PC_FIFO. Note that when they are enabled they are not
+necessarily _used_; it depends on whether the hardware is available,
+enabled by the BIOS, and detected by the driver.
+
+So, to start with, disable CONFIG_PARPORT_PC_FIFO, and load parport_pc
+with 'irq=none'. See if printing works then. It really should,
+because this is the simplest code path.
+
+If that works fine, try with 'io=0x378 irq=7' (adjust for your
+hardware), to make it use interrupt-driven in-software protocol.
+
+If _that_ works fine, then one of the hardware modes isn't working
+right. Enable CONFIG_PARPORT_PC_FIFO (no, it isn't a module option,
+and yes, it should be), set the port to ECP mode in the BIOS and note
+the DMA channel, and try with:
+
+ io=0x378 irq=7 dma=none (for PIO)
+ io=0x378 irq=7 dma=3 (for DMA)
+--
+philb@gnu.org
+tim@cyberelk.net
diff --git a/Documentation/pcmcia/.gitignore b/Documentation/pcmcia/.gitignore
new file mode 100644
index 0000000..53d0813
--- /dev/null
+++ b/Documentation/pcmcia/.gitignore
@@ -0,0 +1 @@
+crc32hash
diff --git a/Documentation/pcmcia/Makefile b/Documentation/pcmcia/Makefile
new file mode 100644
index 0000000..accde87
--- /dev/null
+++ b/Documentation/pcmcia/Makefile
@@ -0,0 +1,10 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := crc32hash
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_crc32hash.o += -I$(objtree)/usr/include
diff --git a/Documentation/pcmcia/crc32hash.c b/Documentation/pcmcia/crc32hash.c
new file mode 100644
index 0000000..4210e5a
--- /dev/null
+++ b/Documentation/pcmcia/crc32hash.c
@@ -0,0 +1,32 @@
+/* crc32hash.c - derived from linux/lib/crc32.c, GNU GPL v2 */
+/* Usage example:
+$ ./crc32hash "Dual Speed"
+*/
+
+#include <string.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+
+unsigned int crc32(unsigned char const *p, unsigned int len)
+{
+ int i;
+ unsigned int crc = 0;
+ while (len--) {
+ crc ^= *p++;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
+ }
+ return crc;
+}
+
+int main(int argc, char **argv) {
+ unsigned int result;
+ if (argc != 2) {
+ printf("no string passed as argument\n");
+ return -1;
+ }
+ result = crc32((unsigned char const *)argv[1], strlen(argv[1]));
+ printf("0x%x\n", result);
+ return 0;
+}
diff --git a/Documentation/pcmcia/devicetable.txt b/Documentation/pcmcia/devicetable.txt
new file mode 100644
index 0000000..199afd1
--- /dev/null
+++ b/Documentation/pcmcia/devicetable.txt
@@ -0,0 +1,33 @@
+Matching of PCMCIA devices to drivers is done using one or more of the
+following criteria:
+
+- manufactor ID
+- card ID
+- product ID strings _and_ hashes of these strings
+- function ID
+- device function (actual and pseudo)
+
+You should use the helpers in include/pcmcia/device_id.h for generating the
+struct pcmcia_device_id[] entries which match devices to drivers.
+
+If you want to match product ID strings, you also need to pass the crc32
+hashes of the string to the macro, e.g. if you want to match the product ID
+string 1, you need to use
+
+PCMCIA_DEVICE_PROD_ID1("some_string", 0x(hash_of_some_string)),
+
+If the hash is incorrect, the kernel will inform you about this in "dmesg"
+upon module initialization, and tell you of the correct hash.
+
+You can determine the hash of the product ID strings by catting the file
+"modalias" in the sysfs directory of the PCMCIA device. It generates a string
+in the following form:
+pcmcia:m0149cC1ABf06pfn00fn00pa725B842DpbF1EFEE84pc0877B627pd00000000
+
+The hex value after "pa" is the hash of product ID string 1, after "pb" for
+string 2 and so on.
+
+Alternatively, you can use crc32hash (see Documentation/pcmcia/crc32hash.c)
+to determine the crc32 hash. Simply pass the string you want to evaluate
+as argument to this program, e.g.:
+$ ./crc32hash "Dual Speed"
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
new file mode 100644
index 0000000..0599343
--- /dev/null
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -0,0 +1,90 @@
+This file details changes in 2.6 which affect PCMCIA card driver authors:
+
+* New configuration loop helper (as of 2.6.28)
+ By calling pcmcia_loop_config(), a driver can iterate over all available
+ configuration options. During a driver's probe() phase, one doesn't need
+ to use pcmcia_get_{first,next}_tuple, pcmcia_get_tuple_data and
+ pcmcia_parse_tuple directly in most if not all cases.
+
+* New release helper (as of 2.6.17)
+ Instead of calling pcmcia_release_{configuration,io,irq,win}, all that's
+ necessary now is calling pcmcia_disable_device. As there is no valid
+ reason left to call pcmcia_release_io and pcmcia_release_irq, the
+ exports for them were removed.
+
+* Unify detach and REMOVAL event code, as well as attach and INSERTION
+ code (as of 2.6.16)
+ void (*remove) (struct pcmcia_device *dev);
+ int (*probe) (struct pcmcia_device *dev);
+
+* Move suspend, resume and reset out of event handler (as of 2.6.16)
+ int (*suspend) (struct pcmcia_device *dev);
+ int (*resume) (struct pcmcia_device *dev);
+ should be initialized in struct pcmcia_driver, and handle
+ (SUSPEND == RESET_PHYSICAL) and (RESUME == CARD_RESET) events
+
+* event handler initialization in struct pcmcia_driver (as of 2.6.13)
+ The event handler is notified of all events, and must be initialized
+ as the event() callback in the driver's struct pcmcia_driver.
+
+* pcmcia/version.h should not be used (as of 2.6.13)
+ This file will be removed eventually.
+
+* in-kernel device<->driver matching (as of 2.6.13)
+ PCMCIA devices and their correct drivers can now be matched in
+ kernelspace. See 'devicetable.txt' for details.
+
+* Device model integration (as of 2.6.11)
+ A struct pcmcia_device is registered with the device model core,
+ and can be used (e.g. for SET_NETDEV_DEV) by using
+ handle_to_dev(client_handle_t * handle).
+
+* Convert internal I/O port addresses to unsigned int (as of 2.6.11)
+ ioaddr_t should be replaced by unsigned int in PCMCIA card drivers.
+
+* irq_mask and irq_list parameters (as of 2.6.11)
+ The irq_mask and irq_list parameters should no longer be used in
+ PCMCIA card drivers. Instead, it is the job of the PCMCIA core to
+ determine which IRQ should be used. Therefore, link->irq.IRQInfo2
+ is ignored.
+
+* client->PendingEvents is gone (as of 2.6.11)
+ client->PendingEvents is no longer available.
+
+* client->Attributes are gone (as of 2.6.11)
+ client->Attributes is unused, therefore it is removed from all
+ PCMCIA card drivers
+
+* core functions no longer available (as of 2.6.11)
+ The following functions have been removed from the kernel source
+ because they are unused by all in-kernel drivers, and no external
+ driver was reported to rely on them:
+ pcmcia_get_first_region()
+ pcmcia_get_next_region()
+ pcmcia_modify_window()
+ pcmcia_set_event_mask()
+ pcmcia_get_first_window()
+ pcmcia_get_next_window()
+
+* device list iteration upon module removal (as of 2.6.10)
+ It is no longer necessary to iterate on the driver's internal
+ client list and call the ->detach() function upon module removal.
+
+* Resource management. (as of 2.6.8)
+ Although the PCMCIA subsystem will allocate resources for cards,
+ it no longer marks these resources busy. This means that driver
+ authors are now responsible for claiming your resources as per
+ other drivers in Linux. You should use request_region() to mark
+ your IO regions in-use, and request_mem_region() to mark your
+ memory regions in-use. The name argument should be a pointer to
+ your driver name. Eg, for pcnet_cs, name should point to the
+ string "pcnet_cs".
+
+* CardServices is gone
+ CardServices() in 2.4 is just a big switch statement to call various
+ services. In 2.6, all of those entry points are exported and called
+ directly (except for pcmcia_report_error(), just use cs_error() instead).
+
+* struct pcmcia_driver
+ You need to use struct pcmcia_driver and pcmcia_{un,}register_driver
+ instead of {un,}register_pccard_driver
diff --git a/Documentation/pcmcia/driver.txt b/Documentation/pcmcia/driver.txt
new file mode 100644
index 0000000..0ac1679
--- /dev/null
+++ b/Documentation/pcmcia/driver.txt
@@ -0,0 +1,30 @@
+PCMCIA Driver
+-------------
+
+
+sysfs
+-----
+
+New PCMCIA IDs may be added to a device driver pcmcia_device_id table at
+runtime as shown below:
+
+echo "match_flags manf_id card_id func_id function device_no \
+prod_id_hash[0] prod_id_hash[1] prod_id_hash[2] prod_id_hash[3]" > \
+/sys/bus/pcmcia/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The meaning is described in the PCMCIA specification, the match_flags is
+a bitwise or-ed combination from PCMCIA_DEV_ID_MATCH_* constants
+defined in include/linux/mod_devicetable.h.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCMCIA device listed in its (newly updated) pcmcia_device_id list.
+
+A common use-case is to add a new device according to the manufacturer ID
+and the card ID (form the manf_id and card_id file in the device tree).
+For this, just use:
+
+echo "0x3 manf_id card_id 0 0 0 0 0 0 0" > \
+ /sys/bus/pcmcia/drivers/{driver}/new_id
+
+after loading the driver.
diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt
new file mode 100644
index 0000000..9a5bc86
--- /dev/null
+++ b/Documentation/pi-futex.txt
@@ -0,0 +1,121 @@
+Lightweight PI-futexes
+----------------------
+
+We are calling them lightweight for 3 reasons:
+
+ - in the user-space fastpath a PI-enabled futex involves no kernel work
+ (or any other PI complexity) at all. No registration, no extra kernel
+ calls - just pure fast atomic ops in userspace.
+
+ - even in the slowpath, the system call and scheduling pattern is very
+ similar to normal futexes.
+
+ - the in-kernel PI implementation is streamlined around the mutex
+ abstraction, with strict rules that keep the implementation
+ relatively simple: only a single owner may own a lock (i.e. no
+ read-write lock support), only the owner may unlock a lock, no
+ recursive locking, etc.
+
+Priority Inheritance - why?
+---------------------------
+
+The short reply: user-space PI helps achieving/improving determinism for
+user-space applications. In the best-case, it can help achieve
+determinism and well-bound latencies. Even in the worst-case, PI will
+improve the statistical distribution of locking related application
+delays.
+
+The longer reply:
+-----------------
+
+Firstly, sharing locks between multiple tasks is a common programming
+technique that often cannot be replaced with lockless algorithms. As we
+can see it in the kernel [which is a quite complex program in itself],
+lockless structures are rather the exception than the norm - the current
+ratio of lockless vs. locky code for shared data structures is somewhere
+between 1:10 and 1:100. Lockless is hard, and the complexity of lockless
+algorithms often endangers to ability to do robust reviews of said code.
+I.e. critical RT apps often choose lock structures to protect critical
+data structures, instead of lockless algorithms. Furthermore, there are
+cases (like shared hardware, or other resource limits) where lockless
+access is mathematically impossible.
+
+Media players (such as Jack) are an example of reasonable application
+design with multiple tasks (with multiple priority levels) sharing
+short-held locks: for example, a highprio audio playback thread is
+combined with medium-prio construct-audio-data threads and low-prio
+display-colory-stuff threads. Add video and decoding to the mix and
+we've got even more priority levels.
+
+So once we accept that synchronization objects (locks) are an
+unavoidable fact of life, and once we accept that multi-task userspace
+apps have a very fair expectation of being able to use locks, we've got
+to think about how to offer the option of a deterministic locking
+implementation to user-space.
+
+Most of the technical counter-arguments against doing priority
+inheritance only apply to kernel-space locks. But user-space locks are
+different, there we cannot disable interrupts or make the task
+non-preemptible in a critical section, so the 'use spinlocks' argument
+does not apply (user-space spinlocks have the same priority inversion
+problems as other user-space locking constructs). Fact is, pretty much
+the only technique that currently enables good determinism for userspace
+locks (such as futex-based pthread mutexes) is priority inheritance:
+
+Currently (without PI), if a high-prio and a low-prio task shares a lock
+[this is a quite common scenario for most non-trivial RT applications],
+even if all critical sections are coded carefully to be deterministic
+(i.e. all critical sections are short in duration and only execute a
+limited number of instructions), the kernel cannot guarantee any
+deterministic execution of the high-prio task: any medium-priority task
+could preempt the low-prio task while it holds the shared lock and
+executes the critical section, and could delay it indefinitely.
+
+Implementation:
+---------------
+
+As mentioned before, the userspace fastpath of PI-enabled pthread
+mutexes involves no kernel work at all - they behave quite similarly to
+normal futex-based locks: a 0 value means unlocked, and a value==TID
+means locked. (This is the same method as used by list-based robust
+futexes.) Userspace uses atomic ops to lock/unlock these mutexes without
+entering the kernel.
+
+To handle the slowpath, we have added two new futex ops:
+
+ FUTEX_LOCK_PI
+ FUTEX_UNLOCK_PI
+
+If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to
+TID fails], then FUTEX_LOCK_PI is called. The kernel does all the
+remaining work: if there is no futex-queue attached to the futex address
+yet then the code looks up the task that owns the futex [it has put its
+own TID into the futex value], and attaches a 'PI state' structure to
+the futex-queue. The pi_state includes an rt-mutex, which is a PI-aware,
+kernel-based synchronization object. The 'other' task is made the owner
+of the rt-mutex, and the FUTEX_WAITERS bit is atomically set in the
+futex value. Then this task tries to lock the rt-mutex, on which it
+blocks. Once it returns, it has the mutex acquired, and it sets the
+futex value to its own TID and returns. Userspace has no other work to
+perform - it now owns the lock, and futex value contains
+FUTEX_WAITERS|TID.
+
+If the unlock side fastpath succeeds, [i.e. userspace manages to do a
+TID -> 0 atomic transition of the futex value], then no kernel work is
+triggered.
+
+If the unlock fastpath fails (because the FUTEX_WAITERS bit is set),
+then FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on the
+behalf of userspace - and it also unlocks the attached
+pi_state->rt_mutex and thus wakes up any potential waiters.
+
+Note that under this approach, contrary to previous PI-futex approaches,
+there is no prior 'registration' of a PI-futex. [which is not quite
+possible anyway, due to existing ABI properties of pthread mutexes.]
+
+Also, under this scheme, 'robustness' and 'PI' are two orthogonal
+properties of futexes, and all four combinations are possible: futex,
+robust-futex, PI-futex, robust+PI-futex.
+
+More details about priority inheritance can be found in
+Documentation/rt-mutex.txt.
diff --git a/Documentation/pnp.txt b/Documentation/pnp.txt
new file mode 100644
index 0000000..a327db6
--- /dev/null
+++ b/Documentation/pnp.txt
@@ -0,0 +1,252 @@
+Linux Plug and Play Documentation
+by Adam Belay <ambx1@neo.rr.com>
+last updated: Oct. 16, 2002
+---------------------------------------------------------------------------------------
+
+
+
+Overview
+--------
+ Plug and Play provides a means of detecting and setting resources for legacy or
+otherwise unconfigurable devices. The Linux Plug and Play Layer provides these
+services to compatible drivers.
+
+
+
+The User Interface
+------------------
+ The Linux Plug and Play user interface provides a means to activate PnP devices
+for legacy and user level drivers that do not support Linux Plug and Play. The
+user interface is integrated into sysfs.
+
+In addition to the standard sysfs file the following are created in each
+device's directory:
+id - displays a list of support EISA IDs
+options - displays possible resource configurations
+resources - displays currently allocated resources and allows resource changes
+
+-activating a device
+
+#echo "auto" > resources
+
+this will invoke the automatic resource config system to activate the device
+
+-manually activating a device
+
+#echo "manual <depnum> <mode>" > resources
+<depnum> - the configuration number
+<mode> - static or dynamic
+ static = for next boot
+ dynamic = now
+
+-disabling a device
+
+#echo "disable" > resources
+
+
+EXAMPLE:
+
+Suppose you need to activate the floppy disk controller.
+1.) change to the proper directory, in my case it is
+/driver/bus/pnp/devices/00:0f
+# cd /driver/bus/pnp/devices/00:0f
+# cat name
+PC standard floppy disk controller
+
+2.) check if the device is already active
+# cat resources
+DISABLED
+
+- Notice the string "DISABLED". THis means the device is not active.
+
+3.) check the device's possible configurations (optional)
+# cat options
+Dependent: 01 - Priority acceptable
+ port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding
+ port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding
+ irq 6
+ dma 2 8-bit compatible
+Dependent: 02 - Priority acceptable
+ port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding
+ port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding
+ irq 6
+ dma 2 8-bit compatible
+
+4.) now activate the device
+# echo "auto" > resources
+
+5.) finally check if the device is active
+# cat resources
+io 0x3f0-0x3f5
+io 0x3f7-0x3f7
+irq 6
+dma 2
+
+also there are a series of kernel parameters:
+pnp_reserve_irq=irq1[,irq2] ....
+pnp_reserve_dma=dma1[,dma2] ....
+pnp_reserve_io=io1,size1[,io2,size2] ....
+pnp_reserve_mem=mem1,size1[,mem2,size2] ....
+
+
+
+The Unified Plug and Play Layer
+-------------------------------
+ All Plug and Play drivers, protocols, and services meet at a central location
+called the Plug and Play Layer. This layer is responsible for the exchange of
+information between PnP drivers and PnP protocols. Thus it automatically
+forwards commands to the proper protocol. This makes writing PnP drivers
+significantly easier.
+
+The following functions are available from the Plug and Play Layer:
+
+pnp_get_protocol
+- increments the number of uses by one
+
+pnp_put_protocol
+- deincrements the number of uses by one
+
+pnp_register_protocol
+- use this to register a new PnP protocol
+
+pnp_unregister_protocol
+- use this function to remove a PnP protocol from the Plug and Play Layer
+
+pnp_register_driver
+- adds a PnP driver to the Plug and Play Layer
+- this includes driver model integration
+- returns zero for success or a negative error number for failure; count
+ calls to the .add() method if you need to know how many devices bind to
+ the driver
+
+pnp_unregister_driver
+- removes a PnP driver from the Plug and Play Layer
+
+
+
+Plug and Play Protocols
+-----------------------
+ This section contains information for PnP protocol developers.
+
+The following Protocols are currently available in the computing world:
+- PNPBIOS: used for system devices such as serial and parallel ports.
+- ISAPNP: provides PnP support for the ISA bus
+- ACPI: among its many uses, ACPI provides information about system level
+devices.
+It is meant to replace the PNPBIOS. It is not currently supported by Linux
+Plug and Play but it is planned to be in the near future.
+
+
+Requirements for a Linux PnP protocol:
+1.) the protocol must use EISA IDs
+2.) the protocol must inform the PnP Layer of a devices current configuration
+- the ability to set resources is optional but preferred.
+
+The following are PnP protocol related functions:
+
+pnp_add_device
+- use this function to add a PnP device to the PnP layer
+- only call this function when all wanted values are set in the pnp_dev
+structure
+
+pnp_init_device
+- call this to initialize the PnP structure
+
+pnp_remove_device
+- call this to remove a device from the Plug and Play Layer.
+- it will fail if the device is still in use.
+- automatically will free mem used by the device and related structures
+
+pnp_add_id
+- adds a EISA ID to the list of supported IDs for the specified device
+
+For more information consult the source of a protocol such as
+/drivers/pnp/pnpbios/core.c.
+
+
+
+Linux Plug and Play Drivers
+---------------------------
+ This section contains information for linux PnP driver developers.
+
+The New Way
+...........
+1.) first make a list of supported EISA IDS
+ex:
+static const struct pnp_id pnp_dev_table[] = {
+ /* Standard LPT Printer Port */
+ {.id = "PNP0400", .driver_data = 0},
+ /* ECP Printer Port */
+ {.id = "PNP0401", .driver_data = 0},
+ {.id = ""}
+};
+
+Please note that the character 'X' can be used as a wild card in the function
+portion (last four characters).
+ex:
+ /* Unknown PnP modems */
+ { "PNPCXXX", UNKNOWN_DEV },
+
+Supported PnP card IDs can optionally be defined.
+ex:
+static const struct pnp_id pnp_card_table[] = {
+ { "ANYDEVS", 0 },
+ { "", 0 }
+};
+
+2.) Optionally define probe and remove functions. It may make sense not to
+define these functions if the driver already has a reliable method of detecting
+the resources, such as the parport_pc driver.
+ex:
+static int
+serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const
+ struct pnp_id *dev_id)
+{
+. . .
+
+ex:
+static void serial_pnp_remove(struct pnp_dev * dev)
+{
+. . .
+
+consult /drivers/serial/8250_pnp.c for more information.
+
+3.) create a driver structure
+ex:
+
+static struct pnp_driver serial_pnp_driver = {
+ .name = "serial",
+ .card_id_table = pnp_card_table,
+ .id_table = pnp_dev_table,
+ .probe = serial_pnp_probe,
+ .remove = serial_pnp_remove,
+};
+
+* name and id_table cannot be NULL.
+
+4.) register the driver
+ex:
+
+static int __init serial8250_pnp_init(void)
+{
+ return pnp_register_driver(&serial_pnp_driver);
+}
+
+The Old Way
+...........
+
+a series of compatibility functions have been created to make it easy to convert
+
+ISAPNP drivers. They should serve as a temporary solution only.
+
+they are as follows:
+
+struct pnp_card *pnp_find_card(unsigned short vendor,
+ unsigned short device,
+ struct pnp_card *from)
+
+struct pnp_dev *pnp_find_dev(struct pnp_card *card,
+ unsigned short vendor,
+ unsigned short function,
+ struct pnp_dev *from)
+
diff --git a/Documentation/power/00-INDEX b/Documentation/power/00-INDEX
new file mode 100644
index 0000000..fb742c2
--- /dev/null
+++ b/Documentation/power/00-INDEX
@@ -0,0 +1,40 @@
+00-INDEX
+ - This file
+apm-acpi.txt
+ - basic info about the APM and ACPI support.
+basic-pm-debugging.txt
+ - Debugging suspend and resume
+devices.txt
+ - How drivers interact with system-wide power management
+drivers-testing.txt
+ - Testing suspend and resume support in device drivers
+freezing-of-tasks.txt
+ - How processes and controlled during suspend
+interface.txt
+ - Power management user interface in /sys/power
+notifiers.txt
+ - Registering suspend notifiers in device drivers
+pci.txt
+ - How the PCI Subsystem Does Power Management
+pm_qos_interface.txt
+ - info on Linux PM Quality of Service interface
+power_supply_class.txt
+ - Tells userspace about battery, UPS, AC or DC power supply properties
+s2ram.txt
+ - How to get suspend to ram working (and debug it when it isn't)
+states.txt
+ - System power management states
+swsusp-and-swap-files.txt
+ - Using swap files with software suspend (to disk)
+swsusp-dmcrypt.txt
+ - How to use dm-crypt and software suspend (to disk) together
+swsusp.txt
+ - Goals, implementation, and usage of software suspend (ACPI S3)
+tricks.txt
+ - How to trick software suspend (to disk) into working when it isn't
+userland-swsusp.txt
+ - Experimental implementation of software suspend in userspace
+video_extension.txt
+ - ACPI video extensions
+video.txt
+ - Video issues during resume from suspend
diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt
new file mode 100644
index 0000000..1bd799d
--- /dev/null
+++ b/Documentation/power/apm-acpi.txt
@@ -0,0 +1,32 @@
+APM or ACPI?
+------------
+If you have a relatively recent x86 mobile, desktop, or server system,
+odds are it supports either Advanced Power Management (APM) or
+Advanced Configuration and Power Interface (ACPI). ACPI is the newer
+of the two technologies and puts power management in the hands of the
+operating system, allowing for more intelligent power management than
+is possible with BIOS controlled APM.
+
+The best way to determine which, if either, your system supports is to
+build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
+enabled by default). If a working ACPI implementation is found, the
+ACPI driver will override and disable APM, otherwise the APM driver
+will be used.
+
+No, sorry, you cannot have both ACPI and APM enabled and running at
+once. Some people with broken ACPI or broken APM implementations
+would like to use both to get a full set of working features, but you
+simply cannot mix and match the two. Only one power management
+interface can be in control of the machine at once. Think about it..
+
+User-space Daemons
+------------------
+Both APM and ACPI rely on user-space daemons, apmd and acpid
+respectively, to be completely functional. Obtain both of these
+daemons from your Linux distribution or from the Internet (see below)
+and be sure that they are started sometime in the system boot process.
+Go ahead and start both. If ACPI or APM is not available on your
+system the associated daemon will exit gracefully.
+
+ apmd: http://worldvisions.ca/~apenwarr/apmd/
+ acpid: http://acpid.sf.net/
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
new file mode 100644
index 0000000..1555001
--- /dev/null
+++ b/Documentation/power/basic-pm-debugging.txt
@@ -0,0 +1,204 @@
+Debugging hibernation and suspend
+ (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Testing hibernation (aka suspend to disk or STD)
+
+To check if hibernation works, you can try to hibernate in the "reboot" mode:
+
+# echo reboot > /sys/power/disk
+# echo disk > /sys/power/state
+
+and the system should create a hibernation image, reboot, resume and get back to
+the command prompt where you have started the transition. If that happens,
+hibernation is most likely to work correctly. Still, you need to repeat the
+test at least a couple of times in a row for confidence. [This is necessary,
+because some problems only show up on a second attempt at suspending and
+resuming the system.] Moreover, hibernating in the "reboot" and "shutdown"
+modes causes the PM core to skip some platform-related callbacks which on ACPI
+systems might be necessary to make hibernation work. Thus, if you machine fails
+to hibernate or resume in the "reboot" mode, you should try the "platform" mode:
+
+# echo platform > /sys/power/disk
+# echo disk > /sys/power/state
+
+which is the default and recommended mode of hibernation.
+
+Unfortunately, the "platform" mode of hibernation does not work on some systems
+with broken BIOSes. In such cases the "shutdown" mode of hibernation might
+work:
+
+# echo shutdown > /sys/power/disk
+# echo disk > /sys/power/state
+
+(it is similar to the "reboot" mode, but it requires you to press the power
+button to make the system resume).
+
+If neither "platform" nor "shutdown" hibernation mode works, you will need to
+identify what goes wrong.
+
+a) Test modes of hibernation
+
+To find out why hibernation fails on your system, you can use a special testing
+facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then,
+there is the file /sys/power/pm_test that can be used to make the hibernation
+core run in a test mode. There are 5 test modes available:
+
+freezer
+- test the freezing of processes
+
+devices
+- test the freezing of processes and suspending of devices
+
+platform
+- test the freezing of processes, suspending of devices and platform
+ global control methods(*)
+
+processors
+- test the freezing of processes, suspending of devices, platform
+ global control methods(*) and the disabling of nonboot CPUs
+
+core
+- test the freezing of processes, suspending of devices, platform global
+ control methods(*), the disabling of nonboot CPUs and suspending of
+ platform/system devices
+
+(*) the platform global control methods are only available on ACPI systems
+ and are only tested if the hibernation mode is set to "platform"
+
+To use one of them it is necessary to write the corresponding string to
+/sys/power/pm_test (eg. "devices" to test the freezing of processes and
+suspending devices) and issue the standard hibernation commands. For example,
+to use the "devices" test mode along with the "platform" mode of hibernation,
+you should do the following:
+
+# echo devices > /sys/power/pm_test
+# echo platform > /sys/power/disk
+# echo disk > /sys/power/state
+
+Then, the kernel will try to freeze processes, suspend devices, wait 5 seconds,
+resume devices and thaw processes. If "platform" is written to
+/sys/power/pm_test , then after suspending devices the kernel will additionally
+invoke the global control methods (eg. ACPI global control methods) used to
+prepare the platform firmware for hibernation. Next, it will wait 5 seconds and
+invoke the platform (eg. ACPI) global methods used to cancel hibernation etc.
+
+Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
+hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test
+contains a space-separated list of all available tests (including "none" that
+represents the normal functionality) in which the current test level is
+indicated by square brackets.
+
+Generally, as you can see, each test level is more "invasive" than the previous
+one and the "core" level tests the hardware and drivers as deeply as possible
+without creating a hibernation image. Obviously, if the "devices" test fails,
+the "platform" test will fail as well and so on. Thus, as a rule of thumb, you
+should try the test modes starting from "freezer", through "devices", "platform"
+and "processors" up to "core" (repeat the test on each level a couple of times
+to make sure that any random factors are avoided).
+
+If the "freezer" test fails, there is a task that cannot be frozen (in that case
+it usually is possible to identify the offending task by analysing the output of
+dmesg obtained after the failing test). Failure at this level usually means
+that there is a problem with the tasks freezer subsystem that should be
+reported.
+
+If the "devices" test fails, most likely there is a driver that cannot suspend
+or resume its device (in the latter case the system may hang or become unstable
+after the test, so please take that into consideration). To find this driver,
+you can carry out a binary search according to the rules:
+- if the test fails, unload a half of the drivers currently loaded and repeat
+(that would probably involve rebooting the system, so always note what drivers
+have been loaded before the test),
+- if the test succeeds, load a half of the drivers you have unloaded most
+recently and repeat.
+
+Once you have found the failing driver (there can be more than just one of
+them), you have to unload it every time before hibernation. In that case please
+make sure to report the problem with the driver.
+
+It is also possible that the "devices" test will still fail after you have
+unloaded all modules. In that case, you may want to look in your kernel
+configuration for the drivers that can be compiled as modules (and test again
+with these drivers compiled as modules). You may also try to use some special
+kernel command line options such as "noapic", "noacpi" or even "acpi=off".
+
+If the "platform" test fails, there is a problem with the handling of the
+platform (eg. ACPI) firmware on your system. In that case the "platform" mode
+of hibernation is not likely to work. You can try the "shutdown" mode, but that
+is rather a poor man's workaround.
+
+If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
+work (of course, this only may be an issue on SMP systems) and the problem
+should be reported. In that case you can also try to switch the nonboot CPUs
+off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
+see if that works.
+
+If the "core" test fails, which means that suspending of the system/platform
+devices has failed (these devices are suspended on one CPU with interrupts off),
+the problem is most probably hardware-related and serious, so it should be
+reported.
+
+A failure of any of the "platform", "processors" or "core" tests may cause your
+system to hang or become unstable, so please beware. Such a failure usually
+indicates a serious problem that very well may be related to the hardware, but
+please report it anyway.
+
+b) Testing minimal configuration
+
+If all of the hibernation test modes work, you can boot the system with the
+"init=/bin/bash" command line parameter and attempt to hibernate in the
+"reboot", "shutdown" and "platform" modes. If that does not work, there
+probably is a problem with a driver statically compiled into the kernel and you
+can try to compile more drivers as modules, so that they can be tested
+individually. Otherwise, there is a problem with a modular driver and you can
+find it by loading a half of the modules you normally use and binary searching
+in accordance with the algorithm:
+- if there are n modules loaded and the attempt to suspend and resume fails,
+unload n/2 of the modules and try again (that would probably involve rebooting
+the system),
+- if there are n modules loaded and the attempt to suspend and resume succeeds,
+load n/2 modules more and try again.
+
+Again, if you find the offending module(s), it(they) must be unloaded every time
+before hibernation, and please report the problem with it(them).
+
+c) Advanced debugging
+
+In case that hibernation does not work on your system even in the minimal
+configuration and compiling more drivers as modules is not practical or some
+modules cannot be unloaded, you can use one of the more advanced debugging
+techniques to find the problem. First, if there is a serial port in your box,
+you can boot the kernel with the 'no_console_suspend' parameter and try to log
+kernel messages using the serial console. This may provide you with some
+information about the reasons of the suspend (resume) failure. Alternatively,
+it may be possible to use a FireWire port for debugging with firescope
+(ftp://ftp.firstfloor.org/pub/ak/firescope/). On x86 it is also possible to
+use the PM_TRACE mechanism documented in Documentation/s2ram.txt .
+
+2. Testing suspend to RAM (STR)
+
+To verify that the STR works, it is generally more convenient to use the s2ram
+tool available from http://suspend.sf.net and documented at
+http://en.opensuse.org/s2ram . However, before doing that it is recommended to
+carry out STR testing using the facility described in section 1.
+
+Namely, after writing "freezer", "devices", "platform", "processors", or "core"
+into /sys/power/pm_test (available if the kernel is compiled with
+CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
+to given string. The STR test modes are defined in the same way as for
+hibernation, so please refer to Section 1 for more information about them. In
+particular, the "core" test allows you to test everything except for the actual
+invocation of the platform firmware in order to put the system into the sleep
+state.
+
+Among other things, the testing with the help of /sys/power/pm_test may allow
+you to identify drivers that fail to suspend or resume their devices. They
+should be unloaded every time before an STR transition.
+
+Next, you can follow the instructions at http://en.opensuse.org/s2ram to test
+the system, but if it does not work "out of the box", you may need to boot it
+with "init=/bin/bash" and test s2ram in the minimal configuration. In that
+case, you may be able to search for failing drivers by following the procedure
+analogous to the one described in section 1. If you find some failing drivers,
+you will have to unload them every time before an STR transition (ie. before
+you run s2ram), and please report the problems with them.
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
new file mode 100644
index 0000000..421e7d0
--- /dev/null
+++ b/Documentation/power/devices.txt
@@ -0,0 +1,512 @@
+Most of the code in Linux is device drivers, so most of the Linux power
+management code is also driver-specific. Most drivers will do very little;
+others, especially for platforms with small batteries (like cell phones),
+will do a lot.
+
+This writeup gives an overview of how drivers interact with system-wide
+power management goals, emphasizing the models and interfaces that are
+shared by everything that hooks up to the driver model core. Read it as
+background for the domain-specific work you'd do with any specific driver.
+
+
+Two Models for Device Power Management
+======================================
+Drivers will use one or both of these models to put devices into low-power
+states:
+
+ System Sleep model:
+ Drivers can enter low power states as part of entering system-wide
+ low-power states like "suspend-to-ram", or (mostly for systems with
+ disks) "hibernate" (suspend-to-disk).
+
+ This is something that device, bus, and class drivers collaborate on
+ by implementing various role-specific suspend and resume methods to
+ cleanly power down hardware and software subsystems, then reactivate
+ them without loss of data.
+
+ Some drivers can manage hardware wakeup events, which make the system
+ leave that low-power state. This feature may be disabled using the
+ relevant /sys/devices/.../power/wakeup file; enabling it may cost some
+ power usage, but let the whole system enter low power states more often.
+
+ Runtime Power Management model:
+ Drivers may also enter low power states while the system is running,
+ independently of other power management activity. Upstream drivers
+ will normally not know (or care) if the device is in some low power
+ state when issuing requests; the driver will auto-resume anything
+ that's needed when it gets a request.
+
+ This doesn't have, or need much infrastructure; it's just something you
+ should do when writing your drivers. For example, clk_disable() unused
+ clocks as part of minimizing power drain for currently-unused hardware.
+ Of course, sometimes clusters of drivers will collaborate with each
+ other, which could involve task-specific power management.
+
+There's not a lot to be said about those low power states except that they
+are very system-specific, and often device-specific. Also, that if enough
+drivers put themselves into low power states (at "runtime"), the effect may be
+the same as entering some system-wide low-power state (system sleep) ... and
+that synergies exist, so that several drivers using runtime pm might put the
+system into a state where even deeper power saving options are available.
+
+Most suspended devices will have quiesced all I/O: no more DMA or irqs, no
+more data read or written, and requests from upstream drivers are no longer
+accepted. A given bus or platform may have different requirements though.
+
+Examples of hardware wakeup events include an alarm from a real time clock,
+network wake-on-LAN packets, keyboard or mouse activity, and media insertion
+or removal (for PCMCIA, MMC/SD, USB, and so on).
+
+
+Interfaces for Entering System Sleep States
+===========================================
+Most of the programming interfaces a device driver needs to know about
+relate to that first model: entering a system-wide low power state,
+rather than just minimizing power consumption by one device.
+
+
+Bus Driver Methods
+------------------
+The core methods to suspend and resume devices reside in struct bus_type.
+These are mostly of interest to people writing infrastructure for busses
+like PCI or USB, or because they define the primitives that device drivers
+may need to apply in domain-specific ways to their devices:
+
+struct bus_type {
+ ...
+ int (*suspend)(struct device *dev, pm_message_t state);
+ int (*suspend_late)(struct device *dev, pm_message_t state);
+
+ int (*resume_early)(struct device *dev);
+ int (*resume)(struct device *dev);
+};
+
+Bus drivers implement those methods as appropriate for the hardware and
+the drivers using it; PCI works differently from USB, and so on. Not many
+people write bus drivers; most driver code is a "device driver" that
+builds on top of bus-specific framework code.
+
+For more information on these driver calls, see the description later;
+they are called in phases for every device, respecting the parent-child
+sequencing in the driver model tree. Note that as this is being written,
+only the suspend() and resume() are widely available; not many bus drivers
+leverage all of those phases, or pass them down to lower driver levels.
+
+
+/sys/devices/.../power/wakeup files
+-----------------------------------
+All devices in the driver model have two flags to control handling of
+wakeup events, which are hardware signals that can force the device and/or
+system out of a low power state. These are initialized by bus or device
+driver code using device_init_wakeup(dev,can_wakeup).
+
+The "can_wakeup" flag just records whether the device (and its driver) can
+physically support wakeup events. When that flag is clear, the sysfs
+"wakeup" file is empty, and device_may_wakeup() returns false.
+
+For devices that can issue wakeup events, a separate flag controls whether
+that device should try to use its wakeup mechanism. The initial value of
+device_may_wakeup() will be true, so that the device's "wakeup" file holds
+the value "enabled". Userspace can change that to "disabled" so that
+device_may_wakeup() returns false; or change it back to "enabled" (so that
+it returns true again).
+
+
+EXAMPLE: PCI Device Driver Methods
+-----------------------------------
+PCI framework software calls these methods when the PCI device driver bound
+to a device device has provided them:
+
+struct pci_driver {
+ ...
+ int (*suspend)(struct pci_device *pdev, pm_message_t state);
+ int (*suspend_late)(struct pci_device *pdev, pm_message_t state);
+
+ int (*resume_early)(struct pci_device *pdev);
+ int (*resume)(struct pci_device *pdev);
+};
+
+Drivers will implement those methods, and call PCI-specific procedures
+like pci_set_power_state(), pci_enable_wake(), pci_save_state(), and
+pci_restore_state() to manage PCI-specific mechanisms. (PCI config space
+could be saved during driver probe, if it weren't for the fact that some
+systems rely on userspace tweaking using setpci.) Devices are suspended
+before their bridges enter low power states, and likewise bridges resume
+before their devices.
+
+
+Upper Layers of Driver Stacks
+-----------------------------
+Device drivers generally have at least two interfaces, and the methods
+sketched above are the ones which apply to the lower level (nearer PCI, USB,
+or other bus hardware). The network and block layers are examples of upper
+level interfaces, as is a character device talking to userspace.
+
+Power management requests normally need to flow through those upper levels,
+which often use domain-oriented requests like "blank that screen". In
+some cases those upper levels will have power management intelligence that
+relates to end-user activity, or other devices that work in cooperation.
+
+When those interfaces are structured using class interfaces, there is a
+standard way to have the upper layer stop issuing requests to a given
+class device (and restart later):
+
+struct class {
+ ...
+ int (*suspend)(struct device *dev, pm_message_t state);
+ int (*resume)(struct device *dev);
+};
+
+Those calls are issued in specific phases of the process by which the
+system enters a low power "suspend" state, or resumes from it.
+
+
+Calling Drivers to Enter System Sleep States
+============================================
+When the system enters a low power state, each device's driver is asked
+to suspend the device by putting it into state compatible with the target
+system state. That's usually some version of "off", but the details are
+system-specific. Also, wakeup-enabled devices will usually stay partly
+functional in order to wake the system.
+
+When the system leaves that low power state, the device's driver is asked
+to resume it. The suspend and resume operations always go together, and
+both are multi-phase operations.
+
+For simple drivers, suspend might quiesce the device using the class code
+and then turn its hardware as "off" as possible with late_suspend. The
+matching resume calls would then completely reinitialize the hardware
+before reactivating its class I/O queues.
+
+More power-aware drivers drivers will use more than one device low power
+state, either at runtime or during system sleep states, and might trigger
+system wakeup events.
+
+
+Call Sequence Guarantees
+------------------------
+To ensure that bridges and similar links needed to talk to a device are
+available when the device is suspended or resumed, the device tree is
+walked in a bottom-up order to suspend devices. A top-down order is
+used to resume those devices.
+
+The ordering of the device tree is defined by the order in which devices
+get registered: a child can never be registered, probed or resumed before
+its parent; and can't be removed or suspended after that parent.
+
+The policy is that the device tree should match hardware bus topology.
+(Or at least the control bus, for devices which use multiple busses.)
+In particular, this means that a device registration may fail if the parent of
+the device is suspending (ie. has been chosen by the PM core as the next
+device to suspend) or has already suspended, as well as after all of the other
+devices have been suspended. Device drivers must be prepared to cope with such
+situations.
+
+
+Suspending Devices
+------------------
+Suspending a given device is done in several phases. Suspending the
+system always includes every phase, executing calls for every device
+before the next phase begins. Not all busses or classes support all
+these callbacks; and not all drivers use all the callbacks.
+
+The phases are seen by driver notifications issued in this order:
+
+ 1 class.suspend(dev, message) is called after tasks are frozen, for
+ devices associated with a class that has such a method. This
+ method may sleep.
+
+ Since I/O activity usually comes from such higher layers, this is
+ a good place to quiesce all drivers of a given type (and keep such
+ code out of those drivers).
+
+ 2 bus.suspend(dev, message) is called next. This method may sleep,
+ and is often morphed into a device driver call with bus-specific
+ parameters and/or rules.
+
+ This call should handle parts of device suspend logic that require
+ sleeping. It probably does work to quiesce the device which hasn't
+ been abstracted into class.suspend() or bus.suspend_late().
+
+ 3 bus.suspend_late(dev, message) is called with IRQs disabled, and
+ with only one CPU active. Until the bus.resume_early() phase
+ completes (see later), IRQs are not enabled again. This method
+ won't be exposed by all busses; for message based busses like USB,
+ I2C, or SPI, device interactions normally require IRQs. This bus
+ call may be morphed into a driver call with bus-specific parameters.
+
+ This call might save low level hardware state that might otherwise
+ be lost in the upcoming low power state, and actually put the
+ device into a low power state ... so that in some cases the device
+ may stay partly usable until this late. This "late" call may also
+ help when coping with hardware that behaves badly.
+
+The pm_message_t parameter is currently used to refine those semantics
+(described later).
+
+At the end of those phases, drivers should normally have stopped all I/O
+transactions (DMA, IRQs), saved enough state that they can re-initialize
+or restore previous state (as needed by the hardware), and placed the
+device into a low-power state. On many platforms they will also use
+clk_disable() to gate off one or more clock sources; sometimes they will
+also switch off power supplies, or reduce voltages. Drivers which have
+runtime PM support may already have performed some or all of the steps
+needed to prepare for the upcoming system sleep state.
+
+When any driver sees that its device_can_wakeup(dev), it should make sure
+to use the relevant hardware signals to trigger a system wakeup event.
+For example, enable_irq_wake() might identify GPIO signals hooked up to
+a switch or other external hardware, and pci_enable_wake() does something
+similar for PCI's PME# signal.
+
+If a driver (or bus, or class) fails it suspend method, the system won't
+enter the desired low power state; it will resume all the devices it's
+suspended so far.
+
+Note that drivers may need to perform different actions based on the target
+system lowpower/sleep state. At this writing, there are only platform
+specific APIs through which drivers could determine those target states.
+
+
+Device Low Power (suspend) States
+---------------------------------
+Device low-power states aren't very standard. One device might only handle
+"on" and "off, while another might support a dozen different versions of
+"on" (how many engines are active?), plus a state that gets back to "on"
+faster than from a full "off".
+
+Some busses define rules about what different suspend states mean. PCI
+gives one example: after the suspend sequence completes, a non-legacy
+PCI device may not perform DMA or issue IRQs, and any wakeup events it
+issues would be issued through the PME# bus signal. Plus, there are
+several PCI-standard device states, some of which are optional.
+
+In contrast, integrated system-on-chip processors often use irqs as the
+wakeup event sources (so drivers would call enable_irq_wake) and might
+be able to treat DMA completion as a wakeup event (sometimes DMA can stay
+active too, it'd only be the CPU and some peripherals that sleep).
+
+Some details here may be platform-specific. Systems may have devices that
+can be fully active in certain sleep states, such as an LCD display that's
+refreshed using DMA while most of the system is sleeping lightly ... and
+its frame buffer might even be updated by a DSP or other non-Linux CPU while
+the Linux control processor stays idle.
+
+Moreover, the specific actions taken may depend on the target system state.
+One target system state might allow a given device to be very operational;
+another might require a hard shut down with re-initialization on resume.
+And two different target systems might use the same device in different
+ways; the aforementioned LCD might be active in one product's "standby",
+but a different product using the same SOC might work differently.
+
+
+Meaning of pm_message_t.event
+-----------------------------
+Parameters to suspend calls include the device affected and a message of
+type pm_message_t, which has one field: the event. If driver does not
+recognize the event code, suspend calls may abort the request and return
+a negative errno. However, most drivers will be fine if they implement
+PM_EVENT_SUSPEND semantics for all messages.
+
+The event codes are used to refine the goal of suspending the device, and
+mostly matter when creating or resuming system memory image snapshots, as
+used with suspend-to-disk:
+
+ PM_EVENT_SUSPEND -- quiesce the driver and put hardware into a low-power
+ state. When used with system sleep states like "suspend-to-RAM" or
+ "standby", the upcoming resume() call will often be able to rely on
+ state kept in hardware, or issue system wakeup events.
+
+ PM_EVENT_HIBERNATE -- Put hardware into a low-power state and enable wakeup
+ events as appropriate. It is only used with hibernation
+ (suspend-to-disk) and few devices are able to wake up the system from
+ this state; most are completely powered off.
+
+ PM_EVENT_FREEZE -- quiesce the driver, but don't necessarily change into
+ any low power mode. A system snapshot is about to be taken, often
+ followed by a call to the driver's resume() method. Neither wakeup
+ events nor DMA are allowed.
+
+ PM_EVENT_PRETHAW -- quiesce the driver, knowing that the upcoming resume()
+ will restore a suspend-to-disk snapshot from a different kernel image.
+ Drivers that are smart enough to look at their hardware state during
+ resume() processing need that state to be correct ... a PRETHAW could
+ be used to invalidate that state (by resetting the device), like a
+ shutdown() invocation would before a kexec() or system halt. Other
+ drivers might handle this the same way as PM_EVENT_FREEZE. Neither
+ wakeup events nor DMA are allowed.
+
+To enter "standby" (ACPI S1) or "Suspend to RAM" (STR, ACPI S3) states, or
+the similarly named APM states, only PM_EVENT_SUSPEND is used; the other event
+codes are used for hibernation ("Suspend to Disk", STD, ACPI S4).
+
+There's also PM_EVENT_ON, a value which never appears as a suspend event
+but is sometimes used to record the "not suspended" device state.
+
+
+Resuming Devices
+----------------
+Resuming is done in multiple phases, much like suspending, with all
+devices processing each phase's calls before the next phase begins.
+
+The phases are seen by driver notifications issued in this order:
+
+ 1 bus.resume_early(dev) is called with IRQs disabled, and with
+ only one CPU active. As with bus.suspend_late(), this method
+ won't be supported on busses that require IRQs in order to
+ interact with devices.
+
+ This reverses the effects of bus.suspend_late().
+
+ 2 bus.resume(dev) is called next. This may be morphed into a device
+ driver call with bus-specific parameters; implementations may sleep.
+
+ This reverses the effects of bus.suspend().
+
+ 3 class.resume(dev) is called for devices associated with a class
+ that has such a method. Implementations may sleep.
+
+ This reverses the effects of class.suspend(), and would usually
+ reactivate the device's I/O queue.
+
+At the end of those phases, drivers should normally be as functional as
+they were before suspending: I/O can be performed using DMA and IRQs, and
+the relevant clocks are gated on. The device need not be "fully on"; it
+might be in a runtime lowpower/suspend state that acts as if it were.
+
+However, the details here may again be platform-specific. For example,
+some systems support multiple "run" states, and the mode in effect at
+the end of resume() might not be the one which preceded suspension.
+That means availability of certain clocks or power supplies changed,
+which could easily affect how a driver works.
+
+
+Drivers need to be able to handle hardware which has been reset since the
+suspend methods were called, for example by complete reinitialization.
+This may be the hardest part, and the one most protected by NDA'd documents
+and chip errata. It's simplest if the hardware state hasn't changed since
+the suspend() was called, but that can't always be guaranteed.
+
+Drivers must also be prepared to notice that the device has been removed
+while the system was powered off, whenever that's physically possible.
+PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
+where common Linux platforms will see such removal. Details of how drivers
+will notice and handle such removals are currently bus-specific, and often
+involve a separate thread.
+
+
+Note that the bus-specific runtime PM wakeup mechanism can exist, and might
+be defined to share some of the same driver code as for system wakeup. For
+example, a bus-specific device driver's resume() method might be used there,
+so it wouldn't only be called from bus.resume() during system-wide wakeup.
+See bus-specific information about how runtime wakeup events are handled.
+
+
+System Devices
+--------------
+System devices follow a slightly different API, which can be found in
+
+ include/linux/sysdev.h
+ drivers/base/sys.c
+
+System devices will only be suspended with interrupts disabled, and after
+all other devices have been suspended. On resume, they will be resumed
+before any other devices, and also with interrupts disabled.
+
+That is, IRQs are disabled, the suspend_late() phase begins, then the
+sysdev_driver.suspend() phase, and the system enters a sleep state. Then
+the sysdev_driver.resume() phase begins, followed by the resume_early()
+phase, after which IRQs are enabled.
+
+Code to actually enter and exit the system-wide low power state sometimes
+involves hardware details that are only known to the boot firmware, and
+may leave a CPU running software (from SRAM or flash memory) that monitors
+the system and manages its wakeup sequence.
+
+
+Runtime Power Management
+========================
+Many devices are able to dynamically power down while the system is still
+running. This feature is useful for devices that are not being used, and
+can offer significant power savings on a running system. These devices
+often support a range of runtime power states, which might use names such
+as "off", "sleep", "idle", "active", and so on. Those states will in some
+cases (like PCI) be partially constrained by a bus the device uses, and will
+usually include hardware states that are also used in system sleep states.
+
+However, note that if a driver puts a device into a runtime low power state
+and the system then goes into a system-wide sleep state, it normally ought
+to resume into that runtime low power state rather than "full on". Such
+distinctions would be part of the driver-internal state machine for that
+hardware; the whole point of runtime power management is to be sure that
+drivers are decoupled in that way from the state machine governing phases
+of the system-wide power/sleep state transitions.
+
+
+Power Saving Techniques
+-----------------------
+Normally runtime power management is handled by the drivers without specific
+userspace or kernel intervention, by device-aware use of techniques like:
+
+ Using information provided by other system layers
+ - stay deeply "off" except between open() and close()
+ - if transceiver/PHY indicates "nobody connected", stay "off"
+ - application protocols may include power commands or hints
+
+ Using fewer CPU cycles
+ - using DMA instead of PIO
+ - removing timers, or making them lower frequency
+ - shortening "hot" code paths
+ - eliminating cache misses
+ - (sometimes) offloading work to device firmware
+
+ Reducing other resource costs
+ - gating off unused clocks in software (or hardware)
+ - switching off unused power supplies
+ - eliminating (or delaying/merging) IRQs
+ - tuning DMA to use word and/or burst modes
+
+ Using device-specific low power states
+ - using lower voltages
+ - avoiding needless DMA transfers
+
+Read your hardware documentation carefully to see the opportunities that
+may be available. If you can, measure the actual power usage and check
+it against the budget established for your project.
+
+
+Examples: USB hosts, system timer, system CPU
+----------------------------------------------
+USB host controllers make interesting, if complex, examples. In many cases
+these have no work to do: no USB devices are connected, or all of them are
+in the USB "suspend" state. Linux host controller drivers can then disable
+periodic DMA transfers that would otherwise be a constant power drain on the
+memory subsystem, and enter a suspend state. In power-aware controllers,
+entering that suspend state may disable the clock used with USB signaling,
+saving a certain amount of power.
+
+The controller will be woken from that state (with an IRQ) by changes to the
+signal state on the data lines of a given port, for example by an existing
+peripheral requesting "remote wakeup" or by plugging a new peripheral. The
+same wakeup mechanism usually works from "standby" sleep states, and on some
+systems also from "suspend to RAM" (or even "suspend to disk") states.
+(Except that ACPI may be involved instead of normal IRQs, on some hardware.)
+
+System devices like timers and CPUs may have special roles in the platform
+power management scheme. For example, system timers using a "dynamic tick"
+approach don't just save CPU cycles (by eliminating needless timer IRQs),
+but they may also open the door to using lower power CPU "idle" states that
+cost more than a jiffie to enter and exit. On x86 systems these are states
+like "C3"; note that periodic DMA transfers from a USB host controller will
+also prevent entry to a C3 state, much like a periodic timer IRQ.
+
+That kind of runtime mechanism interaction is common. "System On Chip" (SOC)
+processors often have low power idle modes that can't be entered unless
+certain medium-speed clocks (often 12 or 48 MHz) are gated off. When the
+drivers gate those clocks effectively, then the system idle task may be able
+to use the lower power idle modes and thereby increase battery life.
+
+If the CPU can have a "cpufreq" driver, there also may be opportunities
+to shift to lower voltage settings and reduce the power cost of executing
+a given number of instructions. (Without voltage adjustment, it's rare
+for cpufreq to save much power; the cost-per-instruction must go down.)
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
new file mode 100644
index 0000000..7f7a737
--- /dev/null
+++ b/Documentation/power/drivers-testing.txt
@@ -0,0 +1,46 @@
+Testing suspend and resume support in device drivers
+ (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Preparing the test system
+
+Unfortunately, to effectively test the support for the system-wide suspend and
+resume transitions in a driver, it is necessary to suspend and resume a fully
+functional system with this driver loaded. Moreover, that should be done
+several times, preferably several times in a row, and separately for hibernation
+(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
+cases involves slightly different operations and different interactions with
+the machine's BIOS.
+
+Of course, for this purpose the test system has to be known to suspend and
+resume without the driver being tested. Thus, if possible, you should first
+resolve all suspend/resume-related problems in the test system before you start
+testing the new driver. Please see Documentation/power/basic-pm-debugging.txt
+for more information about the debugging of suspend/resume functionality.
+
+2. Testing the driver
+
+Once you have resolved the suspend/resume-related problems with your test system
+without the new driver, you are ready to test it:
+
+a) Build the driver as a module, load it and try the test modes of hibernation
+ (see: Documents/power/basic-pm-debugging.txt, 1).
+
+b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
+ "platform" modes (see: Documents/power/basic-pm-debugging.txt, 1).
+
+c) Compile the driver directly into the kernel and try the test modes of
+ hibernation.
+
+d) Attempt to hibernate with the driver compiled directly into the kernel
+ in the "reboot", "shutdown" and "platform" modes.
+
+e) Try the test modes of suspend (see: Documents/power/basic-pm-debugging.txt,
+ 2). [As far as the STR tests are concerned, it should not matter whether or
+ not the driver is built as a module.]
+
+f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
+ (see: Documents/power/basic-pm-debugging.txt, 2).
+
+Each of the above tests should be repeated several times and the STD tests
+should be mixed with the STR tests. If any of them fails, the driver cannot be
+regarded as suspend/resume-safe.
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
new file mode 100644
index 0000000..38b5724
--- /dev/null
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -0,0 +1,178 @@
+Freezing of tasks
+ (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+I. What is the freezing of tasks?
+
+The freezing of tasks is a mechanism by which user space processes and some
+kernel threads are controlled during hibernation or system-wide suspend (on some
+architectures).
+
+II. How does it work?
+
+There are four per-task flags used for that, PF_NOFREEZE, PF_FROZEN, TIF_FREEZE
+and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have
+PF_NOFREEZE unset (all user space processes and some kernel threads) are
+regarded as 'freezable' and treated in a special way before the system enters a
+suspend state as well as before a hibernation image is created (in what follows
+we only consider hibernation, but the description also applies to suspend).
+
+Namely, as the first step of the hibernation procedure the function
+freeze_processes() (defined in kernel/power/process.c) is called. It executes
+try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and
+either wakes them up, if they are kernel threads, or sends fake signals to them,
+if they are user space processes. A task that has TIF_FREEZE set, should react
+to it by calling the function called refrigerator() (defined in
+kernel/power/process.c), which sets the task's PF_FROZEN flag, changes its state
+to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it.
+Then, we say that the task is 'frozen' and therefore the set of functions
+handling this mechanism is referred to as 'the freezer' (these functions are
+defined in kernel/power/process.c and include/linux/freezer.h). User space
+processes are generally frozen before kernel threads.
+
+It is not recommended to call refrigerator() directly. Instead, it is
+recommended to use the try_to_freeze() function (defined in
+include/linux/freezer.h), that checks the task's TIF_FREEZE flag and makes the
+task enter refrigerator() if the flag is set.
+
+For user space processes try_to_freeze() is called automatically from the
+signal-handling code, but the freezable kernel threads need to call it
+explicitly in suitable places or use the wait_event_freezable() or
+wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
+that combine interruptible sleep with checking if TIF_FREEZE is set and calling
+try_to_freeze(). The main loop of a freezable kernel thread may look like the
+following one:
+
+ set_freezable();
+ do {
+ hub_events();
+ wait_event_freezable(khubd_wait,
+ !list_empty(&hub_event_list) ||
+ kthread_should_stop());
+ } while (!kthread_should_stop() || !list_empty(&hub_event_list));
+
+(from drivers/usb/core/hub.c::hub_thread()).
+
+If a freezable kernel thread fails to call try_to_freeze() after the freezer has
+set TIF_FREEZE for it, the freezing of tasks will fail and the entire
+hibernation operation will be cancelled. For this reason, freezable kernel
+threads must call try_to_freeze() somewhere or use one of the
+wait_event_freezable() and wait_event_freezable_timeout() macros.
+
+After the system memory state has been restored from a hibernation image and
+devices have been reinitialized, the function thaw_processes() is called in
+order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that
+have been frozen leave refrigerator() and continue running.
+
+III. Which kernel threads are freezable?
+
+Kernel threads are not freezable by default. However, a kernel thread may clear
+PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
+directly is strongly discouraged). From this point it is regarded as freezable
+and must call try_to_freeze() in a suitable place.
+
+IV. Why do we do that?
+
+Generally speaking, there is a couple of reasons to use the freezing of tasks:
+
+1. The principal reason is to prevent filesystems from being damaged after
+hibernation. At the moment we have no simple means of checkpointing
+filesystems, so if there are any modifications made to filesystem data and/or
+metadata on disks, we cannot bring them back to the state from before the
+modifications. At the same time each hibernation image contains some
+filesystem-related information that must be consistent with the state of the
+on-disk data and metadata after the system memory state has been restored from
+the image (otherwise the filesystems will be damaged in a nasty way, usually
+making them almost impossible to repair). We therefore freeze tasks that might
+cause the on-disk filesystems' data and metadata to be modified after the
+hibernation image has been created and before the system is finally powered off.
+The majority of these are user space processes, but if any of the kernel threads
+may cause something like this to happen, they have to be freezable.
+
+2. Next, to create the hibernation image we need to free a sufficient amount of
+memory (approximately 50% of available RAM) and we need to do that before
+devices are deactivated, because we generally need them for swapping out. Then,
+after the memory for the image has been freed, we don't want tasks to allocate
+additional memory and we prevent them from doing that by freezing them earlier.
+[Of course, this also means that device drivers should not allocate substantial
+amounts of memory from their .suspend() callbacks before hibernation, but this
+is e separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
+from interfering with the suspending and resuming of devices. A user space
+process running on a second CPU while we are suspending devices may, for
+example, be troublesome and without the freezing of tasks we would need some
+safeguards against race conditions that might occur in such a case.
+
+Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
+of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
+
+"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
+
+Linus: In many ways, 'at all'.
+
+I _do_ realize the IO request queue issues, and that we cannot actually do
+s2ram with some devices in the middle of a DMA. So we want to be able to
+avoid *that*, there's no question about that. And I suspect that stopping
+user threads and then waiting for a sync is practically one of the easier
+ways to do so.
+
+So in practice, the 'at all' may become a 'why freeze kernel threads?' and
+freezing user threads I don't find really objectionable."
+
+Still, there are kernel threads that may want to be freezable. For example, if
+a kernel that belongs to a device driver accesses the device directly, it in
+principle needs to know when the device is suspended, so that it doesn't try to
+access it at that time. However, if the kernel thread is freezable, it will be
+frozen before the driver's .suspend() callback is executed and it will be
+thawed after the driver's .resume() callback has run, so it won't be accessing
+the device while it's suspended.
+
+4. Another reason for freezing tasks is to prevent user space processes from
+realizing that hibernation (or suspend) operation takes place. Ideally, user
+space processes should not notice that such a system-wide operation has occurred
+and should continue running without any problems after the restore (or resume
+from suspend). Unfortunately, in the most general case this is quite difficult
+to achieve without the freezing of tasks. Consider, for example, a process
+that depends on all CPUs being online while it's running. Since we need to
+disable nonboot CPUs during the hibernation, if this process is not frozen, it
+may notice that the number of CPUs has changed and may start to work incorrectly
+because of that.
+
+V. Are there any problems related to the freezing of tasks?
+
+Yes, there are.
+
+First of all, the freezing of kernel threads may be tricky if they depend one
+on another. For example, if kernel thread A waits for a completion (in the
+TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
+and B is frozen in the meantime, then A will be blocked until B is thawed, which
+may be undesirable. That's why kernel threads are not freezable by default.
+
+Second, there are the following two problems related to the freezing of user
+space processes:
+1. Putting processes into an uninterruptible sleep distorts the load average.
+2. Now that we have FUSE, plus the framework for doing device drivers in
+userspace, it gets even more complicated because some userspace processes are
+now doing the sorts of things that kernel threads do
+(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
+
+The problem 1. seems to be fixable, although it hasn't been fixed so far. The
+other one is more serious, but it seems that we can work around it by using
+hibernation (and suspend) notifiers (in that case, though, we won't be able to
+avoid the realization by the user space processes that the hibernation is taking
+place).
+
+There are also problems that the freezing of tasks tends to expose, although
+they are not directly related to it. For example, if request_firmware() is
+called from a device driver's .resume() routine, it will timeout and eventually
+fail, because the user land process that should respond to the request is frozen
+at this point. So, seemingly, the failure is due to the freezing of tasks.
+Suppose, however, that the firmware file is located on a filesystem accessible
+only through another device that hasn't been resumed yet. In that case,
+request_firmware() will fail regardless of whether or not the freezing of tasks
+is used. Consequently, the problem is not really related to the freezing of
+tasks, since it generally exists anyway.
+
+A driver must have all firmwares it may need in RAM before suspend() is called.
+If keeping them is not practical, for example due to their size, they must be
+requested early enough using the suspend notifier API described in notifiers.txt.
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
new file mode 100644
index 0000000..e67211f
--- /dev/null
+++ b/Documentation/power/interface.txt
@@ -0,0 +1,75 @@
+Power Management Interface
+
+
+The power management subsystem provides a unified sysfs interface to
+userspace, regardless of what architecture or platform one is
+running. The interface exists in /sys/power/ directory (assuming sysfs
+is mounted at /sys).
+
+/sys/power/state controls system power state. Reading from this file
+returns what states are supported, which is hard-coded to 'standby'
+(Power-On Suspend), 'mem' (Suspend-to-RAM), and 'disk'
+(Suspend-to-Disk).
+
+Writing to this file one of those strings causes the system to
+transition into that state. Please see the file
+Documentation/power/states.txt for a description of each of those
+states.
+
+
+/sys/power/disk controls the operating mode of the suspend-to-disk
+mechanism. Suspend-to-disk can be handled in several ways. We have a
+few options for putting the system to sleep - using the platform driver
+(e.g. ACPI or other suspend_ops), powering off the system or rebooting the
+system (for testing).
+
+Additionally, /sys/power/disk can be used to turn on one of the two testing
+modes of the suspend-to-disk mechanism: 'testproc' or 'test'. If the
+suspend-to-disk mechanism is in the 'testproc' mode, writing 'disk' to
+/sys/power/state will cause the kernel to disable nonboot CPUs and freeze
+tasks, wait for 5 seconds, unfreeze tasks and enable nonboot CPUs. If it is
+in the 'test' mode, writing 'disk' to /sys/power/state will cause the kernel
+to disable nonboot CPUs and freeze tasks, shrink memory, suspend devices, wait
+for 5 seconds, resume devices, unfreeze tasks and enable nonboot CPUs. Then,
+we are able to look in the log messages and work out, for example, which code
+is being slow and which device drivers are misbehaving.
+
+Reading from this file will display all supported modes and the currently
+selected one in brackets, for example
+
+ [shutdown] reboot test testproc
+
+Writing to this file will accept one of
+
+ 'platform' (only if the platform supports it)
+ 'shutdown'
+ 'reboot'
+ 'testproc'
+ 'test'
+
+/sys/power/image_size controls the size of the image created by
+the suspend-to-disk mechanism. It can be written a string
+representing a non-negative integer that will be used as an upper
+limit of the image size, in bytes. The suspend-to-disk mechanism will
+do its best to ensure the image size will not exceed that number. However,
+if this turns out to be impossible, it will try to suspend anyway using the
+smallest image possible. In particular, if "0" is written to this file, the
+suspend image will be as small as possible.
+
+Reading from this file will display the current image size limit, which
+is set to 500 MB by default.
+
+/sys/power/pm_trace controls the code which saves the last PM event point in
+the RTC across reboots, so that you can debug a machine that just hangs
+during suspend (or more commonly, during resume). Namely, the RTC is only
+used to save the last PM event point if this file contains '1'. Initially it
+contains '0' which may be changed to '1' by writing a string representing a
+nonzero integer into it.
+
+To use this debugging feature you should attempt to suspend the machine, then
+reboot it and run
+
+ dmesg -s 1000000 | grep 'hash matches'
+
+CAUTION: Using it will cause your machine's real-time (CMOS) clock to be
+set to a random invalid time after a resume.
diff --git a/Documentation/power/notifiers.txt b/Documentation/power/notifiers.txt
new file mode 100644
index 0000000..ae1b7ec
--- /dev/null
+++ b/Documentation/power/notifiers.txt
@@ -0,0 +1,58 @@
+Suspend notifiers
+ (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+There are some operations that device drivers may want to carry out in their
+.suspend() routines, but shouldn't, because they can cause the hibernation or
+suspend to fail. For example, a driver may want to allocate a substantial amount
+of memory (like 50 MB) in .suspend(), but that shouldn't be done after the
+swsusp's memory shrinker has run.
+
+Also, there may be some operations, that subsystems want to carry out before a
+hibernation/suspend or after a restore/resume, requiring the system to be fully
+functional, so the drivers' .suspend() and .resume() routines are not suitable
+for this purpose. For example, device drivers may want to upload firmware to
+their devices after a restore from a hibernation image, but they cannot do it by
+calling request_firmware() from their .resume() routines (user land processes
+are frozen at this point). The solution may be to load the firmware into
+memory before processes are frozen and upload it from there in the .resume()
+routine. Of course, a hibernation notifier may be used for this purpose.
+
+The subsystems that have such needs can register suspend notifiers that will be
+called upon the following events by the suspend core:
+
+PM_HIBERNATION_PREPARE The system is going to hibernate or suspend, tasks will
+ be frozen immediately.
+
+PM_POST_HIBERNATION The system memory state has been restored from a
+ hibernation image or an error occured during the
+ hibernation. Device drivers' .resume() callbacks have
+ been executed and tasks have been thawed.
+
+PM_RESTORE_PREPARE The system is going to restore a hibernation image.
+ If all goes well the restored kernel will issue a
+ PM_POST_HIBERNATION notification.
+
+PM_POST_RESTORE An error occurred during the hibernation restore.
+ Device drivers' .resume() callbacks have been executed
+ and tasks have been thawed.
+
+PM_SUSPEND_PREPARE The system is preparing for a suspend.
+
+PM_POST_SUSPEND The system has just resumed or an error occured during
+ the suspend. Device drivers' .resume() callbacks have
+ been executed and tasks have been thawed.
+
+It is generally assumed that whatever the notifiers do for
+PM_HIBERNATION_PREPARE, should be undone for PM_POST_HIBERNATION. Analogously,
+operations performed for PM_SUSPEND_PREPARE should be reversed for
+PM_POST_SUSPEND. Additionally, all of the notifiers are called for
+PM_POST_HIBERNATION if one of them fails for PM_HIBERNATION_PREPARE, and
+all of the notifiers are called for PM_POST_SUSPEND if one of them fails for
+PM_SUSPEND_PREPARE.
+
+The hibernation and suspend notifiers are called with pm_mutex held. They are
+defined in the usual way, but their last argument is meaningless (it is always
+NULL). To register and/or unregister a suspend notifier use the functions
+register_pm_notifier() and unregister_pm_notifier(), respectively, defined in
+include/linux/suspend.h . If you don't need to unregister the notifier, you can
+also use the pm_notifier() macro defined in include/linux/suspend.h .
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
new file mode 100644
index 0000000..dd8fe43
--- /dev/null
+++ b/Documentation/power/pci.txt
@@ -0,0 +1,299 @@
+
+PCI Power Management
+~~~~~~~~~~~~~~~~~~~~
+
+An overview of the concepts and the related functions in the Linux kernel
+
+Patrick Mochel <mochel@transmeta.com>
+(and others)
+
+---------------------------------------------------------------------------
+
+1. Overview
+2. How the PCI Subsystem Does Power Management
+3. PCI Utility Functions
+4. PCI Device Drivers
+5. Resources
+
+1. Overview
+~~~~~~~~~~~
+
+The PCI Power Management Specification was introduced between the PCI 2.1 and
+PCI 2.2 Specifications. It a standard interface for controlling various
+power management operations.
+
+Implementation of the PCI PM Spec is optional, as are several sub-components of
+it. If a device supports the PCI PM Spec, the device will have an 8 byte
+capability field in its PCI configuration space. This field is used to describe
+and control the standard PCI power management features.
+
+The PCI PM spec defines 4 operating states for devices (D0 - D3) and for buses
+(B0 - B3). The higher the number, the less power the device consumes. However,
+the higher the number, the longer the latency is for the device to return to
+an operational state (D0).
+
+There are actually two D3 states. When someone talks about D3, they usually
+mean D3hot, which corresponds to an ACPI D2 state (power is reduced, the
+device may lose some context). But they may also mean D3cold, which is an
+ACPI D3 state (power is fully off, all state was discarded); or both.
+
+Bus power management is not covered in this version of this document.
+
+Note that all PCI devices support D0 and D3cold by default, regardless of
+whether or not they implement any of the PCI PM spec.
+
+The possible state transitions that a device can undergo are:
+
++---------------------------+
+| Current State | New State |
++---------------------------+
+| D0 | D1, D2, D3|
++---------------------------+
+| D1 | D2, D3 |
++---------------------------+
+| D2 | D3 |
++---------------------------+
+| D1, D2, D3 | D0 |
++---------------------------+
+
+Note that when the system is entering a global suspend state, all devices will
+be placed into D3 and when resuming, all devices will be placed into D0.
+However, when the system is running, other state transitions are possible.
+
+2. How The PCI Subsystem Handles Power Management
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The PCI suspend/resume functionality is accessed indirectly via the Power
+Management subsystem. At boot, the PCI driver registers a power management
+callback with that layer. Upon entering a suspend state, the PM layer iterates
+through all of its registered callbacks. This currently takes place only during
+APM state transitions.
+
+Upon going to sleep, the PCI subsystem walks its device tree twice. Both times,
+it does a depth first walk of the device tree. The first walk saves each of the
+device's state and checks for devices that will prevent the system from entering
+a global power state. The next walk then places the devices in a low power
+state.
+
+The first walk allows a graceful recovery in the event of a failure, since none
+of the devices have actually been powered down.
+
+In both walks, in particular the second, all children of a bridge are touched
+before the actual bridge itself. This allows the bridge to retain power while
+its children are being accessed.
+
+Upon resuming from sleep, just the opposite must be true: all bridges must be
+powered on and restored before their children are powered on. This is easily
+accomplished with a breadth-first walk of the PCI device tree.
+
+
+3. PCI Utility Functions
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These are helper functions designed to be called by individual device drivers.
+Assuming that a device behaves as advertised, these should be applicable in most
+cases. However, results may vary.
+
+Note that these functions are never implicitly called for the driver. The driver
+is always responsible for deciding when and if to call these.
+
+
+pci_save_state
+--------------
+
+Usage:
+ pci_save_state(struct pci_dev *dev);
+
+Description:
+ Save first 64 bytes of PCI config space, along with any additional
+ PCI-Express or PCI-X information.
+
+
+pci_restore_state
+-----------------
+
+Usage:
+ pci_restore_state(struct pci_dev *dev);
+
+Description:
+ Restore previously saved config space.
+
+
+pci_set_power_state
+-------------------
+
+Usage:
+ pci_set_power_state(struct pci_dev *dev, pci_power_t state);
+
+Description:
+ Transition device to low power state using PCI PM Capabilities
+ registers.
+
+ Will fail under one of the following conditions:
+ - If state is less than current state, but not D0 (illegal transition)
+ - Device doesn't support PM Capabilities
+ - Device does not support requested state
+
+
+pci_enable_wake
+---------------
+
+Usage:
+ pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable);
+
+Description:
+ Enable device to generate PME# during low power state using PCI PM
+ Capabilities.
+
+ Checks whether if device supports generating PME# from requested state
+ and fail if it does not, unless enable == 0 (request is to disable wake
+ events, which is implicit if it doesn't even support it in the first
+ place).
+
+ Note that the PMC Register in the device's PM Capabilities has a bitmask
+ of the states it supports generating PME# from. D3hot is bit 3 and
+ D3cold is bit 4. So, while a value of 4 as the state may not seem
+ semantically correct, it is.
+
+
+4. PCI Device Drivers
+~~~~~~~~~~~~~~~~~~~~~
+
+These functions are intended for use by individual drivers, and are defined in
+struct pci_driver:
+
+ int (*suspend) (struct pci_dev *dev, pm_message_t state);
+ int (*resume) (struct pci_dev *dev);
+
+
+suspend
+-------
+
+Usage:
+
+if (dev->driver && dev->driver->suspend)
+ dev->driver->suspend(dev,state);
+
+A driver uses this function to actually transition the device into a low power
+state. This should include disabling I/O, IRQs, and bus-mastering, as well as
+physically transitioning the device to a lower power state; it may also include
+calls to pci_enable_wake().
+
+Bus mastering may be disabled by doing:
+
+pci_disable_device(dev);
+
+For devices that support the PCI PM Spec, this may be used to set the device's
+power state to match the suspend() parameter:
+
+pci_set_power_state(dev,state);
+
+The driver is also responsible for disabling any other device-specific features
+(e.g blanking screen, turning off on-card memory, etc).
+
+The driver should be sure to track the current state of the device, as it may
+obviate the need for some operations.
+
+The driver should update the current_state field in its pci_dev structure in
+this function, except for PM-capable devices when pci_set_power_state is used.
+
+resume
+------
+
+Usage:
+
+if (dev->driver && dev->driver->resume)
+ dev->driver->resume(dev)
+
+The resume callback may be called from any power state, and is always meant to
+transition the device to the D0 state.
+
+The driver is responsible for reenabling any features of the device that had
+been disabled during previous suspend calls, such as IRQs and bus mastering,
+as well as calling pci_restore_state().
+
+If the device is currently in D3, it may need to be reinitialized in resume().
+
+ * Some types of devices, like bus controllers, will preserve context in D3hot
+ (using Vcc power). Their drivers will often want to avoid re-initializing
+ them after re-entering D0 (perhaps to avoid resetting downstream devices).
+
+ * Other kinds of devices in D3hot will discard device context as part of a
+ soft reset when re-entering the D0 state.
+
+ * Devices resuming from D3cold always go through a power-on reset. Some
+ device context can also be preserved using Vaux power.
+
+ * Some systems hide D3cold resume paths from drivers. For example, on PCs
+ the resume path for suspend-to-disk often runs BIOS powerup code, which
+ will sometimes re-initialize the device.
+
+To handle resets during D3 to D0 transitions, it may be convenient to share
+device initialization code between probe() and resume(). Device parameters
+can also be saved before the driver suspends into D3, avoiding re-probe.
+
+If the device supports the PCI PM Spec, it can use this to physically transition
+the device to D0:
+
+pci_set_power_state(dev,0);
+
+Note that if the entire system is transitioning out of a global sleep state, all
+devices will be placed in the D0 state, so this is not necessary. However, in
+the event that the device is placed in the D3 state during normal operation,
+this call is necessary. It is impossible to determine which of the two events is
+taking place in the driver, so it is always a good idea to make that call.
+
+The driver should take note of the state that it is resuming from in order to
+ensure correct (and speedy) operation.
+
+The driver should update the current_state field in its pci_dev structure in
+this function, except for PM-capable devices when pci_set_power_state is used.
+
+
+
+A reference implementation
+-------------------------
+.suspend()
+{
+ /* driver specific operations */
+
+ /* Disable IRQ */
+ free_irq();
+ /* If using MSI */
+ pci_disable_msi();
+
+ pci_save_state();
+ pci_enable_wake();
+ /* Disable IO/bus master/irq router */
+ pci_disable_device();
+ pci_set_power_state(pci_choose_state());
+}
+
+.resume()
+{
+ pci_set_power_state(PCI_D0);
+ pci_restore_state();
+ /* device's irq possibly is changed, driver should take care */
+ pci_enable_device();
+ pci_set_master();
+
+ /* if using MSI, device's vector possibly is changed */
+ pci_enable_msi();
+
+ request_irq();
+ /* driver specific operations; */
+}
+
+This is a typical implementation. Drivers can slightly change the order
+of the operations in the implementation, ignore some operations or add
+more driver specific operations in it, but drivers should do something like
+this on the whole.
+
+5. Resources
+~~~~~~~~~~~~
+
+PCI Local Bus Specification
+PCI Bus Power Management Interface Specification
+
+ http://www.pcisig.com
+
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
new file mode 100644
index 0000000..c40866e
--- /dev/null
+++ b/Documentation/power/pm_qos_interface.txt
@@ -0,0 +1,64 @@
+PM Quality Of Service Interface.
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Currently we have {cpu_dma_latency, network_latency, network_throughput} as the
+initial set of pm_qos parameters.
+
+Each parameters have defined units:
+ * latency: usec
+ * timeout: usec
+ * throughput: kbs (kilo bit / sec)
+
+The infrastructure exposes multiple misc device nodes one per implemented
+parameter. The set of parameters implement is defined by pm_qos_power_init()
+and pm_qos_params.h. This is done because having the available parameters
+being runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requirements is maintained along with
+an aggregated target value. The aggregated target value is updated with
+changes to the requirement list or elements of the list. Typically the
+aggregated target value is simply the max or min of the requirement values held
+in the parameter list elements.
+
+From kernel mode the use of this interface is simple:
+pm_qos_add_requirement(param_id, name, target_value):
+Will insert a named element in the list for that identified PM_QOS parameter
+with the target value. Upon change to this list the new target is recomputed
+and any registered notifiers are called only if the target value is now
+different.
+
+pm_qos_update_requirement(param_id, name, new_target_value):
+Will search the list identified by the param_id for the named list element and
+then update its target value, calling the notification tree if the aggregated
+target is changed. with that name is already registered.
+
+pm_qos_remove_requirement(param_id, name):
+Will search the identified list for the named element and remove it, after
+removal it will update the aggregate target and call the notification tree if
+the target was changed as a result of removing the named requirement.
+
+
+From user mode:
+Only processes can register a pm_qos requirement. To provide for automatic
+cleanup for process the interface requires the process to register its
+parameter requirements in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+requirement on the parameter. The name of the requirement is "process_<PID>"
+derived from the current->pid from within the open system call.
+
+To change the requested target value the process needs to write a s32 value to
+the open device node. This translates to a pm_qos_update_requirement call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
+
+
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
new file mode 100644
index 0000000..c6cd495
--- /dev/null
+++ b/Documentation/power/power_supply_class.txt
@@ -0,0 +1,173 @@
+Linux power supply class
+========================
+
+Synopsis
+~~~~~~~~
+Power supply class used to represent battery, UPS, AC or DC power supply
+properties to user-space.
+
+It defines core set of attributes, which should be applicable to (almost)
+every power supply out there. Attributes are available via sysfs and uevent
+interfaces.
+
+Each attribute has well defined meaning, up to unit of measure used. While
+the attributes provided are believed to be universally applicable to any
+power supply, specific monitoring hardware may not be able to provide them
+all, so any of them may be skipped.
+
+Power supply class is extensible, and allows to define drivers own attributes.
+The core attribute set is subject to the standard Linux evolution (i.e.
+if it will be found that some attribute is applicable to many power supply
+types or their drivers, it can be added to the core set).
+
+It also integrates with LED framework, for the purpose of providing
+typically expected feedback of battery charging/fully charged status and
+AC/USB power supply online status. (Note that specific details of the
+indication (including whether to use it at all) are fully controllable by
+user and/or specific machine defaults, per design principles of LED
+framework).
+
+
+Attributes/properties
+~~~~~~~~~~~~~~~~~~~~~
+Power supply class has predefined set of attributes, this eliminates code
+duplication across drivers. Power supply class insist on reusing its
+predefined attributes *and* their units.
+
+So, userspace gets predictable set of attributes and their units for any
+kind of power supply, and can process/present them to a user in consistent
+manner. Results for different power supplies and machines are also directly
+comparable.
+
+See drivers/power/ds2760_battery.c and drivers/power/pda_power.c for the
+example how to declare and handle attributes.
+
+
+Units
+~~~~~
+Quoting include/linux/power_supply.h:
+
+ All voltages, currents, charges, energies, time and temperatures in µV,
+ µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
+ stated. It's driver's job to convert its raw values to units in which
+ this class operates.
+
+
+Attributes/properties detailed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+~ ~ ~ ~ ~ ~ ~ Charge/Energy/Capacity - how to not confuse ~ ~ ~ ~ ~ ~ ~
+~ ~
+~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity" ~
+~ of battery, this class distinguish these terms. Don't mix them! ~
+~ ~
+~ CHARGE_* attributes represents capacity in µAh only. ~
+~ ENERGY_* attributes represents capacity in µWh only. ~
+~ CAPACITY attribute represents capacity in *percents*, from 0 to 100. ~
+~ ~
+~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
+
+Postfixes:
+_AVG - *hardware* averaged value, use it if your hardware is really able to
+report averaged values.
+_NOW - momentary/instantaneous values.
+
+STATUS - this attribute represents operating status (charging, full,
+discharging (i.e. powering a load), etc.). This corresponds to
+BATTERY_STATUS_* values, as defined in battery.h.
+
+HEALTH - represents health of the battery, values corresponds to
+POWER_SUPPLY_HEALTH_*, defined in battery.h.
+
+VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and
+minimal power supply voltages. Maximal/minimal means values of voltages
+when battery considered "full"/"empty" at normal conditions. Yes, there is
+no direct relation between voltage and battery capacity, but some dumb
+batteries use voltage for very approximated calculation of capacity.
+Battery driver also can use this attribute just to inform userspace
+about maximal and minimal voltage thresholds of a given battery.
+
+VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that
+these ones should be used if hardware could only guess (measure and
+retain) the thresholds of a given power supply.
+
+CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when
+battery considered full/empty.
+
+ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy.
+
+CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value
+of charge when battery became full/empty". It also could mean "value of
+charge when battery considered full/empty at given conditions (temperature,
+age)". I.e. these attributes represents real thresholds, not design values.
+
+CHARGE_COUNTER - the current charge counter (in µAh). This could easily
+be negative; there is no empty or full value. It is only useful for
+relative, time-based measurements.
+
+ENERGY_FULL, ENERGY_EMPTY - same as above but for energy.
+
+CAPACITY - capacity in percents.
+
+TEMP - temperature of the power supply.
+TEMP_AMBIENT - ambient temperature.
+
+TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e.
+while battery powers a load)
+TIME_TO_FULL - seconds left for battery to be considered full (i.e.
+while battery is charging)
+
+
+Battery <-> external power supply interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Often power supplies are acting as supplies and supplicants at the same
+time. Batteries are good example. So, batteries usually care if they're
+externally powered or not.
+
+For that case, power supply class implements notification mechanism for
+batteries.
+
+External power supply (AC) lists supplicants (batteries) names in
+"supplied_to" struct member, and each power_supply_changed() call
+issued by external power supply will notify supplicants via
+external_power_changed callback.
+
+
+QA
+~~
+Q: Where is POWER_SUPPLY_PROP_XYZ attribute?
+A: If you cannot find attribute suitable for your driver needs, feel free
+ to add it and send patch along with your driver.
+
+ The attributes available currently are the ones currently provided by the
+ drivers written.
+
+ Good candidates to add in future: model/part#, cycle_time, manufacturer,
+ etc.
+
+
+Q: I have some very specific attribute (e.g. battery color), should I add
+ this attribute to standard ones?
+A: Most likely, no. Such attribute can be placed in the driver itself, if
+ it is useful. Of course, if the attribute in question applicable to
+ large set of batteries, provided by many drivers, and/or comes from
+ some general battery specification/standard, it may be a candidate to
+ be added to the core attribute set.
+
+
+Q: Suppose, my battery monitoring chip/firmware does not provides capacity
+ in percents, but provides charge_{now,full,empty}. Should I calculate
+ percentage capacity manually, inside the driver, and register CAPACITY
+ attribute? The same question about time_to_empty/time_to_full.
+A: Most likely, no. This class is designed to export properties which are
+ directly measurable by the specific hardware available.
+
+ Inferring not available properties using some heuristics or mathematical
+ model is not subject of work for a battery driver. Such functionality
+ should be factored out, and in fact, apm_power, the driver to serve
+ legacy APM API on top of power supply class, uses a simple heuristic of
+ approximating remaining battery capacity based on its charge, current,
+ voltage and so on. But full-fledged battery model is likely not subject
+ for kernel at all, as it would require floating point calculation to deal
+ with things like differential equations and Kalman filters. This is
+ better be handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
new file mode 100644
index 0000000..82b7a43
--- /dev/null
+++ b/Documentation/power/regulator/consumer.txt
@@ -0,0 +1,182 @@
+Regulator Consumer Driver Interface
+===================================
+
+This text describes the regulator interface for consumer device drivers.
+Please see overview.txt for a description of the terms used in this text.
+
+
+1. Consumer Regulator Access (static & dynamic drivers)
+=======================================================
+
+A consumer driver can get access to it's supply regulator by calling :-
+
+regulator = regulator_get(dev, "Vcc");
+
+The consumer passes in it's struct device pointer and power supply ID. The core
+then finds the correct regulator by consulting a machine specific lookup table.
+If the lookup is successful then this call will return a pointer to the struct
+regulator that supplies this consumer.
+
+To release the regulator the consumer driver should call :-
+
+regulator_put(regulator);
+
+Consumers can be supplied by more than one regulator e.g. codec consumer with
+analog and digital supplies :-
+
+digital = regulator_get(dev, "Vcc"); /* digital core */
+analog = regulator_get(dev, "Avdd"); /* analog */
+
+The regulator access functions regulator_get() and regulator_put() will
+usually be called in your device drivers probe() and remove() respectively.
+
+
+2. Regulator Output Enable & Disable (static & dynamic drivers)
+====================================================================
+
+A consumer can enable it's power supply by calling:-
+
+int regulator_enable(regulator);
+
+NOTE: The supply may already be enabled before regulator_enabled() is called.
+This may happen if the consumer shares the regulator or the regulator has been
+previously enabled by bootloader or kernel board initialization code.
+
+A consumer can determine if a regulator is enabled by calling :-
+
+int regulator_is_enabled(regulator);
+
+This will return > zero when the regulator is enabled.
+
+
+A consumer can disable it's supply when no longer needed by calling :-
+
+int regulator_disable(regulator);
+
+NOTE: This may not disable the supply if it's shared with other consumers. The
+regulator will only be disabled when the enabled reference count is zero.
+
+Finally, a regulator can be forcefully disabled in the case of an emergency :-
+
+int regulator_force_disable(regulator);
+
+NOTE: this will immediately and forcefully shutdown the regulator output. All
+consumers will be powered off.
+
+
+3. Regulator Voltage Control & Status (dynamic drivers)
+======================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+voltage to match system operating points. e.g. CPUfreq drivers can scale
+voltage along with frequency to save power, SD drivers may need to select the
+correct card voltage, etc.
+
+Consumers can control their supply voltage by calling :-
+
+int regulator_set_voltage(regulator, min_uV, max_uV);
+
+Where min_uV and max_uV are the minimum and maximum acceptable voltages in
+microvolts.
+
+NOTE: this can be called when the regulator is enabled or disabled. If called
+when enabled, then the voltage changes instantly, otherwise the voltage
+configuration changes and the voltage is physically set when the regulator is
+next enabled.
+
+The regulators configured voltage output can be found by calling :-
+
+int regulator_get_voltage(regulator);
+
+NOTE: get_voltage() will return the configured output voltage whether the
+regulator is enabled or disabled and should NOT be used to determine regulator
+output state. However this can be used in conjunction with is_enabled() to
+determine the regulator physical output voltage.
+
+
+4. Regulator Current Limit Control & Status (dynamic drivers)
+===========================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+current limit to match system operating points. e.g. LCD backlight driver can
+change the current limit to vary the backlight brightness, USB drivers may want
+to set the limit to 500mA when supplying power.
+
+Consumers can control their supply current limit by calling :-
+
+int regulator_set_current_limit(regulator, min_uV, max_uV);
+
+Where min_uA and max_uA are the minimum and maximum acceptable current limit in
+microamps.
+
+NOTE: this can be called when the regulator is enabled or disabled. If called
+when enabled, then the current limit changes instantly, otherwise the current
+limit configuration changes and the current limit is physically set when the
+regulator is next enabled.
+
+A regulators current limit can be found by calling :-
+
+int regulator_get_current_limit(regulator);
+
+NOTE: get_current_limit() will return the current limit whether the regulator
+is enabled or disabled and should not be used to determine regulator current
+load.
+
+
+5. Regulator Operating Mode Control & Status (dynamic drivers)
+=============================================================
+
+Some consumers can further save system power by changing the operating mode of
+their supply regulator to be more efficient when the consumers operating state
+changes. e.g. consumer driver is idle and subsequently draws less current
+
+Regulator operating mode can be changed indirectly or directly.
+
+Indirect operating mode control.
+--------------------------------
+Consumer drivers can request a change in their supply regulator operating mode
+by calling :-
+
+int regulator_set_optimum_mode(struct regulator *regulator, int load_uA);
+
+This will cause the core to recalculate the total load on the regulator (based
+on all it's consumers) and change operating mode (if necessary and permitted)
+to best match the current operating load.
+
+The load_uA value can be determined from the consumers datasheet. e.g.most
+datasheets have tables showing the max current consumed in certain situations.
+
+Most consumers will use indirect operating mode control since they have no
+knowledge of the regulator or whether the regulator is shared with other
+consumers.
+
+Direct operating mode control.
+------------------------------
+Bespoke or tightly coupled drivers may want to directly control regulator
+operating mode depending on their operating point. This can be achieved by
+calling :-
+
+int regulator_set_mode(struct regulator *regulator, unsigned int mode);
+unsigned int regulator_get_mode(struct regulator *regulator);
+
+Direct mode will only be used by consumers that *know* about the regulator and
+are not sharing the regulator with other consumers.
+
+
+6. Regulator Events
+===================
+Regulators can notify consumers of external events. Events could be received by
+consumers under regulator stress or failure conditions.
+
+Consumers can register interest in regulator events by calling :-
+
+int regulator_register_notifier(struct regulator *regulator,
+ struct notifier_block *nb);
+
+Consumers can uregister interest by calling :-
+
+int regulator_unregister_notifier(struct regulator *regulator,
+ struct notifier_block *nb);
+
+Regulators use the kernel notifier framework to send event to thier interested
+consumers.
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
new file mode 100644
index 0000000..ce3487d
--- /dev/null
+++ b/Documentation/power/regulator/machine.txt
@@ -0,0 +1,93 @@
+Regulator Machine Driver Interface
+===================================
+
+The regulator machine driver interface is intended for board/machine specific
+initialisation code to configure the regulator subsystem.
+
+Consider the following machine :-
+
+ Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
+ |
+ +-> [Consumer B @ 3.3V]
+
+The drivers for consumers A & B must be mapped to the correct regulator in
+order to control their power supply. This mapping can be achieved in machine
+initialisation code by creating a struct regulator_consumer_supply for
+each regulator.
+
+struct regulator_consumer_supply {
+ struct device *dev; /* consumer */
+ const char *supply; /* consumer supply - e.g. "vcc" */
+};
+
+e.g. for the machine above
+
+static struct regulator_consumer_supply regulator1_consumers[] = {
+{
+ .dev = &platform_consumerB_device.dev,
+ .supply = "Vcc",
+},};
+
+static struct regulator_consumer_supply regulator2_consumers[] = {
+{
+ .dev = &platform_consumerA_device.dev,
+ .supply = "Vcc",
+},};
+
+This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
+to the 'Vcc' supply for Consumer A.
+
+Constraints can now be registered by defining a struct regulator_init_data
+for each regulator power domain. This structure also maps the consumers
+to their supply regulator :-
+
+static struct regulator_init_data regulator1_data = {
+ .constraints = {
+ .min_uV = 3300000,
+ .max_uV = 3300000,
+ .valid_modes_mask = REGULATOR_MODE_NORMAL,
+ },
+ .num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
+ .consumer_supplies = regulator1_consumers,
+};
+
+Regulator-1 supplies power to Regulator-2. This relationship must be registered
+with the core so that Regulator-1 is also enabled when Consumer A enables it's
+supply (Regulator-2). The supply regulator is set by the supply_regulator_dev
+field below:-
+
+static struct regulator_init_data regulator2_data = {
+ .supply_regulator_dev = &platform_regulator1_device.dev,
+ .constraints = {
+ .min_uV = 1800000,
+ .max_uV = 2000000,
+ .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
+ .valid_modes_mask = REGULATOR_MODE_NORMAL,
+ },
+ .num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
+ .consumer_supplies = regulator2_consumers,
+};
+
+Finally the regulator devices must be registered in the usual manner.
+
+static struct platform_device regulator_devices[] = {
+{
+ .name = "regulator",
+ .id = DCDC_1,
+ .dev = {
+ .platform_data = &regulator1_data,
+ },
+},
+{
+ .name = "regulator",
+ .id = DCDC_2,
+ .dev = {
+ .platform_data = &regulator2_data,
+ },
+},
+};
+/* register regulator 1 device */
+platform_device_register(&wm8350_regulator_devices[0]);
+
+/* register regulator 2 device */
+platform_device_register(&wm8350_regulator_devices[1]);
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
new file mode 100644
index 0000000..bdcb332
--- /dev/null
+++ b/Documentation/power/regulator/overview.txt
@@ -0,0 +1,171 @@
+Linux voltage and current regulator framework
+=============================================
+
+About
+=====
+
+This framework is designed to provide a standard kernel interface to control
+voltage and current regulators.
+
+The intention is to allow systems to dynamically control regulator power output
+in order to save power and prolong battery life. This applies to both voltage
+regulators (where voltage output is controllable) and current sinks (where
+current limit is controllable).
+
+(C) 2008 Wolfson Microelectronics PLC.
+Author: Liam Girdwood <lg@opensource.wolfsonmicro.com>
+
+
+Nomenclature
+============
+
+Some terms used in this document:-
+
+ o Regulator - Electronic device that supplies power to other devices.
+ Most regulators can enable and disable their output whilst
+ some can control their output voltage and or current.
+
+ Input Voltage -> Regulator -> Output Voltage
+
+
+ o PMIC - Power Management IC. An IC that contains numerous regulators
+ and often contains other susbsystems.
+
+
+ o Consumer - Electronic device that is supplied power by a regulator.
+ Consumers can be classified into two types:-
+
+ Static: consumer does not change it's supply voltage or
+ current limit. It only needs to enable or disable it's
+ power supply. It's supply voltage is set by the hardware,
+ bootloader, firmware or kernel board initialisation code.
+
+ Dynamic: consumer needs to change it's supply voltage or
+ current limit to meet operation demands.
+
+
+ o Power Domain - Electronic circuit that is supplied it's input power by the
+ output power of a regulator, switch or by another power
+ domain.
+
+ The supply regulator may be behind a switch(s). i.e.
+
+ Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
+ | |
+ | +-> [Consumer B], [Consumer C]
+ |
+ +-> [Consumer D], [Consumer E]
+
+ That is one regulator and three power domains:
+
+ Domain 1: Switch-1, Consumers D & E.
+ Domain 2: Switch-2, Consumers B & C.
+ Domain 3: Consumer A.
+
+ and this represents a "supplies" relationship:
+
+ Domain-1 --> Domain-2 --> Domain-3.
+
+ A power domain may have regulators that are supplied power
+ by other regulators. i.e.
+
+ Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
+ |
+ +-> [Consumer B]
+
+ This gives us two regulators and two power domains:
+
+ Domain 1: Regulator-2, Consumer B.
+ Domain 2: Consumer A.
+
+ and a "supplies" relationship:
+
+ Domain-1 --> Domain-2
+
+
+ o Constraints - Constraints are used to define power levels for performance
+ and hardware protection. Constraints exist at three levels:
+
+ Regulator Level: This is defined by the regulator hardware
+ operating parameters and is specified in the regulator
+ datasheet. i.e.
+
+ - voltage output is in the range 800mV -> 3500mV.
+ - regulator current output limit is 20mA @ 5V but is
+ 10mA @ 10V.
+
+ Power Domain Level: This is defined in software by kernel
+ level board initialisation code. It is used to constrain a
+ power domain to a particular power range. i.e.
+
+ - Domain-1 voltage is 3300mV
+ - Domain-2 voltage is 1400mV -> 1600mV
+ - Domain-3 current limit is 0mA -> 20mA.
+
+ Consumer Level: This is defined by consumer drivers
+ dynamically setting voltage or current limit levels.
+
+ e.g. a consumer backlight driver asks for a current increase
+ from 5mA to 10mA to increase LCD illumination. This passes
+ to through the levels as follows :-
+
+ Consumer: need to increase LCD brightness. Lookup and
+ request next current mA value in brightness table (the
+ consumer driver could be used on several different
+ personalities based upon the same reference device).
+
+ Power Domain: is the new current limit within the domain
+ operating limits for this domain and system state (e.g.
+ battery power, USB power)
+
+ Regulator Domains: is the new current limit within the
+ regulator operating parameters for input/ouput voltage.
+
+ If the regulator request passes all the constraint tests
+ then the new regulator value is applied.
+
+
+Design
+======
+
+The framework is designed and targeted at SoC based devices but may also be
+relevant to non SoC devices and is split into the following four interfaces:-
+
+
+ 1. Consumer driver interface.
+
+ This uses a similar API to the kernel clock interface in that consumer
+ drivers can get and put a regulator (like they can with clocks atm) and
+ get/set voltage, current limit, mode, enable and disable. This should
+ allow consumers complete control over their supply voltage and current
+ limit. This also compiles out if not in use so drivers can be reused in
+ systems with no regulator based power control.
+
+ See Documentation/power/regulator/consumer.txt
+
+ 2. Regulator driver interface.
+
+ This allows regulator drivers to register their regulators and provide
+ operations to the core. It also has a notifier call chain for propagating
+ regulator events to clients.
+
+ See Documentation/power/regulator/regulator.txt
+
+ 3. Machine interface.
+
+ This interface is for machine specific code and allows the creation of
+ voltage/current domains (with constraints) for each regulator. It can
+ provide regulator constraints that will prevent device damage through
+ overvoltage or over current caused by buggy client drivers. It also
+ allows the creation of a regulator tree whereby some regulators are
+ supplied by others (similar to a clock tree).
+
+ See Documentation/power/regulator/machine.txt
+
+ 4. Userspace ABI.
+
+ The framework also exports a lot of useful voltage/current/opmode data to
+ userspace via sysfs. This could be used to help monitor device power
+ consumption and status.
+
+ See Documentation/ABI/testing/regulator-sysfs.txt
diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt
new file mode 100644
index 0000000..4200acc
--- /dev/null
+++ b/Documentation/power/regulator/regulator.txt
@@ -0,0 +1,30 @@
+Regulator Driver Interface
+==========================
+
+The regulator driver interface is relatively simple and designed to allow
+regulator drivers to register their services with the core framework.
+
+
+Registration
+============
+
+Drivers can register a regulator by calling :-
+
+struct regulator_dev *regulator_register(struct device *dev,
+ struct regulator_desc *regulator_desc);
+
+This will register the regulators capabilities and operations to the regulator
+core.
+
+Regulators can be unregistered by calling :-
+
+void regulator_unregister(struct regulator_dev *rdev);
+
+
+Regulator Events
+================
+Regulators can send events (e.g. over temp, under voltage, etc) to consumer
+drivers by calling :-
+
+int regulator_notifier_call_chain(struct regulator_dev *rdev,
+ unsigned long event, void *data);
diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt
new file mode 100644
index 0000000..2ebdc60
--- /dev/null
+++ b/Documentation/power/s2ram.txt
@@ -0,0 +1,74 @@
+ How to get s2ram working
+ ~~~~~~~~~~~~~~~~~~~~~~~~
+ 2006 Linus Torvalds
+ 2006 Pavel Machek
+
+1) Check suspend.sf.net, program s2ram there has long whitelist of
+ "known ok" machines, along with tricks to use on each one.
+
+2) If that does not help, try reading tricks.txt and
+ video.txt. Perhaps problem is as simple as broken module, and
+ simple module unload can fix it.
+
+3) You can use Linus' TRACE_RESUME infrastructure, described below.
+
+ Using TRACE_RESUME
+ ~~~~~~~~~~~~~~~~~~
+
+I've been working at making the machines I have able to STR, and almost
+always it's a driver that is buggy. Thank God for the suspend/resume
+debugging - the thing that Chuck tried to disable. That's often the _only_
+way to debug these things, and it's actually pretty powerful (but
+time-consuming - having to insert TRACE_RESUME() markers into the device
+driver that doesn't resume and recompile and reboot).
+
+Anyway, the way to debug this for people who are interested (have a
+machine that doesn't boot) is:
+
+ - enable PM_DEBUG, and PM_TRACE
+
+ - use a script like this:
+
+ #!/bin/sh
+ sync
+ echo 1 > /sys/power/pm_trace
+ echo mem > /sys/power/state
+
+ to suspend
+
+ - if it doesn't come back up (which is usually the problem), reboot by
+ holding the power button down, and look at the dmesg output for things
+ like
+
+ Magic number: 4:156:725
+ hash matches drivers/base/power/resume.c:28
+ hash matches device 0000:01:00.0
+
+ which means that the last trace event was just before trying to resume
+ device 0000:01:00.0. Then figure out what driver is controlling that
+ device (lspci and /sys/devices/pci* is your friend), and see if you can
+ fix it, disable it, or trace into its resume function.
+
+For example, the above happens to be the VGA device on my EVO, which I
+used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
+that "radeonfb" simply cannot resume that device - it tries to set the
+PLL's, and it just _hangs_. Using the regular VGA console and letting X
+resume it instead works fine.
+
+NOTE
+====
+pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
+Reason for this is that the RTC is the only reliably available piece of
+hardware during resume operations where a value can be set that will
+survive a reboot.
+
+Consequence is that after a resume (even if it is successful) your system
+clock will have a value corresponding to the magic mumber instead of the
+correct date/time! It is therefore advisable to use a program like ntp-date
+or rdate to reset the correct date/time from an external time source when
+using this trace option.
+
+As the clock keeps ticking it is also essential that the reboot is done
+quickly after the resume failure. The trace option does not use the seconds
+or the low order bits of the minutes of the RTC, but a too long delay will
+corrupt the magic value.
diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt
new file mode 100644
index 0000000..34800cc
--- /dev/null
+++ b/Documentation/power/states.txt
@@ -0,0 +1,80 @@
+
+System Power Management States
+
+
+The kernel supports three power management states generically, though
+each is dependent on platform support code to implement the low-level
+details for each state. This file describes each state, what they are
+commonly called, what ACPI state they map to, and what string to write
+to /sys/power/state to enter that state
+
+
+State: Standby / Power-On Suspend
+ACPI State: S1
+String: "standby"
+
+This state offers minimal, though real, power savings, while providing
+a very low-latency transition back to a working system. No operating
+state is lost (the CPU retains power), so the system easily starts up
+again where it left off.
+
+We try to put devices in a low-power state equivalent to D1, which
+also offers low power savings, but low resume latency. Not all devices
+support D1, and those that don't are left on.
+
+A transition from Standby to the On state should take about 1-2
+seconds.
+
+
+State: Suspend-to-RAM
+ACPI State: S3
+String: "mem"
+
+This state offers significant power savings as everything in the
+system is put into a low-power state, except for memory, which is
+placed in self-refresh mode to retain its contents.
+
+System and device state is saved and kept in memory. All devices are
+suspended and put into D3. In many cases, all peripheral buses lose
+power when entering STR, so devices must be able to handle the
+transition back to the On state.
+
+For at least ACPI, STR requires some minimal boot-strapping code to
+resume the system from STR. This may be true on other platforms.
+
+A transition from Suspend-to-RAM to the On state should take about
+3-5 seconds.
+
+
+State: Suspend-to-disk
+ACPI State: S4
+String: "disk"
+
+This state offers the greatest power savings, and can be used even in
+the absence of low-level platform support for power management. This
+state operates similarly to Suspend-to-RAM, but includes a final step
+of writing memory contents to disk. On resume, this is read and memory
+is restored to its pre-suspend state.
+
+STD can be handled by the firmware or the kernel. If it is handled by
+the firmware, it usually requires a dedicated partition that must be
+setup via another operating system for it to use. Despite the
+inconvenience, this method requires minimal work by the kernel, since
+the firmware will also handle restoring memory contents on resume.
+
+For suspend-to-disk, a mechanism called swsusp called 'swsusp' (Swap
+Suspend) is used to write memory contents to free swap space.
+swsusp has some restrictive requirements, but should work in most
+cases. Some, albeit outdated, documentation can be found in
+Documentation/power/swsusp.txt. Alternatively, userspace can do most
+of the actual suspend to disk work, see userland-swsusp.txt.
+
+Once memory state is written to disk, the system may either enter a
+low-power state (like ACPI S4), or it may simply power down. Powering
+down offers greater savings, and allows this mechanism to work on any
+system. However, entering a real low-power state allows the user to
+trigger wake up events (e.g. pressing a key or opening a laptop lid).
+
+A transition from Suspend-to-Disk to the On state should take about 30
+seconds, though it's typically a bit more with the current
+implementation.
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
new file mode 100644
index 0000000..f281886
--- /dev/null
+++ b/Documentation/power/swsusp-and-swap-files.txt
@@ -0,0 +1,60 @@
+Using swap files with software suspend (swsusp)
+ (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+The Linux kernel handles swap files almost in the same way as it handles swap
+partitions and there are only two differences between these two types of swap
+areas:
+(1) swap files need not be contiguous,
+(2) the header of a swap file is not in the first block of the partition that
+holds it. From the swsusp's point of view (1) is not a problem, because it is
+already taken care of by the swap-handling code, but (2) has to be taken into
+consideration.
+
+In principle the location of a swap file's header may be determined with the
+help of appropriate filesystem driver. Unfortunately, however, it requires the
+filesystem holding the swap file to be mounted, and if this filesystem is
+journaled, it cannot be mounted during resume from disk. For this reason to
+identify a swap file swsusp uses the name of the partition that holds the file
+and the offset from the beginning of the partition at which the swap file's
+header is located. For convenience, this offset is expressed in <PAGE_SIZE>
+units.
+
+In order to use a swap file with swsusp, you need to:
+
+1) Create the swap file and make it active, eg.
+
+# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
+# mkswap <swap_file_path>
+# swapon <swap_file_path>
+
+2) Use an application that will bmap the swap file with the help of the
+FIBMAP ioctl and determine the location of the file's swap header, as the
+offset, in <PAGE_SIZE> units, from the beginning of the partition which
+holds the swap file.
+
+3) Add the following parameters to the kernel command line:
+
+resume=<swap_file_partition> resume_offset=<swap_file_offset>
+
+where <swap_file_partition> is the partition on which the swap file is located
+and <swap_file_offset> is the offset of the swap header determined by the
+application in 2) (of course, this step may be carried out automatically
+by the same application that determines the swap file's header offset using the
+FIBMAP ioctl)
+
+OR
+
+Use a userland suspend application that will set the partition and offset
+with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
+Documentation/power/userland-swsusp.txt (this is the only method to suspend
+to a swap file allowing the resume to be initiated from an initrd or initramfs
+image).
+
+Now, swsusp will use the swap file in the same way in which it would use a swap
+partition. In particular, the swap file has to be active (ie. be present in
+/proc/swaps) so that it can be used for suspending.
+
+Note that if the swap file used for suspending is deleted and recreated,
+the location of its header need not be the same as before. Thus every time
+this happens the value of the "resume_offset=" kernel command line parameter
+has to be updated.
diff --git a/Documentation/power/swsusp-dmcrypt.txt b/Documentation/power/swsusp-dmcrypt.txt
new file mode 100644
index 0000000..59931b4
--- /dev/null
+++ b/Documentation/power/swsusp-dmcrypt.txt
@@ -0,0 +1,138 @@
+Author: Andreas Steinmetz <ast@domdv.de>
+
+
+How to use dm-crypt and swsusp together:
+========================================
+
+Some prerequisites:
+You know how dm-crypt works. If not, visit the following web page:
+http://www.saout.de/misc/dm-crypt/
+You have read Documentation/power/swsusp.txt and understand it.
+You did read Documentation/initrd.txt and know how an initrd works.
+You know how to create or how to modify an initrd.
+
+Now your system is properly set up, your disk is encrypted except for
+the swap device(s) and the boot partition which may contain a mini
+system for crypto setup and/or rescue purposes. You may even have
+an initrd that does your current crypto setup already.
+
+At this point you want to encrypt your swap, too. Still you want to
+be able to suspend using swsusp. This, however, means that you
+have to be able to either enter a passphrase or that you read
+the key(s) from an external device like a pcmcia flash disk
+or an usb stick prior to resume. So you need an initrd, that sets
+up dm-crypt and then asks swsusp to resume from the encrypted
+swap device.
+
+The most important thing is that you set up dm-crypt in such
+a way that the swap device you suspend to/resume from has
+always the same major/minor within the initrd as well as
+within your running system. The easiest way to achieve this is
+to always set up this swap device first with dmsetup, so that
+it will always look like the following:
+
+brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
+
+Now set up your kernel to use /dev/mapper/swap0 as the default
+resume partition, so your kernel .config contains:
+
+CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
+
+Prepare your boot loader to use the initrd you will create or
+modify. For lilo the simplest setup looks like the following
+lines:
+
+image=/boot/vmlinuz
+initrd=/boot/initrd.gz
+label=linux
+append="root=/dev/ram0 init=/linuxrc rw"
+
+Finally you need to create or modify your initrd. Lets assume
+you create an initrd that reads the required dm-crypt setup
+from a pcmcia flash disk card. The card is formatted with an ext2
+fs which resides on /dev/hde1 when the card is inserted. The
+card contains at least the encrypted swap setup in a file
+named "swapkey". /etc/fstab of your initrd contains something
+like the following:
+
+/dev/hda1 /mnt ext3 ro 0 0
+none /proc proc defaults,noatime,nodiratime 0 0
+none /sys sysfs defaults,noatime,nodiratime 0 0
+
+/dev/hda1 contains an unencrypted mini system that sets up all
+of your crypto devices, again by reading the setup from the
+pcmcia flash disk. What follows now is a /linuxrc for your
+initrd that allows you to resume from encrypted swap and that
+continues boot with your mini system on /dev/hda1 if resume
+does not happen:
+
+#!/bin/sh
+PATH=/sbin:/bin:/usr/sbin:/usr/bin
+mount /proc
+mount /sys
+mapped=0
+noresume=`grep -c noresume /proc/cmdline`
+if [ "$*" != "" ]
+then
+ noresume=1
+fi
+dmesg -n 1
+/sbin/cardmgr -q
+for i in 1 2 3 4 5 6 7 8 9 0
+do
+ if [ -f /proc/ide/hde/media ]
+ then
+ usleep 500000
+ mount -t ext2 -o ro /dev/hde1 /mnt
+ if [ -f /mnt/swapkey ]
+ then
+ dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
+ fi
+ umount /mnt
+ break
+ fi
+ usleep 500000
+done
+killproc /sbin/cardmgr
+dmesg -n 6
+if [ $mapped = 1 ]
+then
+ if [ $noresume != 0 ]
+ then
+ mkswap /dev/mapper/swap0 > /dev/null 2>&1
+ fi
+ echo 254:0 > /sys/power/resume
+ dmsetup remove swap0
+fi
+umount /sys
+mount /mnt
+umount /proc
+cd /mnt
+pivot_root . mnt
+mount /proc
+umount -l /mnt
+umount /proc
+exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
+
+Please don't mind the weird loop above, busybox's msh doesn't know
+the let statement. Now, what is happening in the script?
+First we have to decide if we want to try to resume, or not.
+We will not resume if booting with "noresume" or any parameters
+for init like "single" or "emergency" as boot parameters.
+
+Then we need to set up dmcrypt with the setup data from the
+pcmcia flash disk. If this succeeds we need to reset the swap
+device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
+then attempts to resume from the first device mapper device.
+Note that it is important to set the device in /sys/power/resume,
+regardless if resuming or not, otherwise later suspend will fail.
+If resume starts, script execution terminates here.
+
+Otherwise we just remove the encrypted swap device and leave it to the
+mini system on /dev/hda1 to set the whole crypto up (it is up to
+you to modify this to your taste).
+
+What then follows is the well known process to change the root
+file system and continue booting from there. I prefer to unmount
+the initrd prior to continue booting but it is up to you to modify
+this.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
new file mode 100644
index 0000000..9d60ab7
--- /dev/null
+++ b/Documentation/power/swsusp.txt
@@ -0,0 +1,407 @@
+Some warnings, first.
+
+ * BIG FAT WARNING *********************************************************
+ *
+ * If you touch anything on disk between suspend and resume...
+ * ...kiss your data goodbye.
+ *
+ * If you do resume from initrd after your filesystems are mounted...
+ * ...bye bye root partition.
+ * [this is actually same case as above]
+ *
+ * If you have unsupported (*) devices using DMA, you may have some
+ * problems. If your disk driver does not support suspend... (IDE does),
+ * it may cause some problems, too. If you change kernel command line
+ * between suspend and resume, it may do something wrong. If you change
+ * your hardware while system is suspended... well, it was not good idea;
+ * but it will probably only crash.
+ *
+ * (*) suspend/resume support is needed to make it safe.
+ *
+ * If you have any filesystems on USB devices mounted before software suspend,
+ * they won't be accessible after resume and you may lose data, as though
+ * you have unplugged the USB devices with mounted filesystems on them;
+ * see the FAQ below for details. (This is not true for more traditional
+ * power states like "standby", which normally don't turn USB off.)
+
+You need to append resume=/dev/your_swap_partition to kernel command
+line. Then you suspend by
+
+echo shutdown > /sys/power/disk; echo disk > /sys/power/state
+
+. If you feel ACPI works pretty well on your system, you might try
+
+echo platform > /sys/power/disk; echo disk > /sys/power/state
+
+. If you have SATA disks, you'll need recent kernels with SATA suspend
+support. For suspend and resume to work, make sure your disk drivers
+are built into kernel -- not modules. [There's way to make
+suspend/resume with modular disk drivers, see FAQ, but you probably
+should not do that.]
+
+If you want to limit the suspend image size to N bytes, do
+
+echo N > /sys/power/image_size
+
+before suspend (it is limited to 500 MB by default).
+
+
+Article about goals and implementation of Software Suspend for Linux
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Author: G‚ábor Kuti
+Last revised: 2003-10-20 by Pavel Machek
+
+Idea and goals to achieve
+
+Nowadays it is common in several laptops that they have a suspend button. It
+saves the state of the machine to a filesystem or to a partition and switches
+to standby mode. Later resuming the machine the saved state is loaded back to
+ram and the machine can continue its work. It has two real benefits. First we
+save ourselves the time machine goes down and later boots up, energy costs
+are real high when running from batteries. The other gain is that we don't have to
+interrupt our programs so processes that are calculating something for a long
+time shouldn't need to be written interruptible.
+
+swsusp saves the state of the machine into active swaps and then reboots or
+powerdowns. You must explicitly specify the swap partition to resume from with
+``resume='' kernel option. If signature is found it loads and restores saved
+state. If the option ``noresume'' is specified as a boot parameter, it skips
+the resuming.
+
+In the meantime while the system is suspended you should not add/remove any
+of the hardware, write to the filesystems, etc.
+
+Sleep states summary
+====================
+
+There are three different interfaces you can use, /proc/acpi should
+work like this:
+
+In a really perfect world:
+echo 1 > /proc/acpi/sleep # for standby
+echo 2 > /proc/acpi/sleep # for suspend to ram
+echo 3 > /proc/acpi/sleep # for suspend to ram, but with more power conservative
+echo 4 > /proc/acpi/sleep # for suspend to disk
+echo 5 > /proc/acpi/sleep # for shutdown unfriendly the system
+
+and perhaps
+echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios
+
+Frequently Asked Questions
+==========================
+
+Q: well, suspending a server is IMHO a really stupid thing,
+but... (Diego Zuccato):
+
+A: You bought new UPS for your server. How do you install it without
+bringing machine down? Suspend to disk, rearrange power cables,
+resume.
+
+You have your server on UPS. Power died, and UPS is indicating 30
+seconds to failure. What do you do? Suspend to disk.
+
+
+Q: Maybe I'm missing something, but why don't the regular I/O paths work?
+
+A: We do use the regular I/O paths. However we cannot restore the data
+to its original location as we load it. That would create an
+inconsistent kernel state which would certainly result in an oops.
+Instead, we load the image into unused memory and then atomically copy
+it back to it original location. This implies, of course, a maximum
+image size of half the amount of memory.
+
+There are two solutions to this:
+
+* require half of memory to be free during suspend. That way you can
+read "new" data onto free spots, then cli and copy
+
+* assume we had special "polling" ide driver that only uses memory
+between 0-640KB. That way, I'd have to make sure that 0-640KB is free
+during suspending, but otherwise it would work...
+
+suspend2 shares this fundamental limitation, but does not include user
+data and disk caches into "used memory" by saving them in
+advance. That means that the limitation goes away in practice.
+
+Q: Does linux support ACPI S4?
+
+A: Yes. That's what echo platform > /sys/power/disk does.
+
+Q: What is 'suspend2'?
+
+A: suspend2 is 'Software Suspend 2', a forked implementation of
+suspend-to-disk which is available as separate patches for 2.4 and 2.6
+kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
+highmem and preemption. It also has a extensible architecture that
+allows for arbitrary transformations on the image (compression,
+encryption) and arbitrary backends for writing the image (eg to swap
+or an NFS share[Work In Progress]). Questions regarding suspend2
+should be sent to the mailing list available through the suspend2
+website, and not to the Linux Kernel Mailing List. We are working
+toward merging suspend2 into the mainline kernel.
+
+Q: What is the freezing of tasks and why are we using it?
+
+A: The freezing of tasks is a mechanism by which user space processes and some
+kernel threads are controlled during hibernation or system-wide suspend (on some
+architectures). See freezing-of-tasks.txt for details.
+
+Q: What is the difference between "platform" and "shutdown"?
+
+A:
+
+shutdown: save state in linux, then tell bios to powerdown
+
+platform: save state in linux, then tell bios to powerdown and blink
+ "suspended led"
+
+"platform" is actually right thing to do where supported, but
+"shutdown" is most reliable (except on ACPI systems).
+
+Q: I do not understand why you have such strong objections to idea of
+selective suspend.
+
+A: Do selective suspend during runtime power management, that's okay. But
+it's useless for suspend-to-disk. (And I do not see how you could use
+it for suspend-to-ram, I hope you do not want that).
+
+Lets see, so you suggest to
+
+* SUSPEND all but swap device and parents
+* Snapshot
+* Write image to disk
+* SUSPEND swap device and parents
+* Powerdown
+
+Oh no, that does not work, if swap device or its parents uses DMA,
+you've corrupted data. You'd have to do
+
+* SUSPEND all but swap device and parents
+* FREEZE swap device and parents
+* Snapshot
+* UNFREEZE swap device and parents
+* Write
+* SUSPEND swap device and parents
+
+Which means that you still need that FREEZE state, and you get more
+complicated code. (And I have not yet introduce details like system
+devices).
+
+Q: There don't seem to be any generally useful behavioral
+distinctions between SUSPEND and FREEZE.
+
+A: Doing SUSPEND when you are asked to do FREEZE is always correct,
+but it may be unneccessarily slow. If you want your driver to stay simple,
+slowness may not matter to you. It can always be fixed later.
+
+For devices like disk it does matter, you do not want to spindown for
+FREEZE.
+
+Q: After resuming, system is paging heavily, leading to very bad interactivity.
+
+A: Try running
+
+cat `cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u` > /dev/null
+
+after resume. swapoff -a; swapon -a may also be useful.
+
+Q: What happens to devices during swsusp? They seem to be resumed
+during system suspend?
+
+A: That's correct. We need to resume them if we want to write image to
+disk. Whole sequence goes like
+
+ Suspend part
+ ~~~~~~~~~~~~
+ running system, user asks for suspend-to-disk
+
+ user processes are stopped
+
+ suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+ with state snapshot
+
+ state snapshot: copy of whole used memory is taken with interrupts disabled
+
+ resume(): devices are woken up so that we can write image to swap
+
+ write image to swap
+
+ suspend(PMSG_SUSPEND): suspend devices so that we can power off
+
+ turn the power off
+
+ Resume part
+ ~~~~~~~~~~~
+ (is actually pretty similar)
+
+ running system, user asks for suspend-to-disk
+
+ user processes are stopped (in common case there are none, but with resume-from-initrd, noone knows)
+
+ read image from disk
+
+ suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+ with image restoration
+
+ image restoration: rewrite memory with image
+
+ resume(): devices are woken up so that system can continue
+
+ thaw all user processes
+
+Q: What is this 'Encrypt suspend image' for?
+
+A: First of all: it is not a replacement for dm-crypt encrypted swap.
+It cannot protect your computer while it is suspended. Instead it does
+protect from leaking sensitive data after resume from suspend.
+
+Think of the following: you suspend while an application is running
+that keeps sensitive data in memory. The application itself prevents
+the data from being swapped out. Suspend, however, must write these
+data to swap to be able to resume later on. Without suspend encryption
+your sensitive data are then stored in plaintext on disk. This means
+that after resume your sensitive data are accessible to all
+applications having direct access to the swap device which was used
+for suspend. If you don't need swap after resume these data can remain
+on disk virtually forever. Thus it can happen that your system gets
+broken in weeks later and sensitive data which you thought were
+encrypted and protected are retrieved and stolen from the swap device.
+To prevent this situation you should use 'Encrypt suspend image'.
+
+During suspend a temporary key is created and this key is used to
+encrypt the data written to disk. When, during resume, the data was
+read back into memory the temporary key is destroyed which simply
+means that all data written to disk during suspend are then
+inaccessible so they can't be stolen later on. The only thing that
+you must then take care of is that you call 'mkswap' for the swap
+partition used for suspend as early as possible during regular
+boot. This asserts that any temporary key from an oopsed suspend or
+from a failed or aborted resume is erased from the swap device.
+
+As a rule of thumb use encrypted swap to protect your data while your
+system is shut down or suspended. Additionally use the encrypted
+suspend image to prevent sensitive data from being stolen after
+resume.
+
+Q: Can I suspend to a swap file?
+
+A: Generally, yes, you can. However, it requires you to use the "resume=" and
+"resume_offset=" kernel command line parameters, so the resume from a swap file
+cannot be initiated from an initrd or initramfs image. See
+swsusp-and-swap-files.txt for details.
+
+Q: Is there a maximum system RAM size that is supported by swsusp?
+
+A: It should work okay with highmem.
+
+Q: Does swsusp (to disk) use only one swap partition or can it use
+multiple swap partitions (aggregate them into one logical space)?
+
+A: Only one swap partition, sorry.
+
+Q: If my application(s) causes lots of memory & swap space to be used
+(over half of the total system RAM), is it correct that it is likely
+to be useless to try to suspend to disk while that app is running?
+
+A: No, it should work okay, as long as your app does not mlock()
+it. Just prepare big enough swap partition.
+
+Q: What information is useful for debugging suspend-to-disk problems?
+
+A: Well, last messages on the screen are always useful. If something
+is broken, it is usually some kernel driver, therefore trying with as
+little as possible modules loaded helps a lot. I also prefer people to
+suspend from console, preferably without X running. Booting with
+init=/bin/bash, then swapon and starting suspend sequence manually
+usually does the trick. Then it is good idea to try with latest
+vanilla kernel.
+
+Q: How can distributions ship a swsusp-supporting kernel with modular
+disk drivers (especially SATA)?
+
+A: Well, it can be done, load the drivers, then do echo into
+/sys/power/disk/resume file from initrd. Be sure not to mount
+anything, not even read-only mount, or you are going to lose your
+data.
+
+Q: How do I make suspend more verbose?
+
+A: If you want to see any non-error kernel messages on the virtual
+terminal the kernel switches to during suspend, you have to set the
+kernel console loglevel to at least 4 (KERN_WARNING), for example by
+doing
+
+ # save the old loglevel
+ read LOGLEVEL DUMMY < /proc/sys/kernel/printk
+ # set the loglevel so we see the progress bar.
+ # if the level is higher than needed, we leave it alone.
+ if [ $LOGLEVEL -lt 5 ]; then
+ echo 5 > /proc/sys/kernel/printk
+ fi
+
+ IMG_SZ=0
+ read IMG_SZ < /sys/power/image_size
+ echo -n disk > /sys/power/state
+ RET=$?
+ #
+ # the logic here is:
+ # if image_size > 0 (without kernel support, IMG_SZ will be zero),
+ # then try again with image_size set to zero.
+ if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
+ echo 0 > /sys/power/image_size
+ echo -n disk > /sys/power/state
+ RET=$?
+ fi
+
+ # restore previous loglevel
+ echo $LOGLEVEL > /proc/sys/kernel/printk
+ exit $RET
+
+Q: Is this true that if I have a mounted filesystem on a USB device and
+I suspend to disk, I can lose data unless the filesystem has been mounted
+with "sync"?
+
+A: That's right ... if you disconnect that device, you may lose data.
+In fact, even with "-o sync" you can lose data if your programs have
+information in buffers they haven't written out to a disk you disconnect,
+or if you disconnect before the device finished saving data you wrote.
+
+Software suspend normally powers down USB controllers, which is equivalent
+to disconnecting all USB devices attached to your system.
+
+Your system might well support low-power modes for its USB controllers
+while the system is asleep, maintaining the connection, using true sleep
+modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the
+/sys/power/state file; write "standby" or "mem".) We've not seen any
+hardware that can use these modes through software suspend, although in
+theory some systems might support "platform" modes that won't break the
+USB connections.
+
+Remember that it's always a bad idea to unplug a disk drive containing a
+mounted filesystem. That's true even when your system is asleep! The
+safest thing is to unmount all filesystems on removable media (such USB,
+Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
+before suspending; then remount them after resuming.
+
+There is a work-around for this problem. For more information, see
+Documentation/usb/persist.txt.
+
+Q: Can I suspend-to-disk using a swap partition under LVM?
+
+A: No. You can suspend successfully, but you'll not be able to
+resume. uswsusp should be able to work with LVM. See suspend.sf.net.
+
+Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
+compiled with the similar configuration files. Anyway I found that
+suspend to disk (and resume) is much slower on 2.6.16 compared to
+2.6.15. Any idea for why that might happen or how can I speed it up?
+
+A: This is because the size of the suspend image is now greater than
+for 2.6.15 (by saving more data we can get more responsive system
+after resume).
+
+There's the /sys/power/image_size knob that controls the size of the
+image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
+root), the 2.6.15 behavior should be restored. If it is still too
+slow, take a look at suspend.sf.net -- userland suspend is faster and
+supports LZF compression to speed it up further.
diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt
new file mode 100644
index 0000000..3b26bb5
--- /dev/null
+++ b/Documentation/power/tricks.txt
@@ -0,0 +1,27 @@
+ swsusp/S3 tricks
+ ~~~~~~~~~~~~~~~~
+Pavel Machek <pavel@suse.cz>
+
+If you want to trick swsusp/S3 into working, you might want to try:
+
+* go with minimal config, turn off drivers like USB, AGP you don't
+ really need
+
+* turn off APIC and preempt
+
+* use ext2. At least it has working fsck. [If something seems to go
+ wrong, force fsck when you have a chance]
+
+* turn off modules
+
+* use vga text console, shut down X. [If you really want X, you might
+ want to try vesafb later]
+
+* try running as few processes as possible, preferably go to single
+ user mode.
+
+* due to video issues, swsusp should be easier to get working than
+ S3. Try that first.
+
+When you make it work, try to find out what exactly was it that broke
+suspend, and preferably fix that.
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
new file mode 100644
index 0000000..7b99636
--- /dev/null
+++ b/Documentation/power/userland-swsusp.txt
@@ -0,0 +1,165 @@
+Documentation for userland software suspend interface
+ (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+First, the warnings at the beginning of swsusp.txt still apply.
+
+Second, you should read the FAQ in swsusp.txt _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel. Such utilities are available, for example, from
+<http://suspend.sourceforge.net>. You may want to have a look at them if you
+are going to develop your own suspend/resume utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in include/linux/suspend_ioctls.h . The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing. If open for
+reading, it is considered to be in the suspend mode. Otherwise it is
+assumed to be in the resume mode. The device cannot be open for simultaneous
+reading and writing. It is also impossible to have the device open more than
+once at a time.
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE - freeze user space processes (the current process is
+ not frozen); this is required for SNAPSHOT_CREATE_IMAGE
+ and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the
+ last argument of ioctl() should be a pointer to an int variable,
+ the value of which will indicate whether the call returned after
+ creating the snapshot (1) or after restoring the system memory state
+ from it (0) (after resume the system finds itself finishing the
+ SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
+ has been created the read() operation can be used to transfer
+ it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
+ uploaded snapshot image; before calling it you should transfer
+ the system memory snapshot back to the kernel using the write()
+ operation; this call will not succeed if the snapshot
+ image is not available to the kernel
+
+SNAPSHOT_FREE - free memory allocated for the snapshot image
+
+SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image
+ (the kernel will do its best to ensure the image size will not exceed
+ this number, but if it turns out to be impossible, the kernel will
+ create the smallest image possible)
+
+SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image
+
+SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the
+ last argument should be a pointer to an unsigned int variable that will
+ contain the result if the call is successful).
+
+SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition
+ (the last argument should be a pointer to a loff_t variable that
+ will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by
+ SNAPSHOT_ALLOC_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in <PAGE_SIZE>
+ units) from the beginning of the partition at which the swap header is
+ located (the last ioctl() argument should point to a struct
+ resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
+ containing the resume device specification and the offset); for swap
+ partitions the offset is always 0, but it is different from zero for
+ swap files (see Documentation/swsusp-and-swap-files.txt for details).
+
+SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support,
+ depending on the argument value (enable, if the argument is nonzero)
+
+SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation
+ state (eg. ACPI S4) using the platform (eg. ACPI) driver
+
+SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
+ immediately enter the suspend-to-RAM state, so this call must always
+ be preceded by the SNAPSHOT_FREEZE call and it is also necessary
+ to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call
+ is needed to implement the suspend-to-both mechanism in which the
+ suspend image is first created, as though the system had been suspended
+ to disk, and then the system is suspended to RAM (this makes it possible
+ to resume the system from RAM if there's enough battery power or restore
+ its state on the basis of the saved suspend image otherwise)
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel. It has the following limitations:
+- you cannot read() more than one virtual memory page at a time
+- read()s accross page boundaries are impossible (ie. if ypu read() 1/2 of
+ a page in the previous call, you will only be able to read()
+ _at_ _most_ 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel. It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap parition, called the resume
+partition, or a swap file as storage space (if a swap file is used, the resume
+partition is the partition that holds this file). However, this is not really
+required, as they can use, for example, a special (blank) suspend partition or
+a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
+mounted afterwards.
+
+These utilities MUST NOT make any assumptions regarding the ordering of
+data within the snapshot image. The contents of the image are entirely owned
+by the kernel and its structure may be changed in future kernel releases.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read). Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header. If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferrably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+1. If the value is 1 (ie. the system memory snapshot has just been
+ created and the system is ready for saving it):
+ (a) The suspending utility MUST NOT close the snapshot device
+ _unless_ the whole suspend procedure is to be cancelled, in
+ which case, if the snapshot image has already been saved, the
+ suspending utility SHOULD destroy it, preferrably by zapping
+ its header. If the suspend is not to be cancelled, the
+ system MUST be powered off or rebooted after the snapshot
+ image has been saved.
+ (b) The suspending utility SHOULD NOT attempt to perform any
+ file system operations (including reads) on the file systems
+ that were mounted before SNAPSHOT_CREATE_IMAGE has been
+ called. However, it MAY mount a file system that was not
+ mounted at that time and perform some operations on it (eg.
+ use it for saving the image).
+2. If the value is 0 (ie. the system state has just been restored from
+ the snapshot image), the suspending utility MUST close the snapshot
+ device. Afterwards it will be treated as a regular userland process,
+ so it need not exit.
+
+The resuming utility SHOULD NOT attempt to mount any file systems that could
+be mounted before suspend and SHOULD NOT attempt to perform any operations
+involving such file systems.
+
+For details, please refer to the source code.
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt
new file mode 100644
index 0000000..2b35849
--- /dev/null
+++ b/Documentation/power/video.txt
@@ -0,0 +1,185 @@
+
+ Video issues with S3 resume
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ 2003-2006, Pavel Machek
+
+During S3 resume, hardware needs to be reinitialized. For most
+devices, this is easy, and kernel driver knows how to do
+it. Unfortunately there's one exception: video card. Those are usually
+initialized by BIOS, and kernel does not have enough information to
+boot video card. (Kernel usually does not even contain video card
+driver -- vesafb and vgacon are widely used).
+
+This is not problem for swsusp, because during swsusp resume, BIOS is
+run normally so video card is normally initialized. It should not be
+problem for S1 standby, because hardware should retain its state over
+that.
+
+We either have to run video BIOS during early resume, or interpret it
+using vbetool later, or maybe nothing is necessary on particular
+system because video state is preserved. Unfortunately different
+methods work on different systems, and no known method suits all of
+them.
+
+Userland application called s2ram has been developed; it contains long
+whitelist of systems, and automatically selects working method for a
+given system. It can be downloaded from CVS at
+www.sf.net/projects/suspend . If you get a system that is not in the
+whitelist, please try to find a working solution, and submit whitelist
+entry so that work does not need to be repeated.
+
+Currently, VBE_SAVE method (6 below) works on most
+systems. Unfortunately, vbetool only runs after userland is resumed,
+so it makes debugging of early resume problems
+hard/impossible. Methods that do not rely on userland are preferable.
+
+Details
+~~~~~~~
+
+There are a few types of systems where video works after S3 resume:
+
+(1) systems where video state is preserved over S3.
+
+(2) systems where it is possible to call the video BIOS during S3
+ resume. Unfortunately, it is not correct to call the video BIOS at
+ that point, but it happens to work on some machines. Use
+ acpi_sleep=s3_bios.
+
+(3) systems that initialize video card into vga text mode and where
+ the BIOS works well enough to be able to set video mode. Use
+ acpi_sleep=s3_mode on these.
+
+(4) on some systems s3_bios kicks video into text mode, and
+ acpi_sleep=s3_bios,s3_mode is needed.
+
+(5) radeon systems, where X can soft-boot your video card. You'll need
+ a new enough X, and a plain text console (no vesafb or radeonfb). See
+ http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
+ Alternatively, you should use vbetool (6) instead.
+
+(6) other radeon systems, where vbetool is enough to bring system back
+ to life. It needs text console to be working. Do vbetool vbestate
+ save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
+ vbestate restore < /tmp/delme; setfont <whatever>, and your video
+ should work.
+
+(7) on some systems, it is possible to boot most of kernel, and then
+ POSTing bios works. Ole Rohne has patch to do just that at
+ http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
+
+(8) on some systems, you can use the video_post utility mentioned here:
+ http://bugzilla.kernel.org/show_bug.cgi?id=3670. Do echo 3 > /sys/power/state
+ && /usr/sbin/video_post - which will initialize the display in console mode.
+ If you are in X, you can switch to a virtual terminal and back to X using
+ CTRL+ALT+F1 - CTRL+ALT+F7 to get the display working in graphical mode again.
+
+Now, if you pass acpi_sleep=something, and it does not work with your
+bios, you'll get a hard crash during resume. Be careful. Also it is
+safest to do your experiments with plain old VGA console. The vesafb
+and radeonfb (etc) drivers have a tendency to crash the machine during
+resume.
+
+You may have a system where none of above works. At that point you
+either invent another ugly hack that works, or write proper driver for
+your video card (good luck getting docs :-(). Maybe suspending from X
+(proper X, knowing your hardware, not XF68_FBcon) might have better
+chance of working.
+
+Table of known working notebooks:
+
+Model hack (or "how to do it")
+------------------------------------------------------------------------------
+Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI
+Acer TM 230 s3_bios (2)
+Acer TM 242FX vbetool (6)
+Acer TM C110 video_post (8)
+Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8)
+Acer TM 4052LCi s3_bios (2)
+Acer TM 636Lci s3_bios,s3_mode (4)
+Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back
+Acer TM 660 ??? (*)
+Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6)
+Acer TM 803 vga=normal, X patches, see webpage (5) or vbetool (6)
+Acer TM 803LCi vga=normal, vbetool (6)
+Arima W730a vbetool needed (6)
+Asus L2400D s3_mode (3)(***) (S1 also works OK)
+Asus L3350M (SiS 740) (6)
+Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK)
+Asus M6887Ne vga=normal, s3_bios (2), use radeon driver instead of fglrx in x.org
+Athlon64 desktop prototype s3_bios (2)
+Compal CL-50 ??? (*)
+Compaq Armada E500 - P3-700 none (1) (S1 also works OK)
+Compaq Evo N620c vga=normal, s3_bios (2)
+Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1
+Dell D600, ATI RV250 vga=normal and X, or try vbestate (6)
+Dell D610 vga=normal and X (possibly vbestate (6) too, but not tested)
+Dell Inspiron 4000 ??? (*)
+Dell Inspiron 500m ??? (*)
+Dell Inspiron 510m ???
+Dell Inspiron 5150 vbetool needed (6)
+Dell Inspiron 600m ??? (*)
+Dell Inspiron 8200 ??? (*)
+Dell Inspiron 8500 ??? (*)
+Dell Inspiron 8600 ??? (*)
+eMachines athlon64 machines vbetool needed (6) (someone please get me model #s)
+HP NC6000 s3_bios, may not use radeonfb (2); or vbetool (6)
+HP NX7000 ??? (*)
+HP Pavilion ZD7000 vbetool post needed, need open-source nv driver for X
+HP Omnibook XE3 athlon version none (1)
+HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV
+HP Omnibook XE3L-GF vbetool (6)
+HP Omnibook 5150 none (1), (S1 also works OK)
+IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work.
+IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(]
+IBM TP R32 / Type 2658-MMG none (1)
+IBM TP R40 2722B3G ??? (*)
+IBM TP R50p / Type 1832-22U s3_bios (2)
+IBM TP R51 none (1)
+IBM TP T30 236681A ??? (*)
+IBM TP T40 / Type 2373-MU4 none (1)
+IBM TP T40p none (1)
+IBM TP R40p s3_bios (2)
+IBM TP T41p s3_bios (2), switch to X after resume
+IBM TP T42 s3_bios (2)
+IBM ThinkPad T42p (2373-GTG) s3_bios (2)
+IBM TP X20 ??? (*)
+IBM TP X30 s3_bios, s3_mode (4)
+IBM TP X31 / Type 2672-XXH none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
+IBM TP X32 none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results?
+IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4)
+IBM TP 600e none(1), but a switch to console and back to X is needed
+Medion MD4220 ??? (*)
+Samsung P35 vbetool needed (6)
+Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off
+Sony Vaio PCG-C1VRX/K s3_bios (2)
+Sony Vaio PCG-F403 ??? (*)
+Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver
+Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
+Sony Vaio PCG-N505SN ??? (*)
+Sony Vaio vgn-s260 X or boot-radeon can init it (5)
+Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will be blank unless you return to X.
+Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4)
+Toshiba Libretto L5 none (1)
+Toshiba Libretto 100CT/110CT vbetool (6)
+Toshiba Portege 3020CT s3_mode (3)
+Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4090XCDT ??? (*)
+Toshiba Satellite P10-554 s3_bios,s3_mode (4)(****)
+Toshiba M30 (2) xor X with nvidia driver using internal AGP
+Uniwill 244IIO ??? (*)
+
+Known working desktop systems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Mainboard Graphics card hack (or "how to do it")
+------------------------------------------------------------------------------
+Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4)
+
+
+(*) from http://www.ubuntulinux.org/wiki/HoaryPMResults, not sure
+ which options to use. If you know, please tell me.
+
+(***) To be tested with a newer kernel.
+
+(****) Not with SMP kernel, UP only.
diff --git a/Documentation/power/video_extension.txt b/Documentation/power/video_extension.txt
new file mode 100644
index 0000000..b2f9b15
--- /dev/null
+++ b/Documentation/power/video_extension.txt
@@ -0,0 +1,37 @@
+ACPI video extensions
+~~~~~~~~~~~~~~~~~~~~~
+
+This driver implement the ACPI Extensions For Display Adapters for
+integrated graphics devices on motherboard, as specified in ACPI 2.0
+Specification, Appendix B, allowing to perform some basic control like
+defining the video POST device, retrieving EDID information or to
+setup a video output, etc. Note that this is an ref. implementation
+only. It may or may not work for your integrated video device.
+
+Interfaces exposed to userland through /proc/acpi/video:
+
+VGA/info : display the supported video bus device capability like Video ROM, CRT/LCD/TV.
+VGA/ROM : Used to get a copy of the display devices' ROM data (up to 4k).
+VGA/POST_info : Used to determine what options are implemented.
+VGA/POST : Used to get/set POST device.
+VGA/DOS : Used to get/set ownership of output switching:
+ Please refer ACPI spec B.4.1 _DOS
+VGA/CRT : CRT output
+VGA/LCD : LCD output
+VGA/TVO : TV output
+VGA/*/brightness : Used to get/set brightness of output device
+
+Notify event through /proc/acpi/event:
+
+#define ACPI_VIDEO_NOTIFY_SWITCH 0x80
+#define ACPI_VIDEO_NOTIFY_PROBE 0x81
+#define ACPI_VIDEO_NOTIFY_CYCLE 0x82
+#define ACPI_VIDEO_NOTIFY_NEXT_OUTPUT 0x83
+#define ACPI_VIDEO_NOTIFY_PREV_OUTPUT 0x84
+
+#define ACPI_VIDEO_NOTIFY_CYCLE_BRIGHTNESS 0x82
+#define ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS 0x83
+#define ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS 0x84
+#define ACPI_VIDEO_NOTIFY_ZERO_BRIGHTNESS 0x85
+#define ACPI_VIDEO_NOTIFY_DISPLAY_OFF 0x86
+
diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX
new file mode 100644
index 0000000..e3960b8
--- /dev/null
+++ b/Documentation/powerpc/00-INDEX
@@ -0,0 +1,27 @@
+Index of files in Documentation/powerpc. If you think something about
+Linux/PPC needs an entry here, needs correction or you've written one
+please mail me.
+ Cort Dougan (cort@fsmlabs.com)
+
+00-INDEX
+ - this file
+booting-without-of.txt
+ - Booting the Linux/ppc kernel without Open Firmware
+cpu_features.txt
+ - info on how we support a variety of CPUs with minimal compile-time
+ options.
+eeh-pci-error-recovery.txt
+ - info on PCI Bus EEH Error Recovery
+hvcs.txt
+ - IBM "Hypervisor Virtual Console Server" Installation Guide
+mpc52xx.txt
+ - Linux 2.6.x on MPC52xx family
+mpc52xx-device-tree-bindings.txt
+ - MPC5200 Device Tree Bindings
+sound.txt
+ - info on sound support under Linux/PPC
+zImage_layout.txt
+ - info on the kernel images for Linux/PPC
+qe_firmware.txt
+ - describes the layout of firmware binaries for the Freescale QUICC
+ Engine and the code that parses and uploads the microcode therein.
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
new file mode 100644
index 0000000..0ab0230
--- /dev/null
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -0,0 +1,2703 @@
+ Booting the Linux/ppc kernel without Open Firmware
+ --------------------------------------------------
+
+(c) 2005 Benjamin Herrenschmidt <benh at kernel.crashing.org>,
+ IBM Corp.
+(c) 2005 Becky Bruce <becky.bruce at freescale.com>,
+ Freescale Semiconductor, FSL SOC and 32-bit additions
+(c) 2006 MontaVista Software, Inc.
+ Flash chip node definition
+
+Table of Contents
+=================
+
+ I - Introduction
+ 1) Entry point for arch/powerpc
+ 2) Board support
+
+ II - The DT block format
+ 1) Header
+ 2) Device tree generalities
+ 3) Device tree "structure" block
+ 4) Device tree "strings" block
+
+ III - Required content of the device tree
+ 1) Note about cells and address representation
+ 2) Note about "compatible" properties
+ 3) Note about "name" properties
+ 4) Note about node and property names and character set
+ 5) Required nodes and properties
+ a) The root node
+ b) The /cpus node
+ c) The /cpus/* nodes
+ d) the /memory node(s)
+ e) The /chosen node
+ f) the /soc<SOCname> node
+
+ IV - "dtc", the device tree compiler
+
+ V - Recommendations for a bootloader
+
+ VI - System-on-a-chip devices and nodes
+ 1) Defining child nodes of an SOC
+ 2) Representing devices without a current OF specification
+ a) PHY nodes
+ b) Interrupt controllers
+ c) CFI or JEDEC memory-mapped NOR flash
+ d) 4xx/Axon EMAC ethernet nodes
+ e) Xilinx IP cores
+ f) USB EHCI controllers
+ g) MDIO on GPIOs
+ h) SPI busses
+
+ VII - Marvell Discovery mv64[345]6x System Controller chips
+ 1) The /system-controller node
+ 2) Child nodes of /system-controller
+ a) Marvell Discovery MDIO bus
+ b) Marvell Discovery ethernet controller
+ c) Marvell Discovery PHY nodes
+ d) Marvell Discovery SDMA nodes
+ e) Marvell Discovery BRG nodes
+ f) Marvell Discovery CUNIT nodes
+ g) Marvell Discovery MPSCROUTING nodes
+ h) Marvell Discovery MPSCINTR nodes
+ i) Marvell Discovery MPSC nodes
+ j) Marvell Discovery Watch Dog Timer nodes
+ k) Marvell Discovery I2C nodes
+ l) Marvell Discovery PIC (Programmable Interrupt Controller) nodes
+ m) Marvell Discovery MPP (Multipurpose Pins) multiplexing nodes
+ n) Marvell Discovery GPP (General Purpose Pins) nodes
+ o) Marvell Discovery PCI host bridge node
+ p) Marvell Discovery CPU Error nodes
+ q) Marvell Discovery SRAM Controller nodes
+ r) Marvell Discovery PCI Error Handler nodes
+ s) Marvell Discovery Memory Controller nodes
+
+ VIII - Specifying interrupt information for devices
+ 1) interrupts property
+ 2) interrupt-parent property
+ 3) OpenPIC Interrupt Controllers
+ 4) ISA Interrupt Controllers
+
+ IX - Specifying GPIO information for devices
+ 1) gpios property
+ 2) gpio-controller nodes
+
+ X - Specifying device power management information (sleep property)
+
+ Appendix A - Sample SOC node for MPC8540
+
+
+Revision Information
+====================
+
+ May 18, 2005: Rev 0.1 - Initial draft, no chapter III yet.
+
+ May 19, 2005: Rev 0.2 - Add chapter III and bits & pieces here or
+ clarifies the fact that a lot of things are
+ optional, the kernel only requires a very
+ small device tree, though it is encouraged
+ to provide an as complete one as possible.
+
+ May 24, 2005: Rev 0.3 - Precise that DT block has to be in RAM
+ - Misc fixes
+ - Define version 3 and new format version 16
+ for the DT block (version 16 needs kernel
+ patches, will be fwd separately).
+ String block now has a size, and full path
+ is replaced by unit name for more
+ compactness.
+ linux,phandle is made optional, only nodes
+ that are referenced by other nodes need it.
+ "name" property is now automatically
+ deduced from the unit name
+
+ June 1, 2005: Rev 0.4 - Correct confusion between OF_DT_END and
+ OF_DT_END_NODE in structure definition.
+ - Change version 16 format to always align
+ property data to 4 bytes. Since tokens are
+ already aligned, that means no specific
+ required alignment between property size
+ and property data. The old style variable
+ alignment would make it impossible to do
+ "simple" insertion of properties using
+ memmove (thanks Milton for
+ noticing). Updated kernel patch as well
+ - Correct a few more alignment constraints
+ - Add a chapter about the device-tree
+ compiler and the textural representation of
+ the tree that can be "compiled" by dtc.
+
+ November 21, 2005: Rev 0.5
+ - Additions/generalizations for 32-bit
+ - Changed to reflect the new arch/powerpc
+ structure
+ - Added chapter VI
+
+
+ ToDo:
+ - Add some definitions of interrupt tree (simple/complex)
+ - Add some definitions for PCI host bridges
+ - Add some common address format examples
+ - Add definitions for standard properties and "compatible"
+ names for cells that are not already defined by the existing
+ OF spec.
+ - Compare FSL SOC use of PCI to standard and make sure no new
+ node definition required.
+ - Add more information about node definitions for SOC devices
+ that currently have no standard, like the FSL CPM.
+
+
+I - Introduction
+================
+
+During the recent development of the Linux/ppc64 kernel, and more
+specifically, the addition of new platform types outside of the old
+IBM pSeries/iSeries pair, it was decided to enforce some strict rules
+regarding the kernel entry and bootloader <-> kernel interfaces, in
+order to avoid the degeneration that had become the ppc32 kernel entry
+point and the way a new platform should be added to the kernel. The
+legacy iSeries platform breaks those rules as it predates this scheme,
+but no new board support will be accepted in the main tree that
+doesn't follows them properly. In addition, since the advent of the
+arch/powerpc merged architecture for ppc32 and ppc64, new 32-bit
+platforms and 32-bit platforms which move into arch/powerpc will be
+required to use these rules as well.
+
+The main requirement that will be defined in more detail below is
+the presence of a device-tree whose format is defined after Open
+Firmware specification. However, in order to make life easier
+to embedded board vendors, the kernel doesn't require the device-tree
+to represent every device in the system and only requires some nodes
+and properties to be present. This will be described in detail in
+section III, but, for example, the kernel does not require you to
+create a node for every PCI device in the system. It is a requirement
+to have a node for PCI host bridges in order to provide interrupt
+routing informations and memory/IO ranges, among others. It is also
+recommended to define nodes for on chip devices and other busses that
+don't specifically fit in an existing OF specification. This creates a
+great flexibility in the way the kernel can then probe those and match
+drivers to device, without having to hard code all sorts of tables. It
+also makes it more flexible for board vendors to do minor hardware
+upgrades without significantly impacting the kernel code or cluttering
+it with special cases.
+
+
+1) Entry point for arch/powerpc
+-------------------------------
+
+ There is one and one single entry point to the kernel, at the start
+ of the kernel image. That entry point supports two calling
+ conventions:
+
+ a) Boot from Open Firmware. If your firmware is compatible
+ with Open Firmware (IEEE 1275) or provides an OF compatible
+ client interface API (support for "interpret" callback of
+ forth words isn't required), you can enter the kernel with:
+
+ r5 : OF callback pointer as defined by IEEE 1275
+ bindings to powerpc. Only the 32-bit client interface
+ is currently supported
+
+ r3, r4 : address & length of an initrd if any or 0
+
+ The MMU is either on or off; the kernel will run the
+ trampoline located in arch/powerpc/kernel/prom_init.c to
+ extract the device-tree and other information from open
+ firmware and build a flattened device-tree as described
+ in b). prom_init() will then re-enter the kernel using
+ the second method. This trampoline code runs in the
+ context of the firmware, which is supposed to handle all
+ exceptions during that time.
+
+ b) Direct entry with a flattened device-tree block. This entry
+ point is called by a) after the OF trampoline and can also be
+ called directly by a bootloader that does not support the Open
+ Firmware client interface. It is also used by "kexec" to
+ implement "hot" booting of a new kernel from a previous
+ running one. This method is what I will describe in more
+ details in this document, as method a) is simply standard Open
+ Firmware, and thus should be implemented according to the
+ various standard documents defining it and its binding to the
+ PowerPC platform. The entry point definition then becomes:
+
+ r3 : physical pointer to the device-tree block
+ (defined in chapter II) in RAM
+
+ r4 : physical pointer to the kernel itself. This is
+ used by the assembly code to properly disable the MMU
+ in case you are entering the kernel with MMU enabled
+ and a non-1:1 mapping.
+
+ r5 : NULL (as to differentiate with method a)
+
+ Note about SMP entry: Either your firmware puts your other
+ CPUs in some sleep loop or spin loop in ROM where you can get
+ them out via a soft reset or some other means, in which case
+ you don't need to care, or you'll have to enter the kernel
+ with all CPUs. The way to do that with method b) will be
+ described in a later revision of this document.
+
+
+2) Board support
+----------------
+
+64-bit kernels:
+
+ Board supports (platforms) are not exclusive config options. An
+ arbitrary set of board supports can be built in a single kernel
+ image. The kernel will "know" what set of functions to use for a
+ given platform based on the content of the device-tree. Thus, you
+ should:
+
+ a) add your platform support as a _boolean_ option in
+ arch/powerpc/Kconfig, following the example of PPC_PSERIES,
+ PPC_PMAC and PPC_MAPLE. The later is probably a good
+ example of a board support to start from.
+
+ b) create your main platform file as
+ "arch/powerpc/platforms/myplatform/myboard_setup.c" and add it
+ to the Makefile under the condition of your CONFIG_
+ option. This file will define a structure of type "ppc_md"
+ containing the various callbacks that the generic code will
+ use to get to your platform specific code
+
+ c) Add a reference to your "ppc_md" structure in the
+ "machines" table in arch/powerpc/kernel/setup_64.c if you are
+ a 64-bit platform.
+
+ d) request and get assigned a platform number (see PLATFORM_*
+ constants in arch/powerpc/include/asm/processor.h
+
+32-bit embedded kernels:
+
+ Currently, board support is essentially an exclusive config option.
+ The kernel is configured for a single platform. Part of the reason
+ for this is to keep kernels on embedded systems small and efficient;
+ part of this is due to the fact the code is already that way. In the
+ future, a kernel may support multiple platforms, but only if the
+ platforms feature the same core architecture. A single kernel build
+ cannot support both configurations with Book E and configurations
+ with classic Powerpc architectures.
+
+ 32-bit embedded platforms that are moved into arch/powerpc using a
+ flattened device tree should adopt the merged tree practice of
+ setting ppc_md up dynamically, even though the kernel is currently
+ built with support for only a single platform at a time. This allows
+ unification of the setup code, and will make it easier to go to a
+ multiple-platform-support model in the future.
+
+NOTE: I believe the above will be true once Ben's done with the merge
+of the boot sequences.... someone speak up if this is wrong!
+
+ To add a 32-bit embedded platform support, follow the instructions
+ for 64-bit platforms above, with the exception that the Kconfig
+ option should be set up such that the kernel builds exclusively for
+ the platform selected. The processor type for the platform should
+ enable another config option to select the specific board
+ supported.
+
+NOTE: If Ben doesn't merge the setup files, may need to change this to
+point to setup_32.c
+
+
+ I will describe later the boot process and various callbacks that
+ your platform should implement.
+
+
+II - The DT block format
+========================
+
+
+This chapter defines the actual format of the flattened device-tree
+passed to the kernel. The actual content of it and kernel requirements
+are described later. You can find example of code manipulating that
+format in various places, including arch/powerpc/kernel/prom_init.c
+which will generate a flattened device-tree from the Open Firmware
+representation, or the fs2dt utility which is part of the kexec tools
+which will generate one from a filesystem representation. It is
+expected that a bootloader like uboot provides a bit more support,
+that will be discussed later as well.
+
+Note: The block has to be in main memory. It has to be accessible in
+both real mode and virtual mode with no mapping other than main
+memory. If you are writing a simple flash bootloader, it should copy
+the block to RAM before passing it to the kernel.
+
+
+1) Header
+---------
+
+ The kernel is entered with r3 pointing to an area of memory that is
+ roughly described in arch/powerpc/include/asm/prom.h by the structure
+ boot_param_header:
+
+struct boot_param_header {
+ u32 magic; /* magic word OF_DT_HEADER */
+ u32 totalsize; /* total size of DT block */
+ u32 off_dt_struct; /* offset to structure */
+ u32 off_dt_strings; /* offset to strings */
+ u32 off_mem_rsvmap; /* offset to memory reserve map
+ */
+ u32 version; /* format version */
+ u32 last_comp_version; /* last compatible version */
+
+ /* version 2 fields below */
+ u32 boot_cpuid_phys; /* Which physical CPU id we're
+ booting on */
+ /* version 3 fields below */
+ u32 size_dt_strings; /* size of the strings block */
+
+ /* version 17 fields below */
+ u32 size_dt_struct; /* size of the DT structure block */
+};
+
+ Along with the constants:
+
+/* Definitions used by the flattened device tree */
+#define OF_DT_HEADER 0xd00dfeed /* 4: version,
+ 4: total size */
+#define OF_DT_BEGIN_NODE 0x1 /* Start node: full name
+ */
+#define OF_DT_END_NODE 0x2 /* End node */
+#define OF_DT_PROP 0x3 /* Property: name off,
+ size, content */
+#define OF_DT_END 0x9
+
+ All values in this header are in big endian format, the various
+ fields in this header are defined more precisely below. All
+ "offset" values are in bytes from the start of the header; that is
+ from the value of r3.
+
+ - magic
+
+ This is a magic value that "marks" the beginning of the
+ device-tree block header. It contains the value 0xd00dfeed and is
+ defined by the constant OF_DT_HEADER
+
+ - totalsize
+
+ This is the total size of the DT block including the header. The
+ "DT" block should enclose all data structures defined in this
+ chapter (who are pointed to by offsets in this header). That is,
+ the device-tree structure, strings, and the memory reserve map.
+
+ - off_dt_struct
+
+ This is an offset from the beginning of the header to the start
+ of the "structure" part the device tree. (see 2) device tree)
+
+ - off_dt_strings
+
+ This is an offset from the beginning of the header to the start
+ of the "strings" part of the device-tree
+
+ - off_mem_rsvmap
+
+ This is an offset from the beginning of the header to the start
+ of the reserved memory map. This map is a list of pairs of 64-
+ bit integers. Each pair is a physical address and a size. The
+ list is terminated by an entry of size 0. This map provides the
+ kernel with a list of physical memory areas that are "reserved"
+ and thus not to be used for memory allocations, especially during
+ early initialization. The kernel needs to allocate memory during
+ boot for things like un-flattening the device-tree, allocating an
+ MMU hash table, etc... Those allocations must be done in such a
+ way to avoid overriding critical things like, on Open Firmware
+ capable machines, the RTAS instance, or on some pSeries, the TCE
+ tables used for the iommu. Typically, the reserve map should
+ contain _at least_ this DT block itself (header,total_size). If
+ you are passing an initrd to the kernel, you should reserve it as
+ well. You do not need to reserve the kernel image itself. The map
+ should be 64-bit aligned.
+
+ - version
+
+ This is the version of this structure. Version 1 stops
+ here. Version 2 adds an additional field boot_cpuid_phys.
+ Version 3 adds the size of the strings block, allowing the kernel
+ to reallocate it easily at boot and free up the unused flattened
+ structure after expansion. Version 16 introduces a new more
+ "compact" format for the tree itself that is however not backward
+ compatible. Version 17 adds an additional field, size_dt_struct,
+ allowing it to be reallocated or moved more easily (this is
+ particularly useful for bootloaders which need to make
+ adjustments to a device tree based on probed information). You
+ should always generate a structure of the highest version defined
+ at the time of your implementation. Currently that is version 17,
+ unless you explicitly aim at being backward compatible.
+
+ - last_comp_version
+
+ Last compatible version. This indicates down to what version of
+ the DT block you are backward compatible. For example, version 2
+ is backward compatible with version 1 (that is, a kernel build
+ for version 1 will be able to boot with a version 2 format). You
+ should put a 1 in this field if you generate a device tree of
+ version 1 to 3, or 16 if you generate a tree of version 16 or 17
+ using the new unit name format.
+
+ - boot_cpuid_phys
+
+ This field only exist on version 2 headers. It indicate which
+ physical CPU ID is calling the kernel entry point. This is used,
+ among others, by kexec. If you are on an SMP system, this value
+ should match the content of the "reg" property of the CPU node in
+ the device-tree corresponding to the CPU calling the kernel entry
+ point (see further chapters for more informations on the required
+ device-tree contents)
+
+ - size_dt_strings
+
+ This field only exists on version 3 and later headers. It
+ gives the size of the "strings" section of the device tree (which
+ starts at the offset given by off_dt_strings).
+
+ - size_dt_struct
+
+ This field only exists on version 17 and later headers. It gives
+ the size of the "structure" section of the device tree (which
+ starts at the offset given by off_dt_struct).
+
+ So the typical layout of a DT block (though the various parts don't
+ need to be in that order) looks like this (addresses go from top to
+ bottom):
+
+
+ ------------------------------
+ r3 -> | struct boot_param_header |
+ ------------------------------
+ | (alignment gap) (*) |
+ ------------------------------
+ | memory reserve map |
+ ------------------------------
+ | (alignment gap) |
+ ------------------------------
+ | |
+ | device-tree structure |
+ | |
+ ------------------------------
+ | (alignment gap) |
+ ------------------------------
+ | |
+ | device-tree strings |
+ | |
+ -----> ------------------------------
+ |
+ |
+ --- (r3 + totalsize)
+
+ (*) The alignment gaps are not necessarily present; their presence
+ and size are dependent on the various alignment requirements of
+ the individual data blocks.
+
+
+2) Device tree generalities
+---------------------------
+
+This device-tree itself is separated in two different blocks, a
+structure block and a strings block. Both need to be aligned to a 4
+byte boundary.
+
+First, let's quickly describe the device-tree concept before detailing
+the storage format. This chapter does _not_ describe the detail of the
+required types of nodes & properties for the kernel, this is done
+later in chapter III.
+
+The device-tree layout is strongly inherited from the definition of
+the Open Firmware IEEE 1275 device-tree. It's basically a tree of
+nodes, each node having two or more named properties. A property can
+have a value or not.
+
+It is a tree, so each node has one and only one parent except for the
+root node who has no parent.
+
+A node has 2 names. The actual node name is generally contained in a
+property of type "name" in the node property list whose value is a
+zero terminated string and is mandatory for version 1 to 3 of the
+format definition (as it is in Open Firmware). Version 16 makes it
+optional as it can generate it from the unit name defined below.
+
+There is also a "unit name" that is used to differentiate nodes with
+the same name at the same level, it is usually made of the node
+names, the "@" sign, and a "unit address", which definition is
+specific to the bus type the node sits on.
+
+The unit name doesn't exist as a property per-se but is included in
+the device-tree structure. It is typically used to represent "path" in
+the device-tree. More details about the actual format of these will be
+below.
+
+The kernel powerpc generic code does not make any formal use of the
+unit address (though some board support code may do) so the only real
+requirement here for the unit address is to ensure uniqueness of
+the node unit name at a given level of the tree. Nodes with no notion
+of address and no possible sibling of the same name (like /memory or
+/cpus) may omit the unit address in the context of this specification,
+or use the "@0" default unit address. The unit name is used to define
+a node "full path", which is the concatenation of all parent node
+unit names separated with "/".
+
+The root node doesn't have a defined name, and isn't required to have
+a name property either if you are using version 3 or earlier of the
+format. It also has no unit address (no @ symbol followed by a unit
+address). The root node unit name is thus an empty string. The full
+path to the root node is "/".
+
+Every node which actually represents an actual device (that is, a node
+which isn't only a virtual "container" for more nodes, like "/cpus"
+is) is also required to have a "device_type" property indicating the
+type of node .
+
+Finally, every node that can be referenced from a property in another
+node is required to have a "linux,phandle" property. Real open
+firmware implementations provide a unique "phandle" value for every
+node that the "prom_init()" trampoline code turns into
+"linux,phandle" properties. However, this is made optional if the
+flattened device tree is used directly. An example of a node
+referencing another node via "phandle" is when laying out the
+interrupt tree which will be described in a further version of this
+document.
+
+This "linux, phandle" property is a 32-bit value that uniquely
+identifies a node. You are free to use whatever values or system of
+values, internal pointers, or whatever to generate these, the only
+requirement is that every node for which you provide that property has
+a unique value for it.
+
+Here is an example of a simple device-tree. In this example, an "o"
+designates a node followed by the node unit name. Properties are
+presented with their name followed by their content. "content"
+represents an ASCII string (zero terminated) value, while <content>
+represents a 32-bit hexadecimal value. The various nodes in this
+example will be discussed in a later chapter. At this point, it is
+only meant to give you a idea of what a device-tree looks like. I have
+purposefully kept the "name" and "linux,phandle" properties which
+aren't necessary in order to give you a better idea of what the tree
+looks like in practice.
+
+ / o device-tree
+ |- name = "device-tree"
+ |- model = "MyBoardName"
+ |- compatible = "MyBoardFamilyName"
+ |- #address-cells = <2>
+ |- #size-cells = <2>
+ |- linux,phandle = <0>
+ |
+ o cpus
+ | | - name = "cpus"
+ | | - linux,phandle = <1>
+ | | - #address-cells = <1>
+ | | - #size-cells = <0>
+ | |
+ | o PowerPC,970@0
+ | |- name = "PowerPC,970"
+ | |- device_type = "cpu"
+ | |- reg = <0>
+ | |- clock-frequency = <5f5e1000>
+ | |- 64-bit
+ | |- linux,phandle = <2>
+ |
+ o memory@0
+ | |- name = "memory"
+ | |- device_type = "memory"
+ | |- reg = <00000000 00000000 00000000 20000000>
+ | |- linux,phandle = <3>
+ |
+ o chosen
+ |- name = "chosen"
+ |- bootargs = "root=/dev/sda2"
+ |- linux,phandle = <4>
+
+This tree is almost a minimal tree. It pretty much contains the
+minimal set of required nodes and properties to boot a linux kernel;
+that is, some basic model informations at the root, the CPUs, and the
+physical memory layout. It also includes misc information passed
+through /chosen, like in this example, the platform type (mandatory)
+and the kernel command line arguments (optional).
+
+The /cpus/PowerPC,970@0/64-bit property is an example of a
+property without a value. All other properties have a value. The
+significance of the #address-cells and #size-cells properties will be
+explained in chapter IV which defines precisely the required nodes and
+properties and their content.
+
+
+3) Device tree "structure" block
+
+The structure of the device tree is a linearized tree structure. The
+"OF_DT_BEGIN_NODE" token starts a new node, and the "OF_DT_END_NODE"
+ends that node definition. Child nodes are simply defined before
+"OF_DT_END_NODE" (that is nodes within the node). A 'token' is a 32
+bit value. The tree has to be "finished" with a OF_DT_END token
+
+Here's the basic structure of a single node:
+
+ * token OF_DT_BEGIN_NODE (that is 0x00000001)
+ * for version 1 to 3, this is the node full path as a zero
+ terminated string, starting with "/". For version 16 and later,
+ this is the node unit name only (or an empty string for the
+ root node)
+ * [align gap to next 4 bytes boundary]
+ * for each property:
+ * token OF_DT_PROP (that is 0x00000003)
+ * 32-bit value of property value size in bytes (or 0 if no
+ value)
+ * 32-bit value of offset in string block of property name
+ * property value data if any
+ * [align gap to next 4 bytes boundary]
+ * [child nodes if any]
+ * token OF_DT_END_NODE (that is 0x00000002)
+
+So the node content can be summarized as a start token, a full path,
+a list of properties, a list of child nodes, and an end token. Every
+child node is a full node structure itself as defined above.
+
+NOTE: The above definition requires that all property definitions for
+a particular node MUST precede any subnode definitions for that node.
+Although the structure would not be ambiguous if properties and
+subnodes were intermingled, the kernel parser requires that the
+properties come first (up until at least 2.6.22). Any tools
+manipulating a flattened tree must take care to preserve this
+constraint.
+
+4) Device tree "strings" block
+
+In order to save space, property names, which are generally redundant,
+are stored separately in the "strings" block. This block is simply the
+whole bunch of zero terminated strings for all property names
+concatenated together. The device-tree property definitions in the
+structure block will contain offset values from the beginning of the
+strings block.
+
+
+III - Required content of the device tree
+=========================================
+
+WARNING: All "linux,*" properties defined in this document apply only
+to a flattened device-tree. If your platform uses a real
+implementation of Open Firmware or an implementation compatible with
+the Open Firmware client interface, those properties will be created
+by the trampoline code in the kernel's prom_init() file. For example,
+that's where you'll have to add code to detect your board model and
+set the platform number. However, when using the flattened device-tree
+entry point, there is no prom_init() pass, and thus you have to
+provide those properties yourself.
+
+
+1) Note about cells and address representation
+----------------------------------------------
+
+The general rule is documented in the various Open Firmware
+documentations. If you choose to describe a bus with the device-tree
+and there exist an OF bus binding, then you should follow the
+specification. However, the kernel does not require every single
+device or bus to be described by the device tree.
+
+In general, the format of an address for a device is defined by the
+parent bus type, based on the #address-cells and #size-cells
+properties. Note that the parent's parent definitions of #address-cells
+and #size-cells are not inherited so every node with children must specify
+them. The kernel requires the root node to have those properties defining
+addresses format for devices directly mapped on the processor bus.
+
+Those 2 properties define 'cells' for representing an address and a
+size. A "cell" is a 32-bit number. For example, if both contain 2
+like the example tree given above, then an address and a size are both
+composed of 2 cells, and each is a 64-bit number (cells are
+concatenated and expected to be in big endian format). Another example
+is the way Apple firmware defines them, with 2 cells for an address
+and one cell for a size. Most 32-bit implementations should define
+#address-cells and #size-cells to 1, which represents a 32-bit value.
+Some 32-bit processors allow for physical addresses greater than 32
+bits; these processors should define #address-cells as 2.
+
+"reg" properties are always a tuple of the type "address size" where
+the number of cells of address and size is specified by the bus
+#address-cells and #size-cells. When a bus supports various address
+spaces and other flags relative to a given address allocation (like
+prefetchable, etc...) those flags are usually added to the top level
+bits of the physical address. For example, a PCI physical address is
+made of 3 cells, the bottom two containing the actual address itself
+while the top cell contains address space indication, flags, and pci
+bus & device numbers.
+
+For busses that support dynamic allocation, it's the accepted practice
+to then not provide the address in "reg" (keep it 0) though while
+providing a flag indicating the address is dynamically allocated, and
+then, to provide a separate "assigned-addresses" property that
+contains the fully allocated addresses. See the PCI OF bindings for
+details.
+
+In general, a simple bus with no address space bits and no dynamic
+allocation is preferred if it reflects your hardware, as the existing
+kernel address parsing functions will work out of the box. If you
+define a bus type with a more complex address format, including things
+like address space bits, you'll have to add a bus translator to the
+prom_parse.c file of the recent kernels for your bus type.
+
+The "reg" property only defines addresses and sizes (if #size-cells is
+non-0) within a given bus. In order to translate addresses upward
+(that is into parent bus addresses, and possibly into CPU physical
+addresses), all busses must contain a "ranges" property. If the
+"ranges" property is missing at a given level, it's assumed that
+translation isn't possible, i.e., the registers are not visible on the
+parent bus. The format of the "ranges" property for a bus is a list
+of:
+
+ bus address, parent bus address, size
+
+"bus address" is in the format of the bus this bus node is defining,
+that is, for a PCI bridge, it would be a PCI address. Thus, (bus
+address, size) defines a range of addresses for child devices. "parent
+bus address" is in the format of the parent bus of this bus. For
+example, for a PCI host controller, that would be a CPU address. For a
+PCI<->ISA bridge, that would be a PCI address. It defines the base
+address in the parent bus where the beginning of that range is mapped.
+
+For a new 64-bit powerpc board, I recommend either the 2/2 format or
+Apple's 2/1 format which is slightly more compact since sizes usually
+fit in a single 32-bit word. New 32-bit powerpc boards should use a
+1/1 format, unless the processor supports physical addresses greater
+than 32-bits, in which case a 2/1 format is recommended.
+
+Alternatively, the "ranges" property may be empty, indicating that the
+registers are visible on the parent bus using an identity mapping
+translation. In other words, the parent bus address space is the same
+as the child bus address space.
+
+2) Note about "compatible" properties
+-------------------------------------
+
+These properties are optional, but recommended in devices and the root
+node. The format of a "compatible" property is a list of concatenated
+zero terminated strings. They allow a device to express its
+compatibility with a family of similar devices, in some cases,
+allowing a single driver to match against several devices regardless
+of their actual names.
+
+3) Note about "name" properties
+-------------------------------
+
+While earlier users of Open Firmware like OldWorld macintoshes tended
+to use the actual device name for the "name" property, it's nowadays
+considered a good practice to use a name that is closer to the device
+class (often equal to device_type). For example, nowadays, ethernet
+controllers are named "ethernet", an additional "model" property
+defining precisely the chip type/model, and "compatible" property
+defining the family in case a single driver can driver more than one
+of these chips. However, the kernel doesn't generally put any
+restriction on the "name" property; it is simply considered good
+practice to follow the standard and its evolutions as closely as
+possible.
+
+Note also that the new format version 16 makes the "name" property
+optional. If it's absent for a node, then the node's unit name is then
+used to reconstruct the name. That is, the part of the unit name
+before the "@" sign is used (or the entire unit name if no "@" sign
+is present).
+
+4) Note about node and property names and character set
+-------------------------------------------------------
+
+While open firmware provides more flexible usage of 8859-1, this
+specification enforces more strict rules. Nodes and properties should
+be comprised only of ASCII characters 'a' to 'z', '0' to
+'9', ',', '.', '_', '+', '#', '?', and '-'. Node names additionally
+allow uppercase characters 'A' to 'Z' (property names should be
+lowercase. The fact that vendors like Apple don't respect this rule is
+irrelevant here). Additionally, node and property names should always
+begin with a character in the range 'a' to 'z' (or 'A' to 'Z' for node
+names).
+
+The maximum number of characters for both nodes and property names
+is 31. In the case of node names, this is only the leftmost part of
+a unit name (the pure "name" property), it doesn't include the unit
+address which can extend beyond that limit.
+
+
+5) Required nodes and properties
+--------------------------------
+ These are all that are currently required. However, it is strongly
+ recommended that you expose PCI host bridges as documented in the
+ PCI binding to open firmware, and your interrupt tree as documented
+ in OF interrupt tree specification.
+
+ a) The root node
+
+ The root node requires some properties to be present:
+
+ - model : this is your board name/model
+ - #address-cells : address representation for "root" devices
+ - #size-cells: the size representation for "root" devices
+ - device_type : This property shouldn't be necessary. However, if
+ you decide to create a device_type for your root node, make sure it
+ is _not_ "chrp" unless your platform is a pSeries or PAPR compliant
+ one for 64-bit, or a CHRP-type machine for 32-bit as this will
+ matched by the kernel this way.
+
+ Additionally, some recommended properties are:
+
+ - compatible : the board "family" generally finds its way here,
+ for example, if you have 2 board models with a similar layout,
+ that typically get driven by the same platform code in the
+ kernel, you would use a different "model" property but put a
+ value in "compatible". The kernel doesn't directly use that
+ value but it is generally useful.
+
+ The root node is also generally where you add additional properties
+ specific to your board like the serial number if any, that sort of
+ thing. It is recommended that if you add any "custom" property whose
+ name may clash with standard defined ones, you prefix them with your
+ vendor name and a comma.
+
+ b) The /cpus node
+
+ This node is the parent of all individual CPU nodes. It doesn't
+ have any specific requirements, though it's generally good practice
+ to have at least:
+
+ #address-cells = <00000001>
+ #size-cells = <00000000>
+
+ This defines that the "address" for a CPU is a single cell, and has
+ no meaningful size. This is not necessary but the kernel will assume
+ that format when reading the "reg" properties of a CPU node, see
+ below
+
+ c) The /cpus/* nodes
+
+ So under /cpus, you are supposed to create a node for every CPU on
+ the machine. There is no specific restriction on the name of the
+ CPU, though It's common practice to call it PowerPC,<name>. For
+ example, Apple uses PowerPC,G5 while IBM uses PowerPC,970FX.
+
+ Required properties:
+
+ - device_type : has to be "cpu"
+ - reg : This is the physical CPU number, it's a single 32-bit cell
+ and is also used as-is as the unit number for constructing the
+ unit name in the full path. For example, with 2 CPUs, you would
+ have the full path:
+ /cpus/PowerPC,970FX@0
+ /cpus/PowerPC,970FX@1
+ (unit addresses do not require leading zeroes)
+ - d-cache-block-size : one cell, L1 data cache block size in bytes (*)
+ - i-cache-block-size : one cell, L1 instruction cache block size in
+ bytes
+ - d-cache-size : one cell, size of L1 data cache in bytes
+ - i-cache-size : one cell, size of L1 instruction cache in bytes
+
+(*) The cache "block" size is the size on which the cache management
+instructions operate. Historically, this document used the cache
+"line" size here which is incorrect. The kernel will prefer the cache
+block size and will fallback to cache line size for backward
+compatibility.
+
+ Recommended properties:
+
+ - timebase-frequency : a cell indicating the frequency of the
+ timebase in Hz. This is not directly used by the generic code,
+ but you are welcome to copy/paste the pSeries code for setting
+ the kernel timebase/decrementer calibration based on this
+ value.
+ - clock-frequency : a cell indicating the CPU core clock frequency
+ in Hz. A new property will be defined for 64-bit values, but if
+ your frequency is < 4Ghz, one cell is enough. Here as well as
+ for the above, the common code doesn't use that property, but
+ you are welcome to re-use the pSeries or Maple one. A future
+ kernel version might provide a common function for this.
+ - d-cache-line-size : one cell, L1 data cache line size in bytes
+ if different from the block size
+ - i-cache-line-size : one cell, L1 instruction cache line size in
+ bytes if different from the block size
+
+ You are welcome to add any property you find relevant to your board,
+ like some information about the mechanism used to soft-reset the
+ CPUs. For example, Apple puts the GPIO number for CPU soft reset
+ lines in there as a "soft-reset" property since they start secondary
+ CPUs by soft-resetting them.
+
+
+ d) the /memory node(s)
+
+ To define the physical memory layout of your board, you should
+ create one or more memory node(s). You can either create a single
+ node with all memory ranges in its reg property, or you can create
+ several nodes, as you wish. The unit address (@ part) used for the
+ full path is the address of the first range of memory defined by a
+ given node. If you use a single memory node, this will typically be
+ @0.
+
+ Required properties:
+
+ - device_type : has to be "memory"
+ - reg : This property contains all the physical memory ranges of
+ your board. It's a list of addresses/sizes concatenated
+ together, with the number of cells of each defined by the
+ #address-cells and #size-cells of the root node. For example,
+ with both of these properties being 2 like in the example given
+ earlier, a 970 based machine with 6Gb of RAM could typically
+ have a "reg" property here that looks like:
+
+ 00000000 00000000 00000000 80000000
+ 00000001 00000000 00000001 00000000
+
+ That is a range starting at 0 of 0x80000000 bytes and a range
+ starting at 0x100000000 and of 0x100000000 bytes. You can see
+ that there is no memory covering the IO hole between 2Gb and
+ 4Gb. Some vendors prefer splitting those ranges into smaller
+ segments, but the kernel doesn't care.
+
+ e) The /chosen node
+
+ This node is a bit "special". Normally, that's where open firmware
+ puts some variable environment information, like the arguments, or
+ the default input/output devices.
+
+ This specification makes a few of these mandatory, but also defines
+ some linux-specific properties that would be normally constructed by
+ the prom_init() trampoline when booting with an OF client interface,
+ but that you have to provide yourself when using the flattened format.
+
+ Recommended properties:
+
+ - bootargs : This zero-terminated string is passed as the kernel
+ command line
+ - linux,stdout-path : This is the full path to your standard
+ console device if any. Typically, if you have serial devices on
+ your board, you may want to put the full path to the one set as
+ the default console in the firmware here, for the kernel to pick
+ it up as its own default console. If you look at the function
+ set_preferred_console() in arch/ppc64/kernel/setup.c, you'll see
+ that the kernel tries to find out the default console and has
+ knowledge of various types like 8250 serial ports. You may want
+ to extend this function to add your own.
+
+ Note that u-boot creates and fills in the chosen node for platforms
+ that use it.
+
+ (Note: a practice that is now obsolete was to include a property
+ under /chosen called interrupt-controller which had a phandle value
+ that pointed to the main interrupt controller)
+
+ f) the /soc<SOCname> node
+
+ This node is used to represent a system-on-a-chip (SOC) and must be
+ present if the processor is a SOC. The top-level soc node contains
+ information that is global to all devices on the SOC. The node name
+ should contain a unit address for the SOC, which is the base address
+ of the memory-mapped register set for the SOC. The name of an soc
+ node should start with "soc", and the remainder of the name should
+ represent the part number for the soc. For example, the MPC8540's
+ soc node would be called "soc8540".
+
+ Required properties:
+
+ - device_type : Should be "soc"
+ - ranges : Should be defined as specified in 1) to describe the
+ translation of SOC addresses for memory mapped SOC registers.
+ - bus-frequency: Contains the bus frequency for the SOC node.
+ Typically, the value of this field is filled in by the boot
+ loader.
+
+
+ Recommended properties:
+
+ - reg : This property defines the address and size of the
+ memory-mapped registers that are used for the SOC node itself.
+ It does not include the child device registers - these will be
+ defined inside each child node. The address specified in the
+ "reg" property should match the unit address of the SOC node.
+ - #address-cells : Address representation for "soc" devices. The
+ format of this field may vary depending on whether or not the
+ device registers are memory mapped. For memory mapped
+ registers, this field represents the number of cells needed to
+ represent the address of the registers. For SOCs that do not
+ use MMIO, a special address format should be defined that
+ contains enough cells to represent the required information.
+ See 1) above for more details on defining #address-cells.
+ - #size-cells : Size representation for "soc" devices
+ - #interrupt-cells : Defines the width of cells used to represent
+ interrupts. Typically this value is <2>, which includes a
+ 32-bit number that represents the interrupt number, and a
+ 32-bit number that represents the interrupt sense and level.
+ This field is only needed if the SOC contains an interrupt
+ controller.
+
+ The SOC node may contain child nodes for each SOC device that the
+ platform uses. Nodes should not be created for devices which exist
+ on the SOC but are not used by a particular platform. See chapter VI
+ for more information on how to specify devices that are part of a SOC.
+
+ Example SOC node for the MPC8540:
+
+ soc8540@e0000000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ #interrupt-cells = <2>;
+ device_type = "soc";
+ ranges = <00000000 e0000000 00100000>
+ reg = <e0000000 00003000>;
+ bus-frequency = <0>;
+ }
+
+
+
+IV - "dtc", the device tree compiler
+====================================
+
+
+dtc source code can be found at
+<http://ozlabs.org/~dgibson/dtc/dtc.tar.gz>
+
+WARNING: This version is still in early development stage; the
+resulting device-tree "blobs" have not yet been validated with the
+kernel. The current generated bloc lacks a useful reserve map (it will
+be fixed to generate an empty one, it's up to the bootloader to fill
+it up) among others. The error handling needs work, bugs are lurking,
+etc...
+
+dtc basically takes a device-tree in a given format and outputs a
+device-tree in another format. The currently supported formats are:
+
+ Input formats:
+ -------------
+
+ - "dtb": "blob" format, that is a flattened device-tree block
+ with
+ header all in a binary blob.
+ - "dts": "source" format. This is a text file containing a
+ "source" for a device-tree. The format is defined later in this
+ chapter.
+ - "fs" format. This is a representation equivalent to the
+ output of /proc/device-tree, that is nodes are directories and
+ properties are files
+
+ Output formats:
+ ---------------
+
+ - "dtb": "blob" format
+ - "dts": "source" format
+ - "asm": assembly language file. This is a file that can be
+ sourced by gas to generate a device-tree "blob". That file can
+ then simply be added to your Makefile. Additionally, the
+ assembly file exports some symbols that can be used.
+
+
+The syntax of the dtc tool is
+
+ dtc [-I <input-format>] [-O <output-format>]
+ [-o output-filename] [-V output_version] input_filename
+
+
+The "output_version" defines what version of the "blob" format will be
+generated. Supported versions are 1,2,3 and 16. The default is
+currently version 3 but that may change in the future to version 16.
+
+Additionally, dtc performs various sanity checks on the tree, like the
+uniqueness of linux, phandle properties, validity of strings, etc...
+
+The format of the .dts "source" file is "C" like, supports C and C++
+style comments.
+
+/ {
+}
+
+The above is the "device-tree" definition. It's the only statement
+supported currently at the toplevel.
+
+/ {
+ property1 = "string_value"; /* define a property containing a 0
+ * terminated string
+ */
+
+ property2 = <1234abcd>; /* define a property containing a
+ * numerical 32-bit value (hexadecimal)
+ */
+
+ property3 = <12345678 12345678 deadbeef>;
+ /* define a property containing 3
+ * numerical 32-bit values (cells) in
+ * hexadecimal
+ */
+ property4 = [0a 0b 0c 0d de ea ad be ef];
+ /* define a property whose content is
+ * an arbitrary array of bytes
+ */
+
+ childnode@addresss { /* define a child node named "childnode"
+ * whose unit name is "childnode at
+ * address"
+ */
+
+ childprop = "hello\n"; /* define a property "childprop" of
+ * childnode (in this case, a string)
+ */
+ };
+};
+
+Nodes can contain other nodes etc... thus defining the hierarchical
+structure of the tree.
+
+Strings support common escape sequences from C: "\n", "\t", "\r",
+"\(octal value)", "\x(hex value)".
+
+It is also suggested that you pipe your source file through cpp (gcc
+preprocessor) so you can use #include's, #define for constants, etc...
+
+Finally, various options are planned but not yet implemented, like
+automatic generation of phandles, labels (exported to the asm file so
+you can point to a property content and change it easily from whatever
+you link the device-tree with), label or path instead of numeric value
+in some cells to "point" to a node (replaced by a phandle at compile
+time), export of reserve map address to the asm file, ability to
+specify reserve map content at compile time, etc...
+
+We may provide a .h include file with common definitions of that
+proves useful for some properties (like building PCI properties or
+interrupt maps) though it may be better to add a notion of struct
+definitions to the compiler...
+
+
+V - Recommendations for a bootloader
+====================================
+
+
+Here are some various ideas/recommendations that have been proposed
+while all this has been defined and implemented.
+
+ - The bootloader may want to be able to use the device-tree itself
+ and may want to manipulate it (to add/edit some properties,
+ like physical memory size or kernel arguments). At this point, 2
+ choices can be made. Either the bootloader works directly on the
+ flattened format, or the bootloader has its own internal tree
+ representation with pointers (similar to the kernel one) and
+ re-flattens the tree when booting the kernel. The former is a bit
+ more difficult to edit/modify, the later requires probably a bit
+ more code to handle the tree structure. Note that the structure
+ format has been designed so it's relatively easy to "insert"
+ properties or nodes or delete them by just memmoving things
+ around. It contains no internal offsets or pointers for this
+ purpose.
+
+ - An example of code for iterating nodes & retrieving properties
+ directly from the flattened tree format can be found in the kernel
+ file arch/ppc64/kernel/prom.c, look at scan_flat_dt() function,
+ its usage in early_init_devtree(), and the corresponding various
+ early_init_dt_scan_*() callbacks. That code can be re-used in a
+ GPL bootloader, and as the author of that code, I would be happy
+ to discuss possible free licensing to any vendor who wishes to
+ integrate all or part of this code into a non-GPL bootloader.
+
+
+
+VI - System-on-a-chip devices and nodes
+=======================================
+
+Many companies are now starting to develop system-on-a-chip
+processors, where the processor core (CPU) and many peripheral devices
+exist on a single piece of silicon. For these SOCs, an SOC node
+should be used that defines child nodes for the devices that make
+up the SOC. While platforms are not required to use this model in
+order to boot the kernel, it is highly encouraged that all SOC
+implementations define as complete a flat-device-tree as possible to
+describe the devices on the SOC. This will allow for the
+genericization of much of the kernel code.
+
+
+1) Defining child nodes of an SOC
+---------------------------------
+
+Each device that is part of an SOC may have its own node entry inside
+the SOC node. For each device that is included in the SOC, the unit
+address property represents the address offset for this device's
+memory-mapped registers in the parent's address space. The parent's
+address space is defined by the "ranges" property in the top-level soc
+node. The "reg" property for each node that exists directly under the
+SOC node should contain the address mapping from the child address space
+to the parent SOC address space and the size of the device's
+memory-mapped register file.
+
+For many devices that may exist inside an SOC, there are predefined
+specifications for the format of the device tree node. All SOC child
+nodes should follow these specifications, except where noted in this
+document.
+
+See appendix A for an example partial SOC node definition for the
+MPC8540.
+
+
+2) Representing devices without a current OF specification
+----------------------------------------------------------
+
+Currently, there are many devices on SOCs that do not have a standard
+representation pre-defined as part of the open firmware
+specifications, mainly because the boards that contain these SOCs are
+not currently booted using open firmware. This section contains
+descriptions for the SOC devices for which new nodes have been
+defined; this list will expand as more and more SOC-containing
+platforms are moved over to use the flattened-device-tree model.
+
+ a) PHY nodes
+
+ Required properties:
+
+ - device_type : Should be "ethernet-phy"
+ - interrupts : <a b> where a is the interrupt number and b is a
+ field that represents an encoding of the sense and level
+ information for the interrupt. This should be encoded based on
+ the information in section 2) depending on the type of interrupt
+ controller you have.
+ - interrupt-parent : the phandle for the interrupt controller that
+ services interrupts for this device.
+ - reg : The ID number for the phy, usually a small integer
+ - linux,phandle : phandle for this node; likely referenced by an
+ ethernet controller node.
+
+
+ Example:
+
+ ethernet-phy@0 {
+ linux,phandle = <2452000>
+ interrupt-parent = <40000>;
+ interrupts = <35 1>;
+ reg = <0>;
+ device_type = "ethernet-phy";
+ };
+
+
+ b) Interrupt controllers
+
+ Some SOC devices contain interrupt controllers that are different
+ from the standard Open PIC specification. The SOC device nodes for
+ these types of controllers should be specified just like a standard
+ OpenPIC controller. Sense and level information should be encoded
+ as specified in section 2) of this chapter for each device that
+ specifies an interrupt.
+
+ Example :
+
+ pic@40000 {
+ linux,phandle = <40000>;
+ interrupt-controller;
+ #address-cells = <0>;
+ reg = <40000 40000>;
+ compatible = "chrp,open-pic";
+ device_type = "open-pic";
+ };
+
+ c) CFI or JEDEC memory-mapped NOR flash
+
+ Flash chips (Memory Technology Devices) are often used for solid state
+ file systems on embedded devices.
+
+ - compatible : should contain the specific model of flash chip(s)
+ used, if known, followed by either "cfi-flash" or "jedec-flash"
+ - reg : Address range of the flash chip
+ - bank-width : Width (in bytes) of the flash bank. Equal to the
+ device width times the number of interleaved chips.
+ - device-width : (optional) Width of a single flash chip. If
+ omitted, assumed to be equal to 'bank-width'.
+ - #address-cells, #size-cells : Must be present if the flash has
+ sub-nodes representing partitions (see below). In this case
+ both #address-cells and #size-cells must be equal to 1.
+
+ For JEDEC compatible devices, the following additional properties
+ are defined:
+
+ - vendor-id : Contains the flash chip's vendor id (1 byte).
+ - device-id : Contains the flash chip's device id (1 byte).
+
+ In addition to the information on the flash bank itself, the
+ device tree may optionally contain additional information
+ describing partitions of the flash address space. This can be
+ used on platforms which have strong conventions about which
+ portions of the flash are used for what purposes, but which don't
+ use an on-flash partition table such as RedBoot.
+
+ Each partition is represented as a sub-node of the flash device.
+ Each node's name represents the name of the corresponding
+ partition of the flash device.
+
+ Flash partitions
+ - reg : The partition's offset and size within the flash bank.
+ - label : (optional) The label / name for this flash partition.
+ If omitted, the label is taken from the node name (excluding
+ the unit address).
+ - read-only : (optional) This parameter, if present, is a hint to
+ Linux that this flash partition should only be mounted
+ read-only. This is usually used for flash partitions
+ containing early-boot firmware images or data which should not
+ be clobbered.
+
+ Example:
+
+ flash@ff000000 {
+ compatible = "amd,am29lv128ml", "cfi-flash";
+ reg = <ff000000 01000000>;
+ bank-width = <4>;
+ device-width = <1>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+ fs@0 {
+ label = "fs";
+ reg = <0 f80000>;
+ };
+ firmware@f80000 {
+ label ="firmware";
+ reg = <f80000 80000>;
+ read-only;
+ };
+ };
+
+ d) 4xx/Axon EMAC ethernet nodes
+
+ The EMAC ethernet controller in IBM and AMCC 4xx chips, and also
+ the Axon bridge. To operate this needs to interact with a ths
+ special McMAL DMA controller, and sometimes an RGMII or ZMII
+ interface. In addition to the nodes and properties described
+ below, the node for the OPB bus on which the EMAC sits must have a
+ correct clock-frequency property.
+
+ i) The EMAC node itself
+
+ Required properties:
+ - device_type : "network"
+
+ - compatible : compatible list, contains 2 entries, first is
+ "ibm,emac-CHIP" where CHIP is the host ASIC (440gx,
+ 405gp, Axon) and second is either "ibm,emac" or
+ "ibm,emac4". For Axon, thus, we have: "ibm,emac-axon",
+ "ibm,emac4"
+ - interrupts : <interrupt mapping for EMAC IRQ and WOL IRQ>
+ - interrupt-parent : optional, if needed for interrupt mapping
+ - reg : <registers mapping>
+ - local-mac-address : 6 bytes, MAC address
+ - mal-device : phandle of the associated McMAL node
+ - mal-tx-channel : 1 cell, index of the tx channel on McMAL associated
+ with this EMAC
+ - mal-rx-channel : 1 cell, index of the rx channel on McMAL associated
+ with this EMAC
+ - cell-index : 1 cell, hardware index of the EMAC cell on a given
+ ASIC (typically 0x0 and 0x1 for EMAC0 and EMAC1 on
+ each Axon chip)
+ - max-frame-size : 1 cell, maximum frame size supported in bytes
+ - rx-fifo-size : 1 cell, Rx fifo size in bytes for 10 and 100 Mb/sec
+ operations.
+ For Axon, 2048
+ - tx-fifo-size : 1 cell, Tx fifo size in bytes for 10 and 100 Mb/sec
+ operations.
+ For Axon, 2048.
+ - fifo-entry-size : 1 cell, size of a fifo entry (used to calculate
+ thresholds).
+ For Axon, 0x00000010
+ - mal-burst-size : 1 cell, MAL burst size (used to calculate thresholds)
+ in bytes.
+ For Axon, 0x00000100 (I think ...)
+ - phy-mode : string, mode of operations of the PHY interface.
+ Supported values are: "mii", "rmii", "smii", "rgmii",
+ "tbi", "gmii", rtbi", "sgmii".
+ For Axon on CAB, it is "rgmii"
+ - mdio-device : 1 cell, required iff using shared MDIO registers
+ (440EP). phandle of the EMAC to use to drive the
+ MDIO lines for the PHY used by this EMAC.
+ - zmii-device : 1 cell, required iff connected to a ZMII. phandle of
+ the ZMII device node
+ - zmii-channel : 1 cell, required iff connected to a ZMII. Which ZMII
+ channel or 0xffffffff if ZMII is only used for MDIO.
+ - rgmii-device : 1 cell, required iff connected to an RGMII. phandle
+ of the RGMII device node.
+ For Axon: phandle of plb5/plb4/opb/rgmii
+ - rgmii-channel : 1 cell, required iff connected to an RGMII. Which
+ RGMII channel is used by this EMAC.
+ Fox Axon: present, whatever value is appropriate for each
+ EMAC, that is the content of the current (bogus) "phy-port"
+ property.
+
+ Optional properties:
+ - phy-address : 1 cell, optional, MDIO address of the PHY. If absent,
+ a search is performed.
+ - phy-map : 1 cell, optional, bitmap of addresses to probe the PHY
+ for, used if phy-address is absent. bit 0x00000001 is
+ MDIO address 0.
+ For Axon it can be absent, thouugh my current driver
+ doesn't handle phy-address yet so for now, keep
+ 0x00ffffff in it.
+ - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec
+ operations (if absent the value is the same as
+ rx-fifo-size). For Axon, either absent or 2048.
+ - tx-fifo-size-gige : 1 cell, Tx fifo size in bytes for 1000 Mb/sec
+ operations (if absent the value is the same as
+ tx-fifo-size). For Axon, either absent or 2048.
+ - tah-device : 1 cell, optional. If connected to a TAH engine for
+ offload, phandle of the TAH device node.
+ - tah-channel : 1 cell, optional. If appropriate, channel used on the
+ TAH engine.
+
+ Example:
+
+ EMAC0: ethernet@40000800 {
+ device_type = "network";
+ compatible = "ibm,emac-440gp", "ibm,emac";
+ interrupt-parent = <&UIC1>;
+ interrupts = <1c 4 1d 4>;
+ reg = <40000800 70>;
+ local-mac-address = [00 04 AC E3 1B 1E];
+ mal-device = <&MAL0>;
+ mal-tx-channel = <0 1>;
+ mal-rx-channel = <0>;
+ cell-index = <0>;
+ max-frame-size = <5dc>;
+ rx-fifo-size = <1000>;
+ tx-fifo-size = <800>;
+ phy-mode = "rmii";
+ phy-map = <00000001>;
+ zmii-device = <&ZMII0>;
+ zmii-channel = <0>;
+ };
+
+ ii) McMAL node
+
+ Required properties:
+ - device_type : "dma-controller"
+ - compatible : compatible list, containing 2 entries, first is
+ "ibm,mcmal-CHIP" where CHIP is the host ASIC (like
+ emac) and the second is either "ibm,mcmal" or
+ "ibm,mcmal2".
+ For Axon, "ibm,mcmal-axon","ibm,mcmal2"
+ - interrupts : <interrupt mapping for the MAL interrupts sources:
+ 5 sources: tx_eob, rx_eob, serr, txde, rxde>.
+ For Axon: This is _different_ from the current
+ firmware. We use the "delayed" interrupts for txeob
+ and rxeob. Thus we end up with mapping those 5 MPIC
+ interrupts, all level positive sensitive: 10, 11, 32,
+ 33, 34 (in decimal)
+ - dcr-reg : < DCR registers range >
+ - dcr-parent : if needed for dcr-reg
+ - num-tx-chans : 1 cell, number of Tx channels
+ - num-rx-chans : 1 cell, number of Rx channels
+
+ iii) ZMII node
+
+ Required properties:
+ - compatible : compatible list, containing 2 entries, first is
+ "ibm,zmii-CHIP" where CHIP is the host ASIC (like
+ EMAC) and the second is "ibm,zmii".
+ For Axon, there is no ZMII node.
+ - reg : <registers mapping>
+
+ iv) RGMII node
+
+ Required properties:
+ - compatible : compatible list, containing 2 entries, first is
+ "ibm,rgmii-CHIP" where CHIP is the host ASIC (like
+ EMAC) and the second is "ibm,rgmii".
+ For Axon, "ibm,rgmii-axon","ibm,rgmii"
+ - reg : <registers mapping>
+ - revision : as provided by the RGMII new version register if
+ available.
+ For Axon: 0x0000012a
+
+ e) Xilinx IP cores
+
+ The Xilinx EDK toolchain ships with a set of IP cores (devices) for use
+ in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range
+ of standard device types (network, serial, etc.) and miscellanious
+ devices (gpio, LCD, spi, etc). Also, since these devices are
+ implemented within the fpga fabric every instance of the device can be
+ synthesised with different options that change the behaviour.
+
+ Each IP-core has a set of parameters which the FPGA designer can use to
+ control how the core is synthesized. Historically, the EDK tool would
+ extract the device parameters relevant to device drivers and copy them
+ into an 'xparameters.h' in the form of #define symbols. This tells the
+ device drivers how the IP cores are configured, but it requres the kernel
+ to be recompiled every time the FPGA bitstream is resynthesized.
+
+ The new approach is to export the parameters into the device tree and
+ generate a new device tree each time the FPGA bitstream changes. The
+ parameters which used to be exported as #defines will now become
+ properties of the device node. In general, device nodes for IP-cores
+ will take the following form:
+
+ (name): (generic-name)@(base-address) {
+ compatible = "xlnx,(ip-core-name)-(HW_VER)"
+ [, (list of compatible devices), ...];
+ reg = <(baseaddr) (size)>;
+ interrupt-parent = <&interrupt-controller-phandle>;
+ interrupts = < ... >;
+ xlnx,(parameter1) = "(string-value)";
+ xlnx,(parameter2) = <(int-value)>;
+ };
+
+ (generic-name): an open firmware-style name that describes the
+ generic class of device. Preferably, this is one word, such
+ as 'serial' or 'ethernet'.
+ (ip-core-name): the name of the ip block (given after the BEGIN
+ directive in system.mhs). Should be in lowercase
+ and all underscores '_' converted to dashes '-'.
+ (name): is derived from the "PARAMETER INSTANCE" value.
+ (parameter#): C_* parameters from system.mhs. The C_ prefix is
+ dropped from the parameter name, the name is converted
+ to lowercase and all underscore '_' characters are
+ converted to dashes '-'.
+ (baseaddr): the baseaddr parameter value (often named C_BASEADDR).
+ (HW_VER): from the HW_VER parameter.
+ (size): the address range size (often C_HIGHADDR - C_BASEADDR + 1).
+
+ Typically, the compatible list will include the exact IP core version
+ followed by an older IP core version which implements the same
+ interface or any other device with the same interface.
+
+ 'reg', 'interrupt-parent' and 'interrupts' are all optional properties.
+
+ For example, the following block from system.mhs:
+
+ BEGIN opb_uartlite
+ PARAMETER INSTANCE = opb_uartlite_0
+ PARAMETER HW_VER = 1.00.b
+ PARAMETER C_BAUDRATE = 115200
+ PARAMETER C_DATA_BITS = 8
+ PARAMETER C_ODD_PARITY = 0
+ PARAMETER C_USE_PARITY = 0
+ PARAMETER C_CLK_FREQ = 50000000
+ PARAMETER C_BASEADDR = 0xEC100000
+ PARAMETER C_HIGHADDR = 0xEC10FFFF
+ BUS_INTERFACE SOPB = opb_7
+ PORT OPB_Clk = CLK_50MHz
+ PORT Interrupt = opb_uartlite_0_Interrupt
+ PORT RX = opb_uartlite_0_RX
+ PORT TX = opb_uartlite_0_TX
+ PORT OPB_Rst = sys_bus_reset_0
+ END
+
+ becomes the following device tree node:
+
+ opb_uartlite_0: serial@ec100000 {
+ device_type = "serial";
+ compatible = "xlnx,opb-uartlite-1.00.b";
+ reg = <ec100000 10000>;
+ interrupt-parent = <&opb_intc_0>;
+ interrupts = <1 0>; // got this from the opb_intc parameters
+ current-speed = <d#115200>; // standard serial device prop
+ clock-frequency = <d#50000000>; // standard serial device prop
+ xlnx,data-bits = <8>;
+ xlnx,odd-parity = <0>;
+ xlnx,use-parity = <0>;
+ };
+
+ Some IP cores actually implement 2 or more logical devices. In
+ this case, the device should still describe the whole IP core with
+ a single node and add a child node for each logical device. The
+ ranges property can be used to translate from parent IP-core to the
+ registers of each device. In addition, the parent node should be
+ compatible with the bus type 'xlnx,compound', and should contain
+ #address-cells and #size-cells, as with any other bus. (Note: this
+ makes the assumption that both logical devices have the same bus
+ binding. If this is not true, then separate nodes should be used
+ for each logical device). The 'cell-index' property can be used to
+ enumerate logical devices within an IP core. For example, the
+ following is the system.mhs entry for the dual ps2 controller found
+ on the ml403 reference design.
+
+ BEGIN opb_ps2_dual_ref
+ PARAMETER INSTANCE = opb_ps2_dual_ref_0
+ PARAMETER HW_VER = 1.00.a
+ PARAMETER C_BASEADDR = 0xA9000000
+ PARAMETER C_HIGHADDR = 0xA9001FFF
+ BUS_INTERFACE SOPB = opb_v20_0
+ PORT Sys_Intr1 = ps2_1_intr
+ PORT Sys_Intr2 = ps2_2_intr
+ PORT Clkin1 = ps2_clk_rx_1
+ PORT Clkin2 = ps2_clk_rx_2
+ PORT Clkpd1 = ps2_clk_tx_1
+ PORT Clkpd2 = ps2_clk_tx_2
+ PORT Rx1 = ps2_d_rx_1
+ PORT Rx2 = ps2_d_rx_2
+ PORT Txpd1 = ps2_d_tx_1
+ PORT Txpd2 = ps2_d_tx_2
+ END
+
+ It would result in the following device tree nodes:
+
+ opb_ps2_dual_ref_0: opb-ps2-dual-ref@a9000000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "xlnx,compound";
+ ranges = <0 a9000000 2000>;
+ // If this device had extra parameters, then they would
+ // go here.
+ ps2@0 {
+ compatible = "xlnx,opb-ps2-dual-ref-1.00.a";
+ reg = <0 40>;
+ interrupt-parent = <&opb_intc_0>;
+ interrupts = <3 0>;
+ cell-index = <0>;
+ };
+ ps2@1000 {
+ compatible = "xlnx,opb-ps2-dual-ref-1.00.a";
+ reg = <1000 40>;
+ interrupt-parent = <&opb_intc_0>;
+ interrupts = <3 0>;
+ cell-index = <0>;
+ };
+ };
+
+ Also, the system.mhs file defines bus attachments from the processor
+ to the devices. The device tree structure should reflect the bus
+ attachments. Again an example; this system.mhs fragment:
+
+ BEGIN ppc405_virtex4
+ PARAMETER INSTANCE = ppc405_0
+ PARAMETER HW_VER = 1.01.a
+ BUS_INTERFACE DPLB = plb_v34_0
+ BUS_INTERFACE IPLB = plb_v34_0
+ END
+
+ BEGIN opb_intc
+ PARAMETER INSTANCE = opb_intc_0
+ PARAMETER HW_VER = 1.00.c
+ PARAMETER C_BASEADDR = 0xD1000FC0
+ PARAMETER C_HIGHADDR = 0xD1000FDF
+ BUS_INTERFACE SOPB = opb_v20_0
+ END
+
+ BEGIN opb_uart16550
+ PARAMETER INSTANCE = opb_uart16550_0
+ PARAMETER HW_VER = 1.00.d
+ PARAMETER C_BASEADDR = 0xa0000000
+ PARAMETER C_HIGHADDR = 0xa0001FFF
+ BUS_INTERFACE SOPB = opb_v20_0
+ END
+
+ BEGIN plb_v34
+ PARAMETER INSTANCE = plb_v34_0
+ PARAMETER HW_VER = 1.02.a
+ END
+
+ BEGIN plb_bram_if_cntlr
+ PARAMETER INSTANCE = plb_bram_if_cntlr_0
+ PARAMETER HW_VER = 1.00.b
+ PARAMETER C_BASEADDR = 0xFFFF0000
+ PARAMETER C_HIGHADDR = 0xFFFFFFFF
+ BUS_INTERFACE SPLB = plb_v34_0
+ END
+
+ BEGIN plb2opb_bridge
+ PARAMETER INSTANCE = plb2opb_bridge_0
+ PARAMETER HW_VER = 1.01.a
+ PARAMETER C_RNG0_BASEADDR = 0x20000000
+ PARAMETER C_RNG0_HIGHADDR = 0x3FFFFFFF
+ PARAMETER C_RNG1_BASEADDR = 0x60000000
+ PARAMETER C_RNG1_HIGHADDR = 0x7FFFFFFF
+ PARAMETER C_RNG2_BASEADDR = 0x80000000
+ PARAMETER C_RNG2_HIGHADDR = 0xBFFFFFFF
+ PARAMETER C_RNG3_BASEADDR = 0xC0000000
+ PARAMETER C_RNG3_HIGHADDR = 0xDFFFFFFF
+ BUS_INTERFACE SPLB = plb_v34_0
+ BUS_INTERFACE MOPB = opb_v20_0
+ END
+
+ Gives this device tree (some properties removed for clarity):
+
+ plb@0 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "xlnx,plb-v34-1.02.a";
+ device_type = "ibm,plb";
+ ranges; // 1:1 translation
+
+ plb_bram_if_cntrl_0: bram@ffff0000 {
+ reg = <ffff0000 10000>;
+ }
+
+ opb@20000000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <20000000 20000000 20000000
+ 60000000 60000000 20000000
+ 80000000 80000000 40000000
+ c0000000 c0000000 20000000>;
+
+ opb_uart16550_0: serial@a0000000 {
+ reg = <a00000000 2000>;
+ };
+
+ opb_intc_0: interrupt-controller@d1000fc0 {
+ reg = <d1000fc0 20>;
+ };
+ };
+ };
+
+ That covers the general approach to binding xilinx IP cores into the
+ device tree. The following are bindings for specific devices:
+
+ i) Xilinx ML300 Framebuffer
+
+ Simple framebuffer device from the ML300 reference design (also on the
+ ML403 reference design as well as others).
+
+ Optional properties:
+ - resolution = <xres yres> : pixel resolution of framebuffer. Some
+ implementations use a different resolution.
+ Default is <d#640 d#480>
+ - virt-resolution = <xvirt yvirt> : Size of framebuffer in memory.
+ Default is <d#1024 d#480>.
+ - rotate-display (empty) : rotate display 180 degrees.
+
+ ii) Xilinx SystemACE
+
+ The Xilinx SystemACE device is used to program FPGAs from an FPGA
+ bitstream stored on a CF card. It can also be used as a generic CF
+ interface device.
+
+ Optional properties:
+ - 8-bit (empty) : Set this property for SystemACE in 8 bit mode
+
+ iii) Xilinx EMAC and Xilinx TEMAC
+
+ Xilinx Ethernet devices. In addition to general xilinx properties
+ listed above, nodes for these devices should include a phy-handle
+ property, and may include other common network device properties
+ like local-mac-address.
+
+ iv) Xilinx Uartlite
+
+ Xilinx uartlite devices are simple fixed speed serial ports.
+
+ Required properties:
+ - current-speed : Baud rate of uartlite
+
+ v) Xilinx hwicap
+
+ Xilinx hwicap devices provide access to the configuration logic
+ of the FPGA through the Internal Configuration Access Port
+ (ICAP). The ICAP enables partial reconfiguration of the FPGA,
+ readback of the configuration information, and some control over
+ 'warm boots' of the FPGA fabric.
+
+ Required properties:
+ - xlnx,family : The family of the FPGA, necessary since the
+ capabilities of the underlying ICAP hardware
+ differ between different families. May be
+ 'virtex2p', 'virtex4', or 'virtex5'.
+
+ vi) Xilinx Uart 16550
+
+ Xilinx UART 16550 devices are very similar to the NS16550 but with
+ different register spacing and an offset from the base address.
+
+ Required properties:
+ - clock-frequency : Frequency of the clock input
+ - reg-offset : A value of 3 is required
+ - reg-shift : A value of 2 is required
+
+ f) USB EHCI controllers
+
+ Required properties:
+ - compatible : should be "usb-ehci".
+ - reg : should contain at least address and length of the standard EHCI
+ register set for the device. Optional platform-dependent registers
+ (debug-port or other) can be also specified here, but only after
+ definition of standard EHCI registers.
+ - interrupts : one EHCI interrupt should be described here.
+ If device registers are implemented in big endian mode, the device
+ node should have "big-endian-regs" property.
+ If controller implementation operates with big endian descriptors,
+ "big-endian-desc" property should be specified.
+ If both big endian registers and descriptors are used by the controller
+ implementation, "big-endian" property can be specified instead of having
+ both "big-endian-regs" and "big-endian-desc".
+
+ Example (Sequoia 440EPx):
+ ehci@e0000300 {
+ compatible = "ibm,usb-ehci-440epx", "usb-ehci";
+ interrupt-parent = <&UIC0>;
+ interrupts = <1a 4>;
+ reg = <0 e0000300 90 0 e0000390 70>;
+ big-endian;
+ };
+
+ g) MDIO on GPIOs
+
+ Currently defined compatibles:
+ - virtual,gpio-mdio
+
+ MDC and MDIO lines connected to GPIO controllers are listed in the
+ gpios property as described in section VIII.1 in the following order:
+
+ MDC, MDIO.
+
+ Example:
+
+ mdio {
+ compatible = "virtual,mdio-gpio";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ gpios = <&qe_pio_a 11
+ &qe_pio_c 6>;
+ };
+
+ h) SPI (Serial Peripheral Interface) busses
+
+ SPI busses can be described with a node for the SPI master device
+ and a set of child nodes for each SPI slave on the bus. For this
+ discussion, it is assumed that the system's SPI controller is in
+ SPI master mode. This binding does not describe SPI controllers
+ in slave mode.
+
+ The SPI master node requires the following properties:
+ - #address-cells - number of cells required to define a chip select
+ address on the SPI bus.
+ - #size-cells - should be zero.
+ - compatible - name of SPI bus controller following generic names
+ recommended practice.
+ No other properties are required in the SPI bus node. It is assumed
+ that a driver for an SPI bus device will understand that it is an SPI bus.
+ However, the binding does not attempt to define the specific method for
+ assigning chip select numbers. Since SPI chip select configuration is
+ flexible and non-standardized, it is left out of this binding with the
+ assumption that board specific platform code will be used to manage
+ chip selects. Individual drivers can define additional properties to
+ support describing the chip select layout.
+
+ SPI slave nodes must be children of the SPI master node and can
+ contain the following properties.
+ - reg - (required) chip select address of device.
+ - compatible - (required) name of SPI device following generic names
+ recommended practice
+ - spi-max-frequency - (required) Maximum SPI clocking speed of device in Hz
+ - spi-cpol - (optional) Empty property indicating device requires
+ inverse clock polarity (CPOL) mode
+ - spi-cpha - (optional) Empty property indicating device requires
+ shifted clock phase (CPHA) mode
+ - spi-cs-high - (optional) Empty property indicating device requires
+ chip select active high
+
+ SPI example for an MPC5200 SPI bus:
+ spi@f00 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi";
+ reg = <0xf00 0x20>;
+ interrupts = <2 13 0 2 14 0>;
+ interrupt-parent = <&mpc5200_pic>;
+
+ ethernet-switch@0 {
+ compatible = "micrel,ks8995m";
+ spi-max-frequency = <1000000>;
+ reg = <0>;
+ };
+
+ codec@1 {
+ compatible = "ti,tlv320aic26";
+ spi-max-frequency = <100000>;
+ reg = <1>;
+ };
+ };
+
+VII - Marvell Discovery mv64[345]6x System Controller chips
+===========================================================
+
+The Marvell mv64[345]60 series of system controller chips contain
+many of the peripherals needed to implement a complete computer
+system. In this section, we define device tree nodes to describe
+the system controller chip itself and each of the peripherals
+which it contains. Compatible string values for each node are
+prefixed with the string "marvell,", for Marvell Technology Group Ltd.
+
+1) The /system-controller node
+
+ This node is used to represent the system-controller and must be
+ present when the system uses a system controller chip. The top-level
+ system-controller node contains information that is global to all
+ devices within the system controller chip. The node name begins
+ with "system-controller" followed by the unit address, which is
+ the base address of the memory-mapped register set for the system
+ controller chip.
+
+ Required properties:
+
+ - ranges : Describes the translation of system controller addresses
+ for memory mapped registers.
+ - clock-frequency: Contains the main clock frequency for the system
+ controller chip.
+ - reg : This property defines the address and size of the
+ memory-mapped registers contained within the system controller
+ chip. The address specified in the "reg" property should match
+ the unit address of the system-controller node.
+ - #address-cells : Address representation for system controller
+ devices. This field represents the number of cells needed to
+ represent the address of the memory-mapped registers of devices
+ within the system controller chip.
+ - #size-cells : Size representation for for the memory-mapped
+ registers within the system controller chip.
+ - #interrupt-cells : Defines the width of cells used to represent
+ interrupts.
+
+ Optional properties:
+
+ - model : The specific model of the system controller chip. Such
+ as, "mv64360", "mv64460", or "mv64560".
+ - compatible : A string identifying the compatibility identifiers
+ of the system controller chip.
+
+ The system-controller node contains child nodes for each system
+ controller device that the platform uses. Nodes should not be created
+ for devices which exist on the system controller chip but are not used
+
+ Example Marvell Discovery mv64360 system-controller node:
+
+ system-controller@f1000000 { /* Marvell Discovery mv64360 */
+ #address-cells = <1>;
+ #size-cells = <1>;
+ model = "mv64360"; /* Default */
+ compatible = "marvell,mv64360";
+ clock-frequency = <133333333>;
+ reg = <0xf1000000 0x10000>;
+ virtual-reg = <0xf1000000>;
+ ranges = <0x88000000 0x88000000 0x1000000 /* PCI 0 I/O Space */
+ 0x80000000 0x80000000 0x8000000 /* PCI 0 MEM Space */
+ 0xa0000000 0xa0000000 0x4000000 /* User FLASH */
+ 0x00000000 0xf1000000 0x0010000 /* Bridge's regs */
+ 0xf2000000 0xf2000000 0x0040000>;/* Integrated SRAM */
+
+ [ child node definitions... ]
+ }
+
+2) Child nodes of /system-controller
+
+ a) Marvell Discovery MDIO bus
+
+ The MDIO is a bus to which the PHY devices are connected. For each
+ device that exists on this bus, a child node should be created. See
+ the definition of the PHY node below for an example of how to define
+ a PHY.
+
+ Required properties:
+ - #address-cells : Should be <1>
+ - #size-cells : Should be <0>
+ - device_type : Should be "mdio"
+ - compatible : Should be "marvell,mv64360-mdio"
+
+ Example:
+
+ mdio {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ device_type = "mdio";
+ compatible = "marvell,mv64360-mdio";
+
+ ethernet-phy@0 {
+ ......
+ };
+ };
+
+
+ b) Marvell Discovery ethernet controller
+
+ The Discover ethernet controller is described with two levels
+ of nodes. The first level describes an ethernet silicon block
+ and the second level describes up to 3 ethernet nodes within
+ that block. The reason for the multiple levels is that the
+ registers for the node are interleaved within a single set
+ of registers. The "ethernet-block" level describes the
+ shared register set, and the "ethernet" nodes describe ethernet
+ port-specific properties.
+
+ Ethernet block node
+
+ Required properties:
+ - #address-cells : <1>
+ - #size-cells : <0>
+ - compatible : "marvell,mv64360-eth-block"
+ - reg : Offset and length of the register set for this block
+
+ Example Discovery Ethernet block node:
+ ethernet-block@2000 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "marvell,mv64360-eth-block";
+ reg = <0x2000 0x2000>;
+ ethernet@0 {
+ .......
+ };
+ };
+
+ Ethernet port node
+
+ Required properties:
+ - device_type : Should be "network".
+ - compatible : Should be "marvell,mv64360-eth".
+ - reg : Should be <0>, <1>, or <2>, according to which registers
+ within the silicon block the device uses.
+ - interrupts : <a> where a is the interrupt number for the port.
+ - interrupt-parent : the phandle for the interrupt controller
+ that services interrupts for this device.
+ - phy : the phandle for the PHY connected to this ethernet
+ controller.
+ - local-mac-address : 6 bytes, MAC address
+
+ Example Discovery Ethernet port node:
+ ethernet@0 {
+ device_type = "network";
+ compatible = "marvell,mv64360-eth";
+ reg = <0>;
+ interrupts = <32>;
+ interrupt-parent = <&PIC>;
+ phy = <&PHY0>;
+ local-mac-address = [ 00 00 00 00 00 00 ];
+ };
+
+
+
+ c) Marvell Discovery PHY nodes
+
+ Required properties:
+ - device_type : Should be "ethernet-phy"
+ - interrupts : <a> where a is the interrupt number for this phy.
+ - interrupt-parent : the phandle for the interrupt controller that
+ services interrupts for this device.
+ - reg : The ID number for the phy, usually a small integer
+
+ Example Discovery PHY node:
+ ethernet-phy@1 {
+ device_type = "ethernet-phy";
+ compatible = "broadcom,bcm5421";
+ interrupts = <76>; /* GPP 12 */
+ interrupt-parent = <&PIC>;
+ reg = <1>;
+ };
+
+
+ d) Marvell Discovery SDMA nodes
+
+ Represent DMA hardware associated with the MPSC (multiprotocol
+ serial controllers).
+
+ Required properties:
+ - compatible : "marvell,mv64360-sdma"
+ - reg : Offset and length of the register set for this device
+ - interrupts : <a> where a is the interrupt number for the DMA
+ device.
+ - interrupt-parent : the phandle for the interrupt controller
+ that services interrupts for this device.
+
+ Example Discovery SDMA node:
+ sdma@4000 {
+ compatible = "marvell,mv64360-sdma";
+ reg = <0x4000 0xc18>;
+ virtual-reg = <0xf1004000>;
+ interrupts = <36>;
+ interrupt-parent = <&PIC>;
+ };
+
+
+ e) Marvell Discovery BRG nodes
+
+ Represent baud rate generator hardware associated with the MPSC
+ (multiprotocol serial controllers).
+
+ Required properties:
+ - compatible : "marvell,mv64360-brg"
+ - reg : Offset and length of the register set for this device
+ - clock-src : A value from 0 to 15 which selects the clock
+ source for the baud rate generator. This value corresponds
+ to the CLKS value in the BRGx configuration register. See
+ the mv64x60 User's Manual.
+ - clock-frequence : The frequency (in Hz) of the baud rate
+ generator's input clock.
+ - current-speed : The current speed setting (presumably by
+ firmware) of the baud rate generator.
+
+ Example Discovery BRG node:
+ brg@b200 {
+ compatible = "marvell,mv64360-brg";
+ reg = <0xb200 0x8>;
+ clock-src = <8>;
+ clock-frequency = <133333333>;
+ current-speed = <9600>;
+ };
+
+
+ f) Marvell Discovery CUNIT nodes
+
+ Represent the Serial Communications Unit device hardware.
+
+ Required properties:
+ - reg : Offset and length of the register set for this device
+
+ Example Discovery CUNIT node:
+ cunit@f200 {
+ reg = <0xf200 0x200>;
+ };
+
+
+ g) Marvell Discovery MPSCROUTING nodes
+
+ Represent the Discovery's MPSC routing hardware
+
+ Required properties:
+ - reg : Offset and length of the register set for this device
+
+ Example Discovery CUNIT node:
+ mpscrouting@b500 {
+ reg = <0xb400 0xc>;
+ };
+
+
+ h) Marvell Discovery MPSCINTR nodes
+
+ Represent the Discovery's MPSC DMA interrupt hardware registers
+ (SDMA cause and mask registers).
+
+ Required properties:
+ - reg : Offset and length of the register set for this device
+
+ Example Discovery MPSCINTR node:
+ mpsintr@b800 {
+ reg = <0xb800 0x100>;
+ };
+
+
+ i) Marvell Discovery MPSC nodes
+
+ Represent the Discovery's MPSC (Multiprotocol Serial Controller)
+ serial port.
+
+ Required properties:
+ - device_type : "serial"
+ - compatible : "marvell,mv64360-mpsc"
+ - reg : Offset and length of the register set for this device
+ - sdma : the phandle for the SDMA node used by this port
+ - brg : the phandle for the BRG node used by this port
+ - cunit : the phandle for the CUNIT node used by this port
+ - mpscrouting : the phandle for the MPSCROUTING node used by this port
+ - mpscintr : the phandle for the MPSCINTR node used by this port
+ - cell-index : the hardware index of this cell in the MPSC core
+ - max_idle : value needed for MPSC CHR3 (Maximum Frame Length)
+ register
+ - interrupts : <a> where a is the interrupt number for the MPSC.
+ - interrupt-parent : the phandle for the interrupt controller
+ that services interrupts for this device.
+
+ Example Discovery MPSCINTR node:
+ mpsc@8000 {
+ device_type = "serial";
+ compatible = "marvell,mv64360-mpsc";
+ reg = <0x8000 0x38>;
+ virtual-reg = <0xf1008000>;
+ sdma = <&SDMA0>;
+ brg = <&BRG0>;
+ cunit = <&CUNIT>;
+ mpscrouting = <&MPSCROUTING>;
+ mpscintr = <&MPSCINTR>;
+ cell-index = <0>;
+ max_idle = <40>;
+ interrupts = <40>;
+ interrupt-parent = <&PIC>;
+ };
+
+
+ j) Marvell Discovery Watch Dog Timer nodes
+
+ Represent the Discovery's watchdog timer hardware
+
+ Required properties:
+ - compatible : "marvell,mv64360-wdt"
+ - reg : Offset and length of the register set for this device
+
+ Example Discovery Watch Dog Timer node:
+ wdt@b410 {
+ compatible = "marvell,mv64360-wdt";
+ reg = <0xb410 0x8>;
+ };
+
+
+ k) Marvell Discovery I2C nodes
+
+ Represent the Discovery's I2C hardware
+
+ Required properties:
+ - device_type : "i2c"
+ - compatible : "marvell,mv64360-i2c"
+ - reg : Offset and length of the register set for this device
+ - interrupts : <a> where a is the interrupt number for the I2C.
+ - interrupt-parent : the phandle for the interrupt controller
+ that services interrupts for this device.
+
+ Example Discovery I2C node:
+ compatible = "marvell,mv64360-i2c";
+ reg = <0xc000 0x20>;
+ virtual-reg = <0xf100c000>;
+ interrupts = <37>;
+ interrupt-parent = <&PIC>;
+ };
+
+
+ l) Marvell Discovery PIC (Programmable Interrupt Controller) nodes
+
+ Represent the Discovery's PIC hardware
+
+ Required properties:
+ - #interrupt-cells : <1>
+ - #address-cells : <0>
+ - compatible : "marvell,mv64360-pic"
+ - reg : Offset and length of the register set for this device
+ - interrupt-controller
+
+ Example Discovery PIC node:
+ pic {
+ #interrupt-cells = <1>;
+ #address-cells = <0>;
+ compatible = "marvell,mv64360-pic";
+ reg = <0x0 0x88>;
+ interrupt-controller;
+ };
+
+
+ m) Marvell Discovery MPP (Multipurpose Pins) multiplexing nodes
+
+ Represent the Discovery's MPP hardware
+
+ Required properties:
+ - compatible : "marvell,mv64360-mpp"
+ - reg : Offset and length of the register set for this device
+
+ Example Discovery MPP node:
+ mpp@f000 {
+ compatible = "marvell,mv64360-mpp";
+ reg = <0xf000 0x10>;
+ };
+
+
+ n) Marvell Discovery GPP (General Purpose Pins) nodes
+
+ Represent the Discovery's GPP hardware
+
+ Required properties:
+ - compatible : "marvell,mv64360-gpp"
+ - reg : Offset and length of the register set for this device
+
+ Example Discovery GPP node:
+ gpp@f000 {
+ compatible = "marvell,mv64360-gpp";
+ reg = <0xf100 0x20>;
+ };
+
+
+ o) Marvell Discovery PCI host bridge node
+
+ Represents the Discovery's PCI host bridge device. The properties
+ for this node conform to Rev 2.1 of the PCI Bus Binding to IEEE
+ 1275-1994. A typical value for the compatible property is
+ "marvell,mv64360-pci".
+
+ Example Discovery PCI host bridge node
+ pci@80000000 {
+ #address-cells = <3>;
+ #size-cells = <2>;
+ #interrupt-cells = <1>;
+ device_type = "pci";
+ compatible = "marvell,mv64360-pci";
+ reg = <0xcf8 0x8>;
+ ranges = <0x01000000 0x0 0x0
+ 0x88000000 0x0 0x01000000
+ 0x02000000 0x0 0x80000000
+ 0x80000000 0x0 0x08000000>;
+ bus-range = <0 255>;
+ clock-frequency = <66000000>;
+ interrupt-parent = <&PIC>;
+ interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
+ interrupt-map = <
+ /* IDSEL 0x0a */
+ 0x5000 0 0 1 &PIC 80
+ 0x5000 0 0 2 &PIC 81
+ 0x5000 0 0 3 &PIC 91
+ 0x5000 0 0 4 &PIC 93
+
+ /* IDSEL 0x0b */
+ 0x5800 0 0 1 &PIC 91
+ 0x5800 0 0 2 &PIC 93
+ 0x5800 0 0 3 &PIC 80
+ 0x5800 0 0 4 &PIC 81
+
+ /* IDSEL 0x0c */
+ 0x6000 0 0 1 &PIC 91
+ 0x6000 0 0 2 &PIC 93
+ 0x6000 0 0 3 &PIC 80
+ 0x6000 0 0 4 &PIC 81
+
+ /* IDSEL 0x0d */
+ 0x6800 0 0 1 &PIC 93
+ 0x6800 0 0 2 &PIC 80
+ 0x6800 0 0 3 &PIC 81
+ 0x6800 0 0 4 &PIC 91
+ >;
+ };
+
+
+ p) Marvell Discovery CPU Error nodes
+
+ Represent the Discovery's CPU error handler device.
+
+ Required properties:
+ - compatible : "marvell,mv64360-cpu-error"
+ - reg : Offset and length of the register set for this device
+ - interrupts : the interrupt number for this device
+ - interrupt-parent : the phandle for the interrupt controller
+ that services interrupts for this device.
+
+ Example Discovery CPU Error node:
+ cpu-error@0070 {
+ compatible = "marvell,mv64360-cpu-error";
+ reg = <0x70 0x10 0x128 0x28>;
+ interrupts = <3>;
+ interrupt-parent = <&PIC>;
+ };
+
+
+ q) Marvell Discovery SRAM Controller nodes
+
+ Represent the Discovery's SRAM controller device.
+
+ Required properties:
+ - compatible : "marvell,mv64360-sram-ctrl"
+ - reg : Offset and length of the register set for this device
+ - interrupts : the interrupt number for this device
+ - interrupt-parent : the phandle for the interrupt controller
+ that services interrupts for this device.
+
+ Example Discovery SRAM Controller node:
+ sram-ctrl@0380 {
+ compatible = "marvell,mv64360-sram-ctrl";
+ reg = <0x380 0x80>;
+ interrupts = <13>;
+ interrupt-parent = <&PIC>;
+ };
+
+
+ r) Marvell Discovery PCI Error Handler nodes
+
+ Represent the Discovery's PCI error handler device.
+
+ Required properties:
+ - compatible : "marvell,mv64360-pci-error"
+ - reg : Offset and length of the register set for this device
+ - interrupts : the interrupt number for this device
+ - interrupt-parent : the phandle for the interrupt controller
+ that services interrupts for this device.
+
+ Example Discovery PCI Error Handler node:
+ pci-error@1d40 {
+ compatible = "marvell,mv64360-pci-error";
+ reg = <0x1d40 0x40 0xc28 0x4>;
+ interrupts = <12>;
+ interrupt-parent = <&PIC>;
+ };
+
+
+ s) Marvell Discovery Memory Controller nodes
+
+ Represent the Discovery's memory controller device.
+
+ Required properties:
+ - compatible : "marvell,mv64360-mem-ctrl"
+ - reg : Offset and length of the register set for this device
+ - interrupts : the interrupt number for this device
+ - interrupt-parent : the phandle for the interrupt controller
+ that services interrupts for this device.
+
+ Example Discovery Memory Controller node:
+ mem-ctrl@1400 {
+ compatible = "marvell,mv64360-mem-ctrl";
+ reg = <0x1400 0x60>;
+ interrupts = <17>;
+ interrupt-parent = <&PIC>;
+ };
+
+
+VIII - Specifying interrupt information for devices
+===================================================
+
+The device tree represents the busses and devices of a hardware
+system in a form similar to the physical bus topology of the
+hardware.
+
+In addition, a logical 'interrupt tree' exists which represents the
+hierarchy and routing of interrupts in the hardware.
+
+The interrupt tree model is fully described in the
+document "Open Firmware Recommended Practice: Interrupt
+Mapping Version 0.9". The document is available at:
+<http://playground.sun.com/1275/practice>.
+
+1) interrupts property
+----------------------
+
+Devices that generate interrupts to a single interrupt controller
+should use the conventional OF representation described in the
+OF interrupt mapping documentation.
+
+Each device which generates interrupts must have an 'interrupt'
+property. The interrupt property value is an arbitrary number of
+of 'interrupt specifier' values which describe the interrupt or
+interrupts for the device.
+
+The encoding of an interrupt specifier is determined by the
+interrupt domain in which the device is located in the
+interrupt tree. The root of an interrupt domain specifies in
+its #interrupt-cells property the number of 32-bit cells
+required to encode an interrupt specifier. See the OF interrupt
+mapping documentation for a detailed description of domains.
+
+For example, the binding for the OpenPIC interrupt controller
+specifies an #interrupt-cells value of 2 to encode the interrupt
+number and level/sense information. All interrupt children in an
+OpenPIC interrupt domain use 2 cells per interrupt in their interrupts
+property.
+
+The PCI bus binding specifies a #interrupt-cell value of 1 to encode
+which interrupt pin (INTA,INTB,INTC,INTD) is used.
+
+2) interrupt-parent property
+----------------------------
+
+The interrupt-parent property is specified to define an explicit
+link between a device node and its interrupt parent in
+the interrupt tree. The value of interrupt-parent is the
+phandle of the parent node.
+
+If the interrupt-parent property is not defined for a node, it's
+interrupt parent is assumed to be an ancestor in the node's
+_device tree_ hierarchy.
+
+3) OpenPIC Interrupt Controllers
+--------------------------------
+
+OpenPIC interrupt controllers require 2 cells to encode
+interrupt information. The first cell defines the interrupt
+number. The second cell defines the sense and level
+information.
+
+Sense and level information should be encoded as follows:
+
+ 0 = low to high edge sensitive type enabled
+ 1 = active low level sensitive type enabled
+ 2 = active high level sensitive type enabled
+ 3 = high to low edge sensitive type enabled
+
+4) ISA Interrupt Controllers
+----------------------------
+
+ISA PIC interrupt controllers require 2 cells to encode
+interrupt information. The first cell defines the interrupt
+number. The second cell defines the sense and level
+information.
+
+ISA PIC interrupt controllers should adhere to the ISA PIC
+encodings listed below:
+
+ 0 = active low level sensitive type enabled
+ 1 = active high level sensitive type enabled
+ 2 = high to low edge sensitive type enabled
+ 3 = low to high edge sensitive type enabled
+
+IX - Specifying GPIO information for devices
+============================================
+
+1) gpios property
+-----------------
+
+Nodes that makes use of GPIOs should define them using `gpios' property,
+format of which is: <&gpio-controller1-phandle gpio1-specifier
+ &gpio-controller2-phandle gpio2-specifier
+ 0 /* holes are permitted, means no GPIO 3 */
+ &gpio-controller4-phandle gpio4-specifier
+ ...>;
+
+Note that gpio-specifier length is controller dependent.
+
+gpio-specifier may encode: bank, pin position inside the bank,
+whether pin is open-drain and whether pin is logically inverted.
+
+Example of the node using GPIOs:
+
+ node {
+ gpios = <&qe_pio_e 18 0>;
+ };
+
+In this example gpio-specifier is "18 0" and encodes GPIO pin number,
+and empty GPIO flags as accepted by the "qe_pio_e" gpio-controller.
+
+2) gpio-controller nodes
+------------------------
+
+Every GPIO controller node must have #gpio-cells property defined,
+this information will be used to translate gpio-specifiers.
+
+Example of two SOC GPIO banks defined as gpio-controller nodes:
+
+ qe_pio_a: gpio-controller@1400 {
+ #gpio-cells = <2>;
+ compatible = "fsl,qe-pario-bank-a", "fsl,qe-pario-bank";
+ reg = <0x1400 0x18>;
+ gpio-controller;
+ };
+
+ qe_pio_e: gpio-controller@1460 {
+ #gpio-cells = <2>;
+ compatible = "fsl,qe-pario-bank-e", "fsl,qe-pario-bank";
+ reg = <0x1460 0x18>;
+ gpio-controller;
+ };
+
+X - Specifying Device Power Management Information (sleep property)
+===================================================================
+
+Devices on SOCs often have mechanisms for placing devices into low-power
+states that are decoupled from the devices' own register blocks. Sometimes,
+this information is more complicated than a cell-index property can
+reasonably describe. Thus, each device controlled in such a manner
+may contain a "sleep" property which describes these connections.
+
+The sleep property consists of one or more sleep resources, each of
+which consists of a phandle to a sleep controller, followed by a
+controller-specific sleep specifier of zero or more cells.
+
+The semantics of what type of low power modes are possible are defined
+by the sleep controller. Some examples of the types of low power modes
+that may be supported are:
+
+ - Dynamic: The device may be disabled or enabled at any time.
+ - System Suspend: The device may request to be disabled or remain
+ awake during system suspend, but will not be disabled until then.
+ - Permanent: The device is disabled permanently (until the next hard
+ reset).
+
+Some devices may share a clock domain with each other, such that they should
+only be suspended when none of the devices are in use. Where reasonable,
+such nodes should be placed on a virtual bus, where the bus has the sleep
+property. If the clock domain is shared among devices that cannot be
+reasonably grouped in this manner, then create a virtual sleep controller
+(similar to an interrupt nexus, except that defining a standardized
+sleep-map should wait until its necessity is demonstrated).
+
+Appendix A - Sample SOC node for MPC8540
+========================================
+
+ soc@e0000000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "fsl,mpc8540-ccsr", "simple-bus";
+ device_type = "soc";
+ ranges = <0x00000000 0xe0000000 0x00100000>
+ bus-frequency = <0>;
+ interrupt-parent = <&pic>;
+
+ ethernet@24000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ device_type = "network";
+ model = "TSEC";
+ compatible = "gianfar", "simple-bus";
+ reg = <0x24000 0x1000>;
+ local-mac-address = [ 00 E0 0C 00 73 00 ];
+ interrupts = <29 2 30 2 34 2>;
+ phy-handle = <&phy0>;
+ sleep = <&pmc 00000080>;
+ ranges;
+
+ mdio@24520 {
+ reg = <0x24520 0x20>;
+ compatible = "fsl,gianfar-mdio";
+
+ phy0: ethernet-phy@0 {
+ interrupts = <5 1>;
+ reg = <0>;
+ device_type = "ethernet-phy";
+ };
+
+ phy1: ethernet-phy@1 {
+ interrupts = <5 1>;
+ reg = <1>;
+ device_type = "ethernet-phy";
+ };
+
+ phy3: ethernet-phy@3 {
+ interrupts = <7 1>;
+ reg = <3>;
+ device_type = "ethernet-phy";
+ };
+ };
+ };
+
+ ethernet@25000 {
+ device_type = "network";
+ model = "TSEC";
+ compatible = "gianfar";
+ reg = <0x25000 0x1000>;
+ local-mac-address = [ 00 E0 0C 00 73 01 ];
+ interrupts = <13 2 14 2 18 2>;
+ phy-handle = <&phy1>;
+ sleep = <&pmc 00000040>;
+ };
+
+ ethernet@26000 {
+ device_type = "network";
+ model = "FEC";
+ compatible = "gianfar";
+ reg = <0x26000 0x1000>;
+ local-mac-address = [ 00 E0 0C 00 73 02 ];
+ interrupts = <41 2>;
+ phy-handle = <&phy3>;
+ sleep = <&pmc 00000020>;
+ };
+
+ serial@4500 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "fsl,mpc8540-duart", "simple-bus";
+ sleep = <&pmc 00000002>;
+ ranges;
+
+ serial@4500 {
+ device_type = "serial";
+ compatible = "ns16550";
+ reg = <0x4500 0x100>;
+ clock-frequency = <0>;
+ interrupts = <42 2>;
+ };
+
+ serial@4600 {
+ device_type = "serial";
+ compatible = "ns16550";
+ reg = <0x4600 0x100>;
+ clock-frequency = <0>;
+ interrupts = <42 2>;
+ };
+ };
+
+ pic: pic@40000 {
+ interrupt-controller;
+ #address-cells = <0>;
+ #interrupt-cells = <2>;
+ reg = <0x40000 0x40000>;
+ compatible = "chrp,open-pic";
+ device_type = "open-pic";
+ };
+
+ i2c@3000 {
+ interrupts = <43 2>;
+ reg = <0x3000 0x100>;
+ compatible = "fsl-i2c";
+ dfsrr;
+ sleep = <&pmc 00000004>;
+ };
+
+ pmc: power@e0070 {
+ compatible = "fsl,mpc8540-pmc", "fsl,mpc8548-pmc";
+ reg = <0xe0070 0x20>;
+ };
+ };
diff --git a/Documentation/powerpc/bootwrapper.txt b/Documentation/powerpc/bootwrapper.txt
new file mode 100644
index 0000000..d60fced
--- /dev/null
+++ b/Documentation/powerpc/bootwrapper.txt
@@ -0,0 +1,141 @@
+The PowerPC boot wrapper
+------------------------
+Copyright (C) Secret Lab Technologies Ltd.
+
+PowerPC image targets compresses and wraps the kernel image (vmlinux) with
+a boot wrapper to make it usable by the system firmware. There is no
+standard PowerPC firmware interface, so the boot wrapper is designed to
+be adaptable for each kind of image that needs to be built.
+
+The boot wrapper can be found in the arch/powerpc/boot/ directory. The
+Makefile in that directory has targets for all the available image types.
+The different image types are used to support all of the various firmware
+interfaces found on PowerPC platforms. OpenFirmware is the most commonly
+used firmware type on general purpose PowerPC systems from Apple, IBM and
+others. U-Boot is typically found on embedded PowerPC hardware, but there
+are a handful of other firmware implementations which are also popular. Each
+firmware interface requires a different image format.
+
+The boot wrapper is built from the makefile in arch/powerpc/boot/Makefile and
+it uses the wrapper script (arch/powerpc/boot/wrapper) to generate target
+image. The details of the build system is discussed in the next section.
+Currently, the following image format targets exist:
+
+ cuImage.%: Backwards compatible uImage for older version of
+ U-Boot (for versions that don't understand the device
+ tree). This image embeds a device tree blob inside
+ the image. The boot wrapper, kernel and device tree
+ are all embedded inside the U-Boot uImage file format
+ with boot wrapper code that extracts data from the old
+ bd_info structure and loads the data into the device
+ tree before jumping into the kernel.
+ Because of the series of #ifdefs found in the
+ bd_info structure used in the old U-Boot interfaces,
+ cuImages are platform specific. Each specific
+ U-Boot platform has a different platform init file
+ which populates the embedded device tree with data
+ from the platform specific bd_info file. The platform
+ specific cuImage platform init code can be found in
+ arch/powerpc/boot/cuboot.*.c. Selection of the correct
+ cuImage init code for a specific board can be found in
+ the wrapper structure.
+ dtbImage.%: Similar to zImage, except device tree blob is embedded
+ inside the image instead of provided by firmware. The
+ output image file can be either an elf file or a flat
+ binary depending on the platform.
+ dtbImages are used on systems which do not have an
+ interface for passing a device tree directly.
+ dtbImages are similar to simpleImages except that
+ dtbImages have platform specific code for extracting
+ data from the board firmware, but simpleImages do not
+ talk to the firmware at all.
+ PlayStation 3 support uses dtbImage. So do Embedded
+ Planet boards using the PlanetCore firmware. Board
+ specific initialization code is typically found in a
+ file named arch/powerpc/boot/<platform>.c; but this
+ can be overridden by the wrapper script.
+ simpleImage.%: Firmware independent compressed image that does not
+ depend on any particular firmware interface and embeds
+ a device tree blob. This image is a flat binary that
+ can be loaded to any location in RAM and jumped to.
+ Firmware cannot pass any configuration data to the
+ kernel with this image type and it depends entirely on
+ the embedded device tree for all information.
+ The simpleImage is useful for booting systems with
+ an unknown firmware interface or for booting from
+ a debugger when no firmware is present (such as on
+ the Xilinx Virtex platform). The only assumption that
+ simpleImage makes is that RAM is correctly initialized
+ and that the MMU is either off or has RAM mapped to
+ base address 0.
+ simpleImage also supports inserting special platform
+ specific initialization code to the start of the bootup
+ sequence. The virtex405 platform uses this feature to
+ ensure that the cache is invalidated before caching
+ is enabled. Platform specific initialization code is
+ added as part of the wrapper script and is keyed on
+ the image target name. For example, all
+ simpleImage.virtex405-* targets will add the
+ virtex405-head.S initialization code (This also means
+ that the dts file for virtex405 targets should be
+ named (virtex405-<board>.dts). Search the wrapper
+ script for 'virtex405' and see the file
+ arch/powerpc/boot/virtex405-head.S for details.
+ treeImage.%; Image format for used with OpenBIOS firmware found
+ on some ppc4xx hardware. This image embeds a device
+ tree blob inside the image.
+ uImage: Native image format used by U-Boot. The uImage target
+ does not add any boot code. It just wraps a compressed
+ vmlinux in the uImage data structure. This image
+ requires a version of U-Boot that is able to pass
+ a device tree to the kernel at boot. If using an older
+ version of U-Boot, then you need to use a cuImage
+ instead.
+ zImage.%: Image format which does not embed a device tree.
+ Used by OpenFirmware and other firmware interfaces
+ which are able to supply a device tree. This image
+ expects firmware to provide the device tree at boot.
+ Typically, if you have general purpose PowerPC
+ hardware then you want this image format.
+
+Image types which embed a device tree blob (simpleImage, dtbImage, treeImage,
+and cuImage) all generate the device tree blob from a file in the
+arch/powerpc/boot/dts/ directory. The Makefile selects the correct device
+tree source based on the name of the target. Therefore, if the kernel is
+built with 'make treeImage.walnut simpleImage.virtex405-ml403', then the
+build system will use arch/powerpc/boot/dts/walnut.dts to build
+treeImage.walnut and arch/powerpc/boot/dts/virtex405-ml403.dts to build
+the simpleImage.virtex405-ml403.
+
+Two special targets called 'zImage' and 'zImage.initrd' also exist. These
+targets build all the default images as selected by the kernel configuration.
+Default images are selected by the boot wrapper Makefile
+(arch/powerpc/boot/Makefile) by adding targets to the $image-y variable. Look
+at the Makefile to see which default image targets are available.
+
+How it is built
+---------------
+arch/powerpc is designed to support multiplatform kernels, which means
+that a single vmlinux image can be booted on many different target boards.
+It also means that the boot wrapper must be able to wrap for many kinds of
+images on a single build. The design decision was made to not use any
+conditional compilation code (#ifdef, etc) in the boot wrapper source code.
+All of the boot wrapper pieces are buildable at any time regardless of the
+kernel configuration. Building all the wrapper bits on every kernel build
+also ensures that obscure parts of the wrapper are at the very least compile
+tested in a large variety of environments.
+
+The wrapper is adapted for different image types at link time by linking in
+just the wrapper bits that are appropriate for the image type. The 'wrapper
+script' (found in arch/powerpc/boot/wrapper) is called by the Makefile and
+is responsible for selecting the correct wrapper bits for the image type.
+The arguments are well documented in the script's comment block, so they
+are not repeated here. However, it is worth mentioning that the script
+uses the -p (platform) argument as the main method of deciding which wrapper
+bits to compile in. Look for the large 'case "$platform" in' block in the
+middle of the script. This is also the place where platform specific fixups
+can be selected by changing the link order.
+
+In particular, care should be taken when working with cuImages. cuImage
+wrapper bits are very board specific and care should be taken to make sure
+the target you are trying to build is supported by the wrapper bits.
diff --git a/Documentation/powerpc/cpu_features.txt b/Documentation/powerpc/cpu_features.txt
new file mode 100644
index 0000000..4727398
--- /dev/null
+++ b/Documentation/powerpc/cpu_features.txt
@@ -0,0 +1,56 @@
+Hollis Blanchard <hollis@austin.ibm.com>
+5 Jun 2002
+
+This document describes the system (including self-modifying code) used in the
+PPC Linux kernel to support a variety of PowerPC CPUs without requiring
+compile-time selection.
+
+Early in the boot process the ppc32 kernel detects the current CPU type and
+chooses a set of features accordingly. Some examples include Altivec support,
+split instruction and data caches, and if the CPU supports the DOZE and NAP
+sleep modes.
+
+Detection of the feature set is simple. A list of processors can be found in
+arch/ppc/kernel/cputable.c. The PVR register is masked and compared with each
+value in the list. If a match is found, the cpu_features of cur_cpu_spec is
+assigned to the feature bitmask for this processor and a __setup_cpu function
+is called.
+
+C code may test 'cur_cpu_spec[smp_processor_id()]->cpu_features' for a
+particular feature bit. This is done in quite a few places, for example
+in ppc_setup_l2cr().
+
+Implementing cpufeatures in assembly is a little more involved. There are
+several paths that are performance-critical and would suffer if an array
+index, structure dereference, and conditional branch were added. To avoid the
+performance penalty but still allow for runtime (rather than compile-time) CPU
+selection, unused code is replaced by 'nop' instructions. This nop'ing is
+based on CPU 0's capabilities, so a multi-processor system with non-identical
+processors will not work (but such a system would likely have other problems
+anyways).
+
+After detecting the processor type, the kernel patches out sections of code
+that shouldn't be used by writing nop's over it. Using cpufeatures requires
+just 2 macros (found in include/asm-ppc/cputable.h), as seen in head.S
+transfer_to_handler:
+
+ #ifdef CONFIG_ALTIVEC
+ BEGIN_FTR_SECTION
+ mfspr r22,SPRN_VRSAVE /* if G4, save vrsave register value */
+ stw r22,THREAD_VRSAVE(r23)
+ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+ #endif /* CONFIG_ALTIVEC */
+
+If CPU 0 supports Altivec, the code is left untouched. If it doesn't, both
+instructions are replaced with nop's.
+
+The END_FTR_SECTION macro has two simpler variations: END_FTR_SECTION_IFSET
+and END_FTR_SECTION_IFCLR. These simply test if a flag is set (in
+cur_cpu_spec[0]->cpu_features) or is cleared, respectively. These two macros
+should be used in the majority of cases.
+
+The END_FTR_SECTION macros are implemented by storing information about this
+code in the '__ftr_fixup' ELF section. When do_cpu_ftr_fixups
+(arch/ppc/kernel/misc.S) is invoked, it will iterate over the records in
+__ftr_fixup, and if the required feature is not present it will loop writing
+nop's from each BEGIN_FTR_SECTION to END_FTR_SECTION.
diff --git a/Documentation/powerpc/dts-bindings/fsl/83xx-512x-pci.txt b/Documentation/powerpc/dts-bindings/fsl/83xx-512x-pci.txt
new file mode 100644
index 0000000..35a4653
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/83xx-512x-pci.txt
@@ -0,0 +1,40 @@
+* Freescale 83xx and 512x PCI bridges
+
+Freescale 83xx and 512x SOCs include the same pci bridge core.
+
+83xx/512x specific notes:
+- reg: should contain two address length tuples
+ The first is for the internal pci bridge registers
+ The second is for the pci config space access registers
+
+Example (MPC8313ERDB)
+ pci0: pci@e0008500 {
+ cell-index = <1>;
+ interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
+ interrupt-map = <
+ /* IDSEL 0x0E -mini PCI */
+ 0x7000 0x0 0x0 0x1 &ipic 18 0x8
+ 0x7000 0x0 0x0 0x2 &ipic 18 0x8
+ 0x7000 0x0 0x0 0x3 &ipic 18 0x8
+ 0x7000 0x0 0x0 0x4 &ipic 18 0x8
+
+ /* IDSEL 0x0F - PCI slot */
+ 0x7800 0x0 0x0 0x1 &ipic 17 0x8
+ 0x7800 0x0 0x0 0x2 &ipic 18 0x8
+ 0x7800 0x0 0x0 0x3 &ipic 17 0x8
+ 0x7800 0x0 0x0 0x4 &ipic 18 0x8>;
+ interrupt-parent = <&ipic>;
+ interrupts = <66 0x8>;
+ bus-range = <0x0 0x0>;
+ ranges = <0x02000000 0x0 0x90000000 0x90000000 0x0 0x10000000
+ 0x42000000 0x0 0x80000000 0x80000000 0x0 0x10000000
+ 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00100000>;
+ clock-frequency = <66666666>;
+ #interrupt-cells = <1>;
+ #size-cells = <2>;
+ #address-cells = <3>;
+ reg = <0xe0008500 0x100 /* internal registers */
+ 0xe0008300 0x8>; /* config space access registers */
+ compatible = "fsl,mpc8349-pci";
+ device_type = "pci";
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt b/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt
new file mode 100644
index 0000000..d015dce
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt
@@ -0,0 +1,40 @@
+GPIO controllers on MPC8xxx SoCs
+
+This is for the non-QE/CPM/GUTs GPIO controllers as found on
+8349, 8572, 8610 and compatible.
+
+Every GPIO controller node must have #gpio-cells property defined,
+this information will be used to translate gpio-specifiers.
+
+Required properties:
+- compatible : "fsl,<CHIP>-gpio" followed by "fsl,mpc8349-gpio" for
+ 83xx, "fsl,mpc8572-gpio" for 85xx and "fsl,mpc8610-gpio" for 86xx.
+- #gpio-cells : Should be two. The first cell is the pin number and the
+ second cell is used to specify optional parameters (currently unused).
+ - interrupts : Interrupt mapping for GPIO IRQ (currently unused).
+ - interrupt-parent : Phandle for the interrupt controller that
+ services interrupts for this device.
+- gpio-controller : Marks the port as GPIO controller.
+
+Example of gpio-controller nodes for a MPC8347 SoC:
+
+ gpio1: gpio-controller@c00 {
+ #gpio-cells = <2>;
+ compatible = "fsl,mpc8347-gpio", "fsl,mpc8349-gpio";
+ reg = <0xc00 0x100>;
+ interrupts = <74 0x8>;
+ interrupt-parent = <&ipic>;
+ gpio-controller;
+ };
+
+ gpio2: gpio-controller@d00 {
+ #gpio-cells = <2>;
+ compatible = "fsl,mpc8347-gpio", "fsl,mpc8349-gpio";
+ reg = <0xd00 0x100>;
+ interrupts = <75 0x8>;
+ interrupt-parent = <&ipic>;
+ gpio-controller;
+ };
+
+See booting-without-of.txt for details of how to specify GPIO
+information for devices.
diff --git a/Documentation/powerpc/dts-bindings/fsl/board.txt b/Documentation/powerpc/dts-bindings/fsl/board.txt
new file mode 100644
index 0000000..81a917e
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/board.txt
@@ -0,0 +1,29 @@
+* Board Control and Status (BCSR)
+
+Required properties:
+
+ - compatible : Should be "fsl,<board>-bcsr"
+ - reg : Offset and length of the register set for the device
+
+Example:
+
+ bcsr@f8000000 {
+ compatible = "fsl,mpc8360mds-bcsr";
+ reg = <f8000000 8000>;
+ };
+
+* Freescale on board FPGA
+
+This is the memory-mapped registers for on board FPGA.
+
+Required properities:
+- compatible : should be "fsl,fpga-pixis".
+- reg : should contain the address and the lenght of the FPPGA register
+ set.
+
+Example (MPC8610HPCD):
+
+ board-control@e8000000 {
+ compatible = "fsl,fpga-pixis";
+ reg = <0xe8000000 32>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt
new file mode 100644
index 0000000..088fc47
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt
@@ -0,0 +1,67 @@
+* Freescale Communications Processor Module
+
+NOTE: This is an interim binding, and will likely change slightly,
+as more devices are supported. The QE bindings especially are
+incomplete.
+
+* Root CPM node
+
+Properties:
+- compatible : "fsl,cpm1", "fsl,cpm2", or "fsl,qe".
+- reg : A 48-byte region beginning with CPCR.
+
+Example:
+ cpm@119c0 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ #interrupt-cells = <2>;
+ compatible = "fsl,mpc8272-cpm", "fsl,cpm2";
+ reg = <119c0 30>;
+ }
+
+* Properties common to mulitple CPM/QE devices
+
+- fsl,cpm-command : This value is ORed with the opcode and command flag
+ to specify the device on which a CPM command operates.
+
+- fsl,cpm-brg : Indicates which baud rate generator the device
+ is associated with. If absent, an unused BRG
+ should be dynamically allocated. If zero, the
+ device uses an external clock rather than a BRG.
+
+- reg : Unless otherwise specified, the first resource represents the
+ scc/fcc/ucc registers, and the second represents the device's
+ parameter RAM region (if it has one).
+
+* Multi-User RAM (MURAM)
+
+The multi-user/dual-ported RAM is expressed as a bus under the CPM node.
+
+Ranges must be set up subject to the following restrictions:
+
+- Children's reg nodes must be offsets from the start of all muram, even
+ if the user-data area does not begin at zero.
+- If multiple range entries are used, the difference between the parent
+ address and the child address must be the same in all, so that a single
+ mapping can cover them all while maintaining the ability to determine
+ CPM-side offsets with pointer subtraction. It is recommended that
+ multiple range entries not be used.
+- A child address of zero must be translatable, even if no reg resources
+ contain it.
+
+A child "data" node must exist, compatible with "fsl,cpm-muram-data", to
+indicate the portion of muram that is usable by the OS for arbitrary
+purposes. The data node may have an arbitrary number of reg resources,
+all of which contribute to the allocatable muram pool.
+
+Example, based on mpc8272:
+ muram@0 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0 0 10000>;
+
+ data@0 {
+ compatible = "fsl,cpm-muram-data";
+ reg = <0 2000 9800 800>;
+ };
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/brg.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/brg.txt
new file mode 100644
index 0000000..4c7d45e
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/brg.txt
@@ -0,0 +1,21 @@
+* Baud Rate Generators
+
+Currently defined compatibles:
+fsl,cpm-brg
+fsl,cpm1-brg
+fsl,cpm2-brg
+
+Properties:
+- reg : There may be an arbitrary number of reg resources; BRG
+ numbers are assigned to these in order.
+- clock-frequency : Specifies the base frequency driving
+ the BRG.
+
+Example:
+ brg@119f0 {
+ compatible = "fsl,mpc8272-brg",
+ "fsl,cpm2-brg",
+ "fsl,cpm-brg";
+ reg = <119f0 10 115f0 10>;
+ clock-frequency = <d#25000000>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/i2c.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/i2c.txt
new file mode 100644
index 0000000..87bc604
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/i2c.txt
@@ -0,0 +1,41 @@
+* I2C
+
+The I2C controller is expressed as a bus under the CPM node.
+
+Properties:
+- compatible : "fsl,cpm1-i2c", "fsl,cpm2-i2c"
+- reg : On CPM2 devices, the second resource doesn't specify the I2C
+ Parameter RAM itself, but the I2C_BASE field of the CPM2 Parameter RAM
+ (typically 0x8afc 0x2).
+- #address-cells : Should be one. The cell is the i2c device address with
+ the r/w bit set to zero.
+- #size-cells : Should be zero.
+- clock-frequency : Can be used to set the i2c clock frequency. If
+ unspecified, a default frequency of 60kHz is being used.
+The following two properties are deprecated. They are only used by legacy
+i2c drivers to find the bus to probe:
+- linux,i2c-index : Can be used to hard code an i2c bus number. By default,
+ the bus number is dynamically assigned by the i2c core.
+- linux,i2c-class : Can be used to override the i2c class. The class is used
+ by legacy i2c device drivers to find a bus in a specific context like
+ system management, video or sound. By default, I2C_CLASS_HWMON (1) is
+ being used. The definition of the classes can be found in
+ include/i2c/i2c.h
+
+Example, based on mpc823:
+
+ i2c@860 {
+ compatible = "fsl,mpc823-i2c",
+ "fsl,cpm1-i2c";
+ reg = <0x860 0x20 0x3c80 0x30>;
+ interrupts = <16>;
+ interrupt-parent = <&CPM_PIC>;
+ fsl,cpm-command = <0x10>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ rtc@68 {
+ compatible = "dallas,ds1307";
+ reg = <0x68>;
+ };
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/pic.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/pic.txt
new file mode 100644
index 0000000..8e3ee16
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/pic.txt
@@ -0,0 +1,18 @@
+* Interrupt Controllers
+
+Currently defined compatibles:
+- fsl,cpm1-pic
+ - only one interrupt cell
+- fsl,pq1-pic
+- fsl,cpm2-pic
+ - second interrupt cell is level/sense:
+ - 2 is falling edge
+ - 8 is active low
+
+Example:
+ interrupt-controller@10c00 {
+ #interrupt-cells = <2>;
+ interrupt-controller;
+ reg = <10c00 80>;
+ compatible = "mpc8272-pic", "fsl,cpm2-pic";
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/usb.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/usb.txt
new file mode 100644
index 0000000..74bfda4
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm/usb.txt
@@ -0,0 +1,15 @@
+* USB (Universal Serial Bus Controller)
+
+Properties:
+- compatible : "fsl,cpm1-usb", "fsl,cpm2-usb", "fsl,qe-usb"
+
+Example:
+ usb@11bc0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "fsl,cpm2-usb";
+ reg = <11b60 18 8b00 100>;
+ interrupts = <b 8>;
+ interrupt-parent = <&PIC>;
+ fsl,cpm-command = <2e600000>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt
new file mode 100644
index 0000000..1815dfe
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt
@@ -0,0 +1,38 @@
+Every GPIO controller node must have #gpio-cells property defined,
+this information will be used to translate gpio-specifiers.
+
+On CPM1 devices, all ports are using slightly different register layouts.
+Ports A, C and D are 16bit ports and Ports B and E are 32bit ports.
+
+On CPM2 devices, all ports are 32bit ports and use a common register layout.
+
+Required properties:
+- compatible : "fsl,cpm1-pario-bank-a", "fsl,cpm1-pario-bank-b",
+ "fsl,cpm1-pario-bank-c", "fsl,cpm1-pario-bank-d",
+ "fsl,cpm1-pario-bank-e", "fsl,cpm2-pario-bank"
+- #gpio-cells : Should be two. The first cell is the pin number and the
+ second cell is used to specify optional paramters (currently unused).
+- gpio-controller : Marks the port as GPIO controller.
+
+Example of three SOC GPIO banks defined as gpio-controller nodes:
+
+ CPM1_PIO_A: gpio-controller@950 {
+ #gpio-cells = <2>;
+ compatible = "fsl,cpm1-pario-bank-a";
+ reg = <0x950 0x10>;
+ gpio-controller;
+ };
+
+ CPM1_PIO_B: gpio-controller@ab8 {
+ #gpio-cells = <2>;
+ compatible = "fsl,cpm1-pario-bank-b";
+ reg = <0xab8 0x10>;
+ gpio-controller;
+ };
+
+ CPM1_PIO_E: gpio-controller@ac8 {
+ #gpio-cells = <2>;
+ compatible = "fsl,cpm1-pario-bank-e";
+ reg = <0xac8 0x18>;
+ gpio-controller;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/network.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/network.txt
new file mode 100644
index 0000000..0e42694
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/network.txt
@@ -0,0 +1,45 @@
+* Network
+
+Currently defined compatibles:
+- fsl,cpm1-scc-enet
+- fsl,cpm2-scc-enet
+- fsl,cpm1-fec-enet
+- fsl,cpm2-fcc-enet (third resource is GFEMR)
+- fsl,qe-enet
+
+Example:
+
+ ethernet@11300 {
+ device_type = "network";
+ compatible = "fsl,mpc8272-fcc-enet",
+ "fsl,cpm2-fcc-enet";
+ reg = <11300 20 8400 100 11390 1>;
+ local-mac-address = [ 00 00 00 00 00 00 ];
+ interrupts = <20 8>;
+ interrupt-parent = <&PIC>;
+ phy-handle = <&PHY0>;
+ fsl,cpm-command = <12000300>;
+ };
+
+* MDIO
+
+Currently defined compatibles:
+fsl,pq1-fec-mdio (reg is same as first resource of FEC device)
+fsl,cpm2-mdio-bitbang (reg is port C registers)
+
+Properties for fsl,cpm2-mdio-bitbang:
+fsl,mdio-pin : pin of port C controlling mdio data
+fsl,mdc-pin : pin of port C controlling mdio clock
+
+Example:
+ mdio@10d40 {
+ device_type = "mdio";
+ compatible = "fsl,mpc8272ads-mdio-bitbang",
+ "fsl,mpc8272-mdio-bitbang",
+ "fsl,cpm2-mdio-bitbang";
+ reg = <10d40 14>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ fsl,mdio-pin = <12>;
+ fsl,mdc-pin = <13>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
new file mode 100644
index 0000000..78790d5
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
@@ -0,0 +1,58 @@
+* Freescale QUICC Engine module (QE)
+This represents qe module that is installed on PowerQUICC II Pro.
+
+NOTE: This is an interim binding; it should be updated to fit
+in with the CPM binding later in this document.
+
+Basically, it is a bus of devices, that could act more or less
+as a complete entity (UCC, USB etc ). All of them should be siblings on
+the "root" qe node, using the common properties from there.
+The description below applies to the qe of MPC8360 and
+more nodes and properties would be extended in the future.
+
+i) Root QE device
+
+Required properties:
+- compatible : should be "fsl,qe";
+- model : precise model of the QE, Can be "QE", "CPM", or "CPM2"
+- reg : offset and length of the device registers.
+- bus-frequency : the clock frequency for QUICC Engine.
+
+Recommended properties
+- brg-frequency : the internal clock source frequency for baud-rate
+ generators in Hz.
+
+Example:
+ qe@e0100000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ #interrupt-cells = <2>;
+ compatible = "fsl,qe";
+ ranges = <0 e0100000 00100000>;
+ reg = <e0100000 480>;
+ brg-frequency = <0>;
+ bus-frequency = <179A7B00>;
+ }
+
+* Multi-User RAM (MURAM)
+
+Required properties:
+- compatible : should be "fsl,qe-muram", "fsl,cpm-muram".
+- mode : the could be "host" or "slave".
+- ranges : Should be defined as specified in 1) to describe the
+ translation of MURAM addresses.
+- data-only : sub-node which defines the address area under MURAM
+ bus that can be allocated as data/parameter
+
+Example:
+
+ muram@10000 {
+ compatible = "fsl,qe-muram", "fsl,cpm-muram";
+ ranges = <0 00010000 0000c000>;
+
+ data-only@0{
+ compatible = "fsl,qe-muram-data",
+ "fsl,cpm-muram-data";
+ reg = <0 c000>;
+ };
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
new file mode 100644
index 0000000..6c238f5
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
@@ -0,0 +1,24 @@
+* Uploaded QE firmware
+
+ If a new firwmare has been uploaded to the QE (usually by the
+ boot loader), then a 'firmware' child node should be added to the QE
+ node. This node provides information on the uploaded firmware that
+ device drivers may need.
+
+ Required properties:
+ - id: The string name of the firmware. This is taken from the 'id'
+ member of the qe_firmware structure of the uploaded firmware.
+ Device drivers can search this string to determine if the
+ firmware they want is already present.
+ - extended-modes: The Extended Modes bitfield, taken from the
+ firmware binary. It is a 64-bit number represented
+ as an array of two 32-bit numbers.
+ - virtual-traps: The virtual traps, taken from the firmware binary.
+ It is an array of 8 32-bit numbers.
+
+Example:
+ firmware {
+ id = "Soft-UART";
+ extended-modes = <0 0>;
+ virtual-traps = <0 0 0 0 0 0 0 0>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/par_io.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/par_io.txt
new file mode 100644
index 0000000..6098426
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/par_io.txt
@@ -0,0 +1,51 @@
+* Parallel I/O Ports
+
+This node configures Parallel I/O ports for CPUs with QE support.
+The node should reside in the "soc" node of the tree. For each
+device that using parallel I/O ports, a child node should be created.
+See the definition of the Pin configuration nodes below for more
+information.
+
+Required properties:
+- device_type : should be "par_io".
+- reg : offset to the register set and its length.
+- num-ports : number of Parallel I/O ports
+
+Example:
+par_io@1400 {
+ reg = <1400 100>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ device_type = "par_io";
+ num-ports = <7>;
+ ucc_pin@01 {
+ ......
+ };
+
+Note that "par_io" nodes are obsolete, and should not be used for
+the new device trees. Instead, each Par I/O bank should be represented
+via its own gpio-controller node:
+
+Required properties:
+- #gpio-cells : should be "2".
+- compatible : should be "fsl,<chip>-qe-pario-bank",
+ "fsl,mpc8323-qe-pario-bank".
+- reg : offset to the register set and its length.
+- gpio-controller : node to identify gpio controllers.
+
+Example:
+ qe_pio_a: gpio-controller@1400 {
+ #gpio-cells = <2>;
+ compatible = "fsl,mpc8360-qe-pario-bank",
+ "fsl,mpc8323-qe-pario-bank";
+ reg = <0x1400 0x18>;
+ gpio-controller;
+ };
+
+ qe_pio_e: gpio-controller@1460 {
+ #gpio-cells = <2>;
+ compatible = "fsl,mpc8360-qe-pario-bank",
+ "fsl,mpc8323-qe-pario-bank";
+ reg = <0x1460 0x18>;
+ gpio-controller;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/pincfg.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/pincfg.txt
new file mode 100644
index 0000000..c5b4306
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/pincfg.txt
@@ -0,0 +1,60 @@
+* Pin configuration nodes
+
+Required properties:
+- linux,phandle : phandle of this node; likely referenced by a QE
+ device.
+- pio-map : array of pin configurations. Each pin is defined by 6
+ integers. The six numbers are respectively: port, pin, dir,
+ open_drain, assignment, has_irq.
+ - port : port number of the pin; 0-6 represent port A-G in UM.
+ - pin : pin number in the port.
+ - dir : direction of the pin, should encode as follows:
+
+ 0 = The pin is disabled
+ 1 = The pin is an output
+ 2 = The pin is an input
+ 3 = The pin is I/O
+
+ - open_drain : indicates the pin is normal or wired-OR:
+
+ 0 = The pin is actively driven as an output
+ 1 = The pin is an open-drain driver. As an output, the pin is
+ driven active-low, otherwise it is three-stated.
+
+ - assignment : function number of the pin according to the Pin Assignment
+ tables in User Manual. Each pin can have up to 4 possible functions in
+ QE and two options for CPM.
+ - has_irq : indicates if the pin is used as source of external
+ interrupts.
+
+Example:
+ ucc_pin@01 {
+ linux,phandle = <140001>;
+ pio-map = <
+ /* port pin dir open_drain assignment has_irq */
+ 0 3 1 0 1 0 /* TxD0 */
+ 0 4 1 0 1 0 /* TxD1 */
+ 0 5 1 0 1 0 /* TxD2 */
+ 0 6 1 0 1 0 /* TxD3 */
+ 1 6 1 0 3 0 /* TxD4 */
+ 1 7 1 0 1 0 /* TxD5 */
+ 1 9 1 0 2 0 /* TxD6 */
+ 1 a 1 0 2 0 /* TxD7 */
+ 0 9 2 0 1 0 /* RxD0 */
+ 0 a 2 0 1 0 /* RxD1 */
+ 0 b 2 0 1 0 /* RxD2 */
+ 0 c 2 0 1 0 /* RxD3 */
+ 0 d 2 0 1 0 /* RxD4 */
+ 1 1 2 0 2 0 /* RxD5 */
+ 1 0 2 0 2 0 /* RxD6 */
+ 1 4 2 0 2 0 /* RxD7 */
+ 0 7 1 0 1 0 /* TX_EN */
+ 0 8 1 0 1 0 /* TX_ER */
+ 0 f 2 0 1 0 /* RX_DV */
+ 0 10 2 0 1 0 /* RX_ER */
+ 0 0 2 0 1 0 /* RX_CLK */
+ 2 9 1 0 3 0 /* GTX_CLK - CLK10 */
+ 2 8 2 0 1 0>; /* GTX125 - CLK9 */
+ };
+
+
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/ucc.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/ucc.txt
new file mode 100644
index 0000000..e47734b
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/ucc.txt
@@ -0,0 +1,70 @@
+* UCC (Unified Communications Controllers)
+
+Required properties:
+- device_type : should be "network", "hldc", "uart", "transparent"
+ "bisync", "atm", or "serial".
+- compatible : could be "ucc_geth" or "fsl_atm" and so on.
+- cell-index : the ucc number(1-8), corresponding to UCCx in UM.
+- reg : Offset and length of the register set for the device
+- interrupts : <a b> where a is the interrupt number and b is a
+ field that represents an encoding of the sense and level
+ information for the interrupt. This should be encoded based on
+ the information in section 2) depending on the type of interrupt
+ controller you have.
+- interrupt-parent : the phandle for the interrupt controller that
+ services interrupts for this device.
+- pio-handle : The phandle for the Parallel I/O port configuration.
+- port-number : for UART drivers, the port number to use, between 0 and 3.
+ This usually corresponds to the /dev/ttyQE device, e.g. <0> = /dev/ttyQE0.
+ The port number is added to the minor number of the device. Unlike the
+ CPM UART driver, the port-number is required for the QE UART driver.
+- soft-uart : for UART drivers, if specified this means the QE UART device
+ driver should use "Soft-UART" mode, which is needed on some SOCs that have
+ broken UART hardware. Soft-UART is provided via a microcode upload.
+- rx-clock-name: the UCC receive clock source
+ "none": clock source is disabled
+ "brg1" through "brg16": clock source is BRG1-BRG16, respectively
+ "clk1" through "clk24": clock source is CLK1-CLK24, respectively
+- tx-clock-name: the UCC transmit clock source
+ "none": clock source is disabled
+ "brg1" through "brg16": clock source is BRG1-BRG16, respectively
+ "clk1" through "clk24": clock source is CLK1-CLK24, respectively
+The following two properties are deprecated. rx-clock has been replaced
+with rx-clock-name, and tx-clock has been replaced with tx-clock-name.
+Drivers that currently use the deprecated properties should continue to
+do so, in order to support older device trees, but they should be updated
+to check for the new properties first.
+- rx-clock : represents the UCC receive clock source.
+ 0x00 : clock source is disabled;
+ 0x1~0x10 : clock source is BRG1~BRG16 respectively;
+ 0x11~0x28: clock source is QE_CLK1~QE_CLK24 respectively.
+- tx-clock: represents the UCC transmit clock source;
+ 0x00 : clock source is disabled;
+ 0x1~0x10 : clock source is BRG1~BRG16 respectively;
+ 0x11~0x28: clock source is QE_CLK1~QE_CLK24 respectively.
+
+Required properties for network device_type:
+- mac-address : list of bytes representing the ethernet address.
+- phy-handle : The phandle for the PHY connected to this controller.
+
+Recommended properties:
+- phy-connection-type : a string naming the controller/PHY interface type,
+ i.e., "mii" (default), "rmii", "gmii", "rgmii", "rgmii-id" (Internal
+ Delay), "rgmii-txid" (delay on TX only), "rgmii-rxid" (delay on RX only),
+ "tbi", or "rtbi".
+
+Example:
+ ucc@2000 {
+ device_type = "network";
+ compatible = "ucc_geth";
+ cell-index = <1>;
+ reg = <2000 200>;
+ interrupts = <a0 0>;
+ interrupt-parent = <700>;
+ mac-address = [ 00 04 9f 00 23 23 ];
+ rx-clock = "none";
+ tx-clock = "clk9";
+ phy-handle = <212000>;
+ phy-connection-type = "gmii";
+ pio-handle = <140001>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/usb.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/usb.txt
new file mode 100644
index 0000000..9ccd5f3
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/usb.txt
@@ -0,0 +1,37 @@
+Freescale QUICC Engine USB Controller
+
+Required properties:
+- compatible : should be "fsl,<chip>-qe-usb", "fsl,mpc8323-qe-usb".
+- reg : the first two cells should contain usb registers location and
+ length, the next two two cells should contain PRAM location and
+ length.
+- interrupts : should contain USB interrupt.
+- interrupt-parent : interrupt source phandle.
+- fsl,fullspeed-clock : specifies the full speed USB clock source:
+ "none": clock source is disabled
+ "brg1" through "brg16": clock source is BRG1-BRG16, respectively
+ "clk1" through "clk24": clock source is CLK1-CLK24, respectively
+- fsl,lowspeed-clock : specifies the low speed USB clock source:
+ "none": clock source is disabled
+ "brg1" through "brg16": clock source is BRG1-BRG16, respectively
+ "clk1" through "clk24": clock source is CLK1-CLK24, respectively
+- hub-power-budget : USB power budget for the root hub, in mA.
+- gpios : should specify GPIOs in this order: USBOE, USBTP, USBTN, USBRP,
+ USBRN, SPEED (optional), and POWER (optional).
+
+Example:
+
+usb@6c0 {
+ compatible = "fsl,mpc8360-qe-usb", "fsl,mpc8323-qe-usb";
+ reg = <0x6c0 0x40 0x8b00 0x100>;
+ interrupts = <11>;
+ interrupt-parent = <&qeic>;
+ fsl,fullspeed-clock = "clk21";
+ gpios = <&qe_pio_b 2 0 /* USBOE */
+ &qe_pio_b 3 0 /* USBTP */
+ &qe_pio_b 8 0 /* USBTN */
+ &qe_pio_b 9 0 /* USBRP */
+ &qe_pio_b 11 0 /* USBRN */
+ &qe_pio_e 20 0 /* SPEED */
+ &qe_pio_e 21 0 /* POWER */>;
+};
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/serial.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/serial.txt
new file mode 100644
index 0000000..2ea76d9
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/serial.txt
@@ -0,0 +1,32 @@
+* Serial
+
+Currently defined compatibles:
+- fsl,cpm1-smc-uart
+- fsl,cpm2-smc-uart
+- fsl,cpm1-scc-uart
+- fsl,cpm2-scc-uart
+- fsl,qe-uart
+
+Modem control lines connected to GPIO controllers are listed in the gpios
+property as described in booting-without-of.txt, section IX.1 in the following
+order:
+
+CTS, RTS, DCD, DSR, DTR, and RI.
+
+The gpios property is optional and can be left out when control lines are
+not used.
+
+Example:
+
+ serial@11a00 {
+ device_type = "serial";
+ compatible = "fsl,mpc8272-scc-uart",
+ "fsl,cpm2-scc-uart";
+ reg = <11a00 20 8000 100>;
+ interrupts = <28 8>;
+ interrupt-parent = <&PIC>;
+ fsl,cpm-brg = <1>;
+ fsl,cpm-command = <00800000>;
+ gpios = <&gpio_c 15 0
+ &gpio_d 29 0>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/diu.txt b/Documentation/powerpc/dts-bindings/fsl/diu.txt
new file mode 100644
index 0000000..deb35de
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/diu.txt
@@ -0,0 +1,18 @@
+* Freescale Display Interface Unit
+
+The Freescale DIU is a LCD controller, with proper hardware, it can also
+drive DVI monitors.
+
+Required properties:
+- compatible : should be "fsl-diu".
+- reg : should contain at least address and length of the DIU register
+ set.
+- Interrupts : one DIU interrupt should be describe here.
+
+Example (MPC8610HPCD):
+ display@2c000 {
+ compatible = "fsl,diu";
+ reg = <0x2c000 100>;
+ interrupts = <72 2>;
+ interrupt-parent = <&mpic>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/dma.txt b/Documentation/powerpc/dts-bindings/fsl/dma.txt
new file mode 100644
index 0000000..cc45311
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/dma.txt
@@ -0,0 +1,136 @@
+* Freescale 83xx DMA Controller
+
+Freescale PowerPC 83xx have on chip general purpose DMA controllers.
+
+Required properties:
+
+- compatible : compatible list, contains 2 entries, first is
+ "fsl,CHIP-dma", where CHIP is the processor
+ (mpc8349, mpc8360, etc.) and the second is
+ "fsl,elo-dma"
+- reg : <registers mapping for DMA general status reg>
+- ranges : Should be defined as specified in 1) to describe the
+ DMA controller channels.
+- cell-index : controller index. 0 for controller @ 0x8100
+- interrupts : <interrupt mapping for DMA IRQ>
+- interrupt-parent : optional, if needed for interrupt mapping
+
+
+- DMA channel nodes:
+ - compatible : compatible list, contains 2 entries, first is
+ "fsl,CHIP-dma-channel", where CHIP is the processor
+ (mpc8349, mpc8350, etc.) and the second is
+ "fsl,elo-dma-channel". However, see note below.
+ - reg : <registers mapping for channel>
+ - cell-index : dma channel index starts at 0.
+
+Optional properties:
+ - interrupts : <interrupt mapping for DMA channel IRQ>
+ (on 83xx this is expected to be identical to
+ the interrupts property of the parent node)
+ - interrupt-parent : optional, if needed for interrupt mapping
+
+Example:
+ dma@82a8 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "fsl,mpc8349-dma", "fsl,elo-dma";
+ reg = <82a8 4>;
+ ranges = <0 8100 1a4>;
+ interrupt-parent = <&ipic>;
+ interrupts = <47 8>;
+ cell-index = <0>;
+ dma-channel@0 {
+ compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
+ cell-index = <0>;
+ reg = <0 80>;
+ };
+ dma-channel@80 {
+ compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
+ cell-index = <1>;
+ reg = <80 80>;
+ };
+ dma-channel@100 {
+ compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
+ cell-index = <2>;
+ reg = <100 80>;
+ };
+ dma-channel@180 {
+ compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
+ cell-index = <3>;
+ reg = <180 80>;
+ };
+ };
+
+* Freescale 85xx/86xx DMA Controller
+
+Freescale PowerPC 85xx/86xx have on chip general purpose DMA controllers.
+
+Required properties:
+
+- compatible : compatible list, contains 2 entries, first is
+ "fsl,CHIP-dma", where CHIP is the processor
+ (mpc8540, mpc8540, etc.) and the second is
+ "fsl,eloplus-dma"
+- reg : <registers mapping for DMA general status reg>
+- cell-index : controller index. 0 for controller @ 0x21000,
+ 1 for controller @ 0xc000
+- ranges : Should be defined as specified in 1) to describe the
+ DMA controller channels.
+
+- DMA channel nodes:
+ - compatible : compatible list, contains 2 entries, first is
+ "fsl,CHIP-dma-channel", where CHIP is the processor
+ (mpc8540, mpc8560, etc.) and the second is
+ "fsl,eloplus-dma-channel". However, see note below.
+ - cell-index : dma channel index starts at 0.
+ - reg : <registers mapping for channel>
+ - interrupts : <interrupt mapping for DMA channel IRQ>
+ - interrupt-parent : optional, if needed for interrupt mapping
+
+Example:
+ dma@21300 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "fsl,mpc8540-dma", "fsl,eloplus-dma";
+ reg = <21300 4>;
+ ranges = <0 21100 200>;
+ cell-index = <0>;
+ dma-channel@0 {
+ compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
+ reg = <0 80>;
+ cell-index = <0>;
+ interrupt-parent = <&mpic>;
+ interrupts = <14 2>;
+ };
+ dma-channel@80 {
+ compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
+ reg = <80 80>;
+ cell-index = <1>;
+ interrupt-parent = <&mpic>;
+ interrupts = <15 2>;
+ };
+ dma-channel@100 {
+ compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
+ reg = <100 80>;
+ cell-index = <2>;
+ interrupt-parent = <&mpic>;
+ interrupts = <16 2>;
+ };
+ dma-channel@180 {
+ compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
+ reg = <180 80>;
+ cell-index = <3>;
+ interrupt-parent = <&mpic>;
+ interrupts = <17 2>;
+ };
+ };
+
+Note on DMA channel compatible properties: The compatible property must say
+"fsl,elo-dma-channel" or "fsl,eloplus-dma-channel" to be used by the Elo DMA
+driver (fsldma). Any DMA channel used by fsldma cannot be used by another
+DMA driver, such as the SSI sound drivers for the MPC8610. Therefore, any DMA
+channel that should be used for another driver should not use
+"fsl,elo-dma-channel" or "fsl,eloplus-dma-channel". For the SSI drivers, for
+example, the compatible property should be "fsl,ssi-dma-channel". See ssi.txt
+for more information.
diff --git a/Documentation/powerpc/dts-bindings/fsl/gtm.txt b/Documentation/powerpc/dts-bindings/fsl/gtm.txt
new file mode 100644
index 0000000..9a33efd
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/gtm.txt
@@ -0,0 +1,31 @@
+* Freescale General-purpose Timers Module
+
+Required properties:
+ - compatible : should be
+ "fsl,<chip>-gtm", "fsl,gtm" for SOC GTMs
+ "fsl,<chip>-qe-gtm", "fsl,qe-gtm", "fsl,gtm" for QE GTMs
+ "fsl,<chip>-cpm2-gtm", "fsl,cpm2-gtm", "fsl,gtm" for CPM2 GTMs
+ - reg : should contain gtm registers location and length (0x40).
+ - interrupts : should contain four interrupts.
+ - interrupt-parent : interrupt source phandle.
+ - clock-frequency : specifies the frequency driving the timer.
+
+Example:
+
+timer@500 {
+ compatible = "fsl,mpc8360-gtm", "fsl,gtm";
+ reg = <0x500 0x40>;
+ interrupts = <90 8 78 8 84 8 72 8>;
+ interrupt-parent = <&ipic>;
+ /* filled by u-boot */
+ clock-frequency = <0>;
+};
+
+timer@440 {
+ compatible = "fsl,mpc8360-qe-gtm", "fsl,qe-gtm", "fsl,gtm";
+ reg = <0x440 0x40>;
+ interrupts = <12 13 14 15>;
+ interrupt-parent = <&qeic>;
+ /* filled by u-boot */
+ clock-frequency = <0>;
+};
diff --git a/Documentation/powerpc/dts-bindings/fsl/guts.txt b/Documentation/powerpc/dts-bindings/fsl/guts.txt
new file mode 100644
index 0000000..9e7a241
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/guts.txt
@@ -0,0 +1,25 @@
+* Global Utilities Block
+
+The global utilities block controls power management, I/O device
+enabling, power-on-reset configuration monitoring, general-purpose
+I/O signal configuration, alternate function selection for multiplexed
+signals, and clock control.
+
+Required properties:
+
+ - compatible : Should define the compatible device type for
+ global-utilities.
+ - reg : Offset and length of the register set for the device.
+
+Recommended properties:
+
+ - fsl,has-rstcr : Indicates that the global utilities register set
+ contains a functioning "reset control register" (i.e. the board
+ is wired to reset upon setting the HRESET_REQ bit in this register).
+
+Example:
+ global-utilities@e0000 { /* global utilities block */
+ compatible = "fsl,mpc8548-guts";
+ reg = <e0000 1000>;
+ fsl,has-rstcr;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/i2c.txt b/Documentation/powerpc/dts-bindings/fsl/i2c.txt
new file mode 100644
index 0000000..d0ab33e
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/i2c.txt
@@ -0,0 +1,32 @@
+* I2C
+
+Required properties :
+
+ - device_type : Should be "i2c"
+ - reg : Offset and length of the register set for the device
+
+Recommended properties :
+
+ - compatible : Should be "fsl-i2c" for parts compatible with
+ Freescale I2C specifications.
+ - interrupts : <a b> where a is the interrupt number and b is a
+ field that represents an encoding of the sense and level
+ information for the interrupt. This should be encoded based on
+ the information in section 2) depending on the type of interrupt
+ controller you have.
+ - interrupt-parent : the phandle for the interrupt controller that
+ services interrupts for this device.
+ - dfsrr : boolean; if defined, indicates that this I2C device has
+ a digital filter sampling rate register
+ - fsl5200-clocking : boolean; if defined, indicated that this device
+ uses the FSL 5200 clocking mechanism.
+
+Example :
+ i2c@3000 {
+ interrupt-parent = <40000>;
+ interrupts = <1b 3>;
+ reg = <3000 18>;
+ device_type = "i2c";
+ compatible = "fsl-i2c";
+ dfsrr;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/lbc.txt b/Documentation/powerpc/dts-bindings/fsl/lbc.txt
new file mode 100644
index 0000000..3300fec
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/lbc.txt
@@ -0,0 +1,35 @@
+* Chipselect/Local Bus
+
+Properties:
+- name : Should be localbus
+- #address-cells : Should be either two or three. The first cell is the
+ chipselect number, and the remaining cells are the
+ offset into the chipselect.
+- #size-cells : Either one or two, depending on how large each chipselect
+ can be.
+- ranges : Each range corresponds to a single chipselect, and cover
+ the entire access window as configured.
+
+Example:
+ localbus@f0010100 {
+ compatible = "fsl,mpc8272-localbus",
+ "fsl,pq2-localbus";
+ #address-cells = <2>;
+ #size-cells = <1>;
+ reg = <f0010100 40>;
+
+ ranges = <0 0 fe000000 02000000
+ 1 0 f4500000 00008000>;
+
+ flash@0,0 {
+ compatible = "jedec-flash";
+ reg = <0 0 2000000>;
+ bank-width = <4>;
+ device-width = <1>;
+ };
+
+ board-control@1,0 {
+ reg = <1 0 20>;
+ compatible = "fsl,mpc8272ads-bcsr";
+ };
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/mcu-mpc8349emitx.txt b/Documentation/powerpc/dts-bindings/fsl/mcu-mpc8349emitx.txt
new file mode 100644
index 0000000..0f76633
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/mcu-mpc8349emitx.txt
@@ -0,0 +1,17 @@
+Freescale MPC8349E-mITX-compatible Power Management Micro Controller Unit (MCU)
+
+Required properties:
+- compatible : "fsl,<mcu-chip>-<board>", "fsl,mcu-mpc8349emitx".
+- reg : should specify I2C address (0x0a).
+- #gpio-cells : should be 2.
+- gpio-controller : should be present.
+
+Example:
+
+mcu@0a {
+ #gpio-cells = <2>;
+ compatible = "fsl,mc9s08qg8-mpc8349emitx",
+ "fsl,mcu-mpc8349emitx";
+ reg = <0x0a>;
+ gpio-controller;
+};
diff --git a/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt b/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt
new file mode 100644
index 0000000..b26b919
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt
@@ -0,0 +1,36 @@
+* Freescale MSI interrupt controller
+
+Reguired properities:
+- compatible : compatible list, contains 2 entries,
+ first is "fsl,CHIP-msi", where CHIP is the processor(mpc8610, mpc8572,
+ etc.) and the second is "fsl,mpic-msi" or "fsl,ipic-msi" depending on
+ the parent type.
+- reg : should contain the address and the length of the shared message
+ interrupt register set.
+- msi-available-ranges: use <start count> style section to define which
+ msi interrupt can be used in the 256 msi interrupts. This property is
+ optional, without this, all the 256 MSI interrupts can be used.
+- interrupts : each one of the interrupts here is one entry per 32 MSIs,
+ and routed to the host interrupt controller. the interrupts should
+ be set as edge sensitive.
+- interrupt-parent: the phandle for the interrupt controller
+ that services interrupts for this device. for 83xx cpu, the interrupts
+ are routed to IPIC, and for 85xx/86xx cpu the interrupts are routed
+ to MPIC.
+
+Example:
+ msi@41600 {
+ compatible = "fsl,mpc8610-msi", "fsl,mpic-msi";
+ reg = <0x41600 0x80>;
+ msi-available-ranges = <0 0x100>;
+ interrupts = <
+ 0xe0 0
+ 0xe1 0
+ 0xe2 0
+ 0xe3 0
+ 0xe4 0
+ 0xe5 0
+ 0xe6 0
+ 0xe7 0>;
+ interrupt-parent = <&mpic>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/pmc.txt b/Documentation/powerpc/dts-bindings/fsl/pmc.txt
new file mode 100644
index 0000000..02f6f43
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/pmc.txt
@@ -0,0 +1,63 @@
+* Power Management Controller
+
+Properties:
+- compatible: "fsl,<chip>-pmc".
+
+ "fsl,mpc8349-pmc" should be listed for any chip whose PMC is
+ compatible. "fsl,mpc8313-pmc" should also be listed for any chip
+ whose PMC is compatible, and implies deep-sleep capability.
+
+ "fsl,mpc8548-pmc" should be listed for any chip whose PMC is
+ compatible. "fsl,mpc8536-pmc" should also be listed for any chip
+ whose PMC is compatible, and implies deep-sleep capability.
+
+ "fsl,mpc8641d-pmc" should be listed for any chip whose PMC is
+ compatible; all statements below that apply to "fsl,mpc8548-pmc" also
+ apply to "fsl,mpc8641d-pmc".
+
+ Compatibility does not include bit assigments in SCCR/PMCDR/DEVDISR; these
+ bit assigments are indicated via the sleep specifier in each device's
+ sleep property.
+
+- reg: For devices compatible with "fsl,mpc8349-pmc", the first resource
+ is the PMC block, and the second resource is the Clock Configuration
+ block.
+
+ For devices compatible with "fsl,mpc8548-pmc", the first resource
+ is a 32-byte block beginning with DEVDISR.
+
+- interrupts: For "fsl,mpc8349-pmc"-compatible devices, the first
+ resource is the PMC block interrupt.
+
+- fsl,mpc8313-wakeup-timer: For "fsl,mpc8313-pmc"-compatible devices,
+ this is a phandle to an "fsl,gtm" node on which timer 4 can be used as
+ a wakeup source from deep sleep.
+
+Sleep specifiers:
+
+ fsl,mpc8349-pmc: Sleep specifiers consist of one cell. For each bit
+ that is set in the cell, the corresponding bit in SCCR will be saved
+ and cleared on suspend, and restored on resume. This sleep controller
+ supports disabling and resuming devices at any time.
+
+ fsl,mpc8536-pmc: Sleep specifiers consist of three cells, the third of
+ which will be ORed into PMCDR upon suspend, and cleared from PMCDR
+ upon resume. The first two cells are as described for fsl,mpc8578-pmc.
+ This sleep controller only supports disabling devices during system
+ sleep, or permanently.
+
+ fsl,mpc8548-pmc: Sleep specifiers consist of one or two cells, the
+ first of which will be ORed into DEVDISR (and the second into
+ DEVDISR2, if present -- this cell should be zero or absent if the
+ hardware does not have DEVDISR2) upon a request for permanent device
+ disabling. This sleep controller does not support configuring devices
+ to disable during system sleep (unless supported by another compatible
+ match), or dynamically.
+
+Example:
+
+ power@b00 {
+ compatible = "fsl,mpc8313-pmc", "fsl,mpc8349-pmc";
+ reg = <0xb00 0x100 0xa00 0x100>;
+ interrupts = <80 8>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/sata.txt b/Documentation/powerpc/dts-bindings/fsl/sata.txt
new file mode 100644
index 0000000..b46bcf4
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/sata.txt
@@ -0,0 +1,29 @@
+* Freescale 8xxx/3.0 Gb/s SATA nodes
+
+SATA nodes are defined to describe on-chip Serial ATA controllers.
+Each SATA port should have its own node.
+
+Required properties:
+- compatible : compatible list, contains 2 entries, first is
+ "fsl,CHIP-sata", where CHIP is the processor
+ (mpc8315, mpc8379, etc.) and the second is
+ "fsl,pq-sata"
+- interrupts : <interrupt mapping for SATA IRQ>
+- cell-index : controller index.
+ 1 for controller @ 0x18000
+ 2 for controller @ 0x19000
+ 3 for controller @ 0x1a000
+ 4 for controller @ 0x1b000
+
+Optional properties:
+- interrupt-parent : optional, if needed for interrupt mapping
+- reg : <registers mapping>
+
+Example:
+ sata@18000 {
+ compatible = "fsl,mpc8379-sata", "fsl,pq-sata";
+ reg = <0x18000 0x1000>;
+ cell-index = <1>;
+ interrupts = <2c 8>;
+ interrupt-parent = < &ipic >;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/sec.txt b/Documentation/powerpc/dts-bindings/fsl/sec.txt
new file mode 100644
index 0000000..2b6f2d4
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/sec.txt
@@ -0,0 +1,68 @@
+Freescale SoC SEC Security Engines
+
+Required properties:
+
+- compatible : Should contain entries for this and backward compatible
+ SEC versions, high to low, e.g., "fsl,sec2.1", "fsl,sec2.0"
+- reg : Offset and length of the register set for the device
+- interrupts : the SEC's interrupt number
+- fsl,num-channels : An integer representing the number of channels
+ available.
+- fsl,channel-fifo-len : An integer representing the number of
+ descriptor pointers each channel fetch fifo can hold.
+- fsl,exec-units-mask : The bitmask representing what execution units
+ (EUs) are available. It's a single 32-bit cell. EU information
+ should be encoded following the SEC's Descriptor Header Dword
+ EU_SEL0 field documentation, i.e. as follows:
+
+ bit 0 = reserved - should be 0
+ bit 1 = set if SEC has the ARC4 EU (AFEU)
+ bit 2 = set if SEC has the DES/3DES EU (DEU)
+ bit 3 = set if SEC has the message digest EU (MDEU/MDEU-A)
+ bit 4 = set if SEC has the random number generator EU (RNG)
+ bit 5 = set if SEC has the public key EU (PKEU)
+ bit 6 = set if SEC has the AES EU (AESU)
+ bit 7 = set if SEC has the Kasumi EU (KEU)
+ bit 8 = set if SEC has the CRC EU (CRCU)
+ bit 11 = set if SEC has the message digest EU extended alg set (MDEU-B)
+
+remaining bits are reserved for future SEC EUs.
+
+- fsl,descriptor-types-mask : The bitmask representing what descriptors
+ are available. It's a single 32-bit cell. Descriptor type information
+ should be encoded following the SEC's Descriptor Header Dword DESC_TYPE
+ field documentation, i.e. as follows:
+
+ bit 0 = set if SEC supports the aesu_ctr_nonsnoop desc. type
+ bit 1 = set if SEC supports the ipsec_esp descriptor type
+ bit 2 = set if SEC supports the common_nonsnoop desc. type
+ bit 3 = set if SEC supports the 802.11i AES ccmp desc. type
+ bit 4 = set if SEC supports the hmac_snoop_no_afeu desc. type
+ bit 5 = set if SEC supports the srtp descriptor type
+ bit 6 = set if SEC supports the non_hmac_snoop_no_afeu desc.type
+ bit 7 = set if SEC supports the pkeu_assemble descriptor type
+ bit 8 = set if SEC supports the aesu_key_expand_output desc.type
+ bit 9 = set if SEC supports the pkeu_ptmul descriptor type
+ bit 10 = set if SEC supports the common_nonsnoop_afeu desc. type
+ bit 11 = set if SEC supports the pkeu_ptadd_dbl descriptor type
+
+ ..and so on and so forth.
+
+Optional properties:
+
+- interrupt-parent : the phandle for the interrupt controller that
+ services interrupts for this device.
+
+Example:
+
+ /* MPC8548E */
+ crypto@30000 {
+ compatible = "fsl,sec2.1", "fsl,sec2.0";
+ reg = <0x30000 0x10000>;
+ interrupts = <29 2>;
+ interrupt-parent = <&mpic>;
+ fsl,num-channels = <4>;
+ fsl,channel-fifo-len = <24>;
+ fsl,exec-units-mask = <0xfe>;
+ fsl,descriptor-types-mask = <0x12b0ebf>;
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/spi.txt b/Documentation/powerpc/dts-bindings/fsl/spi.txt
new file mode 100644
index 0000000..e7d9a34
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/spi.txt
@@ -0,0 +1,24 @@
+* SPI (Serial Peripheral Interface)
+
+Required properties:
+- cell-index : SPI controller index.
+- compatible : should be "fsl,spi".
+- mode : the SPI operation mode, it can be "cpu" or "cpu-qe".
+- reg : Offset and length of the register set for the device
+- interrupts : <a b> where a is the interrupt number and b is a
+ field that represents an encoding of the sense and level
+ information for the interrupt. This should be encoded based on
+ the information in section 2) depending on the type of interrupt
+ controller you have.
+- interrupt-parent : the phandle for the interrupt controller that
+ services interrupts for this device.
+
+Example:
+ spi@4c0 {
+ cell-index = <0>;
+ compatible = "fsl,spi";
+ reg = <4c0 40>;
+ interrupts = <82 0>;
+ interrupt-parent = <700>;
+ mode = "cpu";
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/ssi.txt b/Documentation/powerpc/dts-bindings/fsl/ssi.txt
new file mode 100644
index 0000000..a2d9639
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/ssi.txt
@@ -0,0 +1,61 @@
+Freescale Synchronous Serial Interface
+
+The SSI is a serial device that communicates with audio codecs. It can
+be programmed in AC97, I2S, left-justified, or right-justified modes.
+
+Required properties:
+- compatible : compatible list, containing "fsl,ssi"
+- cell-index : the SSI, <0> = SSI1, <1> = SSI2, and so on
+- reg : offset and length of the register set for the device
+- interrupts : <a b> where a is the interrupt number and b is a
+ field that represents an encoding of the sense and
+ level information for the interrupt. This should be
+ encoded based on the information in section 2)
+ depending on the type of interrupt controller you
+ have.
+- interrupt-parent : the phandle for the interrupt controller that
+ services interrupts for this device.
+- fsl,mode : the operating mode for the SSI interface
+ "i2s-slave" - I2S mode, SSI is clock slave
+ "i2s-master" - I2S mode, SSI is clock master
+ "lj-slave" - left-justified mode, SSI is clock slave
+ "lj-master" - l.j. mode, SSI is clock master
+ "rj-slave" - right-justified mode, SSI is clock slave
+ "rj-master" - r.j., SSI is clock master
+ "ac97-slave" - AC97 mode, SSI is clock slave
+ "ac97-master" - AC97 mode, SSI is clock master
+- fsl,playback-dma: phandle to a node for the DMA channel to use for
+ playback of audio. This is typically dictated by SOC
+ design. See the notes below.
+- fsl,capture-dma: phandle to a node for the DMA channel to use for
+ capture (recording) of audio. This is typically dictated
+ by SOC design. See the notes below.
+
+Optional properties:
+- codec-handle : phandle to a 'codec' node that defines an audio
+ codec connected to this SSI. This node is typically
+ a child of an I2C or other control node.
+
+Child 'codec' node required properties:
+- compatible : compatible list, contains the name of the codec
+
+Child 'codec' node optional properties:
+- clock-frequency : The frequency of the input clock, which typically
+ comes from an on-board dedicated oscillator.
+
+Notes on fsl,playback-dma and fsl,capture-dma:
+
+On SOCs that have an SSI, specific DMA channels are hard-wired for playback
+and capture. On the MPC8610, for example, SSI1 must use DMA channel 0 for
+playback and DMA channel 1 for capture. SSI2 must use DMA channel 2 for
+playback and DMA channel 3 for capture. The developer can choose which
+DMA controller to use, but the channels themselves are hard-wired. The
+purpose of these two properties is to represent this hardware design.
+
+The device tree nodes for the DMA channels that are referenced by
+"fsl,playback-dma" and "fsl,capture-dma" must be marked as compatible with
+"fsl,ssi-dma-channel". The SOC-specific compatible string (e.g.
+"fsl,mpc8610-dma-channel") can remain. If these nodes are left as
+"fsl,elo-dma-channel" or "fsl,eloplus-dma-channel", then the generic Elo DMA
+drivers (fsldma) will attempt to use them, and it will conflict with the
+sound drivers.
diff --git a/Documentation/powerpc/dts-bindings/fsl/tsec.txt b/Documentation/powerpc/dts-bindings/fsl/tsec.txt
new file mode 100644
index 0000000..cf55fa4
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/tsec.txt
@@ -0,0 +1,62 @@
+* MDIO IO device
+
+The MDIO is a bus to which the PHY devices are connected. For each
+device that exists on this bus, a child node should be created. See
+the definition of the PHY node below for an example of how to define
+a PHY.
+
+Required properties:
+ - reg : Offset and length of the register set for the device
+ - compatible : Should define the compatible device type for the
+ mdio. Currently, this is most likely to be "fsl,gianfar-mdio"
+
+Example:
+
+ mdio@24520 {
+ reg = <24520 20>;
+ compatible = "fsl,gianfar-mdio";
+
+ ethernet-phy@0 {
+ ......
+ };
+ };
+
+
+* Gianfar-compatible ethernet nodes
+
+Properties:
+
+ - device_type : Should be "network"
+ - model : Model of the device. Can be "TSEC", "eTSEC", or "FEC"
+ - compatible : Should be "gianfar"
+ - reg : Offset and length of the register set for the device
+ - local-mac-address : List of bytes representing the ethernet address of
+ this controller
+ - interrupts : For FEC devices, the first interrupt is the device's
+ interrupt. For TSEC and eTSEC devices, the first interrupt is
+ transmit, the second is receive, and the third is error.
+ - phy-handle : The phandle for the PHY connected to this ethernet
+ controller.
+ - fixed-link : <a b c d e> where a is emulated phy id - choose any,
+ but unique to the all specified fixed-links, b is duplex - 0 half,
+ 1 full, c is link speed - d#10/d#100/d#1000, d is pause - 0 no
+ pause, 1 pause, e is asym_pause - 0 no asym_pause, 1 asym_pause.
+ - phy-connection-type : a string naming the controller/PHY interface type,
+ i.e., "mii" (default), "rmii", "gmii", "rgmii", "rgmii-id", "sgmii",
+ "tbi", or "rtbi". This property is only really needed if the connection
+ is of type "rgmii-id", as all other connection types are detected by
+ hardware.
+ - fsl,magic-packet : If present, indicates that the hardware supports
+ waking up via magic packet.
+
+Example:
+ ethernet@24000 {
+ device_type = "network";
+ model = "TSEC";
+ compatible = "gianfar";
+ reg = <0x24000 0x1000>;
+ local-mac-address = [ 00 E0 0C 00 73 00 ];
+ interrupts = <29 2 30 2 34 2>;
+ interrupt-parent = <&mpic>;
+ phy-handle = <&phy0>
+ };
diff --git a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
new file mode 100644
index 0000000..84a04d5
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
@@ -0,0 +1,28 @@
+Freescale Localbus UPM programmed to work with NAND flash
+
+Required properties:
+- compatible : "fsl,upm-nand".
+- reg : should specify localbus chip select and size used for the chip.
+- fsl,upm-addr-offset : UPM pattern offset for the address latch.
+- fsl,upm-cmd-offset : UPM pattern offset for the command latch.
+- gpios : may specify optional GPIO connected to the Ready-Not-Busy pin.
+
+Example:
+
+upm@1,0 {
+ compatible = "fsl,upm-nand";
+ reg = <1 0 1>;
+ fsl,upm-addr-offset = <16>;
+ fsl,upm-cmd-offset = <8>;
+ gpios = <&qe_pio_e 18 0>;
+
+ flash {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "...";
+
+ partition@0 {
+ ...
+ };
+ };
+};
diff --git a/Documentation/powerpc/dts-bindings/fsl/usb.txt b/Documentation/powerpc/dts-bindings/fsl/usb.txt
new file mode 100644
index 0000000..b001524
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/usb.txt
@@ -0,0 +1,59 @@
+Freescale SOC USB controllers
+
+The device node for a USB controller that is part of a Freescale
+SOC is as described in the document "Open Firmware Recommended
+Practice : Universal Serial Bus" with the following modifications
+and additions :
+
+Required properties :
+ - compatible : Should be "fsl-usb2-mph" for multi port host USB
+ controllers, or "fsl-usb2-dr" for dual role USB controllers
+ - phy_type : For multi port host USB controllers, should be one of
+ "ulpi", or "serial". For dual role USB controllers, should be
+ one of "ulpi", "utmi", "utmi_wide", or "serial".
+ - reg : Offset and length of the register set for the device
+ - port0 : boolean; if defined, indicates port0 is connected for
+ fsl-usb2-mph compatible controllers. Either this property or
+ "port1" (or both) must be defined for "fsl-usb2-mph" compatible
+ controllers.
+ - port1 : boolean; if defined, indicates port1 is connected for
+ fsl-usb2-mph compatible controllers. Either this property or
+ "port0" (or both) must be defined for "fsl-usb2-mph" compatible
+ controllers.
+ - dr_mode : indicates the working mode for "fsl-usb2-dr" compatible
+ controllers. Can be "host", "peripheral", or "otg". Default to
+ "host" if not defined for backward compatibility.
+
+Recommended properties :
+ - interrupts : <a b> where a is the interrupt number and b is a
+ field that represents an encoding of the sense and level
+ information for the interrupt. This should be encoded based on
+ the information in section 2) depending on the type of interrupt
+ controller you have.
+ - interrupt-parent : the phandle for the interrupt controller that
+ services interrupts for this device.
+
+Example multi port host USB controller device node :
+ usb@22000 {
+ compatible = "fsl-usb2-mph";
+ reg = <22000 1000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ interrupt-parent = <700>;
+ interrupts = <27 1>;
+ phy_type = "ulpi";
+ port0;
+ port1;
+ };
+
+Example dual role USB controller device node :
+ usb@23000 {
+ compatible = "fsl-usb2-dr";
+ reg = <23000 1000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ interrupt-parent = <700>;
+ interrupts = <26 1>;
+ dr_mode = "otg";
+ phy = "ulpi";
+ };
diff --git a/Documentation/powerpc/dts-bindings/gpio/led.txt b/Documentation/powerpc/dts-bindings/gpio/led.txt
new file mode 100644
index 0000000..ff51f4c
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/gpio/led.txt
@@ -0,0 +1,15 @@
+LED connected to GPIO
+
+Required properties:
+- compatible : should be "gpio-led".
+- label : (optional) the label for this LED. If omitted, the label is
+ taken from the node name (excluding the unit address).
+- gpios : should specify LED GPIO.
+
+Example:
+
+led@0 {
+ compatible = "gpio-led";
+ label = "hdd";
+ gpios = <&mcu_pio 0 1>;
+};
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
new file mode 100644
index 0000000..9d4e33d
--- /dev/null
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -0,0 +1,334 @@
+
+
+ PCI Bus EEH Error Recovery
+ --------------------------
+ Linas Vepstas
+ <linas@austin.ibm.com>
+ 12 January 2005
+
+
+Overview:
+---------
+The IBM POWER-based pSeries and iSeries computers include PCI bus
+controller chips that have extended capabilities for detecting and
+reporting a large variety of PCI bus error conditions. These features
+go under the name of "EEH", for "Extended Error Handling". The EEH
+hardware features allow PCI bus errors to be cleared and a PCI
+card to be "rebooted", without also having to reboot the operating
+system.
+
+This is in contrast to traditional PCI error handling, where the
+PCI chip is wired directly to the CPU, and an error would cause
+a CPU machine-check/check-stop condition, halting the CPU entirely.
+Another "traditional" technique is to ignore such errors, which
+can lead to data corruption, both of user data or of kernel data,
+hung/unresponsive adapters, or system crashes/lockups. Thus,
+the idea behind EEH is that the operating system can become more
+reliable and robust by protecting it from PCI errors, and giving
+the OS the ability to "reboot"/recover individual PCI devices.
+
+Future systems from other vendors, based on the PCI-E specification,
+may contain similar features.
+
+
+Causes of EEH Errors
+--------------------
+EEH was originally designed to guard against hardware failure, such
+as PCI cards dying from heat, humidity, dust, vibration and bad
+electrical connections. The vast majority of EEH errors seen in
+"real life" are due to either poorly seated PCI cards, or,
+unfortunately quite commonly, due to device driver bugs, device firmware
+bugs, and sometimes PCI card hardware bugs.
+
+The most common software bug, is one that causes the device to
+attempt to DMA to a location in system memory that has not been
+reserved for DMA access for that card. This is a powerful feature,
+as it prevents what; otherwise, would have been silent memory
+corruption caused by the bad DMA. A number of device driver
+bugs have been found and fixed in this way over the past few
+years. Other possible causes of EEH errors include data or
+address line parity errors (for example, due to poor electrical
+connectivity due to a poorly seated card), and PCI-X split-completion
+errors (due to software, device firmware, or device PCI hardware bugs).
+The vast majority of "true hardware failures" can be cured by
+physically removing and re-seating the PCI card.
+
+
+Detection and Recovery
+----------------------
+In the following discussion, a generic overview of how to detect
+and recover from EEH errors will be presented. This is followed
+by an overview of how the current implementation in the Linux
+kernel does it. The actual implementation is subject to change,
+and some of the finer points are still being debated. These
+may in turn be swayed if or when other architectures implement
+similar functionality.
+
+When a PCI Host Bridge (PHB, the bus controller connecting the
+PCI bus to the system CPU electronics complex) detects a PCI error
+condition, it will "isolate" the affected PCI card. Isolation
+will block all writes (either to the card from the system, or
+from the card to the system), and it will cause all reads to
+return all-ff's (0xff, 0xffff, 0xffffffff for 8/16/32-bit reads).
+This value was chosen because it is the same value you would
+get if the device was physically unplugged from the slot.
+This includes access to PCI memory, I/O space, and PCI config
+space. Interrupts; however, will continued to be delivered.
+
+Detection and recovery are performed with the aid of ppc64
+firmware. The programming interfaces in the Linux kernel
+into the firmware are referred to as RTAS (Run-Time Abstraction
+Services). The Linux kernel does not (should not) access
+the EEH function in the PCI chipsets directly, primarily because
+there are a number of different chipsets out there, each with
+different interfaces and quirks. The firmware provides a
+uniform abstraction layer that will work with all pSeries
+and iSeries hardware (and be forwards-compatible).
+
+If the OS or device driver suspects that a PCI slot has been
+EEH-isolated, there is a firmware call it can make to determine if
+this is the case. If so, then the device driver should put itself
+into a consistent state (given that it won't be able to complete any
+pending work) and start recovery of the card. Recovery normally
+would consist of resetting the PCI device (holding the PCI #RST
+line high for two seconds), followed by setting up the device
+config space (the base address registers (BAR's), latency timer,
+cache line size, interrupt line, and so on). This is followed by a
+reinitialization of the device driver. In a worst-case scenario,
+the power to the card can be toggled, at least on hot-plug-capable
+slots. In principle, layers far above the device driver probably
+do not need to know that the PCI card has been "rebooted" in this
+way; ideally, there should be at most a pause in Ethernet/disk/USB
+I/O while the card is being reset.
+
+If the card cannot be recovered after three or four resets, the
+kernel/device driver should assume the worst-case scenario, that the
+card has died completely, and report this error to the sysadmin.
+In addition, error messages are reported through RTAS and also through
+syslogd (/var/log/messages) to alert the sysadmin of PCI resets.
+The correct way to deal with failed adapters is to use the standard
+PCI hotplug tools to remove and replace the dead card.
+
+
+Current PPC64 Linux EEH Implementation
+--------------------------------------
+At this time, a generic EEH recovery mechanism has been implemented,
+so that individual device drivers do not need to be modified to support
+EEH recovery. This generic mechanism piggy-backs on the PCI hotplug
+infrastructure, and percolates events up through the userspace/udev
+infrastructure. Following is a detailed description of how this is
+accomplished.
+
+EEH must be enabled in the PHB's very early during the boot process,
+and if a PCI slot is hot-plugged. The former is performed by
+eeh_init() in arch/powerpc/platforms/pseries/eeh.c, and the later by
+drivers/pci/hotplug/pSeries_pci.c calling in to the eeh.c code.
+EEH must be enabled before a PCI scan of the device can proceed.
+Current Power5 hardware will not work unless EEH is enabled;
+although older Power4 can run with it disabled. Effectively,
+EEH can no longer be turned off. PCI devices *must* be
+registered with the EEH code; the EEH code needs to know about
+the I/O address ranges of the PCI device in order to detect an
+error. Given an arbitrary address, the routine
+pci_get_device_by_addr() will find the pci device associated
+with that address (if any).
+
+The default arch/powerpc/include/asm/io.h macros readb(), inb(), insb(),
+etc. include a check to see if the i/o read returned all-0xff's.
+If so, these make a call to eeh_dn_check_failure(), which in turn
+asks the firmware if the all-ff's value is the sign of a true EEH
+error. If it is not, processing continues as normal. The grand
+total number of these false alarms or "false positives" can be
+seen in /proc/ppc64/eeh (subject to change). Normally, almost
+all of these occur during boot, when the PCI bus is scanned, where
+a large number of 0xff reads are part of the bus scan procedure.
+
+If a frozen slot is detected, code in
+arch/powerpc/platforms/pseries/eeh.c will print a stack trace to
+syslog (/var/log/messages). This stack trace has proven to be very
+useful to device-driver authors for finding out at what point the EEH
+error was detected, as the error itself usually occurs slightly
+beforehand.
+
+Next, it uses the Linux kernel notifier chain/work queue mechanism to
+allow any interested parties to find out about the failure. Device
+drivers, or other parts of the kernel, can use
+eeh_register_notifier(struct notifier_block *) to find out about EEH
+events. The event will include a pointer to the pci device, the
+device node and some state info. Receivers of the event can "do as
+they wish"; the default handler will be described further in this
+section.
+
+To assist in the recovery of the device, eeh.c exports the
+following functions:
+
+rtas_set_slot_reset() -- assert the PCI #RST line for 1/8th of a second
+rtas_configure_bridge() -- ask firmware to configure any PCI bridges
+ located topologically under the pci slot.
+eeh_save_bars() and eeh_restore_bars(): save and restore the PCI
+ config-space info for a device and any devices under it.
+
+
+A handler for the EEH notifier_block events is implemented in
+drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events().
+It saves the device BAR's and then calls rpaphp_unconfig_pci_adapter().
+This last call causes the device driver for the card to be stopped,
+which causes uevents to go out to user space. This triggers
+user-space scripts that might issue commands such as "ifdown eth0"
+for ethernet cards, and so on. This handler then sleeps for 5 seconds,
+hoping to give the user-space scripts enough time to complete.
+It then resets the PCI card, reconfigures the device BAR's, and
+any bridges underneath. It then calls rpaphp_enable_pci_slot(),
+which restarts the device driver and triggers more user-space
+events (for example, calling "ifup eth0" for ethernet cards).
+
+
+Device Shutdown and User-Space Events
+-------------------------------------
+This section documents what happens when a pci slot is unconfigured,
+focusing on how the device driver gets shut down, and on how the
+events get delivered to user-space scripts.
+
+Following is an example sequence of events that cause a device driver
+close function to be called during the first phase of an EEH reset.
+The following sequence is an example of the pcnet32 device driver.
+
+ rpa_php_unconfig_pci_adapter (struct slot *) // in rpaphp_pci.c
+ {
+ calls
+ pci_remove_bus_device (struct pci_dev *) // in /drivers/pci/remove.c
+ {
+ calls
+ pci_destroy_dev (struct pci_dev *)
+ {
+ calls
+ device_unregister (&dev->dev) // in /drivers/base/core.c
+ {
+ calls
+ device_del (struct device *)
+ {
+ calls
+ bus_remove_device() // in /drivers/base/bus.c
+ {
+ calls
+ device_release_driver()
+ {
+ calls
+ struct device_driver->remove() which is just
+ pci_device_remove() // in /drivers/pci/pci_driver.c
+ {
+ calls
+ struct pci_driver->remove() which is just
+ pcnet32_remove_one() // in /drivers/net/pcnet32.c
+ {
+ calls
+ unregister_netdev() // in /net/core/dev.c
+ {
+ calls
+ dev_close() // in /net/core/dev.c
+ {
+ calls dev->stop();
+ which is just pcnet32_close() // in pcnet32.c
+ {
+ which does what you wanted
+ to stop the device
+ }
+ }
+ }
+ which
+ frees pcnet32 device driver memory
+ }
+ }}}}}}
+
+
+ in drivers/pci/pci_driver.c,
+ struct device_driver->remove() is just pci_device_remove()
+ which calls struct pci_driver->remove() which is pcnet32_remove_one()
+ which calls unregister_netdev() (in net/core/dev.c)
+ which calls dev_close() (in net/core/dev.c)
+ which calls dev->stop() which is pcnet32_close()
+ which then does the appropriate shutdown.
+
+---
+Following is the analogous stack trace for events sent to user-space
+when the pci device is unconfigured.
+
+rpa_php_unconfig_pci_adapter() { // in rpaphp_pci.c
+ calls
+ pci_remove_bus_device (struct pci_dev *) { // in /drivers/pci/remove.c
+ calls
+ pci_destroy_dev (struct pci_dev *) {
+ calls
+ device_unregister (&dev->dev) { // in /drivers/base/core.c
+ calls
+ device_del(struct device * dev) { // in /drivers/base/core.c
+ calls
+ kobject_del() { //in /libs/kobject.c
+ calls
+ kobject_uevent() { // in /libs/kobject.c
+ calls
+ kset_uevent() { // in /lib/kobject.c
+ calls
+ kset->uevent_ops->uevent() // which is really just
+ a call to
+ dev_uevent() { // in /drivers/base/core.c
+ calls
+ dev->bus->uevent() which is really just a call to
+ pci_uevent () { // in drivers/pci/hotplug.c
+ which prints device name, etc....
+ }
+ }
+ then kobject_uevent() sends a netlink uevent to userspace
+ --> userspace uevent
+ (during early boot, nobody listens to netlink events and
+ kobject_uevent() executes uevent_helper[], which runs the
+ event process /sbin/hotplug)
+ }
+ }
+ kobject_del() then calls sysfs_remove_dir(), which would
+ trigger any user-space daemon that was watching /sysfs,
+ and notice the delete event.
+
+
+Pro's and Con's of the Current Design
+-------------------------------------
+There are several issues with the current EEH software recovery design,
+which may be addressed in future revisions. But first, note that the
+big plus of the current design is that no changes need to be made to
+individual device drivers, so that the current design throws a wide net.
+The biggest negative of the design is that it potentially disturbs
+network daemons and file systems that didn't need to be disturbed.
+
+-- A minor complaint is that resetting the network card causes
+ user-space back-to-back ifdown/ifup burps that potentially disturb
+ network daemons, that didn't need to even know that the pci
+ card was being rebooted.
+
+-- A more serious concern is that the same reset, for SCSI devices,
+ causes havoc to mounted file systems. Scripts cannot post-facto
+ unmount a file system without flushing pending buffers, but this
+ is impossible, because I/O has already been stopped. Thus,
+ ideally, the reset should happen at or below the block layer,
+ so that the file systems are not disturbed.
+
+ Reiserfs does not tolerate errors returned from the block device.
+ Ext3fs seems to be tolerant, retrying reads/writes until it does
+ succeed. Both have been only lightly tested in this scenario.
+
+ The SCSI-generic subsystem already has built-in code for performing
+ SCSI device resets, SCSI bus resets, and SCSI host-bus-adapter
+ (HBA) resets. These are cascaded into a chain of attempted
+ resets if a SCSI command fails. These are completely hidden
+ from the block layer. It would be very natural to add an EEH
+ reset into this chain of events.
+
+-- If a SCSI error occurs for the root device, all is lost unless
+ the sysadmin had the foresight to run /bin, /sbin, /etc, /var
+ and so on, out of ramdisk/tmpfs.
+
+
+Conclusions
+-----------
+There's forward progress ...
+
+
diff --git a/Documentation/powerpc/hvcs.txt b/Documentation/powerpc/hvcs.txt
new file mode 100644
index 0000000..f93462c
--- /dev/null
+++ b/Documentation/powerpc/hvcs.txt
@@ -0,0 +1,567 @@
+===========================================================================
+ HVCS
+ IBM "Hypervisor Virtual Console Server" Installation Guide
+ for Linux Kernel 2.6.4+
+ Copyright (C) 2004 IBM Corporation
+
+===========================================================================
+NOTE:Eight space tabs are the optimum editor setting for reading this file.
+===========================================================================
+
+ Author(s) : Ryan S. Arnold <rsa@us.ibm.com>
+ Date Created: March, 02, 2004
+ Last Changed: August, 24, 2004
+
+---------------------------------------------------------------------------
+Table of contents:
+
+ 1. Driver Introduction:
+ 2. System Requirements
+ 3. Build Options:
+ 3.1 Built-in:
+ 3.2 Module:
+ 4. Installation:
+ 5. Connection:
+ 6. Disconnection:
+ 7. Configuration:
+ 8. Questions & Answers:
+ 9. Reporting Bugs:
+
+---------------------------------------------------------------------------
+1. Driver Introduction:
+
+This is the device driver for the IBM Hypervisor Virtual Console Server,
+"hvcs". The IBM hvcs provides a tty driver interface to allow Linux user
+space applications access to the system consoles of logically partitioned
+operating systems (Linux and AIX) running on the same partitioned Power5
+ppc64 system. Physical hardware consoles per partition are not practical
+on this hardware so system consoles are accessed by this driver using
+firmware interfaces to virtual terminal devices.
+
+---------------------------------------------------------------------------
+2. System Requirements:
+
+This device driver was written using 2.6.4 Linux kernel APIs and will only
+build and run on kernels of this version or later.
+
+This driver was written to operate solely on IBM Power5 ppc64 hardware
+though some care was taken to abstract the architecture dependent firmware
+calls from the driver code.
+
+Sysfs must be mounted on the system so that the user can determine which
+major and minor numbers are associated with each vty-server. Directions
+for sysfs mounting are outside the scope of this document.
+
+---------------------------------------------------------------------------
+3. Build Options:
+
+The hvcs driver registers itself as a tty driver. The tty layer
+dynamically allocates a block of major and minor numbers in a quantity
+requested by the registering driver. The hvcs driver asks the tty layer
+for 64 of these major/minor numbers by default to use for hvcs device node
+entries.
+
+If the default number of device entries is adequate then this driver can be
+built into the kernel. If not, the default can be over-ridden by inserting
+the driver as a module with insmod parameters.
+
+---------------------------------------------------------------------------
+3.1 Built-in:
+
+The following menuconfig example demonstrates selecting to build this
+driver into the kernel.
+
+ Device Drivers --->
+ Character devices --->
+ <*> IBM Hypervisor Virtual Console Server Support
+
+Begin the kernel make process.
+
+---------------------------------------------------------------------------
+3.2 Module:
+
+The following menuconfig example demonstrates selecting to build this
+driver as a kernel module.
+
+ Device Drivers --->
+ Character devices --->
+ <M> IBM Hypervisor Virtual Console Server Support
+
+The make process will build the following kernel modules:
+
+ hvcs.ko
+ hvcserver.ko
+
+To insert the module with the default allocation execute the following
+commands in the order they appear:
+
+ insmod hvcserver.ko
+ insmod hvcs.ko
+
+The hvcserver module contains architecture specific firmware calls and must
+be inserted first, otherwise the hvcs module will not find some of the
+symbols it expects.
+
+To override the default use an insmod parameter as follows (requesting 4
+tty devices as an example):
+
+ insmod hvcs.ko hvcs_parm_num_devs=4
+
+There is a maximum number of dev entries that can be specified on insmod.
+We think that 1024 is currently a decent maximum number of server adapters
+to allow. This can always be changed by modifying the constant in the
+source file before building.
+
+NOTE: The length of time it takes to insmod the driver seems to be related
+to the number of tty interfaces the registering driver requests.
+
+In order to remove the driver module execute the following command:
+
+ rmmod hvcs.ko
+
+The recommended method for installing hvcs as a module is to use depmod to
+build a current modules.dep file in /lib/modules/`uname -r` and then
+execute:
+
+modprobe hvcs hvcs_parm_num_devs=4
+
+The modules.dep file indicates that hvcserver.ko needs to be inserted
+before hvcs.ko and modprobe uses this file to smartly insert the modules in
+the proper order.
+
+The following modprobe command is used to remove hvcs and hvcserver in the
+proper order:
+
+modprobe -r hvcs
+
+---------------------------------------------------------------------------
+4. Installation:
+
+The tty layer creates sysfs entries which contain the major and minor
+numbers allocated for the hvcs driver. The following snippet of "tree"
+output of the sysfs directory shows where these numbers are presented:
+
+ sys/
+ |-- *other sysfs base dirs*
+ |
+ |-- class
+ | |-- *other classes of devices*
+ | |
+ | `-- tty
+ | |-- *other tty devices*
+ | |
+ | |-- hvcs0
+ | | `-- dev
+ | |-- hvcs1
+ | | `-- dev
+ | |-- hvcs2
+ | | `-- dev
+ | |-- hvcs3
+ | | `-- dev
+ | |
+ | |-- *other tty devices*
+ |
+ |-- *other sysfs base dirs*
+
+For the above examples the following output is a result of cat'ing the
+"dev" entry in the hvcs directory:
+
+ Pow5:/sys/class/tty/hvcs0/ # cat dev
+ 254:0
+
+ Pow5:/sys/class/tty/hvcs1/ # cat dev
+ 254:1
+
+ Pow5:/sys/class/tty/hvcs2/ # cat dev
+ 254:2
+
+ Pow5:/sys/class/tty/hvcs3/ # cat dev
+ 254:3
+
+The output from reading the "dev" attribute is the char device major and
+minor numbers that the tty layer has allocated for this driver's use. Most
+systems running hvcs will already have the device entries created or udev
+will do it automatically.
+
+Given the example output above, to manually create a /dev/hvcs* node entry
+mknod can be used as follows:
+
+ mknod /dev/hvcs0 c 254 0
+ mknod /dev/hvcs1 c 254 1
+ mknod /dev/hvcs2 c 254 2
+ mknod /dev/hvcs3 c 254 3
+
+Using mknod to manually create the device entries makes these device nodes
+persistent. Once created they will exist prior to the driver insmod.
+
+Attempting to connect an application to /dev/hvcs* prior to insertion of
+the hvcs module will result in an error message similar to the following:
+
+ "/dev/hvcs*: No such device".
+
+NOTE: Just because there is a device node present doesn't mean that there
+is a vty-server device configured for that node.
+
+---------------------------------------------------------------------------
+5. Connection
+
+Since this driver controls devices that provide a tty interface a user can
+interact with the device node entries using any standard tty-interactive
+method (e.g. "cat", "dd", "echo"). The intent of this driver however, is
+to provide real time console interaction with a Linux partition's console,
+which requires the use of applications that provide bi-directional,
+interactive I/O with a tty device.
+
+Applications (e.g. "minicom" and "screen") that act as terminal emulators
+or perform terminal type control sequence conversion on the data being
+passed through them are NOT acceptable for providing interactive console
+I/O. These programs often emulate antiquated terminal types (vt100 and
+ANSI) and expect inbound data to take the form of one of these supported
+terminal types but they either do not convert, or do not _adequately_
+convert, outbound data into the terminal type of the terminal which invoked
+them (though screen makes an attempt and can apparently be configured with
+much termcap wrestling.)
+
+For this reason kermit and cu are two of the recommended applications for
+interacting with a Linux console via an hvcs device. These programs simply
+act as a conduit for data transfer to and from the tty device. They do not
+require inbound data to take the form of a particular terminal type, nor do
+they cook outbound data to a particular terminal type.
+
+In order to ensure proper functioning of console applications one must make
+sure that once connected to a /dev/hvcs console that the console's $TERM
+env variable is set to the exact terminal type of the terminal emulator
+used to launch the interactive I/O application. If one is using xterm and
+kermit to connect to /dev/hvcs0 when the console prompt becomes available
+one should "export TERM=xterm" on the console. This tells ncurses
+applications that are invoked from the console that they should output
+control sequences that xterm can understand.
+
+As a precautionary measure an hvcs user should always "exit" from their
+session before disconnecting an application such as kermit from the device
+node. If this is not done, the next user to connect to the console will
+continue using the previous user's logged in session which includes
+using the $TERM variable that the previous user supplied.
+
+Hotplug add and remove of vty-server adapters affects which /dev/hvcs* node
+is used to connect to each vty-server adapter. In order to determine which
+vty-server adapter is associated with which /dev/hvcs* node a special sysfs
+attribute has been added to each vty-server sysfs entry. This entry is
+called "index" and showing it reveals an integer that refers to the
+/dev/hvcs* entry to use to connect to that device. For instance cating the
+index attribute of vty-server adapter 30000004 shows the following.
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat index
+ 2
+
+This index of '2' means that in order to connect to vty-server adapter
+30000004 the user should interact with /dev/hvcs2.
+
+It should be noted that due to the system hotplug I/O capabilities of a
+system the /dev/hvcs* entry that interacts with a particular vty-server
+adapter is not guaranteed to remain the same across system reboots. Look
+in the Q & A section for more on this issue.
+
+---------------------------------------------------------------------------
+6. Disconnection
+
+As a security feature to prevent the delivery of stale data to an
+unintended target the Power5 system firmware disables the fetching of data
+and discards that data when a connection between a vty-server and a vty has
+been severed. As an example, when a vty-server is immediately disconnected
+from a vty following output of data to the vty the vty adapter may not have
+enough time between when it received the data interrupt and when the
+connection was severed to fetch the data from firmware before the fetch is
+disabled by firmware.
+
+When hvcs is being used to serve consoles this behavior is not a huge issue
+because the adapter stays connected for large amounts of time following
+almost all data writes. When hvcs is being used as a tty conduit to tunnel
+data between two partitions [see Q & A below] this is a huge problem
+because the standard Linux behavior when cat'ing or dd'ing data to a device
+is to open the tty, send the data, and then close the tty. If this driver
+manually terminated vty-server connections on tty close this would close
+the vty-server and vty connection before the target vty has had a chance to
+fetch the data.
+
+Additionally, disconnecting a vty-server and vty only on module removal or
+adapter removal is impractical because other vty-servers in other
+partitions may require the usage of the target vty at any time.
+
+Due to this behavioral restriction disconnection of vty-servers from the
+connected vty is a manual procedure using a write to a sysfs attribute
+outlined below, on the other hand the initial vty-server connection to a
+vty is established automatically by this driver. Manual vty-server
+connection is never required.
+
+In order to terminate the connection between a vty-server and vty the
+"vterm_state" sysfs attribute within each vty-server's sysfs entry is used.
+Reading this attribute reveals the current connection state of the
+vty-server adapter. A zero means that the vty-server is not connected to a
+vty. A one indicates that a connection is active.
+
+Writing a '0' (zero) to the vterm_state attribute will disconnect the VTERM
+connection between the vty-server and target vty ONLY if the vterm_state
+previously read '1'. The write directive is ignored if the vterm_state
+read '0' or if any value other than '0' was written to the vterm_state
+attribute. The following example will show the method used for verifying
+the vty-server connection status and disconnecting a vty-server connection.
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state
+ 1
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo 0 > vterm_state
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat vterm_state
+ 0
+
+All vty-server connections are automatically terminated when the device is
+hotplug removed and when the module is removed.
+
+---------------------------------------------------------------------------
+7. Configuration
+
+Each vty-server has a sysfs entry in the /sys/devices/vio directory, which
+is symlinked in several other sysfs tree directories, notably under the
+hvcs driver entry, which looks like the following example:
+
+ Pow5:/sys/bus/vio/drivers/hvcs # ls
+ . .. 30000003 30000004 rescan
+
+By design, firmware notifies the hvcs driver of vty-server lifetimes and
+partner vty removals but not the addition of partner vtys. Since an HMC
+Super Admin can add partner info dynamically we have provided the hvcs
+driver sysfs directory with the "rescan" update attribute which will query
+firmware and update the partner info for all the vty-servers that this
+driver manages. Writing a '1' to the attribute triggers the update. An
+explicit example follows:
+
+ Pow5:/sys/bus/vio/drivers/hvcs # echo 1 > rescan
+
+Reading the attribute will indicate a state of '1' or '0'. A one indicates
+that an update is in process. A zero indicates that an update has
+completed or was never executed.
+
+Vty-server entries in this directory are a 32 bit partition unique unit
+address that is created by firmware. An example vty-server sysfs entry
+looks like the following:
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # ls
+ . current_vty devspec name partner_vtys
+ .. index partner_clcs vterm_state
+
+Each entry is provided, by default with a "name" attribute. Reading the
+"name" attribute will reveal the device type as shown in the following
+example:
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000003 # cat name
+ vty-server
+
+Each entry is also provided, by default, with a "devspec" attribute which
+reveals the full device specification when read, as shown in the following
+example:
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat devspec
+ /vdevice/vty-server@30000004
+
+Each vty-server sysfs dir is provided with two read-only attributes that
+provide lists of easily parsed partner vty data: "partner_vtys" and
+"partner_clcs".
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_vtys
+ 30000000
+ 30000001
+ 30000002
+ 30000000
+ 30000000
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # cat partner_clcs
+ U5112.428.103048A-V3-C0
+ U5112.428.103048A-V3-C2
+ U5112.428.103048A-V3-C3
+ U5112.428.103048A-V4-C0
+ U5112.428.103048A-V5-C0
+
+Reading partner_vtys returns a list of partner vtys. Vty unit address
+numbering is only per-partition-unique so entries will frequently repeat.
+
+Reading partner_clcs returns a list of "converged location codes" which are
+composed of a system serial number followed by "-V*", where the '*' is the
+target partition number, and "-C*", where the '*' is the slot of the
+adapter. The first vty partner corresponds to the first clc item, the
+second vty partner to the second clc item, etc.
+
+A vty-server can only be connected to a single vty at a time. The entry,
+"current_vty" prints the clc of the currently selected partner vty when
+read.
+
+The current_vty can be changed by writing a valid partner clc to the entry
+as in the following example:
+
+ Pow5:/sys/bus/vio/drivers/hvcs/30000004 # echo U5112.428.10304
+ 8A-V4-C0 > current_vty
+
+Changing the current_vty when a vty-server is already connected to a vty
+does not affect the current connection. The change takes effect when the
+currently open connection is freed.
+
+Information on the "vterm_state" attribute was covered earlier on the
+chapter entitled "disconnection".
+
+---------------------------------------------------------------------------
+8. Questions & Answers:
+===========================================================================
+Q: What are the security concerns involving hvcs?
+
+A: There are three main security concerns:
+
+ 1. The creator of the /dev/hvcs* nodes has the ability to restrict
+ the access of the device entries to certain users or groups. It
+ may be best to create a special hvcs group privilege for providing
+ access to system consoles.
+
+ 2. To provide network security when grabbing the console it is
+ suggested that the user connect to the console hosting partition
+ using a secure method, such as SSH or sit at a hardware console.
+
+ 3. Make sure to exit the user session when done with a console or
+ the next vty-server connection (which may be from another
+ partition) will experience the previously logged in session.
+
+---------------------------------------------------------------------------
+Q: How do I multiplex a console that I grab through hvcs so that other
+people can see it:
+
+A: You can use "screen" to directly connect to the /dev/hvcs* device and
+setup a session on your machine with the console group privileges. As
+pointed out earlier by default screen doesn't provide the termcap settings
+for most terminal emulators to provide adequate character conversion from
+term type "screen" to others. This means that curses based programs may
+not display properly in screen sessions.
+
+---------------------------------------------------------------------------
+Q: Why are the colors all messed up?
+Q: Why are the control characters acting strange or not working?
+Q: Why is the console output all strange and unintelligible?
+
+A: Please see the preceding section on "Connection" for a discussion of how
+applications can affect the display of character control sequences.
+Additionally, just because you logged into the console using and xterm
+doesn't mean someone else didn't log into the console with the HMC console
+(vt320) before you and leave the session logged in. The best thing to do
+is to export TERM to the terminal type of your terminal emulator when you
+get the console. Additionally make sure to "exit" the console before you
+disconnect from the console. This will ensure that the next user gets
+their own TERM type set when they login.
+
+---------------------------------------------------------------------------
+Q: When I try to CONNECT kermit to an hvcs device I get:
+"Sorry, can't open connection: /dev/hvcs*"What is happening?
+
+A: Some other Power5 console mechanism has a connection to the vty and
+isn't giving it up. You can try to force disconnect the consoles from the
+HMC by right clicking on the partition and then selecting "close terminal".
+Otherwise you have to hunt down the people who have console authority. It
+is possible that you already have the console open using another kermit
+session and just forgot about it. Please review the console options for
+Power5 systems to determine the many ways a system console can be held.
+
+OR
+
+A: Another user may not have a connectivity method currently attached to a
+/dev/hvcs device but the vterm_state may reveal that they still have the
+vty-server connection established. They need to free this using the method
+outlined in the section on "Disconnection" in order for others to connect
+to the target vty.
+
+OR
+
+A: The user profile you are using to execute kermit probably doesn't have
+permissions to use the /dev/hvcs* device.
+
+OR
+
+A: You probably haven't inserted the hvcs.ko module yet but the /dev/hvcs*
+entry still exists (on systems without udev).
+
+OR
+
+A: There is not a corresponding vty-server device that maps to an existing
+/dev/hvcs* entry.
+
+---------------------------------------------------------------------------
+Q: When I try to CONNECT kermit to an hvcs device I get:
+"Sorry, write access to UUCP lockfile directory denied."
+
+A: The /dev/hvcs* entry you have specified doesn't exist where you said it
+does? Maybe you haven't inserted the module (on systems with udev).
+
+---------------------------------------------------------------------------
+Q: If I already have one Linux partition installed can I use hvcs on said
+partition to provide the console for the install of a second Linux
+partition?
+
+A: Yes granted that your are connected to the /dev/hvcs* device using
+kermit or cu or some other program that doesn't provide terminal emulation.
+
+---------------------------------------------------------------------------
+Q: Can I connect to more than one partition's console at a time using this
+driver?
+
+A: Yes. Of course this means that there must be more than one vty-server
+configured for this partition and each must point to a disconnected vty.
+
+---------------------------------------------------------------------------
+Q: Does the hvcs driver support dynamic (hotplug) addition of devices?
+
+A: Yes, if you have dlpar and hotplug enabled for your system and it has
+been built into the kernel the hvcs drivers is configured to dynamically
+handle additions of new devices and removals of unused devices.
+
+---------------------------------------------------------------------------
+Q: For some reason /dev/hvcs* doesn't map to the same vty-server adapter
+after a reboot. What happened?
+
+A: Assignment of vty-server adapters to /dev/hvcs* entries is always done
+in the order that the adapters are exposed. Due to hotplug capabilities of
+this driver assignment of hotplug added vty-servers may be in a different
+order than how they would be exposed on module load. Rebooting or
+reloading the module after dynamic addition may result in the /dev/hvcs*
+and vty-server coupling changing if a vty-server adapter was added in a
+slot inbetween two other vty-server adapters. Refer to the section above
+on how to determine which vty-server goes with which /dev/hvcs* node.
+Hint; look at the sysfs "index" attribute for the vty-server.
+
+---------------------------------------------------------------------------
+Q: Can I use /dev/hvcs* as a conduit to another partition and use a tty
+device on that partition as the other end of the pipe?
+
+A: Yes, on Power5 platforms the hvc_console driver provides a tty interface
+for extra /dev/hvc* devices (where /dev/hvc0 is most likely the console).
+In order to get a tty conduit working between the two partitions the HMC
+Super Admin must create an additional "serial server" for the target
+partition with the HMC gui which will show up as /dev/hvc* when the target
+partition is rebooted.
+
+The HMC Super Admin then creates an additional "serial client" for the
+current partition and points this at the target partition's newly created
+"serial server" adapter (remember the slot). This shows up as an
+additional /dev/hvcs* device.
+
+Now a program on the target system can be configured to read or write to
+/dev/hvc* and another program on the current partition can be configured to
+read or write to /dev/hvcs*. Now you have a tty conduit between two
+partitions.
+
+---------------------------------------------------------------------------
+9. Reporting Bugs:
+
+The proper channel for reporting bugs is either through the Linux OS
+distribution company that provided your OS or by posting issues to the
+PowerPC development mailing list at:
+
+linuxppc-dev@ozlabs.org
+
+This request is to provide a documented and searchable public exchange
+of the problems and solutions surrounding this driver for the benefit of
+all users.
diff --git a/Documentation/powerpc/kvm_440.txt b/Documentation/powerpc/kvm_440.txt
new file mode 100644
index 0000000..c02a003
--- /dev/null
+++ b/Documentation/powerpc/kvm_440.txt
@@ -0,0 +1,41 @@
+Hollis Blanchard <hollisb@us.ibm.com>
+15 Apr 2008
+
+Various notes on the implementation of KVM for PowerPC 440:
+
+To enforce isolation, host userspace, guest kernel, and guest userspace all
+run at user privilege level. Only the host kernel runs in supervisor mode.
+Executing privileged instructions in the guest traps into KVM (in the host
+kernel), where we decode and emulate them. Through this technique, unmodified
+440 Linux kernels can be run (slowly) as guests. Future performance work will
+focus on reducing the overhead and frequency of these traps.
+
+The usual code flow is started from userspace invoking an "run" ioctl, which
+causes KVM to switch into guest context. We use IVPR to hijack the host
+interrupt vectors while running the guest, which allows us to direct all
+interrupts to kvmppc_handle_interrupt(). At this point, we could either
+- handle the interrupt completely (e.g. emulate "mtspr SPRG0"), or
+- let the host interrupt handler run (e.g. when the decrementer fires), or
+- return to host userspace (e.g. when the guest performs device MMIO)
+
+Address spaces: We take advantage of the fact that Linux doesn't use the AS=1
+address space (in host or guest), which gives us virtual address space to use
+for guest mappings. While the guest is running, the host kernel remains mapped
+in AS=0, but the guest can only use AS=1 mappings.
+
+TLB entries: The TLB entries covering the host linear mapping remain
+present while running the guest. This reduces the overhead of lightweight
+exits, which are handled by KVM running in the host kernel. We keep three
+copies of the TLB:
+ - guest TLB: contents of the TLB as the guest sees it
+ - shadow TLB: the TLB that is actually in hardware while guest is running
+ - host TLB: to restore TLB state when context switching guest -> host
+When a TLB miss occurs because a mapping was not present in the shadow TLB,
+but was present in the guest TLB, KVM handles the fault without invoking the
+guest. Large guest pages are backed by multiple 4KB shadow pages through this
+mechanism.
+
+IO: MMIO and DCR accesses are emulated by userspace. We use virtio for network
+and block IO, so those drivers must be enabled in the guest. It's possible
+that some qemu device emulation (e.g. e1000 or rtl8139) may also work with
+little effort.
diff --git a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
new file mode 100644
index 0000000..6f12f1c
--- /dev/null
+++ b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
@@ -0,0 +1,277 @@
+MPC5200 Device Tree Bindings
+----------------------------
+
+(c) 2006-2007 Secret Lab Technologies Ltd
+Grant Likely <grant.likely at secretlab.ca>
+
+********** DRAFT ***********
+* WARNING: Do not depend on the stability of these bindings just yet.
+* The MPC5200 device tree conventions are still in flux
+* Keep an eye on the linuxppc-dev mailing list for more details
+********** DRAFT ***********
+
+I - Introduction
+================
+Boards supported by the arch/powerpc architecture require device tree be
+passed by the boot loader to the kernel at boot time. The device tree
+describes what devices are present on the board and how they are
+connected. The device tree can either be passed as a binary blob (as
+described in Documentation/powerpc/booting-without-of.txt), or passed
+by Open Firmware (IEEE 1275) compatible firmware using an OF compatible
+client interface API.
+
+This document specifies the requirements on the device-tree for mpc5200
+based boards. These requirements are above and beyond the details
+specified in either the Open Firmware spec or booting-without-of.txt
+
+All new mpc5200-based boards are expected to match this document. In
+cases where this document is not sufficient to support a new board port,
+this document should be updated as part of adding the new board support.
+
+II - Philosophy
+===============
+The core of this document is naming convention. The whole point of
+defining this convention is to reduce or eliminate the number of
+special cases required to support a 5200 board. If all 5200 boards
+follow the same convention, then generic 5200 support code will work
+rather than coding special cases for each new board.
+
+This section tries to capture the thought process behind why the naming
+convention is what it is.
+
+1. names
+---------
+There is strong convention/requirements already established for children
+of the root node. 'cpus' describes the processor cores, 'memory'
+describes memory, and 'chosen' provides boot configuration. Other nodes
+are added to describe devices attached to the processor local bus.
+
+Following convention already established with other system-on-chip
+processors, 5200 device trees should use the name 'soc5200' for the
+parent node of on chip devices, and the root node should be its parent.
+
+Child nodes are typically named after the configured function. ie.
+the FEC node is named 'ethernet', and a PSC in uart mode is named 'serial'.
+
+2. device_type property
+-----------------------
+similar to the node name convention above; the device_type reflects the
+configured function of a device. ie. 'serial' for a uart and 'spi' for
+an spi controller. However, while node names *should* reflect the
+configured function, device_type *must* match the configured function
+exactly.
+
+3. compatible property
+----------------------
+Since device_type isn't enough to match devices to drivers, there also
+needs to be a naming convention for the compatible property. Compatible
+is an list of device descriptions sorted from specific to generic. For
+the mpc5200, the required format for each compatible value is
+<chip>-<device>[-<mode>]. The OS should be able to match a device driver
+to the device based solely on the compatible value. If two drivers
+match on the compatible list; the 'most compatible' driver should be
+selected.
+
+The split between the MPC5200 and the MPC5200B leaves a bit of a
+conundrum. How should the compatible property be set up to provide
+maximum compatibility information; but still accurately describe the
+chip? For the MPC5200; the answer is easy. Most of the SoC devices
+originally appeared on the MPC5200. Since they didn't exist anywhere
+else; the 5200 compatible properties will contain only one item;
+"mpc5200-<device>".
+
+The 5200B is almost the same as the 5200, but not quite. It fixes
+silicon bugs and it adds a small number of enhancements. Most of the
+devices either provide exactly the same interface as on the 5200. A few
+devices have extra functions but still have a backwards compatible mode.
+To express this information as completely as possible, 5200B device trees
+should have two items in the compatible list;
+"mpc5200b-<device>\0mpc5200-<device>". It is *strongly* recommended
+that 5200B device trees follow this convention (instead of only listing
+the base mpc5200 item).
+
+If another chip appear on the market with one of the mpc5200 SoC
+devices, then the compatible list should include mpc5200-<device>.
+
+ie. ethernet on mpc5200: compatible = "mpc5200-ethernet"
+ ethernet on mpc5200b: compatible = "mpc5200b-ethernet\0mpc5200-ethernet"
+
+Modal devices, like PSCs, also append the configured function to the
+end of the compatible field. ie. A PSC in i2s mode would specify
+"mpc5200-psc-i2s", not "mpc5200-i2s". This convention is chosen to
+avoid naming conflicts with non-psc devices providing the same
+function. For example, "mpc5200-spi" and "mpc5200-psc-spi" describe
+the mpc5200 simple spi device and a PSC spi mode respectively.
+
+If the soc device is more generic and present on other SOCs, the
+compatible property can specify the more generic device type also.
+
+ie. mscan: compatible = "mpc5200-mscan\0fsl,mscan";
+
+At the time of writing, exact chip may be either 'mpc5200' or
+'mpc5200b'.
+
+Device drivers should always try to match as generically as possible.
+
+III - Structure
+===============
+The device tree for an mpc5200 board follows the structure defined in
+booting-without-of.txt with the following additional notes:
+
+0) the root node
+----------------
+Typical root description node; see booting-without-of
+
+1) The cpus node
+----------------
+The cpus node follows the basic layout described in booting-without-of.
+The bus-frequency property holds the XLB bus frequency
+The clock-frequency property holds the core frequency
+
+2) The memory node
+------------------
+Typical memory description node; see booting-without-of.
+
+3) The soc5200 node
+-------------------
+This node describes the on chip SOC peripherals. Every mpc5200 based
+board will have this node, and as such there is a common naming
+convention for SOC devices.
+
+Required properties:
+name type description
+---- ---- -----------
+device_type string must be "soc"
+ranges int should be <0 baseaddr baseaddr+10000>
+reg int must be <baseaddr 10000>
+compatible string mpc5200: "mpc5200-soc"
+ mpc5200b: "mpc5200b-soc\0mpc5200-soc"
+system-frequency int Fsystem frequency; source of all
+ other clocks.
+bus-frequency int IPB bus frequency in HZ. Clock rate
+ used by most of the soc devices.
+#interrupt-cells int must be <3>.
+
+Recommended properties:
+name type description
+---- ---- -----------
+model string Exact model of the chip;
+ ie: model="fsl,mpc5200"
+revision string Silicon revision of chip
+ ie: revision="M08A"
+
+The 'model' and 'revision' properties are *strongly* recommended. Having
+them presence acts as a bit of a safety net for working around as yet
+undiscovered bugs on one version of silicon. For example, device drivers
+can use the model and revision properties to decide if a bug fix should
+be turned on.
+
+4) soc5200 child nodes
+----------------------
+Any on chip SOC devices available to Linux must appear as soc5200 child nodes.
+
+Note: The tables below show the value for the mpc5200. A mpc5200b device
+tree should use the "mpc5200b-<device>\0mpc5200-<device> form.
+
+Required soc5200 child nodes:
+name device_type compatible Description
+---- ----------- ---------- -----------
+cdm@<addr> cdm mpc5200-cmd Clock Distribution
+pic@<addr> interrupt-controller mpc5200-pic need an interrupt
+ controller to boot
+bestcomm@<addr> dma-controller mpc5200-bestcomm 5200 pic also requires
+ the bestcomm device
+
+Recommended soc5200 child nodes; populate as needed for your board
+name device_type compatible Description
+---- ----------- ---------- -----------
+gpt@<addr> gpt fsl,mpc5200-gpt General purpose timers
+gpt@<addr> gpt fsl,mpc5200-gpt-gpio General purpose
+ timers in GPIO mode
+gpio@<addr> fsl,mpc5200-gpio MPC5200 simple gpio
+ controller
+gpio@<addr> fsl,mpc5200-gpio-wkup MPC5200 wakeup gpio
+ controller
+rtc@<addr> rtc mpc5200-rtc Real time clock
+mscan@<addr> mscan mpc5200-mscan CAN bus controller
+pci@<addr> pci mpc5200-pci PCI bridge
+serial@<addr> serial mpc5200-psc-uart PSC in serial mode
+i2s@<addr> sound mpc5200-psc-i2s PSC in i2s mode
+ac97@<addr> sound mpc5200-psc-ac97 PSC in ac97 mode
+spi@<addr> spi mpc5200-psc-spi PSC in spi mode
+irda@<addr> irda mpc5200-psc-irda PSC in IrDA mode
+spi@<addr> spi mpc5200-spi MPC5200 spi device
+ethernet@<addr> network mpc5200-fec MPC5200 ethernet device
+ata@<addr> ata mpc5200-ata IDE ATA interface
+i2c@<addr> i2c mpc5200-i2c I2C controller
+usb@<addr> usb-ohci-be mpc5200-ohci,ohci-be USB controller
+xlb@<addr> xlb mpc5200-xlb XLB arbitrator
+
+Important child node properties
+name type description
+---- ---- -----------
+cell-index int When multiple devices are present, is the
+ index of the device in the hardware (ie. There
+ are 6 PSC on the 5200 numbered PSC1 to PSC6)
+ PSC1 has 'cell-index = <0>'
+ PSC4 has 'cell-index = <3>'
+
+5) General Purpose Timer nodes (child of soc5200 node)
+On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board
+design supports the internal wdt, then the device node for GPT0 should
+include the empty property 'fsl,has-wdt'.
+
+6) PSC nodes (child of soc5200 node)
+PSC nodes can define the optional 'port-number' property to force assignment
+order of serial ports. For example, PSC5 might be physically connected to
+the port labeled 'COM1' and PSC1 wired to 'COM1'. In this case, PSC5 would
+have a "port-number = <0>" property, and PSC1 would have "port-number = <1>".
+
+PSC in i2s mode: The mpc5200 and mpc5200b PSCs are not compatible when in
+i2s mode. An 'mpc5200b-psc-i2s' node cannot include 'mpc5200-psc-i2s' in the
+compatible field.
+
+7) GPIO controller nodes
+Each GPIO controller node should have the empty property gpio-controller and
+#gpio-cells set to 2. First cell is the GPIO number which is interpreted
+according to the bit numbers in the GPIO control registers. The second cell
+is for flags which is currently unsused.
+
+8) FEC nodes
+The FEC node can specify one of the following properties to configure
+the MII link:
+"fsl,7-wire-mode" - An empty property that specifies the link uses 7-wire
+ mode instead of MII
+"current-speed" - Specifies that the MII should be configured for a fixed
+ speed. This property should contain two cells. The
+ first cell specifies the speed in Mbps and the second
+ should be '0' for half duplex and '1' for full duplex
+"phy-handle" - Contains a phandle to an Ethernet PHY.
+
+IV - Extra Notes
+================
+
+1. Interrupt mapping
+--------------------
+The mpc5200 pic driver splits hardware IRQ numbers into two levels. The
+split reflects the layout of the PIC hardware itself, which groups
+interrupts into one of three groups; CRIT, MAIN or PERP. Also, the
+Bestcomm dma engine has it's own set of interrupt sources which are
+cascaded off of peripheral interrupt 0, which the driver interprets as a
+fourth group, SDMA.
+
+The interrupts property for device nodes using the mpc5200 pic consists
+of three cells; <L1 L2 level>
+
+ L1 := [CRIT=0, MAIN=1, PERP=2, SDMA=3]
+ L2 := interrupt number; directly mapped from the value in the
+ "ICTL PerStat, MainStat, CritStat Encoded Register"
+ level := [LEVEL_HIGH=0, EDGE_RISING=1, EDGE_FALLING=2, LEVEL_LOW=3]
+
+2. Shared registers
+-------------------
+Some SoC devices share registers between them. ie. the i2c devices use
+a single clock control register, and almost all device are affected by
+the port_config register. Devices which need to manipulate shared regs
+should look to the parent SoC node. The soc node is responsible
+for arbitrating all shared register access.
diff --git a/Documentation/powerpc/mpc52xx.txt b/Documentation/powerpc/mpc52xx.txt
new file mode 100644
index 0000000..10dd4ab
--- /dev/null
+++ b/Documentation/powerpc/mpc52xx.txt
@@ -0,0 +1,39 @@
+Linux 2.6.x on MPC52xx family
+-----------------------------
+
+For the latest info, go to http://www.246tNt.com/mpc52xx/
+
+To compile/use :
+
+ - U-Boot:
+ # <edit Makefile to set ARCH=ppc & CROSS_COMPILE=... ( also EXTRAVERSION
+ if you wish to ).
+ # make lite5200_defconfig
+ # make uImage
+
+ then, on U-boot:
+ => tftpboot 200000 uImage
+ => tftpboot 400000 pRamdisk
+ => bootm 200000 400000
+
+ - DBug:
+ # <edit Makefile to set ARCH=ppc & CROSS_COMPILE=... ( also EXTRAVERSION
+ if you wish to ).
+ # make lite5200_defconfig
+ # cp your_initrd.gz arch/ppc/boot/images/ramdisk.image.gz
+ # make zImage.initrd
+ # make
+
+ then in DBug:
+ DBug> dn -i zImage.initrd.lite5200
+
+
+Some remarks :
+ - The port is named mpc52xxx, and config options are PPC_MPC52xx. The MGT5100
+ is not supported, and I'm not sure anyone is interesting in working on it
+ so. I didn't took 5xxx because there's apparently a lot of 5xxx that have
+ nothing to do with the MPC5200. I also included the 'MPC' for the same
+ reason.
+ - Of course, I inspired myself from the 2.4 port. If you think I forgot to
+ mention you/your company in the copyright of some code, I'll correct it
+ ASAP.
diff --git a/Documentation/powerpc/phyp-assisted-dump.txt b/Documentation/powerpc/phyp-assisted-dump.txt
new file mode 100644
index 0000000..c4682b9
--- /dev/null
+++ b/Documentation/powerpc/phyp-assisted-dump.txt
@@ -0,0 +1,127 @@
+
+ Hypervisor-Assisted Dump
+ ------------------------
+ November 2007
+
+The goal of hypervisor-assisted dump is to enable the dump of
+a crashed system, and to do so from a fully-reset system, and
+to minimize the total elapsed time until the system is back
+in production use.
+
+As compared to kdump or other strategies, hypervisor-assisted
+dump offers several strong, practical advantages:
+
+-- Unlike kdump, the system has been reset, and loaded
+ with a fresh copy of the kernel. In particular,
+ PCI and I/O devices have been reinitialized and are
+ in a clean, consistent state.
+-- As the dump is performed, the dumped memory becomes
+ immediately available to the system for normal use.
+-- After the dump is completed, no further reboots are
+ required; the system will be fully usable, and running
+ in it's normal, production mode on it normal kernel.
+
+The above can only be accomplished by coordination with,
+and assistance from the hypervisor. The procedure is
+as follows:
+
+-- When a system crashes, the hypervisor will save
+ the low 256MB of RAM to a previously registered
+ save region. It will also save system state, system
+ registers, and hardware PTE's.
+
+-- After the low 256MB area has been saved, the
+ hypervisor will reset PCI and other hardware state.
+ It will *not* clear RAM. It will then launch the
+ bootloader, as normal.
+
+-- The freshly booted kernel will notice that there
+ is a new node (ibm,dump-kernel) in the device tree,
+ indicating that there is crash data available from
+ a previous boot. It will boot into only 256MB of RAM,
+ reserving the rest of system memory.
+
+-- Userspace tools will parse /sys/kernel/release_region
+ and read /proc/vmcore to obtain the contents of memory,
+ which holds the previous crashed kernel. The userspace
+ tools may copy this info to disk, or network, nas, san,
+ iscsi, etc. as desired.
+
+ For Example: the values in /sys/kernel/release-region
+ would look something like this (address-range pairs).
+ CPU:0x177fee000-0x10000: HPTE:0x177ffe020-0x1000: /
+ DUMP:0x177fff020-0x10000000, 0x10000000-0x16F1D370A
+
+-- As the userspace tools complete saving a portion of
+ dump, they echo an offset and size to
+ /sys/kernel/release_region to release the reserved
+ memory back to general use.
+
+ An example of this is:
+ "echo 0x40000000 0x10000000 > /sys/kernel/release_region"
+ which will release 256MB at the 1GB boundary.
+
+Please note that the hypervisor-assisted dump feature
+is only available on Power6-based systems with recent
+firmware versions.
+
+Implementation details:
+----------------------
+
+During boot, a check is made to see if firmware supports
+this feature on this particular machine. If it does, then
+we check to see if a active dump is waiting for us. If yes
+then everything but 256 MB of RAM is reserved during early
+boot. This area is released once we collect a dump from user
+land scripts that are run. If there is dump data, then
+the /sys/kernel/release_region file is created, and
+the reserved memory is held.
+
+If there is no waiting dump data, then only the highest
+256MB of the ram is reserved as a scratch area. This area
+is *not* released: this region will be kept permanently
+reserved, so that it can act as a receptacle for a copy
+of the low 256MB in the case a crash does occur. See,
+however, "open issues" below, as to whether
+such a reserved region is really needed.
+
+Currently the dump will be copied from /proc/vmcore to a
+a new file upon user intervention. The starting address
+to be read and the range for each data point in provided
+in /sys/kernel/release_region.
+
+The tools to examine the dump will be same as the ones
+used for kdump.
+
+General notes:
+--------------
+Security: please note that there are potential security issues
+with any sort of dump mechanism. In particular, plaintext
+(unencrypted) data, and possibly passwords, may be present in
+the dump data. Userspace tools must take adequate precautions to
+preserve security.
+
+Open issues/ToDo:
+------------
+ o The various code paths that tell the hypervisor that a crash
+ occurred, vs. it simply being a normal reboot, should be
+ reviewed, and possibly clarified/fixed.
+
+ o Instead of using /sys/kernel, should there be a /sys/dump
+ instead? There is a dump_subsys being created by the s390 code,
+ perhaps the pseries code should use a similar layout as well.
+
+ o Is reserving a 256MB region really required? The goal of
+ reserving a 256MB scratch area is to make sure that no
+ important crash data is clobbered when the hypervisor
+ save low mem to the scratch area. But, if one could assure
+ that nothing important is located in some 256MB area, then
+ it would not need to be reserved. Something that can be
+ improved in subsequent versions.
+
+ o Still working the kdump team to integrate this with kdump,
+ some work remains but this would not affect the current
+ patches.
+
+ o Still need to write a shell script, to copy the dump away.
+ Currently I am parsing it manually.
diff --git a/Documentation/powerpc/qe_firmware.txt b/Documentation/powerpc/qe_firmware.txt
new file mode 100644
index 0000000..06da4d4
--- /dev/null
+++ b/Documentation/powerpc/qe_firmware.txt
@@ -0,0 +1,295 @@
+ Freescale QUICC Engine Firmware Uploading
+ -----------------------------------------
+
+(c) 2007 Timur Tabi <timur at freescale.com>,
+ Freescale Semiconductor
+
+Table of Contents
+=================
+
+ I - Software License for Firmware
+
+ II - Microcode Availability
+
+ III - Description and Terminology
+
+ IV - Microcode Programming Details
+
+ V - Firmware Structure Layout
+
+ VI - Sample Code for Creating Firmware Files
+
+Revision Information
+====================
+
+November 30, 2007: Rev 1.0 - Initial version
+
+I - Software License for Firmware
+=================================
+
+Each firmware file comes with its own software license. For information on
+the particular license, please see the license text that is distributed with
+the firmware.
+
+II - Microcode Availability
+===========================
+
+Firmware files are distributed through various channels. Some are available on
+http://opensource.freescale.com. For other firmware files, please contact
+your Freescale representative or your operating system vendor.
+
+III - Description and Terminology
+================================
+
+In this document, the term 'microcode' refers to the sequence of 32-bit
+integers that compose the actual QE microcode.
+
+The term 'firmware' refers to a binary blob that contains the microcode as
+well as other data that
+
+ 1) describes the microcode's purpose
+ 2) describes how and where to upload the microcode
+ 3) specifies the values of various registers
+ 4) includes additional data for use by specific device drivers
+
+Firmware files are binary files that contain only a firmware.
+
+IV - Microcode Programming Details
+===================================
+
+The QE architecture allows for only one microcode present in I-RAM for each
+RISC processor. To replace any current microcode, a full QE reset (which
+disables the microcode) must be performed first.
+
+QE microcode is uploaded using the following procedure:
+
+1) The microcode is placed into I-RAM at a specific location, using the
+ IRAM.IADD and IRAM.IDATA registers.
+
+2) The CERCR.CIR bit is set to 0 or 1, depending on whether the firmware
+ needs split I-RAM. Split I-RAM is only meaningful for SOCs that have
+ QEs with multiple RISC processors, such as the 8360. Splitting the I-RAM
+ allows each processor to run a different microcode, effectively creating an
+ asymmetric multiprocessing (AMP) system.
+
+3) The TIBCR trap registers are loaded with the addresses of the trap handlers
+ in the microcode.
+
+4) The RSP.ECCR register is programmed with the value provided.
+
+5) If necessary, device drivers that need the virtual traps and extended mode
+ data will use them.
+
+Virtual Microcode Traps
+
+These virtual traps are conditional branches in the microcode. These are
+"soft" provisional introduced in the ROMcode in order to enable higher
+flexibility and save h/w traps If new features are activated or an issue is
+being fixed in the RAM package utilizing they should be activated. This data
+structure signals the microcode which of these virtual traps is active.
+
+This structure contains 6 words that the application should copy to some
+specific been defined. This table describes the structure.
+
+ ---------------------------------------------------------------
+ | Offset in | | Destination Offset | Size of |
+ | array | Protocol | within PRAM | Operand |
+ --------------------------------------------------------------|
+ | 0 | Ethernet | 0xF8 | 4 bytes |
+ | | interworking | | |
+ ---------------------------------------------------------------
+ | 4 | ATM | 0xF8 | 4 bytes |
+ | | interworking | | |
+ ---------------------------------------------------------------
+ | 8 | PPP | 0xF8 | 4 bytes |
+ | | interworking | | |
+ ---------------------------------------------------------------
+ | 12 | Ethernet RX | 0x22 | 1 byte |
+ | | Distributor Page | | |
+ ---------------------------------------------------------------
+ | 16 | ATM Globtal | 0x28 | 1 byte |
+ | | Params Table | | |
+ ---------------------------------------------------------------
+ | 20 | Insert Frame | 0xF8 | 4 bytes |
+ ---------------------------------------------------------------
+
+
+Extended Modes
+
+This is a double word bit array (64 bits) that defines special functionality
+which has an impact on the softwarew drivers. Each bit has its own impact
+and has special instructions for the s/w associated with it. This structure is
+described in this table:
+
+ -----------------------------------------------------------------------
+ | Bit # | Name | Description |
+ -----------------------------------------------------------------------
+ | 0 | General | Indicates that prior to each host command |
+ | | push command | given by the application, the software must |
+ | | | assert a special host command (push command)|
+ | | | CECDR = 0x00800000. |
+ | | | CECR = 0x01c1000f. |
+ -----------------------------------------------------------------------
+ | 1 | UCC ATM | Indicates that after issuing ATM RX INIT |
+ | | RX INIT | command, the host must issue another special|
+ | | push command | command (push command) and immediately |
+ | | | following that re-issue the ATM RX INIT |
+ | | | command. (This makes the sequence of |
+ | | | initializing the ATM receiver a sequence of |
+ | | | three host commands) |
+ | | | CECDR = 0x00800000. |
+ | | | CECR = 0x01c1000f. |
+ -----------------------------------------------------------------------
+ | 2 | Add/remove | Indicates that following the specific host |
+ | | command | command: "Add/Remove entry in Hash Lookup |
+ | | validation | Table" used in Interworking setup, the user |
+ | | | must issue another command. |
+ | | | CECDR = 0xce000003. |
+ | | | CECR = 0x01c10f58. |
+ -----------------------------------------------------------------------
+ | 3 | General push | Indicates that the s/w has to initialize |
+ | | command | some pointers in the Ethernet thread pages |
+ | | | which are used when Header Compression is |
+ | | | activated. The full details of these |
+ | | | pointers is located in the software drivers.|
+ -----------------------------------------------------------------------
+ | 4 | General push | Indicates that after issuing Ethernet TX |
+ | | command | INIT command, user must issue this command |
+ | | | for each SNUM of Ethernet TX thread. |
+ | | | CECDR = 0x00800003. |
+ | | | CECR = 0x7'b{0}, 8'b{Enet TX thread SNUM}, |
+ | | | 1'b{1}, 12'b{0}, 4'b{1} |
+ -----------------------------------------------------------------------
+ | 5 - 31 | N/A | Reserved, set to zero. |
+ -----------------------------------------------------------------------
+
+V - Firmware Structure Layout
+==============================
+
+QE microcode from Freescale is typically provided as a header file. This
+header file contains macros that define the microcode binary itself as well as
+some other data used in uploading that microcode. The format of these files
+do not lend themselves to simple inclusion into other code. Hence,
+the need for a more portable format. This section defines that format.
+
+Instead of distributing a header file, the microcode and related data are
+embedded into a binary blob. This blob is passed to the qe_upload_firmware()
+function, which parses the blob and performs everything necessary to upload
+the microcode.
+
+All integers are big-endian. See the comments for function
+qe_upload_firmware() for up-to-date implementation information.
+
+This structure supports versioning, where the version of the structure is
+embedded into the structure itself. To ensure forward and backwards
+compatibility, all versions of the structure must use the same 'qe_header'
+structure at the beginning.
+
+'header' (type: struct qe_header):
+ The 'length' field is the size, in bytes, of the entire structure,
+ including all the microcode embedded in it, as well as the CRC (if
+ present).
+
+ The 'magic' field is an array of three bytes that contains the letters
+ 'Q', 'E', and 'F'. This is an identifier that indicates that this
+ structure is a QE Firmware structure.
+
+ The 'version' field is a single byte that indicates the version of this
+ structure. If the layout of the structure should ever need to be
+ changed to add support for additional types of microcode, then the
+ version number should also be changed.
+
+The 'id' field is a null-terminated string(suitable for printing) that
+identifies the firmware.
+
+The 'count' field indicates the number of 'microcode' structures. There
+must be one and only one 'microcode' structure for each RISC processor.
+Therefore, this field also represents the number of RISC processors for this
+SOC.
+
+The 'soc' structure contains the SOC numbers and revisions used to match
+the microcode to the SOC itself. Normally, the microcode loader should
+check the data in this structure with the SOC number and revisions, and
+only upload the microcode if there's a match. However, this check is not
+made on all platforms.
+
+Although it is not recommended, you can specify '0' in the soc.model
+field to skip matching SOCs altogether.
+
+The 'model' field is a 16-bit number that matches the actual SOC. The
+'major' and 'minor' fields are the major and minor revision numbers,
+respectively, of the SOC.
+
+For example, to match the 8323, revision 1.0:
+ soc.model = 8323
+ soc.major = 1
+ soc.minor = 0
+
+'padding' is neccessary for structure alignment. This field ensures that the
+'extended_modes' field is aligned on a 64-bit boundary.
+
+'extended_modes' is a bitfield that defines special functionality which has an
+impact on the device drivers. Each bit has its own impact and has special
+instructions for the driver associated with it. This field is stored in
+the QE library and available to any driver that calles qe_get_firmware_info().
+
+'vtraps' is an array of 8 words that contain virtual trap values for each
+virtual traps. As with 'extended_modes', this field is stored in the QE
+library and available to any driver that calles qe_get_firmware_info().
+
+'microcode' (type: struct qe_microcode):
+ For each RISC processor there is one 'microcode' structure. The first
+ 'microcode' structure is for the first RISC, and so on.
+
+ The 'id' field is a null-terminated string suitable for printing that
+ identifies this particular microcode.
+
+ 'traps' is an array of 16 words that contain hardware trap values
+ for each of the 16 traps. If trap[i] is 0, then this particular
+ trap is to be ignored (i.e. not written to TIBCR[i]). The entire value
+ is written as-is to the TIBCR[i] register, so be sure to set the EN
+ and T_IBP bits if necessary.
+
+ 'eccr' is the value to program into the ECCR register.
+
+ 'iram_offset' is the offset into IRAM to start writing the
+ microcode.
+
+ 'count' is the number of 32-bit words in the microcode.
+
+ 'code_offset' is the offset, in bytes, from the beginning of this
+ structure where the microcode itself can be found. The first
+ microcode binary should be located immediately after the 'microcode'
+ array.
+
+ 'major', 'minor', and 'revision' are the major, minor, and revision
+ version numbers, respectively, of the microcode. If all values are 0,
+ then these fields are ignored.
+
+ 'reserved' is necessary for structure alignment. Since 'microcode'
+ is an array, the 64-bit 'extended_modes' field needs to be aligned
+ on a 64-bit boundary, and this can only happen if the size of
+ 'microcode' is a multiple of 8 bytes. To ensure that, we add
+ 'reserved'.
+
+After the last microcode is a 32-bit CRC. It can be calculated using
+this algorithm:
+
+u32 crc32(const u8 *p, unsigned int len)
+{
+ unsigned int i;
+ u32 crc = 0;
+
+ while (len--) {
+ crc ^= *p++;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
+ }
+ return crc;
+}
+
+VI - Sample Code for Creating Firmware Files
+============================================
+
+A Python program that creates firmware binaries from the header files normally
+distributed by Freescale can be found on http://opensource.freescale.com.
diff --git a/Documentation/powerpc/sound.txt b/Documentation/powerpc/sound.txt
new file mode 100644
index 0000000..df23d95
--- /dev/null
+++ b/Documentation/powerpc/sound.txt
@@ -0,0 +1,81 @@
+ Information about PowerPC Sound support
+=====================================================================
+
+Please mail me (Cort Dougan, cort@fsmlabs.com) if you have questions,
+comments or corrections.
+
+Last Change: 6.16.99
+
+This just covers sound on the PReP and CHRP systems for now and later
+will contain information on the PowerMac's.
+
+Sound on PReP has been tested and is working with the PowerStack and IBM
+Power Series onboard sound systems which are based on the cs4231(2) chip.
+The sound options when doing the make config are a bit different from
+the default, though.
+
+The I/O base, irq and dma lines that you enter during the make config
+are ignored and are set when booting according to the machine type.
+This is so that one binary can be used for Motorola and IBM machines
+which use different values and isn't allowed by the driver, so things
+are hacked together in such a way as to allow this information to be
+set automatically on boot.
+
+1. Motorola PowerStack PReP machines
+
+ Enable support for "Crystal CS4232 based (PnP) cards" and for the
+ Microsoft Sound System. The MSS isn't used, but some of the routines
+ that the CS4232 driver uses are in it.
+
+ Although the options you set are ignored and determined automatically
+ on boot these are included for information only:
+
+ (830) CS4232 audio I/O base 530, 604, E80 or F40
+ (10) CS4232 audio IRQ 5, 7, 9, 11, 12 or 15
+ (6) CS4232 audio DMA 0, 1 or 3
+ (7) CS4232 second (duplex) DMA 0, 1 or 3
+
+ This will allow simultaneous record and playback, as 2 different dma
+ channels are used.
+
+ The sound will be all left channel and very low volume since the
+ auxiliary input isn't muted by default. I had the changes necessary
+ for this in the kernel but the sound driver maintainer didn't want
+ to include them since it wasn't common in other machines. To fix this
+ you need to mute it using a mixer utility of some sort (if you find one
+ please let me know) or by patching the driver yourself and recompiling.
+
+ There is a problem on the PowerStack 2's (PowerStack Pro's) using a
+ different irq/drq than the kernel expects. Unfortunately, I don't know
+ which irq/drq it is so if anyone knows please email me.
+
+ Midi is not supported since the cs4232 driver doesn't support midi yet.
+
+2. IBM PowerPersonal PReP machines
+
+ I've only tested sound on the Power Personal Series of IBM workstations
+ so if you try it on others please let me know the result. I'm especially
+ interested in the 43p's sound system, which I know nothing about.
+
+ Enable support for "Crystal CS4232 based (PnP) cards" and for the
+ Microsoft Sound System. The MSS isn't used, but some of the routines
+ that the CS4232 driver uses are in it.
+
+ Although the options you set are ignored and determined automatically
+ on boot these are included for information only:
+
+ (530) CS4232 audio I/O base 530, 604, E80 or F40
+ (5) CS4232 audio IRQ 5, 7, 9, 11, 12 or 15
+ (1) CS4232 audio DMA 0, 1 or 3
+ (7) CS4232 second (duplex) DMA 0, 1 or 3
+ (330) CS4232 MIDI I/O base 330, 370, 3B0 or 3F0
+ (9) CS4232 MIDI IRQ 5, 7, 9, 11, 12 or 15
+
+ This setup does _NOT_ allow for recording yet.
+
+ Midi is not supported since the cs4232 driver doesn't support midi yet.
+
+2. IBM CHRP
+
+ I have only tested this on the 43P-150. Build the kernel with the cs4232
+ set as a module and load the module with irq=9 dma=1 dma2=2 io=0x550
diff --git a/Documentation/powerpc/zImage_layout.txt b/Documentation/powerpc/zImage_layout.txt
new file mode 100644
index 0000000..048e015
--- /dev/null
+++ b/Documentation/powerpc/zImage_layout.txt
@@ -0,0 +1,47 @@
+ Information about the Linux/PPC kernel images
+=====================================================================
+
+Please mail me (Cort Dougan, cort@fsmlabs.com) if you have questions,
+comments or corrections.
+
+This document is meant to answer several questions I've had about how
+the PReP system boots and how Linux/PPC interacts with that mechanism.
+It would be nice if we could have information on how other architectures
+boot here as well. If you have anything to contribute, please
+let me know.
+
+
+1. PReP boot file
+
+ This is the file necessary to boot PReP systems from floppy or
+ hard drive. The firmware reads the PReP partition table entry
+ and will load the image accordingly.
+
+ To boot the zImage, copy it onto a floppy with dd if=zImage of=/dev/fd0h1440
+ or onto a PReP hard drive partition with dd if=zImage of=/dev/sda4
+ assuming you've created a PReP partition (type 0x41) with fdisk on
+ /dev/sda4.
+
+ The layout of the image format is:
+
+ 0x0 +------------+
+ | | PReP partition table entry
+ | |
+ 0x400 +------------+
+ | | Bootstrap program code + data
+ | |
+ | |
+ +------------+
+ | | compressed kernel, elf header removed
+ +------------+
+ | | initrd (if loaded)
+ +------------+
+ | | Elf section table for bootstrap program
+ +------------+
+
+
+2. MBX boot file
+
+ The MBX boards can load an elf image, and relocate it to the
+ proper location in memory - it copies the image to the location it was
+ linked at.
diff --git a/Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c b/Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c
new file mode 100644
index 0000000..f8e8e95
--- /dev/null
+++ b/Documentation/prctl/disable-tsc-ctxt-sw-stress-test.c
@@ -0,0 +1,96 @@
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Tests if the control register is updated correctly
+ * at context switches
+ *
+ * Warning: this test will cause a very high load for a few seconds
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <wait.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+uint64_t rdtsc() {
+uint32_t lo, hi;
+/* We cannot use "=A", since this would use %rax on x86_64 */
+__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+return (uint64_t)hi << 32 | lo;
+}
+
+void sigsegv_expect(int sig)
+{
+ /* */
+}
+
+void segvtask(void)
+{
+ if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) < 0)
+ {
+ perror("prctl");
+ exit(0);
+ }
+ signal(SIGSEGV, sigsegv_expect);
+ alarm(10);
+ rdtsc();
+ fprintf(stderr, "FATAL ERROR, rdtsc() succeeded while disabled\n");
+ exit(0);
+}
+
+
+void sigsegv_fail(int sig)
+{
+ fprintf(stderr, "FATAL ERROR, rdtsc() failed while enabled\n");
+ exit(0);
+}
+
+void rdtsctask(void)
+{
+ if (prctl(PR_SET_TSC, PR_TSC_ENABLE) < 0)
+ {
+ perror("prctl");
+ exit(0);
+ }
+ signal(SIGSEGV, sigsegv_fail);
+ alarm(10);
+ for(;;) rdtsc();
+}
+
+
+int main(int argc, char **argv)
+{
+ int n_tasks = 100, i;
+
+ fprintf(stderr, "[No further output means we're allright]\n");
+
+ for (i=0; i<n_tasks; i++)
+ if (fork() == 0)
+ {
+ if (i & 1)
+ segvtask();
+ else
+ rdtsctask();
+ }
+
+ for (i=0; i<n_tasks; i++)
+ wait(NULL);
+
+ exit(0);
+}
+
diff --git a/Documentation/prctl/disable-tsc-on-off-stress-test.c b/Documentation/prctl/disable-tsc-on-off-stress-test.c
new file mode 100644
index 0000000..1fcd914
--- /dev/null
+++ b/Documentation/prctl/disable-tsc-on-off-stress-test.c
@@ -0,0 +1,95 @@
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Tests if the control register is updated correctly
+ * when set with prctl()
+ *
+ * Warning: this test will cause a very high load for a few seconds
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <wait.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+/* snippet from wikipedia :-) */
+
+uint64_t rdtsc() {
+uint32_t lo, hi;
+/* We cannot use "=A", since this would use %rax on x86_64 */
+__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+return (uint64_t)hi << 32 | lo;
+}
+
+int should_segv = 0;
+
+void sigsegv_cb(int sig)
+{
+ if (!should_segv)
+ {
+ fprintf(stderr, "FATAL ERROR, rdtsc() failed while enabled\n");
+ exit(0);
+ }
+ if (prctl(PR_SET_TSC, PR_TSC_ENABLE) < 0)
+ {
+ perror("prctl");
+ exit(0);
+ }
+ should_segv = 0;
+
+ rdtsc();
+}
+
+void task(void)
+{
+ signal(SIGSEGV, sigsegv_cb);
+ alarm(10);
+ for(;;)
+ {
+ rdtsc();
+ if (should_segv)
+ {
+ fprintf(stderr, "FATAL ERROR, rdtsc() succeeded while disabled\n");
+ exit(0);
+ }
+ if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) < 0)
+ {
+ perror("prctl");
+ exit(0);
+ }
+ should_segv = 1;
+ }
+}
+
+
+int main(int argc, char **argv)
+{
+ int n_tasks = 100, i;
+
+ fprintf(stderr, "[No further output means we're allright]\n");
+
+ for (i=0; i<n_tasks; i++)
+ if (fork() == 0)
+ task();
+
+ for (i=0; i<n_tasks; i++)
+ wait(NULL);
+
+ exit(0);
+}
+
diff --git a/Documentation/prctl/disable-tsc-test.c b/Documentation/prctl/disable-tsc-test.c
new file mode 100644
index 0000000..843c81e
--- /dev/null
+++ b/Documentation/prctl/disable-tsc-test.c
@@ -0,0 +1,94 @@
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Basic test to test behaviour of PR_GET_TSC and PR_SET_TSC
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+const char *tsc_names[] =
+{
+ [0] = "[not set]",
+ [PR_TSC_ENABLE] = "PR_TSC_ENABLE",
+ [PR_TSC_SIGSEGV] = "PR_TSC_SIGSEGV",
+};
+
+uint64_t rdtsc() {
+uint32_t lo, hi;
+/* We cannot use "=A", since this would use %rax on x86_64 */
+__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+return (uint64_t)hi << 32 | lo;
+}
+
+void sigsegv_cb(int sig)
+{
+ int tsc_val = 0;
+
+ printf("[ SIG_SEGV ]\n");
+ printf("prctl(PR_GET_TSC, &tsc_val); ");
+ fflush(stdout);
+
+ if ( prctl(PR_GET_TSC, &tsc_val) == -1)
+ perror("prctl");
+
+ printf("tsc_val == %s\n", tsc_names[tsc_val]);
+ printf("prctl(PR_SET_TSC, PR_TSC_ENABLE)\n");
+ fflush(stdout);
+ if ( prctl(PR_SET_TSC, PR_TSC_ENABLE) == -1)
+ perror("prctl");
+
+ printf("rdtsc() == ");
+}
+
+int main(int argc, char **argv)
+{
+ int tsc_val = 0;
+
+ signal(SIGSEGV, sigsegv_cb);
+
+ printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
+ printf("prctl(PR_GET_TSC, &tsc_val); ");
+ fflush(stdout);
+
+ if ( prctl(PR_GET_TSC, &tsc_val) == -1)
+ perror("prctl");
+
+ printf("tsc_val == %s\n", tsc_names[tsc_val]);
+ printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
+ printf("prctl(PR_SET_TSC, PR_TSC_ENABLE)\n");
+ fflush(stdout);
+
+ if ( prctl(PR_SET_TSC, PR_TSC_ENABLE) == -1)
+ perror("prctl");
+
+ printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
+ printf("prctl(PR_SET_TSC, PR_TSC_SIGSEGV)\n");
+ fflush(stdout);
+
+ if ( prctl(PR_SET_TSC, PR_TSC_SIGSEGV) == -1)
+ perror("prctl");
+
+ printf("rdtsc() == ");
+ fflush(stdout);
+ printf("%llu\n", (unsigned long long)rdtsc());
+ fflush(stdout);
+
+ exit(EXIT_SUCCESS);
+}
+
diff --git a/Documentation/preempt-locking.txt b/Documentation/preempt-locking.txt
new file mode 100644
index 0000000..57883ca
--- /dev/null
+++ b/Documentation/preempt-locking.txt
@@ -0,0 +1,135 @@
+ Proper Locking Under a Preemptible Kernel:
+ Keeping Kernel Code Preempt-Safe
+ Robert Love <rml@tech9.net>
+ Last Updated: 28 Aug 2002
+
+
+INTRODUCTION
+
+
+A preemptible kernel creates new locking issues. The issues are the same as
+those under SMP: concurrency and reentrancy. Thankfully, the Linux preemptible
+kernel model leverages existing SMP locking mechanisms. Thus, the kernel
+requires explicit additional locking for very few additional situations.
+
+This document is for all kernel hackers. Developing code in the kernel
+requires protecting these situations.
+
+
+RULE #1: Per-CPU data structures need explicit protection
+
+
+Two similar problems arise. An example code snippet:
+
+ struct this_needs_locking tux[NR_CPUS];
+ tux[smp_processor_id()] = some_value;
+ /* task is preempted here... */
+ something = tux[smp_processor_id()];
+
+First, since the data is per-CPU, it may not have explicit SMP locking, but
+require it otherwise. Second, when a preempted task is finally rescheduled,
+the previous value of smp_processor_id may not equal the current. You must
+protect these situations by disabling preemption around them.
+
+You can also use put_cpu() and get_cpu(), which will disable preemption.
+
+
+RULE #2: CPU state must be protected.
+
+
+Under preemption, the state of the CPU must be protected. This is arch-
+dependent, but includes CPU structures and state not preserved over a context
+switch. For example, on x86, entering and exiting FPU mode is now a critical
+section that must occur while preemption is disabled. Think what would happen
+if the kernel is executing a floating-point instruction and is then preempted.
+Remember, the kernel does not save FPU state except for user tasks. Therefore,
+upon preemption, the FPU registers will be sold to the lowest bidder. Thus,
+preemption must be disabled around such regions.
+
+Note, some FPU functions are already explicitly preempt safe. For example,
+kernel_fpu_begin and kernel_fpu_end will disable and enable preemption.
+However, math_state_restore must be called with preemption disabled.
+
+
+RULE #3: Lock acquire and release must be performed by same task
+
+
+A lock acquired in one task must be released by the same task. This
+means you can't do oddball things like acquire a lock and go off to
+play while another task releases it. If you want to do something
+like this, acquire and release the task in the same code path and
+have the caller wait on an event by the other task.
+
+
+SOLUTION
+
+
+Data protection under preemption is achieved by disabling preemption for the
+duration of the critical region.
+
+preempt_enable() decrement the preempt counter
+preempt_disable() increment the preempt counter
+preempt_enable_no_resched() decrement, but do not immediately preempt
+preempt_check_resched() if needed, reschedule
+preempt_count() return the preempt counter
+
+The functions are nestable. In other words, you can call preempt_disable
+n-times in a code path, and preemption will not be reenabled until the n-th
+call to preempt_enable. The preempt statements define to nothing if
+preemption is not enabled.
+
+Note that you do not need to explicitly prevent preemption if you are holding
+any locks or interrupts are disabled, since preemption is implicitly disabled
+in those cases.
+
+But keep in mind that 'irqs disabled' is a fundamentally unsafe way of
+disabling preemption - any spin_unlock() decreasing the preemption count
+to 0 might trigger a reschedule. A simple printk() might trigger a reschedule.
+So use this implicit preemption-disabling property only if you know that the
+affected codepath does not do any of this. Best policy is to use this only for
+small, atomic code that you wrote and which calls no complex functions.
+
+Example:
+
+ cpucache_t *cc; /* this is per-CPU */
+ preempt_disable();
+ cc = cc_data(searchp);
+ if (cc && cc->avail) {
+ __free_block(searchp, cc_entry(cc), cc->avail);
+ cc->avail = 0;
+ }
+ preempt_enable();
+ return 0;
+
+Notice how the preemption statements must encompass every reference of the
+critical variables. Another example:
+
+ int buf[NR_CPUS];
+ set_cpu_val(buf);
+ if (buf[smp_processor_id()] == -1) printf(KERN_INFO "wee!\n");
+ spin_lock(&buf_lock);
+ /* ... */
+
+This code is not preempt-safe, but see how easily we can fix it by simply
+moving the spin_lock up two lines.
+
+
+PREVENTING PREEMPTION USING INTERRUPT DISABLING
+
+
+It is possible to prevent a preemption event using local_irq_disable and
+local_irq_save. Note, when doing so, you must be very careful to not cause
+an event that would set need_resched and result in a preemption check. When
+in doubt, rely on locking or explicit preemption disabling.
+
+Note in 2.5 interrupt disabling is now only per-CPU (e.g. local).
+
+An additional concern is proper usage of local_irq_disable and local_irq_save.
+These may be used to protect from preemption, however, on exit, if preemption
+may be enabled, a test to see if preemption is required should be done. If
+these are called from the spin_lock and read/write lock macros, the right thing
+is done. They may also be called within a spin-lock protected region, however,
+if they are ever called outside of this context, a test for preemption should
+be made. Do note that calls from interrupt context or bottom half/ tasklets
+are also protected by preemption locks and so may use the versions which do
+not check preemption.
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
new file mode 100644
index 0000000..1b5a5dd
--- /dev/null
+++ b/Documentation/printk-formats.txt
@@ -0,0 +1,35 @@
+If variable is of Type, use printk format specifier:
+---------------------------------------------------------
+ int %d or %x
+ unsigned int %u or %x
+ long %ld or %lx
+ unsigned long %lu or %lx
+ long long %lld or %llx
+ unsigned long long %llu or %llx
+ size_t %zu or %zx
+ ssize_t %zd or %zx
+
+Raw pointer value SHOULD be printed with %p.
+
+u64 SHOULD be printed with %llu/%llx, (unsigned long long):
+
+ printk("%llu", (unsigned long long)u64_var);
+
+s64 SHOULD be printed with %lld/%llx, (long long):
+
+ printk("%lld", (long long)s64_var);
+
+If <type> is dependent on a config option for its size (e.g., sector_t,
+blkcnt_t, phys_addr_t, resource_size_t) or is architecture-dependent
+for its size (e.g., tcflag_t), use a format specifier of its largest
+possible type and explicitly cast to it. Example:
+
+ printk("test: sector number/total blocks: %llu/%llu\n",
+ (unsigned long long)sector, (unsigned long long)blockcount);
+
+Reminder: sizeof() result is of type size_t.
+
+Thank you for your cooperation and attention.
+
+
+By Randy Dunlap <rdunlap@xenotime.net>
diff --git a/Documentation/prio_tree.txt b/Documentation/prio_tree.txt
new file mode 100644
index 0000000..3aa68f9
--- /dev/null
+++ b/Documentation/prio_tree.txt
@@ -0,0 +1,107 @@
+The prio_tree.c code indexes vmas using 3 different indexes:
+ * heap_index = vm_pgoff + vm_size_in_pages : end_vm_pgoff
+ * radix_index = vm_pgoff : start_vm_pgoff
+ * size_index = vm_size_in_pages
+
+A regular radix-priority-search-tree indexes vmas using only heap_index and
+radix_index. The conditions for indexing are:
+ * ->heap_index >= ->left->heap_index &&
+ ->heap_index >= ->right->heap_index
+ * if (->heap_index == ->left->heap_index)
+ then ->radix_index < ->left->radix_index;
+ * if (->heap_index == ->right->heap_index)
+ then ->radix_index < ->right->radix_index;
+ * nodes are hashed to left or right subtree using radix_index
+ similar to a pure binary radix tree.
+
+A regular radix-priority-search-tree helps to store and query
+intervals (vmas). However, a regular radix-priority-search-tree is only
+suitable for storing vmas with different radix indices (vm_pgoff).
+
+Therefore, the prio_tree.c extends the regular radix-priority-search-tree
+to handle many vmas with the same vm_pgoff. Such vmas are handled in
+2 different ways: 1) All vmas with the same radix _and_ heap indices are
+linked using vm_set.list, 2) if there are many vmas with the same radix
+index, but different heap indices and if the regular radix-priority-search
+tree cannot index them all, we build an overflow-sub-tree that indexes such
+vmas using heap and size indices instead of heap and radix indices. For
+example, in the figure below some vmas with vm_pgoff = 0 (zero) are
+indexed by regular radix-priority-search-tree whereas others are pushed
+into an overflow-subtree. Note that all vmas in an overflow-sub-tree have
+the same vm_pgoff (radix_index) and if necessary we build different
+overflow-sub-trees to handle each possible radix_index. For example,
+in figure we have 3 overflow-sub-trees corresponding to radix indices
+0, 2, and 4.
+
+In the final tree the first few (prio_tree_root->index_bits) levels
+are indexed using heap and radix indices whereas the overflow-sub-trees below
+those levels (i.e. levels prio_tree_root->index_bits + 1 and higher) are
+indexed using heap and size indices. In overflow-sub-trees the size_index
+is used for hashing the nodes to appropriate places.
+
+Now, an example prio_tree:
+
+ vmas are represented [radix_index, size_index, heap_index]
+ i.e., [start_vm_pgoff, vm_size_in_pages, end_vm_pgoff]
+
+level prio_tree_root->index_bits = 3
+-----
+ _
+ 0 [0,7,7] |
+ / \ |
+ ------------------ ------------ | Regular
+ / \ | radix priority
+ 1 [1,6,7] [4,3,7] | search tree
+ / \ / \ |
+ ------- ----- ------ ----- | heap-and-radix
+ / \ / \ | indexed
+ 2 [0,6,6] [2,5,7] [5,2,7] [6,1,7] |
+ / \ / \ / \ / \ |
+ 3 [0,5,5] [1,5,6] [2,4,6] [3,4,7] [4,2,6] [5,1,6] [6,0,6] [7,0,7] |
+ / / / _
+ / / / _
+ 4 [0,4,4] [2,3,5] [4,1,5] |
+ / / / |
+ 5 [0,3,3] [2,2,4] [4,0,4] | Overflow-sub-trees
+ / / |
+ 6 [0,2,2] [2,1,3] | heap-and-size
+ / / | indexed
+ 7 [0,1,1] [2,0,2] |
+ / |
+ 8 [0,0,0] |
+ _
+
+Note that we use prio_tree_root->index_bits to optimize the height
+of the heap-and-radix indexed tree. Since prio_tree_root->index_bits is
+set according to the maximum end_vm_pgoff mapped, we are sure that all
+bits (in vm_pgoff) above prio_tree_root->index_bits are 0 (zero). Therefore,
+we only use the first prio_tree_root->index_bits as radix_index.
+Whenever index_bits is increased in prio_tree_expand, we shuffle the tree
+to make sure that the first prio_tree_root->index_bits levels of the tree
+is indexed properly using heap and radix indices.
+
+We do not optimize the height of overflow-sub-trees using index_bits.
+The reason is: there can be many such overflow-sub-trees and all of
+them have to be suffled whenever the index_bits increases. This may involve
+walking the whole prio_tree in prio_tree_insert->prio_tree_expand code
+path which is not desirable. Hence, we do not optimize the height of the
+heap-and-size indexed overflow-sub-trees using prio_tree->index_bits.
+Instead the overflow sub-trees are indexed using full BITS_PER_LONG bits
+of size_index. This may lead to skewed sub-trees because most of the
+higher significant bits of the size_index are likely to be 0 (zero). In
+the example above, all 3 overflow-sub-trees are skewed. This may marginally
+affect the performance. However, processes rarely map many vmas with the
+same start_vm_pgoff but different end_vm_pgoffs. Therefore, we normally
+do not require overflow-sub-trees to index all vmas.
+
+From the above discussion it is clear that the maximum height of
+a prio_tree can be prio_tree_root->index_bits + BITS_PER_LONG.
+However, in most of the common cases we do not need overflow-sub-trees,
+so the tree height in the common cases will be prio_tree_root->index_bits.
+
+It is fair to mention here that the prio_tree_root->index_bits
+is increased on demand, however, the index_bits is not decreased when
+vmas are removed from the prio_tree. That's tricky to do. Hence, it's
+left as a home work problem.
+
+
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
new file mode 100644
index 0000000..7224459
--- /dev/null
+++ b/Documentation/rbtree.txt
@@ -0,0 +1,192 @@
+Red-black Trees (rbtree) in Linux
+January 18, 2007
+Rob Landley <rob@landley.net>
+=============================
+
+What are red-black trees, and what are they for?
+------------------------------------------------
+
+Red-black trees are a type of self-balancing binary search tree, used for
+storing sortable key/value data pairs. This differs from radix trees (which
+are used to efficiently store sparse arrays and thus use long integer indexes
+to insert/access/delete nodes) and hash tables (which are not kept sorted to
+be easily traversed in order, and must be tuned for a specific size and
+hash function where rbtrees scale gracefully storing arbitrary keys).
+
+Red-black trees are similar to AVL trees, but provide faster real-time bounded
+worst case performance for insertion and deletion (at most two rotations and
+three rotations, respectively, to balance the tree), with slightly slower
+(but still O(log n)) lookup time.
+
+To quote Linux Weekly News:
+
+ There are a number of red-black trees in use in the kernel.
+ The anticipatory, deadline, and CFQ I/O schedulers all employ
+ rbtrees to track requests; the packet CD/DVD driver does the same.
+ The high-resolution timer code uses an rbtree to organize outstanding
+ timer requests. The ext3 filesystem tracks directory entries in a
+ red-black tree. Virtual memory areas (VMAs) are tracked with red-black
+ trees, as are epoll file descriptors, cryptographic keys, and network
+ packets in the "hierarchical token bucket" scheduler.
+
+This document covers use of the Linux rbtree implementation. For more
+information on the nature and implementation of Red Black Trees, see:
+
+ Linux Weekly News article on red-black trees
+ http://lwn.net/Articles/184495/
+
+ Wikipedia entry on red-black trees
+ http://en.wikipedia.org/wiki/Red-black_tree
+
+Linux implementation of red-black trees
+---------------------------------------
+
+Linux's rbtree implementation lives in the file "lib/rbtree.c". To use it,
+"#include <linux/rbtree.h>".
+
+The Linux rbtree implementation is optimized for speed, and thus has one
+less layer of indirection (and better cache locality) than more traditional
+tree implementations. Instead of using pointers to separate rb_node and data
+structures, each instance of struct rb_node is embedded in the data structure
+it organizes. And instead of using a comparison callback function pointer,
+users are expected to write their own tree search and insert functions
+which call the provided rbtree functions. Locking is also left up to the
+user of the rbtree code.
+
+Creating a new rbtree
+---------------------
+
+Data nodes in an rbtree tree are structures containing a struct rb_node member:
+
+ struct mytype {
+ struct rb_node node;
+ char *keystring;
+ };
+
+When dealing with a pointer to the embedded struct rb_node, the containing data
+structure may be accessed with the standard container_of() macro. In addition,
+individual members may be accessed directly via rb_entry(node, type, member).
+
+At the root of each rbtree is an rb_root structure, which is initialized to be
+empty via:
+
+ struct rb_root mytree = RB_ROOT;
+
+Searching for a value in an rbtree
+----------------------------------
+
+Writing a search function for your tree is fairly straightforward: start at the
+root, compare each value, and follow the left or right branch as necessary.
+
+Example:
+
+ struct mytype *my_search(struct rb_root *root, char *string)
+ {
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct mytype *data = container_of(node, struct mytype, node);
+ int result;
+
+ result = strcmp(string, data->keystring);
+
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+ }
+
+Inserting data into an rbtree
+-----------------------------
+
+Inserting data in the tree involves first searching for the place to insert the
+new node, then inserting the node and rebalancing ("recoloring") the tree.
+
+The search for insertion differs from the previous search by finding the
+location of the pointer on which to graft the new node. The new node also
+needs a link to its parent node for rebalancing purposes.
+
+Example:
+
+ int my_insert(struct rb_root *root, struct mytype *data)
+ {
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct mytype *this = container_of(*new, struct mytype, node);
+ int result = strcmp(data->keystring, this->keystring);
+
+ parent = *new;
+ if (result < 0)
+ new = &((*new)->rb_left);
+ else if (result > 0)
+ new = &((*new)->rb_right);
+ else
+ return FALSE;
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(data->node, parent, new);
+ rb_insert_color(data->node, root);
+
+ return TRUE;
+ }
+
+Removing or replacing existing data in an rbtree
+------------------------------------------------
+
+To remove an existing node from a tree, call:
+
+ void rb_erase(struct rb_node *victim, struct rb_root *tree);
+
+Example:
+
+ struct mytype *data = mysearch(mytree, "walrus");
+
+ if (data) {
+ rb_erase(data->node, mytree);
+ myfree(data);
+ }
+
+To replace an existing node in a tree with a new one with the same key, call:
+
+ void rb_replace_node(struct rb_node *old, struct rb_node *new,
+ struct rb_root *tree);
+
+Replacing a node this way does not re-sort the tree: If the new node doesn't
+have the same key as the old node, the rbtree will probably become corrupted.
+
+Iterating through the elements stored in an rbtree (in sort order)
+------------------------------------------------------------------
+
+Four functions are provided for iterating through an rbtree's contents in
+sorted order. These work on arbitrary trees, and should not need to be
+modified or wrapped (except for locking purposes):
+
+ struct rb_node *rb_first(struct rb_root *tree);
+ struct rb_node *rb_last(struct rb_root *tree);
+ struct rb_node *rb_next(struct rb_node *node);
+ struct rb_node *rb_prev(struct rb_node *node);
+
+To start iterating, call rb_first() or rb_last() with a pointer to the root
+of the tree, which will return a pointer to the node structure contained in
+the first or last element in the tree. To continue, fetch the next or previous
+node by calling rb_next() or rb_prev() on the current node. This will return
+NULL when there are no more nodes left.
+
+The iterator functions return a pointer to the embedded struct rb_node, from
+which the containing data structure may be accessed with the container_of()
+macro, and individual members may be accessed directly via
+rb_entry(node, type, member).
+
+Example:
+
+ struct rb_node *node;
+ for (node = rb_first(&mytree); node; node = rb_next(node))
+ printk("key=%s\n", rb_entry(node, int, keystring));
+
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
new file mode 100644
index 0000000..b65f079
--- /dev/null
+++ b/Documentation/rfkill.txt
@@ -0,0 +1,569 @@
+rfkill - RF switch subsystem support
+====================================
+
+1 Introduction
+2 Implementation details
+3 Kernel driver guidelines
+3.1 wireless device drivers
+3.2 platform/switch drivers
+3.3 input device drivers
+4 Kernel API
+5 Userspace support
+
+
+1. Introduction:
+
+The rfkill switch subsystem exists to add a generic interface to circuitry that
+can enable or disable the signal output of a wireless *transmitter* of any
+type. By far, the most common use is to disable radio-frequency transmitters.
+
+Note that disabling the signal output means that the the transmitter is to be
+made to not emit any energy when "blocked". rfkill is not about blocking data
+transmissions, it is about blocking energy emission.
+
+The rfkill subsystem offers support for keys and switches often found on
+laptops to enable wireless devices like WiFi and Bluetooth, so that these keys
+and switches actually perform an action in all wireless devices of a given type
+attached to the system.
+
+The buttons to enable and disable the wireless transmitters are important in
+situations where the user is for example using his laptop on a location where
+radio-frequency transmitters _must_ be disabled (e.g. airplanes).
+
+Because of this requirement, userspace support for the keys should not be made
+mandatory. Because userspace might want to perform some additional smarter
+tasks when the key is pressed, rfkill provides userspace the possibility to
+take over the task to handle the key events.
+
+===============================================================================
+2: Implementation details
+
+The rfkill subsystem is composed of various components: the rfkill class, the
+rfkill-input module (an input layer handler), and some specific input layer
+events.
+
+The rfkill class provides kernel drivers with an interface that allows them to
+know when they should enable or disable a wireless network device transmitter.
+This is enabled by the CONFIG_RFKILL Kconfig option.
+
+The rfkill class support makes sure userspace will be notified of all state
+changes on rfkill devices through uevents. It provides a notification chain
+for interested parties in the kernel to also get notified of rfkill state
+changes in other drivers. It creates several sysfs entries which can be used
+by userspace. See section "Userspace support".
+
+The rfkill-input module provides the kernel with the ability to implement a
+basic response when the user presses a key or button (or toggles a switch)
+related to rfkill functionality. It is an in-kernel implementation of default
+policy of reacting to rfkill-related input events and neither mandatory nor
+required for wireless drivers to operate. It is enabled by the
+CONFIG_RFKILL_INPUT Kconfig option.
+
+rfkill-input is a rfkill-related events input layer handler. This handler will
+listen to all rfkill key events and will change the rfkill state of the
+wireless devices accordingly. With this option enabled userspace could either
+do nothing or simply perform monitoring tasks.
+
+The rfkill-input module also provides EPO (emergency power-off) functionality
+for all wireless transmitters. This function cannot be overridden, and it is
+always active. rfkill EPO is related to *_RFKILL_ALL input layer events.
+
+
+Important terms for the rfkill subsystem:
+
+In order to avoid confusion, we avoid the term "switch" in rfkill when it is
+referring to an electronic control circuit that enables or disables a
+transmitter. We reserve it for the physical device a human manipulates
+(which is an input device, by the way):
+
+rfkill switch:
+
+ A physical device a human manipulates. Its state can be perceived by
+ the kernel either directly (through a GPIO pin, ACPI GPE) or by its
+ effect on a rfkill line of a wireless device.
+
+rfkill controller:
+
+ A hardware circuit that controls the state of a rfkill line, which a
+ kernel driver can interact with *to modify* that state (i.e. it has
+ either write-only or read/write access).
+
+rfkill line:
+
+ An input channel (hardware or software) of a wireless device, which
+ causes a wireless transmitter to stop emitting energy (BLOCK) when it
+ is active. Point of view is extremely important here: rfkill lines are
+ always seen from the PoV of a wireless device (and its driver).
+
+soft rfkill line/software rfkill line:
+
+ A rfkill line the wireless device driver can directly change the state
+ of. Related to rfkill_state RFKILL_STATE_SOFT_BLOCKED.
+
+hard rfkill line/hardware rfkill line:
+
+ A rfkill line that works fully in hardware or firmware, and that cannot
+ be overridden by the kernel driver. The hardware device or the
+ firmware just exports its status to the driver, but it is read-only.
+ Related to rfkill_state RFKILL_STATE_HARD_BLOCKED.
+
+The enum rfkill_state describes the rfkill state of a transmitter:
+
+When a rfkill line or rfkill controller is in the RFKILL_STATE_UNBLOCKED state,
+the wireless transmitter (radio TX circuit for example) is *enabled*. When the
+it is in the RFKILL_STATE_SOFT_BLOCKED or RFKILL_STATE_HARD_BLOCKED, the
+wireless transmitter is to be *blocked* from operating.
+
+RFKILL_STATE_SOFT_BLOCKED indicates that a call to toggle_radio() can change
+that state. RFKILL_STATE_HARD_BLOCKED indicates that a call to toggle_radio()
+will not be able to change the state and will return with a suitable error if
+attempts are made to set the state to RFKILL_STATE_UNBLOCKED.
+
+RFKILL_STATE_HARD_BLOCKED is used by drivers to signal that the device is
+locked in the BLOCKED state by a hardwire rfkill line (typically an input pin
+that, when active, forces the transmitter to be disabled) which the driver
+CANNOT override.
+
+Full rfkill functionality requires two different subsystems to cooperate: the
+input layer and the rfkill class. The input layer issues *commands* to the
+entire system requesting that devices registered to the rfkill class change
+state. The way this interaction happens is not complex, but it is not obvious
+either:
+
+Kernel Input layer:
+
+ * Generates KEY_WWAN, KEY_WLAN, KEY_BLUETOOTH, SW_RFKILL_ALL, and
+ other such events when the user presses certain keys, buttons, or
+ toggles certain physical switches.
+
+ THE INPUT LAYER IS NEVER USED TO PROPAGATE STATUS, NOTIFICATIONS OR THE
+ KIND OF STUFF AN ON-SCREEN-DISPLAY APPLICATION WOULD REPORT. It is
+ used to issue *commands* for the system to change behaviour, and these
+ commands may or may not be carried out by some kernel driver or
+ userspace application. It follows that doing user feedback based only
+ on input events is broken, as there is no guarantee that an input event
+ will be acted upon.
+
+ Most wireless communication device drivers implementing rfkill
+ functionality MUST NOT generate these events, and have no reason to
+ register themselves with the input layer. Doing otherwise is a common
+ misconception. There is an API to propagate rfkill status change
+ information, and it is NOT the input layer.
+
+rfkill class:
+
+ * Calls a hook in a driver to effectively change the wireless
+ transmitter state;
+ * Keeps track of the wireless transmitter state (with help from
+ the driver);
+ * Generates userspace notifications (uevents) and a call to a
+ notification chain (kernel) when there is a wireless transmitter
+ state change;
+ * Connects a wireless communications driver with the common rfkill
+ control system, which, for example, allows actions such as
+ "switch all bluetooth devices offline" to be carried out by
+ userspace or by rfkill-input.
+
+ THE RFKILL CLASS NEVER ISSUES INPUT EVENTS. THE RFKILL CLASS DOES
+ NOT LISTEN TO INPUT EVENTS. NO DRIVER USING THE RFKILL CLASS SHALL
+ EVER LISTEN TO, OR ACT ON RFKILL INPUT EVENTS. Doing otherwise is
+ a layering violation.
+
+ Most wireless data communication drivers in the kernel have just to
+ implement the rfkill class API to work properly. Interfacing to the
+ input layer is not often required (and is very often a *bug*) on
+ wireless drivers.
+
+ Platform drivers often have to attach to the input layer to *issue*
+ (but never to listen to) rfkill events for rfkill switches, and also to
+ the rfkill class to export a control interface for the platform rfkill
+ controllers to the rfkill subsystem. This does NOT mean the rfkill
+ switch is attached to a rfkill class (doing so is almost always wrong).
+ It just means the same kernel module is the driver for different
+ devices (rfkill switches and rfkill controllers).
+
+
+Userspace input handlers (uevents) or kernel input handlers (rfkill-input):
+
+ * Implements the policy of what should happen when one of the input
+ layer events related to rfkill operation is received.
+ * Uses the sysfs interface (userspace) or private rfkill API calls
+ to tell the devices registered with the rfkill class to change
+ their state (i.e. translates the input layer event into real
+ action).
+ * rfkill-input implements EPO by handling EV_SW SW_RFKILL_ALL 0
+ (power off all transmitters) in a special way: it ignores any
+ overrides and local state cache and forces all transmitters to the
+ RFKILL_STATE_SOFT_BLOCKED state (including those which are already
+ supposed to be BLOCKED). Note that the opposite event (power on all
+ transmitters) is handled normally.
+
+Userspace uevent handler or kernel platform-specific drivers hooked to the
+rfkill notifier chain:
+
+ * Taps into the rfkill notifier chain or to KOBJ_CHANGE uevents,
+ in order to know when a device that is registered with the rfkill
+ class changes state;
+ * Issues feedback notifications to the user;
+ * In the rare platforms where this is required, synthesizes an input
+ event to command all *OTHER* rfkill devices to also change their
+ statues when a specific rfkill device changes state.
+
+
+===============================================================================
+3: Kernel driver guidelines
+
+Remember: point-of-view is everything for a driver that connects to the rfkill
+subsystem. All the details below must be measured/perceived from the point of
+view of the specific driver being modified.
+
+The first thing one needs to know is whether his driver should be talking to
+the rfkill class or to the input layer. In rare cases (platform drivers), it
+could happen that you need to do both, as platform drivers often handle a
+variety of devices in the same driver.
+
+Do not mistake input devices for rfkill controllers. The only type of "rfkill
+switch" device that is to be registered with the rfkill class are those
+directly controlling the circuits that cause a wireless transmitter to stop
+working (or the software equivalent of them), i.e. what we call a rfkill
+controller. Every other kind of "rfkill switch" is just an input device and
+MUST NOT be registered with the rfkill class.
+
+A driver should register a device with the rfkill class when ALL of the
+following conditions are met (they define a rfkill controller):
+
+1. The device is/controls a data communications wireless transmitter;
+
+2. The kernel can interact with the hardware/firmware to CHANGE the wireless
+ transmitter state (block/unblock TX operation);
+
+3. The transmitter can be made to not emit any energy when "blocked":
+ rfkill is not about blocking data transmissions, it is about blocking
+ energy emission;
+
+A driver should register a device with the input subsystem to issue
+rfkill-related events (KEY_WLAN, KEY_BLUETOOTH, KEY_WWAN, KEY_WIMAX,
+SW_RFKILL_ALL, etc) when ALL of the folowing conditions are met:
+
+1. It is directly related to some physical device the user interacts with, to
+ command the O.S./firmware/hardware to enable/disable a data communications
+ wireless transmitter.
+
+ Examples of the physical device are: buttons, keys and switches the user
+ will press/touch/slide/switch to enable or disable the wireless
+ communication device.
+
+2. It is NOT slaved to another device, i.e. there is no other device that
+ issues rfkill-related input events in preference to this one.
+
+ Please refer to the corner cases and examples section for more details.
+
+When in doubt, do not issue input events. For drivers that should generate
+input events in some platforms, but not in others (e.g. b43), the best solution
+is to NEVER generate input events in the first place. That work should be
+deferred to a platform-specific kernel module (which will know when to generate
+events through the rfkill notifier chain) or to userspace. This avoids the
+usual maintenance problems with DMI whitelisting.
+
+
+Corner cases and examples:
+====================================
+
+1. If the device is an input device that, because of hardware or firmware,
+causes wireless transmitters to be blocked regardless of the kernel's will, it
+is still just an input device, and NOT to be registered with the rfkill class.
+
+2. If the wireless transmitter switch control is read-only, it is an input
+device and not to be registered with the rfkill class (and maybe not to be made
+an input layer event source either, see below).
+
+3. If there is some other device driver *closer* to the actual hardware the
+user interacted with (the button/switch/key) to issue an input event, THAT is
+the device driver that should be issuing input events.
+
+E.g:
+ [RFKILL slider switch] -- [GPIO hardware] -- [WLAN card rf-kill input]
+ (platform driver) (wireless card driver)
+
+The user is closer to the RFKILL slide switch plaform driver, so the driver
+which must issue input events is the platform driver looking at the GPIO
+hardware, and NEVER the wireless card driver (which is just a slave). It is
+very likely that there are other leaves than just the WLAN card rf-kill input
+(e.g. a bluetooth card, etc)...
+
+On the other hand, some embedded devices do this:
+
+ [RFKILL slider switch] -- [WLAN card rf-kill input]
+ (wireless card driver)
+
+In this situation, the wireless card driver *could* register itself as an input
+device and issue rf-kill related input events... but in order to AVOID the need
+for DMI whitelisting, the wireless card driver does NOT do it. Userspace (HAL)
+or a platform driver (that exists only on these embedded devices) will do the
+dirty job of issuing the input events.
+
+
+COMMON MISTAKES in kernel drivers, related to rfkill:
+====================================
+
+1. NEVER confuse input device keys and buttons with input device switches.
+
+ 1a. Switches are always set or reset. They report the current state
+ (on position or off position).
+
+ 1b. Keys and buttons are either in the pressed or not-pressed state, and
+ that's it. A "button" that latches down when you press it, and
+ unlatches when you press it again is in fact a switch as far as input
+ devices go.
+
+Add the SW_* events you need for switches, do NOT try to emulate a button using
+KEY_* events just because there is no such SW_* event yet. Do NOT try to use,
+for example, KEY_BLUETOOTH when you should be using SW_BLUETOOTH instead.
+
+2. Input device switches (sources of EV_SW events) DO store their current state
+(so you *must* initialize it by issuing a gratuitous input layer event on
+driver start-up and also when resuming from sleep), and that state CAN be
+queried from userspace through IOCTLs. There is no sysfs interface for this,
+but that doesn't mean you should break things trying to hook it to the rfkill
+class to get a sysfs interface :-)
+
+3. Do not issue *_RFKILL_ALL events by default, unless you are sure it is the
+correct event for your switch/button. These events are emergency power-off
+events when they are trying to turn the transmitters off. An example of an
+input device which SHOULD generate *_RFKILL_ALL events is the wireless-kill
+switch in a laptop which is NOT a hotkey, but a real switch that kills radios
+in hardware, even if the O.S. has gone to lunch. An example of an input device
+which SHOULD NOT generate *_RFKILL_ALL events by default, is any sort of hot
+key that does nothing by itself, as well as any hot key that is type-specific
+(e.g. the one for WLAN).
+
+
+3.1 Guidelines for wireless device drivers
+------------------------------------------
+
+(in this text, rfkill->foo means the foo field of struct rfkill).
+
+1. Each independent transmitter in a wireless device (usually there is only one
+transmitter per device) should have a SINGLE rfkill class attached to it.
+
+2. If the device does not have any sort of hardware assistance to allow the
+driver to rfkill the device, the driver should emulate it by taking all actions
+required to silence the transmitter.
+
+3. If it is impossible to silence the transmitter (i.e. it still emits energy,
+even if it is just in brief pulses, when there is no data to transmit and there
+is no hardware support to turn it off) do NOT lie to the users. Do not attach
+it to a rfkill class. The rfkill subsystem does not deal with data
+transmission, it deals with energy emission. If the transmitter is emitting
+energy, it is not blocked in rfkill terms.
+
+4. It doesn't matter if the device has multiple rfkill input lines affecting
+the same transmitter, their combined state is to be exported as a single state
+per transmitter (see rule 1).
+
+This rule exists because users of the rfkill subsystem expect to get (and set,
+when possible) the overall transmitter rfkill state, not of a particular rfkill
+line.
+
+5. The wireless device driver MUST NOT leave the transmitter enabled during
+suspend and hibernation unless:
+
+ 5.1. The transmitter has to be enabled for some sort of functionality
+ like wake-on-wireless-packet or autonomous packed forwarding in a mesh
+ network, and that functionality is enabled for this suspend/hibernation
+ cycle.
+
+AND
+
+ 5.2. The device was not on a user-requested BLOCKED state before
+ the suspend (i.e. the driver must NOT unblock a device, not even
+ to support wake-on-wireless-packet or remain in the mesh).
+
+In other words, there is absolutely no allowed scenario where a driver can
+automatically take action to unblock a rfkill controller (obviously, this deals
+with scenarios where soft-blocking or both soft and hard blocking is happening.
+Scenarios where hardware rfkill lines are the only ones blocking the
+transmitter are outside of this rule, since the wireless device driver does not
+control its input hardware rfkill lines in the first place).
+
+6. During resume, rfkill will try to restore its previous state.
+
+7. After a rfkill class is suspended, it will *not* call rfkill->toggle_radio
+until it is resumed.
+
+
+Example of a WLAN wireless driver connected to the rfkill subsystem:
+--------------------------------------------------------------------
+
+A certain WLAN card has one input pin that causes it to block the transmitter
+and makes the status of that input pin available (only for reading!) to the
+kernel driver. This is a hard rfkill input line (it cannot be overridden by
+the kernel driver).
+
+The card also has one PCI register that, if manipulated by the driver, causes
+it to block the transmitter. This is a soft rfkill input line.
+
+It has also a thermal protection circuitry that shuts down its transmitter if
+the card overheats, and makes the status of that protection available (only for
+reading!) to the kernel driver. This is also a hard rfkill input line.
+
+If either one of these rfkill lines are active, the transmitter is blocked by
+the hardware and forced offline.
+
+The driver should allocate and attach to its struct device *ONE* instance of
+the rfkill class (there is only one transmitter).
+
+It can implement the get_state() hook, and return RFKILL_STATE_HARD_BLOCKED if
+either one of its two hard rfkill input lines are active. If the two hard
+rfkill lines are inactive, it must return RFKILL_STATE_SOFT_BLOCKED if its soft
+rfkill input line is active. Only if none of the rfkill input lines are
+active, will it return RFKILL_STATE_UNBLOCKED.
+
+Since the device has a hardware rfkill line, it IS subject to state changes
+external to rfkill. Therefore, the driver must make sure that it calls
+rfkill_force_state() to keep the status always up-to-date, and it must do a
+rfkill_force_state() on resume from sleep.
+
+Every time the driver gets a notification from the card that one of its rfkill
+lines changed state (polling might be needed on badly designed cards that don't
+generate interrupts for such events), it recomputes the rfkill state as per
+above, and calls rfkill_force_state() to update it.
+
+The driver should implement the toggle_radio() hook, that:
+
+1. Returns an error if one of the hardware rfkill lines are active, and the
+caller asked for RFKILL_STATE_UNBLOCKED.
+
+2. Activates the soft rfkill line if the caller asked for state
+RFKILL_STATE_SOFT_BLOCKED. It should do this even if one of the hard rfkill
+lines are active, effectively double-blocking the transmitter.
+
+3. Deactivates the soft rfkill line if none of the hardware rfkill lines are
+active and the caller asked for RFKILL_STATE_UNBLOCKED.
+
+===============================================================================
+4: Kernel API
+
+To build a driver with rfkill subsystem support, the driver should depend on
+(or select) the Kconfig symbol RFKILL; it should _not_ depend on RKFILL_INPUT.
+
+The hardware the driver talks to may be write-only (where the current state
+of the hardware is unknown), or read-write (where the hardware can be queried
+about its current state).
+
+The rfkill class will call the get_state hook of a device every time it needs
+to know the *real* current state of the hardware. This can happen often, but
+it does not do any polling, so it is not enough on hardware that is subject
+to state changes outside of the rfkill subsystem.
+
+Therefore, calling rfkill_force_state() when a state change happens is
+mandatory when the device has a hardware rfkill line, or when something else
+like the firmware could cause its state to be changed without going through the
+rfkill class.
+
+Some hardware provides events when its status changes. In these cases, it is
+best for the driver to not provide a get_state hook, and instead register the
+rfkill class *already* with the correct status, and keep it updated using
+rfkill_force_state() when it gets an event from the hardware.
+
+rfkill_force_state() must be used on the device resume handlers to update the
+rfkill status, should there be any chance of the device status changing during
+the sleep.
+
+There is no provision for a statically-allocated rfkill struct. You must
+use rfkill_allocate() to allocate one.
+
+You should:
+ - rfkill_allocate()
+ - modify rfkill fields (flags, name)
+ - modify state to the current hardware state (THIS IS THE ONLY TIME
+ YOU CAN ACCESS state DIRECTLY)
+ - rfkill_register()
+
+The only way to set a device to the RFKILL_STATE_HARD_BLOCKED state is through
+a suitable return of get_state() or through rfkill_force_state().
+
+When a device is in the RFKILL_STATE_HARD_BLOCKED state, the only way to switch
+it to a different state is through a suitable return of get_state() or through
+rfkill_force_state().
+
+If toggle_radio() is called to set a device to state RFKILL_STATE_SOFT_BLOCKED
+when that device is already at the RFKILL_STATE_HARD_BLOCKED state, it should
+not return an error. Instead, it should try to double-block the transmitter,
+so that its state will change from RFKILL_STATE_HARD_BLOCKED to
+RFKILL_STATE_SOFT_BLOCKED should the hardware blocking cease.
+
+Please refer to the source for more documentation.
+
+===============================================================================
+5: Userspace support
+
+rfkill devices issue uevents (with an action of "change"), with the following
+environment variables set:
+
+RFKILL_NAME
+RFKILL_STATE
+RFKILL_TYPE
+
+The ABI for these variables is defined by the sysfs attributes. It is best
+to take a quick look at the source to make sure of the possible values.
+
+It is expected that HAL will trap those, and bridge them to DBUS, etc. These
+events CAN and SHOULD be used to give feedback to the user about the rfkill
+status of the system.
+
+Input devices may issue events that are related to rfkill. These are the
+various KEY_* events and SW_* events supported by rfkill-input.c.
+
+******IMPORTANT******
+When rfkill-input is ACTIVE, userspace is NOT TO CHANGE THE STATE OF AN RFKILL
+SWITCH IN RESPONSE TO AN INPUT EVENT also handled by rfkill-input, unless it
+has set to true the user_claim attribute for that particular switch. This rule
+is *absolute*; do NOT violate it.
+******IMPORTANT******
+
+Userspace must not assume it is the only source of control for rfkill switches.
+Their state CAN and WILL change due to firmware actions, direct user actions,
+and the rfkill-input EPO override for *_RFKILL_ALL.
+
+When rfkill-input is not active, userspace must initiate a rfkill status
+change by writing to the "state" attribute in order for anything to happen.
+
+Take particular care to implement EV_SW SW_RFKILL_ALL properly. When that
+switch is set to OFF, *every* rfkill device *MUST* be immediately put into the
+RFKILL_STATE_SOFT_BLOCKED state, no questions asked.
+
+The following sysfs entries will be created:
+
+ name: Name assigned by driver to this key (interface or driver name).
+ type: Name of the key type ("wlan", "bluetooth", etc).
+ state: Current state of the transmitter
+ 0: RFKILL_STATE_SOFT_BLOCKED
+ transmitter is forced off, but one can override it
+ by a write to the state attribute;
+ 1: RFKILL_STATE_UNBLOCKED
+ transmiter is NOT forced off, and may operate if
+ all other conditions for such operation are met
+ (such as interface is up and configured, etc);
+ 2: RFKILL_STATE_HARD_BLOCKED
+ transmitter is forced off by something outside of
+ the driver's control. One cannot set a device to
+ this state through writes to the state attribute;
+ claim: 1: Userspace handles events, 0: Kernel handles events
+
+Both the "state" and "claim" entries are also writable. For the "state" entry
+this means that when 1 or 0 is written, the device rfkill state (if not yet in
+the requested state), will be will be toggled accordingly.
+
+For the "claim" entry writing 1 to it means that the kernel no longer handles
+key events even though RFKILL_INPUT input was enabled. When "claim" has been
+set to 0, userspace should make sure that it listens for the input events or
+check the sysfs "state" entry regularly to correctly perform the required tasks
+when the rkfill key is pressed.
+
+A note about input devices and EV_SW events:
+
+In order to know the current state of an input device switch (like
+SW_RFKILL_ALL), you will need to use an IOCTL. That information is not
+available through sysfs in a generic way at this time, and it is not available
+through the rfkill class AT ALL.
diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/robust-futex-ABI.txt
new file mode 100644
index 0000000..535f69f
--- /dev/null
+++ b/Documentation/robust-futex-ABI.txt
@@ -0,0 +1,182 @@
+Started by Paul Jackson <pj@sgi.com>
+
+The robust futex ABI
+--------------------
+
+Robust_futexes provide a mechanism that is used in addition to normal
+futexes, for kernel assist of cleanup of held locks on task exit.
+
+The interesting data as to what futexes a thread is holding is kept on a
+linked list in user space, where it can be updated efficiently as locks
+are taken and dropped, without kernel intervention. The only additional
+kernel intervention required for robust_futexes above and beyond what is
+required for futexes is:
+
+ 1) a one time call, per thread, to tell the kernel where its list of
+ held robust_futexes begins, and
+ 2) internal kernel code at exit, to handle any listed locks held
+ by the exiting thread.
+
+The existing normal futexes already provide a "Fast Userspace Locking"
+mechanism, which handles uncontested locking without needing a system
+call, and handles contested locking by maintaining a list of waiting
+threads in the kernel. Options on the sys_futex(2) system call support
+waiting on a particular futex, and waking up the next waiter on a
+particular futex.
+
+For robust_futexes to work, the user code (typically in a library such
+as glibc linked with the application) has to manage and place the
+necessary list elements exactly as the kernel expects them. If it fails
+to do so, then improperly listed locks will not be cleaned up on exit,
+probably causing deadlock or other such failure of the other threads
+waiting on the same locks.
+
+A thread that anticipates possibly using robust_futexes should first
+issue the system call:
+
+ asmlinkage long
+ sys_set_robust_list(struct robust_list_head __user *head, size_t len);
+
+The pointer 'head' points to a structure in the threads address space
+consisting of three words. Each word is 32 bits on 32 bit arch's, or 64
+bits on 64 bit arch's, and local byte order. Each thread should have
+its own thread private 'head'.
+
+If a thread is running in 32 bit compatibility mode on a 64 native arch
+kernel, then it can actually have two such structures - one using 32 bit
+words for 32 bit compatibility mode, and one using 64 bit words for 64
+bit native mode. The kernel, if it is a 64 bit kernel supporting 32 bit
+compatibility mode, will attempt to process both lists on each task
+exit, if the corresponding sys_set_robust_list() call has been made to
+setup that list.
+
+ The first word in the memory structure at 'head' contains a
+ pointer to a single linked list of 'lock entries', one per lock,
+ as described below. If the list is empty, the pointer will point
+ to itself, 'head'. The last 'lock entry' points back to the 'head'.
+
+ The second word, called 'offset', specifies the offset from the
+ address of the associated 'lock entry', plus or minus, of what will
+ be called the 'lock word', from that 'lock entry'. The 'lock word'
+ is always a 32 bit word, unlike the other words above. The 'lock
+ word' holds 3 flag bits in the upper 3 bits, and the thread id (TID)
+ of the thread holding the lock in the bottom 29 bits. See further
+ below for a description of the flag bits.
+
+ The third word, called 'list_op_pending', contains transient copy of
+ the address of the 'lock entry', during list insertion and removal,
+ and is needed to correctly resolve races should a thread exit while
+ in the middle of a locking or unlocking operation.
+
+Each 'lock entry' on the single linked list starting at 'head' consists
+of just a single word, pointing to the next 'lock entry', or back to
+'head' if there are no more entries. In addition, nearby to each 'lock
+entry', at an offset from the 'lock entry' specified by the 'offset'
+word, is one 'lock word'.
+
+The 'lock word' is always 32 bits, and is intended to be the same 32 bit
+lock variable used by the futex mechanism, in conjunction with
+robust_futexes. The kernel will only be able to wakeup the next thread
+waiting for a lock on a threads exit if that next thread used the futex
+mechanism to register the address of that 'lock word' with the kernel.
+
+For each futex lock currently held by a thread, if it wants this
+robust_futex support for exit cleanup of that lock, it should have one
+'lock entry' on this list, with its associated 'lock word' at the
+specified 'offset'. Should a thread die while holding any such locks,
+the kernel will walk this list, mark any such locks with a bit
+indicating their holder died, and wakeup the next thread waiting for
+that lock using the futex mechanism.
+
+When a thread has invoked the above system call to indicate it
+anticipates using robust_futexes, the kernel stores the passed in 'head'
+pointer for that task. The task may retrieve that value later on by
+using the system call:
+
+ asmlinkage long
+ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+ size_t __user *len_ptr);
+
+It is anticipated that threads will use robust_futexes embedded in
+larger, user level locking structures, one per lock. The kernel
+robust_futex mechanism doesn't care what else is in that structure, so
+long as the 'offset' to the 'lock word' is the same for all
+robust_futexes used by that thread. The thread should link those locks
+it currently holds using the 'lock entry' pointers. It may also have
+other links between the locks, such as the reverse side of a double
+linked list, but that doesn't matter to the kernel.
+
+By keeping its locks linked this way, on a list starting with a 'head'
+pointer known to the kernel, the kernel can provide to a thread the
+essential service available for robust_futexes, which is to help clean
+up locks held at the time of (a perhaps unexpectedly) exit.
+
+Actual locking and unlocking, during normal operations, is handled
+entirely by user level code in the contending threads, and by the
+existing futex mechanism to wait for, and wakeup, locks. The kernels
+only essential involvement in robust_futexes is to remember where the
+list 'head' is, and to walk the list on thread exit, handling locks
+still held by the departing thread, as described below.
+
+There may exist thousands of futex lock structures in a threads shared
+memory, on various data structures, at a given point in time. Only those
+lock structures for locks currently held by that thread should be on
+that thread's robust_futex linked lock list a given time.
+
+A given futex lock structure in a user shared memory region may be held
+at different times by any of the threads with access to that region. The
+thread currently holding such a lock, if any, is marked with the threads
+TID in the lower 29 bits of the 'lock word'.
+
+When adding or removing a lock from its list of held locks, in order for
+the kernel to correctly handle lock cleanup regardless of when the task
+exits (perhaps it gets an unexpected signal 9 in the middle of
+manipulating this list), the user code must observe the following
+protocol on 'lock entry' insertion and removal:
+
+On insertion:
+ 1) set the 'list_op_pending' word to the address of the 'lock word'
+ to be inserted,
+ 2) acquire the futex lock,
+ 3) add the lock entry, with its thread id (TID) in the bottom 29 bits
+ of the 'lock word', to the linked list starting at 'head', and
+ 4) clear the 'list_op_pending' word.
+
+On removal:
+ 1) set the 'list_op_pending' word to the address of the 'lock word'
+ to be removed,
+ 2) remove the lock entry for this lock from the 'head' list,
+ 2) release the futex lock, and
+ 2) clear the 'lock_op_pending' word.
+
+On exit, the kernel will consider the address stored in
+'list_op_pending' and the address of each 'lock word' found by walking
+the list starting at 'head'. For each such address, if the bottom 29
+bits of the 'lock word' at offset 'offset' from that address equals the
+exiting threads TID, then the kernel will do two things:
+
+ 1) if bit 31 (0x80000000) is set in that word, then attempt a futex
+ wakeup on that address, which will waken the next thread that has
+ used to the futex mechanism to wait on that address, and
+ 2) atomically set bit 30 (0x40000000) in the 'lock word'.
+
+In the above, bit 31 was set by futex waiters on that lock to indicate
+they were waiting, and bit 30 is set by the kernel to indicate that the
+lock owner died holding the lock.
+
+The kernel exit code will silently stop scanning the list further if at
+any point:
+
+ 1) the 'head' pointer or an subsequent linked list pointer
+ is not a valid address of a user space word
+ 2) the calculated location of the 'lock word' (address plus
+ 'offset') is not the valid address of a 32 bit user space
+ word
+ 3) if the list contains more than 1 million (subject to
+ future kernel configuration changes) elements.
+
+When the kernel sees a list entry whose 'lock word' doesn't have the
+current threads TID in the lower 29 bits, it does nothing with that
+entry, and goes on to the next entry.
+
+Bit 29 (0x20000000) of the 'lock word' is reserved for future use.
diff --git a/Documentation/robust-futexes.txt b/Documentation/robust-futexes.txt
new file mode 100644
index 0000000..0a9446a
--- /dev/null
+++ b/Documentation/robust-futexes.txt
@@ -0,0 +1,218 @@
+Started by: Ingo Molnar <mingo@redhat.com>
+
+Background
+----------
+
+what are robust futexes? To answer that, we first need to understand
+what futexes are: normal futexes are special types of locks that in the
+noncontended case can be acquired/released from userspace without having
+to enter the kernel.
+
+A futex is in essence a user-space address, e.g. a 32-bit lock variable
+field. If userspace notices contention (the lock is already owned and
+someone else wants to grab it too) then the lock is marked with a value
+that says "there's a waiter pending", and the sys_futex(FUTEX_WAIT)
+syscall is used to wait for the other guy to release it. The kernel
+creates a 'futex queue' internally, so that it can later on match up the
+waiter with the waker - without them having to know about each other.
+When the owner thread releases the futex, it notices (via the variable
+value) that there were waiter(s) pending, and does the
+sys_futex(FUTEX_WAKE) syscall to wake them up. Once all waiters have
+taken and released the lock, the futex is again back to 'uncontended'
+state, and there's no in-kernel state associated with it. The kernel
+completely forgets that there ever was a futex at that address. This
+method makes futexes very lightweight and scalable.
+
+"Robustness" is about dealing with crashes while holding a lock: if a
+process exits prematurely while holding a pthread_mutex_t lock that is
+also shared with some other process (e.g. yum segfaults while holding a
+pthread_mutex_t, or yum is kill -9-ed), then waiters for that lock need
+to be notified that the last owner of the lock exited in some irregular
+way.
+
+To solve such types of problems, "robust mutex" userspace APIs were
+created: pthread_mutex_lock() returns an error value if the owner exits
+prematurely - and the new owner can decide whether the data protected by
+the lock can be recovered safely.
+
+There is a big conceptual problem with futex based mutexes though: it is
+the kernel that destroys the owner task (e.g. due to a SEGFAULT), but
+the kernel cannot help with the cleanup: if there is no 'futex queue'
+(and in most cases there is none, futexes being fast lightweight locks)
+then the kernel has no information to clean up after the held lock!
+Userspace has no chance to clean up after the lock either - userspace is
+the one that crashes, so it has no opportunity to clean up. Catch-22.
+
+In practice, when e.g. yum is kill -9-ed (or segfaults), a system reboot
+is needed to release that futex based lock. This is one of the leading
+bugreports against yum.
+
+To solve this problem, the traditional approach was to extend the vma
+(virtual memory area descriptor) concept to have a notion of 'pending
+robust futexes attached to this area'. This approach requires 3 new
+syscall variants to sys_futex(): FUTEX_REGISTER, FUTEX_DEREGISTER and
+FUTEX_RECOVER. At do_exit() time, all vmas are searched to see whether
+they have a robust_head set. This approach has two fundamental problems
+left:
+
+ - it has quite complex locking and race scenarios. The vma-based
+ approach had been pending for years, but they are still not completely
+ reliable.
+
+ - they have to scan _every_ vma at sys_exit() time, per thread!
+
+The second disadvantage is a real killer: pthread_exit() takes around 1
+microsecond on Linux, but with thousands (or tens of thousands) of vmas
+every pthread_exit() takes a millisecond or more, also totally
+destroying the CPU's L1 and L2 caches!
+
+This is very much noticeable even for normal process sys_exit_group()
+calls: the kernel has to do the vma scanning unconditionally! (this is
+because the kernel has no knowledge about how many robust futexes there
+are to be cleaned up, because a robust futex might have been registered
+in another task, and the futex variable might have been simply mmap()-ed
+into this process's address space).
+
+This huge overhead forced the creation of CONFIG_FUTEX_ROBUST so that
+normal kernels can turn it off, but worse than that: the overhead makes
+robust futexes impractical for any type of generic Linux distribution.
+
+So something had to be done.
+
+New approach to robust futexes
+------------------------------
+
+At the heart of this new approach there is a per-thread private list of
+robust locks that userspace is holding (maintained by glibc) - which
+userspace list is registered with the kernel via a new syscall [this
+registration happens at most once per thread lifetime]. At do_exit()
+time, the kernel checks this user-space list: are there any robust futex
+locks to be cleaned up?
+
+In the common case, at do_exit() time, there is no list registered, so
+the cost of robust futexes is just a simple current->robust_list != NULL
+comparison. If the thread has registered a list, then normally the list
+is empty. If the thread/process crashed or terminated in some incorrect
+way then the list might be non-empty: in this case the kernel carefully
+walks the list [not trusting it], and marks all locks that are owned by
+this thread with the FUTEX_OWNER_DIED bit, and wakes up one waiter (if
+any).
+
+The list is guaranteed to be private and per-thread at do_exit() time,
+so it can be accessed by the kernel in a lockless way.
+
+There is one race possible though: since adding to and removing from the
+list is done after the futex is acquired by glibc, there is a few
+instructions window for the thread (or process) to die there, leaving
+the futex hung. To protect against this possibility, userspace (glibc)
+also maintains a simple per-thread 'list_op_pending' field, to allow the
+kernel to clean up if the thread dies after acquiring the lock, but just
+before it could have added itself to the list. Glibc sets this
+list_op_pending field before it tries to acquire the futex, and clears
+it after the list-add (or list-remove) has finished.
+
+That's all that is needed - all the rest of robust-futex cleanup is done
+in userspace [just like with the previous patches].
+
+Ulrich Drepper has implemented the necessary glibc support for this new
+mechanism, which fully enables robust mutexes.
+
+Key differences of this userspace-list based approach, compared to the
+vma based method:
+
+ - it's much, much faster: at thread exit time, there's no need to loop
+ over every vma (!), which the VM-based method has to do. Only a very
+ simple 'is the list empty' op is done.
+
+ - no VM changes are needed - 'struct address_space' is left alone.
+
+ - no registration of individual locks is needed: robust mutexes dont
+ need any extra per-lock syscalls. Robust mutexes thus become a very
+ lightweight primitive - so they dont force the application designer
+ to do a hard choice between performance and robustness - robust
+ mutexes are just as fast.
+
+ - no per-lock kernel allocation happens.
+
+ - no resource limits are needed.
+
+ - no kernel-space recovery call (FUTEX_RECOVER) is needed.
+
+ - the implementation and the locking is "obvious", and there are no
+ interactions with the VM.
+
+Performance
+-----------
+
+I have benchmarked the time needed for the kernel to process a list of 1
+million (!) held locks, using the new method [on a 2GHz CPU]:
+
+ - with FUTEX_WAIT set [contended mutex]: 130 msecs
+ - without FUTEX_WAIT set [uncontended mutex]: 30 msecs
+
+I have also measured an approach where glibc does the lock notification
+[which it currently does for !pshared robust mutexes], and that took 256
+msecs - clearly slower, due to the 1 million FUTEX_WAKE syscalls
+userspace had to do.
+
+(1 million held locks are unheard of - we expect at most a handful of
+locks to be held at a time. Nevertheless it's nice to know that this
+approach scales nicely.)
+
+Implementation details
+----------------------
+
+The patch adds two new syscalls: one to register the userspace list, and
+one to query the registered list pointer:
+
+ asmlinkage long
+ sys_set_robust_list(struct robust_list_head __user *head,
+ size_t len);
+
+ asmlinkage long
+ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+ size_t __user *len_ptr);
+
+List registration is very fast: the pointer is simply stored in
+current->robust_list. [Note that in the future, if robust futexes become
+widespread, we could extend sys_clone() to register a robust-list head
+for new threads, without the need of another syscall.]
+
+So there is virtually zero overhead for tasks not using robust futexes,
+and even for robust futex users, there is only one extra syscall per
+thread lifetime, and the cleanup operation, if it happens, is fast and
+straightforward. The kernel doesn't have any internal distinction between
+robust and normal futexes.
+
+If a futex is found to be held at exit time, the kernel sets the
+following bit of the futex word:
+
+ #define FUTEX_OWNER_DIED 0x40000000
+
+and wakes up the next futex waiter (if any). User-space does the rest of
+the cleanup.
+
+Otherwise, robust futexes are acquired by glibc by putting the TID into
+the futex field atomically. Waiters set the FUTEX_WAITERS bit:
+
+ #define FUTEX_WAITERS 0x80000000
+
+and the remaining bits are for the TID.
+
+Testing, architecture support
+-----------------------------
+
+i've tested the new syscalls on x86 and x86_64, and have made sure the
+parsing of the userspace list is robust [ ;-) ] even if the list is
+deliberately corrupted.
+
+i386 and x86_64 syscalls are wired up at the moment, and Ulrich has
+tested the new glibc code (on x86_64 and i386), and it works for his
+robust-mutex testcases.
+
+All other architectures should build just fine too - but they wont have
+the new syscalls yet.
+
+Architectures need to implement the new futex_atomic_cmpxchg_inatomic()
+inline function before writing up the syscalls (that function returns
+-ENOSYS right now).
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt
new file mode 100644
index 0000000..4b736d2
--- /dev/null
+++ b/Documentation/rt-mutex-design.txt
@@ -0,0 +1,781 @@
+#
+# Copyright (c) 2006 Steven Rostedt
+# Licensed under the GNU Free Documentation License, Version 1.2
+#
+
+RT-mutex implementation design
+------------------------------
+
+This document tries to describe the design of the rtmutex.c implementation.
+It doesn't describe the reasons why rtmutex.c exists. For that please see
+Documentation/rt-mutex.txt. Although this document does explain problems
+that happen without this code, but that is in the concept to understand
+what the code actually is doing.
+
+The goal of this document is to help others understand the priority
+inheritance (PI) algorithm that is used, as well as reasons for the
+decisions that were made to implement PI in the manner that was done.
+
+
+Unbounded Priority Inversion
+----------------------------
+
+Priority inversion is when a lower priority process executes while a higher
+priority process wants to run. This happens for several reasons, and
+most of the time it can't be helped. Anytime a high priority process wants
+to use a resource that a lower priority process has (a mutex for example),
+the high priority process must wait until the lower priority process is done
+with the resource. This is a priority inversion. What we want to prevent
+is something called unbounded priority inversion. That is when the high
+priority process is prevented from running by a lower priority process for
+an undetermined amount of time.
+
+The classic example of unbounded priority inversion is were you have three
+processes, let's call them processes A, B, and C, where A is the highest
+priority process, C is the lowest, and B is in between. A tries to grab a lock
+that C owns and must wait and lets C run to release the lock. But in the
+meantime, B executes, and since B is of a higher priority than C, it preempts C,
+but by doing so, it is in fact preempting A which is a higher priority process.
+Now there's no way of knowing how long A will be sleeping waiting for C
+to release the lock, because for all we know, B is a CPU hog and will
+never give C a chance to release the lock. This is called unbounded priority
+inversion.
+
+Here's a little ASCII art to show the problem.
+
+ grab lock L1 (owned by C)
+ |
+A ---+
+ C preempted by B
+ |
+C +----+
+
+B +-------->
+ B now keeps A from running.
+
+
+Priority Inheritance (PI)
+-------------------------
+
+There are several ways to solve this issue, but other ways are out of scope
+for this document. Here we only discuss PI.
+
+PI is where a process inherits the priority of another process if the other
+process blocks on a lock owned by the current process. To make this easier
+to understand, let's use the previous example, with processes A, B, and C again.
+
+This time, when A blocks on the lock owned by C, C would inherit the priority
+of A. So now if B becomes runnable, it would not preempt C, since C now has
+the high priority of A. As soon as C releases the lock, it loses its
+inherited priority, and A then can continue with the resource that C had.
+
+Terminology
+-----------
+
+Here I explain some terminology that is used in this document to help describe
+the design that is used to implement PI.
+
+PI chain - The PI chain is an ordered series of locks and processes that cause
+ processes to inherit priorities from a previous process that is
+ blocked on one of its locks. This is described in more detail
+ later in this document.
+
+mutex - In this document, to differentiate from locks that implement
+ PI and spin locks that are used in the PI code, from now on
+ the PI locks will be called a mutex.
+
+lock - In this document from now on, I will use the term lock when
+ referring to spin locks that are used to protect parts of the PI
+ algorithm. These locks disable preemption for UP (when
+ CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from
+ entering critical sections simultaneously.
+
+spin lock - Same as lock above.
+
+waiter - A waiter is a struct that is stored on the stack of a blocked
+ process. Since the scope of the waiter is within the code for
+ a process being blocked on the mutex, it is fine to allocate
+ the waiter on the process's stack (local variable). This
+ structure holds a pointer to the task, as well as the mutex that
+ the task is blocked on. It also has the plist node structures to
+ place the task in the waiter_list of a mutex as well as the
+ pi_list of a mutex owner task (described below).
+
+ waiter is sometimes used in reference to the task that is waiting
+ on a mutex. This is the same as waiter->task.
+
+waiters - A list of processes that are blocked on a mutex.
+
+top waiter - The highest priority process waiting on a specific mutex.
+
+top pi waiter - The highest priority process waiting on one of the mutexes
+ that a specific process owns.
+
+Note: task and process are used interchangeably in this document, mostly to
+ differentiate between two processes that are being described together.
+
+
+PI chain
+--------
+
+The PI chain is a list of processes and mutexes that may cause priority
+inheritance to take place. Multiple chains may converge, but a chain
+would never diverge, since a process can't be blocked on more than one
+mutex at a time.
+
+Example:
+
+ Process: A, B, C, D, E
+ Mutexes: L1, L2, L3, L4
+
+ A owns: L1
+ B blocked on L1
+ B owns L2
+ C blocked on L2
+ C owns L3
+ D blocked on L3
+ D owns L4
+ E blocked on L4
+
+The chain would be:
+
+ E->L4->D->L3->C->L2->B->L1->A
+
+To show where two chains merge, we could add another process F and
+another mutex L5 where B owns L5 and F is blocked on mutex L5.
+
+The chain for F would be:
+
+ F->L5->B->L1->A
+
+Since a process may own more than one mutex, but never be blocked on more than
+one, the chains merge.
+
+Here we show both chains:
+
+ E->L4->D->L3->C->L2-+
+ |
+ +->B->L1->A
+ |
+ F->L5-+
+
+For PI to work, the processes at the right end of these chains (or we may
+also call it the Top of the chain) must be equal to or higher in priority
+than the processes to the left or below in the chain.
+
+Also since a mutex may have more than one process blocked on it, we can
+have multiple chains merge at mutexes. If we add another process G that is
+blocked on mutex L2:
+
+ G->L2->B->L1->A
+
+And once again, to show how this can grow I will show the merging chains
+again.
+
+ E->L4->D->L3->C-+
+ +->L2-+
+ | |
+ G-+ +->B->L1->A
+ |
+ F->L5-+
+
+
+Plist
+-----
+
+Before I go further and talk about how the PI chain is stored through lists
+on both mutexes and processes, I'll explain the plist. This is similar to
+the struct list_head functionality that is already in the kernel.
+The implementation of plist is out of scope for this document, but it is
+very important to understand what it does.
+
+There are a few differences between plist and list, the most important one
+being that plist is a priority sorted linked list. This means that the
+priorities of the plist are sorted, such that it takes O(1) to retrieve the
+highest priority item in the list. Obviously this is useful to store processes
+based on their priorities.
+
+Another difference, which is important for implementation, is that, unlike
+list, the head of the list is a different element than the nodes of a list.
+So the head of the list is declared as struct plist_head and nodes that will
+be added to the list are declared as struct plist_node.
+
+
+Mutex Waiter List
+-----------------
+
+Every mutex keeps track of all the waiters that are blocked on itself. The mutex
+has a plist to store these waiters by priority. This list is protected by
+a spin lock that is located in the struct of the mutex. This lock is called
+wait_lock. Since the modification of the waiter list is never done in
+interrupt context, the wait_lock can be taken without disabling interrupts.
+
+
+Task PI List
+------------
+
+To keep track of the PI chains, each process has its own PI list. This is
+a list of all top waiters of the mutexes that are owned by the process.
+Note that this list only holds the top waiters and not all waiters that are
+blocked on mutexes owned by the process.
+
+The top of the task's PI list is always the highest priority task that
+is waiting on a mutex that is owned by the task. So if the task has
+inherited a priority, it will always be the priority of the task that is
+at the top of this list.
+
+This list is stored in the task structure of a process as a plist called
+pi_list. This list is protected by a spin lock also in the task structure,
+called pi_lock. This lock may also be taken in interrupt context, so when
+locking the pi_lock, interrupts must be disabled.
+
+
+Depth of the PI Chain
+---------------------
+
+The maximum depth of the PI chain is not dynamic, and could actually be
+defined. But is very complex to figure it out, since it depends on all
+the nesting of mutexes. Let's look at the example where we have 3 mutexes,
+L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
+The following shows a locking order of L1->L2->L3, but may not actually
+be directly nested that way.
+
+void func1(void)
+{
+ mutex_lock(L1);
+
+ /* do anything */
+
+ mutex_unlock(L1);
+}
+
+void func2(void)
+{
+ mutex_lock(L1);
+ mutex_lock(L2);
+
+ /* do something */
+
+ mutex_unlock(L2);
+ mutex_unlock(L1);
+}
+
+void func3(void)
+{
+ mutex_lock(L2);
+ mutex_lock(L3);
+
+ /* do something else */
+
+ mutex_unlock(L3);
+ mutex_unlock(L2);
+}
+
+void func4(void)
+{
+ mutex_lock(L3);
+
+ /* do something again */
+
+ mutex_unlock(L3);
+}
+
+Now we add 4 processes that run each of these functions separately.
+Processes A, B, C, and D which run functions func1, func2, func3 and func4
+respectively, and such that D runs first and A last. With D being preempted
+in func4 in the "do something again" area, we have a locking that follows:
+
+D owns L3
+ C blocked on L3
+ C owns L2
+ B blocked on L2
+ B owns L1
+ A blocked on L1
+
+And thus we have the chain A->L1->B->L2->C->L3->D.
+
+This gives us a PI depth of 4 (four processes), but looking at any of the
+functions individually, it seems as though they only have at most a locking
+depth of two. So, although the locking depth is defined at compile time,
+it still is very difficult to find the possibilities of that depth.
+
+Now since mutexes can be defined by user-land applications, we don't want a DOS
+type of application that nests large amounts of mutexes to create a large
+PI chain, and have the code holding spin locks while looking at a large
+amount of data. So to prevent this, the implementation not only implements
+a maximum lock depth, but also only holds at most two different locks at a
+time, as it walks the PI chain. More about this below.
+
+
+Mutex owner and flags
+---------------------
+
+The mutex structure contains a pointer to the owner of the mutex. If the
+mutex is not owned, this owner is set to NULL. Since all architectures
+have the task structure on at least a four byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the two
+least significant bits to be used as flags. This part is also described
+in Documentation/rt-mutex.txt, but will also be briefly described here.
+
+Bit 0 is used as the "Pending Owner" flag. This is described later.
+Bit 1 is used as the "Has Waiters" flags. This is also described later
+ in more detail, but is set whenever there are waiters on a mutex.
+
+
+cmpxchg Tricks
+--------------
+
+Some architectures implement an atomic cmpxchg (Compare and Exchange). This
+is used (when applicable) to keep the fast path of grabbing and releasing
+mutexes short.
+
+cmpxchg is basically the following function performed atomically:
+
+unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
+{
+ unsigned long T = *A;
+ if (*A == *B) {
+ *A = *C;
+ }
+ return T;
+}
+#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
+
+This is really nice to have, since it allows you to only update a variable
+if the variable is what you expect it to be. You know if it succeeded if
+the return value (the old value of A) is equal to B.
+
+The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
+the architecture does not support CMPXCHG, then this macro is simply set
+to fail every time. But if CMPXCHG is supported, then this will
+help out extremely to keep the fast path short.
+
+The use of rt_mutex_cmpxchg with the flags in the owner field help optimize
+the system for architectures that support it. This will also be explained
+later in this document.
+
+
+Priority adjustments
+--------------------
+
+The implementation of the PI code in rtmutex.c has several places that a
+process must adjust its priority. With the help of the pi_list of a
+process this is rather easy to know what needs to be adjusted.
+
+The functions implementing the task adjustments are rt_mutex_adjust_prio,
+__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock
+to already be taken), rt_mutex_get_prio, and rt_mutex_setprio.
+
+rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio.
+
+rt_mutex_getprio returns the priority that the task should have. Either the
+task's own normal priority, or if a process of a higher priority is waiting on
+a mutex owned by the task, then that higher priority should be returned.
+Since the pi_list of a task holds an order by priority list of all the top
+waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs
+to compare the top pi waiter to its own normal priority, and return the higher
+priority back.
+
+(Note: if looking at the code, you will notice that the lower number of
+ prio is returned. This is because the prio field in the task structure
+ is an inverse order of the actual priority. So a "prio" of 5 is
+ of higher priority than a "prio" of 10.)
+
+__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the
+result does not equal the task's current priority, then rt_mutex_setprio
+is called to adjust the priority of the task to the new priority.
+Note that rt_mutex_setprio is defined in kernel/sched.c to implement the
+actual change in priority.
+
+It is interesting to note that __rt_mutex_adjust_prio can either increase
+or decrease the priority of the task. In the case that a higher priority
+process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio
+would increase/boost the task's priority. But if a higher priority task
+were for some reason to leave the mutex (timeout or signal), this same function
+would decrease/unboost the priority of the task. That is because the pi_list
+always contains the highest priority task that is waiting on a mutex owned
+by the task, so we only need to compare the priority of that top pi waiter
+to the normal priority of the given task.
+
+
+High level overview of the PI chain walk
+----------------------------------------
+
+The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
+
+The implementation has gone through several iterations, and has ended up
+with what we believe is the best. It walks the PI chain by only grabbing
+at most two locks at a time, and is very efficient.
+
+The rt_mutex_adjust_prio_chain can be used either to boost or lower process
+priorities.
+
+rt_mutex_adjust_prio_chain is called with a task to be checked for PI
+(de)boosting (the owner of a mutex that a process is blocking on), a flag to
+check for deadlocking, the mutex that the task owns, and a pointer to a waiter
+that is the process's waiter struct that is blocked on the mutex (although this
+parameter may be NULL for deboosting).
+
+For this explanation, I will not mention deadlock detection. This explanation
+will try to stay at a high level.
+
+When this function is called, there are no locks held. That also means
+that the state of the owner and lock can change when entered into this function.
+
+Before this function is called, the task has already had rt_mutex_adjust_prio
+performed on it. This means that the task is set to the priority that it
+should be at, but the plist nodes of the task's waiter have not been updated
+with the new priorities, and that this task may not be in the proper locations
+in the pi_lists and wait_lists that the task is blocked on. This function
+solves all that.
+
+A loop is entered, where task is the owner to be checked for PI changes that
+was passed by parameter (for the first iteration). The pi_lock of this task is
+taken to prevent any more changes to the pi_list of the task. This also
+prevents new tasks from completing the blocking on a mutex that is owned by this
+task.
+
+If the task is not blocked on a mutex then the loop is exited. We are at
+the top of the PI chain.
+
+A check is now done to see if the original waiter (the process that is blocked
+on the current mutex) is the top pi waiter of the task. That is, is this
+waiter on the top of the task's pi_list. If it is not, it either means that
+there is another process higher in priority that is blocked on one of the
+mutexes that the task owns, or that the waiter has just woken up via a signal
+or timeout and has left the PI chain. In either case, the loop is exited, since
+we don't need to do any more changes to the priority of the current task, or any
+task that owns a mutex that this current task is waiting on. A priority chain
+walk is only needed when a new top pi waiter is made to a task.
+
+The next check sees if the task's waiter plist node has the priority equal to
+the priority the task is set at. If they are equal, then we are done with
+the loop. Remember that the function started with the priority of the
+task adjusted, but the plist nodes that hold the task in other processes
+pi_lists have not been adjusted.
+
+Next, we look at the mutex that the task is blocked on. The mutex's wait_lock
+is taken. This is done by a spin_trylock, because the locking order of the
+pi_lock and wait_lock goes in the opposite direction. If we fail to grab the
+lock, the pi_lock is released, and we restart the loop.
+
+Now that we have both the pi_lock of the task as well as the wait_lock of
+the mutex the task is blocked on, we update the task's waiter's plist node
+that is located on the mutex's wait_list.
+
+Now we release the pi_lock of the task.
+
+Next the owner of the mutex has its pi_lock taken, so we can update the
+task's entry in the owner's pi_list. If the task is the highest priority
+process on the mutex's wait_list, then we remove the previous top waiter
+from the owner's pi_list, and replace it with the task.
+
+Note: It is possible that the task was the current top waiter on the mutex,
+ in which case the task is not yet on the pi_list of the waiter. This
+ is OK, since plist_del does nothing if the plist node is not on any
+ list.
+
+If the task was not the top waiter of the mutex, but it was before we
+did the priority updates, that means we are deboosting/lowering the
+task. In this case, the task is removed from the pi_list of the owner,
+and the new top waiter is added.
+
+Lastly, we unlock both the pi_lock of the task, as well as the mutex's
+wait_lock, and continue the loop again. On the next iteration of the
+loop, the previous owner of the mutex will be the task that will be
+processed.
+
+Note: One might think that the owner of this mutex might have changed
+ since we just grab the mutex's wait_lock. And one could be right.
+ The important thing to remember is that the owner could not have
+ become the task that is being processed in the PI chain, since
+ we have taken that task's pi_lock at the beginning of the loop.
+ So as long as there is an owner of this mutex that is not the same
+ process as the tasked being worked on, we are OK.
+
+ Looking closely at the code, one might be confused. The check for the
+ end of the PI chain is when the task isn't blocked on anything or the
+ task's waiter structure "task" element is NULL. This check is
+ protected only by the task's pi_lock. But the code to unlock the mutex
+ sets the task's waiter structure "task" element to NULL with only
+ the protection of the mutex's wait_lock, which was not taken yet.
+ Isn't this a race condition if the task becomes the new owner?
+
+ The answer is No! The trick is the spin_trylock of the mutex's
+ wait_lock. If we fail that lock, we release the pi_lock of the
+ task and continue the loop, doing the end of PI chain check again.
+
+ In the code to release the lock, the wait_lock of the mutex is held
+ the entire time, and it is not let go when we grab the pi_lock of the
+ new owner of the mutex. So if the switch of a new owner were to happen
+ after the check for end of the PI chain and the grabbing of the
+ wait_lock, the unlocking code would spin on the new owner's pi_lock
+ but never give up the wait_lock. So the PI chain loop is guaranteed to
+ fail the spin_trylock on the wait_lock, release the pi_lock, and
+ try again.
+
+ If you don't quite understand the above, that's OK. You don't have to,
+ unless you really want to make a proof out of it ;)
+
+
+Pending Owners and Lock stealing
+--------------------------------
+
+One of the flags in the owner field of the mutex structure is "Pending Owner".
+What this means is that an owner was chosen by the process releasing the
+mutex, but that owner has yet to wake up and actually take the mutex.
+
+Why is this important? Why can't we just give the mutex to another process
+and be done with it?
+
+The PI code is to help with real-time processes, and to let the highest
+priority process run as long as possible with little latencies and delays.
+If a high priority process owns a mutex that a lower priority process is
+blocked on, when the mutex is released it would be given to the lower priority
+process. What if the higher priority process wants to take that mutex again.
+The high priority process would fail to take that mutex that it just gave up
+and it would need to boost the lower priority process to run with full
+latency of that critical section (since the low priority process just entered
+it).
+
+There's no reason a high priority process that gives up a mutex should be
+penalized if it tries to take that mutex again. If the new owner of the
+mutex has not woken up yet, there's no reason that the higher priority process
+could not take that mutex away.
+
+To solve this, we introduced Pending Ownership and Lock Stealing. When a
+new process is given a mutex that it was blocked on, it is only given
+pending ownership. This means that it's the new owner, unless a higher
+priority process comes in and tries to grab that mutex. If a higher priority
+process does come along and wants that mutex, we let the higher priority
+process "steal" the mutex from the pending owner (only if it is still pending)
+and continue with the mutex.
+
+
+Taking of a mutex (The walk through)
+------------------------------------
+
+OK, now let's take a look at the detailed walk through of what happens when
+taking a mutex.
+
+The first thing that is tried is the fast taking of the mutex. This is
+done when we have CMPXCHG enabled (otherwise the fast taking automatically
+fails). Only when the owner field of the mutex is NULL can the lock be
+taken with the CMPXCHG and nothing else needs to be done.
+
+If there is contention on the lock, whether it is owned or pending owner
+we go about the slow path (rt_mutex_slowlock).
+
+The slow path function is where the task's waiter structure is created on
+the stack. This is because the waiter structure is only needed for the
+scope of this function. The waiter structure holds the nodes to store
+the task on the wait_list of the mutex, and if need be, the pi_list of
+the owner.
+
+The wait_lock of the mutex is taken since the slow path of unlocking the
+mutex also takes this lock.
+
+We then call try_to_take_rt_mutex. This is where the architecture that
+does not implement CMPXCHG would always grab the lock (if there's no
+contention).
+
+try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
+slow path. The first thing that is done here is an atomic setting of
+the "Has Waiters" flag of the mutex's owner field. Yes, this could really
+be false, because if the mutex has no owner, there are no waiters and
+the current task also won't have any waiters. But we don't have the lock
+yet, so we assume we are going to be a waiter. The reason for this is to
+play nice for those architectures that do have CMPXCHG. By setting this flag
+now, the owner of the mutex can't release the mutex without going into the
+slow unlock path, and it would then need to grab the wait_lock, which this
+code currently holds. So setting the "Has Waiters" flag forces the owner
+to synchronize with this code.
+
+Now that we know that we can't have any races with the owner releasing the
+mutex, we check to see if we can take the ownership. This is done if the
+mutex doesn't have a owner, or if we can steal the mutex from a pending
+owner. Let's look at the situations we have here.
+
+ 1) Has owner that is pending
+ ----------------------------
+
+ The mutex has a owner, but it hasn't woken up and the mutex flag
+ "Pending Owner" is set. The first check is to see if the owner isn't the
+ current task. This is because this function is also used for the pending
+ owner to grab the mutex. When a pending owner wakes up, it checks to see
+ if it can take the mutex, and this is done if the owner is already set to
+ itself. If so, we succeed and leave the function, clearing the "Pending
+ Owner" bit.
+
+ If the pending owner is not current, we check to see if the current priority is
+ higher than the pending owner. If not, we fail the function and return.
+
+ There's also something special about a pending owner. That is a pending owner
+ is never blocked on a mutex. So there is no PI chain to worry about. It also
+ means that if the mutex doesn't have any waiters, there's no accounting needed
+ to update the pending owner's pi_list, since we only worry about processes
+ blocked on the current mutex.
+
+ If there are waiters on this mutex, and we just stole the ownership, we need
+ to take the top waiter, remove it from the pi_list of the pending owner, and
+ add it to the current pi_list. Note that at this moment, the pending owner
+ is no longer on the list of waiters. This is fine, since the pending owner
+ would add itself back when it realizes that it had the ownership stolen
+ from itself. When the pending owner tries to grab the mutex, it will fail
+ in try_to_take_rt_mutex if the owner field points to another process.
+
+ 2) No owner
+ -----------
+
+ If there is no owner (or we successfully stole the lock), we set the owner
+ of the mutex to current, and set the flag of "Has Waiters" if the current
+ mutex actually has waiters, or we clear the flag if it doesn't. See, it was
+ OK that we set that flag early, since now it is cleared.
+
+ 3) Failed to grab ownership
+ ---------------------------
+
+ The most interesting case is when we fail to take ownership. This means that
+ there exists an owner, or there's a pending owner with equal or higher
+ priority than the current task.
+
+We'll continue on the failed case.
+
+If the mutex has a timeout, we set up a timer to go off to break us out
+of this mutex if we failed to get it after a specified amount of time.
+
+Now we enter a loop that will continue to try to take ownership of the mutex, or
+fail from a timeout or signal.
+
+Once again we try to take the mutex. This will usually fail the first time
+in the loop, since it had just failed to get the mutex. But the second time
+in the loop, this would likely succeed, since the task would likely be
+the pending owner.
+
+If the mutex is TASK_INTERRUPTIBLE a check for signals and timeout is done
+here.
+
+The waiter structure has a "task" field that points to the task that is blocked
+on the mutex. This field can be NULL the first time it goes through the loop
+or if the task is a pending owner and had it's mutex stolen. If the "task"
+field is NULL then we need to set up the accounting for it.
+
+Task blocks on mutex
+--------------------
+
+The accounting of a mutex and process is done with the waiter structure of
+the process. The "task" field is set to the process, and the "lock" field
+to the mutex. The plist nodes are initialized to the processes current
+priority.
+
+Since the wait_lock was taken at the entry of the slow lock, we can safely
+add the waiter to the wait_list. If the current process is the highest
+priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_list of the owner,
+and add the current process to that list. Since the pi_list of the owner
+has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
+should adjust its priority accordingly.
+
+If the owner is also blocked on a lock, and had its pi_list changed
+(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
+and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+
+Now all locks are released, and if the current process is still blocked on a
+mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
+
+Waking up in the loop
+---------------------
+
+The schedule can then wake up for a few reasons.
+ 1) we were given pending ownership of the mutex.
+ 2) we received a signal and was TASK_INTERRUPTIBLE
+ 3) we had a timeout and was TASK_INTERRUPTIBLE
+
+In any of these cases, we continue the loop and once again try to grab the
+ownership of the mutex. If we succeed, we exit the loop, otherwise we continue
+and on signal and timeout, will exit the loop, or if we had the mutex stolen
+we just simply add ourselves back on the lists and go back to sleep.
+
+Note: For various reasons, because of timeout and signals, the steal mutex
+ algorithm needs to be careful. This is because the current process is
+ still on the wait_list. And because of dynamic changing of priorities,
+ especially on SCHED_OTHER tasks, the current process can be the
+ highest priority task on the wait_list.
+
+Failed to get mutex on Timeout or Signal
+----------------------------------------
+
+If a timeout or signal occurred, the waiter's "task" field would not be
+NULL and the task needs to be taken off the wait_list of the mutex and perhaps
+pi_list of the owner. If this process was a high priority process, then
+the rt_mutex_adjust_prio_chain needs to be executed again on the owner,
+but this time it will be lowering the priorities.
+
+
+Unlocking the Mutex
+-------------------
+
+The unlocking of a mutex also has a fast path for those architectures with
+CMPXCHG. Since the taking of a mutex on contention always sets the
+"Has Waiters" flag of the mutex's owner, we use this to know if we need to
+take the slow path when unlocking the mutex. If the mutex doesn't have any
+waiters, the owner field of the mutex would equal the current process and
+the mutex can be unlocked by just replacing the owner field with NULL.
+
+If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
+the slow unlock path is taken.
+
+The first thing done in the slow unlock path is to take the wait_lock of the
+mutex. This synchronizes the locking and unlocking of the mutex.
+
+A check is made to see if the mutex has waiters or not. On architectures that
+do not have CMPXCHG, this is the location that the owner of the mutex will
+determine if a waiter needs to be awoken or not. On architectures that
+do have CMPXCHG, that check is done in the fast path, but it is still needed
+in the slow path too. If a waiter of a mutex woke up because of a signal
+or timeout between the time the owner failed the fast path CMPXCHG check and
+the grabbing of the wait_lock, the mutex may not have any waiters, thus the
+owner still needs to make this check. If there are no waiters then the mutex
+owner field is set to NULL, the wait_lock is released and nothing more is
+needed.
+
+If there are waiters, then we need to wake one up and give that waiter
+pending ownership.
+
+On the wake up code, the pi_lock of the current owner is taken. The top
+waiter of the lock is found and removed from the wait_list of the mutex
+as well as the pi_list of the current owner. The task field of the new
+pending owner's waiter structure is set to NULL, and the owner field of the
+mutex is set to the new owner with the "Pending Owner" bit set, as well
+as the "Has Waiters" bit if there still are other processes blocked on the
+mutex.
+
+The pi_lock of the previous owner is released, and the new pending owner's
+pi_lock is taken. Remember that this is the trick to prevent the race
+condition in rt_mutex_adjust_prio_chain from adding itself as a waiter
+on the mutex.
+
+We now clear the "pi_blocked_on" field of the new pending owner, and if
+the mutex still has waiters pending, we add the new top waiter to the pi_list
+of the pending owner.
+
+Finally we unlock the pi_lock of the pending owner and wake it up.
+
+
+Contact
+-------
+
+For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
+
+
+Credits
+-------
+
+Author: Steven Rostedt <rostedt@goodmis.org>
+
+Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap
+
+Updates
+-------
+
+This document was originally written for 2.6.17-rc3-mm1
diff --git a/Documentation/rt-mutex.txt b/Documentation/rt-mutex.txt
new file mode 100644
index 0000000..243393d
--- /dev/null
+++ b/Documentation/rt-mutex.txt
@@ -0,0 +1,79 @@
+RT-mutex subsystem with PI support
+----------------------------------
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of a rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on a rt-mutex itself it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the block of high-prio tasks on
+mutexes which protect shared resources. Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+an high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter list is done in
+priority order. For same priorities FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters list. This list too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. [The
+priority enqueueing is handled by "plists", see include/linux/plist.h
+for more details.]
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1
+are used to keep track of the "owner is pending" and "rtmutex has
+waiters" state.
+
+ owner bit1 bit0
+ NULL 0 0 mutex is free (fast acquire possible)
+ NULL 0 1 invalid state
+ NULL 1 0 Transitional state*
+ NULL 1 1 invalid state
+ taskpointer 0 0 mutex is held (fast release possible)
+ taskpointer 0 1 task is pending owner
+ taskpointer 1 0 mutex is held and has waiters
+ taskpointer 1 1 task is pending owner and mutex has waiters
+
+Pending-ownership handling is a performance optimization:
+pending-ownership is assigned to the first (highest priority) waiter of
+the mutex, when the mutex is released. The thread is woken up and once
+it starts executing it can acquire the mutex. Until the mutex is taken
+by it (bit 0 is cleared) a competing higher priority thread can "steal"
+the mutex which puts the woken up thread back on the waiters list.
+
+The pending-ownership optimization is especially important for the
+uninterrupted workflow of high-prio tasks which repeatedly
+takes/releases locks that have lower-prio waiters. Without this
+optimization the higher-prio thread would ping-pong to the lower-prio
+task [because at unlock time we always assign a new owner].
+
+(*) The "mutex has waiters" bit gets set to take the lock. If the lock
+doesn't already have an owner, this bit is quickly cleared if there are
+no waiters. So this is a transitional state to synchronize with looking
+at the owner field of the mutex and the mutex owner releasing the lock.
diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt
new file mode 100644
index 0000000..8deffcd
--- /dev/null
+++ b/Documentation/rtc.txt
@@ -0,0 +1,450 @@
+
+ Real Time Clock (RTC) Drivers for Linux
+ =======================================
+
+When Linux developers talk about a "Real Time Clock", they usually mean
+something that tracks wall clock time and is battery backed so that it
+works even with system power off. Such clocks will normally not track
+the local time zone or daylight savings time -- unless they dual boot
+with MS-Windows -- but will instead be set to Coordinated Universal Time
+(UTC, formerly "Greenwich Mean Time").
+
+The newest non-PC hardware tends to just count seconds, like the time(2)
+system call reports, but RTCs also very commonly represent time using
+the Gregorian calendar and 24 hour time, as reported by gmtime(3).
+
+Linux has two largely-compatible userspace RTC API families you may
+need to know about:
+
+ * /dev/rtc ... is the RTC provided by PC compatible systems,
+ so it's not very portable to non-x86 systems.
+
+ * /dev/rtc0, /dev/rtc1 ... are part of a framework that's
+ supported by a wide variety of RTC chips on all systems.
+
+Programmers need to understand that the PC/AT functionality is not
+always available, and some systems can do much more. That is, the
+RTCs use the same API to make requests in both RTC frameworks (using
+different filenames of course), but the hardware may not offer the
+same functionality. For example, not every RTC is hooked up to an
+IRQ, so they can't all issue alarms; and where standard PC RTCs can
+only issue an alarm up to 24 hours in the future, other hardware may
+be able to schedule one any time in the upcoming century.
+
+
+ Old PC/AT-Compatible driver: /dev/rtc
+ --------------------------------------
+
+All PCs (even Alpha machines) have a Real Time Clock built into them.
+Usually they are built into the chipset of the computer, but some may
+actually have a Motorola MC146818 (or clone) on the board. This is the
+clock that keeps the date and time while your computer is turned off.
+
+ACPI has standardized that MC146818 functionality, and extended it in
+a few ways (enabling longer alarm periods, and wake-from-hibernate).
+That functionality is NOT exposed in the old driver.
+
+However it can also be used to generate signals from a slow 2Hz to a
+relatively fast 8192Hz, in increments of powers of two. These signals
+are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is
+for...) It can also function as a 24hr alarm, raising IRQ 8 when the
+alarm goes off. The alarm can also be programmed to only check any
+subset of the three programmable values, meaning that it could be set to
+ring on the 30th second of the 30th minute of every hour, for example.
+The clock can also be set to generate an interrupt upon every clock
+update, thus generating a 1Hz signal.
+
+The interrupts are reported via /dev/rtc (major 10, minor 135, read only
+character device) in the form of an unsigned long. The low byte contains
+the type of interrupt (update-done, alarm-rang, or periodic) that was
+raised, and the remaining bytes contain the number of interrupts since
+the last read. Status information is reported through the pseudo-file
+/proc/driver/rtc if the /proc filesystem was enabled. The driver has
+built in locking so that only one process is allowed to have the /dev/rtc
+interface open at a time.
+
+A user process can monitor these interrupts by doing a read(2) or a
+select(2) on /dev/rtc -- either will block/stop the user process until
+the next interrupt is received. This is useful for things like
+reasonably high frequency data acquisition where one doesn't want to
+burn up 100% CPU by polling gettimeofday etc. etc.
+
+At high frequencies, or under high loads, the user process should check
+the number of interrupts received since the last read to determine if
+there has been any interrupt "pileup" so to speak. Just for reference, a
+typical 486-33 running a tight read loop on /dev/rtc will start to suffer
+occasional interrupt pileup (i.e. > 1 IRQ event since last read) for
+frequencies above 1024Hz. So you really should check the high bytes
+of the value you read, especially at frequencies above that of the
+normal timer interrupt, which is 100Hz.
+
+Programming and/or enabling interrupt frequencies greater than 64Hz is
+only allowed by root. This is perhaps a bit conservative, but we don't want
+an evil user generating lots of IRQs on a slow 386sx-16, where it might have
+a negative impact on performance. This 64Hz limit can be changed by writing
+a different value to /proc/sys/dev/rtc/max-user-freq. Note that the
+interrupt handler is only a few lines of code to minimize any possibility
+of this effect.
+
+Also, if the kernel time is synchronized with an external source, the
+kernel will write the time back to the CMOS clock every 11 minutes. In
+the process of doing this, the kernel briefly turns off RTC periodic
+interrupts, so be aware of this if you are doing serious work. If you
+don't synchronize the kernel time with an external source (via ntp or
+whatever) then the kernel will keep its hands off the RTC, allowing you
+exclusive access to the device for your applications.
+
+The alarm and/or interrupt frequency are programmed into the RTC via
+various ioctl(2) calls as listed in ./include/linux/rtc.h
+Rather than write 50 pages describing the ioctl() and so on, it is
+perhaps more useful to include a small test program that demonstrates
+how to use them, and demonstrates the features of the driver. This is
+probably a lot more useful to people interested in writing applications
+that will be using this driver. See the code at the end of this document.
+
+(The original /dev/rtc driver was written by Paul Gortmaker.)
+
+
+ New portable "RTC Class" drivers: /dev/rtcN
+ --------------------------------------------
+
+Because Linux supports many non-ACPI and non-PC platforms, some of which
+have more than one RTC style clock, it needed a more portable solution
+than expecting a single battery-backed MC146818 clone on every system.
+Accordingly, a new "RTC Class" framework has been defined. It offers
+three different userspace interfaces:
+
+ * /dev/rtcN ... much the same as the older /dev/rtc interface
+
+ * /sys/class/rtc/rtcN ... sysfs attributes support readonly
+ access to some RTC attributes.
+
+ * /proc/driver/rtc ... the first RTC (rtc0) may expose itself
+ using a procfs interface. More information is (currently) shown
+ here than through sysfs.
+
+The RTC Class framework supports a wide variety of RTCs, ranging from those
+integrated into embeddable system-on-chip (SOC) processors to discrete chips
+using I2C, SPI, or some other bus to communicate with the host CPU. There's
+even support for PC-style RTCs ... including the features exposed on newer PCs
+through ACPI.
+
+The new framework also removes the "one RTC per system" restriction. For
+example, maybe the low-power battery-backed RTC is a discrete I2C chip, but
+a high functionality RTC is integrated into the SOC. That system might read
+the system clock from the discrete RTC, but use the integrated one for all
+other tasks, because of its greater functionality.
+
+The ioctl() calls supported by /dev/rtc are also supported by the RTC class
+framework. However, because the chips and systems are not standardized,
+some PC/AT functionality might not be provided. And in the same way, some
+newer features -- including those enabled by ACPI -- are exposed by the
+RTC class framework, but can't be supported by the older driver.
+
+ * RTC_RD_TIME, RTC_SET_TIME ... every RTC supports at least reading
+ time, returning the result as a Gregorian calendar date and 24 hour
+ wall clock time. To be most useful, this time may also be updated.
+
+ * RTC_AIE_ON, RTC_AIE_OFF, RTC_ALM_SET, RTC_ALM_READ ... when the RTC
+ is connected to an IRQ line, it can often issue an alarm IRQ up to
+ 24 hours in the future. (Use RTC_WKALM_* by preference.)
+
+ * RTC_WKALM_SET, RTC_WKALM_RD ... RTCs that can issue alarms beyond
+ the next 24 hours use a slightly more powerful API, which supports
+ setting the longer alarm time and enabling its IRQ using a single
+ request (using the same model as EFI firmware).
+
+ * RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, it probably
+ also offers update IRQs whenever the "seconds" counter changes.
+ If needed, the RTC framework can emulate this mechanism.
+
+ * RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... another
+ feature often accessible with an IRQ line is a periodic IRQ, issued
+ at settable frequencies (usually 2^N Hz).
+
+In many cases, the RTC alarm can be a system wake event, used to force
+Linux out of a low power sleep state (or hibernation) back to a fully
+operational state. For example, a system could enter a deep power saving
+state until it's time to execute some scheduled tasks.
+
+Note that many of these ioctls need not actually be implemented by your
+driver. The common rtc-dev interface handles many of these nicely if your
+driver returns ENOIOCTLCMD. Some common examples:
+
+ * RTC_RD_TIME, RTC_SET_TIME: the read_time/set_time functions will be
+ called with appropriate values.
+
+ * RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: the
+ set_alarm/read_alarm functions will be called.
+
+ * RTC_IRQP_SET, RTC_IRQP_READ: the irq_set_freq function will be called
+ to set the frequency while the framework will handle the read for you
+ since the frequency is stored in the irq_freq member of the rtc_device
+ structure. Your driver needs to initialize the irq_freq member during
+ init. Make sure you check the requested frequency is in range of your
+ hardware in the irq_set_freq function. If it isn't, return -EINVAL. If
+ you cannot actually change the frequency, do not define irq_set_freq.
+
+If all else fails, check out the rtc-test.c driver!
+
+
+-------------------- 8< ---------------- 8< -----------------------------
+
+/*
+ * Real Time Clock Driver Test/Example Program
+ *
+ * Compile with:
+ * gcc -s -Wall -Wstrict-prototypes rtctest.c -o rtctest
+ *
+ * Copyright (C) 1996, Paul Gortmaker.
+ *
+ * Released under the GNU General Public License, version 2,
+ * included herein by reference.
+ *
+ */
+
+#include <stdio.h>
+#include <linux/rtc.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+
+
+/*
+ * This expects the new RTC class driver framework, working with
+ * clocks that will often not be clones of what the PC-AT had.
+ * Use the command line to specify another RTC if you need one.
+ */
+static const char default_rtc[] = "/dev/rtc0";
+
+
+int main(int argc, char **argv)
+{
+ int i, fd, retval, irqcount = 0;
+ unsigned long tmp, data;
+ struct rtc_time rtc_tm;
+ const char *rtc = default_rtc;
+
+ switch (argc) {
+ case 2:
+ rtc = argv[1];
+ /* FALLTHROUGH */
+ case 1:
+ break;
+ default:
+ fprintf(stderr, "usage: rtctest [rtcdev]\n");
+ return 1;
+ }
+
+ fd = open(rtc, O_RDONLY);
+
+ if (fd == -1) {
+ perror(rtc);
+ exit(errno);
+ }
+
+ fprintf(stderr, "\n\t\t\tRTC Driver Test Example.\n\n");
+
+ /* Turn on update interrupts (one per second) */
+ retval = ioctl(fd, RTC_UIE_ON, 0);
+ if (retval == -1) {
+ if (errno == ENOTTY) {
+ fprintf(stderr,
+ "\n...Update IRQs not supported.\n");
+ goto test_READ;
+ }
+ perror("RTC_UIE_ON ioctl");
+ exit(errno);
+ }
+
+ fprintf(stderr, "Counting 5 update (1/sec) interrupts from reading %s:",
+ rtc);
+ fflush(stderr);
+ for (i=1; i<6; i++) {
+ /* This read will block */
+ retval = read(fd, &data, sizeof(unsigned long));
+ if (retval == -1) {
+ perror("read");
+ exit(errno);
+ }
+ fprintf(stderr, " %d",i);
+ fflush(stderr);
+ irqcount++;
+ }
+
+ fprintf(stderr, "\nAgain, from using select(2) on /dev/rtc:");
+ fflush(stderr);
+ for (i=1; i<6; i++) {
+ struct timeval tv = {5, 0}; /* 5 second timeout on select */
+ fd_set readfds;
+
+ FD_ZERO(&readfds);
+ FD_SET(fd, &readfds);
+ /* The select will wait until an RTC interrupt happens. */
+ retval = select(fd+1, &readfds, NULL, NULL, &tv);
+ if (retval == -1) {
+ perror("select");
+ exit(errno);
+ }
+ /* This read won't block unlike the select-less case above. */
+ retval = read(fd, &data, sizeof(unsigned long));
+ if (retval == -1) {
+ perror("read");
+ exit(errno);
+ }
+ fprintf(stderr, " %d",i);
+ fflush(stderr);
+ irqcount++;
+ }
+
+ /* Turn off update interrupts */
+ retval = ioctl(fd, RTC_UIE_OFF, 0);
+ if (retval == -1) {
+ perror("RTC_UIE_OFF ioctl");
+ exit(errno);
+ }
+
+test_READ:
+ /* Read the RTC time/date */
+ retval = ioctl(fd, RTC_RD_TIME, &rtc_tm);
+ if (retval == -1) {
+ perror("RTC_RD_TIME ioctl");
+ exit(errno);
+ }
+
+ fprintf(stderr, "\n\nCurrent RTC date/time is %d-%d-%d, %02d:%02d:%02d.\n",
+ rtc_tm.tm_mday, rtc_tm.tm_mon + 1, rtc_tm.tm_year + 1900,
+ rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec);
+
+ /* Set the alarm to 5 sec in the future, and check for rollover */
+ rtc_tm.tm_sec += 5;
+ if (rtc_tm.tm_sec >= 60) {
+ rtc_tm.tm_sec %= 60;
+ rtc_tm.tm_min++;
+ }
+ if (rtc_tm.tm_min == 60) {
+ rtc_tm.tm_min = 0;
+ rtc_tm.tm_hour++;
+ }
+ if (rtc_tm.tm_hour == 24)
+ rtc_tm.tm_hour = 0;
+
+ retval = ioctl(fd, RTC_ALM_SET, &rtc_tm);
+ if (retval == -1) {
+ if (errno == ENOTTY) {
+ fprintf(stderr,
+ "\n...Alarm IRQs not supported.\n");
+ goto test_PIE;
+ }
+ perror("RTC_ALM_SET ioctl");
+ exit(errno);
+ }
+
+ /* Read the current alarm settings */
+ retval = ioctl(fd, RTC_ALM_READ, &rtc_tm);
+ if (retval == -1) {
+ perror("RTC_ALM_READ ioctl");
+ exit(errno);
+ }
+
+ fprintf(stderr, "Alarm time now set to %02d:%02d:%02d.\n",
+ rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec);
+
+ /* Enable alarm interrupts */
+ retval = ioctl(fd, RTC_AIE_ON, 0);
+ if (retval == -1) {
+ perror("RTC_AIE_ON ioctl");
+ exit(errno);
+ }
+
+ fprintf(stderr, "Waiting 5 seconds for alarm...");
+ fflush(stderr);
+ /* This blocks until the alarm ring causes an interrupt */
+ retval = read(fd, &data, sizeof(unsigned long));
+ if (retval == -1) {
+ perror("read");
+ exit(errno);
+ }
+ irqcount++;
+ fprintf(stderr, " okay. Alarm rang.\n");
+
+ /* Disable alarm interrupts */
+ retval = ioctl(fd, RTC_AIE_OFF, 0);
+ if (retval == -1) {
+ perror("RTC_AIE_OFF ioctl");
+ exit(errno);
+ }
+
+test_PIE:
+ /* Read periodic IRQ rate */
+ retval = ioctl(fd, RTC_IRQP_READ, &tmp);
+ if (retval == -1) {
+ /* not all RTCs support periodic IRQs */
+ if (errno == ENOTTY) {
+ fprintf(stderr, "\nNo periodic IRQ support\n");
+ goto done;
+ }
+ perror("RTC_IRQP_READ ioctl");
+ exit(errno);
+ }
+ fprintf(stderr, "\nPeriodic IRQ rate is %ldHz.\n", tmp);
+
+ fprintf(stderr, "Counting 20 interrupts at:");
+ fflush(stderr);
+
+ /* The frequencies 128Hz, 256Hz, ... 8192Hz are only allowed for root. */
+ for (tmp=2; tmp<=64; tmp*=2) {
+
+ retval = ioctl(fd, RTC_IRQP_SET, tmp);
+ if (retval == -1) {
+ /* not all RTCs can change their periodic IRQ rate */
+ if (errno == ENOTTY) {
+ fprintf(stderr,
+ "\n...Periodic IRQ rate is fixed\n");
+ goto done;
+ }
+ perror("RTC_IRQP_SET ioctl");
+ exit(errno);
+ }
+
+ fprintf(stderr, "\n%ldHz:\t", tmp);
+ fflush(stderr);
+
+ /* Enable periodic interrupts */
+ retval = ioctl(fd, RTC_PIE_ON, 0);
+ if (retval == -1) {
+ perror("RTC_PIE_ON ioctl");
+ exit(errno);
+ }
+
+ for (i=1; i<21; i++) {
+ /* This blocks */
+ retval = read(fd, &data, sizeof(unsigned long));
+ if (retval == -1) {
+ perror("read");
+ exit(errno);
+ }
+ fprintf(stderr, " %d",i);
+ fflush(stderr);
+ irqcount++;
+ }
+
+ /* Disable periodic interrupts */
+ retval = ioctl(fd, RTC_PIE_OFF, 0);
+ if (retval == -1) {
+ perror("RTC_PIE_OFF ioctl");
+ exit(errno);
+ }
+ }
+
+done:
+ fprintf(stderr, "\n\n\t\t\t *** Test complete ***\n");
+
+ close(fd);
+
+ return 0;
+}
diff --git a/Documentation/s390/00-INDEX b/Documentation/s390/00-INDEX
new file mode 100644
index 0000000..3a2b963
--- /dev/null
+++ b/Documentation/s390/00-INDEX
@@ -0,0 +1,26 @@
+00-INDEX
+ - this file.
+3270.ChangeLog
+ - ChangeLog for the UTS Global 3270-support patch (outdated).
+3270.txt
+ - how to use the IBM 3270 display system support.
+cds.txt
+ - s390 common device support (common I/O layer).
+CommonIO
+ - common I/O layer command line parameters, procfs and debugfs entries
+config3270.sh
+ - example configuration for 3270 devices.
+DASD
+ - information on the DASD disk device driver.
+Debugging390.txt
+ - hints for debugging on s390 systems.
+driver-model.txt
+ - information on s390 devices and the driver model.
+monreader.txt
+ - information on accessing the z/VM monitor stream from Linux.
+s390dbf.txt
+ - information on using the s390 debug feature.
+TAPE
+ - information on the driver for channel-attached tapes.
+zfcpdump
+ - information on the s390 SCSI dump tool.
diff --git a/Documentation/s390/3270.ChangeLog b/Documentation/s390/3270.ChangeLog
new file mode 100644
index 0000000..031c360
--- /dev/null
+++ b/Documentation/s390/3270.ChangeLog
@@ -0,0 +1,44 @@
+ChangeLog for the UTS Global 3270-support patch
+
+Sep 2002: Get bootup colors right on 3270 console
+ * In tubttybld.c, substantially revise ESC processing so that
+ ESC sequences (especially coloring ones) and the strings
+ they affect work as right as 3270 can get them. Also, set
+ screen height to omit the two rows used for input area, in
+ tty3270_open() in tubtty.c.
+
+Sep 2002: Dynamically get 3270 input buffer
+ * Oversize 3270 screen widths may exceed GEOM_MAXINPLEN columns,
+ so get input-area buffer dynamically when sizing the device in
+ tubmakemin() in tuball.c (if it's the console) or tty3270_open()
+ in tubtty.c (if needed). Change tubp->tty_input to be a
+ pointer rather than an array, in tubio.h.
+
+Sep 2002: Fix tubfs kmalloc()s
+ * Do read and write lengths correctly in fs3270_read()
+ and fs3270_write(), whilst never asking kmalloc()
+ for more than 0x800 bytes. Affects tubfs.c and tubio.h.
+
+Sep 2002: Recognize 3270 control unit type 3174
+ * Recognize control-unit type 0x3174 as well as 0x327?.
+ The IBM 2047 device emulates a 3174 control unit.
+ Modularize control-unit recognition in tuball.c by
+ adding and invoking new tub3270_is_ours().
+
+Apr 2002: Fix 3270 console reboot loop
+ * (Belated log entry) Fixed reboot loop if 3270 console,
+ in tubtty.c:ttu3270_bh().
+
+Feb 6, 2001:
+ * This changelog is new
+ * tub3270 now supports 3270 console:
+ Specify y for CONFIG_3270 and y for CONFIG_3270_CONSOLE.
+ Support for 3215 will not appear if 3270 console support
+ is chosen.
+ NOTE: The default is 3270 console support, NOT 3215.
+ * the components are remodularized: added source modules are
+ tubttybld.c and tubttyscl.c, for screen-building code and
+ scroll-timeout code.
+ * tub3270 source for this (2.4.0) version is #ifdeffed to
+ build with both 2.4.0 and 2.2.16.2.
+ * color support and minimal other ESC-sequence support is added.
diff --git a/Documentation/s390/3270.txt b/Documentation/s390/3270.txt
new file mode 100644
index 0000000..7a5c73a
--- /dev/null
+++ b/Documentation/s390/3270.txt
@@ -0,0 +1,272 @@
+IBM 3270 Display System support
+
+This file describes the driver that supports local channel attachment
+of IBM 3270 devices. It consists of three sections:
+ * Introduction
+ * Installation
+ * Operation
+
+
+INTRODUCTION.
+
+This paper describes installing and operating 3270 devices under
+Linux/390. A 3270 device is a block-mode rows-and-columns terminal of
+which I'm sure hundreds of millions were sold by IBM and clonemakers
+twenty and thirty years ago.
+
+You may have 3270s in-house and not know it. If you're using the
+VM-ESA operating system, define a 3270 to your virtual machine by using
+the command "DEF GRAF <hex-address>" This paper presumes you will be
+defining four 3270s with the CP/CMS commands
+
+ DEF GRAF 620
+ DEF GRAF 621
+ DEF GRAF 622
+ DEF GRAF 623
+
+Your network connection from VM-ESA allows you to use x3270, tn3270, or
+another 3270 emulator, started from an xterm window on your PC or
+workstation. With the DEF GRAF command, an application such as xterm,
+and this Linux-390 3270 driver, you have another way of talking to your
+Linux box.
+
+This paper covers installation of the driver and operation of a
+dialed-in x3270.
+
+
+INSTALLATION.
+
+You install the driver by installing a patch, doing a kernel build, and
+running the configuration script (config3270.sh, in this directory).
+
+WARNING: If you are using 3270 console support, you must rerun the
+configuration script every time you change the console's address (perhaps
+by using the condev= parameter in silo's /boot/parmfile). More precisely,
+you should rerun the configuration script every time your set of 3270s,
+including the console 3270, changes subchannel identifier relative to
+one another. ReIPL as soon as possible after running the configuration
+script and the resulting /tmp/mkdev3270.
+
+If you have chosen to make tub3270 a module, you add a line to
+/etc/modprobe.conf. If you are working on a VM virtual machine, you
+can use DEF GRAF to define virtual 3270 devices.
+
+You may generate both 3270 and 3215 console support, or one or the
+other, or neither. If you generate both, the console type under VM is
+not changed. Use #CP Q TERM to see what the current console type is.
+Use #CP TERM CONMODE 3270 to change it to 3270. If you generate only
+3270 console support, then the driver automatically converts your console
+at boot time to a 3270 if it is a 3215.
+
+In brief, these are the steps:
+ 1. Install the tub3270 patch
+ 2. (If a module) add a line to /etc/modprobe.conf
+ 3. (If VM) define devices with DEF GRAF
+ 4. Reboot
+ 5. Configure
+
+To test that everything works, assuming VM and x3270,
+ 1. Bring up an x3270 window.
+ 2. Use the DIAL command in that window.
+ 3. You should immediately see a Linux login screen.
+
+Here are the installation steps in detail:
+
+ 1. The 3270 driver is a part of the official Linux kernel
+ source. Build a tree with the kernel source and any necessary
+ patches. Then do
+ make oldconfig
+ (If you wish to disable 3215 console support, edit
+ .config; change CONFIG_TN3215's value to "n";
+ and rerun "make oldconfig".)
+ make image
+ make modules
+ make modules_install
+
+ 2. (Perform this step only if you have configured tub3270 as a
+ module.) Add a line to /etc/modprobe.conf to automatically
+ load the driver when it's needed. With this line added,
+ you will see login prompts appear on your 3270s as soon as
+ boot is complete (or with emulated 3270s, as soon as you dial
+ into your vm guest using the command "DIAL <vmguestname>").
+ Since the line-mode major number is 227, the line to add to
+ /etc/modprobe.conf should be:
+ alias char-major-227 tub3270
+
+ 3. Define graphic devices to your vm guest machine, if you
+ haven't already. Define them before you reboot (reipl):
+ DEFINE GRAF 620
+ DEFINE GRAF 621
+ DEFINE GRAF 622
+ DEFINE GRAF 623
+
+ 4. Reboot. The reboot process scans hardware devices, including
+ 3270s, and this enables the tub3270 driver once loaded to respond
+ correctly to the configuration requests of the next step. If
+ you have chosen 3270 console support, your console now behaves
+ as a 3270, not a 3215.
+
+ 5. Run the 3270 configuration script config3270. It is
+ distributed in this same directory, Documentation/s390, as
+ config3270.sh. Inspect the output script it produces,
+ /tmp/mkdev3270, and then run that script. This will create the
+ necessary character special device files and make the necessary
+ changes to /etc/inittab.
+
+ Then notify /sbin/init that /etc/inittab has changed, by issuing
+ the telinit command with the q operand:
+ cd Documentation/s390
+ sh config3270.sh
+ sh /tmp/mkdev3270
+ telinit q
+
+ This should be sufficient for your first time. If your 3270
+ configuration has changed and you're reusing config3270, you
+ should follow these steps:
+ Change 3270 configuration
+ Reboot
+ Run config3270 and /tmp/mkdev3270
+ Reboot
+
+Here are the testing steps in detail:
+
+ 1. Bring up an x3270 window, or use an actual hardware 3278 or
+ 3279, or use the 3270 emulator of your choice. You would be
+ running the emulator on your PC or workstation. You would use
+ the command, for example,
+ x3270 vm-esa-domain-name &
+ if you wanted a 3278 Model 4 with 43 rows of 80 columns, the
+ default model number. The driver does not take advantage of
+ extended attributes.
+
+ The screen you should now see contains a VM logo with input
+ lines near the bottom. Use TAB to move to the bottom line,
+ probably labeled "COMMAND ===>".
+
+ 2. Use the DIAL command instead of the LOGIN command to connect
+ to one of the virtual 3270s you defined with the DEF GRAF
+ commands:
+ dial my-vm-guest-name
+
+ 3. You should immediately see a login prompt from your
+ Linux-390 operating system. If that does not happen, you would
+ see instead the line "DIALED TO my-vm-guest-name 0620".
+
+ To troubleshoot: do these things.
+
+ A. Is the driver loaded? Use the lsmod command (no operands)
+ to find out. Probably it isn't. Try loading it manually, with
+ the command "insmod tub3270". Does that command give error
+ messages? Ha! There's your problem.
+
+ B. Is the /etc/inittab file modified as in installation step 3
+ above? Use the grep command to find out; for instance, issue
+ "grep 3270 /etc/inittab". Nothing found? There's your
+ problem!
+
+ C. Are the device special files created, as in installation
+ step 2 above? Use the ls -l command to find out; for instance,
+ issue "ls -l /dev/3270/tty620". The output should start with the
+ letter "c" meaning character device and should contain "227, 1"
+ just to the left of the device name. No such file? no "c"?
+ Wrong major number? Wrong minor number? There's your
+ problem!
+
+ D. Do you get the message
+ "HCPDIA047E my-vm-guest-name 0620 does not exist"?
+ If so, you must issue the command "DEF GRAF 620" from your VM
+ 3215 console and then reboot the system.
+
+
+
+OPERATION.
+
+The driver defines three areas on the 3270 screen: the log area, the
+input area, and the status area.
+
+The log area takes up all but the bottom two lines of the screen. The
+driver writes terminal output to it, starting at the top line and going
+down. When it fills, the status area changes from "Linux Running" to
+"Linux More...". After a scrolling timeout of (default) 5 sec, the
+screen clears and more output is written, from the top down.
+
+The input area extends from the beginning of the second-to-last screen
+line to the start of the status area. You type commands in this area
+and hit ENTER to execute them.
+
+The status area initializes to "Linux Running" to give you a warm
+fuzzy feeling. When the log area fills up and output awaits, it
+changes to "Linux More...". At this time you can do several things or
+nothing. If you do nothing, the screen will clear in (default) 5 sec
+and more output will appear. You may hit ENTER with nothing typed in
+the input area to toggle between "Linux More..." and "Linux Holding",
+which indicates no scrolling will occur. (If you hit ENTER with "Linux
+Running" and nothing typed, the application receives a newline.)
+
+You may change the scrolling timeout value. For example, the following
+command line:
+ echo scrolltime=60 > /proc/tty/driver/tty3270
+changes the scrolling timeout value to 60 sec. Set scrolltime to 0 if
+you wish to prevent scrolling entirely.
+
+Other things you may do when the log area fills up are: hit PA2 to
+clear the log area and write more output to it, or hit CLEAR to clear
+the log area and the input area and write more output to the log area.
+
+Some of the Program Function (PF) and Program Attention (PA) keys are
+preassigned special functions. The ones that are not yield an alarm
+when pressed.
+
+PA1 causes a SIGINT to the currently running application. You may do
+the same thing from the input area, by typing "^C" and hitting ENTER.
+
+PA2 causes the log area to be cleared. If output awaits, it is then
+written to the log area.
+
+PF3 causes an EOF to be received as input by the application. You may
+cause an EOF also by typing "^D" and hitting ENTER.
+
+No PF key is preassigned to cause a job suspension, but you may cause a
+job suspension by typing "^Z" and hitting ENTER. You may wish to
+assign this function to a PF key. To make PF7 cause job suspension,
+execute the command:
+ echo pf7=^z > /proc/tty/driver/tty3270
+
+If the input you type does not end with the two characters "^n", the
+driver appends a newline character and sends it to the tty driver;
+otherwise the driver strips the "^n" and does not append a newline.
+The IBM 3215 driver behaves similarly.
+
+Pf10 causes the most recent command to be retrieved from the tube's
+command stack (default depth 20) and displayed in the input area. You
+may hit PF10 again for the next-most-recent command, and so on. A
+command is entered into the stack only when the input area is not made
+invisible (such as for password entry) and it is not identical to the
+current top entry. PF10 rotates backward through the command stack;
+PF11 rotates forward. You may assign the backward function to any PF
+key (or PA key, for that matter), say, PA3, with the command:
+ echo -e pa3=\\033k > /proc/tty/driver/tty3270
+This assigns the string ESC-k to PA3. Similarly, the string ESC-j
+performs the forward function. (Rationale: In bash with vi-mode line
+editing, ESC-k and ESC-j retrieve backward and forward history.
+Suggestions welcome.)
+
+Is a stack size of twenty commands not to your liking? Change it on
+the fly. To change to saving the last 100 commands, execute the
+command:
+ echo recallsize=100 > /proc/tty/driver/tty3270
+
+Have a command you issue frequently? Assign it to a PF or PA key! Use
+the command
+ echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270
+to execute the commands mkdir foobar and cd foobar immediately when you
+hit PF24. Want to see the command line first, before you execute it?
+Use the -n option of the echo command:
+ echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270
+
+
+
+Happy testing! I welcome any and all comments about this document, the
+driver, etc etc.
+
+Dick Hitt <rbh00@utsglobal.com>
diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO
new file mode 100644
index 0000000..339207d
--- /dev/null
+++ b/Documentation/s390/CommonIO
@@ -0,0 +1,117 @@
+S/390 common I/O-Layer - command line parameters, procfs and debugfs entries
+============================================================================
+
+Command line parameters
+-----------------------
+
+* ccw_timeout_log
+
+ Enable logging of debug information in case of ccw device timeouts.
+
+* cio_ignore = {all} |
+ {<device> | <range of devices>} |
+ {!<device> | !<range of devices>}
+
+ The given devices will be ignored by the common I/O-layer; no detection
+ and device sensing will be done on any of those devices. The subchannel to
+ which the device in question is attached will be treated as if no device was
+ attached.
+
+ An ignored device can be un-ignored later; see the "/proc entries"-section for
+ details.
+
+ The devices must be given either as bus ids (0.x.abcd) or as hexadecimal
+ device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you
+ give a device number 0xabcd, it will be interpreted as 0.0.abcd.
+
+ You can use the 'all' keyword to ignore all devices.
+ The '!' operator will cause the I/O-layer to _not_ ignore a device.
+ The command line is parsed from left to right.
+
+ For example,
+ cio_ignore=0.0.0023-0.0.0042,0.0.4711
+ will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device
+ 0.0.4711, if detected.
+ As another example,
+ cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02
+ will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02.
+
+ By default, no devices are ignored.
+
+
+/proc entries
+-------------
+
+* /proc/cio_ignore
+
+ Lists the ranges of devices (by bus id) which are ignored by common I/O.
+
+ You can un-ignore certain or all devices by piping to /proc/cio_ignore.
+ "free all" will un-ignore all ignored devices,
+ "free <device range>, <device range>, ..." will un-ignore the specified
+ devices.
+
+ For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored,
+ - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore
+ will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023
+ to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored;
+ - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device
+ 0.0.0041;
+ - echo free all > /proc/cio_ignore will un-ignore all remaining ignored
+ devices.
+
+ When a device is un-ignored, device recognition and sensing is performed and
+ the device driver will be notified if possible, so the device will become
+ available to the system. Note that un-ignoring is performed asynchronously.
+
+ You can also add ranges of devices to be ignored by piping to
+ /proc/cio_ignore; "add <device range>, <device range>, ..." will ignore the
+ specified devices.
+
+ Note: While already known devices can be added to the list of devices to be
+ ignored, there will be no effect on then. However, if such a device
+ disappears and then reappears, it will then be ignored. To make
+ known devices go away, you need the "purge" command (see below).
+
+ For example,
+ "echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore"
+ will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored
+ devices.
+
+ You can remove already known but now ignored devices via
+ "echo purge > /proc/cio_ignore"
+ All devices ignored but still registered and not online (= not in use)
+ will be deregistered and thus removed from the system.
+
+ The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward
+ compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
+ numbers given as 0xabcd will be interpreted as 0.0.abcd.
+
+* For some of the information present in the /proc filesystem in 2.4 (namely,
+ /proc/subchannels and /proc/chpids), see driver-model.txt.
+ Information formerly in /proc/irq_count is now in /proc/interrupts.
+
+
+debugfs entries
+---------------
+
+* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature)
+
+ Some views generated by the debug feature to hold various debug outputs.
+
+ - /sys/kernel/debug/s390dbf/cio_crw/sprintf
+ Messages from the processing of pending channel report words (machine check
+ handling).
+
+ - /sys/kernel/debug/s390dbf/cio_msg/sprintf
+ Various debug messages from the common I/O-layer.
+
+ - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii
+ Logs the calling of functions in the common I/O-layer and, if applicable,
+ which subchannel they were called for, as well as dumps of some data
+ structures (like irb in an error case).
+
+ The level of logging can be changed to be more or less verbose by piping to
+ /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the
+ documentation on the S/390 debug feature (Documentation/s390/s390dbf.txt)
+ for details.
diff --git a/Documentation/s390/DASD b/Documentation/s390/DASD
new file mode 100644
index 0000000..9963f1e
--- /dev/null
+++ b/Documentation/s390/DASD
@@ -0,0 +1,73 @@
+DASD device driver
+
+S/390's disk devices (DASDs) are managed by Linux via the DASD device
+driver. It is valid for all types of DASDs and represents them to
+Linux as block devices, namely "dd". Currently the DASD driver uses a
+single major number (254) and 4 minor numbers per volume (1 for the
+physical volume and 3 for partitions). With respect to partitions see
+below. Thus you may have up to 64 DASD devices in your system.
+
+The kernel parameter 'dasd=from-to,...' may be issued arbitrary times
+in the kernel's parameter line or not at all. The 'from' and 'to'
+parameters are to be given in hexadecimal notation without a leading
+0x.
+If you supply kernel parameters the different instances are processed
+in order of appearance and a minor number is reserved for any device
+covered by the supplied range up to 64 volumes. Additional DASDs are
+ignored. If you do not supply the 'dasd=' kernel parameter at all, the
+DASD driver registers all supported DASDs of your system to a minor
+number in ascending order of the subchannel number.
+
+The driver currently supports ECKD-devices and there are stubs for
+support of the FBA and CKD architectures. For the FBA architecture
+only some smart data structures are missing to make the support
+complete.
+We performed our testing on 3380 and 3390 type disks of different
+sizes, under VM and on the bare hardware (LPAR), using internal disks
+of the multiprise as well as a RAMAC virtual array. Disks exported by
+an Enterprise Storage Server (Seascape) should work fine as well.
+
+We currently implement one partition per volume, which is the whole
+volume, skipping the first blocks up to the volume label. These are
+reserved for IPL records and IBM's volume label to assure
+accessibility of the DASD from other OSs. In a later stage we will
+provide support of partitions, maybe VTOC oriented or using a kind of
+partition table in the label record.
+
+USAGE
+
+-Low-level format (?CKD only)
+For using an ECKD-DASD as a Linux harddisk you have to low-level
+format the tracks by issuing the BLKDASDFORMAT-ioctl on that
+device. This will erase any data on that volume including IBM volume
+labels, VTOCs etc. The ioctl may take a 'struct format_data *' or
+'NULL' as an argument.
+typedef struct {
+ int start_unit;
+ int stop_unit;
+ int blksize;
+} format_data_t;
+When a NULL argument is passed to the BLKDASDFORMAT ioctl the whole
+disk is formatted to a blocksize of 1024 bytes. Otherwise start_unit
+and stop_unit are the first and last track to be formatted. If
+stop_unit is -1 it implies that the DASD is formatted from start_unit
+up to the last track. blksize can be any power of two between 512 and
+4096. We recommend no blksize lower than 1024 because the ext2fs uses
+1kB blocks anyway and you gain approx. 50% of capacity increasing your
+blksize from 512 byte to 1kB.
+
+-Make a filesystem
+Then you can mk??fs the filesystem of your choice on that volume or
+partition. For reasons of sanity you should build your filesystem on
+the partition /dev/dd?1 instead of the whole volume. You only lose 3kB
+but may be sure that you can reuse your data after introduction of a
+real partition table.
+
+BUGS:
+- Performance sometimes is rather low because we don't fully exploit clustering
+
+TODO-List:
+- Add IBM'S Disk layout to genhd
+- Enhance driver to use more than one major number
+- Enable usage as a module
+- Support Cache fast write and DASD fast write (ECKD)
diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt
new file mode 100644
index 0000000..d30a281
--- /dev/null
+++ b/Documentation/s390/Debugging390.txt
@@ -0,0 +1,2535 @@
+
+ Debugging on Linux for s/390 & z/Architecture
+ by
+ Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
+ Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ Best viewed with fixed width fonts
+
+Overview of Document:
+=====================
+This document is intended to give a good overview of how to debug
+Linux for s/390 & z/Architecture. It isn't intended as a complete reference & not a
+tutorial on the fundamentals of C & assembly. It doesn't go into
+390 IO in any detail. It is intended to complement the documents in the
+reference section below & any other worthwhile references you get.
+
+It is intended like the Enterprise Systems Architecture/390 Reference Summary
+to be printed out & used as a quick cheat sheet self help style reference when
+problems occur.
+
+Contents
+========
+Register Set
+Address Spaces on Intel Linux
+Address Spaces on Linux for s/390 & z/Architecture
+The Linux for s/390 & z/Architecture Kernel Task Structure
+Register Usage & Stackframes on Linux for s/390 & z/Architecture
+A sample program with comments
+Compiling programs for debugging on Linux for s/390 & z/Architecture
+Figuring out gcc compile errors
+Debugging Tools
+objdump
+strace
+Performance Debugging
+Debugging under VM
+s/390 & z/Architecture IO Overview
+Debugging IO on s/390 & z/Architecture under VM
+GDB on s/390 & z/Architecture
+Stack chaining in gdb by hand
+Examining core dumps
+ldd
+Debugging modules
+The proc file system
+Starting points for debugging scripting languages etc.
+Dumptool & Lcrash
+SysRq
+References
+Special Thanks
+
+Register Set
+============
+The current architectures have the following registers.
+
+16 General propose registers, 32 bit on s/390 64 bit on z/Architecture, r0-r15 or gpr0-gpr15 used for arithmetic & addressing.
+
+16 Control registers, 32 bit on s/390 64 bit on z/Architecture, ( cr0-cr15 kernel usage only ) used for memory management,
+interrupt control,debugging control etc.
+
+16 Access registers ( ar0-ar15 ) 32 bit on s/390 & z/Architecture
+not used by normal programs but potentially could
+be used as temporary storage. Their main purpose is their 1 to 1
+association with general purpose registers and are used in
+the kernel for copying data between kernel & user address spaces.
+Access register 0 ( & access register 1 on z/Architecture ( needs 64 bit
+pointer ) ) is currently used by the pthread library as a pointer to
+the current running threads private area.
+
+16 64 bit floating point registers (fp0-fp15 ) IEEE & HFP floating
+point format compliant on G5 upwards & a Floating point control reg (FPC)
+4 64 bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines.
+Note:
+Linux (currently) always uses IEEE & emulates G5 IEEE format on older machines,
+( provided the kernel is configured for this ).
+
+
+The PSW is the most important register on the machine it
+is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of
+a program counter (pc), condition code register,memory space designator.
+In IBM standard notation I am counting bit 0 as the MSB.
+It has several advantages over a normal program counter
+in that you can change address translation & program counter
+in a single instruction. To change address translation,
+e.g. switching address translation off requires that you
+have a logical=physical mapping for the address you are
+currently running at.
+
+ Bit Value
+s/390 z/Architecture
+0 0 Reserved ( must be 0 ) otherwise specification exception occurs.
+
+1 1 Program Event Recording 1 PER enabled,
+ PER is used to facilitate debugging e.g. single stepping.
+
+2-4 2-4 Reserved ( must be 0 ).
+
+5 5 Dynamic address translation 1=DAT on.
+
+6 6 Input/Output interrupt Mask
+
+7 7 External interrupt Mask used primarily for interprocessor signalling &
+ clock interrupts.
+
+8-11 8-11 PSW Key used for complex memory protection mechanism not used under linux
+
+12 12 1 on s/390 0 on z/Architecture
+
+13 13 Machine Check Mask 1=enable machine check interrupts
+
+14 14 Wait State set this to 1 to stop the processor except for interrupts & give
+ time to other LPARS used in CPU idle in the kernel to increase overall
+ usage of processor resources.
+
+15 15 Problem state ( if set to 1 certain instructions are disabled )
+ all linux user programs run with this bit 1
+ ( useful info for debugging under VM ).
+
+16-17 16-17 Address Space Control
+
+ 00 Primary Space Mode when DAT on
+ The linux kernel currently runs in this mode, CR1 is affiliated with
+ this mode & points to the primary segment table origin etc.
+
+ 01 Access register mode this mode is used in functions to
+ copy data between kernel & user space.
+
+ 10 Secondary space mode not used in linux however CR7 the
+ register affiliated with this mode is & this & normally
+ CR13=CR7 to allow us to copy data between kernel & user space.
+ We do this as follows:
+ We set ar2 to 0 to designate its
+ affiliated gpr ( gpr2 )to point to primary=kernel space.
+ We set ar4 to 1 to designate its
+ affiliated gpr ( gpr4 ) to point to secondary=home=user space
+ & then essentially do a memcopy(gpr2,gpr4,size) to
+ copy data between the address spaces, the reason we use home space for the
+ kernel & don't keep secondary space free is that code will not run in
+ secondary space.
+
+ 11 Home Space Mode all user programs run in this mode.
+ it is affiliated with CR13.
+
+18-19 18-19 Condition codes (CC)
+
+20 20 Fixed point overflow mask if 1=FPU exceptions for this event
+ occur ( normally 0 )
+
+21 21 Decimal overflow mask if 1=FPU exceptions for this event occur
+ ( normally 0 )
+
+22 22 Exponent underflow mask if 1=FPU exceptions for this event occur
+ ( normally 0 )
+
+23 23 Significance Mask if 1=FPU exceptions for this event occur
+ ( normally 0 )
+
+24-31 24-30 Reserved Must be 0.
+
+ 31 Extended Addressing Mode
+ 32 Basic Addressing Mode
+ Used to set addressing mode
+ PSW 31 PSW 32
+ 0 0 24 bit
+ 0 1 31 bit
+ 1 1 64 bit
+
+32 1=31 bit addressing mode 0=24 bit addressing mode (for backward
+ compatibility), linux always runs with this bit set to 1
+
+33-64 Instruction address.
+ 33-63 Reserved must be 0
+ 64-127 Address
+ In 24 bits mode bits 64-103=0 bits 104-127 Address
+ In 31 bits mode bits 64-96=0 bits 97-127 Address
+ Note: unlike 31 bit mode on s/390 bit 96 must be zero
+ when loading the address with LPSWE otherwise a
+ specification exception occurs, LPSW is fully backward
+ compatible.
+
+
+Prefix Page(s)
+--------------
+This per cpu memory area is too intimately tied to the processor not to mention.
+It exists between the real addresses 0-4096 on s/390 & 0-8192 z/Architecture & is exchanged
+with a 1 page on s/390 or 2 pages on z/Architecture in absolute storage by the set
+prefix instruction in linux'es startup.
+This page is mapped to a different prefix for each processor in an SMP configuration
+( assuming the os designer is sane of course :-) ).
+Bytes 0-512 ( 200 hex ) on s/390 & 0-512,4096-4544,4604-5119 currently on z/Architecture
+are used by the processor itself for holding such information as exception indications &
+entry points for exceptions.
+Bytes after 0xc00 hex are used by linux for per processor globals on s/390 & z/Architecture
+( there is a gap on z/Architecture too currently between 0xc00 & 1000 which linux uses ).
+The closest thing to this on traditional architectures is the interrupt
+vector table. This is a good thing & does simplify some of the kernel coding
+however it means that we now cannot catch stray NULL pointers in the
+kernel without hard coded checks.
+
+
+
+Address Spaces on Intel Linux
+=============================
+
+The traditional Intel Linux is approximately mapped as follows forgive
+the ascii art.
+0xFFFFFFFF 4GB Himem *****************
+ * *
+ * Kernel Space *
+ * *
+ ***************** ****************
+User Space Himem (typically 0xC0000000 3GB )* User Stack * * *
+ ***************** * *
+ * Shared Libs * * Next Process *
+ ***************** * to *
+ * * <== * Run * <==
+ * User Program * * *
+ * Data BSS * * *
+ * Text * * *
+ * Sections * * *
+0x00000000 ***************** ****************
+
+Now it is easy to see that on Intel it is quite easy to recognise a kernel address
+as being one greater than user space himem ( in this case 0xC0000000).
+& addresses of less than this are the ones in the current running program on this
+processor ( if an smp box ).
+If using the virtual machine ( VM ) as a debugger it is quite difficult to
+know which user process is running as the address space you are looking at
+could be from any process in the run queue.
+
+The limitation of Intels addressing technique is that the linux
+kernel uses a very simple real address to virtual addressing technique
+of Real Address=Virtual Address-User Space Himem.
+This means that on Intel the kernel linux can typically only address
+Himem=0xFFFFFFFF-0xC0000000=1GB & this is all the RAM these machines
+can typically use.
+They can lower User Himem to 2GB or lower & thus be
+able to use 2GB of RAM however this shrinks the maximum size
+of User Space from 3GB to 2GB they have a no win limit of 4GB unless
+they go to 64 Bit.
+
+
+On 390 our limitations & strengths make us slightly different.
+For backward compatibility we are only allowed use 31 bits (2GB)
+of our 32 bit addresses, however, we use entirely separate address
+spaces for the user & kernel.
+
+This means we can support 2GB of non Extended RAM on s/390, & more
+with the Extended memory management swap device &
+currently 4TB of physical memory currently on z/Architecture.
+
+
+Address Spaces on Linux for s/390 & z/Architecture
+==================================================
+
+Our addressing scheme is as follows
+
+
+Himem 0x7fffffff 2GB on s/390 ***************** ****************
+currently 0x3ffffffffff (2^42)-1 * User Stack * * *
+on z/Architecture. ***************** * *
+ * Shared Libs * * *
+ ***************** * *
+ * * * Kernel *
+ * User Program * * *
+ * Data BSS * * *
+ * Text * * *
+ * Sections * * *
+0x00000000 ***************** ****************
+
+This also means that we need to look at the PSW problem state bit
+or the addressing mode to decide whether we are looking at
+user or kernel space.
+
+Virtual Addresses on s/390 & z/Architecture
+===========================================
+
+A virtual address on s/390 is made up of 3 parts
+The SX ( segment index, roughly corresponding to the PGD & PMD in linux terminology )
+being bits 1-11.
+The PX ( page index, corresponding to the page table entry (pte) in linux terminology )
+being bits 12-19.
+The remaining bits BX (the byte index are the offset in the page )
+i.e. bits 20 to 31.
+
+On z/Architecture in linux we currently make up an address from 4 parts.
+The region index bits (RX) 0-32 we currently use bits 22-32
+The segment index (SX) being bits 33-43
+The page index (PX) being bits 44-51
+The byte index (BX) being bits 52-63
+
+Notes:
+1) s/390 has no PMD so the PMD is really the PGD also.
+A lot of this stuff is defined in pgtable.h.
+
+2) Also seeing as s/390's page indexes are only 1k in size
+(bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k )
+to make the best use of memory by updating 4 segment indices
+entries each time we mess with a PMD & use offsets
+0,1024,2048 & 3072 in this page as for our segment indexes.
+On z/Architecture our page indexes are now 2k in size
+( bits 12-19 x 8 bytes per pte ) we do a similar trick
+but only mess with 2 segment indices each time we mess with
+a PMD.
+
+3) As z/Architecture supports up to a massive 5-level page table lookup we
+can only use 3 currently on Linux ( as this is all the generic kernel
+currently supports ) however this may change in future
+this allows us to access ( according to my sums )
+4TB of virtual storage per process i.e.
+4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes,
+enough for another 2 or 3 of years I think :-).
+to do this we use a region-third-table designation type in
+our address space control registers.
+
+
+The Linux for s/390 & z/Architecture Kernel Task Structure
+==========================================================
+Each process/thread under Linux for S390 has its own kernel task_struct
+defined in linux/include/linux/sched.h
+The S390 on initialisation & resuming of a process on a cpu sets
+the __LC_KERNEL_STACK variable in the spare prefix area for this cpu
+(which we use for per-processor globals).
+
+The kernel stack pointer is intimately tied with the task structure for
+each processor as follows.
+
+ s/390
+ ************************
+ * 1 page kernel stack *
+ * ( 4K ) *
+ ************************
+ * 1 page task_struct *
+ * ( 4K ) *
+8K aligned ************************
+
+ z/Architecture
+ ************************
+ * 2 page kernel stack *
+ * ( 8K ) *
+ ************************
+ * 2 page task_struct *
+ * ( 8K ) *
+16K aligned ************************
+
+What this means is that we don't need to dedicate any register or global variable
+to point to the current running process & can retrieve it with the following
+very simple construct for s/390 & one very similar for z/Architecture.
+
+static inline struct task_struct * get_current(void)
+{
+ struct task_struct *current;
+ __asm__("lhi %0,-8192\n\t"
+ "nr %0,15"
+ : "=r" (current) );
+ return current;
+}
+
+i.e. just anding the current kernel stack pointer with the mask -8192.
+Thankfully because Linux doesn't have support for nested IO interrupts
+& our devices have large buffers can survive interrupts being shut for
+short amounts of time we don't need a separate stack for interrupts.
+
+
+
+
+Register Usage & Stackframes on Linux for s/390 & z/Architecture
+=================================================================
+Overview:
+---------
+This is the code that gcc produces at the top & the bottom of
+each function. It usually is fairly consistent & similar from
+function to function & if you know its layout you can probably
+make some headway in finding the ultimate cause of a problem
+after a crash without a source level debugger.
+
+Note: To follow stackframes requires a knowledge of C or Pascal &
+limited knowledge of one assembly language.
+
+It should be noted that there are some differences between the
+s/390 & z/Architecture stack layouts as the z/Architecture stack layout didn't have
+to maintain compatibility with older linkage formats.
+
+Glossary:
+---------
+alloca:
+This is a built in compiler function for runtime allocation
+of extra space on the callers stack which is obviously freed
+up on function exit ( e.g. the caller may choose to allocate nothing
+of a buffer of 4k if required for temporary purposes ), it generates
+very efficient code ( a few cycles ) when compared to alternatives
+like malloc.
+
+automatics: These are local variables on the stack,
+i.e they aren't in registers & they aren't static.
+
+back-chain:
+This is a pointer to the stack pointer before entering a
+framed functions ( see frameless function ) prologue got by
+dereferencing the address of the current stack pointer,
+ i.e. got by accessing the 32 bit value at the stack pointers
+current location.
+
+base-pointer:
+This is a pointer to the back of the literal pool which
+is an area just behind each procedure used to store constants
+in each function.
+
+call-clobbered: The caller probably needs to save these registers if there
+is something of value in them, on the stack or elsewhere before making a
+call to another procedure so that it can restore it later.
+
+epilogue:
+The code generated by the compiler to return to the caller.
+
+frameless-function
+A frameless function in Linux for s390 & z/Architecture is one which doesn't
+need more than the register save area ( 96 bytes on s/390, 160 on z/Architecture )
+given to it by the caller.
+A frameless function never:
+1) Sets up a back chain.
+2) Calls alloca.
+3) Calls other normal functions
+4) Has automatics.
+
+GOT-pointer:
+This is a pointer to the global-offset-table in ELF
+( Executable Linkable Format, Linux'es most common executable format ),
+all globals & shared library objects are found using this pointer.
+
+lazy-binding
+ELF shared libraries are typically only loaded when routines in the shared
+library are actually first called at runtime. This is lazy binding.
+
+procedure-linkage-table
+This is a table found from the GOT which contains pointers to routines
+in other shared libraries which can't be called to by easier means.
+
+prologue:
+The code generated by the compiler to set up the stack frame.
+
+outgoing-args:
+This is extra area allocated on the stack of the calling function if the
+parameters for the callee's cannot all be put in registers, the same
+area can be reused by each function the caller calls.
+
+routine-descriptor:
+A COFF executable format based concept of a procedure reference
+actually being 8 bytes or more as opposed to a simple pointer to the routine.
+This is typically defined as follows
+Routine Descriptor offset 0=Pointer to Function
+Routine Descriptor offset 4=Pointer to Table of Contents
+The table of contents/TOC is roughly equivalent to a GOT pointer.
+& it means that shared libraries etc. can be shared between several
+environments each with their own TOC.
+
+
+static-chain: This is used in nested functions a concept adopted from pascal
+by gcc not used in ansi C or C++ ( although quite useful ), basically it
+is a pointer used to reference local variables of enclosing functions.
+You might come across this stuff once or twice in your lifetime.
+
+e.g.
+The function below should return 11 though gcc may get upset & toss warnings
+about unused variables.
+int FunctionA(int a)
+{
+ int b;
+ FunctionC(int c)
+ {
+ b=c+1;
+ }
+ FunctionC(10);
+ return(b);
+}
+
+
+s/390 & z/Architecture Register usage
+=====================================
+r0 used by syscalls/assembly call-clobbered
+r1 used by syscalls/assembly call-clobbered
+r2 argument 0 / return value 0 call-clobbered
+r3 argument 1 / return value 1 (if long long) call-clobbered
+r4 argument 2 call-clobbered
+r5 argument 3 call-clobbered
+r6 argument 4 saved
+r7 pointer-to arguments 5 to ... saved
+r8 this & that saved
+r9 this & that saved
+r10 static-chain ( if nested function ) saved
+r11 frame-pointer ( if function used alloca ) saved
+r12 got-pointer saved
+r13 base-pointer saved
+r14 return-address saved
+r15 stack-pointer saved
+
+f0 argument 0 / return value ( float/double ) call-clobbered
+f2 argument 1 call-clobbered
+f4 z/Architecture argument 2 saved
+f6 z/Architecture argument 3 saved
+The remaining floating points
+f1,f3,f5 f7-f15 are call-clobbered.
+
+Notes:
+------
+1) The only requirement is that registers which are used
+by the callee are saved, e.g. the compiler is perfectly
+capable of using r11 for purposes other than a frame a
+frame pointer if a frame pointer is not needed.
+2) In functions with variable arguments e.g. printf the calling procedure
+is identical to one without variable arguments & the same number of
+parameters. However, the prologue of this function is somewhat more
+hairy owing to it having to move these parameters to the stack to
+get va_start, va_arg & va_end to work.
+3) Access registers are currently unused by gcc but are used in
+the kernel. Possibilities exist to use them at the moment for
+temporary storage but it isn't recommended.
+4) Only 4 of the floating point registers are used for
+parameter passing as older machines such as G3 only have only 4
+& it keeps the stack frame compatible with other compilers.
+However with IEEE floating point emulation under linux on the
+older machines you are free to use the other 12.
+5) A long long or double parameter cannot be have the
+first 4 bytes in a register & the second four bytes in the
+outgoing args area. It must be purely in the outgoing args
+area if crossing this boundary.
+6) Floating point parameters are mixed with outgoing args
+on the outgoing args area in the order the are passed in as parameters.
+7) Floating point arguments 2 & 3 are saved in the outgoing args area for
+z/Architecture
+
+
+Stack Frame Layout
+------------------
+s/390 z/Architecture
+0 0 back chain ( a 0 here signifies end of back chain )
+4 8 eos ( end of stack, not used on Linux for S390 used in other linkage formats )
+8 16 glue used in other s/390 linkage formats for saved routine descriptors etc.
+12 24 glue used in other s/390 linkage formats for saved routine descriptors etc.
+16 32 scratch area
+20 40 scratch area
+24 48 saved r6 of caller function
+28 56 saved r7 of caller function
+32 64 saved r8 of caller function
+36 72 saved r9 of caller function
+40 80 saved r10 of caller function
+44 88 saved r11 of caller function
+48 96 saved r12 of caller function
+52 104 saved r13 of caller function
+56 112 saved r14 of caller function
+60 120 saved r15 of caller function
+64 128 saved f4 of caller function
+72 132 saved f6 of caller function
+80 undefined
+96 160 outgoing args passed from caller to callee
+96+x 160+x possible stack alignment ( 8 bytes desirable )
+96+x+y 160+x+y alloca space of caller ( if used )
+96+x+y+z 160+x+y+z automatics of caller ( if used )
+0 back-chain
+
+A sample program with comments.
+===============================
+
+Comments on the function test
+-----------------------------
+1) It didn't need to set up a pointer to the constant pool gpr13 as it isn't used
+( :-( ).
+2) This is a frameless function & no stack is bought.
+3) The compiler was clever enough to recognise that it could return the
+value in r2 as well as use it for the passed in parameter ( :-) ).
+4) The basr ( branch relative & save ) trick works as follows the instruction
+has a special case with r0,r0 with some instruction operands is understood as
+the literal value 0, some risc architectures also do this ). So now
+we are branching to the next address & the address new program counter is
+in r13,so now we subtract the size of the function prologue we have executed
++ the size of the literal pool to get to the top of the literal pool
+0040037c int test(int b)
+{ # Function prologue below
+ 40037c: 90 de f0 34 stm %r13,%r14,52(%r15) # Save registers r13 & r14
+ 400380: 0d d0 basr %r13,%r0 # Set up pointer to constant pool using
+ 400382: a7 da ff fa ahi %r13,-6 # basr trick
+ return(5+b);
+ # Huge main program
+ 400386: a7 2a 00 05 ahi %r2,5 # add 5 to r2
+
+ # Function epilogue below
+ 40038a: 98 de f0 34 lm %r13,%r14,52(%r15) # restore registers r13 & 14
+ 40038e: 07 fe br %r14 # return
+}
+
+Comments on the function main
+-----------------------------
+1) The compiler did this function optimally ( 8-) )
+
+Literal pool for main.
+400390: ff ff ff ec .long 0xffffffec
+main(int argc,char *argv[])
+{ # Function prologue below
+ 400394: 90 bf f0 2c stm %r11,%r15,44(%r15) # Save necessary registers
+ 400398: 18 0f lr %r0,%r15 # copy stack pointer to r0
+ 40039a: a7 fa ff a0 ahi %r15,-96 # Make area for callee saving
+ 40039e: 0d d0 basr %r13,%r0 # Set up r13 to point to
+ 4003a0: a7 da ff f0 ahi %r13,-16 # literal pool
+ 4003a4: 50 00 f0 00 st %r0,0(%r15) # Save backchain
+
+ return(test(5)); # Main Program Below
+ 4003a8: 58 e0 d0 00 l %r14,0(%r13) # load relative address of test from
+ # literal pool
+ 4003ac: a7 28 00 05 lhi %r2,5 # Set first parameter to 5
+ 4003b0: 4d ee d0 00 bas %r14,0(%r14,%r13) # jump to test setting r14 as return
+ # address using branch & save instruction.
+
+ # Function Epilogue below
+ 4003b4: 98 bf f0 8c lm %r11,%r15,140(%r15)# Restore necessary registers.
+ 4003b8: 07 fe br %r14 # return to do program exit
+}
+
+
+Compiler updates
+----------------
+
+main(int argc,char *argv[])
+{
+ 4004fc: 90 7f f0 1c stm %r7,%r15,28(%r15)
+ 400500: a7 d5 00 04 bras %r13,400508 <main+0xc>
+ 400504: 00 40 04 f4 .long 0x004004f4
+ # compiler now puts constant pool in code to so it saves an instruction
+ 400508: 18 0f lr %r0,%r15
+ 40050a: a7 fa ff a0 ahi %r15,-96
+ 40050e: 50 00 f0 00 st %r0,0(%r15)
+ return(test(5));
+ 400512: 58 10 d0 00 l %r1,0(%r13)
+ 400516: a7 28 00 05 lhi %r2,5
+ 40051a: 0d e1 basr %r14,%r1
+ # compiler adds 1 extra instruction to epilogue this is done to
+ # avoid processor pipeline stalls owing to data dependencies on g5 &
+ # above as register 14 in the old code was needed directly after being loaded
+ # by the lm %r11,%r15,140(%r15) for the br %14.
+ 40051c: 58 40 f0 98 l %r4,152(%r15)
+ 400520: 98 7f f0 7c lm %r7,%r15,124(%r15)
+ 400524: 07 f4 br %r4
+}
+
+
+Hartmut ( our compiler developer ) also has been threatening to take out the
+stack backchain in optimised code as this also causes pipeline stalls, you
+have been warned.
+
+64 bit z/Architecture code disassembly
+--------------------------------------
+
+If you understand the stuff above you'll understand the stuff
+below too so I'll avoid repeating myself & just say that
+some of the instructions have g's on the end of them to indicate
+they are 64 bit & the stack offsets are a bigger,
+the only other difference you'll find between 32 & 64 bit is that
+we now use f4 & f6 for floating point arguments on 64 bit.
+00000000800005b0 <test>:
+int test(int b)
+{
+ return(5+b);
+ 800005b0: a7 2a 00 05 ahi %r2,5
+ 800005b4: b9 14 00 22 lgfr %r2,%r2 # downcast to integer
+ 800005b8: 07 fe br %r14
+ 800005ba: 07 07 bcr 0,%r7
+
+
+}
+
+00000000800005bc <main>:
+main(int argc,char *argv[])
+{
+ 800005bc: eb bf f0 58 00 24 stmg %r11,%r15,88(%r15)
+ 800005c2: b9 04 00 1f lgr %r1,%r15
+ 800005c6: a7 fb ff 60 aghi %r15,-160
+ 800005ca: e3 10 f0 00 00 24 stg %r1,0(%r15)
+ return(test(5));
+ 800005d0: a7 29 00 05 lghi %r2,5
+ # brasl allows jumps > 64k & is overkill here bras would do fune
+ 800005d4: c0 e5 ff ff ff ee brasl %r14,800005b0 <test>
+ 800005da: e3 40 f1 10 00 04 lg %r4,272(%r15)
+ 800005e0: eb bf f0 f8 00 04 lmg %r11,%r15,248(%r15)
+ 800005e6: 07 f4 br %r4
+}
+
+
+
+Compiling programs for debugging on Linux for s/390 & z/Architecture
+====================================================================
+-gdwarf-2 now works it should be considered the default debugging
+format for s/390 & z/Architecture as it is more reliable for debugging
+shared libraries, normal -g debugging works much better now
+Thanks to the IBM java compiler developers bug reports.
+
+This is typically done adding/appending the flags -g or -gdwarf-2 to the
+CFLAGS & LDFLAGS variables Makefile of the program concerned.
+
+If using gdb & you would like accurate displays of registers &
+ stack traces compile without optimisation i.e make sure
+that there is no -O2 or similar on the CFLAGS line of the Makefile &
+the emitted gcc commands, obviously this will produce worse code
+( not advisable for shipment ) but it is an aid to the debugging process.
+
+This aids debugging because the compiler will copy parameters passed in
+in registers onto the stack so backtracing & looking at passed in
+parameters will work, however some larger programs which use inline functions
+will not compile without optimisation.
+
+Debugging with optimisation has since much improved after fixing
+some bugs, please make sure you are using gdb-5.0 or later developed
+after Nov'2000.
+
+Figuring out gcc compile errors
+===============================
+If you are getting a lot of syntax errors compiling a program & the problem
+isn't blatantly obvious from the source.
+It often helps to just preprocess the file, this is done with the -E
+option in gcc.
+What this does is that it runs through the very first phase of compilation
+( compilation in gcc is done in several stages & gcc calls many programs to
+achieve its end result ) with the -E option gcc just calls the gcc preprocessor (cpp).
+The c preprocessor does the following, it joins all the files #included together
+recursively ( #include files can #include other files ) & also the c file you wish to compile.
+It puts a fully qualified path of the #included files in a comment & it
+does macro expansion.
+This is useful for debugging because
+1) You can double check whether the files you expect to be included are the ones
+that are being included ( e.g. double check that you aren't going to the i386 asm directory ).
+2) Check that macro definitions aren't clashing with typedefs,
+3) Check that definitions aren't being used before they are being included.
+4) Helps put the line emitting the error under the microscope if it contains macros.
+
+For convenience the Linux kernel's makefile will do preprocessing automatically for you
+by suffixing the file you want built with .i ( instead of .o )
+
+e.g.
+from the linux directory type
+make arch/s390/kernel/signal.i
+this will build
+
+s390-gcc -D__KERNEL__ -I/home1/barrow/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer
+-fno-strict-aliasing -D__SMP__ -pipe -fno-strength-reduce -E arch/s390/kernel/signal.c
+> arch/s390/kernel/signal.i
+
+Now look at signal.i you should see something like.
+
+
+# 1 "/home1/barrow/linux/include/asm/types.h" 1
+typedef unsigned short umode_t;
+typedef __signed__ char __s8;
+typedef unsigned char __u8;
+typedef __signed__ short __s16;
+typedef unsigned short __u16;
+
+If instead you are getting errors further down e.g.
+unknown instruction:2515 "move.l" or better still unknown instruction:2515
+"Fixme not implemented yet, call Martin" you are probably are attempting to compile some code
+meant for another architecture or code that is simply not implemented, with a fixme statement
+stuck into the inline assembly code so that the author of the file now knows he has work to do.
+To look at the assembly emitted by gcc just before it is about to call gas ( the gnu assembler )
+use the -S option.
+Again for your convenience the Linux kernel's Makefile will hold your hand &
+do all this donkey work for you also by building the file with the .s suffix.
+e.g.
+from the Linux directory type
+make arch/s390/kernel/signal.s
+
+s390-gcc -D__KERNEL__ -I/home1/barrow/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer
+-fno-strict-aliasing -D__SMP__ -pipe -fno-strength-reduce -S arch/s390/kernel/signal.c
+-o arch/s390/kernel/signal.s
+
+
+This will output something like, ( please note the constant pool & the useful comments
+in the prologue to give you a hand at interpreting it ).
+
+.LC54:
+ .string "misaligned (__u16 *) in __xchg\n"
+.LC57:
+ .string "misaligned (__u32 *) in __xchg\n"
+.L$PG1: # Pool sys_sigsuspend
+.LC192:
+ .long -262401
+.LC193:
+ .long -1
+.LC194:
+ .long schedule-.L$PG1
+.LC195:
+ .long do_signal-.L$PG1
+ .align 4
+.globl sys_sigsuspend
+ .type sys_sigsuspend,@function
+sys_sigsuspend:
+# leaf function 0
+# automatics 16
+# outgoing args 0
+# need frame pointer 0
+# call alloca 0
+# has varargs 0
+# incoming args (stack) 0
+# function length 168
+ STM 8,15,32(15)
+ LR 0,15
+ AHI 15,-112
+ BASR 13,0
+.L$CO1: AHI 13,.L$PG1-.L$CO1
+ ST 0,0(15)
+ LR 8,2
+ N 5,.LC192-.L$PG1(13)
+
+Adding -g to the above output makes the output even more useful
+e.g. typing
+make CC:="s390-gcc -g" kernel/sched.s
+
+which compiles.
+s390-gcc -g -D__KERNEL__ -I/home/barrow/linux-2.3/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strict-aliasing -pipe -fno-strength-reduce -S kernel/sched.c -o kernel/sched.s
+
+also outputs stabs ( debugger ) info, from this info you can find out the
+offsets & sizes of various elements in structures.
+e.g. the stab for the structure
+struct rlimit {
+ unsigned long rlim_cur;
+ unsigned long rlim_max;
+};
+is
+.stabs "rlimit:T(151,2)=s8rlim_cur:(0,5),0,32;rlim_max:(0,5),32,32;;",128,0,0,0
+from this stab you can see that
+rlimit_cur starts at bit offset 0 & is 32 bits in size
+rlimit_max starts at bit offset 32 & is 32 bits in size.
+
+
+Debugging Tools:
+================
+
+objdump
+=======
+This is a tool with many options the most useful being ( if compiled with -g).
+objdump --source <victim program or object file> > <victims debug listing >
+
+
+The whole kernel can be compiled like this ( Doing this will make a 17MB kernel
+& a 200 MB listing ) however you have to strip it before building the image
+using the strip command to make it a more reasonable size to boot it.
+
+A source/assembly mixed dump of the kernel can be done with the line
+objdump --source vmlinux > vmlinux.lst
+Also, if the file isn't compiled -g, this will output as much debugging information
+as it can (e.g. function names). This is very slow as it spends lots
+of time searching for debugging info. The following self explanatory line should be used
+instead if the code isn't compiled -g, as it is much faster:
+objdump --disassemble-all --syms vmlinux > vmlinux.lst
+
+As hard drive space is valuable most of us use the following approach.
+1) Look at the emitted psw on the console to find the crash address in the kernel.
+2) Look at the file System.map ( in the linux directory ) produced when building
+the kernel to find the closest address less than the current PSW to find the
+offending function.
+3) use grep or similar to search the source tree looking for the source file
+ with this function if you don't know where it is.
+4) rebuild this object file with -g on, as an example suppose the file was
+( /arch/s390/kernel/signal.o )
+5) Assuming the file with the erroneous function is signal.c Move to the base of the
+Linux source tree.
+6) rm /arch/s390/kernel/signal.o
+7) make /arch/s390/kernel/signal.o
+8) watch the gcc command line emitted
+9) type it in again or alternatively cut & paste it on the console adding the -g option.
+10) objdump --source arch/s390/kernel/signal.o > signal.lst
+This will output the source & the assembly intermixed, as the snippet below shows
+This will unfortunately output addresses which aren't the same
+as the kernel ones you should be able to get around the mental arithmetic
+by playing with the --adjust-vma parameter to objdump.
+
+
+
+
+static inline void spin_lock(spinlock_t *lp)
+{
+ a0: 18 34 lr %r3,%r4
+ a2: a7 3a 03 bc ahi %r3,956
+ __asm__ __volatile(" lhi 1,-1\n"
+ a6: a7 18 ff ff lhi %r1,-1
+ aa: 1f 00 slr %r0,%r0
+ ac: ba 01 30 00 cs %r0,%r1,0(%r3)
+ b0: a7 44 ff fd jm aa <sys_sigsuspend+0x2e>
+ saveset = current->blocked;
+ b4: d2 07 f0 68 mvc 104(8,%r15),972(%r4)
+ b8: 43 cc
+ return (set->sig[0] & mask) != 0;
+}
+
+6) If debugging under VM go down to that section in the document for more info.
+
+
+I now have a tool which takes the pain out of --adjust-vma
+& you are able to do something like
+make /arch/s390/kernel/traps.lst
+& it automatically generates the correctly relocated entries for
+the text segment in traps.lst.
+This tool is now standard in linux distro's in scripts/makelst
+
+strace:
+-------
+Q. What is it ?
+A. It is a tool for intercepting calls to the kernel & logging them
+to a file & on the screen.
+
+Q. What use is it ?
+A. You can use it to find out what files a particular program opens.
+
+
+
+Example 1
+---------
+If you wanted to know does ping work but didn't have the source
+strace ping -c 1 127.0.0.1
+& then look at the man pages for each of the syscalls below,
+( In fact this is sometimes easier than looking at some spaghetti
+source which conditionally compiles for several architectures ).
+Not everything that it throws out needs to make sense immediately.
+
+Just looking quickly you can see that it is making up a RAW socket
+for the ICMP protocol.
+Doing an alarm(10) for a 10 second timeout
+& doing a gettimeofday call before & after each read to see
+how long the replies took, & writing some text to stdout so the user
+has an idea what is going on.
+
+socket(PF_INET, SOCK_RAW, IPPROTO_ICMP) = 3
+getuid() = 0
+setuid(0) = 0
+stat("/usr/share/locale/C/libc.cat", 0xbffff134) = -1 ENOENT (No such file or directory)
+stat("/usr/share/locale/libc/C", 0xbffff134) = -1 ENOENT (No such file or directory)
+stat("/usr/local/share/locale/C/libc.cat", 0xbffff134) = -1 ENOENT (No such file or directory)
+getpid() = 353
+setsockopt(3, SOL_SOCKET, SO_BROADCAST, [1], 4) = 0
+setsockopt(3, SOL_SOCKET, SO_RCVBUF, [49152], 4) = 0
+fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(3, 1), ...}) = 0
+mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x40008000
+ioctl(1, TCGETS, {B9600 opost isig icanon echo ...}) = 0
+write(1, "PING 127.0.0.1 (127.0.0.1): 56 d"..., 42PING 127.0.0.1 (127.0.0.1): 56 data bytes
+) = 42
+sigaction(SIGINT, {0x8049ba0, [], SA_RESTART}, {SIG_DFL}) = 0
+sigaction(SIGALRM, {0x8049600, [], SA_RESTART}, {SIG_DFL}) = 0
+gettimeofday({948904719, 138951}, NULL) = 0
+sendto(3, "\10\0D\201a\1\0\0\17#\2178\307\36"..., 64, 0, {sin_family=AF_INET,
+sin_port=htons(0), sin_addr=inet_addr("127.0.0.1")}, 16) = 64
+sigaction(SIGALRM, {0x8049600, [], SA_RESTART}, {0x8049600, [], SA_RESTART}) = 0
+sigaction(SIGALRM, {0x8049ba0, [], SA_RESTART}, {0x8049600, [], SA_RESTART}) = 0
+alarm(10) = 0
+recvfrom(3, "E\0\0T\0005\0\0@\1|r\177\0\0\1\177"..., 192, 0,
+{sin_family=AF_INET, sin_port=htons(50882), sin_addr=inet_addr("127.0.0.1")}, [16]) = 84
+gettimeofday({948904719, 160224}, NULL) = 0
+recvfrom(3, "E\0\0T\0006\0\0\377\1\275p\177\0"..., 192, 0,
+{sin_family=AF_INET, sin_port=htons(50882), sin_addr=inet_addr("127.0.0.1")}, [16]) = 84
+gettimeofday({948904719, 166952}, NULL) = 0
+write(1, "64 bytes from 127.0.0.1: icmp_se"...,
+5764 bytes from 127.0.0.1: icmp_seq=0 ttl=255 time=28.0 ms
+
+Example 2
+---------
+strace passwd 2>&1 | grep open
+produces the following output
+open("/etc/ld.so.cache", O_RDONLY) = 3
+open("/opt/kde/lib/libc.so.5", O_RDONLY) = -1 ENOENT (No such file or directory)
+open("/lib/libc.so.5", O_RDONLY) = 3
+open("/dev", O_RDONLY) = 3
+open("/var/run/utmp", O_RDONLY) = 3
+open("/etc/passwd", O_RDONLY) = 3
+open("/etc/shadow", O_RDONLY) = 3
+open("/etc/login.defs", O_RDONLY) = 4
+open("/dev/tty", O_RDONLY) = 4
+
+The 2>&1 is done to redirect stderr to stdout & grep is then filtering this input
+through the pipe for each line containing the string open.
+
+
+Example 3
+---------
+Getting sophisticated
+telnetd crashes & I don't know why
+
+Steps
+-----
+1) Replace the following line in /etc/inetd.conf
+telnet stream tcp nowait root /usr/sbin/in.telnetd -h
+with
+telnet stream tcp nowait root /blah
+
+2) Create the file /blah with the following contents to start tracing telnetd
+#!/bin/bash
+/usr/bin/strace -o/t1 -f /usr/sbin/in.telnetd -h
+3) chmod 700 /blah to make it executable only to root
+4)
+killall -HUP inetd
+or ps aux | grep inetd
+get inetd's process id
+& kill -HUP inetd to restart it.
+
+Important options
+-----------------
+-o is used to tell strace to output to a file in our case t1 in the root directory
+-f is to follow children i.e.
+e.g in our case above telnetd will start the login process & subsequently a shell like bash.
+You will be able to tell which is which from the process ID's listed on the left hand side
+of the strace output.
+-p<pid> will tell strace to attach to a running process, yup this can be done provided
+ it isn't being traced or debugged already & you have enough privileges,
+the reason 2 processes cannot trace or debug the same program is that strace
+becomes the parent process of the one being debugged & processes ( unlike people )
+can have only one parent.
+
+
+However the file /t1 will get big quite quickly
+to test it telnet 127.0.0.1
+
+now look at what files in.telnetd execve'd
+413 execve("/usr/sbin/in.telnetd", ["/usr/sbin/in.telnetd", "-h"], [/* 17 vars */]) = 0
+414 execve("/bin/login", ["/bin/login", "-h", "localhost", "-p"], [/* 2 vars */]) = 0
+
+Whey it worked!.
+
+
+Other hints:
+------------
+If the program is not very interactive ( i.e. not much keyboard input )
+& is crashing in one architecture but not in another you can do
+an strace of both programs under as identical a scenario as you can
+on both architectures outputting to a file then.
+do a diff of the two traces using the diff program
+i.e.
+diff output1 output2
+& maybe you'll be able to see where the call paths differed, this
+is possibly near the cause of the crash.
+
+More info
+---------
+Look at man pages for strace & the various syscalls
+e.g. man strace, man alarm, man socket.
+
+
+Performance Debugging
+=====================
+gcc is capable of compiling in profiling code just add the -p option
+to the CFLAGS, this obviously affects program size & performance.
+This can be used by the gprof gnu profiling tool or the
+gcov the gnu code coverage tool ( code coverage is a means of testing
+code quality by checking if all the code in an executable in exercised by
+a tester ).
+
+
+Using top to find out where processes are sleeping in the kernel
+----------------------------------------------------------------
+To do this copy the System.map from the root directory where
+the linux kernel was built to the /boot directory on your
+linux machine.
+Start top
+Now type fU<return>
+You should see a new field called WCHAN which
+tells you where each process is sleeping here is a typical output.
+
+ 6:59pm up 41 min, 1 user, load average: 0.00, 0.00, 0.00
+28 processes: 27 sleeping, 1 running, 0 zombie, 0 stopped
+CPU states: 0.0% user, 0.1% system, 0.0% nice, 99.8% idle
+Mem: 254900K av, 45976K used, 208924K free, 0K shrd, 28636K buff
+Swap: 0K av, 0K used, 0K free 8620K cached
+
+ PID USER PRI NI SIZE RSS SHARE WCHAN STAT LIB %CPU %MEM TIME COMMAND
+ 750 root 12 0 848 848 700 do_select S 0 0.1 0.3 0:00 in.telnetd
+ 767 root 16 0 1140 1140 964 R 0 0.1 0.4 0:00 top
+ 1 root 8 0 212 212 180 do_select S 0 0.0 0.0 0:00 init
+ 2 root 9 0 0 0 0 down_inte SW 0 0.0 0.0 0:00 kmcheck
+
+The time command
+----------------
+Another related command is the time command which gives you an indication
+of where a process is spending the majority of its time.
+e.g.
+time ping -c 5 nc
+outputs
+real 0m4.054s
+user 0m0.010s
+sys 0m0.010s
+
+Debugging under VM
+==================
+
+Notes
+-----
+Addresses & values in the VM debugger are always hex never decimal
+Address ranges are of the format <HexValue1>-<HexValue2> or <HexValue1>.<HexValue2>
+e.g. The address range 0x2000 to 0x3000 can be described as 2000-3000 or 2000.1000
+
+The VM Debugger is case insensitive.
+
+VM's strengths are usually other debuggers weaknesses you can get at any resource
+no matter how sensitive e.g. memory management resources,change address translation
+in the PSW. For kernel hacking you will reap dividends if you get good at it.
+
+The VM Debugger displays operators but not operands, probably because some
+of it was written when memory was expensive & the programmer was probably proud that
+it fitted into 2k of memory & the programmers & didn't want to shock hardcore VM'ers by
+changing the interface :-), also the debugger displays useful information on the same line &
+the author of the code probably felt that it was a good idea not to go over
+the 80 columns on the screen.
+
+As some of you are probably in a panic now this isn't as unintuitive as it may seem
+as the 390 instructions are easy to decode mentally & you can make a good guess at a lot
+of them as all the operands are nibble ( half byte aligned ) & if you have an objdump listing
+also it is quite easy to follow, if you don't have an objdump listing keep a copy of
+the s/390 Reference Summary & look at between pages 2 & 7 or alternatively the
+s/390 principles of operation.
+e.g. even I can guess that
+0001AFF8' LR 180F CC 0
+is a ( load register ) lr r0,r15
+
+Also it is very easy to tell the length of a 390 instruction from the 2 most significant
+bits in the instruction ( not that this info is really useful except if you are trying to
+make sense of a hexdump of code ).
+Here is a table
+Bits Instruction Length
+------------------------------------------
+00 2 Bytes
+01 4 Bytes
+10 4 Bytes
+11 6 Bytes
+
+
+
+
+The debugger also displays other useful info on the same line such as the
+addresses being operated on destination addresses of branches & condition codes.
+e.g.
+00019736' AHI A7DAFF0E CC 1
+000198BA' BRC A7840004 -> 000198C2' CC 0
+000198CE' STM 900EF068 >> 0FA95E78 CC 2
+
+
+
+Useful VM debugger commands
+---------------------------
+
+I suppose I'd better mention this before I start
+to list the current active traces do
+Q TR
+there can be a maximum of 255 of these per set
+( more about trace sets later ).
+To stop traces issue a
+TR END.
+To delete a particular breakpoint issue
+TR DEL <breakpoint number>
+
+The PA1 key drops to CP mode so you can issue debugger commands,
+Doing alt c (on my 3270 console at least ) clears the screen.
+hitting b <enter> comes back to the running operating system
+from cp mode ( in our case linux ).
+It is typically useful to add shortcuts to your profile.exec file
+if you have one ( this is roughly equivalent to autoexec.bat in DOS ).
+file here are a few from mine.
+/* this gives me command history on issuing f12 */
+set pf12 retrieve
+/* this continues */
+set pf8 imm b
+/* goes to trace set a */
+set pf1 imm tr goto a
+/* goes to trace set b */
+set pf2 imm tr goto b
+/* goes to trace set c */
+set pf3 imm tr goto c
+
+
+
+Instruction Tracing
+-------------------
+Setting a simple breakpoint
+TR I PSWA <address>
+To debug a particular function try
+TR I R <function address range>
+TR I on its own will single step.
+TR I DATA <MNEMONIC> <OPTIONAL RANGE> will trace for particular mnemonics
+e.g.
+TR I DATA 4D R 0197BC.4000
+will trace for BAS'es ( opcode 4D ) in the range 0197BC.4000
+if you were inclined you could add traces for all branch instructions &
+suffix them with the run prefix so you would have a backtrace on screen
+when a program crashes.
+TR BR <INTO OR FROM> will trace branches into or out of an address.
+e.g.
+TR BR INTO 0 is often quite useful if a program is getting awkward & deciding
+to branch to 0 & crashing as this will stop at the address before in jumps to 0.
+TR I R <address range> RUN cmd d g
+single steps a range of addresses but stays running &
+displays the gprs on each step.
+
+
+
+Displaying & modifying Registers
+--------------------------------
+D G will display all the gprs
+Adding a extra G to all the commands is necessary to access the full 64 bit
+content in VM on z/Architecture obviously this isn't required for access registers
+as these are still 32 bit.
+e.g. DGG instead of DG
+D X will display all the control registers
+D AR will display all the access registers
+D AR4-7 will display access registers 4 to 7
+CPU ALL D G will display the GRPS of all CPUS in the configuration
+D PSW will display the current PSW
+st PSW 2000 will put the value 2000 into the PSW &
+cause crash your machine.
+D PREFIX displays the prefix offset
+
+
+Displaying Memory
+-----------------
+To display memory mapped using the current PSW's mapping try
+D <range>
+To make VM display a message each time it hits a particular address & continue try
+D I<range> will disassemble/display a range of instructions.
+ST addr 32 bit word will store a 32 bit aligned address
+D T<range> will display the EBCDIC in an address ( if you are that way inclined )
+D R<range> will display real addresses ( without DAT ) but with prefixing.
+There are other complex options to display if you need to get at say home space
+but are in primary space the easiest thing to do is to temporarily
+modify the PSW to the other addressing mode, display the stuff & then
+restore it.
+
+
+
+Hints
+-----
+If you want to issue a debugger command without halting your virtual machine with the
+PA1 key try prefixing the command with #CP e.g.
+#cp tr i pswa 2000
+also suffixing most debugger commands with RUN will cause them not
+to stop just display the mnemonic at the current instruction on the console.
+If you have several breakpoints you want to put into your program &
+you get fed up of cross referencing with System.map
+you can do the following trick for several symbols.
+grep do_signal System.map
+which emits the following among other things
+0001f4e0 T do_signal
+now you can do
+
+TR I PSWA 0001f4e0 cmd msg * do_signal
+This sends a message to your own console each time do_signal is entered.
+( As an aside I wrote a perl script once which automatically generated a REXX
+script with breakpoints on every kernel procedure, this isn't a good idea
+because there are thousands of these routines & VM can only set 255 breakpoints
+at a time so you nearly had to spend as long pruning the file down as you would
+entering the msg's by hand ),however, the trick might be useful for a single object file.
+On linux'es 3270 emulator x3270 there is a very useful option under the file ment
+Save Screens In File this is very good of keeping a copy of traces.
+
+From CMS help <command name> will give you online help on a particular command.
+e.g.
+HELP DISPLAY
+
+Also CP has a file called profile.exec which automatically gets called
+on startup of CMS ( like autoexec.bat ), keeping on a DOS analogy session
+CP has a feature similar to doskey, it may be useful for you to
+use profile.exec to define some keystrokes.
+e.g.
+SET PF9 IMM B
+This does a single step in VM on pressing F8.
+SET PF10 ^
+This sets up the ^ key.
+which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed directly into some 3270 consoles.
+SET PF11 ^-
+This types the starting keystrokes for a sysrq see SysRq below.
+SET PF12 RETRIEVE
+This retrieves command history on pressing F12.
+
+
+Sometimes in VM the display is set up to scroll automatically this
+can be very annoying if there are messages you wish to look at
+to stop this do
+TERM MORE 255 255
+This will nearly stop automatic screen updates, however it will
+cause a denial of service if lots of messages go to the 3270 console,
+so it would be foolish to use this as the default on a production machine.
+
+
+Tracing particular processes
+----------------------------
+The kernel's text segment is intentionally at an address in memory that it will
+very seldom collide with text segments of user programs ( thanks Martin ),
+this simplifies debugging the kernel.
+However it is quite common for user processes to have addresses which collide
+this can make debugging a particular process under VM painful under normal
+circumstances as the process may change when doing a
+TR I R <address range>.
+Thankfully after reading VM's online help I figured out how to debug
+I particular process.
+
+Your first problem is to find the STD ( segment table designation )
+of the program you wish to debug.
+There are several ways you can do this here are a few
+1) objdump --syms <program to be debugged> | grep main
+To get the address of main in the program.
+tr i pswa <address of main>
+Start the program, if VM drops to CP on what looks like the entry
+point of the main function this is most likely the process you wish to debug.
+Now do a D X13 or D XG13 on z/Architecture.
+On 31 bit the STD is bits 1-19 ( the STO segment table origin )
+& 25-31 ( the STL segment table length ) of CR13.
+now type
+TR I R STD <CR13's value> 0.7fffffff
+e.g.
+TR I R STD 8F32E1FF 0.7fffffff
+Another very useful variation is
+TR STORE INTO STD <CR13's value> <address range>
+for finding out when a particular variable changes.
+
+An alternative way of finding the STD of a currently running process
+is to do the following, ( this method is more complex but
+could be quite convenient if you aren't updating the kernel much &
+so your kernel structures will stay constant for a reasonable period of
+time ).
+
+grep task /proc/<pid>/status
+from this you should see something like
+task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68
+This now gives you a pointer to the task structure.
+Now make CC:="s390-gcc -g" kernel/sched.s
+To get the task_struct stabinfo.
+( task_struct is defined in include/linux/sched.h ).
+Now we want to look at
+task->active_mm->pgd
+on my machine the active_mm in the task structure stab is
+active_mm:(4,12),672,32
+its offset is 672/8=84=0x54
+the pgd member in the mm_struct stab is
+pgd:(4,6)=*(29,5),96,32
+so its offset is 96/8=12=0xc
+
+so we'll
+hexdump -s 0xf160054 /dev/mem | more
+i.e. task_struct+active_mm offset
+to look at the active_mm member
+f160054 0fee cc60 0019 e334 0000 0000 0000 0011
+hexdump -s 0x0feecc6c /dev/mem | more
+i.e. active_mm+pgd offset
+feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010
+we get something like
+now do
+TR I R STD <pgd|0x7f> 0.7fffffff
+i.e. the 0x7f is added because the pgd only
+gives the page table origin & we need to set the low bits
+to the maximum possible segment table length.
+TR I R STD 0f2c007f 0.7fffffff
+on z/Architecture you'll probably need to do
+TR I R STD <pgd|0x7> 0.ffffffffffffffff
+to set the TableType to 0x1 & the Table length to 3.
+
+
+
+Tracing Program Exceptions
+--------------------------
+If you get a crash which says something like
+illegal operation or specification exception followed by a register dump
+You can restart linux & trace these using the tr prog <range or value> trace option.
+
+
+
+The most common ones you will normally be tracing for is
+1=operation exception
+2=privileged operation exception
+4=protection exception
+5=addressing exception
+6=specification exception
+10=segment translation exception
+11=page translation exception
+
+The full list of these is on page 22 of the current s/390 Reference Summary.
+e.g.
+tr prog 10 will trace segment translation exceptions.
+tr prog on its own will trace all program interruption codes.
+
+Trace Sets
+----------
+On starting VM you are initially in the INITIAL trace set.
+You can do a Q TR to verify this.
+If you have a complex tracing situation where you wish to wait for instance
+till a driver is open before you start tracing IO, but know in your
+heart that you are going to have to make several runs through the code till you
+have a clue whats going on.
+
+What you can do is
+TR I PSWA <Driver open address>
+hit b to continue till breakpoint
+reach the breakpoint
+now do your
+TR GOTO B
+TR IO 7c08-7c09 inst int run
+or whatever the IO channels you wish to trace are & hit b
+
+To got back to the initial trace set do
+TR GOTO INITIAL
+& the TR I PSWA <Driver open address> will be the only active breakpoint again.
+
+
+Tracing linux syscalls under VM
+-------------------------------
+Syscalls are implemented on Linux for S390 by the Supervisor call instruction (SVC) there 256
+possibilities of these as the instruction is made up of a 0xA opcode & the second byte being
+the syscall number. They are traced using the simple command.
+TR SVC <Optional value or range>
+the syscalls are defined in linux/include/asm-s390/unistd.h
+e.g. to trace all file opens just do
+TR SVC 5 ( as this is the syscall number of open )
+
+
+SMP Specific commands
+---------------------
+To find out how many cpus you have
+Q CPUS displays all the CPU's available to your virtual machine
+To find the cpu that the current cpu VM debugger commands are being directed at do
+Q CPU to change the current cpu VM debugger commands are being directed at do
+CPU <desired cpu no>
+
+On a SMP guest issue a command to all CPUs try prefixing the command with cpu all.
+To issue a command to a particular cpu try cpu <cpu number> e.g.
+CPU 01 TR I R 2000.3000
+If you are running on a guest with several cpus & you have a IO related problem
+& cannot follow the flow of code but you know it isn't smp related.
+from the bash prompt issue
+shutdown -h now or halt.
+do a Q CPUS to find out how many cpus you have
+detach each one of them from cp except cpu 0
+by issuing a
+DETACH CPU 01-(number of cpus in configuration)
+& boot linux again.
+TR SIGP will trace inter processor signal processor instructions.
+DEFINE CPU 01-(number in configuration)
+will get your guests cpus back.
+
+
+Help for displaying ascii textstrings
+-------------------------------------
+On the very latest VM Nucleus'es VM can now display ascii
+( thanks Neale for the hint ) by doing
+D TX<lowaddr>.<len>
+e.g.
+D TX0.100
+
+Alternatively
+=============
+Under older VM debuggers ( I love EBDIC too ) you can use this little program I wrote which
+will convert a command line of hex digits to ascii text which can be compiled under linux &
+you can copy the hex digits from your x3270 terminal to your xterm if you are debugging
+from a linuxbox.
+
+This is quite useful when looking at a parameter passed in as a text string
+under VM ( unless you are good at decoding ASCII in your head ).
+
+e.g. consider tracing an open syscall
+TR SVC 5
+We have stopped at a breakpoint
+000151B0' SVC 0A05 -> 0001909A' CC 0
+
+D 20.8 to check the SVC old psw in the prefix area & see was it from userspace
+( for the layout of the prefix area consult P18 of the s/390 390 Reference Summary
+if you have it available ).
+V00000020 070C2000 800151B2
+The problem state bit wasn't set & it's also too early in the boot sequence
+for it to be a userspace SVC if it was we would have to temporarily switch the
+psw to user space addressing so we could get at the first parameter of the open in
+gpr2.
+Next do a
+D G2
+GPR 2 = 00014CB4
+Now display what gpr2 is pointing to
+D 00014CB4.20
+V00014CB4 2F646576 2F636F6E 736F6C65 00001BF5
+V00014CC4 FC00014C B4001001 E0001000 B8070707
+Now copy the text till the first 00 hex ( which is the end of the string
+to an xterm & do hex2ascii on it.
+hex2ascii 2F646576 2F636F6E 736F6C65 00
+outputs
+Decoded Hex:=/ d e v / c o n s o l e 0x00
+We were opening the console device,
+
+You can compile the code below yourself for practice :-),
+/*
+ * hex2ascii.c
+ * a useful little tool for converting a hexadecimal command line to ascii
+ *
+ * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
+ * (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation.
+ */
+#include <stdio.h>
+
+int main(int argc,char *argv[])
+{
+ int cnt1,cnt2,len,toggle=0;
+ int startcnt=1;
+ unsigned char c,hex;
+
+ if(argc>1&&(strcmp(argv[1],"-a")==0))
+ startcnt=2;
+ printf("Decoded Hex:=");
+ for(cnt1=startcnt;cnt1<argc;cnt1++)
+ {
+ len=strlen(argv[cnt1]);
+ for(cnt2=0;cnt2<len;cnt2++)
+ {
+ c=argv[cnt1][cnt2];
+ if(c>='0'&&c<='9')
+ c=c-'0';
+ if(c>='A'&&c<='F')
+ c=c-'A'+10;
+ if(c>='a'&&c<='f')
+ c=c-'a'+10;
+ switch(toggle)
+ {
+ case 0:
+ hex=c<<4;
+ toggle=1;
+ break;
+ case 1:
+ hex+=c;
+ if(hex<32||hex>127)
+ {
+ if(startcnt==1)
+ printf("0x%02X ",(int)hex);
+ else
+ printf(".");
+ }
+ else
+ {
+ printf("%c",hex);
+ if(startcnt==1)
+ printf(" ");
+ }
+ toggle=0;
+ break;
+ }
+ }
+ }
+ printf("\n");
+}
+
+
+
+
+Stack tracing under VM
+----------------------
+A basic backtrace
+-----------------
+
+Here are the tricks I use 9 out of 10 times it works pretty well,
+
+When your backchain reaches a dead end
+--------------------------------------
+This can happen when an exception happens in the kernel & the kernel is entered twice
+if you reach the NULL pointer at the end of the back chain you should be
+able to sniff further back if you follow the following tricks.
+1) A kernel address should be easy to recognise since it is in
+primary space & the problem state bit isn't set & also
+The Hi bit of the address is set.
+2) Another backchain should also be easy to recognise since it is an
+address pointing to another address approximately 100 bytes or 0x70 hex
+behind the current stackpointer.
+
+
+Here is some practice.
+boot the kernel & hit PA1 at some random time
+d g to display the gprs, this should display something like
+GPR 0 = 00000001 00156018 0014359C 00000000
+GPR 4 = 00000001 001B8888 000003E0 00000000
+GPR 8 = 00100080 00100084 00000000 000FE000
+GPR 12 = 00010400 8001B2DC 8001B36A 000FFED8
+Note that GPR14 is a return address but as we are real men we are going to
+trace the stack.
+display 0x40 bytes after the stack pointer.
+
+V000FFED8 000FFF38 8001B838 80014C8E 000FFF38
+V000FFEE8 00000000 00000000 000003E0 00000000
+V000FFEF8 00100080 00100084 00000000 000FE000
+V000FFF08 00010400 8001B2DC 8001B36A 000FFED8
+
+
+Ah now look at whats in sp+56 (sp+0x38) this is 8001B36A our saved r14 if
+you look above at our stackframe & also agrees with GPR14.
+
+now backchain
+d 000FFF38.40
+we now are taking the contents of SP to get our first backchain.
+
+V000FFF38 000FFFA0 00000000 00014995 00147094
+V000FFF48 00147090 001470A0 000003E0 00000000
+V000FFF58 00100080 00100084 00000000 001BF1D0
+V000FFF68 00010400 800149BA 80014CA6 000FFF38
+
+This displays a 2nd return address of 80014CA6
+
+now do d 000FFFA0.40 for our 3rd backchain
+
+V000FFFA0 04B52002 0001107F 00000000 00000000
+V000FFFB0 00000000 00000000 FF000000 0001107F
+V000FFFC0 00000000 00000000 00000000 00000000
+V000FFFD0 00010400 80010802 8001085A 000FFFA0
+
+
+our 3rd return address is 8001085A
+
+as the 04B52002 looks suspiciously like rubbish it is fair to assume that the kernel entry routines
+for the sake of optimisation don't set up a backchain.
+
+now look at System.map to see if the addresses make any sense.
+
+grep -i 0001b3 System.map
+outputs among other things
+0001b304 T cpu_idle
+so 8001B36A
+is cpu_idle+0x66 ( quiet the cpu is asleep, don't wake it )
+
+
+grep -i 00014 System.map
+produces among other things
+00014a78 T start_kernel
+so 0014CA6 is start_kernel+some hex number I can't add in my head.
+
+grep -i 00108 System.map
+this produces
+00010800 T _stext
+so 8001085A is _stext+0x5a
+
+Congrats you've done your first backchain.
+
+
+
+s/390 & z/Architecture IO Overview
+==================================
+
+I am not going to give a course in 390 IO architecture as this would take me quite a
+while & I'm no expert. Instead I'll give a 390 IO architecture summary for Dummies if you have
+the s/390 principles of operation available read this instead. If nothing else you may find a few
+useful keywords in here & be able to use them on a web search engine like altavista to find
+more useful information.
+
+Unlike other bus architectures modern 390 systems do their IO using mostly
+fibre optics & devices such as tapes & disks can be shared between several mainframes,
+also S390 can support up to 65536 devices while a high end PC based system might be choking
+with around 64. Here is some of the common IO terminology
+
+Subchannel:
+This is the logical number most IO commands use to talk to an IO device there can be up to
+0x10000 (65536) of these in a configuration typically there is a few hundred. Under VM
+for simplicity they are allocated contiguously, however on the native hardware they are not
+they typically stay consistent between boots provided no new hardware is inserted or removed.
+Under Linux for 390 we use these as IRQ's & also when issuing an IO command (CLEAR SUBCHANNEL,
+HALT SUBCHANNEL,MODIFY SUBCHANNEL,RESUME SUBCHANNEL,START SUBCHANNEL,STORE SUBCHANNEL &
+TEST SUBCHANNEL ) we use this as the ID of the device we wish to talk to, the most
+important of these instructions are START SUBCHANNEL ( to start IO ), TEST SUBCHANNEL ( to check
+whether the IO completed successfully ), & HALT SUBCHANNEL ( to kill IO ), a subchannel
+can have up to 8 channel paths to a device this offers redundancy if one is not available.
+
+
+Device Number:
+This number remains static & Is closely tied to the hardware, there are 65536 of these
+also they are made up of a CHPID ( Channel Path ID, the most significant 8 bits )
+& another lsb 8 bits. These remain static even if more devices are inserted or removed
+from the hardware, there is a 1 to 1 mapping between Subchannels & Device Numbers provided
+devices aren't inserted or removed.
+
+Channel Control Words:
+CCWS are linked lists of instructions initially pointed to by an operation request block (ORB),
+which is initially given to Start Subchannel (SSCH) command along with the subchannel number
+for the IO subsystem to process while the CPU continues executing normal code.
+These come in two flavours, Format 0 ( 24 bit for backward )
+compatibility & Format 1 ( 31 bit ). These are typically used to issue read & write
+( & many other instructions ) they consist of a length field & an absolute address field.
+For each IO typically get 1 or 2 interrupts one for channel end ( primary status ) when the
+channel is idle & the second for device end ( secondary status ) sometimes you get both
+concurrently, you check how the IO went on by issuing a TEST SUBCHANNEL at each interrupt,
+from which you receive an Interruption response block (IRB). If you get channel & device end
+status in the IRB without channel checks etc. your IO probably went okay. If you didn't you
+probably need a doctor to examine the IRB & extended status word etc.
+If an error occurs, more sophisticated control units have a facility known as
+concurrent sense this means that if an error occurs Extended sense information will
+be presented in the Extended status word in the IRB if not you have to issue a
+subsequent SENSE CCW command after the test subchannel.
+
+
+TPI( Test pending interrupt) can also be used for polled IO but in multitasking multiprocessor
+systems it isn't recommended except for checking special cases ( i.e. non looping checks for
+pending IO etc. ).
+
+Store Subchannel & Modify Subchannel can be used to examine & modify operating characteristics
+of a subchannel ( e.g. channel paths ).
+
+Other IO related Terms:
+Sysplex: S390's Clustering Technology
+QDIO: S390's new high speed IO architecture to support devices such as gigabit ethernet,
+this architecture is also designed to be forward compatible with up & coming 64 bit machines.
+
+
+General Concepts
+
+Input Output Processors (IOP's) are responsible for communicating between
+the mainframe CPU's & the channel & relieve the mainframe CPU's from the
+burden of communicating with IO devices directly, this allows the CPU's to
+concentrate on data processing.
+
+IOP's can use one or more links ( known as channel paths ) to talk to each
+IO device. It first checks for path availability & chooses an available one,
+then starts ( & sometimes terminates IO ).
+There are two types of channel path: ESCON & the Parallel IO interface.
+
+IO devices are attached to control units, control units provide the
+logic to interface the channel paths & channel path IO protocols to
+the IO devices, they can be integrated with the devices or housed separately
+& often talk to several similar devices ( typical examples would be raid
+controllers or a control unit which connects to 1000 3270 terminals ).
+
+
+ +---------------------------------------------------------------+
+ | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ |
+ | | CPU | | CPU | | CPU | | CPU | | Main | | Expanded | |
+ | | | | | | | | | | Memory | | Storage | |
+ | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ |
+ |---------------------------------------------------------------+
+ | IOP | IOP | IOP |
+ |---------------------------------------------------------------
+ | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C |
+ ----------------------------------------------------------------
+ || ||
+ || Bus & Tag Channel Path || ESCON
+ || ====================== || Channel
+ || || || || Path
+ +----------+ +----------+ +----------+
+ | | | | | |
+ | CU | | CU | | CU |
+ | | | | | |
+ +----------+ +----------+ +----------+
+ | | | | |
++----------+ +----------+ +----------+ +----------+ +----------+
+|I/O Device| |I/O Device| |I/O Device| |I/O Device| |I/O Device|
++----------+ +----------+ +----------+ +----------+ +----------+
+ CPU = Central Processing Unit
+ C = Channel
+ IOP = IP Processor
+ CU = Control Unit
+
+The 390 IO systems come in 2 flavours the current 390 machines support both
+
+The Older 360 & 370 Interface,sometimes called the Parallel I/O interface,
+sometimes called Bus-and Tag & sometimes Original Equipment Manufacturers
+Interface (OEMI).
+
+This byte wide Parallel channel path/bus has parity & data on the "Bus" cable
+& control lines on the "Tag" cable. These can operate in byte multiplex mode for
+sharing between several slow devices or burst mode & monopolize the channel for the
+whole burst. Up to 256 devices can be addressed on one of these cables. These cables are
+about one inch in diameter. The maximum unextended length supported by these cables is
+125 Meters but this can be extended up to 2km with a fibre optic channel extended
+such as a 3044. The maximum burst speed supported is 4.5 megabytes per second however
+some really old processors support only transfer rates of 3.0, 2.0 & 1.0 MB/sec.
+One of these paths can be daisy chained to up to 8 control units.
+
+
+ESCON if fibre optic it is also called FICON
+Was introduced by IBM in 1990. Has 2 fibre optic cables & uses either leds or lasers
+for communication at a signaling rate of up to 200 megabits/sec. As 10bits are transferred
+for every 8 bits info this drops to 160 megabits/sec & to 18.6 Megabytes/sec once
+control info & CRC are added. ESCON only operates in burst mode.
+
+ESCONs typical max cable length is 3km for the led version & 20km for the laser version
+known as XDF ( extended distance facility ). This can be further extended by using an
+ESCON director which triples the above mentioned ranges. Unlike Bus & Tag as ESCON is
+serial it uses a packet switching architecture the standard Bus & Tag control protocol
+is however present within the packets. Up to 256 devices can be attached to each control
+unit that uses one of these interfaces.
+
+Common 390 Devices include:
+Network adapters typically OSA2,3172's,2116's & OSA-E gigabit ethernet adapters,
+Consoles 3270 & 3215 ( a teletype emulated under linux for a line mode console ).
+DASD's direct access storage devices ( otherwise known as hard disks ).
+Tape Drives.
+CTC ( Channel to Channel Adapters ),
+ESCON or Parallel Cables used as a very high speed serial link
+between 2 machines. We use 2 cables under linux to do a bi-directional serial link.
+
+
+Debugging IO on s/390 & z/Architecture under VM
+===============================================
+
+Now we are ready to go on with IO tracing commands under VM
+
+A few self explanatory queries:
+Q OSA
+Q CTC
+Q DISK ( This command is CMS specific )
+Q DASD
+
+
+
+
+
+
+Q OSA on my machine returns
+OSA 7C08 ON OSA 7C08 SUBCHANNEL = 0000
+OSA 7C09 ON OSA 7C09 SUBCHANNEL = 0001
+OSA 7C14 ON OSA 7C14 SUBCHANNEL = 0002
+OSA 7C15 ON OSA 7C15 SUBCHANNEL = 0003
+
+If you have a guest with certain privileges you may be able to see devices
+which don't belong to you. To avoid this, add the option V.
+e.g.
+Q V OSA
+
+Now using the device numbers returned by this command we will
+Trace the io starting up on the first device 7c08 & 7c09
+In our simplest case we can trace the
+start subchannels
+like TR SSCH 7C08-7C09
+or the halt subchannels
+or TR HSCH 7C08-7C09
+MSCH's ,STSCH's I think you can guess the rest
+
+Ingo's favourite trick is tracing all the IO's & CCWS & spooling them into the reader of another
+VM guest so he can ftp the logfile back to his own machine.I'll do a small bit of this & give you
+ a look at the output.
+
+1) Spool stdout to VM reader
+SP PRT TO (another vm guest ) or * for the local vm guest
+2) Fill the reader with the trace
+TR IO 7c08-7c09 INST INT CCW PRT RUN
+3) Start up linux
+i 00c
+4) Finish the trace
+TR END
+5) close the reader
+C PRT
+6) list reader contents
+RDRLIST
+7) copy it to linux4's minidisk
+RECEIVE / LOG TXT A1 ( replace
+8)
+filel & press F11 to look at it
+You should see something like:
+
+00020942' SSCH B2334000 0048813C CC 0 SCH 0000 DEV 7C08
+ CPA 000FFDF0 PARM 00E2C9C4 KEY 0 FPI C0 LPM 80
+ CCW 000FFDF0 E4200100 00487FE8 0000 E4240100 ........
+ IDAL 43D8AFE8
+ IDAL 0FB76000
+00020B0A' I/O DEV 7C08 -> 000197BC' SCH 0000 PARM 00E2C9C4
+00021628' TSCH B2354000 >> 00488164 CC 0 SCH 0000 DEV 7C08
+ CCWA 000FFDF8 DEV STS 0C SCH STS 00 CNT 00EC
+ KEY 0 FPI C0 CC 0 CTLS 4007
+00022238' STSCH B2344000 >> 00488108 CC 0 SCH 0000 DEV 7C08
+
+If you don't like messing up your readed ( because you possibly booted from it )
+you can alternatively spool it to another readers guest.
+
+
+Other common VM device related commands
+---------------------------------------------
+These commands are listed only because they have
+been of use to me in the past & may be of use to
+you too. For more complete info on each of the commands
+use type HELP <command> from CMS.
+detaching devices
+DET <devno range>
+ATT <devno range> <guest>
+attach a device to guest * for your own guest
+READY <devno> cause VM to issue a fake interrupt.
+
+The VARY command is normally only available to VM administrators.
+VARY ON PATH <path> TO <devno range>
+VARY OFF PATH <PATH> FROM <devno range>
+This is used to switch on or off channel paths to devices.
+
+Q CHPID <channel path ID>
+This displays state of devices using this channel path
+D SCHIB <subchannel>
+This displays the subchannel information SCHIB block for the device.
+this I believe is also only available to administrators.
+DEFINE CTC <devno>
+defines a virtual CTC channel to channel connection
+2 need to be defined on each guest for the CTC driver to use.
+COUPLE devno userid remote devno
+Joins a local virtual device to a remote virtual device
+( commonly used for the CTC driver ).
+
+Building a VM ramdisk under CMS which linux can use
+def vfb-<blocksize> <subchannel> <number blocks>
+blocksize is commonly 4096 for linux.
+Formatting it
+format <subchannel> <driver letter e.g. x> (blksize <blocksize>
+
+Sharing a disk between multiple guests
+LINK userid devno1 devno2 mode password
+
+
+
+GDB on S390
+===========
+N.B. if compiling for debugging gdb works better without optimisation
+( see Compiling programs for debugging )
+
+invocation
+----------
+gdb <victim program> <optional corefile>
+
+Online help
+-----------
+help: gives help on commands
+e.g.
+help
+help display
+Note gdb's online help is very good use it.
+
+
+Assembly
+--------
+info registers: displays registers other than floating point.
+info all-registers: displays floating points as well.
+disassemble: disassembles
+e.g.
+disassemble without parameters will disassemble the current function
+disassemble $pc $pc+10
+
+Viewing & modifying variables
+-----------------------------
+print or p: displays variable or register
+e.g. p/x $sp will display the stack pointer
+
+display: prints variable or register each time program stops
+e.g.
+display/x $pc will display the program counter
+display argc
+
+undisplay : undo's display's
+
+info breakpoints: shows all current breakpoints
+
+info stack: shows stack back trace ( if this doesn't work too well, I'll show you the
+stacktrace by hand below ).
+
+info locals: displays local variables.
+
+info args: display current procedure arguments.
+
+set args: will set argc & argv each time the victim program is invoked.
+
+set <variable>=value
+set argc=100
+set $pc=0
+
+
+
+Modifying execution
+-------------------
+step: steps n lines of sourcecode
+step steps 1 line.
+step 100 steps 100 lines of code.
+
+next: like step except this will not step into subroutines
+
+stepi: steps a single machine code instruction.
+e.g. stepi 100
+
+nexti: steps a single machine code instruction but will not step into subroutines.
+
+finish: will run until exit of the current routine
+
+run: (re)starts a program
+
+cont: continues a program
+
+quit: exits gdb.
+
+
+breakpoints
+------------
+
+break
+sets a breakpoint
+e.g.
+
+break main
+
+break *$pc
+
+break *0x400618
+
+heres a really useful one for large programs
+rbr
+Set a breakpoint for all functions matching REGEXP
+e.g.
+rbr 390
+will set a breakpoint with all functions with 390 in their name.
+
+info breakpoints
+lists all breakpoints
+
+delete: delete breakpoint by number or delete them all
+e.g.
+delete 1 will delete the first breakpoint
+delete will delete them all
+
+watch: This will set a watchpoint ( usually hardware assisted ),
+This will watch a variable till it changes
+e.g.
+watch cnt, will watch the variable cnt till it changes.
+As an aside unfortunately gdb's, architecture independent watchpoint code
+is inconsistent & not very good, watchpoints usually work but not always.
+
+info watchpoints: Display currently active watchpoints
+
+condition: ( another useful one )
+Specify breakpoint number N to break only if COND is true.
+Usage is `condition N COND', where N is an integer and COND is an
+expression to be evaluated whenever breakpoint N is reached.
+
+
+
+User defined functions/macros
+-----------------------------
+define: ( Note this is very very useful,simple & powerful )
+usage define <name> <list of commands> end
+
+examples which you should consider putting into .gdbinit in your home directory
+define d
+stepi
+disassemble $pc $pc+10
+end
+
+define e
+nexti
+disassemble $pc $pc+10
+end
+
+
+Other hard to classify stuff
+----------------------------
+signal n:
+sends the victim program a signal.
+e.g. signal 3 will send a SIGQUIT.
+
+info signals:
+what gdb does when the victim receives certain signals.
+
+list:
+e.g.
+list lists current function source
+list 1,10 list first 10 lines of current file.
+list test.c:1,10
+
+
+directory:
+Adds directories to be searched for source if gdb cannot find the source.
+(note it is a bit sensitive about slashes)
+e.g. To add the root of the filesystem to the searchpath do
+directory //
+
+
+call <function>
+This calls a function in the victim program, this is pretty powerful
+e.g.
+(gdb) call printf("hello world")
+outputs:
+$1 = 11
+
+You might now be thinking that the line above didn't work, something extra had to be done.
+(gdb) call fflush(stdout)
+hello world$2 = 0
+As an aside the debugger also calls malloc & free under the hood
+to make space for the "hello world" string.
+
+
+
+hints
+-----
+1) command completion works just like bash
+( if you are a bad typist like me this really helps )
+e.g. hit br <TAB> & cursor up & down :-).
+
+2) if you have a debugging problem that takes a few steps to recreate
+put the steps into a file called .gdbinit in your current working directory
+if you have defined a few extra useful user defined commands put these in
+your home directory & they will be read each time gdb is launched.
+
+A typical .gdbinit file might be.
+break main
+run
+break runtime_exception
+cont
+
+
+stack chaining in gdb by hand
+-----------------------------
+This is done using a the same trick described for VM
+p/x (*($sp+56))&0x7fffffff get the first backchain.
+
+For z/Architecture
+Replace 56 with 112 & ignore the &0x7fffffff
+in the macros below & do nasty casts to longs like the following
+as gdb unfortunately deals with printed arguments as ints which
+messes up everything.
+i.e. here is a 3rd backchain dereference
+p/x *(long *)(***(long ***)$sp+112)
+
+
+this outputs
+$5 = 0x528f18
+on my machine.
+Now you can use
+info symbol (*($sp+56))&0x7fffffff
+you might see something like.
+rl_getc + 36 in section .text telling you what is located at address 0x528f18
+Now do.
+p/x (*(*$sp+56))&0x7fffffff
+This outputs
+$6 = 0x528ed0
+Now do.
+info symbol (*(*$sp+56))&0x7fffffff
+rl_read_key + 180 in section .text
+now do
+p/x (*(**$sp+56))&0x7fffffff
+& so on.
+
+Disassembling instructions without debug info
+---------------------------------------------
+gdb typically complains if there is a lack of debugging
+symbols in the disassemble command with
+"No function contains specified address." To get around
+this do
+x/<number lines to disassemble>xi <address>
+e.g.
+x/20xi 0x400730
+
+
+
+Note: Remember gdb has history just like bash you don't need to retype the
+whole line just use the up & down arrows.
+
+
+
+For more info
+-------------
+From your linuxbox do
+man gdb or info gdb.
+
+core dumps
+----------
+What a core dump ?,
+A core dump is a file generated by the kernel ( if allowed ) which contains the registers,
+& all active pages of the program which has crashed.
+From this file gdb will allow you to look at the registers & stack trace & memory of the
+program as if it just crashed on your system, it is usually called core & created in the
+current working directory.
+This is very useful in that a customer can mail a core dump to a technical support department
+& the technical support department can reconstruct what happened.
+Provided they have an identical copy of this program with debugging symbols compiled in &
+the source base of this build is available.
+In short it is far more useful than something like a crash log could ever hope to be.
+
+In theory all that is missing to restart a core dumped program is a kernel patch which
+will do the following.
+1) Make a new kernel task structure
+2) Reload all the dumped pages back into the kernel's memory management structures.
+3) Do the required clock fixups
+4) Get all files & network connections for the process back into an identical state ( really difficult ).
+5) A few more difficult things I haven't thought of.
+
+
+
+Why have I never seen one ?.
+Probably because you haven't used the command
+ulimit -c unlimited in bash
+to allow core dumps, now do
+ulimit -a
+to verify that the limit was accepted.
+
+A sample core dump
+To create this I'm going to do
+ulimit -c unlimited
+gdb
+to launch gdb (my victim app. ) now be bad & do the following from another
+telnet/xterm session to the same machine
+ps -aux | grep gdb
+kill -SIGSEGV <gdb's pid>
+or alternatively use killall -SIGSEGV gdb if you have the killall command.
+Now look at the core dump.
+./gdb core
+Displays the following
+GNU gdb 4.18
+Copyright 1998 Free Software Foundation, Inc.
+GDB is free software, covered by the GNU General Public License, and you are
+welcome to change it and/or distribute copies of it under certain conditions.
+Type "show copying" to see the conditions.
+There is absolutely no warranty for GDB. Type "show warranty" for details.
+This GDB was configured as "s390-ibm-linux"...
+Core was generated by `./gdb'.
+Program terminated with signal 11, Segmentation fault.
+Reading symbols from /usr/lib/libncurses.so.4...done.
+Reading symbols from /lib/libm.so.6...done.
+Reading symbols from /lib/libc.so.6...done.
+Reading symbols from /lib/ld-linux.so.2...done.
+#0 0x40126d1a in read () from /lib/libc.so.6
+Setting up the environment for debugging gdb.
+Breakpoint 1 at 0x4dc6f8: file utils.c, line 471.
+Breakpoint 2 at 0x4d87a4: file top.c, line 2609.
+(top-gdb) info stack
+#0 0x40126d1a in read () from /lib/libc.so.6
+#1 0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402
+#2 0x528ed0 in rl_read_key () at input.c:381
+#3 0x5167e6 in readline_internal_char () at readline.c:454
+#4 0x5168ee in readline_internal_charloop () at readline.c:507
+#5 0x51692c in readline_internal () at readline.c:521
+#6 0x5164fe in readline (prompt=0x7ffff810 "\177ÿøx\177ÿ÷Ø\177ÿøxÀ")
+ at readline.c:349
+#7 0x4d7a8a in command_line_input (prrompt=0x564420 "(gdb) ", repeat=1,
+ annotation_suffix=0x4d6b44 "prompt") at top.c:2091
+#8 0x4d6cf0 in command_loop () at top.c:1345
+#9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635
+
+
+LDD
+===
+This is a program which lists the shared libraries which a library needs,
+Note you also get the relocations of the shared library text segments which
+help when using objdump --source.
+e.g.
+ ldd ./gdb
+outputs
+libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000)
+libm.so.6 => /lib/libm.so.6 (0x4005e000)
+libc.so.6 => /lib/libc.so.6 (0x40084000)
+/lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000)
+
+
+Debugging shared libraries
+==========================
+Most programs use shared libraries, however it can be very painful
+when you single step instruction into a function like printf for the
+first time & you end up in functions like _dl_runtime_resolve this is
+the ld.so doing lazy binding, lazy binding is a concept in ELF where
+shared library functions are not loaded into memory unless they are
+actually used, great for saving memory but a pain to debug.
+To get around this either relink the program -static or exit gdb type
+export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing
+the program in question.
+
+
+
+Debugging modules
+=================
+As modules are dynamically loaded into the kernel their address can be
+anywhere to get around this use the -m option with insmod to emit a load
+map which can be piped into a file if required.
+
+The proc file system
+====================
+What is it ?.
+It is a filesystem created by the kernel with files which are created on demand
+by the kernel if read, or can be used to modify kernel parameters,
+it is a powerful concept.
+
+e.g.
+
+cat /proc/sys/net/ipv4/ip_forward
+On my machine outputs
+0
+telling me ip_forwarding is not on to switch it on I can do
+echo 1 > /proc/sys/net/ipv4/ip_forward
+cat it again
+cat /proc/sys/net/ipv4/ip_forward
+On my machine now outputs
+1
+IP forwarding is on.
+There is a lot of useful info in here best found by going in & having a look around,
+so I'll take you through some entries I consider important.
+
+All the processes running on the machine have there own entry defined by
+/proc/<pid>
+So lets have a look at the init process
+cd /proc/1
+
+cat cmdline
+emits
+init [2]
+
+cd /proc/1/fd
+This contains numerical entries of all the open files,
+some of these you can cat e.g. stdout (2)
+
+cat /proc/29/maps
+on my machine emits
+
+00400000-00478000 r-xp 00000000 5f:00 4103 /bin/bash
+00478000-0047e000 rw-p 00077000 5f:00 4103 /bin/bash
+0047e000-00492000 rwxp 00000000 00:00 0
+40000000-40015000 r-xp 00000000 5f:00 14382 /lib/ld-2.1.2.so
+40015000-40016000 rw-p 00014000 5f:00 14382 /lib/ld-2.1.2.so
+40016000-40017000 rwxp 00000000 00:00 0
+40017000-40018000 rw-p 00000000 00:00 0
+40018000-4001b000 r-xp 00000000 5f:00 14435 /lib/libtermcap.so.2.0.8
+4001b000-4001c000 rw-p 00002000 5f:00 14435 /lib/libtermcap.so.2.0.8
+4001c000-4010d000 r-xp 00000000 5f:00 14387 /lib/libc-2.1.2.so
+4010d000-40111000 rw-p 000f0000 5f:00 14387 /lib/libc-2.1.2.so
+40111000-40114000 rw-p 00000000 00:00 0
+40114000-4011e000 r-xp 00000000 5f:00 14408 /lib/libnss_files-2.1.2.so
+4011e000-4011f000 rw-p 00009000 5f:00 14408 /lib/libnss_files-2.1.2.so
+7fffd000-80000000 rwxp ffffe000 00:00 0
+
+
+Showing us the shared libraries init uses where they are in memory
+& memory access permissions for each virtual memory area.
+
+/proc/1/cwd is a softlink to the current working directory.
+/proc/1/root is the root of the filesystem for this process.
+
+/proc/1/mem is the current running processes memory which you
+can read & write to like a file.
+strace uses this sometimes as it is a bit faster than the
+rather inefficient ptrace interface for peeking at DATA.
+
+
+cat status
+
+Name: init
+State: S (sleeping)
+Pid: 1
+PPid: 0
+Uid: 0 0 0 0
+Gid: 0 0 0 0
+Groups:
+VmSize: 408 kB
+VmLck: 0 kB
+VmRSS: 208 kB
+VmData: 24 kB
+VmStk: 8 kB
+VmExe: 368 kB
+VmLib: 0 kB
+SigPnd: 0000000000000000
+SigBlk: 0000000000000000
+SigIgn: 7fffffffd7f0d8fc
+SigCgt: 00000000280b2603
+CapInh: 00000000fffffeff
+CapPrm: 00000000ffffffff
+CapEff: 00000000fffffeff
+
+User PSW: 070de000 80414146
+task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68
+User GPRS:
+00000400 00000000 0000000b 7ffffa90
+00000000 00000000 00000000 0045d9f4
+0045cafc 7ffffa90 7fffff18 0045cb08
+00010400 804039e8 80403af8 7ffff8b0
+User ACRS:
+00000000 00000000 00000000 00000000
+00000001 00000000 00000000 00000000
+00000000 00000000 00000000 00000000
+00000000 00000000 00000000 00000000
+Kernel BackChain CallChain BackChain CallChain
+ 004b7ca8 8002bd0c 004b7d18 8002b92c
+ 004b7db8 8005cd50 004b7e38 8005d12a
+ 004b7f08 80019114
+Showing among other things memory usage & status of some signals &
+the processes'es registers from the kernel task_structure
+as well as a backchain which may be useful if a process crashes
+in the kernel for some unknown reason.
+
+Some driver debugging techniques
+================================
+debug feature
+-------------
+Some of our drivers now support a "debug feature" in
+/proc/s390dbf see s390dbf.txt in the linux/Documentation directory
+for more info.
+e.g.
+to switch on the lcs "debug feature"
+echo 5 > /proc/s390dbf/lcs/level
+& then after the error occurred.
+cat /proc/s390dbf/lcs/sprintf >/logfile
+the logfile now contains some information which may help
+tech support resolve a problem in the field.
+
+
+
+high level debugging network drivers
+------------------------------------
+ifconfig is a quite useful command
+it gives the current state of network drivers.
+
+If you suspect your network device driver is dead
+one way to check is type
+ifconfig <network device>
+e.g. tr0
+You should see something like
+tr0 Link encap:16/4 Mbps Token Ring (New) HWaddr 00:04:AC:20:8E:48
+ inet addr:9.164.185.132 Bcast:9.164.191.255 Mask:255.255.224.0
+ UP BROADCAST RUNNING MULTICAST MTU:2000 Metric:1
+ RX packets:246134 errors:0 dropped:0 overruns:0 frame:0
+ TX packets:5 errors:0 dropped:0 overruns:0 carrier:0
+ collisions:0 txqueuelen:100
+
+if the device doesn't say up
+try
+/etc/rc.d/init.d/network start
+( this starts the network stack & hopefully calls ifconfig tr0 up ).
+ifconfig looks at the output of /proc/net/dev & presents it in a more presentable form
+Now ping the device from a machine in the same subnet.
+if the RX packets count & TX packets counts don't increment you probably
+have problems.
+next
+cat /proc/net/arp
+Do you see any hardware addresses in the cache if not you may have problems.
+Next try
+ping -c 5 <broadcast_addr> i.e. the Bcast field above in the output of
+ifconfig. Do you see any replies from machines other than the local machine
+if not you may have problems. also if the TX packets count in ifconfig
+hasn't incremented either you have serious problems in your driver
+(e.g. the txbusy field of the network device being stuck on )
+or you may have multiple network devices connected.
+
+
+chandev
+-------
+There is a new device layer for channel devices, some
+drivers e.g. lcs are registered with this layer.
+If the device uses the channel device layer you'll be
+able to find what interrupts it uses & the current state
+of the device.
+See the manpage chandev.8 &type cat /proc/chandev for more info.
+
+
+
+Starting points for debugging scripting languages etc.
+======================================================
+
+bash/sh
+
+bash -x <scriptname>
+e.g. bash -x /usr/bin/bashbug
+displays the following lines as it executes them.
++ MACHINE=i586
++ OS=linux-gnu
++ CC=gcc
++ CFLAGS= -DPROGRAM='bash' -DHOSTTYPE='i586' -DOSTYPE='linux-gnu' -DMACHTYPE='i586-pc-linux-gnu' -DSHELL -DHAVE_CONFIG_H -I. -I. -I./lib -O2 -pipe
++ RELEASE=2.01
++ PATCHLEVEL=1
++ RELSTATUS=release
++ MACHTYPE=i586-pc-linux-gnu
+
+perl -d <scriptname> runs the perlscript in a fully interactive debugger
+<like gdb>.
+Type 'h' in the debugger for help.
+
+for debugging java type
+jdb <filename> another fully interactive gdb style debugger.
+& type ? in the debugger for help.
+
+
+
+Dumptool & Lcrash ( lkcd )
+==========================
+Michael Holzheu & others here at IBM have a fairly mature port of
+SGI's lcrash tool which allows one to look at kernel structures in a
+running kernel.
+
+It also complements a tool called dumptool which dumps all the kernel's
+memory pages & registers to either a tape or a disk.
+This can be used by tech support or an ambitious end user do
+post mortem debugging of a machine like gdb core dumps.
+
+Going into how to use this tool in detail will be explained
+in other documentation supplied by IBM with the patches & the
+lcrash homepage http://oss.sgi.com/projects/lkcd/ & the lcrash manpage.
+
+How they work
+-------------
+Lcrash is a perfectly normal program,however, it requires 2
+additional files, Kerntypes which is built using a patch to the
+linux kernel sources in the linux root directory & the System.map.
+
+Kerntypes is an objectfile whose sole purpose in life
+is to provide stabs debug info to lcrash, to do this
+Kerntypes is built from kerntypes.c which just includes the most commonly
+referenced header files used when debugging, lcrash can then read the
+.stabs section of this file.
+
+Debugging a live system it uses /dev/mem
+alternatively for post mortem debugging it uses the data
+collected by dumptool.
+
+
+
+SysRq
+=====
+This is now supported by linux for s/390 & z/Architecture.
+To enable it do compile the kernel with
+Kernel Hacking -> Magic SysRq Key Enabled
+echo "1" > /proc/sys/kernel/sysrq
+also type
+echo "8" >/proc/sys/kernel/printk
+To make printk output go to console.
+On 390 all commands are prefixed with
+^-
+e.g.
+^-t will show tasks.
+^-? or some unknown command will display help.
+The sysrq key reading is very picky ( I have to type the keys in an
+ xterm session & paste them into the x3270 console )
+& it may be wise to predefine the keys as described in the VM hints above
+
+This is particularly useful for syncing disks unmounting & rebooting
+if the machine gets partially hung.
+
+Read Documentation/sysrq.txt for more info
+
+References:
+===========
+Enterprise Systems Architecture Reference Summary
+Enterprise Systems Architecture Principles of Operation
+Hartmut Penners s390 stack frame sheet.
+IBM Mainframe Channel Attachment a technology brief from a CISCO webpage
+Various bits of man & info pages of Linux.
+Linux & GDB source.
+Various info & man pages.
+CMS Help on tracing commands.
+Linux for s/390 Elf Application Binary Interface
+Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended )
+z/Architecture Principles of Operation SA22-7832-00
+Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the
+Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05
+
+Special Thanks
+==============
+Special thanks to Neale Ferguson who maintains a much
+prettier HTML version of this page at
+http://penguinvm.princeton.edu/notes.html#Debug390
+Bob Grainger Stefan Bader & others for reporting bugs
diff --git a/Documentation/s390/TAPE b/Documentation/s390/TAPE
new file mode 100644
index 0000000..c639aa5
--- /dev/null
+++ b/Documentation/s390/TAPE
@@ -0,0 +1,122 @@
+Channel attached Tape device driver
+
+-----------------------------WARNING-----------------------------------------
+This driver is considered to be EXPERIMENTAL. Do NOT use it in
+production environments. Feel free to test it and report problems back to us.
+-----------------------------------------------------------------------------
+
+The LINUX for zSeries tape device driver manages channel attached tape drives
+which are compatible to IBM 3480 or IBM 3490 magnetic tape subsystems. This
+includes various models of these devices (for example the 3490E).
+
+
+Tape driver features
+
+The device driver supports a maximum of 128 tape devices.
+No official LINUX device major number is assigned to the zSeries tape device
+driver. It allocates major numbers dynamically and reports them on system
+startup.
+Typically it will get major number 254 for both the character device front-end
+and the block device front-end.
+
+The tape device driver needs no kernel parameters. All supported devices
+present are detected on driver initialization at system startup or module load.
+The devices detected are ordered by their subchannel numbers. The device with
+the lowest subchannel number becomes device 0, the next one will be device 1
+and so on.
+
+
+Tape character device front-end
+
+The usual way to read or write to the tape device is through the character
+device front-end. The zSeries tape device driver provides two character devices
+for each physical device -- the first of these will rewind automatically when
+it is closed, the second will not rewind automatically.
+
+The character device nodes are named /dev/rtibm0 (rewinding) and /dev/ntibm0
+(non-rewinding) for the first device, /dev/rtibm1 and /dev/ntibm1 for the
+second, and so on.
+
+The character device front-end can be used as any other LINUX tape device. You
+can write to it and read from it using LINUX facilities such as GNU tar. The
+tool mt can be used to perform control operations, such as rewinding the tape
+or skipping a file.
+
+Most LINUX tape software should work with either tape character device.
+
+
+Tape block device front-end
+
+The tape device may also be accessed as a block device in read-only mode.
+This could be used for software installation in the same way as it is used with
+other operation systems on the zSeries platform (and most LINUX
+distributions are shipped on compact disk using ISO9660 filesystems).
+
+One block device node is provided for each physical device. These are named
+/dev/btibm0 for the first device, /dev/btibm1 for the second and so on.
+You should only use the ISO9660 filesystem on LINUX for zSeries tapes because
+the physical tape devices cannot perform fast seeks and the ISO9660 system is
+optimized for this situation.
+
+
+Tape block device example
+
+In this example a tape with an ISO9660 filesystem is created using the first
+tape device. ISO9660 filesystem support must be built into your system kernel
+for this.
+The mt command is used to issue tape commands and the mkisofs command to
+create an ISO9660 filesystem:
+
+- create a LINUX directory (somedir) with the contents of the filesystem
+ mkdir somedir
+ cp contents somedir
+
+- insert a tape
+
+- ensure the tape is at the beginning
+ mt -f /dev/ntibm0 rewind
+
+- set the blocksize of the character driver. The blocksize 2048 bytes
+ is commonly used on ISO9660 CD-Roms
+ mt -f /dev/ntibm0 setblk 2048
+
+- write the filesystem to the character device driver
+ mkisofs -o /dev/ntibm0 somedir
+
+- rewind the tape again
+ mt -f /dev/ntibm0 rewind
+
+- Now you can mount your new filesystem as a block device:
+ mount -t iso9660 -o ro,block=2048 /dev/btibm0 /mnt
+
+TODO List
+
+ - Driver has to be stabilized still
+
+BUGS
+
+This driver is considered BETA, which means some weaknesses may still
+be in it.
+If an error occurs which cannot be handled by the code you will get a
+sense-data dump.In that case please do the following:
+
+1. set the tape driver debug level to maximum:
+ echo 6 >/proc/s390dbf/tape/level
+
+2. re-perform the actions which produced the bug. (Hopefully the bug will
+ reappear.)
+
+3. get a snapshot from the debug-feature:
+ cat /proc/s390dbf/tape/hex_ascii >somefile
+
+4. Now put the snapshot together with a detailed description of the situation
+ that led to the bug:
+ - Which tool did you use?
+ - Which hardware do you have?
+ - Was your tape unit online?
+ - Is it a shared tape unit?
+
+5. Send an email with your bug report to:
+ mailto:Linux390@de.ibm.com
+
+
diff --git a/Documentation/s390/cds.txt b/Documentation/s390/cds.txt
new file mode 100644
index 0000000..c4b7b2b
--- /dev/null
+++ b/Documentation/s390/cds.txt
@@ -0,0 +1,472 @@
+Linux for S/390 and zSeries
+
+Common Device Support (CDS)
+Device Driver I/O Support Routines
+
+Authors : Ingo Adlung
+ Cornelia Huck
+
+Copyright, IBM Corp. 1999-2002
+
+Introduction
+
+This document describes the common device support routines for Linux/390.
+Different than other hardware architectures, ESA/390 has defined a unified
+I/O access method. This gives relief to the device drivers as they don't
+have to deal with different bus types, polling versus interrupt
+processing, shared versus non-shared interrupt processing, DMA versus port
+I/O (PIO), and other hardware features more. However, this implies that
+either every single device driver needs to implement the hardware I/O
+attachment functionality itself, or the operating system provides for a
+unified method to access the hardware, providing all the functionality that
+every single device driver would have to provide itself.
+
+The document does not intend to explain the ESA/390 hardware architecture in
+every detail.This information can be obtained from the ESA/390 Principles of
+Operation manual (IBM Form. No. SA22-7201).
+
+In order to build common device support for ESA/390 I/O interfaces, a
+functional layer was introduced that provides generic I/O access methods to
+the hardware.
+
+The common device support layer comprises the I/O support routines defined
+below. Some of them implement common Linux device driver interfaces, while
+some of them are ESA/390 platform specific.
+
+Note:
+In order to write a driver for S/390, you also need to look into the interface
+described in Documentation/s390/driver-model.txt.
+
+Note for porting drivers from 2.4:
+The major changes are:
+* The functions use a ccw_device instead of an irq (subchannel).
+* All drivers must define a ccw_driver (see driver-model.txt) and the associated
+ functions.
+* request_irq() and free_irq() are no longer done by the driver.
+* The oper_handler is (kindof) replaced by the probe() and set_online() functions
+ of the ccw_driver.
+* The not_oper_handler is (kindof) replaced by the remove() and set_offline()
+ functions of the ccw_driver.
+* The channel device layer is gone.
+* The interrupt handlers must be adapted to use a ccw_device as argument.
+ Moreover, they don't return a devstat, but an irb.
+* Before initiating an io, the options must be set via ccw_device_set_options().
+* Instead of calling read_dev_chars()/read_conf_data(), the driver issues
+ the channel program and handles the interrupt itself.
+
+ccw_device_get_ciw()
+ get commands from extended sense data.
+
+ccw_device_start()
+ccw_device_start_timeout()
+ccw_device_start_key()
+ccw_device_start_key_timeout()
+ initiate an I/O request.
+
+ccw_device_resume()
+ resume channel program execution.
+
+ccw_device_halt()
+ terminate the current I/O request processed on the device.
+
+do_IRQ()
+ generic interrupt routine. This function is called by the interrupt entry
+ routine whenever an I/O interrupt is presented to the system. The do_IRQ()
+ routine determines the interrupt status and calls the device specific
+ interrupt handler according to the rules (flags) defined during I/O request
+ initiation with do_IO().
+
+The next chapters describe the functions other than do_IRQ() in more details.
+The do_IRQ() interface is not described, as it is called from the Linux/390
+first level interrupt handler only and does not comprise a device driver
+callable interface. Instead, the functional description of do_IO() also
+describes the input to the device specific interrupt handler.
+
+Note: All explanations apply also to the 64 bit architecture s390x.
+
+
+Common Device Support (CDS) for Linux/390 Device Drivers
+
+General Information
+
+The following chapters describe the I/O related interface routines the
+Linux/390 common device support (CDS) provides to allow for device specific
+driver implementations on the IBM ESA/390 hardware platform. Those interfaces
+intend to provide the functionality required by every device driver
+implementation to allow to drive a specific hardware device on the ESA/390
+platform. Some of the interface routines are specific to Linux/390 and some
+of them can be found on other Linux platforms implementations too.
+Miscellaneous function prototypes, data declarations, and macro definitions
+can be found in the architecture specific C header file
+linux/include/asm-s390/irq.h.
+
+Overview of CDS interface concepts
+
+Different to other hardware platforms, the ESA/390 architecture doesn't define
+interrupt lines managed by a specific interrupt controller and bus systems
+that may or may not allow for shared interrupts, DMA processing, etc.. Instead,
+the ESA/390 architecture has implemented a so called channel subsystem, that
+provides a unified view of the devices physically attached to the systems.
+Though the ESA/390 hardware platform knows about a huge variety of different
+peripheral attachments like disk devices (aka. DASDs), tapes, communication
+controllers, etc. they can all be accessed by a well defined access method and
+they are presenting I/O completion a unified way : I/O interruptions. Every
+single device is uniquely identified to the system by a so called subchannel,
+where the ESA/390 architecture allows for 64k devices be attached.
+
+Linux, however, was first built on the Intel PC architecture, with its two
+cascaded 8259 programmable interrupt controllers (PICs), that allow for a
+maximum of 15 different interrupt lines. All devices attached to such a system
+share those 15 interrupt levels. Devices attached to the ISA bus system must
+not share interrupt levels (aka. IRQs), as the ISA bus bases on edge triggered
+interrupts. MCA, EISA, PCI and other bus systems base on level triggered
+interrupts, and therewith allow for shared IRQs. However, if multiple devices
+present their hardware status by the same (shared) IRQ, the operating system
+has to call every single device driver registered on this IRQ in order to
+determine the device driver owning the device that raised the interrupt.
+
+Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel).
+For internal use of the common I/O layer, these are still there. However,
+device drivers should use the new calling interface via the ccw_device only.
+
+During its startup the Linux/390 system checks for peripheral devices. Each
+of those devices is uniquely defined by a so called subchannel by the ESA/390
+channel subsystem. While the subchannel numbers are system generated, each
+subchannel also takes a user defined attribute, the so called device number.
+Both subchannel number and device number cannot exceed 65535. During sysfs
+initialisation, the information about control unit type and device types that
+imply specific I/O commands (channel command words - CCWs) in order to operate
+the device are gathered. Device drivers can retrieve this set of hardware
+information during their initialization step to recognize the devices they
+support using the information saved in the struct ccw_device given to them.
+This methods implies that Linux/390 doesn't require to probe for free (not
+armed) interrupt request lines (IRQs) to drive its devices with. Where
+applicable, the device drivers can use issue the READ DEVICE CHARACTERISTICS
+ccw to retrieve device characteristics in its online routine.
+
+In order to allow for easy I/O initiation the CDS layer provides a
+ccw_device_start() interface that takes a device specific channel program (one
+or more CCWs) as input sets up the required architecture specific control blocks
+and initiates an I/O request on behalf of the device driver. The
+ccw_device_start() routine allows to specify whether it expects the CDS layer
+to notify the device driver for every interrupt it observes, or with final status
+only. See ccw_device_start() for more details. A device driver must never issue
+ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead.
+
+For long running I/O request to be canceled, the CDS layer provides the
+ccw_device_halt() function. Some devices require to initially issue a HALT
+SUBCHANNEL (HSCH) command without having pending I/O requests. This function is
+also covered by ccw_device_halt().
+
+
+get_ciw() - get command information word
+
+This call enables a device driver to get information about supported commands
+from the extended SenseID data.
+
+struct ciw *
+ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd);
+
+cdev - The ccw_device for which the command is to be retrieved.
+cmd - The command type to be retrieved.
+
+ccw_device_get_ciw() returns:
+NULL - No extended data available, invalid device or command not found.
+!NULL - The command requested.
+
+
+ccw_device_start() - Initiate I/O Request
+
+The ccw_device_start() routines is the I/O request front-end processor. All
+device driver I/O requests must be issued using this routine. A device driver
+must not issue ESA/390 I/O commands itself. Instead the ccw_device_start()
+routine provides all interfaces required to drive arbitrary devices.
+
+This description also covers the status information passed to the device
+driver's interrupt handler as this is related to the rules (flags) defined
+with the associated I/O request when calling ccw_device_start().
+
+int ccw_device_start(struct ccw_device *cdev,
+ struct ccw1 *cpa,
+ unsigned long intparm,
+ __u8 lpm,
+ unsigned long flags);
+int ccw_device_start_timeout(struct ccw_device *cdev,
+ struct ccw1 *cpa,
+ unsigned long intparm,
+ __u8 lpm,
+ unsigned long flags,
+ int expires);
+int ccw_device_start_key(struct ccw_device *cdev,
+ struct ccw1 *cpa,
+ unsigned long intparm,
+ __u8 lpm,
+ __u8 key,
+ unsigned long flags);
+int ccw_device_start_key_timeout(struct ccw_device *cdev,
+ struct ccw1 *cpa,
+ unsigned long intparm,
+ __u8 lpm,
+ __u8 key,
+ unsigned long flags,
+ int expires);
+
+cdev : ccw_device the I/O is destined for
+cpa : logical start address of channel program
+user_intparm : user specific interrupt information; will be presented
+ back to the device driver's interrupt handler. Allows a
+ device driver to associate the interrupt with a
+ particular I/O request.
+lpm : defines the channel path to be used for a specific I/O
+ request. A value of 0 will make cio use the opm.
+key : the storage key to use for the I/O (useful for operating on a
+ storage with a storage key != default key)
+flag : defines the action to be performed for I/O processing
+expires : timeout value in jiffies. The common I/O layer will terminate
+ the running program after this and call the interrupt handler
+ with ERR_PTR(-ETIMEDOUT) as irb.
+
+Possible flag values are :
+
+DOIO_ALLOW_SUSPEND - channel program may become suspended
+DOIO_DENY_PREFETCH - don't allow for CCW prefetch; usually
+ this implies the channel program might
+ become modified
+DOIO_SUPPRESS_INTER - don't call the handler on intermediate status
+
+The cpa parameter points to the first format 1 CCW of a channel program :
+
+struct ccw1 {
+ __u8 cmd_code;/* command code */
+ __u8 flags; /* flags, like IDA addressing, etc. */
+ __u16 count; /* byte count */
+ __u32 cda; /* data address */
+} __attribute__ ((packed,aligned(8)));
+
+with the following CCW flags values defined :
+
+CCW_FLAG_DC - data chaining
+CCW_FLAG_CC - command chaining
+CCW_FLAG_SLI - suppress incorrect length
+CCW_FLAG_SKIP - skip
+CCW_FLAG_PCI - PCI
+CCW_FLAG_IDA - indirect addressing
+CCW_FLAG_SUSPEND - suspend
+
+
+Via ccw_device_set_options(), the device driver may specify the following
+options for the device:
+
+DOIO_EARLY_NOTIFICATION - allow for early interrupt notification
+DOIO_REPORT_ALL - report all interrupt conditions
+
+
+The ccw_device_start() function returns :
+
+ 0 - successful completion or request successfully initiated
+-EBUSY - The device is currently processing a previous I/O request, or there is
+ a status pending at the device.
+-ENODEV - cdev is invalid, the device is not operational or the ccw_device is
+ not online.
+
+When the I/O request completes, the CDS first level interrupt handler will
+accumulate the status in a struct irb and then call the device interrupt handler.
+The intparm field will contain the value the device driver has associated with a
+particular I/O request. If a pending device status was recognized,
+intparm will be set to 0 (zero). This may happen during I/O initiation or delayed
+by an alert status notification. In any case this status is not related to the
+current (last) I/O request. In case of a delayed status notification no special
+interrupt will be presented to indicate I/O completion as the I/O request was
+never started, even though ccw_device_start() returned with successful completion.
+
+The irb may contain an error value, and the device driver should check for this
+first:
+
+-ETIMEDOUT: the common I/O layer terminated the request after the specified
+ timeout value
+-EIO: the common I/O layer terminated the request due to an error state
+
+If the concurrent sense flag in the extended status word (esw) in the irb is
+set, the field erw.scnt in the esw describes the number of device specific
+sense bytes available in the extended control word irb->scsw.ecw[]. No device
+sensing by the device driver itself is required.
+
+The device interrupt handler can use the following definitions to investigate
+the primary unit check source coded in sense byte 0 :
+
+SNS0_CMD_REJECT 0x80
+SNS0_INTERVENTION_REQ 0x40
+SNS0_BUS_OUT_CHECK 0x20
+SNS0_EQUIPMENT_CHECK 0x10
+SNS0_DATA_CHECK 0x08
+SNS0_OVERRUN 0x04
+SNS0_INCOMPL_DOMAIN 0x01
+
+Depending on the device status, multiple of those values may be set together.
+Please refer to the device specific documentation for details.
+
+The irb->scsw.cstat field provides the (accumulated) subchannel status :
+
+SCHN_STAT_PCI - program controlled interrupt
+SCHN_STAT_INCORR_LEN - incorrect length
+SCHN_STAT_PROG_CHECK - program check
+SCHN_STAT_PROT_CHECK - protection check
+SCHN_STAT_CHN_DATA_CHK - channel data check
+SCHN_STAT_CHN_CTRL_CHK - channel control check
+SCHN_STAT_INTF_CTRL_CHK - interface control check
+SCHN_STAT_CHAIN_CHECK - chaining check
+
+The irb->scsw.dstat field provides the (accumulated) device status :
+
+DEV_STAT_ATTENTION - attention
+DEV_STAT_STAT_MOD - status modifier
+DEV_STAT_CU_END - control unit end
+DEV_STAT_BUSY - busy
+DEV_STAT_CHN_END - channel end
+DEV_STAT_DEV_END - device end
+DEV_STAT_UNIT_CHECK - unit check
+DEV_STAT_UNIT_EXCEP - unit exception
+
+Please see the ESA/390 Principles of Operation manual for details on the
+individual flag meanings.
+
+Usage Notes :
+
+ccw_device_start() must be called disabled and with the ccw device lock held.
+
+The device driver is allowed to issue the next ccw_device_start() call from
+within its interrupt handler already. It is not required to schedule a
+bottom-half, unless a non deterministically long running error recovery procedure
+or similar needs to be scheduled. During I/O processing the Linux/390 generic
+I/O device driver support has already obtained the IRQ lock, i.e. the handler
+must not try to obtain it again when calling ccw_device_start() or we end in a
+deadlock situation!
+
+If a device driver relies on an I/O request to be completed prior to start the
+next it can reduce I/O processing overhead by chaining a NoOp I/O command
+CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End
+and Device-End status to be presented together, with a single interrupt.
+However, this should be used with care as it implies the channel will remain
+busy, not being able to process I/O requests for other devices on the same
+channel. Therefore e.g. read commands should never use this technique, as the
+result will be presented by a single interrupt anyway.
+
+In order to minimize I/O overhead, a device driver should use the
+DOIO_REPORT_ALL only if the device can report intermediate interrupt
+information prior to device-end the device driver urgently relies on. In this
+case all I/O interruptions are presented to the device driver until final
+status is recognized.
+
+If a device is able to recover from asynchronously presented I/O errors, it can
+perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some
+devices always report channel-end and device-end together, with a single
+interrupt, others present primary status (channel-end) when the channel is
+ready for the next I/O request and secondary status (device-end) when the data
+transmission has been completed at the device.
+
+Above flag allows to exploit this feature, e.g. for communication devices that
+can handle lost data on the network to allow for enhanced I/O processing.
+
+Unless the channel subsystem at any time presents a secondary status interrupt,
+exploiting this feature will cause only primary status interrupts to be
+presented to the device driver while overlapping I/O is performed. When a
+secondary status without error (alert status) is presented, this indicates
+successful completion for all overlapping ccw_device_start() requests that have
+been issued since the last secondary (final) status.
+
+Channel programs that intend to set the suspend flag on a channel command word
+(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the
+suspend flag will cause a channel program check. At the time the channel program
+becomes suspended an intermediate interrupt will be generated by the channel
+subsystem.
+
+ccw_device_resume() - Resume Channel Program Execution
+
+If a device driver chooses to suspend the current channel program execution by
+setting the CCW suspend flag on a particular CCW, the channel program execution
+is suspended. In order to resume channel program execution the CIO layer
+provides the ccw_device_resume() routine.
+
+int ccw_device_resume(struct ccw_device *cdev);
+
+cdev - ccw_device the resume operation is requested for
+
+The ccw_device_resume() function returns:
+
+ 0 - suspended channel program is resumed
+-EBUSY - status pending
+-ENODEV - cdev invalid or not-operational subchannel
+-EINVAL - resume function not applicable
+-ENOTCONN - there is no I/O request pending for completion
+
+Usage Notes:
+Please have a look at the ccw_device_start() usage notes for more details on
+suspended channel programs.
+
+ccw_device_halt() - Halt I/O Request Processing
+
+Sometimes a device driver might need a possibility to stop the processing of
+a long-running channel program or the device might require to initially issue
+a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt()
+command is provided.
+
+ccw_device_halt() must be called disabled and with the ccw device lock held.
+
+int ccw_device_halt(struct ccw_device *cdev,
+ unsigned long intparm);
+
+cdev : ccw_device the halt operation is requested for
+intparm : interruption parameter; value is only used if no I/O
+ is outstanding, otherwise the intparm associated with
+ the I/O request is returned
+
+The ccw_device_halt() function returns :
+
+ 0 - request successfully initiated
+-EBUSY - the device is currently busy, or status pending.
+-ENODEV - cdev invalid.
+-EINVAL - The device is not operational or the ccw device is not online.
+
+Usage Notes :
+
+A device driver may write a never-ending channel program by writing a channel
+program that at its end loops back to its beginning by means of a transfer in
+channel (TIC) command (CCW_CMD_TIC). Usually this is performed by network
+device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is
+executed a program controlled interrupt (PCI) is generated. The device driver
+can then perform an appropriate action. Prior to interrupt of an outstanding
+read to a network device (with or without PCI flag) a ccw_device_halt()
+is required to end the pending operation.
+
+ccw_device_clear() - Terminage I/O Request Processing
+
+In order to terminate all I/O processing at the subchannel, the clear subchannel
+(CSCH) command is used. It can be issued via ccw_device_clear().
+
+ccw_device_clear() must be called disabled and with the ccw device lock held.
+
+int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm);
+
+cdev: ccw_device the clear operation is requested for
+intparm: interruption parameter (see ccw_device_halt())
+
+The ccw_device_clear() function returns:
+
+ 0 - request successfully initiated
+-ENODEV - cdev invalid
+-EINVAL - The device is not operational or the ccw device is not online.
+
+Miscellaneous Support Routines
+
+This chapter describes various routines to be used in a Linux/390 device
+driver programming environment.
+
+get_ccwdev_lock()
+
+Get the address of the device specific lock. This is then used in
+spin_lock() / spin_unlock() calls.
+
+
+__u8 ccw_device_get_path_mask(struct ccw_device *cdev);
+
+Get the mask of the path currently available for cdev.
diff --git a/Documentation/s390/config3270.sh b/Documentation/s390/config3270.sh
new file mode 100644
index 0000000..515e2f4
--- /dev/null
+++ b/Documentation/s390/config3270.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# config3270 -- Autoconfigure /dev/3270/* and /etc/inittab
+#
+# Usage:
+# config3270
+#
+# Output:
+# /tmp/mkdev3270
+#
+# Operation:
+# 1. Run this script
+# 2. Run the script it produces: /tmp/mkdev3270
+# 3. Issue "telinit q" or reboot, as appropriate.
+#
+P=/proc/tty/driver/tty3270
+ROOT=
+D=$ROOT/dev
+SUBD=3270
+TTY=$SUBD/tty
+TUB=$SUBD/tub
+SCR=$ROOT/tmp/mkdev3270
+SCRTMP=$SCR.a
+GETTYLINE=:2345:respawn:/sbin/mingetty
+INITTAB=$ROOT/etc/inittab
+NINITTAB=$ROOT/etc/NEWinittab
+OINITTAB=$ROOT/etc/OLDinittab
+ADDNOTE=\\"# Additional mingettys for the 3270/tty* driver, tub3270 ---\\"
+
+if ! ls $P > /dev/null 2>&1; then
+ modprobe tub3270 > /dev/null 2>&1
+fi
+ls $P > /dev/null 2>&1 || exit 1
+
+# Initialize two files, one for /dev/3270 commands and one
+# to replace the /etc/inittab file (old one saved in OLDinittab)
+echo "#!/bin/sh" > $SCR || exit 1
+echo " " >> $SCR
+echo "# Script built by /sbin/config3270" >> $SCR
+if [ ! -d /dev/dasd ]; then
+ echo rm -rf "$D/$SUBD/*" >> $SCR
+fi
+echo "grep -v $TTY $INITTAB > $NINITTAB" > $SCRTMP || exit 1
+echo "echo $ADDNOTE >> $NINITTAB" >> $SCRTMP
+if [ ! -d /dev/dasd ]; then
+ echo mkdir -p $D/$SUBD >> $SCR
+fi
+
+# Now query the tub3270 driver for 3270 device information
+# and add appropriate mknod and mingetty lines to our files
+echo what=config > $P
+while read devno maj min;do
+ if [ $min = 0 ]; then
+ fsmaj=$maj
+ if [ ! -d /dev/dasd ]; then
+ echo mknod $D/$TUB c $fsmaj 0 >> $SCR
+ echo chmod 666 $D/$TUB >> $SCR
+ fi
+ elif [ $maj = CONSOLE ]; then
+ if [ ! -d /dev/dasd ]; then
+ echo mknod $D/$TUB$devno c $fsmaj $min >> $SCR
+ fi
+ else
+ if [ ! -d /dev/dasd ]; then
+ echo mknod $D/$TTY$devno c $maj $min >>$SCR
+ echo mknod $D/$TUB$devno c $fsmaj $min >> $SCR
+ fi
+ echo "echo t$min$GETTYLINE $TTY$devno >> $NINITTAB" >> $SCRTMP
+ fi
+done < $P
+
+echo mv $INITTAB $OINITTAB >> $SCRTMP || exit 1
+echo mv $NINITTAB $INITTAB >> $SCRTMP
+cat $SCRTMP >> $SCR
+rm $SCRTMP
+exit 0
diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.txt
new file mode 100644
index 0000000..bde473d
--- /dev/null
+++ b/Documentation/s390/driver-model.txt
@@ -0,0 +1,287 @@
+S/390 driver model interfaces
+-----------------------------
+
+1. CCW devices
+--------------
+
+All devices which can be addressed by means of ccws are called 'CCW devices' -
+even if they aren't actually driven by ccws.
+
+All ccw devices are accessed via a subchannel, this is reflected in the
+structures under devices/:
+
+devices/
+ - system/
+ - css0/
+ - 0.0.0000/0.0.0815/
+ - 0.0.0001/0.0.4711/
+ - 0.0.0002/
+ - 0.1.0000/0.1.1234/
+ ...
+ - defunct/
+
+In this example, device 0815 is accessed via subchannel 0 in subchannel set 0,
+device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O
+subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1.
+
+The subchannel named 'defunct' does not represent any real subchannel on the
+system; it is a pseudo subchannel where disconnected ccw devices are moved to
+if they are displaced by another ccw device becoming operational on their
+former subchannel. The ccw devices will be moved again to a proper subchannel
+if they become operational again on that subchannel.
+
+You should address a ccw device via its bus id (e.g. 0.0.4711); the device can
+be found under bus/ccw/devices/.
+
+All ccw devices export some data via sysfs.
+
+cutype: The control unit type / model.
+
+devtype: The device type / model, if applicable.
+
+availability: Can be 'good' or 'boxed'; 'no path' or 'no device' for
+ disconnected devices.
+
+online: An interface to set the device online and offline.
+ In the special case of the device being disconnected (see the
+ notify function under 1.2), piping 0 to online will forcibly delete
+ the device.
+
+The device drivers can add entries to export per-device data and interfaces.
+
+There is also some data exported on a per-subchannel basis (see under
+bus/css/devices/):
+
+chpids: Via which chpids the device is connected.
+
+pimpampom: The path installed, path available and path operational masks.
+
+There also might be additional data, for example for block devices.
+
+
+1.1 Bringing up a ccw device
+----------------------------
+
+This is done in several steps.
+
+a. Each driver can provide one or more parameter interfaces where parameters can
+ be specified. These interfaces are also in the driver's responsibility.
+b. After a. has been performed, if necessary, the device is finally brought up
+ via the 'online' interface.
+
+
+1.2 Writing a driver for ccw devices
+------------------------------------
+
+The basic struct ccw_device and struct ccw_driver data structures can be found
+under include/asm/ccwdev.h.
+
+struct ccw_device {
+ spinlock_t *ccwlock;
+ struct ccw_device_private *private;
+ struct ccw_device_id id;
+
+ struct ccw_driver *drv;
+ struct device dev;
+ int online;
+
+ void (*handler) (struct ccw_device *dev, unsigned long intparm,
+ struct irb *irb);
+};
+
+struct ccw_driver {
+ struct module *owner;
+ struct ccw_device_id *ids;
+ int (*probe) (struct ccw_device *);
+ int (*remove) (struct ccw_device *);
+ int (*set_online) (struct ccw_device *);
+ int (*set_offline) (struct ccw_device *);
+ int (*notify) (struct ccw_device *, int);
+ struct device_driver driver;
+ char *name;
+};
+
+The 'private' field contains data needed for internal i/o operation only, and
+is not available to the device driver.
+
+Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models
+and/or device types/models it is interested. This information can later be found
+in the struct ccw_device_id fields:
+
+struct ccw_device_id {
+ __u16 match_flags;
+
+ __u16 cu_type;
+ __u16 dev_type;
+ __u8 cu_model;
+ __u8 dev_model;
+
+ unsigned long driver_info;
+};
+
+The functions in ccw_driver should be used in the following way:
+probe: This function is called by the device layer for each device the driver
+ is interested in. The driver should only allocate private structures
+ to put in dev->driver_data and create attributes (if needed). Also,
+ the interrupt handler (see below) should be set here.
+
+int (*probe) (struct ccw_device *cdev);
+
+Parameters: cdev - the device to be probed.
+
+
+remove: This function is called by the device layer upon removal of the driver,
+ the device or the module. The driver should perform cleanups here.
+
+int (*remove) (struct ccw_device *cdev);
+
+Parameters: cdev - the device to be removed.
+
+
+set_online: This function is called by the common I/O layer when the device is
+ activated via the 'online' attribute. The driver should finally
+ setup and activate the device here.
+
+int (*set_online) (struct ccw_device *);
+
+Parameters: cdev - the device to be activated. The common layer has
+ verified that the device is not already online.
+
+
+set_offline: This function is called by the common I/O layer when the device is
+ de-activated via the 'online' attribute. The driver should shut
+ down the device, but not de-allocate its private data.
+
+int (*set_offline) (struct ccw_device *);
+
+Parameters: cdev - the device to be deactivated. The common layer has
+ verified that the device is online.
+
+
+notify: This function is called by the common I/O layer for some state changes
+ of the device.
+ Signalled to the driver are:
+ * In online state, device detached (CIO_GONE) or last path gone
+ (CIO_NO_PATH). The driver must return !0 to keep the device; for
+ return code 0, the device will be deleted as usual (also when no
+ notify function is registered). If the driver wants to keep the
+ device, it is moved into disconnected state.
+ * In disconnected state, device operational again (CIO_OPER). The
+ common I/O layer performs some sanity checks on device number and
+ Device / CU to be reasonably sure if it is still the same device.
+ If not, the old device is removed and a new one registered. By the
+ return code of the notify function the device driver signals if it
+ wants the device back: !0 for keeping, 0 to make the device being
+ removed and re-registered.
+
+int (*notify) (struct ccw_device *, int);
+
+Parameters: cdev - the device whose state changed.
+ event - the event that happened. This can be one of CIO_GONE,
+ CIO_NO_PATH or CIO_OPER.
+
+The handler field of the struct ccw_device is meant to be set to the interrupt
+handler for the device. In order to accommodate drivers which use several
+distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device
+instead of ccw_driver.
+The handler is registered with the common layer during set_online() processing
+before the driver is called, and is deregistered during set_offline() after the
+driver has been called. Also, after registering / before deregistering, path
+grouping resp. disbanding of the path group (if applicable) are performed.
+
+void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb);
+
+Parameters: dev - the device the handler is called for
+ intparm - the intparm which allows the device driver to identify
+ the i/o the interrupt is associated with, or to recognize
+ the interrupt as unsolicited.
+ irb - interruption response block which contains the accumulated
+ status.
+
+The device driver is called from the common ccw_device layer and can retrieve
+information about the interrupt from the irb parameter.
+
+
+1.3 ccwgroup devices
+--------------------
+
+The ccwgroup mechanism is designed to handle devices consisting of multiple ccw
+devices, like lcs or ctc.
+
+The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to
+this attributes creates a ccwgroup device consisting of these ccw devices (if
+possible). This ccwgroup device can be set online or offline just like a normal
+ccw device.
+
+Each ccwgroup device also provides an 'ungroup' attribute to destroy the device
+again (only when offline). This is a generic ccwgroup mechanism (the driver does
+not need to implement anything beyond normal removal routines).
+
+A ccw device which is a member of a ccwgroup device carries a pointer to the
+ccwgroup device in the driver_data of its device struct. This field must not be
+touched by the driver - it should use the ccwgroup device's driver_data for its
+private data.
+
+To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in
+mind that most drivers will need to implement both a ccwgroup and a ccw driver
+(unless you have a meta ccw driver, like cu3088 for lcs and ctc).
+
+
+2. Channel paths
+-----------------
+
+Channel paths show up, like subchannels, under the channel subsystem root (css0)
+and are called 'chp0.<chpid>'. They have no driver and do not belong to any bus.
+Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect
+only the logical state and not the physical state, since we cannot track the
+latter consistently due to lacking machine support (we don't need to be aware
+of it anyway).
+
+status - Can be 'online' or 'offline'.
+ Piping 'on' or 'off' sets the chpid logically online/offline.
+ Piping 'on' to an online chpid triggers path reprobing for all devices
+ the chpid connects to. This can be used to force the kernel to re-use
+ a channel path the user knows to be online, but the machine hasn't
+ created a machine check for.
+
+type - The physical type of the channel path.
+
+shared - Whether the channel path is shared.
+
+cmg - The channel measurement group.
+
+3. System devices
+-----------------
+
+3.1 xpram
+---------
+
+xpram shows up under devices/system/ as 'xpram'.
+
+3.2 cpus
+--------
+
+For each cpu, a directory is created under devices/system/cpu/. Each cpu has an
+attribute 'online' which can be 0 or 1.
+
+
+4. Other devices
+----------------
+
+4.1 Netiucv
+-----------
+
+The netiucv driver creates an attribute 'connection' under
+bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv
+connection to the specified host.
+
+Netiucv connections show up under devices/iucv/ as "netiucv<ifnum>". The interface
+number is assigned sequentially to the connections defined via the 'connection'
+attribute.
+
+user - shows the connection partner.
+
+buffer - maximum buffer size.
+ Pipe to it to change buffer size.
+
+
diff --git a/Documentation/s390/kvm.txt b/Documentation/s390/kvm.txt
new file mode 100644
index 0000000..6f5ceb0
--- /dev/null
+++ b/Documentation/s390/kvm.txt
@@ -0,0 +1,125 @@
+*** BIG FAT WARNING ***
+The kvm module is currently in EXPERIMENTAL state for s390. This means that
+the interface to the module is not yet considered to remain stable. Thus, be
+prepared that we keep breaking your userspace application and guest
+compatibility over and over again until we feel happy with the result. Make sure
+your guest kernel, your host kernel, and your userspace launcher are in a
+consistent state.
+
+This Documentation describes the unique ioctl calls to /dev/kvm, the resulting
+kvm-vm file descriptors, and the kvm-vcpu file descriptors that differ from x86.
+
+1. ioctl calls to /dev/kvm
+KVM does support the following ioctls on s390 that are common with other
+architectures and do behave the same:
+KVM_GET_API_VERSION
+KVM_CREATE_VM (*) see note
+KVM_CHECK_EXTENSION
+KVM_GET_VCPU_MMAP_SIZE
+
+Notes:
+* KVM_CREATE_VM may fail on s390, if the calling process has multiple
+threads and has not called KVM_S390_ENABLE_SIE before.
+
+In addition, on s390 the following architecture specific ioctls are supported:
+ioctl: KVM_S390_ENABLE_SIE
+args: none
+see also: include/linux/kvm.h
+This call causes the kernel to switch on PGSTE in the user page table. This
+operation is needed in order to run a virtual machine, and it requires the
+calling process to be single-threaded. Note that the first call to KVM_CREATE_VM
+will implicitly try to switch on PGSTE if the user process has not called
+KVM_S390_ENABLE_SIE before. User processes that want to launch multiple threads
+before creating a virtual machine have to call KVM_S390_ENABLE_SIE, or will
+observe an error calling KVM_CREATE_VM. Switching on PGSTE is a one-time
+operation, is not reversible, and will persist over the entire lifetime of
+the calling process. It does not have any user-visible effect other than a small
+performance penalty.
+
+2. ioctl calls to the kvm-vm file descriptor
+KVM does support the following ioctls on s390 that are common with other
+architectures and do behave the same:
+KVM_CREATE_VCPU
+KVM_SET_USER_MEMORY_REGION (*) see note
+KVM_GET_DIRTY_LOG (**) see note
+
+Notes:
+* kvm does only allow exactly one memory slot on s390, which has to start
+ at guest absolute address zero and at a user address that is aligned on any
+ page boundary. This hardware "limitation" allows us to have a few unique
+ optimizations. The memory slot doesn't have to be filled
+ with memory actually, it may contain sparse holes. That said, with different
+ user memory layout this does still allow a large flexibility when
+ doing the guest memory setup.
+** KVM_GET_DIRTY_LOG doesn't work properly yet. The user will receive an empty
+log. This ioctl call is only needed for guest migration, and we intend to
+implement this one in the future.
+
+In addition, on s390 the following architecture specific ioctls for the kvm-vm
+file descriptor are supported:
+ioctl: KVM_S390_INTERRUPT
+args: struct kvm_s390_interrupt *
+see also: include/linux/kvm.h
+This ioctl is used to submit a floating interrupt for a virtual machine.
+Floating interrupts may be delivered to any virtual cpu in the configuration.
+Only some interrupt types defined in include/linux/kvm.h make sense when
+submitted as floating interrupts. The following interrupts are not considered
+to be useful as floating interrupts, and a call to inject them will result in
+-EINVAL error code: program interrupts and interprocessor signals. Valid
+floating interrupts are:
+KVM_S390_INT_VIRTIO
+KVM_S390_INT_SERVICE
+
+3. ioctl calls to the kvm-vcpu file descriptor
+KVM does support the following ioctls on s390 that are common with other
+architectures and do behave the same:
+KVM_RUN
+KVM_GET_REGS
+KVM_SET_REGS
+KVM_GET_SREGS
+KVM_SET_SREGS
+KVM_GET_FPU
+KVM_SET_FPU
+
+In addition, on s390 the following architecture specific ioctls for the
+kvm-vcpu file descriptor are supported:
+ioctl: KVM_S390_INTERRUPT
+args: struct kvm_s390_interrupt *
+see also: include/linux/kvm.h
+This ioctl is used to submit an interrupt for a specific virtual cpu.
+Only some interrupt types defined in include/linux/kvm.h make sense when
+submitted for a specific cpu. The following interrupts are not considered
+to be useful, and a call to inject them will result in -EINVAL error code:
+service processor calls and virtio interrupts. Valid interrupt types are:
+KVM_S390_PROGRAM_INT
+KVM_S390_SIGP_STOP
+KVM_S390_RESTART
+KVM_S390_SIGP_SET_PREFIX
+KVM_S390_INT_EMERGENCY
+
+ioctl: KVM_S390_STORE_STATUS
+args: unsigned long
+see also: include/linux/kvm.h
+This ioctl stores the state of the cpu at the guest real address given as
+argument, unless one of the following values defined in include/linux/kvm.h
+is given as arguement:
+KVM_S390_STORE_STATUS_NOADDR - the CPU stores its status to the save area in
+absolute lowcore as defined by the principles of operation
+KVM_S390_STORE_STATUS_PREFIXED - the CPU stores its status to the save area in
+its prefix page just like the dump tool that comes with zipl. This is useful
+to create a system dump for use with lkcdutils or crash.
+
+ioctl: KVM_S390_SET_INITIAL_PSW
+args: struct kvm_s390_psw *
+see also: include/linux/kvm.h
+This ioctl can be used to set the processor status word (psw) of a stopped cpu
+prior to running it with KVM_RUN. Note that this call is not required to modify
+the psw during sie intercepts that fall back to userspace because struct kvm_run
+does contain the psw, and this value is evaluated during reentry of KVM_RUN
+after the intercept exit was recognized.
+
+ioctl: KVM_S390_INITIAL_RESET
+args: none
+see also: include/linux/kvm.h
+This ioctl can be used to perform an initial cpu reset as defined by the
+principles of operation. The target cpu has to be in stopped state.
diff --git a/Documentation/s390/monreader.txt b/Documentation/s390/monreader.txt
new file mode 100644
index 0000000..beeaa4b
--- /dev/null
+++ b/Documentation/s390/monreader.txt
@@ -0,0 +1,197 @@
+
+Date : 2004-Nov-26
+Author: Gerald Schaefer (geraldsc@de.ibm.com)
+
+
+ Linux API for read access to z/VM Monitor Records
+ =================================================
+
+
+Description
+===========
+This item delivers a new Linux API in the form of a misc char device that is
+useable from user space and allows read access to the z/VM Monitor Records
+collected by the *MONITOR System Service of z/VM.
+
+
+User Requirements
+=================
+The z/VM guest on which you want to access this API needs to be configured in
+order to allow IUCV connections to the *MONITOR service, i.e. it needs the
+IUCV *MONITOR statement in its user entry. If the monitor DCSS to be used is
+restricted (likely), you also need the NAMESAVE <DCSS NAME> statement.
+This item will use the IUCV device driver to access the z/VM services, so you
+need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1.
+
+There are two options for being able to load the monitor DCSS (examples assume
+that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the
+location of the monitor DCSS with the Class E privileged CP command Q NSS MAP
+(the values BEGPAG and ENDPAG are given in units of 4K pages).
+
+See also "CP Command and Utility Reference" (SC24-6081-00) for more information
+on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning
+and Administration" (SC24-6116-00) for more information on DCSSes.
+
+1st option:
+-----------
+You can use the CP command DEF STOR CONFIG to define a "memory hole" in your
+guest virtual storage around the address range of the DCSS.
+
+Example: DEF STOR CONFIG 0.140M 200M.200M
+
+This defines two blocks of storage, the first is 140MB in size an begins at
+address 0MB, the second is 200MB in size and begins at address 200MB,
+resulting in a total storage of 340MB. Note that the first block should
+always start at 0 and be at least 64MB in size.
+
+2nd option:
+-----------
+Your guest virtual storage has to end below the starting address of the DCSS
+and you have to specify the "mem=" kernel parameter in your parmfile with a
+value greater than the ending address of the DCSS.
+
+Example: DEF STOR 140M
+
+This defines 140MB storage size for your guest, the parameter "mem=160M" is
+added to the parmfile.
+
+
+User Interface
+==============
+The char device is implemented as a kernel module named "monreader",
+which can be loaded via the modprobe command, or it can be compiled into the
+kernel instead. There is one optional module (or kernel) parameter, "mondcss",
+to specify the name of the monitor DCSS. If the module is compiled into the
+kernel, the kernel parameter "monreader.mondcss=<DCSS NAME>" can be specified
+in the parmfile.
+
+The default name for the DCSS is "MONDCSS" if none is specified. In case that
+there are other users already connected to the *MONITOR service (e.g.
+Performance Toolkit), the monitor DCSS is already defined and you have to use
+the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name
+of the monitor DCSS, if already defined, and the users connected to the
+*MONITOR service.
+Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor
+DCSS if your z/VM doesn't have one already, you need Class E privileges to
+define and save a DCSS.
+
+Example:
+--------
+modprobe monreader mondcss=MYDCSS
+
+This loads the module and sets the DCSS name to "MYDCSS".
+
+NOTE:
+-----
+This API provides no interface to control the *MONITOR service, e.g. specify
+which data should be collected. This can be done by the CP command MONITOR
+(Class E privileged), see "CP Command and Utility Reference".
+
+Device nodes with udev:
+-----------------------
+After loading the module, a char device will be created along with the device
+node /<udev directory>/monreader.
+
+Device nodes without udev:
+--------------------------
+If your distribution does not support udev, a device node will not be created
+automatically and you have to create it manually after loading the module.
+Therefore you need to know the major and minor numbers of the device. These
+numbers can be found in /sys/class/misc/monreader/dev.
+Typing cat /sys/class/misc/monreader/dev will give an output of the form
+<major>:<minor>. The device node can be created via the mknod command, enter
+mknod <name> c <major> <minor>, where <name> is the name of the device node
+to be created.
+
+Example:
+--------
+# modprobe monreader
+# cat /sys/class/misc/monreader/dev
+10:63
+# mknod /dev/monreader c 10 63
+
+This loads the module with the default monitor DCSS (MONDCSS) and creates a
+device node.
+
+File operations:
+----------------
+The following file operations are supported: open, release, read, poll.
+There are two alternative methods for reading: either non-blocking read in
+conjunction with polling, or blocking read without polling. IOCTLs are not
+supported.
+
+Read:
+-----
+Reading from the device provides a 12 Byte monitor control element (MCE),
+followed by a set of one or more contiguous monitor records (similar to the
+output of the CMS utility MONWRITE without the 4K control blocks). The MCE
+contains information on the type of the following record set (sample/event
+data), the monitor domains contained within it and the start and end address
+of the record set in the monitor DCSS. The start and end address can be used
+to determine the size of the record set, the end address is the address of the
+last byte of data. The start address is needed to handle "end-of-frame" records
+correctly (domain 1, record 13), i.e. it can be used to determine the record
+start offset relative to a 4K page (frame) boundary.
+
+See "Appendix A: *MONITOR" in the "z/VM Performance" document for a description
+of the monitor control element layout. The layout of the monitor records can
+be found here (z/VM 5.1): http://www.vm.ibm.com/pubs/mon510/index.html
+
+The layout of the data stream provided by the monreader device is as follows:
+...
+<0 byte read>
+<first MCE> \
+<first set of records> |
+... |- data set
+<last MCE> |
+<last set of records> /
+<0 byte read>
+...
+
+There may be more than one combination of MCE and corresponding record set
+within one data set and the end of each data set is indicated by a successful
+read with a return value of 0 (0 byte read).
+Any received data must be considered invalid until a complete set was
+read successfully, including the closing 0 byte read. Therefore you should
+always read the complete set into a buffer before processing the data.
+
+The maximum size of a data set can be as large as the size of the
+monitor DCSS, so design the buffer adequately or use dynamic memory allocation.
+The size of the monitor DCSS will be printed into syslog after loading the
+module. You can also use the (Class E privileged) CP command Q NSS MAP to
+list all available segments and information about them.
+
+As with most char devices, error conditions are indicated by returning a
+negative value for the number of bytes read. In this case, the errno variable
+indicates the error condition:
+
+EIO: reply failed, read data is invalid and the application
+ should discard the data read since the last successful read with 0 size.
+EFAULT: copy_to_user failed, read data is invalid and the application should
+ discard the data read since the last successful read with 0 size.
+EAGAIN: occurs on a non-blocking read if there is no data available at the
+ moment. There is no data missing or corrupted, just try again or rather
+ use polling for non-blocking reads.
+EOVERFLOW: message limit reached, the data read since the last successful
+ read with 0 size is valid but subsequent records may be missing.
+
+In the last case (EOVERFLOW) there may be missing data, in the first two cases
+(EIO, EFAULT) there will be missing data. It's up to the application if it will
+continue reading subsequent data or rather exit.
+
+Open:
+-----
+Only one user is allowed to open the char device. If it is already in use, the
+open function will fail (return a negative value) and set errno to EBUSY.
+The open function may also fail if an IUCV connection to the *MONITOR service
+cannot be established. In this case errno will be set to EIO and an error
+message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER
+codes are described in the "z/VM Performance" book, Appendix A.
+
+NOTE:
+-----
+As soon as the device is opened, incoming messages will be accepted and they
+will account for the message limit, i.e. opening the device without reading
+from it will provoke the "message limit reached" error (EOVERFLOW error code)
+eventually.
+
diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt
new file mode 100644
index 0000000..e054209
--- /dev/null
+++ b/Documentation/s390/s390dbf.txt
@@ -0,0 +1,649 @@
+S390 Debug Feature
+==================
+
+files: arch/s390/kernel/debug.c
+ include/asm-s390/debug.h
+
+Description:
+------------
+The goal of this feature is to provide a kernel debug logging API
+where log records can be stored efficiently in memory, where each component
+(e.g. device drivers) can have one separate debug log.
+One purpose of this is to inspect the debug logs after a production system crash
+in order to analyze the reason for the crash.
+If the system still runs but only a subcomponent which uses dbf fails,
+it is possible to look at the debug logs on a live system via the Linux
+debugfs filesystem.
+The debug feature may also very useful for kernel and driver development.
+
+Design:
+-------
+Kernel components (e.g. device drivers) can register themselves at the debug
+feature with the function call debug_register(). This function initializes a
+debug log for the caller. For each debug log exists a number of debug areas
+where exactly one is active at one time. Each debug area consists of contiguous
+pages in memory. In the debug areas there are stored debug entries (log records)
+which are written by event- and exception-calls.
+
+An event-call writes the specified debug entry to the active debug
+area and updates the log pointer for the active area. If the end
+of the active debug area is reached, a wrap around is done (ring buffer)
+and the next debug entry will be written at the beginning of the active
+debug area.
+
+An exception-call writes the specified debug entry to the log and
+switches to the next debug area. This is done in order to be sure
+that the records which describe the origin of the exception are not
+overwritten when a wrap around for the current area occurs.
+
+The debug areas themselves are also ordered in form of a ring buffer.
+When an exception is thrown in the last debug area, the following debug
+entries are then written again in the very first area.
+
+There are three versions for the event- and exception-calls: One for
+logging raw data, one for text and one for numbers.
+
+Each debug entry contains the following data:
+
+- Timestamp
+- Cpu-Number of calling task
+- Level of debug entry (0...6)
+- Return Address to caller
+- Flag, if entry is an exception or not
+
+The debug logs can be inspected in a live system through entries in
+the debugfs-filesystem. Under the toplevel directory "s390dbf" there is
+a directory for each registered component, which is named like the
+corresponding component. The debugfs normally should be mounted to
+/sys/kernel/debug therefore the debug feature can be accessed under
+/sys/kernel/debug/s390dbf.
+
+The content of the directories are files which represent different views
+to the debug log. Each component can decide which views should be
+used through registering them with the function debug_register_view().
+Predefined views for hex/ascii, sprintf and raw binary data are provided.
+It is also possible to define other views. The content of
+a view can be inspected simply by reading the corresponding debugfs file.
+
+All debug logs have an actual debug level (range from 0 to 6).
+The default level is 3. Event and Exception functions have a 'level'
+parameter. Only debug entries with a level that is lower or equal
+than the actual level are written to the log. This means, when
+writing events, high priority log entries should have a low level
+value whereas low priority entries should have a high one.
+The actual debug level can be changed with the help of the debugfs-filesystem
+through writing a number string "x" to the 'level' debugfs file which is
+provided for every debug log. Debugging can be switched off completely
+by using "-" on the 'level' debugfs file.
+
+Example:
+
+> echo "-" > /sys/kernel/debug/s390dbf/dasd/level
+
+It is also possible to deactivate the debug feature globally for every
+debug log. You can change the behavior using 2 sysctl parameters in
+/proc/sys/s390dbf:
+There are currently 2 possible triggers, which stop the debug feature
+globally. The first possibility is to use the "debug_active" sysctl. If
+set to 1 the debug feature is running. If "debug_active" is set to 0 the
+debug feature is turned off.
+The second trigger which stops the debug feature is a kernel oops.
+That prevents the debug feature from overwriting debug information that
+happened before the oops. After an oops you can reactivate the debug feature
+by piping 1 to /proc/sys/s390dbf/debug_active. Nevertheless, its not
+suggested to use an oopsed kernel in a production environment.
+If you want to disallow the deactivation of the debug feature, you can use
+the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug
+feature cannot be stopped. If the debug feature is already stopped, it
+will stay deactivated.
+
+Kernel Interfaces:
+------------------
+
+----------------------------------------------------------------------------
+debug_info_t *debug_register(char *name, int pages, int nr_areas,
+ int buf_size);
+
+Parameter: name: Name of debug log (e.g. used for debugfs entry)
+ pages: number of pages, which will be allocated per area
+ nr_areas: number of debug areas
+ buf_size: size of data area in each debug entry
+
+Return Value: Handle for generated debug area
+ NULL if register failed
+
+Description: Allocates memory for a debug log
+ Must not be called within an interrupt handler
+
+----------------------------------------------------------------------------
+debug_info_t *debug_register_mode(char *name, int pages, int nr_areas,
+ int buf_size, mode_t mode, uid_t uid,
+ gid_t gid);
+
+Parameter: name: Name of debug log (e.g. used for debugfs entry)
+ pages: Number of pages, which will be allocated per area
+ nr_areas: Number of debug areas
+ buf_size: Size of data area in each debug entry
+ mode: File mode for debugfs files. E.g. S_IRWXUGO
+ uid: User ID for debugfs files. Currently only 0 is
+ supported.
+ gid: Group ID for debugfs files. Currently only 0 is
+ supported.
+
+Return Value: Handle for generated debug area
+ NULL if register failed
+
+Description: Allocates memory for a debug log
+ Must not be called within an interrupt handler
+
+---------------------------------------------------------------------------
+void debug_unregister (debug_info_t * id);
+
+Parameter: id: handle for debug log
+
+Return Value: none
+
+Description: frees memory for a debug log
+ Must not be called within an interrupt handler
+
+---------------------------------------------------------------------------
+void debug_set_level (debug_info_t * id, int new_level);
+
+Parameter: id: handle for debug log
+ new_level: new debug level
+
+Return Value: none
+
+Description: Sets new actual debug level if new_level is valid.
+
+---------------------------------------------------------------------------
+void debug_stop_all(void);
+
+Parameter: none
+
+Return Value: none
+
+Description: stops the debug feature if stopping is allowed. Currently
+ used in case of a kernel oops.
+
+---------------------------------------------------------------------------
+debug_entry_t* debug_event (debug_info_t* id, int level, void* data,
+ int length);
+
+Parameter: id: handle for debug log
+ level: debug level
+ data: pointer to data for debug entry
+ length: length of data in bytes
+
+Return Value: Address of written debug entry
+
+Description: writes debug entry to active debug area (if level <= actual
+ debug level)
+
+---------------------------------------------------------------------------
+debug_entry_t* debug_int_event (debug_info_t * id, int level,
+ unsigned int data);
+debug_entry_t* debug_long_event(debug_info_t * id, int level,
+ unsigned long data);
+
+Parameter: id: handle for debug log
+ level: debug level
+ data: integer value for debug entry
+
+Return Value: Address of written debug entry
+
+Description: writes debug entry to active debug area (if level <= actual
+ debug level)
+
+---------------------------------------------------------------------------
+debug_entry_t* debug_text_event (debug_info_t * id, int level,
+ const char* data);
+
+Parameter: id: handle for debug log
+ level: debug level
+ data: string for debug entry
+
+Return Value: Address of written debug entry
+
+Description: writes debug entry in ascii format to active debug area
+ (if level <= actual debug level)
+
+---------------------------------------------------------------------------
+debug_entry_t* debug_sprintf_event (debug_info_t * id, int level,
+ char* string,...);
+
+Parameter: id: handle for debug log
+ level: debug level
+ string: format string for debug entry
+ ...: varargs used as in sprintf()
+
+Return Value: Address of written debug entry
+
+Description: writes debug entry with format string and varargs (longs) to
+ active debug area (if level $<=$ actual debug level).
+ floats and long long datatypes cannot be used as varargs.
+
+---------------------------------------------------------------------------
+
+debug_entry_t* debug_exception (debug_info_t* id, int level, void* data,
+ int length);
+
+Parameter: id: handle for debug log
+ level: debug level
+ data: pointer to data for debug entry
+ length: length of data in bytes
+
+Return Value: Address of written debug entry
+
+Description: writes debug entry to active debug area (if level <= actual
+ debug level) and switches to next debug area
+
+---------------------------------------------------------------------------
+debug_entry_t* debug_int_exception (debug_info_t * id, int level,
+ unsigned int data);
+debug_entry_t* debug_long_exception(debug_info_t * id, int level,
+ unsigned long data);
+
+Parameter: id: handle for debug log
+ level: debug level
+ data: integer value for debug entry
+
+Return Value: Address of written debug entry
+
+Description: writes debug entry to active debug area (if level <= actual
+ debug level) and switches to next debug area
+
+---------------------------------------------------------------------------
+debug_entry_t* debug_text_exception (debug_info_t * id, int level,
+ const char* data);
+
+Parameter: id: handle for debug log
+ level: debug level
+ data: string for debug entry
+
+Return Value: Address of written debug entry
+
+Description: writes debug entry in ascii format to active debug area
+ (if level <= actual debug level) and switches to next debug
+ area
+
+---------------------------------------------------------------------------
+debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level,
+ char* string,...);
+
+Parameter: id: handle for debug log
+ level: debug level
+ string: format string for debug entry
+ ...: varargs used as in sprintf()
+
+Return Value: Address of written debug entry
+
+Description: writes debug entry with format string and varargs (longs) to
+ active debug area (if level $<=$ actual debug level) and
+ switches to next debug area.
+ floats and long long datatypes cannot be used as varargs.
+
+---------------------------------------------------------------------------
+
+int debug_register_view (debug_info_t * id, struct debug_view *view);
+
+Parameter: id: handle for debug log
+ view: pointer to debug view struct
+
+Return Value: 0 : ok
+ < 0: Error
+
+Description: registers new debug view and creates debugfs dir entry
+
+---------------------------------------------------------------------------
+int debug_unregister_view (debug_info_t * id, struct debug_view *view);
+
+Parameter: id: handle for debug log
+ view: pointer to debug view struct
+
+Return Value: 0 : ok
+ < 0: Error
+
+Description: unregisters debug view and removes debugfs dir entry
+
+
+
+Predefined views:
+-----------------
+
+extern struct debug_view debug_hex_ascii_view;
+extern struct debug_view debug_raw_view;
+extern struct debug_view debug_sprintf_view;
+
+Examples
+--------
+
+/*
+ * hex_ascii- + raw-view Example
+ */
+
+#include <linux/init.h>
+#include <asm/debug.h>
+
+static debug_info_t* debug_info;
+
+static int init(void)
+{
+ /* register 4 debug areas with one page each and 4 byte data field */
+
+ debug_info = debug_register ("test", 1, 4, 4 );
+ debug_register_view(debug_info,&debug_hex_ascii_view);
+ debug_register_view(debug_info,&debug_raw_view);
+
+ debug_text_event(debug_info, 4 , "one ");
+ debug_int_exception(debug_info, 4, 4711);
+ debug_event(debug_info, 3, &debug_info, 4);
+
+ return 0;
+}
+
+static void cleanup(void)
+{
+ debug_unregister (debug_info);
+}
+
+module_init(init);
+module_exit(cleanup);
+
+---------------------------------------------------------------------------
+
+/*
+ * sprintf-view Example
+ */
+
+#include <linux/init.h>
+#include <asm/debug.h>
+
+static debug_info_t* debug_info;
+
+static int init(void)
+{
+ /* register 4 debug areas with one page each and data field for */
+ /* format string pointer + 2 varargs (= 3 * sizeof(long)) */
+
+ debug_info = debug_register ("test", 1, 4, sizeof(long) * 3);
+ debug_register_view(debug_info,&debug_sprintf_view);
+
+ debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__);
+ debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info);
+
+ return 0;
+}
+
+static void cleanup(void)
+{
+ debug_unregister (debug_info);
+}
+
+module_init(init);
+module_exit(cleanup);
+
+
+
+Debugfs Interface
+----------------
+Views to the debug logs can be investigated through reading the corresponding
+debugfs-files:
+
+Example:
+
+> ls /sys/kernel/debug/s390dbf/dasd
+flush hex_ascii level pages raw
+> cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort +1
+00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | ....
+00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE
+00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | ....
+00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP
+01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD
+01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | ....
+01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ...
+01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | ....
+01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE
+01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | ....
+
+See section about predefined views for explanation of the above output!
+
+Changing the debug level
+------------------------
+
+Example:
+
+
+> cat /sys/kernel/debug/s390dbf/dasd/level
+3
+> echo "5" > /sys/kernel/debug/s390dbf/dasd/level
+> cat /sys/kernel/debug/s390dbf/dasd/level
+5
+
+Flushing debug areas
+--------------------
+Debug areas can be flushed with piping the number of the desired
+area (0...n) to the debugfs file "flush". When using "-" all debug areas
+are flushed.
+
+Examples:
+
+1. Flush debug area 0:
+> echo "0" > /sys/kernel/debug/s390dbf/dasd/flush
+
+2. Flush all debug areas:
+> echo "-" > /sys/kernel/debug/s390dbf/dasd/flush
+
+Changing the size of debug areas
+------------------------------------
+It is possible the change the size of debug areas through piping
+the number of pages to the debugfs file "pages". The resize request will
+also flush the debug areas.
+
+Example:
+
+Define 4 pages for the debug areas of debug feature "dasd":
+> echo "4" > /sys/kernel/debug/s390dbf/dasd/pages
+
+Stooping the debug feature
+--------------------------
+Example:
+
+1. Check if stopping is allowed
+> cat /proc/sys/s390dbf/debug_stoppable
+2. Stop debug feature
+> echo 0 > /proc/sys/s390dbf/debug_active
+
+lcrash Interface
+----------------
+It is planned that the dump analysis tool lcrash gets an additional command
+'s390dbf' to display all the debug logs. With this tool it will be possible
+to investigate the debug logs on a live system and with a memory dump after
+a system crash.
+
+Investigating raw memory
+------------------------
+One last possibility to investigate the debug logs at a live
+system and after a system crash is to look at the raw memory
+under VM or at the Service Element.
+It is possible to find the anker of the debug-logs through
+the 'debug_area_first' symbol in the System map. Then one has
+to follow the correct pointers of the data-structures defined
+in debug.h and find the debug-areas in memory.
+Normally modules which use the debug feature will also have
+a global variable with the pointer to the debug-logs. Following
+this pointer it will also be possible to find the debug logs in
+memory.
+
+For this method it is recommended to use '16 * x + 4' byte (x = 0..n)
+for the length of the data field in debug_register() in
+order to see the debug entries well formatted.
+
+
+Predefined Views
+----------------
+
+There are three predefined views: hex_ascii, raw and sprintf.
+The hex_ascii view shows the data field in hex and ascii representation
+(e.g. '45 43 4b 44 | ECKD').
+The raw view returns a bytestream as the debug areas are stored in memory.
+
+The sprintf view formats the debug entries in the same way as the sprintf
+function would do. The sprintf event/exception functions write to the
+debug entry a pointer to the format string (size = sizeof(long))
+and for each vararg a long value. So e.g. for a debug entry with a format
+string plus two varargs one would need to allocate a (3 * sizeof(long))
+byte data area in the debug_register() function.
+
+
+NOTE: If using the sprintf view do NOT use other event/exception functions
+than the sprintf-event and -exception functions.
+
+The format of the hex_ascii and sprintf view is as follows:
+- Number of area
+- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated
+ Universal Time (UTC), January 1, 1970)
+- level of debug entry
+- Exception flag (* = Exception)
+- Cpu-Number of calling task
+- Return Address to caller
+- data field
+
+The format of the raw view is:
+- Header as described in debug.h
+- datafield
+
+A typical line of the hex_ascii view will look like the following (first line
+is only for explanation and will not be displayed when 'cating' the view):
+
+area time level exception cpu caller data (hex + ascii)
+--------------------------------------------------------------------------
+00 00964419409:440690 1 - 00 88023fe
+
+
+Defining views
+--------------
+
+Views are specified with the 'debug_view' structure. There are defined
+callback functions which are used for reading and writing the debugfs files:
+
+struct debug_view {
+ char name[DEBUG_MAX_PROCF_LEN];
+ debug_prolog_proc_t* prolog_proc;
+ debug_header_proc_t* header_proc;
+ debug_format_proc_t* format_proc;
+ debug_input_proc_t* input_proc;
+ void* private_data;
+};
+
+where
+
+typedef int (debug_header_proc_t) (debug_info_t* id,
+ struct debug_view* view,
+ int area,
+ debug_entry_t* entry,
+ char* out_buf);
+
+typedef int (debug_format_proc_t) (debug_info_t* id,
+ struct debug_view* view, char* out_buf,
+ const char* in_buf);
+typedef int (debug_prolog_proc_t) (debug_info_t* id,
+ struct debug_view* view,
+ char* out_buf);
+typedef int (debug_input_proc_t) (debug_info_t* id,
+ struct debug_view* view,
+ struct file* file, const char* user_buf,
+ size_t in_buf_size, loff_t* offset);
+
+
+The "private_data" member can be used as pointer to view specific data.
+It is not used by the debug feature itself.
+
+The output when reading a debugfs file is structured like this:
+
+"prolog_proc output"
+
+"header_proc output 1" "format_proc output 1"
+"header_proc output 2" "format_proc output 2"
+"header_proc output 3" "format_proc output 3"
+...
+
+When a view is read from the debugfs, the Debug Feature calls the
+'prolog_proc' once for writing the prolog.
+Then 'header_proc' and 'format_proc' are called for each
+existing debug entry.
+
+The input_proc can be used to implement functionality when it is written to
+the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level).
+
+For header_proc there can be used the default function
+debug_dflt_header_fn() which is defined in debug.h.
+and which produces the same header output as the predefined views.
+E.g:
+00 00964419409:440761 2 - 00 88023ec
+
+In order to see how to use the callback functions check the implementation
+of the default views!
+
+Example
+
+#include <asm/debug.h>
+
+#define UNKNOWNSTR "data: %08x"
+
+const char* messages[] =
+{"This error...........\n",
+ "That error...........\n",
+ "Problem..............\n",
+ "Something went wrong.\n",
+ "Everything ok........\n",
+ NULL
+};
+
+static int debug_test_format_fn(
+ debug_info_t * id, struct debug_view *view,
+ char *out_buf, const char *in_buf
+)
+{
+ int i, rc = 0;
+
+ if(id->buf_size >= 4) {
+ int msg_nr = *((int*)in_buf);
+ if(msg_nr < sizeof(messages)/sizeof(char*) - 1)
+ rc += sprintf(out_buf, "%s", messages[msg_nr]);
+ else
+ rc += sprintf(out_buf, UNKNOWNSTR, msg_nr);
+ }
+ out:
+ return rc;
+}
+
+struct debug_view debug_test_view = {
+ "myview", /* name of view */
+ NULL, /* no prolog */
+ &debug_dflt_header_fn, /* default header for each entry */
+ &debug_test_format_fn, /* our own format function */
+ NULL, /* no input function */
+ NULL /* no private data */
+};
+
+=====
+test:
+=====
+debug_info_t *debug_info;
+...
+debug_info = debug_register ("test", 0, 4, 4 ));
+debug_register_view(debug_info, &debug_test_view);
+for(i = 0; i < 10; i ++) debug_int_event(debug_info, 1, i);
+
+> cat /sys/kernel/debug/s390dbf/test/myview
+00 00964419734:611402 1 - 00 88042ca This error...........
+00 00964419734:611405 1 - 00 88042ca That error...........
+00 00964419734:611408 1 - 00 88042ca Problem..............
+00 00964419734:611411 1 - 00 88042ca Something went wrong.
+00 00964419734:611414 1 - 00 88042ca Everything ok........
+00 00964419734:611417 1 - 00 88042ca data: 00000005
+00 00964419734:611419 1 - 00 88042ca data: 00000006
+00 00964419734:611422 1 - 00 88042ca data: 00000007
+00 00964419734:611425 1 - 00 88042ca data: 00000008
+00 00964419734:611428 1 - 00 88042ca data: 00000009
diff --git a/Documentation/s390/zfcpdump.txt b/Documentation/s390/zfcpdump.txt
new file mode 100644
index 0000000..cf45d27
--- /dev/null
+++ b/Documentation/s390/zfcpdump.txt
@@ -0,0 +1,87 @@
+s390 SCSI dump tool (zfcpdump)
+
+System z machines (z900 or higher) provide hardware support for creating system
+dumps on SCSI disks. The dump process is initiated by booting a dump tool, which
+has to create a dump of the current (probably crashed) Linux image. In order to
+not overwrite memory of the crashed Linux with data of the dump tool, the
+hardware saves some memory plus the register sets of the boot cpu before the
+dump tool is loaded. There exists an SCLP hardware interface to obtain the saved
+memory afterwards. Currently 32 MB are saved.
+
+This zfcpdump implementation consists of a Linux dump kernel together with
+a userspace dump tool, which are loaded together into the saved memory region
+below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in
+the s390-tools package) to make the device bootable. The operator of a Linux
+system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump
+resides on.
+
+The kernel part of zfcpdump is implemented as a debugfs file under "zcore/mem",
+which exports memory and registers of the crashed Linux in an s390
+standalone dump format. It can be used in the same way as e.g. /dev/mem. The
+dump format defines a 4K header followed by plain uncompressed memory. The
+register sets are stored in the prefix pages of the respective cpus. To build a
+dump enabled kernel with the zcore driver, the kernel config option
+CONFIG_ZFCPDUMP has to be set. When reading from "zcore/mem", the part of
+memory, which has been saved by hardware is read by the driver via the SCLP
+hardware interface. The second part is just copied from the non overwritten real
+memory.
+
+The userspace application of zfcpdump can reside e.g. in an intitramfs or an
+initrd. It reads from zcore/mem and writes the system dump to a file on a
+SCSI disk.
+
+To build a zfcpdump kernel use the following settings in your kernel
+configuration:
+ * CONFIG_ZFCPDUMP=y
+ * Enable ZFCP driver
+ * Enable SCSI driver
+ * Enable ext2 and ext3 filesystems
+ * Disable as many features as possible to keep the kernel small.
+ E.g. network support is not needed at all.
+
+To use the zfcpdump userspace application in an initramfs you have to do the
+following:
+
+ * Copy the zfcpdump executable somewhere into your Linux tree.
+ E.g. to "arch/s390/boot/zfcpdump. If you do not want to include
+ shared libraries, compile the tool with the "-static" gcc option.
+ * If you want to include e2fsck, add it to your source tree, too. The zfcpdump
+ application attempts to start /sbin/e2fsck from the ramdisk.
+ * Use an initramfs config file like the following:
+
+ dir /dev 755 0 0
+ nod /dev/console 644 0 0 c 5 1
+ nod /dev/null 644 0 0 c 1 3
+ nod /dev/sda1 644 0 0 b 8 1
+ nod /dev/sda2 644 0 0 b 8 2
+ nod /dev/sda3 644 0 0 b 8 3
+ nod /dev/sda4 644 0 0 b 8 4
+ nod /dev/sda5 644 0 0 b 8 5
+ nod /dev/sda6 644 0 0 b 8 6
+ nod /dev/sda7 644 0 0 b 8 7
+ nod /dev/sda8 644 0 0 b 8 8
+ nod /dev/sda9 644 0 0 b 8 9
+ nod /dev/sda10 644 0 0 b 8 10
+ nod /dev/sda11 644 0 0 b 8 11
+ nod /dev/sda12 644 0 0 b 8 12
+ nod /dev/sda13 644 0 0 b 8 13
+ nod /dev/sda14 644 0 0 b 8 14
+ nod /dev/sda15 644 0 0 b 8 15
+ file /init arch/s390/boot/zfcpdump 755 0 0
+ file /sbin/e2fsck arch/s390/boot/e2fsck 755 0 0
+ dir /proc 755 0 0
+ dir /sys 755 0 0
+ dir /mnt 755 0 0
+ dir /sbin 755 0 0
+
+ * Issue "make image" to build the zfcpdump image with initramfs.
+
+In a Linux distribution the zfcpdump enabled kernel image must be copied to
+/usr/share/zfcpdump/zfcpdump.image, where the s390 zipl tool is looking for the
+dump kernel when preparing a SCSI dump disk.
+
+If you use a ramdisk copy it to "/usr/share/zfcpdump/zfcpdump.rd".
+
+For more information on how to use zfcpdump refer to the s390 'Using the Dump
+Tools book', which is available from
+http://www.ibm.com/developerworks/linux/linux390.
diff --git a/Documentation/scheduler/00-INDEX b/Documentation/scheduler/00-INDEX
new file mode 100644
index 0000000..aabcc3a
--- /dev/null
+++ b/Documentation/scheduler/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+ - this file.
+sched-arch.txt
+ - CPU Scheduler implementation hints for architecture specific code.
+sched-coding.txt
+ - reference for various scheduler-related methods in the O(1) scheduler.
+sched-design-CFS.txt
+ - goals, design and implementation of the Complete Fair Scheduler.
+sched-domains.txt
+ - information on scheduling domains.
+sched-nice-design.txt
+ - How and why the scheduler's nice levels are implemented.
+sched-rt-group.txt
+ - real-time group scheduling.
+sched-stats.txt
+ - information on schedstats (Linux Scheduler Statistics).
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
new file mode 100644
index 0000000..941615a
--- /dev/null
+++ b/Documentation/scheduler/sched-arch.txt
@@ -0,0 +1,89 @@
+ CPU Scheduler implementation hints for architecture specific code
+
+ Nick Piggin, 2005
+
+Context switch
+==============
+1. Runqueue locking
+By default, the switch_to arch function is called with the runqueue
+locked. This is usually not a problem unless switch_to may need to
+take the runqueue lock. This is usually due to a wake up operation in
+the context switch. See include/asm-ia64/system.h for an example.
+
+To request the scheduler call switch_to with the runqueue unlocked,
+you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
+(typically the one where switch_to is defined).
+
+Unlocked context switches introduce only a very minor performance
+penalty to the core scheduler implementation in the CONFIG_SMP case.
+
+2. Interrupt status
+By default, the switch_to arch function is called with interrupts
+disabled. Interrupts may be enabled over the call if it is likely to
+introduce a significant interrupt latency by adding the line
+`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
+unlocked context switches. This define also implies
+`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an
+example.
+
+
+CPU idle
+========
+Your cpu_idle routines need to obey the following rules:
+
+1. Preempt should now disabled over idle routines. Should only
+ be enabled to call schedule() then disabled again.
+
+2. need_resched/TIF_NEED_RESCHED is only ever set, and will never
+ be cleared until the running task has called schedule(). Idle
+ threads need only ever query need_resched, and may never set or
+ clear it.
+
+3. When cpu_idle finds (need_resched() == 'true'), it should call
+ schedule(). It should not call schedule() otherwise.
+
+4. The only time interrupts need to be disabled when checking
+ need_resched is if we are about to sleep the processor until
+ the next interrupt (this doesn't provide any protection of
+ need_resched, it prevents losing an interrupt).
+
+ 4a. Common problem with this type of sleep appears to be:
+ local_irq_disable();
+ if (!need_resched()) {
+ local_irq_enable();
+ *** resched interrupt arrives here ***
+ __asm__("sleep until next interrupt");
+ }
+
+5. TIF_POLLING_NRFLAG can be set by idle routines that do not
+ need an interrupt to wake them up when need_resched goes high.
+ In other words, they must be periodically polling need_resched,
+ although it may be reasonable to do some background work or enter
+ a low CPU priority.
+
+ 5a. If TIF_POLLING_NRFLAG is set, and we do decide to enter
+ an interrupt sleep, it needs to be cleared then a memory
+ barrier issued (followed by a test of need_resched with
+ interrupts disabled, as explained in 3).
+
+arch/i386/kernel/process.c has examples of both polling and
+sleeping idle functions.
+
+
+Possible arch/ problems
+=======================
+
+Possible arch problems I found (and either tried to fix or didn't):
+
+h8300 - Is such sleeping racy vs interrupts? (See #4a).
+ The H8/300 manual I found indicates yes, however disabling IRQs
+ over the sleep mean only NMIs can wake it up, so can't fix easily
+ without doing spin waiting.
+
+ia64 - is safe_halt call racy vs interrupts? (does it sleep?) (See #4a)
+
+sh64 - Is sleeping racy vs interrupts? (See #4a)
+
+sparc - IRQs on at this point(?), change local_irq_save to _disable.
+ - TODO: needs secondary CPUs to disable preempt (See #1)
+
diff --git a/Documentation/scheduler/sched-coding.txt b/Documentation/scheduler/sched-coding.txt
new file mode 100644
index 0000000..cbd8db7
--- /dev/null
+++ b/Documentation/scheduler/sched-coding.txt
@@ -0,0 +1,126 @@
+ Reference for various scheduler-related methods in the O(1) scheduler
+ Robert Love <rml@tech9.net>, MontaVista Software
+
+
+Note most of these methods are local to kernel/sched.c - this is by design.
+The scheduler is meant to be self-contained and abstracted away. This document
+is primarily for understanding the scheduler, not interfacing to it. Some of
+the discussed interfaces, however, are general process/scheduling methods.
+They are typically defined in include/linux/sched.h.
+
+
+Main Scheduling Methods
+-----------------------
+
+void load_balance(runqueue_t *this_rq, int idle)
+ Attempts to pull tasks from one cpu to another to balance cpu usage,
+ if needed. This method is called explicitly if the runqueues are
+ imbalanced or periodically by the timer tick. Prior to calling,
+ the current runqueue must be locked and interrupts disabled.
+
+void schedule()
+ The main scheduling function. Upon return, the highest priority
+ process will be active.
+
+
+Locking
+-------
+
+Each runqueue has its own lock, rq->lock. When multiple runqueues need
+to be locked, lock acquires must be ordered by ascending &runqueue value.
+
+A specific runqueue is locked via
+
+ task_rq_lock(task_t pid, unsigned long *flags)
+
+which disables preemption, disables interrupts, and locks the runqueue pid is
+running on. Likewise,
+
+ task_rq_unlock(task_t pid, unsigned long *flags)
+
+unlocks the runqueue pid is running on, restores interrupts to their previous
+state, and reenables preemption.
+
+The routines
+
+ double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+
+and
+
+ double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+
+safely lock and unlock, respectively, the two specified runqueues. They do
+not, however, disable and restore interrupts. Users are required to do so
+manually before and after calls.
+
+
+Values
+------
+
+MAX_PRIO
+ The maximum priority of the system, stored in the task as task->prio.
+ Lower priorities are higher. Normal (non-RT) priorities range from
+ MAX_RT_PRIO to (MAX_PRIO - 1).
+MAX_RT_PRIO
+ The maximum real-time priority of the system. Valid RT priorities
+ range from 0 to (MAX_RT_PRIO - 1).
+MAX_USER_RT_PRIO
+ The maximum real-time priority that is exported to user-space. Should
+ always be equal to or less than MAX_RT_PRIO. Setting it less allows
+ kernel threads to have higher priorities than any user-space task.
+MIN_TIMESLICE
+MAX_TIMESLICE
+ Respectively, the minimum and maximum timeslices (quanta) of a process.
+
+Data
+----
+
+struct runqueue
+ The main per-CPU runqueue data structure.
+struct task_struct
+ The main per-process data structure.
+
+
+General Methods
+---------------
+
+cpu_rq(cpu)
+ Returns the runqueue of the specified cpu.
+this_rq()
+ Returns the runqueue of the current cpu.
+task_rq(pid)
+ Returns the runqueue which holds the specified pid.
+cpu_curr(cpu)
+ Returns the task currently running on the given cpu.
+rt_task(pid)
+ Returns true if pid is real-time, false if not.
+
+
+Process Control Methods
+-----------------------
+
+void set_user_nice(task_t *p, long nice)
+ Sets the "nice" value of task p to the given value.
+int setscheduler(pid_t pid, int policy, struct sched_param *param)
+ Sets the scheduling policy and parameters for the given pid.
+int set_cpus_allowed(task_t *p, unsigned long new_mask)
+ Sets a given task's CPU affinity and migrates it to a proper cpu.
+ Callers must have a valid reference to the task and assure the
+ task not exit prematurely. No locks can be held during the call.
+set_task_state(tsk, state_value)
+ Sets the given task's state to the given value.
+set_current_state(state_value)
+ Sets the current task's state to the given value.
+void set_tsk_need_resched(struct task_struct *tsk)
+ Sets need_resched in the given task.
+void clear_tsk_need_resched(struct task_struct *tsk)
+ Clears need_resched in the given task.
+void set_need_resched()
+ Sets need_resched in the current task.
+void clear_need_resched()
+ Clears need_resched in the current task.
+int need_resched()
+ Returns true if need_resched is set in the current task, false
+ otherwise.
+yield()
+ Place the current process at the end of the runqueue and call schedule.
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
new file mode 100644
index 0000000..eb471c7
--- /dev/null
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -0,0 +1,275 @@
+ =============
+ CFS Scheduler
+ =============
+
+
+1. OVERVIEW
+
+CFS stands for "Completely Fair Scheduler," and is the new "desktop" process
+scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the
+replacement for the previous vanilla scheduler's SCHED_OTHER interactivity
+code.
+
+80% of CFS's design can be summed up in a single sentence: CFS basically models
+an "ideal, precise multi-tasking CPU" on real hardware.
+
+"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% physical
+power and which can run each task at precise equal speed, in parallel, each at
+1/nr_running speed. For example: if there are 2 tasks running, then it runs
+each at 50% physical power --- i.e., actually in parallel.
+
+On real hardware, we can run only a single task at once, so we have to
+introduce the concept of "virtual runtime." The virtual runtime of a task
+specifies when its next timeslice would start execution on the ideal
+multi-tasking CPU described above. In practice, the virtual runtime of a task
+is its actual runtime normalized to the total number of running tasks.
+
+
+
+2. FEW IMPLEMENTATION DETAILS
+
+In CFS the virtual runtime is expressed and tracked via the per-task
+p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately
+timestamp and measure the "expected CPU time" a task should have gotten.
+
+[ small detail: on "ideal" hardware, at any time all tasks would have the same
+ p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
+ would ever get "out of balance" from the "ideal" share of CPU time. ]
+
+CFS's task picking logic is based on this p->se.vruntime value and it is thus
+very simple: it always tries to run the task with the smallest p->se.vruntime
+value (i.e., the task which executed least so far). CFS always tries to split
+up CPU time between runnable tasks as close to "ideal multitasking hardware" as
+possible.
+
+Most of the rest of CFS's design just falls out of this really simple concept,
+with a few add-on embellishments like nice levels, multiprocessing and various
+algorithm variants to recognize sleepers.
+
+
+
+3. THE RBTREE
+
+CFS's design is quite radical: it does not use the old data structures for the
+runqueues, but it uses a time-ordered rbtree to build a "timeline" of future
+task execution, and thus has no "array switch" artifacts (by which both the
+previous vanilla scheduler and RSDL/SD are affected).
+
+CFS also maintains the rq->cfs.min_vruntime value, which is a monotonic
+increasing value tracking the smallest vruntime among all tasks in the
+runqueue. The total amount of work done by the system is tracked using
+min_vruntime; that value is used to place newly activated entities on the left
+side of the tree as much as possible.
+
+The total number of running tasks in the runqueue is accounted through the
+rq->cfs.load value, which is the sum of the weights of the tasks queued on the
+runqueue.
+
+CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
+p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to
+account for possible wraparounds). CFS picks the "leftmost" task from this
+tree and sticks to it.
+As the system progresses forwards, the executed tasks are put into the tree
+more and more to the right --- slowly but surely giving a chance for every task
+to become the "leftmost task" and thus get on the CPU within a deterministic
+amount of time.
+
+Summing up, CFS works like this: it runs a task a bit, and when the task
+schedules (or a scheduler tick happens) the task's CPU usage is "accounted
+for": the (small) time it just spent using the physical CPU is added to
+p->se.vruntime. Once p->se.vruntime gets high enough so that another task
+becomes the "leftmost task" of the time-ordered rbtree it maintains (plus a
+small amount of "granularity" distance relative to the leftmost task so that we
+do not over-schedule tasks and trash the cache), then the new leftmost task is
+picked and the current task is preempted.
+
+
+
+4. SOME FEATURES OF CFS
+
+CFS uses nanosecond granularity accounting and does not rely on any jiffies or
+other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
+way the previous scheduler had, and has no heuristics whatsoever. There is
+only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
+
+ /proc/sys/kernel/sched_min_granularity_ns
+
+which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
+"server" (i.e., good batching) workloads. It defaults to a setting suitable
+for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too.
+
+Due to its design, the CFS scheduler is not prone to any of the "attacks" that
+exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c,
+chew.c, ring-test.c, massive_intr.c all work fine and do not impact
+interactivity and produce the expected behavior.
+
+The CFS scheduler has a much stronger handling of nice levels and SCHED_BATCH
+than the previous vanilla scheduler: both types of workloads are isolated much
+more aggressively.
+
+SMP load-balancing has been reworked/sanitized: the runqueue-walking
+assumptions are gone from the load-balancing code now, and iterators of the
+scheduling modules are used. The balancing code got quite a bit simpler as a
+result.
+
+
+
+5. Scheduling policies
+
+CFS implements three scheduling policies:
+
+ - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling
+ policy that is used for regular tasks.
+
+ - SCHED_BATCH: Does not preempt nearly as often as regular tasks
+ would, thereby allowing tasks to run longer and make better use of
+ caches but at the cost of interactivity. This is well suited for
+ batch jobs.
+
+ - SCHED_IDLE: This is even weaker than nice 19, but its not a true
+ idle timer scheduler in order to avoid to get into priority
+ inversion problems which would deadlock the machine.
+
+SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by
+POSIX.
+
+The command chrt from util-linux-ng 2.13.1.1 can set all of these except
+SCHED_IDLE.
+
+
+
+6. SCHEDULING CLASSES
+
+The new CFS scheduler has been designed in such a way to introduce "Scheduling
+Classes," an extensible hierarchy of scheduler modules. These modules
+encapsulate scheduling policy details and are handled by the scheduler core
+without the core code assuming too much about them.
+
+sched_fair.c implements the CFS scheduler described above.
+
+sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
+the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT
+priority levels, instead of 140 in the previous scheduler) and it needs no
+expired array.
+
+Scheduling classes are implemented through the sched_class structure, which
+contains hooks to functions that must be called whenever an interesting event
+occurs.
+
+This is the (partial) list of the hooks:
+
+ - enqueue_task(...)
+
+ Called when a task enters a runnable state.
+ It puts the scheduling entity (task) into the red-black tree and
+ increments the nr_running variable.
+
+ - dequeue_tree(...)
+
+ When a task is no longer runnable, this function is called to keep the
+ corresponding scheduling entity out of the red-black tree. It decrements
+ the nr_running variable.
+
+ - yield_task(...)
+
+ This function is basically just a dequeue followed by an enqueue, unless the
+ compat_yield sysctl is turned on; in that case, it places the scheduling
+ entity at the right-most end of the red-black tree.
+
+ - check_preempt_curr(...)
+
+ This function checks if a task that entered the runnable state should
+ preempt the currently running task.
+
+ - pick_next_task(...)
+
+ This function chooses the most appropriate task eligible to run next.
+
+ - set_curr_task(...)
+
+ This function is called when a task changes its scheduling class or changes
+ its task group.
+
+ - task_tick(...)
+
+ This function is mostly called from time tick functions; it might lead to
+ process switch. This drives the running preemption.
+
+ - task_new(...)
+
+ The core scheduler gives the scheduling module an opportunity to manage new
+ task startup. The CFS scheduling module uses it for group scheduling, while
+ the scheduling module for a real-time task does not use it.
+
+
+
+7. GROUP SCHEDULER EXTENSIONS TO CFS
+
+Normally, the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks and
+provide fair CPU time to each such task group. For example, it may be
+desirable to first provide fair CPU time to each user on the system and then to
+each task belonging to a user.
+
+CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be
+grouped and divides CPU time fairly among such groups.
+
+CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
+SCHED_RR) tasks.
+
+CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
+SCHED_BATCH) tasks.
+
+At present, there are two (mutually exclusive) mechanisms to group tasks for
+CPU bandwidth control purposes:
+
+ - Based on user id (CONFIG_USER_SCHED)
+
+ With this option, tasks are grouped according to their user id.
+
+ - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
+
+ This options needs CONFIG_CGROUPS to be defined, and lets the administrator
+ create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
+user and a "cpu_share" file is added in that directory.
+
+ # cd /sys/kernel/uids
+ # cat 512/cpu_share # Display user 512's CPU share
+ 1024
+ # echo 2048 > 512/cpu_share # Modify user 512's CPU share
+ # cat 512/cpu_share # Display user 512's CPU share
+ 2048
+ #
+
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example: if you would like user "root" to get twice the bandwidth of user
+"guest," then set the cpu_share for both the users such that "root"'s cpu_share
+is twice "guest"'s cpu_share.
+
+When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
+group created using the pseudo filesystem. See example steps below to create
+task groups and modify their CPU share using the "cgroups" pseudo filesystem.
+
+ # mkdir /dev/cpuctl
+ # mount -t cgroup -ocpu none /dev/cpuctl
+ # cd /dev/cpuctl
+
+ # mkdir multimedia # create "multimedia" group of tasks
+ # mkdir browser # create "browser" group of tasks
+
+ # #Configure the multimedia group to receive twice the CPU bandwidth
+ # #that of browser group
+
+ # echo 2048 > multimedia/cpu.shares
+ # echo 1024 > browser/cpu.shares
+
+ # firefox & # Launch firefox and move it to "browser" group
+ # echo <firefox_pid> > browser/tasks
+
+ # #Launch gmplayer (or your favourite movie player)
+ # echo <movie_player_pid> > multimedia/tasks
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
new file mode 100644
index 0000000..373ceac
--- /dev/null
+++ b/Documentation/scheduler/sched-domains.txt
@@ -0,0 +1,67 @@
+Each CPU has a "base" scheduling domain (struct sched_domain). These are
+accessed via cpu_sched_domain(i) and this_sched_domain() macros. The domain
+hierarchy is built from these base domains via the ->parent pointer. ->parent
+MUST be NULL terminated, and domain structures should be per-CPU as they
+are locklessly updated.
+
+Each scheduling domain spans a number of CPUs (stored in the ->span field).
+A domain's span MUST be a superset of it child's span (this restriction could
+be relaxed if the need arises), and a base domain for CPU i MUST span at least
+i. The top domain for each CPU will generally span all CPUs in the system
+although strictly it doesn't have to, but this could lead to a case where some
+CPUs will never be given tasks to run unless the CPUs allowed mask is
+explicitly set. A sched domain's span means "balance process load among these
+CPUs".
+
+Each scheduling domain must have one or more CPU groups (struct sched_group)
+which are organised as a circular one way linked list from the ->groups
+pointer. The union of cpumasks of these groups MUST be the same as the
+domain's span. The intersection of cpumasks from any two of these groups
+MUST be the empty set. The group pointed to by the ->groups pointer MUST
+contain the CPU to which the domain belongs. Groups may be shared among
+CPUs as they contain read only data after they have been set up.
+
+Balancing within a sched domain occurs between groups. That is, each group
+is treated as one entity. The load of a group is defined as the sum of the
+load of each of its member CPUs, and only when the load of a group becomes
+out of balance are tasks moved between groups.
+
+In kernel/sched.c, rebalance_tick is run periodically on each CPU. This
+function takes its CPU's base sched domain and checks to see if has reached
+its rebalance interval. If so, then it will run load_balance on that domain.
+rebalance_tick then checks the parent sched_domain (if it exists), and the
+parent of the parent and so forth.
+
+*** Implementing sched domains ***
+The "base" domain will "span" the first level of the hierarchy. In the case
+of SMT, you'll span all siblings of the physical CPU, with each group being
+a single virtual CPU.
+
+In SMP, the parent of the base domain will span all physical CPUs in the
+node. Each group being a single physical CPU. Then with NUMA, the parent
+of the SMP domain will span the entire machine, with each group having the
+cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
+might have just one domain covering its one NUMA level.
+
+The implementor should read comments in include/linux/sched.h:
+struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
+the specifics and what to tune.
+
+For SMT, the architecture must define CONFIG_SCHED_SMT and provide a
+cpumask_t cpu_sibling_map[NR_CPUS], where cpu_sibling_map[i] is the mask of
+all "i"'s siblings as well as "i" itself.
+
+Architectures may retain the regular override the default SD_*_INIT flags
+while using the generic domain builder in kernel/sched.c if they wish to
+retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
+can be done by #define'ing ARCH_HASH_SCHED_TUNE.
+
+Alternatively, the architecture may completely override the generic domain
+builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
+arch_init_sched_domains function. This function will attach domains to all
+CPUs using cpu_attach_domain.
+
+The sched-domains debugging infrastructure can be enabled by enabling
+CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
+which should catch most possible errors (described above). It also prints out
+the domain structure in a visual format.
diff --git a/Documentation/scheduler/sched-nice-design.txt b/Documentation/scheduler/sched-nice-design.txt
new file mode 100644
index 0000000..e2bae5a
--- /dev/null
+++ b/Documentation/scheduler/sched-nice-design.txt
@@ -0,0 +1,108 @@
+This document explains the thinking about the revamped and streamlined
+nice-levels implementation in the new Linux scheduler.
+
+Nice levels were always pretty weak under Linux and people continuously
+pestered us to make nice +19 tasks use up much less CPU time.
+
+Unfortunately that was not that easy to implement under the old
+scheduler, (otherwise we'd have done it long ago) because nice level
+support was historically coupled to timeslice length, and timeslice
+units were driven by the HZ tick, so the smallest timeslice was 1/HZ.
+
+In the O(1) scheduler (in 2003) we changed negative nice levels to be
+much stronger than they were before in 2.4 (and people were happy about
+that change), and we also intentionally calibrated the linear timeslice
+rule so that nice +19 level would be _exactly_ 1 jiffy. To better
+understand it, the timeslice graph went like this (cheesy ASCII art
+alert!):
+
+
+ A
+ \ | [timeslice length]
+ \ |
+ \ |
+ \ |
+ \ |
+ \|___100msecs
+ |^ . _
+ | ^ . _
+ | ^ . _
+ -*----------------------------------*-----> [nice level]
+ -20 | +19
+ |
+ |
+
+So that if someone wanted to really renice tasks, +19 would give a much
+bigger hit than the normal linear rule would do. (The solution of
+changing the ABI to extend priorities was discarded early on.)
+
+This approach worked to some degree for some time, but later on with
+HZ=1000 it caused 1 jiffy to be 1 msec, which meant 0.1% CPU usage which
+we felt to be a bit excessive. Excessive _not_ because it's too small of
+a CPU utilization, but because it causes too frequent (once per
+millisec) rescheduling. (and would thus trash the cache, etc. Remember,
+this was long ago when hardware was weaker and caches were smaller, and
+people were running number crunching apps at nice +19.)
+
+So for HZ=1000 we changed nice +19 to 5msecs, because that felt like the
+right minimal granularity - and this translates to 5% CPU utilization.
+But the fundamental HZ-sensitive property for nice+19 still remained,
+and we never got a single complaint about nice +19 being too _weak_ in
+terms of CPU utilization, we only got complaints about it (still) being
+too _strong_ :-)
+
+To sum it up: we always wanted to make nice levels more consistent, but
+within the constraints of HZ and jiffies and their nasty design level
+coupling to timeslices and granularity it was not really viable.
+
+The second (less frequent but still periodically occuring) complaint
+about Linux's nice level support was its assymetry around the origo
+(which you can see demonstrated in the picture above), or more
+accurately: the fact that nice level behavior depended on the _absolute_
+nice level as well, while the nice API itself is fundamentally
+"relative":
+
+ int nice(int inc);
+
+ asmlinkage long sys_nice(int increment)
+
+(the first one is the glibc API, the second one is the syscall API.)
+Note that the 'inc' is relative to the current nice level. Tools like
+bash's "nice" command mirror this relative API.
+
+With the old scheduler, if you for example started a niced task with +1
+and another task with +2, the CPU split between the two tasks would
+depend on the nice level of the parent shell - if it was at nice -10 the
+CPU split was different than if it was at +5 or +10.
+
+A third complaint against Linux's nice level support was that negative
+nice levels were not 'punchy enough', so lots of people had to resort to
+run audio (and other multimedia) apps under RT priorities such as
+SCHED_FIFO. But this caused other problems: SCHED_FIFO is not starvation
+proof, and a buggy SCHED_FIFO app can also lock up the system for good.
+
+The new scheduler in v2.6.23 addresses all three types of complaints:
+
+To address the first complaint (of nice levels being not "punchy"
+enough), the scheduler was decoupled from 'time slice' and HZ concepts
+(and granularity was made a separate concept from nice levels) and thus
+it was possible to implement better and more consistent nice +19
+support: with the new scheduler nice +19 tasks get a HZ-independent
+1.5%, instead of the variable 3%-5%-9% range they got in the old
+scheduler.
+
+To address the second complaint (of nice levels not being consistent),
+the new scheduler makes nice(1) have the same CPU utilization effect on
+tasks, regardless of their absolute nice levels. So on the new
+scheduler, running a nice +10 and a nice 11 task has the same CPU
+utilization "split" between them as running a nice -5 and a nice -4
+task. (one will get 55% of the CPU, the other 45%.) That is why nice
+levels were changed to be "multiplicative" (or exponential) - that way
+it does not matter which nice level you start out from, the 'relative
+result' will always be the same.
+
+The third complaint (of negative nice levels not being "punchy" enough
+and forcing audio apps to run under the more dangerous SCHED_FIFO
+scheduling policy) is addressed by the new scheduler almost
+automatically: stronger negative nice levels are an automatic
+side-effect of the recalibrated dynamic range of nice levels.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
new file mode 100644
index 0000000..3ef339f
--- /dev/null
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -0,0 +1,177 @@
+ Real-Time group scheduling
+ --------------------------
+
+CONTENTS
+========
+
+1. Overview
+ 1.1 The problem
+ 1.2 The solution
+2. The interface
+ 2.1 System-wide settings
+ 2.2 Default behaviour
+ 2.3 Basis for grouping tasks
+3. Future plans
+
+
+1. Overview
+===========
+
+
+1.1 The problem
+---------------
+
+Realtime scheduling is all about determinism, a group has to be able to rely on
+the amount of bandwidth (eg. CPU time) being constant. In order to schedule
+multiple groups of realtime tasks, each group must be assigned a fixed portion
+of the CPU time available. Without a minimum guarantee a realtime group can
+obviously fall short. A fuzzy upper limit is of no use since it cannot be
+relied upon. Which leaves us with just the single fixed portion.
+
+1.2 The solution
+----------------
+
+CPU time is divided by means of specifying how much time can be spent running
+in a given period. We allocate this "run time" for each realtime group which
+the other realtime groups will not be permitted to use.
+
+Any time not allocated to a realtime group will be used to run normal priority
+tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by
+SCHED_OTHER.
+
+Let's consider an example: a frame fixed realtime renderer must deliver 25
+frames a second, which yields a period of 0.04s per frame. Now say it will also
+have to play some music and respond to input, leaving it with around 80% CPU
+time dedicated for the graphics. We can then give this group a run time of 0.8
+* 0.04s = 0.032s.
+
+This way the graphics group will have a 0.04s period with a 0.032s run time
+limit. Now if the audio thread needs to refill the DMA buffer every 0.005s, but
+needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
+0.00015s. So this group can be scheduled with a period of 0.005s and a run time
+of 0.00015s.
+
+The remaining CPU time will be used for user input and other tasks. Because
+realtime tasks have explicitly allocated the CPU time they need to perform
+their tasks, buffer underruns in the graphics or audio can be eliminated.
+
+NOTE: the above example is not fully implemented as of yet (2.6.25). We still
+lack an EDF scheduler to make non-uniform periods usable.
+
+
+2. The Interface
+================
+
+
+2.1 System wide settings
+------------------------
+
+The system wide settings are configured under the /proc virtual file system:
+
+/proc/sys/kernel/sched_rt_period_us:
+ The scheduling period that is equivalent to 100% CPU bandwidth
+
+/proc/sys/kernel/sched_rt_runtime_us:
+ A global limit on how much time realtime scheduling may use. Even without
+ CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime
+ processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth
+ available to all realtime groups.
+
+ * Time is specified in us because the interface is s32. This gives an
+ operating range from 1us to about 35 minutes.
+ * sched_rt_period_us takes values from 1 to INT_MAX.
+ * sched_rt_runtime_us takes values from -1 to (INT_MAX - 1).
+ * A run time of -1 specifies runtime == period, ie. no limit.
+
+
+2.2 Default behaviour
+---------------------
+
+The default values for sched_rt_period_us (1000000 or 1s) and
+sched_rt_runtime_us (950000 or 0.95s). This gives 0.05s to be used by
+SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away
+realtime tasks will not lock up the machine but leave a little time to recover
+it. By setting runtime to -1 you'd get the old behaviour back.
+
+By default all bandwidth is assigned to the root group and new groups get the
+period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you
+want to assign bandwidth to another group, reduce the root group's bandwidth
+and assign some or all of the difference to another group.
+
+Realtime group scheduling means you have to assign a portion of total CPU
+bandwidth to the group before it will accept realtime tasks. Therefore you will
+not be able to run realtime tasks as any user other than root until you have
+done that, even if the user has the rights to run processes with realtime
+priority!
+
+
+2.3 Basis for grouping tasks
+----------------------------
+
+There are two compile-time settings for allocating CPU bandwidth. These are
+configured using the "Basis for grouping tasks" multiple choice menu under
+General setup > Group CPU Scheduler:
+
+a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id")
+
+This lets you use the virtual files under
+"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
+each user .
+
+The other option is:
+
+.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
+
+This uses the /cgroup virtual file system and "/cgroup/<cgroup>/cpu.rt_runtime_us"
+to control the CPU time reserved for each control group instead.
+
+For more information on working with control groups, you should read
+Documentation/cgroups.txt as well.
+
+Group settings are checked against the following limits in order to keep the configuration
+schedulable:
+
+ \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
+
+For now, this can be simplified to just the following (but see Future plans):
+
+ \Sum_{i} runtime_{i} <= global_runtime
+
+
+3. Future plans
+===============
+
+There is work in progress to make the scheduling period for each group
+("/sys/kernel/uids/<uid>/cpu_rt_period_us" or
+"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
+
+The constraint on the period is that a subgroup must have a smaller or
+equal period to its parent. But realistically its not very useful _yet_
+as its prone to starvation without deadline scheduling.
+
+Consider two sibling groups A and B; both have 50% bandwidth, but A's
+period is twice the length of B's.
+
+* group A: period=100000us, runtime=10000us
+ - this runs for 0.01s once every 0.1s
+
+* group B: period= 50000us, runtime=10000us
+ - this runs for 0.01s twice every 0.1s (or once every 0.05 sec).
+
+This means that currently a while (1) loop in A will run for the full period of
+B and can starve B's tasks (assuming they are of lower priority) for a whole
+period.
+
+The next project will be SCHED_EDF (Earliest Deadline First scheduling) to bring
+full deadline scheduling to the linux kernel. Deadline scheduling the above
+groups and treating end of the period as a deadline will ensure that they both
+get their allocated time.
+
+Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
+the biggest challenge as the current linux PI infrastructure is geared towards
+the limited static priority levels 0-139. With deadline scheduling you need to
+do deadline inheritance (since priority is inversely proportional to the
+deadline delta (deadline - now).
+
+This means the whole PI machinery will have to be reworked - and that is one of
+the most complex pieces of code we have.
diff --git a/Documentation/scheduler/sched-stats.txt b/Documentation/scheduler/sched-stats.txt
new file mode 100644
index 0000000..01e6940
--- /dev/null
+++ b/Documentation/scheduler/sched-stats.txt
@@ -0,0 +1,156 @@
+Version 14 of schedstats includes support for sched_domains, which hit the
+mainline kernel in 2.6.20 although it is identical to the stats from version
+12 which was in the kernel from 2.6.13-2.6.19 (version 13 never saw a kernel
+release). Some counters make more sense to be per-runqueue; other to be
+per-domain. Note that domains (and their associated information) will only
+be pertinent and available on machines utilizing CONFIG_SMP.
+
+In version 14 of schedstat, there is at least one level of domain
+statistics for each cpu listed, and there may well be more than one
+domain. Domains have no particular names in this implementation, but
+the highest numbered one typically arbitrates balancing across all the
+cpus on the machine, while domain0 is the most tightly focused domain,
+sometimes balancing only between pairs of cpus. At this time, there
+are no architectures which need more than three domain levels. The first
+field in the domain stats is a bit map indicating which cpus are affected
+by that domain.
+
+These fields are counters, and only increment. Programs which make use
+of these will need to start with a baseline observation and then calculate
+the change in the counters at each subsequent observation. A perl script
+which does this for many of the fields is available at
+
+ http://eaglet.rain.com/rick/linux/schedstat/
+
+Note that any such script will necessarily be version-specific, as the main
+reason to change versions is changes in the output format. For those wishing
+to write their own scripts, the fields are described here.
+
+CPU statistics
+--------------
+cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12
+
+NOTE: In the sched_yield() statistics, the active queue is considered empty
+ if it has only one process in it, since obviously the process calling
+ sched_yield() is that process.
+
+First four fields are sched_yield() statistics:
+ 1) # of times both the active and the expired queue were empty
+ 2) # of times just the active queue was empty
+ 3) # of times just the expired queue was empty
+ 4) # of times sched_yield() was called
+
+Next three are schedule() statistics:
+ 5) # of times we switched to the expired queue and reused it
+ 6) # of times schedule() was called
+ 7) # of times schedule() left the processor idle
+
+Next two are try_to_wake_up() statistics:
+ 8) # of times try_to_wake_up() was called
+ 9) # of times try_to_wake_up() was called to wake up the local cpu
+
+Next three are statistics describing scheduling latency:
+ 10) sum of all time spent running by tasks on this processor (in jiffies)
+ 11) sum of all time spent waiting to run by tasks on this processor (in
+ jiffies)
+ 12) # of timeslices run on this cpu
+
+
+Domain statistics
+-----------------
+One of these is produced per domain for each cpu described. (Note that if
+CONFIG_SMP is not defined, *no* domains are utilized and these lines
+will not appear in the output.)
+
+domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
+
+The first field is a bit mask indicating what cpus this domain operates over.
+
+The next 24 are a variety of load_balance() statistics in grouped into types
+of idleness (idle, busy, and newly idle):
+
+ 1) # of times in this domain load_balance() was called when the
+ cpu was idle
+ 2) # of times in this domain load_balance() checked but found
+ the load did not require balancing when the cpu was idle
+ 3) # of times in this domain load_balance() tried to move one or
+ more tasks and failed, when the cpu was idle
+ 4) sum of imbalances discovered (if any) with each call to
+ load_balance() in this domain when the cpu was idle
+ 5) # of times in this domain pull_task() was called when the cpu
+ was idle
+ 6) # of times in this domain pull_task() was called even though
+ the target task was cache-hot when idle
+ 7) # of times in this domain load_balance() was called but did
+ not find a busier queue while the cpu was idle
+ 8) # of times in this domain a busier queue was found while the
+ cpu was idle but no busier group was found
+
+ 9) # of times in this domain load_balance() was called when the
+ cpu was busy
+ 10) # of times in this domain load_balance() checked but found the
+ load did not require balancing when busy
+ 11) # of times in this domain load_balance() tried to move one or
+ more tasks and failed, when the cpu was busy
+ 12) sum of imbalances discovered (if any) with each call to
+ load_balance() in this domain when the cpu was busy
+ 13) # of times in this domain pull_task() was called when busy
+ 14) # of times in this domain pull_task() was called even though the
+ target task was cache-hot when busy
+ 15) # of times in this domain load_balance() was called but did not
+ find a busier queue while the cpu was busy
+ 16) # of times in this domain a busier queue was found while the cpu
+ was busy but no busier group was found
+
+ 17) # of times in this domain load_balance() was called when the
+ cpu was just becoming idle
+ 18) # of times in this domain load_balance() checked but found the
+ load did not require balancing when the cpu was just becoming idle
+ 19) # of times in this domain load_balance() tried to move one or more
+ tasks and failed, when the cpu was just becoming idle
+ 20) sum of imbalances discovered (if any) with each call to
+ load_balance() in this domain when the cpu was just becoming idle
+ 21) # of times in this domain pull_task() was called when newly idle
+ 22) # of times in this domain pull_task() was called even though the
+ target task was cache-hot when just becoming idle
+ 23) # of times in this domain load_balance() was called but did not
+ find a busier queue while the cpu was just becoming idle
+ 24) # of times in this domain a busier queue was found while the cpu
+ was just becoming idle but no busier group was found
+
+ Next three are active_load_balance() statistics:
+ 25) # of times active_load_balance() was called
+ 26) # of times active_load_balance() tried to move a task and failed
+ 27) # of times active_load_balance() successfully moved a task
+
+ Next three are sched_balance_exec() statistics:
+ 28) sbe_cnt is not used
+ 29) sbe_balanced is not used
+ 30) sbe_pushed is not used
+
+ Next three are sched_balance_fork() statistics:
+ 31) sbf_cnt is not used
+ 32) sbf_balanced is not used
+ 33) sbf_pushed is not used
+
+ Next three are try_to_wake_up() statistics:
+ 34) # of times in this domain try_to_wake_up() awoke a task that
+ last ran on a different cpu in this domain
+ 35) # of times in this domain try_to_wake_up() moved a task to the
+ waking cpu because it was cache-cold on its own cpu anyway
+ 36) # of times in this domain try_to_wake_up() started passive balancing
+
+/proc/<pid>/schedstat
+----------------
+schedstats also adds a new /proc/<pid>/schedstat file to include some of
+the same information on a per-process level. There are three fields in
+this file correlating for that process to:
+ 1) time spent on the cpu
+ 2) time spent waiting on a runqueue
+ 3) # of timeslices run on this cpu
+
+A program could be easily written to make use of these extra fields to
+report on how well a particular process or set of processes is faring
+under the scheduler's policies. A simple version of such a program is
+available at
+ http://eaglet.rain.com/rick/linux/schedstat/v12/latency.c
diff --git a/Documentation/scsi/00-INDEX b/Documentation/scsi/00-INDEX
new file mode 100644
index 0000000..c2e18e1
--- /dev/null
+++ b/Documentation/scsi/00-INDEX
@@ -0,0 +1,94 @@
+00-INDEX
+ - this file
+53c700.txt
+ - info on driver for 53c700 based adapters
+BusLogic.txt
+ - info on driver for adapters with BusLogic chips
+ChangeLog.1992-1997
+ - Changes to scsi files, if not listed elsewhere
+ChangeLog.arcmsr
+ - Changes to driver for ARECA's SATA RAID controller cards
+ChangeLog.ips
+ - IBM ServeRAID driver Changelog
+ChangeLog.lpfc
+ - Changes to lpfc driver
+ChangeLog.megaraid
+ - Changes to LSI megaraid controller.
+ChangeLog.megaraid_sas
+ - Changes to serial attached scsi version of LSI megaraid controller.
+ChangeLog.ncr53c8xx
+ - Changes to ncr53c8xx driver
+ChangeLog.sym53c8xx
+ - Changes to sym53c8xx driver
+ChangeLog.sym53c8xx_2
+ - Changes to second generation of sym53c8xx driver
+FlashPoint.txt
+ - info on driver for BusLogic FlashPoint adapters
+LICENSE.FlashPoint
+ - Licence of the Flashpoint driver
+LICENSE.qla2xxx
+ - License for QLogic Linux Fibre Channel HBA Driver firmware.
+Mylex.txt
+ - info on driver for Mylex adapters
+NinjaSCSI.txt
+ - info on WorkBiT NinjaSCSI-32/32Bi driver
+aacraid.txt
+ - Driver supporting Adaptec RAID controllers
+aha152x.txt
+ - info on driver for Adaptec AHA152x based adapters
+aic79xx.txt
+ - Adaptec Ultra320 SCSI host adapters
+aic7xxx.txt
+ - info on driver for Adaptec controllers
+aic7xxx_old.txt
+ - info on driver for Adaptec controllers, old generation
+arcmsr_spec.txt
+ - ARECA FIRMWARE SPEC (for IOP331 adapter)
+dc395x.txt
+ - README file for the dc395x SCSI driver
+dpti.txt
+ - info on driver for DPT SmartRAID and Adaptec I2O RAID based adapters
+dtc3x80.txt
+ - info on driver for DTC 2x80 based adapters
+g_NCR5380.txt
+ - info on driver for NCR5380 and NCR53c400 based adapters
+hptiop.txt
+ - HIGHPOINT ROCKETRAID 3xxx RAID DRIVER
+ibmmca.txt
+ - info on driver for IBM adapters with MCA bus
+in2000.txt
+ - info on in2000 driver
+libsas.txt
+ - Serial Attached SCSI management layer.
+lpfc.txt
+ - LPFC driver release notes
+megaraid.txt
+ - Common Management Module, shared code handling ioctls for LSI drivers
+ncr53c8xx.txt
+ - info on driver for NCR53c8xx based adapters
+osst.txt
+ - info on driver for OnStream SC-x0 SCSI tape
+ppa.txt
+ - info on driver for IOmega zip drive
+qlogicfas.txt
+ - info on driver for QLogic FASxxx based adapters
+scsi-changer.txt
+ - README for the SCSI media changer driver
+scsi-generic.txt
+ - info on the sg driver for generic (non-disk/CD/tape) SCSI devices.
+scsi.txt
+ - short blurb on using SCSI support as a module.
+scsi_mid_low_api.txt
+ - info on API between SCSI layer and low level drivers
+scsi_eh.txt
+ - info on SCSI midlayer error handling infrastructure
+scsi_fc_transport.txt
+ - SCSI Fiber Channel Tansport
+st.txt
+ - info on scsi tape driver
+sym53c500_cs.txt
+ - info on PCMCIA driver for Symbios Logic 53c500 based adapters
+sym53c8xx_2.txt
+ - info on second generation driver for sym53c8xx based adapters
+tmscsim.txt
+ - info on driver for AM53c974 based adapters
diff --git a/Documentation/scsi/53c700.txt b/Documentation/scsi/53c700.txt
new file mode 100644
index 0000000..0da681d
--- /dev/null
+++ b/Documentation/scsi/53c700.txt
@@ -0,0 +1,154 @@
+General Description
+===================
+
+This driver supports the 53c700 and 53c700-66 chips. It also supports
+the 53c710 but only in 53c700 emulation mode. It is full featured and
+does sync (-66 and 710 only), disconnects and tag command queueing.
+
+Since the 53c700 must be interfaced to a bus, you need to wrapper the
+card detector around this driver. For an example, see the
+NCR_D700.[ch] or lasi700.[ch] files.
+
+The comments in the 53c700.[ch] files tell you which parts you need to
+fill in to get the driver working.
+
+
+Compile Time Flags
+==================
+
+The driver may be either io mapped or memory mapped. This is
+selectable by configuration flags:
+
+CONFIG_53C700_MEM_MAPPED
+
+define if the driver is memory mapped.
+
+CONFIG_53C700_IO_MAPPED
+
+define if the driver is to be io mapped.
+
+One or other of the above flags *must* be defined.
+
+Other flags are:
+
+CONFIG_53C700_LE_ON_BE
+
+define if the chipset must be supported in little endian mode on a big
+endian architecture (used for the 700 on parisc).
+
+CONFIG_53C700_USE_CONSISTENT
+
+allocate consistent memory (should only be used if your architecture
+has a mixture of consistent and inconsistent memory). Fully
+consistent or fully inconsistent architectures should not define this.
+
+
+Using the Chip Core Driver
+==========================
+
+In order to plumb the 53c700 chip core driver into a working SCSI
+driver, you need to know three things about the way the chip is wired
+into your system (or expansion card).
+
+1. The clock speed of the SCSI core
+2. The interrupt line used
+3. The memory (or io space) location of the 53c700 registers.
+
+Optionally, you may also need to know other things, like how to read
+the SCSI Id from the card bios or whether the chip is wired for
+differential operation.
+
+Usually you can find items 2. and 3. from general spec. documents or
+even by examining the configuration of a working driver under another
+operating system.
+
+The clock speed is usually buried deep in the technical literature.
+It is required because it is used to set up both the synchronous and
+asynchronous dividers for the chip. As a general rule of thumb,
+manufacturers set the clock speed at the lowest possible setting
+consistent with the best operation of the chip (although some choose
+to drive it off the CPU or bus clock rather than going to the expense
+of an extra clock chip). The best operation clock speeds are:
+
+53c700 - 25MHz
+53c700-66 - 50MHz
+53c710 - 40Mhz
+
+Writing Your Glue Driver
+========================
+
+This will be a standard SCSI driver (I don't know of a good document
+describing this, just copy from some other driver) with at least a
+detect and release entry.
+
+In the detect routine, you need to allocate a struct
+NCR_700_Host_Parameters sized memory area and clear it (so that the
+default values for everything are 0). Then you must fill in the
+parameters that matter to you (see below), plumb the NCR_700_intr
+routine into the interrupt line and call NCR_700_detect with the host
+template and the new parameters as arguments. You should also call
+the relevant request_*_region function and place the register base
+address into the `base' pointer of the host parameters.
+
+In the release routine, you must free the NCR_700_Host_Parameters that
+you allocated, call the corresponding release_*_region and free the
+interrupt.
+
+Handling Interrupts
+-------------------
+
+In general, you should just plumb the card's interrupt line in with
+
+request_irq(irq, NCR_700_intr, <irq flags>, <driver name>, host);
+
+where host is the return from the relevant NCR_700_detect() routine.
+
+You may also write your own interrupt handling routine which calls
+NCR_700_intr() directly. However, you should only really do this if
+you have a card with more than one chip on it and you can read a
+register to tell which set of chips wants the interrupt.
+
+Settable NCR_700_Host_Parameters
+--------------------------------
+
+The following are a list of the user settable parameters:
+
+clock: (MANDATORY)
+
+Set to the clock speed of the chip in MHz.
+
+base: (MANDATORY)
+
+set to the base of the io or mem region for the register set. On 64
+bit architectures this is only 32 bits wide, so the registers must be
+mapped into the low 32 bits of memory.
+
+pci_dev: (OPTIONAL)
+
+set to the PCI board device. Leave NULL for a non-pci board. This is
+used for the pci_alloc_consistent() and pci_map_*() functions.
+
+dmode_extra: (OPTIONAL, 53c710 only)
+
+extra flags for the DMODE register. These are used to control bus
+output pins on the 710. The settings should be a combination of
+DMODE_FC1 and DMODE_FC2. What these pins actually do is entirely up
+to the board designer. Usually it is safe to ignore this setting.
+
+differential: (OPTIONAL)
+
+set to 1 if the chip drives a differential bus.
+
+force_le_on_be: (OPTIONAL, only if CONFIG_53C700_LE_ON_BE is set)
+
+set to 1 if the chip is operating in little endian mode on a big
+endian architecture.
+
+chip710: (OPTIONAL)
+
+set to 1 if the chip is a 53c710.
+
+burst_disable: (OPTIONAL, 53c710 only)
+
+disable 8 byte bursting for DMA transfers.
+
diff --git a/Documentation/scsi/BusLogic.txt b/Documentation/scsi/BusLogic.txt
new file mode 100644
index 0000000..98023ba
--- /dev/null
+++ b/Documentation/scsi/BusLogic.txt
@@ -0,0 +1,566 @@
+ BusLogic MultiMaster and FlashPoint SCSI Driver for Linux
+
+ Version 2.0.15 for Linux 2.0
+ Version 2.1.15 for Linux 2.1
+
+ PRODUCTION RELEASE
+
+ 17 August 1998
+
+ Leonard N. Zubkoff
+ Dandelion Digital
+ lnz@dandelion.com
+
+ Copyright 1995-1998 by Leonard N. Zubkoff <lnz@dandelion.com>
+
+
+ INTRODUCTION
+
+BusLogic, Inc. designed and manufactured a variety of high performance SCSI
+host adapters which share a common programming interface across a diverse
+collection of bus architectures by virtue of their MultiMaster ASIC technology.
+BusLogic was acquired by Mylex Corporation in February 1996, but the products
+supported by this driver originated under the BusLogic name and so that name is
+retained in the source code and documentation.
+
+This driver supports all present BusLogic MultiMaster Host Adapters, and should
+support any future MultiMaster designs with little or no modification. More
+recently, BusLogic introduced the FlashPoint Host Adapters, which are less
+costly and rely on the host CPU, rather than including an onboard processor.
+Despite not having an onboard CPU, the FlashPoint Host Adapters perform very
+well and have very low command latency. BusLogic has recently provided me with
+the FlashPoint Driver Developer's Kit, which comprises documentation and freely
+redistributable source code for the FlashPoint SCCB Manager. The SCCB Manager
+is the library of code that runs on the host CPU and performs functions
+analogous to the firmware on the MultiMaster Host Adapters. Thanks to their
+having provided the SCCB Manager, this driver now supports the FlashPoint Host
+Adapters as well.
+
+My primary goals in writing this completely new BusLogic driver for Linux are
+to achieve the full performance that BusLogic SCSI Host Adapters and modern
+SCSI peripherals are capable of, and to provide a highly robust driver that can
+be depended upon for high performance mission critical applications. All of
+the major performance features can be configured from the Linux kernel command
+line or at module initialization time, allowing individual installations to
+tune driver performance and error recovery to their particular needs.
+
+The latest information on Linux support for BusLogic SCSI Host Adapters, as
+well as the most recent release of this driver and the latest firmware for the
+BT-948/958/958D, will always be available from my Linux Home Page at URL
+"http://www.dandelion.com/Linux/".
+
+Bug reports should be sent via electronic mail to "lnz@dandelion.com". Please
+include with the bug report the complete configuration messages reported by the
+driver and SCSI subsystem at startup, along with any subsequent system messages
+relevant to SCSI operations, and a detailed description of your system's
+hardware configuration.
+
+Mylex has been an excellent company to work with and I highly recommend their
+products to the Linux community. In November 1995, I was offered the
+opportunity to become a beta test site for their latest MultiMaster product,
+the BT-948 PCI Ultra SCSI Host Adapter, and then again for the BT-958 PCI Wide
+Ultra SCSI Host Adapter in January 1996. This was mutually beneficial since
+Mylex received a degree and kind of testing that their own testing group cannot
+readily achieve, and the Linux community has available high performance host
+adapters that have been well tested with Linux even before being brought to
+market. This relationship has also given me the opportunity to interact
+directly with their technical staff, to understand more about the internal
+workings of their products, and in turn to educate them about the needs and
+potential of the Linux community.
+
+More recently, Mylex has reaffirmed the company's interest in supporting the
+Linux community, and I am now working on a Linux driver for the DAC960 PCI RAID
+Controllers. Mylex's interest and support is greatly appreciated.
+
+Unlike some other vendors, if you contact Mylex Technical Support with a
+problem and are running Linux, they will not tell you that your use of their
+products is unsupported. Their latest product marketing literature even states
+"Mylex SCSI host adapters are compatible with all major operating systems
+including: ... Linux ...".
+
+Mylex Corporation is located at 34551 Ardenwood Blvd., Fremont, California
+94555, USA and can be reached at 510/796-6100 or on the World Wide Web at
+http://www.mylex.com. Mylex HBA Technical Support can be reached by electronic
+mail at techsup@mylex.com, by Voice at 510/608-2400, or by FAX at 510/745-7715.
+Contact information for offices in Europe and Japan is available on the Web
+site.
+
+
+ DRIVER FEATURES
+
+o Configuration Reporting and Testing
+
+ During system initialization, the driver reports extensively on the host
+ adapter hardware configuration, including the synchronous transfer parameters
+ requested and negotiated with each target device. AutoSCSI settings for
+ Synchronous Negotiation, Wide Negotiation, and Disconnect/Reconnect are
+ reported for each target device, as well as the status of Tagged Queuing.
+ If the same setting is in effect for all target devices, then a single word
+ or phrase is used; otherwise, a letter is provided for each target device to
+ indicate the individual status. The following examples
+ should clarify this reporting format:
+
+ Synchronous Negotiation: Ultra
+
+ Synchronous negotiation is enabled for all target devices and the host
+ adapter will attempt to negotiate for 20.0 mega-transfers/second.
+
+ Synchronous Negotiation: Fast
+
+ Synchronous negotiation is enabled for all target devices and the host
+ adapter will attempt to negotiate for 10.0 mega-transfers/second.
+
+ Synchronous Negotiation: Slow
+
+ Synchronous negotiation is enabled for all target devices and the host
+ adapter will attempt to negotiate for 5.0 mega-transfers/second.
+
+ Synchronous Negotiation: Disabled
+
+ Synchronous negotiation is disabled and all target devices are limited to
+ asynchronous operation.
+
+ Synchronous Negotiation: UFSNUUU#UUUUUUUU
+
+ Synchronous negotiation to Ultra speed is enabled for target devices 0
+ and 4 through 15, to Fast speed for target device 1, to Slow speed for
+ target device 2, and is not permitted to target device 3. The host
+ adapter's SCSI ID is represented by the "#".
+
+ The status of Wide Negotiation, Disconnect/Reconnect, and Tagged Queuing
+ are reported as "Enabled", Disabled", or a sequence of "Y" and "N" letters.
+
+o Performance Features
+
+ BusLogic SCSI Host Adapters directly implement SCSI-2 Tagged Queuing, and so
+ support has been included in the driver to utilize tagged queuing with any
+ target devices that report having the tagged queuing capability. Tagged
+ queuing allows for multiple outstanding commands to be issued to each target
+ device or logical unit, and can improve I/O performance substantially. In
+ addition, BusLogic's Strict Round Robin Mode is used to optimize host adapter
+ performance, and scatter/gather I/O can support as many segments as can be
+ effectively utilized by the Linux I/O subsystem. Control over the use of
+ tagged queuing for each target device as well as individual selection of the
+ tagged queue depth is available through driver options provided on the kernel
+ command line or at module initialization time. By default, the queue depth
+ is determined automatically based on the host adapter's total queue depth and
+ the number, type, speed, and capabilities of the target devices found. In
+ addition, tagged queuing is automatically disabled whenever the host adapter
+ firmware version is known not to implement it correctly, or whenever a tagged
+ queue depth of 1 is selected. Tagged queuing is also disabled for individual
+ target devices if disconnect/reconnect is disabled for that device.
+
+o Robustness Features
+
+ The driver implements extensive error recovery procedures. When the higher
+ level parts of the SCSI subsystem request that a timed out command be reset,
+ a selection is made between a full host adapter hard reset and SCSI bus reset
+ versus sending a bus device reset message to the individual target device
+ based on the recommendation of the SCSI subsystem. Error recovery strategies
+ are selectable through driver options individually for each target device,
+ and also include sending a bus device reset to the specific target device
+ associated with the command being reset, as well as suppressing error
+ recovery entirely to avoid perturbing an improperly functioning device. If
+ the bus device reset error recovery strategy is selected and sending a bus
+ device reset does not restore correct operation, the next command that is
+ reset will force a full host adapter hard reset and SCSI bus reset. SCSI bus
+ resets caused by other devices and detected by the host adapter are also
+ handled by issuing a soft reset to the host adapter and re-initialization.
+ Finally, if tagged queuing is active and more than one command reset occurs
+ in a 10 minute interval, or if a command reset occurs within the first 10
+ minutes of operation, then tagged queuing will be disabled for that target
+ device. These error recovery options improve overall system robustness by
+ preventing individual errant devices from causing the system as a whole to
+ lock up or crash, and thereby allowing a clean shutdown and restart after the
+ offending component is removed.
+
+o PCI Configuration Support
+
+ On PCI systems running kernels compiled with PCI BIOS support enabled, this
+ driver will interrogate the PCI configuration space and use the I/O port
+ addresses assigned by the system BIOS, rather than the ISA compatible I/O
+ port addresses. The ISA compatible I/O port address is then disabled by the
+ driver. On PCI systems it is also recommended that the AutoSCSI utility be
+ used to disable the ISA compatible I/O port entirely as it is not necessary.
+ The ISA compatible I/O port is disabled by default on the BT-948/958/958D.
+
+o /proc File System Support
+
+ Copies of the host adapter configuration information together with updated
+ data transfer and error recovery statistics are available through the
+ /proc/scsi/BusLogic/<N> interface.
+
+o Shared Interrupts Support
+
+ On systems that support shared interrupts, any number of BusLogic Host
+ Adapters may share the same interrupt request channel.
+
+
+ SUPPORTED HOST ADAPTERS
+
+The following list comprises the supported BusLogic SCSI Host Adapters as of
+the date of this document. It is recommended that anyone purchasing a BusLogic
+Host Adapter not in the following table contact the author beforehand to verify
+that it is or will be supported.
+
+FlashPoint Series PCI Host Adapters:
+
+FlashPoint LT (BT-930) Ultra SCSI-3
+FlashPoint LT (BT-930R) Ultra SCSI-3 with RAIDPlus
+FlashPoint LT (BT-920) Ultra SCSI-3 (BT-930 without BIOS)
+FlashPoint DL (BT-932) Dual Channel Ultra SCSI-3
+FlashPoint DL (BT-932R) Dual Channel Ultra SCSI-3 with RAIDPlus
+FlashPoint LW (BT-950) Wide Ultra SCSI-3
+FlashPoint LW (BT-950R) Wide Ultra SCSI-3 with RAIDPlus
+FlashPoint DW (BT-952) Dual Channel Wide Ultra SCSI-3
+FlashPoint DW (BT-952R) Dual Channel Wide Ultra SCSI-3 with RAIDPlus
+
+MultiMaster "W" Series Host Adapters:
+
+BT-948 PCI Ultra SCSI-3
+BT-958 PCI Wide Ultra SCSI-3
+BT-958D PCI Wide Differential Ultra SCSI-3
+
+MultiMaster "C" Series Host Adapters:
+
+BT-946C PCI Fast SCSI-2
+BT-956C PCI Wide Fast SCSI-2
+BT-956CD PCI Wide Differential Fast SCSI-2
+BT-445C VLB Fast SCSI-2
+BT-747C EISA Fast SCSI-2
+BT-757C EISA Wide Fast SCSI-2
+BT-757CD EISA Wide Differential Fast SCSI-2
+BT-545C ISA Fast SCSI-2
+BT-540CF ISA Fast SCSI-2
+
+MultiMaster "S" Series Host Adapters:
+
+BT-445S VLB Fast SCSI-2
+BT-747S EISA Fast SCSI-2
+BT-747D EISA Differential Fast SCSI-2
+BT-757S EISA Wide Fast SCSI-2
+BT-757D EISA Wide Differential Fast SCSI-2
+BT-545S ISA Fast SCSI-2
+BT-542D ISA Differential Fast SCSI-2
+BT-742A EISA SCSI-2 (742A revision H)
+BT-542B ISA SCSI-2 (542B revision H)
+
+MultiMaster "A" Series Host Adapters:
+
+BT-742A EISA SCSI-2 (742A revisions A - G)
+BT-542B ISA SCSI-2 (542B revisions A - G)
+
+AMI FastDisk Host Adapters that are true BusLogic MultiMaster clones are also
+supported by this driver.
+
+BusLogic SCSI Host Adapters are available packaged both as bare boards and as
+retail kits. The BT- model numbers above refer to the bare board packaging.
+The retail kit model numbers are found by replacing BT- with KT- in the above
+list. The retail kit includes the bare board and manual as well as cabling and
+driver media and documentation that are not provided with bare boards.
+
+
+ FLASHPOINT INSTALLATION NOTES
+
+o RAIDPlus Support
+
+ FlashPoint Host Adapters now include RAIDPlus, Mylex's bootable software
+ RAID. RAIDPlus is not supported on Linux, and there are no plans to support
+ it. The MD driver in Linux 2.0 provides for concatenation (LINEAR) and
+ striping (RAID-0), and support for mirroring (RAID-1), fixed parity (RAID-4),
+ and distributed parity (RAID-5) is available separately. The built-in Linux
+ RAID support is generally more flexible and is expected to perform better
+ than RAIDPlus, so there is little impetus to include RAIDPlus support in the
+ BusLogic driver.
+
+o Enabling UltraSCSI Transfers
+
+ FlashPoint Host Adapters ship with their configuration set to "Factory
+ Default" settings that are conservative and do not allow for UltraSCSI speed
+ to be negotiated. This results in fewer problems when these host adapters
+ are installed in systems with cabling or termination that is not sufficient
+ for UltraSCSI operation, or where existing SCSI devices do not properly
+ respond to synchronous transfer negotiation for UltraSCSI speed. AutoSCSI
+ may be used to load "Optimum Performance" settings which allow UltraSCSI
+ speed to be negotiated with all devices, or UltraSCSI speed can be enabled on
+ an individual basis. It is recommended that SCAM be manually disabled after
+ the "Optimum Performance" settings are loaded.
+
+
+ BT-948/958/958D INSTALLATION NOTES
+
+The BT-948/958/958D PCI Ultra SCSI Host Adapters have some features which may
+require attention in some circumstances when installing Linux.
+
+o PCI I/O Port Assignments
+
+ When configured to factory default settings, the BT-948/958/958D will only
+ recognize the PCI I/O port assignments made by the motherboard's PCI BIOS.
+ The BT-948/958/958D will not respond to any of the ISA compatible I/O ports
+ that previous BusLogic SCSI Host Adapters respond to. This driver supports
+ the PCI I/O port assignments, so this is the preferred configuration.
+ However, if the obsolete BusLogic driver must be used for any reason, such as
+ a Linux distribution that does not yet use this driver in its boot kernel,
+ BusLogic has provided an AutoSCSI configuration option to enable a legacy ISA
+ compatible I/O port.
+
+ To enable this backward compatibility option, invoke the AutoSCSI utility via
+ Ctrl-B at system startup and select "Adapter Configuration", "View/Modify
+ Configuration", and then change the "ISA Compatible Port" setting from
+ "Disable" to "Primary" or "Alternate". Once this driver has been installed,
+ the "ISA Compatible Port" option should be set back to "Disable" to avoid
+ possible future I/O port conflicts. The older BT-946C/956C/956CD also have
+ this configuration option, but the factory default setting is "Primary".
+
+o PCI Slot Scanning Order
+
+ In systems with multiple BusLogic PCI Host Adapters, the order in which the
+ PCI slots are scanned may appear reversed with the BT-948/958/958D as
+ compared to the BT-946C/956C/956CD. For booting from a SCSI disk to work
+ correctly, it is necessary that the host adapter's BIOS and the kernel agree
+ on which disk is the boot device, which requires that they recognize the PCI
+ host adapters in the same order. The motherboard's PCI BIOS provides a
+ standard way of enumerating the PCI host adapters, which is used by the Linux
+ kernel. Some PCI BIOS implementations enumerate the PCI slots in order of
+ increasing bus number and device number, while others do so in the opposite
+ direction.
+
+ Unfortunately, Microsoft decided that Windows 95 would always enumerate the
+ PCI slots in order of increasing bus number and device number regardless of
+ the PCI BIOS enumeration, and requires that their scheme be supported by the
+ host adapter's BIOS to receive Windows 95 certification. Therefore, the
+ factory default settings of the BT-948/958/958D enumerate the host adapters
+ by increasing bus number and device number. To disable this feature, invoke
+ the AutoSCSI utility via Ctrl-B at system startup and select "Adapter
+ Configuration", "View/Modify Configuration", press Ctrl-F10, and then change
+ the "Use Bus And Device # For PCI Scanning Seq." option to OFF.
+
+ This driver will interrogate the setting of the PCI Scanning Sequence option
+ so as to recognize the host adapters in the same order as they are enumerated
+ by the host adapter's BIOS.
+
+o Enabling UltraSCSI Transfers
+
+ The BT-948/958/958D ship with their configuration set to "Factory Default"
+ settings that are conservative and do not allow for UltraSCSI speed to be
+ negotiated. This results in fewer problems when these host adapters are
+ installed in systems with cabling or termination that is not sufficient for
+ UltraSCSI operation, or where existing SCSI devices do not properly respond
+ to synchronous transfer negotiation for UltraSCSI speed. AutoSCSI may be
+ used to load "Optimum Performance" settings which allow UltraSCSI speed to be
+ negotiated with all devices, or UltraSCSI speed can be enabled on an
+ individual basis. It is recommended that SCAM be manually disabled after the
+ "Optimum Performance" settings are loaded.
+
+
+ DRIVER OPTIONS
+
+BusLogic Driver Options may be specified either via the Linux Kernel Command
+Line or via the Loadable Kernel Module Installation Facility. Driver Options
+for multiple host adapters may be specified either by separating the option
+strings by a semicolon, or by specifying multiple "BusLogic=" strings on the
+command line. Individual option specifications for a single host adapter are
+separated by commas. The Probing and Debugging Options apply to all host
+adapters whereas the remaining options apply individually only to the
+selected host adapter.
+
+The BusLogic Driver Probing Options comprise the following:
+
+IO:<integer>
+
+ The "IO:" option specifies an ISA I/O Address to be probed for a non-PCI
+ MultiMaster Host Adapter. If neither "IO:" nor "NoProbeISA" options are
+ specified, then the standard list of BusLogic MultiMaster ISA I/O Addresses
+ will be probed (0x330, 0x334, 0x230, 0x234, 0x130, and 0x134). Multiple
+ "IO:" options may be specified to precisely determine the I/O Addresses to
+ be probed, but the probe order will always follow the standard list.
+
+NoProbe
+
+ The "NoProbe" option disables all probing and therefore no BusLogic Host
+ Adapters will be detected.
+
+NoProbeISA
+
+ The "NoProbeISA" option disables probing of the standard BusLogic ISA I/O
+ Addresses and therefore only PCI MultiMaster and FlashPoint Host Adapters
+ will be detected.
+
+NoProbePCI
+
+ The "NoProbePCI" options disables the interrogation of PCI Configuration
+ Space and therefore only ISA Multimaster Host Adapters will be detected, as
+ well as PCI Multimaster Host Adapters that have their ISA Compatible I/O
+ Port set to "Primary" or "Alternate".
+
+NoSortPCI
+
+ The "NoSortPCI" option forces PCI MultiMaster Host Adapters to be
+ enumerated in the order provided by the PCI BIOS, ignoring any setting of
+ the AutoSCSI "Use Bus And Device # For PCI Scanning Seq." option.
+
+MultiMasterFirst
+
+ The "MultiMasterFirst" option forces MultiMaster Host Adapters to be probed
+ before FlashPoint Host Adapters. By default, if both FlashPoint and PCI
+ MultiMaster Host Adapters are present, this driver will probe for
+ FlashPoint Host Adapters first unless the BIOS primary disk is controlled
+ by the first PCI MultiMaster Host Adapter, in which case MultiMaster Host
+ Adapters will be probed first.
+
+FlashPointFirst
+
+ The "FlashPointFirst" option forces FlashPoint Host Adapters to be probed
+ before MultiMaster Host Adapters.
+
+The BusLogic Driver Tagged Queuing Options allow for explicitly specifying
+the Queue Depth and whether Tagged Queuing is permitted for each Target
+Device (assuming that the Target Device supports Tagged Queuing). The Queue
+Depth is the number of SCSI Commands that are allowed to be concurrently
+presented for execution (either to the Host Adapter or Target Device). Note
+that explicitly enabling Tagged Queuing may lead to problems; the option to
+enable or disable Tagged Queuing is provided primarily to allow disabling
+Tagged Queuing on Target Devices that do not implement it correctly. The
+following options are available:
+
+QueueDepth:<integer>
+
+ The "QueueDepth:" or QD:" option specifies the Queue Depth to use for all
+ Target Devices that support Tagged Queuing, as well as the maximum Queue
+ Depth for devices that do not support Tagged Queuing. If no Queue Depth
+ option is provided, the Queue Depth will be determined automatically based
+ on the Host Adapter's Total Queue Depth and the number, type, speed, and
+ capabilities of the detected Target Devices. For Host Adapters that
+ require ISA Bounce Buffers, the Queue Depth is automatically set by default
+ to BusLogic_TaggedQueueDepthBB or BusLogic_UntaggedQueueDepthBB to avoid
+ excessive preallocation of DMA Bounce Buffer memory. Target Devices that
+ do not support Tagged Queuing always have their Queue Depth set to
+ BusLogic_UntaggedQueueDepth or BusLogic_UntaggedQueueDepthBB, unless a
+ lower Queue Depth option is provided. A Queue Depth of 1 automatically
+ disables Tagged Queuing.
+
+QueueDepth:[<integer>,<integer>...]
+
+ The "QueueDepth:[...]" or "QD:[...]" option specifies the Queue Depth
+ individually for each Target Device. If an <integer> is omitted, the
+ associated Target Device will have its Queue Depth selected automatically.
+
+TaggedQueuing:Default
+
+ The "TaggedQueuing:Default" or "TQ:Default" option permits Tagged Queuing
+ based on the firmware version of the BusLogic Host Adapter and based on
+ whether the Queue Depth allows queuing multiple commands.
+
+TaggedQueuing:Enable
+
+ The "TaggedQueuing:Enable" or "TQ:Enable" option enables Tagged Queuing for
+ all Target Devices on this Host Adapter, overriding any limitation that
+ would otherwise be imposed based on the Host Adapter firmware version.
+
+TaggedQueuing:Disable
+
+ The "TaggedQueuing:Disable" or "TQ:Disable" option disables Tagged Queuing
+ for all Target Devices on this Host Adapter.
+
+TaggedQueuing:<Target-Spec>
+
+ The "TaggedQueuing:<Target-Spec>" or "TQ:<Target-Spec>" option controls
+ Tagged Queuing individually for each Target Device. <Target-Spec> is a
+ sequence of "Y", "N", and "X" characters. "Y" enables Tagged Queuing, "N"
+ disables Tagged Queuing, and "X" accepts the default based on the firmware
+ version. The first character refers to Target Device 0, the second to
+ Target Device 1, and so on; if the sequence of "Y", "N", and "X" characters
+ does not cover all the Target Devices, unspecified characters are assumed
+ to be "X".
+
+The BusLogic Driver Miscellaneous Options comprise the following:
+
+BusSettleTime:<seconds>
+
+ The "BusSettleTime:" or "BST:" option specifies the Bus Settle Time in
+ seconds. The Bus Settle Time is the amount of time to wait between a Host
+ Adapter Hard Reset which initiates a SCSI Bus Reset and issuing any SCSI
+ Commands. If unspecified, it defaults to BusLogic_DefaultBusSettleTime.
+
+InhibitTargetInquiry
+
+ The "InhibitTargetInquiry" option inhibits the execution of an Inquire
+ Target Devices or Inquire Installed Devices command on MultiMaster Host
+ Adapters. This may be necessary with some older Target Devices that do not
+ respond correctly when Logical Units above 0 are addressed.
+
+The BusLogic Driver Debugging Options comprise the following:
+
+TraceProbe
+
+ The "TraceProbe" option enables tracing of Host Adapter Probing.
+
+TraceHardwareReset
+
+ The "TraceHardwareReset" option enables tracing of Host Adapter Hardware
+ Reset.
+
+TraceConfiguration
+
+ The "TraceConfiguration" option enables tracing of Host Adapter
+ Configuration.
+
+TraceErrors
+
+ The "TraceErrors" option enables tracing of SCSI Commands that return an
+ error from the Target Device. The CDB and Sense Data will be printed for
+ each SCSI Command that fails.
+
+Debug
+
+ The "Debug" option enables all debugging options.
+
+The following examples demonstrate setting the Queue Depth for Target Devices
+1 and 2 on the first host adapter to 7 and 15, the Queue Depth for all Target
+Devices on the second host adapter to 31, and the Bus Settle Time on the
+second host adapter to 30 seconds.
+
+Linux Kernel Command Line:
+
+ linux BusLogic=QueueDepth:[,7,15];QueueDepth:31,BusSettleTime:30
+
+LILO Linux Boot Loader (in /etc/lilo.conf):
+
+ append = "BusLogic=QueueDepth:[,7,15];QueueDepth:31,BusSettleTime:30"
+
+INSMOD Loadable Kernel Module Installation Facility:
+
+ insmod BusLogic.o \
+ 'BusLogic="QueueDepth:[,7,15];QueueDepth:31,BusSettleTime:30"'
+
+NOTE: Module Utilities 2.1.71 or later is required for correct parsing
+ of driver options containing commas.
+
+
+ DRIVER INSTALLATION
+
+This distribution was prepared for Linux kernel version 2.0.35, but should be
+compatible with 2.0.4 or any later 2.0 series kernel.
+
+To install the new BusLogic SCSI driver, you may use the following commands,
+replacing "/usr/src" with wherever you keep your Linux kernel source tree:
+
+ cd /usr/src
+ tar -xvzf BusLogic-2.0.15.tar.gz
+ mv README.* LICENSE.* BusLogic.[ch] FlashPoint.c linux/drivers/scsi
+ patch -p0 < BusLogic.patch (only for 2.0.33 and below)
+ cd linux
+ make config
+ make zImage
+
+Then install "arch/i386/boot/zImage" as your standard kernel, run lilo if
+appropriate, and reboot.
+
+
+ BUSLOGIC ANNOUNCEMENTS MAILING LIST
+
+The BusLogic Announcements Mailing List provides a forum for informing Linux
+users of new driver releases and other announcements regarding Linux support
+for BusLogic SCSI Host Adapters. To join the mailing list, send a message to
+"buslogic-announce-request@dandelion.com" with the line "subscribe" in the
+message body.
diff --git a/Documentation/scsi/ChangeLog.1992-1997 b/Documentation/scsi/ChangeLog.1992-1997
new file mode 100644
index 0000000..6faad7e
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.1992-1997
@@ -0,0 +1,2023 @@
+Sat Jan 18 15:51:45 1997 Richard Henderson <rth@tamu.edu>
+
+ * Don't play with usage_count directly, instead hand around
+ the module header and use the module macros.
+
+Fri May 17 00:00:00 1996 Leonard N. Zubkoff <lnz@dandelion.com>
+
+ * BusLogic Driver Version 2.0.3 Released.
+
+Tue Apr 16 21:00:00 1996 Leonard N. Zubkoff <lnz@dandelion.com>
+
+ * BusLogic Driver Version 1.3.2 Released.
+
+Sun Dec 31 23:26:00 1995 Leonard N. Zubkoff <lnz@dandelion.com>
+
+ * BusLogic Driver Version 1.3.1 Released.
+
+Fri Nov 10 15:29:49 1995 Leonard N. Zubkoff <lnz@dandelion.com>
+
+ * Released new BusLogic driver.
+
+Wed Aug 9 22:37:04 1995 Andries Brouwer <aeb@cwi.nl>
+
+ As a preparation for new device code, separated the various
+ functions the request->dev field had into the device proper,
+ request->rq_dev and a status field request->rq_status.
+
+ The 2nd argument of bios_param is now a kdev_t.
+
+Wed Jul 19 10:43:15 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * scsi.c (scsi_proc_info): /proc/scsi/scsi now also lists all
+ attached devices.
+
+ * scsi_proc.c (proc_print_scsidevice): Added. Used by scsi.c and
+ eata_dma_proc.c to produce some device info for /proc/scsi.
+
+ * eata_dma.c (eata_queue)(eata_int_handler)(eata_scsi_done):
+ Changed handling of internal SCSI commands send to the HBA.
+
+
+Wed Jul 19 10:09:17 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * Linux 1.3.11 released.
+
+ * eata_dma.c (eata_queue)(eata_int_handler): Added code to do
+ command latency measurements if requested by root through
+ /proc/scsi interface.
+ Throughout Use HZ constant for time references.
+
+ * eata_pio.c: Use HZ constant for time references.
+
+ * aic7xxx.c, aic7xxx.h, aic7xxx_asm.c: Changed copyright from BSD
+ to GNU style.
+
+ * scsi.h: Added READ_12 command opcode constant
+
+Wed Jul 19 09:25:30 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * Linux 1.3.10 released.
+
+ * scsi_proc.c (dispatch_scsi_info): Removed unused variable.
+
+Wed Jul 19 09:25:30 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * Linux 1.3.9 released.
+
+ * scsi.c Blacklist concept expanded to 'support' more device
+ deficiencies. blacklist[] renamed to device_list[]
+ (scan_scsis): Code cleanup.
+
+ * scsi_debug.c (scsi_debug_proc_info): Added support to control
+ device lockup simulation via /proc/scsi interface.
+
+
+Wed Jul 19 09:22:34 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * Linux 1.3.7 released.
+
+ * scsi_proc.c: Fixed a number of bugs in directory handling
+
+Wed Jul 19 09:18:28 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * Linux 1.3.5 released.
+
+ * Native wide, multichannel and /proc/scsi support now in official
+ kernel distribution.
+
+ * scsi.c/h, hosts.c/h et al reindented to increase readability
+ (especially on 80 column wide terminals).
+
+ * scsi.c, scsi_proc.c, ../../fs/proc/inode.c: Added
+ /proc/scsi/scsi which allows root to scan for hotplugged devices.
+
+ * scsi.c (scsi_proc_info): Added, to support /proc/scsi/scsi.
+ (scan_scsis): Added some 'spaghetti' code to allow scanning for
+ single devices.
+
+
+Thu Jun 20 15:20:27 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * proc.c: Renamed to scsi_proc.c
+
+Mon Jun 12 20:32:45 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * Linux 1.3.0 released.
+
+Mon May 15 19:33:14 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * scsi.c: Added native multichannel and wide scsi support.
+
+ * proc.c (dispatch_scsi_info) (build_proc_dir_hba_entries):
+ Updated /proc/scsi interface.
+
+Thu May 4 17:58:48 1995 Michael Neuffer <neuffer@goofy.zdv.uni-mainz.de>
+
+ * sd.c (requeue_sd_request): Zero out the scatterlist only if
+ scsi_malloc returned memory for it.
+
+ * eata_dma.c (register_HBA) (eata_queue): Add support for
+ large scatter/gather tables and set use_clustering accordingly
+
+ * hosts.c: Make use_clustering changeable in the Scsi_Host structure.
+
+Wed Apr 12 15:25:52 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.2.5 released.
+
+ * buslogic.c: Update to version 1.15 (From Leonard N. Zubkoff).
+ Fixed interrupt routine to avoid races when handling multiple
+ complete commands per interrupt. Seems to come up with faster
+ cards.
+
+ * eata_dma.c: Update to 2.3.5r. Modularize. Improved error handling
+ throughout and fixed bug interrupt routine which resulted in shifted
+ status bytes. Added blink LED state checks for ISA and EISA HBAs.
+ Memory management bug seems to have disappeared ==> increasing
+ C_P_L_CURRENT_MAX to 16 for now. Decreasing C_P_L_DIV to 3 for
+ performance reasons.
+
+ * scsi.c: If we get a FMK, EOM, or ILI when attempting to scan
+ the bus, assume that it was just noise on the bus, and ignore
+ the device.
+
+ * scsi.h: Update and add a bunch of missing commands which we
+ were never using.
+
+ * sd.c: Use restore_flags in do_sd_request - this may result in
+ latency conditions, but it gets rid of races and crashes.
+ Do not save flags again when searching for a second command to
+ queue.
+
+ * st.c: Use bytes, not STP->buffer->buffer_size when reading
+ from tape.
+
+
+Tue Apr 4 09:42:08 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.2.4 released.
+
+ * st.c: Fix typo - restoring wrong flags.
+
+Wed Mar 29 06:55:12 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.2.3 released.
+
+ * st.c: Perform some waiting operations with interrupts off.
+ Is this correct???
+
+Wed Mar 22 10:34:26 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.2.2 released.
+
+ * aha152x.c: Modularize. Add support for PCMCIA.
+
+ * eata.c: Update to version 2.0. Fixed bug preventing media
+ detection. If scsi_register_host returns NULL, fail gracefully.
+
+ * scsi.c: Detect as NEC (for photo-cd purposes) for the 84
+ and 25 models as "NEC_OLDCDR".
+
+ * scsi.h: Add define for NEC_OLDCDR
+
+ * sr.c: Add handling for NEC_OLDCDR. Treat as unknown.
+
+ * u14-34f.c: Update to version 2.0. Fixed same bug as in
+ eata.c.
+
+
+Mon Mar 6 11:11:20 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.2.0 released. Yeah!!!
+
+ * Minor spelling/punctuation changes throughout. Nothing
+ substantive.
+
+Mon Feb 20 21:33:03 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.95 released.
+
+ * qlogic.c: Update to version 0.41.
+
+ * seagate.c: Change some message to be more descriptive about what
+ we detected.
+
+ * sr.c: spelling/whitespace changes.
+
+Mon Feb 20 21:33:03 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.94 released.
+
+Mon Feb 20 08:57:17 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.93 released.
+
+ * hosts.h: Change io_port to long int from short.
+
+ * 53c7,8xx.c: crash on AEN fixed, SCSI reset is no longer a NOP,
+ NULL pointer panic on odd UDCs fixed, two bugs in diagnostic output
+ fixed, should initialize correctly if left running, now loadable,
+ new memory allocation, extraneous diagnostic output suppressed,
+ splx() replaced with save/restore flags. [ Drew ]
+
+ * hosts.c, hosts.h, scsi_ioctl.c, sd.c, sd_ioctl.c, sg.c, sr.c,
+ sr_ioctl.c: Add special junk at end that Emacs will use for
+ formatting the file.
+
+ * qlogic.c: Update to v0.40a. Improve parity handling.
+
+ * scsi.c: Add Hitachi DK312C to blacklist. Change "};" to "}" in
+ many places. Use scsi_init_malloc to get command block - may
+ need this to be dma compatible for some host adapters.
+ Restore interrupts after unregistering a host.
+
+ * sd.c: Use sti instead of restore flags - causes latency problems.
+
+ * seagate.c: Use controller_type to determine string used when
+ registering irq.
+
+ * sr.c: More photo-cd hacks to make sure we get the xa stuff right.
+ * sr.h, sr.c: Change is_xa to xa_flags field.
+
+ * st.c: Disable retries for write operations.
+
+Wed Feb 15 10:52:56 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.92 released.
+
+ * eata.c: Update to 1.17.
+
+ * eata_dma.c: Update to 2.31a. Add more support for /proc/scsi.
+ Continuing modularization. Less crashes because of the bug in the
+ memory management ==> increase C_P_L_CURRENT_MAX to 10
+ and decrease C_P_L_DIV to 4.
+
+ * hosts.c: If we remove last host registered, reuse host number.
+ When freeing memory from host being deregistered, free extra_bytes
+ too.
+
+ * scsi.c (scan_scsis): memset(SDpnt, 0) and set SCmd.device to SDpnt.
+ Change memory allocation to work around bugs in __get_dma_pages.
+ Do not free host if usage count is not zero (for modules).
+
+ * sr_ioctl.c: Increase IOCTL_TIMEOUT to 3000.
+
+ * st.c: Allow for ST_EXTRA_DEVS in st data structures.
+
+ * u14-34f.c: Update to 1.17.
+
+Thu Feb 9 10:11:16 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.91 released.
+
+ * eata.c: Update to 1.16. Use wish_block instead of host->block.
+
+ * hosts.c: Initialize wish_block to 0.
+
+ * hosts.h: Add wish_block.
+
+ * scsi.c: Use wish_block as indicator that the host should be added
+ to block list.
+
+ * sg.c: Add SG_EXTRA_DEVS to number of slots.
+
+ * u14-34f.c: Use wish_block.
+
+Tue Feb 7 11:46:04 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.90 released.
+
+ * eata.c: Change naming from eata_* to eata2x_*. Now at vers 1.15.
+ Update interrupt handler to take pt_regs as arg. Allow blocking
+ even if loaded as module. Initialize target_time_out array.
+ Do not put sti(); in timing loop.
+
+ * hosts.c: Do not reuse host numbers.
+ Use scsi_make_blocked_list to generate blocking list.
+
+ * script_asm.pl: Beats me. Don't know perl. Something to do with
+ phase index.
+
+ * scsi.c (scsi_make_blocked_list): New function - code copied from
+ hosts.c.
+
+ * scsi.c: Update code to disable photo CD for Toshiba cdroms.
+ Use just manufacturer name, not model number.
+
+ * sr.c: Fix setting density for Toshiba drives.
+
+ * u14-34f.c: Clear target_time_out array during reset.
+
+Wed Feb 1 09:20:45 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.89 released.
+
+ * Makefile, u14-34f.c: Modularize.
+
+ * Makefile, eata.c: Modularize. Now version 1.14
+
+ * NCR5380.c: Update interrupt handler with new arglist. Minor
+ cleanups.
+
+ * eata_dma.c: Begin to modularize. Add hooks for /proc/scsi.
+ New version 2.3.0a. Add code in interrupt handler to allow
+ certain CDROM drivers to be detected which return a
+ CHECK_CONDITION during SCSI bus scan. Add opcode check to get
+ all DATA IN and DATA OUT phases right. Utilize HBA_interpret flag.
+ Improvements in HBA identification. Various other minor stuff.
+
+ * hosts.c: Initialize ->dma_channel and ->io_port when registering
+ a new host.
+
+ * qlogic.c: Modularize and add PCMCIA support.
+
+ * scsi.c: Add Hitachi to blacklist.
+
+ * scsi.c: Change default to no lun scan (too many problem devices).
+
+ * scsi.h: Define QUEUE_FULL condition.
+
+ * sd.c: Do not check for non-existent partition until after
+ new media check.
+
+ * sg.c: Undo previous change which was wrong.
+
+ * sr_ioctl.c: Increase IOCTL_TIMEOUT to 2000.
+
+ * st.c: Patches from Kai - improve filemark handling.
+
+Tue Jan 31 17:32:12 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.88 released.
+
+ * Throughout - spelling/grammar fixups.
+
+ * scsi.c: Make sure that all buffers are 16 byte aligned - some
+ drivers (buslogic) need this.
+
+ * scsi.c (scan_scsis): Remove message printed.
+
+ * scsi.c (scsi_init): Move message here.
+
+Mon Jan 30 06:40:25 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.87 released.
+
+ * sr.c: Photo-cd related changes. (Gerd Knorr??).
+
+ * st.c: Changes from Kai related to EOM detection.
+
+Mon Jan 23 23:53:10 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.86 released.
+
+ * 53c7,8xx.h: Change SG size to 127.
+
+ * eata_dma: Update to version 2.10i. Remove bug in the registration
+ of multiple HBAs and channels. Minor other improvements and stylistic
+ changes.
+
+ * scsi.c: Test for Toshiba XM-3401TA and exclude from detection
+ as toshiba drive - photo cd does not work with this drive.
+
+ * sr.c: Update photocd code.
+
+Mon Jan 23 23:53:10 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.85 released.
+
+ * st.c, st_ioctl.c, sg.c, sd_ioctl.c, scsi_ioctl.c, hosts.c:
+ include linux/mm.h
+
+ * qlogic.c, buslogic.c, aha1542.c: Include linux/module.h.
+
+Sun Jan 22 22:08:46 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.84 released.
+
+ * Makefile: Support for loadable QLOGIC boards.
+
+ * aha152x.c: Update to version 1.8 from Juergen.
+
+ * eata_dma.c: Update from Michael Neuffer.
+ Remove hard limit of 2 commands per lun and make it better
+ configurable. Improvements in HBA identification.
+
+ * in2000.c: Fix biosparam to support large disks.
+
+ * qlogic.c: Minor changes (change sti -> restore_flags).
+
+Wed Jan 18 23:33:09 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.83 released.
+
+ * aha1542.c(aha1542_intr_handle): Use arguments handed down to find
+ which irq.
+
+ * buslogic.c: Likewise.
+
+ * eata_dma.c: Use min of 2 cmd_per_lun for OCS_enabled boards.
+
+ * scsi.c: Make RECOVERED_ERROR a SUGGEST_IS_OK.
+
+ * sd.c: Fail if we are opening a non-existent partition.
+
+ * sr.c: Bump SR_TIMEOUT to 15000.
+ Do not probe for media size at boot time(hard on changers).
+ Flag device as needing sector size instead.
+
+ * sr_ioctl.c: Remove CDROMMULTISESSION_SYS ioctl.
+
+ * ultrastor.c: Fix bug in call to ultrastor_interrupt (wrong #args).
+
+Mon Jan 16 07:18:23 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.82 released.
+
+ Throughout.
+ - Change all interrupt handlers to accept new calling convention.
+ In particular, we now receive the irq number as one of the arguments.
+
+ * More minor spelling corrections in some of the new files.
+
+ * aha1542.c, buslogic.c: Clean up interrupt handler a little now
+ that we receive the irq as an arg.
+
+ * aha274x.c: s/snarf_region/request_region/
+
+ * eata.c: Update to version 1.12. Fix some comments and display a
+ message if we cannot reserve the port addresses.
+
+ * u14-34f.c: Update to version 1.13. Fix some comments and display a
+ message if we cannot reserve the port addresses.
+
+ * eata_dma.c: Define get_board_data function (send INQUIRY command).
+ Use to improve detection of variants of different DPT boards. Change
+ version subnumber to "0g".
+
+ * fdomain.c: Update to version 5.26. Improve detection of some boards
+ repackaged by IBM.
+
+ * scsi.c (scsi_register_host): Change "name" to const char *.
+
+ * sr.c: Fix problem in set mode command for Toshiba drives.
+
+ * sr.c: Fix typo from patch 81.
+
+Fri Jan 13 12:54:46 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.81 released. Codefreeze for 1.2 release announced.
+
+ Big changes here.
+
+ * eata_dma.*: New files from Michael Neuffer.
+ (neuffer@goofy.zdv.uni-mainz.de). Should support
+ all eata/dpt cards.
+
+ * hosts.c, Makefile: Add eata_dma.
+
+ * README.st: Document MTEOM.
+
+ Patches from me (ERY) to finish support for low-level loadable scsi.
+ It now works, and is actually useful.
+
+ * Throughout - add new argument to scsi_init_malloc that takes an
+ additional parameter. This is used as a priority to kmalloc,
+ and you can specify the GFP_DMA flag if you need DMA-able memory.
+
+ * Makefile: For source files that are loadable, always add name
+ to SCSI_SRCS. Fill in modules: target.
+
+ * hosts.c: Change next_host to next_scsi_host, and make global.
+ Print hosts after we have identified all of them. Use info()
+ function if present, otherwise use name field.
+
+ * hosts.h: Change attach function to return int, not void.
+ Define number of device slots to allow for loadable devices.
+ Define tags to tell scsi module code what type of module we
+ are loading.
+
+ * scsi.c: Fix scan_scsis so that it can be run by a user process.
+ Do not use waiting loops - use up and down mechanism as long
+ as current != task[0].
+
+ * scsi.c(scan_scsis): Do not use stack variables for I/O - this
+ could be > 16Mb if we are loading a module at runtime (i.e. use
+ scsi_init_malloc to get some memory we know will be safe).
+
+ * scsi.c: Change dma freelist to be a set of pages. This allows
+ us to dynamically adjust the size of the list by adding more pages
+ to the pagelist. Fix scsi_malloc and scsi_free accordingly.
+
+ * scsi_module.c: Fix include.
+
+ * sd.c: Declare detach function. Increment/decrement module usage
+ count as required. Fix init functions to allow loaded devices.
+ Revalidate all new disks so we get the partition tables. Define
+ detach function.
+
+ * sr.c: Likewise.
+
+ * sg.c: Declare detach function. Allow attachment of devices on
+ loaded drivers.
+
+ * st.c: Declare detach function. Increment/decrement module usage
+ count as required.
+
+Tue Jan 10 10:09:58 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.79 released.
+
+ Patch from some undetermined individual who needs to get a life :-).
+
+ * sr.c: Attacked by spelling bee...
+
+ Patches from Gerd Knorr:
+
+ * sr.c: make printk messages for photoCD a little more informative.
+
+ * sr_ioctl.c: Fix CDROMMULTISESSION_SYS ioctl.
+
+Mon Jan 9 10:01:37 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.78 released.
+
+ * Makefile: Add empty modules: target.
+
+ * Wheee. Now change register_iomem to request_region.
+
+ * in2000.c: Bugfix - apparently this is the fix that we have
+ all been waiting for. It fixes a problem whereby the driver
+ is not stable under heavy load. Race condition and all that.
+ Patch from Peter Lu.
+
+Wed Jan 4 21:17:40 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.77 released.
+
+ * 53c7,8xx.c: Fix from Linus - emulate splx.
+
+ Throughout:
+
+ Change "snarf_region" with "register_iomem".
+
+ * scsi_module.c: New file. Contains support for low-level loadable
+ scsi drivers. [ERY].
+
+ * sd.c: More s/int/long/ changes.
+
+ * seagate.c: Explicitly include linux/config.h
+
+ * sg.c: Increment/decrement module usage count on open/close.
+
+ * sg.c: Be a bit more careful about the user not supplying enough
+ information for a valid command. Pass correct size down to
+ scsi_do_cmd.
+
+ * sr.c: More changes for Photo-CD. This apparently breaks NEC drives.
+
+ * sr_ioctl.c: Support CDROMMULTISESSION ioctl.
+
+
+Sun Jan 1 19:55:21 1995 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.76 released.
+
+ * constants.c: Add type cast in switch statement.
+
+ * scsi.c (scsi_free): Change datatype of "offset" to long.
+ (scsi_malloc): Change a few more variables to long. Who
+ did this and why was it important? 64 bit machines?
+
+
+ Lots of changes to use save_state/restore_state instead of cli/sti.
+ Files changed include:
+
+ * aha1542.c:
+ * aha1740.c:
+ * buslogic.c:
+ * in2000.c:
+ * scsi.c:
+ * scsi_debug.c:
+ * sd.c:
+ * sr.c:
+ * st.c:
+
+Wed Dec 28 16:38:29 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.75 released.
+
+ * buslogic.c: Spelling fix.
+
+ * scsi.c: Add HP C1790A and C2500A scanjet to blacklist.
+
+ * scsi.c: Spelling fixup.
+
+ * sd.c: Add support for sd_hardsizes (hard sector sizes).
+
+ * ultrastor.c: Use save_flags/restore_flags instead of cli/sti.
+
+Fri Dec 23 13:36:25 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.74 released.
+
+ * README.st: Update from Kai Makisara.
+
+ * eata.c: New version from Dario - version 1.11.
+ use scsicam bios_param routine. Add support for 2011
+ and 2021 boards.
+
+ * hosts.c: Add support for blocking. Linked list automatically
+ generated when shpnt->block is set.
+
+ * scsi.c: Add sankyo & HP scanjet to blacklist. Add support for
+ kicking things loose when we deadlock.
+
+ * scsi.c: Recognize scanners and processors in scan_scsis.
+
+ * scsi_ioctl.h: Increase timeout to 9 seconds.
+
+ * st.c: New version from Kai - add better support for backspace.
+
+ * u14-34f.c: New version from Dario. Supports blocking.
+
+Wed Dec 14 14:46:30 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.73 released.
+
+ * buslogic.c: Update from Dave Gentzel. Version 1.14.
+ Add module related stuff. More fault tolerant if out of
+ DMA memory.
+
+ * fdomain.c: New version from Rik Faith - version 5.22. Add support
+ for ISA-200S SCSI adapter.
+
+ * hosts.c: Spelling.
+
+ * qlogic.c: Update to version 0.38a. Add more support for PCMCIA.
+
+ * scsi.c: Mask device type with 0x1f during scan_scsis.
+ Add support for deadlocking, err, make that getting out of
+ deadlock situations that are created when we allow the user
+ to limit requests to one host adapter at a time.
+
+ * scsi.c: Bugfix - pass pid, not SCpnt as second arg to
+ scsi_times_out.
+
+ * scsi.c: Restore interrupt state to previous value instead of using
+ cli/sti pairs.
+
+ * scsi.c: Add a bunch of module stuff (all commented out for now).
+
+ * scsi.c: Clean up scsi_dump_status.
+
+Tue Dec 6 12:34:20 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.72 released.
+
+ * sg.c: Bugfix - always use sg_free, since we might have big buff.
+
+Fri Dec 2 11:24:53 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.71 released.
+
+ * sg.c: Clear buff field when not in use. Only call scsi_free if
+ non-null.
+
+ * scsi.h: Call wake_up(&wait_for_request) when done with a
+ command.
+
+ * scsi.c (scsi_times_out): Pass pid down so that we can protect
+ against race conditions.
+
+ * scsi.c (scsi_abort): Zero timeout field if we get the
+ NOT_RUNNING message back from low-level driver.
+
+
+ * scsi.c (scsi_done): Restore cmd_len, use_sg here.
+
+ * scsi.c (request_sense): Not here.
+
+ * hosts.h: Add new forbidden_addr, forbidden_size fields. Who
+ added these and why????
+
+ * hosts.c (scsi_mem_init): Mark pages as reserved if they fall in
+ the forbidden regions. I am not sure - I think this is so that
+ we can deal with boards that do incomplete decoding of their
+ address lines for the bios chips, but I am not entirely sure.
+
+ * buslogic.c: Set forbidden_addr stuff if using a buggy board.
+
+ * aha1740.c: Test for NULL pointer in SCtmp. This should not
+ occur, but a nice message is better than a kernel segfault.
+
+ * 53c7,8xx.c: Add new PCI chip ID for 815.
+
+Fri Dec 2 11:24:53 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.70 released.
+
+ * ChangeLog, st.c: Spelling.
+
+Tue Nov 29 18:48:42 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.69 released.
+
+ * u14-34f.h: Non-functional change. [Dario].
+
+ * u14-34f.c: Use block field in Scsi_Host to prevent commands from
+ being queued to more than one host at the same time (used when
+ motherboard does not deal with multiple bus-masters very well).
+ Only when SINGLE_HOST_OPERATIONS is defined.
+ Use new cmd_per_lun field. [Dario]
+
+ * eata.c: Likewise.
+
+ * st.c: More changes from Kai. Add ready flag to indicate drive
+ status.
+
+ * README.st: Document this.
+
+ * sr.c: Bugfix (do not subtract CD_BLOCK_OFFSET) for photo-cd
+ code.
+
+ * sg.c: Bugfix - fix problem where opcode is not correctly set up.
+
+ * seagate.[c,h]: Use #defines to set driver name.
+
+ * scsi_ioctl.c: Zero buffer before executing command.
+
+ * scsi.c: Use new cmd_per_lun field in Scsi_Hosts as appropriate.
+ Add Sony CDU55S to blacklist.
+
+ * hosts.h: Add new cmd_per_lun field to Scsi_Hosts.
+
+ * hosts.c: Initialize cmd_per_lun in Scsi_Hosts from template.
+
+ * buslogic.c: Use cmd_per_lun field - initialize to different
+ values depending upon bus type (i.e. use 1 if ISA, so we do not
+ hog memory). Use other patches which got lost from 1.1.68.
+
+ * aha1542.c: Spelling.
+
+Tue Nov 29 15:43:50 1994 Eric Youngdale (eric@andante.aib.com)
+
+ * Linux 1.1.68 released.
+
+ Add support for 12 byte vendor specific commands in scsi-generics,
+ more (i.e. the last mandatory) low-level changes to support
+ loadable modules, plus a few other changes people have requested
+ lately. Changes by me (ERY) unless otherwise noted. Spelling
+ changes appear from some unknown corner of the universe.
+
+ * Throughout: Change COMMAND_SIZE() to use SCpnt->cmd_len.
+
+ * Throughout: Change info() low level function to take a Scsi_Host
+ pointer. This way the info function can return specific
+ information about the host in question, if desired.
+
+ * All low-level drivers: Add NULL in initializer for the
+ usage_count field added to Scsi_Host_Template.
+
+ * aha152x.[c,h]: Remove redundant info() function.
+
+ * aha1542.[c,h]: Likewise.
+
+ * aha1740.[c,h]: Likewise.
+
+ * aha274x.[c,h]: Likewise.
+
+ * eata.[c,h]: Likewise.
+
+ * pas16.[c,h]: Likewise.
+
+ * scsi_debug.[c,h]: Likewise.
+
+ * t128.[c,h]: Likewise.
+
+ * u14-34f.[c,h]: Likewise.
+
+ * ultrastor.[c,h]: Likewise.
+
+ * wd7000.[c,h]: Likewise.
+
+ * aha1542.c: Add support for command line options with lilo to set
+ DMA parameters, I/O port. From Matt Aarnio.
+
+ * buslogic.[c,h]: New version (1.13) from Dave Gentzel.
+
+ * hosts.h: Add new field to Scsi_Hosts "block" to allow blocking
+ all I/O to certain other cards. Helps prevent problems with some
+ ISA motherboards.
+
+ * hosts.h: Add usage_count to Scsi_Host_Template.
+
+ * hosts.h: Add n_io_port to Scsi_Host (used when releasing module).
+
+ * hosts.c: Initialize block field.
+
+ * in2000.c: Remove "static" declarations from exported functions.
+
+ * in2000.h: Likewise.
+
+ * scsi.c: Correctly set cmd_len field as required. Save and
+ change setting when doing a request_sense, restore when done.
+ Move abort timeout message. Fix panic in request_queueable to
+ print correct function name.
+
+ * scsi.c: When incrementing usage count, walk block linked list
+ for host, and or in SCSI_HOST_BLOCK bit. When decrementing usage
+ count to 0, clear this bit to allow usage to continue, wake up
+ processes waiting.
+
+
+ * scsi_ioctl.c: If we have an info() function, call it, otherwise
+ if we have a "name" field, use it, else do nothing.
+
+ * sd.c, sr.c: Clear cmd_len field prior to each command we
+ generate.
+
+ * sd.h: Add "has_part_table" bit to rscsi_disks.
+
+ * sg.[c,h]: Add support for vendor specific 12 byte commands (i.e.
+ override command length in COMMAND_SIZE).
+
+ * sr.c: Bugfix from Gerd in photocd code.
+
+ * sr.c: Bugfix in get_sectorsize - always use scsi_malloc buffer -
+ we cannot guarantee that the stack is < 16Mb.
+
+Tue Nov 22 15:40:46 1994 Eric Youngdale (eric@andante.aib.com)
+
+ * Linux 1.1.67 released.
+
+ * sr.c: Change spelling of manufactor to manufacturer.
+
+ * scsi.h: Likewise.
+
+ * scsi.c: Likewise.
+
+ * qlogic.c: Spelling corrections.
+
+ * in2000.h: Spelling corrections.
+
+ * in2000.c: Update from Bill Earnest, change from
+ jshiffle@netcom.com. Support new bios versions.
+
+ * README.qlogic: Spelling correction.
+
+Tue Nov 22 15:40:46 1994 Eric Youngdale (eric@andante.aib.com)
+
+ * Linux 1.1.66 released.
+
+ * u14-34f.c: Spelling corrections.
+
+ * sr.[h,c]: Add support for multi-session CDs from Gerd Knorr.
+
+ * scsi.h: Add manufactor field for keeping track of device
+ manufacturer.
+
+ * scsi.c: More spelling corrections.
+
+ * qlogic.h, qlogic.c, README.qlogic: New driver from Tom Zerucha.
+
+ * in2000.c, in2000.h: New driver from Brad McLean/Bill Earnest.
+
+ * fdomain.c: Spelling correction.
+
+ * eata.c: Spelling correction.
+
+Fri Nov 18 15:22:44 1994 Eric Youngdale (eric@andante.aib.com)
+
+ * Linux 1.1.65 released.
+
+ * eata.h: Update version string to 1.08.00.
+
+ * eata.c: Set sg_tablesize correctly for DPT PM2012 boards.
+
+ * aha274x.seq: Spell checking.
+
+ * README.st: Likewise.
+
+ * README.aha274x: Likewise.
+
+ * ChangeLog: Likewise.
+
+Tue Nov 15 15:35:08 1994 Eric Youngdale (eric@andante.aib.com)
+
+ * Linux 1.1.64 released.
+
+ * u14-34f.h: Update version number to 1.10.01.
+
+ * u14-34f.c: Use Scsi_Host can_queue variable instead of one from template.
+
+ * eata.[c,h]: New driver for DPT boards from Dario Ballabio.
+
+ * buslogic.c: Use can_queue field.
+
+Wed Nov 30 12:09:09 1994 Eric Youngdale (eric@andante.aib.com)
+
+ * Linux 1.1.63 released.
+
+ * sd.c: Give I/O error if we attempt 512 byte I/O to a disk with
+ 1024 byte sectors.
+
+ * scsicam.c: Make sure we do read from whole disk (mask off
+ partition).
+
+ * scsi.c: Use can_queue in Scsi_Host structure.
+ Fix panic message about invalid host.
+
+ * hosts.c: Initialize can_queue from template.
+
+ * hosts.h: Add can_queue to Scsi_Host structure.
+
+ * aha1740.c: Print out warning about NULL ecbptr.
+
+Fri Nov 4 12:40:30 1994 Eric Youngdale (eric@andante.aib.com)
+
+ * Linux 1.1.62 released.
+
+ * fdomain.c: Update to version 5.20. (From Rik Faith). Support
+ BIOS version 3.5.
+
+ * st.h: Add ST_EOD symbol.
+
+ * st.c: Patches from Kai Makisara - support additional densities,
+ add support for MTFSS, MTBSS, MTWSM commands.
+
+ * README.st: Update to document new commands.
+
+ * scsi.c: Add Mediavision CDR-H93MV to blacklist.
+
+Sat Oct 29 20:57:36 1994 Eric Youngdale (eric@andante.aib.com)
+
+ * Linux 1.1.60 released.
+
+ * u14-34f.[c,h]: New driver from Dario Ballabio.
+
+ * aic7770.c, aha274x_seq.h, aha274x.seq, aha274x.h, aha274x.c,
+ README.aha274x: New files, new driver from John Aycock.
+
+
+Tue Oct 11 08:47:39 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.54 released.
+
+ * Add third PCI chip id. [Drew]
+
+ * buslogic.c: Set BUSLOGIC_CMDLUN back to 1 [Eric].
+
+ * ultrastor.c: Fix asm directives for new GCC.
+
+ * sr.c, sd.c: Use new end_scsi_request function.
+
+ * scsi.h(end_scsi_request): Return pointer to block if still
+ active, else return NULL if inactive. Fixes race condition.
+
+Sun Oct 9 20:23:14 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.53 released.
+
+ * scsi.c: Do not allocate dma bounce buffers if we have exactly
+ 16Mb.
+
+Fri Sep 9 05:35:30 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.51 released.
+
+ * aha152x.c: Add support for disabling the parity check. Update
+ to version 1.4. [Juergen].
+
+ * seagate.c: Tweak debugging message.
+
+Wed Aug 31 10:15:55 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.50 released.
+
+ * aha152x.c: Add eb800 for Vtech Platinum SMP boards. [Juergen].
+
+ * scsi.c: Add Quantum PD1225S to blacklist.
+
+Fri Aug 26 09:38:45 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.49 released.
+
+ * sd.c: Fix bug when we were deleting the wrong entry if we
+ get an unsupported sector size device.
+
+ * sr.c: Another spelling patch.
+
+Thu Aug 25 09:15:27 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.48 released.
+
+ * Throughout: Use new semantics for request_dma, as appropriate.
+
+ * sr.c: Print correct device number.
+
+Sun Aug 21 17:49:23 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.47 released.
+
+ * NCR5380.c: Add support for LIMIT_TRANSFERSIZE.
+
+ * constants.h: Add prototype for print_Scsi_Cmnd.
+
+ * pas16.c: Some more minor tweaks. Test for Mediavision board.
+ Allow for disks > 1Gb. [Drew??]
+
+ * sr.c: Set SCpnt->transfersize.
+
+Tue Aug 16 17:29:35 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.46 released.
+
+ * Throughout: More spelling fixups.
+
+ * buslogic.c: Add a few more fixups from Dave. Disk translation
+ mainly.
+
+ * pas16.c: Add a few patches (Drew?).
+
+
+Thu Aug 11 20:45:15 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.44 released.
+
+ * hosts.c: Add type casts for scsi_init_malloc.
+
+ * scsicam.c: Add type cast.
+
+Wed Aug 10 19:23:01 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.43 released.
+
+ * Throughout: Spelling cleanups. [??]
+
+ * aha152x.c, NCR53*.c, fdomain.c, g_NCR5380.c, pas16.c, seagate.c,
+ t128.c: Use request_irq, not irqaction. [??]
+
+ * aha1542.c: Move test for shost before we start to use shost.
+
+ * aha1542.c, aha1740.c, ultrastor.c, wd7000.c: Use new
+ calling sequence for request_irq.
+
+ * buslogic.c: Update from Dave Gentzel.
+
+Tue Aug 9 09:32:59 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.42 released.
+
+ * NCR5380.c: Change NCR5380_print_status to static.
+
+ * seagate.c: A few more bugfixes. Only Drew knows what they are
+ for.
+
+ * ultrastor.c: Tweak some __asm__ directives so that it works
+ with newer compilers. [??]
+
+Sat Aug 6 21:29:36 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.40 released.
+
+ * NCR5380.c: Return SCSI_RESET_WAKEUP from reset function.
+
+ * aha1542.c: Reset mailbox status after a bus device reset.
+
+ * constants.c: Fix typo (;;).
+
+ * g_NCR5380.c:
+ * pas16.c: Correct usage of NCR5380_init.
+
+ * scsi.c: Remove redundant (and unused variables).
+
+ * sd.c: Use memset to clear all of rscsi_disks before we use it.
+
+ * sg.c: Ditto, except for scsi_generics.
+
+ * sr.c: Ditto, except for scsi_CDs.
+
+ * st.c: Initialize STp->device.
+
+ * seagate.c: Fix bug. [Drew]
+
+Thu Aug 4 08:47:27 1994 Eric Youngdale (eric@andante)
+
+ * Linux 1.1.39 released.
+
+ * Makefile: Fix typo in NCR53C7xx.
+
+ * st.c: Print correct number for device.
+
+Tue Aug 2 11:29:14 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.38 released.
+
+ Lots of changes in 1.1.38. All from Drew unless otherwise noted.
+
+ * 53c7,8xx.c: New file from Drew. PCI driver.
+
+ * 53c7,8xx.h: Likewise.
+
+ * 53c7,8xx.scr: Likewise.
+
+ * 53c8xx_d.h, 53c8xx_u.h, script_asm.pl: Likewise.
+
+ * scsicam.c: New file from Drew. Read block 0 on the disk and
+ read the partition table. Attempt to deduce the geometry from
+ the partition table if possible. Only used by 53c[7,8]xx right
+ now, but could be used by any device for which we have no way
+ of identifying the geometry.
+
+ * sd.c: Use device letters instead of sd%d in a lot of messages.
+
+ * seagate.c: Fix bug that resulted in lockups with some devices.
+
+ * sr.c (sr_open): Return -EROFS, not -EACCES if we attempt to open
+ device for write.
+
+ * hosts.c, Makefile: Update for new driver.
+
+ * NCR5380.c, NCR5380.h, g_NCR5380.h: Update from Drew to support
+ 53C400 chip.
+
+ * constants.c: Define CONST_CMND and CONST_MSG. Other minor
+ cleanups along the way. Improve handling of CONST_MSG.
+
+ * fdomain.c, fdomain.h: New version from Rik Faith. Update to
+ 5.18. Should now support TMC-3260 PCI card with 18C30 chip.
+
+ * pas16.c: Update with new irq initialization.
+
+ * t128.c: Update with minor cleanups.
+
+ * scsi.c (scsi_pid): New variable - gives each command a unique
+ id. Add Quantum LPS5235S to blacklist. Change in_scan to
+ in_scan_scsis and make global.
+
+ * scsi.h: Add some defines for extended message handling,
+ INITIATE/RELEASE_RECOVERY. Add a few new fields to support sync
+ transfers.
+
+ * scsi_ioctl.h: Add ioctl to request synchronous transfers.
+
+
+Tue Jul 26 21:36:58 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.37 released.
+
+ * aha1542.c: Always call aha1542_mbenable, use new udelay
+ mechanism so we do not wait a long time if the board does not
+ implement this command.
+
+ * g_NCR5380.c: Remove #include <linux/config.h> and #if
+ defined(CONFIG_SCSI_*).
+
+ * seagate.c: Likewise.
+
+ Next round of changes to support loadable modules. Getting closer
+ now, still not possible to do anything remotely usable.
+
+ hosts.c: Create a linked list of detected high level devices.
+ (scsi_register_device): New function to insert into this list.
+ (scsi_init): Call scsi_register_device for each of the known high
+ level drivers.
+
+ hosts.h: Add prototype for linked list header. Add structure
+ definition for device template structure which defines the linked
+ list.
+
+ scsi.c: (scan_scsis): Use linked list instead of knowledge about
+ existing high level device drivers.
+ (scsi_dev_init): Use init functions for drivers on linked list
+ instead of explicit list to initialize and attach devices to high
+ level drivers.
+
+ scsi.h: Add new field "attached" to scsi_device - count of number
+ of high level devices attached.
+
+ sd.c, sr.c, sg.c, st.c: Adjust init/attach functions to use new
+ scheme.
+
+Sat Jul 23 13:03:17 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.35 released.
+
+ * ultrastor.c: Change constraint on asm() operand so that it works
+ with gcc 2.6.0.
+
+Thu Jul 21 10:37:39 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.33 released.
+
+ * sr.c(sr_open): Do not allow opens with write access.
+
+Mon Jul 18 09:51:22 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.31 released.
+
+ * sd.c: Increase SD_TIMEOUT from 300 to 600.
+
+ * sr.c: Remove stray task_struct* variable that was no longer
+ used.
+
+ * sr_ioctl.c: Fix typo in up() call.
+
+Sun Jul 17 16:25:29 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.30 released.
+
+ * scsi.c (scan_scsis): Fix detection of some Toshiba CDROM drives
+ that report themselves as disk drives.
+
+ * (Throughout): Use request.sem instead of request.waiting.
+ Should fix swap problem with fdomain.
+
+Thu Jul 14 10:51:42 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.29 released.
+
+ * scsi.c (scan_scsis): Add new devices to end of linked list, not
+ to the beginning.
+
+ * scsi.h (SCSI_SLEEP): Remove brain dead hack to try to save
+ the task state before sleeping.
+
+Sat Jul 9 15:01:03 1994 Eric Youngdale (eric@esp22)
+
+ More changes to eventually support loadable modules. Mainly
+ we want to use linked lists instead of arrays because it is easier
+ to dynamically add and remove things this way.
+
+ Quite a bit more work is needed before loadable modules are
+ possible (and usable) with scsi, but this is most of the grunge
+ work.
+
+ * Linux 1.1.28 released.
+
+ * scsi.c, scsi.h (allocate_device, request_queueable): Change
+ argument from index into scsi_devices to a pointer to the
+ Scsi_Device struct.
+
+ * Throughout: Change all calls to allocate_device,
+ request_queueable to use new calling sequence.
+
+ * Throughout: Use SCpnt->device instead of
+ scsi_devices[SCpnt->index]. Ugh - the pointer was there all along
+ - much cleaner this way.
+
+ * scsi.c (scsi_init_malloc, scsi_free_malloc): New functions -
+ allow us to pretend that we have a working malloc when we
+ initialize. Use this instead of passing memory_start, memory_end
+ around all over the place.
+
+ * scsi.h, st.c, sr.c, sd.c, sg.c: Change *_init1 functions to use
+ scsi_init_malloc, remove all arguments, no return value.
+
+ * scsi.h: Remove index field from Scsi_Device and Scsi_Cmnd
+ structs.
+
+ * scsi.c (scsi_dev_init): Set up for scsi_init_malloc.
+ (scan_scsis): Get SDpnt from scsi_init_malloc, and refresh
+ when we discover a device. Free pointer before returning.
+ Change scsi_devices into a linked list.
+
+ * scsi.c (scan_scsis): Change to only scan one host.
+ (scsi_dev_init): Loop over all detected hosts, and scan them.
+
+ * hosts.c (scsi_init_free): Change so that number of extra bytes
+ is stored in struct, and we do not have to pass it each time.
+
+ * hosts.h: Change Scsi_Host_Template struct to include "next" and
+ "release" functions. Initialize to NULL in all low level
+ adapters.
+
+ * hosts.c: Rename scsi_hosts to builtin_scsi_hosts, create linked
+ list scsi_hosts, linked together with the new "next" field.
+
+Wed Jul 6 05:45:02 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.25 released.
+
+ * aha152x.c: Changes from Juergen - cleanups and updates.
+
+ * sd.c, sr.c: Use new check_media_change and revalidate
+ file_operations fields.
+
+ * st.c, st.h: Add changes from Kai Makisara, dated Jun 22.
+
+ * hosts.h: Change SG_ALL back to 0xff. Apparently soft error
+ in /dev/brain resulted in having this bumped up.
+ Change first parameter in bios_param function to be Disk * instead
+ of index into rscsi_disks.
+
+ * sd_ioctl.c: Pass pointer to rscsi_disks element instead of index
+ to array.
+
+ * sd.h: Add struct name "scsi_disk" to typedef for Scsi_Disk.
+
+ * scsi.c: Remove redundant Maxtor XT8760S from blacklist.
+ In scsi_reset, add printk when DEBUG defined.
+
+ * All low level drivers: Modify definitions of bios_param in
+ appropriate way.
+
+Thu Jun 16 10:31:59 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.20 released.
+
+ * scsi_ioctl.c: Only pass down the actual number of characters
+ required to scsi_do_cmd, not the one rounded up to a even number
+ of sectors.
+
+ * ultrastor.c: Changes from Caleb Epstein for 24f cards. Support
+ larger SG lists.
+
+ * ultrastor.c: Changes from me - use scsi_register to register
+ host. Add some consistency checking,
+
+Wed Jun 1 21:12:13 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.19 released.
+
+ * scsi.h: Add new return code for reset() function:
+ SCSI_RESET_PUNT.
+
+ * scsi.c: Make SCSI_RESET_PUNT the same as SCSI_RESET_WAKEUP for
+ now.
+
+ * aha1542.c: If the command responsible for the reset is not
+ pending, return SCSI_RESET_PUNT.
+
+ * aha1740.c, buslogic.c, wd7000.c, ultrastor.c: Return
+ SCSI_RESET_PUNT instead of SCSI_RESET_SNOOZE.
+
+Tue May 31 19:36:01 1994 Eric Youngdale (eric@esp22)
+
+ * buslogic.c: Do not print out message about "must be Adaptec"
+ if we have detected a buslogic card. Print out a warning message
+ if we are configuring for >16Mb, since the 445S at board level
+ D or earlier does not work right. The "D" level board can be made
+ to work by flipping an undocumented switch, but this is too subtle.
+
+ Changes based upon patches in Yggdrasil distribution.
+
+ * sg.c, sg.h: Return sense data to user.
+
+ * aha1542.c, aha1740.c, buslogic.c: Do not panic if
+ sense buffer is wrong size.
+
+ * hosts.c: Test for ultrastor card before any of the others.
+
+ * scsi.c: Allow boot-time option for max_scsi_luns=? so that
+ buggy firmware has an easy work-around.
+
+Sun May 15 20:24:34 1994 Eric Youngdale (eric@esp22)
+
+ * Linux 1.1.15 released.
+
+ Post-codefreeze thaw...
+
+ * buslogic.[c,h]: New driver from David Gentzel.
+
+ * hosts.h: Add use_clustering field to explicitly say whether
+ clustering should be used for devices attached to this host
+ adapter. The buslogic board apparently supports large SG lists,
+ but it is apparently faster if sd.c condenses this into a smaller
+ list.
+
+ * sd.c: Use this field instead of heuristic.
+
+ * All host adapter include files: Add appropriate initializer for
+ use_clustering field.
+
+ * scsi.h: Add #defines for return codes for the abort and reset
+ functions. There are now a specific set of return codes to fully
+ specify all of the possible things that the low-level adapter
+ could do.
+
+ * scsi.c: Act based upon return codes from abort/reset functions.
+
+ * All host adapter abort/reset functions: Return new return code.
+
+ * Add code in scsi.c to help debug timeouts. Use #define
+ DEBUG_TIMEOUT to enable this.
+
+ * scsi.c: If the host->irq field is set, use
+ disable_irq/enable_irq before calling queuecommand if we
+ are not already in an interrupt. Reduce races, and we
+ can be sloppier about cli/sti in the interrupt routines now
+ (reduce interrupt latency).
+
+ * constants.c: Fix some things to eliminate warnings. Add some
+ sense descriptions that were omitted before.
+
+ * aha1542.c: Watch for SCRD from host adapter - if we see it, set
+ a flag. Currently we only print out the number of pending
+ commands that might need to be restarted.
+
+ * aha1542.c (aha1542_abort): Look for lost interrupts, OGMB still
+ full, and attempt to recover. Otherwise give up.
+
+ * aha1542.c (aha1542_reset): Try BUS DEVICE RESET, and then pass
+ DID_RESET back up to the upper level code for all commands running
+ on this target (even on different LUNs).
+
+Sat May 7 14:54:01 1994
+
+ * Linux 1.1.12 released.
+
+ * st.c, st.h: New version from Kai. Supports boot time
+ specification of number of buffers.
+
+ * wd7000.[c,h]: Updated driver from John Boyd. Now supports
+ more than one wd7000 board in machine at one time, among other things.
+
+Wed Apr 20 22:20:35 1994
+
+ * Linux 1.1.8 released.
+
+ * sd.c: Add a few type casts where scsi_malloc is called.
+
+Wed Apr 13 12:53:29 1994
+
+ * Linux 1.1.4 released.
+
+ * scsi.c: Clean up a few printks (use %p to print pointers).
+
+Wed Apr 13 11:33:02 1994
+
+ * Linux 1.1.3 released.
+
+ * fdomain.c: Update to version 5.16 (Handle different FIFO sizes
+ better).
+
+Fri Apr 8 08:57:19 1994
+
+ * Linux 1.1.2 released.
+
+ * Throughout: SCSI portion of cluster diffs added.
+
+Tue Apr 5 07:41:50 1994
+
+ * Linux 1.1 development tree initiated.
+
+ * The linux 1.0 development tree is now effectively frozen except
+ for obvious bugfixes.
+
+******************************************************************
+******************************************************************
+******************************************************************
+******************************************************************
+
+Sun Apr 17 00:17:39 1994
+
+ * Linux 1.0, patchlevel 9 released.
+
+ * fdomain.c: Update to version 5.16 (Handle different FIFO sizes
+ better).
+
+Thu Apr 7 08:36:20 1994
+
+ * Linux 1.0, patchlevel8 released.
+
+ * fdomain.c: Update to version 5.15 from 5.9. Handles 3.4 bios.
+
+Sun Apr 3 14:43:03 1994
+
+ * Linux 1.0, patchlevel6 released.
+
+ * wd7000.c: Make stab at fixing race condition.
+
+Sat Mar 26 14:14:50 1994
+
+ * Linux 1.0, patchlevel5 released.
+
+ * aha152x.c, Makefile: Fix a few bugs (too much data message).
+ Add a few more bios signatures. (Patches from Juergen).
+
+ * aha1542.c: Fix race condition in aha1542_out.
+
+Mon Mar 21 16:36:20 1994
+
+ * Linux 1.0, patchlevel3 released.
+
+ * sd.c, st.c, sr.c, sg.c: Return -ENXIO, not -ENODEV if we attempt
+ to open a non-existent device.
+
+ * scsi.c: Add Chinon cdrom to blacklist.
+
+ * sr_ioctl.c: Check return status of verify_area.
+
+Sat Mar 6 16:06:19 1994
+
+ * Linux 1.0 released (technically a pre-release).
+
+ * scsi.c: Add IMS CDD521, Maxtor XT-8760S to blacklist.
+
+Tue Feb 15 10:58:20 1994
+
+ * pl15e released.
+
+ * aha1542.c: For 1542C, allow dynamic device scan with >1Gb turned
+ off.
+
+ * constants.c: Fix typo in definition of CONSTANTS.
+
+ * pl15d released.
+
+Fri Feb 11 10:10:16 1994
+
+ * pl15c released.
+
+ * scsi.c: Add Maxtor XT-3280 and Rodime RO3000S to blacklist.
+
+ * scsi.c: Allow tagged queueing for scsi 3 devices as well.
+ Some really old devices report a version number of 0. Disallow
+ LUN != 0 for these.
+
+Thu Feb 10 09:48:57 1994
+
+ * pl15b released.
+
+Sun Feb 6 12:19:46 1994
+
+ * pl15a released.
+
+Fri Feb 4 09:02:17 1994
+
+ * scsi.c: Add Teac cdrom to blacklist.
+
+Thu Feb 3 14:16:43 1994
+
+ * pl15 released.
+
+Tue Feb 1 15:47:43 1994
+
+ * pl14w released.
+
+ * wd7000.c (wd_bases): Fix typo in last change.
+
+Mon Jan 24 17:37:23 1994
+
+ * pl14u released.
+
+ * aha1542.c: Support 1542CF/extended bios. Different from 1542C
+
+ * wd7000.c: Allow bios at 0xd8000 as well.
+
+ * ultrastor.c: Do not truncate cylinders to 1024.
+
+ * fdomain.c: Update to version 5.9 (add new bios signature).
+
+ * NCR5380.c: Update from Drew - should work a lot better now.
+
+Sat Jan 8 15:13:10 1994
+
+ * pl14o released.
+
+ * sr_ioctl.c: Zero reserved field before trying to set audio volume.
+
+Wed Jan 5 13:21:10 1994
+
+ * pl14m released.
+
+ * fdomain.c: Update to version 5.8. No functional difference???
+
+Tue Jan 4 14:26:13 1994
+
+ * pl14l released.
+
+ * ultrastor.c: Remove outl, inl functions (now provided elsewhere).
+
+Mon Jan 3 12:27:25 1994
+
+ * pl14k released.
+
+ * aha152x.c: Remove insw and outsw functions.
+
+ * fdomain.c: Ditto.
+
+Wed Dec 29 09:47:20 1993
+
+ * pl14i released.
+
+ * scsi.c: Support RECOVERED_ERROR for tape drives.
+
+ * st.c: Update of tape driver from Kai.
+
+Tue Dec 21 09:18:30 1993
+
+ * pl14g released.
+
+ * aha1542.[c,h]: Support extended BIOS stuff.
+
+ * scsi.c: Clean up messages about disks, so they are displayed as
+ sda, sdb, etc instead of sd0, sd1, etc.
+
+ * sr.c: Force reread of capacity if disk was changed.
+ Clear buffer before asking for capacity/sectorsize (some drives
+ do not report this properly). Set needs_sector_size flag if
+ drive did not return sensible sector size.
+
+Mon Dec 13 12:13:47 1993
+
+ * aha152x.c: Update to version .101 from Juergen.
+
+Mon Nov 29 03:03:00 1993
+
+ * linux 0.99.14 released.
+
+ * All scsi stuff moved from kernel/blk_drv/scsi to drivers/scsi.
+
+ * Throughout: Grammatical corrections to various comments.
+
+ * Makefile: fix so that we do not need to compile things we are
+ not going to use.
+
+ * NCR5380.c, NCR5380.h, g_NCR5380.c, g_NCR5380.h, pas16.c,
+ pas16.h, t128.c, t128.h: New files from Drew.
+
+ * aha152x.c, aha152x.h: New files from Juergen Fischer.
+
+ * aha1542.c: Support for more than one 1542 in the machine
+ at the same time. Make functions static that do not need
+ visibility.
+
+ * aha1740.c: Set NEEDS_JUMPSTART flag in reset function, so we
+ know to restart the command. Change prototype of aha1740_reset
+ to take a command pointer.
+
+ * constants.c: Clean up a few things.
+
+ * fdomain.c: Update to version 5.6. Move snarf_region. Allow
+ board to be set at different SCSI ids. Remove support for
+ reselection (did not work well). Set JUMPSTART flag in reset
+ code.
+
+ * hosts.c: Support new low-level adapters. Allow for more than
+ one adapter of a given type.
+
+ * hosts.h: Allow for more than one adapter of a given type.
+
+ * scsi.c: Add scsi_device_types array, if NEEDS_JUMPSTART is set
+ after a low-level reset, start the command again. Sort blacklist,
+ and add Maxtor MXT-1240S, XT-4170S, NEC CDROM 84, Seagate ST157N.
+
+ * scsi.h: Add constants for tagged queueing.
+
+ * Throughout: Use constants from major.h instead of hardcoded
+ numbers for major numbers.
+
+ * scsi_ioctl.c: Fix bug in buffer length in ioctl_command. Use
+ verify_area in GET_IDLUN ioctl. Add new ioctls for
+ TAGGED_QUEUE_ENABLE, DISABLE. Only allow IOCTL_SEND_COMMAND by
+ superuser.
+
+ * sd.c: Only pay attention to UNIT_ATTENTION for removable disks.
+ Fix bug where sometimes portions of blocks would get lost
+ resulting in processes hanging. Add messages when we spin up a
+ disk, and fix a bug in the timing. Increase read-ahead for disks
+ that are on a scatter-gather capable host adapter.
+
+ * seagate.c: Fix so that some parameters can be set from the lilo
+ prompt. Supply jumpstart flag if we are resetting and need the
+ command restarted. Fix so that we return 1 if we detect a card
+ so that multiple card detection works correctly. Add yet another
+ signature for FD cards (950). Add another signature for ST0x.
+
+ * sg.c, sg.h: New files from Lawrence Foard for generic scsi
+ access.
+
+ * sr.c: Add type casts for (void*) so that we can do pointer
+ arithmetic. Works with GCC without this, but it is not strictly
+ correct. Same bugfix as was in sd.c. Increase read-ahead a la
+ disk driver.
+
+ * sr_ioctl.c: Use scsi_malloc buffer instead of buffer from stack
+ since we cannot guarantee that the stack is < 16Mb.
+
+ ultrastor.c: Update to support 24f properly (JFC's driver).
+
+ wd7000.c: Supply jumpstart flag for reset. Do not round up
+ number of cylinders in biosparam function.
+
+Sat Sep 4 20:49:56 1993
+
+ * 0.99pl13 released.
+
+ * Throughout: Use check_region/snarf_region for all low-level
+ drivers.
+
+ * aha1542.c: Do hard reset instead of soft (some ethercard probes
+ screw us up).
+
+ * scsi.c: Add new flag ASKED_FOR_SENSE so that we can tell if we are
+ in a loop whereby the device returns null sense data.
+
+ * sd.c: Add code to spin up a drive if it is not already spinning.
+ Do this one at a time to make it easier on power supplies.
+
+ * sd_ioctl.c: Use sync_dev instead of fsync_dev in BLKFLSBUF ioctl.
+
+ * seagate.c: Switch around DATA/CONTROL lines.
+
+ * st.c: Change sense to unsigned.
+
+Thu Aug 5 11:59:18 1993
+
+ * 0.99pl12 released.
+
+ * constants.c, constants.h: New files with ascii descriptions of
+ various conditions.
+
+ * Makefile: Do not try to count the number of low-level drivers,
+ just generate the list of .o files.
+
+ * aha1542.c: Replace 16 with sizeof(SCpnt->sense_buffer). Add tests
+ for addresses > 16Mb, panic if we find one.
+
+ * aha1740.c: Ditto with sizeof().
+
+ * fdomain.c: Update to version 3.18. Add new signature, register IRQ
+ with irqaction. Use ID 7 for new board. Be more intelligent about
+ obtaining the h/s/c numbers for biosparam.
+
+ * hosts.c: Do not depend upon Makefile generated count of the number
+ of low-level host adapters.
+
+ * scsi.c: Use array for scsi_command_size instead of a function. Add
+ Texel cdrom and Maxtor XT-4380S to blacklist. Allow compile time
+ option for no-multi lun scan. Add semaphore for possible problems
+ with handshaking, assume device is faulty until we know it not to be
+ the case. Add DEBUG_INIT symbol to dump info as we scan for devices.
+ Zero sense buffer so we can tell if we need to request it. When
+ examining sense information, request sense if buffer is all zero.
+ If RESET, request sense information to see what to do next.
+
+ * scsi_debug.c: Change some constants to use symbols like INT_MAX.
+
+ * scsi_ioctl.c (kernel_scsi_ioctl): New function -for making ioctl
+ calls from kernel space.
+
+ * sd.c: Increase timeout to 300. Use functions in constants.h to
+ display info. Use scsi_malloc buffer for READ_CAPACITY, since
+ we cannot guarantee that a stack based buffer is < 16Mb.
+
+ * sd_ioctl.c: Add BLKFLSBUF ioctl.
+
+ * seagate.c: Add new compile time options for ARBITRATE,
+ SLOW_HANDSHAKE, and SLOW_RATE. Update assembly loops for transferring
+ data. Use kernel_scsi_ioctl to request mode page with geometry.
+
+ * sr.c: Use functions in constants.c to display messages.
+
+ * st.c: Support for variable block size.
+
+ * ultrastor.c: Do not use cache for tape drives. Set
+ unchecked_isa_dma flag, even though this may not be needed (gets set
+ later).
+
+Sat Jul 17 18:32:44 1993
+
+ * 0.99pl11 released. C++ compilable.
+
+ * Throughout: Add type casts all over the place, and use "ip" instead
+ of "info" in the various biosparam functions.
+
+ * Makefile: Compile seagate.c with C++ compiler.
+
+ * aha1542.c: Always set ccb pointer as this gets trashed somehow on
+ some systems. Add a few type casts. Update biosparam function a little.
+
+ * aha1740.c: Add a few type casts.
+
+ * fdomain.c: Update to version 3.17 from 3.6. Now works with
+ TMC-18C50.
+
+ * scsi.c: Minor changes here and there with datatypes. Save use_sg
+ when requesting sense information so that this can properly be
+ restored if we retry the command. Set aside dma buffers assuming each
+ block is 1 page, not 1Kb minix block.
+
+ * scsi_ioctl.c: Add a few type casts. Other minor changes.
+
+ * sd.c: Correctly free all scsi_malloc'd memory if we run out of
+ dma_pool. Store blocksize information for each partition.
+
+ * seagate.c: Minor cleanups here and there.
+
+ * sr.c: Set up blocksize array for all discs. Fix bug in freeing
+ buffers if we run out of dma pool.
+
+Thu Jun 2 17:58:11 1993
+
+ * 0.99pl10 released.
+
+ * aha1542.c: Support for BT 445S (VL-bus board with no dma channel).
+
+ * fdomain.c: Upgrade to version 3.6. Preliminary support for TNC-18C50.
+
+ * scsi.c: First attempt to fix problem with old_use_sg. Change
+ NOT_READY to a SUGGEST_ABORT. Fix timeout race where time might
+ get decremented past zero.
+
+ * sd.c: Add block_fsync function to dispatch table.
+
+ * sr.c: Increase timeout to 500 from 250. Add entry for sync in
+ dispatch table (supply NULL). If we do not have a sectorsize,
+ try to get it in the sd_open function. Add new function just to
+ obtain sectorsize.
+
+ * sr.h: Add needs_sector_size semaphore.
+
+ * st.c: Add NULL for fsync in dispatch table.
+
+ * wd7000.c: Allow another condition for power on that are normal
+ and do not require a panic.
+
+Thu Apr 22 23:10:11 1993
+
+ * 0.99pl9 released.
+
+ * aha1542.c: Use (void) instead of () in setup_mailboxes.
+
+ * scsi.c: Initialize transfersize and underflow fields in SCmd to 0.
+ Do not panic for unsupported message bytes.
+
+ * scsi.h: Allocate 12 bytes instead of 10 for commands. Add
+ transfersize and underflow fields.
+
+ * scsi_ioctl.c: Further bugfix to ioctl_probe.
+
+ * sd.c: Use long instead of int for last parameter in sd_ioctl.
+ Initialize transfersize and underflow fields.
+
+ * sd_ioctl.c: Ditto for sd_ioctl(,,,,);
+
+ * seagate.c: New version from Drew. Includes new signatures for FD
+ cards. Support for 0ws jumper. Correctly initialize
+ scsi_hosts[hostnum].this_id. Improved handing of
+ disconnect/reconnect, and support command linking. Use
+ transfersize and underflow fields. Support scatter-gather.
+
+ * sr.c, sr_ioctl.c: Use long instead of int for last parameter in sr_ioctl.
+ Use buffer and buflength in do_ioctl. Patches from Chris Newbold for
+ scsi-2 audio commands.
+
+ * ultrastor.c: Comment out in_byte (compiler warning).
+
+ * wd7000.c: Change () to (void) in wd7000_enable_dma.
+
+Wed Mar 31 16:36:25 1993
+
+ * 0.99pl8 released.
+
+ * aha1542.c: Handle mailboxes better for 1542C.
+ Do not truncate number of cylinders at 1024 for biosparam call.
+
+ * aha1740.c: Fix a few minor bugs for multiple devices.
+ Same as above for biosparam.
+
+ * scsi.c: Add lockable semaphore for removable devices that can have
+ media removal prevented. Add another signature for flopticals.
+ (allocate_device): Fix race condition. Allow more space in dma pool
+ for blocksizes of up to 4Kb.
+
+ * scsi.h: Define COMMAND_SIZE. Define a SCSI specific version of
+ INIT_REQUEST that can run with interrupts off.
+
+ * scsi_ioctl.c: Make ioctl_probe function more idiot-proof. If
+ a removable device says ILLEGAL REQUEST to a door-locking command,
+ clear lockable flag. Add SCSI_IOCTL_GET_IDLUN ioctl. Do not attempt
+ to lock door for devices that do not have lockable semaphore set.
+
+ * sd.c: Fix race condition for multiple disks. Use INIT_SCSI_REQUEST
+ instead of INIT_REQUEST. Allow sector sizes of 1024 and 256. For
+ removable disks that are not ready, mark them as having a media change
+ (some drives do not report this later).
+
+ * seagate.c: Use volatile keyword for memory-mapped register pointers.
+
+ * sr.c: Fix race condition, a la sd.c. Increase the number of retries
+ to 1. Use INIT_SCSI_REQUEST. Allow 512 byte sector sizes. Do a
+ read_capacity when we init the device so we know the size and
+ sectorsize.
+
+ * st.c: If ioctl not found in st.c, try scsi_ioctl for others.
+
+ * ultrastor.c: Do not truncate number of cylinders at 1024 for
+ biosparam call.
+
+ * wd7000.c: Ditto.
+ Throughout: Use COMMAND_SIZE macro to determine length of scsi
+ command.
+
+
+
+Sat Mar 13 17:31:29 1993
+
+ * 0.99pl7 released.
+
+ Throughout: Improve punctuation in some messages, and use new
+ verify_area syntax.
+
+ * aha1542.c: Handle unexpected interrupts better.
+
+ * scsi.c: Ditto. Handle reset conditions a bit better, asking for
+ sense information and retrying if required.
+
+ * scsi_ioctl.c: Allow for 12 byte scsi commands.
+
+ * ultrastor.c: Update to use scatter-gather.
+
+Sat Feb 20 17:57:15 1993
+
+ * 0.99pl6 released.
+
+ * fdomain.c: Update to version 3.5. Handle spurious interrupts
+ better.
+
+ * sd.c: Use register_blkdev function.
+
+ * sr.c: Ditto.
+
+ * st.c: Use register_chrdev function.
+
+ * wd7000.c: Undo previous change.
+
+Sat Feb 6 11:20:43 1993
+
+ * 0.99pl5 released.
+
+ * scsi.c: Fix bug in testing for UNIT_ATTENTION.
+
+ * wd7000.c: Check at more addresses for bios. Fix bug in biosparam
+ (heads & sectors turned around).
+
+Wed Jan 20 18:13:59 1993
+
+ * 0.99pl4 released.
+
+ * scsi.c: Ignore leading spaces when looking for blacklisted devices.
+
+ * seagate.c: Add a few new signatures for FD cards. Another patch
+ with SCint to fix race condition. Use recursion_depth to keep track
+ of how many times we have been recursively called, and do not start
+ another command unless we are on the outer level. Fixes bug
+ with Syquest cartridge drives (used to crash kernel), because
+ they do not disconnect with large data transfers.
+
+Tue Jan 12 14:33:36 1993
+
+ * 0.99pl3 released.
+
+ * fdomain.c: Update to version 3.3 (a few new signatures).
+
+ * scsi.c: Add CDU-541, Denon DRD-25X to blacklist.
+ (allocate_request, request_queueable): Init request.waiting to NULL if
+ non-buffer type of request.
+
+ * seagate.c: Allow controller to be overridden with CONTROLLER symbol.
+ Set SCint=NULL when we are done, to remove race condition.
+
+ * st.c: Changes from Kai.
+
+Wed Dec 30 20:03:47 1992
+
+ * 0.99pl2 released.
+
+ * scsi.c: Blacklist back in. Remove Newbury drive as other bugfix
+ eliminates need for it here.
+
+ * sd.c: Return ENODEV instead of EACCES if no such device available.
+ (sd_init) Init blkdev_fops earlier so that sd_open is available sooner.
+
+ * sr.c: Same as above for sd.c.
+
+ * st.c: Return ENODEV instead of ENXIO if no device. Init chrdev_fops
+ sooner, so that it is always there even if no tapes.
+
+ * seagate.c (controller_type): New variable to keep track of ST0x or
+ FD. Modify signatures list to indicate controller type, and init
+ controller_type once we find a match.
+
+ * wd7000.c (wd7000_set_sync): Remove redundant function.
+
+Sun Dec 20 16:26:24 1992
+
+ * 0.99pl1 released.
+
+ * scsi_ioctl.c: Bugfix - check dev->index, not dev->id against
+ NR_SCSI_DEVICES.
+
+ * sr_ioctl.c: Verify that device exists before allowing an ioctl.
+
+ * st.c: Patches from Kai - change timeout values, improve end of tape
+ handling.
+
+Sun Dec 13 18:15:23 1992
+
+ * 0.99 kernel released. Baseline for this ChangeLog.
diff --git a/Documentation/scsi/ChangeLog.arcmsr b/Documentation/scsi/ChangeLog.arcmsr
new file mode 100644
index 0000000..038a3e6
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.arcmsr
@@ -0,0 +1,118 @@
+**************************************************************************
+** History
+**
+** REV# DATE NAME DESCRIPTION
+** 1.00.00.00 3/31/2004 Erich Chen First release
+** 1.10.00.04 7/28/2004 Erich Chen modify for ioctl
+** 1.10.00.06 8/28/2004 Erich Chen modify for 2.6.x
+** 1.10.00.08 9/28/2004 Erich Chen modify for x86_64
+** 1.10.00.10 10/10/2004 Erich Chen bug fix for SMP & ioctl
+** 1.20.00.00 11/29/2004 Erich Chen bug fix with arcmsr_bus_reset when PHY error
+** 1.20.00.02 12/09/2004 Erich Chen bug fix with over 2T bytes RAID Volume
+** 1.20.00.04 1/09/2005 Erich Chen fits for Debian linux kernel version 2.2.xx
+** 1.20.00.05 2/20/2005 Erich Chen cleanly as look like a Linux driver at 2.6.x
+** thanks for peoples kindness comment
+** Kornel Wieliczek
+** Christoph Hellwig
+** Adrian Bunk
+** Andrew Morton
+** Christoph Hellwig
+** James Bottomley
+** Arjan van de Ven
+** 1.20.00.06 3/12/2005 Erich Chen fix with arcmsr_pci_unmap_dma "unsigned long" cast,
+** modify PCCB POOL allocated by "dma_alloc_coherent"
+** (Kornel Wieliczek's comment)
+** 1.20.00.07 3/23/2005 Erich Chen bug fix with arcmsr_scsi_host_template_init
+** occur segmentation fault,
+** if RAID adapter does not on PCI slot
+** and modprobe/rmmod this driver twice.
+** bug fix enormous stack usage (Adrian Bunk's comment)
+** 1.20.00.08 6/23/2005 Erich Chen bug fix with abort command,
+** in case of heavy loading when sata cable
+** working on low quality connection
+** 1.20.00.09 9/12/2005 Erich Chen bug fix with abort command handling, firmware version check
+** and firmware update notify for hardware bug fix
+** 1.20.00.10 9/23/2005 Erich Chen enhance sysfs function for change driver's max tag Q number.
+** add DMA_64BIT_MASK for backward compatible with all 2.6.x
+** add some useful message for abort command
+** add ioctl code 'ARCMSR_IOCTL_FLUSH_ADAPTER_CACHE'
+** customer can send this command for sync raid volume data
+** 1.20.00.11 9/29/2005 Erich Chen by comment of Arjan van de Ven fix incorrect msleep redefine
+** cast off sizeof(dma_addr_t) condition for 64bit pci_set_dma_mask
+** 1.20.00.12 9/30/2005 Erich Chen bug fix with 64bit platform's ccbs using if over 4G system memory
+** change 64bit pci_set_consistent_dma_mask into 32bit
+** increcct adapter count if adapter initialize fail.
+** miss edit at arcmsr_build_ccb....
+** psge += sizeof(struct _SG64ENTRY *) =>
+** psge += sizeof(struct _SG64ENTRY)
+** 64 bits sg entry would be incorrectly calculated
+** thanks Kornel Wieliczek give me kindly notify
+** and detail description
+** 1.20.00.13 11/15/2005 Erich Chen scheduling pending ccb with FIFO
+** change the architecture of arcmsr command queue list
+** for linux standard list
+** enable usage of pci message signal interrupt
+** follow Randy.Danlup kindness suggestion cleanup this code
+** 1.20.00.14 05/02/2007 Erich Chen & Nick Cheng
+** 1.implement PCI-Express error recovery function and AER capability
+** 2.implement the selection of ARCMSR_MAX_XFER_SECTORS_B=4096
+** if firmware version is newer than 1.42
+** 3.modify arcmsr_iop_reset to improve the ability
+** 4.modify the ISR, arcmsr_interrupt routine,to prevent the
+** inconsistency with sg_mod driver if application directly calls
+** the arcmsr driver w/o passing through scsi mid layer
+** specially thanks to Yanmin Zhang's openhanded help about AER
+** 1.20.00.15 08/30/2007 Erich Chen & Nick Cheng
+** 1. support ARC1200/1201/1202 SATA RAID adapter, which is named
+** ACB_ADAPTER_TYPE_B
+** 2. modify the arcmsr_pci_slot_reset function
+** 3. modify the arcmsr_pci_ers_disconnect_forepart function
+** 4. modify the arcmsr_pci_ers_need_reset_forepart function
+** 1.20.00.15 09/27/2007 Erich Chen & Nick Cheng
+** 1. add arcmsr_enable_eoi_mode() on adapter Type B
+** 2. add readl(reg->iop2drv_doorbell_reg) in arcmsr_handle_hbb_isr()
+** in case of the doorbell interrupt clearance is cached
+** 1.20.00.15 10/01/2007 Erich Chen & Nick Cheng
+** 1. modify acb->devstate[i][j]
+** as ARECA_RAID_GOOD instead of
+** ARECA_RAID_GONE in arcmsr_alloc_ccb_pool
+** 1.20.00.15 11/06/2007 Erich Chen & Nick Cheng
+** 1. add conditional declaration for
+** arcmsr_pci_error_detected() and
+** arcmsr_pci_slot_reset
+** 1.20.00.15 11/23/2007 Erich Chen & Nick Cheng
+** 1.check if the sg list member number
+** exceeds arcmsr default limit in arcmsr_build_ccb()
+** 2.change the returned value type of arcmsr_build_ccb()
+** from "void" to "int"
+** 3.add the conditional check if arcmsr_build_ccb()
+** returns FAILED
+** 1.20.00.15 12/04/2007 Erich Chen & Nick Cheng
+** 1. modify arcmsr_drain_donequeue() to ignore unknown
+** command and let kernel process command timeout.
+** This could handle IO request violating max. segments
+** while Linux XFS over DM-CRYPT.
+** Thanks to Milan Broz's comments <mbroz@redhat.com>
+** 1.20.00.15 12/24/2007 Erich Chen & Nick Cheng
+** 1.fix the portability problems
+** 2.fix type B where we should _not_ iounmap() acb->pmu;
+** it's not ioremapped.
+** 3.add return -ENOMEM if ioremap() fails
+** 4.transfer IS_SG64_ADDR w/ cpu_to_le32()
+** in arcmsr_build_ccb
+** 5. modify acb->devstate[i][j] as ARECA_RAID_GONE instead of
+** ARECA_RAID_GOOD in arcmsr_alloc_ccb_pool()
+** 6.fix arcmsr_cdb->Context as (unsigned long)arcmsr_cdb
+** 7.add the checking state of
+** (outbound_intstatus & ARCMSR_MU_OUTBOUND_HANDLE_INT) == 0
+** in arcmsr_handle_hba_isr
+** 8.replace pci_alloc_consistent()/pci_free_consistent() with kmalloc()/kfree() in arcmsr_iop_message_xfer()
+** 9. fix the release of dma memory for type B in arcmsr_free_ccb_pool()
+** 10.fix the arcmsr_polling_hbb_ccbdone()
+** 1.20.00.15 02/27/2008 Erich Chen & Nick Cheng
+** 1.arcmsr_iop_message_xfer() is called from atomic context under the
+** queuecommand scsi_host_template handler. James Bottomley pointed out
+** that the current GFP_KERNEL|GFP_DMA flags are wrong: firstly we are in
+** atomic context, secondly this memory is not used for DMA.
+** Also removed some unneeded casts. Thanks to Daniel Drake <dsd@gentoo.org>
+**************************************************************************
diff --git a/Documentation/scsi/ChangeLog.ips b/Documentation/scsi/ChangeLog.ips
new file mode 100644
index 0000000..5019f51
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.ips
@@ -0,0 +1,122 @@
+IBM ServeRAID driver Change Log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ 5.00.01 - Sarasota ( 5i ) adapters must always be scanned first
+ - Get rid on IOCTL_NEW_COMMAND code
+ - Add Extended DCDB Commands for Tape Support in 5I
+
+ 4.90.11 - Don't actually RESET unless it's physically required
+ - Remove unused compile options
+
+ 4.90.08 - Data Corruption if First Scatter Gather Element is > 64K
+
+ 4.90.08 - Increase Delays in Flashing ( Trombone Only - 4H )
+
+ 4.90.05 - Use New PCI Architecture to facilitate Hot Plug Development
+
+ 4.90.01 - Add ServeRAID Version Checking
+
+ 4.80.26 - Clean up potential code problems ( Arjan's recommendations )
+
+ 4.80.21 - Change memcpy() to copy_to_user() in NVRAM Page 5 IOCTL path
+
+ 4.80.20 - Set max_sectors in Scsi_Host structure ( if >= 2.4.7 kernel )
+ - 5 second delay needed after resetting an i960 adapter
+
+ 4.80.14 - Take all semaphores off stack
+ - Clean Up New_IOCTL path
+
+ 4.80.04 - Eliminate calls to strtok() if 2.4.x or greater
+ - Adjustments to Device Queue Depth
+
+ 4.80.00 - Make ia64 Safe
+
+ 4.72.01 - I/O Mapped Memory release ( so "insmod ips" does not Fail )
+ - Don't Issue Internal FFDC Command if there are Active Commands
+ - Close Window for getting too many IOCTL's active
+
+ 4.72.00 - Allow for a Scatter-Gather Element to exceed MAX_XFER Size
+
+ 4.71.00 - Change all memory allocations to not use GFP_DMA flag
+ - Code Clean-Up for 2.4.x kernel
+
+ 4.70.15 - Fix Breakup for very large ( non-SG ) requests
+
+ 4.70.13 - Don't release HA Lock in ips_next() until SC taken off queue
+ - Unregister SCSI device in ips_release()
+ - Don't Send CDB's if we already know the device is not present
+
+ 4.70.12 - Corrective actions for bad controller ( during initialization )
+
+ 4.70.09 - Use a Common ( Large Buffer ) for Flashing from the JCRM CD
+ - Add IPSSEND Flash Support
+ - Set Sense Data for Unknown SCSI Command
+ - Use Slot Number from NVRAM Page 5
+ - Restore caller's DCDB Structure
+
+ 4.20.14 - Update patch files for kernel 2.4.0-test5
+
+ 4.20.13 - Fix some failure cases / reset code
+ - Hook into the reboot_notifier to flush the controller
+ cache
+
+ 4.20.03 - Rename version to coincide with new release schedules
+ - Performance fixes
+ - Fix truncation of /proc files with cat
+ - Merge in changes through kernel 2.4.0test1ac21
+
+ 4.10.13 - Fix for dynamic unload and proc file system
+
+ 4.10.00 - Add support for ServeRAID 4M/4L
+
+ 4.00.06 - Fix timeout with initial FFDC command
+
+ 4.00.05 - Remove wish_block from init routine
+ - Use linux/spinlock.h instead of asm/spinlock.h for kernels
+ 2.3.18 and later
+ - Sync with other changes from the 2.3 kernels
+
+ 4.00.04 - Rename structures/constants to be prefixed with IPS_
+
+ 4.00.03 - Add alternative passthru interface
+ - Add ability to flash ServeRAID BIOS
+
+ 4.00.02 - Fix problem with PT DCDB with no buffer
+
+ 4.00.01 - Add support for First Failure Data Capture
+
+ 4.00.00 - Add support for ServeRAID 4
+
+ 3.60.02 - Make DCDB direction based on lookup table.
+ - Only allow one DCDB command to a SCSI ID at a time.
+
+ 3.60.01 - Remove bogus error check in passthru routine.
+
+ 3.60.00 - Bump max commands to 128 for use with ServeRAID
+ firmware 3.60.
+ - Change version to 3.60 to coincide with ServeRAID release
+ numbering.
+
+ 1.00.00 - Initial Public Release
+ - Functionally equivalent to 0.99.05
+
+ 0.99.05 - Fix an oops on certain passthru commands
+
+ 0.99.04 - Fix race condition in the passthru mechanism
+ -- this required the interface to the utilities to change
+ - Fix error recovery code
+
+ 0.99.03 - Make interrupt routine handle all completed request on the
+ adapter not just the first one
+ - Make sure passthru commands get woken up if we run out of
+ SCBs
+ - Send all of the commands on the queue at once rather than
+ one at a time since the card will support it.
+
+ 0.99.02 - Added some additional debug statements to print out
+ errors if an error occurs while trying to read/write
+ to a logical drive (IPS_DEBUG).
+
+ - Fixed read/write errors when the adapter is using an
+ 8K stripe size.
+
diff --git a/Documentation/scsi/ChangeLog.lpfc b/Documentation/scsi/ChangeLog.lpfc
new file mode 100644
index 0000000..ae3f962
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.lpfc
@@ -0,0 +1,1865 @@
+Known issues :
+ * Please read the associated RELEASE-NOTES file !!!
+ * This source release intended for upstream kernel releases only!
+
+Changes from 20050323 to 20050413
+
+ * Changed version number to 8.0.28
+ * Fixed build warning for 2.6.12-rc2 kernels: mempool_alloc now
+ requires a function which takes an unsigned int for gfp_flags.
+ * Removed pci dma sync calls to coherent/consistent pci memory.
+ * Merged patch from Christoph Hellwig <hch@lst.de>: split helpers
+ for fabric and nport logins out of lpfc_cmpl_els_flogi.
+ * Removed sysfs attributes that are used to dump the various
+ discovery lists.
+ * Fix for issue where not all luns are seen. Search all lists
+ other than unmap list in lpfc_find_target(). Otherwise INQUIRY
+ to luns on nodes in NPR or other relevant states (PLOGI,
+ PRLI...) are errored back and scan() terminates.
+ * Removed FC_TRANSPORT_PATCHESxxx defines. They're in 2.6.12-rc1.
+ * Compare return value of lpfc_scsi_tgt_reset against SCSI
+ midlayer codes SUCCESS/FAILED which that function returns rather
+ than SLI return code.
+ * Removed extraneous calls to lpfc_sli_next_iotag which should
+ only be called from lpfc_sli_submit_iocb. Also make
+ lpfc_sli_next_iotag static.
+ * Added PCI ID for LP10000-S.
+ * Changes in lpfc_abort_handler(): Return SUCCESS if we did not
+ find command in both TX and TX completion queues. Return ERROR
+ if we timed out waiting for command to complete after abort was
+ issued.
+ * Zero-out response sense length in lpfc_scsi_prep_cmnd to prevent
+ interpretation of stale sense length when the command completes
+ - was causing spurious 0710 messages.
+ * Moved clearing of host_scribble inside host_lock in IO
+ completion path.
+ * Fixed a bunch of mixed tab/space indentation.
+ * Allow hex format numbers in sysfs attribute setting. Fix
+ application hang when invalid numbers are used in sysfs
+ settings.
+ * Removed extra iotag allocation by lpfc_abort_handler.
+ * Clear host_scribble in the scsi_cmnd structure when failing in
+ queuecommand.
+ * Changed logic at top of lpfc_abort_handler so that if the
+ command's host_scibble field is NULL, return SUCCESS because the
+ driver has already returned the command to the midlayer.
+
+Changes from 20050308 to 20050323
+
+ * Changed version number to 8.0.27
+ * Changed a few lines from patch submitted by Christoph Hellwig
+ (3/19). MAILBOX_WSIZE * (uint32_t) is replaced with an
+ equivalent MAILBOX_CMDSIZE macro.
+ * Merged patch from Christoph Hellwig (3/19): some misc patches
+ against the latest drivers:
+ - stop using volatile. if you need special ordering use memory
+ barriers but that doesn't seem to be the case here
+ - switch lpfc_sli_pcimem_bcopy to take void * arguments.
+ - remove typecast for constants - a U postfix marks them
+ unsigned int in C
+ - add a MAILBOX_CMD_SIZE macro, as most users of
+ MAILBOX_CMD_WSIZE didn't really want the word count
+ - kill struct lpfc_scsi_dma_buf and embedded the two members
+ directly in struct lpfc_scsi_buf
+ - don't call dma_sync function on allocations from
+ pci_pool_alloc - it's only for streaming mappings (pci_map_*)
+ * Merged patch from Christoph Hellwig (3/19) - nlp_failMask isn't
+ ever used by the driver, just reported to userspace (and that in
+ a multi-value file which is against the sysfs guidelines).
+ * Change pci_module_init to pci_register_module() with appropriate
+ ifdefs.
+ * Added #include <linux/dma-mapping.h> as required by the DMA
+ 32bit and 64bit defines on some archs.
+ * Merged patch from Christoph Hellwig (03/19) - fix initialization
+ order - scsi_add_host must happen last from scsi POV. Also some
+ minor style/comment fixups.
+ * Fixed use of TRANSPORT_PATCHES_V2 by changing to
+ FC_TRANSPORT_PATCHES_V2.
+
+Changes from 20050223 to 20050308
+
+ * Changed version number to 8.0.26
+ * Revise TRANSPORT_PATCHES_V2 so that lpfc_target is removed and
+ rport data is used instead. Removed device_queue_hash[].
+ * Changed RW attributes of scan_down, max_luns and fcp_bind_method
+ to R only.
+ * Fixed RSCN handling during initial link initialization.
+ * Fixed issue with receiving PLOGI handling when node is on NPR
+ list and marked for ADISC.
+ * Fixed RSCN timeout issues.
+ * Reduced severity of "SCSI layer issued abort device" message to
+ KERN_WARNING.
+ * Feedback from Christoph Hellwig (on 2/5) - In the LPFC_EVT_SCAN
+ case the caller already has the target ID handly, so pass that
+ one in evt_arg1.
+ * Fix compile warning/resultant panic in
+ lpfc_register_remote_port().
+
+Changes from 20050215 to 20050223
+
+ * Changed version number to 8.0.25
+ * Add appropriate comments to lpfc_sli.c.
+ * Use DMA_64BIT_MASK and DMA_32BIT_MASK defines instead of
+ 0xffffffffffffffffULL & 0xffffffffULL respectively. Use pci
+ equivalents instead of dma_set_mask and also modify condition
+ clause to actually exit on error condition.
+ * Restart els timeout handler only if txcmplq_cnt. On submission,
+ mod_timer the els_tmofunc. This prevents the worker thread from
+ waking up the els_tmo handler un-necessarily. The thread was
+ being woken up even when there were no pending els commands.
+ * Added new typedefs for abort and reset functions.
+ * Collapsed lpfc_sli_abort_iocb_xxx into a single function.
+ * Collapsed lpfc_sli_sum_iocb_xxx into a single function.
+ * Removed TXQ from all abort and reset handlers since it is never
+ used.
+ * Fixed Oops panic in 8.0.23 (reported on SourceForge). The
+ driver was not handling LPFC_IO_POLL cases correctly in
+ fast_ring_event and was setting the tgt_reset timeout to 0 in
+ lpfc_reset_bus_handler. This 0 timeout would not allow the FW
+ to timeout ABTS's on bad targets and allow the driver to have an
+ iocb on two lists. Also split the lpfc_sli_ringtxcmpl_get
+ function into two routines to match the fast and slow completion
+ semantics - ELS completions worked for the wrong reasons. Also
+ provided new log message number - had two 0326 entries.
+ * Removed unused #define LPFC_SCSI_INITIAL_BPL_SIZE.
+ * Removed unused struct lpfc_node_farp_pend definition.
+ * Removed unused #define LPFC_SLIM2_PAGE_AREA.
+ * Changed zeros used as pointers to NULL.
+ * Removed unneeded braces around single line in lpfc_do_work.
+ * Close humongous memory leak in lpfc_sli.c - driver was losing 13
+ iocbq structures per LIP.
+ * Removed last of GFP_ATOMIC allocations.
+ * Locks are not taken outside of nportdisc, hbadisc, els and most
+ of the init, sli, mbox and ct groups of functions
+ * Fix comment for lpfc_sli_iocb_cmd_type to fit within 80 columns.
+ * Replaced wait_event() with wait_event_interruptible().
+ wait_event() puts the woker thread in an UNINTERRUPTIBLE state
+ causing it to figure in load average calculations. Also add a
+ BUG_ON to the ret code of wait_event_interruptible() since the
+ premise is that the worker thread is signal-immune.
+
+Changes from 20050208 to 20050215
+
+ * Changed version number to 8.0.24
+ * Fixed a memory leak of iocbq structure. For ELS solicited iocbs
+ sli layer now frees the response iocbs after processing it.
+ * Closed large memory leak -- we were losing 13 iocbq structures
+ per LIP.
+ * Changing EIO and ENOMEM to -EIO and -ENOMEM respectively.
+ * Cleanup of lpfc_sli_iocb_cmd_type array and typing of iocb type.
+ * Implemented Christoph Hellwig's feedback from 02/05: Remove
+ macros putLunHigh, putLunLow. Use lpfc_put_lun() inline instead.
+ * Integrated Christoph Hellwig's feedback from 02/05: Instead of
+ cpu_to_be32(), use swab16((uint16_t)lun). This is the same as
+ "swab16() on LE" and "<<16 on BE".
+ * Added updates for revised FC remote port patch (dev_loss_tmo
+ moved to rport, hostdata renamed dd_data, add fc_remote_host()
+ on shutdown).
+ * Removed unnecessary function prototype.
+ * Added code to prevent waking up worker thread after the exit of
+ worker thread. Fixes panic seen with insmod/rmmod testing with
+ 70 disks.
+ * Integrated Christoph Hellwig's patch from 1/30: Make some
+ variables/code static (namely lpfcAlpaArray and
+ process_nodev_timeout()).
+ * Integrated Christoph Hellwig's patch from 1/30: Use
+ switch...case instead of if...else if...else if while decoding
+ JDEC id.
+
+Changes from 20050201 to 20050208
+
+ * Changed version number to 8.0.23
+ * Make lpfc_work_done, lpfc_get_scsi_buf,
+ lpfc_mbx_process_link_up, lpfc_mbx_issue_link_down and
+ lpfc_sli_chipset_init static.
+ * Cleaned up references to list_head->next field in the driver.
+ * Replaced lpfc_discq_post_event with lpfc_workq_post_event.
+ * Implmented Christoph Hellwig's review from 2/5: Check for return
+ values of kmalloc.
+ * Integrated Christoph Hellwig's patch from 1/30: Protecting
+ scan_tmo and friends in !FC_TRANSPORT_PATCHES_V2 &&
+ !USE_SCAN_TARGET.
+ * Integrated Christoph Hellwig's patch from 1/30: Some fixes in
+ the evt handling area.
+ * Integrated Christoph Hellwig's patch from 1/30: Remove usage of
+ intr_inited variable. The interrupt initilization from OS side
+ now happens in lpfc_probe_one().
+ * Integrated Christoph Hellwig's patch from 1/30: remove shim
+ lpfc_alloc_transport_attr - remove shim lpfc_alloc_shost_attrs -
+ remove shim lpfc_scsi_host_init - allocate phba mem in scsi's
+ hostdata readjust code so that they are no use after free's
+ (don't use after scsi_host_put) - make lpfc_alloc_sysfs_attr
+ return errors
+ * Fixed panic in lpfc_probe_one(). Do not delete in a list
+ iterator that is not safe.
+ * Clean up fast lookup array of the fcp_ring when aborting iocbs.
+ * Following timeout handlers moved to the lpfc worker thread:
+ lpfc_disc_timeout, lpfc_els_timeout, lpfc_mbox, lpfc_fdmi_tmo,
+ lpfc_nodev_timeout, lpfc_els_retry_delay.
+ * Removed unused NLP_NS_NODE #define.
+ * Integrated Christoph Hellwig's patch from 1/30: remove unused
+ lpfc_hba_list; remove unused lpfc_rdrev_wd30; remove
+ lpfc_get_brd_no and use Linux provided IDR.
+ * Changed board reset procedure so that lpfc_sli_send_reset()
+ writes the INITFF bit and leaves lpfc_sli_brdreset() to clear
+ the bit.
+ * Removed outfcpio sysfs device attribute.
+ * VPD changes: 1) Modify driver to use the model name and
+ description from the VPD data if it exists 2) Rework use of DUMP
+ mailbox command to support HBAs with 256 bytes of SLIM.
+ * Fixed compile error for implicit definition of struct
+ scsi_target
+
+Changes from 20050124 to 20050201
+
+ * Changed version number to 8.0.22
+ * Moved discovery timeout handler to worker thread. There are
+ function calls in this function which are not safe to call from
+ HW interrupt context.
+ * Removed free_irq from the error path of HBA initialization.
+ This will fix the free of uninitialised IRQ when config_port
+ fails.
+ * Make sure function which processes unsolicited IOCBs on ELS ring
+ still is called with the lock held.
+ * Clear LA bit from work_ha when we are not supposed to handle LA.
+ * Fix double locking bug in the error handling part of
+ lpfc_mbx_cmpl_read_la.
+ * Implemented fast IOCB processing for FCP ring.
+ * Since mboxes are now unconditionally allocated outside of the
+ lock, free them in cases where they are not used.
+ * Moved out a couple of GFP_ATOMICs in lpfc_disc_timeout, to
+ before locks so that they can GFP_KERNEL instead. Also cleaned
+ up code.
+ * Collapsed interrupt handling code into one function.
+ * Removed event posting and handling of solicited and unsolicited
+ iocbs.
+ * Remove ELS ring handling leftovers from the lpfc_sli_inter().
+ * ELS ring (any slow ring) moved from the lpfc_sli_inter() into a
+ worker thread. Link Attention, Mbox Attention, and Error
+ Attention, as well as slow rings' attention is passed to the
+ worker thread via worker thread copy of Host Attention
+ register. Corresponding events are removed from the event queue
+ handling.
+ * Add entries to hba structure to delegate some functionality from
+ the lpfc_sli_inter() to a worker thread.
+ * Reduced used of GFP_ATOMIC for memory allocations.
+ * Moved locks deeper in order to change GFP_ATOMIC to GFP_KERNEL.
+ * IOCB initialization fix for Raw IO.
+ * Removed qcmdcnt, iodonecnt, errcnt from lpfc_target and from
+ driver.
+ * Added call to lpfc_els_abort in lpfc_free_node. Modified
+ lpfc_els_abort to reset txq and txcmplq iterator after a
+ iocb_cmpl call.
+ * Fixed a use after free issue in lpfc_init.c.
+ * Defined default mailbox completion routine and removed code in
+ the sli layer which checks the mbox_cmpl == 0 to free mail box
+ resources.
+ * In lpfc_workq_post_event, clean up comment formatting and remove
+ unneeded cast of kmalloc's return.
+ * Removed loop which calls fc_remote_port_unblock and
+ fc_remote_port_delete for every target as this same effect is
+ accomplished by the scsi_remove_host call.
+ * Minor cleanup of header files. Stop header files including
+ other header files. Removed sentinels which hide multiple
+ inclusions. Removed unneeded #include directives.
+ * Fixed memory leaks in mailbox error paths.
+ * Moved lock from around of lpfc_work_done to lpfc_work_done
+ itself.
+ * Removed typedef for LPFC_WORK_EVT_t and left just struct
+ lpfc_work_evt to comply with linux_scsi review coding style.
+ * Fixed some trailing whitespaces, spaces used for indentation and
+ ill-formatting multiline comments.
+ * Bug fix for Raw IO errors. Reuse of IOCBs now mandates setting
+ of ulpPU and fcpi_parm to avoid incorrect read check of Write IO
+ and incorrect read length.
+
+Changes from 20050110 to 20050124
+
+ * Changed version number to 8.0.21
+ * Removed unpleasant casting in the definition and use of
+ lpfc_disc_action function pointer array.
+ * Makefile cleanup. Use ?= operator for setting default
+ KERNELVERSION and BASEINCLUDE values. Use $(PWD) consistently.
+ * Removed call to lpfc_sli_intr from lpfc_config_port_post. All
+ Linux systems will service hardware interrupts while bringing up
+ the driver.
+ * Christoph Hellwig change request: Reorg of contents of
+ lpfc_hbadisc.c, lpfc_scsi.h, lpfc_init.c, lpfc_sli.c,
+ lpfc_attr.c, lpfc_scsi.c.
+ * Renamed discovery thread to lpfc_worker thread. Moved handling
+ of error attention and link attention and mbox event handler to
+ lpfc_worker thread.
+ * Removed .proc_info and .proc_name from the driver template and
+ associated code.
+ * Removed check of FC_UNLOADING flag in lpfc_queuecommand to
+ determine what result to return.
+ * Move modification of FC_UNLOADING flag under host_lock.
+ * Fix IOERR_RCV_BUFFER_WAITING handling for CT and ELS subsystem.
+ * Workaround firmware bug for IOERR_RCV_BUFFER_WAITING on ELS
+ ring.
+ * Fixed a couple lpfc_post_buffer problems in lpfc_init.c.
+ * Add missing spaces to the parameter descriptions for
+ lpfc_cr_delay, lpfc_cr_count and lpfc_discovery_threads.
+ * Lock before calling lpfc_sli_hba_down().
+ * Fix leak of "host" in the error path in the remove_one() path.
+ * Fix comment for lpfc_cr_count. It defaults to 1.
+ * Fix issue where we are calling lpfc_disc_done() recursively from
+ lpfc_linkdown(), but list_for_each_entry_safe() is not safe for
+ such use.
+ * Bump lpfc_discovery_threads (count of outstading ELS commands in
+ discovery) to 32
+ * If the SCSI midlayer tries to recover from an error on a lun
+ while the corresponding target is in the NPR state, lpfc driver
+ will reject all the resets. This will cause the target to be
+ moved to offline state and block all the I/Os. The fix for this
+ is to delay the lun reset to a target which is not in MAPPED
+ state until the target is rediscovered or nodev timeout is
+ fired.
+
+Changes from 20041229 to 20050110
+
+ * Changed version number to 8.0.20
+ * rport fix: use new fc_remote_port_rolechg() function instead of
+ direct structure change
+ * rport fix: last null pointer check
+ * Phase II of GFP_ATOMIC effort. Replaced iocb_mem_pool and
+ scsibuf_mem_pool with kmalloc and linked list. Inserted list
+ operations for mempool_alloc calls. General code cleanup. All
+ abort and reset routines converted. Handle_ring_event
+ converted.
+ * If the mbox_cmpl == lpfc_sli_wake_mbox_wait in
+ lpfc_sli_handle_mb_event, pmb->context1 points to a waitq. Do
+ not free the structure.
+ * rport fixes: fix for rmmod crash
+ * rport fixes: when receiving PRLI's, set node/rport role values
+ * rport fixes: fix for unload and for fabric port deletes
+ * VPD info bug fix.
+ * lpfc_linkdown() should be able to process all outstanding events
+ by calling lpfc_disc_done() even if it is called from
+ lpfc_disc_done() Moving all events from phba->dpc_disc to local
+ local_dpc_disc prevents those events from being processed.
+ Removing that queue. From now on we should not see "Illegal
+ State Transition" messages.
+ * Release host lock and enable interrupts when calling
+ del_timer_sync()
+ * All related to rports: Clean up issues with rport deletion
+ Convert to using block/unblock on list remove (was del/add)
+ Moved rport delete to freenode - so rport tracks node.
+ * rport fixes: for fport, get maxframe and class support
+ information
+ * Added use of wait_event to work with kthread interface.
+ * Ensure that scsi_transport_fc.h is always pulled in by
+ lpfc_scsiport.c
+ * In remote port changes: no longer nulling target->pnode when
+ removing from mapped list. Pnode get nulled when the node is
+ freed (after nodev tmo). This bug was causing i/o recieved in
+ the small window while the device was blocked to be errored w/
+ did_no_connect. With the fix, it returns host_busy
+ (per the pre-remote port changes).
+ * Merge in support for fc transport remote port use. This removes
+ any consistent bindings within the driver. All scanning is now
+ on a per-target basis driven by the discovery engine.
+
+Changes from 20041220 to 20041229
+
+ * Changed version number to 8.0.19
+ * Fixed bug for handling RSCN type 3. Terminate RSCN mode
+ properly after ADISC handling completes.
+ * Add list_remove_head macro. Macro cleans up memory allocation
+ list handling. Also clean up lpfc_reset_bus_handler - routine
+ does not need to allocate its own scsi_cmnd and scsi_device
+ structures.
+ * Fixed potential discovery bug, nlp list corrutpion fix potential
+ memory leak
+ * Part 1 of the memory allocation rework request by linux-scsi.
+ This effort fixes the number of bdes per scsi_buf to 64, makes
+ the scatter-gather count a module parameter, builds a linked
+ list of scsi_bufs, and removes all dependencies on lpfc_mem.h.
+ * Reverted lpfc_do_dpc, probe_one, remove_one to original
+ implementation. Too many problems (driver not completing
+ initial discovery, and IO not starting to disks). Backs out
+ kthread patch.
+ * Fix race condition in lpfc_do_dpc. If wake_up interrupt occurs
+ while lpfc_do_dpc is running disc_done and the dpc list is
+ empty, the latest insertion is missed and the schedule_timeout
+ does not wakeup. The sleep interval is MAX_SCHEDULE_TIMEOUT
+ defined as ~0UL >> 1, a very large number. Hacked it to 5*HZ
+ for now.
+ * Fixed bug introduced when discovery thread implementation was
+ moved to kthread. kthread_stop() is not able to wake up thread
+ waiting on a semaphore and "modprobe -r lpfc" is not always
+ (most of the times) able to complete. Fix is in not using
+ semaphore for the interruptable sleep.
+ * Small Makefile cleanup - Remove remnants of 2.4 vs. 2.6
+ determination.
+
+Changes from 20041213 to 20041220
+
+ * Changed version number to 8.0.18
+ * Janitorial cleanup after removal of sliinit and ringinit[] ring
+ statistic is owned by the ring and SLI stats are in sli
+ structure.
+ * Integrated patch from Christoph Hellwig <hch@lst.de> Kill
+ compile warnings on 64 bit platforms: %variables for %llx format
+ specifiers must be caste to long long because %(u)int64_t can
+ just be long on 64bit platforms.
+ * Integrated patch from Christoph Hellwig <hch@lst.de> Removes
+ dead code.
+ * Integrated patch from Christoph Hellwig <hch@lst.de>: use
+ kthread interface.
+ * Print LPFC_MODULE_DESC banner in module init routine.
+ * Removed sliinit structure and ringinit[] array.
+ * Changed log message number from 324 to 326 in lpfc_sli.c.
+ * Wait longer for commands to complete in lpfc_reset_bus_handler
+ and lpfc_reset_bus_handler. Also use schedule_timeout() instead
+ of msleep() and add error message in lpfc_abort_handler()
+ * When setting lpfc_nodev_tmo, from dev_loss set routine, make 1
+ sec minimum value.
+ * Functions which assume lock being held were called without lock
+ and kernel complained about unlocking lock which is not locked.
+ * Added code in linkdown to unreg if we know login session will be
+ terminated.
+ * Removed automap config parameter and fixed up use_adisc logic to
+ include FCP2 devices.
+
+Changes from 20041207 to 20041213
+
+ * Changed version number to 8.0.17
+ * Fix sparse warnings by adding __iomem markers to lpfc_compat.h.
+ * Fix some sparse warnings -- 0 used as NULL pointer.
+ * Make sure there's a space between every if and it's (.
+ * Fix some overly long lines and make sure hard tabs are used for
+ indentation.
+ * Remove all trailing whitespace.
+ * Integrate Christoph Hellwig's patch for 8.0.14: if
+ pci_module_init fails we need to release the transport template.
+ (also don't print the driver name at startup, linux drivers can
+ be loaded without hardware present, and noise in the log for
+ that case is considered unpolite, better print messages only for
+ hardware actually found).
+ * Integrate Christoph Hellwig's patch for 8.0.14: Add missing
+ __iomem annotations, remove broken casts, mark functions static.
+ Only major changes is chaning of some offsets from word-based to
+ byte-based so we cans simply do void pointer arithmetics (gcc
+ extension) instead of casting to uint32_t.
+ * Integrate Christoph Hellwig's patch for 8.0.14: flag is always
+ LPFC_SLI_ABORT_IMED, aka 0 - remove dead code.
+ * Modified preprocessor #ifdef, #if, #ifndef to reflect upstream
+ kernel submission. Clean build with make clean;make and make
+ clean;make ADVANCED=1 on SMP x86, 2.6.10-rc2 on RHEL 4 Beta
+ 1. IO with a few lips and a long cable pull behaved accordingly.
+ * Implement full VPD support.
+ * Abort handler will try to wait for abort completion before
+ returning. Fixes some panics in iocb completion code path.
+
+Changes from 20041130 to 20041207
+
+ * Changed version number to 8.0.16
+ * Hung dt session fix. When the midlayer calls to abort a scsi
+ command, make sure the driver does not complete post-abort
+ handler. Just NULL the iocb_cmpl callback handler and let SLI
+ take over.
+ * Add Read check that uses SLI option to validate all READ data
+ actually received.
+
+
+Changes from 20041123 to 20041130
+
+ * Changed version number to 8.0.15
+ * Ifdef'd unused "binary" attributes by DFC_DEBUG for clean
+ compiles
+ * Stop DID_ERROR from showing up along with QUEUE_FULL set by the
+ Clarion array (SCSI error ret. val. 0x70028) There is no need
+ for driver to hard fail command which was failed by the target
+ device.
+ * Fix for Scsi device scan bug reported on SourceForge. Driver
+ was returning a DID_ERROR in lpfc_handle_fcp_error causing
+ midlayer to mark report luns as failing even though it
+ succeeded.
+ * Don't ignore SCSI status on underrun conditions for inquiries,
+ test unit ready's, etc. This was causing us to lose
+ reservation conflicts, etc
+
+Changes from 20041018 to 20041123
+
+ * Changed version number to 8.0.14
+ * Added new function "iterator" lpfc_sli_next_iocb_slot() which
+ returns pointer to iocb entry at cmdidx if queue is not full.
+ It also updates next_cmdidx, and local_getidx (but not cmdidx)
+ * lpfc_sli_submit_iocb() copies next_cmdidx into cmdidx. Now it is
+ the only place were we are updating cmdidx.
+ * lpfc_sli_update_ring() is split in to two --
+ lpfc_sli_update_ring() and lpfc_sli_update_full_ring().
+ * lpfc_sli_update_ring() don't to read back correct value of
+ cmdidx.
+ * Simplified lpfc_sli_resume_iocb() and its use.
+ * New static function lpfc_sli_next_iocb(phba, pring, &piocb) to
+ iterate through commands in the TX queue and new command (at the
+ end).
+ * Reduced max_lun to 256 (due to issues reported to some arrays).
+ Fixed comment, and macro values so def=256, min=1, max=32768.
+ * Fix an obvious typo/bug: kfree was used to free lpfc_scsi_buf
+ instead of mempool_free in lpfc_scsiport.c.
+ * Suppress nodev_tmo message for FABRIC nodes.
+ * Fixed some usage of plain integer as NULL pointer.
+ * Bug fix for FLOGI cmpl, lpfc_els_chk_latt error path code
+ cleanup.
+ * Fixup lpfc_els_chk_latt() to have Fabric NPorts go thru
+ discovery state machine as well.
+ * Fixes to lpfc_els_chk_latt().
+ * Use DID not SCSI target id as a port_id and add some missing
+ locks in lpfc_fcp.c.
+ * Changed eh_abort_handler to return FAILED if command is not
+ found in driver.
+ * Fix crash: paging request at virtual address 0000000000100108 -
+ a result of removing from the txcmpl list item which was already
+ removed (100100 is a LIST_POISON1 value from the next pointer
+ and 8 is an offset of the "prev") Driver runs out of iotags and
+ does not handle that case well. The root of the proble is in the
+ initialization code in lpfc_sli.c
+ * Changes to work with proposed linux kernel patch to support
+ hotplug.
+ * Zero out seg_cnt in prep_io failure path to prevent double sg
+ unmap calls.
+ * Fix setting of upper 32 bits for Host Group Ring Pointers if in
+ SLIM. Old code was inappropriately masking off low order bits.
+ * Use scsi_[activate|deactivate]_tcq calls provided in scsi_tcq.h.
+ * Integrated patch from Christoph Hellwig (hch@lst.de): don't call
+ pci_dma_sync_* on coherent memory. pci_dma_sync_* is need and
+ must be used only with streaming dma mappings pci_map_*, not
+ coherent mappings. Note: There are more consistent mappings
+ that are using pci_dma_sync calls. Probably these should be
+ removed as well.
+ * Modified lpfc_free_scsi_buf to accomodate all three scsi_buf
+ free types to alleviate miscellaneous panics with cable pull
+ testing.
+ * Set hotplug to default 0 and lpfc_target_remove to not remove
+ devices unless hotplug is enabled.
+ * Fixed discovery bug: plogi cmpl uses ndlp after its freed.
+ * Fixed discovery bug: rnid acc cmpl, can potentially use ndlp
+ after its freed.
+ * Modularize code path in lpfc_target_remove().
+ * Changes to support SCSI hotplug (ifdef'ed out because they need
+ kernel support USE_SCAN_TARGET requires kernel support to export
+ the interface to scsi_scan_target and to move the SCAN_WILD_CARD
+ define to a general scsi header file. USE_RESCAN_HOST requires
+ kernel support to export an interface to scan_scsi_host() with
+ the rescan flag turned on).
+ * Removed redundant variable declaration of lpfc_linkdown_tmo.
+ * Fix for large port count remove test.
+ * Added check to see if BAR1 register is valid before using BAR1
+ register for programming config_port mail box command.
+ * Added lpfc_scsi_hotplug to enable/disable driver support of SCSI
+ hotplug.
+ * Changed lpfc_disc_neverdev() to lpfc_disc_illegal() and changed
+ lpfc_disc_nodev() to lpfc_disc_noop(). Adjusted appropriate
+ events to use these routines.
+ * Add support for SCSI device hotplug.
+ * Take dummy lpfc_target's into account for lpfc_slave_destroy().
+ * Bug fix to store WWPN / WWNN in NameServer / FDMI lpfc_nodelist
+ entries.
+ * Added slavecnt in lpfc_target for diagnostic purposes.
+ * Added lpfc_hba load/unload flags to take care of special cases
+ for add/remove device.
+ * Have target add/remove delay before scanning.
+ * Have rmmod path cleanup blocked devices before scsi_remove_host.
+ * Added a #define for msleep for 2.6.5 kernels.
+ * In reset bus handler if memory allocation fails, return FAILED
+ and not SUCCESS.
+ * Have lpfc eh handlers, bus_reset and lun_reset, wait for all
+ associated I/Os to complete before returning.
+ * Fix memset byte count in lpfc_hba_init so that
+ LP1050 would initialize correctly.
+ * Backround nodev_timeout processing to DPC This enables us to
+ unblock (stop dev_loss_tmo) when appopriate.
+ * Fix array discovery with multiple luns. The max_luns was 0 at
+ the time the host structure was intialized. lpfc_cfg_params
+ then set the max_luns to the correct value afterwards.
+ * Remove unused define LPFC_MAX_LUN and set the default value of
+ lpfc_max_lun parameter to 512.
+ * Reduced stack usage of lpfc_hba_init.
+ * Cleaned up the following warning generated by
+ scripts/checkincludes.pl lpfc_fcp.c: scsi/scsi_cmnd.h is
+ included more than once.
+ * Replaced "set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(timeout)" with "msleep(timeout)".
+ * Fixnode was loosing starget when rediscovered. We saw messages
+ like: lpfc 0000:04:02.0: 0:0263 Cannot block scsi target as a
+ result. Moved starget field into struct lpfc_target which is
+ referenced from the node.
+ * Add additional SLI layer logging in lpfc_sli.c.
+ * Ignore more unexpected completions in lpfc_nportdisc.c.
+ * Can not call lpfc_target_unblock from the soft interrupt
+ context. It seems to be not nessasery to unblock target from
+ nodev timeout.
+ * Introduce and use less lethal event handler for unexpected
+ events in lpfc_nportdisc.c.
+ * Can not call fc_target_(un)block() functions with interrupts
+ disabled in lpfc_scsiport.c.
+ * Added new configuration parameter, lpfc_max_luns range 1-32768,
+ default 32768.
+ * Allow lpfc_fcp.c to call lpfc_get_hba_sym_node_name().
+ * Increase nodev timeout from 20 seconds to 30 seconds.
+ * Replace some kfree((void*)ptr) with kfree(ptr).
+ * Make 3 functions static: lpfc_get_hba_sym_node_name,
+ lpfc_intr_prep and lpfc_setup_slim_access. Move lpfc_intr_prep
+ and lpfc_setup_slim_access so they're defined before being used.
+ * Remove an unecessary list_del() in lpfc_hbadisc.c.
+ * Set nlp_state before calling lpfc_nlp_list() since this will
+ potentially call fc_target_unblock which may cause a race in
+ queuecommand by releasing host_lock.
+ * Since lpfc_nodev_tmo < dev_loss_tmo remove queuecommand
+ DID_BAD_TARGET return for now.
+ * Fix a problem with rcv logo.
+ * Remove unused portstatistics_t structure.
+ * Remove #if 0 and unnecessary checks in lpfc_fcp.c.
+ * Simplify lpfc_issue_lip: Extra layer of protection removed.
+ * Grab lock before calling lpfc_sli_issue_mbox(phba, pmb,
+ MBX_NOWAIT) in lpfc_sli_issue_mbox_wait().
+
+Changes from 20040920 to 20041018
+
+ * Changed version number to 8.0.13
+ * Hide some attributes using #ifndef DFC_DEBUG ... #endif.
+ * Modify Makefile to (1) make BUILD_NO_DEBUG=1 will hide some
+ (binary) attributes (2) make BUILD_FC_TRANS=0 will build driver
+ for 2.6.5 kernel with block/unblock patch.
+ * Modified #ifdef names.
+ * Added support for proposed FC transport host attributes (which
+ replaces some of the attributes we had local to the driver).
+ Removed the binary statistics sysfs attribute.
+ * Added extra ELS verbose logging for ELS responses.
+ * Added recognition for BUILD_FC_TRANS=2 to Makefile to define
+ FC_TRANS_VER2.
+ * Add a pointer for link stats allocation.
+ * Exported lpfc_get_hba_sym_node_name for use by FC_TRANS_VER2
+ sysfs routines.
+ * Fix discovery problem in lip testing: if device sends an ELS cmd
+ (i.e. LOGO) before our FLOGI completes it should be LS_RJT'ed.
+ * Moved #defines around to provide target_add/remove for upstream
+ kernel deliverables only not SLES9. Provided ifdefs to #include
+ target_block/unblock only if FC_TRANS_VER1.
+ * Add sanity check in lpfc_nlp_list move setting nlp_Target
+ outside #ifdef.
+ * Added a blocked member to the lpfc_target structure for
+ block/unblock. This member allows the driver to know when to
+ unblock for pci_remove_one or pci_add_one. #ifdef'd some more
+ block/unblock stuff and removed some defensive checks from
+ target_block/unblock.
+ * Moved + 5 second window to dev_loss_tmo setting and updated
+ comments.
+ * Removed NULL target check from target_block/unblock and fixed up
+ a few comments.
+ * Enable sysfs attributes on 2.6.5 kernels and remove extra
+ compatibility code.
+ * Remove any and all trailing whitespace.
+ * Added message 0718 and return error when dma_map_single fails.
+ * Changed the fcpCntl2 commands to include an FCP_ prefix to get
+ rid of build warnings on later 2.6.9-rc kernels. Build
+ conflicts with scsi/scsi.h. Remove inclusions of scsi/scsi.h
+ from hbadisc.c, sli.c, and fcp.c since these modules had no
+ dependencies on scsi.h.
+ * Fixed a bug with RSCN handling. A RSCN received on one device,
+ shouldn't affect other devices not referenced by the RSCN.
+ * Moved #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,6) to include
+ lpfc_jedec_to_ascii to prevent warning in SLES 9.
+ * Update Makefile to account for SLES 9 and scsi-target upstream
+ kernel.
+ * This checkin provides block/unblock hooks for the upstream scsi
+ target kernel and 2.6.5 on SLES9 SP1 with the block/unblock
+ patch.
+ * Discovery changes regarding setting targetp->pnode and
+ ndlp->nlp_Target Ensure fc_target_* routines are called properly
+ from discovery. Remove list_del's from lpfc_cleanup(). Ensure
+ all the lpfc_consistent_bind_* routines don't set any driver
+ structure objects.
+ * Fix for timeout of READ_LA or READ_SPARAM mailbox command
+ causing panic.
+ * Cleanup list_del()'s for Discovery ndlp lists.
+ * Bug fixes for some insmod/rmmod crashes, link down crashes and
+ device loss crashes.
+ * Removed NLP_SEARCH_DEQUE.
+ * Call lpfc_target_unblock only if the targetp is nonNull and with
+ the host_lock held.
+ * Added qcmdcnt back along with misc bug fixes to discovery.
+ * Changed tgt_io to outfcpio lpfc_fcp.c.
+ * Fixed errors caused by LIP and cable pulls both with and without
+ block/unblock patch.
+ * For now we have to call fc_target_unblock and fc_target_block
+ with interrupts enabled.
+ * Save seg_cnt from dma_map_sg. Save scatter-gather start address
+ and pass back to dma_unmap_sg in error with seg_cnt.
+ * Incorporating block/unblock calls into driver with ifdefs. This
+ change is supported by scsi-target-2.6 kernel and forward only.
+ * Merged in some discovery bug fixes and added tgt io counters.
+ * Added sysfs attributes/interfaces: read only attribute
+ "management_version" and write only attribute "issue_lip".
+ * Fix build on big endian machines: while #if was OK with
+ __BIG_ENDIAN which defined as 4321, __BIG_ENDIAN_BITFIELD has to
+ be tested with #ifdef because it does not have any value, it is
+ either defined or not.
+ * Add fabric_name and port_type attributes.
+ * Change mdelay to msleep. mdelay works, but wastefully uses cpu
+ resources without a lock held. Revert to msleep. Tested with
+ sg_reset for bus and three attached targets.
+ * Added the customary #ifndef...#define...#endif to
+ lpfc_version.h.
+ * Integrate patches from Christoph Hellwig: two new helpers common
+ to lpfc_sli_resume_iocb and lpfc_sli_issue_iocb - singificant
+ cleanup of those two functions - the unused SLI_IOCB_USE_TXQ is
+ gone - lpfc_sli_issue_iocb_wait loses it's flags argument
+ totally.
+ * Fix in lpfc_sli.c: we can not store a 5 bit value in a 4-bit
+ field.
+ * Moved some routines out of lpfc_fcp.c into more appropriate
+ files.
+ * Whitespace cleanup: remove all trailing whitespace.
+ * Make lpfc_disc_ndlp_show static to lpfc_fcp.c.
+ * Remove leftover printk and replace some with
+ printk(KERN_WARNING)
+ * Trivial: fix a few long lines and a soft tab.
+ * Remove warnings generated by Sparse against driver (make
+ C=1). Mostly these are "using integer as pointer warnings"
+ i.e. use NULL instead of 0.
+ * Integrated patch from Christoph Hellwig: Quite a lot of changes
+ here, the most notable is that the phba->slim2p lpfc_dmabuf goes
+ away in favour of a typede pointer and a dma_addr_t. Due to the
+ typed pointer lots of the cast mess can go away, and while at it
+ I also replaced the messy SLI2_SLIM_t with a simple struct
+ lpfc2_sli2_slim that only contains the part of the union we care
+ about while using SLI2_SLIM_SIZE for all size calculations
+ directly.
+ * Integrated patch from Christoph Hellwig: This streamlines the
+ I/O completion path a little more, especially taking care of
+ fast-pathing the non-error case. Also removes tons of dead
+ members and defines from lpfc_scsi.h - e.g. lpfc_target is down
+ to nothing more then the lpfc_nodelist pointer.
+ * Added binary sysfs file to issue mbox commands
+ * Replaced #if __BIG_ENDIAN with #if __BIG_ENDIAN_BITFIELD for
+ compatibility with the user space applications.
+ * Decrease the amount of data in proc_info.
+ * Condense nodelist flag members.
+ * Expand INFO for discovery sysfs shost entries.
+ * Notify user if information exceeds 4k sysfs limit.
+ * Removed a bunch of unused #defines.
+ * Added initial sysfs discovery shost attributes.
+ * Remove unused #defines lpfc_disc.h.
+ * Fixed failMask nodelist settings.
+ * Cleanup some old comments / unused variables.
+ * Add LP101 to list of recognized adapters.
+
+Changes from 20040908 to 20040920
+
+ * Changed version number to 8.0.12
+ * Removed used #defines: DEFAULT_PCI_LATENCY_CLOCKS and
+ PCI_LATENCY_VALUE from lpfc_hw.h.
+ * Changes to accomodate rnid.
+ * Fix RSCN handling so RSCN NS queries only effect NPorts found in
+ RSCN data.
+ * If we rcv a plogi on a NPort queued up for discovery, clear the
+ NLP_NPR_2B_DISC bit since rcv plogi logic will force NPort thru
+ discovery.
+ * Ensure lpfc_target is also cleaned up in lpfc_cleanup().
+ * Preliminary changes for block/unblock kernel API extensions in
+ progress with linux-scsi list. These are name changes and
+ prototype changes only.
+ * Added send_abts flag to lpfc_els_abort. For rcv LOGO when ADISC
+ sent, the XRI of the LOGO rcv'ed is the same as the ADISC
+ sent. Thus we cannot ABTS the ADISC before sending the LOGO ACC.
+ * Weed out some unused fc_flags. Add FC_DISC_TMO.
+ * board_online sysfs attribute added to support libdfc functions
+ InitDiagEnv and SetBrdEnv.
+ * Streamline code in lpfc_els_retry fixup abort case in
+ lpfc_els_timeout_handler().
+ * Flush discovery/ELS events when we bring SLI layer down.
+ * ctlreg and slimem binary attributes added to support libdfc
+ read/write mem/ctl functions.
+ * Integrated Christoph Hellwig's patch: Cleanup
+ lpfc_sli_ringpostbuf_get.
+ * Modified lpfc_slave_alloc and lpfc_slave_destroy to allocate and
+ free a dummy target pointer. This allows queuecommand to skip
+ the NULL target pointer check and avoid the console spam when
+ slave_alloc fails.
+ * Fix cfg_scan_down logic, it was reversed.
+ * Init list head ctrspbuflist.
+ * Change name of lpfc_driver_abort to lpfc_els_abort since it is
+ only valid for ELS ring.
+ * Remove unused third argument for lpfc_consistent_bind_get().
+ * Fix up iotag fields in lpfc_prep_els_iocb().
+ * Remove log message on code path triggered by lpfc_els_abort().
+ * Set host->unique_id in lpfc_fcp.c.
+ * Removed deadwood: lpfc_target.pHba not necessary anymore.
+ * Integrated patch from Christoph Hellwig: remove dead
+ SLI_IOCB_POLL handling.
+ * Integrated patch from Christoph Hellwig: Streamline I/O
+ submission and completion path a little.
+ * Remove unnecessary lpfc_brd_no. Ensure brd_no assignment is
+ unique.
+ * Removed unused MAX_FCP_LUN.
+ * Use mod_timer instead of add_timer for fdmi in lpfc_ct.c.
+ * Fixed misc discovery problems.
+ * Move stopping timers till just before lpfc_mem_free() call.
+ * Fix up NameServer reglogin error path.
+ * Cleanup possible outstanding discovery timers on rmmod.
+ * Fix discovery NPort to NPort pt2pt problem.
+ * Get rid of ip_tmofunc / scsi_tmofunc.
+ * Integrated patch from Christoph Hellwig:
+ lpfc_disc_done/lpfc_do_dpc cleanup - lpfc_disc_done can return
+ void - move lpfc_do_dpc and lpfc_disc_done to lpfc_hbadisc.c -
+ remove checking of list emptiness before calling lpfc_disc_done,
+ it handles the emtpy list case just fine and the additional
+ instructions cost less then the bustlocked spinlock operations.
+ * Integrated patch from Christoph Hellwig: This adds a new 64bit
+ counter instead, brd_no isn't reused anymore. Also some tiny
+ whitespace cleanups in surrounding code.
+ * Reorder functions in lpfc_els.c to remove need for prototypes.
+ * Removed unsed prototypes from lpfc_crtn.h -
+ lpfc_ip_timeout_handler, lpfc_read_pci and lpfc_revoke.
+ * Removed some unused prototypes from lpfc_crtn.h -
+ lpfc_scsi_hba_reset, lpfc_scsi_issue_inqsn,
+ lpfc_scsi_issue_inqp0, lpfc_scsi_timeout_handler.
+ * Integrated patch from Christoph Hellwig: remove TRUE/FALSE
+ usage.
+ * Integrated patch from Christoph Hellwig: Remove unused function
+ prototypes lpfc_set_pkt_len and lpfc_get_pkt_data from
+ lpfc_crtn.h - fixes build warnings.
+ * Removed unused struct lpfc_dmabufip definition from lpfc_mem.h.
+ * Removed pre-2.6.5 MODULE_VERSION macro from lpfc_compat.h.
+ * Fixing missing static and removing dead code.
+ * Adding nodewwn, portwwn and portfcid shost attributes.
+ * Initial support for CT via sysfs. request payloads of size less
+ than PAGE_SIZE and rsp payloads of size PAGE_SIZE are supported.
+ Driver maintains a list of rsp's and passes back rsp's
+ corresponding to the pid of the calling process.
+ * Support for RefreshInformation, GetAdapterAttributes,
+ GetPortStatistics.
+ * Make nodev-tmo default to 20 seconds.
+ * Fix up some DSM error cases, unreg_login rpi where needed.
+ * Fix up comments for fc_target_block / fc_target_unblock.
+ * Fix up code for scsi_block_requests / scsi_unblock_requests.
+ * Add NLP_FCP_TARGET for nodeinfo support.
+ * Move suspend/resume in lpfc_nlp_list under appropriate case -
+ Used host_lock for DPC to avoid race (remove dpc_lock)
+ * Fix some corner cases for PLOGI receive - simplify error case
+ for cmpl_reglogin_reglogin_issue.
+ * Bug fix for ppc64 EEH MMIO panic - always do readl after
+ writel's of HBA registers to force flush.
+ * Get rid of initial static routine declarations in lpfc_hbadisc.c
+ and lpfc_els.c.
+ * Updates to discovery processing.
+
+Changes from 20040823 to 20040908
+
+ * Changed version number to 8.0.11
+ * Removed persistent binding code.
+ * Display both ASC and ASCQ info.
+ * Fixed link down->up transitions when linkdown tmo expires. Fix
+ was in the defensive error checking at the start of
+ queuecommand.
+ * Removed lpfc_scsi_timeout_handler as this timer is no longer
+ required. The midlayer will exhaust retries and then call
+ lpfc_abort_handler, lpfc_reset_lun_handler, and
+ lpfc_reset_target_handler.
+ * Minimal support for SCSI flat space addressing/volume set
+ addressing. Use 16 bits of LUN address so that flat
+ addressing/VSA will work.
+ * Changed 2 occurences of if( 1 != f(x)) to if(f(x) != 1)
+ * Drop include of lpfc_cfgparm.h.
+ * Reduce stack usage of lpfc_fdmi_cmd in lpfc_ct.c.
+ * Add minimum range checking property to /sys write/store
+ functions.
+ * Fix display of node_name and port_name via fc transport
+ attr.
+ * Removed biosparam code.
+ * Removed range checking. phba->config[] array elements are now
+ embedded into the hba struct. lpfc_config_setup() has been
+ removed.
+ * Collapsed lpfc_scsi_cmd_start into lpfc_queuecommand and cleaned
+ up combined routines.
+ * Removed unused prototypes myprint and
+ lpfc_sched_service_high_priority_queue.
+ * Removed unused function lpfc_nodev.
+ * Removed scsi_cmnd->timeout_per_command cancelation. SCSI midlayer
+ now times out all commands - FW is instructed to not timeout.
+ * Removed polling code from lpfc_scsi_cmd_start. Reorganized
+ queuecommand and cmd_start some.
+
+Changes from 20040810 to 20040823
+
+ * Changed version number to 8.0.10
+ * Additional timer changes as per Arjan / Christoph's comments.
+ * Used mod_timer() instead of del_timer_sync() where appropriate.
+ * Fixed a use after free case (panic on 2.6.8.1 with
+ CONFIG_DEBUG_SLAB set).
+ * Fix compile warning in lpfc_fcp.c.
+ * Minor fix for log message, that prints unassigned brdno which is
+ zero.
+ * Move scsi_host_alloc() to the beginning of probe_one(). This
+ ensures that host_lock is available at later stages and also
+ avoids tons of unnecessary initializing if host_alloc()
+ fails.
+ * Removed else clause from lpfc_slave_configure that set
+ sdev->queue_depth. The driver informs the midlayer of its
+ setting in the template and only overrides if queue tagging is
+ enabled.
+ * Added PCI_DEVICE_ID_ZEPHYR and PCI_DEVICE_ID_ZFLY (Junior
+ Zephyr) support
+
+Changes from 20040730 to 20040810
+
+ * Changed version number to 8.0.9
+ * Removed per HBA driver lock. Driver now uses the host->host_lock
+ * Restored support for the 2.6.5 kernel for those linux distributions
+ shipped with the 2.6.5 kernel.
+ * Applied patch from Christoph Hellwig (hch@infradead.org) as follows
+ "[PATCH] use scsi host private data in ->proc_info.
+ * Applied patch from Christoph Hellwig (hch@infradead.org) as follows
+ "Re: [Emulex] Ready for next round. This patch cleans up the memory
+ allocation routines a little and fixes a missing mempool_destroy and
+ some missing error handling."
+ * Changed pointers assignments from 0 to NULL.
+ * Added fixes to the lpfc_reset_lun_handler and lpfc_reset_bus_handler
+ entry points that caused kernel to Oops or hang.
+ * Added fixes to targetless hosts that caused modprobe and insmod to hang.
+ * Ongoing cleanup to many files
+
+Changes from 20040723 to 20040730
+
+ * Changed version number to 8.0.8
+ * Removed unused LPFN_DRIVER_VERSION #define.
+ * Folded lpfc_findnode_scsiid into lpfc_find_target, its only
+ caller.
+ * Removed 2 unneeded arguments to lpfc_find_target (lun and
+ create_flag).
+ * Make lpfc_sli_reset_on_init = 1
+ * Minor cleanup to quieten sparse.
+ * Removed missing function = 0 in tmo routine in lpfc_els.c.
+ * Moved additional binding parameters into lpfc_defaults.c:
+ lpfc_automap / lpfc_fcp_bind_method
+ * Use msecs_to_jiffies() where applicable.
+ * Only use queue depth attribute only after SLI HBA setup was
+ completed.
+ * Put in memory barriers for PPC
+ * Added PCI_DEVICE_ID_HELIOS and PCI_DEVICE_ID_JFLY (Junior
+ Helios) support
+ * Added 4&10 gigabit choices in user option link_speed
+ * Updated timer logic: Set timer data after init_timer use
+ timer_pending() instead of expires.
+ * Removed some remnants of IP over FC support from Kconfig and
+ Makefile.
+ * Remove redundant prototypes for lpfc_handle_eratt,
+ lpfc_handle_latt and lpfc_read_pci.
+ * Ongoing cleanup of lpfc_init.c.
+ * Changed LPFC_CFG_DFT_HBA_Q_DEPTH -> LPFC_CFG_HBA_Q_DEPTH.
+ * Another cleanup stab at lpfc_ct.c. Remove castings, structure
+ code sanely, remove redundant code, reorganize code so that
+ functions are invoked after definition.
+
+Changes from 20040716 to 20040723
+
+ * Changed version number to 8.0.7
+ * Cleanup of lpfc_ct.c. Removed number of casts, removed tons of
+ dead/redundant code, cleaned up badly and poorly written code,
+ cleaned up return values.
+ * Fixed Persistent binding implementation
+ * Removed all references to lpfc_scsi_req_tmo
+ * Removed last references to lun_skip config parameter.
+ * Removed LPFC_DEV_RPTLUN node failure bit because we don't issue
+ REPORT_LUNS from the driver anymore.
+ * Removed LUN-tracking in driver. Removed lpfc_lun struct and
+ moved any functionality we still need to lpfc_target.
+ * Added new lpfc_jedec_to_ascii() call and replace two instances
+ of duplicate code with calls to this function.
+ * Removed Volume Set Addressing handling on LUN IDs.
+ * Applied patch from Christoph Hellwig (hch@infradead.org) that
+ removes dead code belonging to lpfc_build_scsi_cmnd() and its
+ call path. This is related to the recently removed report_lun
+ code.
+
+Changes from 20040709 to 20040716
+
+ * Changed version number to 8.0.6
+ * Removed internal report LUNs usage. Removed functions:
+ lpfc_disc_issue_rptlun, lpfc_disc_cmpl_rptlun,
+ lpfc_disc_retry_rptlun and their use.
+ * Removed usused scheduler prototypes in lpfc_crtn.h
+ * Replace lpfc_geportname() with generic memcmp().
+ * Rearrange code in lpfc_rcv_plogi_plogi_issue() to make it a
+ little more readable.
+ * Remove redundant port_cmp != 2 check in if
+ (!port_cmp) { .... if (port_cmp != 2).... }
+ * Clock changes: removed struct clk_data and timerList.
+ * Clock changes: seperate nodev_tmo and els_retry_delay into 2
+ seperate timers and convert to 1 argument changed
+ LPFC_NODE_FARP_PEND_t to struct lpfc_node_farp_pend convert
+ ipfarp_tmo to 1 argument convert target struct tmofunc and
+ rtplunfunc to 1 argument * cr_count, cr_delay and
+ discovery_threads are only needed to be module_params and not
+ visible via sysfs.
+
+Changes from 20040614 to 20040709
+
+ * Changed version number to 8.0.5
+ * Make lpfc_info static.
+ * Make lpfc_get_scsi_buf static.
+ * Print a warning if pci_set_mwi returns an error.
+ * Changed SERV_PARM to struct serv_parm.
+ * Changed LS_RJT to struct ls_rjt.
+ * Changed CSP to struct csp.
+ * Changed CLASS_PARMS to struct class_parms.
+ * Some cosmetic coding style cleanups to lpfc_fcp.c.
+ * Providing a sysfs interface that dumps the last 32
+ LINK_[UP|DOWN] and RSCN events.
+ * Get rid of delay_iodone timer.
+ * Remove qfull timers and qfull logic.
+ * Convert mbox_tmo, nlp_xri_tmo to 1 argment clock handler
+ * Removed duplicate extern defs of the bind variables.
+ * Streamline usage of the defines CLASS2 and CLASS3, removing
+ un-necessary checks on config[LPFC_CFG_FCP_CLASS].
+ * Moving the persistent binding variables to new file
+ lpfc_defaults.c
+ * Changed LPFC_SCSI_BUF_t to struct lpfc_scsi_buf.
+ * Moved config specific code from probe_one() into
+ config_setup(). Removing a redundant check on scandown value
+ from bind_setup() as this is already done in config_setup().
+ * Changed LPFC_SLI_t to struct lpfc_sli.
+ * Changed FCP_CMND to struct fcp_cmnd.
+ * Changed FCP_RSP to struct fcp_rsp.
+ * Remove the need for buf_tmo.
+ * Changed ULP_BDE64 to struct ulp_bde64.
+ * Changed ULP_BDE to struct ulp_bde.
+ * Cleanup lpfc_os_return_scsi_cmd() and it's call path.
+ * Removed lpfc_no_device_delay.
+ * Consolidating lpfc_hba_put_event() into lpfc_put_event().
+ * Removed following attributes and their functionality:
+ lpfc_extra_io_tmo, lpfc_nodev_holdio, lpfc_delay_rsp_err,
+ lpfc_tgt_queue_depth and lpfc_check_cond_err.
+ * Clock changes consolidating timers, just in the struct lpfc_hba,
+ to get rid of clkData and pass only one argument to timeout
+ routine. Also, removing need for outstanding clock linked list
+ to stop these timers at rmmod.
+ * Move lpfc.conf contents into lpfc_fcp.c. Removing per adapter
+ attributes in favor of global attributes.
+ * Fix a potential null pointer reference of pmbuf in lpfc_ct.c.
+ * On reset_lun, issue LUN_RESET as opposed to ABORT_TASK_SET.
+ * Removed SCSI_REQ_TMO related code.
+ * Introducing two new defines LPFC_ATTR_R and LPFC_ATTR_RW that do
+ a module_param, MODULE_PARM_DESC, lpfc_param_show,
+ [lpfc_param_store] and CLASS_DEVICE_ATTRIBUTE.
+ * Properly clean up when allocation of a linked BDE fails in the
+ SCSI queuecommand path.
+ * Fail SCSI command if dma_map_sg call fails.
+ * Remove unused macros SWAP_ALWAYS and SWAP_ALWAYS16.
+ * Reset context2 to 0 on exit in
+ lpfc_sli_issue_iocb_wait_high_priority() and
+ lpfc_sli_issue_iocb_wait().
+ * Arranging lpfc_scsiport.c to follow style of use after
+ definition. This removes the need for the cruft of forward
+ declarations. Also removing a redundant #define ScsiResult as it
+ already available elsewhere.
+ * Applying "Streamline lpfc error handling" patch from Christoph
+ Hellwig (hch@infradead.org) with following modifications: fix
+ mem leaks, remove some misplaced code that need not be there,
+ print a message on exit (old code prints two (entry/exit)), make
+ ret values consistent (either 1/0 or SUCCESS/FAILURE), keep all
+ eh routines in a single file (lpfc_scsiport.c).
+ * Move contents of lpfc_module_param.h into lpfc_fcp.c.
+ * Changed sysfs attributes to CLASS_DEVICE_ATTRIBUTES (previously
+ DEVICE_ATTRIBUTES). They now appear in
+ /sys/class/scsi_host/hostx (previously in
+ /sys/bus/pci/drivers/lpfc/devx).
+ * Removed lpfc_syfs.h and lpfc_sysfs.c.
+ * Cleanup of config params. Throttle params have been removed.
+ max_lun has been removed. max_target is replaced with a #define,
+ lun_skip is removed. Remove ipfc config params and related
+ code.
+ * Changed DMABUF_t usage to struct lpfc_dmabuf.
+ * Downsizing iCfgParam structure to include a_string, a_low, a_hi
+ and a_default values only.
+ * Free SCSI buf safety memory pool on shutdown to eliminate memory
+ leak.
+ * Change lpfc_printf_log to a #define. Also include phba->brd_no
+ and newline in the print string rather than in the #define.
+ * Remove code that optionally locates Host Group Pointers in host
+ memory SLIM since this is no longer needed for PPC64, once
+ CONFIG_PORT uses HBA's view of its BAR0.
+ * Removed the forward declarations of the sli functions and
+ rearranging the code in lpfc_sli.c.
+ * Removed the preamble functionality from logging.
+ * Make lpfc_sli_hba_setup() return negative error codes on error
+ and correct the comment left over in lpfc_fcp.c
+ * Removed the lpfc_loadtime variable.
+ * Put a space between all ifs and their open parens '('.
+ * Change Studly_Caps LPFC_SCSI_BUF_t to struct lpfc_scsi_buf.
+ * Fixed insmod hang after hardware error.
+ * Relocated scsi_host alloc to before we enable the interrupt
+ handler
+ * Add .tmp_versions directory to Makefile clean target. This
+ directory is created in the 2.6.5+ build process (with Red Hat
+ kernels at least).
+ * Changing phba->config to kmalloc lpfc_icfgparam and not
+ *phba->config. This is manifesting itself as a panic in
+ pci_release_region().
+ * Fix for firmware download / board reset problem.
+ * Integrated patch from Christoph Hellwig (hch@infradead.org) to
+ reorganize and cleanup lpfc_fcp.c
+ * Don't abort commands immediately when there is an RSCN event to
+ give driver time to rediscover targets before the midlayer
+ retries the SCSI commands.
+
+Changes from 20040604 to 20040614
+
+ * Changed version number to 8.0.4
+ * Removed lpfc_valid_lun function.
+ * Added scsi_buf safety pool to address scsi_buf failures in
+ queuecommand under low memory conditions. Allocations now come
+ from kmalloc initially, but if kmalloc fails, the allocation
+ comes from the safety pool.
+ * Modified lpfc_slave_alloc to only set the scsi_device->hostdata
+ pointer if the driver has discovered the target. This routine
+ always returns success now as well since no error ever occurs in
+ the alloc routine.
+ * Mask only info and warning messages. Print all error messages
+ irrespective of mask.
+ * Removing lpfc_log_chk_msg_disabled()
+ * Changed lpfc_printf_log to take struct lpfc_hba * directly
+ instead of a "board number".
+ * Convert dma_sync_single to pci_dma_sync_single_for_{device/cpu}.
+ * Implemented new style log messages. The message strings are now
+ embedded in the call to lpfc_printf_log.
+ * Decreased FLOGI discovery timeout to 20 seconds.
+ * On error in lpfc_pci_probe_one() return -1 and not 1.
+ * Allow for board numbers that are not sequential, paving the way
+ for hotplug support.
+ * scsi_add_host() can fail, so wrap it around in an if(). Also
+ initiate scsi_scan_host() after attaching the sysfs attributes.
+ * lpfc_release_version is used only in lpfc_ct.c, so move it there
+ and mark it as static.
+ * Removed lpfc_sleep_ms and replaced with mdelay or schedule calls
+ directly
+ * Removed all (struct list_head *) casts from clkData-related list
+ handling in list_add, list_del macros.
+ * Removed EXPORT_SYMBOLs.
+ * Removed LPFC_MIN_QFULL and lpfc_qthrottle_up.
+ * Replace LPFCSCSITARGET_t with struct lpfc_target.
+ * Replace LPFCSCSILUN_t with struct lpfc_lun.
+ * Remove unused struct declarations (fcPathId and fcRouteId) from
+ lpfc_scsi.h.
+ * Rewrite use of FC transport attributes.
+ * Fix crash when link is lost. This was due to lpfc_delay_iodone
+ calling list_del on an object that was never put on a list.
+ * Remove trailing spaces at the end of all lines.
+ * Set MAX_FCP_TARGET to 256 from 0xff. Set MAX_FCP_LUN and
+ MAX_FCP_CMDS to their decimal equivalents and updated
+ documentation.
+
+Changes from 20040526 to 20040604
+
+ * Changed version number to 8.0.3
+ * Completed sysfs FC transport support.
+ * Removed unused fields in SCSI LUN and SCSI Target structures:
+ void *pTargetProto; void *pTargetOSEnv; void *pLunOSEnv;
+ * Modified list_for_each to list_for_each_entry. Modified
+ list_for_each_safe to list_for_each_entry_safe.
+ * Remove lpfc_dfc.h file.
+ * Changed pHba->phba, pCommand->pcmd
+ * Changed plogi_ndlp -> plogindlp, pos_tmp->postmp, pRsp->prsp,
+ pCmd->pcmd
+ * Changed pText -> ptext
+ * Changed p_tmp_buff -> ptmpbuff
+ * Changed pBufList -> pbuflist, pRsp -> prsp, pCmd -> pcmd
+ * Changed *pos_tmp -> *postmp, *p_mbuf -> *pmbuf
+ * Following changes are made to the SCSI fast path: Added
+ DMA_BUF_t member to the lpfc_scsi_buf_t. This will reduce a
+ memory allocation in the scsi fast path. Added check for
+ targetp == NULL in the scsi fast path. Increased number of
+ scatter gather entries in lpfc_scsi_dma_ext to 4 from 3 and
+ changed the size of lpfc_scsi_dma_ext to 264
+ * Fixing some missing static lpfc_nportdisc.c.
+ * Reordered #include lines so that lpfc.h doesn't have to #include
+ other header files.
+ * Remove lpfc_get_hba_sym_node_name() as a global EXPORT and make
+ it static.
+ * Move struct clk_data definition from lpfc_hw.h to lpfc_sli.h.
+ * Changed LPFC_IOCBQ_t to struct lpfc_iocbq.
+ * Changed LPFC_SLI_RING_t to struct lpfc_sli_ring.
+ * Changed LPFC_NODELIST_t to struct lpfc_nodelist.
+ * Rearranged lpfc_nportdisc.c by moving state machine array
+ (lpfc_disc_action) and the one function that uses it,
+ lpfc_disc_state_machine, to the end of the file, removing the
+ need for the raft of prototypes at the top.
+ * Changed LPFC_BINDLIST_t to struct lpfc_bindlist.
+ * Removed lpfc_issue_ct_rsp(), lpfc_sleep(), lpfc_add_bind(),
+ lpfc_del_bind(), lpfc_sli_wake_mbox_wait() and
+ lpfc_sli_issue_mbox_wait().
+ * Fixed a large number of overly-long lines.
+ * Fixed some discovery problems: Introduced deferred ndlp removal
+ when in DSM to avoid panic when in nested DMSs Fix NportId
+ fffc01 handling to not relogin after LOGO fixed handling of LOGO
+ on PLOGI issue.
+ * Changed SLI_CT_REQUEST to lpfc_sli_ct_request.
+ * Changed NAME_TYPE to struct lpfc_name.
+ * Changed lpfcCfgParam_t to struct lpfc_cfgparam.
+ * Changed LPFC_STAT_t to struct lpfc_stats.
+ * Changed HBAEVT_t to struct lpfc_hba_event.
+ * Changed Studly_Caps lpfcHBA_t to struct lpfc_hba.
+ * Removed no longer used tasklet_running flag.
+ * Removing *PSOME_VAR typedefs and using SOME_VAR* directly.
+ * Changing .use_clustering to ENABLE_CLUSTERING.
+ * Modify lpfc_queuecommand to return SCSI_MLQUEUE_HOST_BUSY when
+ it can't queue a SCSI command. Also, remove cmnds_in_flight
+ member of struct lpfcHBA for 2.6 kernels as it was only needed
+ to determine what to return from queuecommand.
+ * Change return type of lpfc_evt_iocb_free to void as it doesn't
+ return anything.
+ * Remove unused cmnd_retry_list and in_retry members in struct
+ lpfcHBA.
+ * Remove some instances of unneeded casting of kmalloc's return in
+ lpfc_scsiport.c
+ * Remove lpfc_linux_attach() and lpfc_linux_detach(). Integrate
+ them into lpfc_probe_one() and lpfc_release_one() respectively.
+ * Remove lpfc_num_iocbs, lpfc_num_bufs module parameters
+ * Remove #defines for NUM_NODES, NUM_BUFS and NUM_IOCBS
+
+Changes from 20040515 to 20040526
+
+ * Changing version number to 8.0.2.
+ * Including dma-mapping.h as one of the include headers. Also
+ rearrange the #include order.
+ * Make functions static as appropriate.
+ * queuecommand() will now return SCSI_MLQUEUE_HOST_BUSY instead of
+ 1 to backpressure midlayer.
+ * Removed function prototypes for lpfc_start_timer() and
+ lpfc_stop_timer()
+ * Changed timer support to be inline. Clk_data is now declared
+ right next to the corresponding timer_list entry so we don't
+ have to allocate these clk_data dynamically.
+ * Add readls after writels to PCI space to flush the writes.
+ * Fix misspelled word "safety" in function names.
+ * Fix up comments in lpfc.conf for per HBA parameters to reflect
+ new implementation.
+ * Change lpfc_proc_info handler to get the Nodename from
+ fc_nodename and not fc_portname.
+ * Fix up some comments and whitespace in lpfc_fcp.c.
+ * Formatting changes: get rid of leading spaces in code
+ * Move discovery processing from tasklet to a kernel thread.
+ * Move ndlp node from unmap list to map list if ADISC completed
+ successfully.
+ * Flush all the ELS IOCBs when there is a link event.
+ * LP9802 qdepth is twice the LP9802DC qdepth. Delay
+ elx_sched_init after READ_CONFIG to get max_xri from the
+ firmware. Reset ELX_CFG_DFT_HBA_Q_DEPTH to max_xri after
+ READ_CONFIG
+ * Fix fc_get_cfg_parm() to be more robust and support embedded hex
+ values. The lpfc_param's are now defined as:
+ lpfc_log_verbose="lpfc:0,lpfc0:0x10,lpfc1:4,lpfc100:0xffff" The
+ "," delimter does not matter. It can be anything or not exist at
+ all. ie param = "lpfc:0lpfc0:0x10.lpfc1:4txtlpfc100:0xffff" will
+ also work. Additionally the string is treated as case
+ insensitive.
+ * Changed all usage of lpfc_find_lun_device() to lpfc_find_lun().
+ * Removed unnecessary wrappers lpfc_find_lun_device() and
+ lpfc_tran_find_lun().
+ * Switch from using internal bus/id/lun to similar data from
+ scsi_device structure.
+ * Eliminate one-line function lpfc_find_target()
+ * Added slave_alloc, slave_destory
+ * lpfc_scsi_cmd_start can now acquire lun pointer from
+ scsi_device->hostdata, which is setup in slave_alloc.
+ * Eliminate unnecessary checking on every cmd just to see if we
+ are accessing the device the first time.
+ * Remove assumption in lpfc_reset_lun_handler that a valid
+ lpfc_scsi_buf is hung off of linux's scsi_cmnd->host_scribble
+ when our reset is called.
+
+Changes from 20040507 to 20040515
+
+ * Changed version to 8.0.1
+ * Fixed crash on driver rmmod after error injection tests and
+ lpfc_tasklet deadlock.
+ * Modified lpfc.conf to remove limit on number of support hosts
+ * Removed HBAAPI
+ * Removed duplication of SCSI opcodes from lpfc_fcp.h that are
+ available in scsi/scsi.h
+ * Rework module_param usage
+ * Added MODULE_PARAM_DESC for various module_params
+ * Removed #define EXPORT_SYMTAB
+ * Removed #includes of if_arp.h and rtnetlink.h
+ * Removed string "Open Source" from MODULE_DESC
+ * Cleanup duplicated string definitions used by MODULE_DESC
+ * Renamed lpfc_pci_[detect|release] to lpfc_pci_[probe|remove]_one
+ * Fix formatting of lpfc_driver
+ * Remove unnecessary memset to 0 of lpfcDRVR
+ * Attach driver attributes always unless pci_module_init failed
+ * Remove all one-line wrappers from lpfc_mem.
+ * Remove lpfc_sysfs_set_[show|store] as it is no longer needed
+ * Redo lpfc_sysfs_params_[show|store] to one value per attribute rule
+ * Breakdown lpfc_sysfs_info_show into smaller one value per attribute
+ * Use device attributes instead of driver attributes where appropriate
+ * Remove no longer needed EXPORT_SYMBOLs
+ * Remove some unused code (1600 msg's related)
+
+Changes from 20040429 to 20040507
+
+ * Change version to 8.0.0
+ * Fix the number of cmd / rsp ring entries in lpfc_fcp.c to match
+ the divisions setup in lpfc_hw.h.
+ * Remove phba->iflag reference.
+ * Several locking improvements.
+ * Remove functions lpfc_drvr_init_lock, lpfc_drvr_lock,
+ lpfc_drvr_unlock and lpfc_hipri_*.
+ * Remove LPFC_DRVR_LOCK and LPFC_DRVR_UNLOCK macros.
+ * Make lpfc_info() use lpfc_get_hba_model_desc() instead of
+ rewriting almost identical code.
+ * Fix 1 overly long line in each of lpfc_cfgparm.h, lpfc_ftp.c and
+ lpfc_sli.c.
+ * Fix build for Red Hat 2.6.3 kernel by #defining MODULE_VERSION
+ only if it isn't already defined.
+ * Change elx_sli_issue_mbox_wait to return correct error code to
+ the caller.
+ * In some of the els completion routines, after calling
+ lpfc_elx_chk_latt, driver ignores the return code of the
+ lpfc_elx_chk_latt. This will prevent the discovery state machine
+ restarting correctly when there are link events in the middle of
+ discovery state machine running. Fix this by exiting discovery
+ state machine if lpfc_els_chk_latt returns a non zero value.
+ * Removed MAX_LPFC_BRDS from lpfc_diag.h
+ * Removed unused first_check.
+ * Remove some unused fields and defines.
+ * Change lpfc-param names to lpfc_param.
+ * Add use of MODULE_VERSION macro for 2.6 kernels.
+ * Shorten length of some of the comment lines to make them more
+ readable.
+ * Move FCP_* definitions to their own header file, lpfc_fcp.h.
+ * Remove unused prototypes from lpfc_crtn.h: fcptst, iptst,
+ lpfc_DELAYMS.
+ * Remove duplicated prototypes from lpfc_crtn.h:
+ lpfc_config_port_prep, lpfc_config_port_post,
+ lpfc_hba_down_prep.
+ * Removed some unused export_symbols.
+ * Install driver files into */drivers/scsi/lpfc instead of
+ */drivers/scsi.
+
+Changes from 20040426 to 20040429
+
+ * Declared export symbol lpfc_page_alloc and lpfc_page_free
+ * Changed lpfc version number to 6.98.3
+ * Move the definition of MAX_LPFC_BRDS to the only header file
+ that uses it (lpfc_diag.h).
+ * Change lpfc_sli_wake_iocb_wait to do a regular wake_up since
+ lpfc_sli_issue_iocb_wait now sleeps uninterruptible.
+ * Replace list_for_each() with list_for_each_safe() when a list
+ element could be deleted.
+ * Fix IOCB memory leak
+
+Changes from 20040416 to 20040426
+
+ * Change lpfc_config_port_prep() to interpret word 4 of the DUMP
+ mbox response as a byte-count
+ * Add info attribute to sysfs
+ * Minor formatting (spaces to tabs) cleanup in lpfc_sched.h
+ * Remove unused log message number 732
+ * Completing MODULE_PARM -> module_param changes
+ * Removed unused targetenable module parameter
+ * Removed locks from lpfc_sli_issue_mbox_wait routine
+ * Removed code that retry 29,00 check condition
+ * Removed code that manipulates rspSnsLen.
+ * Fix use of lun-q-depth config param
+ * Fix severity inconsistency with log message 249
+ * Removed lpfc_max_target from lpfc_linux_attach
+ * Replace references to lpfcDRVR.pHba[] with lpfc_get_phba_by_inst()
+ * Change lpfc_param to lpfc-param
+ * Partially removed 32 HBA restriction within driver. Incorported
+ lpfc_instcnt, lpfc_instance[], and pHba[] into lpfcDRVR
+ structure Added routines lpfc_get_phba_by_inst()
+ lpfc_get_inst_by_phba() lpfc_check_valid_phba()
+ * Turn on attributes "set" & "params" by default.
+ * Further formatting/whitespace/line length cleanup on: lpfc_ct.c
+ lpfc_els.c lpfc_fcp.c lpfc_hbadisc.c lpfc_init.c lpfc_ipport.c
+ lpfc_mbox.c lpfc_nportdisc.c lpfc_sched.c lpfc_sched.h
+ lpfc_scsi.h lpfc_scsiport.c lpfc_sli.c and lpfc_sli.h
+ * Add log message 249 to log any unsupported device addressing
+ modes encountered.
+ * Add support for 256 targets and 256 LUNs
+ * Fixed panic in lpfc_linkdown.
+ * Removed (struct list_head*) casting in several calls to list_del
+ * Free irq reservation and kill running timers when insmod or
+ modprobe are killed via ctrl-c
+ * Remove drivers/scsi from include path
+ * Wrap use of log message 311 in macro
+ * Detect failure return from pci_map_sg call in lpfc_os_prep_io
+ * Fix use-after-free of IOCB in lpfc_sli_process_sol_iocb which
+ was causing an Oops on 2.6.5 kernel.
+ * Cleanup use of several gotos not used for error exit.
+ * Replace memcpy_toio() and memcpy_toio() with endian-dependent
+ lpfc_memcpy_to_slim() and lpfc_memcpy_from_slim() so that for
+ big endian hosts like PPC64, the SLIM is accessed 4 bytes at a
+ time instead of as a byte-stream.
+
+Changes from 20040409 to 20040416
+
+ * The scsi_register and scsi_alloc_host OS calls can fail and
+ return a zero-valued host pointer. A ctrl-C on 2.6 kernels
+ during driver load will cause this and the driver to panic.
+ Fixed this bug. Also found a bug in the error_x handling with
+ lpfc_sli_hba_down - it was in the wrong place and the driver
+ lock was not held, but needed to be (in lpfc_linux_attach) Fixed
+ both. Did some minor comment clean up.
+ * Removed unwanted (void *) castings.
+ * Replace define of INVALID_PHYS, with kernel 2.6.5's
+ dma_mapping_error() and add a inline function for earlier
+ kernels. Remove lpfc_bad_scatterlist().
+ * Clean up formatting in hbaapi.h, lpfc.h, lpfc_cfgparm.h,
+ lpfc_crtn.h, lpfc_ct.c, lpfc_diag.h, lpfc_disc.h, lpfc_els.c,
+ lpfc_fcp.c, lpfc_hbadisc.c, lpfc_hw.h, lpfc_init.c,
+ lpfc_ipport.c, lpfc_logmsg.c, lpfc_logmsg.h and lpfc_scsiport.c
+ - mostly replacing groups of 8 spaces with hard tabs and keeping
+ lines to 80 column max..
+ * Removed LPFC_DRVR_LOCK call from lpfc_unblock_requests for 2.4
+ kernels. The lpfc_scsi_done routine already unlocks the driver
+ lock since it expects this lock to be held.
+ * Removed global lock capabilities from driver lock routines
+ * Remove SA_INTERRUPT flag from request_irq
+ * Move dma_addr_t cast inside of getPaddr macro as everywhere
+ getPaddr is used, the return is cast to dma_addr_t.
+ * Clean up formatting in lpfc_sli.c and lpfc_sysfs.c - mostly
+ replacing groups of 8 spaces with hard tabs and keeping lines
+ to 80 column max.
+ * Fix build for RHEL 2.1 BOOT kernels by always #including
+ interrupt.h in lpfc.h.
+ * Fix RHEL 3 build by #defining EXPORT_SYMTAB.
+ * Replace sprintf with snprintf in lpfc_proc_info.
+ * Fix build warnings on 2.6 kernels - remove no longer used calls
+ to character device initialization.
+ * Initial support code for discovery in tasklet conversion.
+ * Removing char interface and ioctl code.
+ * Change all elx prefixes to lpfc
+ * Replace lpfc_write_slim() & lpfc_read_slim() with memcpy_toio(),
+ memcpy_fromio(), writel() & readl().
+
+Changes from 20040402 to 20040409
+
+ * Replaced lpfc_read_hbaregs_plus_offset and
+ lpfc_write_hbaregs_plus_offset functions with readl and writel.
+ * Get rid of long mdelay's in insmod path
+ * Changed the way our pci_device_id structures are initialized
+ * Replace lpfc_read/write_CA/HA/HC/HS with calls to readl() &
+ writel() directly.
+ * Increase SLI2_SLIM to 16K Increase cmd / rsp IOCBs accordingly
+ * Removed lpfc_els_chk_latt from the lpfc_config_post function.
+ lpfc_els_chk_latt will enable the link event interrupts when
+ flogi is pending which causes two discovery state machines
+ running parallely.
+ * Add pci_disable_device to unload path.
+ * Move lpfc_sleep_event from lpfc_fcp.c to lpfc_util_ioctl.c
+ * Call dma_map_single() & pci_map_single() directly instead of via
+ macro lpfc_pci_map(). Allow address 0 for PPC64.
+ * Change sleep to uninterruptible in lpfc_sli_issue_icob_wait
+ because this function doesn't handle signals.
+ * Move lpfc_wakeup_event from lpfc_fcp.c to lpfc_ioctl.c
+ * Remove unneeded #include <linux/netdevice.h>
+ * Remove unused clock variables lpfc_clkCnt and lpfc_sec_clk.
+ * Get rid of capitalization of function names.
+ * Removed lpfc_addr_sprintf.
+ * Implemented gotos in lpfc_linux_attach for error cases.
+ * Replace mlist->dma.list = dmp->dma.list; to mlist = dmp.
+ * Remove functions lpfc_get_OsNameVersion and elx_wakeup. Change
+ elx_wakeup to wake_up_interruptible
+ * Add function lpfc_get_os_nameversion and change
+ lpfc_get_OsNameVersion to lpfc_get_os_nameversion.
+ * Remove lpfc_get_OsNameVersion
+ * Change driver name to a consistent lpfc in every visible place.
+ * Fix build warning: removed unused variable ret in lpfc_fdmi_tmo.
+ * Remove lpfc_utsname_nodename_check function
+ * Remove functions lpfc_register_intr and lpfc_unregister_intr
+ * Fill in owner field in lpfc_ops file_operations struct and
+ remove now unnecessary open and close entry points.
+ * Change function name prefixes from elx_ to lpfc_
+ * Remove special case check for TUR in elx_os_prep_io()
+ * Renamed elx_scsi.h to lpfc_scsi.h
+ * Renamed elx_sched.h to lpfc_sched.h
+ * Renamed elx_mem.h to lpfc_mem.h
+ * Renamed elx_sli.h to lpfc_sli.h
+ * Renamed elx_logmsg.h to lpfc_logmsg.h
+ * Renamed elx.h to lpfc.h
+ * Renamed elx_sli.c to lpfc_sli.c
+ * Renamed elx_sched.c to lpfc_sched.c
+ * Renamed elx_mem.c to lpfc_mem.c
+ * Renamed elx_logmsg.c to lpfc_logmsg.c
+ * Renamed lpfcLINUXfcp.c lpfc_fcp.c
+ * Renamed elx_clock.c to lpfc_clock.c
+ * Reduce stack usage in lpfc_info().
+ * Move lpip_stats structure from lpfc_hba.h to lpfc_ip.h.
+ * Move lpfc_stats and HBAEVT_t structures from lpfc_hba.h to
+ lpfc.h
+ * Remove lpfc_hba.h
+ * Remove duplicate rc definitions from
+ * Removed code which used next pointer to store mbox structure.
+ * Cleaned up list iterations.
+ * Removed non list manipulation of the next pointers.
+ * Change list_del()/INIT_LIST_HEAD sequences to list_del_init()
+ * In ELX_IOCBQ_t: Moved hipri_trigger field to iocb_flag. Combined
+ hipri_wait_queue and rsp_iocb in union
+ * Replaced casting from list_head with list_entry macro.
+ * Added ct_ndlp_context field to the ELX_IOCBQ_t.
+ * Do not use DMABUf_t list to store ndlp context
+ * Return 0 from lpfc_process_iotcl_util() when ELX_INITBRDS
+ succeeds.
+ * remove elx_os_scsiport.h
+ * Do not use DMABUf_t list to hold rpi context
+ * Replace elx_cfg_* names with lpfc_cfg-*
+ * Moved FCP activity to ring 0. Moved ELS/CT activity to ring 2.
+ * Clean up formatting of elx_sli.h (tabs for indents, 80 column
+ lines).
+ * Remove unused elxclock declaration in elx_sli.h.
+ * Since everywhere IOCB_ENTRY is used, the return value is cast,
+ move the cast into the macro.
+ * Split ioctls out into seperate files
+
+Changes from 20040326 to 20040402
+
+ * Updated ChangeLog for 20040402 SourceForge drop.
+ * Use safe list iterator for ndlp list
+ * Added code to return NLP_STE_FREED_NODE from the discovery
+ state machine functions if the node is freed from the
+ function.
+ * Fixes to DMABUF_t handling
+ * Fix for load error in discovery
+ * Remove loop_cnt variable from lpfc_rcv_plogi_unused_node.
+ * Remove nle. reference.
+ * Remove support for building 2.4 drivers
+ * Remove elx_util.h and replace elx_disc.h with lpfc_disc.h
+ * Implemented the Linux list macros in the discovery code.
+ Also moved elx_disc.h contents into lpfc_disc.h
+ * Unused variable cleanup
+ * Use Linux list macros for DMABUF_t
+ * Break up ioctls into 3 sections, dfc, util, hbaapi
+ rearranged code so this could be easily seperated into a
+ differnet module later All 3 are currently turned on by
+ defines in lpfc_ioctl.c LPFC_DFC_IOCTL, LPFC_UTIL_IOCTL,
+ LPFC_HBAAPI_IOCTL
+ * Misc cleanup: some goto's; add comments; clarify function
+ args
+ * Added code to use list macro for ELXSCSITARGET_t.
+ * New list implementation for ELX_MBOXQ_t
+ * Cleaned up some list_head casting.
+ * Put IPFC ifdef around two members of struct lpfc_nodelist.
+ * Cleaned up iocb list using list macros and list_head data
+ structure.
+ * lpfc_online() was missing some timer routines that were
+ started by lpfc_linux_attach(). These routines are now also
+ started by lpfc_online(). lpfc_offline() only stopped
+ els_timeout routine. It now stops all timeout routines
+ associated with that hba.
+ * Replace seperate next and prev pointers in struct
+ lpfc_bindlist with list_head type. In elxHBA_t, replace
+ fc_nlpbind_start and _end with fc_nlpbind_list and use
+ list_head macros to access it.
+ * Fix ulpStatus for aborting I/Os overlaps with newer firmware
+ ulpStatus values
+ * Rework params_show/store to be consistent as the other
+ routines. Remove generic'ness and rely on set attribute.
+ * Remove unused log message.
+ * Collapse elx_crtn.h and prod_crtn.h into lpfc_crtn.h
+ * Ifdef Scheduler specific routines
+ * Removed following ununsed ioclt's: ELX_READ_IOCB
+ ELX_READ_MEMSEG ELX_READ_BINFO ELX_READ_EINVAL ELX_READ_LHBA
+ ELX_READ_LXHBA ELX_SET ELX_DBG LPFC_TRACE
+ * Removed variable fc_dbg_flg
+ * Fixed a bug where HBA_Q_DEPTH was set incorrectly for
+ 3-digit HBAs. Also changed can_queue so midlayer will only
+ send (HBA_Q_DEPTH - 10) cmds.
+ * Clean up code in the error path, check condition. Remove
+ ununsed sense-related fields in lun structure.
+ * Added code for safety pools for following objects: mbuf/bpl,
+ mbox, iocb, ndlp, bind
+ * Wrapped '#include <elx_sched.h>' in '#ifdef USE_SCHEDULER'.
+ * Fixed 'make clean' target.
+ * Build now ignores elx_sched.o, and includes lpfc_sysfs.o.
+ * Wrapped lpfndd.o target in BUILD_IPFC ifdef.
+ * Removed elx_os.h inclusion in implementation files.
+ * Removed ELX_OS_IO_t data structure and put data direction
+ and non scatter/gather physical address into the scsi buffer
+ structure directly. Moved DRVR_LOCK, putPaddr, getPaddr
+ macros and some defines into elx.h since they are required
+ by the whole driver.
+ * Migrated following ioctls (debug) ELX_DISPLAY_PCI_ALL
+ ELX_DEVP ELX_READ_BPLIST ELX_RESET_QDEPTH ELX_STAT.
+ * Step 1 of attempt to move all Debug ioctls to sysfs.
+ Implemented the following IOCTLs in sysfs: ELX_WRITE_HC
+ ELX_WRITE_HS ELX_WRITE_HA ELX_WRITE_CA ELX_READ_HC
+ ELX_READ_HS ELX_READ_HA ELX_READ_CA ELX_READ_MB ELX_RESET
+ ELX_READ_HBA ELX_INSTANCE ELX_LIP. Also introduced
+ attribute "set" to be used in conjuction with the above
+ attributes.
+ * Removed DLINK, enque and deque declarations now that clock
+ doesn't use them anymore
+ * Separated install rule so that BUILD_IPFC has to be set when
+ make is called in order for the install rule to attempt to
+ copy the lpfndd.o driver. This change fixes a bug that
+ occurs because the install rule by default attempted to
+ install lpfndd.o, whereas the default make rule did not by
+ default build lpfndd.o.
+ * Keep track if hbaapi index numbers need to be refreshed.
+ * Removed prod_os.h from include list.
+ * Removed LPFC_LOCK and LPFC_UNLOCK macros. Added OS calls
+ into elx_os_scsiport.c. This file is now empty.
+ * Added spin_lock_irqsave and spin_unlock_irqrestore calls
+ into code directly and removed LPFC_LOCK_ and _UNLOCK_
+ macros
+ * Remove references to "elx_clock.h"
+ * Added utsname.h to include list. The previous checkin to
+ elx_os.h removed its inclusion of utsname.h since there is
+ precious little in the file. However, lpfcLINUXfcp.c needs
+ it and now has it.
+ * Removed some commented-out code
+ * Removed elx_lck_t data structure, stray elxDRVR_t type, and
+ include from file. No longer used.
+ * Removed two PCI Sync defines. Removed includes - not
+ needed. Cleaned up macro lines.
+ * Added two functions from elxLINUXfcp.c. These functions
+ were IPFC specific.
+ * Removed hipri lock abstractions and added OS call into code.
+ Removed elx_lck_t and added spinlock_t directly. Moved two
+ IPFC functions into lpfc_ipport.c
+ * Moved IP specific structures to lpfc_ip.h file.
+ * lpfc_ipfarp_timeout() uses system timer. Remove all usages
+ of old internal clock support.
+ * Made changes to compile without IPFC support for the default
+ build. Added ifdef IPFC for all lpfc_ip.h includes.
+ * Patched elx_free_scsi_buf
+ * Removed elx_sched.o from 2.6 dependencies
+ * Reworked lpfc_pcimap.
+ * Use Linux swap macros to replace ELX swapping macros
+ (SWAP_SHORT, SWAP_LONG, SWAP_DATA, SWAP_DATA16,
+ PCIMEM_SHORT, PCIMEM_LONG, PCIMEM_DATA).
+ * move in_interrupt() check inside of elx_sleep_ms()
+ * Moved location of pci.h include.
+ * Restored elx_lck_t types in elxHBA_t.
+ * Removed elx_pci_dma_sync call. Also removed some PCI
+ defines from elx_hw.h and removed the spinlock_t locks that
+ are no longer used in elx.h
+ * elx_iodone() now uses system timer.
+ * elx_qfull_retry() now uses system timer.
+ * lpfc_put_buf(), lpfc_ip_xri_timeout() and
+ lpfc_ip_timeout_handler() now use system timer.
+ * lpfc_fdmi_tmo() and lpfc_qthrottle_up() now use system
+ timer.
+ * Removed num_bufs and num_iocbs configuration parameters.
+ * Fixed a memory corruption bug. This was caused by a memory
+ write to ndlp structure from lpfc_cmpl_els_acc function.
+ This ndlp structure was freed from lpfc_els_unsol_event.
+ * lpfc_disc_timeout() and lpfc_establish_link_tmo() now use
+ system timer. Also update lpfc_els_retry_delay() to do a
+ single lock release at the end.
+ * Remove use of PAN (pseudo adapter number).
+ * Reintroduced usage of the cross compiler for building on
+ ppc64 to remove build errors that were cropping up when
+ using the standard gcc compiler.
+ * Fix no-unlock-before return in lpfc_els_retry_delay which was
+ causing a deadlock on insmod in some environments.
+ * Minor format changes fix up comments
+ * Create utility clock function elx_start_timer() and
+ elx_stop_timer(). All timeout routines now use these common
+ routines.
+ * Minor formating changes fix up comments
+ * Minor formatting changes get rid of failover defines for
+ syntax checking
+ * Minor formatting changes remove ISCSI defines.
+ * Fix typo in install target for 2.4 kernels.
+ * Removed unused elx_scsi_add_timer extern function
+ declaration.
+ * Cleanup casting around DMA masks.
+ * Comment out lpfndd.o modules_install section as lpfndd.o is
+ not generated if CONFIG_NET_LPFC is not set. Also refer to
+ BASEINCLUDE only in out of kernel source module builds as it
+ will not exist otherwise.
+ * Removed unused malloc counters from lpfcLINUXfcp.c.
+ * Remove some unnecessary #includes in lpfcLINUXfcp.c
+ * Remove unncessary #includes in elxLINUXfcp.c
+ * Minor formatting cleanups in Makefile to avoid some
+ linewrapping.
+ * Removed unused elx_mem_pool data structure.
+ * Remove several unnecessary #includes.
+ * Moving fix for memory leak in ioctl lip area to sysfs's lip.
+ * Removed unused elx_dma_handle_t elx_acc_handle_t
+ FC_MAX_SEGSZ and FC_MAX_POOL.
+ * Rewrite of Makefile. Fixes breakages with make -j4 during
+ kernel compile. Does not recompile all files on every
+ build. Uses the kernel build's definitions of CFLAGS,
+ MODFLAGS etc. Removed "make rpm" option.
+ * Removed unused #defines CLOSED, DEAD, OPENED, NORMAL_OPEN
+ and unneeded #include of elx_sched.h in elx.h.
+ * Several log message updates
+ * Add PCI_DEVICE_ID_FIREFLY for LP6000
+ * Fixed known issues in 20040326: driver crashes on rmmod in
+ both 2.4 and 2.6 kernels
+
+
+Changes from 20040319 to 20040326
+
+ * Updated ChangeLog for 20040326 SourceForge drop.
+ * remove lpfc_isr / lpfc_tmr logic fixed up 8 spaces from
+ previous checkins with tabs
+ * replace elx_in_intr() with in_interrupt()
+ * Remove unused messages 1602 and 1603.
+ * Fix the following issues with log messages: Remove unused
+ messages 406, 407, 409, 927, 928, 1201, 1202, 1204, 1205, 1206
+ and 1207. Create a new message 738 to fix duplicate instances
+ of 736.
+ * Removed remaining pci interface abstractions from elxLINUXfcp.c.
+ Implemented OS calls directly in all remaining files and cleaned
+ up modules. Removed prototypes as well.
+ * Removed following functions/structures elx_mem_dmapool
+ elx_idx_dmapool elx_size_dmapool elx_kmem_lock dfc_data_alloc
+ dfc_data_free dfc_mem struct mbuf_info elx_acc_handle_t
+ data_handle elx_dma_handle_t dma_handle struct elx_memseg
+ MEMSEG_t
+ * lpfc_els_timeout_handler() now uses system timer.
+ * Further cleanup of #ifdef powerpc
+ * lpfc_scsi_timeout_handler() now uses system timer.
+ * Replace common driver's own defines for endianess w/ Linux's
+ __BIG_ENDIAN etc.
+ * Added #ifdef IPFC for all IPFC specific code.
+ * lpfc_disc_retry_rptlun() now uses system timer.
+ * lpfc_npr_timeout() now uses system timer.
+ * Modified detect code, on insmod, to only wait a max of 2 secs if
+ link comes up and there are no devices.
+ * Move remaining message logging functions into
+ elx_logmsg.c/elx_logmsg.h.
+ * Added code to clear link attention bit when there is a pending
+ link event and the memory allocation for read_la mail box
+ command fails.
+ * Removed function calls for mapping bar registers and allocating
+ kernel virtual memory mappings to the mapped bars Removed
+ prototypes, lpfc_driver_cache_line, and pci_bar1_map rename to
+ pci_bar2_map.
+ * Allocate mbox only if the hba_state is in ready state.
+ * Complete lip support via sysfs. To lip, echo brdnum >
+ /sys/bus/pci/drivers/lpfc/lip.
+ * moving sysfs show/store implementations to lpfc_sysfs.c. Also add
+ support for lip.
+ * Add files: lpfc_sysfs.c, lpfc_sysfs.h
+ * move LPFC_DRIVER_NAME and LPFC_MODULE_DESC out of lpfcLINUXfcp.c
+ to lpfc_version.h, since it is now needed in lpfc_sysfs.c
+ * elx_mbox_timeout now uses system timer
+ * Changed lpfc_nodev_timeout, lpfc_els_retry_delay and
+ lpfc_linkdown_timeout to use the system timer instead of
+ internal clock support.
+ * Move remaining message logging functions in elx_util.c to
+ elx_logmsg.c.
+ * Remove some unnecessary typecasting.
+ * Remove log message that is no longer used (was used by
+ elx_str_atox).
+ * Replaced DLINK_t and SLINK_t by standard Linux list_head
+ * Removed deque macro
+ * Replaced ELX_DLINK_t ans ELX_SLINK_t by Linux struct list_head
+ (except for clock)
+ * Removed following functions from code: linux_kmalloc linux_kfree
+ elx_alloc_bigbuf elx_free_bigbuf
+ * Removed following abstract functions from the code. elx_malloc
+ elx_free elx_ip_get_rcv_buf elx_ip_free_rcv_buf
+ elx_mem_alloc_dmabuf elx_mem_alloc_dmabufext elx_mem_alloc_dma
+ elx_mem_alloc_buf lpfc_bufmap
+ * Removed custom PCI configuration #defines and replaced with
+ OS-provided #defines. Also added linux/pci.h to *.c files.
+ * Remove elx_str_ctox. Replace elx_str_atox with sscanf.
+ * Many indentation/whitespace fixes.
+ * Replace elx_str_ctox with isxdigit where it was only used to
+ check the value of a character.
+ * Removed following functions from the code. elx_kmem_free
+ elx_kmem_alloc elx_kmem_zalloc
+ * Change use of 2.4 SCSI typedef Scsi_Host_Template to struct
+ scsi_host_template for 2.6 kernels.
+ * Change use of 2.4 SCSI typedefs (Scsi_Device, Scsi_Cmnd,
+ Scsi_Request) the their real struct names.
+ * Move 2.6 compatibility irqreturn definitions to lpfc_compat.h.
+ Protect these definitions from conflicting with similar ones in
+ later 2.4 kernels.
+ * Remove unused definitions: LINUX_TGT_t, LINUX_LUN_t,
+ LINUX_BUF_t, elx_lun_t, SET_ADAPTER_STATUS.
+ * Convert pci_ calls to linux 2.6 dma_ equivalents.
+ * Removed unused types: struct buf, struct sc_buf, T_SCSIBUF
+ typedef.
+ * Fix Makefile so that 2.4 drivers don't always rebuild all files.
+ * Remove unused _static_ and fc_lun_t definitions.
+ * Cleaned up some memory pool implementation code.
+ * Fix panic with char dev changes. Turns out that 2.6.4 code does
+ the same in kernel space with the 2.4 interface style
+ definitions. So remove the new char dev code altogether.
+ * Remove typecasting from fc_get_cfg_param and consolidate
+ multiple instances of the parameter switch into a single
+ instance.
+ * Use lpfc_is_LC_HBA() macro that tests pcidev->device directly
+ instead of saving a private copy that undergoes varied shifting
+ & casting.
+ * Removed usage of all memory pools.
+
+Changes from 20040312 to 20040319
+
+ * Use dev_warn instead of printk for 2.6 kernels
+ * Correct Iocbq completion routine for 2.6 kernel case
+ * Change void *pOSCmd to Scsi_Smnd *pCmd
+ * Change void *pOScmd to struct sk_buff *pCmd
+ * Remove data directon code.
+ * Removed memory pool for buf/bpl buffers and use kmalloc/kfree
+ pci_pool_alloc/free directly.
+ * Move PPC check for DMA address 0 in scatter-gather list, into
+ lpfc_compat.h
+ * Always use pci_unmap_single() instead of pci_unmap_page()
+ * Clean up the 2.6 vs 2.4 #if blocks.
+ * Conditionalize Scheduler
+ * Add a comment to explain a little what the first Makefile
+ section does.
+ * Removed lpfc_intr_post
+ * Sysfs new display format. Also added write functionality. You
+ can [ echo "0 log_verbose 3" >
+ /sys/bus/pci/drivers/lpfc/params]. Hex support yet to be added.
+ * Removed several #ifdef powerpc, including for a discovery issue
+ in lpfc_ValidLun()
+ * Change elx_printf_log to use vsprintf.
+ * Added lpfc_compat.h provides macros to aid compilation in the
+ Linux 2.4 kernel over various platform architectures. Initially
+ support mapping to a DMA address.
+ * Removed memory pool for nlp/bind buffers and use kmalloc/kfree
+ directly.
+ * Removed memory pool for iocb buffers and use kmalloc/kfree
+ directly.
+ * Removed memory pool for mailbox buffers and use kmalloc/kfree
+ directly.
+ * Cleaned up back and forth casts
+ * Initial support for sysfs for 2.6 kernel.
+ * Changed elx_dma_addr_t to dma_addr_t
+ * Fix a 2.6 kernel check to be >= 2.6.0 instead of > (was missing
+ 2.6.0).
+ * Remove elx_printf and elx_str_sprintf. Replace elx_print with
+ printk.
+ * Replace elx_printf with printk.
+ * Replace elx_str_sprintf with sprintf.
+ * Removed the mem_lock, its prototype, function, macro, and
+ iflags.
+ * Use kmalloc/kfree for ELX_SCSI_BUF_t
+ * Use linux pci_pools for SCSI_DMA_EXT
+ * Use linux pci_pools for BPLs.
+ * Minor cleanup of DFC args for PPC64.
+ * Several small indentation cleanups.
+ * New Linux 2.6 style of char device registration.
+ * Migrated members of LPFCHBA_t and LINUX_HBA_t into elxHBA_t
+ * Use strcpy, strncmp, isdigit, strlen instead of abstractions
+ * Cleanup of driver_template.
+ * Facilitate compile time turn on/off of lpfc_network_on.
+ * Split large source files into smaller, better named ones.
+
+Changes from 2.10a to 20040312
+
+ * Fix build for 2.4 kernels
+ * Move driver version macros into lpfc_version.h file.
+ * Fixed data miscompare with LIP.
+ * Removed elx_sli, elx_ioc, elx_disc, elx_sch routines,
+ prototypes, and reference points.
+ * Correct the space insertions with hardtabs
+ * Remove routine call pointers in ELX_SLI_INIT_t struct.
+ * Removed module locks except for drvr, mem, and clock.
+ * Removed unused module locks from sourcebase. Kept drvr_lock,
+ mem_lock, and clock_lock.
+ * Change NULL to 0
diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid
new file mode 100644
index 0000000..eaa4801
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.megaraid
@@ -0,0 +1,614 @@
+Release Date : Thu Nov 16 15:32:35 EST 2006 -
+ Sumant Patro <sumant.patro@lsi.com>
+Current Version : 2.20.5.1 (scsi module), 2.20.2.6 (cmm module)
+Older Version : 2.20.4.9 (scsi module), 2.20.2.6 (cmm module)
+
+1. Changes in Initialization to fix kdump failure.
+ Send SYNC command on loading.
+ This command clears the pending commands in the adapter
+ and re-initialize its internal RAID structure.
+ Without this change, megaraid driver either panics or fails to
+ initialize the adapter during kdump's second kernel boot
+ if there are pending commands or interrupts from other devices
+ sharing the same IRQ.
+2. Authors email-id domain name changed from lsil.com to lsi.com.
+ Also modified the MODULE_AUTHOR to megaraidlinux@lsi.com
+
+Release Date : Fri May 19 09:31:45 EST 2006 - Seokmann Ju <sju@lsil.com>
+Current Version : 2.20.4.9 (scsi module), 2.20.2.6 (cmm module)
+Older Version : 2.20.4.8 (scsi module), 2.20.2.6 (cmm module)
+
+1. Fixed a bug in megaraid_init_mbox().
+ Customer reported "garbage in file on x86_64 platform".
+ Root Cause: the driver registered controllers as 64-bit DMA capable
+ for those which are not support it.
+ Fix: Made change in the function inserting identification machanism
+ identifying 64-bit DMA capable controllers.
+
+ > -----Original Message-----
+ > From: Vasily Averin [mailto:vvs@sw.ru]
+ > Sent: Thursday, May 04, 2006 2:49 PM
+ > To: linux-scsi@vger.kernel.org; Kolli, Neela; Mukker, Atul;
+ > Ju, Seokmann; Bagalkote, Sreenivas;
+ > James.Bottomley@SteelEye.com; devel@openvz.org
+ > Subject: megaraid_mbox: garbage in file
+ >
+ > Hello all,
+ >
+ > I've investigated customers claim on the unstable work of
+ > their node and found a
+ > strange effect: reading from some files leads to the
+ > "attempt to access beyond end of device" messages.
+ >
+ > I've checked filesystem, memory on the node, motherboard BIOS
+ > version, but it
+ > does not help and issue still has been reproduced by simple
+ > file reading.
+ >
+ > Reproducer is simple:
+ >
+ > echo 0xffffffff >/proc/sys/dev/scsi/logging_level ;
+ > cat /vz/private/101/root/etc/ld.so.cache >/tmp/ttt ;
+ > echo 0 >/proc/sys/dev/scsi/logging
+ >
+ > It leads to the following messages in dmesg
+ >
+ > sd_init_command: disk=sda, block=871769260, count=26
+ > sda : block=871769260
+ > sda : reading 26/26 512 byte blocks.
+ > scsi_add_timer: scmd: f79ed980, time: 7500, (c02b1420)
+ > sd 0:1:0:0: send 0xf79ed980 sd 0:1:0:0:
+ > command: Read (10): 28 00 33 f6 24 ac 00 00 1a 00
+ > buffer = 0xf7cfb540, bufflen = 13312, done = 0xc0366b40,
+ > queuecommand 0xc0344010
+ > leaving scsi_dispatch_cmnd()
+ > scsi_delete_timer: scmd: f79ed980, rtn: 1
+ > sd 0:1:0:0: done 0xf79ed980 SUCCESS 0 sd 0:1:0:0:
+ > command: Read (10): 28 00 33 f6 24 ac 00 00 1a 00
+ > scsi host busy 1 failed 0
+ > sd 0:1:0:0: Notifying upper driver of completion (result 0)
+ > sd_rw_intr: sda: res=0x0
+ > 26 sectors total, 13312 bytes done.
+ > use_sg is 4
+ > attempt to access beyond end of device
+ > sda6: rw=0, want=1044134458, limit=951401367
+ > Buffer I/O error on device sda6, logical block 522067228
+ > attempt to access beyond end of device
+
+2. When INQUIRY with EVPD bit set issued to the MegaRAID controller,
+ system memory gets corrupted.
+ Root Cause: MegaRAID F/W handle the INQUIRY with EVPD bit set
+ incorrectly.
+ Fix: MegaRAID F/W has fixed the problem and being process of release,
+ soon. Meanwhile, driver will filter out the request.
+
+3. One of member in the data structure of the driver leads unaligne
+ issue on 64-bit platform.
+ Customer reporeted "kernel unaligned access addrss" issue when
+ application communicates with MegaRAID HBA driver.
+ Root Cause: in uioc_t structure, one of member had misaligned and it
+ led system to display the error message.
+ Fix: A patch submitted to community from following folk.
+
+ > -----Original Message-----
+ > From: linux-scsi-owner@vger.kernel.org
+ > [mailto:linux-scsi-owner@vger.kernel.org] On Behalf Of Sakurai Hiroomi
+ > Sent: Wednesday, July 12, 2006 4:20 AM
+ > To: linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org
+ > Subject: Re: Help: strange messages from kernel on IA64 platform
+ >
+ > Hi,
+ >
+ > I saw same message.
+ >
+ > When GAM(Global Array Manager) is started, The following
+ > message output.
+ > kernel: kernel unaligned access to 0xe0000001fe1080d4,
+ > ip=0xa000000200053371
+ >
+ > The uioc structure used by ioctl is defined by packed,
+ > the allignment of each member are disturbed.
+ > In a 64 bit structure, the allignment of member doesn't fit 64 bit
+ > boundary. this causes this messages.
+ > In a 32 bit structure, we don't see the message because the allinment
+ > of member fit 32 bit boundary even if packed is specified.
+ >
+ > patch
+ > I Add 32 bit dummy member to fit 64 bit boundary. I tested.
+ > We confirmed this patch fix the problem by IA64 server.
+ >
+ > **************************************************************
+ > ****************
+ > --- linux-2.6.9/drivers/scsi/megaraid/megaraid_ioctl.h.orig
+ > 2006-04-03 17:13:03.000000000 +0900
+ > +++ linux-2.6.9/drivers/scsi/megaraid/megaraid_ioctl.h
+ > 2006-04-03 17:14:09.000000000 +0900
+ > @@ -132,6 +132,10 @@
+ > /* Driver Data: */
+ > void __user * user_data;
+ > uint32_t user_data_len;
+ > +
+ > + /* 64bit alignment */
+ > + uint32_t pad_0xBC;
+ > +
+ > mraid_passthru_t __user *user_pthru;
+ >
+ > mraid_passthru_t *pthru32;
+ > **************************************************************
+ > ****************
+
+Release Date : Mon Apr 11 12:27:22 EST 2006 - Seokmann Ju <sju@lsil.com>
+Current Version : 2.20.4.8 (scsi module), 2.20.2.6 (cmm module)
+Older Version : 2.20.4.7 (scsi module), 2.20.2.6 (cmm module)
+
+1. Fixed a bug in megaraid_reset_handler().
+ Customer reported "Unable to handle kernel NULL pointer dereference
+ at virtual address 00000000" when system goes to reset condition
+ for some reason. It happened randomly.
+ Root Cause: in the megaraid_reset_handler(), there is possibility not
+ returning pending packets in the pend_list if there are multiple
+ pending packets.
+ Fix: Made the change in the driver so that it will return all packets
+ in the pend_list.
+
+2. Added change request.
+ As found in the following URL, rmb() only didn't help the
+ problem. I had to increase the loop counter to 0xFFFFFF. (6 F's)
+ http://marc.theaimsgroup.com/?l=linux-scsi&m=110971060502497&w=2
+
+ I attached a patch for your reference, too.
+ Could you check and get this fix in your driver?
+
+ Best Regards,
+ Jun'ichi Nomura
+
+Release Date : Fri Nov 11 12:27:22 EST 2005 - Seokmann Ju <sju@lsil.com>
+Current Version : 2.20.4.7 (scsi module), 2.20.2.6 (cmm module)
+Older Version : 2.20.4.6 (scsi module), 2.20.2.6 (cmm module)
+
+1. Sorted out PCI IDs to remove megaraid support overlaps.
+ Based on the patch from Daniel, sorted out PCI IDs along with
+ charactor node name change from 'megadev' to 'megadev_legacy' to avoid
+ conflict.
+ ---
+ Hopefully we'll be getting the build restriction zapped much sooner,
+ but we should also be thinking about totally removing the hardware
+ support overlap in the megaraid drivers.
+
+ This patch pencils in a date of Feb 06 for this, and performs some
+ printk abuse in hope that existing legacy users might pick up on what's
+ going on.
+
+ Signed-off-by: Daniel Drake <dsd@gentoo.org>
+ ---
+
+2. Fixed a issue: megaraid always fails to reset handler.
+ ---
+ I found that the megaraid driver always fails to reset the
+ adapter with the following message:
+ megaraid: resetting the host...
+ megaraid mbox: reset sequence completed successfully
+ megaraid: fast sync command timed out
+ megaraid: reservation reset failed
+ when the "Cluster mode" of the adapter BIOS is enabled.
+ So, whenever the reset occurs, the adapter goes to
+ offline and just become unavailable.
+
+ Jun'ichi Nomura [mailto:jnomura@mtc.biglobe.ne.jp]
+ ---
+
+Release Date : Mon Mar 07 12:27:22 EST 2005 - Seokmann Ju <sju@lsil.com>
+Current Version : 2.20.4.6 (scsi module), 2.20.2.6 (cmm module)
+Older Version : 2.20.4.5 (scsi module), 2.20.2.5 (cmm module)
+
+1. Added IOCTL backward compatibility.
+ Convert megaraid_mm driver to new compat_ioctl entry points.
+ I don't have easy access to hardware, so only compile tested.
+ - Signed-off-by:Andi Kleen <ak@muc.de>
+
+2. megaraid_mbox fix: wrong order of arguments in memset()
+ That, BTW, shows why cross-builds are useful-the only indication of
+ problem had been a new warning showing up in sparse output on alpha
+ build (number of exceeding 256 got truncated).
+ - Signed-off-by: Al Viro
+ <viro@parcelfarce.linux.theplanet.co.uk>
+
+3. Convert pci_module_init to pci_register_driver
+ Convert from pci_module_init to pci_register_driver
+ (from:http://kerneljanitors.org/TODO)
+ - Signed-off-by: Domen Puncer <domen@coderock.org>
+
+4. Use the pre defined DMA mask constants from dma-mapping.h
+ Use the DMA_{64,32}BIT_MASK constants from dma-mapping.h when calling
+ pci_set_dma_mask() or pci_set_consistend_dma_mask(). See
+ http://marc.theaimsgroup.com/?t=108001993000001&r=1&w=2 for more
+ details.
+ Signed-off-by: Tobias Klauser <tklauser@nuerscht.ch>
+ Signed-off-by: Domen Puncer <domen@coderock.org>
+
+5. Remove SSID checking for Dobson, Lindsay, and Verde based products.
+ Checking the SSVID/SSID for controllers which have Dobson, Lindsay,
+ and Verde is unnecessary because device ID has been assigned by LSI
+ and it is unique value. So, all controllers with these IOPs have to be
+ supported by the driver regardless SSVID/SSID.
+
+6. Date Thu, 27 Jan 2005 04:31:09 +0100
+ From Herbert Poetzl <>
+ Subject RFC: assert_spin_locked() for 2.6
+
+ Greetings!
+
+ overcautious programming will kill your kernel ;)
+ ever thought about checking a spin_lock or even
+ asserting that it must be held (maybe just for
+ spinlock debugging?) ...
+
+ there are several checks present in the kernel
+ where somebody does a variation on the following:
+
+ BUG_ON(!spin_is_locked(&some_lock));
+
+ so what's wrong about that? nothing, unless you
+ compile the code with CONFIG_DEBUG_SPINLOCK but
+ without CONFIG_SMP ... in which case the BUG()
+ will kill your kernel ...
+
+ maybe it's not advised to make such assertions,
+ but here is a solution which works for me ...
+ (compile tested for sh, x86_64 and x86, boot/run
+ tested for x86 only)
+
+ best,
+ Herbert
+
+ - Herbert Poetzl <herbert@13thfloor.at>, Thu, 27 Jan 2005
+
+Release Date : Thu Feb 03 12:27:22 EST 2005 - Seokmann Ju <sju@lsil.com>
+Current Version : 2.20.4.5 (scsi module), 2.20.2.5 (cmm module)
+Older Version : 2.20.4.4 (scsi module), 2.20.2.4 (cmm module)
+
+1. Modified name of two attributes in scsi_host_template.
+ On Wed, 2005-02-02 at 10:56 -0500, Ju, Seokmann wrote:
+ > + .sdev_attrs = megaraid_device_attrs,
+ > + .shost_attrs = megaraid_class_device_attrs,
+
+ These are, perhaps, slightly confusing names.
+ The terms device and class_device have well defined meanings in the
+ generic device model, neither of which is what you mean here.
+ Why not simply megaraid_sdev_attrs and megaraid_shost_attrs?
+
+ Other than this, it looks fine to me too.
+
+Release Date : Thu Jan 27 00:01:03 EST 2005 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.4.4 (scsi module), 2.20.2.5 (cmm module)
+Older Version : 2.20.4.3 (scsi module), 2.20.2.4 (cmm module)
+
+1. Bump up the version of scsi module due to its conflict.
+
+Release Date : Thu Jan 21 00:01:03 EST 2005 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.4.3 (scsi module), 2.20.2.5 (cmm module)
+Older Version : 2.20.4.2 (scsi module), 2.20.2.4 (cmm module)
+
+1. Remove driver ioctl for logical drive to scsi address translation and
+ replace with the sysfs attribute. To remove drives and change
+ capacity, application shall now use the device attribute to get the
+ logical drive number for a scsi device. For adding newly created
+ logical drives, class device attribute would be required to uniquely
+ identify each controller.
+ - Atul Mukker <atulm@lsil.com>
+
+ "James, I've been thinking about this a little more, and you may be on
+ to something here. Let each driver add files as such:"
+
+ - Matt Domsch <Matt_Domsch@dell.com>, 12.15.2004
+ linux-scsi mailing list
+
+
+ "Then, if you simply publish your LD number as an extra parameter of
+ the device, you can look through /sys to find it."
+
+ - James Bottomley <James.Bottomley@SteelEye.com>, 01.03.2005
+ linux-scsi mailing list
+
+
+ "I don't see why not ... it's your driver, you can publish whatever
+ extra information you need as scsi_device attributes; that was one of
+ the designs of the extensible attribute system."
+
+ - James Bottomley <James.Bottomley@SteelEye.com>, 01.06.2005
+ linux-scsi mailing list
+
+2. Add AMI megaraid support - Brian King <brking@charter.net>
+ PCI_VENDOR_ID_AMI, PCI_DEVICE_ID_AMI_MEGARAID3,
+ PCI_VENDOR_ID_AMI, PCI_SUBSYS_ID_PERC3_DC,
+
+3. Make some code static - Adrian Bunk <bunk@stusta.de>
+ Date: Mon, 15 Nov 2004 03:14:57 +0100
+
+ The patch below makes some needlessly global code static.
+ -wait_queue_head_t wait_q;
+ +static wait_queue_head_t wait_q;
+
+ Signed-off-by: Adrian Bunk <bunk@stusta.de>
+
+4. Added NEC ROMB support - NEC MegaRAID PCI Express ROMB controller
+ PCI_VENDOR_ID_LSI_LOGIC, PCI_DEVICE_ID_MEGARAID_NEC_ROMB_2E,
+ PCI_SUBSYS_ID_NEC, PCI_SUBSYS_ID_MEGARAID_NEC_ROMB_2E,
+
+5. Fixed Tape drive issue : For any Direct CDB command to physical device
+ including tape, timeout value set by driver was 10 minutes. With this
+ value, most of command will return within timeout. However, for those
+ command like ERASE or FORMAT, it takes more than an hour depends on
+ capacity of the device and the command could be terminated before it
+ completes.
+ To address this issue, the 'timeout' field in the DCDB command will
+ have NO TIMEOUT (i.e., 4) value as its timeout on DCDB command.
+
+
+
+Release Date : Thu Dec 9 19:10:23 EST 2004
+ - Sreenivas Bagalkote <sreenib@lsil.com>
+
+Current Version : 2.20.4.2 (scsi module), 2.20.2.4 (cmm module)
+Older Version : 2.20.4.1 (scsi module), 2.20.2.3 (cmm module)
+
+i. Introduced driver ioctl that returns scsi address for a given ld.
+
+ "Why can't the existing sysfs interfaces be used to do this?"
+ - Brian King (brking@us.ibm.com)
+
+ "I've looked into solving this another way, but I cannot see how
+ to get this driver-private mapping of logical drive number-> HCTL
+ without putting code something like this into the driver."
+
+ "...and by providing a mapping a function to userspace, the driver
+ is free to change its mapping algorithm in the future if necessary .."
+ - Matt Domsch (Matt_Domsch@dell.com)
+
+Release Date : Thu Dec 9 19:02:14 EST 2004 - Sreenivas Bagalkote <sreenib@lsil.com>
+
+Current Version : 2.20.4.1 (scsi module), 2.20.2.3 (cmm module)
+Older Version : 2.20.4.1 (scsi module), 2.20.2.2 (cmm module)
+
+i. Fix a bug in kioc's dma buffer deallocation
+
+Release Date : Thu Nov 4 18:24:56 EST 2004 - Sreenivas Bagalkote <sreenib@lsil.com>
+
+Current Version : 2.20.4.1 (scsi module), 2.20.2.2 (cmm module)
+Older Version : 2.20.4.0 (scsi module), 2.20.2.1 (cmm module)
+
+i. Handle IOCTL cmd timeouts more properly.
+
+ii. pci_dma_sync_{sg,single}_for_cpu was introduced into megaraid_mbox
+ incorrectly (instead of _for_device). Changed to appropriate
+ pci_dma_sync_{sg,single}_for_device.
+
+Release Date : Wed Oct 06 11:15:29 EDT 2004 - Sreenivas Bagalkote <sreenib@lsil.com>
+Current Version : 2.20.4.0 (scsi module), 2.20.2.1 (cmm module)
+Older Version : 2.20.4.0 (scsi module), 2.20.2.0 (cmm module)
+
+i. Remove CONFIG_COMPAT around register_ioctl32_conversion
+
+Release Date : Mon Sep 27 22:15:07 EDT 2004 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.4.0 (scsi module), 2.20.2.0 (cmm module)
+Older Version : 2.20.3.1 (scsi module), 2.20.2.0 (cmm module)
+
+i. Fix data corruption. Because of a typo in the driver, the IO packets
+ were wrongly shared by the ioctl path. This causes a whole IO command
+ to be replaced by an incoming ioctl command.
+
+Release Date : Tue Aug 24 09:43:35 EDT 2004 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.3.1 (scsi module), 2.20.2.0 (cmm module)
+Older Version : 2.20.3.0 (scsi module), 2.20.2.0 (cmm module)
+
+i. Function reordering so that inline functions are defined before they
+ are actually used. It is now mandatory for GCC 3.4.1 (current stable)
+
+ Declare some heavy-weight functions to be non-inlined,
+ megaraid_mbox_build_cmd, megaraid_mbox_runpendq,
+ megaraid_mbox_prepare_pthru, megaraid_mbox_prepare_epthru,
+ megaraid_busywait_mbox
+
+ - Andrew Morton, 08.19.2004
+ linux-scsi mailing list
+
+ "Something else to clean up after inclusion: every instance of an
+ inline function is actually rendered as a full function call, because
+ the function is always used before it is defined. Atul, please
+ re-arrange the code to eliminate the need for most (all) of the
+ function prototypes at the top of each file, and define (not just
+ declare with a prototype) each inline function before its first use"
+
+ - Matt Domsch <Matt_Domsch@dell.com>, 07.27.2004
+ linux-scsi mailing list
+
+
+ii. Display elapsed time (countdown) while waiting for FW to boot.
+
+iii. Module compilation reorder in Makefile so that unresolved symbols do
+ not occur when driver is compiled non-modular.
+
+ Patrick J. LoPresti <patl@users.sourceforge.net>, 8.22.2004
+ linux-scsi mailing list
+
+
+Release Date : Thu Aug 19 09:58:33 EDT 2004 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.3.0 (scsi module), 2.20.2.0 (cmm module)
+Older Version : 2.20.2.0 (scsi module), 2.20.1.0 (cmm module)
+
+i. When copying the mailbox packets, copy only first 14 bytes (for 32-bit
+ mailboxes) and only first 22 bytes (for 64-bit mailboxes). This is to
+ avoid getting the stale values for busy bit. We want to set the busy
+ bit just before issuing command to the FW.
+
+ii. In the reset handling, if the reseted command is not owned by the
+ driver, do not (wrongly) print information for the "attached" driver
+ packet.
+
+iii. Have extended wait when issuing command in synchronous mode. This is
+ required for the cases where the option ROM is disabled and there is
+ no BIOS to start the controller. The FW starts to boot after receiving
+ the first command from the driver. The current driver has 1 second
+ timeout for the synchronous commands, which is far less than what is
+ actually required. We now wait up to MBOX_RESET_TIME (180 seconds) for
+ FW boot process.
+
+iv. In megaraid_mbox_product_info, clear the mailbox contents completely
+ before preparing the command for inquiry3. This is to ensure that the
+ FW does not get junk values in the command.
+
+v. Do away with the redundant LSI_CONFIG_COMPAT redefinition for
+ CONFIG_COMPAT. Replace <asm/ioctl32.h> with <linux/ioctl32.h>
+
+ - James Bottomley <James.Bottomley@SteelEye.com>, 08.17.2004
+ linux-scsi mailing list
+
+vi. Add support for 64-bit applications. Current drivers assume only
+ 32-bit applications, even on 64-bit platforms. Use the "data" and
+ "buffer" fields of the mimd_t structure, instead of embedded 32-bit
+ addresses in application mailbox and passthru structures.
+
+vii. Move the function declarations for the management module from
+ megaraid_mm.h to megaraid_mm.c
+
+ - Andrew Morton, 08.19.2004
+ linux-scsi mailing list
+
+viii. Change default values for MEGARAID_NEWGEN, MEGARAID_MM, and
+ MEGARAID_MAILBOX to 'n' in Kconfig.megaraid
+
+ - Andrew Morton, 08.19.2004
+ linux-scsi mailing list
+
+ix. replace udelay with msleep
+
+x. Typos corrected in comments and whitespace adjustments, explicit
+ grouping of expressions.
+
+
+Release Date : Fri Jul 23 15:22:07 EDT 2004 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.2.0 (scsi module), 2.20.1.0 (cmm module)
+Older Version : 2.20.1.0 (scsi module), 2.20.0.0 (cmm module)
+
+i. Add PCI ids for Acer ROMB 2E solution
+
+ii. Add PCI ids for I4
+
+iii. Typo corrected for subsys id for megaraid sata 300-4x
+
+iv. Remove yield() while mailbox handshake in synchronous commands
+
+
+ "My other main gripe is things like this:
+
+ + // wait for maximum 1 second for status to post
+ + for (i = 0; i < 40000; i++) {
+ + if (mbox->numstatus != 0xFF) break;
+ + udelay(25); yield();
+ + }
+
+ which litter the driver. Use of yield() in drivers is deprecated."
+
+ - James Bottomley <James.Bottomley@SteelEye.com>, 07.14.2004
+ linux-scsi mailing list
+
+v. Remove redundant __megaraid_busywait_mbox routine
+
+vi. Fix bug in the managment module, which causes a system lockup when the
+ IO module is loaded and then unloaded, followed by executing any
+ management utility. The current version of management module does not
+ handle the adapter unregister properly.
+
+ Specifically, it still keeps a reference to the unregistered
+ controllers. To avoid this, the static array adapters has been
+ replaced by a dynamic list, which gets updated every time an adapter
+ is added or removed.
+
+ Also, during unregistration of the IO module, the resources are
+ now released in the exact reverse order of the allocation time
+ sequence.
+
+
+Release Date : Fri Jun 25 18:58:43 EDT 2004 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.1.0
+Older Version : megaraid 2.20.0.1
+
+i. Stale list pointer in adapter causes kernel panic when module
+ megaraid_mbox is unloaded
+
+
+Release Date : Thu Jun 24 20:37:11 EDT 2004 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.0.1
+Older Version : megaraid 2.20.0.00
+
+i. Modules are not 'y' by default, but depend on current definition of
+ SCSI & PCI.
+
+ii. Redundant structure mraid_driver_t removed.
+
+iii. Miscellaneous indentation and goto/label fixes.
+ - Christoph Hellwig <hch@infradead.org>, 06.24.2004 linux-scsi
+
+iv. scsi_host_put(), do just before completing HBA shutdown.
+
+
+
+Release Date : Mon Jun 21 19:53:54 EDT 2004 - Atul Mukker <atulm@lsil.com>
+Current Version : 2.20.0.0
+Older Version : megaraid 2.20.0.rc2 and 2.00.3
+
+i. Independent module to interact with userland applications and
+ multiplex command to low level RAID module(s).
+
+ "Shared code in a third module, a "library module", is an acceptable
+ solution. modprobe automatically loads dependent modules, so users
+ running "modprobe driver1" or "modprobe driver2" would automatically
+ load the shared library module."
+
+ - Jeff Garzik <jgarzik@pobox.com> 02.25.2004 LKML
+
+ "As Jeff hinted, if your userspace<->driver API is consistent between
+ your new MPT-based RAID controllers and your existing megaraid driver,
+ then perhaps you need a single small helper module (lsiioctl or some
+ better name), loaded by both mptraid and megaraid automatically, which
+ handles registering the /dev/megaraid node dynamically. In this case,
+ both mptraid and megaraid would register with lsiioctl for each
+ adapter discovered, and lsiioctl would essentially be a switch,
+ redirecting userspace tool ioctls to the appropriate driver."
+
+ - Matt Domsch <Matt_Domsch@dell.com> 02.25.2004 LKML
+
+ii. Remove C99 initializations from pci_device id.
+
+ "pci_id_table_g would be much more readable when not using C99
+ initializers.
+ PCI table doesn't change, there's lots of users that prefer the more
+ readable variant. And it's really far less and much easier to grok
+ lines without C99 initializers."
+
+ - Christoph Hellwig <hch@infradead.org>, 05.28.2004 linux-scsi
+
+iii. Many fixes as suggested by Christoph Hellwig <hch@infradead.org> on
+ linux-scsi, 05.28.2004
+
+iv. We now support up to 32 parallel ioctl commands instead of current 1.
+ There is a conscious effort to let memory allocation not fail for ioctl
+ commands.
+
+v. Do away with internal memory management. Use pci_pool_(create|alloc)
+ instead.
+
+vi. Kill tasklet when unloading the driver.
+
+vii. Do not use "host_lock', driver has fine-grain locks now to protect all
+ data structures.
+
+viii. Optimize the build scatter-gather list routine. The callers already
+ know the data transfer address and length.
+
+ix. Better implementation of error handling and recovery. Driver now
+ performs extended errors recovery for instances like scsi cable pull.
+
+x. Disassociate the management commands with an overlaid scsi command.
+ Driver now treats the management packets as special packets and has a
+ dedicated callback routine.
diff --git a/Documentation/scsi/ChangeLog.megaraid_sas b/Documentation/scsi/ChangeLog.megaraid_sas
new file mode 100644
index 0000000..c851ef4
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.megaraid_sas
@@ -0,0 +1,325 @@
+
+1 Release Date : Thur.July. 24 11:41:51 PST 2008 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.04.01
+3 Older Version : 00.00.03.22
+
+1. Add the new controller (0078, 0079) support to the driver
+ Those controllers are LSI's next generatation(gen2) SAS controllers.
+
+1 Release Date : Mon.June. 23 10:12:45 PST 2008 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.22
+3 Older Version : 00.00.03.20
+
+1. Add shutdown DCMD cmd to the shutdown routine to make FW shutdown proper.
+2. Unexpected interrupt occurs in HWR Linux driver, add the dumy readl pci flush will fix this issue.
+
+1 Release Date : Mon. March 10 11:02:31 PDT 2008 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.20-RC1
+3 Older Version : 00.00.03.16
+
+1. Rollback the sense info implementation
+ Sense buffer ptr data type in the ioctl path is reverted back
+ to u32 * as in previous versions of driver.
+
+2. Fixed the driver frame count.
+ When Driver sent wrong frame count to firmware. As this
+ particular command is sent to drive, FW is seeing continuous
+ chip resets and so the command will timeout.
+
+3. Add the new controller(1078DE) support to the driver
+ and Increase the max_wait to 60 from 10 in the controller
+ operational status. With this max_wait increase, driver will
+ make sure the FW will finish the pending cmd for KDUMP case.
+
+1 Release Date : Thur. Nov. 07 16:30:43 PST 2007 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.16
+3 Older Version : 00.00.03.15
+
+1. Increased MFI_POLL_TIMEOUT_SECS to 60 seconds from 10. FW may take
+ a max of 60 seconds to respond to the INIT cmd.
+
+1 Release Date : Fri. Sep. 07 16:30:43 PST 2007 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.15
+3 Older Version : 00.00.03.14
+
+1. Added module parameter "poll_mode_io" to support for "polling"
+ (reduced interrupt operation). In this mode, IO completion
+ interrupts are delayed. At the end of initiating IOs, the
+ driver schedules for cmd completion if there are pending cmds
+ to be completed. A timer-based interrupt has also been added
+ to prevent IO completion processing from being delayed
+ indefinitely in the case that no new IOs are initiated.
+
+1 Release Date : Fri. Sep. 07 16:30:43 PST 2007 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.14
+3 Older Version : 00.00.03.13
+
+1. Setting the max_sectors_per_req based on max SGL supported by the
+ FW. Prior versions calculated this value from controller info
+ (max_sectors_1, max_sectors_2). For certain controllers/FW,
+ this was resulting in a value greater than max SGL supported
+ by the FW. Issue was first reported by users running LUKS+XFS
+ with megaraid_sas. Thanks to RB for providing the logs and
+ duplication steps that helped to get to the root cause of the
+ issue. 2. Increased MFI_POLL_TIMEOUT_SECS to 60 seconds from
+ 10. FW may take a max of 60 seconds to respond to the INIT
+ cmd.
+
+1 Release Date : Fri. June. 15 16:30:43 PST 2007 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.13
+3 Older Version : 00.00.03.12
+
+1. Added the megasas_reset_timer routine to intercept cmd timeout and throttle io.
+
+On Fri, 2007-03-16 at 16:44 -0600, James Bottomley wrote:
+It looks like megaraid_sas at least needs this to throttle its commands
+> as they begin to time out. The code keeps the existing transport
+> template use of eh_timed_out (and allows the transport to override the
+> host if they both have this callback).
+>
+> James
+
+1 Release Date : Sat May. 12 16:30:43 PST 2007 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.12
+3 Older Version : 00.00.03.11
+
+1. When MegaSAS driver receives reset call from OS, driver waits in reset
+routine for max 3 minutes for all pending command completion. Now driver will
+call completion routine every 5 seconds from the reset routine instead of
+waiting for depending on cmd completion from isr path.
+
+1 Release Date : Mon Apr. 30 10:25:52 PST 2007 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.11
+3 Older Version : 00.00.03.09
+
+ 1. Memory Manager for IOCTL removed for 2.6 kernels.
+ pci_alloc_consistent replaced by dma_alloc_coherent. With this
+ change there is no need of memory manager in the driver code
+
+ On Wed, 2007-02-07 at 13:30 -0800, Andrew Morton wrote:
+ > I suspect all this horror is due to stupidity in the DMA API.
+ >
+ > pci_alloc_consistent() just goes and assumes GFP_ATOMIC, whereas
+ > the caller (megasas_mgmt_fw_ioctl) would have been perfectly happy
+ > to use GFP_KERNEL.
+ >
+ > I bet this fixes it
+
+ It does, but the DMA API was expanded to cope with this exact case, so
+ use dma_alloc_coherent() directly in the megaraid code instead. The dev
+ is just &pci_dev->dev.
+
+ James <James.Bottomley@SteelEye.com>
+
+ 3. SYNCHRONIZE_CACHE is not supported by FW and thus blocked by driver.
+ 4. Hibernation support added
+ 5. Performing diskdump while running IO in RHEL 4 was failing. Fixed.
+
+1 Release Date : Fri Feb. 09 14:36:28 PST 2007 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+
+2 Current Version : 00.00.03.09
+3 Older Version : 00.00.03.08
+
+i. Under heavy IO mid-layer prints "DRIVER_TIMEOUT" errors
+
+ The driver now waits for 10 seconds to elapse instead of 5 (as in
+ previous release) to resume IO.
+
+1 Release Date : Mon Feb. 05 11:35:24 PST 2007 -
+ (emaild-id:megaraidlinux@lsi.com)
+ Sumant Patro
+ Bo Yang
+2 Current Version : 00.00.03.08
+3 Older Version : 00.00.03.07
+
+i. Under heavy IO mid-layer prints "DRIVER_TIMEOUT" errors
+
+ Fix: The driver is now throttling IO.
+ Checks added in megasas_queue_command to know if FW is able to
+ process commands within timeout period. If number of retries
+ is 2 or greater,the driver stops sending cmd to FW temporarily. IO is
+ resumed if pending cmd count reduces to 16 or 5 seconds has elapsed
+ from the time cmds were last sent to FW.
+
+ii. FW enables WCE bit in Mode Sense cmd for drives that are configured
+ as WriteBack. The OS may send "SYNCHRONIZE_CACHE" cmd when Logical
+ Disks are exposed with WCE=1. User is advised to enable Write Back
+ mode only when the controller has battery backup. At this time
+ Synhronize cache is not supported by the FW. Driver will short-cycle
+ the cmd and return sucess without sending down to FW.
+
+1 Release Date : Sun Jan. 14 11:21:32 PDT 2007 -
+ Sumant Patro <Sumant.Patro@lsil.com>/Bo Yang
+2 Current Version : 00.00.03.07
+3 Older Version : 00.00.03.06
+
+i. bios_param entry added in scsi_host_template that returns disk geometry
+ information.
+
+1 Release Date : Fri Oct 20 11:21:32 PDT 2006 - Sumant Patro <Sumant.Patro@lsil.com>/Bo Yang
+2 Current Version : 00.00.03.06
+3 Older Version : 00.00.03.05
+
+1. Added new memory management module to support the IOCTL memory allocation. For IOCTL we try to allocate from the memory pool created during driver initialization. If mem pool is empty then we allocate at run time.
+2. Added check in megasas_queue_command and dpc/isr routine to see if we have already declared adapter dead
+ (hw_crit_error=1). If hw_crit_error==1, now we donot accept any processing of pending cmds/accept any cmd from OS
+
+1 Release Date : Mon Oct 02 11:21:32 PDT 2006 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.03.05
+3 Older Version : 00.00.03.04
+
+i. PCI_DEVICE macro used
+
+ Convert the pci_device_id-table of the megaraid_sas-driver to the PCI_DEVICE-macro, to safe some lines.
+
+ - Henrik Kretzschmar <henne@nachtwindheim.de>
+ii. All compiler warnings removed
+iii. megasas_ctrl_info struct reverted to 3.02 release
+iv. Default value of megasas_dbg_lvl set to 0
+v. Removing in megasas_exit the sysfs entry created for megasas_dbg_lvl
+vi. In megasas_teardown_frame_pool(), cmd->frame was passed instead of
+ cmd->sense to pci_pool_free. Fixed. Bug was pointed out by
+ Eric Sesterhenn
+
+1 Release Date : Wed Sep 13 14:22:51 PDT 2006 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.03.04
+3 Older Version : 00.00.03.03
+
+i. Added Reboot notify
+ii. Reduced by 1 max cmds sent to FW from Driver to make the reply_q_sz same
+ as Max Cmds FW can support
+
+1 Release Date : Tue Aug 22 16:33:14 PDT 2006 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.03.03
+3 Older Version : 00.00.03.02
+
+i. Send stop adapter to FW & Dump pending FW cmds before declaring adapter dead.
+ New varible added to set dbg level.
+ii. Disable interrupt made as fn pointer as they are different for 1068 / 1078
+iii. Frame count optimization. Main frame can contain 2 SGE for 64 bit SGLs and
+ 3 SGE for 32 bit SGL
+iv. Tasklet added for cmd completion
+v. If FW in operational state before firing INIT, now we send RESET Flag to FW instead of just READY. This is used to do soft reset.
+vi. megasas_ctrl_prop structure updated (based on FW struct)
+vii. Added print : FW now in Ready State during initialization
+
+1 Release Date : Sun Aug 06 22:49:52 PDT 2006 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.03.02
+3 Older Version : 00.00.03.01
+
+i. Added FW tranistion state for Hotplug scenario
+
+1 Release Date : Sun May 14 22:49:52 PDT 2006 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.03.01
+3 Older Version : 00.00.02.04
+
+i. Added support for ZCR controller.
+
+ New device id 0x413 added.
+
+ii. Bug fix : Disable controller interrupt before firing INIT cmd to FW.
+
+ Interrupt is enabled after required initialization is over.
+ This is done to ensure that driver is ready to handle interrupts when
+ it is generated by the controller.
+
+ -Sumant Patro <Sumant.Patro@lsil.com>
+
+1 Release Date : Wed Feb 03 14:31:44 PST 2006 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.02.04
+3 Older Version : 00.00.02.04
+
+i. Remove superflous instance_lock
+
+ gets rid of the otherwise superflous instance_lock and avoids an unsave
+ unsynchronized access in the error handler.
+
+ - Christoph Hellwig <hch@lst.de>
+
+
+1 Release Date : Wed Feb 03 14:31:44 PST 2006 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.02.04
+3 Older Version : 00.00.02.04
+
+i. Support for 1078 type (ppc IOP) controller, device id : 0x60 added.
+ During initialization, depending on the device id, the template members
+ are initialized with function pointers specific to the ppc or
+ xscale controllers.
+
+ -Sumant Patro <Sumant.Patro@lsil.com>
+
+1 Release Date : Fri Feb 03 14:16:25 PST 2006 - Sumant Patro
+ <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.02.04
+3 Older Version : 00.00.02.02
+i. Register 16 byte CDB capability with scsi midlayer
+
+ "Ths patch properly registers the 16 byte command length capability of the
+ megaraid_sas controlled hardware with the scsi midlayer. All megaraid_sas
+ hardware supports 16 byte CDB's."
+
+ -Joshua Giles <joshua_giles@dell.com>
+
+1 Release Date : Mon Jan 23 14:09:01 PST 2006 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.02.02
+3 Older Version : 00.00.02.01
+
+i. New template defined to represent each family of controllers (identified by processor used).
+ The template will have defintions that will be initialised to appropritae values for a specific family of controllers. The template definition has four function pointers. During driver initialisation the function pointers will be set based on the controller family type. This change is done to support new controllers that has different processors and thus different register set.
+
+ -Sumant Patro <Sumant.Patro@lsil.com>
+
+1 Release Date : Mon Dec 19 14:36:26 PST 2005 - Sumant Patro <Sumant.Patro@lsil.com>
+2 Current Version : 00.00.02.00-rc4
+3 Older Version : 00.00.02.01
+
+i. Code reorganized to remove code duplication in megasas_build_cmd.
+
+ "There's a lot of duplicate code megasas_build_cmd. Move that out of the different codepathes and merge the reminder of megasas_build_cmd into megasas_queue_command"
+
+ - Christoph Hellwig <hch@lst.de>
+
+ii. Defined MEGASAS_IOC_FIRMWARE32 for code paths that handles 32 bit applications in 64 bit systems.
+
+ "MEGASAS_IOC_FIRMWARE can't be redefined if CONFIG_COMPAT is set, we need to define a MEGASAS_IOC_FIRMWARE32 define so native binaries continue to work"
+
+ - Christoph Hellwig <hch@lst.de>
diff --git a/Documentation/scsi/ChangeLog.ncr53c8xx b/Documentation/scsi/ChangeLog.ncr53c8xx
new file mode 100644
index 0000000..a9f721a
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.ncr53c8xx
@@ -0,0 +1,495 @@
+Sat May 12 12:00 2001 Gerard Roudier (groudier@club-internet.fr)
+ * version ncr53c8xx-3.4.3b
+ - Ensure LEDC bit in GPCNTL is cleared when reading the NVRAM.
+ Fix sent by Stig Telfer <stig@api-networks.com>.
+ - Define scsi_set_pci_device() as nil for kernel < 2.4.4.
+
+Mon Feb 12 22:30 2001 Gerard Roudier (groudier@club-internet.fr)
+ * version ncr53c8xx-3.4.3
+ - Call pci_enable_device() as AC wants this to be done.
+ - Get both the BAR cookies actual and PCI BAR values.
+ (see Changelog.sym53c8xx rev. 1.7.3 for details)
+ - Merge changes for linux-2.4 that declare the host template
+ in the driver object also when the driver is statically
+ linked with the kernel.
+
+Sun Sep 24 21:30 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version ncr53c8xx-3.4.2
+ - See Changelog.sym53c8xx, driver version 1.7.2.
+
+Wed Jul 26 23:30 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version ncr53c8xx-3.4.1
+ - Provide OpenFirmare path through the proc FS on PPC.
+ - Remove trailing argument #2 from a couple of #undefs.
+
+Sun Jul 09 16:30 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version ncr53c8xx-3.4.0
+ - Remove the PROFILE C and SCRIPTS code.
+ This facility was not this useful and thus was not longer
+ desirable given the increasing complexity of the driver code.
+ - Merges from FreeBSD sym-1.6.2 driver:
+ * Clarify memory barriers needed by the driver for architectures
+ that implement a weak memory ordering.
+ - General cleanup:
+ Move definitions for barriers and IO/MMIO operations to the
+ sym53c8xx_defs.h header files. They are now shared by the
+ both drivers.
+ Use SCSI_NCR_IOMAPPED instead of NCR_IOMAPPED.
+
+Thu May 11 12:30 2000 Pam Delaney (pam.delaney@lsil.com)
+ * revision 3.3b
+
+Mon Apr 24 12:00 2000 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2i
+ - Return value 1 (instead of 0) from the driver setup routine.
+ - Let the driver also attach controllers that have been set to
+ OFF in the NVRAM as it did prior to revision 3.2g.
+
+Sat Apr 1 12:00 2000 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2h
+ - Fix a compilation problem on Alpha introduced in version 3.2g.
+ (`port' changed to `base_io').
+ - Move from `sym' to this driver a tiny change for __sparc__ that
+ applies to cache line size (? Probably from David S Miller).
+ - Make sure no data transfer will happen for Scsi_Cmnd requests
+ that supply SCSI_DATA_NONE direction (this avoids some BUG()
+ statement in the PCI code when a data buffer is also supplied).
+
+Thu Mar 16 9:30 2000 Pam Delaney (pam.delaney@lsil.com)
+ * revision 3.3b-3
+ - Added exclusion for the 53C1010 and 53C1010_66 chips
+ to the driver (change to sym53c8xx_comm.h).
+
+Mon March 6 23:15 2000 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2g
+ - Add the file sym53c8xx_comm.h that collects code that should
+ be shared by sym53c8xx and ncr53c8xx drivers. For now, it is
+ a header file that is only included by the ncr53c8xx driver,
+ but things will be cleaned up later. This code addresses
+ notably:
+ * Chip detection and PCI related initialisations
+ * NVRAM detection and reading
+ * DMA mapping
+ * Boot setup command
+ * And some other ...
+ - Add support for the new dynamic dma mapping kernel interface.
+ Requires Linux-2.3.47 (tested with pre-2.3.47-6).
+ - Get data transfer direction from the scsi command structure
+ (Scsi_Cmnd) when this information is available.
+
+Mon March 6 23:15 2000 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2g
+ - Add the file sym53c8xx_comm.h that collects code that should
+ be shared by sym53c8xx and ncr53c8xx drivers. For now, it is
+ a header file that is only included by the ncr53c8xx driver,
+ but things will be cleaned up later. This code addresses
+ notably:
+ * Chip detection and PCI related initialisations
+ * NVRAM detection and reading
+ * DMA mapping
+ * Boot setup command
+ * And some other ...
+ - Add support for the new dynamic dma mapping kernel interface.
+ Requires Linux-2.3.47 (tested with pre-2.3.47-6).
+ - Get data transfer direction from the scsi command structure
+ (Scsi_Cmnd) when this information is available.
+
+Fri Jan 14 14:00 2000 Pam Delaney (pam.delaney@lsil.com)
+ * revision pre-3.3b-1
+ - Merge parallel driver series 3.31 and 3.2e
+
+Tue Jan 11 14:00 2000 Pam Delaney (pam.delaney@lsil.com)
+ * revision 3.31
+ - Added support for mounting disks on wide-narrow-wide
+ scsi configurations.
+ - Built off of version 3.30
+
+Mon Jan 10 13:30 2000 Pam Delaney (pam.delaney@lsil.com)
+ * revision 3.30
+ - Added capability to use the integrity checking code
+ in the kernel (optional).
+ - Disabled support for the 53C1010.
+ - Built off of version 3.2c
+
+Sat Jan 8 22:00 2000 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2e
+ - Add year 2000 copyright.
+ - Display correctly bus signals when bus is detected wrong.
+ - Remove the dead code that broke driver 3.2d.
+
+Mon Dec 6 22:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2d
+ - Change messages written by the driver at initialisation and
+ through the /proc FS (rather cosmetic changes that consist in
+ printing out the PCI bus number and device/function).
+ - Get rid of the old PCI bios interface, but preserve kernel 2.0
+ compatibility from a simple wrapper.
+ - Remove the compilation condition about having to acquire the
+ io_request_lock since it seems to be a definite feature now.:)
+ - proc_dir structure no longer needed for kernel >= 2.3.27.
+ - Change the driver detection code by the sym53c8xx one, modulo
+ some minor changes. The driver can now attach any number of
+ controllers (>40) and does no longer hoger stack space at
+ initialisation.
+ - Definitely disable overlapped PCI arbitration for all dual
+ function chips, since I cannot make sure for what chip revisions
+ it is actually safe.
+ - Add support for the SYM53C1510D.
+ - Update the poor Tekram sync factor table.
+ - Remove the compilation condition about having to acquire the
+ io_request_lock since it seems to be a definite feature now.:)
+ - proc_dir structure no longer needed for kernel >= 2.3.27.
+
+Sat Sep 11 18:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2c
+ - Handle correctly (hopefully) jiffies wrap-around.
+ - Restore the entry used to detect 875 until revision 0xff.
+ (I removed it inadvertently, it seems :) )
+ - Replace __initfunc() which is deprecated stuff by __init which
+ is not yet so. ;-)
+ - Add support of some 'resource handling' for linux-2.3.13.
+ Basically the BARs have been changed to something more complex
+ in the pci_dev structure.
+ - Remove some deprecated code.
+
+Sat May 10 11:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * revision pre-3.2b-1
+ - Support for the 53C895A by Pamela Delaney <pam.delaney@lsil.com>
+ The 53C895A contains all of the features of the 896 but has only
+ one channel and has a 32 bit PCI bus. It does 64 bit PCI addressing
+ using dual cycle PCI data transfers.
+ - Miscellaneous minor fixes.
+ - Some additions to the README.ncr53c8xx file.
+
+Sun Apr 11 10:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2a
+ - Add 'hostid:#id' boot option. This option allows to change the
+ default SCSI id the driver uses for controllers.
+ - Remove nvram layouts and driver set-up structures from the C source,
+ and use the one defined in sym53c8xx_defs.h file.
+ (shared by both drivers).
+ - Set for now MAX LUNS to 16 (instead of 8).
+
+Thu Mar 11 23:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.2 (8xx-896 driver bundle)
+ - Only define the host template in ncr53c8xx.h and include the
+ sym53c8xx_defs.h file.
+ - Declare static all symbols that do not need to be visible from
+ outside the driver code.
+ - Add 'excl' boot command option that allows to pass to the driver
+ io address of devices not to attach.
+ - Add info() function called from the host template to print
+ driver/host information.
+ - Minor documentation additions.
+
+Sat Mar 6 11:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.1h
+ - Fix some oooold bug that hangs the bus if a device rejects a
+ negotiation. Btw, the corresponding stuff also needed some cleanup
+ and thus the change is a bit larger than it could have been.
+ - Still some typo that made compilation fail for 64 bit (trivial fix).
+
+Sun Feb 14:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.1g
+ - Deal correctly with 64 bit PCI address registers on Linux 2.2.
+ Pointed out by Leonard Zubkoff.
+ - Allow to tune request_irq() flags from the boot command line using
+ ncr53c8xx=irqm:??, as follows:
+ a) If bit 0x10 is set in irqm, IRQF_SHARED flag is not used.
+ b) If bit 0x20 is set in irqm, IRQF_DISABLED flag is not used.
+ By default the driver uses both IRQF_SHARED and IRQF_DISABLED.
+ Option 'ncr53c8xx=irqm:0x20' may be used when an IRQ is shared by
+ a 53C8XX adapter and a network board.
+ - Tiny mispelling fixed (ABORT instead of ABRT). Was fortunately
+ harmless.
+ - Negotiate SYNC data transfers with CCS devices.
+
+Sat Jan 16 17:30 1999 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.1f
+ - Some PCI fix-ups not needed any more for PPC (from Cort).
+ - Cache line size set to 16 DWORDS for Sparc (from DSM).
+ - Waiting list look-up didn't work for the first command of the list.
+ - Remove 2 useless lines of code.
+
+Sun Dec 13 18:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.1e
+ - Same work-around as for the 53c876 rev <= 0x15 for 53c896 rev 1:
+ Disable overlapped arbitration. This will not make difference
+ since the chip has on-chip RAM.
+
+Thu Nov 26 22:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.1d
+ - The SISL RAID change requires now remap_pci_mem() stuff to be
+ compiled for __i386__ when normal IOs are used.
+ - Minor spelling fixes in doc files.
+
+Sat Nov 21 18:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.1c
+ - Ignore chips that are driven by SISL RAID (DAC 960).
+ Change sent by Leonard Zubkoff and slightly reworked.
+ - Still a buglet in the tags initial settings that needed to be fixed.
+ It was not possible to disable TGQ at system startup for devices
+ that claim TGQ support. The driver used at least 2 for the queue
+ depth but did'nt keep track of user settings for tags depth lower
+ than 2.
+
+Wed Nov 11 10:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.1b
+ - The driver was unhappy when configured with default_tags > MAX_TAGS
+ Hopefully doubly-fixed.
+ - Update the Configure.help driver section that speaks of TAGS.
+
+Wed Oct 21 21:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.1a
+ - Changes from Eddie Dost for Sparc and Alpha:
+ ioremap/iounmap support for Sparc.
+ pcivtophys changed to bus_dvma_to_phys.
+ - Add the 53c876 description to the chip table. This is only useful
+ for printing the right name of the controller.
+ - DEL-441 Item 2 work-around for the 53c876 rev <= 5 (0x15).
+ - Add additional checking of INQUIRY data:
+ Check INQUIRY data received length is at least 7. Byte 7 of
+ inquiry data contains device features bits and the driver might
+ be confused by garbage. Also check peripheral qualifier.
+ - Cleanup of the SCSI tasks management:
+ Remove the special case for 32 tags. Now the driver only uses the
+ scheme that allows up to 64 tags per LUN.
+ Merge some code from the 896 driver.
+ Use a 1,3,5,...MAXTAGS*2+1 tag numbering. Previous driver could
+ use any tag number from 1 to 253 and some non conformant devices
+ might have problems with large tag numbers.
+ - 'no_sync' changed to 'no_disc' in the README file. This is an old
+ and trivial mistake that seems to demonstrate the README file is
+ not often read. :)
+
+Sun Oct 4 14:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.0i
+ - Cosmetic changes for sparc (but not for the driver) that needs
+ __irq_itoa() to be used for printed IRQ value to be understandable.
+ - Some problems with the driver that didn't occur using driver 2.5f
+ were due to a SCSI selection problem triggered by a clearly
+ documented feature that in fact seems not to work: (53C8XX chips
+ are claimed by the manuals to be able to execute SCSI scripts just
+ after abitration while the SCSI core is performing SCSI selection).
+ This optimization is broken and has been removed.
+ - Some broken scsi devices are confused when a negotiation is started
+ on a LUN that does not correspond to a real device. According to
+ SCSI specs, this is a device firmware bug. This has been worked
+ around by only starting negotiation if the LUN has previously be
+ used for at least 1 successful SCSI command.
+ - The 'last message sent' printed out on M_REJECT message reception
+ was read from the SFBR i/o register after the previous message had
+ been sent.
+ This was not correct and affects all previous driver versions and
+ the original FreeBSD one as well. The SCSI scripts has been fixed
+ so that it now provides the right information to the C code.
+
+Sat Jul 18 13:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.0g
+ - Preliminary fixes for Big Endian (sent by Eddie C. Dost).
+ Big Endian architectures should work again with the driver.
+ Eddie's patch has been partially applied since current 2.1.109
+ does not have all the Sparc changes of the vger tree.
+ - Use of BITS_PER_LONG instead of (~0UL == 0xffffffffUL) has fixed
+ the problem observed when the driver was compiled using EGCS or
+ PGCC.
+
+Mon Jul 13 20:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.0f
+ - Some spelling fixes.
+ - linux/config.h misplaced in ncr53c8xx.h
+ - MODULE_PARM stuff added for linux 2.1.
+ - check INQUIRY response data format is exactly 2.
+ - use BITS_PER_LONG if defined.
+
+Sun Jun 28 12:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.0e
+ - Some cleanup, spelling fixes, version checks, documentations
+ changes, etc ...
+
+Sat Jun 20 20:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.0c
+ - Add a boot setup option that allows to set up device queue depths
+ at boot-up. This option is very useful since Linux does not
+ allow to change scsi device queue depth once the system has been
+ booted up.
+
+Sun Jun 15 23:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.0a
+ - Support for up to 64 TAGS per LUN.
+ - Rewrite the TARGET vs LUN capabilities management.
+ CmdQueue is now handled as a LUN capability as it shall be.
+ This also fixes a bug triggered when disabling tagged command
+ queuing for a device that had this feature enabled.
+ - Remove the ncr_opennings() stuff that was useless under Linux
+ and hard to understand to me.
+ - Add "setverbose" procfs driver command. It allows to tune
+ verbose level after boot-up. Setting this level to zero, for
+ example avoid flooding the syslog file.
+ - Add KERN_XXX to some printk's.
+
+Tue Jun 10 23:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 3.0
+ - Linux config changes for 2.0.34:
+ Remove NVRAM detection config option. This option is now enabled
+ by default but can be disabled by editing the driver header file.
+ Add a PROFILE config option.
+ - Update Configure.help
+ - Add calls to new function mdelay() for milli-seconds delay if
+ kernel version >= 2.1.105.
+ - Replace all printf(s) by printk(s). After all, the ncr53c8xx is
+ a driver for Linux.
+ - Perform auto-sense on COMMAND TERMINATED. Not sure it is useful.
+ - Some other minor changes.
+
+Tue Jun 4 23:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6n
+ - Code cleanup and simplification:
+ Remove kernel 1.2.X and 1.3.X support.
+ Remove the _old_ target capabilities table.
+ Remove the error recovery code that have'nt been really useful.
+ Use a single alignment boundary (CACHE_LINE_SIZE) for data
+ structures.
+ - Several aggressive SCRIPTS optimizations and changes:
+ Reselect SCRIPTS code rewritten.
+ Support for selection/reselection without ATN.
+ And some others.
+ - Miscallaneous changes in the C code:
+ Count actual number of CCB queued to the controller (future use).
+ Lots of other minor changes.
+
+Wed May 13 20:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6m
+ - Problem of missed SCSI bus reset with the 53C895 fixed by
+ Richard Waltham. The 53C895 needs about 650 us for the bus
+ mode to settle. Delays used while resetting the controller
+ and the bus have been adjusted. Thanks Richard!
+ - Some simplification for 64 bit arch done ccb address testing.
+ - Add a check of the MSG_OUT phase after Selection with ATN.
+ - The new tagged queue stuff seems ok, so some informationnal
+ message have been conditionned by verbose >= 3.
+ - Donnot reset if a SBMC interrupt reports the same bus mode.
+ - Print out the whole driver set-up. Some options were missing and
+ the print statement was misplaced for modules.
+ - Ignore a SCSI parity interrupt if the chip is not connected to
+ the SCSI bus.
+
+Sat May 1 16:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6l
+ - Add CCB done queue support for Alpha and perhaps some other
+ architectures.
+ - Add some barriers to enforce memory ordering for x86 and
+ Alpha architectures.
+ - Fix something that looks like an old bug in the nego SIR
+ interrupt code in case of negotiation failure.
+
+Sat Apr 25 21:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6k
+ - Remove all accesses to the on-chip RAM from the C code:
+ Use SCRIPTS to load the on-chip RAM.
+ Use SCRIPTS to repair the start queue on selection timeout.
+ Use the copy of script in main memory to calculate the chip
+ context on phase mismatch.
+ - The above allows now to use the on-chip RAM without requiring
+ to get access to the on-chip RAM from the C code. This makes
+ on-chip RAM useable for linux-1.2.13 and for Linux-Alpha for
+ instance.
+ - Some simplifications and cleanups in the SCRIPTS and C code.
+ - Buglet fixed in parity error recovery SCRIPTS (never tested).
+ - Minor updates in README.ncr53c8xx.
+
+Wed Apr 15 21:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6j
+ - Incorporate changes from linux-2.1.95 ncr53c8xx driver version.
+ - Add SMP support for linux-2.1.95 and above.
+ - Fix a bug when QUEUE FULL is returned and no commands are
+ disconnected. This happens with Atlas I / L912 and may happen
+ with Atlas II / LXY4.
+ - Nail another one on CHECK condition when requeuing the command
+ for auto-sense.
+ - Call scsi_done() for all completed commands after interrupt
+ handling.
+ - Increase the done queue to 24 entries.
+
+Sat Apr 4 20:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6i
+ - CTEST0 is used by the 53C885 for Power Management and
+ priority setting between the 2 functions.
+ Use SDID instead as actual target number. Just have had to
+ overwrite it with SSID on reselection.
+ - Split DATA_IN and DATA_OUT scripts into 2 sub-scripts.
+ 64 segments are moved from on-chip RAM scripts.
+ If more segments, a script in main memory is used for the
+ additional segments.
+ - Since the SCRIPTS processor continues SCRIPTS execution after
+ having won arbitration, do some stuff prior to testing any SCSI
+ phase on reselection. This should have the vertue to process
+ scripts in parallel with the SCSI core performing selection.
+ - Increase the done queue to 12 entries.
+
+Sun Mar 29 12:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6h
+ - Some fixes.
+
+Tue Mar 26 23:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6g
+ - New done queue. 8 entries by default (6 always useable).
+ Can be increased if needed.
+ - Resources management using doubly linked queues.
+ - New auto-sense and QUEUE FULL handling that does not need to
+ stall the NCR queue any more.
+ - New CCB starvation avoiding algorithm.
+ - Prepare CCBs for SCSI commands that cannot be queued, instead of
+ inserting these commands into the waiting list. The waiting list
+ is now only used while resetting and when memory for CCBs is not
+ yet available?
+
+Sun Feb 8 22:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6f
+ - Some fixes in order to really support the 53C895, at least with
+ FAST-20 devices.
+ - Heavy changes in the target/lun resources management to allow
+ the scripts to jump directly to the CCB on reselection instead
+ of walking on the lun CCBs list. Up to 32 tags per lun are now
+ supported without script processor and PCI traffic overhead.
+
+Sun Jan 11 22:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * revision 2.6d
+ - new (different ?) implementation of the start queue:
+ Use a simple CALL to a launch script in the CCB.
+ - implement a minimal done queue (1 entry :-) ).
+ this avoid scanning all CCBs on INT FLY (Only scan all CCBs, on
+ overflow). Hit ratio is better than 99.9 % on my system, so no
+ need to have a larger done queue.
+ - generalization of the restart of CCB on special condition as
+ Abort, QUEUE FULL, CHECK CONDITION.
+ This has been called 'silly scheduler'.
+ - make all the profiling code conditionned by a config option.
+ This spare some PCI traffic and C code when this feature is not
+ needed.
+ - handle more cleanly the situation where direction is unknown.
+ The pointers patching is now performed by the SCRIPTS processor.
+ - remove some useless scripts instructions.
+
+ Ported from driver 2.5 series:
+ ------------------------------
+ - Use FAST-5 instead of SLOW for slow scsi devices according to
+ new SPI-2 draft.
+ - Make some changes in order to accommodate with 875 rev <= 3
+ device errata listing 397. Minor consequences are:
+ . Leave use of PCI Write and Invalidate under user control.
+ Now, by default the driver does not enable PCI MWI and option
+ 'specf:y' is required in order to enable this feature.
+ . Memory Read Line is not enabled for 875 and 875-like chips.
+ . Programmed burst length set to 64 DWORDS (instead of 128).
+ (Note: SYMBIOS uses 32 DWORDS for the SDMS BIOS)
+ - Add 'buschk' boot option.
+ This option enables checking of SCSI BUS data lines after SCSI
+ RESET (set by default). (Submitted by Richard Waltham).
+ - Update the README file.
+ - Dispatch CONDITION MET and RESERVATION CONFLICT scsi status
+ as OK driver status.
+ - Update the README file and the Symbios NVRAM format definition
+ with removable media flags values (available with SDMS 4.09).
+ - Several PCI configuration registers fix-ups for powerpc.
+ (Patch sent by Cort).
diff --git a/Documentation/scsi/ChangeLog.sym53c8xx b/Documentation/scsi/ChangeLog.sym53c8xx
new file mode 100644
index 0000000..ef985ec
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.sym53c8xx
@@ -0,0 +1,593 @@
+Sat May 12 12:00 2001 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.7.3c
+ - Ensure LEDC bit in GPCNTL is cleared when reading the NVRAM.
+ Fix sent by Stig Telfer <stig@api-networks.com>.
+ - Backport from SYM-2 the work-around that allows to support
+ hardwares that fail PCI parity checking.
+ - Check that we received at least 8 bytes of INQUIRY response
+ for byte 7, that contains device capabilities, to be valid.
+ - Define scsi_set_pci_device() as nil for kernel < 2.4.4.
+ - + A couple of minor changes.
+
+Sat Apr 7 19:30 2001 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.7.3b
+ - Fix an unaligned LOAD from scripts (was used as dummy read).
+ - In ncr_soft_reset(), only try to ABORT the current operation
+ for chips that support SRUN bit in ISTAT1 and if SCRIPTS are
+ currently running, as 896 and 1010 manuals suggest.
+ - In the CCB abort path, do not assume that the CCB is currently
+ queued to SCRIPTS. This is not always true, notably after a
+ QUEUE FULL status or when using untagged commands.
+
+Sun Mar 4 18:30 2001 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.7.3a
+ - Fix an issue in the ncr_int_udc() (unexpected disconnect)
+ handling. If the DSA didn't match a CCB, a bad write to
+ memory could happen.
+
+Mon Feb 12 22:30 2001 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.7.3
+ - Support for hppa.
+ Tiny patch sent to me by Robert Hirst.
+ - Tiny patch for ia64 sent to me by Pamela Delaney.
+
+Tue Feb 6 13:30 2001 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.7.3-pre1
+ - Call pci_enable_device() as AC wants this to be done.
+ - Get both the BAR cookies used by CPU and actual PCI BAR
+ values used from SCRIPTS. Recent PCI chips are able to
+ access themselves using internal cycles, but they compare
+ BAR values to destination address to make decision.
+ Earlier chips simply use PCI transactions to access IO
+ registers from SCRIPTS.
+ The bus_dvma_to_mem() interface that reverses the actual
+ PCI BAR value from the BAR cookie is now useless.
+ This point had been discussed at the list and the solution
+ got approved by PCI code maintainer (Martin Mares).
+ - Merge changes for linux-2.4 that declare the host template
+ in the driver object also when the driver is statically
+ linked with the kernel.
+ - Increase SCSI message size up to 12 bytes, given that 8
+ bytes was not enough for the PPR message (fix).
+ - Add field 'maxoffs_st' (max offset for ST data transfers).
+ The C1010 supports offset 62 in DT mode but only 31 in
+ ST mode, to 2 different values for the max SCSI offset
+ are needed. Replace the obviously wrong masking of the
+ offset against 0x1f for ST mode by a lowering to
+ maxoffs_st of the SCSI offset in ST mode.
+ - Refine a work-around for the C1010-66. Revision 1 does
+ not requires extra cycles in DT DATA OUT phase.
+ - Add a missing endian-ization (abrt_tbl.addr).
+ - Minor clean-up in the np structure for fields accessed
+ from SCRIPTS that requires special alignments.
+
+Sun Sep 24 21:30 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.7.2
+ - Remove the hack for PPC added in previous driver version.
+ - Add FE_DAC feature bit to distinguish between 64 bit PCI
+ addressing (FE_DAC) and 64 bit PCI interface (FE_64BIT).
+ - Get rid of the boot command line "ultra:" argument.
+ This parameter wasn't that clever since we can use "sync:"
+ for Ultra/Ultra2 settings, and for Ultra3 we may want to
+ pass PPR options (for now only DT clocking).
+ - Add FE_VARCLK feature bit that indicates that SCSI clock
+ frequency may vary depending on board design and thus,
+ the driver should try to evaluate the SCSI clock.
+ - Simplify the way the driver determine the SCSI clock:
+ ULTRA3 -> 160 MHz, ULTRA2 -> 80 MHz otherwise 40 MHz.
+ Measure the SCSI clock frequency if FE_VARCLK is set.
+ - Remove FE_CLK80 feature bit that got useless.
+ - Add support for the SYM53C875A (Pamela Delaney).
+
+Wed Jul 26 23:30 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.7.1
+ - Provide OpenFirmare path through the proc FS on PPC.
+ - Download of on-chip SRAM using memcpy_toio() doesn't work
+ on PPC. Restore previous method (MEMORY MOVE from SCRIPTS).
+ - Remove trailing argument #2 from a couple of #undefs.
+
+Sun Jul 09 16:30 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.7.0
+ - Remove the PROFILE C and SCRIPTS code.
+ This facility was not this useful and thus was not longer
+ desirable given the increasing complexity of the driver code.
+ - Merges from FreeBSD sym-1.6.2 driver:
+ * Clarify memory barriers needed by the driver for architectures
+ that implement a weak memory ordering.
+ * Simpler handling of illegal phases and data overrun from
+ SCRIPTS. These errors are now immediately reported to
+ the C code by an interrupt.
+ * Sync the residual handling code with sym-1.6.2 and now
+ report `resid' to user for linux version >= 2.3.99
+ - General cleanup:
+ Move definitions for barriers and IO/MMIO operations to the
+ sym53c8xx_defs.h header files. They are now shared by the
+ both drivers.
+ Remove unused options that claimed to optimize for the 896.
+ If fact, they were not this clever. :)
+ Use SCSI_NCR_IOMAPPED instead of NCR_IOMAPPED.
+ Remove a couple of unused fields from data structures.
+
+Thu May 11 12:40 2000 Pam Delaney (pam.delaney@lsil.com)
+ * version sym53c8xx-1.6b
+ - Merged version.
+
+Mon Apr 24 12:00 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5m
+ - Return value 1 (instead of 0) from the driver setup routine.
+ - Do not enable PCI DAC cycles. This just broke support for
+ SYM534C896 on sparc64. Problem fixed by David S. Miller.
+
+Fri Apr 14 9:00 2000 Pam Delaney (pam.delaney@lsil.com)
+ * version sym53c8xx-1.6b-9
+ - Added 53C1010_66 support.
+ - Small fix to integrity checking code.
+ - Removed requirement for integrity checking if want to run
+ at ultra 3.
+
+Sat Apr 1 12:00 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5l
+ - Tiny change for __sparc__ appeared in 2.3.99-pre4.1 that
+ applies to cache line size (? Probably from David S Miller).
+ - Make sure no data transfer will happen for Scsi_Cmnd requests
+ that supply SCSI_DATA_NONE direction (this avoids some BUG()
+ statement in the PCI code when a data buffer is also supplied).
+
+Sat Mar 11 12:00 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.6b-5
+ - Test against expected data transfer direction from SCRIPTS.
+ - Add support for the new dynamic dma mapping kernel interface.
+ Requires Linux-2.3.47 (tested with pre-2.3.47-6).
+ Many thanks to David S. Miller for his preliminary changes
+ that have been useful guidelines.
+ - Get data transfer direction from the scsi command structure
+ (Scsi_Cmnd) with kernels that provide this information.
+
+Mon Mar 6 23:30 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5k
+ - Test against expected data transfer direction from SCRIPTS.
+ - Revert the change in 'ncr_flush_done_cmds()' but unmap the
+ scsi dma buffer prior to queueing the command to our done
+ list.
+ - Miscellaneous (minor) fixes in the code added in driver
+ version 1.5j.
+
+Mon Feb 14 4:00 2000 Pam Delaney (pam.delaney@lsil.com)
+ * version sym53c8xx-pre-1.6b-2.
+ - Updated the SCRIPTS error handling of the SWIDE
+ condition - to remove any reads of the sbdl
+ register. Changes needed because the 896 and 1010
+ chips will check parity in some special circumstances.
+ This will cause a parity error interrupt if not in
+ data phase. Changes based on those made in the
+ FreeBSD driver version 1.3.2.
+
+Sun Feb 20 11:00 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5j
+ - Add support for the new dynamic dma mapping kernel interface.
+ Requires Linux-2.3.47 (tested with pre-2.3.47-6).
+ Many thanks to David S. Miller for his preliminary changes
+ that have been useful guidelines, for having reviewed the
+ code and having tested this driver version on Ultra-Sparc.
+ - 2 tiny bugs fixed in the PCI wrapper that provides support
+ for early kernels without pci device structure.
+ - Get data transfer direction from the scsi command structure
+ (Scsi_Cmnd) with kernels that provide this information.
+ - Fix an old bug that only affected 896 rev. 1 when driver
+ profile support option was set in kernel configuration.
+
+Fri Jan 14 14:00 2000 Pam Delaney (pam.delaney@lsil.com)
+ * version sym53c8xx-pre-1.6b-1.
+ - Merge parallel driver series 1.61 and 1.5e
+
+Tue Jan 11 14:00 2000 Pam Delaney (pam.delaney@lsil.com)
+ * version sym53c8xx-1.61
+ - Added support for mounting disks on wide-narrow-wide
+ scsi configurations.
+ - Modified offset to be a maximum of 31 in ST mode,
+ 62 in DT mode.
+ - Based off of 1.60
+
+Mon Jan 10 10:00 2000 Pam Delaney (pam.delaney@lsil.com)
+ * version sym53c8xx-1.60
+ - Added capability to use the integrity checking code
+ in the kernel (optional).
+ - Added PPR negotiation.
+ - Added support for 53C1010 Ultra 3 part.
+ - Based off of 1.5f
+
+Sat Jan 8 22:00 2000 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5h
+ - Add year 2000 copyright.
+ - Display correctly bus signals when bus is detected wrong.
+ - Some fix for Sparc from DSM that went directly to kernel tree.
+
+Mon Dec 6 22:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5g
+ - Change messages written by the driver at initialisation and
+ through the /proc FS (rather cosmetic changes that consist in
+ printing out the PCI bus number and PCI device/function).
+ - Ensure the SCRIPTS processor is stopped while calibrating the
+ SCSI clock (the initialisation code has been a bit reworked).
+ Change moved to the FreeBSD sym_hipd driver).
+ - Some fixes in the MODIFY_DP/IGN_RESIDUE code and residual
+ calculation (moved from FreeBSD sym_hipd driver).
+ - Add NVRAM support for Tekram boards that use 24C16 EEPROM.
+ Code moved from the FreeBSD sym_hipd driver, since it has
+ been that one that got this feature first.
+ - Definitely disable overlapped PCI arbitration for all dual
+ function chips, since I cannot make sure for what chip revisions
+ it is actually safe.
+ - Add support for the SYM53C1510D (also for ncr53c8xx).
+ - Fix up properly the PCI latency timer when needed or asked for.
+ - Get rid of the old PCI bios interface, but preserve kernel 2.0
+ compatibility from a simple wrapper.
+ - Update the poor Tekram sync factor table.
+ - Fix in a tiny 'printk' bug that may oops in case of extended
+ errors (unrecovered parity error, data overrun, etc ...)
+ (Sent by Pamela Delaney from LSILOGIC)
+ - Remove the compilation condition about having to acquire the
+ io_request_lock since it seems to be a definite feature now.:)
+ - Change get_pages by GetPages since Linux >= 2.3.27 now wants
+ get_pages to ever be used as a kernel symbol (from 2.3.27).
+ - proc_dir structure no longer needed for kernel >= 2.3.27.
+
+Sun Oct 3 19:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5f
+ - Change the way the driver checks the PCI clock frequency, so
+ that overclocked PCI BUS up to 48 MHz will not be refused.
+ The more the BUS is overclocked, the less the driver will
+ guarantee that its measure of the SCSI clock is correct.
+ - Backport some minor improvements of SCRIPTS from the sym_hipd
+ driver.
+ - Backport the code rewrite of the START QUEUE dequeuing (on
+ bad scsi status received) from the sym_hipd driver.
+
+Sat Sep 11 11:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5e
+ - New linux-2.3.13 __setup scheme support added.
+ - Cleanup of the extended error status handling:
+ Use 1 bit per error type.
+ - Also save the extended error status prior to auto-sense.
+ - Add the FE_DIFF chip feature bit to indicate support of
+ diff probing from GPIO3 (825/825A/876/875).
+ - Remove the quirk handling that has been useless since day one.
+ - Work-around PCI chips being reported twice on some platforms.
+ - Add some redundant PCI reads in order to deal with common
+ bridge misbehaviour regarding posted write flushing.
+ - Add some other conditionnal code for people who have to deal
+ with really broken bridges (they will have to edit a source
+ file to try these options).
+ - Handle correctly (hopefully) jiffies wrap-around.
+ - Restore the entry used to detect 875 until revision 0xff.
+ (I removed it inadvertently, it seems :) )
+ - Replace __initfunc() which is deprecated stuff by __init which
+ is not yet so. ;-)
+ - Rewrite the MESSAGE IN scripts more generic by using a MOVE
+ table indirect. Extended messages of any size are accepted now.
+ (Size is limited to 8 for now, but a constant is just to be
+ increased if necessary)
+ - Fix some bug in the fully untested MDP handling:) and share
+ some code between MDP handling and residual calculation.
+ - Calculate the data transfer residual as the 2's complement
+ integer (A positive value in returned on data overrun, and
+ a negative one on underrun).
+ - Add support of some 'resource handling' for linux-2.3.13.
+ Basically the BARs have been changed to something more complex
+ in the pci_dev structure.
+ - Remove some deprecated code.
+
+Sat Jun 5 11:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5c
+ - Do not negotiate on auto-sense if we are currently using 8 bit
+ async transfer for the target.
+ - Only check for SISL/RAID on i386 platforms.
+ (A problem has been reported on PPC with that code).
+ - On MSG REJECT for a negotiation, the driver attempted to restart
+ the SCRIPT processor when this one was already running.
+
+Sat May 29 12:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5b
+ - Force negotiation prior auto-sense.
+ This ensures that the driver will be able to grab the sense data
+ from a device that has received a BUS DEVICE RESET message from
+ another initiator.
+ - Complete all disconnected CCBs for a logical UNIT if we are told
+ about a UNIT ATTENTION for a RESET condition by this target.
+ - Add the control command 'cleardev' that allows to send a ABORT
+ message to a logical UNIT (for test purpose).
+
+Tue May 25 23:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5a
+ - Add support for task abort and bus device reset SCSI message
+ and implement proper synchonisation with SCRIPTS to handle
+ correctly task abortion without races.
+ - Send an ABORT message (if untagged) or ABORT TAG message (if tagged)
+ when the driver is told to abort a command that is disconnected and
+ complete the command with appropriate error.
+ If the aborted command is not yet started, remove it from the start
+ queue and complete it with error.
+ - Add the control command 'resetdev' that allows to send a BUS
+ DEVICE RESET message to a target (for test purpose).
+ - Clean-up some unused or useless code.
+
+Fri May 21 23:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.5
+ - Add support for CHMOV with Wide controllers.
+ - Handling of the SWIDE (low byte residue at the end of a CHMOV
+ in DATA IN phase with WIDE transfer when the byte count gets odd).
+ - Handling of the IGNORE WIDE RESIDUE message.
+ Handled from SCRIPTS as possible with some optimizations when both
+ a wide device and the controller are odd at the same time (SWIDE
+ present and IGNORE WIDE RESIDUE message on the BUS at the same time).
+ - Check against data OVERRUN/UNDERRUN condition at the end of a data
+ transfer, whatever a SWIDE is present (OVERRUN in DATA IN phase)
+ or the SODL is full (UNDERRUN in DATA out phase).
+ - Handling of the MODIFY DATA POINTER message.
+ This one cannot be handled from SCRIPTS, but hopefully it will not
+ happen very often. :)
+ - Large rewrite of the SCSI MESSAGE handling.
+
+Sun May 9 11:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.4
+ - Support for IMMEDIATE ARBITRATION.
+ See the README file for detailed information about this feature.
+ Requires both a compile option and a boot option.
+ - Minor SCRIPTS optimization in reselection pattern for LUN 0.
+ - Simpler algorithm to deal with SCSI command starvation.
+ Just use 2 tag counters in flip/flop and switch to the other
+ one every 3 seconds.
+ - Do some work in SCRIPTS after the SELECT instruction and prior
+ to testing for a PHASE. SYMBIOS say this feature is working fine.
+ (Btw, only problems with Toshiba 3401B had been reported).
+ - Measure the PCI clock speed and do not attach controllers if
+ result is greater than 37 MHz. Since the precision of the
+ algorithm (from Stefan Esser) is better than 2%, this should
+ be fine.
+ - Fix the misdetection of SYM53C875E (was detected as a 876).
+ - Fix the misdetection of SYM53C810 not A (was detected as a 810A).
+ - Support for up to 256 TAGS per LUN (CMD_PER_LUN).
+ Currently limited to 255 due to Linux limitation. :)
+ - Support for up to 508 active commands (CAN_QUEUE).
+ - Support for the 53C895A by Pamela Delaney <pam.delaney@lsil.com>
+ The 53C895A contains all of the features of the 896 but has only
+ one channel and has a 32 bit PCI bus. It does 64 bit PCI addressing
+ using dual cycle PCI data transfers.
+ - Miscellaneous minor fixes.
+ - Some additions to the README.ncr53c8xx file.
+
+Tue Apr 15 10:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.3e
+ - Support for any number of LUNs (64) (SPI2-compliant).
+ (Btw, this may only be ever useful under linux-2.2 ;-))
+
+Sun Apr 11 10:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.3d
+ - Add 'hostid:#id' boot option. This option allows to change the
+ default SCSI id the driver uses for controllers.
+ - Make SCRIPTS not use self-mastering for PCI.
+ There were still 2 places the driver used this feature of the
+ 53C8XX family.
+ - Move some data structures (nvram layouts and driver set-up) to
+ the sym53c8xx_defs.h file. So, the both drivers will share them.
+ - Set MAX LUNS to 16 (instead of 8).
+
+Sat Mar 20 21:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.3b
+ - Add support for NCR PQS PDS.
+ James Bottomley <James.Bottomley@columbiasc.ncr.com>
+ - Allow value 0 for host ID.
+ - Support more than 8 controllers (> 40 in fact :-) )
+ - Add 'excl=#ioaddr' boot option: exclude controller.
+ (Version 1.3a driver)
+
+Thu Mar 11 23:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.3 (8xx-896 driver bundle)
+ - Equivalent changes as ncr53c8xx-3.2 due to the driver bundle.
+ (See Changelog.ncr53c8xx)
+ - Do a normal soft reset as first chip reset, since aborting current
+ operation may raise an interrupt we are not able to handle since
+ the interrupt handler is not yet established.
+
+Sat Mar 6 11:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.2b
+ - Fix some oooold bug that hangs the bus if a device rejects a
+ negotiation. Btw, the corresponding stuff also needed some cleanup
+ and thus the change is a bit larger than it could have been.
+ - Still some typo that made compilation fail for 64 bit (trivial fix).
+
+Sun Feb 21 20:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.2a
+ - The rewrite of the interrupt handling broke the SBMC interrupt
+ handling due to a 1 bit mask tiny error. Hopefully fixed.
+ - If INQUIRY came from a scatter list, the driver looked into
+ the scatterlist instead of the data.:) Since this should never
+ happen, we just discard the data if use_sg is not zero.
+
+Fri Feb 12 23:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.2
+ - Major rewrite of the interrupt handling and recovery stuff for
+ the support of non compliant SCSI removal, insertion and all
+ kinds of screw-up that may happen on the SCSI BUS.
+ Hopefully, the driver is now unbreakable or may-be, it is just
+ quite brocken. :-)
+ Many thanks to Johnson Russel (Symbios) for having responded to
+ my questions and for his interesting advices and comments about
+ support of SCSI hot-plug.
+ - Add 'recovery' option to driver set-up.
+ - Negotiate SYNC data transfers with CCS devices.
+ - Deal correctly with 64 bit PCI address registers on Linux 2.2.
+ Pointed out by Leonard Zubkoff.
+
+Sun Jan 31 18:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.1a
+ - Some 896 chip revisions (all for now :-)), may hang-up if the
+ soft reset bit is set at the wrong time while SCRIPTS are running.
+ We need to first abort the current SCRIPTS operation prior to
+ resetting the chip. This fix has been sent to me by SYMBIOS/LSI
+ and I just translated it into ncr53c8xx syntax.
+ Must be considered 100 % trustable, unless I did some mistake
+ when translating it. :-)
+
+Sun Jan 24 18:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.1
+ - Major rewrite of the SCSI parity error handling.
+ The informations contained in the data manuals are incomplete about
+ this feature.
+ I asked SYMBIOS about and got in reply the explanations that are
+ _indeed_ missing in the data manuals.
+ - Allow to tune request_irq() flags from the boot command line using
+ ncr53c8xx=irqm:??, as follows:
+ a) If bit 0x10 is set in irqm, SA_SHIRQ flag is not used.
+ b) If bit 0x20 is set in irqm, SA_INTERRUPT flag is not used.
+ By default the driver uses both SA_SHIRQ and SA_INTERRUPT.
+ Option 'ncr53c8xx=irqm:0x20' may be used when an IRQ is shared by
+ a 53C8XX adapter and a network board.
+ - Fix for 64 bit PCI address register calculation. (Lance Robinson)
+ - Fix for big-endian in phase mismatch handling. (Michal Jaegermann)
+
+Fri Jan 1 20:00 1999 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.0a
+ - Waiting list look-up didn't work for the first command of the list.
+ Hopefully fixed, but tested on paper only. ;)
+ - Remove the most part of PPC specific code for Linux-2.2.
+ Thanks to Cort.
+ - Some other minors changes.
+
+Sat Dec 19 21:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version sym53c8xx-1.0
+ - Define some new IO registers for the 896 (istat1, mbox0, mbox1)
+ - Revamp slighly the Symbios NVRAM lay-out based on the excerpt of
+ the header file I received from Symbios.
+ - Check the PCI bus number for the boot order (Using a fast
+ PCI controller behing a PCI-PCI bridge seems sub-optimal).
+ - Disable overlapped PCI arbitration for the 896 revision 1.
+ - Reduce a bit the number of IO register reads for phase mismatch
+ by reading DWORDS at a time instead of BYTES.
+
+Thu Dec 3 24:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.18
+ - I received this afternoon a 896 from SYMBIOS and started testing
+ the driver with this beast. After having fixed 3 buglets, it worked
+ with all features enabled including the phase mismatch handling
+ from SCRIPTS. Since this feature is not yet tested enough, the
+ boot option 'ncr53c8xx=specf:1' is still required to enable the
+ driver to handle PM from SCRIPTS.
+
+Sun Nov 29 18:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.17
+ - The SISL RAID change requires now remap_pci_mem() stuff to be
+ compiled for __i386__ when normal IOs are used.
+ - The PCI memory read from SCRIPTS that should ensure ordering
+ was in fact misplaced. BTW, this may explain why broken PCI
+ device drivers regarding ordering are working so well. ;-)
+ - Rewrite ncr53c8xx_setup (boot command line options) since the
+ binary code was a bit too bloated in my opinion.
+ - Make the code simpler in the wakeup_done routine.
+
+Tue Nov 24 23:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.16
+ - Add SCSI_NCR_OPTIMIZE_896_1 compile option and 'optim' boot option.
+ When set, the driver unconditionnaly assumes that the interrupt
+ handler is called for command completion, then clears INTF, scans
+ the done queue and returns if some completed CCB is found. If no
+ completed CCB are found, interrupt handling will proceed normally.
+ With a 896 that handles MA from SCRIPTS, this can be a great win,
+ since the driver will never performs PCI read transactions, but
+ only PCI write transactions that may be posted.
+ If the driver haven't to also raise the SIGP this would be perfect.
+ Even with this penalty, I think that this will work great.
+ Obviously this optimization makes sense only if the IRQ is not
+ shared with another device.
+ - Still a buglet in the tags initial settings that needed to be fixed.
+ It was not possible to disable TGQ at system startup for devices
+ that claim TGQ support. The driver used at least 2 for the queue
+ depth but did'nt keep track of user settings for tags depth lower
+ than 2.
+
+Thu Nov 19 23:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.15
+ - Add support for hardware LED control of the 896.
+ - Ignore chips that are driven by SISL RAID (DAC 960).
+ Change sent by Leonard Zubkoff and slightly reworked.
+ - Prevent 810A rev 11 and 860 rev 1 from using cache line based
+ transactions since those early chip revisions may use such on
+ LOAD/STORE instructions (work-around).
+ - Remove some useless and bloat code from the pci init stuff.
+ - Do not use the readX()/writeX() kernel functions for __i386__,
+ since they perform useless masking operations in order to deal
+ with broken driver in 2.1.X kernel.
+
+Wed Nov 11 10:00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.14
+ - The driver was unhappy when configured with default_tags > MAX_TAGS
+ Hopefully doubly-fixed.
+ - Set PCI_PARITY in PCI_COMMAND register in not set (PCI fix-up).
+ - Print out some message if phase mismatch is handled from SCRIPTS.
+
+Sun Nov 1 14H00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.13
+ - Some rewrite of the device detection code. This code had been
+ patched too much and needed to be face-lifted a bit.
+ Remove all platform dependent fix-ups that was not needed or
+ conflicted with some other driver code as work-arounds.
+ Reread the NVRAM before the calling of ncr_attach(). This spares
+ stack space and so allows to handle more boards.
+ Handle 64 bit base addresses under linux-2.0.X.
+ Set MASTER bit in PCI COMMAND register if not set.
+
+Wed Oct 30 22H00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.12
+ - Damned! I just broke the driver for Alpha by leaving a stale
+ instruction in the source code. Hopefully fixed.
+ - Do not set PFEN when it is useless. Doing so we are sure that BOF
+ will be active, since the manual appears to be very unclear on what
+ feature is actually used by the chip when both PFEN and BOF are
+ set.
+
+Sat Oct 24 16H00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.11
+ - LOAD/STORE instructions were miscompiled for register offsets
+ beyond 0x7f. This broke accesses to 896' new registers.
+ - Disable by default Phase Mismatch handling from SCRIPTS, since
+ current 896 rev.1 seems not to operate safely with the driver
+ when this feature is enabled (and above LOAD/STORE fix applied).
+ I will change the default to 'enabled' when this problem will be
+ solved.
+ Using boot option 'ncr53c8xx=specf:1' enables this feature.
+ - Implement a work-around (DEL 472 - ITEM 5) that should allow the
+ driver to safely enable hardware phase mismatch with 896 rev. 1.
+
+Tue Oct 20 22H00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.10
+ - Add the 53c876 description to the chip table. This is only useful
+ for printing the right name of the controller.
+ - Add additional checking of INQUIRY data:
+ Check INQUIRY data received length is at least 7. Byte 7 of
+ inquiry data contains device features bits and the driver might
+ be confused by garbage. Also check peripheral qualifier.
+ - Use a 1,3,5,...MAXTAGS*2+1 tag numbering. Previous driver could
+ use any tag number from 1 to 253 and some non conformant devices
+ might have problems with large tag numbers.
+ - Use NAME53C and NAME53C8XX for chip name prefix chip family name.
+ Just give a try using "sym53c" and "sym53c8xx" instead of "ncr53c"
+ and "ncr53c8xx". :-)
+
+Sun Oct 11 17H00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.9
+ - DEL-441 Item 2 work-around for the 53c876 rev <= 5 (0x15).
+ - Break ncr_scatter() into 2 functions in order to guarantee best
+ possible code optimization for the case we get a scatter list.
+ - Add the code intended to support up to 1 tera-byte for 64 bit systems.
+ It is probably too early, but I wanted to complete the thing.
+
+Sat Oct 3 14H00 1998 Gerard Roudier (groudier@club-internet.fr)
+ * version pre-sym53c8xx-0.8
+ - Do some testing with io_mapped and fix what needed to be so.
+ - Wait for SCSI selection to complete or time-out immediately after
+ the chip won arbitration, since executing SCRIPTS while the SCSI
+ core is performing SCSI selection breaks the selection procedure
+ at least for some chip revisions.
+ - Interrupt the SCRIPTS if a device does not go to MSG OUT phase after
+ having been selected with ATN. Such a situation is not recoverable,
+ better to fail when we are stuck.
diff --git a/Documentation/scsi/ChangeLog.sym53c8xx_2 b/Documentation/scsi/ChangeLog.sym53c8xx_2
new file mode 100644
index 0000000..18a5d71
--- /dev/null
+++ b/Documentation/scsi/ChangeLog.sym53c8xx_2
@@ -0,0 +1,144 @@
+Sat Dec 30 21:30 2000 Gerard Roudier
+ * version sym-2.1.0-20001230
+ - Initial release of SYM-2.
+
+Mon Jan 08 21:30 2001 Gerard Roudier
+ * version sym-2.1.1-20010108
+ - Change a couple of defines containing ncr or NCR by their
+ equivalent containing sym or SYM instead.
+
+Sun Jan 14 22:30 2001 Gerard Roudier
+ * version sym-2.1.2-20010114
+ - Fix a couple of printfs:
+ * Add the target number to the display of transfer parameters.
+ * Make the display of TCQ and queue depth clearer.
+
+Wed Jan 17 23:30 2001 Gerard Roudier
+ * version sym-2.1.3-20010117
+ - Wrong residual values were returned in some situations.
+ This broke cdrecord with linux-2.4.0, for example.
+
+Sat Jan 20 18:00 2001 Gerard Roudier
+ * version sym-2.1.4-20010120
+ - Add year 2001 to Copyright.
+ - A tiny bug in the dma memory freeing path has been fixed.
+ (Driver unload failed with a bad address reference).
+
+Wed Jan 24 21:00 2001 Gerard Roudier
+ * version sym-2.1.5-20010124
+ - Make the driver work under Linux-2.4.x when statically linked
+ with the kernel.
+ - Check against memory allocation failure for SCRIPTZ and add the
+ missing free of this memory on instance detach.
+ - Check against GPIO3 pulled low for HVD controllers (driver did
+ just the opposite).
+ Misdetection of BUS mode was triggered on module reload only,
+ since BIOS settings were trusted instead on first load.
+
+Wed Feb 7 21:00 2001 Gerard Roudier
+ * version sym-2.1.6-20010207
+ - Call pci_enable_device() as wished by kernel maintainers.
+ - Change the sym_queue_scsiio() interface.
+ This is intended to simplify portability.
+ - Move the code intended to deal with the dowloading of SCRIPTS
+ from SCRIPTS :) in the patch method (was wrongly placed in
+ the SCRIPTS setup method).
+ - Add a missing cpu_to_scr() (np->abort_tbl.addr)
+ - Remove a wrong cpu_to_scr() (np->targtbl_ba)
+ - Cleanup a bit the PPR failure recovery code.
+
+Sat Mar 3 21:00 2001 Gerard Roudier
+ - Add option SYM_OPT_ANNOUNCE_TRANSFER_RATE and move the
+ corresponding code to file sym_misc.c.
+ Also move the code that sniffes INQUIRY to sym_misc.c.
+ This allows to share the corresponding code with NetBSD
+ without polluating the core driver source (sym_hipd.c).
+ - Add optionnal code that handles IO timeouts from the driver.
+ (not used under Linux, but required for NetBSD)
+ - Donnot assume any longer that PAGE_SHIFT and PAGE_SIZE are
+ defined at compile time, as at least NetBSD uses variables
+ in memory for that.
+ - Refine a work-around for the C1010-33 that consists in
+ disabling internal LOAD/STORE. Was applied up to revision 1.
+ Is now only applied to revision 0.
+ - Some code reorganisations due to code moves between files.
+
+Tues Apr 10 21:00 2001 Gerard Roudier
+ * version sym-2.1.9-20010412
+ - Reset 53C896 and 53C1010 chip according to the manual.
+ (i.e.: set the ABRT bit in ISTAT if SCRIPTS are running)
+ - Set #LUN in request sense only if scsi version <= 2 and
+ #LUN <= 7.
+ - Set busy_itl in LCB to 1 if the LCB is allocated and a
+ SCSI command is active. This is a simplification.
+ - In sym_hcb_free(), do not scan the free_ccbq if no CCBs
+ has been allocated. This fixes a panic if attach failed.
+ - Add DT/ST (double/simple transition) in the transfer
+ negotiation announce.
+ - Forces the max number of tasks per LUN to at least 64.
+ - Use pci_set_dma_mask() for linux-2.4.3 and above.
+ - A couple of comments fixes.
+
+Wed May 22:00 2001 Gerard Roudier
+ * version sym-2.1.10-20010509
+ - Mask GPCNTL against 0x1c (was 0xfc) for the reading of the NVRAM.
+ This ensure LEDC bit will not be set on 896 and later chips.
+ Fix sent by Chip Salzenberg <chip@perlsupport.com>.
+ - Define the number of PQS BUSes supported.
+ Fix sent by Stig Telfer <stig@api-networks.com>
+ - Miscellaneous common code rearrangements due to NetBSD accel
+ ioctl support, without impact on Linux (hopefully).
+
+Mon July 2 12:00 2001 Gerard Roudier
+ * version sym-2.1.11-20010702
+ - Add Tekram 390 U2B/U2W SCSI LED handling.
+ Submitted by Chip Salzenberg <chip@valinux.com>
+ - Add call to scsi_set_pci_device() for kernels >= 2.4.4.
+ - Check pci dma mapping failures and complete the IO with some
+ error when such mapping fails.
+ - Fill in instance->max_cmd_len for kernels > 2.4.0.
+ - A couple of tiny fixes ...
+
+Sun Sep 9 18:00 2001 Gerard Roudier
+ * version sym-2.1.12-20010909
+ - Change my email address.
+ - Add infrastructure for the forthcoming 64 bit DMA addressing support.
+ (Based on PCI 64 bit patch from David S. Miller)
+ - Donnot use anymore vm_offset_t type.
+
+Sat Sep 15 20:00 2001 Gerard Roudier
+ * version sym-2.1.13-20010916
+ - Add support for 64 bit DMA addressing using segment registers.
+ 16 registers for up to 4 GB x 16 -> 64 GB.
+
+Sat Sep 22 12:00 2001 Gerard Roudier
+ * version sym-2.1.14-20010922
+ - Complete rewrite of the eh handling. The driver is now using a
+ semaphore in order to behave synchronously as required by the eh
+ threads. A timer is also used to prevent from waiting indefinitely.
+
+Sun Sep 30 17:00 2001 Gerard Roudier
+ * version sym-2.1.15-20010930
+ - Include <linux/module.h> unconditionnaly as expected by latest
+ kernels.
+ - Use del_timer_sync() for recent kernels to kill the driver timer
+ on module release.
+
+Sun Oct 28 15:00 2001 Gerard Roudier
+ * version sym-2.1.16-20011028
+ - Slightly simplify driver configuration.
+ - Prepare a new patch against linux-2.4.13.
+
+Sat Nov 17 10:00 2001 Gerard Roudier
+ * version sym-2.1.17
+ - Fix a couple of gcc/gcc3 warnings.
+ - Allocate separately from the HCB the array for CCBs hashed by DSA.
+ All driver memory allocations are now not greater than 1 PAGE
+ even on PPC64 / 4KB PAGE surprising setup.
+
+Sat Dec 01 18:00 2001 Gerard Roudier
+ * version sym-2.1.17a
+ - Use u_long instead of U32 for the IO base cookie. This is more
+ consistent with what archs are expecting.
+ - Use MMIO per default for Power PC instead of some fake normal IO,
+ as Paul Mackerras stated that MMIO works fine now on this arch.
diff --git a/Documentation/scsi/FlashPoint.txt b/Documentation/scsi/FlashPoint.txt
new file mode 100644
index 0000000..d5acaa3
--- /dev/null
+++ b/Documentation/scsi/FlashPoint.txt
@@ -0,0 +1,163 @@
+The BusLogic FlashPoint SCSI Host Adapters are now fully supported on Linux.
+The upgrade program described below has been officially terminated effective
+31 March 1997 since it is no longer needed.
+
+
+
+ MYLEX INTRODUCES LINUX OPERATING SYSTEM SUPPORT FOR ITS
+ BUSLOGIC FLASHPOINT LINE OF SCSI HOST ADAPTERS
+
+
+FREMONT, CA, -- October 8, 1996 -- Mylex Corporation has expanded Linux
+operating system support to its BusLogic brand of FlashPoint Ultra SCSI
+host adapters. All of BusLogic's other SCSI host adapters, including the
+MultiMaster line, currently support the Linux operating system. Linux
+drivers and information will be available on October 15th at
+http://www.dandelion.com/Linux/.
+
+"Mylex is committed to supporting the Linux community," says Peter Shambora,
+vice president of marketing for Mylex. "We have supported Linux driver
+development and provided technical support for our host adapters for several
+years, and are pleased to now make our FlashPoint products available to this
+user base."
+
+The Linux Operating System
+
+Linux is a freely-distributed implementation of UNIX for Intel x86, Sun
+SPARC, SGI MIPS, Motorola 68k, Digital Alpha AXP and Motorola PowerPC
+machines. It supports a wide range of software, including the X Window
+System, Emacs, and TCP/IP networking. Further information is available at
+http://www.linux.org and http://www.ssc.com/linux.
+
+FlashPoint Host Adapters
+
+The FlashPoint family of Ultra SCSI host adapters, designed for workstation
+and file server environments, are available in narrow, wide, dual channel,
+and dual channel wide versions. These adapters feature SeqEngine
+automation technology, which minimizes SCSI command overhead and reduces
+the number of interrupts generated to the CPU.
+
+About Mylex
+
+Mylex Corporation (NASDAQ/NM SYMBOL: MYLX), founded in 1983, is a leading
+producer of RAID technology and network management products. The company
+produces high performance disk array (RAID) controllers, and complementary
+computer products for network servers, mass storage systems, workstations
+and system boards. Through its wide range of RAID controllers and its
+BusLogic line of Ultra SCSI host adapter products, Mylex provides enabling
+intelligent I/O technologies that increase network management control,
+enhance CPU utilization, optimize I/O performance, and ensure data security
+and availability. Products are sold globally through a network of OEMs,
+major distributors, VARs, and system integrators. Mylex Corporation is
+headquartered at 34551 Ardenwood Blvd., Fremont, CA.
+
+ ####
+
+Contact:
+
+Peter Shambora
+Vice President of Marketing
+Mylex Corp.
+510/796-6100
+peters@mylex.com
+
+ ANNOUNCEMENT
+ BusLogic FlashPoint LT/BT-948 Upgrade Program
+ 1 February 1996
+
+ ADDITIONAL ANNOUNCEMENT
+ BusLogic FlashPoint LW/BT-958 Upgrade Program
+ 14 June 1996
+
+Ever since its introduction last October, the BusLogic FlashPoint LT has
+been problematic for members of the Linux community, in that no Linux
+drivers have been available for this new Ultra SCSI product. Despite it's
+officially being positioned as a desktop workstation product, and not being
+particularly well suited for a high performance multitasking operating
+system like Linux, the FlashPoint LT has been touted by computer system
+vendors as the latest thing, and has been sold even on many of their high
+end systems, to the exclusion of the older MultiMaster products. This has
+caused grief for many people who inadvertently purchased a system expecting
+that all BusLogic SCSI Host Adapters were supported by Linux, only to
+discover that the FlashPoint was not supported and would not be for quite
+some time, if ever.
+
+After this problem was identified, BusLogic contacted its major OEM
+customers to make sure the BT-946C/956C MultiMaster cards would still be
+made available, and that Linux users who mistakenly ordered systems with
+the FlashPoint would be able to upgrade to the BT-946C. While this helped
+many purchasers of new systems, it was only a partial solution to the
+overall problem of FlashPoint support for Linux users. It did nothing to
+assist the people who initially purchased a FlashPoint for a supported
+operating system and then later decided to run Linux, or those who had
+ended up with a FlashPoint LT, believing it was supported, and were unable
+to return it.
+
+In the middle of December, I asked to meet with BusLogic's senior
+management to discuss the issues related to Linux and free software support
+for the FlashPoint. Rumors of varying accuracy had been circulating
+publicly about BusLogic's attitude toward the Linux community, and I felt
+it was best that these issues be addressed directly. I sent an email
+message after 11pm one evening, and the meeting took place the next
+afternoon. Unfortunately, corporate wheels sometimes grind slowly,
+especially when a company is being acquired, and so it's taken until now
+before the details were completely determined and a public statement could
+be made.
+
+BusLogic is not prepared at this time to release the information necessary
+for third parties to write drivers for the FlashPoint. The only existing
+FlashPoint drivers have been written directly by BusLogic Engineering, and
+there is no FlashPoint documentation sufficiently detailed to allow outside
+developers to write a driver without substantial assistance. While there
+are people at BusLogic who would rather not release the details of the
+FlashPoint architecture at all, that debate has not yet been settled either
+way. In any event, even if documentation were available today it would
+take quite a while for a usable driver to be written, especially since I'm
+not convinced that the effort required would be worthwhile.
+
+However, BusLogic does remain committed to providing a high performance
+SCSI solution for the Linux community, and does not want to see anyone left
+unable to run Linux because they have a Flashpoint LT. Therefore, BusLogic
+has put in place a direct upgrade program to allow any Linux user worldwide
+to trade in their FlashPoint LT for the new BT-948 MultiMaster PCI Ultra
+SCSI Host Adapter. The BT-948 is the Ultra SCSI successor to the BT-946C
+and has all the best features of both the BT-946C and FlashPoint LT,
+including smart termination and a flash PROM for easy firmware updates, and
+is of course compatible with the present Linux driver. The price for this
+upgrade has been set at US $45 plus shipping and handling, and the upgrade
+program will be administered through BusLogic Technical Support, which can
+be reached by electronic mail at techsup@buslogic.com, by Voice at +1 408
+654-0760, or by FAX at +1 408 492-1542.
+
+As of 14 June 1996, the original BusLogic FlashPoint LT to BT-948 upgrade
+program has now been extended to encompass the FlashPoint LW Wide Ultra
+SCSI Host Adapter. Any Linux user worldwide may trade in their FlashPoint
+LW (BT-950) for a BT-958 MultiMaster PCI Ultra SCSI Host Adapter. The
+price for this upgrade has been set at US $65 plus shipping and handling.
+
+I was a beta test site for the BT-948/958, and versions 1.2.1 and 1.3.1 of
+my BusLogic driver already included latent support for the BT-948/958.
+Additional cosmetic support for the Ultra SCSI MultiMaster cards was added
+subsequent releases. As a result of this cooperative testing process,
+several firmware bugs were found and corrected. My heavily loaded Linux
+test system provided an ideal environment for testing error recovery
+processes that are much more rarely exercised in production systems, but
+are crucial to overall system stability. It was especially convenient
+being able to work directly with their firmware engineer in demonstrating
+the problems under control of the firmware debugging environment; things
+sure have come a long way since the last time I worked on firmware for an
+embedded system. I am presently working on some performance testing and
+expect to have some data to report in the not too distant future.
+
+BusLogic asked me to send this announcement since a large percentage of the
+questions regarding support for the FlashPoint have either been sent to me
+directly via email, or have appeared in the Linux newsgroups in which I
+participate. To summarize, BusLogic is offering Linux users an upgrade
+from the unsupported FlashPoint LT (BT-930) to the supported BT-948 for US
+$45 plus shipping and handling, or from the unsupported FlashPoint LW
+(BT-950) to the supported BT-958 for $65 plus shipping and handling.
+Contact BusLogic Technical Support at techsup@buslogic.com or +1 408
+654-0760 to take advantage of their offer.
+
+ Leonard N. Zubkoff
+ lnz@dandelion.com
diff --git a/Documentation/scsi/LICENSE.FlashPoint b/Documentation/scsi/LICENSE.FlashPoint
new file mode 100644
index 0000000..ffd0fe2
--- /dev/null
+++ b/Documentation/scsi/LICENSE.FlashPoint
@@ -0,0 +1,60 @@
+ FlashPoint Driver Developer's Kit
+ Version 1.0
+
+ Copyright 1995-1996 by Mylex Corporation
+ All Rights Reserved
+
+This program is free software; you may redistribute and/or modify it under
+the terms of either:
+
+ a) the GNU General Public License as published by the Free Software
+ Foundation; either version 2, or (at your option) any later version,
+
+ or
+
+ b) the "BSD-style License" included below.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY, without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See either the GNU General Public
+License or the BSD-style License below for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+675 Mass Ave, Cambridge, MA 02139, USA.
+
+The BSD-style License is as follows:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain this LICENSE.FlashPoint
+ file, without modification, this list of conditions, and the following
+ disclaimer. The following copyright notice must appear immediately at
+ the beginning of all source files:
+
+ Copyright 1995-1996 by Mylex Corporation. All Rights Reserved
+
+ This file is available under both the GNU General Public License
+ and a BSD-style copyright; see LICENSE.FlashPoint for details.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. The name of Mylex Corporation may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY MYLEX CORP. ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
diff --git a/Documentation/scsi/LICENSE.qla2xxx b/Documentation/scsi/LICENSE.qla2xxx
new file mode 100644
index 0000000..9e15b4f
--- /dev/null
+++ b/Documentation/scsi/LICENSE.qla2xxx
@@ -0,0 +1,45 @@
+Copyright (c) 2003-2005 QLogic Corporation
+QLogic Linux Fibre Channel HBA Driver
+
+This program includes a device driver for Linux 2.6 that may be
+distributed with QLogic hardware specific firmware binary file.
+You may modify and redistribute the device driver code under the
+GNU General Public License as published by the Free Software
+Foundation (version 2 or a later version).
+
+You may redistribute the hardware specific firmware binary file
+under the following terms:
+
+ 1. Redistribution of source code (only if applicable),
+ must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+
+ 2. Redistribution in binary form must reproduce the above
+ copyright notice, this list of conditions and the
+ following disclaimer in the documentation and/or other
+ materials provided with the distribution.
+
+ 3. The name of QLogic Corporation may not be used to
+ endorse or promote products derived from this software
+ without specific prior written permission
+
+REGARDLESS OF WHAT LICENSING MECHANISM IS USED OR APPLICABLE,
+THIS PROGRAM IS PROVIDED BY QLOGIC CORPORATION "AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+USER ACKNOWLEDGES AND AGREES THAT USE OF THIS PROGRAM WILL NOT
+CREATE OR GIVE GROUNDS FOR A LICENSE BY IMPLICATION, ESTOPPEL, OR
+OTHERWISE IN ANY INTELLECTUAL PROPERTY RIGHTS (PATENT, COPYRIGHT,
+TRADE SECRET, MASK WORK, OR OTHER PROPRIETARY RIGHT) EMBODIED IN
+ANY OTHER QLOGIC HARDWARE OR SOFTWARE EITHER SOLELY OR IN
+COMBINATION WITH THIS PROGRAM.
diff --git a/Documentation/scsi/Mylex.txt b/Documentation/scsi/Mylex.txt
new file mode 100644
index 0000000..cdf6929
--- /dev/null
+++ b/Documentation/scsi/Mylex.txt
@@ -0,0 +1,5 @@
+Please see the file README.BusLogic for information about Linux support for
+Mylex (formerly BusLogic) MultiMaster and FlashPoint SCSI Host Adapters.
+
+The Mylex DAC960 PCI RAID Controllers are now supported. Please consult
+http://www.dandelion.com/Linux/ for further information on the DAC960 driver.
diff --git a/Documentation/scsi/NinjaSCSI.txt b/Documentation/scsi/NinjaSCSI.txt
new file mode 100644
index 0000000..3229b64
--- /dev/null
+++ b/Documentation/scsi/NinjaSCSI.txt
@@ -0,0 +1,130 @@
+
+ WorkBiT NinjaSCSI-3/32Bi driver for Linux
+
+1. Comment
+ This is Workbit corp.'s(http://www.workbit.co.jp/) NinjaSCSI-3
+(http://www.workbit.co.jp/ts/z_nj3r.html) and NinjaSCSI-32Bi
+(http://www.workbit.co.jp/ts/z_njsc32bi.html) PCMCIA card driver module
+for Linux.
+
+2. My Linux environment
+Linux kernel: 2.4.7 / 2.2.19
+pcmcia-cs: 3.1.27
+gcc: gcc-2.95.4
+PC card: I-O data PCSC-F (NinjaSCSI-3)
+ I-O data CBSC-II in 16 bit mode (NinjaSCSI-32Bi)
+SCSI device: I-O data CDPS-PX24 (CD-ROM drive)
+ Media Intelligent MMO-640GT (Optical disk drive)
+
+3. Install
+[1] Check your PC card is true "NinjaSCSI-3" card.
+ If you installed pcmcia-cs already, pcmcia reports your card as UNKNOWN
+ card, and write ["WBT", "NinjaSCSI-3", "R1.0"] or some other string to
+ your console or log file.
+ You can also use "cardctl" program (this program is in pcmcia-cs source
+ code) to get more info.
+
+# cat /var/log/messages
+...
+Jan 2 03:45:06 lindberg cardmgr[78]: unsupported card in socket 1
+Jan 2 03:45:06 lindberg cardmgr[78]: product info: "WBT", "NinjaSCSI-3", "R1.0"
+...
+# cardctl ident
+Socket 0:
+ no product info available
+Socket 1:
+ product info: "IO DATA", "CBSC16 ", "1"
+
+
+[2] Get the Linux kernel source, and extract it to /usr/src.
+ Because the NinjaSCSI driver requires some SCSI header files in Linux
+ kernel source, I recommend rebuilding your kernel; this eliminates
+ some versioning problems.
+$ cd /usr/src
+$ tar -zxvf linux-x.x.x.tar.gz
+$ cd linux
+$ make config
+...
+
+[3] If you use this driver with Kernel 2.2, unpack pcmcia-cs in some directory
+ and make & install. This driver requires the pcmcia-cs header file.
+$ cd /usr/src
+$ tar zxvf cs-pcmcia-cs-3.x.x.tar.gz
+...
+
+[4] Extract this driver's archive somewhere, and edit Makefile, then do make.
+$ tar -zxvf nsp_cs-x.x.tar.gz
+$ cd nsp_cs-x.x
+$ emacs Makefile
+...
+$ make
+
+[5] Copy nsp_cs.ko to suitable place, like /lib/modules/<Kernel version>/pcmcia/ .
+
+[6] Add these lines to /etc/pcmcia/config .
+ If you use pcmcia-cs-3.1.8 or later, we can use "nsp_cs.conf" file.
+ So, you don't need to edit file. Just copy to /etc/pcmcia/ .
+
+-------------------------------------
+device "nsp_cs"
+ class "scsi" module "nsp_cs"
+
+card "WorkBit NinjaSCSI-3"
+ version "WBT", "NinjaSCSI-3", "R1.0"
+ bind "nsp_cs"
+
+card "WorkBit NinjaSCSI-32Bi (16bit)"
+ version "WORKBIT", "UltraNinja-16", "1"
+ bind "nsp_cs"
+
+# OEM
+card "WorkBit NinjaSCSI-32Bi (16bit) / IO-DATA"
+ version "IO DATA", "CBSC16 ", "1"
+ bind "nsp_cs"
+
+# OEM
+card "WorkBit NinjaSCSI-32Bi (16bit) / KME-1"
+ version "KME ", "SCSI-CARD-001", "1"
+ bind "nsp_cs"
+card "WorkBit NinjaSCSI-32Bi (16bit) / KME-2"
+ version "KME ", "SCSI-CARD-002", "1"
+ bind "nsp_cs"
+card "WorkBit NinjaSCSI-32Bi (16bit) / KME-3"
+ version "KME ", "SCSI-CARD-003", "1"
+ bind "nsp_cs"
+card "WorkBit NinjaSCSI-32Bi (16bit) / KME-4"
+ version "KME ", "SCSI-CARD-004", "1"
+ bind "nsp_cs"
+-------------------------------------
+
+[7] Start (or restart) pcmcia-cs.
+# /etc/rc.d/rc.pcmcia start (BSD style)
+or
+# /etc/init.d/pcmcia start (SYSV style)
+
+
+4. History
+See README.nin_cs .
+
+5. Caution
+ If you eject card when doing some operation for your SCSI device or suspend
+your computer, you encount some *BAD* error like disk crash.
+ It works good when I using this driver right way. But I'm not guarantee
+your data. Please backup your data when you use this driver.
+
+6. Known Bugs
+ In 2.4 kernel, you can't use 640MB Optical disk. This error comes from
+high level SCSI driver.
+
+7. Testing
+ Please send me some reports(bug reports etc..) of this software.
+When you send report, please tell me these or more.
+ card name
+ kernel version
+ your SCSI device name(hard drive, CD-ROM, etc...)
+
+8. Copyright
+ See GPL.
+
+
+2001/08/08 yokota@netlab.is.tsukuba.ac.jp <YOKOTA Hiroshi>
diff --git a/Documentation/scsi/aacraid.txt b/Documentation/scsi/aacraid.txt
new file mode 100644
index 0000000..ddace3a
--- /dev/null
+++ b/Documentation/scsi/aacraid.txt
@@ -0,0 +1,157 @@
+AACRAID Driver for Linux (take two)
+
+Introduction
+-------------------------
+The aacraid driver adds support for Adaptec (http://www.adaptec.com)
+RAID controllers. This is a major rewrite from the original
+Adaptec supplied driver. It has significantly cleaned up both the code
+and the running binary size (the module is less than half the size of
+the original).
+
+Supported Cards/Chipsets
+-------------------------
+ PCI ID (pci.ids) OEM Product
+ 9005:0285:9005:0285 Adaptec 2200S (Vulcan)
+ 9005:0285:9005:0286 Adaptec 2120S (Crusader)
+ 9005:0285:9005:0287 Adaptec 2200S (Vulcan-2m)
+ 9005:0285:9005:0288 Adaptec 3230S (Harrier)
+ 9005:0285:9005:0289 Adaptec 3240S (Tornado)
+ 9005:0285:9005:028a Adaptec 2020ZCR (Skyhawk)
+ 9005:0285:9005:028b Adaptec 2025ZCR (Terminator)
+ 9005:0286:9005:028c Adaptec 2230S (Lancer)
+ 9005:0286:9005:028c Adaptec 2230SLP (Lancer)
+ 9005:0286:9005:028d Adaptec 2130S (Lancer)
+ 9005:0285:9005:028e Adaptec 2020SA (Skyhawk)
+ 9005:0285:9005:028f Adaptec 2025SA (Terminator)
+ 9005:0285:9005:0290 Adaptec 2410SA (Jaguar)
+ 9005:0285:103c:3227 Adaptec 2610SA (Bearcat HP release)
+ 9005:0285:9005:0293 Adaptec 21610SA (Corsair-16)
+ 9005:0285:9005:0296 Adaptec 2240S (SabreExpress)
+ 9005:0285:9005:0292 Adaptec 2810SA (Corsair-8)
+ 9005:0285:9005:0297 Adaptec 4005 (AvonPark)
+ 9005:0285:9005:0298 Adaptec 4000 (BlackBird)
+ 9005:0285:9005:0299 Adaptec 4800SAS (Marauder-X)
+ 9005:0285:9005:029a Adaptec 4805SAS (Marauder-E)
+ 9005:0286:9005:029b Adaptec 2820SA (Intruder)
+ 9005:0286:9005:029c Adaptec 2620SA (Intruder)
+ 9005:0286:9005:029d Adaptec 2420SA (Intruder HP release)
+ 9005:0286:9005:02ac Adaptec 1800 (Typhoon44)
+ 9005:0285:9005:02b5 Adaptec 5445 (Voodoo44)
+ 9005:0285:15d9:02b5 SMC AOC-USAS-S4i
+ 9005:0285:9005:02b6 Adaptec 5805 (Voodoo80)
+ 9005:0285:15d9:02b6 SMC AOC-USAS-S8i
+ 9005:0285:9005:02b7 Adaptec 5085 (Voodoo08)
+ 9005:0285:9005:02bb Adaptec 3405 (Marauder40LP)
+ 9005:0285:9005:02bc Adaptec 3805 (Marauder80LP)
+ 9005:0285:9005:02c7 Adaptec 3085 (Marauder08ELP)
+ 9005:0285:9005:02bd Adaptec 31205 (Marauder120)
+ 9005:0285:9005:02be Adaptec 31605 (Marauder160)
+ 9005:0285:9005:02c3 Adaptec 51205 (Voodoo120)
+ 9005:0285:9005:02c4 Adaptec 51605 (Voodoo160)
+ 9005:0285:15d9:02c9 SMC AOC-USAS-S4iR
+ 9005:0285:15d9:02ca SMC AOC-USAS-S8iR
+ 9005:0285:9005:02ce Adaptec 51245 (Voodoo124)
+ 9005:0285:9005:02cf Adaptec 51645 (Voodoo164)
+ 9005:0285:9005:02d0 Adaptec 52445 (Voodoo244)
+ 9005:0285:9005:02d1 Adaptec 5405 (Voodoo40)
+ 9005:0285:15d9:02d2 SMC AOC-USAS-S8i-LP
+ 9005:0285:15d9:02d3 SMC AOC-USAS-S8iR-LP
+ 9005:0285:9005:02d4 Adaptec ASR-2045 (Voodoo04 Lite)
+ 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite)
+ 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite)
+ 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite)
+ 9005:0285:9005:02d8 Adaptec 5405G (Voodoo40 PM)
+ 9005:0285:9005:02d9 Adaptec 5445G (Voodoo44 PM)
+ 9005:0285:9005:02da Adaptec 5805G (Voodoo80 PM)
+ 9005:0285:9005:02db Adaptec 5085G (Voodoo08 PM)
+ 9005:0285:9005:02dc Adaptec 51245G (Voodoo124 PM)
+ 9005:0285:9005:02dd Adaptec 51645G (Voodoo164 PM)
+ 9005:0285:9005:02de Adaptec 52445G (Voodoo244 PM)
+ 9005:0285:9005:02df Adaptec ASR-2045G (Voodoo04 Lite PM)
+ 9005:0285:9005:02e0 Adaptec ASR-2405G (Voodoo40 Lite PM)
+ 9005:0285:9005:02e1 Adaptec ASR-2445G (Voodoo44 Lite PM)
+ 9005:0285:9005:02e2 Adaptec ASR-2805G (Voodoo80 Lite PM)
+ 1011:0046:9005:0364 Adaptec 5400S (Mustang)
+ 1011:0046:9005:0365 Adaptec 5400S (Mustang)
+ 9005:0287:9005:0800 Adaptec Themisto (Jupiter)
+ 9005:0200:9005:0200 Adaptec Themisto (Jupiter)
+ 9005:0286:9005:0800 Adaptec Callisto (Jupiter)
+ 1011:0046:9005:1364 Dell PERC 2/QC (Quad Channel, Mustang)
+ 1011:0046:9005:1365 Dell PERC 2/QC (Quad Channel, Mustang)
+ 1028:0001:1028:0001 Dell PERC 2/Si (Iguana)
+ 1028:0003:1028:0003 Dell PERC 3/Si (SlimFast)
+ 1028:0002:1028:0002 Dell PERC 3/Di (Opal)
+ 1028:0004:1028:0004 Dell PERC 3/SiF (Iguana)
+ 1028:0004:1028:00d0 Dell PERC 3/DiF (Iguana)
+ 1028:0002:1028:00d1 Dell PERC 3/DiV (Viper)
+ 1028:0002:1028:00d9 Dell PERC 3/DiL (Lexus)
+ 1028:000a:1028:0106 Dell PERC 3/DiJ (Jaguar)
+ 1028:000a:1028:011b Dell PERC 3/DiD (Dagger)
+ 1028:000a:1028:0121 Dell PERC 3/DiB (Boxster)
+ 9005:0285:1028:0287 Dell PERC 320/DC (Vulcan)
+ 9005:0285:1028:0291 Dell CERC 2 (DellCorsair)
+ 1011:0046:103c:10c2 HP NetRAID-4M (Mustang)
+ 9005:0285:17aa:0286 Legend S220 (Crusader)
+ 9005:0285:17aa:0287 Legend S230 (Vulcan)
+ 9005:0285:9005:0290 IBM ServeRAID 7t (Jaguar)
+ 9005:0285:1014:02F2 IBM ServeRAID 8i (AvonPark)
+ 9005:0286:1014:9540 IBM ServeRAID 8k/8k-l4 (AuroraLite)
+ 9005:0286:1014:9580 IBM ServeRAID 8k/8k-l8 (Aurora)
+ 9005:0285:1014:034d IBM ServeRAID 8s (Marauder-E)
+ 9005:0286:9005:029e ICP ICP9024RO (Lancer)
+ 9005:0286:9005:029f ICP ICP9014RO (Lancer)
+ 9005:0286:9005:02a0 ICP ICP9047MA (Lancer)
+ 9005:0286:9005:02a1 ICP ICP9087MA (Lancer)
+ 9005:0285:9005:02a4 ICP ICP9085LI (Marauder-X)
+ 9005:0285:9005:02a5 ICP ICP5085BR (Marauder-E)
+ 9005:0286:9005:02a6 ICP ICP9067MA (Intruder-6)
+ 9005:0285:9005:02b2 ICP (Voodoo 8 internal 8 external)
+ 9005:0285:9005:02b8 ICP ICP5445SL (Voodoo44)
+ 9005:0285:9005:02b9 ICP ICP5085SL (Voodoo80)
+ 9005:0285:9005:02ba ICP ICP5805SL (Voodoo08)
+ 9005:0285:9005:02bf ICP ICP5045BL (Marauder40LP)
+ 9005:0285:9005:02c0 ICP ICP5085BL (Marauder80LP)
+ 9005:0285:9005:02c8 ICP ICP5805BL (Marauder08ELP)
+ 9005:0285:9005:02c1 ICP ICP5125BR (Marauder120)
+ 9005:0285:9005:02c2 ICP ICP5165BR (Marauder160)
+ 9005:0285:9005:02c5 ICP ICP5125SL (Voodoo120)
+ 9005:0285:9005:02c6 ICP ICP5165SL (Voodoo160)
+ 9005:0286:9005:02ab (Typhoon40)
+ 9005:0286:9005:02ad (Aurora ARK)
+ 9005:0286:9005:02ae (Aurora Lite ARK)
+ 9005:0285:9005:02b0 (Sunrise Lake ARK)
+ 9005:0285:9005:02b1 Adaptec (Voodoo 8 internal 8 external)
+ 9005:0285:108e:7aac SUN STK RAID REM (Voodoo44 Coyote)
+ 9005:0285:108e:0286 SUN STK RAID INT (Cougar)
+ 9005:0285:108e:0287 SUN STK RAID EXT (Prometheus)
+ 9005:0285:108e:7aae SUN STK RAID EM (Narvi)
+
+People
+-------------------------
+Alan Cox <alan@lxorguk.ukuu.org.uk>
+Christoph Hellwig <hch@infradead.org> (updates for new-style PCI probing and SCSI host registration,
+ small cleanups/fixes)
+Matt Domsch <matt_domsch@dell.com> (revision ioctl, adapter messages)
+Deanna Bonds (non-DASD support, PAE fibs and 64 bit, added new adaptec controllers
+ added new ioctls, changed scsi interface to use new error handler,
+ increased the number of fibs and outstanding commands to a container)
+
+ (fixed 64bit and 64G memory model, changed confusing naming convention
+ where fibs that go to the hardware are consistently called hw_fibs and
+ not just fibs like the name of the driver tracking structure)
+Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations.
+
+Original Driver
+-------------------------
+Adaptec Unix OEM Product Group
+
+Mailing List
+-------------------------
+linux-scsi@vger.kernel.org (Interested parties troll here)
+Also note this is very different to Brian's original driver
+so don't expect him to support it.
+Adaptec does support this driver. Contact Adaptec tech support or
+aacraid@adaptec.com
+
+Original by Brian Boerner February 2001
+Rewritten by Alan Cox, November 2001
diff --git a/Documentation/scsi/advansys.txt b/Documentation/scsi/advansys.txt
new file mode 100644
index 0000000..4a3db62
--- /dev/null
+++ b/Documentation/scsi/advansys.txt
@@ -0,0 +1,243 @@
+AdvanSys (Advanced System Products, Inc.) manufactures the following
+RISC-based, Bus-Mastering, Fast (10 Mhz) and Ultra (20 Mhz) Narrow
+(8-bit transfer) SCSI Host Adapters for the ISA, EISA, VL, and PCI
+buses and RISC-based, Bus-Mastering, Ultra (20 Mhz) Wide (16-bit
+transfer) SCSI Host Adapters for the PCI bus.
+
+The CDB counts below indicate the number of SCSI CDB (Command
+Descriptor Block) requests that can be stored in the RISC chip
+cache and board LRAM. A CDB is a single SCSI command. The driver
+detect routine will display the number of CDBs available for each
+adapter detected. The number of CDBs used by the driver can be
+lowered in the BIOS by changing the 'Host Queue Size' adapter setting.
+
+Laptop Products:
+ ABP-480 - Bus-Master CardBus (16 CDB)
+
+Connectivity Products:
+ ABP510/5150 - Bus-Master ISA (240 CDB)
+ ABP5140 - Bus-Master ISA PnP (16 CDB)
+ ABP5142 - Bus-Master ISA PnP with floppy (16 CDB)
+ ABP902/3902 - Bus-Master PCI (16 CDB)
+ ABP3905 - Bus-Master PCI (16 CDB)
+ ABP915 - Bus-Master PCI (16 CDB)
+ ABP920 - Bus-Master PCI (16 CDB)
+ ABP3922 - Bus-Master PCI (16 CDB)
+ ABP3925 - Bus-Master PCI (16 CDB)
+ ABP930 - Bus-Master PCI (16 CDB)
+ ABP930U - Bus-Master PCI Ultra (16 CDB)
+ ABP930UA - Bus-Master PCI Ultra (16 CDB)
+ ABP960 - Bus-Master PCI MAC/PC (16 CDB)
+ ABP960U - Bus-Master PCI MAC/PC Ultra (16 CDB)
+
+Single Channel Products:
+ ABP542 - Bus-Master ISA with floppy (240 CDB)
+ ABP742 - Bus-Master EISA (240 CDB)
+ ABP842 - Bus-Master VL (240 CDB)
+ ABP940 - Bus-Master PCI (240 CDB)
+ ABP940U - Bus-Master PCI Ultra (240 CDB)
+ ABP940UA/3940UA - Bus-Master PCI Ultra (240 CDB)
+ ABP970 - Bus-Master PCI MAC/PC (240 CDB)
+ ABP970U - Bus-Master PCI MAC/PC Ultra (240 CDB)
+ ABP3960UA - Bus-Master PCI MAC/PC Ultra (240 CDB)
+ ABP940UW/3940UW - Bus-Master PCI Ultra-Wide (253 CDB)
+ ABP970UW - Bus-Master PCI MAC/PC Ultra-Wide (253 CDB)
+ ABP3940U2W - Bus-Master PCI LVD/Ultra2-Wide (253 CDB)
+
+Multi-Channel Products:
+ ABP752 - Dual Channel Bus-Master EISA (240 CDB Per Channel)
+ ABP852 - Dual Channel Bus-Master VL (240 CDB Per Channel)
+ ABP950 - Dual Channel Bus-Master PCI (240 CDB Per Channel)
+ ABP950UW - Dual Channel Bus-Master PCI Ultra-Wide (253 CDB Per Channel)
+ ABP980 - Four Channel Bus-Master PCI (240 CDB Per Channel)
+ ABP980U - Four Channel Bus-Master PCI Ultra (240 CDB Per Channel)
+ ABP980UA/3980UA - Four Channel Bus-Master PCI Ultra (16 CDB Per Chan.)
+ ABP3950U2W - Bus-Master PCI LVD/Ultra2-Wide and Ultra-Wide (253 CDB)
+ ABP3950U3W - Bus-Master PCI Dual LVD2/Ultra3-Wide (253 CDB)
+
+Driver Compile Time Options and Debugging
+
+The following constants can be defined in the source file.
+
+1. ADVANSYS_ASSERT - Enable driver assertions (Def: Enabled)
+
+ Enabling this option adds assertion logic statements to the
+ driver. If an assertion fails a message will be displayed to
+ the console, but the system will continue to operate. Any
+ assertions encountered should be reported to the person
+ responsible for the driver. Assertion statements may proactively
+ detect problems with the driver and facilitate fixing these
+ problems. Enabling assertions will add a small overhead to the
+ execution of the driver.
+
+2. ADVANSYS_DEBUG - Enable driver debugging (Def: Disabled)
+
+ Enabling this option adds tracing functions to the driver and the
+ ability to set a driver tracing level at boot time. This option is
+ very useful for debugging the driver, but it will add to the size
+ of the driver execution image and add overhead to the execution of
+ the driver.
+
+ The amount of debugging output can be controlled with the global
+ variable 'asc_dbglvl'. The higher the number the more output. By
+ default the debug level is 0.
+
+ If the driver is loaded at boot time and the LILO Driver Option
+ is included in the system, the debug level can be changed by
+ specifying a 5th (ASC_NUM_IOPORT_PROBE + 1) I/O Port. The
+ first three hex digits of the pseudo I/O Port must be set to
+ 'deb' and the fourth hex digit specifies the debug level: 0 - F.
+ The following command line will look for an adapter at 0x330
+ and set the debug level to 2.
+
+ linux advansys=0x330,0,0,0,0xdeb2
+
+ If the driver is built as a loadable module this variable can be
+ defined when the driver is loaded. The following insmod command
+ will set the debug level to one.
+
+ insmod advansys.o asc_dbglvl=1
+
+ Debugging Message Levels:
+ 0: Errors Only
+ 1: High-Level Tracing
+ 2-N: Verbose Tracing
+
+ To enable debug output to console, please make sure that:
+
+ a. System and kernel logging is enabled (syslogd, klogd running).
+ b. Kernel messages are routed to console output. Check
+ /etc/syslog.conf for an entry similar to this:
+
+ kern.* /dev/console
+
+ c. klogd is started with the appropriate -c parameter
+ (e.g. klogd -c 8)
+
+ This will cause printk() messages to be be displayed on the
+ current console. Refer to the klogd(8) and syslogd(8) man pages
+ for details.
+
+ Alternatively you can enable printk() to console with this
+ program. However, this is not the 'official' way to do this.
+ Debug output is logged in /var/log/messages.
+
+ main()
+ {
+ syscall(103, 7, 0, 0);
+ }
+
+ Increasing LOG_BUF_LEN in kernel/printk.c to something like
+ 40960 allows more debug messages to be buffered in the kernel
+ and written to the console or log file.
+
+3. ADVANSYS_STATS - Enable statistics (Def: Enabled)
+
+ Enabling this option adds statistics collection and display
+ through /proc to the driver. The information is useful for
+ monitoring driver and device performance. It will add to the
+ size of the driver execution image and add minor overhead to
+ the execution of the driver.
+
+ Statistics are maintained on a per adapter basis. Driver entry
+ point call counts and transfer size counts are maintained.
+ Statistics are only available for kernels greater than or equal
+ to v1.3.0 with the CONFIG_PROC_FS (/proc) file system configured.
+
+ AdvanSys SCSI adapter files have the following path name format:
+
+ /proc/scsi/advansys/{0,1,2,3,...}
+
+ This information can be displayed with cat. For example:
+
+ cat /proc/scsi/advansys/0
+
+ When ADVANSYS_STATS is not defined the AdvanSys /proc files only
+ contain adapter and device configuration information.
+
+Driver LILO Option
+
+If init/main.c is modified as described in the 'Directions for Adding
+the AdvanSys Driver to Linux' section (B.4.) above, the driver will
+recognize the 'advansys' LILO command line and /etc/lilo.conf option.
+This option can be used to either disable I/O port scanning or to limit
+scanning to 1 - 4 I/O ports. Regardless of the option setting EISA and
+PCI boards will still be searched for and detected. This option only
+affects searching for ISA and VL boards.
+
+Examples:
+ 1. Eliminate I/O port scanning:
+ boot: linux advansys=
+ or
+ boot: linux advansys=0x0
+ 2. Limit I/O port scanning to one I/O port:
+ boot: linux advansys=0x110
+ 3. Limit I/O port scanning to four I/O ports:
+ boot: linux advansys=0x110,0x210,0x230,0x330
+
+For a loadable module the same effect can be achieved by setting
+the 'asc_iopflag' variable and 'asc_ioport' array when loading
+the driver, e.g.
+
+ insmod advansys.o asc_iopflag=1 asc_ioport=0x110,0x330
+
+If ADVANSYS_DEBUG is defined a 5th (ASC_NUM_IOPORT_PROBE + 1)
+I/O Port may be added to specify the driver debug level. Refer to
+the 'Driver Compile Time Options and Debugging' section above for
+more information.
+
+Credits (Chronological Order)
+
+Bob Frey <bfrey@turbolinux.com.cn> wrote the AdvanSys SCSI driver
+and maintained it up to 3.3F. He continues to answer questions
+and help maintain the driver.
+
+Nathan Hartwell <mage@cdc3.cdc.net> provided the directions and
+basis for the Linux v1.3.X changes which were included in the
+1.2 release.
+
+Thomas E Zerucha <zerucha@shell.portal.com> pointed out a bug
+in advansys_biosparam() which was fixed in the 1.3 release.
+
+Erik Ratcliffe <erik@caldera.com> has done testing of the
+AdvanSys driver in the Caldera releases.
+
+Rik van Riel <H.H.vanRiel@fys.ruu.nl> provided a patch to
+AscWaitTixISRDone() which he found necessary to make the
+driver work with a SCSI-1 disk.
+
+Mark Moran <mmoran@mmoran.com> has helped test Ultra-Wide
+support in the 3.1A driver.
+
+Doug Gilbert <dgilbert@interlog.com> has made changes and
+suggestions to improve the driver and done a lot of testing.
+
+Ken Mort <ken@mort.net> reported a DEBUG compile bug fixed
+in 3.2K.
+
+Tom Rini <trini@kernel.crashing.org> provided the CONFIG_ISA
+patch and helped with PowerPC wide and narrow board support.
+
+Philip Blundell <philb@gnu.org> provided an
+advansys_interrupts_enabled patch.
+
+Dave Jones <dave@denial.force9.co.uk> reported the compiler
+warnings generated when CONFIG_PROC_FS was not defined in
+the 3.2M driver.
+
+Jerry Quinn <jlquinn@us.ibm.com> fixed PowerPC support (endian
+problems) for wide cards.
+
+Bryan Henderson <bryanh@giraffe-data.com> helped debug narrow
+card error handling.
+
+Manuel Veloso <veloso@pobox.com> worked hard on PowerPC narrow
+board support and fixed a bug in AscGetEEPConfig().
+
+Arnaldo Carvalho de Melo <acme@conectiva.com.br> made
+save_flags/restore_flags changes.
+
+Andy Kellner <AKellner@connectcom.net> continued the Advansys SCSI
+driver development for ConnectCom (Version > 3.3F).
+
+Ken Witherow for extensive testing during the development of version 3.4.
diff --git a/Documentation/scsi/aha152x.txt b/Documentation/scsi/aha152x.txt
new file mode 100644
index 0000000..29ce6d8
--- /dev/null
+++ b/Documentation/scsi/aha152x.txt
@@ -0,0 +1,183 @@
+$Id: README.aha152x,v 1.2 1999/12/25 15:32:30 fischer Exp fischer $
+Adaptec AHA-1520/1522 SCSI driver for Linux (aha152x)
+
+Copyright 1993-1999 Jürgen Fischer <fischer@norbit.de>
+TC1550 patches by Luuk van Dijk (ldz@xs4all.nl)
+
+
+In Revision 2 the driver was modified a lot (especially the
+bottom-half handler complete()).
+
+The driver is much cleaner now, has support for the new
+error handling code in 2.3, produced less cpu load (much
+less polling loops), has slightly higher throughput (at
+least on my ancient test box; a i486/33Mhz/20MB).
+
+
+CONFIGURATION ARGUMENTS:
+
+IOPORT base io address (0x340/0x140)
+IRQ interrupt level (9-12; default 11)
+SCSI_ID scsi id of controller (0-7; default 7)
+RECONNECT allow targets to disconnect from the bus (0/1; default 1 [on])
+PARITY enable parity checking (0/1; default 1 [on])
+SYNCHRONOUS enable synchronous transfers (0/1; default 1 [on])
+DELAY: bus reset delay (default 100)
+EXT_TRANS: enable extended translation (0/1: default 0 [off])
+ (see NOTES)
+
+COMPILE TIME CONFIGURATION (go into AHA152X in drivers/scsi/Makefile):
+
+-DAUTOCONF
+ use configuration the controller reports (AHA-152x only)
+
+-DSKIP_BIOSTEST
+ Don't test for BIOS signature (AHA-1510 or disabled BIOS)
+
+-DSETUP0="{ IOPORT, IRQ, SCSI_ID, RECONNECT, PARITY, SYNCHRONOUS, DELAY, EXT_TRANS }"
+ override for the first controller
+
+-DSETUP1="{ IOPORT, IRQ, SCSI_ID, RECONNECT, PARITY, SYNCHRONOUS, DELAY, EXT_TRANS }"
+ override for the second controller
+
+-DAHA152X_DEBUG
+ enable debugging output
+
+-DAHA152X_STAT
+ enable some statistics
+
+
+LILO COMMAND LINE OPTIONS:
+
+aha152x=<IOPORT>[,<IRQ>[,<SCSI-ID>[,<RECONNECT>[,<PARITY>[,<SYNCHRONOUS>[,<DELAY> [,<EXT_TRANS]]]]]]]
+
+ The normal configuration can be overridden by specifying a command line.
+ When you do this, the BIOS test is skipped. Entered values have to be
+ valid (known). Don't use values that aren't supported under normal
+ operation. If you think that you need other values: contact me.
+ For two controllers use the aha152x statement twice.
+
+
+SYMBOLS FOR MODULE CONFIGURATION:
+
+Choose from 2 alternatives:
+
+1. specify everything (old)
+
+aha152x=IOPORT,IRQ,SCSI_ID,RECONNECT,PARITY,SYNCHRONOUS,DELAY,EXT_TRANS
+ configuration override for first controller
+
+
+aha152x1=IOPORT,IRQ,SCSI_ID,RECONNECT,PARITY,SYNCHRONOUS,DELAY,EXT_TRANS
+ configuration override for second controller
+
+2. specify only what you need to (irq or io is required; new)
+
+io=IOPORT0[,IOPORT1]
+ IOPORT for first and second controller
+
+irq=IRQ0[,IRQ1]
+ IRQ for first and second controller
+
+scsiid=SCSIID0[,SCSIID1]
+ SCSIID for first and second controller
+
+reconnect=RECONNECT0[,RECONNECT1]
+ allow targets to disconnect for first and second controller
+
+parity=PAR0[PAR1]
+ use parity for first and second controller
+
+sync=SYNCHRONOUS0[,SYNCHRONOUS1]
+ enable synchronous transfers for first and second controller
+
+delay=DELAY0[,DELAY1]
+ reset DELAY for first and second controller
+
+exttrans=EXTTRANS0[,EXTTRANS1]
+ enable extended translation for first and second controller
+
+
+If you use both alternatives the first will be taken.
+
+
+NOTES ON EXT_TRANS:
+
+SCSI uses block numbers to address blocks/sectors on a device.
+The BIOS uses a cylinder/head/sector addressing scheme (C/H/S)
+scheme instead. DOS expects a BIOS or driver that understands this
+C/H/S addressing.
+
+The number of cylinders/heads/sectors is called geometry and is required
+as base for requests in C/H/S addressing. SCSI only knows about the
+total capacity of disks in blocks (sectors).
+
+Therefore the SCSI BIOS/DOS driver has to calculate a logical/virtual
+geometry just to be able to support that addressing scheme. The geometry
+returned by the SCSI BIOS is a pure calculation and has nothing to
+do with the real/physical geometry of the disk (which is usually
+irrelevant anyway).
+
+Basically this has no impact at all on Linux, because it also uses block
+instead of C/H/S addressing. Unfortunately C/H/S addressing is also used
+in the partition table and therefore every operating system has to know
+the right geometry to be able to interpret it.
+
+Moreover there are certain limitations to the C/H/S addressing scheme,
+namely the address space is limited to upto 255 heads, upto 63 sectors
+and a maximum of 1023 cylinders.
+
+The AHA-1522 BIOS calculates the geometry by fixing the number of heads
+to 64, the number of sectors to 32 and by calculating the number of
+cylinders by dividing the capacity reported by the disk by 64*32 (1 MB).
+This is considered to be the default translation.
+
+With respect to the limit of 1023 cylinders using C/H/S you can only
+address the first GB of your disk in the partition table. Therefore
+BIOSes of some newer controllers based on the AIC-6260/6360 support
+extended translation. This means that the BIOS uses 255 for heads,
+63 for sectors and then divides the capacity of the disk by 255*63
+(about 8 MB), as soon it sees a disk greater than 1 GB. That results
+in a maximum of about 8 GB addressable diskspace in the partition table
+(but there are already bigger disks out there today).
+
+To make it even more complicated the translation mode might/might
+not be configurable in certain BIOS setups.
+
+This driver does some more or less failsafe guessing to get the
+geometry right in most cases:
+
+- for disks<1GB: use default translation (C/32/64)
+
+- for disks>1GB:
+ - take current geometry from the partition table
+ (using scsicam_bios_param and accept only `valid' geometries,
+ ie. either (C/32/64) or (C/63/255)). This can be extended translation
+ even if it's not enabled in the driver.
+
+ - if that fails, take extended translation if enabled by override,
+ kernel or module parameter, otherwise take default translation and
+ ask the user for verification. This might on not yet partitioned
+ disks.
+
+
+REFERENCES USED:
+
+ "AIC-6260 SCSI Chip Specification", Adaptec Corporation.
+
+ "SCSI COMPUTER SYSTEM INTERFACE - 2 (SCSI-2)", X3T9.2/86-109 rev. 10h
+
+ "Writing a SCSI device driver for Linux", Rik Faith (faith@cs.unc.edu)
+
+ "Kernel Hacker's Guide", Michael K. Johnson (johnsonm@sunsite.unc.edu)
+
+ "Adaptec 1520/1522 User's Guide", Adaptec Corporation.
+
+ Michael K. Johnson (johnsonm@sunsite.unc.edu)
+
+ Drew Eckhardt (drew@cs.colorado.edu)
+
+ Eric Youngdale (eric@andante.org)
+
+ special thanks to Eric Youngdale for the free(!) supplying the
+ documentation on the chip.
diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt
new file mode 100644
index 0000000..683ccae
--- /dev/null
+++ b/Documentation/scsi/aic79xx.txt
@@ -0,0 +1,497 @@
+====================================================================
+= Adaptec Ultra320 Family Manager Set =
+= =
+= README for =
+= The Linux Operating System =
+====================================================================
+
+The following information is available in this file:
+
+ 1. Supported Hardware
+ 2. Version History
+ 3. Command Line Options
+ 4. Additional Notes
+ 5. Contacting Adaptec
+
+
+1. Supported Hardware
+
+ The following Adaptec SCSI Host Adapters are supported by this
+ driver set.
+
+ Ultra320 ASIC Description
+ ----------------------------------------------------------------
+ AIC-7901A Single Channel 64-bit PCI-X 133MHz to
+ Ultra320 SCSI ASIC
+ AIC-7901B Single Channel 64-bit PCI-X 133MHz to
+ Ultra320 SCSI ASIC with Retained Training
+ AIC-7902A4 Dual Channel 64-bit PCI-X 133MHz to
+ Ultra320 SCSI ASIC
+ AIC-7902B Dual Channel 64-bit PCI-X 133MHz to
+ Ultra320 SCSI ASIC with Retained Training
+
+ Ultra320 Adapters Description ASIC
+ --------------------------------------------------------------------------
+ Adaptec SCSI Card 39320 Dual Channel 64-bit PCI-X 133MHz to 7902A4/7902B
+ Ultra320 SCSI Card (one external
+ 68-pin, two internal 68-pin)
+ Adaptec SCSI Card 39320A Dual Channel 64-bit PCI-X 133MHz to 7902B
+ Ultra320 SCSI Card (one external
+ 68-pin, two internal 68-pin)
+ Adaptec SCSI Card 39320D Dual Channel 64-bit PCI-X 133MHz to 7902A4
+ Ultra320 SCSI Card (two external VHDC
+ and one internal 68-pin)
+ Adaptec SCSI Card 39320D Dual Channel 64-bit PCI-X 133MHz to 7902A4
+ Ultra320 SCSI Card (two external VHDC
+ and one internal 68-pin) based on the
+ AIC-7902B ASIC
+ Adaptec SCSI Card 29320 Single Channel 64-bit PCI-X 133MHz to 7901A
+ Ultra320 SCSI Card (one external
+ 68-pin, two internal 68-pin, one
+ internal 50-pin)
+ Adaptec SCSI Card 29320A Single Channel 64-bit PCI-X 133MHz to 7901B
+ Ultra320 SCSI Card (one external
+ 68-pin, two internal 68-pin, one
+ internal 50-pin)
+ Adaptec SCSI Card 29320LP Single Channel 64-bit Low Profile 7901A
+ PCI-X 133MHz to Ultra320 SCSI Card
+ (One external VHDC, one internal
+ 68-pin)
+ Adaptec SCSI Card 29320ALP Single Channel 64-bit Low Profile 7901B
+ PCI-X 133MHz to Ultra320 SCSI Card
+ (One external VHDC, one internal
+ 68-pin)
+2. Version History
+
+ 3.0 (December 1st, 2005)
+ - Updated driver to use SCSI transport class infrastructure
+ - Upported sequencer and core fixes from adaptec released
+ version 2.0.15 of the driver.
+
+ 1.3.11 (July 11, 2003)
+ - Fix several deadlock issues.
+ - Add 29320ALP and 39320B Id's.
+
+ 1.3.10 (June 3rd, 2003)
+ - Align the SCB_TAG field on a 16byte boundary. This avoids
+ SCB corruption on some PCI-33 busses.
+ - Correct non-zero luns on Rev B. hardware.
+ - Update for change in 2.5.X SCSI proc FS interface.
+ - When negotiation async via an 8bit WDTR message, send
+ an SDTR with an offset of 0 to be sure the target
+ knows we are async. This works around a firmware defect
+ in the Quantum Atlas 10K.
+ - Implement controller suspend and resume.
+ - Clear PCI error state during driver attach so that we
+ don't disable memory mapped I/O due to a stray write
+ by some other driver probe that occurred before we
+ claimed the controller.
+
+ 1.3.9 (May 22nd, 2003)
+ - Fix compiler errors.
+ - Remove S/G splitting for segments that cross a 4GB boundary.
+ This is guaranteed not to happen in Linux.
+ - Add support for scsi_report_device_reset() found in
+ 2.5.X kernels.
+ - Add 7901B support.
+ - Simplify handling of the packetized lun Rev A workaround.
+ - Correct and simplify handling of the ignore wide residue
+ message. The previous code would fail to report a residual
+ if the transaction data length was even and we received
+ an IWR message.
+
+ 1.3.8 (April 29th, 2003)
+ - Fix types accessed via the command line interface code.
+ - Perform a few firmware optimizations.
+ - Fix "Unexpected PKT busfree" errors.
+ - Use a sequencer interrupt to notify the host of
+ commands with bad status. We defer the notification
+ until there are no outstanding selections to ensure
+ that the host is interrupted for as short a time as
+ possible.
+ - Remove pre-2.2.X support.
+ - Add support for new 2.5.X interrupt API.
+ - Correct big-endian architecture support.
+
+ 1.3.7 (April 16th, 2003)
+ - Use del_timer_sync() to ensure that no timeouts
+ are pending during controller shutdown.
+ - For pre-2.5.X kernels, carefully adjust our segment
+ list size to avoid SCSI malloc pool fragmentation.
+ - Cleanup channel display in our /proc output.
+ - Workaround duplicate device entries in the mid-layer
+ device list during add-single-device.
+
+ 1.3.6 (March 28th, 2003)
+ - Correct a double free in the Domain Validation code.
+ - Correct a reference to free'ed memory during controller
+ shutdown.
+ - Reset the bus on an SE->LVD change. This is required
+ to reset our transceivers.
+
+ 1.3.5 (March 24th, 2003)
+ - Fix a few register window mode bugs.
+ - Include read streaming in the PPR flags we display in
+ diagnostics as well as /proc.
+ - Add PCI hot plug support for 2.5.X kernels.
+ - Correct default precompensation value for RevA hardware.
+ - Fix Domain Validation thread shutdown.
+ - Add a firmware workaround to make the LED blink
+ brighter during packetized operations on the H2A4.
+ - Correct /proc display of user read streaming settings.
+ - Simplify driver locking by releasing the io_request_lock
+ upon driver entry from the mid-layer.
+ - Cleanup command line parsing and move much of this code
+ to aiclib.
+
+ 1.3.4 (February 28th, 2003)
+ - Correct a race condition in our error recovery handler.
+ - Allow Test Unit Ready commands to take a full 5 seconds
+ during Domain Validation.
+
+ 1.3.2 (February 19th, 2003)
+ - Correct a Rev B. regression due to the GEM318
+ compatibility fix included in 1.3.1.
+
+ 1.3.1 (February 11th, 2003)
+ - Add support for the 39320A.
+ - Improve recovery for certain PCI-X errors.
+ - Fix handling of LQ/DATA/LQ/DATA for the
+ same write transaction that can occur without
+ interveining training.
+ - Correct compatibility issues with the GEM318
+ enclosure services device.
+ - Correct data corruption issue that occurred under
+ high tag depth write loads.
+ - Adapt to a change in the 2.5.X daemonize() API.
+ - Correct a "Missing case in ahd_handle_scsiint" panic.
+
+ 1.3.0 (January 21st, 2003)
+ - Full regression testing for all U320 products completed.
+ - Added abort and target/lun reset error recovery handler and
+ interrupt coalescing.
+
+ 1.2.0 (November 14th, 2002)
+ - Added support for Domain Validation
+ - Add support for the Hewlett-Packard version of the 39320D
+ and AIC-7902 adapters.
+ Support for previous adapters has not been fully tested and should
+ only be used at the customer's own risk.
+
+ 1.1.1 (September 24th, 2002)
+ - Added support for the Linux 2.5.X kernel series
+
+ 1.1.0 (September 17th, 2002)
+ - Added support for four additional SCSI products:
+ ASC-39320, ASC-29320, ASC-29320LP, AIC-7901.
+
+ 1.0.0 (May 30th, 2002)
+ - Initial driver release.
+
+ 2.1. Software/Hardware Features
+ - Support for the SPI-4 "Ultra320" standard:
+ - 320MB/s transfer rates
+ - Packetized SCSI Protocol at 160MB/s and 320MB/s
+ - Quick Arbitration Selection (QAS)
+ - Retained Training Information (Rev B. ASIC only)
+ - Interrupt Coalessing
+ - Initiator Mode (target mode not currently
+ supported)
+ - Support for the PCI-X standard up to 133MHz
+ - Support for the PCI v2.2 standard
+ - Domain Validation
+
+ 2.2. Operating System Support:
+ - Redhat Linux 7.2, 7.3, 8.0, Advanced Server 2.1
+ - SuSE Linux 7.3, 8.0, 8.1, Enterprise Server 7
+ - only Intel and AMD x86 supported at this time
+ - >4GB memory configurations supported.
+
+ Refer to the User's Guide for more details on this.
+
+3. Command Line Options
+
+ WARNING: ALTERING OR ADDING THESE DRIVER PARAMETERS
+ INCORRECTLY CAN RENDER YOUR SYSTEM INOPERABLE.
+ USE THEM WITH CAUTION.
+
+ Edit the file "modprobe.conf" in the directory /etc and add/edit a
+ line containing 'options aic79xx aic79xx=[command[,command...]]' where
+ 'command' is one or more of the following:
+ -----------------------------------------------------------------
+ Option: verbose
+ Definition: enable additional informative messages during
+ driver operation.
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: debug:[value]
+ Definition: Enables various levels of debugging information
+ The bit definitions for the debugging mask can
+ be found in drivers/scsi/aic7xxx/aic79xx.h under
+ the "Debug" heading.
+ Possible Values: 0x0000 = no debugging, 0xffff = full debugging
+ Default Value: 0x0000
+ -----------------------------------------------------------------
+ Option: no_reset
+ Definition: Do not reset the bus during the initial probe
+ phase
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: extended
+ Definition: Force extended translation on the controller
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: periodic_otag
+ Definition: Send an ordered tag periodically to prevent
+ tag starvation. Needed for some older devices
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: reverse_scan
+ Definition: Probe the scsi bus in reverse order, starting
+ with target 15
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: global_tag_depth
+ Definition: Global tag depth for all targets on all busses.
+ This option sets the default tag depth which
+ may be selectively overridden vi the tag_info
+ option.
+ Possible Values: 1 - 253
+ Default Value: 32
+ -----------------------------------------------------------------
+ Option: tag_info:{{value[,value...]}[,{value[,value...]}...]}
+ Definition: Set the per-target tagged queue depth on a
+ per controller basis. Both controllers and targets
+ may be ommitted indicating that they should retain
+ the default tag depth.
+ Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32}
+ On Controller 0
+ specifies a tag depth of 16 for target 0
+ specifies a tag depth of 64 for target 3
+ specifies a tag depth of 8 for targets 4 and 5
+ leaves target 6 at the default
+ specifies a tag depth of 32 for targets 1,2,7-15
+ All other targets retain the default depth.
+
+ tag_info:{{},{32,,32}}
+ On Controller 1
+ specifies a tag depth of 32 for targets 0 and 2
+ All other targets retain the default depth.
+
+ Possible Values: 1 - 253
+ Default Value: 32
+ -----------------------------------------------------------------
+ Option: rd_strm: {rd_strm_bitmask[,rd_strm_bitmask...]}
+ Definition: Enable read streaming on a per target basis.
+ The rd_strm_bitmask is a 16 bit hex value in which
+ each bit represents a target. Setting the target's
+ bit to '1' enables read streaming for that
+ target. Controllers may be ommitted indicating that
+ they should retain the default read streaming setting.
+ Example: rd_strm:{0x0041}
+ On Controller 0
+ enables read streaming for targets 0 and 6.
+ disables read streaming for targets 1-5,7-15.
+ All other targets retain the default read
+ streaming setting.
+ Example: rd_strm:{0x0023,,0xFFFF}
+ On Controller 0
+ enables read streaming for targets 1,2, and 5.
+ disables read streaming for targets 3,4,6-15.
+ On Controller 2
+ enables read streaming for all targets.
+ All other targets retain the default read
+ streaming setting.
+
+ Possible Values: 0x0000 - 0xffff
+ Default Value: 0x0000
+ -----------------------------------------------------------------
+ Option: dv: {value[,value...]}
+ Definition: Set Domain Validation Policy on a per-controller basis.
+ Controllers may be ommitted indicating that
+ they should retain the default read streaming setting.
+ Example: dv:{-1,0,,1,1,0}
+ On Controller 0 leave DV at its default setting.
+ On Controller 1 disable DV.
+ Skip configuration on Controller 2.
+ On Controllers 3 and 4 enable DV.
+ On Controller 5 disable DV.
+
+ Possible Values: < 0 Use setting from serial EEPROM.
+ 0 Disable DV
+ > 0 Enable DV
+ Default Value: DV Serial EEPROM configuration setting.
+ -----------------------------------------------------------------
+ Option: seltime:[value]
+ Definition: Specifies the selection timeout value
+ Possible Values: 0 = 256ms, 1 = 128ms, 2 = 64ms, 3 = 32ms
+ Default Value: 0
+ -----------------------------------------------------------------
+
+ *** The following three options should only be changed at ***
+ *** the direction of a technical support representative. ***
+
+ -----------------------------------------------------------------
+ Option: precomp: {value[,value...]}
+ Definition: Set IO Cell precompensation value on a per-controller
+ basis.
+ Controllers may be ommitted indicating that
+ they should retain the default precompensation setting.
+ Example: precomp:{0x1}
+ On Controller 0 set precompensation to 1.
+ Example: precomp:{1,,7}
+ On Controller 0 set precompensation to 1.
+ On Controller 2 set precompensation to 8.
+
+ Possible Values: 0 - 7
+ Default Value: Varies based on chip revision
+ -----------------------------------------------------------------
+ Option: slewrate: {value[,value...]}
+ Definition: Set IO Cell slew rate on a per-controller basis.
+ Controllers may be ommitted indicating that
+ they should retain the default slew rate setting.
+ Example: slewrate:{0x1}
+ On Controller 0 set slew rate to 1.
+ Example: slewrate :{1,,8}
+ On Controller 0 set slew rate to 1.
+ On Controller 2 set slew rate to 8.
+
+ Possible Values: 0 - 15
+ Default Value: Varies based on chip revision
+ -----------------------------------------------------------------
+ Option: amplitude: {value[,value...]}
+ Definition: Set IO Cell signal amplitude on a per-controller basis.
+ Controllers may be ommitted indicating that
+ they should retain the default read streaming setting.
+ Example: amplitude:{0x1}
+ On Controller 0 set amplitude to 1.
+ Example: amplitude :{1,,7}
+ On Controller 0 set amplitude to 1.
+ On Controller 2 set amplitude to 7.
+
+ Possible Values: 1 - 7
+ Default Value: Varies based on chip revision
+ -----------------------------------------------------------------
+
+ Example: 'options aic79xx aic79xx=verbose,rd_strm:{{0x0041}}'
+ enables verbose output in the driver and turns read streaming on
+ for targets 0 and 6 of Controller 0.
+
+4. Additional Notes
+
+ 4.1. Known/Unresolved or FYI Issues
+
+ * Under SuSE Linux Enterprise 7, the driver may fail to operate
+ correctly due to a problem with PCI interrupt routing in the
+ Linux kernel. Please contact SuSE for an updated Linux
+ kernel.
+
+ 4.2. Third-Party Compatibility Issues
+
+ * Adaptec only supports Ultra320 hard drives running
+ the latest firmware available. Please check with
+ your hard drive manufacturer to ensure you have the
+ latest version.
+
+ 4.3. Operating System or Technology Limitations
+
+ * PCI Hot Plug is untested and may cause the operating system
+ to stop responding.
+ * Luns that are not numbered contiguously starting with 0 might not
+ be automatically probed during system startup. This is a limitation
+ of the OS. Please contact your Linux vendor for instructions on
+ manually probing non-contiguous luns.
+ * Using the Driver Update Disk version of this package during OS
+ installation under RedHat might result in two versions of this
+ driver being installed into the system module directory. This
+ might cause problems with the /sbin/mkinitrd program and/or
+ other RPM packages that try to install system modules. The best
+ way to correct this once the system is running is to install
+ the latest RPM package version of this driver, available from
+ http://www.adaptec.com.
+
+
+5. Adaptec Customer Support
+
+ A Technical Support Identification (TSID) Number is required for
+ Adaptec technical support.
+ - The 12-digit TSID can be found on the white barcode-type label
+ included inside the box with your product. The TSID helps us
+ provide more efficient service by accurately identifying your
+ product and support status.
+
+ Support Options
+ - Search the Adaptec Support Knowledgebase (ASK) at
+ http://ask.adaptec.com for articles, troubleshooting tips, and
+ frequently asked questions about your product.
+ - For support via Email, submit your question to Adaptec's
+ Technical Support Specialists at http://ask.adaptec.com/.
+
+ North America
+ - Visit our Web site at http://www.adaptec.com/.
+ - For information about Adaptec's support options, call
+ 408-957-2550, 24 hours a day, 7 days a week.
+ - To speak with a Technical Support Specialist,
+ * For hardware products, call 408-934-7274,
+ Monday to Friday, 3:00 am to 5:00 pm, PDT.
+ * For RAID and Fibre Channel products, call 321-207-2000,
+ Monday to Friday, 3:00 am to 5:00 pm, PDT.
+ To expedite your service, have your computer with you.
+ - To order Adaptec products, including accessories and cables,
+ call 408-957-7274. To order cables online go to
+ http://www.adaptec.com/buy-cables/.
+
+ Europe
+ - Visit our Web site at http://www.adaptec-europe.com/.
+ - To speak with a Technical Support Specialist, call, or email,
+ * German: +49 89 4366 5522, Monday-Friday, 9:00-17:00 CET,
+ http://ask-de.adaptec.com/.
+ * French: +49 89 4366 5533, Monday-Friday, 9:00-17:00 CET,
+ http://ask-fr.adaptec.com/.
+ * English: +49 89 4366 5544, Monday-Friday, 9:00-17:00 GMT,
+ http://ask.adaptec.com/.
+ - You can order Adaptec cables online at
+ http://www.adaptec.com/buy-cables/.
+
+ Japan
+ - Visit our web site at http://www.adaptec.co.jp/.
+ - To speak with a Technical Support Specialist, call
+ +81 3 5308 6120, Monday-Friday, 9:00 a.m. to 12:00 p.m.,
+ 1:00 p.m. to 6:00 p.m.
+
+-------------------------------------------------------------------
+/*
+ * Copyright (c) 2003 Adaptec Inc. 691 S. Milpitas Blvd., Milpitas CA 95035 USA.
+ * All rights reserved.
+ *
+ * You are permitted to redistribute, use and modify this README file in whole
+ * or in part in conjunction with redistribution of software governed by the
+ * General Public License, provided that the following conditions are met:
+ * 1. Redistributions of README file must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ * 3. Modifications or new contributions must be attributed in a copyright
+ * notice identifying the author ("Contributor") and added below the
+ * original copyright notice. The copyright notice is for purposes of
+ * identifying contributors and should not be deemed as permission to alter
+ * the permissions given by Adaptec.
+ *
+ * THIS README FILE IS PROVIDED BY ADAPTEC AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ANY
+ * WARRANTIES OF NON-INFRINGEMENT OR THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * ADAPTEC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS README
+ * FILE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/Documentation/scsi/aic7xxx.txt b/Documentation/scsi/aic7xxx.txt
new file mode 100644
index 0000000..b7e238c
--- /dev/null
+++ b/Documentation/scsi/aic7xxx.txt
@@ -0,0 +1,394 @@
+====================================================================
+= Adaptec Aic7xxx Fast -> Ultra160 Family Manager Set v7.0 =
+= README for =
+= The Linux Operating System =
+====================================================================
+
+The following information is available in this file:
+
+ 1. Supported Hardware
+ 2. Version History
+ 3. Command Line Options
+ 4. Contacting Adaptec
+
+1. Supported Hardware
+
+ The following Adaptec SCSI Chips and Host Adapters are supported by
+ the aic7xxx driver.
+
+ Chip MIPS Host Bus MaxSync MaxWidth SCBs Notes
+ ---------------------------------------------------------------
+ aic7770 10 EISA/VL 10MHz 16Bit 4 1
+ aic7850 10 PCI/32 10MHz 8Bit 3
+ aic7855 10 PCI/32 10MHz 8Bit 3
+ aic7856 10 PCI/32 10MHz 8Bit 3
+ aic7859 10 PCI/32 20MHz 8Bit 3
+ aic7860 10 PCI/32 20MHz 8Bit 3
+ aic7870 10 PCI/32 10MHz 16Bit 16
+ aic7880 10 PCI/32 20MHz 16Bit 16
+ aic7890 20 PCI/32 40MHz 16Bit 16 3 4 5 6 7 8
+ aic7891 20 PCI/64 40MHz 16Bit 16 3 4 5 6 7 8
+ aic7892 20 PCI/64-66 80MHz 16Bit 16 3 4 5 6 7 8
+ aic7895 15 PCI/32 20MHz 16Bit 16 2 3 4 5
+ aic7895C 15 PCI/32 20MHz 16Bit 16 2 3 4 5 8
+ aic7896 20 PCI/32 40MHz 16Bit 16 2 3 4 5 6 7 8
+ aic7897 20 PCI/64 40MHz 16Bit 16 2 3 4 5 6 7 8
+ aic7899 20 PCI/64-66 80MHz 16Bit 16 2 3 4 5 6 7 8
+
+ 1. Multiplexed Twin Channel Device - One controller servicing two
+ busses.
+ 2. Multi-function Twin Channel Device - Two controllers on one chip.
+ 3. Command Channel Secondary DMA Engine - Allows scatter gather list
+ and SCB prefetch.
+ 4. 64 Byte SCB Support - Allows disconnected, untagged request table
+ for all possible target/lun combinations.
+ 5. Block Move Instruction Support - Doubles the speed of certain
+ sequencer operations.
+ 6. `Bayonet' style Scatter Gather Engine - Improves S/G prefetch
+ performance.
+ 7. Queuing Registers - Allows queuing of new transactions without
+ pausing the sequencer.
+ 8. Multiple Target IDs - Allows the controller to respond to selection
+ as a target on multiple SCSI IDs.
+
+ Controller Chip Host-Bus Int-Connectors Ext-Connectors Notes
+ --------------------------------------------------------------------------
+ AHA-274X[A] aic7770 EISA SE-50M SE-HD50F
+ AHA-274X[A]W aic7770 EISA SE-HD68F SE-HD68F
+ SE-50M
+ AHA-274X[A]T aic7770 EISA 2 X SE-50M SE-HD50F
+ AHA-2842 aic7770 VL SE-50M SE-HD50F
+ AHA-2940AU aic7860 PCI/32 SE-50M SE-HD50F
+ AVA-2902I aic7860 PCI/32 SE-50M
+ AVA-2902E aic7860 PCI/32 SE-50M
+ AVA-2906 aic7856 PCI/32 SE-50M SE-DB25F
+ APC-7850 aic7850 PCI/32 SE-50M 1
+ AVA-2940 aic7860 PCI/32 SE-50M
+ AHA-2920B aic7860 PCI/32 SE-50M
+ AHA-2930B aic7860 PCI/32 SE-50M
+ AHA-2920C aic7856 PCI/32 SE-50M SE-HD50F
+ AHA-2930C aic7860 PCI/32 SE-50M
+ AHA-2930C aic7860 PCI/32 SE-50M
+ AHA-2910C aic7860 PCI/32 SE-50M
+ AHA-2915C aic7860 PCI/32 SE-50M
+ AHA-2940AU/CN aic7860 PCI/32 SE-50M SE-HD50F
+ AHA-2944W aic7870 PCI/32 HVD-HD68F HVD-HD68F
+ HVD-50M
+ AHA-3940W aic7870 PCI/32 2 X SE-HD68F SE-HD68F 2
+ AHA-2940UW aic7880 PCI/32 SE-HD68F
+ SE-50M SE-HD68F
+ AHA-2940U aic7880 PCI/32 SE-50M SE-HD50F
+ AHA-2940D aic7880 PCI/32
+ aHA-2940 A/T aic7880 PCI/32
+ AHA-2940D A/T aic7880 PCI/32
+ AHA-3940UW aic7880 PCI/32 2 X SE-HD68F SE-HD68F 3
+ AHA-3940UWD aic7880 PCI/32 2 X SE-HD68F 2 X SE-VHD68F 3
+ AHA-3940U aic7880 PCI/32 2 X SE-50M SE-HD50F 3
+ AHA-2944UW aic7880 PCI/32 HVD-HD68F HVD-HD68F
+ HVD-50M
+ AHA-3944UWD aic7880 PCI/32 2 X HVD-HD68F 2 X HVD-VHD68F 3
+ AHA-4944UW aic7880 PCI/32
+ AHA-2930UW aic7880 PCI/32
+ AHA-2940UW Pro aic7880 PCI/32 SE-HD68F SE-HD68F 4
+ SE-50M
+ AHA-2940UW/CN aic7880 PCI/32
+ AHA-2940UDual aic7895 PCI/32
+ AHA-2940UWDual aic7895 PCI/32
+ AHA-3940UWD aic7895 PCI/32
+ AHA-3940AUW aic7895 PCI/32
+ AHA-3940AUWD aic7895 PCI/32
+ AHA-3940AU aic7895 PCI/32
+ AHA-3944AUWD aic7895 PCI/32 2 X HVD-HD68F 2 X HVD-VHD68F
+ AHA-2940U2B aic7890 PCI/32 LVD-HD68F LVD-HD68F
+ AHA-2940U2 OEM aic7891 PCI/64
+ AHA-2940U2W aic7890 PCI/32 LVD-HD68F LVD-HD68F
+ SE-HD68F
+ SE-50M
+ AHA-2950U2B aic7891 PCI/64 LVD-HD68F LVD-HD68F
+ AHA-2930U2 aic7890 PCI/32 LVD-HD68F SE-HD50F
+ SE-50M
+ AHA-3950U2B aic7897 PCI/64
+ AHA-3950U2D aic7897 PCI/64
+ AHA-29160 aic7892 PCI/64-66
+ AHA-29160 CPQ aic7892 PCI/64-66
+ AHA-29160N aic7892 PCI/32 LVD-HD68F SE-HD50F
+ SE-50M
+ AHA-29160LP aic7892 PCI/64-66
+ AHA-19160 aic7892 PCI/64-66
+ AHA-29150LP aic7892 PCI/64-66
+ AHA-29130LP aic7892 PCI/64-66
+ AHA-3960D aic7899 PCI/64-66 2 X LVD-HD68F 2 X LVD-VHD68F
+ LVD-50M
+ AHA-3960D CPQ aic7899 PCI/64-66 2 X LVD-HD68F 2 X LVD-VHD68F
+ LVD-50M
+ AHA-39160 aic7899 PCI/64-66 2 X LVD-HD68F 2 X LVD-VHD68F
+ LVD-50M
+
+ 1. No BIOS support
+ 2. DEC21050 PCI-PCI bridge with multiple controller chips on secondary bus
+ 3. DEC2115X PCI-PCI bridge with multiple controller chips on secondary bus
+ 4. All three SCSI connectors may be used simultaneously without
+ SCSI "stub" effects.
+
+2. Version History
+ 7.0 (4th August, 2005)
+ - Updated driver to use SCSI transport class infrastructure
+ - Upported sequencer and core fixes from last adaptec released
+ version of the driver.
+ 6.2.36 (June 3rd, 2003)
+ - Correct code that disables PCI parity error checking.
+ - Correct and simplify handling of the ignore wide residue
+ message. The previous code would fail to report a residual
+ if the transaction data length was even and we received
+ an IWR message.
+ - Add support for the 2.5.X EISA framework.
+ - Update for change in 2.5.X SCSI proc FS interface.
+ - Correct Domain Validation command-line option parsing.
+ - When negotiation async via an 8bit WDTR message, send
+ an SDTR with an offset of 0 to be sure the target
+ knows we are async. This works around a firmware defect
+ in the Quantum Atlas 10K.
+ - Clear PCI error state during driver attach so that we
+ don't disable memory mapped I/O due to a stray write
+ by some other driver probe that occurred before we
+ claimed the controller.
+
+ 6.2.35 (May 14th, 2003)
+ - Fix a few GCC 3.3 compiler warnings.
+ - Correct operation on EISA Twin Channel controller.
+ - Add support for 2.5.X's scsi_report_device_reset().
+
+ 6.2.34 (May 5th, 2003)
+ - Fix locking regression introduced in 6.2.29 that
+ could cause a lock order reversal between the io_request_lock
+ and our per-softc lock. This was only possible on RH9,
+ SuSE, and kernel.org 2.4.X kernels.
+
+ 6.2.33 (April 30th, 2003)
+ - Dynamically disable PCI parity error reporting after
+ 10 errors are reported to the user. These errors are
+ the result of some other device issuing PCI transactions
+ with bad parity. Once the user has been informed of the
+ problem, continuing to report the errors just degrades
+ our performance.
+
+ 6.2.32 (March 28th, 2003)
+ - Dynamically sized S/G lists to avoid SCSI malloc
+ pool fragmentation and SCSI mid-layer deadlock.
+
+ 6.2.28 (January 20th, 2003)
+ - Domain Validation Fixes
+ - Add ability to disable PCI parity error checking.
+ - Enhanced Memory Mapped I/O probe
+
+ 6.2.20 (November 7th, 2002)
+ - Added Domain Validation.
+
+3. Command Line Options
+
+ WARNING: ALTERING OR ADDING THESE DRIVER PARAMETERS
+ INCORRECTLY CAN RENDER YOUR SYSTEM INOPERABLE.
+ USE THEM WITH CAUTION.
+
+ Edit the file "modprobe.conf" in the directory /etc and add/edit a
+ line containing 'options aic7xxx aic7xxx=[command[,command...]]' where
+ 'command' is one or more of the following:
+ -----------------------------------------------------------------
+ Option: verbose
+ Definition: enable additional informative messages during
+ driver operation.
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: debug:[value]
+ Definition: Enables various levels of debugging information
+ Possible Values: 0x0000 = no debugging, 0xffff = full debugging
+ Default Value: 0x0000
+ -----------------------------------------------------------------
+ Option: no_probe
+ Option: probe_eisa_vl
+ Definition: Do not probe for EISA/VLB controllers.
+ This is a toggle. If the driver is compiled
+ to not probe EISA/VLB controllers by default,
+ specifying "no_probe" will enable this probing.
+ If the driver is compiled to probe EISA/VLB
+ controllers by default, specifying "no_probe"
+ will disable this probing.
+ Possible Values: This option is a toggle
+ Default Value: EISA/VLB probing is disabled by default.
+ -----------------------------------------------------------------
+ Option: pci_parity
+ Definition: Toggles the detection of PCI parity errors.
+ On many motherboards with VIA chipsets,
+ PCI parity is not generated correctly on the
+ PCI bus. It is impossible for the hardware to
+ differentiate between these "spurious" parity
+ errors and real parity errors. The symptom of
+ this problem is a stream of the message:
+ "scsi0: Data Parity Error Detected during address or write data phase"
+ output by the driver.
+ Possible Values: This option is a toggle
+ Default Value: PCI Parity Error reporting is disabled
+ -----------------------------------------------------------------
+ Option: no_reset
+ Definition: Do not reset the bus during the initial probe
+ phase
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: extended
+ Definition: Force extended translation on the controller
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: periodic_otag
+ Definition: Send an ordered tag periodically to prevent
+ tag starvation. Needed for some older devices
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: reverse_scan
+ Definition: Probe the scsi bus in reverse order, starting
+ with target 15
+ Possible Values: This option is a flag
+ Default Value: disabled
+ -----------------------------------------------------------------
+ Option: global_tag_depth:[value]
+ Definition: Global tag depth for all targets on all busses.
+ This option sets the default tag depth which
+ may be selectively overridden vi the tag_info
+ option.
+ Possible Values: 1 - 253
+ Default Value: 32
+ -----------------------------------------------------------------
+ Option: tag_info:{{value[,value...]}[,{value[,value...]}...]}
+ Definition: Set the per-target tagged queue depth on a
+ per controller basis. Both controllers and targets
+ may be omitted indicating that they should retain
+ the default tag depth.
+ Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32}
+ On Controller 0
+ specifies a tag depth of 16 for target 0
+ specifies a tag depth of 64 for target 3
+ specifies a tag depth of 8 for targets 4 and 5
+ leaves target 6 at the default
+ specifies a tag depth of 32 for targets 1,2,7-15
+ All other targets retain the default depth.
+
+ tag_info:{{},{32,,32}}
+ On Controller 1
+ specifies a tag depth of 32 for targets 0 and 2
+ All other targets retain the default depth.
+
+ Possible Values: 1 - 253
+ Default Value: 32
+ -----------------------------------------------------------------
+ Option: seltime:[value]
+ Definition: Specifies the selection timeout value
+ Possible Values: 0 = 256ms, 1 = 128ms, 2 = 64ms, 3 = 32ms
+ Default Value: 0
+ -----------------------------------------------------------------
+ Option: dv: {value[,value...]}
+ Definition: Set Domain Validation Policy on a per-controller basis.
+ Controllers may be omitted indicating that
+ they should retain the default read streaming setting.
+ Example: dv:{-1,0,,1,1,0}
+ On Controller 0 leave DV at its default setting.
+ On Controller 1 disable DV.
+ Skip configuration on Controller 2.
+ On Controllers 3 and 4 enable DV.
+ On Controller 5 disable DV.
+
+ Possible Values: < 0 Use setting from serial EEPROM.
+ 0 Disable DV
+ > 0 Enable DV
+
+ Default Value: SCSI-Select setting on controllers with a SCSI Select
+ option for DV. Otherwise, on for controllers supporting
+ U160 speeds and off for all other controller types.
+ -----------------------------------------------------------------
+
+ Example:
+ 'options aic7xxx aic7xxx=verbose,no_probe,tag_info:{{},{,,10}},seltime:1'
+ enables verbose logging, Disable EISA/VLB probing,
+ and set tag depth on Controller 1/Target 2 to 10 tags.
+
+4. Adaptec Customer Support
+
+ A Technical Support Identification (TSID) Number is required for
+ Adaptec technical support.
+ - The 12-digit TSID can be found on the white barcode-type label
+ included inside the box with your product. The TSID helps us
+ provide more efficient service by accurately identifying your
+ product and support status.
+
+ Support Options
+ - Search the Adaptec Support Knowledgebase (ASK) at
+ http://ask.adaptec.com for articles, troubleshooting tips, and
+ frequently asked questions about your product.
+ - For support via Email, submit your question to Adaptec's
+ Technical Support Specialists at http://ask.adaptec.com/.
+
+ North America
+ - Visit our Web site at http://www.adaptec.com/.
+ - For information about Adaptec's support options, call
+ 408-957-2550, 24 hours a day, 7 days a week.
+ - To speak with a Technical Support Specialist,
+ * For hardware products, call 408-934-7274,
+ Monday to Friday, 3:00 am to 5:00 pm, PDT.
+ * For RAID and Fibre Channel products, call 321-207-2000,
+ Monday to Friday, 3:00 am to 5:00 pm, PDT.
+ To expedite your service, have your computer with you.
+ - To order Adaptec products, including accessories and cables,
+ call 408-957-7274. To order cables online go to
+ http://www.adaptec.com/buy-cables/.
+
+ Europe
+ - Visit our Web site at http://www.adaptec-europe.com/.
+ - To speak with a Technical Support Specialist, call, or email,
+ * German: +49 89 4366 5522, Monday-Friday, 9:00-17:00 CET,
+ http://ask-de.adaptec.com/.
+ * French: +49 89 4366 5533, Monday-Friday, 9:00-17:00 CET,
+ http://ask-fr.adaptec.com/.
+ * English: +49 89 4366 5544, Monday-Friday, 9:00-17:00 GMT,
+ http://ask.adaptec.com/.
+ - You can order Adaptec cables online at
+ http://www.adaptec.com/buy-cables/.
+
+ Japan
+ - Visit our web site at http://www.adaptec.co.jp/.
+ - To speak with a Technical Support Specialist, call
+ +81 3 5308 6120, Monday-Friday, 9:00 a.m. to 12:00 p.m.,
+ 1:00 p.m. to 6:00 p.m.
+
+-------------------------------------------------------------------
+/*
+ * Copyright (c) 2003 Adaptec Inc. 691 S. Milpitas Blvd., Milpitas CA 95035 USA.
+ * All rights reserved.
+ *
+ * You are permitted to redistribute, use and modify this README file in whole
+ * or in part in conjunction with redistribution of software governed by the
+ * General Public License, provided that the following conditions are met:
+ * 1. Redistributions of README file must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ * 3. Modifications or new contributions must be attributed in a copyright
+ * notice identifying the author ("Contributor") and added below the
+ * original copyright notice. The copyright notice is for purposes of
+ * identifying contributors and should not be deemed as permission to alter
+ * the permissions given by Adaptec.
+ *
+ * THIS README FILE IS PROVIDED BY ADAPTEC AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ANY
+ * WARRANTIES OF NON-INFRINGEMENT OR THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * ADAPTEC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS README
+ * FILE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/Documentation/scsi/aic7xxx_old.txt b/Documentation/scsi/aic7xxx_old.txt
new file mode 100644
index 0000000..7bd210a
--- /dev/null
+++ b/Documentation/scsi/aic7xxx_old.txt
@@ -0,0 +1,511 @@
+ AIC7xxx Driver for Linux
+
+Introduction
+----------------------------
+The AIC7xxx SCSI driver adds support for Adaptec (http://www.adaptec.com)
+SCSI controllers and chipsets. Major portions of the driver and driver
+development are shared between both Linux and FreeBSD. Support for the
+AIC-7xxx chipsets have been in the default Linux kernel since approximately
+linux-1.1.x and fairly stable since linux-1.2.x, and are also in FreeBSD
+2.1.0 or later.
+
+ Supported cards/chipsets
+ ----------------------------
+ Adaptec Cards
+ ----------------------------
+ AHA-274x
+ AHA-274xT
+ AHA-2842
+ AHA-2910B
+ AHA-2920C
+ AHA-2930
+ AHA-2930U
+ AHA-2930CU
+ AHA-2930U2
+ AHA-2940
+ AHA-2940W
+ AHA-2940U
+ AHA-2940UW
+ AHA-2940UW-PRO
+ AHA-2940AU
+ AHA-2940U2W
+ AHA-2940U2
+ AHA-2940U2B
+ AHA-2940U2BOEM
+ AHA-2944D
+ AHA-2944WD
+ AHA-2944UD
+ AHA-2944UWD
+ AHA-2950U2
+ AHA-2950U2W
+ AHA-2950U2B
+ AHA-29160M
+ AHA-3940
+ AHA-3940U
+ AHA-3940W
+ AHA-3940UW
+ AHA-3940AUW
+ AHA-3940U2W
+ AHA-3950U2B
+ AHA-3950U2D
+ AHA-3960D
+ AHA-39160M
+ AHA-3985
+ AHA-3985U
+ AHA-3985W
+ AHA-3985UW
+
+ Motherboard Chipsets
+ ----------------------------
+ AIC-777x
+ AIC-785x
+ AIC-786x
+ AIC-787x
+ AIC-788x
+ AIC-789x
+ AIC-3860
+
+ Bus Types
+ ----------------------------
+ W - Wide SCSI, SCSI-3, 16bit bus, 68pin connector, will also support
+ SCSI-1/SCSI-2 50pin devices, transfer rates up to 20MB/s.
+ U - Ultra SCSI, transfer rates up to 40MB/s.
+ U2- Ultra 2 SCSI, transfer rates up to 80MB/s.
+ D - Differential SCSI.
+ T - Twin Channel SCSI. Up to 14 SCSI devices.
+
+ AHA-274x - EISA SCSI controller
+ AHA-284x - VLB SCSI controller
+ AHA-29xx - PCI SCSI controller
+ AHA-394x - PCI controllers with two separate SCSI controllers on-board.
+ AHA-398x - PCI RAID controllers with three separate SCSI controllers
+ on-board.
+
+ Not Supported Devices
+ ------------------------------
+ Adaptec Cards
+ ----------------------------
+ AHA-2920 (Only the cards that use the Future Domain chipset are not
+ supported, any 2920 cards based on Adaptec AIC chipsets,
+ such as the 2920C, are supported)
+ AAA-13x Raid Adapters
+ AAA-113x Raid Port Card
+
+ Motherboard Chipsets
+ ----------------------------
+ AIC-7810
+
+ Bus Types
+ ----------------------------
+ R - Raid Port busses are not supported.
+
+ The hardware RAID devices sold by Adaptec are *NOT* supported by this
+ driver (and will people please stop emailing me about them, they are
+ a totally separate beast from the bare SCSI controllers and this driver
+ cannot be retrofitted in any sane manner to support the hardware RAID
+ features on those cards - Doug Ledford).
+
+
+ People
+ ------------------------------
+ Justin T Gibbs gibbs@plutotech.com
+ (BSD Driver Author)
+ Dan Eischen deischen@iworks.InterWorks.org
+ (Original Linux Driver Co-maintainer)
+ Dean Gehnert deang@teleport.com
+ (Original Linux FTP/patch maintainer)
+ Jess Johnson jester@frenzy.com
+ (AIC7xxx FAQ author)
+ Doug Ledford dledford@redhat.com
+ (Current Linux aic7xxx-5.x.x Driver/Patch/FTP maintainer)
+
+ Special thanks go to John Aycock (aycock@cpsc.ucalgary.ca), the original
+ author of the driver. John has since retired from the project. Thanks
+ again for all his work!
+
+ Mailing list
+ ------------------------------
+ There is a mailing list available for users who want to track development
+ and converse with other users and developers. This list is for both
+ FreeBSD and Linux support of the AIC7xxx chipsets.
+
+ To subscribe to the AIC7xxx mailing list send mail to the list server,
+ with "subscribe AIC7xxx" in the body (no Subject: required):
+ To: majordomo@FreeBSD.ORG
+ ---
+ subscribe AIC7xxx
+
+ To unsubscribe from the list, send mail to the list server with:
+ To: majordomo@FreeBSD.ORG
+ ---
+ unsubscribe AIC7xxx
+
+ Send regular messages and replies to: AIC7xxx@FreeBSD.ORG
+
+ Boot Command line options
+ ------------------------------
+ "aic7xxx=no_reset" - Eliminate the SCSI bus reset during startup.
+ Some SCSI devices need the initial reset that this option disables
+ in order to work. If you have problems at bootup, please make sure
+ you aren't using this option.
+
+ "aic7xxx=reverse_scan" - Certain PCI motherboards scan for devices at
+ bootup by scanning from the highest numbered PCI device to the
+ lowest numbered PCI device, others do just the opposite and scan
+ from lowest to highest numbered PCI device. There is no reliable
+ way to autodetect this ordering. So, we default to the most common
+ order, which is lowest to highest. Then, in case your motherboard
+ scans from highest to lowest, we have this option. If your BIOS
+ finds the drives on controller A before controller B but the linux
+ kernel finds your drives on controller B before A, then you should
+ use this option.
+
+ "aic7xxx=extended" - Force the driver to detect extended drive translation
+ on your controller. This helps those people who have cards without
+ a SEEPROM make sure that linux and all other operating systems think
+ the same way about your hard drives.
+
+ "aic7xxx=scbram" - Some cards have external SCB RAM that can be used to
+ give the card more hardware SCB slots. This allows the driver to use
+ that SCB RAM. Without this option, the driver won't touch the SCB
+ RAM because it is known to cause problems on a few cards out there
+ (such as 3985 class cards).
+
+ "aic7xxx=irq_trigger:x" - Replace x with either 0 or 1 to force the kernel
+ to use the correct IRQ type for your card. This only applies to EISA
+ based controllers. On these controllers, 0 is for Edge triggered
+ interrupts, and 1 is for Level triggered interrupts. If you aren't
+ sure or don't know which IRQ trigger type your EISA card uses, then
+ let the kernel autodetect the trigger type.
+
+ "aic7xxx=verbose" - This option can be used in one of two ways. If you
+ simply specify aic7xxx=verbose, then the kernel will automatically
+ pick the default set of verbose messages for you to see.
+ Alternatively, you can specify the command as
+ "aic7xxx=verbose:0xXXXX" where the X entries are replaced with
+ hexadecimal digits. This option is a bit field type option. For
+ a full listing of the available options, search for the
+ #define VERBOSE_xxxxxx lines in the aic7xxx.c file. If you want
+ verbose messages, then it is recommended that you simply use the
+ aic7xxx=verbose variant of this command.
+
+ "aic7xxx=pci_parity:x" - This option controls whether or not the driver
+ enables PCI parity error checking on the PCI bus. By default, this
+ checking is disabled. To enable the checks, simply specify pci_parity
+ with no value afterwords. To reverse the parity from even to odd,
+ supply any number other than 0 or 255. In short:
+ pci_parity - Even parity checking (even is the normal PCI parity)
+ pci_parity:x - Where x > 0, Odd parity checking
+ pci_parity:0 - No check (default)
+ NOTE: In order to get Even PCI parity checking, you must use the
+ version of the option that does not include the : and a number at
+ the end (unless you want to enter exactly 2^32 - 1 as the number).
+
+ "aic7xxx=no_probe" - This option will disable the probing for any VLB
+ based 2842 controllers and any EISA based controllers. This is
+ needed on certain newer motherboards where the normal EISA I/O ranges
+ have been claimed by other PCI devices. Probing on those machines
+ will often result in the machine crashing or spontaneously rebooting
+ during startup. Examples of machines that need this are the
+ Dell PowerEdge 6300 machines.
+
+ "aic7xxx=seltime:2" - This option controls how long the card waits
+ during a device selection sequence for the device to respond.
+ The original SCSI spec says that this "should be" 256ms. This
+ is generally not required with modern devices. However, some
+ very old SCSI I devices need the full 256ms. Most modern devices
+ can run fine with only 64ms. The default for this option is
+ 64ms. If you need to change this option, then use the following
+ table to set the proper value in the example above:
+ 0 - 256ms
+ 1 - 128ms
+ 2 - 64ms
+ 3 - 32ms
+
+ "aic7xxx=panic_on_abort" - This option is for debugging and will cause
+ the driver to panic the linux kernel and freeze the system the first
+ time the drivers abort or reset routines are called. This is most
+ helpful when some problem causes infinite reset loops that scroll too
+ fast to see. By using this option, you can write down what the errors
+ actually are and send that information to me so it can be fixed.
+
+ "aic7xxx=dump_card" - This option will print out the *entire* set of
+ configuration registers on the card during the init sequence. This
+ is a debugging aid used to see exactly what state the card is in
+ when we finally finish our initialization routines. If you don't
+ have documentation on the chipsets, this will do you absolutely
+ no good unless you are simply trying to write all the information
+ down in order to send it to me.
+
+ "aic7xxx=dump_sequencer" - This is the same as the above options except
+ that instead of dumping the register contents on the card, this
+ option dumps the contents of the sequencer program RAM. This gives
+ the ability to verify that the instructions downloaded to the
+ card's sequencer are indeed what they are supposed to be. Again,
+ unless you have documentation to tell you how to interpret these
+ numbers, then it is totally useless.
+
+ "aic7xxx=override_term:0xffffffff" - This option is used to force the
+ termination on your SCSI controllers to a particular setting. This
+ is a bit mask variable that applies for up to 8 aic7xxx SCSI channels.
+ Each channel gets 4 bits, divided as follows:
+ bit 3 2 1 0
+ | | | Enable/Disable Single Ended Low Byte Termination
+ | | En/Disable Single Ended High Byte Termination
+ | En/Disable Low Byte LVD Termination
+ En/Disable High Byte LVD Termination
+
+ The upper 2 bits that deal with LVD termination only apply to Ultra2
+ controllers. Furthermore, due to the current Ultra2 controller
+ designs, these bits are tied together such that setting either bit
+ enables both low and high byte LVD termination. It is not possible
+ to only set high or low byte LVD termination in this manner. This is
+ an artifact of the BIOS definition on Ultra2 controllers. For other
+ controllers, the only important bits are the two lowest bits. Setting
+ the higher bits on non-Ultra2 controllers has no effect. A few
+ examples of how to use this option:
+
+ Enable low and high byte termination on a non-ultra2 controller that
+ is the first aic7xxx controller (the correct bits are 0011),
+ aic7xxx=override_term:0x3
+
+ Enable all termination on the third aic7xxx controller, high byte
+ termination on the second aic7xxx controller, and low and high byte
+ SE termination on the first aic7xxx controller
+ (bits are 1111 0010 0011),
+ aic7xxx=override_term:0xf23
+
+ No attempt has been made to make this option non-cryptic. It really
+ shouldn't be used except in dire circumstances, and if that happens,
+ I'm probably going to be telling you what to set this to anyway :)
+
+ "aic7xxx=stpwlev:0xffffffff" - This option is used to control the STPWLEV
+ bit in the DEVCONFIG PCI register. Currently, this is one of the
+ very few registers that we have absolutely *no* way of detecting
+ what the variable should be. It depends entirely on how the chipset
+ and external terminators were coupled by the card/motherboard maker.
+ Further, a chip reset (at power up) always sets this bit to 0. If
+ there is no BIOS to run on the chipset/card (such as with a 2910C
+ or a motherboard controller with the BIOS totally disabled) then
+ the variable may not get set properly. Of course, if the proper
+ setting was 0, then that's what it would be after the reset, but if
+ the proper setting is actually 1.....you get the picture. Now, since
+ we can't detect this at all, I've added this option to force the
+ setting. If you have a BIOS on your controller then you should never
+ need to use this option. However, if you are having lots of SCSI
+ reset problems and can't seem to get them knocked out, this may help.
+
+ Here's a test to know for certain if you need this option. Make
+ a boot floppy that you can use to boot your computer up and that
+ will detect the aic7xxx controller. Next, power down your computer.
+ While it's down, unplug all SCSI cables from your Adaptec SCSI
+ controller. Boot the system back up to the Adaptec EZ-SCSI BIOS
+ and then make sure that termination is enabled on your adapter (if
+ you have an Adaptec BIOS of course). Next, boot up the floppy you
+ made and wait for it to detect the aic7xxx controller. If the kernel
+ finds the controller fine, says scsi : x hosts and then tries to
+ detect your devices like normal, up to the point where it fails to
+ mount your root file system and panics, then you're fine. If, on
+ the other hand, the system goes into an infinite reset loop, then
+ you need to use this option and/or the previous option to force the
+ proper termination settings on your controller. If this happens,
+ then you next need to figure out what your settings should be.
+
+ To find the correct settings, power your machine back down, connect
+ back up the SCSI cables, and boot back into your machine like normal.
+ However, boot with the aic7xxx=verbose:0x39 option. Record the
+ initial DEVCONFIG values for each of your aic7xxx controllers as
+ they are listed, and also record what the machine is detecting as
+ the proper termination on your controllers. NOTE: the order in
+ which the initial DEVCONFIG values are printed out is not guaranteed
+ to be the same order as the SCSI controllers are registered. The
+ above option and this option both work on the order of the SCSI
+ controllers as they are registered, so make sure you match the right
+ DEVCONFIG values with the right controllers if you have more than
+ one aic7xxx controller.
+
+ Once you have the detected termination settings and the initial
+ DEVCONFIG values for each controller, then figure out what the
+ termination on each of the controllers *should* be. Hopefully, that
+ part is correct, but it could possibly be wrong if there is
+ bogus cable detection logic on your controller or something similar.
+ If all the controllers have the correct termination settings, then
+ don't set the aic7xxx=override_term variable at all, leave it alone.
+ Next, on any controllers that go into an infinite reset loop when
+ you unplug all the SCSI cables, get the starting DEVCONFIG value.
+ If the initial DEVCONFIG value is divisible by 2, then the correct
+ setting for that controller is 0. If it's an odd number, then
+ the correct setting for that controller is 1. For any other
+ controllers that didn't have an infinite reset problem, then reverse
+ the above options. If DEVCONFIG was even, then the correct setting
+ is 1, if not then the correct setting is 0.
+
+ Now that you know what the correct setting was for each controller,
+ we need to encode that into the aic7xxx=stpwlev:0x... variable.
+ This variable is a bit field encoded variable. Bit 0 is for the first
+ aic7xxx controller, bit 1 for the next, etc. Put all these bits
+ together and you get a number. For example, if the third aic7xxx
+ needed a 1, but the second and first both needed a 0, then the bits
+ would be 100 in binary. This then translates to 0x04. You would
+ therefore set aic7xxx=stpwlev:0x04. This is fairly standard binary
+ to hexadecimal conversions here. If you aren't up to speed on the
+ binary->hex conversion then send an email to the aic7xxx mailing
+ list and someone can help you out.
+
+ "aic7xxx=tag_info:{{8,8..},{8,8..},..}" - This option is used to disable
+ or enable Tagged Command Queueing (TCQ) on specific devices. As of
+ driver version 5.1.11, TCQ is now either on or off by default
+ according to the setting you choose during the make config process.
+ In order to en/disable TCQ for certain devices at boot time, a user
+ may use this boot param. The driver will then parse this message out
+ and en/disable the specific device entries that are present based upon
+ the value given. The param line is parsed in the following manner:
+
+ { - first instance indicates the start of this parameter values
+ second instance is the start of entries for a particular
+ device entry
+ } - end the entries for a particular host adapter, or end the entire
+ set of parameter entries
+ , - move to next entry. Inside of a set of device entries, this
+ moves us to the next device on the list. Outside of device
+ entries, this moves us to the next host adapter
+ . - Same effect as , but is safe to use with insmod.
+ x - the number to enter into the array at this position.
+ 0 = Enable tagged queueing on this device and use the default
+ queue depth
+ 1-254 = Enable tagged queueing on this device and use this
+ number as the queue depth
+ 255 = Disable tagged queueing on this device.
+ Note: anything above 32 for an actual queue depth is wasteful
+ and not recommended.
+
+ A few examples of how this can be used:
+
+ tag_info:{{8,12,,0,,255,4}}
+ This line will only effect the first aic7xxx card registered. It
+ will set scsi id 0 to a queue depth of 8, id 1 to 12, leave id 2
+ at the default, set id 3 to tagged queueing enabled and use the
+ default queue depth, id 4 default, id 5 disabled, and id 6 to 4.
+ Any not specified entries stay at the default value, repeated
+ commas with no value specified will simply increment to the next id
+ without changing anything for the missing values.
+
+ tag_info:{,,,{,,,255}}
+ First, second, and third adapters at default values. Fourth
+ adapter, id 3 is disabled. Notice that leading commas simply
+ increment what the first number effects, and there are no need
+ for trailing commas. When you close out an adapter, or the
+ entire entry, anything not explicitly set stays at the default
+ value.
+
+ A final note on this option. The scanner I used for this isn't
+ perfect or highly robust. If you mess the line up, the worst that
+ should happen is that the line will get ignored. If you don't
+ close out the entire entry with the final bracket, then any other
+ aic7xxx options after this will get ignored. So, in general, be
+ sure of what you are entering, and after you have it right, just
+ add it to the lilo.conf file so there won't be any mistakes. As
+ a means of checking this parser, the entire tag_info array for
+ each card is now printed out in the /proc/scsi/aic7xxx/x file. You
+ can use that to verify that your options were parsed correctly.
+
+ Boot command line options may be combined to form the proper set of options
+ a user might need. For example, the following is valid:
+
+ aic7xxx=verbose,extended,irq_trigger:1
+
+ The only requirement is that individual options be separated by a comma or
+ a period on the command line.
+
+ Module Loading command options
+ ------------------------------
+ When loading the aic7xxx driver as a module, the exact same options are
+ available to the user. However, the syntax to specify the options changes
+ slightly. For insmod, you need to wrap the aic7xxx= argument in quotes
+ and replace all ',' with '.'. So, for example, a valid insmod line
+ would be:
+
+ insmod aic7xxx aic7xxx='verbose.irq_trigger:1.extended'
+
+ This line should result in the *exact* same behaviour as if you typed
+ it in at the lilo prompt and the driver was compiled into the kernel
+ instead of being a module. The reason for the single quote is so that
+ the shell won't try to interpret anything in the line, such as {.
+ Insmod assumes any options starting with a letter instead of a number
+ is a character string (which is what we want) and by switching all of
+ the commas to periods, insmod won't interpret this as more than one
+ string and write junk into our binary image. I consider it a bug in
+ the insmod program that even if you wrap your string in quotes (quotes
+ that pass the shell mind you and that insmod sees) it still treats
+ a comma inside of those quotes as starting a new variable, resulting
+ in memory scribbles if you don't switch the commas to periods.
+
+
+ Kernel Compile options
+ ------------------------------
+ The various kernel compile time options for this driver are now fairly
+ well documented in the file Documentation/Configure.help. In order to
+ see this documentation, you need to use one of the advanced configuration
+ programs (menuconfig and xconfig). If you are using the "make menuconfig"
+ method of configuring your kernel, then you would simply highlight the
+ option in question and hit the ? key. If you are using the "make xconfig"
+ method of configuring your kernel, then simply click on the help button
+ next to the option you have questions about. The help information from
+ the Configure.help file will then get automatically displayed.
+
+ /proc support
+ ------------------------------
+ The /proc support for the AIC7xxx can be found in the /proc/scsi/aic7xxx/
+ directory. That directory contains a file for each SCSI controller in
+ the system. Each file presents the current configuration and transfer
+ statistics (enabled with #define in aic7xxx.c) for each controller.
+
+ Thanks to Michael Neuffer for his upper-level SCSI help, and
+ Matthew Jacob for statistics support.
+
+ Debugging the driver
+ ------------------------------
+ Should you have problems with this driver, and would like some help in
+ getting them solved, there are a couple debugging items built into
+ the driver to facilitate getting the needed information from the system.
+ In general, I need a complete description of the problem, with as many
+ logs as possible concerning what happens. To help with this, there is
+ a command option aic7xxx=panic_on_abort. This option, when set, forces
+ the driver to panic the kernel on the first SCSI abort issued by the
+ mid level SCSI code. If your system is going to reset loops and you
+ can't read the screen, then this is what you need. Not only will it
+ stop the system, but it also prints out a large amount of state
+ information in the process. Second, if you specify the option
+ "aic7xxx=verbose:0x1ffff", the system will print out *SOOOO* much
+ information as it runs that you won't be able to see anything.
+ However, this can actually be very useful if your machine simply
+ locks up when trying to boot, since it will pin-point what was last
+ happening (in regards to the aic7xxx driver) immediately prior to
+ the lockup. This is really only useful if your machine simply can
+ not boot up successfully. If you can get your machine to run, then
+ this will produce far too much information.
+
+ FTP sites
+ ------------------------------
+ ftp://ftp.redhat.com/pub/aic/
+ - Out of date. I used to keep stuff here, but too many people
+ complained about having a hard time getting into Red Hat's ftp
+ server. So use the web site below instead.
+ ftp://ftp.pcnet.com/users/eischen/Linux/
+ - Dan Eischen's driver distribution area
+ ftp://ekf2.vsb.cz/pub/linux/kernel/aic7xxx/ftp.teleport.com/
+ - European Linux mirror of Teleport site
+
+ Web sites
+ ------------------------------
+ http://people.redhat.com/dledford/
+ - My web site, also the primary aic7xxx site with several related
+ pages.
+
+Dean W. Gehnert
+deang@teleport.com
+
+$Revision: 3.0 $
+
+Modified by Doug Ledford 1998-2000
+
diff --git a/Documentation/scsi/arcmsr_spec.txt b/Documentation/scsi/arcmsr_spec.txt
new file mode 100644
index 0000000..45d9482
--- /dev/null
+++ b/Documentation/scsi/arcmsr_spec.txt
@@ -0,0 +1,574 @@
+*******************************************************************************
+** ARECA FIRMWARE SPEC
+*******************************************************************************
+** Usage of IOP331 adapter
+** (All In/Out is in IOP331's view)
+** 1. Message 0 --> InitThread message and return code
+** 2. Doorbell is used for RS-232 emulation
+** inDoorBell : bit0 -- data in ready
+** (DRIVER DATA WRITE OK)
+** bit1 -- data out has been read
+** (DRIVER DATA READ OK)
+** outDooeBell: bit0 -- data out ready
+** (IOP331 DATA WRITE OK)
+** bit1 -- data in has been read
+** (IOP331 DATA READ OK)
+** 3. Index Memory Usage
+** offset 0xf00 : for RS232 out (request buffer)
+** offset 0xe00 : for RS232 in (scratch buffer)
+** offset 0xa00 : for inbound message code message_rwbuffer
+** (driver send to IOP331)
+** offset 0xa00 : for outbound message code message_rwbuffer
+** (IOP331 send to driver)
+** 4. RS-232 emulation
+** Currently 128 byte buffer is used
+** 1st uint32_t : Data length (1--124)
+** Byte 4--127 : Max 124 bytes of data
+** 5. PostQ
+** All SCSI Command must be sent through postQ:
+** (inbound queue port) Request frame must be 32 bytes aligned
+** #bit27--bit31 => flag for post ccb
+** #bit0--bit26 => real address (bit27--bit31) of post arcmsr_cdb
+** bit31 :
+** 0 : 256 bytes frame
+** 1 : 512 bytes frame
+** bit30 :
+** 0 : normal request
+** 1 : BIOS request
+** bit29 : reserved
+** bit28 : reserved
+** bit27 : reserved
+** ---------------------------------------------------------------------------
+** (outbount queue port) Request reply
+** #bit27--bit31
+** => flag for reply
+** #bit0--bit26
+** => real address (bit27--bit31) of reply arcmsr_cdb
+** bit31 : must be 0 (for this type of reply)
+** bit30 : reserved for BIOS handshake
+** bit29 : reserved
+** bit28 :
+** 0 : no error, ignore AdapStatus/DevStatus/SenseData
+** 1 : Error, error code in AdapStatus/DevStatus/SenseData
+** bit27 : reserved
+** 6. BIOS request
+** All BIOS request is the same with request from PostQ
+** Except :
+** Request frame is sent from configuration space
+** offset: 0x78 : Request Frame (bit30 == 1)
+** offset: 0x18 : writeonly to generate
+** IRQ to IOP331
+** Completion of request:
+** (bit30 == 0, bit28==err flag)
+** 7. Definition of SGL entry (structure)
+** 8. Message1 Out - Diag Status Code (????)
+** 9. Message0 message code :
+** 0x00 : NOP
+** 0x01 : Get Config
+** ->offset 0xa00 :for outbound message code message_rwbuffer
+** (IOP331 send to driver)
+** Signature 0x87974060(4)
+** Request len 0x00000200(4)
+** numbers of queue 0x00000100(4)
+** SDRAM Size 0x00000100(4)-->256 MB
+** IDE Channels 0x00000008(4)
+** vendor 40 bytes char
+** model 8 bytes char
+** FirmVer 16 bytes char
+** Device Map 16 bytes char
+** FirmwareVersion DWORD <== Added for checking of
+** new firmware capability
+** 0x02 : Set Config
+** ->offset 0xa00 :for inbound message code message_rwbuffer
+** (driver send to IOP331)
+** Signature 0x87974063(4)
+** UPPER32 of Request Frame (4)-->Driver Only
+** 0x03 : Reset (Abort all queued Command)
+** 0x04 : Stop Background Activity
+** 0x05 : Flush Cache
+** 0x06 : Start Background Activity
+** (re-start if background is halted)
+** 0x07 : Check If Host Command Pending
+** (Novell May Need This Function)
+** 0x08 : Set controller time
+** ->offset 0xa00 : for inbound message code message_rwbuffer
+** (driver to IOP331)
+** byte 0 : 0xaa <-- signature
+** byte 1 : 0x55 <-- signature
+** byte 2 : year (04)
+** byte 3 : month (1..12)
+** byte 4 : date (1..31)
+** byte 5 : hour (0..23)
+** byte 6 : minute (0..59)
+** byte 7 : second (0..59)
+*******************************************************************************
+*******************************************************************************
+** RS-232 Interface for Areca Raid Controller
+** The low level command interface is exclusive with VT100 terminal
+** --------------------------------------------------------------------
+** 1. Sequence of command execution
+** --------------------------------------------------------------------
+** (A) Header : 3 bytes sequence (0x5E, 0x01, 0x61)
+** (B) Command block : variable length of data including length,
+** command code, data and checksum byte
+** (C) Return data : variable length of data
+** --------------------------------------------------------------------
+** 2. Command block
+** --------------------------------------------------------------------
+** (A) 1st byte : command block length (low byte)
+** (B) 2nd byte : command block length (high byte)
+** note ..command block length shouldn't > 2040 bytes,
+** length excludes these two bytes
+** (C) 3rd byte : command code
+** (D) 4th and following bytes : variable length data bytes
+** depends on command code
+** (E) last byte : checksum byte (sum of 1st byte until last data byte)
+** --------------------------------------------------------------------
+** 3. Command code and associated data
+** --------------------------------------------------------------------
+** The following are command code defined in raid controller Command
+** code 0x10--0x1? are used for system level management,
+** no password checking is needed and should be implemented in separate
+** well controlled utility and not for end user access.
+** Command code 0x20--0x?? always check the password,
+** password must be entered to enable these command.
+** enum
+** {
+** GUI_SET_SERIAL=0x10,
+** GUI_SET_VENDOR,
+** GUI_SET_MODEL,
+** GUI_IDENTIFY,
+** GUI_CHECK_PASSWORD,
+** GUI_LOGOUT,
+** GUI_HTTP,
+** GUI_SET_ETHERNET_ADDR,
+** GUI_SET_LOGO,
+** GUI_POLL_EVENT,
+** GUI_GET_EVENT,
+** GUI_GET_HW_MONITOR,
+** // GUI_QUICK_CREATE=0x20, (function removed)
+** GUI_GET_INFO_R=0x20,
+** GUI_GET_INFO_V,
+** GUI_GET_INFO_P,
+** GUI_GET_INFO_S,
+** GUI_CLEAR_EVENT,
+** GUI_MUTE_BEEPER=0x30,
+** GUI_BEEPER_SETTING,
+** GUI_SET_PASSWORD,
+** GUI_HOST_INTERFACE_MODE,
+** GUI_REBUILD_PRIORITY,
+** GUI_MAX_ATA_MODE,
+** GUI_RESET_CONTROLLER,
+** GUI_COM_PORT_SETTING,
+** GUI_NO_OPERATION,
+** GUI_DHCP_IP,
+** GUI_CREATE_PASS_THROUGH=0x40,
+** GUI_MODIFY_PASS_THROUGH,
+** GUI_DELETE_PASS_THROUGH,
+** GUI_IDENTIFY_DEVICE,
+** GUI_CREATE_RAIDSET=0x50,
+** GUI_DELETE_RAIDSET,
+** GUI_EXPAND_RAIDSET,
+** GUI_ACTIVATE_RAIDSET,
+** GUI_CREATE_HOT_SPARE,
+** GUI_DELETE_HOT_SPARE,
+** GUI_CREATE_VOLUME=0x60,
+** GUI_MODIFY_VOLUME,
+** GUI_DELETE_VOLUME,
+** GUI_START_CHECK_VOLUME,
+** GUI_STOP_CHECK_VOLUME
+** };
+** Command description :
+** GUI_SET_SERIAL : Set the controller serial#
+** byte 0,1 : length
+** byte 2 : command code 0x10
+** byte 3 : password length (should be 0x0f)
+** byte 4-0x13 : should be "ArEcATecHnoLogY"
+** byte 0x14--0x23 : Serial number string (must be 16 bytes)
+** GUI_SET_VENDOR : Set vendor string for the controller
+** byte 0,1 : length
+** byte 2 : command code 0x11
+** byte 3 : password length (should be 0x08)
+** byte 4-0x13 : should be "ArEcAvAr"
+** byte 0x14--0x3B : vendor string (must be 40 bytes)
+** GUI_SET_MODEL : Set the model name of the controller
+** byte 0,1 : length
+** byte 2 : command code 0x12
+** byte 3 : password length (should be 0x08)
+** byte 4-0x13 : should be "ArEcAvAr"
+** byte 0x14--0x1B : model string (must be 8 bytes)
+** GUI_IDENTIFY : Identify device
+** byte 0,1 : length
+** byte 2 : command code 0x13
+** return "Areca RAID Subsystem "
+** GUI_CHECK_PASSWORD : Verify password
+** byte 0,1 : length
+** byte 2 : command code 0x14
+** byte 3 : password length
+** byte 4-0x?? : user password to be checked
+** GUI_LOGOUT : Logout GUI (force password checking on next command)
+** byte 0,1 : length
+** byte 2 : command code 0x15
+** GUI_HTTP : HTTP interface (reserved for Http proxy service)(0x16)
+**
+** GUI_SET_ETHERNET_ADDR : Set the ethernet MAC address
+** byte 0,1 : length
+** byte 2 : command code 0x17
+** byte 3 : password length (should be 0x08)
+** byte 4-0x13 : should be "ArEcAvAr"
+** byte 0x14--0x19 : Ethernet MAC address (must be 6 bytes)
+** GUI_SET_LOGO : Set logo in HTTP
+** byte 0,1 : length
+** byte 2 : command code 0x18
+** byte 3 : Page# (0/1/2/3) (0xff --> clear OEM logo)
+** byte 4/5/6/7 : 0x55/0xaa/0xa5/0x5a
+** byte 8 : TITLE.JPG data (each page must be 2000 bytes)
+** note page0 1st 2 byte must be
+** actual length of the JPG file
+** GUI_POLL_EVENT : Poll If Event Log Changed
+** byte 0,1 : length
+** byte 2 : command code 0x19
+** GUI_GET_EVENT : Read Event
+** byte 0,1 : length
+** byte 2 : command code 0x1a
+** byte 3 : Event Page (0:1st page/1/2/3:last page)
+** GUI_GET_HW_MONITOR : Get HW monitor data
+** byte 0,1 : length
+** byte 2 : command code 0x1b
+** byte 3 : # of FANs(example 2)
+** byte 4 : # of Voltage sensor(example 3)
+** byte 5 : # of temperature sensor(example 2)
+** byte 6 : # of power
+** byte 7/8 : Fan#0 (RPM)
+** byte 9/10 : Fan#1
+** byte 11/12 : Voltage#0 original value in *1000
+** byte 13/14 : Voltage#0 value
+** byte 15/16 : Voltage#1 org
+** byte 17/18 : Voltage#1
+** byte 19/20 : Voltage#2 org
+** byte 21/22 : Voltage#2
+** byte 23 : Temp#0
+** byte 24 : Temp#1
+** byte 25 : Power indicator (bit0 : power#0,
+** bit1 : power#1)
+** byte 26 : UPS indicator
+** GUI_QUICK_CREATE : Quick create raid/volume set
+** byte 0,1 : length
+** byte 2 : command code 0x20
+** byte 3/4/5/6 : raw capacity
+** byte 7 : raid level
+** byte 8 : stripe size
+** byte 9 : spare
+** byte 10/11/12/13: device mask (the devices to create raid/volume)
+** This function is removed, application like
+** to implement quick create function
+** need to use GUI_CREATE_RAIDSET and GUI_CREATE_VOLUMESET function.
+** GUI_GET_INFO_R : Get Raid Set Information
+** byte 0,1 : length
+** byte 2 : command code 0x20
+** byte 3 : raidset#
+** typedef struct sGUI_RAIDSET
+** {
+** BYTE grsRaidSetName[16];
+** DWORD grsCapacity;
+** DWORD grsCapacityX;
+** DWORD grsFailMask;
+** BYTE grsDevArray[32];
+** BYTE grsMemberDevices;
+** BYTE grsNewMemberDevices;
+** BYTE grsRaidState;
+** BYTE grsVolumes;
+** BYTE grsVolumeList[16];
+** BYTE grsRes1;
+** BYTE grsRes2;
+** BYTE grsRes3;
+** BYTE grsFreeSegments;
+** DWORD grsRawStripes[8];
+** DWORD grsRes4;
+** DWORD grsRes5; // Total to 128 bytes
+** DWORD grsRes6; // Total to 128 bytes
+** } sGUI_RAIDSET, *pGUI_RAIDSET;
+** GUI_GET_INFO_V : Get Volume Set Information
+** byte 0,1 : length
+** byte 2 : command code 0x21
+** byte 3 : volumeset#
+** typedef struct sGUI_VOLUMESET
+** {
+** BYTE gvsVolumeName[16]; // 16
+** DWORD gvsCapacity;
+** DWORD gvsCapacityX;
+** DWORD gvsFailMask;
+** DWORD gvsStripeSize;
+** DWORD gvsNewFailMask;
+** DWORD gvsNewStripeSize;
+** DWORD gvsVolumeStatus;
+** DWORD gvsProgress; // 32
+** sSCSI_ATTR gvsScsi;
+** BYTE gvsMemberDisks;
+** BYTE gvsRaidLevel; // 8
+** BYTE gvsNewMemberDisks;
+** BYTE gvsNewRaidLevel;
+** BYTE gvsRaidSetNumber;
+** BYTE gvsRes0; // 4
+** BYTE gvsRes1[4]; // 64 bytes
+** } sGUI_VOLUMESET, *pGUI_VOLUMESET;
+** GUI_GET_INFO_P : Get Physical Drive Information
+** byte 0,1 : length
+** byte 2 : command code 0x22
+** byte 3 : drive # (from 0 to max-channels - 1)
+** typedef struct sGUI_PHY_DRV
+** {
+** BYTE gpdModelName[40];
+** BYTE gpdSerialNumber[20];
+** BYTE gpdFirmRev[8];
+** DWORD gpdCapacity;
+** DWORD gpdCapacityX; // Reserved for expansion
+** BYTE gpdDeviceState;
+** BYTE gpdPioMode;
+** BYTE gpdCurrentUdmaMode;
+** BYTE gpdUdmaMode;
+** BYTE gpdDriveSelect;
+** BYTE gpdRaidNumber; // 0xff if not belongs to a raid set
+** sSCSI_ATTR gpdScsi;
+** BYTE gpdReserved[40]; // Total to 128 bytes
+** } sGUI_PHY_DRV, *pGUI_PHY_DRV;
+** GUI_GET_INFO_S : Get System Information
+** byte 0,1 : length
+** byte 2 : command code 0x23
+** typedef struct sCOM_ATTR
+** {
+** BYTE comBaudRate;
+** BYTE comDataBits;
+** BYTE comStopBits;
+** BYTE comParity;
+** BYTE comFlowControl;
+** } sCOM_ATTR, *pCOM_ATTR;
+** typedef struct sSYSTEM_INFO
+** {
+** BYTE gsiVendorName[40];
+** BYTE gsiSerialNumber[16];
+** BYTE gsiFirmVersion[16];
+** BYTE gsiBootVersion[16];
+** BYTE gsiMbVersion[16];
+** BYTE gsiModelName[8];
+** BYTE gsiLocalIp[4];
+** BYTE gsiCurrentIp[4];
+** DWORD gsiTimeTick;
+** DWORD gsiCpuSpeed;
+** DWORD gsiICache;
+** DWORD gsiDCache;
+** DWORD gsiScache;
+** DWORD gsiMemorySize;
+** DWORD gsiMemorySpeed;
+** DWORD gsiEvents;
+** BYTE gsiMacAddress[6];
+** BYTE gsiDhcp;
+** BYTE gsiBeeper;
+** BYTE gsiChannelUsage;
+** BYTE gsiMaxAtaMode;
+** BYTE gsiSdramEcc; // 1:if ECC enabled
+** BYTE gsiRebuildPriority;
+** sCOM_ATTR gsiComA; // 5 bytes
+** sCOM_ATTR gsiComB; // 5 bytes
+** BYTE gsiIdeChannels;
+** BYTE gsiScsiHostChannels;
+** BYTE gsiIdeHostChannels;
+** BYTE gsiMaxVolumeSet;
+** BYTE gsiMaxRaidSet;
+** BYTE gsiEtherPort; // 1:if ether net port supported
+** BYTE gsiRaid6Engine; // 1:Raid6 engine supported
+** BYTE gsiRes[75];
+** } sSYSTEM_INFO, *pSYSTEM_INFO;
+** GUI_CLEAR_EVENT : Clear System Event
+** byte 0,1 : length
+** byte 2 : command code 0x24
+** GUI_MUTE_BEEPER : Mute current beeper
+** byte 0,1 : length
+** byte 2 : command code 0x30
+** GUI_BEEPER_SETTING : Disable beeper
+** byte 0,1 : length
+** byte 2 : command code 0x31
+** byte 3 : 0->disable, 1->enable
+** GUI_SET_PASSWORD : Change password
+** byte 0,1 : length
+** byte 2 : command code 0x32
+** byte 3 : pass word length ( must <= 15 )
+** byte 4 : password (must be alpha-numerical)
+** GUI_HOST_INTERFACE_MODE : Set host interface mode
+** byte 0,1 : length
+** byte 2 : command code 0x33
+** byte 3 : 0->Independent, 1->cluster
+** GUI_REBUILD_PRIORITY : Set rebuild priority
+** byte 0,1 : length
+** byte 2 : command code 0x34
+** byte 3 : 0/1/2/3 (low->high)
+** GUI_MAX_ATA_MODE : Set maximum ATA mode to be used
+** byte 0,1 : length
+** byte 2 : command code 0x35
+** byte 3 : 0/1/2/3 (133/100/66/33)
+** GUI_RESET_CONTROLLER : Reset Controller
+** byte 0,1 : length
+** byte 2 : command code 0x36
+** *Response with VT100 screen (discard it)
+** GUI_COM_PORT_SETTING : COM port setting
+** byte 0,1 : length
+** byte 2 : command code 0x37
+** byte 3 : 0->COMA (term port),
+** 1->COMB (debug port)
+** byte 4 : 0/1/2/3/4/5/6/7
+** (1200/2400/4800/9600/19200/38400/57600/115200)
+** byte 5 : data bit
+** (0:7 bit, 1:8 bit : must be 8 bit)
+** byte 6 : stop bit (0:1, 1:2 stop bits)
+** byte 7 : parity (0:none, 1:off, 2:even)
+** byte 8 : flow control
+** (0:none, 1:xon/xoff, 2:hardware => must use none)
+** GUI_NO_OPERATION : No operation
+** byte 0,1 : length
+** byte 2 : command code 0x38
+** GUI_DHCP_IP : Set DHCP option and local IP address
+** byte 0,1 : length
+** byte 2 : command code 0x39
+** byte 3 : 0:dhcp disabled, 1:dhcp enabled
+** byte 4/5/6/7 : IP address
+** GUI_CREATE_PASS_THROUGH : Create pass through disk
+** byte 0,1 : length
+** byte 2 : command code 0x40
+** byte 3 : device #
+** byte 4 : scsi channel (0/1)
+** byte 5 : scsi id (0-->15)
+** byte 6 : scsi lun (0-->7)
+** byte 7 : tagged queue (1 : enabled)
+** byte 8 : cache mode (1 : enabled)
+** byte 9 : max speed (0/1/2/3/4,
+** async/20/40/80/160 for scsi)
+** (0/1/2/3/4, 33/66/100/133/150 for ide )
+** GUI_MODIFY_PASS_THROUGH : Modify pass through disk
+** byte 0,1 : length
+** byte 2 : command code 0x41
+** byte 3 : device #
+** byte 4 : scsi channel (0/1)
+** byte 5 : scsi id (0-->15)
+** byte 6 : scsi lun (0-->7)
+** byte 7 : tagged queue (1 : enabled)
+** byte 8 : cache mode (1 : enabled)
+** byte 9 : max speed (0/1/2/3/4,
+** async/20/40/80/160 for scsi)
+** (0/1/2/3/4, 33/66/100/133/150 for ide )
+** GUI_DELETE_PASS_THROUGH : Delete pass through disk
+** byte 0,1 : length
+** byte 2 : command code 0x42
+** byte 3 : device# to be deleted
+** GUI_IDENTIFY_DEVICE : Identify Device
+** byte 0,1 : length
+** byte 2 : command code 0x43
+** byte 3 : Flash Method
+** (0:flash selected, 1:flash not selected)
+** byte 4/5/6/7 : IDE device mask to be flashed
+** note .... no response data available
+** GUI_CREATE_RAIDSET : Create Raid Set
+** byte 0,1 : length
+** byte 2 : command code 0x50
+** byte 3/4/5/6 : device mask
+** byte 7-22 : raidset name (if byte 7 == 0:use default)
+** GUI_DELETE_RAIDSET : Delete Raid Set
+** byte 0,1 : length
+** byte 2 : command code 0x51
+** byte 3 : raidset#
+** GUI_EXPAND_RAIDSET : Expand Raid Set
+** byte 0,1 : length
+** byte 2 : command code 0x52
+** byte 3 : raidset#
+** byte 4/5/6/7 : device mask for expansion
+** byte 8/9/10 : (8:0 no change, 1 change, 0xff:terminate,
+** 9:new raid level,
+** 10:new stripe size
+** 0/1/2/3/4/5->4/8/16/32/64/128K )
+** byte 11/12/13 : repeat for each volume in the raidset
+** GUI_ACTIVATE_RAIDSET : Activate incomplete raid set
+** byte 0,1 : length
+** byte 2 : command code 0x53
+** byte 3 : raidset#
+** GUI_CREATE_HOT_SPARE : Create hot spare disk
+** byte 0,1 : length
+** byte 2 : command code 0x54
+** byte 3/4/5/6 : device mask for hot spare creation
+** GUI_DELETE_HOT_SPARE : Delete hot spare disk
+** byte 0,1 : length
+** byte 2 : command code 0x55
+** byte 3/4/5/6 : device mask for hot spare deletion
+** GUI_CREATE_VOLUME : Create volume set
+** byte 0,1 : length
+** byte 2 : command code 0x60
+** byte 3 : raidset#
+** byte 4-19 : volume set name
+** (if byte4 == 0, use default)
+** byte 20-27 : volume capacity (blocks)
+** byte 28 : raid level
+** byte 29 : stripe size
+** (0/1/2/3/4/5->4/8/16/32/64/128K)
+** byte 30 : channel
+** byte 31 : ID
+** byte 32 : LUN
+** byte 33 : 1 enable tag
+** byte 34 : 1 enable cache
+** byte 35 : speed
+** (0/1/2/3/4->async/20/40/80/160 for scsi)
+** (0/1/2/3/4->33/66/100/133/150 for IDE )
+** byte 36 : 1 to select quick init
+**
+** GUI_MODIFY_VOLUME : Modify volume Set
+** byte 0,1 : length
+** byte 2 : command code 0x61
+** byte 3 : volumeset#
+** byte 4-19 : new volume set name
+** (if byte4 == 0, not change)
+** byte 20-27 : new volume capacity (reserved)
+** byte 28 : new raid level
+** byte 29 : new stripe size
+** (0/1/2/3/4/5->4/8/16/32/64/128K)
+** byte 30 : new channel
+** byte 31 : new ID
+** byte 32 : new LUN
+** byte 33 : 1 enable tag
+** byte 34 : 1 enable cache
+** byte 35 : speed
+** (0/1/2/3/4->async/20/40/80/160 for scsi)
+** (0/1/2/3/4->33/66/100/133/150 for IDE )
+** GUI_DELETE_VOLUME : Delete volume set
+** byte 0,1 : length
+** byte 2 : command code 0x62
+** byte 3 : volumeset#
+** GUI_START_CHECK_VOLUME : Start volume consistency check
+** byte 0,1 : length
+** byte 2 : command code 0x63
+** byte 3 : volumeset#
+** GUI_STOP_CHECK_VOLUME : Stop volume consistency check
+** byte 0,1 : length
+** byte 2 : command code 0x64
+** ---------------------------------------------------------------------
+** 4. Returned data
+** ---------------------------------------------------------------------
+** (A) Header : 3 bytes sequence (0x5E, 0x01, 0x61)
+** (B) Length : 2 bytes
+** (low byte 1st, excludes length and checksum byte)
+** (C) status or data :
+** <1> If length == 1 ==> 1 byte status code
+** #define GUI_OK 0x41
+** #define GUI_RAIDSET_NOT_NORMAL 0x42
+** #define GUI_VOLUMESET_NOT_NORMAL 0x43
+** #define GUI_NO_RAIDSET 0x44
+** #define GUI_NO_VOLUMESET 0x45
+** #define GUI_NO_PHYSICAL_DRIVE 0x46
+** #define GUI_PARAMETER_ERROR 0x47
+** #define GUI_UNSUPPORTED_COMMAND 0x48
+** #define GUI_DISK_CONFIG_CHANGED 0x49
+** #define GUI_INVALID_PASSWORD 0x4a
+** #define GUI_NO_DISK_SPACE 0x4b
+** #define GUI_CHECKSUM_ERROR 0x4c
+** #define GUI_PASSWORD_REQUIRED 0x4d
+** <2> If length > 1 ==>
+** data block returned from controller
+** and the contents depends on the command code
+** (E) Checksum : checksum of length and status or data byte
+**************************************************************************
diff --git a/Documentation/scsi/dc395x.txt b/Documentation/scsi/dc395x.txt
new file mode 100644
index 0000000..88219f9
--- /dev/null
+++ b/Documentation/scsi/dc395x.txt
@@ -0,0 +1,102 @@
+README file for the dc395x SCSI driver
+==========================================
+
+Status
+------
+The driver has been tested with CD-R and CD-R/W drives. These should
+be safe to use. Testing with hard disks has not been done to any
+great degree and caution should be exercised if you want to attempt
+to use this driver with hard disks.
+
+This is a 2.5 only driver. For a 2.4 driver please see the original
+driver (which this driver started from) at
+http://www.garloff.de/kurt/linux/dc395/
+
+Problems, questions and patches should be submitted to the mailing
+list. Details on the list, including archives, are available at
+http://lists.twibble.org/mailman/listinfo/dc395x/
+
+Parameters
+----------
+The driver uses the settings from the EEPROM set in the SCSI BIOS
+setup. If there is no EEPROM, the driver uses default values.
+Both can be overridden by command line parameters (module or kernel
+parameters).
+
+The following parameters are available:
+
+ - safe
+ Default: 0, Acceptable values: 0 or 1
+
+ If safe is set to 1 then the adapter will use conservative
+ ("safe") default settings. This sets:
+
+ shortcut for dc395x=7,4,9,15,2,10
+
+ - adapter_id
+ Default: 7, Acceptable values: 0 to 15
+
+ Sets the host adapter SCSI ID.
+
+ - max_speed
+ Default: 1, Acceptable value: 0 to 7
+ 0 = 20 Mhz
+ 1 = 12.2 Mhz
+ 2 = 10 Mhz
+ 3 = 8 Mhz
+ 4 = 6.7 Mhz
+ 5 = 5.8 Hhz
+ 6 = 5 Mhz
+ 7 = 4 Mhz
+
+ - dev_mode
+ Bitmap for device configuration
+
+ DevMode bit definition:
+ Bit Val(hex) Val(dec) Meaning
+ *0 0x01 1 Parity check
+ *1 0x02 2 Synchronous Negotiation
+ *2 0x04 4 Disconnection
+ *3 0x08 8 Send Start command on startup. (Not used)
+ *4 0x10 16 Tagged Command Queueing
+ *5 0x20 32 Wide Negotiation
+
+ - adapter_mode
+ Bitmap for adapter configuration
+
+ AdaptMode bit definition
+ Bit Val(hex) Val(dec) Meaning
+ *0 0x01 1 Support more than two drives. (Not used)
+ *1 0x02 2 Use DOS compatible mapping for HDs greater than 1GB.
+ *2 0x04 4 Reset SCSI Bus on startup.
+ *3 0x08 8 Active Negation: Improves SCSI Bus noise immunity.
+ 4 0x10 16 Immediate return on BIOS seek command. (Not used)
+ (*)5 0x20 32 Check for LUNs >= 1.
+
+ - tags
+ Default: 3, Acceptable values: 0-5
+
+ The number of tags is 1<<x, if x has been specified
+
+ - reset_delay
+ Default: 1, Acceptable values: 0-180
+
+ The seconds to not accept commands after a SCSI Reset
+
+
+For the built in driver the parameters should be prefixed with
+dc395x. (eg "dc395x.safe=1")
+
+
+Copyright
+---------
+The driver is free software. It is protected by the GNU General Public
+License (GPL). Please read it, before using this driver. It should be
+included in your kernel sources and with your distribution. It carries the
+filename COPYING. If you don't have it, please ask me to send you one by
+email.
+Note: The GNU GPL says also something about warranty and liability.
+Please be aware the following: While we do my best to provide a working and
+reliable driver, there is a chance, that it will kill your valuable data.
+We refuse to take any responsibility for that. The driver is provided as-is
+and YOU USE IT AT YOUR OWN RESPONSIBILITY.
diff --git a/Documentation/scsi/dpti.txt b/Documentation/scsi/dpti.txt
new file mode 100644
index 0000000..f36dc0e
--- /dev/null
+++ b/Documentation/scsi/dpti.txt
@@ -0,0 +1,83 @@
+ /* TERMS AND CONDITIONS OF USE
+ *
+ * Redistribution and use in source form, with or without modification, are
+ * permitted provided that redistributions of source code must retain the
+ * above copyright notice, this list of conditions and the following disclaimer.
+ *
+ * This software is provided `as is' by Adaptec and
+ * any express or implied warranties, including, but not limited to, the
+ * implied warranties of merchantability and fitness for a particular purpose,
+ * are disclaimed. In no event shall Adaptec be
+ * liable for any direct, indirect, incidental, special, exemplary or
+ * consequential damages (including, but not limited to, procurement of
+ * substitute goods or services; loss of use, data, or profits; or business
+ * interruptions) however caused and on any theory of liability, whether in
+ * contract, strict liability, or tort (including negligence or otherwise)
+ * arising in any way out of the use of this driver software, even if advised
+ * of the possibility of such damage.
+ *
+ ****************************************************************
+ * This driver supports the Adaptec I2O RAID and DPT SmartRAID V I2O boards.
+ *
+ * CREDITS:
+ * The original linux driver was ported to Linux by Karen White while at
+ * Dell Computer. It was ported from Bob Pasteur's (of DPT) original
+ * non-Linux driver. Mark Salyzyn and Bob Pasteur consulted on the original
+ * driver.
+ *
+ * 2.0 version of the driver by Deanna Bonds and Mark Salyzyn.
+ *
+ * HISTORY:
+ * The driver was originally ported to linux version 2.0.34
+ *
+ * V2.0 Rewrite of driver. Re-architectured based on i2o subsystem.
+ * This was the first full GPL version since the last version used
+ * i2osig headers which were not GPL. Developer Testing version.
+ * V2.1 Internal testing
+ * V2.2 First released version
+ *
+ * V2.3
+ * Changes:
+ * Added Raptor Support
+ * Fixed bug causing system to hang under extreme load with
+ * management utilities running (removed GFP_DMA from kmalloc flags)
+ *
+ *
+ * V2.4 First version ready to be submitted to be embedded in the kernel
+ * Changes:
+ * Implemented suggestions from Alan Cox
+ * Added calculation of resid for sg layer
+ * Better error handling
+ * Added checking underflow conditions
+ * Added DATAPROTECT checking
+ * Changed error return codes
+ * Fixed pointer bug in bus reset routine
+ * Enabled hba reset from ioctls (allows a FW flash to reboot and use the new
+ * FW without having to reboot)
+ * Changed proc output
+ *
+ * TODO:
+ * Add 64 bit Scatter Gather when compiled on 64 bit architectures
+ * Add sparse lun scanning
+ * Add code that checks if a device that had been taken offline is
+ * now online (at the FW level) when test unit ready or inquiry
+ * command from scsi-core
+ * Add proc read interface
+ * busrescan command
+ * rescan command
+ * Add code to rescan routine that notifies scsi-core about new devices
+ * Add support for C-PCI (hotplug stuff)
+ * Add ioctl passthru error recovery
+ *
+ * NOTES:
+ * The DPT card optimizes the order of processing commands. Consequently,
+ * a command may take up to 6 minutes to complete after it has been sent
+ * to the board.
+ *
+ * The files dpti_ioctl.h dptsig.h osd_defs.h osd_util.h sys_info.h are part of the
+ * interface files for Adaptec's management routines. These define the structures used
+ * in the ioctls. They are written to be portable. They are hard to read, but I need
+ * to use them 'as is' or I can miss changes in the interface.
+ *
+ */
+
diff --git a/Documentation/scsi/dtc3x80.txt b/Documentation/scsi/dtc3x80.txt
new file mode 100644
index 0000000..e8ae623
--- /dev/null
+++ b/Documentation/scsi/dtc3x80.txt
@@ -0,0 +1,43 @@
+README file for the Linux DTC3180/3280 scsi driver.
+by Ray Van Tassle (rayvt@comm.mot.com) March 1996
+Based on the generic & core NCR5380 code by Drew Eckhard
+
+SCSI device driver for the DTC 3180/3280.
+Data Technology Corp---a division of Qume.
+
+The 3280 has a standard floppy interface.
+
+The 3180 does not. Otherwise, they are identical.
+
+The DTC3x80 does not support DMA but it does have Pseudo-DMA which is
+supported by the driver.
+
+It's DTC406 scsi chip is supposedly compatible with the NCR 53C400.
+It is memory mapped, uses an IRQ, but no dma or io-port. There is
+internal DMA, between SCSI bus and an on-chip 128-byte buffer. Double
+buffering is done automagically by the chip. Data is transferred
+between the on-chip buffer and CPU/RAM via memory moves.
+
+The driver detects the possible memory addresses (jumper selectable):
+ CC00, DC00, C800, and D800
+The possible IRQ's (jumper selectable) are:
+ IRQ 10, 11, 12, 15
+Parity is supported by the chip, but not by this driver.
+Information can be obtained from /proc/scsi/dtc3c80/N.
+
+Note on interrupts:
+
+The documentation says that it can be set to interrupt whenever the
+on-chip buffer needs CPU attention. I couldn't get this to work. So
+the driver polls for data-ready in the pseudo-DMA transfer routine.
+The interrupt support routines in the NCR3280.c core modules handle
+scsi disconnect/reconnect, and this (mostly) works. However..... I
+have tested it with 4 totally different hard drives (both SCSI-1 and
+SCSI-2), and one CDROM drive. Interrupts works great for all but one
+specific hard drive. For this one, the driver will eventually hang in
+the transfer state. I have tested with: "dd bs=4k count=2k
+of=/dev/null if=/dev/sdb". It reads ok for a while, then hangs.
+After beating my head against this for a couple of weeks, getting
+nowhere, I give up. So.....This driver does NOT use interrupts, even
+if you have the card jumpered to an IRQ. Probably nobody will ever
+care.
diff --git a/Documentation/scsi/g_NCR5380.txt b/Documentation/scsi/g_NCR5380.txt
new file mode 100644
index 0000000..3b80f56
--- /dev/null
+++ b/Documentation/scsi/g_NCR5380.txt
@@ -0,0 +1,63 @@
+README file for the Linux g_NCR5380 driver.
+
+(c) 1993 Drew Eckhard
+NCR53c400 extensions (c) 1994,1995,1996 Kevin Lentin
+
+This file documents the NCR53c400 extensions by Kevin Lentin and some
+enhancements to the NCR5380 core.
+
+This driver supports both NCR5380 and NCR53c400 cards in port or memory
+mapped modes. Currently this driver can only support one of those mapping
+modes at a time but it does support both of these chips at the same time.
+The next release of this driver will support port & memory mapped cards at
+the same time. It should be able to handle multiple different cards in the
+same machine.
+
+The drivers/scsi/Makefile has an override in it for the most common
+NCR53c400 card, the Trantor T130B in its default configuration:
+ Port: 0x350
+ IRQ : 5
+
+The NCR53c400 does not support DMA but it does have Pseudo-DMA which is
+supported by the driver.
+
+If the default configuration does not work for you, you can use the kernel
+command lines (eg using the lilo append command):
+ ncr5380=port,irq,dma
+ ncr53c400=port,irq
+or
+ ncr5380=base,irq,dma
+ ncr53c400=base,irq
+
+The driver does not probe for any addresses or ports other than those in
+the OVERRIDE or given to the kernel as above.
+
+This driver provides some information on what it has detected in
+/proc/scsi/g_NCR5380/x where x is the scsi card number as detected at boot
+time. More info to come in the future.
+
+When NCR53c400 support is compiled in, BIOS parameters will be returned by
+the driver (the raw 5380 driver does not and I don't plan to fiddle with
+it!).
+
+This driver works as a module.
+When included as a module, parameters can be passed on the insmod/modprobe
+command line:
+ ncr_irq=xx the interrupt
+ ncr_addr=xx the port or base address (for port or memory
+ mapped, resp.)
+ ncr_dma=xx the DMA
+ ncr_5380=1 to set up for a NCR5380 board
+ ncr_53c400=1 to set up for a NCR53C400 board
+e.g.
+modprobe g_NCR5380 ncr_irq=5 ncr_addr=0x350 ncr_5380=1
+ for a port mapped NCR5380 board or
+modprobe g_NCR5380 ncr_irq=255 ncr_addr=0xc8000 ncr_53c400=1
+ for a memory mapped NCR53C400 board with interrupts disabled.
+
+(255 should be specified for no or DMA interrupt, 254 to autoprobe for an
+ IRQ line if overridden on the command line.)
+
+
+Kevin Lentin
+K.Lentin@cs.monash.edu.au
diff --git a/Documentation/scsi/hptiop.txt b/Documentation/scsi/hptiop.txt
new file mode 100644
index 0000000..a6eb4ad
--- /dev/null
+++ b/Documentation/scsi/hptiop.txt
@@ -0,0 +1,104 @@
+HIGHPOINT ROCKETRAID 3xxx/4xxx ADAPTER DRIVER (hptiop)
+
+Controller Register Map
+-------------------------
+
+For Intel IOP based adapters, the controller IOP is accessed via PCI BAR0:
+
+ BAR0 offset Register
+ 0x10 Inbound Message Register 0
+ 0x14 Inbound Message Register 1
+ 0x18 Outbound Message Register 0
+ 0x1C Outbound Message Register 1
+ 0x20 Inbound Doorbell Register
+ 0x24 Inbound Interrupt Status Register
+ 0x28 Inbound Interrupt Mask Register
+ 0x30 Outbound Interrupt Status Register
+ 0x34 Outbound Interrupt Mask Register
+ 0x40 Inbound Queue Port
+ 0x44 Outbound Queue Port
+
+For Marvell IOP based adapters, the IOP is accessed via PCI BAR0 and BAR1:
+
+ BAR0 offset Register
+ 0x20400 Inbound Doorbell Register
+ 0x20404 Inbound Interrupt Mask Register
+ 0x20408 Outbound Doorbell Register
+ 0x2040C Outbound Interrupt Mask Register
+
+ BAR1 offset Register
+ 0x0 Inbound Queue Head Pointer
+ 0x4 Inbound Queue Tail Pointer
+ 0x8 Outbound Queue Head Pointer
+ 0xC Outbound Queue Tail Pointer
+ 0x10 Inbound Message Register
+ 0x14 Outbound Message Register
+ 0x40-0x1040 Inbound Queue
+ 0x1040-0x2040 Outbound Queue
+
+
+I/O Request Workflow
+----------------------
+
+All queued requests are handled via inbound/outbound queue port.
+A request packet can be allocated in either IOP or host memory.
+
+To send a request to the controller:
+
+ - Get a free request packet by reading the inbound queue port or
+ allocate a free request in host DMA coherent memory.
+
+ The value returned from the inbound queue port is an offset
+ relative to the IOP BAR0.
+
+ Requests allocated in host memory must be aligned on 32-bytes boundary.
+
+ - Fill the packet.
+
+ - Post the packet to IOP by writing it to inbound queue. For requests
+ allocated in IOP memory, write the offset to inbound queue port. For
+ requests allocated in host memory, write (0x80000000|(bus_addr>>5))
+ to the inbound queue port.
+
+ - The IOP process the request. When the request is completed, it
+ will be put into outbound queue. An outbound interrupt will be
+ generated.
+
+ For requests allocated in IOP memory, the request offset is posted to
+ outbound queue.
+
+ For requests allocated in host memory, (0x80000000|(bus_addr>>5))
+ is posted to the outbound queue. If IOP_REQUEST_FLAG_OUTPUT_CONTEXT
+ flag is set in the request, the low 32-bit context value will be
+ posted instead.
+
+ - The host read the outbound queue and complete the request.
+
+ For requests allocated in IOP memory, the host driver free the request
+ by writing it to the outbound queue.
+
+Non-queued requests (reset/flush etc) can be sent via inbound message
+register 0. An outbound message with the same value indicates the completion
+of an inbound message.
+
+
+User-level Interface
+---------------------
+
+The driver exposes following sysfs attributes:
+
+ NAME R/W Description
+ driver-version R driver version string
+ firmware-version R firmware version string
+
+
+-----------------------------------------------------------------------------
+Copyright (C) 2006-2007 HighPoint Technologies, Inc. All Rights Reserved.
+
+ This file is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ linux@highpoint-tech.com
+ http://www.highpoint-tech.com
diff --git a/Documentation/scsi/ibmmca.txt b/Documentation/scsi/ibmmca.txt
new file mode 100644
index 0000000..3920f28
--- /dev/null
+++ b/Documentation/scsi/ibmmca.txt
@@ -0,0 +1,1402 @@
+
+ -=< The IBM Microchannel SCSI-Subsystem >=-
+
+ for the IBM PS/2 series
+
+ Low Level Software-Driver for Linux
+
+ Copyright (c) 1995 Strom Systems, Inc. under the terms of the GNU
+ General Public License. Originally written by Martin Kolinek, December 1995.
+ Officially modified and maintained by Michael Lang since January 1999.
+
+ Version 4.0a
+
+ Last update: January 3, 2001
+
+ Before you Start
+ ----------------
+ This is the common README.ibmmca file for all driver releases of the
+ IBM MCA SCSI driver for Linux. Please note, that driver releases 4.0
+ or newer do not work with kernel versions older than 2.4.0, while driver
+ versions older than 4.0 do not work with kernels 2.4.0 or later! If you
+ try to compile your kernel with the wrong driver source, the
+ compilation is aborted and you get a corresponding error message. This is
+ no bug in the driver; it prevents you from using the wrong source code
+ with the wrong kernel version.
+
+ Authors of this Driver
+ ----------------------
+ - Chris Beauregard (improvement of the SCSI-device mapping by the driver)
+ - Martin Kolinek (origin, first release of this driver)
+ - Klaus Kudielka (multiple SCSI-host management/detection, adaption to
+ Linux Kernel 2.1.x, module support)
+ - Michael Lang (assigning original pun/lun mapping, dynamical ldn
+ assignment, rewritten adapter detection, this file,
+ patches, official driver maintenance and subsequent
+ debugging, related with the driver)
+
+ Table of Contents
+ -----------------
+ 1 Abstract
+ 2 Driver Description
+ 2.1 IBM SCSI-Subsystem Detection
+ 2.2 Physical Units, Logical Units, and Logical Devices
+ 2.3 SCSI-Device Recognition and dynamical ldn Assignment
+ 2.4 SCSI-Device Order
+ 2.5 Regular SCSI-Command-Processing
+ 2.6 Abort & Reset Commands
+ 2.7 Disk Geometry
+ 2.8 Kernel Boot Option
+ 2.9 Driver Module Support
+ 2.10 Multiple Hostadapter Support
+ 2.11 /proc/scsi-Filesystem Information
+ 2.12 /proc/mca-Filesystem Information
+ 2.13 Supported IBM SCSI-Subsystems
+ 2.14 Linux Kernel Versions
+ 3 Code History
+ 4 To do
+ 5 Users' Manual
+ 5.1 Commandline Parameters
+ 5.2 Troubleshooting
+ 5.3 Bug reports
+ 5.4 Support WWW-page
+ 6 References
+ 7 Credits to
+ 7.1 People
+ 7.2 Sponsors & Supporters
+ 8 Trademarks
+ 9 Disclaimer
+
+ * * *
+
+ 1 Abstract
+ ----------
+ This README-file describes the IBM SCSI-subsystem low level driver for
+ Linux. The descriptions which were formerly kept in the source code have
+ been taken out of this file to simplify the codes readability. The driver
+ description has been updated, as most of the former description was already
+ quite outdated. The history of the driver development is also kept inside
+ here. Multiple historical developments have been summarized to shorten the
+ text size a bit. At the end of this file you can find a small manual for
+ this driver and hints to get it running on your machine.
+
+ 2 Driver Description
+ --------------------
+ 2.1 IBM SCSI-Subsystem Detection
+ --------------------------------
+ This is done in the ibmmca_detect() function. It first checks, if the
+ Microchannel-bus support is enabled, as the IBM SCSI-subsystem needs the
+ Microchannel. In a next step, a free interrupt is chosen and the main
+ interrupt handler is connected to it to handle answers of the SCSI-
+ subsystem(s). If the F/W SCSI-adapter is forced by the BIOS to use IRQ11
+ instead of IRQ14, IRQ11 is used for the IBM SCSI-2 F/W adapter. In a
+ further step it is checked, if the adapter gets detected by force from
+ the kernel commandline, where the I/O port and the SCSI-subsystem id can
+ be specified. The next step checks if there is an integrated SCSI-subsystem
+ installed. This register area is fixed through all IBM PS/2 MCA-machines
+ and appears as something like a virtual slot 10 of the MCA-bus. On most
+ PS/2 machines, the POS registers of slot 10 are set to 0xff or 0x00 if not
+ integrated SCSI-controller is available. But on certain PS/2s, like model
+ 9595, this slot 10 is used to store other information which at earlier
+ stage confused the driver and resulted in the detection of some ghost-SCSI.
+ If POS-register 2 and 3 are not 0x00 and not 0xff, but all other POS
+ registers are either 0xff or 0x00, there must be an integrated SCSI-
+ subsystem present and it will be registered as IBM Integrated SCSI-
+ Subsystem. The next step checks, if there is a slot-adapter installed on
+ the MCA-bus. To get this, the first two POS-registers, that represent the
+ adapter ID are checked. If they fit to one of the ids, stored in the
+ adapter list, a SCSI-subsystem is assumed to be found in a slot and will be
+ registered. This check is done through all possible MCA-bus slots to allow
+ more than one SCSI-adapter to be present in the PS/2-system and this is
+ already the first point of problems. Looking into the technical reference
+ manual for the IBM PS/2 common interfaces, the POS2 register must have
+ different interpretation of its single bits to avoid overlapping I/O
+ regions. While one can assume, that the integrated subsystem has a fix
+ I/O-address at 0x3540 - 0x3547, further installed IBM SCSI-adapters must
+ use a different I/O-address. This is expressed by bit 1 to 3 of POS2
+ (multiplied by 8 + 0x3540). Bits 2 and 3 are reserved for the integrated
+ subsystem, but not for the adapters! The following list shows, how the
+ bits of POS2 and POS3 should be interpreted.
+
+ The POS2-register of all PS/2 models' integrated SCSI-subsystems has the
+ following interpretation of bits:
+ Bit 7 - 4 : Chip Revision ID (Release)
+ Bit 3 - 2 : Reserved
+ Bit 1 : 8k NVRAM Disabled
+ Bit 0 : Chip Enable (EN-Signal)
+ The POS3-register is interpreted as follows (for most IBM SCSI-subsys.):
+ Bit 7 - 5 : SCSI ID
+ Bit 4 - 0 : Reserved = 0
+ The slot-adapters have different interpretation of these bits. The IBM SCSI
+ adapter (w/Cache) and the IBM SCSI-2 F/W adapter use the following
+ interpretation of the POS2 register:
+ Bit 7 - 4 : ROM Segment Address Select
+ Bit 3 - 1 : Adapter I/O Address Select (*8+0x3540)
+ Bit 0 : Adapter Enable (EN-Signal)
+ and for the POS3 register:
+ Bit 7 - 5 : SCSI ID
+ Bit 4 : Fairness Enable (SCSI ID3 f. F/W)
+ Bit 3 - 0 : Arbitration Level
+ The most modern product of the series is the IBM SCSI-2 F/W adapter, it
+ allows dual-bus SCSI and SCSI-wide addressing, which means, PUNs may be
+ between 0 and 15. Here, Bit 4 is the high-order bit of the 4-bit wide
+ adapter PUN expression. In short words, this means, that IBM PS/2 machines
+ can only support 1 single integrated subsystem by default. Additional
+ slot-adapters get ports assigned by the automatic configuration tool.
+
+ One day I found a patch in ibmmca_detect(), forcing the I/O-address to be
+ 0x3540 for integrated SCSI-subsystems, there was a remark placed, that on
+ integrated IBM SCSI-subsystems of model 56, the POS2 register was showing 5.
+ This means, that really for these models, POS2 has to be interpreted
+ sticking to the technical reference guide. In this case, the bit 2 (4) is
+ a reserved bit and may not be interpreted. These differences between the
+ adapters and the integrated controllers are taken into account by the
+ detection routine of the driver on from version >3.0g.
+
+ Every time, a SCSI-subsystem is discovered, the ibmmca_register() function
+ is called. This function checks first, if the requested area for the I/O-
+ address of this SCSI-subsystem is still available and assigns this I/O-
+ area to the SCSI-subsystem. There are always 8 sequential I/O-addresses
+ taken for each individual SCSI-subsystem found, which are:
+
+ Offset Type Permissions
+ 0 Command Interface Register 1 Read/Write
+ 1 Command Interface Register 2 Read/Write
+ 2 Command Interface Register 3 Read/Write
+ 3 Command Interface Register 4 Read/Write
+ 4 Attention Register Read/Write
+ 5 Basic Control Register Read/Write
+ 6 Interrupt Status Register Read
+ 7 Basic Status Register Read
+
+ After the I/O-address range is assigned, the host-adapter is assigned
+ to a local structure which keeps all adapter information needed for the
+ driver itself and the mid- and higher-level SCSI-drivers. The SCSI pun/lun
+ and the adapters' ldn tables are initialized and get probed afterwards by
+ the check_devices() function. If no further adapters are found,
+ ibmmca_detect() quits.
+
+ 2.2 Physical Units, Logical Units, and Logical Devices
+ ------------------------------------------------------
+ There can be up to 56 devices on the SCSI bus (besides the adapter):
+ there are up to 7 "physical units" (each identified by physical unit
+ number or pun, also called the scsi id, this is the number you select
+ with hardware jumpers), and each physical unit can have up to 8
+ "logical units" (each identified by logical unit number, or lun,
+ between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two
+ busses and provides support for 30 logical devices at the same time, where
+ in wide-addressing mode you can have 16 puns with 32 luns on each device.
+ This section describes the handling of devices on non-F/W adapters.
+ Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter
+ which means a lot of possible devices for such a small machine.
+
+ Typically the adapter has pun=7, so puns of other physical units
+ are between 0 and 6(15). On a wide-adapter a pun higher than 7 is
+ possible, but is normally not used. Almost all physical units have only
+ one logical unit, with lun=0. A CD-ROM jukebox would be an example of a
+ physical unit with more than one logical unit.
+
+ The embedded microprocessor of the IBM SCSI-subsystem hides the complex
+ two-dimensional (pun,lun) organization from the operating system.
+ When the machine is powered-up (or rebooted), the embedded microprocessor
+ checks, on its own, all 56 possible (pun,lun) combinations, and the first
+ 15 devices found are assigned into a one-dimensional array of so-called
+ "logical devices", identified by "logical device numbers" or ldn. The last
+ ldn=15 is reserved for the subsystem itself. Wide adapters may have
+ to check up to 15 * 8 = 120 pun/lun combinations.
+
+ 2.3 SCSI-Device Recognition and Dynamical ldn Assignment
+ --------------------------------------------------------
+ One consequence of information hiding is that the real (pun,lun)
+ numbers are also hidden. The two possibilities to get around this problem
+ are to offer fake pun/lun combinations to the operating system or to
+ delete the whole mapping of the adapter and to reassign the ldns, using
+ the immediate assign command of the SCSI-subsystem for probing through
+ all possible pun/lun combinations. An ldn is a "logical device number"
+ which is used by IBM SCSI-subsystems to access some valid SCSI-device.
+ At the beginning of the development of this driver, the following approach
+ was used:
+
+ First, the driver checked the ldn's (0 to 6) to find out which ldn's
+ have devices assigned. This was done by the functions check_devices() and
+ device_exists(). The interrupt handler has a special paragraph of code
+ (see local_checking_phase_flag) to assist in the checking. Assume, for
+ example, that three logical devices were found assigned at ldn 0, 1, 2.
+ These are presented to the upper layer of Linux SCSI driver
+ as devices with bogus (pun, lun) equal to (0,0), (1,0), (2,0).
+ On the other hand, if the upper layer issues a command to device
+ say (4,0), this driver returns DID_NO_CONNECT error.
+
+ In a second step of the driver development, the following improvement has
+ been applied: The first approach limited the number of devices to 7, far
+ fewer than the 15 that it could use, then it just mapped ldn ->
+ (ldn/8,ldn%8) for pun,lun. We ended up with a real mishmash of puns
+ and luns, but it all seemed to work.
+
+ The latest development, which is implemented from the driver version 3.0
+ and later, realizes the device recognition in the following way:
+ The physical SCSI-devices on the SCSI-bus are probed via immediate_assign-
+ and device_inquiry-commands, that is all implemented in a completely new
+ made check_devices() subroutine. This delivers an exact map of the physical
+ SCSI-world that is now stored in the get_scsi[][]-array. This means,
+ that the once hidden pun,lun assignment is now known to this driver.
+ It no longer believes in default-settings of the subsystem and maps all
+ ldns to existing pun,lun "by foot". This assures full control of the ldn
+ mapping and allows dynamical remapping of ldns to different pun,lun, if
+ there are more SCSI-devices installed than ldns available (n>15). The
+ ldns from 0 to 6 get 'hardwired' by this driver to puns 0 to 7 at lun=0,
+ excluding the pun of the subsystem. This assures, that at least simple
+ SCSI-installations have optimum access-speed and are not touched by
+ dynamical remapping. The ldns 7 to 14 are put to existing devices with
+ lun>0 or to non-existing devices, in order to satisfy the subsystem, if
+ there are less than 15 SCSI-devices connected. In the case of more than 15
+ devices, the dynamical mapping goes active. If the get_scsi[][] reports a
+ device to be existent, but it has no ldn assigned, it gets an ldn out of 7
+ to 14. The numbers are assigned in cyclic order, therefore it takes 8
+ dynamical reassignments on the SCSI-devices until a certain device
+ loses its ldn again. This assures that dynamical remapping is avoided
+ during intense I/O between up to 15 SCSI-devices (means pun,lun
+ combinations). A further advantage of this method is that people who
+ build their kernel without probing on all luns will get what they expect,
+ because the driver just won't assign everything with lun>0 when
+ multiple lun probing is inactive.
+
+ 2.4 SCSI-Device Order
+ ---------------------
+ Because of the now correct recognition of physical pun,lun, and
+ their report to mid-level- and higher-level-drivers, the new reported puns
+ can be different from the old, faked puns. Therefore, Linux will eventually
+ change /dev/sdXXX assignments and prompt you for corrupted superblock
+ repair on boottime. In this case DO NOT PANIC, YOUR DISKS ARE STILL OK!!!
+ You have to reboot (CTRL-D) with an old kernel and set the /etc/fstab-file
+ entries right. After that, the system should come up as errorfree as before.
+ If your boot-partition is not coming up, also edit the /etc/lilo.conf-file
+ in a Linux session booted on old kernel and run lilo before reboot. Check
+ lilo.conf anyway to get boot on other partitions with foreign OSes right
+ again. But there exists a feature of this driver that allows you to change
+ the assignment order of the SCSI-devices by flipping the PUN-assignment.
+ See the next paragraph for a description.
+
+ The problem for this is, that Linux does not assign the SCSI-devices in the
+ way as described in the ANSI-SCSI-standard. Linux assigns /dev/sda to
+ the device with at minimum id 0. But the first drive should be at id 6,
+ because for historical reasons, drive at id 6 has, by hardware, the highest
+ priority and a drive at id 0 the lowest. IBM was one of the rare producers,
+ where the BIOS assigns drives belonging to the ANSI-SCSI-standard. Most
+ other producers' BIOS does not (I think even Adaptec-BIOS). The
+ IBMMCA_SCSI_ORDER_STANDARD flag, which you set while configuring the
+ kernel enables to choose the preferred way of SCSI-device-assignment.
+ Defining this flag would result in Linux determining the devices in the
+ same order as DOS and OS/2 does on your MCA-machine. This is also standard
+ on most industrial computers and OSes, like e.g. OS-9. Leaving this flag
+ undefined will get your devices ordered in the default way of Linux. See
+ also the remarks of Chris Beauregard from Dec 15, 1997 and the followups
+ in section 3.
+
+ 2.5 Regular SCSI-Command-Processing
+ -----------------------------------
+ Only three functions get involved: ibmmca_queuecommand(), issue_cmd(),
+ and interrupt_handler().
+
+ The upper layer issues a scsi command by calling function
+ ibmmca_queuecommand(). This function fills a "subsystem control block"
+ (scb) and calls a local function issue_cmd(), which writes a scb
+ command into subsystem I/O ports. Once the scb command is carried out,
+ the interrupt_handler() is invoked. If a device is determined to be
+ existant and it has not assigned any ldn, it gets one dynamically.
+ For this, the whole stuff is done in ibmmca_queuecommand().
+
+ 2.6 Abort & Reset Commands
+ --------------------------
+ These are implemented with busy waiting for interrupt to arrive.
+ ibmmca_reset() and ibmmca_abort() do not work sufficiently well
+ up to now and need still a lot of development work. This seems
+ to be a problem with other low-level SCSI drivers too, however
+ this should be no excuse.
+
+ 2.7 Disk Geometry
+ -----------------
+ The ibmmca_biosparams() function should return the same disk geometry
+ as the bios. This is needed for fdisk, etc. The returned geometry is
+ certainly correct for disks smaller than 1 gigabyte. In the meantime,
+ it has been proved, that this works fine even with disks larger than
+ 1 gigabyte.
+
+ 2.8 Kernel Boot Option
+ ----------------------
+ The function ibmmca_scsi_setup() is called if option ibmmcascsi=n
+ is passed to the kernel. See file linux/init/main.c for details.
+
+ 2.9 Driver Module Support
+ -------------------------
+ Is implemented and tested by K. Kudielka. This could probably not work
+ on kernels <2.1.0.
+
+ 2.10 Multiple Hostadapter Support
+ ---------------------------------
+ This driver supports up to eight interfaces of type IBM-SCSI-Subsystem.
+ Integrated-, and MCA-adapters are automatically recognized. Unrecognizable
+ IBM-SCSI-Subsystem interfaces can be specified as kernel-parameters.
+
+ 2.11 /proc/scsi-Filesystem Information
+ --------------------------------------
+ Information about the driver condition is given in
+ /proc/scsi/ibmmca/<host_no>. ibmmca_proc_info() provides this information.
+
+ This table is quite informative for interested users. It shows the load
+ of commands on the subsystem and whether you are running the bypassed
+ (software) or integrated (hardware) SCSI-command set (see below). The
+ amount of accesses is shown. Read, write, modeselect is shown separately
+ in order to help debugging problems with CD-ROMs or tapedrives.
+
+ The following table shows the list of 15 logical device numbers, that are
+ used by the SCSI-subsystem. The load on each ldn is shown in the table,
+ again, read and write commands are split. The last column shows the amount
+ of reassignments, that have been applied to the ldns, if you have more than
+ 15 pun/lun combinations available on the SCSI-bus.
+
+ The last two tables show the pun/lun map and the positions of the ldns
+ on this pun/lun map. This may change during operation, when a ldn is
+ reassigned to another pun/lun combination. If the necessity for dynamical
+ assignments is set to 'no', the ldn structure keeps static.
+
+ 2.12 /proc/mca-Filesystem Information
+ -------------------------------------
+ The slot-file contains all default entries and in addition chip and I/O-
+ address information of the SCSI-subsystem. This information is provided
+ by ibmmca_getinfo().
+
+ 2.13 Supported IBM SCSI-Subsystems
+ ----------------------------------
+ The following IBM SCSI-subsystems are supported by this driver:
+
+ - IBM Fast/Wide SCSI-2 Adapter
+ - IBM 7568 Industrial Computer SCSI Adapter w/Cache
+ - IBM Expansion Unit SCSI Controller
+ - IBM SCSI Adapter w/Cache
+ - IBM SCSI Adapter
+ - IBM Integrated SCSI Controller
+ - All clones, 100% compatible with the chipset and subsystem command
+ system of IBM SCSI-adapters (forced detection)
+
+ 2.14 Linux Kernel Versions
+ --------------------------
+ The IBM SCSI-subsystem low level driver is prepared to be used with
+ all versions of Linux between 2.0.x and 2.4.x. The compatibility checks
+ are fully implemented up from version 3.1e of the driver. This means, that
+ you just need the latest ibmmca.h and ibmmca.c file and copy it in the
+ linux/drivers/scsi directory. The code is automatically adapted during
+ kernel compilation. This is different from kernel 2.4.0! Here version
+ 4.0 or later of the driver must be used for kernel 2.4.0 or later. Version
+ 4.0 or later does not work together with older kernels! Driver versions
+ older than 4.0 do not work together with kernel 2.4.0 or later. They work
+ on all older kernels.
+
+ 3 Code History
+ --------------
+ Jan 15 1996: First public release.
+ - Martin Kolinek
+
+ Jan 23 1996: Scrapped code which reassigned scsi devices to logical
+ device numbers. Instead, the existing assignment (created
+ when the machine is powered-up or rebooted) is used.
+ A side effect is that the upper layer of Linux SCSI
+ device driver gets bogus scsi ids (this is benign),
+ and also the hard disks are ordered under Linux the
+ same way as they are under dos (i.e., C: disk is sda,
+ D: disk is sdb, etc.).
+ - Martin Kolinek
+
+ I think that the CD-ROM is now detected only if a CD is
+ inside CD_ROM while Linux boots. This can be fixed later,
+ once the driver works on all types of PS/2's.
+ - Martin Kolinek
+
+ Feb 7 1996: Modified biosparam function. Fixed the CD-ROM detection.
+ For now, devices other than harddisk and CD_ROM are
+ ignored. Temporarily modified abort() function
+ to behave like reset().
+ - Martin Kolinek
+
+ Mar 31 1996: The integrated scsi subsystem is correctly found
+ in PS/2 models 56,57, but not in model 76. Therefore
+ the ibmmca_scsi_setup() function has been added today.
+ This function allows the user to force detection of
+ scsi subsystem. The kernel option has format
+ ibmmcascsi=n
+ where n is the scsi_id (pun) of the subsystem. Most likely, n is 7.
+ - Martin Kolinek
+
+ Aug 21 1996: Modified the code which maps ldns to (pun,0). It was
+ insufficient for those of us with CD-ROM changers.
+ - Chris Beauregard
+
+ Dec 14 1996: More improvements to the ldn mapping. See check_devices
+ for details. Did more fiddling with the integrated SCSI detection,
+ but I think it's ultimately hopeless without actually testing the
+ model of the machine. The 56, 57, 76 and 95 (ultimedia) all have
+ different integrated SCSI register configurations. However, the 56
+ and 57 are the only ones that have problems with forced detection.
+ - Chris Beauregard
+
+ Mar 8-16 1997: Modified driver to run as a module and to support
+ multiple adapters. A structure, called ibmmca_hostdata, is now
+ present, containing all the variables, that were once only
+ available for one single adapter. The find_subsystem-routine has vanished.
+ The hardware recognition is now done in ibmmca_detect directly.
+ This routine checks for presence of MCA-bus, checks the interrupt
+ level and continues with checking the installed hardware.
+ Certain PS/2-models do not recognize a SCSI-subsystem automatically.
+ Hence, the setup defined by command-line-parameters is checked first.
+ Thereafter, the routine probes for an integrated SCSI-subsystem.
+ Finally, adapters are checked. This method has the advantage to cover all
+ possible combinations of multiple SCSI-subsystems on one MCA-board. Up to
+ eight SCSI-subsystems can be recognized and announced to the upper-level
+ drivers with this improvement. A set of defines made changes to other
+ routines as small as possible.
+ - Klaus Kudielka
+
+ May 30 1997: (v1.5b)
+ 1) SCSI-command capability enlarged by the recognition of MODE_SELECT.
+ This needs the RD-Bit to be disabled on IM_OTHER_SCSI_CMD_CMD which
+ allows data to be written from the system to the device. It is a
+ necessary step to be allowed to set blocksize of SCSI-tape-drives and
+ the tape-speed, without confusing the SCSI-Subsystem.
+ 2) The recognition of a tape is included in the check_devices routine.
+ This is done by checking for TYPE_TAPE, that is already defined in
+ the kernel-scsi-environment. The markup of a tape is done in the
+ global ldn_is_tape[] array. If the entry on index ldn
+ is 1, there is a tapedrive connected.
+ 3) The ldn_is_tape[] array is necessary to distinguish between tape- and
+ other devices. Fixed blocklength devices should not cause a problem
+ with the SCB-command for read and write in the ibmmca_queuecommand
+ subroutine. Therefore, I only derivate the READ_XX, WRITE_XX for
+ the tape-devices, as recommended by IBM in this Technical Reference,
+ mentioned below. (IBM recommends to avoid using the read/write of the
+ subsystem, but the fact was, that read/write causes a command error from
+ the subsystem and this causes kernel-panic.)
+ 4) In addition, I propose to use the ldn instead of a fix char for the
+ display of PS2_DISK_LED_ON(). On 95, one can distinguish between the
+ devices that are accessed. It shows activity and easyfies debugging.
+ The tape-support has been tested with a SONY SDT-5200 and a HP DDS-2
+ (I do not know yet the type). Optimization and CD-ROM audio-support,
+ I am working on ...
+ - Michael Lang
+
+ June 19 1997: (v1.6b)
+ 1) Submitting the extra-array ldn_is_tape[] -> to the local ld[]
+ device-array.
+ 2) CD-ROM Audio-Play seems to work now.
+ 3) When using DDS-2 (120M) DAT-Tapes, mtst shows still density-code
+ 0x13 for ordinary DDS (61000 BPM) instead 0x24 for DDS-2. This appears
+ also on Adaptec 2940 adaptor in a PCI-System. Therefore, I assume that
+ the problem is independent of the low-level-driver/bus-architecture.
+ 4) Hexadecimal ldn on PS/2-95 LED-display.
+ 5) Fixing of the PS/2-LED on/off that it works right with tapedrives and
+ does not confuse the disk_rw_in_progress counter.
+ - Michael Lang
+
+ June 21 1997: (v1.7b)
+ 1) Adding of a proc_info routine to inform in /proc/scsi/ibmmca/<host> the
+ outer-world about operational load statistics on the different ldns,
+ seen by the driver. Everybody that has more than one IBM-SCSI should
+ test this, because I only have one and cannot see what happens with more
+ than one IBM-SCSI hosts.
+ 2) Definition of a driver version-number to have a better recognition of
+ the source when there are existing too much releases that may confuse
+ the user, when reading about release-specific problems. Up to know,
+ I calculated the version-number to be 1.7. Because we are in BETA-test
+ yet, it is today 1.7b.
+ 3) Sorry for the heavy bug I programmed on June 19 1997! After that, the
+ CD-ROM did not work any more! The C7-command was a fake impression
+ I got while programming. Now, the READ and WRITE commands for CD-ROM are
+ no longer running over the subsystem, but just over
+ IM_OTHER_SCSI_CMD_CMD. On my observations (PS/2-95), now CD-ROM mounts
+ much faster(!) and hopefully all fancy multimedia-functions, like direct
+ digital recording from audio-CDs also work. (I tried it with cdda2wav
+ from the cdwtools-package and it filled up the harddisk immediately :-).)
+ To easify boolean logics, a further local device-type in ld[], called
+ is_cdrom has been included.
+ 4) If one uses a SCSI-device of unsupported type/commands, one
+ immediately runs into a kernel-panic caused by Command Error. To better
+ understand which SCSI-command caused the problem, I extended this
+ specific panic-message slightly.
+ - Michael Lang
+
+ June 25 1997: (v1.8b)
+ 1) Some cosmetic changes for the handling of SCSI-device-types.
+ Now, also CD-Burners / WORMs and SCSI-scanners should work. For
+ MO-drives I have no experience, therefore not yet supported.
+ In logical_devices I changed from different type-variables to one
+ called 'device_type' where the values, corresponding to scsi.h,
+ of a SCSI-device are stored.
+ 2) There existed a small bug, that maps a device, coming after a SCSI-tape
+ wrong. Therefore, e.g. a CD-ROM changer would have been mapped wrong
+ -> problem removed.
+ 3) Extension of the logical_device structure. Now it contains also device,
+ vendor and revision-level of a SCSI-device for internal usage.
+ - Michael Lang
+
+ June 26-29 1997: (v2.0b)
+ 1) The release number 2.0b is necessary because of the completely new done
+ recognition and handling of SCSI-devices with the adapter. As I got
+ from Chris the hint, that the subsystem can reassign ldns dynamically,
+ I remembered this immediate_assign-command, I found once in the handbook.
+ Now, the driver first kills all ldn assignments that are set by default
+ on the SCSI-subsystem. After that, it probes on all puns and luns for
+ devices by going through all combinations with immediate_assign and
+ probing for devices, using device_inquiry. The found physical(!) pun,lun
+ structure is stored in get_scsi[][] as device types. This is followed
+ by the assignment of all ldns to existing SCSI-devices. If more ldns
+ than devices are available, they are assigned to non existing pun,lun
+ combinations to satisfy the adapter. With this, the dynamical mapping
+ was possible to implement. (For further info see the text in the
+ source code and in the description below. Read the description
+ below BEFORE installing this driver on your system!)
+ 2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION.
+ 3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID
+ (pun) of the accessed SCSI-device. This is now senseful, because the
+ pun known within the driver is exactly the pun of the physical device
+ and no longer a fake one.
+ 4) The /proc/scsi/ibmmca/<host_no> consists now of the first part, where
+ hit-statistics of ldns is shown and a second part, where the maps of
+ physical and logical SCSI-devices are displayed. This could be very
+ interesting, when one is using more than 15 SCSI-devices in order to
+ follow the dynamical remapping of ldns.
+ - Michael Lang
+
+ June 26-29 1997: (v2.0b-1)
+ 1) I forgot to switch the local_checking_phase_flag to 1 and back to 0
+ in the dynamical remapping part in ibmmca_queuecommand for the
+ device_exist routine. Sorry.
+ - Michael Lang
+
+ July 1-13 1997: (v3.0b,c)
+ 1) Merging of the driver-developments of Klaus Kudielka and Michael Lang
+ in order to get a optimum and unified driver-release for the
+ IBM-SCSI-Subsystem-Adapter(s).
+ For people, using the Kernel-release >=2.1.0, module-support should
+ be no problem. For users, running under <2.1.0, module-support may not
+ work, because the methods have changed between 2.0.x and 2.1.x.
+ 2) Added some more effective statistics for /proc-output.
+ 3) Change typecasting at necessary points from (unsigned long) to
+ virt_to_bus().
+ 4) Included #if... at special points to have specific adaption of the
+ driver to kernel 2.0.x and 2.1.x. It should therefore also run with
+ later releases.
+ 5) Magneto-Optical drives and medium-changers are also recognized, now.
+ Therefore, we have a completely gapfree recognition of all SCSI-
+ device-types, that are known by Linux up to kernel 2.1.31.
+ 6) The flag SCSI_IBMMCA_DEV_RESET has been inserted. If it is set within
+ the configuration, each connected SCSI-device will get a reset command
+ during boottime. This can be necessary for some special SCSI-devices.
+ This flag should be included in Config.in.
+ (See also the new Config.in file.)
+ Probable next improvement: bad disk handler.
+ - Michael Lang
+
+ Sept 14 1997: (v3.0c)
+ 1) Some debugging and speed optimization applied.
+ - Michael Lang
+
+ Dec 15, 1997
+ - chrisb@truespectra.com
+ - made the front panel display thingy optional, specified from the
+ command-line via ibmmcascsi=display. Along the lines of the /LED
+ option for the OS/2 driver.
+ - fixed small bug in the LED display that would hang some machines.
+ - reversed ordering of the drives (using the
+ IBMMCA_SCSI_ORDER_STANDARD define). This is necessary for two main
+ reasons:
+ - users who've already installed Linux won't be screwed. Keep
+ in mind that not everyone is a kernel hacker.
+ - be consistent with the BIOS ordering of the drives. In the
+ BIOS, id 6 is C:, id 0 might be D:. With this scheme, they'd be
+ backwards. This confuses the crap out of those heathens who've
+ got a impure Linux installation (which, <wince>, I'm one of).
+ This whole problem arises because IBM is actually non-standard with
+ the id to BIOS mappings. You'll find, in fdomain.c, a similar
+ comment about a few FD BIOS revisions. The Linux (and apparently
+ industry) standard is that C: maps to scsi id (0,0). Let's stick
+ with that standard.
+ - Since this is technically a branch of my own, I changed the
+ version number to 3.0e-cpb.
+
+ Jan 17, 1998: (v3.0f)
+ 1) Addition of some statistical info for /proc in proc_info.
+ 2) Taking care of the SCSI-assignment problem, dealed by Chris at Dec 15
+ 1997. In fact, IBM is right, concerning the assignment of SCSI-devices
+ to driveletters. It is conform to the ANSI-definition of the SCSI-
+ standard to assign drive C: to SCSI-id 6, because it is the highest
+ hardware priority after the hostadapter (that has still today by
+ default everywhere id 7). Also realtime-operating systems that I use,
+ like LynxOS and OS9, which are quite industrial systems use top-down
+ numbering of the harddisks, that is also starting at id 6. Now, one
+ sits a bit between two chairs. On one hand side, using the define
+ IBMMCA_SCSI_ORDER_STANDARD makes Linux assigning disks conform to
+ the IBM- and ANSI-SCSI-standard and keeps this driver downward
+ compatible to older releases, on the other hand side, people is quite
+ habituated in believing that C: is assigned to (0,0) and much other
+ SCSI-BIOS do so. Therefore, I moved the IBMMCA_SCSI_ORDER_STANDARD
+ define out of the driver and put it into Config.in as subitem of
+ 'IBM SCSI support'. A help, added to Documentation/Configure.help
+ explains the differences between saying 'y' or 'n' to the user, when
+ IBMMCA_SCSI_ORDER_STANDARD prompts, so the ordinary user is enabled to
+ choose the way of assignment, depending on his own situation and gusto.
+ 3) Adapted SCSI_IBMMCA_DEV_RESET to the local naming convention, so it is
+ now called IBMMCA_SCSI_DEV_RESET.
+ 4) Optimization of proc_info and its subroutines.
+ 5) Added more in-source-comments and extended the driver description by
+ some explanation about the SCSI-device-assignment problem.
+ - Michael Lang
+
+ Jan 18, 1998: (v3.0g)
+ 1) Correcting names to be absolutely conform to the later 2.1.x releases.
+ This is necessary for
+ IBMMCA_SCSI_DEV_RESET -> CONFIG_IBMMCA_SCSI_DEV_RESET
+ IBMMCA_SCSI_ORDER_STANDARD -> CONFIG_IBMMCA_SCSI_ORDER_STANDARD
+ - Michael Lang
+
+ Jan 18, 1999: (v3.1 MCA-team internal)
+ 1) The multiple hosts structure is accessed from every subroutine, so there
+ is no longer the address of the device structure passed from function
+ to function, but only the hostindex. A call by value, nothing more. This
+ should really be understood by the compiler and the subsystem should get
+ the right values and addresses.
+ 2) The SCSI-subsystem detection was not complete and quite hugely buggy up
+ to now, compared to the technical manual. The interpretation of the pos2
+ register is not as assumed by people before, therefore, I dropped a note
+ in the ibmmca_detect function to show the registers' interpretation.
+ The pos-registers of integrated SCSI-subsystems do not contain any
+ information concerning the IO-port offset, really. Instead, they contain
+ some info about the adapter, the chip, the NVRAM .... The I/O-port is
+ fixed to 0x3540 - 0x3547. There can be more than one adapters in the
+ slots and they get an offset for the I/O area in order to get their own
+ I/O-address area. See chapter 2 for detailed description. At least, the
+ detection should now work right, even on models other than 95. The 95ers
+ came happily around the bug, as their pos2 register contains always 0
+ in the critical area. Reserved bits are not allowed to be interpreted,
+ therefore, IBM is allowed to set those bits as they like and they may
+ really vary between different PS/2 models. So, now, no interpretation
+ of reserved bits - hopefully no trouble here anymore.
+ 3) The command error, which you may get on models 55, 56, 57, 70, 77 and
+ P70 may have been caused by the fact, that adapters of older design do
+ not like sending commands to non-existing SCSI-devices and will react
+ with a command error as a sign of protest. While this error is not
+ present on IBM SCSI Adapter w/cache, it appears on IBM Integrated SCSI
+ Adapters. Therefore, I implemented a workaround to forgive those
+ adapters their protests, but it is marked up in the statistics, so
+ after a successful boot, you can see in /proc/scsi/ibmmca/<host_number>
+ how often the command errors have been forgiven to the SCSI-subsystem.
+ If the number is bigger than 0, you have a SCSI subsystem of older
+ design, what should no longer matter.
+ 4) ibmmca_getinfo() has been adapted very carefully, so it shows in the
+ slotn file really, what is senseful to be presented.
+ 5) ibmmca_register() has been extended in its parameter list in order to
+ pass the right name of the SCSI-adapter to Linux.
+ - Michael Lang
+
+ Feb 6, 1999: (v3.1)
+ 1) Finally, after some 3.1Beta-releases, the 3.1 release. Sorry, for
+ the delayed release, but it was not finished with the release of
+ Kernel 2.2.0.
+ - Michael Lang
+
+ Feb 10, 1999 (v3.1)
+ 1) Added a new commandline parameter called 'bypass' in order to bypass
+ every integrated subsystem SCSI-command consequently in case of
+ troubles.
+ 2) Concatenated read_capacity requests to the harddisks. It gave a lot
+ of troubles with some controllers and after I wanted to apply some
+ extensions, it jumped out in the same situation, on my w/cache, as like
+ on D. Weinehalls' Model 56, having integrated SCSI. This gave me the
+ decisive hint to move the code-part out and declare it global. Now
+ it seems to work far better and more stable. Let us see what
+ the world thinks of it...
+ 3) By the way, only Sony DAT-drives seem to show density code 0x13. A
+ test with a HP drive gave right results, so the problem is vendor-
+ specific and not a problem of the OS or the driver.
+ - Michael Lang
+
+ Feb 18, 1999 (v3.1d)
+ 1) The abort command and the reset function have been checked for
+ inconsistencies. From the logical point of thinking, they work
+ at their optimum, now, but as the subsystem does not answer with an
+ interrupt, abort never finishes, sigh...
+ 2) Everything, that is accessed by a busmaster request from the adapter
+ is now declared as global variable, even the return-buffer in the
+ local checking phase. This assures, that no accesses to undefined memory
+ areas are performed.
+ 3) In ibmmca.h, the line unchecked_isa_dma is added with 1 in order to
+ avoid memory-pointers for the areas higher than 16MByte in order to
+ be sure, it also works on 16-Bit Microchannel bus systems.
+ 4) A lot of small things have been found, but nothing that endangered the
+ driver operations. Just it should be more stable, now.
+ - Michael Lang
+
+ Feb 20, 1999 (v3.1e)
+ 1) I took the warning from the Linux Kernel Hackers Guide serious and
+ checked the cmd->result return value to the done-function very carefully.
+ It is obvious, that the IBM SCSI only delivers the tsb.dev_status, if
+ some error appeared, else it is undefined. Now, this is fixed. Before
+ any SCB command gets queued, the tsb.dev_status is set to 0, so the
+ cmd->result won't screw up Linux higher level drivers.
+ 2) The reset-function has slightly improved. This is still planed for
+ abort. During the abort and the reset function, no interrupts are
+ allowed. This is however quite hard to cope with, so the INT-status
+ register is read. When the interrupt gets queued, one can find its
+ status immediately on that register and is enabled to continue in the
+ reset function. I had no chance to test this really, only in a bogus
+ situation, I got this function running, but the situation was too much
+ worse for Linux :-(, so tests will continue.
+ 3) Buffers got now consistent. No open address mapping, as before and
+ therefore no further troubles with the unassigned memory segmentation
+ faults that scrambled probes on 95XX series and even on 85XX series,
+ when the kernel is done in a not so perfectly fitting way.
+ 4) Spontaneous interrupts from the subsystem, appearing without any
+ command previously queued are answered with a DID_BAD_INTR result.
+ 5) Taken into account ZP Gus' proposals to reverse the SCSI-device
+ scan order. As it does not work on Kernel 2.1.x or 2.2.x, as proposed
+ by him, I implemented it in a slightly derived way, which offers in
+ addition more flexibility.
+ - Michael Lang
+
+ Apr 23, 2000 (v3.2pre1)
+ 1) During a very long time, I collected a huge amount of bug reports from
+ various people, trying really quite different things on their SCSI-
+ PS/2s. Today, all these bug reports are taken into account and should be
+ mostly solved. The major topics were:
+ - Driver crashes during boottime by no obvious reason.
+ - Driver panics while the midlevel-SCSI-driver is trying to inquire
+ the SCSI-device properties, even though hardware is in perfect state.
+ - Displayed info for the various slot-cards is interpreted wrong.
+ The main reasons for the crashes were two:
+ 1) The commands to check for device information like INQUIRY,
+ TEST_UNIT_READY, REQUEST_SENSE and MODE_SENSE cause the devices
+ to deliver information of up to 255 bytes. Midlevel drivers offer
+ 1024 bytes of space for the answer, but the IBM-SCSI-adapters do
+ not accept this, as they stick quite near to ANSI-SCSI and report
+ a COMMAND_ERROR message which causes the driver to panic. The main
+ problem was located around the INQUIRY command. Now, for all the
+ mentioned commands, the buffersize sent to the adapter is at
+ maximum 255 which seems to be a quite reasonable solution.
+ TEST_UNIT_READY gets a buffersize of 0 to make sure that no
+ data is transferred in order to avoid any possible command failure.
+ 2) On unsuccessful TEST_UNIT_READY, the mid-level driver has to send
+ a REQUEST_SENSE in order to see where the problem is located. This
+ REQUEST_SENSE may have various length in its answer-buffer. IBM
+ SCSI-subsystems report a command failure if the returned buffersize
+ is different from the sent buffersize, but this can be suppressed by
+ a special bit, which is now done and problems seem to be solved.
+ 2) Code adaption to all kernel-releases. Now, the 3.2 code compiles on
+ 2.0.x, 2.1.x, 2.2.x and 2.3.x kernel releases without any code-changes.
+ 3) Commandline-parameters are recognized again, even under Kernel 2.3.x or
+ higher.
+ - Michael Lang
+
+ April 27, 2000 (v3.2pre2)
+ 1) Bypassed commands get read by the adapter by one cycle instead of two.
+ This increases SCSI-performance.
+ 2) Synchronous datatransfer is provided for sure to be 5 MHz on older
+ SCSI and 10 MHz on internal F/W SCSI-adapter.
+ 3) New commandline parameters allow to force the adapter to slow down while
+ in synchronous transfer. Could be helpful for very old devices.
+ - Michael Lang
+
+ June 2, 2000 (v3.2pre5)
+ 1) Added Jim Shorney's contribution to make the activity indicator
+ flashing in addition to the LED-alphanumeric display-panel on
+ models 95A. To be enabled to choose this feature freely, a new
+ commandline parameter is added, called 'activity'.
+ 2) Added the READ_CONTROL bit for test_unit_ready SCSI-command.
+ 3) Added some suppress_exception bits to read_device_capacity and
+ all device_inquiry occurrences in the driver code.
+ 4) Complaints about the various KERNEL_VERSION implementations are
+ taken into account. Every local_LinuxKernelVersion occurrence is
+ now replaced by KERNEL_VERSION, defined in linux/version.h.
+ Corresponding changes were applied to ibmmca.h, too. This was a
+ contribution to all kernel-parts by Philipp Hahn.
+ - Michael Lang
+
+ July 17, 2000 (v3.2pre8)
+ A long period of collecting bug reports from all corners of the world
+ now lead to the following corrections to the code:
+ 1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this
+ was that it is possible to disable Fast-SCSI for the external bus.
+ The feature-control command, where this crash appeared regularly, tried
+ to set the maximum speed of 10MHz synchronous transfer speed and that
+ reports a COMMAND ERROR if external bus Fast-SCSI is disabled. Now,
+ the feature-command probes down from maximum speed until the adapter
+ stops to complain, which is at the same time the maximum possible
+ speed selected in the reference program. So, F/W external can run at
+ 5 MHz (slow-) or 10 MHz (fast-SCSI). During feature probing, the
+ COMMAND ERROR message is used to detect if the adapter does not complain.
+ 2) Up to now, only combined busmode is supported, if you use external
+ SCSI-devices, attached to the F/W-controller. If dual bus is selected,
+ only the internal SCSI-devices get accessed by Linux. For most
+ applications, this should do fine.
+ 3) Wide-SCSI-addressing (16-Bit) is now possible for the internal F/W
+ bus on the F/W adapter. If F/W adapter is detected, the driver
+ automatically uses the extended PUN/LUN <-> LDN mapping tables, which
+ are now new from 3.2pre8. This allows PUNs between 0 and 15 and should
+ provide more fun with the F/W adapter.
+ 4) Several machines use the SCSI: POS registers for internal/undocumented
+ storage of system relevant info. This confused the driver, mainly on
+ models 9595, as it expected no onboard SCSI only, if all POS in
+ the integrated SCSI-area are set to 0x00 or 0xff. Now, the mechanism
+ to check for integrated SCSI is much more restrictive and these problems
+ should be history.
+ - Michael Lang
+
+ July 18, 2000 (v3.2pre9)
+ This develop rather quickly at the moment. Two major things were still
+ missing in 3.2pre8:
+ 1) The adapter PUN for F/W adapters has 4-bits, while all other adapters
+ have 3-bits. This is now taken into account for F/W.
+ 2) When you select CONFIG_IBMMCA_SCSI_ORDER_STANDARD, you should
+ normally get the inverse probing order of your devices on the SCSI-bus.
+ The ANSI device order gets scrambled in version 3.2pre8!! Now, a new
+ and tested algorithm inverts the device-order on the SCSI-bus and
+ automatically avoids accidental access to whatever SCSI PUN the adapter
+ is set and works with SCSI- and Wide-SCSI-addressing.
+ - Michael Lang
+
+ July 23, 2000 (v3.2pre10 unpublished)
+ 1) LED panel display supports wide-addressing in ibmmca=display mode.
+ 2) Adapter-information and autoadaption to address-space is done.
+ 3) Auto-probing for maximum synchronous SCSI transfer rate is working.
+ 4) Optimization to some embedded function calls is applied.
+ 5) Added some comment for the user to wait for SCSI-devices being probed.
+ 6) Finished version 3.2 for Kernel 2.4.0. It least, I thought it is but...
+ - Michael Lang
+
+ July 26, 2000 (v3.2pre11)
+ 1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and
+ a model 9595. Asking around in the community, nobody except of me has
+ seen such errors. Weird, but I am trying to recompile everything on
+ the model 9595. Maybe, as I use a specially modified gcc, that could
+ cause problems. But, it was not the reason. The true background was,
+ that the kernel was compiled for i386 and the 9595 has a 486DX-2.
+ Normally, no troubles should appear, but for this special machine,
+ only the right processor support is working fine!
+ 2) Previous problems with synchronous speed, slowing down from one adapter
+ to the next during probing are corrected. Now, local variables store
+ the synchronous bitmask for every single adapter found on the MCA bus.
+ 3) LED alphanumeric panel support for XX95 systems is now showing some
+ alive rotator during boottime. This makes sense, when no monitor is
+ connected to the system. You can get rid of all display activity, if
+ you do not use any parameter or just ibmmcascsi=activity, for the
+ harddrive activity LED, existent on all PS/2, except models 8595-XXX.
+ If no monitor is available, please use ibmmcascsi=display, which works
+ fine together with the linuxinfo utility for the LED-panel.
+ - Michael Lang
+
+ July 29, 2000 (v3.2)
+ 1) Submission of this driver for kernel 2.4test-XX and 2.2.17.
+ - Michael Lang
+
+ December 28, 2000 (v3.2d / v4.0)
+ 1) The interrupt handler had some wrong statement to wait for. This
+ was done due to experimental reasons during 3.2 development but it
+ has shown that this is not stable enough. Going back to wait for the
+ adapter to be not busy is best.
+ 2) Inquiry requests can be shorter than 255 bytes of return buffer. Due
+ to a bug in the ibmmca_queuecommand routine, this buffer was forced
+ to 255 at minimum. If the memory address, this return buffer is pointing
+ to does not offer more space, invalid memory accesses destabilized the
+ kernel.
+ 3) version 4.0 is only valid for kernel 2.4.0 or later. This is necessary
+ to remove old kernel version dependent waste from the driver. 3.2d is
+ only distributed with older kernels but keeps compatibility with older
+ kernel versions. 4.0 and higher versions cannot be used with older
+ kernels anymore!! You must have at least kernel 2.4.0!!
+ 4) The commandline argument 'bypass' and all its functionality got removed
+ in version 4.0. This was never really necessary, as all troubles were
+ based on non-command related reasons up to now, so bypassing commands
+ did not help to avoid any bugs. It is kept in 3.2X for debugging reasons.
+ 5) Dynamic reassignment of ldns was again verified and analyzed to be
+ completely inoperational. This is corrected and should work now.
+ 6) All commands that get sent to the SCSI adapter were verified and
+ completed in such a way, that they are now completely conform to the
+ demands in the technical description of IBM. Main candidates were the
+ DEVICE_INQUIRY, REQUEST_SENSE and DEVICE_CAPACITY commands. They must
+ be transferred by bypassing the internal command buffer of the adapter
+ or else the response can be a random result. GET_POS_INFO would be more
+ safe in usage, if one could use the SUPRESS_EXCEPTION_SHORT, but this
+ is not allowed by the technical references of IBM. (Sorry, folks, the
+ model 80 problem is still a task to be solved in a different way.)
+ 7) v3.2d is still hold back for some days for testing, while 4.0 is
+ released.
+ - Michael Lang
+
+ January 3, 2001 (v4.0a)
+ 1) A lot of complains after the 2.4.0-prerelease kernel came in about
+ the impossibility to compile the driver as a module. This problem is
+ solved. In combination with that problem, some unprecise declaration
+ of the function option_setup() gave some warnings during compilation.
+ This is solved, too by a forward declaration in ibmmca.c.
+ 2) #ifdef argument concerning CONFIG_SCSI_IBMMCA is no longer needed and
+ was entirely removed.
+ 3) Some switch statements got optimized in code, as some minor variables
+ in internal SCSI-command handlers.
+ - Michael Lang
+
+ 4 To do
+ -------
+ - IBM SCSI-2 F/W external SCSI bus support in separate mode!
+ - It seems that the handling of bad disks is really bad -
+ non-existent, in fact. However, a low-level driver cannot help
+ much, if such things happen.
+
+ 5 Users' Manual
+ ---------------
+ 5.1 Commandline Parameters
+ --------------------------
+ There exist several features for the IBM SCSI-subsystem driver.
+ The commandline parameter format is:
+
+ ibmmcascsi=<command1>,<command2>,<command3>,...
+
+ where commandN can be one of the following:
+
+ display Owners of a model 95 or other PS/2 systems with an
+ alphanumeric LED display may set this to have their
+ display showing the following output of the 8 digits:
+
+ ------DA
+
+ where '-' stays dark, 'D' shows the SCSI-device id
+ and 'A' shows the SCSI hostindex, being currently
+ accessed. During boottime, this will give the message
+
+ SCSIini*
+
+ on the LED-panel, where the * represents a rotator,
+ showing the activity during the probing phase of the
+ driver which can take up to two minutes per SCSI-adapter.
+ adisplay This works like display, but gives more optical overview
+ of the activities on the SCSI-bus. The display will have
+ the following output:
+
+ 6543210A
+
+ where the numbers 0 to 6 light up at the shown position,
+ when the SCSI-device is accessed. 'A' shows again the SCSI
+ hostindex. If display nor adisplay is set, the internal
+ PS/2 harddisk LED is used for media-activities. So, if
+ you really do not have a system with a LED-display, you
+ should not set display or adisplay. Keep in mind, that
+ display and adisplay can only be used alternatively. It
+ is not recommended to use this option, if you have some
+ wide-addressed devices e.g. at the SCSI-2 F/W adapter in
+ your system. In addition, the usage of the display for
+ other tasks in parallel, like the linuxinfo-utility makes
+ no sense with this option.
+ activity This enables the PS/2 harddisk LED activity indicator.
+ Most PS/2 have no alphanumeric LED display, but some
+ indicator. So you should use this parameter to activate it.
+ If you own model 9595 (Server95), you can have both, the
+ LED panel and the activity indicator in parallel. However,
+ some PS/2s, like the 8595 do not have any harddisk LED
+ activity indicator, which means, that you must use the
+ alphanumeric LED display if you want to monitor SCSI-
+ activity.
+ bypass This is obsolete from driver version 4.0, as the adapters
+ got that far understood, that the selection between
+ integrated and bypassed commands should now work completely
+ correct! For historical reasons, the old description is
+ kept here:
+ This commandline parameter forces the driver never to use
+ SCSI-subsystems' integrated SCSI-command set. Except of
+ the immediate assign, which is of vital importance for
+ every IBM SCSI-subsystem to set its ldns right. Instead,
+ the ordinary ANSI-SCSI-commands are used and passed by the
+ controller to the SCSI-devices, therefore 'bypass'. The
+ effort, done by the subsystem is quite bogus and at a
+ minimum and therefore it should work everywhere. This
+ could maybe solve troubles with old or integrated SCSI-
+ controllers and nasty harddisks. Keep in mind, that using
+ this flag will slow-down SCSI-accesses slightly, as the
+ software generated commands are always slower than the
+ hardware. Non-harddisk devices always get read/write-
+ commands in bypass mode. On the most recent releases of
+ the Linux IBM-SCSI-driver, the bypass command should be
+ no longer a necessary thing, if you are sure about your
+ SCSI-hardware!
+ normal This is the parameter, introduced on the 2.0.x development
+ rail by ZP Gu. This parameter defines the SCSI-device
+ scan order in the new industry standard. This means, that
+ the first SCSI-device is the one with the lowest pun.
+ E.g. harddisk at pun=0 is scanned before harddisk at
+ pun=6, which means, that harddisk at pun=0 gets sda
+ and the one at pun=6 gets sdb.
+ ansi The ANSI-standard for the right scan order, as done by
+ IBM, Microware and Microsoft, scans SCSI-devices starting
+ at the highest pun, which means, that e.g. harddisk at
+ pun=6 gets sda and a harddisk at pun=0 gets sdb. If you
+ like to have the same SCSI-device order, as in DOS, OS-9
+ or OS/2, just use this parameter.
+ fast SCSI-I/O in synchronous mode is done at 5 MHz for IBM-
+ SCSI-devices. SCSI-2 Fast/Wide Adapter/A external bus
+ should then run at 10 MHz if Fast-SCSI is enabled,
+ and at 5 MHz if Fast-SCSI is disabled on the external
+ bus. This is the default setting when nothing is
+ specified here.
+ medium Synchronous rate is at 50% approximately, which means
+ 2.5 MHz for IBM SCSI-adapters and 5.0 MHz for F/W ext.
+ SCSI-bus (when Fast-SCSI speed enabled on external bus).
+ slow The slowest possible synchronous transfer rate is set.
+ This means 1.82 MHz for IBM SCSI-adapters and 2.0 MHz
+ for F/W external bus at Fast-SCSI speed on the external
+ bus.
+
+ A further option is that you can force the SCSI-driver to accept a SCSI-
+ subsystem at a certain I/O-address with a predefined adapter PUN. This
+ is done by entering
+
+ commandN = I/O-base
+ commandN+1 = adapter PUN
+
+ e.g. ibmmcascsi=0x3540,7 will force the driver to detect a SCSI-subsystem
+ at I/O-address 0x3540 with adapter PUN 7. Please only use this method, if
+ the driver does really not recognize your SCSI-adapter! With driver version
+ 3.2, this recognition of various adapters was hugely improved and you
+ should try first to remove your commandline arguments of such type with a
+ newer driver. I bet, it will be recognized correctly. Even multiple and
+ different types of IBM SCSI-adapters should be recognized correctly, too.
+ Use the forced detection method only as last solution!
+
+ Examples:
+
+ ibmmcascsi=adisplay
+
+ This will use the advanced display mode for the model 95 LED alphanumeric
+ display.
+
+ ibmmcascsi=display,0x3558,7
+
+ This will activate the default display mode for the model 95 LED display
+ and will force the driver to accept a SCSI-subsystem at I/O-base 0x3558
+ with adapter PUN 7.
+
+ 5.2 Troubleshooting
+ -------------------
+ The following FAQs should help you to solve some major problems with this
+ driver.
+
+ Q: "Reset SCSI-devices at boottime" halts the system at boottime, why?
+ A: This is only tested with the IBM SCSI Adapter w/cache. It is not
+ yet proven to run on other adapters, however you may be lucky.
+ In version 3.1d this has been hugely improved and should work better,
+ now. Normally you really won't need to activate this flag in the
+ kernel configuration, as all post 1989 SCSI-devices should accept
+ the reset-signal, when the computer is switched on. The SCSI-
+ subsystem generates this reset while being initialized. This flag
+ is really reserved for users with very old, very strange or self-made
+ SCSI-devices.
+ Q: Why is the SCSI-order of my drives mirrored to the device-order
+ seen from OS/2 or DOS ?
+ A: It depends on the operating system, if it looks at the devices in
+ ANSI-SCSI-standard (starting from pun 6 and going down to pun 0) or
+ if it just starts at pun 0 and counts up. If you want to be conform
+ with OS/2 and DOS, you have to activate this flag in the kernel
+ configuration or you should set 'ansi' as parameter for the kernel.
+ The parameter 'normal' sets the new industry standard, starting
+ from pun 0, scanning up to pun 6. This allows you to change your
+ opinion still after having already compiled the kernel.
+ Q: Why can't I find IBM MCA SCSI support in the config menu?
+ A: You have to activate MCA bus support, first.
+ Q: Where can I find the latest info about this driver?
+ A: See the file MAINTAINERS for the current WWW-address, which offers
+ updates, info and Q/A lists. At this file's origin, the webaddress
+ was: http://www.uni-mainz.de/~langm000/linux.html
+ Q: My SCSI-adapter is not recognized by the driver, what can I do?
+ A: Just force it to be recognized by kernel parameters. See section 5.1.
+ If this really happens, do also send e-mail to the maintainer, as
+ forced detection should be never necessary. Forced detection is in
+ principal some flaw of the driver adapter detection and goes into
+ bug reports.
+ Q: The driver screws up, if it starts to probe SCSI-devices, is there
+ some way out of it?
+ A: Yes, that was some recognition problem of the correct SCSI-adapter
+ and its I/O base addresses. Upgrade your driver to the latest release
+ and it should be fine again.
+ Q: I get a message: panic IBM MCA SCSI: command error .... , what can
+ I do against this?
+ A: Previously, I followed the way by ignoring command errors by using
+ ibmmcascsi=forgiveall, but this command no longer exists and is
+ obsolete. If such a problem appears, it is caused by some segmentation
+ fault of the driver, which maps to some unallowed area. The latest
+ version of the driver should be ok, as most bugs have been solved.
+ Q: There are still kernel panics, even after having set
+ ibmmcascsi=forgiveall. Are there other possibilities to prevent
+ such panics?
+ A: No, get just the latest release of the driver and it should work
+ better and better with increasing version number. Forget about this
+ ibmmcascsi=forgiveall, as also ignorecmd are obsolete.!
+ Q: Linux panics or stops without any comment, but it is probable, that my
+ harddisk(s) have bad blocks.
+ A: Sorry, the bad-block handling is still a feeble point of this driver,
+ but is on the schedule for development in the near future.
+ Q: Linux panics while dynamically assigning SCSI-ids or ldns.
+ A: If you disconnect a SCSI-device from the machine, while Linux is up
+ and the driver uses dynamical reassignment of logical device numbers
+ (ldn), it really gets "angry" if it won't find devices, that were still
+ present at boottime and stops Linux.
+ Q: The system does not recover after an abort-command has been generated.
+ A: This is regrettably true, as it is not yet understood, why the
+ SCSI-adapter does really NOT generate any interrupt at the end of
+ the abort-command. As no interrupt is generated, the abort command
+ cannot get finished and the system hangs, sorry, but checks are
+ running to hunt down this problem. If there is a real pending command,
+ the interrupt MUST get generated after abort. In this case, it
+ should finish well.
+ Q: The system gets in bad shape after a SCSI-reset, is this known?
+ A: Yes, as there are a lot of prescriptions (see the Linux Hackers'
+ Guide) what has to be done for reset, we still share the bad shape of
+ the reset functions with all other low level SCSI-drivers.
+ Astonishingly, reset works in most cases quite ok, but the harddisks
+ won't run in synchronous mode anymore after a reset, until you reboot.
+ Q: Why does my XXX w/Cache adapter not use read-prefetch?
+ A: Ok, that is not completely possible. If a cache is present, the
+ adapter tries to use it internally. Explicitly, one can use the cache
+ with a read prefetch command, maybe in future, but this requires
+ some major overhead of SCSI-commands that risks the performance to
+ go down more than it gets improved. Tests with that are running.
+ Q: I have a IBM SCSI-2 Fast/Wide adapter, it boots in some way and hangs.
+ A: Yes, that is understood, as for sure, your SCSI-2 Fast/Wide adapter
+ was in such a case recognized as integrated SCSI-adapter or something
+ else, but not as the correct adapter. As the I/O-ports get assigned
+ wrongly by that reason, the system should crash in most cases. You
+ should upgrade to the latest release of the SCSI-driver. The
+ recommended version is 3.2 or later. Here, the F/W support is in
+ a stable and reliable condition. Wide-addressing is in addition
+ supported.
+ Q: I get an Oops message and something like "killing interrupt".
+ A: The reason for this is that the IBM SCSI-subsystem only sends a
+ termination status back, if some error appeared. In former releases
+ of the driver, it was not checked, if the termination status block
+ is NULL. From version 3.2, it is taken care of this.
+ Q: I have a F/W adapter and the driver sees my internal SCSI-devices,
+ but ignores the external ones.
+ A: Select combined busmode in the IBM config-program and check for that
+ no SCSI-id on the external devices appears on internal devices.
+ Reboot afterwards. Dual busmode is supported, but works only for the
+ internal bus, yet. External bus is still ignored. Take care for your
+ SCSI-ids. If combined bus-mode is activated, on some adapters,
+ the wide-addressing is not possible, so devices with ids between 8
+ and 15 get ignored by the driver & adapter!
+ Q: I have a 9595 and I get a NMI during heavy SCSI I/O e.g. during fsck.
+ A COMMAND ERROR is reported and characters on the screen are missing.
+ Warm reboot is not possible. Things look like quite weird.
+ A: Check the processor type of your 9595. If you have an 80486 or 486DX-2
+ processor complex on your mainboard and you compiled a kernel that
+ supports 80386 processors, it is possible, that the kernel cannot
+ keep track of the PS/2 interrupt handling and stops on an NMI. Just
+ compile a kernel for the correct processor type of your PS/2 and
+ everything should be fine. This is necessary even if one assumes,
+ that some 80486 system should be downward compatible to 80386
+ software.
+ Q: Some commands hang and interrupts block the machine. After some
+ timeout, the syslog reports that it tries to call abort, but the
+ machine is frozen.
+ A: This can be a busy wait bug in the interrupt handler of driver
+ version 3.2. You should at least upgrade to 3.2c if you use
+ kernel < 2.4.0 and driver version 4.0 if you use kernel 2.4.0 or
+ later (including all test releases).
+ Q: I have a PS/2 model 80 and more than 16 MBytes of RAM. The driver
+ completely refuses to work, reports NMIs, COMMAND ERRORs or other
+ ambiguous stuff. When reducing the RAM size down below 16 MB,
+ everything is running smoothly.
+ A: No real answer, yet. In any case, one should force the kernel to
+ present SCBs only below the 16 MBytes barrier. Maybe this solves the
+ problem. Not yet tried, but guessing that it could work. To get this,
+ set unchecked_isa_dma argument of ibmmca.h from 0 to 1.
+
+ 5.3 Bug reports
+ --------------
+ If you really find bugs in the source code or the driver will successfully
+ refuse to work on your machine, you should send a bug report to me. The
+ best for this is to follow the instructions on the WWW-page for this
+ driver. Fill out the bug-report form, placed on the WWW-page and ship it,
+ so the bugs can be taken into account with maximum efforts. But, please
+ do not send bug reports about this driver to Linus Torvalds or Leonard
+ Zubkoff, as Linus is buried in E-Mail and Leonard is supervising all
+ SCSI-drivers and won't have the time left to look inside every single
+ driver to fix a bug and especially DO NOT send modified code to Linus
+ Torvalds or Alan J. Cox which has not been checked here!!! They are both
+ quite buried in E-mail (as me, sometimes, too) and one should first check
+ for problems on my local teststand. Recently, I got a lot of
+ bug reports for errors in the ibmmca.c code, which I could not imagine, but
+ a look inside some Linux-distribution showed me quite often some modified
+ code, which did no longer work on most other machines than the one of the
+ modifier. Ok, so now that there is maintenance service available for this
+ driver, please use this address first in order to keep the level of
+ confusion low. Thank you!
+
+ When you get a SCSI-error message that panics your system, a list of
+ register-entries of the SCSI-subsystem is shown (from Version 3.1d). With
+ this list, it is very easy for the maintainer to localize the problem in
+ the driver or in the configuration of the user. Please write down all the
+ values from this report and send them to the maintainer. This would really
+ help a lot and makes life easier concerning misunderstandings.
+
+ Use the bug-report form (see 5.4 for its address) to send all the bug-
+ stuff to the maintainer or write e-mail with the values from the table.
+
+ 5.4 Support WWW-page
+ --------------------
+ The address of the IBM SCSI-subsystem supporting WWW-page is:
+
+ http://www.staff.uni-mainz.de/mlang/linux.html
+
+ Here you can find info about the background of this driver, patches,
+ troubleshooting support, news and a bugreport form. Please check that
+ WWW-page regularly for latest hints. If ever this URL changes, please
+ refer to the MAINTAINERS file in order to get the latest address.
+
+ For the bugreport, please fill out the formular on the corresponding
+ WWW-page. Read the dedicated instructions and write as much as you
+ know about your problem. If you do not like such formulars, please send
+ some e-mail directly, but at least with the same information as required by
+ the formular.
+
+ If you have extensive bug reports, including Oops messages and
+ screen-shots, please feel free to send it directly to the address
+ of the maintainer, too. The current address of the maintainer is:
+
+ Michael Lang <langa2@kph.uni-mainz.de>
+
+ 6 References
+ ------------
+ IBM Corp., "Update for the PS/2 Hardware Interface Technical Reference,
+ Common Interfaces", Armonk, September 1991, PN 04G3281,
+ (available in the U.S. for $21.75 at 1-800-IBM-PCTB or in Germany for
+ around 40,-DM at "Hallo IBM").
+
+ IBM Corp., "Personal System/2 Micro Channel SCSI
+ Adapter with Cache Technical Reference", Armonk, March 1990, PN 68X2365.
+
+ IBM Corp., "Personal System/2 Micro Channel SCSI
+ Adapter Technical Reference", Armonk, March 1990, PN 68X2397.
+
+ IBM Corp., "SCSI-2 Fast/Wide Adapter/A Technical Reference - Dual Bus",
+ Armonk, March 1994, PN 83G7545.
+
+ Friedhelm Schmidt, "SCSI-Bus und IDE-Schnittstelle - Moderne Peripherie-
+ Schnittstellen: Hardware, Protokollbeschreibung und Anwendung", 2. Aufl.
+ Addison Wesley, 1996.
+
+ Michael K. Johnson, "The Linux Kernel Hackers' Guide", Version 0.6, Chapel
+ Hill - North Carolina, 1995
+
+ Andreas Kaiser, "SCSI TAPE BACKUP for OS/2 2.0", Version 2.12, Stuttgart
+ 1993
+
+ Helmut Rompel, "IBM Computerwelt GUIDE", What is what bei IBM., Systeme *
+ Programme * Begriffe, IWT-Verlag GmbH - Muenchen, 1988
+
+ 7 Credits to
+ ------------
+ 7.1 People
+ ----------
+ Klaus Grimm
+ who already a long time ago gave me the old code from the
+ SCSI-driver in order to get it running for some old machine
+ in our institute.
+ Martin Kolinek
+ who wrote the first release of the IBM SCSI-subsystem driver.
+ Chris Beauregard
+ who for a long time maintained MCA-Linux and the SCSI-driver
+ in the beginning. Chris, wherever you are: Cheers to you!
+ Klaus Kudielka
+ with whom in the 2.1.x times, I had a quite fruitful
+ cooperation to get the driver running as a module and to get
+ it running with multiple SCSI-adapters.
+ David Weinehall
+ for his excellent maintenance of the MCA-stuff and the quite
+ detailed bug reports and ideas for this driver (and his
+ patience ;-)).
+ Alan J. Cox
+ for his bug reports and his bold activities in cross-checking
+ the driver-code with his teststand.
+
+ 7.2 Sponsors & Supporters
+ -------------------------
+ "Hallo IBM",
+ IBM-Deutschland GmbH
+ the service of IBM-Deutschland for customers. Their E-Mail
+ service is unbeatable. Whatever old stuff I asked for, I
+ always got some helpful answers.
+ Karl-Otto Reimers,
+ IBM Klub - Sparte IBM Geschichte, Sindelfingen
+ for sending me a copy of the w/Cache manual from the
+ IBM-Deutschland archives.
+ Harald Staiger
+ for his extensive hardware donations which allows me today
+ still to test the driver in various constellations.
+ Erich Fritscher
+ for his very kind sponsoring.
+ Louis Ohland,
+ Charles Lasitter
+ for support by shipping me an IBM SCSI-2 Fast/Wide manual.
+ In addition, the contribution of various hardware is quite
+ decessive and will make it possible to add FWSR (RAID)
+ adapter support to the driver in the near future! So,
+ complaints about no RAID support won't remain forever.
+ Yes, folks, that is no joke, RAID support is going to rise!
+ Erik Weber
+ for the great deal we made about a model 9595 and the nice
+ surrounding equipment and the cool trip to Mannheim
+ second-hand computer market. In addition, I would like
+ to thank him for his exhaustive SCSI-driver testing on his
+ 95er PS/2 park.
+ Anthony Hogbin
+ for his direct shipment of a SCSI F/W adapter, which allowed
+ me immediately on the first stage to try it on model 8557
+ together with onboard SCSI adapter and some SCSI w/Cache.
+ Andreas Hotz
+ for his support by memory and an IBM SCSI-adapter. Collecting
+ all this together now allows me to try really things with
+ the driver at maximum load and variety on various models in
+ a very quick and efficient way.
+ Peter Jennewein
+ for his model 30, which serves me as part of my teststand
+ and his cool remark about how you make an ordinary diskette
+ drive working and how to connect it to an IBM-diskette port.
+ Johannes Gutenberg-Universitaet, Mainz &
+ Institut fuer Kernphysik, Mainz Microtron (MAMI)
+ for the offered space, the link, placed on the central
+ homepage and the space to store and offer the driver and
+ related material and the free working times, which allow
+ me to answer all your e-mail.
+
+ 8 Trademarks
+ ------------
+ IBM, PS/2, OS/2, Microchannel are registered trademarks of International
+ Business Machines Corporation
+
+ MS-DOS is a registered trademark of Microsoft Corporation
+
+ Microware, OS-9 are registered trademarks of Microware Systems
+
+ 9 Disclaimer
+ ------------
+ Beside the GNU General Public License and the dependent disclaimers and disclaimers
+ concerning the Linux-kernel in special, this SCSI-driver comes without any
+ warranty. Its functionality is tested as good as possible on certain
+ machines and combinations of computer hardware, which does not exclude,
+ that data loss or severe damage of hardware is possible while using this
+ part of software on some arbitrary computer hardware or in combination
+ with other software packages. It is highly recommended to make backup
+ copies of your data before using this software. Furthermore, personal
+ injuries by hardware defects, that could be caused by this SCSI-driver are
+ not excluded and it is highly recommended to handle this driver with a
+ maximum of carefulness.
+
+ This driver supports hardware, produced by International Business Machines
+ Corporation (IBM).
+
+------
+Michael Lang
+(langa2@kph.uni-mainz.de)
diff --git a/Documentation/scsi/in2000.txt b/Documentation/scsi/in2000.txt
new file mode 100644
index 0000000..c3e2a90
--- /dev/null
+++ b/Documentation/scsi/in2000.txt
@@ -0,0 +1,202 @@
+
+UPDATE NEWS: version 1.33 - 26 Aug 98
+
+ Interrupt management in this driver has become, over
+ time, increasingly odd and difficult to explain - this
+ has been mostly due to my own mental inadequacies. In
+ recent kernels, it has failed to function at all when
+ compiled for SMP. I've fixed that problem, and after
+ taking a fresh look at interrupts in general, greatly
+ reduced the number of places where they're fiddled
+ with. Done some heavy testing and it looks very good.
+ The driver now makes use of the __initfunc() and
+ __initdata macros to save about 4k of kernel memory.
+ Once again, the same code works for both 2.0.xx and
+ 2.1.xx kernels.
+
+UPDATE NEWS: version 1.32 - 28 Mar 98
+
+ Removed the check for legal IN2000 hardware versions:
+ It appears that the driver works fine with serial
+ EPROMs (the 8-pin chip that defines hardware rev) as
+ old as 2.1, so we'll assume that all cards are OK.
+
+UPDATE NEWS: version 1.31 - 6 Jul 97
+
+ Fixed a bug that caused incorrect SCSI status bytes to be
+ returned from commands sent to LUNs greater than 0. This
+ means that CDROM changers work now! Fixed a bug in the
+ handling of command-line arguments when loaded as a module.
+ Also put all the header data in in2000.h where it belongs.
+ There are no longer any differences between this driver in
+ the 2.1.xx source tree and the 2.0.xx tree, as of 2.0.31
+ and 2.1.45 (or is it .46?) - this makes things much easier
+ for me...
+
+UPDATE NEWS: version 1.30 - 14 Oct 96
+
+ Fixed a bug in the code that sets the transfer direction
+ bit (DESTID_DPD in the WD_DESTINATION_ID register). There
+ are quite a few SCSI commands that do a write-to-device;
+ now we deal with all of them correctly. Thanks to Joerg
+ Dorchain for catching this one.
+
+UPDATE NEWS: version 1.29 - 24 Sep 96
+
+ The memory-mapped hardware on the card is now accessed via
+ the 'readb()' and 'readl()' macros - required by the new
+ memory management scheme in the 2.1.x kernel series.
+ As suggested by Andries Brouwer, 'bios_param()' no longer
+ forces an artificial 1023 track limit on drives. Also
+ removed some kludge-code left over from struggles with
+ older (buggy) compilers.
+
+UPDATE NEWS: version 1.28 - 07 May 96
+
+ Tightened up the "interrupts enabled/disabled" discipline
+ in 'in2000_queuecommand()' and maybe 1 or 2 other places.
+ I _think_ it may have been a little too lax, causing an
+ occasional crash during full moon. A fully functional
+ /proc interface is now in place - if you want to play
+ with it, start by doing 'cat /proc/scsi/in2000/0'. You
+ can also use it to change a few run-time parameters on
+ the fly, but it's mostly for debugging. The curious
+ should take a good look at 'in2000_proc_info()' in the
+ in2000.c file to get an understanding of what it's all
+ about; I figure that people who are really into it will
+ want to add features suited to their own needs...
+ Also, sync is now DISABLED by default.
+
+UPDATE NEWS: version 1.27 - 10 Apr 96
+
+ Fixed a well-hidden bug in the adaptive-disconnect code
+ that would show up every now and then during extreme
+ heavy loads involving 2 or more simultaneously active
+ devices. Thanks to Joe Mack for keeping my nose to the
+ grindstone on this one.
+
+UPDATE NEWS: version 1.26 - 07 Mar 96
+
+ 1.25 had a nasty bug that bit people with swap partitions
+ and tape drives. Also, in my attempt to guess my way
+ through Intel assembly language, I made an error in the
+ inline code for IO writes. Made a few other changes and
+ repairs - this version (fingers crossed) should work well.
+
+UPDATE NEWS: version 1.25 - 05 Mar 96
+
+ Kernel 1.3.70 interrupt mods added; old kernels still OK.
+ Big help from Bill Earnest and David Willmore on speed
+ testing and optimizing: I think there's a real improvement
+ in this area.
+ New! User-friendly command-line interface for LILO and
+ module loading - the old method is gone, so you'll need
+ to read the comments for 'setup_strings' near the top
+ of in2000.c. For people with CDROM's or other devices
+ that have a tough time with sync negotiation, you can
+ now selectively disable sync on individual devices -
+ search for the 'nosync' keyword in the command-line
+ comments. Some of you disable the BIOS on the card, which
+ caused the auto-detect function to fail; there is now a
+ command-line option to force detection of a ROM-less card.
+
+UPDATE NEWS: version 1.24a - 24 Feb 96
+
+ There was a bug in the synchronous transfer code. Only
+ a few people downloaded before I caught it - could have
+ been worse.
+
+UPDATE NEWS: version 1.24 - 23 Feb 96
+
+ Lots of good changes. Advice from Bill Earnest resulted
+ in much better detection of cards, more efficient usage
+ of the fifo, and (hopefully) faster data transfers. The
+ jury is still out on speed - I hope it's improved some.
+ One nifty new feature is a cool way of doing disconnect/
+ reselect. The driver defaults to what I'm calling
+ 'adaptive disconnect' - meaning that each command is
+ evaluated individually as to whether or not it should be
+ run with the option to disconnect/reselect (if the device
+ chooses), or as a "SCSI-bus-hog". When several devices
+ are operating simultaneously, disconnects are usually an
+ advantage. In a single device system, or if only 1 device
+ is being accessed, transfers usually go faster if disconnects
+ are not allowed.
+
+
+
+The default arguments (you get these when you don't give an 'in2000'
+command-line argument, or you give a blank argument) will cause
+the driver to do adaptive disconnect, synchronous transfers, and a
+minimum of debug messages. If you want to fool with the options,
+search for 'setup_strings' near the top of the in2000.c file and
+check the 'hostdata->args' section in in2000.h - but be warned! Not
+everything is working yet (some things will never work, probably).
+I believe that disabling disconnects (DIS_NEVER) will allow you
+to choose a LEVEL2 value higher than 'L2_BASIC', but I haven't
+spent a lot of time testing this. You might try 'ENABLE_CLUSTERING'
+to see what happens: my tests showed little difference either way.
+There's also a define called 'DEFAULT_SX_PER'; this sets the data
+transfer speed for the asynchronous mode. I've put it at 500 ns
+despite the fact that the card could handle settings of 376 or
+252, because higher speeds may be a problem with poor quality
+cables or improper termination; 500 ns is a compromise. You can
+choose your own default through the command-line with the
+'period' keyword.
+
+
+------------------------------------------------
+*********** DIP switch settings **************
+------------------------------------------------
+
+ sw1-1 sw1-2 BIOS address (hex)
+ -----------------------------------------
+ off off C8000 - CBFF0
+ on off D8000 - DBFF0
+ off on D0000 - D3FF0
+ on on BIOS disabled
+
+ sw1-3 sw1-4 IO port address (hex)
+ ------------------------------------
+ off off 220 - 22F
+ on off 200 - 20F
+ off on 110 - 11F
+ on on 100 - 10F
+
+ sw1-5 sw1-6 sw1-7 Interrupt
+ ------------------------------
+ off off off 15
+ off on off 14
+ off off on 11
+ off on on 10
+ on - - disabled
+
+ sw1-8 function depends on BIOS version. In earlier versions this
+ controlled synchronous data transfer support for MSDOS:
+ off = disabled
+ on = enabled
+ In later ROMs (starting with 01.3 in April 1994) sw1-8 controls
+ the "greater than 2 disk drive" feature that first appeared in
+ MSDOS 5.0 (ignored by Linux):
+ off = 2 drives maximum
+ on = 7 drives maximum
+
+ sw1-9 Floppy controller
+ --------------------------
+ off disabled
+ on enabled
+
+------------------------------------------------
+
+ I should mention that Drew Eckhardt's 'Generic NCR5380' sources
+ were my main inspiration, with lots of reference to the IN2000
+ driver currently distributed in the kernel source. I also owe
+ much to a driver written by Hamish Macdonald for Linux-m68k(!).
+ And to Eric Wright for being an ALPHA guinea pig. And to Bill
+ Earnest for 2 tons of great input and information. And to David
+ Willmore for extensive 'bonnie' testing. And to Joe Mack for
+ continual testing and feedback.
+
+
+ John Shifflett jshiffle@netcom.com
+
diff --git a/Documentation/scsi/libsas.txt b/Documentation/scsi/libsas.txt
new file mode 100644
index 0000000..aa54f54
--- /dev/null
+++ b/Documentation/scsi/libsas.txt
@@ -0,0 +1,484 @@
+SAS Layer
+---------
+
+The SAS Layer is a management infrastructure which manages
+SAS LLDDs. It sits between SCSI Core and SAS LLDDs. The
+layout is as follows: while SCSI Core is concerned with
+SAM/SPC issues, and a SAS LLDD+sequencer is concerned with
+phy/OOB/link management, the SAS layer is concerned with:
+
+ * SAS Phy/Port/HA event management (LLDD generates,
+ SAS Layer processes),
+ * SAS Port management (creation/destruction),
+ * SAS Domain discovery and revalidation,
+ * SAS Domain device management,
+ * SCSI Host registration/unregistration,
+ * Device registration with SCSI Core (SAS) or libata
+ (SATA), and
+ * Expander management and exporting expander control
+ to user space.
+
+A SAS LLDD is a PCI device driver. It is concerned with
+phy/OOB management, and vendor specific tasks and generates
+events to the SAS layer.
+
+The SAS Layer does most SAS tasks as outlined in the SAS 1.1
+spec.
+
+The sas_ha_struct describes the SAS LLDD to the SAS layer.
+Most of it is used by the SAS Layer but a few fields need to
+be initialized by the LLDDs.
+
+After initializing your hardware, from the probe() function
+you call sas_register_ha(). It will register your LLDD with
+the SCSI subsystem, creating a SCSI host and it will
+register your SAS driver with the sysfs SAS tree it creates.
+It will then return. Then you enable your phys to actually
+start OOB (at which point your driver will start calling the
+notify_* event callbacks).
+
+Structure descriptions:
+
+struct sas_phy --------------------
+Normally this is statically embedded to your driver's
+phy structure:
+ struct my_phy {
+ blah;
+ struct sas_phy sas_phy;
+ bleh;
+ };
+And then all the phys are an array of my_phy in your HA
+struct (shown below).
+
+Then as you go along and initialize your phys you also
+initialize the sas_phy struct, along with your own
+phy structure.
+
+In general, the phys are managed by the LLDD and the ports
+are managed by the SAS layer. So the phys are initialized
+and updated by the LLDD and the ports are initialized and
+updated by the SAS layer.
+
+There is a scheme where the LLDD can RW certain fields,
+and the SAS layer can only read such ones, and vice versa.
+The idea is to avoid unnecessary locking.
+
+enabled -- must be set (0/1)
+id -- must be set [0,MAX_PHYS)
+class, proto, type, role, oob_mode, linkrate -- must be set
+oob_mode -- you set this when OOB has finished and then notify
+the SAS Layer.
+
+sas_addr -- this normally points to an array holding the sas
+address of the phy, possibly somewhere in your my_phy
+struct.
+
+attached_sas_addr -- set this when you (LLDD) receive an
+IDENTIFY frame or a FIS frame, _before_ notifying the SAS
+layer. The idea is that sometimes the LLDD may want to fake
+or provide a different SAS address on that phy/port and this
+allows it to do this. At best you should copy the sas
+address from the IDENTIFY frame or maybe generate a SAS
+address for SATA directly attached devices. The Discover
+process may later change this.
+
+frame_rcvd -- this is where you copy the IDENTIFY/FIS frame
+when you get it; you lock, copy, set frame_rcvd_size and
+unlock the lock, and then call the event. It is a pointer
+since there's no way to know your hw frame size _exactly_,
+so you define the actual array in your phy struct and let
+this pointer point to it. You copy the frame from your
+DMAable memory to that area holding the lock.
+
+sas_prim -- this is where primitives go when they're
+received. See sas.h. Grab the lock, set the primitive,
+release the lock, notify.
+
+port -- this points to the sas_port if the phy belongs
+to a port -- the LLDD only reads this. It points to the
+sas_port this phy is part of. Set by the SAS Layer.
+
+ha -- may be set; the SAS layer sets it anyway.
+
+lldd_phy -- you should set this to point to your phy so you
+can find your way around faster when the SAS layer calls one
+of your callbacks and passes you a phy. If the sas_phy is
+embedded you can also use container_of -- whatever you
+prefer.
+
+
+struct sas_port --------------------
+The LLDD doesn't set any fields of this struct -- it only
+reads them. They should be self explanatory.
+
+phy_mask is 32 bit, this should be enough for now, as I
+haven't heard of a HA having more than 8 phys.
+
+lldd_port -- I haven't found use for that -- maybe other
+LLDD who wish to have internal port representation can make
+use of this.
+
+
+struct sas_ha_struct --------------------
+It normally is statically declared in your own LLDD
+structure describing your adapter:
+struct my_sas_ha {
+ blah;
+ struct sas_ha_struct sas_ha;
+ struct my_phy phys[MAX_PHYS];
+ struct sas_port sas_ports[MAX_PHYS]; /* (1) */
+ bleh;
+};
+
+(1) If your LLDD doesn't have its own port representation.
+
+What needs to be initialized (sample function given below).
+
+pcidev
+sas_addr -- since the SAS layer doesn't want to mess with
+ memory allocation, etc, this points to statically
+ allocated array somewhere (say in your host adapter
+ structure) and holds the SAS address of the host
+ adapter as given by you or the manufacturer, etc.
+sas_port
+sas_phy -- an array of pointers to structures. (see
+ note above on sas_addr).
+ These must be set. See more notes below.
+num_phys -- the number of phys present in the sas_phy array,
+ and the number of ports present in the sas_port
+ array. There can be a maximum num_phys ports (one per
+ port) so we drop the num_ports, and only use
+ num_phys.
+
+The event interface:
+
+ /* LLDD calls these to notify the class of an event. */
+ void (*notify_ha_event)(struct sas_ha_struct *, enum ha_event);
+ void (*notify_port_event)(struct sas_phy *, enum port_event);
+ void (*notify_phy_event)(struct sas_phy *, enum phy_event);
+
+When sas_register_ha() returns, those are set and can be
+called by the LLDD to notify the SAS layer of such events
+the SAS layer.
+
+The port notification:
+
+ /* The class calls these to notify the LLDD of an event. */
+ void (*lldd_port_formed)(struct sas_phy *);
+ void (*lldd_port_deformed)(struct sas_phy *);
+
+If the LLDD wants notification when a port has been formed
+or deformed it sets those to a function satisfying the type.
+
+A SAS LLDD should also implement at least one of the Task
+Management Functions (TMFs) described in SAM:
+
+ /* Task Management Functions. Must be called from process context. */
+ int (*lldd_abort_task)(struct sas_task *);
+ int (*lldd_abort_task_set)(struct domain_device *, u8 *lun);
+ int (*lldd_clear_aca)(struct domain_device *, u8 *lun);
+ int (*lldd_clear_task_set)(struct domain_device *, u8 *lun);
+ int (*lldd_I_T_nexus_reset)(struct domain_device *);
+ int (*lldd_lu_reset)(struct domain_device *, u8 *lun);
+ int (*lldd_query_task)(struct sas_task *);
+
+For more information please read SAM from T10.org.
+
+Port and Adapter management:
+
+ /* Port and Adapter management */
+ int (*lldd_clear_nexus_port)(struct sas_port *);
+ int (*lldd_clear_nexus_ha)(struct sas_ha_struct *);
+
+A SAS LLDD should implement at least one of those.
+
+Phy management:
+
+ /* Phy management */
+ int (*lldd_control_phy)(struct sas_phy *, enum phy_func);
+
+lldd_ha -- set this to point to your HA struct. You can also
+use container_of if you embedded it as shown above.
+
+A sample initialization and registration function
+can look like this (called last thing from probe())
+*but* before you enable the phys to do OOB:
+
+static int register_sas_ha(struct my_sas_ha *my_ha)
+{
+ int i;
+ static struct sas_phy *sas_phys[MAX_PHYS];
+ static struct sas_port *sas_ports[MAX_PHYS];
+
+ my_ha->sas_ha.sas_addr = &my_ha->sas_addr[0];
+
+ for (i = 0; i < MAX_PHYS; i++) {
+ sas_phys[i] = &my_ha->phys[i].sas_phy;
+ sas_ports[i] = &my_ha->sas_ports[i];
+ }
+
+ my_ha->sas_ha.sas_phy = sas_phys;
+ my_ha->sas_ha.sas_port = sas_ports;
+ my_ha->sas_ha.num_phys = MAX_PHYS;
+
+ my_ha->sas_ha.lldd_port_formed = my_port_formed;
+
+ my_ha->sas_ha.lldd_dev_found = my_dev_found;
+ my_ha->sas_ha.lldd_dev_gone = my_dev_gone;
+
+ my_ha->sas_ha.lldd_max_execute_num = lldd_max_execute_num; (1)
+
+ my_ha->sas_ha.lldd_queue_size = ha_can_queue;
+ my_ha->sas_ha.lldd_execute_task = my_execute_task;
+
+ my_ha->sas_ha.lldd_abort_task = my_abort_task;
+ my_ha->sas_ha.lldd_abort_task_set = my_abort_task_set;
+ my_ha->sas_ha.lldd_clear_aca = my_clear_aca;
+ my_ha->sas_ha.lldd_clear_task_set = my_clear_task_set;
+ my_ha->sas_ha.lldd_I_T_nexus_reset= NULL; (2)
+ my_ha->sas_ha.lldd_lu_reset = my_lu_reset;
+ my_ha->sas_ha.lldd_query_task = my_query_task;
+
+ my_ha->sas_ha.lldd_clear_nexus_port = my_clear_nexus_port;
+ my_ha->sas_ha.lldd_clear_nexus_ha = my_clear_nexus_ha;
+
+ my_ha->sas_ha.lldd_control_phy = my_control_phy;
+
+ return sas_register_ha(&my_ha->sas_ha);
+}
+
+(1) This is normally a LLDD parameter, something of the
+lines of a task collector. What it tells the SAS Layer is
+whether the SAS layer should run in Direct Mode (default:
+value 0 or 1) or Task Collector Mode (value greater than 1).
+
+In Direct Mode, the SAS Layer calls Execute Task as soon as
+it has a command to send to the SDS, _and_ this is a single
+command, i.e. not linked.
+
+Some hardware (e.g. aic94xx) has the capability to DMA more
+than one task at a time (interrupt) from host memory. Task
+Collector Mode is an optional feature for HAs which support
+this in their hardware. (Again, it is completely optional
+even if your hardware supports it.)
+
+In Task Collector Mode, the SAS Layer would do _natural_
+coalescing of tasks and at the appropriate moment it would
+call your driver to DMA more than one task in a single HA
+interrupt. DMBS may want to use this by insmod/modprobe
+setting the lldd_max_execute_num to something greater than
+1.
+
+(2) SAS 1.1 does not define I_T Nexus Reset TMF.
+
+Events
+------
+
+Events are _the only way_ a SAS LLDD notifies the SAS layer
+of anything. There is no other method or way a LLDD to tell
+the SAS layer of anything happening internally or in the SAS
+domain.
+
+Phy events:
+ PHYE_LOSS_OF_SIGNAL, (C)
+ PHYE_OOB_DONE,
+ PHYE_OOB_ERROR, (C)
+ PHYE_SPINUP_HOLD.
+
+Port events, passed on a _phy_:
+ PORTE_BYTES_DMAED, (M)
+ PORTE_BROADCAST_RCVD, (E)
+ PORTE_LINK_RESET_ERR, (C)
+ PORTE_TIMER_EVENT, (C)
+ PORTE_HARD_RESET.
+
+Host Adapter event:
+ HAE_RESET
+
+A SAS LLDD should be able to generate
+ - at least one event from group C (choice),
+ - events marked M (mandatory) are mandatory (only one),
+ - events marked E (expander) if it wants the SAS layer
+ to handle domain revalidation (only one such).
+ - Unmarked events are optional.
+
+Meaning:
+
+HAE_RESET -- when your HA got internal error and was reset.
+
+PORTE_BYTES_DMAED -- on receiving an IDENTIFY/FIS frame
+PORTE_BROADCAST_RCVD -- on receiving a primitive
+PORTE_LINK_RESET_ERR -- timer expired, loss of signal, loss
+of DWS, etc. (*)
+PORTE_TIMER_EVENT -- DWS reset timeout timer expired (*)
+PORTE_HARD_RESET -- Hard Reset primitive received.
+
+PHYE_LOSS_OF_SIGNAL -- the device is gone (*)
+PHYE_OOB_DONE -- OOB went fine and oob_mode is valid
+PHYE_OOB_ERROR -- Error while doing OOB, the device probably
+got disconnected. (*)
+PHYE_SPINUP_HOLD -- SATA is present, COMWAKE not sent.
+
+(*) should set/clear the appropriate fields in the phy,
+ or alternatively call the inlined sas_phy_disconnected()
+ which is just a helper, from their tasklet.
+
+The Execute Command SCSI RPC:
+
+ int (*lldd_execute_task)(struct sas_task *, int num,
+ unsigned long gfp_flags);
+
+Used to queue a task to the SAS LLDD. @task is the tasks to
+be executed. @num should be the number of tasks being
+queued at this function call (they are linked listed via
+task::list), @gfp_mask should be the gfp_mask defining the
+context of the caller.
+
+This function should implement the Execute Command SCSI RPC,
+or if you're sending a SCSI Task as linked commands, you
+should also use this function.
+
+That is, when lldd_execute_task() is called, the command(s)
+go out on the transport *immediately*. There is *no*
+queuing of any sort and at any level in a SAS LLDD.
+
+The use of task::list is two-fold, one for linked commands,
+the other discussed below.
+
+It is possible to queue up more than one task at a time, by
+initializing the list element of struct sas_task, and
+passing the number of tasks enlisted in this manner in num.
+
+Returns: -SAS_QUEUE_FULL, -ENOMEM, nothing was queued;
+ 0, the task(s) were queued.
+
+If you want to pass num > 1, then either
+A) you're the only caller of this function and keep track
+ of what you've queued to the LLDD, or
+B) you know what you're doing and have a strategy of
+ retrying.
+
+As opposed to queuing one task at a time (function call),
+batch queuing of tasks, by having num > 1, greatly
+simplifies LLDD code, sequencer code, and _hardware design_,
+and has some performance advantages in certain situations
+(DBMS).
+
+The LLDD advertises if it can take more than one command at
+a time at lldd_execute_task(), by setting the
+lldd_max_execute_num parameter (controlled by "collector"
+module parameter in aic94xx SAS LLDD).
+
+You should leave this to the default 1, unless you know what
+you're doing.
+
+This is a function of the LLDD, to which the SAS layer can
+cater to.
+
+int lldd_queue_size
+ The host adapter's queue size. This is the maximum
+number of commands the lldd can have pending to domain
+devices on behalf of all upper layers submitting through
+lldd_execute_task().
+
+You really want to set this to something (much) larger than
+1.
+
+This _really_ has absolutely nothing to do with queuing.
+There is no queuing in SAS LLDDs.
+
+struct sas_task {
+ dev -- the device this task is destined to
+ list -- must be initialized (INIT_LIST_HEAD)
+ task_proto -- _one_ of enum sas_proto
+ scatter -- pointer to scatter gather list array
+ num_scatter -- number of elements in scatter
+ total_xfer_len -- total number of bytes expected to be transferred
+ data_dir -- PCI_DMA_...
+ task_done -- callback when the task has finished execution
+};
+
+When an external entity, entity other than the LLDD or the
+SAS Layer, wants to work with a struct domain_device, it
+_must_ call kobject_get() when getting a handle on the
+device and kobject_put() when it is done with the device.
+
+This does two things:
+ A) implements proper kfree() for the device;
+ B) increments/decrements the kref for all players:
+ domain_device
+ all domain_device's ... (if past an expander)
+ port
+ host adapter
+ pci device
+ and up the ladder, etc.
+
+DISCOVERY
+---------
+
+The sysfs tree has the following purposes:
+ a) It shows you the physical layout of the SAS domain at
+ the current time, i.e. how the domain looks in the
+ physical world right now.
+ b) Shows some device parameters _at_discovery_time_.
+
+This is a link to the tree(1) program, very useful in
+viewing the SAS domain:
+ftp://mama.indstate.edu/linux/tree/
+I expect user space applications to actually create a
+graphical interface of this.
+
+That is, the sysfs domain tree doesn't show or keep state if
+you e.g., change the meaning of the READY LED MEANING
+setting, but it does show you the current connection status
+of the domain device.
+
+Keeping internal device state changes is responsibility of
+upper layers (Command set drivers) and user space.
+
+When a device or devices are unplugged from the domain, this
+is reflected in the sysfs tree immediately, and the device(s)
+removed from the system.
+
+The structure domain_device describes any device in the SAS
+domain. It is completely managed by the SAS layer. A task
+points to a domain device, this is how the SAS LLDD knows
+where to send the task(s) to. A SAS LLDD only reads the
+contents of the domain_device structure, but it never creates
+or destroys one.
+
+Expander management from User Space
+-----------------------------------
+
+In each expander directory in sysfs, there is a file called
+"smp_portal". It is a binary sysfs attribute file, which
+implements an SMP portal (Note: this is *NOT* an SMP port),
+to which user space applications can send SMP requests and
+receive SMP responses.
+
+Functionality is deceptively simple:
+
+1. Build the SMP frame you want to send. The format and layout
+ is described in the SAS spec. Leave the CRC field equal 0.
+open(2)
+2. Open the expander's SMP portal sysfs file in RW mode.
+write(2)
+3. Write the frame you built in 1.
+read(2)
+4. Read the amount of data you expect to receive for the frame you built.
+ If you receive different amount of data you expected to receive,
+ then there was some kind of error.
+close(2)
+All this process is shown in detail in the function do_smp_func()
+and its callers, in the file "expander_conf.c".
+
+The kernel functionality is implemented in the file
+"sas_expander.c".
+
+The program "expander_conf.c" implements this. It takes one
+argument, the sysfs file name of the SMP portal to the
+expander, and gives expander information, including routing
+tables.
+
+The SMP portal gives you complete control of the expander,
+so please be careful.
diff --git a/Documentation/scsi/link_power_management_policy.txt b/Documentation/scsi/link_power_management_policy.txt
new file mode 100644
index 0000000..d18993d
--- /dev/null
+++ b/Documentation/scsi/link_power_management_policy.txt
@@ -0,0 +1,19 @@
+This parameter allows the user to set the link (interface) power management.
+There are 3 possible options:
+
+Value Effect
+----------------------------------------------------------------------------
+min_power Tell the controller to try to make the link use the
+ least possible power when possible. This may
+ sacrifice some performance due to increased latency
+ when coming out of lower power states.
+
+max_performance Generally, this means no power management. Tell
+ the controller to have performance be a priority
+ over power management.
+
+medium_power Tell the controller to enter a lower power state
+ when possible, but do not enter the lowest power
+ state, thus improving latency over min_power setting.
+
+
diff --git a/Documentation/scsi/lpfc.txt b/Documentation/scsi/lpfc.txt
new file mode 100644
index 0000000..5741ea8
--- /dev/null
+++ b/Documentation/scsi/lpfc.txt
@@ -0,0 +1,83 @@
+
+LPFC Driver Release Notes:
+
+=============================================================================
+
+
+ IMPORTANT:
+
+ Starting in the 8.0.17 release, the driver began to be targeted strictly
+ toward the upstream kernel. As such, we removed #ifdefs for older kernels
+ (pre 2.6.10). The 8.0.16 release should be used if the driver is to be
+ run on one of the older kernels.
+
+ The proposed modifications to the transport layer for FC remote ports
+ and extended attribute support is now part of the upstream kernel
+ as of 2.6.12. We no longer need to provide patches for this support,
+ nor a *full* version which has old an new kernel support.
+
+ The driver now requires a 2.6.12 (if pre-release, 2.6.12-rc1) or later
+ kernel.
+
+ Please heed these dependencies....
+
+
+ ********************************************************************
+
+
+The following information is provided for additional background on the
+history of the driver as we push for upstream acceptance.
+
+Cable pull and temporary device Loss:
+
+ In older revisions of the lpfc driver, the driver internally queued i/o
+ received from the midlayer. In the cases where a cable was pulled, link
+ jitter, or a device temporarily loses connectivity (due to its cable
+ being removed, a switch rebooting, or a device reboot), the driver could
+ hide the disappearance of the device from the midlayer. I/O's issued to
+ the LLDD would simply be queued for a short duration, allowing the device
+ to reappear or link come back alive, with no inadvertent side effects
+ to the system. If the driver did not hide these conditions, i/o would be
+ errored by the driver, the mid-layer would exhaust its retries, and the
+ device would be taken offline. Manual intervention would be required to
+ re-enable the device.
+
+ The community supporting kernel.org has driven an effort to remove
+ internal queuing from all LLDDs. The philosophy is that internal
+ queuing is unnecessary as the block layer already performs the
+ queuing. Removing the queues from the LLDD makes a more predictable
+ and more simple LLDD.
+
+ As a potential new addition to kernel.org, the 8.x driver was asked to
+ have all internal queuing removed. Emulex complied with this request.
+ In explaining the impacts of this change, Emulex has worked with the
+ community in modifying the behavior of the SCSI midlayer so that SCSI
+ devices can be temporarily suspended while transport events (such as
+ those described) can occur.
+
+ The proposed patch was posted to the linux-scsi mailing list. The patch
+ is contained in the 2.6.10-rc2 (and later) patch kits. As such, this
+ patch is part of the standard 2.6.10 kernel.
+
+ By default, the driver expects the patches for block/unblock interfaces
+ to be present in the kernel. No #define needs to be set to enable support.
+
+
+Kernel Support
+
+ This source package is targeted for the upstream kernel only. (See notes
+ at the top of this file). It relies on interfaces that are slowing
+ migrating into the kernel.org kernel.
+
+ At this time, the driver requires the 2.6.12 (if pre-release, 2.6.12-rc1)
+ kernel.
+
+ If a driver is needed for older kernels please utilize the 8.0.16
+ driver sources.
+
+
+Patches
+
+ Thankfully, at this time, patches are not needed.
+
+
diff --git a/Documentation/scsi/megaraid.txt b/Documentation/scsi/megaraid.txt
new file mode 100644
index 0000000..3c7cea5
--- /dev/null
+++ b/Documentation/scsi/megaraid.txt
@@ -0,0 +1,70 @@
+ Notes on Management Module
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Overview:
+--------
+
+Different classes of controllers from LSI Logic accept and respond to the
+user applications in a similar way. They understand the same firmware control
+commands. Furthermore, the applications also can treat different classes of
+the controllers uniformly. Hence it is logical to have a single module that
+interfaces with the applications on one side and all the low level drivers
+on the other.
+
+The advantages, though obvious, are listed for completeness:
+
+ i. Avoid duplicate code from the low level drivers.
+ ii. Unburden the low level drivers from having to export the
+ character node device and related handling.
+ iii. Implement any policy mechanisms in one place.
+ iv. Applications have to interface with only module instead of
+ multiple low level drivers.
+
+Currently this module (called Common Management Module) is used only to issue
+ioctl commands. But this module is envisioned to handle all user space level
+interactions. So any 'proc', 'sysfs' implementations will be localized in this
+common module.
+
+Credits:
+-------
+
+"Shared code in a third module, a "library module", is an acceptable
+solution. modprobe automatically loads dependent modules, so users
+running "modprobe driver1" or "modprobe driver2" would automatically
+load the shared library module."
+
+ - Jeff Garzik (jgarzik@pobox.com), 02.25.2004 LKML
+
+"As Jeff hinted, if your userspace<->driver API is consistent between
+your new MPT-based RAID controllers and your existing megaraid driver,
+then perhaps you need a single small helper module (lsiioctl or some
+better name), loaded by both mptraid and megaraid automatically, which
+handles registering the /dev/megaraid node dynamically. In this case,
+both mptraid and megaraid would register with lsiioctl for each
+adapter discovered, and lsiioctl would essentially be a switch,
+redirecting userspace tool ioctls to the appropriate driver."
+
+ - Matt Domsch, (Matt_Domsch@dell.com), 02.25.2004 LKML
+
+Design:
+------
+
+The Common Management Module is implemented in megaraid_mm.[ch] files. This
+module acts as a registry for low level hba drivers. The low level drivers
+(currently only megaraid) register each controller with the common module.
+
+The applications interface with the common module via the character device
+node exported by the module.
+
+The lower level drivers now understand only a new improved ioctl packet called
+uioc_t. The management module converts the older ioctl packets from the older
+applications into uioc_t. After driver handles the uioc_t, the common module
+will convert that back into the old format before returning to applications.
+
+As new applications evolve and replace the old ones, the old packet format
+will be retired.
+
+Common module dedicates one uioc_t packet to each controller registered. This
+can easily be more than one. But since megaraid is the only low level driver
+today, and it can handle only one ioctl, there is no reason to have more. But
+as new controller classes get added, this will be tuned appropriately.
diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt
new file mode 100644
index 0000000..230e308
--- /dev/null
+++ b/Documentation/scsi/ncr53c8xx.txt
@@ -0,0 +1,1849 @@
+The Linux NCR53C8XX/SYM53C8XX drivers README file
+
+Written by Gerard Roudier <groudier@free.fr>
+21 Rue Carnot
+95170 DEUIL LA BARRE - FRANCE
+
+29 May 1999
+===============================================================================
+
+1. Introduction
+2. Supported chips and SCSI features
+3. Advantages of the enhanced 896 driver
+ 3.1 Optimized SCSI SCRIPTS
+ 3.2 New features of the SYM53C896 (64 bit PCI dual LVD SCSI controller)
+4. Memory mapped I/O versus normal I/O
+5. Tagged command queueing
+6. Parity checking
+7. Profiling information
+8. Control commands
+ 8.1 Set minimum synchronous period
+ 8.2 Set wide size
+ 8.3 Set maximum number of concurrent tagged commands
+ 8.4 Set order type for tagged command
+ 8.5 Set debug mode
+ 8.6 Clear profile counters
+ 8.7 Set flag (no_disc)
+ 8.8 Set verbose level
+ 8.9 Reset all logical units of a target
+ 8.10 Abort all tasks of all logical units of a target
+9. Configuration parameters
+10. Boot setup commands
+ 10.1 Syntax
+ 10.2 Available arguments
+ 10.2.1 Master parity checking
+ 10.2.2 Scsi parity checking
+ 10.2.3 Scsi disconnections
+ 10.2.4 Special features
+ 10.2.5 Ultra SCSI support
+ 10.2.6 Default number of tagged commands
+ 10.2.7 Default synchronous period factor
+ 10.2.8 Negotiate synchronous with all devices
+ 10.2.9 Verbosity level
+ 10.2.10 Debug mode
+ 10.2.11 Burst max
+ 10.2.12 LED support
+ 10.2.13 Max wide
+ 10.2.14 Differential mode
+ 10.2.15 IRQ mode
+ 10.2.16 Reverse probe
+ 10.2.17 Fix up PCI configuration space
+ 10.2.18 Serial NVRAM
+ 10.2.19 Check SCSI BUS
+ 10.2.20 Exclude a host from being attached
+ 10.2.21 Suggest a default SCSI id for hosts
+ 10.2.22 Enable use of IMMEDIATE ARBITRATION
+ 10.3 Advised boot setup commands
+ 10.4 PCI configuration fix-up boot option
+ 10.5 Serial NVRAM support boot option
+ 10.6 SCSI BUS checking boot option
+ 10.7 IMMEDIATE ARBITRATION boot option
+11. Some constants and flags of the ncr53c8xx.h header file
+12. Installation
+13. Architecture dependent features
+14. Known problems
+ 14.1 Tagged commands with Iomega Jaz device
+ 14.2 Device names change when another controller is added
+ 14.3 Using only 8 bit devices with a WIDE SCSI controller.
+ 14.4 Possible data corruption during a Memory Write and Invalidate
+ 14.5 IRQ sharing problems
+15. SCSI problem troubleshooting
+ 15.1 Problem tracking
+ 15.2 Understanding hardware error reports
+16. Synchronous transfer negotiation tables
+ 16.1 Synchronous timings for 53C875 and 53C860 Ultra-SCSI controllers
+ 16.2 Synchronous timings for fast SCSI-2 53C8XX controllers
+17. Serial NVRAM support (by Richard Waltham)
+ 17.1 Features
+ 17.2 Symbios NVRAM layout
+ 17.3 Tekram NVRAM layout
+18. Support for Big Endian
+ 18.1 Big Endian CPU
+ 18.2 NCR chip in Big Endian mode of operations
+
+===============================================================================
+
+1. Introduction
+
+The initial Linux ncr53c8xx driver has been a port of the ncr driver from
+FreeBSD that has been achieved in November 1995 by:
+ Gerard Roudier <groudier@free.fr>
+
+The original driver has been written for 386bsd and FreeBSD by:
+ Wolfgang Stanglmeier <wolf@cologne.de>
+ Stefan Esser <se@mi.Uni-Koeln.de>
+
+It is now available as a bundle of 2 drivers:
+
+- ncr53c8xx generic driver that supports all the SYM53C8XX family including
+ the earliest 810 rev. 1, the latest 896 (2 channel LVD SCSI controller) and
+ the new 895A (1 channel LVD SCSI controller).
+- sym53c8xx enhanced driver (a.k.a. 896 drivers) that drops support of oldest
+ chips in order to gain advantage of new features, as LOAD/STORE instructions
+ available since the 810A and hardware phase mismatch available with the
+ 896 and the 895A.
+
+You can find technical information about the NCR 8xx family in the
+PCI-HOWTO written by Michael Will and in the SCSI-HOWTO written by
+Drew Eckhardt.
+
+Information about new chips is available at LSILOGIC web server:
+
+ http://www.lsilogic.com/
+
+SCSI standard documentations are available at SYMBIOS ftp server:
+
+ ftp://ftp.symbios.com/
+
+Useful SCSI tools written by Eric Youngdale are available at tsx-11:
+
+ ftp://tsx-11.mit.edu/pub/linux/ALPHA/scsi/scsiinfo-X.Y.tar.gz
+ ftp://tsx-11.mit.edu/pub/linux/ALPHA/scsi/scsidev-X.Y.tar.gz
+
+These tools are not ALPHA but quite clean and work quite well.
+It is essential you have the 'scsiinfo' package.
+
+This short documentation describes the features of the generic and enhanced
+drivers, configuration parameters and control commands available through
+the proc SCSI file system read / write operations.
+
+This driver has been tested OK with linux/i386, Linux/Alpha and Linux/PPC.
+
+Latest driver version and patches are available at:
+
+ ftp://ftp.tux.org/pub/people/gerard-roudier
+or
+ ftp://ftp.symbios.com/mirror/ftp.tux.org/pub/tux/roudier/drivers
+
+I am not a native speaker of English and there are probably lots of
+mistakes in this README file. Any help will be welcome.
+
+
+2. Supported chips and SCSI features
+
+The following features are supported for all chips:
+
+ Synchronous negotiation
+ Disconnection
+ Tagged command queuing
+ SCSI parity checking
+ Master parity checking
+
+"Wide negotiation" is supported for chips that allow it. The
+following table shows some characteristics of NCR 8xx family chips
+and what drivers support them.
+
+ Supported by Supported by
+ On board the generic the enhanced
+Chip SDMS BIOS Wide SCSI std. Max. sync driver driver
+---- --------- ---- --------- ---------- ------------ -------------
+810 N N FAST10 10 MB/s Y N
+810A N N FAST10 10 MB/s Y Y
+815 Y N FAST10 10 MB/s Y N
+825 Y Y FAST10 20 MB/s Y N
+825A Y Y FAST10 20 MB/s Y Y
+860 N N FAST20 20 MB/s Y Y
+875 Y Y FAST20 40 MB/s Y Y
+876 Y Y FAST20 40 MB/s Y Y
+895 Y Y FAST40 80 MB/s Y Y
+895A Y Y FAST40 80 MB/s Y Y
+896 Y Y FAST40 80 MB/s Y Y
+897 Y Y FAST40 80 MB/s Y Y
+1510D Y Y FAST40 80 MB/s Y Y
+1010 Y Y FAST80 160 MB/s N Y
+1010_66* Y Y FAST80 160 MB/s N Y
+
+* Chip supports 33MHz and 66MHz PCI buses.
+
+
+Summary of other supported features:
+
+Module: allow to load the driver
+Memory mapped I/O: increases performance
+Profiling information: read operations from the proc SCSI file system
+Control commands: write operations to the proc SCSI file system
+Debugging information: written to syslog (expert only)
+Scatter / gather
+Shared interrupt
+Boot setup commands
+Serial NVRAM: Symbios and Tekram formats
+
+
+3. Advantages of the enhanced 896 driver
+
+3.1 Optimized SCSI SCRIPTS.
+
+The 810A, 825A, 875, 895, 896 and 895A support new SCSI SCRIPTS instructions
+named LOAD and STORE that allow to move up to 1 DWORD from/to an IO register
+to/from memory much faster that the MOVE MEMORY instruction that is supported
+by the 53c7xx and 53c8xx family.
+The LOAD/STORE instructions support absolute and DSA relative addressing
+modes. The SCSI SCRIPTS had been entirely rewritten using LOAD/STORE instead
+of MOVE MEMORY instructions.
+
+3.2 New features of the SYM53C896 (64 bit PCI dual LVD SCSI controller)
+
+The 896 and the 895A allows handling of the phase mismatch context from
+SCRIPTS (avoids the phase mismatch interrupt that stops the SCSI processor
+until the C code has saved the context of the transfer).
+Implementing this without using LOAD/STORE instructions would be painfull
+and I didn't even want to try it.
+
+The 896 chip supports 64 bit PCI transactions and addressing, while the
+895A supports 32 bit PCI transactions and 64 bit addressing.
+The SCRIPTS processor of these chips is not true 64 bit, but uses segment
+registers for bit 32-63. Another interesting feature is that LOAD/STORE
+instructions that address the on-chip RAM (8k) remain internal to the chip.
+
+Due to the use of LOAD/STORE SCRIPTS instructions, this driver does not
+support the following chips:
+- SYM53C810 revision < 0x10 (16)
+- SYM53C815 all revisions
+- SYM53C825 revision < 0x10 (16)
+
+4. Memory mapped I/O versus normal I/O
+
+Memory mapped I/O has less latency than normal I/O. Since
+linux-1.3.x, memory mapped I/O is used rather than normal I/O. Memory
+mapped I/O seems to work fine on most hardware configurations, but
+some poorly designed motherboards may break this feature.
+
+The configuration option CONFIG_SCSI_NCR53C8XX_IOMAPPED forces the
+driver to use normal I/O in all cases.
+
+
+5. Tagged command queueing
+
+Queuing more than 1 command at a time to a device allows it to perform
+optimizations based on actual head positions and its mechanical
+characteristics. This feature may also reduce average command latency.
+In order to really gain advantage of this feature, devices must have
+a reasonable cache size (No miracle is to be expected for a low-end
+hard disk with 128 KB or less).
+Some kown SCSI devices do not properly support tagged command queuing.
+Generally, firmware revisions that fix this kind of problems are available
+at respective vendor web/ftp sites.
+All I can say is that the hard disks I use on my machines behave well with
+this driver with tagged command queuing enabled:
+
+- IBM S12 0662
+- Conner 1080S
+- Quantum Atlas I
+- Quantum Atlas II
+
+If your controller has NVRAM, you can configure this feature per target
+from the user setup tool. The Tekram Setup program allows to tune the
+maximum number of queued commands up to 32. The Symbios Setup only allows
+to enable or disable this feature.
+
+The maximum number of simultaneous tagged commands queued to a device
+is currently set to 8 by default. This value is suitable for most SCSI
+disks. With large SCSI disks (>= 2GB, cache >= 512KB, average seek time
+<= 10 ms), using a larger value may give better performances.
+
+The sym53c8xx driver supports up to 255 commands per device, and the
+generic ncr53c8xx driver supports up to 64, but using more than 32 is
+generally not worth-while, unless you are using a very large disk or disk
+array. It is noticeable that most of recent hard disks seem not to accept
+more than 64 simultaneous commands. So, using more than 64 queued commands
+is probably just resource wasting.
+
+If your controller does not have NVRAM or if it is managed by the SDMS
+BIOS/SETUP, you can configure tagged queueing feature and device queue
+depths from the boot command-line. For example:
+
+ ncr53c8xx=tags:4/t2t3q15-t4q7/t1u0q32
+
+will set tagged commands queue depths as follow:
+
+- target 2 all luns on controller 0 --> 15
+- target 3 all luns on controller 0 --> 15
+- target 4 all luns on controller 0 --> 7
+- target 1 lun 0 on controller 1 --> 32
+- all other target/lun --> 4
+
+In some special conditions, some SCSI disk firmwares may return a
+QUEUE FULL status for a SCSI command. This behaviour is managed by the
+driver using the following heuristic:
+
+- Each time a QUEUE FULL status is returned, tagged queue depth is reduced
+ to the actual number of disconnected commands.
+
+- Every 1000 successfully completed SCSI commands, if allowed by the
+ current limit, the maximum number of queueable commands is incremented.
+
+Since QUEUE FULL status reception and handling is resource wasting, the
+driver notifies by default this problem to user by indicating the actual
+number of commands used and their status, as well as its decision on the
+device queue depth change.
+The heuristic used by the driver in handling QUEUE FULL ensures that the
+impact on performances is not too bad. You can get rid of the messages by
+setting verbose level to zero, as follow:
+
+1st method: boot your system using 'ncr53c8xx=verb:0' option.
+2nd method: apply "setverbose 0" control command to the proc fs entry
+ corresponding to your controller after boot-up.
+
+6. Parity checking
+
+The driver supports SCSI parity checking and PCI bus master parity
+checking. These features must be enabled in order to ensure safe data
+transfers. However, some flawed devices or mother boards will have
+problems with parity. You can disable either PCI parity or SCSI parity
+checking by entering appropriate options from the boot command line.
+(See 10: Boot setup commands).
+
+7. Profiling information
+
+Profiling information is available through the proc SCSI file system.
+Since gathering profiling information may impact performances, this
+feature is disabled by default and requires a compilation configuration
+option to be set to Y.
+
+The device associated with a host has the following pathname:
+
+ /proc/scsi/ncr53c8xx/N (N=0,1,2 ....)
+
+Generally, only 1 board is used on hardware configuration, and that device is:
+ /proc/scsi/ncr53c8xx/0
+
+However, if the driver has been made as module, the number of the
+hosts is incremented each time the driver is loaded.
+
+In order to display profiling information, just enter:
+
+ cat /proc/scsi/ncr53c8xx/0
+
+and you will get something like the following text:
+
+-------------------------------------------------------
+General information:
+ Chip NCR53C810, device id 0x1, revision id 0x2
+ IO port address 0x6000, IRQ number 10
+ Using memory mapped IO at virtual address 0x282c000
+ Synchronous transfer period 25, max commands per lun 4
+Profiling information:
+ num_trans = 18014
+ num_kbytes = 671314
+ num_disc = 25763
+ num_break = 1673
+ num_int = 1685
+ num_fly = 18038
+ ms_setup = 4940
+ ms_data = 369940
+ ms_disc = 183090
+ ms_post = 1320
+-------------------------------------------------------
+
+General information is easy to understand. The device ID and the
+revision ID identify the SCSI chip as follows:
+
+Chip Device id Revision Id
+---- --------- -----------
+810 0x1 < 0x10
+810A 0x1 >= 0x10
+815 0x4
+825 0x3 < 0x10
+860 0x6
+825A 0x3 >= 0x10
+875 0xf
+895 0xc
+
+The profiling information is updated upon completion of SCSI commands.
+A data structure is allocated and zeroed when the host adapter is
+attached. So, if the driver is a module, the profile counters are
+cleared each time the driver is loaded. The "clearprof" command
+allows you to clear these counters at any time.
+
+The following counters are available:
+
+("num" prefix means "number of",
+"ms" means milli-seconds)
+
+num_trans
+ Number of completed commands
+ Example above: 18014 completed commands
+
+num_kbytes
+ Number of kbytes transferred
+ Example above: 671 MB transferred
+
+num_disc
+ Number of SCSI disconnections
+ Example above: 25763 SCSI disconnections
+
+num_break
+ number of script interruptions (phase mismatch)
+ Example above: 1673 script interruptions
+
+num_int
+ Number of interrupts other than "on the fly"
+ Example above: 1685 interruptions not "on the fly"
+
+num_fly
+ Number of interrupts "on the fly"
+ Example above: 18038 interruptions "on the fly"
+
+ms_setup
+ Elapsed time for SCSI commands setups
+ Example above: 4.94 seconds
+
+ms_data
+ Elapsed time for data transfers
+ Example above: 369.94 seconds spent for data transfer
+
+ms_disc
+ Elapsed time for SCSI disconnections
+ Example above: 183.09 seconds spent disconnected
+
+ms_post
+ Elapsed time for command post processing
+ (time from SCSI status get to command completion call)
+ Example above: 1.32 seconds spent for post processing
+
+Due to the 1/100 second tick of the system clock, "ms_post" time may
+be wrong.
+
+In the example above, we got 18038 interrupts "on the fly" and only
+1673 script breaks generally due to disconnections inside a segment
+of the scatter list.
+
+
+8. Control commands
+
+Control commands can be sent to the driver with write operations to
+the proc SCSI file system. The generic command syntax is the
+following:
+
+ echo "<verb> <parameters>" >/proc/scsi/ncr53c8xx/0
+ (assumes controller number is 0)
+
+Using "all" for "<target>" parameter with the commands below will
+apply to all targets of the SCSI chain (except the controller).
+
+Available commands:
+
+8.1 Set minimum synchronous period factor
+
+ setsync <target> <period factor>
+
+ target: target number
+ period: minimum synchronous period.
+ Maximum speed = 1000/(4*period factor) except for special
+ cases below.
+
+ Specify a period of 255, to force asynchronous transfer mode.
+
+ 10 means 25 nano-seconds synchronous period
+ 11 means 30 nano-seconds synchronous period
+ 12 means 50 nano-seconds synchronous period
+
+8.2 Set wide size
+
+ setwide <target> <size>
+
+ target: target number
+ size: 0=8 bits, 1=16bits
+
+8.3 Set maximum number of concurrent tagged commands
+
+ settags <target> <tags>
+
+ target: target number
+ tags: number of concurrent tagged commands
+ must not be greater than SCSI_NCR_MAX_TAGS (default: 8)
+
+8.4 Set order type for tagged command
+
+ setorder <order>
+
+ order: 3 possible values:
+ simple: use SIMPLE TAG for all operations (read and write)
+ ordered: use ORDERED TAG for all operations
+ default: use default tag type,
+ SIMPLE TAG for read operations
+ ORDERED TAG for write operations
+
+
+8.5 Set debug mode
+
+ setdebug <list of debug flags>
+
+ Available debug flags:
+ alloc: print info about memory allocations (ccb, lcb)
+ queue: print info about insertions into the command start queue
+ result: print sense data on CHECK CONDITION status
+ scatter: print info about the scatter process
+ scripts: print info about the script binding process
+ tiny: print minimal debugging information
+ timing: print timing information of the NCR chip
+ nego: print information about SCSI negotiations
+ phase: print information on script interruptions
+
+ Use "setdebug" with no argument to reset debug flags.
+
+
+8.6 Clear profile counters
+
+ clearprof
+
+ The profile counters are automatically cleared when the amount of
+ data transferred reaches 1000 GB in order to avoid overflow.
+ The "clearprof" command allows you to clear these counters at any time.
+
+
+8.7 Set flag (no_disc)
+
+ setflag <target> <flag>
+
+ target: target number
+
+ For the moment, only one flag is available:
+
+ no_disc: not allow target to disconnect.
+
+ Do not specify any flag in order to reset the flag. For example:
+ - setflag 4
+ will reset no_disc flag for target 4, so will allow it disconnections.
+ - setflag all
+ will allow disconnection for all devices on the SCSI bus.
+
+
+8.8 Set verbose level
+
+ setverbose #level
+
+ The driver default verbose level is 1. This command allows to change
+ th driver verbose level after boot-up.
+
+8.9 Reset all logical units of a target
+
+ resetdev <target>
+
+ target: target number
+ The driver will try to send a BUS DEVICE RESET message to the target.
+ (Only supported by the SYM53C8XX driver and provided for test purpose)
+
+8.10 Abort all tasks of all logical units of a target
+
+ cleardev <target>
+
+ target: target number
+ The driver will try to send a ABORT message to all the logical units
+ of the target.
+ (Only supported by the SYM53C8XX driver and provided for test purpose)
+
+
+9. Configuration parameters
+
+If the firmware of all your devices is perfect enough, all the
+features supported by the driver can be enabled at start-up. However,
+if only one has a flaw for some SCSI feature, you can disable the
+support by the driver of this feature at linux start-up and enable
+this feature after boot-up only for devices that support it safely.
+
+CONFIG_SCSI_NCR53C8XX_IOMAPPED (default answer: n)
+ Answer "y" if you suspect your mother board to not allow memory mapped I/O.
+ May slow down performance a little. This option is required by
+ Linux/PPC and is used no matter what you select here. Linux/PPC
+ suffers no performance loss with this option since all IO is memory
+ mapped anyway.
+
+CONFIG_SCSI_NCR53C8XX_DEFAULT_TAGS (default answer: 8)
+ Default tagged command queue depth.
+
+CONFIG_SCSI_NCR53C8XX_MAX_TAGS (default answer: 8)
+ This option allows you to specify the maximum number of tagged commands
+ that can be queued to a device. The maximum supported value is 32.
+
+CONFIG_SCSI_NCR53C8XX_SYNC (default answer: 5)
+ This option allows you to specify the frequency in MHz the driver
+ will use at boot time for synchronous data transfer negotiations.
+ This frequency can be changed later with the "setsync" control command.
+ 0 means "asynchronous data transfers".
+
+CONFIG_SCSI_NCR53C8XX_FORCE_SYNC_NEGO (default answer: n)
+ Force synchronous negotiation for all SCSI-2 devices.
+ Some SCSI-2 devices do not report this feature in byte 7 of inquiry
+ response but do support it properly (TAMARACK scanners for example).
+
+CONFIG_SCSI_NCR53C8XX_NO_DISCONNECT (default and only reasonable answer: n)
+ If you suspect a device of yours does not properly support disconnections,
+ you can answer "y". Then, all SCSI devices will never disconnect the bus
+ even while performing long SCSI operations.
+
+CONFIG_SCSI_NCR53C8XX_SYMBIOS_COMPAT
+ Genuine SYMBIOS boards use GPIO0 in output for controller LED and GPIO3
+ bit as a flag indicating singled-ended/differential interface.
+ If all the boards of your system are genuine SYMBIOS boards or use
+ BIOS and drivers from SYMBIOS, you would want to enable this option.
+ This option must NOT be enabled if your system has at least one 53C8XX
+ based scsi board with a vendor-specific BIOS.
+ For example, Tekram DC-390/U, DC-390/W and DC-390/F scsi controllers
+ use a vendor-specific BIOS and are known to not use SYMBIOS compatible
+ GPIO wiring. So, this option must not be enabled if your system has
+ such a board installed.
+
+CONFIG_SCSI_NCR53C8XX_NVRAM_DETECT
+ Enable support for reading the serial NVRAM data on Symbios and
+ some Symbios compatible cards, and Tekram DC390W/U/F cards. Useful for
+ systems with more than one Symbios compatible controller where at least
+ one has a serial NVRAM, or for a system with a mixture of Symbios and
+ Tekram cards. Enables setting the boot order of host adaptors
+ to something other than the default order or "reverse probe" order.
+ Also enables Symbios and Tekram cards to be distinguished so
+ CONFIG_SCSI_NCR53C8XX_SYMBIOS_COMPAT may be set in a system with a
+ mixture of Symbios and Tekram cards so the Symbios cards can make use of
+ the full range of Symbios features, differential, led pin, without
+ causing problems for the Tekram card(s).
+
+10. Boot setup commands
+
+10.1 Syntax
+
+Setup commands can be passed to the driver either at boot time or as a
+string variable using 'insmod'.
+
+A boot setup command for the ncr53c8xx (sym53c8xx) driver begins with the
+driver name "ncr53c8xx="(sym53c8xx). The kernel syntax parser then expects
+an optional list of integers separated with comma followed by an optional
+list of comma-separated strings. Example of boot setup command under lilo
+prompt:
+
+lilo: linux root=/dev/hda2 ncr53c8xx=tags:4,sync:10,debug:0x200
+
+- enable tagged commands, up to 4 tagged commands queued.
+- set synchronous negotiation speed to 10 Mega-transfers / second.
+- set DEBUG_NEGO flag.
+
+Since comma seems not to be allowed when defining a string variable using
+'insmod', the driver also accepts <space> as option separator.
+The following command will install driver module with the same options as
+above.
+
+ insmod ncr53c8xx.o ncr53c8xx="tags:4 sync:10 debug:0x200"
+
+For the moment, the integer list of arguments is discarded by the driver.
+It will be used in the future in order to allow a per controller setup.
+
+Each string argument must be specified as "keyword:value". Only lower-case
+characters and digits are allowed.
+
+In a system that contains multiple 53C8xx adapters insmod will install the
+specified driver on each adapter. To exclude a chip use the 'excl' keyword.
+
+The sequence of commands,
+
+ insmod sym53c8xx sym53c8xx=excl:0x1400
+ insmod ncr53c8xx
+
+installs the sym53c8xx driver on all adapters except the one at IO port
+address 0x1400 and then installs the ncr53c8xx driver to the adapter at IO
+port address 0x1400.
+
+
+10.2 Available arguments
+
+10.2.1 Master parity checking
+ mpar:y enabled
+ mpar:n disabled
+
+10.2.2 Scsi parity checking
+ spar:y enabled
+ spar:n disabled
+
+10.2.3 Scsi disconnections
+ disc:y enabled
+ disc:n disabled
+
+10.2.4 Special features
+ Only apply to 810A, 825A, 860, 875 and 895 controllers.
+ Have no effect with other ones.
+ specf:y (or 1) enabled
+ specf:n (or 0) disabled
+ specf:3 enabled except Memory Write And Invalidate
+ The default driver setup is 'specf:3'. As a consequence, option 'specf:y'
+ must be specified in the boot setup command to enable Memory Write And
+ Invalidate.
+
+10.2.5 Ultra SCSI support
+ Only apply to 860, 875, 895, 895a, 896, 1010 and 1010_66 controllers.
+ Have no effect with other ones.
+ ultra:n All ultra speeds enabled
+ ultra:2 Ultra2 enabled
+ ultra:1 Ultra enabled
+ ultra:0 Ultra speeds disabled
+
+10.2.6 Default number of tagged commands
+ tags:0 (or tags:1 ) tagged command queuing disabled
+ tags:#tags (#tags > 1) tagged command queuing enabled
+ #tags will be truncated to the max queued commands configuration parameter.
+ This option also allows to specify a command queue depth for each device
+ that support tagged command queueing.
+ Example:
+ ncr53c8xx=tags:10/t2t3q16-t5q24/t1u2q32
+ will set devices queue depth as follow:
+ - controller #0 target #2 and target #3 -> 16 commands,
+ - controller #0 target #5 -> 24 commands,
+ - controller #1 target #1 logical unit #2 -> 32 commands,
+ - all other logical units (all targets, all controllers) -> 10 commands.
+
+10.2.7 Default synchronous period factor
+ sync:255 disabled (asynchronous transfer mode)
+ sync:#factor
+ #factor = 10 Ultra-2 SCSI 40 Mega-transfers / second
+ #factor = 11 Ultra-2 SCSI 33 Mega-transfers / second
+ #factor < 25 Ultra SCSI 20 Mega-transfers / second
+ #factor < 50 Fast SCSI-2
+
+ In all cases, the driver will use the minimum transfer period supported by
+ controllers according to NCR53C8XX chip type.
+
+10.2.8 Negotiate synchronous with all devices
+ (force sync nego)
+ fsn:y enabled
+ fsn:n disabled
+
+10.2.9 Verbosity level
+ verb:0 minimal
+ verb:1 normal
+ verb:2 too much
+
+10.2.10 Debug mode
+ debug:0 clear debug flags
+ debug:#x set debug flags
+ #x is an integer value combining the following power-of-2 values:
+ DEBUG_ALLOC 0x1
+ DEBUG_PHASE 0x2
+ DEBUG_POLL 0x4
+ DEBUG_QUEUE 0x8
+ DEBUG_RESULT 0x10
+ DEBUG_SCATTER 0x20
+ DEBUG_SCRIPT 0x40
+ DEBUG_TINY 0x80
+ DEBUG_TIMING 0x100
+ DEBUG_NEGO 0x200
+ DEBUG_TAGS 0x400
+ DEBUG_FREEZE 0x800
+ DEBUG_RESTART 0x1000
+
+ You can play safely with DEBUG_NEGO. However, some of these flags may
+ generate bunches of syslog messages.
+
+10.2.11 Burst max
+ burst:0 burst disabled
+ burst:255 get burst length from initial IO register settings.
+ burst:#x burst enabled (1<<#x burst transfers max)
+ #x is an integer value which is log base 2 of the burst transfers max.
+ The NCR53C875 and NCR53C825A support up to 128 burst transfers (#x = 7).
+ Other chips only support up to 16 (#x = 4).
+ This is a maximum value. The driver set the burst length according to chip
+ and revision ids. By default the driver uses the maximum value supported
+ by the chip.
+
+10.2.12 LED support
+ led:1 enable LED support
+ led:0 disable LED support
+ Donnot enable LED support if your scsi board does not use SDMS BIOS.
+ (See 'Configuration parameters')
+
+10.2.13 Max wide
+ wide:1 wide scsi enabled
+ wide:0 wide scsi disabled
+ Some scsi boards use a 875 (ultra wide) and only supply narrow connectors.
+ If you have connected a wide device with a 50 pins to 68 pins cable
+ converter, any accepted wide negotiation will break further data transfers.
+ In such a case, using "wide:0" in the bootup command will be helpful.
+
+10.2.14 Differential mode
+ diff:0 never set up diff mode
+ diff:1 set up diff mode if BIOS set it
+ diff:2 always set up diff mode
+ diff:3 set diff mode if GPIO3 is not set
+
+10.2.15 IRQ mode
+ irqm:0 always open drain
+ irqm:1 same as initial settings (assumed BIOS settings)
+ irqm:2 always totem pole
+ irqm:0x10 driver will not use IRQF_SHARED flag when requesting irq
+ irqm:0x20 driver will not use IRQF_DISABLED flag when requesting irq
+
+ (Bits 0x10 and 0x20 can be combined with hardware irq mode option)
+
+10.2.16 Reverse probe
+ revprob:n probe chip ids from the PCI configuration in this order:
+ 810, 815, 820, 860, 875, 885, 895, 896
+ revprob:y probe chip ids in the reverse order.
+
+10.2.17 Fix up PCI configuration space
+ pcifix:<option bits>
+
+ Available option bits:
+ 0x0: No attempt to fix PCI configuration space registers values.
+ 0x1: Set PCI cache-line size register if not set.
+ 0x2: Set write and invalidate bit in PCI command register.
+ 0x4: Increase if necessary PCI latency timer according to burst max.
+
+ Use 'pcifix:7' in order to allow the driver to fix up all PCI features.
+
+10.2.18 Serial NVRAM
+ nvram:n do not look for serial NVRAM
+ nvram:y test controllers for onboard serial NVRAM
+ (alternate binary form)
+ mvram=<bits options>
+ 0x01 look for NVRAM (equivalent to nvram=y)
+ 0x02 ignore NVRAM "Synchronous negotiation" parameters for all devices
+ 0x04 ignore NVRAM "Wide negotiation" parameter for all devices
+ 0x08 ignore NVRAM "Scan at boot time" parameter for all devices
+ 0x80 also attach controllers set to OFF in the NVRAM (sym53c8xx only)
+
+10.2.19 Check SCSI BUS
+ buschk:<option bits>
+
+ Available option bits:
+ 0x0: No check.
+ 0x1: Check and do not attach the controller on error.
+ 0x2: Check and just warn on error.
+ 0x4: Disable SCSI bus integrity checking.
+
+10.2.20 Exclude a host from being attached
+ excl=<io_address>
+
+ Prevent host at a given io address from being attached.
+ For example 'ncr53c8xx=excl:0xb400,excl:0xc000' indicate to the
+ ncr53c8xx driver not to attach hosts at address 0xb400 and 0xc000.
+
+10.2.21 Suggest a default SCSI id for hosts
+ hostid:255 no id suggested.
+ hostid:#x (0 < x < 7) x suggested for hosts SCSI id.
+
+ If a host SCSI id is available from the NVRAM, the driver will ignore
+ any value suggested as boot option. Otherwise, if a suggested value
+ different from 255 has been supplied, it will use it. Otherwise, it will
+ try to deduce the value previously set in the hardware and use value
+ 7 if the hardware value is zero.
+
+10.2.22 Enable use of IMMEDIATE ARBITRATION
+ (only supported by the sym53c8xx driver. See 10.7 for more details)
+ iarb:0 do not use this feature.
+ iarb:#x use this feature according to bit fields as follow:
+
+ bit 0 (1) : enable IARB each time the initiator has been reselected
+ when it arbitrated for the SCSI BUS.
+ (#x >> 4) : maximum number of successive settings of IARB if the initiator
+ win arbitration and it has other commands to send to a device.
+
+Boot fail safe
+ safe:y load the following assumed fail safe initial setup
+
+ master parity disabled mpar:n
+ scsi parity enabled spar:y
+ disconnections not allowed disc:n
+ special features disabled specf:n
+ ultra scsi disabled ultra:n
+ force sync negotiation disabled fsn:n
+ reverse probe disabled revprob:n
+ PCI fix up disabled pcifix:0
+ serial NVRAM enabled nvram:y
+ verbosity level 2 verb:2
+ tagged command queuing disabled tags:0
+ synchronous negotiation disabled sync:255
+ debug flags none debug:0
+ burst length from BIOS settings burst:255
+ LED support disabled led:0
+ wide support disabled wide:0
+ settle time 10 seconds settle:10
+ differential support from BIOS settings diff:1
+ irq mode from BIOS settings irqm:1
+ SCSI BUS check do not attach on error buschk:1
+ immediate arbitration disabled iarb:0
+
+10.3 Advised boot setup commands
+
+If the driver has been configured with default options, the equivalent
+boot setup is:
+
+ ncr53c8xx=mpar:y,spar:y,disc:y,specf:3,fsn:n,ultra:2,fsn:n,revprob:n,verb:1\
+ tags:0,sync:50,debug:0,burst:7,led:0,wide:1,settle:2,diff:0,irqm:0
+
+For an installation diskette or a safe but not fast system,
+boot setup can be:
+
+ ncr53c8xx=safe:y,mpar:y,disc:y
+ ncr53c8xx=safe:y,disc:y
+ ncr53c8xx=safe:y,mpar:y
+ ncr53c8xx=safe:y
+
+My personal system works flawlessly with the following equivalent setup:
+
+ ncr53c8xx=mpar:y,spar:y,disc:y,specf:1,fsn:n,ultra:2,fsn:n,revprob:n,verb:1\
+ tags:32,sync:12,debug:0,burst:7,led:1,wide:1,settle:2,diff:0,irqm:0
+
+The driver prints its actual setup when verbosity level is 2. You can try
+"ncr53c8xx=verb:2" to get the "static" setup of the driver, or add "verb:2"
+to your boot setup command in order to check the actual setup the driver is
+using.
+
+10.4 PCI configuration fix-up boot option
+
+pcifix:<option bits>
+
+Available option bits:
+ 0x1: Set PCI cache-line size register if not set.
+ 0x2: Set write and invalidate bit in PCI command register.
+
+Use 'pcifix:3' in order to allow the driver to fix both PCI features.
+
+These options only apply to new SYMBIOS chips 810A, 825A, 860, 875
+and 895 and are only supported for Pentium and 486 class processors.
+Recent SYMBIOS 53C8XX scsi processors are able to use PCI read multiple
+and PCI write and invalidate commands. These features require the
+cache line size register to be properly set in the PCI configuration
+space of the chips. On the other hand, chips will use PCI write and
+invalidate commands only if the corresponding bit is set to 1 in the
+PCI command register.
+
+Not all PCI bioses set the PCI cache line register and the PCI write and
+invalidate bit in the PCI configuration space of 53C8XX chips.
+Optimized PCI accesses may be broken for some PCI/memory controllers or
+make problems with some PCI boards.
+
+This fix-up worked flawlessly on my previous system.
+(MB Triton HX / 53C875 / 53C810A)
+I use these options at my own risks as you will do if you decide to
+use them too.
+
+
+10.5 Serial NVRAM support boot option
+
+nvram:n do not look for serial NVRAM
+nvram:y test controllers for onboard serial NVRAM
+
+This option can also been entered as an hexadecimal value that allows
+to control what information the driver will get from the NVRAM and what
+information it will ignore.
+For details see '17. Serial NVRAM support'.
+
+When this option is enabled, the driver tries to detect all boards using
+a Serial NVRAM. This memory is used to hold user set up parameters.
+
+The parameters the driver is able to get from the NVRAM depend on the
+data format used, as follow:
+
+ Tekram format Symbios format
+General and host parameters
+ Boot order N Y
+ Host SCSI ID Y Y
+ SCSI parity checking Y Y
+ Verbose boot messages N Y
+SCSI devices parameters
+ Synchronous transfer speed Y Y
+ Wide 16 / Narrow Y Y
+ Tagged Command Queuing enabled Y Y
+ Disconnections enabled Y Y
+ Scan at boot time N Y
+
+In order to speed up the system boot, for each device configured without
+the "scan at boot time" option, the driver forces an error on the
+first TEST UNIT READY command received for this device.
+
+Some SDMS BIOS revisions seem to be unable to boot cleanly with very fast
+hard disks. In such a situation you cannot configure the NVRAM with
+optimized parameters value.
+
+The 'nvram' boot option can be entered in hexadecimal form in order
+to ignore some options configured in the NVRAM, as follow:
+
+mvram=<bits options>
+ 0x01 look for NVRAM (equivalent to nvram=y)
+ 0x02 ignore NVRAM "Synchronous negotiation" parameters for all devices
+ 0x04 ignore NVRAM "Wide negotiation" parameter for all devices
+ 0x08 ignore NVRAM "Scan at boot time" parameter for all devices
+ 0x80 also attach controllers set to OFF in the NVRAM (sym53c8xx only)
+
+Option 0x80 is only supported by the sym53c8xx driver and is disabled by
+default. Result is that, by default (option not set), the sym53c8xx driver
+will not attach controllers set to OFF in the NVRAM.
+
+The ncr53c8xx always tries to attach all the controllers. Option 0x80 has
+not been added to the ncr53c8xx driver, since it has been reported to
+confuse users who use this driver since a long time. If you desire a
+controller not to be attached by the ncr53c8xx driver at Linux boot, you
+must use the 'excl' driver boot option.
+
+10.6 SCSI BUS checking boot option.
+
+When this option is set to a non-zero value, the driver checks SCSI lines
+logic state, 100 micro-seconds after having asserted the SCSI RESET line.
+The driver just reads SCSI lines and checks all lines read FALSE except RESET.
+Since SCSI devices shall release the BUS at most 800 nano-seconds after SCSI
+RESET has been asserted, any signal to TRUE may indicate a SCSI BUS problem.
+Unfortunately, the following common SCSI BUS problems are not detected:
+- Only 1 terminator installed.
+- Misplaced terminators.
+- Bad quality terminators.
+On the other hand, either bad cabling, broken devices, not conformant
+devices, ... may cause a SCSI signal to be wrong when te driver reads it.
+
+10.7 IMMEDIATE ARBITRATION boot option
+
+This option is only supported by the SYM53C8XX driver (not by the NCR53C8XX).
+
+SYMBIOS 53C8XX chips are able to arbitrate for the SCSI BUS as soon as they
+have detected an expected disconnection (BUS FREE PHASE). For this process
+to be started, bit 1 of SCNTL1 IO register must be set when the chip is
+connected to the SCSI BUS.
+
+When this feature has been enabled for the current connection, the chip has
+every chance to win arbitration if only devices with lower priority are
+competing for the SCSI BUS. By the way, when the chip is using SCSI id 7,
+then it will for sure win the next SCSI BUS arbitration.
+
+Since, there is no way to know what devices are trying to arbitrate for the
+BUS, using this feature can be extremely unfair. So, you are not advised
+to enable it, or at most enable this feature for the case the chip lost
+the previous arbitration (boot option 'iarb:1').
+
+This feature has the following advantages:
+
+a) Allow the initiator with ID 7 to win arbitration when it wants so.
+b) Overlap at least 4 micro-seconds of arbitration time with the execution
+ of SCRIPTS that deal with the end of the current connection and that
+ starts the next job.
+
+Hmmm... But (a) may just prevent other devices from reselecting the initiator,
+and delay data transfers or status/completions, and (b) may just waste
+SCSI BUS bandwidth if the SCRIPTS execution lasts more than 4 micro-seconds.
+
+The use of IARB needs the SCSI_NCR_IARB_SUPPORT option to have been defined
+at compile time and the 'iarb' boot option to have been set to a non zero
+value at boot time. It is not that useful for real work, but can be used
+to stress SCSI devices or for some applications that can gain advantage of
+it. By the way, if you experience badnesses like 'unexpected disconnections',
+'bad reselections', etc... when using IARB on heavy IO load, you should not
+be surprised, because force-feeding anything and blocking its arse at the
+same time cannot work for a long time. :-))
+
+
+11. Some constants and flags of the ncr53c8xx.h header file
+
+Some of these are defined from the configuration parameters. To
+change other "defines", you must edit the header file. Do that only
+if you know what you are doing.
+
+SCSI_NCR_SETUP_SPECIAL_FEATURES (default: defined)
+ If defined, the driver will enable some special features according
+ to chip and revision id.
+ For 810A, 860, 825A, 875 and 895 scsi chips, this option enables
+ support of features that reduce load of PCI bus and memory accesses
+ during scsi transfer processing: burst op-code fetch, read multiple,
+ read line, prefetch, cache line, write and invalidate,
+ burst 128 (875 only), large dma fifo (875 only), offset 16 (875 only).
+ Can be changed by the following boot setup command:
+ ncr53c8xx=specf:n
+
+SCSI_NCR_IOMAPPED (default: not defined)
+ If defined, normal I/O is forced.
+
+SCSI_NCR_SHARE_IRQ (default: defined)
+ If defined, request shared IRQ.
+
+SCSI_NCR_MAX_TAGS (default: 8)
+ Maximum number of simultaneous tagged commands to a device.
+ Can be changed by "settags <target> <maxtags>"
+
+SCSI_NCR_SETUP_DEFAULT_SYNC (default: 50)
+ Transfer period factor the driver will use at boot time for synchronous
+ negotiation. 0 means asynchronous.
+ Can be changed by "setsync <target> <period factor>"
+
+SCSI_NCR_SETUP_DEFAULT_TAGS (default: 8)
+ Default number of simultaneous tagged commands to a device.
+ < 1 means tagged command queuing disabled at start-up.
+
+SCSI_NCR_ALWAYS_SIMPLE_TAG (default: defined)
+ Use SIMPLE TAG for read and write commands.
+ Can be changed by "setorder <ordered|simple|default>"
+
+SCSI_NCR_SETUP_DISCONNECTION (default: defined)
+ If defined, targets are allowed to disconnect.
+
+SCSI_NCR_SETUP_FORCE_SYNC_NEGO (default: not defined)
+ If defined, synchronous negotiation is tried for all SCSI-2 devices.
+ Can be changed by "setsync <target> <period>"
+
+SCSI_NCR_SETUP_MASTER_PARITY (default: defined)
+ If defined, master parity checking is enabled.
+
+SCSI_NCR_SETUP_MASTER_PARITY (default: defined)
+ If defined, SCSI parity checking is enabled.
+
+SCSI_NCR_PROFILE_SUPPORT (default: not defined)
+ If defined, profiling information is gathered.
+
+SCSI_NCR_MAX_SCATTER (default: 128)
+ Scatter list size of the driver ccb.
+
+SCSI_NCR_MAX_TARGET (default: 16)
+ Max number of targets per host.
+
+SCSI_NCR_MAX_HOST (default: 2)
+ Max number of host controllers.
+
+SCSI_NCR_SETTLE_TIME (default: 2)
+ Number of seconds the driver will wait after reset.
+
+SCSI_NCR_TIMEOUT_ALERT (default: 3)
+ If a pending command will time out after this amount of seconds,
+ an ordered tag is used for the next command.
+ Avoids timeouts for unordered tagged commands.
+
+SCSI_NCR_CAN_QUEUE (default: 7*SCSI_NCR_MAX_TAGS)
+ Max number of commands that can be queued to a host.
+
+SCSI_NCR_CMD_PER_LUN (default: SCSI_NCR_MAX_TAGS)
+ Max number of commands queued to a host for a device.
+
+SCSI_NCR_SG_TABLESIZE (default: SCSI_NCR_MAX_SCATTER-1)
+ Max size of the Linux scatter/gather list.
+
+SCSI_NCR_MAX_LUN (default: 8)
+ Max number of LUNs per target.
+
+
+12. Installation
+
+This driver is part of the linux kernel distribution.
+Driver files are located in the sub-directory "drivers/scsi" of the
+kernel source tree.
+
+Driver files:
+
+ README.ncr53c8xx : this file
+ ChangeLog.ncr53c8xx : change log
+ ncr53c8xx.h : definitions
+ ncr53c8xx.c : the driver code
+
+New driver versions are made available separately in order to allow testing
+changes and new features prior to including them into the linux kernel
+distribution. The following URL provides information on latest available
+patches:
+
+ ftp://ftp.tux.org/pub/people/gerard-roudier/README
+
+
+13. Architecture dependent features.
+
+<Not yet written>
+
+
+14. Known problems
+
+14.1 Tagged commands with Iomega Jaz device
+
+I have not tried this device, however it has been reported to me the
+following: This device is capable of Tagged command queuing. However
+while spinning up, it rejects Tagged commands. This behaviour is
+conforms to 6.8.2 of SCSI-2 specifications. The current behaviour of
+the driver in that situation is not satisfying. So do not enable
+Tagged command queuing for devices that are able to spin down. The
+other problem that may appear is timeouts. The only way to avoid
+timeouts seems to edit linux/drivers/scsi/sd.c and to increase the
+current timeout values.
+
+14.2 Device names change when another controller is added.
+
+When you add a new NCR53C8XX chip based controller to a system that already
+has one or more controllers of this family, it may happen that the order
+the driver registers them to the kernel causes problems due to device
+name changes.
+When at least one controller uses NvRAM, SDMS BIOS version 4 allows you to
+define the order the BIOS will scan the scsi boards. The driver attaches
+controllers according to BIOS information if NvRAM detect option is set.
+
+If your controllers do not have NvRAM, you can:
+
+- Ask the driver to probe chip ids in reverse order from the boot command
+ line: ncr53c8xx=revprob:y
+- Make appropriate changes in the fstab.
+- Use the 'scsidev' tool from Eric Youngdale.
+
+14.3 Using only 8 bit devices with a WIDE SCSI controller.
+
+When only 8 bit NARROW devices are connected to a 16 bit WIDE SCSI controller,
+you must ensure that lines of the wide part of the SCSI BUS are pulled-up.
+This can be achieved by ENABLING the WIDE TERMINATOR portion of the SCSI
+controller card.
+The TYAN 1365 documentation revision 1.2 is not correct about such settings.
+(page 10, figure 3.3).
+
+14.4 Possible data corruption during a Memory Write and Invalidate
+
+This problem is described in SYMBIOS DEL 397, Part Number 69-039241, ITEM 4.
+
+In some complex situations, 53C875 chips revision <= 3 may start a PCI
+Write and Invalidate Command at a not cache-line-aligned 4 DWORDS boundary.
+This is only possible when Cache Line Size is 8 DWORDS or greater.
+Pentium systems use a 8 DWORDS cache line size and so are concerned by
+this chip bug, unlike i486 systems that use a 4 DWORDS cache line size.
+
+When this situation occurs, the chip may complete the Write and Invalidate
+command after having only filled part of the last cache line involved in
+the transfer, leaving to data corruption the remainder of this cache line.
+
+Not using Write And Invalidate obviously gets rid of this chip bug, and so
+it is now the default setting of the driver.
+However, for people like me who want to enable this feature, I have added
+part of a work-around suggested by SYMBIOS. This work-around resets the
+addressing logic when the DATA IN phase is entered and so prevents the bug
+from being triggered for the first SCSI MOVE of the phase. This work-around
+should be enough according to the following:
+
+The only driver internal data structure that is greater than 8 DWORDS and
+that is moved by the SCRIPTS processor is the 'CCB header' that contains
+the context of the SCSI transfer. This data structure is aligned on 8 DWORDS
+boundary (Pentium Cache Line Size), and so is immune to this chip bug, at
+least on Pentium systems.
+But the conditions of this bug can be met when a SCSI read command is
+performed using a buffer that is 4 DWORDS but not cache-line aligned.
+This cannot happen under Linux when scatter/gather lists are used since
+they only refer to system buffers that are well aligned. So, a work around
+may only be needed under Linux when a scatter/gather list is not used and
+when the SCSI DATA IN phase is reentered after a phase mismatch.
+
+14.5 IRQ sharing problems
+
+When an IRQ is shared by devices that are handled by different drivers, it
+may happen that one driver complains about the request of the IRQ having
+failed. Inder Linux-2.0, this may be due to one driver having requested the
+IRQ using the IRQF_DISABLED flag but some other having requested the same IRQ
+without this flag. Under both Linux-2.0 and linux-2.2, this may be caused by
+one driver not having requested the IRQ with the IRQF_SHARED flag.
+
+By default, the ncr53c8xx and sym53c8xx drivers request IRQs with both the
+IRQF_DISABLED and the IRQF_SHARED flag under Linux-2.0 and with only the IRQF_SHARED
+flag under Linux-2.2.
+
+Under Linux-2.0, you can disable use of IRQF_DISABLED flag from the boot
+command line by using the following option:
+
+ ncr53c8xx=irqm:0x20 (for the generic ncr53c8xx driver)
+ sym53c8xx=irqm:0x20 (for the sym53c8xx driver)
+
+If this does not fix the problem, then you may want to check how all other
+drivers are requesting the IRQ and report the problem. Note that if at least
+a single driver does not request the IRQ with the IRQF_SHARED flag (share IRQ),
+then the request of the IRQ obviously will not succeed for all the drivers.
+
+15. SCSI problem troubleshooting
+
+15.1 Problem tracking
+
+Most SCSI problems are due to a non conformant SCSI bus or to buggy
+devices. If unfortunately you have SCSI problems, you can check the
+following things:
+
+- SCSI bus cables
+- terminations at both end of the SCSI chain
+- linux syslog messages (some of them may help you)
+
+If you do not find the source of problems, you can configure the
+driver with no features enabled.
+
+- only asynchronous data transfers
+- tagged commands disabled
+- disconnections not allowed
+
+Now, if your SCSI bus is ok, your system have every chance to work
+with this safe configuration but performances will not be optimal.
+
+If it still fails, then you can send your problem description to
+appropriate mailing lists or news-groups. Send me a copy in order to
+be sure I will receive it. Obviously, a bug in the driver code is
+possible.
+
+ My email address: Gerard Roudier <groudier@free.fr>
+
+Allowing disconnections is important if you use several devices on
+your SCSI bus but often causes problems with buggy devices.
+Synchronous data transfers increases throughput of fast devices like
+hard disks. Good SCSI hard disks with a large cache gain advantage of
+tagged commands queuing.
+
+Try to enable one feature at a time with control commands. For example:
+
+- echo "setsync all 25" >/proc/scsi/ncr53c8xx/0
+ Will enable fast synchronous data transfer negotiation for all targets.
+
+- echo "setflag 3" >/proc/scsi/ncr53c8xx/0
+ Will reset flags (no_disc) for target 3, and so will allow it to disconnect
+ the SCSI Bus.
+
+- echo "settags 3 8" >/proc/scsi/ncr53c8xx/0
+ Will enable tagged command queuing for target 3 if that device supports it.
+
+Once you have found the device and the feature that cause problems, just
+disable that feature for that device.
+
+15.2 Understanding hardware error reports
+
+When the driver detects an unexpected error condition, it may display a
+message of the following pattern.
+
+sym53c876-0:1: ERROR (0:48) (1-21-65) (f/95) @ (script 7c0:19000000).
+sym53c876-0: script cmd = 19000000
+sym53c876-0: regdump: da 10 80 95 47 0f 01 07 75 01 81 21 80 01 09 00.
+
+Some fields in such a message may help you understand the cause of the
+problem, as follows:
+
+sym53c876-0:1: ERROR (0:48) (1-21-65) (f/95) @ (script 7c0:19000000).
+............A.........B.C....D.E..F....G.H.......I.....J...K.......
+
+Field A : target number.
+ SCSI ID of the device the controller was talking with at the moment the
+ error occurs.
+
+Field B : DSTAT io register (DMA STATUS)
+ Bit 0x40 : MDPE Master Data Parity Error
+ Data parity error detected on the PCI BUS.
+ Bit 0x20 : BF Bus Fault
+ PCI bus fault condition detected
+ Bit 0x01 : IID Illegal Instruction Detected
+ Set by the chip when it detects an Illegal Instruction format
+ on some condition that makes an instruction illegal.
+ Bit 0x80 : DFE Dma Fifo Empty
+ Pure status bit that does not indicate an error.
+ If the reported DSTAT value contains a combination of MDPE (0x40),
+ BF (0x20), then the cause may be likely due to a PCI BUS problem.
+
+Field C : SIST io register (SCSI Interrupt Status)
+ Bit 0x08 : SGE SCSI GROSS ERROR
+ Indicates that the chip detected a severe error condition
+ on the SCSI BUS that prevents the SCSI protocol from functioning
+ properly.
+ Bit 0x04 : UDC Unexpected Disconnection
+ Indicates that the device released the SCSI BUS when the chip
+ was not expecting this to happen. A device may behave so to
+ indicate the SCSI initiator that an error condition not reportable using the SCSI protocol has occurred.
+ Bit 0x02 : RST SCSI BUS Reset
+ Generally SCSI targets do not reset the SCSI BUS, although any
+ device on the BUS can reset it at any time.
+ Bit 0x01 : PAR Parity
+ SCSI parity error detected.
+ On a faulty SCSI BUS, any error condition among SGE (0x08), UDC (0x04) and
+ PAR (0x01) may be detected by the chip. If your SCSI system sometimes
+ encounters such error conditions, especially SCSI GROSS ERROR, then a SCSI
+ BUS problem is likely the cause of these errors.
+
+For fields D,E,F,G and H, you may look into the sym53c8xx_defs.h file
+that contains some minimal comments on IO register bits.
+Field D : SOCL Scsi Output Control Latch
+ This register reflects the state of the SCSI control lines the
+ chip want to drive or compare against.
+Field E : SBCL Scsi Bus Control Lines
+ Actual value of control lines on the SCSI BUS.
+Field F : SBDL Scsi Bus Data Lines
+ Actual value of data lines on the SCSI BUS.
+Field G : SXFER SCSI Transfer
+ Contains the setting of the Synchronous Period for output and
+ the current Synchronous offset (offset 0 means asynchronous).
+Field H : SCNTL3 Scsi Control Register 3
+ Contains the setting of timing values for both asynchronous and
+ synchronous data transfers.
+
+Understanding Fields I, J, K and dumps requires to have good knowledge of
+SCSI standards, chip cores functionnals and internal driver data structures.
+You are not required to decode and understand them, unless you want to help
+maintain the driver code.
+
+16. Synchronous transfer negotiation tables
+
+Tables below have been created by calling the routine the driver uses
+for synchronisation negotiation timing calculation and chip setting.
+The first table corresponds to Ultra chips 53875 and 53C860 with 80 MHz
+clock and 5 clock divisors.
+The second one has been calculated by setting the scsi clock to 40 Mhz
+and using 4 clock divisors and so applies to all NCR53C8XX chips in fast
+SCSI-2 mode.
+
+Periods are in nano-seconds and speeds are in Mega-transfers per second.
+1 Mega-transfers/second means 1 MB/s with 8 bits SCSI and 2 MB/s with
+Wide16 SCSI.
+
+16.1 Synchronous timings for 53C895, 53C875 and 53C860 SCSI controllers
+
+ ----------------------------------------------
+ Negotiated NCR settings
+ Factor Period Speed Period Speed
+ ------ ------ ------ ------ ------
+ 10 25 40.000 25 40.000 (53C895 only)
+ 11 30.2 33.112 31.25 32.000 (53C895 only)
+ 12 50 20.000 50 20.000
+ 13 52 19.230 62 16.000
+ 14 56 17.857 62 16.000
+ 15 60 16.666 62 16.000
+ 16 64 15.625 75 13.333
+ 17 68 14.705 75 13.333
+ 18 72 13.888 75 13.333
+ 19 76 13.157 87 11.428
+ 20 80 12.500 87 11.428
+ 21 84 11.904 87 11.428
+ 22 88 11.363 93 10.666
+ 23 92 10.869 93 10.666
+ 24 96 10.416 100 10.000
+ 25 100 10.000 100 10.000
+ 26 104 9.615 112 8.888
+ 27 108 9.259 112 8.888
+ 28 112 8.928 112 8.888
+ 29 116 8.620 125 8.000
+ 30 120 8.333 125 8.000
+ 31 124 8.064 125 8.000
+ 32 128 7.812 131 7.619
+ 33 132 7.575 150 6.666
+ 34 136 7.352 150 6.666
+ 35 140 7.142 150 6.666
+ 36 144 6.944 150 6.666
+ 37 148 6.756 150 6.666
+ 38 152 6.578 175 5.714
+ 39 156 6.410 175 5.714
+ 40 160 6.250 175 5.714
+ 41 164 6.097 175 5.714
+ 42 168 5.952 175 5.714
+ 43 172 5.813 175 5.714
+ 44 176 5.681 187 5.333
+ 45 180 5.555 187 5.333
+ 46 184 5.434 187 5.333
+ 47 188 5.319 200 5.000
+ 48 192 5.208 200 5.000
+ 49 196 5.102 200 5.000
+
+
+16.2 Synchronous timings for fast SCSI-2 53C8XX controllers
+
+ ----------------------------------------------
+ Negotiated NCR settings
+ Factor Period Speed Period Speed
+ ------ ------ ------ ------ ------
+ 25 100 10.000 100 10.000
+ 26 104 9.615 125 8.000
+ 27 108 9.259 125 8.000
+ 28 112 8.928 125 8.000
+ 29 116 8.620 125 8.000
+ 30 120 8.333 125 8.000
+ 31 124 8.064 125 8.000
+ 32 128 7.812 131 7.619
+ 33 132 7.575 150 6.666
+ 34 136 7.352 150 6.666
+ 35 140 7.142 150 6.666
+ 36 144 6.944 150 6.666
+ 37 148 6.756 150 6.666
+ 38 152 6.578 175 5.714
+ 39 156 6.410 175 5.714
+ 40 160 6.250 175 5.714
+ 41 164 6.097 175 5.714
+ 42 168 5.952 175 5.714
+ 43 172 5.813 175 5.714
+ 44 176 5.681 187 5.333
+ 45 180 5.555 187 5.333
+ 46 184 5.434 187 5.333
+ 47 188 5.319 200 5.000
+ 48 192 5.208 200 5.000
+ 49 196 5.102 200 5.000
+
+
+17. Serial NVRAM (added by Richard Waltham: dormouse@farsrobt.demon.co.uk)
+
+17.1 Features
+
+Enabling serial NVRAM support enables detection of the serial NVRAM included
+on Symbios and some Symbios compatible host adaptors, and Tekram boards. The
+serial NVRAM is used by Symbios and Tekram to hold set up parameters for the
+host adaptor and it's attached drives.
+
+The Symbios NVRAM also holds data on the boot order of host adaptors in a
+system with more than one host adaptor. This enables the order of scanning
+the cards for drives to be changed from the default used during host adaptor
+detection.
+
+This can be done to a limited extent at the moment using "reverse probe" but
+this only changes the order of detection of different types of cards. The
+NVRAM boot order settings can do this as well as change the order the same
+types of cards are scanned in, something "reverse probe" cannot do.
+
+Tekram boards using Symbios chips, DC390W/F/U, which have NVRAM are detected
+and this is used to distinguish between Symbios compatible and Tekram host
+adaptors. This is used to disable the Symbios compatible "diff" setting
+incorrectly set on Tekram boards if the CONFIG_SCSI_53C8XX_SYMBIOS_COMPAT
+configuration parameter is set enabling both Symbios and Tekram boards to be
+used together with the Symbios cards using all their features, including
+"diff" support. ("led pin" support for Symbios compatible cards can remain
+enabled when using Tekram cards. It does nothing useful for Tekram host
+adaptors but does not cause problems either.)
+
+
+17.2 Symbios NVRAM layout
+
+typical data at NVRAM address 0x100 (53c810a NVRAM)
+-----------------------------------------------------------
+00 00
+64 01
+8e 0b
+
+00 30 00 00 00 00 07 00 00 00 00 00 00 00 07 04 10 04 00 00
+
+04 00 0f 00 00 10 00 50 00 00 01 00 00 62
+04 00 03 00 00 10 00 58 00 00 01 00 00 63
+04 00 01 00 00 10 00 48 00 00 01 00 00 61
+00 00 00 00 00 00 00 00 00 00 00 00 00 00
+
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+
+fe fe
+00 00
+00 00
+-----------------------------------------------------------
+NVRAM layout details
+
+NVRAM Address 0x000-0x0ff not used
+ 0x100-0x26f initialised data
+ 0x270-0x7ff not used
+
+general layout
+
+ header - 6 bytes,
+ data - 356 bytes (checksum is byte sum of this data)
+ trailer - 6 bytes
+ ---
+ total 368 bytes
+
+data area layout
+
+ controller set up - 20 bytes
+ boot configuration - 56 bytes (4x14 bytes)
+ device set up - 128 bytes (16x8 bytes)
+ unused (spare?) - 152 bytes (19x8 bytes)
+ ---
+ total 356 bytes
+
+-----------------------------------------------------------
+header
+
+00 00 - ?? start marker
+64 01 - byte count (lsb/msb excludes header/trailer)
+8e 0b - checksum (lsb/msb excludes header/trailer)
+-----------------------------------------------------------
+controller set up
+
+00 30 00 00 00 00 07 00 00 00 00 00 00 00 07 04 10 04 00 00
+ | | | |
+ | | | -- host ID
+ | | |
+ | | --Removable Media Support
+ | | 0x00 = none
+ | | 0x01 = Bootable Device
+ | | 0x02 = All with Media
+ | |
+ | --flag bits 2
+ | 0x00000001= scan order hi->low
+ | (default 0x00 - scan low->hi)
+ --flag bits 1
+ 0x00000001 scam enable
+ 0x00000010 parity enable
+ 0x00000100 verbose boot msgs
+
+remaining bytes unknown - they do not appear to change in my
+current set up for any of the controllers.
+
+default set up is identical for 53c810a and 53c875 NVRAM
+(Removable Media added Symbios BIOS version 4.09)
+-----------------------------------------------------------
+boot configuration
+
+boot order set by order of the devices in this table
+
+04 00 0f 00 00 10 00 50 00 00 01 00 00 62 -- 1st controller
+04 00 03 00 00 10 00 58 00 00 01 00 00 63 2nd controller
+04 00 01 00 00 10 00 48 00 00 01 00 00 61 3rd controller
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 4th controller
+ | | | | | | | |
+ | | | | | | ---- PCI io port adr
+ | | | | | --0x01 init/scan at boot time
+ | | | | --PCI device/function number (0xdddddfff)
+ | | ----- ?? PCI vendor ID (lsb/msb)
+ ----PCI device ID (lsb/msb)
+
+?? use of this data is a guess but seems reasonable
+
+remaining bytes unknown - they do not appear to change in my
+current set up
+
+default set up is identical for 53c810a and 53c875 NVRAM
+-----------------------------------------------------------
+device set up (up to 16 devices - includes controller)
+
+0f 00 08 08 64 00 0a 00 - id 0
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00 - id 15
+ | | | | | |
+ | | | | ----timeout (lsb/msb)
+ | | | --synch period (0x?? 40 Mtrans/sec- fast 40) (probably 0x28)
+ | | | (0x30 20 Mtrans/sec- fast 20)
+ | | | (0x64 10 Mtrans/sec- fast )
+ | | | (0xc8 5 Mtrans/sec)
+ | | | (0x00 asynchronous)
+ | | -- ?? max sync offset (0x08 in NVRAM on 53c810a)
+ | | (0x10 in NVRAM on 53c875)
+ | --device bus width (0x08 narrow)
+ | (0x10 16 bit wide)
+ --flag bits
+ 0x00000001 - disconnect enabled
+ 0x00000010 - scan at boot time
+ 0x00000100 - scan luns
+ 0x00001000 - queue tags enabled
+
+remaining bytes unknown - they do not appear to change in my
+current set up
+
+?? use of this data is a guess but seems reasonable
+(but it could be max bus width)
+
+default set up for 53c810a NVRAM
+default set up for 53c875 NVRAM - bus width - 0x10
+ - sync offset ? - 0x10
+ - sync period - 0x30
+-----------------------------------------------------------
+?? spare device space (32 bit bus ??)
+
+00 00 00 00 00 00 00 00 (19x8bytes)
+.
+.
+00 00 00 00 00 00 00 00
+
+default set up is identical for 53c810a and 53c875 NVRAM
+-----------------------------------------------------------
+trailer
+
+fe fe - ? end marker ?
+00 00
+00 00
+
+default set up is identical for 53c810a and 53c875 NVRAM
+-----------------------------------------------------------
+
+
+
+17.3 Tekram NVRAM layout
+
+nvram 64x16 (1024 bit)
+
+Drive settings
+
+Drive ID 0-15 (addr 0x0yyyy0 = device setup, yyyy = ID)
+ (addr 0x0yyyy1 = 0x0000)
+
+ x x x x x x x x x x x x x x x x
+ | | | | | | | | |
+ | | | | | | | | ----- parity check 0 - off
+ | | | | | | | | 1 - on
+ | | | | | | | |
+ | | | | | | | ------- sync neg 0 - off
+ | | | | | | | 1 - on
+ | | | | | | |
+ | | | | | | --------- disconnect 0 - off
+ | | | | | | 1 - on
+ | | | | | |
+ | | | | | ----------- start cmd 0 - off
+ | | | | | 1 - on
+ | | | | |
+ | | | | -------------- tagged cmds 0 - off
+ | | | | 1 - on
+ | | | |
+ | | | ---------------- wide neg 0 - off
+ | | | 1 - on
+ | | |
+ --------------------------- sync rate 0 - 10.0 Mtrans/sec
+ 1 - 8.0
+ 2 - 6.6
+ 3 - 5.7
+ 4 - 5.0
+ 5 - 4.0
+ 6 - 3.0
+ 7 - 2.0
+ 7 - 2.0
+ 8 - 20.0
+ 9 - 16.7
+ a - 13.9
+ b - 11.9
+
+Global settings
+
+Host flags 0 (addr 0x100000, 32)
+
+ x x x x x x x x x x x x x x x x
+ | | | | | | | | | | | |
+ | | | | | | | | ----------- host ID 0x00 - 0x0f
+ | | | | | | | |
+ | | | | | | | ----------------------- support for 0 - off
+ | | | | | | | > 2 drives 1 - on
+ | | | | | | |
+ | | | | | | ------------------------- support drives 0 - off
+ | | | | | | > 1Gbytes 1 - on
+ | | | | | |
+ | | | | | --------------------------- bus reset on 0 - off
+ | | | | | power on 1 - on
+ | | | | |
+ | | | | ----------------------------- active neg 0 - off
+ | | | | 1 - on
+ | | | |
+ | | | -------------------------------- imm seek 0 - off
+ | | | 1 - on
+ | | |
+ | | ---------------------------------- scan luns 0 - off
+ | | 1 - on
+ | |
+ -------------------------------------- removable 0 - disable
+ as BIOS dev 1 - boot device
+ 2 - all
+
+Host flags 1 (addr 0x100001, 33)
+
+ x x x x x x x x x x x x x x x x
+ | | | | | |
+ | | | --------- boot delay 0 - 3 sec
+ | | | 1 - 5
+ | | | 2 - 10
+ | | | 3 - 20
+ | | | 4 - 30
+ | | | 5 - 60
+ | | | 6 - 120
+ | | |
+ --------------------------- max tag cmds 0 - 2
+ 1 - 4
+ 2 - 8
+ 3 - 16
+ 4 - 32
+
+Host flags 2 (addr 0x100010, 34)
+
+ x x x x x x x x x x x x x x x x
+ |
+ ----- F2/F6 enable 0 - off ???
+ 1 - on ???
+
+checksum (addr 0x111111)
+
+checksum = 0x1234 - (sum addr 0-63)
+
+----------------------------------------------------------------------------
+
+default nvram data:
+
+0x0037 0x0000 0x0037 0x0000 0x0037 0x0000 0x0037 0x0000
+0x0037 0x0000 0x0037 0x0000 0x0037 0x0000 0x0037 0x0000
+0x0037 0x0000 0x0037 0x0000 0x0037 0x0000 0x0037 0x0000
+0x0037 0x0000 0x0037 0x0000 0x0037 0x0000 0x0037 0x0000
+
+0x0f07 0x0400 0x0001 0x0000 0x0000 0x0000 0x0000 0x0000
+0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
+0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
+0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0xfbbc
+
+
+18. Support for Big Endian
+
+The PCI local bus has been primarily designed for x86 architecture.
+As a consequence, PCI devices generally expect DWORDS using little endian
+byte ordering.
+
+18.1 Big Endian CPU
+
+In order to support NCR chips on a Big Endian architecture the driver has to
+perform byte reordering each time it is needed. This feature has been
+added to the driver by Cort <cort@cs.nmt.edu> and is available in driver
+version 2.5 and later ones. For the moment Big Endian support has only
+been tested on Linux/PPC (PowerPC).
+
+18.2 NCR chip in Big Endian mode of operations
+
+It can be read in SYMBIOS documentation that some chips support a special
+Big Endian mode, on paper: 53C815, 53C825A, 53C875, 53C875N, 53C895.
+This mode of operations is not software-selectable, but needs pin named
+BigLit to be pulled-up. Using this mode, most of byte reorderings should
+be avoided when the driver is running on a Big Endian CPU.
+Driver version 2.5 is also, in theory, ready for this feature.
+
+===============================================================================
+End of NCR53C8XX driver README file
diff --git a/Documentation/scsi/osst.txt b/Documentation/scsi/osst.txt
new file mode 100644
index 0000000..f536907
--- /dev/null
+++ b/Documentation/scsi/osst.txt
@@ -0,0 +1,218 @@
+README file for the osst driver
+===============================
+(w) Kurt Garloff <garloff@suse.de> 12/2000
+
+This file describes the osst driver as of version 0.8.x/0.9.x, the released
+version of the osst driver.
+It is intended to help advanced users to understand the role of osst and to
+get them started using (and maybe debugging) it.
+It won't address issues like "How do I compile a kernel?" or "How do I load
+a module?", as these are too basic.
+Once the OnStream got merged into the official kernel, the distro makers
+will provide the OnStream support for those who are not familiar with
+hacking their kernels.
+
+
+Purpose
+-------
+The osst driver was developed, because the standard SCSI tape driver in
+Linux, st, does not support the OnStream SC-x0 SCSI tape. The st is not to
+blame for that, as the OnStream tape drives do not support the standard SCSI
+command set for Serial Access Storage Devices (SASDs), which basically
+corresponds to the QIC-157 spec.
+Nevertheless, the OnStream tapes are nice pieces of hardware and therefore
+the osst driver has been written to make these tape devs supported by Linux.
+The driver is free software. It's released under the GNU GPL and planned to
+be integrated into the mainstream kernel.
+
+
+Implementation
+--------------
+The osst is a new high-level SCSI driver, just like st, sr, sd and sg. It
+can be compiled into the kernel or loaded as a module.
+As it represents a new device, it got assigned a new device node: /dev/osstX
+are character devices with major no 206 and minor numbers like the /dev/stX
+devices. If those are not present, you may create them by calling
+Makedevs.sh as root (see below).
+The driver started being a copy of st and as such, the osst devices'
+behavior looks very much the same as st to the userspace applications.
+
+
+History
+-------
+In the first place, osst shared it's identity very much with st. That meant
+that it used the same kernel structures and the same device node as st.
+So you could only have either of them being present in the kernel. This has
+been fixed by registering an own device, now.
+st and osst can coexist, each only accessing the devices it can support by
+themselves.
+
+
+Installation
+------------
+osst got integrated into the linux kernel. Select it during kernel
+configuration as module or compile statically into the kernel.
+Compile your kernel and install the modules.
+
+Now, your osst driver is inside the kernel or available as a module,
+depending on your choice during kernel config. You may still need to create
+the device nodes by calling the Makedevs.sh script (see below) manually.
+
+To load your module, you may use the command
+modprobe osst
+as root. dmesg should show you, whether your OnStream tapes have been
+recognized.
+
+If you want to have the module autoloaded on access to /dev/osst, you may
+add something like
+alias char-major-206 osst
+to your /etc/modprobe.conf (before 2.6: modules.conf).
+
+You may find it convenient to create a symbolic link
+ln -s nosst0 /dev/tape
+to make programs assuming a default name of /dev/tape more convenient to
+use.
+
+The device nodes for osst have to be created. Use the Makedevs.sh script
+attached to this file.
+
+
+Using it
+--------
+You may use the OnStream tape driver with your standard backup software,
+which may be tar, cpio, amanda, arkeia, BRU, Lone Tar, ...
+by specifying /dev/(n)osst0 as the tape device to use or using the above
+symlink trick. The IOCTLs to control tape operation are also mostly
+supported and you may try the mt (or mt_st) program to jump between
+filemarks, eject the tape, ...
+
+There's one limitation: You need to use a block size of 32kB.
+
+(This limitation is worked on and will be fixed in version 0.8.8 of
+ this driver.)
+
+If you just want to get started with standard software, here is an example
+for creating and restoring a full backup:
+# Backup
+tar cvf - / --exclude /proc | buffer -s 32k -m 24M -B -t -o /dev/nosst0
+# Restore
+buffer -s 32k -m 8M -B -t -i /dev/osst0 | tar xvf - -C /
+
+The buffer command has been used to buffer the data before it goes to the
+tape (or the file system) in order to smooth out the data stream and prevent
+the tape from needing to stop and rewind. The OnStream does have an internal
+buffer and a variable speed which help this, but especially on writing, the
+buffering still proves useful in most cases. It also pads the data to
+guarantees the block size of 32k. (Otherwise you may pass the -b64 option to
+tar.)
+Expect something like 1.8MB/s for the SC-x0 drives and 0.9MB/s for the DI-30.
+The USB drive will give you about 0.7MB/s.
+On a fast machine, you may profit from software data compression (z flag for
+tar).
+
+
+USB and IDE
+-----------
+Via the SCSI emulation layers usb-storage and ide-scsi, you can also use the
+osst driver to drive the USB-30 and the DI-30 drives. (Unfortunately, there
+is no such layer for the parallel port, otherwise the DP-30 would work as
+well.) For the USB support, you need the latest 2.4.0-test kernels and the
+latest usb-storage driver from
+http://www.linux-usb.org/
+http://sourceforge.net/cvs/?group_id=3581
+
+Note that the ide-tape driver as of 1.16f uses a slightly outdated on-tape
+format and therefore is not completely interoperable with osst tapes.
+
+The ADR-x0 line is fully SCSI-2 compliant and is supported by st, not osst.
+The on-tape format is supposed to be compatible with the one used by osst.
+
+
+Feedback and updates
+--------------------
+The driver development is coordinated through a mailing list
+<osst@linux1.onstream.nl>
+a CVS repository and some web pages.
+The tester's pages which contain recent news and updated drivers to download
+can be found on
+http://linux1.onstream.nl/test/
+
+If you find any problems, please have a look at the tester's page in order
+to see whether the problem is already known and solved. Otherwise, please
+report it to the mailing list. Your feedback is welcome. (This holds also
+for reports of successful usage, of course.)
+In case of trouble, please do always provide the following info:
+* driver and kernel version used (see syslog)
+* driver messages (syslog)
+* SCSI config and OnStream Firmware (/proc/scsi/scsi)
+* description of error. Is it reproducible?
+* software and commands used
+
+You may subscribe to the mailing list, BTW, it's a majordomo list.
+
+
+Status
+------
+0.8.0 was the first widespread BETA release. Since then a lot of reports
+have been sent, but mostly reported success or only minor trouble.
+All the issues have been addressed.
+Check the web pages for more info about the current developments.
+0.9.x is the tree for the 2.3/2.4 kernel.
+
+
+Acknowledgments
+----------------
+The driver has been started by making a copy of Kai Makisara's st driver.
+Most of the development has been done by Willem Riede. The presence of the
+userspace program osg (onstreamsg) from Terry Hardie has been rather
+helpful. The same holds for Gadi Oxman's ide-tape support for the DI-30.
+I did add some patches to those drivers as well and coordinated things a
+little bit.
+Note that most of them did mostly spend their spare time for the creation of
+this driver.
+The people from OnStream, especially Jack Bombeeck did support this project
+and always tried to answer HW or FW related questions. Furthermore, he
+pushed the FW developers to do the right things.
+SuSE did support this project by allowing me to work on it during my working
+time for them and by integrating the driver into their distro.
+
+More people did help by sending useful comments. Sorry to those who have
+been forgotten. Thanks to all the GNU/FSF and Linux developers who made this
+platform such an interesting, nice and stable platform.
+Thanks go to those who tested the drivers and did send useful reports. Your
+help is needed!
+
+
+Makedevs.sh
+-----------
+#!/bin/sh
+# Script to create OnStream SC-x0 device nodes (major 206)
+# Usage: Makedevs.sh [nos [path to dev]]
+# $Id: README.osst.kernel,v 1.4 2000/12/20 14:13:15 garloff Exp $
+major=206
+nrs=4
+dir=/dev
+test -z "$1" || nrs=$1
+test -z "$2" || dir=$2
+declare -i nr
+nr=0
+test -d $dir || mkdir -p $dir
+while test $nr -lt $nrs; do
+ mknod $dir/osst$nr c $major $nr
+ chown 0.disk $dir/osst$nr; chmod 660 $dir/osst$nr;
+ mknod $dir/nosst$nr c $major $[nr+128]
+ chown 0.disk $dir/nosst$nr; chmod 660 $dir/nosst$nr;
+ mknod $dir/osst${nr}l c $major $[nr+32]
+ chown 0.disk $dir/osst${nr}l; chmod 660 $dir/osst${nr}l;
+ mknod $dir/nosst${nr}l c $major $[nr+160]
+ chown 0.disk $dir/nosst${nr}l; chmod 660 $dir/nosst${nr}l;
+ mknod $dir/osst${nr}m c $major $[nr+64]
+ chown 0.disk $dir/osst${nr}m; chmod 660 $dir/osst${nr}m;
+ mknod $dir/nosst${nr}m c $major $[nr+192]
+ chown 0.disk $dir/nosst${nr}m; chmod 660 $dir/nosst${nr}m;
+ mknod $dir/osst${nr}a c $major $[nr+96]
+ chown 0.disk $dir/osst${nr}a; chmod 660 $dir/osst${nr}a;
+ mknod $dir/nosst${nr}a c $major $[nr+224]
+ chown 0.disk $dir/nosst${nr}a; chmod 660 $dir/nosst${nr}a;
+ let nr+=1
+done
diff --git a/Documentation/scsi/ppa.txt b/Documentation/scsi/ppa.txt
new file mode 100644
index 0000000..067ac39
--- /dev/null
+++ b/Documentation/scsi/ppa.txt
@@ -0,0 +1,14 @@
+-------- Terse where to get ZIP Drive help info --------
+
+General Iomega ZIP drive page for Linux:
+http://www.torque.net/~campbell/
+
+Driver archive for old drivers:
+http://www.torque.net/~campbell/ppa/
+
+Linux Parport page (parallel port)
+http://www.torque.net/parport/
+
+Email list for Linux Parport
+linux-parport@torque.net
+
diff --git a/Documentation/scsi/qlogicfas.txt b/Documentation/scsi/qlogicfas.txt
new file mode 100644
index 0000000..c211d82
--- /dev/null
+++ b/Documentation/scsi/qlogicfas.txt
@@ -0,0 +1,78 @@
+
+This driver supports the Qlogic FASXXX family of chips. This driver
+only works with the ISA, VLB, and PCMCIA versions of the Qlogic
+FastSCSI! cards as well as any other card based on the FASXX chip
+(including the Control Concepts SCSI/IDE/SIO/PIO/FDC cards).
+
+This driver does NOT support the PCI version. Support for these PCI
+Qlogic boards:
+
+ * IQ-PCI
+ * IQ-PCI-10
+ * IQ-PCI-D
+
+is provided by the qla1280 driver.
+
+Nor does it support the PCI-Basic, which is supported by the
+'am53c974' driver.
+
+PCMCIA SUPPORT
+
+This currently only works if the card is enabled first from DOS. This
+means you will have to load your socket and card services, and
+QL41DOS.SYS and QL40ENBL.SYS. These are a minimum, but loading the
+rest of the modules won't interfere with the operation. The next
+thing to do is load the kernel without resetting the hardware, which
+can be a simple ctrl-alt-delete with a boot floppy, or by using
+loadlin with the kernel image accessible from DOS. If you are using
+the Linux PCMCIA driver, you will have to adjust it or otherwise stop
+it from configuring the card.
+
+I am working with the PCMCIA group to make it more flexible, but that
+may take a while.
+
+ALL CARDS
+
+The top of the qlogic.c file has a number of defines that controls
+configuration. As shipped, it provides a balance between speed and
+function. If there are any problems, try setting SLOW_CABLE to 1, and
+then try changing USE_IRQ and TURBO_PDMA to zero. If you are familiar
+with SCSI, there are other settings which can tune the bus.
+
+It may be a good idea to enable RESET_AT_START, especially if the
+devices may not have been just powered up, or if you are restarting
+after a crash, since they may be busy trying to complete the last
+command or something. It comes up faster if this is set to zero, and
+if you have reliable hardware and connections it may be more useful to
+not reset things.
+
+SOME TROUBLESHOOTING TIPS
+
+Make sure it works properly under DOS. You should also do an initial FDISK
+on a new drive if you want partitions.
+
+Don't enable all the speedups first. If anything is wrong, they will make
+any problem worse.
+
+IMPORTANT
+
+The best way to test if your cables, termination, etc. are good is to
+copy a very big file (e.g. a doublespace container file, or a very
+large executable or archive). It should be at least 5 megabytes, but
+you can do multiple tests on smaller files. Then do a COMP to verify
+that the file copied properly. (Turn off all caching when doing these
+tests, otherwise you will test your RAM and not the files). Then do
+10 COMPs, comparing the same file on the SCSI hard drive, i.e. "COMP
+realbig.doc realbig.doc". Then do it after the computer gets warm.
+
+I noticed my system which seems to work 100% would fail this test if
+the computer was left on for a few hours. It was worse with longer
+cables, and more devices on the SCSI bus. What seems to happen is
+that it gets a false ACK causing an extra byte to be inserted into the
+stream (and this is not detected). This can be caused by bad
+termination (the ACK can be reflected), or by noise when the chips
+work less well because of the heat, or when cables get too long for
+the speed.
+
+Remember, if it doesn't work under DOS, it probably won't work under
+Linux.
diff --git a/Documentation/scsi/scsi-changer.txt b/Documentation/scsi/scsi-changer.txt
new file mode 100644
index 0000000..032399b
--- /dev/null
+++ b/Documentation/scsi/scsi-changer.txt
@@ -0,0 +1,180 @@
+
+README for the SCSI media changer driver
+========================================
+
+This is a driver for SCSI Medium Changer devices, which are listed
+with "Type: Medium Changer" in /proc/scsi/scsi.
+
+This is for *real* Jukeboxes. It is *not* supported to work with
+common small CD-ROM changers, neither one-lun-per-slot SCSI changers
+nor IDE drives.
+
+Userland tools available from here:
+ http://linux.bytesex.org/misc/changer.html
+
+
+General Information
+-------------------
+
+First some words about how changers work: A changer has 2 (possibly
+more) SCSI ID's. One for the changer device which controls the robot,
+and one for the device which actually reads and writes the data. The
+later may be anything, a MOD, a CD-ROM, a tape or whatever. For the
+changer device this is a "don't care", he *only* shuffles around the
+media, nothing else.
+
+
+The SCSI changer model is complex, compared to - for example - IDE-CD
+changers. But it allows to handle nearly all possible cases. It knows
+4 different types of changer elements:
+
+ media transport - this one shuffles around the media, i.e. the
+ transport arm. Also known as "picker".
+ storage - a slot which can hold a media.
+ import/export - the same as above, but is accessible from outside,
+ i.e. there the operator (you !) can use this to
+ fill in and remove media from the changer.
+ Sometimes named "mailslot".
+ data transfer - this is the device which reads/writes, i.e. the
+ CD-ROM / Tape / whatever drive.
+
+None of these is limited to one: A huge Jukebox could have slots for
+123 CD-ROM's, 5 CD-ROM readers (and therefore 6 SCSI ID's: the changer
+and each CD-ROM) and 2 transport arms. No problem to handle.
+
+
+How it is implemented
+---------------------
+
+I implemented the driver as character device driver with a NetBSD-like
+ioctl interface. Just grabbed NetBSD's header file and one of the
+other linux SCSI device drivers as starting point. The interface
+should be source code compatible with NetBSD. So if there is any
+software (anybody knows ???) which supports a BSDish changer driver,
+it should work with this driver too.
+
+Over time a few more ioctls where added, volume tag support for example
+wasn't covered by the NetBSD ioctl API.
+
+
+Current State
+-------------
+
+Support for more than one transport arm is not implemented yet (and
+nobody asked for it so far...).
+
+I test and use the driver myself with a 35 slot cdrom jukebox from
+Grundig. I got some reports telling it works ok with tape autoloaders
+(Exabyte, HP and DEC). Some People use this driver with amanda. It
+works fine with small (11 slots) and a huge (4 MOs, 88 slots)
+magneto-optical Jukebox. Probably with lots of other changers too, most
+(but not all :-) people mail me only if it does *not* work...
+
+I don't have any device lists, neither black-list nor white-list. Thus
+it is quite useless to ask me whenever a specific device is supported or
+not. In theory every changer device which supports the SCSI-2 media
+changer command set should work out-of-the-box with this driver. If it
+doesn't, it is a bug. Either within the driver or within the firmware
+of the changer device.
+
+
+Using it
+--------
+
+This is a character device with major number is 86, so use
+"mknod /dev/sch0 c 86 0" to create the special file for the driver.
+
+If the module finds the changer, it prints some messages about the
+device [ try "dmesg" if you don't see anything ] and should show up in
+/proc/devices. If not.... some changers use ID ? / LUN 0 for the
+device and ID ? / LUN 1 for the robot mechanism. But Linux does *not*
+look for LUNs other than 0 as default, because there are too many
+broken devices. So you can try:
+
+ 1) echo "scsi add-single-device 0 0 ID 1" > /proc/scsi/scsi
+ (replace ID with the SCSI-ID of the device)
+ 2) boot the kernel with "max_scsi_luns=1" on the command line
+ (append="max_scsi_luns=1" in lilo.conf should do the trick)
+
+
+Trouble?
+--------
+
+If you insmod the driver with "insmod debug=1", it will be verbose and
+prints a lot of stuff to the syslog. Compiling the kernel with
+CONFIG_SCSI_CONSTANTS=y improves the quality of the error messages alot
+because the kernel will translate the error codes into human-readable
+strings then.
+
+You can display these messages with the dmesg command (or check the
+logfiles). If you email me some question because of a problem with the
+driver, please include these messages.
+
+
+Insmod options
+--------------
+
+debug=0/1
+ Enable debug messages (see above, default: 0).
+
+verbose=0/1
+ Be verbose (default: 1).
+
+init=0/1
+ Send INITIALIZE ELEMENT STATUS command to the changer
+ at insmod time (default: 1).
+
+timeout_init=<seconds>
+ timeout for the INITIALIZE ELEMENT STATUS command
+ (default: 3600).
+
+timeout_move=<seconds>
+ timeout for all other commands (default: 120).
+
+dt_id=<id1>,<id2>,...
+dt_lun=<lun1>,<lun2>,...
+ These two allow to specify the SCSI ID and LUN for the data
+ transfer elements. You likely don't need this as the jukebox
+ should provide this information. But some devices don't ...
+
+vendor_firsts=
+vendor_counts=
+vendor_labels=
+ These insmod options can be used to tell the driver that there
+ are some vendor-specific element types. Grundig for example
+ does this. Some jukeboxes have a printer to label fresh burned
+ CDs, which is addressed as element 0xc000 (type 5). To tell the
+ driver about this vendor-specific element, use this:
+ $ insmod ch \
+ vendor_firsts=0xc000 \
+ vendor_counts=1 \
+ vendor_labels=printer
+ All three insmod options accept up to four comma-separated
+ values, this way you can configure the element types 5-8.
+ You likely need the SCSI specs for the device in question to
+ find the correct values as they are not covered by the SCSI-2
+ standard.
+
+
+Credits
+-------
+
+I wrote this driver using the famous mailing-patches-around-the-world
+method. With (more or less) help from:
+
+ Daniel Moehwald <moehwald@hdg.de>
+ Dane Jasper <dane@sonic.net>
+ R. Scott Bailey <sbailey@dsddi.eds.com>
+ Jonathan Corbet <corbet@lwn.net>
+
+Special thanks go to
+ Martin Kuehne <martin.kuehne@bnbt.de>
+for a old, second-hand (but full functional) cdrom jukebox which I use
+to develop/test driver and tools now.
+
+Have fun,
+
+ Gerd
+
+--
+Gerd Knorr <kraxel@bytesex.org>
diff --git a/Documentation/scsi/scsi-generic.txt b/Documentation/scsi/scsi-generic.txt
new file mode 100644
index 0000000..c38e2b3
--- /dev/null
+++ b/Documentation/scsi/scsi-generic.txt
@@ -0,0 +1,101 @@
+ Notes on Linux SCSI Generic (sg) driver
+ ---------------------------------------
+ 20020126
+Introduction
+============
+The SCSI Generic driver (sg) is one of the four "high level" SCSI device
+drivers along with sd, st and sr (disk, tape and CDROM respectively). Sg
+is more generalized (but lower level) than its siblings and tends to be
+used on SCSI devices that don't fit into the already serviced categories.
+Thus sg is used for scanners, CD writers and reading audio CDs digitally
+amongst other things.
+
+Rather than document the driver's interface here, version information
+is provided plus pointers (i.e. URLs) where to find documentation
+and examples.
+
+
+Major versions of the sg driver
+===============================
+There are three major versions of sg found in the linux kernel (lk):
+ - sg version 1 (original) from 1992 to early 1999 (lk 2.2.5) .
+ It is based in the sg_header interface structure.
+ - sg version 2 from lk 2.2.6 in the 2.2 series. It is based on
+ an extended version of the sg_header interface structure.
+ - sg version 3 found in the lk 2.4 series (and the lk 2.5 series).
+ It adds the sg_io_hdr interface structure.
+
+
+Sg driver documentation
+=======================
+The most recent documentation of the sg driver is kept at the Linux
+Documentation Project's (LDP) site:
+http://www.tldp.org/HOWTO/SCSI-Generic-HOWTO
+This describes the sg version 3 driver found in the lk 2.4 series.
+The LDP renders documents in single and multiple page HTML, postscript
+and pdf. This document can also be found at:
+http://www.torque.net/sg/p/sg_v3_ho.html
+
+Documentation for the version 2 sg driver found in the lk 2.2 series can
+be found at http://www.torque.net/sg/p/scsi-generic.txt . A larger version
+is at: http://www.torque.net/sg/p/scsi-generic_long.txt .
+
+The original documentation for the sg driver (prior to lk 2.2.6) can be
+found at http://www.torque.net/sg/p/original/SCSI-Programming-HOWTO.txt
+and in the LDP archives.
+
+A changelog with brief notes can be found in the
+/usr/src/linux/include/scsi/sg.h file. Note that the glibc maintainers copy
+and edit this file (removing its changelog for example) before placing it
+in /usr/include/scsi/sg.h . Driver debugging information and other notes
+can be found at the top of the /usr/src/linux/drivers/scsi/sg.c file.
+
+A more general description of the Linux SCSI subsystem of which sg is a
+part can be found at http://www.tldp.org/HOWTO/SCSI-2.4-HOWTO .
+
+
+Example code and utilities
+==========================
+There are two packages of sg utilities:
+ - sg3_utils for the sg version 3 driver found in lk 2.4
+ - sg_utils for the sg version 2 (and original) driver found in lk 2.2
+ and earlier
+Both packages will work in the lk 2.4 series however sg3_utils offers more
+capabilities. They can be found at: http://www.torque.net/sg and
+freshmeat.net
+
+Another approach is to look at the applications that use the sg driver.
+These include cdrecord, cdparanoia, SANE and cdrdao.
+
+
+Mapping of Linux kernel versions to sg driver versions
+======================================================
+Here is a list of linux kernels in the 2.4 series that had new version
+of the sg driver:
+ lk 2.4.0 : sg version 3.1.17
+ lk 2.4.7 : sg version 3.1.19
+ lk 2.4.10 : sg version 3.1.20 **
+ lk 2.4.17 : sg version 3.1.22
+
+** There were 3 changes to sg version 3.1.20 by third parties in the
+ next six linux kernel versions.
+
+For reference here is a list of linux kernels in the 2.2 series that had
+new version of the sg driver:
+ lk 2.2.0 : original sg version [with no version number]
+ lk 2.2.6 : sg version 2.1.31
+ lk 2.2.8 : sg version 2.1.32
+ lk 2.2.10 : sg version 2.1.34 [SG_GET_VERSION_NUM ioctl first appeared]
+ lk 2.2.14 : sg version 2.1.36
+ lk 2.2.16 : sg version 2.1.38
+ lk 2.2.17 : sg version 2.1.39
+ lk 2.2.20 : sg version 2.1.40
+
+The lk 2.5 development series has recently commenced and it currently
+contains sg version 3.5.23 which is functionally equivalent to sg
+version 3.1.22 found in lk 2.4.17 .
+
+
+Douglas Gilbert
+26th January 2002
+dgilbert@interlog.com
diff --git a/Documentation/scsi/scsi.txt b/Documentation/scsi/scsi.txt
new file mode 100644
index 0000000..dd1bbf4
--- /dev/null
+++ b/Documentation/scsi/scsi.txt
@@ -0,0 +1,44 @@
+SCSI subsystem documentation
+============================
+The Linux Documentation Project (LDP) maintains a document describing
+the SCSI subsystem in the Linux kernel (lk) 2.4 series. See:
+http://www.tldp.org/HOWTO/SCSI-2.4-HOWTO . The LDP has single
+and multiple page HTML renderings as well as postscript and pdf.
+It can also be found at http://www.torque.net/scsi/SCSI-2.4-HOWTO .
+
+
+Notes on using modules in the SCSI subsystem
+============================================
+The scsi support in the linux kernel can be modularized in a number of
+different ways depending upon the needs of the end user. To understand
+your options, we should first define a few terms.
+
+The scsi-core (also known as the "mid level") contains the core of scsi
+support. Without it you can do nothing with any of the other scsi drivers.
+The scsi core support can be a module (scsi_mod.o), or it can be built into
+the kernel. If the core is a module, it must be the first scsi module
+loaded, and if you unload the modules, it will have to be the last one
+unloaded. In practice the modprobe and rmmod commands (and "autoclean")
+will enforce the correct ordering of loading and unloading modules in
+the SCSI subsystem.
+
+The individual upper and lower level drivers can be loaded in any order
+once the scsi core is present in the kernel (either compiled in or loaded
+as a module). The disk driver (sd_mod.o), cdrom driver (sr_mod.o),
+tape driver ** (st.o) and scsi generics driver (sg.o) represent the upper
+level drivers to support the various assorted devices which can be
+controlled. You can for example load the tape driver to use the tape drive,
+and then unload it once you have no further need for the driver (and release
+the associated memory).
+
+The lower level drivers are the ones that support the individual cards that
+are supported for the hardware platform that you are running under. Those
+individual cards are often called Host Bus Adapters (HBAs). For example the
+aic7xxx.o driver is used to control all recent SCSI controller cards from
+Adaptec. Almost all lower level drivers can be built either as modules or
+built into the kernel.
+
+
+** There is a variant of the st driver for controlling OnStream tape
+ devices. Its module name is osst.o .
+
diff --git a/Documentation/scsi/scsi_eh.txt b/Documentation/scsi/scsi_eh.txt
new file mode 100644
index 0000000..7acbebb
--- /dev/null
+++ b/Documentation/scsi/scsi_eh.txt
@@ -0,0 +1,479 @@
+
+SCSI EH
+======================================
+
+ This document describes SCSI midlayer error handling infrastructure.
+Please refer to Documentation/scsi/scsi_mid_low_api.txt for more
+information regarding SCSI midlayer.
+
+TABLE OF CONTENTS
+
+[1] How SCSI commands travel through the midlayer and to EH
+ [1-1] struct scsi_cmnd
+ [1-2] How do scmd's get completed?
+ [1-2-1] Completing a scmd w/ scsi_done
+ [1-2-2] Completing a scmd w/ timeout
+ [1-3] How EH takes over
+[2] How SCSI EH works
+ [2-1] EH through fine-grained callbacks
+ [2-1-1] Overview
+ [2-1-2] Flow of scmds through EH
+ [2-1-3] Flow of control
+ [2-2] EH through transportt->eh_strategy_handler()
+ [2-2-1] Pre transportt->eh_strategy_handler() SCSI midlayer conditions
+ [2-2-2] Post transportt->eh_strategy_handler() SCSI midlayer conditions
+ [2-2-3] Things to consider
+
+
+[1] How SCSI commands travel through the midlayer and to EH
+
+[1-1] struct scsi_cmnd
+
+ Each SCSI command is represented with struct scsi_cmnd (== scmd). A
+scmd has two list_head's to link itself into lists. The two are
+scmd->list and scmd->eh_entry. The former is used for free list or
+per-device allocated scmd list and not of much interest to this EH
+discussion. The latter is used for completion and EH lists and unless
+otherwise stated scmds are always linked using scmd->eh_entry in this
+discussion.
+
+
+[1-2] How do scmd's get completed?
+
+ Once LLDD gets hold of a scmd, either the LLDD will complete the
+command by calling scsi_done callback passed from midlayer when
+invoking hostt->queuecommand() or SCSI midlayer will time it out.
+
+
+[1-2-1] Completing a scmd w/ scsi_done
+
+ For all non-EH commands, scsi_done() is the completion callback. It
+does the following.
+
+ 1. Delete timeout timer. If it fails, it means that timeout timer
+ has expired and is going to finish the command. Just return.
+
+ 2. Link scmd to per-cpu scsi_done_q using scmd->en_entry
+
+ 3. Raise SCSI_SOFTIRQ
+
+ SCSI_SOFTIRQ handler scsi_softirq calls scsi_decide_disposition() to
+determine what to do with the command. scsi_decide_disposition()
+looks at the scmd->result value and sense data to determine what to do
+with the command.
+
+ - SUCCESS
+ scsi_finish_command() is invoked for the command. The
+ function does some maintenance choirs and notify completion by
+ calling scmd->done() callback, which, for fs requests, would
+ be HLD completion callback - sd:sd_rw_intr, sr:rw_intr,
+ st:st_intr.
+
+ - NEEDS_RETRY
+ - ADD_TO_MLQUEUE
+ scmd is requeued to blk queue.
+
+ - otherwise
+ scsi_eh_scmd_add(scmd, 0) is invoked for the command. See
+ [1-3] for details of this function.
+
+
+[1-2-2] Completing a scmd w/ timeout
+
+ The timeout handler is scsi_times_out(). When a timeout occurs, this
+function
+
+ 1. invokes optional hostt->eh_timed_out() callback. Return value can
+ be one of
+
+ - EH_HANDLED
+ This indicates that eh_timed_out() dealt with the timeout. The
+ scmd is passed to __scsi_done() and thus linked into per-cpu
+ scsi_done_q. Normal command completion described in [1-2-1]
+ follows.
+
+ - EH_RESET_TIMER
+ This indicates that more time is required to finish the
+ command. Timer is restarted. This action is counted as a
+ retry and only allowed scmd->allowed + 1(!) times. Once the
+ limit is reached, action for EH_NOT_HANDLED is taken instead.
+
+ *NOTE* This action is racy as the LLDD could finish the scmd
+ after the timeout has expired but before it's added back. In
+ such cases, scsi_done() would think that timeout has occurred
+ and return without doing anything. We lose completion and the
+ command will time out again.
+
+ - EH_NOT_HANDLED
+ This is the same as when eh_timed_out() callback doesn't exist.
+ Step #2 is taken.
+
+ 2. scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD) is invoked for the
+ command. See [1-3] for more information.
+
+
+[1-3] How EH takes over
+
+ scmds enter EH via scsi_eh_scmd_add(), which does the following.
+
+ 1. Turns on scmd->eh_eflags as requested. It's 0 for error
+ completions and SCSI_EH_CANCEL_CMD for timeouts.
+
+ 2. Links scmd->eh_entry to shost->eh_cmd_q
+
+ 3. Sets SHOST_RECOVERY bit in shost->shost_state
+
+ 4. Increments shost->host_failed
+
+ 5. Wakes up SCSI EH thread if shost->host_busy == shost->host_failed
+
+ As can be seen above, once any scmd is added to shost->eh_cmd_q,
+SHOST_RECOVERY shost_state bit is turned on. This prevents any new
+scmd to be issued from blk queue to the host; eventually, all scmds on
+the host either complete normally, fail and get added to eh_cmd_q, or
+time out and get added to shost->eh_cmd_q.
+
+ If all scmds either complete or fail, the number of in-flight scmds
+becomes equal to the number of failed scmds - i.e. shost->host_busy ==
+shost->host_failed. This wakes up SCSI EH thread. So, once woken up,
+SCSI EH thread can expect that all in-flight commands have failed and
+are linked on shost->eh_cmd_q.
+
+ Note that this does not mean lower layers are quiescent. If a LLDD
+completed a scmd with error status, the LLDD and lower layers are
+assumed to forget about the scmd at that point. However, if a scmd
+has timed out, unless hostt->eh_timed_out() made lower layers forget
+about the scmd, which currently no LLDD does, the command is still
+active as long as lower layers are concerned and completion could
+occur at any time. Of course, all such completions are ignored as the
+timer has already expired.
+
+ We'll talk about how SCSI EH takes actions to abort - make LLDD
+forget about - timed out scmds later.
+
+
+[2] How SCSI EH works
+
+ LLDD's can implement SCSI EH actions in one of the following two
+ways.
+
+ - Fine-grained EH callbacks
+ LLDD can implement fine-grained EH callbacks and let SCSI
+ midlayer drive error handling and call appropriate callbacks.
+ This will be discussed further in [2-1].
+
+ - eh_strategy_handler() callback
+ This is one big callback which should perform whole error
+ handling. As such, it should do all choirs SCSI midlayer
+ performs during recovery. This will be discussed in [2-2].
+
+ Once recovery is complete, SCSI EH resumes normal operation by
+calling scsi_restart_operations(), which
+
+ 1. Checks if door locking is needed and locks door.
+
+ 2. Clears SHOST_RECOVERY shost_state bit
+
+ 3. Wakes up waiters on shost->host_wait. This occurs if someone
+ calls scsi_block_when_processing_errors() on the host.
+ (*QUESTION* why is it needed? All operations will be blocked
+ anyway after it reaches blk queue.)
+
+ 4. Kicks queues in all devices on the host in the asses
+
+
+[2-1] EH through fine-grained callbacks
+
+[2-1-1] Overview
+
+ If eh_strategy_handler() is not present, SCSI midlayer takes charge
+of driving error handling. EH's goals are two - make LLDD, host and
+device forget about timed out scmds and make them ready for new
+commands. A scmd is said to be recovered if the scmd is forgotten by
+lower layers and lower layers are ready to process or fail the scmd
+again.
+
+ To achieve these goals, EH performs recovery actions with increasing
+severity. Some actions are performed by issuing SCSI commands and
+others are performed by invoking one of the following fine-grained
+hostt EH callbacks. Callbacks may be omitted and omitted ones are
+considered to fail always.
+
+int (* eh_abort_handler)(struct scsi_cmnd *);
+int (* eh_device_reset_handler)(struct scsi_cmnd *);
+int (* eh_bus_reset_handler)(struct scsi_cmnd *);
+int (* eh_host_reset_handler)(struct scsi_cmnd *);
+
+ Higher-severity actions are taken only when lower-severity actions
+cannot recover some of failed scmds. Also, note that failure of the
+highest-severity action means EH failure and results in offlining of
+all unrecovered devices.
+
+ During recovery, the following rules are followed
+
+ - Recovery actions are performed on failed scmds on the to do list,
+ eh_work_q. If a recovery action succeeds for a scmd, recovered
+ scmds are removed from eh_work_q.
+
+ Note that single recovery action on a scmd can recover multiple
+ scmds. e.g. resetting a device recovers all failed scmds on the
+ device.
+
+ - Higher severity actions are taken iff eh_work_q is not empty after
+ lower severity actions are complete.
+
+ - EH reuses failed scmds to issue commands for recovery. For
+ timed-out scmds, SCSI EH ensures that LLDD forgets about a scmd
+ before reusing it for EH commands.
+
+ When a scmd is recovered, the scmd is moved from eh_work_q to EH
+local eh_done_q using scsi_eh_finish_cmd(). After all scmds are
+recovered (eh_work_q is empty), scsi_eh_flush_done_q() is invoked to
+either retry or error-finish (notify upper layer of failure) recovered
+scmds.
+
+ scmds are retried iff its sdev is still online (not offlined during
+EH), REQ_FAILFAST is not set and ++scmd->retries is less than
+scmd->allowed.
+
+
+[2-1-2] Flow of scmds through EH
+
+ 1. Error completion / time out
+ ACTION: scsi_eh_scmd_add() is invoked for scmd
+ - set scmd->eh_eflags
+ - add scmd to shost->eh_cmd_q
+ - set SHOST_RECOVERY
+ - shost->host_failed++
+ LOCKING: shost->host_lock
+
+ 2. EH starts
+ ACTION: move all scmds to EH's local eh_work_q. shost->eh_cmd_q
+ is cleared.
+ LOCKING: shost->host_lock (not strictly necessary, just for
+ consistency)
+
+ 3. scmd recovered
+ ACTION: scsi_eh_finish_cmd() is invoked to EH-finish scmd
+ - shost->host_failed--
+ - clear scmd->eh_eflags
+ - scsi_setup_cmd_retry()
+ - move from local eh_work_q to local eh_done_q
+ LOCKING: none
+
+ 4. EH completes
+ ACTION: scsi_eh_flush_done_q() retries scmds or notifies upper
+ layer of failure.
+ - scmd is removed from eh_done_q and scmd->eh_entry is cleared
+ - if retry is necessary, scmd is requeued using
+ scsi_queue_insert()
+ - otherwise, scsi_finish_command() is invoked for scmd
+ LOCKING: queue or finish function performs appropriate locking
+
+
+[2-1-3] Flow of control
+
+ EH through fine-grained callbacks start from scsi_unjam_host().
+
+<<scsi_unjam_host>>
+
+ 1. Lock shost->host_lock, splice_init shost->eh_cmd_q into local
+ eh_work_q and unlock host_lock. Note that shost->eh_cmd_q is
+ cleared by this action.
+
+ 2. Invoke scsi_eh_get_sense.
+
+ <<scsi_eh_get_sense>>
+
+ This action is taken for each error-completed
+ (!SCSI_EH_CANCEL_CMD) commands without valid sense data. Most
+ SCSI transports/LLDDs automatically acquire sense data on
+ command failures (autosense). Autosense is recommended for
+ performance reasons and as sense information could get out of
+ sync inbetween occurrence of CHECK CONDITION and this action.
+
+ Note that if autosense is not supported, scmd->sense_buffer
+ contains invalid sense data when error-completing the scmd
+ with scsi_done(). scsi_decide_disposition() always returns
+ FAILED in such cases thus invoking SCSI EH. When the scmd
+ reaches here, sense data is acquired and
+ scsi_decide_disposition() is called again.
+
+ 1. Invoke scsi_request_sense() which issues REQUEST_SENSE
+ command. If fails, no action. Note that taking no action
+ causes higher-severity recovery to be taken for the scmd.
+
+ 2. Invoke scsi_decide_disposition() on the scmd
+
+ - SUCCESS
+ scmd->retries is set to scmd->allowed preventing
+ scsi_eh_flush_done_q() from retrying the scmd and
+ scsi_eh_finish_cmd() is invoked.
+
+ - NEEDS_RETRY
+ scsi_eh_finish_cmd() invoked
+
+ - otherwise
+ No action.
+
+ 3. If !list_empty(&eh_work_q), invoke scsi_eh_abort_cmds().
+
+ <<scsi_eh_abort_cmds>>
+
+ This action is taken for each timed out command.
+ hostt->eh_abort_handler() is invoked for each scmd. The
+ handler returns SUCCESS if it has succeeded to make LLDD and
+ all related hardware forget about the scmd.
+
+ If a timedout scmd is successfully aborted and the sdev is
+ either offline or ready, scsi_eh_finish_cmd() is invoked for
+ the scmd. Otherwise, the scmd is left in eh_work_q for
+ higher-severity actions.
+
+ Note that both offline and ready status mean that the sdev is
+ ready to process new scmds, where processing also implies
+ immediate failing; thus, if a sdev is in one of the two
+ states, no further recovery action is needed.
+
+ Device readiness is tested using scsi_eh_tur() which issues
+ TEST_UNIT_READY command. Note that the scmd must have been
+ aborted successfully before reusing it for TEST_UNIT_READY.
+
+ 4. If !list_empty(&eh_work_q), invoke scsi_eh_ready_devs()
+
+ <<scsi_eh_ready_devs>>
+
+ This function takes four increasingly more severe measures to
+ make failed sdevs ready for new commands.
+
+ 1. Invoke scsi_eh_stu()
+
+ <<scsi_eh_stu>>
+
+ For each sdev which has failed scmds with valid sense data
+ of which scsi_check_sense()'s verdict is FAILED,
+ START_STOP_UNIT command is issued w/ start=1. Note that
+ as we explicitly choose error-completed scmds, it is known
+ that lower layers have forgotten about the scmd and we can
+ reuse it for STU.
+
+ If STU succeeds and the sdev is either offline or ready,
+ all failed scmds on the sdev are EH-finished with
+ scsi_eh_finish_cmd().
+
+ *NOTE* If hostt->eh_abort_handler() isn't implemented or
+ failed, we may still have timed out scmds at this point
+ and STU doesn't make lower layers forget about those
+ scmds. Yet, this function EH-finish all scmds on the sdev
+ if STU succeeds leaving lower layers in an inconsistent
+ state. It seems that STU action should be taken only when
+ a sdev has no timed out scmd.
+
+ 2. If !list_empty(&eh_work_q), invoke scsi_eh_bus_device_reset().
+
+ <<scsi_eh_bus_device_reset>>
+
+ This action is very similar to scsi_eh_stu() except that,
+ instead of issuing STU, hostt->eh_device_reset_handler()
+ is used. Also, as we're not issuing SCSI commands and
+ resetting clears all scmds on the sdev, there is no need
+ to choose error-completed scmds.
+
+ 3. If !list_empty(&eh_work_q), invoke scsi_eh_bus_reset()
+
+ <<scsi_eh_bus_reset>>
+
+ hostt->eh_bus_reset_handler() is invoked for each channel
+ with failed scmds. If bus reset succeeds, all failed
+ scmds on all ready or offline sdevs on the channel are
+ EH-finished.
+
+ 4. If !list_empty(&eh_work_q), invoke scsi_eh_host_reset()
+
+ <<scsi_eh_host_reset>>
+
+ This is the last resort. hostt->eh_host_reset_handler()
+ is invoked. If host reset succeeds, all failed scmds on
+ all ready or offline sdevs on the host are EH-finished.
+
+ 5. If !list_empty(&eh_work_q), invoke scsi_eh_offline_sdevs()
+
+ <<scsi_eh_offline_sdevs>>
+
+ Take all sdevs which still have unrecovered scmds offline
+ and EH-finish the scmds.
+
+ 5. Invoke scsi_eh_flush_done_q().
+
+ <<scsi_eh_flush_done_q>>
+
+ At this point all scmds are recovered (or given up) and
+ put on eh_done_q by scsi_eh_finish_cmd(). This function
+ flushes eh_done_q by either retrying or notifying upper
+ layer of failure of the scmds.
+
+
+[2-2] EH through transportt->eh_strategy_handler()
+
+ transportt->eh_strategy_handler() is invoked in the place of
+scsi_unjam_host() and it is responsible for whole recovery process.
+On completion, the handler should have made lower layers forget about
+all failed scmds and either ready for new commands or offline. Also,
+it should perform SCSI EH maintenance choirs to maintain integrity of
+SCSI midlayer. IOW, of the steps described in [2-1-2], all steps
+except for #1 must be implemented by eh_strategy_handler().
+
+
+[2-2-1] Pre transportt->eh_strategy_handler() SCSI midlayer conditions
+
+ The following conditions are true on entry to the handler.
+
+ - Each failed scmd's eh_flags field is set appropriately.
+
+ - Each failed scmd is linked on scmd->eh_cmd_q by scmd->eh_entry.
+
+ - SHOST_RECOVERY is set.
+
+ - shost->host_failed == shost->host_busy
+
+
+[2-2-2] Post transportt->eh_strategy_handler() SCSI midlayer conditions
+
+ The following conditions must be true on exit from the handler.
+
+ - shost->host_failed is zero.
+
+ - Each scmd's eh_eflags field is cleared.
+
+ - Each scmd is in such a state that scsi_setup_cmd_retry() on the
+ scmd doesn't make any difference.
+
+ - shost->eh_cmd_q is cleared.
+
+ - Each scmd->eh_entry is cleared.
+
+ - Either scsi_queue_insert() or scsi_finish_command() is called on
+ each scmd. Note that the handler is free to use scmd->retries and
+ ->allowed to limit the number of retries.
+
+
+[2-2-3] Things to consider
+
+ - Know that timed out scmds are still active on lower layers. Make
+ lower layers forget about them before doing anything else with
+ those scmds.
+
+ - For consistency, when accessing/modifying shost data structure,
+ grab shost->host_lock.
+
+ - On completion, each failed sdev must have forgotten about all
+ active scmds.
+
+ - On completion, each failed sdev must be ready for new commands or
+ offline.
+
+
+--
+Tejun Heo
+htejun@gmail.com
+11th September 2005
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
new file mode 100644
index 0000000..38d324d
--- /dev/null
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -0,0 +1,486 @@
+ SCSI FC Tansport
+ =============================================
+
+Date: 4/12/2007
+Kernel Revisions for features:
+ rports : <<TBS>>
+ vports : 2.6.22 (? TBD)
+
+
+Introduction
+============
+This file documents the features and components of the SCSI FC Transport.
+It also provides documents the API between the transport and FC LLDDs.
+The FC transport can be found at:
+ drivers/scsi/scsi_transport_fc.c
+ include/scsi/scsi_transport_fc.h
+ include/scsi/scsi_netlink_fc.h
+
+This file is found at Documentation/scsi/scsi_fc_transport.txt
+
+
+FC Remote Ports (rports)
+========================================================================
+<< To Be Supplied >>
+
+
+FC Virtual Ports (vports)
+========================================================================
+
+Overview:
+-------------------------------
+
+ New FC standards have defined mechanisms which allows for a single physical
+ port to appear on as multiple communication ports. Using the N_Port Id
+ Virtualization (NPIV) mechanism, a point-to-point connection to a Fabric
+ can be assigned more than 1 N_Port_ID. Each N_Port_ID appears as a
+ separate port to other endpoints on the fabric, even though it shares one
+ physical link to the switch for communication. Each N_Port_ID can have a
+ unique view of the fabric based on fabric zoning and array lun-masking
+ (just like a normal non-NPIV adapter). Using the Virtual Fabric (VF)
+ mechanism, adding a fabric header to each frame allows the port to
+ interact with the Fabric Port to join multiple fabrics. The port will
+ obtain an N_Port_ID on each fabric it joins. Each fabric will have its
+ own unique view of endpoints and configuration parameters. NPIV may be
+ used together with VF so that the port can obtain multiple N_Port_IDs
+ on each virtual fabric.
+
+ The FC transport is now recognizing a new object - a vport. A vport is
+ an entity that has a world-wide unique World Wide Port Name (wwpn) and
+ World Wide Node Name (wwnn). The transport also allows for the FC4's to
+ be specified for the vport, with FCP_Initiator being the primary role
+ expected. Once instantiated by one of the above methods, it will have a
+ distinct N_Port_ID and view of fabric endpoints and storage entities.
+ The fc_host associated with the physical adapter will export the ability
+ to create vports. The transport will create the vport object within the
+ Linux device tree, and instruct the fc_host's driver to instantiate the
+ virtual port. Typically, the driver will create a new scsi_host instance
+ on the vport, resulting in a unique <H,C,T,L> namespace for the vport.
+ Thus, whether a FC port is based on a physical port or on a virtual port,
+ each will appear as a unique scsi_host with its own target and lun space.
+
+ Note: At this time, the transport is written to create only NPIV-based
+ vports. However, consideration was given to VF-based vports and it
+ should be a minor change to add support if needed. The remaining
+ discussion will concentrate on NPIV.
+
+ Note: World Wide Name assignment (and uniqueness guarantees) are left
+ up to an administrative entity controlling the vport. For example,
+ if vports are to be associated with virtual machines, a XEN mgmt
+ utility would be responsible for creating wwpn/wwnn's for the vport,
+ using it's own naming authority and OUI. (Note: it already does this
+ for virtual MAC addresses).
+
+
+Device Trees and Vport Objects:
+-------------------------------
+
+ Today, the device tree typically contains the scsi_host object,
+ with rports and scsi target objects underneath it. Currently the FC
+ transport creates the vport object and places it under the scsi_host
+ object corresponding to the physical adapter. The LLDD will allocate
+ a new scsi_host for the vport and link it's object under the vport.
+ The remainder of the tree under the vports scsi_host is the same
+ as the non-NPIV case. The transport is written currently to easily
+ allow the parent of the vport to be something other than the scsi_host.
+ This could be used in the future to link the object onto a vm-specific
+ device tree. If the vport's parent is not the physical port's scsi_host,
+ a symbolic link to the vport object will be placed in the physical
+ port's scsi_host.
+
+ Here's what to expect in the device tree :
+ The typical Physical Port's Scsi_Host:
+ /sys/devices/.../host17/
+ and it has the typical descendant tree:
+ /sys/devices/.../host17/rport-17:0-0/target17:0:0/17:0:0:0:
+ and then the vport is created on the Physical Port:
+ /sys/devices/.../host17/vport-17:0-0
+ and the vport's Scsi_Host is then created:
+ /sys/devices/.../host17/vport-17:0-0/host18
+ and then the rest of the tree progresses, such as:
+ /sys/devices/.../host17/vport-17:0-0/host18/rport-18:0-0/target18:0:0/18:0:0:0:
+
+ Here's what to expect in the sysfs tree :
+ scsi_hosts:
+ /sys/class/scsi_host/host17 physical port's scsi_host
+ /sys/class/scsi_host/host18 vport's scsi_host
+ fc_hosts:
+ /sys/class/fc_host/host17 physical port's fc_host
+ /sys/class/fc_host/host18 vport's fc_host
+ fc_vports:
+ /sys/class/fc_vports/vport-17:0-0 the vport's fc_vport
+ fc_rports:
+ /sys/class/fc_remote_ports/rport-17:0-0 rport on the physical port
+ /sys/class/fc_remote_ports/rport-18:0-0 rport on the vport
+
+
+Vport Attributes:
+-------------------------------
+
+ The new fc_vport class object has the following attributes
+
+ node_name: Read_Only
+ The WWNN of the vport
+
+ port_name: Read_Only
+ The WWPN of the vport
+
+ roles: Read_Only
+ Indicates the FC4 roles enabled on the vport.
+
+ symbolic_name: Read_Write
+ A string, appended to the driver's symbolic port name string, which
+ is registered with the switch to identify the vport. For example,
+ a hypervisor could set this string to "Xen Domain 2 VM 5 Vport 2",
+ and this set of identifiers can be seen on switch management screens
+ to identify the port.
+
+ vport_delete: Write_Only
+ When written with a "1", will tear down the vport.
+
+ vport_disable: Write_Only
+ When written with a "1", will transition the vport to a disabled.
+ state. The vport will still be instantiated with the Linux kernel,
+ but it will not be active on the FC link.
+ When written with a "0", will enable the vport.
+
+ vport_last_state: Read_Only
+ Indicates the previous state of the vport. See the section below on
+ "Vport States".
+
+ vport_state: Read_Only
+ Indicates the state of the vport. See the section below on
+ "Vport States".
+
+ vport_type: Read_Only
+ Reflects the FC mechanism used to create the virtual port.
+ Only NPIV is supported currently.
+
+
+ For the fc_host class object, the following attributes are added for vports:
+
+ max_npiv_vports: Read_Only
+ Indicates the maximum number of NPIV-based vports that the
+ driver/adapter can support on the fc_host.
+
+ npiv_vports_inuse: Read_Only
+ Indicates how many NPIV-based vports have been instantiated on the
+ fc_host.
+
+ vport_create: Write_Only
+ A "simple" create interface to instantiate a vport on an fc_host.
+ A "<WWPN>:<WWNN>" string is written to the attribute. The transport
+ then instantiates the vport object and calls the LLDD to create the
+ vport with the role of FCP_Initiator. Each WWN is specified as 16
+ hex characters and may *not* contain any prefixes (e.g. 0x, x, etc).
+
+ vport_delete: Write_Only
+ A "simple" delete interface to teardown a vport. A "<WWPN>:<WWNN>"
+ string is written to the attribute. The transport will locate the
+ vport on the fc_host with the same WWNs and tear it down. Each WWN
+ is specified as 16 hex characters and may *not* contain any prefixes
+ (e.g. 0x, x, etc).
+
+
+Vport States:
+-------------------------------
+
+ Vport instantiation consists of two parts:
+ - Creation with the kernel and LLDD. This means all transport and
+ driver data structures are built up, and device objects created.
+ This is equivalent to a driver "attach" on an adapter, which is
+ independent of the adapter's link state.
+ - Instantiation of the vport on the FC link via ELS traffic, etc.
+ This is equivalent to a "link up" and successfull link initialization.
+ Further information can be found in the interfaces section below for
+ Vport Creation.
+
+ Once a vport has been instantiated with the kernel/LLDD, a vport state
+ can be reported via the sysfs attribute. The following states exist:
+
+ FC_VPORT_UNKNOWN - Unknown
+ An temporary state, typically set only while the vport is being
+ instantiated with the kernel and LLDD.
+
+ FC_VPORT_ACTIVE - Active
+ The vport has been successfully been created on the FC link.
+ It is fully functional.
+
+ FC_VPORT_DISABLED - Disabled
+ The vport instantiated, but "disabled". The vport is not instantiated
+ on the FC link. This is equivalent to a physical port with the
+ link "down".
+
+ FC_VPORT_LINKDOWN - Linkdown
+ The vport is not operational as the physical link is not operational.
+
+ FC_VPORT_INITIALIZING - Initializing
+ The vport is in the process of instantiating on the FC link.
+ The LLDD will set this state just prior to starting the ELS traffic
+ to create the vport. This state will persist until the vport is
+ successfully created (state becomes FC_VPORT_ACTIVE) or it fails
+ (state is one of the values below). As this state is transitory,
+ it will not be preserved in the "vport_last_state".
+
+ FC_VPORT_NO_FABRIC_SUPP - No Fabric Support
+ The vport is not operational. One of the following conditions were
+ encountered:
+ - The FC topology is not Point-to-Point
+ - The FC port is not connected to an F_Port
+ - The F_Port has indicated that NPIV is not supported.
+
+ FC_VPORT_NO_FABRIC_RSCS - No Fabric Resources
+ The vport is not operational. The Fabric failed FDISC with a status
+ indicating that it does not have sufficient resources to complete
+ the operation.
+
+ FC_VPORT_FABRIC_LOGOUT - Fabric Logout
+ The vport is not operational. The Fabric has LOGO'd the N_Port_ID
+ associated with the vport.
+
+ FC_VPORT_FABRIC_REJ_WWN - Fabric Rejected WWN
+ The vport is not operational. The Fabric failed FDISC with a status
+ indicating that the WWN's are not valid.
+
+ FC_VPORT_FAILED - VPort Failed
+ The vport is not operational. This is a catchall for all other
+ error conditions.
+
+
+ The following state table indicates the different state transitions:
+
+ State Event New State
+ --------------------------------------------------------------------
+ n/a Initialization Unknown
+ Unknown: Link Down Linkdown
+ Link Up & Loop No Fabric Support
+ Link Up & no Fabric No Fabric Support
+ Link Up & FLOGI response No Fabric Support
+ indicates no NPIV support
+ Link Up & FDISC being sent Initializing
+ Disable request Disable
+ Linkdown: Link Up Unknown
+ Initializing: FDISC ACC Active
+ FDISC LS_RJT w/ no resources No Fabric Resources
+ FDISC LS_RJT w/ invalid Fabric Rejected WWN
+ pname or invalid nport_id
+ FDISC LS_RJT failed for Vport Failed
+ other reasons
+ Link Down Linkdown
+ Disable request Disable
+ Disable: Enable request Unknown
+ Active: LOGO received from fabric Fabric Logout
+ Link Down Linkdown
+ Disable request Disable
+ Fabric Logout: Link still up Unknown
+
+ The following 4 error states all have the same transitions:
+ No Fabric Support:
+ No Fabric Resources:
+ Fabric Rejected WWN:
+ Vport Failed:
+ Disable request Disable
+ Link goes down Linkdown
+
+
+Transport <-> LLDD Interfaces :
+-------------------------------
+
+Vport support by LLDD:
+
+ The LLDD indicates support for vports by supplying a vport_create()
+ function in the transport template. The presense of this function will
+ cause the creation of the new attributes on the fc_host. As part of
+ the physical port completing its initialization relative to the
+ transport, it should set the max_npiv_vports attribute to indicate the
+ maximum number of vports the driver and/or adapter supports.
+
+
+Vport Creation:
+
+ The LLDD vport_create() syntax is:
+
+ int vport_create(struct fc_vport *vport, bool disable)
+
+ where:
+ vport: Is the newly allocated vport object
+ disable: If "true", the vport is to be created in a disabled stated.
+ If "false", the vport is to be enabled upon creation.
+
+ When a request is made to create a new vport (via sgio/netlink, or the
+ vport_create fc_host attribute), the transport will validate that the LLDD
+ can support another vport (e.g. max_npiv_vports > npiv_vports_inuse).
+ If not, the create request will be failed. If space remains, the transport
+ will increment the vport count, create the vport object, and then call the
+ LLDD's vport_create() function with the newly allocated vport object.
+
+ As mentioned above, vport creation is divided into two parts:
+ - Creation with the kernel and LLDD. This means all transport and
+ driver data structures are built up, and device objects created.
+ This is equivalent to a driver "attach" on an adapter, which is
+ independent of the adapter's link state.
+ - Instantiation of the vport on the FC link via ELS traffic, etc.
+ This is equivalent to a "link up" and successfull link initialization.
+
+ The LLDD's vport_create() function will not synchronously wait for both
+ parts to be fully completed before returning. It must validate that the
+ infrastructure exists to support NPIV, and complete the first part of
+ vport creation (data structure build up) before returning. We do not
+ hinge vport_create() on the link-side operation mainly because:
+ - The link may be down. It is not a failure if it is. It simply
+ means the vport is in an inoperable state until the link comes up.
+ This is consistent with the link bouncing post vport creation.
+ - The vport may be created in a disabled state.
+ - This is consistent with a model where: the vport equates to a
+ FC adapter. The vport_create is synonymous with driver attachment
+ to the adapter, which is independent of link state.
+
+ Note: special error codes have been defined to delineate infrastructure
+ failure cases for quicker resolution.
+
+ The expected behavior for the LLDD's vport_create() function is:
+ - Validate Infrastructure:
+ - If the driver or adapter cannot support another vport, whether
+ due to improper firmware, (a lie about) max_npiv, or a lack of
+ some other resource - return VPCERR_UNSUPPORTED.
+ - If the driver validates the WWN's against those already active on
+ the adapter and detects an overlap - return VPCERR_BAD_WWN.
+ - If the driver detects the topology is loop, non-fabric, or the
+ FLOGI did not support NPIV - return VPCERR_NO_FABRIC_SUPP.
+ - Allocate data structures. If errors are encountered, such as out
+ of memory conditions, return the respective negative Exxx error code.
+ - If the role is FCP Initiator, the LLDD is to :
+ - Call scsi_host_alloc() to allocate a scsi_host for the vport.
+ - Call scsi_add_host(new_shost, &vport->dev) to start the scsi_host
+ and bind it as a child of the vport device.
+ - Initializes the fc_host attribute values.
+ - Kick of further vport state transitions based on the disable flag and
+ link state - and return success (zero).
+
+ LLDD Implementers Notes:
+ - It is suggested that there be a different fc_function_templates for
+ the physical port and the virtual port. The physical port's template
+ would have the vport_create, vport_delete, and vport_disable functions,
+ while the vports would not.
+ - It is suggested that there be different scsi_host_templates
+ for the physical port and virtual port. Likely, there are driver
+ attributes, embedded into the scsi_host_template, that are applicable
+ for the physical port only (link speed, topology setting, etc). This
+ ensures that the attributes are applicable to the respective scsi_host.
+
+
+Vport Disable/Enable:
+
+ The LLDD vport_disable() syntax is:
+
+ int vport_disable(struct fc_vport *vport, bool disable)
+
+ where:
+ vport: Is vport to to be enabled or disabled
+ disable: If "true", the vport is to be disabled.
+ If "false", the vport is to be enabled.
+
+ When a request is made to change the disabled state on a vport, the
+ transport will validate the request against the existing vport state.
+ If the request is to disable and the vport is already disabled, the
+ request will fail. Similarly, if the request is to enable, and the
+ vport is not in a disabled state, the request will fail. If the request
+ is valid for the vport state, the transport will call the LLDD to
+ change the vport's state.
+
+ Within the LLDD, if a vport is disabled, it remains instantiated with
+ the kernel and LLDD, but it is not active or visible on the FC link in
+ any way. (see Vport Creation and the 2 part instantiation discussion).
+ The vport will remain in this state until it is deleted or re-enabled.
+ When enabling a vport, the LLDD reinstantiates the vport on the FC
+ link - essentially restarting the LLDD statemachine (see Vport States
+ above).
+
+
+Vport Deletion:
+
+ The LLDD vport_delete() syntax is:
+
+ int vport_delete(struct fc_vport *vport)
+
+ where:
+ vport: Is vport to delete
+
+ When a request is made to delete a vport (via sgio/netlink, or via the
+ fc_host or fc_vport vport_delete attributes), the transport will call
+ the LLDD to terminate the vport on the FC link, and teardown all other
+ datastructures and references. If the LLDD completes successfully,
+ the transport will teardown the vport objects and complete the vport
+ removal. If the LLDD delete request fails, the vport object will remain,
+ but will be in an indeterminate state.
+
+ Within the LLDD, the normal code paths for a scsi_host teardown should
+ be followed. E.g. If the vport has a FCP Initiator role, the LLDD
+ will call fc_remove_host() for the vports scsi_host, followed by
+ scsi_remove_host() and scsi_host_put() for the vports scsi_host.
+
+
+Other:
+ fc_host port_type attribute:
+ There is a new fc_host port_type value - FC_PORTTYPE_NPIV. This value
+ must be set on all vport-based fc_hosts. Normally, on a physical port,
+ the port_type attribute would be set to NPORT, NLPORT, etc based on the
+ topology type and existence of the fabric. As this is not applicable to
+ a vport, it makes more sense to report the FC mechanism used to create
+ the vport.
+
+ Driver unload:
+ FC drivers are required to call fc_remove_host() prior to calling
+ scsi_remove_host(). This allows the fc_host to tear down all remote
+ ports prior the scsi_host being torn down. The fc_remove_host() call
+ was updated to remove all vports for the fc_host as well.
+
+
+Transport supplied functions
+----------------------------
+
+The following functions are supplied by the FC-transport for use by LLDs.
+
+ fc_vport_create - create a vport
+ fc_vport_terminate - detach and remove a vport
+
+Details:
+
+/**
+ * fc_vport_create - Admin App or LLDD requests creation of a vport
+ * @shost: scsi host the virtual port is connected to.
+ * @ids: The world wide names, FC4 port roles, etc for
+ * the virtual port.
+ *
+ * Notes:
+ * This routine assumes no locks are held on entry.
+ */
+struct fc_vport *
+fc_vport_create(struct Scsi_Host *shost, struct fc_vport_identifiers *ids)
+
+/**
+ * fc_vport_terminate - Admin App or LLDD requests termination of a vport
+ * @vport: fc_vport to be terminated
+ *
+ * Calls the LLDD vport_delete() function, then deallocates and removes
+ * the vport from the shost and object tree.
+ *
+ * Notes:
+ * This routine assumes no locks are held on entry.
+ */
+int
+fc_vport_terminate(struct fc_vport *vport)
+
+
+Credits
+=======
+The following people have contributed to this document:
+
+
+
+
+
+
+James Smart
+james.smart@emulex.com
+
diff --git a/Documentation/scsi/scsi_mid_low_api.txt b/Documentation/scsi/scsi_mid_low_api.txt
new file mode 100644
index 0000000..a6d5354
--- /dev/null
+++ b/Documentation/scsi/scsi_mid_low_api.txt
@@ -0,0 +1,1421 @@
+ Linux Kernel 2.6 series
+ SCSI mid_level - lower_level driver interface
+ =============================================
+
+Introduction
+============
+This document outlines the interface between the Linux SCSI mid level and
+SCSI lower level drivers. Lower level drivers (LLDs) are variously called
+host bus adapter (HBA) drivers and host drivers (HD). A "host" in this
+context is a bridge between a computer IO bus (e.g. PCI or ISA) and a
+single SCSI initiator port on a SCSI transport. An "initiator" port
+(SCSI terminology, see SAM-3 at http://www.t10.org) sends SCSI commands
+to "target" SCSI ports (e.g. disks). There can be many LLDs in a running
+system, but only one per hardware type. Most LLDs can control one or more
+SCSI HBAs. Some HBAs contain multiple hosts.
+
+In some cases the SCSI transport is an external bus that already has
+its own subsystem in Linux (e.g. USB and ieee1394). In such cases the
+SCSI subsystem LLD is a software bridge to the other driver subsystem.
+Examples are the usb-storage driver (found in the drivers/usb/storage
+directory) and the ieee1394/sbp2 driver (found in the drivers/ieee1394
+directory).
+
+For example, the aic7xxx LLD controls Adaptec SCSI parallel interface
+(SPI) controllers based on that company's 7xxx chip series. The aic7xxx
+LLD can be built into the kernel or loaded as a module. There can only be
+one aic7xxx LLD running in a Linux system but it may be controlling many
+HBAs. These HBAs might be either on PCI daughter-boards or built into
+the motherboard (or both). Some aic7xxx based HBAs are dual controllers
+and thus represent two hosts. Like most modern HBAs, each aic7xxx host
+has its own PCI device address. [The one-to-one correspondence between
+a SCSI host and a PCI device is common but not required (e.g. with
+ISA or MCA adapters).]
+
+The SCSI mid level isolates an LLD from other layers such as the SCSI
+upper layer drivers and the block layer.
+
+This version of the document roughly matches linux kernel version 2.6.8 .
+
+Documentation
+=============
+There is a SCSI documentation directory within the kernel source tree,
+typically Documentation/scsi . Most documents are in plain
+(i.e. ASCII) text. This file is named scsi_mid_low_api.txt and can be
+found in that directory. A more recent copy of this document may be found
+at http://www.torque.net/scsi/scsi_mid_low_api.txt.gz .
+Many LLDs are documented there (e.g. aic7xxx.txt). The SCSI mid-level is
+briefly described in scsi.txt which contains a url to a document
+describing the SCSI subsystem in the lk 2.4 series. Two upper level
+drivers have documents in that directory: st.txt (SCSI tape driver) and
+scsi-generic.txt (for the sg driver).
+
+Some documentation (or urls) for LLDs may be found in the C source code
+or in the same directory as the C source code. For example to find a url
+about the USB mass storage driver see the
+/usr/src/linux/drivers/usb/storage directory.
+
+The Linux kernel source Documentation/DocBook/scsidrivers.tmpl file
+refers to this file. With the appropriate DocBook tool-set, this permits
+users to generate html, ps and pdf renderings of information within this
+file (e.g. the interface functions).
+
+Driver structure
+================
+Traditionally an LLD for the SCSI subsystem has been at least two files in
+the drivers/scsi directory. For example, a driver called "xyz" has a header
+file "xyz.h" and a source file "xyz.c". [Actually there is no good reason
+why this couldn't all be in one file; the header file is superfluous.] Some
+drivers that have been ported to several operating systems have more than
+two files. For example the aic7xxx driver has separate files for generic
+and OS-specific code (e.g. FreeBSD and Linux). Such drivers tend to have
+their own directory under the drivers/scsi directory.
+
+When a new LLD is being added to Linux, the following files (found in the
+drivers/scsi directory) will need some attention: Makefile and Kconfig .
+It is probably best to study how existing LLDs are organized.
+
+As the 2.5 series development kernels evolve into the 2.6 series
+production series, changes are being introduced into this interface. An
+example of this is driver initialization code where there are now 2 models
+available. The older one, similar to what was found in the lk 2.4 series,
+is based on hosts that are detected at HBA driver load time. This will be
+referred to the "passive" initialization model. The newer model allows HBAs
+to be hot plugged (and unplugged) during the lifetime of the LLD and will
+be referred to as the "hotplug" initialization model. The newer model is
+preferred as it can handle both traditional SCSI equipment that is
+permanently connected as well as modern "SCSI" devices (e.g. USB or
+IEEE 1394 connected digital cameras) that are hotplugged. Both
+initialization models are discussed in the following sections.
+
+An LLD interfaces to the SCSI subsystem several ways:
+ a) directly invoking functions supplied by the mid level
+ b) passing a set of function pointers to a registration function
+ supplied by the mid level. The mid level will then invoke these
+ functions at some point in the future. The LLD will supply
+ implementations of these functions.
+ c) direct access to instances of well known data structures maintained
+ by the mid level
+
+Those functions in group a) are listed in a section entitled "Mid level
+supplied functions" below.
+
+Those functions in group b) are listed in a section entitled "Interface
+functions" below. Their function pointers are placed in the members of
+"struct scsi_host_template", an instance of which is passed to
+scsi_host_alloc() ** . Those interface functions that the LLD does not
+wish to supply should have NULL placed in the corresponding member of
+struct scsi_host_template. Defining an instance of struct
+scsi_host_template at file scope will cause NULL to be placed in function
+ pointer members not explicitly initialized.
+
+Those usages in group c) should be handled with care, especially in a
+"hotplug" environment. LLDs should be aware of the lifetime of instances
+that are shared with the mid level and other layers.
+
+All functions defined within an LLD and all data defined at file scope
+should be static. For example the slave_alloc() function in an LLD
+called "xxx" could be defined as
+"static int xxx_slave_alloc(struct scsi_device * sdev) { /* code */ }"
+
+** the scsi_host_alloc() function is a replacement for the rather vaguely
+named scsi_register() function in most situations. The scsi_register()
+and scsi_unregister() functions remain to support legacy LLDs that use
+the passive initialization model.
+
+
+Hotplug initialization model
+============================
+In this model an LLD controls when SCSI hosts are introduced and removed
+from the SCSI subsystem. Hosts can be introduced as early as driver
+initialization and removed as late as driver shutdown. Typically a driver
+will respond to a sysfs probe() callback that indicates an HBA has been
+detected. After confirming that the new device is one that the LLD wants
+to control, the LLD will initialize the HBA and then register a new host
+with the SCSI mid level.
+
+During LLD initialization the driver should register itself with the
+appropriate IO bus on which it expects to find HBA(s) (e.g. the PCI bus).
+This can probably be done via sysfs. Any driver parameters (especially
+those that are writable after the driver is loaded) could also be
+registered with sysfs at this point. The SCSI mid level first becomes
+aware of an LLD when that LLD registers its first HBA.
+
+At some later time, the LLD becomes aware of an HBA and what follows
+is a typical sequence of calls between the LLD and the mid level.
+This example shows the mid level scanning the newly introduced HBA for 3
+scsi devices of which only the first 2 respond:
+
+ HBA PROBE: assume 2 SCSI devices found in scan
+LLD mid level LLD
+===-------------------=========--------------------===------
+scsi_host_alloc() -->
+scsi_add_host() ---->
+scsi_scan_host() -------+
+ |
+ slave_alloc()
+ slave_configure() --> scsi_adjust_queue_depth()
+ |
+ slave_alloc()
+ slave_configure()
+ |
+ slave_alloc() ***
+ slave_destroy() ***
+------------------------------------------------------------
+
+If the LLD wants to adjust the default queue settings, it can invoke
+scsi_adjust_queue_depth() in its slave_configure() routine.
+
+*** For scsi devices that the mid level tries to scan but do not
+ respond, a slave_alloc(), slave_destroy() pair is called.
+
+When an HBA is being removed it could be as part of an orderly shutdown
+associated with the LLD module being unloaded (e.g. with the "rmmod"
+command) or in response to a "hot unplug" indicated by sysfs()'s
+remove() callback being invoked. In either case, the sequence is the
+same:
+
+ HBA REMOVE: assume 2 SCSI devices attached
+LLD mid level LLD
+===----------------------=========-----------------===------
+scsi_remove_host() ---------+
+ |
+ slave_destroy()
+ slave_destroy()
+scsi_host_put()
+------------------------------------------------------------
+
+It may be useful for a LLD to keep track of struct Scsi_Host instances
+(a pointer is returned by scsi_host_alloc()). Such instances are "owned"
+by the mid-level. struct Scsi_Host instances are freed from
+scsi_host_put() when the reference count hits zero.
+
+Hot unplugging an HBA that controls a disk which is processing SCSI
+commands on a mounted file system is an interesting situation. Reference
+counting logic is being introduced into the mid level to cope with many
+of the issues involved. See the section on reference counting below.
+
+
+The hotplug concept may be extended to SCSI devices. Currently, when an
+HBA is added, the scsi_scan_host() function causes a scan for SCSI devices
+attached to the HBA's SCSI transport. On newer SCSI transports the HBA
+may become aware of a new SCSI device _after_ the scan has completed.
+An LLD can use this sequence to make the mid level aware of a SCSI device:
+
+ SCSI DEVICE hotplug
+LLD mid level LLD
+===-------------------=========--------------------===------
+scsi_add_device() ------+
+ |
+ slave_alloc()
+ slave_configure() [--> scsi_adjust_queue_depth()]
+------------------------------------------------------------
+
+In a similar fashion, an LLD may become aware that a SCSI device has been
+removed (unplugged) or the connection to it has been interrupted. Some
+existing SCSI transports (e.g. SPI) may not become aware that a SCSI
+device has been removed until a subsequent SCSI command fails which will
+probably cause that device to be set offline by the mid level. An LLD that
+detects the removal of a SCSI device can instigate its removal from
+upper layers with this sequence:
+
+ SCSI DEVICE hot unplug
+LLD mid level LLD
+===----------------------=========-----------------===------
+scsi_remove_device() -------+
+ |
+ slave_destroy()
+------------------------------------------------------------
+
+It may be useful for an LLD to keep track of struct scsi_device instances
+(a pointer is passed as the parameter to slave_alloc() and
+slave_configure() callbacks). Such instances are "owned" by the mid-level.
+struct scsi_device instances are freed after slave_destroy().
+
+
+Passive initialization model
+============================
+These older LLDs include a file called "scsi_module.c" [yes the ".c" is a
+little surprising] in their source code. For that file to work an
+instance of struct scsi_host_template with the name "driver_template"
+needs to be defined. Here is a typical code sequence used in this model:
+ static struct scsi_host_template driver_template = {
+ ...
+ };
+ #include "scsi_module.c"
+
+The scsi_module.c file contains two functions:
+ - init_this_scsi_driver() which is executed when the LLD is
+ initialized (i.e. boot time or module load time)
+ - exit_this_scsi_driver() which is executed when the LLD is shut
+ down (i.e. module unload time)
+Note: since these functions are tagged with __init and __exit qualifiers
+an LLD should not call them explicitly (since the kernel does that).
+
+Here is an example of an initialization sequence when two hosts are
+detected (so detect() returns 2) and the SCSI bus scan on each host
+finds 1 SCSI device (and a second device does not respond).
+
+LLD mid level LLD
+===----------------------=========-----------------===------
+init_this_scsi_driver() ----+
+ |
+ detect() -----------------+
+ | |
+ | scsi_register()
+ | scsi_register()
+ |
+ slave_alloc()
+ slave_configure() --> scsi_adjust_queue_depth()
+ slave_alloc() ***
+ slave_destroy() ***
+ |
+ slave_alloc()
+ slave_configure()
+ slave_alloc() ***
+ slave_destroy() ***
+------------------------------------------------------------
+
+The mid level invokes scsi_adjust_queue_depth() with tagged queuing off and
+"cmd_per_lun" for that host as the queue length. These settings can be
+overridden by a slave_configure() supplied by the LLD.
+
+*** For scsi devices that the mid level tries to scan but do not
+ respond, a slave_alloc(), slave_destroy() pair is called.
+
+Here is an LLD shutdown sequence:
+
+LLD mid level LLD
+===----------------------=========-----------------===------
+exit_this_scsi_driver() ----+
+ |
+ slave_destroy()
+ release() --> scsi_unregister()
+ |
+ slave_destroy()
+ release() --> scsi_unregister()
+------------------------------------------------------------
+
+An LLD need not define slave_destroy() (i.e. it is optional).
+
+The shortcoming of the "passive initialization model" is that host
+registration and de-registration are (typically) tied to LLD initialization
+and shutdown. Once the LLD is initialized then a new host that appears
+(e.g. via hotplugging) cannot easily be added without a redundant
+driver shutdown and re-initialization. It may be possible to write an LLD
+that uses both initialization models.
+
+
+Reference Counting
+==================
+The Scsi_Host structure has had reference counting infrastructure added.
+This effectively spreads the ownership of struct Scsi_Host instances
+across the various SCSI layers which use them. Previously such instances
+were exclusively owned by the mid level. LLDs would not usually need to
+directly manipulate these reference counts but there may be some cases
+where they do.
+
+There are 3 reference counting functions of interest associated with
+struct Scsi_Host:
+ - scsi_host_alloc(): returns a pointer to new instance of struct
+ Scsi_Host which has its reference count ^^ set to 1
+ - scsi_host_get(): adds 1 to the reference count of the given instance
+ - scsi_host_put(): decrements 1 from the reference count of the given
+ instance. If the reference count reaches 0 then the given instance
+ is freed
+
+The Scsi_device structure has had reference counting infrastructure added.
+This effectively spreads the ownership of struct Scsi_device instances
+across the various SCSI layers which use them. Previously such instances
+were exclusively owned by the mid level. See the access functions declared
+towards the end of include/scsi/scsi_device.h . If an LLD wants to keep
+a copy of a pointer to a Scsi_device instance it should use scsi_device_get()
+to bump its reference count. When it is finished with the pointer it can
+use scsi_device_put() to decrement its reference count (and potentially
+delete it).
+
+^^ struct Scsi_Host actually has 2 reference counts which are manipulated
+in parallel by these functions.
+
+
+Conventions
+===========
+First, Linus Torvalds's thoughts on C coding style can be found in the
+Documentation/CodingStyle file.
+
+Next, there is a movement to "outlaw" typedefs introducing synonyms for
+struct tags. Both can be still found in the SCSI subsystem, but
+the typedefs have been moved to a single file, scsi_typedefs.h to
+make their future removal easier, for example:
+"typedef struct scsi_cmnd Scsi_Cmnd;"
+
+Also, most C99 enhancements are encouraged to the extent they are supported
+by the relevant gcc compilers. So C99 style structure and array
+initializers are encouraged where appropriate. Don't go too far,
+VLAs are not properly supported yet. An exception to this is the use of
+"//" style comments; /*...*/ comments are still preferred in Linux.
+
+Well written, tested and documented code, need not be re-formatted to
+comply with the above conventions. For example, the aic7xxx driver
+comes to Linux from FreeBSD and Adaptec's own labs. No doubt FreeBSD
+and Adaptec have their own coding conventions.
+
+
+Mid level supplied functions
+============================
+These functions are supplied by the SCSI mid level for use by LLDs.
+The names (i.e. entry points) of these functions are exported
+so an LLD that is a module can access them. The kernel will
+arrange for the SCSI mid level to be loaded and initialized before any LLD
+is initialized. The functions below are listed alphabetically and their
+names all start with "scsi_".
+
+Summary:
+ scsi_activate_tcq - turn on tag command queueing
+ scsi_add_device - creates new scsi device (lu) instance
+ scsi_add_host - perform sysfs registration and set up transport class
+ scsi_adjust_queue_depth - change the queue depth on a SCSI device
+ scsi_bios_ptable - return copy of block device's partition table
+ scsi_block_requests - prevent further commands being queued to given host
+ scsi_deactivate_tcq - turn off tag command queueing
+ scsi_host_alloc - return a new scsi_host instance whose refcount==1
+ scsi_host_get - increments Scsi_Host instance's refcount
+ scsi_host_put - decrements Scsi_Host instance's refcount (free if 0)
+ scsi_partsize - parse partition table into cylinders, heads + sectors
+ scsi_register - create and register a scsi host adapter instance.
+ scsi_remove_device - detach and remove a SCSI device
+ scsi_remove_host - detach and remove all SCSI devices owned by host
+ scsi_report_bus_reset - report scsi _bus_ reset observed
+ scsi_scan_host - scan SCSI bus
+ scsi_track_queue_full - track successive QUEUE_FULL events
+ scsi_unblock_requests - allow further commands to be queued to given host
+ scsi_unregister - [calls scsi_host_put()]
+
+
+Details:
+
+/**
+ * scsi_activate_tcq - turn on tag command queueing ("ordered" task attribute)
+ * @sdev: device to turn on TCQ for
+ * @depth: queue depth
+ *
+ * Returns nothing
+ *
+ * Might block: no
+ *
+ * Notes: Eventually, it is hoped depth would be the maximum depth
+ * the device could cope with and the real queue depth
+ * would be adjustable from 0 to depth.
+ *
+ * Defined (inline) in: include/scsi/scsi_tcq.h
+ **/
+void scsi_activate_tcq(struct scsi_device *sdev, int depth)
+
+
+/**
+ * scsi_add_device - creates new scsi device (lu) instance
+ * @shost: pointer to scsi host instance
+ * @channel: channel number (rarely other than 0)
+ * @id: target id number
+ * @lun: logical unit number
+ *
+ * Returns pointer to new struct scsi_device instance or
+ * ERR_PTR(-ENODEV) (or some other bent pointer) if something is
+ * wrong (e.g. no lu responds at given address)
+ *
+ * Might block: yes
+ *
+ * Notes: This call is usually performed internally during a scsi
+ * bus scan when an HBA is added (i.e. scsi_scan_host()). So it
+ * should only be called if the HBA becomes aware of a new scsi
+ * device (lu) after scsi_scan_host() has completed. If successful
+ * this call can lead to slave_alloc() and slave_configure() callbacks
+ * into the LLD.
+ *
+ * Defined in: drivers/scsi/scsi_scan.c
+ **/
+struct scsi_device * scsi_add_device(struct Scsi_Host *shost,
+ unsigned int channel,
+ unsigned int id, unsigned int lun)
+
+
+/**
+ * scsi_add_host - perform sysfs registration and set up transport class
+ * @shost: pointer to scsi host instance
+ * @dev: pointer to struct device of type scsi class
+ *
+ * Returns 0 on success, negative errno of failure (e.g. -ENOMEM)
+ *
+ * Might block: no
+ *
+ * Notes: Only required in "hotplug initialization model" after a
+ * successful call to scsi_host_alloc(). This function does not
+ * scan the bus; this can be done by calling scsi_scan_host() or
+ * in some other transport-specific way. The LLD must set up
+ * the transport template before calling this function and may only
+ * access the transport class data after this function has been called.
+ *
+ * Defined in: drivers/scsi/hosts.c
+ **/
+int scsi_add_host(struct Scsi_Host *shost, struct device * dev)
+
+
+/**
+ * scsi_adjust_queue_depth - allow LLD to change queue depth on a SCSI device
+ * @sdev: pointer to SCSI device to change queue depth on
+ * @tagged: 0 - no tagged queuing
+ * MSG_SIMPLE_TAG - simple tagged queuing
+ * MSG_ORDERED_TAG - ordered tagged queuing
+ * @tags Number of tags allowed if tagged queuing enabled,
+ * or number of commands the LLD can queue up
+ * in non-tagged mode (as per cmd_per_lun).
+ *
+ * Returns nothing
+ *
+ * Might block: no
+ *
+ * Notes: Can be invoked any time on a SCSI device controlled by this
+ * LLD. [Specifically during and after slave_configure() and prior to
+ * slave_destroy().] Can safely be invoked from interrupt code. Actual
+ * queue depth change may be delayed until the next command is being
+ * processed. See also scsi_activate_tcq() and scsi_deactivate_tcq().
+ *
+ * Defined in: drivers/scsi/scsi.c [see source code for more notes]
+ *
+ **/
+void scsi_adjust_queue_depth(struct scsi_device * sdev, int tagged,
+ int tags)
+
+
+/**
+ * scsi_bios_ptable - return copy of block device's partition table
+ * @dev: pointer to block device
+ *
+ * Returns pointer to partition table, or NULL for failure
+ *
+ * Might block: yes
+ *
+ * Notes: Caller owns memory returned (free with kfree() )
+ *
+ * Defined in: drivers/scsi/scsicam.c
+ **/
+unsigned char *scsi_bios_ptable(struct block_device *dev)
+
+
+/**
+ * scsi_block_requests - prevent further commands being queued to given host
+ *
+ * @shost: pointer to host to block commands on
+ *
+ * Returns nothing
+ *
+ * Might block: no
+ *
+ * Notes: There is no timer nor any other means by which the requests
+ * get unblocked other than the LLD calling scsi_unblock_requests().
+ *
+ * Defined in: drivers/scsi/scsi_lib.c
+**/
+void scsi_block_requests(struct Scsi_Host * shost)
+
+
+/**
+ * scsi_deactivate_tcq - turn off tag command queueing
+ * @sdev: device to turn off TCQ for
+ * @depth: queue depth (stored in sdev)
+ *
+ * Returns nothing
+ *
+ * Might block: no
+ *
+ * Defined (inline) in: include/scsi/scsi_tcq.h
+ **/
+void scsi_deactivate_tcq(struct scsi_device *sdev, int depth)
+
+
+/**
+ * scsi_host_alloc - create a scsi host adapter instance and perform basic
+ * initialization.
+ * @sht: pointer to scsi host template
+ * @privsize: extra bytes to allocate in hostdata array (which is the
+ * last member of the returned Scsi_Host instance)
+ *
+ * Returns pointer to new Scsi_Host instance or NULL on failure
+ *
+ * Might block: yes
+ *
+ * Notes: When this call returns to the LLD, the SCSI bus scan on
+ * this host has _not_ yet been done.
+ * The hostdata array (by default zero length) is a per host scratch
+ * area for the LLD's exclusive use.
+ * Both associated refcounting objects have their refcount set to 1.
+ * Full registration (in sysfs) and a bus scan are performed later when
+ * scsi_add_host() and scsi_scan_host() are called.
+ *
+ * Defined in: drivers/scsi/hosts.c .
+ **/
+struct Scsi_Host * scsi_host_alloc(struct scsi_host_template * sht,
+ int privsize)
+
+
+/**
+ * scsi_host_get - increment Scsi_Host instance refcount
+ * @shost: pointer to struct Scsi_Host instance
+ *
+ * Returns nothing
+ *
+ * Might block: currently may block but may be changed to not block
+ *
+ * Notes: Actually increments the counts in two sub-objects
+ *
+ * Defined in: drivers/scsi/hosts.c
+ **/
+void scsi_host_get(struct Scsi_Host *shost)
+
+
+/**
+ * scsi_host_put - decrement Scsi_Host instance refcount, free if 0
+ * @shost: pointer to struct Scsi_Host instance
+ *
+ * Returns nothing
+ *
+ * Might block: currently may block but may be changed to not block
+ *
+ * Notes: Actually decrements the counts in two sub-objects. If the
+ * latter refcount reaches 0, the Scsi_Host instance is freed.
+ * The LLD need not worry exactly when the Scsi_Host instance is
+ * freed, it just shouldn't access the instance after it has balanced
+ * out its refcount usage.
+ *
+ * Defined in: drivers/scsi/hosts.c
+ **/
+void scsi_host_put(struct Scsi_Host *shost)
+
+
+/**
+ * scsi_partsize - parse partition table into cylinders, heads + sectors
+ * @buf: pointer to partition table
+ * @capacity: size of (total) disk in 512 byte sectors
+ * @cyls: outputs number of cylinders calculated via this pointer
+ * @hds: outputs number of heads calculated via this pointer
+ * @secs: outputs number of sectors calculated via this pointer
+ *
+ * Returns 0 on success, -1 on failure
+ *
+ * Might block: no
+ *
+ * Notes: Caller owns memory returned (free with kfree() )
+ *
+ * Defined in: drivers/scsi/scsicam.c
+ **/
+int scsi_partsize(unsigned char *buf, unsigned long capacity,
+ unsigned int *cyls, unsigned int *hds, unsigned int *secs)
+
+
+/**
+ * scsi_register - create and register a scsi host adapter instance.
+ * @sht: pointer to scsi host template
+ * @privsize: extra bytes to allocate in hostdata array (which is the
+ * last member of the returned Scsi_Host instance)
+ *
+ * Returns pointer to new Scsi_Host instance or NULL on failure
+ *
+ * Might block: yes
+ *
+ * Notes: When this call returns to the LLD, the SCSI bus scan on
+ * this host has _not_ yet been done.
+ * The hostdata array (by default zero length) is a per host scratch
+ * area for the LLD.
+ *
+ * Defined in: drivers/scsi/hosts.c .
+ **/
+struct Scsi_Host * scsi_register(struct scsi_host_template * sht,
+ int privsize)
+
+
+/**
+ * scsi_remove_device - detach and remove a SCSI device
+ * @sdev: a pointer to a scsi device instance
+ *
+ * Returns value: 0 on success, -EINVAL if device not attached
+ *
+ * Might block: yes
+ *
+ * Notes: If an LLD becomes aware that a scsi device (lu) has
+ * been removed but its host is still present then it can request
+ * the removal of that scsi device. If successful this call will
+ * lead to the slave_destroy() callback being invoked. sdev is an
+ * invalid pointer after this call.
+ *
+ * Defined in: drivers/scsi/scsi_sysfs.c .
+ **/
+int scsi_remove_device(struct scsi_device *sdev)
+
+
+/**
+ * scsi_remove_host - detach and remove all SCSI devices owned by host
+ * @shost: a pointer to a scsi host instance
+ *
+ * Returns value: 0 on success, 1 on failure (e.g. LLD busy ??)
+ *
+ * Might block: yes
+ *
+ * Notes: Should only be invoked if the "hotplug initialization
+ * model" is being used. It should be called _prior_ to
+ * scsi_unregister().
+ *
+ * Defined in: drivers/scsi/hosts.c .
+ **/
+int scsi_remove_host(struct Scsi_Host *shost)
+
+
+/**
+ * scsi_report_bus_reset - report scsi _bus_ reset observed
+ * @shost: a pointer to a scsi host involved
+ * @channel: channel (within) host on which scsi bus reset occurred
+ *
+ * Returns nothing
+ *
+ * Might block: no
+ *
+ * Notes: This only needs to be called if the reset is one which
+ * originates from an unknown location. Resets originated by the
+ * mid level itself don't need to call this, but there should be
+ * no harm. The main purpose of this is to make sure that a
+ * CHECK_CONDITION is properly treated.
+ *
+ * Defined in: drivers/scsi/scsi_error.c .
+ **/
+void scsi_report_bus_reset(struct Scsi_Host * shost, int channel)
+
+
+/**
+ * scsi_scan_host - scan SCSI bus
+ * @shost: a pointer to a scsi host instance
+ *
+ * Might block: yes
+ *
+ * Notes: Should be called after scsi_add_host()
+ *
+ * Defined in: drivers/scsi/scsi_scan.c
+ **/
+void scsi_scan_host(struct Scsi_Host *shost)
+
+
+/**
+ * scsi_track_queue_full - track successive QUEUE_FULL events on given
+ * device to determine if and when there is a need
+ * to adjust the queue depth on the device.
+ * @sdev: pointer to SCSI device instance
+ * @depth: Current number of outstanding SCSI commands on this device,
+ * not counting the one returned as QUEUE_FULL.
+ *
+ * Returns 0 - no change needed
+ * >0 - adjust queue depth to this new depth
+ * -1 - drop back to untagged operation using host->cmd_per_lun
+ * as the untagged command depth
+ *
+ * Might block: no
+ *
+ * Notes: LLDs may call this at any time and we will do "The Right
+ * Thing"; interrupt context safe.
+ *
+ * Defined in: drivers/scsi/scsi.c .
+ **/
+int scsi_track_queue_full(struct scsi_device *sdev, int depth)
+
+
+/**
+ * scsi_unblock_requests - allow further commands to be queued to given host
+ *
+ * @shost: pointer to host to unblock commands on
+ *
+ * Returns nothing
+ *
+ * Might block: no
+ *
+ * Defined in: drivers/scsi/scsi_lib.c .
+**/
+void scsi_unblock_requests(struct Scsi_Host * shost)
+
+
+/**
+ * scsi_unregister - unregister and free memory used by host instance
+ * @shp: pointer to scsi host instance to unregister.
+ *
+ * Returns nothing
+ *
+ * Might block: no
+ *
+ * Notes: Should not be invoked if the "hotplug initialization
+ * model" is being used. Called internally by exit_this_scsi_driver()
+ * in the "passive initialization model". Hence a LLD has no need to
+ * call this function directly.
+ *
+ * Defined in: drivers/scsi/hosts.c .
+ **/
+void scsi_unregister(struct Scsi_Host * shp)
+
+
+
+
+Interface Functions
+===================
+Interface functions are supplied (defined) by LLDs and their function
+pointers are placed in an instance of struct scsi_host_template which
+is passed to scsi_host_alloc() [or scsi_register() / init_this_scsi_driver()].
+Some are mandatory. Interface functions should be declared static. The
+accepted convention is that driver "xyz" will declare its slave_configure()
+function as:
+ static int xyz_slave_configure(struct scsi_device * sdev);
+and so forth for all interface functions listed below.
+
+A pointer to this function should be placed in the 'slave_configure' member
+of a "struct scsi_host_template" instance. A pointer to such an instance
+should be passed to the mid level's scsi_host_alloc() [or scsi_register() /
+init_this_scsi_driver()].
+
+The interface functions are also described in the include/scsi/scsi_host.h
+file immediately above their definition point in "struct scsi_host_template".
+In some cases more detail is given in scsi_host.h than below.
+
+The interface functions are listed below in alphabetical order.
+
+Summary:
+ bios_param - fetch head, sector, cylinder info for a disk
+ detect - detects HBAs this driver wants to control
+ eh_timed_out - notify the host that a command timer expired
+ eh_abort_handler - abort given command
+ eh_bus_reset_handler - issue SCSI bus reset
+ eh_device_reset_handler - issue SCSI device reset
+ eh_host_reset_handler - reset host (host bus adapter)
+ info - supply information about given host
+ ioctl - driver can respond to ioctls
+ proc_info - supports /proc/scsi/{driver_name}/{host_no}
+ queuecommand - queue scsi command, invoke 'done' on completion
+ release - release all resources associated with given host
+ slave_alloc - prior to any commands being sent to a new device
+ slave_configure - driver fine tuning for given device after attach
+ slave_destroy - given device is about to be shut down
+
+
+Details:
+
+/**
+ * bios_param - fetch head, sector, cylinder info for a disk
+ * @sdev: pointer to scsi device context (defined in
+ * include/scsi/scsi_device.h)
+ * @bdev: pointer to block device context (defined in fs.h)
+ * @capacity: device size (in 512 byte sectors)
+ * @params: three element array to place output:
+ * params[0] number of heads (max 255)
+ * params[1] number of sectors (max 63)
+ * params[2] number of cylinders
+ *
+ * Return value is ignored
+ *
+ * Locks: none
+ *
+ * Calling context: process (sd)
+ *
+ * Notes: an arbitrary geometry (based on READ CAPACITY) is used
+ * if this function is not provided. The params array is
+ * pre-initialized with made up values just in case this function
+ * doesn't output anything.
+ *
+ * Optionally defined in: LLD
+ **/
+ int bios_param(struct scsi_device * sdev, struct block_device *bdev,
+ sector_t capacity, int params[3])
+
+
+/**
+ * detect - detects HBAs this driver wants to control
+ * @shtp: host template for this driver.
+ *
+ * Returns number of hosts this driver wants to control. 0 means no
+ * suitable hosts found.
+ *
+ * Locks: none held
+ *
+ * Calling context: process [invoked from init_this_scsi_driver()]
+ *
+ * Notes: First function called from the SCSI mid level on this
+ * driver. Upper level drivers (e.g. sd) may not (yet) be present.
+ * For each host found, this method should call scsi_register()
+ * [see hosts.c].
+ *
+ * Defined in: LLD (required if "passive initialization mode" is used,
+ * not invoked in "hotplug initialization mode")
+ **/
+ int detect(struct scsi_host_template * shtp)
+
+
+/**
+ * eh_timed_out - The timer for the command has just fired
+ * @scp: identifies command timing out
+ *
+ * Returns:
+ *
+ * EH_HANDLED: I fixed the error, please complete the command
+ * EH_RESET_TIMER: I need more time, reset the timer and
+ * begin counting again
+ * EH_NOT_HANDLED Begin normal error recovery
+ *
+ *
+ * Locks: None held
+ *
+ * Calling context: interrupt
+ *
+ * Notes: This is to give the LLD an opportunity to do local recovery.
+ * This recovery is limited to determining if the outstanding command
+ * will ever complete. You may not abort and restart the command from
+ * this callback.
+ *
+ * Optionally defined in: LLD
+ **/
+ int eh_timed_out(struct scsi_cmnd * scp)
+
+
+/**
+ * eh_abort_handler - abort command associated with scp
+ * @scp: identifies command to be aborted
+ *
+ * Returns SUCCESS if command aborted else FAILED
+ *
+ * Locks: None held
+ *
+ * Calling context: kernel thread
+ *
+ * Notes: Invoked from scsi_eh thread. No other commands will be
+ * queued on current host during eh.
+ *
+ * Optionally defined in: LLD
+ **/
+ int eh_abort_handler(struct scsi_cmnd * scp)
+
+
+/**
+ * eh_bus_reset_handler - issue SCSI bus reset
+ * @scp: SCSI bus that contains this device should be reset
+ *
+ * Returns SUCCESS if command aborted else FAILED
+ *
+ * Locks: None held
+ *
+ * Calling context: kernel thread
+ *
+ * Notes: Invoked from scsi_eh thread. No other commands will be
+ * queued on current host during eh.
+ *
+ * Optionally defined in: LLD
+ **/
+ int eh_bus_reset_handler(struct scsi_cmnd * scp)
+
+
+/**
+ * eh_device_reset_handler - issue SCSI device reset
+ * @scp: identifies SCSI device to be reset
+ *
+ * Returns SUCCESS if command aborted else FAILED
+ *
+ * Locks: None held
+ *
+ * Calling context: kernel thread
+ *
+ * Notes: Invoked from scsi_eh thread. No other commands will be
+ * queued on current host during eh.
+ *
+ * Optionally defined in: LLD
+ **/
+ int eh_device_reset_handler(struct scsi_cmnd * scp)
+
+
+/**
+ * eh_host_reset_handler - reset host (host bus adapter)
+ * @scp: SCSI host that contains this device should be reset
+ *
+ * Returns SUCCESS if command aborted else FAILED
+ *
+ * Locks: None held
+ *
+ * Calling context: kernel thread
+ *
+ * Notes: Invoked from scsi_eh thread. No other commands will be
+ * queued on current host during eh.
+ * With the default eh_strategy in place, if none of the _abort_,
+ * _device_reset_, _bus_reset_ or this eh handler function are
+ * defined (or they all return FAILED) then the device in question
+ * will be set offline whenever eh is invoked.
+ *
+ * Optionally defined in: LLD
+ **/
+ int eh_host_reset_handler(struct scsi_cmnd * scp)
+
+
+/**
+ * info - supply information about given host: driver name plus data
+ * to distinguish given host
+ * @shp: host to supply information about
+ *
+ * Return ASCII null terminated string. [This driver is assumed to
+ * manage the memory pointed to and maintain it, typically for the
+ * lifetime of this host.]
+ *
+ * Locks: none
+ *
+ * Calling context: process
+ *
+ * Notes: Often supplies PCI or ISA information such as IO addresses
+ * and interrupt numbers. If not supplied struct Scsi_Host::name used
+ * instead. It is assumed the returned information fits on one line
+ * (i.e. does not included embedded newlines).
+ * The SCSI_IOCTL_PROBE_HOST ioctl yields the string returned by this
+ * function (or struct Scsi_Host::name if this function is not
+ * available).
+ * In a similar manner, init_this_scsi_driver() outputs to the console
+ * each host's "info" (or name) for the driver it is registering.
+ * Also if proc_info() is not supplied, the output of this function
+ * is used instead.
+ *
+ * Optionally defined in: LLD
+ **/
+ const char * info(struct Scsi_Host * shp)
+
+
+/**
+ * ioctl - driver can respond to ioctls
+ * @sdp: device that ioctl was issued for
+ * @cmd: ioctl number
+ * @arg: pointer to read or write data from. Since it points to
+ * user space, should use appropriate kernel functions
+ * (e.g. copy_from_user() ). In the Unix style this argument
+ * can also be viewed as an unsigned long.
+ *
+ * Returns negative "errno" value when there is a problem. 0 or a
+ * positive value indicates success and is returned to the user space.
+ *
+ * Locks: none
+ *
+ * Calling context: process
+ *
+ * Notes: The SCSI subsystem uses a "trickle down" ioctl model.
+ * The user issues an ioctl() against an upper level driver
+ * (e.g. /dev/sdc) and if the upper level driver doesn't recognize
+ * the 'cmd' then it is passed to the SCSI mid level. If the SCSI
+ * mid level does not recognize it, then the LLD that controls
+ * the device receives the ioctl. According to recent Unix standards
+ * unsupported ioctl() 'cmd' numbers should return -ENOTTY.
+ *
+ * Optionally defined in: LLD
+ **/
+ int ioctl(struct scsi_device *sdp, int cmd, void *arg)
+
+
+/**
+ * proc_info - supports /proc/scsi/{driver_name}/{host_no}
+ * @buffer: anchor point to output to (0==writeto1_read0) or fetch from
+ * (1==writeto1_read0).
+ * @start: where "interesting" data is written to. Ignored when
+ * 1==writeto1_read0.
+ * @offset: offset within buffer 0==writeto1_read0 is actually
+ * interested in. Ignored when 1==writeto1_read0 .
+ * @length: maximum (or actual) extent of buffer
+ * @host_no: host number of interest (struct Scsi_Host::host_no)
+ * @writeto1_read0: 1 -> data coming from user space towards driver
+ * (e.g. "echo some_string > /proc/scsi/xyz/2")
+ * 0 -> user what data from this driver
+ * (e.g. "cat /proc/scsi/xyz/2")
+ *
+ * Returns length when 1==writeto1_read0. Otherwise number of chars
+ * output to buffer past offset.
+ *
+ * Locks: none held
+ *
+ * Calling context: process
+ *
+ * Notes: Driven from scsi_proc.c which interfaces to proc_fs. proc_fs
+ * support can now be configured out of the scsi subsystem.
+ *
+ * Optionally defined in: LLD
+ **/
+ int proc_info(char * buffer, char ** start, off_t offset,
+ int length, int host_no, int writeto1_read0)
+
+
+/**
+ * queuecommand - queue scsi command, invoke 'done' on completion
+ * @scp: pointer to scsi command object
+ * @done: function pointer to be invoked on completion
+ *
+ * Returns 0 on success.
+ *
+ * If there's a failure, return either:
+ *
+ * SCSI_MLQUEUE_DEVICE_BUSY if the device queue is full, or
+ * SCSI_MLQUEUE_HOST_BUSY if the entire host queue is full
+ *
+ * On both of these returns, the mid-layer will requeue the I/O
+ *
+ * - if the return is SCSI_MLQUEUE_DEVICE_BUSY, only that particular
+ * device will be paused, and it will be unpaused when a command to
+ * the device returns (or after a brief delay if there are no more
+ * outstanding commands to it). Commands to other devices continue
+ * to be processed normally.
+ *
+ * - if the return is SCSI_MLQUEUE_HOST_BUSY, all I/O to the host
+ * is paused and will be unpaused when any command returns from
+ * the host (or after a brief delay if there are no outstanding
+ * commands to the host).
+ *
+ * For compatibility with earlier versions of queuecommand, any
+ * other return value is treated the same as
+ * SCSI_MLQUEUE_HOST_BUSY.
+ *
+ * Other types of errors that are detected immediately may be
+ * flagged by setting scp->result to an appropriate value,
+ * invoking the 'done' callback, and then returning 0 from this
+ * function. If the command is not performed immediately (and the
+ * LLD is starting (or will start) the given command) then this
+ * function should place 0 in scp->result and return 0.
+ *
+ * Command ownership. If the driver returns zero, it owns the
+ * command and must take responsibility for ensuring the 'done'
+ * callback is executed. Note: the driver may call done before
+ * returning zero, but after it has called done, it may not
+ * return any value other than zero. If the driver makes a
+ * non-zero return, it must not execute the command's done
+ * callback at any time.
+ *
+ * Locks: struct Scsi_Host::host_lock held on entry (with "irqsave")
+ * and is expected to be held on return.
+ *
+ * Calling context: in interrupt (soft irq) or process context
+ *
+ * Notes: This function should be relatively fast. Normally it will
+ * not wait for IO to complete. Hence the 'done' callback is invoked
+ * (often directly from an interrupt service routine) some time after
+ * this function has returned. In some cases (e.g. pseudo adapter
+ * drivers that manufacture the response to a SCSI INQUIRY)
+ * the 'done' callback may be invoked before this function returns.
+ * If the 'done' callback is not invoked within a certain period
+ * the SCSI mid level will commence error processing.
+ * If a status of CHECK CONDITION is placed in "result" when the
+ * 'done' callback is invoked, then the LLD driver should
+ * perform autosense and fill in the struct scsi_cmnd::sense_buffer
+ * array. The scsi_cmnd::sense_buffer array is zeroed prior to
+ * the mid level queuing a command to an LLD.
+ *
+ * Defined in: LLD
+ **/
+ int queuecommand(struct scsi_cmnd * scp,
+ void (*done)(struct scsi_cmnd *))
+
+
+/**
+ * release - release all resources associated with given host
+ * @shp: host to be released.
+ *
+ * Return value ignored (could soon be a function returning void).
+ *
+ * Locks: none held
+ *
+ * Calling context: process
+ *
+ * Notes: Invoked from scsi_module.c's exit_this_scsi_driver().
+ * LLD's implementation of this function should call
+ * scsi_unregister(shp) prior to returning.
+ * Only needed for old-style host templates.
+ *
+ * Defined in: LLD (required in "passive initialization model",
+ * should not be defined in hotplug model)
+ **/
+ int release(struct Scsi_Host * shp)
+
+
+/**
+ * slave_alloc - prior to any commands being sent to a new device
+ * (i.e. just prior to scan) this call is made
+ * @sdp: pointer to new device (about to be scanned)
+ *
+ * Returns 0 if ok. Any other return is assumed to be an error and
+ * the device is ignored.
+ *
+ * Locks: none
+ *
+ * Calling context: process
+ *
+ * Notes: Allows the driver to allocate any resources for a device
+ * prior to its initial scan. The corresponding scsi device may not
+ * exist but the mid level is just about to scan for it (i.e. send
+ * and INQUIRY command plus ...). If a device is found then
+ * slave_configure() will be called while if a device is not found
+ * slave_destroy() is called.
+ * For more details see the include/scsi/scsi_host.h file.
+ *
+ * Optionally defined in: LLD
+ **/
+ int slave_alloc(struct scsi_device *sdp)
+
+
+/**
+ * slave_configure - driver fine tuning for given device just after it
+ * has been first scanned (i.e. it responded to an
+ * INQUIRY)
+ * @sdp: device that has just been attached
+ *
+ * Returns 0 if ok. Any other return is assumed to be an error and
+ * the device is taken offline. [offline devices will _not_ have
+ * slave_destroy() called on them so clean up resources.]
+ *
+ * Locks: none
+ *
+ * Calling context: process
+ *
+ * Notes: Allows the driver to inspect the response to the initial
+ * INQUIRY done by the scanning code and take appropriate action.
+ * For more details see the include/scsi/scsi_host.h file.
+ *
+ * Optionally defined in: LLD
+ **/
+ int slave_configure(struct scsi_device *sdp)
+
+
+/**
+ * slave_destroy - given device is about to be shut down. All
+ * activity has ceased on this device.
+ * @sdp: device that is about to be shut down
+ *
+ * Returns nothing
+ *
+ * Locks: none
+ *
+ * Calling context: process
+ *
+ * Notes: Mid level structures for given device are still in place
+ * but are about to be torn down. Any per device resources allocated
+ * by this driver for given device should be freed now. No further
+ * commands will be sent for this sdp instance. [However the device
+ * could be re-attached in the future in which case a new instance
+ * of struct scsi_device would be supplied by future slave_alloc()
+ * and slave_configure() calls.]
+ *
+ * Optionally defined in: LLD
+ **/
+ void slave_destroy(struct scsi_device *sdp)
+
+
+
+Data Structures
+===============
+struct scsi_host_template
+-------------------------
+There is one "struct scsi_host_template" instance per LLD ***. It is
+typically initialized as a file scope static in a driver's header file. That
+way members that are not explicitly initialized will be set to 0 or NULL.
+Member of interest:
+ name - name of driver (may contain spaces, please limit to
+ less than 80 characters)
+ proc_name - name used in "/proc/scsi/<proc_name>/<host_no>" and
+ by sysfs in one of its "drivers" directories. Hence
+ "proc_name" should only contain characters acceptable
+ to a Unix file name.
+ (*queuecommand)() - primary callback that the mid level uses to inject
+ SCSI commands into an LLD.
+The structure is defined and commented in include/scsi/scsi_host.h
+
+*** In extreme situations a single driver may have several instances
+ if it controls several different classes of hardware (e.g. an LLD
+ that handles both ISA and PCI cards and has a separate instance of
+ struct scsi_host_template for each class).
+
+struct Scsi_Host
+----------------
+There is one struct Scsi_Host instance per host (HBA) that an LLD
+controls. The struct Scsi_Host structure has many members in common
+with "struct scsi_host_template". When a new struct Scsi_Host instance
+is created (in scsi_host_alloc() in hosts.c) those common members are
+initialized from the driver's struct scsi_host_template instance. Members
+of interest:
+ host_no - system wide unique number that is used for identifying
+ this host. Issued in ascending order from 0.
+ can_queue - must be greater than 0; do not send more than can_queue
+ commands to the adapter.
+ this_id - scsi id of host (scsi initiator) or -1 if not known
+ sg_tablesize - maximum scatter gather elements allowed by host.
+ 0 implies scatter gather not supported by host
+ max_sectors - maximum number of sectors (usually 512 bytes) allowed
+ in a single SCSI command. The default value of 0 leads
+ to a setting of SCSI_DEFAULT_MAX_SECTORS (defined in
+ scsi_host.h) which is currently set to 1024. So for a
+ disk the maximum transfer size is 512 KB when max_sectors
+ is not defined. Note that this size may not be sufficient
+ for disk firmware uploads.
+ cmd_per_lun - maximum number of commands that can be queued on devices
+ controlled by the host. Overridden by LLD calls to
+ scsi_adjust_queue_depth().
+ unchecked_isa_dma - 1=>only use bottom 16 MB of ram (ISA DMA addressing
+ restriction), 0=>can use full 32 bit (or better) DMA
+ address space
+ use_clustering - 1=>SCSI commands in mid level's queue can be merged,
+ 0=>disallow SCSI command merging
+ hostt - pointer to driver's struct scsi_host_template from which
+ this struct Scsi_Host instance was spawned
+ hostt->proc_name - name of LLD. This is the driver name that sysfs uses
+ transportt - pointer to driver's struct scsi_transport_template instance
+ (if any). FC and SPI transports currently supported.
+ sh_list - a double linked list of pointers to all struct Scsi_Host
+ instances (currently ordered by ascending host_no)
+ my_devices - a double linked list of pointers to struct scsi_device
+ instances that belong to this host.
+ hostdata[0] - area reserved for LLD at end of struct Scsi_Host. Size
+ is set by the second argument (named 'xtr_bytes') to
+ scsi_host_alloc() or scsi_register().
+
+The scsi_host structure is defined in include/scsi/scsi_host.h
+
+struct scsi_device
+------------------
+Generally, there is one instance of this structure for each SCSI logical unit
+on a host. Scsi devices connected to a host are uniquely identified by a
+channel number, target id and logical unit number (lun).
+The structure is defined in include/scsi/scsi_device.h
+
+struct scsi_cmnd
+----------------
+Instances of this structure convey SCSI commands to the LLD and responses
+back to the mid level. The SCSI mid level will ensure that no more SCSI
+commands become queued against the LLD than are indicated by
+scsi_adjust_queue_depth() (or struct Scsi_Host::cmd_per_lun). There will
+be at least one instance of struct scsi_cmnd available for each SCSI device.
+Members of interest:
+ cmnd - array containing SCSI command
+ cmnd_len - length (in bytes) of SCSI command
+ sc_data_direction - direction of data transfer in data phase. See
+ "enum dma_data_direction" in include/linux/dma-mapping.h
+ request_bufflen - number of data bytes to transfer (0 if no data phase)
+ use_sg - ==0 -> no scatter gather list, hence transfer data
+ to/from request_buffer
+ - >0 -> scatter gather list (actually an array) in
+ request_buffer with use_sg elements
+ request_buffer - either contains data buffer or scatter gather list
+ depending on the setting of use_sg. Scatter gather
+ elements are defined by 'struct scatterlist' found
+ in include/asm/scatterlist.h .
+ done - function pointer that should be invoked by LLD when the
+ SCSI command is completed (successfully or otherwise).
+ Should only be called by an LLD if the LLD has accepted
+ the command (i.e. queuecommand() returned or will return
+ 0). The LLD may invoke 'done' prior to queuecommand()
+ finishing.
+ result - should be set by LLD prior to calling 'done'. A value
+ of 0 implies a successfully completed command (and all
+ data (if any) has been transferred to or from the SCSI
+ target device). 'result' is a 32 bit unsigned integer that
+ can be viewed as 4 related bytes. The SCSI status value is
+ in the LSB. See include/scsi/scsi.h status_byte(),
+ msg_byte(), host_byte() and driver_byte() macros and
+ related constants.
+ sense_buffer - an array (maximum size: SCSI_SENSE_BUFFERSIZE bytes) that
+ should be written when the SCSI status (LSB of 'result')
+ is set to CHECK_CONDITION (2). When CHECK_CONDITION is
+ set, if the top nibble of sense_buffer[0] has the value 7
+ then the mid level will assume the sense_buffer array
+ contains a valid SCSI sense buffer; otherwise the mid
+ level will issue a REQUEST_SENSE SCSI command to
+ retrieve the sense buffer. The latter strategy is error
+ prone in the presence of command queuing so the LLD should
+ always "auto-sense".
+ device - pointer to scsi_device object that this command is
+ associated with.
+ resid - an LLD should set this signed integer to the requested
+ transfer length (i.e. 'request_bufflen') less the number
+ of bytes that are actually transferred. 'resid' is
+ preset to 0 so an LLD can ignore it if it cannot detect
+ underruns (overruns should be rare). If possible an LLD
+ should set 'resid' prior to invoking 'done'. The most
+ interesting case is data transfers from a SCSI target
+ device device (i.e. READs) that underrun.
+ underflow - LLD should place (DID_ERROR << 16) in 'result' if
+ actual number of bytes transferred is less than this
+ figure. Not many LLDs implement this check and some that
+ do just output an error message to the log rather than
+ report a DID_ERROR. Better for an LLD to implement
+ 'resid'.
+
+The scsi_cmnd structure is defined in include/scsi/scsi_cmnd.h
+
+
+Locks
+=====
+Each struct Scsi_Host instance has a spin_lock called struct
+Scsi_Host::default_lock which is initialized in scsi_host_alloc() [found in
+hosts.c]. Within the same function the struct Scsi_Host::host_lock pointer
+is initialized to point at default_lock. Thereafter lock and unlock
+operations performed by the mid level use the struct Scsi_Host::host_lock
+pointer. Previously drivers could override the host_lock pointer but
+this is not allowed anymore.
+
+
+Autosense
+=========
+Autosense (or auto-sense) is defined in the SAM-2 document as "the
+automatic return of sense data to the application client coincident
+with the completion of a SCSI command" when a status of CHECK CONDITION
+occurs. LLDs should perform autosense. This should be done when the LLD
+detects a CHECK CONDITION status by either:
+ a) instructing the SCSI protocol (e.g. SCSI Parallel Interface (SPI))
+ to perform an extra data in phase on such responses
+ b) or, the LLD issuing a REQUEST SENSE command itself
+
+Either way, when a status of CHECK CONDITION is detected, the mid level
+decides whether the LLD has performed autosense by checking struct
+scsi_cmnd::sense_buffer[0] . If this byte has an upper nibble of 7 (or 0xf)
+then autosense is assumed to have taken place. If it has another value (and
+this byte is initialized to 0 before each command) then the mid level will
+issue a REQUEST SENSE command.
+
+In the presence of queued commands the "nexus" that maintains sense
+buffer data from the command that failed until a following REQUEST SENSE
+may get out of synchronization. This is why it is best for the LLD
+to perform autosense.
+
+
+Changes since lk 2.4 series
+===========================
+io_request_lock has been replaced by several finer grained locks. The lock
+relevant to LLDs is struct Scsi_Host::host_lock and there is
+one per SCSI host.
+
+The older error handling mechanism has been removed. This means the
+LLD interface functions abort() and reset() have been removed.
+The struct scsi_host_template::use_new_eh_code flag has been removed.
+
+In the 2.4 series the SCSI subsystem configuration descriptions were
+aggregated with the configuration descriptions from all other Linux
+subsystems in the Documentation/Configure.help file. In the 2.6 series,
+the SCSI subsystem now has its own (much smaller) drivers/scsi/Kconfig
+file that contains both configuration and help information.
+
+struct SHT has been renamed to struct scsi_host_template.
+
+Addition of the "hotplug initialization model" and many extra functions
+to support it.
+
+
+Credits
+=======
+The following people have contributed to this document:
+ Mike Anderson <andmike at us dot ibm dot com>
+ James Bottomley <James dot Bottomley at hansenpartnership dot com>
+ Patrick Mansfield <patmans at us dot ibm dot com>
+ Christoph Hellwig <hch at infradead dot org>
+ Doug Ledford <dledford at redhat dot com>
+ Andries Brouwer <Andries dot Brouwer at cwi dot nl>
+ Randy Dunlap <rdunlap at xenotime dot net>
+ Alan Stern <stern at rowland dot harvard dot edu>
+
+
+Douglas Gilbert
+dgilbert at interlog dot com
+21st September 2004
diff --git a/Documentation/scsi/st.txt b/Documentation/scsi/st.txt
new file mode 100644
index 0000000..4075260
--- /dev/null
+++ b/Documentation/scsi/st.txt
@@ -0,0 +1,509 @@
+This file contains brief information about the SCSI tape driver.
+The driver is currently maintained by Kai Mäkisara (email
+Kai.Makisara@kolumbus.fi)
+
+Last modified: Sun Feb 24 21:59:07 2008 by kai.makisara
+
+
+BASICS
+
+The driver is generic, i.e., it does not contain any code tailored
+to any specific tape drive. The tape parameters can be specified with
+one of the following three methods:
+
+1. Each user can specify the tape parameters he/she wants to use
+directly with ioctls. This is administratively a very simple and
+flexible method and applicable to single-user workstations. However,
+in a multiuser environment the next user finds the tape parameters in
+state the previous user left them.
+
+2. The system manager (root) can define default values for some tape
+parameters, like block size and density using the MTSETDRVBUFFER ioctl.
+These parameters can be programmed to come into effect either when a
+new tape is loaded into the drive or if writing begins at the
+beginning of the tape. The second method is applicable if the tape
+drive performs auto-detection of the tape format well (like some
+QIC-drives). The result is that any tape can be read, writing can be
+continued using existing format, and the default format is used if
+the tape is rewritten from the beginning (or a new tape is written
+for the first time). The first method is applicable if the drive
+does not perform auto-detection well enough and there is a single
+"sensible" mode for the device. An example is a DAT drive that is
+used only in variable block mode (I don't know if this is sensible
+or not :-).
+
+The user can override the parameters defined by the system
+manager. The changes persist until the defaults again come into
+effect.
+
+3. By default, up to four modes can be defined and selected using the minor
+number (bits 5 and 6). The number of modes can be changed by changing
+ST_NBR_MODE_BITS in st.h. Mode 0 corresponds to the defaults discussed
+above. Additional modes are dormant until they are defined by the
+system manager (root). When specification of a new mode is started,
+the configuration of mode 0 is used to provide a starting point for
+definition of the new mode.
+
+Using the modes allows the system manager to give the users choices
+over some of the buffering parameters not directly accessible to the
+users (buffered and asynchronous writes). The modes also allow choices
+between formats in multi-tape operations (the explicitly overridden
+parameters are reset when a new tape is loaded).
+
+If more than one mode is used, all modes should contain definitions
+for the same set of parameters.
+
+Many Unices contain internal tables that associate different modes to
+supported devices. The Linux SCSI tape driver does not contain such
+tables (and will not do that in future). Instead of that, a utility
+program can be made that fetches the inquiry data sent by the device,
+scans its database, and sets up the modes using the ioctls. Another
+alternative is to make a small script that uses mt to set the defaults
+tailored to the system.
+
+The driver supports fixed and variable block size (within buffer
+limits). Both the auto-rewind (minor equals device number) and
+non-rewind devices (minor is 128 + device number) are implemented.
+
+In variable block mode, the byte count in write() determines the size
+of the physical block on tape. When reading, the drive reads the next
+tape block and returns to the user the data if the read() byte count
+is at least the block size. Otherwise, error ENOMEM is returned.
+
+In fixed block mode, the data transfer between the drive and the
+driver is in multiples of the block size. The write() byte count must
+be a multiple of the block size. This is not required when reading but
+may be advisable for portability.
+
+Support is provided for changing the tape partition and partitioning
+of the tape with one or two partitions. By default support for
+partitioned tape is disabled for each driver and it can be enabled
+with the ioctl MTSETDRVBUFFER.
+
+By default the driver writes one filemark when the device is closed after
+writing and the last operation has been a write. Two filemarks can be
+optionally written. In both cases end of data is signified by
+returning zero bytes for two consecutive reads.
+
+If rewind, offline, bsf, or seek is done and previous tape operation was
+write, a filemark is written before moving tape.
+
+The compile options are defined in the file linux/drivers/scsi/st_options.h.
+
+4. If the open option O_NONBLOCK is used, open succeeds even if the
+drive is not ready. If O_NONBLOCK is not used, the driver waits for
+the drive to become ready. If this does not happen in ST_BLOCK_SECONDS
+seconds, open fails with the errno value EIO. With O_NONBLOCK the
+device can be opened for writing even if there is a write protected
+tape in the drive (commands trying to write something return error if
+attempted).
+
+
+MINOR NUMBERS
+
+The tape driver currently supports 128 drives by default. This number
+can be increased by editing st.h and recompiling the driver if
+necessary. The upper limit is 2^17 drives if 4 modes for each drive
+are used.
+
+The minor numbers consist of the following bit fields:
+
+dev_upper non-rew mode dev-lower
+ 20 - 8 7 6 5 4 0
+The non-rewind bit is always bit 7 (the uppermost bit in the lowermost
+byte). The bits defining the mode are below the non-rewind bit. The
+remaining bits define the tape device number. This numbering is
+backward compatible with the numbering used when the minor number was
+only 8 bits wide.
+
+
+SYSFS SUPPORT
+
+The driver creates the directory /sys/class/scsi_tape and populates it with
+directories corresponding to the existing tape devices. There are autorewind
+and non-rewind entries for each mode. The names are stxy and nstxy, where x
+is the tape number and y a character corresponding to the mode (none, l, m,
+a). For example, the directories for the first tape device are (assuming four
+modes): st0 nst0 st0l nst0l st0m nst0m st0a nst0a.
+
+Each directory contains the entries: default_blksize default_compression
+default_density defined dev device driver. The file 'defined' contains 1
+if the mode is defined and zero if not defined. The files 'default_*' contain
+the defaults set by the user. The value -1 means the default is not set. The
+file 'dev' contains the device numbers corresponding to this device. The links
+'device' and 'driver' point to the SCSI device and driver entries.
+
+Each directory also contains the entry 'options' which shows the currently
+enabled driver and mode options. The value in the file is a bit mask where the
+bit definitions are the same as those used with MTSETDRVBUFFER in setting the
+options.
+
+A link named 'tape' is made from the SCSI device directory to the class
+directory corresponding to the mode 0 auto-rewind device (e.g., st0).
+
+
+BSD AND SYS V SEMANTICS
+
+The user can choose between these two behaviours of the tape driver by
+defining the value of the symbol ST_SYSV. The semantics differ when a
+file being read is closed. The BSD semantics leaves the tape where it
+currently is whereas the SYS V semantics moves the tape past the next
+filemark unless the filemark has just been crossed.
+
+The default is BSD semantics.
+
+
+BUFFERING
+
+The driver tries to do transfers directly to/from user space. If this
+is not possible, a driver buffer allocated at run-time is used. If
+direct i/o is not possible for the whole transfer, the driver buffer
+is used (i.e., bounce buffers for individual pages are not
+used). Direct i/o can be impossible because of several reasons, e.g.:
+- one or more pages are at addresses not reachable by the HBA
+- the number of pages in the transfer exceeds the number of
+ scatter/gather segments permitted by the HBA
+- one or more pages can't be locked into memory (should not happen in
+ any reasonable situation)
+
+The size of the driver buffers is always at least one tape block. In fixed
+block mode, the minimum buffer size is defined (in 1024 byte units) by
+ST_FIXED_BUFFER_BLOCKS. With small block size this allows buffering of
+several blocks and using one SCSI read or write to transfer all of the
+blocks. Buffering of data across write calls in fixed block mode is
+allowed if ST_BUFFER_WRITES is non-zero and direct i/o is not used.
+Buffer allocation uses chunks of memory having sizes 2^n * (page
+size). Because of this the actual buffer size may be larger than the
+minimum allowable buffer size.
+
+NOTE that if direct i/o is used, the small writes are not buffered. This may
+cause a surprise when moving from 2.4. There small writes (e.g., tar without
+-b option) may have had good throughput but this is not true any more with
+2.6. Direct i/o can be turned off to solve this problem but a better solution
+is to use bigger write() byte counts (e.g., tar -b 64).
+
+Asynchronous writing. Writing the buffer contents to the tape is
+started and the write call returns immediately. The status is checked
+at the next tape operation. Asynchronous writes are not done with
+direct i/o and not in fixed block mode.
+
+Buffered writes and asynchronous writes may in some rare cases cause
+problems in multivolume operations if there is not enough space on the
+tape after the early-warning mark to flush the driver buffer.
+
+Read ahead for fixed block mode (ST_READ_AHEAD). Filling the buffer is
+attempted even if the user does not want to get all of the data at
+this read command. Should be disabled for those drives that don't like
+a filemark to truncate a read request or that don't like backspacing.
+
+Scatter/gather buffers (buffers that consist of chunks non-contiguous
+in the physical memory) are used if contiguous buffers can't be
+allocated. To support all SCSI adapters (including those not
+supporting scatter/gather), buffer allocation is using the following
+three kinds of chunks:
+1. The initial segment that is used for all SCSI adapters including
+those not supporting scatter/gather. The size of this buffer will be
+(PAGE_SIZE << ST_FIRST_ORDER) bytes if the system can give a chunk of
+this size (and it is not larger than the buffer size specified by
+ST_BUFFER_BLOCKS). If this size is not available, the driver halves
+the size and tries again until the size of one page. The default
+settings in st_options.h make the driver to try to allocate all of the
+buffer as one chunk.
+2. The scatter/gather segments to fill the specified buffer size are
+allocated so that as many segments as possible are used but the number
+of segments does not exceed ST_FIRST_SG.
+3. The remaining segments between ST_MAX_SG (or the module parameter
+max_sg_segs) and the number of segments used in phases 1 and 2
+are used to extend the buffer at run-time if this is necessary. The
+number of scatter/gather segments allowed for the SCSI adapter is not
+exceeded if it is smaller than the maximum number of scatter/gather
+segments specified. If the maximum number allowed for the SCSI adapter
+is smaller than the number of segments used in phases 1 and 2,
+extending the buffer will always fail.
+
+
+EOM BEHAVIOUR WHEN WRITING
+
+When the end of medium early warning is encountered, the current write
+is finished and the number of bytes is returned. The next write
+returns -1 and errno is set to ENOSPC. To enable writing a trailer,
+the next write is allowed to proceed and, if successful, the number of
+bytes is returned. After this, -1 and the number of bytes are
+alternately returned until the physical end of medium (or some other
+error) is encountered.
+
+
+MODULE PARAMETERS
+
+The buffer size, write threshold, and the maximum number of allocated buffers
+are configurable when the driver is loaded as a module. The keywords are:
+
+buffer_kbs=xxx the buffer size for fixed block mode is set
+ to xxx kilobytes
+write_threshold_kbs=xxx the write threshold in kilobytes set to xxx
+max_sg_segs=xxx the maximum number of scatter/gather
+ segments
+try_direct_io=x try direct transfer between user buffer and
+ tape drive if this is non-zero
+
+Note that if the buffer size is changed but the write threshold is not
+set, the write threshold is set to the new buffer size - 2 kB.
+
+
+BOOT TIME CONFIGURATION
+
+If the driver is compiled into the kernel, the same parameters can be
+also set using, e.g., the LILO command line. The preferred syntax is
+to use the same keyword used when loading as module but prepended
+with 'st.'. For instance, to set the maximum number of scatter/gather
+segments, the parameter 'st.max_sg_segs=xx' should be used (xx is the
+number of scatter/gather segments).
+
+For compatibility, the old syntax from early 2.5 and 2.4 kernel
+versions is supported. The same keywords can be used as when loading
+the driver as module. If several parameters are set, the keyword-value
+pairs are separated with a comma (no spaces allowed). A colon can be
+used instead of the equal mark. The definition is prepended by the
+string st=. Here is an example:
+
+ st=buffer_kbs:64,write_threshold_kbs:60
+
+The following syntax used by the old kernel versions is also supported:
+
+ st=aa[,bb[,dd]]
+
+where
+ aa is the buffer size for fixed block mode in 1024 byte units
+ bb is the write threshold in 1024 byte units
+ dd is the maximum number of scatter/gather segments
+
+
+IOCTLS
+
+The tape is positioned and the drive parameters are set with ioctls
+defined in mtio.h The tape control program 'mt' uses these ioctls. Try
+to find an mt that supports all of the Linux SCSI tape ioctls and
+opens the device for writing if the tape contents will be modified
+(look for a package mt-st* from the Linux ftp sites; the GNU mt does
+not open for writing for, e.g., erase).
+
+The supported ioctls are:
+
+The following use the structure mtop:
+
+MTFSF Space forward over count filemarks. Tape positioned after filemark.
+MTFSFM As above but tape positioned before filemark.
+MTBSF Space backward over count filemarks. Tape positioned before
+ filemark.
+MTBSFM As above but ape positioned after filemark.
+MTFSR Space forward over count records.
+MTBSR Space backward over count records.
+MTFSS Space forward over count setmarks.
+MTBSS Space backward over count setmarks.
+MTWEOF Write count filemarks.
+MTWSM Write count setmarks.
+MTREW Rewind tape.
+MTOFFL Set device off line (often rewind plus eject).
+MTNOP Do nothing except flush the buffers.
+MTRETEN Re-tension tape.
+MTEOM Space to end of recorded data.
+MTERASE Erase tape. If the argument is zero, the short erase command
+ is used. The long erase command is used with all other values
+ of the argument.
+MTSEEK Seek to tape block count. Uses Tandberg-compatible seek (QFA)
+ for SCSI-1 drives and SCSI-2 seek for SCSI-2 drives. The file and
+ block numbers in the status are not valid after a seek.
+MTSETBLK Set the drive block size. Setting to zero sets the drive into
+ variable block mode (if applicable).
+MTSETDENSITY Sets the drive density code to arg. See drive
+ documentation for available codes.
+MTLOCK and MTUNLOCK Explicitly lock/unlock the tape drive door.
+MTLOAD and MTUNLOAD Explicitly load and unload the tape. If the
+ command argument x is between MT_ST_HPLOADER_OFFSET + 1 and
+ MT_ST_HPLOADER_OFFSET + 6, the number x is used sent to the
+ drive with the command and it selects the tape slot to use of
+ HP C1553A changer.
+MTCOMPRESSION Sets compressing or uncompressing drive mode using the
+ SCSI mode page 15. Note that some drives other methods for
+ control of compression. Some drives (like the Exabytes) use
+ density codes for compression control. Some drives use another
+ mode page but this page has not been implemented in the
+ driver. Some drives without compression capability will accept
+ any compression mode without error.
+MTSETPART Moves the tape to the partition given by the argument at the
+ next tape operation. The block at which the tape is positioned
+ is the block where the tape was previously positioned in the
+ new active partition unless the next tape operation is
+ MTSEEK. In this case the tape is moved directly to the block
+ specified by MTSEEK. MTSETPART is inactive unless
+ MT_ST_CAN_PARTITIONS set.
+MTMKPART Formats the tape with one partition (argument zero) or two
+ partitions (the argument gives in megabytes the size of
+ partition 1 that is physically the first partition of the
+ tape). The drive has to support partitions with size specified
+ by the initiator. Inactive unless MT_ST_CAN_PARTITIONS set.
+MTSETDRVBUFFER
+ Is used for several purposes. The command is obtained from count
+ with mask MT_SET_OPTIONS, the low order bits are used as argument.
+ This command is only allowed for the superuser (root). The
+ subcommands are:
+ 0
+ The drive buffer option is set to the argument. Zero means
+ no buffering.
+ MT_ST_BOOLEANS
+ Sets the buffering options. The bits are the new states
+ (enabled/disabled) the following options (in the
+ parenthesis is specified whether the option is global or
+ can be specified differently for each mode):
+ MT_ST_BUFFER_WRITES write buffering (mode)
+ MT_ST_ASYNC_WRITES asynchronous writes (mode)
+ MT_ST_READ_AHEAD read ahead (mode)
+ MT_ST_TWO_FM writing of two filemarks (global)
+ MT_ST_FAST_EOM using the SCSI spacing to EOD (global)
+ MT_ST_AUTO_LOCK automatic locking of the drive door (global)
+ MT_ST_DEF_WRITES the defaults are meant only for writes (mode)
+ MT_ST_CAN_BSR backspacing over more than one records can
+ be used for repositioning the tape (global)
+ MT_ST_NO_BLKLIMS the driver does not ask the block limits
+ from the drive (block size can be changed only to
+ variable) (global)
+ MT_ST_CAN_PARTITIONS enables support for partitioned
+ tapes (global)
+ MT_ST_SCSI2LOGICAL the logical block number is used in
+ the MTSEEK and MTIOCPOS for SCSI-2 drives instead of
+ the device dependent address. It is recommended to set
+ this flag unless there are tapes using the device
+ dependent (from the old times) (global)
+ MT_ST_SYSV sets the SYSV semantics (mode)
+ MT_ST_NOWAIT enables immediate mode (i.e., don't wait for
+ the command to finish) for some commands (e.g., rewind)
+ MT_ST_SILI enables setting the SILI bit in SCSI commands when
+ reading in variable block mode to enhance performance when
+ reading blocks shorter than the byte count; set this only
+ if you are sure that the drive supports SILI and the HBA
+ correctly returns transfer residuals
+ MT_ST_DEBUGGING debugging (global; debugging must be
+ compiled into the driver)
+ MT_ST_SETBOOLEANS
+ MT_ST_CLEARBOOLEANS
+ Sets or clears the option bits.
+ MT_ST_WRITE_THRESHOLD
+ Sets the write threshold for this device to kilobytes
+ specified by the lowest bits.
+ MT_ST_DEF_BLKSIZE
+ Defines the default block size set automatically. Value
+ 0xffffff means that the default is not used any more.
+ MT_ST_DEF_DENSITY
+ MT_ST_DEF_DRVBUFFER
+ Used to set or clear the density (8 bits), and drive buffer
+ state (3 bits). If the value is MT_ST_CLEAR_DEFAULT
+ (0xfffff) the default will not be used any more. Otherwise
+ the lowermost bits of the value contain the new value of
+ the parameter.
+ MT_ST_DEF_COMPRESSION
+ The compression default will not be used if the value of
+ the lowermost byte is 0xff. Otherwise the lowermost bit
+ contains the new default. If the bits 8-15 are set to a
+ non-zero number, and this number is not 0xff, the number is
+ used as the compression algorithm. The value
+ MT_ST_CLEAR_DEFAULT can be used to clear the compression
+ default.
+ MT_ST_SET_TIMEOUT
+ Set the normal timeout in seconds for this device. The
+ default is 900 seconds (15 minutes). The timeout should be
+ long enough for the retries done by the device while
+ reading/writing.
+ MT_ST_SET_LONG_TIMEOUT
+ Set the long timeout that is used for operations that are
+ known to take a long time. The default is 14000 seconds
+ (3.9 hours). For erase this value is further multiplied by
+ eight.
+ MT_ST_SET_CLN
+ Set the cleaning request interpretation parameters using
+ the lowest 24 bits of the argument. The driver can set the
+ generic status bit GMT_CLN if a cleaning request bit pattern
+ is found from the extended sense data. Many drives set one or
+ more bits in the extended sense data when the drive needs
+ cleaning. The bits are device-dependent. The driver is
+ given the number of the sense data byte (the lowest eight
+ bits of the argument; must be >= 18 (values 1 - 17
+ reserved) and <= the maximum requested sense data sixe),
+ a mask to select the relevant bits (the bits 9-16), and the
+ bit pattern (bits 17-23). If the bit pattern is zero, one
+ or more bits under the mask indicate cleaning request. If
+ the pattern is non-zero, the pattern must match the masked
+ sense data byte.
+
+ (The cleaning bit is set if the additional sense code and
+ qualifier 00h 17h are seen regardless of the setting of
+ MT_ST_SET_CLN.)
+
+The following ioctl uses the structure mtpos:
+MTIOCPOS Reads the current position from the drive. Uses
+ Tandberg-compatible QFA for SCSI-1 drives and the SCSI-2
+ command for the SCSI-2 drives.
+
+The following ioctl uses the structure mtget to return the status:
+MTIOCGET Returns some status information.
+ The file number and block number within file are returned. The
+ block is -1 when it can't be determined (e.g., after MTBSF).
+ The drive type is either MTISSCSI1 or MTISSCSI2.
+ The number of recovered errors since the previous status call
+ is stored in the lower word of the field mt_erreg.
+ The current block size and the density code are stored in the field
+ mt_dsreg (shifts for the subfields are MT_ST_BLKSIZE_SHIFT and
+ MT_ST_DENSITY_SHIFT).
+ The GMT_xxx status bits reflect the drive status. GMT_DR_OPEN
+ is set if there is no tape in the drive. GMT_EOD means either
+ end of recorded data or end of tape. GMT_EOT means end of tape.
+
+
+MISCELLANEOUS COMPILE OPTIONS
+
+The recovered write errors are considered fatal if ST_RECOVERED_WRITE_FATAL
+is defined.
+
+The maximum number of tape devices is determined by the define
+ST_MAX_TAPES. If more tapes are detected at driver initialization, the
+maximum is adjusted accordingly.
+
+Immediate return from tape positioning SCSI commands can be enabled by
+defining ST_NOWAIT. If this is defined, the user should take care that
+the next tape operation is not started before the previous one has
+finished. The drives and SCSI adapters should handle this condition
+gracefully, but some drive/adapter combinations are known to hang the
+SCSI bus in this case.
+
+The MTEOM command is by default implemented as spacing over 32767
+filemarks. With this method the file number in the status is
+correct. The user can request using direct spacing to EOD by setting
+ST_FAST_EOM 1 (or using the MT_ST_OPTIONS ioctl). In this case the file
+number will be invalid.
+
+When using read ahead or buffered writes the position within the file
+may not be correct after the file is closed (correct position may
+require backspacing over more than one record). The correct position
+within file can be obtained if ST_IN_FILE_POS is defined at compile
+time or the MT_ST_CAN_BSR bit is set for the drive with an ioctl.
+(The driver always backs over a filemark crossed by read ahead if the
+user does not request data that far.)
+
+
+DEBUGGING HINTS
+
+To enable debugging messages, edit st.c and #define DEBUG 1. As seen
+above, debugging can be switched off with an ioctl if debugging is
+compiled into the driver. The debugging output is not voluminous.
+
+If the tape seems to hang, I would be very interested to hear where
+the driver is waiting. With the command 'ps -l' you can see the state
+of the process using the tape. If the state is D, the process is
+waiting for something. The field WCHAN tells where the driver is
+waiting. If you have the current System.map in the correct place (in
+/boot for the procps I use) or have updated /etc/psdatabase (for kmem
+ps), ps writes the function name in the WCHAN field. If not, you have
+to look up the function from System.map.
+
+Note also that the timeouts are very long compared to most other
+drivers. This means that the Linux driver may appear hung although the
+real reason is that the tape firmware has got confused.
diff --git a/Documentation/scsi/sym53c500_cs.txt b/Documentation/scsi/sym53c500_cs.txt
new file mode 100644
index 0000000..75febcf
--- /dev/null
+++ b/Documentation/scsi/sym53c500_cs.txt
@@ -0,0 +1,23 @@
+The sym53c500_cs driver originated as an add-on to David Hinds' pcmcia-cs
+package, and was written by Tom Corner (tcorner@via.at). A rewrite was
+long overdue, and the current version addresses the following concerns:
+
+ (1) extensive kernel changes between 2.4 and 2.6.
+ (2) deprecated PCMCIA support outside the kernel.
+
+All the USE_BIOS code has been ripped out. It was never used, and could
+not have worked anyway. The USE_DMA code is likewise gone. Many thanks
+to YOKOTA Hiroshi (nsp_cs driver) and David Hinds (qlogic_cs driver) for
+the code fragments I shamelessly adapted for this work. Thanks also to
+Christoph Hellwig for his patient tutelage while I stumbled about.
+
+The Symbios Logic 53c500 chip was used in the "newer" (circa 1997) version
+of the New Media Bus Toaster PCMCIA SCSI controller. Presumably there are
+other products using this chip, but I've never laid eyes (much less hands)
+on one.
+
+Through the years, there have been a number of downloads of the pcmcia-cs
+version of this driver, and I guess it worked for those users. It worked
+for Tom Corner, and it works for me. Your mileage will probably vary.
+
+--Bob Tracy (rct@frus.com)
diff --git a/Documentation/scsi/sym53c8xx_2.txt b/Documentation/scsi/sym53c8xx_2.txt
new file mode 100644
index 0000000..49ea5c5
--- /dev/null
+++ b/Documentation/scsi/sym53c8xx_2.txt
@@ -0,0 +1,1048 @@
+The Linux SYM-2 driver documentation file
+
+Written by Gerard Roudier <groudier@free.fr>
+21 Rue Carnot
+95170 DEUIL LA BARRE - FRANCE
+
+Updated by Matthew Wilcox <matthew@wil.cx>
+
+2004-10-09
+===============================================================================
+
+1. Introduction
+2. Supported chips and SCSI features
+3. Advantages of this driver for newer chips.
+ 3.1 Optimized SCSI SCRIPTS
+ 3.2 New features appeared with the SYM53C896
+4. Memory mapped I/O versus normal I/O
+5. Tagged command queueing
+6. Parity checking
+7. Profiling information
+8. Control commands
+ 8.1 Set minimum synchronous period
+ 8.2 Set wide size
+ 8.3 Set maximum number of concurrent tagged commands
+ 8.4 Set debug mode
+ 8.5 Set flag (no_disc)
+ 8.6 Set verbose level
+ 8.7 Reset all logical units of a target
+ 8.8 Abort all tasks of all logical units of a target
+9. Configuration parameters
+10. Boot setup commands
+ 10.1 Syntax
+ 10.2 Available arguments
+ 10.2.1 Default number of tagged commands
+ 10.2.2 Burst max
+ 10.2.3 LED support
+ 10.2.4 Differential mode
+ 10.2.5 IRQ mode
+ 10.2.6 Check SCSI BUS
+ 10.2.7 Suggest a default SCSI id for hosts
+ 10.2.8 Verbosity level
+ 10.2.9 Debug mode
+ 10.2.10 Settle delay
+ 10.2.11 Serial NVRAM
+ 10.2.12 Exclude a host from being attached
+ 10.3 Converting from old options
+ 10.4 SCSI BUS checking boot option
+11. SCSI problem troubleshooting
+ 15.1 Problem tracking
+ 15.2 Understanding hardware error reports
+12. Serial NVRAM support (by Richard Waltham)
+ 17.1 Features
+ 17.2 Symbios NVRAM layout
+ 17.3 Tekram NVRAM layout
+
+===============================================================================
+
+1. Introduction
+
+This driver supports the whole SYM53C8XX family of PCI-SCSI controllers.
+It also support the subset of LSI53C10XX PCI-SCSI controllers that are based
+on the SYM53C8XX SCRIPTS language.
+
+It replaces the sym53c8xx+ncr53c8xx driver bundle and shares its core code
+with the FreeBSD SYM-2 driver. The `glue' that allows this driver to work
+under Linux is contained in 2 files named sym_glue.h and sym_glue.c.
+Other drivers files are intended not to depend on the Operating System
+on which the driver is used.
+
+The history of this driver can be summarized as follows:
+
+1993: ncr driver written for 386bsd and FreeBSD by:
+ Wolfgang Stanglmeier <wolf@cologne.de>
+ Stefan Esser <se@mi.Uni-Koeln.de>
+
+1996: port of the ncr driver to Linux-1.2.13 and rename it ncr53c8xx.
+ Gerard Roudier
+
+1998: new sym53c8xx driver for Linux based on LOAD/STORE instruction and that
+ adds full support for the 896 but drops support for early NCR devices.
+ Gerard Roudier
+
+1999: port of the sym53c8xx driver to FreeBSD and support for the LSI53C1010
+ 33 MHz and 66MHz Ultra-3 controllers. The new driver is named `sym'.
+ Gerard Roudier
+
+2000: Add support for early NCR devices to FreeBSD `sym' driver.
+ Break the driver into several sources and separate the OS glue
+ code from the core code that can be shared among different O/Ses.
+ Write a glue code for Linux.
+ Gerard Roudier
+
+2004: Remove FreeBSD compatibility code. Remove support for versions of
+ Linux before 2.6. Start using Linux facilities.
+
+This README file addresses the Linux version of the driver. Under FreeBSD,
+the driver documentation is the sym.8 man page.
+
+Information about new chips is available at LSILOGIC web server:
+
+ http://www.lsilogic.com/
+
+SCSI standard documentations are available at T10 site:
+
+ http://www.t10.org/
+
+Useful SCSI tools written by Eric Youngdale are part of most Linux
+distributions:
+ scsiinfo: command line tool
+ scsi-config: TCL/Tk tool using scsiinfo
+
+2. Supported chips and SCSI features
+
+The following features are supported for all chips:
+
+ Synchronous negotiation
+ Disconnection
+ Tagged command queuing
+ SCSI parity checking
+ PCI Master parity checking
+
+Other features depends on chip capabilities.
+The driver notably uses optimized SCRIPTS for devices that support
+LOAD/STORE and handles PHASE MISMATCH from SCRIPTS for devices that
+support the corresponding feature.
+
+The following table shows some characteristics of the chip family.
+
+ On board LOAD/STORE HARDWARE
+Chip SDMS BIOS Wide SCSI std. Max. sync SCRIPTS PHASE MISMATCH
+---- --------- ---- --------- ---------- ---------- --------------
+810 N N FAST10 10 MB/s N N
+810A N N FAST10 10 MB/s Y N
+815 Y N FAST10 10 MB/s N N
+825 Y Y FAST10 20 MB/s N N
+825A Y Y FAST10 20 MB/s Y N
+860 N N FAST20 20 MB/s Y N
+875 Y Y FAST20 40 MB/s Y N
+875A Y Y FAST20 40 MB/s Y Y
+876 Y Y FAST20 40 MB/s Y N
+895 Y Y FAST40 80 MB/s Y N
+895A Y Y FAST40 80 MB/s Y Y
+896 Y Y FAST40 80 MB/s Y Y
+897 Y Y FAST40 80 MB/s Y Y
+1510D Y Y FAST40 80 MB/s Y Y
+1010 Y Y FAST80 160 MB/s Y Y
+1010_66* Y Y FAST80 160 MB/s Y Y
+
+* Chip supports 33MHz and 66MHz PCI bus clock.
+
+
+Summary of other supported features:
+
+Module: allow to load the driver
+Memory mapped I/O: increases performance
+Control commands: write operations to the proc SCSI file system
+Debugging information: written to syslog (expert only)
+Scatter / gather
+Shared interrupt
+Boot setup commands
+Serial NVRAM: Symbios and Tekram formats
+
+
+3. Advantages of this driver for newer chips.
+
+3.1 Optimized SCSI SCRIPTS.
+
+All chips except the 810, 815 and 825, support new SCSI SCRIPTS instructions
+named LOAD and STORE that allow to move up to 1 DWORD from/to an IO register
+to/from memory much faster that the MOVE MEMORY instruction that is supported
+by the 53c7xx and 53c8xx family.
+
+The LOAD/STORE instructions support absolute and DSA relative addressing
+modes. The SCSI SCRIPTS had been entirely rewritten using LOAD/STORE instead
+of MOVE MEMORY instructions.
+
+Due to the lack of LOAD/STORE SCRIPTS instructions by earlier chips, this
+driver also incorporates a different SCRIPTS set based on MEMORY MOVE, in
+order to provide support for the entire SYM53C8XX chips family.
+
+3.2 New features appeared with the SYM53C896
+
+Newer chips (see above) allows handling of the phase mismatch context from
+SCRIPTS (avoids the phase mismatch interrupt that stops the SCSI processor
+until the C code has saved the context of the transfer).
+
+The 896 and 1010 chips support 64 bit PCI transactions and addressing,
+while the 895A supports 32 bit PCI transactions and 64 bit addressing.
+The SCRIPTS processor of these chips is not true 64 bit, but uses segment
+registers for bit 32-63. Another interesting feature is that LOAD/STORE
+instructions that address the on-chip RAM (8k) remain internal to the chip.
+
+4. Memory mapped I/O versus normal I/O
+
+Memory mapped I/O has less latency than normal I/O and is the recommended
+way for doing IO with PCI devices. Memory mapped I/O seems to work fine on
+most hardware configurations, but some poorly designed chipsets may break
+this feature. A configuration option is provided for normal I/O to be
+used but the driver defaults to MMIO.
+
+5. Tagged command queueing
+
+Queuing more than 1 command at a time to a device allows it to perform
+optimizations based on actual head positions and its mechanical
+characteristics. This feature may also reduce average command latency.
+In order to really gain advantage of this feature, devices must have
+a reasonable cache size (No miracle is to be expected for a low-end
+hard disk with 128 KB or less).
+Some kown old SCSI devices do not properly support tagged command queuing.
+Generally, firmware revisions that fix this kind of problems are available
+at respective vendor web/ftp sites.
+All I can say is that I never have had problem with tagged queuing using
+this driver and its predecessors. Hard disks that behaved correctly for
+me using tagged commands are the following:
+
+- IBM S12 0662
+- Conner 1080S
+- Quantum Atlas I
+- Quantum Atlas II
+- Seagate Cheetah I
+- Quantum Viking II
+- IBM DRVS
+- Quantum Atlas IV
+- Seagate Cheetah II
+
+If your controller has NVRAM, you can configure this feature per target
+from the user setup tool. The Tekram Setup program allows to tune the
+maximum number of queued commands up to 32. The Symbios Setup only allows
+to enable or disable this feature.
+
+The maximum number of simultaneous tagged commands queued to a device
+is currently set to 16 by default. This value is suitable for most SCSI
+disks. With large SCSI disks (>= 2GB, cache >= 512KB, average seek time
+<= 10 ms), using a larger value may give better performances.
+
+This driver supports up to 255 commands per device, and but using more than
+64 is generally not worth-while, unless you are using a very large disk or
+disk arrays. It is noticeable that most of recent hard disks seem not to
+accept more than 64 simultaneous commands. So, using more than 64 queued
+commands is probably just resource wasting.
+
+If your controller does not have NVRAM or if it is managed by the SDMS
+BIOS/SETUP, you can configure tagged queueing feature and device queue
+depths from the boot command-line. For example:
+
+ sym53c8xx=tags:4/t2t3q15-t4q7/t1u0q32
+
+will set tagged commands queue depths as follow:
+
+- target 2 all luns on controller 0 --> 15
+- target 3 all luns on controller 0 --> 15
+- target 4 all luns on controller 0 --> 7
+- target 1 lun 0 on controller 1 --> 32
+- all other target/lun --> 4
+
+In some special conditions, some SCSI disk firmwares may return a
+QUEUE FULL status for a SCSI command. This behaviour is managed by the
+driver using the following heuristic:
+
+- Each time a QUEUE FULL status is returned, tagged queue depth is reduced
+ to the actual number of disconnected commands.
+
+- Every 200 successfully completed SCSI commands, if allowed by the
+ current limit, the maximum number of queueable commands is incremented.
+
+Since QUEUE FULL status reception and handling is resource wasting, the
+driver notifies by default this problem to user by indicating the actual
+number of commands used and their status, as well as its decision on the
+device queue depth change.
+The heuristic used by the driver in handling QUEUE FULL ensures that the
+impact on performances is not too bad. You can get rid of the messages by
+setting verbose level to zero, as follow:
+
+1st method: boot your system using 'sym53c8xx=verb:0' option.
+2nd method: apply "setverbose 0" control command to the proc fs entry
+ corresponding to your controller after boot-up.
+
+6. Parity checking
+
+The driver supports SCSI parity checking and PCI bus master parity
+checking. These features must be enabled in order to ensure safe
+data transfers. Some flawed devices or mother boards may have problems
+with parity. The options to defeat parity checking have been removed
+from the driver.
+
+7. Profiling information
+
+This driver does not provide profiling informations as did its predecessors.
+This feature was not this useful and added complexity to the code.
+As the driver code got more complex, I have decided to remove everything
+that didn't seem actually useful.
+
+8. Control commands
+
+Control commands can be sent to the driver with write operations to
+the proc SCSI file system. The generic command syntax is the
+following:
+
+ echo "<verb> <parameters>" >/proc/scsi/sym53c8xx/0
+ (assumes controller number is 0)
+
+Using "all" for "<target>" parameter with the commands below will
+apply to all targets of the SCSI chain (except the controller).
+
+Available commands:
+
+8.1 Set minimum synchronous period factor
+
+ setsync <target> <period factor>
+
+ target: target number
+ period: minimum synchronous period.
+ Maximum speed = 1000/(4*period factor) except for special
+ cases below.
+
+ Specify a period of 0, to force asynchronous transfer mode.
+
+ 9 means 12.5 nano-seconds synchronous period
+ 10 means 25 nano-seconds synchronous period
+ 11 means 30 nano-seconds synchronous period
+ 12 means 50 nano-seconds synchronous period
+
+8.2 Set wide size
+
+ setwide <target> <size>
+
+ target: target number
+ size: 0=8 bits, 1=16bits
+
+8.3 Set maximum number of concurrent tagged commands
+
+ settags <target> <tags>
+
+ target: target number
+ tags: number of concurrent tagged commands
+ must not be greater than configured (default: 16)
+
+8.4 Set debug mode
+
+ setdebug <list of debug flags>
+
+ Available debug flags:
+ alloc: print info about memory allocations (ccb, lcb)
+ queue: print info about insertions into the command start queue
+ result: print sense data on CHECK CONDITION status
+ scatter: print info about the scatter process
+ scripts: print info about the script binding process
+ tiny: print minimal debugging information
+ timing: print timing information of the NCR chip
+ nego: print information about SCSI negotiations
+ phase: print information on script interruptions
+
+ Use "setdebug" with no argument to reset debug flags.
+
+
+8.5 Set flag (no_disc)
+
+ setflag <target> <flag>
+
+ target: target number
+
+ For the moment, only one flag is available:
+
+ no_disc: not allow target to disconnect.
+
+ Do not specify any flag in order to reset the flag. For example:
+ - setflag 4
+ will reset no_disc flag for target 4, so will allow it disconnections.
+ - setflag all
+ will allow disconnection for all devices on the SCSI bus.
+
+
+8.6 Set verbose level
+
+ setverbose #level
+
+ The driver default verbose level is 1. This command allows to change
+ th driver verbose level after boot-up.
+
+8.7 Reset all logical units of a target
+
+ resetdev <target>
+
+ target: target number
+ The driver will try to send a BUS DEVICE RESET message to the target.
+
+8.8 Abort all tasks of all logical units of a target
+
+ cleardev <target>
+
+ target: target number
+ The driver will try to send a ABORT message to all the logical units
+ of the target.
+
+
+9. Configuration parameters
+
+Under kernel configuration tools (make menuconfig, for example), it is
+possible to change some default driver configuration parameters.
+If the firmware of all your devices is perfect enough, all the
+features supported by the driver can be enabled at start-up. However,
+if only one has a flaw for some SCSI feature, you can disable the
+support by the driver of this feature at linux start-up and enable
+this feature after boot-up only for devices that support it safely.
+
+Configuration parameters:
+
+Use normal IO (default answer: n)
+ Answer "y" if you suspect your mother board to not allow memory mapped I/O.
+ May slow down performance a little.
+
+Default tagged command queue depth (default answer: 16)
+ Entering 0 defaults to tagged commands not being used.
+ This parameter can be specified from the boot command line.
+
+Maximum number of queued commands (default answer: 32)
+ This option allows you to specify the maximum number of tagged commands
+ that can be queued to a device. The maximum supported value is 255.
+
+Synchronous transfers frequency (default answer: 80)
+ This option allows you to specify the frequency in MHz the driver
+ will use at boot time for synchronous data transfer negotiations.
+ 0 means "asynchronous data transfers".
+
+10. Boot setup commands
+
+10.1 Syntax
+
+Setup commands can be passed to the driver either at boot time or as
+parameters to modprobe, as described in Documentation/kernel-parameters.txt
+
+Example of boot setup command under lilo prompt:
+
+lilo: linux root=/dev/sda2 sym53c8xx.cmd_per_lun=4 sym53c8xx.sync=10 sym53c8xx.debug=0x200
+
+- enable tagged commands, up to 4 tagged commands queued.
+- set synchronous negotiation speed to 10 Mega-transfers / second.
+- set DEBUG_NEGO flag.
+
+The following command will install the driver module with the same
+options as above.
+
+ modprobe sym53c8xx cmd_per_lun=4 sync=10 debug=0x200
+
+10.2 Available arguments
+
+10.2.1 Default number of tagged commands
+ cmd_per_lun=0 (or cmd_per_lun=1) tagged command queuing disabled
+ cmd_per_lun=#tags (#tags > 1) tagged command queuing enabled
+ #tags will be truncated to the max queued commands configuration parameter.
+
+10.2.2 Burst max
+ burst=0 burst disabled
+ burst=255 get burst length from initial IO register settings.
+ burst=#x burst enabled (1<<#x burst transfers max)
+ #x is an integer value which is log base 2 of the burst transfers max.
+ By default the driver uses the maximum value supported by the chip.
+
+10.2.3 LED support
+ led=1 enable LED support
+ led=0 disable LED support
+ Do not enable LED support if your scsi board does not use SDMS BIOS.
+ (See 'Configuration parameters')
+
+10.2.4 Differential mode
+ diff=0 never set up diff mode
+ diff=1 set up diff mode if BIOS set it
+ diff=2 always set up diff mode
+ diff=3 set diff mode if GPIO3 is not set
+
+10.2.5 IRQ mode
+ irqm=0 always open drain
+ irqm=1 same as initial settings (assumed BIOS settings)
+ irqm=2 always totem pole
+
+10.2.6 Check SCSI BUS
+ buschk=<option bits>
+
+ Available option bits:
+ 0x0: No check.
+ 0x1: Check and do not attach the controller on error.
+ 0x2: Check and just warn on error.
+
+10.2.7 Suggest a default SCSI id for hosts
+ hostid=255 no id suggested.
+ hostid=#x (0 < x < 7) x suggested for hosts SCSI id.
+
+ If a host SCSI id is available from the NVRAM, the driver will ignore
+ any value suggested as boot option. Otherwise, if a suggested value
+ different from 255 has been supplied, it will use it. Otherwise, it will
+ try to deduce the value previously set in the hardware and use value
+ 7 if the hardware value is zero.
+
+10.2.8 Verbosity level
+ verb=0 minimal
+ verb=1 normal
+ verb=2 too much
+
+10.2.9 Debug mode
+ debug=0 clear debug flags
+ debug=#x set debug flags
+ #x is an integer value combining the following power-of-2 values:
+ DEBUG_ALLOC 0x1
+ DEBUG_PHASE 0x2
+ DEBUG_POLL 0x4
+ DEBUG_QUEUE 0x8
+ DEBUG_RESULT 0x10
+ DEBUG_SCATTER 0x20
+ DEBUG_SCRIPT 0x40
+ DEBUG_TINY 0x80
+ DEBUG_TIMING 0x100
+ DEBUG_NEGO 0x200
+ DEBUG_TAGS 0x400
+ DEBUG_FREEZE 0x800
+ DEBUG_RESTART 0x1000
+
+ You can play safely with DEBUG_NEGO. However, some of these flags may
+ generate bunches of syslog messages.
+
+10.2.10 Settle delay
+ settle=n delay for n seconds
+
+ After a bus reset, the driver will delay for n seconds before talking
+ to any device on the bus. The default is 3 seconds and safe mode will
+ default it to 10.
+
+10.2.11 Serial NVRAM
+ NB: option not currently implemented.
+ nvram=n do not look for serial NVRAM
+ nvram=y test controllers for onboard serial NVRAM
+ (alternate binary form)
+ nvram=<bits options>
+ 0x01 look for NVRAM (equivalent to nvram=y)
+ 0x02 ignore NVRAM "Synchronous negotiation" parameters for all devices
+ 0x04 ignore NVRAM "Wide negotiation" parameter for all devices
+ 0x08 ignore NVRAM "Scan at boot time" parameter for all devices
+ 0x80 also attach controllers set to OFF in the NVRAM (sym53c8xx only)
+
+10.2.12 Exclude a host from being attached
+ excl=<io_address>,...
+
+ Prevent host at a given io address from being attached.
+ For example 'excl=0xb400,0xc000' indicate to the
+ driver not to attach hosts at address 0xb400 and 0xc000.
+
+10.3 Converting from old style options
+
+Previously, the sym2 driver accepted arguments of the form
+ sym53c8xx=tags:4,sync:10,debug:0x200
+
+As a result of the new module parameters, this is no longer available.
+Most of the options have remained the same, but tags has become
+cmd_per_lun to reflect its different purposes. The sample above would
+be specified as:
+ modprobe sym53c8xx cmd_per_lun=4 sync=10 debug=0x200
+
+or on the kernel boot line as:
+ sym53c8xx.cmd_per_lun=4 sym53c8xx.sync=10 sym53c8xx.debug=0x200
+
+10.4 SCSI BUS checking boot option.
+
+When this option is set to a non-zero value, the driver checks SCSI lines
+logic state, 100 micro-seconds after having asserted the SCSI RESET line.
+The driver just reads SCSI lines and checks all lines read FALSE except RESET.
+Since SCSI devices shall release the BUS at most 800 nano-seconds after SCSI
+RESET has been asserted, any signal to TRUE may indicate a SCSI BUS problem.
+Unfortunately, the following common SCSI BUS problems are not detected:
+- Only 1 terminator installed.
+- Misplaced terminators.
+- Bad quality terminators.
+On the other hand, either bad cabling, broken devices, not conformant
+devices, ... may cause a SCSI signal to be wrong when te driver reads it.
+
+15. SCSI problem troubleshooting
+
+15.1 Problem tracking
+
+Most SCSI problems are due to a non conformant SCSI bus or too buggy
+devices. If unfortunately you have SCSI problems, you can check the
+following things:
+
+- SCSI bus cables
+- terminations at both end of the SCSI chain
+- linux syslog messages (some of them may help you)
+
+If you do not find the source of problems, you can configure the
+driver or devices in the NVRAM with minimal features.
+
+- only asynchronous data transfers
+- tagged commands disabled
+- disconnections not allowed
+
+Now, if your SCSI bus is ok, your system has every chance to work
+with this safe configuration but performances will not be optimal.
+
+If it still fails, then you can send your problem description to
+appropriate mailing lists or news-groups. Send me a copy in order to
+be sure I will receive it. Obviously, a bug in the driver code is
+possible.
+
+ My current email address: Gerard Roudier <groudier@free.fr>
+
+Allowing disconnections is important if you use several devices on
+your SCSI bus but often causes problems with buggy devices.
+Synchronous data transfers increases throughput of fast devices like
+hard disks. Good SCSI hard disks with a large cache gain advantage of
+tagged commands queuing.
+
+15.2 Understanding hardware error reports
+
+When the driver detects an unexpected error condition, it may display a
+message of the following pattern.
+
+sym0:1: ERROR (0:48) (1-21-65) (f/95/0) @ (script 7c0:19000000).
+sym0: script cmd = 19000000
+sym0: regdump: da 10 80 95 47 0f 01 07 75 01 81 21 80 01 09 00.
+
+Some fields in such a message may help you understand the cause of the
+problem, as follows:
+
+sym0:1: ERROR (0:48) (1-21-65) (f/95/0) @ (script 7c0:19000000).
+.....A.........B.C....D.E..F....G.H..I.......J.....K...L.......
+
+Field A : target number.
+ SCSI ID of the device the controller was talking with at the moment the
+ error occurs.
+
+Field B : DSTAT io register (DMA STATUS)
+ Bit 0x40 : MDPE Master Data Parity Error
+ Data parity error detected on the PCI BUS.
+ Bit 0x20 : BF Bus Fault
+ PCI bus fault condition detected
+ Bit 0x01 : IID Illegal Instruction Detected
+ Set by the chip when it detects an Illegal Instruction format
+ on some condition that makes an instruction illegal.
+ Bit 0x80 : DFE Dma Fifo Empty
+ Pure status bit that does not indicate an error.
+ If the reported DSTAT value contains a combination of MDPE (0x40),
+ BF (0x20), then the cause may be likely due to a PCI BUS problem.
+
+Field C : SIST io register (SCSI Interrupt Status)
+ Bit 0x08 : SGE SCSI GROSS ERROR
+ Indicates that the chip detected a severe error condition
+ on the SCSI BUS that prevents the SCSI protocol from functioning
+ properly.
+ Bit 0x04 : UDC Unexpected Disconnection
+ Indicates that the device released the SCSI BUS when the chip
+ was not expecting this to happen. A device may behave so to
+ indicate the SCSI initiator that an error condition not reportable using the SCSI protocol has occurred.
+ Bit 0x02 : RST SCSI BUS Reset
+ Generally SCSI targets do not reset the SCSI BUS, although any
+ device on the BUS can reset it at any time.
+ Bit 0x01 : PAR Parity
+ SCSI parity error detected.
+ On a faulty SCSI BUS, any error condition among SGE (0x08), UDC (0x04) and
+ PAR (0x01) may be detected by the chip. If your SCSI system sometimes
+ encounters such error conditions, especially SCSI GROSS ERROR, then a SCSI
+ BUS problem is likely the cause of these errors.
+
+For fields D,E,F,G and H, you may look into the sym53c8xx_defs.h file
+that contains some minimal comments on IO register bits.
+Field D : SOCL Scsi Output Control Latch
+ This register reflects the state of the SCSI control lines the
+ chip want to drive or compare against.
+Field E : SBCL Scsi Bus Control Lines
+ Actual value of control lines on the SCSI BUS.
+Field F : SBDL Scsi Bus Data Lines
+ Actual value of data lines on the SCSI BUS.
+Field G : SXFER SCSI Transfer
+ Contains the setting of the Synchronous Period for output and
+ the current Synchronous offset (offset 0 means asynchronous).
+Field H : SCNTL3 Scsi Control Register 3
+ Contains the setting of timing values for both asynchronous and
+ synchronous data transfers.
+Field I : SCNTL4 Scsi Control Register 4
+ Only meaningful for 53C1010 Ultra3 controllers.
+
+Understanding Fields J, K, L and dumps requires to have good knowledge of
+SCSI standards, chip cores functionnals and internal driver data structures.
+You are not required to decode and understand them, unless you want to help
+maintain the driver code.
+
+17. Serial NVRAM (added by Richard Waltham: dormouse@farsrobt.demon.co.uk)
+
+17.1 Features
+
+Enabling serial NVRAM support enables detection of the serial NVRAM included
+on Symbios and some Symbios compatible host adaptors, and Tekram boards. The
+serial NVRAM is used by Symbios and Tekram to hold set up parameters for the
+host adaptor and it's attached drives.
+
+The Symbios NVRAM also holds data on the boot order of host adaptors in a
+system with more than one host adaptor. This information is no longer used
+as it's fundamentally incompatible with the hotplug PCI model.
+
+Tekram boards using Symbios chips, DC390W/F/U, which have NVRAM are detected
+and this is used to distinguish between Symbios compatible and Tekram host
+adaptors. This is used to disable the Symbios compatible "diff" setting
+incorrectly set on Tekram boards if the CONFIG_SCSI_53C8XX_SYMBIOS_COMPAT
+configuration parameter is set enabling both Symbios and Tekram boards to be
+used together with the Symbios cards using all their features, including
+"diff" support. ("led pin" support for Symbios compatible cards can remain
+enabled when using Tekram cards. It does nothing useful for Tekram host
+adaptors but does not cause problems either.)
+
+The parameters the driver is able to get from the NVRAM depend on the
+data format used, as follow:
+
+ Tekram format Symbios format
+General and host parameters
+ Boot order N Y
+ Host SCSI ID Y Y
+ SCSI parity checking Y Y
+ Verbose boot messages N Y
+SCSI devices parameters
+ Synchronous transfer speed Y Y
+ Wide 16 / Narrow Y Y
+ Tagged Command Queuing enabled Y Y
+ Disconnections enabled Y Y
+ Scan at boot time N Y
+
+In order to speed up the system boot, for each device configured without
+the "scan at boot time" option, the driver forces an error on the
+first TEST UNIT READY command received for this device.
+
+
+17.2 Symbios NVRAM layout
+
+typical data at NVRAM address 0x100 (53c810a NVRAM)
+-----------------------------------------------------------
+00 00
+64 01
+8e 0b
+
+00 30 00 00 00 00 07 00 00 00 00 00 00 00 07 04 10 04 00 00
+
+04 00 0f 00 00 10 00 50 00 00 01 00 00 62
+04 00 03 00 00 10 00 58 00 00 01 00 00 63
+04 00 01 00 00 10 00 48 00 00 01 00 00 61
+00 00 00 00 00 00 00 00 00 00 00 00 00 00
+
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+00 00 00 00 00 00 00 00
+
+fe fe
+00 00
+00 00
+-----------------------------------------------------------
+NVRAM layout details
+
+NVRAM Address 0x000-0x0ff not used
+ 0x100-0x26f initialised data
+ 0x270-0x7ff not used
+
+general layout
+
+ header - 6 bytes,
+ data - 356 bytes (checksum is byte sum of this data)
+ trailer - 6 bytes
+ ---
+ total 368 bytes
+
+data area layout
+
+ controller set up - 20 bytes
+ boot configuration - 56 bytes (4x14 bytes)
+ device set up - 128 bytes (16x8 bytes)
+ unused (spare?) - 152 bytes (19x8 bytes)
+ ---
+ total 356 bytes
+
+-----------------------------------------------------------
+header
+
+00 00 - ?? start marker
+64 01 - byte count (lsb/msb excludes header/trailer)
+8e 0b - checksum (lsb/msb excludes header/trailer)
+-----------------------------------------------------------
+controller set up
+
+00 30 00 00 00 00 07 00 00 00 00 00 00 00 07 04 10 04 00 00
+ | | | |
+ | | | -- host ID
+ | | |
+ | | --Removable Media Support
+ | | 0x00 = none
+ | | 0x01 = Bootable Device
+ | | 0x02 = All with Media
+ | |
+ | --flag bits 2
+ | 0x00000001= scan order hi->low
+ | (default 0x00 - scan low->hi)
+ --flag bits 1
+ 0x00000001 scam enable
+ 0x00000010 parity enable
+ 0x00000100 verbose boot msgs
+
+remaining bytes unknown - they do not appear to change in my
+current set up for any of the controllers.
+
+default set up is identical for 53c810a and 53c875 NVRAM
+(Removable Media added Symbios BIOS version 4.09)
+-----------------------------------------------------------
+boot configuration
+
+boot order set by order of the devices in this table
+
+04 00 0f 00 00 10 00 50 00 00 01 00 00 62 -- 1st controller
+04 00 03 00 00 10 00 58 00 00 01 00 00 63 2nd controller
+04 00 01 00 00 10 00 48 00 00 01 00 00 61 3rd controller
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 4th controller
+ | | | | | | | |
+ | | | | | | ---- PCI io port adr
+ | | | | | --0x01 init/scan at boot time
+ | | | | --PCI device/function number (0xdddddfff)
+ | | ----- ?? PCI vendor ID (lsb/msb)
+ ----PCI device ID (lsb/msb)
+
+?? use of this data is a guess but seems reasonable
+
+remaining bytes unknown - they do not appear to change in my
+current set up
+
+default set up is identical for 53c810a and 53c875 NVRAM
+-----------------------------------------------------------
+device set up (up to 16 devices - includes controller)
+
+0f 00 08 08 64 00 0a 00 - id 0
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00
+0f 00 08 08 64 00 0a 00 - id 15
+ | | | | | |
+ | | | | ----timeout (lsb/msb)
+ | | | --synch period (0x?? 40 Mtrans/sec- fast 40) (probably 0x28)
+ | | | (0x30 20 Mtrans/sec- fast 20)
+ | | | (0x64 10 Mtrans/sec- fast )
+ | | | (0xc8 5 Mtrans/sec)
+ | | | (0x00 asynchronous)
+ | | -- ?? max sync offset (0x08 in NVRAM on 53c810a)
+ | | (0x10 in NVRAM on 53c875)
+ | --device bus width (0x08 narrow)
+ | (0x10 16 bit wide)
+ --flag bits
+ 0x00000001 - disconnect enabled
+ 0x00000010 - scan at boot time
+ 0x00000100 - scan luns
+ 0x00001000 - queue tags enabled
+
+remaining bytes unknown - they do not appear to change in my
+current set up
+
+?? use of this data is a guess but seems reasonable
+(but it could be max bus width)
+
+default set up for 53c810a NVRAM
+default set up for 53c875 NVRAM - bus width - 0x10
+ - sync offset ? - 0x10
+ - sync period - 0x30
+-----------------------------------------------------------
+?? spare device space (32 bit bus ??)
+
+00 00 00 00 00 00 00 00 (19x8bytes)
+.
+.
+00 00 00 00 00 00 00 00
+
+default set up is identical for 53c810a and 53c875 NVRAM
+-----------------------------------------------------------
+trailer
+
+fe fe - ? end marker ?
+00 00
+00 00
+
+default set up is identical for 53c810a and 53c875 NVRAM
+-----------------------------------------------------------
+
+
+
+17.3 Tekram NVRAM layout
+
+nvram 64x16 (1024 bit)
+
+Drive settings
+
+Drive ID 0-15 (addr 0x0yyyy0 = device setup, yyyy = ID)
+ (addr 0x0yyyy1 = 0x0000)
+
+ x x x x x x x x x x x x x x x x
+ | | | | | | | | |
+ | | | | | | | | ----- parity check 0 - off
+ | | | | | | | | 1 - on
+ | | | | | | | |
+ | | | | | | | ------- sync neg 0 - off
+ | | | | | | | 1 - on
+ | | | | | | |
+ | | | | | | --------- disconnect 0 - off
+ | | | | | | 1 - on
+ | | | | | |
+ | | | | | ----------- start cmd 0 - off
+ | | | | | 1 - on
+ | | | | |
+ | | | | -------------- tagged cmds 0 - off
+ | | | | 1 - on
+ | | | |
+ | | | ---------------- wide neg 0 - off
+ | | | 1 - on
+ | | |
+ --------------------------- sync rate 0 - 10.0 Mtrans/sec
+ 1 - 8.0
+ 2 - 6.6
+ 3 - 5.7
+ 4 - 5.0
+ 5 - 4.0
+ 6 - 3.0
+ 7 - 2.0
+ 7 - 2.0
+ 8 - 20.0
+ 9 - 16.7
+ a - 13.9
+ b - 11.9
+
+Global settings
+
+Host flags 0 (addr 0x100000, 32)
+
+ x x x x x x x x x x x x x x x x
+ | | | | | | | | | | | |
+ | | | | | | | | ----------- host ID 0x00 - 0x0f
+ | | | | | | | |
+ | | | | | | | ----------------------- support for 0 - off
+ | | | | | | | > 2 drives 1 - on
+ | | | | | | |
+ | | | | | | ------------------------- support drives 0 - off
+ | | | | | | > 1Gbytes 1 - on
+ | | | | | |
+ | | | | | --------------------------- bus reset on 0 - off
+ | | | | | power on 1 - on
+ | | | | |
+ | | | | ----------------------------- active neg 0 - off
+ | | | | 1 - on
+ | | | |
+ | | | -------------------------------- imm seek 0 - off
+ | | | 1 - on
+ | | |
+ | | ---------------------------------- scan luns 0 - off
+ | | 1 - on
+ | |
+ -------------------------------------- removable 0 - disable
+ as BIOS dev 1 - boot device
+ 2 - all
+
+Host flags 1 (addr 0x100001, 33)
+
+ x x x x x x x x x x x x x x x x
+ | | | | | |
+ | | | --------- boot delay 0 - 3 sec
+ | | | 1 - 5
+ | | | 2 - 10
+ | | | 3 - 20
+ | | | 4 - 30
+ | | | 5 - 60
+ | | | 6 - 120
+ | | |
+ --------------------------- max tag cmds 0 - 2
+ 1 - 4
+ 2 - 8
+ 3 - 16
+ 4 - 32
+
+Host flags 2 (addr 0x100010, 34)
+
+ x x x x x x x x x x x x x x x x
+ |
+ ----- F2/F6 enable 0 - off ???
+ 1 - on ???
+
+checksum (addr 0x111111)
+
+checksum = 0x1234 - (sum addr 0-63)
+
+----------------------------------------------------------------------------
+
+default nvram data:
+
+0x0037 0x0000 0x0037 0x0000 0x0037 0x0000 0x0037 0x0000
+0x0037 0x0000 0x0037 0x0000 0x0037 0x0000 0x0037 0x0000
+0x0037 0x0000 0x0037 0x0000 0x0037 0x0000 0x0037 0x0000
+0x0037 0x0000 0x0037 0x0000 0x0037 0x0000 0x0037 0x0000
+
+0x0f07 0x0400 0x0001 0x0000 0x0000 0x0000 0x0000 0x0000
+0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
+0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
+0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0xfbbc
+
+
+===============================================================================
+End of Linux SYM-2 driver documentation file
diff --git a/Documentation/scsi/tmscsim.txt b/Documentation/scsi/tmscsim.txt
new file mode 100644
index 0000000..61c0531
--- /dev/null
+++ b/Documentation/scsi/tmscsim.txt
@@ -0,0 +1,449 @@
+The tmscsim driver
+==================
+
+1. Purpose and history
+2. Installation
+3. Features
+4. Configuration via /proc/scsi/tmscsim/?
+5. Configuration via boot/module params
+6. Potential improvements
+7. Bug reports, debugging and updates
+8. Acknowledgements
+9. Copyright
+
+
+1. Purpose and history
+----------------------
+The tmscsim driver supports PCI SCSI Host Adapters based on the AM53C974
+chip. AM53C974 based SCSI adapters include:
+ Tekram DC390, DC390T
+ Dawicontrol 2974
+ QLogic Fast! PCI Basic
+ some on-board adapters
+(This is most probably not a complete list)
+
+It has originally written by C.L. Huang from the Tekram corp. to support the
+Tekram DC390(T) adapter. This is where the name comes from: tm = Tekram
+scsi = SCSI driver, m = AMD (?) as opposed to w for the DC390W/U/F
+(NCR53c8X5, X=2/7) driver. Yes, there was also a driver for the latter,
+tmscsiw, which supported DC390W/U/F adapters. It's not maintained any more,
+as the ncr53c8xx is perfectly supporting these adapters since some time.
+
+The driver first appeared in April 1996, exclusively supported the DC390
+and has been enhanced since then in various steps. In May 1998 support for
+general AM53C974 based adapters and some possibilities to configure it were
+added. The non-DC390 support works by assuming some values for the data
+normally taken from the DC390 EEPROM. See below (chapter 5) for details.
+
+When using the DC390, the configuration is still be done using the DC390
+BIOS setup. The DC390 EEPROM is read and used by the driver, any boot or
+module parameters (chapter 5) are ignored! However, you can change settings
+dynamically, as described in chapter 4.
+
+For a more detailed description of the driver's history, see the first lines
+of tmscsim.c.
+The numbering scheme isn't consistent. The first versions went from 1.00 to
+1.12, then 1.20a to 1.20t. Finally I decided to use the ncr53c8xx scheme. So
+the next revisions will be 2.0a to 2.0X (stable), 2.1a to 2.1X (experimental),
+2.2a to 2.2X (stable, again) etc. (X = anything between a and z.) If I send
+fixes to people for testing, I create intermediate versions with a digit
+appended, e.g. 2.0c3.
+
+
+2. Installation
+---------------
+If you got any recent kernel with this driver and document included in
+linux/drivers/scsi, you basically have to do nothing special to use this
+driver. Of course you have to choose to compile SCSI support and DC390(T)
+support into your kernel or as module when configuring your kernel for
+compiling.
+NEW: You may as well compile this module outside your kernel, using the
+supplied Makefile.
+
+ If you got an old kernel (pre 2.1.127, pre 2.0.37p1) with an old version of
+ this driver: Get dc390-21125-20b.diff.gz or dc390-2036p21-20b1.diff.gz from
+ my web page and apply the patch. Apply further patches to upgrade to the
+ latest version of the driver.
+
+ If you want to do it manually, you should copy the files (dc390.h,
+ tmscsim.h, tmscsim.c, scsiiom.c and README.tmscsim) from this directory to
+ linux/drivers/scsi. You have to recompile your kernel/module of course.
+
+ You should apply the three patches included in dc390-120-kernel.diff
+ (Applying them: cd /usr/src; patch -p0 <~/dc390-120-kernel.diff)
+ The patches are against 2.1.125, so you might have to manually resolve
+ rejections when applying to another kernel version.
+
+ The patches will update the kernel startup code to allow boot parameters to
+ be passed to the driver, update the Documentation and finally offer you the
+ possibility to omit the non-DC390 parts of the driver.
+ (By selecting "Omit support for non DC390" you basically disable the
+ emulation of a DC390 EEPROM for non DC390 adapters. This saves a few bytes
+ of memory.)
+
+If you got a very old kernel without the tmscsim driver (pre 2.0.31)
+I recommend upgrading your kernel. However, if you don't want to, please
+contact me to get the appropriate patches.
+
+
+Upgrading a SCSI driver is always a delicate thing to do. The 2.0 driver has
+proven stable on many systems, but it's still a good idea to take some
+precautions. In an ideal world you would have a full backup of your disks.
+The world isn't ideal and most people don't have full backups (me neither).
+So take at least the following measures:
+* make your kernel remount the FS read-only on detecting an error:
+ tune2fs -e remount-ro /dev/sd??
+* have copies of your SCSI disk's partition tables on some safe location:
+ dd if=/dev/sda of=/mnt/floppy/sda bs=512 count=1
+ or just print it with:
+ fdisk -l | lpr
+* make sure you are able to boot Linux (e.g. from floppy disk using InitRD)
+ if your SCSI disk gets corrupted. You can use
+ ftp://student.physik.uni-dortmund.de/pub/linux/kernel/bootdisk.gz
+
+One more warning: I used to overclock my PCI bus to 41.67 MHz. My Tekram
+DC390F (Sym53c875) accepted this as well as my Millenium. But the Am53C974
+produced errors and started to corrupt my disks. So don't do that! A 37.50
+MHz PCI bus works for me, though, but I don't recommend using higher clocks
+than the 33.33 MHz being in the PCI spec.
+
+If you want to share the IRQ with another device and the driver refuses to
+do so, you might succeed with changing the DC390_IRQ type in tmscsim.c to
+IRQF_SHARED | IRQF_DISABLED.
+
+
+3.Features
+----------
+- SCSI
+ * Tagged command queueing
+ * Sync speed up to 10 MHz
+ * Disconnection
+ * Multiple LUNs
+
+- General / Linux interface
+ * Support for up to 4 AM53C974 adapters.
+ * DC390 EEPROM usage or boot/module params
+ * Information via cat /proc/scsi/tmscsim/?
+ * Dynamically configurable by writing to /proc/scsi/tmscsim/?
+ * Dynamic allocation of resources
+ * SMP support: Locking on io_request lock (Linux 2.1/2.2) or adapter
+ specific locks (Linux 2.5?)
+ * Uniform source code for Linux-2.x.y
+ * Support for dyn. addition/removal of devices via add/remove-single-device
+ (Try: echo "scsi add-single-device C B T U" >/proc/scsi/scsi
+ C = Controller, B = Bus, T = Target SCSI ID, U = Unit SCSI LUN.)
+ Use with care!
+ * Try to use the partition table for the determination of the mapping
+
+
+4. Configuration via /proc/scsi/tmscsim/?
+-----------------------------------------
+First of all look at the output of /proc/scsi/tmscsim/? by typing
+ cat /proc/scsi/tmscsim/?
+The "?" should be replaced by the SCSI host number. (The shell might do this
+for you.)
+You will see some info regarding the adapter and, at the end, a listing of
+the attached devices and their settings.
+
+Here's an example:
+garloff@kurt:/home/garloff > cat /proc/scsi/tmscsim/0
+Tekram DC390/AM53C974 PCI SCSI Host Adapter, Driver Version 2.0e7 2000-11-28
+SCSI Host Nr 1, AM53C974 Adapter Nr 0
+IOPortBase 0xb000, IRQ 10
+MaxID 8, MaxLUN 8, AdapterID 6, SelTimeout 250 ms, DelayReset 1 s
+TagMaxNum 16, Status 0x00, ACBFlag 0x00, GlitchEater 24 ns
+Statistics: Cmnds 1470165, Cmnds not sent directly 0, Out of SRB conds 0
+ Lost arbitrations 587, Sel. connected 0, Connected: No
+Nr of attached devices: 4, Nr of DCBs: 4
+Map of attached LUNs: 01 00 00 03 01 00 00 00
+Idx ID LUN Prty Sync DsCn SndS TagQ NegoPeriod SyncSpeed SyncOffs MaxCmd
+00 00 00 Yes Yes Yes Yes Yes 100 ns 10.0 M 15 16
+01 03 00 Yes Yes Yes Yes No 100 ns 10.0 M 15 01
+02 03 01 Yes Yes Yes Yes No 100 ns 10.0 M 15 01
+03 04 00 Yes Yes Yes Yes No 100 ns 10.0 M 15 01
+
+Note that the settings MaxID and MaxLUN are not zero- but one-based, which
+means that a setting MaxLUN=4, will result in the support of LUNs 0..3. This
+is somehow inconvenient, but the way the mid-level SCSI code expects it to be.
+
+ACB and DCB are acronyms for Adapter Control Block and Device Control Block.
+These are data structures of the driver containing information about the
+adapter and the connected SCSI devices respectively.
+
+Idx is the device index (just a consecutive number for the driver), ID and
+LUN are the SCSI ID and LUN, Prty means Parity checking, Sync synchronous
+negotiation, DsCn Disconnection, SndS Send Start command on startup (not
+used by the driver) and TagQ Tagged Command Queueing. NegoPeriod and
+SyncSpeed are somehow redundant, because they are reciprocal values
+(1 / 112 ns = 8.9 MHz). At least in theory. The driver is able to adjust the
+NegoPeriod more accurate (4ns) than the SyncSpeed (1 / 25ns). I don't know
+if certain devices will have problems with this discrepancy. Max. speed is
+10 MHz corresp. to a min. NegoPeriod of 100 ns.
+(The driver allows slightly higher speeds if the devices (Ultra SCSI) accept
+it, but that's out of adapter spec, on your own risk and unlikely to improve
+performance. You're likely to crash your disks.)
+SyncOffs is the offset used for synchronous negotiations; max. is 15.
+The last values are only shown, if Sync is enabled. (NegoPeriod is still
+displayed in brackets to show the values which will be used after enabling
+Sync.)
+MaxCmd ist the number of commands (=tags) which can be processed at the same
+time by the device.
+
+If you want to change a setting, you can do that by writing to
+/proc/scsi/tmscsim/?. Basically you have to imitate the output of driver.
+(Don't use the brackets for NegoPeriod on Sync disabled devices.)
+You don't have to care about capitalisation. The driver will accept space,
+tab, comma, = and : as separators.
+
+There are three kinds of changes:
+
+(1) Change driver settings:
+ You type the names of the parameters and the params following it.
+ Example:
+ echo "MaxLUN=8 seltimeout 200" >/proc/scsi/tmscsim/0
+
+ Note that you can only change MaxID, MaxLUN, AdapterID, SelTimeOut,
+ TagMaxNum, ACBFlag, GlitchEater and DelayReset. Don't change ACBFlag
+ unless you want to see what happens, if the driver hangs.
+
+(2) Change device settings: You write a config line to the driver. The Nr
+ must match the ID and LUN given. If you give "-" as parameter, it is
+ ignored and the corresponding setting won't be changed.
+ You can use "y" or "n" instead of "Yes" and "No" if you want to.
+ You don't need to specify a full line. The driver automatically performs
+ an INQUIRY on the device if necessary to check if it is capable to operate
+ with the given settings (Sync, TagQ).
+ Examples:
+ echo "0 0 0 y y y - y - 10 " >/proc/scsi/tmscsim/0
+ echo "3 5 0 y n y " >/proc/scsi/tmscsim/0
+
+ To give a short explanation of the first example:
+ The first three numbers, "0 0 0" (Device index 0, SCSI ID 0, SCSI LUN 0),
+ select the device to which the following parameters apply. Note that it
+ would be sufficient to use the index or both SCSI ID and LUN, but I chose
+ to require all three to have a syntax similar to the output.
+ The following "y y y - y" enables Parity checking, enables Synchronous
+ transfers, Disconnection, leaves Send Start (not used) untouched and
+ enables Tagged Command Queueing for the selected device. The "-" skips
+ the Negotiation Period setting but the "10" sets the max sync. speed to
+ 10 MHz. It's useless to specify both NegoPeriod and SyncSpeed as
+ discussed above. The values used in this example will result in maximum
+ performance.
+
+(3) Special commands: You can force a SCSI bus reset, an INQUIRY command, the
+ removal or the addition of a device's DCB and a SCSI register dump.
+ This is only used for debugging when you meet problems. The parameter of
+ the INQUIRY and REMOVE commands is the device index as shown by the
+ output of /proc/scsi/tmscsim/? in the device listing in the first column
+ (Idx). ADD takes the SCSI ID and LUN.
+ Examples:
+ echo "reset" >/proc/scsi/tmscsim/0
+ echo "inquiry 1" >/proc/scsi/tmscsim/0
+ echo "remove 2" >/proc/scsi/tmscsim/1
+ echo "add 2 3" >/proc/scsi/tmscsim/?
+ echo "dump" >/proc/scsi/tmscsim/0
+
+ Note that you will meet problems when you REMOVE a device's DCB with the
+ remove command if it contains partitions which are mounted. Only use it
+ after unmounting its partitions, telling the SCSI mid-level code to
+ remove it (scsi remove-single-device) and you really need a few bytes of
+ memory.
+ The ADD command allows you to configure a device before you tell the
+ mid-level code to try detection.
+
+
+I'd suggest reviewing the output of /proc/scsi/tmscsim/? after changing
+settings to see if everything changed as requested.
+
+
+5. Configuration via boot/module parameters
+-------------------------------------------
+With the DC390, the driver reads its EEPROM settings and tries to use them.
+But you may want to override the settings prior to being able to change the
+driver configuration via /proc/scsi/tmscsim/?.
+If you do have another AM53C974 based adapter, that's even the only
+possibility to adjust settings before you are able to write to the
+/proc/scsi/tmscsim/? pseudo-file, e.g. if you want to use another
+adapter ID than 7.
+(BTW, the log message "DC390: No EEPROM found!" is normal without a DC390.)
+For this purpose, you can pass options to the driver before it is initialised
+by using kernel or module parameters. See lilo(8) or modprobe(1) manual
+pages on how to pass params to the kernel or a module.
+[NOTE: Formerly, it was not possible to override the EEPROM supplied
+ settings of the DC390 with cmd line parameters. This has changed since
+ 2.0e7]
+
+The syntax of the params is much shorter than the syntax of the /proc/...
+interface. This makes it a little bit more difficult to use. However, long
+parameter lines have the risk to be misinterpreted and the length of kernel
+parameters is limited.
+
+As the support for non-DC390 adapters works by simulating the values of the
+DC390 EEPROM, the settings are given in a DC390 BIOS' way.
+
+Here's the syntax:
+tmscsim=AdaptID,SpdIdx,DevMode,AdaptMode,TaggedCmnds,DelayReset
+
+Each of the parameters is a number, containing the described information:
+
+* AdaptID: The SCSI ID of the host adapter. Must be in the range 0..7
+ Default is 7.
+
+* SpdIdx: The index of the maximum speed as in the DC390 BIOS. The values
+ 0..7 mean 10, 8.0, 6.7, 5.7, 5.0, 4.0, 3.1 and 2 MHz resp. Default is
+ 0 (10.0 MHz).
+
+* DevMode is a bit mapped value describing the per-device features. It
+ applies to all devices. (Sync, Disc and TagQ will only apply, if the
+ device supports it.) The meaning of the bits (* = default):
+
+ Bit Val(hex) Val(dec) Meaning
+ *0 0x01 1 Parity check
+ *1 0x02 2 Synchronous Negotiation
+ *2 0x04 4 Disconnection
+ *3 0x08 8 Send Start command on startup. (Not used)
+ *4 0x10 16 Tagged Command Queueing
+
+ As usual, the desired value is obtained by adding the wanted values. If
+ you want to enable all values, e.g., you would use 31(0x1f). Default is 31.
+
+* AdaptMode is a bit mapped value describing the enabled adapter features.
+
+ Bit Val(hex) Val(dec) Meaning
+ *0 0x01 1 Support more than two drives. (Not used)
+ *1 0x02 2 Use DOS compatible mapping for HDs greater than 1GB.
+ *2 0x04 4 Reset SCSI Bus on startup.
+ *3 0x08 8 Active Negation: Improves SCSI Bus noise immunity.
+ 4 0x10 16 Immediate return on BIOS seek command. (Not used)
+ (*)5 0x20 32 Check for LUNs >= 1.
+
+ The default for LUN Check depends on CONFIG_SCSI_MULTI_LUN.
+
+* TaggedCmnds is a number indicating the maximum number of Tagged Commands.
+ It is the binary logarithm - 1 of the actual number. Max is 4 (32).
+ Value Number of Tagged Commands
+ 0 2
+ 1 4
+ 2 8
+ *3 16
+ 4 32
+
+* DelayReset is the time in seconds (minus 0.5s), the adapter waits, after a
+ bus reset. Default is 1 (corresp. to 1.5s).
+
+Example:
+ modprobe tmscsim tmscsim=6,2,31
+would set the adapter ID to 6, max. speed to 6.7 MHz, enable all device
+features and leave the adapter features, the number of Tagged Commands
+and the Delay after a reset to the defaults.
+
+As you can see, you don't need to specify all of the six params.
+If you want values to be ignored (i.e. the EEprom settings or the defaults
+will be used), you may pass -2 (not 0!) at the corresponding position.
+
+The defaults (7,0,31,15,3,1) are aggressive to allow good performance. You
+can use tmscsim=7,0,31,63,4,0 for maximum performance, if your SCSI chain
+allows it. If you meet problems, you can use tmscsim=-1 which is a shortcut
+for tmscsim=7,4,9,15,2,10.
+
+
+6. Potential improvements
+-------------------------
+Most of the intended work on the driver has been done. Here are a few ideas
+to further improve its usability:
+
+* Cleanly separate per-Target and per-LUN properties (DCB)
+* More intelligent abort() routine
+* Use new_eh code (Linux-2.1+)
+* Have the mid-level (ML) code (and not the driver) handle more of the
+ various conditions.
+* Command queueing in the driver: Eliminate Query list and use ML instead.
+* More user friendly boot/module param syntax
+
+Further investigation on these problems:
+
+* Driver hangs with sync readcdda (xcdroast) (most probably VIA PCI error)
+
+Known problems:
+Please see http://www.garloff.de/kurt/linux/dc390/problems.html
+
+* Changing the parameters of multi-lun by the tmscsim/? interface will
+ cause problems, cause these settings are mostly per Target and not per LUN
+ and should be updated accordingly. To be fixed for 2.0d24.
+* CDRs (eg Yam CRW4416) not recognized, because some buggy devices don't
+ recover from a SCSI reset in time. Use a higher delay or don't issue
+ a SCSI bus reset on driver initialization. See problems page.
+ For the CRW4416S, this seems to be solved with firmware 1.0g (reported by
+ Jean-Yves Barbier).
+* TEAC CD-532S not being recognized. (Works with 1.11).
+* Scanners (eg. Astra UMAX 1220S) don't work: Disable Sync Negotiation.
+ If this does not help, try echo "INQUIRY t" >/proc/scsi/tmscsim/? (t
+ replaced by the dev index of your scanner). You may try to reset your SCSI
+ bus afterwards (echo "RESET" >/proc/scsi/tmscsim/?).
+ The problem seems to be solved as of 2.0d18, thanks to Andreas Rick.
+* If there is a valid partition table, the driver will use it for determining
+ the mapping. If there's none, a reasonable mapping (Symbios-like) will be
+ assumed. Other operating systems may not like this mapping, though
+ it's consistent with the BIOS' behaviour. Old DC390 drivers ignored the
+ partition table and used a H/S = 64/32 or 255/63 translation. So if you
+ want to be compatible to those, use this old mapping when creating
+ partition tables. Even worse, on bootup the DC390 might complain if other
+ mappings are found, so auto rebooting may fail.
+* In some situations, the driver will get stuck in an abort loop. This is a
+ bad interaction between the Mid-Layer of Linux' SCSI code and the driver.
+ Try to disable DsCn, if you meet this problem. Please contact me for
+ further debugging.
+
+
+7. Bug reports, debugging and updates
+-------------------------------------
+Whenever you have problems with the driver, you are invited to ask the
+author for help. However, I'd suggest reading the docs and trying to solve
+the problem yourself, first.
+If you find something, which you believe to be a bug, please report it to me.
+Please append the output of /proc/scsi/scsi, /proc/scsi/tmscsim/? and
+maybe the DC390 log messages to the report.
+
+Bug reports should be send to me (Kurt Garloff <dc390@garloff.de>) as well
+as to the linux-scsi list (<linux-scsi@vger.kernel.org>), as sometimes bugs
+are caused by the SCSI mid-level code.
+
+I will ask you for some more details and probably I will also ask you to
+enable some of the DEBUG options in the driver (tmscsim.c:DC390_DEBUGXXX
+defines). The driver will produce some data for the syslog facility then.
+Beware: If your syslog gets written to a SCSI disk connected to your
+AM53C974, the logging might produce log output again, and you might end
+having your box spending most of its time doing the logging.
+
+The latest version of the driver can be found at:
+ http://www.garloff.de/kurt/linux/dc390/
+ ftp://ftp.suse.com/pub/people/garloff/linux/dc390/
+
+
+8. Acknowledgements
+-------------------
+Thanks to Linus Torvalds, Alan Cox, the FSF people, the XFree86 team and
+all the others for the wonderful OS and software.
+Thanks to C.L. Huang and Philip Giang (Tekram) for the initial driver
+release and support.
+Thanks to Doug Ledford, Gérard Roudier for support with SCSI coding.
+Thanks to a lot of people (espec. Chiaki Ishikawa, Andreas Haumer, Hubert
+Tonneau) for intensively testing the driver (and even risking data loss
+doing this during early revisions).
+Recently, SuSE GmbH, Nuernberg, FRG, has been paying me for the driver
+development and maintenance. Special thanks!
+
+
+9. Copyright
+------------
+ This driver is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+ If you want to use any later version of the GNU GPL, you will probably
+ be allowed to, but you have to ask me and Tekram <erich@tekram.com.tw>
+ before.
+
+-------------------------------------------------------------------------
+Written by Kurt Garloff <kurt@garloff.de> 1998/06/11
+Last updated 2000/11/28, driver revision 2.0e7
+$Id: README.tmscsim,v 2.25.2.7 2000/12/20 01:07:12 garloff Exp $
diff --git a/Documentation/serial-console.txt b/Documentation/serial-console.txt
new file mode 100644
index 0000000..9a7bc8b
--- /dev/null
+++ b/Documentation/serial-console.txt
@@ -0,0 +1,109 @@
+ Linux Serial Console
+
+To use a serial port as console you need to compile the support into your
+kernel - by default it is not compiled in. For PC style serial ports
+it's the config option next to "Standard/generic (dumb) serial support".
+You must compile serial support into the kernel and not as a module.
+
+It is possible to specify multiple devices for console output. You can
+define a new kernel command line option to select which device(s) to
+use for console output.
+
+The format of this option is:
+
+ console=device,options
+
+ device: tty0 for the foreground virtual console
+ ttyX for any other virtual console
+ ttySx for a serial port
+ lp0 for the first parallel port
+ ttyUSB0 for the first USB serial device
+
+ options: depend on the driver. For the serial port this
+ defines the baudrate/parity/bits/flow control of
+ the port, in the format BBBBPNF, where BBBB is the
+ speed, P is parity (n/o/e), N is number of bits,
+ and F is flow control ('r' for RTS). Default is
+ 9600n8. The maximum baudrate is 115200.
+
+You can specify multiple console= options on the kernel command line.
+Output will appear on all of them. The last device will be used when
+you open /dev/console. So, for example:
+
+ console=ttyS1,9600 console=tty0
+
+defines that opening /dev/console will get you the current foreground
+virtual console, and kernel messages will appear on both the VGA
+console and the 2nd serial port (ttyS1 or COM2) at 9600 baud.
+
+Note that you can only define one console per device type (serial, video).
+
+If no console device is specified, the first device found capable of
+acting as a system console will be used. At this time, the system
+first looks for a VGA card and then for a serial port. So if you don't
+have a VGA card in your system the first serial port will automatically
+become the console.
+
+You will need to create a new device to use /dev/console. The official
+/dev/console is now character device 5,1.
+
+(You can also use a network device as a console. See
+Documentation/networking/netconsole.txt for information on that.)
+
+Here's an example that will use /dev/ttyS1 (COM2) as the console.
+Replace the sample values as needed.
+
+1. Create /dev/console (real console) and /dev/tty0 (master virtual
+ console):
+
+ cd /dev
+ rm -f console tty0
+ mknod -m 622 console c 5 1
+ mknod -m 622 tty0 c 4 0
+
+2. LILO can also take input from a serial device. This is a very
+ useful option. To tell LILO to use the serial port:
+ In lilo.conf (global section):
+
+ serial = 1,9600n8 (ttyS1, 9600 bd, no parity, 8 bits)
+
+3. Adjust to kernel flags for the new kernel,
+ again in lilo.conf (kernel section)
+
+ append = "console=ttyS1,9600"
+
+4. Make sure a getty runs on the serial port so that you can login to
+ it once the system is done booting. This is done by adding a line
+ like this to /etc/inittab (exact syntax depends on your getty):
+
+ S1:23:respawn:/sbin/getty -L ttyS1 9600 vt100
+
+5. Init and /etc/ioctl.save
+
+ Sysvinit remembers its stty settings in a file in /etc, called
+ `/etc/ioctl.save'. REMOVE THIS FILE before using the serial
+ console for the first time, because otherwise init will probably
+ set the baudrate to 38400 (baudrate of the virtual console).
+
+6. /dev/console and X
+ Programs that want to do something with the virtual console usually
+ open /dev/console. If you have created the new /dev/console device,
+ and your console is NOT the virtual console some programs will fail.
+ Those are programs that want to access the VT interface, and use
+ /dev/console instead of /dev/tty0. Some of those programs are:
+
+ Xfree86, svgalib, gpm, SVGATextMode
+
+ It should be fixed in modern versions of these programs though.
+
+ Note that if you boot without a console= option (or with
+ console=/dev/tty0), /dev/console is the same as /dev/tty0. In that
+ case everything will still work.
+
+7. Thanks
+
+ Thanks to Geert Uytterhoeven <geert@linux-m68k.org>
+ for porting the patches from 2.1.4x to 2.1.6x for taking care of
+ the integration of these patches into m68k, ppc and alpha.
+
+Miquel van Smoorenburg <miquels@cistron.nl>, 11-Jun-2000
diff --git a/Documentation/serial/00-INDEX b/Documentation/serial/00-INDEX
new file mode 100644
index 0000000..07dcdb0
--- /dev/null
+++ b/Documentation/serial/00-INDEX
@@ -0,0 +1,24 @@
+00-INDEX
+ - this file.
+README.cycladesZ
+ - info on Cyclades-Z firmware loading.
+computone.txt
+ - info on Computone Intelliport II/Plus Multiport Serial Driver.
+digiepca.txt
+ - info on Digi Intl. {PC,PCI,EISA}Xx and Xem series cards.
+hayes-esp.txt
+ - info on using the Hayes ESP serial driver.
+moxa-smartio
+ - file with info on installing/using Moxa multiport serial driver.
+riscom8.txt
+ - notes on using the RISCom/8 multi-port serial driver.
+rocket.txt
+ - info on the Comtrol RocketPort multiport serial driver.
+specialix.txt
+ - info on hardware/driver for specialix IO8+ multiport serial card.
+stallion.txt
+ - info on using the Stallion multiport serial driver.
+sx.txt
+ - info on the Specialix SX/SI multiport serial driver.
+tty.txt
+ - guide to the locking policies of the tty layer.
diff --git a/Documentation/serial/README.cycladesZ b/Documentation/serial/README.cycladesZ
new file mode 100644
index 0000000..024a694
--- /dev/null
+++ b/Documentation/serial/README.cycladesZ
@@ -0,0 +1,8 @@
+
+The Cyclades-Z must have firmware loaded onto the card before it will
+operate. This operation should be performed during system startup,
+
+The firmware, loader program and the latest device driver code are
+available from Cyclades at
+ ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/
+
diff --git a/Documentation/serial/computone.txt b/Documentation/serial/computone.txt
new file mode 100644
index 0000000..c57ea47
--- /dev/null
+++ b/Documentation/serial/computone.txt
@@ -0,0 +1,522 @@
+NOTE: This is an unmaintained driver. It is not guaranteed to work due to
+changes made in the tty layer in 2.6. If you wish to take over maintenance of
+this driver, contact Michael Warfield <mhw@wittsend.com>.
+
+Changelog:
+----------
+11-01-2001: Original Document
+
+10-29-2004: Minor misspelling & format fix, update status of driver.
+ James Nelson <james4765@gmail.com>
+
+Computone Intelliport II/Plus Multiport Serial Driver
+-----------------------------------------------------
+
+Release Notes For Linux Kernel 2.2 and higher.
+These notes are for the drivers which have already been integrated into the
+kernel and have been tested on Linux kernels 2.0, 2.2, 2.3, and 2.4.
+
+Version: 1.2.14
+Date: 11/01/2001
+Historical Author: Andrew Manison <amanison@america.net>
+Primary Author: Doug McNash
+Support: support@computone.com
+Fixes and Updates: Mike Warfield <mhw@wittsend.com>
+
+This file assumes that you are using the Computone drivers which are
+integrated into the kernel sources. For updating the drivers or installing
+drivers into kernels which do not already have Computone drivers, please
+refer to the instructions in the README.computone file in the driver patch.
+
+
+1. INTRODUCTION
+
+This driver supports the entire family of Intelliport II/Plus controllers
+with the exception of the MicroChannel controllers. It does not support
+products previous to the Intelliport II.
+
+This driver was developed on the v2.0.x Linux tree and has been tested up
+to v2.4.14; it will probably not work with earlier v1.X kernels,.
+
+
+2. QUICK INSTALLATION
+
+Hardware - If you have an ISA card, find a free interrupt and io port.
+ List those in use with `cat /proc/interrupts` and
+ `cat /proc/ioports`. Set the card dip switches to a free
+ address. You may need to configure your BIOS to reserve an
+ irq for an ISA card. PCI and EISA parameters are set
+ automagically. Insert card into computer with the power off
+ before or after drivers installation.
+
+ Note the hardware address from the Computone ISA cards installed into
+ the system. These are required for editing ip2.c or editing
+ /etc/modprobe.conf, or for specification on the modprobe
+ command line.
+
+ Note that the /etc/modules.conf should be used for older (pre-2.6)
+ kernels.
+
+Software -
+
+Module installation:
+
+a) Determine free irq/address to use if any (configure BIOS if need be)
+b) Run "make config" or "make menuconfig" or "make xconfig"
+ Select (m) module for CONFIG_COMPUTONE under character
+ devices. CONFIG_PCI and CONFIG_MODULES also may need to be set.
+c) Set address on ISA cards then:
+ edit /usr/src/linux/drivers/char/ip2.c if needed
+ or
+ edit /etc/modprobe.conf if needed (module).
+ or both to match this setting.
+d) Run "make modules"
+e) Run "make modules_install"
+f) Run "/sbin/depmod -a"
+g) install driver using `modprobe ip2 <options>` (options listed below)
+h) run ip2mkdev (either the script below or the binary version)
+
+
+Kernel installation:
+
+a) Determine free irq/address to use if any (configure BIOS if need be)
+b) Run "make config" or "make menuconfig" or "make xconfig"
+ Select (y) kernel for CONFIG_COMPUTONE under character
+ devices. CONFIG_PCI may need to be set if you have PCI bus.
+c) Set address on ISA cards then:
+ edit /usr/src/linux/drivers/char/ip2.c
+ (Optional - may be specified on kernel command line now)
+d) Run "make zImage" or whatever target you prefer.
+e) mv /usr/src/linux/arch/i386/boot/zImage to /boot.
+f) Add new config for this kernel into /etc/lilo.conf, run "lilo"
+ or copy to a floppy disk and boot from that floppy disk.
+g) Reboot using this kernel
+h) run ip2mkdev (either the script below or the binary version)
+
+Kernel command line options:
+
+When compiling the driver into the kernel, io and irq may be
+compiled into the driver by editing ip2.c and setting the values for
+io and irq in the appropriate array. An alternative is to specify
+a command line parameter to the kernel at boot up.
+
+ ip2=io0,irq0,io1,irq1,io2,irq2,io3,irq3
+
+Note that this order is very different from the specifications for the
+modload parameters which have separate IRQ and IO specifiers.
+
+The io port also selects PCI (1) and EISA (2) boards.
+
+ io=0 No board
+ io=1 PCI board
+ io=2 EISA board
+ else ISA board io address
+
+You only need to specify the boards which are present.
+
+ Examples:
+
+ 2 PCI boards:
+
+ ip2=1,0,1,0
+
+ 1 ISA board at 0x310 irq 5:
+
+ ip2=0x310,5
+
+This can be added to and "append" option in lilo.conf similar to this:
+
+ append="ip2=1,0,1,0"
+
+
+3. INSTALLATION
+
+Previously, the driver sources were packaged with a set of patch files
+to update the character drivers' makefile and configuration file, and other
+kernel source files. A build script (ip2build) was included which applies
+the patches if needed, and build any utilities needed.
+What you receive may be a single patch file in conventional kernel
+patch format build script. That form can also be applied by
+running patch -p1 < ThePatchFile. Otherwise run ip2build.
+
+The driver can be installed as a module (recommended) or built into the
+kernel. This is selected as for other drivers through the `make config`
+command from the root of the Linux source tree. If the driver is built
+into the kernel you will need to edit the file ip2.c to match the boards
+you are installing. See that file for instructions. If the driver is
+installed as a module the configuration can also be specified on the
+modprobe command line as follows:
+
+ modprobe ip2 irq=irq1,irq2,irq3,irq4 io=addr1,addr2,addr3,addr4
+
+where irqnum is one of the valid Intelliport II interrupts (3,4,5,7,10,11,
+12,15) and addr1-4 are the base addresses for up to four controllers. If
+the irqs are not specified the driver uses the default in ip2.c (which
+selects polled mode). If no base addresses are specified the defaults in
+ip2.c are used. If you are autoloading the driver module with kerneld or
+kmod the base addresses and interrupt number must also be set in ip2.c
+and recompile or just insert and options line in /etc/modprobe.conf or both.
+The options line is equivalent to the command line and takes precedence over
+what is in ip2.c.
+
+/etc/modprobe.conf sample:
+ options ip2 io=1,0x328 irq=1,10
+ alias char-major-71 ip2
+ alias char-major-72 ip2
+ alias char-major-73 ip2
+
+The equivalent in ip2.c:
+
+static int io[IP2_MAX_BOARDS]= { 1, 0x328, 0, 0 };
+static int irq[IP2_MAX_BOARDS] = { 1, 10, -1, -1 };
+
+The equivalent for the kernel command line (in lilo.conf):
+
+ append="ip2=1,1,0x328,10"
+
+
+Note: Both io and irq should be updated to reflect YOUR system. An "io"
+ address of 1 or 2 indicates a PCI or EISA card in the board table.
+ The PCI or EISA irq will be assigned automatically.
+
+Specifying an invalid or in-use irq will default the driver into
+running in polled mode for that card. If all irq entries are 0 then
+all cards will operate in polled mode.
+
+If you select the driver as part of the kernel run :
+
+ make zlilo (or whatever you do to create a bootable kernel)
+
+If you selected a module run :
+
+ make modules && make modules_install
+
+The utility ip2mkdev (see 5 and 7 below) creates all the device nodes
+required by the driver. For a device to be created it must be configured
+in the driver and the board must be installed. Only devices corresponding
+to real IntelliPort II ports are created. With multiple boards and expansion
+boxes this will leave gaps in the sequence of device names. ip2mkdev uses
+Linux tty naming conventions: ttyF0 - ttyF255 for normal devices, and
+cuf0 - cuf255 for callout devices.
+
+
+4. USING THE DRIVERS
+
+As noted above, the driver implements the ports in accordance with Linux
+conventions, and the devices should be interchangeable with the standard
+serial devices. (This is a key point for problem reporting: please make
+sure that what you are trying do works on the ttySx/cuax ports first; then
+tell us what went wrong with the ip2 ports!)
+
+Higher speeds can be obtained using the setserial utility which remaps
+38,400 bps (extb) to 57,600 bps, 115,200 bps, or a custom speed.
+Intelliport II installations using the PowerPort expansion module can
+use the custom speed setting to select the highest speeds: 153,600 bps,
+230,400 bps, 307,200 bps, 460,800bps and 921,600 bps. The base for
+custom baud rate configuration is fixed at 921,600 for cards/expansion
+modules with ST654's and 115200 for those with Cirrus CD1400's. This
+corresponds to the maximum bit rates those chips are capable.
+For example if the baud base is 921600 and the baud divisor is 18 then
+the custom rate is 921600/18 = 51200 bps. See the setserial man page for
+complete details. Of course if stty accepts the higher rates now you can
+use that as well as the standard ioctls().
+
+
+5. ip2mkdev and assorted utilities...
+
+Several utilities, including the source for a binary ip2mkdev utility are
+available under .../drivers/char/ip2. These can be build by changing to
+that directory and typing "make" after the kernel has be built. If you do
+not wish to compile the binary utilities, the shell script below can be
+cut out and run as "ip2mkdev" to create the necessary device files. To
+use the ip2mkdev script, you must have procfs enabled and the proc file
+system mounted on /proc.
+
+
+6. NOTES
+
+This is a release version of the driver, but it is impossible to test it
+in all configurations of Linux. If there is any anomalous behaviour that
+does not match the standard serial port's behaviour please let us know.
+
+
+7. ip2mkdev shell script
+
+Previously, this script was simply attached here. It is now attached as a
+shar archive to make it easier to extract the script from the documentation.
+To create the ip2mkdev shell script change to a convenient directory (/tmp
+works just fine) and run the following command:
+
+ unshar Documentation/serial/computone.txt
+ (This file)
+
+You should now have a file ip2mkdev in your current working directory with
+permissions set to execute. Running that script with then create the
+necessary devices for the Computone boards, interfaces, and ports which
+are present on you system at the time it is run.
+
+
+#!/bin/sh
+# This is a shell archive (produced by GNU sharutils 4.2.1).
+# To extract the files from this archive, save it to some FILE, remove
+# everything before the `!/bin/sh' line above, then type `sh FILE'.
+#
+# Made on 2001-10-29 10:32 EST by <mhw@alcove.wittsend.com>.
+# Source directory was `/home2/src/tmp'.
+#
+# Existing files will *not* be overwritten unless `-c' is specified.
+#
+# This shar contains:
+# length mode name
+# ------ ---------- ------------------------------------------
+# 4251 -rwxr-xr-x ip2mkdev
+#
+save_IFS="${IFS}"
+IFS="${IFS}:"
+gettext_dir=FAILED
+locale_dir=FAILED
+first_param="$1"
+for dir in $PATH
+do
+ if test "$gettext_dir" = FAILED && test -f $dir/gettext \
+ && ($dir/gettext --version >/dev/null 2>&1)
+ then
+ set `$dir/gettext --version 2>&1`
+ if test "$3" = GNU
+ then
+ gettext_dir=$dir
+ fi
+ fi
+ if test "$locale_dir" = FAILED && test -f $dir/shar \
+ && ($dir/shar --print-text-domain-dir >/dev/null 2>&1)
+ then
+ locale_dir=`$dir/shar --print-text-domain-dir`
+ fi
+done
+IFS="$save_IFS"
+if test "$locale_dir" = FAILED || test "$gettext_dir" = FAILED
+then
+ echo=echo
+else
+ TEXTDOMAINDIR=$locale_dir
+ export TEXTDOMAINDIR
+ TEXTDOMAIN=sharutils
+ export TEXTDOMAIN
+ echo="$gettext_dir/gettext -s"
+fi
+if touch -am -t 200112312359.59 $$.touch >/dev/null 2>&1 && test ! -f 200112312359.59 -a -f $$.touch; then
+ shar_touch='touch -am -t $1$2$3$4$5$6.$7 "$8"'
+elif touch -am 123123592001.59 $$.touch >/dev/null 2>&1 && test ! -f 123123592001.59 -a ! -f 123123592001.5 -a -f $$.touch; then
+ shar_touch='touch -am $3$4$5$6$1$2.$7 "$8"'
+elif touch -am 1231235901 $$.touch >/dev/null 2>&1 && test ! -f 1231235901 -a -f $$.touch; then
+ shar_touch='touch -am $3$4$5$6$2 "$8"'
+else
+ shar_touch=:
+ echo
+ $echo 'WARNING: not restoring timestamps. Consider getting and'
+ $echo "installing GNU \`touch', distributed in GNU File Utilities..."
+ echo
+fi
+rm -f 200112312359.59 123123592001.59 123123592001.5 1231235901 $$.touch
+#
+if mkdir _sh17581; then
+ $echo 'x -' 'creating lock directory'
+else
+ $echo 'failed to create lock directory'
+ exit 1
+fi
+# ============= ip2mkdev ==============
+if test -f 'ip2mkdev' && test "$first_param" != -c; then
+ $echo 'x -' SKIPPING 'ip2mkdev' '(file already exists)'
+else
+ $echo 'x -' extracting 'ip2mkdev' '(text)'
+ sed 's/^X//' << 'SHAR_EOF' > 'ip2mkdev' &&
+#!/bin/sh -
+#
+# ip2mkdev
+#
+# Make or remove devices as needed for Computone Intelliport drivers
+#
+# First rule! If the dev file exists and you need it, don't mess
+# with it. That prevents us from screwing up open ttys, ownership
+# and permissions on a running system!
+#
+# This script will NOT remove devices that no longer exist if their
+# board or interface box has been removed. If you want to get rid
+# of them, you can manually do an "rm -f /dev/ttyF* /dev/cuaf*"
+# before running this script. Running this script will then recreate
+# all the valid devices.
+#
+# Michael H. Warfield
+# /\/\|=mhw=|\/\/
+# mhw@wittsend.com
+#
+# Updated 10/29/2000 for version 1.2.13 naming convention
+# under devfs. /\/\|=mhw=|\/\/
+#
+# Updated 03/09/2000 for devfs support in ip2 drivers. /\/\|=mhw=|\/\/
+#
+X
+if test -d /dev/ip2 ; then
+# This is devfs mode... We don't do anything except create symlinks
+# from the real devices to the old names!
+X cd /dev
+X echo "Creating symbolic links to devfs devices"
+X for i in `ls ip2` ; do
+X if test ! -L ip2$i ; then
+X # Remove it incase it wasn't a symlink (old device)
+X rm -f ip2$i
+X ln -s ip2/$i ip2$i
+X fi
+X done
+X for i in `( cd tts ; ls F* )` ; do
+X if test ! -L tty$i ; then
+X # Remove it incase it wasn't a symlink (old device)
+X rm -f tty$i
+X ln -s tts/$i tty$i
+X fi
+X done
+X for i in `( cd cua ; ls F* )` ; do
+X DEVNUMBER=`expr $i : 'F\(.*\)'`
+X if test ! -L cuf$DEVNUMBER ; then
+X # Remove it incase it wasn't a symlink (old device)
+X rm -f cuf$DEVNUMBER
+X ln -s cua/$i cuf$DEVNUMBER
+X fi
+X done
+X exit 0
+fi
+X
+if test ! -f /proc/tty/drivers
+then
+X echo "\
+Unable to check driver status.
+Make sure proc file system is mounted."
+X
+X exit 255
+fi
+X
+if test ! -f /proc/tty/driver/ip2
+then
+X echo "\
+Unable to locate ip2 proc file.
+Attempting to load driver"
+X
+X if /sbin/insmod ip2
+X then
+X if test ! -f /proc/tty/driver/ip2
+X then
+X echo "\
+Unable to locate ip2 proc file after loading driver.
+Driver initialization failure or driver version error.
+"
+X exit 255
+X fi
+X else
+X echo "Unable to load ip2 driver."
+X exit 255
+X fi
+fi
+X
+# Ok... So we got the driver loaded and we can locate the procfs files.
+# Next we need our major numbers.
+X
+TTYMAJOR=`sed -e '/^ip2/!d' -e '/\/dev\/tt/!d' -e 's/.*tt[^ ]*[ ]*\([0-9]*\)[ ]*.*/\1/' < /proc/tty/drivers`
+CUAMAJOR=`sed -e '/^ip2/!d' -e '/\/dev\/cu/!d' -e 's/.*cu[^ ]*[ ]*\([0-9]*\)[ ]*.*/\1/' < /proc/tty/drivers`
+BRDMAJOR=`sed -e '/^Driver: /!d' -e 's/.*IMajor=\([0-9]*\)[ ]*.*/\1/' < /proc/tty/driver/ip2`
+X
+echo "\
+TTYMAJOR = $TTYMAJOR
+CUAMAJOR = $CUAMAJOR
+BRDMAJOR = $BRDMAJOR
+"
+X
+# Ok... Now we should know our major numbers, if appropriate...
+# Now we need our boards and start the device loops.
+X
+grep '^Board [0-9]:' /proc/tty/driver/ip2 | while read token number type alltherest
+do
+X # The test for blank "type" will catch the stats lead-in lines
+X # if they exist in the file
+X if test "$type" = "vacant" -o "$type" = "Vacant" -o "$type" = ""
+X then
+X continue
+X fi
+X
+X BOARDNO=`expr "$number" : '\([0-9]\):'`
+X PORTS=`expr "$alltherest" : '.*ports=\([0-9]*\)' | tr ',' ' '`
+X MINORS=`expr "$alltherest" : '.*minors=\([0-9,]*\)' | tr ',' ' '`
+X
+X if test "$BOARDNO" = "" -o "$PORTS" = ""
+X then
+# This may be a bug. We should at least get this much information
+X echo "Unable to process board line"
+X continue
+X fi
+X
+X if test "$MINORS" = ""
+X then
+# Silently skip this one. This board seems to have no boxes
+X continue
+X fi
+X
+X echo "board $BOARDNO: $type ports = $PORTS; port numbers = $MINORS"
+X
+X if test "$BRDMAJOR" != ""
+X then
+X BRDMINOR=`expr $BOARDNO \* 4`
+X STSMINOR=`expr $BRDMINOR + 1`
+X if test ! -c /dev/ip2ipl$BOARDNO ; then
+X mknod /dev/ip2ipl$BOARDNO c $BRDMAJOR $BRDMINOR
+X fi
+X if test ! -c /dev/ip2stat$BOARDNO ; then
+X mknod /dev/ip2stat$BOARDNO c $BRDMAJOR $STSMINOR
+X fi
+X fi
+X
+X if test "$TTYMAJOR" != ""
+X then
+X PORTNO=$BOARDBASE
+X
+X for PORTNO in $MINORS
+X do
+X if test ! -c /dev/ttyF$PORTNO ; then
+X # We got the hardware but no device - make it
+X mknod /dev/ttyF$PORTNO c $TTYMAJOR $PORTNO
+X fi
+X done
+X fi
+X
+X if test "$CUAMAJOR" != ""
+X then
+X PORTNO=$BOARDBASE
+X
+X for PORTNO in $MINORS
+X do
+X if test ! -c /dev/cuf$PORTNO ; then
+X # We got the hardware but no device - make it
+X mknod /dev/cuf$PORTNO c $CUAMAJOR $PORTNO
+X fi
+X done
+X fi
+done
+X
+Xexit 0
+SHAR_EOF
+ (set 20 01 10 29 10 32 01 'ip2mkdev'; eval "$shar_touch") &&
+ chmod 0755 'ip2mkdev' ||
+ $echo 'restore of' 'ip2mkdev' 'failed'
+ if ( md5sum --help 2>&1 | grep 'sage: md5sum \[' ) >/dev/null 2>&1 \
+ && ( md5sum --version 2>&1 | grep -v 'textutils 1.12' ) >/dev/null; then
+ md5sum -c << SHAR_EOF >/dev/null 2>&1 \
+ || $echo 'ip2mkdev:' 'MD5 check failed'
+cb5717134509f38bad9fde6b1f79b4a4 ip2mkdev
+SHAR_EOF
+ else
+ shar_count="`LC_ALL= LC_CTYPE= LANG= wc -c < 'ip2mkdev'`"
+ test 4251 -eq "$shar_count" ||
+ $echo 'ip2mkdev:' 'original size' '4251,' 'current size' "$shar_count!"
+ fi
+fi
+rm -fr _sh17581
+exit 0
diff --git a/Documentation/serial/digiepca.txt b/Documentation/serial/digiepca.txt
new file mode 100644
index 0000000..f2560e2
--- /dev/null
+++ b/Documentation/serial/digiepca.txt
@@ -0,0 +1,98 @@
+NOTE: This driver is obsolete. Digi provides a 2.6 driver (dgdm) at
+http://www.digi.com for PCI cards. They no longer maintain this driver,
+and have no 2.6 driver for ISA cards.
+
+This driver requires a number of user-space tools. They can be acquired from
+http://www.digi.com, but only works with 2.4 kernels.
+
+
+The Digi Intl. epca driver.
+----------------------------
+The Digi Intl. epca driver for Linux supports the following boards:
+
+Digi PC/Xem, PC/Xr, PC/Xe, PC/Xi, PC/Xeve
+Digi EISA/Xem, PCI/Xem, PCI/Xr
+
+Limitations:
+------------
+Currently the driver only autoprobes for supported PCI boards.
+
+The Linux MAKEDEV command does not support generating the Digiboard
+Devices. Users executing digiConfig to setup EISA and PC series cards
+will have their device nodes automatically constructed (cud?? for ~CLOCAL,
+and ttyD?? for CLOCAL). Users wishing to boot their board from the LILO
+prompt, or those users booting PCI cards may use buildDIGI to construct
+the necessary nodes.
+
+Notes:
+------
+This driver may be configured via LILO. For users who have already configured
+their driver using digiConfig, configuring from LILO will override previous
+settings. Multiple boards may be configured by issuing multiple LILO command
+lines. For examples see the bottom of this document.
+
+Device names start at 0 and continue up. Beware of this as previous Digi
+drivers started device names with 1.
+
+PCI boards are auto-detected and configured by the driver. PCI boards will
+be allocated device numbers (internally) beginning with the lowest PCI slot
+first. In other words a PCI card in slot 3 will always have higher device
+nodes than a PCI card in slot 1.
+
+LILO config examples:
+---------------------
+Using LILO's APPEND command, a string of comma separated identifiers or
+integers can be used to configure supported boards. The six values in order
+are:
+
+ Enable/Disable this card or Override,
+ Type of card: PC/Xe (AccelePort) (0), PC/Xeve (1), PC/Xem or PC/Xr (2),
+ EISA/Xem (3), PC/64Xe (4), PC/Xi (5),
+ Enable/Disable alternate pin arrangement,
+ Number of ports on this card,
+ I/O Port where card is configured (in HEX if using string identifiers),
+ Base of memory window (in HEX if using string identifiers),
+
+NOTE : PCI boards are auto-detected and configured. Do not attempt to
+configure PCI boards with the LILO append command. If you wish to override
+previous configuration data (As set by digiConfig), but you do not wish to
+configure any specific card (Example if there are PCI cards in the system)
+the following override command will accomplish this:
+-> append="digi=2"
+
+Samples:
+ append="digiepca=E,PC/Xe,D,16,200,D0000"
+ or
+ append="digi=1,0,0,16,512,851968"
+
+Supporting Tools:
+-----------------
+Supporting tools include digiDload, digiConfig, buildPCI, and ditty. See
+drivers/char/README.epca for more details. Note,
+this driver REQUIRES that digiDload be executed prior to it being used.
+Failure to do this will result in an ENODEV error.
+
+Documentation:
+--------------
+Complete documentation for this product may be found in the tool package.
+
+Sources of information and support:
+-----------------------------------
+Digi Intl. support site for this product:
+
+-> http://www.digi.com
+
+Acknowledgments:
+----------------
+Much of this work (And even text) was derived from a similar document
+supporting the original public domain DigiBoard driver Copyright (C)
+1994,1995 Troy De Jongh. Many thanks to Christoph Lameter
+(christoph@lameter.com) and Mike McLagan (mike.mclagan@linux.org) who authored
+and contributed to the original document.
+
+Changelog:
+----------
+10-29-04: Update status of driver, remove dead links in document
+ James Nelson <james4765@gmail.com>
+
+2000 (?) Original Document
diff --git a/Documentation/serial/driver b/Documentation/serial/driver
new file mode 100644
index 0000000..77ba0af
--- /dev/null
+++ b/Documentation/serial/driver
@@ -0,0 +1,397 @@
+
+ Low Level Serial API
+ --------------------
+
+
+This document is meant as a brief overview of some aspects of the new serial
+driver. It is not complete, any questions you have should be directed to
+<rmk@arm.linux.org.uk>
+
+The reference implementation is contained within amba_pl011.c.
+
+
+
+Low Level Serial Hardware Driver
+--------------------------------
+
+The low level serial hardware driver is responsible for supplying port
+information (defined by uart_port) and a set of control methods (defined
+by uart_ops) to the core serial driver. The low level driver is also
+responsible for handling interrupts for the port, and providing any
+console support.
+
+
+Console Support
+---------------
+
+The serial core provides a few helper functions. This includes identifing
+the correct port structure (via uart_get_console) and decoding command line
+arguments (uart_parse_options).
+
+There is also a helper function (uart_write_console) which performs a
+character by character write, translating newlines to CRLF sequences.
+Driver writers are recommended to use this function rather than implementing
+their own version.
+
+
+Locking
+-------
+
+It is the responsibility of the low level hardware driver to perform the
+necessary locking using port->lock. There are some exceptions (which
+are described in the uart_ops listing below.)
+
+There are three locks. A per-port spinlock, a per-port tmpbuf semaphore,
+and an overall semaphore.
+
+From the core driver perspective, the port->lock locks the following
+data:
+
+ port->mctrl
+ port->icount
+ info->xmit.head (circ->head)
+ info->xmit.tail (circ->tail)
+
+The low level driver is free to use this lock to provide any additional
+locking.
+
+The core driver uses the info->tmpbuf_sem lock to prevent multi-threaded
+access to the info->tmpbuf bouncebuffer used for port writes.
+
+The port_sem semaphore is used to protect against ports being added/
+removed or reconfigured at inappropriate times.
+
+
+uart_ops
+--------
+
+The uart_ops structure is the main interface between serial_core and the
+hardware specific driver. It contains all the methods to control the
+hardware.
+
+ tx_empty(port)
+ This function tests whether the transmitter fifo and shifter
+ for the port described by 'port' is empty. If it is empty,
+ this function should return TIOCSER_TEMT, otherwise return 0.
+ If the port does not support this operation, then it should
+ return TIOCSER_TEMT.
+
+ Locking: none.
+ Interrupts: caller dependent.
+ This call must not sleep
+
+ set_mctrl(port, mctrl)
+ This function sets the modem control lines for port described
+ by 'port' to the state described by mctrl. The relevant bits
+ of mctrl are:
+ - TIOCM_RTS RTS signal.
+ - TIOCM_DTR DTR signal.
+ - TIOCM_OUT1 OUT1 signal.
+ - TIOCM_OUT2 OUT2 signal.
+ - TIOCM_LOOP Set the port into loopback mode.
+ If the appropriate bit is set, the signal should be driven
+ active. If the bit is clear, the signal should be driven
+ inactive.
+
+ Locking: port->lock taken.
+ Interrupts: locally disabled.
+ This call must not sleep
+
+ get_mctrl(port)
+ Returns the current state of modem control inputs. The state
+ of the outputs should not be returned, since the core keeps
+ track of their state. The state information should include:
+ - TIOCM_DCD state of DCD signal
+ - TIOCM_CTS state of CTS signal
+ - TIOCM_DSR state of DSR signal
+ - TIOCM_RI state of RI signal
+ The bit is set if the signal is currently driven active. If
+ the port does not support CTS, DCD or DSR, the driver should
+ indicate that the signal is permanently active. If RI is
+ not available, the signal should not be indicated as active.
+
+ Locking: port->lock taken.
+ Interrupts: locally disabled.
+ This call must not sleep
+
+ stop_tx(port)
+ Stop transmitting characters. This might be due to the CTS
+ line becoming inactive or the tty layer indicating we want
+ to stop transmission due to an XOFF character.
+
+ The driver should stop transmitting characters as soon as
+ possible.
+
+ Locking: port->lock taken.
+ Interrupts: locally disabled.
+ This call must not sleep
+
+ start_tx(port)
+ Start transmitting characters.
+
+ Locking: port->lock taken.
+ Interrupts: locally disabled.
+ This call must not sleep
+
+ stop_rx(port)
+ Stop receiving characters; the port is in the process of
+ being closed.
+
+ Locking: port->lock taken.
+ Interrupts: locally disabled.
+ This call must not sleep
+
+ enable_ms(port)
+ Enable the modem status interrupts.
+
+ This method may be called multiple times. Modem status
+ interrupts should be disabled when the shutdown method is
+ called.
+
+ Locking: port->lock taken.
+ Interrupts: locally disabled.
+ This call must not sleep
+
+ break_ctl(port,ctl)
+ Control the transmission of a break signal. If ctl is
+ nonzero, the break signal should be transmitted. The signal
+ should be terminated when another call is made with a zero
+ ctl.
+
+ Locking: none.
+ Interrupts: caller dependent.
+ This call must not sleep
+
+ startup(port)
+ Grab any interrupt resources and initialise any low level driver
+ state. Enable the port for reception. It should not activate
+ RTS nor DTR; this will be done via a separate call to set_mctrl.
+
+ This method will only be called when the port is initially opened.
+
+ Locking: port_sem taken.
+ Interrupts: globally disabled.
+
+ shutdown(port)
+ Disable the port, disable any break condition that may be in
+ effect, and free any interrupt resources. It should not disable
+ RTS nor DTR; this will have already been done via a separate
+ call to set_mctrl.
+
+ Drivers must not access port->info once this call has completed.
+
+ This method will only be called when there are no more users of
+ this port.
+
+ Locking: port_sem taken.
+ Interrupts: caller dependent.
+
+ flush_buffer(port)
+ Flush any write buffers, reset any DMA state and stop any
+ ongoing DMA transfers.
+
+ This will be called whenever the port->info->xmit circular
+ buffer is cleared.
+
+ Locking: port->lock taken.
+ Interrupts: locally disabled.
+ This call must not sleep
+
+ set_termios(port,termios,oldtermios)
+ Change the port parameters, including word length, parity, stop
+ bits. Update read_status_mask and ignore_status_mask to indicate
+ the types of events we are interested in receiving. Relevant
+ termios->c_cflag bits are:
+ CSIZE - word size
+ CSTOPB - 2 stop bits
+ PARENB - parity enable
+ PARODD - odd parity (when PARENB is in force)
+ CREAD - enable reception of characters (if not set,
+ still receive characters from the port, but
+ throw them away.
+ CRTSCTS - if set, enable CTS status change reporting
+ CLOCAL - if not set, enable modem status change
+ reporting.
+ Relevant termios->c_iflag bits are:
+ INPCK - enable frame and parity error events to be
+ passed to the TTY layer.
+ BRKINT
+ PARMRK - both of these enable break events to be
+ passed to the TTY layer.
+
+ IGNPAR - ignore parity and framing errors
+ IGNBRK - ignore break errors, If IGNPAR is also
+ set, ignore overrun errors as well.
+ The interaction of the iflag bits is as follows (parity error
+ given as an example):
+ Parity error INPCK IGNPAR
+ n/a 0 n/a character received, marked as
+ TTY_NORMAL
+ None 1 n/a character received, marked as
+ TTY_NORMAL
+ Yes 1 0 character received, marked as
+ TTY_PARITY
+ Yes 1 1 character discarded
+
+ Other flags may be used (eg, xon/xoff characters) if your
+ hardware supports hardware "soft" flow control.
+
+ Locking: none.
+ Interrupts: caller dependent.
+ This call must not sleep
+
+ pm(port,state,oldstate)
+ Perform any power management related activities on the specified
+ port. State indicates the new state (defined by ACPI D0-D3),
+ oldstate indicates the previous state. Essentially, D0 means
+ fully on, D3 means powered down.
+
+ This function should not be used to grab any resources.
+
+ This will be called when the port is initially opened and finally
+ closed, except when the port is also the system console. This
+ will occur even if CONFIG_PM is not set.
+
+ Locking: none.
+ Interrupts: caller dependent.
+
+ type(port)
+ Return a pointer to a string constant describing the specified
+ port, or return NULL, in which case the string 'unknown' is
+ substituted.
+
+ Locking: none.
+ Interrupts: caller dependent.
+
+ release_port(port)
+ Release any memory and IO region resources currently in use by
+ the port.
+
+ Locking: none.
+ Interrupts: caller dependent.
+
+ request_port(port)
+ Request any memory and IO region resources required by the port.
+ If any fail, no resources should be registered when this function
+ returns, and it should return -EBUSY on failure.
+
+ Locking: none.
+ Interrupts: caller dependent.
+
+ config_port(port,type)
+ Perform any autoconfiguration steps required for the port. `type`
+ contains a bit mask of the required configuration. UART_CONFIG_TYPE
+ indicates that the port requires detection and identification.
+ port->type should be set to the type found, or PORT_UNKNOWN if
+ no port was detected.
+
+ UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
+ which should be probed using standard kernel autoprobing techniques.
+ This is not necessary on platforms where ports have interrupts
+ internally hard wired (eg, system on a chip implementations).
+
+ Locking: none.
+ Interrupts: caller dependent.
+
+ verify_port(port,serinfo)
+ Verify the new serial port information contained within serinfo is
+ suitable for this port type.
+
+ Locking: none.
+ Interrupts: caller dependent.
+
+ ioctl(port,cmd,arg)
+ Perform any port specific IOCTLs. IOCTL commands must be defined
+ using the standard numbering system found in <asm/ioctl.h>
+
+ Locking: none.
+ Interrupts: caller dependent.
+
+Other functions
+---------------
+
+uart_update_timeout(port,cflag,baud)
+ Update the FIFO drain timeout, port->timeout, according to the
+ number of bits, parity, stop bits and baud rate.
+
+ Locking: caller is expected to take port->lock
+ Interrupts: n/a
+
+uart_get_baud_rate(port,termios,old,min,max)
+ Return the numeric baud rate for the specified termios, taking
+ account of the special 38400 baud "kludge". The B0 baud rate
+ is mapped to 9600 baud.
+
+ If the baud rate is not within min..max, then if old is non-NULL,
+ the original baud rate will be tried. If that exceeds the
+ min..max constraint, 9600 baud will be returned. termios will
+ be updated to the baud rate in use.
+
+ Note: min..max must always allow 9600 baud to be selected.
+
+ Locking: caller dependent.
+ Interrupts: n/a
+
+uart_get_divisor(port,baud)
+ Return the divsor (baud_base / baud) for the specified baud
+ rate, appropriately rounded.
+
+ If 38400 baud and custom divisor is selected, return the
+ custom divisor instead.
+
+ Locking: caller dependent.
+ Interrupts: n/a
+
+uart_match_port(port1,port2)
+ This utility function can be used to determine whether two
+ uart_port structures describe the same port.
+
+ Locking: n/a
+ Interrupts: n/a
+
+uart_write_wakeup(port)
+ A driver is expected to call this function when the number of
+ characters in the transmit buffer have dropped below a threshold.
+
+ Locking: port->lock should be held.
+ Interrupts: n/a
+
+uart_register_driver(drv)
+ Register a uart driver with the core driver. We in turn register
+ with the tty layer, and initialise the core driver per-port state.
+
+ drv->port should be NULL, and the per-port structures should be
+ registered using uart_add_one_port after this call has succeeded.
+
+ Locking: none
+ Interrupts: enabled
+
+uart_unregister_driver()
+ Remove all references to a driver from the core driver. The low
+ level driver must have removed all its ports via the
+ uart_remove_one_port() if it registered them with uart_add_one_port().
+
+ Locking: none
+ Interrupts: enabled
+
+uart_suspend_port()
+
+uart_resume_port()
+
+uart_add_one_port()
+
+uart_remove_one_port()
+
+Other notes
+-----------
+
+It is intended some day to drop the 'unused' entries from uart_port, and
+allow low level drivers to register their own individual uart_port's with
+the core. This will allow drivers to use uart_port as a pointer to a
+structure containing both the uart_port entry with their own extensions,
+thus:
+
+ struct my_port {
+ struct uart_port port;
+ int my_stuff;
+ };
diff --git a/Documentation/serial/hayes-esp.txt b/Documentation/serial/hayes-esp.txt
new file mode 100644
index 0000000..09b5d58
--- /dev/null
+++ b/Documentation/serial/hayes-esp.txt
@@ -0,0 +1,154 @@
+HAYES ESP DRIVER VERSION 2.1
+
+A big thanks to the people at Hayes, especially Alan Adamson. Their support
+has enabled me to provide enhancements to the driver.
+
+Please report your experiences with this driver to me (arobinso@nyx.net). I
+am looking for both positive and negative feedback.
+
+*** IMPORTANT CHANGES FOR 2.1 ***
+Support for PIO mode. Five situations will cause PIO mode to be used:
+1) A multiport card is detected. PIO mode will always be used. (8 port cards
+do not support DMA).
+2) The DMA channel is set to an invalid value (anything other than 1 or 3).
+3) The DMA buffer/channel could not be allocated. The port will revert to PIO
+mode until it is reopened.
+4) Less than a specified number of bytes need to be transferred to/from the
+FIFOs. PIO mode will be used for that transfer only.
+5) A port needs to do a DMA transfer and another port is already using the
+DMA channel. PIO mode will be used for that transfer only.
+
+Since the Hayes ESP seems to conflict with other cards (notably sound cards)
+when using DMA, DMA is turned off by default. To use DMA, it must be turned
+on explicitly, either with the "dma=" option described below or with
+setserial. A multiport card can be forced into DMA mode by using setserial;
+however, most multiport cards don't support DMA.
+
+The latest version of setserial allows the enhanced configuration of the ESP
+card to be viewed and modified.
+***
+
+This package contains the files needed to compile a module to support the Hayes
+ESP card. The drivers are basically a modified version of the serial drivers.
+
+Features:
+
+- Uses the enhanced mode of the ESP card, allowing a wider range of
+ interrupts and features than compatibility mode
+- Uses DMA and 16 bit PIO mode to transfer data to and from the ESP's FIFOs,
+ reducing CPU load
+- Supports primary and secondary ports
+
+
+If the driver is compiled as a module, the IRQs to use can be specified by
+using the irq= option. The format is:
+
+irq=[0x100],[0x140],[0x180],[0x200],[0x240],[0x280],[0x300],[0x380]
+
+The address in brackets is the base address of the card. The IRQ of
+nonexistent cards can be set to 0. If an IRQ of a card that does exist is set
+to 0, the driver will attempt to guess at the correct IRQ. For example, to set
+the IRQ of the card at address 0x300 to 12, the insmod command would be:
+
+insmod esp irq=0,0,0,0,0,0,12,0
+
+The custom divisor can be set by using the divisor= option. The format is the
+same as for the irq= option. Each divisor value is a series of hex digits,
+with each digit representing the divisor to use for a corresponding port. The
+divisor value is constructed RIGHT TO LEFT. Specifying a nonzero divisor value
+will automatically set the spd_cust flag. To calculate the divisor to use for
+a certain baud rate, divide the port's base baud (generally 921600) by the
+desired rate. For example, to set the divisor of the primary port at 0x300 to
+4 and the divisor of the secondary port at 0x308 to 8, the insmod command would
+be:
+
+insmod esp divisor=0,0,0,0,0,0,0x84,0
+
+The dma= option can be used to set the DMA channel. The channel can be either
+1 or 3. Specifying any other value will force the driver to use PIO mode.
+For example, to set the DMA channel to 3, the insmod command would be:
+
+insmod esp dma=3
+
+The rx_trigger= and tx_trigger= options can be used to set the FIFO trigger
+levels. They specify when the ESP card should send an interrupt. Larger
+values will decrease the number of interrupts; however, a value too high may
+result in data loss. Valid values are 1 through 1023, with 768 being the
+default. For example, to set the receive trigger level to 512 bytes and the
+transmit trigger level to 700 bytes, the insmod command would be:
+
+insmod esp rx_trigger=512 tx_trigger=700
+
+The flow_off= and flow_on= options can be used to set the hardware flow off/
+flow on levels. The flow on level must be lower than the flow off level, and
+the flow off level should be higher than rx_trigger. Valid values are 1
+through 1023, with 1016 being the default flow off level and 944 being the
+default flow on level. For example, to set the flow off level to 1000 bytes
+and the flow on level to 935 bytes, the insmod command would be:
+
+insmod esp flow_off=1000 flow_on=935
+
+The rx_timeout= option can be used to set the receive timeout value. This
+value indicates how long after receiving the last character that the ESP card
+should wait before signalling an interrupt. Valid values are 0 though 255,
+with 128 being the default. A value too high will increase latency, and a
+value too low will cause unnecessary interrupts. For example, to set the
+receive timeout to 255, the insmod command would be:
+
+insmod esp rx_timeout=255
+
+The pio_threshold= option sets the threshold (in number of characters) for
+using PIO mode instead of DMA mode. For example, if this value is 32,
+transfers of 32 bytes or less will always use PIO mode.
+
+insmod esp pio_threshold=32
+
+Multiple options can be listed on the insmod command line by separating each
+option with a space. For example:
+
+insmod esp dma=3 trigger=512
+
+The esp module can be automatically loaded when needed. To cause this to
+happen, add the following lines to /etc/modprobe.conf (replacing the last line
+with options for your configuration):
+
+alias char-major-57 esp
+alias char-major-58 esp
+options esp irq=0,0,0,0,0,0,3,0 divisor=0,0,0,0,0,0,0x4,0
+
+You may also need to run 'depmod -a'.
+
+Devices must be created manually. To create the devices, note the output from
+the module after it is inserted. The output will appear in the location where
+kernel messages usually appear (usually /var/adm/messages). Create two devices
+for each 'tty' mentioned, one with major of 57 and the other with major of 58.
+The minor number should be the same as the tty number reported. The commands
+would be (replace ? with the tty number):
+
+mknod /dev/ttyP? c 57 ?
+mknod /dev/cup? c 58 ?
+
+For example, if the following line appears:
+
+Oct 24 18:17:23 techno kernel: ttyP8 at 0x0140 (irq = 3) is an ESP primary port
+
+...two devices should be created:
+
+mknod /dev/ttyP8 c 57 8
+mknod /dev/cup8 c 58 8
+
+You may need to set the permissions on the devices:
+
+chmod 666 /dev/ttyP*
+chmod 666 /dev/cup*
+
+The ESP module and the serial module should not conflict (they can be used at
+the same time). After the ESP module has been loaded the ports on the ESP card
+will no longer be accessible by the serial driver.
+
+If I/O errors are experienced when accessing the port, check for IRQ and DMA
+conflicts ('cat /proc/interrupts' and 'cat /proc/dma' for a list of IRQs and
+DMAs currently in use).
+
+Enjoy!
+Andrew J. Robinson <arobinso@nyx.net>
diff --git a/Documentation/serial/moxa-smartio b/Documentation/serial/moxa-smartio
new file mode 100644
index 0000000..5337e80
--- /dev/null
+++ b/Documentation/serial/moxa-smartio
@@ -0,0 +1,523 @@
+=============================================================================
+ MOXA Smartio/Industio Family Device Driver Installation Guide
+ for Linux Kernel 2.4.x, 2.6.x
+ Copyright (C) 2008, Moxa Inc.
+=============================================================================
+Date: 01/21/2008
+
+Content
+
+1. Introduction
+2. System Requirement
+3. Installation
+ 3.1 Hardware installation
+ 3.2 Driver files
+ 3.3 Device naming convention
+ 3.4 Module driver configuration
+ 3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x.
+ 3.6 Custom configuration
+ 3.7 Verify driver installation
+4. Utilities
+5. Setserial
+6. Troubleshooting
+
+-----------------------------------------------------------------------------
+1. Introduction
+
+ The Smartio/Industio/UPCI family Linux driver supports following multiport
+ boards.
+
+ - 2 ports multiport board
+ CP-102U, CP-102UL, CP-102UF
+ CP-132U-I, CP-132UL,
+ CP-132, CP-132I, CP132S, CP-132IS,
+ CI-132, CI-132I, CI-132IS,
+ (C102H, C102HI, C102HIS, C102P, CP-102, CP-102S)
+
+ - 4 ports multiport board
+ CP-104EL,
+ CP-104UL, CP-104JU,
+ CP-134U, CP-134U-I,
+ C104H/PCI, C104HS/PCI,
+ CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL,
+ C104H, C104HS,
+ CI-104J, CI-104JS,
+ CI-134, CI-134I, CI-134IS,
+ (C114HI, CT-114I, C104P)
+ POS-104UL,
+ CB-114,
+ CB-134I
+
+ - 8 ports multiport board
+ CP-118EL, CP-168EL,
+ CP-118U, CP-168U,
+ C168H/PCI,
+ C168H, C168HS,
+ (C168P),
+ CB-108
+
+ This driver and installation procedure have been developed upon Linux Kernel
+ 2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order
+ to maintain compatibility, this version has also been properly tested with
+ RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if compatibility problem
+ occurs, please contact Moxa at support@moxa.com.tw.
+
+ In addition to device driver, useful utilities are also provided in this
+ version. They are
+ - msdiag Diagnostic program for displaying installed Moxa
+ Smartio/Industio boards.
+ - msmon Monitor program to observe data count and line status signals.
+ - msterm A simple terminal program which is useful in testing serial
+ ports.
+ - io-irq.exe Configuration program to setup ISA boards. Please note that
+ this program can only be executed under DOS.
+
+ All the drivers and utilities are published in form of source code under
+ GNU General Public License in this version. Please refer to GNU General
+ Public License announcement in each source code file for more detail.
+
+ In Moxa's Web sites, you may always find latest driver at http://web.moxa.com.
+
+ This version of driver can be installed as Loadable Module (Module driver)
+ or built-in into kernel (Static driver). You may refer to following
+ installation procedure for suitable one. Before you install the driver,
+ please refer to hardware installation procedure in the User's Manual.
+
+ We assume the user should be familiar with following documents.
+ - Serial-HOWTO
+ - Kernel-HOWTO
+
+-----------------------------------------------------------------------------
+2. System Requirement
+ - Hardware platform: Intel x86 machine
+ - Kernel version: 2.4.x or 2.6.x
+ - gcc version 2.72 or later
+ - Maximum 4 boards can be installed in combination
+
+-----------------------------------------------------------------------------
+3. Installation
+
+ 3.1 Hardware installation
+ 3.2 Driver files
+ 3.3 Device naming convention
+ 3.4 Module driver configuration
+ 3.5 Static driver configuration for Linux kernel 2.4.x, 2.6.x.
+ 3.6 Custom configuration
+ 3.7 Verify driver installation
+
+
+ 3.1 Hardware installation
+
+ There are two types of buses, ISA and PCI, for Smartio/Industio
+ family multiport board.
+
+ ISA board
+ ---------
+ You'll have to configure CAP address, I/O address, Interrupt Vector
+ as well as IRQ before installing this driver. Please refer to hardware
+ installation procedure in User's Manual before proceed any further.
+ Please make sure the JP1 is open after the ISA board is set properly.
+
+ PCI/UPCI board
+ --------------
+ You may need to adjust IRQ usage in BIOS to avoid from IRQ conflict
+ with other ISA devices. Please refer to hardware installation
+ procedure in User's Manual in advance.
+
+ PCI IRQ Sharing
+ -----------
+ Each port within the same multiport board shares the same IRQ. Up to
+ 4 Moxa Smartio/Industio PCI Family multiport boards can be installed
+ together on one system and they can share the same IRQ.
+
+
+ 3.2 Driver files
+
+ The driver file may be obtained from ftp, CD-ROM or floppy disk. The
+ first step, anyway, is to copy driver file "mxser.tgz" into specified
+ directory. e.g. /moxa. The execute commands as below.
+
+ # cd /
+ # mkdir moxa
+ # cd /moxa
+ # tar xvf /dev/fd0
+
+ or
+
+ # cd /
+ # mkdir moxa
+ # cd /moxa
+ # cp /mnt/cdrom/<driver directory>/mxser.tgz .
+ # tar xvfz mxser.tgz
+
+
+ 3.3 Device naming convention
+
+ You may find all the driver and utilities files in /moxa/mxser.
+ Following installation procedure depends on the model you'd like to
+ run the driver. If you prefer module driver, please refer to 3.4.
+ If static driver is required, please refer to 3.5.
+
+ Dialin and callout port
+ -----------------------
+ This driver remains traditional serial device properties. There are
+ two special file name for each serial port. One is dial-in port
+ which is named "ttyMxx". For callout port, the naming convention
+ is "cumxx".
+
+ Device naming when more than 2 boards installed
+ -----------------------------------------------
+ Naming convention for each Smartio/Industio multiport board is
+ pre-defined as below.
+
+ Board Num. Dial-in Port Callout port
+ 1st board ttyM0 - ttyM7 cum0 - cum7
+ 2nd board ttyM8 - ttyM15 cum8 - cum15
+ 3rd board ttyM16 - ttyM23 cum16 - cum23
+ 4th board ttyM24 - ttym31 cum24 - cum31
+
+
+ !!!!!!!!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ Under Kernel 2.6 the cum Device is Obsolete. So use ttyM*
+ device instead.
+ !!!!!!!!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+ Board sequence
+ --------------
+ This driver will activate ISA boards according to the parameter set
+ in the driver. After all specified ISA board activated, PCI board
+ will be installed in the system automatically driven.
+ Therefore the board number is sorted by the CAP address of ISA boards.
+ For PCI boards, their sequence will be after ISA boards and C168H/PCI
+ has higher priority than C104H/PCI boards.
+
+ 3.4 Module driver configuration
+ Module driver is easiest way to install. If you prefer static driver
+ installation, please skip this paragraph.
+
+
+ ------------- Prepare to use the MOXA driver--------------------
+ 3.4.1 Create tty device with correct major number
+ Before using MOXA driver, your system must have the tty devices
+ which are created with driver's major number. We offer one shell
+ script "msmknod" to simplify the procedure.
+ This step is only needed to be executed once. But you still
+ need to do this procedure when:
+ a. You change the driver's major number. Please refer the "3.7"
+ section.
+ b. Your total installed MOXA boards number is changed. Maybe you
+ add/delete one MOXA board.
+ c. You want to change the tty name. This needs to modify the
+ shell script "msmknod"
+
+ The procedure is:
+ # cd /moxa/mxser/driver
+ # ./msmknod
+
+ This shell script will require the major number for dial-in
+ device and callout device to create tty device. You also need
+ to specify the total installed MOXA board number. Default major
+ numbers for dial-in device and callout device are 30, 35. If
+ you need to change to other number, please refer section "3.7"
+ for more detailed procedure.
+ Msmknod will delete any special files occupying the same device
+ naming.
+
+ 3.4.2 Build the MOXA driver and utilities
+ Before using the MOXA driver and utilities, you need compile the
+ all the source code. This step is only need to be executed once.
+ But you still re-compile the source code if you modify the source
+ code. For example, if you change the driver's major number (see
+ "3.7" section), then you need to do this step again.
+
+ Find "Makefile" in /moxa/mxser, then run
+
+ # make clean; make install
+
+ !!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!
+ For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1:
+ # make clean; make installsp1
+
+ For Red Hat Enterprise Linux AS4/ES4/WS4:
+ # make clean; make installsp2
+ !!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!
+
+ The driver files "mxser.o" and utilities will be properly compiled
+ and copied to system directories respectively.
+
+ ------------- Load MOXA driver--------------------
+ 3.4.3 Load the MOXA driver
+
+ # modprobe mxser <argument>
+
+ will activate the module driver. You may run "lsmod" to check
+ if "mxser" is activated. If the MOXA board is ISA board, the
+ <argument> is needed. Please refer to section "3.4.5" for more
+ information.
+
+
+ ------------- Load MOXA driver on boot --------------------
+ 3.4.4 For the above description, you may manually execute
+ "modprobe mxser" to activate this driver and run
+ "rmmod mxser" to remove it.
+ However, it's better to have a boot time configuration to
+ eliminate manual operation. Boot time configuration can be
+ achieved by rc file. We offer one "rc.mxser" file to simplify
+ the procedure under "moxa/mxser/driver".
+
+ But if you use ISA board, please modify the "modprobe ..." command
+ to add the argument (see "3.4.5" section). After modifying the
+ rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser"
+ manually to make sure the modification is ok. If any error
+ encountered, please try to modify again. If the modification is
+ completed, follow the below step.
+
+ Run following command for setting rc files.
+
+ # cd /moxa/mxser/driver
+ # cp ./rc.mxser /etc/rc.d
+ # cd /etc/rc.d
+
+ Check "rc.serial" is existed or not. If "rc.serial" doesn't exist,
+ create it by vi, run "chmod 755 rc.serial" to change the permission.
+ Add "/etc/rc.d/rc.mxser" in last line,
+
+ Reboot and check if moxa.o activated by "lsmod" command.
+
+ 3.4.5. If you'd like to drive Smartio/Industio ISA boards in the system,
+ you'll have to add parameter to specify CAP address of given
+ board while activating "mxser.o". The format for parameters are
+ as follows.
+
+ modprobe mxser ioaddr=0x???,0x???,0x???,0x???
+ | | | |
+ | | | +- 4th ISA board
+ | | +------ 3rd ISA board
+ | +------------ 2nd ISA board
+ +------------------- 1st ISA board
+
+ 3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x
+
+ Note: To use static driver, you must install the linux kernel
+ source package.
+
+ 3.5.1 Backup the built-in driver in the kernel.
+ # cd /usr/src/linux/drivers/char
+ # mv mxser.c mxser.c.old
+
+ For Red Hat 7.x user, you need to create link:
+ # cd /usr/src
+ # ln -s linux-2.4 linux
+
+ 3.5.2 Create link
+ # cd /usr/src/linux/drivers/char
+ # ln -s /moxa/mxser/driver/mxser.c mxser.c
+
+ 3.5.3 Add CAP address list for ISA boards. For PCI boards user,
+ please skip this step.
+
+ In module mode, the CAP address for ISA board is given by
+ parameter. In static driver configuration, you'll have to
+ assign it within driver's source code. If you will not
+ install any ISA boards, you may skip to next portion.
+ The instructions to modify driver source code are as
+ below.
+ a. # cd /moxa/mxser/driver
+ # vi mxser.c
+ b. Find the array mxserBoardCAP[] as below.
+
+ static int mxserBoardCAP[]
+ = {0x00, 0x00, 0x00, 0x00};
+
+ c. Change the address within this array using vi. For
+ example, to driver 2 ISA boards with CAP address
+ 0x280 and 0x180 as 1st and 2nd board. Just to change
+ the source code as follows.
+
+ static int mxserBoardCAP[]
+ = {0x280, 0x180, 0x00, 0x00};
+
+ 3.5.4 Setup kernel configuration
+
+ Configure the kernel:
+
+ # cd /usr/src/linux
+ # make menuconfig
+
+ You will go into a menu-driven system. Please select [Character
+ devices][Non-standard serial port support], enable the [Moxa
+ SmartIO support] driver with "[*]" for built-in (not "[M]"), then
+ select [Exit] to exit this program.
+
+ 3.5.5 Rebuild kernel
+ The following are for Linux kernel rebuilding, for your
+ reference only.
+ For appropriate details, please refer to the Linux document.
+
+ a. cd /usr/src/linux
+ b. make clean /* take a few minutes */
+ c. make dep /* take a few minutes */
+ d. make bzImage /* take probably 10-20 minutes */
+ e. make install /* copy boot image to correct position */
+ f. Please make sure the boot kernel (vmlinuz) is in the
+ correct position.
+ g. If you use 'lilo' utility, you should check /etc/lilo.conf
+ 'image' item specified the path which is the 'vmlinuz' path,
+ or you will load wrong (or old) boot kernel image (vmlinuz).
+ After checking /etc/lilo.conf, please run "lilo".
+
+ Note that if the result of "make bzImage" is ERROR, then you have to
+ go back to Linux configuration Setup. Type "make menuconfig" in
+ directory /usr/src/linux.
+
+
+ 3.5.6 Make tty device and special file
+ # cd /moxa/mxser/driver
+ # ./msmknod
+
+ 3.5.7 Make utility
+ # cd /moxa/mxser/utility
+ # make clean; make install
+
+ 3.5.8 Reboot
+
+
+
+ 3.6 Custom configuration
+ Although this driver already provides you default configuration, you
+ still can change the device name and major number. The instruction to
+ change these parameters are shown as below.
+
+ Change Device name
+ ------------------
+ If you'd like to use other device names instead of default naming
+ convention, all you have to do is to modify the internal code
+ within the shell script "msmknod". First, you have to open "msmknod"
+ by vi. Locate each line contains "ttyM" and "cum" and change them
+ to the device name you desired. "msmknod" creates the device names
+ you need next time executed.
+
+ Change Major number
+ -------------------
+ If major number 30 and 35 had been occupied, you may have to select
+ 2 free major numbers for this driver. There are 3 steps to change
+ major numbers.
+
+ 3.6.1 Find free major numbers
+ In /proc/devices, you may find all the major numbers occupied
+ in the system. Please select 2 major numbers that are available.
+ e.g. 40, 45.
+ 3.6.2 Create special files
+ Run /moxa/mxser/driver/msmknod to create special files with
+ specified major numbers.
+ 3.6.3 Modify driver with new major number
+ Run vi to open /moxa/mxser/driver/mxser.c. Locate the line
+ contains "MXSERMAJOR". Change the content as below.
+ #define MXSERMAJOR 40
+ #define MXSERCUMAJOR 45
+ 3.6.4 Run "make clean; make install" in /moxa/mxser/driver.
+
+ 3.7 Verify driver installation
+ You may refer to /var/log/messages to check the latest status
+ log reported by this driver whenever it's activated.
+
+-----------------------------------------------------------------------------
+4. Utilities
+ There are 3 utilities contained in this driver. They are msdiag, msmon and
+ msterm. These 3 utilities are released in form of source code. They should
+ be compiled into executable file and copied into /usr/bin.
+
+ Before using these utilities, please load driver (refer 3.4 & 3.5) and
+ make sure you had run the "msmknod" utility.
+
+ msdiag - Diagnostic
+ --------------------
+ This utility provides the function to display what Moxa Smartio/Industio
+ board found by driver in the system.
+
+ msmon - Port Monitoring
+ -----------------------
+ This utility gives the user a quick view about all the MOXA ports'
+ activities. One can easily learn each port's total received/transmitted
+ (Rx/Tx) character count since the time when the monitoring is started.
+ Rx/Tx throughputs per second are also reported in interval basis (e.g.
+ the last 5 seconds) and in average basis (since the time the monitoring
+ is started). You can reset all ports' count by <HOME> key. <+> <->
+ (plus/minus) keys to change the displaying time interval. Press <ENTER>
+ on the port, that cursor stay, to view the port's communication
+ parameters, signal status, and input/output queue.
+
+ msterm - Terminal Emulation
+ ---------------------------
+ This utility provides data sending and receiving ability of all tty ports,
+ especially for MOXA ports. It is quite useful for testing simple
+ application, for example, sending AT command to a modem connected to the
+ port or used as a terminal for login purpose. Note that this is only a
+ dumb terminal emulation without handling full screen operation.
+
+-----------------------------------------------------------------------------
+5. Setserial
+
+ Supported Setserial parameters are listed as below.
+
+ uart set UART type(16450-->disable FIFO, 16550A-->enable FIFO)
+ close_delay set the amount of time(in 1/100 of a second) that DTR
+ should be kept low while being closed.
+ closing_wait set the amount of time(in 1/100 of a second) that the
+ serial port should wait for data to be drained while
+ being closed, before the receiver is disable.
+ spd_hi Use 57.6kb when the application requests 38.4kb.
+ spd_vhi Use 115.2kb when the application requests 38.4kb.
+ spd_shi Use 230.4kb when the application requests 38.4kb.
+ spd_warp Use 460.8kb when the application requests 38.4kb.
+ spd_normal Use 38.4kb when the application requests 38.4kb.
+ spd_cust Use the custom divisor to set the speed when the
+ application requests 38.4kb.
+ divisor This option set the custom divison.
+ baud_base This option set the base baud rate.
+
+-----------------------------------------------------------------------------
+6. Troubleshooting
+
+ The boot time error messages and solutions are stated as clearly as
+ possible. If all the possible solutions fail, please contact our technical
+ support team to get more help.
+
+
+ Error msg: More than 4 Moxa Smartio/Industio family boards found. Fifth board
+ and after are ignored.
+ Solution:
+ To avoid this problem, please unplug fifth and after board, because Moxa
+ driver supports up to 4 boards.
+
+ Error msg: Request_irq fail, IRQ(?) may be conflict with another device.
+ Solution:
+ Other PCI or ISA devices occupy the assigned IRQ. If you are not sure
+ which device causes the situation, please check /proc/interrupts to find
+ free IRQ and simply change another free IRQ for Moxa board.
+
+ Error msg: Board #: C1xx Series(CAP=xxx) interrupt number invalid.
+ Solution:
+ Each port within the same multiport board shares the same IRQ. Please set
+ one IRQ (IRQ doesn't equal to zero) for one Moxa board.
+
+ Error msg: No interrupt vector be set for Moxa ISA board(CAP=xxx).
+ Solution:
+ Moxa ISA board needs an interrupt vector.Please refer to user's manual
+ "Hardware Installation" chapter to set interrupt vector.
+
+ Error msg: Couldn't install MOXA Smartio/Industio family driver!
+ Solution:
+ Load Moxa driver fail, the major number may conflict with other devices.
+ Please refer to previous section 3.7 to change a free major number for
+ Moxa driver.
+
+ Error msg: Couldn't install MOXA Smartio/Industio family callout driver!
+ Solution:
+ Load Moxa callout driver fail, the callout device major number may
+ conflict with other devices. Please refer to previous section 3.7 to
+ change a free callout device major number for Moxa driver.
+
+
+-----------------------------------------------------------------------------
+
diff --git a/Documentation/serial/riscom8.txt b/Documentation/serial/riscom8.txt
new file mode 100644
index 0000000..14f61fd
--- /dev/null
+++ b/Documentation/serial/riscom8.txt
@@ -0,0 +1,36 @@
+* NOTE - this is an unmaintained driver. The original author cannot be located.
+
+SDL Communications is now SBS Technologies, and does not have any
+information on these ancient ISA cards on their website.
+
+James Nelson <james4765@gmail.com> - 12-12-2004
+
+ This is the README for RISCom/8 multi-port serial driver
+ (C) 1994-1996 D.Gorodchanin
+ See file LICENSE for terms and conditions.
+
+NOTE: English is not my native language.
+ I'm sorry for any mistakes in this text.
+
+Misc. notes for RISCom/8 serial driver, in no particular order :)
+
+1) This driver can support up to 4 boards at time.
+ Use string "riscom8=0xXXX,0xXXX,0xXXX,0xXXX" at LILO prompt, for
+ setting I/O base addresses for boards. If you compile driver
+ as module use modprobe options "iobase=0xXXX iobase1=0xXXX iobase2=..."
+
+2) The driver partially supports famous 'setserial' program, you can use almost
+ any of its options, excluding port & irq settings.
+
+3) There are some misc. defines at the beginning of riscom8.c, please read the
+ comments and try to change some of them in case of problems.
+
+4) I consider the current state of the driver as BETA.
+
+5) SDL Communications WWW page is http://www.sdlcomm.com.
+
+6) You can use the MAKEDEV program to create RISCom/8 /dev/ttyL* entries.
+
+7) Minor numbers for first board are 0-7, for second 8-15, etc.
+
+22 Apr 1996.
diff --git a/Documentation/serial/rocket.txt b/Documentation/serial/rocket.txt
new file mode 100644
index 0000000..1d85829
--- /dev/null
+++ b/Documentation/serial/rocket.txt
@@ -0,0 +1,189 @@
+Comtrol(tm) RocketPort(R)/RocketModem(TM) Series
+Device Driver for the Linux Operating System
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+PRODUCT OVERVIEW
+----------------
+
+This driver provides a loadable kernel driver for the Comtrol RocketPort
+and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32
+high-speed serial ports or modems. This driver supports up to a combination
+of four RocketPort or RocketModems boards in one machine simultaneously.
+This file assumes that you are using the RocketPort driver which is
+integrated into the kernel sources.
+
+The driver can also be installed as an external module using the usual
+"make;make install" routine. This external module driver, obtainable
+from the Comtrol website listed below, is useful for updating the driver
+or installing it into kernels which do not have the driver configured
+into them. Installations instructions for the external module
+are in the included README and HW_INSTALL files.
+
+RocketPort ISA and RocketModem II PCI boards currently are only supported by
+this driver in module form.
+
+The RocketPort ISA board requires I/O ports to be configured by the DIP
+switches on the board. See the section "ISA Rocketport Boards" below for
+information on how to set the DIP switches.
+
+You pass the I/O port to the driver using the following module parameters:
+
+board1 : I/O port for the first ISA board
+board2 : I/O port for the second ISA board
+board3 : I/O port for the third ISA board
+board4 : I/O port for the fourth ISA board
+
+There is a set of utilities and scripts provided with the external driver
+( downloadable from http://www.comtrol.com ) that ease the configuration and
+setup of the ISA cards.
+
+The RocketModem II PCI boards require firmware to be loaded into the card
+before it will function. The driver has only been tested as a module for this
+board.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+INSTALLATION PROCEDURES
+-----------------------
+
+RocketPort/RocketModem PCI cards require no driver configuration, they are
+automatically detected and configured.
+
+The RocketPort driver can be installed as a module (recommended) or built
+into the kernel. This is selected, as for other drivers, through the `make config`
+command from the root of the Linux source tree during the kernel build process.
+
+The RocketPort/RocketModem serial ports installed by this driver are assigned
+device major number 46, and will be named /dev/ttyRx, where x is the port number
+starting at zero (ex. /dev/ttyR0, /devttyR1, ...). If you have multiple cards
+installed in the system, the mapping of port names to serial ports is displayed
+in the system log at /var/log/messages.
+
+If installed as a module, the module must be loaded. This can be done
+manually by entering "modprobe rocket". To have the module loaded automatically
+upon system boot, edit the /etc/modprobe.conf file and add the line
+"alias char-major-46 rocket".
+
+In order to use the ports, their device names (nodes) must be created with mknod.
+This is only required once, the system will retain the names once created. To
+create the RocketPort/RocketModem device names, use the command
+"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero. For example:
+
+>mknod /dev/ttyR0 c 46 0
+>mknod /dev/ttyR1 c 46 1
+>mknod /dev/ttyR2 c 46 2
+
+The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes)
+for you:
+
+>/dev/MAKEDEV ttyR
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+ISA Rocketport Boards
+---------------------
+
+You must assign and configure the I/O addresses used by the ISA Rocketport
+card before installing and using it. This is done by setting a set of DIP
+switches on the Rocketport board.
+
+
+SETTING THE I/O ADDRESS
+-----------------------
+
+Before installing RocketPort(R) or RocketPort RA boards, you must find
+a range of I/O addresses for it to use. The first RocketPort card
+requires a 68-byte contiguous block of I/O addresses, starting at one
+of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h,
+0x300h, 0x340h, 0x380h. This I/O address must be reflected in the DIP
+switches of *all* of the Rocketport cards.
+
+The second, third, and fourth RocketPort cards require a 64-byte
+contiguous block of I/O addresses, starting at one of the following
+I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h,
+0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h. The I/O address used by the
+second, third, and fourth Rocketport cards (if present) are set via
+software control. The DIP switch settings for the I/O address must be
+set to the value of the first Rocketport cards.
+
+In order to distinguish each of the card from the others, each card
+must have a unique board ID set on the dip switches. The first
+Rocketport board must be set with the DIP switches corresponding to
+the first board, the second board must be set with the DIP switches
+corresponding to the second board, etc. IMPORTANT: The board ID is
+the only place where the DIP switch settings should differ between the
+various Rocketport boards in a system.
+
+The I/O address range used by any of the RocketPort cards must not
+conflict with any other cards in the system, including other
+RocketPort cards. Below, you will find a list of commonly used I/O
+address ranges which may be in use by other devices in your system.
+On a Linux system, "cat /proc/ioports" will also be helpful in
+identifying what I/O addresses are being used by devices on your
+system.
+
+Remember, the FIRST RocketPort uses 68 I/O addresses. So, if you set it
+for 0x100, it will occupy 0x100 to 0x143. This would mean that you
+CAN NOT set the second, third or fourth board for address 0x140 since
+the first 4 bytes of that range are used by the first board. You would
+need to set the second, third, or fourth board to one of the next available
+blocks such as 0x180.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+RocketPort and RocketPort RA SW1 Settings:
+
+ +-------------------------------+
+ | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 |
+ +-------+-------+---------------+
+ | Unused| Card | I/O Port Block|
+ +-------------------------------+
+
+DIP Switches DIP Switches
+7 8 6 5
+=================== ===================
+On On UNUSED, MUST BE ON. On On First Card <==== Default
+ On Off Second Card
+ Off On Third Card
+ Off Off Fourth Card
+
+DIP Switches I/O Address Range
+4 3 2 1 Used by the First Card
+=====================================
+On Off On Off 100-143
+On Off Off On 140-183
+On Off Off Off 180-1C3 <==== Default
+Off On On Off 200-243
+Off On Off On 240-283
+Off On Off Off 280-2C3
+Off Off On Off 300-343
+Off Off Off On 340-383
+Off Off Off Off 380-3C3
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+REPORTING BUGS
+--------------
+
+For technical support, please provide the following
+information: Driver version, kernel release, distribution of
+kernel, and type of board you are using. Error messages and log
+printouts port configuration details are especially helpful.
+
+USA
+ Phone: (612) 494-4100
+ FAX: (612) 494-4199
+ email: support@comtrol.com
+
+Comtrol Europe
+ Phone: +44 (0) 1 869 323-220
+ FAX: +44 (0) 1 869 323-211
+ email: support@comtrol.co.uk
+
+Web: http://www.comtrol.com
+FTP: ftp.comtrol.com
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+
diff --git a/Documentation/serial/specialix.txt b/Documentation/serial/specialix.txt
new file mode 100644
index 0000000..6eb6f3a
--- /dev/null
+++ b/Documentation/serial/specialix.txt
@@ -0,0 +1,383 @@
+
+ specialix.txt -- specialix IO8+ multiport serial driver readme.
+
+
+
+ Copyright (C) 1997 Roger Wolff (R.E.Wolff@BitWizard.nl)
+
+ Specialix pays for the development and support of this driver.
+ Please DO contact io8-linux@specialix.co.uk if you require
+ support.
+
+ This driver was developed in the BitWizard linux device
+ driver service. If you require a linux device driver for your
+ product, please contact devices@BitWizard.nl for a quote.
+
+ This code is firmly based on the riscom/8 serial driver,
+ written by Dmitry Gorodchanin. The specialix IO8+ card
+ programming information was obtained from the CL-CD1865 Data
+ Book, and Specialix document number 6200059: IO8+ Hardware
+ Functional Specification, augmented by document number 6200088:
+ Merak Hardware Functional Specification. (IO8+/PCI is also
+ called Merak)
+
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of
+ the License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ PURPOSE. See the GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ USA.
+
+
+Intro
+=====
+
+
+This file contains some random information, that I like to have online
+instead of in a manual that can get lost. Ever misplace your Linux
+kernel sources? And the manual of one of the boards in your computer?
+
+
+Addresses and interrupts
+========================
+
+Address dip switch settings:
+The dip switch sets bits 2-9 of the IO address.
+
+ 9 8 7 6 5 4 3 2
+ +-----------------+
+ 0 | X X X X X X X |
+ | | = IoBase = 0x100
+ 1 | X |
+ +-----------------+ ------ RS232 connectors ---->
+
+ | | |
+ edge connector
+ | | |
+ V V V
+
+Base address 0x100 caused a conflict in one of my computers once. I
+haven't the foggiest why. My Specialix card is now at 0x180. My
+other computer runs just fine with the Specialix card at 0x100....
+The card occupies 4 addresses, but actually only two are really used.
+
+The PCI version doesn't have any dip switches. The BIOS assigns
+an IO address.
+
+The driver now still autoprobes at 0x100, 0x180, 0x250 and 0x260. If
+that causes trouble for you, please report that. I'll remove
+autoprobing then.
+
+The driver will tell the card what IRQ to use, so you don't have to
+change any jumpers to change the IRQ. Just use a command line
+argument (irq=xx) to the insmod program to set the interrupt.
+
+The BIOS assigns the IRQ on the PCI version. You have no say in what
+IRQ to use in that case.
+
+If your specialix cards are not at the default locations, you can use
+the kernel command line argument "specialix=io0,irq0,io1,irq1...".
+Here "io0" is the io address for the first card, and "irq0" is the
+irq line that the first card should use. And so on.
+
+Examples.
+
+You use the driver as a module and have three cards at 0x100, 0x250
+and 0x180. And some way or another you want them detected in that
+order. Moreover irq 12 is taken (e.g. by your PS/2 mouse).
+
+ insmod specialix.o iobase=0x100,0x250,0x180 irq=9,11,15
+
+The same three cards, but now in the kernel would require you to
+add
+
+ specialix=0x100,9,0x250,11,0x180,15
+
+to the command line. This would become
+
+ append="specialix=0x100,9,0x250,11,0x180,15"
+
+in your /etc/lilo.conf file if you use lilo.
+
+The Specialix driver is slightly odd: It allows you to have the second
+or third card detected without having a first card. This has
+advantages and disadvantages. A slot that isn't filled by an ISA card,
+might be filled if a PCI card is detected. Thus if you have an ISA
+card at 0x250 and a PCI card, you would get:
+
+sx0: specialix IO8+ Board at 0x100 not found.
+sx1: specialix IO8+ Board at 0x180 not found.
+sx2: specialix IO8+ board detected at 0x250, IRQ 12, CD1865 Rev. B.
+sx3: specialix IO8+ Board at 0x260 not found.
+sx0: specialix IO8+ board detected at 0xd800, IRQ 9, CD1865 Rev. B.
+
+This would happen if you don't give any probe hints to the driver.
+If you would specify:
+
+ specialix=0x250,11
+
+you'd get the following messages:
+
+sx0: specialix IO8+ board detected at 0x250, IRQ 11, CD1865 Rev. B.
+sx1: specialix IO8+ board detected at 0xd800, IRQ 9, CD1865 Rev. B.
+
+ISA probing is aborted after the IO address you gave is exhausted, and
+the PCI card is now detected as the second card. The ISA card is now
+also forced to IRQ11....
+
+
+Baud rates
+==========
+
+The rev 1.2 and below boards use a CL-CD1864. These chips can only
+do 64kbit. The rev 1.3 and newer boards use a CL-CD1865. These chips
+are officially capable of 115k2.
+
+The Specialix card uses a 25MHz crystal (in times two mode, which in
+fact is a divided by two mode). This is not enough to reach the rated
+115k2 on all ports at the same time. With this clock rate you can only
+do 37% of this rate. This means that at 115k2 on all ports you are
+going to lose characters (The chip cannot handle that many incoming
+bits at this clock rate.) (Yes, you read that correctly: there is a
+limit to the number of -=bits=- per second that the chip can handle.)
+
+If you near the "limit" you will first start to see a graceful
+degradation in that the chip cannot keep the transmitter busy at all
+times. However with a central clock this slow, you can also get it to
+miss incoming characters. The driver will print a warning message when
+you are outside the official specs. The messages usually show up in
+the file /var/log/messages .
+
+The specialix card cannot reliably do 115k2. If you use it, you have
+to do "extensive testing" (*) to verify if it actually works.
+
+When "mgetty" communicates with my modem at 115k2 it reports:
+got: +++[0d]ATQ0V1H0[0d][0d][8a]O[cb][0d][8a]
+ ^^^^ ^^^^ ^^^^
+
+The three characters that have the "^^^" under them have suffered a
+bit error in the highest bit. In conclusion: I've tested it, and found
+that it simply DOESN'T work for me. I also suspect that this is also
+caused by the baud rate being just a little bit out of tune.
+
+I upgraded the crystal to 66Mhz on one of my Specialix cards. Works
+great! Contact me for details. (Voids warranty, requires a steady hand
+and more such restrictions....)
+
+
+(*) Cirrus logic CD1864 databook, page 40.
+
+
+Cables for the Specialix IO8+
+=============================
+
+The pinout of the connectors on the IO8+ is:
+
+ pin short direction long name
+ name
+ Pin 1 DCD input Data Carrier Detect
+ Pin 2 RXD input Receive
+ Pin 3 DTR/RTS output Data Terminal Ready/Ready To Send
+ Pin 4 GND - Ground
+ Pin 5 TXD output Transmit
+ Pin 6 CTS input Clear To Send
+
+
+ -- 6 5 4 3 2 1 --
+ | |
+ | |
+ | |
+ | |
+ +----- -----+
+ |__________|
+ clip
+
+ Front view of an RJ12 connector. Cable moves "into" the paper.
+ (the plug is ready to plug into your mouth this way...)
+
+
+ NULL cable. I don't know who is going to use these except for
+ testing purposes, but I tested the cards with this cable. (It
+ took quite a while to figure out, so I'm not going to delete
+ it. So there! :-)
+
+
+ This end goes This end needs
+ straight into the some twists in
+ RJ12 plug. the wiring.
+ IO8+ RJ12 IO8+ RJ12
+ 1 DCD white -
+ - - 1 DCD
+ 2 RXD black 5 TXD
+ 3 DTR/RTS red 6 CTS
+ 4 GND green 4 GND
+ 5 TXD yellow 2 RXD
+ 6 CTS blue 3 DTR/RTS
+
+
+ Same NULL cable, but now sorted on the second column.
+
+ 1 DCD white -
+ - - 1 DCD
+ 5 TXD yellow 2 RXD
+ 6 CTS blue 3 DTR/RTS
+ 4 GND green 4 GND
+ 2 RXD black 5 TXD
+ 3 DTR/RTS red 6 CTS
+
+
+
+ This is a modem cable usable for hardware handshaking:
+ RJ12 DB25 DB9
+ 1 DCD white 8 DCD 1 DCD
+ 2 RXD black 3 RXD 2 RXD
+ 3 DTR/RTS red 4 RTS 7 RTS
+ 4 GND green 7 GND 5 GND
+ 5 TXD yellow 2 TXD 3 TXD
+ 6 CTS blue 5 CTS 8 CTS
+ +---- 6 DSR 6 DSR
+ +---- 20 DTR 4 DTR
+
+ This is a modem cable usable for software handshaking:
+ It allows you to reset the modem using the DTR ioctls.
+ I (REW) have never tested this, "but xxxxxxxxxxxxx
+ says that it works." If you test this, please
+ tell me and I'll fill in your name on the xxx's.
+
+ RJ12 DB25 DB9
+ 1 DCD white 8 DCD 1 DCD
+ 2 RXD black 3 RXD 2 RXD
+ 3 DTR/RTS red 20 DTR 4 DTR
+ 4 GND green 7 GND 5 GND
+ 5 TXD yellow 2 TXD 3 TXD
+ 6 CTS blue 5 CTS 8 CTS
+ +---- 6 DSR 6 DSR
+ +---- 4 RTS 7 RTS
+
+ I bought a 6 wire flat cable. It was colored as indicated.
+ Check that yours is the same before you trust me on this.
+
+
+Hardware handshaking issues.
+============================
+
+The driver can be told to operate in two different ways. The default
+behaviour is specialix.sx_rtscts = 0 where the pin behaves as DTR when
+hardware handshaking is off. It behaves as the RTS hardware
+handshaking signal when hardware handshaking is selected.
+
+When you use this, you have to use the appropriate cable. The
+cable will either be compatible with hardware handshaking or with
+software handshaking. So switching on the fly is not really an
+option.
+
+I actually prefer to use the "specialix.sx_rtscts=1" option.
+This makes the DTR/RTS pin always an RTS pin, and ioctls to
+change DTR are always ignored. I have a cable that is configured
+for this.
+
+
+Ports and devices
+=================
+
+Port 0 is the one furthest from the card-edge connector.
+
+Devices:
+
+You should make the devices as follows:
+
+bash
+cd /dev
+for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 \
+ 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+do
+ echo -n "$i "
+ mknod /dev/ttyW$i c 75 $i
+ mknod /dev/cuw$i c 76 $i
+done
+echo ""
+
+If your system doesn't come with these devices preinstalled, bug your
+linux-vendor about this. They have had ample time to get this
+implemented by now.
+
+You cannot have more than 4 boards in one computer. The card only
+supports 4 different interrupts. If you really want this, contact me
+about this and I'll give you a few tips (requires soldering iron)....
+
+If you have enough PCI slots, you can probably use more than 4 PCI
+versions of the card though....
+
+The PCI version of the card cannot adhere to the mechanical part of
+the PCI spec because the 8 serial connectors are simply too large. If
+it doesn't fit in your computer, bring back the card.
+
+
+------------------------------------------------------------------------
+
+
+ Fixed bugs and restrictions:
+ - During initialization, interrupts are blindly turned on.
+ Having a shadow variable would cause an extra memory
+ access on every IO instruction.
+ - The interrupt (on the card) should be disabled when we
+ don't allocate the Linux end of the interrupt. This allows
+ a different driver/card to use it while all ports are not in
+ use..... (a la standard serial port)
+ == An extra _off variant of the sx_in and sx_out macros are
+ now available. They don't set the interrupt enable bit.
+ These are used during initialization. Normal operation uses
+ the old variant which enables the interrupt line.
+ - RTS/DTR issue needs to be implemented according to
+ specialix' spec.
+ I kind of like the "determinism" of the current
+ implementation. Compile time flag?
+ == Ok. Compile time flag! Default is how Specialix likes it.
+ == Now a config time flag! Gets saved in your config file. Neat!
+ - Can you set the IO address from the lilo command line?
+ If you need this, bug me about it, I'll make it.
+ == Hah! No bugging needed. Fixed! :-)
+ - Cirrus logic hasn't gotten back to me yet why the CD1865 can
+ and the CD1864 can't do 115k2. I suspect that this is
+ because the CD1864 is not rated for 33MHz operation.
+ Therefore the CD1864 versions of the card can't do 115k2 on
+ all ports just like the CD1865 versions. The driver does
+ not block 115k2 on CD1864 cards.
+ == I called the Cirrus Logic representative here in Holland.
+ The CD1864 databook is identical to the CD1865 databook,
+ except for an extra warning at the end. Similar Bit errors
+ have been observed in testing at 115k2 on both an 1865 and
+ a 1864 chip. I see no reason why I would prohibit 115k2 on
+ 1864 chips and not do it on 1865 chips. Actually there is
+ reason to prohibit it on BOTH chips. I print a warning.
+ If you use 115k2, you're on your own.
+ - A spiky CD may send spurious HUPs. Also in CLOCAL???
+ -- A fix for this turned out to be counter productive.
+ Different fix? Current behaviour is acceptable?
+ -- Maybe the current implementation is correct. If anybody
+ gets bitten by this, please report, and it will get fixed.
+
+ -- Testing revealed that when in CLOCAL, the problem doesn't
+ occur. As warned for in the CD1865 manual, the chip may
+ send modem intr's on a spike. We could filter those out,
+ but that would be a cludge anyway (You'd still risk getting
+ a spurious HUP when two spikes occur.).....
+
+
+
+ Bugs & restrictions:
+ - This is a difficult card to autoprobe.
+ You have to WRITE to the address register to even
+ read-probe a CD186x register. Disable autodetection?
+ -- Specialix: any suggestions?
+
+
diff --git a/Documentation/serial/stallion.txt b/Documentation/serial/stallion.txt
new file mode 100644
index 0000000..5c4902d
--- /dev/null
+++ b/Documentation/serial/stallion.txt
@@ -0,0 +1,392 @@
+* NOTE - This is an unmaintained driver. Lantronix, which bought Stallion
+technologies, is not active in driver maintenance, and they have no information
+on when or if they will have a 2.6 driver.
+
+James Nelson <james4765@gmail.com> - 12-12-2004
+
+Stallion Multiport Serial Driver Readme
+---------------------------------------
+
+Copyright (C) 1994-1999, Stallion Technologies.
+
+Version: 5.5.1
+Date: 28MAR99
+
+
+
+1. INTRODUCTION
+
+There are two drivers that work with the different families of Stallion
+multiport serial boards. One is for the Stallion smart boards - that is
+EasyIO, EasyConnection 8/32 and EasyConnection 8/64-PCI, the other for
+the true Stallion intelligent multiport boards - EasyConnection 8/64
+(ISA, EISA, MCA), EasyConnection/RA-PCI, ONboard and Brumby.
+
+If you are using any of the Stallion intelligent multiport boards (Brumby,
+ONboard, EasyConnection 8/64 (ISA, EISA, MCA), EasyConnection/RA-PCI) with
+Linux you will need to get the driver utility package. This contains a
+firmware loader and the firmware images necessary to make the devices operate.
+
+The Stallion Technologies ftp site, ftp.stallion.com, will always have
+the latest version of the driver utility package.
+
+ftp://ftp.stallion.com/drivers/ata5/Linux/ata-linux-550.tar.gz
+
+As of the printing of this document the latest version of the driver
+utility package is 5.5.0. If a later version is now available then you
+should use the latest version.
+
+If you are using the EasyIO, EasyConnection 8/32 or EasyConnection 8/64-PCI
+boards then you don't need this package, although it does have a serial stats
+display program.
+
+If you require DIP switch settings, EISA or MCA configuration files, or any
+other information related to Stallion boards then have a look at Stallion's
+web pages at http://www.stallion.com.
+
+
+
+2. INSTALLATION
+
+The drivers can be used as loadable modules or compiled into the kernel.
+You can choose which when doing a "config" on the kernel.
+
+All ISA, EISA and MCA boards that you want to use need to be configured into
+the driver(s). All PCI boards will be automatically detected when you load
+the driver - so they do not need to be entered into the driver(s)
+configuration structure. Note that kernel PCI support is required to use PCI
+boards.
+
+There are two methods of configuring ISA, EISA and MCA boards into the drivers.
+If using the driver as a loadable module then the simplest method is to pass
+the driver configuration as module arguments. The other method is to modify
+the driver source to add configuration lines for each board in use.
+
+If you have pre-built Stallion driver modules then the module argument
+configuration method should be used. A lot of Linux distributions come with
+pre-built driver modules in /lib/modules/X.Y.Z/misc for the kernel in use.
+That makes things pretty simple to get going.
+
+
+2.1 MODULE DRIVER CONFIGURATION:
+
+The simplest configuration for modules is to use the module load arguments
+to configure any ISA, EISA or MCA boards. PCI boards are automatically
+detected, so do not need any additional configuration at all.
+
+If using EasyIO, EasyConnection 8/32 ISA or MCA, or EasyConnection 8/63-PCI
+boards then use the "stallion" driver module, Otherwise if you are using
+an EasyConnection 8/64 ISA, EISA or MCA, EasyConnection/RA-PCI, ONboard,
+Brumby or original Stallion board then use the "istallion" driver module.
+
+Typically to load up the smart board driver use:
+
+ modprobe stallion
+
+This will load the EasyIO and EasyConnection 8/32 driver. It will output a
+message to say that it loaded and print the driver version number. It will
+also print out whether it found the configured boards or not. These messages
+may not appear on the console, but typically are always logged to
+/var/adm/messages or /var/log/syslog files - depending on how the klogd and
+syslogd daemons are setup on your system.
+
+To load the intelligent board driver use:
+
+ modprobe istallion
+
+It will output similar messages to the smart board driver.
+
+If not using an auto-detectable board type (that is a PCI board) then you
+will also need to supply command line arguments to the modprobe command
+when loading the driver. The general form of the configuration argument is
+
+ board?=<name>[,<ioaddr>[,<addr>][,<irq>]]
+
+where:
+
+ board? -- specifies the arbitrary board number of this board,
+ can be in the range 0 to 3.
+
+ name -- textual name of this board. The board name is the common
+ board name, or any "shortened" version of that. The board
+ type number may also be used here.
+
+ ioaddr -- specifies the I/O address of this board. This argument is
+ optional, but should generally be specified.
+
+ addr -- optional second address argument. Some board types require
+ a second I/O address, some require a memory address. The
+ exact meaning of this argument depends on the board type.
+
+ irq -- optional IRQ line used by this board.
+
+Up to 4 board configuration arguments can be specified on the load line.
+Here is some examples:
+
+ modprobe stallion board0=easyio,0x2a0,5
+
+This configures an EasyIO board as board 0 at I/O address 0x2a0 and IRQ 5.
+
+ modprobe istallion board3=ec8/64,0x2c0,0xcc000
+
+This configures an EasyConnection 8/64 ISA as board 3 at I/O address 0x2c0 at
+memory address 0xcc000.
+
+ modprobe stallion board1=ec8/32-at,0x2a0,0x280,10
+
+This configures an EasyConnection 8/32 ISA board at primary I/O address 0x2a0,
+secondary address 0x280 and IRQ 10.
+
+You will probably want to enter this module load and configuration information
+into your system startup scripts so that the drivers are loaded and configured
+on each system boot. Typically the start up script would be something like
+/etc/modprobe.conf.
+
+
+2.2 STATIC DRIVER CONFIGURATION:
+
+For static driver configuration you need to modify the driver source code.
+Entering ISA, EISA and MCA boards into the driver(s) configuration structure
+involves editing the driver(s) source file. It's pretty easy if you follow
+the instructions below. Both drivers can support up to 4 boards. The smart
+card driver (the stallion.c driver) supports any combination of EasyIO and
+EasyConnection 8/32 boards (up to a total of 4). The intelligent driver
+supports any combination of ONboards, Brumbys, Stallions and EasyConnection
+8/64 (ISA and EISA) boards (up to a total of 4).
+
+To set up the driver(s) for the boards that you want to use you need to
+edit the appropriate driver file and add configuration entries.
+
+If using EasyIO or EasyConnection 8/32 ISA or MCA boards,
+ In drivers/char/stallion.c:
+ - find the definition of the stl_brdconf array (of structures)
+ near the top of the file
+ - modify this to match the boards you are going to install
+ (the comments before this structure should help)
+ - save and exit
+
+If using ONboard, Brumby, Stallion or EasyConnection 8/64 (ISA or EISA)
+boards,
+ In drivers/char/istallion.c:
+ - find the definition of the stli_brdconf array (of structures)
+ near the top of the file
+ - modify this to match the boards you are going to install
+ (the comments before this structure should help)
+ - save and exit
+
+Once you have set up the board configurations then you are ready to build
+the kernel or modules.
+
+When the new kernel is booted, or the loadable module loaded then the
+driver will emit some kernel trace messages about whether the configured
+boards were detected or not. Depending on how your system logger is set
+up these may come out on the console, or just be logged to
+/var/adm/messages or /var/log/syslog. You should check the messages to
+confirm that all is well.
+
+
+2.3 SHARING INTERRUPTS
+
+It is possible to share interrupts between multiple EasyIO and
+EasyConnection 8/32 boards in an EISA system. To do this you must be using
+static driver configuration, modifying the driver source code to add driver
+configuration. Then a couple of extra things are required:
+
+1. When entering the board resources into the stallion.c file you need to
+ mark the boards as using level triggered interrupts. Do this by replacing
+ the "0" entry at field position 6 (the last field) in the board
+ configuration structure with a "1". (This is the structure that defines
+ the board type, I/O locations, etc. for each board). All boards that are
+ sharing an interrupt must be set this way, and each board should have the
+ same interrupt number specified here as well. Now build the module or
+ kernel as you would normally.
+
+2. When physically installing the boards into the system you must enter
+ the system EISA configuration utility. You will need to install the EISA
+ configuration files for *all* the EasyIO and EasyConnection 8/32 boards
+ that are sharing interrupts. The Stallion EasyIO and EasyConnection 8/32
+ EISA configuration files required are supplied by Stallion Technologies
+ on the EASY Utilities floppy diskette (usually supplied in the box with
+ the board when purchased. If not, you can pick it up from Stallion's FTP
+ site, ftp.stallion.com). You will need to edit the board resources to
+ choose level triggered interrupts, and make sure to set each board's
+ interrupt to the same IRQ number.
+
+You must complete both the above steps for this to work. When you reboot
+or load the driver your EasyIO and EasyConnection 8/32 boards will be
+sharing interrupts.
+
+
+2.4 USING HIGH SHARED MEMORY
+
+The EasyConnection 8/64-EI, ONboard and Stallion boards are capable of
+using shared memory addresses above the usual 640K - 1Mb range. The ONboard
+ISA and the Stallion boards can be programmed to use memory addresses up to
+16Mb (the ISA bus addressing limit), and the EasyConnection 8/64-EI and
+ONboard/E can be programmed for memory addresses up to 4Gb (the EISA bus
+addressing limit).
+
+The higher than 1Mb memory addresses are fully supported by this driver.
+Just enter the address as you normally would for a lower than 1Mb address
+(in the driver's board configuration structure).
+
+
+
+2.5 TROUBLE SHOOTING
+
+If a board is not found by the driver but is actually in the system then the
+most likely problem is that the I/O address is wrong. Change the module load
+argument for the loadable module form. Or change it in the driver stallion.c
+or istallion.c configuration structure and rebuild the kernel or modules, or
+change it on the board.
+
+On EasyIO and EasyConnection 8/32 boards the IRQ is software programmable, so
+if there is a conflict you may need to change the IRQ used for a board. There
+are no interrupts to worry about for ONboard, Brumby or EasyConnection 8/64
+(ISA, EISA and MCA) boards. The memory region on EasyConnection 8/64 and
+ONboard boards is software programmable, but not on the Brumby boards.
+
+
+
+3. USING THE DRIVERS
+
+3.1 INTELLIGENT DRIVER OPERATION
+
+The intelligent boards also need to have their "firmware" code downloaded
+to them. This is done via a user level application supplied in the driver
+utility package called "stlload". Compile this program wherever you dropped
+the package files, by typing "make". In its simplest form you can then type
+
+ ./stlload -i cdk.sys
+
+in this directory and that will download board 0 (assuming board 0 is an
+EasyConnection 8/64 or EasyConnection/RA board). To download to an
+ONboard, Brumby or Stallion do:
+
+ ./stlload -i 2681.sys
+
+Normally you would want all boards to be downloaded as part of the standard
+system startup. To achieve this, add one of the lines above into the
+/etc/rc.d/rc.S or /etc/rc.d/rc.serial file. To download each board just add
+the "-b <brd-number>" option to the line. You will need to download code for
+every board. You should probably move the stlload program into a system
+directory, such as /usr/sbin. Also, the default location of the cdk.sys image
+file in the stlload down-loader is /usr/lib/stallion. Create that directory
+and put the cdk.sys and 2681.sys files in it. (It's a convenient place to put
+them anyway). As an example your /etc/rc.d/rc.S file might have the
+following lines added to it (if you had 3 boards):
+
+ /usr/sbin/stlload -b 0 -i /usr/lib/stallion/cdk.sys
+ /usr/sbin/stlload -b 1 -i /usr/lib/stallion/2681.sys
+ /usr/sbin/stlload -b 2 -i /usr/lib/stallion/2681.sys
+
+The image files cdk.sys and 2681.sys are specific to the board types. The
+cdk.sys will only function correctly on an EasyConnection 8/64 board. Similarly
+the 2681.sys image fill only operate on ONboard, Brumby and Stallion boards.
+If you load the wrong image file into a board it will fail to start up, and
+of course the ports will not be operational!
+
+If you are using the modularized version of the driver you might want to put
+the modprobe calls in the startup script as well (before the download lines
+obviously).
+
+
+3.2 USING THE SERIAL PORTS
+
+Once the driver is installed you will need to setup some device nodes to
+access the serial ports. The simplest method is to use the /dev/MAKEDEV program.
+It will automatically create device entries for Stallion boards. This will
+create the normal serial port devices as /dev/ttyE# where# is the port number
+starting from 0. A bank of 64 minor device numbers is allocated to each board,
+so the first port on the second board is port 64,etc. A set of callout type
+devices may also be created. They are created as the devices /dev/cue# where #
+is the same as for the ttyE devices.
+
+For the most part the Stallion driver tries to emulate the standard PC system
+COM ports and the standard Linux serial driver. The idea is that you should
+be able to use Stallion board ports and COM ports interchangeably without
+modifying anything but the device name. Anything that doesn't work like that
+should be considered a bug in this driver!
+
+If you look at the driver code you will notice that it is fairly closely
+based on the Linux serial driver (linux/drivers/char/serial.c). This is
+intentional, obviously this is the easiest way to emulate its behavior!
+
+Since this driver tries to emulate the standard serial ports as much as
+possible, most system utilities should work as they do for the standard
+COM ports. Most importantly "stty" works as expected and "setserial" can
+also be used (excepting the ability to auto-configure the I/O and IRQ
+addresses of boards). Higher baud rates are supported in the usual fashion
+through setserial or using the CBAUDEX extensions. Note that the EasyIO and
+EasyConnection (all types) support at least 57600 and 115200 baud. The newer
+EasyConnection XP modules and new EasyIO boards support 230400 and 460800
+baud as well. The older boards including ONboard and Brumby support a
+maximum baud rate of 38400.
+
+If you are unfamiliar with how to use serial ports, then get the Serial-HOWTO
+by Greg Hankins. It will explain everything you need to know!
+
+
+
+4. NOTES
+
+You can use both drivers at once if you have a mix of board types installed
+in a system. However to do this you will need to change the major numbers
+used by one of the drivers. Currently both drivers use major numbers 24, 25
+and 28 for their devices. Change one driver to use some other major numbers,
+and then modify the mkdevnods script to make device nodes based on those new
+major numbers. For example, you could change the istallion.c driver to use
+major numbers 60, 61 and 62. You will also need to create device nodes with
+different names for the ports, for example ttyF# and cuf#.
+
+The original Stallion board is no longer supported by Stallion Technologies.
+Although it is known to work with the istallion driver.
+
+Finding a free physical memory address range can be a problem. The older
+boards like the Stallion and ONboard need large areas (64K or even 128K), so
+they can be very difficult to get into a system. If you have 16 Mb of RAM
+then you have no choice but to put them somewhere in the 640K -> 1Mb range.
+ONboards require 64K, so typically 0xd0000 is good, or 0xe0000 on some
+systems. If you have an original Stallion board, "V4.0" or Rev.O, then you
+need a 64K memory address space, so again 0xd0000 and 0xe0000 are good.
+Older Stallion boards are a much bigger problem. They need 128K of address
+space and must be on a 128K boundary. If you don't have a VGA card then
+0xc0000 might be usable - there is really no other place you can put them
+below 1Mb.
+
+Both the ONboard and old Stallion boards can use higher memory addresses as
+well, but you must have less than 16Mb of RAM to be able to use them. Usual
+high memory addresses used include 0xec0000 and 0xf00000.
+
+The Brumby boards only require 16Kb of address space, so you can usually
+squeeze them in somewhere. Common addresses are 0xc8000, 0xcc000, or in
+the 0xd0000 range. EasyConnection 8/64 boards are even better, they only
+require 4Kb of address space, again usually 0xc8000, 0xcc000 or 0xd0000
+are good.
+
+If you are using an EasyConnection 8/64-EI or ONboard/E then usually the
+0xd0000 or 0xe0000 ranges are the best options below 1Mb. If neither of
+them can be used then the high memory support to use the really high address
+ranges is the best option. Typically the 2Gb range is convenient for them,
+and gets them well out of the way.
+
+The ports of the EasyIO-8M board do not have DCD or DTR signals. So these
+ports cannot be used as real modem devices. Generally, when using these
+ports you should only use the cueX devices.
+
+The driver utility package contains a couple of very useful programs. One
+is a serial port statistics collection and display program - very handy
+for solving serial port problems. The other is an extended option setting
+program that works with the intelligent boards.
+
+
+
+5. DISCLAIMER
+
+The information contained in this document is believed to be accurate and
+reliable. However, no responsibility is assumed by Stallion Technologies
+Pty. Ltd. for its use, nor any infringements of patents or other rights
+of third parties resulting from its use. Stallion Technologies reserves
+the right to modify the design of its products and will endeavour to change
+the information in manuals and accompanying documentation accordingly.
+
diff --git a/Documentation/serial/sx.txt b/Documentation/serial/sx.txt
new file mode 100644
index 0000000..cb4efa0
--- /dev/null
+++ b/Documentation/serial/sx.txt
@@ -0,0 +1,294 @@
+
+ sx.txt -- specialix SX/SI multiport serial driver readme.
+
+
+
+ Copyright (C) 1997 Roger Wolff (R.E.Wolff@BitWizard.nl)
+
+ Specialix pays for the development and support of this driver.
+ Please DO contact support@specialix.co.uk if you require
+ support.
+
+ This driver was developed in the BitWizard linux device
+ driver service. If you require a linux device driver for your
+ product, please contact devices@BitWizard.nl for a quote.
+
+ (History)
+ There used to be an SI driver by Simon Allan. This is a complete
+ rewrite from scratch. Just a few lines-of-code have been snatched.
+
+ (Sources)
+ Specialix document number 6210028: SX Host Card and Download Code
+ Software Functional Specification.
+
+ (Copying)
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of
+ the License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ PURPOSE. See the GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ USA.
+
+ (Addendum)
+ I'd appreciate it that if you have fixes, that you send them
+ to me first.
+
+
+Introduction
+============
+
+This file contains some random information, that I like to have online
+instead of in a manual that can get lost. Ever misplace your Linux
+kernel sources? And the manual of one of the boards in your computer?
+
+
+Theory of operation
+===================
+
+An important thing to know is that the driver itself doesn't have the
+firmware for the card. This means that you need the separate package
+"sx_firmware". For now you can get the source at
+
+ ftp://ftp.bitwizard.nl/specialix/sx_firmware_<version>.tgz
+
+The firmware load needs a "misc" device, so you'll need to enable the
+"Support for user misc device modules" in your kernel configuration.
+The misc device needs to be called "/dev/specialix_sxctl". It needs
+misc major 10, and minor number 167 (assigned by HPA). The section
+on creating device files below also creates this device.
+
+After loading the sx.o module into your kernel, the driver will report
+the number of cards detected, but because it doesn't have any
+firmware, it will not be able to determine the number of ports. Only
+when you then run "sx_firmware" will the firmware be downloaded and
+the rest of the driver initialized. At that time the sx_firmware
+program will report the number of ports installed.
+
+In contrast with many other multi port serial cards, some of the data
+structures are only allocated when the card knows the number of ports
+that are connected. This means we won't waste memory for 120 port
+descriptor structures when you only have 8 ports. If you experience
+problems due to this, please report them: I haven't seen any.
+
+
+Interrupts
+==========
+
+A multi port serial card, would generate a horrendous amount of
+interrupts if it would interrupt the CPU for every received
+character. Even more than 10 years ago, the trick not to use
+interrupts but to poll the serial cards was invented.
+
+The SX card allow us to do this two ways. First the card limits its
+own interrupt rate to a rate that won't overwhelm the CPU. Secondly,
+we could forget about the cards interrupt completely and use the
+internal timer for this purpose.
+
+Polling the card can take up to a few percent of your CPU. Using the
+interrupts would be better if you have most of the ports idle. Using
+timer-based polling is better if your card almost always has work to
+do. You save the separate interrupt in that case.
+
+In any case, it doesn't really matter all that much.
+
+The most common problem with interrupts is that for ISA cards in a PCI
+system the BIOS has to be told to configure that interrupt as "legacy
+ISA". Otherwise the card can pull on the interrupt line all it wants
+but the CPU won't see this.
+
+If you can't get the interrupt to work, remember that polling mode is
+more efficient (provided you actually use the card intensively).
+
+
+Allowed Configurations
+======================
+
+Some configurations are disallowed. Even though at a glance they might
+seem to work, they are known to lockup the bus between the host card
+and the device concentrators. You should respect the drivers decision
+not to support certain configurations. It's there for a reason.
+
+Warning: Seriously technical stuff ahead. Executive summary: Don't use
+SX cards except configured at a 64k boundary. Skip the next paragraph.
+
+The SX cards can theoretically be placed at a 32k boundary. So for
+instance you can put an SX card at 0xc8000-0xd7fff. This is not a
+"recommended configuration". ISA cards have to tell the bus controller
+how they like their timing. Due to timing issues they have to do this
+based on which 64k window the address falls into. This means that the
+32k window below and above the SX card have to use exactly the same
+timing as the SX card. That reportedly works for other SX cards. But
+you're still left with two useless 32k windows that should not be used
+by anybody else.
+
+
+Configuring the driver
+======================
+
+PCI cards are always detected. The driver auto-probes for ISA cards at
+some sensible addresses. Please report if the auto-probe causes trouble
+in your system, or when a card isn't detected.
+
+I'm afraid I haven't implemented "kernel command line parameters" yet.
+This means that if the default doesn't work for you, you shouldn't use
+the compiled-into-the-kernel version of the driver. Use a module
+instead. If you convince me that you need this, I'll make it for
+you. Deal?
+
+I'm afraid that the module parameters are a bit clumsy. If you have a
+better idea, please tell me.
+
+You can specify several parameters:
+
+ sx_poll: number of jiffies between timer-based polls.
+
+ Set this to "0" to disable timer based polls.
+ Initialization of cards without a working interrupt
+ will fail.
+
+ Set this to "1" if you want a polling driver.
+ (on Intel: 100 polls per second). If you don't use
+ fast baud rates, you might consider a value like "5".
+ (If you don't know how to do the math, use 1).
+
+ sx_slowpoll: Number of jiffies between timer-based polls.
+ Set this to "100" to poll once a second.
+ This should get the card out of a stall if the driver
+ ever misses an interrupt. I've never seen this happen,
+ and if it does, that's a bug. Tell me.
+
+ sx_maxints: Number of interrupts to request from the card.
+ The card normally limits interrupts to about 100 per
+ second to offload the host CPU. You can increase this
+ number to reduce latency on the card a little.
+ Note that if you give a very high number you can overload
+ your CPU as well as the CPU on the host card. This setting
+ is inaccurate and not recommended for SI cards (But it
+ works).
+
+ sx_irqmask: The mask of allowable IRQs to use. I suggest you set
+ this to 0 (disable IRQs all together) and use polling if
+ the assignment of IRQs becomes problematic. This is defined
+ as the sum of (1 << irq) 's that you want to allow. So
+ sx_irqmask of 8 (1 << 3) specifies that only irq 3 may
+ be used by the SX driver. If you want to specify to the
+ driver: "Either irq 11 or 12 is ok for you to use", then
+ specify (1 << 11) | (1 << 12) = 0x1800 .
+
+ sx_debug: You can enable different sorts of debug traces with this.
+ At "-1" all debugging traces are active. You'll get several
+ times more debugging output than you'll get characters
+ transmitted.
+
+
+Baud rates
+==========
+
+Theoretically new SXDCs should be capable of more than 460k
+baud. However the line drivers usually give up before that. Also the
+CPU on the card may not be able to handle 8 channels going at full
+blast at that speed. Moreover, the buffers are not large enough to
+allow operation with 100 interrupts per second. You'll have to realize
+that the card has a 256 byte buffer, so you'll have to increase the
+number of interrupts per second if you have more than 256*100 bytes
+per second to transmit. If you do any performance testing in this
+area, I'd be glad to hear from you...
+
+(Psst Linux users..... I think the Linux driver is more efficient than
+the driver for other OSes. If you can and want to benchmark them
+against each other, be my guest, and report your findings...... :-)
+
+
+Ports and devices
+=================
+
+Port 0 is the top connector on the module closest to the host
+card. Oh, the ports on the SXDCs and TAs are labelled from 1 to 8
+instead of from 0 to 7, as they are numbered by linux. I'm stubborn in
+this: I know for sure that I wouldn't be able to calculate which port
+is which anymore if I would change that....
+
+
+Devices:
+
+You should make the device files as follows:
+
+#!/bin/sh
+# (I recommend that you cut-and-paste this into a file and run that)
+cd /dev
+t=0
+mknod specialix_sxctl c 10 167
+while [ $t -lt 64 ]
+ do
+ echo -n "$t "
+ mknod ttyX$t c 32 $t
+ mknod cux$t c 33 $t
+ t=`expr $t + 1`
+done
+echo ""
+rm /etc/psdevtab
+ps > /dev/null
+
+
+This creates 64 devices. If you have more, increase the constant on
+the line with "while". The devices start at 0, as is customary on
+Linux. Specialix seems to like starting the numbering at 1.
+
+If your system doesn't come with these devices pre-installed, bug your
+linux-vendor about this. They should have these devices
+"pre-installed" before the new millennium. The "ps" stuff at the end
+is to "tell" ps that the new devices exist.
+
+Officially the maximum number of cards per computer is 4. This driver
+however supports as many cards in one machine as you want. You'll run
+out of interrupts after a few, but you can switch to polled operation
+then. At about 256 ports (More than 8 cards), we run out of minor
+device numbers. Sorry. I suggest you buy a second computer.... (Or
+switch to RIO).
+
+------------------------------------------------------------------------
+
+
+ Fixed bugs and restrictions:
+ - Hangup processing.
+ -- Done.
+
+ - the write path in generic_serial (lockup / oops).
+ -- Done (Ugly: not the way I want it. Copied from serial.c).
+
+ - write buffer isn't flushed at close.
+ -- Done. I still seem to lose a few chars at close.
+ Sorry. I think that this is a firmware issue. (-> Specialix)
+
+ - drain hardware before changing termios
+ - Change debug on the fly.
+ - ISA free irq -1. (no firmware loaded).
+ - adding c8000 as a probe address. Added warning.
+ - Add a RAMtest for the RAM on the card.c
+ - Crash when opening a port "way" of the number of allowed ports.
+ (for example opening port 60 when there are only 24 ports attached)
+ - Sometimes the use-count strays a bit. After a few hours of
+ testing the use count is sometimes "3". If you are not like
+ me and can remember what you did to get it that way, I'd
+ appreciate an Email. Possibly fixed. Tell me if anyone still
+ sees this.
+ - TAs don't work right if you don't connect all the modem control
+ signals. SXDCs do. T225 firmware problem -> Specialix.
+ (Mostly fixed now, I think. Tell me if you encounter this!)
+
+ Bugs & restrictions:
+
+ - Arbitrary baud rates. Requires firmware update. (-> Specialix)
+
+ - Low latency (mostly firmware, -> Specialix)
+
+
+
diff --git a/Documentation/serial/tty.txt b/Documentation/serial/tty.txt
new file mode 100644
index 0000000..8e65c44
--- /dev/null
+++ b/Documentation/serial/tty.txt
@@ -0,0 +1,292 @@
+
+ The Lockronomicon
+
+Your guide to the ancient and twisted locking policies of the tty layer and
+the warped logic behind them. Beware all ye who read on.
+
+FIXME: still need to work out the full set of BKL assumptions and document
+them so they can eventually be killed off.
+
+
+Line Discipline
+---------------
+
+Line disciplines are registered with tty_register_ldisc() passing the
+discipline number and the ldisc structure. At the point of registration the
+discipline must be ready to use and it is possible it will get used before
+the call returns success. If the call returns an error then it won't get
+called. Do not re-use ldisc numbers as they are part of the userspace ABI
+and writing over an existing ldisc will cause demons to eat your computer.
+After the return the ldisc data has been copied so you may free your own
+copy of the structure. You must not re-register over the top of the line
+discipline even with the same data or your computer again will be eaten by
+demons.
+
+In order to remove a line discipline call tty_unregister_ldisc().
+In ancient times this always worked. In modern times the function will
+return -EBUSY if the ldisc is currently in use. Since the ldisc referencing
+code manages the module counts this should not usually be a concern.
+
+Heed this warning: the reference count field of the registered copies of the
+tty_ldisc structure in the ldisc table counts the number of lines using this
+discipline. The reference count of the tty_ldisc structure within a tty
+counts the number of active users of the ldisc at this instant. In effect it
+counts the number of threads of execution within an ldisc method (plus those
+about to enter and exit although this detail matters not).
+
+Line Discipline Methods
+-----------------------
+
+TTY side interfaces:
+
+open() - Called when the line discipline is attached to
+ the terminal. No other call into the line
+ discipline for this tty will occur until it
+ completes successfully. Can sleep.
+
+close() - This is called on a terminal when the line
+ discipline is being unplugged. At the point of
+ execution no further users will enter the
+ ldisc code for this tty. Can sleep.
+
+hangup() - Called when the tty line is hung up.
+ The line discipline should cease I/O to the tty.
+ No further calls into the ldisc code will occur.
+ Can sleep.
+
+write() - A process is writing data through the line
+ discipline. Multiple write calls are serialized
+ by the tty layer for the ldisc. May sleep.
+
+flush_buffer() - (optional) May be called at any point between
+ open and close, and instructs the line discipline
+ to empty its input buffer.
+
+chars_in_buffer() - (optional) Report the number of bytes in the input
+ buffer.
+
+set_termios() - (optional) Called on termios structure changes.
+ The caller passes the old termios data and the
+ current data is in the tty. Called under the
+ termios semaphore so allowed to sleep. Serialized
+ against itself only.
+
+read() - Move data from the line discipline to the user.
+ Multiple read calls may occur in parallel and the
+ ldisc must deal with serialization issues. May
+ sleep.
+
+poll() - Check the status for the poll/select calls. Multiple
+ poll calls may occur in parallel. May sleep.
+
+ioctl() - Called when an ioctl is handed to the tty layer
+ that might be for the ldisc. Multiple ioctl calls
+ may occur in parallel. May sleep.
+
+Driver Side Interfaces:
+
+receive_buf() - Hand buffers of bytes from the driver to the ldisc
+ for processing. Semantics currently rather
+ mysterious 8(
+
+write_wakeup() - May be called at any point between open and close.
+ The TTY_DO_WRITE_WAKEUP flag indicates if a call
+ is needed but always races versus calls. Thus the
+ ldisc must be careful about setting order and to
+ handle unexpected calls. Must not sleep.
+
+ The driver is forbidden from calling this directly
+ from the ->write call from the ldisc as the ldisc
+ is permitted to call the driver write method from
+ this function. In such a situation defer it.
+
+
+Driver Access
+
+Line discipline methods can call the following methods of the underlying
+hardware driver through the function pointers within the tty->driver
+structure:
+
+write() Write a block of characters to the tty device.
+ Returns the number of characters accepted. The
+ character buffer passed to this method is already
+ in kernel space.
+
+put_char() Queues a character for writing to the tty device.
+ If there is no room in the queue, the character is
+ ignored.
+
+flush_chars() (Optional) If defined, must be called after
+ queueing characters with put_char() in order to
+ start transmission.
+
+write_room() Returns the numbers of characters the tty driver
+ will accept for queueing to be written.
+
+ioctl() Invoke device specific ioctl.
+ Expects data pointers to refer to userspace.
+ Returns ENOIOCTLCMD for unrecognized ioctl numbers.
+
+set_termios() Notify the tty driver that the device's termios
+ settings have changed. New settings are in
+ tty->termios. Previous settings should be passed in
+ the "old" argument.
+
+ The API is defined such that the driver should return
+ the actual modes selected. This means that the
+ driver function is responsible for modifying any
+ bits in the request it cannot fulfill to indicate
+ the actual modes being used. A device with no
+ hardware capability for change (eg a USB dongle or
+ virtual port) can provide NULL for this method.
+
+throttle() Notify the tty driver that input buffers for the
+ line discipline are close to full, and it should
+ somehow signal that no more characters should be
+ sent to the tty.
+
+unthrottle() Notify the tty driver that characters can now be
+ sent to the tty without fear of overrunning the
+ input buffers of the line disciplines.
+
+stop() Ask the tty driver to stop outputting characters
+ to the tty device.
+
+start() Ask the tty driver to resume sending characters
+ to the tty device.
+
+hangup() Ask the tty driver to hang up the tty device.
+
+break_ctl() (Optional) Ask the tty driver to turn on or off
+ BREAK status on the RS-232 port. If state is -1,
+ then the BREAK status should be turned on; if
+ state is 0, then BREAK should be turned off.
+ If this routine is not implemented, use ioctls
+ TIOCSBRK / TIOCCBRK instead.
+
+wait_until_sent() Waits until the device has written out all of the
+ characters in its transmitter FIFO.
+
+send_xchar() Send a high-priority XON/XOFF character to the device.
+
+
+Flags
+
+Line discipline methods have access to tty->flags field containing the
+following interesting flags:
+
+TTY_THROTTLED Driver input is throttled. The ldisc should call
+ tty->driver->unthrottle() in order to resume
+ reception when it is ready to process more data.
+
+TTY_DO_WRITE_WAKEUP If set, causes the driver to call the ldisc's
+ write_wakeup() method in order to resume
+ transmission when it can accept more data
+ to transmit.
+
+TTY_IO_ERROR If set, causes all subsequent userspace read/write
+ calls on the tty to fail, returning -EIO.
+
+TTY_OTHER_CLOSED Device is a pty and the other side has closed.
+
+TTY_NO_WRITE_SPLIT Prevent driver from splitting up writes into
+ smaller chunks.
+
+
+Locking
+
+Callers to the line discipline functions from the tty layer are required to
+take line discipline locks. The same is true of calls from the driver side
+but not yet enforced.
+
+Three calls are now provided
+
+ ldisc = tty_ldisc_ref(tty);
+
+takes a handle to the line discipline in the tty and returns it. If no ldisc
+is currently attached or the ldisc is being closed and re-opened at this
+point then NULL is returned. While this handle is held the ldisc will not
+change or go away.
+
+ tty_ldisc_deref(ldisc)
+
+Returns the ldisc reference and allows the ldisc to be closed. Returning the
+reference takes away your right to call the ldisc functions until you take
+a new reference.
+
+ ldisc = tty_ldisc_ref_wait(tty);
+
+Performs the same function as tty_ldisc_ref except that it will wait for an
+ldisc change to complete and then return a reference to the new ldisc.
+
+While these functions are slightly slower than the old code they should have
+minimal impact as most receive logic uses the flip buffers and they only
+need to take a reference when they push bits up through the driver.
+
+A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc
+functions are called with the ldisc unavailable. Thus tty_ldisc_ref will
+fail in this situation if used within these functions. Ldisc and driver
+code calling its own functions must be careful in this case.
+
+
+Driver Interface
+----------------
+
+open() - Called when a device is opened. May sleep
+
+close() - Called when a device is closed. At the point of
+ return from this call the driver must make no
+ further ldisc calls of any kind. May sleep
+
+write() - Called to write bytes to the device. May not
+ sleep. May occur in parallel in special cases.
+ Because this includes panic paths drivers generally
+ shouldn't try and do clever locking here.
+
+put_char() - Stuff a single character onto the queue. The
+ driver is guaranteed following up calls to
+ flush_chars.
+
+flush_chars() - Ask the kernel to write put_char queue
+
+write_room() - Return the number of characters tht can be stuffed
+ into the port buffers without overflow (or less).
+ The ldisc is responsible for being intelligent
+ about multi-threading of write_room/write calls
+
+ioctl() - Called when an ioctl may be for the driver
+
+set_termios() - Called on termios change, serialized against
+ itself by a semaphore. May sleep.
+
+set_ldisc() - Notifier for discipline change. At the point this
+ is done the discipline is not yet usable. Can now
+ sleep (I think)
+
+throttle() - Called by the ldisc to ask the driver to do flow
+ control. Serialization including with unthrottle
+ is the job of the ldisc layer.
+
+unthrottle() - Called by the ldisc to ask the driver to stop flow
+ control.
+
+stop() - Ldisc notifier to the driver to stop output. As with
+ throttle the serializations with start() are down
+ to the ldisc layer.
+
+start() - Ldisc notifier to the driver to start output.
+
+hangup() - Ask the tty driver to cause a hangup initiated
+ from the host side. [Can sleep ??]
+
+break_ctl() - Send RS232 break. Can sleep. Can get called in
+ parallel, driver must serialize (for now), and
+ with write calls.
+
+wait_until_sent() - Wait for characters to exit the hardware queue
+ of the driver. Can sleep
+
+send_xchar() - Send XON/XOFF and if possible jump the queue with
+ it in order to get fast flow control responses.
+ Cannot sleep ??
+
diff --git a/Documentation/sgi-ioc4.txt b/Documentation/sgi-ioc4.txt
new file mode 100644
index 0000000..876c96a
--- /dev/null
+++ b/Documentation/sgi-ioc4.txt
@@ -0,0 +1,45 @@
+The SGI IOC4 PCI device is a bit of a strange beast, so some notes on
+it are in order.
+
+First, even though the IOC4 performs multiple functions, such as an
+IDE controller, a serial controller, a PS/2 keyboard/mouse controller,
+and an external interrupt mechanism, it's not implemented as a
+multifunction device. The consequence of this from a software
+standpoint is that all these functions share a single IRQ, and
+they can't all register to own the same PCI device ID. To make
+matters a bit worse, some of the register blocks (and even registers
+themselves) present in IOC4 are mixed-purpose between these several
+functions, meaning that there's no clear "owning" device driver.
+
+The solution is to organize the IOC4 driver into several independent
+drivers, "ioc4", "sgiioc4", and "ioc4_serial". Note that there is no
+PS/2 controller driver as this functionality has never been wired up
+on a shipping IO card.
+
+ioc4
+====
+This is the core (or shim) driver for IOC4. It is responsible for
+initializing the basic functionality of the chip, and allocating
+the PCI resources that are shared between the IOC4 functions.
+
+This driver also provides registration functions that the other
+IOC4 drivers can call to make their presence known. Each driver
+needs to provide a probe and remove function, which are invoked
+by the core driver at appropriate times. The interface of these
+IOC4 function probe and remove operations isn't precisely the same
+as PCI device probe and remove operations, but is logically the
+same operation.
+
+sgiioc4
+=======
+This is the IDE driver for IOC4. Its name isn't very descriptive
+simply for historical reasons (it used to be the only IOC4 driver
+component). There's not much to say about it other than it hooks
+up to the ioc4 driver via the appropriate registration, probe, and
+remove functions.
+
+ioc4_serial
+===========
+This is the serial driver for IOC4. There's not much to say about it
+other than it hooks up to the ioc4 driver via the appropriate registration,
+probe, and remove functions.
diff --git a/Documentation/sgi-visws.txt b/Documentation/sgi-visws.txt
new file mode 100644
index 0000000..7ff0811
--- /dev/null
+++ b/Documentation/sgi-visws.txt
@@ -0,0 +1,13 @@
+
+The SGI Visual Workstations (models 320 and 540) are based around
+the Cobalt, Lithium, and Arsenic ASICs. The Cobalt ASIC is the
+main system ASIC which interfaces the 1-4 IA32 cpus, the memory
+system, and the I/O system in the Lithium ASIC. The Cobalt ASIC
+also contains the 3D gfx rendering engine which renders to main
+system memory -- part of which is used as the frame buffer which
+is DMA'ed to a video connector using the Arsenic ASIC. A PIIX4
+chip and NS87307 are used to provide legacy device support (IDE,
+serial, floppy, and parallel).
+
+The Visual Workstation chipset largely conforms to the PC architecture
+with some notable exceptions such as interrupt handling.
diff --git a/Documentation/sh/clk.txt b/Documentation/sh/clk.txt
new file mode 100644
index 0000000..114b595
--- /dev/null
+++ b/Documentation/sh/clk.txt
@@ -0,0 +1,32 @@
+Clock framework on SuperH architecture
+
+The framework on SH extends existing API by the function clk_set_rate_ex,
+which prototype is as follows:
+
+ clk_set_rate_ex (struct clk *clk, unsigned long rate, int algo_id)
+
+The algo_id parameter is used to specify algorithm used to recalculate clocks,
+adjanced to clock, specified as first argument. It is assumed that algo_id==0
+means no changes to adjanced clock
+
+Internally, the clk_set_rate_ex forwards request to clk->ops->set_rate method,
+if it is present in ops structure. The method should set the clock rate and adjust
+all needed clocks according to the passed algo_id.
+Exact values for algo_id are machine-dependent. For the sh7722, the following
+values are defined:
+
+ NO_CHANGE = 0,
+ IUS_N1_N1, /* I:U = N:1, U:Sh = N:1 */
+ IUS_322, /* I:U:Sh = 3:2:2 */
+ IUS_522, /* I:U:Sh = 5:2:2 */
+ IUS_N11, /* I:U:Sh = N:1:1 */
+ SB_N1, /* Sh:B = N:1 */
+ SB3_N1, /* Sh:B3 = N:1 */
+ SB3_32, /* Sh:B3 = 3:2 */
+ SB3_43, /* Sh:B3 = 4:3 */
+ SB3_54, /* Sh:B3 = 5:4 */
+ BP_N1, /* B:P = N:1 */
+ IP_N1 /* I:P = N:1 */
+
+Each of these constants means relation between clocks that can be set via the FRQCR
+register
diff --git a/Documentation/sh/kgdb.txt b/Documentation/sh/kgdb.txt
new file mode 100644
index 0000000..05b4ba8
--- /dev/null
+++ b/Documentation/sh/kgdb.txt
@@ -0,0 +1,179 @@
+
+This file describes the configuration and behavior of KGDB for the SH
+kernel. Based on a description from Henry Bell <henry.bell@st.com>, it
+has been modified to account for quirks in the current implementation.
+
+Version
+=======
+
+This version of KGDB was written for 2.4.xx kernels for the SH architecture.
+Further documentation is available from the linux-sh project website.
+
+
+Debugging Setup: Host
+======================
+
+The two machines will be connected together via a serial line - this
+should be a null modem cable i.e. with a twist.
+
+On your DEVELOPMENT machine, go to your kernel source directory and
+build the kernel, enabling KGDB support in the "kernel hacking" section.
+This includes the KGDB code, and also makes the kernel be compiled with
+the "-g" option set -- necessary for debugging.
+
+To install this new kernel, use the following installation procedure.
+
+Decide on which tty port you want the machines to communicate, then
+cable them up back-to-back using the null modem. On the DEVELOPMENT
+machine, you may wish to create an initialization file called .gdbinit
+(in the kernel source directory or in your home directory) to execute
+commonly-used commands at startup.
+
+A minimal .gdbinit might look like this:
+
+ file vmlinux
+ set remotebaud 115200
+ target remote /dev/ttyS0
+
+Change the "target" definition so that it specifies the tty port that
+you intend to use. Change the "remotebaud" definition to match the
+data rate that you are going to use for the com line (115200 is the
+default).
+
+Debugging Setup: Target
+========================
+
+By default, the KGDB stub will communicate with the host GDB using
+ttySC1 at 115200 baud, 8 databits, no parity; these defaults can be
+changed in the kernel configuration. As the kernel starts up, KGDB will
+initialize so that breakpoints, kernel segfaults, and so forth will
+generally enter the debugger.
+
+This behavior can be modified by including the "kgdb" option in the
+kernel command line; this option has the general form:
+
+ kgdb=<ttyspec>,<action>
+
+The <ttyspec> indicates the port to use, and can optionally specify
+baud, parity and databits -- e.g. "ttySC0,9600N8" or "ttySC1,19200".
+
+The <action> can be "halt" or "disabled". The "halt" action enters the
+debugger via a breakpoint as soon as kgdb is initialized; the "disabled"
+action causes kgdb to ignore kernel segfaults and such until explicitly
+entered by a breakpoint in the code or by external action (sysrq or NMI).
+
+(Both <ttyspec> and <action> can appear alone, w/o the separating comma.)
+
+For example, if you wish to debug early in kernel startup code, you
+might specify the halt option:
+
+ kgdb=halt
+
+Boot the TARGET machine, which will appear to hang.
+
+On your DEVELOPMENT machine, cd to the source directory and run the gdb
+program. (This is likely to be a cross GDB which runs on your host but
+is built for an SH target.) If everything is working correctly you
+should see gdb print out a few lines indicating that a breakpoint has
+been taken. It will actually show a line of code in the target kernel
+inside the gdbstub activation code.
+
+NOTE: BE SURE TO TERMINATE OR SUSPEND any other host application which
+may be using the same serial port (for example, a terminal emulator you
+have been using to connect to the target boot code.) Otherwise, data
+from the target may not all get to GDB!
+
+You can now use whatever gdb commands you like to set breakpoints.
+Enter "continue" to start your target machine executing again. At this
+point the target system will run at full speed until it encounters
+your breakpoint or gets a segment violation in the kernel, or whatever.
+
+Serial Ports: KGDB, Console
+============================
+
+This version of KGDB may not gracefully handle conflict with other
+drivers in the kernel using the same port. If KGDB is configured on the
+same port (and with the same parameters) as the kernel console, or if
+CONFIG_SH_KGDB_CONSOLE is configured, things should be fine (though in
+some cases console messages may appear twice through GDB). But if the
+KGDB port is not the kernel console and used by another serial driver
+which assumes different serial parameters (e.g. baud rate) KGDB may not
+recover.
+
+Also, when KGDB is entered via sysrq-g (requires CONFIG_KGDB_SYSRQ) and
+the kgdb port uses the same port as the console, detaching GDB will not
+restore the console to working order without the port being re-opened.
+
+Another serious consequence of this is that GDB currently CANNOT break
+into KGDB externally (e.g. via ^C or <BREAK>); unless a breakpoint or
+error is encountered, the only way to enter KGDB after the initial halt
+(see above) is via NMI (CONFIG_KGDB_NMI) or sysrq-g (CONFIG_KGDB_SYSRQ).
+
+Code is included for the basic Hitachi Solution Engine boards to allow
+the use of ttyS0 for KGDB if desired; this is less robust, but may be
+useful in some cases. (This cannot be selected using the config file,
+but only through the kernel command line, e.g. "kgdb=ttyS0", though the
+configured defaults for baud rate etc. still apply if not overridden.)
+
+If gdbstub Does Not Work
+========================
+
+If it doesn't work, you will have to troubleshoot it. Do the easy
+things first like double checking your cabling and data rates. You
+might try some non-kernel based programs to see if the back-to-back
+connection works properly. Just something simple like cat /etc/hosts
+/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you
+if you can send data from one machine to the other. There is no point
+in tearing out your hair in the kernel if the line doesn't work.
+
+If you need to debug the GDB/KGDB communication itself, the gdb commands
+"set debug remote 1" and "set debug serial 1" may be useful, but be
+warned: they produce a lot of output.
+
+Threads
+=======
+
+Each process in a target machine is seen as a gdb thread. gdb thread related
+commands (info threads, thread n) can be used. CONFIG_KGDB_THREAD must
+be defined for this to work.
+
+In this version, kgdb reports PID_MAX (32768) as the process ID for the
+idle process (pid 0), since GDB does not accept 0 as an ID.
+
+Detaching (exiting KGDB)
+=========================
+
+There are two ways to resume full-speed target execution: "continue" and
+"detach". With "continue", GDB inserts any specified breakpoints in the
+target code and resumes execution; the target is still in "gdb mode".
+If a breakpoint or other debug event (e.g. NMI) happens, the target
+halts and communicates with GDB again, which is waiting for it.
+
+With "detach", GDB does *not* insert any breakpoints; target execution
+is resumed and GDB stops communicating (does not wait for the target).
+In this case, the target is no longer in "gdb mode" -- for example,
+console messages no longer get sent separately to the KGDB port, or
+encapsulated for GDB. If a debug event (e.g. NMI) occurs, the target
+will re-enter "gdb mode" and will display this fact on the console; you
+must give a new "target remote" command to gdb.
+
+NOTE: TO AVOID LOSSING CONSOLE MESSAGES IN CASE THE KERNEL CONSOLE AND
+KGDB USING THE SAME PORT, THE TARGET WAITS FOR ANY INPUT CHARACTER ON
+THE KGDB PORT AFTER A DETACH COMMAND. For example, after the detach you
+could start a terminal emulator on the same host port and enter a <cr>;
+however, this program must then be terminated or suspended in order to
+use GBD again if KGDB is re-entered.
+
+
+Acknowledgements
+================
+
+This code was mostly generated by Henry Bell <henry.bell@st.com>;
+largely from KGDB by Amit S. Kale <akale@veritas.com> - extracts from
+code by Glenn Engel, Jim Kingdon, David Grothe <dave@gcom.com>, Tigran
+Aivazian <tigran@sco.com>, William Gatliff <bgat@open-widgets.com>, Ben
+Lee, Steve Chamberlain and Benoit Miller <fulg@iname.com> are also
+included.
+
+Jeremy Siegel
+<jsiegel@mvista.com>
diff --git a/Documentation/sh/new-machine.txt b/Documentation/sh/new-machine.txt
new file mode 100644
index 0000000..f035416
--- /dev/null
+++ b/Documentation/sh/new-machine.txt
@@ -0,0 +1,278 @@
+
+ Adding a new board to LinuxSH
+ ================================
+
+ Paul Mundt <lethal@linux-sh.org>
+
+This document attempts to outline what steps are necessary to add support
+for new boards to the LinuxSH port under the new 2.5 and 2.6 kernels. This
+also attempts to outline some of the noticeable changes between the 2.4
+and the 2.5/2.6 SH backend.
+
+1. New Directory Structure
+==========================
+
+The first thing to note is the new directory structure. Under 2.4, most
+of the board-specific code (with the exception of stboards) ended up
+in arch/sh/kernel/ directly, with board-specific headers ending up in
+include/asm-sh/. For the new kernel, things are broken out by board type,
+companion chip type, and CPU type. Looking at a tree view of this directory
+hierarchy looks like the following:
+
+Board-specific code:
+
+.
+|-- arch
+| `-- sh
+| `-- boards
+| |-- adx
+| | `-- board-specific files
+| |-- bigsur
+| | `-- board-specific files
+| |
+| ... more boards here ...
+|
+`-- include
+ `-- asm-sh
+ |-- adx
+ | `-- board-specific headers
+ |-- bigsur
+ | `-- board-specific headers
+ |
+ .. more boards here ...
+
+Next, for companion chips:
+.
+`-- arch
+ `-- sh
+ `-- cchips
+ `-- hd6446x
+ `-- hd64461
+ `-- cchip-specific files
+
+... and so on. Headers for the companion chips are treated the same way as
+board-specific headers. Thus, include/asm-sh/hd64461 is home to all of the
+hd64461-specific headers.
+
+Finally, CPU family support is also abstracted:
+.
+|-- arch
+| `-- sh
+| |-- kernel
+| | `-- cpu
+| | |-- sh2
+| | | `-- SH-2 generic files
+| | |-- sh3
+| | | `-- SH-3 generic files
+| | `-- sh4
+| | `-- SH-4 generic files
+| `-- mm
+| `-- This is also broken out per CPU family, so each family can
+| have their own set of cache/tlb functions.
+|
+`-- include
+ `-- asm-sh
+ |-- cpu-sh2
+ | `-- SH-2 specific headers
+ |-- cpu-sh3
+ | `-- SH-3 specific headers
+ `-- cpu-sh4
+ `-- SH-4 specific headers
+
+It should be noted that CPU subtypes are _not_ abstracted. Thus, these still
+need to be dealt with by the CPU family specific code.
+
+2. Adding a New Board
+=====================
+
+The first thing to determine is whether the board you are adding will be
+isolated, or whether it will be part of a family of boards that can mostly
+share the same board-specific code with minor differences.
+
+In the first case, this is just a matter of making a directory for your
+board in arch/sh/boards/ and adding rules to hook your board in with the
+build system (more on this in the next section). However, for board families
+it makes more sense to have a common top-level arch/sh/boards/ directory
+and then populate that with sub-directories for each member of the family.
+Both the Solution Engine and the hp6xx boards are an example of this.
+
+After you have setup your new arch/sh/boards/ directory, remember that you
+should also add a directory in include/asm-sh for headers localized to this
+board (if there are going to be more than one). In order to interoperate
+seamlessly with the build system, it's best to have this directory the same
+as the arch/sh/boards/ directory name, though if your board is again part of
+a family, the build system has ways of dealing with this (via incdir-y
+overloading), and you can feel free to name the directory after the family
+member itself.
+
+There are a few things that each board is required to have, both in the
+arch/sh/boards and the include/asm-sh/ hierarchy. In order to better
+explain this, we use some examples for adding an imaginary board. For
+setup code, we're required at the very least to provide definitions for
+get_system_type() and platform_setup(). For our imaginary board, this
+might look something like:
+
+/*
+ * arch/sh/boards/vapor/setup.c - Setup code for imaginary board
+ */
+#include <linux/init.h>
+#include <asm/rtc.h> /* for board_time_init() */
+
+const char *get_system_type(void)
+{
+ return "FooTech Vaporboard";
+}
+
+int __init platform_setup(void)
+{
+ /*
+ * If our hardware actually existed, we would do real
+ * setup here. Though it's also sane to leave this empty
+ * if there's no real init work that has to be done for
+ * this board.
+ */
+
+ /*
+ * Presume all FooTech boards have the same broken timer,
+ * and also presume that we've defined foo_timer_init to
+ * do something useful.
+ */
+ board_time_init = foo_timer_init;
+
+ /* Start-up imaginary PCI ... */
+
+ /* And whatever else ... */
+
+ return 0;
+}
+
+Our new imaginary board will also have to tie into the machvec in order for it
+to be of any use.
+
+machvec functions fall into a number of categories:
+
+ - I/O functions to IO memory (inb etc) and PCI/main memory (readb etc).
+ - I/O mapping functions (ioport_map, ioport_unmap, etc).
+ - a 'heartbeat' function.
+ - PCI and IRQ initialization routines.
+ - Consistent allocators (for boards that need special allocators,
+ particularly for allocating out of some board-specific SRAM for DMA
+ handles).
+
+There are machvec functions added and removed over time, so always be sure to
+consult include/asm-sh/machvec.h for the current state of the machvec.
+
+The kernel will automatically wrap in generic routines for undefined function
+pointers in the machvec at boot time, as machvec functions are referenced
+unconditionally throughout most of the tree. Some boards have incredibly
+sparse machvecs (such as the dreamcast and sh03), whereas others must define
+virtually everything (rts7751r2d).
+
+Adding a new machine is relatively trivial (using vapor as an example):
+
+If the board-specific definitions are quite minimalistic, as is the case for
+the vast majority of boards, simply having a single board-specific header is
+sufficient.
+
+ - add a new file include/asm-sh/vapor.h which contains prototypes for
+ any machine specific IO functions prefixed with the machine name, for
+ example vapor_inb. These will be needed when filling out the machine
+ vector.
+
+ Note that these prototypes are generated automatically by setting
+ __IO_PREFIX to something sensible. A typical example would be:
+
+ #define __IO_PREFIX vapor
+ #include <asm/io_generic.h>
+
+ somewhere in the board-specific header. Any boards being ported that still
+ have a legacy io.h should remove it entirely and switch to the new model.
+
+ - Add machine vector definitions to the board's setup.c. At a bare minimum,
+ this must be defined as something like:
+
+ struct sh_machine_vector mv_vapor __initmv = {
+ .mv_name = "vapor",
+ };
+ ALIAS_MV(vapor)
+
+ - finally add a file arch/sh/boards/vapor/io.c, which contains definitions of
+ the machine specific io functions (if there are enough to warrant it).
+
+3. Hooking into the Build System
+================================
+
+Now that we have the corresponding directories setup, and all of the
+board-specific code is in place, it's time to look at how to get the
+whole mess to fit into the build system.
+
+Large portions of the build system are now entirely dynamic, and merely
+require the proper entry here and there in order to get things done.
+
+The first thing to do is to add an entry to arch/sh/Kconfig, under the
+"System type" menu:
+
+config SH_VAPOR
+ bool "Vapor"
+ help
+ select Vapor if configuring for a FooTech Vaporboard.
+
+next, this has to be added into arch/sh/Makefile. All boards require a
+machdir-y entry in order to be built. This entry needs to be the name of
+the board directory as it appears in arch/sh/boards, even if it is in a
+sub-directory (in which case, all parent directories below arch/sh/boards/
+need to be listed). For our new board, this entry can look like:
+
+machdir-$(CONFIG_SH_VAPOR) += vapor
+
+provided that we've placed everything in the arch/sh/boards/vapor/ directory.
+
+Next, the build system assumes that your include/asm-sh directory will also
+be named the same. If this is not the case (as is the case with multiple
+boards belonging to a common family), then the directory name needs to be
+implicitly appended to incdir-y. The existing code manages this for the
+Solution Engine and hp6xx boards, so see these for an example.
+
+Once that is taken care of, it's time to add an entry for the mach type.
+This is done by adding an entry to the end of the arch/sh/tools/mach-types
+list. The method for doing this is self explanatory, and so we won't waste
+space restating it here. After this is done, you will be able to use
+implicit checks for your board if you need this somewhere throughout the
+common code, such as:
+
+ /* Make sure we're on the FooTech Vaporboard */
+ if (!mach_is_vapor())
+ return -ENODEV;
+
+also note that the mach_is_boardname() check will be implicitly forced to
+lowercase, regardless of the fact that the mach-types entries are all
+uppercase. You can read the script if you really care, but it's pretty ugly,
+so you probably don't want to do that.
+
+Now all that's left to do is providing a defconfig for your new board. This
+way, other people who end up with this board can simply use this config
+for reference instead of trying to guess what settings are supposed to be
+used on it.
+
+Also, as soon as you have copied over a sample .config for your new board
+(assume arch/sh/configs/vapor_defconfig), you can also use this directly as a
+build target, and it will be implicitly listed as such in the help text.
+
+Looking at the 'make help' output, you should now see something like:
+
+Architecture specific targets (sh):
+ zImage - Compressed kernel image (arch/sh/boot/zImage)
+ adx_defconfig - Build for adx
+ cqreek_defconfig - Build for cqreek
+ dreamcast_defconfig - Build for dreamcast
+...
+ vapor_defconfig - Build for vapor
+
+which then allows you to do:
+
+$ make ARCH=sh CROSS_COMPILE=sh4-linux- vapor_defconfig vmlinux
+
+which will in turn copy the defconfig for this board, run it through
+oldconfig (prompting you for any new options since the time of creation),
+and start you on your way to having a functional kernel for your new
+board.
diff --git a/Documentation/sh/register-banks.txt b/Documentation/sh/register-banks.txt
new file mode 100644
index 0000000..a6719f2
--- /dev/null
+++ b/Documentation/sh/register-banks.txt
@@ -0,0 +1,33 @@
+ Notes on register bank usage in the kernel
+ ==========================================
+
+Introduction
+------------
+
+The SH-3 and SH-4 CPU families traditionally include a single partial register
+bank (selected by SR.RB, only r0 ... r7 are banked), whereas other families
+may have more full-featured banking or simply no such capabilities at all.
+
+SR.RB banking
+-------------
+
+In the case of this type of banking, banked registers are mapped directly to
+r0 ... r7 if SR.RB is set to the bank we are interested in, otherwise ldc/stc
+can still be used to reference the banked registers (as r0_bank ... r7_bank)
+when in the context of another bank. The developer must keep the SR.RB value
+in mind when writing code that utilizes these banked registers, for obvious
+reasons. Userspace is also not able to poke at the bank1 values, so these can
+be used rather effectively as scratch registers by the kernel.
+
+Presently the kernel uses several of these registers.
+
+ - r0_bank, r1_bank (referenced as k0 and k1, used for scratch
+ registers when doing exception handling).
+ - r2_bank (used to track the EXPEVT/INTEVT code)
+ - Used by do_IRQ() and friends for doing irq mapping based off
+ of the interrupt exception vector jump table offset
+ - r6_bank (global interrupt mask)
+ - The SR.IMASK interrupt handler makes use of this to set the
+ interrupt priority level (used by local_irq_enable())
+ - r7_bank (current)
+
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
new file mode 100644
index 0000000..bef6550
--- /dev/null
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -0,0 +1,2417 @@
+
+ Advanced Linux Sound Architecture - Driver
+ ==========================================
+ Configuration guide
+
+
+Kernel Configuration
+====================
+
+To enable ALSA support you need at least to build the kernel with
+primary sound card support (CONFIG_SOUND). Since ALSA can emulate OSS,
+you don't have to choose any of the OSS modules.
+
+Enable "OSS API emulation" (CONFIG_SND_OSSEMUL) and both OSS mixer and
+PCM supports if you want to run OSS applications with ALSA.
+
+If you want to support the WaveTable functionality on cards such as
+SB Live! then you need to enable "Sequencer support"
+(CONFIG_SND_SEQUENCER).
+
+To make ALSA debug messages more verbose, enable the "Verbose printk"
+and "Debug" options. To check for memory leaks, turn on "Debug memory"
+too. "Debug detection" will add checks for the detection of cards.
+
+Please note that all the ALSA ISA drivers support the Linux isapnp API
+(if the card supports ISA PnP). You don't need to configure the cards
+using isapnptools.
+
+
+Creating ALSA devices
+=====================
+
+This depends on your distribution, but normally you use the /dev/MAKEDEV
+script to create the necessary device nodes. On some systems you use a
+script named 'snddevices'.
+
+
+Module parameters
+=================
+
+The user can load modules with options. If the module supports more than
+one card and you have more than one card of the same type then you can
+specify multiple values for the option separated by commas.
+
+Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
+
+ Module snd
+ ----------
+
+ The core ALSA module. It is used by all ALSA card drivers.
+ It takes the following options which have global effects.
+
+ major - major number for sound driver
+ - Default: 116
+ cards_limit
+ - limiting card index for auto-loading (1-8)
+ - Default: 1
+ - For auto-loading more than one card, specify this
+ option together with snd-card-X aliases.
+ slots - Reserve the slot index for the given driver.
+ This option takes multiple strings.
+ See "Module Autoloading Support" section for details.
+
+ Module snd-pcm-oss
+ ------------------
+
+ The PCM OSS emulation module.
+ This module takes options which change the mapping of devices.
+
+ dsp_map - PCM device number maps assigned to the 1st OSS device.
+ - Default: 0
+ adsp_map - PCM device number maps assigned to the 2st OSS device.
+ - Default: 1
+ nonblock_open
+ - Don't block opening busy PCM devices. Default: 1
+
+ For example, when dsp_map=2, /dev/dsp will be mapped to PCM #2 of
+ the card #0. Similarly, when adsp_map=0, /dev/adsp will be mapped
+ to PCM #0 of the card #0.
+ For changing the second or later card, specify the option with
+ commas, such like "dsp_map=0,1".
+
+ nonblock_open option is used to change the behavior of the PCM
+ regarding opening the device. When this option is non-zero,
+ opening a busy OSS PCM device won't be blocked but return
+ immediately with EAGAIN (just like O_NONBLOCK flag).
+
+ Module snd-rawmidi
+ ------------------
+
+ This module takes options which change the mapping of devices.
+ similar to those of the snd-pcm-oss module.
+
+ midi_map - MIDI device number maps assigned to the 1st OSS device.
+ - Default: 0
+ amidi_map - MIDI device number maps assigned to the 2st OSS device.
+ - Default: 1
+
+ Common parameters for top sound card modules
+ --------------------------------------------
+
+ Each of top level sound card module takes the following options.
+
+ index - index (slot #) of sound card
+ - Values: 0 through 31 or negative
+ - If nonnegative, assign that index number
+ - if negative, interpret as a bitmask of permissible
+ indices; the first free permitted index is assigned
+ - Default: -1
+ id - card ID (identifier or name)
+ - Can be up to 15 characters long
+ - Default: the card type
+ - A directory by this name is created under /proc/asound/
+ containing information about the card
+ - This ID can be used instead of the index number in
+ identifying the card
+ enable - enable card
+ - Default: enabled, for PCI and ISA PnP cards
+
+ Module snd-adlib
+ ----------------
+
+ Module for AdLib FM cards.
+
+ port - port # for OPL chip
+
+ This module supports multiple cards. It does not support autoprobe, so
+ the port must be specified. For actual AdLib FM cards it will be 0x388.
+ Note that this card does not have PCM support and no mixer; only FM
+ synthesis.
+
+ Make sure you have "sbiload" from the alsa-tools package available and,
+ after loading the module, find out the assigned ALSA sequencer port
+ number through "sbiload -l". Example output:
+
+ Port Client name Port name
+ 64:0 OPL2 FM synth OPL2 FM Port
+
+ Load the std.sb and drums.sb patches also supplied by sbiload:
+
+ sbiload -p 64:0 std.sb drums.sb
+
+ If you use this driver to drive an OPL3, you can use std.o3 and drums.o3
+ instead. To have the card produce sound, use aplaymidi from alsa-utils:
+
+ aplaymidi -p 64:0 foo.mid
+
+ Module snd-ad1816a
+ ------------------
+
+ Module for sound cards based on Analog Devices AD1816A/AD1815 ISA chips.
+
+ clockfreq - Clock frequency for AD1816A chip (default = 0, 33000Hz)
+
+ This module supports multiple cards, autoprobe and PnP.
+
+ Module snd-ad1848
+ -----------------
+
+ Module for sound cards based on AD1848/AD1847/CS4248 ISA chips.
+
+ port - port # for AD1848 chip
+ irq - IRQ # for AD1848 chip
+ dma1 - DMA # for AD1848 chip (0,1,3)
+
+ This module supports multiple cards. It does not support autoprobe
+ thus main port must be specified!!! Other ports are optional.
+
+ The power-management is supported.
+
+ Module snd-ad1889
+ -----------------
+
+ Module for Analog Devices AD1889 chips.
+
+ ac97_quirk - AC'97 workaround for strange hardware
+ See the description of intel8x0 module for details.
+
+ This module supports multiple cards.
+
+ Module snd-ali5451
+ ------------------
+
+ Module for ALi M5451 PCI chip.
+
+ pcm_channels - Number of hardware channels assigned for PCM
+ spdif - Support SPDIF I/O
+ - Default: disabled
+
+ This module supports one chip and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-als100
+ -----------------
+
+ Module for sound cards based on Avance Logic ALS100/ALS120 ISA chips.
+
+ This module supports multiple cards, autoprobe and PnP.
+
+ The power-management is supported.
+
+ Module snd-als300
+ -----------------
+
+ Module for Avance Logic ALS300 and ALS300+
+
+ This module supports multiple cards.
+
+ The power-management is supported.
+
+ Module snd-als4000
+ ------------------
+
+ Module for sound cards based on Avance Logic ALS4000 PCI chip.
+
+ joystick_port - port # for legacy joystick support.
+ 0 = disabled (default), 1 = auto-detect
+
+ This module supports multiple cards, autoprobe and PnP.
+
+ The power-management is supported.
+
+ Module snd-atiixp
+ -----------------
+
+ Module for ATI IXP 150/200/250/400 AC97 controllers.
+
+ ac97_clock - AC'97 clock (default = 48000)
+ ac97_quirk - AC'97 workaround for strange hardware
+ See "AC97 Quirk Option" section below.
+ ac97_codec - Workaround to specify which AC'97 codec
+ instead of probing. If this works for you
+ file a bug with your `lspci -vn` output.
+ -2 -- Force probing.
+ -1 -- Default behavior.
+ 0-2 -- Use the specified codec.
+ spdif_aclink - S/PDIF transfer over AC-link (default = 1)
+
+ This module supports one card and autoprobe.
+
+ ATI IXP has two different methods to control SPDIF output. One is
+ over AC-link and another is over the "direct" SPDIF output. The
+ implementation depends on the motherboard, and you'll need to
+ choose the correct one via spdif_aclink module option.
+
+ The power-management is supported.
+
+ Module snd-atiixp-modem
+ -----------------------
+
+ Module for ATI IXP 150/200/250 AC97 modem controllers.
+
+ This module supports one card and autoprobe.
+
+ Note: The default index value of this module is -2, i.e. the first
+ slot is excluded.
+
+ The power-management is supported.
+
+ Module snd-au8810, snd-au8820, snd-au8830
+ -----------------------------------------
+
+ Module for Aureal Vortex, Vortex2 and Advantage device.
+
+ pcifix - Control PCI workarounds
+ 0 = Disable all workarounds
+ 1 = Force the PCI latency of the Aureal card to 0xff
+ 2 = Force the Extend PCI#2 Internal Master for Efficient
+ Handling of Dummy Requests on the VIA KT133 AGP Bridge
+ 3 = Force both settings
+ 255 = Autodetect what is required (default)
+
+ This module supports all ADB PCM channels, ac97 mixer, SPDIF, hardware
+ EQ, mpu401, gameport. A3D and wavetable support are still in development.
+ Development and reverse engineering work is being coordinated at
+ http://savannah.nongnu.org/projects/openvortex/
+ SPDIF output has a copy of the AC97 codec output, unless you use the
+ "spdif" pcm device, which allows raw data passthru.
+ The hardware EQ hardware and SPDIF is only present in the Vortex2 and
+ Advantage.
+
+ Note: Some ALSA mixer applications don't handle the SPDIF sample rate
+ control correctly. If you have problems regarding this, try
+ another ALSA compliant mixer (alsamixer works).
+
+ Module snd-aw2
+ --------------
+
+ Module for Audiowerk2 sound card
+
+ This module supports multiple cards.
+
+ Module snd-azt2320
+ ------------------
+
+ Module for sound cards based on Aztech System AZT2320 ISA chip (PnP only).
+
+ This module supports multiple cards, PnP and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-azt3328
+ ------------------
+
+ Module for sound cards based on Aztech AZF3328 PCI chip.
+
+ joystick - Enable joystick (default off)
+
+ This module supports multiple cards.
+
+ Module snd-bt87x
+ ----------------
+
+ Module for video cards based on Bt87x chips.
+
+ digital_rate - Override the default digital rate (Hz)
+ load_all - Load the driver even if the card model isn't known
+
+ This module supports multiple cards.
+
+ Note: The default index value of this module is -2, i.e. the first
+ slot is excluded.
+
+ Module snd-ca0106
+ -----------------
+
+ Module for Creative Audigy LS and SB Live 24bit
+
+ This module supports multiple cards.
+
+
+ Module snd-cmi8330
+ ------------------
+
+ Module for sound cards based on C-Media CMI8330 ISA chips.
+
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ wssport - port # for CMI8330 chip (WSS)
+ wssirq - IRQ # for CMI8330 chip (WSS)
+ wssdma - first DMA # for CMI8330 chip (WSS)
+ sbport - port # for CMI8330 chip (SB16)
+ sbirq - IRQ # for CMI8330 chip (SB16)
+ sbdma8 - 8bit DMA # for CMI8330 chip (SB16)
+ sbdma16 - 16bit DMA # for CMI8330 chip (SB16)
+
+ This module supports multiple cards and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-cmipci
+ -----------------
+
+ Module for C-Media CMI8338/8738/8768/8770 PCI sound cards.
+
+ mpu_port - port address of MIDI interface (8338 only):
+ 0x300,0x310,0x320,0x330 = legacy port,
+ 0 = disable (default)
+ fm_port - port address of OPL-3 FM synthesizer (8x38 only):
+ 0x388 = legacy port,
+ 1 = integrated PCI port (default on 8738),
+ 0 = disable
+ soft_ac3 - Software-conversion of raw SPDIF packets (model 033 only)
+ (default = 1)
+ joystick_port - Joystick port address (0 = disable, 1 = auto-detect)
+
+ This module supports autoprobe and multiple cards.
+
+ The power-management is supported.
+
+ Module snd-cs4231
+ -----------------
+
+ Module for sound cards based on CS4231 ISA chips.
+
+ port - port # for CS4231 chip
+ mpu_port - port # for MPU-401 UART (optional), -1 = disable
+ irq - IRQ # for CS4231 chip
+ mpu_irq - IRQ # for MPU-401 UART
+ dma1 - first DMA # for CS4231 chip
+ dma2 - second DMA # for CS4231 chip
+
+ This module supports multiple cards. This module does not support autoprobe
+ thus main port must be specified!!! Other ports are optional.
+
+ The power-management is supported.
+
+ Module snd-cs4232
+ -----------------
+
+ Module for sound cards based on CS4232/CS4232A ISA chips.
+
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for CS4232 chip (PnP setup - 0x534)
+ cport - control port # for CS4232 chip (PnP setup - 0x120,0x210,0xf00)
+ mpu_port - port # for MPU-401 UART (PnP setup - 0x300), -1 = disable
+ fm_port - FM port # for CS4232 chip (PnP setup - 0x388), -1 = disable
+ irq - IRQ # for CS4232 chip (5,7,9,11,12,15)
+ mpu_irq - IRQ # for MPU-401 UART (9,11,12,15)
+ dma1 - first DMA # for CS4232 chip (0,1,3)
+ dma2 - second DMA # for Yamaha CS4232 chip (0,1,3), -1 = disable
+
+ This module supports multiple cards. This module does not support autoprobe
+ (if ISA PnP is not used) thus main port must be specified!!! Other ports are
+ optional.
+
+ The power-management is supported.
+
+ Module snd-cs4236
+ -----------------
+
+ Module for sound cards based on CS4235/CS4236/CS4236B/CS4237B/
+ CS4238B/CS4239 ISA chips.
+
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for CS4236 chip (PnP setup - 0x534)
+ cport - control port # for CS4236 chip (PnP setup - 0x120,0x210,0xf00)
+ mpu_port - port # for MPU-401 UART (PnP setup - 0x300), -1 = disable
+ fm_port - FM port # for CS4236 chip (PnP setup - 0x388), -1 = disable
+ irq - IRQ # for CS4236 chip (5,7,9,11,12,15)
+ mpu_irq - IRQ # for MPU-401 UART (9,11,12,15)
+ dma1 - first DMA # for CS4236 chip (0,1,3)
+ dma2 - second DMA # for CS4236 chip (0,1,3), -1 = disable
+
+ This module supports multiple cards. This module does not support autoprobe
+ (if ISA PnP is not used) thus main port and control port must be
+ specified!!! Other ports are optional.
+
+ The power-management is supported.
+
+ Module snd-cs4281
+ -----------------
+
+ Module for Cirrus Logic CS4281 soundchip.
+
+ dual_codec - Secondary codec ID (0 = disable, default)
+
+ This module supports multiple cards.
+
+ The power-management is supported.
+
+ Module snd-cs46xx
+ -----------------
+
+ Module for PCI sound cards based on CS4610/CS4612/CS4614/CS4615/CS4622/
+ CS4624/CS4630/CS4280 PCI chips.
+
+ external_amp - Force to enable external amplifier.
+ thinkpad - Force to enable Thinkpad's CLKRUN control.
+ mmap_valid - Support OSS mmap mode (default = 0).
+
+ This module supports multiple cards and autoprobe.
+ Usually external amp and CLKRUN controls are detected automatically
+ from PCI sub vendor/device ids. If they don't work, give the options
+ above explicitly.
+
+ The power-management is supported.
+
+ Module snd-cs5530
+ _________________
+
+ Module for Cyrix/NatSemi Geode 5530 chip.
+
+ Module snd-cs5535audio
+ ----------------------
+
+ Module for multifunction CS5535 companion PCI device
+
+ The power-management is supported.
+
+ Module snd-darla20
+ ------------------
+
+ Module for Echoaudio Darla20
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-darla24
+ ------------------
+
+ Module for Echoaudio Darla24
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-dt019x
+ -----------------
+
+ Module for Diamond Technologies DT-019X / Avance Logic ALS-007 (PnP
+ only)
+
+ This module supports multiple cards. This module is enabled only with
+ ISA PnP support.
+
+ The power-management is supported.
+
+ Module snd-dummy
+ ----------------
+
+ Module for the dummy sound card. This "card" doesn't do any output
+ or input, but you may use this module for any application which
+ requires a sound card (like RealPlayer).
+
+ The power-management is supported.
+
+ Module snd-echo3g
+ -----------------
+
+ Module for Echoaudio 3G cards (Gina3G/Layla3G)
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-emu10k1
+ ------------------
+
+ Module for EMU10K1/EMU10k2 based PCI sound cards.
+ * Sound Blaster Live!
+ * Sound Blaster PCI 512
+ * Emu APS (partially supported)
+ * Sound Blaster Audigy
+
+ extin - bitmap of available external inputs for FX8010 (see bellow)
+ extout - bitmap of available external outputs for FX8010 (see bellow)
+ seq_ports - allocated sequencer ports (4 by default)
+ max_synth_voices - limit of voices used for wavetable (64 by default)
+ max_buffer_size - specifies the maximum size of wavetable/pcm buffers
+ given in MB unit. Default value is 128.
+ enable_ir - enable IR
+
+ This module supports multiple cards and autoprobe.
+
+ Input & Output configurations [extin/extout]
+ * Creative Card wo/Digital out [0x0003/0x1f03]
+ * Creative Card w/Digital out [0x0003/0x1f0f]
+ * Creative Card w/Digital CD in [0x000f/0x1f0f]
+ * Creative Card wo/Digital out + LiveDrive [0x3fc3/0x1fc3]
+ * Creative Card w/Digital out + LiveDrive [0x3fc3/0x1fcf]
+ * Creative Card w/Digital CD in + LiveDrive [0x3fcf/0x1fcf]
+ * Creative Card wo/Digital out + Digital I/O 2 [0x0fc3/0x1f0f]
+ * Creative Card w/Digital out + Digital I/O 2 [0x0fc3/0x1f0f]
+ * Creative Card w/Digital CD in + Digital I/O 2 [0x0fcf/0x1f0f]
+ * Creative Card 5.1/w Digital out + LiveDrive [0x3fc3/0x1fff]
+ * Creative Card 5.1 (c) 2003 [0x3fc3/0x7cff]
+ * Creative Card all ins and outs [0x3fff/0x7fff]
+
+ The power-management is supported.
+
+ Module snd-emu10k1x
+ -------------------
+
+ Module for Creative Emu10k1X (SB Live Dell OEM version)
+
+ This module supports multiple cards.
+
+ Module snd-ens1370
+ ------------------
+
+ Module for Ensoniq AudioPCI ES1370 PCI sound cards.
+ * SoundBlaster PCI 64
+ * SoundBlaster PCI 128
+
+ joystick - Enable joystick (default off)
+
+ This module supports multiple cards and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-ens1371
+ ------------------
+
+ Module for Ensoniq AudioPCI ES1371 PCI sound cards.
+ * SoundBlaster PCI 64
+ * SoundBlaster PCI 128
+ * SoundBlaster Vibra PCI
+
+ joystick_port - port # for joystick (0x200,0x208,0x210,0x218),
+ 0 = disable (default), 1 = auto-detect
+
+ This module supports multiple cards and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-es968
+ ----------------
+
+ Module for sound cards based on ESS ES968 chip (PnP only).
+
+ This module supports multiple cards, PnP and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-es1688
+ -----------------
+
+ Module for ESS AudioDrive ES-1688 and ES-688 sound cards.
+
+ port - port # for ES-1688 chip (0x220,0x240,0x260)
+ mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default)
+ irq - IRQ # for ES-1688 chip (5,7,9,10)
+ mpu_irq - IRQ # for MPU-401 port (5,7,9,10)
+ dma8 - DMA # for ES-1688 chip (0,1,3)
+
+ This module supports multiple cards and autoprobe (without MPU-401 port).
+
+ Module snd-es18xx
+ -----------------
+
+ Module for ESS AudioDrive ES-18xx sound cards.
+
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for ES-18xx chip (0x220,0x240,0x260)
+ mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default)
+ fm_port - port # for FM (optional, not used)
+ irq - IRQ # for ES-18xx chip (5,7,9,10)
+ dma1 - first DMA # for ES-18xx chip (0,1,3)
+ dma2 - first DMA # for ES-18xx chip (0,1,3)
+
+ This module supports multiple cards, ISA PnP and autoprobe (without MPU-401
+ port if native ISA PnP routines are not used).
+ When dma2 is equal with dma1, the driver works as half-duplex.
+
+ The power-management is supported.
+
+ Module snd-es1938
+ -----------------
+
+ Module for sound cards based on ESS Solo-1 (ES1938,ES1946) chips.
+
+ This module supports multiple cards and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-es1968
+ -----------------
+
+ Module for sound cards based on ESS Maestro-1/2/2E (ES1968/ES1978) chips.
+
+ total_bufsize - total buffer size in kB (1-4096kB)
+ pcm_substreams_p - playback channels (1-8, default=2)
+ pcm_substreams_c - capture channels (1-8, default=0)
+ clock - clock (0 = auto-detection)
+ use_pm - support the power-management (0 = off, 1 = on,
+ 2 = auto (default))
+ enable_mpu - enable MPU401 (0 = off, 1 = on, 2 = auto (default))
+ joystick - enable joystick (default off)
+
+ This module supports multiple cards and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-fm801
+ ----------------
+
+ Module for ForteMedia FM801 based PCI sound cards.
+
+ tea575x_tuner - Enable TEA575x tuner
+ - 1 = MediaForte 256-PCS
+ - 2 = MediaForte 256-PCPR
+ - 3 = MediaForte 64-PCR
+ - High 16-bits are video (radio) device number + 1
+ - example: 0x10002 (MediaForte 256-PCPR, device 1)
+
+ This module supports multiple cards and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-gina20
+ -----------------
+
+ Module for Echoaudio Gina20
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-gina24
+ -----------------
+
+ Module for Echoaudio Gina24
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-gusclassic
+ ---------------------
+
+ Module for Gravis UltraSound Classic sound card.
+
+ port - port # for GF1 chip (0x220,0x230,0x240,0x250,0x260)
+ irq - IRQ # for GF1 chip (3,5,9,11,12,15)
+ dma1 - DMA # for GF1 chip (1,3,5,6,7)
+ dma2 - DMA # for GF1 chip (1,3,5,6,7,-1=disable)
+ joystick_dac - 0 to 31, (0.59V-4.52V or 0.389V-2.98V)
+ voices - GF1 voices limit (14-32)
+ pcm_voices - reserved PCM voices
+
+ This module supports multiple cards and autoprobe.
+
+ Module snd-gusextreme
+ ---------------------
+
+ Module for Gravis UltraSound Extreme (Synergy ViperMax) sound card.
+
+ port - port # for ES-1688 chip (0x220,0x230,0x240,0x250,0x260)
+ gf1_port - port # for GF1 chip (0x210,0x220,0x230,0x240,0x250,0x260,0x270)
+ mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable
+ irq - IRQ # for ES-1688 chip (5,7,9,10)
+ gf1_irq - IRQ # for GF1 chip (3,5,9,11,12,15)
+ mpu_irq - IRQ # for MPU-401 port (5,7,9,10)
+ dma8 - DMA # for ES-1688 chip (0,1,3)
+ dma1 - DMA # for GF1 chip (1,3,5,6,7)
+ joystick_dac - 0 to 31, (0.59V-4.52V or 0.389V-2.98V)
+ voices - GF1 voices limit (14-32)
+ pcm_voices - reserved PCM voices
+
+ This module supports multiple cards and autoprobe (without MPU-401 port).
+
+ Module snd-gusmax
+ -----------------
+
+ Module for Gravis UltraSound MAX sound card.
+
+ port - port # for GF1 chip (0x220,0x230,0x240,0x250,0x260)
+ irq - IRQ # for GF1 chip (3,5,9,11,12,15)
+ dma1 - DMA # for GF1 chip (1,3,5,6,7)
+ dma2 - DMA # for GF1 chip (1,3,5,6,7,-1=disable)
+ joystick_dac - 0 to 31, (0.59V-4.52V or 0.389V-2.98V)
+ voices - GF1 voices limit (14-32)
+ pcm_voices - reserved PCM voices
+
+ This module supports multiple cards and autoprobe.
+
+ Module snd-hda-intel
+ --------------------
+
+ Module for Intel HD Audio (ICH6, ICH6M, ESB2, ICH7, ICH8, ICH9, ICH10,
+ PCH, SCH),
+ ATI SB450, SB600, R600, RS600, RS690, RS780, RV610, RV620,
+ RV630, RV635, RV670, RV770,
+ VIA VT8251/VT8237A,
+ SIS966, ULI M5461
+
+ [Multiple options for each card instance]
+ model - force the model name
+ position_fix - Fix DMA pointer (0 = auto, 1 = use LPIB, 2 = POSBUF)
+ probe_mask - Bitmask to probe codecs (default = -1, meaning all slots)
+ bdl_pos_adj - Specifies the DMA IRQ timing delay in samples.
+ Passing -1 will make the driver to choose the appropriate
+ value based on the controller chip.
+
+ [Single (global) options]
+ single_cmd - Use single immediate commands to communicate with
+ codecs (for debugging only)
+ enable_msi - Enable Message Signaled Interrupt (MSI) (default = off)
+ power_save - Automatic power-saving timtout (in second, 0 =
+ disable)
+ power_save_controller - Reset HD-audio controller in power-saving mode
+ (default = on)
+
+ This module supports multiple cards and autoprobe.
+
+ Each codec may have a model table for different configurations.
+ If your machine isn't listed there, the default (usually minimal)
+ configuration is set up. You can pass "model=<name>" option to
+ specify a certain model in such a case. There are different
+ models depending on the codec chip.
+
+ Model name Description
+ ---------- -----------
+ ALC880
+ 3stack 3-jack in back and a headphone out
+ 3stack-digout 3-jack in back, a HP out and a SPDIF out
+ 5stack 5-jack in back, 2-jack in front
+ 5stack-digout 5-jack in back, 2-jack in front, a SPDIF out
+ 6stack 6-jack in back, 2-jack in front
+ 6stack-digout 6-jack with a SPDIF out
+ w810 3-jack
+ z71v 3-jack (HP shared SPDIF)
+ asus 3-jack (ASUS Mobo)
+ asus-w1v ASUS W1V
+ asus-dig ASUS with SPDIF out
+ asus-dig2 ASUS with SPDIF out (using GPIO2)
+ uniwill 3-jack
+ fujitsu Fujitsu Laptops (Pi1536)
+ F1734 2-jack
+ lg LG laptop (m1 express dual)
+ lg-lw LG LW20/LW25 laptop
+ tcl TCL S700
+ clevo Clevo laptops (m520G, m665n)
+ medion Medion Rim 2150
+ test for testing/debugging purpose, almost all controls can be
+ adjusted. Appearing only when compiled with
+ $CONFIG_SND_DEBUG=y
+ auto auto-config reading BIOS (default)
+
+ ALC260
+ hp HP machines
+ hp-3013 HP machines (3013-variant)
+ hp-dc7600 HP DC7600
+ fujitsu Fujitsu S7020
+ acer Acer TravelMate
+ will Will laptops (PB V7900)
+ replacer Replacer 672V
+ basic fixed pin assignment (old default model)
+ test for testing/debugging purpose, almost all controls can
+ adjusted. Appearing only when compiled with
+ $CONFIG_SND_DEBUG=y
+ auto auto-config reading BIOS (default)
+
+ ALC262
+ fujitsu Fujitsu Laptop
+ hp-bpc HP xw4400/6400/8400/9400 laptops
+ hp-bpc-d7000 HP BPC D7000
+ hp-tc-t5735 HP Thin Client T5735
+ hp-rp5700 HP RP5700
+ benq Benq ED8
+ benq-t31 Benq T31
+ hippo Hippo (ATI) with jack detection, Sony UX-90s
+ hippo_1 Hippo (Benq) with jack detection
+ sony-assamd Sony ASSAMD
+ toshiba-s06 Toshiba S06
+ toshiba-rx1 Toshiba RX1
+ ultra Samsung Q1 Ultra Vista model
+ lenovo-3000 Lenovo 3000 y410
+ nec NEC Versa S9100
+ basic fixed pin assignment w/o SPDIF
+ auto auto-config reading BIOS (default)
+
+ ALC267/268
+ quanta-il1 Quanta IL1 mini-notebook
+ 3stack 3-stack model
+ toshiba Toshiba A205
+ acer Acer laptops
+ acer-aspire Acer Aspire One
+ dell Dell OEM laptops (Vostro 1200)
+ zepto Zepto laptops
+ test for testing/debugging purpose, almost all controls can
+ adjusted. Appearing only when compiled with
+ $CONFIG_SND_DEBUG=y
+ auto auto-config reading BIOS (default)
+
+ ALC269
+ basic Basic preset
+ quanta Quanta FL1
+ eeepc-p703 ASUS Eeepc P703 P900A
+ eeepc-p901 ASUS Eeepc P901 S101
+
+ ALC662/663
+ 3stack-dig 3-stack (2-channel) with SPDIF
+ 3stack-6ch 3-stack (6-channel)
+ 3stack-6ch-dig 3-stack (6-channel) with SPDIF
+ 6stack-dig 6-stack with SPDIF
+ lenovo-101e Lenovo laptop
+ eeepc-p701 ASUS Eeepc P701
+ eeepc-ep20 ASUS Eeepc EP20
+ ecs ECS/Foxconn mobo
+ m51va ASUS M51VA
+ g71v ASUS G71V
+ h13 ASUS H13
+ g50v ASUS G50V
+ asus-mode1 ASUS
+ asus-mode2 ASUS
+ asus-mode3 ASUS
+ asus-mode4 ASUS
+ asus-mode5 ASUS
+ asus-mode6 ASUS
+ auto auto-config reading BIOS (default)
+
+ ALC882/885
+ 3stack-dig 3-jack with SPDIF I/O
+ 6stack-dig 6-jack digital with SPDIF I/O
+ arima Arima W820Di1
+ targa Targa T8, MSI-1049 T8
+ asus-a7j ASUS A7J
+ asus-a7m ASUS A7M
+ macpro MacPro support
+ mbp3 Macbook Pro rev3
+ imac24 iMac 24'' with jack detection
+ w2jc ASUS W2JC
+ auto auto-config reading BIOS (default)
+
+ ALC883/888
+ 3stack-dig 3-jack with SPDIF I/O
+ 6stack-dig 6-jack digital with SPDIF I/O
+ 3stack-6ch 3-jack 6-channel
+ 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O
+ 6stack-dig-demo 6-jack digital for Intel demo board
+ acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc)
+ acer-aspire Acer Aspire 9810
+ medion Medion Laptops
+ medion-md2 Medion MD2
+ targa-dig Targa/MSI
+ targa-2ch-dig Targs/MSI with 2-channel
+ laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE)
+ lenovo-101e Lenovo 101E
+ lenovo-nb0763 Lenovo NB0763
+ lenovo-ms7195-dig Lenovo MS7195
+ lenovo-sky Lenovo Sky
+ haier-w66 Haier W66
+ 3stack-hp HP machines with 3stack (Lucknow, Samba boards)
+ 6stack-dell Dell machines with 6stack (Inspiron 530)
+ mitac Mitac 8252D
+ clevo-m720 Clevo M720 laptop series
+ fujitsu-pi2515 Fujitsu AMILO Pi2515
+ 3stack-6ch-intel Intel DG33* boards
+ auto auto-config reading BIOS (default)
+
+ ALC861/660
+ 3stack 3-jack
+ 3stack-dig 3-jack with SPDIF I/O
+ 6stack-dig 6-jack with SPDIF I/O
+ 3stack-660 3-jack (for ALC660)
+ uniwill-m31 Uniwill M31 laptop
+ toshiba Toshiba laptop support
+ asus Asus laptop support
+ asus-laptop ASUS F2/F3 laptops
+ auto auto-config reading BIOS (default)
+
+ ALC861VD/660VD
+ 3stack 3-jack
+ 3stack-dig 3-jack with SPDIF OUT
+ 6stack-dig 6-jack with SPDIF OUT
+ 3stack-660 3-jack (for ALC660VD)
+ 3stack-660-digout 3-jack with SPDIF OUT (for ALC660VD)
+ lenovo Lenovo 3000 C200
+ dallas Dallas laptops
+ hp HP TX1000
+ auto auto-config reading BIOS (default)
+
+ CMI9880
+ minimal 3-jack in back
+ min_fp 3-jack in back, 2-jack in front
+ full 6-jack in back, 2-jack in front
+ full_dig 6-jack in back, 2-jack in front, SPDIF I/O
+ allout 5-jack in back, 2-jack in front, SPDIF out
+ auto auto-config reading BIOS (default)
+
+ AD1882 / AD1882A
+ 3stack 3-stack mode (default)
+ 6stack 6-stack mode
+
+ AD1884A / AD1883 / AD1984A / AD1984B
+ desktop 3-stack desktop (default)
+ laptop laptop with HP jack sensing
+ mobile mobile devices with HP jack sensing
+ thinkpad Lenovo Thinkpad X300
+
+ AD1884
+ N/A
+
+ AD1981
+ basic 3-jack (default)
+ hp HP nx6320
+ thinkpad Lenovo Thinkpad T60/X60/Z60
+ toshiba Toshiba U205
+
+ AD1983
+ N/A
+
+ AD1984
+ basic default configuration
+ thinkpad Lenovo Thinkpad T61/X61
+ dell Dell T3400
+
+ AD1986A
+ 6stack 6-jack, separate surrounds (default)
+ 3stack 3-stack, shared surrounds
+ laptop 2-channel only (FSC V2060, Samsung M50)
+ laptop-eapd 2-channel with EAPD (ASUS A6J)
+ laptop-automute 2-channel with EAPD and HP-automute (Lenovo N100)
+ ultra 2-channel with EAPD (Samsung Ultra tablet PC)
+ samsung 2-channel with EAPD (Samsung R65)
+
+ AD1988/AD1988B/AD1989A/AD1989B
+ 6stack 6-jack
+ 6stack-dig ditto with SPDIF
+ 3stack 3-jack
+ 3stack-dig ditto with SPDIF
+ laptop 3-jack with hp-jack automute
+ laptop-dig ditto with SPDIF
+ auto auto-config reading BIOS (default)
+
+ Conexant 5045
+ laptop-hpsense Laptop with HP sense (old model laptop)
+ laptop-micsense Laptop with Mic sense (old model fujitsu)
+ laptop-hpmicsense Laptop with HP and Mic senses
+ benq Benq R55E
+ test for testing/debugging purpose, almost all controls
+ can be adjusted. Appearing only when compiled with
+ $CONFIG_SND_DEBUG=y
+
+ Conexant 5047
+ laptop Basic Laptop config
+ laptop-hp Laptop config for some HP models (subdevice 30A5)
+ laptop-eapd Laptop config with EAPD support
+ test for testing/debugging purpose, almost all controls
+ can be adjusted. Appearing only when compiled with
+ $CONFIG_SND_DEBUG=y
+
+ Conexant 5051
+ laptop Basic Laptop config (default)
+ hp HP Spartan laptop
+
+ STAC9200
+ ref Reference board
+ dell-d21 Dell (unknown)
+ dell-d22 Dell (unknown)
+ dell-d23 Dell (unknown)
+ dell-m21 Dell Inspiron 630m, Dell Inspiron 640m
+ dell-m22 Dell Latitude D620, Dell Latitude D820
+ dell-m23 Dell XPS M1710, Dell Precision M90
+ dell-m24 Dell Latitude 120L
+ dell-m25 Dell Inspiron E1505n
+ dell-m26 Dell Inspiron 1501
+ dell-m27 Dell Inspiron E1705/9400
+ gateway Gateway laptops with EAPD control
+ panasonic Panasonic CF-74
+
+ STAC9205/9254
+ ref Reference board
+ dell-m42 Dell (unknown)
+ dell-m43 Dell Precision
+ dell-m44 Dell Inspiron
+
+ STAC9220/9221
+ ref Reference board
+ 3stack D945 3stack
+ 5stack D945 5stack + SPDIF
+ intel-mac-v1 Intel Mac Type 1
+ intel-mac-v2 Intel Mac Type 2
+ intel-mac-v3 Intel Mac Type 3
+ intel-mac-v4 Intel Mac Type 4
+ intel-mac-v5 Intel Mac Type 5
+ intel-mac-auto Intel Mac (detect type according to subsystem id)
+ macmini Intel Mac Mini (equivalent with type 3)
+ macbook Intel Mac Book (eq. type 5)
+ macbook-pro-v1 Intel Mac Book Pro 1st generation (eq. type 3)
+ macbook-pro Intel Mac Book Pro 2nd generation (eq. type 3)
+ imac-intel Intel iMac (eq. type 2)
+ imac-intel-20 Intel iMac (newer version) (eq. type 3)
+ dell-d81 Dell (unknown)
+ dell-d82 Dell (unknown)
+ dell-m81 Dell (unknown)
+ dell-m82 Dell XPS M1210
+
+ STAC9202/9250/9251
+ ref Reference board, base config
+ m2-2 Some Gateway MX series laptops
+ m6 Some Gateway NX series laptops
+ pa6 Gateway NX860 series
+
+ STAC9227/9228/9229/927x
+ ref Reference board
+ ref-no-jd Reference board without HP/Mic jack detection
+ 3stack D965 3stack
+ 5stack D965 5stack + SPDIF
+ dell-3stack Dell Dimension E520
+ dell-bios Fixes with Dell BIOS setup
+
+ STAC92HD71B*
+ ref Reference board
+ dell-m4-1 Dell desktops
+ dell-m4-2 Dell desktops
+ dell-m4-3 Dell desktops
+
+ STAC92HD73*
+ ref Reference board
+ no-jd BIOS setup but without jack-detection
+ dell-m6-amic Dell desktops/laptops with analog mics
+ dell-m6-dmic Dell desktops/laptops with digital mics
+ dell-m6 Dell desktops/laptops with both type of mics
+
+ STAC9872
+ vaio Setup for VAIO FE550G/SZ110
+ vaio-ar Setup for VAIO AR
+
+ The model name "genric" is treated as a special case. When this
+ model is given, the driver uses the generic codec parser without
+ "codec-patch". It's sometimes good for testing and debugging.
+
+ If the default configuration doesn't work and one of the above
+ matches with your device, report it together with the PCI
+ subsystem ID (output of "lspci -nv") to ALSA BTS or alsa-devel
+ ML (see the section "Links and Addresses").
+
+ power_save and power_save_controller options are for power-saving
+ mode. See powersave.txt for details.
+
+ Note 2: If you get click noises on output, try the module option
+ position_fix=1 or 2. position_fix=1 will use the SD_LPIB
+ register value without FIFO size correction as the current
+ DMA pointer. position_fix=2 will make the driver to use
+ the position buffer instead of reading SD_LPIB register.
+ (Usually SD_LPIB register is more accurate than the
+ position buffer.)
+
+ NB: If you get many "azx_get_response timeout" messages at
+ loading, it's likely a problem of interrupts (e.g. ACPI irq
+ routing). Try to boot with options like "pci=noacpi". Also, you
+ can try "single_cmd=1" module option. This will switch the
+ communication method between HDA controller and codecs to the
+ single immediate commands instead of CORB/RIRB. Basically, the
+ single command mode is provided only for BIOS, and you won't get
+ unsolicited events, too. But, at least, this works independently
+ from the irq. Remember this is a last resort, and should be
+ avoided as much as possible...
+
+ MORE NOTES ON "azx_get_response timeout" PROBLEMS:
+ On some hardwares, you may need to add a proper probe_mask option
+ to avoid the "azx_get_response timeout" problem above, instead.
+ This occurs when the access to non-existing or non-working codec slot
+ (likely a modem one) causes a stall of the communication via HD-audio
+ bus. You can see which codec slots are probed by enabling
+ CONFIG_SND_DEBUG_VERBOSE, or simply from the file name of the codec
+ proc files. Then limit the slots to probe by probe_mask option.
+ For example, probe_mask=1 means to probe only the first slot, and
+ probe_mask=4 means only the third slot.
+
+ The power-management is supported.
+
+ Module snd-hdsp
+ ---------------
+
+ Module for RME Hammerfall DSP audio interface(s)
+
+ This module supports multiple cards.
+
+ Note: The firmware data can be automatically loaded via hotplug
+ when CONFIG_FW_LOADER is set. Otherwise, you need to load
+ the firmware via hdsploader utility included in alsa-tools
+ package.
+ The firmware data is found in alsa-firmware package.
+
+ Note: snd-page-alloc module does the job which snd-hammerfall-mem
+ module did formerly. It will allocate the buffers in advance
+ when any HDSP cards are found. To make the buffer
+ allocation sure, load snd-page-alloc module in the early
+ stage of boot sequence. See "Early Buffer Allocation"
+ section.
+
+ Module snd-hdspm
+ ----------------
+
+ Module for RME HDSP MADI board.
+
+ precise_ptr - Enable precise pointer, or disable.
+ line_outs_monitor - Send playback streams to analog outs by default.
+ enable_monitor - Enable Analog Out on Channel 63/64 by default.
+
+ See hdspm.txt for details.
+
+ Module snd-hifier
+ -----------------
+
+ Module for the MediaTek/TempoTec HiFier Fantasia sound card.
+
+ This module supports autoprobe and multiple cards.
+
+ Module snd-ice1712
+ ------------------
+
+ Module for Envy24 (ICE1712) based PCI sound cards.
+ * MidiMan M Audio Delta 1010
+ * MidiMan M Audio Delta 1010LT
+ * MidiMan M Audio Delta DiO 2496
+ * MidiMan M Audio Delta 66
+ * MidiMan M Audio Delta 44
+ * MidiMan M Audio Delta 410
+ * MidiMan M Audio Audiophile 2496
+ * TerraTec EWS 88MT
+ * TerraTec EWS 88D
+ * TerraTec EWX 24/96
+ * TerraTec DMX 6Fire
+ * TerraTec Phase 88
+ * Hoontech SoundTrack DSP 24
+ * Hoontech SoundTrack DSP 24 Value
+ * Hoontech SoundTrack DSP 24 Media 7.1
+ * Event Electronics, EZ8
+ * Digigram VX442
+ * Lionstracs, Mediastaton
+ * Terrasoniq TS 88
+
+ model - Use the given board model, one of the following:
+ delta1010, dio2496, delta66, delta44, audiophile, delta410,
+ delta1010lt, vx442, ewx2496, ews88mt, ews88mt_new, ews88d,
+ dmx6fire, dsp24, dsp24_value, dsp24_71, ez8,
+ phase88, mediastation
+ omni - Omni I/O support for MidiMan M-Audio Delta44/66
+ cs8427_timeout - reset timeout for the CS8427 chip (S/PDIF transceiver)
+ in msec resolution, default value is 500 (0.5 sec)
+
+ This module supports multiple cards and autoprobe. Note: The consumer part
+ is not used with all Envy24 based cards (for example in the MidiMan Delta
+ serie).
+
+ Note: The supported board is detected by reading EEPROM or PCI
+ SSID (if EEPROM isn't available). You can override the
+ model by passing "model" module option in case that the
+ driver isn't configured properly or you want to try another
+ type for testing.
+
+ Module snd-ice1724
+ ------------------
+
+ Module for Envy24HT (VT/ICE1724), Envy24PT (VT1720) based PCI sound cards.
+ * MidiMan M Audio Revolution 5.1
+ * MidiMan M Audio Revolution 7.1
+ * MidiMan M Audio Audiophile 192
+ * AMP Ltd AUDIO2000
+ * TerraTec Aureon 5.1 Sky
+ * TerraTec Aureon 7.1 Space
+ * TerraTec Aureon 7.1 Universe
+ * TerraTec Phase 22
+ * TerraTec Phase 28
+ * AudioTrak Prodigy 7.1
+ * AudioTrak Prodigy 7.1 LT
+ * AudioTrak Prodigy 7.1 XT
+ * AudioTrak Prodigy 7.1 HIFI
+ * AudioTrak Prodigy 7.1 HD2
+ * AudioTrak Prodigy 192
+ * Pontis MS300
+ * Albatron K8X800 Pro II
+ * Chaintech ZNF3-150
+ * Chaintech ZNF3-250
+ * Chaintech 9CJS
+ * Chaintech AV-710
+ * Shuttle SN25P
+ * Onkyo SE-90PCI
+ * Onkyo SE-200PCI
+ * ESI Juli@
+ * Hercules Fortissimo IV
+ * EGO-SYS WaveTerminal 192M
+
+ model - Use the given board model, one of the following:
+ revo51, revo71, amp2000, prodigy71, prodigy71lt,
+ prodigy71xt, prodigy71hifi, prodigyhd2, prodigy192,
+ juli, aureon51, aureon71, universe, ap192, k8x800,
+ phase22, phase28, ms300, av710, se200pci, se90pci,
+ fortissimo4, sn25p, WT192M
+
+ This module supports multiple cards and autoprobe.
+
+ Note: The supported board is detected by reading EEPROM or PCI
+ SSID (if EEPROM isn't available). You can override the
+ model by passing "model" module option in case that the
+ driver isn't configured properly or you want to try another
+ type for testing.
+
+ Module snd-indigo
+ -----------------
+
+ Module for Echoaudio Indigo
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-indigodj
+ -------------------
+
+ Module for Echoaudio Indigo DJ
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-indigoio
+ -------------------
+
+ Module for Echoaudio Indigo IO
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-intel8x0
+ -------------------
+
+ Module for AC'97 motherboards from Intel and compatibles.
+ * Intel i810/810E, i815, i820, i830, i84x, MX440
+ ICH5, ICH6, ICH7, 6300ESB, ESB2
+ * SiS 7012 (SiS 735)
+ * NVidia NForce, NForce2, NForce3, MCP04, CK804
+ CK8, CK8S, MCP501
+ * AMD AMD768, AMD8111
+ * ALi m5455
+
+ ac97_clock - AC'97 codec clock base (0 = auto-detect)
+ ac97_quirk - AC'97 workaround for strange hardware
+ See "AC97 Quirk Option" section below.
+ buggy_irq - Enable workaround for buggy interrupts on some
+ motherboards (default yes on nForce chips,
+ otherwise off)
+ buggy_semaphore - Enable workaround for hardwares with buggy
+ semaphores (e.g. on some ASUS laptops)
+ (default off)
+ spdif_aclink - Use S/PDIF over AC-link instead of direct connection
+ from the controller chip
+ (0 = off, 1 = on, -1 = default)
+
+ This module supports one chip and autoprobe.
+
+ Note: the latest driver supports auto-detection of chip clock.
+ if you still encounter too fast playback, specify the clock
+ explicitly via the module option "ac97_clock=41194".
+
+ Joystick/MIDI ports are not supported by this driver. If your
+ motherboard has these devices, use the ns558 or snd-mpu401
+ modules, respectively.
+
+ The power-management is supported.
+
+ Module snd-intel8x0m
+ --------------------
+
+ Module for Intel ICH (i8x0) chipset MC97 modems.
+ * Intel i810/810E, i815, i820, i830, i84x, MX440
+ ICH5, ICH6, ICH7
+ * SiS 7013 (SiS 735)
+ * NVidia NForce, NForce2, NForce2s, NForce3
+ * AMD AMD8111
+ * ALi m5455
+
+ ac97_clock - AC'97 codec clock base (0 = auto-detect)
+
+ This module supports one card and autoprobe.
+
+ Note: The default index value of this module is -2, i.e. the first
+ slot is excluded.
+
+ The power-management is supported.
+
+ Module snd-interwave
+ --------------------
+
+ Module for Gravis UltraSound PnP, Dynasonic 3-D/Pro, STB Sound Rage 32
+ and other sound cards based on AMD InterWave (tm) chip.
+
+ joystick_dac - 0 to 31, (0.59V-4.52V or 0.389V-2.98V)
+ midi - 1 = MIDI UART enable, 0 = MIDI UART disable (default)
+ pcm_voices - reserved PCM voices for the synthesizer (default 2)
+ effect - 1 = InterWave effects enable (default 0);
+ requires 8 voices
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for InterWave chip (0x210,0x220,0x230,0x240,0x250,0x260)
+ irq - IRQ # for InterWave chip (3,5,9,11,12,15)
+ dma1 - DMA # for InterWave chip (0,1,3,5,6,7)
+ dma2 - DMA # for InterWave chip (0,1,3,5,6,7,-1=disable)
+
+ This module supports multiple cards, autoprobe and ISA PnP.
+
+ Module snd-interwave-stb
+ ------------------------
+
+ Module for UltraSound 32-Pro (sound card from STB used by Compaq)
+ and other sound cards based on AMD InterWave (tm) chip with TEA6330T
+ circuit for extended control of bass, treble and master volume.
+
+ joystick_dac - 0 to 31, (0.59V-4.52V or 0.389V-2.98V)
+ midi - 1 = MIDI UART enable, 0 = MIDI UART disable (default)
+ pcm_voices - reserved PCM voices for the synthesizer (default 2)
+ effect - 1 = InterWave effects enable (default 0);
+ requires 8 voices
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for InterWave chip (0x210,0x220,0x230,0x240,0x250,0x260)
+ port_tc - tone control (i2c bus) port # for TEA6330T chip (0x350,0x360,0x370,0x380)
+ irq - IRQ # for InterWave chip (3,5,9,11,12,15)
+ dma1 - DMA # for InterWave chip (0,1,3,5,6,7)
+ dma2 - DMA # for InterWave chip (0,1,3,5,6,7,-1=disable)
+
+ This module supports multiple cards, autoprobe and ISA PnP.
+
+ Module snd-korg1212
+ -------------------
+
+ Module for Korg 1212 IO PCI card
+
+ This module supports multiple cards.
+
+ Module snd-layla20
+ ------------------
+
+ Module for Echoaudio Layla20
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-layla24
+ ------------------
+
+ Module for Echoaudio Layla24
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-maestro3
+ -------------------
+
+ Module for Allegro/Maestro3 chips
+
+ external_amp - enable external amp (enabled by default)
+ amp_gpio - GPIO pin number for external amp (0-15) or
+ -1 for default pin (8 for allegro, 1 for
+ others)
+
+ This module supports autoprobe and multiple chips.
+
+ Note: the binding of amplifier is dependent on hardware.
+ If there is no sound even though all channels are unmuted, try to
+ specify other gpio connection via amp_gpio option.
+ For example, a Panasonic notebook might need "amp_gpio=0x0d"
+ option.
+
+ The power-management is supported.
+
+ Module snd-mia
+ ---------------
+
+ Module for Echoaudio Mia
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-miro
+ ---------------
+
+ Module for Miro soundcards: miroSOUND PCM 1 pro,
+ miroSOUND PCM 12,
+ miroSOUND PCM 20 Radio.
+
+ port - Port # (0x530,0x604,0xe80,0xf40)
+ irq - IRQ # (5,7,9,10,11)
+ dma1 - 1st dma # (0,1,3)
+ dma2 - 2nd dma # (0,1)
+ mpu_port - MPU-401 port # (0x300,0x310,0x320,0x330)
+ mpu_irq - MPU-401 irq # (5,7,9,10)
+ fm_port - FM Port # (0x388)
+ wss - enable WSS mode
+ ide - enable onboard ide support
+
+ Module snd-mixart
+ -----------------
+
+ Module for Digigram miXart8 sound cards.
+
+ This module supports multiple cards.
+ Note: One miXart8 board will be represented as 4 alsa cards.
+ See MIXART.txt for details.
+
+ When the driver is compiled as a module and the hotplug firmware
+ is supported, the firmware data is loaded via hotplug automatically.
+ Install the necessary firmware files in alsa-firmware package.
+ When no hotplug fw loader is available, you need to load the
+ firmware via mixartloader utility in alsa-tools package.
+
+ Module snd-mona
+ ---------------
+
+ Module for Echoaudio Mona
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+
+ Module snd-mpu401
+ -----------------
+
+ Module for MPU-401 UART devices.
+
+ port - port number or -1 (disable)
+ irq - IRQ number or -1 (disable)
+ pnp - PnP detection - 0 = disable, 1 = enable (default)
+
+ This module supports multiple devices and PnP.
+
+ Module snd-mtpav
+ ----------------
+
+ Module for MOTU MidiTimePiece AV multiport MIDI (on the parallel
+ port).
+
+ port - I/O port # for MTPAV (0x378,0x278, default=0x378)
+ irq - IRQ # for MTPAV (7,5, default=7)
+ hwports - number of supported hardware ports, default=8.
+
+ Module supports only 1 card. This module has no enable option.
+
+ Module snd-mts64
+ ----------------
+
+ Module for Ego Systems (ESI) Miditerminal 4140
+
+ This module supports multiple devices.
+ Requires parport (CONFIG_PARPORT).
+
+ Module snd-nm256
+ ----------------
+
+ Module for NeoMagic NM256AV/ZX chips
+
+ playback_bufsize - max playback frame size in kB (4-128kB)
+ capture_bufsize - max capture frame size in kB (4-128kB)
+ force_ac97 - 0 or 1 (disabled by default)
+ buffer_top - specify buffer top address
+ use_cache - 0 or 1 (disabled by default)
+ vaio_hack - alias buffer_top=0x25a800
+ reset_workaround - enable AC97 RESET workaround for some laptops
+ reset_workaround2 - enable extended AC97 RESET workaround for some
+ other laptops
+
+ This module supports one chip and autoprobe.
+
+ The power-management is supported.
+
+ Note: on some notebooks the buffer address cannot be detected
+ automatically, or causes hang-up during initialization.
+ In such a case, specify the buffer top address explicitly via
+ the buffer_top option.
+ For example,
+ Sony F250: buffer_top=0x25a800
+ Sony F270: buffer_top=0x272800
+ The driver supports only ac97 codec. It's possible to force
+ to initialize/use ac97 although it's not detected. In such a
+ case, use force_ac97=1 option - but *NO* guarantee whether it
+ works!
+
+ Note: The NM256 chip can be linked internally with non-AC97
+ codecs. This driver supports only the AC97 codec, and won't work
+ with machines with other (most likely CS423x or OPL3SAx) chips,
+ even though the device is detected in lspci. In such a case, try
+ other drivers, e.g. snd-cs4232 or snd-opl3sa2. Some has ISA-PnP
+ but some doesn't have ISA PnP. You'll need to specify isapnp=0
+ and proper hardware parameters in the case without ISA PnP.
+
+ Note: some laptops need a workaround for AC97 RESET. For the
+ known hardware like Dell Latitude LS and Sony PCG-F305, this
+ workaround is enabled automatically. For other laptops with a
+ hard freeze, you can try reset_workaround=1 option.
+
+ Note: Dell Latitude CSx laptops have another problem regarding
+ AC97 RESET. On these laptops, reset_workaround2 option is
+ turned on as default. This option is worth to try if the
+ previous reset_workaround option doesn't help.
+
+ Note: This driver is really crappy. It's a porting from the
+ OSS driver, which is a result of black-magic reverse engineering.
+ The detection of codec will fail if the driver is loaded *after*
+ X-server as described above. You might be able to force to load
+ the module, but it may result in hang-up. Hence, make sure that
+ you load this module *before* X if you encounter this kind of
+ problem.
+
+ Module snd-opl3sa2
+ ------------------
+
+ Module for Yamaha OPL3-SA2/SA3 sound cards.
+
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - control port # for OPL3-SA chip (0x370)
+ sb_port - SB port # for OPL3-SA chip (0x220,0x240)
+ wss_port - WSS port # for OPL3-SA chip (0x530,0xe80,0xf40,0x604)
+ midi_port - port # for MPU-401 UART (0x300,0x330), -1 = disable
+ fm_port - FM port # for OPL3-SA chip (0x388), -1 = disable
+ irq - IRQ # for OPL3-SA chip (5,7,9,10)
+ dma1 - first DMA # for Yamaha OPL3-SA chip (0,1,3)
+ dma2 - second DMA # for Yamaha OPL3-SA chip (0,1,3), -1 = disable
+
+ This module supports multiple cards and ISA PnP. It does not support
+ autoprobe (if ISA PnP is not used) thus all ports must be specified!!!
+
+ The power-management is supported.
+
+ Module snd-opti92x-ad1848
+ -------------------------
+
+ Module for sound cards based on OPTi 82c92x and Analog Devices AD1848 chips.
+ Module works with OAK Mozart cards as well.
+
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for WSS chip (0x530,0xe80,0xf40,0x604)
+ mpu_port - port # for MPU-401 UART (0x300,0x310,0x320,0x330)
+ fm_port - port # for OPL3 device (0x388)
+ irq - IRQ # for WSS chip (5,7,9,10,11)
+ mpu_irq - IRQ # for MPU-401 UART (5,7,9,10)
+ dma1 - first DMA # for WSS chip (0,1,3)
+
+ This module supports only one card, autoprobe and PnP.
+
+ Module snd-opti92x-cs4231
+ -------------------------
+
+ Module for sound cards based on OPTi 82c92x and Crystal CS4231 chips.
+
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for WSS chip (0x530,0xe80,0xf40,0x604)
+ mpu_port - port # for MPU-401 UART (0x300,0x310,0x320,0x330)
+ fm_port - port # for OPL3 device (0x388)
+ irq - IRQ # for WSS chip (5,7,9,10,11)
+ mpu_irq - IRQ # for MPU-401 UART (5,7,9,10)
+ dma1 - first DMA # for WSS chip (0,1,3)
+ dma2 - second DMA # for WSS chip (0,1,3)
+
+ This module supports only one card, autoprobe and PnP.
+
+ Module snd-opti93x
+ ------------------
+
+ Module for sound cards based on OPTi 82c93x chips.
+
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for WSS chip (0x530,0xe80,0xf40,0x604)
+ mpu_port - port # for MPU-401 UART (0x300,0x310,0x320,0x330)
+ fm_port - port # for OPL3 device (0x388)
+ irq - IRQ # for WSS chip (5,7,9,10,11)
+ mpu_irq - IRQ # for MPU-401 UART (5,7,9,10)
+ dma1 - first DMA # for WSS chip (0,1,3)
+ dma2 - second DMA # for WSS chip (0,1,3)
+
+ This module supports only one card, autoprobe and PnP.
+
+ Module snd-oxygen
+ -----------------
+
+ Module for sound cards based on the C-Media CMI8788 chip:
+ * Asound A-8788
+ * AuzenTech X-Meridian
+ * Bgears b-Enspirer
+ * Club3D Theatron DTS
+ * HT-Omega Claro
+ * Razer Barracuda AC-1
+ * Sondigo Inferno
+
+ This module supports autoprobe and multiple cards.
+
+ Module snd-pcsp
+ -----------------
+
+ Module for internal PC-Speaker.
+
+ nforce_wa - enable NForce chipset workaround. Expect bad sound.
+
+ This module supports system beeps, some kind of PCM playback and
+ even a few mixer controls.
+
+ Module snd-pcxhr
+ ----------------
+
+ Module for Digigram PCXHR boards
+
+ This module supports multiple cards.
+
+ Module snd-portman2x4
+ ---------------------
+
+ Module for Midiman Portman 2x4 parallel port MIDI interface
+
+ This module supports multiple cards.
+
+ Module snd-powermac (on ppc only)
+ ---------------------------------
+
+ Module for PowerMac, iMac and iBook on-board soundchips
+
+ enable_beep - enable beep using PCM (enabled as default)
+
+ Module supports autoprobe a chip.
+
+ Note: the driver may have problems regarding endianess.
+
+ The power-management is supported.
+
+ Module snd-pxa2xx-ac97 (on arm only)
+ ------------------------------------
+
+ Module for AC97 driver for the Intel PXA2xx chip
+
+ For ARM architecture only.
+
+ The power-management is supported.
+
+ Module snd-riptide
+ ------------------
+
+ Module for Conexant Riptide chip
+
+ joystick_port - Joystick port # (default: 0x200)
+ mpu_port - MPU401 port # (default: 0x330)
+ opl3_port - OPL3 port # (default: 0x388)
+
+ This module supports multiple cards.
+ The driver requires the firmware loader support on kernel.
+ You need to install the firmware file "riptide.hex" to the standard
+ firmware path (e.g. /lib/firmware).
+
+ Module snd-rme32
+ ----------------
+
+ Module for RME Digi32, Digi32 Pro and Digi32/8 (Sek'd Prodif32,
+ Prodif96 and Prodif Gold) sound cards.
+
+ This module supports multiple cards.
+
+ Module snd-rme96
+ ----------------
+
+ Module for RME Digi96, Digi96/8 and Digi96/8 PRO/PAD/PST sound cards.
+
+ This module supports multiple cards.
+
+ Module snd-rme9652
+ ------------------
+
+ Module for RME Digi9652 (Hammerfall, Hammerfall-Light) sound cards.
+
+ precise_ptr - Enable precise pointer (doesn't work reliably).
+ (default = 0)
+
+ This module supports multiple cards.
+
+ Note: snd-page-alloc module does the job which snd-hammerfall-mem
+ module did formerly. It will allocate the buffers in advance
+ when any RME9652 cards are found. To make the buffer
+ allocation sure, load snd-page-alloc module in the early
+ stage of boot sequence. See "Early Buffer Allocation"
+ section.
+
+ Module snd-sa11xx-uda1341 (on arm only)
+ ---------------------------------------
+
+ Module for Philips UDA1341TS on Compaq iPAQ H3600 sound card.
+
+ Module supports only one card.
+ Module has no enable and index options.
+
+ The power-management is supported.
+
+ Module snd-sb8
+ --------------
+
+ Module for 8-bit SoundBlaster cards: SoundBlaster 1.0,
+ SoundBlaster 2.0,
+ SoundBlaster Pro
+
+ port - port # for SB DSP chip (0x220,0x240,0x260)
+ irq - IRQ # for SB DSP chip (5,7,9,10)
+ dma8 - DMA # for SB DSP chip (1,3)
+
+ This module supports multiple cards and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-sb16 and snd-sbawe
+ -----------------------------
+
+ Module for 16-bit SoundBlaster cards: SoundBlaster 16 (PnP),
+ SoundBlaster AWE 32 (PnP),
+ SoundBlaster AWE 64 PnP
+
+ mic_agc - Mic Auto-Gain-Control - 0 = disable, 1 = enable (default)
+ csp - ASP/CSP chip support - 0 = disable (default), 1 = enable
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ port - port # for SB DSP 4.x chip (0x220,0x240,0x260)
+ mpu_port - port # for MPU-401 UART (0x300,0x330), -1 = disable
+ awe_port - base port # for EMU8000 synthesizer (0x620,0x640,0x660)
+ (snd-sbawe module only)
+ irq - IRQ # for SB DSP 4.x chip (5,7,9,10)
+ dma8 - 8-bit DMA # for SB DSP 4.x chip (0,1,3)
+ dma16 - 16-bit DMA # for SB DSP 4.x chip (5,6,7)
+
+ This module supports multiple cards, autoprobe and ISA PnP.
+
+ Note: To use Vibra16X cards in 16-bit half duplex mode, you must
+ disable 16bit DMA with dma16 = -1 module parameter.
+ Also, all Sound Blaster 16 type cards can operate in 16-bit
+ half duplex mode through 8-bit DMA channel by disabling their
+ 16-bit DMA channel.
+
+ The power-management is supported.
+
+ Module snd-sc6000
+ -----------------
+
+ Module for Gallant SC-6000 soundcard.
+
+ port - Port # (0x220 or 0x240)
+ mss_port - MSS Port # (0x530 or 0xe80)
+ irq - IRQ # (5,7,9,10,11)
+ mpu_irq - MPU-401 IRQ # (5,7,9,10) ,0 - no MPU-401 irq
+ dma - DMA # (1,3,0)
+
+ This module supports multiple cards.
+
+ This card is also known as Audio Excel DSP 16 or Zoltrix AV302.
+
+ Module snd-sgalaxy
+ ------------------
+
+ Module for Aztech Sound Galaxy sound card.
+
+ sbport - Port # for SB16 interface (0x220,0x240)
+ wssport - Port # for WSS interface (0x530,0xe80,0xf40,0x604)
+ irq - IRQ # (7,9,10,11)
+ dma1 - DMA #
+
+ This module supports multiple cards.
+
+ The power-management is supported.
+
+ Module snd-sscape
+ -----------------
+
+ Module for ENSONIQ SoundScape PnP cards.
+
+ port - Port # (PnP setup)
+ wss_port - WSS Port # (PnP setup)
+ irq - IRQ # (PnP setup)
+ mpu_irq - MPU-401 IRQ # (PnP setup)
+ dma - DMA # (PnP setup)
+ dma2 - 2nd DMA # (PnP setup, -1 to disable)
+
+ This module supports multiple cards. ISA PnP must be enabled.
+ You need sscape_ctl tool in alsa-tools package for loading
+ the microcode.
+
+ Module snd-sun-amd7930 (on sparc only)
+ --------------------------------------
+
+ Module for AMD7930 sound chips found on Sparcs.
+
+ This module supports multiple cards.
+
+ Module snd-sun-cs4231 (on sparc only)
+ -------------------------------------
+
+ Module for CS4231 sound chips found on Sparcs.
+
+ This module supports multiple cards.
+
+ Module snd-sun-dbri (on sparc only)
+ -----------------------------------
+
+ Module for DBRI sound chips found on Sparcs.
+
+ This module supports multiple cards.
+
+ Module snd-wavefront
+ --------------------
+
+ Module for Turtle Beach Maui, Tropez and Tropez+ sound cards.
+
+ use_cs4232_midi - Use CS4232 MPU-401 interface
+ (inaccessibly located inside your computer)
+ isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+
+ with isapnp=0, the following options are available:
+
+ cs4232_pcm_port - Port # for CS4232 PCM interface.
+ cs4232_pcm_irq - IRQ # for CS4232 PCM interface (5,7,9,11,12,15).
+ cs4232_mpu_port - Port # for CS4232 MPU-401 interface.
+ cs4232_mpu_irq - IRQ # for CS4232 MPU-401 interface (9,11,12,15).
+ ics2115_port - Port # for ICS2115
+ ics2115_irq - IRQ # for ICS2115
+ fm_port - FM OPL-3 Port #
+ dma1 - DMA1 # for CS4232 PCM interface.
+ dma2 - DMA2 # for CS4232 PCM interface.
+
+ The below are options for wavefront_synth features:
+ wf_raw - Assume that we need to boot the OS (default:no)
+ If yes, then during driver loading, the state of the board is
+ ignored, and we reset the board and load the firmware anyway.
+ fx_raw - Assume that the FX process needs help (default:yes)
+ If false, we'll leave the FX processor in whatever state it is
+ when the driver is loaded. The default is to download the
+ microprogram and associated coefficients to set it up for
+ "default" operation, whatever that means.
+ debug_default - Debug parameters for card initialization
+ wait_usecs - How long to wait without sleeping, usecs
+ (default:150)
+ This magic number seems to give pretty optimal throughput
+ based on my limited experimentation.
+ If you want to play around with it and find a better value, be
+ my guest. Remember, the idea is to get a number that causes us
+ to just busy wait for as many WaveFront commands as possible,
+ without coming up with a number so large that we hog the whole
+ CPU.
+ Specifically, with this number, out of about 134,000 status
+ waits, only about 250 result in a sleep.
+ sleep_interval - How long to sleep when waiting for reply
+ (default: 100)
+ sleep_tries - How many times to try sleeping during a wait
+ (default: 50)
+ ospath - Pathname to processed ICS2115 OS firmware
+ (default:wavefront.os)
+ The path name of the ISC2115 OS firmware. In the recent
+ version, it's handled via firmware loader framework, so it
+ must be installed in the proper path, typically,
+ /lib/firmware.
+ reset_time - How long to wait for a reset to take effect
+ (default:2)
+ ramcheck_time - How many seconds to wait for the RAM test
+ (default:20)
+ osrun_time - How many seconds to wait for the ICS2115 OS
+ (default:10)
+
+ This module supports multiple cards and ISA PnP.
+
+ Note: the firmware file "wavefront.os" was located in the earlier
+ version in /etc. Now it's loaded via firmware loader, and
+ must be in the proper firmware path, such as /lib/firmware.
+ Copy (or symlink) the file appropriately if you get an error
+ regarding firmware downloading after upgrading the kernel.
+
+ Module snd-sonicvibes
+ ---------------------
+
+ Module for S3 SonicVibes PCI sound cards.
+ * PINE Schubert 32 PCI
+
+ reverb - Reverb Enable - 1 = enable, 0 = disable (default)
+ - SoundCard must have onboard SRAM for this.
+ mge - Mic Gain Enable - 1 = enable, 0 = disable (default)
+
+ This module supports multiple cards and autoprobe.
+
+ Module snd-serial-u16550
+ ------------------------
+
+ Module for UART16550A serial MIDI ports.
+
+ port - port # for UART16550A chip
+ irq - IRQ # for UART16550A chip, -1 = poll mode
+ speed - speed in bauds (9600,19200,38400,57600,115200)
+ 38400 = default
+ base - base for divisor in bauds (57600,115200,230400,460800)
+ 115200 = default
+ outs - number of MIDI ports in a serial port (1-4)
+ 1 = default
+ adaptor - Type of adaptor.
+ 0 = Soundcanvas, 1 = MS-124T, 2 = MS-124W S/A,
+ 3 = MS-124W M/B, 4 = Generic
+
+ This module supports multiple cards. This module does not support autoprobe
+ thus the main port must be specified!!! Other options are optional.
+
+ Module snd-trident
+ ------------------
+
+ Module for Trident 4DWave DX/NX sound cards.
+ * Best Union Miss Melody 4DWave PCI
+ * HIS 4DWave PCI
+ * Warpspeed ONSpeed 4DWave PCI
+ * AzTech PCI 64-Q3D
+ * Addonics SV 750
+ * CHIC True Sound 4Dwave
+ * Shark Predator4D-PCI
+ * Jaton SonicWave 4D
+ * SiS SI7018 PCI Audio
+ * Hoontech SoundTrack Digital 4DWave NX
+
+ pcm_channels - max channels (voices) reserved for PCM
+ wavetable_size - max wavetable size in kB (4-?kb)
+
+ This module supports multiple cards and autoprobe.
+
+ The power-management is supported.
+
+ Module snd-usb-audio
+ --------------------
+
+ Module for USB audio and USB MIDI devices.
+
+ vid - Vendor ID for the device (optional)
+ pid - Product ID for the device (optional)
+ nrpacks - Max. number of packets per URB (default: 8)
+ async_unlink - Use async unlink mode (default: yes)
+ device_setup - Device specific magic number (optional)
+ - Influence depends on the device
+ - Default: 0x0000
+ ignore_ctl_error - Ignore any USB-controller regarding mixer
+ interface (default: no)
+
+ This module supports multiple devices, autoprobe and hotplugging.
+
+ NB: nrpacks parameter can be modified dynamically via sysfs.
+ Don't put the value over 20. Changing via sysfs has no sanity
+ check.
+ NB: async_unlink=0 would cause Oops. It remains just for
+ debugging purpose (if any).
+ NB: ignore_ctl_error=1 may help when you get an error at accessing
+ the mixer element such as URB error -22. This happens on some
+ buggy USB device or the controller.
+
+ Module snd-usb-caiaq
+ --------------------
+
+ Module for caiaq UB audio interfaces,
+ * Native Instruments RigKontrol2
+ * Native Instruments Kore Controller
+ * Native Instruments Audio Kontrol 1
+ * Native Instruments Audio 8 DJ
+
+ This module supports multiple devices, autoprobe and hotplugging.
+
+ Module snd-usb-usx2y
+ --------------------
+
+ Module for Tascam USB US-122, US-224 and US-428 devices.
+
+ This module supports multiple devices, autoprobe and hotplugging.
+
+ Note: you need to load the firmware via usx2yloader utility included
+ in alsa-tools and alsa-firmware packages.
+
+ Module snd-via82xx
+ ------------------
+
+ Module for AC'97 motherboards based on VIA 82C686A/686B, 8233,
+ 8233A, 8233C, 8235, 8237 (south) bridge.
+
+ mpu_port - 0x300,0x310,0x320,0x330, otherwise obtain BIOS setup
+ [VIA686A/686B only]
+ joystick - Enable joystick (default off) [VIA686A/686B only]
+ ac97_clock - AC'97 codec clock base (default 48000Hz)
+ dxs_support - support DXS channels,
+ 0 = auto (default), 1 = enable, 2 = disable,
+ 3 = 48k only, 4 = no VRA, 5 = enable any sample
+ rate and different sample rates on different
+ channels
+ [VIA8233/C, 8235, 8237 only]
+ ac97_quirk - AC'97 workaround for strange hardware
+ See "AC97 Quirk Option" section below.
+
+ This module supports one chip and autoprobe.
+
+ Note: on some SMP motherboards like MSI 694D the interrupts might
+ not be generated properly. In such a case, please try to
+ set the SMP (or MPS) version on BIOS to 1.1 instead of
+ default value 1.4. Then the interrupt number will be
+ assigned under 15. You might also upgrade your BIOS.
+
+ Note: VIA8233/5/7 (not VIA8233A) can support DXS (direct sound)
+ channels as the first PCM. On these channels, up to 4
+ streams can be played at the same time, and the controller
+ can perform sample rate conversion with separate rates for
+ each channel.
+ As default (dxs_support = 0), 48k fixed rate is chosen
+ except for the known devices since the output is often
+ noisy except for 48k on some mother boards due to the
+ bug of BIOS.
+ Please try once dxs_support=5 and if it works on other
+ sample rates (e.g. 44.1kHz of mp3 playback), please let us
+ know the PCI subsystem vendor/device id's (output of
+ "lspci -nv").
+ If dxs_support=5 does not work, try dxs_support=4; if it
+ doesn't work too, try dxs_support=1. (dxs_support=1 is
+ usually for old motherboards. The correct implemented
+ board should work with 4 or 5.) If it still doesn't
+ work and the default setting is ok, dxs_support=3 is the
+ right choice. If the default setting doesn't work at all,
+ try dxs_support=2 to disable the DXS channels.
+ In any cases, please let us know the result and the
+ subsystem vendor/device ids. See "Links and Addresses"
+ below.
+
+ Note: for the MPU401 on VIA823x, use snd-mpu401 driver
+ additionally. The mpu_port option is for VIA686 chips only.
+
+ The power-management is supported.
+
+ Module snd-via82xx-modem
+ ------------------------
+
+ Module for VIA82xx AC97 modem
+
+ ac97_clock - AC'97 codec clock base (default 48000Hz)
+
+ This module supports one card and autoprobe.
+
+ Note: The default index value of this module is -2, i.e. the first
+ slot is excluded.
+
+ The power-management is supported.
+
+ Module snd-virmidi
+ ------------------
+
+ Module for virtual rawmidi devices.
+ This module creates virtual rawmidi devices which communicate
+ to the corresponding ALSA sequencer ports.
+
+ midi_devs - MIDI devices # (1-4, default=4)
+
+ This module supports multiple cards.
+
+ Module snd-virtuoso
+ -------------------
+
+ Module for sound cards based on the Asus AV100/AV200 chips,
+ i.e., Xonar D1, DX, D2, D2X and HDAV1.3 (Deluxe).
+
+ This module supports autoprobe and multiple cards.
+
+ Module snd-vx222
+ ----------------
+
+ Module for Digigram VX-Pocket VX222, V222 v2 and Mic cards.
+
+ mic - Enable Microphone on V222 Mic (NYI)
+ ibl - Capture IBL size. (default = 0, minimum size)
+
+ This module supports multiple cards.
+
+ When the driver is compiled as a module and the hotplug firmware
+ is supported, the firmware data is loaded via hotplug automatically.
+ Install the necessary firmware files in alsa-firmware package.
+ When no hotplug fw loader is available, you need to load the
+ firmware via vxloader utility in alsa-tools package. To invoke
+ vxloader automatically, add the following to /etc/modprobe.conf
+
+ install snd-vx222 /sbin/modprobe --first-time -i snd-vx222 && /usr/bin/vxloader
+
+ (for 2.2/2.4 kernels, add "post-install /usr/bin/vxloader" to
+ /etc/modules.conf, instead.)
+ IBL size defines the interrupts period for PCM. The smaller size
+ gives smaller latency but leads to more CPU consumption, too.
+ The size is usually aligned to 126. As default (=0), the smallest
+ size is chosen. The possible IBL values can be found in
+ /proc/asound/cardX/vx-status proc file.
+
+ The power-management is supported.
+
+ Module snd-vxpocket
+ -------------------
+
+ Module for Digigram VX-Pocket VX2 and 440 PCMCIA cards.
+
+ ibl - Capture IBL size. (default = 0, minimum size)
+
+ This module supports multiple cards. The module is compiled only when
+ PCMCIA is supported on kernel.
+
+ With the older 2.6.x kernel, to activate the driver via the card
+ manager, you'll need to set up /etc/pcmcia/vxpocket.conf. See the
+ sound/pcmcia/vx/vxpocket.c. 2.6.13 or later kernel requires no
+ longer require a config file.
+
+ When the driver is compiled as a module and the hotplug firmware
+ is supported, the firmware data is loaded via hotplug automatically.
+ Install the necessary firmware files in alsa-firmware package.
+ When no hotplug fw loader is available, you need to load the
+ firmware via vxloader utility in alsa-tools package.
+
+ About capture IBL, see the description of snd-vx222 module.
+
+ Note: snd-vxp440 driver is merged to snd-vxpocket driver since
+ ALSA 1.0.10.
+
+ The power-management is supported.
+
+ Module snd-ymfpci
+ -----------------
+
+ Module for Yamaha PCI chips (YMF72x, YMF74x & YMF75x).
+
+ mpu_port - 0x300,0x330,0x332,0x334, 0 (disable) by default,
+ 1 (auto-detect for YMF744/754 only)
+ fm_port - 0x388,0x398,0x3a0,0x3a8, 0 (disable) by default
+ 1 (auto-detect for YMF744/754 only)
+ joystick_port - 0x201,0x202,0x204,0x205, 0 (disable) by default,
+ 1 (auto-detect)
+ rear_switch - enable shared rear/line-in switch (bool)
+
+ This module supports autoprobe and multiple chips.
+
+ The power-management is supported.
+
+ Module snd-pdaudiocf
+ --------------------
+
+ Module for Sound Core PDAudioCF sound card.
+
+ The power-management is supported.
+
+
+AC97 Quirk Option
+=================
+
+The ac97_quirk option is used to enable/override the workaround for
+specific devices on drivers for on-board AC'97 controllers like
+snd-intel8x0. Some hardware have swapped output pins between Master
+and Headphone, or Surround (thanks to confusion of AC'97
+specifications from version to version :-)
+
+The driver provides the auto-detection of known problematic devices,
+but some might be unknown or wrongly detected. In such a case, pass
+the proper value with this option.
+
+The following strings are accepted:
+ - default Don't override the default setting
+ - none Disable the quirk
+ - hp_only Bind Master and Headphone controls as a single control
+ - swap_hp Swap headphone and master controls
+ - swap_surround Swap master and surround controls
+ - ad_sharing For AD1985, turn on OMS bit and use headphone
+ - alc_jack For ALC65x, turn on the jack sense mode
+ - inv_eapd Inverted EAPD implementation
+ - mute_led Bind EAPD bit for turning on/off mute LED
+
+For backward compatibility, the corresponding integer value -1, 0,
+... are accepted, too.
+
+For example, if "Master" volume control has no effect on your device
+but only "Headphone" does, pass ac97_quirk=hp_only module option.
+
+
+Configuring Non-ISAPNP Cards
+============================
+
+When the kernel is configured with ISA-PnP support, the modules
+supporting the isapnp cards will have module options "isapnp".
+If this option is set, *only* the ISA-PnP devices will be probed.
+For probing the non ISA-PnP cards, you have to pass "isapnp=0" option
+together with the proper i/o and irq configuration.
+
+When the kernel is configured without ISA-PnP support, isapnp option
+will be not built in.
+
+
+Module Autoloading Support
+==========================
+
+The ALSA drivers can be loaded automatically on demand by defining
+module aliases. The string 'snd-card-%1' is requested for ALSA native
+devices where %i is sound card number from zero to seven.
+
+To auto-load an ALSA driver for OSS services, define the string
+'sound-slot-%i' where %i means the slot number for OSS, which
+corresponds to the card index of ALSA. Usually, define this
+as the same card module.
+
+An example configuration for a single emu10k1 card is like below:
+----- /etc/modprobe.conf
+alias snd-card-0 snd-emu10k1
+alias sound-slot-0 snd-emu10k1
+----- /etc/modprobe.conf
+
+The available number of auto-loaded sound cards depends on the module
+option "cards_limit" of snd module. As default it's set to 1.
+To enable the auto-loading of multiple cards, specify the number of
+sound cards in that option.
+
+When multiple cards are available, it'd better to specify the index
+number for each card via module option, too, so that the order of
+cards is kept consistent.
+
+An example configuration for two sound cards is like below:
+
+----- /etc/modprobe.conf
+# ALSA portion
+options snd cards_limit=2
+alias snd-card-0 snd-interwave
+alias snd-card-1 snd-ens1371
+options snd-interwave index=0
+options snd-ens1371 index=1
+# OSS/Free portion
+alias sound-slot-0 snd-interwave
+alias sound-slot-1 snd-ens1371
+----- /etc/modprobe.conf
+
+In this example, the interwave card is always loaded as the first card
+(index 0) and ens1371 as the second (index 1).
+
+Alternative (and new) way to fixate the slot assignment is to use
+"slots" option of snd module. In the case above, specify like the
+following:
+
+options snd slots=snd-interwave,snd-ens1371
+
+Then, the first slot (#0) is reserved for snd-interwave driver, and
+the second (#1) for snd-ens1371. You can omit index option in each
+driver if slots option is used (although you can still have them at
+the same time as long as they don't conflict).
+
+The slots option is especially useful for avoiding the possible
+hot-plugging and the resultant slot conflict. For example, in the
+case above again, the first two slots are already reserved. If any
+other driver (e.g. snd-usb-audio) is loaded before snd-interwave or
+snd-ens1371, it will be assigned to the third or later slot.
+
+When a module name is given with '!', the slot will be given for any
+modules but that name. For example, "slots=!snd-pcsp" will reserve
+the first slot for any modules but snd-pcsp.
+
+
+ALSA PCM devices to OSS devices mapping
+=======================================
+
+/dev/snd/pcmC0D0[c|p] -> /dev/audio0 (/dev/audio) -> minor 4
+/dev/snd/pcmC0D0[c|p] -> /dev/dsp0 (/dev/dsp) -> minor 3
+/dev/snd/pcmC0D1[c|p] -> /dev/adsp0 (/dev/adsp) -> minor 12
+/dev/snd/pcmC1D0[c|p] -> /dev/audio1 -> minor 4+16 = 20
+/dev/snd/pcmC1D0[c|p] -> /dev/dsp1 -> minor 3+16 = 19
+/dev/snd/pcmC1D1[c|p] -> /dev/adsp1 -> minor 12+16 = 28
+/dev/snd/pcmC2D0[c|p] -> /dev/audio2 -> minor 4+32 = 36
+/dev/snd/pcmC2D0[c|p] -> /dev/dsp2 -> minor 3+32 = 39
+/dev/snd/pcmC2D1[c|p] -> /dev/adsp2 -> minor 12+32 = 44
+
+The first number from /dev/snd/pcmC{X}D{Y}[c|p] expression means
+sound card number and second means device number. The ALSA devices
+have either 'c' or 'p' suffix indicating the direction, capture and
+playback, respectively.
+
+Please note that the device mapping above may be varied via the module
+options of snd-pcm-oss module.
+
+
+Proc interfaces (/proc/asound)
+==============================
+
+/proc/asound/card#/pcm#[cp]/oss
+-------------------------------
+ String "erase" - erase all additional informations about OSS applications
+ String "<app_name> <fragments> <fragment_size> [<options>]"
+
+ <app_name> - name of application with (higher priority) or without path
+ <fragments> - number of fragments or zero if auto
+ <fragment_size> - size of fragment in bytes or zero if auto
+ <options> - optional parameters
+ - disable the application tries to open a pcm device for
+ this channel but does not want to use it.
+ (Cause a bug or mmap needs)
+ It's good for Quake etc...
+ - direct don't use plugins
+ - block force block mode (rvplayer)
+ - non-block force non-block mode
+ - whole-frag write only whole fragments (optimization affecting
+ playback only)
+ - no-silence do not fill silence ahead to avoid clicks
+ - buggy-ptr Returns the whitespace blocks in GETOPTR ioctl
+ instead of filled blocks
+
+ Example: echo "x11amp 128 16384" > /proc/asound/card0/pcm0p/oss
+ echo "squake 0 0 disable" > /proc/asound/card0/pcm0c/oss
+ echo "rvplayer 0 0 block" > /proc/asound/card0/pcm0p/oss
+
+
+Early Buffer Allocation
+=======================
+
+Some drivers (e.g. hdsp) require the large contiguous buffers, and
+sometimes it's too late to find such spaces when the driver module is
+actually loaded due to memory fragmentation. You can pre-allocate the
+PCM buffers by loading snd-page-alloc module and write commands to its
+proc file in prior, for example, in the early boot stage like
+/etc/init.d/*.local scripts.
+
+Reading the proc file /proc/drivers/snd-page-alloc shows the current
+usage of page allocation. In writing, you can send the following
+commands to the snd-page-alloc driver:
+
+ - add VENDOR DEVICE MASK SIZE BUFFERS
+
+ VENDOR and DEVICE are PCI vendor and device IDs. They take
+ integer numbers (0x prefix is needed for the hex).
+ MASK is the PCI DMA mask. Pass 0 if not restricted.
+ SIZE is the size of each buffer to allocate. You can pass
+ k and m suffix for KB and MB. The max number is 16MB.
+ BUFFERS is the number of buffers to allocate. It must be greater
+ than 0. The max number is 4.
+
+ - erase
+
+ This will erase the all pre-allocated buffers which are not in
+ use.
+
+
+Links and Addresses
+===================
+
+ ALSA project homepage
+ http://www.alsa-project.org
+
+ ALSA Bug Tracking System
+ https://bugtrack.alsa-project.org/bugs/
+
+ ALSA Developers ML
+ mailto:alsa-devel@alsa-project.org
diff --git a/Documentation/sound/alsa/Audigy-mixer.txt b/Documentation/sound/alsa/Audigy-mixer.txt
new file mode 100644
index 0000000..7f10dc6
--- /dev/null
+++ b/Documentation/sound/alsa/Audigy-mixer.txt
@@ -0,0 +1,345 @@
+
+ Sound Blaster Audigy mixer / default DSP code
+ ===========================================
+
+This is based on SB-Live-mixer.txt.
+
+The EMU10K2 chips have a DSP part which can be programmed to support
+various ways of sample processing, which is described here.
+(This article does not deal with the overall functionality of the
+EMU10K2 chips. See the manuals section for further details.)
+
+The ALSA driver programs this portion of chip by default code
+(can be altered later) which offers the following functionality:
+
+
+1) Digital mixer controls
+-------------------------
+
+These controls are built using the DSP instructions. They offer extended
+functionality. Only the default build-in code in the ALSA driver is described
+here. Note that the controls work as attenuators: the maximum value is the
+neutral position leaving the signal unchanged. Note that if the same destination
+is mentioned in multiple controls, the signal is accumulated and can be wrapped
+(set to maximal or minimal value without checking of overflow).
+
+
+Explanation of used abbreviations:
+
+DAC - digital to analog converter
+ADC - analog to digital converter
+I2S - one-way three wire serial bus for digital sound by Philips Semiconductors
+ (this standard is used for connecting standalone DAC and ADC converters)
+LFE - low frequency effects (subwoofer signal)
+AC97 - a chip containing an analog mixer, DAC and ADC converters
+IEC958 - S/PDIF
+FX-bus - the EMU10K2 chip has an effect bus containing 64 accumulators.
+ Each of the synthesizer voices can feed its output to these accumulators
+ and the DSP microcontroller can operate with the resulting sum.
+
+name='PCM Front Playback Volume',index=0
+
+This control is used to attenuate samples for left and right front PCM FX-bus
+accumulators. ALSA uses accumulators 8 and 9 for left and right front PCM
+samples for 5.1 playback. The result samples are forwarded to the front DAC PCM
+slots of the Philips DAC.
+
+name='PCM Surround Playback Volume',index=0
+
+This control is used to attenuate samples for left and right surround PCM FX-bus
+accumulators. ALSA uses accumulators 2 and 3 for left and right surround PCM
+samples for 5.1 playback. The result samples are forwarded to the surround DAC PCM
+slots of the Philips DAC.
+
+name='PCM Center Playback Volume',index=0
+
+This control is used to attenuate samples for center PCM FX-bus accumulator.
+ALSA uses accumulator 6 for center PCM sample for 5.1 playback. The result sample
+is forwarded to the center DAC PCM slot of the Philips DAC.
+
+name='PCM LFE Playback Volume',index=0
+
+This control is used to attenuate sample for LFE PCM FX-bus accumulator.
+ALSA uses accumulator 7 for LFE PCM sample for 5.1 playback. The result sample
+is forwarded to the LFE DAC PCM slot of the Philips DAC.
+
+name='PCM Playback Volume',index=0
+
+This control is used to attenuate samples for left and right PCM FX-bus
+accumulators. ALSA uses accumulators 0 and 1 for left and right PCM samples for
+stereo playback. The result samples are forwarded to the front DAC PCM slots
+of the Philips DAC.
+
+name='PCM Capture Volume',index=0
+
+This control is used to attenuate samples for left and right PCM FX-bus
+accumulator. ALSA uses accumulators 0 and 1 for left and right PCM.
+The result is forwarded to the ADC capture FIFO (thus to the standard capture
+PCM device).
+
+name='Music Playback Volume',index=0
+
+This control is used to attenuate samples for left and right MIDI FX-bus
+accumulators. ALSA uses accumulators 4 and 5 for left and right MIDI samples.
+The result samples are forwarded to the front DAC PCM slots of the AC97 codec.
+
+name='Music Capture Volume',index=0
+
+These controls are used to attenuate samples for left and right MIDI FX-bus
+accumulator. ALSA uses accumulators 4 and 5 for left and right PCM.
+The result is forwarded to the ADC capture FIFO (thus to the standard capture
+PCM device).
+
+name='Mic Playback Volume',index=0
+
+This control is used to attenuate samples for left and right Mic input.
+For Mic input is used AC97 codec. The result samples are forwarded to
+the front DAC PCM slots of the Philips DAC. Samples are forwarded to Mic
+capture FIFO (device 1 - 16bit/8KHz mono) too without volume control.
+
+name='Mic Capture Volume',index=0
+
+This control is used to attenuate samples for left and right Mic input.
+The result is forwarded to the ADC capture FIFO (thus to the standard capture
+PCM device).
+
+name='Audigy CD Playback Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 TTL
+digital inputs (usually used by a CDROM drive). The result samples are
+forwarded to the front DAC PCM slots of the Philips DAC.
+
+name='Audigy CD Capture Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 TTL
+digital inputs (usually used by a CDROM drive). The result samples are
+forwarded to the ADC capture FIFO (thus to the standard capture PCM device).
+
+name='IEC958 Optical Playback Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 optical
+digital input. The result samples are forwarded to the front DAC PCM slots
+of the Philips DAC.
+
+name='IEC958 Optical Capture Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 optical
+digital inputs. The result samples are forwarded to the ADC capture FIFO
+(thus to the standard capture PCM device).
+
+name='Line2 Playback Volume',index=0
+
+This control is used to attenuate samples from left and right I2S ADC
+inputs (on the AudigyDrive). The result samples are forwarded to the front
+DAC PCM slots of the Philips DAC.
+
+name='Line2 Capture Volume',index=1
+
+This control is used to attenuate samples from left and right I2S ADC
+inputs (on the AudigyDrive). The result samples are forwarded to the ADC
+capture FIFO (thus to the standard capture PCM device).
+
+name='Analog Mix Playback Volume',index=0
+
+This control is used to attenuate samples from left and right I2S ADC
+inputs from Philips ADC. The result samples are forwarded to the front
+DAC PCM slots of the Philips DAC. This contains mix from analog sources
+like CD, Line In, Aux, ....
+
+name='Analog Mix Capture Volume',index=1
+
+This control is used to attenuate samples from left and right I2S ADC
+inputs Philips ADC. The result samples are forwarded to the ADC
+capture FIFO (thus to the standard capture PCM device).
+
+name='Aux2 Playback Volume',index=0
+
+This control is used to attenuate samples from left and right I2S ADC
+inputs (on the AudigyDrive). The result samples are forwarded to the front
+DAC PCM slots of the Philips DAC.
+
+name='Aux2 Capture Volume',index=1
+
+This control is used to attenuate samples from left and right I2S ADC
+inputs (on the AudigyDrive). The result samples are forwarded to the ADC
+capture FIFO (thus to the standard capture PCM device).
+
+name='Front Playback Volume',index=0
+
+All stereo signals are mixed together and mirrored to surround, center and LFE.
+This control is used to attenuate samples for left and right front speakers of
+this mix.
+
+name='Surround Playback Volume',index=0
+
+All stereo signals are mixed together and mirrored to surround, center and LFE.
+This control is used to attenuate samples for left and right surround speakers of
+this mix.
+
+name='Center Playback Volume',index=0
+
+All stereo signals are mixed together and mirrored to surround, center and LFE.
+This control is used to attenuate sample for center speaker of this mix.
+
+name='LFE Playback Volume',index=0
+
+All stereo signals are mixed together and mirrored to surround, center and LFE.
+This control is used to attenuate sample for LFE speaker of this mix.
+
+name='Tone Control - Switch',index=0
+
+This control turns the tone control on or off. The samples for front, rear
+and center / LFE outputs are affected.
+
+name='Tone Control - Bass',index=0
+
+This control sets the bass intensity. There is no neutral value!!
+When the tone control code is activated, the samples are always modified.
+The closest value to pure signal is 20.
+
+name='Tone Control - Treble',index=0
+
+This control sets the treble intensity. There is no neutral value!!
+When the tone control code is activated, the samples are always modified.
+The closest value to pure signal is 20.
+
+name='Master Playback Volume',index=0
+
+This control is used to attenuate samples for front, surround, center and
+LFE outputs.
+
+name='IEC958 Optical Raw Playback Switch',index=0
+
+If this switch is on, then the samples for the IEC958 (S/PDIF) digital
+output are taken only from the raw FX8010 PCM, otherwise standard front
+PCM samples are taken.
+
+
+2) PCM stream related controls
+------------------------------
+
+name='EMU10K1 PCM Volume',index 0-31
+
+Channel volume attenuation in range 0-0xffff. The maximum value (no
+attenuation) is default. The channel mapping for three values is
+as follows:
+
+ 0 - mono, default 0xffff (no attenuation)
+ 1 - left, default 0xffff (no attenuation)
+ 2 - right, default 0xffff (no attenuation)
+
+name='EMU10K1 PCM Send Routing',index 0-31
+
+This control specifies the destination - FX-bus accumulators. There 24
+values with this mapping:
+
+ 0 - mono, A destination (FX-bus 0-63), default 0
+ 1 - mono, B destination (FX-bus 0-63), default 1
+ 2 - mono, C destination (FX-bus 0-63), default 2
+ 3 - mono, D destination (FX-bus 0-63), default 3
+ 4 - mono, E destination (FX-bus 0-63), default 0
+ 5 - mono, F destination (FX-bus 0-63), default 0
+ 6 - mono, G destination (FX-bus 0-63), default 0
+ 7 - mono, H destination (FX-bus 0-63), default 0
+ 8 - left, A destination (FX-bus 0-63), default 0
+ 9 - left, B destination (FX-bus 0-63), default 1
+ 10 - left, C destination (FX-bus 0-63), default 2
+ 11 - left, D destination (FX-bus 0-63), default 3
+ 12 - left, E destination (FX-bus 0-63), default 0
+ 13 - left, F destination (FX-bus 0-63), default 0
+ 14 - left, G destination (FX-bus 0-63), default 0
+ 15 - left, H destination (FX-bus 0-63), default 0
+ 16 - right, A destination (FX-bus 0-63), default 0
+ 17 - right, B destination (FX-bus 0-63), default 1
+ 18 - right, C destination (FX-bus 0-63), default 2
+ 19 - right, D destination (FX-bus 0-63), default 3
+ 20 - right, E destination (FX-bus 0-63), default 0
+ 21 - right, F destination (FX-bus 0-63), default 0
+ 22 - right, G destination (FX-bus 0-63), default 0
+ 23 - right, H destination (FX-bus 0-63), default 0
+
+Don't forget that it's illegal to assign a channel to the same FX-bus accumulator
+more than once (it means 0=0 && 1=0 is an invalid combination).
+
+name='EMU10K1 PCM Send Volume',index 0-31
+
+It specifies the attenuation (amount) for given destination in range 0-255.
+The channel mapping is following:
+
+ 0 - mono, A destination attn, default 255 (no attenuation)
+ 1 - mono, B destination attn, default 255 (no attenuation)
+ 2 - mono, C destination attn, default 0 (mute)
+ 3 - mono, D destination attn, default 0 (mute)
+ 4 - mono, E destination attn, default 0 (mute)
+ 5 - mono, F destination attn, default 0 (mute)
+ 6 - mono, G destination attn, default 0 (mute)
+ 7 - mono, H destination attn, default 0 (mute)
+ 8 - left, A destination attn, default 255 (no attenuation)
+ 9 - left, B destination attn, default 0 (mute)
+ 10 - left, C destination attn, default 0 (mute)
+ 11 - left, D destination attn, default 0 (mute)
+ 12 - left, E destination attn, default 0 (mute)
+ 13 - left, F destination attn, default 0 (mute)
+ 14 - left, G destination attn, default 0 (mute)
+ 15 - left, H destination attn, default 0 (mute)
+ 16 - right, A destination attn, default 0 (mute)
+ 17 - right, B destination attn, default 255 (no attenuation)
+ 18 - right, C destination attn, default 0 (mute)
+ 19 - right, D destination attn, default 0 (mute)
+ 20 - right, E destination attn, default 0 (mute)
+ 21 - right, F destination attn, default 0 (mute)
+ 22 - right, G destination attn, default 0 (mute)
+ 23 - right, H destination attn, default 0 (mute)
+
+
+
+4) MANUALS/PATENTS:
+-------------------
+
+ftp://opensource.creative.com/pub/doc
+-------------------------------------
+
+ Files:
+ LM4545.pdf AC97 Codec
+
+ m2049.pdf The EMU10K1 Digital Audio Processor
+
+ hog63.ps FX8010 - A DSP Chip Architecture for Audio Effects
+
+
+WIPO Patents
+------------
+ Patent numbers:
+ WO 9901813 (A1) Audio Effects Processor with multiple asynchronous (Jan. 14, 1999)
+ streams
+
+ WO 9901814 (A1) Processor with Instruction Set for Audio Effects (Jan. 14, 1999)
+
+ WO 9901953 (A1) Audio Effects Processor having Decoupled Instruction
+ Execution and Audio Data Sequencing (Jan. 14, 1999)
+
+
+US Patents (http://www.uspto.gov/)
+----------------------------------
+
+ US 5925841 Digital Sampling Instrument employing cache memory (Jul. 20, 1999)
+
+ US 5928342 Audio Effects Processor integrated on a single chip (Jul. 27, 1999)
+ with a multiport memory onto which multiple asynchronous
+ digital sound samples can be concurrently loaded
+
+ US 5930158 Processor with Instruction Set for Audio Effects (Jul. 27, 1999)
+
+ US 6032235 Memory initialization circuit (Tram) (Feb. 29, 2000)
+
+ US 6138207 Interpolation looping of audio samples in cache connected to (Oct. 24, 2000)
+ system bus with prioritization and modification of bus transfers
+ in accordance with loop ends and minimum block sizes
+
+ US 6151670 Method for conserving memory storage using a (Nov. 21, 2000)
+ pool of short term memory registers
+
+ US 6195715 Interrupt control for multiple programs communicating with (Feb. 27, 2001)
+ a common interrupt by associating programs to GP registers,
+ defining interrupt register, polling GP registers, and invoking
+ callback routine associated with defined interrupt register
diff --git a/Documentation/sound/alsa/Audiophile-Usb.txt b/Documentation/sound/alsa/Audiophile-Usb.txt
new file mode 100644
index 0000000..a4c53d8
--- /dev/null
+++ b/Documentation/sound/alsa/Audiophile-Usb.txt
@@ -0,0 +1,442 @@
+ Guide to using M-Audio Audiophile USB with ALSA and Jack v1.5
+ ========================================================
+
+ Thibault Le Meur <Thibault.LeMeur@supelec.fr>
+
+This document is a guide to using the M-Audio Audiophile USB (tm) device with
+ALSA and JACK.
+
+History
+=======
+* v1.4 - Thibault Le Meur (2007-07-11)
+ - Added Low Endianness nature of 16bits-modes
+ found by Hakan Lennestal <Hakan.Lennestal@brfsodrahamn.se>
+ - Modifying document structure
+* v1.5 - Thibault Le Meur (2007-07-12)
+ - Added AC3/DTS passthru info
+
+
+1 - Audiophile USB Specs and correct usage
+==========================================
+
+This part is a reminder of important facts about the functions and limitations
+of the device.
+
+The device has 4 audio interfaces, and 2 MIDI ports:
+ * Analog Stereo Input (Ai)
+ - This port supports 2 pairs of line-level audio inputs (1/4" TS and RCA)
+ - When the 1/4" TS (jack) connectors are connected, the RCA connectors
+ are disabled
+ * Analog Stereo Output (Ao)
+ * Digital Stereo Input (Di)
+ * Digital Stereo Output (Do)
+ * Midi In (Mi)
+ * Midi Out (Mo)
+
+The internal DAC/ADC has the following characteristics:
+* sample depth of 16 or 24 bits
+* sample rate from 8kHz to 96kHz
+* Two interfaces can't use different sample depths at the same time.
+Moreover, the Audiophile USB documentation gives the following Warning:
+"Please exit any audio application running before switching between bit depths"
+
+Due to the USB 1.1 bandwidth limitation, a limited number of interfaces can be
+activated at the same time depending on the audio mode selected:
+ * 16-bit/48kHz ==> 4 channels in + 4 channels out
+ - Ai+Ao+Di+Do
+ * 24-bit/48kHz ==> 4 channels in + 2 channels out,
+ or 2 channels in + 4 channels out
+ - Ai+Ao+Do or Ai+Di+Ao or Ai+Di+Do or Di+Ao+Do
+ * 24-bit/96kHz ==> 2 channels in _or_ 2 channels out (half duplex only)
+ - Ai or Ao or Di or Do
+
+Important facts about the Digital interface:
+--------------------------------------------
+ * The Do port additionally supports surround-encoded AC-3 and DTS passthrough,
+though I haven't tested it under Linux
+ - Note that in this setup only the Do interface can be enabled
+ * Apart from recording an audio digital stream, enabling the Di port is a way
+to synchronize the device to an external sample clock
+ - As a consequence, the Di port must be enable only if an active Digital
+source is connected
+ - Enabling Di when no digital source is connected can result in a
+synchronization error (for instance sound played at an odd sample rate)
+
+
+2 - Audiophile USB MIDI support in ALSA
+=======================================
+
+The Audiophile USB MIDI ports will be automatically supported once the
+following modules have been loaded:
+ * snd-usb-audio
+ * snd-seq-midi
+
+No additional setting is required.
+
+
+3 - Audiophile USB Audio support in ALSA
+========================================
+
+Audio functions of the Audiophile USB device are handled by the snd-usb-audio
+module. This module can work in a default mode (without any device-specific
+parameter), or in an "advanced" mode with the device-specific parameter called
+"device_setup".
+
+3.1 - Default Alsa driver mode
+------------------------------
+
+The default behavior of the snd-usb-audio driver is to list the device
+capabilities at startup and activate the required mode when required
+by the applications: for instance if the user is recording in a
+24bit-depth-mode and immediately after wants to switch to a 16bit-depth mode,
+the snd-usb-audio module will reconfigure the device on the fly.
+
+This approach has the advantage to let the driver automatically switch from sample
+rates/depths automatically according to the user's needs. However, those who
+are using the device under windows know that this is not how the device is meant to
+work: under windows applications must be closed before using the m-audio control
+panel to switch the device working mode. Thus as we'll see in next section, this
+Default Alsa driver mode can lead to device misconfigurations.
+
+Let's get back to the Default Alsa driver mode for now. In this case the
+Audiophile interfaces are mapped to alsa pcm devices in the following
+way (I suppose the device's index is 1):
+ * hw:1,0 is Ao in playback and Di in capture
+ * hw:1,1 is Do in playback and Ai in capture
+ * hw:1,2 is Do in AC3/DTS passthrough mode
+
+In this mode, the device uses Big Endian byte-encoding so that
+supported audio format are S16_BE for 16-bit depth modes and S24_3BE for
+24-bits depth mode.
+
+One exception is the hw:1,2 port which was reported to be Little Endian
+compliant (supposedly supporting S16_LE) but processes in fact only S16_BE streams.
+This has been fixed in kernel 2.6.23 and above and now the hw:1,2 interface
+is reported to be big endian in this default driver mode.
+
+Examples:
+ * playing a S24_3BE encoded raw file to the Ao port
+ % aplay -D hw:1,0 -c2 -t raw -r48000 -fS24_3BE test.raw
+ * recording a S24_3BE encoded raw file from the Ai port
+ % arecord -D hw:1,1 -c2 -t raw -r48000 -fS24_3BE test.raw
+ * playing a S16_BE encoded raw file to the Do port
+ % aplay -D hw:1,1 -c2 -t raw -r48000 -fS16_BE test.raw
+ * playing an ac3 sample file to the Do port
+ % aplay -D hw:1,2 --channels=6 ac3_S16_BE_encoded_file.raw
+
+If you're happy with the default Alsa driver mode and don't experience any
+issue with this mode, then you can skip the following chapter.
+
+3.2 - Advanced module setup
+---------------------------
+
+Due to the hardware constraints described above, the device initialization made
+by the Alsa driver in default mode may result in a corrupted state of the
+device. For instance, a particularly annoying issue is that the sound captured
+from the Ai interface sounds distorted (as if boosted with an excessive high
+volume gain).
+
+For people having this problem, the snd-usb-audio module has a new module
+parameter called "device_setup" (this parameter was introduced in kernel
+release 2.6.17)
+
+3.2.1 - Initializing the working mode of the Audiophile USB
+
+As far as the Audiophile USB device is concerned, this value let the user
+specify:
+ * the sample depth
+ * the sample rate
+ * whether the Di port is used or not
+
+When initialized with "device_setup=0x00", the snd-usb-audio module has
+the same behaviour as when the parameter is omitted (see paragraph "Default
+Alsa driver mode" above)
+
+Others modes are described in the following subsections.
+
+3.2.1.1 - 16-bit modes
+
+The two supported modes are:
+
+ * device_setup=0x01
+ - 16bits 48kHz mode with Di disabled
+ - Ai,Ao,Do can be used at the same time
+ - hw:1,0 is not available in capture mode
+ - hw:1,2 is not available
+
+ * device_setup=0x11
+ - 16bits 48kHz mode with Di enabled
+ - Ai,Ao,Di,Do can be used at the same time
+ - hw:1,0 is available in capture mode
+ - hw:1,2 is not available
+
+In this modes the device operates only at 16bits-modes. Before kernel 2.6.23,
+the devices where reported to be Big-Endian when in fact they were Little-Endian
+so that playing a file was a matter of using:
+ % aplay -D hw:1,1 -c2 -t raw -r48000 -fS16_BE test_S16_LE.raw
+where "test_S16_LE.raw" was in fact a little-endian sample file.
+
+Thanks to Hakan Lennestal (who discovered the Little-Endiannes of the device in
+these modes) a fix has been committed (expected in kernel 2.6.23) and
+Alsa now reports Little-Endian interfaces. Thus playing a file now is as simple as
+using:
+ % aplay -D hw:1,1 -c2 -t raw -r48000 -fS16_LE test_S16_LE.raw
+
+3.2.1.2 - 24-bit modes
+
+The three supported modes are:
+
+ * device_setup=0x09
+ - 24bits 48kHz mode with Di disabled
+ - Ai,Ao,Do can be used at the same time
+ - hw:1,0 is not available in capture mode
+ - hw:1,2 is not available
+
+ * device_setup=0x19
+ - 24bits 48kHz mode with Di enabled
+ - 3 ports from {Ai,Ao,Di,Do} can be used at the same time
+ - hw:1,0 is available in capture mode and an active digital source must be
+ connected to Di
+ - hw:1,2 is not available
+
+ * device_setup=0x0D or 0x10
+ - 24bits 96kHz mode
+ - Di is enabled by default for this mode but does not need to be connected
+ to an active source
+ - Only 1 port from {Ai,Ao,Di,Do} can be used at the same time
+ - hw:1,0 is available in captured mode
+ - hw:1,2 is not available
+
+In these modes the device is only Big-Endian compliant (see "Default Alsa driver
+mode" above for an aplay command example)
+
+3.2.1.3 - AC3 w/ DTS passthru mode
+
+Thanks to Hakan Lennestal, I now have a report saying that this mode works.
+
+ * device_setup=0x03
+ - 16bits 48kHz mode with only the Do port enabled
+ - AC3 with DTS passthru
+ - Caution with this setup the Do port is mapped to the pcm device hw:1,0
+
+The command line used to playback the AC3/DTS encoded .wav-files in this mode:
+ % aplay -D hw:1,0 --channels=6 ac3_S16_LE_encoded_file.raw
+
+3.2.2 - How to use the device_setup parameter
+----------------------------------------------
+
+The parameter can be given:
+
+ * By manually probing the device (as root):
+ # modprobe -r snd-usb-audio
+ # modprobe snd-usb-audio index=1 device_setup=0x09
+
+ * Or while configuring the modules options in your modules configuration file
+ - For Fedora distributions, edit the /etc/modprobe.conf file:
+ alias snd-card-1 snd-usb-audio
+ options snd-usb-audio index=1 device_setup=0x09
+
+CAUTION when initializing the device
+-------------------------------------
+
+ * Correct initialization on the device requires that device_setup is given to
+ the module BEFORE the device is turned on. So, if you use the "manual probing"
+ method described above, take care to power-on the device AFTER this initialization.
+
+ * Failing to respect this will lead to a misconfiguration of the device. In this case
+ turn off the device, unprobe the snd-usb-audio module, then probe it again with
+ correct device_setup parameter and then (and only then) turn on the device again.
+
+ * If you've correctly initialized the device in a valid mode and then want to switch
+ to another mode (possibly with another sample-depth), please use also the following
+ procedure:
+ - first turn off the device
+ - de-register the snd-usb-audio module (modprobe -r)
+ - change the device_setup parameter by changing the device_setup
+ option in /etc/modprobe.conf
+ - turn on the device
+ * A workaround for this last issue has been applied to kernel 2.6.23, but it may not
+ be enough to ensure the 'stability' of the device initialization.
+
+3.2.3 - Technical details for hackers
+-------------------------------------
+This section is for hackers, wanting to understand details about the device
+internals and how Alsa supports it.
+
+3.2.3.1 - Audiophile USB's device_setup structure
+
+If you want to understand the device_setup magic numbers for the Audiophile
+USB, you need some very basic understanding of binary computation. However,
+this is not required to use the parameter and you may skip this section.
+
+The device_setup is one byte long and its structure is the following:
+
+ +---+---+---+---+---+---+---+---+
+ | b7| b6| b5| b4| b3| b2| b1| b0|
+ +---+---+---+---+---+---+---+---+
+ | 0 | 0 | 0 | Di|24B|96K|DTS|SET|
+ +---+---+---+---+---+---+---+---+
+
+Where:
+ * b0 is the "SET" bit
+ - it MUST be set if device_setup is initialized
+ * b1 is the "DTS" bit
+ - it is set only for Digital output with DTS/AC3
+ - this setup is not tested
+ * b2 is the Rate selection flag
+ - When set to "1" the rate range is 48.1-96kHz
+ - Otherwise the sample rate range is 8-48kHz
+ * b3 is the bit depth selection flag
+ - When set to "1" samples are 24bits long
+ - Otherwise they are 16bits long
+ - Note that b2 implies b3 as the 96kHz mode is only supported for 24 bits
+ samples
+ * b4 is the Digital input flag
+ - When set to "1" the device assumes that an active digital source is
+ connected
+ - You shouldn't enable Di if no source is seen on the port (this leads to
+ synchronization issues)
+ - b4 is implied by b2 (since only one port is enabled at a time no synch
+ error can occur)
+ * b5 to b7 are reserved for future uses, and must be set to "0"
+ - might become Ao, Do, Ai, for b7, b6, b4 respectively
+
+Caution:
+ * there is no check on the value you will give to device_setup
+ - for instance choosing 0x05 (16bits 96kHz) will fail back to 0x09 since
+ b2 implies b3. But _there_will_be_no_warning_ in /var/log/messages
+ * Hardware constraints due to the USB bus limitation aren't checked
+ - choosing b2 will prepare all interfaces for 24bits/96kHz but you'll
+ only be able to use one at the same time
+
+3.2.3.2 - USB implementation details for this device
+
+You may safely skip this section if you're not interested in driver
+hacking.
+
+This section describes some internal aspects of the device and summarizes the
+data I got by usb-snooping the windows and Linux drivers.
+
+The M-Audio Audiophile USB has 7 USB Interfaces:
+a "USB interface":
+ * USB Interface nb.0
+ * USB Interface nb.1
+ - Audio Control function
+ * USB Interface nb.2
+ - Analog Output
+ * USB Interface nb.3
+ - Digital Output
+ * USB Interface nb.4
+ - Analog Input
+ * USB Interface nb.5
+ - Digital Input
+ * USB Interface nb.6
+ - MIDI interface compliant with the MIDIMAN quirk
+
+Each interface has 5 altsettings (AltSet 1,2,3,4,5) except:
+ * Interface 3 (Digital Out) has an extra Alset nb.6
+ * Interface 5 (Digital In) does not have Alset nb.3 and 5
+
+Here is a short description of the AltSettings capabilities:
+ * AltSettings 1 corresponds to
+ - 24-bit depth, 48.1-96kHz sample mode
+ - Adaptive playback (Ao and Do), Synch capture (Ai), or Asynch capture (Di)
+ * AltSettings 2 corresponds to
+ - 24-bit depth, 8-48kHz sample mode
+ - Asynch capture and playback (Ao,Ai,Do,Di)
+ * AltSettings 3 corresponds to
+ - 24-bit depth, 8-48kHz sample mode
+ - Synch capture (Ai) and Adaptive playback (Ao,Do)
+ * AltSettings 4 corresponds to
+ - 16-bit depth, 8-48kHz sample mode
+ - Asynch capture and playback (Ao,Ai,Do,Di)
+ * AltSettings 5 corresponds to
+ - 16-bit depth, 8-48kHz sample mode
+ - Synch capture (Ai) and Adaptive playback (Ao,Do)
+ * AltSettings 6 corresponds to
+ - 16-bit depth, 8-48kHz sample mode
+ - Synch playback (Do), audio format type III IEC1937_AC-3
+
+In order to ensure a correct initialization of the device, the driver
+_must_know_ how the device will be used:
+ * if DTS is chosen, only Interface 2 with AltSet nb.6 must be
+ registered
+ * if 96KHz only AltSets nb.1 of each interface must be selected
+ * if samples are using 24bits/48KHz then AltSet 2 must me used if
+ Digital input is connected, and only AltSet nb.3 if Digital input
+ is not connected
+ * if samples are using 16bits/48KHz then AltSet 4 must me used if
+ Digital input is connected, and only AltSet nb.5 if Digital input
+ is not connected
+
+When device_setup is given as a parameter to the snd-usb-audio module, the
+parse_audio_endpoints function uses a quirk called
+"audiophile_skip_setting_quirk" in order to prevent AltSettings not
+corresponding to device_setup from being registered in the driver.
+
+4 - Audiophile USB and Jack support
+===================================
+
+This section deals with support of the Audiophile USB device in Jack.
+
+There are 2 main potential issues when using Jackd with the device:
+* support for Big-Endian devices in 24-bit modes
+* support for 4-in / 4-out channels
+
+4.1 - Direct support in Jackd
+-----------------------------
+
+Jack supports big endian devices only in recent versions (thanks to
+Andreas Steinmetz for his first big-endian patch). I can't remember
+exactly when this support was released into jackd, let's just say that
+with jackd version 0.103.0 it's almost ok (just a small bug is affecting
+16bits Big-Endian devices, but since you've read carefully the above
+paragraphs, you're now using kernel >= 2.6.23 and your 16bits devices
+are now Little Endians ;-) ).
+
+You can run jackd with the following command for playback with Ao and
+record with Ai:
+ % jackd -R -dalsa -Phw:1,0 -r48000 -p128 -n2 -D -Chw:1,1
+
+4.2 - Using Alsa plughw
+-----------------------
+If you don't have a recent Jackd installed, you can downgrade to using
+the Alsa "plug" converter.
+
+For instance here is one way to run Jack with 2 playback channels on Ao and 2
+capture channels from Ai:
+ % jackd -R -dalsa -dplughw:1 -r48000 -p256 -n2 -D -Cplughw:1,1
+
+However you may see the following warning message:
+"You appear to be using the ALSA software "plug" layer, probably a result of
+using the "default" ALSA device. This is less efficient than it could be.
+Consider using a hardware device instead rather than using the plug layer."
+
+4.3 - Getting 2 input and/or output interfaces in Jack
+------------------------------------------------------
+
+As you can see, starting the Jack server this way will only enable 1 stereo
+input (Di or Ai) and 1 stereo output (Ao or Do).
+
+This is due to the following restrictions:
+* Jack can only open one capture device and one playback device at a time
+* The Audiophile USB is seen as 2 (or three) Alsa devices: hw:1,0, hw:1,1
+ (and optionally hw:1,2)
+
+If you want to get Ai+Di and/or Ao+Do support with Jack, you would need to
+combine the Alsa devices into one logical "complex" device.
+
+If you want to give it a try, I recommend reading the information from
+this page: http://www.sound-man.co.uk/linuxaudio/ice1712multi.html
+It is related to another device (ice1712) but can be adapted to suit
+the Audiophile USB.
+
+Enabling multiple Audiophile USB interfaces for Jackd will certainly require:
+* Making sure your Jackd version has the MMAP_COMPLEX patch (see the ice1712 page)
+* (maybe) patching the alsa-lib/src/pcm/pcm_multi.c file (see the ice1712 page)
+* define a multi device (combination of hw:1,0 and hw:1,1) in your .asoundrc
+ file
+* start jackd with this device
+
+I had no success in testing this for now, if you have any success with this kind
+of setup, please drop me an email.
diff --git a/Documentation/sound/alsa/Bt87x.txt b/Documentation/sound/alsa/Bt87x.txt
new file mode 100644
index 0000000..f158cde
--- /dev/null
+++ b/Documentation/sound/alsa/Bt87x.txt
@@ -0,0 +1,78 @@
+Intro
+=====
+
+You might have noticed that the bt878 grabber cards have actually
+_two_ PCI functions:
+
+$ lspci
+[ ... ]
+00:0a.0 Multimedia video controller: Brooktree Corporation Bt878 (rev 02)
+00:0a.1 Multimedia controller: Brooktree Corporation Bt878 (rev 02)
+[ ... ]
+
+The first does video, it is backward compatible to the bt848. The second
+does audio. snd-bt87x is a driver for the second function. It's a sound
+driver which can be used for recording sound (and _only_ recording, no
+playback). As most TV cards come with a short cable which can be plugged
+into your sound card's line-in you probably don't need this driver if all
+you want to do is just watching TV...
+
+Some cards do not bother to connect anything to the audio input pins of
+the chip, and some other cards use the audio function to transport MPEG
+video data, so it's quite possible that audio recording may not work
+with your card.
+
+
+Driver Status
+=============
+
+The driver is now stable. However, it doesn't know about many TV cards,
+and it refuses to load for cards it doesn't know.
+
+If the driver complains ("Unknown TV card found, the audio driver will
+not load"), you can specify the load_all=1 option to force the driver to
+try to use the audio capture function of your card. If the frequency of
+recorded data is not right, try to specify the digital_rate option with
+other values than the default 32000 (often it's 44100 or 64000).
+
+If you have an unknown card, please mail the ID and board name to
+<alsa-devel@alsa-project.org>, regardless of whether audio capture works
+or not, so that future versions of this driver know about your card.
+
+
+Audio modes
+===========
+
+The chip knows two different modes (digital/analog). snd-bt87x
+registers two PCM devices, one for each mode. They cannot be used at
+the same time.
+
+
+Digital audio mode
+==================
+
+The first device (hw:X,0) gives you 16 bit stereo sound. The sample
+rate depends on the external source which feeds the Bt87x with digital
+sound via I2S interface.
+
+
+Analog audio mode (A/D)
+=======================
+
+The second device (hw:X,1) gives you 8 or 16 bit mono sound. Supported
+sample rates are between 119466 and 448000 Hz (yes, these numbers are
+that high). If you've set the CONFIG_SND_BT87X_OVERCLOCK option, the
+maximum sample rate is 1792000 Hz, but audio data becomes unusable
+beyond 896000 Hz on my card.
+
+The chip has three analog inputs. Consequently you'll get a mixer
+device to control these.
+
+
+Have fun,
+
+ Clemens
+
+
+Written by Clemens Ladisch <clemens@ladisch.de>
+big parts copied from btaudio.txt by Gerd Knorr <kraxel@bytesex.org>
diff --git a/Documentation/sound/alsa/CMIPCI.txt b/Documentation/sound/alsa/CMIPCI.txt
new file mode 100644
index 0000000..16935c8
--- /dev/null
+++ b/Documentation/sound/alsa/CMIPCI.txt
@@ -0,0 +1,254 @@
+ Brief Notes on C-Media 8338/8738/8768/8770 Driver
+ =================================================
+
+ Takashi Iwai <tiwai@suse.de>
+
+
+Front/Rear Multi-channel Playback
+---------------------------------
+
+CM8x38 chip can use ADC as the second DAC so that two different stereo
+channels can be used for front/rear playbacks. Since there are two
+DACs, both streams are handled independently unlike the 4/6ch multi-
+channel playbacks in the section below.
+
+As default, ALSA driver assigns the first PCM device (i.e. hw:0,0 for
+card#0) for front and 4/6ch playbacks, while the second PCM device
+(hw:0,1) is assigned to the second DAC for rear playback.
+
+There are slight differences between the two DACs:
+
+- The first DAC supports U8 and S16LE formats, while the second DAC
+ supports only S16LE.
+- The second DAC supports only two channel stereo.
+
+Please note that the CM8x38 DAC doesn't support continuous playback
+rate but only fixed rates: 5512, 8000, 11025, 16000, 22050, 32000,
+44100 and 48000 Hz.
+
+The rear output can be heard only when "Four Channel Mode" switch is
+disabled. Otherwise no signal will be routed to the rear speakers.
+As default it's turned on.
+
+*** WARNING ***
+When "Four Channel Mode" switch is off, the output from rear speakers
+will be FULL VOLUME regardless of Master and PCM volumes.
+This might damage your audio equipment. Please disconnect speakers
+before your turn off this switch.
+*** WARNING ***
+
+[ Well.. I once got the output with correct volume (i.e. same with the
+ front one) and was so excited. It was even with "Four Channel" bit
+ on and "double DAC" mode. Actually I could hear separate 4 channels
+ from front and rear speakers! But.. after reboot, all was gone.
+ It's a very pity that I didn't save the register dump at that
+ time.. Maybe there is an unknown register to achieve this... ]
+
+If your card has an extra output jack for the rear output, the rear
+playback should be routed there as default. If not, there is a
+control switch in the driver "Line-In As Rear", which you can change
+via alsamixer or somewhat else. When this switch is on, line-in jack
+is used as rear output.
+
+There are two more controls regarding to the rear output.
+The "Exchange DAC" switch is used to exchange front and rear playback
+routes, i.e. the 2nd DAC is output from front output.
+
+
+4/6 Multi-Channel Playback
+--------------------------
+
+The recent CM8738 chips support for the 4/6 multi-channel playback
+function. This is useful especially for AC3 decoding.
+
+When the multi-channel is supported, the driver name has a suffix
+"-MC" such like "CMI8738-MC6". You can check this name from
+/proc/asound/cards.
+
+When the 4/6-ch output is enabled, the second DAC accepts up to 6 (or
+4) channels. While the dual DAC supports two different rates or
+formats, the 4/6-ch playback supports only the same condition for all
+channels. Since the multi-channel playback mode uses both DACs, you
+cannot operate with full-duplex.
+
+The 4.0 and 5.1 modes are defined as the pcm "surround40" and "surround51"
+in alsa-lib. For example, you can play a WAV file with 6 channels like
+
+ % aplay -Dsurround51 sixchannels.wav
+
+For programming the 4/6 channel playback, you need to specify the PCM
+channels as you like and set the format S16LE. For example, for playback
+with 4 channels,
+
+ snd_pcm_hw_params_set_access(pcm, hw, SND_PCM_ACCESS_RW_INTERLEAVED);
+ // or mmap if you like
+ snd_pcm_hw_params_set_format(pcm, hw, SND_PCM_FORMAT_S16_LE);
+ snd_pcm_hw_params_set_channels(pcm, hw, 4);
+
+and use the interleaved 4 channel data.
+
+There are some control switchs affecting to the speaker connections:
+
+"Line-In Mode" - an enum control to change the behavior of line-in
+ jack. Either "Line-In", "Rear Output" or "Bass Output" can
+ be selected. The last item is available only with model 039
+ or newer.
+ When "Rear Output" is chosen, the surround channels 3 and 4
+ are output to line-in jack.
+"Mic-In Mode" - an enum control to change the behavior of mic-in
+ jack. Either "Mic-In" or "Center/LFE Output" can be
+ selected.
+ When "Center/LFE Output" is chosen, the center and bass
+ channels (channels 5 and 6) are output to mic-in jack.
+
+Digital I/O
+-----------
+
+The CM8x38 provides the excellent SPDIF capability with very cheap
+price (yes, that's the reason I bought the card :)
+
+The SPDIF playback and capture are done via the third PCM device
+(hw:0,2). Usually this is assigned to the PCM device "spdif".
+The available rates are 44100 and 48000 Hz.
+For playback with aplay, you can run like below:
+
+ % aplay -Dhw:0,2 foo.wav
+
+or
+
+ % aplay -Dspdif foo.wav
+
+24bit format is also supported experimentally.
+
+The playback and capture over SPDIF use normal DAC and ADC,
+respectively, so you cannot playback both analog and digital streams
+simultaneously.
+
+To enable SPDIF output, you need to turn on "IEC958 Output Switch"
+control via mixer or alsactl ("IEC958" is the official name of
+so-called S/PDIF). Then you'll see the red light on from the card so
+you know that's working obviously :)
+The SPDIF input is always enabled, so you can hear SPDIF input data
+from line-out with "IEC958 In Monitor" switch at any time (see
+below).
+
+You can play via SPDIF even with the first device (hw:0,0),
+but SPDIF is enabled only when the proper format (S16LE), sample rate
+(441100 or 48000) and channels (2) are used. Otherwise it's turned
+off. (Also don't forget to turn on "IEC958 Output Switch", too.)
+
+
+Additionally there are relevant control switches:
+
+"IEC958 Mix Analog" - Mix analog PCM playback and FM-OPL/3 streams and
+ output through SPDIF. This switch appears only on old chip
+ models (CM8738 033 and 037).
+ Note: without this control you can output PCM to SPDIF.
+ This is "mixing" of streams, so e.g. it's not for AC3 output
+ (see the next section).
+
+"IEC958 In Select" - Select SPDIF input, the internal CD-in (false)
+ and the external input (true).
+
+"IEC958 Loop" - SPDIF input data is loop back into SPDIF
+ output (aka bypass)
+
+"IEC958 Copyright" - Set the copyright bit.
+
+"IEC958 5V" - Select 0.5V (coax) or 5V (optical) interface.
+ On some cards this doesn't work and you need to change the
+ configuration with hardware dip-switch.
+
+"IEC958 In Monitor" - SPDIF input is routed to DAC.
+
+"IEC958 In Phase Inverse" - Set SPDIF input format as inverse.
+ [FIXME: this doesn't work on all chips..]
+
+"IEC958 In Valid" - Set input validity flag detection.
+
+Note: When "PCM Playback Switch" is on, you'll hear the digital output
+stream through analog line-out.
+
+
+The AC3 (RAW DIGITAL) OUTPUT
+----------------------------
+
+The driver supports raw digital (typically AC3) i/o over SPDIF. This
+can be toggled via IEC958 playback control, but usually you need to
+access it via alsa-lib. See alsa-lib documents for more details.
+
+On the raw digital mode, the "PCM Playback Switch" is automatically
+turned off so that non-audio data is heard from the analog line-out.
+Similarly the following switches are off: "IEC958 Mix Analog" and
+"IEC958 Loop". The switches are resumed after closing the SPDIF PCM
+device automatically to the previous state.
+
+On the model 033, AC3 is implemented by the software conversion in
+the alsa-lib. If you need to bypass the software conversion of IEC958
+subframes, pass the "soft_ac3=0" module option. This doesn't matter
+on the newer models.
+
+
+ANALOG MIXER INTERFACE
+----------------------
+
+The mixer interface on CM8x38 is similar to SB16.
+There are Master, PCM, Synth, CD, Line, Mic and PC Speaker playback
+volumes. Synth, CD, Line and Mic have playback and capture switches,
+too, as well as SB16.
+
+In addition to the standard SB mixer, CM8x38 provides more functions.
+- PCM playback switch
+- PCM capture switch (to capture the data sent to DAC)
+- Mic Boost switch
+- Mic capture volume
+- Aux playback volume/switch and capture switch
+- 3D control switch
+
+
+MIDI CONTROLLER
+---------------
+
+With CMI8338 chips, the MPU401-UART interface is disabled as default.
+You need to set the module option "mpu_port" to a valid I/O port address
+to enable MIDI support. Valid I/O ports are 0x300, 0x310, 0x320 and
+0x330. Choose a value that doesn't conflict with other cards.
+
+With CMI8738 and newer chips, the MIDI interface is enabled by default
+and the driver automatically chooses a port address.
+
+There is _no_ hardware wavetable function on this chip (except for
+OPL3 synth below).
+What's said as MIDI synth on Windows is a software synthesizer
+emulation. On Linux use TiMidity or other softsynth program for
+playing MIDI music.
+
+
+FM OPL/3 Synth
+--------------
+
+The FM OPL/3 is also enabled as default only for the first card.
+Set "fm_port" module option for more cards.
+
+The output quality of FM OPL/3 is, however, very weird.
+I don't know why..
+
+CMI8768 and newer chips do not have the FM synth.
+
+
+Joystick and Modem
+------------------
+
+The legacy joystick is supported. To enable the joystick support, pass
+joystick_port=1 module option. The value 1 means the auto-detection.
+If the auto-detection fails, try to pass the exact I/O address.
+
+The modem is enabled dynamically via a card control switch "Modem".
+
+
+Debugging Information
+---------------------
+
+The registers are shown in /proc/asound/cardX/cmipci. If you have any
+problem (especially unexpected behavior of mixer), please attach the
+output of this proc file together with the bug report.
diff --git a/Documentation/sound/alsa/ControlNames.txt b/Documentation/sound/alsa/ControlNames.txt
new file mode 100644
index 0000000..5b18298
--- /dev/null
+++ b/Documentation/sound/alsa/ControlNames.txt
@@ -0,0 +1,84 @@
+This document describes standard names of mixer controls.
+
+Syntax: SOURCE [DIRECTION] FUNCTION
+
+DIRECTION:
+ <nothing> (both directions)
+ Playback
+ Capture
+ Bypass Playback
+ Bypass Capture
+
+FUNCTION:
+ Switch (on/off switch)
+ Volume
+ Route (route control, hardware specific)
+
+SOURCE:
+ Master
+ Master Mono
+ Hardware Master
+ Headphone
+ PC Speaker
+ Phone
+ Phone Input
+ Phone Output
+ Synth
+ FM
+ Mic
+ Line
+ CD
+ Video
+ Zoom Video
+ Aux
+ PCM
+ PCM Front
+ PCM Rear
+ PCM Pan
+ Loopback
+ Analog Loopback (D/A -> A/D loopback)
+ Digital Loopback (playback -> capture loopback - without analog path)
+ Mono
+ Mono Output
+ Multi
+ ADC
+ Wave
+ Music
+ I2S
+ IEC958
+
+Exceptions:
+ [Digital] Capture Source
+ [Digital] Capture Switch (aka input gain switch)
+ [Digital] Capture Volume (aka input gain volume)
+ [Digital] Playback Switch (aka output gain switch)
+ [Digital] Playback Volume (aka output gain volume)
+ Tone Control - Switch
+ Tone Control - Bass
+ Tone Control - Treble
+ 3D Control - Switch
+ 3D Control - Center
+ 3D Control - Depth
+ 3D Control - Wide
+ 3D Control - Space
+ 3D Control - Level
+ Mic Boost [(?dB)]
+
+PCM interface:
+
+ Sample Clock Source { "Word", "Internal", "AutoSync" }
+ Clock Sync Status { "Lock", "Sync", "No Lock" }
+ External Rate /* external capture rate */
+ Capture Rate /* capture rate taken from external source */
+
+IEC958 (S/PDIF) interface:
+
+ IEC958 [...] [Playback|Capture] Switch /* turn on/off the IEC958 interface */
+ IEC958 [...] [Playback|Capture] Volume /* digital volume control */
+ IEC958 [...] [Playback|Capture] Default /* default or global value - read/write */
+ IEC958 [...] [Playback|Capture] Mask /* consumer and professional mask */
+ IEC958 [...] [Playback|Capture] Con Mask /* consumer mask */
+ IEC958 [...] [Playback|Capture] Pro Mask /* professional mask */
+ IEC958 [...] [Playback|Capture] PCM Stream /* the settings assigned to a PCM stream */
+ IEC958 Q-subcode [Playback|Capture] Default /* Q-subcode bits */
+ IEC958 Preamble [Playback|Capture] Default /* burst preamble words (4*16bits) */
diff --git a/Documentation/sound/alsa/DocBook/alsa-driver-api.tmpl b/Documentation/sound/alsa/DocBook/alsa-driver-api.tmpl
new file mode 100644
index 0000000..9d644f7
--- /dev/null
+++ b/Documentation/sound/alsa/DocBook/alsa-driver-api.tmpl
@@ -0,0 +1,100 @@
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.1//EN">
+
+<book>
+<?dbhtml filename="index.html">
+
+<!-- ****************************************************** -->
+<!-- Header -->
+<!-- ****************************************************** -->
+ <bookinfo>
+ <title>The ALSA Driver API</title>
+
+ <legalnotice>
+ <para>
+ This document is free; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ </para>
+
+ <para>
+ This document is distributed in the hope that it will be useful,
+ but <emphasis>WITHOUT ANY WARRANTY</emphasis>; without even the
+ implied warranty of <emphasis>MERCHANTABILITY or FITNESS FOR A
+ PARTICULAR PURPOSE</emphasis>. See the GNU General Public License
+ for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+ </legalnotice>
+
+ </bookinfo>
+
+ <chapter><title>Management of Cards and Devices</title>
+ <sect1><title>Card Management</title>
+!Esound/core/init.c
+ </sect1>
+ <sect1><title>Device Components</title>
+!Esound/core/device.c
+ </sect1>
+ <sect1><title>Module requests and Device File Entries</title>
+!Esound/core/sound.c
+ </sect1>
+ <sect1><title>Memory Management Helpers</title>
+!Esound/core/memory.c
+!Esound/core/memalloc.c
+ </sect1>
+ </chapter>
+ <chapter><title>PCM API</title>
+ <sect1><title>PCM Core</title>
+!Esound/core/pcm.c
+!Esound/core/pcm_lib.c
+!Esound/core/pcm_native.c
+ </sect1>
+ <sect1><title>PCM Format Helpers</title>
+!Esound/core/pcm_misc.c
+ </sect1>
+ <sect1><title>PCM Memory Management</title>
+!Esound/core/pcm_memory.c
+ </sect1>
+ </chapter>
+ <chapter><title>Control/Mixer API</title>
+ <sect1><title>General Control Interface</title>
+!Esound/core/control.c
+ </sect1>
+ <sect1><title>AC97 Codec API</title>
+!Esound/pci/ac97/ac97_codec.c
+!Esound/pci/ac97/ac97_pcm.c
+ </sect1>
+ </chapter>
+ <chapter><title>MIDI API</title>
+ <sect1><title>Raw MIDI API</title>
+!Esound/core/rawmidi.c
+ </sect1>
+ <sect1><title>MPU401-UART API</title>
+!Esound/drivers/mpu401/mpu401_uart.c
+ </sect1>
+ </chapter>
+ <chapter><title>Proc Info API</title>
+ <sect1><title>Proc Info Interface</title>
+!Esound/core/info.c
+ </sect1>
+ </chapter>
+ <chapter><title>Miscellaneous Functions</title>
+ <sect1><title>Hardware-Dependent Devices API</title>
+!Esound/core/hwdep.c
+ </sect1>
+ <sect1><title>ISA DMA Helpers</title>
+!Esound/core/isadma.c
+ </sect1>
+ <sect1><title>Other Helper Macros</title>
+!Iinclude/sound/core.h
+ </sect1>
+ </chapter>
+
+</book>
diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
new file mode 100644
index 0000000..87a7c07
--- /dev/null
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -0,0 +1,6210 @@
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.1//EN">
+
+<book>
+<?dbhtml filename="index.html">
+
+<!-- ****************************************************** -->
+<!-- Header -->
+<!-- ****************************************************** -->
+ <bookinfo>
+ <title>Writing an ALSA Driver</title>
+ <author>
+ <firstname>Takashi</firstname>
+ <surname>Iwai</surname>
+ <affiliation>
+ <address>
+ <email>tiwai@suse.de</email>
+ </address>
+ </affiliation>
+ </author>
+
+ <date>Oct 15, 2007</date>
+ <edition>0.3.7</edition>
+
+ <abstract>
+ <para>
+ This document describes how to write an ALSA (Advanced Linux
+ Sound Architecture) driver.
+ </para>
+ </abstract>
+
+ <legalnotice>
+ <para>
+ Copyright (c) 2002-2005 Takashi Iwai <email>tiwai@suse.de</email>
+ </para>
+
+ <para>
+ This document is free; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ </para>
+
+ <para>
+ This document is distributed in the hope that it will be useful,
+ but <emphasis>WITHOUT ANY WARRANTY</emphasis>; without even the
+ implied warranty of <emphasis>MERCHANTABILITY or FITNESS FOR A
+ PARTICULAR PURPOSE</emphasis>. See the GNU General Public License
+ for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+ </legalnotice>
+
+ </bookinfo>
+
+<!-- ****************************************************** -->
+<!-- Preface -->
+<!-- ****************************************************** -->
+ <preface id="preface">
+ <title>Preface</title>
+ <para>
+ This document describes how to write an
+ <ulink url="http://www.alsa-project.org/"><citetitle>
+ ALSA (Advanced Linux Sound Architecture)</citetitle></ulink>
+ driver. The document focuses mainly on PCI soundcards.
+ In the case of other device types, the API might
+ be different, too. However, at least the ALSA kernel API is
+ consistent, and therefore it would be still a bit help for
+ writing them.
+ </para>
+
+ <para>
+ This document targets people who already have enough
+ C language skills and have basic linux kernel programming
+ knowledge. This document doesn't explain the general
+ topic of linux kernel coding and doesn't cover low-level
+ driver implementation details. It only describes
+ the standard way to write a PCI sound driver on ALSA.
+ </para>
+
+ <para>
+ If you are already familiar with the older ALSA ver.0.5.x API, you
+ can check the drivers such as <filename>sound/pci/es1938.c</filename> or
+ <filename>sound/pci/maestro3.c</filename> which have also almost the same
+ code-base in the ALSA 0.5.x tree, so you can compare the differences.
+ </para>
+
+ <para>
+ This document is still a draft version. Any feedback and
+ corrections, please!!
+ </para>
+ </preface>
+
+
+<!-- ****************************************************** -->
+<!-- File Tree Structure -->
+<!-- ****************************************************** -->
+ <chapter id="file-tree">
+ <title>File Tree Structure</title>
+
+ <section id="file-tree-general">
+ <title>General</title>
+ <para>
+ The ALSA drivers are provided in two ways.
+ </para>
+
+ <para>
+ One is the trees provided as a tarball or via cvs from the
+ ALSA's ftp site, and another is the 2.6 (or later) Linux kernel
+ tree. To synchronize both, the ALSA driver tree is split into
+ two different trees: alsa-kernel and alsa-driver. The former
+ contains purely the source code for the Linux 2.6 (or later)
+ tree. This tree is designed only for compilation on 2.6 or
+ later environment. The latter, alsa-driver, contains many subtle
+ files for compiling ALSA drivers outside of the Linux kernel tree,
+ wrapper functions for older 2.2 and 2.4 kernels, to adapt the latest kernel API,
+ and additional drivers which are still in development or in
+ tests. The drivers in alsa-driver tree will be moved to
+ alsa-kernel (and eventually to the 2.6 kernel tree) when they are
+ finished and confirmed to work fine.
+ </para>
+
+ <para>
+ The file tree structure of ALSA driver is depicted below. Both
+ alsa-kernel and alsa-driver have almost the same file
+ structure, except for <quote>core</quote> directory. It's
+ named as <quote>acore</quote> in alsa-driver tree.
+
+ <example>
+ <title>ALSA File Tree Structure</title>
+ <literallayout>
+ sound
+ /core
+ /oss
+ /seq
+ /oss
+ /instr
+ /ioctl32
+ /include
+ /drivers
+ /mpu401
+ /opl3
+ /i2c
+ /l3
+ /synth
+ /emux
+ /pci
+ /(cards)
+ /isa
+ /(cards)
+ /arm
+ /ppc
+ /sparc
+ /usb
+ /pcmcia /(cards)
+ /oss
+ </literallayout>
+ </example>
+ </para>
+ </section>
+
+ <section id="file-tree-core-directory">
+ <title>core directory</title>
+ <para>
+ This directory contains the middle layer which is the heart
+ of ALSA drivers. In this directory, the native ALSA modules are
+ stored. The sub-directories contain different modules and are
+ dependent upon the kernel config.
+ </para>
+
+ <section id="file-tree-core-directory-oss">
+ <title>core/oss</title>
+
+ <para>
+ The codes for PCM and mixer OSS emulation modules are stored
+ in this directory. The rawmidi OSS emulation is included in
+ the ALSA rawmidi code since it's quite small. The sequencer
+ code is stored in <filename>core/seq/oss</filename> directory (see
+ <link linkend="file-tree-core-directory-seq-oss"><citetitle>
+ below</citetitle></link>).
+ </para>
+ </section>
+
+ <section id="file-tree-core-directory-ioctl32">
+ <title>core/ioctl32</title>
+
+ <para>
+ This directory contains the 32bit-ioctl wrappers for 64bit
+ architectures such like x86-64, ppc64 and sparc64. For 32bit
+ and alpha architectures, these are not compiled.
+ </para>
+ </section>
+
+ <section id="file-tree-core-directory-seq">
+ <title>core/seq</title>
+ <para>
+ This directory and its sub-directories are for the ALSA
+ sequencer. This directory contains the sequencer core and
+ primary sequencer modules such like snd-seq-midi,
+ snd-seq-virmidi, etc. They are compiled only when
+ <constant>CONFIG_SND_SEQUENCER</constant> is set in the kernel
+ config.
+ </para>
+ </section>
+
+ <section id="file-tree-core-directory-seq-oss">
+ <title>core/seq/oss</title>
+ <para>
+ This contains the OSS sequencer emulation codes.
+ </para>
+ </section>
+
+ <section id="file-tree-core-directory-deq-instr">
+ <title>core/seq/instr</title>
+ <para>
+ This directory contains the modules for the sequencer
+ instrument layer.
+ </para>
+ </section>
+ </section>
+
+ <section id="file-tree-include-directory">
+ <title>include directory</title>
+ <para>
+ This is the place for the public header files of ALSA drivers,
+ which are to be exported to user-space, or included by
+ several files at different directories. Basically, the private
+ header files should not be placed in this directory, but you may
+ still find files there, due to historical reasons :)
+ </para>
+ </section>
+
+ <section id="file-tree-drivers-directory">
+ <title>drivers directory</title>
+ <para>
+ This directory contains code shared among different drivers
+ on different architectures. They are hence supposed not to be
+ architecture-specific.
+ For example, the dummy pcm driver and the serial MIDI
+ driver are found in this directory. In the sub-directories,
+ there is code for components which are independent from
+ bus and cpu architectures.
+ </para>
+
+ <section id="file-tree-drivers-directory-mpu401">
+ <title>drivers/mpu401</title>
+ <para>
+ The MPU401 and MPU401-UART modules are stored here.
+ </para>
+ </section>
+
+ <section id="file-tree-drivers-directory-opl3">
+ <title>drivers/opl3 and opl4</title>
+ <para>
+ The OPL3 and OPL4 FM-synth stuff is found here.
+ </para>
+ </section>
+ </section>
+
+ <section id="file-tree-i2c-directory">
+ <title>i2c directory</title>
+ <para>
+ This contains the ALSA i2c components.
+ </para>
+
+ <para>
+ Although there is a standard i2c layer on Linux, ALSA has its
+ own i2c code for some cards, because the soundcard needs only a
+ simple operation and the standard i2c API is too complicated for
+ such a purpose.
+ </para>
+
+ <section id="file-tree-i2c-directory-l3">
+ <title>i2c/l3</title>
+ <para>
+ This is a sub-directory for ARM L3 i2c.
+ </para>
+ </section>
+ </section>
+
+ <section id="file-tree-synth-directory">
+ <title>synth directory</title>
+ <para>
+ This contains the synth middle-level modules.
+ </para>
+
+ <para>
+ So far, there is only Emu8000/Emu10k1 synth driver under
+ the <filename>synth/emux</filename> sub-directory.
+ </para>
+ </section>
+
+ <section id="file-tree-pci-directory">
+ <title>pci directory</title>
+ <para>
+ This directory and its sub-directories hold the top-level card modules
+ for PCI soundcards and the code specific to the PCI BUS.
+ </para>
+
+ <para>
+ The drivers compiled from a single file are stored directly
+ in the pci directory, while the drivers with several source files are
+ stored on their own sub-directory (e.g. emu10k1, ice1712).
+ </para>
+ </section>
+
+ <section id="file-tree-isa-directory">
+ <title>isa directory</title>
+ <para>
+ This directory and its sub-directories hold the top-level card modules
+ for ISA soundcards.
+ </para>
+ </section>
+
+ <section id="file-tree-arm-ppc-sparc-directories">
+ <title>arm, ppc, and sparc directories</title>
+ <para>
+ They are used for top-level card modules which are
+ specific to one of these architectures.
+ </para>
+ </section>
+
+ <section id="file-tree-usb-directory">
+ <title>usb directory</title>
+ <para>
+ This directory contains the USB-audio driver. In the latest version, the
+ USB MIDI driver is integrated in the usb-audio driver.
+ </para>
+ </section>
+
+ <section id="file-tree-pcmcia-directory">
+ <title>pcmcia directory</title>
+ <para>
+ The PCMCIA, especially PCCard drivers will go here. CardBus
+ drivers will be in the pci directory, because their API is identical
+ to that of standard PCI cards.
+ </para>
+ </section>
+
+ <section id="file-tree-oss-directory">
+ <title>oss directory</title>
+ <para>
+ The OSS/Lite source files are stored here in Linux 2.6 (or
+ later) tree. In the ALSA driver tarball, this directory is empty,
+ of course :)
+ </para>
+ </section>
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Basic Flow for PCI Drivers -->
+<!-- ****************************************************** -->
+ <chapter id="basic-flow">
+ <title>Basic Flow for PCI Drivers</title>
+
+ <section id="basic-flow-outline">
+ <title>Outline</title>
+ <para>
+ The minimum flow for PCI soundcards is as follows:
+
+ <itemizedlist>
+ <listitem><para>define the PCI ID table (see the section
+ <link linkend="pci-resource-entries"><citetitle>PCI Entries
+ </citetitle></link>).</para></listitem>
+ <listitem><para>create <function>probe()</function> callback.</para></listitem>
+ <listitem><para>create <function>remove()</function> callback.</para></listitem>
+ <listitem><para>create a <structname>pci_driver</structname> structure
+ containing the three pointers above.</para></listitem>
+ <listitem><para>create an <function>init()</function> function just calling
+ the <function>pci_register_driver()</function> to register the pci_driver table
+ defined above.</para></listitem>
+ <listitem><para>create an <function>exit()</function> function to call
+ the <function>pci_unregister_driver()</function> function.</para></listitem>
+ </itemizedlist>
+ </para>
+ </section>
+
+ <section id="basic-flow-example">
+ <title>Full Code Example</title>
+ <para>
+ The code example is shown below. Some parts are kept
+ unimplemented at this moment but will be filled in the
+ next sections. The numbers in the comment lines of the
+ <function>snd_mychip_probe()</function> function
+ refer to details explained in the following section.
+
+ <example>
+ <title>Basic Flow for PCI Drivers - Example</title>
+ <programlisting>
+<![CDATA[
+ #include <linux/init.h>
+ #include <linux/pci.h>
+ #include <linux/slab.h>
+ #include <sound/core.h>
+ #include <sound/initval.h>
+
+ /* module parameters (see "Module Parameters") */
+ /* SNDRV_CARDS: maximum number of cards supported by this module */
+ static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
+ static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
+ static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP;
+
+ /* definition of the chip-specific record */
+ struct mychip {
+ struct snd_card *card;
+ /* the rest of the implementation will be in section
+ * "PCI Resource Management"
+ */
+ };
+
+ /* chip-specific destructor
+ * (see "PCI Resource Management")
+ */
+ static int snd_mychip_free(struct mychip *chip)
+ {
+ .... /* will be implemented later... */
+ }
+
+ /* component-destructor
+ * (see "Management of Cards and Components")
+ */
+ static int snd_mychip_dev_free(struct snd_device *device)
+ {
+ return snd_mychip_free(device->device_data);
+ }
+
+ /* chip-specific constructor
+ * (see "Management of Cards and Components")
+ */
+ static int __devinit snd_mychip_create(struct snd_card *card,
+ struct pci_dev *pci,
+ struct mychip **rchip)
+ {
+ struct mychip *chip;
+ int err;
+ static struct snd_device_ops ops = {
+ .dev_free = snd_mychip_dev_free,
+ };
+
+ *rchip = NULL;
+
+ /* check PCI availability here
+ * (see "PCI Resource Management")
+ */
+ ....
+
+ /* allocate a chip-specific data with zero filled */
+ chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+ if (chip == NULL)
+ return -ENOMEM;
+
+ chip->card = card;
+
+ /* rest of initialization here; will be implemented
+ * later, see "PCI Resource Management"
+ */
+ ....
+
+ err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
+ if (err < 0) {
+ snd_mychip_free(chip);
+ return err;
+ }
+
+ snd_card_set_dev(card, &pci->dev);
+
+ *rchip = chip;
+ return 0;
+ }
+
+ /* constructor -- see "Constructor" sub-section */
+ static int __devinit snd_mychip_probe(struct pci_dev *pci,
+ const struct pci_device_id *pci_id)
+ {
+ static int dev;
+ struct snd_card *card;
+ struct mychip *chip;
+ int err;
+
+ /* (1) */
+ if (dev >= SNDRV_CARDS)
+ return -ENODEV;
+ if (!enable[dev]) {
+ dev++;
+ return -ENOENT;
+ }
+
+ /* (2) */
+ card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0);
+ if (card == NULL)
+ return -ENOMEM;
+
+ /* (3) */
+ err = snd_mychip_create(card, pci, &chip);
+ if (err < 0) {
+ snd_card_free(card);
+ return err;
+ }
+
+ /* (4) */
+ strcpy(card->driver, "My Chip");
+ strcpy(card->shortname, "My Own Chip 123");
+ sprintf(card->longname, "%s at 0x%lx irq %i",
+ card->shortname, chip->ioport, chip->irq);
+
+ /* (5) */
+ .... /* implemented later */
+
+ /* (6) */
+ err = snd_card_register(card);
+ if (err < 0) {
+ snd_card_free(card);
+ return err;
+ }
+
+ /* (7) */
+ pci_set_drvdata(pci, card);
+ dev++;
+ return 0;
+ }
+
+ /* destructor -- see the "Destructor" sub-section */
+ static void __devexit snd_mychip_remove(struct pci_dev *pci)
+ {
+ snd_card_free(pci_get_drvdata(pci));
+ pci_set_drvdata(pci, NULL);
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+ </section>
+
+ <section id="basic-flow-constructor">
+ <title>Constructor</title>
+ <para>
+ The real constructor of PCI drivers is the <function>probe</function> callback.
+ The <function>probe</function> callback and other component-constructors which are called
+ from the <function>probe</function> callback should be defined with
+ the <parameter>__devinit</parameter> prefix. You
+ cannot use the <parameter>__init</parameter> prefix for them,
+ because any PCI device could be a hotplug device.
+ </para>
+
+ <para>
+ In the <function>probe</function> callback, the following scheme is often used.
+ </para>
+
+ <section id="basic-flow-constructor-device-index">
+ <title>1) Check and increment the device index.</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int dev;
+ ....
+ if (dev >= SNDRV_CARDS)
+ return -ENODEV;
+ if (!enable[dev]) {
+ dev++;
+ return -ENOENT;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ where enable[dev] is the module option.
+ </para>
+
+ <para>
+ Each time the <function>probe</function> callback is called, check the
+ availability of the device. If not available, simply increment
+ the device index and returns. dev will be incremented also
+ later (<link
+ linkend="basic-flow-constructor-set-pci"><citetitle>step
+ 7</citetitle></link>).
+ </para>
+ </section>
+
+ <section id="basic-flow-constructor-create-card">
+ <title>2) Create a card instance</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_card *card;
+ ....
+ card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The details will be explained in the section
+ <link linkend="card-management-card-instance"><citetitle>
+ Management of Cards and Components</citetitle></link>.
+ </para>
+ </section>
+
+ <section id="basic-flow-constructor-create-main">
+ <title>3) Create a main component</title>
+ <para>
+ In this part, the PCI resources are allocated.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct mychip *chip;
+ ....
+ err = snd_mychip_create(card, pci, &chip);
+ if (err < 0) {
+ snd_card_free(card);
+ return err;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ The details will be explained in the section <link
+ linkend="pci-resource"><citetitle>PCI Resource
+ Management</citetitle></link>.
+ </para>
+ </section>
+
+ <section id="basic-flow-constructor-main-component">
+ <title>4) Set the driver ID and name strings.</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ strcpy(card->driver, "My Chip");
+ strcpy(card->shortname, "My Own Chip 123");
+ sprintf(card->longname, "%s at 0x%lx irq %i",
+ card->shortname, chip->ioport, chip->irq);
+]]>
+ </programlisting>
+ </informalexample>
+
+ The driver field holds the minimal ID string of the
+ chip. This is used by alsa-lib's configurator, so keep it
+ simple but unique.
+ Even the same driver can have different driver IDs to
+ distinguish the functionality of each chip type.
+ </para>
+
+ <para>
+ The shortname field is a string shown as more verbose
+ name. The longname field contains the information
+ shown in <filename>/proc/asound/cards</filename>.
+ </para>
+ </section>
+
+ <section id="basic-flow-constructor-create-other">
+ <title>5) Create other components, such as mixer, MIDI, etc.</title>
+ <para>
+ Here you define the basic components such as
+ <link linkend="pcm-interface"><citetitle>PCM</citetitle></link>,
+ mixer (e.g. <link linkend="api-ac97"><citetitle>AC97</citetitle></link>),
+ MIDI (e.g. <link linkend="midi-interface"><citetitle>MPU-401</citetitle></link>),
+ and other interfaces.
+ Also, if you want a <link linkend="proc-interface"><citetitle>proc
+ file</citetitle></link>, define it here, too.
+ </para>
+ </section>
+
+ <section id="basic-flow-constructor-register-card">
+ <title>6) Register the card instance.</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ err = snd_card_register(card);
+ if (err < 0) {
+ snd_card_free(card);
+ return err;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Will be explained in the section <link
+ linkend="card-management-registration"><citetitle>Management
+ of Cards and Components</citetitle></link>, too.
+ </para>
+ </section>
+
+ <section id="basic-flow-constructor-set-pci">
+ <title>7) Set the PCI driver data and return zero.</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ pci_set_drvdata(pci, card);
+ dev++;
+ return 0;
+]]>
+ </programlisting>
+ </informalexample>
+
+ In the above, the card record is stored. This pointer is
+ used in the remove callback and power-management
+ callbacks, too.
+ </para>
+ </section>
+ </section>
+
+ <section id="basic-flow-destructor">
+ <title>Destructor</title>
+ <para>
+ The destructor, remove callback, simply releases the card
+ instance. Then the ALSA middle layer will release all the
+ attached components automatically.
+ </para>
+
+ <para>
+ It would be typically like the following:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static void __devexit snd_mychip_remove(struct pci_dev *pci)
+ {
+ snd_card_free(pci_get_drvdata(pci));
+ pci_set_drvdata(pci, NULL);
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ The above code assumes that the card pointer is set to the PCI
+ driver data.
+ </para>
+ </section>
+
+ <section id="basic-flow-header-files">
+ <title>Header Files</title>
+ <para>
+ For the above example, at least the following include files
+ are necessary.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ #include <linux/init.h>
+ #include <linux/pci.h>
+ #include <linux/slab.h>
+ #include <sound/core.h>
+ #include <sound/initval.h>
+]]>
+ </programlisting>
+ </informalexample>
+
+ where the last one is necessary only when module options are
+ defined in the source file. If the code is split into several
+ files, the files without module options don't need them.
+ </para>
+
+ <para>
+ In addition to these headers, you'll need
+ <filename>&lt;linux/interrupt.h&gt;</filename> for interrupt
+ handling, and <filename>&lt;asm/io.h&gt;</filename> for I/O
+ access. If you use the <function>mdelay()</function> or
+ <function>udelay()</function> functions, you'll need to include
+ <filename>&lt;linux/delay.h&gt;</filename> too.
+ </para>
+
+ <para>
+ The ALSA interfaces like the PCM and control APIs are defined in other
+ <filename>&lt;sound/xxx.h&gt;</filename> header files.
+ They have to be included after
+ <filename>&lt;sound/core.h&gt;</filename>.
+ </para>
+
+ </section>
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Management of Cards and Components -->
+<!-- ****************************************************** -->
+ <chapter id="card-management">
+ <title>Management of Cards and Components</title>
+
+ <section id="card-management-card-instance">
+ <title>Card Instance</title>
+ <para>
+ For each soundcard, a <quote>card</quote> record must be allocated.
+ </para>
+
+ <para>
+ A card record is the headquarters of the soundcard. It manages
+ the whole list of devices (components) on the soundcard, such as
+ PCM, mixers, MIDI, synthesizer, and so on. Also, the card
+ record holds the ID and the name strings of the card, manages
+ the root of proc files, and controls the power-management states
+ and hotplug disconnections. The component list on the card
+ record is used to manage the correct release of resources at
+ destruction.
+ </para>
+
+ <para>
+ As mentioned above, to create a card instance, call
+ <function>snd_card_new()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_card *card;
+ card = snd_card_new(index, id, module, extra_size);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The function takes four arguments, the card-index number, the
+ id string, the module pointer (usually
+ <constant>THIS_MODULE</constant>),
+ and the size of extra-data space. The last argument is used to
+ allocate card-&gt;private_data for the
+ chip-specific data. Note that these data
+ are allocated by <function>snd_card_new()</function>.
+ </para>
+ </section>
+
+ <section id="card-management-component">
+ <title>Components</title>
+ <para>
+ After the card is created, you can attach the components
+ (devices) to the card instance. In an ALSA driver, a component is
+ represented as a struct <structname>snd_device</structname> object.
+ A component can be a PCM instance, a control interface, a raw
+ MIDI interface, etc. Each such instance has one component
+ entry.
+ </para>
+
+ <para>
+ A component can be created via
+ <function>snd_device_new()</function> function.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_device_new(card, SNDRV_DEV_XXX, chip, &ops);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ This takes the card pointer, the device-level
+ (<constant>SNDRV_DEV_XXX</constant>), the data pointer, and the
+ callback pointers (<parameter>&amp;ops</parameter>). The
+ device-level defines the type of components and the order of
+ registration and de-registration. For most components, the
+ device-level is already defined. For a user-defined component,
+ you can use <constant>SNDRV_DEV_LOWLEVEL</constant>.
+ </para>
+
+ <para>
+ This function itself doesn't allocate the data space. The data
+ must be allocated manually beforehand, and its pointer is passed
+ as the argument. This pointer is used as the
+ (<parameter>chip</parameter> identifier in the above example)
+ for the instance.
+ </para>
+
+ <para>
+ Each pre-defined ALSA component such as ac97 and pcm calls
+ <function>snd_device_new()</function> inside its
+ constructor. The destructor for each component is defined in the
+ callback pointers. Hence, you don't need to take care of
+ calling a destructor for such a component.
+ </para>
+
+ <para>
+ If you wish to create your own component, you need to
+ set the destructor function to the dev_free callback in
+ the <parameter>ops</parameter>, so that it can be released
+ automatically via <function>snd_card_free()</function>.
+ The next example will show an implementation of chip-specific
+ data.
+ </para>
+ </section>
+
+ <section id="card-management-chip-specific">
+ <title>Chip-Specific Data</title>
+ <para>
+ Chip-specific information, e.g. the I/O port address, its
+ resource pointer, or the irq number, is stored in the
+ chip-specific record.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct mychip {
+ ....
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ In general, there are two ways of allocating the chip record.
+ </para>
+
+ <section id="card-management-chip-specific-snd-card-new">
+ <title>1. Allocating via <function>snd_card_new()</function>.</title>
+ <para>
+ As mentioned above, you can pass the extra-data-length
+ to the 4th argument of <function>snd_card_new()</function>, i.e.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ card = snd_card_new(index[dev], id[dev], THIS_MODULE, sizeof(struct mychip));
+]]>
+ </programlisting>
+ </informalexample>
+
+ struct <structname>mychip</structname> is the type of the chip record.
+ </para>
+
+ <para>
+ In return, the allocated record can be accessed as
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct mychip *chip = card->private_data;
+]]>
+ </programlisting>
+ </informalexample>
+
+ With this method, you don't have to allocate twice.
+ The record is released together with the card instance.
+ </para>
+ </section>
+
+ <section id="card-management-chip-specific-allocate-extra">
+ <title>2. Allocating an extra device.</title>
+
+ <para>
+ After allocating a card instance via
+ <function>snd_card_new()</function> (with
+ <constant>NULL</constant> on the 4th arg), call
+ <function>kzalloc()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_card *card;
+ struct mychip *chip;
+ card = snd_card_new(index[dev], id[dev], THIS_MODULE, NULL);
+ .....
+ chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The chip record should have the field to hold the card
+ pointer at least,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct mychip {
+ struct snd_card *card;
+ ....
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Then, set the card pointer in the returned chip instance.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ chip->card = card;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Next, initialize the fields, and register this chip
+ record as a low-level device with a specified
+ <parameter>ops</parameter>,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static struct snd_device_ops ops = {
+ .dev_free = snd_mychip_dev_free,
+ };
+ ....
+ snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
+]]>
+ </programlisting>
+ </informalexample>
+
+ <function>snd_mychip_dev_free()</function> is the
+ device-destructor function, which will call the real
+ destructor.
+ </para>
+
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_mychip_dev_free(struct snd_device *device)
+ {
+ return snd_mychip_free(device->device_data);
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ where <function>snd_mychip_free()</function> is the real destructor.
+ </para>
+ </section>
+ </section>
+
+ <section id="card-management-registration">
+ <title>Registration and Release</title>
+ <para>
+ After all components are assigned, register the card instance
+ by calling <function>snd_card_register()</function>. Access
+ to the device files is enabled at this point. That is, before
+ <function>snd_card_register()</function> is called, the
+ components are safely inaccessible from external side. If this
+ call fails, exit the probe function after releasing the card via
+ <function>snd_card_free()</function>.
+ </para>
+
+ <para>
+ For releasing the card instance, you can call simply
+ <function>snd_card_free()</function>. As mentioned earlier, all
+ components are released automatically by this call.
+ </para>
+
+ <para>
+ As further notes, the destructors (both
+ <function>snd_mychip_dev_free</function> and
+ <function>snd_mychip_free</function>) cannot be defined with
+ the <parameter>__devexit</parameter> prefix, because they may be
+ called from the constructor, too, at the false path.
+ </para>
+
+ <para>
+ For a device which allows hotplugging, you can use
+ <function>snd_card_free_when_closed</function>. This one will
+ postpone the destruction until all devices are closed.
+ </para>
+
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- PCI Resource Management -->
+<!-- ****************************************************** -->
+ <chapter id="pci-resource">
+ <title>PCI Resource Management</title>
+
+ <section id="pci-resource-example">
+ <title>Full Code Example</title>
+ <para>
+ In this section, we'll complete the chip-specific constructor,
+ destructor and PCI entries. Example code is shown first,
+ below.
+
+ <example>
+ <title>PCI Resource Management Example</title>
+ <programlisting>
+<![CDATA[
+ struct mychip {
+ struct snd_card *card;
+ struct pci_dev *pci;
+
+ unsigned long port;
+ int irq;
+ };
+
+ static int snd_mychip_free(struct mychip *chip)
+ {
+ /* disable hardware here if any */
+ .... /* (not implemented in this document) */
+
+ /* release the irq */
+ if (chip->irq >= 0)
+ free_irq(chip->irq, chip);
+ /* release the I/O ports & memory */
+ pci_release_regions(chip->pci);
+ /* disable the PCI entry */
+ pci_disable_device(chip->pci);
+ /* release the data */
+ kfree(chip);
+ return 0;
+ }
+
+ /* chip-specific constructor */
+ static int __devinit snd_mychip_create(struct snd_card *card,
+ struct pci_dev *pci,
+ struct mychip **rchip)
+ {
+ struct mychip *chip;
+ int err;
+ static struct snd_device_ops ops = {
+ .dev_free = snd_mychip_dev_free,
+ };
+
+ *rchip = NULL;
+
+ /* initialize the PCI entry */
+ err = pci_enable_device(pci);
+ if (err < 0)
+ return err;
+ /* check PCI availability (28bit DMA) */
+ if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
+ pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
+ printk(KERN_ERR "error to set 28bit mask DMA\n");
+ pci_disable_device(pci);
+ return -ENXIO;
+ }
+
+ chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+ if (chip == NULL) {
+ pci_disable_device(pci);
+ return -ENOMEM;
+ }
+
+ /* initialize the stuff */
+ chip->card = card;
+ chip->pci = pci;
+ chip->irq = -1;
+
+ /* (1) PCI resource allocation */
+ err = pci_request_regions(pci, "My Chip");
+ if (err < 0) {
+ kfree(chip);
+ pci_disable_device(pci);
+ return err;
+ }
+ chip->port = pci_resource_start(pci, 0);
+ if (request_irq(pci->irq, snd_mychip_interrupt,
+ IRQF_SHARED, "My Chip", chip)) {
+ printk(KERN_ERR "cannot grab irq %d\n", pci->irq);
+ snd_mychip_free(chip);
+ return -EBUSY;
+ }
+ chip->irq = pci->irq;
+
+ /* (2) initialization of the chip hardware */
+ .... /* (not implemented in this document) */
+
+ err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
+ if (err < 0) {
+ snd_mychip_free(chip);
+ return err;
+ }
+
+ snd_card_set_dev(card, &pci->dev);
+
+ *rchip = chip;
+ return 0;
+ }
+
+ /* PCI IDs */
+ static struct pci_device_id snd_mychip_ids[] = {
+ { PCI_VENDOR_ID_FOO, PCI_DEVICE_ID_BAR,
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, },
+ ....
+ { 0, }
+ };
+ MODULE_DEVICE_TABLE(pci, snd_mychip_ids);
+
+ /* pci_driver definition */
+ static struct pci_driver driver = {
+ .name = "My Own Chip",
+ .id_table = snd_mychip_ids,
+ .probe = snd_mychip_probe,
+ .remove = __devexit_p(snd_mychip_remove),
+ };
+
+ /* module initialization */
+ static int __init alsa_card_mychip_init(void)
+ {
+ return pci_register_driver(&driver);
+ }
+
+ /* module clean up */
+ static void __exit alsa_card_mychip_exit(void)
+ {
+ pci_unregister_driver(&driver);
+ }
+
+ module_init(alsa_card_mychip_init)
+ module_exit(alsa_card_mychip_exit)
+
+ EXPORT_NO_SYMBOLS; /* for old kernels only */
+]]>
+ </programlisting>
+ </example>
+ </para>
+ </section>
+
+ <section id="pci-resource-some-haftas">
+ <title>Some Hafta's</title>
+ <para>
+ The allocation of PCI resources is done in the
+ <function>probe()</function> function, and usually an extra
+ <function>xxx_create()</function> function is written for this
+ purpose.
+ </para>
+
+ <para>
+ In the case of PCI devices, you first have to call
+ the <function>pci_enable_device()</function> function before
+ allocating resources. Also, you need to set the proper PCI DMA
+ mask to limit the accessed I/O range. In some cases, you might
+ need to call <function>pci_set_master()</function> function,
+ too.
+ </para>
+
+ <para>
+ Suppose the 28bit mask, and the code to be added would be like:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ err = pci_enable_device(pci);
+ if (err < 0)
+ return err;
+ if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
+ pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
+ printk(KERN_ERR "error to set 28bit mask DMA\n");
+ pci_disable_device(pci);
+ return -ENXIO;
+ }
+
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </section>
+
+ <section id="pci-resource-resource-allocation">
+ <title>Resource Allocation</title>
+ <para>
+ The allocation of I/O ports and irqs is done via standard kernel
+ functions. Unlike ALSA ver.0.5.x., there are no helpers for
+ that. And these resources must be released in the destructor
+ function (see below). Also, on ALSA 0.9.x, you don't need to
+ allocate (pseudo-)DMA for PCI like in ALSA 0.5.x.
+ </para>
+
+ <para>
+ Now assume that the PCI device has an I/O port with 8 bytes
+ and an interrupt. Then struct <structname>mychip</structname> will have the
+ following fields:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct mychip {
+ struct snd_card *card;
+
+ unsigned long port;
+ int irq;
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ For an I/O port (and also a memory region), you need to have
+ the resource pointer for the standard resource management. For
+ an irq, you have to keep only the irq number (integer). But you
+ need to initialize this number as -1 before actual allocation,
+ since irq 0 is valid. The port address and its resource pointer
+ can be initialized as null by
+ <function>kzalloc()</function> automatically, so you
+ don't have to take care of resetting them.
+ </para>
+
+ <para>
+ The allocation of an I/O port is done like this:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ err = pci_request_regions(pci, "My Chip");
+ if (err < 0) {
+ kfree(chip);
+ pci_disable_device(pci);
+ return err;
+ }
+ chip->port = pci_resource_start(pci, 0);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ <!-- obsolete -->
+ It will reserve the I/O port region of 8 bytes of the given
+ PCI device. The returned value, chip-&gt;res_port, is allocated
+ via <function>kmalloc()</function> by
+ <function>request_region()</function>. The pointer must be
+ released via <function>kfree()</function>, but there is a
+ problem with this. This issue will be explained later.
+ </para>
+
+ <para>
+ The allocation of an interrupt source is done like this:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ if (request_irq(pci->irq, snd_mychip_interrupt,
+ IRQF_SHARED, "My Chip", chip)) {
+ printk(KERN_ERR "cannot grab irq %d\n", pci->irq);
+ snd_mychip_free(chip);
+ return -EBUSY;
+ }
+ chip->irq = pci->irq;
+]]>
+ </programlisting>
+ </informalexample>
+
+ where <function>snd_mychip_interrupt()</function> is the
+ interrupt handler defined <link
+ linkend="pcm-interface-interrupt-handler"><citetitle>later</citetitle></link>.
+ Note that chip-&gt;irq should be defined
+ only when <function>request_irq()</function> succeeded.
+ </para>
+
+ <para>
+ On the PCI bus, interrupts can be shared. Thus,
+ <constant>IRQF_SHARED</constant> is used as the interrupt flag of
+ <function>request_irq()</function>.
+ </para>
+
+ <para>
+ The last argument of <function>request_irq()</function> is the
+ data pointer passed to the interrupt handler. Usually, the
+ chip-specific record is used for that, but you can use what you
+ like, too.
+ </para>
+
+ <para>
+ I won't give details about the interrupt handler at this
+ point, but at least its appearance can be explained now. The
+ interrupt handler looks usually like the following:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static irqreturn_t snd_mychip_interrupt(int irq, void *dev_id)
+ {
+ struct mychip *chip = dev_id;
+ ....
+ return IRQ_HANDLED;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Now let's write the corresponding destructor for the resources
+ above. The role of destructor is simple: disable the hardware
+ (if already activated) and release the resources. So far, we
+ have no hardware part, so the disabling code is not written here.
+ </para>
+
+ <para>
+ To release the resources, the <quote>check-and-release</quote>
+ method is a safer way. For the interrupt, do like this:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ if (chip->irq >= 0)
+ free_irq(chip->irq, chip);
+]]>
+ </programlisting>
+ </informalexample>
+
+ Since the irq number can start from 0, you should initialize
+ chip-&gt;irq with a negative value (e.g. -1), so that you can
+ check the validity of the irq number as above.
+ </para>
+
+ <para>
+ When you requested I/O ports or memory regions via
+ <function>pci_request_region()</function> or
+ <function>pci_request_regions()</function> like in this example,
+ release the resource(s) using the corresponding function,
+ <function>pci_release_region()</function> or
+ <function>pci_release_regions()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ pci_release_regions(chip->pci);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ When you requested manually via <function>request_region()</function>
+ or <function>request_mem_region</function>, you can release it via
+ <function>release_resource()</function>. Suppose that you keep
+ the resource pointer returned from <function>request_region()</function>
+ in chip-&gt;res_port, the release procedure looks like:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ release_and_free_resource(chip->res_port);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Don't forget to call <function>pci_disable_device()</function>
+ before the end.
+ </para>
+
+ <para>
+ And finally, release the chip-specific record.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ kfree(chip);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Again, remember that you cannot
+ use the <parameter>__devexit</parameter> prefix for this destructor.
+ </para>
+
+ <para>
+ We didn't implement the hardware disabling part in the above.
+ If you need to do this, please note that the destructor may be
+ called even before the initialization of the chip is completed.
+ It would be better to have a flag to skip hardware disabling
+ if the hardware was not initialized yet.
+ </para>
+
+ <para>
+ When the chip-data is assigned to the card using
+ <function>snd_device_new()</function> with
+ <constant>SNDRV_DEV_LOWLELVEL</constant> , its destructor is
+ called at the last. That is, it is assured that all other
+ components like PCMs and controls have already been released.
+ You don't have to stop PCMs, etc. explicitly, but just
+ call low-level hardware stopping.
+ </para>
+
+ <para>
+ The management of a memory-mapped region is almost as same as
+ the management of an I/O port. You'll need three fields like
+ the following:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct mychip {
+ ....
+ unsigned long iobase_phys;
+ void __iomem *iobase_virt;
+ };
+]]>
+ </programlisting>
+ </informalexample>
+
+ and the allocation would be like below:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ if ((err = pci_request_regions(pci, "My Chip")) < 0) {
+ kfree(chip);
+ return err;
+ }
+ chip->iobase_phys = pci_resource_start(pci, 0);
+ chip->iobase_virt = ioremap_nocache(chip->iobase_phys,
+ pci_resource_len(pci, 0));
+]]>
+ </programlisting>
+ </informalexample>
+
+ and the corresponding destructor would be:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_mychip_free(struct mychip *chip)
+ {
+ ....
+ if (chip->iobase_virt)
+ iounmap(chip->iobase_virt);
+ ....
+ pci_release_regions(chip->pci);
+ ....
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ </section>
+
+ <section id="pci-resource-device-struct">
+ <title>Registration of Device Struct</title>
+ <para>
+ At some point, typically after calling <function>snd_device_new()</function>,
+ you need to register the struct <structname>device</structname> of the chip
+ you're handling for udev and co. ALSA provides a macro for compatibility with
+ older kernels. Simply call like the following:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_card_set_dev(card, &pci->dev);
+]]>
+ </programlisting>
+ </informalexample>
+ so that it stores the PCI's device pointer to the card. This will be
+ referred by ALSA core functions later when the devices are registered.
+ </para>
+ <para>
+ In the case of non-PCI, pass the proper device struct pointer of the BUS
+ instead. (In the case of legacy ISA without PnP, you don't have to do
+ anything.)
+ </para>
+ </section>
+
+ <section id="pci-resource-entries">
+ <title>PCI Entries</title>
+ <para>
+ So far, so good. Let's finish the missing PCI
+ stuff. At first, we need a
+ <structname>pci_device_id</structname> table for this
+ chipset. It's a table of PCI vendor/device ID number, and some
+ masks.
+ </para>
+
+ <para>
+ For example,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static struct pci_device_id snd_mychip_ids[] = {
+ { PCI_VENDOR_ID_FOO, PCI_DEVICE_ID_BAR,
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, },
+ ....
+ { 0, }
+ };
+ MODULE_DEVICE_TABLE(pci, snd_mychip_ids);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The first and second fields of
+ the <structname>pci_device_id</structname> structure are the vendor and
+ device IDs. If you have no reason to filter the matching
+ devices, you can leave the remaining fields as above. The last
+ field of the <structname>pci_device_id</structname> struct contains
+ private data for this entry. You can specify any value here, for
+ example, to define specific operations for supported device IDs.
+ Such an example is found in the intel8x0 driver.
+ </para>
+
+ <para>
+ The last entry of this list is the terminator. You must
+ specify this all-zero entry.
+ </para>
+
+ <para>
+ Then, prepare the <structname>pci_driver</structname> record:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static struct pci_driver driver = {
+ .name = "My Own Chip",
+ .id_table = snd_mychip_ids,
+ .probe = snd_mychip_probe,
+ .remove = __devexit_p(snd_mychip_remove),
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The <structfield>probe</structfield> and
+ <structfield>remove</structfield> functions have already
+ been defined in the previous sections.
+ The <structfield>remove</structfield> function should
+ be defined with the
+ <function>__devexit_p()</function> macro, so that it's not
+ defined for built-in (and non-hot-pluggable) case. The
+ <structfield>name</structfield>
+ field is the name string of this device. Note that you must not
+ use a slash <quote>/</quote> in this string.
+ </para>
+
+ <para>
+ And at last, the module entries:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int __init alsa_card_mychip_init(void)
+ {
+ return pci_register_driver(&driver);
+ }
+
+ static void __exit alsa_card_mychip_exit(void)
+ {
+ pci_unregister_driver(&driver);
+ }
+
+ module_init(alsa_card_mychip_init)
+ module_exit(alsa_card_mychip_exit)
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Note that these module entries are tagged with
+ <parameter>__init</parameter> and
+ <parameter>__exit</parameter> prefixes, not
+ <parameter>__devinit</parameter> nor
+ <parameter>__devexit</parameter>.
+ </para>
+
+ <para>
+ Oh, one thing was forgotten. If you have no exported symbols,
+ you need to declare it in 2.2 or 2.4 kernels (it's not necessary in 2.6 kernels).
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ EXPORT_NO_SYMBOLS;
+]]>
+ </programlisting>
+ </informalexample>
+
+ That's all!
+ </para>
+ </section>
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- PCM Interface -->
+<!-- ****************************************************** -->
+ <chapter id="pcm-interface">
+ <title>PCM Interface</title>
+
+ <section id="pcm-interface-general">
+ <title>General</title>
+ <para>
+ The PCM middle layer of ALSA is quite powerful and it is only
+ necessary for each driver to implement the low-level functions
+ to access its hardware.
+ </para>
+
+ <para>
+ For accessing to the PCM layer, you need to include
+ <filename>&lt;sound/pcm.h&gt;</filename> first. In addition,
+ <filename>&lt;sound/pcm_params.h&gt;</filename> might be needed
+ if you access to some functions related with hw_param.
+ </para>
+
+ <para>
+ Each card device can have up to four pcm instances. A pcm
+ instance corresponds to a pcm device file. The limitation of
+ number of instances comes only from the available bit size of
+ the Linux's device numbers. Once when 64bit device number is
+ used, we'll have more pcm instances available.
+ </para>
+
+ <para>
+ A pcm instance consists of pcm playback and capture streams,
+ and each pcm stream consists of one or more pcm substreams. Some
+ soundcards support multiple playback functions. For example,
+ emu10k1 has a PCM playback of 32 stereo substreams. In this case, at
+ each open, a free substream is (usually) automatically chosen
+ and opened. Meanwhile, when only one substream exists and it was
+ already opened, the successful open will either block
+ or error with <constant>EAGAIN</constant> according to the
+ file open mode. But you don't have to care about such details in your
+ driver. The PCM middle layer will take care of such work.
+ </para>
+ </section>
+
+ <section id="pcm-interface-example">
+ <title>Full Code Example</title>
+ <para>
+ The example code below does not include any hardware access
+ routines but shows only the skeleton, how to build up the PCM
+ interfaces.
+
+ <example>
+ <title>PCM Example Code</title>
+ <programlisting>
+<![CDATA[
+ #include <sound/pcm.h>
+ ....
+
+ /* hardware definition */
+ static struct snd_pcm_hardware snd_mychip_playback_hw = {
+ .info = (SNDRV_PCM_INFO_MMAP |
+ SNDRV_PCM_INFO_INTERLEAVED |
+ SNDRV_PCM_INFO_BLOCK_TRANSFER |
+ SNDRV_PCM_INFO_MMAP_VALID),
+ .formats = SNDRV_PCM_FMTBIT_S16_LE,
+ .rates = SNDRV_PCM_RATE_8000_48000,
+ .rate_min = 8000,
+ .rate_max = 48000,
+ .channels_min = 2,
+ .channels_max = 2,
+ .buffer_bytes_max = 32768,
+ .period_bytes_min = 4096,
+ .period_bytes_max = 32768,
+ .periods_min = 1,
+ .periods_max = 1024,
+ };
+
+ /* hardware definition */
+ static struct snd_pcm_hardware snd_mychip_capture_hw = {
+ .info = (SNDRV_PCM_INFO_MMAP |
+ SNDRV_PCM_INFO_INTERLEAVED |
+ SNDRV_PCM_INFO_BLOCK_TRANSFER |
+ SNDRV_PCM_INFO_MMAP_VALID),
+ .formats = SNDRV_PCM_FMTBIT_S16_LE,
+ .rates = SNDRV_PCM_RATE_8000_48000,
+ .rate_min = 8000,
+ .rate_max = 48000,
+ .channels_min = 2,
+ .channels_max = 2,
+ .buffer_bytes_max = 32768,
+ .period_bytes_min = 4096,
+ .period_bytes_max = 32768,
+ .periods_min = 1,
+ .periods_max = 1024,
+ };
+
+ /* open callback */
+ static int snd_mychip_playback_open(struct snd_pcm_substream *substream)
+ {
+ struct mychip *chip = snd_pcm_substream_chip(substream);
+ struct snd_pcm_runtime *runtime = substream->runtime;
+
+ runtime->hw = snd_mychip_playback_hw;
+ /* more hardware-initialization will be done here */
+ ....
+ return 0;
+ }
+
+ /* close callback */
+ static int snd_mychip_playback_close(struct snd_pcm_substream *substream)
+ {
+ struct mychip *chip = snd_pcm_substream_chip(substream);
+ /* the hardware-specific codes will be here */
+ ....
+ return 0;
+
+ }
+
+ /* open callback */
+ static int snd_mychip_capture_open(struct snd_pcm_substream *substream)
+ {
+ struct mychip *chip = snd_pcm_substream_chip(substream);
+ struct snd_pcm_runtime *runtime = substream->runtime;
+
+ runtime->hw = snd_mychip_capture_hw;
+ /* more hardware-initialization will be done here */
+ ....
+ return 0;
+ }
+
+ /* close callback */
+ static int snd_mychip_capture_close(struct snd_pcm_substream *substream)
+ {
+ struct mychip *chip = snd_pcm_substream_chip(substream);
+ /* the hardware-specific codes will be here */
+ ....
+ return 0;
+
+ }
+
+ /* hw_params callback */
+ static int snd_mychip_pcm_hw_params(struct snd_pcm_substream *substream,
+ struct snd_pcm_hw_params *hw_params)
+ {
+ return snd_pcm_lib_malloc_pages(substream,
+ params_buffer_bytes(hw_params));
+ }
+
+ /* hw_free callback */
+ static int snd_mychip_pcm_hw_free(struct snd_pcm_substream *substream)
+ {
+ return snd_pcm_lib_free_pages(substream);
+ }
+
+ /* prepare callback */
+ static int snd_mychip_pcm_prepare(struct snd_pcm_substream *substream)
+ {
+ struct mychip *chip = snd_pcm_substream_chip(substream);
+ struct snd_pcm_runtime *runtime = substream->runtime;
+
+ /* set up the hardware with the current configuration
+ * for example...
+ */
+ mychip_set_sample_format(chip, runtime->format);
+ mychip_set_sample_rate(chip, runtime->rate);
+ mychip_set_channels(chip, runtime->channels);
+ mychip_set_dma_setup(chip, runtime->dma_addr,
+ chip->buffer_size,
+ chip->period_size);
+ return 0;
+ }
+
+ /* trigger callback */
+ static int snd_mychip_pcm_trigger(struct snd_pcm_substream *substream,
+ int cmd)
+ {
+ switch (cmd) {
+ case SNDRV_PCM_TRIGGER_START:
+ /* do something to start the PCM engine */
+ ....
+ break;
+ case SNDRV_PCM_TRIGGER_STOP:
+ /* do something to stop the PCM engine */
+ ....
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* pointer callback */
+ static snd_pcm_uframes_t
+ snd_mychip_pcm_pointer(struct snd_pcm_substream *substream)
+ {
+ struct mychip *chip = snd_pcm_substream_chip(substream);
+ unsigned int current_ptr;
+
+ /* get the current hardware pointer */
+ current_ptr = mychip_get_hw_pointer(chip);
+ return current_ptr;
+ }
+
+ /* operators */
+ static struct snd_pcm_ops snd_mychip_playback_ops = {
+ .open = snd_mychip_playback_open,
+ .close = snd_mychip_playback_close,
+ .ioctl = snd_pcm_lib_ioctl,
+ .hw_params = snd_mychip_pcm_hw_params,
+ .hw_free = snd_mychip_pcm_hw_free,
+ .prepare = snd_mychip_pcm_prepare,
+ .trigger = snd_mychip_pcm_trigger,
+ .pointer = snd_mychip_pcm_pointer,
+ };
+
+ /* operators */
+ static struct snd_pcm_ops snd_mychip_capture_ops = {
+ .open = snd_mychip_capture_open,
+ .close = snd_mychip_capture_close,
+ .ioctl = snd_pcm_lib_ioctl,
+ .hw_params = snd_mychip_pcm_hw_params,
+ .hw_free = snd_mychip_pcm_hw_free,
+ .prepare = snd_mychip_pcm_prepare,
+ .trigger = snd_mychip_pcm_trigger,
+ .pointer = snd_mychip_pcm_pointer,
+ };
+
+ /*
+ * definitions of capture are omitted here...
+ */
+
+ /* create a pcm device */
+ static int __devinit snd_mychip_new_pcm(struct mychip *chip)
+ {
+ struct snd_pcm *pcm;
+ int err;
+
+ err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1, &pcm);
+ if (err < 0)
+ return err;
+ pcm->private_data = chip;
+ strcpy(pcm->name, "My Chip");
+ chip->pcm = pcm;
+ /* set operators */
+ snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK,
+ &snd_mychip_playback_ops);
+ snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE,
+ &snd_mychip_capture_ops);
+ /* pre-allocation of buffers */
+ /* NOTE: this may fail */
+ snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+ snd_dma_pci_data(chip->pci),
+ 64*1024, 64*1024);
+ return 0;
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+ </section>
+
+ <section id="pcm-interface-constructor">
+ <title>Constructor</title>
+ <para>
+ A pcm instance is allocated by the <function>snd_pcm_new()</function>
+ function. It would be better to create a constructor for pcm,
+ namely,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int __devinit snd_mychip_new_pcm(struct mychip *chip)
+ {
+ struct snd_pcm *pcm;
+ int err;
+
+ err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1, &pcm);
+ if (err < 0)
+ return err;
+ pcm->private_data = chip;
+ strcpy(pcm->name, "My Chip");
+ chip->pcm = pcm;
+ ....
+ return 0;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The <function>snd_pcm_new()</function> function takes four
+ arguments. The first argument is the card pointer to which this
+ pcm is assigned, and the second is the ID string.
+ </para>
+
+ <para>
+ The third argument (<parameter>index</parameter>, 0 in the
+ above) is the index of this new pcm. It begins from zero. If
+ you create more than one pcm instances, specify the
+ different numbers in this argument. For example,
+ <parameter>index</parameter> = 1 for the second PCM device.
+ </para>
+
+ <para>
+ The fourth and fifth arguments are the number of substreams
+ for playback and capture, respectively. Here 1 is used for
+ both arguments. When no playback or capture substreams are available,
+ pass 0 to the corresponding argument.
+ </para>
+
+ <para>
+ If a chip supports multiple playbacks or captures, you can
+ specify more numbers, but they must be handled properly in
+ open/close, etc. callbacks. When you need to know which
+ substream you are referring to, then it can be obtained from
+ struct <structname>snd_pcm_substream</structname> data passed to each callback
+ as follows:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_pcm_substream *substream;
+ int index = substream->number;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ After the pcm is created, you need to set operators for each
+ pcm stream.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK,
+ &snd_mychip_playback_ops);
+ snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE,
+ &snd_mychip_capture_ops);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The operators are defined typically like this:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static struct snd_pcm_ops snd_mychip_playback_ops = {
+ .open = snd_mychip_pcm_open,
+ .close = snd_mychip_pcm_close,
+ .ioctl = snd_pcm_lib_ioctl,
+ .hw_params = snd_mychip_pcm_hw_params,
+ .hw_free = snd_mychip_pcm_hw_free,
+ .prepare = snd_mychip_pcm_prepare,
+ .trigger = snd_mychip_pcm_trigger,
+ .pointer = snd_mychip_pcm_pointer,
+ };
+]]>
+ </programlisting>
+ </informalexample>
+
+ All the callbacks are described in the
+ <link linkend="pcm-interface-operators"><citetitle>
+ Operators</citetitle></link> subsection.
+ </para>
+
+ <para>
+ After setting the operators, you probably will want to
+ pre-allocate the buffer. For the pre-allocation, simply call
+ the following:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+ snd_dma_pci_data(chip->pci),
+ 64*1024, 64*1024);
+]]>
+ </programlisting>
+ </informalexample>
+
+ It will allocate a buffer up to 64kB as default.
+ Buffer management details will be described in the later section <link
+ linkend="buffer-and-memory"><citetitle>Buffer and Memory
+ Management</citetitle></link>.
+ </para>
+
+ <para>
+ Additionally, you can set some extra information for this pcm
+ in pcm-&gt;info_flags.
+ The available values are defined as
+ <constant>SNDRV_PCM_INFO_XXX</constant> in
+ <filename>&lt;sound/asound.h&gt;</filename>, which is used for
+ the hardware definition (described later). When your soundchip
+ supports only half-duplex, specify like this:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ pcm->info_flags = SNDRV_PCM_INFO_HALF_DUPLEX;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </section>
+
+ <section id="pcm-interface-destructor">
+ <title>... And the Destructor?</title>
+ <para>
+ The destructor for a pcm instance is not always
+ necessary. Since the pcm device will be released by the middle
+ layer code automatically, you don't have to call the destructor
+ explicitly.
+ </para>
+
+ <para>
+ The destructor would be necessary if you created
+ special records internally and needed to release them. In such a
+ case, set the destructor function to
+ pcm-&gt;private_free:
+
+ <example>
+ <title>PCM Instance with a Destructor</title>
+ <programlisting>
+<![CDATA[
+ static void mychip_pcm_free(struct snd_pcm *pcm)
+ {
+ struct mychip *chip = snd_pcm_chip(pcm);
+ /* free your own data */
+ kfree(chip->my_private_pcm_data);
+ /* do what you like else */
+ ....
+ }
+
+ static int __devinit snd_mychip_new_pcm(struct mychip *chip)
+ {
+ struct snd_pcm *pcm;
+ ....
+ /* allocate your own data */
+ chip->my_private_pcm_data = kmalloc(...);
+ /* set the destructor */
+ pcm->private_data = chip;
+ pcm->private_free = mychip_pcm_free;
+ ....
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+ </section>
+
+ <section id="pcm-interface-runtime">
+ <title>Runtime Pointer - The Chest of PCM Information</title>
+ <para>
+ When the PCM substream is opened, a PCM runtime instance is
+ allocated and assigned to the substream. This pointer is
+ accessible via <constant>substream-&gt;runtime</constant>.
+ This runtime pointer holds most information you need
+ to control the PCM: the copy of hw_params and sw_params configurations, the buffer
+ pointers, mmap records, spinlocks, etc.
+ </para>
+
+ <para>
+ The definition of runtime instance is found in
+ <filename>&lt;sound/pcm.h&gt;</filename>. Here are
+ the contents of this file:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+struct _snd_pcm_runtime {
+ /* -- Status -- */
+ struct snd_pcm_substream *trigger_master;
+ snd_timestamp_t trigger_tstamp; /* trigger timestamp */
+ int overrange;
+ snd_pcm_uframes_t avail_max;
+ snd_pcm_uframes_t hw_ptr_base; /* Position at buffer restart */
+ snd_pcm_uframes_t hw_ptr_interrupt; /* Position at interrupt time*/
+
+ /* -- HW params -- */
+ snd_pcm_access_t access; /* access mode */
+ snd_pcm_format_t format; /* SNDRV_PCM_FORMAT_* */
+ snd_pcm_subformat_t subformat; /* subformat */
+ unsigned int rate; /* rate in Hz */
+ unsigned int channels; /* channels */
+ snd_pcm_uframes_t period_size; /* period size */
+ unsigned int periods; /* periods */
+ snd_pcm_uframes_t buffer_size; /* buffer size */
+ unsigned int tick_time; /* tick time */
+ snd_pcm_uframes_t min_align; /* Min alignment for the format */
+ size_t byte_align;
+ unsigned int frame_bits;
+ unsigned int sample_bits;
+ unsigned int info;
+ unsigned int rate_num;
+ unsigned int rate_den;
+
+ /* -- SW params -- */
+ struct timespec tstamp_mode; /* mmap timestamp is updated */
+ unsigned int period_step;
+ unsigned int sleep_min; /* min ticks to sleep */
+ snd_pcm_uframes_t start_threshold;
+ snd_pcm_uframes_t stop_threshold;
+ snd_pcm_uframes_t silence_threshold; /* Silence filling happens when
+ noise is nearest than this */
+ snd_pcm_uframes_t silence_size; /* Silence filling size */
+ snd_pcm_uframes_t boundary; /* pointers wrap point */
+
+ snd_pcm_uframes_t silenced_start;
+ snd_pcm_uframes_t silenced_size;
+
+ snd_pcm_sync_id_t sync; /* hardware synchronization ID */
+
+ /* -- mmap -- */
+ volatile struct snd_pcm_mmap_status *status;
+ volatile struct snd_pcm_mmap_control *control;
+ atomic_t mmap_count;
+
+ /* -- locking / scheduling -- */
+ spinlock_t lock;
+ wait_queue_head_t sleep;
+ struct timer_list tick_timer;
+ struct fasync_struct *fasync;
+
+ /* -- private section -- */
+ void *private_data;
+ void (*private_free)(struct snd_pcm_runtime *runtime);
+
+ /* -- hardware description -- */
+ struct snd_pcm_hardware hw;
+ struct snd_pcm_hw_constraints hw_constraints;
+
+ /* -- interrupt callbacks -- */
+ void (*transfer_ack_begin)(struct snd_pcm_substream *substream);
+ void (*transfer_ack_end)(struct snd_pcm_substream *substream);
+
+ /* -- timer -- */
+ unsigned int timer_resolution; /* timer resolution */
+
+ /* -- DMA -- */
+ unsigned char *dma_area; /* DMA area */
+ dma_addr_t dma_addr; /* physical bus address (not accessible from main CPU) */
+ size_t dma_bytes; /* size of DMA area */
+
+ struct snd_dma_buffer *dma_buffer_p; /* allocated buffer */
+
+#if defined(CONFIG_SND_PCM_OSS) || defined(CONFIG_SND_PCM_OSS_MODULE)
+ /* -- OSS things -- */
+ struct snd_pcm_oss_runtime oss;
+#endif
+};
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ For the operators (callbacks) of each sound driver, most of
+ these records are supposed to be read-only. Only the PCM
+ middle-layer changes / updates them. The exceptions are
+ the hardware description (hw), interrupt callbacks
+ (transfer_ack_xxx), DMA buffer information, and the private
+ data. Besides, if you use the standard buffer allocation
+ method via <function>snd_pcm_lib_malloc_pages()</function>,
+ you don't need to set the DMA buffer information by yourself.
+ </para>
+
+ <para>
+ In the sections below, important records are explained.
+ </para>
+
+ <section id="pcm-interface-runtime-hw">
+ <title>Hardware Description</title>
+ <para>
+ The hardware descriptor (struct <structname>snd_pcm_hardware</structname>)
+ contains the definitions of the fundamental hardware
+ configuration. Above all, you'll need to define this in
+ <link linkend="pcm-interface-operators-open-callback"><citetitle>
+ the open callback</citetitle></link>.
+ Note that the runtime instance holds the copy of the
+ descriptor, not the pointer to the existing descriptor. That
+ is, in the open callback, you can modify the copied descriptor
+ (<constant>runtime-&gt;hw</constant>) as you need. For example, if the maximum
+ number of channels is 1 only on some chip models, you can
+ still use the same hardware descriptor and change the
+ channels_max later:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_pcm_runtime *runtime = substream->runtime;
+ ...
+ runtime->hw = snd_mychip_playback_hw; /* common definition */
+ if (chip->model == VERY_OLD_ONE)
+ runtime->hw.channels_max = 1;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Typically, you'll have a hardware descriptor as below:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static struct snd_pcm_hardware snd_mychip_playback_hw = {
+ .info = (SNDRV_PCM_INFO_MMAP |
+ SNDRV_PCM_INFO_INTERLEAVED |
+ SNDRV_PCM_INFO_BLOCK_TRANSFER |
+ SNDRV_PCM_INFO_MMAP_VALID),
+ .formats = SNDRV_PCM_FMTBIT_S16_LE,
+ .rates = SNDRV_PCM_RATE_8000_48000,
+ .rate_min = 8000,
+ .rate_max = 48000,
+ .channels_min = 2,
+ .channels_max = 2,
+ .buffer_bytes_max = 32768,
+ .period_bytes_min = 4096,
+ .period_bytes_max = 32768,
+ .periods_min = 1,
+ .periods_max = 1024,
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ <itemizedlist>
+ <listitem><para>
+ The <structfield>info</structfield> field contains the type and
+ capabilities of this pcm. The bit flags are defined in
+ <filename>&lt;sound/asound.h&gt;</filename> as
+ <constant>SNDRV_PCM_INFO_XXX</constant>. Here, at least, you
+ have to specify whether the mmap is supported and which
+ interleaved format is supported.
+ When the is supported, add the
+ <constant>SNDRV_PCM_INFO_MMAP</constant> flag here. When the
+ hardware supports the interleaved or the non-interleaved
+ formats, <constant>SNDRV_PCM_INFO_INTERLEAVED</constant> or
+ <constant>SNDRV_PCM_INFO_NONINTERLEAVED</constant> flag must
+ be set, respectively. If both are supported, you can set both,
+ too.
+ </para>
+
+ <para>
+ In the above example, <constant>MMAP_VALID</constant> and
+ <constant>BLOCK_TRANSFER</constant> are specified for the OSS mmap
+ mode. Usually both are set. Of course,
+ <constant>MMAP_VALID</constant> is set only if the mmap is
+ really supported.
+ </para>
+
+ <para>
+ The other possible flags are
+ <constant>SNDRV_PCM_INFO_PAUSE</constant> and
+ <constant>SNDRV_PCM_INFO_RESUME</constant>. The
+ <constant>PAUSE</constant> bit means that the pcm supports the
+ <quote>pause</quote> operation, while the
+ <constant>RESUME</constant> bit means that the pcm supports
+ the full <quote>suspend/resume</quote> operation.
+ If the <constant>PAUSE</constant> flag is set,
+ the <structfield>trigger</structfield> callback below
+ must handle the corresponding (pause push/release) commands.
+ The suspend/resume trigger commands can be defined even without
+ the <constant>RESUME</constant> flag. See <link
+ linkend="power-management"><citetitle>
+ Power Management</citetitle></link> section for details.
+ </para>
+
+ <para>
+ When the PCM substreams can be synchronized (typically,
+ synchronized start/stop of a playback and a capture streams),
+ you can give <constant>SNDRV_PCM_INFO_SYNC_START</constant>,
+ too. In this case, you'll need to check the linked-list of
+ PCM substreams in the trigger callback. This will be
+ described in the later section.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <structfield>formats</structfield> field contains the bit-flags
+ of supported formats (<constant>SNDRV_PCM_FMTBIT_XXX</constant>).
+ If the hardware supports more than one format, give all or'ed
+ bits. In the example above, the signed 16bit little-endian
+ format is specified.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <structfield>rates</structfield> field contains the bit-flags of
+ supported rates (<constant>SNDRV_PCM_RATE_XXX</constant>).
+ When the chip supports continuous rates, pass
+ <constant>CONTINUOUS</constant> bit additionally.
+ The pre-defined rate bits are provided only for typical
+ rates. If your chip supports unconventional rates, you need to add
+ the <constant>KNOT</constant> bit and set up the hardware
+ constraint manually (explained later).
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <structfield>rate_min</structfield> and
+ <structfield>rate_max</structfield> define the minimum and
+ maximum sample rate. This should correspond somehow to
+ <structfield>rates</structfield> bits.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <structfield>channel_min</structfield> and
+ <structfield>channel_max</structfield>
+ define, as you might already expected, the minimum and maximum
+ number of channels.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <structfield>buffer_bytes_max</structfield> defines the
+ maximum buffer size in bytes. There is no
+ <structfield>buffer_bytes_min</structfield> field, since
+ it can be calculated from the minimum period size and the
+ minimum number of periods.
+ Meanwhile, <structfield>period_bytes_min</structfield> and
+ define the minimum and maximum size of the period in bytes.
+ <structfield>periods_max</structfield> and
+ <structfield>periods_min</structfield> define the maximum and
+ minimum number of periods in the buffer.
+ </para>
+
+ <para>
+ The <quote>period</quote> is a term that corresponds to
+ a fragment in the OSS world. The period defines the size at
+ which a PCM interrupt is generated. This size strongly
+ depends on the hardware.
+ Generally, the smaller period size will give you more
+ interrupts, that is, more controls.
+ In the case of capture, this size defines the input latency.
+ On the other hand, the whole buffer size defines the
+ output latency for the playback direction.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ There is also a field <structfield>fifo_size</structfield>.
+ This specifies the size of the hardware FIFO, but currently it
+ is neither used in the driver nor in the alsa-lib. So, you
+ can ignore this field.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </para>
+ </section>
+
+ <section id="pcm-interface-runtime-config">
+ <title>PCM Configurations</title>
+ <para>
+ Ok, let's go back again to the PCM runtime records.
+ The most frequently referred records in the runtime instance are
+ the PCM configurations.
+ The PCM configurations are stored in the runtime instance
+ after the application sends <type>hw_params</type> data via
+ alsa-lib. There are many fields copied from hw_params and
+ sw_params structs. For example,
+ <structfield>format</structfield> holds the format type
+ chosen by the application. This field contains the enum value
+ <constant>SNDRV_PCM_FORMAT_XXX</constant>.
+ </para>
+
+ <para>
+ One thing to be noted is that the configured buffer and period
+ sizes are stored in <quote>frames</quote> in the runtime.
+ In the ALSA world, 1 frame = channels * samples-size.
+ For conversion between frames and bytes, you can use the
+ <function>frames_to_bytes()</function> and
+ <function>bytes_to_frames()</function> helper functions.
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ period_bytes = frames_to_bytes(runtime, runtime->period_size);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Also, many software parameters (sw_params) are
+ stored in frames, too. Please check the type of the field.
+ <type>snd_pcm_uframes_t</type> is for the frames as unsigned
+ integer while <type>snd_pcm_sframes_t</type> is for the frames
+ as signed integer.
+ </para>
+ </section>
+
+ <section id="pcm-interface-runtime-dma">
+ <title>DMA Buffer Information</title>
+ <para>
+ The DMA buffer is defined by the following four fields,
+ <structfield>dma_area</structfield>,
+ <structfield>dma_addr</structfield>,
+ <structfield>dma_bytes</structfield> and
+ <structfield>dma_private</structfield>.
+ The <structfield>dma_area</structfield> holds the buffer
+ pointer (the logical address). You can call
+ <function>memcpy</function> from/to
+ this pointer. Meanwhile, <structfield>dma_addr</structfield>
+ holds the physical address of the buffer. This field is
+ specified only when the buffer is a linear buffer.
+ <structfield>dma_bytes</structfield> holds the size of buffer
+ in bytes. <structfield>dma_private</structfield> is used for
+ the ALSA DMA allocator.
+ </para>
+
+ <para>
+ If you use a standard ALSA function,
+ <function>snd_pcm_lib_malloc_pages()</function>, for
+ allocating the buffer, these fields are set by the ALSA middle
+ layer, and you should <emphasis>not</emphasis> change them by
+ yourself. You can read them but not write them.
+ On the other hand, if you want to allocate the buffer by
+ yourself, you'll need to manage it in hw_params callback.
+ At least, <structfield>dma_bytes</structfield> is mandatory.
+ <structfield>dma_area</structfield> is necessary when the
+ buffer is mmapped. If your driver doesn't support mmap, this
+ field is not necessary. <structfield>dma_addr</structfield>
+ is also optional. You can use
+ <structfield>dma_private</structfield> as you like, too.
+ </para>
+ </section>
+
+ <section id="pcm-interface-runtime-status">
+ <title>Running Status</title>
+ <para>
+ The running status can be referred via <constant>runtime-&gt;status</constant>.
+ This is the pointer to the struct <structname>snd_pcm_mmap_status</structname>
+ record. For example, you can get the current DMA hardware
+ pointer via <constant>runtime-&gt;status-&gt;hw_ptr</constant>.
+ </para>
+
+ <para>
+ The DMA application pointer can be referred via
+ <constant>runtime-&gt;control</constant>, which points to the
+ struct <structname>snd_pcm_mmap_control</structname> record.
+ However, accessing directly to this value is not recommended.
+ </para>
+ </section>
+
+ <section id="pcm-interface-runtime-private">
+ <title>Private Data</title>
+ <para>
+ You can allocate a record for the substream and store it in
+ <constant>runtime-&gt;private_data</constant>. Usually, this
+ is done in
+ <link linkend="pcm-interface-operators-open-callback"><citetitle>
+ the open callback</citetitle></link>.
+ Don't mix this with <constant>pcm-&gt;private_data</constant>.
+ The <constant>pcm-&gt;private_data</constant> usually points to the
+ chip instance assigned statically at the creation of PCM, while the
+ <constant>runtime-&gt;private_data</constant> points to a dynamic
+ data structure created at the PCM open callback.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_open(struct snd_pcm_substream *substream)
+ {
+ struct my_pcm_data *data;
+ ....
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ substream->runtime->private_data = data;
+ ....
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The allocated object must be released in
+ <link linkend="pcm-interface-operators-open-callback"><citetitle>
+ the close callback</citetitle></link>.
+ </para>
+ </section>
+
+ <section id="pcm-interface-runtime-intr">
+ <title>Interrupt Callbacks</title>
+ <para>
+ The field <structfield>transfer_ack_begin</structfield> and
+ <structfield>transfer_ack_end</structfield> are called at
+ the beginning and at the end of
+ <function>snd_pcm_period_elapsed()</function>, respectively.
+ </para>
+ </section>
+
+ </section>
+
+ <section id="pcm-interface-operators">
+ <title>Operators</title>
+ <para>
+ OK, now let me give details about each pcm callback
+ (<parameter>ops</parameter>). In general, every callback must
+ return 0 if successful, or a negative error number
+ such as <constant>-EINVAL</constant>. To choose an appropriate
+ error number, it is advised to check what value other parts of
+ the kernel return when the same kind of request fails.
+ </para>
+
+ <para>
+ The callback function takes at least the argument with
+ <structname>snd_pcm_substream</structname> pointer. To retrieve
+ the chip record from the given substream instance, you can use the
+ following macro.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ int xxx() {
+ struct mychip *chip = snd_pcm_substream_chip(substream);
+ ....
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ The macro reads <constant>substream-&gt;private_data</constant>,
+ which is a copy of <constant>pcm-&gt;private_data</constant>.
+ You can override the former if you need to assign different data
+ records per PCM substream. For example, the cmi8330 driver assigns
+ different private_data for playback and capture directions,
+ because it uses two different codecs (SB- and AD-compatible) for
+ different directions.
+ </para>
+
+ <section id="pcm-interface-operators-open-callback">
+ <title>open callback</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_open(struct snd_pcm_substream *substream);
+]]>
+ </programlisting>
+ </informalexample>
+
+ This is called when a pcm substream is opened.
+ </para>
+
+ <para>
+ At least, here you have to initialize the runtime-&gt;hw
+ record. Typically, this is done by like this:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_open(struct snd_pcm_substream *substream)
+ {
+ struct mychip *chip = snd_pcm_substream_chip(substream);
+ struct snd_pcm_runtime *runtime = substream->runtime;
+
+ runtime->hw = snd_mychip_playback_hw;
+ return 0;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ where <parameter>snd_mychip_playback_hw</parameter> is the
+ pre-defined hardware description.
+ </para>
+
+ <para>
+ You can allocate a private data in this callback, as described
+ in <link linkend="pcm-interface-runtime-private"><citetitle>
+ Private Data</citetitle></link> section.
+ </para>
+
+ <para>
+ If the hardware configuration needs more constraints, set the
+ hardware constraints here, too.
+ See <link linkend="pcm-interface-constraints"><citetitle>
+ Constraints</citetitle></link> for more details.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-close-callback">
+ <title>close callback</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_close(struct snd_pcm_substream *substream);
+]]>
+ </programlisting>
+ </informalexample>
+
+ Obviously, this is called when a pcm substream is closed.
+ </para>
+
+ <para>
+ Any private instance for a pcm substream allocated in the
+ open callback will be released here.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_close(struct snd_pcm_substream *substream)
+ {
+ ....
+ kfree(substream->runtime->private_data);
+ ....
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-ioctl-callback">
+ <title>ioctl callback</title>
+ <para>
+ This is used for any special call to pcm ioctls. But
+ usually you can pass a generic ioctl callback,
+ <function>snd_pcm_lib_ioctl</function>.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-hw-params-callback">
+ <title>hw_params callback</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_hw_params(struct snd_pcm_substream *substream,
+ struct snd_pcm_hw_params *hw_params);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ This is called when the hardware parameter
+ (<structfield>hw_params</structfield>) is set
+ up by the application,
+ that is, once when the buffer size, the period size, the
+ format, etc. are defined for the pcm substream.
+ </para>
+
+ <para>
+ Many hardware setups should be done in this callback,
+ including the allocation of buffers.
+ </para>
+
+ <para>
+ Parameters to be initialized are retrieved by
+ <function>params_xxx()</function> macros. To allocate
+ buffer, you can call a helper function,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_pcm_lib_malloc_pages(substream, params_buffer_bytes(hw_params));
+]]>
+ </programlisting>
+ </informalexample>
+
+ <function>snd_pcm_lib_malloc_pages()</function> is available
+ only when the DMA buffers have been pre-allocated.
+ See the section <link
+ linkend="buffer-and-memory-buffer-types"><citetitle>
+ Buffer Types</citetitle></link> for more details.
+ </para>
+
+ <para>
+ Note that this and <structfield>prepare</structfield> callbacks
+ may be called multiple times per initialization.
+ For example, the OSS emulation may
+ call these callbacks at each change via its ioctl.
+ </para>
+
+ <para>
+ Thus, you need to be careful not to allocate the same buffers
+ many times, which will lead to memory leaks! Calling the
+ helper function above many times is OK. It will release the
+ previous buffer automatically when it was already allocated.
+ </para>
+
+ <para>
+ Another note is that this callback is non-atomic
+ (schedulable). This is important, because the
+ <structfield>trigger</structfield> callback
+ is atomic (non-schedulable). That is, mutexes or any
+ schedule-related functions are not available in
+ <structfield>trigger</structfield> callback.
+ Please see the subsection
+ <link linkend="pcm-interface-atomicity"><citetitle>
+ Atomicity</citetitle></link> for details.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-hw-free-callback">
+ <title>hw_free callback</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_hw_free(struct snd_pcm_substream *substream);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ This is called to release the resources allocated via
+ <structfield>hw_params</structfield>. For example, releasing the
+ buffer via
+ <function>snd_pcm_lib_malloc_pages()</function> is done by
+ calling the following:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_pcm_lib_free_pages(substream);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ This function is always called before the close callback is called.
+ Also, the callback may be called multiple times, too.
+ Keep track whether the resource was already released.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-prepare-callback">
+ <title>prepare callback</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_prepare(struct snd_pcm_substream *substream);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ This callback is called when the pcm is
+ <quote>prepared</quote>. You can set the format type, sample
+ rate, etc. here. The difference from
+ <structfield>hw_params</structfield> is that the
+ <structfield>prepare</structfield> callback will be called each
+ time
+ <function>snd_pcm_prepare()</function> is called, i.e. when
+ recovering after underruns, etc.
+ </para>
+
+ <para>
+ Note that this callback is now non-atomic.
+ You can use schedule-related functions safely in this callback.
+ </para>
+
+ <para>
+ In this and the following callbacks, you can refer to the
+ values via the runtime record,
+ substream-&gt;runtime.
+ For example, to get the current
+ rate, format or channels, access to
+ runtime-&gt;rate,
+ runtime-&gt;format or
+ runtime-&gt;channels, respectively.
+ The physical address of the allocated buffer is set to
+ runtime-&gt;dma_area. The buffer and period sizes are
+ in runtime-&gt;buffer_size and runtime-&gt;period_size,
+ respectively.
+ </para>
+
+ <para>
+ Be careful that this callback will be called many times at
+ each setup, too.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-trigger-callback">
+ <title>trigger callback</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_trigger(struct snd_pcm_substream *substream, int cmd);
+]]>
+ </programlisting>
+ </informalexample>
+
+ This is called when the pcm is started, stopped or paused.
+ </para>
+
+ <para>
+ Which action is specified in the second argument,
+ <constant>SNDRV_PCM_TRIGGER_XXX</constant> in
+ <filename>&lt;sound/pcm.h&gt;</filename>. At least,
+ the <constant>START</constant> and <constant>STOP</constant>
+ commands must be defined in this callback.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ switch (cmd) {
+ case SNDRV_PCM_TRIGGER_START:
+ /* do something to start the PCM engine */
+ break;
+ case SNDRV_PCM_TRIGGER_STOP:
+ /* do something to stop the PCM engine */
+ break;
+ default:
+ return -EINVAL;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ When the pcm supports the pause operation (given in the info
+ field of the hardware table), the <constant>PAUSE_PUSE</constant>
+ and <constant>PAUSE_RELEASE</constant> commands must be
+ handled here, too. The former is the command to pause the pcm,
+ and the latter to restart the pcm again.
+ </para>
+
+ <para>
+ When the pcm supports the suspend/resume operation,
+ regardless of full or partial suspend/resume support,
+ the <constant>SUSPEND</constant> and <constant>RESUME</constant>
+ commands must be handled, too.
+ These commands are issued when the power-management status is
+ changed. Obviously, the <constant>SUSPEND</constant> and
+ <constant>RESUME</constant> commands
+ suspend and resume the pcm substream, and usually, they
+ are identical to the <constant>STOP</constant> and
+ <constant>START</constant> commands, respectively.
+ See the <link linkend="power-management"><citetitle>
+ Power Management</citetitle></link> section for details.
+ </para>
+
+ <para>
+ As mentioned, this callback is atomic. You cannot call
+ functions which may sleep.
+ The trigger callback should be as minimal as possible,
+ just really triggering the DMA. The other stuff should be
+ initialized hw_params and prepare callbacks properly
+ beforehand.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-pointer-callback">
+ <title>pointer callback</title>
+ <para>
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static snd_pcm_uframes_t snd_xxx_pointer(struct snd_pcm_substream *substream)
+]]>
+ </programlisting>
+ </informalexample>
+
+ This callback is called when the PCM middle layer inquires
+ the current hardware position on the buffer. The position must
+ be returned in frames,
+ ranging from 0 to buffer_size - 1.
+ </para>
+
+ <para>
+ This is called usually from the buffer-update routine in the
+ pcm middle layer, which is invoked when
+ <function>snd_pcm_period_elapsed()</function> is called in the
+ interrupt routine. Then the pcm middle layer updates the
+ position and calculates the available space, and wakes up the
+ sleeping poll threads, etc.
+ </para>
+
+ <para>
+ This callback is also atomic.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-copy-silence">
+ <title>copy and silence callbacks</title>
+ <para>
+ These callbacks are not mandatory, and can be omitted in
+ most cases. These callbacks are used when the hardware buffer
+ cannot be in the normal memory space. Some chips have their
+ own buffer on the hardware which is not mappable. In such a
+ case, you have to transfer the data manually from the memory
+ buffer to the hardware buffer. Or, if the buffer is
+ non-contiguous on both physical and virtual memory spaces,
+ these callbacks must be defined, too.
+ </para>
+
+ <para>
+ If these two callbacks are defined, copy and set-silence
+ operations are done by them. The detailed will be described in
+ the later section <link
+ linkend="buffer-and-memory"><citetitle>Buffer and Memory
+ Management</citetitle></link>.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-ack">
+ <title>ack callback</title>
+ <para>
+ This callback is also not mandatory. This callback is called
+ when the appl_ptr is updated in read or write operations.
+ Some drivers like emu10k1-fx and cs46xx need to track the
+ current appl_ptr for the internal buffer, and this callback
+ is useful only for such a purpose.
+ </para>
+ <para>
+ This callback is atomic.
+ </para>
+ </section>
+
+ <section id="pcm-interface-operators-page-callback">
+ <title>page callback</title>
+
+ <para>
+ This callback is optional too. This callback is used
+ mainly for non-contiguous buffers. The mmap calls this
+ callback to get the page address. Some examples will be
+ explained in the later section <link
+ linkend="buffer-and-memory"><citetitle>Buffer and Memory
+ Management</citetitle></link>, too.
+ </para>
+ </section>
+ </section>
+
+ <section id="pcm-interface-interrupt-handler">
+ <title>Interrupt Handler</title>
+ <para>
+ The rest of pcm stuff is the PCM interrupt handler. The
+ role of PCM interrupt handler in the sound driver is to update
+ the buffer position and to tell the PCM middle layer when the
+ buffer position goes across the prescribed period size. To
+ inform this, call the <function>snd_pcm_period_elapsed()</function>
+ function.
+ </para>
+
+ <para>
+ There are several types of sound chips to generate the interrupts.
+ </para>
+
+ <section id="pcm-interface-interrupt-handler-boundary">
+ <title>Interrupts at the period (fragment) boundary</title>
+ <para>
+ This is the most frequently found type: the hardware
+ generates an interrupt at each period boundary.
+ In this case, you can call
+ <function>snd_pcm_period_elapsed()</function> at each
+ interrupt.
+ </para>
+
+ <para>
+ <function>snd_pcm_period_elapsed()</function> takes the
+ substream pointer as its argument. Thus, you need to keep the
+ substream pointer accessible from the chip instance. For
+ example, define substream field in the chip record to hold the
+ current running substream pointer, and set the pointer value
+ at open callback (and reset at close callback).
+ </para>
+
+ <para>
+ If you acquire a spinlock in the interrupt handler, and the
+ lock is used in other pcm callbacks, too, then you have to
+ release the lock before calling
+ <function>snd_pcm_period_elapsed()</function>, because
+ <function>snd_pcm_period_elapsed()</function> calls other pcm
+ callbacks inside.
+ </para>
+
+ <para>
+ Typical code would be like:
+
+ <example>
+ <title>Interrupt Handler Case #1</title>
+ <programlisting>
+<![CDATA[
+ static irqreturn_t snd_mychip_interrupt(int irq, void *dev_id)
+ {
+ struct mychip *chip = dev_id;
+ spin_lock(&chip->lock);
+ ....
+ if (pcm_irq_invoked(chip)) {
+ /* call updater, unlock before it */
+ spin_unlock(&chip->lock);
+ snd_pcm_period_elapsed(chip->substream);
+ spin_lock(&chip->lock);
+ /* acknowledge the interrupt if necessary */
+ }
+ ....
+ spin_unlock(&chip->lock);
+ return IRQ_HANDLED;
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+ </section>
+
+ <section id="pcm-interface-interrupt-handler-timer">
+ <title>High frequency timer interrupts</title>
+ <para>
+ This happense when the hardware doesn't generate interrupts
+ at the period boundary but issues timer interrupts at a fixed
+ timer rate (e.g. es1968 or ymfpci drivers).
+ In this case, you need to check the current hardware
+ position and accumulate the processed sample length at each
+ interrupt. When the accumulated size exceeds the period
+ size, call
+ <function>snd_pcm_period_elapsed()</function> and reset the
+ accumulator.
+ </para>
+
+ <para>
+ Typical code would be like the following.
+
+ <example>
+ <title>Interrupt Handler Case #2</title>
+ <programlisting>
+<![CDATA[
+ static irqreturn_t snd_mychip_interrupt(int irq, void *dev_id)
+ {
+ struct mychip *chip = dev_id;
+ spin_lock(&chip->lock);
+ ....
+ if (pcm_irq_invoked(chip)) {
+ unsigned int last_ptr, size;
+ /* get the current hardware pointer (in frames) */
+ last_ptr = get_hw_ptr(chip);
+ /* calculate the processed frames since the
+ * last update
+ */
+ if (last_ptr < chip->last_ptr)
+ size = runtime->buffer_size + last_ptr
+ - chip->last_ptr;
+ else
+ size = last_ptr - chip->last_ptr;
+ /* remember the last updated point */
+ chip->last_ptr = last_ptr;
+ /* accumulate the size */
+ chip->size += size;
+ /* over the period boundary? */
+ if (chip->size >= runtime->period_size) {
+ /* reset the accumulator */
+ chip->size %= runtime->period_size;
+ /* call updater */
+ spin_unlock(&chip->lock);
+ snd_pcm_period_elapsed(substream);
+ spin_lock(&chip->lock);
+ }
+ /* acknowledge the interrupt if necessary */
+ }
+ ....
+ spin_unlock(&chip->lock);
+ return IRQ_HANDLED;
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+ </section>
+
+ <section id="pcm-interface-interrupt-handler-both">
+ <title>On calling <function>snd_pcm_period_elapsed()</function></title>
+ <para>
+ In both cases, even if more than one period are elapsed, you
+ don't have to call
+ <function>snd_pcm_period_elapsed()</function> many times. Call
+ only once. And the pcm layer will check the current hardware
+ pointer and update to the latest status.
+ </para>
+ </section>
+ </section>
+
+ <section id="pcm-interface-atomicity">
+ <title>Atomicity</title>
+ <para>
+ One of the most important (and thus difficult to debug) problems
+ in kernel programming are race conditions.
+ In the Linux kernel, they are usually avoided via spin-locks, mutexes
+ or semaphores. In general, if a race condition can happen
+ in an interrupt handler, it has to be managed atomically, and you
+ have to use a spinlock to protect the critical session. If the
+ critical section is not in interrupt handler code and
+ if taking a relatively long time to execute is acceptable, you
+ should use mutexes or semaphores instead.
+ </para>
+
+ <para>
+ As already seen, some pcm callbacks are atomic and some are
+ not. For example, the <parameter>hw_params</parameter> callback is
+ non-atomic, while <parameter>trigger</parameter> callback is
+ atomic. This means, the latter is called already in a spinlock
+ held by the PCM middle layer. Please take this atomicity into
+ account when you choose a locking scheme in the callbacks.
+ </para>
+
+ <para>
+ In the atomic callbacks, you cannot use functions which may call
+ <function>schedule</function> or go to
+ <function>sleep</function>. Semaphores and mutexes can sleep,
+ and hence they cannot be used inside the atomic callbacks
+ (e.g. <parameter>trigger</parameter> callback).
+ To implement some delay in such a callback, please use
+ <function>udelay()</function> or <function>mdelay()</function>.
+ </para>
+
+ <para>
+ All three atomic callbacks (trigger, pointer, and ack) are
+ called with local interrupts disabled.
+ </para>
+
+ </section>
+ <section id="pcm-interface-constraints">
+ <title>Constraints</title>
+ <para>
+ If your chip supports unconventional sample rates, or only the
+ limited samples, you need to set a constraint for the
+ condition.
+ </para>
+
+ <para>
+ For example, in order to restrict the sample rates in the some
+ supported values, use
+ <function>snd_pcm_hw_constraint_list()</function>.
+ You need to call this function in the open callback.
+
+ <example>
+ <title>Example of Hardware Constraints</title>
+ <programlisting>
+<![CDATA[
+ static unsigned int rates[] =
+ {4000, 10000, 22050, 44100};
+ static struct snd_pcm_hw_constraint_list constraints_rates = {
+ .count = ARRAY_SIZE(rates),
+ .list = rates,
+ .mask = 0,
+ };
+
+ static int snd_mychip_pcm_open(struct snd_pcm_substream *substream)
+ {
+ int err;
+ ....
+ err = snd_pcm_hw_constraint_list(substream->runtime, 0,
+ SNDRV_PCM_HW_PARAM_RATE,
+ &constraints_rates);
+ if (err < 0)
+ return err;
+ ....
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+
+ <para>
+ There are many different constraints.
+ Look at <filename>sound/pcm.h</filename> for a complete list.
+ You can even define your own constraint rules.
+ For example, let's suppose my_chip can manage a substream of 1 channel
+ if and only if the format is S16_LE, otherwise it supports any format
+ specified in the <structname>snd_pcm_hardware</structname> structure (or in any
+ other constraint_list). You can build a rule like this:
+
+ <example>
+ <title>Example of Hardware Constraints for Channels</title>
+ <programlisting>
+<![CDATA[
+ static int hw_rule_format_by_channels(struct snd_pcm_hw_params *params,
+ struct snd_pcm_hw_rule *rule)
+ {
+ struct snd_interval *c = hw_param_interval(params,
+ SNDRV_PCM_HW_PARAM_CHANNELS);
+ struct snd_mask *f = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT);
+ struct snd_mask fmt;
+
+ snd_mask_any(&fmt); /* Init the struct */
+ if (c->min < 2) {
+ fmt.bits[0] &= SNDRV_PCM_FMTBIT_S16_LE;
+ return snd_mask_refine(f, &fmt);
+ }
+ return 0;
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+
+ <para>
+ Then you need to call this function to add your rule:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_pcm_hw_rule_add(substream->runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS,
+ hw_rule_channels_by_format, 0, SNDRV_PCM_HW_PARAM_FORMAT,
+ -1);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The rule function is called when an application sets the number of
+ channels. But an application can set the format before the number of
+ channels. Thus you also need to define the inverse rule:
+
+ <example>
+ <title>Example of Hardware Constraints for Channels</title>
+ <programlisting>
+<![CDATA[
+ static int hw_rule_channels_by_format(struct snd_pcm_hw_params *params,
+ struct snd_pcm_hw_rule *rule)
+ {
+ struct snd_interval *c = hw_param_interval(params,
+ SNDRV_PCM_HW_PARAM_CHANNELS);
+ struct snd_mask *f = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT);
+ struct snd_interval ch;
+
+ snd_interval_any(&ch);
+ if (f->bits[0] == SNDRV_PCM_FMTBIT_S16_LE) {
+ ch.min = ch.max = 1;
+ ch.integer = 1;
+ return snd_interval_refine(c, &ch);
+ }
+ return 0;
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+
+ <para>
+ ...and in the open callback:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_pcm_hw_rule_add(substream->runtime, 0, SNDRV_PCM_HW_PARAM_FORMAT,
+ hw_rule_format_by_channels, 0, SNDRV_PCM_HW_PARAM_CHANNELS,
+ -1);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ I won't give more details here, rather I
+ would like to say, <quote>Luke, use the source.</quote>
+ </para>
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Control Interface -->
+<!-- ****************************************************** -->
+ <chapter id="control-interface">
+ <title>Control Interface</title>
+
+ <section id="control-interface-general">
+ <title>General</title>
+ <para>
+ The control interface is used widely for many switches,
+ sliders, etc. which are accessed from user-space. Its most
+ important use is the mixer interface. In other words, since ALSA
+ 0.9.x, all the mixer stuff is implemented on the control kernel API.
+ </para>
+
+ <para>
+ ALSA has a well-defined AC97 control module. If your chip
+ supports only the AC97 and nothing else, you can skip this
+ section.
+ </para>
+
+ <para>
+ The control API is defined in
+ <filename>&lt;sound/control.h&gt;</filename>.
+ Include this file if you want to add your own controls.
+ </para>
+ </section>
+
+ <section id="control-interface-definition">
+ <title>Definition of Controls</title>
+ <para>
+ To create a new control, you need to define the
+ following three
+ callbacks: <structfield>info</structfield>,
+ <structfield>get</structfield> and
+ <structfield>put</structfield>. Then, define a
+ struct <structname>snd_kcontrol_new</structname> record, such as:
+
+ <example>
+ <title>Definition of a Control</title>
+ <programlisting>
+<![CDATA[
+ static struct snd_kcontrol_new my_control __devinitdata = {
+ .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
+ .name = "PCM Playback Switch",
+ .index = 0,
+ .access = SNDRV_CTL_ELEM_ACCESS_READWRITE,
+ .private_value = 0xffff,
+ .info = my_control_info,
+ .get = my_control_get,
+ .put = my_control_put
+ };
+]]>
+ </programlisting>
+ </example>
+ </para>
+
+ <para>
+ Most likely the control is created via
+ <function>snd_ctl_new1()</function>, and in such a case, you can
+ add the <parameter>__devinitdata</parameter> prefix to the
+ definition as above.
+ </para>
+
+ <para>
+ The <structfield>iface</structfield> field specifies the control
+ type, <constant>SNDRV_CTL_ELEM_IFACE_XXX</constant>, which
+ is usually <constant>MIXER</constant>.
+ Use <constant>CARD</constant> for global controls that are not
+ logically part of the mixer.
+ If the control is closely associated with some specific device on
+ the sound card, use <constant>HWDEP</constant>,
+ <constant>PCM</constant>, <constant>RAWMIDI</constant>,
+ <constant>TIMER</constant>, or <constant>SEQUENCER</constant>, and
+ specify the device number with the
+ <structfield>device</structfield> and
+ <structfield>subdevice</structfield> fields.
+ </para>
+
+ <para>
+ The <structfield>name</structfield> is the name identifier
+ string. Since ALSA 0.9.x, the control name is very important,
+ because its role is classified from its name. There are
+ pre-defined standard control names. The details are described in
+ the <link linkend="control-interface-control-names"><citetitle>
+ Control Names</citetitle></link> subsection.
+ </para>
+
+ <para>
+ The <structfield>index</structfield> field holds the index number
+ of this control. If there are several different controls with
+ the same name, they can be distinguished by the index
+ number. This is the case when
+ several codecs exist on the card. If the index is zero, you can
+ omit the definition above.
+ </para>
+
+ <para>
+ The <structfield>access</structfield> field contains the access
+ type of this control. Give the combination of bit masks,
+ <constant>SNDRV_CTL_ELEM_ACCESS_XXX</constant>, there.
+ The details will be explained in
+ the <link linkend="control-interface-access-flags"><citetitle>
+ Access Flags</citetitle></link> subsection.
+ </para>
+
+ <para>
+ The <structfield>private_value</structfield> field contains
+ an arbitrary long integer value for this record. When using
+ the generic <structfield>info</structfield>,
+ <structfield>get</structfield> and
+ <structfield>put</structfield> callbacks, you can pass a value
+ through this field. If several small numbers are necessary, you can
+ combine them in bitwise. Or, it's possible to give a pointer
+ (casted to unsigned long) of some record to this field, too.
+ </para>
+
+ <para>
+ The <structfield>tlv</structfield> field can be used to provide
+ metadata about the control; see the
+ <link linkend="control-interface-tlv">
+ <citetitle>Metadata</citetitle></link> subsection.
+ </para>
+
+ <para>
+ The other three are
+ <link linkend="control-interface-callbacks"><citetitle>
+ callback functions</citetitle></link>.
+ </para>
+ </section>
+
+ <section id="control-interface-control-names">
+ <title>Control Names</title>
+ <para>
+ There are some standards to define the control names. A
+ control is usually defined from the three parts as
+ <quote>SOURCE DIRECTION FUNCTION</quote>.
+ </para>
+
+ <para>
+ The first, <constant>SOURCE</constant>, specifies the source
+ of the control, and is a string such as <quote>Master</quote>,
+ <quote>PCM</quote>, <quote>CD</quote> and
+ <quote>Line</quote>. There are many pre-defined sources.
+ </para>
+
+ <para>
+ The second, <constant>DIRECTION</constant>, is one of the
+ following strings according to the direction of the control:
+ <quote>Playback</quote>, <quote>Capture</quote>, <quote>Bypass
+ Playback</quote> and <quote>Bypass Capture</quote>. Or, it can
+ be omitted, meaning both playback and capture directions.
+ </para>
+
+ <para>
+ The third, <constant>FUNCTION</constant>, is one of the
+ following strings according to the function of the control:
+ <quote>Switch</quote>, <quote>Volume</quote> and
+ <quote>Route</quote>.
+ </para>
+
+ <para>
+ The example of control names are, thus, <quote>Master Capture
+ Switch</quote> or <quote>PCM Playback Volume</quote>.
+ </para>
+
+ <para>
+ There are some exceptions:
+ </para>
+
+ <section id="control-interface-control-names-global">
+ <title>Global capture and playback</title>
+ <para>
+ <quote>Capture Source</quote>, <quote>Capture Switch</quote>
+ and <quote>Capture Volume</quote> are used for the global
+ capture (input) source, switch and volume. Similarly,
+ <quote>Playback Switch</quote> and <quote>Playback
+ Volume</quote> are used for the global output gain switch and
+ volume.
+ </para>
+ </section>
+
+ <section id="control-interface-control-names-tone">
+ <title>Tone-controls</title>
+ <para>
+ tone-control switch and volumes are specified like
+ <quote>Tone Control - XXX</quote>, e.g. <quote>Tone Control -
+ Switch</quote>, <quote>Tone Control - Bass</quote>,
+ <quote>Tone Control - Center</quote>.
+ </para>
+ </section>
+
+ <section id="control-interface-control-names-3d">
+ <title>3D controls</title>
+ <para>
+ 3D-control switches and volumes are specified like <quote>3D
+ Control - XXX</quote>, e.g. <quote>3D Control -
+ Switch</quote>, <quote>3D Control - Center</quote>, <quote>3D
+ Control - Space</quote>.
+ </para>
+ </section>
+
+ <section id="control-interface-control-names-mic">
+ <title>Mic boost</title>
+ <para>
+ Mic-boost switch is set as <quote>Mic Boost</quote> or
+ <quote>Mic Boost (6dB)</quote>.
+ </para>
+
+ <para>
+ More precise information can be found in
+ <filename>Documentation/sound/alsa/ControlNames.txt</filename>.
+ </para>
+ </section>
+ </section>
+
+ <section id="control-interface-access-flags">
+ <title>Access Flags</title>
+
+ <para>
+ The access flag is the bitmask which specifies the access type
+ of the given control. The default access type is
+ <constant>SNDRV_CTL_ELEM_ACCESS_READWRITE</constant>,
+ which means both read and write are allowed to this control.
+ When the access flag is omitted (i.e. = 0), it is
+ considered as <constant>READWRITE</constant> access as default.
+ </para>
+
+ <para>
+ When the control is read-only, pass
+ <constant>SNDRV_CTL_ELEM_ACCESS_READ</constant> instead.
+ In this case, you don't have to define
+ the <structfield>put</structfield> callback.
+ Similarly, when the control is write-only (although it's a rare
+ case), you can use the <constant>WRITE</constant> flag instead, and
+ you don't need the <structfield>get</structfield> callback.
+ </para>
+
+ <para>
+ If the control value changes frequently (e.g. the VU meter),
+ <constant>VOLATILE</constant> flag should be given. This means
+ that the control may be changed without
+ <link linkend="control-interface-change-notification"><citetitle>
+ notification</citetitle></link>. Applications should poll such
+ a control constantly.
+ </para>
+
+ <para>
+ When the control is inactive, set
+ the <constant>INACTIVE</constant> flag, too.
+ There are <constant>LOCK</constant> and
+ <constant>OWNER</constant> flags to change the write
+ permissions.
+ </para>
+
+ </section>
+
+ <section id="control-interface-callbacks">
+ <title>Callbacks</title>
+
+ <section id="control-interface-callbacks-info">
+ <title>info callback</title>
+ <para>
+ The <structfield>info</structfield> callback is used to get
+ detailed information on this control. This must store the
+ values of the given struct <structname>snd_ctl_elem_info</structname>
+ object. For example, for a boolean control with a single
+ element:
+
+ <example>
+ <title>Example of info callback</title>
+ <programlisting>
+<![CDATA[
+ static int snd_myctl_mono_info(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_info *uinfo)
+ {
+ uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN;
+ uinfo->count = 1;
+ uinfo->value.integer.min = 0;
+ uinfo->value.integer.max = 1;
+ return 0;
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+
+ <para>
+ The <structfield>type</structfield> field specifies the type
+ of the control. There are <constant>BOOLEAN</constant>,
+ <constant>INTEGER</constant>, <constant>ENUMERATED</constant>,
+ <constant>BYTES</constant>, <constant>IEC958</constant> and
+ <constant>INTEGER64</constant>. The
+ <structfield>count</structfield> field specifies the
+ number of elements in this control. For example, a stereo
+ volume would have count = 2. The
+ <structfield>value</structfield> field is a union, and
+ the values stored are depending on the type. The boolean and
+ integer types are identical.
+ </para>
+
+ <para>
+ The enumerated type is a bit different from others. You'll
+ need to set the string for the currently given item index.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_myctl_enum_info(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_info *uinfo)
+ {
+ static char *texts[4] = {
+ "First", "Second", "Third", "Fourth"
+ };
+ uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED;
+ uinfo->count = 1;
+ uinfo->value.enumerated.items = 4;
+ if (uinfo->value.enumerated.item > 3)
+ uinfo->value.enumerated.item = 3;
+ strcpy(uinfo->value.enumerated.name,
+ texts[uinfo->value.enumerated.item]);
+ return 0;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Some common info callbacks are available for your convenience:
+ <function>snd_ctl_boolean_mono_info()</function> and
+ <function>snd_ctl_boolean_stereo_info()</function>.
+ Obviously, the former is an info callback for a mono channel
+ boolean item, just like <function>snd_myctl_mono_info</function>
+ above, and the latter is for a stereo channel boolean item.
+ </para>
+
+ </section>
+
+ <section id="control-interface-callbacks-get">
+ <title>get callback</title>
+
+ <para>
+ This callback is used to read the current value of the
+ control and to return to user-space.
+ </para>
+
+ <para>
+ For example,
+
+ <example>
+ <title>Example of get callback</title>
+ <programlisting>
+<![CDATA[
+ static int snd_myctl_get(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_value *ucontrol)
+ {
+ struct mychip *chip = snd_kcontrol_chip(kcontrol);
+ ucontrol->value.integer.value[0] = get_some_value(chip);
+ return 0;
+ }
+]]>
+ </programlisting>
+ </example>
+ </para>
+
+ <para>
+ The <structfield>value</structfield> field depends on
+ the type of control as well as on the info callback. For example,
+ the sb driver uses this field to store the register offset,
+ the bit-shift and the bit-mask. The
+ <structfield>private_value</structfield> field is set as follows:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ .private_value = reg | (shift << 16) | (mask << 24)
+]]>
+ </programlisting>
+ </informalexample>
+ and is retrieved in callbacks like
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_sbmixer_get_single(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_value *ucontrol)
+ {
+ int reg = kcontrol->private_value & 0xff;
+ int shift = (kcontrol->private_value >> 16) & 0xff;
+ int mask = (kcontrol->private_value >> 24) & 0xff;
+ ....
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ In the <structfield>get</structfield> callback,
+ you have to fill all the elements if the
+ control has more than one elements,
+ i.e. <structfield>count</structfield> &gt; 1.
+ In the example above, we filled only one element
+ (<structfield>value.integer.value[0]</structfield>) since it's
+ assumed as <structfield>count</structfield> = 1.
+ </para>
+ </section>
+
+ <section id="control-interface-callbacks-put">
+ <title>put callback</title>
+
+ <para>
+ This callback is used to write a value from user-space.
+ </para>
+
+ <para>
+ For example,
+
+ <example>
+ <title>Example of put callback</title>
+ <programlisting>
+<![CDATA[
+ static int snd_myctl_put(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_value *ucontrol)
+ {
+ struct mychip *chip = snd_kcontrol_chip(kcontrol);
+ int changed = 0;
+ if (chip->current_value !=
+ ucontrol->value.integer.value[0]) {
+ change_current_value(chip,
+ ucontrol->value.integer.value[0]);
+ changed = 1;
+ }
+ return changed;
+ }
+]]>
+ </programlisting>
+ </example>
+
+ As seen above, you have to return 1 if the value is
+ changed. If the value is not changed, return 0 instead.
+ If any fatal error happens, return a negative error code as
+ usual.
+ </para>
+
+ <para>
+ As in the <structfield>get</structfield> callback,
+ when the control has more than one elements,
+ all elements must be evaluated in this callback, too.
+ </para>
+ </section>
+
+ <section id="control-interface-callbacks-all">
+ <title>Callbacks are not atomic</title>
+ <para>
+ All these three callbacks are basically not atomic.
+ </para>
+ </section>
+ </section>
+
+ <section id="control-interface-constructor">
+ <title>Constructor</title>
+ <para>
+ When everything is ready, finally we can create a new
+ control. To create a control, there are two functions to be
+ called, <function>snd_ctl_new1()</function> and
+ <function>snd_ctl_add()</function>.
+ </para>
+
+ <para>
+ In the simplest way, you can do like this:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ err = snd_ctl_add(card, snd_ctl_new1(&my_control, chip));
+ if (err < 0)
+ return err;
+]]>
+ </programlisting>
+ </informalexample>
+
+ where <parameter>my_control</parameter> is the
+ struct <structname>snd_kcontrol_new</structname> object defined above, and chip
+ is the object pointer to be passed to
+ kcontrol-&gt;private_data
+ which can be referred to in callbacks.
+ </para>
+
+ <para>
+ <function>snd_ctl_new1()</function> allocates a new
+ <structname>snd_kcontrol</structname> instance (that's why the definition
+ of <parameter>my_control</parameter> can be with
+ the <parameter>__devinitdata</parameter>
+ prefix), and <function>snd_ctl_add</function> assigns the given
+ control component to the card.
+ </para>
+ </section>
+
+ <section id="control-interface-change-notification">
+ <title>Change Notification</title>
+ <para>
+ If you need to change and update a control in the interrupt
+ routine, you can call <function>snd_ctl_notify()</function>. For
+ example,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, id_pointer);
+]]>
+ </programlisting>
+ </informalexample>
+
+ This function takes the card pointer, the event-mask, and the
+ control id pointer for the notification. The event-mask
+ specifies the types of notification, for example, in the above
+ example, the change of control values is notified.
+ The id pointer is the pointer of struct <structname>snd_ctl_elem_id</structname>
+ to be notified.
+ You can find some examples in <filename>es1938.c</filename> or
+ <filename>es1968.c</filename> for hardware volume interrupts.
+ </para>
+ </section>
+
+ <section id="control-interface-tlv">
+ <title>Metadata</title>
+ <para>
+ To provide information about the dB values of a mixer control, use
+ on of the <constant>DECLARE_TLV_xxx</constant> macros from
+ <filename>&lt;sound/tlv.h&gt;</filename> to define a variable
+ containing this information, set the<structfield>tlv.p
+ </structfield> field to point to this variable, and include the
+ <constant>SNDRV_CTL_ELEM_ACCESS_TLV_READ</constant> flag in the
+ <structfield>access</structfield> field; like this:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static DECLARE_TLV_DB_SCALE(db_scale_my_control, -4050, 150, 0);
+
+ static struct snd_kcontrol_new my_control __devinitdata = {
+ ...
+ .access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
+ SNDRV_CTL_ELEM_ACCESS_TLV_READ,
+ ...
+ .tlv.p = db_scale_my_control,
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The <function>DECLARE_TLV_DB_SCALE</function> macro defines
+ information about a mixer control where each step in the control's
+ value changes the dB value by a constant dB amount.
+ The first parameter is the name of the variable to be defined.
+ The second parameter is the minimum value, in units of 0.01 dB.
+ The third parameter is the step size, in units of 0.01 dB.
+ Set the fourth parameter to 1 if the minimum value actually mutes
+ the control.
+ </para>
+
+ <para>
+ The <function>DECLARE_TLV_DB_LINEAR</function> macro defines
+ information about a mixer control where the control's value affects
+ the output linearly.
+ The first parameter is the name of the variable to be defined.
+ The second parameter is the minimum value, in units of 0.01 dB.
+ The third parameter is the maximum value, in units of 0.01 dB.
+ If the minimum value mutes the control, set the second parameter to
+ <constant>TLV_DB_GAIN_MUTE</constant>.
+ </para>
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- API for AC97 Codec -->
+<!-- ****************************************************** -->
+ <chapter id="api-ac97">
+ <title>API for AC97 Codec</title>
+
+ <section>
+ <title>General</title>
+ <para>
+ The ALSA AC97 codec layer is a well-defined one, and you don't
+ have to write much code to control it. Only low-level control
+ routines are necessary. The AC97 codec API is defined in
+ <filename>&lt;sound/ac97_codec.h&gt;</filename>.
+ </para>
+ </section>
+
+ <section id="api-ac97-example">
+ <title>Full Code Example</title>
+ <para>
+ <example>
+ <title>Example of AC97 Interface</title>
+ <programlisting>
+<![CDATA[
+ struct mychip {
+ ....
+ struct snd_ac97 *ac97;
+ ....
+ };
+
+ static unsigned short snd_mychip_ac97_read(struct snd_ac97 *ac97,
+ unsigned short reg)
+ {
+ struct mychip *chip = ac97->private_data;
+ ....
+ /* read a register value here from the codec */
+ return the_register_value;
+ }
+
+ static void snd_mychip_ac97_write(struct snd_ac97 *ac97,
+ unsigned short reg, unsigned short val)
+ {
+ struct mychip *chip = ac97->private_data;
+ ....
+ /* write the given register value to the codec */
+ }
+
+ static int snd_mychip_ac97(struct mychip *chip)
+ {
+ struct snd_ac97_bus *bus;
+ struct snd_ac97_template ac97;
+ int err;
+ static struct snd_ac97_bus_ops ops = {
+ .write = snd_mychip_ac97_write,
+ .read = snd_mychip_ac97_read,
+ };
+
+ err = snd_ac97_bus(chip->card, 0, &ops, NULL, &bus);
+ if (err < 0)
+ return err;
+ memset(&ac97, 0, sizeof(ac97));
+ ac97.private_data = chip;
+ return snd_ac97_mixer(bus, &ac97, &chip->ac97);
+ }
+
+]]>
+ </programlisting>
+ </example>
+ </para>
+ </section>
+
+ <section id="api-ac97-constructor">
+ <title>Constructor</title>
+ <para>
+ To create an ac97 instance, first call <function>snd_ac97_bus</function>
+ with an <type>ac97_bus_ops_t</type> record with callback functions.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_ac97_bus *bus;
+ static struct snd_ac97_bus_ops ops = {
+ .write = snd_mychip_ac97_write,
+ .read = snd_mychip_ac97_read,
+ };
+
+ snd_ac97_bus(card, 0, &ops, NULL, &pbus);
+]]>
+ </programlisting>
+ </informalexample>
+
+ The bus record is shared among all belonging ac97 instances.
+ </para>
+
+ <para>
+ And then call <function>snd_ac97_mixer()</function> with an
+ struct <structname>snd_ac97_template</structname>
+ record together with the bus pointer created above.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_ac97_template ac97;
+ int err;
+
+ memset(&ac97, 0, sizeof(ac97));
+ ac97.private_data = chip;
+ snd_ac97_mixer(bus, &ac97, &chip->ac97);
+]]>
+ </programlisting>
+ </informalexample>
+
+ where chip-&gt;ac97 is a pointer to a newly created
+ <type>ac97_t</type> instance.
+ In this case, the chip pointer is set as the private data, so that
+ the read/write callback functions can refer to this chip instance.
+ This instance is not necessarily stored in the chip
+ record. If you need to change the register values from the
+ driver, or need the suspend/resume of ac97 codecs, keep this
+ pointer to pass to the corresponding functions.
+ </para>
+ </section>
+
+ <section id="api-ac97-callbacks">
+ <title>Callbacks</title>
+ <para>
+ The standard callbacks are <structfield>read</structfield> and
+ <structfield>write</structfield>. Obviously they
+ correspond to the functions for read and write accesses to the
+ hardware low-level codes.
+ </para>
+
+ <para>
+ The <structfield>read</structfield> callback returns the
+ register value specified in the argument.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static unsigned short snd_mychip_ac97_read(struct snd_ac97 *ac97,
+ unsigned short reg)
+ {
+ struct mychip *chip = ac97->private_data;
+ ....
+ return the_register_value;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ Here, the chip can be cast from ac97-&gt;private_data.
+ </para>
+
+ <para>
+ Meanwhile, the <structfield>write</structfield> callback is
+ used to set the register value.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static void snd_mychip_ac97_write(struct snd_ac97 *ac97,
+ unsigned short reg, unsigned short val)
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ These callbacks are non-atomic like the control API callbacks.
+ </para>
+
+ <para>
+ There are also other callbacks:
+ <structfield>reset</structfield>,
+ <structfield>wait</structfield> and
+ <structfield>init</structfield>.
+ </para>
+
+ <para>
+ The <structfield>reset</structfield> callback is used to reset
+ the codec. If the chip requires a special kind of reset, you can
+ define this callback.
+ </para>
+
+ <para>
+ The <structfield>wait</structfield> callback is used to
+ add some waiting time in the standard initialization of the codec. If the
+ chip requires the extra waiting time, define this callback.
+ </para>
+
+ <para>
+ The <structfield>init</structfield> callback is used for
+ additional initialization of the codec.
+ </para>
+ </section>
+
+ <section id="api-ac97-updating-registers">
+ <title>Updating Registers in The Driver</title>
+ <para>
+ If you need to access to the codec from the driver, you can
+ call the following functions:
+ <function>snd_ac97_write()</function>,
+ <function>snd_ac97_read()</function>,
+ <function>snd_ac97_update()</function> and
+ <function>snd_ac97_update_bits()</function>.
+ </para>
+
+ <para>
+ Both <function>snd_ac97_write()</function> and
+ <function>snd_ac97_update()</function> functions are used to
+ set a value to the given register
+ (<constant>AC97_XXX</constant>). The difference between them is
+ that <function>snd_ac97_update()</function> doesn't write a
+ value if the given value has been already set, while
+ <function>snd_ac97_write()</function> always rewrites the
+ value.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_ac97_write(ac97, AC97_MASTER, 0x8080);
+ snd_ac97_update(ac97, AC97_MASTER, 0x8080);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ <function>snd_ac97_read()</function> is used to read the value
+ of the given register. For example,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ value = snd_ac97_read(ac97, AC97_MASTER);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ <function>snd_ac97_update_bits()</function> is used to update
+ some bits in the given register.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_ac97_update_bits(ac97, reg, mask, value);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Also, there is a function to change the sample rate (of a
+ given register such as
+ <constant>AC97_PCM_FRONT_DAC_RATE</constant>) when VRA or
+ DRA is supported by the codec:
+ <function>snd_ac97_set_rate()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_ac97_set_rate(ac97, AC97_PCM_FRONT_DAC_RATE, 44100);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The following registers are available to set the rate:
+ <constant>AC97_PCM_MIC_ADC_RATE</constant>,
+ <constant>AC97_PCM_FRONT_DAC_RATE</constant>,
+ <constant>AC97_PCM_LR_ADC_RATE</constant>,
+ <constant>AC97_SPDIF</constant>. When
+ <constant>AC97_SPDIF</constant> is specified, the register is
+ not really changed but the corresponding IEC958 status bits will
+ be updated.
+ </para>
+ </section>
+
+ <section id="api-ac97-clock-adjustment">
+ <title>Clock Adjustment</title>
+ <para>
+ In some chips, the clock of the codec isn't 48000 but using a
+ PCI clock (to save a quartz!). In this case, change the field
+ bus-&gt;clock to the corresponding
+ value. For example, intel8x0
+ and es1968 drivers have their own function to read from the clock.
+ </para>
+ </section>
+
+ <section id="api-ac97-proc-files">
+ <title>Proc Files</title>
+ <para>
+ The ALSA AC97 interface will create a proc file such as
+ <filename>/proc/asound/card0/codec97#0/ac97#0-0</filename> and
+ <filename>ac97#0-0+regs</filename>. You can refer to these files to
+ see the current status and registers of the codec.
+ </para>
+ </section>
+
+ <section id="api-ac97-multiple-codecs">
+ <title>Multiple Codecs</title>
+ <para>
+ When there are several codecs on the same card, you need to
+ call <function>snd_ac97_mixer()</function> multiple times with
+ ac97.num=1 or greater. The <structfield>num</structfield> field
+ specifies the codec number.
+ </para>
+
+ <para>
+ If you set up multiple codecs, you either need to write
+ different callbacks for each codec or check
+ ac97-&gt;num in the callback routines.
+ </para>
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- MIDI (MPU401-UART) Interface -->
+<!-- ****************************************************** -->
+ <chapter id="midi-interface">
+ <title>MIDI (MPU401-UART) Interface</title>
+
+ <section id="midi-interface-general">
+ <title>General</title>
+ <para>
+ Many soundcards have built-in MIDI (MPU401-UART)
+ interfaces. When the soundcard supports the standard MPU401-UART
+ interface, most likely you can use the ALSA MPU401-UART API. The
+ MPU401-UART API is defined in
+ <filename>&lt;sound/mpu401.h&gt;</filename>.
+ </para>
+
+ <para>
+ Some soundchips have a similar but slightly different
+ implementation of mpu401 stuff. For example, emu10k1 has its own
+ mpu401 routines.
+ </para>
+ </section>
+
+ <section id="midi-interface-constructor">
+ <title>Constructor</title>
+ <para>
+ To create a rawmidi object, call
+ <function>snd_mpu401_uart_new()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_rawmidi *rmidi;
+ snd_mpu401_uart_new(card, 0, MPU401_HW_MPU401, port, info_flags,
+ irq, irq_flags, &rmidi);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The first argument is the card pointer, and the second is the
+ index of this component. You can create up to 8 rawmidi
+ devices.
+ </para>
+
+ <para>
+ The third argument is the type of the hardware,
+ <constant>MPU401_HW_XXX</constant>. If it's not a special one,
+ you can use <constant>MPU401_HW_MPU401</constant>.
+ </para>
+
+ <para>
+ The 4th argument is the I/O port address. Many
+ backward-compatible MPU401 have an I/O port such as 0x330. Or, it
+ might be a part of its own PCI I/O region. It depends on the
+ chip design.
+ </para>
+
+ <para>
+ The 5th argument is a bitflag for additional information.
+ When the I/O port address above is part of the PCI I/O
+ region, the MPU401 I/O port might have been already allocated
+ (reserved) by the driver itself. In such a case, pass a bit flag
+ <constant>MPU401_INFO_INTEGRATED</constant>,
+ and the mpu401-uart layer will allocate the I/O ports by itself.
+ </para>
+
+ <para>
+ When the controller supports only the input or output MIDI stream,
+ pass the <constant>MPU401_INFO_INPUT</constant> or
+ <constant>MPU401_INFO_OUTPUT</constant> bitflag, respectively.
+ Then the rawmidi instance is created as a single stream.
+ </para>
+
+ <para>
+ <constant>MPU401_INFO_MMIO</constant> bitflag is used to change
+ the access method to MMIO (via readb and writeb) instead of
+ iob and outb. In this case, you have to pass the iomapped address
+ to <function>snd_mpu401_uart_new()</function>.
+ </para>
+
+ <para>
+ When <constant>MPU401_INFO_TX_IRQ</constant> is set, the output
+ stream isn't checked in the default interrupt handler. The driver
+ needs to call <function>snd_mpu401_uart_interrupt_tx()</function>
+ by itself to start processing the output stream in the irq handler.
+ </para>
+
+ <para>
+ Usually, the port address corresponds to the command port and
+ port + 1 corresponds to the data port. If not, you may change
+ the <structfield>cport</structfield> field of
+ struct <structname>snd_mpu401</structname> manually
+ afterward. However, <structname>snd_mpu401</structname> pointer is not
+ returned explicitly by
+ <function>snd_mpu401_uart_new()</function>. You need to cast
+ rmidi-&gt;private_data to
+ <structname>snd_mpu401</structname> explicitly,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_mpu401 *mpu;
+ mpu = rmidi->private_data;
+]]>
+ </programlisting>
+ </informalexample>
+
+ and reset the cport as you like:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ mpu->cport = my_own_control_port;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The 6th argument specifies the irq number for UART. If the irq
+ is already allocated, pass 0 to the 7th argument
+ (<parameter>irq_flags</parameter>). Otherwise, pass the flags
+ for irq allocation
+ (<constant>SA_XXX</constant> bits) to it, and the irq will be
+ reserved by the mpu401-uart layer. If the card doesn't generate
+ UART interrupts, pass -1 as the irq number. Then a timer
+ interrupt will be invoked for polling.
+ </para>
+ </section>
+
+ <section id="midi-interface-interrupt-handler">
+ <title>Interrupt Handler</title>
+ <para>
+ When the interrupt is allocated in
+ <function>snd_mpu401_uart_new()</function>, the private
+ interrupt handler is used, hence you don't have anything else to do
+ than creating the mpu401 stuff. Otherwise, you have to call
+ <function>snd_mpu401_uart_interrupt()</function> explicitly when
+ a UART interrupt is invoked and checked in your own interrupt
+ handler.
+ </para>
+
+ <para>
+ In this case, you need to pass the private_data of the
+ returned rawmidi object from
+ <function>snd_mpu401_uart_new()</function> as the second
+ argument of <function>snd_mpu401_uart_interrupt()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_mpu401_uart_interrupt(irq, rmidi->private_data, regs);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- RawMIDI Interface -->
+<!-- ****************************************************** -->
+ <chapter id="rawmidi-interface">
+ <title>RawMIDI Interface</title>
+
+ <section id="rawmidi-interface-overview">
+ <title>Overview</title>
+
+ <para>
+ The raw MIDI interface is used for hardware MIDI ports that can
+ be accessed as a byte stream. It is not used for synthesizer
+ chips that do not directly understand MIDI.
+ </para>
+
+ <para>
+ ALSA handles file and buffer management. All you have to do is
+ to write some code to move data between the buffer and the
+ hardware.
+ </para>
+
+ <para>
+ The rawmidi API is defined in
+ <filename>&lt;sound/rawmidi.h&gt;</filename>.
+ </para>
+ </section>
+
+ <section id="rawmidi-interface-constructor">
+ <title>Constructor</title>
+
+ <para>
+ To create a rawmidi device, call the
+ <function>snd_rawmidi_new</function> function:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_rawmidi *rmidi;
+ err = snd_rawmidi_new(chip->card, "MyMIDI", 0, outs, ins, &rmidi);
+ if (err < 0)
+ return err;
+ rmidi->private_data = chip;
+ strcpy(rmidi->name, "My MIDI");
+ rmidi->info_flags = SNDRV_RAWMIDI_INFO_OUTPUT |
+ SNDRV_RAWMIDI_INFO_INPUT |
+ SNDRV_RAWMIDI_INFO_DUPLEX;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The first argument is the card pointer, the second argument is
+ the ID string.
+ </para>
+
+ <para>
+ The third argument is the index of this component. You can
+ create up to 8 rawmidi devices.
+ </para>
+
+ <para>
+ The fourth and fifth arguments are the number of output and
+ input substreams, respectively, of this device (a substream is
+ the equivalent of a MIDI port).
+ </para>
+
+ <para>
+ Set the <structfield>info_flags</structfield> field to specify
+ the capabilities of the device.
+ Set <constant>SNDRV_RAWMIDI_INFO_OUTPUT</constant> if there is
+ at least one output port,
+ <constant>SNDRV_RAWMIDI_INFO_INPUT</constant> if there is at
+ least one input port,
+ and <constant>SNDRV_RAWMIDI_INFO_DUPLEX</constant> if the device
+ can handle output and input at the same time.
+ </para>
+
+ <para>
+ After the rawmidi device is created, you need to set the
+ operators (callbacks) for each substream. There are helper
+ functions to set the operators for all the substreams of a device:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_rawmidi_set_ops(rmidi, SNDRV_RAWMIDI_STREAM_OUTPUT, &snd_mymidi_output_ops);
+ snd_rawmidi_set_ops(rmidi, SNDRV_RAWMIDI_STREAM_INPUT, &snd_mymidi_input_ops);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The operators are usually defined like this:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static struct snd_rawmidi_ops snd_mymidi_output_ops = {
+ .open = snd_mymidi_output_open,
+ .close = snd_mymidi_output_close,
+ .trigger = snd_mymidi_output_trigger,
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ These callbacks are explained in the <link
+ linkend="rawmidi-interface-callbacks"><citetitle>Callbacks</citetitle></link>
+ section.
+ </para>
+
+ <para>
+ If there are more than one substream, you should give a
+ unique name to each of them:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_rawmidi_substream *substream;
+ list_for_each_entry(substream,
+ &rmidi->streams[SNDRV_RAWMIDI_STREAM_OUTPUT].substreams,
+ list {
+ sprintf(substream->name, "My MIDI Port %d", substream->number + 1);
+ }
+ /* same for SNDRV_RAWMIDI_STREAM_INPUT */
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </section>
+
+ <section id="rawmidi-interface-callbacks">
+ <title>Callbacks</title>
+
+ <para>
+ In all the callbacks, the private data that you've set for the
+ rawmidi device can be accessed as
+ substream-&gt;rmidi-&gt;private_data.
+ <!-- <code> isn't available before DocBook 4.3 -->
+ </para>
+
+ <para>
+ If there is more than one port, your callbacks can determine the
+ port index from the struct snd_rawmidi_substream data passed to each
+ callback:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_rawmidi_substream *substream;
+ int index = substream->number;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <section id="rawmidi-interface-op-open">
+ <title><function>open</function> callback</title>
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_open(struct snd_rawmidi_substream *substream);
+]]>
+ </programlisting>
+ </informalexample>
+
+ <para>
+ This is called when a substream is opened.
+ You can initialize the hardware here, but you shouldn't
+ start transmitting/receiving data yet.
+ </para>
+ </section>
+
+ <section id="rawmidi-interface-op-close">
+ <title><function>close</function> callback</title>
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int snd_xxx_close(struct snd_rawmidi_substream *substream);
+]]>
+ </programlisting>
+ </informalexample>
+
+ <para>
+ Guess what.
+ </para>
+
+ <para>
+ The <function>open</function> and <function>close</function>
+ callbacks of a rawmidi device are serialized with a mutex,
+ and can sleep.
+ </para>
+ </section>
+
+ <section id="rawmidi-interface-op-trigger-out">
+ <title><function>trigger</function> callback for output
+ substreams</title>
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static void snd_xxx_output_trigger(struct snd_rawmidi_substream *substream, int up);
+]]>
+ </programlisting>
+ </informalexample>
+
+ <para>
+ This is called with a nonzero <parameter>up</parameter>
+ parameter when there is some data in the substream buffer that
+ must be transmitted.
+ </para>
+
+ <para>
+ To read data from the buffer, call
+ <function>snd_rawmidi_transmit_peek</function>. It will
+ return the number of bytes that have been read; this will be
+ less than the number of bytes requested when there are no more
+ data in the buffer.
+ After the data have been transmitted successfully, call
+ <function>snd_rawmidi_transmit_ack</function> to remove the
+ data from the substream buffer:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ unsigned char data;
+ while (snd_rawmidi_transmit_peek(substream, &data, 1) == 1) {
+ if (snd_mychip_try_to_transmit(data))
+ snd_rawmidi_transmit_ack(substream, 1);
+ else
+ break; /* hardware FIFO full */
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ If you know beforehand that the hardware will accept data, you
+ can use the <function>snd_rawmidi_transmit</function> function
+ which reads some data and removes them from the buffer at once:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ while (snd_mychip_transmit_possible()) {
+ unsigned char data;
+ if (snd_rawmidi_transmit(substream, &data, 1) != 1)
+ break; /* no more data */
+ snd_mychip_transmit(data);
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ If you know beforehand how many bytes you can accept, you can
+ use a buffer size greater than one with the
+ <function>snd_rawmidi_transmit*</function> functions.
+ </para>
+
+ <para>
+ The <function>trigger</function> callback must not sleep. If
+ the hardware FIFO is full before the substream buffer has been
+ emptied, you have to continue transmitting data later, either
+ in an interrupt handler, or with a timer if the hardware
+ doesn't have a MIDI transmit interrupt.
+ </para>
+
+ <para>
+ The <function>trigger</function> callback is called with a
+ zero <parameter>up</parameter> parameter when the transmission
+ of data should be aborted.
+ </para>
+ </section>
+
+ <section id="rawmidi-interface-op-trigger-in">
+ <title><function>trigger</function> callback for input
+ substreams</title>
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static void snd_xxx_input_trigger(struct snd_rawmidi_substream *substream, int up);
+]]>
+ </programlisting>
+ </informalexample>
+
+ <para>
+ This is called with a nonzero <parameter>up</parameter>
+ parameter to enable receiving data, or with a zero
+ <parameter>up</parameter> parameter do disable receiving data.
+ </para>
+
+ <para>
+ The <function>trigger</function> callback must not sleep; the
+ actual reading of data from the device is usually done in an
+ interrupt handler.
+ </para>
+
+ <para>
+ When data reception is enabled, your interrupt handler should
+ call <function>snd_rawmidi_receive</function> for all received
+ data:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ void snd_mychip_midi_interrupt(...)
+ {
+ while (mychip_midi_available()) {
+ unsigned char data;
+ data = mychip_midi_read();
+ snd_rawmidi_receive(substream, &data, 1);
+ }
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </section>
+
+ <section id="rawmidi-interface-op-drain">
+ <title><function>drain</function> callback</title>
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static void snd_xxx_drain(struct snd_rawmidi_substream *substream);
+]]>
+ </programlisting>
+ </informalexample>
+
+ <para>
+ This is only used with output substreams. This function should wait
+ until all data read from the substream buffer have been transmitted.
+ This ensures that the device can be closed and the driver unloaded
+ without losing data.
+ </para>
+
+ <para>
+ This callback is optional. If you do not set
+ <structfield>drain</structfield> in the struct snd_rawmidi_ops
+ structure, ALSA will simply wait for 50&nbsp;milliseconds
+ instead.
+ </para>
+ </section>
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Miscellaneous Devices -->
+<!-- ****************************************************** -->
+ <chapter id="misc-devices">
+ <title>Miscellaneous Devices</title>
+
+ <section id="misc-devices-opl3">
+ <title>FM OPL3</title>
+ <para>
+ The FM OPL3 is still used in many chips (mainly for backward
+ compatibility). ALSA has a nice OPL3 FM control layer, too. The
+ OPL3 API is defined in
+ <filename>&lt;sound/opl3.h&gt;</filename>.
+ </para>
+
+ <para>
+ FM registers can be directly accessed through the direct-FM API,
+ defined in <filename>&lt;sound/asound_fm.h&gt;</filename>. In
+ ALSA native mode, FM registers are accessed through
+ the Hardware-Dependant Device direct-FM extension API, whereas in
+ OSS compatible mode, FM registers can be accessed with the OSS
+ direct-FM compatible API in <filename>/dev/dmfmX</filename> device.
+ </para>
+
+ <para>
+ To create the OPL3 component, you have two functions to
+ call. The first one is a constructor for the <type>opl3_t</type>
+ instance.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_opl3 *opl3;
+ snd_opl3_create(card, lport, rport, OPL3_HW_OPL3_XXX,
+ integrated, &opl3);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The first argument is the card pointer, the second one is the
+ left port address, and the third is the right port address. In
+ most cases, the right port is placed at the left port + 2.
+ </para>
+
+ <para>
+ The fourth argument is the hardware type.
+ </para>
+
+ <para>
+ When the left and right ports have been already allocated by
+ the card driver, pass non-zero to the fifth argument
+ (<parameter>integrated</parameter>). Otherwise, the opl3 module will
+ allocate the specified ports by itself.
+ </para>
+
+ <para>
+ When the accessing the hardware requires special method
+ instead of the standard I/O access, you can create opl3 instance
+ separately with <function>snd_opl3_new()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_opl3 *opl3;
+ snd_opl3_new(card, OPL3_HW_OPL3_XXX, &opl3);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Then set <structfield>command</structfield>,
+ <structfield>private_data</structfield> and
+ <structfield>private_free</structfield> for the private
+ access function, the private data and the destructor.
+ The l_port and r_port are not necessarily set. Only the
+ command must be set properly. You can retrieve the data
+ from the opl3-&gt;private_data field.
+ </para>
+
+ <para>
+ After creating the opl3 instance via <function>snd_opl3_new()</function>,
+ call <function>snd_opl3_init()</function> to initialize the chip to the
+ proper state. Note that <function>snd_opl3_create()</function> always
+ calls it internally.
+ </para>
+
+ <para>
+ If the opl3 instance is created successfully, then create a
+ hwdep device for this opl3.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_hwdep *opl3hwdep;
+ snd_opl3_hwdep_new(opl3, 0, 1, &opl3hwdep);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The first argument is the <type>opl3_t</type> instance you
+ created, and the second is the index number, usually 0.
+ </para>
+
+ <para>
+ The third argument is the index-offset for the sequencer
+ client assigned to the OPL3 port. When there is an MPU401-UART,
+ give 1 for here (UART always takes 0).
+ </para>
+ </section>
+
+ <section id="misc-devices-hardware-dependent">
+ <title>Hardware-Dependent Devices</title>
+ <para>
+ Some chips need user-space access for special
+ controls or for loading the micro code. In such a case, you can
+ create a hwdep (hardware-dependent) device. The hwdep API is
+ defined in <filename>&lt;sound/hwdep.h&gt;</filename>. You can
+ find examples in opl3 driver or
+ <filename>isa/sb/sb16_csp.c</filename>.
+ </para>
+
+ <para>
+ The creation of the <type>hwdep</type> instance is done via
+ <function>snd_hwdep_new()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_hwdep *hw;
+ snd_hwdep_new(card, "My HWDEP", 0, &hw);
+]]>
+ </programlisting>
+ </informalexample>
+
+ where the third argument is the index number.
+ </para>
+
+ <para>
+ You can then pass any pointer value to the
+ <parameter>private_data</parameter>.
+ If you assign a private data, you should define the
+ destructor, too. The destructor function is set in
+ the <structfield>private_free</structfield> field.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct mydata *p = kmalloc(sizeof(*p), GFP_KERNEL);
+ hw->private_data = p;
+ hw->private_free = mydata_free;
+]]>
+ </programlisting>
+ </informalexample>
+
+ and the implementation of the destructor would be:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static void mydata_free(struct snd_hwdep *hw)
+ {
+ struct mydata *p = hw->private_data;
+ kfree(p);
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The arbitrary file operations can be defined for this
+ instance. The file operators are defined in
+ the <parameter>ops</parameter> table. For example, assume that
+ this chip needs an ioctl.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ hw->ops.open = mydata_open;
+ hw->ops.ioctl = mydata_ioctl;
+ hw->ops.release = mydata_release;
+]]>
+ </programlisting>
+ </informalexample>
+
+ And implement the callback functions as you like.
+ </para>
+ </section>
+
+ <section id="misc-devices-IEC958">
+ <title>IEC958 (S/PDIF)</title>
+ <para>
+ Usually the controls for IEC958 devices are implemented via
+ the control interface. There is a macro to compose a name string for
+ IEC958 controls, <function>SNDRV_CTL_NAME_IEC958()</function>
+ defined in <filename>&lt;include/asound.h&gt;</filename>.
+ </para>
+
+ <para>
+ There are some standard controls for IEC958 status bits. These
+ controls use the type <type>SNDRV_CTL_ELEM_TYPE_IEC958</type>,
+ and the size of element is fixed as 4 bytes array
+ (value.iec958.status[x]). For the <structfield>info</structfield>
+ callback, you don't specify
+ the value field for this type (the count field must be set,
+ though).
+ </para>
+
+ <para>
+ <quote>IEC958 Playback Con Mask</quote> is used to return the
+ bit-mask for the IEC958 status bits of consumer mode. Similarly,
+ <quote>IEC958 Playback Pro Mask</quote> returns the bitmask for
+ professional mode. They are read-only controls, and are defined
+ as MIXER controls (iface =
+ <constant>SNDRV_CTL_ELEM_IFACE_MIXER</constant>).
+ </para>
+
+ <para>
+ Meanwhile, <quote>IEC958 Playback Default</quote> control is
+ defined for getting and setting the current default IEC958
+ bits. Note that this one is usually defined as a PCM control
+ (iface = <constant>SNDRV_CTL_ELEM_IFACE_PCM</constant>),
+ although in some places it's defined as a MIXER control.
+ </para>
+
+ <para>
+ In addition, you can define the control switches to
+ enable/disable or to set the raw bit mode. The implementation
+ will depend on the chip, but the control should be named as
+ <quote>IEC958 xxx</quote>, preferably using
+ the <function>SNDRV_CTL_NAME_IEC958()</function> macro.
+ </para>
+
+ <para>
+ You can find several cases, for example,
+ <filename>pci/emu10k1</filename>,
+ <filename>pci/ice1712</filename>, or
+ <filename>pci/cmipci.c</filename>.
+ </para>
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Buffer and Memory Management -->
+<!-- ****************************************************** -->
+ <chapter id="buffer-and-memory">
+ <title>Buffer and Memory Management</title>
+
+ <section id="buffer-and-memory-buffer-types">
+ <title>Buffer Types</title>
+ <para>
+ ALSA provides several different buffer allocation functions
+ depending on the bus and the architecture. All these have a
+ consistent API. The allocation of physically-contiguous pages is
+ done via
+ <function>snd_malloc_xxx_pages()</function> function, where xxx
+ is the bus type.
+ </para>
+
+ <para>
+ The allocation of pages with fallback is
+ <function>snd_malloc_xxx_pages_fallback()</function>. This
+ function tries to allocate the specified pages but if the pages
+ are not available, it tries to reduce the page sizes until
+ enough space is found.
+ </para>
+
+ <para>
+ The release the pages, call
+ <function>snd_free_xxx_pages()</function> function.
+ </para>
+
+ <para>
+ Usually, ALSA drivers try to allocate and reserve
+ a large contiguous physical space
+ at the time the module is loaded for the later use.
+ This is called <quote>pre-allocation</quote>.
+ As already written, you can call the following function at
+ pcm instance construction time (in the case of PCI bus).
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+ snd_dma_pci_data(pci), size, max);
+]]>
+ </programlisting>
+ </informalexample>
+
+ where <parameter>size</parameter> is the byte size to be
+ pre-allocated and the <parameter>max</parameter> is the maximum
+ size to be changed via the <filename>prealloc</filename> proc file.
+ The allocator will try to get an area as large as possible
+ within the given size.
+ </para>
+
+ <para>
+ The second argument (type) and the third argument (device pointer)
+ are dependent on the bus.
+ In the case of the ISA bus, pass <function>snd_dma_isa_data()</function>
+ as the third argument with <constant>SNDRV_DMA_TYPE_DEV</constant> type.
+ For the continuous buffer unrelated to the bus can be pre-allocated
+ with <constant>SNDRV_DMA_TYPE_CONTINUOUS</constant> type and the
+ <function>snd_dma_continuous_data(GFP_KERNEL)</function> device pointer,
+ where <constant>GFP_KERNEL</constant> is the kernel allocation flag to
+ use.
+ For the PCI scatter-gather buffers, use
+ <constant>SNDRV_DMA_TYPE_DEV_SG</constant> with
+ <function>snd_dma_pci_data(pci)</function>
+ (see the
+ <link linkend="buffer-and-memory-non-contiguous"><citetitle>Non-Contiguous Buffers
+ </citetitle></link> section).
+ </para>
+
+ <para>
+ Once the buffer is pre-allocated, you can use the
+ allocator in the <structfield>hw_params</structfield> callback:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_pcm_lib_malloc_pages(substream, size);
+]]>
+ </programlisting>
+ </informalexample>
+
+ Note that you have to pre-allocate to use this function.
+ </para>
+ </section>
+
+ <section id="buffer-and-memory-external-hardware">
+ <title>External Hardware Buffers</title>
+ <para>
+ Some chips have their own hardware buffers and the DMA
+ transfer from the host memory is not available. In such a case,
+ you need to either 1) copy/set the audio data directly to the
+ external hardware buffer, or 2) make an intermediate buffer and
+ copy/set the data from it to the external hardware buffer in
+ interrupts (or in tasklets, preferably).
+ </para>
+
+ <para>
+ The first case works fine if the external hardware buffer is large
+ enough. This method doesn't need any extra buffers and thus is
+ more effective. You need to define the
+ <structfield>copy</structfield> and
+ <structfield>silence</structfield> callbacks for
+ the data transfer. However, there is a drawback: it cannot
+ be mmapped. The examples are GUS's GF1 PCM or emu8000's
+ wavetable PCM.
+ </para>
+
+ <para>
+ The second case allows for mmap on the buffer, although you have
+ to handle an interrupt or a tasklet to transfer the data
+ from the intermediate buffer to the hardware buffer. You can find an
+ example in the vxpocket driver.
+ </para>
+
+ <para>
+ Another case is when the chip uses a PCI memory-map
+ region for the buffer instead of the host memory. In this case,
+ mmap is available only on certain architectures like the Intel one.
+ In non-mmap mode, the data cannot be transferred as in the normal
+ way. Thus you need to define the <structfield>copy</structfield> and
+ <structfield>silence</structfield> callbacks as well,
+ as in the cases above. The examples are found in
+ <filename>rme32.c</filename> and <filename>rme96.c</filename>.
+ </para>
+
+ <para>
+ The implementation of the <structfield>copy</structfield> and
+ <structfield>silence</structfield> callbacks depends upon
+ whether the hardware supports interleaved or non-interleaved
+ samples. The <structfield>copy</structfield> callback is
+ defined like below, a bit
+ differently depending whether the direction is playback or
+ capture:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int playback_copy(struct snd_pcm_substream *substream, int channel,
+ snd_pcm_uframes_t pos, void *src, snd_pcm_uframes_t count);
+ static int capture_copy(struct snd_pcm_substream *substream, int channel,
+ snd_pcm_uframes_t pos, void *dst, snd_pcm_uframes_t count);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ In the case of interleaved samples, the second argument
+ (<parameter>channel</parameter>) is not used. The third argument
+ (<parameter>pos</parameter>) points the
+ current position offset in frames.
+ </para>
+
+ <para>
+ The meaning of the fourth argument is different between
+ playback and capture. For playback, it holds the source data
+ pointer, and for capture, it's the destination data pointer.
+ </para>
+
+ <para>
+ The last argument is the number of frames to be copied.
+ </para>
+
+ <para>
+ What you have to do in this callback is again different
+ between playback and capture directions. In the
+ playback case, you copy the given amount of data
+ (<parameter>count</parameter>) at the specified pointer
+ (<parameter>src</parameter>) to the specified offset
+ (<parameter>pos</parameter>) on the hardware buffer. When
+ coded like memcpy-like way, the copy would be like:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ my_memcpy(my_buffer + frames_to_bytes(runtime, pos), src,
+ frames_to_bytes(runtime, count));
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ For the capture direction, you copy the given amount of
+ data (<parameter>count</parameter>) at the specified offset
+ (<parameter>pos</parameter>) on the hardware buffer to the
+ specified pointer (<parameter>dst</parameter>).
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ my_memcpy(dst, my_buffer + frames_to_bytes(runtime, pos),
+ frames_to_bytes(runtime, count));
+]]>
+ </programlisting>
+ </informalexample>
+
+ Note that both the position and the amount of data are given
+ in frames.
+ </para>
+
+ <para>
+ In the case of non-interleaved samples, the implementation
+ will be a bit more complicated.
+ </para>
+
+ <para>
+ You need to check the channel argument, and if it's -1, copy
+ the whole channels. Otherwise, you have to copy only the
+ specified channel. Please check
+ <filename>isa/gus/gus_pcm.c</filename> as an example.
+ </para>
+
+ <para>
+ The <structfield>silence</structfield> callback is also
+ implemented in a similar way.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int silence(struct snd_pcm_substream *substream, int channel,
+ snd_pcm_uframes_t pos, snd_pcm_uframes_t count);
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The meanings of arguments are the same as in the
+ <structfield>copy</structfield>
+ callback, although there is no <parameter>src/dst</parameter>
+ argument. In the case of interleaved samples, the channel
+ argument has no meaning, as well as on
+ <structfield>copy</structfield> callback.
+ </para>
+
+ <para>
+ The role of <structfield>silence</structfield> callback is to
+ set the given amount
+ (<parameter>count</parameter>) of silence data at the
+ specified offset (<parameter>pos</parameter>) on the hardware
+ buffer. Suppose that the data format is signed (that is, the
+ silent-data is 0), and the implementation using a memset-like
+ function would be like:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ my_memcpy(my_buffer + frames_to_bytes(runtime, pos), 0,
+ frames_to_bytes(runtime, count));
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ In the case of non-interleaved samples, again, the
+ implementation becomes a bit more complicated. See, for example,
+ <filename>isa/gus/gus_pcm.c</filename>.
+ </para>
+ </section>
+
+ <section id="buffer-and-memory-non-contiguous">
+ <title>Non-Contiguous Buffers</title>
+ <para>
+ If your hardware supports the page table as in emu10k1 or the
+ buffer descriptors as in via82xx, you can use the scatter-gather
+ (SG) DMA. ALSA provides an interface for handling SG-buffers.
+ The API is provided in <filename>&lt;sound/pcm.h&gt;</filename>.
+ </para>
+
+ <para>
+ For creating the SG-buffer handler, call
+ <function>snd_pcm_lib_preallocate_pages()</function> or
+ <function>snd_pcm_lib_preallocate_pages_for_all()</function>
+ with <constant>SNDRV_DMA_TYPE_DEV_SG</constant>
+ in the PCM constructor like other PCI pre-allocator.
+ You need to pass <function>snd_dma_pci_data(pci)</function>,
+ where pci is the struct <structname>pci_dev</structname> pointer
+ of the chip as well.
+ The <type>struct snd_sg_buf</type> instance is created as
+ substream-&gt;dma_private. You can cast
+ the pointer like:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_sg_buf *sgbuf = (struct snd_sg_buf *)substream->dma_private;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Then call <function>snd_pcm_lib_malloc_pages()</function>
+ in the <structfield>hw_params</structfield> callback
+ as well as in the case of normal PCI buffer.
+ The SG-buffer handler will allocate the non-contiguous kernel
+ pages of the given size and map them onto the virtually contiguous
+ memory. The virtual pointer is addressed in runtime-&gt;dma_area.
+ The physical address (runtime-&gt;dma_addr) is set to zero,
+ because the buffer is physically non-contigous.
+ The physical address table is set up in sgbuf-&gt;table.
+ You can get the physical address at a certain offset via
+ <function>snd_pcm_sgbuf_get_addr()</function>.
+ </para>
+
+ <para>
+ When a SG-handler is used, you need to set
+ <function>snd_pcm_sgbuf_ops_page</function> as
+ the <structfield>page</structfield> callback.
+ (See <link linkend="pcm-interface-operators-page-callback">
+ <citetitle>page callback section</citetitle></link>.)
+ </para>
+
+ <para>
+ To release the data, call
+ <function>snd_pcm_lib_free_pages()</function> in the
+ <structfield>hw_free</structfield> callback as usual.
+ </para>
+ </section>
+
+ <section id="buffer-and-memory-vmalloced">
+ <title>Vmalloc'ed Buffers</title>
+ <para>
+ It's possible to use a buffer allocated via
+ <function>vmalloc</function>, for example, for an intermediate
+ buffer. Since the allocated pages are not contiguous, you need
+ to set the <structfield>page</structfield> callback to obtain
+ the physical address at every offset.
+ </para>
+
+ <para>
+ The implementation of <structfield>page</structfield> callback
+ would be like this:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ #include <linux/vmalloc.h>
+
+ /* get the physical page pointer on the given offset */
+ static struct page *mychip_page(struct snd_pcm_substream *substream,
+ unsigned long offset)
+ {
+ void *pageptr = substream->runtime->dma_area + offset;
+ return vmalloc_to_page(pageptr);
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Proc Interface -->
+<!-- ****************************************************** -->
+ <chapter id="proc-interface">
+ <title>Proc Interface</title>
+ <para>
+ ALSA provides an easy interface for procfs. The proc files are
+ very useful for debugging. I recommend you set up proc files if
+ you write a driver and want to get a running status or register
+ dumps. The API is found in
+ <filename>&lt;sound/info.h&gt;</filename>.
+ </para>
+
+ <para>
+ To create a proc file, call
+ <function>snd_card_proc_new()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ struct snd_info_entry *entry;
+ int err = snd_card_proc_new(card, "my-file", &entry);
+]]>
+ </programlisting>
+ </informalexample>
+
+ where the second argument specifies the name of the proc file to be
+ created. The above example will create a file
+ <filename>my-file</filename> under the card directory,
+ e.g. <filename>/proc/asound/card0/my-file</filename>.
+ </para>
+
+ <para>
+ Like other components, the proc entry created via
+ <function>snd_card_proc_new()</function> will be registered and
+ released automatically in the card registration and release
+ functions.
+ </para>
+
+ <para>
+ When the creation is successful, the function stores a new
+ instance in the pointer given in the third argument.
+ It is initialized as a text proc file for read only. To use
+ this proc file as a read-only text file as it is, set the read
+ callback with a private data via
+ <function>snd_info_set_text_ops()</function>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_info_set_text_ops(entry, chip, my_proc_read);
+]]>
+ </programlisting>
+ </informalexample>
+
+ where the second argument (<parameter>chip</parameter>) is the
+ private data to be used in the callbacks. The third parameter
+ specifies the read buffer size and the fourth
+ (<parameter>my_proc_read</parameter>) is the callback function, which
+ is defined like
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static void my_proc_read(struct snd_info_entry *entry,
+ struct snd_info_buffer *buffer);
+]]>
+ </programlisting>
+ </informalexample>
+
+ </para>
+
+ <para>
+ In the read callback, use <function>snd_iprintf()</function> for
+ output strings, which works just like normal
+ <function>printf()</function>. For example,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static void my_proc_read(struct snd_info_entry *entry,
+ struct snd_info_buffer *buffer)
+ {
+ struct my_chip *chip = entry->private_data;
+
+ snd_iprintf(buffer, "This is my chip!\n");
+ snd_iprintf(buffer, "Port = %ld\n", chip->port);
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The file permissions can be changed afterwards. As default, it's
+ set as read only for all users. If you want to add write
+ permission for the user (root as default), do as follows:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ entry->mode = S_IFREG | S_IRUGO | S_IWUSR;
+]]>
+ </programlisting>
+ </informalexample>
+
+ and set the write buffer size and the callback
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ entry->c.text.write = my_proc_write;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ For the write callback, you can use
+ <function>snd_info_get_line()</function> to get a text line, and
+ <function>snd_info_get_str()</function> to retrieve a string from
+ the line. Some examples are found in
+ <filename>core/oss/mixer_oss.c</filename>, core/oss/and
+ <filename>pcm_oss.c</filename>.
+ </para>
+
+ <para>
+ For a raw-data proc-file, set the attributes as follows:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static struct snd_info_entry_ops my_file_io_ops = {
+ .read = my_file_io_read,
+ };
+
+ entry->content = SNDRV_INFO_CONTENT_DATA;
+ entry->private_data = chip;
+ entry->c.ops = &my_file_io_ops;
+ entry->size = 4096;
+ entry->mode = S_IFREG | S_IRUGO;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The callback is much more complicated than the text-file
+ version. You need to use a low-level I/O functions such as
+ <function>copy_from/to_user()</function> to transfer the
+ data.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static long my_file_io_read(struct snd_info_entry *entry,
+ void *file_private_data,
+ struct file *file,
+ char *buf,
+ unsigned long count,
+ unsigned long pos)
+ {
+ long size = count;
+ if (pos + size > local_max_size)
+ size = local_max_size - pos;
+ if (copy_to_user(buf, local_data + pos, size))
+ return -EFAULT;
+ return size;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Power Management -->
+<!-- ****************************************************** -->
+ <chapter id="power-management">
+ <title>Power Management</title>
+ <para>
+ If the chip is supposed to work with suspend/resume
+ functions, you need to add power-management code to the
+ driver. The additional code for power-management should be
+ <function>ifdef</function>'ed with
+ <constant>CONFIG_PM</constant>.
+ </para>
+
+ <para>
+ If the driver <emphasis>fully</emphasis> supports suspend/resume
+ that is, the device can be
+ properly resumed to its state when suspend was called,
+ you can set the <constant>SNDRV_PCM_INFO_RESUME</constant> flag
+ in the pcm info field. Usually, this is possible when the
+ registers of the chip can be safely saved and restored to
+ RAM. If this is set, the trigger callback is called with
+ <constant>SNDRV_PCM_TRIGGER_RESUME</constant> after the resume
+ callback completes.
+ </para>
+
+ <para>
+ Even if the driver doesn't support PM fully but
+ partial suspend/resume is still possible, it's still worthy to
+ implement suspend/resume callbacks. In such a case, applications
+ would reset the status by calling
+ <function>snd_pcm_prepare()</function> and restart the stream
+ appropriately. Hence, you can define suspend/resume callbacks
+ below but don't set <constant>SNDRV_PCM_INFO_RESUME</constant>
+ info flag to the PCM.
+ </para>
+
+ <para>
+ Note that the trigger with SUSPEND can always be called when
+ <function>snd_pcm_suspend_all</function> is called,
+ regardless of the <constant>SNDRV_PCM_INFO_RESUME</constant> flag.
+ The <constant>RESUME</constant> flag affects only the behavior
+ of <function>snd_pcm_resume()</function>.
+ (Thus, in theory,
+ <constant>SNDRV_PCM_TRIGGER_RESUME</constant> isn't needed
+ to be handled in the trigger callback when no
+ <constant>SNDRV_PCM_INFO_RESUME</constant> flag is set. But,
+ it's better to keep it for compatibility reasons.)
+ </para>
+ <para>
+ In the earlier version of ALSA drivers, a common
+ power-management layer was provided, but it has been removed.
+ The driver needs to define the suspend/resume hooks according to
+ the bus the device is connected to. In the case of PCI drivers, the
+ callbacks look like below:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ #ifdef CONFIG_PM
+ static int snd_my_suspend(struct pci_dev *pci, pm_message_t state)
+ {
+ .... /* do things for suspend */
+ return 0;
+ }
+ static int snd_my_resume(struct pci_dev *pci)
+ {
+ .... /* do things for suspend */
+ return 0;
+ }
+ #endif
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The scheme of the real suspend job is as follows.
+
+ <orderedlist>
+ <listitem><para>Retrieve the card and the chip data.</para></listitem>
+ <listitem><para>Call <function>snd_power_change_state()</function> with
+ <constant>SNDRV_CTL_POWER_D3hot</constant> to change the
+ power status.</para></listitem>
+ <listitem><para>Call <function>snd_pcm_suspend_all()</function> to suspend the running PCM streams.</para></listitem>
+ <listitem><para>If AC97 codecs are used, call
+ <function>snd_ac97_suspend()</function> for each codec.</para></listitem>
+ <listitem><para>Save the register values if necessary.</para></listitem>
+ <listitem><para>Stop the hardware if necessary.</para></listitem>
+ <listitem><para>Disable the PCI device by calling
+ <function>pci_disable_device()</function>. Then, call
+ <function>pci_save_state()</function> at last.</para></listitem>
+ </orderedlist>
+ </para>
+
+ <para>
+ A typical code would be like:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int mychip_suspend(struct pci_dev *pci, pm_message_t state)
+ {
+ /* (1) */
+ struct snd_card *card = pci_get_drvdata(pci);
+ struct mychip *chip = card->private_data;
+ /* (2) */
+ snd_power_change_state(card, SNDRV_CTL_POWER_D3hot);
+ /* (3) */
+ snd_pcm_suspend_all(chip->pcm);
+ /* (4) */
+ snd_ac97_suspend(chip->ac97);
+ /* (5) */
+ snd_mychip_save_registers(chip);
+ /* (6) */
+ snd_mychip_stop_hardware(chip);
+ /* (7) */
+ pci_disable_device(pci);
+ pci_save_state(pci);
+ return 0;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The scheme of the real resume job is as follows.
+
+ <orderedlist>
+ <listitem><para>Retrieve the card and the chip data.</para></listitem>
+ <listitem><para>Set up PCI. First, call <function>pci_restore_state()</function>.
+ Then enable the pci device again by calling <function>pci_enable_device()</function>.
+ Call <function>pci_set_master()</function> if necessary, too.</para></listitem>
+ <listitem><para>Re-initialize the chip.</para></listitem>
+ <listitem><para>Restore the saved registers if necessary.</para></listitem>
+ <listitem><para>Resume the mixer, e.g. calling
+ <function>snd_ac97_resume()</function>.</para></listitem>
+ <listitem><para>Restart the hardware (if any).</para></listitem>
+ <listitem><para>Call <function>snd_power_change_state()</function> with
+ <constant>SNDRV_CTL_POWER_D0</constant> to notify the processes.</para></listitem>
+ </orderedlist>
+ </para>
+
+ <para>
+ A typical code would be like:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int mychip_resume(struct pci_dev *pci)
+ {
+ /* (1) */
+ struct snd_card *card = pci_get_drvdata(pci);
+ struct mychip *chip = card->private_data;
+ /* (2) */
+ pci_restore_state(pci);
+ pci_enable_device(pci);
+ pci_set_master(pci);
+ /* (3) */
+ snd_mychip_reinit_chip(chip);
+ /* (4) */
+ snd_mychip_restore_registers(chip);
+ /* (5) */
+ snd_ac97_resume(chip->ac97);
+ /* (6) */
+ snd_mychip_restart_chip(chip);
+ /* (7) */
+ snd_power_change_state(card, SNDRV_CTL_POWER_D0);
+ return 0;
+ }
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ As shown in the above, it's better to save registers after
+ suspending the PCM operations via
+ <function>snd_pcm_suspend_all()</function> or
+ <function>snd_pcm_suspend()</function>. It means that the PCM
+ streams are already stoppped when the register snapshot is
+ taken. But, remember that you don't have to restart the PCM
+ stream in the resume callback. It'll be restarted via
+ trigger call with <constant>SNDRV_PCM_TRIGGER_RESUME</constant>
+ when necessary.
+ </para>
+
+ <para>
+ OK, we have all callbacks now. Let's set them up. In the
+ initialization of the card, make sure that you can get the chip
+ data from the card instance, typically via
+ <structfield>private_data</structfield> field, in case you
+ created the chip data individually.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int __devinit snd_mychip_probe(struct pci_dev *pci,
+ const struct pci_device_id *pci_id)
+ {
+ ....
+ struct snd_card *card;
+ struct mychip *chip;
+ ....
+ card = snd_card_new(index[dev], id[dev], THIS_MODULE, NULL);
+ ....
+ chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+ ....
+ card->private_data = chip;
+ ....
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ When you created the chip data with
+ <function>snd_card_new()</function>, it's anyway accessible
+ via <structfield>private_data</structfield> field.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int __devinit snd_mychip_probe(struct pci_dev *pci,
+ const struct pci_device_id *pci_id)
+ {
+ ....
+ struct snd_card *card;
+ struct mychip *chip;
+ ....
+ card = snd_card_new(index[dev], id[dev], THIS_MODULE,
+ sizeof(struct mychip));
+ ....
+ chip = card->private_data;
+ ....
+ }
+]]>
+ </programlisting>
+ </informalexample>
+
+ </para>
+
+ <para>
+ If you need a space to save the registers, allocate the
+ buffer for it here, too, since it would be fatal
+ if you cannot allocate a memory in the suspend phase.
+ The allocated buffer should be released in the corresponding
+ destructor.
+ </para>
+
+ <para>
+ And next, set suspend/resume callbacks to the pci_driver.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static struct pci_driver driver = {
+ .name = "My Chip",
+ .id_table = snd_my_ids,
+ .probe = snd_my_probe,
+ .remove = __devexit_p(snd_my_remove),
+ #ifdef CONFIG_PM
+ .suspend = snd_my_suspend,
+ .resume = snd_my_resume,
+ #endif
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Module Parameters -->
+<!-- ****************************************************** -->
+ <chapter id="module-parameters">
+ <title>Module Parameters</title>
+ <para>
+ There are standard module options for ALSA. At least, each
+ module should have the <parameter>index</parameter>,
+ <parameter>id</parameter> and <parameter>enable</parameter>
+ options.
+ </para>
+
+ <para>
+ If the module supports multiple cards (usually up to
+ 8 = <constant>SNDRV_CARDS</constant> cards), they should be
+ arrays. The default initial values are defined already as
+ constants for easier programming:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
+ static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
+ static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP;
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ If the module supports only a single card, they could be single
+ variables, instead. <parameter>enable</parameter> option is not
+ always necessary in this case, but it would be better to have a
+ dummy option for compatibility.
+ </para>
+
+ <para>
+ The module parameters must be declared with the standard
+ <function>module_param()()</function>,
+ <function>module_param_array()()</function> and
+ <function>MODULE_PARM_DESC()</function> macros.
+ </para>
+
+ <para>
+ The typical coding would be like below:
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ #define CARD_NAME "My Chip"
+
+ module_param_array(index, int, NULL, 0444);
+ MODULE_PARM_DESC(index, "Index value for " CARD_NAME " soundcard.");
+ module_param_array(id, charp, NULL, 0444);
+ MODULE_PARM_DESC(id, "ID string for " CARD_NAME " soundcard.");
+ module_param_array(enable, bool, NULL, 0444);
+ MODULE_PARM_DESC(enable, "Enable " CARD_NAME " soundcard.");
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ Also, don't forget to define the module description, classes,
+ license and devices. Especially, the recent modprobe requires to
+ define the module license as GPL, etc., otherwise the system is
+ shown as <quote>tainted</quote>.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ MODULE_DESCRIPTION("My Chip");
+ MODULE_LICENSE("GPL");
+ MODULE_SUPPORTED_DEVICE("{{Vendor,My Chip Name}}");
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- How To Put Your Driver -->
+<!-- ****************************************************** -->
+ <chapter id="how-to-put-your-driver">
+ <title>How To Put Your Driver Into ALSA Tree</title>
+ <section>
+ <title>General</title>
+ <para>
+ So far, you've learned how to write the driver codes.
+ And you might have a question now: how to put my own
+ driver into the ALSA driver tree?
+ Here (finally :) the standard procedure is described briefly.
+ </para>
+
+ <para>
+ Suppose that you create a new PCI driver for the card
+ <quote>xyz</quote>. The card module name would be
+ snd-xyz. The new driver is usually put into the alsa-driver
+ tree, <filename>alsa-driver/pci</filename> directory in
+ the case of PCI cards.
+ Then the driver is evaluated, audited and tested
+ by developers and users. After a certain time, the driver
+ will go to the alsa-kernel tree (to the corresponding directory,
+ such as <filename>alsa-kernel/pci</filename>) and eventually
+ will be integrated into the Linux 2.6 tree (the directory would be
+ <filename>linux/sound/pci</filename>).
+ </para>
+
+ <para>
+ In the following sections, the driver code is supposed
+ to be put into alsa-driver tree. The two cases are covered:
+ a driver consisting of a single source file and one consisting
+ of several source files.
+ </para>
+ </section>
+
+ <section>
+ <title>Driver with A Single Source File</title>
+ <para>
+ <orderedlist>
+ <listitem>
+ <para>
+ Modify alsa-driver/pci/Makefile
+ </para>
+
+ <para>
+ Suppose you have a file xyz.c. Add the following
+ two lines
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd-xyz-objs := xyz.o
+ obj-$(CONFIG_SND_XYZ) += snd-xyz.o
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Create the Kconfig entry
+ </para>
+
+ <para>
+ Add the new entry of Kconfig for your xyz driver.
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ config SND_XYZ
+ tristate "Foobar XYZ"
+ depends on SND
+ select SND_PCM
+ help
+ Say Y here to include support for Foobar XYZ soundcard.
+
+ To compile this driver as a module, choose M here: the module
+ will be called snd-xyz.
+]]>
+ </programlisting>
+ </informalexample>
+
+ the line, select SND_PCM, specifies that the driver xyz supports
+ PCM. In addition to SND_PCM, the following components are
+ supported for select command:
+ SND_RAWMIDI, SND_TIMER, SND_HWDEP, SND_MPU401_UART,
+ SND_OPL3_LIB, SND_OPL4_LIB, SND_VX_LIB, SND_AC97_CODEC.
+ Add the select command for each supported component.
+ </para>
+
+ <para>
+ Note that some selections imply the lowlevel selections.
+ For example, PCM includes TIMER, MPU401_UART includes RAWMIDI,
+ AC97_CODEC includes PCM, and OPL3_LIB includes HWDEP.
+ You don't need to give the lowlevel selections again.
+ </para>
+
+ <para>
+ For the details of Kconfig script, refer to the kbuild
+ documentation.
+ </para>
+
+ </listitem>
+
+ <listitem>
+ <para>
+ Run cvscompile script to re-generate the configure script and
+ build the whole stuff again.
+ </para>
+ </listitem>
+ </orderedlist>
+ </para>
+ </section>
+
+ <section>
+ <title>Drivers with Several Source Files</title>
+ <para>
+ Suppose that the driver snd-xyz have several source files.
+ They are located in the new subdirectory,
+ pci/xyz.
+
+ <orderedlist>
+ <listitem>
+ <para>
+ Add a new directory (<filename>xyz</filename>) in
+ <filename>alsa-driver/pci/Makefile</filename> as below
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ obj-$(CONFIG_SND) += xyz/
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Under the directory <filename>xyz</filename>, create a Makefile
+
+ <example>
+ <title>Sample Makefile for a driver xyz</title>
+ <programlisting>
+<![CDATA[
+ ifndef SND_TOPDIR
+ SND_TOPDIR=../..
+ endif
+
+ include $(SND_TOPDIR)/toplevel.config
+ include $(SND_TOPDIR)/Makefile.conf
+
+ snd-xyz-objs := xyz.o abc.o def.o
+
+ obj-$(CONFIG_SND_XYZ) += snd-xyz.o
+
+ include $(SND_TOPDIR)/Rules.make
+]]>
+ </programlisting>
+ </example>
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Create the Kconfig entry
+ </para>
+
+ <para>
+ This procedure is as same as in the last section.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Run cvscompile script to re-generate the configure script and
+ build the whole stuff again.
+ </para>
+ </listitem>
+ </orderedlist>
+ </para>
+ </section>
+
+ </chapter>
+
+<!-- ****************************************************** -->
+<!-- Useful Functions -->
+<!-- ****************************************************** -->
+ <chapter id="useful-functions">
+ <title>Useful Functions</title>
+
+ <section id="useful-functions-snd-printk">
+ <title><function>snd_printk()</function> and friends</title>
+ <para>
+ ALSA provides a verbose version of the
+ <function>printk()</function> function. If a kernel config
+ <constant>CONFIG_SND_VERBOSE_PRINTK</constant> is set, this
+ function prints the given message together with the file name
+ and the line of the caller. The <constant>KERN_XXX</constant>
+ prefix is processed as
+ well as the original <function>printk()</function> does, so it's
+ recommended to add this prefix, e.g.
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_printk(KERN_ERR "Oh my, sorry, it's extremely bad!\n");
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ There are also <function>printk()</function>'s for
+ debugging. <function>snd_printd()</function> can be used for
+ general debugging purposes. If
+ <constant>CONFIG_SND_DEBUG</constant> is set, this function is
+ compiled, and works just like
+ <function>snd_printk()</function>. If the ALSA is compiled
+ without the debugging flag, it's ignored.
+ </para>
+
+ <para>
+ <function>snd_printdd()</function> is compiled in only when
+ <constant>CONFIG_SND_DEBUG_VERBOSE</constant> is set. Please note
+ that <constant>CONFIG_SND_DEBUG_VERBOSE</constant> is not set as default
+ even if you configure the alsa-driver with
+ <option>--with-debug=full</option> option. You need to give
+ explicitly <option>--with-debug=detect</option> option instead.
+ </para>
+ </section>
+
+ <section id="useful-functions-snd-bug">
+ <title><function>snd_BUG()</function></title>
+ <para>
+ It shows the <computeroutput>BUG?</computeroutput> message and
+ stack trace as well as <function>snd_BUG_ON</function> at the point.
+ It's useful to show that a fatal error happens there.
+ </para>
+ <para>
+ When no debug flag is set, this macro is ignored.
+ </para>
+ </section>
+
+ <section id="useful-functions-snd-bug-on">
+ <title><function>snd_BUG_ON()</function></title>
+ <para>
+ <function>snd_BUG_ON()</function> macro is similar with
+ <function>WARN_ON()</function> macro. For example,
+
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ snd_BUG_ON(!pointer);
+]]>
+ </programlisting>
+ </informalexample>
+
+ or it can be used as the condition,
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ if (snd_BUG_ON(non_zero_is_bug))
+ return -EINVAL;
+]]>
+ </programlisting>
+ </informalexample>
+
+ </para>
+
+ <para>
+ The macro takes an conditional expression to evaluate.
+ When <constant>CONFIG_SND_DEBUG</constant>, is set, the
+ expression is actually evaluated. If it's non-zero, it shows
+ the warning message such as
+ <computeroutput>BUG? (xxx)</computeroutput>
+ normally followed by stack trace. It returns the evaluated
+ value.
+ When no <constant>CONFIG_SND_DEBUG</constant> is set, this
+ macro always returns zero.
+ </para>
+
+ </section>
+
+ </chapter>
+
+
+<!-- ****************************************************** -->
+<!-- Acknowledgments -->
+<!-- ****************************************************** -->
+ <chapter id="acknowledgments">
+ <title>Acknowledgments</title>
+ <para>
+ I would like to thank Phil Kerr for his help for improvement and
+ corrections of this document.
+ </para>
+ <para>
+ Kevin Conder reformatted the original plain-text to the
+ DocBook format.
+ </para>
+ <para>
+ Giuliano Pochini corrected typos and contributed the example codes
+ in the hardware constraints section.
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/sound/alsa/Joystick.txt b/Documentation/sound/alsa/Joystick.txt
new file mode 100644
index 0000000..ccda41b
--- /dev/null
+++ b/Documentation/sound/alsa/Joystick.txt
@@ -0,0 +1,86 @@
+Analog Joystick Support on ALSA Drivers
+=======================================
+ Oct. 14, 2003
+ Takashi Iwai <tiwai@suse.de>
+
+General
+-------
+
+First of all, you need to enable GAMEPORT support on Linux kernel for
+using a joystick with the ALSA driver. For the details of gameport
+support, refer to Documentation/input/joystick.txt.
+
+The joystick support of ALSA drivers is different between ISA and PCI
+cards. In the case of ISA (PnP) cards, it's usually handled by the
+independent module (ns558). Meanwhile, the ALSA PCI drivers have the
+built-in gameport support. Hence, when the ALSA PCI driver is built
+in the kernel, CONFIG_GAMEPORT must be 'y', too. Otherwise, the
+gameport support on that card will be (silently) disabled.
+
+Some adapter modules probe the physical connection of the device at
+the load time. It'd be safer to plug in the joystick device before
+loading the module.
+
+
+PCI Cards
+---------
+
+For PCI cards, the joystick is enabled when the appropriate module
+option is specified. Some drivers don't need options, and the
+joystick support is always enabled. In the former ALSA version, there
+was a dynamic control API for the joystick activation. It was
+changed, however, to the static module options because of the system
+stability and the resource management.
+
+The following PCI drivers support the joystick natively.
+
+ Driver Module Option Available Values
+ ---------------------------------------------------------------------------
+ als4000 joystick_port 0 = disable (default), 1 = auto-detect,
+ manual: any address (e.g. 0x200)
+ au88x0 N/A N/A
+ azf3328 joystick 0 = disable, 1 = enable, -1 = auto (default)
+ ens1370 joystick 0 = disable (default), 1 = enable
+ ens1371 joystick_port 0 = disable (default), 1 = auto-detect,
+ manual: 0x200, 0x208, 0x210, 0x218
+ cmipci joystick_port 0 = disable (default), 1 = auto-detect,
+ manual: any address (e.g. 0x200)
+ cs4281 N/A N/A
+ cs46xx N/A N/A
+ es1938 N/A N/A
+ es1968 joystick 0 = disable (default), 1 = enable
+ sonicvibes N/A N/A
+ trident N/A N/A
+ via82xx(*1) joystick 0 = disable (default), 1 = enable
+ ymfpci joystick_port 0 = disable (default), 1 = auto-detect,
+ manual: 0x201, 0x202, 0x204, 0x205(*2)
+ ---------------------------------------------------------------------------
+
+ *1) VIA686A/B only
+ *2) With YMF744/754 chips, the port address can be chosen arbitrarily
+
+The following drivers don't support gameport natively, but there are
+additional modules. Load the corresponding module to add the gameport
+support.
+
+ Driver Additional Module
+ -----------------------------
+ emu10k1 emu10k1-gp
+ fm801 fm801-gp
+ -----------------------------
+
+Note: the "pcigame" and "cs461x" modules are for the OSS drivers only.
+ These ALSA drivers (cs46xx, trident and au88x0) have the
+ built-in gameport support.
+
+As mentioned above, ALSA PCI drivers have the built-in gameport
+support, so you don't have to load ns558 module. Just load "joydev"
+and the appropriate adapter module (e.g. "analog").
+
+
+ISA Cards
+---------
+
+ALSA ISA drivers don't have the built-in gameport support.
+Instead, you need to load "ns558" module in addition to "joydev" and
+the adapter module (e.g. "analog").
diff --git a/Documentation/sound/alsa/MIXART.txt b/Documentation/sound/alsa/MIXART.txt
new file mode 100644
index 0000000..ef42c44
--- /dev/null
+++ b/Documentation/sound/alsa/MIXART.txt
@@ -0,0 +1,100 @@
+ Alsa driver for Digigram miXart8 and miXart8AES/EBU soundcards
+ Digigram <alsa@digigram.com>
+
+
+GENERAL
+=======
+
+The miXart8 is a multichannel audio processing and mixing soundcard
+that has 4 stereo audio inputs and 4 stereo audio outputs.
+The miXart8AES/EBU is the same with a add-on card that offers further
+4 digital stereo audio inputs and outputs.
+Furthermore the add-on card offers external clock synchronisation
+(AES/EBU, Word Clock, Time Code and Video Synchro)
+
+The mainboard has a PowerPC that offers onboard mpeg encoding and
+decoding, samplerate conversions and various effects.
+
+The driver don't work properly at all until the certain firmwares
+are loaded, i.e. no PCM nor mixer devices will appear.
+Use the mixartloader that can be found in the alsa-tools package.
+
+
+VERSION 0.1.0
+=============
+
+One miXart8 board will be represented as 4 alsa cards, each with 1
+stereo analog capture 'pcm0c' and 1 stereo analog playback 'pcm0p' device.
+With a miXart8AES/EBU there is in addition 1 stereo digital input
+'pcm1c' and 1 stereo digital output 'pcm1p' per card.
+
+Formats
+-------
+U8, S16_LE, S16_BE, S24_3LE, S24_3BE, FLOAT_LE, FLOAT_BE
+Sample rates : 8000 - 48000 Hz continuously
+
+Playback
+--------
+For instance the playback devices are configured to have max. 4
+substreams performing hardware mixing. This could be changed to a
+maximum of 24 substreams if wished.
+Mono files will be played on the left and right channel. Each channel
+can be muted for each stream to use 8 analog/digital outputs separately.
+
+Capture
+-------
+There is one substream per capture device. For instance only stereo
+formats are supported.
+
+Mixer
+-----
+<Master> and <Master Capture> : analog volume control of playback and capture PCM.
+<PCM 0-3> and <PCM Capture> : digital volume control of each analog substream.
+<AES 0-3> and <AES Capture> : digital volume control of each AES/EBU substream.
+<Monitoring> : Loopback from 'pcm0c' to 'pcm0p' with digital volume
+and mute control.
+
+Rem : for best audio quality try to keep a 0 attenuation on the PCM
+and AES volume controls which is set by 219 in the range from 0 to 255
+(about 86% with alsamixer)
+
+
+NOT YET IMPLEMENTED
+===================
+
+- external clock support (AES/EBU, Word Clock, Time Code, Video Sync)
+- MPEG audio formats
+- mono record
+- on-board effects and samplerate conversions
+- linked streams
+
+
+FIRMWARE
+========
+
+[As of 2.6.11, the firmware can be loaded automatically with hotplug
+ when CONFIG_FW_LOADER is set. The mixartloader is necessary only
+ for older versions or when you build the driver into kernel.]
+
+For loading the firmware automatically after the module is loaded, use
+the post-install command. For example, add the following entry to
+/etc/modprobe.conf for miXart driver:
+
+ install snd-mixart /sbin/modprobe --first-time -i snd-mixart && \
+ /usr/bin/mixartloader
+(for 2.2/2.4 kernels, add "post-install snd-mixart /usr/bin/vxloader" to
+ /etc/modules.conf, instead.)
+
+The firmware binaries are installed on /usr/share/alsa/firmware
+(or /usr/local/share/alsa/firmware, depending to the prefix option of
+configure). There will be a miXart.conf file, which define the dsp image
+files.
+
+The firmware files are copyright by Digigram SA
+
+
+COPYRIGHT
+=========
+
+Copyright (c) 2003 Digigram SA <alsa@digigram.com>
+Distributable under GPL.
diff --git a/Documentation/sound/alsa/OSS-Emulation.txt b/Documentation/sound/alsa/OSS-Emulation.txt
new file mode 100644
index 0000000..022aaeb
--- /dev/null
+++ b/Documentation/sound/alsa/OSS-Emulation.txt
@@ -0,0 +1,305 @@
+ NOTES ON KERNEL OSS-EMULATION
+ =============================
+
+ Jan. 22, 2004 Takashi Iwai <tiwai@suse.de>
+
+
+Modules
+=======
+
+ALSA provides a powerful OSS emulation on the kernel.
+The OSS emulation for PCM, mixer and sequencer devices is implemented
+as add-on kernel modules, snd-pcm-oss, snd-mixer-oss and snd-seq-oss.
+When you need to access the OSS PCM, mixer or sequencer devices, the
+corresponding module has to be loaded.
+
+These modules are loaded automatically when the corresponding service
+is called. The alias is defined sound-service-x-y, where x and y are
+the card number and the minor unit number. Usually you don't have to
+define these aliases by yourself.
+
+Only necessary step for auto-loading of OSS modules is to define the
+card alias in /etc/modprobe.conf, such as
+
+ alias sound-slot-0 snd-emu10k1
+
+As the second card, define sound-slot-1 as well.
+Note that you can't use the aliased name as the target name (i.e.
+"alias sound-slot-0 snd-card-0" doesn't work any more like the old
+modutils).
+
+The currently available OSS configuration is shown in
+/proc/asound/oss/sndstat. This shows in the same syntax of
+/dev/sndstat, which is available on the commercial OSS driver.
+On ALSA, you can symlink /dev/sndstat to this proc file.
+
+Please note that the devices listed in this proc file appear only
+after the corresponding OSS-emulation module is loaded. Don't worry
+even if "NOT ENABLED IN CONFIG" is shown in it.
+
+
+Device Mapping
+==============
+
+ALSA supports the following OSS device files:
+
+ PCM:
+ /dev/dspX
+ /dev/adspX
+
+ Mixer:
+ /dev/mixerX
+
+ MIDI:
+ /dev/midi0X
+ /dev/amidi0X
+
+ Sequencer:
+ /dev/sequencer
+ /dev/sequencer2 (aka /dev/music)
+
+where X is the card number from 0 to 7.
+
+(NOTE: Some distributions have the device files like /dev/midi0 and
+ /dev/midi1. They are NOT for OSS but for tclmidi, which is
+ a totally different thing.)
+
+Unlike the real OSS, ALSA cannot use the device files more than the
+assigned ones. For example, the first card cannot use /dev/dsp1 or
+/dev/dsp2, but only /dev/dsp0 and /dev/adsp0.
+
+As seen above, PCM and MIDI may have two devices. Usually, the first
+PCM device (hw:0,0 in ALSA) is mapped to /dev/dsp and the secondary
+device (hw:0,1) to /dev/adsp (if available). For MIDI, /dev/midi and
+/dev/amidi, respectively.
+
+You can change this device mapping via the module options of
+snd-pcm-oss and snd-rawmidi. In the case of PCM, the following
+options are available for snd-pcm-oss:
+
+ dsp_map PCM device number assigned to /dev/dspX
+ (default = 0)
+ adsp_map PCM device number assigned to /dev/adspX
+ (default = 1)
+
+For example, to map the third PCM device (hw:0,2) to /dev/adsp0,
+define like this:
+
+ options snd-pcm-oss adsp_map=2
+
+The options take arrays. For configuring the second card, specify
+two entries separated by comma. For example, to map the third PCM
+device on the second card to /dev/adsp1, define like below:
+
+ options snd-pcm-oss adsp_map=0,2
+
+To change the mapping of MIDI devices, the following options are
+available for snd-rawmidi:
+
+ midi_map MIDI device number assigned to /dev/midi0X
+ (default = 0)
+ amidi_map MIDI device number assigned to /dev/amidi0X
+ (default = 1)
+
+For example, to assign the third MIDI device on the first card to
+/dev/midi00, define as follows:
+
+ options snd-rawmidi midi_map=2
+
+
+PCM Mode
+========
+
+As default, ALSA emulates the OSS PCM with so-called plugin layer,
+i.e. tries to convert the sample format, rate or channels
+automatically when the card doesn't support it natively.
+This will lead to some problems for some applications like quake or
+wine, especially if they use the card only in the MMAP mode.
+
+In such a case, you can change the behavior of PCM per application by
+writing a command to the proc file. There is a proc file for each PCM
+stream, /proc/asound/cardX/pcmY[cp]/oss, where X is the card number
+(zero-based), Y the PCM device number (zero-based), and 'p' is for
+playback and 'c' for capture, respectively. Note that this proc file
+exists only after snd-pcm-oss module is loaded.
+
+The command sequence has the following syntax:
+
+ app_name fragments fragment_size [options]
+
+app_name is the name of application with (higher priority) or without
+path.
+fragments specifies the number of fragments or zero if no specific
+number is given.
+fragment_size is the size of fragment in bytes or zero if not given.
+options is the optional parameters. The following options are
+available:
+
+ disable the application tries to open a pcm device for
+ this channel but does not want to use it.
+ direct don't use plugins
+ block force block open mode
+ non-block force non-block open mode
+ partial-frag write also partial fragments (affects playback only)
+ no-silence do not fill silence ahead to avoid clicks
+
+The disable option is useful when one stream direction (playback or
+capture) is not handled correctly by the application although the
+hardware itself does support both directions.
+The direct option is used, as mentioned above, to bypass the automatic
+conversion and useful for MMAP-applications.
+For example, to playback the first PCM device without plugins for
+quake, send a command via echo like the following:
+
+ % echo "quake 0 0 direct" > /proc/asound/card0/pcm0p/oss
+
+While quake wants only playback, you may append the second command
+to notify driver that only this direction is about to be allocated:
+
+ % echo "quake 0 0 disable" > /proc/asound/card0/pcm0c/oss
+
+The permission of proc files depend on the module options of snd.
+As default it's set as root, so you'll likely need to be superuser for
+sending the command above.
+
+The block and non-block options are used to change the behavior of
+opening the device file.
+
+As default, ALSA behaves as original OSS drivers, i.e. does not block
+the file when it's busy. The -EBUSY error is returned in this case.
+
+This blocking behavior can be changed globally via nonblock_open
+module option of snd-pcm-oss. For using the blocking mode as default
+for OSS devices, define like the following:
+
+ options snd-pcm-oss nonblock_open=0
+
+The partial-frag and no-silence commands have been added recently.
+Both commands are for optimization use only. The former command
+specifies to invoke the write transfer only when the whole fragment is
+filled. The latter stops writing the silence data ahead
+automatically. Both are disabled as default.
+
+You can check the currently defined configuration by reading the proc
+file. The read image can be sent to the proc file again, hence you
+can save the current configuration
+
+ % cat /proc/asound/card0/pcm0p/oss > /somewhere/oss-cfg
+
+and restore it like
+
+ % cat /somewhere/oss-cfg > /proc/asound/card0/pcm0p/oss
+
+Also, for clearing all the current configuration, send "erase" command
+as below:
+
+ % echo "erase" > /proc/asound/card0/pcm0p/oss
+
+
+Mixer Elements
+==============
+
+Since ALSA has completely different mixer interface, the emulation of
+OSS mixer is relatively complicated. ALSA builds up a mixer element
+from several different ALSA (mixer) controls based on the name
+string. For example, the volume element SOUND_MIXER_PCM is composed
+from "PCM Playback Volume" and "PCM Playback Switch" controls for the
+playback direction and from "PCM Capture Volume" and "PCM Capture
+Switch" for the capture directory (if exists). When the PCM volume of
+OSS is changed, all the volume and switch controls above are adjusted
+automatically.
+
+As default, ALSA uses the following control for OSS volumes:
+
+ OSS volume ALSA control Index
+ -----------------------------------------------------
+ SOUND_MIXER_VOLUME Master 0
+ SOUND_MIXER_BASS Tone Control - Bass 0
+ SOUND_MIXER_TREBLE Tone Control - Treble 0
+ SOUND_MIXER_SYNTH Synth 0
+ SOUND_MIXER_PCM PCM 0
+ SOUND_MIXER_SPEAKER PC Speaker 0
+ SOUND_MIXER_LINE Line 0
+ SOUND_MIXER_MIC Mic 0
+ SOUND_MIXER_CD CD 0
+ SOUND_MIXER_IMIX Monitor Mix 0
+ SOUND_MIXER_ALTPCM PCM 1
+ SOUND_MIXER_RECLEV (not assigned)
+ SOUND_MIXER_IGAIN Capture 0
+ SOUND_MIXER_OGAIN Playback 0
+ SOUND_MIXER_LINE1 Aux 0
+ SOUND_MIXER_LINE2 Aux 1
+ SOUND_MIXER_LINE3 Aux 2
+ SOUND_MIXER_DIGITAL1 Digital 0
+ SOUND_MIXER_DIGITAL2 Digital 1
+ SOUND_MIXER_DIGITAL3 Digital 2
+ SOUND_MIXER_PHONEIN Phone 0
+ SOUND_MIXER_PHONEOUT Phone 1
+ SOUND_MIXER_VIDEO Video 0
+ SOUND_MIXER_RADIO Radio 0
+ SOUND_MIXER_MONITOR Monitor 0
+
+The second column is the base-string of the corresponding ALSA
+control. In fact, the controls with "XXX [Playback|Capture]
+[Volume|Switch]" will be checked in addition.
+
+The current assignment of these mixer elements is listed in the proc
+file, /proc/asound/cardX/oss_mixer, which will be like the following
+
+ VOLUME "Master" 0
+ BASS "" 0
+ TREBLE "" 0
+ SYNTH "" 0
+ PCM "PCM" 0
+ ...
+
+where the first column is the OSS volume element, the second column
+the base-string of the corresponding ALSA control, and the third the
+control index. When the string is empty, it means that the
+corresponding OSS control is not available.
+
+For changing the assignment, you can write the configuration to this
+proc file. For example, to map "Wave Playback" to the PCM volume,
+send the command like the following:
+
+ % echo 'VOLUME "Wave Playback" 0' > /proc/asound/card0/oss_mixer
+
+The command is exactly as same as listed in the proc file. You can
+change one or more elements, one volume per line. In the last
+example, both "Wave Playback Volume" and "Wave Playback Switch" will
+be affected when PCM volume is changed.
+
+Like the case of PCM proc file, the permission of proc files depend on
+the module options of snd. you'll likely need to be superuser for
+sending the command above.
+
+As well as in the case of PCM proc file, you can save and restore the
+current mixer configuration by reading and writing the whole file
+image.
+
+
+Duplex Streams
+==============
+
+Note that when attempting to use a single device file for playback and
+capture, the OSS API provides no way to set the format, sample rate or
+number of channels different in each direction. Thus
+ io_handle = open("device", O_RDWR)
+will only function correctly if the values are the same in each direction.
+
+To use different values in the two directions, use both
+ input_handle = open("device", O_RDONLY)
+ output_handle = open("device", O_WRONLY)
+and set the values for the corresponding handle.
+
+
+Unsupported Features
+====================
+
+MMAP on ICE1712 driver
+----------------------
+ICE1712 supports only the unconventional format, interleaved
+10-channels 24bit (packed in 32bit) format. Therefore you cannot mmap
+the buffer as the conventional (mono or 2-channels, 8 or 16bit) format
+on OSS.
+
diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt
new file mode 100644
index 0000000..f738b29
--- /dev/null
+++ b/Documentation/sound/alsa/Procfile.txt
@@ -0,0 +1,207 @@
+ Proc Files of ALSA Drivers
+ ==========================
+ Takashi Iwai <tiwai@suse.de>
+
+General
+-------
+
+ALSA has its own proc tree, /proc/asound. Many useful information are
+found in this tree. When you encounter a problem and need debugging,
+check the files listed in the following sections.
+
+Each card has its subtree cardX, where X is from 0 to 7. The
+card-specific files are stored in the card* subdirectories.
+
+
+Global Information
+------------------
+
+cards
+ Shows the list of currently configured ALSA drivers,
+ index, the id string, short and long descriptions.
+
+version
+ Shows the version string and compile date.
+
+modules
+ Lists the module of each card
+
+devices
+ Lists the ALSA native device mappings.
+
+meminfo
+ Shows the status of allocated pages via ALSA drivers.
+ Appears only when CONFIG_SND_DEBUG=y.
+
+hwdep
+ Lists the currently available hwdep devices in format of
+ <card>-<device>: <name>
+
+pcm
+ Lists the currently available PCM devices in format of
+ <card>-<device>: <id>: <name> : <sub-streams>
+
+timer
+ Lists the currently available timer devices
+
+
+oss/devices
+ Lists the OSS device mappings.
+
+oss/sndstat
+ Provides the output compatible with /dev/sndstat.
+ You can symlink this to /dev/sndstat.
+
+
+Card Specific Files
+-------------------
+
+The card-specific files are found in /proc/asound/card* directories.
+Some drivers (e.g. cmipci) have their own proc entries for the
+register dump, etc (e.g. /proc/asound/card*/cmipci shows the register
+dump). These files would be really helpful for debugging.
+
+When PCM devices are available on this card, you can see directories
+like pcm0p or pcm1c. They hold the PCM information for each PCM
+stream. The number after 'pcm' is the PCM device number from 0, and
+the last 'p' or 'c' means playback or capture direction. The files in
+this subtree is described later.
+
+The status of MIDI I/O is found in midi* files. It shows the device
+name and the received/transmitted bytes through the MIDI device.
+
+When the card is equipped with AC97 codecs, there are codec97#*
+subdirectories (described later).
+
+When the OSS mixer emulation is enabled (and the module is loaded),
+oss_mixer file appears here, too. This shows the current mapping of
+OSS mixer elements to the ALSA control elements. You can change the
+mapping by writing to this device. Read OSS-Emulation.txt for
+details.
+
+
+PCM Proc Files
+--------------
+
+card*/pcm*/info
+ The general information of this PCM device: card #, device #,
+ substreams, etc.
+
+card*/pcm*/xrun_debug
+ This file appears when CONFIG_SND_DEBUG=y.
+ This shows the status of xrun (= buffer overrun/xrun) debug of
+ ALSA PCM middle layer, as an integer from 0 to 2. The value
+ can be changed by writing to this file, such as
+
+ # cat 2 > /proc/asound/card0/pcm0p/xrun_debug
+
+ When this value is greater than 0, the driver will show the
+ messages to kernel log when an xrun is detected. The debug
+ message is shown also when the invalid H/W pointer is detected
+ at the update of periods (usually called from the interrupt
+ handler).
+
+ When this value is greater than 1, the driver will show the
+ stack trace additionally. This may help the debugging.
+
+card*/pcm*/sub*/info
+ The general information of this PCM sub-stream.
+
+card*/pcm*/sub*/status
+ The current status of this PCM sub-stream, elapsed time,
+ H/W position, etc.
+
+card*/pcm*/sub*/hw_params
+ The hardware parameters set for this sub-stream.
+
+card*/pcm*/sub*/sw_params
+ The soft parameters set for this sub-stream.
+
+card*/pcm*/sub*/prealloc
+ The buffer pre-allocation information.
+
+
+AC97 Codec Information
+----------------------
+
+card*/codec97#*/ac97#?-?
+ Shows the general information of this AC97 codec chip, such as
+ name, capabilities, set up.
+
+card*/codec97#0/ac97#?-?+regs
+ Shows the AC97 register dump. Useful for debugging.
+
+ When CONFIG_SND_DEBUG is enabled, you can write to this file for
+ changing an AC97 register directly. Pass two hex numbers.
+ For example,
+
+ # echo 02 9f1f > /proc/asound/card0/codec97#0/ac97#0-0+regs
+
+
+USB Audio Streams
+-----------------
+
+card*/stream*
+ Shows the assignment and the current status of each audio stream
+ of the given card. This information is very useful for debugging.
+
+
+HD-Audio Codecs
+---------------
+
+card*/codec#*
+ Shows the general codec information and the attribute of each
+ widget node.
+
+
+Sequencer Information
+---------------------
+
+seq/drivers
+ Lists the currently available ALSA sequencer drivers.
+
+seq/clients
+ Shows the list of currently available sequencer clients and
+ ports. The connection status and the running status are shown
+ in this file, too.
+
+seq/queues
+ Lists the currently allocated/running sequencer queues.
+
+seq/timer
+ Lists the currently allocated/running sequencer timers.
+
+seq/oss
+ Lists the OSS-compatible sequencer stuffs.
+
+
+Help For Debugging?
+-------------------
+
+When the problem is related with PCM, first try to turn on xrun_debug
+mode. This will give you the kernel messages when and where xrun
+happened.
+
+If it's really a bug, report it with the following information:
+
+ - the name of the driver/card, show in /proc/asound/cards
+ - the register dump, if available (e.g. card*/cmipci)
+
+when it's a PCM problem,
+
+ - set-up of PCM, shown in hw_parms, sw_params, and status in the PCM
+ sub-stream directory
+
+when it's a mixer problem,
+
+ - AC97 proc files, codec97#*/* files
+
+for USB audio/midi,
+
+ - output of lsusb -v
+ - stream* files in card directory
+
+
+The ALSA bug-tracking system is found at:
+
+ https://bugtrack.alsa-project.org/alsa-bug/
diff --git a/Documentation/sound/alsa/SB-Live-mixer.txt b/Documentation/sound/alsa/SB-Live-mixer.txt
new file mode 100644
index 0000000..f5639d4
--- /dev/null
+++ b/Documentation/sound/alsa/SB-Live-mixer.txt
@@ -0,0 +1,356 @@
+
+ Sound Blaster Live mixer / default DSP code
+ ===========================================
+
+
+The EMU10K1 chips have a DSP part which can be programmed to support
+various ways of sample processing, which is described here.
+(This article does not deal with the overall functionality of the
+EMU10K1 chips. See the manuals section for further details.)
+
+The ALSA driver programs this portion of chip by default code
+(can be altered later) which offers the following functionality:
+
+
+1) IEC958 (S/PDIF) raw PCM
+--------------------------
+
+This PCM device (it's the 4th PCM device (index 3!) and first subdevice
+(index 0) for a given card) allows to forward 48kHz, stereo, 16-bit
+little endian streams without any modifications to the digital output
+(coaxial or optical). The universal interface allows the creation of up
+to 8 raw PCM devices operating at 48kHz, 16-bit little endian. It would
+be easy to add support for multichannel devices to the current code,
+but the conversion routines exist only for stereo (2-channel streams)
+at the time.
+
+Look to tram_poke routines in lowlevel/emu10k1/emufx.c for more details.
+
+
+2) Digital mixer controls
+-------------------------
+
+These controls are built using the DSP instructions. They offer extended
+functionality. Only the default build-in code in the ALSA driver is described
+here. Note that the controls work as attenuators: the maximum value is the
+neutral position leaving the signal unchanged. Note that if the same destination
+is mentioned in multiple controls, the signal is accumulated and can be wrapped
+(set to maximal or minimal value without checking of overflow).
+
+
+Explanation of used abbreviations:
+
+DAC - digital to analog converter
+ADC - analog to digital converter
+I2S - one-way three wire serial bus for digital sound by Philips Semiconductors
+ (this standard is used for connecting standalone DAC and ADC converters)
+LFE - low frequency effects (subwoofer signal)
+AC97 - a chip containing an analog mixer, DAC and ADC converters
+IEC958 - S/PDIF
+FX-bus - the EMU10K1 chip has an effect bus containing 16 accumulators.
+ Each of the synthesizer voices can feed its output to these accumulators
+ and the DSP microcontroller can operate with the resulting sum.
+
+
+name='Wave Playback Volume',index=0
+
+This control is used to attenuate samples for left and right PCM FX-bus
+accumulators. ALSA uses accumulators 0 and 1 for left and right PCM samples.
+The result samples are forwarded to the front DAC PCM slots of the AC97 codec.
+
+name='Wave Surround Playback Volume',index=0
+
+This control is used to attenuate samples for left and right PCM FX-bus
+accumulators. ALSA uses accumulators 0 and 1 for left and right PCM samples.
+The result samples are forwarded to the rear I2S DACs. These DACs operates
+separately (they are not inside the AC97 codec).
+
+name='Wave Center Playback Volume',index=0
+
+This control is used to attenuate samples for left and right PCM FX-bus
+accumulators. ALSA uses accumulators 0 and 1 for left and right PCM samples.
+The result is mixed to mono signal (single channel) and forwarded to
+the ??rear?? right DAC PCM slot of the AC97 codec.
+
+name='Wave LFE Playback Volume',index=0
+
+This control is used to attenuate samples for left and right PCM FX-bus
+accumulators. ALSA uses accumulators 0 and 1 for left and right PCM.
+The result is mixed to mono signal (single channel) and forwarded to
+the ??rear?? left DAC PCM slot of the AC97 codec.
+
+name='Wave Capture Volume',index=0
+name='Wave Capture Switch',index=0
+
+These controls are used to attenuate samples for left and right PCM FX-bus
+accumulator. ALSA uses accumulators 0 and 1 for left and right PCM.
+The result is forwarded to the ADC capture FIFO (thus to the standard capture
+PCM device).
+
+name='Music Playback Volume',index=0
+
+This control is used to attenuate samples for left and right MIDI FX-bus
+accumulators. ALSA uses accumulators 4 and 5 for left and right MIDI samples.
+The result samples are forwarded to the front DAC PCM slots of the AC97 codec.
+
+name='Music Capture Volume',index=0
+name='Music Capture Switch',index=0
+
+These controls are used to attenuate samples for left and right MIDI FX-bus
+accumulator. ALSA uses accumulators 4 and 5 for left and right PCM.
+The result is forwarded to the ADC capture FIFO (thus to the standard capture
+PCM device).
+
+name='Surround Playback Volume',index=0
+
+This control is used to attenuate samples for left and right rear PCM FX-bus
+accumulators. ALSA uses accumulators 2 and 3 for left and right rear PCM samples.
+The result samples are forwarded to the rear I2S DACs. These DACs operate
+separately (they are not inside the AC97 codec).
+
+name='Surround Capture Volume',index=0
+name='Surround Capture Switch',index=0
+
+These controls are used to attenuate samples for left and right rear PCM FX-bus
+accumulators. ALSA uses accumulators 2 and 3 for left and right rear PCM samples.
+The result is forwarded to the ADC capture FIFO (thus to the standard capture
+PCM device).
+
+name='Center Playback Volume',index=0
+
+This control is used to attenuate sample for center PCM FX-bus accumulator.
+ALSA uses accumulator 6 for center PCM sample. The result sample is forwarded
+to the ??rear?? right DAC PCM slot of the AC97 codec.
+
+name='LFE Playback Volume',index=0
+
+This control is used to attenuate sample for center PCM FX-bus accumulator.
+ALSA uses accumulator 6 for center PCM sample. The result sample is forwarded
+to the ??rear?? left DAC PCM slot of the AC97 codec.
+
+name='AC97 Playback Volume',index=0
+
+This control is used to attenuate samples for left and right front ADC PCM slots
+of the AC97 codec. The result samples are forwarded to the front DAC PCM
+slots of the AC97 codec.
+********************************************************************************
+*** Note: This control should be zero for the standard operations, otherwise ***
+*** a digital loopback is activated. ***
+********************************************************************************
+
+name='AC97 Capture Volume',index=0
+
+This control is used to attenuate samples for left and right front ADC PCM slots
+of the AC97 codec. The result is forwarded to the ADC capture FIFO (thus to
+the standard capture PCM device).
+********************************************************************************
+*** Note: This control should be 100 (maximal value), otherwise no analog ***
+*** inputs of the AC97 codec can be captured (recorded). ***
+********************************************************************************
+
+name='IEC958 TTL Playback Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 TTL
+digital inputs (usually used by a CDROM drive). The result samples are
+forwarded to the front DAC PCM slots of the AC97 codec.
+
+name='IEC958 TTL Capture Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 TTL
+digital inputs (usually used by a CDROM drive). The result samples are
+forwarded to the ADC capture FIFO (thus to the standard capture PCM device).
+
+name='Zoom Video Playback Volume',index=0
+
+This control is used to attenuate samples from left and right zoom video
+digital inputs (usually used by a CDROM drive). The result samples are
+forwarded to the front DAC PCM slots of the AC97 codec.
+
+name='Zoom Video Capture Volume',index=0
+
+This control is used to attenuate samples from left and right zoom video
+digital inputs (usually used by a CDROM drive). The result samples are
+forwarded to the ADC capture FIFO (thus to the standard capture PCM device).
+
+name='IEC958 LiveDrive Playback Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 optical
+digital input. The result samples are forwarded to the front DAC PCM slots
+of the AC97 codec.
+
+name='IEC958 LiveDrive Capture Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 optical
+digital inputs. The result samples are forwarded to the ADC capture FIFO
+(thus to the standard capture PCM device).
+
+name='IEC958 Coaxial Playback Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 coaxial
+digital inputs. The result samples are forwarded to the front DAC PCM slots
+of the AC97 codec.
+
+name='IEC958 Coaxial Capture Volume',index=0
+
+This control is used to attenuate samples from left and right IEC958 coaxial
+digital inputs. The result samples are forwarded to the ADC capture FIFO
+(thus to the standard capture PCM device).
+
+name='Line LiveDrive Playback Volume',index=0
+name='Line LiveDrive Playback Volume',index=1
+
+This control is used to attenuate samples from left and right I2S ADC
+inputs (on the LiveDrive). The result samples are forwarded to the front
+DAC PCM slots of the AC97 codec.
+
+name='Line LiveDrive Capture Volume',index=1
+name='Line LiveDrive Capture Volume',index=1
+
+This control is used to attenuate samples from left and right I2S ADC
+inputs (on the LiveDrive). The result samples are forwarded to the ADC
+capture FIFO (thus to the standard capture PCM device).
+
+name='Tone Control - Switch',index=0
+
+This control turns the tone control on or off. The samples for front, rear
+and center / LFE outputs are affected.
+
+name='Tone Control - Bass',index=0
+
+This control sets the bass intensity. There is no neutral value!!
+When the tone control code is activated, the samples are always modified.
+The closest value to pure signal is 20.
+
+name='Tone Control - Treble',index=0
+
+This control sets the treble intensity. There is no neutral value!!
+When the tone control code is activated, the samples are always modified.
+The closest value to pure signal is 20.
+
+name='IEC958 Optical Raw Playback Switch',index=0
+
+If this switch is on, then the samples for the IEC958 (S/PDIF) digital
+output are taken only from the raw FX8010 PCM, otherwise standard front
+PCM samples are taken.
+
+name='Headphone Playback Volume',index=1
+
+This control attenuates the samples for the headphone output.
+
+name='Headphone Center Playback Switch',index=1
+
+If this switch is on, then the sample for the center PCM is put to the
+left headphone output (useful for SB Live cards without separate center/LFE
+output).
+
+name='Headphone LFE Playback Switch',index=1
+
+If this switch is on, then the sample for the center PCM is put to the
+right headphone output (useful for SB Live cards without separate center/LFE
+output).
+
+
+3) PCM stream related controls
+------------------------------
+
+name='EMU10K1 PCM Volume',index 0-31
+
+Channel volume attenuation in range 0-0xffff. The maximum value (no
+attenuation) is default. The channel mapping for three values is
+as follows:
+
+ 0 - mono, default 0xffff (no attenuation)
+ 1 - left, default 0xffff (no attenuation)
+ 2 - right, default 0xffff (no attenuation)
+
+name='EMU10K1 PCM Send Routing',index 0-31
+
+This control specifies the destination - FX-bus accumulators. There are
+twelve values with this mapping:
+
+ 0 - mono, A destination (FX-bus 0-15), default 0
+ 1 - mono, B destination (FX-bus 0-15), default 1
+ 2 - mono, C destination (FX-bus 0-15), default 2
+ 3 - mono, D destination (FX-bus 0-15), default 3
+ 4 - left, A destination (FX-bus 0-15), default 0
+ 5 - left, B destination (FX-bus 0-15), default 1
+ 6 - left, C destination (FX-bus 0-15), default 2
+ 7 - left, D destination (FX-bus 0-15), default 3
+ 8 - right, A destination (FX-bus 0-15), default 0
+ 9 - right, B destination (FX-bus 0-15), default 1
+ 10 - right, C destination (FX-bus 0-15), default 2
+ 11 - right, D destination (FX-bus 0-15), default 3
+
+Don't forget that it's illegal to assign a channel to the same FX-bus accumulator
+more than once (it means 0=0 && 1=0 is an invalid combination).
+
+name='EMU10K1 PCM Send Volume',index 0-31
+
+It specifies the attenuation (amount) for given destination in range 0-255.
+The channel mapping is following:
+
+ 0 - mono, A destination attn, default 255 (no attenuation)
+ 1 - mono, B destination attn, default 255 (no attenuation)
+ 2 - mono, C destination attn, default 0 (mute)
+ 3 - mono, D destination attn, default 0 (mute)
+ 4 - left, A destination attn, default 255 (no attenuation)
+ 5 - left, B destination attn, default 0 (mute)
+ 6 - left, C destination attn, default 0 (mute)
+ 7 - left, D destination attn, default 0 (mute)
+ 8 - right, A destination attn, default 0 (mute)
+ 9 - right, B destination attn, default 255 (no attenuation)
+ 10 - right, C destination attn, default 0 (mute)
+ 11 - right, D destination attn, default 0 (mute)
+
+
+
+4) MANUALS/PATENTS:
+-------------------
+
+ftp://opensource.creative.com/pub/doc
+-------------------------------------
+
+ Files:
+ LM4545.pdf AC97 Codec
+
+ m2049.pdf The EMU10K1 Digital Audio Processor
+
+ hog63.ps FX8010 - A DSP Chip Architecture for Audio Effects
+
+
+WIPO Patents
+------------
+ Patent numbers:
+ WO 9901813 (A1) Audio Effects Processor with multiple asynchronous (Jan. 14, 1999)
+ streams
+
+ WO 9901814 (A1) Processor with Instruction Set for Audio Effects (Jan. 14, 1999)
+
+ WO 9901953 (A1) Audio Effects Processor having Decoupled Instruction
+ Execution and Audio Data Sequencing (Jan. 14, 1999)
+
+
+US Patents (http://www.uspto.gov/)
+----------------------------------
+
+ US 5925841 Digital Sampling Instrument employing cache memory (Jul. 20, 1999)
+
+ US 5928342 Audio Effects Processor integrated on a single chip (Jul. 27, 1999)
+ with a multiport memory onto which multiple asynchronous
+ digital sound samples can be concurrently loaded
+
+ US 5930158 Processor with Instruction Set for Audio Effects (Jul. 27, 1999)
+
+ US 6032235 Memory initialization circuit (Tram) (Feb. 29, 2000)
+
+ US 6138207 Interpolation looping of audio samples in cache connected to (Oct. 24, 2000)
+ system bus with prioritization and modification of bus transfers
+ in accordance with loop ends and minimum block sizes
+
+ US 6151670 Method for conserving memory storage using a (Nov. 21, 2000)
+ pool of short term memory registers
+
+ US 6195715 Interrupt control for multiple programs communicating with (Feb. 27, 2001)
+ a common interrupt by associating programs to GP registers,
+ defining interrupt register, polling GP registers, and invoking
+ callback routine associated with defined interrupt register
diff --git a/Documentation/sound/alsa/VIA82xx-mixer.txt b/Documentation/sound/alsa/VIA82xx-mixer.txt
new file mode 100644
index 0000000..1b0ac06
--- /dev/null
+++ b/Documentation/sound/alsa/VIA82xx-mixer.txt
@@ -0,0 +1,8 @@
+
+ VIA82xx mixer
+ =============
+
+On many VIA82xx boards, the 'Input Source Select' mixer control does not work.
+Setting it to 'Input2' on such boards will cause recording to hang, or fail
+with EIO (input/output error) via OSS emulation. This control should be left
+at 'Input1' for such cards.
diff --git a/Documentation/sound/alsa/emu10k1-jack.txt b/Documentation/sound/alsa/emu10k1-jack.txt
new file mode 100644
index 0000000..751d450
--- /dev/null
+++ b/Documentation/sound/alsa/emu10k1-jack.txt
@@ -0,0 +1,74 @@
+This document is a guide to using the emu10k1 based devices with JACK for low
+latency, multichannel recording functionality. All of my recent work to allow
+Linux users to use the full capabilities of their hardware has been inspired
+by the kX Project. Without their work I never would have discovered the true
+power of this hardware.
+
+ http://www.kxproject.com
+ - Lee Revell, 2005.03.30
+
+Low latency, multichannel audio with JACK and the emu10k1/emu10k2
+-----------------------------------------------------------------
+
+Until recently, emu10k1 users on Linux did not have access to the same low
+latency, multichannel features offered by the "kX ASIO" feature of their
+Windows driver. As of ALSA 1.0.9 this is no more!
+
+For those unfamiliar with kX ASIO, this consists of 16 capture and 16 playback
+channels. With a post 2.6.9 Linux kernel, latencies down to 64 (1.33 ms) or
+even 32 (0.66ms) frames should work well.
+
+The configuration is slightly more involved than on Windows, as you have to
+select the correct device for JACK to use. Actually, for qjackctl users it's
+fairly self explanatory - select Duplex, then for capture and playback select
+the multichannel devices, set the in and out channels to 16, and the sample
+rate to 48000Hz. The command line looks like this:
+
+/usr/local/bin/jackd -R -dalsa -r48000 -p64 -n2 -D -Chw:0,2 -Phw:0,3 -S
+
+This will give you 16 input ports and 16 output ports.
+
+The 16 output ports map onto the 16 FX buses (or the first 16 of 64, for the
+Audigy). The mapping from FX bus to physical output is described in
+SB-Live-mixer.txt (or Audigy-mixer.txt).
+
+The 16 input ports are connected to the 16 physical inputs. Contrary to
+popular belief, all emu10k1 cards are multichannel cards. Which of these
+input channels have physical inputs connected to them depends on the card
+model. Trial and error is highly recommended; the pinout diagrams
+for the card have been reverse engineered by some enterprising kX users and are
+available on the internet. Meterbridge is helpful here, and the kX forums are
+packed with useful information.
+
+Each input port will either correspond to a digital (SPDIF) input, an analog
+input, or nothing. The one exception is the SBLive! 5.1. On these devices,
+the second and third input ports are wired to the center/LFE output. You will
+still see 16 capture channels, but only 14 are available for recording inputs.
+
+This chart, borrowed from kxfxlib/da_asio51.cpp, describes the mapping of JACK
+ports to FXBUS2 (multitrack recording input) and EXTOUT (physical output)
+channels.
+
+/*JACK (& ASIO) mappings on 10k1 5.1 SBLive cards:
+--------------------------------------------
+JACK Epilog FXBUS2(nr)
+--------------------------------------------
+capture_1 asio14 FXBUS2(0xe)
+capture_2 asio15 FXBUS2(0xf)
+capture_3 asio0 FXBUS2(0x0)
+~capture_4 Center EXTOUT(0x11) // mapped to by Center
+~capture_5 LFE EXTOUT(0x12) // mapped to by LFE
+capture_6 asio3 FXBUS2(0x3)
+capture_7 asio4 FXBUS2(0x4)
+capture_8 asio5 FXBUS2(0x5)
+capture_9 asio6 FXBUS2(0x6)
+capture_10 asio7 FXBUS2(0x7)
+capture_11 asio8 FXBUS2(0x8)
+capture_12 asio9 FXBUS2(0x9)
+capture_13 asio10 FXBUS2(0xa)
+capture_14 asio11 FXBUS2(0xb)
+capture_15 asio12 FXBUS2(0xc)
+capture_16 asio13 FXBUS2(0xd)
+*/
+
+TODO: describe use of ld10k1/qlo10k1 in conjunction with JACK
diff --git a/Documentation/sound/alsa/hda_codec.txt b/Documentation/sound/alsa/hda_codec.txt
new file mode 100644
index 0000000..34e87ec
--- /dev/null
+++ b/Documentation/sound/alsa/hda_codec.txt
@@ -0,0 +1,322 @@
+Notes on Universal Interface for Intel High Definition Audio Codec
+------------------------------------------------------------------
+
+Takashi Iwai <tiwai@suse.de>
+
+
+[Still a draft version]
+
+
+General
+=======
+
+The snd-hda-codec module supports the generic access function for the
+High Definition (HD) audio codecs. It's designed to be independent
+from the controller code like ac97 codec module. The real accessors
+from/to the controller must be implemented in the lowlevel driver.
+
+The structure of this module is similar with ac97_codec module.
+Each codec chip belongs to a bus class which communicates with the
+controller.
+
+
+Initialization of Bus Instance
+==============================
+
+The card driver has to create struct hda_bus at first. The template
+struct should be filled and passed to the constructor:
+
+struct hda_bus_template {
+ void *private_data;
+ struct pci_dev *pci;
+ const char *modelname;
+ struct hda_bus_ops ops;
+};
+
+The card driver can set and use the private_data field to retrieve its
+own data in callback functions. The pci field is used when the patch
+needs to check the PCI subsystem IDs, so on. For non-PCI system, it
+doesn't have to be set, of course.
+The modelname field specifies the board's specific configuration. The
+string is passed to the codec parser, and it depends on the parser how
+the string is used.
+These fields, private_data, pci and modelname are all optional.
+
+The ops field contains the callback functions as the following:
+
+struct hda_bus_ops {
+ int (*command)(struct hda_codec *codec, hda_nid_t nid, int direct,
+ unsigned int verb, unsigned int parm);
+ unsigned int (*get_response)(struct hda_codec *codec);
+ void (*private_free)(struct hda_bus *);
+#ifdef CONFIG_SND_HDA_POWER_SAVE
+ void (*pm_notify)(struct hda_codec *codec);
+#endif
+};
+
+The command callback is called when the codec module needs to send a
+VERB to the controller. It's always a single command.
+The get_response callback is called when the codec requires the answer
+for the last command. These two callbacks are mandatory and have to
+be given.
+The third, private_free callback, is optional. It's called in the
+destructor to release any necessary data in the lowlevel driver.
+
+The pm_notify callback is available only with
+CONFIG_SND_HDA_POWER_SAVE kconfig. It's called when the codec needs
+to power up or may power down. The controller should check the all
+belonging codecs on the bus whether they are actually powered off
+(check codec->power_on), and optionally the driver may power down the
+controller side, too.
+
+The bus instance is created via snd_hda_bus_new(). You need to pass
+the card instance, the template, and the pointer to store the
+resultant bus instance.
+
+int snd_hda_bus_new(struct snd_card *card, const struct hda_bus_template *temp,
+ struct hda_bus **busp);
+
+It returns zero if successful. A negative return value means any
+error during creation.
+
+
+Creation of Codec Instance
+==========================
+
+Each codec chip on the board is then created on the BUS instance.
+To create a codec instance, call snd_hda_codec_new().
+
+int snd_hda_codec_new(struct hda_bus *bus, unsigned int codec_addr,
+ struct hda_codec **codecp);
+
+The first argument is the BUS instance, the second argument is the
+address of the codec, and the last one is the pointer to store the
+resultant codec instance (can be NULL if not needed).
+
+The codec is stored in a linked list of bus instance. You can follow
+the codec list like:
+
+ struct hda_codec *codec;
+ list_for_each_entry(codec, &bus->codec_list, list) {
+ ...
+ }
+
+The codec isn't initialized at this stage properly. The
+initialization sequence is called when the controls are built later.
+
+
+Codec Access
+============
+
+To access codec, use snd_hda_codec_read() and snd_hda_codec_write().
+snd_hda_param_read() is for reading parameters.
+For writing a sequence of verbs, use snd_hda_sequence_write().
+
+There are variants of cached read/write, snd_hda_codec_write_cache(),
+snd_hda_sequence_write_cache(). These are used for recording the
+register states for the power-mangement resume. When no PM is needed,
+these are equivalent with non-cached version.
+
+To retrieve the number of sub nodes connected to the given node, use
+snd_hda_get_sub_nodes(). The connection list can be obtained via
+snd_hda_get_connections() call.
+
+When an unsolicited event happens, pass the event via
+snd_hda_queue_unsol_event() so that the codec routines will process it
+later.
+
+
+(Mixer) Controls
+================
+
+To create mixer controls of all codecs, call
+snd_hda_build_controls(). It then builds the mixers and does
+initialization stuff on each codec.
+
+
+PCM Stuff
+=========
+
+snd_hda_build_pcms() gives the necessary information to create PCM
+streams. When it's called, each codec belonging to the bus stores
+codec->num_pcms and codec->pcm_info fields. The num_pcms indicates
+the number of elements in pcm_info array. The card driver is supposed
+to traverse the codec linked list, read the pcm information in
+pcm_info array, and build pcm instances according to them.
+
+The pcm_info array contains the following record:
+
+/* PCM information for each substream */
+struct hda_pcm_stream {
+ unsigned int substreams; /* number of substreams, 0 = not exist */
+ unsigned int channels_min; /* min. number of channels */
+ unsigned int channels_max; /* max. number of channels */
+ hda_nid_t nid; /* default NID to query rates/formats/bps, or set up */
+ u32 rates; /* supported rates */
+ u64 formats; /* supported formats (SNDRV_PCM_FMTBIT_) */
+ unsigned int maxbps; /* supported max. bit per sample */
+ struct hda_pcm_ops ops;
+};
+
+/* for PCM creation */
+struct hda_pcm {
+ char *name;
+ struct hda_pcm_stream stream[2];
+};
+
+The name can be passed to snd_pcm_new(). The stream field contains
+the information for playback (SNDRV_PCM_STREAM_PLAYBACK = 0) and
+capture (SNDRV_PCM_STREAM_CAPTURE = 1) directions. The card driver
+should pass substreams to snd_pcm_new() for the number of substreams
+to create.
+
+The channels_min, channels_max, rates and formats should be copied to
+runtime->hw record. They and maxbps fields are used also to compute
+the format value for the HDA codec and controller. Call
+snd_hda_calc_stream_format() to get the format value.
+
+The ops field contains the following callback functions:
+
+struct hda_pcm_ops {
+ int (*open)(struct hda_pcm_stream *info, struct hda_codec *codec,
+ struct snd_pcm_substream *substream);
+ int (*close)(struct hda_pcm_stream *info, struct hda_codec *codec,
+ struct snd_pcm_substream *substream);
+ int (*prepare)(struct hda_pcm_stream *info, struct hda_codec *codec,
+ unsigned int stream_tag, unsigned int format,
+ struct snd_pcm_substream *substream);
+ int (*cleanup)(struct hda_pcm_stream *info, struct hda_codec *codec,
+ struct snd_pcm_substream *substream);
+};
+
+All are non-NULL, so you can call them safely without NULL check.
+
+The open callback should be called in PCM open after runtime->hw is
+set up. It may override some setting and constraints additionally.
+Similarly, the close callback should be called in the PCM close.
+
+The prepare callback should be called in PCM prepare. This will set
+up the codec chip properly for the operation. The cleanup should be
+called in hw_free to clean up the configuration.
+
+The caller should check the return value, at least for open and
+prepare callbacks. When a negative value is returned, some error
+occurred.
+
+
+Proc Files
+==========
+
+Each codec dumps the widget node information in
+/proc/asound/card*/codec#* file. This information would be really
+helpful for debugging. Please provide its contents together with the
+bug report.
+
+
+Power Management
+================
+
+It's simple:
+Call snd_hda_suspend() in the PM suspend callback.
+Call snd_hda_resume() in the PM resume callback.
+
+
+Codec Preset (Patch)
+====================
+
+To set up and handle the codec functionality fully, each codec may
+have a codec preset (patch). It's defined in struct hda_codec_preset:
+
+ struct hda_codec_preset {
+ unsigned int id;
+ unsigned int mask;
+ unsigned int subs;
+ unsigned int subs_mask;
+ unsigned int rev;
+ const char *name;
+ int (*patch)(struct hda_codec *codec);
+ };
+
+When the codec id and codec subsystem id match with the given id and
+subs fields bitwise (with bitmask mask and subs_mask), the callback
+patch is called. The patch callback should initialize the codec and
+set the codec->patch_ops field. This is defined as below:
+
+ struct hda_codec_ops {
+ int (*build_controls)(struct hda_codec *codec);
+ int (*build_pcms)(struct hda_codec *codec);
+ int (*init)(struct hda_codec *codec);
+ void (*free)(struct hda_codec *codec);
+ void (*unsol_event)(struct hda_codec *codec, unsigned int res);
+ #ifdef CONFIG_PM
+ int (*suspend)(struct hda_codec *codec, pm_message_t state);
+ int (*resume)(struct hda_codec *codec);
+ #endif
+ #ifdef CONFIG_SND_HDA_POWER_SAVE
+ int (*check_power_status)(struct hda_codec *codec,
+ hda_nid_t nid);
+ #endif
+ };
+
+The build_controls callback is called from snd_hda_build_controls().
+Similarly, the build_pcms callback is called from
+snd_hda_build_pcms(). The init callback is called after
+build_controls to initialize the hardware.
+The free callback is called as a destructor.
+
+The unsol_event callback is called when an unsolicited event is
+received.
+
+The suspend and resume callbacks are for power management.
+They can be NULL if no special sequence is required. When the resume
+callback is NULL, the driver calls the init callback and resumes the
+registers from the cache. If other handling is needed, you'd need to
+write your own resume callback. There, the amp values can be resumed
+via
+ void snd_hda_codec_resume_amp(struct hda_codec *codec);
+and the other codec registers via
+ void snd_hda_codec_resume_cache(struct hda_codec *codec);
+
+The check_power_status callback is called when the amp value of the
+given widget NID is changed. The codec code can turn on/off the power
+appropriately from this information.
+
+Each entry can be NULL if not necessary to be called.
+
+
+Generic Parser
+==============
+
+When the device doesn't match with any given presets, the widgets are
+parsed via th generic parser (hda_generic.c). Its support is
+limited: no multi-channel support, for example.
+
+
+Digital I/O
+===========
+
+Call snd_hda_create_spdif_out_ctls() from the patch to create controls
+related with SPDIF out.
+
+
+Helper Functions
+================
+
+snd_hda_get_codec_name() stores the codec name on the given string.
+
+snd_hda_check_board_config() can be used to obtain the configuration
+information matching with the device. Define the model string table
+and the table with struct snd_pci_quirk entries (zero-terminated),
+and pass it to the function. The function checks the modelname given
+as a module parameter, and PCI subsystem IDs. If the matching entry
+is found, it returns the config field value.
+
+snd_hda_add_new_ctls() can be used to create and add control entries.
+Pass the zero-terminated array of struct snd_kcontrol_new
+
+Macros HDA_CODEC_VOLUME(), HDA_CODEC_MUTE() and their variables can be
+used for the entry of struct snd_kcontrol_new.
+
+The input MUX helper callbacks for such a control are provided, too:
+snd_hda_input_mux_info() and snd_hda_input_mux_put(). See
+patch_realtek.c for example.
diff --git a/Documentation/sound/alsa/hdspm.txt b/Documentation/sound/alsa/hdspm.txt
new file mode 100644
index 0000000..7a67ff7
--- /dev/null
+++ b/Documentation/sound/alsa/hdspm.txt
@@ -0,0 +1,362 @@
+Software Interface ALSA-DSP MADI Driver
+
+(translated from German, so no good English ;-),
+2004 - winfried ritsch
+
+
+
+ Full functionality has been added to the driver. Since some of
+ the Controls and startup-options are ALSA-Standard and only the
+ special Controls are described and discussed below.
+
+
+ hardware functionality:
+
+
+ Audio transmission:
+
+ number of channels -- depends on transmission mode
+
+ The number of channels chosen is from 1..Nmax. The reason to
+ use for a lower number of channels is only resource allocation,
+ since unused DMA channels are disabled and less memory is
+ allocated. So also the throughput of the PCI system can be
+ scaled. (Only important for low performance boards).
+
+ Single Speed -- 1..64 channels
+
+ (Note: Choosing the 56channel mode for transmission or as
+ receiver, only 56 are transmitted/received over the MADI, but
+ all 64 channels are available for the mixer, so channel count
+ for the driver)
+
+ Double Speed -- 1..32 channels
+
+ Note: Choosing the 56-channel mode for
+ transmission/receive-mode , only 28 are transmitted/received
+ over the MADI, but all 32 channels are available for the mixer,
+ so channel count for the driver
+
+
+ Quad Speed -- 1..16 channels
+
+ Note: Choosing the 56-channel mode for
+ transmission/receive-mode , only 14 are transmitted/received
+ over the MADI, but all 16 channels are available for the mixer,
+ so channel count for the driver
+
+ Format -- signed 32 Bit Little Endian (SNDRV_PCM_FMTBIT_S32_LE)
+
+ Sample Rates --
+
+ Single Speed -- 32000, 44100, 48000
+
+ Double Speed -- 64000, 88200, 96000 (untested)
+
+ Quad Speed -- 128000, 176400, 192000 (untested)
+
+ access-mode -- MMAP (memory mapped), Not interleaved
+ (PCM_NON-INTERLEAVED)
+
+ buffer-sizes -- 64,128,256,512,1024,2048,8192 Samples
+
+ fragments -- 2
+
+ Hardware-pointer -- 2 Modi
+
+
+ The Card supports the readout of the actual Buffer-pointer,
+ where DMA reads/writes. Since of the bulk mode of PCI it is only
+ 64 Byte accurate. SO it is not really usable for the
+ ALSA-mid-level functions (here the buffer-ID gives a better
+ result), but if MMAP is used by the application. Therefore it
+ can be configured at load-time with the parameter
+ precise-pointer.
+
+
+ (Hint: Experimenting I found that the pointer is maximum 64 to
+ large never to small. So if you subtract 64 you always have a
+ safe pointer for writing, which is used on this mode inside
+ ALSA. In theory now you can get now a latency as low as 16
+ Samples, which is a quarter of the interrupt possibilities.)
+
+ Precise Pointer -- off
+ interrupt used for pointer-calculation
+
+ Precise Pointer -- on
+ hardware pointer used.
+
+ Controller:
+
+
+ Since DSP-MADI-Mixer has 8152 Fader, it does not make sense to
+ use the standard mixer-controls, since this would break most of
+ (especially graphic) ALSA-Mixer GUIs. So Mixer control has be
+ provided by a 2-dimensional controller using the
+ hwdep-interface.
+
+ Also all 128+256 Peak and RMS-Meter can be accessed via the
+ hwdep-interface. Since it could be a performance problem always
+ copying and converting Peak and RMS-Levels even if you just need
+ one, I decided to export the hardware structure, so that of
+ needed some driver-guru can implement a memory-mapping of mixer
+ or peak-meters over ioctl, or also to do only copying and no
+ conversion. A test-application shows the usage of the controller.
+
+ Latency Controls --- not implemented !!!
+
+
+ Note: Within the windows-driver the latency is accessible of a
+ control-panel, but buffer-sizes are controlled with ALSA from
+ hwparams-calls and should not be changed in run-state, I did not
+ implement it here.
+
+
+ System Clock -- suspended !!!!
+
+ Name -- "System Clock Mode"
+
+ Access -- Read Write
+
+ Values -- "Master" "Slave"
+
+
+ !!!! This is a hardware-function but is in conflict with the
+ Clock-source controller, which is a kind of ALSA-standard. I
+ makes sense to set the card to a special mode (master at some
+ frequency or slave), since even not using an Audio-application
+ a studio should have working synchronisations setup. So use
+ Clock-source-controller instead !!!!
+
+ Clock Source
+
+ Name -- "Sample Clock Source"
+
+ Access -- Read Write
+
+ Values -- "AutoSync", "Internal 32.0 kHz", "Internal 44.1 kHz",
+ "Internal 48.0 kHz", "Internal 64.0 kHz", "Internal 88.2 kHz",
+ "Internal 96.0 kHz"
+
+ Choose between Master at a specific Frequency and so also the
+ Speed-mode or Slave (Autosync). Also see "Preferred Sync Ref"
+
+
+ !!!! This is no pure hardware function but was implemented by
+ ALSA by some ALSA-drivers before, so I use it also. !!!
+
+
+ Preferred Sync Ref
+
+ Name -- "Preferred Sync Reference"
+
+ Access -- Read Write
+
+ Values -- "Word" "MADI"
+
+
+ Within the Auto-sync-Mode the preferred Sync Source can be
+ chosen. If it is not available another is used if possible.
+
+ Note: Since MADI has a much higher bit-rate than word-clock, the
+ card should synchronise better in MADI Mode. But since the
+ RME-PLL is very good, there are almost no problems with
+ word-clock too. I never found a difference.
+
+
+ TX 64 channel ---
+
+ Name -- "TX 64 channels mode"
+
+ Access -- Read Write
+
+ Values -- 0 1
+
+ Using 64-channel-modus (1) or 56-channel-modus for
+ MADI-transmission (0).
+
+
+ Note: This control is for output only. Input-mode is detected
+ automatically from hardware sending MADI.
+
+
+ Clear TMS ---
+
+ Name -- "Clear Track Marker"
+
+ Access -- Read Write
+
+ Values -- 0 1
+
+
+ Don't use to lower 5 Audio-bits on AES as additional Bits.
+
+
+ Safe Mode oder Auto Input ---
+
+ Name -- "Safe Mode"
+
+ Access -- Read Write
+
+ Values -- 0 1
+
+ (default on)
+
+ If on (1), then if either the optical or coaxial connection
+ has a failure, there is a takeover to the working one, with no
+ sample failure. Its only useful if you use the second as a
+ backup connection.
+
+ Input ---
+
+ Name -- "Input Select"
+
+ Access -- Read Write
+
+ Values -- optical coaxial
+
+
+ Choosing the Input, optical or coaxial. If Safe-mode is active,
+ this is the preferred Input.
+
+-------------- Mixer ----------------------
+
+ Mixer
+
+ Name -- "Mixer"
+
+ Access -- Read Write
+
+ Values - <channel-number 0-127> <Value 0-65535>
+
+
+ Here as a first value the channel-index is taken to get/set the
+ corresponding mixer channel, where 0-63 are the input to output
+ fader and 64-127 the playback to outputs fader. Value 0
+ is channel muted 0 and 32768 an amplification of 1.
+
+ Chn 1-64
+
+ fast mixer for the ALSA-mixer utils. The diagonal of the
+ mixer-matrix is implemented from playback to output.
+
+
+ Line Out
+
+ Name -- "Line Out"
+
+ Access -- Read Write
+
+ Values -- 0 1
+
+ Switching on and off the analog out, which has nothing to do
+ with mixing or routing. the analog outs reflects channel 63,64.
+
+
+--- information (only read access):
+
+ Sample Rate
+
+ Name -- "System Sample Rate"
+
+ Access -- Read-only
+
+ getting the sample rate.
+
+
+ External Rate measured
+
+ Name -- "External Rate"
+
+ Access -- Read only
+
+
+ Should be "Autosync Rate", but Name used is
+ ALSA-Scheme. External Sample frequency liked used on Autosync is
+ reported.
+
+
+ MADI Sync Status
+
+ Name -- "MADI Sync Lock Status"
+
+ Access -- Read
+
+ Values -- 0,1,2
+
+ MADI-Input is 0=Unlocked, 1=Locked, or 2=Synced.
+
+
+ Word Clock Sync Status
+
+ Name -- "Word Clock Lock Status"
+
+ Access -- Read
+
+ Values -- 0,1,2
+
+ Word Clock Input is 0=Unlocked, 1=Locked, or 2=Synced.
+
+ AutoSync
+
+ Name -- "AutoSync Reference"
+
+ Access -- Read
+
+ Values -- "WordClock", "MADI", "None"
+
+ Sync-Reference is either "WordClock", "MADI" or none.
+
+ RX 64ch --- noch nicht implementiert
+
+ MADI-Receiver is in 64 channel mode oder 56 channel mode.
+
+
+ AB_inp --- not tested
+
+ Used input for Auto-Input.
+
+
+ actual Buffer Position --- not implemented
+
+ !!! this is a ALSA internal function, so no control is used !!!
+
+
+
+Calling Parameter:
+
+ index int array (min = 1, max = 8),
+ "Index value for RME HDSPM interface." card-index within ALSA
+
+ note: ALSA-standard
+
+ id string array (min = 1, max = 8),
+ "ID string for RME HDSPM interface."
+
+ note: ALSA-standard
+
+ enable int array (min = 1, max = 8),
+ "Enable/disable specific HDSPM sound-cards."
+
+ note: ALSA-standard
+
+ precise_ptr int array (min = 1, max = 8),
+ "Enable precise pointer, or disable."
+
+ note: Use only when the application supports this (which is a special case).
+
+ line_outs_monitor int array (min = 1, max = 8),
+ "Send playback streams to analog outs by default."
+
+
+ note: each playback channel is mixed to the same numbered output
+ channel (routed). This is against the ALSA-convention, where all
+ channels have to be muted on after loading the driver, but was
+ used before on other cards, so i historically use it again)
+
+
+
+ enable_monitor int array (min = 1, max = 8),
+ "Enable Analog Out on Channel 63/64 by default."
+
+ note: here the analog output is enabled (but not routed). \ No newline at end of file
diff --git a/Documentation/sound/alsa/powersave.txt b/Documentation/sound/alsa/powersave.txt
new file mode 100644
index 0000000..9657e80
--- /dev/null
+++ b/Documentation/sound/alsa/powersave.txt
@@ -0,0 +1,41 @@
+Notes on Power-Saving Mode
+==========================
+
+AC97 and HD-audio drivers have the automatic power-saving mode.
+This feature is enabled via Kconfig CONFIG_SND_AC97_POWER_SAVE
+and CONFIG_SND_HDA_POWER_SAVE options, respectively.
+
+With the automatic power-saving, the driver turns off the codec power
+appropriately when no operation is required. When no applications use
+the device and/or no analog loopback is set, the power disablement is
+done fully or partially. It'll save a certain power consumption, thus
+good for laptops (even for desktops).
+
+The time-out for automatic power-off can be specified via power_save
+module option of snd-ac97-codec and snd-hda-intel modules. Specify
+the time-out value in seconds. 0 means to disable the automatic
+power-saving. The default value of timeout is given via
+CONFIG_SND_AC97_POWER_SAVE_DEFAULT and
+CONFIG_SND_HDA_POWER_SAVE_DEFAULT Kconfig options. Setting this to 1
+(the minimum value) isn't recommended because many applications try to
+reopen the device frequently. 10 would be a good choice for normal
+operations.
+
+The power_save option is exported as writable. This means you can
+adjust the value via sysfs on the fly. For example, to turn on the
+automatic power-save mode with 10 seconds, write to
+/sys/modules/snd_ac97_codec/parameters/power_save (usually as root):
+
+ # echo 10 > /sys/modules/snd_ac97_codec/parameters/power_save
+
+
+Note that you might hear click noise/pop when changing the power
+state. Also, it often takes certain time to wake up from the
+power-down to the active state. These are often hardly to fix, so
+don't report extra bug reports unless you have a fix patch ;-)
+
+For HD-audio interface, there is another module option,
+power_save_controller. This enables/disables the power-save mode of
+the controller side. Setting this on may reduce a bit more power
+consumption, but might result in longer wake-up time and click noise.
+Try to turn it off when you experience such a thing too often.
diff --git a/Documentation/sound/alsa/seq_oss.html b/Documentation/sound/alsa/seq_oss.html
new file mode 100644
index 0000000..d9776cf
--- /dev/null
+++ b/Documentation/sound/alsa/seq_oss.html
@@ -0,0 +1,409 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<HTML>
+<HEAD>
+ <TITLE>OSS Sequencer Emulation on ALSA</TITLE>
+</HEAD>
+<BODY>
+
+<CENTER>
+<H1>
+
+<HR WIDTH="100%"></H1></CENTER>
+
+<CENTER>
+<H1>
+OSS Sequencer Emulation on ALSA</H1></CENTER>
+
+<HR WIDTH="100%">
+<P>Copyright (c) 1998,1999 by Takashi Iwai
+<TT><A HREF="mailto:iwai@ww.uni-erlangen.de">&lt;iwai@ww.uni-erlangen.de></A></TT>
+<P>ver.0.1.8; Nov. 16, 1999
+<H2>
+
+<HR WIDTH="100%"></H2>
+
+<H2>
+1. Description</H2>
+This directory contains the OSS sequencer emulation driver on ALSA. Note
+that this program is still in the development state.
+<P>What this does - it provides the emulation of the OSS sequencer, access
+via
+<TT>/dev/sequencer</TT> and <TT>/dev/music</TT> devices.
+The most of applications using OSS can run if the appropriate ALSA
+sequencer is prepared.
+<P>The following features are emulated by this driver:
+<UL>
+<LI>
+Normal sequencer and MIDI events:</LI>
+
+<BR>They are converted to the ALSA sequencer events, and sent to the corresponding
+port.
+<LI>
+Timer events:</LI>
+
+<BR>The timer is not selectable by ioctl. The control rate is fixed to
+100 regardless of HZ. That is, even on Alpha system, a tick is always
+1/100 second. The base rate and tempo can be changed in <TT>/dev/music</TT>.
+
+<LI>
+Patch loading:</LI>
+
+<BR>It purely depends on the synth drivers whether it's supported since
+the patch loading is realized by callback to the synth driver.
+<LI>
+I/O controls:</LI>
+
+<BR>Most of controls are accepted. Some controls
+are dependent on the synth driver, as well as even on original OSS.</UL>
+Furthermore, you can find the following advanced features:
+<UL>
+<LI>
+Better queue mechanism:</LI>
+
+<BR>The events are queued before processing them.
+<LI>
+Multiple applications:</LI>
+
+<BR>You can run two or more applications simultaneously (even for OSS sequencer)!
+However, each MIDI device is exclusive - that is, if a MIDI device is opened
+once by some application, other applications can't use it. No such a restriction
+in synth devices.
+<LI>
+Real-time event processing:</LI>
+
+<BR>The events can be processed in real time without using out of bound
+ioctl. To switch to real-time mode, send ABSTIME 0 event. The followed
+events will be processed in real-time without queued. To switch off the
+real-time mode, send RELTIME 0 event.
+<LI>
+<TT>/proc</TT> interface:</LI>
+
+<BR>The status of applications and devices can be shown via <TT>/proc/asound/seq/oss</TT>
+at any time. In the later version, configuration will be changed via <TT>/proc</TT>
+interface, too.</UL>
+
+<H2>
+2. Installation</H2>
+Run configure script with both sequencer support (<TT>--with-sequencer=yes</TT>)
+and OSS emulation (<TT>--with-oss=yes</TT>) options. A module <TT>snd-seq-oss.o</TT>
+will be created. If the synth module of your sound card supports for OSS
+emulation (so far, only Emu8000 driver), this module will be loaded automatically.
+Otherwise, you need to load this module manually.
+<P>At beginning, this module probes all the MIDI ports which have been
+already connected to the sequencer. Once after that, the creation and deletion
+of ports are watched by announcement mechanism of ALSA sequencer.
+<P>The available synth and MIDI devices can be found in proc interface.
+Run "<TT>cat /proc/asound/seq/oss</TT>", and check the devices. For example,
+if you use an AWE64 card, you'll see like the following:
+<PRE>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; OSS sequencer emulation version 0.1.8
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ALSA client number 63
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ALSA receiver port 0
+
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Number of applications: 0
+
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Number of synth devices: 1
+
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; synth 0: [EMU8000]
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; type 0x1 : subtype 0x20 : voices 32
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; capabilties : ioctl enabled / load_patch enabled
+
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Number of MIDI devices: 3
+
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; midi 0: [Emu8000 Port-0] ALSA port 65:0
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; capability write / opened none
+
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; midi 1: [Emu8000 Port-1] ALSA port 65:1
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; capability write / opened none
+
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; midi 2: [0: MPU-401 (UART)] ALSA port 64:0
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; capability read/write / opened none</PRE>
+Note that the device number may be different from the information of
+<TT>/proc/asound/oss-devices</TT>
+or ones of the original OSS driver. Use the device number listed in <TT>/proc/asound/seq/oss</TT>
+to play via OSS sequencer emulation.
+<H2>
+3. Using Synthesizer Devices</H2>
+Run your favorite program. I've tested playmidi-2.4, awemidi-0.4.3, gmod-3.1
+and xmp-1.1.5. You can load samples via <TT>/dev/sequencer</TT> like sfxload,
+too.
+<P>If the lowlevel driver supports multiple access to synth devices (like
+Emu8000 driver), two or more applications are allowed to run at the same
+time.
+<H2>
+4. Using MIDI Devices</H2>
+So far, only MIDI output was tested. MIDI input was not checked at all,
+but hopefully it will work. Use the device number listed in <TT>/proc/asound/seq/oss</TT>.
+Be aware that these numbers are mostly different from the list in
+<TT>/proc/asound/oss-devices</TT>.
+<H2>
+5. Module Options</H2>
+The following module options are available:
+<UL>
+<LI>
+<TT>maxqlen</TT></LI>
+
+<BR>specifies the maximum read/write queue length. This queue is private
+for OSS sequencer, so that it is independent from the queue length of ALSA
+sequencer. Default value is 1024.
+<LI>
+<TT>seq_oss_debug</TT></LI>
+
+<BR>specifies the debug level and accepts zero (= no debug message) or
+positive integer. Default value is 0.</UL>
+
+<H2>
+6. Queue Mechanism</H2>
+OSS sequencer emulation uses an ALSA priority queue. The
+events from <TT>/dev/sequencer</TT> are processed and put onto the queue
+specified by module option.
+<P>All the events from <TT>/dev/sequencer</TT> are parsed at beginning.
+The timing events are also parsed at this moment, so that the events may
+be processed in real-time. Sending an event ABSTIME 0 switches the operation
+mode to real-time mode, and sending an event RELTIME 0 switches it off.
+In the real-time mode, all events are dispatched immediately.
+<P>The queued events are dispatched to the corresponding ALSA sequencer
+ports after scheduled time by ALSA sequencer dispatcher.
+<P>If the write-queue is full, the application sleeps until a certain amount
+(as default one half) becomes empty in blocking mode. The synchronization
+to write timing was implemented, too.
+<P>The input from MIDI devices or echo-back events are stored on read FIFO
+queue. If application reads <TT>/dev/sequencer</TT> in blocking mode, the
+process will be awaked.
+
+<H2>
+7. Interface to Synthesizer Device</H2>
+
+<H3>
+7.1. Registration</H3>
+To register an OSS synthesizer device, use <TT>snd_seq_oss_synth_register</TT>
+function.
+<PRE>int snd_seq_oss_synth_register(char *name, int type, int subtype, int nvoices,
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; snd_seq_oss_callback_t *oper, void *private_data)</PRE>
+The arguments <TT>name</TT>, <TT>type</TT>, <TT>subtype</TT> and
+<TT>nvoices</TT>
+are used for making the appropriate synth_info structure for ioctl. The
+return value is an index number of this device. This index must be remembered
+for unregister. If registration is failed, -errno will be returned.
+<P>To release this device, call <TT>snd_seq_oss_synth_unregister function</TT>:
+<PRE>int snd_seq_oss_synth_unregister(int index),</PRE>
+where the <TT>index</TT> is the index number returned by register function.
+<H3>
+7.2. Callbacks</H3>
+OSS synthesizer devices have capability for sample downloading and ioctls
+like sample reset. In OSS emulation, these special features are realized
+by using callbacks. The registration argument oper is used to specify these
+callbacks. The following callback functions must be defined:
+<PRE>snd_seq_oss_callback_t:
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int (*open)(snd_seq_oss_arg_t *p, void *closure);
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int (*close)(snd_seq_oss_arg_t *p);
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int (*ioctl)(snd_seq_oss_arg_t *p, unsigned int cmd, unsigned long arg);
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int (*load_patch)(snd_seq_oss_arg_t *p, int format, const char *buf, int offs, int count);
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int (*reset)(snd_seq_oss_arg_t *p);
+Except for <TT>open</TT> and <TT>close</TT> callbacks, they are allowed
+to be NULL.
+<P>Each callback function takes the argument type snd_seq_oss_arg_t as the
+first argument.
+<PRE>struct snd_seq_oss_arg_t {
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int app_index;
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int file_mode;
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int seq_mode;
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; snd_seq_addr_t addr;
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; void *private_data;
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; int event_passing;
+};</PRE>
+The first three fields, <TT>app_index</TT>, <TT>file_mode</TT> and
+<TT>seq_mode</TT>
+are initialized by OSS sequencer. The <TT>app_index</TT> is the application
+index which is unique to each application opening OSS sequencer. The
+<TT>file_mode</TT>
+is bit-flags indicating the file operation mode. See
+<TT>seq_oss.h</TT>
+for its meaning. The <TT>seq_mode</TT> is sequencer operation mode. In
+the current version, only <TT>SND_OSSSEQ_MODE_SYNTH</TT> is used.
+<P>The next two fields, <TT>addr</TT> and <TT>private_data</TT>, must be
+filled by the synth driver at open callback. The <TT>addr</TT> contains
+the address of ALSA sequencer port which is assigned to this device. If
+the driver allocates memory for <TT>private_data</TT>, it must be released
+in close callback by itself.
+<P>The last field, <TT>event_passing</TT>, indicates how to translate note-on
+/ off events. In <TT>PROCESS_EVENTS</TT> mode, the note 255 is regarded
+as velocity change, and key pressure event is passed to the port. In <TT>PASS_EVENTS</TT>
+mode, all note on/off events are passed to the port without modified. <TT>PROCESS_KEYPRESS</TT>
+mode checks the note above 128 and regards it as key pressure event (mainly
+for Emu8000 driver).
+<H4>
+7.2.1. Open Callback</H4>
+The <TT>open</TT> is called at each time this device is opened by an application
+using OSS sequencer. This must not be NULL. Typically, the open callback
+does the following procedure:
+<OL>
+<LI>
+Allocate private data record.</LI>
+
+<LI>
+Create an ALSA sequencer port.</LI>
+
+<LI>
+Set the new port address on arg->addr.</LI>
+
+<LI>
+Set the private data record pointer on arg->private_data.</LI>
+</OL>
+Note that the type bit-flags in port_info of this synth port must NOT contain
+<TT>TYPE_MIDI_GENERIC</TT>
+bit. Instead, <TT>TYPE_SPECIFIC</TT> should be used. Also, <TT>CAP_SUBSCRIPTION</TT>
+bit should NOT be included, too. This is necessary to tell it from other
+normal MIDI devices. If the open procedure succeeded, return zero. Otherwise,
+return -errno.
+<H4>
+7.2.2 Ioctl Callback</H4>
+The <TT>ioctl</TT> callback is called when the sequencer receives device-specific
+ioctls. The following two ioctls should be processed by this callback:
+<UL>
+<LI>
+<TT>IOCTL_SEQ_RESET_SAMPLES</TT></LI>
+
+<BR>reset all samples on memory -- return 0
+<LI>
+<TT>IOCTL_SYNTH_MEMAVL</TT></LI>
+
+<BR>return the available memory size
+<LI>
+<TT>FM_4OP_ENABLE</TT></LI>
+
+<BR>can be ignored usually</UL>
+The other ioctls are processed inside the sequencer without passing to
+the lowlevel driver.
+<H4>
+7.2.3 Load_Patch Callback</H4>
+The <TT>load_patch</TT> callback is used for sample-downloading. This callback
+must read the data on user-space and transfer to each device. Return 0
+if succeeded, and -errno if failed. The format argument is the patch key
+in patch_info record. The buf is user-space pointer where patch_info record
+is stored. The offs can be ignored. The count is total data size of this
+sample data.
+<H4>
+7.2.4 Close Callback</H4>
+The <TT>close</TT> callback is called when this device is closed by the
+applicaion. If any private data was allocated in open callback, it must
+be released in the close callback. The deletion of ALSA port should be
+done here, too. This callback must not be NULL.
+<H4>
+7.2.5 Reset Callback</H4>
+The <TT>reset</TT> callback is called when sequencer device is reset or
+closed by applications. The callback should turn off the sounds on the
+relevant port immediately, and initialize the status of the port. If this
+callback is undefined, OSS seq sends a <TT>HEARTBEAT</TT> event to the
+port.
+<H3>
+7.3 Events</H3>
+Most of the events are processed by sequencer and translated to the adequate
+ALSA sequencer events, so that each synth device can receive by input_event
+callback of ALSA sequencer port. The following ALSA events should be implemented
+by the driver:
+<BR>&nbsp;
+<TABLE BORDER WIDTH="75%" NOSAVE >
+<TR NOSAVE>
+<TD NOSAVE><B>ALSA event</B></TD>
+
+<TD><B>Original OSS events</B></TD>
+</TR>
+
+<TR>
+<TD>NOTEON</TD>
+
+<TD>SEQ_NOTEON
+<BR>MIDI_NOTEON</TD>
+</TR>
+
+<TR>
+<TD>NOTE</TD>
+
+<TD>SEQ_NOTEOFF
+<BR>MIDI_NOTEOFF</TD>
+</TR>
+
+<TR NOSAVE>
+<TD NOSAVE>KEYPRESS</TD>
+
+<TD>MIDI_KEY_PRESSURE</TD>
+</TR>
+
+<TR NOSAVE>
+<TD>CHANPRESS</TD>
+
+<TD NOSAVE>SEQ_AFTERTOUCH
+<BR>MIDI_CHN_PRESSURE</TD>
+</TR>
+
+<TR NOSAVE>
+<TD NOSAVE>PGMCHANGE</TD>
+
+<TD NOSAVE>SEQ_PGMCHANGE
+<BR>MIDI_PGM_CHANGE</TD>
+</TR>
+
+<TR>
+<TD>PITCHBEND</TD>
+
+<TD>SEQ_CONTROLLER(CTRL_PITCH_BENDER)
+<BR>MIDI_PITCH_BEND</TD>
+</TR>
+
+<TR>
+<TD>CONTROLLER</TD>
+
+<TD>MIDI_CTL_CHANGE
+<BR>SEQ_BALANCE (with CTL_PAN)</TD>
+</TR>
+
+<TR>
+<TD>CONTROL14</TD>
+
+<TD>SEQ_CONTROLLER</TD>
+</TR>
+
+<TR>
+<TD>REGPARAM</TD>
+
+<TD>SEQ_CONTROLLER(CTRL_PITCH_BENDER_RANGE)</TD>
+</TR>
+
+<TR>
+<TD>SYSEX</TD>
+
+<TD>SEQ_SYSEX</TD>
+</TR>
+</TABLE>
+
+<P>The most of these behavior can be realized by MIDI emulation driver
+included in the Emu8000 lowlevel driver. In the future release, this module
+will be independent.
+<P>Some OSS events (<TT>SEQ_PRIVATE</TT> and <TT>SEQ_VOLUME</TT> events) are passed as event
+type SND_SEQ_OSS_PRIVATE. The OSS sequencer passes these event 8 byte
+packets without any modification. The lowlevel driver should process these
+events appropriately.
+<H2>
+8. Interface to MIDI Device</H2>
+Since the OSS emulation probes the creation and deletion of ALSA MIDI sequencer
+ports automatically by receiving announcement from ALSA sequencer, the
+MIDI devices don't need to be registered explicitly like synth devices.
+However, the MIDI port_info registered to ALSA sequencer must include a group
+name <TT>SND_SEQ_GROUP_DEVICE</TT> and a capability-bit <TT>CAP_READ</TT> or
+<TT>CAP_WRITE</TT>. Also, subscription capabilities, <TT>CAP_SUBS_READ</TT> or <TT>CAP_SUBS_WRITE</TT>,
+must be defined, too. If these conditions are not satisfied, the port is not
+registered as OSS sequencer MIDI device.
+<P>The events via MIDI devices are parsed in OSS sequencer and converted
+to the corresponding ALSA sequencer events. The input from MIDI sequencer
+is also converted to MIDI byte events by OSS sequencer. This works just
+a reverse way of seq_midi module.
+<H2>
+9. Known Problems / TODO's</H2>
+
+<UL>
+<LI>
+Patch loading via ALSA instrument layer is not implemented yet.</LI>
+</UL>
+
+</BODY>
+</HTML>
diff --git a/Documentation/sound/alsa/serial-u16550.txt b/Documentation/sound/alsa/serial-u16550.txt
new file mode 100644
index 0000000..c191955
--- /dev/null
+++ b/Documentation/sound/alsa/serial-u16550.txt
@@ -0,0 +1,88 @@
+
+ Serial UART 16450/16550 MIDI driver
+ ===================================
+
+The adaptor module parameter allows you to select either:
+
+ 0 - Roland Soundcanvas support (default)
+ 1 - Midiator MS-124T support (1)
+ 2 - Midiator MS-124W S/A mode (2)
+ 3 - MS-124W M/B mode support (3)
+ 4 - Generic device with multiple input support (4)
+
+For the Midiator MS-124W, you must set the physical M-S and A-B
+switches on the Midiator to match the driver mode you select.
+
+In Roland Soundcanvas mode, multiple ALSA raw MIDI substreams are supported
+(midiCnD0-midiCnD15). Whenever you write to a different substream, the driver
+sends the nonstandard MIDI command sequence F5 NN, where NN is the substream
+number plus 1. Roland modules use this command to switch between different
+"parts", so this feature lets you treat each part as a distinct raw MIDI
+substream. The driver provides no way to send F5 00 (no selection) or to not
+send the F5 NN command sequence at all; perhaps it ought to.
+
+Usage example for simple serial converter:
+
+ /sbin/setserial /dev/ttyS0 uart none
+ /sbin/modprobe snd-serial-u16550 port=0x3f8 irq=4 speed=115200
+
+Usage example for Roland SoundCanvas with 4 MIDI ports:
+
+ /sbin/setserial /dev/ttyS0 uart none
+ /sbin/modprobe snd-serial-u16550 port=0x3f8 irq=4 outs=4
+
+In MS-124T mode, one raw MIDI substream is supported (midiCnD0); the outs
+module parameter is automatically set to 1. The driver sends the same data to
+all four MIDI Out connectors. Set the A-B switch and the speed module
+parameter to match (A=19200, B=9600).
+
+Usage example for MS-124T, with A-B switch in A position:
+
+ /sbin/setserial /dev/ttyS0 uart none
+ /sbin/modprobe snd-serial-u16550 port=0x3f8 irq=4 adaptor=1 \
+ speed=19200
+
+In MS-124W S/A mode, one raw MIDI substream is supported (midiCnD0);
+the outs module parameter is automatically set to 1. The driver sends
+the same data to all four MIDI Out connectors at full MIDI speed.
+
+Usage example for S/A mode:
+
+ /sbin/setserial /dev/ttyS0 uart none
+ /sbin/modprobe snd-serial-u16550 port=0x3f8 irq=4 adaptor=2
+
+In MS-124W M/B mode, the driver supports 16 ALSA raw MIDI substreams;
+the outs module parameter is automatically set to 16. The substream
+number gives a bitmask of which MIDI Out connectors the data should be
+sent to, with midiCnD1 sending to Out 1, midiCnD2 to Out 2, midiCnD4 to
+Out 3, and midiCnD8 to Out 4. Thus midiCnD15 sends the data to all 4 ports.
+As a special case, midiCnD0 also sends to all ports, since it is not useful
+to send the data to no ports. M/B mode has extra overhead to select the MIDI
+Out for each byte, so the aggregate data rate across all four MIDI Outs is
+at most one byte every 520 us, as compared with the full MIDI data rate of
+one byte every 320 us per port.
+
+Usage example for M/B mode:
+
+ /sbin/setserial /dev/ttyS0 uart none
+ /sbin/modprobe snd-serial-u16550 port=0x3f8 irq=4 adaptor=3
+
+The MS-124W hardware's M/A mode is currently not supported. This mode allows
+the MIDI Outs to act independently at double the aggregate throughput of M/B,
+but does not allow sending the same byte simultaneously to multiple MIDI Outs.
+The M/A protocol requires the driver to twiddle the modem control lines under
+timing constraints, so it would be a bit more complicated to implement than
+the other modes.
+
+Midiator models other than MS-124W and MS-124T are currently not supported.
+Note that the suffix letter is significant; the MS-124 and MS-124B are not
+compatible, nor are the other known models MS-101, MS-101B, MS-103, and MS-114.
+I do have documentation (tim.mann@compaq.com) that partially covers these models,
+but no units to experiment with. The MS-124W support is tested with a real unit.
+The MS-124T support is untested, but should work.
+
+The Generic driver supports multiple input and output substreams over a single
+serial port. Similar to Roland Soundcanvas mode, F5 NN is used to select the
+appropriate input or output stream (depending on the data direction).
+Additionally, the CTS signal is used to regulate the data flow. The number of
+inputs is specified by the ins parameter.
diff --git a/Documentation/sound/alsa/soc/DAI.txt b/Documentation/sound/alsa/soc/DAI.txt
new file mode 100644
index 0000000..0ebd7ea
--- /dev/null
+++ b/Documentation/sound/alsa/soc/DAI.txt
@@ -0,0 +1,56 @@
+ASoC currently supports the three main Digital Audio Interfaces (DAI) found on
+SoC controllers and portable audio CODECs today, namely AC97, I2S and PCM.
+
+
+AC97
+====
+
+ AC97 is a five wire interface commonly found on many PC sound cards. It is
+now also popular in many portable devices. This DAI has a reset line and time
+multiplexes its data on its SDATA_OUT (playback) and SDATA_IN (capture) lines.
+The bit clock (BCLK) is always driven by the CODEC (usually 12.288MHz) and the
+frame (FRAME) (usually 48kHz) is always driven by the controller. Each AC97
+frame is 21uS long and is divided into 13 time slots.
+
+The AC97 specification can be found at :-
+http://www.intel.com/design/chipsets/audio/ac97_r23.pdf
+
+
+I2S
+===
+
+ I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and
+Rx lines are used for audio transmission, whilst the bit clock (BCLK) and
+left/right clock (LRC) synchronise the link. I2S is flexible in that either the
+controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock
+usually varies depending on the sample rate and the master system clock
+(SYSCLK). LRCLK is the same as the sample rate. A few devices support separate
+ADC and DAC LRCLKs, this allows for simultaneous capture and playback at
+different sample rates.
+
+I2S has several different operating modes:-
+
+ o I2S - MSB is transmitted on the falling edge of the first BCLK after LRC
+ transition.
+
+ o Left Justified - MSB is transmitted on transition of LRC.
+
+ o Right Justified - MSB is transmitted sample size BCLKs before LRC
+ transition.
+
+PCM
+===
+
+PCM is another 4 wire interface, very similar to I2S, which can support a more
+flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used
+to synchronise the link whilst the Tx and Rx lines are used to transmit and
+receive the audio data. Bit clock usually varies depending on sample rate
+whilst sync runs at the sample rate. PCM also supports Time Division
+Multiplexing (TDM) in that several devices can use the bus simultaneously (this
+is sometimes referred to as network mode).
+
+Common PCM operating modes:-
+
+ o Mode A - MSB is transmitted on falling edge of first BCLK after FRAME/SYNC.
+
+ o Mode B - MSB is transmitted on rising edge of FRAME/SYNC.
diff --git a/Documentation/sound/alsa/soc/clocking.txt b/Documentation/sound/alsa/soc/clocking.txt
new file mode 100644
index 0000000..b130016
--- /dev/null
+++ b/Documentation/sound/alsa/soc/clocking.txt
@@ -0,0 +1,51 @@
+Audio Clocking
+==============
+
+This text describes the audio clocking terms in ASoC and digital audio in
+general. Note: Audio clocking can be complex!
+
+
+Master Clock
+------------
+
+Every audio subsystem is driven by a master clock (sometimes referred to as MCLK
+or SYSCLK). This audio master clock can be derived from a number of sources
+(e.g. crystal, PLL, CPU clock) and is responsible for producing the correct
+audio playback and capture sample rates.
+
+Some master clocks (e.g. PLLs and CPU based clocks) are configurable in that
+their speed can be altered by software (depending on the system use and to save
+power). Other master clocks are fixed at a set frequency (i.e. crystals).
+
+
+DAI Clocks
+----------
+The Digital Audio Interface is usually driven by a Bit Clock (often referred to
+as BCLK). This clock is used to drive the digital audio data across the link
+between the codec and CPU.
+
+The DAI also has a frame clock to signal the start of each audio frame. This
+clock is sometimes referred to as LRC (left right clock) or FRAME. This clock
+runs at exactly the sample rate (LRC = Rate).
+
+Bit Clock can be generated as follows:-
+
+BCLK = MCLK / x
+
+ or
+
+BCLK = LRC * x
+
+ or
+
+BCLK = LRC * Channels * Word Size
+
+This relationship depends on the codec or SoC CPU in particular. In general
+it is best to configure BCLK to the lowest possible speed (depending on your
+rate, number of channels and word size) to save on power.
+
+It is also desirable to use the codec (if possible) to drive (or master) the
+audio clocks as it usually gives more accurate sample rates than the CPU.
+
+
+
diff --git a/Documentation/sound/alsa/soc/codec.txt b/Documentation/sound/alsa/soc/codec.txt
new file mode 100644
index 0000000..1e95342
--- /dev/null
+++ b/Documentation/sound/alsa/soc/codec.txt
@@ -0,0 +1,198 @@
+ASoC Codec Driver
+=================
+
+The codec driver is generic and hardware independent code that configures the
+codec to provide audio capture and playback. It should contain no code that is
+specific to the target platform or machine. All platform and machine specific
+code should be added to the platform and machine drivers respectively.
+
+Each codec driver *must* provide the following features:-
+
+ 1) Codec DAI and PCM configuration
+ 2) Codec control IO - using I2C, 3 Wire(SPI) or both APIs
+ 3) Mixers and audio controls
+ 4) Codec audio operations
+
+Optionally, codec drivers can also provide:-
+
+ 5) DAPM description.
+ 6) DAPM event handler.
+ 7) DAC Digital mute control.
+
+Its probably best to use this guide in conjunction with the existing codec
+driver code in sound/soc/codecs/
+
+ASoC Codec driver breakdown
+===========================
+
+1 - Codec DAI and PCM configuration
+-----------------------------------
+Each codec driver must have a struct snd_soc_codec_dai to define its DAI and
+PCM capabilities and operations. This struct is exported so that it can be
+registered with the core by your machine driver.
+
+e.g.
+
+struct snd_soc_codec_dai wm8731_dai = {
+ .name = "WM8731",
+ /* playback capabilities */
+ .playback = {
+ .stream_name = "Playback",
+ .channels_min = 1,
+ .channels_max = 2,
+ .rates = WM8731_RATES,
+ .formats = WM8731_FORMATS,},
+ /* capture capabilities */
+ .capture = {
+ .stream_name = "Capture",
+ .channels_min = 1,
+ .channels_max = 2,
+ .rates = WM8731_RATES,
+ .formats = WM8731_FORMATS,},
+ /* pcm operations - see section 4 below */
+ .ops = {
+ .prepare = wm8731_pcm_prepare,
+ .hw_params = wm8731_hw_params,
+ .shutdown = wm8731_shutdown,
+ },
+ /* DAI operations - see DAI.txt */
+ .dai_ops = {
+ .digital_mute = wm8731_mute,
+ .set_sysclk = wm8731_set_dai_sysclk,
+ .set_fmt = wm8731_set_dai_fmt,
+ }
+};
+EXPORT_SYMBOL_GPL(wm8731_dai);
+
+
+2 - Codec control IO
+--------------------
+The codec can usually be controlled via an I2C or SPI style interface
+(AC97 combines control with data in the DAI). The codec drivers provide
+functions to read and write the codec registers along with supplying a
+register cache:-
+
+ /* IO control data and register cache */
+ void *control_data; /* codec control (i2c/3wire) data */
+ void *reg_cache;
+
+Codec read/write should do any data formatting and call the hardware
+read write below to perform the IO. These functions are called by the
+core and ALSA when performing DAPM or changing the mixer:-
+
+ unsigned int (*read)(struct snd_soc_codec *, unsigned int);
+ int (*write)(struct snd_soc_codec *, unsigned int, unsigned int);
+
+Codec hardware IO functions - usually points to either the I2C, SPI or AC97
+read/write:-
+
+ hw_write_t hw_write;
+ hw_read_t hw_read;
+
+
+3 - Mixers and audio controls
+-----------------------------
+All the codec mixers and audio controls can be defined using the convenience
+macros defined in soc.h.
+
+ #define SOC_SINGLE(xname, reg, shift, mask, invert)
+
+Defines a single control as follows:-
+
+ xname = Control name e.g. "Playback Volume"
+ reg = codec register
+ shift = control bit(s) offset in register
+ mask = control bit size(s) e.g. mask of 7 = 3 bits
+ invert = the control is inverted
+
+Other macros include:-
+
+ #define SOC_DOUBLE(xname, reg, shift_left, shift_right, mask, invert)
+
+A stereo control
+
+ #define SOC_DOUBLE_R(xname, reg_left, reg_right, shift, mask, invert)
+
+A stereo control spanning 2 registers
+
+ #define SOC_ENUM_SINGLE(xreg, xshift, xmask, xtexts)
+
+Defines an single enumerated control as follows:-
+
+ xreg = register
+ xshift = control bit(s) offset in register
+ xmask = control bit(s) size
+ xtexts = pointer to array of strings that describe each setting
+
+ #define SOC_ENUM_DOUBLE(xreg, xshift_l, xshift_r, xmask, xtexts)
+
+Defines a stereo enumerated control
+
+
+4 - Codec Audio Operations
+--------------------------
+The codec driver also supports the following ALSA operations:-
+
+/* SoC audio ops */
+struct snd_soc_ops {
+ int (*startup)(struct snd_pcm_substream *);
+ void (*shutdown)(struct snd_pcm_substream *);
+ int (*hw_params)(struct snd_pcm_substream *, struct snd_pcm_hw_params *);
+ int (*hw_free)(struct snd_pcm_substream *);
+ int (*prepare)(struct snd_pcm_substream *);
+};
+
+Please refer to the ALSA driver PCM documentation for details.
+http://www.alsa-project.org/~iwai/writing-an-alsa-driver/c436.htm
+
+
+5 - DAPM description.
+---------------------
+The Dynamic Audio Power Management description describes the codec power
+components and their relationships and registers to the ASoC core.
+Please read dapm.txt for details of building the description.
+
+Please also see the examples in other codec drivers.
+
+
+6 - DAPM event handler
+----------------------
+This function is a callback that handles codec domain PM calls and system
+domain PM calls (e.g. suspend and resume). It is used to put the codec
+to sleep when not in use.
+
+Power states:-
+
+ SNDRV_CTL_POWER_D0: /* full On */
+ /* vref/mid, clk and osc on, active */
+
+ SNDRV_CTL_POWER_D1: /* partial On */
+ SNDRV_CTL_POWER_D2: /* partial On */
+
+ SNDRV_CTL_POWER_D3hot: /* Off, with power */
+ /* everything off except vref/vmid, inactive */
+
+ SNDRV_CTL_POWER_D3cold: /* Everything Off, without power */
+
+
+7 - Codec DAC digital mute control
+----------------------------------
+Most codecs have a digital mute before the DACs that can be used to
+minimise any system noise. The mute stops any digital data from
+entering the DAC.
+
+A callback can be created that is called by the core for each codec DAI
+when the mute is applied or freed.
+
+i.e.
+
+static int wm8974_mute(struct snd_soc_codec *codec,
+ struct snd_soc_codec_dai *dai, int mute)
+{
+ u16 mute_reg = wm8974_read_reg_cache(codec, WM8974_DAC) & 0xffbf;
+ if(mute)
+ wm8974_write(codec, WM8974_DAC, mute_reg | 0x40);
+ else
+ wm8974_write(codec, WM8974_DAC, mute_reg);
+ return 0;
+}
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
new file mode 100644
index 0000000..46f9684
--- /dev/null
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -0,0 +1,290 @@
+Dynamic Audio Power Management for Portable Devices
+===================================================
+
+1. Description
+==============
+
+Dynamic Audio Power Management (DAPM) is designed to allow portable
+Linux devices to use the minimum amount of power within the audio
+subsystem at all times. It is independent of other kernel PM and as
+such, can easily co-exist with the other PM systems.
+
+DAPM is also completely transparent to all user space applications as
+all power switching is done within the ASoC core. No code changes or
+recompiling are required for user space applications. DAPM makes power
+switching decisions based upon any audio stream (capture/playback)
+activity and audio mixer settings within the device.
+
+DAPM spans the whole machine. It covers power control within the entire
+audio subsystem, this includes internal codec power blocks and machine
+level power systems.
+
+There are 4 power domains within DAPM
+
+ 1. Codec domain - VREF, VMID (core codec and audio power)
+ Usually controlled at codec probe/remove and suspend/resume, although
+ can be set at stream time if power is not needed for sidetone, etc.
+
+ 2. Platform/Machine domain - physically connected inputs and outputs
+ Is platform/machine and user action specific, is configured by the
+ machine driver and responds to asynchronous events e.g when HP
+ are inserted
+
+ 3. Path domain - audio susbsystem signal paths
+ Automatically set when mixer and mux settings are changed by the user.
+ e.g. alsamixer, amixer.
+
+ 4. Stream domain - DACs and ADCs.
+ Enabled and disabled when stream playback/capture is started and
+ stopped respectively. e.g. aplay, arecord.
+
+All DAPM power switching decisions are made automatically by consulting an audio
+routing map of the whole machine. This map is specific to each machine and
+consists of the interconnections between every audio component (including
+internal codec components). All audio components that effect power are called
+widgets hereafter.
+
+
+2. DAPM Widgets
+===============
+
+Audio DAPM widgets fall into a number of types:-
+
+ o Mixer - Mixes several analog signals into a single analog signal.
+ o Mux - An analog switch that outputs only one of many inputs.
+ o PGA - A programmable gain amplifier or attenuation widget.
+ o ADC - Analog to Digital Converter
+ o DAC - Digital to Analog Converter
+ o Switch - An analog switch
+ o Input - A codec input pin
+ o Output - A codec output pin
+ o Headphone - Headphone (and optional Jack)
+ o Mic - Mic (and optional Jack)
+ o Line - Line Input/Output (and optional Jack)
+ o Speaker - Speaker
+ o Pre - Special PRE widget (exec before all others)
+ o Post - Special POST widget (exec after all others)
+
+(Widgets are defined in include/sound/soc-dapm.h)
+
+Widgets are usually added in the codec driver and the machine driver. There are
+convenience macros defined in soc-dapm.h that can be used to quickly build a
+list of widgets of the codecs and machines DAPM widgets.
+
+Most widgets have a name, register, shift and invert. Some widgets have extra
+parameters for stream name and kcontrols.
+
+
+2.1 Stream Domain Widgets
+-------------------------
+
+Stream Widgets relate to the stream power domain and only consist of ADCs
+(analog to digital converters) and DACs (digital to analog converters).
+
+Stream widgets have the following format:-
+
+SND_SOC_DAPM_DAC(name, stream name, reg, shift, invert),
+
+NOTE: the stream name must match the corresponding stream name in your codec
+snd_soc_codec_dai.
+
+e.g. stream widgets for HiFi playback and capture
+
+SND_SOC_DAPM_DAC("HiFi DAC", "HiFi Playback", REG, 3, 1),
+SND_SOC_DAPM_ADC("HiFi ADC", "HiFi Capture", REG, 2, 1),
+
+
+2.2 Path Domain Widgets
+-----------------------
+
+Path domain widgets have a ability to control or affect the audio signal or
+audio paths within the audio subsystem. They have the following form:-
+
+SND_SOC_DAPM_PGA(name, reg, shift, invert, controls, num_controls)
+
+Any widget kcontrols can be set using the controls and num_controls members.
+
+e.g. Mixer widget (the kcontrols are declared first)
+
+/* Output Mixer */
+static const snd_kcontrol_new_t wm8731_output_mixer_controls[] = {
+SOC_DAPM_SINGLE("Line Bypass Switch", WM8731_APANA, 3, 1, 0),
+SOC_DAPM_SINGLE("Mic Sidetone Switch", WM8731_APANA, 5, 1, 0),
+SOC_DAPM_SINGLE("HiFi Playback Switch", WM8731_APANA, 4, 1, 0),
+};
+
+SND_SOC_DAPM_MIXER("Output Mixer", WM8731_PWR, 4, 1, wm8731_output_mixer_controls,
+ ARRAY_SIZE(wm8731_output_mixer_controls)),
+
+
+2.3 Platform/Machine domain Widgets
+-----------------------------------
+
+Machine widgets are different from codec widgets in that they don't have a
+codec register bit associated with them. A machine widget is assigned to each
+machine audio component (non codec) that can be independently powered. e.g.
+
+ o Speaker Amp
+ o Microphone Bias
+ o Jack connectors
+
+A machine widget can have an optional call back.
+
+e.g. Jack connector widget for an external Mic that enables Mic Bias
+when the Mic is inserted:-
+
+static int spitz_mic_bias(struct snd_soc_dapm_widget* w, int event)
+{
+ gpio_set_value(SPITZ_GPIO_MIC_BIAS, SND_SOC_DAPM_EVENT_ON(event));
+ return 0;
+}
+
+SND_SOC_DAPM_MIC("Mic Jack", spitz_mic_bias),
+
+
+2.4 Codec Domain
+----------------
+
+The codec power domain has no widgets and is handled by the codecs DAPM event
+handler. This handler is called when the codec powerstate is changed wrt to any
+stream event or by kernel PM events.
+
+
+2.5 Virtual Widgets
+-------------------
+
+Sometimes widgets exist in the codec or machine audio map that don't have any
+corresponding soft power control. In this case it is necessary to create
+a virtual widget - a widget with no control bits e.g.
+
+SND_SOC_DAPM_MIXER("AC97 Mixer", SND_SOC_DAPM_NOPM, 0, 0, NULL, 0),
+
+This can be used to merge to signal paths together in software.
+
+After all the widgets have been defined, they can then be added to the DAPM
+subsystem individually with a call to snd_soc_dapm_new_control().
+
+
+3. Codec Widget Interconnections
+================================
+
+Widgets are connected to each other within the codec and machine by audio paths
+(called interconnections). Each interconnection must be defined in order to
+create a map of all audio paths between widgets.
+
+This is easiest with a diagram of the codec (and schematic of the machine audio
+system), as it requires joining widgets together via their audio signal paths.
+
+e.g., from the WM8731 output mixer (wm8731.c)
+
+The WM8731 output mixer has 3 inputs (sources)
+
+ 1. Line Bypass Input
+ 2. DAC (HiFi playback)
+ 3. Mic Sidetone Input
+
+Each input in this example has a kcontrol associated with it (defined in example
+above) and is connected to the output mixer via it's kcontrol name. We can now
+connect the destination widget (wrt audio signal) with it's source widgets.
+
+ /* output mixer */
+ {"Output Mixer", "Line Bypass Switch", "Line Input"},
+ {"Output Mixer", "HiFi Playback Switch", "DAC"},
+ {"Output Mixer", "Mic Sidetone Switch", "Mic Bias"},
+
+So we have :-
+
+ Destination Widget <=== Path Name <=== Source Widget
+
+Or:-
+
+ Sink, Path, Source
+
+Or :-
+
+ "Output Mixer" is connected to the "DAC" via the "HiFi Playback Switch".
+
+When there is no path name connecting widgets (e.g. a direct connection) we
+pass NULL for the path name.
+
+Interconnections are created with a call to:-
+
+snd_soc_dapm_connect_input(codec, sink, path, source);
+
+Finally, snd_soc_dapm_new_widgets(codec) must be called after all widgets and
+interconnections have been registered with the core. This causes the core to
+scan the codec and machine so that the internal DAPM state matches the
+physical state of the machine.
+
+
+3.1 Machine Widget Interconnections
+-----------------------------------
+Machine widget interconnections are created in the same way as codec ones and
+directly connect the codec pins to machine level widgets.
+
+e.g. connects the speaker out codec pins to the internal speaker.
+
+ /* ext speaker connected to codec pins LOUT2, ROUT2 */
+ {"Ext Spk", NULL , "ROUT2"},
+ {"Ext Spk", NULL , "LOUT2"},
+
+This allows the DAPM to power on and off pins that are connected (and in use)
+and pins that are NC respectively.
+
+
+4 Endpoint Widgets
+===================
+An endpoint is a start or end point (widget) of an audio signal within the
+machine and includes the codec. e.g.
+
+ o Headphone Jack
+ o Internal Speaker
+ o Internal Mic
+ o Mic Jack
+ o Codec Pins
+
+When a codec pin is NC it can be marked as not used with a call to
+
+snd_soc_dapm_set_endpoint(codec, "Widget Name", 0);
+
+The last argument is 0 for inactive and 1 for active. This way the pin and its
+input widget will never be powered up and consume power.
+
+This also applies to machine widgets. e.g. if a headphone is connected to a
+jack then the jack can be marked active. If the headphone is removed, then
+the headphone jack can be marked inactive.
+
+
+5 DAPM Widget Events
+====================
+
+Some widgets can register their interest with the DAPM core in PM events.
+e.g. A Speaker with an amplifier registers a widget so the amplifier can be
+powered only when the spk is in use.
+
+/* turn speaker amplifier on/off depending on use */
+static int corgi_amp_event(struct snd_soc_dapm_widget *w, int event)
+{
+ gpio_set_value(CORGI_GPIO_APM_ON, SND_SOC_DAPM_EVENT_ON(event));
+ return 0;
+}
+
+/* corgi machine dapm widgets */
+static const struct snd_soc_dapm_widget wm8731_dapm_widgets =
+ SND_SOC_DAPM_SPK("Ext Spk", corgi_amp_event);
+
+Please see soc-dapm.h for all other widgets that support events.
+
+
+5.1 Event types
+---------------
+
+The following event types are supported by event widgets.
+
+/* dapm event types */
+#define SND_SOC_DAPM_PRE_PMU 0x1 /* before widget power up */
+#define SND_SOC_DAPM_POST_PMU 0x2 /* after widget power up */
+#define SND_SOC_DAPM_PRE_PMD 0x4 /* before widget power down */
+#define SND_SOC_DAPM_POST_PMD 0x8 /* after widget power down */
+#define SND_SOC_DAPM_PRE_REG 0x10 /* before audio path setup */
+#define SND_SOC_DAPM_POST_REG 0x20 /* after audio path setup */
diff --git a/Documentation/sound/alsa/soc/machine.txt b/Documentation/sound/alsa/soc/machine.txt
new file mode 100644
index 0000000..f370e7d
--- /dev/null
+++ b/Documentation/sound/alsa/soc/machine.txt
@@ -0,0 +1,113 @@
+ASoC Machine Driver
+===================
+
+The ASoC machine (or board) driver is the code that glues together the platform
+and codec drivers.
+
+The machine driver can contain codec and platform specific code. It registers
+the audio subsystem with the kernel as a platform device and is represented by
+the following struct:-
+
+/* SoC machine */
+struct snd_soc_machine {
+ char *name;
+
+ int (*probe)(struct platform_device *pdev);
+ int (*remove)(struct platform_device *pdev);
+
+ /* the pre and post PM functions are used to do any PM work before and
+ * after the codec and DAIs do any PM work. */
+ int (*suspend_pre)(struct platform_device *pdev, pm_message_t state);
+ int (*suspend_post)(struct platform_device *pdev, pm_message_t state);
+ int (*resume_pre)(struct platform_device *pdev);
+ int (*resume_post)(struct platform_device *pdev);
+
+ /* machine stream operations */
+ struct snd_soc_ops *ops;
+
+ /* CPU <--> Codec DAI links */
+ struct snd_soc_dai_link *dai_link;
+ int num_links;
+};
+
+probe()/remove()
+----------------
+probe/remove are optional. Do any machine specific probe here.
+
+
+suspend()/resume()
+------------------
+The machine driver has pre and post versions of suspend and resume to take care
+of any machine audio tasks that have to be done before or after the codec, DAIs
+and DMA is suspended and resumed. Optional.
+
+
+Machine operations
+------------------
+The machine specific audio operations can be set here. Again this is optional.
+
+
+Machine DAI Configuration
+-------------------------
+The machine DAI configuration glues all the codec and CPU DAIs together. It can
+also be used to set up the DAI system clock and for any machine related DAI
+initialisation e.g. the machine audio map can be connected to the codec audio
+map, unconnected codec pins can be set as such. Please see corgi.c, spitz.c
+for examples.
+
+struct snd_soc_dai_link is used to set up each DAI in your machine. e.g.
+
+/* corgi digital audio interface glue - connects codec <--> CPU */
+static struct snd_soc_dai_link corgi_dai = {
+ .name = "WM8731",
+ .stream_name = "WM8731",
+ .cpu_dai = &pxa_i2s_dai,
+ .codec_dai = &wm8731_dai,
+ .init = corgi_wm8731_init,
+ .ops = &corgi_ops,
+};
+
+struct snd_soc_machine then sets up the machine with it's DAIs. e.g.
+
+/* corgi audio machine driver */
+static struct snd_soc_machine snd_soc_machine_corgi = {
+ .name = "Corgi",
+ .dai_link = &corgi_dai,
+ .num_links = 1,
+};
+
+
+Machine Audio Subsystem
+-----------------------
+
+The machine soc device glues the platform, machine and codec driver together.
+Private data can also be set here. e.g.
+
+/* corgi audio private data */
+static struct wm8731_setup_data corgi_wm8731_setup = {
+ .i2c_address = 0x1b,
+};
+
+/* corgi audio subsystem */
+static struct snd_soc_device corgi_snd_devdata = {
+ .machine = &snd_soc_machine_corgi,
+ .platform = &pxa2xx_soc_platform,
+ .codec_dev = &soc_codec_dev_wm8731,
+ .codec_data = &corgi_wm8731_setup,
+};
+
+
+Machine Power Map
+-----------------
+
+The machine driver can optionally extend the codec power map and to become an
+audio power map of the audio subsystem. This allows for automatic power up/down
+of speaker/HP amplifiers, etc. Codec pins can be connected to the machines jack
+sockets in the machine init function. See soc/pxa/spitz.c and dapm.txt for
+details.
+
+
+Machine Controls
+----------------
+
+Machine specific audio mixer controls can be added in the DAI init function.
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt
new file mode 100644
index 0000000..1e4c6d3
--- /dev/null
+++ b/Documentation/sound/alsa/soc/overview.txt
@@ -0,0 +1,86 @@
+ALSA SoC Layer
+==============
+
+The overall project goal of the ALSA System on Chip (ASoC) layer is to
+provide better ALSA support for embedded system-on-chip processors (e.g.
+pxa2xx, au1x00, iMX, etc) and portable audio codecs. Prior to the ASoC
+subsystem there was some support in the kernel for SoC audio, however it
+had some limitations:-
+
+ * Codec drivers were often tightly coupled to the underlying SoC
+ CPU. This is not ideal and leads to code duplication - for example,
+ Linux had different wm8731 drivers for 4 different SoC platforms.
+
+ * There was no standard method to signal user initiated audio events (e.g.
+ Headphone/Mic insertion, Headphone/Mic detection after an insertion
+ event). These are quite common events on portable devices and often require
+ machine specific code to re-route audio, enable amps, etc., after such an
+ event.
+
+ * Drivers tended to power up the entire codec when playing (or
+ recording) audio. This is fine for a PC, but tends to waste a lot of
+ power on portable devices. There was also no support for saving
+ power via changing codec oversampling rates, bias currents, etc.
+
+
+ASoC Design
+===========
+
+The ASoC layer is designed to address these issues and provide the following
+features :-
+
+ * Codec independence. Allows reuse of codec drivers on other platforms
+ and machines.
+
+ * Easy I2S/PCM audio interface setup between codec and SoC. Each SoC
+ interface and codec registers it's audio interface capabilities with the
+ core and are subsequently matched and configured when the application
+ hardware parameters are known.
+
+ * Dynamic Audio Power Management (DAPM). DAPM automatically sets the codec to
+ its minimum power state at all times. This includes powering up/down
+ internal power blocks depending on the internal codec audio routing and any
+ active streams.
+
+ * Pop and click reduction. Pops and clicks can be reduced by powering the
+ codec up/down in the correct sequence (including using digital mute). ASoC
+ signals the codec when to change power states.
+
+ * Machine specific controls: Allow machines to add controls to the sound card
+ (e.g. volume control for speaker amplifier).
+
+To achieve all this, ASoC basically splits an embedded audio system into 3
+components :-
+
+ * Codec driver: The codec driver is platform independent and contains audio
+ controls, audio interface capabilities, codec DAPM definition and codec IO
+ functions.
+
+ * Platform driver: The platform driver contains the audio DMA engine and audio
+ interface drivers (e.g. I2S, AC97, PCM) for that platform.
+
+ * Machine driver: The machine driver handles any machine specific controls and
+ audio events (e.g. turning on an amp at start of playback).
+
+
+Documentation
+=============
+
+The documentation is spilt into the following sections:-
+
+overview.txt: This file.
+
+codec.txt: Codec driver internals.
+
+DAI.txt: Description of Digital Audio Interface standards and how to configure
+a DAI within your codec and CPU DAI drivers.
+
+dapm.txt: Dynamic Audio Power Management
+
+platform.txt: Platform audio DMA and DAI.
+
+machine.txt: Machine driver internals.
+
+pop_clicks.txt: How to minimise audio artifacts.
+
+clocking.txt: ASoC clocking for best power performance.
diff --git a/Documentation/sound/alsa/soc/platform.txt b/Documentation/sound/alsa/soc/platform.txt
new file mode 100644
index 0000000..b681d17
--- /dev/null
+++ b/Documentation/sound/alsa/soc/platform.txt
@@ -0,0 +1,58 @@
+ASoC Platform Driver
+====================
+
+An ASoC platform driver can be divided into audio DMA and SoC DAI configuration
+and control. The platform drivers only target the SoC CPU and must have no board
+specific code.
+
+Audio DMA
+=========
+
+The platform DMA driver optionally supports the following ALSA operations:-
+
+/* SoC audio ops */
+struct snd_soc_ops {
+ int (*startup)(struct snd_pcm_substream *);
+ void (*shutdown)(struct snd_pcm_substream *);
+ int (*hw_params)(struct snd_pcm_substream *, struct snd_pcm_hw_params *);
+ int (*hw_free)(struct snd_pcm_substream *);
+ int (*prepare)(struct snd_pcm_substream *);
+ int (*trigger)(struct snd_pcm_substream *, int);
+};
+
+The platform driver exports its DMA functionality via struct snd_soc_platform:-
+
+struct snd_soc_platform {
+ char *name;
+
+ int (*probe)(struct platform_device *pdev);
+ int (*remove)(struct platform_device *pdev);
+ int (*suspend)(struct platform_device *pdev, struct snd_soc_cpu_dai *cpu_dai);
+ int (*resume)(struct platform_device *pdev, struct snd_soc_cpu_dai *cpu_dai);
+
+ /* pcm creation and destruction */
+ int (*pcm_new)(struct snd_card *, struct snd_soc_codec_dai *, struct snd_pcm *);
+ void (*pcm_free)(struct snd_pcm *);
+
+ /* platform stream ops */
+ struct snd_pcm_ops *pcm_ops;
+};
+
+Please refer to the ALSA driver documentation for details of audio DMA.
+http://www.alsa-project.org/~iwai/writing-an-alsa-driver/c436.htm
+
+An example DMA driver is soc/pxa/pxa2xx-pcm.c
+
+
+SoC DAI Drivers
+===============
+
+Each SoC DAI driver must provide the following features:-
+
+ 1) Digital audio interface (DAI) description
+ 2) Digital audio interface configuration
+ 3) PCM's description
+ 4) SYSCLK configuration
+ 5) Suspend and resume (optional)
+
+Please see codec.txt for a description of items 1 - 4.
diff --git a/Documentation/sound/alsa/soc/pops_clicks.txt b/Documentation/sound/alsa/soc/pops_clicks.txt
new file mode 100644
index 0000000..e1e74da
--- /dev/null
+++ b/Documentation/sound/alsa/soc/pops_clicks.txt
@@ -0,0 +1,52 @@
+Audio Pops and Clicks
+=====================
+
+Pops and clicks are unwanted audio artifacts caused by the powering up and down
+of components within the audio subsystem. This is noticeable on PCs when an
+audio module is either loaded or unloaded (at module load time the sound card is
+powered up and causes a popping noise on the speakers).
+
+Pops and clicks can be more frequent on portable systems with DAPM. This is
+because the components within the subsystem are being dynamically powered
+depending on the audio usage and this can subsequently cause a small pop or
+click every time a component power state is changed.
+
+
+Minimising Playback Pops and Clicks
+===================================
+
+Playback pops in portable audio subsystems cannot be completely eliminated
+currently, however future audio codec hardware will have better pop and click
+suppression. Pops can be reduced within playback by powering the audio
+components in a specific order. This order is different for startup and
+shutdown and follows some basic rules:-
+
+ Startup Order :- DAC --> Mixers --> Output PGA --> Digital Unmute
+
+ Shutdown Order :- Digital Mute --> Output PGA --> Mixers --> DAC
+
+This assumes that the codec PCM output path from the DAC is via a mixer and then
+a PGA (programmable gain amplifier) before being output to the speakers.
+
+
+Minimising Capture Pops and Clicks
+==================================
+
+Capture artifacts are somewhat easier to get rid as we can delay activating the
+ADC until all the pops have occurred. This follows similar power rules to
+playback in that components are powered in a sequence depending upon stream
+startup or shutdown.
+
+ Startup Order - Input PGA --> Mixers --> ADC
+
+ Shutdown Order - ADC --> Mixers --> Input PGA
+
+
+Zipper Noise
+============
+An unwanted zipper noise can occur within the audio playback or capture stream
+when a volume control is changed near its maximum gain value. The zipper noise
+is heard when the gain increase or decrease changes the mean audio signal
+amplitude too quickly. It can be minimised by enabling the zero cross setting
+for each volume control. The ZC forces the gain change to occur when the signal
+crosses the zero amplitude line.
diff --git a/Documentation/sound/oss/ALS b/Documentation/sound/oss/ALS
new file mode 100644
index 0000000..d01ffbf
--- /dev/null
+++ b/Documentation/sound/oss/ALS
@@ -0,0 +1,66 @@
+ALS-007/ALS-100/ALS-200 based sound cards
+=========================================
+
+Support for sound cards based around the Avance Logic
+ALS-007/ALS-100/ALS-200 chip is included. These chips are a single
+chip PnP sound solution which is mostly hardware compatible with the
+Sound Blaster 16 card, with most differences occurring in the use of
+the mixer registers. For this reason the ALS code is integrated
+as part of the Sound Blaster 16 driver (adding only 800 bytes to the
+SB16 driver).
+
+To use an ALS sound card under Linux, enable the following options as
+modules in the sound configuration section of the kernel config:
+ - 100% Sound Blaster compatibles (SB16/32/64, ESS, Jazz16) support
+ - FM synthesizer (YM3812/OPL-3) support
+ - standalone MPU401 support may be required for some cards; for the
+ ALS-007, when using isapnptools, it is required
+Since the ALS-007/100/200 are PnP cards, ISAPnP support should probably be
+compiled in. If kernel level PnP support is not included, isapnptools will
+be required to configure the card before the sound modules are loaded.
+
+When using kernel level ISAPnP, the kernel should correctly identify and
+configure all resources required by the card when the "sb" module is
+inserted. Note that the ALS-007 does not have a 16 bit DMA channel and that
+the MPU401 interface on this card uses a different interrupt to the audio
+section. This should all be correctly configured by the kernel; if problems
+with the MPU401 interface surface, try using the standalone MPU401 module,
+passing "0" as the "sb" module's "mpu_io" module parameter to prevent the
+soundblaster driver attempting to register the MPU401 itself. The onboard
+synth device can be accessed using the "opl3" module.
+
+If isapnptools is used to wake up the sound card (as in 2.2.x), the settings
+of the card's resources should be passed to the kernel modules ("sb", "opl3"
+and "mpu401") using the module parameters. When configuring an ALS-007, be
+sure to specify different IRQs for the audio and MPU401 sections - this card
+requires they be different. For "sb", "io", "irq" and "dma" should be set
+to the same values used to configure the audio section of the card with
+isapnp. "dma16" should be explicitly set to "-1" for an ALS-007 since this
+card does not have a 16 bit dma channel; if not specified the kernel will
+default to using channel 5 anyway which will cause audio not to work.
+"mpu_io" should be set to 0. The "io" parameter of the "opl3" module should
+also agree with the setting used by isapnp. To get the MPU401 interface
+working on an ALS-007 card, the "mpu401" module will be required since this
+card uses separate IRQs for the audio and MPU401 sections and there is no
+parameter available to pass a different IRQ to the "sb" driver (whose
+inbuilt MPU401 driver would otherwise be fine). Insert the mpu401 module
+passing appropriate values using the "io" and "irq" parameters.
+
+The resulting sound driver will provide the following capabilities:
+ - 8 and 16 bit audio playback
+ - 8 and 16 bit audio recording
+ - Software selection of record source (line in, CD, FM, mic, master)
+ - Record and playback of midi data via the external MPU-401
+ - Playback of midi data using inbuilt FM synthesizer
+ - Control of the ALS-007 mixer via any OSS-compatible mixer programs.
+ Controls available are Master (L&R), Line in (L&R), CD (L&R),
+ DSP/PCM/audio out (L&R), FM (L&R) and Mic in (mono).
+
+Jonathan Woithe
+jwoithe@physics.adelaide.edu.au
+30 March 1998
+
+Modified 2000-02-26 by Dave Forrest, drf5n@virginia.edu to add ALS100/ALS200
+Modified 2000-04-10 by Paul Laufer, pelaufer@csupomona.edu to add ISAPnP info.
+Modified 2000-11-19 by Jonathan Woithe, jwoithe@physics.adelaide.edu.au
+ - updated information for kernel 2.4.x.
diff --git a/Documentation/sound/oss/AudioExcelDSP16 b/Documentation/sound/oss/AudioExcelDSP16
new file mode 100644
index 0000000..c0f0892
--- /dev/null
+++ b/Documentation/sound/oss/AudioExcelDSP16
@@ -0,0 +1,101 @@
+Driver
+------
+
+Informations about Audio Excel DSP 16 driver can be found in the source
+file aedsp16.c
+Please, read the head of the source before using it. It contain useful
+informations.
+
+Configuration
+-------------
+
+The Audio Excel configuration, is now done with the standard Linux setup.
+You have to configure the sound card (Sound Blaster or Microsoft Sound System)
+and, if you want it, the Roland MPU-401 (do not use the Sound Blaster MPU-401,
+SB-MPU401) in the main driver menu. Activate the lowlevel drivers then select
+the Audio Excel hardware that you want to initialize. Check the IRQ/DMA/MIRQ
+of the Audio Excel initialization: it must be the same as the SBPRO (or MSS)
+setup. If the parameters are different, correct it.
+I you own a Gallant's audio card based on SC-6600, activate the SC-6600 support.
+If you want to change the configuration of the sound board, be sure to
+check off all the configuration items before re-configure it.
+
+Module parameters
+-----------------
+To use this driver as a module, you must configure some module parameters, to
+set up I/O addresses, IRQ lines and DMA channels. Some parameters are
+mandatory while some others are optional. Here a list of parameters you can
+use with this module:
+
+Name Description
+==== ===========
+MANDATORY
+io I/O base address (0x220 or 0x240)
+irq irq line (5, 7, 9, 10 or 11)
+dma dma channel (0, 1 or 3)
+
+OPTIONAL
+mss_base I/O base address for activate MSS mode (default SBPRO)
+ (0x530 or 0xE80)
+mpu_base I/O base address for activate MPU-401 mode
+ (0x300, 0x310, 0x320 or 0x330)
+mpu_irq MPU-401 irq line (5, 7, 9, 10 or 0)
+
+The /etc/modprobe.conf will have lines like this:
+
+options opl3 io=0x388
+options ad1848 io=0x530 irq=11 dma=3
+options aedsp16 io=0x220 irq=11 dma=3 mss_base=0x530
+
+Where the aedsp16 options are the options for this driver while opl3 and
+ad1848 are the corresponding options for the MSS and OPL3 modules.
+
+Loading MSS and OPL3 needs to pre load the aedsp16 module to set up correctly
+the sound card. Installation dependencies must be written in the modprobe.conf
+file:
+
+install ad1848 /sbin/modprobe aedsp16 && /sbin/modprobe -i ad1848
+install opl3 /sbin/modprobe aedsp16 && /sbin/modprobe -i opl3
+
+Then you must load the sound modules stack in this order:
+sound -> aedsp16 -> [ ad1848, opl3 ]
+
+With the above configuration, loading ad1848 or opl3 modules, will
+automatically load all the sound stack.
+
+Sound cards supported
+---------------------
+This driver supports the SC-6000 and SC-6600 based Gallant's sound card.
+It don't support the Audio Excel DSP 16 III (try the SC-6600 code).
+I'm working on the III version of the card: if someone have useful
+informations about it, please let me know.
+For all the non-supported audio cards, you have to boot MS-DOS (or WIN95)
+activating the audio card with the MS-DOS device driver, then you have to
+<ctrl>-<alt>-<del> and boot Linux.
+Follow these steps:
+
+1) Compile Linux kernel with standard sound driver, using the emulation
+ you want, with the parameters of your audio card,
+ e.g. Microsoft Sound System irq10 dma3
+2) Install your new kernel as the default boot kernel.
+3) Boot MS-DOS and configure the audio card with the boot time device
+ driver, for MSS irq10 dma3 in our example.
+4) <ctrl>-<alt>-<del> and boot Linux. This will maintain the DOS configuration
+ and will boot the new kernel with sound driver. The sound driver will find
+ the audio card and will recognize and attach it.
+
+Reports on User successes
+-------------------------
+
+> Date: Mon, 29 Jul 1996 08:35:40 +0100
+> From: Mr S J Greenaway <sjg95@unixfe.rl.ac.uk>
+> To: riccardo@cdc8g5.cdc.polimi.it (Riccardo Facchetti)
+> Subject: Re: Audio Excel DSP 16 initialization code
+>
+> Just to let you know got my Audio Excel (emulating a MSS) working
+> with my original SB16, thanks for the driver!
+
+
+Last revised: 20 August 1998
+Riccardo Facchetti
+fizban@tin.it
diff --git a/Documentation/sound/oss/CMI8330 b/Documentation/sound/oss/CMI8330
new file mode 100644
index 0000000..9c439f1
--- /dev/null
+++ b/Documentation/sound/oss/CMI8330
@@ -0,0 +1,153 @@
+Documentation for CMI 8330 (SoundPRO)
+-------------------------------------
+Alessandro Zummo <azummo@ita.flashnet.it>
+
+( Be sure to read Documentation/sound/oss/SoundPro too )
+
+
+This adapter is now directly supported by the sb driver.
+
+ The only thing you have to do is to compile the kernel sound
+support as a module and to enable kernel ISAPnP support,
+as shown below.
+
+
+CONFIG_SOUND=m
+CONFIG_SOUND_SB=m
+
+CONFIG_PNP=y
+CONFIG_ISAPNP=y
+
+
+and optionally:
+
+
+CONFIG_SOUND_MPU401=m
+
+ for MPU401 support.
+
+
+(I suggest you to use "make menuconfig" or "make xconfig"
+ for a more comfortable configuration editing)
+
+
+
+Then you can do
+
+ modprobe sb
+
+and everything will be (hopefully) configured.
+
+You should get something similar in syslog:
+
+sb: CMI8330 detected.
+sb: CMI8330 sb base located at 0x220
+sb: CMI8330 mpu base located at 0x330
+sb: CMI8330 mail reports to Alessandro Zummo <azummo@ita.flashnet.it>
+sb: ISAPnP reports CMI 8330 SoundPRO at i/o 0x220, irq 7, dma 1,5
+
+
+
+
+The old documentation file follows for reference
+purposes.
+
+
+How to enable CMI 8330 (SOUNDPRO) soundchip on Linux
+------------------------------------------
+Stefan Laudat <Stefan.Laudat@asit.ro>
+
+[Note: The CMI 8338 is unrelated and is supported by cmpci.o]
+
+
+ In order to use CMI8330 under Linux you just have to use a proper isapnp.conf, a good isapnp and a little bit of patience. I use isapnp 1.17, but
+you may get a better one I guess at http://www.roestock.demon.co.uk/isapnptools/.
+
+ Of course you will have to compile kernel sound support as module, as shown below:
+
+CONFIG_SOUND=m
+CONFIG_SOUND_OSS=m
+CONFIG_SOUND_SB=m
+CONFIG_SOUND_ADLIB=m
+CONFIG_SOUND_MPU401=m
+# Mikro$chaft sound system (kinda useful here ;))
+CONFIG_SOUND_MSS=m
+
+ The /etc/isapnp.conf file will be:
+
+<snip below>
+
+
+(READPORT 0x0203)
+(ISOLATE PRESERVE)
+(IDENTIFY *)
+(VERBOSITY 2)
+(CONFLICT (IO FATAL)(IRQ FATAL)(DMA FATAL)(MEM FATAL)) # or WARNING
+(VERIFYLD N)
+
+
+# WSS
+
+(CONFIGURE CMI0001/16777472 (LD 0
+(IO 0 (SIZE 8) (BASE 0x0530))
+(IO 1 (SIZE 8) (BASE 0x0388))
+(INT 0 (IRQ 7 (MODE +E)))
+(DMA 0 (CHANNEL 0))
+(NAME "CMI0001/16777472[0]{CMI8330/C3D Audio Adapter}")
+(ACT Y)
+))
+
+# MPU
+
+(CONFIGURE CMI0001/16777472 (LD 1
+(IO 0 (SIZE 2) (BASE 0x0330))
+(INT 0 (IRQ 11 (MODE +E)))
+(NAME "CMI0001/16777472[1]{CMI8330/C3D Audio Adapter}")
+(ACT Y)
+))
+
+# Joystick
+
+(CONFIGURE CMI0001/16777472 (LD 2
+(IO 0 (SIZE 8) (BASE 0x0200))
+(NAME "CMI0001/16777472[2]{CMI8330/C3D Audio Adapter}")
+(ACT Y)
+))
+
+# SoundBlaster
+
+(CONFIGURE CMI0001/16777472 (LD 3
+(IO 0 (SIZE 16) (BASE 0x0220))
+(INT 0 (IRQ 5 (MODE +E)))
+(DMA 0 (CHANNEL 1))
+(DMA 1 (CHANNEL 5))
+(NAME "CMI0001/16777472[3]{CMI8330/C3D Audio Adapter}")
+(ACT Y)
+))
+
+
+(WAITFORKEY)
+
+<end of snip>
+
+ The module sequence is trivial:
+
+/sbin/insmod soundcore
+/sbin/insmod sound
+/sbin/insmod uart401
+# insert this first
+/sbin/insmod ad1848 io=0x530 irq=7 dma=0 soundpro=1
+# The sb module is an alternative to the ad1848 (Microsoft Sound System)
+# Anyhow, this is full duplex and has MIDI
+/sbin/insmod sb io=0x220 dma=1 dma16=5 irq=5 mpu_io=0x330
+
+
+
+Alma Chao <elysian@ethereal.torsion.org> suggests the following /etc/modprobe.conf:
+
+alias sound ad1848
+alias synth0 opl3
+options ad1848 io=0x530 irq=7 dma=0 soundpro=1
+options opl3 io=0x388
+
+
diff --git a/Documentation/sound/oss/CS4232 b/Documentation/sound/oss/CS4232
new file mode 100644
index 0000000..7d6af7a
--- /dev/null
+++ b/Documentation/sound/oss/CS4232
@@ -0,0 +1,23 @@
+To configure the Crystal CS423x sound chip and activate its DSP functions,
+modules may be loaded in this order:
+
+ modprobe sound
+ insmod ad1848
+ insmod uart401
+ insmod cs4232 io=* irq=* dma=* dma2=*
+
+This is the meaning of the parameters:
+
+ io--I/O address of the Windows Sound System (normally 0x534)
+ irq--IRQ of this device
+ dma and dma2--DMA channels (DMA2 may be 0)
+
+On some cards, the board attempts to do non-PnP setup, and fails. If you
+have problems, use Linux' PnP facilities.
+
+To get MIDI facilities add
+
+ insmod opl3 io=*
+
+where "io" is the I/O address of the OPL3 synthesizer. This will be shown
+in /proc/sys/pnp and is normally 0x388.
diff --git a/Documentation/sound/oss/ESS b/Documentation/sound/oss/ESS
new file mode 100644
index 0000000..bba93b4
--- /dev/null
+++ b/Documentation/sound/oss/ESS
@@ -0,0 +1,34 @@
+Documentation for the ESS AudioDrive chips
+
+In 2.4 kernels the SoundBlaster driver not only tries to detect an ESS chip, it
+tries to detect the type of ESS chip too. The correct detection of the chip
+doesn't always succeed however, so unless you use the kernel isapnp facilities
+(and you chip is pnp capable) the default behaviour is 2.0 behaviour which
+means: only detect ES688 and ES1688.
+
+All ESS chips now have a recording level setting. This is a need-to-have for
+people who want to use their ESS for recording sound.
+
+Every chip that's detected as a later-than-es1688 chip has a 6 bits logarithmic
+master volume control.
+
+Every chip that's detected as a ES1887 now has Full Duplex support. Made a
+little testprogram that shows that is works, haven't seen a real program that
+needs this however.
+
+For ESS chips an additional parameter "esstype" can be specified. This controls
+the (auto) detection of the ESS chips. It can have 3 kinds of values:
+
+-1 Act like 2.0 kernels: only detect ES688 or ES1688.
+0 Try to auto-detect the chip (may fail for ES1688)
+688 The chip will be treated as ES688
+1688 ,, ,, ,, ,, ,, ,, ES1688
+1868 ,, ,, ,, ,, ,, ,, ES1868
+1869 ,, ,, ,, ,, ,, ,, ES1869
+1788 ,, ,, ,, ,, ,, ,, ES1788
+1887 ,, ,, ,, ,, ,, ,, ES1887
+1888 ,, ,, ,, ,, ,, ,, ES1888
+
+Because Full Duplex is supported for ES1887 you can specify a second DMA
+channel by specifying module parameter dma16. It can be one of: 0, 1, 3 or 5.
+
diff --git a/Documentation/sound/oss/ESS1868 b/Documentation/sound/oss/ESS1868
new file mode 100644
index 0000000..55e922f
--- /dev/null
+++ b/Documentation/sound/oss/ESS1868
@@ -0,0 +1,55 @@
+Documentation for the ESS1868F AudioDrive PnP sound card
+
+The ESS1868 sound card is a PnP ESS1688-compatible 16-bit sound card.
+
+It should be automatically detected by the Linux Kernel isapnp support when you
+load the sb.o module. Otherwise you should take care of:
+
+ * The ESS1868 does not allow use of a 16-bit DMA, thus DMA 0, 1, 2, and 3
+ may only be used.
+
+ * isapnptools version 1.14 does work with ESS1868. Earlier versions might
+ not.
+
+ * Sound support MUST be compiled as MODULES, not statically linked
+ into the kernel.
+
+
+NOTE: this is only needed when not using the kernel isapnp support!
+
+For configuring the sound card's I/O addresses, IRQ and DMA, here is a
+sample copy of the isapnp.conf directives regarding the ESS1868:
+
+(CONFIGURE ESS1868/-1 (LD 1
+(IO 0 (BASE 0x0220))
+(IO 1 (BASE 0x0388))
+(IO 2 (BASE 0x0330))
+(DMA 0 (CHANNEL 1))
+(INT 0 (IRQ 5 (MODE +E)))
+(ACT Y)
+))
+
+(for a full working isapnp.conf file, remember the
+(ISOLATE)
+(IDENTIFY *)
+at the beginning and the
+(WAITFORKEY)
+at the end.)
+
+In this setup, the main card I/O is 0x0220, FM synthesizer is 0x0388, and
+the MPU-401 MIDI port is located at 0x0330. IRQ is IRQ 5, DMA is channel 1.
+
+After configuring the sound card via isapnp, to use the card you must load
+the sound modules with the proper I/O information. Here is my setup:
+
+# ESS1868F AudioDrive initialization
+
+/sbin/modprobe sound
+/sbin/insmod uart401
+/sbin/insmod sb io=0x220 irq=5 dma=1 dma16=-1
+/sbin/insmod mpu401 io=0x330
+/sbin/insmod opl3 io=0x388
+/sbin/insmod v_midi
+
+opl3 is the FM synthesizer
+/sbin/insmod opl3 io=0x388
diff --git a/Documentation/sound/oss/Introduction b/Documentation/sound/oss/Introduction
new file mode 100644
index 0000000..f04ba6b
--- /dev/null
+++ b/Documentation/sound/oss/Introduction
@@ -0,0 +1,459 @@
+Introduction Notes on Modular Sound Drivers and Soundcore
+Wade Hampton
+2/14/2001
+
+Purpose:
+========
+This document provides some general notes on the modular
+sound drivers and their configuration, along with the
+support modules sound.o and soundcore.o.
+
+Note, some of this probably should be added to the Sound-HOWTO!
+
+Note, soundlow.o was present with 2.2 kernels but is not
+required for 2.4.x kernels. References have been removed
+to this.
+
+
+Copying:
+========
+none
+
+
+History:
+========
+0.1.0 11/20/1998 First version, draft
+1.0.0 11/1998 Alan Cox changes, incorporation in 2.2.0
+ as Documentation/sound/oss/Introduction
+1.1.0 6/30/1999 Second version, added notes on making the drivers,
+ added info on multiple sound cards of similar types,]
+ added more diagnostics info, added info about esd.
+ added info on OSS and ALSA.
+1.1.1 19991031 Added notes on sound-slot- and sound-service.
+ (Alan Cox)
+1.1.2 20000920 Modified for Kernel 2.4 (Christoph Hellwig)
+1.1.3 20010214 Minor notes and corrections (Wade Hampton)
+ Added examples of sound-slot-0, etc.
+
+
+Modular Sound Drivers:
+======================
+
+Thanks to the GREAT work by Alan Cox (alan@lxorguk.ukuu.org.uk),
+
+[And Oleg Drokin, Thomas Sailer, Andrew Veliath and more than a few
+ others - not to mention Hannu's original code being designed well
+ enough to cope with that kind of chopping up](Alan)
+
+the standard Linux kernels support a modular sound driver. From
+Alan's comments in linux/drivers/sound/README.FIRST:
+
+ The modular sound driver patches were funded by Red Hat Software
+ (www.redhat.com). The sound driver here is thus a modified version of
+ Hannu's code. Please bear that in mind when considering the appropriate
+ forums for bug reporting.
+
+The modular sound drivers may be loaded via insmod or modprobe.
+To support all the various sound modules, there are two general
+support modules that must be loaded first:
+
+ soundcore.o: Top level handler for the sound system, provides
+ a set of functions for registration of devices
+ by type.
+
+ sound.o: Common sound functions required by all modules.
+
+For the specific sound modules (e.g., sb.o for the Soundblaster),
+read the documentation on that module to determine what options
+are available, for example IRQ, address, DMA.
+
+Warning, the options for different cards sometime use different names
+for the same or a similar feature (dma1= versus dma16=). As a last
+resort, inspect the code (search for module_param).
+
+Notes:
+
+1. There is a new OpenSource sound driver called ALSA which is
+ currently under development: http://www.alsa-project.org/
+ The ALSA drivers support some newer hardware that may not
+ be supported by this sound driver and also provide some
+ additional features.
+
+2. The commercial OSS driver may be obtained from the site:
+ http://www/opensound.com. This may be used for cards that
+ are unsupported by the kernel driver, or may be used
+ by other operating systems.
+
+3. The enlightenment sound daemon may be used for playing
+ multiple sounds at the same time via a single card, eliminating
+ some of the requirements for multiple sound card systems. For
+ more information, see: http://www.tux.org/~ricdude/EsounD.html
+ The "esd" program may be used with the real-player and mpeg
+ players like mpg123 and x11amp. The newer real-player
+ and some games even include built-in support for ESD!
+
+
+Building the Modules:
+=====================
+
+This document does not provide full details on building the
+kernel, etc. The notes below apply only to making the kernel
+sound modules. If this conflicts with the kernel's README,
+the README takes precedence.
+
+1. To make the kernel sound modules, cd to your /usr/src/linux
+ directory (typically) and type make config, make menuconfig,
+ or make xconfig (to start the command line, dialog, or x-based
+ configuration tool).
+
+2. Select the Sound option and a dialog will be displayed.
+
+3. Select M (module) for "Sound card support".
+
+4. Select your sound driver(s) as a module. For ProAudio, Sound
+ Blaster, etc., select M (module) for OSS sound modules.
+ [thanks to Marvin Stodolsky <stodolsk@erols.com>]A
+
+5. Make the kernel (e.g., make bzImage), and install the kernel.
+
+6. Make the modules and install them (make modules; make modules_install).
+
+Note, for 2.5.x kernels, make sure you have the newer module-init-tools
+installed or modules will not be loaded properly. 2.5.x requires an
+updated module-init-tools.
+
+
+Plug and Play (PnP:
+===================
+
+If the sound card is an ISA PnP card, isapnp may be used
+to configure the card. See the file isapnp.txt in the
+directory one level up (e.g., /usr/src/linux/Documentation).
+
+Also the 2.4.x kernels provide PnP capabilities, see the
+file NEWS in this directory.
+
+PCI sound cards are highly recommended, as they are far
+easier to configure and from what I have read, they use
+less resources and are more CPU efficient.
+
+
+INSMOD:
+=======
+
+If loading via insmod, the common modules must be loaded in the
+order below BEFORE loading the other sound modules. The card-specific
+modules may then be loaded (most require parameters). For example,
+I use the following via a shell script to load my SoundBlaster:
+
+SB_BASE=0x240
+SB_IRQ=9
+SB_DMA=3
+SB_DMA2=5
+SB_MPU=0x300
+#
+echo Starting sound
+/sbin/insmod soundcore
+/sbin/insmod sound
+#
+echo Starting sound blaster....
+/sbin/insmod uart401
+/sbin/insmod sb io=$SB_BASE irq=$SB_IRQ dma=$SB_DMA dma16=$SB_DMA2 mpu_io=$SB_MP
+
+When using sound as a module, I typically put these commands
+in a file such as /root/soundon.sh.
+
+
+MODPROBE:
+=========
+
+If loading via modprobe, these common files are automatically loaded
+when requested by modprobe. For example, my /etc/modprobe.conf contains:
+
+alias sound sb
+options sb io=0x240 irq=9 dma=3 dma16=5 mpu_io=0x300
+
+All you need to do to load the module is:
+
+ /sbin/modprobe sb
+
+
+Sound Status:
+=============
+
+The status of sound may be read/checked by:
+ cat (anyfile).au >/dev/audio
+
+[WWH: This may not work properly for SoundBlaster PCI 128 cards
+such as the es1370/1 (see the es1370/1 files in this directory)
+as they do not automatically support uLaw on /dev/audio.]
+
+The status of the modules and which modules depend on
+which other modules may be checked by:
+ /sbin/lsmod
+
+/sbin/lsmod should show something like the following:
+ sb 26280 0
+ uart401 5640 0 [sb]
+ sound 57112 0 [sb uart401]
+ soundcore 1968 8 [sb sound]
+
+
+Removing Sound:
+===============
+
+Sound may be removed by using /sbin/rmmod in the reverse order
+in which you load the modules. Note, if a program has a sound device
+open (e.g., xmixer), that module (and the modules on which it
+depends) may not be unloaded.
+
+For example, I use the following to remove my Soundblaster (rmmod
+in the reverse order in which I loaded the modules):
+
+/sbin/rmmod sb
+/sbin/rmmod uart401
+/sbin/rmmod sound
+/sbin/rmmod soundcore
+
+When using sound as a module, I typically put these commands
+in a script such as /root/soundoff.sh.
+
+
+Removing Sound for use with OSS:
+================================
+
+If you get really stuck or have a card that the kernel modules
+will not support, you can get a commercial sound driver from
+http://www.opensound.com. Before loading the commercial sound
+driver, you should do the following:
+
+1. remove sound modules (detailed above)
+2. remove the sound modules from /etc/modprobe.conf
+3. move the sound modules from /lib/modules/<kernel>/misc
+ (for example, I make a /lib/modules/<kernel>/misc/tmp
+ directory and copy the sound module files to that
+ directory).
+
+
+Multiple Sound Cards:
+=====================
+
+The sound drivers will support multiple sound cards and there
+are some great applications like multitrack that support them.
+Typically, you need two sound cards of different types. Note, this
+uses more precious interrupts and DMA channels and sometimes
+can be a configuration nightmare. I have heard reports of 3-4
+sound cards (typically I only use 2). You can sometimes use
+multiple PCI sound cards of the same type.
+
+On my machine I have two sound cards (cs4232 and Soundblaster Vibra
+16). By loading sound as modules, I can control which is the first
+sound device (/dev/dsp, /dev/audio, /dev/mixer) and which is
+the second. Normally, the cs4232 (Dell sound on the motherboard)
+would be the first sound device, but I prefer the Soundblaster.
+All you have to do is to load the one you want as /dev/dsp
+first (in my case "sb") and then load the other one
+(in my case "cs4232").
+
+If you have two cards of the same type that are jumpered
+cards or different PnP revisions, you may load the same
+module twice. For example, I have a SoundBlaster vibra 16
+and an older SoundBlaster 16 (jumpers). To load the module
+twice, you need to do the following:
+
+1. Copy the sound modules to a new name. For example
+ sb.o could be copied (or symlinked) to sb1.o for the
+ second SoundBlaster.
+
+2. Make a second entry in /etc/modprobe.conf, for example,
+ sound1 or sb1. This second entry should refer to the
+ new module names for example sb1, and should include
+ the I/O, etc. for the second sound card.
+
+3. Update your soundon.sh script, etc.
+
+Warning: I have never been able to get two PnP sound cards of the
+same type to load at the same time. I have tried this several times
+with the Soundblaster Vibra 16 cards. OSS has indicated that this
+is a PnP problem.... If anyone has any luck doing this, please
+send me an E-MAIL. PCI sound cards should not have this problem.a
+Since this was originally release, I have received a couple of
+mails from people who have accomplished this!
+
+NOTE: In Linux 2.4 the Sound Blaster driver (and only this one yet)
+supports multiple cards with one module by default.
+Read the file 'Soundblaster' in this directory for details.
+
+
+Sound Problems:
+===============
+
+First RTFM (including the troubleshooting section
+in the Sound-HOWTO).
+
+1) If you are having problems loading the modules (for
+ example, if you get device conflict errors) try the
+ following:
+
+ A) If you have Win95 or NT on the same computer,
+ write down what addresses, IRQ, and DMA channels
+ those were using for the same hardware. You probably
+ can use these addresses, IRQs, and DMA channels.
+ You should really do this BEFORE attempting to get
+ sound working!
+
+ B) Check (cat) /proc/interrupts, /proc/ioports,
+ and /proc/dma. Are you trying to use an address,
+ IRQ or DMA port that another device is using?
+
+ C) Check (cat) /proc/isapnp
+
+ D) Inspect your /var/log/messages file. Often that will
+ indicate what IRQ or IO port could not be obtained.
+
+ E) Try another port or IRQ. Note this may involve
+ using the PnP tools to move the sound card to
+ another location. Sometimes this is the only way
+ and it is more or less trial and error.
+
+2) If you get motor-boating (the same sound or part of a
+ sound clip repeated), you probably have either an IRQ
+ or DMA conflict. Move the card to another IRQ or DMA
+ port. This has happened to me when playing long files
+ when I had an IRQ conflict.
+
+3. If you get dropouts or pauses when playing high sample
+ rate files such as using mpg123 or x11amp/xmms, you may
+ have too slow of a CPU and may have to use the options to
+ play the files at 1/2 speed. For example, you may use
+ the -2 or -4 option on mpg123. You may also get this
+ when trying to play mpeg files stored on a CD-ROM
+ (my Toshiba T8000 PII/366 sometimes has this problem).
+
+4. If you get "cannot access device" errors, your /dev/dsp
+ files, etc. may be set to owner root, mode 600. You
+ may have to use the command:
+ chmod 666 /dev/dsp /dev/mixer /dev/audio
+
+5. If you get "device busy" errors, another program has the
+ sound device open. For example, if using the Enlightenment
+ sound daemon "esd", the "esd" program has the sound device.
+ If using "esd", please RTFM the docs on ESD. For example,
+ esddsp <program> may be used to play files via a non-esd
+ aware program.
+
+6) Ask for help on the sound list or send E-MAIL to the
+ sound driver author/maintainer.
+
+7) Turn on debug in drivers/sound/sound_config.h (DEB, DDB, MDB).
+
+8) If the system reports insufficient DMA memory then you may want to
+ load sound with the "dmabufs=1" option. Or in /etc/conf.modules add
+
+ preinstall sound dmabufs=1
+
+ This makes the sound system allocate its buffers and hang onto them.
+
+ You may also set persistent DMA when building a 2.4.x kernel.
+
+
+Configuring Sound:
+==================
+
+There are several ways of configuring your sound:
+
+1) On the kernel command line (when using the sound driver(s)
+ compiled in the kernel). Check the driver source and
+ documentation for details.
+
+2) On the command line when using insmod or in a bash script
+ using command line calls to load sound.
+
+3) In /etc/modprobe.conf when using modprobe.
+
+4) Via Red Hat's GPL'd /usr/sbin/sndconfig program (text based).
+
+5) Via the OSS soundconf program (with the commercial version
+ of the OSS driver.
+
+6) By just loading the module and let isapnp do everything relevant
+ for you. This works only with a few drivers yet and - of course -
+ only with isapnp hardware.
+
+And I am sure, several other ways.
+
+Anyone want to write a linuxconf module for configuring sound?
+
+
+Module Loading:
+===============
+
+When a sound card is first referenced and sound is modular, the sound system
+will ask for the sound devices to be loaded. Initially it requests that
+the driver for the sound system is loaded. It then will ask for
+sound-slot-0, where 0 is the first sound card. (sound-slot-1 the second and
+so on). Thus you can do
+
+alias sound-slot-0 sb
+
+To load a soundblaster at this point. If the slot loading does not provide
+the desired device - for example a soundblaster does not directly provide
+a midi synth in all cases then it will request "sound-service-0-n" where n
+is
+
+ 0 Mixer
+
+ 2 MIDI
+
+ 3, 4 DSP audio
+
+
+For example, I use the following to load my Soundblaster PCI 128
+(ES 1371) card first, followed by my SoundBlaster Vibra 16 card,
+then by my TV card:
+
+# Load the Soundblaster PCI 128 as /dev/dsp, /dev/dsp1, /dev/mixer
+alias sound-slot-0 es1371
+
+# Load the Soundblaster Vibra 16 as /dev/dsp2, /dev/mixer1
+alias sound-slot-1 sb
+options sb io=0x240 irq=5 dma=1 dma16=5 mpu_io=0x330
+
+# Load the BTTV (TV card) as /dev/mixer2
+alias sound-slot-2 bttv
+alias sound-service-2-0 tvmixer
+
+pre-install bttv modprobe tuner ; modprobe tvmixer
+pre-install tvmixer modprobe msp3400; modprobe tvaudio
+options tuner debug=0 type=8
+options bttv card=0 radio=0 pll=0
+
+
+For More Information (RTFM):
+============================
+1) Information on kernel modules: manual pages for insmod and modprobe.
+
+2) Information on PnP, RTFM manual pages for isapnp.
+
+3) Sound-HOWTO and Sound-Playing-HOWTO.
+
+4) OSS's WWW site at http://www.opensound.com.
+
+5) All the files in Documentation/sound.
+
+6) The comments and code in linux/drivers/sound.
+
+7) The sndconfig and rhsound documentation from Red Hat.
+
+8) The Linux-sound mailing list: sound-list@redhat.com.
+
+9) Enlightenment documentation (for info on esd)
+ http://www.tux.org/~ricdude/EsounD.html.
+
+10) ALSA home page: http://www.alsa-project.org/
+
+
+Contact Information:
+====================
+Wade Hampton: (whampton@staffnet.com)
+
diff --git a/Documentation/sound/oss/MultiSound b/Documentation/sound/oss/MultiSound
new file mode 100644
index 0000000..e4a18bb
--- /dev/null
+++ b/Documentation/sound/oss/MultiSound
@@ -0,0 +1,1137 @@
+#! /bin/sh
+#
+# Turtle Beach MultiSound Driver Notes
+# -- Andrew Veliath <andrewtv@usa.net>
+#
+# Last update: September 10, 1998
+# Corresponding msnd driver: 0.8.3
+#
+# ** This file is a README (top part) and shell archive (bottom part).
+# The corresponding archived utility sources can be unpacked by
+# running `sh MultiSound' (the utilities are only needed for the
+# Pinnacle and Fiji cards). **
+#
+#
+# -=-=- Getting Firmware -=-=-
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# See the section `Obtaining and Creating Firmware Files' in this
+# document for instructions on obtaining the necessary firmware
+# files.
+#
+#
+# Supported Features
+# ~~~~~~~~~~~~~~~~~~
+#
+# Currently, full-duplex digital audio (/dev/dsp only, /dev/audio is
+# not currently available) and mixer functionality (/dev/mixer) are
+# supported (memory mapped digital audio is not yet supported).
+# Digital transfers and monitoring can be done as well if you have
+# the digital daughterboard (see the section on using the S/PDIF port
+# for more information).
+#
+# Support for the Turtle Beach MultiSound Hurricane architecture is
+# composed of the following modules (these can also operate compiled
+# into the kernel):
+#
+# msnd - MultiSound base (requires soundcore)
+#
+# msnd_classic - Base audio/mixer support for Classic, Monetery and
+# Tahiti cards
+#
+# msnd_pinnacle - Base audio/mixer support for Pinnacle and Fiji cards
+#
+#
+# Important Notes - Read Before Using
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The firmware files are not included (may change in future). You
+# must obtain these images from Turtle Beach (they are included in
+# the MultiSound Development Kits), and place them in /etc/sound for
+# example, and give the full paths in the Linux configuration. If
+# you are compiling in support for the MultiSound driver rather than
+# using it as a module, these firmware files must be accessible
+# during kernel compilation.
+#
+# Please note these files must be binary files, not assembler. See
+# the section later in this document for instructions to obtain these
+# files.
+#
+#
+# Configuring Card Resources
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# ** This section is very important, as your card may not work at all
+# or your machine may crash if you do not do this correctly. **
+#
+# * Classic/Monterey/Tahiti
+#
+# These cards are configured through the driver msnd_classic. You must
+# know the io port, then the driver will select the irq and memory resources
+# on the card. It is up to you to know if these are free locations or now,
+# a conflict can lock the machine up.
+#
+# * Pinnacle/Fiji
+#
+# The Pinnacle and Fiji cards have an extra config port, either
+# 0x250, 0x260 or 0x270. This port can be disabled to have the card
+# configured strictly through PnP, however you lose the ability to
+# access the IDE controller and joystick devices on this card when
+# using PnP. The included pinnaclecfg program in this shell archive
+# can be used to configure the card in non-PnP mode, and in PnP mode
+# you can use isapnptools. These are described briefly here.
+#
+# pinnaclecfg is not required; you can use the msnd_pinnacle module
+# to fully configure the card as well. However, pinnaclecfg can be
+# used to change the resource values of a particular device after the
+# msnd_pinnacle module has been loaded. If you are compiling the
+# driver into the kernel, you must set these values during compile
+# time, however other peripheral resource values can be changed with
+# the pinnaclecfg program after the kernel is loaded.
+#
+#
+# *** PnP mode
+#
+# Use pnpdump to obtain a sample configuration if you can; I was able
+# to obtain one with the command `pnpdump 1 0x203' -- this may vary
+# for you (running pnpdump by itself did not work for me). Then,
+# edit this file and use isapnp to uncomment and set the card values.
+# Use these values when inserting the msnd_pinnacle module. Using
+# this method, you can set the resources for the DSP and the Kurzweil
+# synth (Pinnacle). Since Linux does not directly support PnP
+# devices, you may have difficulty when using the card in PnP mode
+# when it the driver is compiled into the kernel. Using non-PnP mode
+# is preferable in this case.
+#
+# Here is an example mypinnacle.conf for isapnp that sets the card to
+# io base 0x210, irq 5 and mem 0xd8000, and also sets the Kurzweil
+# synth to 0x330 and irq 9 (may need editing for your system):
+#
+# (READPORT 0x0203)
+# (CSN 2)
+# (IDENTIFY *)
+#
+# # DSP
+# (CONFIGURE BVJ0440/-1 (LD 0
+# (INT 0 (IRQ 5 (MODE +E))) (IO 0 (BASE 0x0210)) (MEM 0 (BASE 0x0d8000))
+# (ACT Y)))
+#
+# # Kurzweil Synth (Pinnacle Only)
+# (CONFIGURE BVJ0440/-1 (LD 1
+# (IO 0 (BASE 0x0330)) (INT 0 (IRQ 9 (MODE +E)))
+# (ACT Y)))
+#
+# (WAITFORKEY)
+#
+#
+# *** Non-PnP mode
+#
+# The second way is by running the card in non-PnP mode. This
+# actually has some advantages in that you can access some other
+# devices on the card, such as the joystick and IDE controller. To
+# configure the card, unpack this shell archive and build the
+# pinnaclecfg program. Using this program, you can assign the
+# resource values to the card's devices, or disable the devices. As
+# an alternative to using pinnaclecfg, you can specify many of the
+# configuration values when loading the msnd_pinnacle module (or
+# during kernel configuration when compiling the driver into the
+# kernel).
+#
+# If you specify cfg=0x250 for the msnd_pinnacle module, it
+# automatically configure the card to the given io, irq and memory
+# values using that config port (the config port is jumper selectable
+# on the card to 0x250, 0x260 or 0x270).
+#
+# See the `msnd_pinnacle Additional Options' section below for more
+# information on these parameters (also, if you compile the driver
+# directly into the kernel, these extra parameters can be useful
+# here).
+#
+#
+# ** It is very easy to cause problems in your machine if you choose a
+# resource value which is incorrect. **
+#
+#
+# Examples
+# ~~~~~~~~
+#
+# * MultiSound Classic/Monterey/Tahiti:
+#
+# modprobe soundcore
+# insmod msnd
+# insmod msnd_classic io=0x290 irq=7 mem=0xd0000
+#
+# * MultiSound Pinnacle in PnP mode:
+#
+# modprobe soundcore
+# insmod msnd
+# isapnp mypinnacle.conf
+# insmod msnd_pinnacle io=0x210 irq=5 mem=0xd8000 <-- match mypinnacle.conf values
+#
+# * MultiSound Pinnacle in non-PnP mode (replace 0x250 with your configuration port,
+# one of 0x250, 0x260 or 0x270):
+#
+# insmod soundcore
+# insmod msnd
+# insmod msnd_pinnacle cfg=0x250 io=0x290 irq=5 mem=0xd0000
+#
+# * To use the MPU-compatible Kurzweil synth on the Pinnacle in PnP
+# mode, add the following (assumes you did `isapnp mypinnacle.conf'):
+#
+# insmod sound
+# insmod mpu401 io=0x330 irq=9 <-- match mypinnacle.conf values
+#
+# * To use the MPU-compatible Kurzweil synth on the Pinnacle in non-PnP
+# mode, add the following. Note how we first configure the peripheral's
+# resources, _then_ install a Linux driver for it:
+#
+# insmod sound
+# pinnaclecfg 0x250 mpu 0x330 9
+# insmod mpu401 io=0x330 irq=9
+#
+# -- OR you can use the following sequence without pinnaclecfg in non-PnP mode:
+#
+# insmod soundcore
+# insmod msnd
+# insmod msnd_pinnacle cfg=0x250 io=0x290 irq=5 mem=0xd0000 mpu_io=0x330 mpu_irq=9
+# insmod sound
+# insmod mpu401 io=0x330 irq=9
+#
+# * To setup the joystick port on the Pinnacle in non-PnP mode (though
+# you have to find the actual Linux joystick driver elsewhere), you
+# can use pinnaclecfg:
+#
+# pinnaclecfg 0x250 joystick 0x200
+#
+# -- OR you can configure this using msnd_pinnacle with the following:
+#
+# insmod soundcore
+# insmod msnd
+# insmod msnd_pinnacle cfg=0x250 io=0x290 irq=5 mem=0xd0000 joystick_io=0x200
+#
+#
+# msnd_classic, msnd_pinnacle Required Options
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# If the following options are not given, the module will not load.
+# Examine the kernel message log for informative error messages.
+# WARNING--probing isn't supported so try to make sure you have the
+# correct shared memory area, otherwise you may experience problems.
+#
+# io I/O base of DSP, e.g. io=0x210
+# irq IRQ number, e.g. irq=5
+# mem Shared memory area, e.g. mem=0xd8000
+#
+#
+# msnd_classic, msnd_pinnacle Additional Options
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# fifosize The digital audio FIFOs, in kilobytes. If not
+# specified, the default will be used. Increasing
+# this value will reduce the chance of a FIFO
+# underflow at the expense of increasing overall
+# latency. For example, fifosize=512 will
+# allocate 512kB read and write FIFOs (1MB total).
+# While this may reduce dropouts, a heavy machine
+# load will undoubtedly starve the FIFO of data
+# and you will eventually get dropouts. One
+# option is to alter the scheduling priority of
+# the playback process, using `nice' or some form
+# of POSIX soft real-time scheduling.
+#
+# calibrate_signal Setting this to one calibrates the ADCs to the
+# signal, zero calibrates to the card (defaults
+# to zero).
+#
+#
+# msnd_pinnacle Additional Options
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# digital Specify digital=1 to enable the S/PDIF input
+# if you have the digital daughterboard
+# adapter. This will enable access to the
+# DIGITAL1 input for the soundcard in the mixer.
+# Some mixer programs might have trouble setting
+# the DIGITAL1 source as an input. If you have
+# trouble, you can try the setdigital.c program
+# at the bottom of this document.
+#
+# cfg Non-PnP configuration port for the Pinnacle
+# and Fiji (typically 0x250, 0x260 or 0x270,
+# depending on the jumper configuration). If
+# this option is omitted, then it is assumed
+# that the card is in PnP mode, and that the
+# specified DSP resource values are already
+# configured with PnP (i.e. it won't attempt to
+# do any sort of configuration).
+#
+# When the Pinnacle is in non-PnP mode, you can use the following
+# options to configure particular devices. If a full specification
+# for a device is not given, then the device is not configured. Note
+# that you still must use a Linux driver for any of these devices
+# once their resources are setup (such as the Linux joystick driver,
+# or the MPU401 driver from OSS for the Kurzweil synth).
+#
+# mpu_io I/O port of MPU (on-board Kurzweil synth)
+# mpu_irq IRQ of MPU (on-board Kurzweil synth)
+# ide_io0 First I/O port of IDE controller
+# ide_io1 Second I/O port of IDE controller
+# ide_irq IRQ IDE controller
+# joystick_io I/O port of joystick
+#
+#
+# Obtaining and Creating Firmware Files
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# For the Classic/Tahiti/Monterey
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Download to /tmp and unzip the following file from Turtle Beach:
+#
+# ftp://ftp.voyetra.com/pub/tbs/msndcl/msndvkit.zip
+#
+# When unzipped, unzip the file named MsndFiles.zip. Then copy the
+# following firmware files to /etc/sound (note the file renaming):
+#
+# cp DSPCODE/MSNDINIT.BIN /etc/sound/msndinit.bin
+# cp DSPCODE/MSNDPERM.REB /etc/sound/msndperm.bin
+#
+# When configuring the Linux kernel, specify /etc/sound/msndinit.bin and
+# /etc/sound/msndperm.bin for the two firmware files (Linux kernel
+# versions older than 2.2 do not ask for firmware paths, and are
+# hardcoded to /etc/sound).
+#
+# If you are compiling the driver into the kernel, these files must
+# be accessible during compilation, but will not be needed later.
+# The files must remain, however, if the driver is used as a module.
+#
+#
+# For the Pinnacle/Fiji
+# ~~~~~~~~~~~~~~~~~~~~~
+#
+# Download to /tmp and unzip the following file from Turtle Beach (be
+# sure to use the entire URL; some have had trouble navigating to the
+# URL):
+#
+# ftp://ftp.voyetra.com/pub/tbs/pinn/pnddk100.zip
+#
+# Unpack this shell archive, and run make in the created directory
+# (you need a C compiler and flex to build the utilities). This
+# should give you the executables conv, pinnaclecfg and setdigital.
+# conv is only used temporarily here to create the firmware files,
+# while pinnaclecfg is used to configure the Pinnacle or Fiji card in
+# non-PnP mode, and setdigital can be used to set the S/PDIF input on
+# the mixer (pinnaclecfg and setdigital should be copied to a
+# convenient place, possibly run during system initialization).
+#
+# To generating the firmware files with the `conv' program, we create
+# the binary firmware files by doing the following conversion
+# (assuming the archive unpacked into a directory named PINNDDK):
+#
+# ./conv < PINNDDK/dspcode/pndspini.asm > /etc/sound/pndspini.bin
+# ./conv < PINNDDK/dspcode/pndsperm.asm > /etc/sound/pndsperm.bin
+#
+# The conv (and conv.l) program is not needed after conversion and can
+# be safely deleted. Then, when configuring the Linux kernel, specify
+# /etc/sound/pndspini.bin and /etc/sound/pndsperm.bin for the two
+# firmware files (Linux kernel versions older than 2.2 do not ask for
+# firmware paths, and are hardcoded to /etc/sound).
+#
+# If you are compiling the driver into the kernel, these files must
+# be accessible during compilation, but will not be needed later.
+# The files must remain, however, if the driver is used as a module.
+#
+#
+# Using Digital I/O with the S/PDIF Port
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# If you have a Pinnacle or Fiji with the digital daughterboard and
+# want to set it as the input source, you can use this program if you
+# have trouble trying to do it with a mixer program (be sure to
+# insert the module with the digital=1 option, or say Y to the option
+# during compiled-in kernel operation). Upon selection of the S/PDIF
+# port, you should be able monitor and record from it.
+#
+# There is something to note about using the S/PDIF port. Digital
+# timing is taken from the digital signal, so if a signal is not
+# connected to the port and it is selected as recording input, you
+# will find PCM playback to be distorted in playback rate. Also,
+# attempting to record at a sampling rate other than the DAT rate may
+# be problematic (i.e. trying to record at 8000Hz when the DAT signal
+# is 44100Hz). If you have a problem with this, set the recording
+# input to analog if you need to record at a rate other than that of
+# the DAT rate.
+#
+#
+# -- Shell archive attached below, just run `sh MultiSound' to extract.
+# Contains Pinnacle/Fiji utilities to convert firmware, configure
+# in non-PnP mode, and select the DIGITAL1 input for the mixer.
+#
+#
+#!/bin/sh
+# This is a shell archive (produced by GNU sharutils 4.2).
+# To extract the files from this archive, save it to some FILE, remove
+# everything before the `!/bin/sh' line above, then type `sh FILE'.
+#
+# Made on 1998-12-04 10:07 EST by <andrewtv@ztransform.velsoft.com>.
+# Source directory was `/home/andrewtv/programming/pinnacle/pinnacle'.
+#
+# Existing files will *not* be overwritten unless `-c' is specified.
+#
+# This shar contains:
+# length mode name
+# ------ ---------- ------------------------------------------
+# 2046 -rw-rw-r-- MultiSound.d/setdigital.c
+# 10235 -rw-rw-r-- MultiSound.d/pinnaclecfg.c
+# 106 -rw-rw-r-- MultiSound.d/Makefile
+# 141 -rw-rw-r-- MultiSound.d/conv.l
+# 1472 -rw-rw-r-- MultiSound.d/msndreset.c
+#
+save_IFS="${IFS}"
+IFS="${IFS}:"
+gettext_dir=FAILED
+locale_dir=FAILED
+first_param="$1"
+for dir in $PATH
+do
+ if test "$gettext_dir" = FAILED && test -f $dir/gettext \
+ && ($dir/gettext --version >/dev/null 2>&1)
+ then
+ set `$dir/gettext --version 2>&1`
+ if test "$3" = GNU
+ then
+ gettext_dir=$dir
+ fi
+ fi
+ if test "$locale_dir" = FAILED && test -f $dir/shar \
+ && ($dir/shar --print-text-domain-dir >/dev/null 2>&1)
+ then
+ locale_dir=`$dir/shar --print-text-domain-dir`
+ fi
+done
+IFS="$save_IFS"
+if test "$locale_dir" = FAILED || test "$gettext_dir" = FAILED
+then
+ echo=echo
+else
+ TEXTDOMAINDIR=$locale_dir
+ export TEXTDOMAINDIR
+ TEXTDOMAIN=sharutils
+ export TEXTDOMAIN
+ echo="$gettext_dir/gettext -s"
+fi
+touch -am 1231235999 $$.touch >/dev/null 2>&1
+if test ! -f 1231235999 && test -f $$.touch; then
+ shar_touch=touch
+else
+ shar_touch=:
+ echo
+ $echo 'WARNING: not restoring timestamps. Consider getting and'
+ $echo "installing GNU \`touch', distributed in GNU File Utilities..."
+ echo
+fi
+rm -f 1231235999 $$.touch
+#
+if mkdir _sh01426; then
+ $echo 'x -' 'creating lock directory'
+else
+ $echo 'failed to create lock directory'
+ exit 1
+fi
+# ============= MultiSound.d/setdigital.c ==============
+if test ! -d 'MultiSound.d'; then
+ $echo 'x -' 'creating directory' 'MultiSound.d'
+ mkdir 'MultiSound.d'
+fi
+if test -f 'MultiSound.d/setdigital.c' && test "$first_param" != -c; then
+ $echo 'x -' SKIPPING 'MultiSound.d/setdigital.c' '(file already exists)'
+else
+ $echo 'x -' extracting 'MultiSound.d/setdigital.c' '(text)'
+ sed 's/^X//' << 'SHAR_EOF' > 'MultiSound.d/setdigital.c' &&
+/*********************************************************************
+X *
+X * setdigital.c - sets the DIGITAL1 input for a mixer
+X *
+X * Copyright (C) 1998 Andrew Veliath
+X *
+X * This program is free software; you can redistribute it and/or modify
+X * it under the terms of the GNU General Public License as published by
+X * the Free Software Foundation; either version 2 of the License, or
+X * (at your option) any later version.
+X *
+X * This program is distributed in the hope that it will be useful,
+X * but WITHOUT ANY WARRANTY; without even the implied warranty of
+X * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+X * GNU General Public License for more details.
+X *
+X * You should have received a copy of the GNU General Public License
+X * along with this program; if not, write to the Free Software
+X * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+X *
+X ********************************************************************/
+X
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/soundcard.h>
+X
+int main(int argc, char *argv[])
+{
+X int fd;
+X unsigned long recmask, recsrc;
+X
+X if (argc != 2) {
+X fprintf(stderr, "usage: setdigital <mixer device>\n");
+X exit(1);
+X }
+X
+X if ((fd = open(argv[1], O_RDWR)) < 0) {
+X perror(argv[1]);
+X exit(1);
+X }
+X
+X if (ioctl(fd, SOUND_MIXER_READ_RECMASK, &recmask) < 0) {
+X fprintf(stderr, "error: ioctl read recording mask failed\n");
+X perror("ioctl");
+X close(fd);
+X exit(1);
+X }
+X
+X if (!(recmask & SOUND_MASK_DIGITAL1)) {
+X fprintf(stderr, "error: cannot find DIGITAL1 device in mixer\n");
+X close(fd);
+X exit(1);
+X }
+X
+X if (ioctl(fd, SOUND_MIXER_READ_RECSRC, &recsrc) < 0) {
+X fprintf(stderr, "error: ioctl read recording source failed\n");
+X perror("ioctl");
+X close(fd);
+X exit(1);
+X }
+X
+X recsrc |= SOUND_MASK_DIGITAL1;
+X
+X if (ioctl(fd, SOUND_MIXER_WRITE_RECSRC, &recsrc) < 0) {
+X fprintf(stderr, "error: ioctl write recording source failed\n");
+X perror("ioctl");
+X close(fd);
+X exit(1);
+X }
+X
+X close(fd);
+X
+X return 0;
+}
+SHAR_EOF
+ $shar_touch -am 1204092598 'MultiSound.d/setdigital.c' &&
+ chmod 0664 'MultiSound.d/setdigital.c' ||
+ $echo 'restore of' 'MultiSound.d/setdigital.c' 'failed'
+ if ( md5sum --help 2>&1 | grep 'sage: md5sum \[' ) >/dev/null 2>&1 \
+ && ( md5sum --version 2>&1 | grep -v 'textutils 1.12' ) >/dev/null; then
+ md5sum -c << SHAR_EOF >/dev/null 2>&1 \
+ || $echo 'MultiSound.d/setdigital.c:' 'MD5 check failed'
+e87217fc3e71288102ba41fd81f71ec4 MultiSound.d/setdigital.c
+SHAR_EOF
+ else
+ shar_count="`LC_ALL= LC_CTYPE= LANG= wc -c < 'MultiSound.d/setdigital.c'`"
+ test 2046 -eq "$shar_count" ||
+ $echo 'MultiSound.d/setdigital.c:' 'original size' '2046,' 'current size' "$shar_count!"
+ fi
+fi
+# ============= MultiSound.d/pinnaclecfg.c ==============
+if test -f 'MultiSound.d/pinnaclecfg.c' && test "$first_param" != -c; then
+ $echo 'x -' SKIPPING 'MultiSound.d/pinnaclecfg.c' '(file already exists)'
+else
+ $echo 'x -' extracting 'MultiSound.d/pinnaclecfg.c' '(text)'
+ sed 's/^X//' << 'SHAR_EOF' > 'MultiSound.d/pinnaclecfg.c' &&
+/*********************************************************************
+X *
+X * pinnaclecfg.c - Pinnacle/Fiji Device Configuration Program
+X *
+X * This is for NON-PnP mode only. For PnP mode, use isapnptools.
+X *
+X * This is Linux-specific, and must be run with root permissions.
+X *
+X * Part of the Turtle Beach MultiSound Sound Card Driver for Linux
+X *
+X * Copyright (C) 1998 Andrew Veliath
+X *
+X * This program is free software; you can redistribute it and/or modify
+X * it under the terms of the GNU General Public License as published by
+X * the Free Software Foundation; either version 2 of the License, or
+X * (at your option) any later version.
+X *
+X * This program is distributed in the hope that it will be useful,
+X * but WITHOUT ANY WARRANTY; without even the implied warranty of
+X * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+X * GNU General Public License for more details.
+X *
+X * You should have received a copy of the GNU General Public License
+X * along with this program; if not, write to the Free Software
+X * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+X *
+X ********************************************************************/
+X
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <asm/io.h>
+#include <asm/types.h>
+X
+#define IREG_LOGDEVICE 0x07
+#define IREG_ACTIVATE 0x30
+#define LD_ACTIVATE 0x01
+#define LD_DISACTIVATE 0x00
+#define IREG_EECONTROL 0x3F
+#define IREG_MEMBASEHI 0x40
+#define IREG_MEMBASELO 0x41
+#define IREG_MEMCONTROL 0x42
+#define IREG_MEMRANGEHI 0x43
+#define IREG_MEMRANGELO 0x44
+#define MEMTYPE_8BIT 0x00
+#define MEMTYPE_16BIT 0x02
+#define MEMTYPE_RANGE 0x00
+#define MEMTYPE_HIADDR 0x01
+#define IREG_IO0_BASEHI 0x60
+#define IREG_IO0_BASELO 0x61
+#define IREG_IO1_BASEHI 0x62
+#define IREG_IO1_BASELO 0x63
+#define IREG_IRQ_NUMBER 0x70
+#define IREG_IRQ_TYPE 0x71
+#define IRQTYPE_HIGH 0x02
+#define IRQTYPE_LOW 0x00
+#define IRQTYPE_LEVEL 0x01
+#define IRQTYPE_EDGE 0x00
+X
+#define HIBYTE(w) ((BYTE)(((WORD)(w) >> 8) & 0xFF))
+#define LOBYTE(w) ((BYTE)(w))
+#define MAKEWORD(low,hi) ((WORD)(((BYTE)(low))|(((WORD)((BYTE)(hi)))<<8)))
+X
+typedef __u8 BYTE;
+typedef __u16 USHORT;
+typedef __u16 WORD;
+X
+static int config_port = -1;
+X
+static int msnd_write_cfg(int cfg, int reg, int value)
+{
+X outb(reg, cfg);
+X outb(value, cfg + 1);
+X if (value != inb(cfg + 1)) {
+X fprintf(stderr, "error: msnd_write_cfg: I/O error\n");
+X return -EIO;
+X }
+X return 0;
+}
+X
+static int msnd_read_cfg(int cfg, int reg)
+{
+X outb(reg, cfg);
+X return inb(cfg + 1);
+}
+X
+static int msnd_write_cfg_io0(int cfg, int num, WORD io)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_IO0_BASEHI, HIBYTE(io)))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_IO0_BASELO, LOBYTE(io)))
+X return -EIO;
+X return 0;
+}
+X
+static int msnd_read_cfg_io0(int cfg, int num, WORD *io)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X
+X *io = MAKEWORD(msnd_read_cfg(cfg, IREG_IO0_BASELO),
+X msnd_read_cfg(cfg, IREG_IO0_BASEHI));
+X
+X return 0;
+}
+X
+static int msnd_write_cfg_io1(int cfg, int num, WORD io)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_IO1_BASEHI, HIBYTE(io)))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_IO1_BASELO, LOBYTE(io)))
+X return -EIO;
+X return 0;
+}
+X
+static int msnd_read_cfg_io1(int cfg, int num, WORD *io)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X
+X *io = MAKEWORD(msnd_read_cfg(cfg, IREG_IO1_BASELO),
+X msnd_read_cfg(cfg, IREG_IO1_BASEHI));
+X
+X return 0;
+}
+X
+static int msnd_write_cfg_irq(int cfg, int num, WORD irq)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_IRQ_NUMBER, LOBYTE(irq)))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_IRQ_TYPE, IRQTYPE_EDGE))
+X return -EIO;
+X return 0;
+}
+X
+static int msnd_read_cfg_irq(int cfg, int num, WORD *irq)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X
+X *irq = msnd_read_cfg(cfg, IREG_IRQ_NUMBER);
+X
+X return 0;
+}
+X
+static int msnd_write_cfg_mem(int cfg, int num, int mem)
+{
+X WORD wmem;
+X
+X mem >>= 8;
+X mem &= 0xfff;
+X wmem = (WORD)mem;
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_MEMBASEHI, HIBYTE(wmem)))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_MEMBASELO, LOBYTE(wmem)))
+X return -EIO;
+X if (wmem && msnd_write_cfg(cfg, IREG_MEMCONTROL, (MEMTYPE_HIADDR | MEMTYPE_16BIT)))
+X return -EIO;
+X return 0;
+}
+X
+static int msnd_read_cfg_mem(int cfg, int num, int *mem)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X
+X *mem = MAKEWORD(msnd_read_cfg(cfg, IREG_MEMBASELO),
+X msnd_read_cfg(cfg, IREG_MEMBASEHI));
+X *mem <<= 8;
+X
+X return 0;
+}
+X
+static int msnd_activate_logical(int cfg, int num)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X if (msnd_write_cfg(cfg, IREG_ACTIVATE, LD_ACTIVATE))
+X return -EIO;
+X return 0;
+}
+X
+static int msnd_write_cfg_logical(int cfg, int num, WORD io0, WORD io1, WORD irq, int mem)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X if (msnd_write_cfg_io0(cfg, num, io0))
+X return -EIO;
+X if (msnd_write_cfg_io1(cfg, num, io1))
+X return -EIO;
+X if (msnd_write_cfg_irq(cfg, num, irq))
+X return -EIO;
+X if (msnd_write_cfg_mem(cfg, num, mem))
+X return -EIO;
+X if (msnd_activate_logical(cfg, num))
+X return -EIO;
+X return 0;
+}
+X
+static int msnd_read_cfg_logical(int cfg, int num, WORD *io0, WORD *io1, WORD *irq, int *mem)
+{
+X if (msnd_write_cfg(cfg, IREG_LOGDEVICE, num))
+X return -EIO;
+X if (msnd_read_cfg_io0(cfg, num, io0))
+X return -EIO;
+X if (msnd_read_cfg_io1(cfg, num, io1))
+X return -EIO;
+X if (msnd_read_cfg_irq(cfg, num, irq))
+X return -EIO;
+X if (msnd_read_cfg_mem(cfg, num, mem))
+X return -EIO;
+X return 0;
+}
+X
+static void usage(void)
+{
+X fprintf(stderr,
+X "\n"
+X "pinnaclecfg 1.0\n"
+X "\n"
+X "usage: pinnaclecfg <config port> [device config]\n"
+X "\n"
+X "This is for use with the card in NON-PnP mode only.\n"
+X "\n"
+X "Available devices (not all available for Fiji):\n"
+X "\n"
+X " Device Description\n"
+X " -------------------------------------------------------------------\n"
+X " reset Reset all devices (i.e. disable)\n"
+X " show Display current device configurations\n"
+X "\n"
+X " dsp <io> <irq> <mem> Audio device\n"
+X " mpu <io> <irq> Internal Kurzweil synth\n"
+X " ide <io0> <io1> <irq> On-board IDE controller\n"
+X " joystick <io> Joystick port\n"
+X "\n");
+X exit(1);
+}
+X
+static int cfg_reset(void)
+{
+X int i;
+X
+X for (i = 0; i < 4; ++i)
+X msnd_write_cfg_logical(config_port, i, 0, 0, 0, 0);
+X
+X return 0;
+}
+X
+static int cfg_show(void)
+{
+X int i;
+X int count = 0;
+X
+X for (i = 0; i < 4; ++i) {
+X WORD io0, io1, irq;
+X int mem;
+X msnd_read_cfg_logical(config_port, i, &io0, &io1, &irq, &mem);
+X switch (i) {
+X case 0:
+X if (io0 || irq || mem) {
+X printf("dsp 0x%x %d 0x%x\n", io0, irq, mem);
+X ++count;
+X }
+X break;
+X case 1:
+X if (io0 || irq) {
+X printf("mpu 0x%x %d\n", io0, irq);
+X ++count;
+X }
+X break;
+X case 2:
+X if (io0 || io1 || irq) {
+X printf("ide 0x%x 0x%x %d\n", io0, io1, irq);
+X ++count;
+X }
+X break;
+X case 3:
+X if (io0) {
+X printf("joystick 0x%x\n", io0);
+X ++count;
+X }
+X break;
+X }
+X }
+X
+X if (count == 0)
+X fprintf(stderr, "no devices configured\n");
+X
+X return 0;
+}
+X
+static int cfg_dsp(int argc, char *argv[])
+{
+X int io, irq, mem;
+X
+X if (argc < 3 ||
+X sscanf(argv[0], "0x%x", &io) != 1 ||
+X sscanf(argv[1], "%d", &irq) != 1 ||
+X sscanf(argv[2], "0x%x", &mem) != 1)
+X usage();
+X
+X if (!(io == 0x290 ||
+X io == 0x260 ||
+X io == 0x250 ||
+X io == 0x240 ||
+X io == 0x230 ||
+X io == 0x220 ||
+X io == 0x210 ||
+X io == 0x3e0)) {
+X fprintf(stderr, "error: io must be one of "
+X "210, 220, 230, 240, 250, 260, 290, or 3E0\n");
+X usage();
+X }
+X
+X if (!(irq == 5 ||
+X irq == 7 ||
+X irq == 9 ||
+X irq == 10 ||
+X irq == 11 ||
+X irq == 12)) {
+X fprintf(stderr, "error: irq must be one of "
+X "5, 7, 9, 10, 11 or 12\n");
+X usage();
+X }
+X
+X if (!(mem == 0xb0000 ||
+X mem == 0xc8000 ||
+X mem == 0xd0000 ||
+X mem == 0xd8000 ||
+X mem == 0xe0000 ||
+X mem == 0xe8000)) {
+X fprintf(stderr, "error: mem must be one of "
+X "0xb0000, 0xc8000, 0xd0000, 0xd8000, 0xe0000 or 0xe8000\n");
+X usage();
+X }
+X
+X return msnd_write_cfg_logical(config_port, 0, io, 0, irq, mem);
+}
+X
+static int cfg_mpu(int argc, char *argv[])
+{
+X int io, irq;
+X
+X if (argc < 2 ||
+X sscanf(argv[0], "0x%x", &io) != 1 ||
+X sscanf(argv[1], "%d", &irq) != 1)
+X usage();
+X
+X return msnd_write_cfg_logical(config_port, 1, io, 0, irq, 0);
+}
+X
+static int cfg_ide(int argc, char *argv[])
+{
+X int io0, io1, irq;
+X
+X if (argc < 3 ||
+X sscanf(argv[0], "0x%x", &io0) != 1 ||
+X sscanf(argv[0], "0x%x", &io1) != 1 ||
+X sscanf(argv[1], "%d", &irq) != 1)
+X usage();
+X
+X return msnd_write_cfg_logical(config_port, 2, io0, io1, irq, 0);
+}
+X
+static int cfg_joystick(int argc, char *argv[])
+{
+X int io;
+X
+X if (argc < 1 ||
+X sscanf(argv[0], "0x%x", &io) != 1)
+X usage();
+X
+X return msnd_write_cfg_logical(config_port, 3, io, 0, 0, 0);
+}
+X
+int main(int argc, char *argv[])
+{
+X char *device;
+X int rv = 0;
+X
+X --argc; ++argv;
+X
+X if (argc < 2)
+X usage();
+X
+X sscanf(argv[0], "0x%x", &config_port);
+X if (config_port != 0x250 && config_port != 0x260 && config_port != 0x270) {
+X fprintf(stderr, "error: <config port> must be 0x250, 0x260 or 0x270\n");
+X exit(1);
+X }
+X if (ioperm(config_port, 2, 1)) {
+X perror("ioperm");
+X fprintf(stderr, "note: pinnaclecfg must be run as root\n");
+X exit(1);
+X }
+X device = argv[1];
+X
+X argc -= 2; argv += 2;
+X
+X if (strcmp(device, "reset") == 0)
+X rv = cfg_reset();
+X else if (strcmp(device, "show") == 0)
+X rv = cfg_show();
+X else if (strcmp(device, "dsp") == 0)
+X rv = cfg_dsp(argc, argv);
+X else if (strcmp(device, "mpu") == 0)
+X rv = cfg_mpu(argc, argv);
+X else if (strcmp(device, "ide") == 0)
+X rv = cfg_ide(argc, argv);
+X else if (strcmp(device, "joystick") == 0)
+X rv = cfg_joystick(argc, argv);
+X else {
+X fprintf(stderr, "error: unknown device %s\n", device);
+X usage();
+X }
+X
+X if (rv)
+X fprintf(stderr, "error: device configuration failed\n");
+X
+X return 0;
+}
+SHAR_EOF
+ $shar_touch -am 1204092598 'MultiSound.d/pinnaclecfg.c' &&
+ chmod 0664 'MultiSound.d/pinnaclecfg.c' ||
+ $echo 'restore of' 'MultiSound.d/pinnaclecfg.c' 'failed'
+ if ( md5sum --help 2>&1 | grep 'sage: md5sum \[' ) >/dev/null 2>&1 \
+ && ( md5sum --version 2>&1 | grep -v 'textutils 1.12' ) >/dev/null; then
+ md5sum -c << SHAR_EOF >/dev/null 2>&1 \
+ || $echo 'MultiSound.d/pinnaclecfg.c:' 'MD5 check failed'
+366bdf27f0db767a3c7921d0a6db20fe MultiSound.d/pinnaclecfg.c
+SHAR_EOF
+ else
+ shar_count="`LC_ALL= LC_CTYPE= LANG= wc -c < 'MultiSound.d/pinnaclecfg.c'`"
+ test 10235 -eq "$shar_count" ||
+ $echo 'MultiSound.d/pinnaclecfg.c:' 'original size' '10235,' 'current size' "$shar_count!"
+ fi
+fi
+# ============= MultiSound.d/Makefile ==============
+if test -f 'MultiSound.d/Makefile' && test "$first_param" != -c; then
+ $echo 'x -' SKIPPING 'MultiSound.d/Makefile' '(file already exists)'
+else
+ $echo 'x -' extracting 'MultiSound.d/Makefile' '(text)'
+ sed 's/^X//' << 'SHAR_EOF' > 'MultiSound.d/Makefile' &&
+CC = gcc
+CFLAGS = -O
+PROGS = setdigital msndreset pinnaclecfg conv
+X
+all: $(PROGS)
+X
+clean:
+X rm -f $(PROGS)
+SHAR_EOF
+ $shar_touch -am 1204092398 'MultiSound.d/Makefile' &&
+ chmod 0664 'MultiSound.d/Makefile' ||
+ $echo 'restore of' 'MultiSound.d/Makefile' 'failed'
+ if ( md5sum --help 2>&1 | grep 'sage: md5sum \[' ) >/dev/null 2>&1 \
+ && ( md5sum --version 2>&1 | grep -v 'textutils 1.12' ) >/dev/null; then
+ md5sum -c << SHAR_EOF >/dev/null 2>&1 \
+ || $echo 'MultiSound.d/Makefile:' 'MD5 check failed'
+76ca8bb44e3882edcf79c97df6c81845 MultiSound.d/Makefile
+SHAR_EOF
+ else
+ shar_count="`LC_ALL= LC_CTYPE= LANG= wc -c < 'MultiSound.d/Makefile'`"
+ test 106 -eq "$shar_count" ||
+ $echo 'MultiSound.d/Makefile:' 'original size' '106,' 'current size' "$shar_count!"
+ fi
+fi
+# ============= MultiSound.d/conv.l ==============
+if test -f 'MultiSound.d/conv.l' && test "$first_param" != -c; then
+ $echo 'x -' SKIPPING 'MultiSound.d/conv.l' '(file already exists)'
+else
+ $echo 'x -' extracting 'MultiSound.d/conv.l' '(text)'
+ sed 's/^X//' << 'SHAR_EOF' > 'MultiSound.d/conv.l' &&
+%%
+[ \n\t,\r]
+\;.*
+DB
+[0-9A-Fa-f]+H { int n; sscanf(yytext, "%xH", &n); printf("%c", n); }
+%%
+int yywrap() { return 1; }
+main() { yylex(); }
+SHAR_EOF
+ $shar_touch -am 0828231798 'MultiSound.d/conv.l' &&
+ chmod 0664 'MultiSound.d/conv.l' ||
+ $echo 'restore of' 'MultiSound.d/conv.l' 'failed'
+ if ( md5sum --help 2>&1 | grep 'sage: md5sum \[' ) >/dev/null 2>&1 \
+ && ( md5sum --version 2>&1 | grep -v 'textutils 1.12' ) >/dev/null; then
+ md5sum -c << SHAR_EOF >/dev/null 2>&1 \
+ || $echo 'MultiSound.d/conv.l:' 'MD5 check failed'
+d2411fc32cd71a00dcdc1f009e858dd2 MultiSound.d/conv.l
+SHAR_EOF
+ else
+ shar_count="`LC_ALL= LC_CTYPE= LANG= wc -c < 'MultiSound.d/conv.l'`"
+ test 141 -eq "$shar_count" ||
+ $echo 'MultiSound.d/conv.l:' 'original size' '141,' 'current size' "$shar_count!"
+ fi
+fi
+# ============= MultiSound.d/msndreset.c ==============
+if test -f 'MultiSound.d/msndreset.c' && test "$first_param" != -c; then
+ $echo 'x -' SKIPPING 'MultiSound.d/msndreset.c' '(file already exists)'
+else
+ $echo 'x -' extracting 'MultiSound.d/msndreset.c' '(text)'
+ sed 's/^X//' << 'SHAR_EOF' > 'MultiSound.d/msndreset.c' &&
+/*********************************************************************
+X *
+X * msndreset.c - resets the MultiSound card
+X *
+X * Copyright (C) 1998 Andrew Veliath
+X *
+X * This program is free software; you can redistribute it and/or modify
+X * it under the terms of the GNU General Public License as published by
+X * the Free Software Foundation; either version 2 of the License, or
+X * (at your option) any later version.
+X *
+X * This program is distributed in the hope that it will be useful,
+X * but WITHOUT ANY WARRANTY; without even the implied warranty of
+X * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+X * GNU General Public License for more details.
+X *
+X * You should have received a copy of the GNU General Public License
+X * along with this program; if not, write to the Free Software
+X * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+X *
+X ********************************************************************/
+X
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/soundcard.h>
+X
+int main(int argc, char *argv[])
+{
+X int fd;
+X
+X if (argc != 2) {
+X fprintf(stderr, "usage: msndreset <mixer device>\n");
+X exit(1);
+X }
+X
+X if ((fd = open(argv[1], O_RDWR)) < 0) {
+X perror(argv[1]);
+X exit(1);
+X }
+X
+X if (ioctl(fd, SOUND_MIXER_PRIVATE1, 0) < 0) {
+X fprintf(stderr, "error: msnd ioctl reset failed\n");
+X perror("ioctl");
+X close(fd);
+X exit(1);
+X }
+X
+X close(fd);
+X
+X return 0;
+}
+SHAR_EOF
+ $shar_touch -am 1204100698 'MultiSound.d/msndreset.c' &&
+ chmod 0664 'MultiSound.d/msndreset.c' ||
+ $echo 'restore of' 'MultiSound.d/msndreset.c' 'failed'
+ if ( md5sum --help 2>&1 | grep 'sage: md5sum \[' ) >/dev/null 2>&1 \
+ && ( md5sum --version 2>&1 | grep -v 'textutils 1.12' ) >/dev/null; then
+ md5sum -c << SHAR_EOF >/dev/null 2>&1 \
+ || $echo 'MultiSound.d/msndreset.c:' 'MD5 check failed'
+c52f876521084e8eb25e12e01dcccb8a MultiSound.d/msndreset.c
+SHAR_EOF
+ else
+ shar_count="`LC_ALL= LC_CTYPE= LANG= wc -c < 'MultiSound.d/msndreset.c'`"
+ test 1472 -eq "$shar_count" ||
+ $echo 'MultiSound.d/msndreset.c:' 'original size' '1472,' 'current size' "$shar_count!"
+ fi
+fi
+rm -fr _sh01426
+exit 0
diff --git a/Documentation/sound/oss/OPL3 b/Documentation/sound/oss/OPL3
new file mode 100644
index 0000000..2468ff8
--- /dev/null
+++ b/Documentation/sound/oss/OPL3
@@ -0,0 +1,6 @@
+A pure OPL3 card is nice and easy to configure. Simply do
+
+insmod opl3 io=0x388
+
+Change the I/O address in the very unlikely case this card is differently
+configured
diff --git a/Documentation/sound/oss/Opti b/Documentation/sound/oss/Opti
new file mode 100644
index 0000000..c15af3c
--- /dev/null
+++ b/Documentation/sound/oss/Opti
@@ -0,0 +1,222 @@
+Support for the OPTi 82C931 chip
+--------------------------------
+Note: parts of this README file apply also to other
+cards that use the mad16 driver.
+
+Some items in this README file are based on features
+added to the sound driver after Linux-2.1.91 was out.
+By the time of writing this I do not know which official
+kernel release will include these features.
+Please do not report inconsistencies on older Linux
+kernels.
+
+The OPTi 82C931 is supported in its non-PnP mode.
+Usually you do not need to set jumpers, etc. The sound driver
+will check the card status and if it is required it will
+force the card into a mode in which it can be programmed.
+
+If you have another OS installed on your computer it is recommended
+that Linux and the other OS use the same resources.
+
+Also, it is recommended that resources specified in /etc/modprobe.conf
+and resources specified in /etc/isapnp.conf agree.
+
+Compiling the sound driver
+--------------------------
+I highly recommend that you build a modularized sound driver.
+This document does not cover a sound-driver which is built in
+the kernel.
+
+Sound card support should be enabled as a module (chose m).
+Answer 'm' for these items:
+ Generic OPL2/OPL3 FM synthesizer support (CONFIG_SOUND_ADLIB)
+ Microsoft Sound System support (CONFIG_SOUND_MSS)
+ Support for OPTi MAD16 and/or Mozart based cards (CONFIG_SOUND_MAD16)
+ FM synthesizer (YM3812/OPL-3) support (CONFIG_SOUND_YM3812)
+
+The configuration menu may ask for addresses, IRQ lines or DMA
+channels. If the card is used as a module the module loading
+options will override these values.
+
+For the OPTi 931 you can answer 'n' to:
+ Support MIDI in older MAD16 based cards (requires SB) (CONFIG_SOUND_MAD16_OLDCARD)
+If you do need MIDI support in a Mozart or C928 based card you
+need to answer 'm' to the above question. In that case you will
+also need to answer 'm' to:
+ '100% Sound Blaster compatibles (SB16/32/64, ESS, Jazz16) support' (CONFIG_SOUND_SB)
+
+Go on and compile your kernel and modules. Install the modules. Run depmod -a.
+
+Using isapnptools
+-----------------
+In most systems with a PnP BIOS you do not need to use isapnp. The
+initialization provided by the BIOS is sufficient for the driver
+to pick up the card and continue initialization.
+
+If that fails, or if you have other PnP cards, you need to use isapnp
+to initialize the card.
+This was tested with isapnptools-1.11 but I recommend that you use
+isapnptools-1.13 (or newer). Run pnpdump to dump the information
+about your PnP cards. Then edit the resulting file and select
+the options of your choice. This file is normally installed as
+/etc/isapnp.conf.
+
+The driver has one limitation with respect to I/O port resources:
+IO3 base must be 0x0E0C. Although isapnp allows other ports, this
+address is hard-coded into the driver.
+
+Using kmod and autoloading the sound driver
+-------------------------------------------
+Comment: as of linux-2.1.90 kmod is replacing kerneld.
+The config file '/etc/modprobe.conf' is used as before.
+
+This is the sound part of my /etc/modprobe.conf file.
+Following that I will explain each line.
+
+alias mixer0 mad16
+alias audio0 mad16
+alias midi0 mad16
+alias synth0 opl3
+options sb mad16=1
+options mad16 irq=10 dma=0 dma16=1 io=0x530 joystick=1 cdtype=0
+options opl3 io=0x388
+install mad16 /sbin/modprobe -i mad16 && /sbin/ad1848_mixer_reroute 14 8 15 3 16 6
+
+If you have an MPU daughtercard or onboard MPU you will want to add to the
+"options mad16" line - eg
+
+options mad16 irq=5 dma=0 dma16=3 io=0x530 mpu_io=0x330 mpu_irq=9
+
+To set the I/O and IRQ of the MPU.
+
+
+Explain:
+
+alias mixer0 mad16
+alias audio0 mad16
+alias midi0 mad16
+alias synth0 opl3
+
+When any sound device is opened the kernel requests auto-loading
+of char-major-14. There is a built-in alias that translates this
+request to loading the main sound module.
+
+The sound module in its turn will request loading of a sub-driver
+for mixer, audio, midi or synthesizer device. The first 3 are
+supported by the mad16 driver. The synth device is supported
+by the opl3 driver.
+
+There is currently no way to autoload the sound device driver
+if more than one card is installed.
+
+options sb mad16=1
+
+This is left for historical reasons. If you enable the
+config option 'Support MIDI in older MAD16 based cards (requires SB)'
+or if you use an older mad16 driver it will force loading of the
+SoundBlaster driver. This option tells the SB driver not to look
+for a SB card but to wait for the mad16 driver.
+
+options mad16 irq=10 dma=0 dma16=1 io=0x530 joystick=1 cdtype=0
+options opl3 io=0x388
+
+post-install mad16 /sbin/ad1848_mixer_reroute 14 8 15 3 16 6
+
+This sets resources and options for the mad16 and opl3 drivers.
+I use two DMA channels (only one is required) to enable full duplex.
+joystick=1 enables the joystick port. cdtype=0 disables the cd port.
+You can also set mpu_io and mpu_irq in the mad16 options for the
+uart401 driver.
+
+This tells modprobe to run /sbin/ad1848_mixer_reroute after
+mad16 is successfully loaded and initialized. The source
+for ad1848_mixer_reroute is appended to the end of this readme
+file. It is impossible for the sound driver to know the actual
+connections to the mixer. The 3 inputs intended for cd, synth
+and line-in are mapped to the generic inputs line1, line2 and
+line3. This program reroutes these mixer channels to their
+right names (note the right mapping depends on the actual sound
+card that you use).
+The numeric parameters mean:
+ 14=line1 8=cd - reroute line1 to the CD input.
+ 15=line2 3=synth - reroute line2 to the synthesizer input.
+ 16=line3 6=line - reroute line3 to the line input.
+For reference on other input names look at the file
+/usr/include/linux/soundcard.h.
+
+Using a joystick
+-----------------
+You must enable a joystick in the mad16 options. (also
+in /etc/isapnp.conf if you use it).
+Tested with regular analog joysticks.
+
+A CDROM drive connected to the sound card
+-----------------------------------------
+The 82C931 chip has support only for secondary ATAPI cdrom.
+(cdtype=8). Loading the mad16 driver resets the C931 chip
+and if a cdrom was already mounted it may cause a complete
+system hang. Do not use the sound card if you have an alternative.
+If you do use the sound card it is important that you load
+the mad16 driver (use "modprobe mad16" to prevent auto-unloading)
+before the cdrom is accessed the first time.
+
+Using the sound driver built-in to the kernel may help here, but...
+Most new systems have a PnP BIOS and also two IDE controllers.
+The IDE controller on the sound card may be needed only on older
+systems (which have only one IDE controller) but these systems
+also do not have a PnP BIOS - requiring isapnptools and a modularized
+driver.
+
+Known problems
+--------------
+1. See the section on "A CDROM drive connected to the sound card".
+
+2. On my system the codec cannot capture companded sound samples.
+ (eg., recording from /dev/audio). When any companded capture is
+ requested I get stereo-16 bit samples instead. Playback of
+ companded samples works well. Apparently this problem is not common
+ to all C931 based cards. I do not know how to identify cards that
+ have this problem.
+
+Source for ad1848_mixer_reroute.c
+---------------------------------
+#include <stdio.h>
+#include <fcntl.h>
+#include <linux/soundcard.h>
+
+static char *mixer_names[SOUND_MIXER_NRDEVICES] =
+ SOUND_DEVICE_LABELS;
+
+int
+main(int argc, char **argv) {
+ int val, from, to;
+ int i, fd;
+
+ fd = open("/dev/mixer", O_RDWR);
+ if(fd < 0) {
+ perror("/dev/mixer");
+ return 1;
+ }
+
+ for(i = 2; i < argc; i += 2) {
+ from = atoi(argv[i-1]);
+ to = atoi(argv[i]);
+
+ if(to == SOUND_MIXER_NONE)
+ fprintf(stderr, "%s: turning off mixer %s\n",
+ argv[0], mixer_names[to]);
+ else
+ fprintf(stderr, "%s: rerouting mixer %s to %s\n",
+ argv[0], mixer_names[from], mixer_names[to]);
+
+ val = from << 8 | to;
+
+ if(ioctl(fd, SOUND_MIXER_PRIVATE2, &val)) {
+ perror("AD1848 mixer reroute");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
diff --git a/Documentation/sound/oss/PAS16 b/Documentation/sound/oss/PAS16
new file mode 100644
index 0000000..951b3dc
--- /dev/null
+++ b/Documentation/sound/oss/PAS16
@@ -0,0 +1,163 @@
+Pro Audio Spectrum 16 for 2.3.99 and later
+=========================================
+by Thomas Molina (tmolina@home.com)
+last modified 3 Mar 2001
+Acknowledgement to Axel Boldt (boldt@math.ucsb.edu) for stuff taken
+from Configure.help, Riccardo Facchetti for stuff from README.OSS,
+and others whose names I could not find.
+
+This documentation is relevant for the PAS16 driver (pas2_card.c and
+friends) under kernel version 2.3.99 and later. If you are
+unfamiliar with configuring sound under Linux, please read the
+Sound-HOWTO, Documentation/sound/oss/Introduction and other
+relevant docs first.
+
+The following information is relevant information from README.OSS
+and legacy docs for the Pro Audio Spectrum 16 (PAS16):
+==================================================================
+
+The pas2_card.c driver supports the following cards --
+Pro Audio Spectrum 16 (PAS16) and compatibles:
+ Pro Audio Spectrum 16
+ Pro Audio Studio 16
+ Logitech Sound Man 16
+ NOTE! The original Pro Audio Spectrum as well as the PAS+ are not
+ and will not be supported by the driver.
+
+The sound driver configuration dialog
+-------------------------------------
+
+Sound configuration starts by making some yes/no questions. Be careful
+when answering to these questions since answering y to a question may
+prevent some later ones from being asked. For example don't answer y to
+the question about (PAS16) if you don't really have a PAS16. Sound
+configuration may also be made modular by answering m to configuration
+options presented.
+
+Note also that all questions may not be asked. The configuration program
+may disable some questions depending on the earlier choices. It may also
+select some options automatically as well.
+
+ "ProAudioSpectrum 16 support",
+ - Answer 'y'_ONLY_ if you have a Pro Audio Spectrum _16_,
+ Pro Audio Studio 16 or Logitech SoundMan 16 (be sure that
+ you read the above list correctly). Don't answer 'y' if you
+ have some other card made by Media Vision or Logitech since they
+ are not PAS16 compatible.
+ NOTE! Since 3.5-beta10 you need to enable SB support (next question)
+ if you want to use the SB emulation of PAS16. It's also possible to
+ the emulation if you want to use a true SB card together with PAS16
+ (there is another question about this that is asked later).
+
+ "Generic OPL2/OPL3 FM synthesizer support",
+ - Answer 'y' if your card has a FM chip made by Yamaha (OPL2/OPL3/OPL4).
+ The PAS16 has an OPL3-compatible FM chip.
+
+With PAS16 you can use two audio device files at the same time. /dev/dsp (and
+/dev/audio) is connected to the 8/16 bit native codec and the /dev/dsp1 (and
+/dev/audio1) is connected to the SB emulation (8 bit mono only).
+
+
+The new stuff for 2.3.99 and later
+============================================================================
+The following configuration options from Documentation/Configure.help
+are relevant to configuring the PAS16:
+
+Sound card support
+CONFIG_SOUND
+ If you have a sound card in your computer, i.e. if it can say more
+ than an occasional beep, say Y. Be sure to have all the information
+ about your sound card and its configuration down (I/O port,
+ interrupt and DMA channel), because you will be asked for it.
+
+ You want to read the Sound-HOWTO, available from
+ http://www.tldp.org/docs.html#howto . General information
+ about the modular sound system is contained in the files
+ Documentation/sound/oss/Introduction. The file
+ Documentation/sound/oss/README.OSS contains some slightly outdated but
+ still useful information as well.
+
+OSS sound modules
+CONFIG_SOUND_OSS
+ OSS is the Open Sound System suite of sound card drivers. They make
+ sound programming easier since they provide a common API. Say Y or M
+ here (the module will be called sound.o) if you haven't found a
+ driver for your sound card above, then pick your driver from the
+ list below.
+
+Persistent DMA buffers
+CONFIG_SOUND_DMAP
+ Linux can often have problems allocating DMA buffers for ISA sound
+ cards on machines with more than 16MB of RAM. This is because ISA
+ DMA buffers must exist below the 16MB boundary and it is quite
+ possible that a large enough free block in this region cannot be
+ found after the machine has been running for a while. If you say Y
+ here the DMA buffers (64Kb) will be allocated at boot time and kept
+ until the shutdown. This option is only useful if you said Y to
+ "OSS sound modules", above. If you said M to "OSS sound modules"
+ then you can get the persistent DMA buffer functionality by passing
+ the command-line argument "dmabuf=1" to the sound.o module.
+
+ Say y here for PAS16.
+
+ProAudioSpectrum 16 support
+CONFIG_SOUND_PAS
+ Answer Y only if you have a Pro Audio Spectrum 16, ProAudio Studio
+ 16 or Logitech SoundMan 16 sound card. Don't answer Y if you have
+ some other card made by Media Vision or Logitech since they are not
+ PAS16 compatible. It is not necessary to enable the separate
+ Sound Blaster support; it is included in the PAS driver.
+
+ If you compile the driver into the kernel, you have to add
+ "pas2=<io>,<irq>,<dma>,<dma2>,<sbio>,<sbirq>,<sbdma>,<sbdma2>
+ to the kernel command line.
+
+FM Synthesizer (YM3812/OPL-3) support
+CONFIG_SOUND_YM3812
+ Answer Y if your card has a FM chip made by Yamaha (OPL2/OPL3/OPL4).
+ Answering Y is usually a safe and recommended choice, however some
+ cards may have software (TSR) FM emulation. Enabling FM support with
+ these cards may cause trouble (I don't currently know of any such
+ cards, however).
+ Please read the file Documentation/sound/oss/OPL3 if your card has an
+ OPL3 chip.
+ If you compile the driver into the kernel, you have to add
+ "opl3=<io>" to the kernel command line.
+
+ If you compile your drivers into the kernel, you MUST configure
+ OPL3 support as a module for PAS16 support to work properly.
+ You can then get OPL3 functionality by issuing the command:
+ insmod opl3
+ In addition, you must either add the following line to
+ /etc/modprobe.conf:
+ options opl3 io=0x388
+ or else add the following line to /etc/lilo.conf:
+ opl3=0x388
+
+
+EXAMPLES
+===================================================================
+To use the PAS16 in my computer I have enabled the following sound
+configuration options:
+
+CONFIG_SOUND=y
+CONFIG_SOUND_OSS=y
+CONFIG_SOUND_TRACEINIT=y
+CONFIG_SOUND_DMAP=y
+CONFIG_SOUND_PAS=y
+CONFIG_SOUND_SB=n
+CONFIG_SOUND_YM3812=m
+
+I have also included the following append line in /etc/lilo.conf:
+append="pas2=0x388,10,3,-1,0x220,5,1,-1 sb=0x220,5,1,-1 opl3=0x388"
+
+The io address of 0x388 is default configuration on the PAS16. The
+irq of 10 and dma of 3 may not match your installation. The above
+configuration enables PAS16, 8-bit Soundblaster and OPL3
+functionality. If Soundblaster functionality is not desired, the
+following line would be appropriate:
+append="pas2=0x388,10,3,-1,0,-1,-1,-1 opl3=0x388"
+
+If sound is built totally modular, the above options may be
+specified in /etc/modprobe.conf for pas2, sb and opl3
+respectively.
diff --git a/Documentation/sound/oss/PSS b/Documentation/sound/oss/PSS
new file mode 100644
index 0000000..187b952
--- /dev/null
+++ b/Documentation/sound/oss/PSS
@@ -0,0 +1,41 @@
+The PSS cards and other ECHO based cards provide an onboard DSP with
+downloadable programs and also has an AD1848 "Microsoft Sound System"
+device. The PSS driver enables MSS and MPU401 modes of the card. SB
+is not enabled since it doesn't work concurrently with MSS.
+
+If you build this driver as a module then the driver takes the following
+parameters
+
+pss_io. The I/O base the PSS card is configured at (normally 0x220
+ or 0x240)
+
+mss_io The base address of the Microsoft Sound System interface.
+ This is normally 0x530, but may be 0x604 or other addresses.
+
+mss_irq The interrupt assigned to the Microsoft Sound System
+ emulation. IRQ's 3,5,7,9,10,11 and 12 are available. If you
+ get IRQ errors be sure to check the interrupt is set to
+ "ISA/Legacy" in the BIOS on modern machines.
+
+mss_dma The DMA channel used by the Microsoft Sound System.
+ This can be 0, 1, or 3. DMA 0 is not available on older
+ machines and will cause a crash on them.
+
+mpu_io The MPU emulation base address. This sets the base of the
+ synthesizer. It is typically 0x330 but can be altered.
+
+mpu_irq The interrupt to use for the synthesizer. It must differ
+ from the IRQ used by the Microsoft Sound System port.
+
+
+The mpu_io/mpu_irq fields are optional. If they are not specified the
+synthesizer parts are not configured.
+
+When the module is loaded it looks for a file called
+/etc/sound/pss_synth. This is the firmware file from the DOS install disks.
+This fil holds a general MIDI emulation. The file expected is called
+genmidi.ld on newer DOS driver install disks and synth.ld on older ones.
+
+You can also load alternative DSP algorithms into the card if you wish. One
+alternative driver can be found at http://www.mpg123.de/
+
diff --git a/Documentation/sound/oss/PSS-updates b/Documentation/sound/oss/PSS-updates
new file mode 100644
index 0000000..c84dd75
--- /dev/null
+++ b/Documentation/sound/oss/PSS-updates
@@ -0,0 +1,88 @@
+ This file contains notes for users of PSS sound cards who wish to use the
+newly added features of the newest version of this driver.
+
+ The major enhancements present in this new revision of this driver is the
+addition of two new module parameters that allow you to take full advantage of
+all the features present on your PSS sound card. These features include the
+ability to enable both the builtin CDROM and joystick ports.
+
+pss_enable_joystick
+
+ This parameter is basically a flag. A 0 will leave the joystick port
+disabled, while a non-zero value would enable the joystick port. The default
+setting is pss_enable_joystick=0 as this keeps this driver fully compatible
+with systems that were using previous versions of this driver. If you wish to
+enable the joystick port you will have to add pss_enable_joystick=1 as an
+argument to the driver. To actually use the joystick port you will then have
+to load the joystick driver itself. Just remember to load the joystick driver
+AFTER the pss sound driver.
+
+pss_cdrom_port
+
+ This parameter takes a port address as its parameter. Any available port
+address can be specified to enable the CDROM port, except for 0x0 and -1 as
+these values would leave the port disabled. Like the joystick port, the cdrom
+port will require that an appropriate CDROM driver be loaded before you can make
+use of the newly enabled CDROM port. Like the joystick port option above,
+remember to load the CDROM driver AFTER the pss sound driver. While it may
+differ on some PSS sound cards, all the PSS sound cards that I have seen have a
+builtin Wearnes CDROM port. If this is the case with your PSS sound card you
+should load aztcd with the appropriate port option that matches the port you
+assigned to the CDROM port when you loaded your pss sound driver. (ex.
+modprobe pss pss_cdrom_port=0x340 && modprobe aztcd aztcd=0x340) The default
+setting of this parameter leaves the CDROM port disabled to maintain full
+compatibility with systems using previous versions of this driver.
+
+ Other options have also been added for the added convenience and utility
+of the user. These options are only available if this driver is loaded as a
+module.
+
+pss_no_sound
+
+ This module parameter is a flag that can be used to tell the driver to
+just configure non-sound components. 0 configures all components, a non-0
+value will only attept to configure the CDROM and joystick ports. This
+parameter can be used by a user who only wished to use the builtin joystick
+and/or CDROM port(s) of his PSS sound card. If this driver is loaded with this
+parameter and with the parameter below set to true then a user can safely unload
+this driver with the following command "rmmod pss && rmmod ad1848 && rmmod
+mpu401 && rmmod sound && rmmod soundcore" and retain the full functionality of
+his CDROM and/or joystick port(s) while gaining back the memory previously used
+by the sound drivers. This default setting of this parameter is 0 to retain
+full behavioral compatibility with previous versions of this driver.
+
+pss_keep_settings
+
+ This parameter can be used to specify whether you want the driver to reset
+all emulations whenever its unloaded. This can be useful for those who are
+sharing resources (io ports, IRQ's, DMA's) between different ISA cards. This
+flag can also be useful in that future versions of this driver may reset all
+emulations by default on the driver's unloading (as it probably should), so
+specifying it now will ensure that all future versions of this driver will
+continue to work as expected. The default value of this parameter is 1 to
+retain full behavioral compatibility with previous versions of this driver.
+
+pss_firmware
+
+ This parameter can be used to specify the file containing the firmware
+code so that a user could tell the driver where that file is located instead
+of having to put it in a predefined location with a predefined name. The
+default setting of this parameter is "/etc/sound/pss_synth" as this was the
+path and filename the hardcoded value in the previous versions of this driver.
+
+Examples:
+
+# Normal PSS sound card system, loading of drivers.
+# Should be specified in an rc file (ex. Slackware uses /etc/rc.d/rc.modules).
+
+/sbin/modprobe pss pss_io=0x220 mpu_io=0x338 mpu_irq=9 mss_io=0x530 mss_irq=10 mss_dma=1 pss_cdrom_port=0x340 pss_enable_joystick=1
+/sbin/modprobe aztcd aztcd=0x340
+/sbin/modprobe joystick
+
+# System using the PSS sound card just for its CDROM and joystick ports.
+# Should be specified in an rc file (ex. Slackware uses /etc/rc.d/rc.modules).
+
+/sbin/modprobe pss pss_io=0x220 pss_cdrom_port=0x340 pss_enable_joystick=1 pss_no_sound=1
+/sbin/rmmod pss && /sbin/rmmod ad1848 && /sbin/rmmod mpu401 && /sbin/rmmod sound && /sbin/rmmod soundcore # This line not needed, but saves memory.
+/sbin/modprobe aztcd aztcd=0x340
+/sbin/modprobe joystick
diff --git a/Documentation/sound/oss/README.OSS b/Documentation/sound/oss/README.OSS
new file mode 100644
index 0000000..fd42b05
--- /dev/null
+++ b/Documentation/sound/oss/README.OSS
@@ -0,0 +1,1456 @@
+Introduction
+------------
+
+This file is a collection of all the old Readme files distributed with
+OSS/Lite by Hannu Savolainen. Since the new Linux sound driver is founded
+on it I think these information may still be interesting for users that
+have to configure their sound system.
+
+Be warned: Alan Cox is the current maintainer of the Linux sound driver so if
+you have problems with it, please contact him or the current device-specific
+driver maintainer (e.g. for aedsp16 specific problems contact me). If you have
+patches, contributions or suggestions send them to Alan: I'm sure they are
+welcome.
+
+In this document you will find a lot of references about OSS/Lite or ossfree:
+they are gone forever. Keeping this in mind and with a grain of salt this
+document can be still interesting and very helpful.
+
+[ File edited 17.01.1999 - Riccardo Facchetti ]
+[ Edited miroSOUND section 19.04.2001 - Robert Siemer ]
+
+OSS/Free version 3.8 release notes
+----------------------------------
+
+Please read the SOUND-HOWTO (available from sunsite.unc.edu and other Linux FTP
+sites). It gives instructions about using sound with Linux. It's bit out of
+date but still very useful. Information about bug fixes and such things
+is available from the web page (see above).
+
+Please check http://www.opensound.com/pguide for more info about programming
+with OSS API.
+
+ ====================================================
+- THIS VERSION ____REQUIRES____ Linux 2.1.57 OR LATER.
+ ====================================================
+
+Packages "snd-util-3.8.tar.gz" and "snd-data-0.1.tar.Z"
+contain useful utilities to be used with this driver.
+See http://www.opensound.com/ossfree/getting.html for
+download instructions.
+
+If you are looking for the installation instructions, please
+look forward into this document.
+
+Supported sound cards
+---------------------
+
+See below.
+
+Contributors
+------------
+
+This driver contains code by several contributors. In addition several other
+persons have given useful suggestions. The following is a list of major
+contributors. (I could have forgotten some names.)
+
+ Craig Metz 1/2 of the PAS16 Mixer and PCM support
+ Rob Hooft Volume computation algorithm for the FM synth.
+ Mika Liljeberg uLaw encoding and decoding routines
+ Jeff Tranter Linux SOUND HOWTO document
+ Greg Lee Volume computation algorithm for the GUS and
+ lots of valuable suggestions.
+ Andy Warner ISC port
+ Jim Lowe,
+ Amancio Hasty Jr FreeBSD/NetBSD port
+ Anders Baekgaard Bug hunting and valuable suggestions.
+ Joerg Schubert SB16 DSP support (initial version).
+ Andrew Robinson Improvements to the GUS driver
+ Megens SA MIDI recording for SB and SB Pro (initial version).
+ Mikael Nordqvist Linear volume support for GUS and
+ nonblocking /dev/sequencer.
+ Ian Hartas SVR4.2 port
+ Markus Aroharju and
+ Risto Kankkunen Major contributions to the mixer support
+ of GUS v3.7.
+ Hunyue Yau Mixer support for SG NX Pro.
+ Marc Hoffman PSS support (initial version).
+ Rainer Vranken Initialization for Jazz16 (initial version).
+ Peter Trattler Initial version of loadable module support for Linux.
+ JRA Gibson 16 bit mode for Jazz16 (initial version)
+ Davor Jadrijevic MAD16 support (initial version)
+ Gregor Hoffleit Mozart support (initial version)
+ Riccardo Facchetti Audio Excel DSP 16 (aedsp16) support
+ James Hightower Spotting a tiny but important bug in CS423x support.
+ Denis Sablic OPTi 82C924 specific enhancements (non PnP mode)
+ Tim MacKenzie Full duplex support for OPTi 82C930.
+
+ Please look at lowlevel/README for more contributors.
+
+There are probably many other names missing. If you have sent me some
+patches and your name is not in the above list, please inform me.
+
+Sending your contributions or patches
+-------------------------------------
+
+First of all it's highly recommended to contact me before sending anything
+or before even starting to do any work. Tell me what you suggest to be
+changed or what you have planned to do. Also ensure you are using the
+very latest (development) version of OSS/Free since the change may already be
+implemented there. In general it's a major waste of time to try to improve a
+several months old version. Information about the latest version can be found
+from http://www.opensound.com/ossfree. In general there is no point in
+sending me patches relative to production kernels.
+
+Sponsors etc.
+-------------
+
+The following companies have greatly helped development of this driver
+in form of a free copy of their product:
+
+Novell, Inc. UnixWare personal edition + SDK
+The Santa Cruz Operation, Inc. A SCO OpenServer + SDK
+Ensoniq Corp, a SoundScape card and extensive amount of assistance
+MediaTrix Peripherals Inc, a AudioTrix Pro card + SDK
+Acer, Inc. a pair of AcerMagic S23 cards.
+
+In addition the following companies have provided me sufficient amount
+of technical information at least some of their products (free or $$$):
+
+Advanced Gravis Computer Technology Ltd.
+Media Vision Inc.
+Analog Devices Inc.
+Logitech Inc.
+Aztech Labs Inc.
+Crystal Semiconductor Corporation,
+Integrated Circuit Systems Inc.
+OAK Technology
+OPTi
+Turtle Beach
+miro
+Ad Lib Inc. ($$)
+Music Quest Inc. ($$)
+Creative Labs ($$$)
+
+If you have some problems
+=========================
+
+Read the sound HOWTO (sunsite.unc.edu:/pub/Linux/docs/...?).
+Also look at the home page (http://www.opensound.com/ossfree). It may
+contain info about some recent bug fixes.
+
+It's likely that you have some problems when trying to use the sound driver
+first time. Sound cards don't have standard configuration so there are no
+good default configuration to use. Please try to use same I/O, DMA and IRQ
+values for the sound card than with DOS.
+
+If you get an error message when trying to use the driver, please look
+at /var/adm/messages for more verbose error message.
+
+
+The following errors are likely with /dev/dsp and /dev/audio.
+
+ - "No such device or address".
+ This error indicates that there are no suitable hardware for the
+ device file or the sound driver has been compiled without support for
+ this particular device. For example /dev/audio and /dev/dsp will not
+ work if "digitized voice support" was not enabled during "make config".
+
+ - "Device or resource busy". Probably the IRQ (or DMA) channel
+ required by the sound card is in use by some other device/driver.
+
+ - "I/O error". Almost certainly (99%) it's an IRQ or DMA conflict.
+ Look at the kernel messages in /var/adm/notice for more info.
+
+ - "Invalid argument". The application is calling ioctl()
+ with impossible parameters. Check that the application is
+ for sound driver version 2.X or later.
+
+Linux installation
+==================
+
+IMPORTANT! Read this if you are installing a separately
+ distributed version of this driver.
+
+ Check that your kernel version works with this
+ release of the driver (see Readme). Also verify
+ that your current kernel version doesn't have more
+ recent sound driver version than this one. IT'S HIGHLY
+ RECOMMENDED THAT YOU USE THE SOUND DRIVER VERSION THAT
+ IS DISTRIBUTED WITH KERNEL SOURCES.
+
+- When installing separately distributed sound driver you should first
+ read the above notice. Then try to find proper directory where and how
+ to install the driver sources. You should not try to install a separately
+ distributed driver version if you are not able to find the proper way
+ yourself (in this case use the version that is distributed with kernel
+ sources). Remove old version of linux/drivers/sound directory before
+ installing new files.
+
+- To build the device files you need to run the enclosed shell script
+ (see below). You need to do this only when installing sound driver
+ first time or when upgrading to much recent version than the earlier
+ one.
+
+- Configure and compile Linux as normally (remember to include the
+ sound support during "make config"). Please refer to kernel documentation
+ for instructions about configuring and compiling kernel. File Readme.cards
+ contains card specific instructions for configuring this driver for
+ use with various sound cards.
+
+Boot time configuration (using lilo and insmod)
+-----------------------------------------------
+
+This information has been removed. Too many users didn't believe
+that it's really not necessary to use this method. Please look at
+Readme of sound driver version 3.0.1 if you still want to use this method.
+
+Problems
+--------
+
+Common error messages:
+
+- /dev/???????: No such file or directory.
+Run the script at the end of this file.
+
+- /dev/???????: No such device.
+You are not running kernel which contains the sound driver. When using
+modularized sound driver this error means that the sound driver is not
+loaded.
+
+- /dev/????: No such device or address.
+Sound driver didn't detect suitable card when initializing. Please look at
+Readme.cards for info about configuring the driver with your card. Also
+check for possible boot (insmod) time error messages in /var/adm/messages.
+
+- Other messages or problems
+Please check http://www.opensound.com/ossfree for more info.
+
+Configuring version 3.8 (for Linux) with some common sound cards
+================================================================
+
+This document describes configuring sound cards with the freeware version of
+Open Sound Systems (OSS/Free). Information about the commercial version
+(OSS/Linux) and its configuration is available from
+http://www.opensound.com/linux.html. Information presented here is
+not valid for OSS/Linux.
+
+If you are unsure about how to configure OSS/Free
+you can download the free evaluation version of OSS/Linux from the above
+address. There is a chance that it can autodetect your sound card. In this case
+you can use the information included in soundon.log when configuring OSS/Free.
+
+
+IMPORTANT! This document covers only cards that were "known" when
+ this driver version was released. Please look at
+ http://www.opensound.com/ossfree for info about
+ cards introduced recently.
+
+ When configuring the sound driver, you should carefully
+ check each sound configuration option (particularly
+ "Support for /dev/dsp and /dev/audio"). The default values
+ offered by these programs are not necessarily valid.
+
+
+THE BIGGEST MISTAKES YOU CAN MAKE
+=================================
+
+1. Assuming that the card is Sound Blaster compatible when it's not.
+--------------------------------------------------------------------
+
+The number one mistake is to assume that your card is compatible with
+Sound Blaster. Only the cards made by Creative Technology or which have
+one or more chips labeled by Creative are SB compatible. In addition there
+are few sound chipsets which are SB compatible in Linux such as ESS1688 or
+Jazz16. Note that SB compatibility in DOS/Windows does _NOT_ mean anything
+in Linux.
+
+IF YOU REALLY ARE 150% SURE YOU HAVE A SOUND BLASTER YOU CAN SKIP THE REST OF
+THIS CHAPTER.
+
+For most other "supposed to be SB compatible" cards you have to use other
+than SB drivers (see below). It is possible to get most sound cards to work
+in SB mode but in general it's a complete waste of time. There are several
+problems which you will encounter by using SB mode with cards that are not
+truly SB compatible:
+
+- The SB emulation is at most SB Pro (DSP version 3.x) which means that
+you get only 8 bit audio (there is always an another ("native") mode which
+gives the 16 bit capability). The 8 bit only operation is the reason why
+many users claim that sound quality in Linux is much worse than in DOS.
+In addition some applications require 16 bit mode and they produce just
+noise with a 8 bit only device.
+- The card may work only in some cases but refuse to work most of the
+time. The SB compatible mode always requires special initialization which is
+done by the DOS/Windows drivers. This kind of cards work in Linux after
+you have warm booted it after DOS but they don't work after cold boot
+(power on or reset).
+- You get the famous "DMA timed out" messages. Usually all SB clones have
+software selectable IRQ and DMA settings. If the (power on default) values
+currently used by the card don't match configuration of the driver you will
+get the above error message whenever you try to record or play. There are
+few other reasons to the DMA timeout message but using the SB mode seems
+to be the most common cause.
+
+2. Trying to use a PnP (Plug & Play) card just like an ordinary sound card
+--------------------------------------------------------------------------
+
+Plug & Play is a protocol defined by Intel and Microsoft. It lets operating
+systems to easily identify and reconfigure I/O ports, IRQs and DMAs of ISA
+cards. The problem with PnP cards is that the standard Linux doesn't currently
+(versions 2.1.x and earlier) don't support PnP. This means that you will have
+to use some special tricks (see later) to get a PnP card alive. Many PnP cards
+work after they have been initialized but this is not always the case.
+
+There are sometimes both PnP and non-PnP versions of the same sound card.
+The non-PnP version is the original model which usually has been discontinued
+more than an year ago. The PnP version has the same name but with "PnP"
+appended to it (sometimes not). This causes major confusion since the non-PnP
+model works with Linux but the PnP one doesn't.
+
+You should carefully check if "Plug & Play" or "PnP" is mentioned in the name
+of the card or in the documentation or package that came with the card.
+Everything described in the rest of this document is not necessarily valid for
+PnP models of sound cards even you have managed to wake up the card properly.
+Many PnP cards are simply too different from their non-PnP ancestors which are
+covered by this document.
+
+
+Cards that are not (fully) supported by this driver
+===================================================
+
+See http://www.opensound.com/ossfree for information about sound cards
+to be supported in future.
+
+
+How to use sound without recompiling kernel and/or sound driver
+===============================================================
+
+There is a commercial sound driver which comes in precompiled form and doesn't
+require recompiling of the kernel. See http://www.4Front-tech.com/oss.html for
+more info.
+
+
+Configuring PnP cards
+=====================
+
+New versions of most sound cards use the so-called ISA PnP protocol for
+soft configuring their I/O, IRQ, DMA and shared memory resources.
+Currently at least cards made by Creative Technology (SB32 and SB32AWE
+PnP), Gravis (GUS PnP and GUS PnP Pro), Ensoniq (Soundscape PnP) and
+Aztech (some Sound Galaxy models) use PnP technology. The CS4232/4236 audio
+chip by Crystal Semiconductor (Intel Atlantis, HP Pavilion and many other
+motherboards) is also based on PnP technology but there is a "native" driver
+available for it (see information about CS4232 later in this document).
+
+PnP sound cards (as well as most other PnP ISA cards) are not supported
+by this version of the driver . Proper
+support for them should be released during 97 once the kernel level
+PnP support is available.
+
+There is a method to get most of the PnP cards to work. The basic method
+is the following:
+
+1) Boot DOS so the card's DOS drivers have a chance to initialize it.
+2) _Cold_ boot to Linux by using "loadlin.exe". Hitting ctrl-alt-del
+works with older machines but causes a hard reset of all cards on recent
+(Pentium) machines.
+3) If you have the sound driver in Linux configured properly, the card should
+work now. "Proper" means that I/O, IRQ and DMA settings are the same as in
+DOS. The hard part is to find which settings were used. See the documentation of
+your card for more info.
+
+Windows 95 could work as well as DOS but running loadlin may be difficult.
+Probably you should "shut down" your machine to MS-DOS mode before running it.
+
+Some machines have a BIOS utility for setting PnP resources. This is a good
+way to configure some cards. In this case you don't need to boot DOS/Win95
+before starting Linux.
+
+Another way to initialize PnP cards without DOS/Win95 is a Linux based
+PnP isolation tool. When writing this there is a pre alpha test version
+of such a tool available from ftp://ftp.demon.co.uk/pub/unix/linux/utils. The
+file is called isapnptools-*. Please note that this tool is just a temporary
+solution which may be incompatible with future kernel versions having proper
+support for PnP cards. There are bugs in setting DMA channels in earlier
+versions of isapnptools so at least version 1.6 is required with sound cards.
+
+Yet another way to use PnP cards is to use (commercial) OSS/Linux drivers. See
+http://www.opensound.com/linux.html for more info. This is probably the way you
+should do it if you don't want to spend time recompiling the kernel and
+required tools.
+
+
+Read this before trying to configure the driver
+===============================================
+
+There are currently many cards that work with this driver. Some of the cards
+have native support while others work since they emulate some other
+card (usually SB, MSS/WSS and/or MPU401). The following cards have native
+support in the driver. Detailed instructions for configuring these cards
+will be given later in this document.
+
+Pro Audio Spectrum 16 (PAS16) and compatibles:
+ Pro Audio Spectrum 16
+ Pro Audio Studio 16
+ Logitech Sound Man 16
+ NOTE! The original Pro Audio Spectrum as well as the PAS+ are not
+ and will not be supported by the driver.
+
+Media Vision Jazz16 based cards
+ Pro Sonic 16
+ Logitech SoundMan Wave
+ (Other Jazz based cards should work but I don't have any reports
+ about them).
+
+Sound Blasters
+ SB 1.0 to 2.0
+ SB Pro
+ SB 16
+ SB32/64/AWE
+ Configure SB32/64/AWE just like SB16. See lowlevel/README.awe
+ for information about using the wave table synth.
+ NOTE! AWE63/Gold and 16/32/AWE "PnP" cards need to be activated
+ using isapnptools before they work with OSS/Free.
+ SB16 compatible cards by other manufacturers than Creative.
+ You have been fooled since there are _no_ SB16 compatible
+ cards on the market (as of May 1997). It's likely that your card
+ is compatible just with SB Pro but there is also a non-SB-
+ compatible 16 bit mode. Usually it's MSS/WSS but it could also
+ be a proprietary one like MV Jazz16 or ESS ES688. OPTi
+ MAD16 chips are very common in so called "SB 16 bit cards"
+ (try with the MAD16 driver).
+
+ ======================================================================
+ "Supposed to be SB compatible" cards.
+ Forget the SB compatibility and check for other alternatives
+ first. The only cards that work with the SB driver in
+ Linux have been made by Creative Technology (there is at least
+ one chip on the card with "CREATIVE" printed on it). The
+ only other SB compatible chips are ESS and Jazz16 chips
+ (maybe ALSxxx chips too but they probably don't work).
+ Most other "16 bit SB compatible" cards such as "OPTi/MAD16" or
+ "Crystal" are _NOT_ SB compatible in Linux.
+
+ Practically all sound cards have some kind of SB emulation mode
+ in addition to their native (16 bit) mode. In most cases this
+ (8 bit only) SB compatible mode doesn't work with Linux. If
+ you get it working it may cause problems with games and
+ applications which require 16 bit audio. Some 16 bit only
+ applications don't check if the card actually supports 16 bits.
+ They just dump 16 bit data to a 8 bit card which produces just
+ noise.
+
+ In most cases the 16 bit native mode is supported by Linux.
+ Use the SB mode with "clones" only if you don't find anything
+ better from the rest of this doc.
+ ======================================================================
+
+Gravis Ultrasound (GUS)
+ GUS
+ GUS + the 16 bit option
+ GUS MAX
+ GUS ACE (No MIDI port and audio recording)
+ GUS PnP (with RAM)
+
+MPU-401 and compatibles
+ The driver works both with the full (intelligent mode) MPU-401
+ cards (such as MPU IPC-T and MQX-32M) and with the UART only
+ dumb MIDI ports. MPU-401 is currently the most common MIDI
+ interface. Most sound cards are compatible with it. However,
+ don't enable MPU401 mode blindly. Many cards with native support
+ in the driver have their own MPU401 driver. Enabling the standard one
+ will cause a conflict with these cards. So check if your card is
+ in the list of supported cards before enabling MPU401.
+
+Windows Sound System (MSS/WSS)
+ Even when Microsoft has discontinued their own Sound System card
+ they managed to make it a standard. MSS compatible cards are based on
+ a codec chip which is easily available from at least two manufacturers
+ (AD1848 by Analog Devices and CS4231/CS4248 by Crystal Semiconductor).
+ Currently most sound cards are based on one of the MSS compatible codec
+ chips. The CS4231 is used in the high quality cards such as GUS MAX,
+ MediaTrix AudioTrix Pro and TB Tropez (GUS MAX is not MSS compatible).
+
+ Having a AD1848, CS4248 or CS4231 codec chip on the card is a good
+ sign. Even if the card is not MSS compatible, it could be easy to write
+ support for it. Note also that most MSS compatible cards
+ require special boot time initialization which may not be present
+ in the driver. Also, some MSS compatible cards have native support.
+ Enabling the MSS support with these cards is likely to
+ cause a conflict. So check if your card is listed in this file before
+ enabling the MSS support.
+
+Yamaha FM synthesizers (OPL2, OPL3 (not OPL3-SA) and OPL4)
+ Most sound cards have a FM synthesizer chip. The OPL2 is a 2
+ operator chip used in the original AdLib card. Currently it's used
+ only in the cheapest (8 bit mono) cards. The OPL3 is a 4 operator
+ FM chip which provides better sound quality and/or more available
+ voices than the OPL2. The OPL4 is a new chip that has an OPL3 and
+ a wave table synthesizer packed onto the same chip. The driver supports
+ just the OPL3 mode directly. Most cards with an OPL4 (like
+ SM Wave and AudioTrix Pro) support the OPL4 mode using MPU401
+ emulation. Writing a native OPL4 support is difficult
+ since Yamaha doesn't give information about their sample ROM chip.
+
+ Enable the generic OPL2/OPL3 FM synthesizer support if your
+ card has a FM chip made by Yamaha. Don't enable it if your card
+ has a software (TRS) based FM emulator.
+
+ ----------------------------------------------------------------
+ NOTE! OPL3-SA is different chip than the ordinary OPL3. In addition
+ to the FM synth this chip has also digital audio (WSS) and
+ MIDI (MPU401) capabilities. Support for OPL3-SA is described below.
+ ----------------------------------------------------------------
+
+Yamaha OPL3-SA1
+
+ Yamaha OPL3-SA1 (YMF701) is an audio controller chip used on some
+ (Intel) motherboards and on cheap sound cards. It should not be
+ confused with the original OPL3 chip (YMF278) which is entirely
+ different chip. OPL3-SA1 has support for MSS, MPU401 and SB Pro
+ (not used in OSS/Free) in addition to the OPL3 FM synth.
+
+ There are also chips called OPL3-SA2, OPL3-SA3, ..., OPL3SA-N. They
+ are PnP chips and will not work with the OPL3-SA1 driver. You should
+ use the standard MSS, MPU401 and OPL3 options with these chips and to
+ activate the card using isapnptools.
+
+4Front Technologies SoftOSS
+
+ SoftOSS is a software based wave table emulation which works with
+ any 16 bit stereo sound card. Due to its nature a fast CPU is
+ required (P133 is minimum). Although SoftOSS does _not_ use MMX
+ instructions it has proven out that recent processors (which appear
+ to have MMX) perform significantly better with SoftOSS than earlier
+ ones. For example a P166MMX beats a PPro200. SoftOSS should not be used
+ on 486 or 386 machines.
+
+ The amount of CPU load caused by SoftOSS can be controlled by
+ selecting the CONFIG_SOFTOSS_RATE and CONFIG_SOFTOSS_VOICES
+ parameters properly (they will be prompted by make config). It's
+ recommended to set CONFIG_SOFTOSS_VOICES to 32. If you have a
+ P166MMX or faster (PPro200 is not faster) you can set
+ CONFIG_SOFTOSS_RATE to 44100 (kHz). However with slower systems it
+ recommended to use sampling rates around 22050 or even 16000 kHz.
+ Selecting too high values for these parameters may hang your
+ system when playing MIDI files with hight degree of polyphony
+ (number of concurrently playing notes). It's also possible to
+ decrease CONFIG_SOFTOSS_VOICES. This makes it possible to use
+ higher sampling rates. However using fewer voices decreases
+ playback quality more than decreasing the sampling rate.
+
+ SoftOSS keeps the samples loaded on the system's RAM so much RAM is
+ required. SoftOSS should never be used on machines with less than 16 MB
+ of RAM since this is potentially dangerous (you may accidentally run out
+ of memory which probably crashes the machine).
+
+ SoftOSS implements the wave table API originally designed for GUS. For
+ this reason all applications designed for GUS should work (at least
+ after minor modifications). For example gmod/xgmod and playmidi -g are
+ known to work.
+
+ To work SoftOSS will require GUS compatible
+ patch files to be installed on the system (in /dos/ultrasnd/midi). You
+ can use the public domain MIDIA patchset available from several ftp
+ sites.
+
+ *********************************************************************
+ IMPORTANT NOTICE! The original patch set distributed with the Gravis
+ Ultrasound card is not in public domain (even though it's available from
+ some FTP sites). You should contact Voice Crystal (www.voicecrystal.com)
+ if you like to use these patches with SoftOSS included in OSS/Free.
+ *********************************************************************
+
+PSS based cards (AD1848 + ADSP-2115 + Echo ESC614 ASIC)
+ Analog Devices and Echo Speech have together defined a sound card
+ architecture based on the above chips. The DSP chip is used
+ for emulation of SB Pro, FM and General MIDI/MT32.
+
+ There are several cards based on this architecture. The most known
+ ones are Orchid SW32 and Cardinal DSP16.
+
+ The driver supports downloading DSP algorithms to these cards.
+
+ NOTE! You will have to use the "old" config script when configuring
+ PSS cards.
+
+MediaTrix AudioTrix Pro
+ The ATP card is built around a CS4231 codec and an OPL4 synthesizer
+ chips. The OPL4 mode is supported by a microcontroller running a
+ General MIDI emulator. There is also a SB 1.5 compatible playback mode.
+
+Ensoniq SoundScape and compatibles
+ Ensoniq has designed a sound card architecture based on the
+ OTTO synthesizer chip used in their professional MIDI synthesizers.
+ Several companies (including Ensoniq, Reveal and Spea) are selling
+ cards based on this architecture.
+
+ NOTE! The SoundScape PnP is not supported by OSS/Free. Ensoniq VIVO and
+ VIVO90 cards are not compatible with Soundscapes so the Soundscape
+ driver will not work with them. You may want to use OSS/Linux with these
+ cards.
+
+OPTi MAD16 and Mozart based cards
+ The Mozart (OAK OTI-601), MAD16 (OPTi 82C928), MAD16 Pro (OPTi 82C929),
+ OPTi 82C924/82C925 (in _non_ PnP mode) and OPTi 82C930 interface
+ chips are used in many different sound cards, including some
+ cards by Reveal miro and Turtle Beach (Tropez). The purpose of these
+ chips is to connect other audio components to the PC bus. The
+ interface chip performs address decoding for the other chips.
+ NOTE! Tropez Plus is not MAD16 but CS4232 based.
+ NOTE! MAD16 PnP cards (82C924, 82C925, 82C931) are not MAD16 compatible
+ in the PnP mode. You will have to use them in MSS mode after having
+ initialized them using isapnptools or DOS. 82C931 probably requires
+ initialization using DOS/Windows (running isapnptools is not enough).
+ It's possible to use 82C931 with OSS/Free by jumpering it to non-PnP
+ mode (provided that the card has a jumper for this). In non-PnP mode
+ 82C931 is compatible with 82C930 and should work with the MAD16 driver
+ (without need to use isapnptools or DOS to initialize it). All OPTi
+ chips are supported by OSS/Linux (both in PnP and non-PnP modes).
+
+Audio Excel DSP16
+ Support for this card was written by Riccardo Faccetti
+ (riccardo@cdc8g5.cdc.polimi.it). The AEDSP16 driver included in
+ the lowlevel/ directory. To use it you should enable the
+ "Additional low level drivers" option.
+
+Crystal CS4232 and CS4236 based cards such as AcerMagic S23, TB Tropez _Plus_ and
+ many PC motherboards (Compaq, HP, Intel, ...)
+ CS4232 is a PnP multimedia chip which contains a CS3231A codec,
+ SB and MPU401 emulations. There is support for OPL3 too.
+ Unfortunately the MPU401 mode doesn't work (I don't know how to
+ initialize it). CS4236 is an enhanced (compatible) version of CS4232.
+ NOTE! Don't ever try to use isapnptools with CS4232 since this will just
+ freeze your machine (due to chip bugs). If you have problems in getting
+ CS4232 working you could try initializing it with DOS (CS4232C.EXE) and
+ then booting Linux using loadlin. CS4232C.EXE loads a secret firmware
+ patch which is not documented by Crystal.
+
+Turtle Beach Maui and Tropez "classic"
+ This driver version supports sample, patch and program loading commands
+ described in the Maui/Tropez User's manual.
+ There is now full initialization support too. The audio side of
+ the Tropez is based on the MAD16 chip (see above).
+ NOTE! Tropez Plus is different card than Tropez "classic" and will not
+ work fully in Linux. You can get audio features working by configuring
+ the card as a CS4232 based card (above).
+
+
+Jumpers and software configuration
+==================================
+
+Some of the earliest sound cards were jumper configurable. You have to
+configure the driver use I/O, IRQ and DMA settings
+that match the jumpers. Just few 8 bit cards are fully jumper
+configurable (SB 1.x/2.x, SB Pro and clones).
+Some cards made by Aztech have an EEPROM which contains the
+config info. These cards behave much like hardware jumpered cards.
+
+Most cards have jumper for the base I/O address but other parameters
+are software configurable. Sometimes there are few other jumpers too.
+
+Latest cards are fully software configurable or they are PnP ISA
+compatible. There are no jumpers on the board.
+
+The driver handles software configurable cards automatically. Just configure
+the driver to use I/O, IRQ and DMA settings which are known to work.
+You could usually use the same values than with DOS and/or Windows.
+Using different settings is possible but not recommended since it may cause
+some trouble (for example when warm booting from an OS to another or
+when installing new hardware to the machine).
+
+Sound driver sets the soft configurable parameters of the card automatically
+during boot. Usually you don't need to run any extra initialization
+programs when booting Linux but there are some exceptions. See the
+card-specific instructions below for more info.
+
+The drawback of software configuration is that the driver needs to know
+how the card must be initialized. It cannot initialize unknown cards
+even if they are otherwise compatible with some other cards (like SB,
+MPU401 or Windows Sound System).
+
+
+What if your card was not listed above?
+=======================================
+
+The first thing to do is to look at the major IC chips on the card.
+Many of the latest sound cards are based on some standard chips. If you
+are lucky, all of them could be supported by the driver. The most common ones
+are the OPTi MAD16, Mozart, SoundScape (Ensoniq) and the PSS architectures
+listed above. Also look at the end of this file for list of unsupported
+cards and the ones which could be supported later.
+
+The last resort is to send _exact_ name and model information of the card
+to me together with a list of the major IC chips (manufactured, model) to
+me. I could then try to check if your card looks like something familiar.
+
+There are many more cards in the world than listed above. The first thing to
+do with these cards is to check if they emulate some other card or interface
+such as SB, MSS and/or MPU401. In this case there is a chance to get the
+card to work by booting DOS before starting Linux (boot DOS, hit ctrl-alt-del
+and boot Linux without hard resetting the machine). In this method the
+DOS based driver initializes the hardware to use known I/O, IRQ and DMA
+settings. If sound driver is configured to use the same settings, everything
+should work OK.
+
+
+Configuring sound driver (with Linux)
+=====================================
+
+The sound driver is currently distributed as part of the Linux kernel. The
+files are in /usr/src/linux/drivers/sound/.
+
+****************************************************************************
+* ALWAYS USE THE SOUND DRIVER VERSION WHICH IS DISTRIBUTED WITH *
+* THE KERNEL SOURCE PACKAGE YOU ARE USING. SOME ALPHA AND BETA TEST *
+* VERSIONS CAN BE INSTALLED FROM A SEPARATELY DISTRIBUTED PACKAGE *
+* BUT CHECK THAT THE PACKAGE IS NOT MUCH OLDER (OR NEWER) THAN THE *
+* KERNEL YOU ARE USING. IT'S POSSIBLE THAT THE KERNEL/DRIVER *
+* INTERFACE CHANGES BETWEEN KERNEL RELEASES WHICH MAY CAUSE SOME *
+* INCOMPATIBILITY PROBLEMS. *
+* *
+* IN CASE YOU INSTALL A SEPARATELY DISTRIBUTED SOUND DRIVER VERSION, *
+* BE SURE TO REMOVE OR RENAME THE OLD SOUND DRIVER DIRECTORY BEFORE *
+* INSTALLING THE NEW ONE. LEAVING OLD FILES TO THE SOUND DRIVER *
+* DIRECTORY _WILL_ CAUSE PROBLEMS WHEN THE DRIVER IS USED OR *
+* COMPILED. *
+****************************************************************************
+
+To configure the driver, run "make config" in the kernel source directory
+(/usr/src/linux). Answer "y" or "m" to the question about Sound card support
+(after the questions about mouse, CD-ROM, ftape, etc. support). Questions
+about options for sound will then be asked.
+
+After configuring the kernel and sound driver and compile the kernel
+following instructions in the kernel README.
+
+The sound driver configuration dialog
+-------------------------------------
+
+Sound configuration starts by making some yes/no questions. Be careful
+when answering to these questions since answering y to a question may
+prevent some later ones from being asked. For example don't answer y to
+the first question (PAS16) if you don't really have a PAS16. Don't enable
+more cards than you really need since they just consume memory. Also
+some drivers (like MPU401) may conflict with your SCSI controller and
+prevent kernel from booting. If you card was in the list of supported
+cards (above), please look at the card specific config instructions
+(later in this file) before starting to configure. Some cards must be
+configured in way which is not obvious.
+
+So here is the beginning of the config dialog. Answer 'y' or 'n' to these
+questions. The default answer is shown so that (y/n) means 'y' by default and
+(n/y) means 'n'. To use the default value, just hit ENTER. But be careful
+since using the default _doesn't_ guarantee anything.
+
+Note also that all questions may not be asked. The configuration program
+may disable some questions depending on the earlier choices. It may also
+select some options automatically as well.
+
+ "ProAudioSpectrum 16 support",
+ - Answer 'y'_ONLY_ if you have a Pro Audio Spectrum _16_,
+ Pro Audio Studio 16 or Logitech SoundMan 16 (be sure that
+ you read the above list correctly). Don't answer 'y' if you
+ have some other card made by Media Vision or Logitech since they
+ are not PAS16 compatible.
+ NOTE! Since 3.5-beta10 you need to enable SB support (next question)
+ if you want to use the SB emulation of PAS16. It's also possible to
+ the emulation if you want to use a true SB card together with PAS16
+ (there is another question about this that is asked later).
+ "Sound Blaster support",
+ - Answer 'y' if you have an original SB card made by Creative Labs
+ or a full 100% hardware compatible clone (like Thunderboard or
+ SM Games). If your card was in the list of supported cards (above),
+ please look at the card specific instructions later in this file
+ before answering this question. For an unknown card you may answer
+ 'y' if the card claims to be SB compatible.
+ Enable this option also with PAS16 (changed since v3.5-beta9).
+
+ Don't enable SB if you have a MAD16 or Mozart compatible card.
+
+ "Generic OPL2/OPL3 FM synthesizer support",
+ - Answer 'y' if your card has a FM chip made by Yamaha (OPL2/OPL3/OPL4).
+ Answering 'y' is usually a safe and recommended choice. However some
+ cards may have software (TSR) FM emulation. Enabling FM support
+ with these cards may cause trouble. However I don't currently know
+ such cards.
+ "Gravis Ultrasound support",
+ - Answer 'y' if you have GUS or GUS MAX. Answer 'n' if you don't
+ have GUS since the GUS driver consumes much memory.
+ Currently I don't have experiences with the GUS ACE so I don't
+ know what to answer with it.
+ "MPU-401 support (NOT for SB16)",
+ - Be careful with this question. The MPU401 interface is supported
+ by almost any sound card today. However some natively supported cards
+ have their own driver for MPU401. Enabling the MPU401 option with
+ these cards will cause a conflict. Also enabling MPU401 on a system
+ that doesn't really have a MPU401 could cause some trouble. If your
+ card was in the list of supported cards (above), please look at
+ the card specific instructions later in this file.
+
+ In MOST cases this MPU401 driver should only be used with "true"
+ MIDI-only MPU401 professional cards. In most other cases there
+ is another way to get the MPU401 compatible interface of a
+ sound card to work.
+ Support for the MPU401 compatible MIDI port of SB16, ESS1688
+ and MV Jazz16 cards is included in the SB driver. Use it instead
+ of this separate MPU401 driver with these cards. As well
+ Soundscape, PSS and Maui drivers include their own MPU401
+ options.
+
+ It's safe to answer 'y' if you have a true MPU401 MIDI interface
+ card.
+ "6850 UART Midi support",
+ - It's safe to answer 'n' to this question in all cases. The 6850
+ UART interface is so rarely used.
+ "PSS (ECHO-ADI2111) support",
+ - Answer 'y' only if you have Orchid SW32, Cardinal DSP16 or some
+ other card based on the PSS chipset (AD1848 codec + ADSP-2115
+ DSP chip + Echo ESC614 ASIC CHIP).
+ "16 bit sampling option of GUS (_NOT_ GUS MAX)",
+ - Answer 'y' if you have installed the 16 bit sampling daughtercard
+ to your GUS. Answer 'n' if you have GUS MAX. Enabling this option
+ disables GUS MAX support.
+ "GUS MAX support",
+ - Answer 'y' only if you have a GUS MAX.
+ "Microsoft Sound System support",
+ - Again think carefully before answering 'y' to this question. It's
+ safe to answer 'y' in case you have the original Windows Sound
+ System card made by Microsoft or Aztech SG 16 Pro (or NX16 Pro).
+ Also you may answer 'y' in case your card was not listed earlier
+ in this file. For cards having native support in the driver, consult
+ the card specific instructions later in this file. Some drivers
+ have their own MSS support and enabling this option will cause a
+ conflict.
+ Note! The MSS driver permits configuring two DMA channels. This is a
+ "nonstandard" feature and works only with very few cards (if any).
+ In most cases the second DMA channel should be disabled or set to
+ the same channel than the first one. Trying to configure two separate
+ channels with cards that don't support this feature will prevent
+ audio (at least recording) from working.
+ "Ensoniq Soundscape support",
+ - Answer 'y' if you have a sound card based on the Ensoniq SoundScape
+ chipset. Such cards are being manufactured at least by Ensoniq,
+ Spea and Reveal (note that Reveal makes other cards also). The oldest
+ cards made by Spea don't work properly with Linux.
+ Soundscape PnP as well as Ensoniq VIVO work only with the commercial
+ OSS/Linux version.
+ "MediaTrix AudioTrix Pro support",
+ - Answer 'y' if you have the AudioTrix Pro.
+ "Support for MAD16 and/or Mozart based cards",
+ - Answer y if your card has a Mozart (OAK OTI-601) or MAD16
+ (OPTi 82C928, 82C929, 82C924/82C925 or 82C930) audio interface chip.
+ These chips are
+ currently quite common so it's possible that many no-name cards
+ have one of them. In addition the MAD16 chip is used in some
+ cards made by known manufacturers such as Turtle Beach (Tropez),
+ Reveal (some models) and Diamond (some recent models).
+ Note OPTi 82C924 and 82C925 are MAD16 compatible only in non PnP
+ mode (jumper selectable on many cards).
+ "Support for TB Maui"
+ - This enables TB Maui specific initialization. Works with TB Maui
+ and TB Tropez (may not work with Tropez Plus).
+
+
+Then the configuration program asks some y/n questions about the higher
+level services. It's recommended to answer 'y' to each of these questions.
+Answer 'n' only if you know you will not need the option.
+
+ "MIDI interface support",
+ - Answering 'n' disables /dev/midi## devices and access to any
+ MIDI ports using /dev/sequencer and /dev/music. This option
+ also affects any MPU401 and/or General MIDI compatible devices.
+ "FM synthesizer (YM3812/OPL-3) support",
+ - Answer 'y' here.
+ "/dev/sequencer support",
+ - Answering 'n' disables /dev/sequencer and /dev/music.
+
+Entering the I/O, IRQ and DMA config parameters
+-----------------------------------------------
+
+After the above questions the configuration program prompts for the
+card specific configuration information. Usually just a set of
+I/O address, IRQ and DMA numbers are asked. With some cards the program
+asks for some files to be used during initialization of the card. For example
+many cards have a DSP chip or microprocessor which must be initialized by
+downloading a program (microcode) file to the card.
+
+Instructions for answering these questions are given in the next section.
+
+
+Card specific information
+=========================
+
+This section gives additional instructions about configuring some cards.
+Please refer manual of your card for valid I/O, IRQ and DMA numbers. Using
+the same settings with DOS/Windows and Linux is recommended. Using
+different values could cause some problems when switching between
+different operating systems.
+
+Sound Blasters (the original ones by Creative)
+---------------------------------------------
+
+NOTE! Check if you have a PnP Sound Blaster (cards sold after summer 1995
+ are almost certainly PnP ones). With PnP cards you should use isapnptools
+ to activate them (see above).
+
+It's possible to configure these cards to use different I/O, IRQ and
+DMA settings. Since the possible/default settings have changed between various
+models, you have to consult manual of your card for the proper ones. It's
+a good idea to use the same values than with DOS/Windows. With SB and SB Pro
+it's the only choice. SB16 has software selectable IRQ and DMA channels but
+using different values with DOS and Linux is likely to cause troubles. The
+DOS driver is not able to reset the card properly after warm boot from Linux
+if Linux has used different IRQ or DMA values.
+
+The original (steam) Sound Blaster (versions 1.x and 2.x) use always
+DMA1. There is no way to change it.
+
+The SB16 needs two DMA channels. A 8 bit one (1 or 3) is required for
+8 bit operation and a 16 bit one (5, 6 or 7) for the 16 bit mode. In theory
+it's possible to use just one (8 bit) DMA channel by answering the 8 bit
+one when the configuration program asks for the 16 bit one. This may work
+in some systems but is likely to cause terrible noise on some other systems.
+
+It's possible to use two SB16/32/64 at the same time. To do this you should
+first configure OSS/Free for one card. Then edit local.h manually and define
+SB2_BASE, SB2_IRQ, SB2_DMA and SB2_DMA2 for the second one. You can't get
+the OPL3, MIDI and EMU8000 devices of the second card to work. If you are
+going to use two PnP Sound Blasters, ensure that they are of different model
+and have different PnP IDs. There is no way to get two cards with the same
+card ID and serial number to work. The easiest way to check this is trying
+if isapnptools can see both cards or just one.
+
+NOTE! Don't enable the SM Games option (asked by the configuration program)
+ if you are not 101% sure that your card is a Logitech Soundman Games
+ (not a SM Wave or SM16).
+
+SB Clones
+---------
+
+First of all: There are no SB16 clones. There are SB Pro clones with a
+16 bit mode which is not SB16 compatible. The most likely alternative is that
+the 16 bit mode means MSS/WSS.
+
+There are just a few fully 100% hardware SB or SB Pro compatible cards.
+I know just Thunderboard and SM Games. Other cards require some kind of
+hardware initialization before they become SB compatible. Check if your card
+was listed in the beginning of this file. In this case you should follow
+instructions for your card later in this file.
+
+For other not fully SB clones you may try initialization using DOS in
+the following way:
+
+ - Boot DOS so that the card specific driver gets run.
+ - Hit ctrl-alt-del (or use loadlin) to boot Linux. Don't
+ switch off power or press the reset button.
+ - If you use the same I/O, IRQ and DMA settings in Linux, the
+ card should work.
+
+If your card is both SB and MSS compatible, I recommend using the MSS mode.
+Most cards of this kind are not able to work in the SB and the MSS mode
+simultaneously. Using the MSS mode provides 16 bit recording and playback.
+
+ProAudioSpectrum 16 and compatibles
+-----------------------------------
+
+PAS16 has a SB emulation chip which can be used together with the native
+(16 bit) mode of the card. To enable this emulation you should configure
+the driver to have SB support too (this has been changed since version
+3.5-beta9 of this driver).
+
+With current driver versions it's also possible to use PAS16 together with
+another SB compatible card. In this case you should configure SB support
+for the other card and to disable the SB emulation of PAS16 (there is a
+separate questions about this).
+
+With PAS16 you can use two audio device files at the same time. /dev/dsp (and
+/dev/audio) is connected to the 8/16 bit native codec and the /dev/dsp1 (and
+/dev/audio1) is connected to the SB emulation (8 bit mono only).
+
+Gravis Ultrasound
+-----------------
+
+There are many different revisions of the Ultrasound card (GUS). The
+earliest ones (pre 3.7) don't have a hardware mixer. With these cards
+the driver uses a software emulation for synth and pcm playbacks. It's
+also possible to switch some of the inputs (line in, mic) off by setting
+mixer volume of the channel level below 10%. For recording you have
+to select the channel as a recording source and to use volume above 10%.
+
+GUS 3.7 has a hardware mixer.
+
+GUS MAX and the 16 bit sampling daughtercard have a CS4231 codec chip which
+also contains a mixer.
+
+Configuring GUS is simple. Just enable the GUS support and GUS MAX or
+the 16 bit daughtercard if you have them. Note that enabling the daughter
+card disables GUS MAX driver.
+
+NOTE for owners of the 16 bit daughtercard: By default the daughtercard
+uses /dev/dsp (and /dev/audio). Command "ln -sf /dev/dsp1 /dev/dsp"
+selects the daughter card as the default device.
+
+With just the standard GUS enabled the configuration program prompts
+for the I/O, IRQ and DMA numbers for the card. Use the same values than
+with DOS.
+
+With the daughter card option enabled you will be prompted for the I/O,
+IRQ and DMA numbers for the daughter card. You have to use different I/O
+and DMA values than for the standard GUS. The daughter card permits
+simultaneous recording and playback. Use /dev/dsp (the daughtercard) for
+recording and /dev/dsp1 (GUS GF1) for playback.
+
+GUS MAX uses the same I/O address and IRQ settings than the original GUS
+(GUS MAX = GUS + a CS4231 codec). In addition an extra DMA channel may be used.
+Using two DMA channels permits simultaneous playback using two devices
+(dev/dsp0 and /dev/dsp1). The second DMA channel is required for
+full duplex audio.
+To enable the second DMA channels, give a valid DMA channel when the config
+program asks for the GUS MAX DMA (entering -1 disables the second DMA).
+Using 16 bit DMA channels (5,6 or 7) is recommended.
+
+If you have problems in recording with GUS MAX, you could try to use
+just one 8 bit DMA channel. Recording will not work with one DMA
+channel if it's a 16 bit one.
+
+Microphone input of GUS MAX is connected to mixer in little bit nonstandard
+way. There is actually two microphone volume controls. Normal "mic" controls
+only recording level. Mixer control "speaker" is used to control volume of
+microphone signal connected directly to line/speaker out. So just decrease
+volume of "speaker" if you have problems with microphone feedback.
+
+GUS ACE works too but any attempt to record or to use the MIDI port
+will fail.
+
+GUS PnP (with RAM) is partially supported but it needs to be initialized using
+DOS or isapnptools before starting the driver.
+
+MPU401 and Windows Sound System
+-------------------------------
+
+Again. Don't enable these options in case your card is listed
+somewhere else in this file.
+
+Configuring these cards is obvious (or it should be). With MSS
+you should probably enable the OPL3 synth also since
+most MSS compatible cards have it. However check that this is true
+before enabling OPL3.
+
+Sound driver supports more than one MPU401 compatible cards at the same time
+but the config program asks config info for just the first of them.
+Adding the second or third MPU interfaces must be done manually by
+editing sound/local.h (after running the config program). Add defines for
+MPU2_BASE & MPU2_IRQ (and MPU3_BASE & MPU3_IRQ) to the file.
+
+CAUTION!
+
+The default I/O base of Adaptec AHA-1542 SCSI controller is 0x330 which
+is also the default of the MPU401 driver. Don't configure the sound driver to
+use 0x330 as the MPU401 base if you have a AHA1542. The kernel will not boot
+if you make this mistake.
+
+PSS
+---
+
+Even the PSS cards are compatible with SB, MSS and MPU401, you must not
+enable these options when configuring the driver. The configuration
+program handles these options itself. (You may use the SB, MPU and MSS options
+together with PSS if you have another card on the system).
+
+The PSS driver enables MSS and MPU401 modes of the card. SB is not enabled
+since it doesn't work concurrently with MSS. The driver loads also a
+DSP algorithm which is used to for the general MIDI emulation. The
+algorithm file (.ld) is read by the config program and written to a
+file included when the pss.c is compiled. For this reason the config
+program asks if you want to download the file. Use the genmidi.ld file
+distributed with the DOS/Windows drivers of the card (don't use the mt32.ld).
+With some cards the file is called 'synth.ld'. You must have access to
+the file when configuring the driver. The easiest way is to mount the DOS
+partition containing the file with Linux.
+
+It's possible to load your own DSP algorithms and run them with the card.
+Look at the directory pss_test of snd-util-3.0.tar.gz for more info.
+
+AudioTrix Pro
+-------------
+
+You have to enable the OPL3 and SB (not SB Pro or SB16) drivers in addition
+to the native AudioTrix driver. Don't enable MSS or MPU drivers.
+
+Configuring ATP is little bit tricky since it uses so many I/O, IRQ and
+DMA numbers. Using the same values than with DOS/Win is a good idea. Don't
+attempt to use the same IRQ or DMA channels twice.
+
+The SB mode of ATP is implemented so the ATP driver just enables SB
+in the proper address. The SB driver handles the rest. You have to configure
+both the SB driver and the SB mode of ATP to use the same IRQ, DMA and I/O
+settings.
+
+Also the ATP has a microcontroller for the General MIDI emulation (OPL4).
+For this reason the driver asks for the name of a file containing the
+microcode (TRXPRO.HEX). This file is usually located in the directory
+where the DOS drivers were installed. You must have access to this file
+when configuring the driver.
+
+If you have the effects daughtercard, it must be initialized by running
+the setfx program of snd-util-3.0.tar.gz package. This step is not required
+when using the (future) binary distribution version of the driver.
+
+Ensoniq SoundScape
+------------------
+
+NOTE! The new PnP SoundScape is not supported yet. Soundscape compatible
+ cards made by Reveal don't work with Linux. They use older revision
+ of the Soundscape chipset which is not fully compatible with
+ newer cards made by Ensoniq.
+
+The SoundScape driver handles initialization of MSS and MPU supports
+itself so you don't need to enable other drivers than SoundScape
+(enable also the /dev/dsp, /dev/sequencer and MIDI supports).
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!!!!! !!!!
+!!!!! NOTE! Before version 3.5-beta6 there WERE two sets of audio !!!!
+!!!!! device files (/dev/dsp0 and /dev/dsp1). The first one WAS !!!!
+!!!!! used only for card initialization and the second for audio !!!!
+!!!!! purposes. It WAS required to change /dev/dsp (a symlink) to !!!!
+!!!!! point to /dev/dsp1. !!!!
+!!!!! !!!!
+!!!!! This is not required with OSS versions 3.5-beta6 and later !!!!
+!!!!! since there is now just one audio device file. Please !!!!
+!!!!! change /dev/dsp to point back to /dev/dsp0 if you are !!!!
+!!!!! upgrading from an earlier driver version using !!!!
+!!!!! (cd /dev;rm dsp;ln -s dsp0 dsp). !!!!
+!!!!! !!!!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+The configuration program asks one DMA channel and two interrupts. One IRQ
+and one DMA is used by the MSS codec. The second IRQ is required for the
+MPU401 mode (you have to use different IRQs for both purposes).
+There were earlier two DMA channels for SoundScape but the current driver
+version requires just one.
+
+The SoundScape card has a Motorola microcontroller which must initialized
+_after_ boot (the driver doesn't initialize it during boot).
+The initialization is done by running the 'ssinit' program which is
+distributed in the snd-util-3.0.tar.gz package. You have to edit two
+defines in the ssinit.c and then compile the program. You may run ssinit
+manually (after each boot) or add it to /etc/rc.d/rc.local.
+
+The ssinit program needs the microcode file that comes with the DOS/Windows
+driver of the card. You will need to use version 1.30.00 or later
+of the microcode file (sndscape.co0 or sndscape.co1 depending on
+your card model). THE OLD sndscape.cod WILL NOT WORK. IT WILL HANG YOUR
+MACHINE. The only way to get the new microcode file is to download
+and install the DOS/Windows driver from ftp://ftp.ensoniq.com/pub.
+
+Then you have to select the proper microcode file to use: soundscape.co0
+is the right one for most cards and sndscape.co1 is for few (older) cards
+made by Reveal and/or Spea. The driver has capability to detect the card
+version during boot. Look at the boot log messages in /var/adm/messages
+and locate the sound driver initialization message for the SoundScape
+card. If the driver displays string <Ensoniq Soundscape (old)>, you have
+an old card and you will need to use sndscape.co1. For other cards use
+soundscape.co0. New Soundscape revisions such as Elite and PnP use
+code files with higher numbers (.co2, .co3, etc.).
+
+NOTE! Ensoniq Soundscape VIVO is not compatible with other Soundscape cards.
+ Currently it's possible to use it in Linux only with OSS/Linux
+ drivers.
+
+Check /var/adm/messages after running ssinit. The driver prints
+the board version after downloading the microcode file. That version
+number must match the number in the name of the microcode file (extension).
+
+Running ssinit with a wrong version of the sndscape.co? file is not
+dangerous as long as you don't try to use a file called sndscape.cod.
+If you have initialized the card using a wrong microcode file (sounds
+are terrible), just modify ssinit.c to use another microcode file and try
+again. It's possible to use an earlier version of sndscape.co[01] but it
+may sound weird.
+
+MAD16 (Pro) and Mozart
+----------------------
+
+You need to enable just the MAD16 /Mozart support when configuring
+the driver. _Don't_ enable SB, MPU401 or MSS. However you will need the
+/dev/audio, /dev/sequencer and MIDI supports.
+
+Mozart and OPTi 82C928 (the original MAD16) chips don't support
+MPU401 mode so enter just 0 when the configuration program asks the
+MPU/MIDI I/O base. The MAD16 Pro (OPTi 82C929) and 82C930 chips have MPU401
+mode.
+
+TB Tropez is based on the 82C929 chip. It has two MIDI ports.
+The one connected to the MAD16 chip is the second one (there is a second
+MIDI connector/pins somewhere??). If you have not connected the second MIDI
+port, just disable the MIDI port of MAD16. The 'Maui' compatible synth of
+Tropez is jumper configurable and not connected to the MAD16 chip (the
+Maui driver can be used with it).
+
+Some MAD16 based cards may cause feedback, whistle or terrible noise if the
+line3 mixer channel is turned too high. This happens at least with Shuttle
+Sound System. Current driver versions set volume of line3 low enough so
+this should not be a problem.
+
+If you have a MAD16 card which have an OPL4 (FM + Wave table) synthesizer
+chip (_not_ an OPL3), you have to append a line containing #define MAD16_OPL4
+to the file linux/drivers/sound/local.h (after running make config).
+
+MAD16 cards having a CS4231 codec support full duplex mode. This mode
+can be enabled by configuring the card to use two DMA channels. Possible
+DMA channel pairs are: 0&1, 1&0 and 3&0.
+
+NOTE! Cards having an OPTi 82C924/82C925 chip work with OSS/Free only in
+non-PnP mode (usually jumper selectable). The PnP mode is supported only
+by OSS/Linux.
+
+MV Jazz (ProSonic)
+------------------
+
+The Jazz16 driver is just a hack made to the SB Pro driver. However it works
+fairly well. You have to enable SB, SB Pro (_not_ SB16) and MPU401 supports
+when configuring the driver. The configuration program asks later if you
+want support for MV Jazz16 based cards (after asking SB base address). Answer
+'y' here and the driver asks the second (16 bit) DMA channel.
+
+The Jazz16 driver uses the MPU401 driver in a way which will cause
+problems if you have another MPU401 compatible card. In this case you must
+give address of the Jazz16 based MPU401 interface when the config
+program prompts for the MPU401 information. Then look at the MPU401
+specific section for instructions about configuring more than one MPU401 cards.
+
+Logitech Soundman Wave
+----------------------
+
+Read the above MV Jazz specific instructions first.
+
+The Logitech SoundMan Wave (don't confuse this with the SM16 or SM Games) is
+a MV Jazz based card which has an additional OPL4 based wave table
+synthesizer. The OPL4 chip is handled by an on board microcontroller
+which must be initialized during boot. The config program asks if
+you have a SM Wave immediately after asking the second DMA channel of jazz16.
+If you answer 'y', the config program will ask name of the file containing
+code to be loaded to the microcontroller. The file is usually called
+MIDI0001.BIN and it's located in the DOS/Windows driver directory. The file
+may also be called as TSUNAMI.BIN or something else (older cards?).
+
+The OPL4 synth will be inaccessible without loading the microcontroller code.
+
+Also remember to enable SB MPU401 support if you want to use the OPL4 mode.
+(Don't enable the 'normal' MPU401 device as with some earlier driver
+versions (pre 3.5-alpha8)).
+
+NOTE! Don't answer 'y' when the driver asks about SM Games support
+ (the next question after the MIDI0001.BIN name). However
+ answering 'y' doesn't cause damage your computer so don't panic.
+
+Sound Galaxies
+--------------
+
+There are many different Sound Galaxy cards made by Aztech. The 8 bit
+ones are fully SB or SB Pro compatible and there should be no problems
+with them.
+
+The older 16 bit cards (SG Pro16, SG NX Pro16, Nova and Lyra) have
+an EEPROM chip for storing the configuration data. There is a microcontroller
+which initializes the card to match the EEPROM settings when the machine
+is powered on. These cards actually behave just like they have jumpers
+for all of the settings. Configure driver for MSS, MPU, SB/SB Pro and OPL3
+supports with these cards.
+
+There are some new Sound Galaxies in the market. I have no experience with
+them so read the card's manual carefully.
+
+ESS ES1688 and ES688 'AudioDrive' based cards
+---------------------------------------------
+
+Support for these two ESS chips is embedded in the SB driver.
+Configure these cards just like SB. Enable the 'SB MPU401 MIDI port'
+if you want to use MIDI features of ES1688. ES688 doesn't have MPU mode
+so you don't need to enable it (the driver uses normal SB MIDI automatically
+with ES688).
+
+NOTE! ESS cards are not compatible with MSS/WSS so don't worry if MSS support
+of OSS doesn't work with it.
+
+There are some ES1688/688 based sound cards and (particularly) motherboards
+which use software configurable I/O port relocation feature of the chip.
+This ESS proprietary feature is supported only by OSS/Linux.
+
+There are ES1688 based cards which use different interrupt pin assignment than
+recommended by ESS (5, 7, 9/2 and 10). In this case all IRQs don't work.
+At least a card called (Pearl?) Hypersound 16 supports IRQ 15 but it doesn't
+work.
+
+ES1868 is a PnP chip which is (supposed to be) compatible with ESS1688
+probably works with OSS/Free after initialization using isapnptools.
+
+Reveal cards
+------------
+
+There are several different cards made/marketed by Reveal. Some of them
+are compatible with SoundScape and some use the MAD16 chip. You may have
+to look at the card and try to identify its origin.
+
+Diamond
+-------
+
+The oldest (Sierra Aria based) sound cards made by Diamond are not supported
+(they may work if the card is initialized using DOS). The recent (LX?)
+models are based on the MAD16 chip which is supported by the driver.
+
+Audio Excel DSP16
+-----------------
+
+Support for this card is currently not functional. A new driver for it
+should be available later this year.
+
+PCMCIA cards
+------------
+
+Sorry, can't help. Some cards may work and some don't.
+
+TI TM4000M notebooks
+--------------------
+
+These computers have a built in sound support based on the Jazz chipset.
+Look at the instructions for MV Jazz (above). It's also important to note
+that there is something wrong with the mouse port and sound at least on
+some TM models. Don't enable the "C&T 82C710 mouse port support" when
+configuring Linux. Having it enabled is likely to cause mysterious problems
+and kernel failures when sound is used.
+
+miroSOUND
+---------
+
+The miroSOUND PCM1-pro, PCM12 and PCM20 radio has been used
+successfully. These cards are based on the MAD16, OPL4, and CS4231A chips
+and everything said in the section about MAD16 cards applies here,
+too. The only major difference between the PCMxx and other MAD16 cards
+is that instead of the mixer in the CS4231 codec a separate mixer
+controlled by an on-board 80C32 microcontroller is used. Control of
+the mixer takes place via the ACI (miro's audio control interface)
+protocol that is implemented in a separate lowlevel driver. Make sure
+you compile this ACI driver together with the normal MAD16 support
+when you use a miroSOUND PCMxx card. The ACI mixer is controlled by
+/dev/mixer and the CS4231 mixer by /dev/mixer1 (depends on load
+time). Only in special cases you want to change something regularly on
+the CS4231 mixer.
+
+The miroSOUND PCM12 and PCM20 radio is capable of full duplex
+operation (simultaneous PCM replay and recording), which allows you to
+implement nice real-time signal processing audio effect software and
+network telephones. The ACI mixer has to be switched into the "solo"
+mode for duplex operation in order to avoid feedback caused by the
+mixer (input hears output signal). You can de-/activate this mode
+through toggleing the record button for the wave controller with an
+OSS-mixer.
+
+The PCM20 contains a radio tuner, which is also controlled by
+ACI. This radio tuner is supported by the ACI driver together with the
+miropcm20.o module. Also the 7-band equalizer is integrated
+(limited by the OSS-design). Developement has started and maybe
+finished for the RDS decoder on this card, too. You will be able to
+read RadioText, the Programme Service name, Programme TYpe and
+others. Even the v4l radio module benefits from it with a refined
+strength value. See aci.[ch] and miropcm20*.[ch] for more details.
+
+The following configuration parameters have worked fine for the PCM12
+in Markus Kuhn's system, many other configurations might work, too:
+CONFIG_MAD16_BASE=0x530, CONFIG_MAD16_IRQ=11, CONFIG_MAD16_DMA=3,
+CONFIG_MAD16_DMA2=0, CONFIG_MAD16_MPU_BASE=0x330, CONFIG_MAD16_MPU_IRQ=10,
+DSP_BUFFSIZE=65536, SELECTED_SOUND_OPTIONS=0x00281000.
+
+Bas van der Linden is using his PCM1-pro with a configuration that
+differs in: CONFIG_MAD16_IRQ=7, CONFIG_MAD16_DMA=1, CONFIG_MAD16_MPU_IRQ=9
+
+Compaq Deskpro XL
+-----------------
+
+The builtin sound hardware of Compaq Deskpro XL is now supported.
+You need to configure the driver with MSS and OPL3 supports enabled.
+In addition you need to manually edit linux/drivers/sound/local.h and
+to add a line containing "#define DESKPROXL" if you used
+make menuconfig/xconfig.
+
+Others?
+-------
+
+Since there are so many different sound cards, it's likely that I have
+forgotten to mention many of them. Please inform me if you know yet another
+card which works with Linux, please inform me (or is anybody else
+willing to maintain a database of supported cards (just like in XF86)?).
+
+Cards not supported yet
+=======================
+
+Please check the version of sound driver you are using before
+complaining that your card is not supported. It's possible you are
+using a driver version which was released months before your card was
+introduced.
+
+First of all, there is an easy way to make most sound cards work with Linux.
+Just use the DOS based driver to initialize the card to a known state, then use
+loadlin.exe to boot Linux. If Linux is configured to use the same I/O, IRQ and
+DMA numbers as DOS, the card could work.
+(ctrl-alt-del can be used in place of loadlin.exe but it doesn't work with
+new motherboards). This method works also with all/most PnP sound cards.
+
+Don't get fooled with SB compatibility. Most cards are compatible with
+SB but that may require a TSR which is not possible with Linux. If
+the card is compatible with MSS, it's a better choice. Some cards
+don't work in the SB and MSS modes at the same time.
+
+Then there are cards which are no longer manufactured and/or which
+are relatively rarely used (such as the 8 bit ProAudioSpectrum
+models). It's extremely unlikely that such cards ever get supported.
+Adding support for a new card requires much work and increases time
+required in maintaining the driver (some changes need to be done
+to all low level drivers and be tested too, maybe with multiple
+operating systems). For this reason I have made a decision to not support
+obsolete cards. It's possible that someone else makes a separately
+distributed driver (diffs) for the card.
+
+Writing a driver for a new card is not possible if there are no
+programming information available about the card. If you don't
+find your new card from this file, look from the home page
+(http://www.opensound.com/ossfree). Then please contact
+manufacturer of the card and ask if they have (or are willing to)
+released technical details of the card. Do this before contacting me. I
+can only answer 'no' if there are no programming information available.
+
+I have made decision to not accept code based on reverse engineering
+to the driver. There are three main reasons: First I don't want to break
+relationships to sound card manufacturers. The second reason is that
+maintaining and supporting a driver without any specs will be a pain.
+The third reason is that companies have freedom to refuse selling their
+products to other than Windows users.
+
+Some companies don't give low level technical information about their
+products to public or at least their require signing a NDA. It's not
+possible to implement a freeware driver for them. However it's possible
+that support for such cards become available in the commercial version
+of this driver (see http://www.4Front-tech.com/oss.html for more info).
+
+There are some common audio chipsets that are not supported yet. For example
+Sierra Aria and IBM Mwave. It's possible that these architectures
+get some support in future but I can't make any promises. Just look
+at the home page (http://www.opensound.com/ossfree/new_cards.html)
+for latest info.
+
+Information about unsupported sound cards and chipsets is welcome as well
+as free copies of sound cards, SDKs and operating systems.
+
+If you have any corrections and/or comments, please contact me.
+
+Hannu Savolainen
+hannu@opensound.com
+
+Personal home page: http://www.compusonic.fi/~hannu
+home page of OSS/Free: http://www.opensound.com/ossfree
+
+home page of commercial OSS
+(Open Sound System) drivers: http://www.opensound.com/oss.html
diff --git a/Documentation/sound/oss/README.modules b/Documentation/sound/oss/README.modules
new file mode 100644
index 0000000..e691d74
--- /dev/null
+++ b/Documentation/sound/oss/README.modules
@@ -0,0 +1,106 @@
+Building a modular sound driver
+================================
+
+ The following information is current as of linux-2.1.85. Check the other
+readme files, especially README.OSS, for information not specific to
+making sound modular.
+
+ First, configure your kernel. This is an idea of what you should be
+setting in the sound section:
+
+<M> Sound card support
+
+<M> 100% Sound Blaster compatibles (SB16/32/64, ESS, Jazz16) support
+
+ I have SoundBlaster. Select your card from the list.
+
+<M> Generic OPL2/OPL3 FM synthesizer support
+<M> FM synthesizer (YM3812/OPL-3) support
+
+ If you don't set these, you will probably find you can play .wav files
+but not .midi. As the help for them says, set them unless you know your
+card does not use one of these chips for FM support.
+
+ Once you are configured, make zlilo, modules, modules_install; reboot.
+Note that it is no longer necessary or possible to configure sound in the
+drivers/sound dir. Now one simply configures and makes one's kernel and
+modules in the usual way.
+
+ Then, add to your /etc/modprobe.conf something like:
+
+alias char-major-14-* sb
+install sb /sbin/modprobe -i sb && /sbin/modprobe adlib_card
+options sb io=0x220 irq=7 dma=1 dma16=5 mpu_io=0x330
+options adlib_card io=0x388 # FM synthesizer
+
+ Alternatively, if you have compiled in kernel level ISAPnP support:
+
+alias char-major-14 sb
+post-install sb /sbin/modprobe "-k" "adlib_card"
+options adlib_card io=0x388
+
+ The effect of this is that the sound driver and all necessary bits and
+pieces autoload on demand, assuming you use kerneld (a sound choice) and
+autoclean when not in use. Also, options for the device drivers are
+set. They will not work without them. Change as appropriate for your card.
+If you are not yet using the very cool kerneld, you will have to "modprobe
+-k sb" yourself to get things going. Eventually things may be fixed so
+that this kludgery is not necessary; for the time being, it seems to work
+well.
+
+ Replace 'sb' with the driver for your card, and give it the right
+options. To find the filename of the driver, look in
+/lib/modules/<kernel-version>/misc. Mine looks like:
+
+adlib_card.o # This is the generic OPLx driver
+opl3.o # The OPL3 driver
+sb.o # <<The SoundBlaster driver. Yours may differ.>>
+sound.o # The sound driver
+uart401.o # Used by sb, maybe other cards
+
+ Whichever card you have, try feeding it the options that would be the
+default if you were making the driver wired, not as modules. You can
+look at function referred to by module_init() for the card to see what
+args are expected.
+
+ Note that at present there is no way to configure the io, irq and other
+parameters for the modular drivers as one does for the wired drivers.. One
+needs to pass the modules the necessary parameters as arguments, either
+with /etc/modprobe.conf or with command-line args to modprobe, e.g.
+
+modprobe sb io=0x220 irq=7 dma=1 dma16=5 mpu_io=0x330
+modprobe adlib_card io=0x388
+
+ recommend using /etc/modprobe.conf.
+
+Persistent DMA Buffers:
+
+The sound modules normally allocate DMA buffers during open() and
+deallocate them during close(). Linux can often have problems allocating
+DMA buffers for ISA cards on machines with more than 16MB RAM. This is
+because ISA DMA buffers must exist below the 16MB boundary and it is quite
+possible that we can't find a large enough free block in this region after
+the machine has been running for any amount of time. The way to avoid this
+problem is to allocate the DMA buffers during module load and deallocate
+them when the module is unloaded. For this to be effective we need to load
+the sound modules right after the kernel boots, either manually or by an
+init script, and keep them around until we shut down. This is a little
+wasteful of RAM, but it guarantees that sound always works.
+
+To make the sound driver use persistent DMA buffers we need to pass the
+sound.o module a "dmabuf=1" command-line argument. This is normally done
+in /etc/modprobe.conf like so:
+
+options sound dmabuf=1
+
+If you have 16MB or less RAM or a PCI sound card, this is wasteful and
+unnecessary. It is possible that machine with 16MB or less RAM will find
+this option useful, but if your machine is so memory-starved that it
+cannot find a 64K block free, you will be wasting even more RAM by keeping
+the sound modules loaded and the DMA buffers allocated when they are not
+needed. The proper solution is to upgrade your RAM. But you do also have
+this improper solution as well. Use it wisely.
+
+ I'm afraid I know nothing about anything but my setup, being more of a
+text-mode guy anyway. If you have options for other cards or other helpful
+hints, send them to me, Jim Bray, jb@as220.org, http://as220.org/jb.
diff --git a/Documentation/sound/oss/README.ymfsb b/Documentation/sound/oss/README.ymfsb
new file mode 100644
index 0000000..af8a7d3
--- /dev/null
+++ b/Documentation/sound/oss/README.ymfsb
@@ -0,0 +1,107 @@
+Legacy audio driver for YMF7xx PCI cards.
+
+
+FIRST OF ALL
+============
+
+ This code references YAMAHA's sample codes and data sheets.
+ I respect and thank for all people they made open the informations
+ about YMF7xx cards.
+
+ And this codes heavily based on Jeff Garzik <jgarzik@pobox.com>'s
+ old VIA 82Cxxx driver (via82cxxx.c). I also respect him.
+
+
+DISCLIMER
+=========
+
+ This driver is currently at early ALPHA stage. It may cause serious
+ damage to your computer when used.
+ PLEASE USE IT AT YOUR OWN RISK.
+
+
+ABOUT THIS DRIVER
+=================
+
+ This code enables you to use your YMF724[A-F], YMF740[A-C], YMF744, YMF754
+ cards. When enabled, your card acts as "SoundBlaster Pro" compatible card.
+ It can only play 22.05kHz / 8bit / Stereo samples, control external MIDI
+ port.
+ If you want to use your card as recent "16-bit" card, you should use
+ Alsa or OSS/Linux driver. Of course you can write native PCI driver for
+ your cards :)
+
+
+USAGE
+=====
+
+ # modprobe ymfsb (options)
+
+
+OPTIONS FOR MODULE
+==================
+
+ io : SB base address (0x220, 0x240, 0x260, 0x280)
+ synth_io : OPL3 base address (0x388, 0x398, 0x3a0, 0x3a8)
+ dma : DMA number (0,1,3)
+ master_volume: AC'97 PCM out Vol (0-100)
+ spdif_out : SPDIF-out flag (0:disable 1:enable)
+
+ These options will change in future...
+
+
+FREQUENCY
+=========
+
+ When playing sounds via this driver, you will hear its pitch is slightly
+ lower than original sounds. Since this driver recognizes your card acts
+ with 21.739kHz sample rates rather than 22.050kHz (I think it must be
+ hardware restriction). So many players become tone deafness.
+ To prevent this, you should express some options to your sound player
+ that specify correct sample frequency. For example, to play your MP3 file
+ correctly with mpg123, specify the frequency like following:
+
+ % mpg123 -r 21739 foo.mp3
+
+
+SPDIF OUT
+=========
+
+ With installing modules with option 'spdif_out=1', you can enjoy your
+ sounds from SPDIF-out of your card (if it had).
+ Its Fs is fixed to 48kHz (It never means the sample frequency become
+ up to 48kHz. All sounds via SPDIF-out also 22kHz samples). So your
+ digital-in capable components has to be able to handle 48kHz Fs.
+
+
+COPYING
+=======
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+TODO
+====
+ * support for multiple cards
+ (set the different SB_IO,MPU_IO,OPL_IO for each cards)
+
+ * support for OPL (dmfm) : There will be no requirements... :-<
+
+
+AUTHOR
+======
+
+ Daisuke Nagano <breeze.nagano@nifty.ne.jp>
+
diff --git a/Documentation/sound/oss/SoundPro b/Documentation/sound/oss/SoundPro
new file mode 100644
index 0000000..9d4db1f
--- /dev/null
+++ b/Documentation/sound/oss/SoundPro
@@ -0,0 +1,105 @@
+Documentation for the SoundPro CMI8330 extensions in the WSS driver (ad1848.o)
+------------------------------------------------------------------------------
+
+( Be sure to read Documentation/sound/oss/CMI8330 too )
+
+Ion Badulescu, ionut@cs.columbia.edu
+February 24, 1999
+
+(derived from the OPL3-SA2 documentation by Scott Murray)
+
+The SoundPro CMI8330 (ISA) is a chip usually found on some Taiwanese
+motherboards. The official name in the documentation is CMI8330, SoundPro
+is the nickname and the big inscription on the chip itself.
+
+The chip emulates a WSS as well as a SB16, but it has certain differences
+in the mixer section which require separate support. It also emulates an
+MPU401 and an OPL3 synthesizer, so you probably want to enable support
+for these, too.
+
+The chip identifies itself as an AD1848, but its mixer is significantly
+more advanced than the original AD1848 one. If your system works with
+either WSS or SB16 and you are having problems with some mixer controls
+(no CD audio, no line-in, etc), you might want to give this driver a try.
+Detection should work, but it hasn't been widely tested, so it might still
+mis-identify the chip. You can still force soundpro=1 in the modprobe
+parameters for ad1848. Please let me know if it happens to you, so I can
+adjust the detection routine.
+
+The chip is capable of doing full-duplex, but since the driver sees it as an
+AD1848, it cannot take advantage of this. Moreover, the full-duplex mode is
+not achievable through the WSS interface, b/c it needs a dma16 line which is
+assigned only to the SB16 subdevice (with isapnp). Windows documentation
+says the user must use WSS Playback and SB16 Recording for full-duplex, so
+it might be possible to do the same thing under Linux. You can try loading
+up both ad1848 and sb then use one for playback and the other for
+recording. I don't know if this works, b/c I haven't tested it. Anyway, if
+you try it, be very careful: the SB16 mixer *mostly* works, but certain
+settings can have unexpected effects. Use the WSS mixer for best results.
+
+There is also a PCI SoundPro chip. I have not seen this chip, so I have
+no idea if the driver will work with it. I suspect it won't.
+
+As with PnP cards, some configuration is required. There are two ways
+of doing this. The most common is to use the isapnptools package to
+initialize the card, and use the kernel module form of the sound
+subsystem and sound drivers. Alternatively, some BIOS's allow manual
+configuration of installed PnP devices in a BIOS menu, which should
+allow using the non-modular sound drivers, i.e. built into the kernel.
+Since in this latter case you cannot use module parameters, you will
+have to enable support for the SoundPro at compile time.
+
+The IRQ and DMA values can be any that are considered acceptable for a
+WSS. Assuming you've got isapnp all happy, then you should be able to
+do something like the following (which *must* match the isapnp/BIOS
+configuration):
+
+modprobe ad1848 io=0x530 irq=11 dma=0 soundpro=1
+-and maybe-
+modprobe sb io=0x220 irq=5 dma=1 dma16=5
+
+-then-
+modprobe mpu401 io=0x330 irq=9
+modprobe opl3 io=0x388
+
+If all goes well and you see no error messages, you should be able to
+start using the sound capabilities of your system. If you get an
+error message while trying to insert the module(s), then make
+sure that the values of the various arguments match what you specified
+in your isapnp configuration file, and that there is no conflict with
+another device for an I/O port or interrupt. Checking the contents of
+/proc/ioports and /proc/interrupts can be useful to see if you're
+butting heads with another device.
+
+If you do not see the chipset version message, and none of the other
+messages present in the system log are helpful, try adding 'debug=1'
+to the ad1848 parameters, email me the syslog results and I'll do
+my best to help.
+
+Lastly, if you're using modules and want to set up automatic module
+loading with kmod, the kernel module loader, here is the section I
+currently use in my conf.modules file:
+
+# Sound
+post-install sound modprobe -k ad1848; modprobe -k mpu401; modprobe -k opl3
+options ad1848 io=0x530 irq=11 dma=0
+options sb io=0x220 irq=5 dma=1 dma16=5
+options mpu401 io=0x330 irq=9
+options opl3 io=0x388
+
+The above ensures that ad1848 will be loaded whenever the sound system
+is being used.
+
+Good luck.
+
+Ion
+
+NOT REALLY TESTED:
+- recording
+- recording device selection
+- full-duplex
+
+TODO:
+- implement mixer support for surround, loud, digital CD switches.
+- come up with a scheme which allows recording volumes for each subdevice.
+This is a major OSS API change.
diff --git a/Documentation/sound/oss/Soundblaster b/Documentation/sound/oss/Soundblaster
new file mode 100644
index 0000000..b288d46
--- /dev/null
+++ b/Documentation/sound/oss/Soundblaster
@@ -0,0 +1,53 @@
+modprobe sound
+insmod uart401
+insmod sb ...
+
+This loads the driver for the Sound Blaster and assorted clones. Cards that
+are covered by other drivers should not be using this driver.
+
+The Sound Blaster module takes the following arguments
+
+io I/O address of the Sound Blaster chip (0x220,0x240,0x260,0x280)
+irq IRQ of the Sound Blaster chip (5,7,9,10)
+dma 8-bit DMA channel for the Sound Blaster (0,1,3)
+dma16 16-bit DMA channel for SB16 and equivalent cards (5,6,7)
+mpu_io I/O for MPU chip if present (0x300,0x330)
+
+sm_games=1 Set if you have a Logitech soundman games
+acer=1 Set this to detect cards in some ACER notebooks
+mwave_bug=1 Set if you are trying to use this driver with mwave (see on)
+type Use this to specify a specific card type
+
+The following arguments are taken if ISAPnP support is compiled in
+
+isapnp=0 Set this to disable ISAPnP detection (use io=0xXXX etc. above)
+multiple=0 Set to disable detection of multiple Soundblaster cards.
+ Consider it a bug if this option is needed, and send in a
+ report.
+pnplegacy=1 Set this to be able to use a PnP card(s) along with a single
+ non-PnP (legacy) card. Above options for io, irq, etc. are
+ needed, and will apply only to the legacy card.
+reverse=1 Reverses the order of the search in the PnP table.
+uart401=1 Set to enable detection of mpu devices on some clones.
+isapnpjump=n Jumps to slot n in the driver's PnP table. Use the source,
+ Luke.
+
+You may well want to load the opl3 driver for synth music on most SB and
+clone SB devices
+
+insmod opl3 io=0x388
+
+Using Mwave
+
+To make this driver work with Mwave you must set mwave_bug. You also need
+to warm boot from DOS/Windows with the required firmware loaded under this
+OS. IBM are being difficult about documenting how to load this firmware.
+
+Avance Logic ALS007
+
+This card is supported; see the separate file ALS007 for full details.
+
+Avance Logic ALS100
+
+This card is supported; setup should be as for a standard Sound Blaster 16.
+The driver will identify the audio device as a "Sound Blaster 16 (ALS-100)".
diff --git a/Documentation/sound/oss/Tropez+ b/Documentation/sound/oss/Tropez+
new file mode 100644
index 0000000..b93a6b7
--- /dev/null
+++ b/Documentation/sound/oss/Tropez+
@@ -0,0 +1,26 @@
+From: Paul Barton-Davis <pbd@op.net>
+
+Here is the configuration I use with a Tropez+ and my modular
+driver:
+
+ alias char-major-14 wavefront
+ alias synth0 wavefront
+ alias mixer0 cs4232
+ alias audio0 cs4232
+ pre-install wavefront modprobe "-k" "cs4232"
+ post-install wavefront modprobe "-k" "opl3"
+ options wavefront io=0x200 irq=9
+ options cs4232 synthirq=9 synthio=0x200 io=0x530 irq=5 dma=1 dma2=0
+ options opl3 io=0x388
+
+Things to note:
+
+ the wavefront options "io" and "irq" ***MUST*** match the "synthio"
+ and "synthirq" cs4232 options.
+
+ you can do without the opl3 module if you don't
+ want to use the OPL/[34] synth on the soundcard
+
+ the opl3 io parameter is conventionally not adjustable.
+
+Please see drivers/sound/README.wavefront for more details.
diff --git a/Documentation/sound/oss/VIBRA16 b/Documentation/sound/oss/VIBRA16
new file mode 100644
index 0000000..68a5a46
--- /dev/null
+++ b/Documentation/sound/oss/VIBRA16
@@ -0,0 +1,80 @@
+Sound Blaster 16X Vibra addendum
+--------------------------------
+by Marius Ilioaea <mariusi@protv.ro>
+ Stefan Laudat <stefan@asit.ro>
+
+Sat Mar 6 23:55:27 EET 1999
+
+ Hello again,
+
+ Playing with a SB Vibra 16x soundcard we found it very difficult
+to setup because the kernel reported a lot of DMA errors and wouldn't
+simply play any sound.
+ A good starting point is that the vibra16x chip full-duplex facility
+is neither still exploited by the sb driver found in the linux kernel
+(tried it with a 2.2.2-ac7), nor in the commercial OSS package (it reports
+it as half-duplex soundcard). Oh, I almost forgot, the RedHat sndconfig
+failed detecting it ;)
+ So, the big problem still remains, because the sb module wants a
+8-bit and a 16-bit dma, which we could not allocate for vibra... it supports
+only two 8-bit dma channels, the second one will be passed to the module
+as a 16 bit channel, the kernel will yield about that but everything will
+be okay, trust us.
+ The only inconvenient you may find is that you will have
+some sound playing jitters if you have HDD dma support enabled - but this
+will happen with almost all soundcards...
+
+ A fully working isapnp.conf is just here:
+
+<snip here>
+
+(READPORT 0x0203)
+(ISOLATE PRESERVE)
+(IDENTIFY *)
+(VERBOSITY 2)
+(CONFLICT (IO FATAL)(IRQ FATAL)(DMA FATAL)(MEM FATAL)) # or WARNING
+# SB 16 and OPL3 devices
+(CONFIGURE CTL00f0/-1 (LD 0
+(INT 0 (IRQ 5 (MODE +E)))
+(DMA 0 (CHANNEL 1))
+(DMA 1 (CHANNEL 3))
+(IO 0 (SIZE 16) (BASE 0x0220))
+(IO 2 (SIZE 4) (BASE 0x0388))
+(NAME "CTL00f0/-1[0]{Audio }")
+(ACT Y)
+))
+
+# Joystick device - only if you need it :-/
+
+(CONFIGURE CTL00f0/-1 (LD 1
+(IO 0 (SIZE 1) (BASE 0x0200))
+(NAME "CTL00f0/-1[1]{Game }")
+(ACT Y)
+))
+(WAITFORKEY)
+
+<end of snipping>
+
+ So, after a good kernel modules compilation and a 'depmod -a kernel_ver'
+you may want to:
+
+modprobe sb io=0x220 irq=5 dma=1 dma16=3
+
+ Or, take the hard way:
+
+modprobe soundcore
+modprobe sound
+modprobe uart401
+modprobe sb io=0x220 irq=5 dma=1 dma16=3
+# do you need MIDI?
+modprobe opl3=0x388
+
+ Just in case, the kernel sound support should be:
+
+CONFIG_SOUND=m
+CONFIG_SOUND_OSS=m
+CONFIG_SOUND_SB=m
+
+ Enjoy your new noisy Linux box! ;)
+
+
diff --git a/Documentation/sound/oss/WaveArtist b/Documentation/sound/oss/WaveArtist
new file mode 100644
index 0000000..f4f3407
--- /dev/null
+++ b/Documentation/sound/oss/WaveArtist
@@ -0,0 +1,170 @@
+
+ (the following is from the armlinux CVS)
+
+ WaveArtist mixer and volume levels can be accessed via these commands:
+
+ nn30 read registers nn, where nn = 00 - 09 for mixer settings
+ 0a - 13 for channel volumes
+ mm31 write the volume setting in pairs, where mm = (nn - 10) / 2
+ rr32 write the mixer settings in pairs, where rr = nn/2
+ xx33 reset all settings to default
+ 0y34 select mono source, y=0 = left, y=1 = right
+
+ bits
+ nn 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 00 | 0 | 0 0 1 1 | left line mixer gain | left aux1 mixer gain |lmute|
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 01 | 0 | 0 1 0 1 | left aux2 mixer gain | right 2 left mic gain |mmute|
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 02 | 0 | 0 1 1 1 | left mic mixer gain | left mic | left mixer gain |dith |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 03 | 0 | 1 0 0 1 | left mixer input select |lrfg | left ADC gain |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 04 | 0 | 1 0 1 1 | right line mixer gain | right aux1 mixer gain |rmute|
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 05 | 0 | 1 1 0 1 | right aux2 mixer gain | left 2 right mic gain |test |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 06 | 0 | 1 1 1 1 | right mic mixer gain | right mic |right mixer gain |rbyps|
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 07 | 1 | 0 0 0 1 | right mixer select |rrfg | right ADC gain |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 08 | 1 | 0 0 1 1 | mono mixer gain |right ADC mux sel|left ADC mux sel |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 09 | 1 | 0 1 0 1 |loopb|left linout|loop|ADCch|TxFch|OffCD|test |loopb|loopb|osamp|
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 0a | 0 | left PCM channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 0b | 0 | right PCM channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 0c | 0 | left FM channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 0d | 0 | right FM channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 0e | 0 | left wavetable channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 0f | 0 | right wavetable channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 10 | 0 | left PCM expansion channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 11 | 0 | right PCM expansion channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 12 | 0 | left FM expansion channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+ 13 | 0 | right FM expansion channel volume |
+----+---+------------+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+
+
+ lmute: left mute
+ mmute: mono mute
+ dith: dithds
+ lrfg:
+ rmute: right mute
+ rbyps: right bypass
+ rrfg:
+ ADCch:
+ TxFch:
+ OffCD:
+ osamp:
+
+ And the following diagram is derived from the description in the CVS archive:
+
+ MIC L (mouthpiece)
+ +------+
+ -->PreAmp>-\
+ +--^---+ |
+ | |
+ r2b4-5 | +--------+
+ /----*-------------------------------->5 |
+ | | |
+ | /----------------------------------->4 |
+ | | | |
+ | | /--------------------------------->3 1of5 | +---+
+ | | | | mux >-->AMP>--> ADC L
+ | | | /------------------------------->2 | +-^-+
+ | | | | | | |
+ Line | | | | +----+ +------+ +---+ /---->1 | r3b3-0
+ ------------*->mute>--> Gain >--> | | | |
+ L | | | +----+ +------+ | | | *->0 |
+ | | | | | | +---^----+
+ Aux2 | | | +----+ +------+ | | | |
+ ----------*--->mute>--> Gain >--> M | | r8b0-2
+ L | | +----+ +------+ | | |
+ | | | | \------\
+ Aux1 | | +----+ +------+ | | |
+ --------*----->mute>--> Gain >--> I | |
+ L | +----+ +------+ | | |
+ | | | |
+ | +----+ +------+ | | +---+ |
+ *------->mute>--> Gain >--> X >-->AMP>--*
+ | +----+ +------+ | | +-^-+ |
+ | | | | |
+ | +----+ +------+ | | r2b1-3 |
+ | /----->mute>--> Gain >--> E | |
+ | | +----+ +------+ | | |
+ | | | | |
+ | | +----+ +------+ | | |
+ | | /--->mute>--> Gain >--> R | |
+ | | | +----+ +------+ | | |
+ | | | | | | r9b8-9
+ | | | +----+ +------+ | | | |
+ | | | /->mute>--> Gain >--> | | +---v---+
+ | | | | +----+ +------+ +---+ /-*->0 |
+ DAC | | | | | | |
+ ------------*----------------------------------->? | +----+
+ L | | | | | Mux >-->mute>--> L output
+ | | | | /->? | +--^-+
+ | | | | | | | |
+ | | | /--------->? | r0b0
+ | | | | | | +-------+
+ | | | | | |
+ Mono | | | | | | +-------+
+ ----------* | \---> | +----+
+ | | | | | | Mix >-->mute>--> Mono output
+ | | | | *-> | +--^-+
+ | | | | | +-------+ |
+ | | | | | r1b0
+ DAC | | | | | +-------+
+ ------------*-------------------------*--------->1 | +----+
+ R | | | | | | Mux >-->mute>--> R output
+ | | | | +----+ +------+ +---+ *->0 | +--^-+
+ | | | \->mute>--> Gain >--> | | +---^---+ |
+ | | | +----+ +------+ | | | | r5b0
+ | | | | | | r6b0
+ | | | +----+ +------+ | | |
+ | | \--->mute>--> Gain >--> M | |
+ | | +----+ +------+ | | |
+ | | | | |
+ | | +----+ +------+ | | |
+ | *----->mute>--> Gain >--> I | |
+ | | +----+ +------+ | | |
+ | | | | |
+ | | +----+ +------+ | | +---+ |
+ \------->mute>--> Gain >--> X >-->AMP>--*
+ | +----+ +------+ | | +-^-+ |
+ /--/ | | | |
+ Aux1 | +----+ +------+ | | r6b1-3 |
+ -------*------>mute>--> Gain >--> E | |
+ R | | +----+ +------+ | | |
+ | | | | |
+ Aux2 | | +----+ +------+ | | /------/
+ ---------*---->mute>--> Gain >--> R | |
+ R | | | +----+ +------+ | | |
+ | | | | | | +--------+
+ Line | | | +----+ +------+ | | | *->0 |
+ -----------*-->mute>--> Gain >--> | | | |
+ R | | | | +----+ +------+ +---+ \---->1 |
+ | | | | | |
+ | | | \-------------------------------->2 | +---+
+ | | | | Mux >-->AMP>--> ADC R
+ | | \---------------------------------->3 | +-^-+
+ | | | | |
+ | \------------------------------------>4 | r7b3-0
+ | | |
+ \-----*-------------------------------->5 |
+ | +---^----+
+ r6b4-5 | |
+ | | r8b3-5
+ +--v---+ |
+ -->PreAmp>-/
+ +------+
+ MIC R (electret mic)
diff --git a/Documentation/sound/oss/btaudio b/Documentation/sound/oss/btaudio
new file mode 100644
index 0000000..1a693e6
--- /dev/null
+++ b/Documentation/sound/oss/btaudio
@@ -0,0 +1,92 @@
+
+Intro
+=====
+
+people start bugging me about this with questions, looks like I
+should write up some documentation for this beast. That way I
+don't have to answer that much mails I hope. Yes, I'm lazy...
+
+
+You might have noticed that the bt878 grabber cards have actually
+_two_ PCI functions:
+
+$ lspci
+[ ... ]
+00:0a.0 Multimedia video controller: Brooktree Corporation Bt878 (rev 02)
+00:0a.1 Multimedia controller: Brooktree Corporation Bt878 (rev 02)
+[ ... ]
+
+The first does video, it is backward compatible to the bt848. The second
+does audio. btaudio is a driver for the second function. It's a sound
+driver which can be used for recording sound (and _only_ recording, no
+playback). As most TV cards come with a short cable which can be plugged
+into your sound card's line-in you probably don't need this driver if all
+you want to do is just watching TV...
+
+
+Driver Status
+=============
+
+Still somewhat experimental. The driver should work stable, i.e. it
+should'nt crash your box. It might not work as expected, have bugs,
+not being fully OSS API compilant, ...
+
+Latest versions are available from http://bytesex.org/bttv/, the
+driver is in the bttv tarball. Kernel patches might be available too,
+have a look at http://bytesex.org/bttv/listing.html.
+
+The chip knows two different modes. btaudio registers two dsp
+devices, one for each mode. They can not be used at the same time.
+
+
+Digital audio mode
+==================
+
+The chip gives you 16 bit stereo sound. The sample rate depends on
+the external source which feeds the bt878 with digital sound via I2S
+interface. There is a insmod option (rate) to tell the driver which
+sample rate the hardware uses (32000 is the default).
+
+One possible source for digital sound is the msp34xx audio processor
+chip which provides digital sound via I2S with 32 kHz sample rate. My
+Hauppauge board works this way.
+
+The Osprey-200 reportly gives you digital sound with 44100 Hz sample
+rate. It is also possible that you get no sound at all.
+
+
+analog mode (A/D)
+=================
+
+You can tell the driver to use this mode with the insmod option "analog=1".
+The chip has three analog inputs. Consequently you'll get a mixer device
+to control these.
+
+The analog mode supports mono only. Both 8 + 16 bit. Both are _signed_
+int, which is uncommon for the 8 bit case. Sample rate range is 119 kHz
+to 448 kHz. Yes, the number of digits is correct. The driver supports
+downsampling by powers of two, so you can ask for more usual sample rates
+like 44 kHz too.
+
+With my Hauppauge I get noisy sound on the second input (mapped to line2
+by the mixer device). Others get a useable signal on line1.
+
+
+some examples
+=============
+
+* read audio data from btaudio (dsp2), send to es1730 (dsp,dsp1):
+ $ sox -w -r 32000 -t ossdsp /dev/dsp2 -t ossdsp /dev/dsp
+
+* read audio data from btaudio, send to esound daemon (which might be
+ running on another host):
+ $ sox -c 2 -w -r 32000 -t ossdsp /dev/dsp2 -t sw - | esdcat -r 32000
+ $ sox -c 1 -w -r 32000 -t ossdsp /dev/dsp2 -t sw - | esdcat -m -r 32000
+
+
+Have fun,
+
+ Gerd
+
+--
+Gerd Knorr <kraxel@bytesex.org>
diff --git a/Documentation/sound/oss/mwave b/Documentation/sound/oss/mwave
new file mode 100644
index 0000000..5fbcb16
--- /dev/null
+++ b/Documentation/sound/oss/mwave
@@ -0,0 +1,185 @@
+ How to try to survive an IBM Mwave under Linux SB drivers
+
+
++ IBM have now released documentation of sorts and Torsten is busy
+ trying to make the Mwave work. This is not however a trivial task.
+
+----------------------------------------------------------------------------
+
+OK, first thing - the IRQ problem IS a problem, whether the test is bypassed or
+not. It is NOT a Linux problem, but an MWAVE problem that is fixed with the
+latest MWAVE patches. So, in other words, don't bypass the test for MWAVES!
+
+I have Windows 95 on /dev/hda1, swap on /dev/hda2, and Red Hat 5 on /dev/hda3.
+
+The steps, then:
+
+ Boot to Linux.
+ Mount Windows 95 file system (assume mount point = /dos95).
+ mkdir /dos95/linux
+ mkdir /dos95/linux/boot
+ mkdir /dos95/linux/boot/parms
+
+ Copy the kernel, any initrd image, and loadlin to /dos95/linux/boot/.
+
+ Reboot to Windows 95.
+
+ Edit C:/msdos.sys and add or change the following:
+
+ Logo=0
+ BootGUI=0
+
+ Note that msdos.sys is a text file but it needs to be made 'unhidden',
+ readable and writable before it can be edited. This can be done with
+ DOS' "attrib" command.
+
+ Edit config.sys to have multiple config menus. I have one for windows 95 and
+ five for Linux, like this:
+------------
+[menu]
+menuitem=W95, Windows 95
+menuitem=LINTP, Linux - ThinkPad
+menuitem=LINTP3, Linux - ThinkPad Console
+menuitem=LINDOC, Linux - Docked
+menuitem=LINDOC3, Linux - Docked Console
+menuitem=LIN1, Linux - Single User Mode
+REM menudefault=W95,10
+
+[W95]
+
+[LINTP]
+
+[LINDOC]
+
+[LINTP3]
+
+[LINDOC3]
+
+[LIN1]
+
+[COMMON]
+FILES=30
+REM Please read README.TXT in C:\MWW subdirectory before changing the DOS= statement.
+DOS=HIGH,UMB
+DEVICE=C:\MWW\MANAGER\MWD50430.EXE
+SHELL=c:\command.com /e:2048
+-------------------
+
+The important things are the SHELL and DEVICE statements.
+
+ Then change autoexec.bat. Basically everything in there originally should be
+ done ONLY when Windows 95 is booted. Then you add new things specifically
+ for Linux. Mine is as follows
+
+---------------
+@ECHO OFF
+if "%CONFIG%" == "W95" goto W95
+
+REM
+REM Linux stuff
+REM
+SET MWPATH=C:\MWW\DLL;C:\MWW\MWGAMES;C:\MWW\DSP
+SET BLASTER=A220 I5 D1
+SET MWROOT=C:\MWW
+SET LIBPATH=C:\MWW\DLL
+SET PATH=C:\WINDOWS;C:\MWW\DLL;
+CALL MWAVE START NOSHOW
+c:\linux\boot\loadlin.exe @c:\linux\boot\parms\%CONFIG%.par
+
+:W95
+REM
+REM Windows 95 stuff
+REM
+c:\toolkit\guard
+SET MSINPUT=C:\MSINPUT
+SET MWPATH=C:\MWW\DLL;C:\MWW\MWGAMES;C:\MWW\DSP
+REM The following is used by DOS games to recognize Sound Blaster hardware.
+REM If hardware settings are changed, please change this line as well.
+REM See the Mwave README file for instructions.
+SET BLASTER=A220 I5 D1
+SET MWROOT=C:\MWW
+SET LIBPATH=C:\MWW\DLL
+SET PATH=C:\WINDOWS;C:\WINDOWS\COMMAND;E:\ORAWIN95\BIN;f:\msdev\bin;e:\v30\bin.dbg;v:\devt\v30\bin;c:\JavaSDK\Bin;C:\MWW\DLL;
+SET INCLUDE=f:\MSDEV\INCLUDE;F:\MSDEV\MFC\INCLUDE
+SET LIB=F:\MSDEV\LIB;F:\MSDEV\MFC\LIB
+win
+
+------------------------
+
+Now build a file in c:\linux\boot\parms for each Linux config that you have.
+
+For example, my LINDOC3 config is for a docked Thinkpad at runlevel 3 with no
+initrd image, and has a parameter file named LINDOC3.PAR in c:\linux\boot\parms:
+
+-----------------------
+# LOADLIN @param_file image=other_image root=/dev/other
+#
+# Linux Console in docking station
+#
+c:\linux\boot\zImage.krn # First value must be filename of Linux kernel.
+root=/dev/hda3 # device which gets mounted as root FS
+ro # Other kernel arguments go here.
+apm=off
+doc=yes
+3
+-----------------------
+
+The doc=yes parameter is an environment variable used by my init scripts, not
+a kernel argument.
+
+However, the apm=off parameter IS a kernel argument! APM, at least in my setup,
+causes the kernel to crash when loaded via loadlin (but NOT when loaded via
+LILO). The APM stuff COULD be forced out of the kernel via the kernel compile
+options. Instead, I got an unofficial patch to the APM drivers that allows them
+to be dynamically deactivated via kernel arguments. Whatever you chose to
+document, APM, it seems, MUST be off for setups like mine.
+
+Now make sure C:\MWW\MWCONFIG.REF looks like this:
+
+----------------------
+[NativeDOS]
+Default=SB1.5
+SBInputSource=CD
+SYNTH=FM
+QSound=OFF
+Reverb=OFF
+Chorus=OFF
+ReverbDepth=5
+ChorusDepth=5
+SBInputVolume=5
+SBMainVolume=10
+SBWaveVolume=10
+SBSynthVolume=10
+WaveTableVolume=10
+AudioPowerDriver=ON
+
+[FastCFG]
+Show=No
+HideOption=Off
+-----------------------------
+
+OR the Default= line COULD be
+
+Default=SBPRO
+
+Reboot to Windows 95 and choose Linux. When booted, use sndconfig to configure
+the sound modules and voilà - ThinkPad sound with Linux.
+
+Now the gotchas - you can either have CD sound OR Mixers but not both. That's a
+problem with the SB1.5 (CD sound) or SBPRO (Mixers) settings. No one knows why
+this is!
+
+For some reason MPEG3 files, when played through mpg123, sound like they
+are playing at 1/8th speed - not very useful! If you have ANY insight
+on why this second thing might be happening, I would be grateful.
+
+===========================================================
+ _/ _/_/_/_/
+ _/_/ _/_/ _/
+ _/ _/_/ _/_/_/_/ Martin John Bartlett
+ _/ _/ _/ _/ (martin@nitram.demon.co.uk)
+_/ _/_/_/_/
+ _/
+_/ _/
+ _/_/
+===========================================================
diff --git a/Documentation/sound/oss/ultrasound b/Documentation/sound/oss/ultrasound
new file mode 100644
index 0000000..eed331c
--- /dev/null
+++ b/Documentation/sound/oss/ultrasound
@@ -0,0 +1,30 @@
+modprobe sound
+insmod ad1848
+insmod gus io=* irq=* dma=* ...
+
+This loads the driver for the Gravis Ultrasound family of sound cards.
+
+The gus module takes the following arguments
+
+io I/O address of the Ultrasound card (eg. io=0x220)
+irq IRQ of the Sound Blaster card
+dma DMA channel for the Sound Blaster
+dma16 2nd DMA channel, only needed for full duplex operation
+type 1 for PnP card
+gus16 1 for using 16 bit sampling daughter board
+no_wave_dma Set to disable DMA usage for wavetable (see note)
+db16 ???
+
+
+no_wave_dma option
+
+This option defaults to a value of 0, which allows the Ultrasound wavetable
+DSP to use DMA for playback and downloading samples. This is the same
+as the old behaviour. If set to 1, no DMA is needed for downloading samples,
+and allows owners of a GUS MAX to make use of simultaneous digital audio
+(/dev/dsp), MIDI, and wavetable playback.
+
+
+If you have problems in recording with GUS MAX, you could try to use
+just one 8 bit DMA channel. Recording will not work with one DMA
+channel if it's a 16 bit one.
diff --git a/Documentation/sound/oss/vwsnd b/Documentation/sound/oss/vwsnd
new file mode 100644
index 0000000..4c6cbdb
--- /dev/null
+++ b/Documentation/sound/oss/vwsnd
@@ -0,0 +1,293 @@
+vwsnd - Sound driver for the Silicon Graphics 320 and 540 Visual
+Workstations' onboard audio.
+
+Copyright 1999 Silicon Graphics, Inc. All rights reserved.
+
+
+At the time of this writing, March 1999, there are two models of
+Visual Workstation, the 320 and the 540. This document only describes
+those models. Future Visual Workstation models may have different
+sound capabilities, and this driver will probably not work on those
+boxes.
+
+The Visual Workstation has an Analog Devices AD1843 "SoundComm" audio
+codec chip. The AD1843 is accessed through the Cobalt I/O ASIC, also
+known as Lithium. This driver programs both chips.
+
+==============================================================================
+QUICK CONFIGURATION
+
+ # insmod soundcore
+ # insmod vwsnd
+
+==============================================================================
+I/O CONNECTIONS
+
+On the Visual Workstation, only three of the AD1843 inputs are hooked
+up. The analog line in jacks are connected to the AD1843's AUX1
+input. The CD audio lines are connected to the AD1843's AUX2 input.
+The microphone jack is connected to the AD1843's MIC input. The mic
+jack is mono, but the signal is delivered to both the left and right
+MIC inputs. You can record in stereo from the mic input, but you will
+get the same signal on both channels (within the limits of A/D
+accuracy). Full scale on the Line input is +/- 2.0 V. Full scale on
+the MIC input is 20 dB less, or +/- 0.2 V.
+
+The AD1843's LOUT1 outputs are connected to the Line Out jacks. The
+AD1843's HPOUT outputs are connected to the speaker/headphone jack.
+LOUT2 is not connected. Line out's maximum level is +/- 2.0 V peak to
+peak. The speaker/headphone out's maximum is +/- 4.0 V peak to peak.
+
+The AD1843's PCM input channel and one of its output channels (DAC1)
+are connected to Lithium. The other output channel (DAC2) is not
+connected.
+
+==============================================================================
+CAPABILITIES
+
+The AD1843 has PCM input and output (Pulse Code Modulation, also known
+as wavetable). PCM input and output can be mono or stereo in any of
+four formats. The formats are 16 bit signed and 8 bit unsigned,
+u-Law, and A-Law format. Any sample rate from 4 KHz to 49 KHz is
+available, in 1 Hz increments.
+
+The AD1843 includes an analog mixer that can mix all three input
+signals (line, mic and CD) into the analog outputs. The mixer has a
+separate gain control and mute switch for each input.
+
+There are two outputs, line out and speaker/headphone out. They
+always produce the same signal, and the speaker always has 3 dB more
+gain than the line out. The speaker/headphone output can be muted,
+but this driver does not export that function.
+
+The hardware can sync audio to the video clock, but this driver does
+not have a way to specify syncing to video.
+
+==============================================================================
+PROGRAMMING
+
+This section explains the API supported by the driver. Also see the
+Open Sound Programming Guide at http://www.opensound.com/pguide/ .
+This section assumes familiarity with that document.
+
+The driver has two interfaces, an I/O interface and a mixer interface.
+There is no MIDI or sequencer capability.
+
+==============================================================================
+PROGRAMMING PCM I/O
+
+The I/O interface is usually accessed as /dev/audio or /dev/dsp.
+Using the standard Open Sound System (OSS) ioctl calls, the sample
+rate, number of channels, and sample format may be set within the
+limitations described above. The driver supports triggering. It also
+supports getting the input and output pointers with one-sample
+accuracy.
+
+The SNDCTL_DSP_GETCAP ioctl returns these capabilities.
+
+ DSP_CAP_DUPLEX - driver supports full duplex.
+
+ DSP_CAP_TRIGGER - driver supports triggering.
+
+ DSP_CAP_REALTIME - values returned by SNDCTL_DSP_GETIPTR
+ and SNDCTL_DSP_GETOPTR are accurate to a few samples.
+
+Memory mapping (mmap) is not implemented.
+
+The driver permits subdivided fragment sizes from 64 to 4096 bytes.
+The number of fragments can be anything from 3 fragments to however
+many fragments fit into 124 kilobytes. It is up to the user to
+determine how few/small fragments can be used without introducing
+glitches with a given workload. Linux is not realtime, so we can't
+promise anything. (sigh...)
+
+When this driver is switched into or out of mu-Law or A-Law mode on
+output, it may produce an audible click. This is unavoidable. To
+prevent clicking, use signed 16-bit mode instead, and convert from
+mu-Law or A-Law format in software.
+
+==============================================================================
+PROGRAMMING THE MIXER INTERFACE
+
+The mixer interface is usually accessed as /dev/mixer. It is accessed
+through ioctls. The mixer allows the application to control gain or
+mute several audio signal paths, and also allows selection of the
+recording source.
+
+Each of the constants described here can be read using the
+MIXER_READ(SOUND_MIXER_xxx) ioctl. Those that are not read-only can
+also be written using the MIXER_WRITE(SOUND_MIXER_xxx) ioctl. In most
+cases, <sys/soundcard.h> defines constants SOUND_MIXER_READ_xxx and
+SOUND_MIXER_WRITE_xxx which work just as well.
+
+SOUND_MIXER_CAPS Read-only
+
+This is a mask of optional driver capabilities that are implemented.
+This driver's only capability is SOUND_CAP_EXCL_INPUT, which means
+that only one recording source can be active at a time.
+
+SOUND_MIXER_DEVMASK Read-only
+
+This is a mask of the sound channels. This driver's channels are PCM,
+LINE, MIC, CD, and RECLEV.
+
+SOUND_MIXER_STEREODEVS Read-only
+
+This is a mask of which sound channels are capable of stereo. All
+channels are capable of stereo. (But see caveat on MIC input in I/O
+CONNECTIONS section above).
+
+SOUND_MIXER_OUTMASK Read-only
+
+This is a mask of channels that route inputs through to outputs.
+Those are LINE, MIC, and CD.
+
+SOUND_MIXER_RECMASK Read-only
+
+This is a mask of channels that can be recording sources. Those are
+PCM, LINE, MIC, CD.
+
+SOUND_MIXER_PCM Default: 0x5757 (0 dB)
+
+This is the gain control for PCM output. The left and right channel
+gain are controlled independently. This gain control has 64 levels,
+which range from -82.5 dB to +12.0 dB in 1.5 dB steps. Those 64
+levels are mapped onto 100 levels at the ioctl, see below.
+
+SOUND_MIXER_LINE Default: 0x4a4a (0 dB)
+
+This is the gain control for mixing the Line In source into the
+outputs. The left and right channel gain are controlled
+independently. This gain control has 32 levels, which range from
+-34.5 dB to +12.0 dB in 1.5 dB steps. Those 32 levels are mapped onto
+100 levels at the ioctl, see below.
+
+SOUND_MIXER_MIC Default: 0x4a4a (0 dB)
+
+This is the gain control for mixing the MIC source into the outputs.
+The left and right channel gain are controlled independently. This
+gain control has 32 levels, which range from -34.5 dB to +12.0 dB in
+1.5 dB steps. Those 32 levels are mapped onto 100 levels at the
+ioctl, see below.
+
+SOUND_MIXER_CD Default: 0x4a4a (0 dB)
+
+This is the gain control for mixing the CD audio source into the
+outputs. The left and right channel gain are controlled
+independently. This gain control has 32 levels, which range from
+-34.5 dB to +12.0 dB in 1.5 dB steps. Those 32 levels are mapped onto
+100 levels at the ioctl, see below.
+
+SOUND_MIXER_RECLEV Default: 0 (0 dB)
+
+This is the gain control for PCM input (RECording LEVel). The left
+and right channel gain are controlled independently. This gain
+control has 16 levels, which range from 0 dB to +22.5 dB in 1.5 dB
+steps. Those 16 levels are mapped onto 100 levels at the ioctl, see
+below.
+
+SOUND_MIXER_RECSRC Default: SOUND_MASK_LINE
+
+This is a mask of currently selected PCM input sources (RECording
+SouRCes). Because the AD1843 can only have a single recording source
+at a time, only one bit at a time can be set in this mask. The
+allowable values are SOUND_MASK_PCM, SOUND_MASK_LINE, SOUND_MASK_MIC,
+or SOUND_MASK_CD. Selecting SOUND_MASK_PCM sets up internal
+resampling which is useful for loopback testing and for hardware
+sample rate conversion. But software sample rate conversion is
+probably faster, so I don't know how useful that is.
+
+SOUND_MIXER_OUTSRC DEFAULT: SOUND_MASK_LINE|SOUND_MASK_MIC|SOUND_MASK_CD
+
+This is a mask of sources that are currently passed through to the
+outputs. Those sources whose bits are not set are muted.
+
+==============================================================================
+GAIN CONTROL
+
+There are five gain controls listed above. Each has 16, 32, or 64
+steps. Each control has 1.5 dB of gain per step. Each control is
+stereo.
+
+The OSS defines the argument to a channel gain ioctl as having two
+components, left and right, each of which ranges from 0 to 100. The
+two components are packed into the same word, with the left side gain
+in the least significant byte, and the right side gain in the second
+least significant byte. In C, we would say this.
+
+ #include <assert.h>
+
+ ...
+
+ assert(leftgain >= 0 && leftgain <= 100);
+ assert(rightgain >= 0 && rightgain <= 100);
+ arg = leftgain | rightgain << 8;
+
+So each OSS gain control has 101 steps. But the hardware has 16, 32,
+or 64 steps. The hardware steps are spread across the 101 OSS steps
+nearly evenly. The conversion formulas are like this, given N equals
+16, 32, or 64.
+
+ int round = N/2 - 1;
+ OSS_gain_steps = (hw_gain_steps * 100 + round) / (N - 1);
+ hw_gain_steps = (OSS_gain_steps * (N - 1) + round) / 100;
+
+Here is a snippet of C code that will return the left and right gain
+of any channel in dB. Pass it one of the predefined gain_desc_t
+structures to access any of the five channels' gains.
+
+ typedef struct gain_desc {
+ float min_gain;
+ float gain_step;
+ int nbits;
+ int chan;
+ } gain_desc_t;
+
+ const gain_desc_t gain_pcm = { -82.5, 1.5, 6, SOUND_MIXER_PCM };
+ const gain_desc_t gain_line = { -34.5, 1.5, 5, SOUND_MIXER_LINE };
+ const gain_desc_t gain_mic = { -34.5, 1.5, 5, SOUND_MIXER_MIC };
+ const gain_desc_t gain_cd = { -34.5, 1.5, 5, SOUND_MIXER_CD };
+ const gain_desc_t gain_reclev = { 0.0, 1.5, 4, SOUND_MIXER_RECLEV };
+
+ int get_gain_dB(int fd, const gain_desc_t *gp,
+ float *left, float *right)
+ {
+ int word;
+ int lg, rg;
+ int mask = (1 << gp->nbits) - 1;
+
+ if (ioctl(fd, MIXER_READ(gp->chan), &word) != 0)
+ return -1; /* fail */
+ lg = word & 0xFF;
+ rg = word >> 8 & 0xFF;
+ lg = (lg * mask + mask / 2) / 100;
+ rg = (rg * mask + mask / 2) / 100;
+ *left = gp->min_gain + gp->gain_step * lg;
+ *right = gp->min_gain + gp->gain_step * rg;
+ return 0;
+ }
+
+And here is the corresponding routine to set a channel's gain in dB.
+
+ int set_gain_dB(int fd, const gain_desc_t *gp, float left, float right)
+ {
+ float max_gain =
+ gp->min_gain + (1 << gp->nbits) * gp->gain_step;
+ float round = gp->gain_step / 2;
+ int mask = (1 << gp->nbits) - 1;
+ int word;
+ int lg, rg;
+
+ if (left < gp->min_gain || right < gp->min_gain)
+ return EINVAL;
+ lg = (left - gp->min_gain + round) / gp->gain_step;
+ rg = (right - gp->min_gain + round) / gp->gain_step;
+ if (lg >= (1 << gp->nbits) || rg >= (1 << gp->nbits))
+ return EINVAL;
+ lg = (100 * lg + mask / 2) / mask;
+ rg = (100 * rg + mask / 2) / mask;
+ word = lg | rg << 8;
+
+ return ioctl(fd, MIXER_WRITE(gp->chan), &word);
+ }
+
diff --git a/Documentation/sparc/README-2.5 b/Documentation/sparc/README-2.5
new file mode 100644
index 0000000..806fe49
--- /dev/null
+++ b/Documentation/sparc/README-2.5
@@ -0,0 +1,46 @@
+BTFIXUP
+-------
+
+To build new kernels you have to issue "make image". The ready kernel
+in ELF format is placed in arch/sparc/boot/image. Explanation is below.
+
+BTFIXUP is a unique feature of Linux/sparc among other architectures,
+developed by Jakub Jelinek (I think... Obviously David S. Miller took
+part, too). It allows to boot the same kernel at different
+sub-architectures, such as sun4c, sun4m, sun4d, where SunOS uses
+different kernels. This feature is convinient for people who you move
+disks between boxes and for distrution builders.
+
+To function, BTFIXUP must link the kernel "in the draft" first,
+analyze the result, write a special stub code based on that, and
+build the final kernel with the stub (btfix.o).
+
+Kai Germaschewski improved the build system of the kernel in the 2.5 series
+significantly. Unfortunately, the traditional way of running the draft
+linking from architecture specific Makefile before the actual linking
+by generic Makefile is nearly impossible to support properly in the
+new build system. Therefore, the way we integrate BTFIXUP with the
+build system was changed in 2.5.40. Now, generic Makefile performs
+the draft linking and stores the result in file vmlinux. Architecture
+specific post-processing invokes BTFIXUP machinery and final linking
+in the same way as other architectures do bootstraps.
+
+Implications of that change are as follows.
+
+1. Hackers must type "make image" now, instead of just "make", in the same
+ way as s390 people do now. It is analogous to "make bzImage" on i386.
+ This does NOT affect sparc64, you continue to use "make" to build sparc64
+ kernels.
+
+2. vmlinux is not the final kernel, so RPM builders have to adjust
+ their spec files (if they delivered vmlinux for debugging).
+ System.map generated for vmlinux is still valid.
+
+3. Scripts that produce a.out images have to be changed. First, if they
+ invoke make, they have to use "make image". Second, they have to pick up
+ the new kernel in arch/sparc/boot/image instead of vmlinux.
+
+4. Since we are compliant with Kai's build system now, make -j is permitted.
+
+-- Pete Zaitcev
+zaitcev@yahoo.com
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt
new file mode 100644
index 0000000..42f43fa
--- /dev/null
+++ b/Documentation/sparse.txt
@@ -0,0 +1,82 @@
+Copyright 2004 Linus Torvalds
+Copyright 2004 Pavel Machek <pavel@suse.cz>
+Copyright 2006 Bob Copeland <me@bobcopeland.com>
+
+Using sparse for typechecking
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+"__bitwise" is a type attribute, so you have to do something like this:
+
+ typedef int __bitwise pm_request_t;
+
+ enum pm_request {
+ PM_SUSPEND = (__force pm_request_t) 1,
+ PM_RESUME = (__force pm_request_t) 2
+ };
+
+which makes PM_SUSPEND and PM_RESUME "bitwise" integers (the "__force" is
+there because sparse will complain about casting to/from a bitwise type,
+but in this case we really _do_ want to force the conversion). And because
+the enum values are all the same type, now "enum pm_request" will be that
+type too.
+
+And with gcc, all the __bitwise/__force stuff goes away, and it all ends
+up looking just like integers to gcc.
+
+Quite frankly, you don't need the enum there. The above all really just
+boils down to one special "int __bitwise" type.
+
+So the simpler way is to just do
+
+ typedef int __bitwise pm_request_t;
+
+ #define PM_SUSPEND ((__force pm_request_t) 1)
+ #define PM_RESUME ((__force pm_request_t) 2)
+
+and you now have all the infrastructure needed for strict typechecking.
+
+One small note: the constant integer "0" is special. You can use a
+constant zero as a bitwise integer type without sparse ever complaining.
+This is because "bitwise" (as the name implies) was designed for making
+sure that bitwise types don't get mixed up (little-endian vs big-endian
+vs cpu-endian vs whatever), and there the constant "0" really _is_
+special.
+
+Getting sparse
+~~~~~~~~~~~~~~
+
+You can get latest released versions from the Sparse homepage at
+http://www.kernel.org/pub/linux/kernel/people/josh/sparse/
+
+Alternatively, you can get snapshots of the latest development version
+of sparse using git to clone..
+
+ git://git.kernel.org/pub/scm/linux/kernel/git/josh/sparse.git
+
+DaveJ has hourly generated tarballs of the git tree available at..
+
+ http://www.codemonkey.org.uk/projects/git-snapshots/sparse/
+
+
+Once you have it, just do
+
+ make
+ make install
+
+as a regular user, and it will install sparse in your ~/bin directory.
+
+Using sparse
+~~~~~~~~~~~~
+
+Do a kernel make with "make C=1" to run sparse on all the C files that get
+recompiled, or use "make C=2" to run sparse on the files whether they need to
+be recompiled or not. The latter is a fast way to check the whole tree if you
+have already built it.
+
+The optional make variable CF can be used to pass arguments to sparse. The
+build system passes -Wbitwise to sparse automatically. To perform endianness
+checks, you may define __CHECK_ENDIAN__:
+
+ make C=2 CF="-D__CHECK_ENDIAN__"
+
+These checks are disabled by default as they generate a host of warnings.
diff --git a/Documentation/spi/.gitignore b/Documentation/spi/.gitignore
new file mode 100644
index 0000000..4280576
--- /dev/null
+++ b/Documentation/spi/.gitignore
@@ -0,0 +1,2 @@
+spidev_fdx
+spidev_test
diff --git a/Documentation/spi/Makefile b/Documentation/spi/Makefile
new file mode 100644
index 0000000..6155d94
--- /dev/null
+++ b/Documentation/spi/Makefile
@@ -0,0 +1,12 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+CC = $(CROSS_COMPILE)gcc
+# List of programs to build
+hostprogs-y := spidev_test spidev_fdx
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_spidev_test.o += -I$(objtree)/usr/include
+HOSTCFLAGS_spidev_fdx.o += -I$(objtree)/usr/include
diff --git a/Documentation/spi/butterfly b/Documentation/spi/butterfly
new file mode 100644
index 0000000..9927af7
--- /dev/null
+++ b/Documentation/spi/butterfly
@@ -0,0 +1,68 @@
+spi_butterfly - parport-to-butterfly adapter driver
+===================================================
+
+This is a hardware and software project that includes building and using
+a parallel port adapter cable, together with an "AVR Butterfly" to run
+firmware for user interfacing and/or sensors. A Butterfly is a $US20
+battery powered card with an AVR microcontroller and lots of goodies:
+sensors, LCD, flash, toggle stick, and more. You can use AVR-GCC to
+develop firmware for this, and flash it using this adapter cable.
+
+You can make this adapter from an old printer cable and solder things
+directly to the Butterfly. Or (if you have the parts and skills) you
+can come up with something fancier, providing ciruit protection to the
+Butterfly and the printer port, or with a better power supply than two
+signal pins from the printer port. Or for that matter, you can use
+similar cables to talk to many AVR boards, even a breadboard.
+
+This is more powerful than "ISP programming" cables since it lets kernel
+SPI protocol drivers interact with the AVR, and could even let the AVR
+issue interrupts to them. Later, your protocol driver should work
+easily with a "real SPI controller", instead of this bitbanger.
+
+
+The first cable connections will hook Linux up to one SPI bus, with the
+AVR and a DataFlash chip; and to the AVR reset line. This is all you
+need to reflash the firmware, and the pins are the standard Atmel "ISP"
+connector pins (used also on non-Butterfly AVR boards). On the parport
+side this is like "sp12" programming cables.
+
+ Signal Butterfly Parport (DB-25)
+ ------ --------- ---------------
+ SCK = J403.PB1/SCK = pin 2/D0
+ RESET = J403.nRST = pin 3/D1
+ VCC = J403.VCC_EXT = pin 8/D6
+ MOSI = J403.PB2/MOSI = pin 9/D7
+ MISO = J403.PB3/MISO = pin 11/S7,nBUSY
+ GND = J403.GND = pin 23/GND
+
+Then to let Linux master that bus to talk to the DataFlash chip, you must
+(a) flash new firmware that disables SPI (set PRR.2, and disable pullups
+by clearing PORTB.[0-3]); (b) configure the mtd_dataflash driver; and
+(c) cable in the chipselect.
+
+ Signal Butterfly Parport (DB-25)
+ ------ --------- ---------------
+ VCC = J400.VCC_EXT = pin 7/D5
+ SELECT = J400.PB0/nSS = pin 17/C3,nSELECT
+ GND = J400.GND = pin 24/GND
+
+Or you could flash firmware making the AVR into an SPI slave (keeping the
+DataFlash in reset) and tweak the spi_butterfly driver to make it bind to
+the driver for your custom SPI-based protocol.
+
+The "USI" controller, using J405, can also be used for a second SPI bus.
+That would let you talk to the AVR using custom SPI-with-USI firmware,
+while letting either Linux or the AVR use the DataFlash. There are plenty
+of spare parport pins to wire this one up, such as:
+
+ Signal Butterfly Parport (DB-25)
+ ------ --------- ---------------
+ SCK = J403.PE4/USCK = pin 5/D3
+ MOSI = J403.PE5/DI = pin 6/D4
+ MISO = J403.PE6/DO = pin 12/S5,nPAPEROUT
+ GND = J403.GND = pin 22/GND
+
+ IRQ = J402.PF4 = pin 10/S6,ACK
+ GND = J402.GND(P2) = pin 25/GND
+
diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx
new file mode 100644
index 0000000..6bb916d
--- /dev/null
+++ b/Documentation/spi/pxa2xx
@@ -0,0 +1,246 @@
+PXA2xx SPI on SSP driver HOWTO
+===================================================
+This a mini howto on the pxa2xx_spi driver. The driver turns a PXA2xx
+synchronous serial port into a SPI master controller
+(see Documentation/spi/spi_summary). The driver has the following features
+
+- Support for any PXA2xx SSP
+- SSP PIO and SSP DMA data transfers.
+- External and Internal (SSPFRM) chip selects.
+- Per slave device (chip) configuration.
+- Full suspend, freeze, resume support.
+
+The driver is built around a "spi_message" fifo serviced by workqueue and a
+tasklet. The workqueue, "pump_messages", drives message fifo and the tasklet
+(pump_transfer) is responsible for queuing SPI transactions and setting up and
+launching the dma/interrupt driven transfers.
+
+Declaring PXA2xx Master Controllers
+-----------------------------------
+Typically a SPI master is defined in the arch/.../mach-*/board-*.c as a
+"platform device". The master configuration is passed to the driver via a table
+found in arch/arm/mach-pxa/include/mach/pxa2xx_spi.h:
+
+struct pxa2xx_spi_master {
+ enum pxa_ssp_type ssp_type;
+ u32 clock_enable;
+ u16 num_chipselect;
+ u8 enable_dma;
+};
+
+The "pxa2xx_spi_master.ssp_type" field must have a value between 1 and 3 and
+informs the driver which features a particular SSP supports.
+
+The "pxa2xx_spi_master.clock_enable" field is used to enable/disable the
+corresponding SSP peripheral block in the "Clock Enable Register (CKEN"). See
+the "PXA2xx Developer Manual" section "Clocks and Power Management".
+
+The "pxa2xx_spi_master.num_chipselect" field is used to determine the number of
+slave device (chips) attached to this SPI master.
+
+The "pxa2xx_spi_master.enable_dma" field informs the driver that SSP DMA should
+be used. This caused the driver to acquire two DMA channels: rx_channel and
+tx_channel. The rx_channel has a higher DMA service priority the tx_channel.
+See the "PXA2xx Developer Manual" section "DMA Controller".
+
+NSSP MASTER SAMPLE
+------------------
+Below is a sample configuration using the PXA255 NSSP.
+
+static struct resource pxa_spi_nssp_resources[] = {
+ [0] = {
+ .start = __PREG(SSCR0_P(2)), /* Start address of NSSP */
+ .end = __PREG(SSCR0_P(2)) + 0x2c, /* Range of registers */
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .start = IRQ_NSSP, /* NSSP IRQ */
+ .end = IRQ_NSSP,
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+static struct pxa2xx_spi_master pxa_nssp_master_info = {
+ .ssp_type = PXA25x_NSSP, /* Type of SSP */
+ .clock_enable = CKEN_NSSP, /* NSSP Peripheral clock */
+ .num_chipselect = 1, /* Matches the number of chips attached to NSSP */
+ .enable_dma = 1, /* Enables NSSP DMA */
+};
+
+static struct platform_device pxa_spi_nssp = {
+ .name = "pxa2xx-spi", /* MUST BE THIS VALUE, so device match driver */
+ .id = 2, /* Bus number, MUST MATCH SSP number 1..n */
+ .resource = pxa_spi_nssp_resources,
+ .num_resources = ARRAY_SIZE(pxa_spi_nssp_resources),
+ .dev = {
+ .platform_data = &pxa_nssp_master_info, /* Passed to driver */
+ },
+};
+
+static struct platform_device *devices[] __initdata = {
+ &pxa_spi_nssp,
+};
+
+static void __init board_init(void)
+{
+ (void)platform_add_device(devices, ARRAY_SIZE(devices));
+}
+
+Declaring Slave Devices
+-----------------------
+Typically each SPI slave (chip) is defined in the arch/.../mach-*/board-*.c
+using the "spi_board_info" structure found in "linux/spi/spi.h". See
+"Documentation/spi/spi_summary" for additional information.
+
+Each slave device attached to the PXA must provide slave specific configuration
+information via the structure "pxa2xx_spi_chip" found in
+"arch/arm/mach-pxa/include/mach/pxa2xx_spi.h". The pxa2xx_spi master controller driver
+will uses the configuration whenever the driver communicates with the slave
+device. All fields are optional.
+
+struct pxa2xx_spi_chip {
+ u8 tx_threshold;
+ u8 rx_threshold;
+ u8 dma_burst_size;
+ u32 timeout;
+ u8 enable_loopback;
+ void (*cs_control)(u32 command);
+};
+
+The "pxa2xx_spi_chip.tx_threshold" and "pxa2xx_spi_chip.rx_threshold" fields are
+used to configure the SSP hardware fifo. These fields are critical to the
+performance of pxa2xx_spi driver and misconfiguration will result in rx
+fifo overruns (especially in PIO mode transfers). Good default values are
+
+ .tx_threshold = 8,
+ .rx_threshold = 8,
+
+The range is 1 to 16 where zero indicates "use default".
+
+The "pxa2xx_spi_chip.dma_burst_size" field is used to configure PXA2xx DMA
+engine and is related the "spi_device.bits_per_word" field. Read and understand
+the PXA2xx "Developer Manual" sections on the DMA controller and SSP Controllers
+to determine the correct value. An SSP configured for byte-wide transfers would
+use a value of 8. The driver will determine a reasonable default if
+dma_burst_size == 0.
+
+The "pxa2xx_spi_chip.timeout" fields is used to efficiently handle
+trailing bytes in the SSP receiver fifo. The correct value for this field is
+dependent on the SPI bus speed ("spi_board_info.max_speed_hz") and the specific
+slave device. Please note that the PXA2xx SSP 1 does not support trailing byte
+timeouts and must busy-wait any trailing bytes.
+
+The "pxa2xx_spi_chip.enable_loopback" field is used to place the SSP porting
+into internal loopback mode. In this mode the SSP controller internally
+connects the SSPTX pin to the SSPRX pin. This is useful for initial setup
+testing.
+
+The "pxa2xx_spi_chip.cs_control" field is used to point to a board specific
+function for asserting/deasserting a slave device chip select. If the field is
+NULL, the pxa2xx_spi master controller driver assumes that the SSP port is
+configured to use SSPFRM instead.
+
+NOTE: the SPI driver cannot control the chip select if SSPFRM is used, so the
+chipselect is dropped after each spi_transfer. Most devices need chip select
+asserted around the complete message. Use SSPFRM as a GPIO (through cs_control)
+to accomodate these chips.
+
+
+NSSP SLAVE SAMPLE
+-----------------
+The pxa2xx_spi_chip structure is passed to the pxa2xx_spi driver in the
+"spi_board_info.controller_data" field. Below is a sample configuration using
+the PXA255 NSSP.
+
+/* Chip Select control for the CS8415A SPI slave device */
+static void cs8415a_cs_control(u32 command)
+{
+ if (command & PXA2XX_CS_ASSERT)
+ GPCR(2) = GPIO_bit(2);
+ else
+ GPSR(2) = GPIO_bit(2);
+}
+
+/* Chip Select control for the CS8405A SPI slave device */
+static void cs8405a_cs_control(u32 command)
+{
+ if (command & PXA2XX_CS_ASSERT)
+ GPCR(3) = GPIO_bit(3);
+ else
+ GPSR(3) = GPIO_bit(3);
+}
+
+static struct pxa2xx_spi_chip cs8415a_chip_info = {
+ .tx_threshold = 8, /* SSP hardward FIFO threshold */
+ .rx_threshold = 8, /* SSP hardward FIFO threshold */
+ .dma_burst_size = 8, /* Byte wide transfers used so 8 byte bursts */
+ .timeout = 235, /* See Intel documentation */
+ .cs_control = cs8415a_cs_control, /* Use external chip select */
+};
+
+static struct pxa2xx_spi_chip cs8405a_chip_info = {
+ .tx_threshold = 8, /* SSP hardward FIFO threshold */
+ .rx_threshold = 8, /* SSP hardward FIFO threshold */
+ .dma_burst_size = 8, /* Byte wide transfers used so 8 byte bursts */
+ .timeout = 235, /* See Intel documentation */
+ .cs_control = cs8405a_cs_control, /* Use external chip select */
+};
+
+static struct spi_board_info streetracer_spi_board_info[] __initdata = {
+ {
+ .modalias = "cs8415a", /* Name of spi_driver for this device */
+ .max_speed_hz = 3686400, /* Run SSP as fast a possbile */
+ .bus_num = 2, /* Framework bus number */
+ .chip_select = 0, /* Framework chip select */
+ .platform_data = NULL; /* No spi_driver specific config */
+ .controller_data = &cs8415a_chip_info, /* Master chip config */
+ .irq = STREETRACER_APCI_IRQ, /* Slave device interrupt */
+ },
+ {
+ .modalias = "cs8405a", /* Name of spi_driver for this device */
+ .max_speed_hz = 3686400, /* Run SSP as fast a possbile */
+ .bus_num = 2, /* Framework bus number */
+ .chip_select = 1, /* Framework chip select */
+ .controller_data = &cs8405a_chip_info, /* Master chip config */
+ .irq = STREETRACER_APCI_IRQ, /* Slave device interrupt */
+ },
+};
+
+static void __init streetracer_init(void)
+{
+ spi_register_board_info(streetracer_spi_board_info,
+ ARRAY_SIZE(streetracer_spi_board_info));
+}
+
+
+DMA and PIO I/O Support
+-----------------------
+The pxa2xx_spi driver supports both DMA and interrupt driven PIO message
+transfers. The driver defaults to PIO mode and DMA transfers must be enabled
+by setting the "enable_dma" flag in the "pxa2xx_spi_master" structure. The DMA
+mode supports both coherent and stream based DMA mappings.
+
+The following logic is used to determine the type of I/O to be used on
+a per "spi_transfer" basis:
+
+if !enable_dma then
+ always use PIO transfers
+
+if spi_message.len > 8191 then
+ print "rate limited" warning
+ use PIO transfers
+
+if spi_message.is_dma_mapped and rx_dma_buf != 0 and tx_dma_buf != 0 then
+ use coherent DMA mode
+
+if rx_buf and tx_buf are aligned on 8 byte boundary then
+ use streaming DMA mode
+
+otherwise
+ use PIO transfer
+
+THANKS TO
+---------
+
+David Brownell and others for mentoring the development of this driver.
+
diff --git a/Documentation/spi/spi-lm70llp b/Documentation/spi/spi-lm70llp
new file mode 100644
index 0000000..154bd02
--- /dev/null
+++ b/Documentation/spi/spi-lm70llp
@@ -0,0 +1,69 @@
+spi_lm70llp : LM70-LLP parport-to-SPI adapter
+==============================================
+
+Supported board/chip:
+ * National Semiconductor LM70 LLP evaluation board
+ Datasheet: http://www.national.com/pf/LM/LM70.html
+
+Author:
+ Kaiwan N Billimoria <kaiwan@designergraphix.com>
+
+Description
+-----------
+This driver provides glue code connecting a National Semiconductor LM70 LLP
+temperature sensor evaluation board to the kernel's SPI core subsystem.
+
+In effect, this driver turns the parallel port interface on the eval board
+into a SPI bus with a single device, which will be driven by the generic
+LM70 driver (drivers/hwmon/lm70.c).
+
+The hardware interfacing on the LM70 LLP eval board is as follows:
+
+ Parallel LM70 LLP
+ Port Direction JP2 Header
+ ----------- --------- ----------------
+ D0 2 - -
+ D1 3 --> V+ 5
+ D2 4 --> V+ 5
+ D3 5 --> V+ 5
+ D4 6 --> V+ 5
+ D5 7 --> nCS 8
+ D6 8 --> SCLK 3
+ D7 9 --> SI/O 5
+ GND 25 - GND 7
+ Select 13 <-- SI/O 1
+ ----------- --------- ----------------
+
+Note that since the LM70 uses a "3-wire" variant of SPI, the SI/SO pin
+is connected to both pin D7 (as Master Out) and Select (as Master In)
+using an arrangment that lets either the parport or the LM70 pull the
+pin low. This can't be shared with true SPI devices, but other 3-wire
+devices might share the same SI/SO pin.
+
+The bitbanger routine in this driver (lm70_txrx) is called back from
+the bound "hwmon/lm70" protocol driver through its sysfs hook, using a
+spi_write_then_read() call. It performs Mode 0 (SPI/Microwire) bitbanging.
+The lm70 driver then inteprets the resulting digital temperature value
+and exports it through sysfs.
+
+A "gotcha": National Semiconductor's LM70 LLP eval board circuit schematic
+shows that the SI/O line from the LM70 chip is connected to the base of a
+transistor Q1 (and also a pullup, and a zener diode to D7); while the
+collector is tied to VCC.
+
+Interpreting this circuit, when the LM70 SI/O line is High (or tristate
+and not grounded by the host via D7), the transistor conducts and switches
+the collector to zero, which is reflected on pin 13 of the DB25 parport
+connector. When SI/O is Low (driven by the LM70 or the host) on the other
+hand, the transistor is cut off and the voltage tied to it's collector is
+reflected on pin 13 as a High level.
+
+So: the getmiso inline routine in this driver takes this fact into account,
+inverting the value read at pin 13.
+
+
+Thanks to
+---------
+o David Brownell for mentoring the SPI-side driver development.
+o Dr.Craig Hollabaugh for the (early) "manual" bitbanging driver version.
+o Nadir Billimoria for help interpreting the circuit schematic.
diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
new file mode 100644
index 0000000..0f5122e
--- /dev/null
+++ b/Documentation/spi/spi-summary
@@ -0,0 +1,558 @@
+Overview of Linux kernel SPI support
+====================================
+
+21-May-2007
+
+What is SPI?
+------------
+The "Serial Peripheral Interface" (SPI) is a synchronous four wire serial
+link used to connect microcontrollers to sensors, memory, and peripherals.
+It's a simple "de facto" standard, not complicated enough to acquire a
+standardization body. SPI uses a master/slave configuration.
+
+The three signal wires hold a clock (SCK, often on the order of 10 MHz),
+and parallel data lines with "Master Out, Slave In" (MOSI) or "Master In,
+Slave Out" (MISO) signals. (Other names are also used.) There are four
+clocking modes through which data is exchanged; mode-0 and mode-3 are most
+commonly used. Each clock cycle shifts data out and data in; the clock
+doesn't cycle except when there is a data bit to shift. Not all data bits
+are used though; not every protocol uses those full duplex capabilities.
+
+SPI masters use a fourth "chip select" line to activate a given SPI slave
+device, so those three signal wires may be connected to several chips
+in parallel. All SPI slaves support chipselects; they are usually active
+low signals, labeled nCSx for slave 'x' (e.g. nCS0). Some devices have
+other signals, often including an interrupt to the master.
+
+Unlike serial busses like USB or SMBus, even low level protocols for
+SPI slave functions are usually not interoperable between vendors
+(except for commodities like SPI memory chips).
+
+ - SPI may be used for request/response style device protocols, as with
+ touchscreen sensors and memory chips.
+
+ - It may also be used to stream data in either direction (half duplex),
+ or both of them at the same time (full duplex).
+
+ - Some devices may use eight bit words. Others may different word
+ lengths, such as streams of 12-bit or 20-bit digital samples.
+
+ - Words are usually sent with their most significant bit (MSB) first,
+ but sometimes the least significant bit (LSB) goes first instead.
+
+ - Sometimes SPI is used to daisy-chain devices, like shift registers.
+
+In the same way, SPI slaves will only rarely support any kind of automatic
+discovery/enumeration protocol. The tree of slave devices accessible from
+a given SPI master will normally be set up manually, with configuration
+tables.
+
+SPI is only one of the names used by such four-wire protocols, and
+most controllers have no problem handling "MicroWire" (think of it as
+half-duplex SPI, for request/response protocols), SSP ("Synchronous
+Serial Protocol"), PSP ("Programmable Serial Protocol"), and other
+related protocols.
+
+Some chips eliminate a signal line by combining MOSI and MISO, and
+limiting themselves to half-duplex at the hardware level. In fact
+some SPI chips have this signal mode as a strapping option. These
+can be accessed using the same programming interface as SPI, but of
+course they won't handle full duplex transfers. You may find such
+chips described as using "three wire" signaling: SCK, data, nCSx.
+(That data line is sometimes called MOMI or SISO.)
+
+Microcontrollers often support both master and slave sides of the SPI
+protocol. This document (and Linux) currently only supports the master
+side of SPI interactions.
+
+
+Who uses it? On what kinds of systems?
+---------------------------------------
+Linux developers using SPI are probably writing device drivers for embedded
+systems boards. SPI is used to control external chips, and it is also a
+protocol supported by every MMC or SD memory card. (The older "DataFlash"
+cards, predating MMC cards but using the same connectors and card shape,
+support only SPI.) Some PC hardware uses SPI flash for BIOS code.
+
+SPI slave chips range from digital/analog converters used for analog
+sensors and codecs, to memory, to peripherals like USB controllers
+or Ethernet adapters; and more.
+
+Most systems using SPI will integrate a few devices on a mainboard.
+Some provide SPI links on expansion connectors; in cases where no
+dedicated SPI controller exists, GPIO pins can be used to create a
+low speed "bitbanging" adapter. Very few systems will "hotplug" an SPI
+controller; the reasons to use SPI focus on low cost and simple operation,
+and if dynamic reconfiguration is important, USB will often be a more
+appropriate low-pincount peripheral bus.
+
+Many microcontrollers that can run Linux integrate one or more I/O
+interfaces with SPI modes. Given SPI support, they could use MMC or SD
+cards without needing a special purpose MMC/SD/SDIO controller.
+
+
+I'm confused. What are these four SPI "clock modes"?
+-----------------------------------------------------
+It's easy to be confused here, and the vendor documentation you'll
+find isn't necessarily helpful. The four modes combine two mode bits:
+
+ - CPOL indicates the initial clock polarity. CPOL=0 means the
+ clock starts low, so the first (leading) edge is rising, and
+ the second (trailing) edge is falling. CPOL=1 means the clock
+ starts high, so the first (leading) edge is falling.
+
+ - CPHA indicates the clock phase used to sample data; CPHA=0 says
+ sample on the leading edge, CPHA=1 means the trailing edge.
+
+ Since the signal needs to stablize before it's sampled, CPHA=0
+ implies that its data is written half a clock before the first
+ clock edge. The chipselect may have made it become available.
+
+Chip specs won't always say "uses SPI mode X" in as many words,
+but their timing diagrams will make the CPOL and CPHA modes clear.
+
+In the SPI mode number, CPOL is the high order bit and CPHA is the
+low order bit. So when a chip's timing diagram shows the clock
+starting low (CPOL=0) and data stabilized for sampling during the
+trailing clock edge (CPHA=1), that's SPI mode 1.
+
+Note that the clock mode is relevant as soon as the chipselect goes
+active. So the master must set the clock to inactive before selecting
+a slave, and the slave can tell the chosen polarity by sampling the
+clock level when its select line goes active. That's why many devices
+support for example both modes 0 and 3: they don't care about polarity,
+and alway clock data in/out on rising clock edges.
+
+
+How do these driver programming interfaces work?
+------------------------------------------------
+The <linux/spi/spi.h> header file includes kerneldoc, as does the
+main source code, and you should certainly read that chapter of the
+kernel API document. This is just an overview, so you get the big
+picture before those details.
+
+SPI requests always go into I/O queues. Requests for a given SPI device
+are always executed in FIFO order, and complete asynchronously through
+completion callbacks. There are also some simple synchronous wrappers
+for those calls, including ones for common transaction types like writing
+a command and then reading its response.
+
+There are two types of SPI driver, here called:
+
+ Controller drivers ... controllers may be built in to System-On-Chip
+ processors, and often support both Master and Slave roles.
+ These drivers touch hardware registers and may use DMA.
+ Or they can be PIO bitbangers, needing just GPIO pins.
+
+ Protocol drivers ... these pass messages through the controller
+ driver to communicate with a Slave or Master device on the
+ other side of an SPI link.
+
+So for example one protocol driver might talk to the MTD layer to export
+data to filesystems stored on SPI flash like DataFlash; and others might
+control audio interfaces, present touchscreen sensors as input interfaces,
+or monitor temperature and voltage levels during industrial processing.
+And those might all be sharing the same controller driver.
+
+A "struct spi_device" encapsulates the master-side interface between
+those two types of driver. At this writing, Linux has no slave side
+programming interface.
+
+There is a minimal core of SPI programming interfaces, focussing on
+using the driver model to connect controller and protocol drivers using
+device tables provided by board specific initialization code. SPI
+shows up in sysfs in several locations:
+
+ /sys/devices/.../CTLR ... physical node for a given SPI controller
+
+ /sys/devices/.../CTLR/spiB.C ... spi_device on bus "B",
+ chipselect C, accessed through CTLR.
+
+ /sys/bus/spi/devices/spiB.C ... symlink to that physical
+ .../CTLR/spiB.C device
+
+ /sys/devices/.../CTLR/spiB.C/modalias ... identifies the driver
+ that should be used with this device (for hotplug/coldplug)
+
+ /sys/bus/spi/drivers/D ... driver for one or more spi*.* devices
+
+ /sys/class/spi_master/spiB ... symlink (or actual device node) to
+ a logical node which could hold class related state for the
+ controller managing bus "B". All spiB.* devices share one
+ physical SPI bus segment, with SCLK, MOSI, and MISO.
+
+Note that the actual location of the controller's class state depends
+on whether you enabled CONFIG_SYSFS_DEPRECATED or not. At this time,
+the only class-specific state is the bus number ("B" in "spiB"), so
+those /sys/class entries are only useful to quickly identify busses.
+
+
+How does board-specific init code declare SPI devices?
+------------------------------------------------------
+Linux needs several kinds of information to properly configure SPI devices.
+That information is normally provided by board-specific code, even for
+chips that do support some of automated discovery/enumeration.
+
+DECLARE CONTROLLERS
+
+The first kind of information is a list of what SPI controllers exist.
+For System-on-Chip (SOC) based boards, these will usually be platform
+devices, and the controller may need some platform_data in order to
+operate properly. The "struct platform_device" will include resources
+like the physical address of the controller's first register and its IRQ.
+
+Platforms will often abstract the "register SPI controller" operation,
+maybe coupling it with code to initialize pin configurations, so that
+the arch/.../mach-*/board-*.c files for several boards can all share the
+same basic controller setup code. This is because most SOCs have several
+SPI-capable controllers, and only the ones actually usable on a given
+board should normally be set up and registered.
+
+So for example arch/.../mach-*/board-*.c files might have code like:
+
+ #include <mach/spi.h> /* for mysoc_spi_data */
+
+ /* if your mach-* infrastructure doesn't support kernels that can
+ * run on multiple boards, pdata wouldn't benefit from "__init".
+ */
+ static struct mysoc_spi_data __initdata pdata = { ... };
+
+ static __init board_init(void)
+ {
+ ...
+ /* this board only uses SPI controller #2 */
+ mysoc_register_spi(2, &pdata);
+ ...
+ }
+
+And SOC-specific utility code might look something like:
+
+ #include <mach/spi.h>
+
+ static struct platform_device spi2 = { ... };
+
+ void mysoc_register_spi(unsigned n, struct mysoc_spi_data *pdata)
+ {
+ struct mysoc_spi_data *pdata2;
+
+ pdata2 = kmalloc(sizeof *pdata2, GFP_KERNEL);
+ *pdata2 = pdata;
+ ...
+ if (n == 2) {
+ spi2->dev.platform_data = pdata2;
+ register_platform_device(&spi2);
+
+ /* also: set up pin modes so the spi2 signals are
+ * visible on the relevant pins ... bootloaders on
+ * production boards may already have done this, but
+ * developer boards will often need Linux to do it.
+ */
+ }
+ ...
+ }
+
+Notice how the platform_data for boards may be different, even if the
+same SOC controller is used. For example, on one board SPI might use
+an external clock, where another derives the SPI clock from current
+settings of some master clock.
+
+
+DECLARE SLAVE DEVICES
+
+The second kind of information is a list of what SPI slave devices exist
+on the target board, often with some board-specific data needed for the
+driver to work correctly.
+
+Normally your arch/.../mach-*/board-*.c files would provide a small table
+listing the SPI devices on each board. (This would typically be only a
+small handful.) That might look like:
+
+ static struct ads7846_platform_data ads_info = {
+ .vref_delay_usecs = 100,
+ .x_plate_ohms = 580,
+ .y_plate_ohms = 410,
+ };
+
+ static struct spi_board_info spi_board_info[] __initdata = {
+ {
+ .modalias = "ads7846",
+ .platform_data = &ads_info,
+ .mode = SPI_MODE_0,
+ .irq = GPIO_IRQ(31),
+ .max_speed_hz = 120000 /* max sample rate at 3V */ * 16,
+ .bus_num = 1,
+ .chip_select = 0,
+ },
+ };
+
+Again, notice how board-specific information is provided; each chip may need
+several types. This example shows generic constraints like the fastest SPI
+clock to allow (a function of board voltage in this case) or how an IRQ pin
+is wired, plus chip-specific constraints like an important delay that's
+changed by the capacitance at one pin.
+
+(There's also "controller_data", information that may be useful to the
+controller driver. An example would be peripheral-specific DMA tuning
+data or chipselect callbacks. This is stored in spi_device later.)
+
+The board_info should provide enough information to let the system work
+without the chip's driver being loaded. The most troublesome aspect of
+that is likely the SPI_CS_HIGH bit in the spi_device.mode field, since
+sharing a bus with a device that interprets chipselect "backwards" is
+not possible until the infrastructure knows how to deselect it.
+
+Then your board initialization code would register that table with the SPI
+infrastructure, so that it's available later when the SPI master controller
+driver is registered:
+
+ spi_register_board_info(spi_board_info, ARRAY_SIZE(spi_board_info));
+
+Like with other static board-specific setup, you won't unregister those.
+
+The widely used "card" style computers bundle memory, cpu, and little else
+onto a card that's maybe just thirty square centimeters. On such systems,
+your arch/.../mach-.../board-*.c file would primarily provide information
+about the devices on the mainboard into which such a card is plugged. That
+certainly includes SPI devices hooked up through the card connectors!
+
+
+NON-STATIC CONFIGURATIONS
+
+Developer boards often play by different rules than product boards, and one
+example is the potential need to hotplug SPI devices and/or controllers.
+
+For those cases you might need to use spi_busnum_to_master() to look
+up the spi bus master, and will likely need spi_new_device() to provide the
+board info based on the board that was hotplugged. Of course, you'd later
+call at least spi_unregister_device() when that board is removed.
+
+When Linux includes support for MMC/SD/SDIO/DataFlash cards through SPI, those
+configurations will also be dynamic. Fortunately, such devices all support
+basic device identification probes, so they should hotplug normally.
+
+
+How do I write an "SPI Protocol Driver"?
+----------------------------------------
+Most SPI drivers are currently kernel drivers, but there's also support
+for userspace drivers. Here we talk only about kernel drivers.
+
+SPI protocol drivers somewhat resemble platform device drivers:
+
+ static struct spi_driver CHIP_driver = {
+ .driver = {
+ .name = "CHIP",
+ .owner = THIS_MODULE,
+ },
+
+ .probe = CHIP_probe,
+ .remove = __devexit_p(CHIP_remove),
+ .suspend = CHIP_suspend,
+ .resume = CHIP_resume,
+ };
+
+The driver core will autmatically attempt to bind this driver to any SPI
+device whose board_info gave a modalias of "CHIP". Your probe() code
+might look like this unless you're creating a device which is managing
+a bus (appearing under /sys/class/spi_master).
+
+ static int __devinit CHIP_probe(struct spi_device *spi)
+ {
+ struct CHIP *chip;
+ struct CHIP_platform_data *pdata;
+
+ /* assuming the driver requires board-specific data: */
+ pdata = &spi->dev.platform_data;
+ if (!pdata)
+ return -ENODEV;
+
+ /* get memory for driver's per-chip state */
+ chip = kzalloc(sizeof *chip, GFP_KERNEL);
+ if (!chip)
+ return -ENOMEM;
+ spi_set_drvdata(spi, chip);
+
+ ... etc
+ return 0;
+ }
+
+As soon as it enters probe(), the driver may issue I/O requests to
+the SPI device using "struct spi_message". When remove() returns,
+or after probe() fails, the driver guarantees that it won't submit
+any more such messages.
+
+ - An spi_message is a sequence of protocol operations, executed
+ as one atomic sequence. SPI driver controls include:
+
+ + when bidirectional reads and writes start ... by how its
+ sequence of spi_transfer requests is arranged;
+
+ + which I/O buffers are used ... each spi_transfer wraps a
+ buffer for each transfer direction, supporting full duplex
+ (two pointers, maybe the same one in both cases) and half
+ duplex (one pointer is NULL) transfers;
+
+ + optionally defining short delays after transfers ... using
+ the spi_transfer.delay_usecs setting (this delay can be the
+ only protocol effect, if the buffer length is zero);
+
+ + whether the chipselect becomes inactive after a transfer and
+ any delay ... by using the spi_transfer.cs_change flag;
+
+ + hinting whether the next message is likely to go to this same
+ device ... using the spi_transfer.cs_change flag on the last
+ transfer in that atomic group, and potentially saving costs
+ for chip deselect and select operations.
+
+ - Follow standard kernel rules, and provide DMA-safe buffers in
+ your messages. That way controller drivers using DMA aren't forced
+ to make extra copies unless the hardware requires it (e.g. working
+ around hardware errata that force the use of bounce buffering).
+
+ If standard dma_map_single() handling of these buffers is inappropriate,
+ you can use spi_message.is_dma_mapped to tell the controller driver
+ that you've already provided the relevant DMA addresses.
+
+ - The basic I/O primitive is spi_async(). Async requests may be
+ issued in any context (irq handler, task, etc) and completion
+ is reported using a callback provided with the message.
+ After any detected error, the chip is deselected and processing
+ of that spi_message is aborted.
+
+ - There are also synchronous wrappers like spi_sync(), and wrappers
+ like spi_read(), spi_write(), and spi_write_then_read(). These
+ may be issued only in contexts that may sleep, and they're all
+ clean (and small, and "optional") layers over spi_async().
+
+ - The spi_write_then_read() call, and convenience wrappers around
+ it, should only be used with small amounts of data where the
+ cost of an extra copy may be ignored. It's designed to support
+ common RPC-style requests, such as writing an eight bit command
+ and reading a sixteen bit response -- spi_w8r16() being one its
+ wrappers, doing exactly that.
+
+Some drivers may need to modify spi_device characteristics like the
+transfer mode, wordsize, or clock rate. This is done with spi_setup(),
+which would normally be called from probe() before the first I/O is
+done to the device. However, that can also be called at any time
+that no message is pending for that device.
+
+While "spi_device" would be the bottom boundary of the driver, the
+upper boundaries might include sysfs (especially for sensor readings),
+the input layer, ALSA, networking, MTD, the character device framework,
+or other Linux subsystems.
+
+Note that there are two types of memory your driver must manage as part
+of interacting with SPI devices.
+
+ - I/O buffers use the usual Linux rules, and must be DMA-safe.
+ You'd normally allocate them from the heap or free page pool.
+ Don't use the stack, or anything that's declared "static".
+
+ - The spi_message and spi_transfer metadata used to glue those
+ I/O buffers into a group of protocol transactions. These can
+ be allocated anywhere it's convenient, including as part of
+ other allocate-once driver data structures. Zero-init these.
+
+If you like, spi_message_alloc() and spi_message_free() convenience
+routines are available to allocate and zero-initialize an spi_message
+with several transfers.
+
+
+How do I write an "SPI Master Controller Driver"?
+-------------------------------------------------
+An SPI controller will probably be registered on the platform_bus; write
+a driver to bind to the device, whichever bus is involved.
+
+The main task of this type of driver is to provide an "spi_master".
+Use spi_alloc_master() to allocate the master, and spi_master_get_devdata()
+to get the driver-private data allocated for that device.
+
+ struct spi_master *master;
+ struct CONTROLLER *c;
+
+ master = spi_alloc_master(dev, sizeof *c);
+ if (!master)
+ return -ENODEV;
+
+ c = spi_master_get_devdata(master);
+
+The driver will initialize the fields of that spi_master, including the
+bus number (maybe the same as the platform device ID) and three methods
+used to interact with the SPI core and SPI protocol drivers. It will
+also initialize its own internal state. (See below about bus numbering
+and those methods.)
+
+After you initialize the spi_master, then use spi_register_master() to
+publish it to the rest of the system. At that time, device nodes for
+the controller and any predeclared spi devices will be made available,
+and the driver model core will take care of binding them to drivers.
+
+If you need to remove your SPI controller driver, spi_unregister_master()
+will reverse the effect of spi_register_master().
+
+
+BUS NUMBERING
+
+Bus numbering is important, since that's how Linux identifies a given
+SPI bus (shared SCK, MOSI, MISO). Valid bus numbers start at zero. On
+SOC systems, the bus numbers should match the numbers defined by the chip
+manufacturer. For example, hardware controller SPI2 would be bus number 2,
+and spi_board_info for devices connected to it would use that number.
+
+If you don't have such hardware-assigned bus number, and for some reason
+you can't just assign them, then provide a negative bus number. That will
+then be replaced by a dynamically assigned number. You'd then need to treat
+this as a non-static configuration (see above).
+
+
+SPI MASTER METHODS
+
+ master->setup(struct spi_device *spi)
+ This sets up the device clock rate, SPI mode, and word sizes.
+ Drivers may change the defaults provided by board_info, and then
+ call spi_setup(spi) to invoke this routine. It may sleep.
+ Unless each SPI slave has its own configuration registers, don't
+ change them right away ... otherwise drivers could corrupt I/O
+ that's in progress for other SPI devices.
+
+ master->transfer(struct spi_device *spi, struct spi_message *message)
+ This must not sleep. Its responsibility is arrange that the
+ transfer happens and its complete() callback is issued. The two
+ will normally happen later, after other transfers complete, and
+ if the controller is idle it will need to be kickstarted.
+
+ master->cleanup(struct spi_device *spi)
+ Your controller driver may use spi_device.controller_state to hold
+ state it dynamically associates with that device. If you do that,
+ be sure to provide the cleanup() method to free that state.
+
+
+SPI MESSAGE QUEUE
+
+The bulk of the driver will be managing the I/O queue fed by transfer().
+
+That queue could be purely conceptual. For example, a driver used only
+for low-frequency sensor acess might be fine using synchronous PIO.
+
+But the queue will probably be very real, using message->queue, PIO,
+often DMA (especially if the root filesystem is in SPI flash), and
+execution contexts like IRQ handlers, tasklets, or workqueues (such
+as keventd). Your driver can be as fancy, or as simple, as you need.
+Such a transfer() method would normally just add the message to a
+queue, and then start some asynchronous transfer engine (unless it's
+already running).
+
+
+THANKS TO
+---------
+Contributors to Linux-SPI discussions include (in alphabetical order,
+by last name):
+
+David Brownell
+Russell King
+Dmitry Pervushin
+Stephen Street
+Mark Underwood
+Andrew Victor
+Vitaly Wool
+
diff --git a/Documentation/spi/spidev b/Documentation/spi/spidev
new file mode 100644
index 0000000..ed2da5e
--- /dev/null
+++ b/Documentation/spi/spidev
@@ -0,0 +1,143 @@
+SPI devices have a limited userspace API, supporting basic half-duplex
+read() and write() access to SPI slave devices. Using ioctl() requests,
+full duplex transfers and device I/O configuration are also available.
+
+ #include <fcntl.h>
+ #include <unistd.h>
+ #include <sys/ioctl.h>
+ #include <linux/types.h>
+ #include <linux/spi/spidev.h>
+
+Some reasons you might want to use this programming interface include:
+
+ * Prototyping in an environment that's not crash-prone; stray pointers
+ in userspace won't normally bring down any Linux system.
+
+ * Developing simple protocols used to talk to microcontrollers acting
+ as SPI slaves, which you may need to change quite often.
+
+Of course there are drivers that can never be written in userspace, because
+they need to access kernel interfaces (such as IRQ handlers or other layers
+of the driver stack) that are not accessible to userspace.
+
+
+DEVICE CREATION, DRIVER BINDING
+===============================
+The simplest way to arrange to use this driver is to just list it in the
+spi_board_info for a device as the driver it should use: the "modalias"
+entry is "spidev", matching the name of the driver exposing this API.
+Set up the other device characteristics (bits per word, SPI clocking,
+chipselect polarity, etc) as usual, so you won't always need to override
+them later.
+
+(Sysfs also supports userspace driven binding/unbinding of drivers to
+devices. That mechanism might be supported here in the future.)
+
+When you do that, the sysfs node for the SPI device will include a child
+device node with a "dev" attribute that will be understood by udev or mdev.
+(Larger systems will have "udev". Smaller ones may configure "mdev" into
+busybox; it's less featureful, but often enough.) For a SPI device with
+chipselect C on bus B, you should see:
+
+ /dev/spidevB.C ... character special device, major number 153 with
+ a dynamically chosen minor device number. This is the node
+ that userspace programs will open, created by "udev" or "mdev".
+
+ /sys/devices/.../spiB.C ... as usual, the SPI device node will
+ be a child of its SPI master controller.
+
+ /sys/class/spidev/spidevB.C ... created when the "spidev" driver
+ binds to that device. (Directory or symlink, based on whether
+ or not you enabled the "deprecated sysfs files" Kconfig option.)
+
+Do not try to manage the /dev character device special file nodes by hand.
+That's error prone, and you'd need to pay careful attention to system
+security issues; udev/mdev should already be configured securely.
+
+If you unbind the "spidev" driver from that device, those two "spidev" nodes
+(in sysfs and in /dev) should automatically be removed (respectively by the
+kernel and by udev/mdev). You can unbind by removing the "spidev" driver
+module, which will affect all devices using this driver. You can also unbind
+by having kernel code remove the SPI device, probably by removing the driver
+for its SPI controller (so its spi_master vanishes).
+
+Since this is a standard Linux device driver -- even though it just happens
+to expose a low level API to userspace -- it can be associated with any number
+of devices at a time. Just provide one spi_board_info record for each such
+SPI device, and you'll get a /dev device node for each device.
+
+
+BASIC CHARACTER DEVICE API
+==========================
+Normal open() and close() operations on /dev/spidevB.D files work as you
+would expect.
+
+Standard read() and write() operations are obviously only half-duplex, and
+the chipselect is deactivated between those operations. Full-duplex access,
+and composite operation without chipselect de-activation, is available using
+the SPI_IOC_MESSAGE(N) request.
+
+Several ioctl() requests let your driver read or override the device's current
+settings for data transfer parameters:
+
+ SPI_IOC_RD_MODE, SPI_IOC_WR_MODE ... pass a pointer to a byte which will
+ return (RD) or assign (WR) the SPI transfer mode. Use the constants
+ SPI_MODE_0..SPI_MODE_3; or if you prefer you can combine SPI_CPOL
+ (clock polarity, idle high iff this is set) or SPI_CPHA (clock phase,
+ sample on trailing edge iff this is set) flags.
+
+ SPI_IOC_RD_LSB_FIRST, SPI_IOC_WR_LSB_FIRST ... pass a pointer to a byte
+ which will return (RD) or assign (WR) the bit justification used to
+ transfer SPI words. Zero indicates MSB-first; other values indicate
+ the less common LSB-first encoding. In both cases the specified value
+ is right-justified in each word, so that unused (TX) or undefined (RX)
+ bits are in the MSBs.
+
+ SPI_IOC_RD_BITS_PER_WORD, SPI_IOC_WR_BITS_PER_WORD ... pass a pointer to
+ a byte which will return (RD) or assign (WR) the number of bits in
+ each SPI transfer word. The value zero signifies eight bits.
+
+ SPI_IOC_RD_MAX_SPEED_HZ, SPI_IOC_WR_MAX_SPEED_HZ ... pass a pointer to a
+ u32 which will return (RD) or assign (WR) the maximum SPI transfer
+ speed, in Hz. The controller can't necessarily assign that specific
+ clock speed.
+
+NOTES:
+
+ - At this time there is no async I/O support; everything is purely
+ synchronous.
+
+ - There's currently no way to report the actual bit rate used to
+ shift data to/from a given device.
+
+ - From userspace, you can't currently change the chip select polarity;
+ that could corrupt transfers to other devices sharing the SPI bus.
+ Each SPI device is deselected when it's not in active use, allowing
+ other drivers to talk to other devices.
+
+ - There's a limit on the number of bytes each I/O request can transfer
+ to the SPI device. It defaults to one page, but that can be changed
+ using a module parameter.
+
+ - Because SPI has no low-level transfer acknowledgement, you usually
+ won't see any I/O errors when talking to a non-existent device.
+
+
+FULL DUPLEX CHARACTER DEVICE API
+================================
+
+See the spidev_fdx.c sample program for one example showing the use of the
+full duplex programming interface. (Although it doesn't perform a full duplex
+transfer.) The model is the same as that used in the kernel spi_sync()
+request; the individual transfers offer the same capabilities as are
+available to kernel drivers (except that it's not asynchronous).
+
+The example shows one half-duplex RPC-style request and response message.
+These requests commonly require that the chip not be deselected between
+the request and response. Several such requests could be chained into
+a single kernel request, even allowing the chip to be deselected after
+each response. (Other protocol options include changing the word size
+and bitrate for each transfer segment.)
+
+To make a full duplex request, provide both rx_buf and tx_buf for the
+same transfer. It's even OK if those are the same buffer.
diff --git a/Documentation/spi/spidev_fdx.c b/Documentation/spi/spidev_fdx.c
new file mode 100644
index 0000000..fc354f7
--- /dev/null
+++ b/Documentation/spi/spidev_fdx.c
@@ -0,0 +1,158 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/types.h>
+#include <linux/spi/spidev.h>
+
+
+static int verbose;
+
+static void do_read(int fd, int len)
+{
+ unsigned char buf[32], *bp;
+ int status;
+
+ /* read at least 2 bytes, no more than 32 */
+ if (len < 2)
+ len = 2;
+ else if (len > sizeof(buf))
+ len = sizeof(buf);
+ memset(buf, 0, sizeof buf);
+
+ status = read(fd, buf, len);
+ if (status < 0) {
+ perror("read");
+ return;
+ }
+ if (status != len) {
+ fprintf(stderr, "short read\n");
+ return;
+ }
+
+ printf("read(%2d, %2d): %02x %02x,", len, status,
+ buf[0], buf[1]);
+ status -= 2;
+ bp = buf + 2;
+ while (status-- > 0)
+ printf(" %02x", *bp++);
+ printf("\n");
+}
+
+static void do_msg(int fd, int len)
+{
+ struct spi_ioc_transfer xfer[2];
+ unsigned char buf[32], *bp;
+ int status;
+
+ memset(xfer, 0, sizeof xfer);
+ memset(buf, 0, sizeof buf);
+
+ if (len > sizeof buf)
+ len = sizeof buf;
+
+ buf[0] = 0xaa;
+ xfer[0].tx_buf = (__u64) buf;
+ xfer[0].len = 1;
+
+ xfer[1].rx_buf = (__u64) buf;
+ xfer[1].len = len;
+
+ status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer);
+ if (status < 0) {
+ perror("SPI_IOC_MESSAGE");
+ return;
+ }
+
+ printf("response(%2d, %2d): ", len, status);
+ for (bp = buf; len; len--)
+ printf(" %02x", *bp++);
+ printf("\n");
+}
+
+static void dumpstat(const char *name, int fd)
+{
+ __u8 mode, lsb, bits;
+ __u32 speed;
+
+ if (ioctl(fd, SPI_IOC_RD_MODE, &mode) < 0) {
+ perror("SPI rd_mode");
+ return;
+ }
+ if (ioctl(fd, SPI_IOC_RD_LSB_FIRST, &lsb) < 0) {
+ perror("SPI rd_lsb_fist");
+ return;
+ }
+ if (ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits) < 0) {
+ perror("SPI bits_per_word");
+ return;
+ }
+ if (ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed) < 0) {
+ perror("SPI max_speed_hz");
+ return;
+ }
+
+ printf("%s: spi mode %d, %d bits %sper word, %d Hz max\n",
+ name, mode, bits, lsb ? "(lsb first) " : "", speed);
+}
+
+int main(int argc, char **argv)
+{
+ int c;
+ int readcount = 0;
+ int msglen = 0;
+ int fd;
+ const char *name;
+
+ while ((c = getopt(argc, argv, "hm:r:v")) != EOF) {
+ switch (c) {
+ case 'm':
+ msglen = atoi(optarg);
+ if (msglen < 0)
+ goto usage;
+ continue;
+ case 'r':
+ readcount = atoi(optarg);
+ if (readcount < 0)
+ goto usage;
+ continue;
+ case 'v':
+ verbose++;
+ continue;
+ case 'h':
+ case '?':
+usage:
+ fprintf(stderr,
+ "usage: %s [-h] [-m N] [-r N] /dev/spidevB.D\n",
+ argv[0]);
+ return 1;
+ }
+ }
+
+ if ((optind + 1) != argc)
+ goto usage;
+ name = argv[optind];
+
+ fd = open(name, O_RDWR);
+ if (fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ dumpstat(name, fd);
+
+ if (msglen)
+ do_msg(fd, msglen);
+
+ if (readcount)
+ do_read(fd, readcount);
+
+ close(fd);
+ return 0;
+}
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
new file mode 100644
index 0000000..cf0e3ce
--- /dev/null
+++ b/Documentation/spi/spidev_test.c
@@ -0,0 +1,202 @@
+/*
+ * SPI testing utility (using spidev driver)
+ *
+ * Copyright (c) 2007 MontaVista Software, Inc.
+ * Copyright (c) 2007 Anton Vorontsov <avorontsov@ru.mvista.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * Cross-compile with cross-gcc -I/path/to/cross-kernel/include
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/spi/spidev.h>
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+static void pabort(const char *s)
+{
+ perror(s);
+ abort();
+}
+
+static const char *device = "/dev/spidev1.1";
+static uint8_t mode;
+static uint8_t bits = 8;
+static uint32_t speed = 500000;
+static uint16_t delay;
+
+static void transfer(int fd)
+{
+ int ret;
+ uint8_t tx[] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x40, 0x00, 0x00, 0x00, 0x00, 0x95,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xDE, 0xAD, 0xBE, 0xEF, 0xBA, 0xAD,
+ 0xF0, 0x0D,
+ };
+ uint8_t rx[ARRAY_SIZE(tx)] = {0, };
+ struct spi_ioc_transfer tr = {
+ .tx_buf = (unsigned long)tx,
+ .rx_buf = (unsigned long)rx,
+ .len = ARRAY_SIZE(tx),
+ .delay_usecs = delay,
+ .speed_hz = speed,
+ .bits_per_word = bits,
+ };
+
+ ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr);
+ if (ret == 1)
+ pabort("can't send spi message");
+
+ for (ret = 0; ret < ARRAY_SIZE(tx); ret++) {
+ if (!(ret % 6))
+ puts("");
+ printf("%.2X ", rx[ret]);
+ }
+ puts("");
+}
+
+void print_usage(const char *prog)
+{
+ printf("Usage: %s [-DsbdlHOLC3]\n", prog);
+ puts(" -D --device device to use (default /dev/spidev1.1)\n"
+ " -s --speed max speed (Hz)\n"
+ " -d --delay delay (usec)\n"
+ " -b --bpw bits per word \n"
+ " -l --loop loopback\n"
+ " -H --cpha clock phase\n"
+ " -O --cpol clock polarity\n"
+ " -L --lsb least significant bit first\n"
+ " -C --cs-high chip select active high\n"
+ " -3 --3wire SI/SO signals shared\n");
+ exit(1);
+}
+
+void parse_opts(int argc, char *argv[])
+{
+ while (1) {
+ static const struct option lopts[] = {
+ { "device", 1, 0, 'D' },
+ { "speed", 1, 0, 's' },
+ { "delay", 1, 0, 'd' },
+ { "bpw", 1, 0, 'b' },
+ { "loop", 0, 0, 'l' },
+ { "cpha", 0, 0, 'H' },
+ { "cpol", 0, 0, 'O' },
+ { "lsb", 0, 0, 'L' },
+ { "cs-high", 0, 0, 'C' },
+ { "3wire", 0, 0, '3' },
+ { NULL, 0, 0, 0 },
+ };
+ int c;
+
+ c = getopt_long(argc, argv, "D:s:d:b:lHOLC3", lopts, NULL);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'D':
+ device = optarg;
+ break;
+ case 's':
+ speed = atoi(optarg);
+ break;
+ case 'd':
+ delay = atoi(optarg);
+ break;
+ case 'b':
+ bits = atoi(optarg);
+ break;
+ case 'l':
+ mode |= SPI_LOOP;
+ break;
+ case 'H':
+ mode |= SPI_CPHA;
+ break;
+ case 'O':
+ mode |= SPI_CPOL;
+ break;
+ case 'L':
+ mode |= SPI_LSB_FIRST;
+ break;
+ case 'C':
+ mode |= SPI_CS_HIGH;
+ break;
+ case '3':
+ mode |= SPI_3WIRE;
+ break;
+ default:
+ print_usage(argv[0]);
+ break;
+ }
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ int ret = 0;
+ int fd;
+
+ parse_opts(argc, argv);
+
+ fd = open(device, O_RDWR);
+ if (fd < 0)
+ pabort("can't open device");
+
+ /*
+ * spi mode
+ */
+ ret = ioctl(fd, SPI_IOC_WR_MODE, &mode);
+ if (ret == -1)
+ pabort("can't set spi mode");
+
+ ret = ioctl(fd, SPI_IOC_RD_MODE, &mode);
+ if (ret == -1)
+ pabort("can't get spi mode");
+
+ /*
+ * bits per word
+ */
+ ret = ioctl(fd, SPI_IOC_WR_BITS_PER_WORD, &bits);
+ if (ret == -1)
+ pabort("can't set bits per word");
+
+ ret = ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits);
+ if (ret == -1)
+ pabort("can't get bits per word");
+
+ /*
+ * max speed hz
+ */
+ ret = ioctl(fd, SPI_IOC_WR_MAX_SPEED_HZ, &speed);
+ if (ret == -1)
+ pabort("can't set max speed hz");
+
+ ret = ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed);
+ if (ret == -1)
+ pabort("can't get max speed hz");
+
+ printf("spi mode: %d\n", mode);
+ printf("bits per word: %d\n", bits);
+ printf("max speed: %d Hz (%d KHz)\n", speed, speed/1000);
+
+ transfer(fd);
+
+ close(fd);
+
+ return ret;
+}
diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt
new file mode 100644
index 0000000..619699d
--- /dev/null
+++ b/Documentation/spinlocks.txt
@@ -0,0 +1,236 @@
+SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED defeat lockdep state tracking and
+are hence deprecated.
+
+Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or
+__SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate for static
+initialization.
+
+Most of the time, you can simply turn:
+
+ static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
+
+into:
+
+ static DEFINE_SPINLOCK(xxx_lock);
+
+Static structure member variables go from:
+
+ struct foo bar {
+ .lock = SPIN_LOCK_UNLOCKED;
+ };
+
+to:
+
+ struct foo bar {
+ .lock = __SPIN_LOCK_UNLOCKED(bar.lock);
+ };
+
+Declaration of static rw_locks undergo a similar transformation.
+
+Dynamic initialization, when necessary, may be performed as
+demonstrated below.
+
+ spinlock_t xxx_lock;
+ rwlock_t xxx_rw_lock;
+
+ static int __init xxx_init(void)
+ {
+ spin_lock_init(&xxx_lock);
+ rwlock_init(&xxx_rw_lock);
+ ...
+ }
+
+ module_init(xxx_init);
+
+The following discussion is still valid, however, with the dynamic
+initialization of spinlocks or with DEFINE_SPINLOCK, etc., used
+instead of SPIN_LOCK_UNLOCKED.
+
+-----------------------
+
+On Fri, 2 Jan 1998, Doug Ledford wrote:
+>
+> I'm working on making the aic7xxx driver more SMP friendly (as well as
+> importing the latest FreeBSD sequencer code to have 7895 support) and wanted
+> to get some info from you. The goal here is to make the various routines
+> SMP safe as well as UP safe during interrupts and other manipulating
+> routines. So far, I've added a spin_lock variable to things like my queue
+> structs. Now, from what I recall, there are some spin lock functions I can
+> use to lock these spin locks from other use as opposed to a (nasty)
+> save_flags(); cli(); stuff; restore_flags(); construct. Where do I find
+> these routines and go about making use of them? Do they only lock on a
+> per-processor basis or can they also lock say an interrupt routine from
+> mucking with a queue if the queue routine was manipulating it when the
+> interrupt occurred, or should I still use a cli(); based construct on that
+> one?
+
+See <asm/spinlock.h>. The basic version is:
+
+ spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
+
+
+ unsigned long flags;
+
+ spin_lock_irqsave(&xxx_lock, flags);
+ ... critical section here ..
+ spin_unlock_irqrestore(&xxx_lock, flags);
+
+and the above is always safe. It will disable interrupts _locally_, but the
+spinlock itself will guarantee the global lock, so it will guarantee that
+there is only one thread-of-control within the region(s) protected by that
+lock.
+
+Note that it works well even under UP - the above sequence under UP
+essentially is just the same as doing a
+
+ unsigned long flags;
+
+ save_flags(flags); cli();
+ ... critical section ...
+ restore_flags(flags);
+
+so the code does _not_ need to worry about UP vs SMP issues: the spinlocks
+work correctly under both (and spinlocks are actually more efficient on
+architectures that allow doing the "save_flags + cli" in one go because I
+don't export that interface normally).
+
+NOTE NOTE NOTE! The reason the spinlock is so much faster than a global
+interrupt lock under SMP is exactly because it disables interrupts only on
+the local CPU. The spin-lock is safe only when you _also_ use the lock
+itself to do locking across CPU's, which implies that EVERYTHING that
+touches a shared variable has to agree about the spinlock they want to
+use.
+
+The above is usually pretty simple (you usually need and want only one
+spinlock for most things - using more than one spinlock can make things a
+lot more complex and even slower and is usually worth it only for
+sequences that you _know_ need to be split up: avoid it at all cost if you
+aren't sure). HOWEVER, it _does_ mean that if you have some code that does
+
+ cli();
+ .. critical section ..
+ sti();
+
+and another sequence that does
+
+ spin_lock_irqsave(flags);
+ .. critical section ..
+ spin_unlock_irqrestore(flags);
+
+then they are NOT mutually exclusive, and the critical regions can happen
+at the same time on two different CPU's. That's fine per se, but the
+critical regions had better be critical for different things (ie they
+can't stomp on each other).
+
+The above is a problem mainly if you end up mixing code - for example the
+routines in ll_rw_block() tend to use cli/sti to protect the atomicity of
+their actions, and if a driver uses spinlocks instead then you should
+think about issues like the above..
+
+This is really the only really hard part about spinlocks: once you start
+using spinlocks they tend to expand to areas you might not have noticed
+before, because you have to make sure the spinlocks correctly protect the
+shared data structures _everywhere_ they are used. The spinlocks are most
+easily added to places that are completely independent of other code (ie
+internal driver data structures that nobody else ever touches, for
+example).
+
+----
+
+Lesson 2: reader-writer spinlocks.
+
+If your data accesses have a very natural pattern where you usually tend
+to mostly read from the shared variables, the reader-writer locks
+(rw_lock) versions of the spinlocks are often nicer. They allow multiple
+readers to be in the same critical region at once, but if somebody wants
+to change the variables it has to get an exclusive write lock. The
+routines look the same as above:
+
+ rwlock_t xxx_lock = RW_LOCK_UNLOCKED;
+
+
+ unsigned long flags;
+
+ read_lock_irqsave(&xxx_lock, flags);
+ .. critical section that only reads the info ...
+ read_unlock_irqrestore(&xxx_lock, flags);
+
+ write_lock_irqsave(&xxx_lock, flags);
+ .. read and write exclusive access to the info ...
+ write_unlock_irqrestore(&xxx_lock, flags);
+
+The above kind of lock is useful for complex data structures like linked
+lists etc, especially when you know that most of the work is to just
+traverse the list searching for entries without changing the list itself,
+for example. Then you can use the read lock for that kind of list
+traversal, which allows many concurrent readers. Anything that _changes_
+the list will have to get the write lock.
+
+Note: you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
+time need to do any changes (even if you don't do it every time), you have
+to get the write-lock at the very beginning. I could fairly easily add a
+primitive to create a "upgradeable" read-lock, but it hasn't been an issue
+yet. Tell me if you'd want one.
+
+----
+
+Lesson 3: spinlocks revisited.
+
+The single spin-lock primitives above are by no means the only ones. They
+are the most safe ones, and the ones that work under all circumstances,
+but partly _because_ they are safe they are also fairly slow. They are
+much faster than a generic global cli/sti pair, but slower than they'd
+need to be, because they do have to disable interrupts (which is just a
+single instruction on a x86, but it's an expensive one - and on other
+architectures it can be worse).
+
+If you have a case where you have to protect a data structure across
+several CPU's and you want to use spinlocks you can potentially use
+cheaper versions of the spinlocks. IFF you know that the spinlocks are
+never used in interrupt handlers, you can use the non-irq versions:
+
+ spin_lock(&lock);
+ ...
+ spin_unlock(&lock);
+
+(and the equivalent read-write versions too, of course). The spinlock will
+guarantee the same kind of exclusive access, and it will be much faster.
+This is useful if you know that the data in question is only ever
+manipulated from a "process context", ie no interrupts involved.
+
+The reasons you mustn't use these versions if you have interrupts that
+play with the spinlock is that you can get deadlocks:
+
+ spin_lock(&lock);
+ ...
+ <- interrupt comes in:
+ spin_lock(&lock);
+
+where an interrupt tries to lock an already locked variable. This is ok if
+the other interrupt happens on another CPU, but it is _not_ ok if the
+interrupt happens on the same CPU that already holds the lock, because the
+lock will obviously never be released (because the interrupt is waiting
+for the lock, and the lock-holder is interrupted by the interrupt and will
+not continue until the interrupt has been processed).
+
+(This is also the reason why the irq-versions of the spinlocks only need
+to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
+on other CPU's, because an interrupt on another CPU doesn't interrupt the
+CPU that holds the lock, so the lock-holder can continue and eventually
+releases the lock).
+
+Note that you can be clever with read-write locks and interrupts. For
+example, if you know that the interrupt only ever gets a read-lock, then
+you can use a non-irq version of read locks everywhere - because they
+don't block on each other (and thus there is no dead-lock wrt interrupts.
+But when you do the write-lock, you have to use the irq-safe version.
+
+For an example of being clever with rw-locks, see the "waitqueue_lock"
+handling in kernel/sched.c - nothing ever _changes_ a wait-queue from
+within an interrupt, they only read the queue in order to know whom to
+wake up. So read-locks are safe (which is good: they are very common
+indeed), while write-locks need to protect themselves against interrupts.
+
+ Linus
+
+
diff --git a/Documentation/stable_api_nonsense.txt b/Documentation/stable_api_nonsense.txt
new file mode 100644
index 0000000..847b342
--- /dev/null
+++ b/Documentation/stable_api_nonsense.txt
@@ -0,0 +1,190 @@
+The Linux Kernel Driver Interface
+(all of your questions answered and then some)
+
+Greg Kroah-Hartman <greg@kroah.com>
+
+This is being written to try to explain why Linux does not have a binary
+kernel interface, nor does it have a stable kernel interface. Please
+realize that this article describes the _in kernel_ interfaces, not the
+kernel to userspace interfaces. The kernel to userspace interface is
+the one that application programs use, the syscall interface. That
+interface is _very_ stable over time, and will not break. I have old
+programs that were built on a pre 0.9something kernel that still work
+just fine on the latest 2.6 kernel release. That interface is the one
+that users and application programmers can count on being stable.
+
+
+Executive Summary
+-----------------
+You think you want a stable kernel interface, but you really do not, and
+you don't even know it. What you want is a stable running driver, and
+you get that only if your driver is in the main kernel tree. You also
+get lots of other good benefits if your driver is in the main kernel
+tree, all of which has made Linux into such a strong, stable, and mature
+operating system which is the reason you are using it in the first
+place.
+
+
+Intro
+-----
+
+It's only the odd person who wants to write a kernel driver that needs
+to worry about the in-kernel interfaces changing. For the majority of
+the world, they neither see this interface, nor do they care about it at
+all.
+
+First off, I'm not going to address _any_ legal issues about closed
+source, hidden source, binary blobs, source wrappers, or any other term
+that describes kernel drivers that do not have their source code
+released under the GPL. Please consult a lawyer if you have any legal
+questions, I'm a programmer and hence, I'm just going to be describing
+the technical issues here (not to make light of the legal issues, they
+are real, and you do need to be aware of them at all times.)
+
+So, there are two main topics here, binary kernel interfaces and stable
+kernel source interfaces. They both depend on each other, but we will
+discuss the binary stuff first to get it out of the way.
+
+
+Binary Kernel Interface
+-----------------------
+Assuming that we had a stable kernel source interface for the kernel, a
+binary interface would naturally happen too, right? Wrong. Please
+consider the following facts about the Linux kernel:
+ - Depending on the version of the C compiler you use, different kernel
+ data structures will contain different alignment of structures, and
+ possibly include different functions in different ways (putting
+ functions inline or not.) The individual function organization
+ isn't that important, but the different data structure padding is
+ very important.
+ - Depending on what kernel build options you select, a wide range of
+ different things can be assumed by the kernel:
+ - different structures can contain different fields
+ - Some functions may not be implemented at all, (i.e. some locks
+ compile away to nothing for non-SMP builds.)
+ - Memory within the kernel can be aligned in different ways,
+ depending on the build options.
+ - Linux runs on a wide range of different processor architectures.
+ There is no way that binary drivers from one architecture will run
+ on another architecture properly.
+
+Now a number of these issues can be addressed by simply compiling your
+module for the exact specific kernel configuration, using the same exact
+C compiler that the kernel was built with. This is sufficient if you
+want to provide a module for a specific release version of a specific
+Linux distribution. But multiply that single build by the number of
+different Linux distributions and the number of different supported
+releases of the Linux distribution and you quickly have a nightmare of
+different build options on different releases. Also realize that each
+Linux distribution release contains a number of different kernels, all
+tuned to different hardware types (different processor types and
+different options), so for even a single release you will need to create
+multiple versions of your module.
+
+Trust me, you will go insane over time if you try to support this kind
+of release, I learned this the hard way a long time ago...
+
+
+Stable Kernel Source Interfaces
+-------------------------------
+
+This is a much more "volatile" topic if you talk to people who try to
+keep a Linux kernel driver that is not in the main kernel tree up to
+date over time.
+
+Linux kernel development is continuous and at a rapid pace, never
+stopping to slow down. As such, the kernel developers find bugs in
+current interfaces, or figure out a better way to do things. If they do
+that, they then fix the current interfaces to work better. When they do
+so, function names may change, structures may grow or shrink, and
+function parameters may be reworked. If this happens, all of the
+instances of where this interface is used within the kernel are fixed up
+at the same time, ensuring that everything continues to work properly.
+
+As a specific examples of this, the in-kernel USB interfaces have
+undergone at least three different reworks over the lifetime of this
+subsystem. These reworks were done to address a number of different
+issues:
+ - A change from a synchronous model of data streams to an asynchronous
+ one. This reduced the complexity of a number of drivers and
+ increased the throughput of all USB drivers such that we are now
+ running almost all USB devices at their maximum speed possible.
+ - A change was made in the way data packets were allocated from the
+ USB core by USB drivers so that all drivers now needed to provide
+ more information to the USB core to fix a number of documented
+ deadlocks.
+
+This is in stark contrast to a number of closed source operating systems
+which have had to maintain their older USB interfaces over time. This
+provides the ability for new developers to accidentally use the old
+interfaces and do things in improper ways, causing the stability of the
+operating system to suffer.
+
+In both of these instances, all developers agreed that these were
+important changes that needed to be made, and they were made, with
+relatively little pain. If Linux had to ensure that it preserve a
+stable source interface, a new interface would have been created, and
+the older, broken one would have had to be maintained over time, leading
+to extra work for the USB developers. Since all Linux USB developers do
+their work on their own time, asking programmers to do extra work for no
+gain, for free, is not a possibility.
+
+Security issues are also very important for Linux. When a
+security issue is found, it is fixed in a very short amount of time. A
+number of times this has caused internal kernel interfaces to be
+reworked to prevent the security problem from occurring. When this
+happens, all drivers that use the interfaces were also fixed at the
+same time, ensuring that the security problem was fixed and could not
+come back at some future time accidentally. If the internal interfaces
+were not allowed to change, fixing this kind of security problem and
+insuring that it could not happen again would not be possible.
+
+Kernel interfaces are cleaned up over time. If there is no one using a
+current interface, it is deleted. This ensures that the kernel remains
+as small as possible, and that all potential interfaces are tested as
+well as they can be (unused interfaces are pretty much impossible to
+test for validity.)
+
+
+What to do
+----------
+
+So, if you have a Linux kernel driver that is not in the main kernel
+tree, what are you, a developer, supposed to do? Releasing a binary
+driver for every different kernel version for every distribution is a
+nightmare, and trying to keep up with an ever changing kernel interface
+is also a rough job.
+
+Simple, get your kernel driver into the main kernel tree (remember we
+are talking about GPL released drivers here, if your code doesn't fall
+under this category, good luck, you are on your own here, you leech
+<insert link to leech comment from Andrew and Linus here>.) If your
+driver is in the tree, and a kernel interface changes, it will be fixed
+up by the person who did the kernel change in the first place. This
+ensures that your driver is always buildable, and works over time, with
+very little effort on your part.
+
+The very good side effects of having your driver in the main kernel tree
+are:
+ - The quality of the driver will rise as the maintenance costs (to the
+ original developer) will decrease.
+ - Other developers will add features to your driver.
+ - Other people will find and fix bugs in your driver.
+ - Other people will find tuning opportunities in your driver.
+ - Other people will update the driver for you when external interface
+ changes require it.
+ - The driver automatically gets shipped in all Linux distributions
+ without having to ask the distros to add it.
+
+As Linux supports a larger number of different devices "out of the box"
+than any other operating system, and it supports these devices on more
+different processor architectures than any other operating system, this
+proven type of development model must be doing something right :)
+
+
+
+------
+
+Thanks to Randy Dunlap, Andrew Morton, David Brownell, Hanna Linder,
+Robert Love, and Nishanth Aravamudan for their review and comments on
+early drafts of this paper.
diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt
new file mode 100644
index 0000000..a452227
--- /dev/null
+++ b/Documentation/stable_kernel_rules.txt
@@ -0,0 +1,60 @@
+Everything you ever wanted to know about Linux 2.6 -stable releases.
+
+Rules on what kind of patches are accepted, and which ones are not, into the
+"-stable" tree:
+
+ - It must be obviously correct and tested.
+ - It cannot be bigger than 100 lines, with context.
+ - It must fix only one thing.
+ - It must fix a real bug that bothers people (not a, "This could be a
+ problem..." type thing).
+ - It must fix a problem that causes a build error (but not for things
+ marked CONFIG_BROKEN), an oops, a hang, data corruption, a real
+ security issue, or some "oh, that's not good" issue. In short, something
+ critical.
+ - New device IDs and quirks are also accepted.
+ - No "theoretical race condition" issues, unless an explanation of how the
+ race can be exploited is also provided.
+ - It cannot contain any "trivial" fixes in it (spelling changes,
+ whitespace cleanups, etc).
+ - It must follow the Documentation/SubmittingPatches rules.
+ - It or an equivalent fix must already exist in Linus' tree. Quote the
+ respective commit ID in Linus' tree in your patch submission to -stable.
+
+
+Procedure for submitting patches to the -stable tree:
+
+ - Send the patch, after verifying that it follows the above rules, to
+ stable@kernel.org.
+ - The sender will receive an ACK when the patch has been accepted into the
+ queue, or a NAK if the patch is rejected. This response might take a few
+ days, according to the developer's schedules.
+ - If accepted, the patch will be added to the -stable queue, for review by
+ other developers and by the relevant subsystem maintainer.
+ - If the stable@kernel.org address is added to a patch, when it goes into
+ Linus's tree it will automatically be emailed to the stable team.
+ - Security patches should not be sent to this alias, but instead to the
+ documented security@kernel.org address.
+
+
+Review cycle:
+
+ - When the -stable maintainers decide for a review cycle, the patches will be
+ sent to the review committee, and the maintainer of the affected area of
+ the patch (unless the submitter is the maintainer of the area) and CC: to
+ the linux-kernel mailing list.
+ - The review committee has 48 hours in which to ACK or NAK the patch.
+ - If the patch is rejected by a member of the committee, or linux-kernel
+ members object to the patch, bringing up issues that the maintainers and
+ members did not realize, the patch will be dropped from the queue.
+ - At the end of the review cycle, the ACKed patches will be added to the
+ latest -stable release, and a new -stable release will happen.
+ - Security patches will be accepted into the -stable tree directly from the
+ security kernel team, and not go through the normal review cycle.
+ Contact the kernel security team for more details on this procedure.
+
+
+Review committee:
+
+ - This is made up of a number of kernel developers who have volunteered for
+ this task, and a few that haven't.
diff --git a/Documentation/svga.txt b/Documentation/svga.txt
new file mode 100644
index 0000000..cd66ec8
--- /dev/null
+++ b/Documentation/svga.txt
@@ -0,0 +1,276 @@
+ Video Mode Selection Support 2.13
+ (c) 1995--1999 Martin Mares, <mj@ucw.cz>
+--------------------------------------------------------------------------------
+
+1. Intro
+~~~~~~~~
+ This small document describes the "Video Mode Selection" feature which
+allows the use of various special video modes supported by the video BIOS. Due
+to usage of the BIOS, the selection is limited to boot time (before the
+kernel decompression starts) and works only on 80X86 machines.
+
+ ** Short intro for the impatient: Just use vga=ask for the first time,
+ ** enter `scan' on the video mode prompt, pick the mode you want to use,
+ ** remember its mode ID (the four-digit hexadecimal number) and then
+ ** set the vga parameter to this number (converted to decimal first).
+
+ The video mode to be used is selected by a kernel parameter which can be
+specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..."
+option of LILO (or some other boot loader you use) or by the "vidmode" utility
+(present in standard Linux utility packages). You can use the following values
+of this parameter:
+
+ NORMAL_VGA - Standard 80x25 mode available on all display adapters.
+
+ EXTENDED_VGA - Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA.
+
+ ASK_VGA - Display a video mode menu upon startup (see below).
+
+ 0..35 - Menu item number (when you have used the menu to view the list of
+ modes available on your adapter, you can specify the menu item you want
+ to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the
+ mode list displayed may vary as the kernel version changes, because the
+ modes are listed in a "first detected -- first displayed" manner. It's
+ better to use absolute mode numbers instead.
+
+ 0x.... - Hexadecimal video mode ID (also displayed on the menu, see below
+ for exact meaning of the ID). Warning: rdev and LILO don't support
+ hexadecimal numbers -- you have to convert it to decimal manually.
+
+2. Menu
+~~~~~~~
+ The ASK_VGA mode causes the kernel to offer a video mode menu upon
+bootup. It displays a "Press <RETURN> to see video modes available, <SPACE>
+to continue or wait 30 secs" message. If you press <RETURN>, you enter the
+menu, if you press <SPACE> or wait 30 seconds, the kernel will boot up in
+the standard 80x25 mode.
+
+ The menu looks like:
+
+Video adapter: <name-of-detected-video-adapter>
+Mode: COLSxROWS:
+0 0F00 80x25
+1 0F01 80x50
+2 0F02 80x43
+3 0F03 80x26
+....
+Enter mode number or `scan': <flashing-cursor-here>
+
+ <name-of-detected-video-adapter> tells what video adapter did Linux detect
+-- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA
+with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection
+of chipsets is turned off by default (see CONFIG_VIDEO_SVGA in chapter 4 to see
+how to enable it if you really want) as it's inherently unreliable due to
+absolutely insane PC design.
+
+ "0 0F00 80x25" means that the first menu item (the menu items are numbered
+from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the
+next section for a description of mode IDs).
+
+ <flashing-cursor-here> encourages you to enter the item number or mode ID
+you wish to set and press <RETURN>. If the computer complains something about
+"Unknown mode ID", it is trying to tell you that it isn't possible to set such
+a mode. It's also possible to press only <RETURN> which leaves the current mode.
+
+ The mode list usually contains a few basic modes and some VESA modes. In
+case your chipset has been detected, some chipset-specific modes are shown as
+well (some of these might be missing or unusable on your machine as different
+BIOSes are often shipped with the same card and the mode numbers depend purely
+on the VGA BIOS).
+
+ The modes displayed on the menu are partially sorted: The list starts with
+the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and
+80x43), local modes (if the local modes feature is enabled), VESA modes and
+finally SVGA modes for the auto-detected adapter.
+
+ If you are not happy with the mode list offered (e.g., if you think your card
+is able to do more), you can enter "scan" instead of item number / mode ID. The
+program will try to ask the BIOS for all possible video mode numbers and test
+what happens then. The screen will be probably flashing wildly for some time and
+strange noises will be heard from inside the monitor and so on and then, really
+all consistent video modes supported by your BIOS will appear (plus maybe some
+`ghost modes'). If you are afraid this could damage your monitor, don't use this
+function.
+
+ After scanning, the mode ordering is a bit different: the auto-detected SVGA
+modes are not listed at all and the modes revealed by `scan' are shown before
+all VESA modes.
+
+3. Mode IDs
+~~~~~~~~~~~
+ Because of the complexity of all the video stuff, the video mode IDs
+used here are also a bit complex. A video mode ID is a 16-bit number usually
+expressed in a hexadecimal notation (starting with "0x"). You can set a mode
+by entering its mode directly if you know it even if it isn't shown on the menu.
+
+The ID numbers can be divided to three regions:
+
+ 0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use
+ outside the menu as this can change from boot to boot (especially if you
+ have used the `scan' feature).
+
+ 0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number
+ (as presented to INT 10, function 00) increased by 0x0100.
+
+ 0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by
+ 0x0100. All VESA modes should be autodetected and shown on the menu.
+
+ 0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05.
+ (Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60,
+ 945=132x28 for the standard Video7 BIOS)
+
+ 0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually
+ by modifying one of the standard modes). Currently available:
+ 0x0f00 standard 80x25, don't reset mode if already set (=FFFF)
+ 0x0f01 standard with 8-point font: 80x43 on EGA, 80x50 on VGA
+ 0x0f02 VGA 80x43 (VGA switched to 350 scanlines with a 8-point font)
+ 0x0f03 VGA 80x28 (standard VGA scans, but 14-point font)
+ 0x0f04 leave current video mode
+ 0x0f05 VGA 80x30 (480 scans, 16-point font)
+ 0x0f06 VGA 80x34 (480 scans, 14-point font)
+ 0x0f07 VGA 80x60 (480 scans, 8-point font)
+ 0x0f08 Graphics hack (see the CONFIG_VIDEO_HACK paragraph below)
+
+ 0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC"
+ form where RR is a number of rows and CC is a number of columns.
+ E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc.
+ This is the only fully portable way to refer to a non-standard mode,
+ but it relies on the mode being found and displayed on the menu
+ (remember that mode scanning is not done automatically).
+
+ 0xff00 to 0xffff - aliases for backward compatibility:
+ 0xffff equivalent to 0x0f00 (standard 80x25)
+ 0xfffe equivalent to 0x0f01 (EGA 80x43 or VGA 80x50)
+
+ If you add 0x8000 to the mode ID, the program will try to recalculate
+vertical display timing according to mode parameters, which can be used to
+eliminate some annoying bugs of certain VGA BIOSes (usually those used for
+cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the
+end of the display.
+
+4. Options
+~~~~~~~~~~
+ Some options can be set in the source text (in arch/i386/boot/video.S).
+All of them are simple #define's -- change them to #undef's when you want to
+switch them off. Currently supported:
+
+ CONFIG_VIDEO_SVGA - enables autodetection of SVGA cards. This is switched
+off by default as it's a bit unreliable due to terribly bad PC design. If you
+really want to have the adapter autodetected (maybe in case the `scan' feature
+doesn't work on your machine), switch this on and don't cry if the results
+are not completely sane. In case you really need this feature, please drop me
+a mail as I think of removing it some day.
+
+ CONFIG_VIDEO_VESA - enables autodetection of VESA modes. If it doesn't work
+on your machine (or displays a "Error: Scanning of VESA modes failed" message),
+you can switch it off and report as a bug.
+
+ CONFIG_VIDEO_COMPACT - enables compacting of the video mode list. If there
+are more modes with the same screen size, only the first one is kept (see above
+for more info on mode ordering). However, in very strange cases it's possible
+that the first "version" of the mode doesn't work although some of the others
+do -- in this case turn this switch off to see the rest.
+
+ CONFIG_VIDEO_RETAIN - enables retaining of screen contents when switching
+video modes. Works only with some boot loaders which leave enough room for the
+buffer. (If you have old LILO, you can adjust heap_end_ptr and loadflags
+in setup.S, but it's better to upgrade the boot loader...)
+
+ CONFIG_VIDEO_LOCAL - enables inclusion of "local modes" in the list. The
+local modes are added automatically to the beginning of the list not depending
+on hardware configuration. The local modes are listed in the source text after
+the "local_mode_table:" line. The comment before this line describes the format
+of the table (which also includes a video card name to be displayed on the
+top of the menu).
+
+ CONFIG_VIDEO_400_HACK - force setting of 400 scan lines for standard VGA
+modes. This option is intended to be used on certain buggy BIOSes which draw
+some useless logo using font download and then fail to reset the correct mode.
+Don't use unless needed as it forces resetting the video card.
+
+ CONFIG_VIDEO_GFX_HACK - includes special hack for setting of graphics modes
+to be used later by special drivers (e.g., 800x600 on IBM ThinkPad -- see
+ftp://ftp.phys.keio.ac.jp/pub/XFree86/800x600/XF86Configs/XF86Config.IBM_TP560).
+Allows to set _any_ BIOS mode including graphic ones and forcing specific
+text screen resolution instead of peeking it from BIOS variables. Don't use
+unless you think you know what you're doing. To activate this setup, use
+mode number 0x0f08 (see section 3).
+
+5. Still doesn't work?
+~~~~~~~~~~~~~~~~~~~~~~
+ When the mode detection doesn't work (e.g., the mode list is incorrect or
+the machine hangs instead of displaying the menu), try to switch off some of
+the configuration options listed in section 4. If it fails, you can still use
+your kernel with the video mode set directly via the kernel parameter.
+
+ In either case, please send me a bug report containing what _exactly_
+happens and how do the configuration switches affect the behaviour of the bug.
+
+ If you start Linux from M$-DOS, you might also use some DOS tools for
+video mode setting. In this case, you must specify the 0x0f04 mode ("leave
+current settings") to Linux, because if you don't and you use any non-standard
+mode, Linux will switch to 80x25 automatically.
+
+ If you set some extended mode and there's one or more extra lines on the
+bottom of the display containing already scrolled-out text, your VGA BIOS
+contains the most common video BIOS bug called "incorrect vertical display
+end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately,
+this must be done manually -- no autodetection mechanisms are available.
+
+ If you have a VGA card and your display still looks as on EGA, your BIOS
+is probably broken and you need to set the CONFIG_VIDEO_400_HACK switch to
+force setting of the correct mode.
+
+6. History
+~~~~~~~~~~
+1.0 (??-Nov-95) First version supporting all adapters supported by the old
+ setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels
+ and then removed due to instability on some machines.
+2.0 (28-Jan-96) Rewritten from scratch. Cirrus Logic 64XX support added, almost
+ everything is configurable, the VESA support should be much more
+ stable, explicit mode numbering allowed, "scan" implemented etc.
+2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution
+ supported. Few bugs fixed. VESA modes are listed prior to
+ modes supplied by SVGA autodetection as they are more reliable.
+ CLGD autodetect works better. Doesn't depend on 80x25 being
+ active when started. Scanning fixed. 80x43 (any VGA) added.
+ Code cleaned up.
+2.2 (01-Feb-96) EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX
+ VESA modes work now). Display end bug workaround supported.
+ Special modes renumbered to allow adding of the "recalculate"
+ flag, 0xffff and 0xfffe became aliases instead of real IDs.
+ Screen contents retained during mode changes.
+2.3 (15-Mar-96) Changed to work with 1.3.74 kernel.
+2.4 (18-Mar-96) Added patches by Hans Lermen fixing a memory overwrite problem
+ with some boot loaders. Memory management rewritten to reflect
+ these changes. Unfortunately, screen contents retaining works
+ only with some loaders now.
+ Added a Tseng 132x60 mode.
+2.5 (19-Mar-96) Fixed a VESA mode scanning bug introduced in 2.4.
+2.6 (25-Mar-96) Some VESA BIOS errors not reported -- it fixes error reports on
+ several cards with broken VESA code (e.g., ATI VGA).
+2.7 (09-Apr-96) - Accepted all VESA modes in range 0x100 to 0x7ff, because some
+ cards use very strange mode numbers.
+ - Added Realtek VGA modes (thanks to Gonzalo Tornaria).
+ - Hardware testing order slightly changed, tests based on ROM
+ contents done as first.
+ - Added support for special Video7 mode switching functions
+ (thanks to Tom Vander Aa).
+ - Added 480-scanline modes (especially useful for notebooks,
+ original version written by hhanemaa@cs.ruu.nl, patched by
+ Jeff Chua, rewritten by me).
+ - Screen store/restore fixed.
+2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA.
+ - Better recognition of text modes during mode scan.
+2.9 (12-May-96) - Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!)
+2.10 (11-Nov-96)- The whole thing made optional.
+ - Added the CONFIG_VIDEO_400_HACK switch.
+ - Added the CONFIG_VIDEO_GFX_HACK switch.
+ - Code cleanup.
+2.11 (03-May-97)- Yet another cleanup, now including also the documentation.
+ - Direct testing of SVGA adapters turned off by default, `scan'
+ offered explicitly on the prompt line.
+ - Removed the doc section describing adding of new probing
+ functions as I try to get rid of _all_ hardware probing here.
+2.12 (25-May-98)- Added support for VESA frame buffer graphics.
+2.13 (14-May-99)- Minor documentation fixes.
diff --git a/Documentation/sysctl/00-INDEX b/Documentation/sysctl/00-INDEX
new file mode 100644
index 0000000..a20a906
--- /dev/null
+++ b/Documentation/sysctl/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+ - this file.
+README
+ - general information about /proc/sys/ sysctl files.
+abi.txt
+ - documentation for /proc/sys/abi/*.
+ctl_unnumbered.txt
+ - explanation of why one should not add new binary sysctl numbers.
+fs.txt
+ - documentation for /proc/sys/fs/*.
+kernel.txt
+ - documentation for /proc/sys/kernel/*.
+sunrpc.txt
+ - documentation for /proc/sys/sunrpc/*.
+vm.txt
+ - documentation for /proc/sys/vm/*.
diff --git a/Documentation/sysctl/README b/Documentation/sysctl/README
new file mode 100644
index 0000000..8c3306e
--- /dev/null
+++ b/Documentation/sysctl/README
@@ -0,0 +1,75 @@
+Documentation for /proc/sys/ kernel version 2.2.10
+ (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+'Why', I hear you ask, 'would anyone even _want_ documentation
+for them sysctl files? If anybody really needs it, it's all in
+the source...'
+
+Well, this documentation is written because some people either
+don't know they need to tweak something, or because they don't
+have the time or knowledge to read the source code.
+
+Furthermore, the programmers who built sysctl have built it to
+be actually used, not just for the fun of programming it :-)
+
+==============================================================
+
+Legal blurb:
+
+As usual, there are two main things to consider:
+1. you get what you pay for
+2. it's free
+
+The consequences are that I won't guarantee the correctness of
+this document, and if you come to me complaining about how you
+screwed up your system because of wrong documentation, I won't
+feel sorry for you. I might even laugh at you...
+
+But of course, if you _do_ manage to screw up your system using
+only the sysctl options used in this file, I'd like to hear of
+it. Not only to have a great laugh, but also to make sure that
+you're the last RTFMing person to screw up.
+
+In short, e-mail your suggestions, corrections and / or horror
+stories to: <riel@nl.linux.org>
+
+Rik van Riel.
+
+==============================================================
+
+Introduction:
+
+Sysctl is a means of configuring certain aspects of the kernel
+at run-time, and the /proc/sys/ directory is there so that you
+don't even need special tools to do it!
+In fact, there are only four things needed to use these config
+facilities:
+- a running Linux system
+- root access
+- common sense (this is especially hard to come by these days)
+- knowledge of what all those values mean
+
+As a quick 'ls /proc/sys' will show, the directory consists of
+several (arch-dependent?) subdirs. Each subdir is mainly about
+one part of the kernel, so you can do configuration on a piece
+by piece basis, or just some 'thematic frobbing'.
+
+The subdirs are about:
+abi/ execution domains & personalities
+debug/ <empty>
+dev/ device specific information (eg dev/cdrom/info)
+fs/ specific filesystems
+ filehandle, inode, dentry and quota tuning
+ binfmt_misc <Documentation/binfmt_misc.txt>
+kernel/ global kernel info / tuning
+ miscellaneous stuff
+net/ networking stuff, for documentation look in:
+ <Documentation/networking/>
+proc/ <empty>
+sunrpc/ SUN Remote Procedure Call (NFS)
+vm/ memory management tuning
+ buffer and cache management
+
+These are the subdirs I have on my system. There might be more
+or other subdirs in another setup. If you see another dir, I'd
+really like to hear about it :-)
diff --git a/Documentation/sysctl/abi.txt b/Documentation/sysctl/abi.txt
new file mode 100644
index 0000000..63f4ebc
--- /dev/null
+++ b/Documentation/sysctl/abi.txt
@@ -0,0 +1,54 @@
+Documentation for /proc/sys/abi/* kernel version 2.6.0.test2
+ (c) 2003, Fabian Frederick <ffrederick@users.sourceforge.net>
+
+For general info : README.
+
+==============================================================
+
+This path is binary emulation relevant aka personality types aka abi.
+When a process is executed, it's linked to an exec_domain whose
+personality is defined using values available from /proc/sys/abi.
+You can find further details about abi in include/linux/personality.h.
+
+Here are the files featuring in 2.6 kernel :
+
+- defhandler_coff
+- defhandler_elf
+- defhandler_lcall7
+- defhandler_libcso
+- fake_utsname
+- trace
+
+===========================================================
+defhandler_coff:
+defined value :
+PER_SCOSVR3
+0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE
+
+===========================================================
+defhandler_elf:
+defined value :
+PER_LINUX
+0
+
+===========================================================
+defhandler_lcall7:
+defined value :
+PER_SVR4
+0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
+
+===========================================================
+defhandler_libsco:
+defined value:
+PER_SVR4
+0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
+
+===========================================================
+fake_utsname:
+Unused
+
+===========================================================
+trace:
+Unused
+
+===========================================================
diff --git a/Documentation/sysctl/ctl_unnumbered.txt b/Documentation/sysctl/ctl_unnumbered.txt
new file mode 100644
index 0000000..23003a8
--- /dev/null
+++ b/Documentation/sysctl/ctl_unnumbered.txt
@@ -0,0 +1,22 @@
+
+Except for a few extremely rare exceptions user space applications do not use
+the binary sysctl interface. Instead everyone uses /proc/sys/... with
+readable ascii names.
+
+Recently the kernel has started supporting setting the binary sysctl value to
+CTL_UNNUMBERED so we no longer need to assign a binary sysctl path to allow
+sysctls to show up in /proc/sys.
+
+Assigning binary sysctl numbers is an endless source of conflicts in sysctl.h,
+breaking of the user space ABI (because of those conflicts), and maintenance
+problems. A complete pass through all of the sysctl users revealed multiple
+instances where the sysctl binary interface was broken and had gone undetected
+for years.
+
+So please do not add new binary sysctl numbers. They are unneeded and
+problematic.
+
+If you really need a new binary sysctl number please first merge your sysctl
+into the kernel and then as a separate patch allocate a binary sysctl number.
+
+(ebiederm@xmission.com, June 2007)
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
new file mode 100644
index 0000000..f992543
--- /dev/null
+++ b/Documentation/sysctl/fs.txt
@@ -0,0 +1,180 @@
+Documentation for /proc/sys/fs/* kernel version 2.2.10
+ (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+For general info and legal blurb, please look in README.
+
+==============================================================
+
+This file contains documentation for the sysctl files in
+/proc/sys/fs/ and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to tune and monitor
+miscellaneous and general things in the operation of the Linux
+kernel. Since some of the files _can_ be used to screw up your
+system, it is advisable to read both documentation and source
+before actually making adjustments.
+
+Currently, these files are in /proc/sys/fs:
+- dentry-state
+- dquot-max
+- dquot-nr
+- file-max
+- file-nr
+- inode-max
+- inode-nr
+- inode-state
+- nr_open
+- overflowuid
+- overflowgid
+- suid_dumpable
+- super-max
+- super-nr
+
+Documentation for the files in /proc/sys/fs/binfmt_misc is
+in Documentation/binfmt_misc.txt.
+
+==============================================================
+
+dentry-state:
+
+From linux/fs/dentry.c:
+--------------------------------------------------------------
+struct {
+ int nr_dentry;
+ int nr_unused;
+ int age_limit; /* age in seconds */
+ int want_pages; /* pages requested by system */
+ int dummy[2];
+} dentry_stat = {0, 0, 45, 0,};
+--------------------------------------------------------------
+
+Dentries are dynamically allocated and deallocated, and
+nr_dentry seems to be 0 all the time. Hence it's safe to
+assume that only nr_unused, age_limit and want_pages are
+used. Nr_unused seems to be exactly what its name says.
+Age_limit is the age in seconds after which dcache entries
+can be reclaimed when memory is short and want_pages is
+nonzero when shrink_dcache_pages() has been called and the
+dcache isn't pruned yet.
+
+==============================================================
+
+dquot-max & dquot-nr:
+
+The file dquot-max shows the maximum number of cached disk
+quota entries.
+
+The file dquot-nr shows the number of allocated disk quota
+entries and the number of free disk quota entries.
+
+If the number of free cached disk quotas is very low and
+you have some awesome number of simultaneous system users,
+you might want to raise the limit.
+
+==============================================================
+
+file-max & file-nr:
+
+The kernel allocates file handles dynamically, but as yet it
+doesn't free them again.
+
+The value in file-max denotes the maximum number of file-
+handles that the Linux kernel will allocate. When you get lots
+of error messages about running out of file handles, you might
+want to increase this limit.
+
+The three values in file-nr denote the number of allocated
+file handles, the number of unused file handles and the maximum
+number of file handles. When the allocated file handles come
+close to the maximum, but the number of unused file handles is
+significantly greater than 0, you've encountered a peak in your
+usage of file handles and you don't need to increase the maximum.
+
+==============================================================
+
+nr_open:
+
+This denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
+==============================================================
+
+inode-max, inode-nr & inode-state:
+
+As with file handles, the kernel allocates the inode structures
+dynamically, but can't free them yet.
+
+The value in inode-max denotes the maximum number of inode
+handlers. This value should be 3-4 times larger than the value
+in file-max, since stdin, stdout and network sockets also
+need an inode struct to handle them. When you regularly run
+out of inodes, you need to increase this value.
+
+The file inode-nr contains the first two items from
+inode-state, so we'll skip to that file...
+
+Inode-state contains three actual numbers and four dummies.
+The actual numbers are, in order of appearance, nr_inodes,
+nr_free_inodes and preshrink.
+
+Nr_inodes stands for the number of inodes the system has
+allocated, this can be slightly more than inode-max because
+Linux allocates them one pageful at a time.
+
+Nr_free_inodes represents the number of free inodes (?) and
+preshrink is nonzero when the nr_inodes > inode-max and the
+system needs to prune the inode list instead of allocating
+more.
+
+==============================================================
+
+overflowgid & overflowuid:
+
+Some filesystems only support 16-bit UIDs and GIDs, although in Linux
+UIDs and GIDs are 32 bits. When one of these filesystems is mounted
+with writes enabled, any UID or GID that would exceed 65535 is translated
+to a fixed value before being written to disk.
+
+These sysctls allow you to change the value of the fixed UID and GID.
+The default is 65534.
+
+==============================================================
+
+suid_dumpable:
+
+This value can be used to query and set the core dump mode for setuid
+or otherwise protected/tainted binaries. The modes are
+
+0 - (default) - traditional behaviour. Any process which has changed
+ privilege levels or is execute only will not be dumped
+1 - (debug) - all processes dump core when possible. The core dump is
+ owned by the current user and no security is applied. This is
+ intended for system debugging situations only. Ptrace is unchecked.
+2 - (suidsafe) - any binary which normally would not be dumped is dumped
+ readable by root only. This allows the end user to remove
+ such a dump but not access it directly. For security reasons
+ core dumps in this mode will not overwrite one another or
+ other files. This mode is appropriate when administrators are
+ attempting to debug problems in a normal environment.
+
+==============================================================
+
+super-max & super-nr:
+
+These numbers control the maximum number of superblocks, and
+thus the maximum number of mounted filesystems the kernel
+can have. You only need to increase super-max if you need to
+mount more filesystems than the current value in super-max
+allows you to.
+
+==============================================================
+
+aio-nr & aio-max-nr:
+
+aio-nr shows the current system-wide number of asynchronous io
+requests. aio-max-nr allows you to change the maximum value
+aio-nr can grow to.
+
+==============================================================
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
new file mode 100644
index 0000000..a4ccdd1
--- /dev/null
+++ b/Documentation/sysctl/kernel.txt
@@ -0,0 +1,383 @@
+Documentation for /proc/sys/kernel/* kernel version 2.2.10
+ (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+For general info and legal blurb, please look in README.
+
+==============================================================
+
+This file contains documentation for the sysctl files in
+/proc/sys/kernel/ and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to tune and monitor
+miscellaneous and general things in the operation of the Linux
+kernel. Since some of the files _can_ be used to screw up your
+system, it is advisable to read both documentation and source
+before actually making adjustments.
+
+Currently, these files might (depending on your configuration)
+show up in /proc/sys/kernel:
+- acpi_video_flags
+- acct
+- core_pattern
+- core_uses_pid
+- ctrl-alt-del
+- dentry-state
+- domainname
+- hostname
+- hotplug
+- java-appletviewer [ binfmt_java, obsolete ]
+- java-interpreter [ binfmt_java, obsolete ]
+- kstack_depth_to_print [ X86 only ]
+- l2cr [ PPC only ]
+- modprobe ==> Documentation/debugging-modules.txt
+- msgmax
+- msgmnb
+- msgmni
+- osrelease
+- ostype
+- overflowgid
+- overflowuid
+- panic
+- pid_max
+- powersave-nap [ PPC only ]
+- printk
+- randomize_va_space
+- real-root-dev ==> Documentation/initrd.txt
+- reboot-cmd [ SPARC only ]
+- rtsig-max
+- rtsig-nr
+- sem
+- sg-big-buff [ generic SCSI device (sg) ]
+- shmall
+- shmmax [ sysv ipc ]
+- shmmni
+- stop-a [ SPARC only ]
+- sysrq ==> Documentation/sysrq.txt
+- tainted
+- threads-max
+- version
+
+==============================================================
+
+acpi_video_flags:
+
+flags
+
+See Doc*/kernel/power/video.txt, it allows mode of video boot to be
+set during run time.
+
+==============================================================
+
+acct:
+
+highwater lowwater frequency
+
+If BSD-style process accounting is enabled these values control
+its behaviour. If free space on filesystem where the log lives
+goes below <lowwater>% accounting suspends. If free space gets
+above <highwater>% accounting resumes. <Frequency> determines
+how often do we check the amount of free space (value is in
+seconds). Default:
+4 2 30
+That is, suspend accounting if there left <= 2% free; resume it
+if we got >=4%; consider information about amount of free space
+valid for 30 seconds.
+
+==============================================================
+
+core_pattern:
+
+core_pattern is used to specify a core dumpfile pattern name.
+. max length 128 characters; default value is "core"
+. core_pattern is used as a pattern template for the output filename;
+ certain string patterns (beginning with '%') are substituted with
+ their actual values.
+. backward compatibility with core_uses_pid:
+ If core_pattern does not include "%p" (default does not)
+ and core_uses_pid is set, then .PID will be appended to
+ the filename.
+. corename format specifiers:
+ %<NUL> '%' is dropped
+ %% output one '%'
+ %p pid
+ %u uid
+ %g gid
+ %s signal number
+ %t UNIX time of dump
+ %h hostname
+ %e executable filename
+ %<OTHER> both are dropped
+. If the first character of the pattern is a '|', the kernel will treat
+ the rest of the pattern as a command to run. The core dump will be
+ written to the standard input of that program instead of to a file.
+
+==============================================================
+
+core_uses_pid:
+
+The default coredump filename is "core". By setting
+core_uses_pid to 1, the coredump filename becomes core.PID.
+If core_pattern does not include "%p" (default does not)
+and core_uses_pid is set, then .PID will be appended to
+the filename.
+
+==============================================================
+
+ctrl-alt-del:
+
+When the value in this file is 0, ctrl-alt-del is trapped and
+sent to the init(1) program to handle a graceful restart.
+When, however, the value is > 0, Linux's reaction to a Vulcan
+Nerve Pinch (tm) will be an immediate reboot, without even
+syncing its dirty buffers.
+
+Note: when a program (like dosemu) has the keyboard in 'raw'
+mode, the ctrl-alt-del is intercepted by the program before it
+ever reaches the kernel tty layer, and it's up to the program
+to decide what to do with it.
+
+==============================================================
+
+domainname & hostname:
+
+These files can be used to set the NIS/YP domainname and the
+hostname of your box in exactly the same way as the commands
+domainname and hostname, i.e.:
+# echo "darkstar" > /proc/sys/kernel/hostname
+# echo "mydomain" > /proc/sys/kernel/domainname
+has the same effect as
+# hostname "darkstar"
+# domainname "mydomain"
+
+Note, however, that the classic darkstar.frop.org has the
+hostname "darkstar" and DNS (Internet Domain Name Server)
+domainname "frop.org", not to be confused with the NIS (Network
+Information Service) or YP (Yellow Pages) domainname. These two
+domain names are in general different. For a detailed discussion
+see the hostname(1) man page.
+
+==============================================================
+
+hotplug:
+
+Path for the hotplug policy agent.
+Default value is "/sbin/hotplug".
+
+==============================================================
+
+l2cr: (PPC only)
+
+This flag controls the L2 cache of G3 processor boards. If
+0, the cache is disabled. Enabled if nonzero.
+
+==============================================================
+
+kstack_depth_to_print: (X86 only)
+
+Controls the number of words to print when dumping the raw
+kernel stack.
+
+==============================================================
+
+osrelease, ostype & version:
+
+# cat osrelease
+2.1.88
+# cat ostype
+Linux
+# cat version
+#5 Wed Feb 25 21:49:24 MET 1998
+
+The files osrelease and ostype should be clear enough. Version
+needs a little more clarification however. The '#5' means that
+this is the fifth kernel built from this source base and the
+date behind it indicates the time the kernel was built.
+The only way to tune these values is to rebuild the kernel :-)
+
+==============================================================
+
+overflowgid & overflowuid:
+
+if your architecture did not always support 32-bit UIDs (i.e. arm, i386,
+m68k, sh, and sparc32), a fixed UID and GID will be returned to
+applications that use the old 16-bit UID/GID system calls, if the actual
+UID or GID would exceed 65535.
+
+These sysctls allow you to change the value of the fixed UID and GID.
+The default is 65534.
+
+==============================================================
+
+panic:
+
+The value in this file represents the number of seconds the
+kernel waits before rebooting on a panic. When you use the
+software watchdog, the recommended setting is 60.
+
+==============================================================
+
+panic_on_oops:
+
+Controls the kernel's behaviour when an oops or BUG is encountered.
+
+0: try to continue operation
+
+1: panic immediately. If the `panic' sysctl is also non-zero then the
+ machine will be rebooted.
+
+==============================================================
+
+pid_max:
+
+PID allocation wrap value. When the kernel's next PID value
+reaches this value, it wraps back to a minimum PID value.
+PIDs of value pid_max or larger are not allocated.
+
+==============================================================
+
+powersave-nap: (PPC only)
+
+If set, Linux-PPC will use the 'nap' mode of powersaving,
+otherwise the 'doze' mode will be used.
+
+==============================================================
+
+printk:
+
+The four values in printk denote: console_loglevel,
+default_message_loglevel, minimum_console_loglevel and
+default_console_loglevel respectively.
+
+These values influence printk() behavior when printing or
+logging error messages. See 'man 2 syslog' for more info on
+the different loglevels.
+
+- console_loglevel: messages with a higher priority than
+ this will be printed to the console
+- default_message_level: messages without an explicit priority
+ will be printed with this priority
+- minimum_console_loglevel: minimum (highest) value to which
+ console_loglevel can be set
+- default_console_loglevel: default value for console_loglevel
+
+==============================================================
+
+printk_ratelimit:
+
+Some warning messages are rate limited. printk_ratelimit specifies
+the minimum length of time between these messages (in jiffies), by
+default we allow one every 5 seconds.
+
+A value of 0 will disable rate limiting.
+
+==============================================================
+
+printk_ratelimit_burst:
+
+While long term we enforce one message per printk_ratelimit
+seconds, we do allow a burst of messages to pass through.
+printk_ratelimit_burst specifies the number of messages we can
+send before ratelimiting kicks in.
+
+==============================================================
+
+randomize-va-space:
+
+This option can be used to select the type of process address
+space randomization that is used in the system, for architectures
+that support this feature.
+
+0 - Turn the process address space randomization off by default.
+
+1 - Make the addresses of mmap base, stack and VDSO page randomized.
+ This, among other things, implies that shared libraries will be
+ loaded to random addresses. Also for PIE-linked binaries, the location
+ of code start is randomized.
+
+ With heap randomization, the situation is a little bit more
+ complicated.
+ There a few legacy applications out there (such as some ancient
+ versions of libc.so.5 from 1996) that assume that brk area starts
+ just after the end of the code+bss. These applications break when
+ start of the brk area is randomized. There are however no known
+ non-legacy applications that would be broken this way, so for most
+ systems it is safe to choose full randomization. However there is
+ a CONFIG_COMPAT_BRK option for systems with ancient and/or broken
+ binaries, that makes heap non-randomized, but keeps all other
+ parts of process address space randomized if randomize_va_space
+ sysctl is turned on.
+
+==============================================================
+
+reboot-cmd: (Sparc only)
+
+??? This seems to be a way to give an argument to the Sparc
+ROM/Flash boot loader. Maybe to tell it what to do after
+rebooting. ???
+
+==============================================================
+
+rtsig-max & rtsig-nr:
+
+The file rtsig-max can be used to tune the maximum number
+of POSIX realtime (queued) signals that can be outstanding
+in the system.
+
+rtsig-nr shows the number of RT signals currently queued.
+
+==============================================================
+
+sg-big-buff:
+
+This file shows the size of the generic SCSI (sg) buffer.
+You can't tune it just yet, but you could change it on
+compile time by editing include/scsi/sg.h and changing
+the value of SG_BIG_BUFF.
+
+There shouldn't be any reason to change this value. If
+you can come up with one, you probably know what you
+are doing anyway :)
+
+==============================================================
+
+shmmax:
+
+This value can be used to query and set the run time limit
+on the maximum shared memory segment size that can be created.
+Shared memory segments up to 1Gb are now supported in the
+kernel. This value defaults to SHMMAX.
+
+==============================================================
+
+softlockup_thresh:
+
+This value can be used to lower the softlockup tolerance threshold. The
+default threshold is 60 seconds. If a cpu is locked up for 60 seconds,
+the kernel complains. Valid values are 1-60 seconds. Setting this
+tunable to zero will disable the softlockup detection altogether.
+
+==============================================================
+
+tainted:
+
+Non-zero if the kernel has been tainted. Numeric values, which
+can be ORed together:
+
+ 1 - A module with a non-GPL license has been loaded, this
+ includes modules with no license.
+ Set by modutils >= 2.4.9 and module-init-tools.
+ 2 - A module was force loaded by insmod -f.
+ Set by modutils >= 2.4.9 and module-init-tools.
+ 4 - Unsafe SMP processors: SMP with CPUs not designed for SMP.
+ 8 - A module was forcibly unloaded from the system by rmmod -f.
+ 16 - A hardware machine check error occurred on the system.
+ 32 - A bad page was discovered on the system.
+ 64 - The user has asked that the system be marked "tainted". This
+ could be because they are running software that directly modifies
+ the hardware, or for other reasons.
+ 128 - The system has died.
+ 256 - The ACPI DSDT has been overridden with one supplied by the user
+ instead of using the one provided by the hardware.
+ 512 - A kernel warning has occurred.
+1024 - A module from drivers/staging was loaded.
+
diff --git a/Documentation/sysctl/sunrpc.txt b/Documentation/sysctl/sunrpc.txt
new file mode 100644
index 0000000..ae1ecac
--- /dev/null
+++ b/Documentation/sysctl/sunrpc.txt
@@ -0,0 +1,20 @@
+Documentation for /proc/sys/sunrpc/* kernel version 2.2.10
+ (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+For general info and legal blurb, please look in README.
+
+==============================================================
+
+This file contains the documentation for the sysctl files in
+/proc/sys/sunrpc and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to (re)set the debug
+flags of the SUN Remote Procedure Call (RPC) subsystem in
+the Linux kernel. This stuff is used for NFS, KNFSD and
+maybe a few other things as well.
+
+The files in there are used to control the debugging flags:
+rpc_debug, nfs_debug, nfsd_debug and nlm_debug.
+
+These flags are for kernel hackers only. You should read the
+source code in net/sunrpc/ for more information.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
new file mode 100644
index 0000000..d79eeda
--- /dev/null
+++ b/Documentation/sysctl/vm.txt
@@ -0,0 +1,349 @@
+Documentation for /proc/sys/vm/* kernel version 2.2.10
+ (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+For general info and legal blurb, please look in README.
+
+==============================================================
+
+This file contains the documentation for the sysctl files in
+/proc/sys/vm and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to tune the operation
+of the virtual memory (VM) subsystem of the Linux kernel and
+the writeout of dirty data to disk.
+
+Default values and initialization routines for most of these
+files can be found in mm/swap.c.
+
+Currently, these files are in /proc/sys/vm:
+- overcommit_memory
+- page-cluster
+- dirty_ratio
+- dirty_background_ratio
+- dirty_expire_centisecs
+- dirty_writeback_centisecs
+- highmem_is_dirtyable (only if CONFIG_HIGHMEM set)
+- max_map_count
+- min_free_kbytes
+- laptop_mode
+- block_dump
+- drop-caches
+- zone_reclaim_mode
+- min_unmapped_ratio
+- min_slab_ratio
+- panic_on_oom
+- oom_dump_tasks
+- oom_kill_allocating_task
+- mmap_min_address
+- numa_zonelist_order
+- nr_hugepages
+- nr_overcommit_hugepages
+
+==============================================================
+
+dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
+dirty_writeback_centisecs, highmem_is_dirtyable,
+vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout,
+drop-caches, hugepages_treat_as_movable:
+
+See Documentation/filesystems/proc.txt
+
+==============================================================
+
+overcommit_memory:
+
+This value contains a flag that enables memory overcommitment.
+
+When this flag is 0, the kernel attempts to estimate the amount
+of free memory left when userspace requests more memory.
+
+When this flag is 1, the kernel pretends there is always enough
+memory until it actually runs out.
+
+When this flag is 2, the kernel uses a "never overcommit"
+policy that attempts to prevent any overcommit of memory.
+
+This feature can be very useful because there are a lot of
+programs that malloc() huge amounts of memory "just-in-case"
+and don't use much of it.
+
+The default value is 0.
+
+See Documentation/vm/overcommit-accounting and
+security/commoncap.c::cap_vm_enough_memory() for more information.
+
+==============================================================
+
+overcommit_ratio:
+
+When overcommit_memory is set to 2, the committed address
+space is not permitted to exceed swap plus this percentage
+of physical RAM. See above.
+
+==============================================================
+
+page-cluster:
+
+The Linux VM subsystem avoids excessive disk seeks by reading
+multiple pages on a page fault. The number of pages it reads
+is dependent on the amount of memory in your machine.
+
+The number of pages the kernel reads in at once is equal to
+2 ^ page-cluster. Values above 2 ^ 5 don't make much sense
+for swap because we only cluster swap data in 32-page groups.
+
+==============================================================
+
+max_map_count:
+
+This file contains the maximum number of memory map areas a process
+may have. Memory map areas are used as a side-effect of calling
+malloc, directly by mmap and mprotect, and also when loading shared
+libraries.
+
+While most applications need less than a thousand maps, certain
+programs, particularly malloc debuggers, may consume lots of them,
+e.g., up to one or two maps per allocation.
+
+The default value is 65536.
+
+==============================================================
+
+min_free_kbytes:
+
+This is used to force the Linux VM to keep a minimum number
+of kilobytes free. The VM uses this number to compute a pages_min
+value for each lowmem zone in the system. Each lowmem zone gets
+a number of reserved free pages based proportionally on its size.
+
+Some minimal amount of memory is needed to satisfy PF_MEMALLOC
+allocations; if you set this to lower than 1024KB, your system will
+become subtly broken, and prone to deadlock under high loads.
+
+Setting this too high will OOM your machine instantly.
+
+==============================================================
+
+percpu_pagelist_fraction
+
+This is the fraction of pages at most (high mark pcp->high) in each zone that
+are allocated for each per cpu page list. The min value for this is 8. It
+means that we don't allow more than 1/8th of pages in each zone to be
+allocated in any single per_cpu_pagelist. This entry only changes the value
+of hot per cpu pagelists. User can specify a number like 100 to allocate
+1/100th of each zone to each per cpu page list.
+
+The batch value of each per cpu pagelist is also updated as a result. It is
+set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
+
+The initial value is zero. Kernel does not use this value at boot time to set
+the high water marks for each per cpu page list.
+
+===============================================================
+
+zone_reclaim_mode:
+
+Zone_reclaim_mode allows someone to set more or less aggressive approaches to
+reclaim memory when a zone runs out of memory. If it is set to zero then no
+zone reclaim occurs. Allocations will be satisfied from other zones / nodes
+in the system.
+
+This is value ORed together of
+
+1 = Zone reclaim on
+2 = Zone reclaim writes dirty pages out
+4 = Zone reclaim swaps pages
+
+zone_reclaim_mode is set during bootup to 1 if it is determined that pages
+from remote zones will cause a measurable performance reduction. The
+page allocator will then reclaim easily reusable pages (those page
+cache pages that are currently not used) before allocating off node pages.
+
+It may be beneficial to switch off zone reclaim if the system is
+used for a file server and all of memory should be used for caching files
+from disk. In that case the caching effect is more important than
+data locality.
+
+Allowing zone reclaim to write out pages stops processes that are
+writing large amounts of data from dirtying pages on other nodes. Zone
+reclaim will write out dirty pages if a zone fills up and so effectively
+throttle the process. This may decrease the performance of a single process
+since it cannot use all of system memory to buffer the outgoing writes
+anymore but it preserve the memory on other nodes so that the performance
+of other processes running on other nodes will not be affected.
+
+Allowing regular swap effectively restricts allocations to the local
+node unless explicitly overridden by memory policies or cpuset
+configurations.
+
+=============================================================
+
+min_unmapped_ratio:
+
+This is available only on NUMA kernels.
+
+A percentage of the total pages in each zone. Zone reclaim will only
+occur if more than this percentage of pages are file backed and unmapped.
+This is to insure that a minimal amount of local pages is still available for
+file I/O even if the node is overallocated.
+
+The default is 1 percent.
+
+=============================================================
+
+min_slab_ratio:
+
+This is available only on NUMA kernels.
+
+A percentage of the total pages in each zone. On Zone reclaim
+(fallback from the local zone occurs) slabs will be reclaimed if more
+than this percentage of pages in a zone are reclaimable slab pages.
+This insures that the slab growth stays under control even in NUMA
+systems that rarely perform global reclaim.
+
+The default is 5 percent.
+
+Note that slab reclaim is triggered in a per zone / node fashion.
+The process of reclaiming slab memory is currently not node specific
+and may not be fast.
+
+=============================================================
+
+panic_on_oom
+
+This enables or disables panic on out-of-memory feature.
+
+If this is set to 0, the kernel will kill some rogue process,
+called oom_killer. Usually, oom_killer can kill rogue processes and
+system will survive.
+
+If this is set to 1, the kernel panics when out-of-memory happens.
+However, if a process limits using nodes by mempolicy/cpusets,
+and those nodes become memory exhaustion status, one process
+may be killed by oom-killer. No panic occurs in this case.
+Because other nodes' memory may be free. This means system total status
+may be not fatal yet.
+
+If this is set to 2, the kernel panics compulsorily even on the
+above-mentioned.
+
+The default value is 0.
+1 and 2 are for failover of clustering. Please select either
+according to your policy of failover.
+
+=============================================================
+
+oom_dump_tasks
+
+Enables a system-wide task dump (excluding kernel threads) to be
+produced when the kernel performs an OOM-killing and includes such
+information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and
+name. This is helpful to determine why the OOM killer was invoked
+and to identify the rogue task that caused it.
+
+If this is set to zero, this information is suppressed. On very
+large systems with thousands of tasks it may not be feasible to dump
+the memory state information for each one. Such systems should not
+be forced to incur a performance penalty in OOM conditions when the
+information may not be desired.
+
+If this is set to non-zero, this information is shown whenever the
+OOM killer actually kills a memory-hogging task.
+
+The default value is 0.
+
+=============================================================
+
+oom_kill_allocating_task
+
+This enables or disables killing the OOM-triggering task in
+out-of-memory situations.
+
+If this is set to zero, the OOM killer will scan through the entire
+tasklist and select a task based on heuristics to kill. This normally
+selects a rogue memory-hogging task that frees up a large amount of
+memory when killed.
+
+If this is set to non-zero, the OOM killer simply kills the task that
+triggered the out-of-memory condition. This avoids the expensive
+tasklist scan.
+
+If panic_on_oom is selected, it takes precedence over whatever value
+is used in oom_kill_allocating_task.
+
+The default value is 0.
+
+==============================================================
+
+mmap_min_addr
+
+This file indicates the amount of address space which a user process will
+be restricted from mmaping. Since kernel null dereference bugs could
+accidentally operate based on the information in the first couple of pages
+of memory userspace processes should not be allowed to write to them. By
+default this value is set to 0 and no protections will be enforced by the
+security module. Setting this value to something like 64k will allow the
+vast majority of applications to work correctly and provide defense in depth
+against future potential kernel bugs.
+
+==============================================================
+
+numa_zonelist_order
+
+This sysctl is only for NUMA.
+'where the memory is allocated from' is controlled by zonelists.
+(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
+ you may be able to read ZONE_DMA as ZONE_DMA32...)
+
+In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following.
+ZONE_NORMAL -> ZONE_DMA
+This means that a memory allocation request for GFP_KERNEL will
+get memory from ZONE_DMA only when ZONE_NORMAL is not available.
+
+In NUMA case, you can think of following 2 types of order.
+Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL
+
+(A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL
+(B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA.
+
+Type(A) offers the best locality for processes on Node(0), but ZONE_DMA
+will be used before ZONE_NORMAL exhaustion. This increases possibility of
+out-of-memory(OOM) of ZONE_DMA because ZONE_DMA is tend to be small.
+
+Type(B) cannot offer the best locality but is more robust against OOM of
+the DMA zone.
+
+Type(A) is called as "Node" order. Type (B) is "Zone" order.
+
+"Node order" orders the zonelists by node, then by zone within each node.
+Specify "[Nn]ode" for zone order
+
+"Zone Order" orders the zonelists by zone type, then by node within each
+zone. Specify "[Zz]one"for zode order.
+
+Specify "[Dd]efault" to request automatic configuration. Autoconfiguration
+will select "node" order in following case.
+(1) if the DMA zone does not exist or
+(2) if the DMA zone comprises greater than 50% of the available memory or
+(3) if any node's DMA zone comprises greater than 60% of its local memory and
+ the amount of local memory is big enough.
+
+Otherwise, "zone" order will be selected. Default order is recommended unless
+this is causing problems for your system/application.
+
+==============================================================
+
+nr_hugepages
+
+Change the minimum size of the hugepage pool.
+
+See Documentation/vm/hugetlbpage.txt
+
+==============================================================
+
+nr_overcommit_hugepages
+
+Change the maximum size of the hugepage pool. The maximum is
+nr_hugepages + nr_overcommit_hugepages.
+
+See Documentation/vm/hugetlbpage.txt
diff --git a/Documentation/sysfs-rules.txt b/Documentation/sysfs-rules.txt
new file mode 100644
index 0000000..6049a2a
--- /dev/null
+++ b/Documentation/sysfs-rules.txt
@@ -0,0 +1,163 @@
+Rules on how to access information in the Linux kernel sysfs
+
+The kernel-exported sysfs exports internal kernel implementation details
+and depends on internal kernel structures and layout. It is agreed upon
+by the kernel developers that the Linux kernel does not provide a stable
+internal API. Therefore, there are aspects of the sysfs interface that
+may not be stable across kernel releases.
+
+To minimize the risk of breaking users of sysfs, which are in most cases
+low-level userspace applications, with a new kernel release, the users
+of sysfs must follow some rules to use an as-abstract-as-possible way to
+access this filesystem. The current udev and HAL programs already
+implement this and users are encouraged to plug, if possible, into the
+abstractions these programs provide instead of accessing sysfs directly.
+
+But if you really do want or need to access sysfs directly, please follow
+the following rules and then your programs should work with future
+versions of the sysfs interface.
+
+- Do not use libsysfs
+ It makes assumptions about sysfs which are not true. Its API does not
+ offer any abstraction, it exposes all the kernel driver-core
+ implementation details in its own API. Therefore it is not better than
+ reading directories and opening the files yourself.
+ Also, it is not actively maintained, in the sense of reflecting the
+ current kernel development. The goal of providing a stable interface
+ to sysfs has failed; it causes more problems than it solves. It
+ violates many of the rules in this document.
+
+- sysfs is always at /sys
+ Parsing /proc/mounts is a waste of time. Other mount points are a
+ system configuration bug you should not try to solve. For test cases,
+ possibly support a SYSFS_PATH environment variable to overwrite the
+ application's behavior, but never try to search for sysfs. Never try
+ to mount it, if you are not an early boot script.
+
+- devices are only "devices"
+ There is no such thing like class-, bus-, physical devices,
+ interfaces, and such that you can rely on in userspace. Everything is
+ just simply a "device". Class-, bus-, physical, ... types are just
+ kernel implementation details which should not be expected by
+ applications that look for devices in sysfs.
+
+ The properties of a device are:
+ o devpath (/devices/pci0000:00/0000:00:1d.1/usb2/2-2/2-2:1.0)
+ - identical to the DEVPATH value in the event sent from the kernel
+ at device creation and removal
+ - the unique key to the device at that point in time
+ - the kernel's path to the device directory without the leading
+ /sys, and always starting with with a slash
+ - all elements of a devpath must be real directories. Symlinks
+ pointing to /sys/devices must always be resolved to their real
+ target and the target path must be used to access the device.
+ That way the devpath to the device matches the devpath of the
+ kernel used at event time.
+ - using or exposing symlink values as elements in a devpath string
+ is a bug in the application
+
+ o kernel name (sda, tty, 0000:00:1f.2, ...)
+ - a directory name, identical to the last element of the devpath
+ - applications need to handle spaces and characters like '!' in
+ the name
+
+ o subsystem (block, tty, pci, ...)
+ - simple string, never a path or a link
+ - retrieved by reading the "subsystem"-link and using only the
+ last element of the target path
+
+ o driver (tg3, ata_piix, uhci_hcd)
+ - a simple string, which may contain spaces, never a path or a
+ link
+ - it is retrieved by reading the "driver"-link and using only the
+ last element of the target path
+ - devices which do not have "driver"-link just do not have a
+ driver; copying the driver value in a child device context is a
+ bug in the application
+
+ o attributes
+ - the files in the device directory or files below subdirectories
+ of the same device directory
+ - accessing attributes reached by a symlink pointing to another device,
+ like the "device"-link, is a bug in the application
+
+ Everything else is just a kernel driver-core implementation detail
+ that should not be assumed to be stable across kernel releases.
+
+- Properties of parent devices never belong into a child device.
+ Always look at the parent devices themselves for determining device
+ context properties. If the device 'eth0' or 'sda' does not have a
+ "driver"-link, then this device does not have a driver. Its value is empty.
+ Never copy any property of the parent-device into a child-device. Parent
+ device properties may change dynamically without any notice to the
+ child device.
+
+- Hierarchy in a single device tree
+ There is only one valid place in sysfs where hierarchy can be examined
+ and this is below: /sys/devices.
+ It is planned that all device directories will end up in the tree
+ below this directory.
+
+- Classification by subsystem
+ There are currently three places for classification of devices:
+ /sys/block, /sys/class and /sys/bus. It is planned that these will
+ not contain any device directories themselves, but only flat lists of
+ symlinks pointing to the unified /sys/devices tree.
+ All three places have completely different rules on how to access
+ device information. It is planned to merge all three
+ classification directories into one place at /sys/subsystem,
+ following the layout of the bus directories. All buses and
+ classes, including the converted block subsystem, will show up
+ there.
+ The devices belonging to a subsystem will create a symlink in the
+ "devices" directory at /sys/subsystem/<name>/devices.
+
+ If /sys/subsystem exists, /sys/bus, /sys/class and /sys/block can be
+ ignored. If it does not exist, you have always to scan all three
+ places, as the kernel is free to move a subsystem from one place to
+ the other, as long as the devices are still reachable by the same
+ subsystem name.
+
+ Assuming /sys/class/<subsystem> and /sys/bus/<subsystem>, or
+ /sys/block and /sys/class/block are not interchangeable is a bug in
+ the application.
+
+- Block
+ The converted block subsystem at /sys/class/block or
+ /sys/subsystem/block will contain the links for disks and partitions
+ at the same level, never in a hierarchy. Assuming the block subsytem to
+ contain only disks and not partition devices in the same flat list is
+ a bug in the application.
+
+- "device"-link and <subsystem>:<kernel name>-links
+ Never depend on the "device"-link. The "device"-link is a workaround
+ for the old layout, where class devices are not created in
+ /sys/devices/ like the bus devices. If the link-resolving of a
+ device directory does not end in /sys/devices/, you can use the
+ "device"-link to find the parent devices in /sys/devices/. That is the
+ single valid use of the "device"-link; it must never appear in any
+ path as an element. Assuming the existence of the "device"-link for
+ a device in /sys/devices/ is a bug in the application.
+ Accessing /sys/class/net/eth0/device is a bug in the application.
+
+ Never depend on the class-specific links back to the /sys/class
+ directory. These links are also a workaround for the design mistake
+ that class devices are not created in /sys/devices. If a device
+ directory does not contain directories for child devices, these links
+ may be used to find the child devices in /sys/class. That is the single
+ valid use of these links; they must never appear in any path as an
+ element. Assuming the existence of these links for devices which are
+ real child device directories in the /sys/devices tree is a bug in
+ the application.
+
+ It is planned to remove all these links when all class device
+ directories live in /sys/devices.
+
+- Position of devices along device chain can change.
+ Never depend on a specific parent device position in the devpath,
+ or the chain of parent devices. The kernel is free to insert devices into
+ the chain. You must always request the parent device you are looking for
+ by its subsystem value. You need to walk up the chain until you find
+ the device that matches the expected subsystem. Depending on a specific
+ position of a parent device or exposing relative paths using "../" to
+ access the chain of parents is a bug in the application.
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
new file mode 100644
index 0000000..10a0263
--- /dev/null
+++ b/Documentation/sysrq.txt
@@ -0,0 +1,225 @@
+Linux Magic System Request Key Hacks
+Documentation for sysrq.c
+Last update: 2007-AUG-04
+
+* What is the magic SysRq key?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+It is a 'magical' key combo you can hit which the kernel will respond to
+regardless of whatever else it is doing, unless it is completely locked up.
+
+* How do I enable the magic SysRq key?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+You need to say "yes" to 'Magic SysRq key (CONFIG_MAGIC_SYSRQ)' when
+configuring the kernel. When running a kernel with SysRq compiled in,
+/proc/sys/kernel/sysrq controls the functions allowed to be invoked via
+the SysRq key. By default the file contains 1 which means that every
+possible SysRq request is allowed (in older versions SysRq was disabled
+by default, and you were required to specifically enable it at run-time
+but this is not the case any more). Here is the list of possible values
+in /proc/sys/kernel/sysrq:
+ 0 - disable sysrq completely
+ 1 - enable all functions of sysrq
+ >1 - bitmask of allowed sysrq functions (see below for detailed function
+ description):
+ 2 - enable control of console logging level
+ 4 - enable control of keyboard (SAK, unraw)
+ 8 - enable debugging dumps of processes etc.
+ 16 - enable sync command
+ 32 - enable remount read-only
+ 64 - enable signalling of processes (term, kill, oom-kill)
+ 128 - allow reboot/poweroff
+ 256 - allow nicing of all RT tasks
+
+You can set the value in the file by the following command:
+ echo "number" >/proc/sys/kernel/sysrq
+
+Note that the value of /proc/sys/kernel/sysrq influences only the invocation
+via a keyboard. Invocation of any operation via /proc/sysrq-trigger is always
+allowed (by a user with admin privileges).
+
+* How do I use the magic SysRq key?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+On x86 - You press the key combo 'ALT-SysRq-<command key>'. Note - Some
+ keyboards may not have a key labeled 'SysRq'. The 'SysRq' key is
+ also known as the 'Print Screen' key. Also some keyboards cannot
+ handle so many keys being pressed at the same time, so you might
+ have better luck with "press Alt", "press SysRq", "release SysRq",
+ "press <command key>", release everything.
+
+On SPARC - You press 'ALT-STOP-<command key>', I believe.
+
+On the serial console (PC style standard serial ports only) -
+ You send a BREAK, then within 5 seconds a command key. Sending
+ BREAK twice is interpreted as a normal BREAK.
+
+On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
+ Print Screen (or F13) - <command key> may suffice.
+
+On other - If you know of the key combos for other architectures, please
+ let me know so I can add them to this section.
+
+On all - write a character to /proc/sysrq-trigger. e.g.:
+
+ echo t > /proc/sysrq-trigger
+
+* What are the 'command' keys?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+'b' - Will immediately reboot the system without syncing or unmounting
+ your disks.
+
+'c' - Will perform a kexec reboot in order to take a crashdump.
+
+'d' - Shows all locks that are held.
+
+'e' - Send a SIGTERM to all processes, except for init.
+
+'f' - Will call oom_kill to kill a memory hog process.
+
+'g' - Used by kgdb on ppc and sh platforms.
+
+'h' - Will display help (actually any other key than those listed
+ here will display help. but 'h' is easy to remember :-)
+
+'i' - Send a SIGKILL to all processes, except for init.
+
+'k' - Secure Access Key (SAK) Kills all programs on the current virtual
+ console. NOTE: See important comments below in SAK section.
+
+'l' - Shows a stack backtrace for all active CPUs.
+
+'m' - Will dump current memory info to your console.
+
+'n' - Used to make RT tasks nice-able
+
+'o' - Will shut your system off (if configured and supported).
+
+'p' - Will dump the current registers and flags to your console.
+
+'q' - Will dump per CPU lists of all armed hrtimers (but NOT regular
+ timer_list timers) and detailed information about all
+ clockevent devices.
+
+'r' - Turns off keyboard raw mode and sets it to XLATE.
+
+'s' - Will attempt to sync all mounted filesystems.
+
+'t' - Will dump a list of current tasks and their information to your
+ console.
+
+'u' - Will attempt to remount all mounted filesystems read-only.
+
+'v' - Dumps Voyager SMP processor info to your console.
+
+'w' - Dumps tasks that are in uninterruptable (blocked) state.
+
+'x' - Used by xmon interface on ppc/powerpc platforms.
+
+'0'-'9' - Sets the console log level, controlling which kernel messages
+ will be printed to your console. ('0', for example would make
+ it so that only emergency messages like PANICs or OOPSes would
+ make it to your console.)
+
+* Okay, so what can I use them for?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Well, un'R'aw is very handy when your X server or a svgalib program crashes.
+
+sa'K' (Secure Access Key) is useful when you want to be sure there is no
+trojan program running at console which could grab your password
+when you would try to login. It will kill all programs on given console,
+thus letting you make sure that the login prompt you see is actually
+the one from init, not some trojan program.
+IMPORTANT: In its true form it is not a true SAK like the one in a :IMPORTANT
+IMPORTANT: c2 compliant system, and it should not be mistaken as :IMPORTANT
+IMPORTANT: such. :IMPORTANT
+ It seems others find it useful as (System Attention Key) which is
+useful when you want to exit a program that will not let you switch consoles.
+(For example, X or a svgalib program.)
+
+re'B'oot is good when you're unable to shut down. But you should also 'S'ync
+and 'U'mount first.
+
+'C'rashdump can be used to manually trigger a crashdump when the system is hung.
+The kernel needs to have been built with CONFIG_KEXEC enabled.
+
+'S'ync is great when your system is locked up, it allows you to sync your
+disks and will certainly lessen the chance of data loss and fscking. Note
+that the sync hasn't taken place until you see the "OK" and "Done" appear
+on the screen. (If the kernel is really in strife, you may not ever get the
+OK or Done message...)
+
+'U'mount is basically useful in the same ways as 'S'ync. I generally 'S'ync,
+'U'mount, then re'B'oot when my system locks. It's saved me many a fsck.
+Again, the unmount (remount read-only) hasn't taken place until you see the
+"OK" and "Done" message appear on the screen.
+
+The loglevels '0'-'9' are useful when your console is being flooded with
+kernel messages you do not want to see. Selecting '0' will prevent all but
+the most urgent kernel messages from reaching your console. (They will
+still be logged if syslogd/klogd are alive, though.)
+
+t'E'rm and k'I'll are useful if you have some sort of runaway process you
+are unable to kill any other way, especially if it's spawning other
+processes.
+
+* Sometimes SysRq seems to get 'stuck' after using it, what can I do?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+That happens to me, also. I've found that tapping shift, alt, and control
+on both sides of the keyboard, and hitting an invalid sysrq sequence again
+will fix the problem. (i.e., something like alt-sysrq-z). Switching to another
+virtual console (ALT+Fn) and then back again should also help.
+
+* I hit SysRq, but nothing seems to happen, what's wrong?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+There are some keyboards that send different scancodes for SysRq than the
+pre-defined 0x54. So if SysRq doesn't work out of the box for a certain
+keyboard, run 'showkey -s' to find out the proper scancode sequence. Then
+use 'setkeycodes <sequence> 84' to define this sequence to the usual SysRq
+code (84 is decimal for 0x54). It's probably best to put this command in a
+boot script. Oh, and by the way, you exit 'showkey' by not typing anything
+for ten seconds.
+
+* I want to add SysRQ key events to a module, how does it work?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+In order to register a basic function with the table, you must first include
+the header 'include/linux/sysrq.h', this will define everything else you need.
+Next, you must create a sysrq_key_op struct, and populate it with A) the key
+handler function you will use, B) a help_msg string, that will print when SysRQ
+prints help, and C) an action_msg string, that will print right before your
+handler is called. Your handler must conform to the prototype in 'sysrq.h'.
+
+After the sysrq_key_op is created, you can call the kernel function
+register_sysrq_key(int key, struct sysrq_key_op *op_p); this will
+register the operation pointed to by 'op_p' at table key 'key',
+if that slot in the table is blank. At module unload time, you must call
+the function unregister_sysrq_key(int key, struct sysrq_key_op *op_p), which
+will remove the key op pointed to by 'op_p' from the key 'key', if and only if
+it is currently registered in that slot. This is in case the slot has been
+overwritten since you registered it.
+
+The Magic SysRQ system works by registering key operations against a key op
+lookup table, which is defined in 'drivers/char/sysrq.c'. This key table has
+a number of operations registered into it at compile time, but is mutable,
+and 2 functions are exported for interface to it:
+ register_sysrq_key and unregister_sysrq_key.
+Of course, never ever leave an invalid pointer in the table. I.e., when
+your module that called register_sysrq_key() exits, it must call
+unregister_sysrq_key() to clean up the sysrq key table entry that it used.
+Null pointers in the table are always safe. :)
+
+If for some reason you feel the need to call the handle_sysrq function from
+within a function called by handle_sysrq, you must be aware that you are in
+a lock (you are also in an interrupt handler, which means don't sleep!), so
+you must call __handle_sysrq_nolock instead.
+
+* I have more questions, who can I ask?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+And I'll answer any questions about the registration system you got, also
+responding as soon as possible.
+ -Crutcher
+
+* Credits
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Written by Mydraal <vulpyne@vulpyne.net>
+Updated by Adam Sulmicki <adam@cfar.umd.edu>
+Updated by Jeremy M. Dolan <jmd@turbogeek.org> 2001/01/28 10:15:59
+Added to by Crutcher Dunnavant <crutcher+kernel@datastacks.com>
diff --git a/Documentation/telephony/00-INDEX b/Documentation/telephony/00-INDEX
new file mode 100644
index 0000000..4ffe0ed
--- /dev/null
+++ b/Documentation/telephony/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+ - this file.
+ixj.txt
+ - document describing the Quicknet drivers.
diff --git a/Documentation/telephony/ixj.txt b/Documentation/telephony/ixj.txt
new file mode 100644
index 0000000..44d1240
--- /dev/null
+++ b/Documentation/telephony/ixj.txt
@@ -0,0 +1,399 @@
+Linux Quicknet-Drivers-Howto
+Quicknet Technologies, Inc. (www.quicknet.net)
+Version 0.3.4 December 18, 1999
+
+1.0 Introduction
+
+This document describes the first GPL release version of the Linux
+driver for the Quicknet Internet PhoneJACK and Internet LineJACK
+cards. More information about these cards is available at
+www.quicknet.net. The driver version discussed in this document is
+0.3.4.
+
+These cards offer nice telco style interfaces to use your standard
+telephone/key system/PBX as the user interface for VoIP applications.
+The Internet LineJACK also offers PSTN connectivity for a single line
+Internet to PSTN gateway. Of course, you can add more than one card
+to a system to obtain multi-line functionality. At this time, the
+driver supports the POTS port on both the Internet PhoneJACK and the
+Internet LineJACK, but the PSTN port on the latter card is not yet
+supported.
+
+This document, and the drivers for the cards, are intended for a
+limited audience that includes technically capable programmers who
+would like to experiment with Quicknet cards. The drivers are
+considered in ALPHA status and are not yet considered stable enough
+for general, widespread use in an unlimited audience.
+
+That's worth saying again:
+
+THE LINUX DRIVERS FOR QUICKNET CARDS ARE PRESENTLY IN A ALPHA STATE
+AND SHOULD NOT BE CONSIDERED AS READY FOR NORMAL WIDESPREAD USE.
+
+They are released early in the spirit of Internet development and to
+make this technology available to innovators who would benefit from
+early exposure.
+
+When we promote the device driver to "beta" level it will be
+considered ready for non-programmer, non-technical users. Until then,
+please be aware that these drivers may not be stable and may affect
+the performance of your system.
+
+
+1.1 Latest Additions/Improvements
+
+The 0.3.4 version of the driver is the first GPL release. Several
+features had to be removed from the prior binary only module, mostly
+for reasons of Intellectual Property rights. We can't release
+information that is not ours - so certain aspects of the driver had to
+be removed to protect the rights of others.
+
+Specifically, very old Internet PhoneJACK cards have non-standard
+G.723.1 codecs (due to the early nature of the DSPs in those days).
+The auto-conversion code to bring those cards into compliance with
+todays standards is available as a binary only module to those people
+needing it. If you bought your card after 1997 or so, you are OK -
+it's only the very old cards that are affected.
+
+Also, the code to download G.728/G.729/G.729a codecs to the DSP is
+available as a binary only module as well. This IP is not ours to
+release.
+
+Hooks are built into the GPL driver to allow it to work with other
+companion modules that are completely separate from this module.
+
+1.2 Copyright, Trademarks, Disclaimer, & Credits
+
+Copyright
+
+Copyright (c) 1999 Quicknet Technologies, Inc. Permission is granted
+to freely copy and distribute this document provided you preserve it
+in its original form. For corrections and minor changes contact the
+maintainer at linux@quicknet.net.
+
+Trademarks
+
+Internet PhoneJACK and Internet LineJACK are registered trademarks of
+Quicknet Technologies, Inc.
+
+Disclaimer
+
+Much of the info in this HOWTO is early information released by
+Quicknet Technologies, Inc. for the express purpose of allowing early
+testing and use of the Linux drivers developed for their products.
+While every attempt has been made to be thorough, complete and
+accurate, the information contained here may be unreliable and there
+are likely a number of errors in this document. Please let the
+maintainer know about them. Since this is free documentation, it
+should be obvious that neither I nor previous authors can be held
+legally responsible for any errors.
+
+Credits
+
+This HOWTO was written by:
+
+ Greg Herlein <gherlein@quicknet.net>
+ Ed Okerson <eokerson@quicknet.net>
+
+1.3 Future Plans: You Can Help
+
+Please let the maintainer know of any errors in facts, opinions,
+logic, spelling, grammar, clarity, links, etc. But first, if the date
+is over a month old, check to see that you have the latest
+version. Please send any info that you think belongs in this document.
+
+You can also contribute code and/or bug-fixes for the sample
+applications.
+
+
+1.4 Where to get things
+
+You can download the latest versions of the driver from:
+
+http://www.quicknet.net/develop.htm
+
+You can download the latest version of this document from:
+
+http://www.quicknet.net/develop.htm
+
+
+1.5 Mailing List
+
+Quicknet operates a mailing list to provide a public forum on using
+these drivers.
+
+To subscribe to the linux-sdk mailing list, send an email to:
+
+ majordomo@linux.quicknet.net
+
+In the body of the email, type:
+
+ subscribe linux-sdk <your-email-address>
+
+Please delete any signature block that you would normally add to the
+bottom of your email - it tends to confuse majordomo.
+
+To send mail to the list, address your mail to
+
+ linux-sdk@linux.quicknet.net
+
+Your message will go out to everyone on the list.
+
+To unsubscribe to the linux-sdk mailing list, send an email to:
+
+ majordomo@linux.quicknet.net
+
+In the body of the email, type:
+
+ unsubscribe linux-sdk <your-email-address>
+
+
+
+2.0 Requirements
+
+2.1 Quicknet Card(s)
+
+You will need at least one Internet PhoneJACK or Internet LineJACK
+cards. These are ISA or PCI bus devices that use Plug-n-Play for
+configuration, and use no IRQs. The driver will support up to 16
+cards in any one system, of any mix between the two types.
+
+Note that you will need two cards to do any useful testing alone, since
+you will need a card on both ends of the connection. Of course, if
+you are doing collaborative work, perhaps your friends or coworkers
+have cards too. If not, we'll gladly sell them some!
+
+
+2.2 ISAPNP
+
+Since the Quicknet cards are Plug-n-Play devices, you will need the
+isapnp tools package to configure the cards, or you can use the isapnp
+module to autoconfigure them. The former package probably came with
+your Linux distribution. Documentation on this package is available
+online at:
+
+http://mailer.wiwi.uni-marburg.de/linux/LDP/HOWTO/Plug-and-Play-HOWTO.html
+
+The isapnp autoconfiguration is available on the Quicknet website at:
+
+ http://www.quicknet.net/develop.htm
+
+though it may be in the kernel by the time you read this.
+
+
+3.0 Card Configuration
+
+If you did not get your drivers as part of the linux kernel, do the
+following to install them:
+
+ a. untar the distribution file. We use the following command:
+ tar -xvzf ixj-0.x.x.tgz
+
+This creates a subdirectory holding all the necessary files. Go to that
+subdirectory.
+
+ b. run the "ixj_dev_create" script to remove any stray device
+files left in the /dev directory, and to create the new officially
+designated device files. Note that the old devices were called
+/dev/ixj, and the new method uses /dev/phone.
+
+ c. type "make;make install" - this will compile and install the
+module.
+
+ d. type "depmod -av" to rebuild all your kernel version dependencies.
+
+ e. if you are using the isapnp module to configure the cards
+ automatically, then skip to step f. Otherwise, ensure that you
+ have run the isapnp configuration utility to properly configure
+ the cards.
+
+ e1. The Internet PhoneJACK has one configuration register that
+ requires 16 IO ports. The Internet LineJACK card has two
+ configuration registers and isapnp reports that IO 0
+ requires 16 IO ports and IO 1 requires 8. The Quicknet
+ driver assumes that these registers are configured to be
+ contiguous, i.e. if IO 0 is set to 0x340 then IO 1 should
+ be set to 0x350.
+
+ Make sure that none of the cards overlap if you have
+ multiple cards in the system.
+
+ If you are new to the isapnp tools, you can jumpstart
+ yourself by doing the following:
+
+ e2. go to the /etc directory and run pnpdump to get a blank
+ isapnp.conf file.
+
+ pnpdump > /etc/isapnp.conf
+
+ e3. edit the /etc/isapnp.conf file to set the IO warnings and
+ the register IO addresses. The IO warnings means that you
+ should find the line in the file that looks like this:
+
+ (CONFLICT (IO FATAL)(IRQ FATAL)(DMA FATAL)(MEM FATAL)) # or WARNING
+
+ and you should edit the line to look like this:
+
+ (CONFLICT (IO WARNING)(IRQ FATAL)(DMA FATAL)(MEM FATAL)) #
+ or WARNING
+
+ The next step is to set the IO port addresses. The issue
+ here is that isapnp does not identify all of the ports out
+ there. Specifically any device that does not have a driver
+ or module loaded by Linux will not be registered. This
+ includes older sound cards and network cards. We have
+ found that the IO port 0x300 is often used even though
+ isapnp claims that no-one is using those ports. We
+ recommend that for a single card installation that port
+ 0x340 (and 0x350) be used. The IO port line should change
+ from this:
+
+ (IO 0 (SIZE 16) (BASE 0x0300) (CHECK))
+
+ to this:
+
+ (IO 0 (SIZE 16) (BASE 0x0340) )
+
+ e4. if you have multiple Quicknet cards, make sure that you do
+ not have any overlaps. Be especially careful if you are
+ mixing Internet PhoneJACK and Internet LineJACK cards in
+ the same system. In these cases we recommend moving the
+ IO port addresses to the 0x400 block. Please note that on
+ a few machines the 0x400 series are used. Feel free to
+ experiment with other addresses. Our cards have been
+ proven to work using IO addresses of up to 0xFF0.
+
+ e5. the last step is to uncomment the activation line so the
+ drivers will be associated with the port. This means the
+ line (immediately below) the IO line should go from this:
+
+ # (ACT Y)
+
+ to this:
+
+ (ACT Y)
+
+ Once you have finished editing the isapnp.conf file you
+ must submit it into the pnp driverconfigure the cards.
+ This is done using the following command:
+
+ isapnp isapnp.conf
+
+ If this works you should see a line that identifies the
+ Quicknet device, the IO port(s) chosen, and a message
+ "Enabled OK".
+
+ f. if you are loading the module by hand, use insmod. An example
+of this would look like this:
+
+ insmod phonedev
+ insmod ixj dspio=0x320,0x310 xio=0,0x330
+
+Then verify the module loaded by running lsmod. If you are not using a
+module that matches your kernel version, you may need to "force" the
+load using the -f option in the insmod command.
+
+ insmod phonedev
+ insmod -f ixj dspio=0x320,0x310 xio=0,0x330
+
+
+If you are using isapnp to autoconfigure your card, then you do NOT
+need any of the above, though you need to use depmod to load the
+driver, like this:
+
+ depmod ixj
+
+which will result in the needed drivers getting loaded automatically.
+
+ g. if you are planning on having the kernel automatically request
+the module for you, then you need to edit /etc/conf.modules and add the
+following lines:
+
+ options ixj dspio=0x340 xio=0x330 ixjdebug=0
+
+If you do this, then when you execute an application that uses the
+module the kernel will request that it is loaded.
+
+ h. if you want non-root users to be able to read and write to the
+ixj devices (this is a good idea!) you should do the following:
+
+ - decide upon a group name to use and create that group if
+ needed. Add the user names to that group that you wish to
+ have access to the device. For example, we typically will
+ create a group named "ixj" in /etc/group and add all users
+ to that group that we want to run software that can use the
+ ixjX devices.
+
+ - change the permissions on the device files, like this:
+
+ chgrp ixj /dev/ixj*
+ chmod 660 /dev/ixj*
+
+Once this is done, then non-root users should be able to use the
+devices. If you have enabled autoloading of modules, then the user
+should be able to open the device and have the module loaded
+automatically for them.
+
+
+4.0 Driver Installation problems.
+
+We have tested these drivers on the 2.2.9, 2.2.10, 2.2.12, and 2.2.13 kernels
+and in all cases have eventually been able to get the drivers to load and
+run. We have found four types of problems that prevent this from happening.
+The problems and solutions are:
+
+ a. A step was missed in the installation. Go back and use section 3
+as a checklist. Many people miss running the ixj_dev_create script and thus
+never load the device names into the filesystem.
+
+ b. The kernel is inconsistently linked. We have found this problem in
+the Out Of the Box installation of several distributions. The symptoms
+are that neither driver will load, and that the unknown symbols include "jiffy"
+and "kmalloc". The solution is to recompile both the kernel and the
+modules. The command string for the final compile looks like this:
+
+ In the kernel directory:
+ 1. cp .config /tmp
+ 2. make mrproper
+ 3. cp /tmp/.config .
+ 4. make clean;make bzImage;make modules;make modules_install
+
+This rebuilds both the kernel and all the modules and makes sure they all
+have the same linkages. This generally solves the problem once the new
+kernel is installed and the system rebooted.
+
+ c. The kernel has been patched, then unpatched. This happens when
+someone decides to use an earlier kernel after they load a later kernel.
+The symptoms are proceeding through all three above steps and still not
+being able to load the driver. What has happened is that the generated
+header files are out of sync with the kernel itself. The solution is
+to recompile (again) using "make mrproper". This will remove and then
+regenerate all the necessary header files. Once this is done, then you
+need to install and reboot the kernel. We have not seen any problem
+loading one of our drivers after this treatment.
+
+5.0 Known Limitations
+
+We cannot currently play "dial-tone" and listen for DTMF digits at the
+same time using the ISA PhoneJACK. This is a bug in the 8020 DSP chip
+used on that product. All other Quicknet products function normally
+in this regard. We have a work-around, but it's not done yet. Until
+then, if you want dial-tone, you can always play a recorded dial-tone
+sound into the audio until you have gathered the DTMF digits.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
new file mode 100644
index 0000000..70d68ce
--- /dev/null
+++ b/Documentation/thermal/sysfs-api.txt
@@ -0,0 +1,266 @@
+Generic Thermal Sysfs driver How To
+=========================
+
+Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com>
+
+Updated: 2 January 2008
+
+Copyright (c) 2008 Intel Corporation
+
+
+0. Introduction
+
+The generic thermal sysfs provides a set of interfaces for thermal zone devices (sensors)
+and thermal cooling devices (fan, processor...) to register with the thermal management
+solution and to be a part of it.
+
+This how-to focuses on enabling new thermal zone and cooling devices to participate
+in thermal management.
+This solution is platform independent and any type of thermal zone devices and
+cooling devices should be able to make use of the infrastructure.
+
+The main task of the thermal sysfs driver is to expose thermal zone attributes as well
+as cooling device attributes to the user space.
+An intelligent thermal management application can make decisions based on inputs
+from thermal zone attributes (the current temperature and trip point temperature)
+and throttle appropriate devices.
+
+[0-*] denotes any positive number starting from 0
+[1-*] denotes any positive number starting from 1
+
+1. thermal sysfs driver interface functions
+
+1.1 thermal zone device interface
+1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *name, int trips,
+ void *devdata, struct thermal_zone_device_ops *ops)
+
+ This interface function adds a new thermal zone device (sensor) to
+ /sys/class/thermal folder as thermal_zone[0-*].
+ It tries to bind all the thermal cooling devices registered at the same time.
+
+ name: the thermal zone name.
+ trips: the total number of trip points this thermal zone supports.
+ devdata: device private data
+ ops: thermal zone device call-backs.
+ .bind: bind the thermal zone device with a thermal cooling device.
+ .unbind: unbind the thermal zone device with a thermal cooling device.
+ .get_temp: get the current temperature of the thermal zone.
+ .get_mode: get the current mode (user/kernel) of the thermal zone.
+ "kernel" means thermal management is done in kernel.
+ "user" will prevent kernel thermal driver actions upon trip points
+ so that user applications can take charge of thermal management.
+ .set_mode: set the mode (user/kernel) of the thermal zone.
+ .get_trip_type: get the type of certain trip point.
+ .get_trip_temp: get the temperature above which the certain trip point
+ will be fired.
+
+1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
+
+ This interface function removes the thermal zone device.
+ It deletes the corresponding entry form /sys/class/thermal folder and unbind all
+ the thermal cooling devices it uses.
+
+1.2 thermal cooling device interface
+1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name,
+ void *devdata, struct thermal_cooling_device_ops *)
+
+ This interface function adds a new thermal cooling device (fan/processor/...) to
+ /sys/class/thermal/ folder as cooling_device[0-*].
+ It tries to bind itself to all the thermal zone devices register at the same time.
+ name: the cooling device name.
+ devdata: device private data.
+ ops: thermal cooling devices call-backs.
+ .get_max_state: get the Maximum throttle state of the cooling device.
+ .get_cur_state: get the Current throttle state of the cooling device.
+ .set_cur_state: set the Current throttle state of the cooling device.
+
+1.2.2 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
+
+ This interface function remove the thermal cooling device.
+ It deletes the corresponding entry form /sys/class/thermal folder and unbind
+ itself from all the thermal zone devices using it.
+
+1.3 interface for binding a thermal zone device with a thermal cooling device
+1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
+ int trip, struct thermal_cooling_device *cdev);
+
+ This interface function bind a thermal cooling device to the certain trip point
+ of a thermal zone device.
+ This function is usually called in the thermal zone device .bind callback.
+ tz: the thermal zone device
+ cdev: thermal cooling device
+ trip: indicates which trip point the cooling devices is associated with
+ in this thermal zone.
+
+1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
+ int trip, struct thermal_cooling_device *cdev);
+
+ This interface function unbind a thermal cooling device from the certain trip point
+ of a thermal zone device.
+ This function is usually called in the thermal zone device .unbind callback.
+ tz: the thermal zone device
+ cdev: thermal cooling device
+ trip: indicates which trip point the cooling devices is associated with
+ in this thermal zone.
+
+2. sysfs attributes structure
+
+RO read only value
+RW read/write value
+
+Thermal sysfs attributes will be represented under /sys/class/thermal.
+Hwmon sysfs I/F extension is also available under /sys/class/hwmon
+if hwmon is compiled in or built as a module.
+
+Thermal zone device sys I/F, created once it's registered:
+/sys/class/thermal/thermal_zone[0-*]:
+ |-----type: Type of the thermal zone
+ |-----temp: Current temperature
+ |-----mode: Working mode of the thermal zone
+ |-----trip_point_[0-*]_temp: Trip point temperature
+ |-----trip_point_[0-*]_type: Trip point type
+
+Thermal cooling device sys I/F, created once it's registered:
+/sys/class/thermal/cooling_device[0-*]:
+ |-----type : Type of the cooling device(processor/fan/...)
+ |-----max_state: Maximum cooling state of the cooling device
+ |-----cur_state: Current cooling state of the cooling device
+
+
+These two dynamic attributes are created/removed in pairs.
+They represent the relationship between a thermal zone and its associated cooling device.
+They are created/removed for each
+thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device successful execution.
+
+/sys/class/thermal/thermal_zone[0-*]
+ |-----cdev[0-*]: The [0-*]th cooling device in the current thermal zone
+ |-----cdev[0-*]_trip_point: Trip point that cdev[0-*] is associated with
+
+Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
+the generic thermal driver also creates a hwmon sysfs I/F for each _type_ of
+thermal zone device. E.g. the generic thermal driver registers one hwmon class device
+and build the associated hwmon sysfs I/F for all the registered ACPI thermal zones.
+/sys/class/hwmon/hwmon[0-*]:
+ |-----name: The type of the thermal zone devices.
+ |-----temp[1-*]_input: The current temperature of thermal zone [1-*].
+ |-----temp[1-*]_critical: The critical trip point of thermal zone [1-*].
+Please read Documentation/hwmon/sysfs-interface for additional information.
+
+***************************
+* Thermal zone attributes *
+***************************
+
+type Strings which represent the thermal zone type.
+ This is given by thermal zone driver as part of registration.
+ Eg: "acpitz" indicates it's an ACPI thermal device.
+ In order to keep it consistent with hwmon sys attribute,
+ this should be a short, lowercase string,
+ not containing spaces nor dashes.
+ RO
+ Required
+
+temp Current temperature as reported by thermal zone (sensor)
+ Unit: millidegree Celsius
+ RO
+ Required
+
+mode One of the predefined values in [kernel, user]
+ This file gives information about the algorithm
+ that is currently managing the thermal zone.
+ It can be either default kernel based algorithm
+ or user space application.
+ RW
+ Optional
+ kernel = Thermal management in kernel thermal zone driver.
+ user = Preventing kernel thermal zone driver actions upon
+ trip points so that user application can take full
+ charge of the thermal management.
+
+trip_point_[0-*]_temp The temperature above which trip point will be fired
+ Unit: millidegree Celsius
+ RO
+ Optional
+
+trip_point_[0-*]_type Strings which indicate the type of the trip point
+ E.g. it can be one of critical, hot, passive,
+ active[0-*] for ACPI thermal zone.
+ RO
+ Optional
+
+cdev[0-*] Sysfs link to the thermal cooling device node where the sys I/F
+ for cooling device throttling control represents.
+ RO
+ Optional
+
+cdev[0-*]_trip_point The trip point with which cdev[0-*] is associated in this thermal zone
+ -1 means the cooling device is not associated with any trip point.
+ RO
+ Optional
+
+******************************
+* Cooling device attributes *
+******************************
+
+type String which represents the type of device
+ eg: For generic ACPI: this should be "Fan",
+ "Processor" or "LCD"
+ eg. For memory controller device on intel_menlow platform:
+ this should be "Memory controller"
+ RO
+ Required
+
+max_state The maximum permissible cooling state of this cooling device.
+ RO
+ Required
+
+cur_state The current cooling state of this cooling device.
+ the value can any integer numbers between 0 and max_state,
+ cur_state == 0 means no cooling
+ cur_state == max_state means the maximum cooling.
+ RW
+ Required
+
+3. A simple implementation
+
+ACPI thermal zone may support multiple trip points like critical/hot/passive/active.
+If an ACPI thermal zone supports critical, passive, active[0] and active[1] at the same time,
+it may register itself as a thermal_zone_device (thermal_zone1) with 4 trip points in all.
+It has one processor and one fan, which are both registered as thermal_cooling_device.
+If the processor is listed in _PSL method, and the fan is listed in _AL0 method,
+the sys I/F structure will be built like this:
+
+/sys/class/thermal:
+
+|thermal_zone1:
+ |-----type: acpitz
+ |-----temp: 37000
+ |-----mode: kernel
+ |-----trip_point_0_temp: 100000
+ |-----trip_point_0_type: critical
+ |-----trip_point_1_temp: 80000
+ |-----trip_point_1_type: passive
+ |-----trip_point_2_temp: 70000
+ |-----trip_point_2_type: active0
+ |-----trip_point_3_temp: 60000
+ |-----trip_point_3_type: active1
+ |-----cdev0: --->/sys/class/thermal/cooling_device0
+ |-----cdev0_trip_point: 1 /* cdev0 can be used for passive */
+ |-----cdev1: --->/sys/class/thermal/cooling_device3
+ |-----cdev1_trip_point: 2 /* cdev1 can be used for active[0]*/
+
+|cooling_device0:
+ |-----type: Processor
+ |-----max_state: 8
+ |-----cur_state: 0
+
+|cooling_device3:
+ |-----type: Fan
+ |-----max_state: 2
+ |-----cur_state: 0
+
+/sys/class/hwmon:
+
+|hwmon0:
+ |-----name: acpitz
+ |-----temp1_input: 37000
+ |-----temp1_crit: 100000
diff --git a/Documentation/timers/00-INDEX b/Documentation/timers/00-INDEX
new file mode 100644
index 0000000..397dc35
--- /dev/null
+++ b/Documentation/timers/00-INDEX
@@ -0,0 +1,10 @@
+00-INDEX
+ - this file
+highres.txt
+ - High resolution timers and dynamic ticks design notes
+hpet.txt
+ - High Precision Event Timer Driver for Linux
+hrtimers.txt
+ - subsystem for high-resolution kernel timers
+timer_stats.txt
+ - timer usage statistics
diff --git a/Documentation/timers/highres.txt b/Documentation/timers/highres.txt
new file mode 100644
index 0000000..2133223
--- /dev/null
+++ b/Documentation/timers/highres.txt
@@ -0,0 +1,249 @@
+High resolution timers and dynamic ticks design notes
+-----------------------------------------------------
+
+Further information can be found in the paper of the OLS 2006 talk "hrtimers
+and beyond". The paper is part of the OLS 2006 Proceedings Volume 1, which can
+be found on the OLS website:
+http://www.linuxsymposium.org/2006/linuxsymposium_procv1.pdf
+
+The slides to this talk are available from:
+http://tglx.de/projects/hrtimers/ols2006-hrtimers.pdf
+
+The slides contain five figures (pages 2, 15, 18, 20, 22), which illustrate the
+changes in the time(r) related Linux subsystems. Figure #1 (p. 2) shows the
+design of the Linux time(r) system before hrtimers and other building blocks
+got merged into mainline.
+
+Note: the paper and the slides are talking about "clock event source", while we
+switched to the name "clock event devices" in meantime.
+
+The design contains the following basic building blocks:
+
+- hrtimer base infrastructure
+- timeofday and clock source management
+- clock event management
+- high resolution timer functionality
+- dynamic ticks
+
+
+hrtimer base infrastructure
+---------------------------
+
+The hrtimer base infrastructure was merged into the 2.6.16 kernel. Details of
+the base implementation are covered in Documentation/hrtimers/hrtimer.txt. See
+also figure #2 (OLS slides p. 15)
+
+The main differences to the timer wheel, which holds the armed timer_list type
+timers are:
+ - time ordered enqueueing into a rb-tree
+ - independent of ticks (the processing is based on nanoseconds)
+
+
+timeofday and clock source management
+-------------------------------------
+
+John Stultz's Generic Time Of Day (GTOD) framework moves a large portion of
+code out of the architecture-specific areas into a generic management
+framework, as illustrated in figure #3 (OLS slides p. 18). The architecture
+specific portion is reduced to the low level hardware details of the clock
+sources, which are registered in the framework and selected on a quality based
+decision. The low level code provides hardware setup and readout routines and
+initializes data structures, which are used by the generic time keeping code to
+convert the clock ticks to nanosecond based time values. All other time keeping
+related functionality is moved into the generic code. The GTOD base patch got
+merged into the 2.6.18 kernel.
+
+Further information about the Generic Time Of Day framework is available in the
+OLS 2005 Proceedings Volume 1:
+http://www.linuxsymposium.org/2005/linuxsymposium_procv1.pdf
+
+The paper "We Are Not Getting Any Younger: A New Approach to Time and
+Timers" was written by J. Stultz, D.V. Hart, & N. Aravamudan.
+
+Figure #3 (OLS slides p.18) illustrates the transformation.
+
+
+clock event management
+----------------------
+
+While clock sources provide read access to the monotonically increasing time
+value, clock event devices are used to schedule the next event
+interrupt(s). The next event is currently defined to be periodic, with its
+period defined at compile time. The setup and selection of the event device
+for various event driven functionalities is hardwired into the architecture
+dependent code. This results in duplicated code across all architectures and
+makes it extremely difficult to change the configuration of the system to use
+event interrupt devices other than those already built into the
+architecture. Another implication of the current design is that it is necessary
+to touch all the architecture-specific implementations in order to provide new
+functionality like high resolution timers or dynamic ticks.
+
+The clock events subsystem tries to address this problem by providing a generic
+solution to manage clock event devices and their usage for the various clock
+event driven kernel functionalities. The goal of the clock event subsystem is
+to minimize the clock event related architecture dependent code to the pure
+hardware related handling and to allow easy addition and utilization of new
+clock event devices. It also minimizes the duplicated code across the
+architectures as it provides generic functionality down to the interrupt
+service handler, which is almost inherently hardware dependent.
+
+Clock event devices are registered either by the architecture dependent boot
+code or at module insertion time. Each clock event device fills a data
+structure with clock-specific property parameters and callback functions. The
+clock event management decides, by using the specified property parameters, the
+set of system functions a clock event device will be used to support. This
+includes the distinction of per-CPU and per-system global event devices.
+
+System-level global event devices are used for the Linux periodic tick. Per-CPU
+event devices are used to provide local CPU functionality such as process
+accounting, profiling, and high resolution timers.
+
+The management layer assigns one or more of the following functions to a clock
+event device:
+ - system global periodic tick (jiffies update)
+ - cpu local update_process_times
+ - cpu local profiling
+ - cpu local next event interrupt (non periodic mode)
+
+The clock event device delegates the selection of those timer interrupt related
+functions completely to the management layer. The clock management layer stores
+a function pointer in the device description structure, which has to be called
+from the hardware level handler. This removes a lot of duplicated code from the
+architecture specific timer interrupt handlers and hands the control over the
+clock event devices and the assignment of timer interrupt related functionality
+to the core code.
+
+The clock event layer API is rather small. Aside from the clock event device
+registration interface it provides functions to schedule the next event
+interrupt, clock event device notification service and support for suspend and
+resume.
+
+The framework adds about 700 lines of code which results in a 2KB increase of
+the kernel binary size. The conversion of i386 removes about 100 lines of
+code. The binary size decrease is in the range of 400 byte. We believe that the
+increase of flexibility and the avoidance of duplicated code across
+architectures justifies the slight increase of the binary size.
+
+The conversion of an architecture has no functional impact, but allows to
+utilize the high resolution and dynamic tick functionalities without any change
+to the clock event device and timer interrupt code. After the conversion the
+enabling of high resolution timers and dynamic ticks is simply provided by
+adding the kernel/time/Kconfig file to the architecture specific Kconfig and
+adding the dynamic tick specific calls to the idle routine (a total of 3 lines
+added to the idle function and the Kconfig file)
+
+Figure #4 (OLS slides p.20) illustrates the transformation.
+
+
+high resolution timer functionality
+-----------------------------------
+
+During system boot it is not possible to use the high resolution timer
+functionality, while making it possible would be difficult and would serve no
+useful function. The initialization of the clock event device framework, the
+clock source framework (GTOD) and hrtimers itself has to be done and
+appropriate clock sources and clock event devices have to be registered before
+the high resolution functionality can work. Up to the point where hrtimers are
+initialized, the system works in the usual low resolution periodic mode. The
+clock source and the clock event device layers provide notification functions
+which inform hrtimers about availability of new hardware. hrtimers validates
+the usability of the registered clock sources and clock event devices before
+switching to high resolution mode. This ensures also that a kernel which is
+configured for high resolution timers can run on a system which lacks the
+necessary hardware support.
+
+The high resolution timer code does not support SMP machines which have only
+global clock event devices. The support of such hardware would involve IPI
+calls when an interrupt happens. The overhead would be much larger than the
+benefit. This is the reason why we currently disable high resolution and
+dynamic ticks on i386 SMP systems which stop the local APIC in C3 power
+state. A workaround is available as an idea, but the problem has not been
+tackled yet.
+
+The time ordered insertion of timers provides all the infrastructure to decide
+whether the event device has to be reprogrammed when a timer is added. The
+decision is made per timer base and synchronized across per-cpu timer bases in
+a support function. The design allows the system to utilize separate per-CPU
+clock event devices for the per-CPU timer bases, but currently only one
+reprogrammable clock event device per-CPU is utilized.
+
+When the timer interrupt happens, the next event interrupt handler is called
+from the clock event distribution code and moves expired timers from the
+red-black tree to a separate double linked list and invokes the softirq
+handler. An additional mode field in the hrtimer structure allows the system to
+execute callback functions directly from the next event interrupt handler. This
+is restricted to code which can safely be executed in the hard interrupt
+context. This applies, for example, to the common case of a wakeup function as
+used by nanosleep. The advantage of executing the handler in the interrupt
+context is the avoidance of up to two context switches - from the interrupted
+context to the softirq and to the task which is woken up by the expired
+timer.
+
+Once a system has switched to high resolution mode, the periodic tick is
+switched off. This disables the per system global periodic clock event device -
+e.g. the PIT on i386 SMP systems.
+
+The periodic tick functionality is provided by an per-cpu hrtimer. The callback
+function is executed in the next event interrupt context and updates jiffies
+and calls update_process_times and profiling. The implementation of the hrtimer
+based periodic tick is designed to be extended with dynamic tick functionality.
+This allows to use a single clock event device to schedule high resolution
+timer and periodic events (jiffies tick, profiling, process accounting) on UP
+systems. This has been proved to work with the PIT on i386 and the Incrementer
+on PPC.
+
+The softirq for running the hrtimer queues and executing the callbacks has been
+separated from the tick bound timer softirq to allow accurate delivery of high
+resolution timer signals which are used by itimer and POSIX interval
+timers. The execution of this softirq can still be delayed by other softirqs,
+but the overall latencies have been significantly improved by this separation.
+
+Figure #5 (OLS slides p.22) illustrates the transformation.
+
+
+dynamic ticks
+-------------
+
+Dynamic ticks are the logical consequence of the hrtimer based periodic tick
+replacement (sched_tick). The functionality of the sched_tick hrtimer is
+extended by three functions:
+
+- hrtimer_stop_sched_tick
+- hrtimer_restart_sched_tick
+- hrtimer_update_jiffies
+
+hrtimer_stop_sched_tick() is called when a CPU goes into idle state. The code
+evaluates the next scheduled timer event (from both hrtimers and the timer
+wheel) and in case that the next event is further away than the next tick it
+reprograms the sched_tick to this future event, to allow longer idle sleeps
+without worthless interruption by the periodic tick. The function is also
+called when an interrupt happens during the idle period, which does not cause a
+reschedule. The call is necessary as the interrupt handler might have armed a
+new timer whose expiry time is before the time which was identified as the
+nearest event in the previous call to hrtimer_stop_sched_tick.
+
+hrtimer_restart_sched_tick() is called when the CPU leaves the idle state before
+it calls schedule(). hrtimer_restart_sched_tick() resumes the periodic tick,
+which is kept active until the next call to hrtimer_stop_sched_tick().
+
+hrtimer_update_jiffies() is called from irq_enter() when an interrupt happens
+in the idle period to make sure that jiffies are up to date and the interrupt
+handler has not to deal with an eventually stale jiffy value.
+
+The dynamic tick feature provides statistical values which are exported to
+userspace via /proc/stats and can be made available for enhanced power
+management control.
+
+The implementation leaves room for further development like full tickless
+systems, where the time slice is controlled by the scheduler, variable
+frequency profiling, and a complete removal of jiffies in the future.
+
+
+Aside the current initial submission of i386 support, the patchset has been
+extended to x86_64 and ARM already. Initial (work in progress) support is also
+available for MIPS and PowerPC.
+
+ Thomas, Ingo
+
+
+
diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt
new file mode 100644
index 0000000..e7c09ab
--- /dev/null
+++ b/Documentation/timers/hpet.txt
@@ -0,0 +1,299 @@
+ High Precision Event Timer Driver for Linux
+
+The High Precision Event Timer (HPET) hardware follows a specification
+by Intel and Microsoft which can be found at
+
+ http://www.intel.com/technology/architecture/hpetspec.htm
+
+Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision")
+and up to 32 comparators. Normally three or more comparators are provided,
+each of which can generate oneshot interupts and at least one of which has
+additional hardware to support periodic interrupts. The comparators are
+also called "timers", which can be misleading since usually timers are
+independent of each other ... these share a counter, complicating resets.
+
+HPET devices can support two interrupt routing modes. In one mode, the
+comparators are additional interrupt sources with no particular system
+role. Many x86 BIOS writers don't route HPET interrupts at all, which
+prevents use of that mode. They support the other "legacy replacement"
+mode where the first two comparators block interrupts from 8254 timers
+and from the RTC.
+
+The driver supports detection of HPET driver allocation and initialization
+of the HPET before the driver module_init routine is called. This enables
+platform code which uses timer 0 or 1 as the main timer to intercept HPET
+initialization. An example of this initialization can be found in
+arch/x86/kernel/hpet.c.
+
+The driver provides a userspace API which resembles the API found in the
+RTC driver framework. An example user space program is provided below.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <memory.h>
+#include <malloc.h>
+#include <time.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <linux/hpet.h>
+
+
+extern void hpet_open_close(int, const char **);
+extern void hpet_info(int, const char **);
+extern void hpet_poll(int, const char **);
+extern void hpet_fasync(int, const char **);
+extern void hpet_read(int, const char **);
+
+#include <sys/poll.h>
+#include <sys/ioctl.h>
+#include <signal.h>
+
+struct hpet_command {
+ char *command;
+ void (*func)(int argc, const char ** argv);
+} hpet_command[] = {
+ {
+ "open-close",
+ hpet_open_close
+ },
+ {
+ "info",
+ hpet_info
+ },
+ {
+ "poll",
+ hpet_poll
+ },
+ {
+ "fasync",
+ hpet_fasync
+ },
+};
+
+int
+main(int argc, const char ** argv)
+{
+ int i;
+
+ argc--;
+ argv++;
+
+ if (!argc) {
+ fprintf(stderr, "-hpet: requires command\n");
+ return -1;
+ }
+
+
+ for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++)
+ if (!strcmp(argv[0], hpet_command[i].command)) {
+ argc--;
+ argv++;
+ fprintf(stderr, "-hpet: executing %s\n",
+ hpet_command[i].command);
+ hpet_command[i].func(argc, argv);
+ return 0;
+ }
+
+ fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]);
+
+ return -1;
+}
+
+void
+hpet_open_close(int argc, const char **argv)
+{
+ int fd;
+
+ if (argc != 1) {
+ fprintf(stderr, "hpet_open_close: device-name\n");
+ return;
+ }
+
+ fd = open(argv[0], O_RDONLY);
+ if (fd < 0)
+ fprintf(stderr, "hpet_open_close: open failed\n");
+ else
+ close(fd);
+
+ return;
+}
+
+void
+hpet_info(int argc, const char **argv)
+{
+}
+
+void
+hpet_poll(int argc, const char **argv)
+{
+ unsigned long freq;
+ int iterations, i, fd;
+ struct pollfd pfd;
+ struct hpet_info info;
+ struct timeval stv, etv;
+ struct timezone tz;
+ long usec;
+
+ if (argc != 3) {
+ fprintf(stderr, "hpet_poll: device-name freq iterations\n");
+ return;
+ }
+
+ freq = atoi(argv[1]);
+ iterations = atoi(argv[2]);
+
+ fd = open(argv[0], O_RDONLY);
+
+ if (fd < 0) {
+ fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]);
+ return;
+ }
+
+ if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
+ fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_INFO, &info) < 0) {
+ fprintf(stderr, "hpet_poll: failed to get info\n");
+ goto out;
+ }
+
+ fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags);
+
+ if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
+ fprintf(stderr, "hpet_poll: HPET_EPI failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_IE_ON, 0) < 0) {
+ fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n");
+ goto out;
+ }
+
+ pfd.fd = fd;
+ pfd.events = POLLIN;
+
+ for (i = 0; i < iterations; i++) {
+ pfd.revents = 0;
+ gettimeofday(&stv, &tz);
+ if (poll(&pfd, 1, -1) < 0)
+ fprintf(stderr, "hpet_poll: poll failed\n");
+ else {
+ long data;
+
+ gettimeofday(&etv, &tz);
+ usec = stv.tv_sec * 1000000 + stv.tv_usec;
+ usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec;
+
+ fprintf(stderr,
+ "hpet_poll: expired time = 0x%lx\n", usec);
+
+ fprintf(stderr, "hpet_poll: revents = 0x%x\n",
+ pfd.revents);
+
+ if (read(fd, &data, sizeof(data)) != sizeof(data)) {
+ fprintf(stderr, "hpet_poll: read failed\n");
+ }
+ else
+ fprintf(stderr, "hpet_poll: data 0x%lx\n",
+ data);
+ }
+ }
+
+out:
+ close(fd);
+ return;
+}
+
+static int hpet_sigio_count;
+
+static void
+hpet_sigio(int val)
+{
+ fprintf(stderr, "hpet_sigio: called\n");
+ hpet_sigio_count++;
+}
+
+void
+hpet_fasync(int argc, const char **argv)
+{
+ unsigned long freq;
+ int iterations, i, fd, value;
+ sig_t oldsig;
+ struct hpet_info info;
+
+ hpet_sigio_count = 0;
+ fd = -1;
+
+ if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) {
+ fprintf(stderr, "hpet_fasync: failed to set signal handler\n");
+ return;
+ }
+
+ if (argc != 3) {
+ fprintf(stderr, "hpet_fasync: device-name freq iterations\n");
+ goto out;
+ }
+
+ fd = open(argv[0], O_RDONLY);
+
+ if (fd < 0) {
+ fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]);
+ return;
+ }
+
+
+ if ((fcntl(fd, F_SETOWN, getpid()) == 1) ||
+ ((value = fcntl(fd, F_GETFL)) == 1) ||
+ (fcntl(fd, F_SETFL, value | O_ASYNC) == 1)) {
+ fprintf(stderr, "hpet_fasync: fcntl failed\n");
+ goto out;
+ }
+
+ freq = atoi(argv[1]);
+ iterations = atoi(argv[2]);
+
+ if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
+ fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_INFO, &info) < 0) {
+ fprintf(stderr, "hpet_fasync: failed to get info\n");
+ goto out;
+ }
+
+ fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags);
+
+ if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
+ fprintf(stderr, "hpet_fasync: HPET_EPI failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_IE_ON, 0) < 0) {
+ fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n");
+ goto out;
+ }
+
+ for (i = 0; i < iterations; i++) {
+ (void) pause();
+ fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count);
+ }
+
+out:
+ signal(SIGIO, oldsig);
+
+ if (fd >= 0)
+ close(fd);
+
+ return;
+}
diff --git a/Documentation/timers/hrtimers.txt b/Documentation/timers/hrtimers.txt
new file mode 100644
index 0000000..ce31f65
--- /dev/null
+++ b/Documentation/timers/hrtimers.txt
@@ -0,0 +1,178 @@
+
+hrtimers - subsystem for high-resolution kernel timers
+----------------------------------------------------
+
+This patch introduces a new subsystem for high-resolution kernel timers.
+
+One might ask the question: we already have a timer subsystem
+(kernel/timers.c), why do we need two timer subsystems? After a lot of
+back and forth trying to integrate high-resolution and high-precision
+features into the existing timer framework, and after testing various
+such high-resolution timer implementations in practice, we came to the
+conclusion that the timer wheel code is fundamentally not suitable for
+such an approach. We initially didn't believe this ('there must be a way
+to solve this'), and spent a considerable effort trying to integrate
+things into the timer wheel, but we failed. In hindsight, there are
+several reasons why such integration is hard/impossible:
+
+- the forced handling of low-resolution and high-resolution timers in
+ the same way leads to a lot of compromises, macro magic and #ifdef
+ mess. The timers.c code is very "tightly coded" around jiffies and
+ 32-bitness assumptions, and has been honed and micro-optimized for a
+ relatively narrow use case (jiffies in a relatively narrow HZ range)
+ for many years - and thus even small extensions to it easily break
+ the wheel concept, leading to even worse compromises. The timer wheel
+ code is very good and tight code, there's zero problems with it in its
+ current usage - but it is simply not suitable to be extended for
+ high-res timers.
+
+- the unpredictable [O(N)] overhead of cascading leads to delays which
+ necessitate a more complex handling of high resolution timers, which
+ in turn decreases robustness. Such a design still led to rather large
+ timing inaccuracies. Cascading is a fundamental property of the timer
+ wheel concept, it cannot be 'designed out' without unevitably
+ degrading other portions of the timers.c code in an unacceptable way.
+
+- the implementation of the current posix-timer subsystem on top of
+ the timer wheel has already introduced a quite complex handling of
+ the required readjusting of absolute CLOCK_REALTIME timers at
+ settimeofday or NTP time - further underlying our experience by
+ example: that the timer wheel data structure is too rigid for high-res
+ timers.
+
+- the timer wheel code is most optimal for use cases which can be
+ identified as "timeouts". Such timeouts are usually set up to cover
+ error conditions in various I/O paths, such as networking and block
+ I/O. The vast majority of those timers never expire and are rarely
+ recascaded because the expected correct event arrives in time so they
+ can be removed from the timer wheel before any further processing of
+ them becomes necessary. Thus the users of these timeouts can accept
+ the granularity and precision tradeoffs of the timer wheel, and
+ largely expect the timer subsystem to have near-zero overhead.
+ Accurate timing for them is not a core purpose - in fact most of the
+ timeout values used are ad-hoc. For them it is at most a necessary
+ evil to guarantee the processing of actual timeout completions
+ (because most of the timeouts are deleted before completion), which
+ should thus be as cheap and unintrusive as possible.
+
+The primary users of precision timers are user-space applications that
+utilize nanosleep, posix-timers and itimer interfaces. Also, in-kernel
+users like drivers and subsystems which require precise timed events
+(e.g. multimedia) can benefit from the availability of a separate
+high-resolution timer subsystem as well.
+
+While this subsystem does not offer high-resolution clock sources just
+yet, the hrtimer subsystem can be easily extended with high-resolution
+clock capabilities, and patches for that exist and are maturing quickly.
+The increasing demand for realtime and multimedia applications along
+with other potential users for precise timers gives another reason to
+separate the "timeout" and "precise timer" subsystems.
+
+Another potential benefit is that such a separation allows even more
+special-purpose optimization of the existing timer wheel for the low
+resolution and low precision use cases - once the precision-sensitive
+APIs are separated from the timer wheel and are migrated over to
+hrtimers. E.g. we could decrease the frequency of the timeout subsystem
+from 250 Hz to 100 HZ (or even smaller).
+
+hrtimer subsystem implementation details
+----------------------------------------
+
+the basic design considerations were:
+
+- simplicity
+
+- data structure not bound to jiffies or any other granularity. All the
+ kernel logic works at 64-bit nanoseconds resolution - no compromises.
+
+- simplification of existing, timing related kernel code
+
+another basic requirement was the immediate enqueueing and ordering of
+timers at activation time. After looking at several possible solutions
+such as radix trees and hashes, we chose the red black tree as the basic
+data structure. Rbtrees are available as a library in the kernel and are
+used in various performance-critical areas of e.g. memory management and
+file systems. The rbtree is solely used for time sorted ordering, while
+a separate list is used to give the expiry code fast access to the
+queued timers, without having to walk the rbtree.
+
+(This separate list is also useful for later when we'll introduce
+high-resolution clocks, where we need separate pending and expired
+queues while keeping the time-order intact.)
+
+Time-ordered enqueueing is not purely for the purposes of
+high-resolution clocks though, it also simplifies the handling of
+absolute timers based on a low-resolution CLOCK_REALTIME. The existing
+implementation needed to keep an extra list of all armed absolute
+CLOCK_REALTIME timers along with complex locking. In case of
+settimeofday and NTP, all the timers (!) had to be dequeued, the
+time-changing code had to fix them up one by one, and all of them had to
+be enqueued again. The time-ordered enqueueing and the storage of the
+expiry time in absolute time units removes all this complex and poorly
+scaling code from the posix-timer implementation - the clock can simply
+be set without having to touch the rbtree. This also makes the handling
+of posix-timers simpler in general.
+
+The locking and per-CPU behavior of hrtimers was mostly taken from the
+existing timer wheel code, as it is mature and well suited. Sharing code
+was not really a win, due to the different data structures. Also, the
+hrtimer functions now have clearer behavior and clearer names - such as
+hrtimer_try_to_cancel() and hrtimer_cancel() [which are roughly
+equivalent to del_timer() and del_timer_sync()] - so there's no direct
+1:1 mapping between them on the algorithmical level, and thus no real
+potential for code sharing either.
+
+Basic data types: every time value, absolute or relative, is in a
+special nanosecond-resolution type: ktime_t. The kernel-internal
+representation of ktime_t values and operations is implemented via
+macros and inline functions, and can be switched between a "hybrid
+union" type and a plain "scalar" 64bit nanoseconds representation (at
+compile time). The hybrid union type optimizes time conversions on 32bit
+CPUs. This build-time-selectable ktime_t storage format was implemented
+to avoid the performance impact of 64-bit multiplications and divisions
+on 32bit CPUs. Such operations are frequently necessary to convert
+between the storage formats provided by kernel and userspace interfaces
+and the internal time format. (See include/linux/ktime.h for further
+details.)
+
+hrtimers - rounding of timer values
+-----------------------------------
+
+the hrtimer code will round timer events to lower-resolution clocks
+because it has to. Otherwise it will do no artificial rounding at all.
+
+one question is, what resolution value should be returned to the user by
+the clock_getres() interface. This will return whatever real resolution
+a given clock has - be it low-res, high-res, or artificially-low-res.
+
+hrtimers - testing and verification
+----------------------------------
+
+We used the high-resolution clock subsystem ontop of hrtimers to verify
+the hrtimer implementation details in praxis, and we also ran the posix
+timer tests in order to ensure specification compliance. We also ran
+tests on low-resolution clocks.
+
+The hrtimer patch converts the following kernel functionality to use
+hrtimers:
+
+ - nanosleep
+ - itimers
+ - posix-timers
+
+The conversion of nanosleep and posix-timers enabled the unification of
+nanosleep and clock_nanosleep.
+
+The code was successfully compiled for the following platforms:
+
+ i386, x86_64, ARM, PPC, PPC64, IA64
+
+The code was run-tested on the following platforms:
+
+ i386(UP/SMP), x86_64(UP/SMP), ARM, PPC
+
+hrtimers were also integrated into the -rt tree, along with a
+hrtimers-based high-resolution clock implementation, so the hrtimers
+code got a healthy amount of testing and use in practice.
+
+ Thomas Gleixner, Ingo Molnar
diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt
new file mode 100644
index 0000000..20d368c
--- /dev/null
+++ b/Documentation/timers/timer_stats.txt
@@ -0,0 +1,73 @@
+timer_stats - timer usage statistics
+------------------------------------
+
+timer_stats is a debugging facility to make the timer (ab)usage in a Linux
+system visible to kernel and userspace developers. If enabled in the config
+but not used it has almost zero runtime overhead, and a relatively small
+data structure overhead. Even if collection is enabled runtime all the
+locking is per-CPU and lookup is hashed.
+
+timer_stats should be used by kernel and userspace developers to verify that
+their code does not make unduly use of timers. This helps to avoid unnecessary
+wakeups, which should be avoided to optimize power consumption.
+
+It can be enabled by CONFIG_TIMER_STATS in the "Kernel hacking" configuration
+section.
+
+timer_stats collects information about the timer events which are fired in a
+Linux system over a sample period:
+
+- the pid of the task(process) which initialized the timer
+- the name of the process which initialized the timer
+- the function where the timer was intialized
+- the callback function which is associated to the timer
+- the number of events (callbacks)
+
+timer_stats adds an entry to /proc: /proc/timer_stats
+
+This entry is used to control the statistics functionality and to read out the
+sampled information.
+
+The timer_stats functionality is inactive on bootup.
+
+To activate a sample period issue:
+# echo 1 >/proc/timer_stats
+
+To stop a sample period issue:
+# echo 0 >/proc/timer_stats
+
+The statistics can be retrieved by:
+# cat /proc/timer_stats
+
+The readout of /proc/timer_stats automatically disables sampling. The sampled
+information is kept until a new sample period is started. This allows multiple
+readouts.
+
+Sample output of /proc/timer_stats:
+
+Timerstats sample period: 3.888770 s
+ 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
+ 15, 1 swapper hcd_submit_urb (rh_timer_func)
+ 4, 959 kedac schedule_timeout (process_timeout)
+ 1, 0 swapper page_writeback_init (wb_timer_fn)
+ 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
+ 22, 2948 IRQ 4 tty_flip_buffer_push (delayed_work_timer_fn)
+ 3, 3100 bash schedule_timeout (process_timeout)
+ 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
+ 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
+ 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
+ 1, 2292 ip __netdev_watchdog_up (dev_watchdog)
+ 1, 23 events/1 do_cache_clean (delayed_work_timer_fn)
+90 total events, 30.0 events/sec
+
+The first column is the number of events, the second column the pid, the third
+column is the name of the process. The forth column shows the function which
+initialized the timer and in parantheses the callback function which was
+executed on expiry.
+
+ Thomas, Ingo
+
+Added flag to indicate 'deferrable timer' in /proc/timer_stats. A deferrable
+timer will appear as follows
+ 10D, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
+
diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt
new file mode 100644
index 0000000..5d354e1
--- /dev/null
+++ b/Documentation/tracepoints.txt
@@ -0,0 +1,101 @@
+ Using the Linux Kernel Tracepoints
+
+ Mathieu Desnoyers
+
+
+This document introduces Linux Kernel Tracepoints and their use. It provides
+examples of how to insert tracepoints in the kernel and connect probe functions
+to them and provides some examples of probe functions.
+
+
+* Purpose of tracepoints
+
+A tracepoint placed in code provides a hook to call a function (probe) that you
+can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or
+"off" (no probe is attached). When a tracepoint is "off" it has no effect,
+except for adding a tiny time penalty (checking a condition for a branch) and
+space penalty (adding a few bytes for the function call at the end of the
+instrumented function and adds a data structure in a separate section). When a
+tracepoint is "on", the function you provide is called each time the tracepoint
+is executed, in the execution context of the caller. When the function provided
+ends its execution, it returns to the caller (continuing from the tracepoint
+site).
+
+You can put tracepoints at important locations in the code. They are
+lightweight hooks that can pass an arbitrary number of parameters,
+which prototypes are described in a tracepoint declaration placed in a header
+file.
+
+They can be used for tracing and performance accounting.
+
+
+* Usage
+
+Two elements are required for tracepoints :
+
+- A tracepoint definition, placed in a header file.
+- The tracepoint statement, in C code.
+
+In order to use tracepoints, you should include linux/tracepoint.h.
+
+In include/trace/subsys.h :
+
+#include <linux/tracepoint.h>
+
+DEFINE_TRACE(subsys_eventname,
+ TPPTOTO(int firstarg, struct task_struct *p),
+ TPARGS(firstarg, p));
+
+In subsys/file.c (where the tracing statement must be added) :
+
+#include <trace/subsys.h>
+
+void somefct(void)
+{
+ ...
+ trace_subsys_eventname(arg, task);
+ ...
+}
+
+Where :
+- subsys_eventname is an identifier unique to your event
+ - subsys is the name of your subsystem.
+ - eventname is the name of the event to trace.
+- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function
+ called by this tracepoint.
+- TPARGS(firstarg, p) are the parameters names, same as found in the prototype.
+
+Connecting a function (probe) to a tracepoint is done by providing a probe
+(function to call) for the specific tracepoint through
+register_trace_subsys_eventname(). Removing a probe is done through
+unregister_trace_subsys_eventname(); it will remove the probe sure there is no
+caller left using the probe when it returns. Probe removal is preempt-safe
+because preemption is disabled around the probe call. See the "Probe example"
+section below for a sample probe module.
+
+The tracepoint mechanism supports inserting multiple instances of the same
+tracepoint, but a single definition must be made of a given tracepoint name over
+all the kernel to make sure no type conflict will occur. Name mangling of the
+tracepoints is done using the prototypes to make sure typing is correct.
+Verification of probe type correctness is done at the registration site by the
+compiler. Tracepoints can be put in inline functions, inlined static functions,
+and unrolled loops as well as regular functions.
+
+The naming scheme "subsys_event" is suggested here as a convention intended
+to limit collisions. Tracepoint names are global to the kernel: they are
+considered as being the same whether they are in the core kernel image or in
+modules.
+
+
+* Probe / tracepoint example
+
+See the example provided in samples/tracepoints/src
+
+Compile them with your kernel.
+
+Run, as root :
+modprobe tracepoint-example (insmod order is not important)
+modprobe tracepoint-probe-example
+cat /proc/tracepoint-example (returns an expected error)
+rmmod tracepoint-example tracepoint-probe-example
+dmesg
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt
new file mode 100644
index 0000000..cde23b4
--- /dev/null
+++ b/Documentation/tracers/mmiotrace.txt
@@ -0,0 +1,165 @@
+ In-kernel memory-mapped I/O tracing
+
+
+Home page and links to optional user space tools:
+
+ http://nouveau.freedesktop.org/wiki/MmioTrace
+
+MMIO tracing was originally developed by Intel around 2003 for their Fault
+Injection Test Harness. In Dec 2006 - Jan 2007, using the code from Intel,
+Jeff Muizelaar created a tool for tracing MMIO accesses with the Nouveau
+project in mind. Since then many people have contributed.
+
+Mmiotrace was built for reverse engineering any memory-mapped IO device with
+the Nouveau project as the first real user. Only x86 and x86_64 architectures
+are supported.
+
+Out-of-tree mmiotrace was originally modified for mainline inclusion and
+ftrace framework by Pekka Paalanen <pq@iki.fi>.
+
+
+Preparation
+-----------
+
+Mmiotrace feature is compiled in by the CONFIG_MMIOTRACE option. Tracing is
+disabled by default, so it is safe to have this set to yes. SMP systems are
+supported, but tracing is unreliable and may miss events if more than one CPU
+is on-line, therefore mmiotrace takes all but one CPU off-line during run-time
+activation. You can re-enable CPUs by hand, but you have been warned, there
+is no way to automatically detect if you are losing events due to CPUs racing.
+
+
+Usage Quick Reference
+---------------------
+
+$ mount -t debugfs debugfs /debug
+$ echo mmiotrace > /debug/tracing/current_tracer
+$ cat /debug/tracing/trace_pipe > mydump.txt &
+Start X or whatever.
+$ echo "X is up" > /debug/tracing/trace_marker
+$ echo nop > /debug/tracing/current_tracer
+Check for lost events.
+
+
+Usage
+-----
+
+Make sure debugfs is mounted to /debug. If not, (requires root privileges)
+$ mount -t debugfs debugfs /debug
+
+Check that the driver you are about to trace is not loaded.
+
+Activate mmiotrace (requires root privileges):
+$ echo mmiotrace > /debug/tracing/current_tracer
+
+Start storing the trace:
+$ cat /debug/tracing/trace_pipe > mydump.txt &
+The 'cat' process should stay running (sleeping) in the background.
+
+Load the driver you want to trace and use it. Mmiotrace will only catch MMIO
+accesses to areas that are ioremapped while mmiotrace is active.
+
+During tracing you can place comments (markers) into the trace by
+$ echo "X is up" > /debug/tracing/trace_marker
+This makes it easier to see which part of the (huge) trace corresponds to
+which action. It is recommended to place descriptive markers about what you
+do.
+
+Shut down mmiotrace (requires root privileges):
+$ echo nop > /debug/tracing/current_tracer
+The 'cat' process exits. If it does not, kill it by issuing 'fg' command and
+pressing ctrl+c.
+
+Check that mmiotrace did not lose events due to a buffer filling up. Either
+$ grep -i lost mydump.txt
+which tells you exactly how many events were lost, or use
+$ dmesg
+to view your kernel log and look for "mmiotrace has lost events" warning. If
+events were lost, the trace is incomplete. You should enlarge the buffers and
+try again. Buffers are enlarged by first seeing how large the current buffers
+are:
+$ cat /debug/tracing/trace_entries
+gives you a number. Approximately double this number and write it back, for
+instance:
+$ echo 0 > /debug/tracing/tracing_enabled
+$ echo 128000 > /debug/tracing/trace_entries
+$ echo 1 > /debug/tracing/tracing_enabled
+Then start again from the top.
+
+If you are doing a trace for a driver project, e.g. Nouveau, you should also
+do the following before sending your results:
+$ lspci -vvv > lspci.txt
+$ dmesg > dmesg.txt
+$ tar zcf pciid-nick-mmiotrace.tar.gz mydump.txt lspci.txt dmesg.txt
+and then send the .tar.gz file. The trace compresses considerably. Replace
+"pciid" and "nick" with the PCI ID or model name of your piece of hardware
+under investigation and your nick name.
+
+
+How Mmiotrace Works
+-------------------
+
+Access to hardware IO-memory is gained by mapping addresses from PCI bus by
+calling one of the ioremap_*() functions. Mmiotrace is hooked into the
+__ioremap() function and gets called whenever a mapping is created. Mapping is
+an event that is recorded into the trace log. Note, that ISA range mappings
+are not caught, since the mapping always exists and is returned directly.
+
+MMIO accesses are recorded via page faults. Just before __ioremap() returns,
+the mapped pages are marked as not present. Any access to the pages causes a
+fault. The page fault handler calls mmiotrace to handle the fault. Mmiotrace
+marks the page present, sets TF flag to achieve single stepping and exits the
+fault handler. The instruction that faulted is executed and debug trap is
+entered. Here mmiotrace again marks the page as not present. The instruction
+is decoded to get the type of operation (read/write), data width and the value
+read or written. These are stored to the trace log.
+
+Setting the page present in the page fault handler has a race condition on SMP
+machines. During the single stepping other CPUs may run freely on that page
+and events can be missed without a notice. Re-enabling other CPUs during
+tracing is discouraged.
+
+
+Trace Log Format
+----------------
+
+The raw log is text and easily filtered with e.g. grep and awk. One record is
+one line in the log. A record starts with a keyword, followed by keyword
+dependant arguments. Arguments are separated by a space, or continue until the
+end of line. The format for version 20070824 is as follows:
+
+Explanation Keyword Space separated arguments
+---------------------------------------------------------------------------
+
+read event R width, timestamp, map id, physical, value, PC, PID
+write event W width, timestamp, map id, physical, value, PC, PID
+ioremap event MAP timestamp, map id, physical, virtual, length, PC, PID
+iounmap event UNMAP timestamp, map id, PC, PID
+marker MARK timestamp, text
+version VERSION the string "20070824"
+info for reader LSPCI one line from lspci -v
+PCI address map PCIDEV space separated /proc/bus/pci/devices data
+unk. opcode UNKNOWN timestamp, map id, physical, data, PC, PID
+
+Timestamp is in seconds with decimals. Physical is a PCI bus address, virtual
+is a kernel virtual address. Width is the data width in bytes and value is the
+data value. Map id is an arbitrary id number identifying the mapping that was
+used in an operation. PC is the program counter and PID is process id. PC is
+zero if it is not recorded. PID is always zero as tracing MMIO accesses
+originating in user space memory is not yet supported.
+
+For instance, the following awk filter will pass all 32-bit writes that target
+physical addresses in the range [0xfb73ce40, 0xfb800000[
+
+$ awk '/W 4 / { adr=strtonum($5); if (adr >= 0xfb73ce40 &&
+adr < 0xfb800000) print; }'
+
+
+Tools for Developers
+--------------------
+
+The user space tools include utilities for:
+- replacing numeric addresses and values with hardware register names
+- replaying MMIO logs, i.e., re-executing the recorded writes
+
+
diff --git a/Documentation/uml/UserModeLinux-HOWTO.txt b/Documentation/uml/UserModeLinux-HOWTO.txt
new file mode 100644
index 0000000..628013f
--- /dev/null
+++ b/Documentation/uml/UserModeLinux-HOWTO.txt
@@ -0,0 +1,4636 @@
+ User Mode Linux HOWTO
+ User Mode Linux Core Team
+ Mon Nov 18 14:16:16 EST 2002
+
+ This document describes the use and abuse of Jeff Dike's User Mode
+ Linux: a port of the Linux kernel as a normal Intel Linux process.
+ ______________________________________________________________________
+
+ Table of Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1. Introduction
+
+ 1.1 How is User Mode Linux Different?
+ 1.2 Why Would I Want User Mode Linux?
+
+ 2. Compiling the kernel and modules
+
+ 2.1 Compiling the kernel
+ 2.2 Compiling and installing kernel modules
+ 2.3 Compiling and installing uml_utilities
+
+ 3. Running UML and logging in
+
+ 3.1 Running UML
+ 3.2 Logging in
+ 3.3 Examples
+
+ 4. UML on 2G/2G hosts
+
+ 4.1 Introduction
+ 4.2 The problem
+ 4.3 The solution
+
+ 5. Setting up serial lines and consoles
+
+ 5.1 Specifying the device
+ 5.2 Specifying the channel
+ 5.3 Examples
+
+ 6. Setting up the network
+
+ 6.1 General setup
+ 6.2 Userspace daemons
+ 6.3 Specifying ethernet addresses
+ 6.4 UML interface setup
+ 6.5 Multicast
+ 6.6 TUN/TAP with the uml_net helper
+ 6.7 TUN/TAP with a preconfigured tap device
+ 6.8 Ethertap
+ 6.9 The switch daemon
+ 6.10 Slip
+ 6.11 Slirp
+ 6.12 pcap
+ 6.13 Setting up the host yourself
+
+ 7. Sharing Filesystems between Virtual Machines
+
+ 7.1 A warning
+ 7.2 Using layered block devices
+ 7.3 Note!
+ 7.4 Another warning
+ 7.5 uml_moo : Merging a COW file with its backing file
+
+ 8. Creating filesystems
+
+ 8.1 Create the filesystem file
+ 8.2 Assign the file to a UML device
+ 8.3 Creating and mounting the filesystem
+
+ 9. Host file access
+
+ 9.1 Using hostfs
+ 9.2 hostfs as the root filesystem
+ 9.3 Building hostfs
+
+ 10. The Management Console
+ 10.1 version
+ 10.2 halt and reboot
+ 10.3 config
+ 10.4 remove
+ 10.5 sysrq
+ 10.6 help
+ 10.7 cad
+ 10.8 stop
+ 10.9 go
+
+ 11. Kernel debugging
+
+ 11.1 Starting the kernel under gdb
+ 11.2 Examining sleeping processes
+ 11.3 Running ddd on UML
+ 11.4 Debugging modules
+ 11.5 Attaching gdb to the kernel
+ 11.6 Using alternate debuggers
+
+ 12. Kernel debugging examples
+
+ 12.1 The case of the hung fsck
+ 12.2 Episode 2: The case of the hung fsck
+
+ 13. What to do when UML doesn't work
+
+ 13.1 Strange compilation errors when you build from source
+ 13.2 (obsolete)
+ 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem
+ 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid'
+ 13.5 UML doesn't work when /tmp is an NFS filesystem
+ 13.6 UML hangs on boot when compiled with gprof support
+ 13.7 syslogd dies with a SIGTERM on startup
+ 13.8 TUN/TAP networking doesn't work on a 2.4 host
+ 13.9 You can network to the host but not to other machines on the net
+ 13.10 I have no root and I want to scream
+ 13.11 UML build conflict between ptrace.h and ucontext.h
+ 13.12 The UML BogoMips is exactly half the host's BogoMips
+ 13.13 When you run UML, it immediately segfaults
+ 13.14 xterms appear, then immediately disappear
+ 13.15 Any other panic, hang, or strange behavior
+
+ 14. Diagnosing Problems
+
+ 14.1 Case 1 : Normal kernel panics
+ 14.2 Case 2 : Tracing thread panics
+ 14.3 Case 3 : Tracing thread panics caused by other threads
+ 14.4 Case 4 : Hangs
+
+ 15. Thanks
+
+ 15.1 Code and Documentation
+ 15.2 Flushing out bugs
+ 15.3 Buglets and clean-ups
+ 15.4 Case Studies
+ 15.5 Other contributions
+
+
+ ______________________________________________________________________
+
+ 11.. IInnttrroodduuccttiioonn
+
+ Welcome to User Mode Linux. It's going to be fun.
+
+
+
+ 11..11.. HHooww iiss UUsseerr MMooddee LLiinnuuxx DDiiffffeerreenntt??
+
+ Normally, the Linux Kernel talks straight to your hardware (video
+ card, keyboard, hard drives, etc), and any programs which run ask the
+ kernel to operate the hardware, like so:
+
+
+
+ +-----------+-----------+----+
+ | Process 1 | Process 2 | ...|
+ +-----------+-----------+----+
+ | Linux Kernel |
+ +----------------------------+
+ | Hardware |
+ +----------------------------+
+
+
+
+
+ The User Mode Linux Kernel is different; instead of talking to the
+ hardware, it talks to a `real' Linux kernel (called the `host kernel'
+ from now on), like any other program. Programs can then run inside
+ User-Mode Linux as if they were running under a normal kernel, like
+ so:
+
+
+
+ +----------------+
+ | Process 2 | ...|
+ +-----------+----------------+
+ | Process 1 | User-Mode Linux|
+ +----------------------------+
+ | Linux Kernel |
+ +----------------------------+
+ | Hardware |
+ +----------------------------+
+
+
+
+
+
+ 11..22.. WWhhyy WWoouulldd II WWaanntt UUsseerr MMooddee LLiinnuuxx??
+
+
+ 1. If User Mode Linux crashes, your host kernel is still fine.
+
+ 2. You can run a usermode kernel as a non-root user.
+
+ 3. You can debug the User Mode Linux like any normal process.
+
+ 4. You can run gprof (profiling) and gcov (coverage testing).
+
+ 5. You can play with your kernel without breaking things.
+
+ 6. You can use it as a sandbox for testing new apps.
+
+ 7. You can try new development kernels safely.
+
+ 8. You can run different distributions simultaneously.
+
+ 9. It's extremely fun.
+
+
+
+
+
+ 22.. CCoommppiilliinngg tthhee kkeerrnneell aanndd mmoodduulleess
+
+
+
+
+ 22..11.. CCoommppiilliinngg tthhee kkeerrnneell
+
+
+ Compiling the user mode kernel is just like compiling any other
+ kernel. Let's go through the steps, using 2.4.0-prerelease (current
+ as of this writing) as an example:
+
+
+ 1. Download the latest UML patch from
+
+ the download page <http://user-mode-linux.sourceforge.net/dl-
+ sf.html>
+
+ In this example, the file is uml-patch-2.4.0-prerelease.bz2.
+
+
+ 2. Download the matching kernel from your favourite kernel mirror,
+ such as:
+
+ ftp://ftp.ca.kernel.org/pub/kernel/v2.4/linux-2.4.0-prerelease.tar.bz2
+ <ftp://ftp.ca.kernel.org/pub/kernel/v2.4/linux-2.4.0-prerelease.tar.bz2>
+ .
+
+
+ 3. Make a directory and unpack the kernel into it.
+
+
+
+ host%
+ mkdir ~/uml
+
+
+
+
+
+
+ host%
+ cd ~/uml
+
+
+
+
+
+
+ host%
+ tar -xzvf linux-2.4.0-prerelease.tar.bz2
+
+
+
+
+
+
+ 4. Apply the patch using
+
+
+
+ host%
+ cd ~/uml/linux
+
+
+
+ host%
+ bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1
+
+
+
+
+
+
+ 5. Run your favorite config; `make xconfig ARCH=um' is the most
+ convenient. `make config ARCH=um' and 'make menuconfig ARCH=um'
+ will work as well. The defaults will give you a useful kernel. If
+ you want to change something, go ahead, it probably won't hurt
+ anything.
+
+
+ Note: If the host is configured with a 2G/2G address space split
+ rather than the usual 3G/1G split, then the packaged UML binaries
+ will not run. They will immediately segfault. See ``UML on 2G/2G
+ hosts'' for the scoop on running UML on your system.
+
+
+
+ 6. Finish with `make linux ARCH=um': the result is a file called
+ `linux' in the top directory of your source tree.
+
+ Make sure that you don't build this kernel in /usr/src/linux. On some
+ distributions, /usr/include/asm is a link into this pool. The user-
+ mode build changes the other end of that link, and things that include
+ <asm/anything.h> stop compiling.
+
+ The sources are also available from cvs at the project's cvs page,
+ which has directions on getting the sources. You can also browse the
+ CVS pool from there.
+
+ If you get the CVS sources, you will have to check them out into an
+ empty directory. You will then have to copy each file into the
+ corresponding directory in the appropriate kernel pool.
+
+ If you don't have the latest kernel pool, you can get the
+ corresponding user-mode sources with
+
+
+ host% cvs co -r v_2_3_x linux
+
+
+
+
+ where 'x' is the version in your pool. Note that you will not get the
+ bug fixes and enhancements that have gone into subsequent releases.
+
+
+ 22..22.. CCoommppiilliinngg aanndd iinnssttaalllliinngg kkeerrnneell mmoodduulleess
+
+ UML modules are built in the same way as the native kernel (with the
+ exception of the 'ARCH=um' that you always need for UML):
+
+
+ host% make modules ARCH=um
+
+
+
+
+ Any modules that you want to load into this kernel need to be built in
+ the user-mode pool. Modules from the native kernel won't work.
+
+ You can install them by using ftp or something to copy them into the
+ virtual machine and dropping them into /lib/modules/`uname -r`.
+
+ You can also get the kernel build process to install them as follows:
+
+ 1. with the kernel not booted, mount the root filesystem in the top
+ level of the kernel pool:
+
+
+ host% mount root_fs mnt -o loop
+
+
+
+
+
+
+ 2. run
+
+
+ host%
+ make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um
+
+
+
+
+
+
+ 3. unmount the filesystem
+
+
+ host% umount mnt
+
+
+
+
+
+
+ 4. boot the kernel on it
+
+
+ When the system is booted, you can use insmod as usual to get the
+ modules into the kernel. A number of things have been loaded into UML
+ as modules, especially filesystems and network protocols and filters,
+ so most symbols which need to be exported probably already are.
+ However, if you do find symbols that need exporting, let us
+ <http://user-mode-linux.sourceforge.net/contacts.html> know, and
+ they'll be "taken care of".
+
+
+
+ 22..33.. CCoommppiilliinngg aanndd iinnssttaalllliinngg uummll__uuttiilliittiieess
+
+ Many features of the UML kernel require a user-space helper program,
+ so a uml_utilities package is distributed separately from the kernel
+ patch which provides these helpers. Included within this is:
+
+ +o port-helper - Used by consoles which connect to xterms or ports
+
+ +o tunctl - Configuration tool to create and delete tap devices
+
+ +o uml_net - Setuid binary for automatic tap device configuration
+
+ +o uml_switch - User-space virtual switch required for daemon
+ transport
+
+ The uml_utilities tree is compiled with:
+
+
+ host#
+ make && make install
+
+
+
+
+ Note that UML kernel patches may require a specific version of the
+ uml_utilities distribution. If you don't keep up with the mailing
+ lists, ensure that you have the latest release of uml_utilities if you
+ are experiencing problems with your UML kernel, particularly when
+ dealing with consoles or command-line switches to the helper programs
+
+
+
+
+
+
+
+
+ 33.. RRuunnnniinngg UUMMLL aanndd llooggggiinngg iinn
+
+
+
+ 33..11.. RRuunnnniinngg UUMMLL
+
+ It runs on 2.2.15 or later, and all 2.4 kernels.
+
+
+ Booting UML is straightforward. Simply run 'linux': it will try to
+ mount the file `root_fs' in the current directory. You do not need to
+ run it as root. If your root filesystem is not named `root_fs', then
+ you need to put a `ubd0=root_fs_whatever' switch on the linux command
+ line.
+
+
+ You will need a filesystem to boot UML from. There are a number
+ available for download from here <http://user-mode-
+ linux.sourceforge.net/dl-sf.html> . There are also several tools
+ <http://user-mode-linux.sourceforge.net/fs_making.html> which can be
+ used to generate UML-compatible filesystem images from media.
+ The kernel will boot up and present you with a login prompt.
+
+
+ Note: If the host is configured with a 2G/2G address space split
+ rather than the usual 3G/1G split, then the packaged UML binaries will
+ not run. They will immediately segfault. See ``UML on 2G/2G hosts''
+ for the scoop on running UML on your system.
+
+
+
+ 33..22.. LLooggggiinngg iinn
+
+
+
+ The prepackaged filesystems have a root account with password 'root'
+ and a user account with password 'user'. The login banner will
+ generally tell you how to log in. So, you log in and you will find
+ yourself inside a little virtual machine. Our filesystems have a
+ variety of commands and utilities installed (and it is fairly easy to
+ add more), so you will have a lot of tools with which to poke around
+ the system.
+
+ There are a couple of other ways to log in:
+
+ +o On a virtual console
+
+
+
+ Each virtual console that is configured (i.e. the device exists in
+ /dev and /etc/inittab runs a getty on it) will come up in its own
+ xterm. If you get tired of the xterms, read ``Setting up serial
+ lines and consoles'' to see how to attach the consoles to
+ something else, like host ptys.
+
+
+
+ +o Over the serial line
+
+
+ In the boot output, find a line that looks like:
+
+
+
+ serial line 0 assigned pty /dev/ptyp1
+
+
+
+
+ Attach your favorite terminal program to the corresponding tty. I.e.
+ for minicom, the command would be
+
+
+ host% minicom -o -p /dev/ttyp1
+
+
+
+
+
+
+ +o Over the net
+
+
+ If the network is running, then you can telnet to the virtual
+ machine and log in to it. See ``Setting up the network'' to learn
+ about setting up a virtual network.
+
+ When you're done using it, run halt, and the kernel will bring itself
+ down and the process will exit.
+
+
+ 33..33.. EExxaammpplleess
+
+ Here are some examples of UML in action:
+
+ +o A login session <http://user-mode-linux.sourceforge.net/login.html>
+
+ +o A virtual network <http://user-mode-linux.sourceforge.net/net.html>
+
+
+
+
+
+
+
+ 44.. UUMMLL oonn 22GG//22GG hhoossttss
+
+
+
+
+ 44..11.. IInnttrroodduuccttiioonn
+
+
+ Most Linux machines are configured so that the kernel occupies the
+ upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and
+ processes use the lower 3G (0x00000000 - 0xbfffffff). However, some
+ machine are configured with a 2G/2G split, with the kernel occupying
+ the upper 2G (0x80000000 - 0xffffffff) and processes using the lower
+ 2G (0x00000000 - 0x7fffffff).
+
+
+
+
+ 44..22.. TThhee pprroobblleemm
+
+
+ The prebuilt UML binaries on this site will not run on 2G/2G hosts
+ because UML occupies the upper .5G of the 3G process address space
+ (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right
+ in the middle of the kernel address space, so UML won't even load - it
+ will immediately segfault.
+
+
+
+
+ 44..33.. TThhee ssoolluuttiioonn
+
+
+ The fix for this is to rebuild UML from source after enabling
+ CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to
+ load itself in the top .5G of that smaller process address space,
+ where it will run fine. See ``Compiling the kernel and modules'' if
+ you need help building UML from source.
+
+
+
+
+
+
+
+
+
+
+ 55.. SSeettttiinngg uupp sseerriiaall lliinneess aanndd ccoonnssoolleess
+
+
+ It is possible to attach UML serial lines and consoles to many types
+ of host I/O channels by specifying them on the command line.
+
+
+ You can attach them to host ptys, ttys, file descriptors, and ports.
+ This allows you to do things like
+
+ +o have a UML console appear on an unused host console,
+
+ +o hook two virtual machines together by having one attach to a pty
+ and having the other attach to the corresponding tty
+
+ +o make a virtual machine accessible from the net by attaching a
+ console to a port on the host.
+
+
+ The general format of the command line option is device=channel.
+
+
+
+ 55..11.. SSppeecciiffyyiinngg tthhee ddeevviiccee
+
+ Devices are specified with "con" or "ssl" (console or serial line,
+ respectively), optionally with a device number if you are talking
+ about a specific device.
+
+
+ Using just "con" or "ssl" describes all of the consoles or serial
+ lines. If you want to talk about console #3 or serial line #10, they
+ would be "con3" and "ssl10", respectively.
+
+
+ A specific device name will override a less general "con=" or "ssl=".
+ So, for example, you can assign a pty to each of the serial lines
+ except for the first two like this:
+
+
+ ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1
+
+
+
+
+ The specificity of the device name is all that matters; order on the
+ command line is irrelevant.
+
+
+
+ 55..22.. SSppeecciiffyyiinngg tthhee cchhaannnneell
+
+ There are a number of different types of channels to attach a UML
+ device to, each with a different way of specifying exactly what to
+ attach to.
+
+ +o pseudo-terminals - device=pty pts terminals - device=pts
+
+
+ This will cause UML to allocate a free host pseudo-terminal for the
+ device. The terminal that it got will be announced in the boot
+ log. You access it by attaching a terminal program to the
+ corresponding tty:
+
+ +o screen /dev/pts/n
+
+ +o screen /dev/ttyxx
+
+ +o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts
+ devices
+
+ +o kermit - start it up, 'open' the device, then 'connect'
+
+
+
+
+
+ +o terminals - device=tty:tty device file
+
+
+ This will make UML attach the device to the specified tty (i.e
+
+
+ con1=tty:/dev/tty3
+
+
+
+
+ will attach UML's console 1 to the host's /dev/tty3). If the tty that
+ you specify is the slave end of a tty/pty pair, something else must
+ have already opened the corresponding pty in order for this to work.
+
+
+
+
+
+ +o xterms - device=xterm
+
+
+ UML will run an xterm and the device will be attached to it.
+
+
+
+
+
+ +o Port - device=port:port number
+
+
+ This will attach the UML devices to the specified host port.
+ Attaching console 1 to the host's port 9000 would be done like
+ this:
+
+
+ con1=port:9000
+
+
+
+
+ Attaching all the serial lines to that port would be done similarly:
+
+
+ ssl=port:9000
+
+
+
+
+ You access these devices by telnetting to that port. Each active tel-
+ net session gets a different device. If there are more telnets to a
+ port than UML devices attached to it, then the extra telnet sessions
+ will block until an existing telnet detaches, or until another device
+ becomes active (i.e. by being activated in /etc/inittab).
+
+ This channel has the advantage that you can both attach multiple UML
+ devices to it and know how to access them without reading the UML boot
+ log. It is also unique in allowing access to a UML from remote
+ machines without requiring that the UML be networked. This could be
+ useful in allowing public access to UMLs because they would be
+ accessible from the net, but wouldn't need any kind of network
+ filtering or access control because they would have no network access.
+
+
+ If you attach the main console to a portal, then the UML boot will
+ appear to hang. In reality, it's waiting for a telnet to connect, at
+ which point the boot will proceed.
+
+
+
+
+
+ +o already-existing file descriptors - device=file descriptor
+
+
+ If you set up a file descriptor on the UML command line, you can
+ attach a UML device to it. This is most commonly used to put the
+ main console back on stdin and stdout after assigning all the other
+ consoles to something else:
+
+
+ con0=fd:0,fd:1 con=pts
+
+
+
+
+
+
+
+
+ +o Nothing - device=null
+
+
+ This allows the device to be opened, in contrast to 'none', but
+ reads will block, and writes will succeed and the data will be
+ thrown out.
+
+
+
+
+
+ +o None - device=none
+
+
+ This causes the device to disappear.
+
+
+
+ You can also specify different input and output channels for a device
+ by putting a comma between them:
+
+
+ ssl3=tty:/dev/tty2,xterm
+
+
+
+
+ will cause serial line 3 to accept input on the host's /dev/tty3 and
+ display output on an xterm. That's a silly example - the most common
+ use of this syntax is to reattach the main console to stdin and stdout
+ as shown above.
+
+
+ If you decide to move the main console away from stdin/stdout, the
+ initial boot output will appear in the terminal that you're running
+ UML in. However, once the console driver has been officially
+ initialized, then the boot output will start appearing wherever you
+ specified that console 0 should be. That device will receive all
+ subsequent output.
+
+
+
+ 55..33.. EExxaammpplleess
+
+ There are a number of interesting things you can do with this
+ capability.
+
+
+ First, this is how you get rid of those bleeding console xterms by
+ attaching them to host ptys:
+
+
+ con=pty con0=fd:0,fd:1
+
+
+
+
+ This will make a UML console take over an unused host virtual console,
+ so that when you switch to it, you will see the UML login prompt
+ rather than the host login prompt:
+
+
+ con1=tty:/dev/tty6
+
+
+
+
+ You can attach two virtual machines together with what amounts to a
+ serial line as follows:
+
+ Run one UML with a serial line attached to a pty -
+
+
+ ssl1=pty
+
+
+
+
+ Look at the boot log to see what pty it got (this example will assume
+ that it got /dev/ptyp1).
+
+ Boot the other UML with a serial line attached to the corresponding
+ tty -
+
+
+ ssl1=tty:/dev/ttyp1
+
+
+
+
+ Log in, make sure that it has no getty on that serial line, attach a
+ terminal program like minicom to it, and you should see the login
+ prompt of the other virtual machine.
+
+
+ 66.. SSeettttiinngg uupp tthhee nneettwwoorrkk
+
+
+
+ This page describes how to set up the various transports and to
+ provide a UML instance with network access to the host, other machines
+ on the local net, and the rest of the net.
+
+
+ As of 2.4.5, UML networking has been completely redone to make it much
+ easier to set up, fix bugs, and add new features.
+
+
+ There is a new helper, uml_net, which does the host setup that
+ requires root privileges.
+
+
+ There are currently five transport types available for a UML virtual
+ machine to exchange packets with other hosts:
+
+ +o ethertap
+
+ +o TUN/TAP
+
+ +o Multicast
+
+ +o a switch daemon
+
+ +o slip
+
+ +o slirp
+
+ +o pcap
+
+ The TUN/TAP, ethertap, slip, and slirp transports allow a UML
+ instance to exchange packets with the host. They may be directed
+ to the host or the host may just act as a router to provide access
+ to other physical or virtual machines.
+
+
+ The pcap transport is a synthetic read-only interface, using the
+ libpcap binary to collect packets from interfaces on the host and
+ filter them. This is useful for building preconfigured traffic
+ monitors or sniffers.
+
+
+ The daemon and multicast transports provide a completely virtual
+ network to other virtual machines. This network is completely
+ disconnected from the physical network unless one of the virtual
+ machines on it is acting as a gateway.
+
+
+ With so many host transports, which one should you use? Here's when
+ you should use each one:
+
+ +o ethertap - if you want access to the host networking and it is
+ running 2.2
+
+ +o TUN/TAP - if you want access to the host networking and it is
+ running 2.4. Also, the TUN/TAP transport is able to use a
+ preconfigured device, allowing it to avoid using the setuid uml_net
+ helper, which is a security advantage.
+
+ +o Multicast - if you want a purely virtual network and you don't want
+ to set up anything but the UML
+
+ +o a switch daemon - if you want a purely virtual network and you
+ don't mind running the daemon in order to get somewhat better
+ performance
+
+ +o slip - there is no particular reason to run the slip backend unless
+ ethertap and TUN/TAP are just not available for some reason
+
+ +o slirp - if you don't have root access on the host to setup
+ networking, or if you don't want to allocate an IP to your UML
+
+ +o pcap - not much use for actual network connectivity, but great for
+ monitoring traffic on the host
+
+ Ethertap is available on 2.4 and works fine. TUN/TAP is preferred
+ to it because it has better performance and ethertap is officially
+ considered obsolete in 2.4. Also, the root helper only needs to
+ run occasionally for TUN/TAP, rather than handling every packet, as
+ it does with ethertap. This is a slight security advantage since
+ it provides fewer opportunities for a nasty UML user to somehow
+ exploit the helper's root privileges.
+
+
+ 66..11.. GGeenneerraall sseettuupp
+
+ First, you must have the virtual network enabled in your UML. If are
+ running a prebuilt kernel from this site, everything is already
+ enabled. If you build the kernel yourself, under the "Network device
+ support" menu, enable "Network device support", and then the three
+ transports.
+
+
+ The next step is to provide a network device to the virtual machine.
+ This is done by describing it on the kernel command line.
+
+ The general format is
+
+
+ eth <n> = <transport> , <transport args>
+
+
+
+
+ For example, a virtual ethernet device may be attached to a host
+ ethertap device as follows:
+
+
+ eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254
+
+
+
+
+ This sets up eth0 inside the virtual machine to attach itself to the
+ host /dev/tap0, assigns it an ethernet address, and assigns the host
+ tap0 interface an IP address.
+
+
+
+ Note that the IP address you assign to the host end of the tap device
+ must be different than the IP you assign to the eth device inside UML.
+ If you are short on IPs and don't want to consume two per UML, then
+ you can reuse the host's eth IP address for the host ends of the tap
+ devices. Internally, the UMLs must still get unique IPs for their eth
+ devices. You can also give the UMLs non-routable IPs (192.168.x.x or
+ 10.x.x.x) and have the host masquerade them. This will let outgoing
+ connections work, but incoming connections won't without more work,
+ such as port forwarding from the host.
+ Also note that when you configure the host side of an interface, it is
+ only acting as a gateway. It will respond to pings sent to it
+ locally, but is not useful to do that since it's a host interface.
+ You are not talking to the UML when you ping that interface and get a
+ response.
+
+
+ You can also add devices to a UML and remove them at runtime. See the
+ ``The Management Console'' page for details.
+
+
+ The sections below describe this in more detail.
+
+
+ Once you've decided how you're going to set up the devices, you boot
+ UML, log in, configure the UML side of the devices, and set up routes
+ to the outside world. At that point, you will be able to talk to any
+ other machines, physical or virtual, on the net.
+
+
+ If ifconfig inside UML fails and the network refuses to come up, run
+ tell you what went wrong.
+
+
+
+ 66..22.. UUsseerrssppaaccee ddaaeemmoonnss
+
+ You will likely need the setuid helper, or the switch daemon, or both.
+ They are both installed with the RPM and deb, so if you've installed
+ either, you can skip the rest of this section.
+
+
+ If not, then you need to check them out of CVS, build them, and
+ install them. The helper is uml_net, in CVS /tools/uml_net, and the
+ daemon is uml_switch, in CVS /tools/uml_router. They are both built
+ with a plain 'make'. Both need to be installed in a directory that's
+ in your path - /usr/bin is recommend. On top of that, uml_net needs
+ to be setuid root.
+
+
+
+ 66..33.. SSppeecciiffyyiinngg eetthheerrnneett aaddddrreesssseess
+
+ Below, you will see that the TUN/TAP, ethertap, and daemon interfaces
+ allow you to specify hardware addresses for the virtual ethernet
+ devices. This is generally not necessary. If you don't have a
+ specific reason to do it, you probably shouldn't. If one is not
+ specified on the command line, the driver will assign one based on the
+ device IP address. It will provide the address fe:fd:nn:nn:nn:nn
+ where nn.nn.nn.nn is the device IP address. This is nearly always
+ sufficient to guarantee a unique hardware address for the device. A
+ couple of exceptions are:
+
+ +o Another set of virtual ethernet devices are on the same network and
+ they are assigned hardware addresses using a different scheme which
+ may conflict with the UML IP address-based scheme
+
+ +o You aren't going to use the device for IP networking, so you don't
+ assign the device an IP address
+
+ If you let the driver provide the hardware address, you should make
+ sure that the device IP address is known before the interface is
+ brought up. So, inside UML, this will guarantee that:
+
+
+
+ UML#
+ ifconfig eth0 192.168.0.250 up
+
+
+
+
+ If you decide to assign the hardware address yourself, make sure that
+ the first byte of the address is even. Addresses with an odd first
+ byte are broadcast addresses, which you don't want assigned to a
+ device.
+
+
+
+ 66..44.. UUMMLL iinntteerrffaaccee sseettuupp
+
+ Once the network devices have been described on the command line, you
+ should boot UML and log in.
+
+
+ The first thing to do is bring the interface up:
+
+
+ UML# ifconfig ethn ip-address up
+
+
+
+
+ You should be able to ping the host at this point.
+
+
+ To reach the rest of the world, you should set a default route to the
+ host:
+
+
+ UML# route add default gw host ip
+
+
+
+
+ Again, with host ip of 192.168.0.4:
+
+
+ UML# route add default gw 192.168.0.4
+
+
+
+
+ This page used to recommend setting a network route to your local net.
+ This is wrong, because it will cause UML to try to figure out hardware
+ addresses of the local machines by arping on the interface to the
+ host. Since that interface is basically a single strand of ethernet
+ with two nodes on it (UML and the host) and arp requests don't cross
+ networks, they will fail to elicit any responses. So, what you want
+ is for UML to just blindly throw all packets at the host and let it
+ figure out what to do with them, which is what leaving out the network
+ route and adding the default route does.
+
+
+ Note: If you can't communicate with other hosts on your physical
+ ethernet, it's probably because of a network route that's
+ automatically set up. If you run 'route -n' and see a route that
+ looks like this:
+
+
+
+
+ Destination Gateway Genmask Flags Metric Ref Use Iface
+ 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0
+
+
+
+
+ with a mask that's not 255.255.255.255, then replace it with a route
+ to your host:
+
+
+ UML#
+ route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0
+
+
+
+
+
+
+ UML#
+ route add -host 192.168.0.4 dev eth0
+
+
+
+
+ This, plus the default route to the host, will allow UML to exchange
+ packets with any machine on your ethernet.
+
+
+
+ 66..55.. MMuullttiiccaasstt
+
+ The simplest way to set up a virtual network between multiple UMLs is
+ to use the mcast transport. This was written by Harald Welte and is
+ present in UML version 2.4.5-5um and later. Your system must have
+ multicast enabled in the kernel and there must be a multicast-capable
+ network device on the host. Normally, this is eth0, but if there is
+ no ethernet card on the host, then you will likely get strange error
+ messages when you bring the device up inside UML.
+
+
+ To use it, run two UMLs with
+
+
+ eth0=mcast
+
+
+
+
+ on their command lines. Log in, configure the ethernet device in each
+ machine with different IP addresses:
+
+
+ UML1# ifconfig eth0 192.168.0.254
+
+
+
+
+
+
+ UML2# ifconfig eth0 192.168.0.253
+
+
+
+
+ and they should be able to talk to each other.
+
+ The full set of command line options for this transport are
+
+
+
+ ethn=mcast,ethernet address,multicast
+ address,multicast port,ttl
+
+
+
+
+ Harald's original README is here <http://user-mode-linux.source-
+ forge.net/text/mcast.txt> and explains these in detail, as well as
+ some other issues.
+
+
+
+ 66..66.. TTUUNN//TTAAPP wwiitthh tthhee uummll__nneett hheellppeerr
+
+ TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the
+ host. The TUN/TAP backend has been in UML since 2.4.9-3um.
+
+
+ The easiest way to get up and running is to let the setuid uml_net
+ helper do the host setup for you. This involves insmod-ing the tun.o
+ module if necessary, configuring the device, and setting up IP
+ forwarding, routing, and proxy arp. If you are new to UML networking,
+ do this first. If you're concerned about the security implications of
+ the setuid helper, use it to get up and running, then read the next
+ section to see how to have UML use a preconfigured tap device, which
+ avoids the use of uml_net.
+
+
+ If you specify an IP address for the host side of the device, the
+ uml_net helper will do all necessary setup on the host - the only
+ requirement is that TUN/TAP be available, either built in to the host
+ kernel or as the tun.o module.
+
+ The format of the command line switch to attach a device to a TUN/TAP
+ device is
+
+
+ eth <n> =tuntap,,, <IP address>
+
+
+
+
+ For example, this argument will attach the UML's eth0 to the next
+ available tap device and assign an ethernet address to it based on its
+ IP address
+
+
+ eth0=tuntap,,,192.168.0.254
+
+
+
+
+
+
+ Note that the IP address that must be used for the eth device inside
+ UML is fixed by the routing and proxy arp that is set up on the
+ TUN/TAP device on the host. You can use a different one, but it won't
+ work because reply packets won't reach the UML. This is a feature.
+ It prevents a nasty UML user from doing things like setting the UML IP
+ to the same as the network's nameserver or mail server.
+
+
+ There are a couple potential problems with running the TUN/TAP
+ transport on a 2.4 host kernel
+
+ +o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host
+ kernel or use the ethertap transport.
+
+ +o With an upgraded kernel, TUN/TAP may fail with
+
+
+ File descriptor in bad state
+
+
+
+
+ This is due to a header mismatch between the upgraded kernel and the
+ kernel that was originally installed on the machine. The fix is to
+ make sure that /usr/src/linux points to the headers for the running
+ kernel.
+
+ These were pointed out by Tim Robinson <timro at trkr dot net> in
+ <http://www.geocrawler.com/lists/3/SourceForge/597/0/> name="this uml-
+ user post"> .
+
+
+
+ 66..77.. TTUUNN//TTAAPP wwiitthh aa pprreeccoonnffiigguurreedd ttaapp ddeevviiccee
+
+ If you prefer not to have UML use uml_net (which is somewhat
+ insecure), with UML 2.4.17-11, you can set up a TUN/TAP device
+ beforehand. The setup needs to be done as root, but once that's done,
+ there is no need for root assistance. Setting up the device is done
+ as follows:
+
+ +o Create the device with tunctl (available from the UML utilities
+ tarball)
+
+
+
+
+ host# tunctl -u uid
+
+
+
+
+ where uid is the user id or username that UML will be run as. This
+ will tell you what device was created.
+
+ +o Configure the device IP (change IP addresses and device name to
+ suit)
+
+
+
+
+ host# ifconfig tap0 192.168.0.254 up
+
+
+
+
+
+ +o Set up routing and arping if desired - this is my recipe, there are
+ other ways of doing the same thing
+
+
+ host#
+ bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward'
+
+ host#
+ route add -host 192.168.0.253 dev tap0
+
+
+
+
+
+
+ host#
+ bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp'
+
+
+
+
+
+
+ host#
+ arp -Ds 192.168.0.253 eth0 pub
+
+
+
+
+ Note that this must be done every time the host boots - this configu-
+ ration is not stored across host reboots. So, it's probably a good
+ idea to stick it in an rc file. An even better idea would be a little
+ utility which reads the information from a config file and sets up
+ devices at boot time.
+
+ +o Rather than using up two IPs and ARPing for one of them, you can
+ also provide direct access to your LAN by the UML by using a
+ bridge.
+
+
+ host#
+ brctl addbr br0
+
+
+
+
+
+
+ host#
+ ifconfig eth0 0.0.0.0 promisc up
+
+
+
+
+
+
+ host#
+ ifconfig tap0 0.0.0.0 promisc up
+
+
+
+
+
+
+ host#
+ ifconfig br0 192.168.0.1 netmask 255.255.255.0 up
+
+
+
+
+
+
+
+ host#
+ brctl stp br0 off
+
+
+
+
+
+
+ host#
+ brctl setfd br0 1
+
+
+
+
+
+
+ host#
+ brctl sethello br0 1
+
+
+
+
+
+
+ host#
+ brctl addif br0 eth0
+
+
+
+
+
+
+ host#
+ brctl addif br0 tap0
+
+
+
+
+ Note that 'br0' should be setup using ifconfig with the existing IP
+ address of eth0, as eth0 no longer has its own IP.
+
+ +o
+
+
+ Also, the /dev/net/tun device must be writable by the user running
+ UML in order for the UML to use the device that's been configured
+ for it. The simplest thing to do is
+
+
+ host# chmod 666 /dev/net/tun
+
+
+
+
+ Making it world-writable looks bad, but it seems not to be
+ exploitable as a security hole. However, it does allow anyone to cre-
+ ate useless tap devices (useless because they can't configure them),
+ which is a DOS attack. A somewhat more secure alternative would to be
+ to create a group containing all the users who have preconfigured tap
+ devices and chgrp /dev/net/tun to that group with mode 664 or 660.
+
+
+ +o Once the device is set up, run UML with 'eth0=tuntap,device name'
+ (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the
+ mconsole config command).
+
+ +o Bring the eth device up in UML and you're in business.
+
+ If you don't want that tap device any more, you can make it non-
+ persistent with
+
+
+ host# tunctl -d tap device
+
+
+
+
+ Finally, tunctl has a -b (for brief mode) switch which causes it to
+ output only the name of the tap device it created. This makes it
+ suitable for capture by a script:
+
+
+ host# TAP=`tunctl -u 1000 -b`
+
+
+
+
+
+
+ 66..88.. EEtthheerrttaapp
+
+ Ethertap is the general mechanism on 2.2 for userspace processes to
+ exchange packets with the kernel.
+
+
+
+ To use this transport, you need to describe the virtual network device
+ on the UML command line. The general format for this is
+
+
+ eth <n> =ethertap, <device> , <ethernet address> , <tap IP address>
+
+
+
+
+ So, the previous example
+
+
+ eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254
+
+
+
+
+ attaches the UML eth0 device to the host /dev/tap0, assigns it the
+ ethernet address fe:fd:0:0:0:1, and assigns the IP address
+ 192.168.0.254 to the tap device.
+
+
+
+ The tap device is mandatory, but the others are optional. If the
+ ethernet address is omitted, one will be assigned to it.
+
+
+ The presence of the tap IP address will cause the helper to run and do
+ whatever host setup is needed to allow the virtual machine to
+ communicate with the outside world. If you're not sure you know what
+ you're doing, this is the way to go.
+
+
+ If it is absent, then you must configure the tap device and whatever
+ arping and routing you will need on the host. However, even in this
+ case, the uml_net helper still needs to be in your path and it must be
+ setuid root if you're not running UML as root. This is because the
+ tap device doesn't support SIGIO, which UML needs in order to use
+ something as a source of input. So, the helper is used as a
+ convenient asynchronous IO thread.
+
+ If you're using the uml_net helper, you can ignore the following host
+ setup - uml_net will do it for you. You just need to make sure you
+ have ethertap available, either built in to the host kernel or
+ available as a module.
+
+
+ If you want to set things up yourself, you need to make sure that the
+ appropriate /dev entry exists. If it doesn't, become root and create
+ it as follows:
+
+
+ mknod /dev/tap <minor> c 36 <minor> + 16
+
+
+
+
+ For example, this is how to create /dev/tap0:
+
+
+ mknod /dev/tap0 c 36 0 + 16
+
+
+
+
+ You also need to make sure that the host kernel has ethertap support.
+ If ethertap is enabled as a module, you apparently need to insmod
+ ethertap once for each ethertap device you want to enable. So,
+
+
+ host#
+ insmod ethertap
+
+
+
+
+ will give you the tap0 interface. To get the tap1 interface, you need
+ to run
+
+
+ host#
+ insmod ethertap unit=1 -o ethertap1
+
+
+
+
+
+
+
+ 66..99.. TThhee sswwiittcchh ddaaeemmoonn
+
+ NNoottee: This is the daemon formerly known as uml_router, but which was
+ renamed so the network weenies of the world would stop growling at me.
+
+
+ The switch daemon, uml_switch, provides a mechanism for creating a
+ totally virtual network. By default, it provides no connection to the
+ host network (but see -tap, below).
+
+
+ The first thing you need to do is run the daemon. Running it with no
+ arguments will make it listen on a default pair of unix domain
+ sockets.
+
+
+ If you want it to listen on a different pair of sockets, use
+
+
+ -unix control socket data socket
+
+
+
+
+
+ If you want it to act as a hub rather than a switch, use
+
+
+ -hub
+
+
+
+
+
+ If you want the switch to be connected to host networking (allowing
+ the umls to get access to the outside world through the host), use
+
+
+ -tap tap0
+
+
+
+
+
+ Note that the tap device must be preconfigured (see "TUN/TAP with a
+ preconfigured tap device", above). If you're using a different tap
+ device than tap0, specify that instead of tap0.
+
+
+ uml_switch can be backgrounded as follows
+
+
+ host%
+ uml_switch [ options ] < /dev/null > /dev/null
+
+
+
+
+ The reason it doesn't background by default is that it listens to
+ stdin for EOF. When it sees that, it exits.
+
+
+ The general format of the kernel command line switch is
+
+
+
+ ethn=daemon,ethernet address,socket
+ type,control socket,data socket
+
+
+
+
+ You can leave off everything except the 'daemon'. You only need to
+ specify the ethernet address if the one that will be assigned to it
+ isn't acceptable for some reason. The rest of the arguments describe
+ how to communicate with the daemon. You should only specify them if
+ you told the daemon to use different sockets than the default. So, if
+ you ran the daemon with no arguments, running the UML on the same
+ machine with
+ eth0=daemon
+
+
+
+
+ will cause the eth0 driver to attach itself to the daemon correctly.
+
+
+
+ 66..1100.. SSlliipp
+
+ Slip is another, less general, mechanism for a process to communicate
+ with the host networking. In contrast to the ethertap interface,
+ which exchanges ethernet frames with the host and can be used to
+ transport any higher-level protocol, it can only be used to transport
+ IP.
+
+
+ The general format of the command line switch is
+
+
+
+ ethn=slip,slip IP
+
+
+
+
+ The slip IP argument is the IP address that will be assigned to the
+ host end of the slip device. If it is specified, the helper will run
+ and will set up the host so that the virtual machine can reach it and
+ the rest of the network.
+
+
+ There are some oddities with this interface that you should be aware
+ of. You should only specify one slip device on a given virtual
+ machine, and its name inside UML will be 'umn', not 'eth0' or whatever
+ you specified on the command line. These problems will be fixed at
+ some point.
+
+
+
+ 66..1111.. SSlliirrpp
+
+ slirp uses an external program, usually /usr/bin/slirp, to provide IP
+ only networking connectivity through the host. This is similar to IP
+ masquerading with a firewall, although the translation is performed in
+ user-space, rather than by the kernel. As slirp does not set up any
+ interfaces on the host, or changes routing, slirp does not require
+ root access or setuid binaries on the host.
+
+
+ The general format of the command line switch for slirp is:
+
+
+
+ ethn=slirp,ethernet address,slirp path
+
+
+
+
+ The ethernet address is optional, as UML will set up the interface
+ with an ethernet address based upon the initial IP address of the
+ interface. The slirp path is generally /usr/bin/slirp, although it
+ will depend on distribution.
+
+
+ The slirp program can have a number of options passed to the command
+ line and we can't add them to the UML command line, as they will be
+ parsed incorrectly. Instead, a wrapper shell script can be written or
+ the options inserted into the /.slirprc file. More information on
+ all of the slirp options can be found in its man pages.
+
+
+ The eth0 interface on UML should be set up with the IP 10.2.0.15,
+ although you can use anything as long as it is not used by a network
+ you will be connecting to. The default route on UML should be set to
+ use
+
+
+ UML#
+ route add default dev eth0
+
+
+
+
+ slirp provides a number of useful IP addresses which can be used by
+ UML, such as 10.0.2.3 which is an alias for the DNS server specified
+ in /etc/resolv.conf on the host or the IP given in the 'dns' option
+ for slirp.
+
+
+ Even with a baudrate setting higher than 115200, the slirp connection
+ is limited to 115200. If you need it to go faster, the slirp binary
+ needs to be compiled with FULL_BOLT defined in config.h.
+
+
+
+ 66..1122.. ppccaapp
+
+ The pcap transport is attached to a UML ethernet device on the command
+ line or with uml_mconsole with the following syntax:
+
+
+
+ ethn=pcap,host interface,filter
+ expression,option1,option2
+
+
+
+
+ The expression and options are optional.
+
+
+ The interface is whatever network device on the host you want to
+ sniff. The expression is a pcap filter expression, which is also what
+ tcpdump uses, so if you know how to specify tcpdump filters, you will
+ use the same expressions here. The options are up to two of
+ 'promisc', control whether pcap puts the host interface into
+ promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap
+ expression optimizer is used.
+
+
+ Example:
+
+
+
+ eth0=pcap,eth0,tcp
+
+ eth1=pcap,eth0,!tcp
+
+
+
+ will cause the UML eth0 to emit all tcp packets on the host eth0 and
+ the UML eth1 to emit all non-tcp packets on the host eth0.
+
+
+
+ 66..1133.. SSeettttiinngg uupp tthhee hhoosstt yyoouurrsseellff
+
+ If you don't specify an address for the host side of the ethertap or
+ slip device, UML won't do any setup on the host. So this is what is
+ needed to get things working (the examples use a host-side IP of
+ 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your
+ own network):
+
+ +o The device needs to be configured with its IP address. Tap devices
+ are also configured with an mtu of 1484. Slip devices are
+ configured with a point-to-point address pointing at the UML ip
+ address.
+
+
+ host# ifconfig tap0 arp mtu 1484 192.168.0.251 up
+
+
+
+
+
+
+ host#
+ ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up
+
+
+
+
+
+ +o If a tap device is being set up, a route is set to the UML IP.
+
+
+ UML# route add -host 192.168.0.250 gw 192.168.0.251
+
+
+
+
+
+ +o To allow other hosts on your network to see the virtual machine,
+ proxy arp is set up for it.
+
+
+ host# arp -Ds 192.168.0.250 eth0 pub
+
+
+
+
+
+ +o Finally, the host is set up to route packets.
+
+
+ host# echo 1 > /proc/sys/net/ipv4/ip_forward
+
+
+
+
+
+
+
+
+
+
+ 77.. SShhaarriinngg FFiilleessyysstteemmss bbeettwweeeenn VViirrttuuaall MMaacchhiinneess
+
+
+
+
+ 77..11.. AA wwaarrnniinngg
+
+ Don't attempt to share filesystems simply by booting two UMLs from the
+ same file. That's the same thing as booting two physical machines
+ from a shared disk. It will result in filesystem corruption.
+
+
+
+ 77..22.. UUssiinngg llaayyeerreedd bblloocckk ddeevviicceess
+
+ The way to share a filesystem between two virtual machines is to use
+ the copy-on-write (COW) layering capability of the ubd block driver.
+ As of 2.4.6-2um, the driver supports layering a read-write private
+ device over a read-only shared device. A machine's writes are stored
+ in the private device, while reads come from either device - the
+ private one if the requested block is valid in it, the shared one if
+ not. Using this scheme, the majority of data which is unchanged is
+ shared between an arbitrary number of virtual machines, each of which
+ has a much smaller file containing the changes that it has made. With
+ a large number of UMLs booting from a large root filesystem, this
+ leads to a huge disk space saving. It will also help performance,
+ since the host will be able to cache the shared data using a much
+ smaller amount of memory, so UML disk requests will be served from the
+ host's memory rather than its disks.
+
+
+
+
+ To add a copy-on-write layer to an existing block device file, simply
+ add the name of the COW file to the appropriate ubd switch:
+
+
+ ubd0=root_fs_cow,root_fs_debian_22
+
+
+
+
+ where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is
+ the existing shared filesystem. The COW file need not exist. If it
+ doesn't, the driver will create and initialize it. Once the COW file
+ has been initialized, it can be used on its own on the command line:
+
+
+ ubd0=root_fs_cow
+
+
+
+
+ The name of the backing file is stored in the COW file header, so it
+ would be redundant to continue specifying it on the command line.
+
+
+
+ 77..33.. NNoottee!!
+
+ When checking the size of the COW file in order to see the gobs of
+ space that you're saving, make sure you use 'ls -ls' to see the actual
+ disk consumption rather than the length of the file. The COW file is
+ sparse, so the length will be very different from the disk usage.
+ Here is a 'ls -l' of a COW file and backing file from one boot and
+ shutdown:
+ host% ls -l cow.debian debian2.2
+ -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian
+ -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2
+
+
+
+
+ Doesn't look like much saved space, does it? Well, here's 'ls -ls':
+
+
+ host% ls -ls cow.debian debian2.2
+ 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian
+ 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2
+
+
+
+
+ Now, you can see that the COW file has less than a meg of disk, rather
+ than 492 meg.
+
+
+
+ 77..44.. AAnnootthheerr wwaarrnniinngg
+
+ Once a filesystem is being used as a readonly backing file for a COW
+ file, do not boot directly from it or modify it in any way. Doing so
+ will invalidate any COW files that are using it. The mtime and size
+ of the backing file are stored in the COW file header at its creation,
+ and they must continue to match. If they don't, the driver will
+ refuse to use the COW file.
+
+
+
+
+ If you attempt to evade this restriction by changing either the
+ backing file or the COW header by hand, you will get a corrupted
+ filesystem.
+
+
+
+
+ Among other things, this means that upgrading the distribution in a
+ backing file and expecting that all of the COW files using it will see
+ the upgrade will not work.
+
+
+
+
+ 77..55.. uummll__mmoooo :: MMeerrggiinngg aa CCOOWW ffiillee wwiitthh iittss bbaacckkiinngg ffiillee
+
+ Depending on how you use UML and COW devices, it may be advisable to
+ merge the changes in the COW file into the backing file every once in
+ a while.
+
+
+
+
+ The utility that does this is uml_moo. Its usage is
+
+
+ host% uml_moo COW file new backing file
+
+
+
+
+ There's no need to specify the backing file since that information is
+ already in the COW file header. If you're paranoid, boot the new
+ merged file, and if you're happy with it, move it over the old backing
+ file.
+
+
+
+
+ uml_moo creates a new backing file by default as a safety measure. It
+ also has a destructive merge option which will merge the COW file
+ directly into its current backing file. This is really only usable
+ when the backing file only has one COW file associated with it. If
+ there are multiple COWs associated with a backing file, a -d merge of
+ one of them will invalidate all of the others. However, it is
+ convenient if you're short of disk space, and it should also be
+ noticeably faster than a non-destructive merge.
+
+
+
+
+ uml_moo is installed with the UML deb and RPM. If you didn't install
+ UML from one of those packages, you can also get it from the UML
+ utilities <http://user-mode-linux.sourceforge.net/dl-sf.html#UML
+ utilities> tar file in tools/moo.
+
+
+
+
+
+
+
+
+ 88.. CCrreeaattiinngg ffiilleessyysstteemmss
+
+
+ You may want to create and mount new UML filesystems, either because
+ your root filesystem isn't large enough or because you want to use a
+ filesystem other than ext2.
+
+
+ This was written on the occasion of reiserfs being included in the
+ 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will
+ talk about reiserfs. This information is generic, and the examples
+ should be easy to translate to the filesystem of your choice.
+
+
+ 88..11.. CCrreeaattee tthhee ffiilleessyysstteemm ffiillee
+
+ dd is your friend. All you need to do is tell dd to create an empty
+ file of the appropriate size. I usually make it sparse to save time
+ and to avoid allocating disk space until it's actually used. For
+ example, the following command will create a sparse 100 meg file full
+ of zeroes.
+
+
+ host%
+ dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M
+
+
+
+
+
+
+ 88..22.. AAssssiiggnn tthhee ffiillee ttoo aa UUMMLL ddeevviiccee
+
+ Add an argument like the following to the UML command line:
+
+ ubd4=new_filesystem
+
+
+
+
+ making sure that you use an unassigned ubd device number.
+
+
+
+ 88..33.. CCrreeaattiinngg aanndd mmoouunnttiinngg tthhee ffiilleessyysstteemm
+
+ Make sure that the filesystem is available, either by being built into
+ the kernel, or available as a module, then boot up UML and log in. If
+ the root filesystem doesn't have the filesystem utilities (mkfs, fsck,
+ etc), then get them into UML by way of the net or hostfs.
+
+
+ Make the new filesystem on the device assigned to the new file:
+
+
+ host# mkreiserfs /dev/ubd/4
+
+
+ <----------- MKREISERFSv2 ----------->
+
+ ReiserFS version 3.6.25
+ Block size 4096 bytes
+ Block count 25856
+ Used blocks 8212
+ Journal - 8192 blocks (18-8209), journal header is in block 8210
+ Bitmaps: 17
+ Root block 8211
+ Hash function "r5"
+ ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y
+ journal size 8192 (from 18)
+ Initializing journal - 0%....20%....40%....60%....80%....100%
+ Syncing..done.
+
+
+
+
+ Now, mount it:
+
+
+ UML#
+ mount /dev/ubd/4 /mnt
+
+
+
+
+ and you're in business.
+
+
+
+
+
+
+
+
+
+ 99.. HHoosstt ffiillee aacccceessss
+
+
+ If you want to access files on the host machine from inside UML, you
+ can treat it as a separate machine and either nfs mount directories
+ from the host or copy files into the virtual machine with scp or rcp.
+ However, since UML is running on the host, it can access those
+ files just like any other process and make them available inside the
+ virtual machine without needing to use the network.
+
+
+ This is now possible with the hostfs virtual filesystem. With it, you
+ can mount a host directory into the UML filesystem and access the
+ files contained in it just as you would on the host.
+
+
+ 99..11.. UUssiinngg hhoossttffss
+
+ To begin with, make sure that hostfs is available inside the virtual
+ machine with
+
+
+ UML# cat /proc/filesystems
+
+
+
+ . hostfs should be listed. If it's not, either rebuild the kernel
+ with hostfs configured into it or make sure that hostfs is built as a
+ module and available inside the virtual machine, and insmod it.
+
+
+ Now all you need to do is run mount:
+
+
+ UML# mount none /mnt/host -t hostfs
+
+
+
+
+ will mount the host's / on the virtual machine's /mnt/host.
+
+
+ If you don't want to mount the host root directory, then you can
+ specify a subdirectory to mount with the -o switch to mount:
+
+
+ UML# mount none /mnt/home -t hostfs -o /home
+
+
+
+
+ will mount the hosts's /home on the virtual machine's /mnt/home.
+
+
+
+ 99..22.. hhoossttffss aass tthhee rroooott ffiilleessyysstteemm
+
+ It's possible to boot from a directory hierarchy on the host using
+ hostfs rather than using the standard filesystem in a file.
+
+ To start, you need that hierarchy. The easiest way is to loop mount
+ an existing root_fs file:
+
+
+ host# mount root_fs uml_root_dir -o loop
+
+
+
+
+ You need to change the filesystem type of / in etc/fstab to be
+ 'hostfs', so that line looks like this:
+
+ /dev/ubd/0 / hostfs defaults 1 1
+
+
+
+
+ Then you need to chown to yourself all the files in that directory
+ that are owned by root. This worked for me:
+
+
+ host# find . -uid 0 -exec chown jdike {} \;
+
+
+
+
+ Next, make sure that your UML kernel has hostfs compiled in, not as a
+ module. Then run UML with the boot device pointing at that directory:
+
+
+ ubd0=/path/to/uml/root/directory
+
+
+
+
+ UML should then boot as it does normally.
+
+
+ 99..33.. BBuuiillddiinngg hhoossttffss
+
+ If you need to build hostfs because it's not in your kernel, you have
+ two choices:
+
+
+
+ +o Compiling hostfs into the kernel:
+
+
+ Reconfigure the kernel and set the 'Host filesystem' option under
+
+
+ +o Compiling hostfs as a module:
+
+
+ Reconfigure the kernel and set the 'Host filesystem' option under
+ be in arch/um/fs/hostfs/hostfs.o. Install that in
+ /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and
+
+
+ UML# insmod hostfs
+
+
+
+
+
+
+
+
+
+
+
+
+ 1100.. TThhee MMaannaaggeemmeenntt CCoonnssoollee
+
+
+
+ The UML management console is a low-level interface to the kernel,
+ somewhat like the i386 SysRq interface. Since there is a full-blown
+ operating system under UML, there is much greater flexibility possible
+ than with the SysRq mechanism.
+
+
+ There are a number of things you can do with the mconsole interface:
+
+ +o get the kernel version
+
+ +o add and remove devices
+
+ +o halt or reboot the machine
+
+ +o Send SysRq commands
+
+ +o Pause and resume the UML
+
+
+ You need the mconsole client (uml_mconsole) which is present in CVS
+ (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in
+ 2.4.6.
+
+
+ You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML.
+ When you boot UML, you'll see a line like:
+
+
+ mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole
+
+
+
+
+ If you specify a unique machine id one the UML command line, i.e.
+
+
+ umid=debian
+
+
+
+
+ you'll see this
+
+
+ mconsole initialized on /home/jdike/.uml/debian/mconsole
+
+
+
+
+ That file is the socket that uml_mconsole will use to communicate with
+ UML. Run it with either the umid or the full path as its argument:
+
+
+ host% uml_mconsole debian
+
+
+
+
+ or
+
+
+ host% uml_mconsole /home/jdike/.uml/debian/mconsole
+
+
+
+
+ You'll get a prompt, at which you can run one of these commands:
+
+ +o version
+
+ +o halt
+
+ +o reboot
+
+ +o config
+
+ +o remove
+
+ +o sysrq
+
+ +o help
+
+ +o cad
+
+ +o stop
+
+ +o go
+
+
+ 1100..11.. vveerrssiioonn
+
+ This takes no arguments. It prints the UML version.
+
+
+ (mconsole) version
+ OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686
+
+
+
+
+ There are a couple actual uses for this. It's a simple no-op which
+ can be used to check that a UML is running. It's also a way of
+ sending an interrupt to the UML. This is sometimes useful on SMP
+ hosts, where there's a bug which causes signals to UML to be lost,
+ often causing it to appear to hang. Sending such a UML the mconsole
+ version command is a good way to 'wake it up' before networking has
+ been enabled, as it does not do anything to the function of the UML.
+
+
+
+ 1100..22.. hhaalltt aanndd rreebboooott
+
+ These take no arguments. They shut the machine down immediately, with
+ no syncing of disks and no clean shutdown of userspace. So, they are
+ pretty close to crashing the machine.
+
+
+ (mconsole) halt
+ OK
+
+
+
+
+
+
+ 1100..33.. ccoonnffiigg
+
+ "config" adds a new device to the virtual machine. Currently the ubd
+ and network drivers support this. It takes one argument, which is the
+ device to add, with the same syntax as the kernel command line.
+
+
+
+
+ (mconsole)
+ config ubd3=/home/jdike/incoming/roots/root_fs_debian22
+
+ OK
+ (mconsole) config eth1=mcast
+ OK
+
+
+
+
+
+
+ 1100..44.. rreemmoovvee
+
+ "remove" deletes a device from the system. Its argument is just the
+ name of the device to be removed. The device must be idle in whatever
+ sense the driver considers necessary. In the case of the ubd driver,
+ the removed block device must not be mounted, swapped on, or otherwise
+ open, and in the case of the network driver, the device must be down.
+
+
+ (mconsole) remove ubd3
+ OK
+ (mconsole) remove eth1
+ OK
+
+
+
+
+
+
+ 1100..55.. ssyyssrrqq
+
+ This takes one argument, which is a single letter. It calls the
+ generic kernel's SysRq driver, which does whatever is called for by
+ that argument. See the SysRq documentation in Documentation/sysrq.txt
+ in your favorite kernel tree to see what letters are valid and what
+ they do.
+
+
+
+ 1100..66.. hheellpp
+
+ "help" returns a string listing the valid commands and what each one
+ does.
+
+
+
+ 1100..77.. ccaadd
+
+ This invokes the Ctl-Alt-Del action on init. What exactly this ends
+ up doing is up to /etc/inittab. Normally, it reboots the machine.
+ With UML, this is usually not desired, so if a halt would be better,
+ then find the section of inittab that looks like this
+
+
+ # What to do when CTRL-ALT-DEL is pressed.
+ ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now
+
+
+
+
+ and change the command to halt.
+
+
+
+ 1100..88.. ssttoopp
+
+ This puts the UML in a loop reading mconsole requests until a 'go'
+ mconsole command is received. This is very useful for making backups
+ of UML filesystems, as the UML can be stopped, then synced via 'sysrq
+ s', so that everything is written to the filesystem. You can then copy
+ the filesystem and then send the UML 'go' via mconsole.
+
+
+ Note that a UML running with more than one CPU will have problems
+ after you send the 'stop' command, as only one CPU will be held in a
+ mconsole loop and all others will continue as normal. This is a bug,
+ and will be fixed.
+
+
+
+ 1100..99.. ggoo
+
+ This resumes a UML after being paused by a 'stop' command. Note that
+ when the UML has resumed, TCP connections may have timed out and if
+ the UML is paused for a long period of time, crond might go a little
+ crazy, running all the jobs it didn't do earlier.
+
+
+
+
+
+
+
+
+ 1111.. KKeerrnneell ddeebbuuggggiinngg
+
+
+ NNoottee:: The interface that makes debugging, as described here, possible
+ is present in 2.4.0-test6 kernels and later.
+
+
+ Since the user-mode kernel runs as a normal Linux process, it is
+ possible to debug it with gdb almost like any other process. It is
+ slightly different because the kernel's threads are already being
+ ptraced for system call interception, so gdb can't ptrace them.
+ However, a mechanism has been added to work around that problem.
+
+
+ In order to debug the kernel, you need build it from source. See
+ ``Compiling the kernel and modules'' for information on doing that.
+ Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during
+ the config. These will compile the kernel with -g, and enable the
+ ptrace proxy so that gdb works with UML, respectively.
+
+
+
+
+ 1111..11.. SSttaarrttiinngg tthhee kkeerrnneell uunnddeerr ggddbb
+
+ You can have the kernel running under the control of gdb from the
+ beginning by putting 'debug' on the command line. You will get an
+ xterm with gdb running inside it. The kernel will send some commands
+ to gdb which will leave it stopped at the beginning of start_kernel.
+ At this point, you can get things going with 'next', 'step', or
+ 'cont'.
+
+
+ There is a transcript of a debugging session here <debug-
+ session.html> , with breakpoints being set in the scheduler and in an
+ interrupt handler.
+ 1111..22.. EExxaammiinniinngg sslleeeeppiinngg pprroocceesssseess
+
+ Not every bug is evident in the currently running process. Sometimes,
+ processes hang in the kernel when they shouldn't because they've
+ deadlocked on a semaphore or something similar. In this case, when
+ you ^C gdb and get a backtrace, you will see the idle thread, which
+ isn't very relevant.
+
+
+ What you want is the stack of whatever process is sleeping when it
+ shouldn't be. You need to figure out which process that is, which is
+ generally fairly easy. Then you need to get its host process id,
+ which you can do either by looking at ps on the host or at
+ task.thread.extern_pid in gdb.
+
+
+ Now what you do is this:
+
+ +o detach from the current thread
+
+
+ (UML gdb) det
+
+
+
+
+
+ +o attach to the thread you are interested in
+
+
+ (UML gdb) att <host pid>
+
+
+
+
+
+ +o look at its stack and anything else of interest
+
+
+ (UML gdb) bt
+
+
+
+
+ Note that you can't do anything at this point that requires that a
+ process execute, e.g. calling a function
+
+ +o when you're done looking at that process, reattach to the current
+ thread and continue it
+
+
+ (UML gdb)
+ att 1
+
+
+
+
+
+
+ (UML gdb)
+ c
+
+
+
+
+ Here, specifying any pid which is not the process id of a UML thread
+ will cause gdb to reattach to the current thread. I commonly use 1,
+ but any other invalid pid would work.
+
+
+
+ 1111..33.. RRuunnnniinngg dddddd oonn UUMMLL
+
+ ddd works on UML, but requires a special kludge. The process goes
+ like this:
+
+ +o Start ddd
+
+
+ host% ddd linux
+
+
+
+
+
+ +o With ps, get the pid of the gdb that ddd started. You can ask the
+ gdb to tell you, but for some reason that confuses things and
+ causes a hang.
+
+ +o run UML with 'debug=parent gdb-pid=<pid>' added to the command line
+ - it will just sit there after you hit return
+
+ +o type 'att 1' to the ddd gdb and you will see something like
+
+
+ 0xa013dc51 in __kill ()
+
+
+ (gdb)
+
+
+
+
+
+ +o At this point, type 'c', UML will boot up, and you can use ddd just
+ as you do on any other process.
+
+
+
+ 1111..44.. DDeebbuuggggiinngg mmoodduulleess
+
+ gdb has support for debugging code which is dynamically loaded into
+ the process. This support is what is needed to debug kernel modules
+ under UML.
+
+
+ Using that support is somewhat complicated. You have to tell gdb what
+ object file you just loaded into UML and where in memory it is. Then,
+ it can read the symbol table, and figure out where all the symbols are
+ from the load address that you provided. It gets more interesting
+ when you load the module again (i.e. after an rmmod). You have to
+ tell gdb to forget about all its symbols, including the main UML ones
+ for some reason, then load then all back in again.
+
+
+ There's an easy way and a hard way to do this. The easy way is to use
+ the umlgdb expect script written by Chandan Kudige. It basically
+ automates the process for you.
+
+
+ First, you must tell it where your modules are. There is a list in
+ the script that looks like this:
+ set MODULE_PATHS {
+ "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o"
+ "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o"
+ "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o"
+ }
+
+
+
+
+ You change that to list the names and paths of the modules that you
+ are going to debug. Then you run it from the toplevel directory of
+ your UML pool and it basically tells you what to do:
+
+
+
+
+ ******** GDB pid is 21903 ********
+ Start UML as: ./linux <kernel switches> debug gdb-pid=21903
+
+
+
+ GNU gdb 5.0rh-5 Red Hat Linux 7.1
+ Copyright 2001 Free Software Foundation, Inc.
+ GDB is free software, covered by the GNU General Public License, and you are
+ welcome to change it and/or distribute copies of it under certain conditions.
+ Type "show copying" to see the conditions.
+ There is absolutely no warranty for GDB. Type "show warranty" for details.
+ This GDB was configured as "i386-redhat-linux"...
+ (gdb) b sys_init_module
+ Breakpoint 1 at 0xa0011923: file module.c, line 349.
+ (gdb) att 1
+
+
+
+
+ After you run UML and it sits there doing nothing, you hit return at
+ the 'att 1' and continue it:
+
+
+ Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1
+ 0xa00f4221 in __kill ()
+ (UML gdb) c
+ Continuing.
+
+
+
+
+ At this point, you debug normally. When you insmod something, the
+ expect magic will kick in and you'll see something like:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ *** Module hostfs loaded ***
+ Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs",
+ mod_user=0x8070e00) at module.c:349
+ 349 char *name, *n_name, *name_tmp = NULL;
+ (UML gdb) finish
+ Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs",
+ mod_user=0x8070e00) at module.c:349
+ 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411
+ 411 else res = EXECUTE_SYSCALL(syscall, regs);
+ Value returned is $1 = 0
+ (UML gdb)
+ p/x (int)module_list + module_list->size_of_struct
+
+ $2 = 0xa9021054
+ (UML gdb) symbol-file ./linux
+ Load new symbol table from "./linux"? (y or n) y
+ Reading symbols from ./linux...
+ done.
+ (UML gdb)
+ add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054
+
+ add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at
+ .text_addr = 0xa9021054
+ (y or n) y
+
+ Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o...
+ done.
+ (UML gdb) p *module_list
+ $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs",
+ size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1,
+ nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0,
+ init = 0xa90221f0 <init_hostfs>, cleanup = 0xa902222c <exit_hostfs>,
+ ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0,
+ persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0,
+ kallsyms_end = 0x0,
+ archdata_start = 0x1b855 <Address 0x1b855 out of bounds>,
+ archdata_end = 0xe5890000 <Address 0xe5890000 out of bounds>,
+ kernel_data = 0xf689c35d <Address 0xf689c35d out of bounds>}
+ >> Finished loading symbols for hostfs ...
+
+
+
+
+ That's the easy way. It's highly recommended. The hard way is
+ described below in case you're interested in what's going on.
+
+
+ Boot the kernel under the debugger and load the module with insmod or
+ modprobe. With gdb, do:
+
+
+ (UML gdb) p module_list
+
+
+
+
+ This is a list of modules that have been loaded into the kernel, with
+ the most recently loaded module first. Normally, the module you want
+ is at module_list. If it's not, walk down the next links, looking at
+ the name fields until find the module you want to debug. Take the
+ address of that structure, and add module.size_of_struct (which in
+ 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition
+ for you :-):
+
+
+
+ (UML gdb)
+ printf "%#x\n", (int)module_list module_list->size_of_struct
+
+
+
+
+ The offset from the module start occasionally changes (before 2.4.0,
+ it was module.size_of_struct + 4), so it's a good idea to check the
+ init and cleanup addresses once in a while, as describe below. Now
+ do:
+
+
+ (UML gdb)
+ add-symbol-file /path/to/module/on/host that_address
+
+
+
+
+ Tell gdb you really want to do it, and you're in business.
+
+
+ If there's any doubt that you got the offset right, like breakpoints
+ appear not to work, or they're appearing in the wrong place, you can
+ check it by looking at the module structure. The init and cleanup
+ fields should look like:
+
+
+ init = 0x588066b0 <init_hostfs>, cleanup = 0x588066c0 <exit_hostfs>
+
+
+
+
+ with no offsets on the symbol names. If the names are right, but they
+ are offset, then the offset tells you how much you need to add to the
+ address you gave to add-symbol-file.
+
+
+ When you want to load in a new version of the module, you need to get
+ gdb to forget about the old one. The only way I've found to do that
+ is to tell gdb to forget about all symbols that it knows about:
+
+
+ (UML gdb) symbol-file
+
+
+
+
+ Then reload the symbols from the kernel binary:
+
+
+ (UML gdb) symbol-file /path/to/kernel
+
+
+
+
+ and repeat the process above. You'll also need to re-enable break-
+ points. They were disabled when you dumped all the symbols because
+ gdb couldn't figure out where they should go.
+
+
+
+ 1111..55.. AAttttaacchhiinngg ggddbb ttoo tthhee kkeerrnneell
+
+ If you don't have the kernel running under gdb, you can attach gdb to
+ it later by sending the tracing thread a SIGUSR1. The first line of
+ the console output identifies its pid:
+ tracing thread pid = 20093
+
+
+
+
+ When you send it the signal:
+
+
+ host% kill -USR1 20093
+
+
+
+
+ you will get an xterm with gdb running in it.
+
+
+ If you have the mconsole compiled into UML, then the mconsole client
+ can be used to start gdb:
+
+
+ (mconsole) (mconsole) config gdb=xterm
+
+
+
+
+ will fire up an xterm with gdb running in it.
+
+
+
+ 1111..66.. UUssiinngg aalltteerrnnaattee ddeebbuuggggeerrss
+
+ UML has support for attaching to an already running debugger rather
+ than starting gdb itself. This is present in CVS as of 17 Apr 2001.
+ I sent it to Alan for inclusion in the ac tree, and it will be in my
+ 2.4.4 release.
+
+
+ This is useful when gdb is a subprocess of some UI, such as emacs or
+ ddd. It can also be used to run debuggers other than gdb on UML.
+ Below is an example of using strace as an alternate debugger.
+
+
+ To do this, you need to get the pid of the debugger and pass it in
+ with the
+
+
+ If you are using gdb under some UI, then tell it to 'att 1', and
+ you'll find yourself attached to UML.
+
+
+ If you are using something other than gdb as your debugger, then
+ you'll need to get it to do the equivalent of 'att 1' if it doesn't do
+ it automatically.
+
+
+ An example of an alternate debugger is strace. You can strace the
+ actual kernel as follows:
+
+ +o Run the following in a shell
+
+
+ host%
+ sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out'
+
+
+
+ +o Run UML with 'debug' and 'gdb-pid=<pid>' with the pid printed out
+ by the previous command
+
+ +o Hit return in the shell, and UML will start running, and strace
+ output will start accumulating in the output file.
+
+ Note that this is different from running
+
+
+ host% strace ./linux
+
+
+
+
+ That will strace only the main UML thread, the tracing thread, which
+ doesn't do any of the actual kernel work. It just oversees the vir-
+ tual machine. In contrast, using strace as described above will show
+ you the low-level activity of the virtual machine.
+
+
+
+
+
+ 1122.. KKeerrnneell ddeebbuuggggiinngg eexxaammpplleess
+
+ 1122..11.. TThhee ccaassee ooff tthhee hhuunngg ffsscckk
+
+ When booting up the kernel, fsck failed, and dropped me into a shell
+ to fix things up. I ran fsck -y, which hung:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Setting hostname uml [ OK ]
+ Checking root filesystem
+ /dev/fhd0 was not cleanly unmounted, check forced.
+ Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780.
+
+ /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.
+ (i.e., without -a or -p options)
+ [ FAILED ]
+
+ *** An error occurred during the file system check.
+ *** Dropping you to a shell; the system will reboot
+ *** when you leave the shell.
+ Give root password for maintenance
+ (or type Control-D for normal startup):
+
+ [root@uml /root]# fsck -y /dev/fhd0
+ fsck -y /dev/fhd0
+ Parallelizing fsck version 1.14 (9-Jan-1999)
+ e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09
+ /dev/fhd0 contains a file system with errors, check forced.
+ Pass 1: Checking inodes, blocks, and sizes
+ Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes
+
+ Inode 19780, i_blocks is 1548, should be 540. Fix? yes
+
+ Pass 2: Checking directory structure
+ Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes
+
+ Directory inode 11858, block 0, offset 0: directory corrupted
+ Salvage? yes
+
+ Missing '.' in directory inode 11858.
+ Fix? yes
+
+ Missing '..' in directory inode 11858.
+ Fix? yes
+
+
+
+
+
+ The standard drill in this sort of situation is to fire up gdb on the
+ signal thread, which, in this case, was pid 1935. In another window,
+ I run gdb and attach pid 1935.
+
+
+
+
+ ~/linux/2.3.26/um 1016: gdb linux
+ GNU gdb 4.17.0.11 with Linux support
+ Copyright 1998 Free Software Foundation, Inc.
+ GDB is free software, covered by the GNU General Public License, and you are
+ welcome to change it and/or distribute copies of it under certain conditions.
+ Type "show copying" to see the conditions.
+ There is absolutely no warranty for GDB. Type "show warranty" for details.
+ This GDB was configured as "i386-redhat-linux"...
+
+ (gdb) att 1935
+ Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935
+ 0x100756d9 in __wait4 ()
+
+
+
+
+
+
+ Let's see what's currently running:
+
+
+
+ (gdb) p current_task.pid
+ $1 = 0
+
+
+
+
+
+ It's the idle thread, which means that fsck went to sleep for some
+ reason and never woke up.
+
+
+ Let's guess that the last process in the process list is fsck:
+
+
+
+ (gdb) p current_task.prev_task.comm
+ $13 = "fsck.ext2\000\000\000\000\000\000"
+
+
+
+
+
+ It is, so let's see what it thinks it's up to:
+
+
+
+ (gdb) p current_task.prev_task.thread
+ $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0,
+ kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = {
+ 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720},
+ request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = {
+ regs = {1350467584, 2952789424, 0 <repeats 15 times>}, sigstack = 0,
+ pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000,
+ arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = {
+ op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}}
+
+
+
+
+
+ The interesting things here are the fact that its .thread.syscall.id
+ is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or
+ the defines in include/asm-um/arch/unistd.h), and that it never
+ returned. Also, its .request.op is OP_SWITCH (see
+ arch/um/include/user_util.h). These mean that it went into a write,
+ and, for some reason, called schedule().
+
+
+ The fact that it never returned from write means that its stack should
+ be fairly interesting. Its pid is 1980 (.thread.extern_pid). That
+ process is being ptraced by the signal thread, so it must be detached
+ before gdb can attach it:
+
+
+
+
+
+
+
+
+
+
+ (gdb) call detach(1980)
+
+ Program received signal SIGSEGV, Segmentation fault.
+ <function called from gdb>
+ The program being debugged stopped while in a function called from GDB.
+ When the function (detach) is done executing, GDB will silently
+ stop (instead of continuing to evaluate the expression containing
+ the function call).
+ (gdb) call detach(1980)
+ $15 = 0
+
+
+
+
+
+ The first detach segfaults for some reason, and the second one
+ succeeds.
+
+
+ Now I detach from the signal thread, attach to the fsck thread, and
+ look at its stack:
+
+
+ (gdb) det
+ Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935
+ (gdb) att 1980
+ Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980
+ 0x10070451 in __kill ()
+ (gdb) bt
+ #0 0x10070451 in __kill ()
+ #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30
+ #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000)
+ at process_kern.c:156
+ #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000)
+ at process_kern.c:161
+ #4 0x10001d12 in schedule () at sched.c:777
+ #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71
+ #6 0x1006aa10 in __down_failed () at semaphore.c:157
+ #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174
+ #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182
+ #9 <signal handler called>
+ #10 0x10155404 in errno ()
+ #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50
+ #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174
+ #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182
+ #14 <signal handler called>
+ #15 0xc0fd in ?? ()
+ #16 0x10016647 in sys_write (fd=3,
+ buf=0x80b8800 <Address 0x80b8800 out of bounds>, count=1024)
+ at read_write.c:159
+ #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08)
+ at syscall_kern.c:254
+ #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35
+ #19 <signal handler called>
+ #20 0x400dc8b0 in ?? ()
+
+
+
+
+
+ The interesting things here are :
+
+ +o There are two segfaults on this stack (frames 9 and 14)
+
+ +o The first faulting address (frame 11) is 0x50000800
+
+ (gdb) p (void *)1342179328
+ $16 = (void *) 0x50000800
+
+
+
+
+
+ The initial faulting address is interesting because it is on the idle
+ thread's stack. I had been seeing the idle thread segfault for no
+ apparent reason, and the cause looked like stack corruption. In hopes
+ of catching the culprit in the act, I had turned off all protections
+ to that stack while the idle thread wasn't running. This apparently
+ tripped that trap.
+
+
+ However, the more immediate problem is that second segfault and I'm
+ going to concentrate on that. First, I want to see where the fault
+ happened, so I have to go look at the sigcontent struct in frame 8:
+
+
+
+ (gdb) up
+ #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30
+ 30 kill(pid, SIGUSR1);
+ (gdb)
+ #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000)
+ at process_kern.c:156
+ 156 usr1_pid(getpid());
+ (gdb)
+ #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000)
+ at process_kern.c:161
+ 161 _switch_to(prev, next);
+ (gdb)
+ #4 0x10001d12 in schedule () at sched.c:777
+ 777 switch_to(prev, next, prev);
+ (gdb)
+ #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71
+ 71 schedule();
+ (gdb)
+ #6 0x1006aa10 in __down_failed () at semaphore.c:157
+ 157 }
+ (gdb)
+ #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174
+ 174 segv(sc->cr2, sc->err & 2);
+ (gdb)
+ #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182
+ 182 segv_handler(sc);
+ (gdb) p *sc
+ Cannot access memory at address 0x0.
+
+
+
+
+ That's not very useful, so I'll try a more manual method:
+
+
+ (gdb) p *((struct sigcontext *) (&sig + 1))
+ $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43,
+ __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440,
+ esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0,
+ trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118,
+ esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0,
+ cr2 = 1280}
+
+
+
+ The ip is in handle_mm_fault:
+
+
+ (gdb) p (void *)268480945
+ $20 = (void *) 0x1000b1b1
+ (gdb) i sym $20
+ handle_mm_fault + 57 in section .text
+
+
+
+
+
+ Specifically, it's in pte_alloc:
+
+
+ (gdb) i line *$20
+ Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h"
+ starts at address 0x1000b1b1 <handle_mm_fault+57>
+ and ends at 0x1000b1b7 <handle_mm_fault+63>.
+
+
+
+
+
+ To find where in handle_mm_fault this is, I'll jump forward in the
+ code until I see an address in that procedure:
+
+
+
+ (gdb) i line *0x1000b1c0
+ Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h"
+ starts at address 0x1000b1b7 <handle_mm_fault+63>
+ and ends at 0x1000b1c3 <handle_mm_fault+75>.
+ (gdb) i line *0x1000b1d0
+ Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h"
+ starts at address 0x1000b1d0 <handle_mm_fault+88>
+ and ends at 0x1000b1da <handle_mm_fault+98>.
+ (gdb) i line *0x1000b1e0
+ Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h"
+ starts at address 0x1000b1da <handle_mm_fault+98>
+ and ends at 0x1000b1e1 <handle_mm_fault+105>.
+ (gdb) i line *0x1000b1f0
+ Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h"
+ starts at address 0x1000b1f0 <handle_mm_fault+120>
+ and ends at 0x1000b200 <handle_mm_fault+136>.
+ (gdb) i line *0x1000b200
+ Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h"
+ starts at address 0x1000b200 <handle_mm_fault+136>
+ and ends at 0x1000b208 <handle_mm_fault+144>.
+ (gdb) i line *0x1000b210
+ Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h"
+ starts at address 0x1000b210 <handle_mm_fault+152>
+ and ends at 0x1000b219 <handle_mm_fault+161>.
+ (gdb) i line *0x1000b220
+ Line 1168 of "memory.c" starts at address 0x1000b21e <handle_mm_fault+166>
+ and ends at 0x1000b222 <handle_mm_fault+170>.
+
+
+
+
+
+ Something is apparently wrong with the page tables or vma_structs, so
+ lets go back to frame 11 and have a look at them:
+
+
+
+ #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50
+ 50 handle_mm_fault(current, vma, address, is_write);
+ (gdb) call pgd_offset_proc(vma->vm_mm, address)
+ $22 = (pgd_t *) 0x80a548c
+
+
+
+
+
+ That's pretty bogus. Page tables aren't supposed to be in process
+ text or data areas. Let's see what's in the vma:
+
+
+ (gdb) p *vma
+ $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640,
+ vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200,
+ vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000,
+ vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63,
+ vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec,
+ vm_private_data = 0x62}
+ (gdb) p *vma.vm_mm
+ $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000,
+ pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288},
+ map_count = 134909076, mmap_sem = {count = {counter = 135073792},
+ sleepers = -1342177872, wait = {lock = <optimized out or zero length>,
+ task_list = {next = 0xaffffe63, prev = 0xaffffe7a},
+ __magic = -1342177670, __creator = -1342177300}, __magic = 98},
+ page_table_lock = {}, context = 138, start_code = 0, end_code = 0,
+ start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0,
+ arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536,
+ total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0,
+ swap_address = 0, segments = 0x0}
+
+
+
+
+
+ This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx
+ addresses, this is looking like a stack was plonked down on top of
+ these structures. Maybe it's a stack overflow from the next page:
+
+
+
+ (gdb) p vma
+ $25 = (struct vm_area_struct *) 0x507d2434
+
+
+
+
+
+ That's towards the lower quarter of the page, so that would have to
+ have been pretty heavy stack overflow:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (gdb) x/100x $25
+ 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c
+ 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000
+ 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a
+ 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000
+ 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000
+ 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000
+ 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000
+ 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000
+
+
+
+
+
+ It's not stack overflow. The only "stack-like" piece of this data is
+ the vma_struct itself.
+
+
+ At this point, I don't see any avenues to pursue, so I just have to
+ admit that I have no idea what's going on. What I will do, though, is
+ stick a trap on the segfault handler which will stop if it sees any
+ writes to the idle thread's stack. That was the thing that happened
+ first, and it may be that if I can catch it immediately, what's going
+ on will be somewhat clearer.
+
+
+ 1122..22.. EEppiissooddee 22:: TThhee ccaassee ooff tthhee hhuunngg ffsscckk
+
+ After setting a trap in the SEGV handler for accesses to the signal
+ thread's stack, I reran the kernel.
+
+
+ fsck hung again, this time by hitting the trap:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Setting hostname uml [ OK ]
+ Checking root filesystem
+ /dev/fhd0 contains a file system with errors, check forced.
+ Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780.
+
+ /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.
+ (i.e., without -a or -p options)
+ [ FAILED ]
+
+ *** An error occurred during the file system check.
+ *** Dropping you to a shell; the system will reboot
+ *** when you leave the shell.
+ Give root password for maintenance
+ (or type Control-D for normal startup):
+
+ [root@uml /root]# fsck -y /dev/fhd0
+ fsck -y /dev/fhd0
+ Parallelizing fsck version 1.14 (9-Jan-1999)
+ e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09
+ /dev/fhd0 contains a file system with errors, check forced.
+ Pass 1: Checking inodes, blocks, and sizes
+ Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes
+
+ Pass 2: Checking directory structure
+ Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes
+
+ Directory inode 11858, block 0, offset 0: directory corrupted
+ Salvage? yes
+
+ Missing '.' in directory inode 11858.
+ Fix? yes
+
+ Missing '..' in directory inode 11858.
+ Fix? yes
+
+ Untested (4127) [100fe44c]: trap_kern.c line 31
+
+
+
+
+
+ I need to get the signal thread to detach from pid 4127 so that I can
+ attach to it with gdb. This is done by sending it a SIGUSR1, which is
+ caught by the signal thread, which detaches the process:
+
+
+ kill -USR1 4127
+
+
+
+
+
+ Now I can run gdb on it:
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ~/linux/2.3.26/um 1034: gdb linux
+ GNU gdb 4.17.0.11 with Linux support
+ Copyright 1998 Free Software Foundation, Inc.
+ GDB is free software, covered by the GNU General Public License, and you are
+ welcome to change it and/or distribute copies of it under certain conditions.
+ Type "show copying" to see the conditions.
+ There is absolutely no warranty for GDB. Type "show warranty" for details.
+ This GDB was configured as "i386-redhat-linux"...
+ (gdb) att 4127
+ Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127
+ 0x10075891 in __libc_nanosleep ()
+
+
+
+
+
+ The backtrace shows that it was in a write and that the fault address
+ (address in frame 3) is 0x50000800, which is right in the middle of
+ the signal thread's stack page:
+
+
+ (gdb) bt
+ #0 0x10075891 in __libc_nanosleep ()
+ #1 0x1007584d in __sleep (seconds=1000000)
+ at ../sysdeps/unix/sysv/linux/sleep.c:78
+ #2 0x1006ce9a in stop () at user_util.c:191
+ #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31
+ #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174
+ #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182
+ #6 <signal handler called>
+ #7 0xc0fd in ?? ()
+ #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024)
+ at read_write.c:159
+ #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08)
+ at syscall_kern.c:254
+ #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35
+ #11 <signal handler called>
+ #12 0x400dc8b0 in ?? ()
+ #13 <signal handler called>
+ #14 0x400dc8b0 in ?? ()
+ #15 0x80545fd in ?? ()
+ #16 0x804daae in ?? ()
+ #17 0x8054334 in ?? ()
+ #18 0x804d23e in ?? ()
+ #19 0x8049632 in ?? ()
+ #20 0x80491d2 in ?? ()
+ #21 0x80596b5 in ?? ()
+ (gdb) p (void *)1342179328
+ $3 = (void *) 0x50000800
+
+
+
+
+
+ Going up the stack to the segv_handler frame and looking at where in
+ the code the access happened shows that it happened near line 110 of
+ block_dev.c:
+
+
+
+
+
+
+
+
+
+ (gdb) up
+ #1 0x1007584d in __sleep (seconds=1000000)
+ at ../sysdeps/unix/sysv/linux/sleep.c:78
+ ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory.
+ (gdb)
+ #2 0x1006ce9a in stop () at user_util.c:191
+ 191 while(1) sleep(1000000);
+ (gdb)
+ #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31
+ 31 KERN_UNTESTED();
+ (gdb)
+ #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174
+ 174 segv(sc->cr2, sc->err & 2);
+ (gdb) p *sc
+ $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43,
+ __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484,
+ esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14,
+ err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070,
+ esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0,
+ cr2 = 1342179328}
+ (gdb) p (void *)268550834
+ $2 = (void *) 0x1001c2b2
+ (gdb) i sym $2
+ block_write + 1090 in section .text
+ (gdb) i line *$2
+ Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h"
+ starts at address 0x1001c2a1 <block_write+1073>
+ and ends at 0x1001c2bf <block_write+1103>.
+ (gdb) i line *0x1001c2c0
+ Line 110 of "block_dev.c" starts at address 0x1001c2bf <block_write+1103>
+ and ends at 0x1001c2e3 <block_write+1139>.
+
+
+
+
+
+ Looking at the source shows that the fault happened during a call to
+ copy_to_user to copy the data into the kernel:
+
+
+ 107 count -= chars;
+ 108 copy_from_user(p,buf,chars);
+ 109 p += chars;
+ 110 buf += chars;
+
+
+
+
+
+ p is the pointer which must contain 0x50000800, since buf contains
+ 0x80b8800 (frame 8 above). It is defined as:
+
+
+ p = offset + bh->b_data;
+
+
+
+
+
+ I need to figure out what bh is, and it just so happens that bh is
+ passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a
+ few lines later, so I do a little disassembly:
+
+
+
+
+ (gdb) disas 0x1001c2bf 0x1001c2e0
+ Dump of assembler code from 0x1001c2bf to 0x1001c2d0:
+ 0x1001c2bf <block_write+1103>: addl %eax,0xc(%ebp)
+ 0x1001c2c2 <block_write+1106>: movl 0xfffffdd4(%ebp),%edx
+ 0x1001c2c8 <block_write+1112>: btsl $0x0,0x18(%edx)
+ 0x1001c2cd <block_write+1117>: btsl $0x1,0x18(%edx)
+ 0x1001c2d2 <block_write+1122>: sbbl %ecx,%ecx
+ 0x1001c2d4 <block_write+1124>: testl %ecx,%ecx
+ 0x1001c2d6 <block_write+1126>: jne 0x1001c2e3 <block_write+1139>
+ 0x1001c2d8 <block_write+1128>: pushl $0x0
+ 0x1001c2da <block_write+1130>: pushl %edx
+ 0x1001c2db <block_write+1131>: call 0x1001819c <__mark_buffer_dirty>
+ End of assembler dump.
+
+
+
+
+
+ At that point, bh is in %edx (address 0x1001c2da), which is calculated
+ at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is,
+ taking %ebp from the sigcontext_struct above:
+
+
+ (gdb) p (void *)1342631484
+ $5 = (void *) 0x5006ee3c
+ (gdb) p 0x5006ee3c+0xfffffdd4
+ $6 = 1342630928
+ (gdb) p (void *)$6
+ $7 = (void *) 0x5006ec10
+ (gdb) p *((void **)$7)
+ $8 = (void *) 0x50100200
+
+
+
+
+
+ Now, I look at the structure to see what's in it, and particularly,
+ what its b_data field contains:
+
+
+ (gdb) p *((struct buffer_head *)0x50100200)
+ $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0,
+ b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24,
+ b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260,
+ b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58,
+ b_data = 0x50000800 "", b_page = 0x50004000,
+ b_end_io = 0x10017f60 <end_buffer_io_sync>, b_dev_id = 0x0,
+ b_rsector = 98810, b_wait = {lock = <optimized out or zero length>,
+ task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448,
+ __creator = 0}, b_kiobuf = 0x0}
+
+
+
+
+
+ The b_data field is indeed 0x50000800, so the question becomes how
+ that happened. The rest of the structure looks fine, so this probably
+ is not a case of data corruption. It happened on purpose somehow.
+
+
+ The b_page field is a pointer to the page_struct representing the
+ 0x50000000 page. Looking at it shows the kernel's idea of the state
+ of that page:
+
+
+
+ (gdb) p *$13.b_page
+ $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0,
+ index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = {
+ next = 0x50008460, prev = 0x50019350}, wait = {
+ lock = <optimized out or zero length>, task_list = {next = 0x50004024,
+ prev = 0x50004024}, __magic = 1342193708, __creator = 0},
+ pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280,
+ zone = 0x100c5160}
+
+
+
+
+
+ Some sanity-checking: the virtual field shows the "virtual" address of
+ this page, which in this kernel is the same as its "physical" address,
+ and the page_struct itself should be mem_map[0], since it represents
+ the first page of memory:
+
+
+
+ (gdb) p (void *)1342177280
+ $18 = (void *) 0x50000000
+ (gdb) p mem_map
+ $19 = (mem_map_t *) 0x50004000
+
+
+
+
+
+ These check out fine.
+
+
+ Now to check out the page_struct itself. In particular, the flags
+ field shows whether the page is considered free or not:
+
+
+ (gdb) p (void *)132
+ $21 = (void *) 0x84
+
+
+
+
+
+ The "reserved" bit is the high bit, which is definitely not set, so
+ the kernel considers the signal stack page to be free and available to
+ be used.
+
+
+ At this point, I jump to conclusions and start looking at my early
+ boot code, because that's where that page is supposed to be reserved.
+
+
+ In my setup_arch procedure, I have the following code which looks just
+ fine:
+
+
+
+ bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn);
+ free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem);
+
+
+
+
+
+ Two stack pages have already been allocated, and low_physmem points to
+ the third page, which is the beginning of free memory.
+ The init_bootmem call declares the entire memory to the boot memory
+ manager, which marks it all reserved. The free_bootmem call frees up
+ all of it, except for the first two pages. This looks correct to me.
+
+
+ So, I decide to see init_bootmem run and make sure that it is marking
+ those first two pages as reserved. I never get that far.
+
+
+ Stepping into init_bootmem, and looking at bootmem_map before looking
+ at what it contains shows the following:
+
+
+
+ (gdb) p bootmem_map
+ $3 = (void *) 0x50000000
+
+
+
+
+
+ Aha! The light dawns. That first page is doing double duty as a
+ stack and as the boot memory map. The last thing that the boot memory
+ manager does is to free the pages used by its memory map, so this page
+ is getting freed even its marked as reserved.
+
+
+ The fix was to initialize the boot memory manager before allocating
+ those two stack pages, and then allocate them through the boot memory
+ manager. After doing this, and fixing a couple of subsequent buglets,
+ the stack corruption problem disappeared.
+
+
+
+
+
+ 1133.. WWhhaatt ttoo ddoo wwhheenn UUMMLL ddooeessnn''tt wwoorrkk
+
+
+
+
+ 1133..11.. SSttrraannggee ccoommppiillaattiioonn eerrrroorrss wwhheenn yyoouu bbuuiilldd ffrroomm ssoouurrccee
+
+ As of test11, it is necessary to have "ARCH=um" in the environment or
+ on the make command line for all steps in building UML, including
+ clean, distclean, or mrproper, config, menuconfig, or xconfig, dep,
+ and linux. If you forget for any of them, the i386 build seems to
+ contaminate the UML build. If this happens, start from scratch with
+
+
+ host%
+ make mrproper ARCH=um
+
+
+
+
+ and repeat the build process with ARCH=um on all the steps.
+
+
+ See ``Compiling the kernel and modules'' for more details.
+
+
+ Another cause of strange compilation errors is building UML in
+ /usr/src/linux. If you do this, the first thing you need to do is
+ clean up the mess you made. The /usr/src/linux/asm link will now
+ point to /usr/src/linux/asm-um. Make it point back to
+ /usr/src/linux/asm-i386. Then, move your UML pool someplace else and
+ build it there. Also see below, where a more specific set of symptoms
+ is described.
+
+
+
+ 1133..33.. AA vvaarriieettyy ooff ppaanniiccss aanndd hhaannggss wwiitthh //ttmmpp oonn aa rreeiisseerrffss ffiilleessyyss--
+ tteemm
+
+ I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27.
+ Panics preceded by
+
+
+ Detaching pid nnnn
+
+
+
+ are diagnostic of this problem. This is a reiserfs bug which causes a
+ thread to occasionally read stale data from a mmapped page shared with
+ another thread. The fix is to upgrade the filesystem or to have /tmp
+ be an ext2 filesystem.
+
+
+
+ 1133..44.. TThhee ccoommppiillee ffaaiillss wwiitthh eerrrroorrss aabboouutt ccoonnfflliiccttiinngg ttyyppeess ffoorr
+ ''ooppeenn'',, ''dduupp'',, aanndd ''wwaaiittppiidd''
+
+ This happens when you build in /usr/src/linux. The UML build makes
+ the include/asm link point to include/asm-um. /usr/include/asm points
+ to /usr/src/linux/include/asm, so when that link gets moved, files
+ which need to include the asm-i386 versions of headers get the
+ incompatible asm-um versions. The fix is to move the include/asm link
+ back to include/asm-i386 and to do UML builds someplace else.
+
+
+
+ 1133..55.. UUMMLL ddooeessnn''tt wwoorrkk wwhheenn //ttmmpp iiss aann NNFFSS ffiilleessyysstteemm
+
+ This seems to be a similar situation with the ReiserFS problem above.
+ Some versions of NFS seems not to handle mmap correctly, which UML
+ depends on. The workaround is have /tmp be a non-NFS directory.
+
+
+ 1133..66.. UUMMLL hhaannggss oonn bboooott wwhheenn ccoommppiilleedd wwiitthh ggpprrooff ssuuppppoorrtt
+
+ If you build UML with gprof support and, early in the boot, it does
+ this
+
+
+ kernel BUG at page_alloc.c:100!
+
+
+
+
+ you have a buggy gcc. You can work around the problem by removing
+ UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up
+ another bug, but that one is fairly hard to reproduce.
+
+
+
+ 1133..77.. ssyyssllooggdd ddiieess wwiitthh aa SSIIGGTTEERRMM oonn ssttaarrttuupp
+
+ The exact boot error depends on the distribution that you're booting,
+ but Debian produces this:
+
+
+ /etc/rc2.d/S10sysklogd: line 49: 93 Terminated
+ start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD
+
+
+
+
+ This is a syslogd bug. There's a race between a parent process
+ installing a signal handler and its child sending the signal. See
+ this uml-devel post <http://www.geocrawler.com/lists/3/Source-
+ Forge/709/0/6612801> for the details.
+
+
+
+ 1133..88.. TTUUNN//TTAAPP nneettwwoorrkkiinngg ddooeessnn''tt wwoorrkk oonn aa 22..44 hhoosstt
+
+ There are a couple of problems which were
+ <http://www.geocrawler.com/lists/3/SourceForge/597/0/> name="pointed
+ out"> by Tim Robinson <timro at trkr dot net>
+
+ +o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier.
+ The fix is to upgrade to something more recent and then read the
+ next item.
+
+ +o If you see
+
+
+ File descriptor in bad state
+
+
+
+ when you bring up the device inside UML, you have a header mismatch
+ between the original kernel and the upgraded one. Make /usr/src/linux
+ point at the new headers. This will only be a problem if you build
+ uml_net yourself.
+
+
+
+ 1133..99.. YYoouu ccaann nneettwwoorrkk ttoo tthhee hhoosstt bbuutt nnoott ttoo ootthheerr mmaacchhiinneess oonn tthhee
+ nneett
+
+ If you can connect to the host, and the host can connect to UML, but
+ you cannot connect to any other machines, then you may need to enable
+ IP Masquerading on the host. Usually this is only experienced when
+ using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML
+ networking, rather than the public address space that your host is
+ connected to. UML does not enable IP Masquerading, so you will need
+ to create a static rule to enable it:
+
+
+ host%
+ iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE
+
+
+
+
+ Replace eth0 with the interface that you use to talk to the rest of
+ the world.
+
+
+ Documentation on IP Masquerading, and SNAT, can be found at
+ www.netfilter.org <http://www.netfilter.org> .
+
+
+ If you can reach the local net, but not the outside Internet, then
+ that is usually a routing problem. The UML needs a default route:
+
+
+ UML#
+ route add default gw gateway IP
+
+
+
+
+ The gateway IP can be any machine on the local net that knows how to
+ reach the outside world. Usually, this is the host or the local net-
+ work's gateway.
+
+
+ Occasionally, we hear from someone who can reach some machines, but
+ not others on the same net, or who can reach some ports on other
+ machines, but not others. These are usually caused by strange
+ firewalling somewhere between the UML and the other box. You track
+ this down by running tcpdump on every interface the packets travel
+ over and see where they disappear. When you find a machine that takes
+ the packets in, but does not send them onward, that's the culprit.
+
+
+
+ 1133..1100.. II hhaavvee nnoo rroooott aanndd II wwaanntt ttoo ssccrreeaamm
+
+ Thanks to Birgit Wahlich for telling me about this strange one. It
+ turns out that there's a limit of six environment variables on the
+ kernel command line. When that limit is reached or exceeded, argument
+ processing stops, which means that the 'root=' argument that UML
+ usually adds is not seen. So, the filesystem has no idea what the
+ root device is, so it panics.
+
+
+ The fix is to put less stuff on the command line. Glomming all your
+ setup variables into one is probably the best way to go.
+
+
+
+ 1133..1111.. UUMMLL bbuuiilldd ccoonnfflliicctt bbeettwweeeenn ppttrraaccee..hh aanndd uuccoonntteexxtt..hh
+
+ On some older systems, /usr/include/asm/ptrace.h and
+ /usr/include/sys/ucontext.h define the same names. So, when they're
+ included together, the defines from one completely mess up the parsing
+ of the other, producing errors like:
+ /usr/include/sys/ucontext.h:47: parse error before
+ `10'
+
+
+
+
+ plus a pile of warnings.
+
+
+ This is a libc botch, which has since been fixed, and I don't see any
+ way around it besides upgrading.
+
+
+
+ 1133..1122.. TThhee UUMMLL BBooggooMMiippss iiss eexxaaccttllyy hhaallff tthhee hhoosstt''ss BBooggooMMiippss
+
+ On i386 kernels, there are two ways of running the loop that is used
+ to calculate the BogoMips rating, using the TSC if it's there or using
+ a one-instruction loop. The TSC produces twice the BogoMips as the
+ loop. UML uses the loop, since it has nothing resembling a TSC, and
+ will get almost exactly the same BogoMips as a host using the loop.
+ However, on a host with a TSC, its BogoMips will be double the loop
+ BogoMips, and therefore double the UML BogoMips.
+
+
+
+ 1133..1133.. WWhheenn yyoouu rruunn UUMMLL,, iitt iimmmmeeddiiaatteellyy sseeggffaauullttss
+
+ If the host is configured with the 2G/2G address space split, that's
+ why. See ``UML on 2G/2G hosts'' for the details on getting UML to
+ run on your host.
+
+
+
+ 1133..1144.. xxtteerrmmss aappppeeaarr,, tthheenn iimmmmeeddiiaatteellyy ddiissaappppeeaarr
+
+ If you're running an up to date kernel with an old release of
+ uml_utilities, the port-helper program will not work properly, so
+ xterms will exit straight after they appear. The solution is to
+ upgrade to the latest release of uml_utilities. Usually this problem
+ occurs when you have installed a packaged release of UML then compiled
+ your own development kernel without upgrading the uml_utilities from
+ the source distribution.
+
+
+
+ 1133..1155.. AAnnyy ootthheerr ppaanniicc,, hhaanngg,, oorr ssttrraannggee bbeehhaavviioorr
+
+ If you're seeing truly strange behavior, such as hangs or panics that
+ happen in random places, or you try running the debugger to see what's
+ happening and it acts strangely, then it could be a problem in the
+ host kernel. If you're not running a stock Linus or -ac kernel, then
+ try that. An early version of the preemption patch and a 2.4.10 SuSE
+ kernel have caused very strange problems in UML.
+
+
+ Otherwise, let me know about it. Send a message to one of the UML
+ mailing lists - either the developer list - user-mode-linux-devel at
+ lists dot sourceforge dot net (subscription info) or the user list -
+ user-mode-linux-user at lists dot sourceforge do net (subscription
+ info), whichever you prefer. Don't assume that everyone knows about
+ it and that a fix is imminent.
+
+
+ If you want to be super-helpful, read ``Diagnosing Problems'' and
+ follow the instructions contained therein.
+ 1144.. DDiiaaggnnoossiinngg PPrroobblleemmss
+
+
+ If you get UML to crash, hang, or otherwise misbehave, you should
+ report this on one of the project mailing lists, either the developer
+ list - user-mode-linux-devel at lists dot sourceforge dot net
+ (subscription info) or the user list - user-mode-linux-user at lists
+ dot sourceforge dot net (subscription info). When you do, it is
+ likely that I will want more information. So, it would be helpful to
+ read the stuff below, do whatever is applicable in your case, and
+ report the results to the list.
+
+
+ For any diagnosis, you're going to need to build a debugging kernel.
+ The binaries from this site aren't debuggable. If you haven't done
+ this before, read about ``Compiling the kernel and modules'' and
+ ``Kernel debugging'' UML first.
+
+
+ 1144..11.. CCaassee 11 :: NNoorrmmaall kkeerrnneell ppaanniiccss
+
+ The most common case is for a normal thread to panic. To debug this,
+ you will need to run it under the debugger (add 'debug' to the command
+ line). An xterm will start up with gdb running inside it. Continue
+ it when it stops in start_kernel and make it crash. Now ^C gdb and
+
+
+ If the panic was a "Kernel mode fault", then there will be a segv
+ frame on the stack and I'm going to want some more information. The
+ stack might look something like this:
+
+
+ (UML gdb) backtrace
+ #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0)
+ at ../sysdeps/unix/sysv/linux/sigprocmask.c:49
+ #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218
+ #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32
+ #3 0x1009bf38 in __restore ()
+ at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125
+ #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0)
+ at trap_kern.c:66
+ #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285
+ #6 0x1009bf38 in __restore ()
+
+
+
+
+ I'm going to want to see the symbol and line information for the value
+ of ip in the segv frame. In this case, you would do the following:
+
+
+ (UML gdb) i sym 268849158
+
+
+
+
+ and
+
+
+ (UML gdb) i line *268849158
+
+
+
+
+ The reason for this is the __restore frame right above the segv_han-
+ dler frame is hiding the frame that actually segfaulted. So, I have
+ to get that information from the faulting ip.
+
+
+ 1144..22.. CCaassee 22 :: TTrraacciinngg tthhrreeaadd ppaanniiccss
+
+ The less common and more painful case is when the tracing thread
+ panics. In this case, the kernel debugger will be useless because it
+ needs a healthy tracing thread in order to work. The first thing to
+ do is get a backtrace from the tracing thread. This is done by
+ figuring out what its pid is, firing up gdb, and attaching it to that
+ pid. You can figure out the tracing thread pid by looking at the
+ first line of the console output, which will look like this:
+
+
+ tracing thread pid = 15851
+
+
+
+
+ or by running ps on the host and finding the line that looks like
+ this:
+
+
+ jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)]
+
+
+
+
+ If the panic was 'segfault in signals', then follow the instructions
+ above for collecting information about the location of the seg fault.
+
+
+ If the tracing thread flaked out all by itself, then send that
+ backtrace in and wait for our crack debugging team to fix the problem.
+
+
+ 1144..33.. CCaassee 33 :: TTrraacciinngg tthhrreeaadd ppaanniiccss ccaauusseedd bbyy ootthheerr tthhrreeaaddss
+
+ However, there are cases where the misbehavior of another thread
+ caused the problem. The most common panic of this type is:
+
+
+ wait_for_stop failed to wait for <pid> to stop with <signal number>
+
+
+
+
+ In this case, you'll need to get a backtrace from the process men-
+ tioned in the panic, which is complicated by the fact that the kernel
+ debugger is defunct and without some fancy footwork, another gdb can't
+ attach to it. So, this is how the fancy footwork goes:
+
+ In a shell:
+
+
+ host% kill -STOP pid
+
+
+
+
+ Run gdb on the tracing thread as described in case 2 and do:
+
+
+ (host gdb) call detach(pid)
+
+
+ If you get a segfault, do it again. It always works the second time.
+
+ Detach from the tracing thread and attach to that other thread:
+
+
+ (host gdb) detach
+
+
+
+
+
+
+ (host gdb) attach pid
+
+
+
+
+ If gdb hangs when attaching to that process, go back to a shell and
+ do:
+
+
+ host%
+ kill -CONT pid
+
+
+
+
+ And then get the backtrace:
+
+
+ (host gdb) backtrace
+
+
+
+
+
+ 1144..44.. CCaassee 44 :: HHaannggss
+
+ Hangs seem to be fairly rare, but they sometimes happen. When a hang
+ happens, we need a backtrace from the offending process. Run the
+ kernel debugger as described in case 1 and get a backtrace. If the
+ current process is not the idle thread, then send in the backtrace.
+ You can tell that it's the idle thread if the stack looks like this:
+
+
+ #0 0x100b1401 in __libc_nanosleep ()
+ #1 0x100a2885 in idle_sleep (secs=10) at time.c:122
+ #2 0x100a546f in do_idle () at process_kern.c:445
+ #3 0x100a5508 in cpu_idle () at process_kern.c:471
+ #4 0x100ec18f in start_kernel () at init/main.c:592
+ #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71
+ #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50
+
+
+
+
+ If this is the case, then some other process is at fault, and went to
+ sleep when it shouldn't have. Run ps on the host and figure out which
+ process should not have gone to sleep and stayed asleep. Then attach
+ to it with gdb and get a backtrace as described in case 3.
+
+
+
+
+
+
+ 1155.. TThhaannkkss
+
+
+ A number of people have helped this project in various ways, and this
+ page gives recognition where recognition is due.
+
+
+ If you're listed here and you would prefer a real link on your name,
+ or no link at all, instead of the despammed email address pseudo-link,
+ let me know.
+
+
+ If you're not listed here and you think maybe you should be, please
+ let me know that as well. I try to get everyone, but sometimes my
+ bookkeeping lapses and I forget about contributions.
+
+
+ 1155..11.. CCooddee aanndd DDooccuummeennttaattiioonn
+
+ Rusty Russell <rusty at linuxcare.com.au> -
+
+ +o wrote the HOWTO <http://user-mode-
+ linux.sourceforge.net/UserModeLinux-HOWTO.html>
+
+ +o prodded me into making this project official and putting it on
+ SourceForge
+
+ +o came up with the way cool UML logo <http://user-mode-
+ linux.sourceforge.net/uml-small.png>
+
+ +o redid the config process
+
+
+ Peter Moulder <reiter at netspace.net.au> - Fixed my config and build
+ processes, and added some useful code to the block driver
+
+
+ Bill Stearns <wstearns at pobox.com> -
+
+ +o HOWTO updates
+
+ +o lots of bug reports
+
+ +o lots of testing
+
+ +o dedicated a box (uml.ists.dartmouth.edu) to support UML development
+
+ +o wrote the mkrootfs script, which allows bootable filesystems of
+ RPM-based distributions to be cranked out
+
+ +o cranked out a large number of filesystems with said script
+
+
+ Jim Leu <jleu at mindspring.com> - Wrote the virtual ethernet driver
+ and associated usermode tools
+
+ Lars Brinkhoff <http://lars.nocrew.org/> - Contributed the ptrace
+ proxy from his own project <http://a386.nocrew.org/> to allow easier
+ kernel debugging
+
+
+ Andrea Arcangeli <andrea at suse.de> - Redid some of the early boot
+ code so that it would work on machines with Large File Support
+
+
+ Chris Emerson <http://www.chiark.greenend.org.uk/~cemerson/> - Did
+ the first UML port to Linux/ppc
+
+
+ Harald Welte <laforge at gnumonks.org> - Wrote the multicast
+ transport for the network driver
+
+
+ Jorgen Cederlof - Added special file support to hostfs
+
+
+ Greg Lonnon <glonnon at ridgerun dot com> - Changed the ubd driver
+ to allow it to layer a COW file on a shared read-only filesystem and
+ wrote the iomem emulation support
+
+
+ Henrik Nordstrom <http://hem.passagen.se/hno/> - Provided a variety
+ of patches, fixes, and clues
+
+
+ Lennert Buytenhek - Contributed various patches, a rewrite of the
+ network driver, the first implementation of the mconsole driver, and
+ did the bulk of the work needed to get SMP working again.
+
+
+ Yon Uriarte - Fixed the TUN/TAP network backend while I slept.
+
+
+ Adam Heath - Made a bunch of nice cleanups to the initialization code,
+ plus various other small patches.
+
+
+ Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and
+ is doing a real nice job of it. He also noticed and fixed a number of
+ actually and potentially exploitable security holes in uml_net. Plus
+ the occasional patch. I like patches.
+
+
+ James McMechan - James seems to have taken over maintenance of the ubd
+ driver and is doing a nice job of it.
+
+
+ Chandan Kudige - wrote the umlgdb script which automates the reloading
+ of module symbols.
+
+
+ Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers,
+ enabling UML processes to access audio devices on the host. He also
+ submitted patches for the slip transport and lots of other things.
+
+
+ David Coulson <http://davidcoulson.net> -
+
+ +o Set up the usermodelinux.org <http://usermodelinux.org> site,
+ which is a great way of keeping the UML user community on top of
+ UML goings-on.
+
+ +o Site documentation and updates
+
+ +o Nifty little UML management daemon UMLd
+ <http://uml.openconsultancy.com/umld/>
+
+ +o Lots of testing and bug reports
+
+
+
+
+ 1155..22.. FFlluusshhiinngg oouutt bbuuggss
+
+
+
+ +o Yuri Pudgorodsky
+
+ +o Gerald Britton
+
+ +o Ian Wehrman
+
+ +o Gord Lamb
+
+ +o Eugene Koontz
+
+ +o John H. Hartman
+
+ +o Anders Karlsson
+
+ +o Daniel Phillips
+
+ +o John Fremlin
+
+ +o Rainer Burgstaller
+
+ +o James Stevenson
+
+ +o Matt Clay
+
+ +o Cliff Jefferies
+
+ +o Geoff Hoff
+
+ +o Lennert Buytenhek
+
+ +o Al Viro
+
+ +o Frank Klingenhoefer
+
+ +o Livio Baldini Soares
+
+ +o Jon Burgess
+
+ +o Petru Paler
+
+ +o Paul
+
+ +o Chris Reahard
+
+ +o Sverker Nilsson
+
+ +o Gong Su
+
+ +o johan verrept
+
+ +o Bjorn Eriksson
+
+ +o Lorenzo Allegrucci
+
+ +o Muli Ben-Yehuda
+
+ +o David Mansfield
+
+ +o Howard Goff
+
+ +o Mike Anderson
+
+ +o John Byrne
+
+ +o Sapan J. Batia
+
+ +o Iris Huang
+
+ +o Jan Hudec
+
+ +o Voluspa
+
+
+
+
+ 1155..33.. BBuugglleettss aanndd cclleeaann--uuppss
+
+
+
+ +o Dave Zarzycki
+
+ +o Adam Lazur
+
+ +o Boria Feigin
+
+ +o Brian J. Murrell
+
+ +o JS
+
+ +o Roman Zippel
+
+ +o Wil Cooley
+
+ +o Ayelet Shemesh
+
+ +o Will Dyson
+
+ +o Sverker Nilsson
+
+ +o dvorak
+
+ +o v.naga srinivas
+
+ +o Shlomi Fish
+
+ +o Roger Binns
+
+ +o johan verrept
+
+ +o MrChuoi
+
+ +o Peter Cleve
+
+ +o Vincent Guffens
+
+ +o Nathan Scott
+
+ +o Patrick Caulfield
+
+ +o jbearce
+
+ +o Catalin Marinas
+
+ +o Shane Spencer
+
+ +o Zou Min
+
+
+ +o Ryan Boder
+
+ +o Lorenzo Colitti
+
+ +o Gwendal Grignou
+
+ +o Andre' Breiler
+
+ +o Tsutomu Yasuda
+
+
+
+ 1155..44.. CCaassee SSttuuddiieess
+
+
+ +o Jon Wright
+
+ +o William McEwan
+
+ +o Michael Richardson
+
+
+
+ 1155..55.. OOtthheerr ccoonnttrriibbuuttiioonnss
+
+
+ Bill Carr <Bill.Carr at compaq.com> made the Red Hat mkrootfs script
+ work with RH 6.2.
+
+ Michael Jennings <mikejen at hevanet.com> sent in some material which
+ is now gracing the top of the index page <http://user-mode-
+ linux.sourceforge.net/index.html> of this site.
+
+ SGI <http://www.sgi.com> (and more specifically Ralf Baechle <ralf at
+ uni-koblenz.de> ) gave me an account on oss.sgi.com
+ <http://www.oss.sgi.com> . The bandwidth there made it possible to
+ produce most of the filesystems available on the project download
+ page.
+
+ Laurent Bonnaud <Laurent.Bonnaud at inpg.fr> took the old grotty
+ Debian filesystem that I've been distributing and updated it to 2.2.
+ It is now available by itself here.
+
+ Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make
+ releases even when Sourceforge is broken.
+
+ Rodrigo de Castro looked at my broken pte code and told me what was
+ wrong with it, letting me fix a long-standing (several weeks) and
+ serious set of bugs.
+
+ Chris Reahard built a specialized root filesystem for running a DNS
+ server jailed inside UML. It's available from the download
+ <http://user-mode-linux.sourceforge.net/dl-sf.html> page in the Jail
+ Filesystems section.
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/Documentation/unaligned-memory-access.txt b/Documentation/unaligned-memory-access.txt
new file mode 100644
index 0000000..f866c72
--- /dev/null
+++ b/Documentation/unaligned-memory-access.txt
@@ -0,0 +1,252 @@
+UNALIGNED MEMORY ACCESSES
+=========================
+
+Linux runs on a wide variety of architectures which have varying behaviour
+when it comes to memory access. This document presents some details about
+unaligned accesses, why you need to write code that doesn't cause them,
+and how to write such code!
+
+
+The definition of an unaligned access
+=====================================
+
+Unaligned memory accesses occur when you try to read N bytes of data starting
+from an address that is not evenly divisible by N (i.e. addr % N != 0).
+For example, reading 4 bytes of data from address 0x10004 is fine, but
+reading 4 bytes of data from address 0x10005 would be an unaligned memory
+access.
+
+The above may seem a little vague, as memory access can happen in different
+ways. The context here is at the machine code level: certain instructions read
+or write a number of bytes to or from memory (e.g. movb, movw, movl in x86
+assembly). As will become clear, it is relatively easy to spot C statements
+which will compile to multiple-byte memory access instructions, namely when
+dealing with types such as u16, u32 and u64.
+
+
+Natural alignment
+=================
+
+The rule mentioned above forms what we refer to as natural alignment:
+When accessing N bytes of memory, the base memory address must be evenly
+divisible by N, i.e. addr % N == 0.
+
+When writing code, assume the target architecture has natural alignment
+requirements.
+
+In reality, only a few architectures require natural alignment on all sizes
+of memory access. However, we must consider ALL supported architectures;
+writing code that satisfies natural alignment requirements is the easiest way
+to achieve full portability.
+
+
+Why unaligned access is bad
+===========================
+
+The effects of performing an unaligned memory access vary from architecture
+to architecture. It would be easy to write a whole document on the differences
+here; a summary of the common scenarios is presented below:
+
+ - Some architectures are able to perform unaligned memory accesses
+ transparently, but there is usually a significant performance cost.
+ - Some architectures raise processor exceptions when unaligned accesses
+ happen. The exception handler is able to correct the unaligned access,
+ at significant cost to performance.
+ - Some architectures raise processor exceptions when unaligned accesses
+ happen, but the exceptions do not contain enough information for the
+ unaligned access to be corrected.
+ - Some architectures are not capable of unaligned memory access, but will
+ silently perform a different memory access to the one that was requested,
+ resulting in a subtle code bug that is hard to detect!
+
+It should be obvious from the above that if your code causes unaligned
+memory accesses to happen, your code will not work correctly on certain
+platforms and will cause performance problems on others.
+
+
+Code that does not cause unaligned access
+=========================================
+
+At first, the concepts above may seem a little hard to relate to actual
+coding practice. After all, you don't have a great deal of control over
+memory addresses of certain variables, etc.
+
+Fortunately things are not too complex, as in most cases, the compiler
+ensures that things will work for you. For example, take the following
+structure:
+
+ struct foo {
+ u16 field1;
+ u32 field2;
+ u8 field3;
+ };
+
+Let us assume that an instance of the above structure resides in memory
+starting at address 0x10000. With a basic level of understanding, it would
+not be unreasonable to expect that accessing field2 would cause an unaligned
+access. You'd be expecting field2 to be located at offset 2 bytes into the
+structure, i.e. address 0x10002, but that address is not evenly divisible
+by 4 (remember, we're reading a 4 byte value here).
+
+Fortunately, the compiler understands the alignment constraints, so in the
+above case it would insert 2 bytes of padding in between field1 and field2.
+Therefore, for standard structure types you can always rely on the compiler
+to pad structures so that accesses to fields are suitably aligned (assuming
+you do not cast the field to a type of different length).
+
+Similarly, you can also rely on the compiler to align variables and function
+parameters to a naturally aligned scheme, based on the size of the type of
+the variable.
+
+At this point, it should be clear that accessing a single byte (u8 or char)
+will never cause an unaligned access, because all memory addresses are evenly
+divisible by one.
+
+On a related topic, with the above considerations in mind you may observe
+that you could reorder the fields in the structure in order to place fields
+where padding would otherwise be inserted, and hence reduce the overall
+resident memory size of structure instances. The optimal layout of the
+above example is:
+
+ struct foo {
+ u32 field2;
+ u16 field1;
+ u8 field3;
+ };
+
+For a natural alignment scheme, the compiler would only have to add a single
+byte of padding at the end of the structure. This padding is added in order
+to satisfy alignment constraints for arrays of these structures.
+
+Another point worth mentioning is the use of __attribute__((packed)) on a
+structure type. This GCC-specific attribute tells the compiler never to
+insert any padding within structures, useful when you want to use a C struct
+to represent some data that comes in a fixed arrangement 'off the wire'.
+
+You might be inclined to believe that usage of this attribute can easily
+lead to unaligned accesses when accessing fields that do not satisfy
+architectural alignment requirements. However, again, the compiler is aware
+of the alignment constraints and will generate extra instructions to perform
+the memory access in a way that does not cause unaligned access. Of course,
+the extra instructions obviously cause a loss in performance compared to the
+non-packed case, so the packed attribute should only be used when avoiding
+structure padding is of importance.
+
+
+Code that causes unaligned access
+=================================
+
+With the above in mind, let's move onto a real life example of a function
+that can cause an unaligned memory access. The following function adapted
+from include/linux/etherdevice.h is an optimized routine to compare two
+ethernet MAC addresses for equality.
+
+unsigned int compare_ether_addr(const u8 *addr1, const u8 *addr2)
+{
+ const u16 *a = (const u16 *) addr1;
+ const u16 *b = (const u16 *) addr2;
+ return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) != 0;
+}
+
+In the above function, the reference to a[0] causes 2 bytes (16 bits) to
+be read from memory starting at address addr1. Think about what would happen
+if addr1 was an odd address such as 0x10003. (Hint: it'd be an unaligned
+access.)
+
+Despite the potential unaligned access problems with the above function, it
+is included in the kernel anyway but is understood to only work on
+16-bit-aligned addresses. It is up to the caller to ensure this alignment or
+not use this function at all. This alignment-unsafe function is still useful
+as it is a decent optimization for the cases when you can ensure alignment,
+which is true almost all of the time in ethernet networking context.
+
+
+Here is another example of some code that could cause unaligned accesses:
+ void myfunc(u8 *data, u32 value)
+ {
+ [...]
+ *((u32 *) data) = cpu_to_le32(value);
+ [...]
+ }
+
+This code will cause unaligned accesses every time the data parameter points
+to an address that is not evenly divisible by 4.
+
+In summary, the 2 main scenarios where you may run into unaligned access
+problems involve:
+ 1. Casting variables to types of different lengths
+ 2. Pointer arithmetic followed by access to at least 2 bytes of data
+
+
+Avoiding unaligned accesses
+===========================
+
+The easiest way to avoid unaligned access is to use the get_unaligned() and
+put_unaligned() macros provided by the <asm/unaligned.h> header file.
+
+Going back to an earlier example of code that potentially causes unaligned
+access:
+
+ void myfunc(u8 *data, u32 value)
+ {
+ [...]
+ *((u32 *) data) = cpu_to_le32(value);
+ [...]
+ }
+
+To avoid the unaligned memory access, you would rewrite it as follows:
+
+ void myfunc(u8 *data, u32 value)
+ {
+ [...]
+ value = cpu_to_le32(value);
+ put_unaligned(value, (u32 *) data);
+ [...]
+ }
+
+The get_unaligned() macro works similarly. Assuming 'data' is a pointer to
+memory and you wish to avoid unaligned access, its usage is as follows:
+
+ u32 value = get_unaligned((u32 *) data);
+
+These macros work for memory accesses of any length (not just 32 bits as
+in the examples above). Be aware that when compared to standard access of
+aligned memory, using these macros to access unaligned memory can be costly in
+terms of performance.
+
+If use of such macros is not convenient, another option is to use memcpy(),
+where the source or destination (or both) are of type u8* or unsigned char*.
+Due to the byte-wise nature of this operation, unaligned accesses are avoided.
+
+
+Alignment vs. Networking
+========================
+
+On architectures that require aligned loads, networking requires that the IP
+header is aligned on a four-byte boundary to optimise the IP stack. For
+regular ethernet hardware, the constant NET_IP_ALIGN is used. On most
+architectures this constant has the value 2 because the normal ethernet
+header is 14 bytes long, so in order to get proper alignment one needs to
+DMA to an address which can be expressed as 4*n + 2. One notable exception
+here is powerpc which defines NET_IP_ALIGN to 0 because DMA to unaligned
+addresses can be very expensive and dwarf the cost of unaligned loads.
+
+For some ethernet hardware that cannot DMA to unaligned addresses like
+4*n+2 or non-ethernet hardware, this can be a problem, and it is then
+required to copy the incoming frame into an aligned buffer. Because this is
+unnecessary on architectures that can do unaligned accesses, the code can be
+made dependent on CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS like so:
+
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ skb = original skb
+#else
+ skb = copy skb
+#endif
+
+--
+Authors: Daniel Drake <dsd@gentoo.org>,
+ Johannes Berg <johannes@sipsolutions.net>
+With help from: Alan Cox, Avuton Olrich, Heikki Orsila, Jan Engelhardt,
+Kyle McMartin, Kyle Moffett, Randy Dunlap, Robert Hancock, Uli Kunitz,
+Vadim Lobanov
+
diff --git a/Documentation/unicode.txt b/Documentation/unicode.txt
new file mode 100644
index 0000000..4a33f81
--- /dev/null
+++ b/Documentation/unicode.txt
@@ -0,0 +1,175 @@
+ Last update: 2005-01-17, version 1.4
+
+This file is maintained by H. Peter Anvin <unicode@lanana.org> as part
+of the Linux Assigned Names And Numbers Authority (LANANA) project.
+The current version can be found at:
+
+ http://www.lanana.org/docs/unicode/unicode.txt
+
+ ------------------------
+
+The Linux kernel code has been rewritten to use Unicode to map
+characters to fonts. By downloading a single Unicode-to-font table,
+both the eight-bit character sets and UTF-8 mode are changed to use
+the font as indicated.
+
+This changes the semantics of the eight-bit character tables subtly.
+The four character tables are now:
+
+Map symbol Map name Escape code (G0)
+
+LAT1_MAP Latin-1 (ISO 8859-1) ESC ( B
+GRAF_MAP DEC VT100 pseudographics ESC ( 0
+IBMPC_MAP IBM code page 437 ESC ( U
+USER_MAP User defined ESC ( K
+
+In particular, ESC ( U is no longer "straight to font", since the font
+might be completely different than the IBM character set. This
+permits for example the use of block graphics even with a Latin-1 font
+loaded.
+
+Note that although these codes are similar to ISO 2022, neither the
+codes nor their uses match ISO 2022; Linux has two 8-bit codes (G0 and
+G1), whereas ISO 2022 has four 7-bit codes (G0-G3).
+
+In accordance with the Unicode standard/ISO 10646 the range U+F000 to
+U+F8FF has been reserved for OS-wide allocation (the Unicode Standard
+refers to this as a "Corporate Zone", since this is inaccurate for
+Linux we call it the "Linux Zone"). U+F000 was picked as the starting
+point since it lets the direct-mapping area start on a large power of
+two (in case 1024- or 2048-character fonts ever become necessary).
+This leaves U+E000 to U+EFFF as End User Zone.
+
+[v1.2]: The Unicodes range from U+F000 and up to U+F7FF have been
+hard-coded to map directly to the loaded font, bypassing the
+translation table. The user-defined map now defaults to U+F000 to
+U+F0FF, emulating the previous behaviour. In practice, this range
+might be shorter; for example, vgacon can only handle 256-character
+(U+F000..U+F0FF) or 512-character (U+F000..U+F1FF) fonts.
+
+
+Actual characters assigned in the Linux Zone
+--------------------------------------------
+
+In addition, the following characters not present in Unicode 1.1.4
+have been defined; these are used by the DEC VT graphics map. [v1.2]
+THIS USE IS OBSOLETE AND SHOULD NO LONGER BE USED; PLEASE SEE BELOW.
+
+U+F800 DEC VT GRAPHICS HORIZONTAL LINE SCAN 1
+U+F801 DEC VT GRAPHICS HORIZONTAL LINE SCAN 3
+U+F803 DEC VT GRAPHICS HORIZONTAL LINE SCAN 7
+U+F804 DEC VT GRAPHICS HORIZONTAL LINE SCAN 9
+
+The DEC VT220 uses a 6x10 character matrix, and these characters form
+a smooth progression in the DEC VT graphics character set. I have
+omitted the scan 5 line, since it is also used as a block-graphics
+character, and hence has been coded as U+2500 FORMS LIGHT HORIZONTAL.
+
+[v1.3]: These characters have been officially added to Unicode 3.2.0;
+they are added at U+23BA, U+23BB, U+23BC, U+23BD. Linux now uses the
+new values.
+
+[v1.2]: The following characters have been added to represent common
+keyboard symbols that are unlikely to ever be added to Unicode proper
+since they are horribly vendor-specific. This, of course, is an
+excellent example of horrible design.
+
+U+F810 KEYBOARD SYMBOL FLYING FLAG
+U+F811 KEYBOARD SYMBOL PULLDOWN MENU
+U+F812 KEYBOARD SYMBOL OPEN APPLE
+U+F813 KEYBOARD SYMBOL SOLID APPLE
+
+Klingon language support
+------------------------
+
+In 1996, Linux was the first operating system in the world to add
+support for the artificial language Klingon, created by Marc Okrand
+for the "Star Trek" television series. This encoding was later
+adopted by the ConScript Unicode Registry and proposed (but ultimately
+rejected) for inclusion in Unicode Plane 1. Thus, it remains as a
+Linux/CSUR private assignment in the Linux Zone.
+
+This encoding has been endorsed by the Klingon Language Institute.
+For more information, contact them at:
+
+ http://www.kli.org/
+
+Since the characters in the beginning of the Linux CZ have been more
+of the dingbats/symbols/forms type and this is a language, I have
+located it at the end, on a 16-cell boundary in keeping with standard
+Unicode practice.
+
+NOTE: This range is now officially managed by the ConScript Unicode
+Registry. The normative reference is at:
+
+ http://www.evertype.com/standards/csur/klingon.html
+
+Klingon has an alphabet of 26 characters, a positional numeric writing
+system with 10 digits, and is written left-to-right, top-to-bottom.
+
+Several glyph forms for the Klingon alphabet have been proposed.
+However, since the set of symbols appear to be consistent throughout,
+with only the actual shapes being different, in keeping with standard
+Unicode practice these differences are considered font variants.
+
+U+F8D0 KLINGON LETTER A
+U+F8D1 KLINGON LETTER B
+U+F8D2 KLINGON LETTER CH
+U+F8D3 KLINGON LETTER D
+U+F8D4 KLINGON LETTER E
+U+F8D5 KLINGON LETTER GH
+U+F8D6 KLINGON LETTER H
+U+F8D7 KLINGON LETTER I
+U+F8D8 KLINGON LETTER J
+U+F8D9 KLINGON LETTER L
+U+F8DA KLINGON LETTER M
+U+F8DB KLINGON LETTER N
+U+F8DC KLINGON LETTER NG
+U+F8DD KLINGON LETTER O
+U+F8DE KLINGON LETTER P
+U+F8DF KLINGON LETTER Q
+ - Written <q> in standard Okrand Latin transliteration
+U+F8E0 KLINGON LETTER QH
+ - Written <Q> in standard Okrand Latin transliteration
+U+F8E1 KLINGON LETTER R
+U+F8E2 KLINGON LETTER S
+U+F8E3 KLINGON LETTER T
+U+F8E4 KLINGON LETTER TLH
+U+F8E5 KLINGON LETTER U
+U+F8E6 KLINGON LETTER V
+U+F8E7 KLINGON LETTER W
+U+F8E8 KLINGON LETTER Y
+U+F8E9 KLINGON LETTER GLOTTAL STOP
+
+U+F8F0 KLINGON DIGIT ZERO
+U+F8F1 KLINGON DIGIT ONE
+U+F8F2 KLINGON DIGIT TWO
+U+F8F3 KLINGON DIGIT THREE
+U+F8F4 KLINGON DIGIT FOUR
+U+F8F5 KLINGON DIGIT FIVE
+U+F8F6 KLINGON DIGIT SIX
+U+F8F7 KLINGON DIGIT SEVEN
+U+F8F8 KLINGON DIGIT EIGHT
+U+F8F9 KLINGON DIGIT NINE
+
+U+F8FD KLINGON COMMA
+U+F8FE KLINGON FULL STOP
+U+F8FF KLINGON SYMBOL FOR EMPIRE
+
+Other Fictional and Artificial Scripts
+--------------------------------------
+
+Since the assignment of the Klingon Linux Unicode block, a registry of
+fictional and artificial scripts has been established by John Cowan
+<jcowan@reutershealth.com> and Michael Everson <everson@evertype.com>.
+The ConScript Unicode Registry is accessible at:
+
+ http://www.evertype.com/standards/csur/
+
+The ranges used fall at the low end of the End User Zone and can hence
+not be normatively assigned, but it is recommended that people who
+wish to encode fictional scripts use these codes, in the interest of
+interoperability. For Klingon, CSUR has adopted the Linux encoding.
+The CSUR people are driving adding Tengwar and Cirth into Unicode
+Plane 1; the addition of Klingon to Unicode Plane 1 has been rejected
+and so the above encoding remains official.
diff --git a/Documentation/unshare.txt b/Documentation/unshare.txt
new file mode 100644
index 0000000..a864351
--- /dev/null
+++ b/Documentation/unshare.txt
@@ -0,0 +1,295 @@
+
+unshare system call:
+--------------------
+This document describes the new system call, unshare. The document
+provides an overview of the feature, why it is needed, how it can
+be used, its interface specification, design, implementation and
+how it can be tested.
+
+Change Log:
+-----------
+version 0.1 Initial document, Janak Desai (janak@us.ibm.com), Jan 11, 2006
+
+Contents:
+---------
+ 1) Overview
+ 2) Benefits
+ 3) Cost
+ 4) Requirements
+ 5) Functional Specification
+ 6) High Level Design
+ 7) Low Level Design
+ 8) Test Specification
+ 9) Future Work
+
+1) Overview
+-----------
+Most legacy operating system kernels support an abstraction of threads
+as multiple execution contexts within a process. These kernels provide
+special resources and mechanisms to maintain these "threads". The Linux
+kernel, in a clever and simple manner, does not make distinction
+between processes and "threads". The kernel allows processes to share
+resources and thus they can achieve legacy "threads" behavior without
+requiring additional data structures and mechanisms in the kernel. The
+power of implementing threads in this manner comes not only from
+its simplicity but also from allowing application programmers to work
+outside the confinement of all-or-nothing shared resources of legacy
+threads. On Linux, at the time of thread creation using the clone system
+call, applications can selectively choose which resources to share
+between threads.
+
+unshare system call adds a primitive to the Linux thread model that
+allows threads to selectively 'unshare' any resources that were being
+shared at the time of their creation. unshare was conceptualized by
+Al Viro in the August of 2000, on the Linux-Kernel mailing list, as part
+of the discussion on POSIX threads on Linux. unshare augments the
+usefulness of Linux threads for applications that would like to control
+shared resources without creating a new process. unshare is a natural
+addition to the set of available primitives on Linux that implement
+the concept of process/thread as a virtual machine.
+
+2) Benefits
+-----------
+unshare would be useful to large application frameworks such as PAM
+where creating a new process to control sharing/unsharing of process
+resources is not possible. Since namespaces are shared by default
+when creating a new process using fork or clone, unshare can benefit
+even non-threaded applications if they have a need to disassociate
+from default shared namespace. The following lists two use-cases
+where unshare can be used.
+
+2.1 Per-security context namespaces
+-----------------------------------
+unshare can be used to implement polyinstantiated directories using
+the kernel's per-process namespace mechanism. Polyinstantiated directories,
+such as per-user and/or per-security context instance of /tmp, /var/tmp or
+per-security context instance of a user's home directory, isolate user
+processes when working with these directories. Using unshare, a PAM
+module can easily setup a private namespace for a user at login.
+Polyinstantiated directories are required for Common Criteria certification
+with Labeled System Protection Profile, however, with the availability
+of shared-tree feature in the Linux kernel, even regular Linux systems
+can benefit from setting up private namespaces at login and
+polyinstantiating /tmp, /var/tmp and other directories deemed
+appropriate by system administrators.
+
+2.2 unsharing of virtual memory and/or open files
+-------------------------------------------------
+Consider a client/server application where the server is processing
+client requests by creating processes that share resources such as
+virtual memory and open files. Without unshare, the server has to
+decide what needs to be shared at the time of creating the process
+which services the request. unshare allows the server an ability to
+disassociate parts of the context during the servicing of the
+request. For large and complex middleware application frameworks, this
+ability to unshare after the process was created can be very
+useful.
+
+3) Cost
+-------
+In order to not duplicate code and to handle the fact that unshare
+works on an active task (as opposed to clone/fork working on a newly
+allocated inactive task) unshare had to make minor reorganizational
+changes to copy_* functions utilized by clone/fork system call.
+There is a cost associated with altering existing, well tested and
+stable code to implement a new feature that may not get exercised
+extensively in the beginning. However, with proper design and code
+review of the changes and creation of an unshare test for the LTP
+the benefits of this new feature can exceed its cost.
+
+4) Requirements
+---------------
+unshare reverses sharing that was done using clone(2) system call,
+so unshare should have a similar interface as clone(2). That is,
+since flags in clone(int flags, void *stack) specifies what should
+be shared, similar flags in unshare(int flags) should specify
+what should be unshared. Unfortunately, this may appear to invert
+the meaning of the flags from the way they are used in clone(2).
+However, there was no easy solution that was less confusing and that
+allowed incremental context unsharing in future without an ABI change.
+
+unshare interface should accommodate possible future addition of
+new context flags without requiring a rebuild of old applications.
+If and when new context flags are added, unshare design should allow
+incremental unsharing of those resources on an as needed basis.
+
+5) Functional Specification
+---------------------------
+NAME
+ unshare - disassociate parts of the process execution context
+
+SYNOPSIS
+ #include <sched.h>
+
+ int unshare(int flags);
+
+DESCRIPTION
+ unshare allows a process to disassociate parts of its execution
+ context that are currently being shared with other processes. Part
+ of execution context, such as the namespace, is shared by default
+ when a new process is created using fork(2), while other parts,
+ such as the virtual memory, open file descriptors, etc, may be
+ shared by explicit request to share them when creating a process
+ using clone(2).
+
+ The main use of unshare is to allow a process to control its
+ shared execution context without creating a new process.
+
+ The flags argument specifies one or bitwise-or'ed of several of
+ the following constants.
+
+ CLONE_FS
+ If CLONE_FS is set, file system information of the caller
+ is disassociated from the shared file system information.
+
+ CLONE_FILES
+ If CLONE_FILES is set, the file descriptor table of the
+ caller is disassociated from the shared file descriptor
+ table.
+
+ CLONE_NEWNS
+ If CLONE_NEWNS is set, the namespace of the caller is
+ disassociated from the shared namespace.
+
+ CLONE_VM
+ If CLONE_VM is set, the virtual memory of the caller is
+ disassociated from the shared virtual memory.
+
+RETURN VALUE
+ On success, zero returned. On failure, -1 is returned and errno is
+
+ERRORS
+ EPERM CLONE_NEWNS was specified by a non-root process (process
+ without CAP_SYS_ADMIN).
+
+ ENOMEM Cannot allocate sufficient memory to copy parts of caller's
+ context that need to be unshared.
+
+ EINVAL Invalid flag was specified as an argument.
+
+CONFORMING TO
+ The unshare() call is Linux-specific and should not be used
+ in programs intended to be portable.
+
+SEE ALSO
+ clone(2), fork(2)
+
+6) High Level Design
+--------------------
+Depending on the flags argument, the unshare system call allocates
+appropriate process context structures, populates it with values from
+the current shared version, associates newly duplicated structures
+with the current task structure and releases corresponding shared
+versions. Helper functions of clone (copy_*) could not be used
+directly by unshare because of the following two reasons.
+ 1) clone operates on a newly allocated not-yet-active task
+ structure, where as unshare operates on the current active
+ task. Therefore unshare has to take appropriate task_lock()
+ before associating newly duplicated context structures
+ 2) unshare has to allocate and duplicate all context structures
+ that are being unshared, before associating them with the
+ current task and releasing older shared structures. Failure
+ do so will create race conditions and/or oops when trying
+ to backout due to an error. Consider the case of unsharing
+ both virtual memory and namespace. After successfully unsharing
+ vm, if the system call encounters an error while allocating
+ new namespace structure, the error return code will have to
+ reverse the unsharing of vm. As part of the reversal the
+ system call will have to go back to older, shared, vm
+ structure, which may not exist anymore.
+
+Therefore code from copy_* functions that allocated and duplicated
+current context structure was moved into new dup_* functions. Now,
+copy_* functions call dup_* functions to allocate and duplicate
+appropriate context structures and then associate them with the
+task structure that is being constructed. unshare system call on
+the other hand performs the following:
+ 1) Check flags to force missing, but implied, flags
+ 2) For each context structure, call the corresponding unshare
+ helper function to allocate and duplicate a new context
+ structure, if the appropriate bit is set in the flags argument.
+ 3) If there is no error in allocation and duplication and there
+ are new context structures then lock the current task structure,
+ associate new context structures with the current task structure,
+ and release the lock on the current task structure.
+ 4) Appropriately release older, shared, context structures.
+
+7) Low Level Design
+-------------------
+Implementation of unshare can be grouped in the following 4 different
+items:
+ a) Reorganization of existing copy_* functions
+ b) unshare system call service function
+ c) unshare helper functions for each different process context
+ d) Registration of system call number for different architectures
+
+ 7.1) Reorganization of copy_* functions
+ Each copy function such as copy_mm, copy_namespace, copy_files,
+ etc, had roughly two components. The first component allocated
+ and duplicated the appropriate structure and the second component
+ linked it to the task structure passed in as an argument to the copy
+ function. The first component was split into its own function.
+ These dup_* functions allocated and duplicated the appropriate
+ context structure. The reorganized copy_* functions invoked
+ their corresponding dup_* functions and then linked the newly
+ duplicated structures to the task structure with which the
+ copy function was called.
+
+ 7.2) unshare system call service function
+ * Check flags
+ Force implied flags. If CLONE_THREAD is set force CLONE_VM.
+ If CLONE_VM is set, force CLONE_SIGHAND. If CLONE_SIGHAND is
+ set and signals are also being shared, force CLONE_THREAD. If
+ CLONE_NEWNS is set, force CLONE_FS.
+ * For each context flag, invoke the corresponding unshare_*
+ helper routine with flags passed into the system call and a
+ reference to pointer pointing the new unshared structure
+ * If any new structures are created by unshare_* helper
+ functions, take the task_lock() on the current task,
+ modify appropriate context pointers, and release the
+ task lock.
+ * For all newly unshared structures, release the corresponding
+ older, shared, structures.
+
+ 7.3) unshare_* helper functions
+ For unshare_* helpers corresponding to CLONE_SYSVSEM, CLONE_SIGHAND,
+ and CLONE_THREAD, return -EINVAL since they are not implemented yet.
+ For others, check the flag value to see if the unsharing is
+ required for that structure. If it is, invoke the corresponding
+ dup_* function to allocate and duplicate the structure and return
+ a pointer to it.
+
+ 7.4) Appropriately modify architecture specific code to register the
+ new system call.
+
+8) Test Specification
+---------------------
+The test for unshare should test the following:
+ 1) Valid flags: Test to check that clone flags for signal and
+ signal handlers, for which unsharing is not implemented
+ yet, return -EINVAL.
+ 2) Missing/implied flags: Test to make sure that if unsharing
+ namespace without specifying unsharing of filesystem, correctly
+ unshares both namespace and filesystem information.
+ 3) For each of the four (namespace, filesystem, files and vm)
+ supported unsharing, verify that the system call correctly
+ unshares the appropriate structure. Verify that unsharing
+ them individually as well as in combination with each
+ other works as expected.
+ 4) Concurrent execution: Use shared memory segments and futex on
+ an address in the shm segment to synchronize execution of
+ about 10 threads. Have a couple of threads execute execve,
+ a couple _exit and the rest unshare with different combination
+ of flags. Verify that unsharing is performed as expected and
+ that there are no oops or hangs.
+
+9) Future Work
+--------------
+The current implementation of unshare does not allow unsharing of
+signals and signal handlers. Signals are complex to begin with and
+to unshare signals and/or signal handlers of a currently running
+process is even more complex. If in the future there is a specific
+need to allow unsharing of signals and/or signal handlers, it can
+be incrementally added to unshare without affecting legacy
+applications using unshare.
+
diff --git a/Documentation/usb/CREDITS b/Documentation/usb/CREDITS
new file mode 100644
index 0000000..67c59cd
--- /dev/null
+++ b/Documentation/usb/CREDITS
@@ -0,0 +1,175 @@
+Credits for the Simple Linux USB Driver:
+
+The following people have contributed to this code (in alphabetical
+order by last name). I'm sure this list should be longer, its
+difficult to maintain, add yourself with a patch if desired.
+
+ Georg Acher <acher@informatik.tu-muenchen.de>
+ David Brownell <dbrownell@users.sourceforge.net>
+ Alan Cox <alan@lxorguk.ukuu.org.uk>
+ Randy Dunlap <randy.dunlap@intel.com>
+ Johannes Erdfelt <johannes@erdfelt.com>
+ Deti Fliegl <deti@fliegl.de>
+ ham <ham@unsuave.com>
+ Bradley M Keryan <keryan@andrew.cmu.edu>
+ Greg Kroah-Hartman <greg@kroah.com>
+ Pavel Machek <pavel@suse.cz>
+ Paul Mackerras <paulus@cs.anu.edu.au>
+ Petko Manlolov <petkan@dce.bg>
+ David E. Nelson <dnelson@jump.net>
+ Vojtech Pavlik <vojtech@suse.cz>
+ Bill Ryder <bryder@sgi.com>
+ Thomas Sailer <sailer@ife.ee.ethz.ch>
+ Gregory P. Smith <greg@electricrain.com>
+ Linus Torvalds <torvalds@linux-foundation.org>
+ Roman Weissgaerber <weissg@vienna.at>
+ <Kazuki.Yasumatsu@fujixerox.co.jp>
+
+Special thanks to:
+
+ Inaky Perez Gonzalez <inaky@peloncho.fis.ucm.es> for starting the
+ Linux USB driver effort and writing much of the larger uusbd driver.
+ Much has been learned from that effort.
+
+ The NetBSD & FreeBSD USB developers. For being on the Linux USB list
+ and offering suggestions and sharing implementation experiences.
+
+Additional thanks to the following companies and people for donations
+of hardware, support, time and development (this is from the original
+THANKS file in Inaky's driver):
+
+ The following corporations have helped us in the development
+ of Linux USB / UUSBD:
+
+ - 3Com GmbH for donating a ISDN Pro TA and supporting me
+ in technical questions and with test equipment. I'd never
+ expect such a great help.
+
+ - USAR Systems provided us with one of their excellent USB
+ Evaluation Kits. It allows us to test the Linux-USB driver
+ for compliance with the latest USB specification. USAR
+ Systems recognized the importance of an up-to-date open
+ Operating System and supports this project with
+ Hardware. Thanks!.
+
+ - Thanks to Intel Corporation for their precious help.
+
+ - We teamed up with Cherry to make Linux the first OS with
+ built-in USB support. Cherry is one of the biggest keyboard
+ makers in the world.
+
+ - CMD Technology, Inc. sponsored us kindly donating a CSA-6700
+ PCI-to-USB Controller Board to test the OHCI implementation.
+
+ - Due to their support to us, Keytronic can be sure that they
+ will sell keyboards to some of the 3 million (at least)
+ Linux users.
+
+ - Many thanks to ing büro h doran [http://www.ibhdoran.com]!
+ It was almost impossible to get a PC backplate USB connector
+ for the motherboard here at Europe (mine, home-made, was
+ quite lousy :). Now I know where to acquire nice USB stuff!
+
+ - Genius Germany donated a USB mouse to test the mouse boot
+ protocol. They've also donated a F-23 digital joystick and a
+ NetMouse Pro. Thanks!
+
+ - AVM GmbH Berlin is supporting the development of the Linux
+ USB driver for the AVM ISDN Controller B1 USB. AVM is a
+ leading manufacturer for active and passive ISDN Controllers
+ and CAPI 2.0-based software. The active design of the AVM B1
+ is open for all OS platforms, including Linux.
+
+ - Thanks to Y-E Data, Inc. for donating their FlashBuster-U
+ USB Floppy Disk Drive, so we could test the bulk transfer
+ code.
+
+ - Many thanks to Logitech for contributing a three axis USB
+ mouse.
+
+ Logitech designs, manufactures and markets
+ Human Interface Devices, having a long history and
+ experience in making devices such as keyboards, mice,
+ trackballs, cameras, loudspeakers and control devices for
+ gaming and professional use.
+
+ Being a recognized vendor and seller for all these devices,
+ they have donated USB mice, a joystick and a scanner, as a
+ way to acknowledge the importance of Linux and to allow
+ Logitech customers to enjoy support in their favorite
+ operating systems and all Linux users to use Logitech and
+ other USB hardware.
+
+ Logitech is official sponsor of the Linux Conference on
+ Feb. 11th 1999 in Vienna, where we'll will present the
+ current state of the Linux USB effort.
+
+ - CATC has provided means to uncover dark corners of the UHCI
+ inner workings with a USB Inspector.
+
+ - Thanks to Entrega for providing PCI to USB cards, hubs and
+ converter products for development.
+
+ - Thanks to ConnectTech for providing a WhiteHEAT usb to
+ serial converter, and the documentation for the device to
+ allow a driver to be written.
+
+ - Thanks to ADMtek for providing Pegasus and Pegasus II
+ evaluation boards, specs and valuable advices during
+ the driver development.
+
+ And thanks go to (hey! in no particular order :)
+
+ - Oren Tirosh <orenti@hishome.net>, for standing so patiently
+ all my doubts'bout USB and giving lots of cool ideas.
+
+ - Jochen Karrer <karrer@wpfd25.physik.uni-wuerzburg.de>, for
+ pointing out mortal bugs and giving advice.
+
+ - Edmund Humemberger <ed@atnet.at>, for it's great work on
+ public relationships and general management stuff for the
+ Linux-USB effort.
+
+ - Alberto Menegazzi <flash@flash.iol.it> is starting the
+ documentation for the UUSBD. Go for it!
+
+ - Ric Klaren <ia_ric@cs.utwente.nl> for doing nice
+ introductory documents (competing with Alberto's :).
+
+ - Christian Groessler <cpg@aladdin.de>, for it's help on those
+ itchy bits ... :)
+
+ - Paul MacKerras for polishing OHCI and pushing me harder for
+ the iMac support, giving improvements and enhancements.
+
+ - Fernando Herrera <fherrera@eurielec.etsit.upm.es> has taken
+ charge of composing, maintaining and feeding the
+ long-awaited, unique and marvelous UUSBD FAQ! Tadaaaa!!!
+
+ - Rasca Gmelch <thron@gmx.de> has revived the raw driver and
+ pointed bugs, as well as started the uusbd-utils package.
+
+ - Peter Dettori <dettori@ozy.dec.com> is uncovering bugs like
+ crazy, as well as making cool suggestions, great :)
+
+ - All the Free Software and Linux community, the FSF & the GNU
+ project, the MIT X consortium, the TeX people ... everyone!
+ You know who you are!
+
+ - Big thanks to Richard Stallman for creating Emacs!
+
+ - The people at the linux-usb mailing list, for reading so
+ many messages :) Ok, no more kidding; for all your advises!
+
+ - All the people at the USB Implementors Forum for their
+ help and assistance.
+
+ - Nathan Myers <ncm@cantrip.org>, for his advice! (hope you
+ liked Cibeles' party).
+
+ - Linus Torvalds, for starting, developing and managing Linux.
+
+ - Mike Smith, Craig Keithley, Thierry Giron and Janet Schank
+ for convincing me USB Standard hubs are not that standard
+ and that's good to allow for vendor specific quirks on the
+ standard hub driver.
diff --git a/Documentation/usb/URB.txt b/Documentation/usb/URB.txt
new file mode 100644
index 0000000..8ffce74
--- /dev/null
+++ b/Documentation/usb/URB.txt
@@ -0,0 +1,240 @@
+Revised: 2000-Dec-05.
+Again: 2002-Jul-06
+Again: 2005-Sep-19
+
+ NOTE:
+
+ The USB subsystem now has a substantial section in "The Linux Kernel API"
+ guide (in Documentation/DocBook), generated from the current source
+ code. This particular documentation file isn't particularly current or
+ complete; don't rely on it except for a quick overview.
+
+
+1.1. Basic concept or 'What is an URB?'
+
+The basic idea of the new driver is message passing, the message itself is
+called USB Request Block, or URB for short.
+
+- An URB consists of all relevant information to execute any USB transaction
+ and deliver the data and status back.
+
+- Execution of an URB is inherently an asynchronous operation, i.e. the
+ usb_submit_urb(urb) call returns immediately after it has successfully
+ queued the requested action.
+
+- Transfers for one URB can be canceled with usb_unlink_urb(urb) at any time.
+
+- Each URB has a completion handler, which is called after the action
+ has been successfully completed or canceled. The URB also contains a
+ context-pointer for passing information to the completion handler.
+
+- Each endpoint for a device logically supports a queue of requests.
+ You can fill that queue, so that the USB hardware can still transfer
+ data to an endpoint while your driver handles completion of another.
+ This maximizes use of USB bandwidth, and supports seamless streaming
+ of data to (or from) devices when using periodic transfer modes.
+
+
+1.2. The URB structure
+
+Some of the fields in an URB are:
+
+struct urb
+{
+// (IN) device and pipe specify the endpoint queue
+ struct usb_device *dev; // pointer to associated USB device
+ unsigned int pipe; // endpoint information
+
+ unsigned int transfer_flags; // ISO_ASAP, SHORT_NOT_OK, etc.
+
+// (IN) all urbs need completion routines
+ void *context; // context for completion routine
+ void (*complete)(struct urb *); // pointer to completion routine
+
+// (OUT) status after each completion
+ int status; // returned status
+
+// (IN) buffer used for data transfers
+ void *transfer_buffer; // associated data buffer
+ int transfer_buffer_length; // data buffer length
+ int number_of_packets; // size of iso_frame_desc
+
+// (OUT) sometimes only part of CTRL/BULK/INTR transfer_buffer is used
+ int actual_length; // actual data buffer length
+
+// (IN) setup stage for CTRL (pass a struct usb_ctrlrequest)
+ unsigned char* setup_packet; // setup packet (control only)
+
+// Only for PERIODIC transfers (ISO, INTERRUPT)
+ // (IN/OUT) start_frame is set unless ISO_ASAP isn't set
+ int start_frame; // start frame
+ int interval; // polling interval
+
+ // ISO only: packets are only "best effort"; each can have errors
+ int error_count; // number of errors
+ struct usb_iso_packet_descriptor iso_frame_desc[0];
+};
+
+Your driver must create the "pipe" value using values from the appropriate
+endpoint descriptor in an interface that it's claimed.
+
+
+1.3. How to get an URB?
+
+URBs are allocated with the following call
+
+ struct urb *usb_alloc_urb(int isoframes, int mem_flags)
+
+Return value is a pointer to the allocated URB, 0 if allocation failed.
+The parameter isoframes specifies the number of isochronous transfer frames
+you want to schedule. For CTRL/BULK/INT, use 0. The mem_flags parameter
+holds standard memory allocation flags, letting you control (among other
+things) whether the underlying code may block or not.
+
+To free an URB, use
+
+ void usb_free_urb(struct urb *urb)
+
+You may free an urb that you've submitted, but which hasn't yet been
+returned to you in a completion callback. It will automatically be
+deallocated when it is no longer in use.
+
+
+1.4. What has to be filled in?
+
+Depending on the type of transaction, there are some inline functions
+defined in <linux/usb.h> to simplify the initialization, such as
+fill_control_urb() and fill_bulk_urb(). In general, they need the usb
+device pointer, the pipe (usual format from usb.h), the transfer buffer,
+the desired transfer length, the completion handler, and its context.
+Take a look at the some existing drivers to see how they're used.
+
+Flags:
+For ISO there are two startup behaviors: Specified start_frame or ASAP.
+For ASAP set URB_ISO_ASAP in transfer_flags.
+
+If short packets should NOT be tolerated, set URB_SHORT_NOT_OK in
+transfer_flags.
+
+
+1.5. How to submit an URB?
+
+Just call
+
+ int usb_submit_urb(struct urb *urb, int mem_flags)
+
+The mem_flags parameter, such as SLAB_ATOMIC, controls memory allocation,
+such as whether the lower levels may block when memory is tight.
+
+It immediately returns, either with status 0 (request queued) or some
+error code, usually caused by the following:
+
+- Out of memory (-ENOMEM)
+- Unplugged device (-ENODEV)
+- Stalled endpoint (-EPIPE)
+- Too many queued ISO transfers (-EAGAIN)
+- Too many requested ISO frames (-EFBIG)
+- Invalid INT interval (-EINVAL)
+- More than one packet for INT (-EINVAL)
+
+After submission, urb->status is -EINPROGRESS; however, you should never
+look at that value except in your completion callback.
+
+For isochronous endpoints, your completion handlers should (re)submit
+URBs to the same endpoint with the ISO_ASAP flag, using multi-buffering,
+to get seamless ISO streaming.
+
+
+1.6. How to cancel an already running URB?
+
+There are two ways to cancel an URB you've submitted but which hasn't
+been returned to your driver yet. For an asynchronous cancel, call
+
+ int usb_unlink_urb(struct urb *urb)
+
+It removes the urb from the internal list and frees all allocated
+HW descriptors. The status is changed to reflect unlinking. Note
+that the URB will not normally have finished when usb_unlink_urb()
+returns; you must still wait for the completion handler to be called.
+
+To cancel an URB synchronously, call
+
+ void usb_kill_urb(struct urb *urb)
+
+It does everything usb_unlink_urb does, and in addition it waits
+until after the URB has been returned and the completion handler
+has finished. It also marks the URB as temporarily unusable, so
+that if the completion handler or anyone else tries to resubmit it
+they will get a -EPERM error. Thus you can be sure that when
+usb_kill_urb() returns, the URB is totally idle.
+
+
+1.7. What about the completion handler?
+
+The handler is of the following type:
+
+ typedef void (*usb_complete_t)(struct urb *, struct pt_regs *)
+
+I.e., it gets the URB that caused the completion call, plus the
+register values at the time of the corresponding interrupt (if any).
+In the completion handler, you should have a look at urb->status to
+detect any USB errors. Since the context parameter is included in the URB,
+you can pass information to the completion handler.
+
+Note that even when an error (or unlink) is reported, data may have been
+transferred. That's because USB transfers are packetized; it might take
+sixteen packets to transfer your 1KByte buffer, and ten of them might
+have transferred successfully before the completion was called.
+
+
+NOTE: ***** WARNING *****
+NEVER SLEEP IN A COMPLETION HANDLER. These are normally called
+during hardware interrupt processing. If you can, defer substantial
+work to a tasklet (bottom half) to keep system latencies low. You'll
+probably need to use spinlocks to protect data structures you manipulate
+in completion handlers.
+
+
+1.8. How to do isochronous (ISO) transfers?
+
+For ISO transfers you have to fill a usb_iso_packet_descriptor structure,
+allocated at the end of the URB by usb_alloc_urb(n,mem_flags), for each
+packet you want to schedule. You also have to set urb->interval to say
+how often to make transfers; it's often one per frame (which is once
+every microframe for highspeed devices). The actual interval used will
+be a power of two that's no bigger than what you specify.
+
+The usb_submit_urb() call modifies urb->interval to the implemented interval
+value that is less than or equal to the requested interval value. If
+ISO_ASAP scheduling is used, urb->start_frame is also updated.
+
+For each entry you have to specify the data offset for this frame (base is
+transfer_buffer), and the length you want to write/expect to read.
+After completion, actual_length contains the actual transferred length and
+status contains the resulting status for the ISO transfer for this frame.
+It is allowed to specify a varying length from frame to frame (e.g. for
+audio synchronisation/adaptive transfer rates). You can also use the length
+0 to omit one or more frames (striping).
+
+For scheduling you can choose your own start frame or ISO_ASAP. As explained
+earlier, if you always keep at least one URB queued and your completion
+keeps (re)submitting a later URB, you'll get smooth ISO streaming (if usb
+bandwidth utilization allows).
+
+If you specify your own start frame, make sure it's several frames in advance
+of the current frame. You might want this model if you're synchronizing
+ISO data with some other event stream.
+
+
+1.9. How to start interrupt (INT) transfers?
+
+Interrupt transfers, like isochronous transfers, are periodic, and happen
+in intervals that are powers of two (1, 2, 4 etc) units. Units are frames
+for full and low speed devices, and microframes for high speed ones.
+The usb_submit_urb() call modifies urb->interval to the implemented interval
+value that is less than or equal to the requested interval value.
+
+In Linux 2.6, unlike earlier versions, interrupt URBs are not automagically
+restarted when they complete. They end when the completion handler is
+called, just like other URBs. If you want an interrupt URB to be restarted,
+your completion handler must resubmit it.
diff --git a/Documentation/usb/WUSB-Design-overview.txt b/Documentation/usb/WUSB-Design-overview.txt
new file mode 100644
index 0000000..4c3d62c
--- /dev/null
+++ b/Documentation/usb/WUSB-Design-overview.txt
@@ -0,0 +1,448 @@
+
+Linux UWB + Wireless USB + WiNET
+
+ (C) 2005-2006 Intel Corporation
+ Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License version
+ 2 as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+
+Please visit http://bughost.org/thewiki/Design-overview.txt-1.8 for
+updated content.
+
+ * Design-overview.txt-1.8
+
+This code implements a Ultra Wide Band stack for Linux, as well as
+drivers for the the USB based UWB radio controllers defined in the
+Wireless USB 1.0 specification (including Wireless USB host controller
+and an Intel WiNET controller).
+
+ 1. Introduction
+ 1. HWA: Host Wire adapters, your Wireless USB dongle
+
+ 2. DWA: Device Wired Adaptor, a Wireless USB hub for wired
+ devices
+ 3. WHCI: Wireless Host Controller Interface, the PCI WUSB host
+ adapter
+ 2. The UWB stack
+ 1. Devices and hosts: the basic structure
+
+ 2. Host Controller life cycle
+
+ 3. On the air: beacons and enumerating the radio neighborhood
+
+ 4. Device lists
+ 5. Bandwidth allocation
+
+ 3. Wireless USB Host Controller drivers
+
+ 4. Glossary
+
+
+ Introduction
+
+UWB is a wide-band communication protocol that is to serve also as the
+low-level protocol for others (much like TCP sits on IP). Currently
+these others are Wireless USB and TCP/IP, but seems Bluetooth and
+Firewire/1394 are coming along.
+
+UWB uses a band from roughly 3 to 10 GHz, transmitting at a max of
+~-41dB (or 0.074 uW/MHz--geography specific data is still being
+negotiated w/ regulators, so watch for changes). That band is divided in
+a bunch of ~1.5 GHz wide channels (or band groups) composed of three
+subbands/subchannels (528 MHz each). Each channel is independent of each
+other, so you could consider them different "busses". Initially this
+driver considers them all a single one.
+
+Radio time is divided in 65536 us long /superframes/, each one divided
+in 256 256us long /MASs/ (Media Allocation Slots), which are the basic
+time/media allocation units for transferring data. At the beginning of
+each superframe there is a Beacon Period (BP), where every device
+transmit its beacon on a single MAS. The length of the BP depends on how
+many devices are present and the length of their beacons.
+
+Devices have a MAC (fixed, 48 bit address) and a device (changeable, 16
+bit address) and send periodic beacons to advertise themselves and pass
+info on what they are and do. They advertise their capabilities and a
+bunch of other stuff.
+
+The different logical parts of this driver are:
+
+ *
+
+ *UWB*: the Ultra-Wide-Band stack -- manages the radio and
+ associated spectrum to allow for devices sharing it. Allows to
+ control bandwidth assingment, beaconing, scanning, etc
+
+ *
+
+ *WUSB*: the layer that sits on top of UWB to provide Wireless USB.
+ The Wireless USB spec defines means to control a UWB radio and to
+ do the actual WUSB.
+
+
+ HWA: Host Wire adapters, your Wireless USB dongle
+
+WUSB also defines a device called a Host Wire Adaptor (HWA), which in
+mere terms is a USB dongle that enables your PC to have UWB and Wireless
+USB. The Wireless USB Host Controller in a HWA looks to the host like a
+[Wireless] USB controller connected via USB (!)
+
+The HWA itself is broken in two or three main interfaces:
+
+ *
+
+ *RC*: Radio control -- this implements an interface to the
+ Ultra-Wide-Band radio controller. The driver for this implements a
+ USB-based UWB Radio Controller to the UWB stack.
+
+ *
+
+ *HC*: the wireless USB host controller. It looks like a USB host
+ whose root port is the radio and the WUSB devices connect to it.
+ To the system it looks like a separate USB host. The driver (will)
+ implement a USB host controller (similar to UHCI, OHCI or EHCI)
+ for which the root hub is the radio...To reiterate: it is a USB
+ controller that is connected via USB instead of PCI.
+
+ *
+
+ *WINET*: some HW provide a WiNET interface (IP over UWB). This
+ package provides a driver for it (it looks like a network
+ interface, winetX). The driver detects when there is a link up for
+ their type and kick into gear.
+
+
+ DWA: Device Wired Adaptor, a Wireless USB hub for wired devices
+
+These are the complement to HWAs. They are a USB host for connecting
+wired devices, but it is connected to your PC connected via Wireless
+USB. To the system it looks like yet another USB host. To the untrained
+eye, it looks like a hub that connects upstream wirelessly.
+
+We still offer no support for this; however, it should share a lot of
+code with the HWA-RC driver; there is a bunch of factorization work that
+has been done to support that in upcoming releases.
+
+
+ WHCI: Wireless Host Controller Interface, the PCI WUSB host adapter
+
+This is your usual PCI device that implements WHCI. Similar in concept
+to EHCI, it allows your wireless USB devices (including DWAs) to connect
+to your host via a PCI interface. As in the case of the HWA, it has a
+Radio Control interface and the WUSB Host Controller interface per se.
+
+There is still no driver support for this, but will be in upcoming
+releases.
+
+
+ The UWB stack
+
+The main mission of the UWB stack is to keep a tally of which devices
+are in radio proximity to allow drivers to connect to them. As well, it
+provides an API for controlling the local radio controllers (RCs from
+now on), such as to start/stop beaconing, scan, allocate bandwidth, etc.
+
+
+ Devices and hosts: the basic structure
+
+The main building block here is the UWB device (struct uwb_dev). For
+each device that pops up in radio presence (ie: the UWB host receives a
+beacon from it) you get a struct uwb_dev that will show up in
+/sys/class/uwb and in /sys/bus/uwb/devices.
+
+For each RC that is detected, a new struct uwb_rc is created. In turn, a
+RC is also a device, so they also show in /sys/class/uwb and
+/sys/bus/uwb/devices, but at the same time, only radio controllers show
+up in /sys/class/uwb_rc.
+
+ *
+
+ [*] The reason for RCs being also devices is that not only we can
+ see them while enumerating the system device tree, but also on the
+ radio (their beacons and stuff), so the handling has to be
+ likewise to that of a device.
+
+Each RC driver is implemented by a separate driver that plugs into the
+interface that the UWB stack provides through a struct uwb_rc_ops. The
+spec creators have been nice enough to make the message format the same
+for HWA and WHCI RCs, so the driver is really a very thin transport that
+moves the requests from the UWB API to the device [/uwb_rc_ops->cmd()/]
+and sends the replies and notifications back to the API
+[/uwb_rc_neh_grok()/]. Notifications are handled to the UWB daemon, that
+is chartered, among other things, to keep the tab of how the UWB radio
+neighborhood looks, creating and destroying devices as they show up or
+dissapear.
+
+Command execution is very simple: a command block is sent and a event
+block or reply is expected back. For sending/receiving command/events, a
+handle called /neh/ (Notification/Event Handle) is opened with
+/uwb_rc_neh_open()/.
+
+The HWA-RC (USB dongle) driver (drivers/uwb/hwa-rc.c) does this job for
+the USB connected HWA. Eventually, drivers/whci-rc.c will do the same
+for the PCI connected WHCI controller.
+
+
+ Host Controller life cycle
+
+So let's say we connect a dongle to the system: it is detected and
+firmware uploaded if needed [for Intel's i1480
+/drivers/uwb/ptc/usb.c:ptc_usb_probe()/] and then it is reenumerated.
+Now we have a real HWA device connected and
+/drivers/uwb/hwa-rc.c:hwarc_probe()/ picks it up, that will set up the
+Wire-Adaptor environment and then suck it into the UWB stack's vision of
+the world [/drivers/uwb/lc-rc.c:uwb_rc_add()/].
+
+ *
+
+ [*] The stack should put a new RC to scan for devices
+ [/uwb_rc_scan()/] so it finds what's available around and tries to
+ connect to them, but this is policy stuff and should be driven
+ from user space. As of now, the operator is expected to do it
+ manually; see the release notes for documentation on the procedure.
+
+When a dongle is disconnected, /drivers/uwb/hwa-rc.c:hwarc_disconnect()/
+takes time of tearing everything down safely (or not...).
+
+
+ On the air: beacons and enumerating the radio neighborhood
+
+So assuming we have devices and we have agreed for a channel to connect
+on (let's say 9), we put the new RC to beacon:
+
+ *
+
+ $ echo 9 0 > /sys/class/uwb_rc/uwb0/beacon
+
+Now it is visible. If there were other devices in the same radio channel
+and beacon group (that's what the zero is for), the dongle's radio
+control interface will send beacon notifications on its
+notification/event endpoint (NEEP). The beacon notifications are part of
+the event stream that is funneled into the API with
+/drivers/uwb/neh.c:uwb_rc_neh_grok()/ and delivered to the UWBD, the UWB
+daemon through a notification list.
+
+UWBD wakes up and scans the event list; finds a beacon and adds it to
+the BEACON CACHE (/uwb_beca/). If he receives a number of beacons from
+the same device, he considers it to be 'onair' and creates a new device
+[/drivers/uwb/lc-dev.c:uwbd_dev_onair()/]. Similarly, when no beacons
+are received in some time, the device is considered gone and wiped out
+[uwbd calls periodically /uwb/beacon.c:uwb_beca_purge()/ that will purge
+the beacon cache of dead devices].
+
+
+ Device lists
+
+All UWB devices are kept in the list of the struct bus_type uwb_bus.
+
+
+ Bandwidth allocation
+
+The UWB stack maintains a local copy of DRP availability through
+processing of incoming *DRP Availability Change* notifications. This
+local copy is currently used to present the current bandwidth
+availability to the user through the sysfs file
+/sys/class/uwb_rc/uwbx/bw_avail. In the future the bandwidth
+availability information will be used by the bandwidth reservation
+routines.
+
+The bandwidth reservation routines are in progress and are thus not
+present in the current release. When completed they will enable a user
+to initiate DRP reservation requests through interaction with sysfs. DRP
+reservation requests from remote UWB devices will also be handled. The
+bandwidth management done by the UWB stack will include callbacks to the
+higher layers will enable the higher layers to use the reservations upon
+completion. [Note: The bandwidth reservation work is in progress and
+subject to change.]
+
+
+ Wireless USB Host Controller drivers
+
+*WARNING* This section needs a lot of work!
+
+As explained above, there are three different types of HCs in the WUSB
+world: HWA-HC, DWA-HC and WHCI-HC.
+
+HWA-HC and DWA-HC share that they are Wire-Adapters (USB or WUSB
+connected controllers), and their transfer management system is almost
+identical. So is their notification delivery system.
+
+HWA-HC and WHCI-HC share that they are both WUSB host controllers, so
+they have to deal with WUSB device life cycle and maintenance, wireless
+root-hub
+
+HWA exposes a Host Controller interface (HWA-HC 0xe0/02/02). This has
+three endpoints (Notifications, Data Transfer In and Data Transfer
+Out--known as NEP, DTI and DTO in the code).
+
+We reserve UWB bandwidth for our Wireless USB Cluster, create a Cluster
+ID and tell the HC to use all that. Then we start it. This means the HC
+starts sending MMCs.
+
+ *
+
+ The MMCs are blocks of data defined somewhere in the WUSB1.0 spec
+ that define a stream in the UWB channel time allocated for sending
+ WUSB IEs (host to device commands/notifications) and Device
+ Notifications (device initiated to host). Each host defines a
+ unique Wireless USB cluster through MMCs. Devices can connect to a
+ single cluster at the time. The IEs are Information Elements, and
+ among them are the bandwidth allocations that tell each device
+ when can they transmit or receive.
+
+Now it all depends on external stimuli.
+
+*New device connection*
+
+A new device pops up, it scans the radio looking for MMCs that give out
+the existence of Wireless USB channels. Once one (or more) are found,
+selects which one to connect to. Sends a /DN_Connect/ (device
+notification connect) during the DNTS (Device Notification Time
+Slot--announced in the MMCs
+
+HC picks the /DN_Connect/ out (nep module sends to notif.c for delivery
+into /devconnect/). This process starts the authentication process for
+the device. First we allocate a /fake port/ and assign an
+unauthenticated address (128 to 255--what we really do is
+0x80 | fake_port_idx). We fiddle with the fake port status and /khubd/
+sees a new connection, so he moves on to enable the fake port with a reset.
+
+So now we are in the reset path -- we know we have a non-yet enumerated
+device with an unauthorized address; we ask user space to authenticate
+(FIXME: not yet done, similar to bluetooth pairing), then we do the key
+exchange (FIXME: not yet done) and issue a /set address 0/ to bring the
+device to the default state. Device is authenticated.
+
+From here, the USB stack takes control through the usb_hcd ops. khubd
+has seen the port status changes, as we have been toggling them. It will
+start enumerating and doing transfers through usb_hcd->urb_enqueue() to
+read descriptors and move our data.
+
+*Device life cycle and keep alives*
+
+Everytime there is a succesful transfer to/from a device, we update a
+per-device activity timestamp. If not, every now and then we check and
+if the activity timestamp gets old, we ping the device by sending it a
+Keep Alive IE; it responds with a /DN_Alive/ pong during the DNTS (this
+arrives to us as a notification through
+devconnect.c:wusb_handle_dn_alive(). If a device times out, we
+disconnect it from the system (cleaning up internal information and
+toggling the bits in the fake hub port, which kicks khubd into removing
+the rest of the stuff).
+
+This is done through devconnect:__wusb_check_devs(), which will scan the
+device list looking for whom needs refreshing.
+
+If the device wants to disconnect, it will either die (ugly) or send a
+/DN_Disconnect/ that will prompt a disconnection from the system.
+
+*Sending and receiving data*
+
+Data is sent and received through /Remote Pipes/ (rpipes). An rpipe is
+/aimed/ at an endpoint in a WUSB device. This is the same for HWAs and
+DWAs.
+
+Each HC has a number of rpipes and buffers that can be assigned to them;
+when doing a data transfer (xfer), first the rpipe has to be aimed and
+prepared (buffers assigned), then we can start queueing requests for
+data in or out.
+
+Data buffers have to be segmented out before sending--so we send first a
+header (segment request) and then if there is any data, a data buffer
+immediately after to the DTI interface (yep, even the request). If our
+buffer is bigger than the max segment size, then we just do multiple
+requests.
+
+[This sucks, because doing USB scatter gatter in Linux is resource
+intensive, if any...not that the current approach is not. It just has to
+be cleaned up a lot :)].
+
+If reading, we don't send data buffers, just the segment headers saying
+we want to read segments.
+
+When the xfer is executed, we receive a notification that says data is
+ready in the DTI endpoint (handled through
+xfer.c:wa_handle_notif_xfer()). In there we read from the DTI endpoint a
+descriptor that gives us the status of the transfer, its identification
+(given when we issued it) and the segment number. If it was a data read,
+we issue another URB to read into the destination buffer the chunk of
+data coming out of the remote endpoint. Done, wait for the next guy. The
+callbacks for the URBs issued from here are the ones that will declare
+the xfer complete at some point and call it's callback.
+
+Seems simple, but the implementation is not trivial.
+
+ *
+
+ *WARNING* Old!!
+
+The main xfer descriptor, wa_xfer (equivalent to a URB) contains an
+array of segments, tallys on segments and buffers and callback
+information. Buried in there is a lot of URBs for executing the segments
+and buffer transfers.
+
+For OUT xfers, there is an array of segments, one URB for each, another
+one of buffer URB. When submitting, we submit URBs for segment request
+1, buffer 1, segment 2, buffer 2...etc. Then we wait on the DTI for xfer
+result data; when all the segments are complete, we call the callback to
+finalize the transfer.
+
+For IN xfers, we only issue URBs for the segments we want to read and
+then wait for the xfer result data.
+
+*URB mapping into xfers*
+
+This is done by hwahc_op_urb_[en|de]queue(). In enqueue() we aim an
+rpipe to the endpoint where we have to transmit, create a transfer
+context (wa_xfer) and submit it. When the xfer is done, our callback is
+called and we assign the status bits and release the xfer resources.
+
+In dequeue() we are basically cancelling/aborting the transfer. We issue
+a xfer abort request to the HC, cancell all the URBs we had submitted
+and not yet done and when all that is done, the xfer callback will be
+called--this will call the URB callback.
+
+
+ Glossary
+
+*DWA* -- Device Wire Adapter
+
+USB host, wired for downstream devices, upstream connects wirelessly
+with Wireless USB.
+
+*EVENT* -- Response to a command on the NEEP
+
+*HWA* -- Host Wire Adapter / USB dongle for UWB and Wireless USB
+
+*NEH* -- Notification/Event Handle
+
+Handle/file descriptor for receiving notifications or events. The WA
+code requires you to get one of this to listen for notifications or
+events on the NEEP.
+
+*NEEP* -- Notification/Event EndPoint
+
+Stuff related to the management of the first endpoint of a HWA USB
+dongle that is used to deliver an stream of events and notifications to
+the host.
+
+*NOTIFICATION* -- Message coming in the NEEP as response to something.
+
+*RC* -- Radio Control
+
+Design-overview.txt-1.8 (last edited 2006-11-04 12:22:24 by
+InakyPerezGonzalez)
+
diff --git a/Documentation/usb/acm.txt b/Documentation/usb/acm.txt
new file mode 100644
index 0000000..17f5c2e
--- /dev/null
+++ b/Documentation/usb/acm.txt
@@ -0,0 +1,128 @@
+ Linux ACM driver v0.16
+ (c) 1999 Vojtech Pavlik <vojtech@suse.cz>
+ Sponsored by SuSE
+----------------------------------------------------------------------------
+
+0. Disclaimer
+~~~~~~~~~~~~~
+ This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+ This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+ You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Should you need to contact me, the author, you can do so either by e-mail
+- mail your message to <vojtech@suse.cz>, or by paper mail: Vojtech Pavlik,
+Ucitelska 1576, Prague 8, 182 00 Czech Republic
+
+ For your convenience, the GNU General Public License version 2 is included
+in the package: See the file COPYING.
+
+1. Usage
+~~~~~~~~
+ The drivers/usb/class/cdc-acm.c drivers works with USB modems and USB ISDN terminal
+adapters that conform to the Universal Serial Bus Communication Device Class
+Abstract Control Model (USB CDC ACM) specification.
+
+ Many modems do, here is a list of those I know of:
+
+ 3Com OfficeConnect 56k
+ 3Com Voice FaxModem Pro
+ 3Com Sportster
+ MultiTech MultiModem 56k
+ Zoom 2986L FaxModem
+ Compaq 56k FaxModem
+ ELSA Microlink 56k
+
+ I know of one ISDN TA that does work with the acm driver:
+
+ 3Com USR ISDN Pro TA
+
+ Some cell phones also connect via USB. I know the following phones work:
+
+ SonyEricsson K800i
+
+ Unfortunately many modems and most ISDN TAs use proprietary interfaces and
+thus won't work with this drivers. Check for ACM compliance before buying.
+
+ To use the modems you need these modules loaded:
+
+ usbcore.ko
+ uhci-hcd.ko ohci-hcd.ko or ehci-hcd.ko
+ cdc-acm.ko
+
+ After that, the modem[s] should be accessible. You should be able to use
+minicom, ppp and mgetty with them.
+
+2. Verifying that it works
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+ The first step would be to check /proc/bus/usb/devices, it should look
+like this:
+
+T: Bus=01 Lev=00 Prnt=00 Port=00 Cnt=00 Dev#= 1 Spd=12 MxCh= 2
+B: Alloc= 0/900 us ( 0%), #Int= 0, #Iso= 0
+D: Ver= 1.00 Cls=09(hub ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+P: Vendor=0000 ProdID=0000 Rev= 0.00
+S: Product=USB UHCI Root Hub
+S: SerialNumber=6800
+C:* #Ifs= 1 Cfg#= 1 Atr=40 MxPwr= 0mA
+I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=00 Driver=hub
+E: Ad=81(I) Atr=03(Int.) MxPS= 8 Ivl=255ms
+T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 2 Spd=12 MxCh= 0
+D: Ver= 1.00 Cls=02(comm.) Sub=00 Prot=00 MxPS= 8 #Cfgs= 2
+P: Vendor=04c1 ProdID=008f Rev= 2.07
+S: Manufacturer=3Com Inc.
+S: Product=3Com U.S. Robotics Pro ISDN TA
+S: SerialNumber=UFT53A49BVT7
+C: #Ifs= 1 Cfg#= 1 Atr=60 MxPwr= 0mA
+I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=acm
+E: Ad=85(I) Atr=02(Bulk) MxPS= 64 Ivl= 0ms
+E: Ad=04(O) Atr=02(Bulk) MxPS= 64 Ivl= 0ms
+E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=128ms
+C:* #Ifs= 2 Cfg#= 2 Atr=60 MxPwr= 0mA
+I: If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=02 Prot=01 Driver=acm
+E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=128ms
+I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=acm
+E: Ad=85(I) Atr=02(Bulk) MxPS= 64 Ivl= 0ms
+E: Ad=04(O) Atr=02(Bulk) MxPS= 64 Ivl= 0ms
+
+The presence of these three lines (and the Cls= 'comm' and 'data' classes)
+is important, it means it's an ACM device. The Driver=acm means the acm
+driver is used for the device. If you see only Cls=ff(vend.) then you're out
+of luck, you have a device with vendor specific-interface.
+
+D: Ver= 1.00 Cls=02(comm.) Sub=00 Prot=00 MxPS= 8 #Cfgs= 2
+I: If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=02 Prot=01 Driver=acm
+I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=acm
+
+In the system log you should see:
+
+usb.c: USB new device connect, assigned device number 2
+usb.c: kmalloc IF c7691fa0, numif 1
+usb.c: kmalloc IF c7b5f3e0, numif 2
+usb.c: skipped 4 class/vendor specific interface descriptors
+usb.c: new device strings: Mfr=1, Product=2, SerialNumber=3
+usb.c: USB device number 2 default language ID 0x409
+Manufacturer: 3Com Inc.
+Product: 3Com U.S. Robotics Pro ISDN TA
+SerialNumber: UFT53A49BVT7
+acm.c: probing config 1
+acm.c: probing config 2
+ttyACM0: USB ACM device
+acm.c: acm_control_msg: rq: 0x22 val: 0x0 len: 0x0 result: 0
+acm.c: acm_control_msg: rq: 0x20 val: 0x0 len: 0x7 result: 7
+usb.c: acm driver claimed interface c7b5f3e0
+usb.c: acm driver claimed interface c7b5f3f8
+usb.c: acm driver claimed interface c7691fa0
+
+If all this seems to be OK, fire up minicom and set it to talk to the ttyACM
+device and try typing 'at'. If it responds with 'OK', then everything is
+working.
diff --git a/Documentation/usb/anchors.txt b/Documentation/usb/anchors.txt
new file mode 100644
index 0000000..6f24f56
--- /dev/null
+++ b/Documentation/usb/anchors.txt
@@ -0,0 +1,79 @@
+What is anchor?
+===============
+
+A USB driver needs to support some callbacks requiring
+a driver to cease all IO to an interface. To do so, a
+driver has to keep track of the URBs it has submitted
+to know they've all completed or to call usb_kill_urb
+for them. The anchor is a data structure takes care of
+keeping track of URBs and provides methods to deal with
+multiple URBs.
+
+Allocation and Initialisation
+=============================
+
+There's no API to allocate an anchor. It is simply declared
+as struct usb_anchor. init_usb_anchor() must be called to
+initialise the data structure.
+
+Deallocation
+============
+
+Once it has no more URBs associated with it, the anchor can be
+freed with normal memory management operations.
+
+Association and disassociation of URBs with anchors
+===================================================
+
+An association of URBs to an anchor is made by an explicit
+call to usb_anchor_urb(). The association is maintained until
+an URB is finished by (successfull) completion. Thus disassociation
+is automatic. A function is provided to forcibly finish (kill)
+all URBs associated with an anchor.
+Furthermore, disassociation can be made with usb_unanchor_urb()
+
+Operations on multitudes of URBs
+================================
+
+usb_kill_anchored_urbs()
+------------------------
+
+This function kills all URBs associated with an anchor. The URBs
+are called in the reverse temporal order they were submitted.
+This way no data can be reordered.
+
+usb_unlink_anchored_urbs()
+--------------------------
+
+This function unlinks all URBs associated with an anchor. The URBs
+are processed in the reverse temporal order they were submitted.
+This is similar to usb_kill_anchored_urbs(), but it will not sleep.
+Therefore no guarantee is made that the URBs have been unlinked when
+the call returns. They may be unlinked later but will be unlinked in
+finite time.
+
+usb_scuttle_anchored_urbs()
+---------------------------
+
+All URBs of an anchor are unanchored en masse.
+
+usb_wait_anchor_empty_timeout()
+-------------------------------
+
+This function waits for all URBs associated with an anchor to finish
+or a timeout, whichever comes first. Its return value will tell you
+whether the timeout was reached.
+
+usb_anchor_empty()
+------------------
+
+Returns true if no URBs are associated with an anchor. Locking
+is the caller's responsibility.
+
+usb_get_from_anchor()
+---------------------
+
+Returns the oldest anchored URB of an anchor. The URB is unanchored
+and returned with a reference. As you may mix URBs to several
+destinations in one anchor you have no guarantee the chronologically
+first submitted URB is returned. \ No newline at end of file
diff --git a/Documentation/usb/authorization.txt b/Documentation/usb/authorization.txt
new file mode 100644
index 0000000..381b22e
--- /dev/null
+++ b/Documentation/usb/authorization.txt
@@ -0,0 +1,92 @@
+
+Authorizing (or not) your USB devices to connect to the system
+
+(C) 2007 Inaky Perez-Gonzalez <inaky@linux.intel.com> Intel Corporation
+
+This feature allows you to control if a USB device can be used (or
+not) in a system. This feature will allow you to implement a lock-down
+of USB devices, fully controlled by user space.
+
+As of now, when a USB device is connected it is configured and
+its interfaces are immediately made available to the users. With this
+modification, only if root authorizes the device to be configured will
+then it be possible to use it.
+
+Usage:
+
+Authorize a device to connect:
+
+$ echo 1 > /sys/usb/devices/DEVICE/authorized
+
+Deauthorize a device:
+
+$ echo 0 > /sys/usb/devices/DEVICE/authorized
+
+Set new devices connected to hostX to be deauthorized by default (ie:
+lock down):
+
+$ echo 0 > /sys/bus/devices/usbX/authorized_default
+
+Remove the lock down:
+
+$ echo 1 > /sys/bus/devices/usbX/authorized_default
+
+By default, Wired USB devices are authorized by default to
+connect. Wireless USB hosts deauthorize by default all new connected
+devices (this is so because we need to do an authentication phase
+before authorizing).
+
+
+Example system lockdown (lame)
+-----------------------
+
+Imagine you want to implement a lockdown so only devices of type XYZ
+can be connected (for example, it is a kiosk machine with a visible
+USB port):
+
+boot up
+rc.local ->
+
+ for host in /sys/bus/devices/usb*
+ do
+ echo 0 > $host/authorized_default
+ done
+
+Hookup an script to udev, for new USB devices
+
+ if device_is_my_type $DEV
+ then
+ echo 1 > $device_path/authorized
+ done
+
+
+Now, device_is_my_type() is where the juice for a lockdown is. Just
+checking if the class, type and protocol match something is the worse
+security verification you can make (or the best, for someone willing
+to break it). If you need something secure, use crypto and Certificate
+Authentication or stuff like that. Something simple for an storage key
+could be:
+
+function device_is_my_type()
+{
+ echo 1 > authorized # temporarily authorize it
+ # FIXME: make sure none can mount it
+ mount DEVICENODE /mntpoint
+ sum=$(md5sum /mntpoint/.signature)
+ if [ $sum = $(cat /etc/lockdown/keysum) ]
+ then
+ echo "We are good, connected"
+ umount /mntpoint
+ # Other stuff so others can use it
+ else
+ echo 0 > authorized
+ fi
+}
+
+
+Of course, this is lame, you'd want to do a real certificate
+verification stuff with PKI, so you don't depend on a shared secret,
+etc, but you get the idea. Anybody with access to a device gadget kit
+can fake descriptors and device info. Don't trust that. You are
+welcome.
+
diff --git a/Documentation/usb/callbacks.txt b/Documentation/usb/callbacks.txt
new file mode 100644
index 0000000..7c81241
--- /dev/null
+++ b/Documentation/usb/callbacks.txt
@@ -0,0 +1,132 @@
+What callbacks will usbcore do?
+===============================
+
+Usbcore will call into a driver through callbacks defined in the driver
+structure and through the completion handler of URBs a driver submits.
+Only the former are in the scope of this document. These two kinds of
+callbacks are completely independent of each other. Information on the
+completion callback can be found in Documentation/usb/URB.txt.
+
+The callbacks defined in the driver structure are:
+
+1. Hotplugging callbacks:
+
+ * @probe: Called to see if the driver is willing to manage a particular
+ * interface on a device.
+ * @disconnect: Called when the interface is no longer accessible, usually
+ * because its device has been (or is being) disconnected or the
+ * driver module is being unloaded.
+
+2. Odd backdoor through usbfs:
+
+ * @ioctl: Used for drivers that want to talk to userspace through
+ * the "usbfs" filesystem. This lets devices provide ways to
+ * expose information to user space regardless of where they
+ * do (or don't) show up otherwise in the filesystem.
+
+3. Power management (PM) callbacks:
+
+ * @suspend: Called when the device is going to be suspended.
+ * @resume: Called when the device is being resumed.
+ * @reset_resume: Called when the suspended device has been reset instead
+ * of being resumed.
+
+4. Device level operations:
+
+ * @pre_reset: Called when the device is about to be reset.
+ * @post_reset: Called after the device has been reset
+
+The ioctl interface (2) should be used only if you have a very good
+reason. Sysfs is preferred these days. The PM callbacks are covered
+separately in Documentation/usb/power-management.txt.
+
+Calling conventions
+===================
+
+All callbacks are mutually exclusive. There's no need for locking
+against other USB callbacks. All callbacks are called from a task
+context. You may sleep. However, it is important that all sleeps have a
+small fixed upper limit in time. In particular you must not call out to
+user space and await results.
+
+Hotplugging callbacks
+=====================
+
+These callbacks are intended to associate and disassociate a driver with
+an interface. A driver's bond to an interface is exclusive.
+
+The probe() callback
+--------------------
+
+int (*probe) (struct usb_interface *intf,
+ const struct usb_device_id *id);
+
+Accept or decline an interface. If you accept the device return 0,
+otherwise -ENODEV or -ENXIO. Other error codes should be used only if a
+genuine error occurred during initialisation which prevented a driver
+from accepting a device that would else have been accepted.
+You are strongly encouraged to use usbcore'sfacility,
+usb_set_intfdata(), to associate a data structure with an interface, so
+that you know which internal state and identity you associate with a
+particular interface. The device will not be suspended and you may do IO
+to the interface you are called for and endpoint 0 of the device. Device
+initialisation that doesn't take too long is a good idea here.
+
+The disconnect() callback
+-------------------------
+
+void (*disconnect) (struct usb_interface *intf);
+
+This callback is a signal to break any connection with an interface.
+You are not allowed any IO to a device after returning from this
+callback. You also may not do any other operation that may interfere
+with another driver bound the interface, eg. a power management
+operation.
+If you are called due to a physical disconnection, all your URBs will be
+killed by usbcore. Note that in this case disconnect will be called some
+time after the physical disconnection. Thus your driver must be prepared
+to deal with failing IO even prior to the callback.
+
+Device level callbacks
+======================
+
+pre_reset
+---------
+
+int (*pre_reset)(struct usb_interface *intf);
+
+Another driver or user space is triggering a reset on the device which
+contains the interface passed as an argument. Cease IO and save any
+device state you need to restore.
+
+If you need to allocate memory here, use GFP_NOIO or GFP_ATOMIC, if you
+are in atomic context.
+
+post_reset
+----------
+
+int (*post_reset)(struct usb_interface *intf);
+
+The reset has completed. Restore any saved device state and begin
+using the device again.
+
+If you need to allocate memory here, use GFP_NOIO or GFP_ATOMIC, if you
+are in atomic context.
+
+Call sequences
+==============
+
+No callbacks other than probe will be invoked for an interface
+that isn't bound to your driver.
+
+Probe will never be called for an interface bound to a driver.
+Hence following a successful probe, disconnect will be called
+before there is another probe for the same interface.
+
+Once your driver is bound to an interface, disconnect can be
+called at any time except in between pre_reset and post_reset.
+pre_reset is always followed by post_reset, even if the reset
+failed or the device has been unplugged.
+
+suspend is always followed by one of: resume, reset_resume, or
+disconnect.
diff --git a/Documentation/usb/dma.txt b/Documentation/usb/dma.txt
new file mode 100644
index 0000000..e8b50b7
--- /dev/null
+++ b/Documentation/usb/dma.txt
@@ -0,0 +1,138 @@
+In Linux 2.5 kernels (and later), USB device drivers have additional control
+over how DMA may be used to perform I/O operations. The APIs are detailed
+in the kernel usb programming guide (kerneldoc, from the source code).
+
+
+API OVERVIEW
+
+The big picture is that USB drivers can continue to ignore most DMA issues,
+though they still must provide DMA-ready buffers (see DMA-mapping.txt).
+That's how they've worked through the 2.4 (and earlier) kernels.
+
+OR: they can now be DMA-aware.
+
+- New calls enable DMA-aware drivers, letting them allocate dma buffers and
+ manage dma mappings for existing dma-ready buffers (see below).
+
+- URBs have an additional "transfer_dma" field, as well as a transfer_flags
+ bit saying if it's valid. (Control requests also have "setup_dma" and a
+ corresponding transfer_flags bit.)
+
+- "usbcore" will map those DMA addresses, if a DMA-aware driver didn't do
+ it first and set URB_NO_TRANSFER_DMA_MAP or URB_NO_SETUP_DMA_MAP. HCDs
+ don't manage dma mappings for URBs.
+
+- There's a new "generic DMA API", parts of which are usable by USB device
+ drivers. Never use dma_set_mask() on any USB interface or device; that
+ would potentially break all devices sharing that bus.
+
+
+ELIMINATING COPIES
+
+It's good to avoid making CPUs copy data needlessly. The costs can add up,
+and effects like cache-trashing can impose subtle penalties.
+
+- If you're doing lots of small data transfers from the same buffer all
+ the time, that can really burn up resources on systems which use an
+ IOMMU to manage the DMA mappings. It can cost MUCH more to set up and
+ tear down the IOMMU mappings with each request than perform the I/O!
+
+ For those specific cases, USB has primitives to allocate less expensive
+ memory. They work like kmalloc and kfree versions that give you the right
+ kind of addresses to store in urb->transfer_buffer and urb->transfer_dma.
+ You'd also set URB_NO_TRANSFER_DMA_MAP in urb->transfer_flags:
+
+ void *usb_buffer_alloc (struct usb_device *dev, size_t size,
+ int mem_flags, dma_addr_t *dma);
+
+ void usb_buffer_free (struct usb_device *dev, size_t size,
+ void *addr, dma_addr_t dma);
+
+ Most drivers should *NOT* be using these primitives; they don't need
+ to use this type of memory ("dma-coherent"), and memory returned from
+ kmalloc() will work just fine.
+
+ For control transfers you can use the buffer primitives or not for each
+ of the transfer buffer and setup buffer independently. Set the flag bits
+ URB_NO_TRANSFER_DMA_MAP and URB_NO_SETUP_DMA_MAP to indicate which
+ buffers you have prepared. For non-control transfers URB_NO_SETUP_DMA_MAP
+ is ignored.
+
+ The memory buffer returned is "dma-coherent"; sometimes you might need to
+ force a consistent memory access ordering by using memory barriers. It's
+ not using a streaming DMA mapping, so it's good for small transfers on
+ systems where the I/O would otherwise thrash an IOMMU mapping. (See
+ Documentation/DMA-mapping.txt for definitions of "coherent" and "streaming"
+ DMA mappings.)
+
+ Asking for 1/Nth of a page (as well as asking for N pages) is reasonably
+ space-efficient.
+
+ On most systems the memory returned will be uncached, because the
+ semantics of dma-coherent memory require either bypassing CPU caches
+ or using cache hardware with bus-snooping support. While x86 hardware
+ has such bus-snooping, many other systems use software to flush cache
+ lines to prevent DMA conflicts.
+
+- Devices on some EHCI controllers could handle DMA to/from high memory.
+
+ Unfortunately, the current Linux DMA infrastructure doesn't have a sane
+ way to expose these capabilities ... and in any case, HIGHMEM is mostly a
+ design wart specific to x86_32. So your best bet is to ensure you never
+ pass a highmem buffer into a USB driver. That's easy; it's the default
+ behavior. Just don't override it; e.g. with NETIF_F_HIGHDMA.
+
+ This may force your callers to do some bounce buffering, copying from
+ high memory to "normal" DMA memory. If you can come up with a good way
+ to fix this issue (for x86_32 machines with over 1 GByte of memory),
+ feel free to submit patches.
+
+
+WORKING WITH EXISTING BUFFERS
+
+Existing buffers aren't usable for DMA without first being mapped into the
+DMA address space of the device. However, most buffers passed to your
+driver can safely be used with such DMA mapping. (See the first section
+of DMA-mapping.txt, titled "What memory is DMA-able?")
+
+- When you're using scatterlists, you can map everything at once. On some
+ systems, this kicks in an IOMMU and turns the scatterlists into single
+ DMA transactions:
+
+ int usb_buffer_map_sg (struct usb_device *dev, unsigned pipe,
+ struct scatterlist *sg, int nents);
+
+ void usb_buffer_dmasync_sg (struct usb_device *dev, unsigned pipe,
+ struct scatterlist *sg, int n_hw_ents);
+
+ void usb_buffer_unmap_sg (struct usb_device *dev, unsigned pipe,
+ struct scatterlist *sg, int n_hw_ents);
+
+ It's probably easier to use the new usb_sg_*() calls, which do the DMA
+ mapping and apply other tweaks to make scatterlist i/o be fast.
+
+- Some drivers may prefer to work with the model that they're mapping large
+ buffers, synchronizing their safe re-use. (If there's no re-use, then let
+ usbcore do the map/unmap.) Large periodic transfers make good examples
+ here, since it's cheaper to just synchronize the buffer than to unmap it
+ each time an urb completes and then re-map it on during resubmission.
+
+ These calls all work with initialized urbs: urb->dev, urb->pipe,
+ urb->transfer_buffer, and urb->transfer_buffer_length must all be
+ valid when these calls are used (urb->setup_packet must be valid too
+ if urb is a control request):
+
+ struct urb *usb_buffer_map (struct urb *urb);
+
+ void usb_buffer_dmasync (struct urb *urb);
+
+ void usb_buffer_unmap (struct urb *urb);
+
+ The calls manage urb->transfer_dma for you, and set URB_NO_TRANSFER_DMA_MAP
+ so that usbcore won't map or unmap the buffer. The same goes for
+ urb->setup_dma and URB_NO_SETUP_DMA_MAP for control requests.
+
+Note that several of those interfaces are currently commented out, since
+they don't have current users. See the source code. Other than the dmasync
+calls (where the underlying DMA primitives have changed), most of them can
+easily be commented back in if you want to use them.
diff --git a/Documentation/usb/ehci.txt b/Documentation/usb/ehci.txt
new file mode 100644
index 0000000..1536b7e
--- /dev/null
+++ b/Documentation/usb/ehci.txt
@@ -0,0 +1,212 @@
+27-Dec-2002
+
+The EHCI driver is used to talk to high speed USB 2.0 devices using
+USB 2.0-capable host controller hardware. The USB 2.0 standard is
+compatible with the USB 1.1 standard. It defines three transfer speeds:
+
+ - "High Speed" 480 Mbit/sec (60 MByte/sec)
+ - "Full Speed" 12 Mbit/sec (1.5 MByte/sec)
+ - "Low Speed" 1.5 Mbit/sec
+
+USB 1.1 only addressed full speed and low speed. High speed devices
+can be used on USB 1.1 systems, but they slow down to USB 1.1 speeds.
+
+USB 1.1 devices may also be used on USB 2.0 systems. When plugged
+into an EHCI controller, they are given to a USB 1.1 "companion"
+controller, which is a OHCI or UHCI controller as normally used with
+such devices. When USB 1.1 devices plug into USB 2.0 hubs, they
+interact with the EHCI controller through a "Transaction Translator"
+(TT) in the hub, which turns low or full speed transactions into
+high speed "split transactions" that don't waste transfer bandwidth.
+
+At this writing, this driver has been seen to work with implementations
+of EHCI from (in alphabetical order): Intel, NEC, Philips, and VIA.
+Other EHCI implementations are becoming available from other vendors;
+you should expect this driver to work with them too.
+
+While usb-storage devices have been available since mid-2001 (working
+quite speedily on the 2.4 version of this driver), hubs have only
+been available since late 2001, and other kinds of high speed devices
+appear to be on hold until more systems come with USB 2.0 built-in.
+Such new systems have been available since early 2002, and became much
+more typical in the second half of 2002.
+
+Note that USB 2.0 support involves more than just EHCI. It requires
+other changes to the Linux-USB core APIs, including the hub driver,
+but those changes haven't needed to really change the basic "usbcore"
+APIs exposed to USB device drivers.
+
+- David Brownell
+ <dbrownell@users.sourceforge.net>
+
+
+FUNCTIONALITY
+
+This driver is regularly tested on x86 hardware, and has also been
+used on PPC hardware so big/little endianness issues should be gone.
+It's believed to do all the right PCI magic so that I/O works even on
+systems with interesting DMA mapping issues.
+
+Transfer Types
+
+At this writing the driver should comfortably handle all control, bulk,
+and interrupt transfers, including requests to USB 1.1 devices through
+transaction translators (TTs) in USB 2.0 hubs. But you may find bugs.
+
+High Speed Isochronous (ISO) transfer support is also functional, but
+at this writing no Linux drivers have been using that support.
+
+Full Speed Isochronous transfer support, through transaction translators,
+is not yet available. Note that split transaction support for ISO
+transfers can't share much code with the code for high speed ISO transfers,
+since EHCI represents these with a different data structure. So for now,
+most USB audio and video devices can't be connected to high speed buses.
+
+Driver Behavior
+
+Transfers of all types can be queued. This means that control transfers
+from a driver on one interface (or through usbfs) won't interfere with
+ones from another driver, and that interrupt transfers can use periods
+of one frame without risking data loss due to interrupt processing costs.
+
+The EHCI root hub code hands off USB 1.1 devices to its companion
+controller. This driver doesn't need to know anything about those
+drivers; a OHCI or UHCI driver that works already doesn't need to change
+just because the EHCI driver is also present.
+
+There are some issues with power management; suspend/resume doesn't
+behave quite right at the moment.
+
+Also, some shortcuts have been taken with the scheduling periodic
+transactions (interrupt and isochronous transfers). These place some
+limits on the number of periodic transactions that can be scheduled,
+and prevent use of polling intervals of less than one frame.
+
+
+USE BY
+
+Assuming you have an EHCI controller (on a PCI card or motherboard)
+and have compiled this driver as a module, load this like:
+
+ # modprobe ehci-hcd
+
+and remove it by:
+
+ # rmmod ehci-hcd
+
+You should also have a driver for a "companion controller", such as
+"ohci-hcd" or "uhci-hcd". In case of any trouble with the EHCI driver,
+remove its module and then the driver for that companion controller will
+take over (at lower speed) all the devices that were previously handled
+by the EHCI driver.
+
+Module parameters (pass to "modprobe") include:
+
+ log2_irq_thresh (default 0):
+ Log2 of default interrupt delay, in microframes. The default
+ value is 0, indicating 1 microframe (125 usec). Maximum value
+ is 6, indicating 2^6 = 64 microframes. This controls how often
+ the EHCI controller can issue interrupts.
+
+If you're using this driver on a 2.5 kernel, and you've enabled USB
+debugging support, you'll see three files in the "sysfs" directory for
+any EHCI controller:
+
+ "async" dumps the asynchronous schedule, used for control
+ and bulk transfers. Shows each active qh and the qtds
+ pending, usually one qtd per urb. (Look at it with
+ usb-storage doing disk I/O; watch the request queues!)
+ "periodic" dumps the periodic schedule, used for interrupt
+ and isochronous transfers. Doesn't show qtds.
+ "registers" show controller register state, and
+
+The contents of those files can help identify driver problems.
+
+
+Device drivers shouldn't care whether they're running over EHCI or not,
+but they may want to check for "usb_device->speed == USB_SPEED_HIGH".
+High speed devices can do things that full speed (or low speed) ones
+can't, such as "high bandwidth" periodic (interrupt or ISO) transfers.
+Also, some values in device descriptors (such as polling intervals for
+periodic transfers) use different encodings when operating at high speed.
+
+However, do make a point of testing device drivers through USB 2.0 hubs.
+Those hubs report some failures, such as disconnections, differently when
+transaction translators are in use; some drivers have been seen to behave
+badly when they see different faults than OHCI or UHCI report.
+
+
+PERFORMANCE
+
+USB 2.0 throughput is gated by two main factors: how fast the host
+controller can process requests, and how fast devices can respond to
+them. The 480 Mbit/sec "raw transfer rate" is obeyed by all devices,
+but aggregate throughput is also affected by issues like delays between
+individual high speed packets, driver intelligence, and of course the
+overall system load. Latency is also a performance concern.
+
+Bulk transfers are most often used where throughput is an issue. It's
+good to keep in mind that bulk transfers are always in 512 byte packets,
+and at most 13 of those fit into one USB 2.0 microframe. Eight USB 2.0
+microframes fit in a USB 1.1 frame; a microframe is 1 msec/8 = 125 usec.
+
+So more than 50 MByte/sec is available for bulk transfers, when both
+hardware and device driver software allow it. Periodic transfer modes
+(isochronous and interrupt) allow the larger packet sizes which let you
+approach the quoted 480 MBit/sec transfer rate.
+
+Hardware Performance
+
+At this writing, individual USB 2.0 devices tend to max out at around
+20 MByte/sec transfer rates. This is of course subject to change;
+and some devices now go faster, while others go slower.
+
+The first NEC implementation of EHCI seems to have a hardware bottleneck
+at around 28 MByte/sec aggregate transfer rate. While this is clearly
+enough for a single device at 20 MByte/sec, putting three such devices
+onto one bus does not get you 60 MByte/sec. The issue appears to be
+that the controller hardware won't do concurrent USB and PCI access,
+so that it's only trying six (or maybe seven) USB transactions each
+microframe rather than thirteen. (Seems like a reasonable trade off
+for a product that beat all the others to market by over a year!)
+
+It's expected that newer implementations will better this, throwing
+more silicon real estate at the problem so that new motherboard chip
+sets will get closer to that 60 MByte/sec target. That includes an
+updated implementation from NEC, as well as other vendors' silicon.
+
+There's a minimum latency of one microframe (125 usec) for the host
+to receive interrupts from the EHCI controller indicating completion
+of requests. That latency is tunable; there's a module option. By
+default ehci-hcd driver uses the minimum latency, which means that if
+you issue a control or bulk request you can often expect to learn that
+it completed in less than 250 usec (depending on transfer size).
+
+Software Performance
+
+To get even 20 MByte/sec transfer rates, Linux-USB device drivers will
+need to keep the EHCI queue full. That means issuing large requests,
+or using bulk queuing if a series of small requests needs to be issued.
+When drivers don't do that, their performance results will show it.
+
+In typical situations, a usb_bulk_msg() loop writing out 4 KB chunks is
+going to waste more than half the USB 2.0 bandwidth. Delays between the
+I/O completion and the driver issuing the next request will take longer
+than the I/O. If that same loop used 16 KB chunks, it'd be better; a
+sequence of 128 KB chunks would waste a lot less.
+
+But rather than depending on such large I/O buffers to make synchronous
+I/O be efficient, it's better to just queue up several (bulk) requests
+to the HC, and wait for them all to complete (or be canceled on error).
+Such URB queuing should work with all the USB 1.1 HC drivers too.
+
+In the Linux 2.5 kernels, new usb_sg_*() api calls have been defined; they
+queue all the buffers from a scatterlist. They also use scatterlist DMA
+mapping (which might apply an IOMMU) and IRQ reduction, all of which will
+help make high speed transfers run as fast as they can.
+
+
+TBD: Interrupt and ISO transfer performance issues. Those periodic
+transfers are fully scheduled, so the main issue is likely to be how
+to trigger "high bandwidth" modes.
+
diff --git a/Documentation/usb/error-codes.txt b/Documentation/usb/error-codes.txt
new file mode 100644
index 0000000..9cf83e8
--- /dev/null
+++ b/Documentation/usb/error-codes.txt
@@ -0,0 +1,165 @@
+Revised: 2004-Oct-21
+
+This is the documentation of (hopefully) all possible error codes (and
+their interpretation) that can be returned from usbcore.
+
+Some of them are returned by the Host Controller Drivers (HCDs), which
+device drivers only see through usbcore. As a rule, all the HCDs should
+behave the same except for transfer speed dependent behaviors and the
+way certain faults are reported.
+
+
+**************************************************************************
+* Error codes returned by usb_submit_urb *
+**************************************************************************
+
+Non-USB-specific:
+
+0 URB submission went fine
+
+-ENOMEM no memory for allocation of internal structures
+
+USB-specific:
+
+-ENODEV specified USB-device or bus doesn't exist
+
+-ENOENT specified interface or endpoint does not exist or
+ is not enabled
+
+-ENXIO host controller driver does not support queuing of this type
+ of urb. (treat as a host controller bug.)
+
+-EINVAL a) Invalid transfer type specified (or not supported)
+ b) Invalid or unsupported periodic transfer interval
+ c) ISO: attempted to change transfer interval
+ d) ISO: number_of_packets is < 0
+ e) various other cases
+
+-EAGAIN a) specified ISO start frame too early
+ b) (using ISO-ASAP) too much scheduled for the future
+ wait some time and try again.
+
+-EFBIG Host controller driver can't schedule that many ISO frames.
+
+-EPIPE Specified endpoint is stalled. For non-control endpoints,
+ reset this status with usb_clear_halt().
+
+-EMSGSIZE (a) endpoint maxpacket size is zero; it is not usable
+ in the current interface altsetting.
+ (b) ISO packet is larger than the endpoint maxpacket.
+ (c) requested data transfer length is invalid: negative
+ or too large for the host controller.
+
+-ENOSPC This request would overcommit the usb bandwidth reserved
+ for periodic transfers (interrupt, isochronous).
+
+-ESHUTDOWN The device or host controller has been disabled due to some
+ problem that could not be worked around.
+
+-EPERM Submission failed because urb->reject was set.
+
+-EHOSTUNREACH URB was rejected because the device is suspended.
+
+
+**************************************************************************
+* Error codes returned by in urb->status *
+* or in iso_frame_desc[n].status (for ISO) *
+**************************************************************************
+
+USB device drivers may only test urb status values in completion handlers.
+This is because otherwise there would be a race between HCDs updating
+these values on one CPU, and device drivers testing them on another CPU.
+
+A transfer's actual_length may be positive even when an error has been
+reported. That's because transfers often involve several packets, so that
+one or more packets could finish before an error stops further endpoint I/O.
+
+
+0 Transfer completed successfully
+
+-ENOENT URB was synchronously unlinked by usb_unlink_urb
+
+-EINPROGRESS URB still pending, no results yet
+ (That is, if drivers see this it's a bug.)
+
+-EPROTO (*, **) a) bitstuff error
+ b) no response packet received within the
+ prescribed bus turn-around time
+ c) unknown USB error
+
+-EILSEQ (*, **) a) CRC mismatch
+ b) no response packet received within the
+ prescribed bus turn-around time
+ c) unknown USB error
+
+ Note that often the controller hardware does not
+ distinguish among cases a), b), and c), so a
+ driver cannot tell whether there was a protocol
+ error, a failure to respond (often caused by
+ device disconnect), or some other fault.
+
+-ETIME (**) No response packet received within the prescribed
+ bus turn-around time. This error may instead be
+ reported as -EPROTO or -EILSEQ.
+
+-ETIMEDOUT Synchronous USB message functions use this code
+ to indicate timeout expired before the transfer
+ completed, and no other error was reported by HC.
+
+-EPIPE (**) Endpoint stalled. For non-control endpoints,
+ reset this status with usb_clear_halt().
+
+-ECOMM During an IN transfer, the host controller
+ received data from an endpoint faster than it
+ could be written to system memory
+
+-ENOSR During an OUT transfer, the host controller
+ could not retrieve data from system memory fast
+ enough to keep up with the USB data rate
+
+-EOVERFLOW (*) The amount of data returned by the endpoint was
+ greater than either the max packet size of the
+ endpoint or the remaining buffer size. "Babble".
+
+-EREMOTEIO The data read from the endpoint did not fill the
+ specified buffer, and URB_SHORT_NOT_OK was set in
+ urb->transfer_flags.
+
+-ENODEV Device was removed. Often preceded by a burst of
+ other errors, since the hub driver doesn't detect
+ device removal events immediately.
+
+-EXDEV ISO transfer only partially completed
+ look at individual frame status for details
+
+-EINVAL ISO madness, if this happens: Log off and go home
+
+-ECONNRESET URB was asynchronously unlinked by usb_unlink_urb
+
+-ESHUTDOWN The device or host controller has been disabled due
+ to some problem that could not be worked around,
+ such as a physical disconnect.
+
+
+(*) Error codes like -EPROTO, -EILSEQ and -EOVERFLOW normally indicate
+hardware problems such as bad devices (including firmware) or cables.
+
+(**) This is also one of several codes that different kinds of host
+controller use to indicate a transfer has failed because of device
+disconnect. In the interval before the hub driver starts disconnect
+processing, devices may receive such fault reports for every request.
+
+
+
+**************************************************************************
+* Error codes returned by usbcore-functions *
+* (expect also other submit and transfer status codes) *
+**************************************************************************
+
+usb_register():
+-EINVAL error during registering new driver
+
+usb_get_*/usb_set_*():
+usb_control_msg():
+usb_bulk_msg():
+-ETIMEDOUT Timeout expired before the transfer completed.
diff --git a/Documentation/usb/gadget_printer.txt b/Documentation/usb/gadget_printer.txt
new file mode 100644
index 0000000..ad995bf
--- /dev/null
+++ b/Documentation/usb/gadget_printer.txt
@@ -0,0 +1,510 @@
+
+ Linux USB Printer Gadget Driver
+ 06/04/2007
+
+ Copyright (C) 2007 Craig W. Nadler <craig@nadler.us>
+
+
+
+GENERAL
+=======
+
+This driver may be used if you are writing printer firmware using Linux as
+the embedded OS. This driver has nothing to do with using a printer with
+your Linux host system.
+
+You will need a USB device controller and a Linux driver for it that accepts
+a gadget / "device class" driver using the Linux USB Gadget API. After the
+USB device controller driver is loaded then load the printer gadget driver.
+This will present a printer interface to the USB Host that your USB Device
+port is connected to.
+
+This driver is structured for printer firmware that runs in user mode. The
+user mode printer firmware will read and write data from the kernel mode
+printer gadget driver using a device file. The printer returns a printer status
+byte when the USB HOST sends a device request to get the printer status. The
+user space firmware can read or write this status byte using a device file
+/dev/g_printer . Both blocking and non-blocking read/write calls are supported.
+
+
+
+
+HOWTO USE THIS DRIVER
+=====================
+
+To load the USB device controller driver and the printer gadget driver. The
+following example uses the Netchip 2280 USB device controller driver:
+
+modprobe net2280
+modprobe g_printer
+
+
+The follow command line parameter can be used when loading the printer gadget
+(ex: modprobe g_printer idVendor=0x0525 idProduct=0xa4a8 ):
+
+idVendor - This is the Vendor ID used in the device descriptor. The default is
+ the Netchip vendor id 0x0525. YOU MUST CHANGE TO YOUR OWN VENDOR ID
+ BEFORE RELEASING A PRODUCT. If you plan to release a product and don't
+ already have a Vendor ID please see www.usb.org for details on how to
+ get one.
+
+idProduct - This is the Product ID used in the device descriptor. The default
+ is 0xa4a8, you should change this to an ID that's not used by any of
+ your other USB products if you have any. It would be a good idea to
+ start numbering your products starting with say 0x0001.
+
+bcdDevice - This is the version number of your product. It would be a good idea
+ to put your firmware version here.
+
+iManufacturer - A string containing the name of the Vendor.
+
+iProduct - A string containing the Product Name.
+
+iSerialNum - A string containing the Serial Number. This should be changed for
+ each unit of your product.
+
+iPNPstring - The PNP ID string used for this printer. You will want to set
+ either on the command line or hard code the PNP ID string used for
+ your printer product.
+
+qlen - The number of 8k buffers to use per endpoint. The default is 10, you
+ should tune this for your product. You may also want to tune the
+ size of each buffer for your product.
+
+
+
+
+USING THE EXAMPLE CODE
+======================
+
+This example code talks to stdout, instead of a print engine.
+
+To compile the test code below:
+
+1) save it to a file called prn_example.c
+2) compile the code with the follow command:
+ gcc prn_example.c -o prn_example
+
+
+
+To read printer data from the host to stdout:
+
+ # prn_example -read_data
+
+
+To write printer data from a file (data_file) to the host:
+
+ # cat data_file | prn_example -write_data
+
+
+To get the current printer status for the gadget driver:
+
+ # prn_example -get_status
+
+ Printer status is:
+ Printer is NOT Selected
+ Paper is Out
+ Printer OK
+
+
+To set printer to Selected/On-line:
+
+ # prn_example -selected
+
+
+To set printer to Not Selected/Off-line:
+
+ # prn_example -not_selected
+
+
+To set paper status to paper out:
+
+ # prn_example -paper_out
+
+
+To set paper status to paper loaded:
+
+ # prn_example -paper_loaded
+
+
+To set error status to printer OK:
+
+ # prn_example -no_error
+
+
+To set error status to ERROR:
+
+ # prn_example -error
+
+
+
+
+EXAMPLE CODE
+============
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <linux/poll.h>
+#include <sys/ioctl.h>
+#include <linux/usb/g_printer.h>
+
+#define PRINTER_FILE "/dev/g_printer"
+#define BUF_SIZE 512
+
+
+/*
+ * 'usage()' - Show program usage.
+ */
+
+static void
+usage(const char *option) /* I - Option string or NULL */
+{
+ if (option) {
+ fprintf(stderr,"prn_example: Unknown option \"%s\"!\n",
+ option);
+ }
+
+ fputs("\n", stderr);
+ fputs("Usage: prn_example -[options]\n", stderr);
+ fputs("Options:\n", stderr);
+ fputs("\n", stderr);
+ fputs("-get_status Get the current printer status.\n", stderr);
+ fputs("-selected Set the selected status to selected.\n", stderr);
+ fputs("-not_selected Set the selected status to NOT selected.\n",
+ stderr);
+ fputs("-error Set the error status to error.\n", stderr);
+ fputs("-no_error Set the error status to NO error.\n", stderr);
+ fputs("-paper_out Set the paper status to paper out.\n", stderr);
+ fputs("-paper_loaded Set the paper status to paper loaded.\n",
+ stderr);
+ fputs("-read_data Read printer data from driver.\n", stderr);
+ fputs("-write_data Write printer sata to driver.\n", stderr);
+ fputs("-NB_read_data (Non-Blocking) Read printer data from driver.\n",
+ stderr);
+ fputs("\n\n", stderr);
+
+ exit(1);
+}
+
+
+static int
+read_printer_data()
+{
+ struct pollfd fd[1];
+
+ /* Open device file for printer gadget. */
+ fd[0].fd = open(PRINTER_FILE, O_RDWR);
+ if (fd[0].fd < 0) {
+ printf("Error %d opening %s\n", fd[0].fd, PRINTER_FILE);
+ close(fd[0].fd);
+ return(-1);
+ }
+
+ fd[0].events = POLLIN | POLLRDNORM;
+
+ while (1) {
+ static char buf[BUF_SIZE];
+ int bytes_read;
+ int retval;
+
+ /* Wait for up to 1 second for data. */
+ retval = poll(fd, 1, 1000);
+
+ if (retval && (fd[0].revents & POLLRDNORM)) {
+
+ /* Read data from printer gadget driver. */
+ bytes_read = read(fd[0].fd, buf, BUF_SIZE);
+
+ if (bytes_read < 0) {
+ printf("Error %d reading from %s\n",
+ fd[0].fd, PRINTER_FILE);
+ close(fd[0].fd);
+ return(-1);
+ } else if (bytes_read > 0) {
+ /* Write data to standard OUTPUT (stdout). */
+ fwrite(buf, 1, bytes_read, stdout);
+ fflush(stdout);
+ }
+
+ }
+
+ }
+
+ /* Close the device file. */
+ close(fd[0].fd);
+
+ return 0;
+}
+
+
+static int
+write_printer_data()
+{
+ struct pollfd fd[1];
+
+ /* Open device file for printer gadget. */
+ fd[0].fd = open (PRINTER_FILE, O_RDWR);
+ if (fd[0].fd < 0) {
+ printf("Error %d opening %s\n", fd[0].fd, PRINTER_FILE);
+ close(fd[0].fd);
+ return(-1);
+ }
+
+ fd[0].events = POLLOUT | POLLWRNORM;
+
+ while (1) {
+ int retval;
+ static char buf[BUF_SIZE];
+ /* Read data from standard INPUT (stdin). */
+ int bytes_read = fread(buf, 1, BUF_SIZE, stdin);
+
+ if (!bytes_read) {
+ break;
+ }
+
+ while (bytes_read) {
+
+ /* Wait for up to 1 second to sent data. */
+ retval = poll(fd, 1, 1000);
+
+ /* Write data to printer gadget driver. */
+ if (retval && (fd[0].revents & POLLWRNORM)) {
+ retval = write(fd[0].fd, buf, bytes_read);
+ if (retval < 0) {
+ printf("Error %d writing to %s\n",
+ fd[0].fd,
+ PRINTER_FILE);
+ close(fd[0].fd);
+ return(-1);
+ } else {
+ bytes_read -= retval;
+ }
+
+ }
+
+ }
+
+ }
+
+ /* Wait until the data has been sent. */
+ fsync(fd[0].fd);
+
+ /* Close the device file. */
+ close(fd[0].fd);
+
+ return 0;
+}
+
+
+static int
+read_NB_printer_data()
+{
+ int fd;
+ static char buf[BUF_SIZE];
+ int bytes_read;
+
+ /* Open device file for printer gadget. */
+ fd = open(PRINTER_FILE, O_RDWR|O_NONBLOCK);
+ if (fd < 0) {
+ printf("Error %d opening %s\n", fd, PRINTER_FILE);
+ close(fd);
+ return(-1);
+ }
+
+ while (1) {
+ /* Read data from printer gadget driver. */
+ bytes_read = read(fd, buf, BUF_SIZE);
+ if (bytes_read <= 0) {
+ break;
+ }
+
+ /* Write data to standard OUTPUT (stdout). */
+ fwrite(buf, 1, bytes_read, stdout);
+ fflush(stdout);
+ }
+
+ /* Close the device file. */
+ close(fd);
+
+ return 0;
+}
+
+
+static int
+get_printer_status()
+{
+ int retval;
+ int fd;
+
+ /* Open device file for printer gadget. */
+ fd = open(PRINTER_FILE, O_RDWR);
+ if (fd < 0) {
+ printf("Error %d opening %s\n", fd, PRINTER_FILE);
+ close(fd);
+ return(-1);
+ }
+
+ /* Make the IOCTL call. */
+ retval = ioctl(fd, GADGET_GET_PRINTER_STATUS);
+ if (retval < 0) {
+ fprintf(stderr, "ERROR: Failed to set printer status\n");
+ return(-1);
+ }
+
+ /* Close the device file. */
+ close(fd);
+
+ return(retval);
+}
+
+
+static int
+set_printer_status(unsigned char buf, int clear_printer_status_bit)
+{
+ int retval;
+ int fd;
+
+ retval = get_printer_status();
+ if (retval < 0) {
+ fprintf(stderr, "ERROR: Failed to get printer status\n");
+ return(-1);
+ }
+
+ /* Open device file for printer gadget. */
+ fd = open(PRINTER_FILE, O_RDWR);
+
+ if (fd < 0) {
+ printf("Error %d opening %s\n", fd, PRINTER_FILE);
+ close(fd);
+ return(-1);
+ }
+
+ if (clear_printer_status_bit) {
+ retval &= ~buf;
+ } else {
+ retval |= buf;
+ }
+
+ /* Make the IOCTL call. */
+ if (ioctl(fd, GADGET_SET_PRINTER_STATUS, (unsigned char)retval)) {
+ fprintf(stderr, "ERROR: Failed to set printer status\n");
+ return(-1);
+ }
+
+ /* Close the device file. */
+ close(fd);
+
+ return 0;
+}
+
+
+static int
+display_printer_status()
+{
+ char printer_status;
+
+ printer_status = get_printer_status();
+ if (printer_status < 0) {
+ fprintf(stderr, "ERROR: Failed to get printer status\n");
+ return(-1);
+ }
+
+ printf("Printer status is:\n");
+ if (printer_status & PRINTER_SELECTED) {
+ printf(" Printer is Selected\n");
+ } else {
+ printf(" Printer is NOT Selected\n");
+ }
+ if (printer_status & PRINTER_PAPER_EMPTY) {
+ printf(" Paper is Out\n");
+ } else {
+ printf(" Paper is Loaded\n");
+ }
+ if (printer_status & PRINTER_NOT_ERROR) {
+ printf(" Printer OK\n");
+ } else {
+ printf(" Printer ERROR\n");
+ }
+
+ return(0);
+}
+
+
+int
+main(int argc, char *argv[])
+{
+ int i; /* Looping var */
+ int retval = 0;
+
+ /* No Args */
+ if (argc == 1) {
+ usage(0);
+ exit(0);
+ }
+
+ for (i = 1; i < argc && !retval; i ++) {
+
+ if (argv[i][0] != '-') {
+ continue;
+ }
+
+ if (!strcmp(argv[i], "-get_status")) {
+ if (display_printer_status()) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-paper_loaded")) {
+ if (set_printer_status(PRINTER_PAPER_EMPTY, 1)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-paper_out")) {
+ if (set_printer_status(PRINTER_PAPER_EMPTY, 0)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-selected")) {
+ if (set_printer_status(PRINTER_SELECTED, 0)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-not_selected")) {
+ if (set_printer_status(PRINTER_SELECTED, 1)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-error")) {
+ if (set_printer_status(PRINTER_NOT_ERROR, 1)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-no_error")) {
+ if (set_printer_status(PRINTER_NOT_ERROR, 0)) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-read_data")) {
+ if (read_printer_data()) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-write_data")) {
+ if (write_printer_data()) {
+ retval = 1;
+ }
+
+ } else if (!strcmp(argv[i], "-NB_read_data")) {
+ if (read_NB_printer_data()) {
+ retval = 1;
+ }
+
+ } else {
+ usage(argv[i]);
+ retval = 1;
+ }
+ }
+
+ exit(retval);
+}
diff --git a/Documentation/usb/gadget_serial.txt b/Documentation/usb/gadget_serial.txt
new file mode 100644
index 0000000..eac7df9
--- /dev/null
+++ b/Documentation/usb/gadget_serial.txt
@@ -0,0 +1,349 @@
+
+ Linux Gadget Serial Driver v2.0
+ 11/20/2004
+ (updated 8-May-2008 for v2.3)
+
+
+License and Disclaimer
+----------------------
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 2 of
+the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public
+License along with this program; if not, write to the Free
+Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+MA 02111-1307 USA.
+
+This document and the gadget serial driver itself are
+Copyright (C) 2004 by Al Borchers (alborchers@steinerpoint.com).
+
+If you have questions, problems, or suggestions for this driver
+please contact Al Borchers at alborchers@steinerpoint.com.
+
+
+Prerequisites
+-------------
+Versions of the gadget serial driver are available for the
+2.4 Linux kernels, but this document assumes you are using
+version 2.3 or later of the gadget serial driver in a 2.6
+Linux kernel.
+
+This document assumes that you are familiar with Linux and
+Windows and know how to configure and build Linux kernels, run
+standard utilities, use minicom and HyperTerminal, and work with
+USB and serial devices. It also assumes you configure the Linux
+gadget and usb drivers as modules.
+
+With version 2.3 of the driver, major and minor device nodes are
+no longer statically defined. Your Linux based system should mount
+sysfs in /sys, and use "mdev" (in Busybox) or "udev" to make the
+/dev nodes matching the sysfs /sys/class/tty files.
+
+
+
+Overview
+--------
+The gadget serial driver is a Linux USB gadget driver, a USB device
+side driver. It runs on a Linux system that has USB device side
+hardware; for example, a PDA, an embedded Linux system, or a PC
+with a USB development card.
+
+The gadget serial driver talks over USB to either a CDC ACM driver
+or a generic USB serial driver running on a host PC.
+
+ Host
+ --------------------------------------
+ | Host-Side CDC ACM USB Host |
+ | Operating | or | Controller | USB
+ | System | Generic USB | Driver |--------
+ | (Linux or | Serial | and | |
+ | Windows) Driver USB Stack | |
+ -------------------------------------- |
+ |
+ |
+ |
+ Gadget |
+ -------------------------------------- |
+ | Gadget USB Periph. | |
+ | Device-Side | Gadget | Controller | |
+ | Linux | Serial | Driver |--------
+ | Operating | Driver | and |
+ | System USB Stack |
+ --------------------------------------
+
+On the device-side Linux system, the gadget serial driver looks
+like a serial device.
+
+On the host-side system, the gadget serial device looks like a
+CDC ACM compliant class device or a simple vendor specific device
+with bulk in and bulk out endpoints, and it is treated similarly
+to other serial devices.
+
+The host side driver can potentially be any ACM compliant driver
+or any driver that can talk to a device with a simple bulk in/out
+interface. Gadget serial has been tested with the Linux ACM driver,
+the Windows usbser.sys ACM driver, and the Linux USB generic serial
+driver.
+
+With the gadget serial driver and the host side ACM or generic
+serial driver running, you should be able to communicate between
+the host and the gadget side systems as if they were connected by a
+serial cable.
+
+The gadget serial driver only provides simple unreliable data
+communication. It does not yet handle flow control or many other
+features of normal serial devices.
+
+
+Installing the Gadget Serial Driver
+-----------------------------------
+To use the gadget serial driver you must configure the Linux gadget
+side kernel for "Support for USB Gadgets", for a "USB Peripheral
+Controller" (for example, net2280), and for the "Serial Gadget"
+driver. All this are listed under "USB Gadget Support" when
+configuring the kernel. Then rebuild and install the kernel or
+modules.
+
+Then you must load the gadget serial driver. To load it as an
+ACM device (recommended for interoperability), do this:
+
+ modprobe g_serial
+
+To load it as a vendor specific bulk in/out device, do this:
+
+ modprobe g_serial use_acm=0
+
+This will also automatically load the underlying gadget peripheral
+controller driver. This must be done each time you reboot the gadget
+side Linux system. You can add this to the start up scripts, if
+desired.
+
+Your system should use mdev (from busybox) or udev to make the
+device nodes. After this gadget driver has been set up you should
+then see a /dev/ttyGS0 node:
+
+ # ls -l /dev/ttyGS0 | cat
+ crw-rw---- 1 root root 253, 0 May 8 14:10 /dev/ttyGS0
+ #
+
+Note that the major number (253, above) is system-specific. If
+you need to create /dev nodes by hand, the right numbers to use
+will be in the /sys/class/tty/ttyGS0/dev file.
+
+When you link this gadget driver early, perhaps even statically,
+you may want to set up an /etc/inittab entry to run "getty" on it.
+The /dev/ttyGS0 line should work like most any other serial port.
+
+
+If gadget serial is loaded as an ACM device you will want to use
+either the Windows or Linux ACM driver on the host side. If gadget
+serial is loaded as a bulk in/out device, you will want to use the
+Linux generic serial driver on the host side. Follow the appropriate
+instructions below to install the host side driver.
+
+
+Installing the Windows Host ACM Driver
+--------------------------------------
+To use the Windows ACM driver you must have the files "gserial.inf"
+and "usbser.sys" together in a folder on the Windows machine.
+
+The "gserial.inf" file is given here.
+
+-------------------- CUT HERE --------------------
+[Version]
+Signature="$Windows NT$"
+Class=Ports
+ClassGuid={4D36E978-E325-11CE-BFC1-08002BE10318}
+Provider=%LINUX%
+DriverVer=08/17/2004,0.0.2.0
+; Copyright (C) 2004 Al Borchers (alborchers@steinerpoint.com)
+
+[Manufacturer]
+%LINUX%=GSerialDeviceList
+
+[GSerialDeviceList]
+%GSERIAL%=GSerialInstall, USB\VID_0525&PID_A4A7
+
+[DestinationDirs]
+DefaultDestDir=10,System32\Drivers
+
+[GSerialInstall]
+CopyFiles=GSerialCopyFiles
+AddReg=GSerialAddReg
+
+[GSerialCopyFiles]
+usbser.sys
+
+[GSerialAddReg]
+HKR,,DevLoader,,*ntkern
+HKR,,NTMPDriver,,usbser.sys
+HKR,,EnumPropPages32,,"MsPorts.dll,SerialPortPropPageProvider"
+
+[GSerialInstall.Services]
+AddService = usbser,0x0002,GSerialService
+
+[GSerialService]
+DisplayName = %GSERIAL_DISPLAY_NAME%
+ServiceType = 1 ; SERVICE_KERNEL_DRIVER
+StartType = 3 ; SERVICE_DEMAND_START
+ErrorControl = 1 ; SERVICE_ERROR_NORMAL
+ServiceBinary = %10%\System32\Drivers\usbser.sys
+LoadOrderGroup = Base
+
+[Strings]
+LINUX = "Linux"
+GSERIAL = "Gadget Serial"
+GSERIAL_DISPLAY_NAME = "USB Gadget Serial Driver"
+-------------------- CUT HERE --------------------
+
+The "usbser.sys" file comes with various versions of Windows.
+For example, it can be found on Windows XP typically in
+
+ C:\WINDOWS\Driver Cache\i386\driver.cab
+
+Or it can be found on the Windows 98SE CD in the "win98" folder
+in the "DRIVER11.CAB" through "DRIVER20.CAB" cab files. You will
+need the DOS "expand" program, the Cygwin "cabextract" program, or
+a similar program to unpack these cab files and extract "usbser.sys".
+
+For example, to extract "usbser.sys" into the current directory
+on Windows XP, open a DOS window and run a command like
+
+ expand C:\WINDOWS\Driver~1\i386\driver.cab -F:usbser.sys .
+
+(Thanks to Nishant Kamat for pointing out this DOS command.)
+
+When the gadget serial driver is loaded and the USB device connected
+to the Windows host with a USB cable, Windows should recognize the
+gadget serial device and ask for a driver. Tell Windows to find the
+driver in the folder that contains "gserial.inf" and "usbser.sys".
+
+For example, on Windows XP, when the gadget serial device is first
+plugged in, the "Found New Hardware Wizard" starts up. Select
+"Install from a list or specific location (Advanced)", then on
+the next screen select "Include this location in the search" and
+enter the path or browse to the folder containing "gserial.inf" and
+"usbser.sys". Windows will complain that the Gadget Serial driver
+has not passed Windows Logo testing, but select "Continue anyway"
+and finish the driver installation.
+
+On Windows XP, in the "Device Manager" (under "Control Panel",
+"System", "Hardware") expand the "Ports (COM & LPT)" entry and you
+should see "Gadget Serial" listed as the driver for one of the COM
+ports.
+
+To uninstall the Windows XP driver for "Gadget Serial", right click
+on the "Gadget Serial" entry in the "Device Manager" and select
+"Uninstall".
+
+
+Installing the Linux Host ACM Driver
+------------------------------------
+To use the Linux ACM driver you must configure the Linux host side
+kernel for "Support for Host-side USB" and for "USB Modem (CDC ACM)
+support".
+
+Once the gadget serial driver is loaded and the USB device connected
+to the Linux host with a USB cable, the host system should recognize
+the gadget serial device. For example, the command
+
+ cat /proc/bus/usb/devices
+
+should show something like this:
+
+T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 5 Spd=480 MxCh= 0
+D: Ver= 2.00 Cls=02(comm.) Sub=00 Prot=00 MxPS=64 #Cfgs= 1
+P: Vendor=0525 ProdID=a4a7 Rev= 2.01
+S: Manufacturer=Linux 2.6.8.1 with net2280
+S: Product=Gadget Serial
+S: SerialNumber=0
+C:* #Ifs= 2 Cfg#= 2 Atr=c0 MxPwr= 2mA
+I: If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=02 Prot=01 Driver=acm
+E: Ad=83(I) Atr=03(Int.) MxPS= 8 Ivl=32ms
+I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=acm
+E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
+E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
+
+If the host side Linux system is configured properly, the ACM driver
+should be loaded automatically. The command "lsmod" should show the
+"acm" module is loaded.
+
+
+Installing the Linux Host Generic USB Serial Driver
+---------------------------------------------------
+To use the Linux generic USB serial driver you must configure the
+Linux host side kernel for "Support for Host-side USB", for "USB
+Serial Converter support", and for the "USB Generic Serial Driver".
+
+Once the gadget serial driver is loaded and the USB device connected
+to the Linux host with a USB cable, the host system should recognize
+the gadget serial device. For example, the command
+
+ cat /proc/bus/usb/devices
+
+should show something like this:
+
+T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 6 Spd=480 MxCh= 0
+D: Ver= 2.00 Cls=ff(vend.) Sub=00 Prot=00 MxPS=64 #Cfgs= 1
+P: Vendor=0525 ProdID=a4a6 Rev= 2.01
+S: Manufacturer=Linux 2.6.8.1 with net2280
+S: Product=Gadget Serial
+S: SerialNumber=0
+C:* #Ifs= 1 Cfg#= 1 Atr=c0 MxPwr= 2mA
+I: If#= 0 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=serial
+E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
+E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
+
+You must explicitly load the usbserial driver with parameters to
+configure it to recognize the gadget serial device, like this:
+
+ modprobe usbserial vendor=0x0525 product=0xA4A6
+
+If everything is working, usbserial will print a message in the
+system log saying something like "Gadget Serial converter now
+attached to ttyUSB0".
+
+
+Testing with Minicom or HyperTerminal
+-------------------------------------
+Once the gadget serial driver and the host driver are both installed,
+and a USB cable connects the gadget device to the host, you should
+be able to communicate over USB between the gadget and host systems.
+You can use minicom or HyperTerminal to try this out.
+
+On the gadget side run "minicom -s" to configure a new minicom
+session. Under "Serial port setup" set "/dev/ttygserial" as the
+"Serial Device". Set baud rate, data bits, parity, and stop bits,
+to 9600, 8, none, and 1--these settings mostly do not matter.
+Under "Modem and dialing" erase all the modem and dialing strings.
+
+On a Linux host running the ACM driver, configure minicom similarly
+but use "/dev/ttyACM0" as the "Serial Device". (If you have other
+ACM devices connected, change the device name appropriately.)
+
+On a Linux host running the USB generic serial driver, configure
+minicom similarly, but use "/dev/ttyUSB0" as the "Serial Device".
+(If you have other USB serial devices connected, change the device
+name appropriately.)
+
+On a Windows host configure a new HyperTerminal session to use the
+COM port assigned to Gadget Serial. The "Port Settings" will be
+set automatically when HyperTerminal connects to the gadget serial
+device, so you can leave them set to the default values--these
+settings mostly do not matter.
+
+With minicom configured and running on the gadget side and with
+minicom or HyperTerminal configured and running on the host side,
+you should be able to send data back and forth between the gadget
+side and host side systems. Anything you type on the terminal
+window on the gadget side should appear in the terminal window on
+the host side and vice versa.
+
+
diff --git a/Documentation/usb/hiddev.txt b/Documentation/usb/hiddev.txt
new file mode 100644
index 0000000..6e8c9f1
--- /dev/null
+++ b/Documentation/usb/hiddev.txt
@@ -0,0 +1,205 @@
+Care and feeding of your Human Interface Devices
+
+INTRODUCTION
+
+In addition to the normal input type HID devices, USB also uses the
+human interface device protocols for things that are not really human
+interfaces, but have similar sorts of communication needs. The two big
+examples for this are power devices (especially uninterruptable power
+supplies) and monitor control on higher end monitors.
+
+To support these disparate requirements, the Linux USB system provides
+HID events to two separate interfaces:
+* the input subsystem, which converts HID events into normal input
+device interfaces (such as keyboard, mouse and joystick) and a
+normalised event interface - see Documentation/input/input.txt
+* the hiddev interface, which provides fairly raw HID events
+
+The data flow for a HID event produced by a device is something like
+the following :
+
+ usb.c ---> hid-core.c ----> hid-input.c ----> [keyboard/mouse/joystick/event]
+ |
+ |
+ --> hiddev.c ----> POWER / MONITOR CONTROL
+
+In addition, other subsystems (apart from USB) can potentially feed
+events into the input subsystem, but these have no effect on the hid
+device interface.
+
+USING THE HID DEVICE INTERFACE
+
+The hiddev interface is a char interface using the normal USB major,
+with the minor numbers starting at 96 and finishing at 111. Therefore,
+you need the following commands:
+mknod /dev/usb/hiddev0 c 180 96
+mknod /dev/usb/hiddev1 c 180 97
+mknod /dev/usb/hiddev2 c 180 98
+mknod /dev/usb/hiddev3 c 180 99
+mknod /dev/usb/hiddev4 c 180 100
+mknod /dev/usb/hiddev5 c 180 101
+mknod /dev/usb/hiddev6 c 180 102
+mknod /dev/usb/hiddev7 c 180 103
+mknod /dev/usb/hiddev8 c 180 104
+mknod /dev/usb/hiddev9 c 180 105
+mknod /dev/usb/hiddev10 c 180 106
+mknod /dev/usb/hiddev11 c 180 107
+mknod /dev/usb/hiddev12 c 180 108
+mknod /dev/usb/hiddev13 c 180 109
+mknod /dev/usb/hiddev14 c 180 110
+mknod /dev/usb/hiddev15 c 180 111
+
+So you point your hiddev compliant user-space program at the correct
+interface for your device, and it all just works.
+
+Assuming that you have a hiddev compliant user-space program, of
+course. If you need to write one, read on.
+
+
+THE HIDDEV API
+This description should be read in conjunction with the HID
+specification, freely available from http://www.usb.org, and
+conveniently linked of http://www.linux-usb.org.
+
+The hiddev API uses a read() interface, and a set of ioctl() calls.
+
+HID devices exchange data with the host computer using data
+bundles called "reports". Each report is divided into "fields",
+each of which can have one or more "usages". In the hid-core,
+each one of these usages has a single signed 32 bit value.
+
+read():
+This is the event interface. When the HID device's state changes,
+it performs an interrupt transfer containing a report which contains
+the changed value. The hid-core.c module parses the report, and
+returns to hiddev.c the individual usages that have changed within
+the report. In its basic mode, the hiddev will make these individual
+usage changes available to the reader using a struct hiddev_event:
+
+ struct hiddev_event {
+ unsigned hid;
+ signed int value;
+ };
+
+containing the HID usage identifier for the status that changed, and
+the value that it was changed to. Note that the structure is defined
+within <linux/hiddev.h>, along with some other useful #defines and
+structures. The HID usage identifier is a composite of the HID usage
+page shifted to the 16 high order bits ORed with the usage code. The
+behavior of the read() function can be modified using the HIDIOCSFLAG
+ioctl() described below.
+
+
+ioctl():
+This is the control interface. There are a number of controls:
+
+HIDIOCGVERSION - int (read)
+Gets the version code out of the hiddev driver.
+
+HIDIOCAPPLICATION - (none)
+This ioctl call returns the HID application usage associated with the
+hid device. The third argument to ioctl() specifies which application
+index to get. This is useful when the device has more than one
+application collection. If the index is invalid (greater or equal to
+the number of application collections this device has) the ioctl
+returns -1. You can find out beforehand how many application
+collections the device has from the num_applications field from the
+hiddev_devinfo structure.
+
+HIDIOCGCOLLECTIONINFO - struct hiddev_collection_info (read/write)
+This returns a superset of the information above, providing not only
+application collections, but all the collections the device has. It
+also returns the level the collection lives in the hierarchy.
+The user passes in a hiddev_collection_info struct with the index
+field set to the index that should be returned. The ioctl fills in
+the other fields. If the index is larger than the last collection
+index, the ioctl returns -1 and sets errno to -EINVAL.
+
+HIDIOCGDEVINFO - struct hiddev_devinfo (read)
+Gets a hiddev_devinfo structure which describes the device.
+
+HIDIOCGSTRING - struct hiddev_string_descriptor (read/write)
+Gets a string descriptor from the device. The caller must fill in the
+"index" field to indicate which descriptor should be returned.
+
+HIDIOCINITREPORT - (none)
+Instructs the kernel to retrieve all input and feature report values
+from the device. At this point, all the usage structures will contain
+current values for the device, and will maintain it as the device
+changes. Note that the use of this ioctl is unnecessary in general,
+since later kernels automatically initialize the reports from the
+device at attach time.
+
+HIDIOCGNAME - string (variable length)
+Gets the device name
+
+HIDIOCGREPORT - struct hiddev_report_info (write)
+Instructs the kernel to get a feature or input report from the device,
+in order to selectively update the usage structures (in contrast to
+INITREPORT).
+
+HIDIOCSREPORT - struct hiddev_report_info (write)
+Instructs the kernel to send a report to the device. This report can
+be filled in by the user through HIDIOCSUSAGE calls (below) to fill in
+individual usage values in the report before sending the report in full
+to the device.
+
+HIDIOCGREPORTINFO - struct hiddev_report_info (read/write)
+Fills in a hiddev_report_info structure for the user. The report is
+looked up by type (input, output or feature) and id, so these fields
+must be filled in by the user. The ID can be absolute -- the actual
+report id as reported by the device -- or relative --
+HID_REPORT_ID_FIRST for the first report, and (HID_REPORT_ID_NEXT |
+report_id) for the next report after report_id. Without a-priori
+information about report ids, the right way to use this ioctl is to
+use the relative IDs above to enumerate the valid IDs. The ioctl
+returns non-zero when there is no more next ID. The real report ID is
+filled into the returned hiddev_report_info structure.
+
+HIDIOCGFIELDINFO - struct hiddev_field_info (read/write)
+Returns the field information associated with a report in a
+hiddev_field_info structure. The user must fill in report_id and
+report_type in this structure, as above. The field_index should also
+be filled in, which should be a number from 0 and maxfield-1, as
+returned from a previous HIDIOCGREPORTINFO call.
+
+HIDIOCGUCODE - struct hiddev_usage_ref (read/write)
+Returns the usage_code in a hiddev_usage_ref structure, given that
+given its report type, report id, field index, and index within the
+field have already been filled into the structure.
+
+HIDIOCGUSAGE - struct hiddev_usage_ref (read/write)
+Returns the value of a usage in a hiddev_usage_ref structure. The
+usage to be retrieved can be specified as above, or the user can
+choose to fill in the report_type field and specify the report_id as
+HID_REPORT_ID_UNKNOWN. In this case, the hiddev_usage_ref will be
+filled in with the report and field information associated with this
+usage if it is found.
+
+HIDIOCSUSAGE - struct hiddev_usage_ref (write)
+Sets the value of a usage in an output report. The user fills in
+the hiddev_usage_ref structure as above, but additionally fills in
+the value field.
+
+HIDIOGCOLLECTIONINDEX - struct hiddev_usage_ref (write)
+Returns the collection index associated with this usage. This
+indicates where in the collection hierarchy this usage sits.
+
+HIDIOCGFLAG - int (read)
+HIDIOCSFLAG - int (write)
+These operations respectively inspect and replace the mode flags
+that influence the read() call above. The flags are as follows:
+
+ HIDDEV_FLAG_UREF - read() calls will now return
+ struct hiddev_usage_ref instead of struct hiddev_event.
+ This is a larger structure, but in situations where the
+ device has more than one usage in its reports with the
+ same usage code, this mode serves to resolve such
+ ambiguity.
+
+ HIDDEV_FLAG_REPORT - This flag can only be used in conjunction
+ with HIDDEV_FLAG_UREF. With this flag set, when the device
+ sends a report, a struct hiddev_usage_ref will be returned
+ to read() filled in with the report_type and report_id, but
+ with field_index set to FIELD_INDEX_NONE. This serves as
+ additional notification when the device has sent a report.
diff --git a/Documentation/usb/hotplug.txt b/Documentation/usb/hotplug.txt
new file mode 100644
index 0000000..f531706
--- /dev/null
+++ b/Documentation/usb/hotplug.txt
@@ -0,0 +1,148 @@
+LINUX HOTPLUGGING
+
+In hotpluggable busses like USB (and Cardbus PCI), end-users plug devices
+into the bus with power on. In most cases, users expect the devices to become
+immediately usable. That means the system must do many things, including:
+
+ - Find a driver that can handle the device. That may involve
+ loading a kernel module; newer drivers can use module-init-tools
+ to publish their device (and class) support to user utilities.
+
+ - Bind a driver to that device. Bus frameworks do that using a
+ device driver's probe() routine.
+
+ - Tell other subsystems to configure the new device. Print
+ queues may need to be enabled, networks brought up, disk
+ partitions mounted, and so on. In some cases these will
+ be driver-specific actions.
+
+This involves a mix of kernel mode and user mode actions. Making devices
+be immediately usable means that any user mode actions can't wait for an
+administrator to do them: the kernel must trigger them, either passively
+(triggering some monitoring daemon to invoke a helper program) or
+actively (calling such a user mode helper program directly).
+
+Those triggered actions must support a system's administrative policies;
+such programs are called "policy agents" here. Typically they involve
+shell scripts that dispatch to more familiar administration tools.
+
+Because some of those actions rely on information about drivers (metadata)
+that is currently available only when the drivers are dynamically linked,
+you get the best hotplugging when you configure a highly modular system.
+
+
+KERNEL HOTPLUG HELPER (/sbin/hotplug)
+
+When you compile with CONFIG_HOTPLUG, you get a new kernel parameter:
+/proc/sys/kernel/hotplug, which normally holds the pathname "/sbin/hotplug".
+That parameter names a program which the kernel may invoke at various times.
+
+The /sbin/hotplug program can be invoked by any subsystem as part of its
+reaction to a configuration change, from a thread in that subsystem.
+Only one parameter is required: the name of a subsystem being notified of
+some kernel event. That name is used as the first key for further event
+dispatch; any other argument and environment parameters are specified by
+the subsystem making that invocation.
+
+Hotplug software and other resources is available at:
+
+ http://linux-hotplug.sourceforge.net
+
+Mailing list information is also available at that site.
+
+
+--------------------------------------------------------------------------
+
+
+USB POLICY AGENT
+
+The USB subsystem currently invokes /sbin/hotplug when USB devices
+are added or removed from system. The invocation is done by the kernel
+hub daemon thread [khubd], or else as part of root hub initialization
+(done by init, modprobe, kapmd, etc). Its single command line parameter
+is the string "usb", and it passes these environment variables:
+
+ ACTION ... "add", "remove"
+ PRODUCT ... USB vendor, product, and version codes (hex)
+ TYPE ... device class codes (decimal)
+ INTERFACE ... interface 0 class codes (decimal)
+
+If "usbdevfs" is configured, DEVICE and DEVFS are also passed. DEVICE is
+the pathname of the device, and is useful for devices with multiple and/or
+alternate interfaces that complicate driver selection. By design, USB
+hotplugging is independent of "usbdevfs": you can do most essential parts
+of USB device setup without using that filesystem, and without running a
+user mode daemon to detect changes in system configuration.
+
+Currently available policy agent implementations can load drivers for
+modules, and can invoke driver-specific setup scripts. The newest ones
+leverage USB module-init-tools support. Later agents might unload drivers.
+
+
+USB MODUTILS SUPPORT
+
+Current versions of module-init-tools will create a "modules.usbmap" file
+which contains the entries from each driver's MODULE_DEVICE_TABLE. Such
+files can be used by various user mode policy agents to make sure all the
+right driver modules get loaded, either at boot time or later.
+
+See <linux/usb.h> for full information about such table entries; or look
+at existing drivers. Each table entry describes one or more criteria to
+be used when matching a driver to a device or class of devices. The
+specific criteria are identified by bits set in "match_flags", paired
+with field values. You can construct the criteria directly, or with
+macros such as these, and use driver_info to store more information.
+
+ USB_DEVICE (vendorId, productId)
+ ... matching devices with specified vendor and product ids
+ USB_DEVICE_VER (vendorId, productId, lo, hi)
+ ... like USB_DEVICE with lo <= productversion <= hi
+ USB_INTERFACE_INFO (class, subclass, protocol)
+ ... matching specified interface class info
+ USB_DEVICE_INFO (class, subclass, protocol)
+ ... matching specified device class info
+
+A short example, for a driver that supports several specific USB devices
+and their quirks, might have a MODULE_DEVICE_TABLE like this:
+
+ static const struct usb_device_id mydriver_id_table = {
+ { USB_DEVICE (0x9999, 0xaaaa), driver_info: QUIRK_X },
+ { USB_DEVICE (0xbbbb, 0x8888), driver_info: QUIRK_Y|QUIRK_Z },
+ ...
+ { } /* end with an all-zeroes entry */
+ }
+ MODULE_DEVICE_TABLE (usb, mydriver_id_table);
+
+Most USB device drivers should pass these tables to the USB subsystem as
+well as to the module management subsystem. Not all, though: some driver
+frameworks connect using interfaces layered over USB, and so they won't
+need such a "struct usb_driver".
+
+Drivers that connect directly to the USB subsystem should be declared
+something like this:
+
+ static struct usb_driver mydriver = {
+ .name = "mydriver",
+ .id_table = mydriver_id_table,
+ .probe = my_probe,
+ .disconnect = my_disconnect,
+
+ /*
+ if using the usb chardev framework:
+ .minor = MY_USB_MINOR_START,
+ .fops = my_file_ops,
+ if exposing any operations through usbdevfs:
+ .ioctl = my_ioctl,
+ */
+ }
+
+When the USB subsystem knows about a driver's device ID table, it's used when
+choosing drivers to probe(). The thread doing new device processing checks
+drivers' device ID entries from the MODULE_DEVICE_TABLE against interface and
+device descriptors for the device. It will only call probe() if there is a
+match, and the third argument to probe() will be the entry that matched.
+
+If you don't provide an id_table for your driver, then your driver may get
+probed for each new device; the third parameter to probe() will be null.
+
+
diff --git a/Documentation/usb/iuu_phoenix.txt b/Documentation/usb/iuu_phoenix.txt
new file mode 100644
index 0000000..e5f0480
--- /dev/null
+++ b/Documentation/usb/iuu_phoenix.txt
@@ -0,0 +1,84 @@
+Infinity Usb Unlimited Readme
+-----------------------------
+
+Hi all,
+
+
+This module provide a serial interface to use your
+IUU unit in phoenix mode. Loading this module will
+bring a ttyUSB[0-x] interface. This driver must be
+used by your favorite application to pilot the IUU
+
+This driver is still in beta stage, so bugs can
+occur and your system may freeze. As far I now,
+I never had any problem with it, but I'm not a real
+guru, so don't blame me if your system is unstable
+
+You can plug more than one IUU. Every unit will
+have his own device file(/dev/ttyUSB0,/dev/ttyUSB1,...)
+
+
+
+How to tune the reader speed ?
+
+ A few parameters can be used at load time
+ To use parameters, just unload the module if it is
+ already loaded and use modprobe iuu_phoenix param=value.
+ In case of prebuilt module, use the command
+ insmod iuu_phoenix param=value.
+
+ Example:
+
+ modprobe iuu_phoenix clockmode=3
+
+ The parameters are:
+
+ parm: clockmode:1=3Mhz579,2=3Mhz680,3=6Mhz (int)
+ parm: boost:overclock boost percent 100 to 500 (int)
+ parm: cdmode:Card detect mode 0=none, 1=CD, 2=!CD, 3=DSR, 4=!DSR, 5=CTS, 6=!CTS, 7=RING, 8=!RING (int)
+ parm: xmas:xmas color enabled or not (bool)
+ parm: debug:Debug enabled or not (bool)
+
+- clockmode will provide 3 different base settings commonly adopted by
+ different software:
+ 1. 3Mhz579
+ 2. 3Mhz680
+ 3. 6Mhz
+
+- boost provide a way to overclock the reader ( my favorite :-) )
+ For example to have best performance than a simple clockmode=3, try this:
+
+ modprobe boost=195
+
+ This will put the reader in a base of 3Mhz579 but boosted a 195 % !
+ the real clock will be now : 6979050 Hz ( 6Mhz979 ) and will increase
+ the speed to a score 10 to 20% better than the simple clockmode=3 !!!
+
+
+- cdmode permit to setup the signal used to inform the userland ( ioctl answer )
+ if the card is present or not. Eight signals are possible.
+
+- xmas is completely useless except for your eyes. This is one of my friend who was
+ so sad to have a nice device like the iuu without seeing all color range available.
+ So I have added this option to permit him to see a lot of color ( each activity change the color
+ and the frequency randomly )
+
+- debug will produce a lot of debugging messages...
+
+
+ Last notes:
+
+ Don't worry about the serial settings, the serial emulation
+ is an abstraction, so use any speed or parity setting will
+ work. ( This will not change anything ).Later I will perhaps
+ use this settings to deduce de boost but is that feature
+ really necessary ?
+ The autodetect feature used is the serial CD. If that doesn't
+ work for your software, disable detection mechanism in it.
+
+
+ Have fun !
+
+ Alain Degreffe
+
+ eczema(at)ecze.com
diff --git a/Documentation/usb/linux.inf b/Documentation/usb/linux.inf
new file mode 100644
index 0000000..2f7217d
--- /dev/null
+++ b/Documentation/usb/linux.inf
@@ -0,0 +1,200 @@
+; MS-Windows driver config matching some basic modes of the
+; Linux-USB Ethernet/RNDIS gadget firmware:
+;
+; - RNDIS plus CDC Ethernet ... this may be familiar as a DOCSIS
+; cable modem profile, and supports most non-Microsoft USB hosts
+;
+; - RNDIS plus CDC Subset ... used by hardware that incapable of
+; full CDC Ethernet support.
+;
+; Microsoft only directly supports RNDIS drivers, and bundled them into XP.
+; The Microsoft "Remote NDIS USB Driver Kit" is currently found at:
+; http://www.microsoft.com/whdc/hwdev/resources/HWservices/rndis.mspx
+
+
+[Version]
+Signature = "$CHICAGO$"
+Class = Net
+ClassGUID = {4d36e972-e325-11ce-bfc1-08002be10318}
+Provider = %Linux%
+Compatible = 1
+MillenniumPreferred = .ME
+DriverVer = 03/30/2004,0.0.0.0
+; catalog file would be used by WHQL
+;CatalogFile = Linux.cat
+
+[Manufacturer]
+%Linux% = LinuxDevices,NT.5.1
+
+[LinuxDevices]
+; NetChip IDs, used by both firmware modes
+%LinuxDevice% = RNDIS, USB\VID_0525&PID_a4a2
+
+[LinuxDevices.NT.5.1]
+%LinuxDevice% = RNDIS.NT.5.1, USB\VID_0525&PID_a4a2
+
+[ControlFlags]
+ExcludeFromSelect=*
+
+; Windows 98, Windows 98 Second Edition specific sections --------
+
+[RNDIS]
+DeviceID = usb8023
+MaxInstance = 512
+DriverVer = 03/30/2004,0.0.0.0
+AddReg = RNDIS_AddReg_98, RNDIS_AddReg_Common
+
+[RNDIS_AddReg_98]
+HKR, , DevLoader, 0, *ndis
+HKR, , DeviceVxDs, 0, usb8023.sys
+HKR, NDIS, LogDriverName, 0, "usb8023"
+HKR, NDIS, MajorNdisVersion, 1, 5
+HKR, NDIS, MinorNdisVersion, 1, 0
+HKR, Ndi\Interfaces, DefUpper, 0, "ndis3,ndis4,ndis5"
+HKR, Ndi\Interfaces, DefLower, 0, "ethernet"
+HKR, Ndi\Interfaces, UpperRange, 0, "ndis3,ndis4,ndis5"
+HKR, Ndi\Interfaces, LowerRange, 0, "ethernet"
+HKR, Ndi\Install, ndis3, 0, "RNDIS_Install_98"
+HKR, Ndi\Install, ndis4, 0, "RNDIS_Install_98"
+HKR, Ndi\Install, ndis5, 0, "RNDIS_Install_98"
+HKR, Ndi, DeviceId, 0, "USB\VID_0525&PID_a4a2"
+
+[RNDIS_Install_98]
+CopyFiles=RNDIS_CopyFiles_98
+
+[RNDIS_CopyFiles_98]
+usb8023.sys, usb8023w.sys, , 0
+rndismp.sys, rndismpw.sys, , 0
+
+; Windows Millennium Edition specific sections --------------------
+
+[RNDIS.ME]
+DeviceID = usb8023
+MaxInstance = 512
+DriverVer = 03/30/2004,0.0.0.0
+AddReg = RNDIS_AddReg_ME, RNDIS_AddReg_Common
+Characteristics = 0x84 ; NCF_PHYSICAL + NCF_HAS_UI
+BusType = 15
+
+[RNDIS_AddReg_ME]
+HKR, , DevLoader, 0, *ndis
+HKR, , DeviceVxDs, 0, usb8023.sys
+HKR, NDIS, LogDriverName, 0, "usb8023"
+HKR, NDIS, MajorNdisVersion, 1, 5
+HKR, NDIS, MinorNdisVersion, 1, 0
+HKR, Ndi\Interfaces, DefUpper, 0, "ndis3,ndis4,ndis5"
+HKR, Ndi\Interfaces, DefLower, 0, "ethernet"
+HKR, Ndi\Interfaces, UpperRange, 0, "ndis3,ndis4,ndis5"
+HKR, Ndi\Interfaces, LowerRange, 0, "ethernet"
+HKR, Ndi\Install, ndis3, 0, "RNDIS_Install_ME"
+HKR, Ndi\Install, ndis4, 0, "RNDIS_Install_ME"
+HKR, Ndi\Install, ndis5, 0, "RNDIS_Install_ME"
+HKR, Ndi, DeviceId, 0, "USB\VID_0525&PID_a4a2"
+
+[RNDIS_Install_ME]
+CopyFiles=RNDIS_CopyFiles_ME
+
+[RNDIS_CopyFiles_ME]
+usb8023.sys, usb8023m.sys, , 0
+rndismp.sys, rndismpm.sys, , 0
+
+; Windows 2000 specific sections ---------------------------------
+
+[RNDIS.NT]
+Characteristics = 0x84 ; NCF_PHYSICAL + NCF_HAS_UI
+BusType = 15
+DriverVer = 03/30/2004,0.0.0.0
+AddReg = RNDIS_AddReg_NT, RNDIS_AddReg_Common
+CopyFiles = RNDIS_CopyFiles_NT
+
+[RNDIS.NT.Services]
+AddService = USB_RNDIS, 2, RNDIS_ServiceInst_NT, RNDIS_EventLog
+
+[RNDIS_CopyFiles_NT]
+; no rename of files on Windows 2000, use the 'k' names as is
+usb8023k.sys, , , 0
+rndismpk.sys, , , 0
+
+[RNDIS_ServiceInst_NT]
+DisplayName = %ServiceDisplayName%
+ServiceType = 1
+StartType = 3
+ErrorControl = 1
+ServiceBinary = %12%\usb8023k.sys
+LoadOrderGroup = NDIS
+AddReg = RNDIS_WMI_AddReg_NT
+
+[RNDIS_WMI_AddReg_NT]
+HKR, , MofImagePath, 0x00020000, "System32\drivers\rndismpk.sys"
+
+; Windows XP specific sections -----------------------------------
+
+[RNDIS.NT.5.1]
+Characteristics = 0x84 ; NCF_PHYSICAL + NCF_HAS_UI
+BusType = 15
+DriverVer = 03/30/2004,0.0.0.0
+AddReg = RNDIS_AddReg_NT, RNDIS_AddReg_Common
+; no copyfiles - the files are already in place
+
+[RNDIS.NT.5.1.Services]
+AddService = USB_RNDIS, 2, RNDIS_ServiceInst_51, RNDIS_EventLog
+
+[RNDIS_ServiceInst_51]
+DisplayName = %ServiceDisplayName%
+ServiceType = 1
+StartType = 3
+ErrorControl = 1
+ServiceBinary = %12%\usb8023.sys
+LoadOrderGroup = NDIS
+AddReg = RNDIS_WMI_AddReg_51
+
+[RNDIS_WMI_AddReg_51]
+HKR, , MofImagePath, 0x00020000, "System32\drivers\rndismp.sys"
+
+; Windows 2000 and Windows XP common sections --------------------
+
+[RNDIS_AddReg_NT]
+HKR, Ndi, Service, 0, "USB_RNDIS"
+HKR, Ndi\Interfaces, UpperRange, 0, "ndis5"
+HKR, Ndi\Interfaces, LowerRange, 0, "ethernet"
+
+[RNDIS_EventLog]
+AddReg = RNDIS_EventLog_AddReg
+
+[RNDIS_EventLog_AddReg]
+HKR, , EventMessageFile, 0x00020000, "%%SystemRoot%%\System32\netevent.dll"
+HKR, , TypesSupported, 0x00010001, 7
+
+; Common Sections -------------------------------------------------
+
+[RNDIS_AddReg_Common]
+HKR, NDI\params\NetworkAddress, ParamDesc, 0, %NetworkAddress%
+HKR, NDI\params\NetworkAddress, type, 0, "edit"
+HKR, NDI\params\NetworkAddress, LimitText, 0, "12"
+HKR, NDI\params\NetworkAddress, UpperCase, 0, "1"
+HKR, NDI\params\NetworkAddress, default, 0, " "
+HKR, NDI\params\NetworkAddress, optional, 0, "1"
+
+[SourceDisksNames]
+1=%SourceDisk%,,1
+
+[SourceDisksFiles]
+usb8023m.sys=1
+rndismpm.sys=1
+usb8023w.sys=1
+rndismpw.sys=1
+usb8023k.sys=1
+rndismpk.sys=1
+
+[DestinationDirs]
+RNDIS_CopyFiles_98 = 10, system32/drivers
+RNDIS_CopyFiles_ME = 10, system32/drivers
+RNDIS_CopyFiles_NT = 12
+
+[Strings]
+ServiceDisplayName = "USB Remote NDIS Network Device Driver"
+NetworkAddress = "Network Address"
+Linux = "Linux Developer Community"
+LinuxDevice = "Linux USB Ethernet/RNDIS Gadget"
+SourceDisk = "Ethernet/RNDIS Gadget Driver Install Disk"
+
diff --git a/Documentation/usb/misc_usbsevseg.txt b/Documentation/usb/misc_usbsevseg.txt
new file mode 100644
index 0000000..0f6be4f
--- /dev/null
+++ b/Documentation/usb/misc_usbsevseg.txt
@@ -0,0 +1,46 @@
+USB 7-Segment Numeric Display
+Manufactured by Delcom Engineering
+
+Device Information
+------------------
+USB VENDOR_ID 0x0fc5
+USB PRODUCT_ID 0x1227
+Both the 6 character and 8 character displays have PRODUCT_ID,
+and according to Delcom Engineering no queryable information
+can be obtained from the device to tell them apart.
+
+Device Modes
+------------
+By default, the driver assumes the display is only 6 characters
+The mode for 6 characters is:
+ MSB 0x06; LSB 0x3f
+For the 8 character display:
+ MSB 0x08; LSB 0xff
+The device can accept "text" either in raw, hex, or ascii textmode.
+raw controls each segment manually,
+hex expects a value between 0-15 per character,
+ascii expects a value between '0'-'9' and 'A'-'F'.
+The default is ascii.
+
+Device Operation
+----------------
+1. Turn on the device:
+ echo 1 > /sys/bus/usb/.../powered
+2. Set the device's mode:
+ echo $mode_msb > /sys/bus/usb/.../mode_msb
+ echo $mode_lsb > /sys/bus/usb/.../mode_lsb
+3. Set the textmode:
+ echo $textmode > /sys/bus/usb/.../textmode
+4. set the text (for example):
+ echo "123ABC" > /sys/bus/usb/.../text (ascii)
+ echo "A1B2" > /sys/bus/usb/.../text (ascii)
+ echo -ne "\x01\x02\x03" > /sys/bus/usb/.../text (hex)
+5. Set the decimal places.
+ The device has either 6 or 8 decimal points.
+ to set the nth decimal place calculate 10 ** n
+ and echo it in to /sys/bus/usb/.../decimals
+ To set multiple decimals points sum up each power.
+ For example, to set the 0th and 3rd decimal place
+ echo 1001 > /sys/bus/usb/.../decimals
+
+
diff --git a/Documentation/usb/mtouchusb.txt b/Documentation/usb/mtouchusb.txt
new file mode 100644
index 0000000..e43cfff
--- /dev/null
+++ b/Documentation/usb/mtouchusb.txt
@@ -0,0 +1,76 @@
+CHANGES
+
+- 0.3 - Created based off of scanner & INSTALL from the original touchscreen
+ driver on freshmeat (http://freshmeat.net/projects/3mtouchscreendriver)
+- Amended for linux-2.4.18, then 2.4.19
+
+- 0.5 - Complete rewrite using Linux Input in 2.6.3
+ Unfortunately no calibration support at this time
+
+- 1.4 - Multiple changes to support the EXII 5000UC and house cleaning
+ Changed reset from standard USB dev reset to vendor reset
+ Changed data sent to host from compensated to raw coordinates
+ Eliminated vendor/product module params
+ Performed multiple successful tests with an EXII-5010UC
+
+SUPPORTED HARDWARE:
+
+ All controllers have the Vendor: 0x0596 & Product: 0x0001
+
+
+ Controller Description Part Number
+ ------------------------------------------------------
+
+ USB Capacitive - Pearl Case 14-205 (Discontinued)
+ USB Capacitive - Black Case 14-124 (Discontinued)
+ USB Capacitive - No Case 14-206 (Discontinued)
+
+ USB Capacitive - Pearl Case EXII-5010UC
+ USB Capacitive - Black Case EXII-5030UC
+ USB Capacitive - No Case EXII-5050UC
+
+DRIVER NOTES:
+
+Installation is simple, you only need to add Linux Input, Linux USB, and the
+driver to the kernel. The driver can also be optionally built as a module.
+
+This driver appears to be one of possible 2 Linux USB Input Touchscreen
+drivers. Although 3M produces a binary only driver available for
+download, I persist in updating this driver since I would like to use the
+touchscreen for embedded apps using QTEmbedded, DirectFB, etc. So I feel the
+logical choice is to use Linux Input.
+
+Currently there is no way to calibrate the device via this driver. Even if
+the device could be calibrated, the driver pulls to raw coordinate data from
+the controller. This means calibration must be performed within the
+userspace.
+
+The controller screen resolution is now 0 to 16384 for both X and Y reporting
+the raw touch data. This is the same for the old and new capacitive USB
+controllers.
+
+Perhaps at some point an abstract function will be placed into evdev so
+generic functions like calibrations, resets, and vendor information can be
+requested from the userspace (And the drivers would handle the vendor specific
+tasks).
+
+ADDITIONAL INFORMATION/UPDATES/X CONFIGURATION EXAMPLE:
+
+http://groomlakelabs.com/grandamp/code/microtouch/
+
+TODO:
+
+Implement a control urb again to handle requests to and from the device
+such as calibration, etc once/if it becomes available.
+
+DISCLAIMER:
+
+I am not a MicroTouch/3M employee, nor have I ever been. 3M does not support
+this driver! If you want touch drivers only supported within X, please go to:
+
+http://www.3m.com/3MTouchSystems/downloads/
+
+THANKS:
+
+A huge thank you to 3M Touch Systems for the EXII-5010UC controllers for
+testing!
diff --git a/Documentation/usb/ohci.txt b/Documentation/usb/ohci.txt
new file mode 100644
index 0000000..99320d9
--- /dev/null
+++ b/Documentation/usb/ohci.txt
@@ -0,0 +1,32 @@
+23-Aug-2002
+
+The "ohci-hcd" driver is a USB Host Controller Driver (HCD) that is derived
+from the "usb-ohci" driver from the 2.4 kernel series. The "usb-ohci" code
+was written primarily by Roman Weissgaerber <weissg@vienna.at> but with
+contributions from many others (read its copyright/licencing header).
+
+It supports the "Open Host Controller Interface" (OHCI), which standardizes
+hardware register protocols used to talk to USB 1.1 host controllers. As
+compared to the earlier "Universal Host Controller Interface" (UHCI) from
+Intel, it pushes more intelligence into the hardware. USB 1.1 controllers
+from vendors other than Intel and VIA generally use OHCI.
+
+Changes since the 2.4 kernel include
+
+ - improved robustness; bugfixes; and less overhead
+ - supports the updated and simplified usbcore APIs
+ - interrupt transfers can be larger, and can be queued
+ - less code, by using the upper level "hcd" framework
+ - supports some non-PCI implementations of OHCI
+ - ... more
+
+The "ohci-hcd" driver handles all USB 1.1 transfer types. Transfers of all
+types can be queued. That was also true in "usb-ohci", except for interrupt
+transfers. Previously, using periods of one frame would risk data loss due
+to overhead in IRQ processing. When interrupt transfers are queued, those
+risks can be minimized by making sure the hardware always has transfers to
+work on while the OS is getting around to the relevant IRQ processing.
+
+- David Brownell
+ <dbrownell@users.sourceforge.net>
+
diff --git a/Documentation/usb/persist.txt b/Documentation/usb/persist.txt
new file mode 100644
index 0000000..074b159
--- /dev/null
+++ b/Documentation/usb/persist.txt
@@ -0,0 +1,162 @@
+ USB device persistence during system suspend
+
+ Alan Stern <stern@rowland.harvard.edu>
+
+ September 2, 2006 (Updated February 25, 2008)
+
+
+ What is the problem?
+
+According to the USB specification, when a USB bus is suspended the
+bus must continue to supply suspend current (around 1-5 mA). This
+is so that devices can maintain their internal state and hubs can
+detect connect-change events (devices being plugged in or unplugged).
+The technical term is "power session".
+
+If a USB device's power session is interrupted then the system is
+required to behave as though the device has been unplugged. It's a
+conservative approach; in the absence of suspend current the computer
+has no way to know what has actually happened. Perhaps the same
+device is still attached or perhaps it was removed and a different
+device plugged into the port. The system must assume the worst.
+
+By default, Linux behaves according to the spec. If a USB host
+controller loses power during a system suspend, then when the system
+wakes up all the devices attached to that controller are treated as
+though they had disconnected. This is always safe and it is the
+"officially correct" thing to do.
+
+For many sorts of devices this behavior doesn't matter in the least.
+If the kernel wants to believe that your USB keyboard was unplugged
+while the system was asleep and a new keyboard was plugged in when the
+system woke up, who cares? It'll still work the same when you type on
+it.
+
+Unfortunately problems _can_ arise, particularly with mass-storage
+devices. The effect is exactly the same as if the device really had
+been unplugged while the system was suspended. If you had a mounted
+filesystem on the device, you're out of luck -- everything in that
+filesystem is now inaccessible. This is especially annoying if your
+root filesystem was located on the device, since your system will
+instantly crash.
+
+Loss of power isn't the only mechanism to worry about. Anything that
+interrupts a power session will have the same effect. For example,
+even though suspend current may have been maintained while the system
+was asleep, on many systems during the initial stages of wakeup the
+firmware (i.e., the BIOS) resets the motherboard's USB host
+controllers. Result: all the power sessions are destroyed and again
+it's as though you had unplugged all the USB devices. Yes, it's
+entirely the BIOS's fault, but that doesn't do _you_ any good unless
+you can convince the BIOS supplier to fix the problem (lots of luck!).
+
+On many systems the USB host controllers will get reset after a
+suspend-to-RAM. On almost all systems, no suspend current is
+available during hibernation (also known as swsusp or suspend-to-disk).
+You can check the kernel log after resuming to see if either of these
+has happened; look for lines saying "root hub lost power or was reset".
+
+In practice, people are forced to unmount any filesystems on a USB
+device before suspending. If the root filesystem is on a USB device,
+the system can't be suspended at all. (All right, it _can_ be
+suspended -- but it will crash as soon as it wakes up, which isn't
+much better.)
+
+
+ What is the solution?
+
+The kernel includes a feature called USB-persist. It tries to work
+around these issues by allowing the core USB device data structures to
+persist across a power-session disruption.
+
+It works like this. If the kernel sees that a USB host controller is
+not in the expected state during resume (i.e., if the controller was
+reset or otherwise had lost power) then it applies a persistence check
+to each of the USB devices below that controller for which the
+"persist" attribute is set. It doesn't try to resume the device; that
+can't work once the power session is gone. Instead it issues a USB
+port reset and then re-enumerates the device. (This is exactly the
+same thing that happens whenever a USB device is reset.) If the
+re-enumeration shows that the device now attached to that port has the
+same descriptors as before, including the Vendor and Product IDs, then
+the kernel continues to use the same device structure. In effect, the
+kernel treats the device as though it had merely been reset instead of
+unplugged.
+
+The same thing happens if the host controller is in the expected state
+but a USB device was unplugged and then replugged, or if a USB device
+fails to carry out a normal resume.
+
+If no device is now attached to the port, or if the descriptors are
+different from what the kernel remembers, then the treatment is what
+you would expect. The kernel destroys the old device structure and
+behaves as though the old device had been unplugged and a new device
+plugged in.
+
+The end result is that the USB device remains available and usable.
+Filesystem mounts and memory mappings are unaffected, and the world is
+now a good and happy place.
+
+Note that the "USB-persist" feature will be applied only to those
+devices for which it is enabled. You can enable the feature by doing
+(as root):
+
+ echo 1 >/sys/bus/usb/devices/.../power/persist
+
+where the "..." should be filled in the with the device's ID. Disable
+the feature by writing 0 instead of 1. For hubs the feature is
+automatically and permanently enabled and the power/persist file
+doesn't even exist, so you only have to worry about setting it for
+devices where it really matters.
+
+
+ Is this the best solution?
+
+Perhaps not. Arguably, keeping track of mounted filesystems and
+memory mappings across device disconnects should be handled by a
+centralized Logical Volume Manager. Such a solution would allow you
+to plug in a USB flash device, create a persistent volume associated
+with it, unplug the flash device, plug it back in later, and still
+have the same persistent volume associated with the device. As such
+it would be more far-reaching than USB-persist.
+
+On the other hand, writing a persistent volume manager would be a big
+job and using it would require significant input from the user. This
+solution is much quicker and easier -- and it exists now, a giant
+point in its favor!
+
+Furthermore, the USB-persist feature applies to _all_ USB devices, not
+just mass-storage devices. It might turn out to be equally useful for
+other device types, such as network interfaces.
+
+
+ WARNING: USB-persist can be dangerous!!
+
+When recovering an interrupted power session the kernel does its best
+to make sure the USB device hasn't been changed; that is, the same
+device is still plugged into the port as before. But the checks
+aren't guaranteed to be 100% accurate.
+
+If you replace one USB device with another of the same type (same
+manufacturer, same IDs, and so on) there's an excellent chance the
+kernel won't detect the change. The serial number string and other
+descriptors are compared with the kernel's stored values, but this
+might not help since manufacturers frequently omit serial numbers
+entirely in their devices.
+
+Furthermore it's quite possible to leave a USB device exactly the same
+while changing its media. If you replace the flash memory card in a
+USB card reader while the system is asleep, the kernel will have no
+way to know you did it. The kernel will assume that nothing has
+happened and will continue to use the partition tables, inodes, and
+memory mappings for the old card.
+
+If the kernel gets fooled in this way, it's almost certain to cause
+data corruption and to crash your system. You'll have no one to blame
+but yourself.
+
+YOU HAVE BEEN WARNED! USE AT YOUR OWN RISK!
+
+That having been said, most of the time there shouldn't be any trouble
+at all. The USB-persist feature can be extremely useful. Make the
+most of it.
diff --git a/Documentation/usb/power-management.txt b/Documentation/usb/power-management.txt
new file mode 100644
index 0000000..e48ea1d
--- /dev/null
+++ b/Documentation/usb/power-management.txt
@@ -0,0 +1,530 @@
+ Power Management for USB
+
+ Alan Stern <stern@rowland.harvard.edu>
+
+ October 5, 2007
+
+
+
+ What is Power Management?
+ -------------------------
+
+Power Management (PM) is the practice of saving energy by suspending
+parts of a computer system when they aren't being used. While a
+component is "suspended" it is in a nonfunctional low-power state; it
+might even be turned off completely. A suspended component can be
+"resumed" (returned to a functional full-power state) when the kernel
+needs to use it. (There also are forms of PM in which components are
+placed in a less functional but still usable state instead of being
+suspended; an example would be reducing the CPU's clock rate. This
+document will not discuss those other forms.)
+
+When the parts being suspended include the CPU and most of the rest of
+the system, we speak of it as a "system suspend". When a particular
+device is turned off while the system as a whole remains running, we
+call it a "dynamic suspend" (also known as a "runtime suspend" or
+"selective suspend"). This document concentrates mostly on how
+dynamic PM is implemented in the USB subsystem, although system PM is
+covered to some extent (see Documentation/power/*.txt for more
+information about system PM).
+
+Note: Dynamic PM support for USB is present only if the kernel was
+built with CONFIG_USB_SUSPEND enabled. System PM support is present
+only if the kernel was built with CONFIG_SUSPEND or CONFIG_HIBERNATION
+enabled.
+
+
+ What is Remote Wakeup?
+ ----------------------
+
+When a device has been suspended, it generally doesn't resume until
+the computer tells it to. Likewise, if the entire computer has been
+suspended, it generally doesn't resume until the user tells it to, say
+by pressing a power button or opening the cover.
+
+However some devices have the capability of resuming by themselves, or
+asking the kernel to resume them, or even telling the entire computer
+to resume. This capability goes by several names such as "Wake On
+LAN"; we will refer to it generically as "remote wakeup". When a
+device is enabled for remote wakeup and it is suspended, it may resume
+itself (or send a request to be resumed) in response to some external
+event. Examples include a suspended keyboard resuming when a key is
+pressed, or a suspended USB hub resuming when a device is plugged in.
+
+
+ When is a USB device idle?
+ --------------------------
+
+A device is idle whenever the kernel thinks it's not busy doing
+anything important and thus is a candidate for being suspended. The
+exact definition depends on the device's driver; drivers are allowed
+to declare that a device isn't idle even when there's no actual
+communication taking place. (For example, a hub isn't considered idle
+unless all the devices plugged into that hub are already suspended.)
+In addition, a device isn't considered idle so long as a program keeps
+its usbfs file open, whether or not any I/O is going on.
+
+If a USB device has no driver, its usbfs file isn't open, and it isn't
+being accessed through sysfs, then it definitely is idle.
+
+
+ Forms of dynamic PM
+ -------------------
+
+Dynamic suspends can occur in two ways: manual and automatic.
+"Manual" means that the user has told the kernel to suspend a device,
+whereas "automatic" means that the kernel has decided all by itself to
+suspend a device. Automatic suspend is called "autosuspend" for
+short. In general, a device won't be autosuspended unless it has been
+idle for some minimum period of time, the so-called idle-delay time.
+
+Of course, nothing the kernel does on its own initiative should
+prevent the computer or its devices from working properly. If a
+device has been autosuspended and a program tries to use it, the
+kernel will automatically resume the device (autoresume). For the
+same reason, an autosuspended device will usually have remote wakeup
+enabled, if the device supports remote wakeup.
+
+It is worth mentioning that many USB drivers don't support
+autosuspend. In fact, at the time of this writing (Linux 2.6.23) the
+only drivers which do support it are the hub driver, kaweth, asix,
+usblp, usblcd, and usb-skeleton (which doesn't count). If a
+non-supporting driver is bound to a device, the device won't be
+autosuspended. In effect, the kernel pretends the device is never
+idle.
+
+We can categorize power management events in two broad classes:
+external and internal. External events are those triggered by some
+agent outside the USB stack: system suspend/resume (triggered by
+userspace), manual dynamic suspend/resume (also triggered by
+userspace), and remote wakeup (triggered by the device). Internal
+events are those triggered within the USB stack: autosuspend and
+autoresume.
+
+
+ The user interface for dynamic PM
+ ---------------------------------
+
+The user interface for controlling dynamic PM is located in the power/
+subdirectory of each USB device's sysfs directory, that is, in
+/sys/bus/usb/devices/.../power/ where "..." is the device's ID. The
+relevant attribute files are: wakeup, level, and autosuspend.
+
+ power/wakeup
+
+ This file is empty if the device does not support
+ remote wakeup. Otherwise the file contains either the
+ word "enabled" or the word "disabled", and you can
+ write those words to the file. The setting determines
+ whether or not remote wakeup will be enabled when the
+ device is next suspended. (If the setting is changed
+ while the device is suspended, the change won't take
+ effect until the following suspend.)
+
+ power/level
+
+ This file contains one of three words: "on", "auto",
+ or "suspend". You can write those words to the file
+ to change the device's setting.
+
+ "on" means that the device should be resumed and
+ autosuspend is not allowed. (Of course, system
+ suspends are still allowed.)
+
+ "auto" is the normal state in which the kernel is
+ allowed to autosuspend and autoresume the device.
+
+ "suspend" means that the device should remain
+ suspended, and autoresume is not allowed. (But remote
+ wakeup may still be allowed, since it is controlled
+ separately by the power/wakeup attribute.)
+
+ power/autosuspend
+
+ This file contains an integer value, which is the
+ number of seconds the device should remain idle before
+ the kernel will autosuspend it (the idle-delay time).
+ The default is 2. 0 means to autosuspend as soon as
+ the device becomes idle, and -1 means never to
+ autosuspend. You can write a number to the file to
+ change the autosuspend idle-delay time.
+
+Writing "-1" to power/autosuspend and writing "on" to power/level do
+essentially the same thing -- they both prevent the device from being
+autosuspended. Yes, this is a redundancy in the API.
+
+(In 2.6.21 writing "0" to power/autosuspend would prevent the device
+from being autosuspended; the behavior was changed in 2.6.22. The
+power/autosuspend attribute did not exist prior to 2.6.21, and the
+power/level attribute did not exist prior to 2.6.22.)
+
+
+ Changing the default idle-delay time
+ ------------------------------------
+
+The default autosuspend idle-delay time is controlled by a module
+parameter in usbcore. You can specify the value when usbcore is
+loaded. For example, to set it to 5 seconds instead of 2 you would
+do:
+
+ modprobe usbcore autosuspend=5
+
+Equivalently, you could add to /etc/modprobe.conf a line saying:
+
+ options usbcore autosuspend=5
+
+Some distributions load the usbcore module very early during the boot
+process, by means of a program or script running from an initramfs
+image. To alter the parameter value you would have to rebuild that
+image.
+
+If usbcore is compiled into the kernel rather than built as a loadable
+module, you can add
+
+ usbcore.autosuspend=5
+
+to the kernel's boot command line.
+
+Finally, the parameter value can be changed while the system is
+running. If you do:
+
+ echo 5 >/sys/module/usbcore/parameters/autosuspend
+
+then each new USB device will have its autosuspend idle-delay
+initialized to 5. (The idle-delay values for already existing devices
+will not be affected.)
+
+Setting the initial default idle-delay to -1 will prevent any
+autosuspend of any USB device. This is a simple alternative to
+disabling CONFIG_USB_SUSPEND and rebuilding the kernel, and it has the
+added benefit of allowing you to enable autosuspend for selected
+devices.
+
+
+ Warnings
+ --------
+
+The USB specification states that all USB devices must support power
+management. Nevertheless, the sad fact is that many devices do not
+support it very well. You can suspend them all right, but when you
+try to resume them they disconnect themselves from the USB bus or
+they stop working entirely. This seems to be especially prevalent
+among printers and scanners, but plenty of other types of device have
+the same deficiency.
+
+For this reason, by default the kernel disables autosuspend (the
+power/level attribute is initialized to "on") for all devices other
+than hubs. Hubs, at least, appear to be reasonably well-behaved in
+this regard.
+
+(In 2.6.21 and 2.6.22 this wasn't the case. Autosuspend was enabled
+by default for almost all USB devices. A number of people experienced
+problems as a result.)
+
+This means that non-hub devices won't be autosuspended unless the user
+or a program explicitly enables it. As of this writing there aren't
+any widespread programs which will do this; we hope that in the near
+future device managers such as HAL will take on this added
+responsibility. In the meantime you can always carry out the
+necessary operations by hand or add them to a udev script. You can
+also change the idle-delay time; 2 seconds is not the best choice for
+every device.
+
+Sometimes it turns out that even when a device does work okay with
+autosuspend there are still problems. For example, there are
+experimental patches adding autosuspend support to the usbhid driver,
+which manages keyboards and mice, among other things. Tests with a
+number of keyboards showed that typing on a suspended keyboard, while
+causing the keyboard to do a remote wakeup all right, would
+nonetheless frequently result in lost keystrokes. Tests with mice
+showed that some of them would issue a remote-wakeup request in
+response to button presses but not to motion, and some in response to
+neither.
+
+The kernel will not prevent you from enabling autosuspend on devices
+that can't handle it. It is even possible in theory to damage a
+device by suspending it at the wrong time -- for example, suspending a
+USB hard disk might cause it to spin down without parking the heads.
+(Highly unlikely, but possible.) Take care.
+
+
+ The driver interface for Power Management
+ -----------------------------------------
+
+The requirements for a USB driver to support external power management
+are pretty modest; the driver need only define
+
+ .suspend
+ .resume
+ .reset_resume
+
+methods in its usb_driver structure, and the reset_resume method is
+optional. The methods' jobs are quite simple:
+
+ The suspend method is called to warn the driver that the
+ device is going to be suspended. If the driver returns a
+ negative error code, the suspend will be aborted. Normally
+ the driver will return 0, in which case it must cancel all
+ outstanding URBs (usb_kill_urb()) and not submit any more.
+
+ The resume method is called to tell the driver that the
+ device has been resumed and the driver can return to normal
+ operation. URBs may once more be submitted.
+
+ The reset_resume method is called to tell the driver that
+ the device has been resumed and it also has been reset.
+ The driver should redo any necessary device initialization,
+ since the device has probably lost most or all of its state
+ (although the interfaces will be in the same altsettings as
+ before the suspend).
+
+If the device is disconnected or powered down while it is suspended,
+the disconnect method will be called instead of the resume or
+reset_resume method. This is also quite likely to happen when
+waking up from hibernation, as many systems do not maintain suspend
+current to the USB host controllers during hibernation. (It's
+possible to work around the hibernation-forces-disconnect problem by
+using the USB Persist facility.)
+
+The reset_resume method is used by the USB Persist facility (see
+Documentation/usb/persist.txt) and it can also be used under certain
+circumstances when CONFIG_USB_PERSIST is not enabled. Currently, if a
+device is reset during a resume and the driver does not have a
+reset_resume method, the driver won't receive any notification about
+the resume. Later kernels will call the driver's disconnect method;
+2.6.23 doesn't do this.
+
+USB drivers are bound to interfaces, so their suspend and resume
+methods get called when the interfaces are suspended or resumed. In
+principle one might want to suspend some interfaces on a device (i.e.,
+force the drivers for those interface to stop all activity) without
+suspending the other interfaces. The USB core doesn't allow this; all
+interfaces are suspended when the device itself is suspended and all
+interfaces are resumed when the device is resumed. It isn't possible
+to suspend or resume some but not all of a device's interfaces. The
+closest you can come is to unbind the interfaces' drivers.
+
+
+ The driver interface for autosuspend and autoresume
+ ---------------------------------------------------
+
+To support autosuspend and autoresume, a driver should implement all
+three of the methods listed above. In addition, a driver indicates
+that it supports autosuspend by setting the .supports_autosuspend flag
+in its usb_driver structure. It is then responsible for informing the
+USB core whenever one of its interfaces becomes busy or idle. The
+driver does so by calling these three functions:
+
+ int usb_autopm_get_interface(struct usb_interface *intf);
+ void usb_autopm_put_interface(struct usb_interface *intf);
+ int usb_autopm_set_interface(struct usb_interface *intf);
+
+The functions work by maintaining a counter in the usb_interface
+structure. When intf->pm_usage_count is > 0 then the interface is
+deemed to be busy, and the kernel will not autosuspend the interface's
+device. When intf->pm_usage_count is <= 0 then the interface is
+considered to be idle, and the kernel may autosuspend the device.
+
+(There is a similar pm_usage_count field in struct usb_device,
+associated with the device itself rather than any of its interfaces.
+This field is used only by the USB core.)
+
+The driver owns intf->pm_usage_count; it can modify the value however
+and whenever it likes. A nice aspect of the usb_autopm_* routines is
+that the changes they make are protected by the usb_device structure's
+PM mutex (udev->pm_mutex); however drivers may change pm_usage_count
+without holding the mutex.
+
+ usb_autopm_get_interface() increments pm_usage_count and
+ attempts an autoresume if the new value is > 0 and the
+ device is suspended.
+
+ usb_autopm_put_interface() decrements pm_usage_count and
+ attempts an autosuspend if the new value is <= 0 and the
+ device isn't suspended.
+
+ usb_autopm_set_interface() leaves pm_usage_count alone.
+ It attempts an autoresume if the value is > 0 and the device
+ is suspended, and it attempts an autosuspend if the value is
+ <= 0 and the device isn't suspended.
+
+There also are a couple of utility routines drivers can use:
+
+ usb_autopm_enable() sets pm_usage_cnt to 0 and then calls
+ usb_autopm_set_interface(), which will attempt an autosuspend.
+
+ usb_autopm_disable() sets pm_usage_cnt to 1 and then calls
+ usb_autopm_set_interface(), which will attempt an autoresume.
+
+The conventional usage pattern is that a driver calls
+usb_autopm_get_interface() in its open routine and
+usb_autopm_put_interface() in its close or release routine. But
+other patterns are possible.
+
+The autosuspend attempts mentioned above will often fail for one
+reason or another. For example, the power/level attribute might be
+set to "on", or another interface in the same device might not be
+idle. This is perfectly normal. If the reason for failure was that
+the device hasn't been idle for long enough, a delayed workqueue
+routine is automatically set up to carry out the operation when the
+autosuspend idle-delay has expired.
+
+Autoresume attempts also can fail. This will happen if power/level is
+set to "suspend" or if the device doesn't manage to resume properly.
+Unlike autosuspend, there's no delay for an autoresume.
+
+
+ Other parts of the driver interface
+ -----------------------------------
+
+Sometimes a driver needs to make sure that remote wakeup is enabled
+during autosuspend. For example, there's not much point
+autosuspending a keyboard if the user can't cause the keyboard to do a
+remote wakeup by typing on it. If the driver sets
+intf->needs_remote_wakeup to 1, the kernel won't autosuspend the
+device if remote wakeup isn't available or has been disabled through
+the power/wakeup attribute. (If the device is already autosuspended,
+though, setting this flag won't cause the kernel to autoresume it.
+Normally a driver would set this flag in its probe method, at which
+time the device is guaranteed not to be autosuspended.)
+
+The usb_autopm_* routines have to run in a sleepable process context;
+they must not be called from an interrupt handler or while holding a
+spinlock. In fact, the entire autosuspend mechanism is not well geared
+toward interrupt-driven operation. However there is one thing a
+driver can do in an interrupt handler:
+
+ usb_mark_last_busy(struct usb_device *udev);
+
+This sets udev->last_busy to the current time. udev->last_busy is the
+field used for idle-delay calculations; updating it will cause any
+pending autosuspend to be moved back. The usb_autopm_* routines will
+also set the last_busy field to the current time.
+
+Calling urb_mark_last_busy() from within an URB completion handler is
+subject to races: The kernel may have just finished deciding the
+device has been idle for long enough but not yet gotten around to
+calling the driver's suspend method. The driver would have to be
+responsible for synchronizing its suspend method with its URB
+completion handler and causing the autosuspend to fail with -EBUSY if
+an URB had completed too recently.
+
+External suspend calls should never be allowed to fail in this way,
+only autosuspend calls. The driver can tell them apart by checking
+udev->auto_pm; this flag will be set to 1 for internal PM events
+(autosuspend or autoresume) and 0 for external PM events.
+
+Many of the ingredients in the autosuspend framework are oriented
+towards interfaces: The usb_interface structure contains the
+pm_usage_cnt field, and the usb_autopm_* routines take an interface
+pointer as their argument. But somewhat confusingly, a few of the
+pieces (usb_mark_last_busy() and udev->auto_pm) use the usb_device
+structure instead. Drivers need to keep this straight; they can call
+interface_to_usbdev() to find the device structure for a given
+interface.
+
+
+ Locking requirements
+ --------------------
+
+All three suspend/resume methods are always called while holding the
+usb_device's PM mutex. For external events -- but not necessarily for
+autosuspend or autoresume -- the device semaphore (udev->dev.sem) will
+also be held. This implies that external suspend/resume events are
+mutually exclusive with calls to probe, disconnect, pre_reset, and
+post_reset; the USB core guarantees that this is true of internal
+suspend/resume events as well.
+
+If a driver wants to block all suspend/resume calls during some
+critical section, it can simply acquire udev->pm_mutex. Note that
+calls to resume may be triggered indirectly. Block IO due to memory
+allocations can make the vm subsystem resume a device. Thus while
+holding this lock you must not allocate memory with GFP_KERNEL or
+GFP_NOFS.
+
+Alternatively, if the critical section might call some of the
+usb_autopm_* routines, the driver can avoid deadlock by doing:
+
+ down(&udev->dev.sem);
+ rc = usb_autopm_get_interface(intf);
+
+and at the end of the critical section:
+
+ if (!rc)
+ usb_autopm_put_interface(intf);
+ up(&udev->dev.sem);
+
+Holding the device semaphore will block all external PM calls, and the
+usb_autopm_get_interface() will prevent any internal PM calls, even if
+it fails. (Exercise: Why?)
+
+The rules for locking order are:
+
+ Never acquire any device semaphore while holding any PM mutex.
+
+ Never acquire udev->pm_mutex while holding the PM mutex for
+ a device that isn't a descendant of udev.
+
+In other words, PM mutexes should only be acquired going up the device
+tree, and they should be acquired only after locking all the device
+semaphores you need to hold. These rules don't matter to drivers very
+much; they usually affect just the USB core.
+
+Still, drivers do need to be careful. For example, many drivers use a
+private mutex to synchronize their normal I/O activities with their
+disconnect method. Now if the driver supports autosuspend then it
+must call usb_autopm_put_interface() from somewhere -- maybe from its
+close method. It should make the call while holding the private mutex,
+since a driver shouldn't call any of the usb_autopm_* functions for an
+interface from which it has been unbound.
+
+But the usb_autpm_* routines always acquire the device's PM mutex, and
+consequently the locking order has to be: private mutex first, PM
+mutex second. Since the suspend method is always called with the PM
+mutex held, it mustn't try to acquire the private mutex. It has to
+synchronize with the driver's I/O activities in some other way.
+
+
+ Interaction between dynamic PM and system PM
+ --------------------------------------------
+
+Dynamic power management and system power management can interact in
+a couple of ways.
+
+Firstly, a device may already be manually suspended or autosuspended
+when a system suspend occurs. Since system suspends are supposed to
+be as transparent as possible, the device should remain suspended
+following the system resume. The 2.6.23 kernel obeys this principle
+for manually suspended devices but not for autosuspended devices; they
+do get resumed when the system wakes up. (Presumably they will be
+autosuspended again after their idle-delay time expires.) In later
+kernels this behavior will be fixed.
+
+(There is an exception. If a device would undergo a reset-resume
+instead of a normal resume, and the device is enabled for remote
+wakeup, then the reset-resume takes place even if the device was
+already suspended when the system suspend began. The justification is
+that a reset-resume is a kind of remote-wakeup event. Or to put it
+another way, a device which needs a reset won't be able to generate
+normal remote-wakeup signals, so it ought to be resumed immediately.)
+
+Secondly, a dynamic power-management event may occur as a system
+suspend is underway. The window for this is short, since system
+suspends don't take long (a few seconds usually), but it can happen.
+For example, a suspended device may send a remote-wakeup signal while
+the system is suspending. The remote wakeup may succeed, which would
+cause the system suspend to abort. If the remote wakeup doesn't
+succeed, it may still remain active and thus cause the system to
+resume as soon as the system suspend is complete. Or the remote
+wakeup may fail and get lost. Which outcome occurs depends on timing
+and on the hardware and firmware design.
+
+More interestingly, a device might undergo a manual resume or
+autoresume during system suspend. With current kernels this shouldn't
+happen, because manual resumes must be initiated by userspace and
+autoresumes happen in response to I/O requests, but all user processes
+and I/O should be quiescent during a system suspend -- thanks to the
+freezer. However there are plans to do away with the freezer, which
+would mean these things would become possible. If and when this comes
+about, the USB core will carefully arrange matters so that either type
+of resume will block until the entire system has resumed.
diff --git a/Documentation/usb/proc_usb_info.txt b/Documentation/usb/proc_usb_info.txt
new file mode 100644
index 0000000..fafcd47
--- /dev/null
+++ b/Documentation/usb/proc_usb_info.txt
@@ -0,0 +1,377 @@
+/proc/bus/usb filesystem output
+===============================
+(version 2003.05.30)
+
+
+The usbfs filesystem for USB devices is traditionally mounted at
+/proc/bus/usb. It provides the /proc/bus/usb/devices file, as well as
+the /proc/bus/usb/BBB/DDD files.
+
+
+**NOTE**: If /proc/bus/usb appears empty, and a host controller
+ driver has been linked, then you need to mount the
+ filesystem. Issue the command (as root):
+
+ mount -t usbfs none /proc/bus/usb
+
+ An alternative and more permanent method would be to add
+
+ none /proc/bus/usb usbfs defaults 0 0
+
+ to /etc/fstab. This will mount usbfs at each reboot.
+ You can then issue `cat /proc/bus/usb/devices` to extract
+ USB device information, and user mode drivers can use usbfs
+ to interact with USB devices.
+
+ There are a number of mount options supported by usbfs.
+ Consult the source code (linux/drivers/usb/core/inode.c) for
+ information about those options.
+
+**NOTE**: The filesystem has been renamed from "usbdevfs" to
+ "usbfs", to reduce confusion with "devfs". You may
+ still see references to the older "usbdevfs" name.
+
+For more information on mounting the usbfs file system, see the
+"USB Device Filesystem" section of the USB Guide. The latest copy
+of the USB Guide can be found at http://www.linux-usb.org/
+
+
+THE /proc/bus/usb/BBB/DDD FILES:
+--------------------------------
+Each connected USB device has one file. The BBB indicates the bus
+number. The DDD indicates the device address on that bus. Both
+of these numbers are assigned sequentially, and can be reused, so
+you can't rely on them for stable access to devices. For example,
+it's relatively common for devices to re-enumerate while they are
+still connected (perhaps someone jostled their power supply, hub,
+or USB cable), so a device might be 002/027 when you first connect
+it and 002/048 sometime later.
+
+These files can be read as binary data. The binary data consists
+of first the device descriptor, then the descriptors for each
+configuration of the device. Multi-byte fields in the device and
+configuration descriptors, but not other descriptors, are converted
+to host endianness by the kernel. This information is also shown
+in text form by the /proc/bus/usb/devices file, described later.
+
+These files may also be used to write user-level drivers for the USB
+devices. You would open the /proc/bus/usb/BBB/DDD file read/write,
+read its descriptors to make sure it's the device you expect, and then
+bind to an interface (or perhaps several) using an ioctl call. You
+would issue more ioctls to the device to communicate to it using
+control, bulk, or other kinds of USB transfers. The IOCTLs are
+listed in the <linux/usbdevice_fs.h> file, and at this writing the
+source code (linux/drivers/usb/core/devio.c) is the primary reference
+for how to access devices through those files.
+
+Note that since by default these BBB/DDD files are writable only by
+root, only root can write such user mode drivers. You can selectively
+grant read/write permissions to other users by using "chmod". Also,
+usbfs mount options such as "devmode=0666" may be helpful.
+
+
+
+THE /proc/bus/usb/devices FILE:
+-------------------------------
+In /proc/bus/usb/devices, each device's output has multiple
+lines of ASCII output.
+I made it ASCII instead of binary on purpose, so that someone
+can obtain some useful data from it without the use of an
+auxiliary program. However, with an auxiliary program, the numbers
+in the first 4 columns of each "T:" line (topology info:
+Lev, Prnt, Port, Cnt) can be used to build a USB topology diagram.
+
+Each line is tagged with a one-character ID for that line:
+
+T = Topology (etc.)
+B = Bandwidth (applies only to USB host controllers, which are
+ virtualized as root hubs)
+D = Device descriptor info.
+P = Product ID info. (from Device descriptor, but they won't fit
+ together on one line)
+S = String descriptors.
+C = Configuration descriptor info. (* = active configuration)
+I = Interface descriptor info.
+E = Endpoint descriptor info.
+
+=======================================================================
+
+/proc/bus/usb/devices output format:
+
+Legend:
+ d = decimal number (may have leading spaces or 0's)
+ x = hexadecimal number (may have leading spaces or 0's)
+ s = string
+
+
+Topology info:
+
+T: Bus=dd Lev=dd Prnt=dd Port=dd Cnt=dd Dev#=ddd Spd=ddd MxCh=dd
+| | | | | | | | |__MaxChildren
+| | | | | | | |__Device Speed in Mbps
+| | | | | | |__DeviceNumber
+| | | | | |__Count of devices at this level
+| | | | |__Connector/Port on Parent for this device
+| | | |__Parent DeviceNumber
+| | |__Level in topology for this bus
+| |__Bus number
+|__Topology info tag
+
+ Speed may be:
+ 1.5 Mbit/s for low speed USB
+ 12 Mbit/s for full speed USB
+ 480 Mbit/s for high speed USB (added for USB 2.0)
+
+
+Bandwidth info:
+B: Alloc=ddd/ddd us (xx%), #Int=ddd, #Iso=ddd
+| | | |__Number of isochronous requests
+| | |__Number of interrupt requests
+| |__Total Bandwidth allocated to this bus
+|__Bandwidth info tag
+
+ Bandwidth allocation is an approximation of how much of one frame
+ (millisecond) is in use. It reflects only periodic transfers, which
+ are the only transfers that reserve bandwidth. Control and bulk
+ transfers use all other bandwidth, including reserved bandwidth that
+ is not used for transfers (such as for short packets).
+
+ The percentage is how much of the "reserved" bandwidth is scheduled by
+ those transfers. For a low or full speed bus (loosely, "USB 1.1"),
+ 90% of the bus bandwidth is reserved. For a high speed bus (loosely,
+ "USB 2.0") 80% is reserved.
+
+
+Device descriptor info & Product ID info:
+
+D: Ver=x.xx Cls=xx(s) Sub=xx Prot=xx MxPS=dd #Cfgs=dd
+P: Vendor=xxxx ProdID=xxxx Rev=xx.xx
+
+where
+D: Ver=x.xx Cls=xx(sssss) Sub=xx Prot=xx MxPS=dd #Cfgs=dd
+| | | | | | |__NumberConfigurations
+| | | | | |__MaxPacketSize of Default Endpoint
+| | | | |__DeviceProtocol
+| | | |__DeviceSubClass
+| | |__DeviceClass
+| |__Device USB version
+|__Device info tag #1
+
+where
+P: Vendor=xxxx ProdID=xxxx Rev=xx.xx
+| | | |__Product revision number
+| | |__Product ID code
+| |__Vendor ID code
+|__Device info tag #2
+
+
+String descriptor info:
+
+S: Manufacturer=ssss
+| |__Manufacturer of this device as read from the device.
+| For USB host controller drivers (virtual root hubs) this may
+| be omitted, or (for newer drivers) will identify the kernel
+| version and the driver which provides this hub emulation.
+|__String info tag
+
+S: Product=ssss
+| |__Product description of this device as read from the device.
+| For older USB host controller drivers (virtual root hubs) this
+| indicates the driver; for newer ones, it's a product (and vendor)
+| description that often comes from the kernel's PCI ID database.
+|__String info tag
+
+S: SerialNumber=ssss
+| |__Serial Number of this device as read from the device.
+| For USB host controller drivers (virtual root hubs) this is
+| some unique ID, normally a bus ID (address or slot name) that
+| can't be shared with any other device.
+|__String info tag
+
+
+
+Configuration descriptor info:
+
+C:* #Ifs=dd Cfg#=dd Atr=xx MPwr=dddmA
+| | | | | |__MaxPower in mA
+| | | | |__Attributes
+| | | |__ConfiguratioNumber
+| | |__NumberOfInterfaces
+| |__ "*" indicates the active configuration (others are " ")
+|__Config info tag
+
+ USB devices may have multiple configurations, each of which act
+ rather differently. For example, a bus-powered configuration
+ might be much less capable than one that is self-powered. Only
+ one device configuration can be active at a time; most devices
+ have only one configuration.
+
+ Each configuration consists of one or more interfaces. Each
+ interface serves a distinct "function", which is typically bound
+ to a different USB device driver. One common example is a USB
+ speaker with an audio interface for playback, and a HID interface
+ for use with software volume control.
+
+
+Interface descriptor info (can be multiple per Config):
+
+I:* If#=dd Alt=dd #EPs=dd Cls=xx(sssss) Sub=xx Prot=xx Driver=ssss
+| | | | | | | | |__Driver name
+| | | | | | | | or "(none)"
+| | | | | | | |__InterfaceProtocol
+| | | | | | |__InterfaceSubClass
+| | | | | |__InterfaceClass
+| | | | |__NumberOfEndpoints
+| | | |__AlternateSettingNumber
+| | |__InterfaceNumber
+| |__ "*" indicates the active altsetting (others are " ")
+|__Interface info tag
+
+ A given interface may have one or more "alternate" settings.
+ For example, default settings may not use more than a small
+ amount of periodic bandwidth. To use significant fractions
+ of bus bandwidth, drivers must select a non-default altsetting.
+
+ Only one setting for an interface may be active at a time, and
+ only one driver may bind to an interface at a time. Most devices
+ have only one alternate setting per interface.
+
+
+Endpoint descriptor info (can be multiple per Interface):
+
+E: Ad=xx(s) Atr=xx(ssss) MxPS=dddd Ivl=dddss
+| | | | |__Interval (max) between transfers
+| | | |__EndpointMaxPacketSize
+| | |__Attributes(EndpointType)
+| |__EndpointAddress(I=In,O=Out)
+|__Endpoint info tag
+
+ The interval is nonzero for all periodic (interrupt or isochronous)
+ endpoints. For high speed endpoints the transfer interval may be
+ measured in microseconds rather than milliseconds.
+
+ For high speed periodic endpoints, the "MaxPacketSize" reflects
+ the per-microframe data transfer size. For "high bandwidth"
+ endpoints, that can reflect two or three packets (for up to
+ 3KBytes every 125 usec) per endpoint.
+
+ With the Linux-USB stack, periodic bandwidth reservations use the
+ transfer intervals and sizes provided by URBs, which can be less
+ than those found in endpoint descriptor.
+
+
+=======================================================================
+
+
+If a user or script is interested only in Topology info, for
+example, use something like "grep ^T: /proc/bus/usb/devices"
+for only the Topology lines. A command like
+"grep -i ^[tdp]: /proc/bus/usb/devices" can be used to list
+only the lines that begin with the characters in square brackets,
+where the valid characters are TDPCIE. With a slightly more able
+script, it can display any selected lines (for example, only T, D,
+and P lines) and change their output format. (The "procusb"
+Perl script is the beginning of this idea. It will list only
+selected lines [selected from TBDPSCIE] or "All" lines from
+/proc/bus/usb/devices.)
+
+The Topology lines can be used to generate a graphic/pictorial
+of the USB devices on a system's root hub. (See more below
+on how to do this.)
+
+The Interface lines can be used to determine what driver is
+being used for each device, and which altsetting it activated.
+
+The Configuration lines could be used to list maximum power
+(in milliamps) that a system's USB devices are using.
+For example, "grep ^C: /proc/bus/usb/devices".
+
+
+Here's an example, from a system which has a UHCI root hub,
+an external hub connected to the root hub, and a mouse and
+a serial converter connected to the external hub.
+
+T: Bus=00 Lev=00 Prnt=00 Port=00 Cnt=00 Dev#= 1 Spd=12 MxCh= 2
+B: Alloc= 28/900 us ( 3%), #Int= 2, #Iso= 0
+D: Ver= 1.00 Cls=09(hub ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+P: Vendor=0000 ProdID=0000 Rev= 0.00
+S: Product=USB UHCI Root Hub
+S: SerialNumber=dce0
+C:* #Ifs= 1 Cfg#= 1 Atr=40 MxPwr= 0mA
+I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=00 Driver=hub
+E: Ad=81(I) Atr=03(Int.) MxPS= 8 Ivl=255ms
+
+T: Bus=00 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=12 MxCh= 4
+D: Ver= 1.00 Cls=09(hub ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+P: Vendor=0451 ProdID=1446 Rev= 1.00
+C:* #Ifs= 1 Cfg#= 1 Atr=e0 MxPwr=100mA
+I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=00 Driver=hub
+E: Ad=81(I) Atr=03(Int.) MxPS= 1 Ivl=255ms
+
+T: Bus=00 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 3 Spd=1.5 MxCh= 0
+D: Ver= 1.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+P: Vendor=04b4 ProdID=0001 Rev= 0.00
+C:* #Ifs= 1 Cfg#= 1 Atr=80 MxPwr=100mA
+I: If#= 0 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=01 Prot=02 Driver=mouse
+E: Ad=81(I) Atr=03(Int.) MxPS= 3 Ivl= 10ms
+
+T: Bus=00 Lev=02 Prnt=02 Port=02 Cnt=02 Dev#= 4 Spd=12 MxCh= 0
+D: Ver= 1.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+P: Vendor=0565 ProdID=0001 Rev= 1.08
+S: Manufacturer=Peracom Networks, Inc.
+S: Product=Peracom USB to Serial Converter
+C:* #Ifs= 1 Cfg#= 1 Atr=a0 MxPwr=100mA
+I: If#= 0 Alt= 0 #EPs= 3 Cls=00(>ifc ) Sub=00 Prot=00 Driver=serial
+E: Ad=81(I) Atr=02(Bulk) MxPS= 64 Ivl= 16ms
+E: Ad=01(O) Atr=02(Bulk) MxPS= 16 Ivl= 16ms
+E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl= 8ms
+
+
+Selecting only the "T:" and "I:" lines from this (for example, by using
+"procusb ti"), we have:
+
+T: Bus=00 Lev=00 Prnt=00 Port=00 Cnt=00 Dev#= 1 Spd=12 MxCh= 2
+T: Bus=00 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=12 MxCh= 4
+I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=00 Driver=hub
+T: Bus=00 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 3 Spd=1.5 MxCh= 0
+I: If#= 0 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=01 Prot=02 Driver=mouse
+T: Bus=00 Lev=02 Prnt=02 Port=02 Cnt=02 Dev#= 4 Spd=12 MxCh= 0
+I: If#= 0 Alt= 0 #EPs= 3 Cls=00(>ifc ) Sub=00 Prot=00 Driver=serial
+
+
+Physically this looks like (or could be converted to):
+
+ +------------------+
+ | PC/root_hub (12)| Dev# = 1
+ +------------------+ (nn) is Mbps.
+ Level 0 | CN.0 | CN.1 | [CN = connector/port #]
+ +------------------+
+ /
+ /
+ +-----------------------+
+ Level 1 | Dev#2: 4-port hub (12)|
+ +-----------------------+
+ |CN.0 |CN.1 |CN.2 |CN.3 |
+ +-----------------------+
+ \ \____________________
+ \_____ \
+ \ \
+ +--------------------+ +--------------------+
+ Level 2 | Dev# 3: mouse (1.5)| | Dev# 4: serial (12)|
+ +--------------------+ +--------------------+
+
+
+
+Or, in a more tree-like structure (ports [Connectors] without
+connections could be omitted):
+
+PC: Dev# 1, root hub, 2 ports, 12 Mbps
+|_ CN.0: Dev# 2, hub, 4 ports, 12 Mbps
+ |_ CN.0: Dev #3, mouse, 1.5 Mbps
+ |_ CN.1:
+ |_ CN.2: Dev #4, serial, 12 Mbps
+ |_ CN.3:
+|_ CN.1:
+
+
+ ### END ###
diff --git a/Documentation/usb/rio.txt b/Documentation/usb/rio.txt
new file mode 100644
index 0000000..aee715a
--- /dev/null
+++ b/Documentation/usb/rio.txt
@@ -0,0 +1,138 @@
+Copyright (C) 1999, 2000 Bruce Tenison
+Portions Copyright (C) 1999, 2000 David Nelson
+Thanks to David Nelson for guidance and the usage of the scanner.txt
+and scanner.c files to model our driver and this informative file.
+
+Mar. 2, 2000
+
+CHANGES
+
+- Initial Revision
+
+
+OVERVIEW
+
+This README will address issues regarding how to configure the kernel
+to access a RIO 500 mp3 player.
+Before I explain how to use this to access the Rio500 please be warned:
+
+W A R N I N G:
+--------------
+
+Please note that this software is still under development. The authors
+are in no way responsible for any damage that may occur, no matter how
+inconsequential.
+
+It seems that the Rio has a problem when sending .mp3 with low batteries.
+I suggest when the batteries are low and you want to transfer stuff that you
+replace it with a fresh one. In my case, what happened is I lost two 16kb
+blocks (they are no longer usable to store information to it). But I don't
+know if that's normal or not; it could simply be a problem with the flash
+memory.
+
+In an extreme case, I left my Rio playing overnight and the batteries wore
+down to nothing and appear to have corrupted the flash memory. My RIO
+needed to be replaced as a result. Diamond tech support is aware of the
+problem. Do NOT allow your batteries to wear down to nothing before
+changing them. It appears RIO 500 firmware does not handle low battery
+power well at all.
+
+On systems with OHCI controllers, the kernel OHCI code appears to have
+power on problems with some chipsets. If you are having problems
+connecting to your RIO 500, try turning it on first and then plugging it
+into the USB cable.
+
+Contact information:
+--------------------
+
+ The main page for the project is hosted at sourceforge.net in the following
+ URL: <http://rio500.sourceforge.net>. You can also go to the project's
+ sourceforge home page at: <http://sourceforge.net/projects/rio500/>.
+ There is also a mailing list: rio500-users@lists.sourceforge.net
+
+Authors:
+-------
+
+Most of the code was written by Cesar Miquel <miquel@df.uba.ar>. Keith
+Clayton <kclayton@jps.net> is incharge of the PPC port and making sure
+things work there. Bruce Tenison <btenison@dibbs.net> is adding support
+for .fon files and also does testing. The program will mostly sure be
+re-written and Pete Ikusz along with the rest will re-design it. I would
+also like to thank Tri Nguyen <tmn_3022000@hotmail.com> who provided use
+with some important information regarding the communication with the Rio.
+
+ADDITIONAL INFORMATION and Userspace tools
+
+http://rio500.sourceforge.net/
+
+
+REQUIREMENTS
+
+A host with a USB port. Ideally, either a UHCI (Intel) or OHCI
+(Compaq and others) hardware port should work.
+
+A Linux development kernel (2.3.x) with USB support enabled or a
+backported version to linux-2.2.x. See http://www.linux-usb.org for
+more information on accomplishing this.
+
+A Linux kernel with RIO 500 support enabled.
+
+'lspci' which is only needed to determine the type of USB hardware
+available in your machine.
+
+CONFIGURATION
+
+Using `lspci -v`, determine the type of USB hardware available.
+
+ If you see something like:
+
+ USB Controller: ......
+ Flags: .....
+ I/O ports at ....
+
+ Then you have a UHCI based controller.
+
+ If you see something like:
+
+ USB Controller: .....
+ Flags: ....
+ Memory at .....
+
+ Then you have a OHCI based controller.
+
+Using `make menuconfig` or your preferred method for configuring the
+kernel, select 'Support for USB', 'OHCI/UHCI' depending on your
+hardware (determined from the steps above), 'USB Diamond Rio500 support', and
+'Preliminary USB device filesystem'. Compile and install the modules
+(you may need to execute `depmod -a` to update the module
+dependencies).
+
+Add a device for the USB rio500:
+ `mknod /dev/usb/rio500 c 180 64`
+
+Set appropriate permissions for /dev/usb/rio500 (don't forget about
+group and world permissions). Both read and write permissions are
+required for proper operation.
+
+Load the appropriate modules (if compiled as modules):
+
+ OHCI:
+ modprobe usbcore
+ modprobe usb-ohci
+ modprobe rio500
+
+ UHCI:
+ modprobe usbcore
+ modprobe usb-uhci (or uhci)
+ modprobe rio500
+
+That's it. The Rio500 Utils at: http://rio500.sourceforge.net should
+be able to access the rio500.
+
+BUGS
+
+If you encounter any problems feel free to drop me an email.
+
+Bruce Tenison
+btenison@dibbs.net
+
diff --git a/Documentation/usb/usb-help.txt b/Documentation/usb/usb-help.txt
new file mode 100644
index 0000000..4273ca2
--- /dev/null
+++ b/Documentation/usb/usb-help.txt
@@ -0,0 +1,16 @@
+usb-help.txt
+2008-Mar-7
+
+For USB help other than the readme files that are located in
+Documentation/usb/*, see the following:
+
+Linux-USB project: http://www.linux-usb.org
+ mirrors at http://usb.in.tum.de/linux-usb/
+ and http://it.linux-usb.org
+Linux USB Guide: http://linux-usb.sourceforge.net
+Linux-USB device overview (working devices and drivers):
+ http://www.qbik.ch/usb/devices/
+
+The Linux-USB mailing list is at linux-usb@vger.kernel.org
+
+###
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt
new file mode 100644
index 0000000..ff2c1ff
--- /dev/null
+++ b/Documentation/usb/usb-serial.txt
@@ -0,0 +1,472 @@
+INTRODUCTION
+
+ The USB serial driver currently supports a number of different USB to
+ serial converter products, as well as some devices that use a serial
+ interface from userspace to talk to the device.
+
+ See the individual product section below for specific information about
+ the different devices.
+
+
+CONFIGURATION
+
+ Currently the driver can handle up to 256 different serial interfaces at
+ one time.
+
+ The major number that the driver uses is 188 so to use the driver,
+ create the following nodes:
+ mknod /dev/ttyUSB0 c 188 0
+ mknod /dev/ttyUSB1 c 188 1
+ mknod /dev/ttyUSB2 c 188 2
+ mknod /dev/ttyUSB3 c 188 3
+ .
+ .
+ .
+ mknod /dev/ttyUSB254 c 188 254
+ mknod /dev/ttyUSB255 c 188 255
+
+ When the device is connected and recognized by the driver, the driver
+ will print to the system log, which node(s) the device has been bound
+ to.
+
+
+SPECIFIC DEVICES SUPPORTED
+
+
+ConnectTech WhiteHEAT 4 port converter
+
+ ConnectTech has been very forthcoming with information about their
+ device, including providing a unit to test with.
+
+ The driver is officially supported by Connect Tech Inc.
+ http://www.connecttech.com
+
+ For any questions or problems with this driver, please contact
+ Connect Tech's Support Department at support@connecttech.com
+
+
+HandSpring Visor, Palm USB, and Clié USB driver
+
+ This driver works with all HandSpring USB, Palm USB, and Sony Clié USB
+ devices.
+
+ Only when the device tries to connect to the host, will the device show
+ up to the host as a valid USB device. When this happens, the device is
+ properly enumerated, assigned a port, and then communication _should_ be
+ possible. The driver cleans up properly when the device is removed, or
+ the connection is canceled on the device.
+
+ NOTE:
+ This means that in order to talk to the device, the sync button must be
+ pressed BEFORE trying to get any program to communicate to the device.
+ This goes against the current documentation for pilot-xfer and other
+ packages, but is the only way that it will work due to the hardware
+ in the device.
+
+ When the device is connected, try talking to it on the second port
+ (this is usually /dev/ttyUSB1 if you do not have any other usb-serial
+ devices in the system.) The system log should tell you which port is
+ the port to use for the HotSync transfer. The "Generic" port can be used
+ for other device communication, such as a PPP link.
+
+ For some Sony Clié devices, /dev/ttyUSB0 must be used to talk to the
+ device. This is true for all OS version 3.5 devices, and most devices
+ that have had a flash upgrade to a newer version of the OS. See the
+ kernel system log for information on which is the correct port to use.
+
+ If after pressing the sync button, nothing shows up in the system log,
+ try resetting the device, first a hot reset, and then a cold reset if
+ necessary. Some devices need this before they can talk to the USB port
+ properly.
+
+ Devices that are not compiled into the kernel can be specified with module
+ parameters. e.g. modprobe visor vendor=0x54c product=0x66
+
+ There is a webpage and mailing lists for this portion of the driver at:
+ http://usbvisor.sourceforge.net/
+
+ For any questions or problems with this driver, please contact Greg
+ Kroah-Hartman at greg@kroah.com
+
+
+PocketPC PDA Driver
+
+ This driver can be used to connect to Compaq iPAQ, HP Jornada, Casio EM500
+ and other PDAs running Windows CE 3.0 or PocketPC 2002 using a USB
+ cable/cradle.
+ Most devices supported by ActiveSync are supported out of the box.
+ For others, please use module parameters to specify the product and vendor
+ id. e.g. modprobe ipaq vendor=0x3f0 product=0x1125
+
+ The driver presents a serial interface (usually on /dev/ttyUSB0) over
+ which one may run ppp and establish a TCP/IP link to the PDA. Once this
+ is done, you can transfer files, backup, download email etc. The most
+ significant advantage of using USB is speed - I can get 73 to 113
+ kbytes/sec for download/upload to my iPAQ.
+
+ This driver is only one of a set of components required to utilize
+ the USB connection. Please visit http://synce.sourceforge.net which
+ contains the necessary packages and a simple step-by-step howto.
+
+ Once connected, you can use Win CE programs like ftpView, Pocket Outlook
+ from the PDA and xcerdisp, synce utilities from the Linux side.
+
+ To use Pocket IE, follow the instructions given at
+ http://www.tekguru.co.uk/EM500/usbtonet.htm to achieve the same thing
+ on Win98. Omit the proxy server part; Linux is quite capable of forwarding
+ packets unlike Win98. Another modification is required at least for the
+ iPAQ - disable autosync by going to the Start/Settings/Connections menu
+ and unchecking the "Automatically synchronize ..." box. Go to
+ Start/Programs/Connections, connect the cable and select "usbdial" (or
+ whatever you named your new USB connection). You should finally wind
+ up with a "Connected to usbdial" window with status shown as connected.
+ Now start up PIE and browse away.
+
+ If it doesn't work for some reason, load both the usbserial and ipaq module
+ with the module parameter "debug" set to 1 and examine the system log.
+ You can also try soft-resetting your PDA before attempting a connection.
+
+ Other functionality may be possible depending on your PDA. According to
+ Wes Cilldhaire <billybobjoehenrybob@hotmail.com>, with the Toshiba E570,
+ ...if you boot into the bootloader (hold down the power when hitting the
+ reset button, continuing to hold onto the power until the bootloader screen
+ is displayed), then put it in the cradle with the ipaq driver loaded, open
+ a terminal on /dev/ttyUSB0, it gives you a "USB Reflash" terminal, which can
+ be used to flash the ROM, as well as the microP code.. so much for needing
+ Toshiba's $350 serial cable for flashing!! :D
+ NOTE: This has NOT been tested. Use at your own risk.
+
+ For any questions or problems with the driver, please contact Ganesh
+ Varadarajan <ganesh@veritas.com>
+
+
+Keyspan PDA Serial Adapter
+
+ Single port DB-9 serial adapter, pushed as a PDA adapter for iMacs (mostly
+ sold in Macintosh catalogs, comes in a translucent white/green dongle).
+ Fairly simple device. Firmware is homebrew.
+ This driver also works for the Xircom/Entrgra single port serial adapter.
+
+ Current status:
+ Things that work:
+ basic input/output (tested with 'cu')
+ blocking write when serial line can't keep up
+ changing baud rates (up to 115200)
+ getting/setting modem control pins (TIOCM{GET,SET,BIS,BIC})
+ sending break (although duration looks suspect)
+ Things that don't:
+ device strings (as logged by kernel) have trailing binary garbage
+ device ID isn't right, might collide with other Keyspan products
+ changing baud rates ought to flush tx/rx to avoid mangled half characters
+ Big Things on the todo list:
+ parity, 7 vs 8 bits per char, 1 or 2 stop bits
+ HW flow control
+ not all of the standard USB descriptors are handled: Get_Status, Set_Feature
+ O_NONBLOCK, select()
+
+ For any questions or problems with this driver, please contact Brian
+ Warner at warner@lothar.com
+
+
+Keyspan USA-series Serial Adapters
+
+ Single, Dual and Quad port adapters - driver uses Keyspan supplied
+ firmware and is being developed with their support.
+
+ Current status:
+ The USA-18X, USA-28X, USA-19, USA-19W and USA-49W are supported and
+ have been pretty thoroughly tested at various baud rates with 8-N-1
+ character settings. Other character lengths and parity setups are
+ presently untested.
+
+ The USA-28 isn't yet supported though doing so should be pretty
+ straightforward. Contact the maintainer if you require this
+ functionality.
+
+ More information is available at:
+ http://misc.nu/hugh/keyspan.html
+
+ For any questions or problems with this driver, please contact Hugh
+ Blemings at hugh@misc.nu
+
+
+FTDI Single Port Serial Driver
+
+ This is a single port DB-25 serial adapter.
+
+ For any questions or problems with this driver, please contact Bill Ryder.
+
+
+ZyXEL omni.net lcd plus ISDN TA
+
+ This is an ISDN TA. Please report both successes and troubles to
+ azummo@towertech.it
+
+
+Cypress M8 CY4601 Family Serial Driver
+
+ This driver was in most part developed by Neil "koyama" Whelchel. It
+ has been improved since that previous form to support dynamic serial
+ line settings and improved line handling. The driver is for the most
+ part stable and has been tested on an smp machine. (dual p2)
+
+ Chipsets supported under CY4601 family:
+
+ CY7C63723, CY7C63742, CY7C63743, CY7C64013
+
+ Devices supported:
+
+ -DeLorme's USB Earthmate (SiRF Star II lp arch)
+ -Cypress HID->COM RS232 adapter
+
+ Note: Cypress Semiconductor claims no affiliation with the
+ hid->com device.
+
+ Most devices using chipsets under the CY4601 family should
+ work with the driver. As long as they stay true to the CY4601
+ usbserial specification.
+
+ Technical notes:
+
+ The Earthmate starts out at 4800 8N1 by default... the driver will
+ upon start init to this setting. usbserial core provides the rest
+ of the termios settings, along with some custom termios so that the
+ output is in proper format and parsable.
+
+ The device can be put into sirf mode by issuing NMEA command:
+ $PSRF100,<protocol>,<baud>,<databits>,<stopbits>,<parity>*CHECKSUM
+ $PSRF100,0,9600,8,1,0*0C
+
+ It should then be sufficient to change the port termios to match this
+ to begin communicating.
+
+ As far as I can tell it supports pretty much every sirf command as
+ documented online available with firmware 2.31, with some unknown
+ message ids.
+
+ The hid->com adapter can run at a maximum baud of 115200bps. Please note
+ that the device has trouble or is incapable of raising line voltage properly.
+ It will be fine with null modem links, as long as you do not try to link two
+ together without hacking the adapter to set the line high.
+
+ The driver is smp safe. Performance with the driver is rather low when using
+ it for transferring files. This is being worked on, but I would be willing to
+ accept patches. An urb queue or packet buffer would likely fit the bill here.
+
+ If you have any questions, problems, patches, feature requests, etc. you can
+ contact me here via email:
+ dignome@gmail.com
+ (your problems/patches can alternately be submitted to usb-devel)
+
+
+Digi AccelePort Driver
+
+ This driver supports the Digi AccelePort USB 2 and 4 devices, 2 port
+ (plus a parallel port) and 4 port USB serial converters. The driver
+ does NOT yet support the Digi AccelePort USB 8.
+
+ This driver works under SMP with the usb-uhci driver. It does not
+ work under SMP with the uhci driver.
+
+ The driver is generally working, though we still have a few more ioctls
+ to implement and final testing and debugging to do. The parallel port
+ on the USB 2 is supported as a serial to parallel converter; in other
+ words, it appears as another USB serial port on Linux, even though
+ physically it is really a parallel port. The Digi Acceleport USB 8
+ is not yet supported.
+
+ Please contact Peter Berger (pberger@brimson.com) or Al Borchers
+ (alborchers@steinerpoint.com) for questions or problems with this
+ driver.
+
+
+Belkin USB Serial Adapter F5U103
+
+ Single port DB-9/PS-2 serial adapter from Belkin with firmware by eTEK Labs.
+ The Peracom single port serial adapter also works with this driver, as
+ well as the GoHubs adapter.
+
+ Current status:
+ The following have been tested and work:
+ Baud rate 300-230400
+ Data bits 5-8
+ Stop bits 1-2
+ Parity N,E,O,M,S
+ Handshake None, Software (XON/XOFF), Hardware (CTSRTS,CTSDTR)*
+ Break Set and clear
+ Line control Input/Output query and control **
+
+ * Hardware input flow control is only enabled for firmware
+ levels above 2.06. Read source code comments describing Belkin
+ firmware errata. Hardware output flow control is working for all
+ firmware versions.
+ ** Queries of inputs (CTS,DSR,CD,RI) show the last
+ reported state. Queries of outputs (DTR,RTS) show the last
+ requested state and may not reflect current state as set by
+ automatic hardware flow control.
+
+ TO DO List:
+ -- Add true modem control line query capability. Currently tracks the
+ states reported by the interrupt and the states requested.
+ -- Add error reporting back to application for UART error conditions.
+ -- Add support for flush ioctls.
+ -- Add everything else that is missing :)
+
+ For any questions or problems with this driver, please contact William
+ Greathouse at wgreathouse@smva.com
+
+
+Empeg empeg-car Mark I/II Driver
+
+ This is an experimental driver to provide connectivity support for the
+ client synchronization tools for an Empeg empeg-car mp3 player.
+
+ Tips:
+ * Don't forget to create the device nodes for ttyUSB{0,1,2,...}
+ * modprobe empeg (modprobe is your friend)
+ * emptool --usb /dev/ttyUSB0 (or whatever you named your device node)
+
+ For any questions or problems with this driver, please contact Gary
+ Brubaker at xavyer@ix.netcom.com
+
+
+MCT USB Single Port Serial Adapter U232
+
+ This driver is for the MCT USB-RS232 Converter (25 pin, Model No.
+ U232-P25) from Magic Control Technology Corp. (there is also a 9 pin
+ Model No. U232-P9). More information about this device can be found at
+ the manufacturer's web-site: http://www.mct.com.tw.
+
+ The driver is generally working, though it still needs some more testing.
+ It is derived from the Belkin USB Serial Adapter F5U103 driver and its
+ TODO list is valid for this driver as well.
+
+ This driver has also been found to work for other products, which have
+ the same Vendor ID but different Product IDs. Sitecom's U232-P25 serial
+ converter uses Product ID 0x230 and Vendor ID 0x711 and works with this
+ driver. Also, D-Link's DU-H3SP USB BAY also works with this driver.
+
+ For any questions or problems with this driver, please contact Wolfgang
+ Grandegger at wolfgang@ces.ch
+
+
+Inside Out Networks Edgeport Driver
+
+ This driver supports all devices made by Inside Out Networks, specifically
+ the following models:
+ Edgeport/4
+ Rapidport/4
+ Edgeport/4t
+ Edgeport/2
+ Edgeport/4i
+ Edgeport/2i
+ Edgeport/421
+ Edgeport/21
+ Edgeport/8
+ Edgeport/8 Dual
+ Edgeport/2D8
+ Edgeport/4D8
+ Edgeport/8i
+ Edgeport/2 DIN
+ Edgeport/4 DIN
+ Edgeport/16 Dual
+
+ For any questions or problems with this driver, please contact Greg
+ Kroah-Hartman at greg@kroah.com
+
+
+REINER SCT cyberJack pinpad/e-com USB chipcard reader
+
+ Interface to ISO 7816 compatible contactbased chipcards, e.g. GSM SIMs.
+
+ Current status:
+ This is the kernel part of the driver for this USB card reader.
+ There is also a user part for a CT-API driver available. A site
+ for downloading is TBA. For now, you can request it from the
+ maintainer (linux-usb@sii.li).
+
+ For any questions or problems with this driver, please contact
+ linux-usb@sii.li
+
+
+Prolific PL2303 Driver
+
+ This driver supports any device that has the PL2303 chip from Prolific
+ in it. This includes a number of single port USB to serial
+ converters and USB GPS devices. Devices from Aten (the UC-232) and
+ IO-Data work with this driver, as does the DCU-11 mobile-phone cable.
+
+ For any questions or problems with this driver, please contact Greg
+ Kroah-Hartman at greg@kroah.com
+
+
+KL5KUSB105 chipset / PalmConnect USB single-port adapter
+
+Current status:
+ The driver was put together by looking at the usb bus transactions
+ done by Palm's driver under Windows, so a lot of functionality is
+ still missing. Notably, serial ioctls are sometimes faked or not yet
+ implemented. Support for finding out about DSR and CTS line status is
+ however implemented (though not nicely), so your favorite autopilot(1)
+ and pilot-manager -daemon calls will work. Baud rates up to 115200
+ are supported, but handshaking (software or hardware) is not, which is
+ why it is wise to cut down on the rate used is wise for large
+ transfers until this is settled.
+
+Options supported:
+ If this driver is compiled as a module you can pass the following
+ options to it:
+ debug - extra verbose debugging info
+ (default: 0; nonzero enables)
+ use_lowlatency - use low_latency flag to speed up tty layer
+ when reading from the device.
+ (default: 0; nonzero enables)
+
+ See http://www.uuhaus.de/linux/palmconnect.html for up-to-date
+ information on this driver.
+
+Winchiphead CH341 Driver
+
+ This driver is for the Winchiphead CH341 USB-RS232 Converter. This chip
+ also implements an IEEE 1284 parallel port, I2C and SPI, but that is not
+ supported by the driver. The protocol was analyzed from the behaviour
+ of the Windows driver, no datasheet is available at present.
+ The manufacturer's website: http://www.winchiphead.com/.
+ For any questions or problems with this driver, please contact
+ frank@kingswood-consulting.co.uk.
+
+
+Generic Serial driver
+
+ If your device is not one of the above listed devices, compatible with
+ the above models, you can try out the "generic" interface. This
+ interface does not provide any type of control messages sent to the
+ device, and does not support any kind of device flow control. All that
+ is required of your device is that it has at least one bulk in endpoint,
+ or one bulk out endpoint.
+
+ To enable the generic driver to recognize your device, build the driver
+ as a module and load it by the following invocation:
+ insmod usbserial vendor=0x#### product=0x####
+ where the #### is replaced with the hex representation of your device's
+ vendor id and product id.
+
+ This driver has been successfully used to connect to the NetChip USB
+ development board, providing a way to develop USB firmware without
+ having to write a custom driver.
+
+ For any questions or problems with this driver, please contact Greg
+ Kroah-Hartman at greg@kroah.com
+
+
+CONTACT:
+
+ If anyone has any problems using these drivers, with any of the above
+ specified products, please contact the specific driver's author listed
+ above, or join the Linux-USB mailing list (information on joining the
+ mailing list, as well as a link to its searchable archive is at
+ http://www.linux-usb.org/ )
+
+
+Greg Kroah-Hartman
+greg@kroah.com
diff --git a/Documentation/usb/usbmon.txt b/Documentation/usb/usbmon.txt
new file mode 100644
index 0000000..2704819
--- /dev/null
+++ b/Documentation/usb/usbmon.txt
@@ -0,0 +1,362 @@
+* Introduction
+
+The name "usbmon" in lowercase refers to a facility in kernel which is
+used to collect traces of I/O on the USB bus. This function is analogous
+to a packet socket used by network monitoring tools such as tcpdump(1)
+or Ethereal. Similarly, it is expected that a tool such as usbdump or
+USBMon (with uppercase letters) is used to examine raw traces produced
+by usbmon.
+
+The usbmon reports requests made by peripheral-specific drivers to Host
+Controller Drivers (HCD). So, if HCD is buggy, the traces reported by
+usbmon may not correspond to bus transactions precisely. This is the same
+situation as with tcpdump.
+
+* How to use usbmon to collect raw text traces
+
+Unlike the packet socket, usbmon has an interface which provides traces
+in a text format. This is used for two purposes. First, it serves as a
+common trace exchange format for tools while more sophisticated formats
+are finalized. Second, humans can read it in case tools are not available.
+
+To collect a raw text trace, execute following steps.
+
+1. Prepare
+
+Mount debugfs (it has to be enabled in your kernel configuration), and
+load the usbmon module (if built as module). The second step is skipped
+if usbmon is built into the kernel.
+
+# mount -t debugfs none_debugs /sys/kernel/debug
+# modprobe usbmon
+#
+
+Verify that bus sockets are present.
+
+# ls /sys/kernel/debug/usbmon
+0s 0u 1s 1t 1u 2s 2t 2u 3s 3t 3u 4s 4t 4u
+#
+
+Now you can choose to either use the socket '0u' (to capture packets on all
+buses), and skip to step #3, or find the bus used by your device with step #2.
+This allows to filter away annoying devices that talk continuously.
+
+2. Find which bus connects to the desired device
+
+Run "cat /proc/bus/usb/devices", and find the T-line which corresponds to
+the device. Usually you do it by looking for the vendor string. If you have
+many similar devices, unplug one and compare two /proc/bus/usb/devices outputs.
+The T-line will have a bus number. Example:
+
+T: Bus=03 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=12 MxCh= 0
+D: Ver= 1.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+P: Vendor=0557 ProdID=2004 Rev= 1.00
+S: Manufacturer=ATEN
+S: Product=UC100KM V2.00
+
+Bus=03 means it's bus 3.
+
+3. Start 'cat'
+
+# cat /sys/kernel/debug/usbmon/3u > /tmp/1.mon.out
+
+to listen on a single bus, otherwise, to listen on all buses, type:
+
+# cat /sys/kernel/debug/usbmon/0u > /tmp/1.mon.out
+
+This process will be reading until killed. Naturally, the output can be
+redirected to a desirable location. This is preferred, because it is going
+to be quite long.
+
+4. Perform the desired operation on the USB bus
+
+This is where you do something that creates the traffic: plug in a flash key,
+copy files, control a webcam, etc.
+
+5. Kill cat
+
+Usually it's done with a keyboard interrupt (Control-C).
+
+At this point the output file (/tmp/1.mon.out in this example) can be saved,
+sent by e-mail, or inspected with a text editor. In the last case make sure
+that the file size is not excessive for your favourite editor.
+
+* Raw text data format
+
+Two formats are supported currently: the original, or '1t' format, and
+the '1u' format. The '1t' format is deprecated in kernel 2.6.21. The '1u'
+format adds a few fields, such as ISO frame descriptors, interval, etc.
+It produces slightly longer lines, but otherwise is a perfect superset
+of '1t' format.
+
+If it is desired to recognize one from the other in a program, look at the
+"address" word (see below), where '1u' format adds a bus number. If 2 colons
+are present, it's the '1t' format, otherwise '1u'.
+
+Any text format data consists of a stream of events, such as URB submission,
+URB callback, submission error. Every event is a text line, which consists
+of whitespace separated words. The number or position of words may depend
+on the event type, but there is a set of words, common for all types.
+
+Here is the list of words, from left to right:
+
+- URB Tag. This is used to identify URBs, and is normally an in-kernel address
+ of the URB structure in hexadecimal, but can be a sequence number or any
+ other unique string, within reason.
+
+- Timestamp in microseconds, a decimal number. The timestamp's resolution
+ depends on available clock, and so it can be much worse than a microsecond
+ (if the implementation uses jiffies, for example).
+
+- Event Type. This type refers to the format of the event, not URB type.
+ Available types are: S - submission, C - callback, E - submission error.
+
+- "Address" word (formerly a "pipe"). It consists of four fields, separated by
+ colons: URB type and direction, Bus number, Device address, Endpoint number.
+ Type and direction are encoded with two bytes in the following manner:
+ Ci Co Control input and output
+ Zi Zo Isochronous input and output
+ Ii Io Interrupt input and output
+ Bi Bo Bulk input and output
+ Bus number, Device address, and Endpoint are decimal numbers, but they may
+ have leading zeros, for the sake of human readers.
+
+- URB Status word. This is either a letter, or several numbers separated
+ by colons: URB status, interval, start frame, and error count. Unlike the
+ "address" word, all fields save the status are optional. Interval is printed
+ only for interrupt and isochronous URBs. Start frame is printed only for
+ isochronous URBs. Error count is printed only for isochronous callback
+ events.
+
+ The status field is a decimal number, sometimes negative, which represents
+ a "status" field of the URB. This field makes no sense for submissions, but
+ is present anyway to help scripts with parsing. When an error occurs, the
+ field contains the error code.
+
+ In case of a submission of a Control packet, this field contains a Setup Tag
+ instead of an group of numbers. It is easy to tell whether the Setup Tag is
+ present because it is never a number. Thus if scripts find a set of numbers
+ in this word, they proceed to read Data Length (except for isochronous URBs).
+ If they find something else, like a letter, they read the setup packet before
+ reading the Data Length or isochronous descriptors.
+
+- Setup packet, if present, consists of 5 words: one of each for bmRequestType,
+ bRequest, wValue, wIndex, wLength, as specified by the USB Specification 2.0.
+ These words are safe to decode if Setup Tag was 's'. Otherwise, the setup
+ packet was present, but not captured, and the fields contain filler.
+
+- Number of isochronous frame descriptors and descriptors themselves.
+ If an Isochronous transfer event has a set of descriptors, a total number
+ of them in an URB is printed first, then a word per descriptor, up to a
+ total of 5. The word consists of 3 colon-separated decimal numbers for
+ status, offset, and length respectively. For submissions, initial length
+ is reported. For callbacks, actual length is reported.
+
+- Data Length. For submissions, this is the requested length. For callbacks,
+ this is the actual length.
+
+- Data tag. The usbmon may not always capture data, even if length is nonzero.
+ The data words are present only if this tag is '='.
+
+- Data words follow, in big endian hexadecimal format. Notice that they are
+ not machine words, but really just a byte stream split into words to make
+ it easier to read. Thus, the last word may contain from one to four bytes.
+ The length of collected data is limited and can be less than the data length
+ report in Data Length word.
+
+Here is an example of code to read the data stream in a well known programming
+language:
+
+class ParsedLine {
+ int data_len; /* Available length of data */
+ byte data[];
+
+ void parseData(StringTokenizer st) {
+ int availwords = st.countTokens();
+ data = new byte[availwords * 4];
+ data_len = 0;
+ while (st.hasMoreTokens()) {
+ String data_str = st.nextToken();
+ int len = data_str.length() / 2;
+ int i;
+ int b; // byte is signed, apparently?! XXX
+ for (i = 0; i < len; i++) {
+ // data[data_len] = Byte.parseByte(
+ // data_str.substring(i*2, i*2 + 2),
+ // 16);
+ b = Integer.parseInt(
+ data_str.substring(i*2, i*2 + 2),
+ 16);
+ if (b >= 128)
+ b *= -1;
+ data[data_len] = (byte) b;
+ data_len++;
+ }
+ }
+ }
+}
+
+Examples:
+
+An input control transfer to get a port status.
+
+d5ea89a0 3575914555 S Ci:1:001:0 s a3 00 0000 0003 0004 4 <
+d5ea89a0 3575914560 C Ci:1:001:0 0 4 = 01050000
+
+An output bulk transfer to send a SCSI command 0x5E in a 31-byte Bulk wrapper
+to a storage device at address 5:
+
+dd65f0e8 4128379752 S Bo:1:005:2 -115 31 = 55534243 5e000000 00000000 00000600 00000000 00000000 00000000 000000
+dd65f0e8 4128379808 C Bo:1:005:2 0 31 >
+
+* Raw binary format and API
+
+The overall architecture of the API is about the same as the one above,
+only the events are delivered in binary format. Each event is sent in
+the following structure (its name is made up, so that we can refer to it):
+
+struct usbmon_packet {
+ u64 id; /* 0: URB ID - from submission to callback */
+ unsigned char type; /* 8: Same as text; extensible. */
+ unsigned char xfer_type; /* ISO (0), Intr, Control, Bulk (3) */
+ unsigned char epnum; /* Endpoint number and transfer direction */
+ unsigned char devnum; /* Device address */
+ u16 busnum; /* 12: Bus number */
+ char flag_setup; /* 14: Same as text */
+ char flag_data; /* 15: Same as text; Binary zero is OK. */
+ s64 ts_sec; /* 16: gettimeofday */
+ s32 ts_usec; /* 24: gettimeofday */
+ int status; /* 28: */
+ unsigned int length; /* 32: Length of data (submitted or actual) */
+ unsigned int len_cap; /* 36: Delivered length */
+ unsigned char setup[8]; /* 40: Only for Control 'S' */
+}; /* 48 bytes total */
+
+These events can be received from a character device by reading with read(2),
+with an ioctl(2), or by accessing the buffer with mmap.
+
+The character device is usually called /dev/usbmonN, where N is the USB bus
+number. Number zero (/dev/usbmon0) is special and means "all buses".
+However, this feature is not implemented yet. Note that specific naming
+policy is set by your Linux distribution.
+
+If you create /dev/usbmon0 by hand, make sure that it is owned by root
+and has mode 0600. Otherwise, unpriviledged users will be able to snoop
+keyboard traffic.
+
+The following ioctl calls are available, with MON_IOC_MAGIC 0x92:
+
+ MON_IOCQ_URB_LEN, defined as _IO(MON_IOC_MAGIC, 1)
+
+This call returns the length of data in the next event. Note that majority of
+events contain no data, so if this call returns zero, it does not mean that
+no events are available.
+
+ MON_IOCG_STATS, defined as _IOR(MON_IOC_MAGIC, 3, struct mon_bin_stats)
+
+The argument is a pointer to the following structure:
+
+struct mon_bin_stats {
+ u32 queued;
+ u32 dropped;
+};
+
+The member "queued" refers to the number of events currently queued in the
+buffer (and not to the number of events processed since the last reset).
+
+The member "dropped" is the number of events lost since the last call
+to MON_IOCG_STATS.
+
+ MON_IOCT_RING_SIZE, defined as _IO(MON_IOC_MAGIC, 4)
+
+This call sets the buffer size. The argument is the size in bytes.
+The size may be rounded down to the next chunk (or page). If the requested
+size is out of [unspecified] bounds for this kernel, the call fails with
+-EINVAL.
+
+ MON_IOCQ_RING_SIZE, defined as _IO(MON_IOC_MAGIC, 5)
+
+This call returns the current size of the buffer in bytes.
+
+ MON_IOCX_GET, defined as _IOW(MON_IOC_MAGIC, 6, struct mon_get_arg)
+
+This call waits for events to arrive if none were in the kernel buffer,
+then returns the first event. Its argument is a pointer to the following
+structure:
+
+struct mon_get_arg {
+ struct usbmon_packet *hdr;
+ void *data;
+ size_t alloc; /* Length of data (can be zero) */
+};
+
+Before the call, hdr, data, and alloc should be filled. Upon return, the area
+pointed by hdr contains the next event structure, and the data buffer contains
+the data, if any. The event is removed from the kernel buffer.
+
+ MON_IOCX_MFETCH, defined as _IOWR(MON_IOC_MAGIC, 7, struct mon_mfetch_arg)
+
+This ioctl is primarily used when the application accesses the buffer
+with mmap(2). Its argument is a pointer to the following structure:
+
+struct mon_mfetch_arg {
+ uint32_t *offvec; /* Vector of events fetched */
+ uint32_t nfetch; /* Number of events to fetch (out: fetched) */
+ uint32_t nflush; /* Number of events to flush */
+};
+
+The ioctl operates in 3 stages.
+
+First, it removes and discards up to nflush events from the kernel buffer.
+The actual number of events discarded is returned in nflush.
+
+Second, it waits for an event to be present in the buffer, unless the pseudo-
+device is open with O_NONBLOCK.
+
+Third, it extracts up to nfetch offsets into the mmap buffer, and stores
+them into the offvec. The actual number of event offsets is stored into
+the nfetch.
+
+ MON_IOCH_MFLUSH, defined as _IO(MON_IOC_MAGIC, 8)
+
+This call removes a number of events from the kernel buffer. Its argument
+is the number of events to remove. If the buffer contains fewer events
+than requested, all events present are removed, and no error is reported.
+This works when no events are available too.
+
+ FIONBIO
+
+The ioctl FIONBIO may be implemented in the future, if there's a need.
+
+In addition to ioctl(2) and read(2), the special file of binary API can
+be polled with select(2) and poll(2). But lseek(2) does not work.
+
+* Memory-mapped access of the kernel buffer for the binary API
+
+The basic idea is simple:
+
+To prepare, map the buffer by getting the current size, then using mmap(2).
+Then, execute a loop similar to the one written in pseudo-code below:
+
+ struct mon_mfetch_arg fetch;
+ struct usbmon_packet *hdr;
+ int nflush = 0;
+ for (;;) {
+ fetch.offvec = vec; // Has N 32-bit words
+ fetch.nfetch = N; // Or less than N
+ fetch.nflush = nflush;
+ ioctl(fd, MON_IOCX_MFETCH, &fetch); // Process errors, too
+ nflush = fetch.nfetch; // This many packets to flush when done
+ for (i = 0; i < nflush; i++) {
+ hdr = (struct ubsmon_packet *) &mmap_area[vec[i]];
+ if (hdr->type == '@') // Filler packet
+ continue;
+ caddr_t data = &mmap_area[vec[i]] + 64;
+ process_packet(hdr, data);
+ }
+ }
+
+Thus, the main idea is to execute only one ioctl per N events.
+
+Although the buffer is circular, the returned headers and data do not cross
+the end of the buffer, so the above pseudo-code does not need any gathering.
diff --git a/Documentation/usb/wusb-cbaf b/Documentation/usb/wusb-cbaf
new file mode 100644
index 0000000..2e78b70
--- /dev/null
+++ b/Documentation/usb/wusb-cbaf
@@ -0,0 +1,139 @@
+#! /bin/bash
+#
+
+set -e
+
+progname=$(basename $0)
+function help
+{
+ cat <<EOF
+Usage: $progname COMMAND DEVICEs [ARGS]
+
+Command for manipulating the pairing/authentication credentials of a
+Wireless USB device that supports wired-mode Cable-Based-Association.
+
+Works in conjunction with the wusb-cba.ko driver from http://linuxuwb.org.
+
+
+DEVICE
+
+ sysfs path to the device to authenticate; for example, both this
+ guys are the same:
+
+ /sys/devices/pci0000:00/0000:00:1d.7/usb1/1-4/1-4.4/1-4.4:1.1
+ /sys/bus/usb/drivers/wusb-cbaf/1-4.4:1.1
+
+COMMAND/ARGS are
+
+ start
+
+ Start a WUSB host controller (by setting up a CHID)
+
+ set-chid DEVICE HOST-CHID HOST-BANDGROUP HOST-NAME
+
+ Sets host information in the device; after this you can call the
+ get-cdid to see how does this device report itself to us.
+
+ get-cdid DEVICE
+
+ Get the device ID associated to the HOST-CHDI we sent with
+ 'set-chid'. We might not know about it.
+
+ set-cc DEVICE
+
+ If we allow the device to connect, set a random new CDID and CK
+ (connection key). Device saves them for the next time it wants to
+ connect wireless. We save them for that next time also so we can
+ authenticate the device (when we see the CDID he uses to id
+ itself) and the CK to crypto talk to it.
+
+CHID is always 16 hex bytes in 'XX YY ZZ...' form
+BANDGROUP is almost always 0001
+
+Examples:
+
+ You can default most arguments to '' to get a sane value:
+
+ $ $progname set-chid '' '' '' "My host name"
+
+ A full sequence:
+
+ $ $progname set-chid '' '' '' "My host name"
+ $ $progname get-cdid ''
+ $ $progname set-cc ''
+
+EOF
+}
+
+
+# Defaults
+# FIXME: CHID should come from a database :), band group from the host
+host_CHID="00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff"
+host_band_group="0001"
+host_name=$(hostname)
+
+devs="$(echo /sys/bus/usb/drivers/wusb-cbaf/[0-9]*)"
+hdevs="$(for h in /sys/class/uwb_rc/*/wusbhc; do readlink -f $h; done)"
+
+result=0
+case $1 in
+ start)
+ for dev in ${2:-$hdevs}
+ do
+ uwb_rc=$(readlink -f $dev/uwb_rc)
+ if cat $uwb_rc/beacon | grep -q -- "-1"
+ then
+ echo 13 0 > $uwb_rc/beacon
+ echo I: started beaconing on ch 13 on $(basename $uwb_rc) >&2
+ fi
+ echo $host_CHID > $dev/wusb_chid
+ echo I: started host $(basename $dev) >&2
+ done
+ ;;
+ stop)
+ for dev in ${2:-$hdevs}
+ do
+ echo 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > $dev/wusb_chid
+ echo I: stopped host $(basename $dev) >&2
+ uwb_rc=$(readlink -f $dev/uwb_rc)
+ echo -1 | cat > $uwb_rc/beacon
+ echo I: stopped beaconing on $(basename $uwb_rc) >&2
+ done
+ ;;
+ set-chid)
+ shift
+ for dev in ${2:-$devs}; do
+ echo "${4:-$host_name}" > $dev/wusb_host_name
+ echo "${3:-$host_band_group}" > $dev/wusb_host_band_groups
+ echo ${2:-$host_CHID} > $dev/wusb_chid
+ done
+ ;;
+ get-cdid)
+ for dev in ${2:-$devs}
+ do
+ cat $dev/wusb_cdid
+ done
+ ;;
+ set-cc)
+ for dev in ${2:-$devs}; do
+ shift
+ CDID="$(head --bytes=16 /dev/urandom | od -tx1 -An)"
+ CK="$(head --bytes=16 /dev/urandom | od -tx1 -An)"
+ echo "$CDID" > $dev/wusb_cdid
+ echo "$CK" > $dev/wusb_ck
+
+ echo I: CC set >&2
+ echo "CHID: $(cat $dev/wusb_chid)"
+ echo "CDID:$CDID"
+ echo "CK: $CK"
+ done
+ ;;
+ help|h|--help|-h)
+ help
+ ;;
+ *)
+ echo "E: Unknown usage" 1>&2
+ help 1>&2
+ result=1
+esac
+exit $result
diff --git a/Documentation/video-output.txt b/Documentation/video-output.txt
new file mode 100644
index 0000000..e517011
--- /dev/null
+++ b/Documentation/video-output.txt
@@ -0,0 +1,34 @@
+
+ Video Output Switcher Control
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ 2006 luming.yu@intel.com
+
+The output sysfs class driver provides an abstract video output layer that
+can be used to hook platform specific methods to enable/disable video output
+device through common sysfs interface. For example, on my IBM ThinkPad T42
+laptop, The ACPI video driver registered its output devices and read/write
+method for 'state' with output sysfs class. The user interface under sysfs is:
+
+linux:/sys/class/video_output # tree .
+.
+|-- CRT0
+| |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+| |-- state
+| |-- subsystem -> ../../../class/video_output
+| `-- uevent
+|-- DVI0
+| |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+| |-- state
+| |-- subsystem -> ../../../class/video_output
+| `-- uevent
+|-- LCD0
+| |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+| |-- state
+| |-- subsystem -> ../../../class/video_output
+| `-- uevent
+`-- TV0
+ |-- device -> ../../../devices/pci0000:00/0000:00:01.0
+ |-- state
+ |-- subsystem -> ../../../class/video_output
+ `-- uevent
+
diff --git a/Documentation/video4linux/.gitignore b/Documentation/video4linux/.gitignore
new file mode 100644
index 0000000..9527039
--- /dev/null
+++ b/Documentation/video4linux/.gitignore
@@ -0,0 +1 @@
+v4lgrab
diff --git a/Documentation/video4linux/API.html b/Documentation/video4linux/API.html
new file mode 100644
index 0000000..afbe9ae
--- /dev/null
+++ b/Documentation/video4linux/API.html
@@ -0,0 +1,16 @@
+<TITLE>V4L API</TITLE>
+<H1>Video For Linux APIs</H1>
+<table border=0>
+<tr>
+<td>
+<A HREF=http://www.linuxtv.org/downloads/video4linux/API/V4L1_API.html>
+V4L original API</a>
+</td><td>
+Obsoleted by V4L2 API
+</td></tr><tr><td>
+<A HREF=http://www.linuxtv.org/downloads/video4linux/API/V4L2_API>
+V4L2 API</a>
+</td><td>
+Should be used for new projects
+</td></tr>
+</table>
diff --git a/Documentation/video4linux/CARDLIST.au0828 b/Documentation/video4linux/CARDLIST.au0828
new file mode 100644
index 0000000..d5cb4ea
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.au0828
@@ -0,0 +1,6 @@
+ 0 -> Unknown board (au0828)
+ 1 -> Hauppauge HVR950Q (au0828) [2040:7200,2040:7210,2040:7217,2040:721b,2040:721e,2040:721f,2040:7280,0fd9:0008]
+ 2 -> Hauppauge HVR850 (au0828) [2040:7240]
+ 3 -> DViCO FusionHDTV USB (au0828) [0fe9:d620]
+ 4 -> Hauppauge HVR950Q rev xxF8 (au0828) [2040:7201,2040:7211,2040:7281]
+ 5 -> Hauppauge Woodbury (au0828) [2040:8200]
diff --git a/Documentation/video4linux/CARDLIST.bttv b/Documentation/video4linux/CARDLIST.bttv
new file mode 100644
index 0000000..60ba668
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.bttv
@@ -0,0 +1,153 @@
+ 0 -> *** UNKNOWN/GENERIC ***
+ 1 -> MIRO PCTV
+ 2 -> Hauppauge (bt848)
+ 3 -> STB, Gateway P/N 6000699 (bt848)
+ 4 -> Intel Create and Share PCI/ Smart Video Recorder III
+ 5 -> Diamond DTV2000
+ 6 -> AVerMedia TVPhone
+ 7 -> MATRIX-Vision MV-Delta
+ 8 -> Lifeview FlyVideo II (Bt848) LR26 / MAXI TV Video PCI2 LR26
+ 9 -> IMS/IXmicro TurboTV
+ 10 -> Hauppauge (bt878) [0070:13eb,0070:3900,2636:10b4]
+ 11 -> MIRO PCTV pro
+ 12 -> ADS Technologies Channel Surfer TV (bt848)
+ 13 -> AVerMedia TVCapture 98 [1461:0002,1461:0004,1461:0300]
+ 14 -> Aimslab Video Highway Xtreme (VHX)
+ 15 -> Zoltrix TV-Max [a1a0:a0fc]
+ 16 -> Prolink Pixelview PlayTV (bt878)
+ 17 -> Leadtek WinView 601
+ 18 -> AVEC Intercapture
+ 19 -> Lifeview FlyVideo II EZ /FlyKit LR38 Bt848 (capture only)
+ 20 -> CEI Raffles Card
+ 21 -> Lifeview FlyVideo 98/ Lucky Star Image World ConferenceTV LR50
+ 22 -> Askey CPH050/ Phoebe Tv Master + FM [14ff:3002]
+ 23 -> Modular Technology MM201/MM202/MM205/MM210/MM215 PCTV, bt878 [14c7:0101]
+ 24 -> Askey CPH05X/06X (bt878) [many vendors] [144f:3002,144f:3005,144f:5000,14ff:3000]
+ 25 -> Terratec TerraTV+ Version 1.0 (Bt848)/ Terra TValue Version 1.0/ Vobis TV-Boostar
+ 26 -> Hauppauge WinCam newer (bt878)
+ 27 -> Lifeview FlyVideo 98/ MAXI TV Video PCI2 LR50
+ 28 -> Terratec TerraTV+ Version 1.1 (bt878) [153b:1127,1852:1852]
+ 29 -> Imagenation PXC200 [1295:200a]
+ 30 -> Lifeview FlyVideo 98 LR50 [1f7f:1850]
+ 31 -> Formac iProTV, Formac ProTV I (bt848)
+ 32 -> Intel Create and Share PCI/ Smart Video Recorder III
+ 33 -> Terratec TerraTValue Version Bt878 [153b:1117,153b:1118,153b:1119,153b:111a,153b:1134,153b:5018]
+ 34 -> Leadtek WinFast 2000/ WinFast 2000 XP [107d:6606,107d:6609,6606:217d,f6ff:fff6]
+ 35 -> Lifeview FlyVideo 98 LR50 / Chronos Video Shuttle II [1851:1850,1851:a050]
+ 36 -> Lifeview FlyVideo 98FM LR50 / Typhoon TView TV/FM Tuner [1852:1852]
+ 37 -> Prolink PixelView PlayTV pro
+ 38 -> Askey CPH06X TView99 [144f:3000,144f:a005,a04f:a0fc]
+ 39 -> Pinnacle PCTV Studio/Rave [11bd:0012,bd11:1200,bd11:ff00,11bd:ff12]
+ 40 -> STB TV PCI FM, Gateway P/N 6000704 (bt878), 3Dfx VoodooTV 100 [10b4:2636,10b4:2645,121a:3060]
+ 41 -> AVerMedia TVPhone 98 [1461:0001,1461:0003]
+ 42 -> ProVideo PV951 [aa0c:146c]
+ 43 -> Little OnAir TV
+ 44 -> Sigma TVII-FM
+ 45 -> MATRIX-Vision MV-Delta 2
+ 46 -> Zoltrix Genie TV/FM [15b0:4000,15b0:400a,15b0:400d,15b0:4010,15b0:4016]
+ 47 -> Terratec TV/Radio+ [153b:1123]
+ 48 -> Askey CPH03x/ Dynalink Magic TView
+ 49 -> IODATA GV-BCTV3/PCI [10fc:4020]
+ 50 -> Prolink PV-BT878P+4E / PixelView PlayTV PAK / Lenco MXTV-9578 CP
+ 51 -> Eagle Wireless Capricorn2 (bt878A)
+ 52 -> Pinnacle PCTV Studio Pro
+ 53 -> Typhoon TView RDS + FM Stereo / KNC1 TV Station RDS
+ 54 -> Lifeview FlyVideo 2000 /FlyVideo A2/ Lifetec LT 9415 TV [LR90]
+ 55 -> Askey CPH031/ BESTBUY Easy TV
+ 56 -> Lifeview FlyVideo 98FM LR50 [a051:41a0]
+ 57 -> GrandTec 'Grand Video Capture' (Bt848) [4344:4142]
+ 58 -> Askey CPH060/ Phoebe TV Master Only (No FM)
+ 59 -> Askey CPH03x TV Capturer
+ 60 -> Modular Technology MM100PCTV
+ 61 -> AG Electronics GMV1 [15cb:0101]
+ 62 -> Askey CPH061/ BESTBUY Easy TV (bt878)
+ 63 -> ATI TV-Wonder [1002:0001]
+ 64 -> ATI TV-Wonder VE [1002:0003]
+ 65 -> Lifeview FlyVideo 2000S LR90
+ 66 -> Terratec TValueRadio [153b:1135,153b:ff3b]
+ 67 -> IODATA GV-BCTV4/PCI [10fc:4050]
+ 68 -> 3Dfx VoodooTV FM (Euro) [10b4:2637]
+ 69 -> Active Imaging AIMMS
+ 70 -> Prolink Pixelview PV-BT878P+ (Rev.4C,8E)
+ 71 -> Lifeview FlyVideo 98EZ (capture only) LR51 [1851:1851]
+ 72 -> Prolink Pixelview PV-BT878P+9B (PlayTV Pro rev.9B FM+NICAM) [1554:4011]
+ 73 -> Sensoray 311 [6000:0311]
+ 74 -> RemoteVision MX (RV605)
+ 75 -> Powercolor MTV878/ MTV878R/ MTV878F
+ 76 -> Canopus WinDVR PCI (COMPAQ Presario 3524JP, 5112JP) [0e11:0079]
+ 77 -> GrandTec Multi Capture Card (Bt878)
+ 78 -> Jetway TV/Capture JW-TV878-FBK, Kworld KW-TV878RF [0a01:17de]
+ 79 -> DSP Design TCVIDEO
+ 80 -> Hauppauge WinTV PVR [0070:4500]
+ 81 -> IODATA GV-BCTV5/PCI [10fc:4070,10fc:d018]
+ 82 -> Osprey 100/150 (878) [0070:ff00]
+ 83 -> Osprey 100/150 (848)
+ 84 -> Osprey 101 (848)
+ 85 -> Osprey 101/151
+ 86 -> Osprey 101/151 w/ svid
+ 87 -> Osprey 200/201/250/251
+ 88 -> Osprey 200/250 [0070:ff01]
+ 89 -> Osprey 210/220/230
+ 90 -> Osprey 500 [0070:ff02]
+ 91 -> Osprey 540 [0070:ff04]
+ 92 -> Osprey 2000 [0070:ff03]
+ 93 -> IDS Eagle
+ 94 -> Pinnacle PCTV Sat [11bd:001c]
+ 95 -> Formac ProTV II (bt878)
+ 96 -> MachTV
+ 97 -> Euresys Picolo
+ 98 -> ProVideo PV150 [aa00:1460,aa01:1461,aa02:1462,aa03:1463,aa04:1464,aa05:1465,aa06:1466,aa07:1467]
+ 99 -> AD-TVK503
+100 -> Hercules Smart TV Stereo
+101 -> Pace TV & Radio Card
+102 -> IVC-200 [0000:a155,0001:a155,0002:a155,0003:a155,0100:a155,0101:a155,0102:a155,0103:a155]
+103 -> Grand X-Guard / Trust 814PCI [0304:0102]
+104 -> Nebula Electronics DigiTV [0071:0101]
+105 -> ProVideo PV143 [aa00:1430,aa00:1431,aa00:1432,aa00:1433,aa03:1433]
+106 -> PHYTEC VD-009-X1 MiniDIN (bt878)
+107 -> PHYTEC VD-009-X1 Combi (bt878)
+108 -> PHYTEC VD-009 MiniDIN (bt878)
+109 -> PHYTEC VD-009 Combi (bt878)
+110 -> IVC-100 [ff00:a132]
+111 -> IVC-120G [ff00:a182,ff01:a182,ff02:a182,ff03:a182,ff04:a182,ff05:a182,ff06:a182,ff07:a182,ff08:a182,ff09:a182,ff0a:a182,ff0b:a182,ff0c:a182,ff0d:a182,ff0e:a182,ff0f:a182]
+112 -> pcHDTV HD-2000 TV [7063:2000]
+113 -> Twinhan DST + clones [11bd:0026,1822:0001,270f:fc00,1822:0026]
+114 -> Winfast VC100 [107d:6607]
+115 -> Teppro TEV-560/InterVision IV-560
+116 -> SIMUS GVC1100 [aa6a:82b2]
+117 -> NGS NGSTV+
+118 -> LMLBT4
+119 -> Tekram M205 PRO
+120 -> Conceptronic CONTVFMi
+121 -> Euresys Picolo Tetra [1805:0105,1805:0106,1805:0107,1805:0108]
+122 -> Spirit TV Tuner
+123 -> AVerMedia AVerTV DVB-T 771 [1461:0771]
+124 -> AverMedia AverTV DVB-T 761 [1461:0761]
+125 -> MATRIX Vision Sigma-SQ
+126 -> MATRIX Vision Sigma-SLC
+127 -> APAC Viewcomp 878(AMAX)
+128 -> DViCO FusionHDTV DVB-T Lite [18ac:db10,18ac:db11]
+129 -> V-Gear MyVCD
+130 -> Super TV Tuner
+131 -> Tibet Systems 'Progress DVR' CS16
+132 -> Kodicom 4400R (master)
+133 -> Kodicom 4400R (slave)
+134 -> Adlink RTV24
+135 -> DViCO FusionHDTV 5 Lite [18ac:d500]
+136 -> Acorp Y878F [9511:1540]
+137 -> Conceptronic CTVFMi v2
+138 -> Prolink Pixelview PV-BT878P+ (Rev.2E)
+139 -> Prolink PixelView PlayTV MPEG2 PV-M4900
+140 -> Osprey 440 [0070:ff07]
+141 -> Asound Skyeye PCTV
+142 -> Sabrent TV-FM (bttv version)
+143 -> Hauppauge ImpactVCB (bt878) [0070:13eb]
+144 -> MagicTV
+145 -> SSAI Security Video Interface [4149:5353]
+146 -> SSAI Ultrasound Video Interface [414a:5353]
+147 -> VoodooTV 200 (USA) [121a:3000]
+148 -> DViCO FusionHDTV 2 [dbc0:d200]
+149 -> Typhoon TV-Tuner PCI (50684)
+150 -> Geovision GV-600 [008a:763c]
+151 -> Kozumi KTV-01C
+152 -> Encore ENL TV-FM-2 [1000:1801]
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885
new file mode 100644
index 0000000..64823cc
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.cx23885
@@ -0,0 +1,13 @@
+ 0 -> UNKNOWN/GENERIC [0070:3400]
+ 1 -> Hauppauge WinTV-HVR1800lp [0070:7600]
+ 2 -> Hauppauge WinTV-HVR1800 [0070:7800,0070:7801,0070:7809]
+ 3 -> Hauppauge WinTV-HVR1250 [0070:7911]
+ 4 -> DViCO FusionHDTV5 Express [18ac:d500]
+ 5 -> Hauppauge WinTV-HVR1500Q [0070:7790,0070:7797]
+ 6 -> Hauppauge WinTV-HVR1500 [0070:7710,0070:7717]
+ 7 -> Hauppauge WinTV-HVR1200 [0070:71d1,0070:71d3]
+ 8 -> Hauppauge WinTV-HVR1700 [0070:8101]
+ 9 -> Hauppauge WinTV-HVR1400 [0070:8010]
+ 10 -> DViCO FusionHDTV7 Dual Express [18ac:d618]
+ 11 -> DViCO FusionHDTV DVB-T Dual Express [18ac:db78]
+ 12 -> Leadtek Winfast PxDVR3200 H [107d:6681]
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88
new file mode 100644
index 0000000..a5227e3
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.cx88
@@ -0,0 +1,76 @@
+ 0 -> UNKNOWN/GENERIC
+ 1 -> Hauppauge WinTV 34xxx models [0070:3400,0070:3401]
+ 2 -> GDI Black Gold [14c7:0106,14c7:0107]
+ 3 -> PixelView [1554:4811]
+ 4 -> ATI TV Wonder Pro [1002:00f8]
+ 5 -> Leadtek Winfast 2000XP Expert [107d:6611,107d:6613]
+ 6 -> AverTV Studio 303 (M126) [1461:000b]
+ 7 -> MSI TV-@nywhere Master [1462:8606]
+ 8 -> Leadtek Winfast DV2000 [107d:6620]
+ 9 -> Leadtek PVR 2000 [107d:663b,107d:663c,107d:6632]
+ 10 -> IODATA GV-VCP3/PCI [10fc:d003]
+ 11 -> Prolink PlayTV PVR
+ 12 -> ASUS PVR-416 [1043:4823,1461:c111]
+ 13 -> MSI TV-@nywhere
+ 14 -> KWorld/VStream XPert DVB-T [17de:08a6]
+ 15 -> DViCO FusionHDTV DVB-T1 [18ac:db00]
+ 16 -> KWorld LTV883RF
+ 17 -> DViCO FusionHDTV 3 Gold-Q [18ac:d810,18ac:d800]
+ 18 -> Hauppauge Nova-T DVB-T [0070:9002,0070:9001,0070:9000]
+ 19 -> Conexant DVB-T reference design [14f1:0187]
+ 20 -> Provideo PV259 [1540:2580]
+ 21 -> DViCO FusionHDTV DVB-T Plus [18ac:db10,18ac:db11]
+ 22 -> pcHDTV HD3000 HDTV [7063:3000]
+ 23 -> digitalnow DNTV Live! DVB-T [17de:a8a6]
+ 24 -> Hauppauge WinTV 28xxx (Roslyn) models [0070:2801]
+ 25 -> Digital-Logic MICROSPACE Entertainment Center (MEC) [14f1:0342]
+ 26 -> IODATA GV/BCTV7E [10fc:d035]
+ 27 -> PixelView PlayTV Ultra Pro (Stereo)
+ 28 -> DViCO FusionHDTV 3 Gold-T [18ac:d820]
+ 29 -> ADS Tech Instant TV DVB-T PCI [1421:0334]
+ 30 -> TerraTec Cinergy 1400 DVB-T [153b:1166]
+ 31 -> DViCO FusionHDTV 5 Gold [18ac:d500]
+ 32 -> AverMedia UltraTV Media Center PCI 550 [1461:8011]
+ 33 -> Kworld V-Stream Xpert DVD
+ 34 -> ATI HDTV Wonder [1002:a101]
+ 35 -> WinFast DTV1000-T [107d:665f]
+ 36 -> AVerTV 303 (M126) [1461:000a]
+ 37 -> Hauppauge Nova-S-Plus DVB-S [0070:9201,0070:9202]
+ 38 -> Hauppauge Nova-SE2 DVB-S [0070:9200]
+ 39 -> KWorld DVB-S 100 [17de:08b2,1421:0341]
+ 40 -> Hauppauge WinTV-HVR1100 DVB-T/Hybrid [0070:9400,0070:9402]
+ 41 -> Hauppauge WinTV-HVR1100 DVB-T/Hybrid (Low Profile) [0070:9800,0070:9802]
+ 42 -> digitalnow DNTV Live! DVB-T Pro [1822:0025,1822:0019]
+ 43 -> KWorld/VStream XPert DVB-T with cx22702 [17de:08a1,12ab:2300]
+ 44 -> DViCO FusionHDTV DVB-T Dual Digital [18ac:db50,18ac:db54]
+ 45 -> KWorld HardwareMpegTV XPert [17de:0840,1421:0305]
+ 46 -> DViCO FusionHDTV DVB-T Hybrid [18ac:db40,18ac:db44]
+ 47 -> pcHDTV HD5500 HDTV [7063:5500]
+ 48 -> Kworld MCE 200 Deluxe [17de:0841]
+ 49 -> PixelView PlayTV P7000 [1554:4813]
+ 50 -> NPG Tech Real TV FM Top 10 [14f1:0842]
+ 51 -> WinFast DTV2000 H [107d:665e]
+ 52 -> Geniatech DVB-S [14f1:0084]
+ 53 -> Hauppauge WinTV-HVR3000 TriMode Analog/DVB-S/DVB-T [0070:1404,0070:1400,0070:1401,0070:1402]
+ 54 -> Norwood Micro TV Tuner
+ 55 -> Shenzhen Tungsten Ages Tech TE-DTV-250 / Swann OEM [c180:c980]
+ 56 -> Hauppauge WinTV-HVR1300 DVB-T/Hybrid MPEG Encoder [0070:9600,0070:9601,0070:9602]
+ 57 -> ADS Tech Instant Video PCI [1421:0390]
+ 58 -> Pinnacle PCTV HD 800i [11bd:0051]
+ 59 -> DViCO FusionHDTV 5 PCI nano [18ac:d530]
+ 60 -> Pinnacle Hybrid PCTV [12ab:1788]
+ 61 -> Winfast TV2000 XP Global [107d:6f18]
+ 62 -> PowerColor RA330 [14f1:ea3d]
+ 63 -> Geniatech X8000-MT DVBT [14f1:8852]
+ 64 -> DViCO FusionHDTV DVB-T PRO [18ac:db30]
+ 65 -> DViCO FusionHDTV 7 Gold [18ac:d610]
+ 66 -> Prolink Pixelview MPEG 8000GT [1554:4935]
+ 67 -> Kworld PlusTV HD PCI 120 (ATSC 120) [17de:08c1]
+ 68 -> Hauppauge WinTV-HVR4000 DVB-S/S2/T/Hybrid [0070:6900,0070:6904,0070:6902]
+ 69 -> Hauppauge WinTV-HVR4000(Lite) DVB-S/S2 [0070:6905,0070:6906]
+ 70 -> TeVii S460 DVB-S/S2 [d460:9022]
+ 71 -> Omicom SS4 DVB-S/S2 PCI [A044:2011]
+ 72 -> TBS 8920 DVB-S/S2 [8920:8888]
+ 73 -> TeVii S420 DVB-S [d420:9022]
+ 74 -> Prolink Pixelview Global Extreme [1554:4976]
+ 75 -> PROF 7300 DVB-S/S2 [B033:3033]
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx
new file mode 100644
index 0000000..187cc48
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.em28xx
@@ -0,0 +1,59 @@
+ 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800]
+ 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2820,eb1a:2860,eb1a:2861,eb1a:2870,eb1a:2881,eb1a:2883]
+ 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036]
+ 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208]
+ 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201]
+ 5 -> MSI VOX USB 2.0 (em2820/em2840)
+ 6 -> Terratec Cinergy 200 USB (em2800)
+ 7 -> Leadtek Winfast USB II (em2800) [0413:6023]
+ 8 -> Kworld USB2800 (em2800)
+ 9 -> Pinnacle Dazzle DVC 90/DVC 100 (em2820/em2840) [2304:0207,2304:021a]
+ 10 -> Hauppauge WinTV HVR 900 (em2880) [2040:6500]
+ 11 -> Terratec Hybrid XS (em2880) [0ccd:0042]
+ 12 -> Kworld PVR TV 2800 RF (em2820/em2840)
+ 13 -> Terratec Prodigy XS (em2880) [0ccd:0047]
+ 14 -> Pixelview Prolink PlayTV USB 2.0 (em2820/em2840) [eb1a:2821]
+ 15 -> V-Gear PocketTV (em2800)
+ 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b,2040:651f]
+ 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227]
+ 18 -> Hauppauge WinTV HVR 900 (R2) (em2880) [2040:6502]
+ 19 -> PointNix Intra-Oral Camera (em2860)
+ 20 -> AMD ATI TV Wonder HD 600 (em2880) [0438:b002]
+ 21 -> eMPIA Technology, Inc. GrabBeeX+ Video Encoder (em2800) [eb1a:2801]
+ 22 -> Unknown EM2750/EM2751 webcam grabber (em2750) [eb1a:2750,eb1a:2751]
+ 23 -> Huaqi DLCW-130 (em2750)
+ 24 -> D-Link DUB-T210 TV Tuner (em2820/em2840) [2001:f112]
+ 25 -> Gadmei UTV310 (em2820/em2840)
+ 26 -> Hercules Smart TV USB 2.0 (em2820/em2840)
+ 27 -> Pinnacle PCTV USB 2 (Philips FM1216ME) (em2820/em2840)
+ 28 -> Leadtek Winfast USB II Deluxe (em2820/em2840)
+ 29 -> Pinnacle Dazzle DVC 100 (em2820/em2840)
+ 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840)
+ 31 -> Usbgear VD204v9 (em2821)
+ 32 -> Supercomp USB 2.0 TV (em2821)
+ 33 -> SIIG AVTuner-PVR/Prolink PlayTV USB 2.0 (em2821)
+ 34 -> Terratec Cinergy A Hybrid XS (em2860) [0ccd:004f]
+ 35 -> Typhoon DVD Maker (em2860)
+ 36 -> NetGMBH Cam (em2860)
+ 37 -> Gadmei UTV330 (em2860)
+ 38 -> Yakumo MovieMixer (em2861)
+ 39 -> KWorld PVRTV 300U (em2861) [eb1a:e300]
+ 40 -> Plextor ConvertX PX-TV100U (em2861) [093b:a005]
+ 41 -> Kworld 350 U DVB-T (em2870) [eb1a:e350]
+ 42 -> Kworld 355 U DVB-T (em2870) [eb1a:e355,eb1a:e357]
+ 43 -> Terratec Cinergy T XS (em2870) [0ccd:0043]
+ 44 -> Terratec Cinergy T XS (MT2060) (em2870)
+ 45 -> Pinnacle PCTV DVB-T (em2870)
+ 46 -> Compro, VideoMate U3 (em2870) [185b:2870]
+ 47 -> KWorld DVB-T 305U (em2880) [eb1a:e305]
+ 48 -> KWorld DVB-T 310U (em2880) [eb1a:e310]
+ 49 -> MSI DigiVox A/D (em2880) [eb1a:e310]
+ 50 -> MSI DigiVox A/D II (em2880) [eb1a:e320]
+ 51 -> Terratec Hybrid XS Secam (em2880) [0ccd:004c]
+ 52 -> DNT DA2 Hybrid (em2881)
+ 53 -> Pinnacle Hybrid Pro (em2881)
+ 54 -> Kworld VS-DVB-T 323UR (em2882) [eb1a:e323]
+ 55 -> Terratec Hybrid XS (em2882) (em2882) [0ccd:005e]
+ 56 -> Pinnacle Hybrid Pro (2) (em2882) [2304:0226]
+ 57 -> Kworld PlusTV HD Hybrid 330 (em2883) [eb1a:a316]
+ 58 -> Compro VideoMate ForYou/Stereo (em2820/em2840) [185b:2041]
diff --git a/Documentation/video4linux/CARDLIST.ivtv b/Documentation/video4linux/CARDLIST.ivtv
new file mode 100644
index 0000000..a019e27
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.ivtv
@@ -0,0 +1,24 @@
+ 1 -> Hauppauge WinTV PVR-250
+ 2 -> Hauppauge WinTV PVR-350
+ 3 -> Hauppauge WinTV PVR-150 or PVR-500
+ 4 -> AVerMedia M179 [1461:a3ce,1461:a3cf]
+ 5 -> Yuan MPG600/Kuroutoshikou iTVC16-STVLP [12ab:fff3,12ab:ffff]
+ 6 -> Yuan MPG160/Kuroutoshikou iTVC15-STVLP [12ab:0000,10fc:40a0]
+ 7 -> Yuan PG600/DiamondMM PVR-550 [ff92:0070,ffab:0600]
+ 8 -> Adaptec AVC-2410 [9005:0093]
+ 9 -> Adaptec AVC-2010 [9005:0092]
+10 -> NAGASE TRANSGEAR 5000TV [1461:bfff]
+11 -> AOpen VA2000MAX-STN6 [0000:ff5f]
+12 -> YUAN MPG600GR/Kuroutoshikou CX23416GYC-STVLP [12ab:0600,fbab:0600,1154:0523]
+13 -> I/O Data GV-MVP/RX [10fc:d01e,10fc:d038,10fc:d039]
+14 -> I/O Data GV-MVP/RX2E [10fc:d025]
+15 -> GOTVIEW PCI DVD (partial support only) [12ab:0600]
+16 -> GOTVIEW PCI DVD2 Deluxe [ffac:0600]
+17 -> Yuan MPC622 [ff01:d998]
+18 -> Digital Cowboy DCT-MTVP1 [1461:bfff]
+19 -> Yuan PG600V2/GotView PCI DVD Lite [ffab:0600,ffad:0600]
+20 -> Club3D ZAP-TV1x01 [ffab:0600]
+21 -> AverTV MCE 116 Plus [1461:c439]
+22 -> ASUS Falcon2 [1043:4b66,1043:462e,1043:4b2e]
+23 -> AverMedia PVR-150 Plus [1461:c035]
+24 -> AverMedia EZMaker PCI Deluxe [1461:c03f]
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
new file mode 100644
index 0000000..dc67eef
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -0,0 +1,153 @@
+ 0 -> UNKNOWN/GENERIC
+ 1 -> Proteus Pro [philips reference design] [1131:2001,1131:2001]
+ 2 -> LifeView FlyVIDEO3000 [5168:0138,4e42:0138]
+ 3 -> LifeView/Typhoon FlyVIDEO2000 [5168:0138,4e42:0138]
+ 4 -> EMPRESS [1131:6752]
+ 5 -> SKNet Monster TV [1131:4e85]
+ 6 -> Tevion MD 9717
+ 7 -> KNC One TV-Station RDS / Typhoon TV Tuner RDS [1131:fe01,1894:fe01]
+ 8 -> Terratec Cinergy 400 TV [153b:1142]
+ 9 -> Medion 5044
+ 10 -> Kworld/KuroutoShikou SAA7130-TVPCI
+ 11 -> Terratec Cinergy 600 TV [153b:1143]
+ 12 -> Medion 7134 [16be:0003]
+ 13 -> Typhoon TV+Radio 90031
+ 14 -> ELSA EX-VISION 300TV [1048:226b]
+ 15 -> ELSA EX-VISION 500TV [1048:226a]
+ 16 -> ASUS TV-FM 7134 [1043:4842,1043:4830,1043:4840]
+ 17 -> AOPEN VA1000 POWER [1131:7133]
+ 18 -> BMK MPEX No Tuner
+ 19 -> Compro VideoMate TV [185b:c100]
+ 20 -> Matrox CronosPlus [102B:48d0]
+ 21 -> 10MOONS PCI TV CAPTURE CARD [1131:2001]
+ 22 -> AverMedia M156 / Medion 2819 [1461:a70b]
+ 23 -> BMK MPEX Tuner
+ 24 -> KNC One TV-Station DVR [1894:a006]
+ 25 -> ASUS TV-FM 7133 [1043:4843]
+ 26 -> Pinnacle PCTV Stereo (saa7134) [11bd:002b]
+ 27 -> Manli MuchTV M-TV002
+ 28 -> Manli MuchTV M-TV001
+ 29 -> Nagase Sangyo TransGear 3000TV [1461:050c]
+ 30 -> Elitegroup ECS TVP3XP FM1216 Tuner Card(PAL-BG,FM) [1019:4cb4]
+ 31 -> Elitegroup ECS TVP3XP FM1236 Tuner Card (NTSC,FM) [1019:4cb5]
+ 32 -> AVACS SmartTV
+ 33 -> AVerMedia DVD EZMaker [1461:10ff]
+ 34 -> Noval Prime TV 7133
+ 35 -> AverMedia AverTV Studio 305 [1461:2115]
+ 36 -> UPMOST PURPLE TV [12ab:0800]
+ 37 -> Items MuchTV Plus / IT-005
+ 38 -> Terratec Cinergy 200 TV [153b:1152]
+ 39 -> LifeView FlyTV Platinum Mini [5168:0212,4e42:0212,5169:1502]
+ 40 -> Compro VideoMate TV PVR/FM [185b:c100]
+ 41 -> Compro VideoMate TV Gold+ [185b:c100]
+ 42 -> Sabrent SBT-TVFM (saa7130)
+ 43 -> :Zolid Xpert TV7134
+ 44 -> Empire PCI TV-Radio LE
+ 45 -> Avermedia AVerTV Studio 307 [1461:9715]
+ 46 -> AVerMedia Cardbus TV/Radio (E500) [1461:d6ee]
+ 47 -> Terratec Cinergy 400 mobile [153b:1162]
+ 48 -> Terratec Cinergy 600 TV MK3 [153b:1158]
+ 49 -> Compro VideoMate Gold+ Pal [185b:c200]
+ 50 -> Pinnacle PCTV 300i DVB-T + PAL [11bd:002d]
+ 51 -> ProVideo PV952 [1540:9524]
+ 52 -> AverMedia AverTV/305 [1461:2108]
+ 53 -> ASUS TV-FM 7135 [1043:4845]
+ 54 -> LifeView FlyTV Platinum FM / Gold [5168:0214,5168:5214,1489:0214,5168:0304]
+ 55 -> LifeView FlyDVB-T DUO / MSI TV@nywhere Duo [5168:0306,4E42:0306]
+ 56 -> Avermedia AVerTV 307 [1461:a70a]
+ 57 -> Avermedia AVerTV GO 007 FM [1461:f31f]
+ 58 -> ADS Tech Instant TV (saa7135) [1421:0350,1421:0351,1421:0370,1421:1370]
+ 59 -> Kworld/Tevion V-Stream Xpert TV PVR7134
+ 60 -> LifeView/Typhoon/Genius FlyDVB-T Duo Cardbus [5168:0502,4e42:0502,1489:0502]
+ 61 -> Philips TOUGH DVB-T reference design [1131:2004]
+ 62 -> Compro VideoMate TV Gold+II
+ 63 -> Kworld Xpert TV PVR7134
+ 64 -> FlyTV mini Asus Digimatrix [1043:0210]
+ 65 -> V-Stream Studio TV Terminator
+ 66 -> Yuan TUN-900 (saa7135)
+ 67 -> Beholder BeholdTV 409 FM [0000:4091]
+ 68 -> GoTView 7135 PCI [5456:7135]
+ 69 -> Philips EUROPA V3 reference design [1131:2004]
+ 70 -> Compro Videomate DVB-T300 [185b:c900]
+ 71 -> Compro Videomate DVB-T200 [185b:c901]
+ 72 -> RTD Embedded Technologies VFG7350 [1435:7350]
+ 73 -> RTD Embedded Technologies VFG7330 [1435:7330]
+ 74 -> LifeView FlyTV Platinum Mini2 [14c0:1212]
+ 75 -> AVerMedia AVerTVHD MCE A180 [1461:1044]
+ 76 -> SKNet MonsterTV Mobile [1131:4ee9]
+ 77 -> Pinnacle PCTV 40i/50i/110i (saa7133) [11bd:002e]
+ 78 -> ASUSTeK P7131 Dual [1043:4862]
+ 79 -> Sedna/MuchTV PC TV Cardbus TV/Radio (ITO25 Rev:2B)
+ 80 -> ASUS Digimatrix TV [1043:0210]
+ 81 -> Philips Tiger reference design [1131:2018]
+ 82 -> MSI TV@Anywhere plus [1462:6231,1462:8624]
+ 83 -> Terratec Cinergy 250 PCI TV [153b:1160]
+ 84 -> LifeView FlyDVB Trio [5168:0319]
+ 85 -> AverTV DVB-T 777 [1461:2c05,1461:2c05]
+ 86 -> LifeView FlyDVB-T / Genius VideoWonder DVB-T [5168:0301,1489:0301]
+ 87 -> ADS Instant TV Duo Cardbus PTV331 [0331:1421]
+ 88 -> Tevion/KWorld DVB-T 220RF [17de:7201]
+ 89 -> ELSA EX-VISION 700TV [1048:226c]
+ 90 -> Kworld ATSC110/115 [17de:7350,17de:7352]
+ 91 -> AVerMedia A169 B [1461:7360]
+ 92 -> AVerMedia A169 B1 [1461:6360]
+ 93 -> Medion 7134 Bridge #2 [16be:0005]
+ 94 -> LifeView FlyDVB-T Hybrid Cardbus/MSI TV @nywhere A/D NB [5168:3306,5168:3502,5168:3307,4e42:3502]
+ 95 -> LifeView FlyVIDEO3000 (NTSC) [5169:0138]
+ 96 -> Medion Md8800 Quadro [16be:0007,16be:0008,16be:000d]
+ 97 -> LifeView FlyDVB-S /Acorp TV134DS [5168:0300,4e42:0300]
+ 98 -> Proteus Pro 2309 [0919:2003]
+ 99 -> AVerMedia TV Hybrid A16AR [1461:2c00]
+100 -> Asus Europa2 OEM [1043:4860]
+101 -> Pinnacle PCTV 310i [11bd:002f]
+102 -> Avermedia AVerTV Studio 507 [1461:9715]
+103 -> Compro Videomate DVB-T200A
+104 -> Hauppauge WinTV-HVR1110 DVB-T/Hybrid [0070:6700,0070:6701,0070:6702,0070:6703,0070:6704,0070:6705]
+105 -> Terratec Cinergy HT PCMCIA [153b:1172]
+106 -> Encore ENLTV [1131:2342,1131:2341,3016:2344]
+107 -> Encore ENLTV-FM [1131:230f]
+108 -> Terratec Cinergy HT PCI [153b:1175]
+109 -> Philips Tiger - S Reference design
+110 -> Avermedia M102 [1461:f31e]
+111 -> ASUS P7131 4871 [1043:4871]
+112 -> ASUSTeK P7131 Hybrid [1043:4876]
+113 -> Elitegroup ECS TVP3XP FM1246 Tuner Card (PAL,FM) [1019:4cb6]
+114 -> KWorld DVB-T 210 [17de:7250]
+115 -> Sabrent PCMCIA TV-PCB05 [0919:2003]
+116 -> 10MOONS TM300 TV Card [1131:2304]
+117 -> Avermedia Super 007 [1461:f01d]
+118 -> Beholder BeholdTV 401 [0000:4016]
+119 -> Beholder BeholdTV 403 [0000:4036]
+120 -> Beholder BeholdTV 403 FM [0000:4037]
+121 -> Beholder BeholdTV 405 [0000:4050]
+122 -> Beholder BeholdTV 405 FM [0000:4051]
+123 -> Beholder BeholdTV 407 [0000:4070]
+124 -> Beholder BeholdTV 407 FM [0000:4071]
+125 -> Beholder BeholdTV 409 [0000:4090]
+126 -> Beholder BeholdTV 505 FM/RDS [0000:5051,0000:505B,5ace:5050]
+127 -> Beholder BeholdTV 507 FM/RDS / BeholdTV 509 FM [0000:5071,0000:507B,5ace:5070,5ace:5090]
+128 -> Beholder BeholdTV Columbus TVFM [0000:5201]
+129 -> Beholder BeholdTV 607 / BeholdTV 609 [5ace:6070,5ace:6071,5ace:6072,5ace:6073,5ace:6090,5ace:6091,5ace:6092,5ace:6093]
+130 -> Beholder BeholdTV M6 [5ace:6190]
+131 -> Twinhan Hybrid DTV-DVB 3056 PCI [1822:0022]
+132 -> Genius TVGO AM11MCE
+133 -> NXP Snake DVB-S reference design
+134 -> Medion/Creatix CTX953 Hybrid [16be:0010]
+135 -> MSI TV@nywhere A/D v1.1 [1462:8625]
+136 -> AVerMedia Cardbus TV/Radio (E506R) [1461:f436]
+137 -> AVerMedia Hybrid TV/Radio (A16D) [1461:f936]
+138 -> Avermedia M115 [1461:a836]
+139 -> Compro VideoMate T750 [185b:c900]
+140 -> Avermedia DVB-S Pro A700 [1461:a7a1]
+141 -> Avermedia DVB-S Hybrid+FM A700 [1461:a7a2]
+142 -> Beholder BeholdTV H6 [5ace:6290]
+143 -> Beholder BeholdTV M63 [5ace:6191]
+144 -> Beholder BeholdTV M6 Extra [5ace:6193]
+145 -> AVerMedia MiniPCI DVB-T Hybrid M103 [1461:f636]
+146 -> ASUSTeK P7131 Analog
+147 -> Asus Tiger 3in1 [1043:4878]
+148 -> Encore ENLTV-FM v5.3 [1a7f:2008]
+149 -> Avermedia PCI pure analog (M135A) [1461:f11d]
+150 -> Zogis Real Angel 220
+151 -> ADS Tech Instant HDTV [1421:0380]
+152 -> Asus Tiger Rev:1.00 [1043:4857]
diff --git a/Documentation/video4linux/CARDLIST.tuner b/Documentation/video4linux/CARDLIST.tuner
new file mode 100644
index 0000000..691d2f3
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.tuner
@@ -0,0 +1,78 @@
+tuner=0 - Temic PAL (4002 FH5)
+tuner=1 - Philips PAL_I (FI1246 and compatibles)
+tuner=2 - Philips NTSC (FI1236,FM1236 and compatibles)
+tuner=3 - Philips (SECAM+PAL_BG) (FI1216MF, FM1216MF, FR1216MF)
+tuner=4 - NoTuner
+tuner=5 - Philips PAL_BG (FI1216 and compatibles)
+tuner=6 - Temic NTSC (4032 FY5)
+tuner=7 - Temic PAL_I (4062 FY5)
+tuner=8 - Temic NTSC (4036 FY5)
+tuner=9 - Alps HSBH1
+tuner=10 - Alps TSBE1
+tuner=11 - Alps TSBB5
+tuner=12 - Alps TSBE5
+tuner=13 - Alps TSBC5
+tuner=14 - Temic PAL_BG (4006FH5)
+tuner=15 - Alps TSCH6
+tuner=16 - Temic PAL_DK (4016 FY5)
+tuner=17 - Philips NTSC_M (MK2)
+tuner=18 - Temic PAL_I (4066 FY5)
+tuner=19 - Temic PAL* auto (4006 FN5)
+tuner=20 - Temic PAL_BG (4009 FR5) or PAL_I (4069 FR5)
+tuner=21 - Temic NTSC (4039 FR5)
+tuner=22 - Temic PAL/SECAM multi (4046 FM5)
+tuner=23 - Philips PAL_DK (FI1256 and compatibles)
+tuner=24 - Philips PAL/SECAM multi (FQ1216ME)
+tuner=25 - LG PAL_I+FM (TAPC-I001D)
+tuner=26 - LG PAL_I (TAPC-I701D)
+tuner=27 - LG NTSC+FM (TPI8NSR01F)
+tuner=28 - LG PAL_BG+FM (TPI8PSB01D)
+tuner=29 - LG PAL_BG (TPI8PSB11D)
+tuner=30 - Temic PAL* auto + FM (4009 FN5)
+tuner=31 - SHARP NTSC_JP (2U5JF5540)
+tuner=32 - Samsung PAL TCPM9091PD27
+tuner=33 - MT20xx universal
+tuner=34 - Temic PAL_BG (4106 FH5)
+tuner=35 - Temic PAL_DK/SECAM_L (4012 FY5)
+tuner=36 - Temic NTSC (4136 FY5)
+tuner=37 - LG PAL (newer TAPC series)
+tuner=38 - Philips PAL/SECAM multi (FM1216ME MK3)
+tuner=39 - LG NTSC (newer TAPC series)
+tuner=40 - HITACHI V7-J180AT
+tuner=41 - Philips PAL_MK (FI1216 MK)
+tuner=42 - Philips FCV1236D ATSC/NTSC dual in
+tuner=43 - Philips NTSC MK3 (FM1236MK3 or FM1236/F)
+tuner=44 - Philips 4 in 1 (ATI TV Wonder Pro/Conexant)
+tuner=45 - Microtune 4049 FM5
+tuner=46 - Panasonic VP27s/ENGE4324D
+tuner=47 - LG NTSC (TAPE series)
+tuner=48 - Tenna TNF 8831 BGFF)
+tuner=49 - Microtune 4042 FI5 ATSC/NTSC dual in
+tuner=50 - TCL 2002N
+tuner=51 - Philips PAL/SECAM_D (FM 1256 I-H3)
+tuner=52 - Thomson DTT 7610 (ATSC/NTSC)
+tuner=53 - Philips FQ1286
+tuner=54 - Philips/NXP TDA 8290/8295 + 8275/8275A/18271
+tuner=55 - TCL 2002MB
+tuner=56 - Philips PAL/SECAM multi (FQ1216AME MK4)
+tuner=57 - Philips FQ1236A MK4
+tuner=58 - Ymec TVision TVF-8531MF/8831MF/8731MF
+tuner=59 - Ymec TVision TVF-5533MF
+tuner=60 - Thomson DTT 761X (ATSC/NTSC)
+tuner=61 - Tena TNF9533-D/IF/TNF9533-B/DF
+tuner=62 - Philips TEA5767HN FM Radio
+tuner=63 - Philips FMD1216ME MK3 Hybrid Tuner
+tuner=64 - LG TDVS-H06xF
+tuner=65 - Ymec TVF66T5-B/DFF
+tuner=66 - LG TALN series
+tuner=67 - Philips TD1316 Hybrid Tuner
+tuner=68 - Philips TUV1236D ATSC/NTSC dual in
+tuner=69 - Tena TNF 5335 and similar models
+tuner=70 - Samsung TCPN 2121P30A
+tuner=71 - Xceive xc2028/xc3028 tuner
+tuner=72 - Thomson FE6600
+tuner=73 - Samsung TCPG 6121P30A
+tuner=75 - Philips TEA5761 FM Radio
+tuner=76 - Xceive 5000 tuner
+tuner=77 - TCL tuner MF02GIP-5N-E
+tuner=78 - Philips FMD1216MEX MK3 Hybrid Tuner
diff --git a/Documentation/video4linux/CARDLIST.usbvision b/Documentation/video4linux/CARDLIST.usbvision
new file mode 100644
index 0000000..0b72d3f
--- /dev/null
+++ b/Documentation/video4linux/CARDLIST.usbvision
@@ -0,0 +1,65 @@
+ 0 -> Xanboo [0a6f:0400]
+ 1 -> Belkin USB VideoBus II Adapter [050d:0106]
+ 2 -> Belkin Components USB VideoBus [050d:0207]
+ 3 -> Belkin USB VideoBus II [050d:0208]
+ 4 -> echoFX InterView Lite [0571:0002]
+ 5 -> USBGear USBG-V1 resp. HAMA USB [0573:0003]
+ 6 -> D-Link V100 [0573:0400]
+ 7 -> X10 USB Camera [0573:2000]
+ 8 -> Hauppauge WinTV USB Live (PAL B/G) [0573:2d00]
+ 9 -> Hauppauge WinTV USB Live Pro (NTSC M/N) [0573:2d01]
+ 10 -> Zoran Co. PMD (Nogatech) AV-grabber Manhattan [0573:2101]
+ 11 -> Nogatech USB-TV (NTSC) FM [0573:4100]
+ 12 -> PNY USB-TV (NTSC) FM [0573:4110]
+ 13 -> PixelView PlayTv-USB PRO (PAL) FM [0573:4450]
+ 14 -> ZTV ZT-721 2.4GHz USB A/V Receiver [0573:4550]
+ 15 -> Hauppauge WinTV USB (NTSC M/N) [0573:4d00]
+ 16 -> Hauppauge WinTV USB (PAL B/G) [0573:4d01]
+ 17 -> Hauppauge WinTV USB (PAL I) [0573:4d02]
+ 18 -> Hauppauge WinTV USB (PAL/SECAM L) [0573:4d03]
+ 19 -> Hauppauge WinTV USB (PAL D/K) [0573:4d04]
+ 20 -> Hauppauge WinTV USB (NTSC FM) [0573:4d10]
+ 21 -> Hauppauge WinTV USB (PAL B/G FM) [0573:4d11]
+ 22 -> Hauppauge WinTV USB (PAL I FM) [0573:4d12]
+ 23 -> Hauppauge WinTV USB (PAL D/K FM) [0573:4d14]
+ 24 -> Hauppauge WinTV USB Pro (NTSC M/N) [0573:4d2a]
+ 25 -> Hauppauge WinTV USB Pro (NTSC M/N) V2 [0573:4d2b]
+ 26 -> Hauppauge WinTV USB Pro (PAL/SECAM B/G/I/D/K/L) [0573:4d2c]
+ 27 -> Hauppauge WinTV USB Pro (NTSC M/N) V3 [0573:4d20]
+ 28 -> Hauppauge WinTV USB Pro (PAL B/G) [0573:4d21]
+ 29 -> Hauppauge WinTV USB Pro (PAL I) [0573:4d22]
+ 30 -> Hauppauge WinTV USB Pro (PAL/SECAM L) [0573:4d23]
+ 31 -> Hauppauge WinTV USB Pro (PAL D/K) [0573:4d24]
+ 32 -> Hauppauge WinTV USB Pro (PAL/SECAM BGDK/I/L) [0573:4d25]
+ 33 -> Hauppauge WinTV USB Pro (PAL/SECAM BGDK/I/L) V2 [0573:4d26]
+ 34 -> Hauppauge WinTV USB Pro (PAL B/G) V2 [0573:4d27]
+ 35 -> Hauppauge WinTV USB Pro (PAL B/G,D/K) [0573:4d28]
+ 36 -> Hauppauge WinTV USB Pro (PAL I,D/K) [0573:4d29]
+ 37 -> Hauppauge WinTV USB Pro (NTSC M/N FM) [0573:4d30]
+ 38 -> Hauppauge WinTV USB Pro (PAL B/G FM) [0573:4d31]
+ 39 -> Hauppauge WinTV USB Pro (PAL I FM) [0573:4d32]
+ 40 -> Hauppauge WinTV USB Pro (PAL D/K FM) [0573:4d34]
+ 41 -> Hauppauge WinTV USB Pro (Temic PAL/SECAM B/G/I/D/K/L FM) [0573:4d35]
+ 42 -> Hauppauge WinTV USB Pro (Temic PAL B/G FM) [0573:4d36]
+ 43 -> Hauppauge WinTV USB Pro (PAL/SECAM B/G/I/D/K/L FM) [0573:4d37]
+ 44 -> Hauppauge WinTV USB Pro (NTSC M/N FM) V2 [0573:4d38]
+ 45 -> Camtel Technology USB TV Genie Pro FM Model TVB330 [0768:0006]
+ 46 -> Digital Video Creator I [07d0:0001]
+ 47 -> Global Village GV-007 (NTSC) [07d0:0002]
+ 48 -> Dazzle Fusion Model DVC-50 Rev 1 (NTSC) [07d0:0003]
+ 49 -> Dazzle Fusion Model DVC-80 Rev 1 (PAL) [07d0:0004]
+ 50 -> Dazzle Fusion Model DVC-90 Rev 1 (SECAM) [07d0:0005]
+ 51 -> Eskape Labs MyTV2Go [07f8:9104]
+ 52 -> Pinnacle Studio PCTV USB (PAL) [2304:010d]
+ 53 -> Pinnacle Studio PCTV USB (SECAM) [2304:0109]
+ 54 -> Pinnacle Studio PCTV USB (PAL) FM [2304:0110]
+ 55 -> Miro PCTV USB [2304:0111]
+ 56 -> Pinnacle Studio PCTV USB (NTSC) FM [2304:0112]
+ 57 -> Pinnacle Studio PCTV USB (PAL) FM V2 [2304:0210]
+ 58 -> Pinnacle Studio PCTV USB (NTSC) FM V2 [2304:0212]
+ 59 -> Pinnacle Studio PCTV USB (PAL) FM V3 [2304:0214]
+ 60 -> Pinnacle Studio Linx Video input cable (NTSC) [2304:0300]
+ 61 -> Pinnacle Studio Linx Video input cable (PAL) [2304:0301]
+ 62 -> Pinnacle PCTV Bungee USB (PAL) FM [2304:0419]
+ 63 -> Hauppauge WinTv-USB [2400:4200]
+ 64 -> Pinnacle Studio PCTV USB (NTSC) FM V3 [2304:0113]
diff --git a/Documentation/video4linux/CQcam.txt b/Documentation/video4linux/CQcam.txt
new file mode 100644
index 0000000..04986ef
--- /dev/null
+++ b/Documentation/video4linux/CQcam.txt
@@ -0,0 +1,215 @@
+c-qcam - Connectix Color QuickCam video4linux kernel driver
+
+Copyright (C) 1999 Dave Forrest <drf5n@virginia.edu>
+ released under GNU GPL.
+
+1999-12-08 Dave Forrest, written with kernel version 2.2.12 in mind
+
+
+Table of Contents
+
+1.0 Introduction
+2.0 Compilation, Installation, and Configuration
+3.0 Troubleshooting
+4.0 Future Work / current work arounds
+9.0 Sample Program, v4lgrab
+10.0 Other Information
+
+
+1.0 Introduction
+
+ The file ../drivers/char/c-qcam.c is a device driver for the
+Logitech (nee Connectix) parallel port interface color CCD camera.
+This is a fairly inexpensive device for capturing images. Logitech
+does not currently provide information for developers, but many people
+have engineered several solutions for non-Microsoft use of the Color
+Quickcam.
+
+1.1 Motivation
+
+ I spent a number of hours trying to get my camera to work, and I
+hope this document saves you some time. My camera will not work with
+the 2.2.13 kernel as distributed, but with a few patches to the
+module, I was able to grab some frames. See 4.0, Future Work.
+
+
+
+2.0 Compilation, Installation, and Configuration
+
+ The c-qcam depends on parallel port support, video4linux, and the
+Color Quickcam. It is also nice to have the parallel port readback
+support enabled. I enabled these as modules during the kernel
+configuration. The appropriate flags are:
+
+ CONFIG_PRINTER M for lp.o, parport.o parport_pc.o modules
+ CONFIG_PNP_PARPORT M for autoprobe.o IEEE1284 readback module
+ CONFIG_PRINTER_READBACK M for parport_probe.o IEEE1284 readback module
+ CONFIG_VIDEO_DEV M for videodev.o video4linux module
+ CONFIG_VIDEO_CQCAM M for c-qcam.o Color Quickcam module
+
+ With these flags, the kernel should compile and install the modules.
+To record and monitor the compilation, I use:
+
+ (make zlilo ; \
+ make modules; \
+ make modules_install ;
+ depmod -a ) &>log &
+ less log # then a capital 'F' to watch the progress
+
+But that is my personal preference.
+
+2.2 Configuration
+
+ The configuration requires module configuration and device
+configuration. I like kmod or kerneld process with the
+/etc/modprobe.conf file so the modules can automatically load/unload as
+they are used. The video devices could already exist, be generated
+using MAKEDEV, or need to be created. The following sections detail
+these procedures.
+
+
+2.1 Module Configuration
+
+ Using modules requires a bit of work to install and pass the
+parameters. Understand that entries in /etc/modprobe.conf of:
+
+ alias parport_lowlevel parport_pc
+ options parport_pc io=0x378 irq=none
+ alias char-major-81 videodev
+ alias char-major-81-0 c-qcam
+
+will cause the kmod/modprobe to do certain things. If you are
+using kmod, then a request for a 'char-major-81-0' will cause
+the 'c-qcam' module to load. If you have other video sources with
+modules, you might want to assign the different minor numbers to
+different modules.
+
+2.2 Device Configuration
+
+ At this point, we need to ensure that the device files exist.
+Video4linux used the /dev/video* files, and we want to attach the
+Quickcam to one of these.
+
+ ls -lad /dev/video* # should produce a list of the video devices
+
+If the video devices do not exist, you can create them with:
+
+ su
+ cd /dev
+ for ii in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ; do
+ mknod video$ii c 81 $ii # char-major-81-[0-16]
+ chown root.root video$ii # owned by root
+ chmod 600 video$ii # read/writable by root only
+ done
+
+ Lots of people connect video0 to video and bttv, but you might want
+your c-qcam to mean something more:
+
+ ln -s video0 c-qcam # make /dev/c-qcam a working file
+ ln -s c-qcam video # make /dev/c-qcam your default video source
+
+ But these are conveniences. The important part is to make the proper
+special character files with the right major and minor numbers. All
+of the special device files are listed in ../devices.txt. If you
+would like the c-qcam readable by non-root users, you will need to
+change the permissions.
+
+3.0 Troubleshooting
+
+ If the sample program below, v4lgrab, gives you output then
+everything is working.
+
+ v4lgrab | wc # should give you a count of characters
+
+ Otherwise, you have some problem.
+
+ The c-qcam is IEEE1284 compatible, so if you are using the proc file
+system (CONFIG_PROC_FS), the parallel printer support
+(CONFIG_PRINTER), the IEEE 1284 system,(CONFIG_PRINTER_READBACK), you
+should be able to read some identification from your quickcam with
+
+ modprobe -v parport
+ modprobe -v parport_probe
+ cat /proc/parport/PORTNUMBER/autoprobe
+Returns:
+ CLASS:MEDIA;
+ MODEL:Color QuickCam 2.0;
+ MANUFACTURER:Connectix;
+
+ A good response to this indicates that your color quickcam is alive
+and well. A common problem is that the current driver does not
+reliably detect a c-qcam, even though one is attached. In this case,
+
+ modprobe -v c-qcam
+or
+ insmod -v c-qcam
+
+ Returns a message saying "Device or resource busy" Development is
+currently underway, but a workaround is to patch the module to skip
+the detection code and attach to a defined port. Check the
+video4linux mailing list and archive for more current information.
+
+3.1 Checklist:
+
+ Can you get an image?
+ v4lgrab >qcam.ppm ; wc qcam.ppm ; xv qcam.ppm
+
+ Is a working c-qcam connected to the port?
+ grep ^ /proc/parport/?/autoprobe
+
+ Do the /dev/video* files exist?
+ ls -lad /dev/video
+
+ Is the c-qcam module loaded?
+ modprobe -v c-qcam ; lsmod
+
+ Does the camera work with alternate programs? cqcam, etc?
+
+
+
+
+4.0 Future Work / current workarounds
+
+ It is hoped that this section will soon become obsolete, but if it
+isn't, you might try patching the c-qcam module to add a parport=xxx
+option as in the bw-qcam module so you can specify the parallel port:
+
+ insmod -v c-qcam parport=0
+
+And bypass the detection code, see ../../drivers/char/c-qcam.c and
+look for the 'qc_detect' code and call.
+
+ Note that there is work in progress to change the video4linux API,
+this work is documented at the video4linux2 site listed below.
+
+
+9.0 --- A sample program using v4lgrabber,
+
+v4lgrab is a simple image grabber that will copy a frame from the
+first video device, /dev/video0 to standard output in portable pixmap
+format (.ppm) To produce .jpg output, you can use it like this:
+'v4lgrab | convert - c-qcam.jpg'
+
+
+10.0 --- Other Information
+
+Use the ../../Maintainers file, particularly the VIDEO FOR LINUX and PARALLEL
+PORT SUPPORT sections
+
+The video4linux page:
+ http://linuxtv.org
+
+The V4L2 API spec:
+ http://v4l2spec.bytesex.org/
+
+Some web pages about the quickcams:
+ http://www.dkfz-heidelberg.de/Macromol/wedemann/mini-HOWTO-cqcam.html
+
+ http://www.crynwr.com/qcpc/ QuickCam Third-Party Drivers
+ http://www.crynwr.com/qcpc/re.html Some Reverse Engineering
+ http://cse.unl.edu/~cluening/gqcam/ v4l client
+ http://phobos.illtel.denver.co.us/pub/qcread/ doesn't use v4l
+ ftp://ftp.cs.unm.edu/pub/chris/quickcam/ Has lots of drivers
+ http://www.cs.duke.edu/~reynolds/quickcam/ Has lots of information
+
+
diff --git a/Documentation/video4linux/Makefile b/Documentation/video4linux/Makefile
new file mode 100644
index 0000000..1ed0e98
--- /dev/null
+++ b/Documentation/video4linux/Makefile
@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := v4lgrab
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/Documentation/video4linux/README.cpia b/Documentation/video4linux/README.cpia
new file mode 100644
index 0000000..19cd3bf
--- /dev/null
+++ b/Documentation/video4linux/README.cpia
@@ -0,0 +1,191 @@
+This is a driver for the CPiA PPC2 driven parallel connected
+Camera. For example the Creative WebcamII is CPiA driven.
+
+ ) [1]Peter Pregler, Linz 2000, published under the [2]GNU GPL
+
+---------------------------------------------------------------------------
+
+USAGE:
+
+General:
+========
+
+1) Make sure you have created the video devices (/dev/video*):
+
+- if you have a recent MAKEDEV do a 'cd /dev;./MAKEDEV video'
+- otherwise do a:
+
+cd /dev
+mknod video0 c 81 0
+ln -s video0 video
+
+2) Compile the kernel (see below for the list of options to use),
+ configure your parport and reboot.
+
+3) If all worked well you should get messages similar
+ to the following (your versions may be different) on the console:
+
+V4L-Driver for Vision CPiA based cameras v0.7.4
+parport0: read2 timeout.
+parport0: Multimedia device, VLSI Vision Ltd PPC2
+Parallel port driver for Vision CPiA based camera
+ CPIA Version: 1.20 (2.0)
+ CPIA PnP-ID: 0553:0002:0100
+ VP-Version: 1.0 0100
+ 1 camera(s) found
+
+
+As modules:
+===========
+
+Make sure you have selected the following kernel options (you can
+select all stuff as modules):
+
+The cpia-stuff is in the section 'Character devices -> Video For Linux'.
+
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_FIFO=y
+CONFIG_PARPORT_1284=y
+CONFIG_VIDEO_DEV=m
+CONFIG_VIDEO_CPIA=m
+CONFIG_VIDEO_CPIA_PP=m
+
+For autoloading of all those modules you need to tell module-init-tools
+some stuff. Add the following line to your module-init-tools config-file
+(e.g. /etc/modprobe.conf or wherever your distribution does store that
+stuff):
+
+options parport_pc io=0x378 irq=7 dma=3
+alias char-major-81 cpia_pp
+
+The first line tells the dma/irq channels to use. Those _must_ match
+the settings of your BIOS. Do NOT simply use the values above. See
+Documentation/parport.txt for more information about this. The second
+line associates the video-device file with the driver. Of cause you
+can also load the modules once upon boot (usually done in /etc/modules).
+
+Linked into the kernel:
+=======================
+
+Make sure you have selected the following kernel options. Note that
+you cannot compile the parport-stuff as modules and the cpia-driver
+statically (the other way round is okay though).
+
+The cpia-stuff is in the section 'Character devices -> Video For Linux'.
+
+CONFIG_PARPORT=y
+CONFIG_PARPORT_PC=y
+CONFIG_PARPORT_PC_FIFO=y
+CONFIG_PARPORT_1284=y
+CONFIG_VIDEO_DEV=y
+CONFIG_VIDEO_CPIA=y
+CONFIG_VIDEO_CPIA_PP=y
+
+To use DMA/irq you will need to tell the kernel upon boot time the
+hardware configuration of the parport. You can give the boot-parameter
+at the LILO-prompt or specify it in lilo.conf. I use the following
+append-line in lilo.conf:
+
+ append="parport=0x378,7,3"
+
+See Documentation/parport.txt for more information about the
+configuration of the parport and the values given above. Do not simply
+use the values given above.
+
+---------------------------------------------------------------------------
+FEATURES:
+
+- mmap/read v4l-interface (but no overlay)
+- image formats: CIF/QCIF, SIF/QSIF, various others used by isabel;
+ note: all sizes except CIF/QCIF are implemented by clipping, i.e.
+ pixels are not uploaded from the camera
+- palettes: VIDEO_PALETTE_GRAY, VIDEO_PALETTE_RGB565, VIDEO_PALETTE_RGB555,
+ VIDEO_PALETTE_RGB24, VIDEO_PALETTE_RGB32, VIDEO_PALETTE_YUYV,
+ VIDEO_PALETTE_UYVY, VIDEO_PALETTE_YUV422
+- state information (color balance, exposure, ...) is preserved between
+ device opens
+- complete control over camera via proc-interface (_all_ camera settings are
+ supported), there is also a python-gtk application available for this [3]
+- works under SMP (but the driver is completely serialized and synchronous)
+ so you get no benefit from SMP, but at least it does not crash your box
+- might work for non-Intel architecture, let us know about this
+
+---------------------------------------------------------------------------
+TESTED APPLICATIONS:
+
+- a simple test application based on Xt is available at [3]
+- another test-application based on gqcam-0.4 (uses GTK)
+- gqcam-0.6 should work
+- xawtv-3.x (also the webcam software)
+- xawtv-2.46
+- w3cam (cgi-interface and vidcat, e.g. you may try out 'vidcat |xv
+ -maxpect -root -quit +noresetroot -rmode 5 -')
+- vic, the MBONE video conferencing tool (version 2.8ucl4-1)
+- isabel 3R4beta (barely working, but AFAICT all the problems are on
+ their side)
+- camserv-0.40
+
+See [3] for pointers to v4l-applications.
+
+---------------------------------------------------------------------------
+KNOWN PROBLEMS:
+
+- some applications do not handle the image format correctly, you will
+ see strange horizontal stripes instead of a nice picture -> make sure
+ your application does use a supported image size or queries the driver
+ for the actually used size (reason behind this: the camera cannot
+ provide any image format, so if size NxM is requested the driver will
+ use a format to the closest fitting N1xM1, the application should now
+ query for this granted size, most applications do not).
+- all the todo ;)
+- if there is not enough light and the picture is too dark try to
+ adjust the SetSensorFPS setting, automatic frame rate adjustment
+ has its price
+- do not try out isabel 3R4beta (built 135), you will be disappointed
+
+---------------------------------------------------------------------------
+TODO:
+
+- multiple camera support (struct camera or something) - This should work,
+ but hasn't been tested yet.
+- architecture independence?
+- SMP-safe asynchronous mmap interface
+- nibble mode for old parport interfaces
+- streaming capture, this should give a performance gain
+
+---------------------------------------------------------------------------
+IMPLEMENTATION NOTES:
+
+The camera can act in two modes, streaming or grabbing. Right now a
+polling grab-scheme is used. Maybe interrupt driven streaming will be
+used for a asynchronous mmap interface in the next major release of the
+driver. This might give a better frame rate.
+
+---------------------------------------------------------------------------
+THANKS (in no particular order):
+
+- Scott J. Bertin <sbertin@mindspring.com> for cleanups, the proc-filesystem
+ and much more
+- Henry Bruce <whb@vvl.co.uk> for providing developers information about
+ the CPiA chip, I wish all companies would treat Linux as seriously
+- Karoly Erdei <Karoly.Erdei@risc.uni-linz.ac.at> and RISC-Linz for being
+ my boss ;) resp. my employer and for providing me the hardware and
+ allow me to devote some working time to this project
+- Manuel J. Petit de Gabriel <mpetit@dit.upm.es> for providing help
+ with Isabel (http://isabel.dit.upm.es/)
+- Bas Huisman <bhuism@cs.utwente.nl> for writing the initial parport code
+- Jarl Totland <Jarl.Totland@bdc.no> for setting up the mailing list
+ and maintaining the web-server[3]
+- Chris Whiteford <Chris@informinteractive.com> for fixes related to the
+ 1.02 firmware
+- special kudos to all the tester whose machines crashed and/or
+ will crash. :)
+
+---------------------------------------------------------------------------
+REFERENCES
+
+ 1. http://www.risc.uni-linz.ac.at/people/ppregler
+ mailto:Peter_Pregler@email.com
+ 2. see the file COPYING in the top directory of the kernel tree
+ 3. http://webcam.sourceforge.net/
diff --git a/Documentation/video4linux/README.cpia2 b/Documentation/video4linux/README.cpia2
new file mode 100644
index 0000000..ce8213d
--- /dev/null
+++ b/Documentation/video4linux/README.cpia2
@@ -0,0 +1,130 @@
+$Id: README,v 1.7 2005/08/29 23:39:57 sbertin Exp $
+
+1. Introduction
+
+ This is a driver for STMicroelectronics's CPiA2 (second generation
+Colour Processor Interface ASIC) based cameras. This camera outputs an MJPEG
+stream at up to vga size. It implements the Video4Linux interface as much as
+possible. Since the V4L interface does not support compressed formats, only
+an mjpeg enabled application can be used with the camera. We have modified the
+gqcam application to view this stream.
+
+ The driver is implemented as two kernel modules. The cpia2 module
+contains the camera functions and the V4L interface. The cpia2_usb module
+contains usb specific functions. The main reason for this was the size of the
+module was getting out of hand, so I separted them. It is not likely that
+there will be a parallel port version.
+
+FEATURES:
+ - Supports cameras with the Vision stv6410 (CIF) and stv6500 (VGA) cmos
+ sensors. I only have the vga sensor, so can't test the other.
+ - Image formats: VGA, QVGA, CIF, QCIF, and a number of sizes in between.
+ VGA and QVGA are the native image sizes for the VGA camera. CIF is done
+ in the coprocessor by scaling QVGA. All other sizes are done by clipping.
+ - Palette: YCrCb, compressed with MJPEG.
+ - Some compression parameters are settable.
+ - Sensor framerate is adjustable (up to 30 fps CIF, 15 fps VGA).
+ - Adjust brightness, color, contrast while streaming.
+ - Flicker control settable for 50 or 60 Hz mains frequency.
+
+2. Making and installing the stv672 driver modules:
+
+ Requirements:
+ -------------
+ This should work with 2.4 (2.4.23 and later) and 2.6 kernels, but has
+only been tested on 2.6. Video4Linux must be either compiled into the kernel or
+available as a module. Video4Linux2 is automatically detected and made
+available at compile time.
+
+ Compiling:
+ ----------
+ As root, do a make install. This will compile and install the modules
+into the media/video directory in the module tree. For 2.4 kernels, use
+Makefile_2.4 (aka do make -f Makefile_2.4 install).
+
+ Setup:
+ ------
+ Use 'modprobe cpia2' to load and 'modprobe -r cpia2' to unload. This
+may be done automatically by your distribution.
+
+3. Driver options
+
+ Option Description
+ ------ -----------
+ video_nr video device to register (0=/dev/video0, etc)
+ range -1 to 64. default is -1 (first available)
+ If you have more than 1 camera, this MUST be -1.
+ buffer_size Size for each frame buffer in bytes (default 68k)
+ num_buffers Number of frame buffers (1-32, default 3)
+ alternate USB Alternate (2-7, default 7)
+ flicker_freq Frequency for flicker reduction(50 or 60, default 60)
+ flicker_mode 0 to disable, or 1 to enable flicker reduction.
+ (default 0). This is only effective if the camera
+ uses a stv0672 coprocessor.
+
+ Setting the options:
+ --------------------
+ If you are using modules, edit /etc/modules.conf and add an options
+line like this:
+ options cpia2 num_buffers=3 buffer_size=65535
+
+ If the driver is compiled into the kernel, at boot time specify them
+like this:
+ cpia2.num_buffers=3 cpia2.buffer_size=65535
+
+ What buffer size should I use?
+ ------------------------------
+ The maximum image size depends on the alternate you choose, and the
+frame rate achieved by the camera. If the compression engine is able to
+keep up with the frame rate, the maximum image size is given by the table
+below.
+ The compression engine starts out at maximum compression, and will
+increase image quality until it is close to the size in the table. As long
+as the compression engine can keep up with the frame rate, after a short time
+the images will all be about the size in the table, regardless of resolution.
+ At low alternate settings, the compression engine may not be able to
+compress the image enough and will reduce the frame rate by producing larger
+images.
+ The default of 68k should be good for most users. This will handle
+any alternate at frame rates down to 15fps. For lower frame rates, it may
+be necessary to increase the buffer size to avoid having frames dropped due
+to insufficient space.
+
+ Image size(bytes)
+ Alternate bytes/ms 15fps 30fps
+ 2 128 8533 4267
+ 3 384 25600 12800
+ 4 640 42667 21333
+ 5 768 51200 25600
+ 6 896 59733 29867
+ 7 1023 68200 34100
+
+ How many buffers should I use?
+ ------------------------------
+ For normal streaming, 3 should give the best results. With only 2,
+it is possible for the camera to finish sending one image just after a
+program has started reading the other. If this happens, the driver must drop
+a frame. The exception to this is if you have a heavily loaded machine. In
+this case use 2 buffers. You are probably not reading at the full frame rate.
+If the camera can send multiple images before a read finishes, it could
+overwrite the third buffer before the read finishes, leading to a corrupt
+image. Single and double buffering have extra checks to avoid overwriting.
+
+4. Using the camera
+
+ We are providing a modified gqcam application to view the output. In
+order to avoid confusion, here it is called mview. There is also the qx5view
+program which can also control the lights on the qx5 microscope. MJPEG Tools
+(http://mjpeg.sourceforge.net) can also be used to record from the camera.
+
+5. Notes to developers:
+
+ - This is a driver version stripped of the 2.4 back compatibility
+ and old MJPEG ioctl API. See cpia2.sf.net for 2.4 support.
+
+6. Thanks:
+
+ - Peter Pregler <Peter_Pregler@email.com>,
+ Scott J. Bertin <scottbertin@yahoo.com>, and
+ Jarl Totland <Jarl.Totland@bdc.no> for the original cpia driver, which
+ this one was modelled from.
diff --git a/Documentation/video4linux/README.cx88 b/Documentation/video4linux/README.cx88
new file mode 100644
index 0000000..166d596
--- /dev/null
+++ b/Documentation/video4linux/README.cx88
@@ -0,0 +1,69 @@
+
+cx8800 release notes
+====================
+
+This is a v4l2 device driver for the cx2388x chip.
+
+
+current status
+==============
+
+video
+ - Basically works.
+ - Some minor image quality glitches.
+ - For now only capture, overlay support isn't completed yet.
+
+audio
+ - The chip specs for the on-chip TV sound decoder are next
+ to useless :-/
+ - Neverless the builtin TV sound decoder starts working now,
+ at least for PAL-BG. Other TV norms need other code ...
+ FOR ANY REPORTS ON THIS PLEASE MENTION THE TV NORM YOU ARE
+ USING.
+ - Most tuner chips do provide mono sound, which may or may not
+ be useable depending on the board design. With the Hauppauge
+ cards it works, so there is mono sound available as fallback.
+ - audio data dma (i.e. recording without loopback cable to the
+ sound card) should be possible, but there is no code yet ...
+
+vbi
+ - Code present. Works for NTSC closed caption. PAL and other
+ TV norms may or may not work.
+
+
+how to add support for new cards
+================================
+
+The driver needs some config info for the TV cards. This stuff is in
+cx88-cards.c. If the driver doesn't work well you likely need a new
+entry for your card in that file. Check the kernel log (using dmesg)
+to see whenever the driver knows your card or not. There is a line
+like this one:
+
+ cx8800[0]: subsystem: 0070:3400, board: Hauppauge WinTV \
+ 34xxx models [card=1,autodetected]
+
+If your card is listed as "board: UNKNOWN/GENERIC" it is unknown to
+the driver. What to do then?
+
+ (1) Try upgrading to the latest snapshot, maybe it has been added
+ meanwhile.
+ (2) You can try to create a new entry yourself, have a look at
+ cx88-cards.c. If that worked, mail me your changes as unified
+ diff ("diff -u").
+ (3) Or you can mail me the config information. I need at least the
+ following informations to add the card:
+
+ * the PCI Subsystem ID ("0070:3400" from the line above,
+ "lspci -v" output is fine too).
+ * the tuner type used by the card. You can try to find one by
+ trial-and-error using the tuner=<n> insmod option. If you
+ know which one the card has you can also have a look at the
+ list in CARDLIST.tuner
+
+Have fun,
+
+ Gerd
+
+--
+Gerd Knorr <kraxel@bytesex.org> [SuSE Labs]
diff --git a/Documentation/video4linux/README.ir b/Documentation/video4linux/README.ir
new file mode 100644
index 0000000..0da47a8
--- /dev/null
+++ b/Documentation/video4linux/README.ir
@@ -0,0 +1,72 @@
+
+infrared remote control support in video4linux drivers
+======================================================
+
+
+basics
+------
+
+Current versions use the linux input layer to support infrared
+remote controls. I suggest to download my input layer tools
+from http://bytesex.org/snapshot/input-<date>.tar.gz
+
+Modules you have to load:
+
+ saa7134 statically built in, i.e. just the driver :)
+ bttv ir-kbd-gpio or ir-kbd-i2c depending on your
+ card.
+
+ir-kbd-gpio and ir-kbd-i2c don't support all cards lirc supports
+(yet), mainly for the reason that the code of lirc_i2c and lirc_gpio
+was very confusing and I decided to basically start over from scratch.
+Feel free to contact me in case of trouble. Note that the ir-kbd-*
+modules work on 2.6.x kernels only through ...
+
+
+how it works
+------------
+
+The modules register the remote as keyboard within the linux input
+layer, i.e. you'll see the keys of the remote as normal key strokes
+(if CONFIG_INPUT_KEYBOARD is enabled).
+
+Using the event devices (CONFIG_INPUT_EVDEV) it is possible for
+applications to access the remote via /dev/input/event<n> devices.
+You might have to create the special files using "/sbin/MAKEDEV
+input". The input layer tools mentioned above use the event device.
+
+The input layer tools are nice for trouble shooting, i.e. to check
+whenever the input device is really present, which of the devices it
+is, check whenever pressing keys on the remote actually generates
+events and the like. You can also use the kbd utility to change the
+keymaps (2.6.x kernels only through).
+
+
+using with lircd
+================
+
+The cvs version of the lircd daemon supports reading events from the
+linux input layer (via event device). The input layer tools tarball
+comes with a lircd config file.
+
+
+using without lircd
+===================
+
+XFree86 likely can be configured to recognise the remote keys. Once I
+simply tried to configure one of the multimedia keyboards as input
+device, which had the effect that XFree86 recognised some of the keys
+of my remote control and passed volume up/down key presses as
+XF86AudioRaiseVolume and XF86AudioLowerVolume key events to the X11
+clients.
+
+It likely is possible to make that fly with a nice xkb config file,
+I know next to nothing about that through.
+
+
+Have fun,
+
+ Gerd
+
+--
+Gerd Knorr <kraxel@bytesex.org>
diff --git a/Documentation/video4linux/README.ivtv b/Documentation/video4linux/README.ivtv
new file mode 100644
index 0000000..73df22c
--- /dev/null
+++ b/Documentation/video4linux/README.ivtv
@@ -0,0 +1,187 @@
+
+ivtv release notes
+==================
+
+This is a v4l2 device driver for the Conexant cx23415/6 MPEG encoder/decoder.
+The cx23415 can do both encoding and decoding, the cx23416 can only do MPEG
+encoding. Currently the only card featuring full decoding support is the
+Hauppauge PVR-350.
+
+NOTE: this driver requires the latest encoder firmware (version 2.06.039, size
+376836 bytes). Get the firmware from here:
+
+http://dl.ivtvdriver.org/ivtv/firmware/firmware.tar.gz
+
+NOTE: 'normal' TV applications do not work with this driver, you need
+an application that can handle MPEG input such as mplayer, xine, MythTV,
+etc.
+
+The primary goal of the IVTV project is to provide a "clean room" Linux
+Open Source driver implementation for video capture cards based on the
+iCompression iTVC15 or Conexant CX23415/CX23416 MPEG Codec.
+
+Features:
+ * Hardware mpeg2 capture of broadcast video (and sound) via the tuner or
+ S-Video/Composite and audio line-in.
+ * Hardware mpeg2 capture of FM radio where hardware support exists
+ * Supports NTSC, PAL, SECAM with stereo sound
+ * Supports SAP and bilingual transmissions.
+ * Supports raw VBI (closed captions and teletext).
+ * Supports sliced VBI (closed captions and teletext) and is able to insert
+ this into the captured MPEG stream.
+ * Supports raw YUV and PCM input.
+
+Additional features for the PVR-350 (CX23415 based):
+ * Provides hardware mpeg2 playback
+ * Provides comprehensive OSD (On Screen Display: ie. graphics overlaying the
+ video signal)
+ * Provides a framebuffer (allowing X applications to appear on the video
+ device) (this framebuffer is not yet part of the kernel. In the meantime it
+ is available from www.ivtvdriver.org).
+ * Supports raw YUV output.
+
+IMPORTANT: In case of problems first read this page:
+ http://www.ivtvdriver.org/index.php/Troubleshooting
+
+See also:
+
+Homepage + Wiki
+http://www.ivtvdriver.org
+
+IRC
+irc://irc.freenode.net/ivtv-dev
+
+----------------------------------------------------------
+
+Devices
+=======
+
+A maximum of 12 ivtv boards are allowed at the moment.
+
+Cards that don't have a video output capability (i.e. non PVR350 cards)
+lack the vbi8, vbi16, video16 and video48 devices. They also do not
+support the framebuffer device /dev/fbx for OSD.
+
+The radio0 device may or may not be present, depending on whether the
+card has a radio tuner or not.
+
+Here is a list of the base v4l devices:
+crw-rw---- 1 root video 81, 0 Jun 19 22:22 /dev/video0
+crw-rw---- 1 root video 81, 16 Jun 19 22:22 /dev/video16
+crw-rw---- 1 root video 81, 24 Jun 19 22:22 /dev/video24
+crw-rw---- 1 root video 81, 32 Jun 19 22:22 /dev/video32
+crw-rw---- 1 root video 81, 48 Jun 19 22:22 /dev/video48
+crw-rw---- 1 root video 81, 64 Jun 19 22:22 /dev/radio0
+crw-rw---- 1 root video 81, 224 Jun 19 22:22 /dev/vbi0
+crw-rw---- 1 root video 81, 228 Jun 19 22:22 /dev/vbi8
+crw-rw---- 1 root video 81, 232 Jun 19 22:22 /dev/vbi16
+
+Base devices
+============
+
+For every extra card you have the numbers increased by one. For example,
+/dev/video0 is listed as the 'base' encoding capture device so we have:
+
+ /dev/video0 is the encoding capture device for the first card (card 0)
+ /dev/video1 is the encoding capture device for the second card (card 1)
+ /dev/video2 is the encoding capture device for the third card (card 2)
+
+Note that if the first card doesn't have a feature (eg no decoder, so no
+video16, the second card will still use video17. The simple rule is 'add
+the card number to the base device number'. If you have other capture
+cards (e.g. WinTV PCI) that are detected first, then you have to tell
+the ivtv module about it so that it will start counting at 1 (or 2, or
+whatever). Otherwise the device numbers can get confusing. The ivtv
+'ivtv_first_minor' module option can be used for that.
+
+
+/dev/video0
+The encoding capture device(s).
+Read-only.
+
+Reading from this device gets you the MPEG1/2 program stream.
+Example:
+
+cat /dev/video0 > my.mpg (you need to hit ctrl-c to exit)
+
+
+/dev/video16
+The decoder output device(s)
+Write-only. Only present if the MPEG decoder (i.e. CX23415) exists.
+
+An mpeg2 stream sent to this device will appear on the selected video
+display, audio will appear on the line-out/audio out. It is only
+available for cards that support video out. Example:
+
+cat my.mpg >/dev/video16
+
+
+/dev/video24
+The raw audio capture device(s).
+Read-only
+
+The raw audio PCM stereo stream from the currently selected
+tuner or audio line-in. Reading from this device results in a raw
+(signed 16 bit Little Endian, 48000 Hz, stereo pcm) capture.
+This device only captures audio. This should be replaced by an ALSA
+device in the future.
+Note that there is no corresponding raw audio output device, this is
+not supported in the decoder firmware.
+
+
+/dev/video32
+The raw video capture device(s)
+Read-only
+
+The raw YUV video output from the current video input. The YUV format
+is non-standard (V4L2_PIX_FMT_HM12).
+
+Note that the YUV and PCM streams are not synchronized, so they are of
+limited use.
+
+
+/dev/video48
+The raw video display device(s)
+Write-only. Only present if the MPEG decoder (i.e. CX23415) exists.
+
+Writes a YUV stream to the decoder of the card.
+
+
+/dev/radio0
+The radio tuner device(s)
+Cannot be read or written.
+
+Used to enable the radio tuner and tune to a frequency. You cannot
+read or write audio streams with this device. Once you use this
+device to tune the radio, use /dev/video24 to read the raw pcm stream
+or /dev/video0 to get an mpeg2 stream with black video.
+
+
+/dev/vbi0
+The 'vertical blank interval' (Teletext, CC, WSS etc) capture device(s)
+Read-only
+
+Captures the raw (or sliced) video data sent during the Vertical Blank
+Interval. This data is used to encode teletext, closed captions, VPS,
+widescreen signalling, electronic program guide information, and other
+services.
+
+
+/dev/vbi8
+Processed vbi feedback device(s)
+Read-only. Only present if the MPEG decoder (i.e. CX23415) exists.
+
+The sliced VBI data embedded in an MPEG stream is reproduced on this
+device. So while playing back a recording on /dev/video16, you can
+read the embedded VBI data from /dev/vbi8.
+
+
+/dev/vbi16
+The vbi 'display' device(s)
+Write-only. Only present if the MPEG decoder (i.e. CX23415) exists.
+
+Can be used to send sliced VBI data to the video-out connector.
+
+---------------------------------
+
+Hans Verkuil <hverkuil@xs4all.nl>
diff --git a/Documentation/video4linux/README.pvrusb2 b/Documentation/video4linux/README.pvrusb2
new file mode 100644
index 0000000..a747200
--- /dev/null
+++ b/Documentation/video4linux/README.pvrusb2
@@ -0,0 +1,212 @@
+
+$Id$
+Mike Isely <isely@pobox.com>
+
+ pvrusb2 driver
+
+Background:
+
+ This driver is intended for the "Hauppauge WinTV PVR USB 2.0", which
+ is a USB 2.0 hosted TV Tuner. This driver is a work in progress.
+ Its history started with the reverse-engineering effort by Björn
+ Danielsson <pvrusb2@dax.nu> whose web page can be found here:
+
+ http://pvrusb2.dax.nu/
+
+ From there Aurelien Alleaume <slts@free.fr> began an effort to
+ create a video4linux compatible driver. I began with Aurelien's
+ last known snapshot and evolved the driver to the state it is in
+ here.
+
+ More information on this driver can be found at:
+
+ http://www.isely.net/pvrusb2.html
+
+
+ This driver has a strong separation of layers. They are very
+ roughly:
+
+ 1a. Low level wire-protocol implementation with the device.
+
+ 1b. I2C adaptor implementation and corresponding I2C client drivers
+ implemented elsewhere in V4L.
+
+ 1c. High level hardware driver implementation which coordinates all
+ activities that ensure correct operation of the device.
+
+ 2. A "context" layer which manages instancing of driver, setup,
+ tear-down, arbitration, and interaction with high level
+ interfaces appropriately as devices are hotplugged in the
+ system.
+
+ 3. High level interfaces which glue the driver to various published
+ Linux APIs (V4L, sysfs, maybe DVB in the future).
+
+ The most important shearing layer is between the top 2 layers. A
+ lot of work went into the driver to ensure that any kind of
+ conceivable API can be laid on top of the core driver. (Yes, the
+ driver internally leverages V4L to do its work but that really has
+ nothing to do with the API published by the driver to the outside
+ world.) The architecture allows for different APIs to
+ simultaneously access the driver. I have a strong sense of fairness
+ about APIs and also feel that it is a good design principle to keep
+ implementation and interface isolated from each other. Thus while
+ right now the V4L high level interface is the most complete, the
+ sysfs high level interface will work equally well for similar
+ functions, and there's no reason I see right now why it shouldn't be
+ possible to produce a DVB high level interface that can sit right
+ alongside V4L.
+
+ NOTE: Complete documentation on the pvrusb2 driver is contained in
+ the html files within the doc directory; these are exactly the same
+ as what is on the web site at the time. Browse those files
+ (especially the FAQ) before asking questions.
+
+
+Building
+
+ To build these modules essentially amounts to just running "Make",
+ but you need the kernel source tree nearby and you will likely also
+ want to set a few controlling environment variables first in order
+ to link things up with that source tree. Please see the Makefile
+ here for comments that explain how to do that.
+
+
+Source file list / functional overview:
+
+ (Note: The term "module" used below generally refers to loosely
+ defined functional units within the pvrusb2 driver and bears no
+ relation to the Linux kernel's concept of a loadable module.)
+
+ pvrusb2-audio.[ch] - This is glue logic that resides between this
+ driver and the msp3400.ko I2C client driver (which is found
+ elsewhere in V4L).
+
+ pvrusb2-context.[ch] - This module implements the context for an
+ instance of the driver. Everything else eventually ties back to
+ or is otherwise instanced within the data structures implemented
+ here. Hotplugging is ultimately coordinated here. All high level
+ interfaces tie into the driver through this module. This module
+ helps arbitrate each interface's access to the actual driver core,
+ and is designed to allow concurrent access through multiple
+ instances of multiple interfaces (thus you can for example change
+ the tuner's frequency through sysfs while simultaneously streaming
+ video through V4L out to an instance of mplayer).
+
+ pvrusb2-debug.h - This header defines a printk() wrapper and a mask
+ of debugging bit definitions for the various kinds of debug
+ messages that can be enabled within the driver.
+
+ pvrusb2-debugifc.[ch] - This module implements a crude command line
+ oriented debug interface into the driver. Aside from being part
+ of the process for implementing manual firmware extraction (see
+ the pvrusb2 web site mentioned earlier), probably I'm the only one
+ who has ever used this. It is mainly a debugging aid.
+
+ pvrusb2-eeprom.[ch] - This is glue logic that resides between this
+ driver the tveeprom.ko module, which is itself implemented
+ elsewhere in V4L.
+
+ pvrusb2-encoder.[ch] - This module implements all protocol needed to
+ interact with the Conexant mpeg2 encoder chip within the pvrusb2
+ device. It is a crude echo of corresponding logic in ivtv,
+ however the design goals (strict isolation) and physical layer
+ (proxy through USB instead of PCI) are enough different that this
+ implementation had to be completely different.
+
+ pvrusb2-hdw-internal.h - This header defines the core data structure
+ in the driver used to track ALL internal state related to control
+ of the hardware. Nobody outside of the core hardware-handling
+ modules should have any business using this header. All external
+ access to the driver should be through one of the high level
+ interfaces (e.g. V4L, sysfs, etc), and in fact even those high
+ level interfaces are restricted to the API defined in
+ pvrusb2-hdw.h and NOT this header.
+
+ pvrusb2-hdw.h - This header defines the full internal API for
+ controlling the hardware. High level interfaces (e.g. V4L, sysfs)
+ will work through here.
+
+ pvrusb2-hdw.c - This module implements all the various bits of logic
+ that handle overall control of a specific pvrusb2 device.
+ (Policy, instantiation, and arbitration of pvrusb2 devices fall
+ within the jurisdiction of pvrusb-context not here).
+
+ pvrusb2-i2c-chips-*.c - These modules implement the glue logic to
+ tie together and configure various I2C modules as they attach to
+ the I2C bus. There are two versions of this file. The "v4l2"
+ version is intended to be used in-tree alongside V4L, where we
+ implement just the logic that makes sense for a pure V4L
+ environment. The "all" version is intended for use outside of
+ V4L, where we might encounter other possibly "challenging" modules
+ from ivtv or older kernel snapshots (or even the support modules
+ in the standalone snapshot).
+
+ pvrusb2-i2c-cmd-v4l1.[ch] - This module implements generic V4L1
+ compatible commands to the I2C modules. It is here where state
+ changes inside the pvrusb2 driver are translated into V4L1
+ commands that are in turn send to the various I2C modules.
+
+ pvrusb2-i2c-cmd-v4l2.[ch] - This module implements generic V4L2
+ compatible commands to the I2C modules. It is here where state
+ changes inside the pvrusb2 driver are translated into V4L2
+ commands that are in turn send to the various I2C modules.
+
+ pvrusb2-i2c-core.[ch] - This module provides an implementation of a
+ kernel-friendly I2C adaptor driver, through which other external
+ I2C client drivers (e.g. msp3400, tuner, lirc) may connect and
+ operate corresponding chips within the pvrusb2 device. It is
+ through here that other V4L modules can reach into this driver to
+ operate specific pieces (and those modules are in turn driven by
+ glue logic which is coordinated by pvrusb2-hdw, doled out by
+ pvrusb2-context, and then ultimately made available to users
+ through one of the high level interfaces).
+
+ pvrusb2-io.[ch] - This module implements a very low level ring of
+ transfer buffers, required in order to stream data from the
+ device. This module is *very* low level. It only operates the
+ buffers and makes no attempt to define any policy or mechanism for
+ how such buffers might be used.
+
+ pvrusb2-ioread.[ch] - This module layers on top of pvrusb2-io.[ch]
+ to provide a streaming API usable by a read() system call style of
+ I/O. Right now this is the only layer on top of pvrusb2-io.[ch],
+ however the underlying architecture here was intended to allow for
+ other styles of I/O to be implemented with additonal modules, like
+ mmap()'ed buffers or something even more exotic.
+
+ pvrusb2-main.c - This is the top level of the driver. Module level
+ and USB core entry points are here. This is our "main".
+
+ pvrusb2-sysfs.[ch] - This is the high level interface which ties the
+ pvrusb2 driver into sysfs. Through this interface you can do
+ everything with the driver except actually stream data.
+
+ pvrusb2-tuner.[ch] - This is glue logic that resides between this
+ driver and the tuner.ko I2C client driver (which is found
+ elsewhere in V4L).
+
+ pvrusb2-util.h - This header defines some common macros used
+ throughout the driver. These macros are not really specific to
+ the driver, but they had to go somewhere.
+
+ pvrusb2-v4l2.[ch] - This is the high level interface which ties the
+ pvrusb2 driver into video4linux. It is through here that V4L
+ applications can open and operate the driver in the usual V4L
+ ways. Note that **ALL** V4L functionality is published only
+ through here and nowhere else.
+
+ pvrusb2-video-*.[ch] - This is glue logic that resides between this
+ driver and the saa711x.ko I2C client driver (which is found
+ elsewhere in V4L). Note that saa711x.ko used to be known as
+ saa7115.ko in ivtv. There are two versions of this; one is
+ selected depending on the particular saa711[5x].ko that is found.
+
+ pvrusb2.h - This header contains compile time tunable parameters
+ (and at the moment the driver has very little that needs to be
+ tuned).
+
+
+ -Mike Isely
+ isely@pobox.com
+
diff --git a/Documentation/video4linux/README.saa7134 b/Documentation/video4linux/README.saa7134
new file mode 100644
index 0000000..b911f08
--- /dev/null
+++ b/Documentation/video4linux/README.saa7134
@@ -0,0 +1,82 @@
+
+
+What is it?
+===========
+
+This is a v4l2/oss device driver for saa7130/33/34/35 based capture / TV
+boards. See http://www.semiconductors.philips.com/pip/saa7134hl for a
+description.
+
+
+Status
+======
+
+Almost everything is working. video, sound, tuner, radio, mpeg ts, ...
+
+As with bttv, card-specific tweaks are needed. Check CARDLIST for a
+list of known TV cards and saa7134-cards.c for the drivers card
+configuration info.
+
+
+Build
+=====
+
+Pick up videodev + v4l2 patches from http://bytesex.org/patches/.
+Configure, build, install + boot the new kernel. You'll need at least
+these config options:
+
+ CONFIG_I2C=m
+ CONFIG_VIDEO_DEV=m
+
+Type "make" to build the driver now. "make install" installs the
+driver. "modprobe saa7134" should load it. Depending on the card you
+might have to pass card=<nr> as insmod option, check CARDLIST for
+valid choices.
+
+
+Changes / Fixes
+===============
+
+Please mail me unified diffs ("diff -u") with your changes, and don't
+forget to tell me what it changes / which problem it fixes / whatever
+it is good for ...
+
+
+Known Problems
+==============
+
+* The tuner for the flyvideos isn't detected automatically and the
+ default might not work for you depending on which version you have.
+ There is a tuner= insmod option to override the driver's default.
+
+Card Variations:
+================
+
+Cards can use either of these two crystals (xtal):
+ - 32.11 MHz -> .audio_clock=0x187de7
+ - 24.576MHz -> .audio_clock=0x200000
+(xtal * .audio_clock = 51539600)
+
+Some details about 30/34/35:
+
+ - saa7130 - low-price chip, doesn't have mute, that is why all those
+ cards should have .mute field defined in their tuner structure.
+
+ - saa7134 - usual chip
+
+ - saa7133/35 - saa7135 is probably a marketing decision, since all those
+ chips identifies itself as 33 on pci.
+
+Credits
+=======
+
+andrew.stevens@philips.com + werner.leeb@philips.com for providing
+saa7134 hardware specs and sample board.
+
+
+Have fun,
+
+ Gerd
+
+--
+Gerd Knorr <kraxel@bytesex.org> [SuSE Labs]
diff --git a/Documentation/video4linux/Zoran b/Documentation/video4linux/Zoran
new file mode 100644
index 0000000..295462b
--- /dev/null
+++ b/Documentation/video4linux/Zoran
@@ -0,0 +1,580 @@
+Frequently Asked Questions:
+===========================
+subject: unified zoran driver (zr360x7, zoran, buz, dc10(+), dc30(+), lml33)
+website: http://mjpeg.sourceforge.net/driver-zoran/
+
+1. What cards are supported
+1.1 What the TV decoder can do an what not
+1.2 What the TV encoder can do an what not
+2. How do I get this damn thing to work
+3. What mainboard should I use (or why doesn't my card work)
+4. Programming interface
+5. Applications
+6. Concerning buffer sizes, quality, output size etc.
+7. It hangs/crashes/fails/whatevers! Help!
+8. Maintainers/Contacting
+9. License
+
+===========================
+
+1. What cards are supported
+
+Iomega Buz, Linux Media Labs LML33/LML33R10, Pinnacle/Miro
+DC10/DC10+/DC30/DC30+ and related boards (available under various names).
+
+Iomega Buz:
+* Zoran zr36067 PCI controller
+* Zoran zr36060 MJPEG codec
+* Philips saa7111 TV decoder
+* Philips saa7185 TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, saa7111, saa7185, zr36060, zr36067
+Inputs/outputs: Composite and S-video
+Norms: PAL, SECAM (720x576 @ 25 fps), NTSC (720x480 @ 29.97 fps)
+Card number: 7
+
+AverMedia 6 Eyes AVS6EYES:
+* Zoran zr36067 PCI controller
+* Zoran zr36060 MJPEG codec
+* Samsung ks0127 TV decoder
+* Conexant bt866 TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, ks0127, bt866, zr36060, zr36067
+Inputs/outputs: Six physical inputs. 1-6 are composite,
+ 1-2, 3-4, 5-6 doubles as S-video,
+ 1-3 triples as component.
+ One composite output.
+Norms: PAL, SECAM (720x576 @ 25 fps), NTSC (720x480 @ 29.97 fps)
+Card number: 8
+Not autodetected, card=8 is necessary.
+
+Linux Media Labs LML33:
+* Zoran zr36067 PCI controller
+* Zoran zr36060 MJPEG codec
+* Brooktree bt819 TV decoder
+* Brooktree bt856 TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, bt819, bt856, zr36060, zr36067
+Inputs/outputs: Composite and S-video
+Norms: PAL (720x576 @ 25 fps), NTSC (720x480 @ 29.97 fps)
+Card number: 5
+
+Linux Media Labs LML33R10:
+* Zoran zr36067 PCI controller
+* Zoran zr36060 MJPEG codec
+* Philips saa7114 TV decoder
+* Analog Devices adv7170 TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, saa7114, adv7170, zr36060, zr36067
+Inputs/outputs: Composite and S-video
+Norms: PAL (720x576 @ 25 fps), NTSC (720x480 @ 29.97 fps)
+Card number: 6
+
+Pinnacle/Miro DC10(new):
+* Zoran zr36057 PCI controller
+* Zoran zr36060 MJPEG codec
+* Philips saa7110a TV decoder
+* Analog Devices adv7176 TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, saa7110, adv7175, zr36060, zr36067
+Inputs/outputs: Composite, S-video and Internal
+Norms: PAL, SECAM (768x576 @ 25 fps), NTSC (640x480 @ 29.97 fps)
+Card number: 1
+
+Pinnacle/Miro DC10+:
+* Zoran zr36067 PCI controller
+* Zoran zr36060 MJPEG codec
+* Philips saa7110a TV decoder
+* Analog Devices adv7176 TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, sa7110, adv7175, zr36060, zr36067
+Inputs/outputs: Composite, S-video and Internal
+Norms: PAL, SECAM (768x576 @ 25 fps), NTSC (640x480 @ 29.97 fps)
+Card number: 2
+
+Pinnacle/Miro DC10(old): *
+* Zoran zr36057 PCI controller
+* Zoran zr36050 MJPEG codec
+* Zoran zr36016 Video Front End or Fuji md0211 Video Front End (clone?)
+* Micronas vpx3220a TV decoder
+* mse3000 TV encoder or Analog Devices adv7176 TV encoder *
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, vpx3220, mse3000/adv7175, zr36050, zr36016, zr36067
+Inputs/outputs: Composite, S-video and Internal
+Norms: PAL, SECAM (768x576 @ 25 fps), NTSC (640x480 @ 29.97 fps)
+Card number: 0
+
+Pinnacle/Miro DC30: *
+* Zoran zr36057 PCI controller
+* Zoran zr36050 MJPEG codec
+* Zoran zr36016 Video Front End
+* Micronas vpx3225d/vpx3220a/vpx3216b TV decoder
+* Analog Devices adv7176 TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, vpx3220/vpx3224, adv7175, zr36050, zr36016, zr36067
+Inputs/outputs: Composite, S-video and Internal
+Norms: PAL, SECAM (768x576 @ 25 fps), NTSC (640x480 @ 29.97 fps)
+Card number: 3
+
+Pinnacle/Miro DC30+: *
+* Zoran zr36067 PCI controller
+* Zoran zr36050 MJPEG codec
+* Zoran zr36016 Video Front End
+* Micronas vpx3225d/vpx3220a/vpx3216b TV decoder
+* Analog Devices adv7176 TV encoder
+Drivers to use: videodev, i2c-core, i2c-algo-bit,
+ videocodec, vpx3220/vpx3224, adv7175, zr36050, zr36015, zr36067
+Inputs/outputs: Composite, S-video and Internal
+Norms: PAL, SECAM (768x576 @ 25 fps), NTSC (640x480 @ 29.97 fps)
+Card number: 4
+
+Note: No module for the mse3000 is available yet
+Note: No module for the vpx3224 is available yet
+Note: use encoder=X or decoder=X for non-default i2c chips (see i2c-id.h)
+
+===========================
+
+1.1 What the TV decoder can do an what not
+
+The best know TV standards are NTSC/PAL/SECAM. but for decoding a frame that
+information is not enough. There are several formats of the TV standards.
+And not every TV decoder is able to handle every format. Also the every
+combination is supported by the driver. There are currently 11 different
+tv broadcast formats all aver the world.
+
+The CCIR defines parameters needed for broadcasting the signal.
+The CCIR has defined different standards: A,B,D,E,F,G,D,H,I,K,K1,L,M,N,...
+The CCIR says not much about the colorsystem used !!!
+And talking about a colorsystem says not to much about how it is broadcast.
+
+The CCIR standards A,E,F are not used any more.
+
+When you speak about NTSC, you usually mean the standard: CCIR - M using
+the NTSC colorsystem which is used in the USA, Japan, Mexico, Canada
+and a few others.
+
+When you talk about PAL, you usually mean: CCIR - B/G using the PAL
+colorsystem which is used in many Countries.
+
+When you talk about SECAM, you mean: CCIR - L using the SECAM Colorsystem
+which is used in France, and a few others.
+
+There the other version of SECAM, CCIR - D/K is used in Bulgaria, China,
+Slovakai, Hungary, Korea (Rep.), Poland, Rumania and a others.
+
+The CCIR - H uses the PAL colorsystem (sometimes SECAM) and is used in
+Egypt, Libya, Sri Lanka, Syrain Arab. Rep.
+
+The CCIR - I uses the PAL colorsystem, and is used in Great Britain, Hong Kong,
+Ireland, Nigeria, South Africa.
+
+The CCIR - N uses the PAL colorsystem and PAL frame size but the NTSC framerate,
+and is used in Argentinia, Uruguay, an a few others
+
+We do not talk about how the audio is broadcast !
+
+A rather good sites about the TV standards are:
+http://www.sony.jp/ServiceArea/Voltage_map/
+http://info.electronicwerkstatt.de/bereiche/fernsehtechnik/frequenzen_und_normen/Fernsehnormen/
+and http://www.cabl.com/restaurant/channel.html
+
+Other weird things around: NTSC 4.43 is a modificated NTSC, which is mainly
+used in PAL VCR's that are able to play back NTSC. PAL 60 seems to be the same
+as NTSC 4.43 . The Datasheets also talk about NTSC 44, It seems as if it would
+be the same as NTSC 4.43.
+NTSC Combs seems to be a decoder mode where the decoder uses a comb filter
+to split coma and luma instead of a Delay line.
+
+But I did not defiantly find out what NTSC Comb is.
+
+Philips saa7111 TV decoder
+was introduced in 1997, is used in the BUZ and
+can handle: PAL B/G/H/I, PAL N, PAL M, NTSC M, NTSC N, NTSC 4.43 and SECAM
+
+Philips saa7110a TV decoder
+was introduced in 1995, is used in the Pinnacle/Miro DC10(new), DC10+ and
+can handle: PAL B/G, NTSC M and SECAM
+
+Philips saa7114 TV decoder
+was introduced in 2000, is used in the LML33R10 and
+can handle: PAL B/G/D/H/I/N, PAL N, PAL M, NTSC M, NTSC 4.43 and SECAM
+
+Brooktree bt819 TV decoder
+was introduced in 1996, and is used in the LML33 and
+can handle: PAL B/D/G/H/I, NTSC M
+
+Micronas vpx3220a TV decoder
+was introduced in 1996, is used in the DC30 and DC30+ and
+can handle: PAL B/G/H/I, PAL N, PAL M, NTSC M, NTSC 44, PAL 60, SECAM,NTSC Comb
+
+Samsung ks0127 TV decoder
+is used in the AVS6EYES card and
+can handle: NTSC-M/N/44, PAL-M/N/B/G/H/I/D/K/L and SECAM
+
+===========================
+
+1.2 What the TV encoder can do an what not
+
+The TV encoder are doing the "same" as the decoder, but in the oder direction.
+You feed them digital data and the generate a Composite or SVHS signal.
+For information about the colorsystems and TV norm take a look in the
+TV decoder section.
+
+Philips saa7185 TV Encoder
+was introduced in 1996, is used in the BUZ
+can generate: PAL B/G, NTSC M
+
+Brooktree bt856 TV Encoder
+was introduced in 1994, is used in the LML33
+can generate: PAL B/D/G/H/I/N, PAL M, NTSC M, PAL-N (Argentina)
+
+Analog Devices adv7170 TV Encoder
+was introduced in 2000, is used in the LML300R10
+can generate: PAL B/D/G/H/I/N, PAL M, NTSC M, PAL 60
+
+Analog Devices adv7175 TV Encoder
+was introduced in 1996, is used in the DC10, DC10+, DC10 old, DC30, DC30+
+can generate: PAL B/D/G/H/I/N, PAL M, NTSC M
+
+ITT mse3000 TV encoder
+was introduced in 1991, is used in the DC10 old
+can generate: PAL , NTSC , SECAM
+
+Conexant bt866 TV encoder
+is used in AVS6EYES, and
+can generate: NTSC/PAL, PAL­M, PAL­N
+
+The adv717x, should be able to produce PAL N. But you find nothing PAL N
+specific in the registers. Seem that you have to reuse a other standard
+to generate PAL N, maybe it would work if you use the PAL M settings.
+
+==========================
+
+2. How do I get this damn thing to work
+
+Load zr36067.o. If it can't autodetect your card, use the card=X insmod
+option with X being the card number as given in the previous section.
+To have more than one card, use card=X1[,X2[,X3,[X4[..]]]]
+
+To automate this, add the following to your /etc/modprobe.conf:
+
+options zr36067 card=X1[,X2[,X3[,X4[..]]]]
+alias char-major-81-0 zr36067
+
+One thing to keep in mind is that this doesn't load zr36067.o itself yet. It
+just automates loading. If you start using xawtv, the device won't load on
+some systems, since you're trying to load modules as a user, which is not
+allowed ("permission denied"). A quick workaround is to add 'Load "v4l"' to
+XF86Config-4 when you use X by default, or to run 'v4l-conf -c <device>' in
+one of your startup scripts (normally rc.local) if you don't use X. Both
+make sure that the modules are loaded on startup, under the root account.
+
+===========================
+
+3. What mainboard should I use (or why doesn't my card work)
+
+<insert lousy disclaimer here>. In short: good=SiS/Intel, bad=VIA.
+
+Experience tells us that people with a Buz, on average, have more problems
+than users with a DC10+/LML33. Also, it tells us that people owning a VIA-
+based mainboard (ktXXX, MVP3) have more problems than users with a mainboard
+based on a different chipset. Here's some notes from Andrew Stevens:
+--
+Here's my experience of using LML33 and Buz on various motherboards:
+
+VIA MVP3
+ Forget it. Pointless. Doesn't work.
+Intel 430FX (Pentium 200)
+ LML33 perfect, Buz tolerable (3 or 4 frames dropped per movie)
+Intel 440BX (early stepping)
+ LML33 tolerable. Buz starting to get annoying (6-10 frames/hour)
+Intel 440BX (late stepping)
+ Buz tolerable, LML3 almost perfect (occasional single frame drops)
+SiS735
+ LML33 perfect, Buz tolerable.
+VIA KT133(*)
+ LML33 starting to get annoying, Buz poor enough that I have up.
+
+Both 440BX boards were dual CPU versions.
+--
+Bernhard Praschinger later added:
+--
+AMD 751
+ Buz perfect-tolerable
+AMD 760
+ Buz perfect-tolerable
+--
+In general, people on the user mailinglist won't give you much of a chance
+if you have a VIA-based motherboard. They may be cheap, but sometimes, you'd
+rather want to spend some more money on better boards. In general, VIA
+mainboard's IDE/PCI performance will also suck badly compared to others.
+You'll noticed the DC10+/DC30+ aren't mentioned anywhere in the overview.
+Basically, you can assume that if the Buz works, the LML33 will work too. If
+the LML33 works, the DC10+/DC30+ will work too. They're most tolerant to
+different mainboard chipsets from all of the supported cards.
+
+If you experience timeouts during capture, buy a better mainboard or lower
+the quality/buffersize during capture (see 'Concerning buffer sizes, quality,
+output size etc.'). If it hangs, there's little we can do as of now. Check
+your IRQs and make sure the card has its own interrupts.
+
+===========================
+
+4. Programming interface
+
+This driver conforms to video4linux and video4linux2, both can be used to
+use the driver. Since video4linux didn't provide adequate calls to fully
+use the cards' features, we've introduced several programming extensions,
+which are currently officially accepted in the 2.4.x branch of the kernel.
+These extensions are known as the v4l/mjpeg extensions. See zoran.h for
+details (structs/ioctls).
+
+Information - video4linux:
+http://roadrunner.swansea.linux.org.uk/v4lapi.shtml
+Documentation/video4linux/API.html
+/usr/include/linux/videodev.h
+
+Information - video4linux/mjpeg extensions:
+./zoran.h
+(also see below)
+
+Information - video4linux2:
+http://linuxtv.org
+http://v4l2spec.bytesex.org/
+/usr/include/linux/videodev2.h
+
+More information on the video4linux/mjpeg extensions, by Serguei
+Miridonovi and Rainer Johanni:
+--
+The ioctls for that interface are as follows:
+
+BUZIOC_G_PARAMS
+BUZIOC_S_PARAMS
+
+Get and set the parameters of the buz. The user should always do a
+BUZIOC_G_PARAMS (with a struct buz_params) to obtain the default
+settings, change what he likes and then make a BUZIOC_S_PARAMS call.
+
+BUZIOC_REQBUFS
+
+Before being able to capture/playback, the user has to request
+the buffers he is wanting to use. Fill the structure
+zoran_requestbuffers with the size (recommended: 256*1024) and
+the number (recommended 32 up to 256). There are no such restrictions
+as for the Video for Linux buffers, you should LEAVE SUFFICIENT
+MEMORY for your system however, else strange things will happen ....
+On return, the zoran_requestbuffers structure contains number and
+size of the actually allocated buffers.
+You should use these numbers for doing a mmap of the buffers
+into the user space.
+The BUZIOC_REQBUFS ioctl also makes it happen, that the next mmap
+maps the MJPEG buffer instead of the V4L buffers.
+
+BUZIOC_QBUF_CAPT
+BUZIOC_QBUF_PLAY
+
+Queue a buffer for capture or playback. The first call also starts
+streaming capture. When streaming capture is going on, you may
+only queue further buffers or issue syncs until streaming
+capture is switched off again with a argument of -1 to
+a BUZIOC_QBUF_CAPT/BUZIOC_QBUF_PLAY ioctl.
+
+BUZIOC_SYNC
+
+Issue this ioctl when all buffers are queued. This ioctl will
+block until the first buffer becomes free for saving its
+data to disk (after BUZIOC_QBUF_CAPT) or for reuse (after BUZIOC_QBUF_PLAY).
+
+BUZIOC_G_STATUS
+
+Get the status of the input lines (video source connected/norm).
+
+For programming example, please, look at lavrec.c and lavplay.c code in
+lavtools-1.2p2 package (URL: http://www.cicese.mx/~mirsev/DC10plus/)
+and the 'examples' directory in the original Buz driver distribution.
+
+Additional notes for software developers:
+
+ The driver returns maxwidth and maxheight parameters according to
+ the current TV standard (norm). Therefore, the software which
+ communicates with the driver and "asks" for these parameters should
+ first set the correct norm. Well, it seems logically correct: TV
+ standard is "more constant" for current country than geometry
+ settings of a variety of TV capture cards which may work in ITU or
+ square pixel format. Remember that users now can lock the norm to
+ avoid any ambiguity.
+--
+Please note that lavplay/lavrec are also included in the MJPEG-tools
+(http://mjpeg.sf.net/).
+
+===========================
+
+5. Applications
+
+Applications known to work with this driver:
+
+TV viewing:
+* xawtv
+* kwintv
+* probably any TV application that supports video4linux or video4linux2.
+
+MJPEG capture/playback:
+* mjpegtools/lavtools (or Linux Video Studio)
+* gstreamer
+* mplayer
+
+General raw capture:
+* xawtv
+* gstreamer
+* probably any application that supports video4linux or video4linux2
+
+Video editing:
+* Cinelerra
+* MainActor
+* mjpegtools (or Linux Video Studio)
+
+===========================
+
+6. Concerning buffer sizes, quality, output size etc.
+
+The zr36060 can do 1:2 JPEG compression. This is really the theoretical
+maximum that the chipset can reach. The driver can, however, limit compression
+to a maximum (size) of 1:4. The reason for this is that some cards (e.g. Buz)
+can't handle 1:2 compression without stopping capture after only a few minutes.
+With 1:4, it'll mostly work. If you have a Buz, use 'low_bitrate=1' to go into
+1:4 max. compression mode.
+
+100% JPEG quality is thus 1:2 compression in practice. So for a full PAL frame
+(size 720x576). The JPEG fields are stored in YUY2 format, so the size of the
+fields are 720x288x16/2 bits/field (2 fields/frame) = 207360 bytes/field x 2 =
+414720 bytes/frame (add some more bytes for headers and DHT (huffman)/DQT
+(quantization) tables, and you'll get to something like 512kB per frame for
+1:2 compression. For 1:4 compression, you'd have frames of half this size.
+
+Some additional explanation by Martin Samuelsson, which also explains the
+importance of buffer sizes:
+--
+> Hmm, I do not think it is really that way. With the current (downloaded
+> at 18:00 Monday) driver I get that output sizes for 10 sec:
+> -q 50 -b 128 : 24.283.332 Bytes
+> -q 50 -b 256 : 48.442.368
+> -q 25 -b 128 : 24.655.992
+> -q 25 -b 256 : 25.859.820
+
+I woke up, and can't go to sleep again. I'll kill some time explaining why
+this doesn't look strange to me.
+
+Let's do some math using a width of 704 pixels. I'm not sure whether the Buz
+actually use that number or not, but that's not too important right now.
+
+704x288 pixels, one field, is 202752 pixels. Divided by 64 pixels per block;
+3168 blocks per field. Each pixel consist of two bytes; 128 bytes per block;
+1024 bits per block. 100% in the new driver mean 1:2 compression; the maximum
+output becomes 512 bits per block. Actually 510, but 512 is simpler to use
+for calculations.
+
+Let's say that we specify d1q50. We thus want 256 bits per block; times 3168
+becomes 811008 bits; 101376 bytes per field. We're talking raw bits and bytes
+here, so we don't need to do any fancy corrections for bits-per-pixel or such
+things. 101376 bytes per field.
+
+d1 video contains two fields per frame. Those sum up to 202752 bytes per
+frame, and one of those frames goes into each buffer.
+
+But wait a second! -b128 gives 128kB buffers! It's not possible to cram
+202752 bytes of JPEG data into 128kB!
+
+This is what the driver notice and automatically compensate for in your
+examples. Let's do some math using this information:
+
+128kB is 131072 bytes. In this buffer, we want to store two fields, which
+leaves 65536 bytes for each field. Using 3168 blocks per field, we get
+20.68686868... available bytes per block; 165 bits. We can't allow the
+request for 256 bits per block when there's only 165 bits available! The -q50
+option is silently overridden, and the -b128 option takes precedence, leaving
+us with the equivalence of -q32.
+
+This gives us a data rate of 165 bits per block, which, times 3168, sums up
+to 65340 bytes per field, out of the allowed 65536. The current driver has
+another level of rate limiting; it won't accept -q values that fill more than
+6/8 of the specified buffers. (I'm not sure why. "Playing it safe" seem to be
+a safe bet. Personally, I think I would have lowered requested-bits-per-block
+by one, or something like that.) We can't use 165 bits per block, but have to
+lower it again, to 6/8 of the available buffer space: We end up with 124 bits
+per block, the equivalence of -q24. With 128kB buffers, you can't use greater
+than -q24 at -d1. (And PAL, and 704 pixels width...)
+
+The third example is limited to -q24 through the same process. The second
+example, using very similar calculations, is limited to -q48. The only
+example that actually grab at the specified -q value is the last one, which
+is clearly visible, looking at the file size.
+--
+
+Conclusion: the quality of the resulting movie depends on buffer size, quality,
+whether or not you use 'low_bitrate=1' as insmod option for the zr36060.c
+module to do 1:4 instead of 1:2 compression, etc.
+
+If you experience timeouts, lowering the quality/buffersize or using
+'low_bitrate=1 as insmod option for zr36060.o might actually help, as is
+proven by the Buz.
+
+===========================
+
+7. It hangs/crashes/fails/whatevers! Help!
+
+Make sure that the card has its own interrupts (see /proc/interrupts), check
+the output of dmesg at high verbosity (load zr36067.o with debug=2,
+load all other modules with debug=1). Check that your mainboard is favorable
+(see question 2) and if not, test the card in another computer. Also see the
+notes given in question 3 and try lowering quality/buffersize/capturesize
+if recording fails after a period of time.
+
+If all this doesn't help, give a clear description of the problem including
+detailed hardware information (memory+brand, mainboard+chipset+brand, which
+MJPEG card, processor, other PCI cards that might be of interest), give the
+system PnP information (/proc/interrupts, /proc/dma, /proc/devices), and give
+the kernel version, driver version, glibc version, gcc version and any other
+information that might possibly be of interest. Also provide the dmesg output
+at high verbosity. See 'Contacting' on how to contact the developers.
+
+===========================
+
+8. Maintainers/Contacting
+
+The driver is currently maintained by Laurent Pinchart and Ronald Bultje
+(<laurent.pinchart@skynet.be> and <rbultje@ronald.bitfreak.net>). For bug
+reports or questions, please contact the mailinglist instead of the developers
+individually. For user questions (i.e. bug reports or how-to questions), send
+an email to <mjpeg-users@lists.sf.net>, for developers (i.e. if you want to
+help programming), send an email to <mjpeg-developer@lists.sf.net>. See
+http://www.sf.net/projects/mjpeg/ for subscription information.
+
+For bug reports, be sure to include all the information as described in
+the section 'It hangs/crashes/fails/whatevers! Help!'. Please make sure
+you're using the latest version (http://mjpeg.sf.net/driver-zoran/).
+
+Previous maintainers/developers of this driver include Serguei Miridonov
+<mirsev@cicese.mx>, Wolfgang Scherr <scherr@net4you.net>, Dave Perks
+<dperks@ibm.net> and Rainer Johanni <Rainer@Johanni.de>.
+
+===========================
+
+9. License
+
+This driver is distributed under the terms of the General Public License.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+See http://www.gnu.org/ for more information.
diff --git a/Documentation/video4linux/bttv/CONTRIBUTORS b/Documentation/video4linux/bttv/CONTRIBUTORS
new file mode 100644
index 0000000..eb41b26
--- /dev/null
+++ b/Documentation/video4linux/bttv/CONTRIBUTORS
@@ -0,0 +1,25 @@
+Contributors to bttv:
+
+Michael Chu <mmchu@pobox.com>
+ AverMedia fix and more flexible card recognition
+
+Alan Cox <alan@lxorguk.ukuu.org.uk>
+ Video4Linux interface and 2.1.x kernel adaptation
+
+Chris Kleitsch
+ Hardware I2C
+
+Gerd Knorr <kraxel@cs.tu-berlin.de>
+ Radio card (ITT sound processor)
+
+bigfoot <bigfoot@net-way.net>
+Ragnar Hojland Espinosa <ragnar@macula.net>
+ ConferenceTV card
+
+
++ many more (please mail me if you are missing in this list and would
+ like to be mentioned)
+
+
+
+
diff --git a/Documentation/video4linux/bttv/Cards b/Documentation/video4linux/bttv/Cards
new file mode 100644
index 0000000..d338965
--- /dev/null
+++ b/Documentation/video4linux/bttv/Cards
@@ -0,0 +1,964 @@
+
+Gunther Mayer's bttv card gallery (graphical version of this text file :-)
+is available at: http://www.bttv-gallery.de/
+
+
+Supported cards:
+Bt848/Bt848a/Bt849/Bt878/Bt879 cards
+------------------------------------
+
+All cards with Bt848/Bt848a/Bt849/Bt878/Bt879 and normal
+Composite/S-VHS inputs are supported. Teletext and Intercast support
+(PAL only) for ALL cards via VBI sample decoding in software.
+
+Some cards with additional multiplexing of inputs or other additional
+fancy chips are only partially supported (unless specifications by the
+card manufacturer are given). When a card is listed here it isn't
+necessarily fully supported.
+
+All other cards only differ by additional components as tuners, sound
+decoders, EEPROMs, teletext decoders ...
+
+
+Unsupported Cards:
+------------------
+
+Cards with Zoran (ZR) or Philips (SAA) or ISA are not supported by
+this driver.
+
+
+MATRIX Vision
+-------------
+
+MV-Delta
+- Bt848A
+- 4 Composite inputs, 1 S-VHS input (shared with 4th composite)
+- EEPROM
+
+http://www.matrix-vision.de/
+
+This card has no tuner but supports all 4 composite (1 shared with an
+S-VHS input) of the Bt848A.
+Very nice card if you only have satellite TV but several tuners connected
+to the card via composite.
+
+Many thanks to Matrix-Vision for giving us 2 cards for free which made
+Bt848a/Bt849 single crytal operation support possible!!!
+
+
+
+Miro/Pinnacle PCTV
+------------------
+
+- Bt848
+ some (all??) come with 2 crystals for PAL/SECAM and NTSC
+- PAL, SECAM or NTSC TV tuner (Philips or TEMIC)
+- MSP34xx sound decoder on add on board
+ decoder is supported but AFAIK does not yet work
+ (other sound MUX setting in GPIO port needed??? somebody who fixed this???)
+- 1 tuner, 1 composite and 1 S-VHS input
+- tuner type is autodetected
+
+http://www.miro.de/
+http://www.miro.com/
+
+
+Many thanks for the free card which made first NTSC support possible back
+in 1997!
+
+
+Hauppauge Win/TV pci
+--------------------
+
+There are many different versions of the Hauppauge cards with different
+tuners (TV+Radio ...), teletext decoders.
+Note that even cards with same model numbers have (depending on the revision)
+different chips on it.
+
+- Bt848 (and others but always in 2 crystal operation???)
+ newer cards have a Bt878
+- PAL, SECAM, NTSC or tuner with or without Radio support
+
+e.g.:
+ PAL:
+ TDA5737: VHF, hyperband and UHF mixer/oscillator for TV and VCR 3-band tuners
+ TSA5522: 1.4 GHz I2C-bus controlled synthesizer, I2C 0xc2-0xc3
+
+ NTSC:
+ TDA5731: VHF, hyperband and UHF mixer/oscillator for TV and VCR 3-band tuners
+ TSA5518: no datasheet available on Philips site
+- Philips SAA5246 or SAA5284 ( or no) Teletext decoder chip
+ with buffer RAM (e.g. Winbond W24257AS-35: 32Kx8 CMOS static RAM)
+ SAA5246 (I2C 0x22) is supported
+- 256 bytes EEPROM: Microchip 24LC02B or Philips 8582E2Y
+ with configuration information
+ I2C address 0xa0 (24LC02B also responds to 0xa2-0xaf)
+- 1 tuner, 1 composite and (depending on model) 1 S-VHS input
+- 14052B: mux for selection of sound source
+- sound decoder: TDA9800, MSP34xx (stereo cards)
+
+
+Askey CPH-Series
+----------------
+Developed by TelSignal(?), OEMed by many vendors (Typhoon, Anubis, Dynalink)
+
+ Card series:
+ CPH01x: BT848 capture only
+ CPH03x: BT848
+ CPH05x: BT878 with FM
+ CPH06x: BT878 (w/o FM)
+ CPH07x: BT878 capture only
+
+ TV standards:
+ CPH0x0: NTSC-M/M
+ CPH0x1: PAL-B/G
+ CPH0x2: PAL-I/I
+ CPH0x3: PAL-D/K
+ CPH0x4: SECAM-L/L
+ CPH0x5: SECAM-B/G
+ CPH0x6: SECAM-D/K
+ CPH0x7: PAL-N/N
+ CPH0x8: PAL-B/H
+ CPH0x9: PAL-M/M
+
+ CPH03x was often sold as "TV capturer".
+
+ Identifying:
+ 1) 878 cards can be identified by PCI Subsystem-ID:
+ 144f:3000 = CPH06x
+ 144F:3002 = CPH05x w/ FM
+ 144F:3005 = CPH06x_LC (w/o remote control)
+ 1) The cards have a sticker with "CPH"-model on the back.
+ 2) These cards have a number printed on the PCB just above the tuner metal box:
+ "80-CP2000300-x" = CPH03X
+ "80-CP2000500-x" = CPH05X
+ "80-CP2000600-x" = CPH06X / CPH06x_LC
+
+ Askey sells these cards as "Magic TView series", Brand "MagicXpress".
+ Other OEM often call these "Tview", "TView99" or else.
+
+Lifeview Flyvideo Series:
+-------------------------
+ The naming of these series differs in time and space.
+
+ Identifying:
+ 1) Some models can be identified by PCI subsystem ID:
+ 1852:1852 = Flyvideo 98 FM
+ 1851:1850 = Flyvideo 98
+ 1851:1851 = Flyvideo 98 EZ (capture only)
+ 2) There is a print on the PCB:
+ LR25 = Flyvideo (Zoran ZR36120, SAA7110A)
+ LR26 Rev.N = Flyvideo II (Bt848)
+ Rev.O = Flyvideo II (Bt878)
+ LR37 Rev.C = Flyvideo EZ (Capture only, ZR36120 + SAA7110)
+ LR38 Rev.A1= Flyvideo II EZ (Bt848 capture only)
+ LR50 Rev.Q = Flyvideo 98 (w/eeprom and PCI subsystem ID)
+ Rev.W = Flyvideo 98 (no eeprom)
+ LR51 Rev.E = Flyvideo 98 EZ (capture only)
+ LR90 = Flyvideo 2000 (Bt878)
+ Flyvideo 2000S (Bt878) w/Stereo TV (Package incl. LR91 daughterboard)
+ LR91 = Stereo daughter card for LR90
+ LR97 = Flyvideo DVBS
+ LR99 Rev.E = Low profile card for OEM integration (only internal audio!) bt878
+ LR136 = Flyvideo 2100/3100 (Low profile, SAA7130/SAA7134)
+ LR137 = Flyvideo DV2000/DV3000 (SAA7130/SAA7134 + IEEE1394)
+ LR138 Rev.C= Flyvideo 2000 (SAA7130)
+ or Flyvideo 3000 (SAA7134) w/Stereo TV
+ These exist in variations w/FM and w/Remote sometimes denoted
+ by suffixes "FM" and "R".
+ 3) You have a laptop (miniPCI card):
+ Product = FlyTV Platinum Mini
+ Model/Chip = LR212/saa7135
+
+ Lifeview.com.tw states (Feb. 2002):
+ "The FlyVideo2000 and FlyVideo2000s product name have renamed to FlyVideo98."
+ Their Bt8x8 cards are listed as discontinued.
+ Flyvideo 2000S was probably sold as Flyvideo 3000 in some contries(Europe?).
+ The new Flyvideo 2000/3000 are SAA7130/SAA7134 based.
+
+ "Flyvideo II" had been the name for the 848 cards, nowadays (in Germany)
+ this name is re-used for LR50 Rev.W.
+ The Lifeview website mentioned Flyvideo III at some time, but such a card
+ has not yet been seen (perhaps it was the german name for LR90 [stereo]).
+ These cards are sold by many OEMs too.
+
+ FlyVideo A2 (Elta 8680)= LR90 Rev.F (w/Remote, w/o FM, stereo TV by tda9821) {Germany}
+ Lifeview 3000 (Elta 8681) as sold by Plus(April 2002), Germany = LR138 w/ saa7134
+
+
+Typhoon TV card series:
+-----------------------
+ These can be CPH, Flyvideo, Pixelview or KNC1 series.
+ Typhoon is the brand of Anubis.
+ Model 50680 got re-used, some model no. had different contents over time.
+
+ Models:
+ 50680 "TV Tuner PCI Pal BG"(old,red package)=can be CPH03x(bt848) or CPH06x(bt878)
+ 50680 "TV Tuner Pal BG" (blue package)= Pixelview PV-BT878P+ (Rev 9B)
+ 50681 "TV Tuner PCI Pal I" (variant of 50680)
+ 50682 "TView TV/FM Tuner Pal BG" = Flyvideo 98FM (LR50 Rev.Q)
+ Note: The package has a picture of CPH05x (which would be a real TView)
+ 50683 "TV Tuner PCI SECAM" (variant of 50680)
+ 50684 "TV Tuner Pal BG" = Pixelview 878TV(Rev.3D)
+ 50686 "TV Tuner" = KNC1 TV Station
+ 50687 "TV Tuner stereo" = KNC1 TV Station pro
+ 50688 "TV Tuner RDS" (black package) = KNC1 TV Station RDS
+ 50689 TV SAT DVB-S CARD CI PCI (SAA7146AH, SU1278?) = "KNC1 TV Station DVB-S"
+ 50692 "TV/FM Tuner" (small PCB)
+ 50694 TV TUNER CARD RDS (PHILIPS CHIPSET SAA7134HL)
+ 50696 TV TUNER STEREO (PHILIPS CHIPSET SAA7134HL, MK3ME Tuner)
+ 50804 PC-SAT TV/Audio Karte = Techni-PC-Sat (ZORAN 36120PQC, Tuner:Alps)
+ 50866 TVIEW SAT RECEIVER+ADR
+ 50868 "TV/FM Tuner Pal I" (variant of 50682)
+ 50999 "TV/FM Tuner Secam" (variant of 50682)
+
+
+Guillemot
+---------
+ Maxi-TV PCI (ZR36120)
+ Maxi TV Video 2 = LR50 Rev.Q (FI1216MF, PAL BG+SECAM)
+ Maxi TV Video 3 = CPH064 (PAL BG + SECAM)
+
+Mentor
+------
+ Mentor TV card ("55-878TV-U1") = Pixelview 878TV(Rev.3F) (w/FM w/Remote)
+
+Prolink
+-------
+ TV cards:
+ PixelView Play TV pro - (Model: PV-BT878P+ REV 8E)
+ PixelView Play TV pro - (Model: PV-BT878P+ REV 9D)
+ PixelView Play TV pro - (Model: PV-BT878P+ REV 4C / 8D / 10A )
+ PixelView Play TV - (Model: PV-BT848P+)
+ 878TV - (Model: PV-BT878TV)
+
+ Multimedia TV packages (card + software pack):
+ PixelView Play TV Theater - (Model: PV-M4200) = PixelView Play TV pro + Software
+ PixelView Play TV PAK - (Model: PV-BT878P+ REV 4E)
+ PixelView Play TV/VCR - (Model: PV-M3200 REV 4C / 8D / 10A )
+ PixelView Studio PAK - (Model: M2200 REV 4C / 8D / 10A )
+ PixelView PowerStudio PAK - (Model: PV-M3600 REV 4E)
+ PixelView DigitalVCR PAK - (Model: PV-M2400 REV 4C / 8D / 10A )
+
+ PixelView PlayTV PAK II (TV/FM card + usb camera) PV-M3800
+ PixelView PlayTV XP PV-M4700,PV-M4700(w/FM)
+ PixelView PlayTV DVR PV-M4600 package contents:PixelView PlayTV pro, windvr & videoMail s/w
+
+ Further Cards:
+ PV-BT878P+rev.9B (Play TV Pro, opt. w/FM w/NICAM)
+ PV-BT878P+rev.2F
+ PV-BT878P Rev.1D (bt878, capture only)
+
+ XCapture PV-CX881P (cx23881)
+ PlayTV HD PV-CX881PL+, PV-CX881PL+(w/FM) (cx23881)
+
+ DTV3000 PV-DTV3000P+ DVB-S CI = Twinhan VP-1030
+ DTV2000 DVB-S = Twinhan VP-1020
+
+ Video Conferencing:
+ PixelView Meeting PAK - (Model: PV-BT878P)
+ PixelView Meeting PAK Lite - (Model: PV-BT878P)
+ PixelView Meeting PAK plus - (Model: PV-BT878P+rev 4C/8D/10A)
+ PixelView Capture - (Model: PV-BT848P)
+
+ PixelView PlayTV USB pro
+ Model No. PV-NT1004+, PV-NT1004+ (w/FM) = NT1004 USB decoder chip + SAA7113 video decoder chip
+
+Dynalink
+--------
+ These are CPH series.
+
+Phoebemicro
+-----------
+ TV Master = CPH030 or CPH060
+ TV Master FM = CPH050
+
+Genius/Kye
+----------
+ Video Wonder/Genius Internet Video Kit = LR37 Rev.C
+ Video Wonder Pro II (848 or 878) = LR26
+
+Tekram
+------
+ VideoCap C205 (Bt848)
+ VideoCap C210 (zr36120 +Philips)
+ CaptureTV M200 (ISA)
+ CaptureTV M205 (Bt848)
+
+Lucky Star
+----------
+ Image World Conference TV = LR50 Rev. Q
+
+Leadtek
+-------
+ WinView 601 (Bt848)
+ WinView 610 (Zoran)
+ WinFast2000
+ WinFast2000 XP
+
+KNC One
+-------
+ TV-Station
+ TV-Station SE (+Software Bundle)
+ TV-Station pro (+TV stereo)
+ TV-Station FM (+Radio)
+ TV-Station RDS (+RDS)
+ TV Station SAT (analog satellite)
+ TV-Station DVB-S
+
+ newer Cards have saa7134, but model name stayed the same?
+
+Provideo
+--------
+ PV951 or PV-951 (also are sold as:
+ Boeder TV-FM Video Capture Card
+ Titanmedia Supervision TV-2400
+ Provideo PV951 TF
+ 3DeMon PV951
+ MediaForte TV-Vision PV951
+ Yoko PV951
+ Vivanco Tuner Card PCI Art.-Nr.: 68404
+ ) now named PV-951T
+
+ Surveillance Series
+ PV-141
+ PV-143
+ PV-147
+ PV-148 (capture only)
+ PV-150
+ PV-151
+
+ TV-FM Tuner Series
+ PV-951TDV (tv tuner + 1394)
+ PV-951T/TF
+ PV-951PT/TF
+ PV-956T/TF Low Profile
+ PV-911
+
+Highscreen
+----------
+ TV Karte = LR50 Rev.S
+ TV-Boostar = Terratec Terra TV+ Version 1.0 (Bt848, tda9821) "ceb105.pcb"
+
+Zoltrix
+-------
+ Face to Face Capture (Bt848 capture only) (PCB "VP-2848")
+ Face To Face TV MAX (Bt848) (PCB "VP-8482 Rev1.3")
+ Genie TV (Bt878) (PCB "VP-8790 Rev 2.1")
+ Genie Wonder Pro
+
+AVerMedia
+---------
+ AVer FunTV Lite (ISA, AV3001 chipset) "M101.C"
+ AVerTV
+ AVerTV Stereo
+ AVerTV Studio (w/FM)
+ AVerMedia TV98 with Remote
+ AVerMedia TV/FM98 Stereo
+ AVerMedia TVCAM98
+ TVCapture (Bt848)
+ TVPhone (Bt848)
+ TVCapture98 (="AVerMedia TV98" in USA) (Bt878)
+ TVPhone98 (Bt878, w/FM)
+
+ PCB PCI-ID Model-Name Eeprom Tuner Sound Country
+ --------------------------------------------------------------------
+ M101.C ISA !
+ M108-B Bt848 -- FR1236 US (2),(3)
+ M1A8-A Bt848 AVer TV-Phone FM1216 --
+ M168-T 1461:0003 AVerTV Studio 48:17 FM1216 TDA9840T D (1) w/FM w/Remote
+ M168-U 1461:0004 TVCapture98 40:11 FI1216 -- D w/Remote
+ M168II-B 1461:0003 Medion MD9592 48:16 FM1216 TDA9873H D w/FM
+
+ (1) Daughterboard MB68-A with TDA9820T and TDA9840T
+ (2) Sony NE41S soldered (stereo sound?)
+ (3) Daughterboard M118-A w/ pic 16c54 and 4 MHz quartz
+
+ US site has different drivers for (as of 09/2002):
+ EZ Capture/InterCam PCI (BT-848 chip)
+ EZ Capture/InterCam PCI (BT-878 chip)
+ TV-Phone (BT-848 chip)
+ TV98 (BT-848 chip)
+ TV98 With Remote (BT-848 chip)
+ TV98 (BT-878 chip)
+ TV98 With Remote (BT-878)
+ TV/FM98 (BT-878 chip)
+ AVerTV
+ AverTV Stereo
+ AVerTV Studio
+
+ DE hat diverse Treiber fuer diese Modelle (Stand 09/2002):
+ TVPhone (848) mit Philips tuner FR12X6 (w/ FM radio)
+ TVPhone (848) mit Philips tuner FM12X6 (w/ FM radio)
+ TVCapture (848) w/Philips tuner FI12X6
+ TVCapture (848) non-Philips tuner
+ TVCapture98 (Bt878)
+ TVPhone98 (Bt878)
+ AVerTV und TVCapture98 w/VCR (Bt 878)
+ AVerTVStudio und TVPhone98 w/VCR (Bt878)
+ AVerTV GO Serie (Kein SVideo Input)
+ AVerTV98 (BT-878 chip)
+ AVerTV98 mit Fernbedienung (BT-878 chip)
+ AVerTV/FM98 (BT-878 chip)
+
+ VDOmate (www.averm.com.cn) = M168U ?
+
+Aimslab
+-------
+ Video Highway or "Video Highway TR200" (ISA)
+ Video Highway Xtreme (aka "VHX") (Bt848, FM w/ TEA5757)
+
+IXMicro (former: IMS=Integrated Micro Solutions)
+-------
+ IXTV BT848 (=TurboTV)
+ IXTV BT878
+ IMS TurboTV (Bt848)
+
+Lifetec/Medion/Tevion/Aldi
+--------------------------
+ LT9306/MD9306 = CPH061
+ LT9415/MD9415 = LR90 Rev.F or Rev.G
+ MD9592 = Avermedia TVphone98 (PCI_ID=1461:0003), PCB-Rev=M168II-B (w/TDA9873H)
+ MD9717 = KNC One (Rev D4, saa7134, FM1216 MK2 tuner)
+ MD5044 = KNC One (Rev D4, saa7134, FM1216ME MK3 tuner)
+
+Modular Technologies (www.modulartech.com) UK
+---------------------------------------------
+ MM100 PCTV (Bt848)
+ MM201 PCTV (Bt878, Bt832) w/ Quartzsight camera
+ MM202 PCTV (Bt878, Bt832, tda9874)
+ MM205 PCTV (Bt878)
+ MM210 PCTV (Bt878) (Galaxy TV, Galaxymedia ?)
+
+Terratec
+--------
+ Terra TV+ Version 1.0 (Bt848), "ceb105.PCB" printed on the PCB, TDA9821
+ Terra TV+ Version 1.1 (Bt878), "LR74 Rev.E" printed on the PCB, TDA9821
+ Terra TValueRadio, "LR102 Rev.C" printed on the PCB
+ Terra TV/Radio+ Version 1.0, "80-CP2830100-0" TTTV3 printed on the PCB,
+ "CPH010-E83" on the back, SAA6588T, TDA9873H
+ Terra TValue Version BT878, "80-CP2830110-0 TTTV4" printed on the PCB,
+ "CPH011-D83" on back
+ Terra TValue Version 1.0 "ceb105.PCB" (really identical to Terra TV+ Version 1.0)
+ Terra TValue New Revision "LR102 Rec.C"
+ Terra Active Radio Upgrade (tea5757h, saa6588t)
+
+ LR74 is a newer PCB revision of ceb105 (both incl. connector for Active Radio Upgrade)
+
+ Cinergy 400 (saa7134), "E877 11(S)", "PM820092D" printed on PCB
+ Cinergy 600 (saa7134)
+
+Technisat
+---------
+ Discos ADR PC-Karte ISA (no TV!)
+ Discos ADR PC-Karte PCI (probably no TV?)
+ Techni-PC-Sat (Sat. analog)
+ Rev 1.2 (zr36120, vpx3220, stv0030, saa5246, BSJE3-494A)
+ Mediafocus I (zr36120/zr36125, drp3510, Sat. analog + ADR Radio)
+ Mediafocus II (saa7146, Sat. analog)
+ SatADR Rev 2.1 (saa7146a, saa7113h, stv0056a, msp3400c, drp3510a, BSKE3-307A)
+ SkyStar 1 DVB (AV7110) = Technotrend Premium
+ SkyStar 2 DVB (B2C2) (=Sky2PC)
+
+Siemens
+-------
+ Multimedia eXtension Board (MXB) (SAA7146, SAA7111)
+
+Stradis
+-------
+ SDM275,SDM250,SDM026,SDM025 (SAA7146, IBMMPEG2): MPEG2 decoder only
+
+Powercolor
+----------
+ MTV878
+ Package comes with different contents:
+ a) pcb "MTV878" (CARD=75)
+ b) Pixelview Rev. 4_
+ MTV878R w/Remote Control
+ MTV878F w/Remote Control w/FM radio
+
+Pinnacle
+--------
+ Mirovideo PCTV (Bt848)
+ Mirovideo PCTV SE (Bt848)
+ Mirovideo PCTV Pro (Bt848 + Daughterboard for TV Stereo and FM)
+ Studio PCTV Rave (Bt848 Version = Mirovideo PCTV)
+ Studio PCTV Rave (Bt878 package w/o infrared)
+ Studio PCTV (Bt878)
+ Studio PCTV Pro (Bt878 stereo w/ FM)
+ Pinnacle PCTV (Bt878, MT2032)
+ Pinnacle PCTV Pro (Bt878, MT2032)
+ Pinncale PCTV Sat (bt878a, HM1821/1221) ["Conexant CX24110 with CX24108 tuner, aka HM1221/HM1811"]
+ Pinnacle PCTV Sat XE
+
+ M(J)PEG capture and playback:
+ DC1+ (ISA)
+ DC10 (zr36057, zr36060, saa7110, adv7176)
+ DC10+ (zr36067, zr36060, saa7110, adv7176)
+ DC20 (ql16x24b,zr36050, zr36016, saa7110, saa7187 ...)
+ DC30 (zr36057, zr36050, zr36016, vpx3220, adv7176, ad1843, tea6415, miro FST97A1)
+ DC30+ (zr36067, zr36050, zr36016, vpx3220, adv7176)
+ DC50 (zr36067, zr36050, zr36016, saa7112, adv7176 (2 pcs.?), ad1843, miro FST97A1, Lattice ???)
+
+Lenco
+-----
+ MXR-9565 (=Technisat Mediafocus?)
+ MXR-9571 (Bt848) (=CPH031?)
+ MXR-9575
+ MXR-9577 (Bt878) (=Prolink 878TV Rev.3x)
+ MXTV-9578CP (Bt878) (= Prolink PV-BT878P+4E)
+
+Iomega
+------
+ Buz (zr36067, zr36060, saa7111, saa7185)
+
+LML
+---
+ LML33 (zr36067, zr36060, bt819, bt856)
+
+Grandtec
+--------
+ Grand Video Capture (Bt848)
+ Multi Capture Card (Bt878)
+
+Koutech
+-------
+ KW-606 (Bt848)
+ KW-607 (Bt848 capture only)
+ KW-606RSF
+ KW-607A (capture only)
+ KW-608 (Zoran capture only)
+
+IODATA (jp)
+------
+ GV-BCTV/PCI
+ GV-BCTV2/PCI
+ GV-BCTV3/PCI
+ GV-BCTV4/PCI
+ GV-VCP/PCI (capture only)
+ GV-VCP2/PCI (capture only)
+
+Canopus (jp)
+-------
+ WinDVR = Kworld "KW-TVL878RF"
+
+www.sigmacom.co.kr
+------------------
+ Sigma Cyber TV II
+
+www.sasem.co.kr
+---------------
+ Litte OnAir TV
+
+hama
+----
+ TV/Radio-Tuner Card, PCI (Model 44677) = CPH051
+
+Sigma Designs
+-------------
+ Hollywood plus (em8300, em9010, adv7175), (PCB "M340-10") MPEG DVD decoder
+
+Formac
+------
+ iProTV (Card for iMac Mezzanine slot, Bt848+SCSI)
+ ProTV (Bt848)
+ ProTV II = ProTV Stereo (Bt878) ["stereo" means FM stereo, tv is still mono]
+
+ATI
+---
+ TV-Wonder
+ TV-Wonder VE
+
+Diamond Multimedia
+------------------
+ DTV2000 (Bt848, tda9875)
+
+Aopen
+-----
+ VA1000 Plus (w/ Stereo)
+ VA1000 Lite
+ VA1000 (=LR90)
+
+Intel
+-----
+ Smart Video Recorder (ISA full-length)
+ Smart Video Recorder pro (ISA half-length)
+ Smart Video Recorder III (Bt848)
+
+STB
+---
+ STB Gateway 6000704 (bt878)
+ STB Gateway 6000699 (bt848)
+ STB Gateway 6000402 (bt848)
+ STB TV130 PCI
+
+Videologic
+----------
+ Captivator Pro/TV (ISA?)
+ Captivator PCI/VC (Bt848 bundled with camera) (capture only)
+
+Technotrend
+------------
+ TT-SAT PCI (PCB "Sat-PCI Rev.:1.3.1"; zr36125, vpx3225d, stc0056a, Tuner:BSKE6-155A
+ TT-DVB-Sat
+ revisions 1.1, 1.3, 1.5, 1.6 and 2.1
+ This card is sold as OEM from:
+ Siemens DVB-s Card
+ Hauppauge WinTV DVB-S
+ Technisat SkyStar 1 DVB
+ Galaxis DVB Sat
+ Now this card is called TT-PCline Premium Family
+ TT-Budget (saa7146, bsru6-701a)
+ This card is sold as OEM from:
+ Hauppauge WinTV Nova
+ Satelco Standard PCI (DVB-S)
+ TT-DVB-C PCI
+
+Teles
+-----
+ DVB-s (Rev. 2.2, BSRV2-301A, data only?)
+
+Remote Vision
+-------------
+ MX RV605 (Bt848 capture only)
+
+Boeder
+------
+ PC ChatCam (Model 68252) (Bt848 capture only)
+ Tv/Fm Capture Card (Model 68404) = PV951
+
+Media-Surfer (esc-kathrein.de)
+-------------------------------
+ Sat-Surfer (ISA)
+ Sat-Surfer PCI = Techni-PC-Sat
+ Cable-Surfer 1
+ Cable-Surfer 2
+ Cable-Surfer PCI (zr36120)
+ Audio-Surfer (ISA Radio card)
+
+Jetway (www.jetway.com.tw)
+--------------------------
+ JW-TV 878M
+ JW-TV 878 = KWorld KW-TV878RF
+
+Galaxis
+-------
+ Galaxis DVB Card S CI
+ Galaxis DVB Card C CI
+ Galaxis DVB Card S
+ Galaxis DVB Card C
+ Galaxis plug.in S [neuer Name: Galaxis DVB Card S CI
+
+Hauppauge
+---------
+ many many WinTV models ...
+ WinTV DVBs = Technotrend Premium 1.3
+ WinTV NOVA = Technotrend Budget 1.1 "S-DVB DATA"
+ WinTV NOVA-CI "SDVBACI"
+ WinTV Nova USB (=Technotrend USB 1.0)
+ WinTV-Nexus-s (=Technotrend Premium 2.1 or 2.2)
+ WinTV PVR
+ WinTV PVR 250
+ WinTV PVR 450
+
+ US models
+ 990 WinTV-PVR-350 (249USD) (iTVC15 chipset + radio)
+ 980 WinTV-PVR-250 (149USD) (iTVC15 chipset)
+ 880 WinTV-PVR-PCI (199USD) (KFIR chipset + bt878)
+ 881 WinTV-PVR-USB
+ 190 WinTV-GO
+ 191 WinTV-GO-FM
+ 404 WinTV
+ 401 WinTV-radio
+ 495 WinTV-Theater
+ 602 WinTV-USB
+ 621 WinTV-USB-FM
+ 600 USB-Live
+ 698 WinTV-HD
+ 697 WinTV-D
+ 564 WinTV-Nexus-S
+
+ Deutsche Modelle
+ 603 WinTV GO
+ 719 WinTV Primio-FM
+ 718 WinTV PCI-FM
+ 497 WinTV Theater
+ 569 WinTV USB
+ 568 WinTV USB-FM
+ 882 WinTV PVR
+ 981 WinTV PVR 250
+ 891 WinTV-PVR-USB
+ 541 WinTV Nova
+ 488 WinTV Nova-Ci
+ 564 WinTV-Nexus-s
+ 727 WinTV-DVB-c
+ 545 Common Interface
+ 898 WinTV-Nova-USB
+
+ UK models
+ 607 WinTV Go
+ 693,793 WinTV Primio FM
+ 647,747 WinTV PCI FM
+ 498 WinTV Theater
+ 883 WinTV PVR
+ 893 WinTV PVR USB (Duplicate entry)
+ 566 WinTV USB (UK)
+ 573 WinTV USB FM
+ 429 Impact VCB (bt848)
+ 600 USB Live (Video-In 1x Comp, 1xSVHS)
+ 542 WinTV Nova
+ 717 WinTV DVB-S
+ 909 Nova-t PCI
+ 893 Nova-t USB (Duplicate entry)
+ 802 MyTV
+ 804 MyView
+ 809 MyVideo
+ 872 MyTV2Go FM
+
+
+ 546 WinTV Nova-S CI
+ 543 WinTV Nova
+ 907 Nova-S USB
+ 908 Nova-T USB
+ 717 WinTV Nexus-S
+ 157 DEC3000-s Standalone + USB
+
+ Spain
+ 685 WinTV-Go
+ 690 WinTV-PrimioFM
+ 416 WinTV-PCI Nicam Estereo
+ 677 WinTV-PCI-FM
+ 699 WinTV-Theater
+ 683 WinTV-USB
+ 678 WinTV-USB-FM
+ 983 WinTV-PVR-250
+ 883 WinTV-PVR-PCI
+ 993 WinTV-PVR-350
+ 893 WinTV-PVR-USB
+ 728 WinTV-DVB-C PCI
+ 832 MyTV2Go
+ 869 MyTV2Go-FM
+ 805 MyVideo (USB)
+
+
+Matrix-Vision
+-------------
+ MATRIX-Vision MV-Delta
+ MATRIX-Vision MV-Delta 2
+ MVsigma-SLC (Bt848)
+
+Conceptronic (.net)
+------------
+ TVCON FM, TV card w/ FM = CPH05x
+ TVCON = CPH06x
+
+BestData
+--------
+ HCC100 = VCC100rev1 + camera
+ VCC100 rev1 (bt848)
+ VCC100 rev2 (bt878)
+
+Gallant (www.gallantcom.com) www.minton.com.tw
+-----------------------------------------------
+ Intervision IV-510 (capture only bt8x8)
+ Intervision IV-550 (bt8x8)
+ Intervision IV-100 (zoran)
+ Intervision IV-1000 (bt8x8)
+
+Asonic (www.asonic.com.cn) (website down)
+-----------------------------------------
+ SkyEye tv 878
+
+Hoontech
+--------
+ 878TV/FM
+
+Teppro (www.itcteppro.com.tw)
+-----------------------------
+ ITC PCITV (Card Ver 1.0) "Teppro TV1/TVFM1 Card"
+ ITC PCITV (Card Ver 2.0)
+ ITC PCITV (Card Ver 3.0) = "PV-BT878P+ (REV.9D)"
+ ITC PCITV (Card Ver 4.0)
+ TEPPRO IV-550 (For BT848 Main Chip)
+ ITC DSTTV (bt878, satellite)
+ ITC VideoMaker (saa7146, StreamMachine sm2110, tvtuner) "PV-SM2210P+ (REV:1C)"
+
+Kworld (www.kworld.com.tw)
+--------------------------
+ PC TV Station
+ KWORLD KW-TV878R TV (no radio)
+ KWORLD KW-TV878RF TV (w/ radio)
+
+ KWORLD KW-TVL878RF (low profile)
+
+ KWORLD KW-TV713XRF (saa7134)
+
+
+ MPEG TV Station (same cards as above plus WinDVR Software MPEG en/decoder)
+ KWORLD KW-TV878R -Pro TV (no Radio)
+ KWORLD KW-TV878RF-Pro TV (w/ Radio)
+ KWORLD KW-TV878R -Ultra TV (no Radio)
+ KWORLD KW-TV878RF-Ultra TV (w/ Radio)
+
+
+
+JTT/ Justy Corp.http://www.justy.co.jp/ (www.jtt.com.jp website down)
+---------------------------------------------------------------------
+ JTT-02 (JTT TV) "TV watchmate pro" (bt848)
+
+ADS www.adstech.com
+-------------------
+ Channel Surfer TV ( CHX-950 )
+ Channel Surfer TV+FM ( CHX-960FM )
+
+AVEC www.prochips.com
+---------------------
+ AVEC Intercapture (bt848, tea6320)
+
+NoBrand
+-------
+ TV Excel = Australian Name for "PV-BT878P+ 8E" or "878TV Rev.3_"
+
+Mach www.machspeed.com
+----
+ Mach TV 878
+
+Eline www.eline-net.com/
+-----
+ Eline Vision TVMaster / TVMaster FM (ELV-TVM/ ELV-TVM-FM) = LR26 (bt878)
+ Eline Vision TVMaster-2000 (ELV-TVM-2000, ELV-TVM-2000-FM)= LR138 (saa713x)
+
+Spirit http://www.spiritmodems.com.au/
+------
+ Spirit TV Tuner/Video Capture Card (bt848)
+
+Boser www.boser.com.tw
+-----
+ HS-878 Mini PCI Capture Add-on Card
+ HS-879 Mini PCI 3D Audio and Capture Add-on Card (w/ ES1938 Solo-1)
+
+Satelco www.citycom-gmbh.de, www.satelco.de
+-------
+ TV-FM =KNC1 saa7134
+ Standard PCI (DVB-S) = Technotrend Budget
+ Standard PCI (DVB-S) w/ CI
+ Satelco Highend PCI (DVB-S) = Technotrend Premium
+
+
+Sensoray www.sensoray.com
+--------
+ Sensoray 311 (PC/104 bus)
+ Sensoray 611 (PCI)
+
+CEI (Chartered Electronics Industries Pte Ltd [CEI] [FCC ID HBY])
+---
+ TV Tuner - HBY-33A-RAFFLES Brooktree Bt848KPF + Philips
+ TV Tuner MG9910 - HBY33A-TVO CEI + Philips SAA7110 + OKI M548262 + ST STV8438CV
+ Primetime TV (ISA)
+ acquired by Singapore Technologies
+ now operating as Chartered Semiconductor Manufacturing
+ Manufacturer of video cards is listed as:
+ Cogent Electronics Industries [CEI]
+
+AITech
+------
+ Wavewatcher TV (ISA)
+ AITech WaveWatcher TV-PCI = can be LR26 (Bt848) or LR50 (BT878)
+ WaveWatcher TVR-202 TV/FM Radio Card (ISA)
+
+MAXRON
+------
+ Maxron MaxTV/FM Radio (KW-TV878-FNT) = Kworld or JW-TV878-FBK
+
+www.ids-imaging.de
+------------------
+ Falcon Series (capture only)
+ In USA: http://www.theimagingsource.com/
+ DFG/LC1
+
+www.sknet-web.co.jp
+-------------------
+ SKnet Monster TV (saa7134)
+
+A-Max www.amaxhk.com (Colormax, Amax, Napa)
+-------------------
+ APAC Viewcomp 878
+
+Cybertainment
+-------------
+ CyberMail AV Video Email Kit w/ PCI Capture Card (capture only)
+ CyberMail Xtreme
+ These are Flyvideo
+
+VCR (http://www.vcrinc.com/)
+---
+ Video Catcher 16
+
+Twinhan
+-------
+ DST Card/DST-IP (bt878, twinhan asic) VP-1020
+ Sold as:
+ KWorld DVBS Satellite TV-Card
+ Powercolor DSTV Satellite Tuner Card
+ Prolink Pixelview DTV2000
+ Provideo PV-911 Digital Satellite TV Tuner Card With Common Interface ?
+ DST-CI Card (DVB Satellite) VP-1030
+ DCT Card (DVB cable)
+
+MSI
+---
+ MSI TV@nywhere Tuner Card (MS-8876) (CX23881/883) Not Bt878 compatible.
+ MS-8401 DVB-S
+
+Focus www.focusinfo.com
+-----
+ InVideo PCI (bt878)
+
+Sdisilk www.sdisilk.com/
+-------
+ SDI Silk 100
+ SDI Silk 200 SDI Input Card
+
+www.euresys.com
+ PICOLO series
+
+PMC/Pace
+www.pacecom.co.uk website closed
+
+Mercury www.kobian.com (UK and FR)
+ LR50
+ LR138RBG-Rx == LR138
+
+TEC sound (package and manuals don't have any other manufacturer info) TecSound
+ Though educated googling found: www.techmakers.com
+ TV-Mate = Zoltrix VP-8482
+
+Lorenzen www.lorenzen.de
+--------
+ SL DVB-S PCI = Technotrend Budget PCI (su1278 or bsru version)
+
+Origo (.uk) www.origo2000.com
+ PC TV Card = LR50
+
+I/O Magic www.iomagic.com
+---------
+ PC PVR - Desktop TV Personal Video Recorder DR-PCTV100 = Pinnacle ROB2D-51009464 4.0 + Cyberlink PowerVCR II
+
+Arowana
+-------
+ TV-Karte / Poso Power TV (?) = Zoltrix VP-8482 (?)
+
+iTVC15 boards:
+-------------
+kuroutoshikou.com ITVC15
+yuan.com MPG160 PCI TV (Internal PCI MPEG2 encoder card plus TV-tuner)
+
+Asus www.asuscom.com
+ Asus TV Tuner Card 880 NTSC (low profile, cx23880)
+ Asus TV (saa7134)
+
+Hoontech
+--------
+http://www.hoontech.com/korean/download/down_driver_list03.html
+ HART Vision 848 (H-ART Vision 848)
+ HART Vision 878 (H-Art Vision 878)
diff --git a/Documentation/video4linux/bttv/ICs b/Documentation/video4linux/bttv/ICs
new file mode 100644
index 0000000..611315f
--- /dev/null
+++ b/Documentation/video4linux/bttv/ICs
@@ -0,0 +1,37 @@
+all boards:
+
+Brooktree Bt848/848A/849/878/879: video capture chip
+
+
+
+Miro PCTV:
+
+Philips or Temic Tuner
+
+
+
+Hauppauge Win/TV pci (version 405):
+
+Microchip 24LC02B or
+Philips 8582E2Y: 256 Byte EEPROM with configuration information
+ I2C 0xa0-0xa1, (24LC02B also responds to 0xa2-0xaf)
+Philips SAA5246AGP/E: Videotext decoder chip, I2C 0x22-0x23
+TDA9800: sound decoder
+Winbond W24257AS-35: 32Kx8 CMOS static RAM (Videotext buffer mem)
+14052B: analog switch for selection of sound source
+
+PAL:
+TDA5737: VHF, hyperband and UHF mixer/oscillator for TV and VCR 3-band tuners
+TSA5522: 1.4 GHz I2C-bus controlled synthesizer, I2C 0xc2-0xc3
+
+NTSC:
+TDA5731: VHF, hyperband and UHF mixer/oscillator for TV and VCR 3-band tuners
+TSA5518: no datasheet available on Philips site
+
+
+
+STB TV pci:
+
+???
+if you want better support for STB cards send me info!
+Look at the board! What chips are on it?
diff --git a/Documentation/video4linux/bttv/Insmod-options b/Documentation/video4linux/bttv/Insmod-options
new file mode 100644
index 0000000..5ef7578
--- /dev/null
+++ b/Documentation/video4linux/bttv/Insmod-options
@@ -0,0 +1,182 @@
+
+Note: "modinfo <module>" prints various informations about a kernel
+module, among them a complete and up-to-date list of insmod options.
+This list tends to be outdated because it is updated manually ...
+
+==========================================================================
+
+bttv.o
+ the bt848/878 (grabber chip) driver
+
+ insmod args:
+ card=n card type, see CARDLIST for a list.
+ tuner=n tuner type, see CARDLIST for a list.
+ radio=0/1 card supports radio
+ pll=0/1/2 pll settings
+ 0: don't use PLL
+ 1: 28 MHz crystal installed
+ 2: 35 MHz crystal installed
+
+ triton1=0/1 for Triton1 (+others) compatibility
+ vsfx=0/1 yet another chipset bug compatibility bit
+ see README.quirks for details on these two.
+
+ bigendian=n Set the endianness of the gfx framebuffer.
+ Default is native endian.
+ fieldnr=0/1 Count fields. Some TV descrambling software
+ needs this, for others it only generates
+ 50 useless IRQs/sec. default is 0 (off).
+ autoload=0/1 autoload helper modules (tuner, audio).
+ default is 1 (on).
+ bttv_verbose=0/1/2 verbose level (at insmod time, while
+ looking at the hardware). default is 1.
+ bttv_debug=0/1 debug messages (for capture).
+ default is 0 (off).
+ irq_debug=0/1 irq handler debug messages.
+ default is 0 (off).
+ gbuffers=2-32 number of capture buffers for mmap'ed capture.
+ default is 4.
+ gbufsize= size of capture buffers. default and
+ maximum value is 0x208000 (~2MB)
+ no_overlay=0 Enable overlay on broken hardware. There
+ are some chipsets (SIS for example) which
+ are known to have problems with the PCI DMA
+ push used by bttv. bttv will disable overlay
+ by default on this hardware to avoid crashes.
+ With this insmod option you can override this.
+ no_overlay=1 Disable overlay. It should be used by broken
+ hardware that doesn't support PCI2PCI direct
+ transfers.
+ automute=0/1 Automatically mutes the sound if there is
+ no TV signal, on by default. You might try
+ to disable this if you have bad input signal
+ quality which leading to unwanted sound
+ dropouts.
+ chroma_agc=0/1 AGC of chroma signal, off by default.
+ adc_crush=0/1 Luminance ADC crush, on by default.
+ i2c_udelay= Allow reduce I2C speed. Default is 5 usecs
+ (meaning 66,67 Kbps). The default is the
+ maximum supported speed by kernel bitbang
+ algorithm. You may use lower numbers, if I2C
+ messages are lost (16 is known to work on
+ all supported cards).
+
+ bttv_gpio=0/1
+ gpiomask=
+ audioall=
+ audiomux=
+ See Sound-FAQ for a detailed description.
+
+ remap, card, radio and pll accept up to four comma-separated arguments
+ (for multiple boards).
+
+tuner.o
+ The tuner driver. You need this unless you want to use only
+ with a camera or external tuner ...
+
+ insmod args:
+ debug=1 print some debug info to the syslog
+ type=n type of the tuner chip. n as follows:
+ see CARDLIST for a complete list.
+ pal=[bdgil] select PAL variant (used for some tuners
+ only, important for the audio carrier).
+
+tvmixer.o
+ registers a mixer device for the TV card's volume/bass/treble
+ controls (requires a i2c audio control chip like the msp3400).
+
+ insmod args:
+ debug=1 print some debug info to the syslog.
+ devnr=n allocate device #n (0 == /dev/mixer,
+ 1 = /dev/mixer1, ...), default is to
+ use the first free one.
+
+tvaudio.o
+ new, experimental module which is supported to provide a single
+ driver for all simple i2c audio control chips (tda/tea*).
+
+ insmod args:
+ tda8425 = 1 enable/disable the support for the
+ tda9840 = 1 various chips.
+ tda9850 = 1 The tea6300 can't be autodetected and is
+ tda9855 = 1 therefore off by default, if you have
+ tda9873 = 1 this one on your card (STB uses these)
+ tda9874a = 1 you have to enable it explicitly.
+ tea6300 = 0 The two tda985x chips use the same i2c
+ tea6420 = 1 address and can't be disturgished from
+ pic16c54 = 1 each other, you might have to disable
+ the wrong one.
+ debug = 1 print debug messages
+
+ insmod args for tda9874a:
+ tda9874a_SIF=1/2 select sound IF input pin (1 or 2)
+ (default is pin 1)
+ tda9874a_AMSEL=0/1 auto-mute select for NICAM (default=0)
+ Please read note 3 below!
+ tda9874a_STD=n select TV sound standard (0..8):
+ 0 - A2, B/G
+ 1 - A2, M (Korea)
+ 2 - A2, D/K (1)
+ 3 - A2, D/K (2)
+ 4 - A2, D/K (3)
+ 5 - NICAM, I
+ 6 - NICAM, B/G
+ 7 - NICAM, D/K (default)
+ 8 - NICAM, L
+
+ Note 1: tda9874a supports both tda9874h (old) and tda9874a (new) chips.
+ Note 2: tda9874h/a and tda9875 (which is supported separately by
+ tda9875.o) use the same i2c address so both modules should not be
+ used at the same time.
+ Note 3: Using tda9874a_AMSEL option depends on your TV card design!
+ AMSEL=0: auto-mute will switch between NICAM sound
+ and the sound on 1st carrier (i.e. FM mono or AM).
+ AMSEL=1: auto-mute will switch between NICAM sound
+ and the analog mono input (MONOIN pin).
+ If tda9874a decoder on your card has MONOIN pin not connected, then
+ use only tda9874_AMSEL=0 or don't specify this option at all.
+ For example:
+ card=65 (FlyVideo 2000S) - set AMSEL=1 or AMSEL=0
+ card=72 (Prolink PV-BT878P rev.9B) - set AMSEL=0 only
+
+msp3400.o
+ The driver for the msp34xx sound processor chips. If you have a
+ stereo card, you probably want to insmod this one.
+
+ insmod args:
+ debug=1/2 print some debug info to the syslog,
+ 2 is more verbose.
+ simple=1 Use the "short programming" method. Newer
+ msp34xx versions support this. You need this
+ for dbx stereo. Default is on if supported by
+ the chip.
+ once=1 Don't check the TV-stations Audio mode
+ every few seconds, but only once after
+ channel switches.
+ amsound=1 Audio carrier is AM/NICAM at 6.5 Mhz. This
+ should improve things for french people, the
+ carrier autoscan seems to work with FM only...
+
+tea6300.o - OBSOLETE (use tvaudio instead)
+ The driver for the tea6300 fader chip. If you have a stereo
+ card and the msp3400.o doesn't work, you might want to try this
+ one. This chip is seen on most STB TV/FM cards (usually from
+ Gateway OEM sold surplus on auction sites).
+
+ insmod args:
+ debug=1 print some debug info to the syslog.
+
+tda8425.o - OBSOLETE (use tvaudio instead)
+ The driver for the tda8425 fader chip. This driver used to be
+ part of bttv.c, so if your sound used to work but does not
+ anymore, try loading this module.
+
+ insmod args:
+ debug=1 print some debug info to the syslog.
+
+tda985x.o - OBSOLETE (use tvaudio instead)
+ The driver for the tda9850/55 audio chips.
+
+ insmod args:
+ debug=1 print some debug info to the syslog.
+ chip=9850/9855 set the chip type.
diff --git a/Documentation/video4linux/bttv/MAKEDEV b/Documentation/video4linux/bttv/MAKEDEV
new file mode 100644
index 0000000..6c29ba4
--- /dev/null
+++ b/Documentation/video4linux/bttv/MAKEDEV
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+function makedev () {
+
+ for dev in 0 1 2 3; do
+ echo "/dev/$1$dev: char 81 $[ $2 + $dev ]"
+ rm -f /dev/$1$dev
+ mknod /dev/$1$dev c 81 $[ $2 + $dev ]
+ chmod 666 /dev/$1$dev
+ done
+
+ # symlink for default device
+ rm -f /dev/$1
+ ln -s /dev/${1}0 /dev/$1
+}
+
+# see http://roadrunner.swansea.uk.linux.org/v4lapi.shtml
+
+echo "*** new device names ***"
+makedev video 0
+makedev radio 64
+makedev vtx 192
+makedev vbi 224
+
+#echo "*** old device names (for compatibility only) ***"
+#makedev bttv 0
+#makedev bttv-fm 64
+#makedev bttv-vbi 224
diff --git a/Documentation/video4linux/bttv/Modprobe.conf b/Documentation/video4linux/bttv/Modprobe.conf
new file mode 100644
index 0000000..55f1465
--- /dev/null
+++ b/Documentation/video4linux/bttv/Modprobe.conf
@@ -0,0 +1,11 @@
+# i2c
+alias char-major-89 i2c-dev
+options i2c-core i2c_debug=1
+options i2c-algo-bit bit_test=1
+
+# bttv
+alias char-major-81 videodev
+alias char-major-81-0 bttv
+options bttv card=2 radio=1
+options tuner debug=1
+
diff --git a/Documentation/video4linux/bttv/Modules.conf b/Documentation/video4linux/bttv/Modules.conf
new file mode 100644
index 0000000..753f159
--- /dev/null
+++ b/Documentation/video4linux/bttv/Modules.conf
@@ -0,0 +1,14 @@
+# For modern kernels (2.6 or above), this belongs in /etc/modprobe.conf
+# For for 2.4 kernels or earlier, this belongs in /etc/modules.conf.
+
+# i2c
+alias char-major-89 i2c-dev
+options i2c-core i2c_debug=1
+options i2c-algo-bit bit_test=1
+
+# bttv
+alias char-major-81 videodev
+alias char-major-81-0 bttv
+options bttv card=2 radio=1
+options tuner debug=1
+
diff --git a/Documentation/video4linux/bttv/PROBLEMS b/Documentation/video4linux/bttv/PROBLEMS
new file mode 100644
index 0000000..2b8b007
--- /dev/null
+++ b/Documentation/video4linux/bttv/PROBLEMS
@@ -0,0 +1,62 @@
+- Start capturing by pressing "c" or by selecting it via a menu!
+
+- Start capturing by pressing "c" or by selecting it via a menu!!!
+
+- The memory of some S3 cards is not recognized right:
+
+ First of all, if you are not using XFree-3.2 or newer, upgrade AT LEAST to
+ XFree-3.2A! This solved the problem for most people.
+
+ Start up X11 like this: "XF86_S3 -probeonly" and write down where the
+ linear frame buffer is.
+ If it is different to the address found by bttv install bttv like this:
+ "insmod bttv vidmem=0xfb0"
+ if the linear frame buffer is at 0xfb000000 (i.e. omit the last 5 zeros!)
+
+ Some S3 cards even take up 64MB of memory but only report 32MB to the BIOS.
+ If this 64MB area overlaps the IO memory of the Bt848 you also have to
+ remap this. E.g.: insmod bttv vidmem=0xfb0 remap=0xfa0
+
+ If the video memory is found at the right place and there are no address
+ conflicts but still no picture (or the computer even crashes),
+ try disabling features of your PCI chipset in the BIOS setup.
+
+ Frank Kapahnke <frank@kapahnke.prima.ruhr.de> also reported that problems
+ with his S3 868 went away when he upgraded to XFree 3.2.
+
+
+- I still only get a black picture with my S3 card!
+
+ Even with XFree-3.2A some people have problems with their S3 cards
+ (mostly with Trio 64 but also with some others)
+ Get the free demo version of Accelerated X from www.xinside.com and try
+ bttv with it. bttv seems to work with most S3 cards with Accelerated X.
+
+ Since I do not know much (better make that almost nothing) about VGA card
+ programming I do not know the reason for this.
+ Looks like XFree does something different when setting up the video memory?
+ Maybe somebody can enlighten me?
+ Would be nice if somebody could get this to work with XFree since
+ Accelerated X costs more than some of the grabber cards ...
+
+ Better linear frame buffer support for S3 cards will probably be in
+ XFree 4.0.
+
+- Grabbing is not switched off when changing consoles with XFree.
+ That's because XFree and some AcceleratedX versions do not send unmap
+ events.
+
+- Some popup windows (e.g. of the window manager) are not refreshed.
+
+ Disable backing store by starting X with the option "-bs"
+
+- When using 32 bpp in XFree or 24+8bpp mode in AccelX 3.1 the system
+ can sometimes lock up if you use more than 1 bt848 card at the same time.
+ You will always get pixel errors when e.g. using more than 1 card in full
+ screen mode. Maybe we need something faster than the PCI bus ...
+
+
+- Some S3 cards and the Matrox Mystique will produce pixel errors with
+ full resolution in 32-bit mode.
+
+- Some video cards have problems with Accelerated X 4.1
diff --git a/Documentation/video4linux/bttv/README b/Documentation/video4linux/bttv/README
new file mode 100644
index 0000000..7ca2154
--- /dev/null
+++ b/Documentation/video4linux/bttv/README
@@ -0,0 +1,90 @@
+
+Release notes for bttv
+======================
+
+You'll need at least these config options for bttv:
+ CONFIG_I2C=m
+ CONFIG_I2C_ALGOBIT=m
+ CONFIG_VIDEO_DEV=m
+
+The latest bttv version is available from http://bytesex.org/bttv/
+
+
+Make bttv work with your card
+-----------------------------
+
+Just try "modprobe bttv" and see if that works.
+
+If it doesn't bttv likely could not autodetect your card and needs some
+insmod options. The most important insmod option for bttv is "card=n"
+to select the correct card type. If you get video but no sound you've
+very likely specified the wrong (or no) card type. A list of supported
+cards is in CARDLIST.bttv
+
+If bttv takes very long to load (happens sometimes with the cheap
+cards which have no tuner), try adding this to your modules.conf:
+ options i2c-algo-bit bit_test=1
+
+For the WinTV/PVR you need one firmware file from the driver CD:
+hcwamc.rbf. The file is in the pvr45xxx.exe archive (self-extracting
+zip file, unzip can unpack it). Put it into the /etc/pvr directory or
+use the firm_altera=<path> insmod option to point the driver to the
+location of the file.
+
+If your card isn't listed in CARDLIST.bttv or if you have trouble making
+audio work, you should read the Sound-FAQ.
+
+
+Autodetecting cards
+-------------------
+
+bttv uses the PCI Subsystem ID to autodetect the card type. lspci lists
+the Subsystem ID in the second line, looks like this:
+
+00:0a.0 Multimedia video controller: Brooktree Corporation Bt878 (rev 02)
+ Subsystem: Hauppauge computer works Inc. WinTV/GO
+ Flags: bus master, medium devsel, latency 32, IRQ 5
+ Memory at e2000000 (32-bit, prefetchable) [size=4K]
+
+only bt878-based cards can have a subsystem ID (which does not mean
+that every card really has one). bt848 cards can't have a Subsystem
+ID and therefore can't be autodetected. There is a list with the ID's
+in bttv-cards.c (in case you are intrested or want to mail patches
+with updates).
+
+
+Still doesn't work?
+-------------------
+
+I do NOT have a lab with 30+ different grabber boards and a
+PAL/NTSC/SECAM test signal generator at home, so I often can't
+reproduce your problems. This makes debugging very difficult for me.
+If you have some knowledge and spare time, please try to fix this
+yourself (patches very welcome of course...) You know: The linux
+slogan is "Do it yourself".
+
+There is a mailing list: video4linux-list@redhat.com.
+https://listman.redhat.com/mailman/listinfo/video4linux-list
+
+If you have trouble with some specific TV card, try to ask there
+instead of mailing me directly. The chance that someone with the
+same card listens there is much higher...
+
+For problems with sound: There are alot of different systems used
+for TV sound all over the world. And there are also different chips
+which decode the audio signal. Reports about sound problems ("stereo
+does'nt work") are pretty useless unless you include some details
+about your hardware and the TV sound scheme used in your country (or
+at least the country you are living in).
+
+
+Finally: If you mail some patches for bttv around the world (to
+linux-kernel/Alan/Linus/...), please Cc: me.
+
+
+Have fun with bttv,
+
+ Gerd
+
+--
+Gerd Knorr <kraxel@bytesex.org>
diff --git a/Documentation/video4linux/bttv/README.WINVIEW b/Documentation/video4linux/bttv/README.WINVIEW
new file mode 100644
index 0000000..c61cf28
--- /dev/null
+++ b/Documentation/video4linux/bttv/README.WINVIEW
@@ -0,0 +1,33 @@
+
+Support for the Leadtek WinView 601 TV/FM by Jon Tombs <jon@gte.esi.us.es>
+
+This card is basically the same as all the rest (Bt484A, Philips tuner),
+the main difference is that they have attached a programmable attenuator to 3
+GPIO lines in order to give some volume control. They have also stuck an
+infra-red remote control decoded on the board, I will add support for this
+when I get time (it simple generates an interrupt for each key press, with
+the key code is placed in the GPIO port).
+
+I don't yet have any application to test the radio support. The tuner
+frequency setting should work but it is possible that the audio multiplexer
+is wrong. If it doesn't work, send me email.
+
+
+- No Thanks to Leadtek they refused to answer any questions about their
+hardware. The driver was written by visual inspection of the card. If you
+use this driver, send an email insult to them, and tell them you won't
+continue buying their hardware unless they support Linux.
+
+- Little thanks to Princeton Technology Corp (http://www.princeton.com.tw)
+who make the audio attenuator. Their publicly available data-sheet available
+on their web site doesn't include the chip programming information! Hidden
+on their server are the full data-sheets, but don't ask how I found it.
+
+To use the driver I use the following options, the tuner and pll settings might
+be different in your country
+
+insmod videodev
+insmod i2c scan=1 i2c_debug=0 verbose=0
+insmod tuner type=1 debug=0
+insmod bttv pll=1 radio=1 card=17
+
diff --git a/Documentation/video4linux/bttv/README.freeze b/Documentation/video4linux/bttv/README.freeze
new file mode 100644
index 0000000..4259dcc
--- /dev/null
+++ b/Documentation/video4linux/bttv/README.freeze
@@ -0,0 +1,74 @@
+
+If the box freezes hard with bttv ...
+=====================================
+
+It might be a bttv driver bug. It also might be bad hardware. It also
+might be something else ...
+
+Just mailing me "bttv freezes" isn't going to help much. This README
+has a few hints how you can help to pin down the problem.
+
+
+bttv bugs
+---------
+
+If some version works and another doesn't it is likely to be a driver
+bug. It is very helpful if you can tell where exactly it broke
+(i.e. the last working and the first broken version).
+
+With a hard freeze you probably doesn't find anything in the logfiles.
+The only way to capture any kernel messages is to hook up a serial
+console and let some terminal application log the messages. /me uses
+screen. See Documentation/serial-console.txt for details on setting
+up a serial console.
+
+Read Documentation/oops-tracing.txt to learn how to get any useful
+information out of a register+stack dump printed by the kernel on
+protection faults (so-called "kernel oops").
+
+If you run into some kind of deadlock, you can try to dump a call trace
+for each process using sysrq-t (see Documentation/sysrq.txt).
+This way it is possible to figure where *exactly* some process in "D"
+state is stuck.
+
+I've seen reports that bttv 0.7.x crashes whereas 0.8.x works rock solid
+for some people. Thus probably a small buglet left somewhere in bttv
+0.7.x. I have no idea where exactly, it works stable for me and alot of
+other people. But in case you have problems with the 0.7.x versions you
+can give 0.8.x a try ...
+
+
+hardware bugs
+-------------
+
+Some hardware can't deal with PCI-PCI transfers (i.e. grabber => vga).
+Sometimes problems show up with bttv just because of the high load on
+the PCI bus. The bt848/878 chips have a few workarounds for known
+incompatibilities, see README.quirks.
+
+Some folks report that increasing the pci latency helps too,
+althrought I'm not sure whenever this really fixes the problems or
+only makes it less likely to happen. Both bttv and btaudio have a
+insmod option to set the PCI latency of the device.
+
+Some mainboard have problems to deal correctly with multiple devices
+doing DMA at the same time. bttv + ide seems to cause this sometimes,
+if this is the case you likely see freezes only with video and hard disk
+access at the same time. Updating the IDE driver to get the latest and
+greatest workarounds for hardware bugs might fix these problems.
+
+
+other
+-----
+
+If you use some binary-only yunk (like nvidia module) try to reproduce
+the problem without.
+
+IRQ sharing is known to cause problems in some cases. It works just
+fine in theory and many configurations. Neverless it might be worth a
+try to shuffle around the PCI cards to give bttv another IRQ or make
+it share the IRQ with some other piece of hardware. IRQ sharing with
+VGA cards seems to cause trouble sometimes. I've also seen funny
+effects with bttv sharing the IRQ with the ACPI bridge (and
+apci-enabled kernel).
+
diff --git a/Documentation/video4linux/bttv/README.quirks b/Documentation/video4linux/bttv/README.quirks
new file mode 100644
index 0000000..92e0392
--- /dev/null
+++ b/Documentation/video4linux/bttv/README.quirks
@@ -0,0 +1,83 @@
+
+Below is what the bt878 data book says about the PCI bug compatibility
+modes of the bt878 chip.
+
+The triton1 insmod option sets the EN_TBFX bit in the control register.
+The vsfx insmod option does the same for EN_VSFX bit. If you have
+stability problems you can try if one of these options makes your box
+work solid.
+
+drivers/pci/quirks.c knows about these issues, this way these bits are
+enabled automagically for known-buggy chipsets (look at the kernel
+messages, bttv tells you).
+
+HTH,
+
+ Gerd
+
+---------------------------- cut here --------------------------
+
+Normal PCI Mode
+---------------
+
+The PCI REQ signal is the logical-or of the incoming function requests.
+The inter-nal GNT[0:1] signals are gated asynchronously with GNT and
+demultiplexed by the audio request signal. Thus the arbiter defaults to
+the video function at power-up and parks there during no requests for
+bus access. This is desirable since the video will request the bus more
+often. However, the audio will have highest bus access priority. Thus
+the audio will have first access to the bus even when issuing a request
+after the video request but before the PCI external arbiter has granted
+access to the Bt879. Neither function can preempt the other once on the
+bus. The duration to empty the entire video PCI FIFO onto the PCI bus is
+very short compared to the bus access latency the audio PCI FIFO can
+tolerate.
+
+
+430FX Compatibility Mode
+------------------------
+
+When using the 430FX PCI, the following rules will ensure
+compatibility:
+
+ (1) Deassert REQ at the same time as asserting FRAME.
+ (2) Do not reassert REQ to request another bus transaction until after
+ finish-ing the previous transaction.
+
+Since the individual bus masters do not have direct control of REQ, a
+simple logical-or of video and audio requests would violate the rules.
+Thus, both the arbiter and the initiator contain 430FX compatibility
+mode logic. To enable 430FX mode, set the EN_TBFX bit as indicated in
+Device Control Register on page 104.
+
+When EN_TBFX is enabled, the arbiter ensures that the two compatibility
+rules are satisfied. Before GNT is asserted by the PCI arbiter, this
+internal arbiter may still logical-or the two requests. However, once
+the GNT is issued, this arbiter must lock in its decision and now route
+only the granted request to the REQ pin. The arbiter decision lock
+happens regardless of the state of FRAME because it does not know when
+FRAME will be asserted (typically - each initiator will assert FRAME on
+the cycle following GNT). When FRAME is asserted, it is the initiator s
+responsibility to remove its request at the same time. It is the
+arbiters responsibility to allow this request to flow through to REQ and
+not allow the other request to hold REQ asserted. The decision lock may
+be removed at the end of the transaction: for example, when the bus is
+idle (FRAME and IRDY). The arbiter decision may then continue
+asynchronously until GNT is again asserted.
+
+
+Interfacing with Non-PCI 2.1 Compliant Core Logic
+-------------------------------------------------
+
+A small percentage of core logic devices may start a bus transaction
+during the same cycle that GNT is de-asserted. This is non PCI 2.1
+compliant. To ensure compatibility when using PCs with these PCI
+controllers, the EN_VSFX bit must be enabled (refer to Device Control
+Register on page 104). When in this mode, the arbiter does not pass GNT
+to the internal functions unless REQ is asserted. This prevents a bus
+transaction from starting the same cycle as GNT is de-asserted. This
+also has the side effect of not being able to take advantage of bus
+parking, thus lowering arbitration performance. The Bt879 drivers must
+query for these non-compliant devices, and set the EN_VSFX bit only if
+required.
+
diff --git a/Documentation/video4linux/bttv/Sound-FAQ b/Documentation/video4linux/bttv/Sound-FAQ
new file mode 100644
index 0000000..1e6328f
--- /dev/null
+++ b/Documentation/video4linux/bttv/Sound-FAQ
@@ -0,0 +1,148 @@
+
+bttv and sound mini howto
+=========================
+
+There are alot of different bt848/849/878/879 based boards available.
+Making video work often is not a big deal, because this is handled
+completely by the bt8xx chip, which is common on all boards. But
+sound is handled in slightly different ways on each board.
+
+To handle the grabber boards correctly, there is a array tvcards[] in
+bttv-cards.c, which holds the informations required for each board.
+Sound will work only, if the correct entry is used (for video it often
+makes no difference). The bttv driver prints a line to the kernel
+log, telling which card type is used. Like this one:
+
+ bttv0: model: BT848(Hauppauge old) [autodetected]
+
+You should verify this is correct. If it isn't, you have to pass the
+correct board type as insmod argument, "insmod bttv card=2" for
+example. The file CARDLIST has a list of valid arguments for card.
+If your card isn't listed there, you might check the source code for
+new entries which are not listed yet. If there isn't one for your
+card, you can check if one of the existing entries does work for you
+(just trial and error...).
+
+Some boards have an extra processor for sound to do stereo decoding
+and other nice features. The msp34xx chips are used by Hauppauge for
+example. If your board has one, you might have to load a helper
+module like msp3400.o to make sound work. If there isn't one for the
+chip used on your board: Bad luck. Start writing a new one. Well,
+you might want to check the video4linux mailing list archive first...
+
+Of course you need a correctly installed soundcard unless you have the
+speakers connected directly to the grabber board. Hint: check the
+mixer settings too. ALSA for example has everything muted by default.
+
+
+How sound works in detail
+=========================
+
+Still doesn't work? Looks like some driver hacking is required.
+Below is a do-it-yourself description for you.
+
+The bt8xx chips have 32 general purpose pins, and registers to control
+these pins. One register is the output enable register
+(BT848_GPIO_OUT_EN), it says which pins are actively driven by the
+bt848 chip. Another one is the data register (BT848_GPIO_DATA), where
+you can get/set the status if these pins. They can be used for input
+and output.
+
+Most grabber board vendors use these pins to control an external chip
+which does the sound routing. But every board is a little different.
+These pins are also used by some companies to drive remote control
+receiver chips. Some boards use the i2c bus instead of the gpio pins
+to connect the mux chip.
+
+As mentioned above, there is a array which holds the required
+informations for each known board. You basically have to create a new
+line for your board. The important fields are these two:
+
+struct tvcard
+{
+ [ ... ]
+ u32 gpiomask;
+ u32 audiomux[6]; /* Tuner, Radio, external, internal, mute, stereo */
+};
+
+gpiomask specifies which pins are used to control the audio mux chip.
+The corresponding bits in the output enable register
+(BT848_GPIO_OUT_EN) will be set as these pins must be driven by the
+bt848 chip.
+
+The audiomux[] array holds the data values for the different inputs
+(i.e. which pins must be high/low for tuner/mute/...). This will be
+written to the data register (BT848_GPIO_DATA) to switch the audio
+mux.
+
+
+What you have to do is figure out the correct values for gpiomask and
+the audiomux array. If you have Windows and the drivers four your
+card installed, you might to check out if you can read these registers
+values used by the windows driver. A tool to do this is available
+from ftp://telepresence.dmem.strath.ac.uk/pub/bt848/winutil, but it
+does'nt work with bt878 boards according to some reports I received.
+Another one with bt878 suport is available from
+http://btwincap.sourceforge.net/Files/btspy2.00.zip
+
+You might also dig around in the *.ini files of the Windows applications.
+You can have a look at the board to see which of the gpio pins are
+connected at all and then start trial-and-error ...
+
+
+Starting with release 0.7.41 bttv has a number of insmod options to
+make the gpio debugging easier:
+
+bttv_gpio=0/1 enable/disable gpio debug messages
+gpiomask=n set the gpiomask value
+audiomux=i,j,... set the values of the audiomux array
+audioall=a set the values of the audiomux array (one
+ value for all array elements, useful to check
+ out which effect the particular value has).
+
+The messages printed with bttv_gpio=1 look like this:
+
+ bttv0: gpio: en=00000027, out=00000024 in=00ffffd8 [audio: off]
+
+en = output _en_able register (BT848_GPIO_OUT_EN)
+out = _out_put bits of the data register (BT848_GPIO_DATA),
+ i.e. BT848_GPIO_DATA & BT848_GPIO_OUT_EN
+in = _in_put bits of the data register,
+ i.e. BT848_GPIO_DATA & ~BT848_GPIO_OUT_EN
+
+
+
+Other elements of the tvcards array
+===================================
+
+If you are trying to make a new card work you might find it useful to
+know what the other elements in the tvcards array are good for:
+
+video_inputs - # of video inputs the card has
+audio_inputs - historical cruft, not used any more.
+tuner - which input is the tuner
+svhs - which input is svhs (all others are labeled composite)
+muxsel - video mux, input->registervalue mapping
+pll - same as pll= insmod option
+tuner_type - same as tuner= insmod option
+*_modulename - hint whenever some card needs this or that audio
+ module loaded to work properly.
+has_radio - whenever this TV card has a radio tuner.
+no_msp34xx - "1" disables loading of msp3400.o module
+no_tda9875 - "1" disables loading of tda9875.o module
+needs_tvaudio - set to "1" to load tvaudio.o module
+
+If some config item is specified both from the tvcards array and as
+insmod option, the insmod option takes precedence.
+
+
+
+Good luck,
+
+ Gerd
+
+
+PS: If you have a new working entry, mail it to me.
+
+--
+Gerd Knorr <kraxel@bytesex.org>
diff --git a/Documentation/video4linux/bttv/Specs b/Documentation/video4linux/bttv/Specs
new file mode 100644
index 0000000..79b9e57
--- /dev/null
+++ b/Documentation/video4linux/bttv/Specs
@@ -0,0 +1,3 @@
+Philips http://www.Semiconductors.COM/pip/
+Conexant http://www.conexant.com/techinfo/default.asp
+Micronas http://www.micronas.de/pages/product_documentation/index.html
diff --git a/Documentation/video4linux/bttv/THANKS b/Documentation/video4linux/bttv/THANKS
new file mode 100644
index 0000000..950aa78
--- /dev/null
+++ b/Documentation/video4linux/bttv/THANKS
@@ -0,0 +1,24 @@
+Many thanks to:
+
+- Markus Schroeder <schroedm@uni-duesseldorf.de> for information on the Bt848
+ and tuner programming and his control program xtvc.
+
+- Martin Buck <martin-2.buck@student.uni-ulm.de> for his great Videotext
+ package.
+
+- Gerd Knorr <kraxel@cs.tu-berlin.de> for the MSP3400 support and the modular
+ I2C, tuner, ... support.
+
+
+- MATRIX Vision for giving us 2 cards for free, which made support of
+ single crystal operation possible.
+
+- MIRO for providing a free PCTV card and detailed information about the
+ components on their cards. (E.g. how the tuner type is detected)
+ Without their card I could not have debugged the NTSC mode.
+
+- Hauppauge for telling how the sound input is selected and what components
+ they do and will use on their radio cards.
+ Also many thanks for faxing me the FM1216 data sheet.
+
+
diff --git a/Documentation/video4linux/bttv/Tuners b/Documentation/video4linux/bttv/Tuners
new file mode 100644
index 0000000..0a371d3
--- /dev/null
+++ b/Documentation/video4linux/bttv/Tuners
@@ -0,0 +1,115 @@
+1) Tuner Programming
+====================
+There are some flavors of Tuner programming APIs.
+These differ mainly by the bandswitch byte.
+
+ L= LG_API (VHF_LO=0x01, VHF_HI=0x02, UHF=0x08, radio=0x04)
+ P= PHILIPS_API (VHF_LO=0xA0, VHF_HI=0x90, UHF=0x30, radio=0x04)
+ T= TEMIC_API (VHF_LO=0x02, VHF_HI=0x04, UHF=0x01)
+ A= ALPS_API (VHF_LO=0x14, VHF_HI=0x12, UHF=0x11)
+ M= PHILIPS_MK3 (VHF_LO=0x01, VHF_HI=0x02, UHF=0x04, radio=0x19)
+
+2) Tuner Manufacturers
+======================
+
+SAMSUNG Tuner identification: (e.g. TCPM9091PD27)
+ TCP [ABCJLMNQ] 90[89][125] [DP] [ACD] 27 [ABCD]
+ [ABCJLMNQ]:
+ A= BG+DK
+ B= BG
+ C= I+DK
+ J= NTSC-Japan
+ L= Secam LL
+ M= BG+I+DK
+ N= NTSC
+ Q= BG+I+DK+LL
+ [89]: ?
+ [125]:
+ 2: No FM
+ 5: With FM
+ [DP]:
+ D= NTSC
+ P= PAL
+ [ACD]:
+ A= F-connector
+ C= Phono connector
+ D= Din Jack
+ [ABCD]:
+ 3-wire/I2C tuning, 2-band/3-band
+
+ These Tuners are PHILIPS_API compatible.
+
+Philips Tuner identification: (e.g. FM1216MF)
+ F[IRMQ]12[1345]6{MF|ME|MP}
+ F[IRMQ]:
+ FI12x6: Tuner Series
+ FR12x6: Tuner + Radio IF
+ FM12x6: Tuner + FM
+ FQ12x6: special
+ FMR12x6: special
+ TD15xx: Digital Tuner ATSC
+ 12[1345]6:
+ 1216: PAL BG
+ 1236: NTSC
+ 1246: PAL I
+ 1256: Pal DK
+ {MF|ME|MP}
+ MF: BG LL w/ Secam (Multi France)
+ ME: BG DK I LL (Multi Europe)
+ MP: BG DK I (Multi PAL)
+ MR: BG DK M (?)
+ MG: BG DKI M (?)
+ MK2 series PHILIPS_API, most tuners are compatible to this one !
+ MK3 series introduced in 2002 w/ PHILIPS_MK3_API
+
+Temic Tuner identification: (.e.g 4006FH5)
+ 4[01][0136][269]F[HYNR]5
+ 40x2: Tuner (5V/33V), TEMIC_API.
+ 40x6: Tuner 5V
+ 41xx: Tuner compact
+ 40x9: Tuner+FM compact
+ [0136]
+ xx0x: PAL BG
+ xx1x: Pal DK, Secam LL
+ xx3x: NTSC
+ xx6x: PAL I
+ F[HYNR]5
+ FH5: Pal BG
+ FY5: others
+ FN5: multistandard
+ FR5: w/ FM radio
+ 3X xxxx: order number with specific connector
+ Note: Only 40x2 series has TEMIC_API, all newer tuners have PHILIPS_API.
+
+LG Innotek Tuner:
+ TPI8NSR11 : NTSC J/M (TPI8NSR01 w/FM) (P,210/497)
+ TPI8PSB11 : PAL B/G (TPI8PSB01 w/FM) (P,170/450)
+ TAPC-I701 : PAL I (TAPC-I001 w/FM) (P,170/450)
+ TPI8PSB12 : PAL D/K+B/G (TPI8PSB02 w/FM) (P,170/450)
+ TAPC-H701P: NTSC_JP (TAPC-H001P w/FM) (L,170/450)
+ TAPC-G701P: PAL B/G (TAPC-G001P w/FM) (L,170/450)
+ TAPC-W701P: PAL I (TAPC-W001P w/FM) (L,170/450)
+ TAPC-Q703P: PAL D/K (TAPC-Q001P w/FM) (L,170/450)
+ TAPC-Q704P: PAL D/K+I (L,170/450)
+ TAPC-G702P: PAL D/K+B/G (L,170/450)
+
+ TADC-H002F: NTSC (L,175/410?; 2-B, C-W+11, W+12-69)
+ TADC-M201D: PAL D/K+B/G+I (L,143/425) (sound control at I2C address 0xc8)
+ TADC-T003F: NTSC Taiwan (L,175/410?; 2-B, C-W+11, W+12-69)
+ Suffix:
+ P= Standard phono female socket
+ D= IEC female socket
+ F= F-connector
+
+Other Tuners:
+TCL2002MB-1 : PAL BG + DK =TUNER_LG_PAL_NEW_TAPC
+TCL2002MB-1F: PAL BG + DK w/FM =PHILIPS_PAL
+TCL2002MI-2 : PAL I = ??
+
+ALPS Tuners:
+ Most are LG_API compatible
+ TSCH6 has ALPS_API (TSCH5 ?)
+ TSBE1 has extra API 05,02,08 Control_byte=0xCB Source:(1)
+
+Lit.
+(1) conexant100029b-PCI-Decoder-ApplicationNote.pdf
diff --git a/Documentation/video4linux/cafe_ccic b/Documentation/video4linux/cafe_ccic
new file mode 100644
index 0000000..8882102
--- /dev/null
+++ b/Documentation/video4linux/cafe_ccic
@@ -0,0 +1,54 @@
+"cafe_ccic" is a driver for the Marvell 88ALP01 "cafe" CMOS camera
+controller. This is the controller found in first-generation OLPC systems,
+and this driver was written with support from the OLPC project.
+
+Current status: the core driver works. It can generate data in YUV422,
+RGB565, and RGB444 formats. (Anybody looking at the code will see RGB32 as
+well, but that is a debugging aid which will be removed shortly). VGA and
+QVGA modes work; CIF is there but the colors remain funky. Only the OV7670
+sensor is known to work with this controller at this time.
+
+To try it out: either of these commands will work:
+
+ mplayer tv:// -tv driver=v4l2:width=640:height=480 -nosound
+ mplayer tv:// -tv driver=v4l2:width=640:height=480:outfmt=bgr16 -nosound
+
+The "xawtv" utility also works; gqcam does not, for unknown reasons.
+
+There are a few load-time options, most of which can be changed after
+loading via sysfs as well:
+
+ - alloc_bufs_at_load: Normally, the driver will not allocate any DMA
+ buffers until the time comes to transfer data. If this option is set,
+ then worst-case-sized buffers will be allocated at module load time.
+ This option nails down the memory for the life of the module, but
+ perhaps decreases the chances of an allocation failure later on.
+
+ - dma_buf_size: The size of DMA buffers to allocate. Note that this
+ option is only consulted for load-time allocation; when buffers are
+ allocated at run time, they will be sized appropriately for the current
+ camera settings.
+
+ - n_dma_bufs: The controller can cycle through either two or three DMA
+ buffers. Normally, the driver tries to use three buffers; on faster
+ systems, however, it will work well with only two.
+
+ - min_buffers: The minimum number of streaming I/O buffers that the driver
+ will consent to work with. Default is one, but, on slower systems,
+ better behavior with mplayer can be achieved by setting to a higher
+ value (like six).
+
+ - max_buffers: The maximum number of streaming I/O buffers; default is
+ ten. That number was carefully picked out of a hat and should not be
+ assumed to actually mean much of anything.
+
+ - flip: If this boolean parameter is set, the sensor will be instructed to
+ invert the video image. Whether it makes sense is determined by how
+ your particular camera is mounted.
+
+Work is ongoing with this driver, stay tuned.
+
+jon
+
+Jonathan Corbet
+corbet@lwn.net
diff --git a/Documentation/video4linux/cpia2_overview.txt b/Documentation/video4linux/cpia2_overview.txt
new file mode 100644
index 0000000..a6e5366
--- /dev/null
+++ b/Documentation/video4linux/cpia2_overview.txt
@@ -0,0 +1,38 @@
+ Programmer's View of Cpia2
+
+Cpia2 is the second generation video coprocessor from VLSI Vision Ltd (now a
+division of ST Microelectronics). There are two versions. The first is the
+STV0672, which is capable of up to 30 frames per second (fps) in frame sizes
+up to CIF, and 15 fps for VGA frames. The STV0676 is an improved version,
+which can handle up to 30 fps VGA. Both coprocessors can be attached to two
+CMOS sensors - the vvl6410 CIF sensor and the vvl6500 VGA sensor. These will
+be referred to as the 410 and the 500 sensors, or the CIF and VGA sensors.
+
+The two chipsets operate almost identically. The core is an 8051 processor,
+running two different versions of firmware. The 672 runs the VP4 video
+processor code, the 676 runs VP5. There are a few differences in register
+mappings for the two chips. In these cases, the symbols defined in the
+header files are marked with VP4 or VP5 as part of the symbol name.
+
+The cameras appear externally as three sets of registers. Setting register
+values is the only way to control the camera. Some settings are
+interdependant, such as the sequence required to power up the camera. I will
+try to make note of all of these cases.
+
+The register sets are called blocks. Block 0 is the system block. This
+section is always powered on when the camera is plugged in. It contains
+registers that control housekeeping functions such as powering up the video
+processor. The video processor is the VP block. These registers control
+how the video from the sensor is processed. Examples are timing registers,
+user mode (vga, qvga), scaling, cropping, framerates, and so on. The last
+block is the video compressor (VC). The video stream sent from the camera is
+compressed as Motion JPEG (JPEGA). The VC controls all of the compression
+parameters. Looking at the file cpia2_registers.h, you can get a full view
+of these registers and the possible values for most of them.
+
+One or more registers can be set or read by sending a usb control message to
+the camera. There are three modes for this. Block mode requests a number
+of contiguous registers. Random mode reads or writes random registers with
+a tuple structure containing address/value pairs. The repeat mode is only
+used by VP4 to load a firmware patch. It contains a starting address and
+a sequence of bytes to be written into a gpio port. \ No newline at end of file
diff --git a/Documentation/video4linux/cx18.txt b/Documentation/video4linux/cx18.txt
new file mode 100644
index 0000000..914cb7e
--- /dev/null
+++ b/Documentation/video4linux/cx18.txt
@@ -0,0 +1,30 @@
+Some notes regarding the cx18 driver for the Conexant CX23418 MPEG
+encoder chip:
+
+1) Currently supported are:
+
+ - Hauppauge HVR-1600
+ - Compro VideoMate H900
+ - Yuan MPC718
+ - Conexant Raptor PAL/SECAM devkit
+
+2) Some people have problems getting the i2c bus to work.
+ The symptom is that the eeprom cannot be read and the card is
+ unusable. This is probably fixed, but if you have problems
+ then post to the video4linux or ivtv-users mailinglist.
+
+3) VBI (raw or sliced) has not yet been implemented.
+
+4) MPEG indexing is not yet implemented.
+
+5) The driver is still a bit rough around the edges, this should
+ improve over time.
+
+
+Firmware:
+
+You can obtain the firmware files here:
+
+http://dl.ivtvdriver.org/ivtv/firmware/cx18-firmware.tar.gz
+
+Untar and copy the .fw files to your firmware directory.
diff --git a/Documentation/video4linux/cx2341x/README.hm12 b/Documentation/video4linux/cx2341x/README.hm12
new file mode 100644
index 0000000..0e213ed
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/README.hm12
@@ -0,0 +1,116 @@
+The cx23416 can produce (and the cx23415 can also read) raw YUV output. The
+format of a YUV frame is specific to this chip and is called HM12. 'HM' stands
+for 'Hauppauge Macroblock', which is a misnomer as 'Conexant Macroblock' would
+be more accurate.
+
+The format is YUV 4:2:0 which uses 1 Y byte per pixel and 1 U and V byte per
+four pixels.
+
+The data is encoded as two macroblock planes, the first containing the Y
+values, the second containing UV macroblocks.
+
+The Y plane is divided into blocks of 16x16 pixels from left to right
+and from top to bottom. Each block is transmitted in turn, line-by-line.
+
+So the first 16 bytes are the first line of the top-left block, the
+second 16 bytes are the second line of the top-left block, etc. After
+transmitting this block the first line of the block on the right to the
+first block is transmitted, etc.
+
+The UV plane is divided into blocks of 16x8 UV values going from left
+to right, top to bottom. Each block is transmitted in turn, line-by-line.
+
+So the first 16 bytes are the first line of the top-left block and
+contain 8 UV value pairs (16 bytes in total). The second 16 bytes are the
+second line of 8 UV pairs of the top-left block, etc. After transmitting
+this block the first line of the block on the right to the first block is
+transmitted, etc.
+
+The code below is given as an example on how to convert HM12 to separate
+Y, U and V planes. This code assumes frames of 720x576 (PAL) pixels.
+
+The width of a frame is always 720 pixels, regardless of the actual specified
+width.
+
+--------------------------------------------------------------------------
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static unsigned char frame[576*720*3/2];
+static unsigned char framey[576*720];
+static unsigned char frameu[576*720 / 4];
+static unsigned char framev[576*720 / 4];
+
+static void de_macro_y(unsigned char* dst, unsigned char *src, int dstride, int w, int h)
+{
+ unsigned int y, x, i;
+
+ // descramble Y plane
+ // dstride = 720 = w
+ // The Y plane is divided into blocks of 16x16 pixels
+ // Each block in transmitted in turn, line-by-line.
+ for (y = 0; y < h; y += 16) {
+ for (x = 0; x < w; x += 16) {
+ for (i = 0; i < 16; i++) {
+ memcpy(dst + x + (y + i) * dstride, src, 16);
+ src += 16;
+ }
+ }
+ }
+}
+
+static void de_macro_uv(unsigned char *dstu, unsigned char *dstv, unsigned char *src, int dstride, int w, int h)
+{
+ unsigned int y, x, i;
+
+ // descramble U/V plane
+ // dstride = 720 / 2 = w
+ // The U/V values are interlaced (UVUV...).
+ // Again, the UV plane is divided into blocks of 16x16 UV values.
+ // Each block in transmitted in turn, line-by-line.
+ for (y = 0; y < h; y += 16) {
+ for (x = 0; x < w; x += 8) {
+ for (i = 0; i < 16; i++) {
+ int idx = x + (y + i) * dstride;
+
+ dstu[idx+0] = src[0]; dstv[idx+0] = src[1];
+ dstu[idx+1] = src[2]; dstv[idx+1] = src[3];
+ dstu[idx+2] = src[4]; dstv[idx+2] = src[5];
+ dstu[idx+3] = src[6]; dstv[idx+3] = src[7];
+ dstu[idx+4] = src[8]; dstv[idx+4] = src[9];
+ dstu[idx+5] = src[10]; dstv[idx+5] = src[11];
+ dstu[idx+6] = src[12]; dstv[idx+6] = src[13];
+ dstu[idx+7] = src[14]; dstv[idx+7] = src[15];
+ src += 16;
+ }
+ }
+ }
+}
+
+/*************************************************************************/
+int main(int argc, char **argv)
+{
+ FILE *fin;
+ int i;
+
+ if (argc == 1) fin = stdin;
+ else fin = fopen(argv[1], "r");
+
+ if (fin == NULL) {
+ fprintf(stderr, "cannot open input\n");
+ exit(-1);
+ }
+ while (fread(frame, sizeof(frame), 1, fin) == 1) {
+ de_macro_y(framey, frame, 720, 720, 576);
+ de_macro_uv(frameu, framev, frame + 720 * 576, 720 / 2, 720 / 2, 576 / 2);
+ fwrite(framey, sizeof(framey), 1, stdout);
+ fwrite(framev, sizeof(framev), 1, stdout);
+ fwrite(frameu, sizeof(frameu), 1, stdout);
+ }
+ fclose(fin);
+ return 0;
+}
+
+--------------------------------------------------------------------------
diff --git a/Documentation/video4linux/cx2341x/README.vbi b/Documentation/video4linux/cx2341x/README.vbi
new file mode 100644
index 0000000..5807cf1
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/README.vbi
@@ -0,0 +1,45 @@
+
+Format of embedded V4L2_MPEG_STREAM_VBI_FMT_IVTV VBI data
+=========================================================
+
+This document describes the V4L2_MPEG_STREAM_VBI_FMT_IVTV format of the VBI data
+embedded in an MPEG-2 program stream. This format is in part dictated by some
+hardware limitations of the ivtv driver (the driver for the Conexant cx23415/6
+chips), in particular a maximum size for the VBI data. Anything longer is cut
+off when the MPEG stream is played back through the cx23415.
+
+The advantage of this format is it is very compact and that all VBI data for
+all lines can be stored while still fitting within the maximum allowed size.
+
+The stream ID of the VBI data is 0xBD. The maximum size of the embedded data is
+4 + 43 * 36, which is 4 bytes for a header and 2 * 18 VBI lines with a 1 byte
+header and a 42 bytes payload each. Anything beyond this limit is cut off by
+the cx23415/6 firmware. Besides the data for the VBI lines we also need 36 bits
+for a bitmask determining which lines are captured and 4 bytes for a magic cookie,
+signifying that this data package contains V4L2_MPEG_STREAM_VBI_FMT_IVTV VBI data.
+If all lines are used, then there is no longer room for the bitmask. To solve this
+two different magic numbers were introduced:
+
+'itv0': After this magic number two unsigned longs follow. Bits 0-17 of the first
+unsigned long denote which lines of the first field are captured. Bits 18-31 of
+the first unsigned long and bits 0-3 of the second unsigned long are used for the
+second field.
+
+'ITV0': This magic number assumes all VBI lines are captured, i.e. it implicitly
+implies that the bitmasks are 0xffffffff and 0xf.
+
+After these magic cookies (and the 8 byte bitmask in case of cookie 'itv0') the
+captured VBI lines start:
+
+For each line the least significant 4 bits of the first byte contain the data type.
+Possible values are shown in the table below. The payload is in the following 42
+bytes.
+
+Here is the list of possible data types:
+
+#define IVTV_SLICED_TYPE_TELETEXT 0x1 // Teletext (uses lines 6-22 for PAL)
+#define IVTV_SLICED_TYPE_CC 0x4 // Closed Captions (line 21 NTSC)
+#define IVTV_SLICED_TYPE_WSS 0x5 // Wide Screen Signal (line 23 PAL)
+#define IVTV_SLICED_TYPE_VPS 0x7 // Video Programming System (PAL) (line 16)
+
+Hans Verkuil <hverkuil@xs4all.nl>
diff --git a/Documentation/video4linux/cx2341x/fw-calling.txt b/Documentation/video4linux/cx2341x/fw-calling.txt
new file mode 100644
index 0000000..8d21181
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/fw-calling.txt
@@ -0,0 +1,69 @@
+This page describes how to make calls to the firmware api.
+
+How to call
+===========
+
+The preferred calling convention is known as the firmware mailbox. The
+mailboxes are basically a fixed length array that serves as the call-stack.
+
+Firmware mailboxes can be located by searching the encoder and decoder memory
+for a 16 byte signature. That signature will be located on a 256-byte boundary.
+
+Signature:
+0x78, 0x56, 0x34, 0x12, 0x12, 0x78, 0x56, 0x34,
+0x34, 0x12, 0x78, 0x56, 0x56, 0x34, 0x12, 0x78
+
+The firmware implements 20 mailboxes of 20 32-bit words. The first 10 are
+reserved for API calls. The second 10 are used by the firmware for event
+notification.
+
+ Index Name
+ ----- ----
+ 0 Flags
+ 1 Command
+ 2 Return value
+ 3 Timeout
+ 4-19 Parameter/Result
+
+
+The flags are defined in the following table. The direction is from the
+perspective of the firmware.
+
+ Bit Direction Purpose
+ --- --------- -------
+ 2 O Firmware has processed the command.
+ 1 I Driver has finished setting the parameters.
+ 0 I Driver is using this mailbox.
+
+
+The command is a 32-bit enumerator. The API specifics may be found in the
+fw-*-api.txt documents.
+
+The return value is a 32-bit enumerator. Only two values are currently defined:
+0=success and -1=command undefined.
+
+There are 16 parameters/results 32-bit fields. The driver populates these fields
+with values for all the parameters required by the call. The driver overwrites
+these fields with result values returned by the call. The API specifics may be
+found in the fw-*-api.txt documents.
+
+The timeout value protects the card from a hung driver thread. If the driver
+doesn't handle the completed call within the timeout specified, the firmware
+will reset that mailbox.
+
+To make an API call, the driver iterates over each mailbox looking for the
+first one available (bit 0 has been cleared). The driver sets that bit, fills
+in the command enumerator, the timeout value and any required parameters. The
+driver then sets the parameter ready bit (bit 1). The firmware scans the
+mailboxes for pending commands, processes them, sets the result code, populates
+the result value array with that call's return values and sets the call
+complete bit (bit 2). Once bit 2 is set, the driver should retrieve the results
+and clear all the flags. If the driver does not perform this task within the
+time set in the timeout register, the firmware will reset that mailbox.
+
+Event notifications are sent from the firmware to the host. The host tells the
+firmware which events it is interested in via an API call. That call tells the
+firmware which notification mailbox to use. The firmware signals the host via
+an interrupt. Only the 16 Results fields are used, the Flags, Command, Return
+value and Timeout words are not used.
+
diff --git a/Documentation/video4linux/cx2341x/fw-decoder-api.txt b/Documentation/video4linux/cx2341x/fw-decoder-api.txt
new file mode 100644
index 0000000..8c317b7
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/fw-decoder-api.txt
@@ -0,0 +1,297 @@
+Decoder firmware API description
+================================
+
+Note: this API is part of the decoder firmware, so it's cx23415 only.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_PING_FW
+Enum 0/0x00
+Description
+ This API call does nothing. It may be used to check if the firmware
+ is responding.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_START_PLAYBACK
+Enum 1/0x01
+Description
+ Begin or resume playback.
+Param[0]
+ 0 based frame number in GOP to begin playback from.
+Param[1]
+ Specifies the number of muted audio frames to play before normal
+ audio resumes. (This is not implemented in the firmware, leave at 0)
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_STOP_PLAYBACK
+Enum 2/0x02
+Description
+ Ends playback and clears all decoder buffers. If PTS is not zero,
+ playback stops at specified PTS.
+Param[0]
+ Display 0=last frame, 1=black
+ Note: this takes effect immediately, so if you want to wait for a PTS,
+ then use '0', otherwise the screen goes to black at once.
+ You can call this later (even if there is no playback) with a 1 value
+ to set the screen to black.
+Param[1]
+ PTS low
+Param[2]
+ PTS high
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_PLAYBACK_SPEED
+Enum 3/0x03
+Description
+ Playback stream at speed other than normal. There are two modes of
+ operation:
+ Smooth: host transfers entire stream and firmware drops unused
+ frames.
+ Coarse: host drops frames based on indexing as required to achieve
+ desired speed.
+Param[0]
+ Bitmap:
+ 0:7 0 normal
+ 1 fast only "1.5 times"
+ n nX fast, 1/nX slow
+ 30 Framedrop:
+ '0' during 1.5 times play, every other B frame is dropped
+ '1' during 1.5 times play, stream is unchanged (bitrate
+ must not exceed 8mbps)
+ 31 Speed:
+ '0' slow
+ '1' fast
+ Note: n is limited to 2. Anything higher does not result in
+ faster playback. Instead the host should start dropping frames.
+Param[1]
+ Direction: 0=forward, 1=reverse
+ Note: to make reverse playback work you have to write full GOPs in
+ reverse order.
+Param[2]
+ Picture mask:
+ 1=I frames
+ 3=I, P frames
+ 7=I, P, B frames
+Param[3]
+ B frames per GOP (for reverse play only)
+ Note: for reverse playback the Picture Mask should be set to I or I, P.
+ Adding B frames to the mask will result in corrupt video. This field
+ has to be set to the correct value in order to keep the timing correct.
+Param[4]
+ Mute audio: 0=disable, 1=enable
+Param[5]
+ Display 0=frame, 1=field
+Param[6]
+ Specifies the number of muted audio frames to play before normal audio
+ resumes. (Not implemented in the firmware, leave at 0)
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_STEP_VIDEO
+Enum 5/0x05
+Description
+ Each call to this API steps the playback to the next unit defined below
+ in the current playback direction.
+Param[0]
+ 0=frame, 1=top field, 2=bottom field
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_DMA_BLOCK_SIZE
+Enum 8/0x08
+Description
+ Set DMA transfer block size. Counterpart to API 0xC9
+Param[0]
+ DMA transfer block size in bytes. A different size may be specified
+ when issuing the DMA transfer command.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_GET_XFER_INFO
+Enum 9/0x09
+Description
+ This API call may be used to detect an end of stream condition.
+Result[0]
+ Stream type
+Result[1]
+ Address offset
+Result[2]
+ Maximum bytes to transfer
+Result[3]
+ Buffer fullness
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_GET_DMA_STATUS
+Enum 10/0x0A
+Description
+ Status of the last DMA transfer
+Result[0]
+ Bit 1 set means transfer complete
+ Bit 2 set means DMA error
+ Bit 3 set means linked list error
+Result[1]
+ DMA type: 0=MPEG, 1=OSD, 2=YUV
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SCHED_DMA_FROM_HOST
+Enum 11/0x0B
+Description
+ Setup DMA from host operation. Counterpart to API 0xCC
+Param[0]
+ Memory address of link list
+Param[1]
+ Total # of bytes to transfer
+Param[2]
+ DMA type (0=MPEG, 1=OSD, 2=YUV)
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_PAUSE_PLAYBACK
+Enum 13/0x0D
+Description
+ Freeze playback immediately. In this mode, when internal buffers are
+ full, no more data will be accepted and data request IRQs will be
+ masked.
+Param[0]
+ Display: 0=last frame, 1=black
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_HALT_FW
+Enum 14/0x0E
+Description
+ The firmware is halted and no further API calls are serviced until
+ the firmware is uploaded again.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_STANDARD
+Enum 16/0x10
+Description
+ Selects display standard
+Param[0]
+ 0=NTSC, 1=PAL
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_GET_VERSION
+Enum 17/0x11
+Description
+ Returns decoder firmware version information
+Result[0]
+ Version bitmask:
+ Bits 0:15 build
+ Bits 16:23 minor
+ Bits 24:31 major
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_STREAM_INPUT
+Enum 20/0x14
+Description
+ Select decoder stream input port
+Param[0]
+ 0=memory (default), 1=streaming
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_GET_TIMING_INFO
+Enum 21/0x15
+Description
+ Returns timing information from start of playback
+Result[0]
+ Frame count by decode order
+Result[1]
+ Video PTS bits 0:31 by display order
+Result[2]
+ Video PTS bit 32 by display order
+Result[3]
+ SCR bits 0:31 by display order
+Result[4]
+ SCR bit 32 by display order
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_AUDIO_MODE
+Enum 22/0x16
+Description
+ Select audio mode
+Param[0]
+ Dual mono mode action
+ 0=Stereo, 1=Left, 2=Right, 3=Mono, 4=Swap, -1=Unchanged
+Param[1]
+ Stereo mode action:
+ 0=Stereo, 1=Left, 2=Right, 3=Mono, 4=Swap, -1=Unchanged
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_EVENT_NOTIFICATION
+Enum 23/0x17
+Description
+ Setup firmware to notify the host about a particular event.
+ Counterpart to API 0xD5
+Param[0]
+ Event: 0=Audio mode change between mono, (joint) stereo and dual channel.
+ Event: 3=Decoder started
+ Event: 4=Unknown: goes off 10-15 times per second while decoding.
+ Event: 5=Some sync event: goes off once per frame.
+Param[1]
+ Notification 0=disabled, 1=enabled
+Param[2]
+ Interrupt bit
+Param[3]
+ Mailbox slot, -1 if no mailbox required.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_DISPLAY_BUFFERS
+Enum 24/0x18
+Description
+ Number of display buffers. To decode all frames in reverse playback you
+ must use nine buffers.
+Param[0]
+ 0=six buffers, 1=nine buffers
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_EXTRACT_VBI
+Enum 25/0x19
+Description
+ Extracts VBI data
+Param[0]
+ 0=extract from extension & user data, 1=extract from private packets
+Result[0]
+ VBI table location
+Result[1]
+ VBI table size
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_DECODER_SOURCE
+Enum 26/0x1A
+Description
+ Selects decoder source. Ensure that the parameters passed to this
+ API match the encoder settings.
+Param[0]
+ Mode: 0=MPEG from host, 1=YUV from encoder, 2=YUV from host
+Param[1]
+ YUV picture width
+Param[2]
+ YUV picture height
+Param[3]
+ Bitmap: see Param[0] of API 0xBD
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_DEC_SET_PREBUFFERING
+Enum 30/0x1E
+Description
+ Decoder prebuffering, when enabled up to 128KB are buffered for
+ streams <8mpbs or 640KB for streams >8mbps
+Param[0]
+ 0=off, 1=on
diff --git a/Documentation/video4linux/cx2341x/fw-decoder-regs.txt b/Documentation/video4linux/cx2341x/fw-decoder-regs.txt
new file mode 100644
index 0000000..cf52c8f
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/fw-decoder-regs.txt
@@ -0,0 +1,817 @@
+PVR350 Video decoder registers 0x02002800 -> 0x02002B00
+=======================================================
+
+This list has been worked out through trial and error. There will be mistakes
+and omissions. Some registers have no obvious effect so it's hard to say what
+they do, while others interact with each other, or require a certain load
+sequence. Horizontal filter setup is one example, with six registers working
+in unison and requiring a certain load sequence to correctly configure. The
+indexed colour palette is much easier to set at just two registers, but again
+it requires a certain load sequence.
+
+Some registers are fussy about what they are set to. Load in a bad value & the
+decoder will fail. A firmware reload will often recover, but sometimes a reset
+is required. For registers containing size information, setting them to 0 is
+generally a bad idea. For other control registers i.e. 2878, you'll only find
+out what values are bad when it hangs.
+
+--------------------------------------------------------------------------------
+2800
+ bit 0
+ Decoder enable
+ 0 = disable
+ 1 = enable
+--------------------------------------------------------------------------------
+2804
+ bits 0:31
+ Decoder horizontal Y alias register 1
+---------------
+2808
+ bits 0:31
+ Decoder horizontal Y alias register 2
+---------------
+280C
+ bits 0:31
+ Decoder horizontal Y alias register 3
+---------------
+2810
+ bits 0:31
+ Decoder horizontal Y alias register 4
+---------------
+2814
+ bits 0:31
+ Decoder horizontal Y alias register 5
+---------------
+2818
+ bits 0:31
+ Decoder horizontal Y alias trigger
+
+ These six registers control the horizontal aliasing filter for the Y plane.
+ The first five registers must all be loaded before accessing the trigger
+ (2818), as this register actually clocks the data through for the first
+ five.
+
+ To correctly program set the filter, this whole procedure must be done 16
+ times. The actual register contents are copied from a lookup-table in the
+ firmware which contains 4 different filter settings.
+
+--------------------------------------------------------------------------------
+281C
+ bits 0:31
+ Decoder horizontal UV alias register 1
+---------------
+2820
+ bits 0:31
+ Decoder horizontal UV alias register 2
+---------------
+2824
+ bits 0:31
+ Decoder horizontal UV alias register 3
+---------------
+2828
+ bits 0:31
+ Decoder horizontal UV alias register 4
+---------------
+282C
+ bits 0:31
+ Decoder horizontal UV alias register 5
+---------------
+2830
+ bits 0:31
+ Decoder horizontal UV alias trigger
+
+ These six registers control the horizontal aliasing for the UV plane.
+ Operation is the same as the Y filter, with 2830 being the trigger
+ register.
+
+--------------------------------------------------------------------------------
+2834
+ bits 0:15
+ Decoder Y source width in pixels
+
+ bits 16:31
+ Decoder Y destination width in pixels
+---------------
+2838
+ bits 0:15
+ Decoder UV source width in pixels
+
+ bits 16:31
+ Decoder UV destination width in pixels
+
+ NOTE: For both registers, the resulting image must be fully visible on
+ screen. If the image exceeds the right edge both the source and destination
+ size must be adjusted to reflect the visible portion. For the source width,
+ you must take into account the scaling when calculating the new value.
+--------------------------------------------------------------------------------
+
+283C
+ bits 0:31
+ Decoder Y horizontal scaling
+ Normally = Reg 2854 >> 2
+---------------
+2840
+ bits 0:31
+ Decoder ?? unknown - horizontal scaling
+ Usually 0x00080514
+---------------
+2844
+ bits 0:31
+ Decoder UV horizontal scaling
+ Normally = Reg 2854 >> 2
+---------------
+2848
+ bits 0:31
+ Decoder ?? unknown - horizontal scaling
+ Usually 0x00100514
+---------------
+284C
+ bits 0:31
+ Decoder ?? unknown - Y plane
+ Usually 0x00200020
+---------------
+2850
+ bits 0:31
+ Decoder ?? unknown - UV plane
+ Usually 0x00200020
+---------------
+2854
+ bits 0:31
+ Decoder 'master' value for horizontal scaling
+---------------
+2858
+ bits 0:31
+ Decoder ?? unknown
+ Usually 0
+---------------
+285C
+ bits 0:31
+ Decoder ?? unknown
+ Normally = Reg 2854 >> 1
+---------------
+2860
+ bits 0:31
+ Decoder ?? unknown
+ Usually 0
+---------------
+2864
+ bits 0:31
+ Decoder ?? unknown
+ Normally = Reg 2854 >> 1
+---------------
+2868
+ bits 0:31
+ Decoder ?? unknown
+ Usually 0
+
+ Most of these registers either control horizontal scaling, or appear linked
+ to it in some way. Register 2854 contains the 'master' value & the other
+ registers can be calculated from that one. You must also remember to
+ correctly set the divider in Reg 2874.
+
+ To enlarge:
+ Reg 2854 = (source_width * 0x00200000) / destination_width
+ Reg 2874 = No divide
+
+ To reduce from full size down to half size:
+ Reg 2854 = (source_width/2 * 0x00200000) / destination width
+ Reg 2874 = Divide by 2
+
+ To reduce from half size down to quarter size:
+ Reg 2854 = (source_width/4 * 0x00200000) / destination width
+ Reg 2874 = Divide by 4
+
+ The result is always rounded up.
+
+--------------------------------------------------------------------------------
+286C
+ bits 0:15
+ Decoder horizontal Y buffer offset
+
+ bits 15:31
+ Decoder horizontal UV buffer offset
+
+ Offset into the video image buffer. If the offset is gradually incremented,
+ the on screen image will move left & wrap around higher up on the right.
+
+--------------------------------------------------------------------------------
+2870
+ bits 0:15
+ Decoder horizontal Y output offset
+
+ bits 16:31
+ Decoder horizontal UV output offset
+
+ Offsets the actual video output. Controls output alignment of the Y & UV
+ planes. The higher the value, the greater the shift to the left. Use
+ reg 2890 to move the image right.
+
+--------------------------------------------------------------------------------
+2874
+ bits 0:1
+ Decoder horizontal Y output size divider
+ 00 = No divide
+ 01 = Divide by 2
+ 10 = Divide by 3
+
+ bits 4:5
+ Decoder horizontal UV output size divider
+ 00 = No divide
+ 01 = Divide by 2
+ 10 = Divide by 3
+
+ bit 8
+ Decoder ?? unknown
+ 0 = Normal
+ 1 = Affects video output levels
+
+ bit 16
+ Decoder ?? unknown
+ 0 = Normal
+ 1 = Disable horizontal filter
+
+--------------------------------------------------------------------------------
+2878
+ bit 0
+ ?? unknown
+
+ bit 1
+ osd on/off
+ 0 = osd off
+ 1 = osd on
+
+ bit 2
+ Decoder + osd video timing
+ 0 = NTSC
+ 1 = PAL
+
+ bits 3:4
+ ?? unknown
+
+ bit 5
+ Decoder + osd
+ Swaps upper & lower fields
+
+--------------------------------------------------------------------------------
+287C
+ bits 0:10
+ Decoder & osd ?? unknown
+ Moves entire screen horizontally. Starts at 0x005 with the screen
+ shifted heavily to the right. Incrementing in steps of 0x004 will
+ gradually shift the screen to the left.
+
+ bits 11:31
+ ?? unknown
+
+ Normally contents are 0x00101111 (NTSC) or 0x1010111d (PAL)
+
+--------------------------------------------------------------------------------
+2880 -------- ?? unknown
+2884 -------- ?? unknown
+--------------------------------------------------------------------------------
+2888
+ bit 0
+ Decoder + osd ?? unknown
+ 0 = Normal
+ 1 = Misaligned fields (Correctable through 289C & 28A4)
+
+ bit 4
+ ?? unknown
+
+ bit 8
+ ?? unknown
+
+ Warning: Bad values will require a firmware reload to recover.
+ Known to be bad are 0x000,0x011,0x100,0x111
+--------------------------------------------------------------------------------
+288C
+ bits 0:15
+ osd ?? unknown
+ Appears to affect the osd position stability. The higher the value the
+ more unstable it becomes. Decoder output remains stable.
+
+ bits 16:31
+ osd ?? unknown
+ Same as bits 0:15
+
+--------------------------------------------------------------------------------
+2890
+ bits 0:11
+ Decoder output horizontal offset.
+
+ Horizontal offset moves the video image right. A small left shift is
+ possible, but it's better to use reg 2870 for that due to its greater
+ range.
+
+ NOTE: Video corruption will occur if video window is shifted off the right
+ edge. To avoid this read the notes for 2834 & 2838.
+--------------------------------------------------------------------------------
+2894
+ bits 0:23
+ Decoder output video surround colour.
+
+ Contains the colour (in yuv) used to fill the screen when the video is
+ running in a window.
+--------------------------------------------------------------------------------
+2898
+ bits 0:23
+ Decoder video window colour
+ Contains the colour (in yuv) used to fill the video window when the
+ video is turned off.
+
+ bit 24
+ Decoder video output
+ 0 = Video on
+ 1 = Video off
+
+ bit 28
+ Decoder plane order
+ 0 = Y,UV
+ 1 = UV,Y
+
+ bit 29
+ Decoder second plane byte order
+ 0 = Normal (UV)
+ 1 = Swapped (VU)
+
+ In normal usage, the first plane is Y & the second plane is UV. Though the
+ order of the planes can be swapped, only the byte order of the second plane
+ can be swapped. This isn't much use for the Y plane, but can be useful for
+ the UV plane.
+
+--------------------------------------------------------------------------------
+289C
+ bits 0:15
+ Decoder vertical field offset 1
+
+ bits 16:31
+ Decoder vertical field offset 2
+
+ Controls field output vertical alignment. The higher the number, the lower
+ the image on screen. Known starting values are 0x011E0017 (NTSC) &
+ 0x01500017 (PAL)
+--------------------------------------------------------------------------------
+28A0
+ bits 0:15
+ Decoder & osd width in pixels
+
+ bits 16:31
+ Decoder & osd height in pixels
+
+ All output from the decoder & osd are disabled beyond this area. Decoder
+ output will simply go black outside of this region. If the osd tries to
+ exceed this area it will become corrupt.
+--------------------------------------------------------------------------------
+28A4
+ bits 0:11
+ osd left shift.
+
+ Has a range of 0x770->0x7FF. With the exception of 0, any value outside of
+ this range corrupts the osd.
+--------------------------------------------------------------------------------
+28A8
+ bits 0:15
+ osd vertical field offset 1
+
+ bits 16:31
+ osd vertical field offset 2
+
+ Controls field output vertical alignment. The higher the number, the lower
+ the image on screen. Known starting values are 0x011E0017 (NTSC) &
+ 0x01500017 (PAL)
+--------------------------------------------------------------------------------
+28AC -------- ?? unknown
+ |
+ V
+28BC -------- ?? unknown
+--------------------------------------------------------------------------------
+28C0
+ bit 0
+ Current output field
+ 0 = first field
+ 1 = second field
+
+ bits 16:31
+ Current scanline
+ The scanline counts from the top line of the first field
+ through to the last line of the second field.
+--------------------------------------------------------------------------------
+28C4 -------- ?? unknown
+ |
+ V
+28F8 -------- ?? unknown
+--------------------------------------------------------------------------------
+28FC
+ bit 0
+ ?? unknown
+ 0 = Normal
+ 1 = Breaks decoder & osd output
+--------------------------------------------------------------------------------
+2900
+ bits 0:31
+ Decoder vertical Y alias register 1
+---------------
+2904
+ bits 0:31
+ Decoder vertical Y alias register 2
+---------------
+2908
+ bits 0:31
+ Decoder vertical Y alias trigger
+
+ These three registers control the vertical aliasing filter for the Y plane.
+ Operation is similar to the horizontal Y filter (2804). The only real
+ difference is that there are only two registers to set before accessing
+ the trigger register (2908). As for the horizontal filter, the values are
+ taken from a lookup table in the firmware, and the procedure must be
+ repeated 16 times to fully program the filter.
+--------------------------------------------------------------------------------
+290C
+ bits 0:31
+ Decoder vertical UV alias register 1
+---------------
+2910
+ bits 0:31
+ Decoder vertical UV alias register 2
+---------------
+2914
+ bits 0:31
+ Decoder vertical UV alias trigger
+
+ These three registers control the vertical aliasing filter for the UV
+ plane. Operation is the same as the Y filter, with 2914 being the trigger.
+--------------------------------------------------------------------------------
+2918
+ bits 0:15
+ Decoder Y source height in pixels
+
+ bits 16:31
+ Decoder Y destination height in pixels
+---------------
+291C
+ bits 0:15
+ Decoder UV source height in pixels divided by 2
+
+ bits 16:31
+ Decoder UV destination height in pixels
+
+ NOTE: For both registers, the resulting image must be fully visible on
+ screen. If the image exceeds the bottom edge both the source and
+ destination size must be adjusted to reflect the visible portion. For the
+ source height, you must take into account the scaling when calculating the
+ new value.
+--------------------------------------------------------------------------------
+2920
+ bits 0:31
+ Decoder Y vertical scaling
+ Normally = Reg 2930 >> 2
+---------------
+2924
+ bits 0:31
+ Decoder Y vertical scaling
+ Normally = Reg 2920 + 0x514
+---------------
+2928
+ bits 0:31
+ Decoder UV vertical scaling
+ When enlarging = Reg 2930 >> 2
+ When reducing = Reg 2930 >> 3
+---------------
+292C
+ bits 0:31
+ Decoder UV vertical scaling
+ Normally = Reg 2928 + 0x514
+---------------
+2930
+ bits 0:31
+ Decoder 'master' value for vertical scaling
+---------------
+2934
+ bits 0:31
+ Decoder ?? unknown - Y vertical scaling
+---------------
+2938
+ bits 0:31
+ Decoder Y vertical scaling
+ Normally = Reg 2930
+---------------
+293C
+ bits 0:31
+ Decoder ?? unknown - Y vertical scaling
+---------------
+2940
+ bits 0:31
+ Decoder UV vertical scaling
+ When enlarging = Reg 2930 >> 1
+ When reducing = Reg 2930
+---------------
+2944
+ bits 0:31
+ Decoder ?? unknown - UV vertical scaling
+---------------
+2948
+ bits 0:31
+ Decoder UV vertical scaling
+ Normally = Reg 2940
+---------------
+294C
+ bits 0:31
+ Decoder ?? unknown - UV vertical scaling
+
+ Most of these registers either control vertical scaling, or appear linked
+ to it in some way. Register 2930 contains the 'master' value & all other
+ registers can be calculated from that one. You must also remember to
+ correctly set the divider in Reg 296C
+
+ To enlarge:
+ Reg 2930 = (source_height * 0x00200000) / destination_height
+ Reg 296C = No divide
+
+ To reduce from full size down to half size:
+ Reg 2930 = (source_height/2 * 0x00200000) / destination height
+ Reg 296C = Divide by 2
+
+ To reduce from half down to quarter.
+ Reg 2930 = (source_height/4 * 0x00200000) / destination height
+ Reg 296C = Divide by 4
+
+--------------------------------------------------------------------------------
+2950
+ bits 0:15
+ Decoder Y line index into display buffer, first field
+
+ bits 16:31
+ Decoder Y vertical line skip, first field
+--------------------------------------------------------------------------------
+2954
+ bits 0:15
+ Decoder Y line index into display buffer, second field
+
+ bits 16:31
+ Decoder Y vertical line skip, second field
+--------------------------------------------------------------------------------
+2958
+ bits 0:15
+ Decoder UV line index into display buffer, first field
+
+ bits 16:31
+ Decoder UV vertical line skip, first field
+--------------------------------------------------------------------------------
+295C
+ bits 0:15
+ Decoder UV line index into display buffer, second field
+
+ bits 16:31
+ Decoder UV vertical line skip, second field
+--------------------------------------------------------------------------------
+2960
+ bits 0:15
+ Decoder destination height minus 1
+
+ bits 16:31
+ Decoder destination height divided by 2
+--------------------------------------------------------------------------------
+2964
+ bits 0:15
+ Decoder Y vertical offset, second field
+
+ bits 16:31
+ Decoder Y vertical offset, first field
+
+ These two registers shift the Y plane up. The higher the number, the
+ greater the shift.
+--------------------------------------------------------------------------------
+2968
+ bits 0:15
+ Decoder UV vertical offset, second field
+
+ bits 16:31
+ Decoder UV vertical offset, first field
+
+ These two registers shift the UV plane up. The higher the number, the
+ greater the shift.
+--------------------------------------------------------------------------------
+296C
+ bits 0:1
+ Decoder vertical Y output size divider
+ 00 = No divide
+ 01 = Divide by 2
+ 10 = Divide by 4
+
+ bits 8:9
+ Decoder vertical UV output size divider
+ 00 = No divide
+ 01 = Divide by 2
+ 10 = Divide by 4
+--------------------------------------------------------------------------------
+2970
+ bit 0
+ Decoder ?? unknown
+ 0 = Normal
+ 1 = Affect video output levels
+
+ bit 16
+ Decoder ?? unknown
+ 0 = Normal
+ 1 = Disable vertical filter
+
+--------------------------------------------------------------------------------
+2974 -------- ?? unknown
+ |
+ V
+29EF -------- ?? unknown
+--------------------------------------------------------------------------------
+2A00
+ bits 0:2
+ osd colour mode
+ 000 = 8 bit indexed
+ 001 = 16 bit (565)
+ 010 = 15 bit (555)
+ 011 = 12 bit (444)
+ 100 = 32 bit (8888)
+
+ bits 4:5
+ osd display bpp
+ 01 = 8 bit
+ 10 = 16 bit
+ 11 = 32 bit
+
+ bit 8
+ osd global alpha
+ 0 = Off
+ 1 = On
+
+ bit 9
+ osd local alpha
+ 0 = Off
+ 1 = On
+
+ bit 10
+ osd colour key
+ 0 = Off
+ 1 = On
+
+ bit 11
+ osd ?? unknown
+ Must be 1
+
+ bit 13
+ osd colour space
+ 0 = ARGB
+ 1 = AYVU
+
+ bits 16:31
+ osd ?? unknown
+ Must be 0x001B (some kind of buffer pointer ?)
+
+ When the bits-per-pixel is set to 8, the colour mode is ignored and
+ assumed to be 8 bit indexed. For 16 & 32 bits-per-pixel the colour depth
+ is honoured, and when using a colour depth that requires fewer bytes than
+ allocated the extra bytes are used as padding. So for a 32 bpp with 8 bit
+ index colour, there are 3 padding bytes per pixel. It's also possible to
+ select 16bpp with a 32 bit colour mode. This results in the pixel width
+ being doubled, but the color key will not work as expected in this mode.
+
+ Colour key is as it suggests. You designate a colour which will become
+ completely transparent. When using 565, 555 or 444 colour modes, the
+ colour key is always 16 bits wide. The colour to key on is set in Reg 2A18.
+
+ Local alpha works differently depending on the colour mode. For 32bpp & 8
+ bit indexed, local alpha is a per-pixel 256 step transparency, with 0 being
+ transparent and 255 being solid. For the 16bpp modes 555 & 444, the unused
+ bit(s) act as a simple transparency switch, with 0 being solid & 1 being
+ fully transparent. There is no local alpha support for 16bit 565.
+
+ Global alpha is a 256 step transparency that applies to the entire osd,
+ with 0 being transparent & 255 being solid.
+
+ It's possible to combine colour key, local alpha & global alpha.
+--------------------------------------------------------------------------------
+2A04
+ bits 0:15
+ osd x coord for left edge
+
+ bits 16:31
+ osd y coord for top edge
+---------------
+2A08
+ bits 0:15
+ osd x coord for right edge
+
+ bits 16:31
+ osd y coord for bottom edge
+
+ For both registers, (0,0) = top left corner of the display area. These
+ registers do not control the osd size, only where it's positioned & how
+ much is visible. The visible osd area cannot exceed the right edge of the
+ display, otherwise the osd will become corrupt. See reg 2A10 for
+ setting osd width.
+--------------------------------------------------------------------------------
+2A0C
+ bits 0:31
+ osd buffer index
+
+ An index into the osd buffer. Slowly incrementing this moves the osd left,
+ wrapping around onto the right edge
+--------------------------------------------------------------------------------
+2A10
+ bits 0:11
+ osd buffer 32 bit word width
+
+ Contains the width of the osd measured in 32 bit words. This means that all
+ colour modes are restricted to a byte width which is divisible by 4.
+--------------------------------------------------------------------------------
+2A14
+ bits 0:15
+ osd height in pixels
+
+ bits 16:32
+ osd line index into buffer
+ osd will start displaying from this line.
+--------------------------------------------------------------------------------
+2A18
+ bits 0:31
+ osd colour key
+
+ Contains the colour value which will be transparent.
+--------------------------------------------------------------------------------
+2A1C
+ bits 0:7
+ osd global alpha
+
+ Contains the global alpha value (equiv ivtvfbctl --alpha XX)
+--------------------------------------------------------------------------------
+2A20 -------- ?? unknown
+ |
+ V
+2A2C -------- ?? unknown
+--------------------------------------------------------------------------------
+2A30
+ bits 0:7
+ osd colour to change in indexed palette
+---------------
+2A34
+ bits 0:31
+ osd colour for indexed palette
+
+ To set the new palette, first load the index of the colour to change into
+ 2A30, then load the new colour into 2A34. The full palette is 256 colours,
+ so the index range is 0x00-0xFF
+--------------------------------------------------------------------------------
+2A38 -------- ?? unknown
+2A3C -------- ?? unknown
+--------------------------------------------------------------------------------
+2A40
+ bits 0:31
+ osd ?? unknown
+
+ Affects overall brightness, wrapping around to black
+--------------------------------------------------------------------------------
+2A44
+ bits 0:31
+ osd ?? unknown
+
+ Green tint
+--------------------------------------------------------------------------------
+2A48
+ bits 0:31
+ osd ?? unknown
+
+ Red tint
+--------------------------------------------------------------------------------
+2A4C
+ bits 0:31
+ osd ?? unknown
+
+ Affects overall brightness, wrapping around to black
+--------------------------------------------------------------------------------
+2A50
+ bits 0:31
+ osd ?? unknown
+
+ Colour shift
+--------------------------------------------------------------------------------
+2A54
+ bits 0:31
+ osd ?? unknown
+
+ Colour shift
+--------------------------------------------------------------------------------
+2A58 -------- ?? unknown
+ |
+ V
+2AFC -------- ?? unknown
+--------------------------------------------------------------------------------
+2B00
+ bit 0
+ osd filter control
+ 0 = filter off
+ 1 = filter on
+
+ bits 1:4
+ osd ?? unknown
+
+--------------------------------------------------------------------------------
+
+v0.4 - 12 March 2007 - Ian Armstrong (ian@iarmst.demon.co.uk)
+
diff --git a/Documentation/video4linux/cx2341x/fw-dma.txt b/Documentation/video4linux/cx2341x/fw-dma.txt
new file mode 100644
index 0000000..be52b6f
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/fw-dma.txt
@@ -0,0 +1,96 @@
+This page describes the structures and procedures used by the cx2341x DMA
+engine.
+
+Introduction
+============
+
+The cx2341x PCI interface is busmaster capable. This means it has a DMA
+engine to efficiently transfer large volumes of data between the card and main
+memory without requiring help from a CPU. Like most hardware, it must operate
+on contiguous physical memory. This is difficult to come by in large quantities
+on virtual memory machines.
+
+Therefore, it also supports a technique called "scatter-gather". The card can
+transfer multiple buffers in one operation. Instead of allocating one large
+contiguous buffer, the driver can allocate several smaller buffers.
+
+In practice, I've seen the average transfer to be roughly 80K, but transfers
+above 128K were not uncommon, particularly at startup. The 128K figure is
+important, because that is the largest block that the kernel can normally
+allocate. Even still, 128K blocks are hard to come by, so the driver writer is
+urged to choose a smaller block size and learn the scatter-gather technique.
+
+Mailbox #10 is reserved for DMA transfer information.
+
+Note: the hardware expects little-endian data ('intel format').
+
+Flow
+====
+
+This section describes, in general, the order of events when handling DMA
+transfers. Detailed information follows this section.
+
+- The card raises the Encoder interrupt.
+- The driver reads the transfer type, offset and size from Mailbox #10.
+- The driver constructs the scatter-gather array from enough free dma buffers
+ to cover the size.
+- The driver schedules the DMA transfer via the ScheduleDMAtoHost API call.
+- The card raises the DMA Complete interrupt.
+- The driver checks the DMA status register for any errors.
+- The driver post-processes the newly transferred buffers.
+
+NOTE! It is possible that the Encoder and DMA Complete interrupts get raised
+simultaneously. (End of the last, start of the next, etc.)
+
+Mailbox #10
+===========
+
+The Flags, Command, Return Value and Timeout fields are ignored.
+
+Name: Mailbox #10
+Results[0]: Type: 0: MPEG.
+Results[1]: Offset: The position relative to the card's memory space.
+Results[2]: Size: The exact number of bytes to transfer.
+
+My speculation is that since the StartCapture API has a capture type of "RAW"
+available, that the type field will have other values that correspond to YUV
+and PCM data.
+
+Scatter-Gather Array
+====================
+
+The scatter-gather array is a contiguously allocated block of memory that
+tells the card the source and destination of each data-block to transfer.
+Card "addresses" are derived from the offset supplied by Mailbox #10. Host
+addresses are the physical memory location of the target DMA buffer.
+
+Each S-G array element is a struct of three 32-bit words. The first word is
+the source address, the second is the destination address. Both take up the
+entire 32 bits. The lowest 18 bits of the third word is the transfer byte
+count. The high-bit of the third word is the "last" flag. The last-flag tells
+the card to raise the DMA_DONE interrupt. From hard personal experience, if
+you forget to set this bit, the card will still "work" but the stream will
+most likely get corrupted.
+
+The transfer count must be a multiple of 256. Therefore, the driver will need
+to track how much data in the target buffer is valid and deal with it
+accordingly.
+
+Array Element:
+
+- 32-bit Source Address
+- 32-bit Destination Address
+- 14-bit reserved (high bit is the last flag)
+- 18-bit byte count
+
+DMA Transfer Status
+===================
+
+Register 0x0004 holds the DMA Transfer Status:
+
+Bit
+0 read completed
+1 write completed
+2 DMA read error
+3 DMA write error
+4 Scatter-Gather array error
diff --git a/Documentation/video4linux/cx2341x/fw-encoder-api.txt b/Documentation/video4linux/cx2341x/fw-encoder-api.txt
new file mode 100644
index 0000000..5a27af2
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/fw-encoder-api.txt
@@ -0,0 +1,709 @@
+Encoder firmware API description
+================================
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_PING_FW
+Enum 128/0x80
+Description
+ Does nothing. Can be used to check if the firmware is responding.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_START_CAPTURE
+Enum 129/0x81
+Description
+ Commences the capture of video, audio and/or VBI data. All encoding
+ parameters must be initialized prior to this API call. Captures frames
+ continuously or until a predefined number of frames have been captured.
+Param[0]
+ Capture stream type:
+ 0=MPEG
+ 1=Raw
+ 2=Raw passthrough
+ 3=VBI
+
+Param[1]
+ Bitmask:
+ Bit 0 when set, captures YUV
+ Bit 1 when set, captures PCM audio
+ Bit 2 when set, captures VBI (same as param[0]=3)
+ Bit 3 when set, the capture destination is the decoder
+ (same as param[0]=2)
+ Bit 4 when set, the capture destination is the host
+ Note: this parameter is only meaningful for RAW capture type.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_STOP_CAPTURE
+Enum 130/0x82
+Description
+ Ends a capture in progress
+Param[0]
+ 0=stop at end of GOP (generates IRQ)
+ 1=stop immediate (no IRQ)
+Param[1]
+ Stream type to stop, see param[0] of API 0x81
+Param[2]
+ Subtype, see param[1] of API 0x81
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_AUDIO_ID
+Enum 137/0x89
+Description
+ Assigns the transport stream ID of the encoded audio stream
+Param[0]
+ Audio Stream ID
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_VIDEO_ID
+Enum 139/0x8B
+Description
+ Set video transport stream ID
+Param[0]
+ Video stream ID
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_PCR_ID
+Enum 141/0x8D
+Description
+ Assigns the transport stream ID for PCR packets
+Param[0]
+ PCR Stream ID
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_FRAME_RATE
+Enum 143/0x8F
+Description
+ Set video frames per second. Change occurs at start of new GOP.
+Param[0]
+ 0=30fps
+ 1=25fps
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_FRAME_SIZE
+Enum 145/0x91
+Description
+ Select video stream encoding resolution.
+Param[0]
+ Height in lines. Default 480
+Param[1]
+ Width in pixels. Default 720
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_BIT_RATE
+Enum 149/0x95
+Description
+ Assign average video stream bitrate. Note on the last three params:
+ Param[3] and [4] seem to be always 0, param [5] doesn't seem to be used.
+Param[0]
+ 0=variable bitrate, 1=constant bitrate
+Param[1]
+ bitrate in bits per second
+Param[2]
+ peak bitrate in bits per second, divided by 400
+Param[3]
+ Mux bitrate in bits per second, divided by 400. May be 0 (default).
+Param[4]
+ Rate Control VBR Padding
+Param[5]
+ VBV Buffer used by encoder
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_GOP_PROPERTIES
+Enum 151/0x97
+Description
+ Setup the GOP structure
+Param[0]
+ GOP size (maximum is 34)
+Param[1]
+ Number of B frames between the I and P frame, plus 1.
+ For example: IBBPBBPBBPBB --> GOP size: 12, number of B frames: 2+1 = 3
+ Note that GOP size must be a multiple of (B-frames + 1).
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_ASPECT_RATIO
+Enum 153/0x99
+Description
+ Sets the encoding aspect ratio. Changes in the aspect ratio take effect
+ at the start of the next GOP.
+Param[0]
+ '0000' forbidden
+ '0001' 1:1 square
+ '0010' 4:3
+ '0011' 16:9
+ '0100' 2.21:1
+ '0101' reserved
+ ....
+ '1111' reserved
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_DNR_FILTER_MODE
+Enum 155/0x9B
+Description
+ Assign Dynamic Noise Reduction operating mode
+Param[0]
+ Bit0: Spatial filter, set=auto, clear=manual
+ Bit1: Temporal filter, set=auto, clear=manual
+Param[1]
+ Median filter:
+ 0=Disabled
+ 1=Horizontal
+ 2=Vertical
+ 3=Horiz/Vert
+ 4=Diagonal
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_DNR_FILTER_PROPS
+Enum 157/0x9D
+Description
+ These Dynamic Noise Reduction filter values are only meaningful when
+ the respective filter is set to "manual" (See API 0x9B)
+Param[0]
+ Spatial filter: default 0, range 0:15
+Param[1]
+ Temporal filter: default 0, range 0:31
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_CORING_LEVELS
+Enum 159/0x9F
+Description
+ Assign Dynamic Noise Reduction median filter properties.
+Param[0]
+ Threshold above which the luminance median filter is enabled.
+ Default: 0, range 0:255
+Param[1]
+ Threshold below which the luminance median filter is enabled.
+ Default: 255, range 0:255
+Param[2]
+ Threshold above which the chrominance median filter is enabled.
+ Default: 0, range 0:255
+Param[3]
+ Threshold below which the chrominance median filter is enabled.
+ Default: 255, range 0:255
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_SPATIAL_FILTER_TYPE
+Enum 161/0xA1
+Description
+ Assign spatial prefilter parameters
+Param[0]
+ Luminance filter
+ 0=Off
+ 1=1D Horizontal
+ 2=1D Vertical
+ 3=2D H/V Separable (default)
+ 4=2D Symmetric non-separable
+Param[1]
+ Chrominance filter
+ 0=Off
+ 1=1D Horizontal (default)
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_VBI_LINE
+Enum 183/0xB7
+Description
+ Selects VBI line number.
+Param[0]
+ Bits 0:4 line number
+ Bit 31 0=top_field, 1=bottom_field
+ Bits 0:31 all set specifies "all lines"
+Param[1]
+ VBI line information features: 0=disabled, 1=enabled
+Param[2]
+ Slicing: 0=None, 1=Closed Caption
+ Almost certainly not implemented. Set to 0.
+Param[3]
+ Luminance samples in this line.
+ Almost certainly not implemented. Set to 0.
+Param[4]
+ Chrominance samples in this line
+ Almost certainly not implemented. Set to 0.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_STREAM_TYPE
+Enum 185/0xB9
+Description
+ Assign stream type
+ Note: Transport stream is not working in recent firmwares.
+ And in older firmwares the timestamps in the TS seem to be
+ unreliable.
+Param[0]
+ 0=Program stream
+ 1=Transport stream
+ 2=MPEG1 stream
+ 3=PES A/V stream
+ 5=PES Video stream
+ 7=PES Audio stream
+ 10=DVD stream
+ 11=VCD stream
+ 12=SVCD stream
+ 13=DVD_S1 stream
+ 14=DVD_S2 stream
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_OUTPUT_PORT
+Enum 187/0xBB
+Description
+ Assign stream output port. Normally 0 when the data is copied through
+ the PCI bus (DMA), and 1 when the data is streamed to another chip
+ (pvrusb and cx88-blackbird).
+Param[0]
+ 0=Memory (default)
+ 1=Streaming
+ 2=Serial
+Param[1]
+ Unknown, but leaving this to 0 seems to work best. Indications are that
+ this might have to do with USB support, although passing anything but 0
+ only breaks things.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_AUDIO_PROPERTIES
+Enum 189/0xBD
+Description
+ Set audio stream properties, may be called while encoding is in progress.
+ Note: all bitfields are consistent with ISO11172 documentation except
+ bits 2:3 which ISO docs define as:
+ '11' Layer I
+ '10' Layer II
+ '01' Layer III
+ '00' Undefined
+ This discrepancy may indicate a possible error in the documentation.
+ Testing indicated that only Layer II is actually working, and that
+ the minimum bitrate should be 192 kbps.
+Param[0]
+ Bitmask:
+ 0:1 '00' 44.1Khz
+ '01' 48Khz
+ '10' 32Khz
+ '11' reserved
+
+ 2:3 '01'=Layer I
+ '10'=Layer II
+
+ 4:7 Bitrate:
+ Index | Layer I | Layer II
+ ------+-------------+------------
+ '0000' | free format | free format
+ '0001' | 32 kbit/s | 32 kbit/s
+ '0010' | 64 kbit/s | 48 kbit/s
+ '0011' | 96 kbit/s | 56 kbit/s
+ '0100' | 128 kbit/s | 64 kbit/s
+ '0101' | 160 kbit/s | 80 kbit/s
+ '0110' | 192 kbit/s | 96 kbit/s
+ '0111' | 224 kbit/s | 112 kbit/s
+ '1000' | 256 kbit/s | 128 kbit/s
+ '1001' | 288 kbit/s | 160 kbit/s
+ '1010' | 320 kbit/s | 192 kbit/s
+ '1011' | 352 kbit/s | 224 kbit/s
+ '1100' | 384 kbit/s | 256 kbit/s
+ '1101' | 416 kbit/s | 320 kbit/s
+ '1110' | 448 kbit/s | 384 kbit/s
+ Note: For Layer II, not all combinations of total bitrate
+ and mode are allowed. See ISO11172-3 3-Annex B, Table 3-B.2
+
+ 8:9 '00'=Stereo
+ '01'=JointStereo
+ '10'=Dual
+ '11'=Mono
+ Note: the cx23415 cannot decode Joint Stereo properly.
+
+ 10:11 Mode Extension used in joint_stereo mode.
+ In Layer I and II they indicate which subbands are in
+ intensity_stereo. All other subbands are coded in stereo.
+ '00' subbands 4-31 in intensity_stereo, bound==4
+ '01' subbands 8-31 in intensity_stereo, bound==8
+ '10' subbands 12-31 in intensity_stereo, bound==12
+ '11' subbands 16-31 in intensity_stereo, bound==16
+
+ 12:13 Emphasis:
+ '00' None
+ '01' 50/15uS
+ '10' reserved
+ '11' CCITT J.17
+
+ 14 CRC:
+ '0' off
+ '1' on
+
+ 15 Copyright:
+ '0' off
+ '1' on
+
+ 16 Generation:
+ '0' copy
+ '1' original
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_HALT_FW
+Enum 195/0xC3
+Description
+ The firmware is halted and no further API calls are serviced until the
+ firmware is uploaded again.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_GET_VERSION
+Enum 196/0xC4
+Description
+ Returns the version of the encoder firmware.
+Result[0]
+ Version bitmask:
+ Bits 0:15 build
+ Bits 16:23 minor
+ Bits 24:31 major
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_GOP_CLOSURE
+Enum 197/0xC5
+Description
+ Assigns the GOP open/close property.
+Param[0]
+ 0=Open
+ 1=Closed
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_GET_SEQ_END
+Enum 198/0xC6
+Description
+ Obtains the sequence end code of the encoder's buffer. When a capture
+ is started a number of interrupts are still generated, the last of
+ which will have Result[0] set to 1 and Result[1] will contain the size
+ of the buffer.
+Result[0]
+ State of the transfer (1 if last buffer)
+Result[1]
+ If Result[0] is 1, this contains the size of the last buffer, undefined
+ otherwise.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_PGM_INDEX_INFO
+Enum 199/0xC7
+Description
+ Sets the Program Index Information.
+ The information is stored as follows:
+
+ struct info {
+ u32 length; // Length of this frame
+ u32 offset_low; // Offset in the file of the
+ u32 offset_high; // start of this frame
+ u32 mask1; // Bits 0-2 are the type mask:
+ // 1=I, 2=P, 4=B
+ // 0=End of Program Index, other fields
+ // are invalid.
+ u32 pts; // The PTS of the frame
+ u32 mask2; // Bit 0 is bit 32 of the pts.
+ };
+ u32 table_ptr;
+ struct info index[400];
+
+ The table_ptr is the encoder memory address in the table were
+ *new* entries will be written. Note that this is a ringbuffer,
+ so the table_ptr will wraparound.
+Param[0]
+ Picture Mask:
+ 0=No index capture
+ 1=I frames
+ 3=I,P frames
+ 7=I,P,B frames
+ (Seems to be ignored, it always indexes I, P and B frames)
+Param[1]
+ Elements requested (up to 400)
+Result[0]
+ Offset in the encoder memory of the start of the table.
+Result[1]
+ Number of allocated elements up to a maximum of Param[1]
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_VBI_CONFIG
+Enum 200/0xC8
+Description
+ Configure VBI settings
+Param[0]
+ Bitmap:
+ 0 Mode '0' Sliced, '1' Raw
+ 1:3 Insertion:
+ '000' insert in extension & user data
+ '001' insert in private packets
+ '010' separate stream and user data
+ '111' separate stream and private data
+ 8:15 Stream ID (normally 0xBD)
+Param[1]
+ Frames per interrupt (max 8). Only valid in raw mode.
+Param[2]
+ Total raw VBI frames. Only valid in raw mode.
+Param[3]
+ Start codes
+Param[4]
+ Stop codes
+Param[5]
+ Lines per frame
+Param[6]
+ Byte per line
+Result[0]
+ Observed frames per interrupt in raw mode only. Rage 1 to Param[1]
+Result[1]
+ Observed number of frames in raw mode. Range 1 to Param[2]
+Result[2]
+ Memory offset to start or raw VBI data
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_DMA_BLOCK_SIZE
+Enum 201/0xC9
+Description
+ Set DMA transfer block size
+Param[0]
+ DMA transfer block size in bytes or frames. When unit is bytes,
+ supported block sizes are 2^7, 2^8 and 2^9 bytes.
+Param[1]
+ Unit: 0=bytes, 1=frames
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_GET_PREV_DMA_INFO_MB_10
+Enum 202/0xCA
+Description
+ Returns information on the previous DMA transfer in conjunction with
+ bit 27 of the interrupt mask. Uses mailbox 10.
+Result[0]
+ Type of stream
+Result[1]
+ Address Offset
+Result[2]
+ Maximum size of transfer
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_GET_PREV_DMA_INFO_MB_9
+Enum 203/0xCB
+Description
+ Returns information on the previous DMA transfer in conjunction with
+ bit 27 or 18 of the interrupt mask. Uses mailbox 9.
+Result[0]
+ Status bits:
+ 0 read completed
+ 1 write completed
+ 2 DMA read error
+ 3 DMA write error
+ 4 Scatter-Gather array error
+Result[1]
+ DMA type
+Result[2]
+ Presentation Time Stamp bits 0..31
+Result[3]
+ Presentation Time Stamp bit 32
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SCHED_DMA_TO_HOST
+Enum 204/0xCC
+Description
+ Setup DMA to host operation
+Param[0]
+ Memory address of link list
+Param[1]
+ Length of link list (wtf: what units ???)
+Param[2]
+ DMA type (0=MPEG)
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_INITIALIZE_INPUT
+Enum 205/0xCD
+Description
+ Initializes the video input
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_FRAME_DROP_RATE
+Enum 208/0xD0
+Description
+ For each frame captured, skip specified number of frames.
+Param[0]
+ Number of frames to skip
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_PAUSE_ENCODER
+Enum 210/0xD2
+Description
+ During a pause condition, all frames are dropped instead of being encoded.
+Param[0]
+ 0=Pause encoding
+ 1=Continue encoding
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_REFRESH_INPUT
+Enum 211/0xD3
+Description
+ Refreshes the video input
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_COPYRIGHT
+Enum 212/0xD4
+Description
+ Sets stream copyright property
+Param[0]
+ 0=Stream is not copyrighted
+ 1=Stream is copyrighted
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_EVENT_NOTIFICATION
+Enum 213/0xD5
+Description
+ Setup firmware to notify the host about a particular event. Host must
+ unmask the interrupt bit.
+Param[0]
+ Event (0=refresh encoder input)
+Param[1]
+ Notification 0=disabled 1=enabled
+Param[2]
+ Interrupt bit
+Param[3]
+ Mailbox slot, -1 if no mailbox required.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_NUM_VSYNC_LINES
+Enum 214/0xD6
+Description
+ Depending on the analog video decoder used, this assigns the number
+ of lines for field 1 and 2.
+Param[0]
+ Field 1 number of lines:
+ 0x00EF for SAA7114
+ 0x00F0 for SAA7115
+ 0x0105 for Micronas
+Param[1]
+ Field 2 number of lines:
+ 0x00EF for SAA7114
+ 0x00F0 for SAA7115
+ 0x0106 for Micronas
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_PLACEHOLDER
+Enum 215/0xD7
+Description
+ Provides a mechanism of inserting custom user data in the MPEG stream.
+Param[0]
+ 0=extension & user data
+ 1=private packet with stream ID 0xBD
+Param[1]
+ Rate at which to insert data, in units of frames (for private packet)
+ or GOPs (for ext. & user data)
+Param[2]
+ Number of data DWORDs (below) to insert
+Param[3]
+ Custom data 0
+Param[4]
+ Custom data 1
+Param[5]
+ Custom data 2
+Param[6]
+ Custom data 3
+Param[7]
+ Custom data 4
+Param[8]
+ Custom data 5
+Param[9]
+ Custom data 6
+Param[10]
+ Custom data 7
+Param[11]
+ Custom data 8
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_MUTE_VIDEO
+Enum 217/0xD9
+Description
+ Video muting
+Param[0]
+ Bit usage:
+ 0 '0'=video not muted
+ '1'=video muted, creates frames with the YUV color defined below
+ 1:7 Unused
+ 8:15 V chrominance information
+ 16:23 U chrominance information
+ 24:31 Y luminance information
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_MUTE_AUDIO
+Enum 218/0xDA
+Description
+ Audio muting
+Param[0]
+ 0=audio not muted
+ 1=audio muted (produces silent mpeg audio stream)
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_SET_VERT_CROP_LINE
+Enum 219/0xDB
+Description
+ Something to do with 'Vertical Crop Line'
+Param[0]
+ If saa7114 and raw VBI capture and 60 Hz, then set to 10001.
+ Else 0.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_ENC_MISC
+Enum 220/0xDC
+Description
+ Miscellaneous actions. Not known for 100% what it does. It's really a
+ sort of ioctl call. The first parameter is a command number, the second
+ the value.
+Param[0]
+ Command number:
+ 1=set initial SCR value when starting encoding (works).
+ 2=set quality mode (apparently some test setting).
+ 3=setup advanced VIM protection handling.
+ Always 1 for the cx23416 and 0 for cx23415.
+ 4=generate DVD compatible PTS timestamps
+ 5=USB flush mode
+ 6=something to do with the quantization matrix
+ 7=set navigation pack insertion for DVD: adds 0xbf (private stream 2)
+ packets to the MPEG. The size of these packets is 2048 bytes (including
+ the header of 6 bytes: 0x000001bf + length). The payload is zeroed and
+ it is up to the application to fill them in. These packets are apparently
+ inserted every four frames.
+ 8=enable scene change detection (seems to be a failure)
+ 9=set history parameters of the video input module
+ 10=set input field order of VIM
+ 11=set quantization matrix
+ 12=reset audio interface after channel change or input switch (has no argument).
+ Needed for the cx2584x, not needed for the mspx4xx, but it doesn't seem to
+ do any harm calling it regardless.
+ 13=set audio volume delay
+ 14=set audio delay
+
+Param[1]
+ Command value.
diff --git a/Documentation/video4linux/cx2341x/fw-memory.txt b/Documentation/video4linux/cx2341x/fw-memory.txt
new file mode 100644
index 0000000..9d736fe
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/fw-memory.txt
@@ -0,0 +1,139 @@
+This document describes the cx2341x memory map and documents some of the register
+space.
+
+Note: the memory long words are little-endian ('intel format').
+
+Warning! This information was figured out from searching through the memory and
+registers, this information may not be correct and is certainly not complete, and
+was not derived from anything more than searching through the memory space with
+commands like:
+
+ ivtvctl -O min=0x02000000,max=0x020000ff
+
+So take this as is, I'm always searching for more stuff, it's a large
+register space :-).
+
+Memory Map
+==========
+
+The cx2341x exposes its entire 64M memory space to the PCI host via the PCI BAR0
+(Base Address Register 0). The addresses here are offsets relative to the
+address held in BAR0.
+
+0x00000000-0x00ffffff Encoder memory space
+0x00000000-0x0003ffff Encode.rom
+ ???-??? MPEG buffer(s)
+ ???-??? Raw video capture buffer(s)
+ ???-??? Raw audio capture buffer(s)
+ ???-??? Display buffers (6 or 9)
+
+0x01000000-0x01ffffff Decoder memory space
+0x01000000-0x0103ffff Decode.rom
+ ???-??? MPEG buffers(s)
+0x0114b000-0x0115afff Audio.rom (deprecated?)
+
+0x02000000-0x0200ffff Register Space
+
+Registers
+=========
+
+The registers occupy the 64k space starting at the 0x02000000 offset from BAR0.
+All of these registers are 32 bits wide.
+
+DMA Registers 0x000-0xff:
+
+ 0x00 - Control:
+ 0=reset/cancel, 1=read, 2=write, 4=stop
+ 0x04 - DMA status:
+ 1=read busy, 2=write busy, 4=read error, 8=write error, 16=link list error
+ 0x08 - pci DMA pointer for read link list
+ 0x0c - pci DMA pointer for write link list
+ 0x10 - read/write DMA enable:
+ 1=read enable, 2=write enable
+ 0x14 - always 0xffffffff, if set any lower instability occurs, 0x00 crashes
+ 0x18 - ??
+ 0x1c - always 0x20 or 32, smaller values slow down DMA transactions
+ 0x20 - always value of 0x780a010a
+ 0x24-0x3c - usually just random values???
+ 0x40 - Interrupt status
+ 0x44 - Write a bit here and shows up in Interrupt status 0x40
+ 0x48 - Interrupt Mask
+ 0x4C - always value of 0xfffdffff,
+ if changed to 0xffffffff DMA write interrupts break.
+ 0x50 - always 0xffffffff
+ 0x54 - always 0xffffffff (0x4c, 0x50, 0x54 seem like interrupt masks, are
+ 3 processors on chip, Java ones, VPU, SPU, APU, maybe these are the
+ interrupt masks???).
+ 0x60-0x7C - random values
+ 0x80 - first write linked list reg, for Encoder Memory addr
+ 0x84 - first write linked list reg, for pci memory addr
+ 0x88 - first write linked list reg, for length of buffer in memory addr
+ (|0x80000000 or this for last link)
+ 0x8c-0xdc - rest of write linked list reg, 8 sets of 3 total, DMA goes here
+ from linked list addr in reg 0x0c, firmware must push through or
+ something.
+ 0xe0 - first (and only) read linked list reg, for pci memory addr
+ 0xe4 - first (and only) read linked list reg, for Decoder memory addr
+ 0xe8 - first (and only) read linked list reg, for length of buffer
+ 0xec-0xff - Nothing seems to be in these registers, 0xec-f4 are 0x00000000.
+
+Memory locations for Encoder Buffers 0x700-0x7ff:
+
+These registers show offsets of memory locations pertaining to each
+buffer area used for encoding, have to shift them by <<1 first.
+
+0x07F8: Encoder SDRAM refresh
+0x07FC: Encoder SDRAM pre-charge
+
+Memory locations for Decoder Buffers 0x800-0x8ff:
+
+These registers show offsets of memory locations pertaining to each
+buffer area used for decoding, have to shift them by <<1 first.
+
+0x08F8: Decoder SDRAM refresh
+0x08FC: Decoder SDRAM pre-charge
+
+Other memory locations:
+
+0x2800: Video Display Module control
+0x2D00: AO (audio output?) control
+0x2D24: Bytes Flushed
+0x7000: LSB I2C write clock bit (inverted)
+0x7004: LSB I2C write data bit (inverted)
+0x7008: LSB I2C read clock bit
+0x700c: LSB I2C read data bit
+0x9008: GPIO get input state
+0x900c: GPIO set output state
+0x9020: GPIO direction (Bit7 (GPIO 0..7) - 0:input, 1:output)
+0x9050: SPU control
+0x9054: Reset HW blocks
+0x9058: VPU control
+0xA018: Bit6: interrupt pending?
+0xA064: APU command
+
+
+Interrupt Status Register
+=========================
+
+The definition of the bits in the interrupt status register 0x0040, and the
+interrupt mask 0x0048. If a bit is cleared in the mask, then we want our ISR to
+execute.
+
+Bit
+31 Encoder Start Capture
+30 Encoder EOS
+29 Encoder VBI capture
+28 Encoder Video Input Module reset event
+27 Encoder DMA complete
+24 Decoder audio mode change detection event (through event notification)
+22 Decoder data request
+20 Decoder DMA complete
+19 Decoder VBI re-insertion
+18 Decoder DMA err (linked-list bad)
+
+Missing
+Encoder API call completed
+Decoder API call completed
+Encoder API post(?)
+Decoder API post(?)
+Decoder VTRACE event
diff --git a/Documentation/video4linux/cx2341x/fw-osd-api.txt b/Documentation/video4linux/cx2341x/fw-osd-api.txt
new file mode 100644
index 0000000..89c4601
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/fw-osd-api.txt
@@ -0,0 +1,350 @@
+OSD firmware API description
+============================
+
+Note: this API is part of the decoder firmware, so it's cx23415 only.
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_GET_FRAMEBUFFER
+Enum 65/0x41
+Description
+ Return base and length of contiguous OSD memory.
+Result[0]
+ OSD base address
+Result[1]
+ OSD length
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_GET_PIXEL_FORMAT
+Enum 66/0x42
+Description
+ Query OSD format
+Result[0]
+ 0=8bit index
+ 1=16bit RGB 5:6:5
+ 2=16bit ARGB 1:5:5:5
+ 3=16bit ARGB 1:4:4:4
+ 4=32bit ARGB 8:8:8:8
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_PIXEL_FORMAT
+Enum 67/0x43
+Description
+ Assign pixel format
+Param[0]
+ 0=8bit index
+ 1=16bit RGB 5:6:5
+ 2=16bit ARGB 1:5:5:5
+ 3=16bit ARGB 1:4:4:4
+ 4=32bit ARGB 8:8:8:8
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_GET_STATE
+Enum 68/0x44
+Description
+ Query OSD state
+Result[0]
+ Bit 0 0=off, 1=on
+ Bits 1:2 alpha control
+ Bits 3:5 pixel format
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_STATE
+Enum 69/0x45
+Description
+ OSD switch
+Param[0]
+ 0=off, 1=on
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_GET_OSD_COORDS
+Enum 70/0x46
+Description
+ Retrieve coordinates of OSD area blended with video
+Result[0]
+ OSD buffer address
+Result[1]
+ Stride in pixels
+Result[2]
+ Lines in OSD buffer
+Result[3]
+ Horizontal offset in buffer
+Result[4]
+ Vertical offset in buffer
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_OSD_COORDS
+Enum 71/0x47
+Description
+ Assign the coordinates of the OSD area to blend with video
+Param[0]
+ buffer address
+Param[1]
+ buffer stride in pixels
+Param[2]
+ lines in buffer
+Param[3]
+ horizontal offset
+Param[4]
+ vertical offset
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_GET_SCREEN_COORDS
+Enum 72/0x48
+Description
+ Retrieve OSD screen area coordinates
+Result[0]
+ top left horizontal offset
+Result[1]
+ top left vertical offset
+Result[2]
+ bottom right horizontal offset
+Result[3]
+ bottom right vertical offset
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_SCREEN_COORDS
+Enum 73/0x49
+Description
+ Assign the coordinates of the screen area to blend with video
+Param[0]
+ top left horizontal offset
+Param[1]
+ top left vertical offset
+Param[2]
+ bottom left horizontal offset
+Param[3]
+ bottom left vertical offset
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_GET_GLOBAL_ALPHA
+Enum 74/0x4A
+Description
+ Retrieve OSD global alpha
+Result[0]
+ global alpha: 0=off, 1=on
+Result[1]
+ bits 0:7 global alpha
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_GLOBAL_ALPHA
+Enum 75/0x4B
+Description
+ Update global alpha
+Param[0]
+ global alpha: 0=off, 1=on
+Param[1]
+ global alpha (8 bits)
+Param[2]
+ local alpha: 0=on, 1=off
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_BLEND_COORDS
+Enum 78/0x4C
+Description
+ Move start of blending area within display buffer
+Param[0]
+ horizontal offset in buffer
+Param[1]
+ vertical offset in buffer
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_GET_FLICKER_STATE
+Enum 79/0x4F
+Description
+ Retrieve flicker reduction module state
+Result[0]
+ flicker state: 0=off, 1=on
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_FLICKER_STATE
+Enum 80/0x50
+Description
+ Set flicker reduction module state
+Param[0]
+ State: 0=off, 1=on
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_BLT_COPY
+Enum 82/0x52
+Description
+ BLT copy
+Param[0]
+'0000' zero
+'0001' ~destination AND ~source
+'0010' ~destination AND source
+'0011' ~destination
+'0100' destination AND ~source
+'0101' ~source
+'0110' destination XOR source
+'0111' ~destination OR ~source
+'1000' ~destination AND ~source
+'1001' destination XNOR source
+'1010' source
+'1011' ~destination OR source
+'1100' destination
+'1101' destination OR ~source
+'1110' destination OR source
+'1111' one
+
+Param[1]
+ Resulting alpha blending
+ '01' source_alpha
+ '10' destination_alpha
+ '11' source_alpha*destination_alpha+1
+ (zero if both source and destination alpha are zero)
+Param[2]
+ '00' output_pixel = source_pixel
+
+ '01' if source_alpha=0:
+ output_pixel = destination_pixel
+ if 256 > source_alpha > 1:
+ output_pixel = ((source_alpha + 1)*source_pixel +
+ (255 - source_alpha)*destination_pixel)/256
+
+ '10' if destination_alpha=0:
+ output_pixel = source_pixel
+ if 255 > destination_alpha > 0:
+ output_pixel = ((255 - destination_alpha)*source_pixel +
+ (destination_alpha + 1)*destination_pixel)/256
+
+ '11' if source_alpha=0:
+ source_temp = 0
+ if source_alpha=255:
+ source_temp = source_pixel*256
+ if 255 > source_alpha > 0:
+ source_temp = source_pixel*(source_alpha + 1)
+ if destination_alpha=0:
+ destination_temp = 0
+ if destination_alpha=255:
+ destination_temp = destination_pixel*256
+ if 255 > destination_alpha > 0:
+ destination_temp = destination_pixel*(destination_alpha + 1)
+ output_pixel = (source_temp + destination_temp)/256
+Param[3]
+ width
+Param[4]
+ height
+Param[5]
+ destination pixel mask
+Param[6]
+ destination rectangle start address
+Param[7]
+ destination stride in dwords
+Param[8]
+ source stride in dwords
+Param[9]
+ source rectangle start address
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_BLT_FILL
+Enum 83/0x53
+Description
+ BLT fill color
+Param[0]
+ Same as Param[0] on API 0x52
+Param[1]
+ Same as Param[1] on API 0x52
+Param[2]
+ Same as Param[2] on API 0x52
+Param[3]
+ width
+Param[4]
+ height
+Param[5]
+ destination pixel mask
+Param[6]
+ destination rectangle start address
+Param[7]
+ destination stride in dwords
+Param[8]
+ color fill value
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_BLT_TEXT
+Enum 84/0x54
+Description
+ BLT for 8 bit alpha text source
+Param[0]
+ Same as Param[0] on API 0x52
+Param[1]
+ Same as Param[1] on API 0x52
+Param[2]
+ Same as Param[2] on API 0x52
+Param[3]
+ width
+Param[4]
+ height
+Param[5]
+ destination pixel mask
+Param[6]
+ destination rectangle start address
+Param[7]
+ destination stride in dwords
+Param[8]
+ source stride in dwords
+Param[9]
+ source rectangle start address
+Param[10]
+ color fill value
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_FRAMEBUFFER_WINDOW
+Enum 86/0x56
+Description
+ Positions the main output window on the screen. The coordinates must be
+ such that the entire window fits on the screen.
+Param[0]
+ window width
+Param[1]
+ window height
+Param[2]
+ top left window corner horizontal offset
+Param[3]
+ top left window corner vertical offset
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_CHROMA_KEY
+Enum 96/0x60
+Description
+ Chroma key switch and color
+Param[0]
+ state: 0=off, 1=on
+Param[1]
+ color
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_GET_ALPHA_CONTENT_INDEX
+Enum 97/0x61
+Description
+ Retrieve alpha content index
+Result[0]
+ alpha content index, Range 0:15
+
+-------------------------------------------------------------------------------
+
+Name CX2341X_OSD_SET_ALPHA_CONTENT_INDEX
+Enum 98/0x62
+Description
+ Assign alpha content index
+Param[0]
+ alpha content index, range 0:15
diff --git a/Documentation/video4linux/cx2341x/fw-upload.txt b/Documentation/video4linux/cx2341x/fw-upload.txt
new file mode 100644
index 0000000..60c502c
--- /dev/null
+++ b/Documentation/video4linux/cx2341x/fw-upload.txt
@@ -0,0 +1,49 @@
+This document describes how to upload the cx2341x firmware to the card.
+
+How to find
+===========
+
+See the web pages of the various projects that uses this chip for information
+on how to obtain the firmware.
+
+The firmware stored in a Windows driver can be detected as follows:
+
+- Each firmware image is 256k bytes.
+- The 1st 32-bit word of the Encoder image is 0x0000da7
+- The 1st 32-bit word of the Decoder image is 0x00003a7
+- The 2nd 32-bit word of both images is 0xaa55bb66
+
+How to load
+===========
+
+- Issue the FWapi command to stop the encoder if it is running. Wait for the
+ command to complete.
+- Issue the FWapi command to stop the decoder if it is running. Wait for the
+ command to complete.
+- Issue the I2C command to the digitizer to stop emitting VSYNC events.
+- Issue the FWapi command to halt the encoder's firmware.
+- Sleep for 10ms.
+- Issue the FWapi command to halt the decoder's firmware.
+- Sleep for 10ms.
+- Write 0x00000000 to register 0x2800 to stop the Video Display Module.
+- Write 0x00000005 to register 0x2D00 to stop the AO (audio output?).
+- Write 0x00000000 to register 0xA064 to ping? the APU.
+- Write 0xFFFFFFFE to register 0x9058 to stop the VPU.
+- Write 0xFFFFFFFF to register 0x9054 to reset the HW blocks.
+- Write 0x00000001 to register 0x9050 to stop the SPU.
+- Sleep for 10ms.
+- Write 0x0000001A to register 0x07FC to init the Encoder SDRAM's pre-charge.
+- Write 0x80000640 to register 0x07F8 to init the Encoder SDRAM's refresh to 1us.
+- Write 0x0000001A to register 0x08FC to init the Decoder SDRAM's pre-charge.
+- Write 0x80000640 to register 0x08F8 to init the Decoder SDRAM's refresh to 1us.
+- Sleep for 512ms. (600ms is recommended)
+- Transfer the encoder's firmware image to offset 0 in Encoder memory space.
+- Transfer the decoder's firmware image to offset 0 in Decoder memory space.
+- Use a read-modify-write operation to Clear bit 0 of register 0x9050 to
+ re-enable the SPU.
+- Sleep for 1 second.
+- Use a read-modify-write operation to Clear bits 3 and 0 of register 0x9058
+ to re-enable the VPU.
+- Sleep for 1 second.
+- Issue status API commands to both firmware images to verify.
+
diff --git a/Documentation/video4linux/cx88/hauppauge-wintv-cx88-ir.txt b/Documentation/video4linux/cx88/hauppauge-wintv-cx88-ir.txt
new file mode 100644
index 0000000..faccee6
--- /dev/null
+++ b/Documentation/video4linux/cx88/hauppauge-wintv-cx88-ir.txt
@@ -0,0 +1,54 @@
+The controls for the mux are GPIO [0,1] for source, and GPIO 2 for muting.
+
+GPIO0 GPIO1
+ 0 0 TV Audio
+ 1 0 FM radio
+ 0 1 Line-In
+ 1 1 Mono tuner bypass or CD passthru (tuner specific)
+
+GPIO 16(i believe) is tied to the IR port (if present).
+
+------------------------------------------------------------------------------------
+
+>From the data sheet:
+ Register 24'h20004 PCI Interrupt Status
+ bit [18] IR_SMP_INT Set when 32 input samples have been collected over
+ gpio[16] pin into GP_SAMPLE register.
+
+What's missing from the data sheet:
+
+Setup 4KHz sampling rate (roughly 2x oversampled; good enough for our RC5
+compat remote)
+set register 0x35C050 to 0xa80a80
+
+enable sampling
+set register 0x35C054 to 0x5
+
+Of course, enable the IRQ bit 18 in the interrupt mask register .(and
+provide for a handler)
+
+GP_SAMPLE register is at 0x35C058
+
+Bits are then right shifted into the GP_SAMPLE register at the specified
+rate; you get an interrupt when a full DWORD is received.
+You need to recover the actual RC5 bits out of the (oversampled) IR sensor
+bits. (Hint: look for the 0/1and 1/0 crossings of the RC5 bi-phase data) An
+actual raw RC5 code will span 2-3 DWORDS, depending on the actual alignment.
+
+I'm pretty sure when no IR signal is present the receiver is always in a
+marking state(1); but stray light, etc can cause intermittent noise values
+as well. Remember, this is a free running sample of the IR receiver state
+over time, so don't assume any sample starts at any particular place.
+
+http://www.atmel.com/dyn/resources/prod_documents/doc2817.pdf
+This data sheet (google search) seems to have a lovely description of the
+RC5 basics
+
+http://users.pandora.be/nenya/electronics/rc5/ and more data
+
+http://www.ee.washington.edu/circuit_archive/text/ir_decode.txt
+and even a reference to how to decode a bi-phase data stream.
+
+http://www.xs4all.nl/~sbp/knowledge/ir/rc5.htm
+still more info
+
diff --git a/Documentation/video4linux/et61x251.txt b/Documentation/video4linux/et61x251.txt
new file mode 100644
index 0000000..1247566
--- /dev/null
+++ b/Documentation/video4linux/et61x251.txt
@@ -0,0 +1,315 @@
+
+ ET61X[12]51 PC Camera Controllers
+ Driver for Linux
+ =================================
+
+ - Documentation -
+
+
+Index
+=====
+1. Copyright
+2. Disclaimer
+3. License
+4. Overview and features
+5. Module dependencies
+6. Module loading
+7. Module parameters
+8. Optional device control through "sysfs"
+9. Supported devices
+10. Notes for V4L2 application developers
+11. Contact information
+
+
+1. Copyright
+============
+Copyright (C) 2006-2007 by Luca Risolia <luca.risolia@studio.unibo.it>
+
+
+2. Disclaimer
+=============
+Etoms is a trademark of Etoms Electronics Corp.
+This software is not developed or sponsored by Etoms Electronics.
+
+
+3. License
+==========
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+4. Overview and features
+========================
+This driver supports the video interface of the devices mounting the ET61X151
+or ET61X251 PC Camera Controllers.
+
+It's worth to note that Etoms Electronics has never collaborated with the
+author during the development of this project; despite several requests,
+Etoms Electronics also refused to release enough detailed specifications of
+the video compression engine.
+
+The driver relies on the Video4Linux2 and USB core modules. It has been
+designed to run properly on SMP systems as well.
+
+The latest version of the ET61X[12]51 driver can be found at the following URL:
+http://www.linux-projects.org/
+
+Some of the features of the driver are:
+
+- full compliance with the Video4Linux2 API (see also "Notes for V4L2
+ application developers" paragraph);
+- available mmap or read/poll methods for video streaming through isochronous
+ data transfers;
+- automatic detection of image sensor;
+- support for any window resolutions and optional panning within the maximum
+ pixel area of image sensor;
+- image downscaling with arbitrary scaling factors from 1 and 2 in both
+ directions (see "Notes for V4L2 application developers" paragraph);
+- two different video formats for uncompressed or compressed data in low or
+ high compression quality (see also "Notes for V4L2 application developers"
+ paragraph);
+- full support for the capabilities of every possible image sensors that can
+ be connected to the ET61X[12]51 bridges, including, for instance, red, green,
+ blue and global gain adjustments and exposure control (see "Supported
+ devices" paragraph for details);
+- use of default color settings for sunlight conditions;
+- dynamic I/O interface for both ET61X[12]51 and image sensor control (see
+ "Optional device control through 'sysfs'" paragraph);
+- dynamic driver control thanks to various module parameters (see "Module
+ parameters" paragraph);
+- up to 64 cameras can be handled at the same time; they can be connected and
+ disconnected from the host many times without turning off the computer, if
+ the system supports hotplugging;
+- no known bugs.
+
+
+5. Module dependencies
+======================
+For it to work properly, the driver needs kernel support for Video4Linux and
+USB.
+
+The following options of the kernel configuration file must be enabled and
+corresponding modules must be compiled:
+
+ # Multimedia devices
+ #
+ CONFIG_VIDEO_DEV=m
+
+To enable advanced debugging functionality on the device through /sysfs:
+
+ # Multimedia devices
+ #
+ CONFIG_VIDEO_ADV_DEBUG=y
+
+ # USB support
+ #
+ CONFIG_USB=m
+
+In addition, depending on the hardware being used, the modules below are
+necessary:
+
+ # USB Host Controller Drivers
+ #
+ CONFIG_USB_EHCI_HCD=m
+ CONFIG_USB_UHCI_HCD=m
+ CONFIG_USB_OHCI_HCD=m
+
+And finally:
+
+ # USB Multimedia devices
+ #
+ CONFIG_USB_ET61X251=m
+
+
+6. Module loading
+=================
+To use the driver, it is necessary to load the "et61x251" module into memory
+after every other module required: "videodev", "v4l2_common", "compat_ioctl32",
+"usbcore" and, depending on the USB host controller you have, "ehci-hcd",
+"uhci-hcd" or "ohci-hcd".
+
+Loading can be done as shown below:
+
+ [root@localhost home]# modprobe et61x251
+
+At this point the devices should be recognized. You can invoke "dmesg" to
+analyze kernel messages and verify that the loading process has gone well:
+
+ [user@localhost home]$ dmesg
+
+
+7. Module parameters
+====================
+Module parameters are listed below:
+-------------------------------------------------------------------------------
+Name: video_nr
+Type: short array (min = 0, max = 64)
+Syntax: <-1|n[,...]>
+Description: Specify V4L2 minor mode number:
+ -1 = use next available
+ n = use minor number n
+ You can specify up to 64 cameras this way.
+ For example:
+ video_nr=-1,2,-1 would assign minor number 2 to the second
+ registered camera and use auto for the first one and for every
+ other camera.
+Default: -1
+-------------------------------------------------------------------------------
+Name: force_munmap
+Type: bool array (min = 0, max = 64)
+Syntax: <0|1[,...]>
+Description: Force the application to unmap previously mapped buffer memory
+ before calling any VIDIOC_S_CROP or VIDIOC_S_FMT ioctl's. Not
+ all the applications support this feature. This parameter is
+ specific for each detected camera.
+ 0 = do not force memory unmapping
+ 1 = force memory unmapping (save memory)
+Default: 0
+-------------------------------------------------------------------------------
+Name: frame_timeout
+Type: uint array (min = 0, max = 64)
+Syntax: <n[,...]>
+Description: Timeout for a video frame in seconds. This parameter is
+ specific for each detected camera. This parameter can be
+ changed at runtime thanks to the /sys filesystem interface.
+Default: 2
+-------------------------------------------------------------------------------
+Name: debug
+Type: ushort
+Syntax: <n>
+Description: Debugging information level, from 0 to 3:
+ 0 = none (use carefully)
+ 1 = critical errors
+ 2 = significant informations
+ 3 = more verbose messages
+ Level 3 is useful for testing only, when only one device
+ is used at the same time. It also shows some more informations
+ about the hardware being detected. This module parameter can be
+ changed at runtime thanks to the /sys filesystem interface.
+Default: 2
+-------------------------------------------------------------------------------
+
+
+8. Optional device control through "sysfs"
+==========================================
+If the kernel has been compiled with the CONFIG_VIDEO_ADV_DEBUG option enabled,
+it is possible to read and write both the ET61X[12]51 and the image sensor
+registers by using the "sysfs" filesystem interface.
+
+There are four files in the /sys/class/video4linux/videoX directory for each
+registered camera: "reg", "val", "i2c_reg" and "i2c_val". The first two files
+control the ET61X[12]51 bridge, while the other two control the sensor chip.
+"reg" and "i2c_reg" hold the values of the current register index where the
+following reading/writing operations are addressed at through "val" and
+"i2c_val". Their use is not intended for end-users, unless you know what you
+are doing. Remember that you must be logged in as root before writing to them.
+
+As an example, suppose we were to want to read the value contained in the
+register number 1 of the sensor register table - which is usually the product
+identifier - of the camera registered as "/dev/video0":
+
+ [root@localhost #] cd /sys/class/video4linux/video0
+ [root@localhost #] echo 1 > i2c_reg
+ [root@localhost #] cat i2c_val
+
+Note that if the sensor registers cannot be read, "cat" will fail.
+To avoid race conditions, all the I/O accesses to the files are serialized.
+
+
+9. Supported devices
+====================
+None of the names of the companies as well as their products will be mentioned
+here. They have never collaborated with the author, so no advertising.
+
+From the point of view of a driver, what unambiguously identify a device are
+its vendor and product USB identifiers. Below is a list of known identifiers of
+devices mounting the ET61X[12]51 PC camera controllers:
+
+Vendor ID Product ID
+--------- ----------
+0x102c 0x6151
+0x102c 0x6251
+0x102c 0x6253
+0x102c 0x6254
+0x102c 0x6255
+0x102c 0x6256
+0x102c 0x6257
+0x102c 0x6258
+0x102c 0x6259
+0x102c 0x625a
+0x102c 0x625b
+0x102c 0x625c
+0x102c 0x625d
+0x102c 0x625e
+0x102c 0x625f
+0x102c 0x6260
+0x102c 0x6261
+0x102c 0x6262
+0x102c 0x6263
+0x102c 0x6264
+0x102c 0x6265
+0x102c 0x6266
+0x102c 0x6267
+0x102c 0x6268
+0x102c 0x6269
+
+The following image sensors are supported:
+
+Model Manufacturer
+----- ------------
+TAS5130D1B Taiwan Advanced Sensor Corporation
+
+All the available control settings of each image sensor are supported through
+the V4L2 interface.
+
+
+10. Notes for V4L2 application developers
+=========================================
+This driver follows the V4L2 API specifications. In particular, it enforces two
+rules:
+
+- exactly one I/O method, either "mmap" or "read", is associated with each
+file descriptor. Once it is selected, the application must close and reopen the
+device to switch to the other I/O method;
+
+- although it is not mandatory, previously mapped buffer memory should always
+be unmapped before calling any "VIDIOC_S_CROP" or "VIDIOC_S_FMT" ioctl's.
+The same number of buffers as before will be allocated again to match the size
+of the new video frames, so you have to map the buffers again before any I/O
+attempts on them.
+
+Consistently with the hardware limits, this driver also supports image
+downscaling with arbitrary scaling factors from 1 and 2 in both directions.
+However, the V4L2 API specifications don't correctly define how the scaling
+factor can be chosen arbitrarily by the "negotiation" of the "source" and
+"target" rectangles. To work around this flaw, we have added the convention
+that, during the negotiation, whenever the "VIDIOC_S_CROP" ioctl is issued, the
+scaling factor is restored to 1.
+
+This driver supports two different video formats: the first one is the "8-bit
+Sequential Bayer" format and can be used to obtain uncompressed video data
+from the device through the current I/O method, while the second one provides
+"raw" compressed video data (without frame headers not related to the
+compressed data). The current compression quality may vary from 0 to 1 and can
+be selected or queried thanks to the VIDIOC_S_JPEGCOMP and VIDIOC_G_JPEGCOMP
+V4L2 ioctl's.
+
+
+11. Contact information
+=======================
+The author may be contacted by e-mail at <luca.risolia@studio.unibo.it>.
+
+GPG/PGP encrypted e-mail's are accepted. The GPG key ID of the author is
+'FCE635A4'; the public 1024-bit key should be available at any keyserver;
+the fingerprint is: '88E8 F32F 7244 68BA 3958 5D40 99DA 5D2A FCE6 35A4'.
diff --git a/Documentation/video4linux/extract_xc3028.pl b/Documentation/video4linux/extract_xc3028.pl
new file mode 100644
index 0000000..2cb8160
--- /dev/null
+++ b/Documentation/video4linux/extract_xc3028.pl
@@ -0,0 +1,926 @@
+#!/usr/bin/perl
+
+# Copyright (c) Mauro Carvalho Chehab <mchehab@infradead.org>
+# Released under GPLv2
+#
+# In order to use, you need to:
+# 1) Download the windows driver with something like:
+# wget http://www.steventoth.net/linux/xc5000/HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip
+# 2) Extract the file hcw85bda.sys from the zip into the current dir:
+# unzip -j HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip Driver85/hcw85bda.sys
+# 3) run the script:
+# ./extract_xc3028.pl
+# 4) copy the generated file:
+# cp xc3028-v27.fw /lib/firmware
+
+#use strict;
+use IO::Handle;
+
+my $debug=0;
+
+sub verify ($$)
+{
+ my ($filename, $hash) = @_;
+ my ($testhash);
+
+ if (system("which md5sum > /dev/null 2>&1")) {
+ die "This firmware requires the md5sum command - see http://www.gnu.org/software/coreutils/\n";
+ }
+
+ open(CMD, "md5sum ".$filename."|");
+ $testhash = <CMD>;
+ $testhash =~ /([a-zA-Z0-9]*)/;
+ $testhash = $1;
+ close CMD;
+ die "Hash of extracted file does not match (found $testhash, expected $hash!\n" if ($testhash ne $hash);
+}
+
+sub get_hunk ($$)
+{
+ my ($offset, $length) = @_;
+ my ($chunklength, $buf, $rcount, $out);
+
+ sysseek(INFILE, $offset, SEEK_SET);
+ while ($length > 0) {
+ # Calc chunk size
+ $chunklength = 2048;
+ $chunklength = $length if ($chunklength > $length);
+
+ $rcount = sysread(INFILE, $buf, $chunklength);
+ die "Ran out of data\n" if ($rcount != $chunklength);
+ $out .= $buf;
+ $length -= $rcount;
+ }
+ return $out;
+}
+
+sub write_le16($)
+{
+ my $val = shift;
+ my $msb = ($val >> 8) &0xff;
+ my $lsb = $val & 0xff;
+
+ syswrite(OUTFILE, chr($lsb).chr($msb));
+}
+
+sub write_le32($)
+{
+ my $val = shift;
+ my $l3 = ($val >> 24) & 0xff;
+ my $l2 = ($val >> 16) & 0xff;
+ my $l1 = ($val >> 8) & 0xff;
+ my $l0 = $val & 0xff;
+
+ syswrite(OUTFILE, chr($l0).chr($l1).chr($l2).chr($l3));
+}
+
+sub write_le64($$)
+{
+ my $msb_val = shift;
+ my $lsb_val = shift;
+ my $l7 = ($msb_val >> 24) & 0xff;
+ my $l6 = ($msb_val >> 16) & 0xff;
+ my $l5 = ($msb_val >> 8) & 0xff;
+ my $l4 = $msb_val & 0xff;
+
+ my $l3 = ($lsb_val >> 24) & 0xff;
+ my $l2 = ($lsb_val >> 16) & 0xff;
+ my $l1 = ($lsb_val >> 8) & 0xff;
+ my $l0 = $lsb_val & 0xff;
+
+ syswrite(OUTFILE,
+ chr($l0).chr($l1).chr($l2).chr($l3).
+ chr($l4).chr($l5).chr($l6).chr($l7));
+}
+
+sub write_hunk($$)
+{
+ my ($offset, $length) = @_;
+ my $out = get_hunk($offset, $length);
+
+ printf "(len %d) ",$length if ($debug);
+
+ for (my $i=0;$i<$length;$i++) {
+ printf "%02x ",ord(substr($out,$i,1)) if ($debug);
+ }
+ printf "\n" if ($debug);
+
+ syswrite(OUTFILE, $out);
+}
+
+sub write_hunk_fix_endian($$)
+{
+ my ($offset, $length) = @_;
+ my $out = get_hunk($offset, $length);
+
+ printf "(len_fix %d) ",$length if ($debug);
+
+ for (my $i=0;$i<$length;$i++) {
+ printf "%02x ",ord(substr($out,$i,1)) if ($debug);
+ }
+ printf "\n" if ($debug);
+
+ my $i=0;
+ while ($i<$length) {
+ my $size = ord(substr($out,$i,1))*256+ord(substr($out,$i+1,1));
+ syswrite(OUTFILE, substr($out,$i+1,1));
+ syswrite(OUTFILE, substr($out,$i,1));
+ $i+=2;
+ if ($size>0 && $size <0x8000) {
+ for (my $j=0;$j<$size;$j++) {
+ syswrite(OUTFILE, substr($out,$j+$i,1));
+ }
+ $i+=$size;
+ }
+ }
+}
+
+sub main_firmware($$$$)
+{
+ my $out;
+ my $j=0;
+ my $outfile = shift;
+ my $name = shift;
+ my $version = shift;
+ my $nr_desc = shift;
+
+ for ($j = length($name); $j <32; $j++) {
+ $name = $name.chr(0);
+}
+
+ open OUTFILE, ">$outfile";
+ syswrite(OUTFILE, $name);
+ write_le16($version);
+ write_le16($nr_desc);
+
+ #
+ # Firmware 0, type: BASE FW F8MHZ (0x00000003), id: (0000000000000000), size: 8718
+ #
+
+ write_le32(0x00000003); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(8718); # Size
+ write_hunk_fix_endian(813432, 8718);
+
+ #
+ # Firmware 1, type: BASE FW F8MHZ MTS (0x00000007), id: (0000000000000000), size: 8712
+ #
+
+ write_le32(0x00000007); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(8712); # Size
+ write_hunk_fix_endian(822152, 8712);
+
+ #
+ # Firmware 2, type: BASE FW FM (0x00000401), id: (0000000000000000), size: 8562
+ #
+
+ write_le32(0x00000401); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(8562); # Size
+ write_hunk_fix_endian(830872, 8562);
+
+ #
+ # Firmware 3, type: BASE FW FM INPUT1 (0x00000c01), id: (0000000000000000), size: 8576
+ #
+
+ write_le32(0x00000c01); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(8576); # Size
+ write_hunk_fix_endian(839440, 8576);
+
+ #
+ # Firmware 4, type: BASE FW (0x00000001), id: (0000000000000000), size: 8706
+ #
+
+ write_le32(0x00000001); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(8706); # Size
+ write_hunk_fix_endian(848024, 8706);
+
+ #
+ # Firmware 5, type: BASE FW MTS (0x00000005), id: (0000000000000000), size: 8682
+ #
+
+ write_le32(0x00000005); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(8682); # Size
+ write_hunk_fix_endian(856736, 8682);
+
+ #
+ # Firmware 6, type: STD FW (0x00000000), id: PAL/BG A2/A (0000000100000007), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000001, 0x00000007); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(865424, 161);
+
+ #
+ # Firmware 7, type: STD FW MTS (0x00000004), id: PAL/BG A2/A (0000000100000007), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000001, 0x00000007); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(865592, 169);
+
+ #
+ # Firmware 8, type: STD FW (0x00000000), id: PAL/BG A2/B (0000000200000007), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000002, 0x00000007); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(865424, 161);
+
+ #
+ # Firmware 9, type: STD FW MTS (0x00000004), id: PAL/BG A2/B (0000000200000007), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000002, 0x00000007); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(865592, 169);
+
+ #
+ # Firmware 10, type: STD FW (0x00000000), id: PAL/BG NICAM/A (0000000400000007), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000004, 0x00000007); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(866112, 161);
+
+ #
+ # Firmware 11, type: STD FW MTS (0x00000004), id: PAL/BG NICAM/A (0000000400000007), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000004, 0x00000007); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(866280, 169);
+
+ #
+ # Firmware 12, type: STD FW (0x00000000), id: PAL/BG NICAM/B (0000000800000007), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000008, 0x00000007); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(866112, 161);
+
+ #
+ # Firmware 13, type: STD FW MTS (0x00000004), id: PAL/BG NICAM/B (0000000800000007), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000008, 0x00000007); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(866280, 169);
+
+ #
+ # Firmware 14, type: STD FW (0x00000000), id: PAL/DK A2 (00000003000000e0), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000003, 0x000000e0); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(866800, 161);
+
+ #
+ # Firmware 15, type: STD FW MTS (0x00000004), id: PAL/DK A2 (00000003000000e0), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000003, 0x000000e0); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(866968, 169);
+
+ #
+ # Firmware 16, type: STD FW (0x00000000), id: PAL/DK NICAM (0000000c000000e0), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x0000000c, 0x000000e0); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(867144, 161);
+
+ #
+ # Firmware 17, type: STD FW MTS (0x00000004), id: PAL/DK NICAM (0000000c000000e0), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x0000000c, 0x000000e0); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(867312, 169);
+
+ #
+ # Firmware 18, type: STD FW (0x00000000), id: SECAM/K1 (0000000000200000), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000000, 0x00200000); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(867488, 161);
+
+ #
+ # Firmware 19, type: STD FW MTS (0x00000004), id: SECAM/K1 (0000000000200000), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000000, 0x00200000); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(867656, 169);
+
+ #
+ # Firmware 20, type: STD FW (0x00000000), id: SECAM/K3 (0000000004000000), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000000, 0x04000000); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(867832, 161);
+
+ #
+ # Firmware 21, type: STD FW MTS (0x00000004), id: SECAM/K3 (0000000004000000), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000000, 0x04000000); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(868000, 169);
+
+ #
+ # Firmware 22, type: STD FW D2633 DTV6 ATSC (0x00010030), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00010030); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(868176, 149);
+
+ #
+ # Firmware 23, type: STD FW D2620 DTV6 QAM (0x00000068), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00000068); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(868336, 149);
+
+ #
+ # Firmware 24, type: STD FW D2633 DTV6 QAM (0x00000070), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00000070); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(868488, 149);
+
+ #
+ # Firmware 25, type: STD FW D2620 DTV7 (0x00000088), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00000088); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(868648, 149);
+
+ #
+ # Firmware 26, type: STD FW D2633 DTV7 (0x00000090), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00000090); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(868800, 149);
+
+ #
+ # Firmware 27, type: STD FW D2620 DTV78 (0x00000108), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00000108); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(868960, 149);
+
+ #
+ # Firmware 28, type: STD FW D2633 DTV78 (0x00000110), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00000110); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(869112, 149);
+
+ #
+ # Firmware 29, type: STD FW D2620 DTV8 (0x00000208), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00000208); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(868648, 149);
+
+ #
+ # Firmware 30, type: STD FW D2633 DTV8 (0x00000210), id: (0000000000000000), size: 149
+ #
+
+ write_le32(0x00000210); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(149); # Size
+ write_hunk_fix_endian(868800, 149);
+
+ #
+ # Firmware 31, type: STD FW FM (0x00000400), id: (0000000000000000), size: 135
+ #
+
+ write_le32(0x00000400); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le32(135); # Size
+ write_hunk_fix_endian(869584, 135);
+
+ #
+ # Firmware 32, type: STD FW (0x00000000), id: PAL/I (0000000000000010), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000000, 0x00000010); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(869728, 161);
+
+ #
+ # Firmware 33, type: STD FW MTS (0x00000004), id: PAL/I (0000000000000010), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000000, 0x00000010); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(869896, 169);
+
+ #
+ # Firmware 34, type: STD FW (0x00000000), id: SECAM/L AM (0000001000400000), size: 169
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000010, 0x00400000); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(870072, 169);
+
+ #
+ # Firmware 35, type: STD FW (0x00000000), id: SECAM/L NICAM (0000000c00400000), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x0000000c, 0x00400000); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(870248, 161);
+
+ #
+ # Firmware 36, type: STD FW (0x00000000), id: SECAM/Lc (0000000000800000), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000000, 0x00800000); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(870416, 161);
+
+ #
+ # Firmware 37, type: STD FW (0x00000000), id: NTSC/M Kr (0000000000008000), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000000, 0x00008000); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(870584, 161);
+
+ #
+ # Firmware 38, type: STD FW LCD (0x00001000), id: NTSC/M Kr (0000000000008000), size: 161
+ #
+
+ write_le32(0x00001000); # Type
+ write_le64(0x00000000, 0x00008000); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(870752, 161);
+
+ #
+ # Firmware 39, type: STD FW LCD NOGD (0x00003000), id: NTSC/M Kr (0000000000008000), size: 161
+ #
+
+ write_le32(0x00003000); # Type
+ write_le64(0x00000000, 0x00008000); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(870920, 161);
+
+ #
+ # Firmware 40, type: STD FW MTS (0x00000004), id: NTSC/M Kr (0000000000008000), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000000, 0x00008000); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(871088, 169);
+
+ #
+ # Firmware 41, type: STD FW (0x00000000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000000, 0x0000b700); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(871264, 161);
+
+ #
+ # Firmware 42, type: STD FW LCD (0x00001000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
+ #
+
+ write_le32(0x00001000); # Type
+ write_le64(0x00000000, 0x0000b700); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(871432, 161);
+
+ #
+ # Firmware 43, type: STD FW LCD NOGD (0x00003000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
+ #
+
+ write_le32(0x00003000); # Type
+ write_le64(0x00000000, 0x0000b700); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(871600, 161);
+
+ #
+ # Firmware 44, type: STD FW (0x00000000), id: NTSC/M Jp (0000000000002000), size: 161
+ #
+
+ write_le32(0x00000000); # Type
+ write_le64(0x00000000, 0x00002000); # ID
+ write_le32(161); # Size
+ write_hunk_fix_endian(871264, 161);
+
+ #
+ # Firmware 45, type: STD FW MTS (0x00000004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
+ #
+
+ write_le32(0x00000004); # Type
+ write_le64(0x00000000, 0x0000b700); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(871936, 169);
+
+ #
+ # Firmware 46, type: STD FW MTS LCD (0x00001004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
+ #
+
+ write_le32(0x00001004); # Type
+ write_le64(0x00000000, 0x0000b700); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(872112, 169);
+
+ #
+ # Firmware 47, type: STD FW MTS LCD NOGD (0x00003004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
+ #
+
+ write_le32(0x00003004); # Type
+ write_le64(0x00000000, 0x0000b700); # ID
+ write_le32(169); # Size
+ write_hunk_fix_endian(872288, 169);
+
+ #
+ # Firmware 48, type: SCODE FW HAS IF (0x60000000), IF = 3.28 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(3280); # IF
+ write_le32(192); # Size
+ write_hunk(811896, 192);
+
+ #
+ # Firmware 49, type: SCODE FW HAS IF (0x60000000), IF = 3.30 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(3300); # IF
+ write_le32(192); # Size
+ write_hunk(813048, 192);
+
+ #
+ # Firmware 50, type: SCODE FW HAS IF (0x60000000), IF = 3.44 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(3440); # IF
+ write_le32(192); # Size
+ write_hunk(812280, 192);
+
+ #
+ # Firmware 51, type: SCODE FW HAS IF (0x60000000), IF = 3.46 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(3460); # IF
+ write_le32(192); # Size
+ write_hunk(812472, 192);
+
+ #
+ # Firmware 52, type: SCODE FW DTV6 ATSC OREN36 HAS IF (0x60210020), IF = 3.80 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60210020); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(3800); # IF
+ write_le32(192); # Size
+ write_hunk(809784, 192);
+
+ #
+ # Firmware 53, type: SCODE FW HAS IF (0x60000000), IF = 4.00 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(4000); # IF
+ write_le32(192); # Size
+ write_hunk(812088, 192);
+
+ #
+ # Firmware 54, type: SCODE FW DTV6 ATSC TOYOTA388 HAS IF (0x60410020), IF = 4.08 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60410020); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(4080); # IF
+ write_le32(192); # Size
+ write_hunk(809976, 192);
+
+ #
+ # Firmware 55, type: SCODE FW HAS IF (0x60000000), IF = 4.20 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(4200); # IF
+ write_le32(192); # Size
+ write_hunk(811704, 192);
+
+ #
+ # Firmware 56, type: SCODE FW MONO HAS IF (0x60008000), IF = 4.32 MHz id: NTSC/M Kr (0000000000008000), size: 192
+ #
+
+ write_le32(0x60008000); # Type
+ write_le64(0x00000000, 0x00008000); # ID
+ write_le16(4320); # IF
+ write_le32(192); # Size
+ write_hunk(808056, 192);
+
+ #
+ # Firmware 57, type: SCODE FW HAS IF (0x60000000), IF = 4.45 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(4450); # IF
+ write_le32(192); # Size
+ write_hunk(812664, 192);
+
+ #
+ # Firmware 58, type: SCODE FW MTS LCD NOGD MONO IF HAS IF (0x6002b004), IF = 4.50 MHz id: NTSC PAL/M PAL/N (000000000000b700), size: 192
+ #
+
+ write_le32(0x6002b004); # Type
+ write_le64(0x00000000, 0x0000b700); # ID
+ write_le16(4500); # IF
+ write_le32(192); # Size
+ write_hunk(807672, 192);
+
+ #
+ # Firmware 59, type: SCODE FW LCD NOGD IF HAS IF (0x60023000), IF = 4.60 MHz id: NTSC/M Kr (0000000000008000), size: 192
+ #
+
+ write_le32(0x60023000); # Type
+ write_le64(0x00000000, 0x00008000); # ID
+ write_le16(4600); # IF
+ write_le32(192); # Size
+ write_hunk(807864, 192);
+
+ #
+ # Firmware 60, type: SCODE FW DTV6 QAM DTV7 DTV78 DTV8 ZARLINK456 HAS IF (0x620003e0), IF = 4.76 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x620003e0); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(4760); # IF
+ write_le32(192); # Size
+ write_hunk(807288, 192);
+
+ #
+ # Firmware 61, type: SCODE FW HAS IF (0x60000000), IF = 4.94 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(4940); # IF
+ write_le32(192); # Size
+ write_hunk(811512, 192);
+
+ #
+ # Firmware 62, type: SCODE FW HAS IF (0x60000000), IF = 5.26 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(5260); # IF
+ write_le32(192); # Size
+ write_hunk(810552, 192);
+
+ #
+ # Firmware 63, type: SCODE FW MONO HAS IF (0x60008000), IF = 5.32 MHz id: PAL/BG A2 NICAM (0000000f00000007), size: 192
+ #
+
+ write_le32(0x60008000); # Type
+ write_le64(0x0000000f, 0x00000007); # ID
+ write_le16(5320); # IF
+ write_le32(192); # Size
+ write_hunk(810744, 192);
+
+ #
+ # Firmware 64, type: SCODE FW DTV7 DTV78 DTV8 DIBCOM52 CHINA HAS IF (0x65000380), IF = 5.40 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x65000380); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(5400); # IF
+ write_le32(192); # Size
+ write_hunk(807096, 192);
+
+ #
+ # Firmware 65, type: SCODE FW DTV6 ATSC OREN538 HAS IF (0x60110020), IF = 5.58 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60110020); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(5580); # IF
+ write_le32(192); # Size
+ write_hunk(809592, 192);
+
+ #
+ # Firmware 66, type: SCODE FW HAS IF (0x60000000), IF = 5.64 MHz id: PAL/BG A2 (0000000300000007), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000003, 0x00000007); # ID
+ write_le16(5640); # IF
+ write_le32(192); # Size
+ write_hunk(808440, 192);
+
+ #
+ # Firmware 67, type: SCODE FW HAS IF (0x60000000), IF = 5.74 MHz id: PAL/BG NICAM (0000000c00000007), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x0000000c, 0x00000007); # ID
+ write_le16(5740); # IF
+ write_le32(192); # Size
+ write_hunk(808632, 192);
+
+ #
+ # Firmware 68, type: SCODE FW HAS IF (0x60000000), IF = 5.90 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(5900); # IF
+ write_le32(192); # Size
+ write_hunk(810360, 192);
+
+ #
+ # Firmware 69, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.00 MHz id: PAL/DK PAL/I SECAM/K3 SECAM/L SECAM/Lc NICAM (0000000c04c000f0), size: 192
+ #
+
+ write_le32(0x60008000); # Type
+ write_le64(0x0000000c, 0x04c000f0); # ID
+ write_le16(6000); # IF
+ write_le32(192); # Size
+ write_hunk(808824, 192);
+
+ #
+ # Firmware 70, type: SCODE FW DTV6 QAM ATSC LG60 F6MHZ HAS IF (0x68050060), IF = 6.20 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x68050060); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(6200); # IF
+ write_le32(192); # Size
+ write_hunk(809400, 192);
+
+ #
+ # Firmware 71, type: SCODE FW HAS IF (0x60000000), IF = 6.24 MHz id: PAL/I (0000000000000010), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000010); # ID
+ write_le16(6240); # IF
+ write_le32(192); # Size
+ write_hunk(808248, 192);
+
+ #
+ # Firmware 72, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.32 MHz id: SECAM/K1 (0000000000200000), size: 192
+ #
+
+ write_le32(0x60008000); # Type
+ write_le64(0x00000000, 0x00200000); # ID
+ write_le16(6320); # IF
+ write_le32(192); # Size
+ write_hunk(811320, 192);
+
+ #
+ # Firmware 73, type: SCODE FW HAS IF (0x60000000), IF = 6.34 MHz id: SECAM/K1 (0000000000200000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00200000); # ID
+ write_le16(6340); # IF
+ write_le32(192); # Size
+ write_hunk(809208, 192);
+
+ #
+ # Firmware 74, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.50 MHz id: PAL/DK SECAM/K3 SECAM/L NICAM (0000000c044000e0), size: 192
+ #
+
+ write_le32(0x60008000); # Type
+ write_le64(0x0000000c, 0x044000e0); # ID
+ write_le16(6500); # IF
+ write_le32(192); # Size
+ write_hunk(811128, 192);
+
+ #
+ # Firmware 75, type: SCODE FW DTV6 ATSC ATI638 HAS IF (0x60090020), IF = 6.58 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60090020); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(6580); # IF
+ write_le32(192); # Size
+ write_hunk(807480, 192);
+
+ #
+ # Firmware 76, type: SCODE FW HAS IF (0x60000000), IF = 6.60 MHz id: PAL/DK A2 (00000003000000e0), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000003, 0x000000e0); # ID
+ write_le16(6600); # IF
+ write_le32(192); # Size
+ write_hunk(809016, 192);
+
+ #
+ # Firmware 77, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.68 MHz id: PAL/DK A2 (00000003000000e0), size: 192
+ #
+
+ write_le32(0x60008000); # Type
+ write_le64(0x00000003, 0x000000e0); # ID
+ write_le16(6680); # IF
+ write_le32(192); # Size
+ write_hunk(810936, 192);
+
+ #
+ # Firmware 78, type: SCODE FW DTV6 ATSC TOYOTA794 HAS IF (0x60810020), IF = 8.14 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60810020); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(8140); # IF
+ write_le32(192); # Size
+ write_hunk(810168, 192);
+
+ #
+ # Firmware 79, type: SCODE FW HAS IF (0x60000000), IF = 8.20 MHz id: (0000000000000000), size: 192
+ #
+
+ write_le32(0x60000000); # Type
+ write_le64(0x00000000, 0x00000000); # ID
+ write_le16(8200); # IF
+ write_le32(192); # Size
+ write_hunk(812856, 192);
+}
+
+sub extract_firmware {
+ my $sourcefile = "hcw85bda.sys";
+ my $hash = "0e44dbf63bb0169d57446aec21881ff2";
+ my $outfile = "xc3028-v27.fw";
+ my $name = "xc2028 firmware";
+ my $version = 519;
+ my $nr_desc = 80;
+ my $out;
+
+ verify($sourcefile, $hash);
+
+ open INFILE, "<$sourcefile";
+ main_firmware($outfile, $name, $version, $nr_desc);
+ close INFILE;
+}
+
+extract_firmware;
+printf "Firmwares generated.\n";
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
new file mode 100644
index 0000000..004818f
--- /dev/null
+++ b/Documentation/video4linux/gspca.txt
@@ -0,0 +1,274 @@
+List of the webcams known by gspca.
+
+The modules are:
+ gspca_main main driver
+ gspca_xxxx subdriver module with xxxx as follows
+
+xxxx vend:prod
+----
+spca501 0000:0000 MystFromOri Unknow Camera
+m5602 0402:5602 ALi Video Camera Controller
+spca501 040a:0002 Kodak DVC-325
+spca500 040a:0300 Kodak EZ200
+zc3xx 041e:041e Creative WebCam Live!
+spca500 041e:400a Creative PC-CAM 300
+sunplus 041e:400b Creative PC-CAM 600
+sunplus 041e:4012 PC-Cam350
+sunplus 041e:4013 Creative Pccam750
+zc3xx 041e:4017 Creative Webcam Mobile PD1090
+spca508 041e:4018 Creative Webcam Vista (PD1100)
+spca561 041e:401a Creative Webcam Vista (PD1100)
+zc3xx 041e:401c Creative NX
+spca505 041e:401d Creative Webcam NX ULTRA
+zc3xx 041e:401e Creative Nx Pro
+zc3xx 041e:401f Creative Webcam Notebook PD1171
+pac207 041e:4028 Creative Webcam Vista Plus
+zc3xx 041e:4029 Creative WebCam Vista Pro
+zc3xx 041e:4034 Creative Instant P0620
+zc3xx 041e:4035 Creative Instant P0620D
+zc3xx 041e:4036 Creative Live !
+zc3xx 041e:403a Creative Nx Pro 2
+spca561 041e:403b Creative Webcam Vista (VF0010)
+zc3xx 041e:4051 Creative Live!Cam Notebook Pro (VF0250)
+ov519 041e:4052 Creative Live! VISTA IM
+zc3xx 041e:4053 Creative Live!Cam Video IM
+ov519 041e:405f Creative Live! VISTA VF0330
+ov519 041e:4060 Creative Live! VISTA VF0350
+ov519 041e:4061 Creative Live! VISTA VF0400
+ov519 041e:4064 Creative Live! VISTA VF0420
+ov519 041e:4068 Creative Live! VISTA VF0470
+spca561 0458:7004 Genius VideoCAM Express V2
+sunplus 0458:7006 Genius Dsc 1.3 Smart
+zc3xx 0458:7007 Genius VideoCam V2
+zc3xx 0458:700c Genius VideoCam V3
+zc3xx 0458:700f Genius VideoCam Web V2
+sonixj 0458:7025 Genius Eye 311Q
+sonixj 0458:702e Genius Slim 310 NB
+sonixj 045e:00f5 MicroSoft VX3000
+sonixj 045e:00f7 MicroSoft VX1000
+ov519 045e:028c Micro$oft xbox cam
+spca508 0461:0815 Micro Innovation IC200
+sunplus 0461:0821 Fujifilm MV-1
+zc3xx 0461:0a00 MicroInnovation WebCam320
+spca500 046d:0890 Logitech QuickCam traveler
+vc032x 046d:0892 Logitech Orbicam
+vc032x 046d:0896 Logitech Orbicam
+zc3xx 046d:08a0 Logitech QC IM
+zc3xx 046d:08a1 Logitech QC IM 0x08A1 +sound
+zc3xx 046d:08a2 Labtec Webcam Pro
+zc3xx 046d:08a3 Logitech QC Chat
+zc3xx 046d:08a6 Logitech QCim
+zc3xx 046d:08a7 Logitech QuickCam Image
+zc3xx 046d:08a9 Logitech Notebook Deluxe
+zc3xx 046d:08aa Labtec Webcam Notebook
+zc3xx 046d:08ac Logitech QuickCam Cool
+zc3xx 046d:08ad Logitech QCCommunicate STX
+zc3xx 046d:08ae Logitech QuickCam for Notebooks
+zc3xx 046d:08af Logitech QuickCam Cool
+zc3xx 046d:08b9 Logitech QC IM ???
+zc3xx 046d:08d7 Logitech QCam STX
+zc3xx 046d:08d9 Logitech QuickCam IM/Connect
+zc3xx 046d:08d8 Logitech Notebook Deluxe
+zc3xx 046d:08da Logitech QuickCam Messenger
+zc3xx 046d:08dd Logitech QuickCam for Notebooks
+spca500 046d:0900 Logitech Inc. ClickSmart 310
+spca500 046d:0901 Logitech Inc. ClickSmart 510
+sunplus 046d:0905 Logitech ClickSmart 820
+tv8532 046d:0920 QC Express
+tv8532 046d:0921 Labtec Webcam
+spca561 046d:0928 Logitech QC Express Etch2
+spca561 046d:0929 Labtec Webcam Elch2
+spca561 046d:092a Logitech QC for Notebook
+spca561 046d:092b Labtec Webcam Plus
+spca561 046d:092c Logitech QC chat Elch2
+spca561 046d:092d Logitech QC Elch2
+spca561 046d:092e Logitech QC Elch2
+spca561 046d:092f Logitech QuickCam Express Plus
+sunplus 046d:0960 Logitech ClickSmart 420
+sunplus 0471:0322 Philips DMVC1300K
+zc3xx 0471:0325 Philips SPC 200 NC
+zc3xx 0471:0326 Philips SPC 300 NC
+sonixj 0471:0327 Philips SPC 600 NC
+sonixj 0471:0328 Philips SPC 700 NC
+zc3xx 0471:032d Philips SPC 210 NC
+zc3xx 0471:032e Philips SPC 315 NC
+sonixj 0471:0330 Philips SPC 710 NC
+spca501 0497:c001 Smile International
+sunplus 04a5:3003 Benq DC 1300
+sunplus 04a5:3008 Benq DC 1500
+sunplus 04a5:300a Benq DC 3410
+spca500 04a5:300c Benq DC 1016
+finepix 04cb:0104 Fujifilm FinePix 4800
+finepix 04cb:0109 Fujifilm FinePix A202
+finepix 04cb:010b Fujifilm FinePix A203
+finepix 04cb:010f Fujifilm FinePix A204
+finepix 04cb:0111 Fujifilm FinePix A205
+finepix 04cb:0113 Fujifilm FinePix A210
+finepix 04cb:0115 Fujifilm FinePix A303
+finepix 04cb:0117 Fujifilm FinePix A310
+finepix 04cb:0119 Fujifilm FinePix F401
+finepix 04cb:011b Fujifilm FinePix F402
+finepix 04cb:011d Fujifilm FinePix F410
+finepix 04cb:0121 Fujifilm FinePix F601
+finepix 04cb:0123 Fujifilm FinePix F700
+finepix 04cb:0125 Fujifilm FinePix M603
+finepix 04cb:0127 Fujifilm FinePix S300
+finepix 04cb:0129 Fujifilm FinePix S304
+finepix 04cb:012b Fujifilm FinePix S500
+finepix 04cb:012d Fujifilm FinePix S602
+finepix 04cb:012f Fujifilm FinePix S700
+finepix 04cb:0131 Fujifilm FinePix unknown model
+finepix 04cb:013b Fujifilm FinePix unknown model
+finepix 04cb:013d Fujifilm FinePix unknown model
+finepix 04cb:013f Fujifilm FinePix F420
+sunplus 04f1:1001 JVC GC A50
+spca561 04fc:0561 Flexcam 100
+sunplus 04fc:500c Sunplus CA500C
+sunplus 04fc:504a Aiptek Mini PenCam 1.3
+sunplus 04fc:504b Maxell MaxPocket LE 1.3
+sunplus 04fc:5330 Digitrex 2110
+sunplus 04fc:5360 Sunplus Generic
+spca500 04fc:7333 PalmPixDC85
+sunplus 04fc:ffff Pure DigitalDakota
+spca501 0506:00df 3Com HomeConnect Lite
+sunplus 052b:1513 Megapix V4
+tv8532 0545:808b Veo Stingray
+tv8532 0545:8333 Veo Stingray
+sunplus 0546:3155 Polaroid PDC3070
+sunplus 0546:3191 Polaroid Ion 80
+sunplus 0546:3273 Polaroid PDC2030
+ov519 054c:0154 Sonny toy4
+ov519 054c:0155 Sonny toy5
+zc3xx 055f:c005 Mustek Wcam300A
+spca500 055f:c200 Mustek Gsmart 300
+sunplus 055f:c211 Kowa Bs888e Microcamera
+spca500 055f:c220 Gsmart Mini
+sunplus 055f:c230 Mustek Digicam 330K
+sunplus 055f:c232 Mustek MDC3500
+sunplus 055f:c360 Mustek DV4000 Mpeg4
+sunplus 055f:c420 Mustek gSmart Mini 2
+sunplus 055f:c430 Mustek Gsmart LCD 2
+sunplus 055f:c440 Mustek DV 3000
+sunplus 055f:c520 Mustek gSmart Mini 3
+sunplus 055f:c530 Mustek Gsmart LCD 3
+sunplus 055f:c540 Gsmart D30
+sunplus 055f:c630 Mustek MDC4000
+sunplus 055f:c650 Mustek MDC5500Z
+zc3xx 055f:d003 Mustek WCam300A
+zc3xx 055f:d004 Mustek WCam300 AN
+conex 0572:0041 Creative Notebook cx11646
+ov519 05a9:0519 OmniVision
+ov519 05a9:0530 OmniVision
+ov519 05a9:4519 OmniVision
+ov519 05a9:8519 OmniVision
+sunplus 05da:1018 Digital Dream Enigma 1.3
+stk014 05e1:0893 Syntek DV4000
+spca561 060b:a001 Maxell Compact Pc PM3
+zc3xx 0698:2003 CTX M730V built in
+spca500 06bd:0404 Agfa CL20
+spca500 06be:0800 Optimedia
+sunplus 06d6:0031 Trust 610 LCD PowerC@m Zoom
+spca506 06e1:a190 ADS Instant VCD
+spca508 0733:0110 ViewQuest VQ110
+spca508 0130:0130 Clone Digital Webcam 11043
+spca501 0733:0401 Intel Create and Share
+spca501 0733:0402 ViewQuest M318B
+spca505 0733:0430 Intel PC Camera Pro
+sunplus 0733:1311 Digital Dream Epsilon 1.3
+sunplus 0733:1314 Mercury 2.1MEG Deluxe Classic Cam
+sunplus 0733:2211 Jenoptik jdc 21 LCD
+sunplus 0733:2221 Mercury Digital Pro 3.1p
+sunplus 0733:3261 Concord 3045 spca536a
+sunplus 0733:3281 Cyberpix S550V
+spca506 0734:043b 3DeMon USB Capture aka
+spca500 084d:0003 D-Link DSC-350
+spca500 08ca:0103 Aiptek PocketDV
+sunplus 08ca:0104 Aiptek PocketDVII 1.3
+sunplus 08ca:0106 Aiptek Pocket DV3100+
+sunplus 08ca:2008 Aiptek Mini PenCam 2 M
+sunplus 08ca:2010 Aiptek PocketCam 3M
+sunplus 08ca:2016 Aiptek PocketCam 2 Mega
+sunplus 08ca:2018 Aiptek Pencam SD 2M
+sunplus 08ca:2020 Aiptek Slim 3000F
+sunplus 08ca:2022 Aiptek Slim 3200
+sunplus 08ca:2024 Aiptek DV3500 Mpeg4
+sunplus 08ca:2028 Aiptek PocketCam4M
+sunplus 08ca:2040 Aiptek PocketDV4100M
+sunplus 08ca:2042 Aiptek PocketDV5100
+sunplus 08ca:2050 Medion MD 41437
+sunplus 08ca:2060 Aiptek PocketDV5300
+tv8532 0923:010f ICM532 cams
+mars 093a:050f Mars-Semi Pc-Camera
+pac207 093a:2460 PAC207 Qtec Webcam 100
+pac207 093a:2463 Philips SPC 220 NC
+pac207 093a:2464 Labtec Webcam 1200
+pac207 093a:2468 PAC207
+pac207 093a:2470 Genius GF112
+pac207 093a:2471 Genius VideoCam ge111
+pac207 093a:2472 Genius VideoCam ge110
+pac207 093a:2476 Genius e-Messenger 112
+pac7311 093a:2600 PAC7311 Typhoon
+pac7311 093a:2601 Philips SPC 610 NC
+pac7311 093a:2603 PAC7312
+pac7311 093a:2608 Trust WB-3300p
+pac7311 093a:260e Gigaware VGA PC Camera, Trust WB-3350p, SIGMA cam 2350
+pac7311 093a:260f SnakeCam
+pac7311 093a:2621 PAC731x
+pac7311 093a:2624 PAC7302
+pac7311 093a:2626 Labtec 2200
+pac7311 093a:262a Webcam 300k
+zc3xx 0ac8:0302 Z-star Vimicro zc0302
+vc032x 0ac8:0321 Vimicro generic vc0321
+vc032x 0ac8:0323 Vimicro Vc0323
+vc032x 0ac8:0328 A4Tech PK-130MG
+zc3xx 0ac8:301b Z-Star zc301b
+zc3xx 0ac8:303b Vimicro 0x303b
+zc3xx 0ac8:305b Z-star Vimicro zc0305b
+zc3xx 0ac8:307b Ldlc VC302+Ov7620
+vc032x 0ac8:c001 Sony embedded vimicro
+vc032x 0ac8:c002 Sony embedded vimicro
+spca508 0af9:0010 Hama USB Sightcam 100
+spca508 0af9:0011 Hama USB Sightcam 100
+sonixb 0c45:6001 Genius VideoCAM NB
+sonixb 0c45:6005 Microdia Sweex Mini Webcam
+sonixb 0c45:6007 Sonix sn9c101 + Tas5110D
+sonixb 0c45:6009 spcaCam@120
+sonixb 0c45:600d spcaCam@120
+sonixb 0c45:6011 Microdia PC Camera (SN9C102)
+sonixb 0c45:6019 Generic Sonix OV7630
+sonixb 0c45:6024 Generic Sonix Tas5130c
+sonixb 0c45:6025 Xcam Shanga
+sonixb 0c45:6028 Sonix Btc Pc380
+sonixb 0c45:6029 spcaCam@150
+sonixb 0c45:602c Generic Sonix OV7630
+sonixb 0c45:602d LIC-200 LG
+sonixb 0c45:602e Genius VideoCam Messenger
+sonixj 0c45:6040 Speed NVC 350K
+sonixj 0c45:607c Sonix sn9c102p Hv7131R
+sonixj 0c45:60c0 Sangha Sn535
+sonixj 0c45:60ec SN9C105+MO4000
+sonixj 0c45:60fb Surfer NoName
+sonixj 0c45:60fc LG-LIC300
+sonixj 0c45:6128 Microdia/Sonix SNP325
+sonixj 0c45:612a Avant Camera
+sonixj 0c45:612c Typhoon Rasy Cam 1.3MPix
+sonixj 0c45:6130 Sonix Pccam
+sonixj 0c45:6138 Sn9c120 Mo4000
+sonixj 0c45:613b Surfer SN-206
+sonixj 0c45:613c Sonix Pccam168
+sonixj 0c45:6143 Sonix Pccam168
+sunplus 0d64:0303 Sunplus FashionCam DXG
+etoms 102c:6151 Qcam Sangha CIF
+etoms 102c:6251 Qcam xxxxxx VGA
+zc3xx 10fd:0128 Typhoon Webshot II USB 300k 0x0128
+spca561 10fd:7e50 FlyCam Usb 100
+zc3xx 10fd:8050 Typhoon Webshot II USB 300k
+spca501 1776:501c Arowana 300K CMOS Camera
+t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops
+vc032x 17ef:4802 Lenovo Vc0323+MI1310_SOC
+pac207 2001:f115 D-Link DSB-C120
+spca500 2899:012c Toptro Industrial
+spca508 8086:0110 Intel Easy PC Camera
+spca500 8086:0630 Intel Pocket PC Camera
+spca506 99fa:8988 Grandtec V.cap
+spca561 abcd:cdee Petcam
diff --git a/Documentation/video4linux/hauppauge-wintv-cx88-ir.txt b/Documentation/video4linux/hauppauge-wintv-cx88-ir.txt
new file mode 100644
index 0000000..faccee6
--- /dev/null
+++ b/Documentation/video4linux/hauppauge-wintv-cx88-ir.txt
@@ -0,0 +1,54 @@
+The controls for the mux are GPIO [0,1] for source, and GPIO 2 for muting.
+
+GPIO0 GPIO1
+ 0 0 TV Audio
+ 1 0 FM radio
+ 0 1 Line-In
+ 1 1 Mono tuner bypass or CD passthru (tuner specific)
+
+GPIO 16(i believe) is tied to the IR port (if present).
+
+------------------------------------------------------------------------------------
+
+>From the data sheet:
+ Register 24'h20004 PCI Interrupt Status
+ bit [18] IR_SMP_INT Set when 32 input samples have been collected over
+ gpio[16] pin into GP_SAMPLE register.
+
+What's missing from the data sheet:
+
+Setup 4KHz sampling rate (roughly 2x oversampled; good enough for our RC5
+compat remote)
+set register 0x35C050 to 0xa80a80
+
+enable sampling
+set register 0x35C054 to 0x5
+
+Of course, enable the IRQ bit 18 in the interrupt mask register .(and
+provide for a handler)
+
+GP_SAMPLE register is at 0x35C058
+
+Bits are then right shifted into the GP_SAMPLE register at the specified
+rate; you get an interrupt when a full DWORD is received.
+You need to recover the actual RC5 bits out of the (oversampled) IR sensor
+bits. (Hint: look for the 0/1and 1/0 crossings of the RC5 bi-phase data) An
+actual raw RC5 code will span 2-3 DWORDS, depending on the actual alignment.
+
+I'm pretty sure when no IR signal is present the receiver is always in a
+marking state(1); but stray light, etc can cause intermittent noise values
+as well. Remember, this is a free running sample of the IR receiver state
+over time, so don't assume any sample starts at any particular place.
+
+http://www.atmel.com/dyn/resources/prod_documents/doc2817.pdf
+This data sheet (google search) seems to have a lovely description of the
+RC5 basics
+
+http://users.pandora.be/nenya/electronics/rc5/ and more data
+
+http://www.ee.washington.edu/circuit_archive/text/ir_decode.txt
+and even a reference to how to decode a bi-phase data stream.
+
+http://www.xs4all.nl/~sbp/knowledge/ir/rc5.htm
+still more info
+
diff --git a/Documentation/video4linux/ibmcam.txt b/Documentation/video4linux/ibmcam.txt
new file mode 100644
index 0000000..397a94e
--- /dev/null
+++ b/Documentation/video4linux/ibmcam.txt
@@ -0,0 +1,324 @@
+README for Linux device driver for the IBM "C-It" USB video camera
+
+INTRODUCTION:
+
+This driver does not use all features known to exist in
+the IBM camera. However most of needed features work well.
+
+This driver was developed using logs of observed USB traffic
+which was produced by standard Windows driver (c-it98.sys).
+I did not have data sheets from Xirlink.
+
+Video formats:
+ 128x96 [model 1]
+ 176x144
+ 320x240 [model 2]
+ 352x240 [model 2]
+ 352x288
+Frame rate: 3 - 30 frames per second (FPS)
+External interface: USB
+Internal interface: Video For Linux (V4L)
+Supported controls:
+- by V4L: Contrast, Brightness, Color, Hue
+- by driver options: frame rate, lighting conditions, video format,
+ default picture settings, sharpness.
+
+SUPPORTED CAMERAS:
+
+Xirlink "C-It" camera, also known as "IBM PC Camera".
+The device uses proprietary ASIC (and compression method);
+it is manufactured by Xirlink. See http://www.xirlink.com/
+(renamed to http://www.veo.com), http://www.ibmpccamera.com,
+or http://www.c-itnow.com/ for details and pictures.
+
+This very chipset ("X Chip", as marked at the factory)
+is used in several other cameras, and they are supported
+as well:
+
+- IBM NetCamera
+- Veo Stingray
+
+The Linux driver was developed with camera with following
+model number (or FCC ID): KSX-XVP510. This camera has three
+interfaces, each with one endpoint (control, iso, iso). This
+type of cameras is referred to as "model 1". These cameras are
+no longer manufactured.
+
+Xirlink now manufactures new cameras which are somewhat different.
+In particular, following models [FCC ID] belong to that category:
+
+XVP300 [KSX-X9903]
+XVP600 [KSX-X9902]
+XVP610 [KSX-X9902]
+
+(see http://www.xirlink.com/ibmpccamera/ for updates, they refer
+to these new cameras by Windows driver dated 12-27-99, v3005 BETA)
+These cameras have two interfaces, one endpoint in each (iso, bulk).
+Such type of cameras is referred to as "model 2". They are supported
+(with exception of 352x288 native mode).
+
+Some IBM NetCameras (Model 4) are made to generate only compressed
+video streams. This is great for performance, but unfortunately
+nobody knows how to decompress the stream :-( Therefore, these
+cameras are *unsupported* and if you try to use one of those, all
+you get is random colored horizontal streaks, not the image!
+If you have one of those cameras, you probably should return it
+to the store and get something that is supported.
+
+Tell me more about all that "model" business
+--------------------------------------------
+
+I just invented model numbers to uniquely identify flavors of the
+hardware/firmware that were sold. It was very confusing to use
+brand names or some other internal numbering schemes. So I found
+by experimentation that all Xirlink chipsets fall into four big
+classes, and I called them "models". Each model is programmed in
+its own way, and each model sends back the video in its own way.
+
+Quirks of Model 2 cameras:
+-------------------------
+
+Model 2 does not have hardware contrast control. Corresponding V4L
+control is implemented in software, which is not very nice to your
+CPU, but at least it works.
+
+This driver provides 352x288 mode by switching the camera into
+quasi-352x288 RGB mode (800 Kbits per frame) essentially limiting
+this mode to 10 frames per second or less, in ideal conditions on
+the bus (USB is shared, after all). The frame rate
+has to be programmed very conservatively. Additional concern is that
+frame rate depends on brightness setting; therefore the picture can
+be good at one brightness and broken at another! I did not want to fix
+the frame rate at slowest setting, but I had to move it pretty much down
+the scale (so that framerate option barely matters). I also noticed that
+camera after first powering up produces frames slightly faster than during
+consecutive uses. All this means that if you use 352x288 (which is
+default), be warned - you may encounter broken picture on first connect;
+try to adjust brightness - brighter image is slower, so USB will be able
+to send all data. However if you regularly use Model 2 cameras you may
+prefer 176x144 which makes perfectly good I420, with no scaling and
+lesser demands on USB (300 Kbits per second, or 26 frames per second).
+
+Another strange effect of 352x288 mode is the fine vertical grid visible
+on some colored surfaces. I am sure it is caused by me not understanding
+what the camera is trying to say. Blame trade secrets for that.
+
+The camera that I had also has a hardware quirk: if disconnected,
+it needs few minutes to "relax" before it can be plugged in again
+(poorly designed USB processor reset circuit?)
+
+[Veo Stingray with Product ID 0x800C is also Model 2, but I haven't
+observed this particular flaw in it.]
+
+Model 2 camera can be programmed for very high sensitivity (even starlight
+may be enough), this makes it convenient for tinkering with. The driver
+code has enough comments to help a programmer to tweak the camera
+as s/he feels necessary.
+
+WHAT YOU NEED:
+
+- A supported IBM PC (C-it) camera (model 1 or 2)
+
+- A Linux box with USB support (2.3/2.4; 2.2 w/backport may work)
+
+- A Video4Linux compatible frame grabber program such as xawtv.
+
+HOW TO COMPILE THE DRIVER:
+
+You need to compile the driver only if you are a developer
+or if you want to make changes to the code. Most distributions
+precompile all modules, so you can go directly to the next
+section "HOW TO USE THE DRIVER".
+
+The ibmcam driver uses usbvideo helper library (module),
+so if you are studying the ibmcam code you will be led there.
+
+The driver itself consists of only one file in usb/ directory:
+ibmcam.c. This file is included into the Linux kernel build
+process if you configure the kernel for CONFIG_USB_IBMCAM.
+Run "make xconfig" and in USB section you will find the IBM
+camera driver. Select it, save the configuration and recompile.
+
+HOW TO USE THE DRIVER:
+
+I recommend to compile driver as a module. This gives you an
+easier access to its configuration. The camera has many more
+settings than V4L can operate, so some settings are done using
+module options.
+
+To begin with, on most modern Linux distributions the driver
+will be automatically loaded whenever you plug the supported
+camera in. Therefore, you don't need to do anything. However
+if you want to experiment with some module parameters then
+you can load and unload the driver manually, with camera
+plugged in or unplugged.
+
+Typically module is installed with command 'modprobe', like this:
+
+# modprobe ibmcam framerate=1
+
+Alternatively you can use 'insmod' in similar fashion:
+
+# insmod /lib/modules/2.x.y/usb/ibmcam.o framerate=1
+
+Module can be inserted with camera connected or disconnected.
+
+The driver can have options, though some defaults are provided.
+
+Driver options: (* indicates that option is model-dependent)
+
+Name Type Range [default] Example
+-------------- -------------- -------------- ------------------
+debug Integer 0-9 [0] debug=1
+flags Integer 0-0xFF [0] flags=0x0d
+framerate Integer 0-6 [2] framerate=1
+hue_correction Integer 0-255 [128] hue_correction=115
+init_brightness Integer 0-255 [128] init_brightness=100
+init_contrast Integer 0-255 [192] init_contrast=200
+init_color Integer 0-255 [128] init_color=130
+init_hue Integer 0-255 [128] init_hue=115
+lighting Integer 0-2* [1] lighting=2
+sharpness Integer 0-6* [4] sharpness=3
+size Integer 0-2* [2] size=1
+
+Options for Model 2 only:
+
+Name Type Range [default] Example
+-------------- -------------- -------------- ------------------
+init_model2_rg Integer 0..255 [0x70] init_model2_rg=128
+init_model2_rg2 Integer 0..255 [0x2f] init_model2_rg2=50
+init_model2_sat Integer 0..255 [0x34] init_model2_sat=65
+init_model2_yb Integer 0..255 [0xa0] init_model2_yb=200
+
+debug You don't need this option unless you are a developer.
+ If you are a developer then you will see in the code
+ what values do what. 0=off.
+
+flags This is a bit mask, and you can combine any number of
+ bits to produce what you want. Usually you don't want
+ any of extra features this option provides:
+
+ FLAGS_RETRY_VIDIOCSYNC 1 This bit allows to retry failed
+ VIDIOCSYNC ioctls without failing.
+ Will work with xawtv, will not
+ with xrealproducer. Default is
+ not set.
+ FLAGS_MONOCHROME 2 Activates monochrome (b/w) mode.
+ FLAGS_DISPLAY_HINTS 4 Shows colored pixels which have
+ magic meaning to developers.
+ FLAGS_OVERLAY_STATS 8 Shows tiny numbers on screen,
+ useful only for debugging.
+ FLAGS_FORCE_TESTPATTERN 16 Shows blue screen with numbers.
+ FLAGS_SEPARATE_FRAMES 32 Shows each frame separately, as
+ it was received from the camera.
+ Default (not set) is to mix the
+ preceding frame in to compensate
+ for occasional loss of Isoc data
+ on high frame rates.
+ FLAGS_CLEAN_FRAMES 64 Forces "cleanup" of each frame
+ prior to use; relevant only if
+ FLAGS_SEPARATE_FRAMES is set.
+ Default is not to clean frames,
+ this is a little faster but may
+ produce flicker if frame rate is
+ too high and Isoc data gets lost.
+ FLAGS_NO_DECODING 128 This flag turns the video stream
+ decoder off, and dumps the raw
+ Isoc data from the camera into
+ the reading process. Useful to
+ developers, but not to users.
+
+framerate This setting controls frame rate of the camera. This is
+ an approximate setting (in terms of "worst" ... "best")
+ because camera changes frame rate depending on amount
+ of light available. Setting 0 is slowest, 6 is fastest.
+ Beware - fast settings are very demanding and may not
+ work well with all video sizes. Be conservative.
+
+hue_correction This highly optional setting allows to adjust the
+ hue of the image in a way slightly different from
+ what usual "hue" control does. Both controls affect
+ YUV colorspace: regular "hue" control adjusts only
+ U component, and this "hue_correction" option similarly
+ adjusts only V component. However usually it is enough
+ to tweak only U or V to compensate for colored light or
+ color temperature; this option simply allows more
+ complicated correction when and if it is necessary.
+
+init_brightness These settings specify _initial_ values which will be
+init_contrast used to set up the camera. If your V4L application has
+init_color its own controls to adjust the picture then these
+init_hue controls will be used too. These options allow you to
+ preconfigure the camera when it gets connected, before
+ any V4L application connects to it. Good for webcams.
+
+init_model2_rg These initial settings alter color balance of the
+init_model2_rg2 camera on hardware level. All four settings may be used
+init_model2_sat to tune the camera to specific lighting conditions. These
+init_model2_yb settings only apply to Model 2 cameras.
+
+lighting This option selects one of three hardware-defined
+ photosensitivity settings of the camera. 0=bright light,
+ 1=Medium (default), 2=Low light. This setting affects
+ frame rate: the dimmer the lighting the lower the frame
+ rate (because longer exposition time is needed). The
+ Model 2 cameras allow values more than 2 for this option,
+ thus enabling extremely high sensitivity at cost of frame
+ rate, color saturation and imaging sensor noise.
+
+sharpness This option controls smoothing (noise reduction)
+ made by camera. Setting 0 is most smooth, setting 6
+ is most sharp. Be aware that CMOS sensor used in the
+ camera is pretty noisy, so if you choose 6 you will
+ be greeted with "snowy" image. Default is 4. Model 2
+ cameras do not support this feature.
+
+size This setting chooses one of several image sizes that are
+ supported by this driver. Cameras may support more, but
+ it's difficult to reverse-engineer all formats.
+ Following video sizes are supported:
+
+ size=0 128x96 (Model 1 only)
+ size=1 160x120
+ size=2 176x144
+ size=3 320x240 (Model 2 only)
+ size=4 352x240 (Model 2 only)
+ size=5 352x288
+ size=6 640x480 (Model 3 only)
+
+ The 352x288 is the native size of the Model 1 sensor
+ array, so it's the best resolution the camera can
+ yield. The best resolution of Model 2 is 176x144, and
+ larger images are produced by stretching the bitmap.
+ Model 3 has sensor with 640x480 grid, and it works too,
+ but the frame rate will be exceptionally low (1-2 FPS);
+ it may be still OK for some applications, like security.
+ Choose the image size you need. The smaller image can
+ support faster frame rate. Default is 352x288.
+
+For more information and the Troubleshooting FAQ visit this URL:
+
+ http://www.linux-usb.org/ibmcam/
+
+WHAT NEEDS TO BE DONE:
+
+- The button on the camera is not used. I don't know how to get to it.
+ I know now how to read button on Model 2, but what to do with it?
+
+- Camera reports its status back to the driver; however I don't know
+ what returned data means. If camera fails at some initialization
+ stage then something should be done, and I don't do that because
+ I don't even know that some command failed. This is mostly Model 1
+ concern because Model 2 uses different commands which do not return
+ status (and seem to complete successfully every time).
+
+- Some flavors of Model 4 NetCameras produce only compressed video
+ streams, and I don't know how to decode them.
+
+CREDITS:
+
+The code is based in no small part on the CPiA driver by Johannes Erdfelt,
+Randy Dunlap, and others. Big thanks to them for their pioneering work on that
+and the USB stack.
+
+I also thank John Lightsey for his donation of the Veo Stingray camera.
diff --git a/Documentation/video4linux/lifeview.txt b/Documentation/video4linux/lifeview.txt
new file mode 100644
index 0000000..05f9eb5
--- /dev/null
+++ b/Documentation/video4linux/lifeview.txt
@@ -0,0 +1,42 @@
+collecting data about the lifeview models and the config coding on
+gpio pins 0-9 ...
+==================================================================
+
+bt878:
+ LR50 rev. Q ("PARTS: 7031505116), Tuner wurde als Nr. 5 erkannt, Eingänge
+ SVideo, TV, Composite, Audio, Remote. CP9..1=100001001 (1: 0-Ohm-Widerstand
+ gegen GND unbestückt; 0: bestückt)
+
+------------------------------------------------------------------------------
+
+saa7134:
+ /* LifeView FlyTV Platinum FM (LR214WF) */
+ /* "Peter Missel <peter.missel@onlinehome.de> */
+ .name = "LifeView FlyTV Platinum FM",
+ /* GP27 MDT2005 PB4 pin 10 */
+ /* GP26 MDT2005 PB3 pin 9 */
+ /* GP25 MDT2005 PB2 pin 8 */
+ /* GP23 MDT2005 PB1 pin 7 */
+ /* GP22 MDT2005 PB0 pin 6 */
+ /* GP21 MDT2005 PB5 pin 11 */
+ /* GP20 MDT2005 PB6 pin 12 */
+ /* GP19 MDT2005 PB7 pin 13 */
+ /* nc MDT2005 PA3 pin 2 */
+ /* Remote MDT2005 PA2 pin 1 */
+ /* GP18 MDT2005 PA1 pin 18 */
+ /* nc MDT2005 PA0 pin 17 strap low */
+
+ /* GP17 Strap "GP7"=High */
+ /* GP16 Strap "GP6"=High
+ 0=Radio 1=TV
+ Drives SA630D ENCH1 and HEF4052 A1 pins
+ to do FM radio through SIF input */
+ /* GP15 nc */
+ /* GP14 nc */
+ /* GP13 nc */
+ /* GP12 Strap "GP5" = High */
+ /* GP11 Strap "GP4" = High */
+ /* GP10 Strap "GP3" = High */
+ /* GP09 Strap "GP2" = Low */
+ /* GP08 Strap "GP1" = Low */
+ /* GP07.00 nc */
diff --git a/Documentation/video4linux/m5602.txt b/Documentation/video4linux/m5602.txt
new file mode 100644
index 0000000..4450ab1
--- /dev/null
+++ b/Documentation/video4linux/m5602.txt
@@ -0,0 +1,12 @@
+This document describes the ALi m5602 bridge connected
+to the following supported sensors:
+OmniVision OV9650,
+Samsung s5k83a,
+Samsung s5k4aa,
+Micron mt9m111,
+Pixel plus PO1030
+
+This driver mimics the windows drivers, which have a braindead implementation sending bayer-encoded frames at VGA resolution.
+In a perfect world we should be able to reprogram the m5602 and the connected sensor in hardware instead, supporting a range of resolutions and pixelformats
+
+Anyway, have fun and please report any bugs to m560x-driver-devel@lists.sourceforge.net
diff --git a/Documentation/video4linux/meye.txt b/Documentation/video4linux/meye.txt
new file mode 100644
index 0000000..bf3af5f
--- /dev/null
+++ b/Documentation/video4linux/meye.txt
@@ -0,0 +1,129 @@
+Vaio Picturebook Motion Eye Camera Driver Readme
+------------------------------------------------
+ Copyright (C) 2001-2004 Stelian Pop <stelian@popies.net>
+ Copyright (C) 2001-2002 Alcôve <www.alcove.com>
+ Copyright (C) 2000 Andrew Tridgell <tridge@samba.org>
+
+This driver enable the use of video4linux compatible applications with the
+Motion Eye camera. This driver requires the "Sony Laptop Extras" driver (which
+can be found in the "Misc devices" section of the kernel configuration utility)
+to be compiled and installed (using its "camera=1" parameter).
+
+It can do at maximum 30 fps @ 320x240 or 15 fps @ 640x480.
+
+Grabbing is supported in packed YUV colorspace only.
+
+MJPEG hardware grabbing is supported via a private API (see below).
+
+Hardware supported:
+-------------------
+
+This driver supports the 'second' version of the MotionEye camera :)
+
+The first version was connected directly on the video bus of the Neomagic
+video card and is unsupported.
+
+The second one, made by Kawasaki Steel is fully supported by this
+driver (PCI vendor/device is 0x136b/0xff01)
+
+The third one, present in recent (more or less last year) Picturebooks
+(C1M* models), is not supported. The manufacturer has given the specs
+to the developers under a NDA (which allows the development of a GPL
+driver however), but things are not moving very fast (see
+http://r-engine.sourceforge.net/) (PCI vendor/device is 0x10cf/0x2011).
+
+There is a forth model connected on the USB bus in TR1* Vaio laptops.
+This camera is not supported at all by the current driver, in fact
+little information if any is available for this camera
+(USB vendor/device is 0x054c/0x0107).
+
+Driver options:
+---------------
+
+Several options can be passed to the meye driver using the standard
+module argument syntax (<param>=<value> when passing the option to the
+module or meye.<param>=<value> on the kernel boot line when meye is
+statically linked into the kernel). Those options are:
+
+ forcev4l1: force use of V4L1 API instead of V4L2
+
+ gbuffers: number of capture buffers, default is 2 (32 max)
+
+ gbufsize: size of each capture buffer, default is 614400
+
+ video_nr: video device to register (0 = /dev/video0, etc)
+
+Module use:
+-----------
+
+In order to automatically load the meye module on use, you can put those lines
+in your /etc/modprobe.conf file:
+
+ alias char-major-81 videodev
+ alias char-major-81-0 meye
+ options meye gbuffers=32
+
+Usage:
+------
+
+ xawtv >= 3.49 (<http://bytesex.org/xawtv/>)
+ for display and uncompressed video capture:
+
+ xawtv -c /dev/video0 -geometry 640x480
+ or
+ xawtv -c /dev/video0 -geometry 320x240
+
+ motioneye (<http://popies.net/meye/>)
+ for getting ppm or jpg snapshots, mjpeg video
+
+Private API:
+------------
+
+ The driver supports frame grabbing with the video4linux API
+ (either v4l1 or v4l2), so all video4linux tools (like xawtv)
+ should work with this driver.
+
+ Besides the video4linux interface, the driver has a private interface
+ for accessing the Motion Eye extended parameters (camera sharpness,
+ agc, video framerate), the shapshot and the MJPEG capture facilities.
+
+ This interface consists of several ioctls (prototypes and structures
+ can be found in include/linux/meye.h):
+
+ MEYEIOC_G_PARAMS
+ MEYEIOC_S_PARAMS
+ Get and set the extended parameters of the motion eye camera.
+ The user should always query the current parameters with
+ MEYEIOC_G_PARAMS, change what he likes and then issue the
+ MEYEIOC_S_PARAMS call (checking for -EINVAL). The extended
+ parameters are described by the meye_params structure.
+
+
+ MEYEIOC_QBUF_CAPT
+ Queue a buffer for capture (the buffers must have been
+ obtained with a VIDIOCGMBUF call and mmap'ed by the
+ application). The argument to MEYEIOC_QBUF_CAPT is the
+ buffer number to queue (or -1 to end capture). The first
+ call to MEYEIOC_QBUF_CAPT starts the streaming capture.
+
+ MEYEIOC_SYNC
+ Takes as an argument the buffer number you want to sync.
+ This ioctl blocks until the buffer is filled and ready
+ for the application to use. It returns the buffer size.
+
+ MEYEIOC_STILLCAPT
+ MEYEIOC_STILLJCAPT
+ Takes a snapshot in an uncompressed or compressed jpeg format.
+ This ioctl blocks until the snapshot is done and returns (for
+ jpeg snapshot) the size of the image. The image data is
+ available from the first mmap'ed buffer.
+
+ Look at the 'motioneye' application code for an actual example.
+
+Bugs / Todo:
+------------
+
+ - the driver could be much cleaned up by removing the v4l1 support.
+ However, this means all v4l1-only applications will stop working.
+
+ - 'motioneye' still uses the meye private v4l1 API extensions.
diff --git a/Documentation/video4linux/not-in-cx2388x-datasheet.txt b/Documentation/video4linux/not-in-cx2388x-datasheet.txt
new file mode 100644
index 0000000..edbfe74
--- /dev/null
+++ b/Documentation/video4linux/not-in-cx2388x-datasheet.txt
@@ -0,0 +1,41 @@
+=================================================================================
+MO_OUTPUT_FORMAT (0x310164)
+
+ Previous default from DScaler: 0x1c1f0008
+ Digit 8: 31-28
+ 28: PREVREMOD = 1
+
+ Digit 7: 27-24 (0xc = 12 = b1100 )
+ 27: COMBALT = 1
+ 26: PAL_INV_PHASE
+ (DScaler apparently set this to 1, resulted in sucky picture)
+
+ Digits 6,5: 23-16
+ 25-16: COMB_RANGE = 0x1f [default] (9 bits -> max 512)
+
+ Digit 4: 15-12
+ 15: DISIFX = 0
+ 14: INVCBF = 0
+ 13: DISADAPT = 0
+ 12: NARROWADAPT = 0
+
+ Digit 3: 11-8
+ 11: FORCE2H
+ 10: FORCEREMD
+ 9: NCHROMAEN
+ 8: NREMODEN
+
+ Digit 2: 7-4
+ 7-6: YCORE
+ 5-4: CCORE
+
+ Digit 1: 3-0
+ 3: RANGE = 1
+ 2: HACTEXT
+ 1: HSFMT
+
+0x47 is the sync byte for MPEG-2 transport stream packets.
+Datasheet incorrectly states to use 47 decimal. 188 is the length.
+All DVB compliant frontends output packets with this start code.
+
+=================================================================================
diff --git a/Documentation/video4linux/ov511.txt b/Documentation/video4linux/ov511.txt
new file mode 100644
index 0000000..b3326b1
--- /dev/null
+++ b/Documentation/video4linux/ov511.txt
@@ -0,0 +1,288 @@
+-------------------------------------------------------------------------------
+Readme for Linux device driver for the OmniVision OV511 USB to camera bridge IC
+-------------------------------------------------------------------------------
+
+Author: Mark McClelland
+Homepage: http://alpha.dyndns.org/ov511
+
+INTRODUCTION:
+
+This is a driver for the OV511, a USB-only chip used in many "webcam" devices.
+Any camera using the OV511/OV511+ and the OV6620/OV7610/20/20AE should work.
+Video capture devices that use the Philips SAA7111A decoder also work. It
+supports streaming and capture of color or monochrome video via the Video4Linux
+API. Most V4L apps are compatible with it. Most resolutions with a width and
+height that are a multiple of 8 are supported.
+
+If you need more information, please visit the OV511 homepage at the above URL.
+
+WHAT YOU NEED:
+
+- If you want to help with the development, get the chip's specification docs at
+ http://www.ovt.com/omniusbp.html
+
+- A Video4Linux compatible frame grabber program (I recommend vidcat and xawtv)
+ vidcat is part of the w3cam package: http://mpx.freeshell.net/
+ xawtv is available at: http://linux.bytesex.org/xawtv/
+
+HOW TO USE IT:
+
+Note: These are simplified instructions. For complete instructions see:
+ http://alpha.dyndns.org/ov511/install.html
+
+You must have first compiled USB support, support for your specific USB host
+controller (UHCI or OHCI), and Video4Linux support for your kernel (I recommend
+making them modules.) Make sure "Enforce bandwidth allocation" is NOT enabled.
+
+Next, (as root):
+
+ modprobe usbcore
+ modprobe usb-uhci <OR> modprobe usb-ohci
+ modprobe videodev
+ modprobe ov511
+
+If it is not already there (it usually is), create the video device:
+
+ mknod /dev/video0 c 81 0
+
+Optionally, symlink /dev/video to /dev/video0
+
+You will have to set permissions on this device to allow you to read/write
+from it:
+
+ chmod 666 /dev/video
+ chmod 666 /dev/video0 (if necessary)
+
+Now you are ready to run a video app! Both vidcat and xawtv work well for me
+at 640x480.
+
+[Using vidcat:]
+
+ vidcat -s 640x480 -p c > test.jpg
+ xview test.jpg
+
+[Using xawtv:]
+
+From the main xawtv directory:
+
+ make clean
+ ./configure
+ make
+ make install
+
+Now you should be able to run xawtv. Right click for the options dialog.
+
+MODULE PARAMETERS:
+
+ You can set these with: insmod ov511 NAME=VALUE
+ There is currently no way to set these on a per-camera basis.
+
+ NAME: autobright
+ TYPE: integer (Boolean)
+ DEFAULT: 1
+ DESC: Brightness is normally under automatic control and can't be set
+ manually by the video app. Set to 0 for manual control.
+
+ NAME: autogain
+ TYPE: integer (Boolean)
+ DEFAULT: 1
+ DESC: Auto Gain Control enable. This feature is not yet implemented.
+
+ NAME: autoexp
+ TYPE: integer (Boolean)
+ DEFAULT: 1
+ DESC: Auto Exposure Control enable. This feature is not yet implemented.
+
+ NAME: debug
+ TYPE: integer (0-6)
+ DEFAULT: 3
+ DESC: Sets the threshold for printing debug messages. The higher the value,
+ the more is printed. The levels are cumulative, and are as follows:
+ 0=no debug messages
+ 1=init/detection/unload and other significant messages
+ 2=some warning messages
+ 3=config/control function calls
+ 4=most function calls and data parsing messages
+ 5=highly repetitive mesgs
+
+ NAME: snapshot
+ TYPE: integer (Boolean)
+ DEFAULT: 0
+ DESC: Set to 1 to enable snapshot mode. read()/VIDIOCSYNC will block until
+ the snapshot button is pressed. Note: enabling this mode disables
+ /proc/video/ov511/<minor#>/button
+
+ NAME: cams
+ TYPE: integer (1-4 for OV511, 1-31 for OV511+)
+ DEFAULT: 1
+ DESC: Number of cameras allowed to stream simultaneously on a single bus.
+ Values higher than 1 reduce the data rate of each camera, allowing two
+ or more to be used at once. If you have a complicated setup involving
+ both OV511 and OV511+ cameras, trial-and-error may be necessary for
+ finding the optimum setting.
+
+ NAME: compress
+ TYPE: integer (Boolean)
+ DEFAULT: 0
+ DESC: Set this to 1 to turn on the camera's compression engine. This can
+ potentially increase the frame rate at the expense of quality, if you
+ have a fast CPU. You must load the proper compression module for your
+ camera before starting your application (ov511_decomp or ov518_decomp).
+
+ NAME: testpat
+ TYPE: integer (Boolean)
+ DEFAULT: 0
+ DESC: This configures the camera's sensor to transmit a colored test-pattern
+ instead of an image. This does not work correctly yet.
+
+ NAME: dumppix
+ TYPE: integer (0-2)
+ DEFAULT: 0
+ DESC: Dumps raw pixel data and skips post-processing and format conversion.
+ It is for debugging purposes only. Options are:
+ 0: Disable (default)
+ 1: Dump raw data from camera, excluding headers and trailers
+ 2: Dumps data exactly as received from camera
+
+ NAME: led
+ TYPE: integer (0-2)
+ DEFAULT: 1 (Always on)
+ DESC: Controls whether the LED (the little light) on the front of the camera
+ is always off (0), always on (1), or only on when driver is open (2).
+ This is not supported with the OV511, and might only work with certain
+ cameras (ones that actually have the LED wired to the control pin, and
+ not just hard-wired to be on all the time).
+
+ NAME: dump_bridge
+ TYPE: integer (Boolean)
+ DEFAULT: 0
+ DESC: Dumps the bridge (OV511[+] or OV518[+]) register values to the system
+ log. Only useful for serious debugging/development purposes.
+
+ NAME: dump_sensor
+ TYPE: integer (Boolean)
+ DEFAULT: 0
+ DESC: Dumps the sensor register values to the system log. Only useful for
+ serious debugging/development purposes.
+
+ NAME: printph
+ TYPE: integer (Boolean)
+ DEFAULT: 0
+ DESC: Setting this to 1 will dump the first 12 bytes of each isoc frame. This
+ is only useful if you are trying to debug problems with the isoc data
+ stream (i.e.: camera initializes, but vidcat hangs until Ctrl-C). Be
+ warned that this dumps a large number of messages to your kernel log.
+
+ NAME: phy, phuv, pvy, pvuv, qhy, qhuv, qvy, qvuv
+ TYPE: integer (0-63 for phy and phuv, 0-255 for rest)
+ DEFAULT: OV511 default values
+ DESC: These are registers 70h - 77h of the OV511, which control the
+ prediction ranges and quantization thresholds of the compressor, for
+ the Y and UV channels in the horizontal and vertical directions. See
+ the OV511 or OV511+ data sheet for more detailed descriptions. These
+ normally do not need to be changed.
+
+ NAME: lightfreq
+ TYPE: integer (0, 50, or 60)
+ DEFAULT: 0 (use sensor default)
+ DESC: Sets the sensor to match your lighting frequency. This can reduce the
+ appearance of "banding", i.e. horizontal lines or waves of light and
+ dark that are often caused by artificial lighting. Valid values are:
+ 0 - Use default (depends on sensor, most likely 60 Hz)
+ 50 - For European and Asian 50 Hz power
+ 60 - For American 60 Hz power
+
+ NAME: bandingfilter
+ TYPE: integer (Boolean)
+ DEFAULT: 0 (off)
+ DESC: Enables the sensor´s banding filter exposure algorithm. This reduces
+ or stabilizes the "banding" caused by some artificial light sources
+ (especially fluorescent). You might have to set lightfreq correctly for
+ this to work right. As an added bonus, this sometimes makes it
+ possible to capture your monitor´s output.
+
+ NAME: fastset
+ TYPE: integer (Boolean)
+ DEFAULT: 0 (off)
+ DESC: Allows picture settings (brightness, contrast, color, and hue) to take
+ effect immediately, even in the middle of a frame. This reduces the
+ time to change settings, but can ruin frames during the change. Only
+ affects OmniVision sensors.
+
+ NAME: force_palette
+ TYPE: integer (Boolean)
+ DEFAULT: 0 (off)
+ DESC: Forces the palette (color format) to a specific value. If an
+ application requests a different palette, it will be rejected, thereby
+ forcing it to try others until it succeeds. This is useful for forcing
+ greyscale mode with a color camera, for example. Supported modes are:
+ 0 (Allows all the following formats)
+ 1 VIDEO_PALETTE_GREY (Linear greyscale)
+ 10 VIDEO_PALETTE_YUV420 (YUV 4:2:0 Planar)
+ 15 VIDEO_PALETTE_YUV420P (YUV 4:2:0 Planar, same as 10)
+
+ NAME: backlight
+ TYPE: integer (Boolean)
+ DEFAULT: 0 (off)
+ DESC: Setting this flag changes the exposure algorithm for OmniVision sensors
+ such that objects in the camera's view (i.e. your head) can be clearly
+ seen when they are illuminated from behind. It reduces or eliminates
+ the sensor's auto-exposure function, so it should only be used when
+ needed. Additionally, it is only supported with the OV6620 and OV7620.
+
+ NAME: unit_video
+ TYPE: Up to 16 comma-separated integers
+ DEFAULT: 0,0,0... (automatically assign the next available minor(s))
+ DESC: You can specify up to 16 minor numbers to be assigned to ov511 devices.
+ For example, "unit_video=1,3" will make the driver use /dev/video1 and
+ /dev/video3 for the first two devices it detects. Additional devices
+ will be assigned automatically starting at the first available device
+ node (/dev/video0 in this case). Note that you cannot specify 0 as a
+ minor number. This feature requires kernel version 2.4.5 or higher.
+
+ NAME: remove_zeros
+ TYPE: integer (Boolean)
+ DEFAULT: 0 (do not skip any incoming data)
+ DESC: Setting this to 1 will remove zero-padding from incoming data. This
+ will compensate for the blocks of corruption that can appear when the
+ camera cannot keep up with the speed of the USB bus (eg. at low frame
+ resolutions). This feature is always enabled when compression is on.
+
+ NAME: mirror
+ TYPE: integer (Boolean)
+ DEFAULT: 0 (off)
+ DESC: Setting this to 1 will reverse ("mirror") the image horizontally. This
+ might be necessary if your camera has a custom lens assembly. This has
+ no effect with video capture devices.
+
+ NAME: ov518_color
+ TYPE: integer (Boolean)
+ DEFAULT: 0 (off)
+ DESC: Enable OV518 color support. This is off by default since it doesn't
+ work most of the time. If you want to try it, you must also load
+ ov518_decomp with the "nouv=0" parameter. If you get improper colors or
+ diagonal lines through the image, restart your video app and try again.
+ Repeat as necessary.
+
+WORKING FEATURES:
+ o Color streaming/capture at most widths and heights that are multiples of 8.
+ o Monochrome (use force_palette=1 to enable)
+ o Setting/getting of saturation, contrast, brightness, and hue (only some of
+ them work the OV7620 and OV7620AE)
+ o /proc status reporting
+ o SAA7111A video capture support at 320x240 and 640x480
+ o Compression support
+ o SMP compatibility
+
+HOW TO CONTACT ME:
+
+You can email me at mark@alpha.dyndns.org . Please prefix the subject line
+with "OV511: " so that I am certain to notice your message.
+
+CREDITS:
+
+The code is based in no small part on the CPiA driver by Johannes Erdfelt,
+Randy Dunlap, and others. Big thanks to them for their pioneering work on that
+and the USB stack. Thanks to Bret Wallach for getting camera reg IO, ISOC, and
+image capture working. Thanks to Orion Sky Lawlor, Kevin Moore, and Claudio
+Matsuoka for their work as well.
diff --git a/Documentation/video4linux/radiotrack.txt b/Documentation/video4linux/radiotrack.txt
new file mode 100644
index 0000000..d1f3ed1
--- /dev/null
+++ b/Documentation/video4linux/radiotrack.txt
@@ -0,0 +1,147 @@
+NOTES ON RADIOTRACK CARD CONTROL
+by Stephen M. Benoit (benoits@servicepro.com) Dec 14, 1996
+----------------------------------------------------------------------------
+
+Document version 1.0
+
+ACKNOWLEDGMENTS
+----------------
+This document was made based on 'C' code for Linux from Gideon le Grange
+(legrang@active.co.za or legrang@cs.sun.ac.za) in 1994, and elaborations from
+Frans Brinkman (brinkman@esd.nl) in 1996. The results reported here are from
+experiments that the author performed on his own setup, so your mileage may
+vary... I make no guarantees, claims or warranties to the suitability or
+validity of this information. No other documentation on the AIMS
+Lab (http://www.aimslab.com/) RadioTrack card was made available to the
+author. This document is offered in the hopes that it might help users who
+want to use the RadioTrack card in an environment other than MS Windows.
+
+WHY THIS DOCUMENT?
+------------------
+I have a RadioTrack card from back when I ran an MS-Windows platform. After
+converting to Linux, I found Gideon le Grange's command-line software for
+running the card, and found that it was good! Frans Brinkman made a
+comfortable X-windows interface, and added a scanning feature. For hack
+value, I wanted to see if the tuner could be tuned beyond the usual FM radio
+broadcast band, so I could pick up the audio carriers from North American
+broadcast TV channels, situated just below and above the 87.0-109.0 MHz range.
+I did not get much success, but I learned about programming ioports under
+Linux and gained some insights about the hardware design used for the card.
+
+So, without further delay, here are the details.
+
+
+PHYSICAL DESCRIPTION
+--------------------
+The RadioTrack card is an ISA 8-bit FM radio card. The radio frequency (RF)
+input is simply an antenna lead, and the output is a power audio signal
+available through a miniature phone plug. Its RF frequencies of operation are
+more or less limited from 87.0 to 109.0 MHz (the commercial FM broadcast
+band). Although the registers can be programmed to request frequencies beyond
+these limits, experiments did not give promising results. The variable
+frequency oscillator (VFO) that demodulates the intermediate frequency (IF)
+signal probably has a small range of useful frequencies, and wraps around or
+gets clipped beyond the limits mentioned above.
+
+
+CONTROLLING THE CARD WITH IOPORT
+--------------------------------
+The RadioTrack (base) ioport is configurable for 0x30c or 0x20c. Only one
+ioport seems to be involved. The ioport decoding circuitry must be pretty
+simple, as individual ioport bits are directly matched to specific functions
+(or blocks) of the radio card. This way, many functions can be changed in
+parallel with one write to the ioport. The only feedback available through
+the ioports appears to be the "Stereo Detect" bit.
+
+The bits of the ioport are arranged as follows:
+
+ MSb LSb
++------+------+------+--------+--------+-------+---------+--------+
+| VolA | VolB | ???? | Stereo | Radio | TuneA | TuneB | Tune |
+| (+) | (-) | | Detect | Audio | (bit) | (latch) | Update |
+| | | | Enable | Enable | | | Enable |
++------+------+------+--------+--------+-------+---------+--------+
+
+
+VolA . VolB [AB......]
+-----------
+0 0 : audio mute
+0 1 : volume + (some delay required)
+1 0 : volume - (some delay required)
+1 1 : stay at present volume
+
+Stereo Detect Enable [...S....]
+--------------------
+0 : No Detect
+1 : Detect
+
+ Results available by reading ioport >60 msec after last port write.
+ 0xff ==> no stereo detected, 0xfd ==> stereo detected.
+
+Radio to Audio (path) Enable [....R...]
+----------------------------
+0 : Disable path (silence)
+1 : Enable path (audio produced)
+
+TuneA . TuneB [.....AB.]
+-------------
+0 0 : "zero" bit phase 1
+0 1 : "zero" bit phase 2
+
+1 0 : "one" bit phase 1
+1 1 : "one" bit phase 2
+
+ 24-bit code, where bits = (freq*40) + 10486188.
+ The Most Significant 11 bits must be 1010 xxxx 0x0 to be valid.
+ The bits are shifted in LSb first.
+
+Tune Update Enable [.......T]
+------------------
+0 : Tuner held constant
+1 : Tuner updating in progress
+
+
+PROGRAMMING EXAMPLES
+--------------------
+Default: BASE <-- 0xc8 (current volume, no stereo detect,
+ radio enable, tuner adjust disable)
+
+Card Off: BASE <-- 0x00 (audio mute, no stereo detect,
+ radio disable, tuner adjust disable)
+
+Card On: BASE <-- 0x00 (see "Card Off", clears any unfinished business)
+ BASE <-- 0xc8 (see "Default")
+
+Volume Down: BASE <-- 0x48 (volume down, no stereo detect,
+ radio enable, tuner adjust disable)
+ * wait 10 msec *
+ BASE <-- 0xc8 (see "Default")
+
+Volume Up: BASE <-- 0x88 (volume up, no stereo detect,
+ radio enable, tuner adjust disable)
+ * wait 10 msec *
+ BASE <-- 0xc8 (see "Default")
+
+Check Stereo: BASE <-- 0xd8 (current volume, stereo detect,
+ radio enable, tuner adjust disable)
+ * wait 100 msec *
+ x <-- BASE (read ioport)
+ BASE <-- 0xc8 (see "Default")
+
+ x=0xff ==> "not stereo", x=0xfd ==> "stereo detected"
+
+Set Frequency: code = (freq*40) + 10486188
+ foreach of the 24 bits in code,
+ (from Least to Most Significant):
+ to write a "zero" bit,
+ BASE <-- 0x01 (audio mute, no stereo detect, radio
+ disable, "zero" bit phase 1, tuner adjust)
+ BASE <-- 0x03 (audio mute, no stereo detect, radio
+ disable, "zero" bit phase 2, tuner adjust)
+ to write a "one" bit,
+ BASE <-- 0x05 (audio mute, no stereo detect, radio
+ disable, "one" bit phase 1, tuner adjust)
+ BASE <-- 0x07 (audio mute, no stereo detect, radio
+ disable, "one" bit phase 2, tuner adjust)
+
+----------------------------------------------------------------------------
diff --git a/Documentation/video4linux/se401.txt b/Documentation/video4linux/se401.txt
new file mode 100644
index 0000000..7b9d1c9
--- /dev/null
+++ b/Documentation/video4linux/se401.txt
@@ -0,0 +1,54 @@
+Linux driver for SE401 based USB cameras
+
+Copyright, 2001, Jeroen Vreeken
+
+
+INTRODUCTION:
+
+The SE401 chip is the used in low-cost usb webcams.
+It is produced by Endpoints Inc. (www.endpoints.com).
+It interfaces directly to a cmos image sensor and USB. The only other major
+part in a se401 based camera is a dram chip.
+
+The following cameras are known to work with this driver:
+
+Aox se401 (non-branded) cameras
+Philips PVCV665 USB VGA webcam 'Vesta Fun'
+Kensington VideoCAM PC Camera Model 67014
+Kensington VideoCAM PC Camera Model 67015
+Kensington VideoCAM PC Camera Model 67016
+Kensington VideoCAM PC Camera Model 67017
+
+
+WHAT YOU NEED:
+
+- USB support
+- VIDEO4LINUX support
+
+More information about USB support for linux can be found at:
+http://www.linux-usb.org
+
+
+MODULE OPTIONS:
+
+When the driver is compiled as a module you can also use the 'flickerless'
+option. With it exposure is limited to values that do not interfere with the
+net frequency. Valid options for this option are 0, 50 and 60. (0=disable,
+50=50hz, 60=60hz)
+
+
+KNOWN PROBLEMS:
+
+The driver works fine with the usb-ohci and uhci host controller drivers,
+the default settings also work with usb-uhci. But sending more than one bulk
+transfer at a time with usb-uhci doesn't work yet.
+Users of usb-ohci and uhci can safely enlarge SE401_NUMSBUF in se401.h in
+order to increase the throughput (and thus framerate).
+
+
+HELP:
+
+The latest info on this driver can be found at:
+http://www.chello.nl/~j.vreeken/se401/
+And questions to me can be send to:
+pe1rxq@amsat.org
diff --git a/Documentation/video4linux/si470x.txt b/Documentation/video4linux/si470x.txt
new file mode 100644
index 0000000..11c5fd2
--- /dev/null
+++ b/Documentation/video4linux/si470x.txt
@@ -0,0 +1,118 @@
+Driver for USB radios for the Silicon Labs Si470x FM Radio Receivers
+
+Copyright (c) 2008 Tobias Lorenz <tobias.lorenz@gmx.net>
+
+
+Information from Silicon Labs
+=============================
+Silicon Laboratories is the manufacturer of the radio ICs, that nowadays are the
+most often used radio receivers in cell phones. Usually they are connected with
+I2C. But SiLabs also provides a reference design, which integrates this IC,
+together with a small microcontroller C8051F321, to form a USB radio.
+Part of this reference design is also a radio application in binary and source
+code. The software also contains an automatic firmware upgrade to the most
+current version. Information on these can be downloaded here:
+http://www.silabs.com/usbradio
+
+
+Supported ICs
+=============
+The following ICs have a very similar register set, so that they are or will be
+supported somewhen by the driver:
+- Si4700: FM radio receiver
+- Si4701: FM radio receiver, RDS Support
+- Si4702: FM radio receiver
+- Si4703: FM radio receiver, RDS Support
+- Si4704: FM radio receiver, no external antenna required
+- Si4705: FM radio receiver, no external antenna required, RDS support, Dig I/O
+- Si4706: Enhanced FM RDS/TMC radio receiver, no external antenna required, RDS
+ Support
+- Si4707: Dedicated weather band radio receiver with SAME decoder, RDS Support
+- Si4708: Smallest FM receivers
+- Si4709: Smallest FM receivers, RDS Support
+More information on these can be downloaded here:
+http://www.silabs.com/products/mcu/Pages/USBFMRadioRD.aspx
+
+
+Supported USB devices
+=====================
+Currently the following USB radios (vendor:product) with the Silicon Labs si470x
+chips are known to work:
+- 10c4:818a: Silicon Labs USB FM Radio Reference Design
+- 06e1:a155: ADS/Tech FM Radio Receiver (formerly Instant FM Music) (RDX-155-EF)
+- 1b80:d700: KWorld USB FM Radio SnapMusic Mobile 700 (FM700)
+
+
+Software
+========
+Testing is usually done with most application under Debian/testing:
+- fmtools - Utility for managing FM tuner cards
+- gnomeradio - FM-radio tuner for the GNOME desktop
+- gradio - GTK FM radio tuner
+- kradio - Comfortable Radio Application for KDE
+- radio - ncurses-based radio application
+
+There is also a library libv4l, which can be used. It's going to have a function
+for frequency seeking, either by using hardware functionality as in radio-si470x
+or by implementing a function as we currently have in every of the mentioned
+programs. Somewhen the radio programs should make use of libv4l.
+
+For processing RDS information, there is a project ongoing at:
+http://rdsd.berlios.de/
+
+There is currently no project for making TMC sentences human readable.
+
+
+Audio Listing
+=============
+USB Audio is provided by the ALSA snd_usb_audio module. It is recommended to
+also select SND_USB_AUDIO, as this is required to get sound from the radio. For
+listing you have to redirect the sound, for example using one of the following
+commands.
+
+If you just want to test audio (very poor quality):
+cat /dev/dsp1 > /dev/dsp
+
+If you use OSS try:
+sox -2 --endian little -r 96000 -t oss /dev/dsp1 -t oss /dev/dsp
+
+If you use arts try:
+arecord -D hw:1,0 -r96000 -c2 -f S16_LE | artsdsp aplay -B -
+
+
+Module Parameters
+=================
+After loading the module, you still have access to some of them in the sysfs
+mount under /sys/module/radio_si470x/parameters. The contents of read-only files
+(0444) are not updated, even if space, band and de are changed using private
+video controls. The others are runtime changeable.
+
+
+Errors
+======
+Increase tune_timeout, if you often get -EIO errors.
+
+When timed out or band limit is reached, hw_freq_seek returns -EAGAIN.
+
+If you get any errors from snd_usb_audio, please report them to the ALSA people.
+
+
+Open Issues
+===========
+V4L minor device allocation and parameter setting is not perfect. A solution is
+currently under discussion.
+
+There is an USB interface for downloading/uploading new firmware images. Support
+for it can be implemented using the request_firmware interface.
+
+There is a RDS interrupt mode. The driver is already using the same interface
+for polling RDS information, but is currently not using the interrupt mode.
+
+There is a LED interface, which can be used to override the LED control
+programmed in the firmware. This can be made available using the LED support
+functions in the kernel.
+
+
+Other useful information and links
+==================================
+http://www.silabs.com/usbradio
diff --git a/Documentation/video4linux/sn9c102.txt b/Documentation/video4linux/sn9c102.txt
new file mode 100644
index 0000000..73de405
--- /dev/null
+++ b/Documentation/video4linux/sn9c102.txt
@@ -0,0 +1,592 @@
+
+ SN9C1xx PC Camera Controllers
+ Driver for Linux
+ =============================
+
+ - Documentation -
+
+
+Index
+=====
+1. Copyright
+2. Disclaimer
+3. License
+4. Overview and features
+5. Module dependencies
+6. Module loading
+7. Module parameters
+8. Optional device control through "sysfs"
+9. Supported devices
+10. Notes for V4L2 application developers
+11. Video frame formats
+12. Contact information
+13. Credits
+
+
+1. Copyright
+============
+Copyright (C) 2004-2007 by Luca Risolia <luca.risolia@studio.unibo.it>
+
+
+2. Disclaimer
+=============
+SONiX is a trademark of SONiX Technology Company Limited, inc.
+This software is not sponsored or developed by SONiX.
+
+
+3. License
+==========
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+4. Overview and features
+========================
+This driver attempts to support the video interface of the devices assembling
+the SONiX SN9C101, SN9C102, SN9C103, SN9C105 and SN9C120 PC Camera Controllers
+("SN9C1xx" from now on).
+
+The driver relies on the Video4Linux2 and USB core modules. It has been
+designed to run properly on SMP systems as well.
+
+The latest version of the SN9C1xx driver can be found at the following URL:
+http://www.linux-projects.org/
+
+Some of the features of the driver are:
+
+- full compliance with the Video4Linux2 API (see also "Notes for V4L2
+ application developers" paragraph);
+- available mmap or read/poll methods for video streaming through isochronous
+ data transfers;
+- automatic detection of image sensor;
+- support for built-in microphone interface;
+- support for any window resolutions and optional panning within the maximum
+ pixel area of image sensor;
+- image downscaling with arbitrary scaling factors from 1, 2 and 4 in both
+ directions (see "Notes for V4L2 application developers" paragraph);
+- two different video formats for uncompressed or compressed data in low or
+ high compression quality (see also "Notes for V4L2 application developers"
+ and "Video frame formats" paragraphs);
+- full support for the capabilities of many of the possible image sensors that
+ can be connected to the SN9C1xx bridges, including, for instance, red, green,
+ blue and global gain adjustments and exposure (see "Supported devices"
+ paragraph for details);
+- use of default color settings for sunlight conditions;
+- dynamic I/O interface for both SN9C1xx and image sensor control and
+ monitoring (see "Optional device control through 'sysfs'" paragraph);
+- dynamic driver control thanks to various module parameters (see "Module
+ parameters" paragraph);
+- up to 64 cameras can be handled at the same time; they can be connected and
+ disconnected from the host many times without turning off the computer, if
+ the system supports hotplugging;
+- no known bugs.
+
+
+5. Module dependencies
+======================
+For it to work properly, the driver needs kernel support for Video4Linux and
+USB.
+
+The following options of the kernel configuration file must be enabled and
+corresponding modules must be compiled:
+
+ # Multimedia devices
+ #
+ CONFIG_VIDEO_DEV=m
+
+To enable advanced debugging functionality on the device through /sysfs:
+
+ # Multimedia devices
+ #
+ CONFIG_VIDEO_ADV_DEBUG=y
+
+ # USB support
+ #
+ CONFIG_USB=m
+
+In addition, depending on the hardware being used, the modules below are
+necessary:
+
+ # USB Host Controller Drivers
+ #
+ CONFIG_USB_EHCI_HCD=m
+ CONFIG_USB_UHCI_HCD=m
+ CONFIG_USB_OHCI_HCD=m
+
+The SN9C103, SN9c105 and SN9C120 controllers also provide a built-in microphone
+interface. It is supported by the USB Audio driver thanks to the ALSA API:
+
+ # Sound
+ #
+ CONFIG_SOUND=y
+
+ # Advanced Linux Sound Architecture
+ #
+ CONFIG_SND=m
+
+ # USB devices
+ #
+ CONFIG_SND_USB_AUDIO=m
+
+And finally:
+
+ # USB Multimedia devices
+ #
+ CONFIG_USB_SN9C102=m
+
+
+6. Module loading
+=================
+To use the driver, it is necessary to load the "sn9c102" module into memory
+after every other module required: "videodev", "v4l2_common", "compat_ioctl32",
+"usbcore" and, depending on the USB host controller you have, "ehci-hcd",
+"uhci-hcd" or "ohci-hcd".
+
+Loading can be done as shown below:
+
+ [root@localhost home]# modprobe sn9c102
+
+Note that the module is called "sn9c102" for historic reasons, although it
+does not just support the SN9C102.
+
+At this point all the devices supported by the driver and connected to the USB
+ports should be recognized. You can invoke "dmesg" to analyze kernel messages
+and verify that the loading process has gone well:
+
+ [user@localhost home]$ dmesg
+
+or, to isolate all the kernel messages generated by the driver:
+
+ [user@localhost home]$ dmesg | grep sn9c102
+
+
+7. Module parameters
+====================
+Module parameters are listed below:
+-------------------------------------------------------------------------------
+Name: video_nr
+Type: short array (min = 0, max = 64)
+Syntax: <-1|n[,...]>
+Description: Specify V4L2 minor mode number:
+ -1 = use next available
+ n = use minor number n
+ You can specify up to 64 cameras this way.
+ For example:
+ video_nr=-1,2,-1 would assign minor number 2 to the second
+ recognized camera and use auto for the first one and for every
+ other camera.
+Default: -1
+-------------------------------------------------------------------------------
+Name: force_munmap
+Type: bool array (min = 0, max = 64)
+Syntax: <0|1[,...]>
+Description: Force the application to unmap previously mapped buffer memory
+ before calling any VIDIOC_S_CROP or VIDIOC_S_FMT ioctl's. Not
+ all the applications support this feature. This parameter is
+ specific for each detected camera.
+ 0 = do not force memory unmapping
+ 1 = force memory unmapping (save memory)
+Default: 0
+-------------------------------------------------------------------------------
+Name: frame_timeout
+Type: uint array (min = 0, max = 64)
+Syntax: <0|n[,...]>
+Description: Timeout for a video frame in seconds before returning an I/O
+ error; 0 for infinity. This parameter is specific for each
+ detected camera and can be changed at runtime thanks to the
+ /sys filesystem interface.
+Default: 2
+-------------------------------------------------------------------------------
+Name: debug
+Type: ushort
+Syntax: <n>
+Description: Debugging information level, from 0 to 3:
+ 0 = none (use carefully)
+ 1 = critical errors
+ 2 = significant informations
+ 3 = more verbose messages
+ Level 3 is useful for testing only. It also shows some more
+ informations about the hardware being detected.
+ This parameter can be changed at runtime thanks to the /sys
+ filesystem interface.
+Default: 2
+-------------------------------------------------------------------------------
+
+
+8. Optional device control through "sysfs" [1]
+==========================================
+If the kernel has been compiled with the CONFIG_VIDEO_ADV_DEBUG option enabled,
+it is possible to read and write both the SN9C1xx and the image sensor
+registers by using the "sysfs" filesystem interface.
+
+Every time a supported device is recognized, a write-only file named "green" is
+created in the /sys/class/video4linux/videoX directory. You can set the green
+channel's gain by writing the desired value to it. The value may range from 0
+to 15 for the SN9C101 or SN9C102 bridges, from 0 to 127 for the SN9C103,
+SN9C105 and SN9C120 bridges.
+Similarly, only for the SN9C103, SN9C105 and SN9C120 controllers, blue and red
+gain control files are available in the same directory, for which accepted
+values may range from 0 to 127.
+
+There are other four entries in the directory above for each registered camera:
+"reg", "val", "i2c_reg" and "i2c_val". The first two files control the
+SN9C1xx bridge, while the other two control the sensor chip. "reg" and
+"i2c_reg" hold the values of the current register index where the following
+reading/writing operations are addressed at through "val" and "i2c_val". Their
+use is not intended for end-users. Note that "i2c_reg" and "i2c_val" will not
+be created if the sensor does not actually support the standard I2C protocol or
+its registers are not 8-bit long. Also, remember that you must be logged in as
+root before writing to them.
+
+As an example, suppose we were to want to read the value contained in the
+register number 1 of the sensor register table - which is usually the product
+identifier - of the camera registered as "/dev/video0":
+
+ [root@localhost #] cd /sys/class/video4linux/video0
+ [root@localhost #] echo 1 > i2c_reg
+ [root@localhost #] cat i2c_val
+
+Note that "cat" will fail if sensor registers cannot be read.
+
+Now let's set the green gain's register of the SN9C101 or SN9C102 chips to 2:
+
+ [root@localhost #] echo 0x11 > reg
+ [root@localhost #] echo 2 > val
+
+Note that the SN9C1xx always returns 0 when some of its registers are read.
+To avoid race conditions, all the I/O accesses to the above files are
+serialized.
+The sysfs interface also provides the "frame_header" entry, which exports the
+frame header of the most recent requested and captured video frame. The header
+is always 18-bytes long and is appended to every video frame by the SN9C1xx
+controllers. As an example, this additional information can be used by the user
+application for implementing auto-exposure features via software.
+
+The following table describes the frame header exported by the SN9C101 and
+SN9C102:
+
+Byte # Value or bits Description
+------ ------------- -----------
+0x00 0xFF Frame synchronisation pattern
+0x01 0xFF Frame synchronisation pattern
+0x02 0x00 Frame synchronisation pattern
+0x03 0xC4 Frame synchronisation pattern
+0x04 0xC4 Frame synchronisation pattern
+0x05 0x96 Frame synchronisation pattern
+0x06 [3:0] Read channel gain control = (1+R_GAIN/8)
+ [7:4] Blue channel gain control = (1+B_GAIN/8)
+0x07 [ 0 ] Compression mode. 0=No compression, 1=Compression enabled
+ [2:1] Maximum scale factor for compression
+ [ 3 ] 1 = USB fifo(2K bytes) is full
+ [ 4 ] 1 = Digital gain is finish
+ [ 5 ] 1 = Exposure is finish
+ [7:6] Frame index
+0x08 [7:0] Y sum inside Auto-Exposure area (low-byte)
+0x09 [7:0] Y sum inside Auto-Exposure area (high-byte)
+ where Y sum = (R/4 + 5G/16 + B/8) / 32
+0x0A [7:0] Y sum outside Auto-Exposure area (low-byte)
+0x0B [7:0] Y sum outside Auto-Exposure area (high-byte)
+ where Y sum = (R/4 + 5G/16 + B/8) / 128
+0x0C 0xXX Not used
+0x0D 0xXX Not used
+0x0E 0xXX Not used
+0x0F 0xXX Not used
+0x10 0xXX Not used
+0x11 0xXX Not used
+
+The following table describes the frame header exported by the SN9C103:
+
+Byte # Value or bits Description
+------ ------------- -----------
+0x00 0xFF Frame synchronisation pattern
+0x01 0xFF Frame synchronisation pattern
+0x02 0x00 Frame synchronisation pattern
+0x03 0xC4 Frame synchronisation pattern
+0x04 0xC4 Frame synchronisation pattern
+0x05 0x96 Frame synchronisation pattern
+0x06 [6:0] Read channel gain control = (1/2+R_GAIN/64)
+0x07 [6:0] Blue channel gain control = (1/2+B_GAIN/64)
+ [7:4]
+0x08 [ 0 ] Compression mode. 0=No compression, 1=Compression enabled
+ [2:1] Maximum scale factor for compression
+ [ 3 ] 1 = USB fifo(2K bytes) is full
+ [ 4 ] 1 = Digital gain is finish
+ [ 5 ] 1 = Exposure is finish
+ [7:6] Frame index
+0x09 [7:0] Y sum inside Auto-Exposure area (low-byte)
+0x0A [7:0] Y sum inside Auto-Exposure area (high-byte)
+ where Y sum = (R/4 + 5G/16 + B/8) / 32
+0x0B [7:0] Y sum outside Auto-Exposure area (low-byte)
+0x0C [7:0] Y sum outside Auto-Exposure area (high-byte)
+ where Y sum = (R/4 + 5G/16 + B/8) / 128
+0x0D [1:0] Audio frame number
+ [ 2 ] 1 = Audio is recording
+0x0E [7:0] Audio summation (low-byte)
+0x0F [7:0] Audio summation (high-byte)
+0x10 [7:0] Audio sample count
+0x11 [7:0] Audio peak data in audio frame
+
+The AE area (sx, sy, ex, ey) in the active window can be set by programming the
+registers 0x1c, 0x1d, 0x1e and 0x1f of the SN9C1xx controllers, where one unit
+corresponds to 32 pixels.
+
+[1] The frame headers exported by the SN9C105 and SN9C120 are not described.
+
+
+9. Supported devices
+====================
+None of the names of the companies as well as their products will be mentioned
+here. They have never collaborated with the author, so no advertising.
+
+From the point of view of a driver, what unambiguously identify a device are
+its vendor and product USB identifiers. Below is a list of known identifiers of
+devices assembling the SN9C1xx PC camera controllers:
+
+Vendor ID Product ID
+--------- ----------
+0x0458 0x7025
+0x045e 0x00f5
+0x045e 0x00f7
+0x0471 0x0327
+0x0471 0x0328
+0x0c45 0x6001
+0x0c45 0x6005
+0x0c45 0x6007
+0x0c45 0x6009
+0x0c45 0x600d
+0x0c45 0x6011
+0x0c45 0x6019
+0x0c45 0x6024
+0x0c45 0x6025
+0x0c45 0x6028
+0x0c45 0x6029
+0x0c45 0x602a
+0x0c45 0x602b
+0x0c45 0x602c
+0x0c45 0x602d
+0x0c45 0x602e
+0x0c45 0x6030
+0x0c45 0x603f
+0x0c45 0x6080
+0x0c45 0x6082
+0x0c45 0x6083
+0x0c45 0x6088
+0x0c45 0x608a
+0x0c45 0x608b
+0x0c45 0x608c
+0x0c45 0x608e
+0x0c45 0x608f
+0x0c45 0x60a0
+0x0c45 0x60a2
+0x0c45 0x60a3
+0x0c45 0x60a8
+0x0c45 0x60aa
+0x0c45 0x60ab
+0x0c45 0x60ac
+0x0c45 0x60ae
+0x0c45 0x60af
+0x0c45 0x60b0
+0x0c45 0x60b2
+0x0c45 0x60b3
+0x0c45 0x60b8
+0x0c45 0x60ba
+0x0c45 0x60bb
+0x0c45 0x60bc
+0x0c45 0x60be
+0x0c45 0x60c0
+0x0c45 0x60c2
+0x0c45 0x60c8
+0x0c45 0x60cc
+0x0c45 0x60ea
+0x0c45 0x60ec
+0x0c45 0x60ef
+0x0c45 0x60fa
+0x0c45 0x60fb
+0x0c45 0x60fc
+0x0c45 0x60fe
+0x0c45 0x6102
+0x0c45 0x6108
+0x0c45 0x610f
+0x0c45 0x6130
+0x0c45 0x6138
+0x0c45 0x613a
+0x0c45 0x613b
+0x0c45 0x613c
+0x0c45 0x613e
+
+The list above does not imply that all those devices work with this driver: up
+until now only the ones that assemble the following pairs of SN9C1xx bridges
+and image sensors are supported; kernel messages will always tell you whether
+this is the case (see "Module loading" paragraph):
+
+Image sensor / SN9C1xx bridge | SN9C10[12] SN9C103 SN9C105 SN9C120
+-------------------------------------------------------------------------------
+HV7131D Hynix Semiconductor | Yes No No No
+HV7131R Hynix Semiconductor | No Yes Yes Yes
+MI-0343 Micron Technology | Yes No No No
+MI-0360 Micron Technology | No Yes Yes Yes
+OV7630 OmniVision Technologies | Yes Yes Yes Yes
+OV7660 OmniVision Technologies | No No Yes Yes
+PAS106B PixArt Imaging | Yes No No No
+PAS202B PixArt Imaging | Yes Yes No No
+TAS5110C1B Taiwan Advanced Sensor | Yes No No No
+TAS5110D Taiwan Advanced Sensor | Yes No No No
+TAS5130D1B Taiwan Advanced Sensor | Yes No No No
+
+"Yes" means that the pair is supported by the driver, while "No" means that the
+pair does not exist or is not supported by the driver.
+
+Only some of the available control settings of each image sensor are supported
+through the V4L2 interface.
+
+Donations of new models for further testing and support would be much
+appreciated. Non-available hardware will not be supported by the author of this
+driver.
+
+
+10. Notes for V4L2 application developers
+=========================================
+This driver follows the V4L2 API specifications. In particular, it enforces two
+rules:
+
+- exactly one I/O method, either "mmap" or "read", is associated with each
+file descriptor. Once it is selected, the application must close and reopen the
+device to switch to the other I/O method;
+
+- although it is not mandatory, previously mapped buffer memory should always
+be unmapped before calling any "VIDIOC_S_CROP" or "VIDIOC_S_FMT" ioctl's.
+The same number of buffers as before will be allocated again to match the size
+of the new video frames, so you have to map the buffers again before any I/O
+attempts on them.
+
+Consistently with the hardware limits, this driver also supports image
+downscaling with arbitrary scaling factors from 1, 2 and 4 in both directions.
+However, the V4L2 API specifications don't correctly define how the scaling
+factor can be chosen arbitrarily by the "negotiation" of the "source" and
+"target" rectangles. To work around this flaw, we have added the convention
+that, during the negotiation, whenever the "VIDIOC_S_CROP" ioctl is issued, the
+scaling factor is restored to 1.
+
+This driver supports two different video formats: the first one is the "8-bit
+Sequential Bayer" format and can be used to obtain uncompressed video data
+from the device through the current I/O method, while the second one provides
+either "raw" compressed video data (without frame headers not related to the
+compressed data) or standard JPEG (with frame headers). The compression quality
+may vary from 0 to 1 and can be selected or queried thanks to the
+VIDIOC_S_JPEGCOMP and VIDIOC_G_JPEGCOMP V4L2 ioctl's. For maximum flexibility,
+both the default active video format and the default compression quality
+depend on how the image sensor being used is initialized.
+
+
+11. Video frame formats [1]
+=======================
+The SN9C1xx PC Camera Controllers can send images in two possible video
+formats over the USB: either native "Sequential RGB Bayer" or compressed.
+The compression is used to achieve high frame rates. With regard to the
+SN9C101, SN9C102 and SN9C103, the compression is based on the Huffman encoding
+algorithm described below, while with regard to the SN9C105 and SN9C120 the
+compression is based on the JPEG standard.
+The current video format may be selected or queried from the user application
+by calling the VIDIOC_S_FMT or VIDIOC_G_FMT ioctl's, as described in the V4L2
+API specifications.
+
+The name "Sequential Bayer" indicates the organization of the red, green and
+blue pixels in one video frame. Each pixel is associated with a 8-bit long
+value and is disposed in memory according to the pattern shown below:
+
+B[0] G[1] B[2] G[3] ... B[m-2] G[m-1]
+G[m] R[m+1] G[m+2] R[m+2] ... G[2m-2] R[2m-1]
+...
+... B[(n-1)(m-2)] G[(n-1)(m-1)]
+... G[n(m-2)] R[n(m-1)]
+
+The above matrix also represents the sequential or progressive read-out mode of
+the (n, m) Bayer color filter array used in many CCD or CMOS image sensors.
+
+The Huffman compressed video frame consists of a bitstream that encodes for
+every R, G, or B pixel the difference between the value of the pixel itself and
+some reference pixel value. Pixels are organised in the Bayer pattern and the
+Bayer sub-pixels are tracked individually and alternatingly. For example, in
+the first line values for the B and G1 pixels are alternatingly encoded, while
+in the second line values for the G2 and R pixels are alternatingly encoded.
+
+The pixel reference value is calculated as follows:
+- the 4 top left pixels are encoded in raw uncompressed 8-bit format;
+- the value in the top two rows is the value of the pixel left of the current
+ pixel;
+- the value in the left column is the value of the pixel above the current
+ pixel;
+- for all other pixels, the reference value is the average of the value of the
+ pixel on the left and the value of the pixel above the current pixel;
+- there is one code in the bitstream that specifies the value of a pixel
+ directly (in 4-bit resolution);
+- pixel values need to be clamped inside the range [0..255] for proper
+ decoding.
+
+The algorithm purely describes the conversion from compressed Bayer code used
+in the SN9C101, SN9C102 and SN9C103 chips to uncompressed Bayer. Additional
+steps are required to convert this to a color image (i.e. a color interpolation
+algorithm).
+
+The following Huffman codes have been found:
+0: +0 (relative to reference pixel value)
+100: +4
+101: -4?
+1110xxxx: set absolute value to xxxx.0000
+1101: +11
+1111: -11
+11001: +20
+110000: -20
+110001: ??? - these codes are apparently not used
+
+[1] The Huffman compression algorithm has been reverse-engineered and
+ documented by Bertrik Sikken.
+
+
+12. Contact information
+=======================
+The author may be contacted by e-mail at <luca.risolia@studio.unibo.it>.
+
+GPG/PGP encrypted e-mail's are accepted. The GPG key ID of the author is
+'FCE635A4'; the public 1024-bit key should be available at any keyserver;
+the fingerprint is: '88E8 F32F 7244 68BA 3958 5D40 99DA 5D2A FCE6 35A4'.
+
+
+13. Credits
+===========
+Many thanks to following persons for their contribute (listed in alphabetical
+order):
+
+- David Anderson for the donation of a webcam;
+- Luca Capello for the donation of a webcam;
+- Philippe Coval for having helped testing the PAS202BCA image sensor;
+- Joao Rodrigo Fuzaro, Joao Limirio, Claudio Filho and Caio Begotti for the
+ donation of a webcam;
+- Dennis Heitmann for the donation of a webcam;
+- Jon Hollstrom for the donation of a webcam;
+- Nick McGill for the donation of a webcam;
+- Carlos Eduardo Medaglia Dyonisio, who added the support for the PAS202BCB
+ image sensor;
+- Stefano Mozzi, who donated 45 EU;
+- Andrew Pearce for the donation of a webcam;
+- John Pullan for the donation of a webcam;
+- Bertrik Sikken, who reverse-engineered and documented the Huffman compression
+ algorithm used in the SN9C101, SN9C102 and SN9C103 controllers and
+ implemented the first decoder;
+- Ronny Standke for the donation of a webcam;
+- Mizuno Takafumi for the donation of a webcam;
+- an "anonymous" donator (who didn't want his name to be revealed) for the
+ donation of a webcam.
+- an anonymous donator for the donation of four webcams and two boards with ten
+ image sensors.
diff --git a/Documentation/video4linux/soc-camera.txt b/Documentation/video4linux/soc-camera.txt
new file mode 100644
index 0000000..178ef3c
--- /dev/null
+++ b/Documentation/video4linux/soc-camera.txt
@@ -0,0 +1,120 @@
+ Soc-Camera Subsystem
+ ====================
+
+Terminology
+-----------
+
+The following terms are used in this document:
+ - camera / camera device / camera sensor - a video-camera sensor chip, capable
+ of connecting to a variety of systems and interfaces, typically uses i2c for
+ control and configuration, and a parallel or a serial bus for data.
+ - camera host - an interface, to which a camera is connected. Typically a
+ specialised interface, present on many SoCs, e.g., PXA27x and PXA3xx, SuperH,
+ AVR32, i.MX27, i.MX31.
+ - camera host bus - a connection between a camera host and a camera. Can be
+ parallel or serial, consists of data and control lines, e.g., clock, vertical
+ and horizontal synchronization signals.
+
+Purpose of the soc-camera subsystem
+-----------------------------------
+
+The soc-camera subsystem provides a unified API between camera host drivers and
+camera sensor drivers. It implements a V4L2 interface to the user, currently
+only the mmap method is supported.
+
+This subsystem has been written to connect drivers for System-on-Chip (SoC)
+video capture interfaces with drivers for CMOS camera sensor chips to enable
+the reuse of sensor drivers with various hosts. The subsystem has been designed
+to support multiple camera host interfaces and multiple cameras per interface,
+although most applications have only one camera sensor.
+
+Existing drivers
+----------------
+
+As of 2.6.27-rc4 there are two host drivers in the mainline: pxa_camera.c for
+PXA27x SoCs and sh_mobile_ceu_camera.c for SuperH SoCs, and four sensor drivers:
+mt9m001.c, mt9m111.c, mt9v022.c and a generic soc_camera_platform.c driver. This
+list is not supposed to be updated, look for more examples in your tree.
+
+Camera host API
+---------------
+
+A host camera driver is registered using the
+
+soc_camera_host_register(struct soc_camera_host *);
+
+function. The host object can be initialized as follows:
+
+static struct soc_camera_host pxa_soc_camera_host = {
+ .drv_name = PXA_CAM_DRV_NAME,
+ .ops = &pxa_soc_camera_host_ops,
+};
+
+All camera host methods are passed in a struct soc_camera_host_ops:
+
+static struct soc_camera_host_ops pxa_soc_camera_host_ops = {
+ .owner = THIS_MODULE,
+ .add = pxa_camera_add_device,
+ .remove = pxa_camera_remove_device,
+ .suspend = pxa_camera_suspend,
+ .resume = pxa_camera_resume,
+ .set_fmt_cap = pxa_camera_set_fmt_cap,
+ .try_fmt_cap = pxa_camera_try_fmt_cap,
+ .init_videobuf = pxa_camera_init_videobuf,
+ .reqbufs = pxa_camera_reqbufs,
+ .poll = pxa_camera_poll,
+ .querycap = pxa_camera_querycap,
+ .try_bus_param = pxa_camera_try_bus_param,
+ .set_bus_param = pxa_camera_set_bus_param,
+};
+
+.add and .remove methods are called when a sensor is attached to or detached
+from the host, apart from performing host-internal tasks they shall also call
+sensor driver's .init and .release methods respectively. .suspend and .resume
+methods implement host's power-management functionality and its their
+responsibility to call respective sensor's methods. .try_bus_param and
+.set_bus_param are used to negotiate physical connection parameters between the
+host and the sensor. .init_videobuf is called by soc-camera core when a
+video-device is opened, further video-buffer management is implemented completely
+by the specific camera host driver. The rest of the methods are called from
+respective V4L2 operations.
+
+Camera API
+----------
+
+Sensor drivers can use struct soc_camera_link, typically provided by the
+platform, and used to specify to which camera host bus the sensor is connected,
+and arbitrarily provide platform .power and .reset methods for the camera.
+soc_camera_device_register() and soc_camera_device_unregister() functions are
+used to add a sensor driver to or remove one from the system. The registration
+function takes a pointer to struct soc_camera_device as the only parameter.
+This struct can be initialized as follows:
+
+ /* link to driver operations */
+ icd->ops = &mt9m001_ops;
+ /* link to the underlying physical (e.g., i2c) device */
+ icd->control = &client->dev;
+ /* window geometry */
+ icd->x_min = 20;
+ icd->y_min = 12;
+ icd->x_current = 20;
+ icd->y_current = 12;
+ icd->width_min = 48;
+ icd->width_max = 1280;
+ icd->height_min = 32;
+ icd->height_max = 1024;
+ icd->y_skip_top = 1;
+ /* camera bus ID, typically obtained from platform data */
+ icd->iface = icl->bus_id;
+
+struct soc_camera_ops provides .probe and .remove methods, which are called by
+the soc-camera core, when a camera is matched against or removed from a camera
+host bus, .init, .release, .suspend, and .resume are called from the camera host
+driver as discussed above. Other members of this struct provide respective V4L2
+functionality.
+
+struct soc_camera_device also links to an array of struct soc_camera_data_format,
+listing pixel formats, supported by the camera.
+
+--
+Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
diff --git a/Documentation/video4linux/stv680.txt b/Documentation/video4linux/stv680.txt
new file mode 100644
index 0000000..4f8946f
--- /dev/null
+++ b/Documentation/video4linux/stv680.txt
@@ -0,0 +1,53 @@
+Linux driver for STV0680 based USB cameras
+
+Copyright, 2001, Kevin Sisson
+
+
+INTRODUCTION:
+
+STMicroelectronics produces the STV0680B chip, which comes in two
+types, -001 and -003. The -003 version allows the recording and downloading
+of sound clips from the camera, and allows a flash attachment. Otherwise,
+it uses the same commands as the -001 version. Both versions support a
+variety of SDRAM sizes and sensors, allowing for a maximum of 26 VGA or 20
+CIF pictures. The STV0680 supports either a serial or a usb interface, and
+video is possible through the usb interface.
+
+The following cameras are known to work with this driver, although any
+camera with Vendor/Product codes of 0553/0202 should work:
+
+Aiptek Pencam (various models)
+Nisis QuickPix 2
+Radio Shack 'Kid's digital camera' (#60-1207)
+At least one Trust Spycam model
+Several other European brand models
+
+WHAT YOU NEED:
+
+- USB support
+- VIDEO4LINUX support
+
+More information about USB support for linux can be found at:
+http://www.linux-usb.org
+
+
+MODULE OPTIONS:
+
+When the driver is compiled as a module, you can set a "swapRGB=1"
+option, if necessary, for those applications that require it
+(such as xawtv). However, the driver should detect and set this
+automatically, so this option should not normally be used.
+
+
+KNOWN PROBLEMS:
+
+The driver seems to work better with the usb-ohci than the usb-uhci host
+controller driver.
+
+HELP:
+
+The latest info on this driver can be found at:
+http://personal.clt.bellsouth.net/~kjsisson or at
+http://stv0680-usb.sourceforge.net
+
+Any questions to me can be send to: kjsisson@bellsouth.net \ No newline at end of file
diff --git a/Documentation/video4linux/v4lgrab.c b/Documentation/video4linux/v4lgrab.c
new file mode 100644
index 0000000..079b628
--- /dev/null
+++ b/Documentation/video4linux/v4lgrab.c
@@ -0,0 +1,192 @@
+/* Simple Video4Linux image grabber. */
+/*
+ * Video4Linux Driver Test/Example Framegrabbing Program
+ *
+ * Compile with:
+ * gcc -s -Wall -Wstrict-prototypes v4lgrab.c -o v4lgrab
+ * Use as:
+ * v4lgrab >image.ppm
+ *
+ * Copyright (C) 1998-05-03, Phil Blundell <philb@gnu.org>
+ * Copied from http://www.tazenda.demon.co.uk/phil/vgrabber.c
+ * with minor modifications (Dave Forrest, drf5n@virginia.edu).
+ *
+ */
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <stdlib.h>
+
+#include <linux/types.h>
+#include <linux/videodev.h>
+
+#define FILE "/dev/video0"
+
+/* Stole this from tvset.c */
+
+#define READ_VIDEO_PIXEL(buf, format, depth, r, g, b) \
+{ \
+ switch (format) \
+ { \
+ case VIDEO_PALETTE_GREY: \
+ switch (depth) \
+ { \
+ case 4: \
+ case 6: \
+ case 8: \
+ (r) = (g) = (b) = (*buf++ << 8);\
+ break; \
+ \
+ case 16: \
+ (r) = (g) = (b) = \
+ *((unsigned short *) buf); \
+ buf += 2; \
+ break; \
+ } \
+ break; \
+ \
+ \
+ case VIDEO_PALETTE_RGB565: \
+ { \
+ unsigned short tmp = *(unsigned short *)buf; \
+ (r) = tmp&0xF800; \
+ (g) = (tmp<<5)&0xFC00; \
+ (b) = (tmp<<11)&0xF800; \
+ buf += 2; \
+ } \
+ break; \
+ \
+ case VIDEO_PALETTE_RGB555: \
+ (r) = (buf[0]&0xF8)<<8; \
+ (g) = ((buf[0] << 5 | buf[1] >> 3)&0xF8)<<8; \
+ (b) = ((buf[1] << 2 ) & 0xF8)<<8; \
+ buf += 2; \
+ break; \
+ \
+ case VIDEO_PALETTE_RGB24: \
+ (r) = buf[0] << 8; (g) = buf[1] << 8; \
+ (b) = buf[2] << 8; \
+ buf += 3; \
+ break; \
+ \
+ default: \
+ fprintf(stderr, \
+ "Format %d not yet supported\n", \
+ format); \
+ } \
+}
+
+int get_brightness_adj(unsigned char *image, long size, int *brightness) {
+ long i, tot = 0;
+ for (i=0;i<size*3;i++)
+ tot += image[i];
+ *brightness = (128 - tot/(size*3))/3;
+ return !((tot/(size*3)) >= 126 && (tot/(size*3)) <= 130);
+}
+
+int main(int argc, char ** argv)
+{
+ int fd = open(FILE, O_RDONLY), f;
+ struct video_capability cap;
+ struct video_window win;
+ struct video_picture vpic;
+
+ unsigned char *buffer, *src;
+ int bpp = 24, r, g, b;
+ unsigned int i, src_depth;
+
+ if (fd < 0) {
+ perror(FILE);
+ exit(1);
+ }
+
+ if (ioctl(fd, VIDIOCGCAP, &cap) < 0) {
+ perror("VIDIOGCAP");
+ fprintf(stderr, "(" FILE " not a video4linux device?)\n");
+ close(fd);
+ exit(1);
+ }
+
+ if (ioctl(fd, VIDIOCGWIN, &win) < 0) {
+ perror("VIDIOCGWIN");
+ close(fd);
+ exit(1);
+ }
+
+ if (ioctl(fd, VIDIOCGPICT, &vpic) < 0) {
+ perror("VIDIOCGPICT");
+ close(fd);
+ exit(1);
+ }
+
+ if (cap.type & VID_TYPE_MONOCHROME) {
+ vpic.depth=8;
+ vpic.palette=VIDEO_PALETTE_GREY; /* 8bit grey */
+ if(ioctl(fd, VIDIOCSPICT, &vpic) < 0) {
+ vpic.depth=6;
+ if(ioctl(fd, VIDIOCSPICT, &vpic) < 0) {
+ vpic.depth=4;
+ if(ioctl(fd, VIDIOCSPICT, &vpic) < 0) {
+ fprintf(stderr, "Unable to find a supported capture format.\n");
+ close(fd);
+ exit(1);
+ }
+ }
+ }
+ } else {
+ vpic.depth=24;
+ vpic.palette=VIDEO_PALETTE_RGB24;
+
+ if(ioctl(fd, VIDIOCSPICT, &vpic) < 0) {
+ vpic.palette=VIDEO_PALETTE_RGB565;
+ vpic.depth=16;
+
+ if(ioctl(fd, VIDIOCSPICT, &vpic)==-1) {
+ vpic.palette=VIDEO_PALETTE_RGB555;
+ vpic.depth=15;
+
+ if(ioctl(fd, VIDIOCSPICT, &vpic)==-1) {
+ fprintf(stderr, "Unable to find a supported capture format.\n");
+ return -1;
+ }
+ }
+ }
+ }
+
+ buffer = malloc(win.width * win.height * bpp);
+ if (!buffer) {
+ fprintf(stderr, "Out of memory.\n");
+ exit(1);
+ }
+
+ do {
+ int newbright;
+ read(fd, buffer, win.width * win.height * bpp);
+ f = get_brightness_adj(buffer, win.width * win.height, &newbright);
+ if (f) {
+ vpic.brightness += (newbright << 8);
+ if(ioctl(fd, VIDIOCSPICT, &vpic)==-1) {
+ perror("VIDIOSPICT");
+ break;
+ }
+ }
+ } while (f);
+
+ fprintf(stdout, "P6\n%d %d 255\n", win.width, win.height);
+
+ src = buffer;
+
+ for (i = 0; i < win.width * win.height; i++) {
+ READ_VIDEO_PIXEL(src, vpic.palette, src_depth, r, g, b);
+ fputc(r>>8, stdout);
+ fputc(g>>8, stdout);
+ fputc(b>>8, stdout);
+ }
+
+ close(fd);
+ return 0;
+}
diff --git a/Documentation/video4linux/w9966.txt b/Documentation/video4linux/w9966.txt
new file mode 100644
index 0000000..78a6512
--- /dev/null
+++ b/Documentation/video4linux/w9966.txt
@@ -0,0 +1,33 @@
+W9966 Camera driver, written by Jakob Kemi (jakob.kemi@telia.com)
+
+After a lot of work in softice & wdasm, reading .pdf-files and tiresome
+trial-and-error work I've finally got everything to work. I needed vision for a
+robotics project so I borrowed this camera from a friend and started hacking.
+Anyway I've converted my original code from the AVR 8bit RISC C/ASM code into
+a working Linux driver.
+
+To get it working simply configure your kernel to support
+parport, ieee1284, video4linux and w9966
+
+If w9966 is statically linked it will always perform aggressive probing for
+the camera. If built as a module you'll have more configuration options.
+
+Options:
+ modprobe w9966.o pardev=parport0(or whatever) parmode=0 (0=auto, 1=ecp, 2=epp)
+voila!
+
+you can also type 'modinfo -p w9966.o' for option usage
+(or checkout w9966.c)
+
+The only thing to keep in mind is that the image format is in Y-U-Y-V format
+where every two pixels take 4 bytes. In SDL (www.libsdl.org) this format
+is called VIDEO_PALETTE_YUV422 (16 bpp).
+
+A minimal test application (with source) is available from:
+ http://hem.fyristorg.com/mogul/w9966.html
+
+The slow framerate is due to missing DMA ECP read support in the
+parport drivers. I might add working EPP support later.
+
+Good luck!
+ /Jakob Kemi
diff --git a/Documentation/video4linux/w9968cf.txt b/Documentation/video4linux/w9968cf.txt
new file mode 100644
index 0000000..05138e8
--- /dev/null
+++ b/Documentation/video4linux/w9968cf.txt
@@ -0,0 +1,458 @@
+
+ W996[87]CF JPEG USB Dual Mode Camera Chip
+ Driver for Linux 2.6 (basic version)
+ =========================================
+
+ - Documentation -
+
+
+Index
+=====
+1. Copyright
+2. Disclaimer
+3. License
+4. Overview
+5. Supported devices
+6. Module dependencies
+7. Module loading
+8. Module parameters
+9. Contact information
+10. Credits
+
+
+1. Copyright
+============
+Copyright (C) 2002-2004 by Luca Risolia <luca.risolia@studio.unibo.it>
+
+
+2. Disclaimer
+=============
+Winbond is a trademark of Winbond Electronics Corporation.
+This software is not sponsored or developed by Winbond.
+
+
+3. License
+==========
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+4. Overview
+===========
+This driver supports the video streaming capabilities of the devices mounting
+Winbond W9967CF and Winbond W9968CF JPEG USB Dual Mode Camera Chips. OV681
+based cameras should be supported as well.
+
+The driver is divided into two modules: the basic one, "w9968cf", is needed for
+the supported devices to work; the second one, "w9968cf-vpp", is an optional
+module, which provides some useful video post-processing functions like video
+decoding, up-scaling and colour conversions.
+
+Note that the official kernels do neither include nor support the second
+module for performance purposes. Therefore, it is always recommended to
+download and install the latest and complete release of the driver,
+replacing the existing one, if present.
+
+The latest and full-featured version of the W996[87]CF driver can be found at:
+http://www.linux-projects.org. Please refer to the documentation included in
+that package, if you are going to use it.
+
+Up to 32 cameras can be handled at the same time. They can be connected and
+disconnected from the host many times without turning off the computer, if
+your system supports the hotplug facility.
+
+To change the default settings for each camera, many parameters can be passed
+through command line when the module is loaded into memory.
+
+The driver relies on the Video4Linux, USB and I2C core modules. It has been
+designed to run properly on SMP systems as well. An additional module,
+"ovcamchip", is mandatory; it provides support for some OmniVision image
+sensors connected to the W996[87]CF chips; if found in the system, the module
+will be automatically loaded by default (provided that the kernel has been
+compiled with the automatic module loading option).
+
+
+5. Supported devices
+====================
+At the moment, known W996[87]CF and OV681 based devices are:
+- Aroma Digi Pen VGA Dual Mode ADG-5000 (unknown image sensor)
+- AVerMedia AVerTV USB (SAA7111A, Philips FI1216Mk2 tuner, PT2313L audio chip)
+- Creative Labs Video Blaster WebCam Go (OmniVision OV7610 sensor)
+- Creative Labs Video Blaster WebCam Go Plus (OmniVision OV7620 sensor)
+- Lebon LDC-035A (unknown image sensor)
+- Ezonics EZ-802 EZMega Cam (OmniVision OV8610C sensor)
+- OmniVision OV8610-EDE (OmniVision OV8610 sensor)
+- OPCOM Digi Pen VGA Dual Mode Pen Camera (unknown image sensor)
+- Pretec Digi Pen-II (OmniVision OV7620 sensor)
+- Pretec DigiPen-480 (OmniVision OV8610 sensor)
+
+If you know any other W996[87]CF or OV681 based cameras, please contact me.
+
+The list above does not imply that all those devices work with this driver: up
+until now only webcams that have an image sensor supported by the "ovcamchip"
+module work. Kernel messages will always tell you whether this is case.
+
+Possible external microcontrollers of those webcams are not supported: this
+means that still images cannot be downloaded from the device memory.
+
+Furthermore, it's worth to note that I was only able to run tests on my
+"Creative Labs Video Blaster WebCam Go". Donations of other models, for
+additional testing and full support, would be much appreciated.
+
+
+6. Module dependencies
+======================
+For it to work properly, the driver needs kernel support for Video4Linux, USB
+and I2C, and the "ovcamchip" module for the image sensor. Make sure you are not
+actually using any external "ovcamchip" module, given that the W996[87]CF
+driver depends on the version of the module present in the official kernels.
+
+The following options of the kernel configuration file must be enabled and
+corresponding modules must be compiled:
+
+ # Multimedia devices
+ #
+ CONFIG_VIDEO_DEV=m
+
+ # I2C support
+ #
+ CONFIG_I2C=m
+
+The I2C core module can be compiled statically in the kernel as well.
+
+ # OmniVision Camera Chip support
+ #
+ CONFIG_VIDEO_OVCAMCHIP=m
+
+ # USB support
+ #
+ CONFIG_USB=m
+
+In addition, depending on the hardware being used, only one of the modules
+below is necessary:
+
+ # USB Host Controller Drivers
+ #
+ CONFIG_USB_EHCI_HCD=m
+ CONFIG_USB_UHCI_HCD=m
+ CONFIG_USB_OHCI_HCD=m
+
+And finally:
+
+ # USB Multimedia devices
+ #
+ CONFIG_USB_W9968CF=m
+
+
+7. Module loading
+=================
+To use the driver, it is necessary to load the "w9968cf" module into memory
+after every other module required.
+
+Loading can be done this way, from root:
+
+ [root@localhost home]# modprobe usbcore
+ [root@localhost home]# modprobe i2c-core
+ [root@localhost home]# modprobe videodev
+ [root@localhost home]# modprobe w9968cf
+
+At this point the pertinent devices should be recognized: "dmesg" can be used
+to analyze kernel messages:
+
+ [user@localhost home]$ dmesg
+
+There are a lot of parameters the module can use to change the default
+settings for each device. To list every possible parameter with a brief
+explanation about them and which syntax to use, it is recommended to run the
+"modinfo" command:
+
+ [root@locahost home]# modinfo w9968cf
+
+
+8. Module parameters
+====================
+Module parameters are listed below:
+-------------------------------------------------------------------------------
+Name: ovmod_load
+Type: bool
+Syntax: <0|1>
+Description: Automatic 'ovcamchip' module loading: 0 disabled, 1 enabled.
+ If enabled, 'insmod' searches for the required 'ovcamchip'
+ module in the system, according to its configuration, and
+ loads that module automatically. This action is performed as
+ once soon as the 'w9968cf' module is loaded into memory.
+Default: 1
+-------------------------------------------------------------------------------
+Name: simcams
+Type: int
+Syntax: <n>
+Description: Number of cameras allowed to stream simultaneously.
+ n may vary from 0 to 32.
+Default: 32
+-------------------------------------------------------------------------------
+Name: video_nr
+Type: int array (min = 0, max = 32)
+Syntax: <-1|n[,...]>
+Description: Specify V4L minor mode number.
+ -1 = use next available
+ n = use minor number n
+ You can specify up to 32 cameras this way.
+ For example:
+ video_nr=-1,2,-1 would assign minor number 2 to the second
+ recognized camera and use auto for the first one and for every
+ other camera.
+Default: -1
+-------------------------------------------------------------------------------
+Name: packet_size
+Type: int array (min = 0, max = 32)
+Syntax: <n[,...]>
+Description: Specify the maximum data payload size in bytes for alternate
+ settings, for each device. n is scaled between 63 and 1023.
+Default: 1023
+-------------------------------------------------------------------------------
+Name: max_buffers
+Type: int array (min = 0, max = 32)
+Syntax: <n[,...]>
+Description: For advanced users.
+ Specify the maximum number of video frame buffers to allocate
+ for each device, from 2 to 32.
+Default: 2
+-------------------------------------------------------------------------------
+Name: double_buffer
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Hardware double buffering: 0 disabled, 1 enabled.
+ It should be enabled if you want smooth video output: if you
+ obtain out of sync. video, disable it, or try to
+ decrease the 'clockdiv' module parameter value.
+Default: 1 for every device.
+-------------------------------------------------------------------------------
+Name: clamping
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Video data clamping: 0 disabled, 1 enabled.
+Default: 0 for every device.
+-------------------------------------------------------------------------------
+Name: filter_type
+Type: int array (min = 0, max = 32)
+Syntax: <0|1|2[,...]>
+Description: Video filter type.
+ 0 none, 1 (1-2-1) 3-tap filter, 2 (2-3-6-3-2) 5-tap filter.
+ The filter is used to reduce noise and aliasing artifacts
+ produced by the CCD or CMOS image sensor.
+Default: 0 for every device.
+-------------------------------------------------------------------------------
+Name: largeview
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Large view: 0 disabled, 1 enabled.
+Default: 1 for every device.
+-------------------------------------------------------------------------------
+Name: upscaling
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Software scaling (for non-compressed video only):
+ 0 disabled, 1 enabled.
+ Disable it if you have a slow CPU or you don't have enough
+ memory.
+Default: 0 for every device.
+Note: If 'w9968cf-vpp' is not present, this parameter is set to 0.
+-------------------------------------------------------------------------------
+Name: decompression
+Type: int array (min = 0, max = 32)
+Syntax: <0|1|2[,...]>
+Description: Software video decompression:
+ 0 = disables decompression
+ (doesn't allow formats needing decompression).
+ 1 = forces decompression
+ (allows formats needing decompression only).
+ 2 = allows any permitted formats.
+ Formats supporting (de)compressed video are YUV422P and
+ YUV420P/YUV420 in any resolutions where width and height are
+ multiples of 16.
+Default: 2 for every device.
+Note: If 'w9968cf-vpp' is not present, forcing decompression is not
+ allowed; in this case this parameter is set to 2.
+-------------------------------------------------------------------------------
+Name: force_palette
+Type: int array (min = 0, max = 32)
+Syntax: <0|9|10|13|15|8|7|1|6|3|4|5[,...]>
+Description: Force picture palette.
+ In order:
+ 0 = Off - allows any of the following formats:
+ 9 = UYVY 16 bpp - Original video, compression disabled
+ 10 = YUV420 12 bpp - Original video, compression enabled
+ 13 = YUV422P 16 bpp - Original video, compression enabled
+ 15 = YUV420P 12 bpp - Original video, compression enabled
+ 8 = YUVY 16 bpp - Software conversion from UYVY
+ 7 = YUV422 16 bpp - Software conversion from UYVY
+ 1 = GREY 8 bpp - Software conversion from UYVY
+ 6 = RGB555 16 bpp - Software conversion from UYVY
+ 3 = RGB565 16 bpp - Software conversion from UYVY
+ 4 = RGB24 24 bpp - Software conversion from UYVY
+ 5 = RGB32 32 bpp - Software conversion from UYVY
+ When not 0, this parameter will override 'decompression'.
+Default: 0 for every device. Initial palette is 9 (UYVY).
+Note: If 'w9968cf-vpp' is not present, this parameter is set to 9.
+-------------------------------------------------------------------------------
+Name: force_rgb
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Read RGB video data instead of BGR:
+ 1 = use RGB component ordering.
+ 0 = use BGR component ordering.
+ This parameter has effect when using RGBX palettes only.
+Default: 0 for every device.
+-------------------------------------------------------------------------------
+Name: autobright
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Image sensor automatically changes brightness:
+ 0 = no, 1 = yes
+Default: 0 for every device.
+-------------------------------------------------------------------------------
+Name: autoexp
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Image sensor automatically changes exposure:
+ 0 = no, 1 = yes
+Default: 1 for every device.
+-------------------------------------------------------------------------------
+Name: lightfreq
+Type: int array (min = 0, max = 32)
+Syntax: <50|60[,...]>
+Description: Light frequency in Hz:
+ 50 for European and Asian lighting, 60 for American lighting.
+Default: 50 for every device.
+-------------------------------------------------------------------------------
+Name: bandingfilter
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Banding filter to reduce effects of fluorescent
+ lighting:
+ 0 disabled, 1 enabled.
+ This filter tries to reduce the pattern of horizontal
+ light/dark bands caused by some (usually fluorescent) lighting.
+Default: 0 for every device.
+-------------------------------------------------------------------------------
+Name: clockdiv
+Type: int array (min = 0, max = 32)
+Syntax: <-1|n[,...]>
+Description: Force pixel clock divisor to a specific value (for experts):
+ n may vary from 0 to 127.
+ -1 for automatic value.
+ See also the 'double_buffer' module parameter.
+Default: -1 for every device.
+-------------------------------------------------------------------------------
+Name: backlight
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Objects are lit from behind:
+ 0 = no, 1 = yes
+Default: 0 for every device.
+-------------------------------------------------------------------------------
+Name: mirror
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: Reverse image horizontally:
+ 0 = no, 1 = yes
+Default: 0 for every device.
+-------------------------------------------------------------------------------
+Name: monochrome
+Type: bool array (min = 0, max = 32)
+Syntax: <0|1[,...]>
+Description: The image sensor is monochrome:
+ 0 = no, 1 = yes
+Default: 0 for every device.
+-------------------------------------------------------------------------------
+Name: brightness
+Type: long array (min = 0, max = 32)
+Syntax: <n[,...]>
+Description: Set picture brightness (0-65535).
+ This parameter has no effect if 'autobright' is enabled.
+Default: 31000 for every device.
+-------------------------------------------------------------------------------
+Name: hue
+Type: long array (min = 0, max = 32)
+Syntax: <n[,...]>
+Description: Set picture hue (0-65535).
+Default: 32768 for every device.
+-------------------------------------------------------------------------------
+Name: colour
+Type: long array (min = 0, max = 32)
+Syntax: <n[,...]>
+Description: Set picture saturation (0-65535).
+Default: 32768 for every device.
+-------------------------------------------------------------------------------
+Name: contrast
+Type: long array (min = 0, max = 32)
+Syntax: <n[,...]>
+Description: Set picture contrast (0-65535).
+Default: 50000 for every device.
+-------------------------------------------------------------------------------
+Name: whiteness
+Type: long array (min = 0, max = 32)
+Syntax: <n[,...]>
+Description: Set picture whiteness (0-65535).
+Default: 32768 for every device.
+-------------------------------------------------------------------------------
+Name: debug
+Type: int
+Syntax: <n>
+Description: Debugging information level, from 0 to 6:
+ 0 = none (use carefully)
+ 1 = critical errors
+ 2 = significant informations
+ 3 = configuration or general messages
+ 4 = warnings
+ 5 = called functions
+ 6 = function internals
+ Level 5 and 6 are useful for testing only, when only one
+ device is used.
+Default: 2
+-------------------------------------------------------------------------------
+Name: specific_debug
+Type: bool
+Syntax: <0|1>
+Description: Enable or disable specific debugging messages:
+ 0 = print messages concerning every level <= 'debug' level.
+ 1 = print messages concerning the level indicated by 'debug'.
+Default: 0
+-------------------------------------------------------------------------------
+
+
+9. Contact information
+======================
+I may be contacted by e-mail at <luca.risolia@studio.unibo.it>.
+
+I can accept GPG/PGP encrypted e-mail. My GPG key ID is 'FCE635A4'.
+My public 1024-bit key should be available at your keyserver; the fingerprint
+is: '88E8 F32F 7244 68BA 3958 5D40 99DA 5D2A FCE6 35A4'.
+
+
+10. Credits
+==========
+The development would not have proceed much further without having looked at
+the source code of other drivers and without the help of several persons; in
+particular:
+
+- the I2C interface to kernel and high-level image sensor control routines have
+ been taken from the OV511 driver by Mark McClelland;
+
+- memory management code has been copied from the bttv driver by Ralph Metzler,
+ Marcus Metzler and Gerd Knorr;
+
+- the low-level I2C read function has been written by Frederic Jouault;
+
+- the low-level I2C fast write function has been written by Piotr Czerczak.
diff --git a/Documentation/video4linux/zc0301.txt b/Documentation/video4linux/zc0301.txt
new file mode 100644
index 0000000..befdfda
--- /dev/null
+++ b/Documentation/video4linux/zc0301.txt
@@ -0,0 +1,270 @@
+
+ ZC0301 and ZC0301P Image Processor and Control Chip
+ Driver for Linux
+ ===================================================
+
+ - Documentation -
+
+
+Index
+=====
+1. Copyright
+2. Disclaimer
+3. License
+4. Overview and features
+5. Module dependencies
+6. Module loading
+7. Module parameters
+8. Supported devices
+9. Notes for V4L2 application developers
+10. Contact information
+11. Credits
+
+
+1. Copyright
+============
+Copyright (C) 2006-2007 by Luca Risolia <luca.risolia@studio.unibo.it>
+
+
+2. Disclaimer
+=============
+This software is not developed or sponsored by Z-Star Microelectronics Corp.
+Trademarks are property of their respective owner.
+
+
+3. License
+==========
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+4. Overview and features
+========================
+This driver supports the video interface of the devices mounting the ZC0301 or
+ZC0301P Image Processors and Control Chips.
+
+The driver relies on the Video4Linux2 and USB core modules. It has been
+designed to run properly on SMP systems as well.
+
+The latest version of the ZC0301[P] driver can be found at the following URL:
+http://www.linux-projects.org/
+
+Some of the features of the driver are:
+
+- full compliance with the Video4Linux2 API (see also "Notes for V4L2
+ application developers" paragraph);
+- available mmap or read/poll methods for video streaming through isochronous
+ data transfers;
+- automatic detection of image sensor;
+- video format is standard JPEG;
+- dynamic driver control thanks to various module parameters (see "Module
+ parameters" paragraph);
+- up to 64 cameras can be handled at the same time; they can be connected and
+ disconnected from the host many times without turning off the computer, if
+ the system supports hotplugging;
+
+
+5. Module dependencies
+======================
+For it to work properly, the driver needs kernel support for Video4Linux and
+USB.
+
+The following options of the kernel configuration file must be enabled and
+corresponding modules must be compiled:
+
+ # Multimedia devices
+ #
+ CONFIG_VIDEO_DEV=m
+
+ # USB support
+ #
+ CONFIG_USB=m
+
+In addition, depending on the hardware being used, the modules below are
+necessary:
+
+ # USB Host Controller Drivers
+ #
+ CONFIG_USB_EHCI_HCD=m
+ CONFIG_USB_UHCI_HCD=m
+ CONFIG_USB_OHCI_HCD=m
+
+The ZC0301 controller also provides a built-in microphone interface. It is
+supported by the USB Audio driver thanks to the ALSA API:
+
+ # Sound
+ #
+ CONFIG_SOUND=y
+
+ # Advanced Linux Sound Architecture
+ #
+ CONFIG_SND=m
+
+ # USB devices
+ #
+ CONFIG_SND_USB_AUDIO=m
+
+And finally:
+
+ # V4L USB devices
+ #
+ CONFIG_USB_ZC0301=m
+
+
+6. Module loading
+=================
+To use the driver, it is necessary to load the "zc0301" module into memory
+after every other module required: "videodev", "v4l2_common", "compat_ioctl32",
+"usbcore" and, depending on the USB host controller you have, "ehci-hcd",
+"uhci-hcd" or "ohci-hcd".
+
+Loading can be done as shown below:
+
+ [root@localhost home]# modprobe zc0301
+
+At this point the devices should be recognized. You can invoke "dmesg" to
+analyze kernel messages and verify that the loading process has gone well:
+
+ [user@localhost home]$ dmesg
+
+
+7. Module parameters
+====================
+Module parameters are listed below:
+-------------------------------------------------------------------------------
+Name: video_nr
+Type: short array (min = 0, max = 64)
+Syntax: <-1|n[,...]>
+Description: Specify V4L2 minor mode number:
+ -1 = use next available
+ n = use minor number n
+ You can specify up to 64 cameras this way.
+ For example:
+ video_nr=-1,2,-1 would assign minor number 2 to the second
+ registered camera and use auto for the first one and for every
+ other camera.
+Default: -1
+-------------------------------------------------------------------------------
+Name: force_munmap
+Type: bool array (min = 0, max = 64)
+Syntax: <0|1[,...]>
+Description: Force the application to unmap previously mapped buffer memory
+ before calling any VIDIOC_S_CROP or VIDIOC_S_FMT ioctl's. Not
+ all the applications support this feature. This parameter is
+ specific for each detected camera.
+ 0 = do not force memory unmapping
+ 1 = force memory unmapping (save memory)
+Default: 0
+-------------------------------------------------------------------------------
+Name: frame_timeout
+Type: uint array (min = 0, max = 64)
+Syntax: <n[,...]>
+Description: Timeout for a video frame in seconds. This parameter is
+ specific for each detected camera. This parameter can be
+ changed at runtime thanks to the /sys filesystem interface.
+Default: 2
+-------------------------------------------------------------------------------
+Name: debug
+Type: ushort
+Syntax: <n>
+Description: Debugging information level, from 0 to 3:
+ 0 = none (use carefully)
+ 1 = critical errors
+ 2 = significant informations
+ 3 = more verbose messages
+ Level 3 is useful for testing only, when only one device
+ is used at the same time. It also shows some more informations
+ about the hardware being detected. This module parameter can be
+ changed at runtime thanks to the /sys filesystem interface.
+Default: 2
+-------------------------------------------------------------------------------
+
+
+8. Supported devices
+====================
+None of the names of the companies as well as their products will be mentioned
+here. They have never collaborated with the author, so no advertising.
+
+From the point of view of a driver, what unambiguously identify a device are
+its vendor and product USB identifiers. Below is a list of known identifiers of
+devices mounting the ZC0301 Image Processor and Control Chips:
+
+Vendor ID Product ID
+--------- ----------
+0x041e 0x4017
+0x041e 0x401c
+0x041e 0x401e
+0x041e 0x401f
+0x041e 0x4022
+0x041e 0x4034
+0x041e 0x4035
+0x041e 0x4036
+0x041e 0x403a
+0x0458 0x7007
+0x0458 0x700c
+0x0458 0x700f
+0x046d 0x08ae
+0x055f 0xd003
+0x055f 0xd004
+0x0ac8 0x0301
+0x0ac8 0x301b
+0x0ac8 0x303b
+0x10fd 0x0128
+0x10fd 0x8050
+0x10fd 0x804e
+
+The list above does not imply that all those devices work with this driver: up
+until now only the ones that mount the following image sensors are supported;
+kernel messages will always tell you whether this is the case:
+
+Model Manufacturer
+----- ------------
+PAS202BCB PixArt Imaging, Inc.
+PB-0330 Photobit Corporation
+
+
+9. Notes for V4L2 application developers
+========================================
+This driver follows the V4L2 API specifications. In particular, it enforces two
+rules:
+
+- exactly one I/O method, either "mmap" or "read", is associated with each
+file descriptor. Once it is selected, the application must close and reopen the
+device to switch to the other I/O method;
+
+- although it is not mandatory, previously mapped buffer memory should always
+be unmapped before calling any "VIDIOC_S_CROP" or "VIDIOC_S_FMT" ioctl's.
+The same number of buffers as before will be allocated again to match the size
+of the new video frames, so you have to map the buffers again before any I/O
+attempts on them.
+
+
+10. Contact information
+=======================
+The author may be contacted by e-mail at <luca.risolia@studio.unibo.it>.
+
+GPG/PGP encrypted e-mail's are accepted. The GPG key ID of the author is
+'FCE635A4'; the public 1024-bit key should be available at any keyserver;
+the fingerprint is: '88E8 F32F 7244 68BA 3958 5D40 99DA 5D2A FCE6 35A4'.
+
+
+11. Credits
+===========
+- Informations about the chip internals needed to enable the I2C protocol have
+ been taken from the documentation of the ZC030x Video4Linux1 driver written
+ by Andrew Birkett <andy@nobugs.org>;
+- The initialization values of the ZC0301 controller connected to the PAS202BCB
+ and PB-0330 image sensors have been taken from the SPCA5XX driver maintained
+ by Michel Xhaard <mxhaard@magic.fr>;
+- Stanislav Lechev donated one camera.
diff --git a/Documentation/video4linux/zr364xx.txt b/Documentation/video4linux/zr364xx.txt
new file mode 100644
index 0000000..5c81e3a
--- /dev/null
+++ b/Documentation/video4linux/zr364xx.txt
@@ -0,0 +1,67 @@
+Zoran 364xx based USB webcam module version 0.72
+site: http://royale.zerezo.com/zr364xx/
+mail: royale@zerezo.com
+
+introduction:
+This brings support under Linux for the Aiptek PocketDV 3300 in webcam mode.
+If you just want to get on your PC the pictures and movies on the camera, you should use the usb-storage module instead.
+The driver works with several other cameras in webcam mode (see the list below).
+Maybe this code can work for other JPEG/USB cams based on the Coach chips from Zoran?
+Possible chipsets are : ZR36430 (ZR36430BGC) and maybe ZR36431, ZR36440, ZR36442...
+You can try the experience changing the vendor/product ID values (look at the source code).
+You can get these values by looking at /var/log/messages when you plug your camera, or by typing : cat /proc/bus/usb/devices.
+If you manage to use your cam with this code, you can send me a mail (royale@zerezo.com) with the name of your cam and a patch if needed.
+This is a beta release of the driver.
+Since version 0.70, this driver is only compatible with V4L2 API and 2.6.x kernels.
+If you need V4L1 or 2.4x kernels support, please use an older version, but the code is not maintained anymore.
+Good luck!
+
+install:
+In order to use this driver, you must compile it with your kernel.
+Location: Device Drivers -> Multimedia devices -> Video For Linux -> Video Capture Adapters -> V4L USB devices
+
+usage:
+modprobe zr364xx debug=X mode=Y
+ - debug : set to 1 to enable verbose debug messages
+ - mode : 0 = 320x240, 1 = 160x120, 2 = 640x480
+You can then use the camera with V4L2 compatible applications, for example Ekiga.
+To capture a single image, try this: dd if=/dev/video0 of=test.jpg bs=1M count=1
+
+links :
+http://mxhaard.free.fr/ (support for many others cams including some Aiptek PocketDV)
+http://www.harmwal.nl/pccam880/ (this project also supports cameras based on this chipset)
+
+supported devices:
+------ ------- ----------- -----
+Vendor Product Distributor Model
+------ ------- ----------- -----
+0x08ca 0x0109 Aiptek PocketDV 3300
+0x08ca 0x0109 Maxell Maxcam PRO DV3
+0x041e 0x4024 Creative PC-CAM 880
+0x0d64 0x0108 Aiptek Fidelity 3200
+0x0d64 0x0108 Praktica DCZ 1.3 S
+0x0d64 0x0108 Genius Digital Camera (?)
+0x0d64 0x0108 DXG Technology Fashion Cam
+0x0546 0x3187 Polaroid iON 230
+0x0d64 0x3108 Praktica Exakta DC 2200
+0x0d64 0x3108 Genius G-Shot D211
+0x0595 0x4343 Concord Eye-Q Duo 1300
+0x0595 0x4343 Concord Eye-Q Duo 2000
+0x0595 0x4343 Fujifilm EX-10
+0x0595 0x4343 Ricoh RDC-6000
+0x0595 0x4343 Digitrex DSC 1300
+0x0595 0x4343 Firstline FDC 2000
+0x0bb0 0x500d Concord EyeQ Go Wireless
+0x0feb 0x2004 CRS Electronic 3.3 Digital Camera
+0x0feb 0x2004 Packard Bell DSC-300
+0x055f 0xb500 Mustek MDC 3000
+0x08ca 0x2062 Aiptek PocketDV 5700
+0x052b 0x1a18 Chiphead Megapix V12
+0x04c8 0x0729 Konica Revio 2
+0x04f2 0xa208 Creative PC-CAM 850
+0x0784 0x0040 Traveler Slimline X5
+0x06d6 0x0034 Trust Powerc@m 750
+0x0a17 0x0062 Pentax Optio 50L
+0x06d6 0x003b Trust Powerc@m 970Z
+0x0a17 0x004e Pentax Optio 50
+0x041e 0x405d Creative DiVi CAM 516
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore
new file mode 100644
index 0000000..33e8a02
--- /dev/null
+++ b/Documentation/vm/.gitignore
@@ -0,0 +1 @@
+slabinfo
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
new file mode 100644
index 0000000..2131b00
--- /dev/null
+++ b/Documentation/vm/00-INDEX
@@ -0,0 +1,20 @@
+00-INDEX
+ - this file.
+balance
+ - various information on memory balancing.
+hugetlbpage.txt
+ - a brief summary of hugetlbpage support in the Linux kernel.
+locking
+ - info on how locking and synchronization is done in the Linux vm code.
+numa
+ - information about NUMA specific code in the Linux vm.
+numa_memory_policy.txt
+ - documentation of concepts and APIs of the 2.6 memory policy support.
+overcommit-accounting
+ - description of the Linux kernels overcommit handling modes.
+page_migration
+ - description of page migration in NUMA systems.
+slabinfo.c
+ - source code for a tool to get reports about slabs.
+slub.txt
+ - a short users guide for SLUB.
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile
new file mode 100644
index 0000000..6f562f7
--- /dev/null
+++ b/Documentation/vm/Makefile
@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := slabinfo
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/Documentation/vm/balance b/Documentation/vm/balance
new file mode 100644
index 0000000..bd3d31b
--- /dev/null
+++ b/Documentation/vm/balance
@@ -0,0 +1,93 @@
+Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
+
+Memory balancing is needed for non __GFP_WAIT as well as for non
+__GFP_IO allocations.
+
+There are two reasons to be requesting non __GFP_WAIT allocations:
+the caller can not sleep (typically intr context), or does not want
+to incur cost overheads of page stealing and possible swap io for
+whatever reasons.
+
+__GFP_IO allocation requests are made to prevent file system deadlocks.
+
+In the absence of non sleepable allocation requests, it seems detrimental
+to be doing balancing. Page reclamation can be kicked off lazily, that
+is, only when needed (aka zone free memory is 0), instead of making it
+a proactive process.
+
+That being said, the kernel should try to fulfill requests for direct
+mapped pages from the direct mapped pool, instead of falling back on
+the dma pool, so as to keep the dma pool filled for dma requests (atomic
+or not). A similar argument applies to highmem and direct mapped pages.
+OTOH, if there is a lot of free dma pages, it is preferable to satisfy
+regular memory requests by allocating one from the dma pool, instead
+of incurring the overhead of regular zone balancing.
+
+In 2.2, memory balancing/page reclamation would kick off only when the
+_total_ number of free pages fell below 1/64 th of total memory. With the
+right ratio of dma and regular memory, it is quite possible that balancing
+would not be done even when the dma zone was completely empty. 2.2 has
+been running production machines of varying memory sizes, and seems to be
+doing fine even with the presence of this problem. In 2.3, due to
+HIGHMEM, this problem is aggravated.
+
+In 2.3, zone balancing can be done in one of two ways: depending on the
+zone size (and possibly of the size of lower class zones), we can decide
+at init time how many free pages we should aim for while balancing any
+zone. The good part is, while balancing, we do not need to look at sizes
+of lower class zones, the bad part is, we might do too frequent balancing
+due to ignoring possibly lower usage in the lower class zones. Also,
+with a slight change in the allocation routine, it is possible to reduce
+the memclass() macro to be a simple equality.
+
+Another possible solution is that we balance only when the free memory
+of a zone _and_ all its lower class zones falls below 1/64th of the
+total memory in the zone and its lower class zones. This fixes the 2.2
+balancing problem, and stays as close to 2.2 behavior as possible. Also,
+the balancing algorithm works the same way on the various architectures,
+which have different numbers and types of zones. If we wanted to get
+fancy, we could assign different weights to free pages in different
+zones in the future.
+
+Note that if the size of the regular zone is huge compared to dma zone,
+it becomes less significant to consider the free dma pages while
+deciding whether to balance the regular zone. The first solution
+becomes more attractive then.
+
+The appended patch implements the second solution. It also "fixes" two
+problems: first, kswapd is woken up as in 2.2 on low memory conditions
+for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
+so as to give a fighting chance for replace_with_highmem() to get a
+HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
+fall back into regular zone. This also makes sure that HIGHMEM pages
+are not leaked (for example, in situations where a HIGHMEM page is in
+the swapcache but is not being used by anyone)
+
+kswapd also needs to know about the zones it should balance. kswapd is
+primarily needed in a situation where balancing can not be done,
+probably because all allocation requests are coming from intr context
+and all process contexts are sleeping. For 2.3, kswapd does not really
+need to balance the highmem zone, since intr context does not request
+highmem pages. kswapd looks at the zone_wake_kswapd field in the zone
+structure to decide whether a zone needs balancing.
+
+Page stealing from process memory and shm is done if stealing the page would
+alleviate memory pressure on any zone in the page's node that has fallen below
+its watermark.
+
+pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are
+per-zone fields, used to determine when a zone needs to be balanced. When
+the number of pages falls below pages_min, the hysteric field low_on_memory
+gets set. This stays set till the number of free pages becomes pages_high.
+When low_on_memory is set, page allocation requests will try to free some
+pages in the zone (providing GFP_WAIT is set in the request). Orthogonal
+to this, is the decision to poke kswapd to free some zone pages. That
+decision is not hysteresis based, and is done when the number of free
+pages is below pages_low; in which case zone_wake_kswapd is also set.
+
+
+(Good) Ideas that I have heard:
+1. Dynamic experience should influence balancing: number of failed requests
+for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
+dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
new file mode 100644
index 0000000..ea8714f
--- /dev/null
+++ b/Documentation/vm/hugetlbpage.txt
@@ -0,0 +1,339 @@
+
+The intent of this file is to give a brief summary of hugetlbpage support in
+the Linux kernel. This support is built on top of multiple page size support
+that is provided by most modern architectures. For example, i386
+architecture supports 4K and 4M (2M in PAE mode) page sizes, ia64
+architecture supports multiple page sizes 4K, 8K, 64K, 256K, 1M, 4M, 16M,
+256M and ppc64 supports 4K and 16M. A TLB is a cache of virtual-to-physical
+translations. Typically this is a very scarce resource on processor.
+Operating systems try to make best use of limited number of TLB resources.
+This optimization is more critical now as bigger and bigger physical memories
+(several GBs) are more readily available.
+
+Users can use the huge page support in Linux kernel by either using the mmap
+system call or standard SYSv shared memory system calls (shmget, shmat).
+
+First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
+(present under "File systems") and CONFIG_HUGETLB_PAGE (selected
+automatically when CONFIG_HUGETLBFS is selected) configuration
+options.
+
+The kernel built with hugepage support should show the number of configured
+hugepages in the system by running the "cat /proc/meminfo" command.
+
+/proc/meminfo also provides information about the total number of hugetlb
+pages configured in the kernel. It also displays information about the
+number of free hugetlb pages at any time. It also displays information about
+the configured hugepage size - this is needed for generating the proper
+alignment and size of the arguments to the above system calls.
+
+The output of "cat /proc/meminfo" will have lines like:
+
+.....
+HugePages_Total: vvv
+HugePages_Free: www
+HugePages_Rsvd: xxx
+HugePages_Surp: yyy
+Hugepagesize: zzz kB
+
+where:
+HugePages_Total is the size of the pool of hugepages.
+HugePages_Free is the number of hugepages in the pool that are not yet
+allocated.
+HugePages_Rsvd is short for "reserved," and is the number of hugepages
+for which a commitment to allocate from the pool has been made, but no
+allocation has yet been made. It's vaguely analogous to overcommit.
+HugePages_Surp is short for "surplus," and is the number of hugepages in
+the pool above the value in /proc/sys/vm/nr_hugepages. The maximum
+number of surplus hugepages is controlled by
+/proc/sys/vm/nr_overcommit_hugepages.
+
+/proc/filesystems should also show a filesystem of type "hugetlbfs" configured
+in the kernel.
+
+/proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb
+pages in the kernel. Super user can dynamically request more (or free some
+pre-configured) hugepages.
+The allocation (or deallocation) of hugetlb pages is possible only if there are
+enough physically contiguous free pages in system (freeing of hugepages is
+possible only if there are enough hugetlb pages free that can be transferred
+back to regular memory pool).
+
+Pages that are used as hugetlb pages are reserved inside the kernel and cannot
+be used for other purposes.
+
+Once the kernel with Hugetlb page support is built and running, a user can
+use either the mmap system call or shared memory system calls to start using
+the huge pages. It is required that the system administrator preallocate
+enough memory for huge page purposes.
+
+Use the following command to dynamically allocate/deallocate hugepages:
+
+ echo 20 > /proc/sys/vm/nr_hugepages
+
+This command will try to configure 20 hugepages in the system. The success
+or failure of allocation depends on the amount of physically contiguous
+memory that is preset in system at this time. System administrators may want
+to put this command in one of the local rc init files. This will enable the
+kernel to request huge pages early in the boot process (when the possibility
+of getting physical contiguous pages is still very high). In either
+case, administrators will want to verify the number of hugepages actually
+allocated by checking the sysctl or meminfo.
+
+/proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of
+hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are
+requested by applications. echo'ing any non-zero value into this file
+indicates that the hugetlb subsystem is allowed to try to obtain
+hugepages from the buddy allocator, if the normal pool is exhausted. As
+these surplus hugepages go out of use, they are freed back to the buddy
+allocator.
+
+Caveat: Shrinking the pool via nr_hugepages such that it becomes less
+than the number of hugepages in use will convert the balance to surplus
+huge pages even if it would exceed the overcommit value. As long as
+this condition holds, however, no more surplus huge pages will be
+allowed on the system until one of the two sysctls are increased
+sufficiently, or the surplus huge pages go out of use and are freed.
+
+With support for multiple hugepage pools at run-time available, much of
+the hugepage userspace interface has been duplicated in sysfs. The above
+information applies to the default hugepage size (which will be
+controlled by the proc interfaces for backwards compatibility). The root
+hugepage control directory is
+
+ /sys/kernel/mm/hugepages
+
+For each hugepage size supported by the running kernel, a subdirectory
+will exist, of the form
+
+ hugepages-${size}kB
+
+Inside each of these directories, the same set of files will exist:
+
+ nr_hugepages
+ nr_overcommit_hugepages
+ free_hugepages
+ resv_hugepages
+ surplus_hugepages
+
+which function as described above for the default hugepage-sized case.
+
+If the user applications are going to request hugepages using mmap system
+call, then it is required that system administrator mount a file system of
+type hugetlbfs:
+
+ mount -t hugetlbfs \
+ -o uid=<value>,gid=<value>,mode=<value>,size=<value>,nr_inodes=<value> \
+ none /mnt/huge
+
+This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
+/mnt/huge. Any files created on /mnt/huge uses hugepages. The uid and gid
+options sets the owner and group of the root of the file system. By default
+the uid and gid of the current process are taken. The mode option sets the
+mode of root of file system to value & 0777. This value is given in octal.
+By default the value 0755 is picked. The size option sets the maximum value of
+memory (huge pages) allowed for that filesystem (/mnt/huge). The size is
+rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of
+inodes that /mnt/huge can use. If the size or nr_inodes option is not
+provided on command line then no limits are set. For size and nr_inodes
+options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For
+example, size=2K has the same meaning as size=2048.
+
+While read system calls are supported on files that reside on hugetlb
+file systems, write system calls are not.
+
+Regular chown, chgrp, and chmod commands (with right permissions) could be
+used to change the file attributes on hugetlbfs.
+
+Also, it is important to note that no such mount command is required if the
+applications are going to use only shmat/shmget system calls. Users who
+wish to use hugetlb page via shared memory segment should be a member of
+a supplementary group and system admin needs to configure that gid into
+/proc/sys/vm/hugetlb_shm_group. It is possible for same or different
+applications to use any combination of mmaps and shm* calls, though the
+mount of filesystem will be required for using mmap calls.
+
+*******************************************************************
+
+/*
+ * Example of using hugepage memory in a user application using Sys V shared
+ * memory system calls. In this example the app is requesting 256MB of
+ * memory that is backed by huge pages. The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting hugepages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * hugepages. That means the addresses starting with 0x800000... will need
+ * to be specified. Specifying a fixed address is not required on ppc64,
+ * i386 or x86_64.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x) printf(x)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define SHMAT_FLAGS (SHM_RND)
+#else
+#define ADDR (void *)(0x0UL)
+#define SHMAT_FLAGS (0)
+#endif
+
+int main(void)
+{
+ int shmid;
+ unsigned long i;
+ char *shmaddr;
+
+ if ((shmid = shmget(2, LENGTH,
+ SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+ perror("shmget");
+ exit(1);
+ }
+ printf("shmid: 0x%x\n", shmid);
+
+ shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
+ if (shmaddr == (char *)-1) {
+ perror("Shared memory attach failure");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(2);
+ }
+ printf("shmaddr: %p\n", shmaddr);
+
+ dprintf("Starting the writes:\n");
+ for (i = 0; i < LENGTH; i++) {
+ shmaddr[i] = (char)(i);
+ if (!(i % (1024 * 1024)))
+ dprintf(".");
+ }
+ dprintf("\n");
+
+ dprintf("Starting the Check...");
+ for (i = 0; i < LENGTH; i++)
+ if (shmaddr[i] != (char)i)
+ printf("\nIndex %lu mismatched\n", i);
+ dprintf("Done.\n");
+
+ if (shmdt((const void *)shmaddr) != 0) {
+ perror("Detach failure");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(3);
+ }
+
+ shmctl(shmid, IPC_RMID, NULL);
+
+ return 0;
+}
+
+*******************************************************************
+
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call. Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified. Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define FILE_NAME "/mnt/hugepagefile"
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+void check_bytes(char *addr)
+{
+ printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+void write_bytes(char *addr)
+{
+ unsigned long i;
+
+ for (i = 0; i < LENGTH; i++)
+ *(addr + i) = (char)i;
+}
+
+void read_bytes(char *addr)
+{
+ unsigned long i;
+
+ check_bytes(addr);
+ for (i = 0; i < LENGTH; i++)
+ if (*(addr + i) != (char)i) {
+ printf("Mismatch at %lu\n", i);
+ break;
+ }
+}
+
+int main(void)
+{
+ void *addr;
+ int fd;
+
+ fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
+ if (fd < 0) {
+ perror("Open failed");
+ exit(1);
+ }
+
+ addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ unlink(FILE_NAME);
+ exit(1);
+ }
+
+ printf("Returned address is %p\n", addr);
+ check_bytes(addr);
+ write_bytes(addr);
+ read_bytes(addr);
+
+ munmap(addr, LENGTH);
+ close(fd);
+ unlink(FILE_NAME);
+
+ return 0;
+}
diff --git a/Documentation/vm/locking b/Documentation/vm/locking
new file mode 100644
index 0000000..f366fa9
--- /dev/null
+++ b/Documentation/vm/locking
@@ -0,0 +1,130 @@
+Started Oct 1999 by Kanoj Sarcar <kanojsarcar@yahoo.com>
+
+The intent of this file is to have an uptodate, running commentary
+from different people about how locking and synchronization is done
+in the Linux vm code.
+
+page_table_lock & mmap_sem
+--------------------------------------
+
+Page stealers pick processes out of the process pool and scan for
+the best process to steal pages from. To guarantee the existence
+of the victim mm, a mm_count inc and a mmdrop are done in swap_out().
+Page stealers hold kernel_lock to protect against a bunch of races.
+The vma list of the victim mm is also scanned by the stealer,
+and the page_table_lock is used to preserve list sanity against the
+process adding/deleting to the list. This also guarantees existence
+of the vma. Vma existence is not guaranteed once try_to_swap_out()
+drops the page_table_lock. To guarantee the existence of the underlying
+file structure, a get_file is done before the swapout() method is
+invoked. The page passed into swapout() is guaranteed not to be reused
+for a different purpose because the page reference count due to being
+present in the user's pte is not released till after swapout() returns.
+
+Any code that modifies the vmlist, or the vm_start/vm_end/
+vm_flags:VM_LOCKED/vm_next of any vma *in the list* must prevent
+kswapd from looking at the chain.
+
+The rules are:
+1. To scan the vmlist (look but don't touch) you must hold the
+ mmap_sem with read bias, i.e. down_read(&mm->mmap_sem)
+2. To modify the vmlist you need to hold the mmap_sem with
+ read&write bias, i.e. down_write(&mm->mmap_sem) *AND*
+ you need to take the page_table_lock.
+3. The swapper takes _just_ the page_table_lock, this is done
+ because the mmap_sem can be an extremely long lived lock
+ and the swapper just cannot sleep on that.
+4. The exception to this rule is expand_stack, which just
+ takes the read lock and the page_table_lock, this is ok
+ because it doesn't really modify fields anybody relies on.
+5. You must be able to guarantee that while holding page_table_lock
+ or page_table_lock of mm A, you will not try to get either lock
+ for mm B.
+
+The caveats are:
+1. find_vma() makes use of, and updates, the mmap_cache pointer hint.
+The update of mmap_cache is racy (page stealer can race with other code
+that invokes find_vma with mmap_sem held), but that is okay, since it
+is a hint. This can be fixed, if desired, by having find_vma grab the
+page_table_lock.
+
+
+Code that add/delete elements from the vmlist chain are
+1. callers of insert_vm_struct
+2. callers of merge_segments
+3. callers of avl_remove
+
+Code that changes vm_start/vm_end/vm_flags:VM_LOCKED of vma's on
+the list:
+1. expand_stack
+2. mprotect
+3. mlock
+4. mremap
+
+It is advisable that changes to vm_start/vm_end be protected, although
+in some cases it is not really needed. Eg, vm_start is modified by
+expand_stack(), it is hard to come up with a destructive scenario without
+having the vmlist protection in this case.
+
+The page_table_lock nests with the inode i_mmap_lock and the kmem cache
+c_spinlock spinlocks. This is okay, since the kmem code asks for pages after
+dropping c_spinlock. The page_table_lock also nests with pagecache_lock and
+pagemap_lru_lock spinlocks, and no code asks for memory with these locks
+held.
+
+The page_table_lock is grabbed while holding the kernel_lock spinning monitor.
+
+The page_table_lock is a spin lock.
+
+Note: PTL can also be used to guarantee that no new clones using the
+mm start up ... this is a loose form of stability on mm_users. For
+example, it is used in copy_mm to protect against a racing tlb_gather_mmu
+single address space optimization, so that the zap_page_range (from
+vmtruncate) does not lose sending ipi's to cloned threads that might
+be spawned underneath it and go to user mode to drag in pte's into tlbs.
+
+swap_lock
+--------------
+The swap devices are chained in priority order from the "swap_list" header.
+The "swap_list" is used for the round-robin swaphandle allocation strategy.
+The #free swaphandles is maintained in "nr_swap_pages". These two together
+are protected by the swap_lock.
+
+The swap_lock also protects all the device reference counts on the
+corresponding swaphandles, maintained in the "swap_map" array, and the
+"highest_bit" and "lowest_bit" fields.
+
+The swap_lock is a spinlock, and is never acquired from intr level.
+
+To prevent races between swap space deletion or async readahead swapins
+deciding whether a swap handle is being used, ie worthy of being read in
+from disk, and an unmap -> swap_free making the handle unused, the swap
+delete and readahead code grabs a temp reference on the swaphandle to
+prevent warning messages from swap_duplicate <- read_swap_cache_async.
+
+Swap cache locking
+------------------
+Pages are added into the swap cache with kernel_lock held, to make sure
+that multiple pages are not being added (and hence lost) by associating
+all of them with the same swaphandle.
+
+Pages are guaranteed not to be removed from the scache if the page is
+"shared": ie, other processes hold reference on the page or the associated
+swap handle. The only code that does not follow this rule is shrink_mmap,
+which deletes pages from the swap cache if no process has a reference on
+the page (multiple processes might have references on the corresponding
+swap handle though). lookup_swap_cache() races with shrink_mmap, when
+establishing a reference on a scache page, so, it must check whether the
+page it located is still in the swapcache, or shrink_mmap deleted it.
+(This race is due to the fact that shrink_mmap looks at the page ref
+count with pagecache_lock, but then drops pagecache_lock before deleting
+the page from the scache).
+
+do_wp_page and do_swap_page have MP races in them while trying to figure
+out whether a page is "shared", by looking at the page_count + swap_count.
+To preserve the sum of the counts, the page lock _must_ be acquired before
+calling is_page_shared (else processes might switch their swap_count refs
+to the page count refs, after the page count ref has been snapshotted).
+
+Swap device deletion code currently breaks all the scache assumptions,
+since it grabs neither mmap_sem nor page_table_lock.
diff --git a/Documentation/vm/numa b/Documentation/vm/numa
new file mode 100644
index 0000000..e93ad94
--- /dev/null
+++ b/Documentation/vm/numa
@@ -0,0 +1,41 @@
+Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
+
+The intent of this file is to have an uptodate, running commentary
+from different people about NUMA specific code in the Linux vm.
+
+What is NUMA? It is an architecture where the memory access times
+for different regions of memory from a given processor varies
+according to the "distance" of the memory region from the processor.
+Each region of memory to which access times are the same from any
+cpu, is called a node. On such architectures, it is beneficial if
+the kernel tries to minimize inter node communications. Schemes
+for this range from kernel text and read-only data replication
+across nodes, and trying to house all the data structures that
+key components of the kernel need on memory on that node.
+
+Currently, all the numa support is to provide efficient handling
+of widely discontiguous physical memory, so architectures which
+are not NUMA but can have huge holes in the physical address space
+can use the same code. All this code is bracketed by CONFIG_DISCONTIGMEM.
+
+The initial port includes NUMAizing the bootmem allocator code by
+encapsulating all the pieces of information into a bootmem_data_t
+structure. Node specific calls have been added to the allocator.
+In theory, any platform which uses the bootmem allocator should
+be able to put the bootmem and mem_map data structures anywhere
+it deems best.
+
+Each node's page allocation data structures have also been encapsulated
+into a pg_data_t. The bootmem_data_t is just one part of this. To
+make the code look uniform between NUMA and regular UMA platforms,
+UMA platforms have a statically allocated pg_data_t too (contig_page_data).
+For the sake of uniformity, the function num_online_nodes() is also defined
+for all platforms. As we run benchmarks, we might decide to NUMAize
+more variables like low_on_memory, nr_free_pages etc into the pg_data_t.
+
+The NUMA aware page allocation code currently tries to allocate pages
+from different nodes in a round robin manner. This will be changed to
+do concentratic circle search, starting from current node, once the
+NUMA port achieves more maturity. The call alloc_pages_node has been
+added, so that drivers can make the call and not worry about whether
+it is running on a NUMA or UMA platform.
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
new file mode 100644
index 0000000..6aaaeb3
--- /dev/null
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -0,0 +1,452 @@
+
+What is Linux Memory Policy?
+
+In the Linux kernel, "memory policy" determines from which node the kernel will
+allocate memory in a NUMA system or in an emulated NUMA system. Linux has
+supported platforms with Non-Uniform Memory Access architectures since 2.4.?.
+The current memory policy support was added to Linux 2.6 around May 2004. This
+document attempts to describe the concepts and APIs of the 2.6 memory policy
+support.
+
+Memory policies should not be confused with cpusets (Documentation/cpusets.txt)
+which is an administrative mechanism for restricting the nodes from which
+memory may be allocated by a set of processes. Memory policies are a
+programming interface that a NUMA-aware application can take advantage of. When
+both cpusets and policies are applied to a task, the restrictions of the cpuset
+takes priority. See "MEMORY POLICIES AND CPUSETS" below for more details.
+
+MEMORY POLICY CONCEPTS
+
+Scope of Memory Policies
+
+The Linux kernel supports _scopes_ of memory policy, described here from
+most general to most specific:
+
+ System Default Policy: this policy is "hard coded" into the kernel. It
+ is the policy that governs all page allocations that aren't controlled
+ by one of the more specific policy scopes discussed below. When the
+ system is "up and running", the system default policy will use "local
+ allocation" described below. However, during boot up, the system
+ default policy will be set to interleave allocations across all nodes
+ with "sufficient" memory, so as not to overload the initial boot node
+ with boot-time allocations.
+
+ Task/Process Policy: this is an optional, per-task policy. When defined
+ for a specific task, this policy controls all page allocations made by or
+ on behalf of the task that aren't controlled by a more specific scope.
+ If a task does not define a task policy, then all page allocations that
+ would have been controlled by the task policy "fall back" to the System
+ Default Policy.
+
+ The task policy applies to the entire address space of a task. Thus,
+ it is inheritable, and indeed is inherited, across both fork()
+ [clone() w/o the CLONE_VM flag] and exec*(). This allows a parent task
+ to establish the task policy for a child task exec()'d from an
+ executable image that has no awareness of memory policy. See the
+ MEMORY POLICY APIS section, below, for an overview of the system call
+ that a task may use to set/change it's task/process policy.
+
+ In a multi-threaded task, task policies apply only to the thread
+ [Linux kernel task] that installs the policy and any threads
+ subsequently created by that thread. Any sibling threads existing
+ at the time a new task policy is installed retain their current
+ policy.
+
+ A task policy applies only to pages allocated after the policy is
+ installed. Any pages already faulted in by the task when the task
+ changes its task policy remain where they were allocated based on
+ the policy at the time they were allocated.
+
+ VMA Policy: A "VMA" or "Virtual Memory Area" refers to a range of a task's
+ virtual address space. A task may define a specific policy for a range
+ of its virtual address space. See the MEMORY POLICIES APIS section,
+ below, for an overview of the mbind() system call used to set a VMA
+ policy.
+
+ A VMA policy will govern the allocation of pages that back this region of
+ the address space. Any regions of the task's address space that don't
+ have an explicit VMA policy will fall back to the task policy, which may
+ itself fall back to the System Default Policy.
+
+ VMA policies have a few complicating details:
+
+ VMA policy applies ONLY to anonymous pages. These include pages
+ allocated for anonymous segments, such as the task stack and heap, and
+ any regions of the address space mmap()ed with the MAP_ANONYMOUS flag.
+ If a VMA policy is applied to a file mapping, it will be ignored if
+ the mapping used the MAP_SHARED flag. If the file mapping used the
+ MAP_PRIVATE flag, the VMA policy will only be applied when an
+ anonymous page is allocated on an attempt to write to the mapping--
+ i.e., at Copy-On-Write.
+
+ VMA policies are shared between all tasks that share a virtual address
+ space--a.k.a. threads--independent of when the policy is installed; and
+ they are inherited across fork(). However, because VMA policies refer
+ to a specific region of a task's address space, and because the address
+ space is discarded and recreated on exec*(), VMA policies are NOT
+ inheritable across exec(). Thus, only NUMA-aware applications may
+ use VMA policies.
+
+ A task may install a new VMA policy on a sub-range of a previously
+ mmap()ed region. When this happens, Linux splits the existing virtual
+ memory area into 2 or 3 VMAs, each with it's own policy.
+
+ By default, VMA policy applies only to pages allocated after the policy
+ is installed. Any pages already faulted into the VMA range remain
+ where they were allocated based on the policy at the time they were
+ allocated. However, since 2.6.16, Linux supports page migration via
+ the mbind() system call, so that page contents can be moved to match
+ a newly installed policy.
+
+ Shared Policy: Conceptually, shared policies apply to "memory objects"
+ mapped shared into one or more tasks' distinct address spaces. An
+ application installs a shared policies the same way as VMA policies--using
+ the mbind() system call specifying a range of virtual addresses that map
+ the shared object. However, unlike VMA policies, which can be considered
+ to be an attribute of a range of a task's address space, shared policies
+ apply directly to the shared object. Thus, all tasks that attach to the
+ object share the policy, and all pages allocated for the shared object,
+ by any task, will obey the shared policy.
+
+ As of 2.6.22, only shared memory segments, created by shmget() or
+ mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. When shared
+ policy support was added to Linux, the associated data structures were
+ added to hugetlbfs shmem segments. At the time, hugetlbfs did not
+ support allocation at fault time--a.k.a lazy allocation--so hugetlbfs
+ shmem segments were never "hooked up" to the shared policy support.
+ Although hugetlbfs segments now support lazy allocation, their support
+ for shared policy has not been completed.
+
+ As mentioned above [re: VMA policies], allocations of page cache
+ pages for regular files mmap()ed with MAP_SHARED ignore any VMA
+ policy installed on the virtual address range backed by the shared
+ file mapping. Rather, shared page cache pages, including pages backing
+ private mappings that have not yet been written by the task, follow
+ task policy, if any, else System Default Policy.
+
+ The shared policy infrastructure supports different policies on subset
+ ranges of the shared object. However, Linux still splits the VMA of
+ the task that installs the policy for each range of distinct policy.
+ Thus, different tasks that attach to a shared memory segment can have
+ different VMA configurations mapping that one shared object. This
+ can be seen by examining the /proc/<pid>/numa_maps of tasks sharing
+ a shared memory region, when one task has installed shared policy on
+ one or more ranges of the region.
+
+Components of Memory Policies
+
+ A Linux memory policy consists of a "mode", optional mode flags, and an
+ optional set of nodes. The mode determines the behavior of the policy,
+ the optional mode flags determine the behavior of the mode, and the
+ optional set of nodes can be viewed as the arguments to the policy
+ behavior.
+
+ Internally, memory policies are implemented by a reference counted
+ structure, struct mempolicy. Details of this structure will be discussed
+ in context, below, as required to explain the behavior.
+
+ Linux memory policy supports the following 4 behavioral modes:
+
+ Default Mode--MPOL_DEFAULT: This mode is only used in the memory
+ policy APIs. Internally, MPOL_DEFAULT is converted to the NULL
+ memory policy in all policy scopes. Any existing non-default policy
+ will simply be removed when MPOL_DEFAULT is specified. As a result,
+ MPOL_DEFAULT means "fall back to the next most specific policy scope."
+
+ For example, a NULL or default task policy will fall back to the
+ system default policy. A NULL or default vma policy will fall
+ back to the task policy.
+
+ When specified in one of the memory policy APIs, the Default mode
+ does not use the optional set of nodes.
+
+ It is an error for the set of nodes specified for this policy to
+ be non-empty.
+
+ MPOL_BIND: This mode specifies that memory must come from the
+ set of nodes specified by the policy. Memory will be allocated from
+ the node in the set with sufficient free memory that is closest to
+ the node where the allocation takes place.
+
+ MPOL_PREFERRED: This mode specifies that the allocation should be
+ attempted from the single node specified in the policy. If that
+ allocation fails, the kernel will search other nodes, in order of
+ increasing distance from the preferred node based on information
+ provided by the platform firmware.
+ containing the cpu where the allocation takes place.
+
+ Internally, the Preferred policy uses a single node--the
+ preferred_node member of struct mempolicy. When the internal
+ mode flag MPOL_F_LOCAL is set, the preferred_node is ignored and
+ the policy is interpreted as local allocation. "Local" allocation
+ policy can be viewed as a Preferred policy that starts at the node
+ containing the cpu where the allocation takes place.
+
+ It is possible for the user to specify that local allocation is
+ always preferred by passing an empty nodemask with this mode.
+ If an empty nodemask is passed, the policy cannot use the
+ MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags described
+ below.
+
+ MPOL_INTERLEAVED: This mode specifies that page allocations be
+ interleaved, on a page granularity, across the nodes specified in
+ the policy. This mode also behaves slightly differently, based on
+ the context where it is used:
+
+ For allocation of anonymous pages and shared memory pages,
+ Interleave mode indexes the set of nodes specified by the policy
+ using the page offset of the faulting address into the segment
+ [VMA] containing the address modulo the number of nodes specified
+ by the policy. It then attempts to allocate a page, starting at
+ the selected node, as if the node had been specified by a Preferred
+ policy or had been selected by a local allocation. That is,
+ allocation will follow the per node zonelist.
+
+ For allocation of page cache pages, Interleave mode indexes the set
+ of nodes specified by the policy using a node counter maintained
+ per task. This counter wraps around to the lowest specified node
+ after it reaches the highest specified node. This will tend to
+ spread the pages out over the nodes specified by the policy based
+ on the order in which they are allocated, rather than based on any
+ page offset into an address range or file. During system boot up,
+ the temporary interleaved system default policy works in this
+ mode.
+
+ Linux memory policy supports the following optional mode flags:
+
+ MPOL_F_STATIC_NODES: This flag specifies that the nodemask passed by
+ the user should not be remapped if the task or VMA's set of allowed
+ nodes changes after the memory policy has been defined.
+
+ Without this flag, anytime a mempolicy is rebound because of a
+ change in the set of allowed nodes, the node (Preferred) or
+ nodemask (Bind, Interleave) is remapped to the new set of
+ allowed nodes. This may result in nodes being used that were
+ previously undesired.
+
+ With this flag, if the user-specified nodes overlap with the
+ nodes allowed by the task's cpuset, then the memory policy is
+ applied to their intersection. If the two sets of nodes do not
+ overlap, the Default policy is used.
+
+ For example, consider a task that is attached to a cpuset with
+ mems 1-3 that sets an Interleave policy over the same set. If
+ the cpuset's mems change to 3-5, the Interleave will now occur
+ over nodes 3, 4, and 5. With this flag, however, since only node
+ 3 is allowed from the user's nodemask, the "interleave" only
+ occurs over that node. If no nodes from the user's nodemask are
+ now allowed, the Default behavior is used.
+
+ MPOL_F_STATIC_NODES cannot be combined with the
+ MPOL_F_RELATIVE_NODES flag. It also cannot be used for
+ MPOL_PREFERRED policies that were created with an empty nodemask
+ (local allocation).
+
+ MPOL_F_RELATIVE_NODES: This flag specifies that the nodemask passed
+ by the user will be mapped relative to the set of the task or VMA's
+ set of allowed nodes. The kernel stores the user-passed nodemask,
+ and if the allowed nodes changes, then that original nodemask will
+ be remapped relative to the new set of allowed nodes.
+
+ Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+ mempolicy is rebound because of a change in the set of allowed
+ nodes, the node (Preferred) or nodemask (Bind, Interleave) is
+ remapped to the new set of allowed nodes. That remap may not
+ preserve the relative nature of the user's passed nodemask to its
+ set of allowed nodes upon successive rebinds: a nodemask of
+ 1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
+ allowed nodes is restored to its original state.
+
+ With this flag, the remap is done so that the node numbers from
+ the user's passed nodemask are relative to the set of allowed
+ nodes. In other words, if nodes 0, 2, and 4 are set in the user's
+ nodemask, the policy will be effected over the first (and in the
+ Bind or Interleave case, the third and fifth) nodes in the set of
+ allowed nodes. The nodemask passed by the user represents nodes
+ relative to task or VMA's set of allowed nodes.
+
+ If the user's nodemask includes nodes that are outside the range
+ of the new set of allowed nodes (for example, node 5 is set in
+ the user's nodemask when the set of allowed nodes is only 0-3),
+ then the remap wraps around to the beginning of the nodemask and,
+ if not already set, sets the node in the mempolicy nodemask.
+
+ For example, consider a task that is attached to a cpuset with
+ mems 2-5 that sets an Interleave policy over the same set with
+ MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the
+ interleave now occurs over nodes 3,5-6. If the cpuset's mems
+ then change to 0,2-3,5, then the interleave occurs over nodes
+ 0,3,5.
+
+ Thanks to the consistent remapping, applications preparing
+ nodemasks to specify memory policies using this flag should
+ disregard their current, actual cpuset imposed memory placement
+ and prepare the nodemask as if they were always located on
+ memory nodes 0 to N-1, where N is the number of memory nodes the
+ policy is intended to manage. Let the kernel then remap to the
+ set of memory nodes allowed by the task's cpuset, as that may
+ change over time.
+
+ MPOL_F_RELATIVE_NODES cannot be combined with the
+ MPOL_F_STATIC_NODES flag. It also cannot be used for
+ MPOL_PREFERRED policies that were created with an empty nodemask
+ (local allocation).
+
+MEMORY POLICY REFERENCE COUNTING
+
+To resolve use/free races, struct mempolicy contains an atomic reference
+count field. Internal interfaces, mpol_get()/mpol_put() increment and
+decrement this reference count, respectively. mpol_put() will only free
+the structure back to the mempolicy kmem cache when the reference count
+goes to zero.
+
+When a new memory policy is allocated, it's reference count is initialized
+to '1', representing the reference held by the task that is installing the
+new policy. When a pointer to a memory policy structure is stored in another
+structure, another reference is added, as the task's reference will be dropped
+on completion of the policy installation.
+
+During run-time "usage" of the policy, we attempt to minimize atomic operations
+on the reference count, as this can lead to cache lines bouncing between cpus
+and NUMA nodes. "Usage" here means one of the following:
+
+1) querying of the policy, either by the task itself [using the get_mempolicy()
+ API discussed below] or by another task using the /proc/<pid>/numa_maps
+ interface.
+
+2) examination of the policy to determine the policy mode and associated node
+ or node lists, if any, for page allocation. This is considered a "hot
+ path". Note that for MPOL_BIND, the "usage" extends across the entire
+ allocation process, which may sleep during page reclaimation, because the
+ BIND policy nodemask is used, by reference, to filter ineligible nodes.
+
+We can avoid taking an extra reference during the usages listed above as
+follows:
+
+1) we never need to get/free the system default policy as this is never
+ changed nor freed, once the system is up and running.
+
+2) for querying the policy, we do not need to take an extra reference on the
+ target task's task policy nor vma policies because we always acquire the
+ task's mm's mmap_sem for read during the query. The set_mempolicy() and
+ mbind() APIs [see below] always acquire the mmap_sem for write when
+ installing or replacing task or vma policies. Thus, there is no possibility
+ of a task or thread freeing a policy while another task or thread is
+ querying it.
+
+3) Page allocation usage of task or vma policy occurs in the fault path where
+ we hold them mmap_sem for read. Again, because replacing the task or vma
+ policy requires that the mmap_sem be held for write, the policy can't be
+ freed out from under us while we're using it for page allocation.
+
+4) Shared policies require special consideration. One task can replace a
+ shared memory policy while another task, with a distinct mmap_sem, is
+ querying or allocating a page based on the policy. To resolve this
+ potential race, the shared policy infrastructure adds an extra reference
+ to the shared policy during lookup while holding a spin lock on the shared
+ policy management structure. This requires that we drop this extra
+ reference when we're finished "using" the policy. We must drop the
+ extra reference on shared policies in the same query/allocation paths
+ used for non-shared policies. For this reason, shared policies are marked
+ as such, and the extra reference is dropped "conditionally"--i.e., only
+ for shared policies.
+
+ Because of this extra reference counting, and because we must lookup
+ shared policies in a tree structure under spinlock, shared policies are
+ more expensive to use in the page allocation path. This is especially
+ true for shared policies on shared memory regions shared by tasks running
+ on different NUMA nodes. This extra overhead can be avoided by always
+ falling back to task or system default policy for shared memory regions,
+ or by prefaulting the entire shared memory region into memory and locking
+ it down. However, this might not be appropriate for all applications.
+
+MEMORY POLICY APIs
+
+Linux supports 3 system calls for controlling memory policy. These APIS
+always affect only the calling task, the calling task's address space, or
+some shared object mapped into the calling task's address space.
+
+ Note: the headers that define these APIs and the parameter data types
+ for user space applications reside in a package that is not part of
+ the Linux kernel. The kernel system call interfaces, with the 'sys_'
+ prefix, are defined in <linux/syscalls.h>; the mode and flag
+ definitions are defined in <linux/mempolicy.h>.
+
+Set [Task] Memory Policy:
+
+ long set_mempolicy(int mode, const unsigned long *nmask,
+ unsigned long maxnode);
+
+ Set's the calling task's "task/process memory policy" to mode
+ specified by the 'mode' argument and the set of nodes defined
+ by 'nmask'. 'nmask' points to a bit mask of node ids containing
+ at least 'maxnode' ids. Optional mode flags may be passed by
+ combining the 'mode' argument with the flag (for example:
+ MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).
+
+ See the set_mempolicy(2) man page for more details
+
+
+Get [Task] Memory Policy or Related Information
+
+ long get_mempolicy(int *mode,
+ const unsigned long *nmask, unsigned long maxnode,
+ void *addr, int flags);
+
+ Queries the "task/process memory policy" of the calling task, or
+ the policy or location of a specified virtual address, depending
+ on the 'flags' argument.
+
+ See the get_mempolicy(2) man page for more details
+
+
+Install VMA/Shared Policy for a Range of Task's Address Space
+
+ long mbind(void *start, unsigned long len, int mode,
+ const unsigned long *nmask, unsigned long maxnode,
+ unsigned flags);
+
+ mbind() installs the policy specified by (mode, nmask, maxnodes) as
+ a VMA policy for the range of the calling task's address space
+ specified by the 'start' and 'len' arguments. Additional actions
+ may be requested via the 'flags' argument.
+
+ See the mbind(2) man page for more details.
+
+MEMORY POLICY COMMAND LINE INTERFACE
+
+Although not strictly part of the Linux implementation of memory policy,
+a command line tool, numactl(8), exists that allows one to:
+
++ set the task policy for a specified program via set_mempolicy(2), fork(2) and
+ exec(2)
+
++ set the shared policy for a shared memory segment via mbind(2)
+
+The numactl(8) tool is packages with the run-time version of the library
+containing the memory policy system call wrappers. Some distributions
+package the headers and compile-time libraries in a separate development
+package.
+
+
+MEMORY POLICIES AND CPUSETS
+
+Memory policies work within cpusets as described above. For memory policies
+that require a node or set of nodes, the nodes are restricted to the set of
+nodes whose memories are allowed by the cpuset constraints. If the nodemask
+specified for the policy contains nodes that are not allowed by the cpuset and
+MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
+specified for the policy and the set of nodes with memory is used. If the
+result is the empty set, the policy is considered invalid and cannot be
+installed. If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
+onto and folded into the task's set of allowed nodes as previously described.
+
+The interaction of memory policies and cpusets can be problematic when tasks
+in two cpusets share access to a memory region, such as shared memory segments
+created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
+any of the tasks install shared policy on the region, only nodes whose
+memories are allowed in both cpusets may be used in the policies. Obtaining
+this information requires "stepping outside" the memory policy APIs to use the
+cpuset information and requires that one know in what cpusets other task might
+be attaching to the shared region. Furthermore, if the cpusets' allowed
+memory sets are disjoint, "local" allocation is the only valid policy.
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
new file mode 100644
index 0000000..21c7b1f
--- /dev/null
+++ b/Documentation/vm/overcommit-accounting
@@ -0,0 +1,73 @@
+The Linux kernel supports the following overcommit handling modes
+
+0 - Heuristic overcommit handling. Obvious overcommits of
+ address space are refused. Used for a typical system. It
+ ensures a seriously wild allocation fails while allowing
+ overcommit to reduce swap usage. root is allowed to
+ allocate slighly more memory in this mode. This is the
+ default.
+
+1 - Always overcommit. Appropriate for some scientific
+ applications.
+
+2 - Don't overcommit. The total address space commit
+ for the system is not permitted to exceed swap + a
+ configurable percentage (default is 50) of physical RAM.
+ Depending on the percentage you use, in most situations
+ this means a process will not be killed while accessing
+ pages but will receive errors on memory allocation as
+ appropriate.
+
+The overcommit policy is set via the sysctl `vm.overcommit_memory'.
+
+The overcommit percentage is set via `vm.overcommit_ratio'.
+
+The current overcommit limit and amount committed are viewable in
+/proc/meminfo as CommitLimit and Committed_AS respectively.
+
+Gotchas
+-------
+
+The C language stack growth does an implicit mremap. If you want absolute
+guarantees and run close to the edge you MUST mmap your stack for the
+largest size you think you will need. For typical stack usage this does
+not matter much but it's a corner case if you really really care
+
+In mode 2 the MAP_NORESERVE flag is ignored.
+
+
+How It Works
+------------
+
+The overcommit is based on the following rules
+
+For a file backed map
+ SHARED or READ-only - 0 cost (the file is the map not swap)
+ PRIVATE WRITABLE - size of mapping per instance
+
+For an anonymous or /dev/zero map
+ SHARED - size of mapping
+ PRIVATE READ-only - 0 cost (but of little use)
+ PRIVATE WRITABLE - size of mapping per instance
+
+Additional accounting
+ Pages made writable copies by mmap
+ shmfs memory drawn from the same pool
+
+Status
+------
+
+o We account mmap memory mappings
+o We account mprotect changes in commit
+o We account mremap changes in size
+o We account brk
+o We account munmap
+o We report the commit status in /proc
+o Account and check on fork
+o Review stack handling/building on exec
+o SHMfs accounting
+o Implement actual limit enforcement
+
+To Do
+-----
+o Account ptrace pages (this is hard)
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
new file mode 100644
index 0000000..d5fdfd3
--- /dev/null
+++ b/Documentation/vm/page_migration
@@ -0,0 +1,148 @@
+Page migration
+--------------
+
+Page migration allows the moving of the physical location of pages between
+nodes in a numa system while the process is running. This means that the
+virtual addresses that the process sees do not change. However, the
+system rearranges the physical location of those pages.
+
+The main intend of page migration is to reduce the latency of memory access
+by moving pages near to the processor where the process accessing that memory
+is running.
+
+Page migration allows a process to manually relocate the node on which its
+pages are located through the MF_MOVE and MF_MOVE_ALL options while setting
+a new memory policy via mbind(). The pages of process can also be relocated
+from another process using the sys_migrate_pages() function call. The
+migrate_pages function call takes two sets of nodes and moves pages of a
+process that are located on the from nodes to the destination nodes.
+Page migration functions are provided by the numactl package by Andi Kleen
+(a version later than 0.9.3 is required. Get it from
+ftp://oss.sgi.com/www/projects/libnuma/download/). numactl provides libnuma
+which provides an interface similar to other numa functionality for page
+migration. cat /proc/<pid>/numa_maps allows an easy review of where the
+pages of a process are located. See also the numa_maps documentation in the
+proc(5) man page.
+
+Manual migration is useful if for example the scheduler has relocated
+a process to a processor on a distant node. A batch scheduler or an
+administrator may detect the situation and move the pages of the process
+nearer to the new processor. The kernel itself does only provide
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+A NUMA profiler may f.e. obtain a log showing frequent off node
+accesses and may use the result to move pages to more advantageous
+locations.
+
+Larger installations usually partition the system using cpusets into
+sections of nodes. Paul Jackson has equipped cpusets with the ability to
+move pages when a task is moved to another cpuset (See ../cpusets.txt).
+Cpusets allows the automation of process locality. If a task is moved to
+a new cpuset then also all its pages are moved with it so that the
+performance of the process does not sink dramatically. Also the pages
+of processes in a cpuset are moved if the allowed memory nodes of a
+cpuset are changed.
+
+Page migration allows the preservation of the relative location of pages
+within a group of nodes for all migration techniques which will preserve a
+particular memory allocation pattern generated even after migrating a
+process. This is necessary in order to preserve the memory latencies.
+Processes will run with similar performance after migration.
+
+Page migration occurs in several steps. First a high level
+description for those trying to use migrate_pages() from the kernel
+(for userspace usage see the Andi Kleen's numactl package mentioned above)
+and then a low level description of how the low level details work.
+
+A. In kernel use of migrate_pages()
+-----------------------------------
+
+1. Remove pages from the LRU.
+
+ Lists of pages to be migrated are generated by scanning over
+ pages and moving them into lists. This is done by
+ calling isolate_lru_page().
+ Calling isolate_lru_page increases the references to the page
+ so that it cannot vanish while the page migration occurs.
+ It also prevents the swapper or other scans to encounter
+ the page.
+
+2. We need to have a function of type new_page_t that can be
+ passed to migrate_pages(). This function should figure out
+ how to allocate the correct new page given the old page.
+
+3. The migrate_pages() function is called which attempts
+ to do the migration. It will call the function to allocate
+ the new page for each page that is considered for
+ moving.
+
+B. How migrate_pages() works
+----------------------------
+
+migrate_pages() does several passes over its list of pages. A page is moved
+if all references to a page are removable at the time. The page has
+already been removed from the LRU via isolate_lru_page() and the refcount
+is increased so that the page cannot be freed while page migration occurs.
+
+Steps:
+
+1. Lock the page to be migrated
+
+2. Insure that writeback is complete.
+
+3. Prep the new page that we want to move to. It is locked
+ and set to not being uptodate so that all accesses to the new
+ page immediately lock while the move is in progress.
+
+4. The new page is prepped with some settings from the old page so that
+ accesses to the new page will discover a page with the correct settings.
+
+5. All the page table references to the page are converted
+ to migration entries or dropped (nonlinear vmas).
+ This decrease the mapcount of a page. If the resulting
+ mapcount is not zero then we do not migrate the page.
+ All user space processes that attempt to access the page
+ will now wait on the page lock.
+
+6. The radix tree lock is taken. This will cause all processes trying
+ to access the page via the mapping to block on the radix tree spinlock.
+
+7. The refcount of the page is examined and we back out if references remain
+ otherwise we know that we are the only one referencing this page.
+
+8. The radix tree is checked and if it does not contain the pointer to this
+ page then we back out because someone else modified the radix tree.
+
+9. The radix tree is changed to point to the new page.
+
+10. The reference count of the old page is dropped because the radix tree
+ reference is gone. A reference to the new page is established because
+ the new page is referenced to by the radix tree.
+
+11. The radix tree lock is dropped. With that lookups in the mapping
+ become possible again. Processes will move from spinning on the tree_lock
+ to sleeping on the locked new page.
+
+12. The page contents are copied to the new page.
+
+13. The remaining page flags are copied to the new page.
+
+14. The old page flags are cleared to indicate that the page does
+ not provide any information anymore.
+
+15. Queued up writeback on the new page is triggered.
+
+16. If migration entries were page then replace them with real ptes. Doing
+ so will enable access for user space processes not already waiting for
+ the page lock.
+
+19. The page locks are dropped from the old and new page.
+ Processes waiting on the page lock will redo their page faults
+ and will reach the new page.
+
+20. The new page is moved to the LRU and can be scanned by the swapper
+ etc again.
+
+Christoph Lameter, May 8, 2006.
+
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
new file mode 100644
index 0000000..ce72c0f
--- /dev/null
+++ b/Documentation/vm/pagemap.txt
@@ -0,0 +1,77 @@
+pagemap, from the userspace perspective
+---------------------------------------
+
+pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
+userspace programs to examine the page tables and related information by
+reading files in /proc.
+
+There are three components to pagemap:
+
+ * /proc/pid/pagemap. This file lets a userspace process find out which
+ physical frame each virtual page is mapped to. It contains one 64-bit
+ value for each virtual page, containing the following data (from
+ fs/proc/task_mmu.c, above pagemap_read):
+
+ * Bits 0-55 page frame number (PFN) if present
+ * Bits 0-4 swap type if swapped
+ * Bits 5-55 swap offset if swapped
+ * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit 61 reserved for future use
+ * Bit 62 page swapped
+ * Bit 63 page present
+
+ If the page is not present but in swap, then the PFN contains an
+ encoding of the swap file number and the page's offset into the
+ swap. Unmapped pages return a null PFN. This allows determining
+ precisely which pages are mapped (or in swap) and comparing mapped
+ pages between processes.
+
+ Efficient users of this interface will use /proc/pid/maps to
+ determine which areas of memory are actually mapped and llseek to
+ skip over unmapped regions.
+
+ * /proc/kpagecount. This file contains a 64-bit count of the number of
+ times each page is mapped, indexed by PFN.
+
+ * /proc/kpageflags. This file contains a 64-bit set of flags for each
+ page, indexed by PFN.
+
+ The flags are (from fs/proc/proc_misc, above kpageflags_read):
+
+ 0. LOCKED
+ 1. ERROR
+ 2. REFERENCED
+ 3. UPTODATE
+ 4. DIRTY
+ 5. LRU
+ 6. ACTIVE
+ 7. SLAB
+ 8. WRITEBACK
+ 9. RECLAIM
+ 10. BUDDY
+
+Using pagemap to do something useful:
+
+The general procedure for using pagemap to find out about a process' memory
+usage goes like this:
+
+ 1. Read /proc/pid/maps to determine which parts of the memory space are
+ mapped to what.
+ 2. Select the maps you are interested in -- all of them, or a particular
+ library, or the stack or the heap, etc.
+ 3. Open /proc/pid/pagemap and seek to the pages you would like to examine.
+ 4. Read a u64 for each page from pagemap.
+ 5. Open /proc/kpagecount and/or /proc/kpageflags. For each PFN you just
+ read, seek to that entry in the file, and read the data you want.
+
+For example, to find the "unique set size" (USS), which is the amount of
+memory that a process is using that is not shared with any other process,
+you can go through every map in the process, find the PFNs, look those up
+in kpagecount, and tally up the number of pages that are only referenced
+once.
+
+Other notes:
+
+Reading from any of the files will return -EINVAL if you are not starting
+the read on an 8-byte boundary (e.g., if you seeked an odd number of bytes
+into the file), or if the size of the read is not a multiple of 8 bytes.
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
new file mode 100644
index 0000000..df32276
--- /dev/null
+++ b/Documentation/vm/slabinfo.c
@@ -0,0 +1,1364 @@
+/*
+ * Slabinfo: Tool to get reports about slabs
+ *
+ * (C) 2007 sgi, Christoph Lameter
+ *
+ * Compile by:
+ *
+ * gcc -o slabinfo slabinfo.c
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <strings.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <getopt.h>
+#include <regex.h>
+#include <errno.h>
+
+#define MAX_SLABS 500
+#define MAX_ALIASES 500
+#define MAX_NODES 1024
+
+struct slabinfo {
+ char *name;
+ int alias;
+ int refs;
+ int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
+ int hwcache_align, object_size, objs_per_slab;
+ int sanity_checks, slab_size, store_user, trace;
+ int order, poison, reclaim_account, red_zone;
+ unsigned long partial, objects, slabs, objects_partial, objects_total;
+ unsigned long alloc_fastpath, alloc_slowpath;
+ unsigned long free_fastpath, free_slowpath;
+ unsigned long free_frozen, free_add_partial, free_remove_partial;
+ unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
+ unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
+ unsigned long deactivate_to_head, deactivate_to_tail;
+ unsigned long deactivate_remote_frees, order_fallback;
+ int numa[MAX_NODES];
+ int numa_partial[MAX_NODES];
+} slabinfo[MAX_SLABS];
+
+struct aliasinfo {
+ char *name;
+ char *ref;
+ struct slabinfo *slab;
+} aliasinfo[MAX_ALIASES];
+
+int slabs = 0;
+int actual_slabs = 0;
+int aliases = 0;
+int alias_targets = 0;
+int highest_node = 0;
+
+char buffer[4096];
+
+int show_empty = 0;
+int show_report = 0;
+int show_alias = 0;
+int show_slab = 0;
+int skip_zero = 1;
+int show_numa = 0;
+int show_track = 0;
+int show_first_alias = 0;
+int validate = 0;
+int shrink = 0;
+int show_inverted = 0;
+int show_single_ref = 0;
+int show_totals = 0;
+int sort_size = 0;
+int sort_active = 0;
+int set_debug = 0;
+int show_ops = 0;
+int show_activity = 0;
+
+/* Debug options */
+int sanity = 0;
+int redzone = 0;
+int poison = 0;
+int tracking = 0;
+int tracing = 0;
+
+int page_size;
+
+regex_t pattern;
+
+void fatal(const char *x, ...)
+{
+ va_list ap;
+
+ va_start(ap, x);
+ vfprintf(stderr, x, ap);
+ va_end(ap);
+ exit(EXIT_FAILURE);
+}
+
+void usage(void)
+{
+ printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n"
+ "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
+ "-a|--aliases Show aliases\n"
+ "-A|--activity Most active slabs first\n"
+ "-d<options>|--debug=<options> Set/Clear Debug options\n"
+ "-D|--display-active Switch line format to activity\n"
+ "-e|--empty Show empty slabs\n"
+ "-f|--first-alias Show first alias\n"
+ "-h|--help Show usage information\n"
+ "-i|--inverted Inverted list\n"
+ "-l|--slabs Show slabs\n"
+ "-n|--numa Show NUMA information\n"
+ "-o|--ops Show kmem_cache_ops\n"
+ "-s|--shrink Shrink slabs\n"
+ "-r|--report Detailed report on single slabs\n"
+ "-S|--Size Sort by size\n"
+ "-t|--tracking Show alloc/free information\n"
+ "-T|--Totals Show summary information\n"
+ "-v|--validate Validate slabs\n"
+ "-z|--zero Include empty slabs\n"
+ "-1|--1ref Single reference\n"
+ "\nValid debug options (FZPUT may be combined)\n"
+ "a / A Switch on all debug options (=FZUP)\n"
+ "- Switch off all debug options\n"
+ "f / F Sanity Checks (SLAB_DEBUG_FREE)\n"
+ "z / Z Redzoning\n"
+ "p / P Poisoning\n"
+ "u / U Tracking\n"
+ "t / T Tracing\n"
+ );
+}
+
+unsigned long read_obj(const char *name)
+{
+ FILE *f = fopen(name, "r");
+
+ if (!f)
+ buffer[0] = 0;
+ else {
+ if (!fgets(buffer, sizeof(buffer), f))
+ buffer[0] = 0;
+ fclose(f);
+ if (buffer[strlen(buffer)] == '\n')
+ buffer[strlen(buffer)] = 0;
+ }
+ return strlen(buffer);
+}
+
+
+/*
+ * Get the contents of an attribute
+ */
+unsigned long get_obj(const char *name)
+{
+ if (!read_obj(name))
+ return 0;
+
+ return atol(buffer);
+}
+
+unsigned long get_obj_and_str(const char *name, char **x)
+{
+ unsigned long result = 0;
+ char *p;
+
+ *x = NULL;
+
+ if (!read_obj(name)) {
+ x = NULL;
+ return 0;
+ }
+ result = strtoul(buffer, &p, 10);
+ while (*p == ' ')
+ p++;
+ if (*p)
+ *x = strdup(p);
+ return result;
+}
+
+void set_obj(struct slabinfo *s, const char *name, int n)
+{
+ char x[100];
+ FILE *f;
+
+ snprintf(x, 100, "%s/%s", s->name, name);
+ f = fopen(x, "w");
+ if (!f)
+ fatal("Cannot write to %s\n", x);
+
+ fprintf(f, "%d\n", n);
+ fclose(f);
+}
+
+unsigned long read_slab_obj(struct slabinfo *s, const char *name)
+{
+ char x[100];
+ FILE *f;
+ size_t l;
+
+ snprintf(x, 100, "%s/%s", s->name, name);
+ f = fopen(x, "r");
+ if (!f) {
+ buffer[0] = 0;
+ l = 0;
+ } else {
+ l = fread(buffer, 1, sizeof(buffer), f);
+ buffer[l] = 0;
+ fclose(f);
+ }
+ return l;
+}
+
+
+/*
+ * Put a size string together
+ */
+int store_size(char *buffer, unsigned long value)
+{
+ unsigned long divisor = 1;
+ char trailer = 0;
+ int n;
+
+ if (value > 1000000000UL) {
+ divisor = 100000000UL;
+ trailer = 'G';
+ } else if (value > 1000000UL) {
+ divisor = 100000UL;
+ trailer = 'M';
+ } else if (value > 1000UL) {
+ divisor = 100;
+ trailer = 'K';
+ }
+
+ value /= divisor;
+ n = sprintf(buffer, "%ld",value);
+ if (trailer) {
+ buffer[n] = trailer;
+ n++;
+ buffer[n] = 0;
+ }
+ if (divisor != 1) {
+ memmove(buffer + n - 2, buffer + n - 3, 4);
+ buffer[n-2] = '.';
+ n++;
+ }
+ return n;
+}
+
+void decode_numa_list(int *numa, char *t)
+{
+ int node;
+ int nr;
+
+ memset(numa, 0, MAX_NODES * sizeof(int));
+
+ if (!t)
+ return;
+
+ while (*t == 'N') {
+ t++;
+ node = strtoul(t, &t, 10);
+ if (*t == '=') {
+ t++;
+ nr = strtoul(t, &t, 10);
+ numa[node] = nr;
+ if (node > highest_node)
+ highest_node = node;
+ }
+ while (*t == ' ')
+ t++;
+ }
+}
+
+void slab_validate(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ set_obj(s, "validate", 1);
+}
+
+void slab_shrink(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ set_obj(s, "shrink", 1);
+}
+
+int line = 0;
+
+void first_line(void)
+{
+ if (show_activity)
+ printf("Name Objects Alloc Free %%Fast Fallb O\n");
+ else
+ printf("Name Objects Objsize Space "
+ "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
+}
+
+/*
+ * Find the shortest alias of a slab
+ */
+struct aliasinfo *find_one_alias(struct slabinfo *find)
+{
+ struct aliasinfo *a;
+ struct aliasinfo *best = NULL;
+
+ for(a = aliasinfo;a < aliasinfo + aliases; a++) {
+ if (a->slab == find &&
+ (!best || strlen(best->name) < strlen(a->name))) {
+ best = a;
+ if (strncmp(a->name,"kmall", 5) == 0)
+ return best;
+ }
+ }
+ return best;
+}
+
+unsigned long slab_size(struct slabinfo *s)
+{
+ return s->slabs * (page_size << s->order);
+}
+
+unsigned long slab_activity(struct slabinfo *s)
+{
+ return s->alloc_fastpath + s->free_fastpath +
+ s->alloc_slowpath + s->free_slowpath;
+}
+
+void slab_numa(struct slabinfo *s, int mode)
+{
+ int node;
+
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ if (!highest_node) {
+ printf("\n%s: No NUMA information available.\n", s->name);
+ return;
+ }
+
+ if (skip_zero && !s->slabs)
+ return;
+
+ if (!line) {
+ printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+ for(node = 0; node <= highest_node; node++)
+ printf(" %4d", node);
+ printf("\n----------------------");
+ for(node = 0; node <= highest_node; node++)
+ printf("-----");
+ printf("\n");
+ }
+ printf("%-21s ", mode ? "All slabs" : s->name);
+ for(node = 0; node <= highest_node; node++) {
+ char b[20];
+
+ store_size(b, s->numa[node]);
+ printf(" %4s", b);
+ }
+ printf("\n");
+ if (mode) {
+ printf("%-21s ", "Partial slabs");
+ for(node = 0; node <= highest_node; node++) {
+ char b[20];
+
+ store_size(b, s->numa_partial[node]);
+ printf(" %4s", b);
+ }
+ printf("\n");
+ }
+ line++;
+}
+
+void show_tracking(struct slabinfo *s)
+{
+ printf("\n%s: Kernel object allocation\n", s->name);
+ printf("-----------------------------------------------------------------------\n");
+ if (read_slab_obj(s, "alloc_calls"))
+ printf(buffer);
+ else
+ printf("No Data\n");
+
+ printf("\n%s: Kernel object freeing\n", s->name);
+ printf("------------------------------------------------------------------------\n");
+ if (read_slab_obj(s, "free_calls"))
+ printf(buffer);
+ else
+ printf("No Data\n");
+
+}
+
+void ops(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ if (read_slab_obj(s, "ops")) {
+ printf("\n%s: kmem_cache operations\n", s->name);
+ printf("--------------------------------------------\n");
+ printf(buffer);
+ } else
+ printf("\n%s has no kmem_cache operations\n", s->name);
+}
+
+const char *onoff(int x)
+{
+ if (x)
+ return "On ";
+ return "Off";
+}
+
+void slab_stats(struct slabinfo *s)
+{
+ unsigned long total_alloc;
+ unsigned long total_free;
+ unsigned long total;
+
+ if (!s->alloc_slab)
+ return;
+
+ total_alloc = s->alloc_fastpath + s->alloc_slowpath;
+ total_free = s->free_fastpath + s->free_slowpath;
+
+ if (!total_alloc)
+ return;
+
+ printf("\n");
+ printf("Slab Perf Counter Alloc Free %%Al %%Fr\n");
+ printf("--------------------------------------------------\n");
+ printf("Fastpath %8lu %8lu %3lu %3lu\n",
+ s->alloc_fastpath, s->free_fastpath,
+ s->alloc_fastpath * 100 / total_alloc,
+ s->free_fastpath * 100 / total_free);
+ printf("Slowpath %8lu %8lu %3lu %3lu\n",
+ total_alloc - s->alloc_fastpath, s->free_slowpath,
+ (total_alloc - s->alloc_fastpath) * 100 / total_alloc,
+ s->free_slowpath * 100 / total_free);
+ printf("Page Alloc %8lu %8lu %3lu %3lu\n",
+ s->alloc_slab, s->free_slab,
+ s->alloc_slab * 100 / total_alloc,
+ s->free_slab * 100 / total_free);
+ printf("Add partial %8lu %8lu %3lu %3lu\n",
+ s->deactivate_to_head + s->deactivate_to_tail,
+ s->free_add_partial,
+ (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc,
+ s->free_add_partial * 100 / total_free);
+ printf("Remove partial %8lu %8lu %3lu %3lu\n",
+ s->alloc_from_partial, s->free_remove_partial,
+ s->alloc_from_partial * 100 / total_alloc,
+ s->free_remove_partial * 100 / total_free);
+
+ printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n",
+ s->deactivate_remote_frees, s->free_frozen,
+ s->deactivate_remote_frees * 100 / total_alloc,
+ s->free_frozen * 100 / total_free);
+
+ printf("Total %8lu %8lu\n\n", total_alloc, total_free);
+
+ if (s->cpuslab_flush)
+ printf("Flushes %8lu\n", s->cpuslab_flush);
+
+ if (s->alloc_refill)
+ printf("Refill %8lu\n", s->alloc_refill);
+
+ total = s->deactivate_full + s->deactivate_empty +
+ s->deactivate_to_head + s->deactivate_to_tail;
+
+ if (total)
+ printf("Deactivate Full=%lu(%lu%%) Empty=%lu(%lu%%) "
+ "ToHead=%lu(%lu%%) ToTail=%lu(%lu%%)\n",
+ s->deactivate_full, (s->deactivate_full * 100) / total,
+ s->deactivate_empty, (s->deactivate_empty * 100) / total,
+ s->deactivate_to_head, (s->deactivate_to_head * 100) / total,
+ s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
+}
+
+void report(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n",
+ s->name, s->aliases, s->order, s->objects);
+ if (s->hwcache_align)
+ printf("** Hardware cacheline aligned\n");
+ if (s->cache_dma)
+ printf("** Memory is allocated in a special DMA zone\n");
+ if (s->destroy_by_rcu)
+ printf("** Slabs are destroyed via RCU\n");
+ if (s->reclaim_account)
+ printf("** Reclaim accounting active\n");
+
+ printf("\nSizes (bytes) Slabs Debug Memory\n");
+ printf("------------------------------------------------------------------------\n");
+ printf("Object : %7d Total : %7ld Sanity Checks : %s Total: %7ld\n",
+ s->object_size, s->slabs, onoff(s->sanity_checks),
+ s->slabs * (page_size << s->order));
+ printf("SlabObj: %7d Full : %7ld Redzoning : %s Used : %7ld\n",
+ s->slab_size, s->slabs - s->partial - s->cpu_slabs,
+ onoff(s->red_zone), s->objects * s->object_size);
+ printf("SlabSiz: %7d Partial: %7ld Poisoning : %s Loss : %7ld\n",
+ page_size << s->order, s->partial, onoff(s->poison),
+ s->slabs * (page_size << s->order) - s->objects * s->object_size);
+ printf("Loss : %7d CpuSlab: %7d Tracking : %s Lalig: %7ld\n",
+ s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user),
+ (s->slab_size - s->object_size) * s->objects);
+ printf("Align : %7d Objects: %7d Tracing : %s Lpadd: %7ld\n",
+ s->align, s->objs_per_slab, onoff(s->trace),
+ ((page_size << s->order) - s->objs_per_slab * s->slab_size) *
+ s->slabs);
+
+ ops(s);
+ show_tracking(s);
+ slab_numa(s, 1);
+ slab_stats(s);
+}
+
+void slabcache(struct slabinfo *s)
+{
+ char size_str[20];
+ char dist_str[40];
+ char flags[20];
+ char *p = flags;
+
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ if (actual_slabs == 1) {
+ report(s);
+ return;
+ }
+
+ if (skip_zero && !show_empty && !s->slabs)
+ return;
+
+ if (show_empty && s->slabs)
+ return;
+
+ store_size(size_str, slab_size(s));
+ snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
+ s->partial, s->cpu_slabs);
+
+ if (!line++)
+ first_line();
+
+ if (s->aliases)
+ *p++ = '*';
+ if (s->cache_dma)
+ *p++ = 'd';
+ if (s->hwcache_align)
+ *p++ = 'A';
+ if (s->poison)
+ *p++ = 'P';
+ if (s->reclaim_account)
+ *p++ = 'a';
+ if (s->red_zone)
+ *p++ = 'Z';
+ if (s->sanity_checks)
+ *p++ = 'F';
+ if (s->store_user)
+ *p++ = 'U';
+ if (s->trace)
+ *p++ = 'T';
+
+ *p = 0;
+ if (show_activity) {
+ unsigned long total_alloc;
+ unsigned long total_free;
+
+ total_alloc = s->alloc_fastpath + s->alloc_slowpath;
+ total_free = s->free_fastpath + s->free_slowpath;
+
+ printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d\n",
+ s->name, s->objects,
+ total_alloc, total_free,
+ total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
+ total_free ? (s->free_fastpath * 100 / total_free) : 0,
+ s->order_fallback, s->order);
+ }
+ else
+ printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
+ s->name, s->objects, s->object_size, size_str, dist_str,
+ s->objs_per_slab, s->order,
+ s->slabs ? (s->partial * 100) / s->slabs : 100,
+ s->slabs ? (s->objects * s->object_size * 100) /
+ (s->slabs * (page_size << s->order)) : 100,
+ flags);
+}
+
+/*
+ * Analyze debug options. Return false if something is amiss.
+ */
+int debug_opt_scan(char *opt)
+{
+ if (!opt || !opt[0] || strcmp(opt, "-") == 0)
+ return 1;
+
+ if (strcasecmp(opt, "a") == 0) {
+ sanity = 1;
+ poison = 1;
+ redzone = 1;
+ tracking = 1;
+ return 1;
+ }
+
+ for ( ; *opt; opt++)
+ switch (*opt) {
+ case 'F' : case 'f':
+ if (sanity)
+ return 0;
+ sanity = 1;
+ break;
+ case 'P' : case 'p':
+ if (poison)
+ return 0;
+ poison = 1;
+ break;
+
+ case 'Z' : case 'z':
+ if (redzone)
+ return 0;
+ redzone = 1;
+ break;
+
+ case 'U' : case 'u':
+ if (tracking)
+ return 0;
+ tracking = 1;
+ break;
+
+ case 'T' : case 't':
+ if (tracing)
+ return 0;
+ tracing = 1;
+ break;
+ default:
+ return 0;
+ }
+ return 1;
+}
+
+int slab_empty(struct slabinfo *s)
+{
+ if (s->objects > 0)
+ return 0;
+
+ /*
+ * We may still have slabs even if there are no objects. Shrinking will
+ * remove them.
+ */
+ if (s->slabs != 0)
+ set_obj(s, "shrink", 1);
+
+ return 1;
+}
+
+void slab_debug(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ if (sanity && !s->sanity_checks) {
+ set_obj(s, "sanity", 1);
+ }
+ if (!sanity && s->sanity_checks) {
+ if (slab_empty(s))
+ set_obj(s, "sanity", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
+ }
+ if (redzone && !s->red_zone) {
+ if (slab_empty(s))
+ set_obj(s, "red_zone", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
+ }
+ if (!redzone && s->red_zone) {
+ if (slab_empty(s))
+ set_obj(s, "red_zone", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
+ }
+ if (poison && !s->poison) {
+ if (slab_empty(s))
+ set_obj(s, "poison", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
+ }
+ if (!poison && s->poison) {
+ if (slab_empty(s))
+ set_obj(s, "poison", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
+ }
+ if (tracking && !s->store_user) {
+ if (slab_empty(s))
+ set_obj(s, "store_user", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
+ }
+ if (!tracking && s->store_user) {
+ if (slab_empty(s))
+ set_obj(s, "store_user", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
+ }
+ if (tracing && !s->trace) {
+ if (slabs == 1)
+ set_obj(s, "trace", 1);
+ else
+ fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
+ }
+ if (!tracing && s->trace)
+ set_obj(s, "trace", 1);
+}
+
+void totals(void)
+{
+ struct slabinfo *s;
+
+ int used_slabs = 0;
+ char b1[20], b2[20], b3[20], b4[20];
+ unsigned long long max = 1ULL << 63;
+
+ /* Object size */
+ unsigned long long min_objsize = max, max_objsize = 0, avg_objsize;
+
+ /* Number of partial slabs in a slabcache */
+ unsigned long long min_partial = max, max_partial = 0,
+ avg_partial, total_partial = 0;
+
+ /* Number of slabs in a slab cache */
+ unsigned long long min_slabs = max, max_slabs = 0,
+ avg_slabs, total_slabs = 0;
+
+ /* Size of the whole slab */
+ unsigned long long min_size = max, max_size = 0,
+ avg_size, total_size = 0;
+
+ /* Bytes used for object storage in a slab */
+ unsigned long long min_used = max, max_used = 0,
+ avg_used, total_used = 0;
+
+ /* Waste: Bytes used for alignment and padding */
+ unsigned long long min_waste = max, max_waste = 0,
+ avg_waste, total_waste = 0;
+ /* Number of objects in a slab */
+ unsigned long long min_objects = max, max_objects = 0,
+ avg_objects, total_objects = 0;
+ /* Waste per object */
+ unsigned long long min_objwaste = max,
+ max_objwaste = 0, avg_objwaste,
+ total_objwaste = 0;
+
+ /* Memory per object */
+ unsigned long long min_memobj = max,
+ max_memobj = 0, avg_memobj,
+ total_objsize = 0;
+
+ /* Percentage of partial slabs per slab */
+ unsigned long min_ppart = 100, max_ppart = 0,
+ avg_ppart, total_ppart = 0;
+
+ /* Number of objects in partial slabs */
+ unsigned long min_partobj = max, max_partobj = 0,
+ avg_partobj, total_partobj = 0;
+
+ /* Percentage of partial objects of all objects in a slab */
+ unsigned long min_ppartobj = 100, max_ppartobj = 0,
+ avg_ppartobj, total_ppartobj = 0;
+
+
+ for (s = slabinfo; s < slabinfo + slabs; s++) {
+ unsigned long long size;
+ unsigned long used;
+ unsigned long long wasted;
+ unsigned long long objwaste;
+ unsigned long percentage_partial_slabs;
+ unsigned long percentage_partial_objs;
+
+ if (!s->slabs || !s->objects)
+ continue;
+
+ used_slabs++;
+
+ size = slab_size(s);
+ used = s->objects * s->object_size;
+ wasted = size - used;
+ objwaste = s->slab_size - s->object_size;
+
+ percentage_partial_slabs = s->partial * 100 / s->slabs;
+ if (percentage_partial_slabs > 100)
+ percentage_partial_slabs = 100;
+
+ percentage_partial_objs = s->objects_partial * 100
+ / s->objects;
+
+ if (percentage_partial_objs > 100)
+ percentage_partial_objs = 100;
+
+ if (s->object_size < min_objsize)
+ min_objsize = s->object_size;
+ if (s->partial < min_partial)
+ min_partial = s->partial;
+ if (s->slabs < min_slabs)
+ min_slabs = s->slabs;
+ if (size < min_size)
+ min_size = size;
+ if (wasted < min_waste)
+ min_waste = wasted;
+ if (objwaste < min_objwaste)
+ min_objwaste = objwaste;
+ if (s->objects < min_objects)
+ min_objects = s->objects;
+ if (used < min_used)
+ min_used = used;
+ if (s->objects_partial < min_partobj)
+ min_partobj = s->objects_partial;
+ if (percentage_partial_slabs < min_ppart)
+ min_ppart = percentage_partial_slabs;
+ if (percentage_partial_objs < min_ppartobj)
+ min_ppartobj = percentage_partial_objs;
+ if (s->slab_size < min_memobj)
+ min_memobj = s->slab_size;
+
+ if (s->object_size > max_objsize)
+ max_objsize = s->object_size;
+ if (s->partial > max_partial)
+ max_partial = s->partial;
+ if (s->slabs > max_slabs)
+ max_slabs = s->slabs;
+ if (size > max_size)
+ max_size = size;
+ if (wasted > max_waste)
+ max_waste = wasted;
+ if (objwaste > max_objwaste)
+ max_objwaste = objwaste;
+ if (s->objects > max_objects)
+ max_objects = s->objects;
+ if (used > max_used)
+ max_used = used;
+ if (s->objects_partial > max_partobj)
+ max_partobj = s->objects_partial;
+ if (percentage_partial_slabs > max_ppart)
+ max_ppart = percentage_partial_slabs;
+ if (percentage_partial_objs > max_ppartobj)
+ max_ppartobj = percentage_partial_objs;
+ if (s->slab_size > max_memobj)
+ max_memobj = s->slab_size;
+
+ total_partial += s->partial;
+ total_slabs += s->slabs;
+ total_size += size;
+ total_waste += wasted;
+
+ total_objects += s->objects;
+ total_used += used;
+ total_partobj += s->objects_partial;
+ total_ppart += percentage_partial_slabs;
+ total_ppartobj += percentage_partial_objs;
+
+ total_objwaste += s->objects * objwaste;
+ total_objsize += s->objects * s->slab_size;
+ }
+
+ if (!total_objects) {
+ printf("No objects\n");
+ return;
+ }
+ if (!used_slabs) {
+ printf("No slabs\n");
+ return;
+ }
+
+ /* Per slab averages */
+ avg_partial = total_partial / used_slabs;
+ avg_slabs = total_slabs / used_slabs;
+ avg_size = total_size / used_slabs;
+ avg_waste = total_waste / used_slabs;
+
+ avg_objects = total_objects / used_slabs;
+ avg_used = total_used / used_slabs;
+ avg_partobj = total_partobj / used_slabs;
+ avg_ppart = total_ppart / used_slabs;
+ avg_ppartobj = total_ppartobj / used_slabs;
+
+ /* Per object object sizes */
+ avg_objsize = total_used / total_objects;
+ avg_objwaste = total_objwaste / total_objects;
+ avg_partobj = total_partobj * 100 / total_objects;
+ avg_memobj = total_objsize / total_objects;
+
+ printf("Slabcache Totals\n");
+ printf("----------------\n");
+ printf("Slabcaches : %3d Aliases : %3d->%-3d Active: %3d\n",
+ slabs, aliases, alias_targets, used_slabs);
+
+ store_size(b1, total_size);store_size(b2, total_waste);
+ store_size(b3, total_waste * 100 / total_used);
+ printf("Memory used: %6s # Loss : %6s MRatio:%6s%%\n", b1, b2, b3);
+
+ store_size(b1, total_objects);store_size(b2, total_partobj);
+ store_size(b3, total_partobj * 100 / total_objects);
+ printf("# Objects : %6s # PartObj: %6s ORatio:%6s%%\n", b1, b2, b3);
+
+ printf("\n");
+ printf("Per Cache Average Min Max Total\n");
+ printf("---------------------------------------------------------\n");
+
+ store_size(b1, avg_objects);store_size(b2, min_objects);
+ store_size(b3, max_objects);store_size(b4, total_objects);
+ printf("#Objects %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_slabs);store_size(b2, min_slabs);
+ store_size(b3, max_slabs);store_size(b4, total_slabs);
+ printf("#Slabs %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_partial);store_size(b2, min_partial);
+ store_size(b3, max_partial);store_size(b4, total_partial);
+ printf("#PartSlab %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+ store_size(b1, avg_ppart);store_size(b2, min_ppart);
+ store_size(b3, max_ppart);
+ store_size(b4, total_partial * 100 / total_slabs);
+ printf("%%PartSlab%10s%% %10s%% %10s%% %10s%%\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_partobj);store_size(b2, min_partobj);
+ store_size(b3, max_partobj);
+ store_size(b4, total_partobj);
+ printf("PartObjs %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
+ store_size(b3, max_ppartobj);
+ store_size(b4, total_partobj * 100 / total_objects);
+ printf("%% PartObj%10s%% %10s%% %10s%% %10s%%\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_size);store_size(b2, min_size);
+ store_size(b3, max_size);store_size(b4, total_size);
+ printf("Memory %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_used);store_size(b2, min_used);
+ store_size(b3, max_used);store_size(b4, total_used);
+ printf("Used %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_waste);store_size(b2, min_waste);
+ store_size(b3, max_waste);store_size(b4, total_waste);
+ printf("Loss %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ printf("\n");
+ printf("Per Object Average Min Max\n");
+ printf("---------------------------------------------\n");
+
+ store_size(b1, avg_memobj);store_size(b2, min_memobj);
+ store_size(b3, max_memobj);
+ printf("Memory %10s %10s %10s\n",
+ b1, b2, b3);
+ store_size(b1, avg_objsize);store_size(b2, min_objsize);
+ store_size(b3, max_objsize);
+ printf("User %10s %10s %10s\n",
+ b1, b2, b3);
+
+ store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
+ store_size(b3, max_objwaste);
+ printf("Loss %10s %10s %10s\n",
+ b1, b2, b3);
+}
+
+void sort_slabs(void)
+{
+ struct slabinfo *s1,*s2;
+
+ for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) {
+ for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
+ int result;
+
+ if (sort_size)
+ result = slab_size(s1) < slab_size(s2);
+ else if (sort_active)
+ result = slab_activity(s1) < slab_activity(s2);
+ else
+ result = strcasecmp(s1->name, s2->name);
+
+ if (show_inverted)
+ result = -result;
+
+ if (result > 0) {
+ struct slabinfo t;
+
+ memcpy(&t, s1, sizeof(struct slabinfo));
+ memcpy(s1, s2, sizeof(struct slabinfo));
+ memcpy(s2, &t, sizeof(struct slabinfo));
+ }
+ }
+ }
+}
+
+void sort_aliases(void)
+{
+ struct aliasinfo *a1,*a2;
+
+ for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) {
+ for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) {
+ char *n1, *n2;
+
+ n1 = a1->name;
+ n2 = a2->name;
+ if (show_alias && !show_inverted) {
+ n1 = a1->ref;
+ n2 = a2->ref;
+ }
+ if (strcasecmp(n1, n2) > 0) {
+ struct aliasinfo t;
+
+ memcpy(&t, a1, sizeof(struct aliasinfo));
+ memcpy(a1, a2, sizeof(struct aliasinfo));
+ memcpy(a2, &t, sizeof(struct aliasinfo));
+ }
+ }
+ }
+}
+
+void link_slabs(void)
+{
+ struct aliasinfo *a;
+ struct slabinfo *s;
+
+ for (a = aliasinfo; a < aliasinfo + aliases; a++) {
+
+ for (s = slabinfo; s < slabinfo + slabs; s++)
+ if (strcmp(a->ref, s->name) == 0) {
+ a->slab = s;
+ s->refs++;
+ break;
+ }
+ if (s == slabinfo + slabs)
+ fatal("Unresolved alias %s\n", a->ref);
+ }
+}
+
+void alias(void)
+{
+ struct aliasinfo *a;
+ char *active = NULL;
+
+ sort_aliases();
+ link_slabs();
+
+ for(a = aliasinfo; a < aliasinfo + aliases; a++) {
+
+ if (!show_single_ref && a->slab->refs == 1)
+ continue;
+
+ if (!show_inverted) {
+ if (active) {
+ if (strcmp(a->slab->name, active) == 0) {
+ printf(" %s", a->name);
+ continue;
+ }
+ }
+ printf("\n%-12s <- %s", a->slab->name, a->name);
+ active = a->slab->name;
+ }
+ else
+ printf("%-20s -> %s\n", a->name, a->slab->name);
+ }
+ if (active)
+ printf("\n");
+}
+
+
+void rename_slabs(void)
+{
+ struct slabinfo *s;
+ struct aliasinfo *a;
+
+ for (s = slabinfo; s < slabinfo + slabs; s++) {
+ if (*s->name != ':')
+ continue;
+
+ if (s->refs > 1 && !show_first_alias)
+ continue;
+
+ a = find_one_alias(s);
+
+ if (a)
+ s->name = a->name;
+ else {
+ s->name = "*";
+ actual_slabs--;
+ }
+ }
+}
+
+int slab_mismatch(char *slab)
+{
+ return regexec(&pattern, slab, 0, NULL, 0);
+}
+
+void read_slab_dir(void)
+{
+ DIR *dir;
+ struct dirent *de;
+ struct slabinfo *slab = slabinfo;
+ struct aliasinfo *alias = aliasinfo;
+ char *p;
+ char *t;
+ int count;
+
+ if (chdir("/sys/kernel/slab") && chdir("/sys/slab"))
+ fatal("SYSFS support for SLUB not active\n");
+
+ dir = opendir(".");
+ while ((de = readdir(dir))) {
+ if (de->d_name[0] == '.' ||
+ (de->d_name[0] != ':' && slab_mismatch(de->d_name)))
+ continue;
+ switch (de->d_type) {
+ case DT_LNK:
+ alias->name = strdup(de->d_name);
+ count = readlink(de->d_name, buffer, sizeof(buffer));
+
+ if (count < 0)
+ fatal("Cannot read symlink %s\n", de->d_name);
+
+ buffer[count] = 0;
+ p = buffer + count;
+ while (p > buffer && p[-1] != '/')
+ p--;
+ alias->ref = strdup(p);
+ alias++;
+ break;
+ case DT_DIR:
+ if (chdir(de->d_name))
+ fatal("Unable to access slab %s\n", slab->name);
+ slab->name = strdup(de->d_name);
+ slab->alias = 0;
+ slab->refs = 0;
+ slab->aliases = get_obj("aliases");
+ slab->align = get_obj("align");
+ slab->cache_dma = get_obj("cache_dma");
+ slab->cpu_slabs = get_obj("cpu_slabs");
+ slab->destroy_by_rcu = get_obj("destroy_by_rcu");
+ slab->hwcache_align = get_obj("hwcache_align");
+ slab->object_size = get_obj("object_size");
+ slab->objects = get_obj("objects");
+ slab->objects_partial = get_obj("objects_partial");
+ slab->objects_total = get_obj("objects_total");
+ slab->objs_per_slab = get_obj("objs_per_slab");
+ slab->order = get_obj("order");
+ slab->partial = get_obj("partial");
+ slab->partial = get_obj_and_str("partial", &t);
+ decode_numa_list(slab->numa_partial, t);
+ free(t);
+ slab->poison = get_obj("poison");
+ slab->reclaim_account = get_obj("reclaim_account");
+ slab->red_zone = get_obj("red_zone");
+ slab->sanity_checks = get_obj("sanity_checks");
+ slab->slab_size = get_obj("slab_size");
+ slab->slabs = get_obj_and_str("slabs", &t);
+ decode_numa_list(slab->numa, t);
+ free(t);
+ slab->store_user = get_obj("store_user");
+ slab->trace = get_obj("trace");
+ slab->alloc_fastpath = get_obj("alloc_fastpath");
+ slab->alloc_slowpath = get_obj("alloc_slowpath");
+ slab->free_fastpath = get_obj("free_fastpath");
+ slab->free_slowpath = get_obj("free_slowpath");
+ slab->free_frozen= get_obj("free_frozen");
+ slab->free_add_partial = get_obj("free_add_partial");
+ slab->free_remove_partial = get_obj("free_remove_partial");
+ slab->alloc_from_partial = get_obj("alloc_from_partial");
+ slab->alloc_slab = get_obj("alloc_slab");
+ slab->alloc_refill = get_obj("alloc_refill");
+ slab->free_slab = get_obj("free_slab");
+ slab->cpuslab_flush = get_obj("cpuslab_flush");
+ slab->deactivate_full = get_obj("deactivate_full");
+ slab->deactivate_empty = get_obj("deactivate_empty");
+ slab->deactivate_to_head = get_obj("deactivate_to_head");
+ slab->deactivate_to_tail = get_obj("deactivate_to_tail");
+ slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
+ slab->order_fallback = get_obj("order_fallback");
+ chdir("..");
+ if (slab->name[0] == ':')
+ alias_targets++;
+ slab++;
+ break;
+ default :
+ fatal("Unknown file type %lx\n", de->d_type);
+ }
+ }
+ closedir(dir);
+ slabs = slab - slabinfo;
+ actual_slabs = slabs;
+ aliases = alias - aliasinfo;
+ if (slabs > MAX_SLABS)
+ fatal("Too many slabs\n");
+ if (aliases > MAX_ALIASES)
+ fatal("Too many aliases\n");
+}
+
+void output_slabs(void)
+{
+ struct slabinfo *slab;
+
+ for (slab = slabinfo; slab < slabinfo + slabs; slab++) {
+
+ if (slab->alias)
+ continue;
+
+
+ if (show_numa)
+ slab_numa(slab, 0);
+ else if (show_track)
+ show_tracking(slab);
+ else if (validate)
+ slab_validate(slab);
+ else if (shrink)
+ slab_shrink(slab);
+ else if (set_debug)
+ slab_debug(slab);
+ else if (show_ops)
+ ops(slab);
+ else if (show_slab)
+ slabcache(slab);
+ else if (show_report)
+ report(slab);
+ }
+}
+
+struct option opts[] = {
+ { "aliases", 0, NULL, 'a' },
+ { "activity", 0, NULL, 'A' },
+ { "debug", 2, NULL, 'd' },
+ { "display-activity", 0, NULL, 'D' },
+ { "empty", 0, NULL, 'e' },
+ { "first-alias", 0, NULL, 'f' },
+ { "help", 0, NULL, 'h' },
+ { "inverted", 0, NULL, 'i'},
+ { "numa", 0, NULL, 'n' },
+ { "ops", 0, NULL, 'o' },
+ { "report", 0, NULL, 'r' },
+ { "shrink", 0, NULL, 's' },
+ { "slabs", 0, NULL, 'l' },
+ { "track", 0, NULL, 't'},
+ { "validate", 0, NULL, 'v' },
+ { "zero", 0, NULL, 'z' },
+ { "1ref", 0, NULL, '1'},
+ { NULL, 0, NULL, 0 }
+};
+
+int main(int argc, char *argv[])
+{
+ int c;
+ int err;
+ char *pattern_source;
+
+ page_size = getpagesize();
+
+ while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
+ opts, NULL)) != -1)
+ switch (c) {
+ case '1':
+ show_single_ref = 1;
+ break;
+ case 'a':
+ show_alias = 1;
+ break;
+ case 'A':
+ sort_active = 1;
+ break;
+ case 'd':
+ set_debug = 1;
+ if (!debug_opt_scan(optarg))
+ fatal("Invalid debug option '%s'\n", optarg);
+ break;
+ case 'D':
+ show_activity = 1;
+ break;
+ case 'e':
+ show_empty = 1;
+ break;
+ case 'f':
+ show_first_alias = 1;
+ break;
+ case 'h':
+ usage();
+ return 0;
+ case 'i':
+ show_inverted = 1;
+ break;
+ case 'n':
+ show_numa = 1;
+ break;
+ case 'o':
+ show_ops = 1;
+ break;
+ case 'r':
+ show_report = 1;
+ break;
+ case 's':
+ shrink = 1;
+ break;
+ case 'l':
+ show_slab = 1;
+ break;
+ case 't':
+ show_track = 1;
+ break;
+ case 'v':
+ validate = 1;
+ break;
+ case 'z':
+ skip_zero = 0;
+ break;
+ case 'T':
+ show_totals = 1;
+ break;
+ case 'S':
+ sort_size = 1;
+ break;
+
+ default:
+ fatal("%s: Invalid option '%c'\n", argv[0], optopt);
+
+ }
+
+ if (!show_slab && !show_alias && !show_track && !show_report
+ && !validate && !shrink && !set_debug && !show_ops)
+ show_slab = 1;
+
+ if (argc > optind)
+ pattern_source = argv[optind];
+ else
+ pattern_source = ".*";
+
+ err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB);
+ if (err)
+ fatal("%s: Invalid pattern '%s' code %d\n",
+ argv[0], pattern_source, err);
+ read_slab_dir();
+ if (show_alias)
+ alias();
+ else
+ if (show_totals)
+ totals();
+ else {
+ link_slabs();
+ rename_slabs();
+ sort_slabs();
+ output_slabs();
+ }
+ return 0;
+}
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
new file mode 100644
index 0000000..bb1f5c6
--- /dev/null
+++ b/Documentation/vm/slub.txt
@@ -0,0 +1,269 @@
+Short users guide for SLUB
+--------------------------
+
+The basic philosophy of SLUB is very different from SLAB. SLAB
+requires rebuilding the kernel to activate debug options for all
+slab caches. SLUB always includes full debugging but it is off by default.
+SLUB can enable debugging only for selected slabs in order to avoid
+an impact on overall system performance which may make a bug more
+difficult to find.
+
+In order to switch debugging on one can add a option "slub_debug"
+to the kernel command line. That will enable full debugging for
+all slabs.
+
+Typically one would then use the "slabinfo" command to get statistical
+data and perform operation on the slabs. By default slabinfo only lists
+slabs that have data in them. See "slabinfo -h" for more options when
+running the command. slabinfo can be compiled with
+
+gcc -o slabinfo Documentation/vm/slabinfo.c
+
+Some of the modes of operation of slabinfo require that slub debugging
+be enabled on the command line. F.e. no tracking information will be
+available without debugging on and validation can only partially
+be performed if debugging was not switched on.
+
+Some more sophisticated uses of slub_debug:
+-------------------------------------------
+
+Parameters may be given to slub_debug. If none is specified then full
+debugging is enabled. Format:
+
+slub_debug=<Debug-Options> Enable options for all slabs
+slub_debug=<Debug-Options>,<slab name>
+ Enable options only for select slabs
+
+Possible debug options are
+ F Sanity checks on (enables SLAB_DEBUG_FREE. Sorry
+ SLAB legacy issues)
+ Z Red zoning
+ P Poisoning (object and padding)
+ U User tracking (free and alloc)
+ T Trace (please only use on single slabs)
+ - Switch all debugging off (useful if the kernel is
+ configured with CONFIG_SLUB_DEBUG_ON)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify:
+
+ slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try
+
+ slub_debug=,dentry
+
+to only enable debugging on the dentry cache.
+
+Red zoning and tracking may realign the slab. We can just apply sanity checks
+to the dentry cache with
+
+ slub_debug=F,dentry
+
+In case you forgot to enable debugging on the kernel command line: It is
+possible to enable debugging manually when the kernel is up. Look at the
+contents of:
+
+/sys/kernel/slab/<slab name>/
+
+Look at the writable files. Writing 1 to them will enable the
+corresponding debug option. All options can be set on a slab that does
+not contain objects. If the slab already contains objects then sanity checks
+and tracing may only be enabled. The other options may cause the realignment
+of objects.
+
+Careful with tracing: It may spew out lots of information and never stop if
+used on the wrong slab.
+
+Slab merging
+------------
+
+If no debug options are specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+slabinfo -a displays which slabs were merged together.
+
+Slab validation
+---------------
+
+SLUB can validate all object if the kernel was booted with slub_debug. In
+order to do so you must have the slabinfo tool. Then you can do
+
+slabinfo -v
+
+which will test all objects. Output will be generated to the syslog.
+
+This also works in a more limited way if boot was without slab debug.
+In that case slabinfo -v simply tests all reachable objects. Usually
+these are in the cpu slabs and the partial slabs. Full slabs are not
+tracked by SLUB in a non debug situation.
+
+Getting more performance
+------------------------
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+slub_min_objects=x (default 4)
+slub_min_order=x (default 0)
+slub_max_order=x (default 1)
+
+slub_min_objects allows to specify how many objects must at least fit
+into one slab in order for the allocation order to be acceptable.
+In general slub will be able to perform this number of allocations
+on a slab without consulting centralized resources (list_lock) where
+contention may occur.
+
+slub_min_order specifies a minim order of slabs. A similar effect like
+slub_min_objects.
+
+slub_max_order specified the order at which slub_min_objects should no
+longer be checked. This is useful to avoid SLUB trying to generate
+super large order pages to fit slub_min_objects of a slab cache with
+large object sizes into one high order page.
+
+SLUB Debug output
+-----------------
+
+Here is a sample of slub debug output:
+
+====================================================================
+BUG kmalloc-8: Redzone overwritten
+--------------------------------------------------------------------
+
+INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
+INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
+INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
+INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
+
+Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+ Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005
+ Redzone 0xc90f6d28: 00 cc cc cc .
+ Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
+
+ [<c010523d>] dump_trace+0x63/0x1eb
+ [<c01053df>] show_trace_log_lvl+0x1a/0x2f
+ [<c010601d>] show_trace+0x12/0x14
+ [<c0106035>] dump_stack+0x16/0x18
+ [<c017e0fa>] object_err+0x143/0x14b
+ [<c017e2cc>] check_object+0x66/0x234
+ [<c017eb43>] __slab_free+0x239/0x384
+ [<c017f446>] kfree+0xa6/0xc6
+ [<c02e2335>] get_modalias+0xb9/0xf5
+ [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
+ [<c027866a>] dev_uevent+0x1ad/0x1da
+ [<c0205024>] kobject_uevent_env+0x20a/0x45b
+ [<c020527f>] kobject_uevent+0xa/0xf
+ [<c02779f1>] store_uevent+0x4f/0x58
+ [<c027758e>] dev_attr_store+0x29/0x2f
+ [<c01bec4f>] sysfs_write_file+0x16e/0x19c
+ [<c0183ba7>] vfs_write+0xd1/0x15a
+ [<c01841d7>] sys_write+0x3d/0x72
+ [<c0104112>] sysenter_past_esp+0x5f/0x99
+ [<b7f7b410>] 0xb7f7b410
+ =======================
+
+FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
+
+If SLUB encounters a corrupted object (full detection requires the kernel
+to be booted with slub_debug) then the following output will be dumped
+into the syslog:
+
+1. Description of the problem encountered
+
+This will be a message in the system log starting with
+
+===============================================
+BUG <slab cache affected>: <What went wrong>
+-----------------------------------------------
+
+INFO: <corruption start>-<corruption_end> <more info>
+INFO: Slab <address> <slab information>
+INFO: Object <address> <object information>
+INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
+ cpu> pid=<pid of the process>
+INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
+ pid=<pid of the process>
+
+(Object allocation / free information is only available if SLAB_STORE_USER is
+set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+Various types of lines can follow the BUG SLUB line:
+
+Bytes b4 <address> : <bytes>
+ Shows a few bytes before the object where the problem was detected.
+ Can be useful if the corruption does not stop with the start of the
+ object.
+
+Object <address> : <bytes>
+ The bytes of the object. If the object is inactive then the bytes
+ typically contain poison values. Any non-poison value shows a
+ corruption by a write after free.
+
+Redzone <address> : <bytes>
+ The Redzone following the object. The Redzone is used to detect
+ writes after the object. All bytes should always have the same
+ value. If there is any deviation then it is due to a write after
+ the object boundary.
+
+ (Redzone information is only available if SLAB_RED_ZONE is set.
+ slub_debug sets that option)
+
+Padding <address> : <bytes>
+ Unused data to fill up the space in order to get the next object
+ properly aligned. In the debug case we make sure that there are
+ at least 4 bytes of padding. This allows the detection of writes
+ before the object.
+
+3. A stackdump
+
+The stackdump describes the location where the error was detected. The cause
+of the corruption is may be more likely found by looking at the function that
+allocated or freed the object.
+
+4. Report on how the problem was dealt with in order to ensure the continued
+operation of the system.
+
+These are messages in the system log beginning with
+
+FIX <slab cache affected>: <corrective action taken>
+
+In the above sample SLUB found that the Redzone of an active object has
+been overwritten. Here a string of 8 characters was written into a slab that
+has the length of 8 characters. However, a 8 character string needs a
+terminating 0. That zero has overwritten the first byte of the Redzone field.
+After reporting the details of the issue encountered the FIX SLUB message
+tell us that SLUB has restored the Redzone to its proper value and then
+system operations continue.
+
+Emergency operations:
+---------------------
+
+Minimal debugging (sanity checks alone) can be enabled by booting with
+
+ slub_debug=F
+
+This will be generally be enough to enable the resiliency features of slub
+which will keep the system running even if a bad kernel component will
+keep corrupting objects. This may be important for production systems.
+Performance will be impacted by the sanity checks and there will be a
+continual stream of error messages to the syslog but no additional memory
+will be used (unlike full debugging).
+
+No guarantees. The kernel component still needs to be fixed. Performance
+may be optimized further by locating the slab that experiences corruption
+and enabling debugging only for that cache
+
+I.e.
+
+ slub_debug=F,dentry
+
+If the corruption occurs by writing after the end of the object then it
+may be advisable to enable a Redzone to avoid corrupting the beginning
+of other objects.
+
+ slub_debug=FZ,dentry
+
+Christoph Lameter, May 30, 2007
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
new file mode 100644
index 0000000..125eed5
--- /dev/null
+++ b/Documentation/vm/unevictable-lru.txt
@@ -0,0 +1,615 @@
+
+This document describes the Linux memory management "Unevictable LRU"
+infrastructure and the use of this infrastructure to manage several types
+of "unevictable" pages. The document attempts to provide the overall
+rationale behind this mechanism and the rationale for some of the design
+decisions that drove the implementation. The latter design rationale is
+discussed in the context of an implementation description. Admittedly, one
+can obtain the implementation details--the "what does it do?"--by reading the
+code. One hopes that the descriptions below add value by provide the answer
+to "why does it do that?".
+
+Unevictable LRU Infrastructure:
+
+The Unevictable LRU adds an additional LRU list to track unevictable pages
+and to hide these pages from vmscan. This mechanism is based on a patch by
+Larry Woodman of Red Hat to address several scalability problems with page
+reclaim in Linux. The problems have been observed at customer sites on large
+memory x86_64 systems. For example, a non-numal x86_64 platform with 128GB
+of main memory will have over 32 million 4k pages in a single zone. When a
+large fraction of these pages are not evictable for any reason [see below],
+vmscan will spend a lot of time scanning the LRU lists looking for the small
+fraction of pages that are evictable. This can result in a situation where
+all cpus are spending 100% of their time in vmscan for hours or days on end,
+with the system completely unresponsive.
+
+The Unevictable LRU infrastructure addresses the following classes of
+unevictable pages:
+
++ page owned by ramfs
++ page mapped into SHM_LOCKed shared memory regions
++ page mapped into VM_LOCKED [mlock()ed] vmas
+
+The infrastructure might be able to handle other conditions that make pages
+unevictable, either by definition or by circumstance, in the future.
+
+
+The Unevictable LRU List
+
+The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list
+called the "unevictable" list and an associated page flag, PG_unevictable, to
+indicate that the page is being managed on the unevictable list. The
+PG_unevictable flag is analogous to, and mutually exclusive with, the PG_active
+flag in that it indicates on which LRU list a page resides when PG_lru is set.
+The unevictable LRU list is source configurable based on the UNEVICTABLE_LRU
+Kconfig option.
+
+The Unevictable LRU infrastructure maintains unevictable pages on an additional
+LRU list for a few reasons:
+
+1) We get to "treat unevictable pages just like we treat other pages in the
+ system, which means we get to use the same code to manipulate them, the
+ same code to isolate them (for migrate, etc.), the same code to keep track
+ of the statistics, etc..." [Rik van Riel]
+
+2) We want to be able to migrate unevictable pages between nodes--for memory
+ defragmentation, workload management and memory hotplug. The linux kernel
+ can only migrate pages that it can successfully isolate from the lru lists.
+ If we were to maintain pages elsewise than on an lru-like list, where they
+ can be found by isolate_lru_page(), we would prevent their migration, unless
+ we reworked migration code to find the unevictable pages.
+
+
+The unevictable LRU list does not differentiate between file backed and swap
+backed [anon] pages. This differentiation is only important while the pages
+are, in fact, evictable.
+
+The unevictable LRU list benefits from the "arrayification" of the per-zone
+LRU lists and statistics originally proposed and posted by Christoph Lameter.
+
+The unevictable list does not use the lru pagevec mechanism. Rather,
+unevictable pages are placed directly on the page's zone's unevictable
+list under the zone lru_lock. The reason for this is to prevent stranding
+of pages on the unevictable list when one task has the page isolated from the
+lru and other tasks are changing the "evictability" state of the page.
+
+
+Unevictable LRU and Memory Controller Interaction
+
+The memory controller data structure automatically gets a per zone unevictable
+lru list as a result of the "arrayification" of the per-zone LRU lists. The
+memory controller tracks the movement of pages to and from the unevictable list.
+When a memory control group comes under memory pressure, the controller will
+not attempt to reclaim pages on the unevictable list. This has a couple of
+effects. Because the pages are "hidden" from reclaim on the unevictable list,
+the reclaim process can be more efficient, dealing only with pages that have
+a chance of being reclaimed. On the other hand, if too many of the pages
+charged to the control group are unevictable, the evictable portion of the
+working set of the tasks in the control group may not fit into the available
+memory. This can cause the control group to thrash or to oom-kill tasks.
+
+
+Unevictable LRU: Detecting Unevictable Pages
+
+The function page_evictable(page, vma) in vmscan.c determines whether a
+page is evictable or not. For ramfs pages and pages in SHM_LOCKed regions,
+page_evictable() tests a new address space flag, AS_UNEVICTABLE, in the page's
+address space using a wrapper function. Wrapper functions are used to set,
+clear and test the flag to reduce the requirement for #ifdef's throughout the
+source code. AS_UNEVICTABLE is set on ramfs inode/mapping when it is created.
+This flag remains for the life of the inode.
+
+For shared memory regions, AS_UNEVICTABLE is set when an application
+successfully SHM_LOCKs the region and is removed when the region is
+SHM_UNLOCKed. Note that shmctl(SHM_LOCK, ...) does not populate the page
+tables for the region as does, for example, mlock(). So, we make no special
+effort to push any pages in the SHM_LOCKed region to the unevictable list.
+Vmscan will do this when/if it encounters the pages during reclaim. On
+SHM_UNLOCK, shmctl() scans the pages in the region and "rescues" them from the
+unevictable list if no other condition keeps them unevictable. If a SHM_LOCKed
+region is destroyed, the pages are also "rescued" from the unevictable list in
+the process of freeing them.
+
+page_evictable() detects mlock()ed pages by testing an additional page flag,
+PG_mlocked via the PageMlocked() wrapper. If the page is NOT mlocked, and a
+non-NULL vma is supplied, page_evictable() will check whether the vma is
+VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and
+update the appropriate statistics if the vma is VM_LOCKED. This method allows
+efficient "culling" of pages in the fault path that are being faulted in to
+VM_LOCKED vmas.
+
+
+Unevictable Pages and Vmscan [shrink_*_list()]
+
+If unevictable pages are culled in the fault path, or moved to the unevictable
+list at mlock() or mmap() time, vmscan will never encounter the pages until
+they have become evictable again, for example, via munlock() and have been
+"rescued" from the unevictable list. However, there may be situations where we
+decide, for the sake of expediency, to leave a unevictable page on one of the
+regular active/inactive LRU lists for vmscan to deal with. Vmscan checks for
+such pages in all of the shrink_{active|inactive|page}_list() functions and
+will "cull" such pages that it encounters--that is, it diverts those pages to
+the unevictable list for the zone being scanned.
+
+There may be situations where a page is mapped into a VM_LOCKED vma, but the
+page is not marked as PageMlocked. Such pages will make it all the way to
+shrink_page_list() where they will be detected when vmscan walks the reverse
+map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list()
+will cull the page at that point.
+
+Note that for anonymous pages, shrink_page_list() attempts to add the page to
+the swap cache before it tries to unmap the page. To avoid this unnecessary
+consumption of swap space, shrink_page_list() calls try_to_munlock() to check
+whether any VM_LOCKED vmas map the page without attempting to unmap the page.
+If try_to_munlock() returns SWAP_MLOCK, shrink_page_list() will cull the page
+without consuming swap space. try_to_munlock() will be described below.
+
+To "cull" an unevictable page, vmscan simply puts the page back on the lru
+list using putback_lru_page()--the inverse operation to isolate_lru_page()--
+after dropping the page lock. Because the condition which makes the page
+unevictable may change once the page is unlocked, putback_lru_page() will
+recheck the unevictable state of a page that it places on the unevictable lru
+list. If the page has become unevictable, putback_lru_page() removes it from
+the list and retries, including the page_unevictable() test. Because such a
+race is a rare event and movement of pages onto the unevictable list should be
+rare, these extra evictabilty checks should not occur in the majority of calls
+to putback_lru_page().
+
+
+Mlocked Page: Prior Work
+
+The "Unevictable Mlocked Pages" infrastructure is based on work originally
+posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
+Nick posted his patch as an alternative to a patch posted by Christoph
+Lameter to achieve the same objective--hiding mlocked pages from vmscan.
+In Nick's patch, he used one of the struct page lru list link fields as a count
+of VM_LOCKED vmas that map the page. This use of the link field for a count
+prevented the management of the pages on an LRU list. Thus, mlocked pages were
+not migratable as isolate_lru_page() could not find them and the lru list link
+field was not available to the migration subsystem. Nick resolved this by
+putting mlocked pages back on the lru list before attempting to isolate them,
+thus abandoning the count of VM_LOCKED vmas. When Nick's patch was integrated
+with the Unevictable LRU work, the count was replaced by walking the reverse
+map to determine whether any VM_LOCKED vmas mapped the page. More on this
+below.
+
+
+Mlocked Pages: Basic Management
+
+Mlocked pages--pages mapped into a VM_LOCKED vma--represent one class of
+unevictable pages. When such a page has been "noticed" by the memory
+management subsystem, the page is marked with the PG_mlocked [PageMlocked()]
+flag. A PageMlocked() page will be placed on the unevictable LRU list when
+it is added to the LRU. Pages can be "noticed" by memory management in
+several places:
+
+1) in the mlock()/mlockall() system call handlers.
+2) in the mmap() system call handler when mmap()ing a region with the
+ MAP_LOCKED flag, or mmap()ing a region in a task that has called
+ mlockall() with the MCL_FUTURE flag. Both of these conditions result
+ in the VM_LOCKED flag being set for the vma.
+3) in the fault path, if mlocked pages are "culled" in the fault path,
+ and when a VM_LOCKED stack segment is expanded.
+4) as mentioned above, in vmscan:shrink_page_list() with attempting to
+ reclaim a page in a VM_LOCKED vma--via try_to_unmap() or try_to_munlock().
+
+Mlocked pages become unlocked and rescued from the unevictable list when:
+
+1) mapped in a range unlocked via the munlock()/munlockall() system calls.
+2) munmapped() out of the last VM_LOCKED vma that maps the page, including
+ unmapping at task exit.
+3) when the page is truncated from the last VM_LOCKED vma of an mmap()ed file.
+4) before a page is COWed in a VM_LOCKED vma.
+
+
+Mlocked Pages: mlock()/mlockall() System Call Handling
+
+Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup()
+for each vma in the range specified by the call. In the case of mlockall(),
+this is the entire active address space of the task. Note that mlock_fixup()
+is used for both mlock()ing and munlock()ing a range of memory. A call to
+mlock() an already VM_LOCKED vma, or to munlock() a vma that is not VM_LOCKED
+is treated as a no-op--mlock_fixup() simply returns.
+
+If the vma passes some filtering described in "Mlocked Pages: Filtering Vmas"
+below, mlock_fixup() will attempt to merge the vma with its neighbors or split
+off a subset of the vma if the range does not cover the entire vma. Once the
+vma has been merged or split or neither, mlock_fixup() will call
+__mlock_vma_pages_range() to fault in the pages via get_user_pages() and
+to mark the pages as mlocked via mlock_vma_page().
+
+Note that the vma being mlocked might be mapped with PROT_NONE. In this case,
+get_user_pages() will be unable to fault in the pages. That's OK. If pages
+do end up getting faulted into this VM_LOCKED vma, we'll handle them in the
+fault path or in vmscan.
+
+Also note that a page returned by get_user_pages() could be truncated or
+migrated out from under us, while we're trying to mlock it. To detect
+this, __mlock_vma_pages_range() tests the page_mapping after acquiring
+the page lock. If the page is still associated with its mapping, we'll
+go ahead and call mlock_vma_page(). If the mapping is gone, we just
+unlock the page and move on. Worse case, this results in page mapped
+in a VM_LOCKED vma remaining on a normal LRU list without being
+PageMlocked(). Again, vmscan will detect and cull such pages.
+
+mlock_vma_page(), called with the page locked [N.B., not "mlocked"], will
+TestSetPageMlocked() for each page returned by get_user_pages(). We use
+TestSetPageMlocked() because the page might already be mlocked by another
+task/vma and we don't want to do extra work. We especially do not want to
+count an mlocked page more than once in the statistics. If the page was
+already mlocked, mlock_vma_page() is done.
+
+If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the
+page from the LRU, as it is likely on the appropriate active or inactive list
+at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will
+putback the page--putback_lru_page()--which will notice that the page is now
+mlocked and divert the page to the zone's unevictable LRU list. If
+mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
+it later if/when it attempts to reclaim the page.
+
+
+Mlocked Pages: Filtering Special Vmas
+
+mlock_fixup() filters several classes of "special" vmas:
+
+1) vmas with VM_IO|VM_PFNMAP set are skipped entirely. The pages behind
+ these mappings are inherently pinned, so we don't need to mark them as
+ mlocked. In any case, most of the pages have no struct page in which to
+ so mark the page. Because of this, get_user_pages() will fail for these
+ vmas, so there is no sense in attempting to visit them.
+
+2) vmas mapping hugetlbfs page are already effectively pinned into memory.
+ We don't need nor want to mlock() these pages. However, to preserve the
+ prior behavior of mlock()--before the unevictable/mlock changes--mlock_fixup()
+ will call make_pages_present() in the hugetlbfs vma range to allocate the
+ huge pages and populate the ptes.
+
+3) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of
+ kernel pages, such as the vdso page, relay channel pages, etc. These pages
+ are inherently unevictable and are not managed on the LRU lists.
+ mlock_fixup() treats these vmas the same as hugetlbfs vmas. It calls
+ make_pages_present() to populate the ptes.
+
+Note that for all of these special vmas, mlock_fixup() does not set the
+VM_LOCKED flag. Therefore, we won't have to deal with them later during
+munlock() or munmap()--for example, at task exit. Neither does mlock_fixup()
+account these vmas against the task's "locked_vm".
+
+Mlocked Pages: Downgrading the Mmap Semaphore.
+
+mlock_fixup() must be called with the mmap semaphore held for write, because
+it may have to merge or split vmas. However, mlocking a large region of
+memory can take a long time--especially if vmscan must reclaim pages to
+satisfy the regions requirements. Faulting in a large region with the mmap
+semaphore held for write can hold off other faults on the address space, in
+the case of a multi-threaded task. It can also hold off scans of the task's
+address space via /proc. While testing under heavy load, it was observed that
+the ps(1) command could be held off for many minutes while a large segment was
+mlock()ed down.
+
+To address this issue, and to make the system more responsive during mlock()ing
+of large segments, mlock_fixup() downgrades the mmap semaphore to read mode
+during the call to __mlock_vma_pages_range(). This works fine. However, the
+callers of mlock_fixup() expect the semaphore to be returned in write mode.
+So, mlock_fixup() "upgrades" the semphore to write mode. Linux does not
+support an atomic upgrade_sem() call, so mlock_fixup() must drop the semaphore
+and reacquire it in write mode. In a multi-threaded task, it is possible for
+the task memory map to change while the semaphore is dropped. Therefore,
+mlock_fixup() looks up the vma at the range start address after reacquiring
+the semaphore in write mode and verifies that it still covers the original
+range. If not, mlock_fixup() returns an error [-EAGAIN]. All callers of
+mlock_fixup() have been changed to deal with this new error condition.
+
+Note: when munlocking a region, all of the pages should already be resident--
+unless we have racing threads mlocking() and munlocking() regions. So,
+unlocking should not have to wait for page allocations nor faults of any kind.
+Therefore mlock_fixup() does not downgrade the semaphore for munlock().
+
+
+Mlocked Pages: munlock()/munlockall() System Call Handling
+
+The munlock() and munlockall() system calls are handled by the same functions--
+do_mlock[all]()--as the mlock() and mlockall() system calls with the unlock
+vs lock operation indicated by an argument. So, these system calls are also
+handled by mlock_fixup(). Again, if called for an already munlock()ed vma,
+mlock_fixup() simply returns. Because of the vma filtering discussed above,
+VM_LOCKED will not be set in any "special" vmas. So, these vmas will be
+ignored for munlock.
+
+If the vma is VM_LOCKED, mlock_fixup() again attempts to merge or split off
+the specified range. The range is then munlocked via the function
+__mlock_vma_pages_range()--the same function used to mlock a vma range--
+passing a flag to indicate that munlock() is being performed.
+
+Because the vma access protections could have been changed to PROT_NONE after
+faulting in and mlocking some pages, get_user_pages() was unreliable for visiting
+these pages for munlocking. Because we don't want to leave pages mlocked(),
+get_user_pages() was enhanced to accept a flag to ignore the permissions when
+fetching the pages--all of which should be resident as a result of previous
+mlock()ing.
+
+For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling
+munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked
+flag using TestClearPageMlocked(). As with mlock_vma_page(), munlock_vma_page()
+use the Test*PageMlocked() function to handle the case where the page might
+have already been unlocked by another task. If the page was mlocked,
+munlock_vma_page() updates that zone statistics for the number of mlocked
+pages. Note, however, that at this point we haven't checked whether the page
+is mapped by other VM_LOCKED vmas.
+
+We can't call try_to_munlock(), the function that walks the reverse map to check
+for other VM_LOCKED vmas, without first isolating the page from the LRU.
+try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
+not be on an lru list. [More on these below.] However, the call to
+isolate_lru_page() could fail, in which case we couldn't try_to_munlock().
+So, we go ahead and clear PG_mlocked up front, as this might be the only chance
+we have. If we can successfully isolate the page, we go ahead and
+try_to_munlock(), which will restore the PG_mlocked flag and update the zone
+page statistics if it finds another vma holding the page mlocked. If we fail
+to isolate the page, we'll have left a potentially mlocked page on the LRU.
+This is fine, because we'll catch it later when/if vmscan tries to reclaim the
+page. This should be relatively rare.
+
+Mlocked Pages: Migrating Them...
+
+A page that is being migrated has been isolated from the lru lists and is
+held locked across unmapping of the page, updating the page's mapping
+[address_space] entry and copying the contents and state, until the
+page table entry has been replaced with an entry that refers to the new
+page. Linux supports migration of mlocked pages and other unevictable
+pages. This involves simply moving the PageMlocked and PageUnevictable states
+from the old page to the new page.
+
+Note that page migration can race with mlocking or munlocking of the same
+page. This has been discussed from the mlock/munlock perspective in the
+respective sections above. Both processes [migration, m[un]locking], hold
+the page locked. This provides the first level of synchronization. Page
+migration zeros out the page_mapping of the old page before unlocking it,
+so m[un]lock can skip these pages by testing the page mapping under page
+lock.
+
+When completing page migration, we place the new and old pages back onto the
+lru after dropping the page lock. The "unneeded" page--old page on success,
+new page on failure--will be freed when the reference count held by the
+migration process is released. To ensure that we don't strand pages on the
+unevictable list because of a race between munlock and migration, page
+migration uses the putback_lru_page() function to add migrated pages back to
+the lru.
+
+
+Mlocked Pages: mmap(MAP_LOCKED) System Call Handling
+
+In addition the the mlock()/mlockall() system calls, an application can request
+that a region of memory be mlocked using the MAP_LOCKED flag with the mmap()
+call. Furthermore, any mmap() call or brk() call that expands the heap by a
+task that has previously called mlockall() with the MCL_FUTURE flag will result
+in the newly mapped memory being mlocked. Before the unevictable/mlock changes,
+the kernel simply called make_pages_present() to allocate pages and populate
+the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure, the
+mmap() handler and task address space expansion functions call
+mlock_vma_pages_range() specifying the vma and the address range to mlock.
+mlock_vma_pages_range() filters vmas like mlock_fixup(), as described above in
+"Mlocked Pages: Filtering Vmas". It will clear the VM_LOCKED flag, which will
+have already been set by the caller, in filtered vmas. Thus these vma's need
+not be visited for munlock when the region is unmapped.
+
+For "normal" vmas, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
+fault/allocate the pages and mlock them. Again, like mlock_fixup(),
+mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
+attempting to fault/allocate and mlock the pages; and "upgrades" the semaphore
+back to write mode before returning.
+
+The callers of mlock_vma_pages_range() will have already added the memory
+range to be mlocked to the task's "locked_vm". To account for filtered vmas,
+mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the
+callers then subtract a non-negative return value from the task's locked_vm.
+A negative return value represent an error--for example, from get_user_pages()
+attempting to fault in a vma with PROT_NONE access. In this case, we leave
+the memory range accounted as locked_vm, as the protections could be changed
+later and pages allocated into that region.
+
+
+Mlocked Pages: munmap()/exit()/exec() System Call Handling
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED vma that maps the pages.
+Before the unevictable/mlock changes, mlocking did not mark the pages in any way,
+so unmapping them required no processing.
+
+To munlock a range of memory under the unevictable/mlock infrastructure, the
+munmap() hander and task address space tear down function call
+munlock_vma_pages_all(). The name reflects the observation that one always
+specifies the entire vma range when munlock()ing during unmap of a region.
+Because of the vma filtering when mlocking() regions, only "normal" vmas that
+actually contain mlocked pages will be passed to munlock_vma_pages_all().
+
+munlock_vma_pages_all() clears the VM_LOCKED vma flag and, like mlock_fixup()
+for the munlock case, calls __munlock_vma_pages_range() to walk the page table
+for the vma's memory range and munlock_vma_page() each resident page mapped by
+the vma. This effectively munlocks the page, only if this is the last
+VM_LOCKED vma that maps the page.
+
+
+Mlocked Page: try_to_unmap()
+
+[Note: the code changes represented by this section are really quite small
+compared to the text to describe what happening and why, and to discuss the
+implications.]
+
+Pages can, of course, be mapped into multiple vmas. Some of these vmas may
+have VM_LOCKED flag set. It is possible for a page mapped into one or more
+VM_LOCKED vmas not to have the PG_mlocked flag set and therefore reside on one
+of the active or inactive LRU lists. This could happen if, for example, a
+task in the process of munlock()ing the page could not isolate the page from
+the LRU. As a result, vmscan/shrink_page_list() might encounter such a page
+as described in "Unevictable Pages and Vmscan [shrink_*_list()]". To
+handle this situation, try_to_unmap() has been enhanced to check for VM_LOCKED
+vmas while it is walking a page's reverse map.
+
+try_to_unmap() is always called, by either vmscan for reclaim or for page
+migration, with the argument page locked and isolated from the LRU. BUG_ON()
+assertions enforce this requirement. Separate functions handle anonymous and
+mapped file pages, as these types of pages have different reverse map
+mechanisms.
+
+ try_to_unmap_anon()
+
+To unmap anonymous pages, each vma in the list anchored in the anon_vma must be
+visited--at least until a VM_LOCKED vma is encountered. If the page is being
+unmapped for migration, VM_LOCKED vmas do not stop the process because mlocked
+pages are migratable. However, for reclaim, if the page is mapped into a
+VM_LOCKED vma, the scan stops. try_to_unmap() attempts to acquire the mmap
+semphore of the mm_struct to which the vma belongs in read mode. If this is
+successful, try_to_unmap() will mlock the page via mlock_vma_page()--we
+wouldn't have gotten to try_to_unmap() if the page were already mlocked--and
+will return SWAP_MLOCK, indicating that the page is unevictable. If the
+mmap semaphore cannot be acquired, we are not sure whether the page is really
+unevictable or not. In this case, try_to_unmap() will return SWAP_AGAIN.
+
+ try_to_unmap_file() -- linear mappings
+
+Unmapping of a mapped file page works the same, except that the scan visits
+all vmas that maps the page's index/page offset in the page's mapping's
+reverse map priority search tree. It must also visit each vma in the page's
+mapping's non-linear list, if the list is non-empty. As for anonymous pages,
+on encountering a VM_LOCKED vma for a mapped file page, try_to_unmap() will
+attempt to acquire the associated mm_struct's mmap semaphore to mlock the page,
+returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not.
+
+ try_to_unmap_file() -- non-linear mappings
+
+If a page's mapping contains a non-empty non-linear mapping vma list, then
+try_to_un{map|lock}() must also visit each vma in that list to determine
+whether the page is mapped in a VM_LOCKED vma. Again, the scan must visit
+all vmas in the non-linear list to ensure that the pages is not/should not be
+mlocked. If a VM_LOCKED vma is found in the list, the scan could terminate.
+However, there is no easy way to determine whether the page is actually mapped
+in a given vma--either for unmapping or testing whether the VM_LOCKED vma
+actually pins the page.
+
+So, try_to_unmap_file() handles non-linear mappings by scanning a certain
+number of pages--a "cluster"--in each non-linear vma associated with the page's
+mapping, for each file mapped page that vmscan tries to unmap. If this happens
+to unmap the page we're trying to unmap, try_to_unmap() will notice this on
+return--(page_mapcount(page) == 0)--and return SWAP_SUCCESS. Otherwise, it
+will return SWAP_AGAIN, causing vmscan to recirculate this page. We take
+advantage of the cluster scan in try_to_unmap_cluster() as follows:
+
+For each non-linear vma, try_to_unmap_cluster() attempts to acquire the mmap
+semaphore of the associated mm_struct for read without blocking. If this
+attempt is successful and the vma is VM_LOCKED, try_to_unmap_cluster() will
+retain the mmap semaphore for the scan; otherwise it drops it here. Then,
+for each page in the cluster, if we're holding the mmap semaphore for a locked
+vma, try_to_unmap_cluster() calls mlock_vma_page() to mlock the page. This
+call is a no-op if the page is already locked, but will mlock any pages in
+the non-linear mapping that happen to be unlocked. If one of the pages so
+mlocked is the page passed in to try_to_unmap(), try_to_unmap_cluster() will
+return SWAP_MLOCK, rather than the default SWAP_AGAIN. This will allow vmscan
+to cull the page, rather than recirculating it on the inactive list. Again,
+if try_to_unmap_cluster() cannot acquire the vma's mmap sem, it returns
+SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED vma, but
+couldn't be mlocked.
+
+
+Mlocked pages: try_to_munlock() Reverse Map Scan
+
+TODO/FIXME: a better name might be page_mlocked()--analogous to the
+page_referenced() reverse map walker--especially if we continue to call this
+from shrink_page_list(). See related TODO/FIXME below.
+
+When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() System
+Call Handling" above--tries to munlock a page, or when shrink_page_list()
+encounters an anonymous page that is not yet in the swap cache, they need to
+determine whether or not the page is mapped by any VM_LOCKED vma, without
+actually attempting to unmap all ptes from the page. For this purpose, the
+unevictable/mlock infrastructure introduced a variant of try_to_unmap() called
+try_to_munlock().
+
+try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
+mapped file pages with an additional argument specifing unlock versus unmap
+processing. Again, these functions walk the respective reverse maps looking
+for VM_LOCKED vmas. When such a vma is found for anonymous pages and file
+pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
+attempt to acquire the associated mmap semphore, mlock the page via
+mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the
+pre-clearing of the page's PG_mlocked done by munlock_vma_page() and informs
+shrink_page_list() that the anonymous page should be culled rather than added
+to the swap cache in preparation for a try_to_unmap() that will almost
+certainly fail.
+
+If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap
+semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list()
+to recycle the page on the inactive list and hope that it has better luck
+with the page next time.
+
+For file pages mapped into non-linear vmas, the try_to_munlock() logic works
+slightly differently. On encountering a VM_LOCKED non-linear vma that might
+map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking
+the page. munlock_vma_page() will just leave the page unlocked and let
+vmscan deal with it--the usual fallback position.
+
+Note that try_to_munlock()'s reverse map walk must visit every vma in a pages'
+reverse map to determine that a page is NOT mapped into any VM_LOCKED vma.
+However, the scan can terminate when it encounters a VM_LOCKED vma and can
+successfully acquire the vma's mmap semphore for read and mlock the page.
+Although try_to_munlock() can be called many [very many!] times when
+munlock()ing a large region or tearing down a large address space that has been
+mlocked via mlockall(), overall this is a fairly rare event. In addition,
+although shrink_page_list() calls try_to_munlock() for every anonymous page that
+it handles that is not yet in the swap cache, on average anonymous pages will
+have very short reverse map lists.
+
+Mlocked Page: Page Reclaim in shrink_*_list()
+
+shrink_active_list() culls any obviously unevictable pages--i.e.,
+!page_evictable(page, NULL)--diverting these to the unevictable lru
+list. However, shrink_active_list() only sees unevictable pages that
+made it onto the active/inactive lru lists. Note that these pages do not
+have PageUnevictable set--otherwise, they would be on the unevictable list and
+shrink_active_list would never see them.
+
+Some examples of these unevictable pages on the LRU lists are:
+
+1) ramfs pages that have been placed on the lru lists when first allocated.
+
+2) SHM_LOCKed shared memory pages. shmctl(SHM_LOCK) does not attempt to
+ allocate or fault in the pages in the shared memory region. This happens
+ when an application accesses the page the first time after SHM_LOCKing
+ the segment.
+
+3) Mlocked pages that could not be isolated from the lru and moved to the
+ unevictable list in mlock_vma_page().
+
+3) Pages mapped into multiple VM_LOCKED vmas, but try_to_munlock() couldn't
+ acquire the vma's mmap semaphore to test the flags and set PageMlocked.
+ munlock_vma_page() was forced to let the page back on to the normal
+ LRU list for vmscan to handle.
+
+shrink_inactive_list() also culls any unevictable pages that it finds
+on the inactive lists, again diverting them to the appropriate zone's unevictable
+lru list. shrink_inactive_list() should only see SHM_LOCKed pages that became
+SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or
+pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from
+the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice
+the latter, but will pass on to shrink_page_list().
+
+shrink_page_list() again culls obviously unevictable pages that it could
+encounter for similar reason to shrink_inactive_list(). As already discussed,
+shrink_page_list() proactively looks for anonymous pages that should have
+PG_mlocked set but don't--these would not be detected by page_evictable()--to
+avoid adding them to the swap cache unnecessarily. File pages mapped into
+VM_LOCKED vmas but without PG_mlocked set will make it all the way to
+try_to_unmap(). shrink_page_list() will divert them to the unevictable list when
+try_to_unmap() returns SWAP_MLOCK, as discussed above.
+
+TODO/FIXME: If we can enhance the swap cache to reliably remove entries
+with page_count(page) > 2, as long as all ptes are mapped to the page and
+not the swap entry, we can probably remove the call to try_to_munlock() in
+shrink_page_list() and just remove the page from the swap cache when
+try_to_unmap() returns SWAP_MLOCK. Currently, remove_exclusive_swap_page()
+doesn't seem to allow that.
+
+
diff --git a/Documentation/volatile-considered-harmful.txt b/Documentation/volatile-considered-harmful.txt
new file mode 100644
index 0000000..991c26a
--- /dev/null
+++ b/Documentation/volatile-considered-harmful.txt
@@ -0,0 +1,119 @@
+Why the "volatile" type class should not be used
+------------------------------------------------
+
+C programmers have often taken volatile to mean that the variable could be
+changed outside of the current thread of execution; as a result, they are
+sometimes tempted to use it in kernel code when shared data structures are
+being used. In other words, they have been known to treat volatile types
+as a sort of easy atomic variable, which they are not. The use of volatile in
+kernel code is almost never correct; this document describes why.
+
+The key point to understand with regard to volatile is that its purpose is
+to suppress optimization, which is almost never what one really wants to
+do. In the kernel, one must protect shared data structures against
+unwanted concurrent access, which is very much a different task. The
+process of protecting against unwanted concurrency will also avoid almost
+all optimization-related problems in a more efficient way.
+
+Like volatile, the kernel primitives which make concurrent access to data
+safe (spinlocks, mutexes, memory barriers, etc.) are designed to prevent
+unwanted optimization. If they are being used properly, there will be no
+need to use volatile as well. If volatile is still necessary, there is
+almost certainly a bug in the code somewhere. In properly-written kernel
+code, volatile can only serve to slow things down.
+
+Consider a typical block of kernel code:
+
+ spin_lock(&the_lock);
+ do_something_on(&shared_data);
+ do_something_else_with(&shared_data);
+ spin_unlock(&the_lock);
+
+If all the code follows the locking rules, the value of shared_data cannot
+change unexpectedly while the_lock is held. Any other code which might
+want to play with that data will be waiting on the lock. The spinlock
+primitives act as memory barriers - they are explicitly written to do so -
+meaning that data accesses will not be optimized across them. So the
+compiler might think it knows what will be in shared_data, but the
+spin_lock() call, since it acts as a memory barrier, will force it to
+forget anything it knows. There will be no optimization problems with
+accesses to that data.
+
+If shared_data were declared volatile, the locking would still be
+necessary. But the compiler would also be prevented from optimizing access
+to shared_data _within_ the critical section, when we know that nobody else
+can be working with it. While the lock is held, shared_data is not
+volatile. When dealing with shared data, proper locking makes volatile
+unnecessary - and potentially harmful.
+
+The volatile storage class was originally meant for memory-mapped I/O
+registers. Within the kernel, register accesses, too, should be protected
+by locks, but one also does not want the compiler "optimizing" register
+accesses within a critical section. But, within the kernel, I/O memory
+accesses are always done through accessor functions; accessing I/O memory
+directly through pointers is frowned upon and does not work on all
+architectures. Those accessors are written to prevent unwanted
+optimization, so, once again, volatile is unnecessary.
+
+Another situation where one might be tempted to use volatile is
+when the processor is busy-waiting on the value of a variable. The right
+way to perform a busy wait is:
+
+ while (my_variable != what_i_want)
+ cpu_relax();
+
+The cpu_relax() call can lower CPU power consumption or yield to a
+hyperthreaded twin processor; it also happens to serve as a memory barrier,
+so, once again, volatile is unnecessary. Of course, busy-waiting is
+generally an anti-social act to begin with.
+
+There are still a few rare situations where volatile makes sense in the
+kernel:
+
+ - The above-mentioned accessor functions might use volatile on
+ architectures where direct I/O memory access does work. Essentially,
+ each accessor call becomes a little critical section on its own and
+ ensures that the access happens as expected by the programmer.
+
+ - Inline assembly code which changes memory, but which has no other
+ visible side effects, risks being deleted by GCC. Adding the volatile
+ keyword to asm statements will prevent this removal.
+
+ - The jiffies variable is special in that it can have a different value
+ every time it is referenced, but it can be read without any special
+ locking. So jiffies can be volatile, but the addition of other
+ variables of this type is strongly frowned upon. Jiffies is considered
+ to be a "stupid legacy" issue (Linus's words) in this regard; fixing it
+ would be more trouble than it is worth.
+
+ - Pointers to data structures in coherent memory which might be modified
+ by I/O devices can, sometimes, legitimately be volatile. A ring buffer
+ used by a network adapter, where that adapter changes pointers to
+ indicate which descriptors have been processed, is an example of this
+ type of situation.
+
+For most code, none of the above justifications for volatile apply. As a
+result, the use of volatile is likely to be seen as a bug and will bring
+additional scrutiny to the code. Developers who are tempted to use
+volatile should take a step back and think about what they are truly trying
+to accomplish.
+
+Patches to remove volatile variables are generally welcome - as long as
+they come with a justification which shows that the concurrency issues have
+been properly thought through.
+
+
+NOTES
+-----
+
+[1] http://lwn.net/Articles/233481/
+[2] http://lwn.net/Articles/233482/
+
+CREDITS
+-------
+
+Original impetus and research by Randy Dunlap
+Written by Jonathan Corbet
+Improvements via comments from Satyam Sharma, Johannes Stezenbach, Jesper
+ Juhl, Heikki Orsila, H. Peter Anvin, Philipp Hahn, and Stefan
+ Richter.
diff --git a/Documentation/voyager.txt b/Documentation/voyager.txt
new file mode 100644
index 0000000..2749af5
--- /dev/null
+++ b/Documentation/voyager.txt
@@ -0,0 +1,95 @@
+Running Linux on the Voyager Architecture
+=========================================
+
+For full details and current project status, see
+
+http://www.hansenpartnership.com/voyager
+
+The voyager architecture was designed by NCR in the mid 80s to be a
+fully SMP capable RAS computing architecture built around intel's 486
+chip set. The voyager came in three levels of architectural
+sophistication: 3,4 and 5 --- 1 and 2 never made it out of prototype.
+The linux patches support only the Level 5 voyager architecture (any
+machine class 3435 and above).
+
+The Voyager Architecture
+------------------------
+
+Voyager machines consist of a Baseboard with a 386 diagnostic
+processor, a Power Supply Interface (PSI) a Primary and possibly
+Secondary Microchannel bus and between 2 and 20 voyager slots. The
+voyager slots can be populated with memory and cpu cards (up to 4GB
+memory and from 1 486 to 32 Pentium Pro processors). Internally, the
+voyager has a dual arbitrated system bus and a configuration and test
+bus (CAT). The voyager bus speed is 40MHz. Therefore (since all
+voyager cards are dual ported for each system bus) the maximum
+transfer rate is 320Mb/s but only if you have your slot configuration
+tuned (only memory cards can communicate with both busses at once, CPU
+cards utilise them one at a time).
+
+Voyager SMP
+-----------
+
+Since voyager was the first intel based SMP system, it is slightly
+more primitive than the Intel IO-APIC approach to SMP. Voyager allows
+arbitrary interrupt routing (including processor affinity routing) of
+all 16 PC type interrupts. However it does this by using a modified
+5259 master/slave chip set instead of an APIC bus. Additionally,
+voyager supports Cross Processor Interrupts (CPI) equivalent to the
+APIC IPIs. There are two routed voyager interrupt lines provided to
+each slot.
+
+Processor Cards
+---------------
+
+These come in single, dyadic and quad configurations (the quads are
+problematic--see later). The maximum configuration is 8 quad cards
+for 32 way SMP.
+
+Quad Processors
+---------------
+
+Because voyager only supplies two interrupt lines to each Processor
+card, the Quad processors have to be configured (and Bootstrapped) in
+as a pair of Master/Slave processors.
+
+In fact, most Quad cards only accept one VIC interrupt line, so they
+have one interrupt handling processor (called the VIC extended
+processor) and three non-interrupt handling processors.
+
+Current Status
+--------------
+
+The System will boot on Mono, Dyad and Quad cards. There was
+originally a Quad boot problem which has been fixed by proper gdt
+alignment in the initial boot loader. If you still cannot get your
+voyager system to boot, email me at:
+
+<J.E.J.Bottomley@HansenPartnership.com>
+
+
+The Quad cards now support using the separate Quad CPI vectors instead
+of going through the VIC mailbox system.
+
+The Level 4 architecture (3430 and 3360 Machines) should also work
+fine.
+
+Dump Switch
+-----------
+
+The voyager dump switch sends out a broadcast NMI which the voyager
+code intercepts and does a task dump.
+
+Power Switch
+------------
+
+The front panel power switch is intercepted by the kernel and should
+cause a system shutdown and power off.
+
+A Note About Mixed CPU Systems
+------------------------------
+
+Linux isn't designed to handle mixed CPU systems very well. In order
+to get everything going you *must* make sure that your lowest
+capability CPU is used for booting. Also, mixing CPU classes
+(e.g. 486 and 586) is really not going to work very well at all.
diff --git a/Documentation/w1/00-INDEX b/Documentation/w1/00-INDEX
new file mode 100644
index 0000000..cb49802
--- /dev/null
+++ b/Documentation/w1/00-INDEX
@@ -0,0 +1,10 @@
+00-INDEX
+ - This file
+slaves/
+ - Drivers that provide support for specific family codes.
+masters/
+ - Individual chips providing 1-wire busses.
+w1.generic
+ - The 1-wire (w1) bus
+w1.netlink
+ - Userspace communication protocol over connector [1].
diff --git a/Documentation/w1/masters/00-INDEX b/Documentation/w1/masters/00-INDEX
new file mode 100644
index 0000000..7b0ceaa
--- /dev/null
+++ b/Documentation/w1/masters/00-INDEX
@@ -0,0 +1,8 @@
+00-INDEX
+ - This file
+ds2482
+ - The Maxim/Dallas Semiconductor DS2482 provides 1-wire busses.
+ds2490
+ - The Maxim/Dallas Semiconductor DS2490 builds USB <-> W1 bridges.
+w1-gpio
+ - GPIO 1-wire bus master driver.
diff --git a/Documentation/w1/masters/ds2482 b/Documentation/w1/masters/ds2482
new file mode 100644
index 0000000..9210d6f
--- /dev/null
+++ b/Documentation/w1/masters/ds2482
@@ -0,0 +1,31 @@
+Kernel driver ds2482
+====================
+
+Supported chips:
+ * Maxim DS2482-100, Maxim DS2482-800
+ Prefix: 'ds2482'
+ Addresses scanned: None
+ Datasheets:
+ http://pdfserv.maxim-ic.com/en/ds/DS2482-100-DS2482S-100.pdf
+ http://pdfserv.maxim-ic.com/en/ds/DS2482-800-DS2482S-800.pdf
+
+Author: Ben Gardner <bgardner@wabtec.com>
+
+
+Description
+-----------
+
+The Maxim/Dallas Semiconductor DS2482 is a I2C device that provides
+one (DS2482-100) or eight (DS2482-800) 1-wire busses.
+
+
+General Remarks
+---------------
+
+Valid addresses are 0x18, 0x19, 0x1a, and 0x1b.
+However, the device cannot be detected without writing to the i2c bus, so no
+detection is done.
+You should force the device address.
+
+$ modprobe ds2482 force=0,0x18
+
diff --git a/Documentation/w1/masters/ds2490 b/Documentation/w1/masters/ds2490
new file mode 100644
index 0000000..28176de
--- /dev/null
+++ b/Documentation/w1/masters/ds2490
@@ -0,0 +1,70 @@
+Kernel driver ds2490
+====================
+
+Supported chips:
+ * Maxim DS2490 based
+
+Author: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+
+
+Description
+-----------
+
+The Maxim/Dallas Semiconductor DS2490 is a chip
+which allows to build USB <-> W1 bridges.
+
+DS9490(R) is a USB <-> W1 bus master device
+which has 0x81 family ID integrated chip and DS2490
+low-level operational chip.
+
+Notes and limitations.
+- The weak pullup current is a minimum of 0.9mA and maximum of 6.0mA.
+- The 5V strong pullup is supported with a minimum of 5.9mA and a
+ maximum of 30.4 mA. (From DS2490.pdf)
+- While the ds2490 supports a hardware search the code doesn't take
+ advantage of it (in tested case it only returned first device).
+- The hardware will detect when devices are attached to the bus on the
+ next bus (reset?) operation, however only a message is printed as
+ the core w1 code doesn't make use of the information. Connecting
+ one device tends to give multiple new device notifications.
+- The number of USB bus transactions could be reduced if w1_reset_send
+ was added to the API. The name is just a suggestion. It would take
+ a write buffer and a read buffer (along with sizes) as arguments.
+ The ds2490 block I/O command supports reset, write buffer, read
+ buffer, and strong pullup all in one command, instead of the current
+ 1 reset bus, 2 write the match rom command and slave rom id, 3 block
+ write and read data. The write buffer needs to have the match rom
+ command and slave rom id prepended to the front of the requested
+ write buffer, both of which are known to the driver.
+- The hardware supports normal, flexible, and overdrive bus
+ communication speeds, but only the normal is supported.
+- The registered w1_bus_master functions don't define error
+ conditions. If a bus search is in progress and the ds2490 is
+ removed it can produce a good amount of error output before the bus
+ search finishes.
+- The hardware supports detecting some error conditions, such as
+ short, alarming presence on reset, and no presence on reset, but the
+ driver doesn't query those values.
+- The ds2490 specification doesn't cover short bulk in reads in
+ detail, but my observation is if fewer bytes are requested than are
+ available, the bulk read will return an error and the hardware will
+ clear the entire bulk in buffer. It would be possible to read the
+ maximum buffer size to not run into this error condition, only extra
+ bytes in the buffer is a logic error in the driver. The code should
+ should match reads and writes as well as data sizes. Reads and
+ writes are serialized and the status verifies that the chip is idle
+ (and data is available) before the read is executed, so it should
+ not happen.
+- Running x86_64 2.6.24 UHCI under qemu 0.9.0 under x86_64 2.6.22-rc6
+ with a OHCI controller, ds2490 running in the guest would operate
+ normally the first time the module was loaded after qemu attached
+ the ds2490 hardware, but if the module was unloaded, then reloaded
+ most of the time one of the bulk out or in, and usually the bulk in
+ would fail. qemu sets a 50ms timeout and the bulk in would timeout
+ even when the status shows data available. A bulk out write would
+ show a successful completion, but the ds2490 status register would
+ show 0 bytes written. Detaching qemu from the ds2490 hardware and
+ reattaching would clear the problem. usbmon output in the guest and
+ host did not explain the problem. My guess is a bug in either qemu
+ or the host OS and more likely the host OS.
+-- 03-06-2008 David Fries <David@Fries.net>
diff --git a/Documentation/w1/masters/omap-hdq b/Documentation/w1/masters/omap-hdq
new file mode 100644
index 0000000..ca722e0
--- /dev/null
+++ b/Documentation/w1/masters/omap-hdq
@@ -0,0 +1,46 @@
+Kernel driver for omap HDQ/1-wire module.
+========================================
+
+Supported chips:
+================
+ HDQ/1-wire controller on the TI OMAP 2430/3430 platforms.
+
+A useful link about HDQ basics:
+===============================
+http://focus.ti.com/lit/an/slua408/slua408.pdf
+
+Description:
+============
+The HDQ/1-Wire module of TI OMAP2430/3430 platforms implement the hardware
+protocol of the master functions of the Benchmark HDQ and the Dallas
+Semiconductor 1-Wire protocols. These protocols use a single wire for
+communication between the master (HDQ/1-Wire controller) and the slave
+(HDQ/1-Wire external compliant device).
+
+A typical application of the HDQ/1-Wire module is the communication with battery
+monitor (gas gauge) integrated circuits.
+
+The controller supports operation in both HDQ and 1-wire mode. The essential
+difference between the HDQ and 1-wire mode is how the slave device responds to
+initialization pulse.In HDQ mode, the firmware does not require the host to
+create an initialization pulse to the slave.However, the slave can be reset by
+using an initialization pulse (also referred to as a break pulse).The slave
+does not respond with a presence pulse as it does in the 1-Wire protocol.
+
+Remarks:
+========
+The driver (drivers/w1/masters/omap_hdq.c) supports the HDQ mode of the
+controller. In this mode, as we can not read the ID which obeys the W1
+spec(family:id:crc), a module parameter can be passed to the driver which will
+be used to calculate the CRC and pass back an appropriate slave ID to the W1
+core.
+
+By default the master driver and the BQ slave i/f
+driver(drivers/w1/slaves/w1_bq27000.c) sets the ID to 1.
+Please note to load both the modules with a different ID if required, but note
+that the ID used should be same for both master and slave driver loading.
+
+e.g:
+insmod omap_hdq.ko W1_ID=2
+inamod w1_bq27000.ko F_ID=2
+
diff --git a/Documentation/w1/masters/w1-gpio b/Documentation/w1/masters/w1-gpio
new file mode 100644
index 0000000..af5d3b4
--- /dev/null
+++ b/Documentation/w1/masters/w1-gpio
@@ -0,0 +1,33 @@
+Kernel driver w1-gpio
+=====================
+
+Author: Ville Syrjala <syrjala@sci.fi>
+
+
+Description
+-----------
+
+GPIO 1-wire bus master driver. The driver uses the GPIO API to control the
+wire and the GPIO pin can be specified using platform data.
+
+
+Example (mach-at91)
+-------------------
+
+#include <linux/w1-gpio.h>
+
+static struct w1_gpio_platform_data foo_w1_gpio_pdata = {
+ .pin = AT91_PIN_PB20,
+ .is_open_drain = 1,
+};
+
+static struct platform_device foo_w1_device = {
+ .name = "w1-gpio",
+ .id = -1,
+ .dev.platform_data = &foo_w1_gpio_pdata,
+};
+
+...
+ at91_set_GPIO_periph(foo_w1_gpio_pdata.pin, 1);
+ at91_set_multi_drive(foo_w1_gpio_pdata.pin, 1);
+ platform_device_register(&foo_w1_device);
diff --git a/Documentation/w1/slaves/00-INDEX b/Documentation/w1/slaves/00-INDEX
new file mode 100644
index 0000000..f8101d6
--- /dev/null
+++ b/Documentation/w1/slaves/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+ - This file
+w1_therm
+ - The Maxim/Dallas Semiconductor ds18*20 temperature sensor.
diff --git a/Documentation/w1/slaves/w1_therm b/Documentation/w1/slaves/w1_therm
new file mode 100644
index 0000000..0403aaa
--- /dev/null
+++ b/Documentation/w1/slaves/w1_therm
@@ -0,0 +1,41 @@
+Kernel driver w1_therm
+====================
+
+Supported chips:
+ * Maxim ds18*20 based temperature sensors.
+
+Author: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+
+
+Description
+-----------
+
+w1_therm provides basic temperature conversion for ds18*20 devices.
+supported family codes:
+W1_THERM_DS18S20 0x10
+W1_THERM_DS1822 0x22
+W1_THERM_DS18B20 0x28
+
+Support is provided through the sysfs w1_slave file. Each open and
+read sequence will initiate a temperature conversion then provide two
+lines of ASCII output. The first line contains the nine hex bytes
+read along with a calculated crc value and YES or NO if it matched.
+If the crc matched the returned values are retained. The second line
+displays the retained values along with a temperature in millidegrees
+Centigrade after t=.
+
+Parasite powered devices are limited to one slave performing a
+temperature conversion at a time. If none of the devices are parasite
+powered it would be possible to convert all the devices at the same
+time and then go back to read individual sensors. That isn't
+currently supported. The driver also doesn't support reduced
+precision (which would also reduce the conversion time).
+
+The module parameter strong_pullup can be set to 0 to disable the
+strong pullup or 1 to enable. If enabled the 5V strong pullup will be
+enabled when the conversion is taking place provided the master driver
+must support the strong pullup (or it falls back to a pullup
+resistor). The DS18b20 temperature sensor specification lists a
+maximum current draw of 1.5mA and that a 5k pullup resistor is not
+sufficient. The strong pullup is designed to provide the additional
+current required.
diff --git a/Documentation/w1/w1.generic b/Documentation/w1/w1.generic
new file mode 100644
index 0000000..e3333ee
--- /dev/null
+++ b/Documentation/w1/w1.generic
@@ -0,0 +1,113 @@
+The 1-wire (w1) subsystem
+------------------------------------------------------------------
+The 1-wire bus is a simple master-slave bus that communicates via a single
+signal wire (plus ground, so two wires).
+
+Devices communicate on the bus by pulling the signal to ground via an open
+drain output and by sampling the logic level of the signal line.
+
+The w1 subsystem provides the framework for managing w1 masters and
+communication with slaves.
+
+All w1 slave devices must be connected to a w1 bus master device.
+
+Example w1 master devices:
+ DS9490 usb device
+ W1-over-GPIO
+ DS2482 (i2c to w1 bridge)
+ Emulated devices, such as a RS232 converter, parallel port adapter, etc
+
+
+What does the w1 subsystem do?
+------------------------------------------------------------------
+When a w1 master driver registers with the w1 subsystem, the following occurs:
+
+ - sysfs entries for that w1 master are created
+ - the w1 bus is periodically searched for new slave devices
+
+When a device is found on the bus, w1 core checks if driver for it's family is
+loaded. If so, the family driver is attached to the slave.
+If there is no driver for the family, default one is assigned, which allows to perform
+almost any kind of operations. Each logical operation is a transaction
+in nature, which can contain several (two or one) low-level operations.
+Let's see how one can read EEPROM context:
+1. one must write control buffer, i.e. buffer containing command byte
+and two byte address. At this step bus is reset and appropriate device
+is selected using either W1_SKIP_ROM or W1_MATCH_ROM command.
+Then provided control buffer is being written to the wire.
+2. reading. This will issue reading eeprom response.
+
+It is possible that between 1. and 2. w1 master thread will reset bus for searching
+and slave device will be even removed, but in this case 0xff will
+be read, since no device was selected.
+
+
+W1 device families
+------------------------------------------------------------------
+Slave devices are handled by a driver written for a family of w1 devices.
+
+A family driver populates a struct w1_family_ops (see w1_family.h) and
+registers with the w1 subsystem.
+
+Current family drivers:
+w1_therm - (ds18?20 thermal sensor family driver)
+ provides temperature reading function which is bound to ->rbin() method
+ of the above w1_family_ops structure.
+
+w1_smem - driver for simple 64bit memory cell provides ID reading method.
+
+You can call above methods by reading appropriate sysfs files.
+
+
+What does a w1 master driver need to implement?
+------------------------------------------------------------------
+
+The driver for w1 bus master must provide at minimum two functions.
+
+Emulated devices must provide the ability to set the output signal level
+(write_bit) and sample the signal level (read_bit).
+
+Devices that support the 1-wire natively must provide the ability to write and
+sample a bit (touch_bit) and reset the bus (reset_bus).
+
+Most hardware provides higher-level functions that offload w1 handling.
+See struct w1_bus_master definition in w1.h for details.
+
+
+w1 master sysfs interface
+------------------------------------------------------------------
+<xx-xxxxxxxxxxxxx> - a directory for a found device. The format is family-serial
+bus - (standard) symlink to the w1 bus
+driver - (standard) symlink to the w1 driver
+w1_master_add - Manually register a slave device
+w1_master_attempts - the number of times a search was attempted
+w1_master_max_slave_count
+ - the maximum slaves that may be attached to a master
+w1_master_name - the name of the device (w1_bus_masterX)
+w1_master_pullup - 5V strong pullup 0 enabled, 1 disabled
+w1_master_remove - Manually remove a slave device
+w1_master_search - the number of searches left to do, -1=continual (default)
+w1_master_slave_count
+ - the number of slaves found
+w1_master_slaves - the names of the slaves, one per line
+w1_master_timeout - the delay in seconds between searches
+
+If you have a w1 bus that never changes (you don't add or remove devices),
+you can set the module parameter search_count to a small positive number
+for an initially small number of bus searches. Alternatively it could be
+set to zero, then manually add the slave device serial numbers by
+w1_master_add device file. The w1_master_add and w1_master_remove files
+generally only make sense when searching is disabled, as a search will
+redetect manually removed devices that are present and timeout manually
+added devices that aren't on the bus.
+
+
+w1 slave sysfs interface
+------------------------------------------------------------------
+bus - (standard) symlink to the w1 bus
+driver - (standard) symlink to the w1 driver
+name - the device name, usually the same as the directory name
+w1_slave - (optional) a binary file whose meaning depends on the
+ family driver
+rw - (optional) created for slave devices which do not have
+ appropriate family driver. Allows to read/write binary data.
diff --git a/Documentation/w1/w1.netlink b/Documentation/w1/w1.netlink
new file mode 100644
index 0000000..3640c7c
--- /dev/null
+++ b/Documentation/w1/w1.netlink
@@ -0,0 +1,98 @@
+Userspace communication protocol over connector [1].
+
+
+Message types.
+=============
+
+There are three types of messages between w1 core and userspace:
+1. Events. They are generated each time new master or slave device found
+ either due to automatic or requested search.
+2. Userspace commands. Includes read/write and search/alarm search comamnds.
+3. Replies to userspace commands.
+
+
+Protocol.
+========
+
+[struct cn_msg] - connector header. It's length field is equal to size of the attached data.
+[struct w1_netlink_msg] - w1 netlink header.
+ __u8 type - message type.
+ W1_SLAVE_ADD/W1_SLAVE_REMOVE - slave add/remove events.
+ W1_MASTER_ADD/W1_MASTER_REMOVE - master add/remove events.
+ W1_MASTER_CMD - userspace command for bus master device (search/alarm search).
+ W1_SLAVE_CMD - userspace command for slave device (read/write/ search/alarm search
+ for bus master device where given slave device found).
+ __u8 res - reserved
+ __u16 len - size of attached to this header data.
+ union {
+ __u8 id; - slave unique device id
+ struct w1_mst {
+ __u32 id; - master's id.
+ __u32 res; - reserved
+ } mst;
+ } id;
+
+[strucrt w1_netlink_cmd] - command for gived master or slave device.
+ __u8 cmd - command opcode.
+ W1_CMD_READ - read command.
+ W1_CMD_WRITE - write command.
+ W1_CMD_SEARCH - search command.
+ W1_CMD_ALARM_SEARCH - alarm search command.
+ __u8 res - reserved
+ __u16 len - length of data for this command.
+ For read command data must be allocated like for write command.
+ __u8 data[0] - data for this command.
+
+
+Each connector message can include one or more w1_netlink_msg with zero of more attached w1_netlink_cmd messages.
+
+For event messages there are no w1_netlink_cmd embedded structures, only connector header
+and w1_netlink_msg strucutre with "len" field being zero and filled type (one of event types)
+and id - either 8 bytes of slave unique id in host order, or master's id, which is assigned
+to bus master device when it is added to w1 core.
+
+Currently replies to userspace commands are only generated for read command request.
+One reply is generated exactly for one w1_netlink_cmd read request.
+Replies are not combined when sent - i.e. typical reply messages looks like the following:
+[cn_msg][w1_netlink_msg][w1_netlink_cmd]
+cn_msg.len = sizeof(struct w1_netlink_msg) + sizeof(struct w1_netlink_cmd) + cmd->len;
+w1_netlink_msg.len = sizeof(struct w1_netlink_cmd) + cmd->len;
+w1_netlink_cmd.len = cmd->len;
+
+
+Operation steps in w1 core when new command is received.
+=======================================================
+
+When new message (w1_netlink_msg) is received w1 core detects if it is master of slave request,
+according to w1_netlink_msg.type field.
+Then master or slave device is searched for.
+When found, master device (requested or those one on where slave device is found) is locked.
+If slave command is requested, then reset/select procedure is started to select given device.
+
+Then all requested in w1_netlink_msg operations are performed one by one.
+If command requires reply (like read command) it is sent on command completion.
+
+When all commands (w1_netlink_cmd) are processed muster device is unlocked
+and next w1_netlink_msg header processing started.
+
+
+Connector [1] specific documentation.
+====================================
+
+Each connector message includes two u32 fields as "address".
+w1 uses CN_W1_IDX and CN_W1_VAL defined in include/linux/connector.h header.
+Each message also includes sequence and acknowledge numbers.
+Sequence number for event messages is appropriate bus master sequence number increased with
+each event message sent "through" this master.
+Sequence number for userspace requests is set by userspace application.
+Sequence number for reply is the same as was in request, and
+acknowledge number is set to seq+1.
+
+
+Additional documantion, source code examples.
+============================================
+
+1. Documentation/connector
+2. http://tservice.net.ru/~s0mbre/archive/w1
+This archive includes userspace application w1d.c which
+uses read/write/search commands for all master/slave devices found on the bus.
diff --git a/Documentation/watchdog/00-INDEX b/Documentation/watchdog/00-INDEX
new file mode 100644
index 0000000..c3ea47e
--- /dev/null
+++ b/Documentation/watchdog/00-INDEX
@@ -0,0 +1,10 @@
+00-INDEX
+ - this file.
+pcwd-watchdog.txt
+ - documentation for Berkshire Products PC Watchdog ISA cards.
+src/
+ - directory holding watchdog related example programs.
+watchdog-api.txt
+ - description of the Linux Watchdog driver API.
+wdt.txt
+ - description of the Watchdog Timer Interfaces for Linux.
diff --git a/Documentation/watchdog/pcwd-watchdog.txt b/Documentation/watchdog/pcwd-watchdog.txt
new file mode 100644
index 0000000..4f68052
--- /dev/null
+++ b/Documentation/watchdog/pcwd-watchdog.txt
@@ -0,0 +1,66 @@
+Last reviewed: 10/05/2007
+
+ Berkshire Products PC Watchdog Card
+ Support for ISA Cards Revision A and C
+ Documentation and Driver by Ken Hollis <kenji@bitgate.com>
+
+ The PC Watchdog is a card that offers the same type of functionality that
+ the WDT card does, only it doesn't require an IRQ to run. Furthermore,
+ the Revision C card allows you to monitor any IO Port to automatically
+ trigger the card into being reset. This way you can make the card
+ monitor hard drive status, or anything else you need.
+
+ The Watchdog Driver has one basic role: to talk to the card and send
+ signals to it so it doesn't reset your computer ... at least during
+ normal operation.
+
+ The Watchdog Driver will automatically find your watchdog card, and will
+ attach a running driver for use with that card. After the watchdog
+ drivers have initialized, you can then talk to the card using a PC
+ Watchdog program.
+
+ I suggest putting a "watchdog -d" before the beginning of an fsck, and
+ a "watchdog -e -t 1" immediately after the end of an fsck. (Remember
+ to run the program with an "&" to run it in the background!)
+
+ If you want to write a program to be compatible with the PC Watchdog
+ driver, simply use of modify the watchdog test program:
+ Documentation/watchdog/src/watchdog-test.c
+
+
+ Other IOCTL functions include:
+
+ WDIOC_GETSUPPORT
+ This returns the support of the card itself. This
+ returns in structure "PCWDS" which returns:
+ options = WDIOS_TEMPPANIC
+ (This card supports temperature)
+ firmware_version = xxxx
+ (Firmware version of the card)
+
+ WDIOC_GETSTATUS
+ This returns the status of the card, with the bits of
+ WDIOF_* bitwise-anded into the value. (The comments
+ are in linux/pcwd.h)
+
+ WDIOC_GETBOOTSTATUS
+ This returns the status of the card that was reported
+ at bootup.
+
+ WDIOC_GETTEMP
+ This returns the temperature of the card. (You can also
+ read /dev/watchdog, which gives a temperature update
+ every second.)
+
+ WDIOC_SETOPTIONS
+ This lets you set the options of the card. You can either
+ enable or disable the card this way.
+
+ WDIOC_KEEPALIVE
+ This pings the card to tell it not to reset your computer.
+
+ And that's all she wrote!
+
+ -- Ken Hollis
+ (kenji@bitgate.com)
+
diff --git a/Documentation/watchdog/src/.gitignore b/Documentation/watchdog/src/.gitignore
new file mode 100644
index 0000000..ac90997
--- /dev/null
+++ b/Documentation/watchdog/src/.gitignore
@@ -0,0 +1,2 @@
+watchdog-simple
+watchdog-test
diff --git a/Documentation/watchdog/src/Makefile b/Documentation/watchdog/src/Makefile
new file mode 100644
index 0000000..40e5f46
--- /dev/null
+++ b/Documentation/watchdog/src/Makefile
@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := watchdog-simple watchdog-test
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/Documentation/watchdog/src/watchdog-simple.c b/Documentation/watchdog/src/watchdog-simple.c
new file mode 100644
index 0000000..4cf72f3
--- /dev/null
+++ b/Documentation/watchdog/src/watchdog-simple.c
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+int main(void)
+{
+ int fd = open("/dev/watchdog", O_WRONLY);
+ int ret = 0;
+ if (fd == -1) {
+ perror("watchdog");
+ exit(EXIT_FAILURE);
+ }
+ while (1) {
+ ret = write(fd, "\0", 1);
+ if (ret != 1) {
+ ret = -1;
+ break;
+ }
+ ret = fsync(fd);
+ if (ret)
+ break;
+ sleep(10);
+ }
+ close(fd);
+ return ret;
+}
diff --git a/Documentation/watchdog/src/watchdog-test.c b/Documentation/watchdog/src/watchdog-test.c
new file mode 100644
index 0000000..65f6c19
--- /dev/null
+++ b/Documentation/watchdog/src/watchdog-test.c
@@ -0,0 +1,68 @@
+/*
+ * Watchdog Driver Test Program
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/watchdog.h>
+
+int fd;
+
+/*
+ * This function simply sends an IOCTL to the driver, which in turn ticks
+ * the PC Watchdog card to reset its internal timer so it doesn't trigger
+ * a computer reset.
+ */
+void keep_alive(void)
+{
+ int dummy;
+
+ ioctl(fd, WDIOC_KEEPALIVE, &dummy);
+}
+
+/*
+ * The main program. Run the program with "-d" to disable the card,
+ * or "-e" to enable the card.
+ */
+int main(int argc, char *argv[])
+{
+ fd = open("/dev/watchdog", O_WRONLY);
+
+ if (fd == -1) {
+ fprintf(stderr, "Watchdog device not enabled.\n");
+ fflush(stderr);
+ exit(-1);
+ }
+
+ if (argc > 1) {
+ if (!strncasecmp(argv[1], "-d", 2)) {
+ ioctl(fd, WDIOC_SETOPTIONS, WDIOS_DISABLECARD);
+ fprintf(stderr, "Watchdog card disabled.\n");
+ fflush(stderr);
+ exit(0);
+ } else if (!strncasecmp(argv[1], "-e", 2)) {
+ ioctl(fd, WDIOC_SETOPTIONS, WDIOS_ENABLECARD);
+ fprintf(stderr, "Watchdog card enabled.\n");
+ fflush(stderr);
+ exit(0);
+ } else {
+ fprintf(stderr, "-d to disable, -e to enable.\n");
+ fprintf(stderr, "run by itself to tick the card.\n");
+ fflush(stderr);
+ exit(0);
+ }
+ } else {
+ fprintf(stderr, "Watchdog Ticking Away!\n");
+ fflush(stderr);
+ }
+
+ while(1) {
+ keep_alive();
+ sleep(1);
+ }
+}
diff --git a/Documentation/watchdog/watchdog-api.txt b/Documentation/watchdog/watchdog-api.txt
new file mode 100644
index 0000000..4cc4ba9
--- /dev/null
+++ b/Documentation/watchdog/watchdog-api.txt
@@ -0,0 +1,238 @@
+Last reviewed: 10/05/2007
+
+
+The Linux Watchdog driver API.
+
+Copyright 2002 Christer Weingel <wingel@nano-system.com>
+
+Some parts of this document are copied verbatim from the sbc60xxwdt
+driver which is (c) Copyright 2000 Jakob Oestergaard <jakob@ostenfeld.dk>
+
+This document describes the state of the Linux 2.4.18 kernel.
+
+Introduction:
+
+A Watchdog Timer (WDT) is a hardware circuit that can reset the
+computer system in case of a software fault. You probably knew that
+already.
+
+Usually a userspace daemon will notify the kernel watchdog driver via the
+/dev/watchdog special device file that userspace is still alive, at
+regular intervals. When such a notification occurs, the driver will
+usually tell the hardware watchdog that everything is in order, and
+that the watchdog should wait for yet another little while to reset
+the system. If userspace fails (RAM error, kernel bug, whatever), the
+notifications cease to occur, and the hardware watchdog will reset the
+system (causing a reboot) after the timeout occurs.
+
+The Linux watchdog API is a rather ad-hoc construction and different
+drivers implement different, and sometimes incompatible, parts of it.
+This file is an attempt to document the existing usage and allow
+future driver writers to use it as a reference.
+
+The simplest API:
+
+All drivers support the basic mode of operation, where the watchdog
+activates as soon as /dev/watchdog is opened and will reboot unless
+the watchdog is pinged within a certain time, this time is called the
+timeout or margin. The simplest way to ping the watchdog is to write
+some data to the device. So a very simple watchdog daemon would look
+like this source file: see Documentation/watchdog/src/watchdog-simple.c
+
+A more advanced driver could for example check that a HTTP server is
+still responding before doing the write call to ping the watchdog.
+
+When the device is closed, the watchdog is disabled, unless the "Magic
+Close" feature is supported (see below). This is not always such a
+good idea, since if there is a bug in the watchdog daemon and it
+crashes the system will not reboot. Because of this, some of the
+drivers support the configuration option "Disable watchdog shutdown on
+close", CONFIG_WATCHDOG_NOWAYOUT. If it is set to Y when compiling
+the kernel, there is no way of disabling the watchdog once it has been
+started. So, if the watchdog daemon crashes, the system will reboot
+after the timeout has passed. Watchdog devices also usually support
+the nowayout module parameter so that this option can be controlled at
+runtime.
+
+Magic Close feature:
+
+If a driver supports "Magic Close", the driver will not disable the
+watchdog unless a specific magic character 'V' has been sent to
+/dev/watchdog just before closing the file. If the userspace daemon
+closes the file without sending this special character, the driver
+will assume that the daemon (and userspace in general) died, and will
+stop pinging the watchdog without disabling it first. This will then
+cause a reboot if the watchdog is not re-opened in sufficient time.
+
+The ioctl API:
+
+All conforming drivers also support an ioctl API.
+
+Pinging the watchdog using an ioctl:
+
+All drivers that have an ioctl interface support at least one ioctl,
+KEEPALIVE. This ioctl does exactly the same thing as a write to the
+watchdog device, so the main loop in the above program could be
+replaced with:
+
+ while (1) {
+ ioctl(fd, WDIOC_KEEPALIVE, 0);
+ sleep(10);
+ }
+
+the argument to the ioctl is ignored.
+
+Setting and getting the timeout:
+
+For some drivers it is possible to modify the watchdog timeout on the
+fly with the SETTIMEOUT ioctl, those drivers have the WDIOF_SETTIMEOUT
+flag set in their option field. The argument is an integer
+representing the timeout in seconds. The driver returns the real
+timeout used in the same variable, and this timeout might differ from
+the requested one due to limitation of the hardware.
+
+ int timeout = 45;
+ ioctl(fd, WDIOC_SETTIMEOUT, &timeout);
+ printf("The timeout was set to %d seconds\n", timeout);
+
+This example might actually print "The timeout was set to 60 seconds"
+if the device has a granularity of minutes for its timeout.
+
+Starting with the Linux 2.4.18 kernel, it is possible to query the
+current timeout using the GETTIMEOUT ioctl.
+
+ ioctl(fd, WDIOC_GETTIMEOUT, &timeout);
+ printf("The timeout was is %d seconds\n", timeout);
+
+Pretimeouts:
+
+Some watchdog timers can be set to have a trigger go off before the
+actual time they will reset the system. This can be done with an NMI,
+interrupt, or other mechanism. This allows Linux to record useful
+information (like panic information and kernel coredumps) before it
+resets.
+
+ pretimeout = 10;
+ ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout);
+
+Note that the pretimeout is the number of seconds before the time
+when the timeout will go off. It is not the number of seconds until
+the pretimeout. So, for instance, if you set the timeout to 60 seconds
+and the pretimeout to 10 seconds, the pretimout will go of in 50
+seconds. Setting a pretimeout to zero disables it.
+
+There is also a get function for getting the pretimeout:
+
+ ioctl(fd, WDIOC_GETPRETIMEOUT, &timeout);
+ printf("The pretimeout was is %d seconds\n", timeout);
+
+Not all watchdog drivers will support a pretimeout.
+
+Get the number of seconds before reboot:
+
+Some watchdog drivers have the ability to report the remaining time
+before the system will reboot. The WDIOC_GETTIMELEFT is the ioctl
+that returns the number of seconds before reboot.
+
+ ioctl(fd, WDIOC_GETTIMELEFT, &timeleft);
+ printf("The timeout was is %d seconds\n", timeleft);
+
+Environmental monitoring:
+
+All watchdog drivers are required return more information about the system,
+some do temperature, fan and power level monitoring, some can tell you
+the reason for the last reboot of the system. The GETSUPPORT ioctl is
+available to ask what the device can do:
+
+ struct watchdog_info ident;
+ ioctl(fd, WDIOC_GETSUPPORT, &ident);
+
+the fields returned in the ident struct are:
+
+ identity a string identifying the watchdog driver
+ firmware_version the firmware version of the card if available
+ options a flags describing what the device supports
+
+the options field can have the following bits set, and describes what
+kind of information that the GET_STATUS and GET_BOOT_STATUS ioctls can
+return. [FIXME -- Is this correct?]
+
+ WDIOF_OVERHEAT Reset due to CPU overheat
+
+The machine was last rebooted by the watchdog because the thermal limit was
+exceeded
+
+ WDIOF_FANFAULT Fan failed
+
+A system fan monitored by the watchdog card has failed
+
+ WDIOF_EXTERN1 External relay 1
+
+External monitoring relay/source 1 was triggered. Controllers intended for
+real world applications include external monitoring pins that will trigger
+a reset.
+
+ WDIOF_EXTERN2 External relay 2
+
+External monitoring relay/source 2 was triggered
+
+ WDIOF_POWERUNDER Power bad/power fault
+
+The machine is showing an undervoltage status
+
+ WDIOF_CARDRESET Card previously reset the CPU
+
+The last reboot was caused by the watchdog card
+
+ WDIOF_POWEROVER Power over voltage
+
+The machine is showing an overvoltage status. Note that if one level is
+under and one over both bits will be set - this may seem odd but makes
+sense.
+
+ WDIOF_KEEPALIVEPING Keep alive ping reply
+
+The watchdog saw a keepalive ping since it was last queried.
+
+ WDIOF_SETTIMEOUT Can set/get the timeout
+
+The watchdog can do pretimeouts.
+
+ WDIOF_PRETIMEOUT Pretimeout (in seconds), get/set
+
+
+For those drivers that return any bits set in the option field, the
+GETSTATUS and GETBOOTSTATUS ioctls can be used to ask for the current
+status, and the status at the last reboot, respectively.
+
+ int flags;
+ ioctl(fd, WDIOC_GETSTATUS, &flags);
+
+ or
+
+ ioctl(fd, WDIOC_GETBOOTSTATUS, &flags);
+
+Note that not all devices support these two calls, and some only
+support the GETBOOTSTATUS call.
+
+Some drivers can measure the temperature using the GETTEMP ioctl. The
+returned value is the temperature in degrees fahrenheit.
+
+ int temperature;
+ ioctl(fd, WDIOC_GETTEMP, &temperature);
+
+Finally the SETOPTIONS ioctl can be used to control some aspects of
+the cards operation; right now the pcwd driver is the only one
+supporting this ioctl.
+
+ int options = 0;
+ ioctl(fd, WDIOC_SETOPTIONS, options);
+
+The following options are available:
+
+ WDIOS_DISABLECARD Turn off the watchdog timer
+ WDIOS_ENABLECARD Turn on the watchdog timer
+ WDIOS_TEMPPANIC Kernel panic on temperature trip
+
+[FIXME -- better explanations]
+
diff --git a/Documentation/watchdog/wdt.txt b/Documentation/watchdog/wdt.txt
new file mode 100644
index 0000000..03fd756
--- /dev/null
+++ b/Documentation/watchdog/wdt.txt
@@ -0,0 +1,43 @@
+Last Reviewed: 10/05/2007
+
+ WDT Watchdog Timer Interfaces For The Linux Operating System
+ Alan Cox <alan@lxorguk.ukuu.org.uk>
+
+ ICS WDT501-P
+ ICS WDT501-P (no fan tachometer)
+ ICS WDT500-P
+
+All the interfaces provide /dev/watchdog, which when open must be written
+to within a timeout or the machine will reboot. Each write delays the reboot
+time another timeout. In the case of the software watchdog the ability to
+reboot will depend on the state of the machines and interrupts. The hardware
+boards physically pull the machine down off their own onboard timers and
+will reboot from almost anything.
+
+A second temperature monitoring interface is available on the WDT501P cards
+This provides /dev/temperature. This is the machine internal temperature in
+degrees Fahrenheit. Each read returns a single byte giving the temperature.
+
+The third interface logs kernel messages on additional alert events.
+
+The wdt card cannot be safely probed for. Instead you need to pass
+wdt=ioaddr,irq as a boot parameter - eg "wdt=0x240,11".
+
+Features
+--------
+ WDT501P WDT500P
+Reboot Timer X X
+External Reboot X X
+I/O Port Monitor o o
+Temperature X o
+Fan Speed X o
+Power Under X o
+Power Over X o
+Overheat X o
+
+The external event interfaces on the WDT boards are not currently supported.
+Minor numbers are however allocated for it.
+
+
+Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c
+
diff --git a/Documentation/x86/00-INDEX b/Documentation/x86/00-INDEX
new file mode 100644
index 0000000..dbe3377
--- /dev/null
+++ b/Documentation/x86/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+ - this file
+mtrr.txt
+ - how to use x86 Memory Type Range Registers to increase performance
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
new file mode 100644
index 0000000..83c0033
--- /dev/null
+++ b/Documentation/x86/boot.txt
@@ -0,0 +1,900 @@
+ THE LINUX/x86 BOOT PROTOCOL
+ ---------------------------
+
+On the x86 platform, the Linux kernel uses a rather complicated boot
+convention. This has evolved partially due to historical aspects, as
+well as the desire in the early days to have the kernel itself be a
+bootable image, the complicated PC memory model and due to changed
+expectations in the PC industry caused by the effective demise of
+real-mode DOS as a mainstream operating system.
+
+Currently, the following versions of the Linux/x86 boot protocol exist.
+
+Old kernels: zImage/Image support only. Some very early kernels
+ may not even support a command line.
+
+Protocol 2.00: (Kernel 1.3.73) Added bzImage and initrd support, as
+ well as a formalized way to communicate between the
+ boot loader and the kernel. setup.S made relocatable,
+ although the traditional setup area still assumed
+ writable.
+
+Protocol 2.01: (Kernel 1.3.76) Added a heap overrun warning.
+
+Protocol 2.02: (Kernel 2.4.0-test3-pre3) New command line protocol.
+ Lower the conventional memory ceiling. No overwrite
+ of the traditional setup area, thus making booting
+ safe for systems which use the EBDA from SMM or 32-bit
+ BIOS entry points. zImage deprecated but still
+ supported.
+
+Protocol 2.03: (Kernel 2.4.18-pre1) Explicitly makes the highest possible
+ initrd address available to the bootloader.
+
+Protocol 2.04: (Kernel 2.6.14) Extend the syssize field to four bytes.
+
+Protocol 2.05: (Kernel 2.6.20) Make protected mode kernel relocatable.
+ Introduce relocatable_kernel and kernel_alignment fields.
+
+Protocol 2.06: (Kernel 2.6.22) Added a field that contains the size of
+ the boot command line.
+
+Protocol 2.07: (Kernel 2.6.24) Added paravirtualised boot protocol.
+ Introduced hardware_subarch and hardware_subarch_data
+ and KEEP_SEGMENTS flag in load_flags.
+
+Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format
+ payload. Introduced payload_offset and payload length
+ fields to aid in locating the payload.
+
+Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical
+ pointer to single linked list of struct setup_data.
+
+**** MEMORY LAYOUT
+
+The traditional memory map for the kernel loader, used for Image or
+zImage kernels, typically looks like:
+
+ | |
+0A0000 +------------------------+
+ | Reserved for BIOS | Do not use. Reserved for BIOS EBDA.
+09A000 +------------------------+
+ | Command line |
+ | Stack/heap | For use by the kernel real-mode code.
+098000 +------------------------+
+ | Kernel setup | The kernel real-mode code.
+090200 +------------------------+
+ | Kernel boot sector | The kernel legacy boot sector.
+090000 +------------------------+
+ | Protected-mode kernel | The bulk of the kernel image.
+010000 +------------------------+
+ | Boot loader | <- Boot sector entry point 0000:7C00
+001000 +------------------------+
+ | Reserved for MBR/BIOS |
+000800 +------------------------+
+ | Typically used by MBR |
+000600 +------------------------+
+ | BIOS use only |
+000000 +------------------------+
+
+
+When using bzImage, the protected-mode kernel was relocated to
+0x100000 ("high memory"), and the kernel real-mode block (boot sector,
+setup, and stack/heap) was made relocatable to any address between
+0x10000 and end of low memory. Unfortunately, in protocols 2.00 and
+2.01 the 0x90000+ memory range is still used internally by the kernel;
+the 2.02 protocol resolves that problem.
+
+It is desirable to keep the "memory ceiling" -- the highest point in
+low memory touched by the boot loader -- as low as possible, since
+some newer BIOSes have begun to allocate some rather large amounts of
+memory, called the Extended BIOS Data Area, near the top of low
+memory. The boot loader should use the "INT 12h" BIOS call to verify
+how much low memory is available.
+
+Unfortunately, if INT 12h reports that the amount of memory is too
+low, there is usually nothing the boot loader can do but to report an
+error to the user. The boot loader should therefore be designed to
+take up as little space in low memory as it reasonably can. For
+zImage or old bzImage kernels, which need data written into the
+0x90000 segment, the boot loader should make sure not to use memory
+above the 0x9A000 point; too many BIOSes will break above that point.
+
+For a modern bzImage kernel with boot protocol version >= 2.02, a
+memory layout like the following is suggested:
+
+ ~ ~
+ | Protected-mode kernel |
+100000 +------------------------+
+ | I/O memory hole |
+0A0000 +------------------------+
+ | Reserved for BIOS | Leave as much as possible unused
+ ~ ~
+ | Command line | (Can also be below the X+10000 mark)
+X+10000 +------------------------+
+ | Stack/heap | For use by the kernel real-mode code.
+X+08000 +------------------------+
+ | Kernel setup | The kernel real-mode code.
+ | Kernel boot sector | The kernel legacy boot sector.
+X +------------------------+
+ | Boot loader | <- Boot sector entry point 0000:7C00
+001000 +------------------------+
+ | Reserved for MBR/BIOS |
+000800 +------------------------+
+ | Typically used by MBR |
+000600 +------------------------+
+ | BIOS use only |
+000000 +------------------------+
+
+... where the address X is as low as the design of the boot loader
+permits.
+
+
+**** THE REAL-MODE KERNEL HEADER
+
+In the following text, and anywhere in the kernel boot sequence, "a
+sector" refers to 512 bytes. It is independent of the actual sector
+size of the underlying medium.
+
+The first step in loading a Linux kernel should be to load the
+real-mode code (boot sector and setup code) and then examine the
+following header at offset 0x01f1. The real-mode code can total up to
+32K, although the boot loader may choose to load only the first two
+sectors (1K) and then examine the bootup sector size.
+
+The header looks like:
+
+Offset Proto Name Meaning
+/Size
+
+01F1/1 ALL(1 setup_sects The size of the setup in sectors
+01F2/2 ALL root_flags If set, the root is mounted readonly
+01F4/4 2.04+(2 syssize The size of the 32-bit code in 16-byte paras
+01F8/2 ALL ram_size DO NOT USE - for bootsect.S use only
+01FA/2 ALL vid_mode Video mode control
+01FC/2 ALL root_dev Default root device number
+01FE/2 ALL boot_flag 0xAA55 magic number
+0200/2 2.00+ jump Jump instruction
+0202/4 2.00+ header Magic signature "HdrS"
+0206/2 2.00+ version Boot protocol version supported
+0208/4 2.00+ realmode_swtch Boot loader hook (see below)
+020C/2 2.00+ start_sys The load-low segment (0x1000) (obsolete)
+020E/2 2.00+ kernel_version Pointer to kernel version string
+0210/1 2.00+ type_of_loader Boot loader identifier
+0211/1 2.00+ loadflags Boot protocol option flags
+0212/2 2.00+ setup_move_size Move to high memory size (used with hooks)
+0214/4 2.00+ code32_start Boot loader hook (see below)
+0218/4 2.00+ ramdisk_image initrd load address (set by boot loader)
+021C/4 2.00+ ramdisk_size initrd size (set by boot loader)
+0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only
+0224/2 2.01+ heap_end_ptr Free memory after setup end
+0226/2 N/A pad1 Unused
+0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line
+022C/4 2.03+ initrd_addr_max Highest legal initrd address
+0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel
+0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
+0235/3 N/A pad2 Unused
+0238/4 2.06+ cmdline_size Maximum size of the kernel command line
+023C/4 2.07+ hardware_subarch Hardware subarchitecture
+0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data
+0248/4 2.08+ payload_offset Offset of kernel payload
+024C/4 2.08+ payload_length Length of kernel payload
+0250/8 2.09+ setup_data 64-bit physical pointer to linked list
+ of struct setup_data
+
+(1) For backwards compatibility, if the setup_sects field contains 0, the
+ real value is 4.
+
+(2) For boot protocol prior to 2.04, the upper two bytes of the syssize
+ field are unusable, which means the size of a bzImage kernel
+ cannot be determined.
+
+If the "HdrS" (0x53726448) magic number is not found at offset 0x202,
+the boot protocol version is "old". Loading an old kernel, the
+following parameters should be assumed:
+
+ Image type = zImage
+ initrd not supported
+ Real-mode kernel must be located at 0x90000.
+
+Otherwise, the "version" field contains the protocol version,
+e.g. protocol version 2.01 will contain 0x0201 in this field. When
+setting fields in the header, you must make sure only to set fields
+supported by the protocol version in use.
+
+
+**** DETAILS OF HEADER FIELDS
+
+For each field, some are information from the kernel to the bootloader
+("read"), some are expected to be filled out by the bootloader
+("write"), and some are expected to be read and modified by the
+bootloader ("modify").
+
+All general purpose boot loaders should write the fields marked
+(obligatory). Boot loaders who want to load the kernel at a
+nonstandard address should fill in the fields marked (reloc); other
+boot loaders can ignore those fields.
+
+The byte order of all fields is littleendian (this is x86, after all.)
+
+Field name: setup_sects
+Type: read
+Offset/size: 0x1f1/1
+Protocol: ALL
+
+ The size of the setup code in 512-byte sectors. If this field is
+ 0, the real value is 4. The real-mode code consists of the boot
+ sector (always one 512-byte sector) plus the setup code.
+
+Field name: root_flags
+Type: modify (optional)
+Offset/size: 0x1f2/2
+Protocol: ALL
+
+ If this field is nonzero, the root defaults to readonly. The use of
+ this field is deprecated; use the "ro" or "rw" options on the
+ command line instead.
+
+Field name: syssize
+Type: read
+Offset/size: 0x1f4/4 (protocol 2.04+) 0x1f4/2 (protocol ALL)
+Protocol: 2.04+
+
+ The size of the protected-mode code in units of 16-byte paragraphs.
+ For protocol versions older than 2.04 this field is only two bytes
+ wide, and therefore cannot be trusted for the size of a kernel if
+ the LOAD_HIGH flag is set.
+
+Field name: ram_size
+Type: kernel internal
+Offset/size: 0x1f8/2
+Protocol: ALL
+
+ This field is obsolete.
+
+Field name: vid_mode
+Type: modify (obligatory)
+Offset/size: 0x1fa/2
+
+ Please see the section on SPECIAL COMMAND LINE OPTIONS.
+
+Field name: root_dev
+Type: modify (optional)
+Offset/size: 0x1fc/2
+Protocol: ALL
+
+ The default root device device number. The use of this field is
+ deprecated, use the "root=" option on the command line instead.
+
+Field name: boot_flag
+Type: read
+Offset/size: 0x1fe/2
+Protocol: ALL
+
+ Contains 0xAA55. This is the closest thing old Linux kernels have
+ to a magic number.
+
+Field name: jump
+Type: read
+Offset/size: 0x200/2
+Protocol: 2.00+
+
+ Contains an x86 jump instruction, 0xEB followed by a signed offset
+ relative to byte 0x202. This can be used to determine the size of
+ the header.
+
+Field name: header
+Type: read
+Offset/size: 0x202/4
+Protocol: 2.00+
+
+ Contains the magic number "HdrS" (0x53726448).
+
+Field name: version
+Type: read
+Offset/size: 0x206/2
+Protocol: 2.00+
+
+ Contains the boot protocol version, in (major << 8)+minor format,
+ e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version
+ 10.17.
+
+Field name: readmode_swtch
+Type: modify (optional)
+Offset/size: 0x208/4
+Protocol: 2.00+
+
+ Boot loader hook (see ADVANCED BOOT LOADER HOOKS below.)
+
+Field name: start_sys
+Type: read
+Offset/size: 0x20c/2
+Protocol: 2.00+
+
+ The load low segment (0x1000). Obsolete.
+
+Field name: kernel_version
+Type: read
+Offset/size: 0x20e/2
+Protocol: 2.00+
+
+ If set to a nonzero value, contains a pointer to a NUL-terminated
+ human-readable kernel version number string, less 0x200. This can
+ be used to display the kernel version to the user. This value
+ should be less than (0x200*setup_sects).
+
+ For example, if this value is set to 0x1c00, the kernel version
+ number string can be found at offset 0x1e00 in the kernel file.
+ This is a valid value if and only if the "setup_sects" field
+ contains the value 15 or higher, as:
+
+ 0x1c00 < 15*0x200 (= 0x1e00) but
+ 0x1c00 >= 14*0x200 (= 0x1c00)
+
+ 0x1c00 >> 9 = 14, so the minimum value for setup_secs is 15.
+
+Field name: type_of_loader
+Type: write (obligatory)
+Offset/size: 0x210/1
+Protocol: 2.00+
+
+ If your boot loader has an assigned id (see table below), enter
+ 0xTV here, where T is an identifier for the boot loader and V is
+ a version number. Otherwise, enter 0xFF here.
+
+ Assigned boot loader ids:
+ 0 LILO (0x00 reserved for pre-2.00 bootloader)
+ 1 Loadlin
+ 2 bootsect-loader (0x20, all other values reserved)
+ 3 SYSLINUX
+ 4 EtherBoot
+ 5 ELILO
+ 7 GRuB
+ 8 U-BOOT
+ 9 Xen
+ A Gujin
+ B Qemu
+
+ Please contact <hpa@zytor.com> if you need a bootloader ID
+ value assigned.
+
+Field name: loadflags
+Type: modify (obligatory)
+Offset/size: 0x211/1
+Protocol: 2.00+
+
+ This field is a bitmask.
+
+ Bit 0 (read): LOADED_HIGH
+ - If 0, the protected-mode code is loaded at 0x10000.
+ - If 1, the protected-mode code is loaded at 0x100000.
+
+ Bit 5 (write): QUIET_FLAG
+ - If 0, print early messages.
+ - If 1, suppress early messages.
+ This requests to the kernel (decompressor and early
+ kernel) to not write early messages that require
+ accessing the display hardware directly.
+
+ Bit 6 (write): KEEP_SEGMENTS
+ Protocol: 2.07+
+ - If 0, reload the segment registers in the 32bit entry point.
+ - If 1, do not reload the segment registers in the 32bit entry point.
+ Assume that %cs %ds %ss %es are all set to flat segments with
+ a base of 0 (or the equivalent for their environment).
+
+ Bit 7 (write): CAN_USE_HEAP
+ Set this bit to 1 to indicate that the value entered in the
+ heap_end_ptr is valid. If this field is clear, some setup code
+ functionality will be disabled.
+
+Field name: setup_move_size
+Type: modify (obligatory)
+Offset/size: 0x212/2
+Protocol: 2.00-2.01
+
+ When using protocol 2.00 or 2.01, if the real mode kernel is not
+ loaded at 0x90000, it gets moved there later in the loading
+ sequence. Fill in this field if you want additional data (such as
+ the kernel command line) moved in addition to the real-mode kernel
+ itself.
+
+ The unit is bytes starting with the beginning of the boot sector.
+
+ This field is can be ignored when the protocol is 2.02 or higher, or
+ if the real-mode code is loaded at 0x90000.
+
+Field name: code32_start
+Type: modify (optional, reloc)
+Offset/size: 0x214/4
+Protocol: 2.00+
+
+ The address to jump to in protected mode. This defaults to the load
+ address of the kernel, and can be used by the boot loader to
+ determine the proper load address.
+
+ This field can be modified for two purposes:
+
+ 1. as a boot loader hook (see ADVANCED BOOT LOADER HOOKS below.)
+
+ 2. if a bootloader which does not install a hook loads a
+ relocatable kernel at a nonstandard address it will have to modify
+ this field to point to the load address.
+
+Field name: ramdisk_image
+Type: write (obligatory)
+Offset/size: 0x218/4
+Protocol: 2.00+
+
+ The 32-bit linear address of the initial ramdisk or ramfs. Leave at
+ zero if there is no initial ramdisk/ramfs.
+
+Field name: ramdisk_size
+Type: write (obligatory)
+Offset/size: 0x21c/4
+Protocol: 2.00+
+
+ Size of the initial ramdisk or ramfs. Leave at zero if there is no
+ initial ramdisk/ramfs.
+
+Field name: bootsect_kludge
+Type: kernel internal
+Offset/size: 0x220/4
+Protocol: 2.00+
+
+ This field is obsolete.
+
+Field name: heap_end_ptr
+Type: write (obligatory)
+Offset/size: 0x224/2
+Protocol: 2.01+
+
+ Set this field to the offset (from the beginning of the real-mode
+ code) of the end of the setup stack/heap, minus 0x0200.
+
+Field name: cmd_line_ptr
+Type: write (obligatory)
+Offset/size: 0x228/4
+Protocol: 2.02+
+
+ Set this field to the linear address of the kernel command line.
+ The kernel command line can be located anywhere between the end of
+ the setup heap and 0xA0000; it does not have to be located in the
+ same 64K segment as the real-mode code itself.
+
+ Fill in this field even if your boot loader does not support a
+ command line, in which case you can point this to an empty string
+ (or better yet, to the string "auto".) If this field is left at
+ zero, the kernel will assume that your boot loader does not support
+ the 2.02+ protocol.
+
+Field name: initrd_addr_max
+Type: read
+Offset/size: 0x22c/4
+Protocol: 2.03+
+
+ The maximum address that may be occupied by the initial
+ ramdisk/ramfs contents. For boot protocols 2.02 or earlier, this
+ field is not present, and the maximum address is 0x37FFFFFF. (This
+ address is defined as the address of the highest safe byte, so if
+ your ramdisk is exactly 131072 bytes long and this field is
+ 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.)
+
+Field name: kernel_alignment
+Type: read (reloc)
+Offset/size: 0x230/4
+Protocol: 2.05+
+
+ Alignment unit required by the kernel (if relocatable_kernel is true.)
+
+Field name: relocatable_kernel
+Type: read (reloc)
+Offset/size: 0x234/1
+Protocol: 2.05+
+
+ If this field is nonzero, the protected-mode part of the kernel can
+ be loaded at any address that satisfies the kernel_alignment field.
+ After loading, the boot loader must set the code32_start field to
+ point to the loaded code, or to a boot loader hook.
+
+Field name: cmdline_size
+Type: read
+Offset/size: 0x238/4
+Protocol: 2.06+
+
+ The maximum size of the command line without the terminating
+ zero. This means that the command line can contain at most
+ cmdline_size characters. With protocol version 2.05 and earlier, the
+ maximum size was 255.
+
+Field name: hardware_subarch
+Type: write (optional, defaults to x86/PC)
+Offset/size: 0x23c/4
+Protocol: 2.07+
+
+ In a paravirtualized environment the hardware low level architectural
+ pieces such as interrupt handling, page table handling, and
+ accessing process control registers needs to be done differently.
+
+ This field allows the bootloader to inform the kernel we are in one
+ one of those environments.
+
+ 0x00000000 The default x86/PC environment
+ 0x00000001 lguest
+ 0x00000002 Xen
+
+Field name: hardware_subarch_data
+Type: write (subarch-dependent)
+Offset/size: 0x240/8
+Protocol: 2.07+
+
+ A pointer to data that is specific to hardware subarch
+ This field is currently unused for the default x86/PC environment,
+ do not modify.
+
+Field name: payload_offset
+Type: read
+Offset/size: 0x248/4
+Protocol: 2.08+
+
+ If non-zero then this field contains the offset from the end of the
+ real-mode code to the payload.
+
+ The payload may be compressed. The format of both the compressed and
+ uncompressed data should be determined using the standard magic
+ numbers. Currently only gzip compressed ELF is used.
+
+Field name: payload_length
+Type: read
+Offset/size: 0x24c/4
+Protocol: 2.08+
+
+ The length of the payload.
+
+Field name: setup_data
+Type: write (special)
+Offset/size: 0x250/8
+Protocol: 2.09+
+
+ The 64-bit physical pointer to NULL terminated single linked list of
+ struct setup_data. This is used to define a more extensible boot
+ parameters passing mechanism. The definition of struct setup_data is
+ as follow:
+
+ struct setup_data {
+ u64 next;
+ u32 type;
+ u32 len;
+ u8 data[0];
+ };
+
+ Where, the next is a 64-bit physical pointer to the next node of
+ linked list, the next field of the last node is 0; the type is used
+ to identify the contents of data; the len is the length of data
+ field; the data holds the real payload.
+
+ This list may be modified at a number of points during the bootup
+ process. Therefore, when modifying this list one should always make
+ sure to consider the case where the linked list already contains
+ entries.
+
+
+**** THE IMAGE CHECKSUM
+
+From boot protocol version 2.08 onwards the CRC-32 is calculated over
+the entire file using the characteristic polynomial 0x04C11DB7 and an
+initial remainder of 0xffffffff. The checksum is appended to the
+file; therefore the CRC of the file up to the limit specified in the
+syssize field of the header is always 0.
+
+
+**** THE KERNEL COMMAND LINE
+
+The kernel command line has become an important way for the boot
+loader to communicate with the kernel. Some of its options are also
+relevant to the boot loader itself, see "special command line options"
+below.
+
+The kernel command line is a null-terminated string. The maximum
+length can be retrieved from the field cmdline_size. Before protocol
+version 2.06, the maximum was 255 characters. A string that is too
+long will be automatically truncated by the kernel.
+
+If the boot protocol version is 2.02 or later, the address of the
+kernel command line is given by the header field cmd_line_ptr (see
+above.) This address can be anywhere between the end of the setup
+heap and 0xA0000.
+
+If the protocol version is *not* 2.02 or higher, the kernel
+command line is entered using the following protocol:
+
+ At offset 0x0020 (word), "cmd_line_magic", enter the magic
+ number 0xA33F.
+
+ At offset 0x0022 (word), "cmd_line_offset", enter the offset
+ of the kernel command line (relative to the start of the
+ real-mode kernel).
+
+ The kernel command line *must* be within the memory region
+ covered by setup_move_size, so you may need to adjust this
+ field.
+
+
+**** MEMORY LAYOUT OF THE REAL-MODE CODE
+
+The real-mode code requires a stack/heap to be set up, as well as
+memory allocated for the kernel command line. This needs to be done
+in the real-mode accessible memory in bottom megabyte.
+
+It should be noted that modern machines often have a sizable Extended
+BIOS Data Area (EBDA). As a result, it is advisable to use as little
+of the low megabyte as possible.
+
+Unfortunately, under the following circumstances the 0x90000 memory
+segment has to be used:
+
+ - When loading a zImage kernel ((loadflags & 0x01) == 0).
+ - When loading a 2.01 or earlier boot protocol kernel.
+
+ -> For the 2.00 and 2.01 boot protocols, the real-mode code
+ can be loaded at another address, but it is internally
+ relocated to 0x90000. For the "old" protocol, the
+ real-mode code must be loaded at 0x90000.
+
+When loading at 0x90000, avoid using memory above 0x9a000.
+
+For boot protocol 2.02 or higher, the command line does not have to be
+located in the same 64K segment as the real-mode setup code; it is
+thus permitted to give the stack/heap the full 64K segment and locate
+the command line above it.
+
+The kernel command line should not be located below the real-mode
+code, nor should it be located in high memory.
+
+
+**** SAMPLE BOOT CONFIGURATION
+
+As a sample configuration, assume the following layout of the real
+mode segment:
+
+ When loading below 0x90000, use the entire segment:
+
+ 0x0000-0x7fff Real mode kernel
+ 0x8000-0xdfff Stack and heap
+ 0xe000-0xffff Kernel command line
+
+ When loading at 0x90000 OR the protocol version is 2.01 or earlier:
+
+ 0x0000-0x7fff Real mode kernel
+ 0x8000-0x97ff Stack and heap
+ 0x9800-0x9fff Kernel command line
+
+Such a boot loader should enter the following fields in the header:
+
+ unsigned long base_ptr; /* base address for real-mode segment */
+
+ if ( setup_sects == 0 ) {
+ setup_sects = 4;
+ }
+
+ if ( protocol >= 0x0200 ) {
+ type_of_loader = <type code>;
+ if ( loading_initrd ) {
+ ramdisk_image = <initrd_address>;
+ ramdisk_size = <initrd_size>;
+ }
+
+ if ( protocol >= 0x0202 && loadflags & 0x01 )
+ heap_end = 0xe000;
+ else
+ heap_end = 0x9800;
+
+ if ( protocol >= 0x0201 ) {
+ heap_end_ptr = heap_end - 0x200;
+ loadflags |= 0x80; /* CAN_USE_HEAP */
+ }
+
+ if ( protocol >= 0x0202 ) {
+ cmd_line_ptr = base_ptr + heap_end;
+ strcpy(cmd_line_ptr, cmdline);
+ } else {
+ cmd_line_magic = 0xA33F;
+ cmd_line_offset = heap_end;
+ setup_move_size = heap_end + strlen(cmdline)+1;
+ strcpy(base_ptr+cmd_line_offset, cmdline);
+ }
+ } else {
+ /* Very old kernel */
+
+ heap_end = 0x9800;
+
+ cmd_line_magic = 0xA33F;
+ cmd_line_offset = heap_end;
+
+ /* A very old kernel MUST have its real-mode code
+ loaded at 0x90000 */
+
+ if ( base_ptr != 0x90000 ) {
+ /* Copy the real-mode kernel */
+ memcpy(0x90000, base_ptr, (setup_sects+1)*512);
+ base_ptr = 0x90000; /* Relocated */
+ }
+
+ strcpy(0x90000+cmd_line_offset, cmdline);
+
+ /* It is recommended to clear memory up to the 32K mark */
+ memset(0x90000 + (setup_sects+1)*512, 0,
+ (64-(setup_sects+1))*512);
+ }
+
+
+**** LOADING THE REST OF THE KERNEL
+
+The 32-bit (non-real-mode) kernel starts at offset (setup_sects+1)*512
+in the kernel file (again, if setup_sects == 0 the real value is 4.)
+It should be loaded at address 0x10000 for Image/zImage kernels and
+0x100000 for bzImage kernels.
+
+The kernel is a bzImage kernel if the protocol >= 2.00 and the 0x01
+bit (LOAD_HIGH) in the loadflags field is set:
+
+ is_bzImage = (protocol >= 0x0200) && (loadflags & 0x01);
+ load_address = is_bzImage ? 0x100000 : 0x10000;
+
+Note that Image/zImage kernels can be up to 512K in size, and thus use
+the entire 0x10000-0x90000 range of memory. This means it is pretty
+much a requirement for these kernels to load the real-mode part at
+0x90000. bzImage kernels allow much more flexibility.
+
+
+**** SPECIAL COMMAND LINE OPTIONS
+
+If the command line provided by the boot loader is entered by the
+user, the user may expect the following command line options to work.
+They should normally not be deleted from the kernel command line even
+though not all of them are actually meaningful to the kernel. Boot
+loader authors who need additional command line options for the boot
+loader itself should get them registered in
+Documentation/kernel-parameters.txt to make sure they will not
+conflict with actual kernel options now or in the future.
+
+ vga=<mode>
+ <mode> here is either an integer (in C notation, either
+ decimal, octal, or hexadecimal) or one of the strings
+ "normal" (meaning 0xFFFF), "ext" (meaning 0xFFFE) or "ask"
+ (meaning 0xFFFD). This value should be entered into the
+ vid_mode field, as it is used by the kernel before the command
+ line is parsed.
+
+ mem=<size>
+ <size> is an integer in C notation optionally followed by
+ (case insensitive) K, M, G, T, P or E (meaning << 10, << 20,
+ << 30, << 40, << 50 or << 60). This specifies the end of
+ memory to the kernel. This affects the possible placement of
+ an initrd, since an initrd should be placed near end of
+ memory. Note that this is an option to *both* the kernel and
+ the bootloader!
+
+ initrd=<file>
+ An initrd should be loaded. The meaning of <file> is
+ obviously bootloader-dependent, and some boot loaders
+ (e.g. LILO) do not have such a command.
+
+In addition, some boot loaders add the following options to the
+user-specified command line:
+
+ BOOT_IMAGE=<file>
+ The boot image which was loaded. Again, the meaning of <file>
+ is obviously bootloader-dependent.
+
+ auto
+ The kernel was booted without explicit user intervention.
+
+If these options are added by the boot loader, it is highly
+recommended that they are located *first*, before the user-specified
+or configuration-specified command line. Otherwise, "init=/bin/sh"
+gets confused by the "auto" option.
+
+
+**** RUNNING THE KERNEL
+
+The kernel is started by jumping to the kernel entry point, which is
+located at *segment* offset 0x20 from the start of the real mode
+kernel. This means that if you loaded your real-mode kernel code at
+0x90000, the kernel entry point is 9020:0000.
+
+At entry, ds = es = ss should point to the start of the real-mode
+kernel code (0x9000 if the code is loaded at 0x90000), sp should be
+set up properly, normally pointing to the top of the heap, and
+interrupts should be disabled. Furthermore, to guard against bugs in
+the kernel, it is recommended that the boot loader sets fs = gs = ds =
+es = ss.
+
+In our example from above, we would do:
+
+ /* Note: in the case of the "old" kernel protocol, base_ptr must
+ be == 0x90000 at this point; see the previous sample code */
+
+ seg = base_ptr >> 4;
+
+ cli(); /* Enter with interrupts disabled! */
+
+ /* Set up the real-mode kernel stack */
+ _SS = seg;
+ _SP = heap_end;
+
+ _DS = _ES = _FS = _GS = seg;
+ jmp_far(seg+0x20, 0); /* Run the kernel */
+
+If your boot sector accesses a floppy drive, it is recommended to
+switch off the floppy motor before running the kernel, since the
+kernel boot leaves interrupts off and thus the motor will not be
+switched off, especially if the loaded kernel has the floppy driver as
+a demand-loaded module!
+
+
+**** ADVANCED BOOT LOADER HOOKS
+
+If the boot loader runs in a particularly hostile environment (such as
+LOADLIN, which runs under DOS) it may be impossible to follow the
+standard memory location requirements. Such a boot loader may use the
+following hooks that, if set, are invoked by the kernel at the
+appropriate time. The use of these hooks should probably be
+considered an absolutely last resort!
+
+IMPORTANT: All the hooks are required to preserve %esp, %ebp, %esi and
+%edi across invocation.
+
+ realmode_swtch:
+ A 16-bit real mode far subroutine invoked immediately before
+ entering protected mode. The default routine disables NMI, so
+ your routine should probably do so, too.
+
+ code32_start:
+ A 32-bit flat-mode routine *jumped* to immediately after the
+ transition to protected mode, but before the kernel is
+ uncompressed. No segments, except CS, are guaranteed to be
+ set up (current kernels do, but older ones do not); you should
+ set them up to BOOT_DS (0x18) yourself.
+
+ After completing your hook, you should jump to the address
+ that was in this field before your boot loader overwrote it
+ (relocated, if appropriate.)
+
+
+**** 32-bit BOOT PROTOCOL
+
+For machine with some new BIOS other than legacy BIOS, such as EFI,
+LinuxBIOS, etc, and kexec, the 16-bit real mode setup code in kernel
+based on legacy BIOS can not be used, so a 32-bit boot protocol needs
+to be defined.
+
+In 32-bit boot protocol, the first step in loading a Linux kernel
+should be to setup the boot parameters (struct boot_params,
+traditionally known as "zero page"). The memory for struct boot_params
+should be allocated and initialized to all zero. Then the setup header
+from offset 0x01f1 of kernel image on should be loaded into struct
+boot_params and examined. The end of setup header can be calculated as
+follow:
+
+ 0x0202 + byte value at offset 0x0201
+
+In addition to read/modify/write the setup header of the struct
+boot_params as that of 16-bit boot protocol, the boot loader should
+also fill the additional fields of the struct boot_params as that
+described in zero-page.txt.
+
+After setupping the struct boot_params, the boot loader can load the
+32/64-bit kernel in the same way as that of 16-bit boot protocol.
+
+In 32-bit boot protocol, the kernel is started by jumping to the
+32-bit kernel entry point, which is the start address of loaded
+32/64-bit kernel.
+
+At entry, the CPU must be in 32-bit protected mode with paging
+disabled; a GDT must be loaded with the descriptors for selectors
+__BOOT_CS(0x10) and __BOOT_DS(0x18); both descriptors must be 4G flat
+segment; __BOOS_CS must have execute/read permission, and __BOOT_DS
+must have read/write permission; CS must be __BOOT_CS and DS, ES, SS
+must be __BOOT_DS; interrupt must be disabled; %esi must hold the base
+address of the struct boot_params; %ebp, %edi and %ebx must be zero.
diff --git a/Documentation/x86/i386/IO-APIC.txt b/Documentation/x86/i386/IO-APIC.txt
new file mode 100644
index 0000000..30b4c71
--- /dev/null
+++ b/Documentation/x86/i386/IO-APIC.txt
@@ -0,0 +1,119 @@
+Most (all) Intel-MP compliant SMP boards have the so-called 'IO-APIC',
+which is an enhanced interrupt controller. It enables us to route
+hardware interrupts to multiple CPUs, or to CPU groups. Without an
+IO-APIC, interrupts from hardware will be delivered only to the
+CPU which boots the operating system (usually CPU#0).
+
+Linux supports all variants of compliant SMP boards, including ones with
+multiple IO-APICs. Multiple IO-APICs are used in high-end servers to
+distribute IRQ load further.
+
+There are (a few) known breakages in certain older boards, such bugs are
+usually worked around by the kernel. If your MP-compliant SMP board does
+not boot Linux, then consult the linux-smp mailing list archives first.
+
+If your box boots fine with enabled IO-APIC IRQs, then your
+/proc/interrupts will look like this one:
+
+ ---------------------------->
+ hell:~> cat /proc/interrupts
+ CPU0
+ 0: 1360293 IO-APIC-edge timer
+ 1: 4 IO-APIC-edge keyboard
+ 2: 0 XT-PIC cascade
+ 13: 1 XT-PIC fpu
+ 14: 1448 IO-APIC-edge ide0
+ 16: 28232 IO-APIC-level Intel EtherExpress Pro 10/100 Ethernet
+ 17: 51304 IO-APIC-level eth0
+ NMI: 0
+ ERR: 0
+ hell:~>
+ <----------------------------
+
+Some interrupts are still listed as 'XT PIC', but this is not a problem;
+none of those IRQ sources is performance-critical.
+
+
+In the unlikely case that your board does not create a working mp-table,
+you can use the pirq= boot parameter to 'hand-construct' IRQ entries. This
+is non-trivial though and cannot be automated. One sample /etc/lilo.conf
+entry:
+
+ append="pirq=15,11,10"
+
+The actual numbers depend on your system, on your PCI cards and on their
+PCI slot position. Usually PCI slots are 'daisy chained' before they are
+connected to the PCI chipset IRQ routing facility (the incoming PIRQ1-4
+lines):
+
+ ,-. ,-. ,-. ,-. ,-.
+ PIRQ4 ----| |-. ,-| |-. ,-| |-. ,-| |--------| |
+ |S| \ / |S| \ / |S| \ / |S| |S|
+ PIRQ3 ----|l|-. `/---|l|-. `/---|l|-. `/---|l|--------|l|
+ |o| \/ |o| \/ |o| \/ |o| |o|
+ PIRQ2 ----|t|-./`----|t|-./`----|t|-./`----|t|--------|t|
+ |1| /\ |2| /\ |3| /\ |4| |5|
+ PIRQ1 ----| |- `----| |- `----| |- `----| |--------| |
+ `-' `-' `-' `-' `-'
+
+Every PCI card emits a PCI IRQ, which can be INTA, INTB, INTC or INTD:
+
+ ,-.
+ INTD--| |
+ |S|
+ INTC--|l|
+ |o|
+ INTB--|t|
+ |x|
+ INTA--| |
+ `-'
+
+These INTA-D PCI IRQs are always 'local to the card', their real meaning
+depends on which slot they are in. If you look at the daisy chaining diagram,
+a card in slot4, issuing INTA IRQ, it will end up as a signal on PIRQ4 of
+the PCI chipset. Most cards issue INTA, this creates optimal distribution
+between the PIRQ lines. (distributing IRQ sources properly is not a
+necessity, PCI IRQs can be shared at will, but it's a good for performance
+to have non shared interrupts). Slot5 should be used for videocards, they
+do not use interrupts normally, thus they are not daisy chained either.
+
+so if you have your SCSI card (IRQ11) in Slot1, Tulip card (IRQ9) in
+Slot2, then you'll have to specify this pirq= line:
+
+ append="pirq=11,9"
+
+the following script tries to figure out such a default pirq= line from
+your PCI configuration:
+
+ echo -n pirq=; echo `scanpci | grep T_L | cut -c56-` | sed 's/ /,/g'
+
+note that this script wont work if you have skipped a few slots or if your
+board does not do default daisy-chaining. (or the IO-APIC has the PIRQ pins
+connected in some strange way). E.g. if in the above case you have your SCSI
+card (IRQ11) in Slot3, and have Slot1 empty:
+
+ append="pirq=0,9,11"
+
+[value '0' is a generic 'placeholder', reserved for empty (or non-IRQ emitting)
+slots.]
+
+Generally, it's always possible to find out the correct pirq= settings, just
+permute all IRQ numbers properly ... it will take some time though. An
+'incorrect' pirq line will cause the booting process to hang, or a device
+won't function properly (e.g. if it's inserted as a module).
+
+If you have 2 PCI buses, then you can use up to 8 pirq values, although such
+boards tend to have a good configuration.
+
+Be prepared that it might happen that you need some strange pirq line:
+
+ append="pirq=0,0,0,0,0,0,9,11"
+
+Use smart trial-and-error techniques to find out the correct pirq line ...
+
+Good luck and mail to linux-smp@vger.kernel.org or
+linux-kernel@vger.kernel.org if you have any problems that are not covered
+by this document.
+
+-- mingo
+
diff --git a/Documentation/x86/mtrr.txt b/Documentation/x86/mtrr.txt
new file mode 100644
index 0000000..cc071dc
--- /dev/null
+++ b/Documentation/x86/mtrr.txt
@@ -0,0 +1,305 @@
+MTRR (Memory Type Range Register) control
+3 Jun 1999
+Richard Gooch
+<rgooch@atnf.csiro.au>
+
+ On Intel P6 family processors (Pentium Pro, Pentium II and later)
+ the Memory Type Range Registers (MTRRs) may be used to control
+ processor access to memory ranges. This is most useful when you have
+ a video (VGA) card on a PCI or AGP bus. Enabling write-combining
+ allows bus write transfers to be combined into a larger transfer
+ before bursting over the PCI/AGP bus. This can increase performance
+ of image write operations 2.5 times or more.
+
+ The Cyrix 6x86, 6x86MX and M II processors have Address Range
+ Registers (ARRs) which provide a similar functionality to MTRRs. For
+ these, the ARRs are used to emulate the MTRRs.
+
+ The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
+ MTRRs. These are supported. The AMD Athlon family provide 8 Intel
+ style MTRRs.
+
+ The Centaur C6 (WinChip) has 8 MCRs, allowing write-combining. These
+ are supported.
+
+ The VIA Cyrix III and VIA C3 CPUs offer 8 Intel style MTRRs.
+
+ The CONFIG_MTRR option creates a /proc/mtrr file which may be used
+ to manipulate your MTRRs. Typically the X server should use
+ this. This should have a reasonably generic interface so that
+ similar control registers on other processors can be easily
+ supported.
+
+
+There are two interfaces to /proc/mtrr: one is an ASCII interface
+which allows you to read and write. The other is an ioctl()
+interface. The ASCII interface is meant for administration. The
+ioctl() interface is meant for C programs (i.e. the X server). The
+interfaces are described below, with sample commands and C code.
+
+===============================================================================
+Reading MTRRs from the shell:
+
+% cat /proc/mtrr
+reg00: base=0x00000000 ( 0MB), size= 128MB: write-back, count=1
+reg01: base=0x08000000 ( 128MB), size= 64MB: write-back, count=1
+===============================================================================
+Creating MTRRs from the C-shell:
+# echo "base=0xf8000000 size=0x400000 type=write-combining" >! /proc/mtrr
+or if you use bash:
+# echo "base=0xf8000000 size=0x400000 type=write-combining" >| /proc/mtrr
+
+And the result thereof:
+% cat /proc/mtrr
+reg00: base=0x00000000 ( 0MB), size= 128MB: write-back, count=1
+reg01: base=0x08000000 ( 128MB), size= 64MB: write-back, count=1
+reg02: base=0xf8000000 (3968MB), size= 4MB: write-combining, count=1
+
+This is for video RAM at base address 0xf8000000 and size 4 megabytes. To
+find out your base address, you need to look at the output of your X
+server, which tells you where the linear framebuffer address is. A
+typical line that you may get is:
+
+(--) S3: PCI: 968 rev 0, Linear FB @ 0xf8000000
+
+Note that you should only use the value from the X server, as it may
+move the framebuffer base address, so the only value you can trust is
+that reported by the X server.
+
+To find out the size of your framebuffer (what, you don't actually
+know?), the following line will tell you:
+
+(--) S3: videoram: 4096k
+
+That's 4 megabytes, which is 0x400000 bytes (in hexadecimal).
+A patch is being written for XFree86 which will make this automatic:
+in other words the X server will manipulate /proc/mtrr using the
+ioctl() interface, so users won't have to do anything. If you use a
+commercial X server, lobby your vendor to add support for MTRRs.
+===============================================================================
+Creating overlapping MTRRs:
+
+%echo "base=0xfb000000 size=0x1000000 type=write-combining" >/proc/mtrr
+%echo "base=0xfb000000 size=0x1000 type=uncachable" >/proc/mtrr
+
+And the results: cat /proc/mtrr
+reg00: base=0x00000000 ( 0MB), size= 64MB: write-back, count=1
+reg01: base=0xfb000000 (4016MB), size= 16MB: write-combining, count=1
+reg02: base=0xfb000000 (4016MB), size= 4kB: uncachable, count=1
+
+Some cards (especially Voodoo Graphics boards) need this 4 kB area
+excluded from the beginning of the region because it is used for
+registers.
+
+NOTE: You can only create type=uncachable region, if the first
+region that you created is type=write-combining.
+===============================================================================
+Removing MTRRs from the C-shell:
+% echo "disable=2" >! /proc/mtrr
+or using bash:
+% echo "disable=2" >| /proc/mtrr
+===============================================================================
+Reading MTRRs from a C program using ioctl()'s:
+
+/* mtrr-show.c
+
+ Source file for mtrr-show (example program to show MTRRs using ioctl()'s)
+
+ Copyright (C) 1997-1998 Richard Gooch
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+*/
+
+/*
+ This program will use an ioctl() on /proc/mtrr to show the current MTRR
+ settings. This is an alternative to reading /proc/mtrr.
+
+
+ Written by Richard Gooch 17-DEC-1997
+
+ Last updated by Richard Gooch 2-MAY-1998
+
+
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <asm/mtrr.h>
+
+#define TRUE 1
+#define FALSE 0
+#define ERRSTRING strerror (errno)
+
+static char *mtrr_strings[MTRR_NUM_TYPES] =
+{
+ "uncachable", /* 0 */
+ "write-combining", /* 1 */
+ "?", /* 2 */
+ "?", /* 3 */
+ "write-through", /* 4 */
+ "write-protect", /* 5 */
+ "write-back", /* 6 */
+};
+
+int main ()
+{
+ int fd;
+ struct mtrr_gentry gentry;
+
+ if ( ( fd = open ("/proc/mtrr", O_RDONLY, 0) ) == -1 )
+ {
+ if (errno == ENOENT)
+ {
+ fputs ("/proc/mtrr not found: not supported or you don't have a PPro?\n",
+ stderr);
+ exit (1);
+ }
+ fprintf (stderr, "Error opening /proc/mtrr\t%s\n", ERRSTRING);
+ exit (2);
+ }
+ for (gentry.regnum = 0; ioctl (fd, MTRRIOC_GET_ENTRY, &gentry) == 0;
+ ++gentry.regnum)
+ {
+ if (gentry.size < 1)
+ {
+ fprintf (stderr, "Register: %u disabled\n", gentry.regnum);
+ continue;
+ }
+ fprintf (stderr, "Register: %u base: 0x%lx size: 0x%lx type: %s\n",
+ gentry.regnum, gentry.base, gentry.size,
+ mtrr_strings[gentry.type]);
+ }
+ if (errno == EINVAL) exit (0);
+ fprintf (stderr, "Error doing ioctl(2) on /dev/mtrr\t%s\n", ERRSTRING);
+ exit (3);
+} /* End Function main */
+===============================================================================
+Creating MTRRs from a C programme using ioctl()'s:
+
+/* mtrr-add.c
+
+ Source file for mtrr-add (example programme to add an MTRRs using ioctl())
+
+ Copyright (C) 1997-1998 Richard Gooch
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+*/
+
+/*
+ This programme will use an ioctl() on /proc/mtrr to add an entry. The first
+ available mtrr is used. This is an alternative to writing /proc/mtrr.
+
+
+ Written by Richard Gooch 17-DEC-1997
+
+ Last updated by Richard Gooch 2-MAY-1998
+
+
+*/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <asm/mtrr.h>
+
+#define TRUE 1
+#define FALSE 0
+#define ERRSTRING strerror (errno)
+
+static char *mtrr_strings[MTRR_NUM_TYPES] =
+{
+ "uncachable", /* 0 */
+ "write-combining", /* 1 */
+ "?", /* 2 */
+ "?", /* 3 */
+ "write-through", /* 4 */
+ "write-protect", /* 5 */
+ "write-back", /* 6 */
+};
+
+int main (int argc, char **argv)
+{
+ int fd;
+ struct mtrr_sentry sentry;
+
+ if (argc != 4)
+ {
+ fprintf (stderr, "Usage:\tmtrr-add base size type\n");
+ exit (1);
+ }
+ sentry.base = strtoul (argv[1], NULL, 0);
+ sentry.size = strtoul (argv[2], NULL, 0);
+ for (sentry.type = 0; sentry.type < MTRR_NUM_TYPES; ++sentry.type)
+ {
+ if (strcmp (argv[3], mtrr_strings[sentry.type]) == 0) break;
+ }
+ if (sentry.type >= MTRR_NUM_TYPES)
+ {
+ fprintf (stderr, "Illegal type: \"%s\"\n", argv[3]);
+ exit (2);
+ }
+ if ( ( fd = open ("/proc/mtrr", O_WRONLY, 0) ) == -1 )
+ {
+ if (errno == ENOENT)
+ {
+ fputs ("/proc/mtrr not found: not supported or you don't have a PPro?\n",
+ stderr);
+ exit (3);
+ }
+ fprintf (stderr, "Error opening /proc/mtrr\t%s\n", ERRSTRING);
+ exit (4);
+ }
+ if (ioctl (fd, MTRRIOC_ADD_ENTRY, &sentry) == -1)
+ {
+ fprintf (stderr, "Error doing ioctl(2) on /dev/mtrr\t%s\n", ERRSTRING);
+ exit (5);
+ }
+ fprintf (stderr, "Sleeping for 5 seconds so you can see the new entry\n");
+ sleep (5);
+ close (fd);
+ fputs ("I've just closed /proc/mtrr so now the new entry should be gone\n",
+ stderr);
+} /* End Function main */
+===============================================================================
diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt
new file mode 100644
index 0000000..c93ff5f
--- /dev/null
+++ b/Documentation/x86/pat.txt
@@ -0,0 +1,136 @@
+
+PAT (Page Attribute Table)
+
+x86 Page Attribute Table (PAT) allows for setting the memory attribute at the
+page level granularity. PAT is complementary to the MTRR settings which allows
+for setting of memory types over physical address ranges. However, PAT is
+more flexible than MTRR due to its capability to set attributes at page level
+and also due to the fact that there are no hardware limitations on number of
+such attribute settings allowed. Added flexibility comes with guidelines for
+not having memory type aliasing for the same physical memory with multiple
+virtual addresses.
+
+PAT allows for different types of memory attributes. The most commonly used
+ones that will be supported at this time are Write-back, Uncached,
+Write-combined and Uncached Minus.
+
+
+PAT APIs
+--------
+
+There are many different APIs in the kernel that allows setting of memory
+attributes at the page level. In order to avoid aliasing, these interfaces
+should be used thoughtfully. Below is a table of interfaces available,
+their intended usage and their memory attribute relationships. Internally,
+these APIs use a reserve_memtype()/free_memtype() interface on the physical
+address range to avoid any aliasing.
+
+
+-------------------------------------------------------------------
+API | RAM | ACPI,... | Reserved/Holes |
+-----------------------|----------|------------|------------------|
+ | | | |
+ioremap | -- | UC- | UC- |
+ | | | |
+ioremap_cache | -- | WB | WB |
+ | | | |
+ioremap_nocache | -- | UC- | UC- |
+ | | | |
+ioremap_wc | -- | -- | WC |
+ | | | |
+set_memory_uc | UC- | -- | -- |
+ set_memory_wb | | | |
+ | | | |
+set_memory_wc | WC | -- | -- |
+ set_memory_wb | | | |
+ | | | |
+pci sysfs resource | -- | -- | UC- |
+ | | | |
+pci sysfs resource_wc | -- | -- | WC |
+ is IORESOURCE_PREFETCH| | | |
+ | | | |
+pci proc | -- | -- | UC- |
+ !PCIIOC_WRITE_COMBINE | | | |
+ | | | |
+pci proc | -- | -- | WC |
+ PCIIOC_WRITE_COMBINE | | | |
+ | | | |
+/dev/mem | -- | WB/WC/UC- | WB/WC/UC- |
+ read-write | | | |
+ | | | |
+/dev/mem | -- | UC- | UC- |
+ mmap SYNC flag | | | |
+ | | | |
+/dev/mem | -- | WB/WC/UC- | WB/WC/UC- |
+ mmap !SYNC flag | |(from exist-| (from exist- |
+ and | | ing alias)| ing alias) |
+ any alias to this area| | | |
+ | | | |
+/dev/mem | -- | WB | WB |
+ mmap !SYNC flag | | | |
+ no alias to this area | | | |
+ and | | | |
+ MTRR says WB | | | |
+ | | | |
+/dev/mem | -- | -- | UC- |
+ mmap !SYNC flag | | | |
+ no alias to this area | | | |
+ and | | | |
+ MTRR says !WB | | | |
+ | | | |
+-------------------------------------------------------------------
+
+Notes:
+
+-- in the above table mean "Not suggested usage for the API". Some of the --'s
+are strictly enforced by the kernel. Some others are not really enforced
+today, but may be enforced in future.
+
+For ioremap and pci access through /sys or /proc - The actual type returned
+can be more restrictive, in case of any existing aliasing for that address.
+For example: If there is an existing uncached mapping, a new ioremap_wc can
+return uncached mapping in place of write-combine requested.
+
+set_memory_[uc|wc] and set_memory_wb should be used in pairs, where driver will
+first make a region uc or wc and switch it back to wb after use.
+
+Over time writes to /proc/mtrr will be deprecated in favor of using PAT based
+interfaces. Users writing to /proc/mtrr are suggested to use above interfaces.
+
+Drivers should use ioremap_[uc|wc] to access PCI BARs with [uc|wc] access
+types.
+
+Drivers should use set_memory_[uc|wc] to set access type for RAM ranges.
+
+
+PAT debugging
+-------------
+
+With CONFIG_DEBUG_FS enabled, PAT memtype list can be examined by
+
+# mount -t debugfs debugfs /sys/kernel/debug
+# cat /sys/kernel/debug/x86/pat_memtype_list
+PAT memtype list:
+uncached-minus @ 0x7fadf000-0x7fae0000
+uncached-minus @ 0x7fb19000-0x7fb1a000
+uncached-minus @ 0x7fb1a000-0x7fb1b000
+uncached-minus @ 0x7fb1b000-0x7fb1c000
+uncached-minus @ 0x7fb1c000-0x7fb1d000
+uncached-minus @ 0x7fb1d000-0x7fb1e000
+uncached-minus @ 0x7fb1e000-0x7fb25000
+uncached-minus @ 0x7fb25000-0x7fb26000
+uncached-minus @ 0x7fb26000-0x7fb27000
+uncached-minus @ 0x7fb27000-0x7fb28000
+uncached-minus @ 0x7fb28000-0x7fb2e000
+uncached-minus @ 0x7fb2e000-0x7fb2f000
+uncached-minus @ 0x7fb2f000-0x7fb30000
+uncached-minus @ 0x7fb31000-0x7fb32000
+uncached-minus @ 0x80000000-0x90000000
+
+This list shows physical address ranges and various PAT settings used to
+access those physical address ranges.
+
+Another, more verbose way of getting PAT related debug messages is with
+"debugpat" boot parameter. With this parameter, various debug messages are
+printed to dmesg log.
+
diff --git a/Documentation/x86/usb-legacy-support.txt b/Documentation/x86/usb-legacy-support.txt
new file mode 100644
index 0000000..1894cdf
--- /dev/null
+++ b/Documentation/x86/usb-legacy-support.txt
@@ -0,0 +1,44 @@
+USB Legacy support
+~~~~~~~~~~~~~~~~~~
+
+Vojtech Pavlik <vojtech@suse.cz>, January 2004
+
+
+Also known as "USB Keyboard" or "USB Mouse support" in the BIOS Setup is a
+feature that allows one to use the USB mouse and keyboard as if they were
+their classic PS/2 counterparts. This means one can use an USB keyboard to
+type in LILO for example.
+
+It has several drawbacks, though:
+
+1) On some machines, the emulated PS/2 mouse takes over even when no USB
+ mouse is present and a real PS/2 mouse is present. In that case the extra
+ features (wheel, extra buttons, touchpad mode) of the real PS/2 mouse may
+ not be available.
+
+2) If CONFIG_HIGHMEM64G is enabled, the PS/2 mouse emulation can cause
+ system crashes, because the SMM BIOS is not expecting to be in PAE mode.
+ The Intel E7505 is a typical machine where this happens.
+
+3) If AMD64 64-bit mode is enabled, again system crashes often happen,
+ because the SMM BIOS isn't expecting the CPU to be in 64-bit mode. The
+ BIOS manufacturers only test with Windows, and Windows doesn't do 64-bit
+ yet.
+
+Solutions:
+
+Problem 1) can be solved by loading the USB drivers prior to loading the
+PS/2 mouse driver. Since the PS/2 mouse driver is in 2.6 compiled into
+the kernel unconditionally, this means the USB drivers need to be
+compiled-in, too.
+
+Problem 2) can currently only be solved by either disabling HIGHMEM64G
+in the kernel config or USB Legacy support in the BIOS. A BIOS update
+could help, but so far no such update exists.
+
+Problem 3) is usually fixed by a BIOS update. Check the board
+manufacturers web site. If an update is not available, disable USB
+Legacy support in the BIOS. If this alone doesn't help, try also adding
+idle=poll on the kernel command line. The BIOS may be entering the SMM
+on the HLT instruction as well.
+
diff --git a/Documentation/x86/x86_64/00-INDEX b/Documentation/x86/x86_64/00-INDEX
new file mode 100644
index 0000000..92fc20a
--- /dev/null
+++ b/Documentation/x86/x86_64/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+ - This file
+boot-options.txt
+ - AMD64-specific boot options.
+cpu-hotplug-spec
+ - Firmware support for CPU hotplug under Linux/x86-64
+fake-numa-for-cpusets
+ - Using numa=fake and CPUSets for Resource Management
+kernel-stacks
+ - Context-specific per-processor interrupt stacks.
+machinecheck
+ - Configurable sysfs parameters for the x86-64 machine check code.
+mm.txt
+ - Memory layout of x86-64 (4 level page tables, 46 bits physical).
+uefi.txt
+ - Booting Linux via Unified Extensible Firmware Interface.
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
new file mode 100644
index 0000000..f6d561a
--- /dev/null
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -0,0 +1,310 @@
+AMD64 specific boot options
+
+There are many others (usually documented in driver documentation), but
+only the AMD64 specific ones are listed here.
+
+Machine check
+
+ mce=off disable machine check
+ mce=bootlog Enable logging of machine checks left over from booting.
+ Disabled by default on AMD because some BIOS leave bogus ones.
+ If your BIOS doesn't do that it's a good idea to enable though
+ to make sure you log even machine check events that result
+ in a reboot. On Intel systems it is enabled by default.
+ mce=nobootlog
+ Disable boot machine check logging.
+ mce=tolerancelevel (number)
+ 0: always panic on uncorrected errors, log corrected errors
+ 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ 2: SIGBUS or log uncorrected errors, log corrected errors
+ 3: never panic or SIGBUS, log all errors (for testing only)
+ Default is 1
+ Can be also set using sysfs which is preferable.
+
+ nomce (for compatibility with i386): same as mce=off
+
+ Everything else is in sysfs now.
+
+APICs
+
+ apic Use IO-APIC. Default
+
+ noapic Don't use the IO-APIC.
+
+ disableapic Don't use the local APIC
+
+ nolapic Don't use the local APIC (alias for i386 compatibility)
+
+ pirq=... See Documentation/x86/i386/IO-APIC.txt
+
+ noapictimer Don't set up the APIC timer
+
+ no_timer_check Don't check the IO-APIC timer. This can work around
+ problems with incorrect timer initialization on some boards.
+
+ apicmaintimer Run time keeping from the local APIC timer instead
+ of using the PIT/HPET interrupt for this. This is useful
+ when the PIT/HPET interrupts are unreliable.
+
+ noapicmaintimer Don't do time keeping using the APIC timer.
+ Useful when this option was auto selected, but doesn't work.
+
+ apicpmtimer
+ Do APIC timer calibration using the pmtimer. Implies
+ apicmaintimer. Useful when your PIT timer is totally
+ broken.
+
+Early Console
+
+ syntax: earlyprintk=vga
+ earlyprintk=serial[,ttySn[,baudrate]]
+
+ The early console is useful when the kernel crashes before the
+ normal console is initialized. It is not enabled by
+ default because it has some cosmetic problems.
+ Append ,keep to not disable it when the real console takes over.
+ Only vga or serial at a time, not both.
+ Currently only ttyS0 and ttyS1 are supported.
+ Interaction with the standard serial driver is not very good.
+ The VGA output is eventually overwritten by the real console.
+
+Timing
+
+ notsc
+ Don't use the CPU time stamp counter to read the wall time.
+ This can be used to work around timing problems on multiprocessor systems
+ with not properly synchronized CPUs.
+
+ report_lost_ticks
+ Report when timer interrupts are lost because some code turned off
+ interrupts for too long.
+
+ nmi_watchdog=NUMBER[,panic]
+ NUMBER can be:
+ 0 don't use an NMI watchdog
+ 1 use the IO-APIC timer for the NMI watchdog
+ 2 use the local APIC for the NMI watchdog using a performance counter. Note
+ This will use one performance counter and the local APIC's performance
+ vector.
+ When panic is specified panic when an NMI watchdog timeout occurs.
+ This is useful when you use a panic=... timeout and need the box
+ quickly up again.
+
+ nohpet
+ Don't use the HPET timer.
+
+Idle loop
+
+ idle=poll
+ Don't do power saving in the idle loop using HLT, but poll for rescheduling
+ event. This will make the CPUs eat a lot more power, but may be useful
+ to get slightly better performance in multiprocessor benchmarks. It also
+ makes some profiling using performance counters more accurate.
+ Please note that on systems with MONITOR/MWAIT support (like Intel EM64T
+ CPUs) this option has no performance advantage over the normal idle loop.
+ It may also interact badly with hyperthreading.
+
+Rebooting
+
+ reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old]
+ bios Use the CPU reboot vector for warm reset
+ warm Don't set the cold reboot flag
+ cold Set the cold reboot flag
+ triple Force a triple fault (init)
+ kbd Use the keyboard controller. cold reset (default)
+ acpi Use the ACPI RESET_REG in the FADT. If ACPI is not configured or the
+ ACPI reset does not work, the reboot path attempts the reset using
+ the keyboard controller.
+ efi Use efi reset_system runtime service. If EFI is not configured or the
+ EFI reset does not work, the reboot path attempts the reset using
+ the keyboard controller.
+
+ Using warm reset will be much faster especially on big memory
+ systems because the BIOS will not go through the memory check.
+ Disadvantage is that not all hardware will be completely reinitialized
+ on reboot so there may be boot problems on some systems.
+
+ reboot=force
+
+ Don't stop other CPUs on reboot. This can make reboot more reliable
+ in some cases.
+
+Non Executable Mappings
+
+ noexec=on|off
+
+ on Enable(default)
+ off Disable
+
+SMP
+
+ additional_cpus=NUM Allow NUM more CPUs for hotplug
+ (defaults are specified by the BIOS, see Documentation/x86/x86_64/cpu-hotplug-spec)
+
+NUMA
+
+ numa=off Only set up a single NUMA node spanning all memory.
+
+ numa=noacpi Don't parse the SRAT table for NUMA setup
+
+ numa=fake=CMDLINE
+ If a number, fakes CMDLINE nodes and ignores NUMA setup of the
+ actual machine. Otherwise, system memory is configured
+ depending on the sizes and coefficients listed. For example:
+ numa=fake=2*512,1024,4*256,*128
+ gives two 512M nodes, a 1024M node, four 256M nodes, and the
+ rest split into 128M chunks. If the last character of CMDLINE
+ is a *, the remaining memory is divided up equally among its
+ coefficient:
+ numa=fake=2*512,2*
+ gives two 512M nodes and the rest split into two nodes.
+ Otherwise, the remaining system RAM is allocated to an
+ additional node.
+
+ numa=hotadd=percent
+ Only allow hotadd memory to preallocate page structures upto
+ percent of already available memory.
+ numa=hotadd=0 will disable hotadd memory.
+
+ACPI
+
+ acpi=off Don't enable ACPI
+ acpi=ht Use ACPI boot table parsing, but don't enable ACPI
+ interpreter
+ acpi=force Force ACPI on (currently not needed)
+
+ acpi=strict Disable out of spec ACPI workarounds.
+
+ acpi_sci={edge,level,high,low} Set up ACPI SCI interrupt.
+
+ acpi=noirq Don't route interrupts
+
+PCI
+
+ pci=off Don't use PCI
+ pci=conf1 Use conf1 access.
+ pci=conf2 Use conf2 access.
+ pci=rom Assign ROMs.
+ pci=assign-busses Assign busses
+ pci=irqmask=MASK Set PCI interrupt mask to MASK
+ pci=lastbus=NUMBER Scan upto NUMBER busses, no matter what the mptable says.
+ pci=noacpi Don't use ACPI to set up PCI interrupt routing.
+
+IOMMU (input/output memory management unit)
+
+ Currently four x86-64 PCI-DMA mapping implementations exist:
+
+ 1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all
+ (e.g. because you have < 3 GB memory).
+ Kernel boot message: "PCI-DMA: Disabling IOMMU"
+
+ 2. <arch/x86_64/kernel/pci-gart.c>: AMD GART based hardware IOMMU.
+ Kernel boot message: "PCI-DMA: using GART IOMMU"
+
+ 3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used
+ e.g. if there is no hardware IOMMU in the system and it is need because
+ you have >3GB memory or told the kernel to us it (iommu=soft))
+ Kernel boot message: "PCI-DMA: Using software bounce buffering
+ for IO (SWIOTLB)"
+
+ 4. <arch/x86_64/pci-calgary.c> : IBM Calgary hardware IOMMU. Used in IBM
+ pSeries and xSeries servers. This hardware IOMMU supports DMA address
+ mapping with memory protection, etc.
+ Kernel boot message: "PCI-DMA: Using Calgary IOMMU"
+
+ iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>]
+ [,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge]
+ [,noaperture][,calgary]
+
+ General iommu options:
+ off Don't initialize and use any kind of IOMMU.
+ noforce Don't force hardware IOMMU usage when it is not needed.
+ (default).
+ force Force the use of the hardware IOMMU even when it is
+ not actually needed (e.g. because < 3 GB memory).
+ soft Use software bounce buffering (SWIOTLB) (default for
+ Intel machines). This can be used to prevent the usage
+ of an available hardware IOMMU.
+
+ iommu options only relevant to the AMD GART hardware IOMMU:
+ <size> Set the size of the remapping area in bytes.
+ allowed Overwrite iommu off workarounds for specific chipsets.
+ fullflush Flush IOMMU on each allocation (default).
+ nofullflush Don't use IOMMU fullflush.
+ leak Turn on simple iommu leak tracing (only when
+ CONFIG_IOMMU_LEAK is on). Default number of leak pages
+ is 20.
+ memaper[=<order>] Allocate an own aperture over RAM with size 32MB<<order.
+ (default: order=1, i.e. 64MB)
+ merge Do scatter-gather (SG) merging. Implies "force"
+ (experimental).
+ nomerge Don't do scatter-gather (SG) merging.
+ noaperture Ask the IOMMU not to touch the aperture for AGP.
+ forcesac Force single-address cycle (SAC) mode for masks <40bits
+ (experimental).
+ noagp Don't initialize the AGP driver and use full aperture.
+ allowdac Allow double-address cycle (DAC) mode, i.e. DMA >4GB.
+ DAC is used with 32-bit PCI to push a 64-bit address in
+ two cycles. When off all DMA over >4GB is forced through
+ an IOMMU or software bounce buffering.
+ nodac Forbid DAC mode, i.e. DMA >4GB.
+ panic Always panic when IOMMU overflows.
+ calgary Use the Calgary IOMMU if it is available
+
+ iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
+ implementation:
+ swiotlb=<pages>[,force]
+ <pages> Prereserve that many 128K pages for the software IO
+ bounce buffering.
+ force Force all IO through the software TLB.
+
+ Settings for the IBM Calgary hardware IOMMU currently found in IBM
+ pSeries and xSeries machines:
+
+ calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
+ calgary=[translate_empty_slots]
+ calgary=[disable=<PCI bus number>]
+ panic Always panic when IOMMU overflows
+
+ 64k,...,8M - Set the size of each PCI slot's translation table
+ when using the Calgary IOMMU. This is the size of the translation
+ table itself in main memory. The smallest table, 64k, covers an IO
+ space of 32MB; the largest, 8MB table, can cover an IO space of
+ 4GB. Normally the kernel will make the right choice by itself.
+
+ translate_empty_slots - Enable translation even on slots that have
+ no devices attached to them, in case a device will be hotplugged
+ in the future.
+
+ disable=<PCI bus number> - Disable translation on a given PHB. For
+ example, the built-in graphics adapter resides on the first bridge
+ (PCI bus number 0); if translation (isolation) is enabled on this
+ bridge, X servers that access the hardware directly from user
+ space might stop working. Use this option if you have devices that
+ are accessed from userspace directly on some PCI host bridge.
+
+Debugging
+
+ oops=panic Always panic on oopses. Default is to just kill the process,
+ but there is a small probability of deadlocking the machine.
+ This will also cause panics on machine check exceptions.
+ Useful together with panic=30 to trigger a reboot.
+
+ kstack=N Print N words from the kernel stack in oops dumps.
+
+ pagefaulttrace Dump all page faults. Only useful for extreme debugging
+ and will create a lot of output.
+
+ call_trace=[old|both|newfallback|new]
+ old: use old inexact backtracer
+ new: use new exact dwarf2 unwinder
+ both: print entries from both
+ newfallback: use new unwinder but fall back to old if it gets
+ stuck (default)
+
+Miscellaneous
+
+ nogbpages
+ Do not use GB pages for kernel direct mappings.
+ gbpages
+ Use GB pages for kernel direct mappings.
diff --git a/Documentation/x86/x86_64/cpu-hotplug-spec b/Documentation/x86/x86_64/cpu-hotplug-spec
new file mode 100644
index 0000000..3c23e05
--- /dev/null
+++ b/Documentation/x86/x86_64/cpu-hotplug-spec
@@ -0,0 +1,21 @@
+Firmware support for CPU hotplug under Linux/x86-64
+---------------------------------------------------
+
+Linux/x86-64 supports CPU hotplug now. For various reasons Linux wants to
+know in advance of boot time the maximum number of CPUs that could be plugged
+into the system. ACPI 3.0 currently has no official way to supply
+this information from the firmware to the operating system.
+
+In ACPI each CPU needs an LAPIC object in the MADT table (5.2.11.5 in the
+ACPI 3.0 specification). ACPI already has the concept of disabled LAPIC
+objects by setting the Enabled bit in the LAPIC object to zero.
+
+For CPU hotplug Linux/x86-64 expects now that any possible future hotpluggable
+CPU is already available in the MADT. If the CPU is not available yet
+it should have its LAPIC Enabled bit set to 0. Linux will use the number
+of disabled LAPICs to compute the maximum number of future CPUs.
+
+In the worst case the user can overwrite this choice using a command line
+option (additional_cpus=...), but it is recommended to supply the correct
+number (or a reasonable approximation of it, with erring towards more not less)
+in the MADT to avoid manual configuration.
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets b/Documentation/x86/x86_64/fake-numa-for-cpusets
new file mode 100644
index 0000000..33bb566
--- /dev/null
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets
@@ -0,0 +1,66 @@
+Using numa=fake and CPUSets for Resource Management
+Written by David Rientjes <rientjes@cs.washington.edu>
+
+This document describes how the numa=fake x86_64 command-line option can be used
+in conjunction with cpusets for coarse memory management. Using this feature,
+you can create fake NUMA nodes that represent contiguous chunks of memory and
+assign them to cpusets and their attached tasks. This is a way of limiting the
+amount of system memory that are available to a certain class of tasks.
+
+For more information on the features of cpusets, see Documentation/cpusets.txt.
+There are a number of different configurations you can use for your needs. For
+more information on the numa=fake command line option and its various ways of
+configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt.
+
+For the purposes of this introduction, we'll assume a very primitive NUMA
+emulation setup of "numa=fake=4*512,". This will split our system memory into
+four equal chunks of 512M each that we can now use to assign to cpusets. As
+you become more familiar with using this combination for resource control,
+you'll determine a better setup to minimize the number of nodes you have to deal
+with.
+
+A machine may be split as follows with "numa=fake=4*512," as reported by dmesg:
+
+ Faking node 0 at 0000000000000000-0000000020000000 (512MB)
+ Faking node 1 at 0000000020000000-0000000040000000 (512MB)
+ Faking node 2 at 0000000040000000-0000000060000000 (512MB)
+ Faking node 3 at 0000000060000000-0000000080000000 (512MB)
+ ...
+ On node 0 totalpages: 130975
+ On node 1 totalpages: 131072
+ On node 2 totalpages: 131072
+ On node 3 totalpages: 131072
+
+Now following the instructions for mounting the cpusets filesystem from
+Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory
+address spaces) to individual cpusets:
+
+ [root@xroads /]# mkdir exampleset
+ [root@xroads /]# mount -t cpuset none exampleset
+ [root@xroads /]# mkdir exampleset/ddset
+ [root@xroads /]# cd exampleset/ddset
+ [root@xroads /exampleset/ddset]# echo 0-1 > cpus
+ [root@xroads /exampleset/ddset]# echo 0-1 > mems
+
+Now this cpuset, 'ddset', will only allowed access to fake nodes 0 and 1 for
+memory allocations (1G).
+
+You can now assign tasks to these cpusets to limit the memory resources
+available to them according to the fake nodes assigned as mems:
+
+ [root@xroads /exampleset/ddset]# echo $$ > tasks
+ [root@xroads /exampleset/ddset]# dd if=/dev/zero of=tmp bs=1024 count=1G
+ [1] 13425
+
+Notice the difference between the system memory usage as reported by
+/proc/meminfo between the restricted cpuset case above and the unrestricted
+case (i.e. running the same 'dd' command without assigning it to a fake NUMA
+cpuset):
+ Unrestricted Restricted
+ MemTotal: 3091900 kB 3091900 kB
+ MemFree: 42113 kB 1513236 kB
+
+This allows for coarse memory management for the tasks you assign to particular
+cpusets. Since cpusets can form a hierarchy, you can create some pretty
+interesting combinations of use-cases for various classes of tasks for your
+memory management needs.
diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks
new file mode 100644
index 0000000..5ad65d5
--- /dev/null
+++ b/Documentation/x86/x86_64/kernel-stacks
@@ -0,0 +1,99 @@
+Most of the text from Keith Owens, hacked by AK
+
+x86_64 page size (PAGE_SIZE) is 4K.
+
+Like all other architectures, x86_64 has a kernel stack for every
+active thread. These thread stacks are THREAD_SIZE (2*PAGE_SIZE) big.
+These stacks contain useful data as long as a thread is alive or a
+zombie. While the thread is in user space the kernel stack is empty
+except for the thread_info structure at the bottom.
+
+In addition to the per thread stacks, there are specialized stacks
+associated with each CPU. These stacks are only used while the kernel
+is in control on that CPU; when a CPU returns to user space the
+specialized stacks contain no useful data. The main CPU stacks are:
+
+* Interrupt stack. IRQSTACKSIZE
+
+ Used for external hardware interrupts. If this is the first external
+ hardware interrupt (i.e. not a nested hardware interrupt) then the
+ kernel switches from the current task to the interrupt stack. Like
+ the split thread and interrupt stacks on i386 (with CONFIG_4KSTACKS),
+ this gives more room for kernel interrupt processing without having
+ to increase the size of every per thread stack.
+
+ The interrupt stack is also used when processing a softirq.
+
+Switching to the kernel interrupt stack is done by software based on a
+per CPU interrupt nest counter. This is needed because x86-64 "IST"
+hardware stacks cannot nest without races.
+
+x86_64 also has a feature which is not available on i386, the ability
+to automatically switch to a new stack for designated events such as
+double fault or NMI, which makes it easier to handle these unusual
+events on x86_64. This feature is called the Interrupt Stack Table
+(IST). There can be up to 7 IST entries per CPU. The IST code is an
+index into the Task State Segment (TSS). The IST entries in the TSS
+point to dedicated stacks; each stack can be a different size.
+
+An IST is selected by a non-zero value in the IST field of an
+interrupt-gate descriptor. When an interrupt occurs and the hardware
+loads such a descriptor, the hardware automatically sets the new stack
+pointer based on the IST value, then invokes the interrupt handler. If
+software wants to allow nested IST interrupts then the handler must
+adjust the IST values on entry to and exit from the interrupt handler.
+(This is occasionally done, e.g. for debug exceptions.)
+
+Events with different IST codes (i.e. with different stacks) can be
+nested. For example, a debug interrupt can safely be interrupted by an
+NMI. arch/x86_64/kernel/entry.S::paranoidentry adjusts the stack
+pointers on entry to and exit from all IST events, in theory allowing
+IST events with the same code to be nested. However in most cases, the
+stack size allocated to an IST assumes no nesting for the same code.
+If that assumption is ever broken then the stacks will become corrupt.
+
+The currently assigned IST stacks are :-
+
+* STACKFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE).
+
+ Used for interrupt 12 - Stack Fault Exception (#SS).
+
+ This allows the CPU to recover from invalid stack segments. Rarely
+ happens.
+
+* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE).
+
+ Used for interrupt 8 - Double Fault Exception (#DF).
+
+ Invoked when handling one exception causes another exception. Happens
+ when the kernel is very confused (e.g. kernel stack pointer corrupt).
+ Using a separate stack allows the kernel to recover from it well enough
+ in many cases to still output an oops.
+
+* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE).
+
+ Used for non-maskable interrupts (NMI).
+
+ NMI can be delivered at any time, including when the kernel is in the
+ middle of switching stacks. Using IST for NMI events avoids making
+ assumptions about the previous state of the kernel stack.
+
+* DEBUG_STACK. DEBUG_STKSZ
+
+ Used for hardware debug interrupts (interrupt 1) and for software
+ debug interrupts (INT3).
+
+ When debugging a kernel, debug interrupts (both hardware and
+ software) can occur at any time. Using IST for these interrupts
+ avoids making assumptions about the previous state of the kernel
+ stack.
+
+* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE).
+
+ Used for interrupt 18 - Machine Check Exception (#MC).
+
+ MCE can be delivered at any time, including when the kernel is in the
+ middle of switching stacks. Using IST for MCE events avoids making
+ assumptions about the previous state of the kernel stack.
+
+For more details see the Intel IA32 or AMD AMD64 architecture manuals.
diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck
new file mode 100644
index 0000000..a05e58e
--- /dev/null
+++ b/Documentation/x86/x86_64/machinecheck
@@ -0,0 +1,77 @@
+
+Configurable sysfs parameters for the x86-64 machine check code.
+
+Machine checks report internal hardware error conditions detected
+by the CPU. Uncorrected errors typically cause a machine check
+(often with panic), corrected ones cause a machine check log entry.
+
+Machine checks are organized in banks (normally associated with
+a hardware subsystem) and subevents in a bank. The exact meaning
+of the banks and subevent is CPU specific.
+
+mcelog knows how to decode them.
+
+When you see the "Machine check errors logged" message in the system
+log then mcelog should run to collect and decode machine check entries
+from /dev/mcelog. Normally mcelog should be run regularly from a cronjob.
+
+Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN
+(N = CPU number)
+
+The directory contains some configurable entries:
+
+Entries:
+
+bankNctl
+(N bank number)
+ 64bit Hex bitmask enabling/disabling specific subevents for bank N
+ When a bit in the bitmask is zero then the respective
+ subevent will not be reported.
+ By default all events are enabled.
+ Note that BIOS maintain another mask to disable specific events
+ per bank. This is not visible here
+
+The following entries appear for each CPU, but they are truly shared
+between all CPUs.
+
+check_interval
+ How often to poll for corrected machine check errors, in seconds
+ (Note output is hexademical). Default 5 minutes. When the poller
+ finds MCEs it triggers an exponential speedup (poll more often) on
+ the polling interval. When the poller stops finding MCEs, it
+ triggers an exponential backoff (poll less often) on the polling
+ interval. The check_interval variable is both the initial and
+ maximum polling interval.
+
+tolerant
+ Tolerance level. When a machine check exception occurs for a non
+ corrected machine check the kernel can take different actions.
+ Since machine check exceptions can happen any time it is sometimes
+ risky for the kernel to kill a process because it defies
+ normal kernel locking rules. The tolerance level configures
+ how hard the kernel tries to recover even at some risk of
+ deadlock. Higher tolerant values trade potentially better uptime
+ with the risk of a crash or even corruption (for tolerant >= 3).
+
+ 0: always panic on uncorrected errors, log corrected errors
+ 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ 2: SIGBUS or log uncorrected errors, log corrected errors
+ 3: never panic or SIGBUS, log all errors (for testing only)
+
+ Default: 1
+
+ Note this only makes a difference if the CPU allows recovery
+ from a machine check exception. Current x86 CPUs generally do not.
+
+trigger
+ Program to run when a machine check event is detected.
+ This is an alternative to running mcelog regularly from cron
+ and allows to detect events faster.
+
+TBD document entries for AMD threshold interrupt configuration
+
+For more details about the x86 machine check architecture
+see the Intel and AMD architecture manuals from their developer websites.
+
+For more details about the architecture see
+see http://one.firstfloor.org/~andi/mce.pdf
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
new file mode 100644
index 0000000..efce750
--- /dev/null
+++ b/Documentation/x86/x86_64/mm.txt
@@ -0,0 +1,28 @@
+
+<previous description obsolete, deleted>
+
+Virtual memory map with 4 level page tables:
+
+0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
+hole caused by [48:63] sign extension
+ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
+ffff810000000000 - ffffc0ffffffffff (=46 bits) direct mapping of all phys. memory
+ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
+ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
+ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
+... unused hole ...
+ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
+ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
+
+The direct mapping covers all memory in the system up to the highest
+memory address (this means in some cases it can also include PCI memory
+holes).
+
+vmalloc space is lazily synchronized into the different PML4 pages of
+the processes using the page fault handler, with init_level4_pgt as
+reference.
+
+Current X86-64 implementations only support 40 bits of address space,
+but we support up to 46 bits. This expands into MBZ space in the page tables.
+
+-Andi Kleen, Jul 2004
diff --git a/Documentation/x86/x86_64/uefi.txt b/Documentation/x86/x86_64/uefi.txt
new file mode 100644
index 0000000..a5e2b4f
--- /dev/null
+++ b/Documentation/x86/x86_64/uefi.txt
@@ -0,0 +1,42 @@
+General note on [U]EFI x86_64 support
+-------------------------------------
+
+The nomenclature EFI and UEFI are used interchangeably in this document.
+
+Although the tools below are _not_ needed for building the kernel,
+the needed bootloader support and associated tools for x86_64 platforms
+with EFI firmware and specifications are listed below.
+
+1. UEFI specification: http://www.uefi.org
+
+2. Booting Linux kernel on UEFI x86_64 platform requires bootloader
+ support. Elilo with x86_64 support can be used.
+
+3. x86_64 platform with EFI/UEFI firmware.
+
+Mechanics:
+---------
+- Build the kernel with the following configuration.
+ CONFIG_FB_EFI=y
+ CONFIG_FRAMEBUFFER_CONSOLE=y
+ If EFI runtime services are expected, the following configuration should
+ be selected.
+ CONFIG_EFI=y
+ CONFIG_EFI_VARS=y or m # optional
+- Create a VFAT partition on the disk
+- Copy the following to the VFAT partition:
+ elilo bootloader with x86_64 support, elilo configuration file,
+ kernel image built in first step and corresponding
+ initrd. Instructions on building elilo and its dependencies
+ can be found in the elilo sourceforge project.
+- Boot to EFI shell and invoke elilo choosing the kernel image built
+ in first step.
+- If some or all EFI runtime services don't work, you can try following
+ kernel command line parameters to turn off some or all EFI runtime
+ services.
+ noefi turn off all EFI runtime services
+ reboot_type=k turn off EFI reboot runtime service
+- If the EFI memory map has additional entries not in the E820 map,
+ you can include those entries in the kernels memory map of available
+ physical RAM by using the following kernel command line parameter.
+ add_efi_memmap include EFI memory map of available physical RAM
diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
new file mode 100644
index 0000000..169ad42
--- /dev/null
+++ b/Documentation/x86/zero-page.txt
@@ -0,0 +1,31 @@
+The additional fields in struct boot_params as a part of 32-bit boot
+protocol of kernel. These should be filled by bootloader or 16-bit
+real-mode setup code of the kernel. References/settings to it mainly
+are in:
+
+ include/asm-x86/bootparam.h
+
+
+Offset Proto Name Meaning
+/Size
+
+000/040 ALL screen_info Text mode or frame buffer information
+ (struct screen_info)
+040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info)
+060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information
+ (struct ist_info)
+080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!!
+090/010 ALL hd1_info hd1 disk parameter, OBSOLETE!!
+0A0/010 ALL sys_desc_table System description table (struct sys_desc_table)
+140/080 ALL edid_info Video mode setup (struct edid_info)
+1C0/020 ALL efi_info EFI 32 information (struct efi_info)
+1E0/004 ALL alk_mem_k Alternative mem check, in KB
+1E4/004 ALL scratch Scratch field for the kernel setup code
+1E8/001 ALL e820_entries Number of entries in e820_map (below)
+1E9/001 ALL eddbuf_entries Number of entries in eddbuf (below)
+1EA/001 ALL edd_mbr_sig_buf_entries Number of entries in edd_mbr_sig_buffer
+ (below)
+290/040 ALL edd_mbr_sig_buffer EDD MBR signatures
+2D0/A00 ALL e820_map E820 memory map table
+ (array of struct e820entry)
+D00/1EC ALL eddbuf EDD data (array of struct edd_info)
diff --git a/Documentation/zh_CN/CodingStyle b/Documentation/zh_CN/CodingStyle
new file mode 100644
index 0000000..ecd9307
--- /dev/null
+++ b/Documentation/zh_CN/CodingStyle
@@ -0,0 +1,701 @@
+Chinese translated version of Documentation/CodingStyle
+
+If you have any comment or update to the content, please post to LKML directly.
+However, if you have problem communicating in English you can also ask the
+Chinese maintainer for help. Contact the Chinese maintainer, if this
+translation is outdated or there is problem with translation.
+
+Chinese maintainer: Zhang Le <r0bertz@gentoo.org>
+---------------------------------------------------------------------
+Documentation/CodingStyle的中文翻译
+
+如果想评论或更新本文的内容,请直接发信到LKML。如果你使用英文交流有困难的话,也可
+以向中文版维护者求助。如果本翻译更新不及时或者翻译存在问题,请联系中文版维护者。
+
+中文版维护者: 张乐 Zhang Le <r0bertz@gentoo.org>
+中文版翻译者: 张乐 Zhang Le <r0bertz@gentoo.org>
+中文版校译者: 王聪 Wang Cong <xiyou.wangcong@gmail.com>
+ wheelz <kernel.zeng@gmail.com>
+ 管旭东 Xudong Guan <xudong.guan@gmail.com>
+ Li Zefan <lizf@cn.fujitsu.com>
+ Wang Chen <wangchen@cn.fujitsu.com>
+以下为正文
+---------------------------------------------------------------------
+
+ Linux内核代码风格
+
+这是一个简短的文档,描述了linux内核的首选代码风格。代码风格是因人而异的,而且我
+不愿意把我的观点强加给任何人,不过这里所讲述的是我必须要维护的代码所遵守的风格,
+并且我也希望绝大多数其他代码也能遵守这个风格。请在写代码时至少考虑一下本文所述的
+风格。
+
+首先,我建议你打印一份GNU代码规范,然后不要读它。烧了它,这是一个具有重大象征性
+意义的动作。
+
+不管怎样,现在我们开始:
+
+
+ 第一章:缩进
+
+制表符是8个字符,所以缩进也是8个字符。有些异端运动试图将缩进变为4(乃至2)个字符
+深,这几乎相当于尝试将圆周率的值定义为3。
+
+理由:缩进的全部意义就在于清楚的定义一个控制块起止于何处。尤其是当你盯着你的屏幕
+连续看了20小时之后,你将会发现大一点的缩进会使你更容易分辨缩进。
+
+现在,有些人会抱怨8个字符的缩进会使代码向右边移动的太远,在80个字符的终端屏幕上
+就很难读这样的代码。这个问题的答案是,如果你需要3级以上的缩进,不管用何种方式你
+的代码已经有问题了,应该修正你的程序。
+
+简而言之,8个字符的缩进可以让代码更容易阅读,还有一个好处是当你的函数嵌套太深的
+时候可以给你警告。留心这个警告。
+
+在switch语句中消除多级缩进的首选的方式是让“switch”和从属于它的“case”标签对齐于同
+一列,而不要“两次缩进”“case”标签。比如:
+
+ switch (suffix) {
+ case 'G':
+ case 'g':
+ mem <<= 30;
+ break;
+ case 'M':
+ case 'm':
+ mem <<= 20;
+ break;
+ case 'K':
+ case 'k':
+ mem <<= 10;
+ /* fall through */
+ default:
+ break;
+ }
+
+
+不要把多个语句放在一行里,除非你有什么东西要隐藏:
+
+ if (condition) do_this;
+ do_something_everytime;
+
+也不要在一行里放多个赋值语句。内核代码风格超级简单。就是避免可能导致别人误读的表
+达式。
+
+除了注释、文档和Kconfig之外,不要使用空格来缩进,前面的例子是例外,是有意为之。
+
+选用一个好的编辑器,不要在行尾留空格。
+
+
+ 第二章:把长的行和字符串打散
+
+代码风格的意义就在于使用平常使用的工具来维持代码的可读性和可维护性。
+
+每一行的长度的限制是80列,我们强烈建议您遵守这个惯例。
+
+长于80列的语句要打散成有意义的片段。每个片段要明显短于原来的语句,而且放置的位置
+也明显的靠右。同样的规则也适用于有很长参数列表的函数头。长字符串也要打散成较短的
+字符串。唯一的例外是超过80列可以大幅度提高可读性并且不会隐藏信息的情况。
+
+void fun(int a, int b, int c)
+{
+ if (condition)
+ printk(KERN_WARNING "Warning this is a long printk with "
+ "3 parameters a: %u b: %u "
+ "c: %u \n", a, b, c);
+ else
+ next_statement;
+}
+
+ 第三章:大括号和空格的放置
+
+C语言风格中另外一个常见问题是大括号的放置。和缩进大小不同,选择或弃用某种放置策
+略并没有多少技术上的原因,不过首选的方式,就像Kernighan和Ritchie展示给我们的,是
+把起始大括号放在行尾,而把结束大括号放在行首,所以:
+
+ if (x is true) {
+ we do y
+ }
+
+这适用于所有的非函数语句块(if、switch、for、while、do)。比如:
+
+ switch (action) {
+ case KOBJ_ADD:
+ return "add";
+ case KOBJ_REMOVE:
+ return "remove";
+ case KOBJ_CHANGE:
+ return "change";
+ default:
+ return NULL;
+ }
+
+不过,有一个例外,那就是函数:函数的起始大括号放置于下一行的开头,所以:
+
+ int function(int x)
+ {
+ body of function
+ }
+
+全世界的异端可能会抱怨这个不一致性是……呃……不一致的,不过所有思维健全的人都知道(
+a)K&R是_正确的_,并且(b)K&R是正确的。此外,不管怎样函数都是特殊的(在C语言中
+,函数是不能嵌套的)。
+
+注意结束大括号独自占据一行,除非它后面跟着同一个语句的剩余部分,也就是do语句中的
+“while”或者if语句中的“else”,像这样:
+
+ do {
+ body of do-loop
+ } while (condition);
+
+和
+
+ if (x == y) {
+ ..
+ } else if (x > y) {
+ ...
+ } else {
+ ....
+ }
+
+理由:K&R。
+
+也请注意这种大括号的放置方式也能使空(或者差不多空的)行的数量最小化,同时不失可
+读性。因此,由于你的屏幕上的新行是不可再生资源(想想25行的终端屏幕),你将会有更
+多的空行来放置注释。
+
+当只有一个单独的语句的时候,不用加不必要的大括号。
+
+if (condition)
+ action();
+
+这点不适用于本身为某个条件语句的一个分支的单独语句。这时需要在两个分支里都使用大
+括号。
+
+if (condition) {
+ do_this();
+ do_that();
+} else {
+ otherwise();
+}
+
+ 3.1:空格
+
+Linux内核的空格使用方式(主要)取决于它是用于函数还是关键字。(大多数)关键字后
+要加一个空格。值得注意的例外是sizeof、typeof、alignof和__attribute__,这些关键字
+某些程度上看起来更像函数(它们在Linux里也常常伴随小括号而使用,尽管在C语言里这样
+的小括号不是必需的,就像“struct fileinfo info”声明过后的“sizeof info”)。
+
+所以在这些关键字之后放一个空格:
+ if, switch, case, for, do, while
+但是不要在sizeof、typeof、alignof或者__attribute__这些关键字之后放空格。例如,
+ s = sizeof(struct file);
+
+不要在小括号里的表达式两侧加空格。这是一个反例:
+
+ s = sizeof( struct file );
+
+当声明指针类型或者返回指针类型的函数时,“*”的首选使用方式是使之靠近变量名或者函
+数名,而不是靠近类型名。例子:
+
+ char *linux_banner;
+ unsigned long long memparse(char *ptr, char **retptr);
+ char *match_strdup(substring_t *s);
+
+在大多数二元和三元操作符两侧使用一个空格,例如下面所有这些操作符:
+
+ = + - < > * / % | & ^ <= >= == != ? :
+
+但是一元操作符后不要加空格:
+ & * + - ~ ! sizeof typeof alignof __attribute__ defined
+
+后缀自加和自减一元操作符前不加空格:
+ ++ --
+
+前缀自加和自减一元操作符后不加空格:
+ ++ --
+
+“.”和“->”结构体成员操作符前后不加空格。
+
+不要在行尾留空白。有些可以自动缩进的编辑器会在新行的行首加入适量的空白,然后你
+就可以直接在那一行输入代码。不过假如你最后没有在那一行输入代码,有些编辑器就不
+会移除已经加入的空白,就像你故意留下一个只有空白的行。包含行尾空白的行就这样产
+生了。
+
+当git发现补丁包含了行尾空白的时候会警告你,并且可以应你的要求去掉行尾空白;不过
+如果你是正在打一系列补丁,这样做会导致后面的补丁失败,因为你改变了补丁的上下文。
+
+
+ 第四章:命名
+
+C是一个简朴的语言,你的命名也应该这样。和Modula-2和Pascal程序员不同,C程序员不使
+用类似ThisVariableIsATemporaryCounter这样华丽的名字。C程序员会称那个变量为“tmp”
+,这样写起来会更容易,而且至少不会令其难于理解。
+
+不过,虽然混用大小写的名字是不提倡使用的,但是全局变量还是需要一个具描述性的名字
+。称一个全局函数为“foo”是一个难以饶恕的错误。
+
+全局变量(只有当你真正需要它们的时候再用它)需要有一个具描述性的名字,就像全局函
+数。如果你有一个可以计算活动用户数量的函数,你应该叫它“count_active_users()”或者
+类似的名字,你不应该叫它“cntuser()”。
+
+在函数名中包含函数类型(所谓的匈牙利命名法)是脑子出了问题——编译器知道那些类型而
+且能够检查那些类型,这样做只能把程序员弄糊涂了。难怪微软总是制造出有问题的程序。
+
+本地变量名应该简短,而且能够表达相关的含义。如果你有一些随机的整数型的循环计数器
+,它应该被称为“i”。叫它“loop_counter”并无益处,如果它没有被误解的可能的话。类似
+的,“tmp”可以用来称呼任意类型的临时变量。
+
+如果你怕混淆了你的本地变量名,你就遇到另一个问题了,叫做函数增长荷尔蒙失衡综合症
+。请看第六章(函数)。
+
+
+ 第五章:Typedef
+
+不要使用类似“vps_t”之类的东西。
+
+对结构体和指针使用typedef是一个错误。当你在代码里看到:
+
+ vps_t a;
+
+这代表什么意思呢?
+
+相反,如果是这样
+
+ struct virtual_container *a;
+
+你就知道“a”是什么了。
+
+很多人认为typedef“能提高可读性”。实际不是这样的。它们只在下列情况下有用:
+
+ (a) 完全不透明的对象(这种情况下要主动使用typedef来隐藏这个对象实际上是什么)。
+
+ 例如:“pte_t”等不透明对象,你只能用合适的访问函数来访问它们。
+
+ 注意!不透明性和“访问函数”本身是不好的。我们使用pte_t等类型的原因在于真的是
+ 完全没有任何共用的可访问信息。
+
+ (b) 清楚的整数类型,如此,这层抽象就可以帮助消除到底是“int”还是“long”的混淆。
+
+ u8/u16/u32是完全没有问题的typedef,不过它们更符合类别(d)而不是这里。
+
+ 再次注意!要这样做,必须事出有因。如果某个变量是“unsigned long“,那么没有必要
+
+ typedef unsigned long myflags_t;
+
+ 不过如果有一个明确的原因,比如它在某种情况下可能会是一个“unsigned int”而在
+ 其他情况下可能为“unsigned long”,那么就不要犹豫,请务必使用typedef。
+
+ (c) 当你使用sparse按字面的创建一个新类型来做类型检查的时候。
+
+ (d) 和标准C99类型相同的类型,在某些例外的情况下。
+
+ 虽然让眼睛和脑筋来适应新的标准类型比如“uint32_t”不需要花很多时间,可是有些
+ 人仍然拒绝使用它们。
+
+ 因此,Linux特有的等同于标准类型的“u8/u16/u32/u64”类型和它们的有符号类型是被
+ 允许的——尽管在你自己的新代码中,它们不是强制要求要使用的。
+
+ 当编辑已经使用了某个类型集的已有代码时,你应该遵循那些代码中已经做出的选择。
+
+ (e) 可以在用户空间安全使用的类型。
+
+ 在某些用户空间可见的结构体里,我们不能要求C99类型而且不能用上面提到的“u32”
+ 类型。因此,我们在与用户空间共享的所有结构体中使用__u32和类似的类型。
+
+可能还有其他的情况,不过基本的规则是永远不要使用typedef,除非你可以明确的应用上
+述某个规则中的一个。
+
+总的来说,如果一个指针或者一个结构体里的元素可以合理的被直接访问到,那么它们就不
+应该是一个typedef。
+
+
+ 第六章:函数
+
+函数应该简短而漂亮,并且只完成一件事情。函数应该可以一屏或者两屏显示完(我们都知
+道ISO/ANSI屏幕大小是80x24),只做一件事情,而且把它做好。
+
+一个函数的最大长度是和该函数的复杂度和缩进级数成反比的。所以,如果你有一个理论上
+很简单的只有一个很长(但是简单)的case语句的函数,而且你需要在每个case里做很多很
+小的事情,这样的函数尽管很长,但也是可以的。
+
+不过,如果你有一个复杂的函数,而且你怀疑一个天分不是很高的高中一年级学生可能甚至
+搞不清楚这个函数的目的,你应该严格的遵守前面提到的长度限制。使用辅助函数,并为之
+取个具描述性的名字(如果你觉得它们的性能很重要的话,可以让编译器内联它们,这样的
+效果往往会比你写一个复杂函数的效果要好。)
+
+函数的另外一个衡量标准是本地变量的数量。此数量不应超过5-10个,否则你的函数就有
+问题了。重新考虑一下你的函数,把它分拆成更小的函数。人的大脑一般可以轻松的同时跟
+踪7个不同的事物,如果再增多的话,就会糊涂了。即便你聪颖过人,你也可能会记不清你2
+个星期前做过的事情。
+
+在源文件里,使用空行隔开不同的函数。如果该函数需要被导出,它的EXPORT*宏应该紧贴
+在它的结束大括号之下。比如:
+
+int system_is_up(void)
+{
+ return system_state == SYSTEM_RUNNING;
+}
+EXPORT_SYMBOL(system_is_up);
+
+在函数原型中,包含函数名和它们的数据类型。虽然C语言里没有这样的要求,在Linux里这
+是提倡的做法,因为这样可以很简单的给读者提供更多的有价值的信息。
+
+
+ 第七章:集中的函数退出途径
+
+虽然被某些人声称已经过时,但是goto语句的等价物还是经常被编译器所使用,具体形式是
+无条件跳转指令。
+
+当一个函数从多个位置退出并且需要做一些通用的清理工作的时候,goto的好处就显现出来
+了。
+
+理由是:
+
+- 无条件语句容易理解和跟踪
+- 嵌套程度减小
+- 可以避免由于修改时忘记更新某个单独的退出点而导致的错误
+- 减轻了编译器的工作,无需删除冗余代码;)
+
+int fun(int a)
+{
+ int result = 0;
+ char *buffer = kmalloc(SIZE);
+
+ if (buffer == NULL)
+ return -ENOMEM;
+
+ if (condition1) {
+ while (loop1) {
+ ...
+ }
+ result = 1;
+ goto out;
+ }
+ ...
+out:
+ kfree(buffer);
+ return result;
+}
+
+ 第八章:注释
+
+注释是好的,不过有过度注释的危险。永远不要在注释里解释你的代码是如何运作的:更好
+的做法是让别人一看你的代码就可以明白,解释写的很差的代码是浪费时间。
+
+一般的,你想要你的注释告诉别人你的代码做了什么,而不是怎么做的。也请你不要把注释
+放在一个函数体内部:如果函数复杂到你需要独立的注释其中的一部分,你很可能需要回到
+第六章看一看。你可以做一些小注释来注明或警告某些很聪明(或者槽糕)的做法,但不要
+加太多。你应该做的,是把注释放在函数的头部,告诉人们它做了什么,也可以加上它做这
+些事情的原因。
+
+当注释内核API函数时,请使用kernel-doc格式。请看
+Documentation/kernel-doc-nano-HOWTO.txt和scripts/kernel-doc以获得详细信息。
+
+Linux的注释风格是C89“/* ... */”风格。不要使用C99风格“// ...”注释。
+
+长(多行)的首选注释风格是:
+
+ /*
+ * This is the preferred style for multi-line
+ * comments in the Linux kernel source code.
+ * Please use it consistently.
+ *
+ * Description: A column of asterisks on the left side,
+ * with beginning and ending almost-blank lines.
+ */
+
+注释数据也是很重要的,不管是基本类型还是衍生类型。为了方便实现这一点,每一行应只
+声明一个数据(不要使用逗号来一次声明多个数据)。这样你就有空间来为每个数据写一段
+小注释来解释它们的用途了。
+
+
+ 第九章:你已经把事情弄糟了
+
+这没什么,我们都是这样。可能你的使用了很长时间Unix的朋友已经告诉你“GNU emacs”能
+自动帮你格式化C源代码,而且你也注意到了,确实是这样,不过它所使用的默认值和我们
+想要的相去甚远(实际上,甚至比随机打的还要差——无数个猴子在GNU emacs里打字永远不
+会创造出一个好程序)(译注:请参考Infinite Monkey Theorem)
+
+所以你要么放弃GNU emacs,要么改变它让它使用更合理的设定。要采用后一个方案,你可
+以把下面这段粘贴到你的.emacs文件里。
+
+(defun linux-c-mode ()
+ "C mode with adjusted defaults for use with the Linux kernel."
+ (interactive)
+ (c-mode)
+ (c-set-style "K&R")
+ (setq tab-width 8)
+ (setq indent-tabs-mode t)
+ (setq c-basic-offset 8))
+
+这样就定义了M-x linux-c-mode命令。当你hack一个模块的时候,如果你把字符串
+-*- linux-c -*-放在头两行的某个位置,这个模式将会被自动调用。如果你希望在你修改
+/usr/src/linux里的文件时魔术般自动打开linux-c-mode的话,你也可能需要添加
+
+(setq auto-mode-alist (cons '("/usr/src/linux.*/.*\\.[ch]$" . linux-c-mode)
+ auto-mode-alist))
+
+到你的.emacs文件里。
+
+不过就算你尝试让emacs正确的格式化代码失败了,也并不意味着你失去了一切:还可以用“
+indent”。
+
+不过,GNU indent也有和GNU emacs一样有问题的设定,所以你需要给它一些命令选项。不
+过,这还不算太糟糕,因为就算是GNU indent的作者也认同K&R的权威性(GNU的人并不是坏
+人,他们只是在这个问题上被严重的误导了),所以你只要给indent指定选项“-kr -i8”
+(代表“K&R,8个字符缩进”),或者使用“scripts/Lindent”,这样就可以以最时髦的方式
+缩进源代码。
+
+“indent”有很多选项,特别是重新格式化注释的时候,你可能需要看一下它的手册页。不过
+记住:“indent”不能修正坏的编程习惯。
+
+
+ 第十章:Kconfig配置文件
+
+对于遍布源码树的所有Kconfig*配置文件来说,它们缩进方式与C代码相比有所不同。紧挨
+在“config”定义下面的行缩进一个制表符,帮助信息则再多缩进2个空格。比如:
+
+config AUDIT
+ bool "Auditing support"
+ depends on NET
+ help
+ Enable auditing infrastructure that can be used with another
+ kernel subsystem, such as SELinux (which requires this for
+ logging of avc messages output). Does not do system-call
+ auditing without CONFIG_AUDITSYSCALL.
+
+仍然被认为不够稳定的功能应该被定义为依赖于“EXPERIMENTAL”:
+
+config SLUB
+ depends on EXPERIMENTAL && !ARCH_USES_SLAB_PAGE_STRUCT
+ bool "SLUB (Unqueued Allocator)"
+ ...
+
+而那些危险的功能(比如某些文件系统的写支持)应该在它们的提示字符串里显著的声明这
+一点:
+
+config ADFS_FS_RW
+ bool "ADFS write support (DANGEROUS)"
+ depends on ADFS_FS
+ ...
+
+要查看配置文件的完整文档,请看Documentation/kbuild/kconfig-language.txt。
+
+
+ 第十一章:数据结构
+
+如果一个数据结构,在创建和销毁它的单线执行环境之外可见,那么它必须要有一个引用计
+数器。内核里没有垃圾收集(并且内核之外的垃圾收集慢且效率低下),这意味着你绝对需
+要记录你对这种数据结构的使用情况。
+
+引用计数意味着你能够避免上锁,并且允许多个用户并行访问这个数据结构——而不需要担心
+这个数据结构仅仅因为暂时不被使用就消失了,那些用户可能不过是沉睡了一阵或者做了一
+些其他事情而已。
+
+注意上锁不能取代引用计数。上锁是为了保持数据结构的一致性,而引用计数是一个内存管
+理技巧。通常二者都需要,不要把两个搞混了。
+
+很多数据结构实际上有2级引用计数,它们通常有不同“类”的用户。子类计数器统计子类用
+户的数量,每当子类计数器减至零时,全局计数器减一。
+
+这种“多级引用计数”的例子可以在内存管理(“struct mm_struct”:mm_users和mm_count)
+和文件系统(“struct super_block”:s_count和s_active)中找到。
+
+记住:如果另一个执行线索可以找到你的数据结构,但是这个数据结构没有引用计数器,这
+里几乎肯定是一个bug。
+
+
+ 第十二章:宏,枚举和RTL
+
+用于定义常量的宏的名字及枚举里的标签需要大写。
+
+#define CONSTANT 0x12345
+
+在定义几个相关的常量时,最好用枚举。
+
+宏的名字请用大写字母,不过形如函数的宏的名字可以用小写字母。
+
+一般的,如果能写成内联函数就不要写成像函数的宏。
+
+含有多个语句的宏应该被包含在一个do-while代码块里:
+
+#define macrofun(a, b, c) \
+ do { \
+ if (a == 5) \
+ do_this(b, c); \
+ } while (0)
+
+使用宏的时候应避免的事情:
+
+1) 影响控制流程的宏:
+
+#define FOO(x) \
+ do { \
+ if (blah(x) < 0) \
+ return -EBUGGERED; \
+ } while(0)
+
+非常不好。它看起来像一个函数,不过却能导致“调用”它的函数退出;不要打乱读者大脑里
+的语法分析器。
+
+2) 依赖于一个固定名字的本地变量的宏:
+
+#define FOO(val) bar(index, val)
+
+可能看起来像是个不错的东西,不过它非常容易把读代码的人搞糊涂,而且容易导致看起来
+不相关的改动带来错误。
+
+3) 作为左值的带参数的宏: FOO(x) = y;如果有人把FOO变成一个内联函数的话,这种用
+法就会出错了。
+
+4) 忘记了优先级:使用表达式定义常量的宏必须将表达式置于一对小括号之内。带参数的
+宏也要注意此类问题。
+
+#define CONSTANT 0x4000
+#define CONSTEXP (CONSTANT | 3)
+
+cpp手册对宏的讲解很详细。Gcc internals手册也详细讲解了RTL(译注:register
+transfer language),内核里的汇编语言经常用到它。
+
+
+ 第十三章:打印内核消息
+
+内核开发者应该是受过良好教育的。请一定注意内核信息的拼写,以给人以好的印象。不要
+用不规范的单词比如“dont”,而要用“do not”或者“don't”。保证这些信息简单、明了、无
+歧义。
+
+内核信息不必以句号(译注:英文句号,即点)结束。
+
+在小括号里打印数字(%d)没有任何价值,应该避免这样做。
+
+<linux/device.h>里有一些驱动模型诊断宏,你应该使用它们,以确保信息对应于正确的
+设备和驱动,并且被标记了正确的消息级别。这些宏有:dev_err(), dev_warn(),
+dev_info()等等。对于那些不和某个特定设备相关连的信息,<linux/kernel.h>定义了
+pr_debug()和pr_info()。
+
+写出好的调试信息可以是一个很大的挑战;当你写出来之后,这些信息在远程除错的时候
+就会成为极大的帮助。当DEBUG符号没有被定义的时候,这些信息不应该被编译进内核里
+(也就是说,默认地,它们不应该被包含在内)。如果你使用dev_dbg()或者pr_debug(),
+就能自动达到这个效果。很多子系统拥有Kconfig选项来启用-DDEBUG。还有一个相关的惯例
+是使用VERBOSE_DEBUG来添加dev_vdbg()消息到那些已经由DEBUG启用的消息之上。
+
+
+ 第十四章:分配内存
+
+内核提供了下面的一般用途的内存分配函数:kmalloc(),kzalloc(),kcalloc()和
+vmalloc()。请参考API文档以获取有关它们的详细信息。
+
+传递结构体大小的首选形式是这样的:
+
+ p = kmalloc(sizeof(*p), ...);
+
+另外一种传递方式中,sizeof的操作数是结构体的名字,这样会降低可读性,并且可能会引
+入bug。有可能指针变量类型被改变时,而对应的传递给内存分配函数的sizeof的结果不变。
+
+强制转换一个void指针返回值是多余的。C语言本身保证了从void指针到其他任何指针类型
+的转换是没有问题的。
+
+
+ 第十五章:内联弊病
+
+有一个常见的误解是内联函数是gcc提供的可以让代码运行更快的一个选项。虽然使用内联
+函数有时候是恰当的(比如作为一种替代宏的方式,请看第十二章),不过很多情况下不是
+这样。inline关键字的过度使用会使内核变大,从而使整个系统运行速度变慢。因为大内核
+会占用更多的指令高速缓存(译注:一级缓存通常是指令缓存和数据缓存分开的)而且会导
+致pagecache的可用内存减少。想象一下,一次pagecache未命中就会导致一次磁盘寻址,将
+耗时5毫秒。5毫秒的时间内CPU能执行很多很多指令。
+
+一个基本的原则是如果一个函数有3行以上,就不要把它变成内联函数。这个原则的一个例
+外是,如果你知道某个参数是一个编译时常量,而且因为这个常量你确定编译器在编译时能
+优化掉你的函数的大部分代码,那仍然可以给它加上inline关键字。kmalloc()内联函数就
+是一个很好的例子。
+
+人们经常主张给static的而且只用了一次的函数加上inline,如此不会有任何损失,因为没
+有什么好权衡的。虽然从技术上说这是正确的,但是实际上这种情况下即使不加inline gcc
+也可以自动使其内联。而且其他用户可能会要求移除inline,由此而来的争论会抵消inline
+自身的潜在价值,得不偿失。
+
+
+ 第十六章:函数返回值及命名
+
+函数可以返回很多种不同类型的值,最常见的一种是表明函数执行成功或者失败的值。这样
+的一个值可以表示为一个错误代码整数(-Exxx=失败,0=成功)或者一个“成功”布尔值(
+0=失败,非0=成功)。
+
+混合使用这两种表达方式是难于发现的bug的来源。如果C语言本身严格区分整形和布尔型变
+量,那么编译器就能够帮我们发现这些错误……不过C语言不区分。为了避免产生这种bug,请
+遵循下面的惯例:
+
+ 如果函数的名字是一个动作或者强制性的命令,那么这个函数应该返回错误代码整
+ 数。如果是一个判断,那么函数应该返回一个“成功”布尔值。
+
+比如,“add work”是一个命令,所以add_work()函数在成功时返回0,在失败时返回-EBUSY。
+类似的,因为“PCI device present”是一个判断,所以pci_dev_present()函数在成功找到
+一个匹配的设备时应该返回1,如果找不到时应该返回0。
+
+所有导出(译注:EXPORT)的函数都必须遵守这个惯例,所有的公共函数也都应该如此。私
+有(static)函数不需要如此,但是我们也推荐这样做。
+
+返回值是实际计算结果而不是计算是否成功的标志的函数不受此惯例的限制。一般的,他们
+通过返回一些正常值范围之外的结果来表示出错。典型的例子是返回指针的函数,他们使用
+NULL或者ERR_PTR机制来报告错误。
+
+
+ 第十七章:不要重新发明内核宏
+
+头文件include/linux/kernel.h包含了一些宏,你应该使用它们,而不要自己写一些它们的
+变种。比如,如果你需要计算一个数组的长度,使用这个宏
+
+ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+类似的,如果你要计算某结构体成员的大小,使用
+
+ #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
+
+还有可以做严格的类型检查的min()和max()宏,如果你需要可以使用它们。你可以自己看看
+那个头文件里还定义了什么你可以拿来用的东西,如果有定义的话,你就不应在你的代码里
+自己重新定义。
+
+
+ 第十八章:编辑器模式行和其他需要罗嗦的事情
+
+有一些编辑器可以解释嵌入在源文件里的由一些特殊标记标明的配置信息。比如,emacs
+能够解释被标记成这样的行:
+
+-*- mode: c -*-
+
+或者这样的:
+
+/*
+Local Variables:
+compile-command: "gcc -DMAGIC_DEBUG_FLAG foo.c"
+End:
+*/
+
+Vim能够解释这样的标记:
+
+/* vim:set sw=8 noet */
+
+不要在源代码中包含任何这样的内容。每个人都有他自己的编辑器配置,你的源文件不应
+该覆盖别人的配置。这包括有关缩进和模式配置的标记。人们可以使用他们自己定制的模
+式,或者使用其他可以产生正确的缩进的巧妙方法。
+
+
+
+ 附录 I:参考
+
+The C Programming Language, 第二版, 作者Brian W. Kernighan和Denni
+M. Ritchie. Prentice Hall, Inc., 1988. ISBN 0-13-110362-8 (软皮),
+0-13-110370-9 (硬皮). URL: http://cm.bell-labs.com/cm/cs/cbook/
+
+The Practice of Programming 作者Brian W. Kernighan和Rob Pike. Addison-Wesley,
+Inc., 1999. ISBN 0-201-61586-X. URL: http://cm.bell-labs.com/cm/cs/tpop/
+
+cpp,gcc,gcc internals和indent的GNU手册——和K&R及本文相符合的部分,全部可以在
+http://www.gnu.org/manual/找到
+
+WG14是C语言的国际标准化工作组,URL: http://www.open-std.org/JTC1/SC22/WG14/
+
+Kernel CodingStyle,作者greg@kroah.com发表于OLS 2002:
+http://www.kroah.com/linux/talks/ols_2002_kernel_codingstyle_talk/html/
+
+--
+最后更新于2007年7月13日。
diff --git a/Documentation/zh_CN/HOWTO b/Documentation/zh_CN/HOWTO
new file mode 100644
index 0000000..3d80e8a
--- /dev/null
+++ b/Documentation/zh_CN/HOWTO
@@ -0,0 +1,538 @@
+Chinese translated version of Documentation/HOWTO
+
+If you have any comment or update to the content, please contact the
+original document maintainer directly. However, if you have a problem
+communicating in English you can also ask the Chinese maintainer for
+help. Contact the Chinese maintainer if this translation is outdated
+or if there is a problem with the translation.
+
+Maintainer: Greg Kroah-Hartman <greg@kroah.com>
+Chinese maintainer: Li Yang <leoli@freescale.com>
+---------------------------------------------------------------------
+Documentation/HOWTO 的中文翻译
+
+如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文
+交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻
+译存在问题,请联系中文版维护者。
+
+英文版维护者: Greg Kroah-Hartman <greg@kroah.com>
+中文版维护者: 李阳 Li Yang <leoli@freescale.com>
+中文版翻译者: 李阳 Li Yang <leoli@freescale.com>
+中文版校译者: 钟宇 TripleX Chung <xxx.phy@gmail.com>
+ 陈琦 Maggie Chen <chenqi@beyondsoft.com>
+ 王聪 Wang Cong <xiyou.wangcong@gmail.com>
+
+以下为正文
+---------------------------------------------------------------------
+
+如何参与Linux内核开发
+---------------------
+
+这是一篇将如何参与Linux内核开发的相关问题一网打尽的终极秘笈。它将指导你
+成为一名Linux内核开发者,并且学会如何同Linux内核开发社区合作。它尽可能不
+包括任何关于内核编程的技术细节,但会给你指引一条获得这些知识的正确途径。
+
+如果这篇文章中的任何内容不再适用,请给文末列出的文件维护者发送补丁。
+
+
+入门
+----
+
+你想了解如何成为一名Linux内核开发者?或者老板吩咐你“给这个设备写个Linux
+驱动程序”?这篇文章的目的就是教会你达成这些目标的全部诀窍,它将描述你需
+要经过的流程以及给出如何同内核社区合作的一些提示。它还将试图解释内核社区
+为何这样运作。
+
+Linux内核大部分是由C语言写成的,一些体系结构相关的代码用到了汇编语言。要
+参与内核开发,你必须精通C语言。除非你想为某个架构开发底层代码,否则你并
+不需要了解(任何体系结构的)汇编语言。下面列举的书籍虽然不能替代扎实的C
+语言教育和多年的开发经验,但如果需要的话,做为参考还是不错的:
+ - "The C Programming Language" by Kernighan and Ritchie [Prentice Hall]
+ 《C程序设计语言(第2版·新版)》(徐宝文 李志 译)[机械工业出版社]
+ - "Practical C Programming" by Steve Oualline [O'Reilly]
+ 《实用C语言编程(第三版)》(郭大海 译)[中国电力出版社]
+ - "C: A Reference Manual" by Harbison and Steele [Prentice Hall]
+ 《C语言参考手册(原书第5版)》(邱仲潘 等译)[机械工业出版社]
+
+Linux内核使用GNU C和GNU工具链开发。虽然它遵循ISO C89标准,但也用到了一些
+标准中没有定义的扩展。内核是自给自足的C环境,不依赖于标准C库的支持,所以
+并不支持C标准中的部分定义。比如long long类型的大数除法和浮点运算就不允许
+使用。有时候确实很难弄清楚内核对工具链的要求和它所使用的扩展,不幸的是目
+前还没有明确的参考资料可以解释它们。请查阅gcc信息页(使用“info gcc”命令
+显示)获得一些这方面信息。
+
+请记住你是在学习怎么和已经存在的开发社区打交道。它由一群形形色色的人组成,
+他们对代码、风格和过程有着很高的标准。这些标准是在长期实践中总结出来的,
+适应于地理上分散的大型开发团队。它们已经被很好得整理成档,建议你在开发
+之前尽可能多的学习这些标准,而不要期望别人来适应你或者你公司的行为方式。
+
+
+法律问题
+--------
+
+Linux内核源代码都是在GPL(通用公共许可证)的保护下发布的。要了解这种许可
+的细节请查看源代码主目录下的COPYING文件。如果你对它还有更深入问题请联系
+律师,而不要在Linux内核邮件组上提问。因为邮件组里的人并不是律师,不要期
+望他们的话有法律效力。
+
+对于GPL的常见问题和解答,请访问以下链接:
+ http://www.gnu.org/licenses/gpl-faq.html
+
+
+文档
+----
+
+Linux内核代码中包含有大量的文档。这些文档对于学习如何与内核社区互动有着
+不可估量的价值。当一个新的功能被加入内核,最好把解释如何使用这个功能的文
+档也放进内核。当内核的改动导致面向用户空间的接口发生变化时,最好将相关信
+息或手册页(manpages)的补丁发到mtk.manpages@gmail.com,以向手册页(manpages)
+的维护者解释这些变化。
+
+以下是内核代码中需要阅读的文档:
+ README
+ 文件简要介绍了Linux内核的背景,并且描述了如何配置和编译内核。内核的
+ 新用户应该从这里开始。
+
+ Documentation/Changes
+ 文件给出了用来编译和使用内核所需要的最小软件包列表。
+
+ Documentation/CodingStyle
+ 描述Linux内核的代码风格和理由。所有新代码需要遵守这篇文档中定义的规
+ 范。大多数维护者只会接收符合规定的补丁,很多人也只会帮忙检查符合风格
+ 的代码。
+
+ Documentation/SubmittingPatches
+ Documentation/SubmittingDrivers
+ 这两份文档明确描述如何创建和发送补丁,其中包括(但不仅限于):
+ - 邮件内容
+ - 邮件格式
+ - 选择收件人
+ 遵守这些规定并不能保证提交成功(因为所有补丁需要通过严格的内容和风格
+ 审查),但是忽视他们几乎就意味着失败。
+
+ 其他关于如何正确地生成补丁的优秀文档包括:
+ "The Perfect Patch"
+ http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt
+ "Linux kernel patch submission format"
+ http://linux.yyz.us/patch-format.html
+
+ Documentation/stable_api_nonsense.txt
+ 论证内核为什么特意不包括稳定的内核内部API,也就是说不包括像这样的特
+ 性:
+ - 子系统中间层(为了兼容性?)
+ - 在不同操作系统间易于移植的驱动程序
+ - 减缓(甚至阻止)内核代码的快速变化
+ 这篇文档对于理解Linux的开发哲学至关重要。对于将开发平台从其他操作系
+ 统转移到Linux的人来说也很重要。
+
+ Documentation/SecurityBugs
+ 如果你认为自己发现了Linux内核的安全性问题,请根据这篇文档中的步骤来
+ 提醒其他内核开发者并帮助解决这个问题。
+
+ Documentation/ManagementStyle
+ 描述内核维护者的工作方法及其共有特点。这对于刚刚接触内核开发(或者对
+ 它感到好奇)的人来说很重要,因为它解释了很多对于内核维护者独特行为的
+ 普遍误解与迷惑。
+
+ Documentation/stable_kernel_rules.txt
+ 解释了稳定版内核发布的规则,以及如何将改动放入这些版本的步骤。
+
+ Documentation/kernel-docs.txt
+ 有助于内核开发的外部文档列表。如果你在内核自带的文档中没有找到你想找
+ 的内容,可以查看这些文档。
+
+ Documentation/applying-patches.txt
+ 关于补丁是什么以及如何将它打在不同内核开发分支上的好介绍
+
+内核还拥有大量从代码自动生成的文档。它包含内核内部API的全面介绍以及如何
+妥善处理加锁的规则。生成的文档会放在 Documentation/DocBook/目录下。在内
+核源码的主目录中使用以下不同命令将会分别生成PDF、Postscript、HTML和手册
+页等不同格式的文档:
+ make pdfdocs
+ make psdocs
+ make htmldocs
+ make mandocs
+
+
+如何成为内核开发者
+------------------
+如果你对Linux内核开发一无所知,你应该访问“Linux内核新手”计划:
+ http://kernelnewbies.org
+它拥有一个可以问各种最基本的内核开发问题的邮件列表(在提问之前一定要记得
+查找已往的邮件,确认是否有人已经回答过相同的问题)。它还拥有一个可以获得
+实时反馈的IRC聊天频道,以及大量对于学习Linux内核开发相当有帮助的文档。
+
+网站简要介绍了源代码组织结构、子系统划分以及目前正在进行的项目(包括内核
+中的和单独维护的)。它还提供了一些基本的帮助信息,比如如何编译内核和打补
+丁。
+
+如果你想加入内核开发社区并协助完成一些任务,却找不到从哪里开始,可以访问
+“Linux内核房管员”计划:
+ http://janitor.kernelnewbies.org/
+这是极佳的起点。它提供一个相对简单的任务列表,列出内核代码中需要被重新
+整理或者改正的地方。通过和负责这个计划的开发者们一同工作,你会学到将补丁
+集成进内核的基本原理。如果还没有决定下一步要做什么的话,你还可能会得到方
+向性的指点。
+
+如果你已经有一些现成的代码想要放到内核中,但是需要一些帮助来使它们拥有正
+确的格式。请访问“内核导师”计划。这个计划就是用来帮助你完成这个目标的。它
+是一个邮件列表,地址如下:
+ http://selenic.com/mailman/listinfo/kernel-mentors
+
+在真正动手修改内核代码之前,理解要修改的代码如何运作是必需的。要达到这个
+目的,没什么办法比直接读代码更有效了(大多数花招都会有相应的注释),而且
+一些特制的工具还可以提供帮助。例如,“Linux代码交叉引用”项目就是一个值得
+特别推荐的帮助工具,它将源代码显示在有编目和索引的网页上。其中一个更新及
+时的内核源码库,可以通过以下地址访问:
+ http://sosdg.org/~coywolf/lxr/
+
+
+开发流程
+--------
+
+目前Linux内核开发流程包括几个“主内核分支”和很多子系统相关的内核分支。这
+些分支包括:
+ - 2.6.x主内核源码树
+ - 2.6.x.y -stable内核源码树
+ - 2.6.x -git内核补丁集
+ - 2.6.x -mm内核补丁集
+ - 子系统相关的内核源码树和补丁集
+
+
+2.6.x内核主源码树
+-----------------
+2.6.x内核是由Linus Torvalds(Linux的创造者)亲自维护的。你可以在
+kernel.org网站的pub/linux/kernel/v2.6/目录下找到它。它的开发遵循以下步
+骤:
+ - 每当一个新版本的内核被发布,为期两周的集成窗口将被打开。在这段时间里
+ 维护者可以向Linus提交大段的修改,通常这些修改已经被放到-mm内核中几个
+ 星期了。提交大量修改的首选方式是使用git工具(内核的代码版本管理工具
+ ,更多的信息可以在http://git.or.cz/获取),不过使用普通补丁也是可以
+ 的。
+ - 两个星期以后-rc1版本内核发布。之后只有不包含可能影响整个内核稳定性的
+ 新功能的补丁才可能被接受。请注意一个全新的驱动程序(或者文件系统)有
+ 可能在-rc1后被接受是因为这样的修改完全独立,不会影响其他的代码,所以
+ 没有造成内核退步的风险。在-rc1以后也可以用git向Linus提交补丁,不过所
+ 有的补丁需要同时被发送到相应的公众邮件列表以征询意见。
+ - 当Linus认为当前的git源码树已经达到一个合理健全的状态足以发布供人测试
+ 时,一个新的-rc版本就会被发布。计划是每周都发布新的-rc版本。
+ - 这个过程一直持续下去直到内核被认为达到足够稳定的状态,持续时间大概是
+ 6个星期。
+ - 以下地址跟踪了在每个-rc发布中发现的退步列表:
+ http://kernelnewbies.org/known_regressions
+
+关于内核发布,值得一提的是Andrew Morton在linux-kernel邮件列表中如是说:
+ “没有人知道新内核何时会被发布,因为发布是根据已知bug的情况来决定
+ 的,而不是根据一个事先制定好的时间表。”
+
+
+2.6.x.y -stable(稳定版)内核源码树
+-----------------------------------
+由4个数字组成的内核版本号说明此内核是-stable版本。它们包含基于2.6.x版本
+内核的相对较小且至关重要的修补,这些修补针对安全性问题或者严重的内核退步。
+
+这种版本的内核适用于那些期望获得最新的稳定版内核并且不想参与测试开发版或
+者实验版的用户。
+
+如果没有2.6.x.y版本内核存在,那么最新的2.6.x版本内核就相当于是当前的稳定
+版内核。
+
+2.6.x.y版本由“稳定版”小组(邮件地址<stable@kernel.org>)维护,一般隔周发
+布新版本。
+
+内核源码中的Documentation/stable_kernel_rules.txt文件具体描述了可被稳定
+版内核接受的修改类型以及发布的流程。
+
+
+2.6.x -git补丁集
+----------------
+Linus的内核源码树的每日快照,这个源码树是由git工具管理的(由此得名)。这
+些补丁通常每天更新以反映Linus的源码树的最新状态。它们比-rc版本的内核源码
+树更具试验性质,因为这个补丁集是全自动生成的,没有任何人来确认其是否真正
+健全。
+
+
+2.6.x -mm补丁集
+---------------
+这是由Andrew Morton维护的试验性内核补丁集。Andrew将所有子系统的内核源码
+和补丁拼凑到一起,并且加入了大量从linux-kernel邮件列表中采集的补丁。这个
+源码树是新功能和补丁的试炼场。当补丁在-mm补丁集里证明了其价值以后Andrew
+或者相应子系统的维护者会将补丁发给Linus以便集成进主内核源码树。
+
+在将所有新补丁发给Linus以集成到主内核源码树之前,我们非常鼓励先把这些补
+丁放在-mm版内核源码树中进行测试。
+
+这些内核版本不适合在需要稳定运行的系统上运行,因为运行它们比运行任何其他
+内核分支都更具有风险。
+
+如果你想为内核开发进程提供帮助,请尝试并使用这些内核版本,并在
+linux-kernel邮件列表中提供反馈,告诉大家你遇到了问题还是一切正常。
+
+通常-mm版补丁集不光包括这些额外的试验性补丁,还包括发布时-git版主源码树
+中的改动。
+
+-mm版内核没有固定的发布周期,但是通常在每两个-rc版内核发布之间都会有若干
+个-mm版内核发布(一般是1至3个)。
+
+
+子系统相关内核源码树和补丁集
+----------------------------
+相当一部分内核子系统开发者会公开他们自己的开发源码树,以便其他人能了解内
+核的不同领域正在发生的事情。如上所述,这些源码树会被集成到-mm版本内核中。
+
+下面是目前可用的一些内核源码树的列表:
+ 通过git管理的源码树:
+ - Kbuild开发源码树, Sam Ravnborg <sam@ravnborg.org>
+ git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
+
+ - ACPI开发源码树, Len Brown <len.brown@intel.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
+
+ - 块设备开发源码树, Jens Axboe <axboe@suse.de>
+ git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
+
+ - DRM开发源码树, Dave Airlie <airlied@linux.ie>
+ git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
+
+ - ia64开发源码树, Tony Luck <tony.luck@intel.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
+
+ - ieee1394开发源码树, Jody McIntyre <scjody@modernduck.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/scjody/ieee1394.git
+
+ - infiniband开发源码树, Roland Dreier <rolandd@cisco.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
+
+ - libata开发源码树, Jeff Garzik <jgarzik@pobox.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
+
+ - 网络驱动程序开发源码树, Jeff Garzik <jgarzik@pobox.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
+
+ - pcmcia开发源码树, Dominik Brodowski <linux@dominikbrodowski.net>
+ git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
+
+ - SCSI开发源码树, James Bottomley <James.Bottomley@SteelEye.com>
+ git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
+
+ 使用quilt管理的补丁集:
+ - USB, PCI, 驱动程序核心和I2C, Greg Kroah-Hartman <gregkh@suse.de>
+ kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+ - x86-64, 部分i386, Andi Kleen <ak@suse.de>
+ ftp.firstfloor.org:/pub/ak/x86_64/quilt/
+
+ 其他内核源码树可以在http://git.kernel.org的列表中和MAINTAINERS文件里
+ 找到。
+
+报告bug
+-------
+
+bugzilla.kernel.org是Linux内核开发者们用来跟踪内核Bug的网站。我们鼓励用
+户在这个工具中报告找到的所有bug。如何使用内核bugzilla的细节请访问:
+ http://test.kernel.org/bugzilla/faq.html
+
+内核源码主目录中的REPORTING-BUGS文件里有一个很好的模板。它指导用户如何报
+告可能的内核bug以及需要提供哪些信息来帮助内核开发者们找到问题的根源。
+
+
+利用bug报告
+-----------
+
+练习内核开发技能的最好办法就是修改其他人报告的bug。你不光可以帮助内核变
+得更加稳定,还可以学会如何解决实际问题从而提高自己的技能,并且让其他开发
+者感受到你的存在。修改bug是赢得其他开发者赞誉的最好办法,因为并不是很多
+人都喜欢浪费时间去修改别人报告的bug。
+
+要尝试修改已知的bug,请访问http://bugzilla.kernel.org网址。如果你想获得
+最新bug的通知,可以订阅bugme-new邮件列表(只有新的bug报告会被寄到这里)
+或者订阅bugme-janitor邮件列表(所有bugzilla的变动都会被寄到这里)。
+
+ http://lists.osdl.org/mailman/listinfo/bugme-new
+ http://lists.osdl.org/mailman/listinfo/bugme-janitors
+
+
+邮件列表
+--------
+
+正如上面的文档所描述,大多数的骨干内核开发者都加入了Linux Kernel邮件列
+表。如何订阅和退订列表的细节可以在这里找到:
+ http://vger.kernel.org/vger-lists.html#linux-kernel
+网上很多地方都有这个邮件列表的存档(archive)。可以使用搜索引擎来找到这些
+存档。比如:
+ http://dir.gmane.org/gmane.linux.kernel
+在发信之前,我们强烈建议你先在存档中搜索你想要讨论的问题。很多已经被详细
+讨论过的问题只在邮件列表的存档中可以找到。
+
+大多数内核子系统也有自己独立的邮件列表来协调各自的开发工作。从
+MAINTAINERS文件中可以找到不同话题对应的邮件列表。
+
+很多邮件列表架设在kernel.org服务器上。这些列表的信息可以在这里找到:
+ http://vger.kernel.org/vger-lists.html
+
+在使用这些邮件列表时,请记住保持良好的行为习惯。下面的链接提供了与这些列
+表(或任何其它邮件列表)交流的一些简单规则,虽然内容有点滥竽充数。
+ http://www.albion.com/netiquette/
+
+当有很多人回复你的邮件时,邮件的抄送列表会变得很长。请不要将任何人从抄送
+列表中删除,除非你有足够的理由这么做。也不要只回复到邮件列表。请习惯于同
+一封邮件接收两次(一封来自发送者一封来自邮件列表),而不要试图通过添加一
+些奇特的邮件头来解决这个问题,人们不会喜欢的。
+
+记住保留你所回复内容的上下文和源头。在你回复邮件的顶部保留“某某某说到……”
+这几行。将你的评论加在被引用的段落之间而不要放在邮件的顶部。
+
+如果你在邮件中附带补丁,请确认它们是可以直接阅读的纯文本(如
+Documentation/SubmittingPatches文档中所述)。内核开发者们不希望遇到附件
+或者被压缩了的补丁。只有这样才能保证他们可以直接评论你的每行代码。请确保
+你使用的邮件发送程序不会修改空格和制表符。一个防范性的测试方法是先将邮件
+发送给自己,然后自己尝试是否可以顺利地打上收到的补丁。如果测试不成功,请
+调整或者更换你的邮件发送程序直到它正确工作为止。
+
+总而言之,请尊重其他的邮件列表订阅者。
+
+
+同内核社区合作
+----------------
+
+内核社区的目标就是提供尽善尽美的内核。所以当你提交补丁期望被接受进内核的
+时候,它的技术价值以及其他方面都将被评审。那么你可能会得到什么呢?
+ - 批评
+ - 评论
+ - 要求修改
+ - 要求证明修改的必要性
+ - 沉默
+
+要记住,这些是把补丁放进内核的正常情况。你必须学会听取对补丁的批评和评论,
+从技术层面评估它们,然后要么重写你的补丁要么简明扼要地论证修改是不必要
+的。如果你发的邮件没有得到任何回应,请过几天后再试一次,因为有时信件会湮
+没在茫茫信海中。
+
+你不应该做的事情:
+ - 期望自己的补丁不受任何质疑就直接被接受
+ - 翻脸
+ - 忽略别人的评论
+ - 没有按照别人的要求做任何修改就重新提交
+
+在一个努力追寻最好技术方案的社区里,对于一个补丁有多少好处总会有不同的见
+解。你必须要抱着合作的态度,愿意改变自己的观点来适应内核的风格。或者至少
+愿意去证明你的想法是有价值的。记住,犯错误是允许的,只要你愿意朝着正确的
+方案去努力。
+
+如果你的第一个补丁换来的是一堆修改建议,这是很正常的。这并不代表你的补丁
+不会被接受,也不意味着有人和你作对。你只需要改正所有提出的问题然后重新发
+送你的补丁。
+
+内核社区和公司文化的差异
+------------------------
+
+内核社区的工作模式同大多数传统公司开发队伍的工作模式并不相同。下面这些例
+子,可以帮助你避免某些可能发生问题:
+ 用这些话介绍你的修改提案会有好处:
+ - 它同时解决了多个问题
+ - 它删除了2000行代码
+ - 这是补丁,它已经解释了我想要说明的
+ - 我在5种不同的体系结构上测试过它……
+ - 这是一系列小补丁用来……
+ - 这个修改提高了普通机器的性能……
+
+ 应该避免如下的说法:
+ - 我们在AIX/ptx/Solaris就是这么做的,所以这么做肯定是好的……
+ - 我做这行已经20年了,所以……
+ - 为了我们公司赚钱考虑必须这么做
+ - 这是我们的企业产品线所需要的
+ - 这里是描述我观点的1000页设计文档
+ - 这是一个5000行的补丁用来……
+ - 我重写了现在乱七八糟的代码,这就是……
+ - 我被规定了最后期限,所以这个补丁需要立刻被接受
+
+另外一个内核社区与大部分传统公司的软件开发队伍不同的地方是无法面对面地交
+流。使用电子邮件和IRC聊天工具做为主要沟通工具的一个好处是性别和种族歧视
+将会更少。Linux内核的工作环境更能接受妇女和少数族群,因为每个人在别人眼
+里只是一个邮件地址。国际化也帮助了公平的实现,因为你无法通过姓名来判断人
+的性别。男人有可能叫李丽,女人也有可能叫王刚。大多数在Linux内核上工作过
+并表达过看法的女性对在linux上工作的经历都给出了正面的评价。
+
+对于一些不习惯使用英语的人来说,语言可能是一个引起问题的障碍。在邮件列表
+中要正确地表达想法必需良好地掌握语言,所以建议你在发送邮件之前最好检查一
+下英文写得是否正确。
+
+
+拆分修改
+--------
+
+Linux内核社区并不喜欢一下接收大段的代码。修改需要被恰当地介绍、讨论并且
+拆分成独立的小段。这几乎完全和公司中的习惯背道而驰。你的想法应该在开发最
+开始的阶段就让大家知道,这样你就可以及时获得对你正在进行的开发的反馈。这
+样也会让社区觉得你是在和他们协作,而不是仅仅把他们当作倾销新功能的对象。
+无论如何,你不要一次性地向邮件列表发送50封信,你的补丁序列应该永远用不到
+这么多。
+
+将补丁拆开的原因如下:
+
+1) 小的补丁更有可能被接受,因为它们不需要太多的时间和精力去验证其正确性。
+ 一个5行的补丁,可能在维护者看了一眼以后就会被接受。而500行的补丁则
+ 需要数个小时来审查其正确性(所需时间随补丁大小增加大约呈指数级增长)。
+
+ 当出了问题的时候,小的补丁也会让调试变得非常容易。一个一个补丁地回溯
+ 将会比仔细剖析一个被打上的大补丁(这个补丁破坏了其他东西)容易得多。
+
+2)不光发送小的补丁很重要,在提交之前重新编排、化简(或者仅仅重新排列)
+ 补丁也是很重要的。
+
+这里有内核开发者Al Viro打的一个比方:
+ “想象一个老师正在给学生批改数学作业。老师并不希望看到学生为了得
+ 到正确解法所进行的尝试和产生的错误。他希望看到的是最干净最优雅的
+ 解答。好学生了解这点,绝不会把最终解决之前的中间方案提交上去。”
+
+ 内核开发也是这样。维护者和评审者不希望看到一个人在解决问题时的思
+ 考过程。他们只希望看到简单和优雅的解决方案。
+
+直接给出一流的解决方案,和社区一起协作讨论尚未完成的工作,这两者之间似乎
+很难找到一个平衡点。所以最好尽早开始收集有利于你进行改进的反馈;同时也要
+保证修改分成很多小块,这样在整个项目都准备好被包含进内核之前,其中的一部
+分可能会先被接收。
+
+必须了解这样做是不可接受的:试图将未完成的工作提交进内核,然后再找时间修
+复。
+
+
+证明修改的必要性
+----------------
+除了将补丁拆成小块,很重要的一点是让Linux社区了解他们为什么需要这样修改。
+你必须证明新功能是有人需要的并且是有用的。
+
+
+记录修改
+--------
+
+当你发送补丁的时候,需要特别留意邮件正文的内容。因为这里的信息将会做为补
+丁的修改记录(ChangeLog),会被一直保留以备大家查阅。它需要完全地描述补丁,
+包括:
+ - 为什么需要这个修改
+ - 补丁的总体设计
+ - 实现细节
+ - 测试结果
+
+想了解它具体应该看起来像什么,请查阅以下文档中的“ChangeLog”章节:
+ “The Perfect Patch”
+ http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt
+
+
+这些事情有时候做起来很难。要在任何方面都做到完美可能需要好几年时间。这是
+一个持续提高的过程,它需要大量的耐心和决心。只要不放弃,你一定可以做到。
+很多人已经做到了,而他们都曾经和现在的你站在同样的起点上。
+
+
+---------------
+感谢Paolo Ciarrocchi允许“开发流程”部分基于他所写的文章
+(http://linux.tar.bz/articles/2.6-development_process),感谢Randy
+Dunlap和Gerrit Huizenga完善了应该说和不该说的列表。感谢Pat Mochel, Hanna
+Linder, Randy Dunlap, Kay Sievers, Vojtech Pavlik, Jan Kara, Josh Boyer,
+Kees Cook, Andrew Morton, Andi Kleen, Vadim Lobanov, Jesper Juhl, Adrian
+Bunk, Keri Harris, Frans Pop, David A. Wheeler, Junio Hamano, Michael
+Kerrisk和Alex Shepard的评审、建议和贡献。没有他们的帮助,这篇文档是不可
+能完成的。
+
+
+
+英文版维护者: Greg Kroah-Hartman <greg@kroah.com>
diff --git a/Documentation/zh_CN/SubmittingDrivers b/Documentation/zh_CN/SubmittingDrivers
new file mode 100644
index 0000000..5f4815c
--- /dev/null
+++ b/Documentation/zh_CN/SubmittingDrivers
@@ -0,0 +1,168 @@
+Chinese translated version of Documentation/SubmittingDrivers
+
+If you have any comment or update to the content, please contact the
+original document maintainer directly. However, if you have a problem
+communicating in English you can also ask the Chinese maintainer for
+help. Contact the Chinese maintainer if this translation is outdated
+or if there is a problem with the translation.
+
+Chinese maintainer: Li Yang <leo@zh-kernel.org>
+---------------------------------------------------------------------
+Documentation/SubmittingDrivers 的中文翻译
+
+如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文
+交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻
+译存在问题,请联系中文版维护者。
+
+中文版维护者: 李阳 Li Yang <leo@zh-kernel.org>
+中文版翻译者: 李阳 Li Yang <leo@zh-kernel.org>
+中文版校译者: 陈琦 Maggie Chen <chenqi@beyondsoft.com>
+ 王聪 Wang Cong <xiyou.wangcong@gmail.com>
+ 张巍 Zhang Wei <Wei.Zhang@freescale.com>
+
+以下为正文
+---------------------------------------------------------------------
+
+如何向 Linux 内核提交驱动程序
+-----------------------------
+
+这篇文档将会解释如何向不同的内核源码树提交设备驱动程序。请注意,如果你感
+兴趣的是显卡驱动程序,你也许应该访问 XFree86 项目(http://www.xfree86.org/)
+和/或 X.org 项目 (http://x.org)。
+
+另请参阅 Documentation/SubmittingPatches 文档。
+
+
+分配设备号
+----------
+
+块设备和字符设备的主设备号与从设备号是由 Linux 命名编号分配权威 LANANA(
+现在是 Torben Mathiasen)负责分配。申请的网址是 http://www.lanana.org/。
+即使不准备提交到主流内核的设备驱动也需要在这里分配设备号。有关详细信息,
+请参阅 Documentation/devices.txt。
+
+如果你使用的不是已经分配的设备号,那么当你提交设备驱动的时候,它将会被强
+制分配一个新的设备号,即便这个设备号和你之前发给客户的截然不同。
+
+设备驱动的提交对象
+------------------
+
+Linux 2.0:
+ 此内核源码树不接受新的驱动程序。
+
+Linux 2.2:
+ 此内核源码树不接受新的驱动程序。
+
+Linux 2.4:
+ 如果所属的代码领域在内核的 MAINTAINERS 文件中列有一个总维护者,
+ 那么请将驱动程序提交给他。如果此维护者没有回应或者你找不到恰当的
+ 维护者,那么请联系 Willy Tarreau <w@1wt.eu>。
+
+Linux 2.6:
+ 除了遵循和 2.4 版内核同样的规则外,你还需要在 linux-kernel 邮件
+ 列表上跟踪最新的 API 变化。向 Linux 2.6 内核提交驱动的顶级联系人
+ 是 Andrew Morton <akpm@osdl.org>。
+
+决定设备驱动能否被接受的条件
+----------------------------
+
+许可: 代码必须使用 GNU 通用公开许可证 (GPL) 提交给 Linux,但是
+ 我们并不要求 GPL 是唯一的许可。你或许会希望同时使用多种
+ 许可证发布,如果希望驱动程序可以被其他开源社区(比如BSD)
+ 使用。请参考 include/linux/module.h 文件中所列出的可被
+ 接受共存的许可。
+
+版权: 版权所有者必须同意使用 GPL 许可。最好提交者和版权所有者
+ 是相同个人或实体。否则,必需列出授权使用 GPL 的版权所有
+ 人或实体,以备验证之需。
+
+接口: 如果你的驱动程序使用现成的接口并且和其他同类的驱动程序行
+ 为相似,而不是去发明无谓的新接口,那么它将会更容易被接受。
+ 如果你需要一个 Linux 和 NT 的通用驱动接口,那么请在用
+ 户空间实现它。
+
+代码: 请使用 Documentation/CodingStyle 中所描述的 Linux 代码风
+ 格。如果你的某些代码段(例如那些与 Windows 驱动程序包共
+ 享的代码段)需要使用其他格式,而你却只希望维护一份代码,
+ 那么请将它们很好地区分出来,并且注明原因。
+
+可移植性: 请注意,指针并不永远是 32 位的,不是所有的计算机都使用小
+ 尾模式 (little endian) 存储数据,不是所有的人都拥有浮点
+ 单元,不要随便在你的驱动程序里嵌入 x86 汇编指令。只能在
+ x86 上运行的驱动程序一般是不受欢迎的。虽然你可能只有 x86
+ 硬件,很难测试驱动程序在其他平台上是否可用,但是确保代码
+ 可以被轻松地移植却是很简单的。
+
+清晰度: 做到所有人都能修补这个驱动程序将会很有好处,因为这样你将
+ 会直接收到修复的补丁而不是 bug 报告。如果你提交一个试图
+ 隐藏硬件工作机理的驱动程序,那么它将会被扔进废纸篓。
+
+电源管理: 因为 Linux 正在被很多移动设备和桌面系统使用,所以你的驱
+ 动程序也很有可能被使用在这些设备上。它应该支持最基本的电
+ 源管理,即在需要的情况下实现系统级休眠和唤醒要用到的
+ .suspend 和 .resume 函数。你应该检查你的驱动程序是否能正
+ 确地处理休眠与唤醒,如果实在无法确认,请至少把 .suspend
+ 函数定义成返回 -ENOSYS(功能未实现)错误。你还应该尝试确
+ 保你的驱动在什么都不干的情况下将耗电降到最低。要获得驱动
+ 程序测试的指导,请参阅
+ Documentation/power/drivers-testing.txt。有关驱动程序电
+ 源管理问题相对全面的概述,请参阅
+ Documentation/power/devices.txt。
+
+管理: 如果一个驱动程序的作者还在进行有效的维护,那么通常除了那
+ 些明显正确且不需要任何检查的补丁以外,其他所有的补丁都会
+ 被转发给作者。如果你希望成为驱动程序的联系人和更新者,最
+ 好在代码注释中写明并且在 MAINTAINERS 文件中加入这个驱动
+ 程序的条目。
+
+不影响设备驱动能否被接受的条件
+------------------------------
+
+供应商: 由硬件供应商来维护驱动程序通常是一件好事。不过,如果源码
+ 树里已经有其他人提供了可稳定工作的驱动程序,那么请不要期
+ 望“我是供应商”会成为内核改用你的驱动程序的理由。理想的情
+ 况是:供应商与现有驱动程序的作者合作,构建一个统一完美的
+ 驱动程序。
+
+作者: 驱动程序是由大的 Linux 公司研发还是由你个人编写,并不影
+ 响其是否能被内核接受。没有人对内核源码树享有特权。只要你
+ 充分了解内核社区,你就会发现这一点。
+
+
+资源列表
+--------
+
+Linux 内核主源码树:
+ ftp.??.kernel.org:/pub/linux/kernel/...
+ ?? == 你的国家代码,例如 "cn"、"us"、"uk"、"fr" 等等
+
+Linux 内核邮件列表:
+ linux-kernel@vger.kernel.org
+ [可通过向majordomo@vger.kernel.org发邮件来订阅]
+
+Linux 设备驱动程序,第三版(探讨 2.6.10 版内核):
+ http://lwn.net/Kernel/LDD3/ (免费版)
+
+LWN.net:
+ 每周内核开发活动摘要 - http://lwn.net/
+ 2.6 版中 API 的变更:
+ http://lwn.net/Articles/2.6-kernel-api/
+ 将旧版内核的驱动程序移植到 2.6 版:
+ http://lwn.net/Articles/driver-porting/
+
+KernelTrap:
+ Linux 内核的最新动态以及开发者访谈
+ http://kerneltrap.org/
+
+内核新手(KernelNewbies):
+ 为新的内核开发者提供文档和帮助
+ http://kernelnewbies.org/
+
+Linux USB项目:
+ http://www.linux-usb.org/
+
+写内核驱动的“不要”(Arjan van de Ven著):
+ http://www.fenrus.org/how-to-not-write-a-device-driver-paper.pdf
+
+内核清洁工 (Kernel Janitor):
+ http://janitor.kernelnewbies.org/
diff --git a/Documentation/zh_CN/SubmittingPatches b/Documentation/zh_CN/SubmittingPatches
new file mode 100644
index 0000000..985c92e
--- /dev/null
+++ b/Documentation/zh_CN/SubmittingPatches
@@ -0,0 +1,416 @@
+Chinese translated version of Documentation/SubmittingPatches
+
+If you have any comment or update to the content, please contact the
+original document maintainer directly. However, if you have a problem
+communicating in English you can also ask the Chinese maintainer for
+help. Contact the Chinese maintainer if this translation is outdated
+or if there is a problem with the translation.
+
+Chinese maintainer: TripleX Chung <triplex@zh-kernel.org>
+---------------------------------------------------------------------
+Documentation/SubmittingPatches 的中文翻译
+
+如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文
+交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻
+译存在问题,请联系中文版维护者。
+
+中文版维护者: 钟宇 TripleX Chung <triplex@zh-kernel.org>
+中文版翻译者: 钟宇 TripleX Chung <triplex@zh-kernel.org>
+中文版校译者: 李阳 Li Yang <leo@zh-kernel.org>
+ 王聪 Wang Cong <xiyou.wangcong@gmail.com>
+
+以下为正文
+---------------------------------------------------------------------
+
+ 如何让你的改动进入内核
+ 或者
+ 获得亲爱的 Linus Torvalds 的关注和处理
+----------------------------------
+
+对于想要将改动提交到 Linux 内核的个人或者公司来说,如果不熟悉“规矩”,
+提交的流程会让人畏惧。本文档收集了一系列建议,这些建议可以大大的提高你
+的改动被接受的机会。
+阅读 Documentation/SubmitChecklist 来获得在提交代码前需要检查的项目的列
+表。如果你在提交一个驱动程序,那么同时阅读一下
+Documentation/SubmittingDrivers 。
+
+
+--------------------------
+第一节 - 创建并发送你的改动
+--------------------------
+
+1) "diff -up"
+-----------
+
+使用 "diff -up" 或者 "diff -uprN" 来创建补丁。
+
+所有内核的改动,都是以补丁的形式呈现的,补丁由 diff(1) 生成。创建补丁的
+时候,要确认它是以 "unified diff" 格式创建的,这种格式由 diff(1) 的 '-u'
+参数生成。而且,请使用 '-p' 参数,那样会显示每个改动所在的C函数,使得
+产生的补丁容易读得多。补丁应该基于内核源代码树的根目录,而不是里边的任
+何子目录。
+为一个单独的文件创建补丁,一般来说这样做就够了:
+
+ SRCTREE= linux-2.6
+ MYFILE= drivers/net/mydriver.c
+
+ cd $SRCTREE
+ cp $MYFILE $MYFILE.orig
+ vi $MYFILE # make your change
+ cd ..
+ diff -up $SRCTREE/$MYFILE{.orig,} > /tmp/patch
+
+为多个文件创建补丁,你可以解开一个没有修改过的内核源代码树,然后和你自
+己的代码树之间做 diff 。例如:
+
+ MYSRC= /devel/linux-2.6
+
+ tar xvfz linux-2.6.12.tar.gz
+ mv linux-2.6.12 linux-2.6.12-vanilla
+ diff -uprN -X linux-2.6.12-vanilla/Documentation/dontdiff \
+ linux-2.6.12-vanilla $MYSRC > /tmp/patch
+
+"dontdiff" 是内核在编译的时候产生的文件的列表,列表中的文件在 diff(1)
+产生的补丁里会被跳过。"dontdiff" 文件被包含在2.6.12和之后版本的内核源代
+码树中。对于更早的内核版本,你可以从
+<http://www.xenotime.net/linux/doc/dontdiff> 获取它。
+确定你的补丁里没有包含任何不属于这次补丁提交的额外文件。记得在用diff(1)
+生成补丁之后,审阅一次补丁,以确保准确。
+如果你的改动很散乱,你应该研究一下如何将补丁分割成独立的部分,将改动分
+割成一系列合乎逻辑的步骤。这样更容易让其他内核开发者审核,如果你想你的
+补丁被接受,这是很重要的。下面这些脚本能够帮助你做这件事情:
+Quilt:
+http://savannah.nongnu.org/projects/quilt
+
+Andrew Morton 的补丁脚本:
+http://www.zip.com.au/~akpm/linux/patches/
+作为这些脚本的替代,quilt 是值得推荐的补丁管理工具(看上面的链接)。
+
+2)描述你的改动。
+描述你的改动包含的技术细节。
+
+要多具体就写多具体。最糟糕的描述可能是像下面这些语句:“更新了某驱动程
+序”,“修正了某驱动程序的bug”,或者“这个补丁包含了某子系统的修改,请
+使用。”
+
+如果你的描述开始变长,这表示你也许需要拆分你的补丁了,请看第3小节,
+继续。
+
+3)拆分你的改动
+
+将改动拆分,逻辑类似的放到同一个补丁文件里。
+
+例如,如果你的改动里同时有bug修正和性能优化,那么把这些改动才分到两个或
+者更多的补丁文件中。如果你的改动包含对API的修改,并且修改了驱动程序来适
+应这些新的API,那么把这些修改分成两个补丁。
+
+另一方面,如果你将一个单独的改动做成多个补丁文件,那么将它们合并成一个
+单独的补丁文件。这样一个逻辑上单独的改动只被包含在一个补丁文件里。
+
+如果有一个补丁依赖另外一个补丁来完成它的改动,那没问题。简单的在你的补
+丁描述里指出“这个补丁依赖某补丁”就好了。
+
+如果你不能将补丁浓缩成更少的文件,那么每次大约发送出15个,然后等待审查
+和整合。
+
+4)选择 e-mail 的收件人
+
+看一遍 MAINTAINERS 文件和源代码,看看你所的改动所在的内核子系统有没有指
+定的维护者。如果有,给他们发e-mail。
+
+如果没有找到维护者,或者维护者没有反馈,将你的补丁发送到内核开发者主邮
+件列表 linux-kernel@vger.kernel.org。大部分的内核开发者都跟踪这个邮件列
+表,可以评价你的改动。
+
+每次不要发送超过15个补丁到 vger 邮件列表!!!
+
+Linus Torvalds 是决定改动能否进入 Linux 内核的最终裁决者。他的 e-mail
+地址是 <torvalds@linux-foundation.org> 。他收到的 e-mail 很多,所以一般
+的说,最好别给他发 e-mail。
+
+那些修正bug,“显而易见”的修改或者是类似的只需要很少讨论的补丁可以直接
+发送或者CC给Linus。那些需要讨论或者没有很清楚的好处的补丁,一般先发送到
+linux-kernel邮件列表。只有当补丁被讨论得差不多了,才提交给Linus。
+
+5)选择CC( e-mail 抄送)列表
+
+除非你有理由不这样做,否则CC linux-kernel@vger.kernel.org。
+
+除了 Linus 之外,其他内核开发者也需要注意到你的改动,这样他们才能评论你
+的改动并提供代码审查和建议。linux-kernel 是 Linux 内核开发者主邮件列表
+。其它的邮件列表为特定的子系统提供服务,比如 USB,framebuffer 设备,虚
+拟文件系统,SCSI 子系统,等等。查看 MAINTAINERS 文件来获得和你的改动有
+关的邮件列表。
+
+Majordomo lists of VGER.KERNEL.ORG at:
+ <http://vger.kernel.org/vger-lists.html>
+
+如果改动影响了用户空间和内核之间的接口,请给 MAN-PAGES 的维护者(列在
+MAITAINERS 文件里的)发送一个手册页(man-pages)补丁,或者至少通知一下改
+变,让一些信息有途径进入手册页。
+
+即使在第四步的时候,维护者没有作出回应,也要确认在修改他们的代码的时候
+,一直将维护者拷贝到CC列表中。
+
+对于小的补丁,你也许会CC到 Adrian Bunk 管理的搜集琐碎补丁的邮件列表
+(Trivial Patch Monkey)trivial@kernel.org,那里专门收集琐碎的补丁。下面这样
+的补丁会被看作“琐碎的”补丁:
+ 文档的拼写修正。
+ 修正会影响到 grep(1) 的拼写。
+ 警告信息修正(频繁的打印无用的警告是不好的。)
+ 编译错误修正(代码逻辑的确是对的,只是编译有问题。)
+ 运行时修正(只要真的修正了错误。)
+ 移除使用了被废弃的函数/宏的代码(例如 check_region。)
+ 联系方式和文档修正。
+ 用可移植的代码替换不可移植的代码(即使在体系结构相关的代码中,既然有
+ 人拷贝,只要它是琐碎的)
+ 任何文件的作者/维护者对该文件的改动(例如 patch monkey 在重传模式下)
+
+URL: <http://www.kernel.org/pub/linux/kernel/people/bunk/trivial/>
+
+(译注,关于“琐碎补丁”的一些说明:因为原文的这一部分写得比较简单,所以不得不
+违例写一下译注。"trivial"这个英文单词的本意是“琐碎的,不重要的。”但是在这里
+有稍微有一些变化,例如对一些明显的NULL指针的修正,属于运行时修正,会被归类
+到琐碎补丁里。虽然NULL指针的修正很重要,但是这样的修正往往很小而且很容易得到
+检验,所以也被归入琐碎补丁。琐碎补丁更精确的归类应该是
+“simple, localized & easy to verify”,也就是说简单的,局部的和易于检验的。
+trivial@kernel.org邮件列表的目的是针对这样的补丁,为提交者提供一个中心,来
+降低提交的门槛。)
+
+6)没有 MIME 编码,没有链接,没有压缩,没有附件,只有纯文本。
+
+Linus 和其他的内核开发者需要阅读和评论你提交的改动。对于内核开发者来说
+,可以“引用”你的改动很重要,使用一般的 e-mail 工具,他们就可以在你的
+代码的任何位置添加评论。
+
+因为这个原因,所有的提交的补丁都是 e-mail 中“内嵌”的。
+警告:如果你使用剪切-粘贴你的补丁,小心你的编辑器的自动换行功能破坏你的
+补丁。
+
+不要将补丁作为 MIME 编码的附件,不管是否压缩。很多流行的 e-mail 软件不
+是任何时候都将 MIME 编码的附件当作纯文本发送的,这会使得别人无法在你的
+代码中加评论。另外,MIME 编码的附件会让 Linus 多花一点时间来处理,这就
+降低了你的改动被接受的可能性。
+
+警告:一些邮件软件,比如 Mozilla 会将你的信息以如下格式发送:
+---- 邮件头 ----
+Content-Type: text/plain; charset=us-ascii; format=flowed
+---- 邮件头 ----
+问题在于 “format=flowed” 会让接收端的某些邮件软件将邮件中的制表符替换
+成空格以及做一些类似的替换。这样,你发送的时候看起来没问题的补丁就被破
+坏了。
+
+要修正这个问题,只需要将你的 mozilla 的 defaults/pref/mailnews.js 文件
+里的
+pref("mailnews.send_plaintext_flowed", false); // RFC 2646=======
+修改成
+pref("mailnews.display.disable_format_flowed_support", true);
+就可以了。
+
+7) e-mail 的大小
+
+给 Linus 发送补丁的时候,永远按照第6小节说的做。
+
+大的改动对邮件列表不合适,对某些维护者也不合适。如果你的补丁,在不压缩
+的情况下,超过了40kB,那么你最好将补丁放在一个能通过 internet 访问的服
+务器上,然后用指向你的补丁的 URL 替代。
+
+8) 指出你的内核版本
+
+在标题和在补丁的描述中,指出补丁对应的内核的版本,是很重要的。
+
+如果补丁不能干净的在最新版本的内核上打上,Linus 是不会接受它的。
+
+9) 不要气馁,继续提交。
+
+当你提交了改动以后,耐心地等待。如果 Linus 喜欢你的改动并且同意它,那么
+它将在下一个内核发布版本中出现。
+
+然而,如果你的改动没有出现在下一个版本的内核中,可能有若干原因。减少那
+些原因,修正错误,重新提交更新后的改动,是你自己的工作。
+
+Linus不给出任何评论就“丢弃”你的补丁是常见的事情。在系统中这样的事情很
+平常。如果他没有接受你的补丁,也许是由于以下原本:
+* 你的补丁不能在最新版本的内核上干净的打上。
+* 你的补丁在 linux-kernel 邮件列表中没有得到充分的讨论。
+* 风格问题(参照第2小节)
+* 邮件格式问题(重读本节)
+* 你的改动有技术问题。
+* 他收到了成吨的 e-mail,而你的在混乱中丢失了。
+* 你让人为难。
+
+有疑问的时候,在 linux-kernel 邮件列表上请求评论。
+
+10) 在标题上加上 PATCH 的字样
+
+Linus 和 linux-kernel 邮件列表的 e-mail 流量都很高,一个通常的约定是标
+题行以 [PATCH] 开头。这样可以让 Linus 和其他内核开发人员可以从 e-mail
+的讨论中很轻易的将补丁分辨出来。
+
+11)为你的工作签名
+
+为了加强对谁做了何事的追踪,尤其是对那些透过好几层的维护者的补丁,我们
+建议在发送出去的补丁上加一个 “sign-off” 的过程。
+
+"sign-off" 是在补丁的注释的最后的简单的一行文字,认证你编写了它或者其他
+人有权力将它作为开放源代码的补丁传递。规则很简单:如果你能认证如下信息
+:
+ 开发者来源证书 1.1
+ 对于本项目的贡献,我认证如下信息:
+ (a)这些贡献是完全或者部分的由我创建,我有权利以文件中指出
+ 的开放源代码许可证提交它;或者
+ (b)这些贡献基于以前的工作,据我所知,这些以前的工作受恰当的开放
+ 源代码许可证保护,而且,根据许可证,我有权提交修改后的贡献,
+ 无论是完全还是部分由我创造,这些贡献都使用同一个开放源代码许可证
+ (除非我被允许用其它的许可证),正如文件中指出的;或者
+ (c)这些贡献由认证(a),(b)或者(c)的人直接提供给我,而
+ 且我没有修改它。
+ (d)我理解并同意这个项目和贡献是公开的,贡献的记录(包括我
+ 一起提交的个人记录,包括 sign-off )被永久维护并且可以和这个项目
+ 或者开放源代码的许可证同步地再发行。
+ 那么加入这样一行:
+ Signed-off-by: Random J Developer <random@developer.example.org>
+
+使用你的真名(抱歉,不能使用假名或者匿名。)
+
+有人在最后加上标签。现在这些东西会被忽略,但是你可以这样做,来标记公司
+内部的过程,或者只是指出关于 sign-off 的一些特殊细节。
+
+12)标准补丁格式
+
+标准的补丁,标题行是:
+ Subject: [PATCH 001/123] 子系统:一句话概述
+
+标准补丁的信体存在如下部分:
+
+ - 一个 "from" 行指出补丁作者。
+
+ - 一个空行
+
+ - 说明的主体,这些说明文字会被拷贝到描述该补丁的永久改动记录里。
+
+ - 一个由"---"构成的标记行
+
+ - 不合适放到改动记录里的额外的注解。
+
+ - 补丁本身(diff 输出)
+
+标题行的格式,使得对标题行按字母序排序非常的容易 - 很多 e-mail 客户端都
+可以支持 - 因为序列号是用零填充的,所以按数字排序和按字母排序是一样的。
+
+e-mail 标题中的“子系统”标识哪个内核子系统将被打补丁。
+
+e-mail 标题中的“一句话概述”扼要的描述 e-mail 中的补丁。“一句话概述”
+不应该是一个文件名。对于一个补丁系列(“补丁系列”指一系列的多个相关补
+丁),不要对每个补丁都使用同样的“一句话概述”。
+
+记住 e-mail 的“一句话概述”会成为该补丁的全局唯一标识。它会蔓延到 git
+的改动记录里。然后“一句话概述”会被用在开发者的讨论里,用来指代这个补
+丁。用户将希望通过 google 来搜索"一句话概述"来找到那些讨论这个补丁的文
+章。
+
+一些标题的例子:
+
+ Subject: [patch 2/5] ext2: improve scalability of bitmap searching
+ Subject: [PATCHv2 001/207] x86: fix eflags tracking
+
+"from" 行是信体里的最上面一行,具有如下格式:
+ From: Original Author <author@example.com>
+
+"from" 行指明在永久改动日志里,谁会被确认为作者。如果没有 "from" 行,那
+么邮件头里的 "From: " 行会被用来决定改动日志中的作者。
+
+说明的主题将会被提交到永久的源代码改动日志里,因此对那些早已经不记得和
+这个补丁相关的讨论细节的有能力的读者来说,是有意义的。
+
+"---" 标记行对于补丁处理工具要找到哪里是改动日志信息的结束,是不可缺少
+的。
+
+对于 "---" 标记之后的额外注解,一个好的用途就是用来写 diffstat,用来显
+示修改了什么文件和每个文件都增加和删除了多少行。diffstat 对于比较大的补
+丁特别有用。其余那些只是和时刻或者开发者相关的注解,不合适放到永久的改
+动日志里的,也应该放这里。
+使用 diffstat的选项 "-p 1 -w 70" 这样文件名就会从内核源代码树的目录开始
+,不会占用太宽的空间(很容易适合80列的宽度,也许会有一些缩进。)
+
+在后面的参考资料中能看到适当的补丁格式的更多细节。
+
+-------------------------------
+第二节 提示,建议和诀窍
+-------------------------------
+
+本节包含很多和提交到内核的代码有关的通常的"规则"。事情永远有例外...但是
+你必须真的有好的理由这样做。你可以把本节叫做Linus的计算机科学入门课。
+
+1) 读 Document/CodingStyle
+
+Nuff 说过,如果你的代码和这个偏离太多,那么它有可能会被拒绝,没有更多的
+审查,没有更多的评价。
+
+2) #ifdef 是丑陋的
+混杂了 ifdef 的代码难以阅读和维护。别这样做。作为替代,将你的 ifdef 放
+在头文件里,有条件地定义 "static inline" 函数,或者宏,在代码里用这些东
+西。让编译器把那些"空操作"优化掉。
+
+一个简单的例子,不好的代码:
+
+ dev = alloc_etherdev (sizeof(struct funky_private));
+ if (!dev)
+ return -ENODEV;
+ #ifdef CONFIG_NET_FUNKINESS
+ init_funky_net(dev);
+ #endif
+
+清理后的例子:
+
+(头文件里)
+ #ifndef CONFIG_NET_FUNKINESS
+ static inline void init_funky_net (struct net_device *d) {}
+ #endif
+
+(代码文件里)
+ dev = alloc_etherdev (sizeof(struct funky_private));
+ if (!dev)
+ return -ENODEV;
+ init_funky_net(dev);
+
+3) 'static inline' 比宏好
+
+Static inline 函数相比宏来说,是好得多的选择。Static inline 函数提供了
+类型安全,没有长度限制,没有格式限制,在 gcc 下开销和宏一样小。
+
+宏只在 static inline 函数不是最优的时候[在 fast paths 里有很少的独立的
+案例],或者不可能用 static inline 函数的时候[例如字符串分配]。
+应该用 'static inline' 而不是 'static __inline__', 'extern inline' 和
+'extern __inline__' 。
+
+4) 不要过度设计
+
+不要试图预计模糊的未来事情,这些事情也许有用也许没有用:"让事情尽可能的
+简单,而不是更简单"。
+
+----------------
+第三节 参考文献
+----------------
+
+Andrew Morton, "The perfect patch" (tpp).
+ <http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt>
+
+Jeff Garzik, "Linux kernel patch submission format".
+ <http://linux.yyz.us/patch-format.html>
+
+Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer".
+ <http://www.kroah.com/log/2005/03/31/>
+ <http://www.kroah.com/log/2005/07/08/>
+ <http://www.kroah.com/log/2005/10/19/>
+ <http://www.kroah.com/log/2006/01/11/>
+
+NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people!
+ <http://marc.theaimsgroup.com/?l=linux-kernel&m=112112749912944&w=2>
+
+Kernel Documentation/CodingStyle:
+ <http://sosdg.org/~coywolf/lxr/source/Documentation/CodingStyle>
+
+Linus Torvalds's mail on the canonical patch format:
+ <http://lkml.org/lkml/2005/4/7/183>
+--
diff --git a/Documentation/zh_CN/oops-tracing.txt b/Documentation/zh_CN/oops-tracing.txt
new file mode 100644
index 0000000..9312608
--- /dev/null
+++ b/Documentation/zh_CN/oops-tracing.txt
@@ -0,0 +1,212 @@
+Chinese translated version of Documentation/oops-tracing.txt
+
+If you have any comment or update to the content, please contact the
+original document maintainer directly. However, if you have a problem
+communicating in English you can also ask the Chinese maintainer for
+help. Contact the Chinese maintainer if this translation is outdated
+or if there is a problem with the translation.
+
+Chinese maintainer: Dave Young <hidave.darkstar@gmail.com>
+---------------------------------------------------------------------
+Documentation/oops-tracing.txt 的中文翻译
+
+如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文
+交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻
+译存在问题,请联系中文版维护者。
+
+中文版维护者: 杨瑞 Dave Young <hidave.darkstar@gmail.com>
+中文版翻译者: 杨瑞 Dave Young <hidave.darkstar@gmail.com>
+中文版校译者: 李阳 Li Yang <leo@zh-kernel.org>
+ 王聪 Wang Cong <xiyou.wangcong@gmail.com>
+
+以下为正文
+---------------------------------------------------------------------
+
+注意: ksymoops 在2.6中是没有用的。 请以原有格式使用Oops(来自dmesg,等等)。
+忽略任何这样那样关于“解码Oops”或者“通过ksymoops运行”的文档。 如果你贴出运行过
+ksymoops的来自2.6的Oops,人们只会让你重贴一次。
+
+快速总结
+-------------
+
+发现Oops并发送给看似相关的内核领域的维护者。别太担心对不上号。如果你不确定就发给
+和你所做的事情相关的代码的负责人。 如果可重现试着描述怎样重构。 那甚至比oops更有
+价值。
+
+如果你对于发送给谁一无所知, 发给linux-kernel@vger.kernel.org。感谢你帮助Linux
+尽可能地稳定。
+
+Oops在哪里?
+----------------------
+
+通常Oops文本由klogd从内核缓冲区里读取并传给syslogd,由syslogd写到syslog文件中,
+典型地是/var/log/messages(依赖于/etc/syslog.conf)。有时klogd崩溃了,这种情况下你
+能够运行dmesg > file来从内核缓冲区中读取数据并保存下来。 否则你可以
+cat /proc/kmsg > file, 然而你必须介入中止传输, kmsg是一个“永不结束的文件”。如
+果机器崩溃坏到你不能输入命令或者磁盘不可用那么你有三种选择:-
+
+(1) 手抄屏幕上的文本待机器重启后再输入计算机。 麻烦但如果没有针对崩溃的准备,
+这是仅有的选择。 另外,你可以用数码相机把屏幕拍下来-不太好,但比没有强。 如果信
+息滚动到了终端的上面,你会发现以高分辩率启动(比如,vga=791)会让你读到更多的文
+本。(注意:这需要vesafb,所以对‘早期’的oops没有帮助)
+
+(2)用串口终端启动(请参看Documentation/serial-console.txt),运行一个null
+modem到另一台机器并用你喜欢的通讯工具获取输出。Minicom工作地很好。
+
+(3)使用Kdump(请参看Documentation/kdump/kdump.txt),
+使用在Documentation/kdump/gdbmacros.txt中定义的dmesg gdb宏,从旧的内存中提取内核
+环形缓冲区。
+
+完整信息
+----------------
+
+注意:以下来自于Linus的邮件适用于2.4内核。 我因为历史原因保留了它,并且因为其中
+一些信息仍然适用。 特别注意的是,请忽略任何ksymoops的引用。
+
+From: Linus Torvalds <torvalds@osdl.org>
+
+怎样跟踪Oops.. [原发到linux-kernel的一封邮件]
+
+主要的窍门是有五年和这些烦人的oops消息打交道的经验;-)
+
+实际上,你有办法使它更简单。我有两个不同的方法:
+
+ gdb /usr/src/linux/vmlinux
+ gdb> disassemble <offending_function>
+
+那是发现问题的简单办法,至少如果bug报告做的好的情况下(象这个一样-运行ksymoops
+得到oops发生的函数及函数内的偏移)。
+
+哦,如果报告发生的内核以相同的编译器和相似的配置编译它会有帮助的。
+
+另一件要做的事是反汇编bug报告的“Code”部分:ksymoops也会用正确的工具来做这件事,
+但如果没有那些工具你可以写一个傻程序:
+
+ char str[] = "\xXX\xXX\xXX...";
+ main(){}
+
+并用gcc -g编译它然后执行“disassemble str”(XX部分是由Oops报告的值-你可以仅剪切
+粘贴并用“\x”替换空格-我就是这么做的,因为我懒得写程序自动做这一切)。
+
+另外,你可以用scripts/decodecode这个shell脚本。它的使用方法是:
+decodecode < oops.txt
+
+“Code”之后的十六进制字节可能(在某些架构上)有一些当前指令之前的指令字节以及
+当前和之后的指令字节
+
+Code: f9 0f 8d f9 00 00 00 8d 42 0c e8 dd 26 11 c7 a1 60 ea 2b f9 8b 50 08 a1
+64 ea 2b f9 8d 34 82 8b 1e 85 db 74 6d 8b 15 60 ea 2b f9 <8b> 43 04 39 42 54
+7e 04 40 89 42 54 8b 43 04 3b 05 00 f6 52 c0
+
+最后,如果你想知道代码来自哪里,你可以:
+
+ cd /usr/src/linux
+ make fs/buffer.s # 或任何产生BUG的文件
+
+然后你会比gdb反汇编更清楚的知道发生了什么。
+
+现在,问题是把你所拥有的所有数据结合起来:C源码(关于它应该怎样的一般知识),
+汇编代码及其反汇编得到的代码(另外还有从“oops”消息得到的寄存器状态-对了解毁坏的
+指针有用,而且当你有了汇编代码你也能拿其它的寄存器和任何它们对应的C表达式做匹配
+)。
+
+实际上,你仅需看看哪里不匹配(这个例子是“Code”反汇编和编译器生成的代码不匹配)。
+然后你须要找出为什么不匹配。通常很简单-你看到代码使用了空指针然后你看代码想知道
+空指针是怎么出现的,还有检查它是否合法..
+
+现在,如果明白这是一项耗时的工作而且需要一丁点儿的专心,没错。这就是我为什么大多
+只是忽略那些没有符号表信息的崩溃报告的原因:简单的说太难查找了(我有一些
+程序用于在内核代码段中搜索特定的模式,而且有时我也已经能找出那些崩溃的地方,但是
+仅仅是找出正确的序列也确实需要相当扎实的内核知识)
+
+_有时_会发生这种情况,我仅看到崩溃中的反汇编代码序列, 然后我马上就明白问题出在
+哪里。这时我才意识到自己干这个工作已经太长时间了;-)
+
+ Linus
+
+
+---------------------------------------------------------------------------
+关于Oops跟踪的注解:
+
+为了帮助Linus和其它内核开发者,klogd纳入了大量的支持来处理保护错误。为了拥有对
+地址解析的完整支持至少应该使用1.3-pl3的sysklogd包。
+
+当保护错误发生时,klogd守护进程自动把内核日志信息中的重要地址翻译成它们相应的符
+号。
+
+klogd执行两种类型的地址解析。首先是静态翻译其次是动态翻译。静态翻译和ksymoops
+一样使用System.map文件。为了做静态翻译klogd守护进程必须在初始化时能找到system
+map文件。关于klogd怎样搜索map文件请参看klogd手册页。
+
+动态地址翻译在使用内核可装载模块时很重要。 因为内核模块的内存是从内核动态内存池
+里分配的,所以不管是模块开始位置还是模块中函数和符号的位置都不是固定的。
+
+内核支持允许程序决定装载哪些模块和它们在内存中位置的系统调用。使用这些系统调用
+klogd守护进程生成一张符号表用于调试发生在可装载模块中的保护错误。
+
+至少klogd会提供产生保护错误的模块名。还可有额外的符号信息供可装载模块开发者选择
+以从模块中输出符号信息。
+
+因为内核模块环境可能是动态的,所以必须有一种机制当模块环境发生改变时来通知klogd
+守护进程。 有一些可用的命令行选项允许klogd向当前执行中的守护进程发送信号,告知符
+号信息应该被刷新了。 更多信息请参看klogd手册页。
+
+sysklogd发布时包含一个补丁修改了modules-2.0.0包,无论何时一个模块装载或者卸载都
+会自动向klogd发送信号。打上这个补丁提供了必要的对调试发生于内核可装载模块的保护
+错误的无缝支持。
+
+以下是被klogd处理过的发生在可装载模块中的一个保护错误例子:
+---------------------------------------------------------------------------
+Aug 29 09:51:01 blizard kernel: Unable to handle kernel paging request at virtual address f15e97cc
+Aug 29 09:51:01 blizard kernel: current->tss.cr3 = 0062d000, %cr3 = 0062d000
+Aug 29 09:51:01 blizard kernel: *pde = 00000000
+Aug 29 09:51:01 blizard kernel: Oops: 0002
+Aug 29 09:51:01 blizard kernel: CPU: 0
+Aug 29 09:51:01 blizard kernel: EIP: 0010:[oops:_oops+16/3868]
+Aug 29 09:51:01 blizard kernel: EFLAGS: 00010212
+Aug 29 09:51:01 blizard kernel: eax: 315e97cc ebx: 003a6f80 ecx: 001be77b edx: 00237c0c
+Aug 29 09:51:01 blizard kernel: esi: 00000000 edi: bffffdb3 ebp: 00589f90 esp: 00589f8c
+Aug 29 09:51:01 blizard kernel: ds: 0018 es: 0018 fs: 002b gs: 002b ss: 0018
+Aug 29 09:51:01 blizard kernel: Process oops_test (pid: 3374, process nr: 21, stackpage=00589000)
+Aug 29 09:51:01 blizard kernel: Stack: 315e97cc 00589f98 0100b0b4 bffffed4 0012e38e 00240c64 003a6f80 00000001
+Aug 29 09:51:01 blizard kernel: 00000000 00237810 bfffff00 0010a7fa 00000003 00000001 00000000 bfffff00
+Aug 29 09:51:01 blizard kernel: bffffdb3 bffffed4 ffffffda 0000002b 0007002b 0000002b 0000002b 00000036
+Aug 29 09:51:01 blizard kernel: Call Trace: [oops:_oops_ioctl+48/80] [_sys_ioctl+254/272] [_system_call+82/128]
+Aug 29 09:51:01 blizard kernel: Code: c7 00 05 00 00 00 eb 08 90 90 90 90 90 90 90 90 89 ec 5d c3
+---------------------------------------------------------------------------
+
+Dr. G.W. Wettstein Oncology Research Div. Computing Facility
+Roger Maris Cancer Center INTERNET: greg@wind.rmcc.com
+820 4th St. N.
+Fargo, ND 58122
+Phone: 701-234-7556
+
+
+---------------------------------------------------------------------------
+受污染的内核
+
+一些oops报告在程序记数器之后包含字符串'Tainted: '。这表明内核已经被一些东西给污
+染了。 该字符串之后紧跟着一系列的位置敏感的字符,每个代表一个特定的污染值。
+
+ 1:'G'如果所有装载的模块都有GPL或相容的许可证,'P'如果装载了任何的专有模块。
+没有模块MODULE_LICENSE或者带有insmod认为是与GPL不相容的的MODULE_LICENSE的模块被
+认定是专有的。
+
+ 2:'F'如果有任何通过“insmod -f”被强制装载的模块,' '如果所有模块都被正常装载。
+
+ 3:'S'如果oops发生在SMP内核中,运行于没有证明安全运行多处理器的硬件。 当前这种
+情况仅限于几种不支持SMP的速龙处理器。
+
+ 4:'R'如果模块通过“insmod -f”被强制装载,' '如果所有模块都被正常装载。
+
+ 5:'M'如果任何处理器报告了机器检查异常,' '如果没有发生机器检查异常。
+
+ 6:'B'如果页释放函数发现了一个错误的页引用或者一些非预期的页标志。
+
+ 7:'U'如果用户或者用户应用程序特别请求设置污染标志,否则' '。
+
+ 8:'D'如果内核刚刚死掉,比如有OOPS或者BUG。
+
+使用'Tainted: '字符串的主要原因是要告诉内核调试者,这是否是一个干净的内核亦或发
+生了任何的不正常的事。污染是永久的:即使出错的模块已经被卸载了,污染值仍然存在,
+以表明内核不再值得信任。
diff --git a/Documentation/zh_CN/sparse.txt b/Documentation/zh_CN/sparse.txt
new file mode 100644
index 0000000..75992a6
--- /dev/null
+++ b/Documentation/zh_CN/sparse.txt
@@ -0,0 +1,100 @@
+Chinese translated version of Documentation/sparse.txt
+
+If you have any comment or update to the content, please contact the
+original document maintainer directly. However, if you have a problem
+communicating in English you can also ask the Chinese maintainer for
+help. Contact the Chinese maintainer if this translation is outdated
+or if there is a problem with the translation.
+
+Chinese maintainer: Li Yang <leo@zh-kernel.org>
+---------------------------------------------------------------------
+Documentation/sparse.txt 的中文翻译
+
+如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文
+交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻
+译存在问题,请联系中文版维护者。
+
+中文版维护者: 李阳 Li Yang <leo@zh-kernel.org>
+中文版翻译者: 李阳 Li Yang <leo@zh-kernel.org>
+
+
+以下为正文
+---------------------------------------------------------------------
+
+Copyright 2004 Linus Torvalds
+Copyright 2004 Pavel Machek <pavel@suse.cz>
+Copyright 2006 Bob Copeland <me@bobcopeland.com>
+
+使用 sparse 工具做类型检查
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+"__bitwise" 是一种类型属性,所以你应该这样使用它:
+
+ typedef int __bitwise pm_request_t;
+
+ enum pm_request {
+ PM_SUSPEND = (__force pm_request_t) 1,
+ PM_RESUME = (__force pm_request_t) 2
+ };
+
+这样会使 PM_SUSPEND 和 PM_RESUME 成为位方式(bitwise)整数(使用"__force"
+是因为 sparse 会抱怨改变位方式的类型转换,但是这里我们确实需要强制进行转
+换)。而且因为所有枚举值都使用了相同的类型,这里的"enum pm_request"也将
+会使用那个类型做为底层实现。
+
+而且使用 gcc 编译的时候,所有的 __bitwise/__force 都会消失,最后在 gcc
+看来它们只不过是普通的整数。
+
+坦白来说,你并不需要使用枚举类型。上面那些实际都可以浓缩成一个特殊的"int
+__bitwise"类型。
+
+所以更简单的办法只要这样做:
+
+ typedef int __bitwise pm_request_t;
+
+ #define PM_SUSPEND ((__force pm_request_t) 1)
+ #define PM_RESUME ((__force pm_request_t) 2)
+
+现在你就有了严格的类型检查所需要的所有基础架构。
+
+一个小提醒:常数整数"0"是特殊的。你可以直接把常数零当作位方式整数使用而
+不用担心 sparse 会抱怨。这是因为"bitwise"(恰如其名)是用来确保不同位方
+式类型不会被弄混(小尾模式,大尾模式,cpu尾模式,或者其他),对他们来说
+常数"0"确实是特殊的。
+
+获取 sparse 工具
+~~~~~~~~~~~~~~~~
+
+你可以从 Sparse 的主页获取最新的发布版本:
+
+ http://www.kernel.org/pub/linux/kernel/people/josh/sparse/
+
+或者,你也可以使用 git 克隆最新的 sparse 开发版本:
+
+ git://git.kernel.org/pub/scm/linux/kernel/git/josh/sparse.git
+
+DaveJ 把每小时自动生成的 git 源码树 tar 包放在以下地址:
+
+ http://www.codemonkey.org.uk/projects/git-snapshots/sparse/
+
+一旦你下载了源码,只要以普通用户身份运行:
+
+ make
+ make install
+
+它将会被自动安装到你的 ~/bin 目录下。
+
+使用 sparse 工具
+~~~~~~~~~~~~~~~~
+
+用"make C=1"命令来编译内核,会对所有重新编译的 C 文件使用 sparse 工具。
+或者使用"make C=2"命令,无论文件是否被重新编译都会对其使用 sparse 工具。
+如果你已经编译了内核,用后一种方式可以很快地检查整个源码树。
+
+make 的可选变量 CHECKFLAGS 可以用来向 sparse 工具传递参数。编译系统会自
+动向 sparse 工具传递 -Wbitwise 参数。你可以定义 __CHECK_ENDIAN__ 来进行
+大小尾检查。
+
+ make C=2 CHECKFLAGS="-D__CHECK_ENDIAN__"
+
+这些检查默认都是被关闭的,因为他们通常会产生大量的警告。
diff --git a/Documentation/zh_CN/stable_api_nonsense.txt b/Documentation/zh_CN/stable_api_nonsense.txt
new file mode 100644
index 0000000..c26a27d
--- /dev/null
+++ b/Documentation/zh_CN/stable_api_nonsense.txt
@@ -0,0 +1,157 @@
+Chinese translated version of Documentation/stable_api_nonsense.txt
+
+If you have any comment or update to the content, please contact the
+original document maintainer directly. However, if you have problem
+communicating in English you can also ask the Chinese maintainer for help.
+Contact the Chinese maintainer, if this translation is outdated or there
+is problem with translation.
+
+Maintainer: Greg Kroah-Hartman <greg@kroah.com>
+Chinese maintainer: TripleX Chung <zhongyu@18mail.cn>
+---------------------------------------------------------------------
+Documentation/stable_api_nonsense.txt 的中文翻译
+
+如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文
+交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻
+译存在问题,请联系中文版维护者。
+
+英文版维护者: Greg Kroah-Hartman <greg@kroah.com>
+中文版维护者: 钟宇 TripleX Chung <zhongyu@18mail.cn>
+中文版翻译者: 钟宇 TripleX Chung <zhongyu@18mail.cn>
+中文版校译者: 李阳 Li Yang <leoli@freescale.com>
+以下为正文
+---------------------------------------------------------------------
+
+写作本文档的目的,是为了解释为什么Linux既没有二进制内核接口,也没有稳定
+的内核接口。这里所说的内核接口,是指内核里的接口,而不是内核和用户空间
+的接口。内核到用户空间的接口,是提供给应用程序使用的系统调用,系统调用
+在历史上几乎没有过变化,将来也不会有变化。我有一些老应用程序是在0.9版本
+或者更早版本的内核上编译的,在使用2.6版本内核的Linux发布上依然用得很好
+。用户和应用程序作者可以将这个接口看成是稳定的。
+
+
+执行纲要
+--------
+
+你也许以为自己想要稳定的内核接口,但是你不清楚你要的实际上不是它。你需
+要的其实是稳定的驱动程序,而你只有将驱动程序放到公版内核的源代码树里,
+才有可能达到这个目的。而且这样做还有很多其它好处,正是因为这些好处使得
+Linux能成为强壮,稳定,成熟的操作系统,这也是你最开始选择Linux的原因。
+
+
+入门
+-----
+
+只有那些写驱动程序的“怪人”才会担心内核接口的改变,对广大用户来说,既
+看不到内核接口,也不需要去关心它。
+
+首先,我不打算讨论关于任何非GPL许可的内核驱动的法律问题,这些非GPL许可
+的驱动程序包括不公开源代码,隐藏源代码,二进制或者是用源代码包装,或者
+是其它任何形式的不能以GPL许可公开源代码的驱动程序。如果有法律问题,请咨
+询律师,我只是一个程序员,所以我只打算探讨技术问题(不是小看法律问题,
+法律问题很实际,并且需要一直关注)。
+
+既然只谈技术问题,我们就有了下面两个主题:二进制内核接口和稳定的内核源
+代码接口。这两个问题是互相关联的,让我们先解决掉二进制接口的问题。
+
+
+二进制内核接口
+--------------
+假如我们有一个稳定的内核源代码接口,那么自然而然的,我们就拥有了稳定的
+二进制接口,是这样的吗?错。让我们看看关于Linux内核的几点事实:
+ - 取决于所用的C编译器的版本,不同的内核数据结构里的结构体的对齐方
+式会有差别,代码中不同函数的表现形式也不一样(函数是不是被inline编译取
+决于编译器行为)。不同的函数的表现形式并不重要,但是数据结构内部的对齐
+方式很关键。
+ - 取决于内核的配置选项,不同的选项会让内核的很多东西发生改变:
+ - 同一个结构体可能包含不同的成员变量
+ - 有的函数可能根本不会被实现(比如编译的时候没有选择SMP支持
+,一些锁函数就会被定义成空函数)。
+ - 内核使用的内存会以不同的方式对齐,这取决于不同的内核配置选
+项。
+ - Linux可以在很多的不同体系结构的处理器上运行。在某个体系结构上编
+译好的二进制驱动程序,不可能在另外一个体系结构上正确的运行。
+
+对于一个特定的内核,满足这些条件并不难,使用同一个C编译器和同样的内核配
+置选项来编译驱动程序模块就可以了。这对于给一个特定Linux发布的特定版本提
+供驱动程序,是完全可以满足需求的。但是如果你要给不同发布的不同版本都发
+布一个驱动程序,就需要在每个发布上用不同的内核设置参数都编译一次内核,
+这简直跟噩梦一样。而且还要注意到,每个Linux发布还提供不同的Linux内核,
+这些内核都针对不同的硬件类型进行了优化(有很多种不同的处理器,还有不同
+的内核设置选项)。所以每发布一次驱动程序,都需要提供很多不同版本的内核
+模块。
+
+相信我,如果你真的要采取这种发布方式,一定会慢慢疯掉,我很久以前就有过
+深刻的教训...
+
+
+稳定的内核源代码接口
+--------------------
+
+如果有人不将他的内核驱动程序,放入公版内核的源代码树,而又想让驱动程序
+一直保持在最新的内核中可用,那么这个话题将会变得没完没了。
+ 内核开发是持续而且快节奏的,从来都不会慢下来。内核开发人员在当前接口中
+找到bug,或者找到更好的实现方式。一旦发现这些,他们就很快会去修改当前的
+接口。修改接口意味着,函数名可能会改变,结构体可能被扩充或者删减,函数
+的参数也可能发生改变。一旦接口被修改,内核中使用这些接口的地方需要同时
+修正,这样才能保证所有的东西继续工作。
+
+举一个例子,内核的USB驱动程序接口在USB子系统的整个生命周期中,至少经历
+了三次重写。这些重写解决以下问题:
+ - 把数据流从同步模式改成非同步模式,这个改动减少了一些驱动程序的
+复杂度,提高了所有USB驱动程序的吞吐率,这样几乎所有的USB设备都能以最大
+速率工作了。
+ - 修改了USB核心代码中为USB驱动分配数据包内存的方式,所有的驱动都
+需要提供更多的参数给USB核心,以修正了很多已经被记录在案的死锁。
+
+这和一些封闭源代码的操作系统形成鲜明的对比,在那些操作系统上,不得不额
+外的维护旧的USB接口。这导致了一个可能性,新的开发者依然会不小心使用旧的
+接口,以不恰当的方式编写代码,进而影响到操作系统的稳定性。
+ 在上面的例子中,所有的开发者都同意这些重要的改动,在这样的情况下修改代
+价很低。如果Linux保持一个稳定的内核源代码接口,那么就得创建一个新的接口
+;旧的,有问题的接口必须一直维护,给Linux USB开发者带来额外的工作。既然
+所有的Linux USB驱动的作者都是利用自己的时间工作,那么要求他们去做毫无意
+义的免费额外工作,是不可能的。
+ 安全问题对Linux来说十分重要。一个安全问题被发现,就会在短时间内得到修
+正。在很多情况下,这将导致Linux内核中的一些接口被重写,以从根本上避免安
+全问题。一旦接口被重写,所有使用这些接口的驱动程序,必须同时得到修正,
+以确定安全问题已经得到修复并且不可能在未来还有同样的安全问题。如果内核
+内部接口不允许改变,那么就不可能修复这样的安全问题,也不可能确认这样的
+安全问题以后不会发生。
+开发者一直在清理内核接口。如果一个接口没有人在使用了,它就会被删除。这
+样可以确保内核尽可能的小,而且所有潜在的接口都会得到尽可能完整的测试
+(没有人使用的接口是不可能得到良好的测试的)。
+
+
+要做什么
+-------
+
+如果你写了一个Linux内核驱动,但是它还不在Linux源代码树里,作为一个开发
+者,你应该怎么做?为每个发布的每个版本提供一个二进制驱动,那简直是一个
+噩梦,要跟上永远处于变化之中的内核接口,也是一件辛苦活。
+很简单,让你的驱动进入内核源代码树(要记得我们在谈论的是以GPL许可发行
+的驱动,如果你的代码不符合GPL,那么祝你好运,你只能自己解决这个问题了,
+你这个吸血鬼<把Andrew和Linus对吸血鬼的定义链接到这里>)。当你的代码加入
+公版内核源代码树之后,如果一个内核接口改变,你的驱动会直接被修改接口的
+那个人修改。保证你的驱动永远都可以编译通过,并且一直工作,你几乎不需要
+做什么事情。
+
+把驱动放到内核源代码树里会有很多的好处:
+ - 驱动的质量会提升,而维护成本(对原始作者来说)会下降。
+ - 其他人会给驱动添加新特性。
+ - 其他人会找到驱动中的bug并修复。
+ - 其他人会在驱动中找到性能优化的机会。
+ - 当外部的接口的改变需要修改驱动程序的时候,其他人会修改驱动程序
+。
+ - 不需要联系任何发行商,这个驱动会自动的随着所有的Linux发布一起发
+布。
+
+和别的操作系统相比,Linux为更多不同的设备提供现成的驱动,而且能在更多不
+同体系结构的处理器上支持这些设备。这个经过考验的开发模式,必然是错不了
+的 :)
+
+-------------
+感谢 Randy Dunlap, Andrew Morton, David Brownell, Hanna Linder,
+Robert Love, and Nishanth Aravamudan 对于本文档早期版本的评审和建议。
+
+英文版维护者: Greg Kroah-Hartman <greg@kroah.com>
diff --git a/Documentation/zh_CN/stable_kernel_rules.txt b/Documentation/zh_CN/stable_kernel_rules.txt
new file mode 100644
index 0000000..b5b9b0a
--- /dev/null
+++ b/Documentation/zh_CN/stable_kernel_rules.txt
@@ -0,0 +1,66 @@
+Chinese translated version of Documentation/stable_kernel_rules.txt
+
+If you have any comment or update to the content, please contact the
+original document maintainer directly. However, if you have a problem
+communicating in English you can also ask the Chinese maintainer for
+help. Contact the Chinese maintainer if this translation is outdated
+or if there is a problem with the translation.
+
+Chinese maintainer: TripleX Chung <triplex@zh-kernel.org>
+---------------------------------------------------------------------
+Documentation/stable_kernel_rules.txt 的中文翻译
+
+如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文
+交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻
+译存在问题,请联系中文版维护者。
+
+
+中文版维护者: 钟宇 TripleX Chung <triplex@zh-kernel.org>
+中文版翻译者: 钟宇 TripleX Chung <triplex@zh-kernel.org>
+中文版校译者: 李阳 Li Yang <leo@zh-kernel.org>
+ Kangkai Yin <e12051@motorola.com>
+
+以下为正文
+---------------------------------------------------------------------
+
+关于Linux 2.6稳定版发布,所有你想知道的事情。
+
+关于哪些类型的补丁可以被接收进入稳定版代码树,哪些不可以的规则:
+
+ - 必须是显而易见的正确,并且经过测试的。
+ - 连同上下文,不能大于100行。
+ - 必须只修正一件事情。
+ - 必须修正了一个给大家带来麻烦的真正的bug(不是“这也许是一个问题...”
+ 那样的东西)。
+ - 必须修正带来如下后果的问题:编译错误(对被标记为CONFIG_BROKEN的例外),
+ 内核崩溃,挂起,数据损坏,真正的安全问题,或者一些类似“哦,这不
+ 好”的问题。简短的说,就是一些致命的问题。
+ - 没有“理论上的竞争条件”,除非能给出竞争条件如何被利用的解释。
+ - 不能存在任何的“琐碎的”修正(拼写修正,去掉多余空格之类的)。
+ - 必须被相关子系统的维护者接受。
+ - 必须遵循Documentation/SubmittingPatches里的规则。
+
+向稳定版代码树提交补丁的过程:
+
+ - 在确认了补丁符合以上的规则后,将补丁发送到stable@kernel.org。
+ - 如果补丁被接受到队列里,发送者会收到一个ACK回复,如果没有被接受,收
+ 到的是NAK回复。回复需要几天的时间,这取决于开发者的时间安排。
+ - 被接受的补丁会被加到稳定版本队列里,等待其他开发者的审查。
+ - 安全方面的补丁不要发到这个列表,应该发送到security@kernel.org。
+
+审查周期:
+
+ - 当稳定版的维护者决定开始一个审查周期,补丁将被发送到审查委员会,以
+ 及被补丁影响的领域的维护者(除非提交者就是该领域的维护者)并且抄送
+ 到linux-kernel邮件列表。
+ - 审查委员会有48小时的时间,用来决定给该补丁回复ACK还是NAK。
+ - 如果委员会中有成员拒绝这个补丁,或者linux-kernel列表上有人反对这个
+ 补丁,并提出维护者和审查委员会之前没有意识到的问题,补丁会从队列中
+ 丢弃。
+ - 在审查周期结束的时候,那些得到ACK回应的补丁将会被加入到最新的稳定版
+ 发布中,一个新的稳定版发布就此产生。
+ - 安全性补丁将从内核安全小组那里直接接收到稳定版代码树中,而不是通过
+ 通常的审查周期。请联系内核安全小组以获得关于这个过程的更多细节。
+
+审查委员会:
+ - 由一些自愿承担这项任务的内核开发者,和几个非志愿的组成。
diff --git a/Documentation/zh_CN/volatile-considered-harmful.txt b/Documentation/zh_CN/volatile-considered-harmful.txt
new file mode 100644
index 0000000..ba8149d
--- /dev/null
+++ b/Documentation/zh_CN/volatile-considered-harmful.txt
@@ -0,0 +1,113 @@
+Chinese translated version of Documentation/volatile-considered-harmful.txt
+
+If you have any comment or update to the content, please contact the
+original document maintainer directly. However, if you have a problem
+communicating in English you can also ask the Chinese maintainer for
+help. Contact the Chinese maintainer if this translation is outdated
+or if there is a problem with the translation.
+
+Maintainer: Jonathan Corbet <corbet@lwn.net>
+Chinese maintainer: Bryan Wu <bryan.wu@analog.com>
+---------------------------------------------------------------------
+Documentation/volatile-considered-harmful.txt 的中文翻译
+
+如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文
+交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻
+译存在问题,请联系中文版维护者。
+
+英文版维护者: Jonathan Corbet <corbet@lwn.net>
+中文版维护者: 伍鹏 Bryan Wu <bryan.wu@analog.com>
+中文版翻译者: 伍鹏 Bryan Wu <bryan.wu@analog.com>
+中文版校译者: 张汉辉 Eugene Teo <eugeneteo@kernel.sg>
+ 杨瑞 Dave Young <hidave.darkstar@gmail.com>
+以下为正文
+---------------------------------------------------------------------
+
+为什么不应该使用“volatile”类型
+------------------------------
+
+C程序员通常认为volatile表示某个变量可以在当前执行的线程之外被改变;因此,在内核
+中用到共享数据结构时,常常会有C程序员喜欢使用volatile这类变量。换句话说,他们经
+常会把volatile类型看成某种简易的原子变量,当然它们不是。在内核中使用volatile几
+乎总是错误的;本文档将解释为什么这样。
+
+理解volatile的关键是知道它的目的是用来消除优化,实际上很少有人真正需要这样的应
+用。在内核中,程序员必须防止意外的并发访问破坏共享的数据结构,这其实是一个完全
+不同的任务。用来防止意外并发访问的保护措施,可以更加高效的避免大多数优化相关的
+问题。
+
+像volatile一样,内核提供了很多原语来保证并发访问时的数据安全(自旋锁, 互斥量,内
+存屏障等等),同样可以防止意外的优化。如果可以正确使用这些内核原语,那么就没有
+必要再使用volatile。如果仍然必须使用volatile,那么几乎可以肯定在代码的某处有一
+个bug。在正确设计的内核代码中,volatile能带来的仅仅是使事情变慢。
+
+思考一下这段典型的内核代码:
+
+ spin_lock(&the_lock);
+ do_something_on(&shared_data);
+ do_something_else_with(&shared_data);
+ spin_unlock(&the_lock);
+
+如果所有的代码都遵循加锁规则,当持有the_lock的时候,不可能意外的改变shared_data的
+值。任何可能访问该数据的其他代码都会在这个锁上等待。自旋锁原语跟内存屏障一样—— 它
+们显式的用来书写成这样 —— 意味着数据访问不会跨越它们而被优化。所以本来编译器认为
+它知道在shared_data里面将有什么,但是因为spin_lock()调用跟内存屏障一样,会强制编
+译器忘记它所知道的一切。那么在访问这些数据时不会有优化的问题。
+
+如果shared_data被声名为volatile,锁操作将仍然是必须的。就算我们知道没有其他人正在
+使用它,编译器也将被阻止优化对临界区内shared_data的访问。在锁有效的同时,
+shared_data不是volatile的。在处理共享数据的时候,适当的锁操作可以不再需要
+volatile —— 并且是有潜在危害的。
+
+volatile的存储类型最初是为那些内存映射的I/O寄存器而定义。在内核里,寄存器访问也应
+该被锁保护,但是人们也不希望编译器“优化”临界区内的寄存器访问。内核里I/O的内存访问
+是通过访问函数完成的;不赞成通过指针对I/O内存的直接访问,并且不是在所有体系架构上
+都能工作。那些访问函数正是为了防止意外优化而写的,因此,再说一次,volatile类型不
+是必需的。
+
+另一种引起用户可能使用volatile的情况是当处理器正忙着等待一个变量的值。正确执行一
+个忙等待的方法是:
+
+ while (my_variable != what_i_want)
+ cpu_relax();
+
+cpu_relax()调用会降低CPU的能量消耗或者让位于超线程双处理器;它也作为内存屏障一样出
+现,所以,再一次,volatile不是必需的。当然,忙等待一开始就是一种反常规的做法。
+
+在内核中,一些稀少的情况下volatile仍然是有意义的:
+
+ - 在一些体系架构的系统上,允许直接的I/0内存访问,那么前面提到的访问函数可以使用
+ volatile。基本上,每一个访问函数调用它自己都是一个小的临界区域并且保证了按照
+ 程序员期望的那样发生访问操作。
+
+ - 某些会改变内存的内联汇编代码虽然没有什么其他明显的附作用,但是有被GCC删除的可
+ 能性。在汇编声明中加上volatile关键字可以防止这种删除操作。
+
+ - Jiffies变量是一种特殊情况,虽然每次引用它的时候都可以有不同的值,但读jiffies
+ 变量时不需要任何特殊的加锁保护。所以jiffies变量可以使用volatile,但是不赞成
+ 其他跟jiffies相同类型变量使用volatile。Jiffies被认为是一种“愚蠢的遗留物"
+ (Linus的话)因为解决这个问题比保持现状要麻烦的多。
+
+ - 由于某些I/0设备可能会修改连续一致的内存,所以有时,指向连续一致内存的数据结构
+ 的指针需要正确的使用volatile。网络适配器使用的环状缓存区正是这类情形的一个例
+ 子,其中适配器用改变指针来表示哪些描述符已经处理过了。
+
+对于大多代码,上述几种可以使用volatile的情况都不适用。所以,使用volatile是一种
+bug并且需要对这样的代码额外仔细检查。那些试图使用volatile的开发人员需要退一步想想
+他们真正想实现的是什么。
+
+非常欢迎删除volatile变量的补丁 - 只要证明这些补丁完整的考虑了并发问题。
+
+注释
+----
+
+[1] http://lwn.net/Articles/233481/
+[2] http://lwn.net/Articles/233482/
+
+致谢
+----
+
+最初由Randy Dunlap推动并作初步研究
+由Jonathan Corbet撰写
+参考Satyam Sharma,Johannes Stezenbach,Jesper Juhl,Heikki Orsila,
+H. Peter Anvin,Philipp Hahn和Stefan Richter的意见改善了本档。
diff --git a/Documentation/zorro.txt b/Documentation/zorro.txt
new file mode 100644
index 0000000..d5829d1
--- /dev/null
+++ b/Documentation/zorro.txt
@@ -0,0 +1,102 @@
+ Writing Device Drivers for Zorro Devices
+ ----------------------------------------
+
+Written by Geert Uytterhoeven <geert@linux-m68k.org>
+Last revised: September 5, 2003
+
+
+1. Introduction
+---------------
+
+The Zorro bus is the bus used in the Amiga family of computers. Thanks to
+AutoConfig(tm), it's 100% Plug-and-Play.
+
+There are two types of Zorro busses, Zorro II and Zorro III:
+
+ - The Zorro II address space is 24-bit and lies within the first 16 MB of the
+ Amiga's address map.
+
+ - Zorro III is a 32-bit extension of Zorro II, which is backwards compatible
+ with Zorro II. The Zorro III address space lies outside the first 16 MB.
+
+
+2. Probing for Zorro Devices
+----------------------------
+
+Zorro devices are found by calling `zorro_find_device()', which returns a
+pointer to the `next' Zorro device with the specified Zorro ID. A probe loop
+for the board with Zorro ID `ZORRO_PROD_xxx' looks like:
+
+ struct zorro_dev *z = NULL;
+
+ while ((z = zorro_find_device(ZORRO_PROD_xxx, z))) {
+ if (!zorro_request_region(z->resource.start+MY_START, MY_SIZE,
+ "My explanation"))
+ ...
+ }
+
+`ZORRO_WILDCARD' acts as a wildcard and finds any Zorro device. If your driver
+supports different types of boards, you can use a construct like:
+
+ struct zorro_dev *z = NULL;
+
+ while ((z = zorro_find_device(ZORRO_WILDCARD, z))) {
+ if (z->id != ZORRO_PROD_xxx1 && z->id != ZORRO_PROD_xxx2 && ...)
+ continue;
+ if (!zorro_request_region(z->resource.start+MY_START, MY_SIZE,
+ "My explanation"))
+ ...
+ }
+
+
+3. Zorro Resources
+------------------
+
+Before you can access a Zorro device's registers, you have to make sure it's
+not yet in use. This is done using the I/O memory space resource management
+functions:
+
+ request_mem_region()
+ release_mem_region()
+
+Shortcuts to claim the whole device's address space are provided as well:
+
+ zorro_request_device
+ zorro_release_device
+
+
+4. Accessing the Zorro Address Space
+------------------------------------
+
+The address regions in the Zorro device resources are Zorro bus address
+regions. Due to the identity bus-physical address mapping on the Zorro bus,
+they are CPU physical addresses as well.
+
+The treatment of these regions depends on the type of Zorro space:
+
+ - Zorro II address space is always mapped and does not have to be mapped
+ explicitly using z_ioremap().
+
+ Conversion from bus/physical Zorro II addresses to kernel virtual addresses
+ and vice versa is done using:
+
+ virt_addr = ZTWO_VADDR(bus_addr);
+ bus_addr = ZTWO_PADDR(virt_addr);
+
+ - Zorro III address space must be mapped explicitly using z_ioremap() first
+ before it can be accessed:
+
+ virt_addr = z_ioremap(bus_addr, size);
+ ...
+ z_iounmap(virt_addr);
+
+
+5. References
+-------------
+
+linux/include/linux/zorro.h
+linux/include/asm-{m68k,ppc}/zorro.h
+linux/include/linux/zorro_ids.h
+linux/drivers/zorro
+/proc/bus/zorro
+
OpenPOWER on IntegriCloud